1/*****************************************************************************
2
3Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
4Copyright (c) 2008, Google Inc.
5Copyright (c) 2012, Facebook Inc.
6Copyright (c) 2015, 2018, MariaDB Corporation.
7
8Portions of this file contain modifications contributed and copyrighted by
9Google, Inc. Those modifications are gratefully acknowledged and are described
10briefly in the InnoDB documentation. The contributions by Google are
11incorporated with their permission, and subject to the conditions contained in
12the file COPYING.Google.
13
14This program is free software; you can redistribute it and/or modify it under
15the terms of the GNU General Public License as published by the Free Software
16Foundation; version 2 of the License.
17
18This program is distributed in the hope that it will be useful, but WITHOUT
19ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
20FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
21
22You should have received a copy of the GNU General Public License along with
23this program; if not, write to the Free Software Foundation, Inc.,
2451 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
25
26*****************************************************************************/
27
28/**************************************************//**
29@file btr/btr0cur.cc
30The index tree cursor
31
All changes that row operations make to a B-tree or the records
there must go through this module! Undo log records are written here
for every modification or insert of a clustered index record.
35
	NOTE!!!
To make sure we do not run out of disk space during a pessimistic
insert or update, we have to reserve, before starting the operation,
2 x (the height of the index tree) free pages in the tablespace,
because once leaf splitting has been started it is difficult to undo,
except by crashing the database and doing a roll-forward.
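For example, for an index tree of height 3, a pessimistic insert must be
able to reserve 2 x 3 = 6 free pages in the tablespace before it starts.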
42
43Created 10/16/1994 Heikki Tuuri
44*******************************************************/
45
46#include "btr0cur.h"
47#include "row0upd.h"
48#include "mtr0log.h"
49#include "page0page.h"
50#include "page0zip.h"
51#include "rem0rec.h"
52#include "rem0cmp.h"
53#include "buf0lru.h"
54#include "btr0btr.h"
55#include "btr0sea.h"
56#include "row0log.h"
57#include "row0purge.h"
58#include "row0upd.h"
59#include "trx0rec.h"
60#include "trx0roll.h"
61#include "que0que.h"
62#include "row0row.h"
63#include "srv0srv.h"
64#include "ibuf0ibuf.h"
65#include "lock0lock.h"
66#include "zlib.h"
67#include "srv0start.h"
68
69/** Buffered B-tree operation types, introduced as part of delete buffering. */
70enum btr_op_t {
71 BTR_NO_OP = 0, /*!< Not buffered */
72 BTR_INSERT_OP, /*!< Insert, do not ignore UNIQUE */
73 BTR_INSERT_IGNORE_UNIQUE_OP, /*!< Insert, ignoring UNIQUE */
74 BTR_DELETE_OP, /*!< Purge a delete-marked record */
75 BTR_DELMARK_OP /*!< Mark a record for deletion */
76};
77
78/** Modification types for the B-tree operation.
79 Note that the order must be DELETE, BOTH, INSERT !!
80 */
81enum btr_intention_t {
82 BTR_INTENTION_DELETE,
83 BTR_INTENTION_BOTH,
84 BTR_INTENTION_INSERT
85};
86
/** With the index->lock scalability improvement, the only clear performance
regression observed was caused by the history list growing very long.
That is because the former exclusive use of index->lock also had the effect
of reserving free blocks and read I/O bandwidth for purge with priority.
To keep the history list from growing much longer than it did with the
previous implementation, we again prioritize pessimistic tree operations
performed by purge whenever the history list appears to be growing too long.

	Experimentally, the history list length starts to clearly affect
performance throughput from about 100000. */
96#define BTR_CUR_FINE_HISTORY_LENGTH 100000
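
/* Illustrative sketch, not compiled: this threshold gates the tree-latch
decision in btr_cur_search_to_nth_level_func() roughly as follows. */
#if 0
	if (lock_intention == BTR_INTENTION_DELETE
	    && trx_sys.history_size() > BTR_CUR_FINE_HISTORY_LENGTH
	    && buf_get_n_pending_read_ios()) {
		/* Purge is falling behind and reads are pending:
		take an exclusive index latch so that purge gets the
		free blocks and read I/O bandwidth with priority,
		as with the previous implementation. */
		mtr_x_lock(dict_index_get_lock(index), mtr);
	}
#endif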
97
98/** Number of searches down the B-tree in btr_cur_search_to_nth_level(). */
99ulint btr_cur_n_non_sea;
100/** Old value of btr_cur_n_non_sea. Copied by
101srv_refresh_innodb_monitor_stats(). Referenced by
102srv_printf_innodb_monitor(). */
103ulint btr_cur_n_non_sea_old;
104#ifdef BTR_CUR_HASH_ADAPT
105/** Number of successful adaptive hash index lookups in
106btr_cur_search_to_nth_level(). */
107ulint btr_cur_n_sea;
108/** Old value of btr_cur_n_sea. Copied by
109srv_refresh_innodb_monitor_stats(). Referenced by
110srv_printf_innodb_monitor(). */
111ulint btr_cur_n_sea_old;
112#endif /* BTR_CUR_HASH_ADAPT */
113
114#ifdef UNIV_DEBUG
/* Debug flag to limit the number of records during optimistic inserts */
116uint btr_cur_limit_optimistic_insert_debug;
117#endif /* UNIV_DEBUG */
118
/** In an optimistic insert, if the insert does not fit but this much space
can be released by a page reorganization, then the page is reorganized */
121#define BTR_CUR_PAGE_REORGANIZE_LIMIT (srv_page_size / 32)
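
/* For example, with the default srv_page_size of 16KiB this limit is
16384 / 32 = 512 bytes. */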
122
123/** The structure of a BLOB part header */
124/* @{ */
125/*--------------------------------------*/
126#define BTR_BLOB_HDR_PART_LEN 0 /*!< BLOB part len on this
127 page */
128#define BTR_BLOB_HDR_NEXT_PAGE_NO 4 /*!< next BLOB part page no,
129 FIL_NULL if none */
130/*--------------------------------------*/
131#define BTR_BLOB_HDR_SIZE 8 /*!< Size of a BLOB
132 part header, in bytes */
133
/** Estimate table-level statistics from a sampled value.
@param value sampled stats
@param index index being sampled
@param sample number of sampled rows
@param ext_size externally stored data size
@param not_empty table not empty
@return estimated table-wide stats from the sampled value */
141#define BTR_TABLE_STATS_FROM_SAMPLE(value, index, sample, ext_size, not_empty) \
142 (((value) * static_cast<ib_uint64_t>(index->stat_n_leaf_pages) \
143 + (sample) - 1 + (ext_size) + (not_empty)) / ((sample) + (ext_size)))
144
145/* @} */
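
/* Worked example of BTR_TABLE_STATS_FROM_SAMPLE(): with value = 10,
sample = 1, stat_n_leaf_pages = 100, ext_size = 0 and not_empty = 1,
the macro yields (10 * 100 + 1 - 1 + 0 + 1) / (1 + 0) = 1001, i.e. the
sampled value is scaled up by roughly stat_n_leaf_pages / sample. */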
146
147/*******************************************************************//**
Marks all extern fields in a record as owned by the record. This function
should be called if the delete mark of a record is removed: a record that
is not delete-marked always owns all of its extern fields. */
151static
152void
153btr_cur_unmark_extern_fields(
154/*=========================*/
155 page_zip_des_t* page_zip,/*!< in/out: compressed page whose uncompressed
156 part will be updated, or NULL */
157 rec_t* rec, /*!< in/out: record in a clustered index */
158 dict_index_t* index, /*!< in: index of the page */
159 const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
160 mtr_t* mtr); /*!< in: mtr, or NULL if not logged */
161/*******************************************************************//**
162Adds path information to the cursor for the current page, for which
163the binary search has been performed. */
164static
165void
166btr_cur_add_path_info(
167/*==================*/
168 btr_cur_t* cursor, /*!< in: cursor positioned on a page */
169 ulint height, /*!< in: height of the page in tree;
170 0 means leaf node */
171 ulint root_height); /*!< in: root node height in tree */
172/***********************************************************//**
173Frees the externally stored fields for a record, if the field is mentioned
174in the update vector. */
175static
176void
177btr_rec_free_updated_extern_fields(
178/*===============================*/
179 dict_index_t* index, /*!< in: index of rec; the index tree MUST be
180 X-latched */
181 rec_t* rec, /*!< in: record */
182 page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed
183 part will be updated, or NULL */
184 const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
185 const upd_t* update, /*!< in: update vector */
186 bool rollback,/*!< in: performing rollback? */
187 mtr_t* mtr); /*!< in: mini-transaction handle which contains
188 an X-latch to record page and to the tree */
189/***********************************************************//**
190Frees the externally stored fields for a record. */
191static
192void
193btr_rec_free_externally_stored_fields(
194/*==================================*/
195 dict_index_t* index, /*!< in: index of the data, the index
196 tree MUST be X-latched */
197 rec_t* rec, /*!< in: record */
198 const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
199 page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed
200 part will be updated, or NULL */
201 bool rollback,/*!< in: performing rollback? */
202 mtr_t* mtr); /*!< in: mini-transaction handle which contains
203 an X-latch to record page and to the index
204 tree */
205
206/*==================== B-TREE SEARCH =========================*/
207
/** Latches the leaf page or pages requested.
@param[in]	block		leaf page where the search converged
@param[in]	page_id		page id of the leaf
@param[in]	page_size	page size
@param[in]	latch_mode	BTR_SEARCH_LEAF, ...
@param[in]	cursor		cursor
@param[in]	mtr		mini-transaction
@return blocks and savepoints which were actually latched. */
215btr_latch_leaves_t
216btr_cur_latch_leaves(
217 buf_block_t* block,
218 const page_id_t& page_id,
219 const page_size_t& page_size,
220 ulint latch_mode,
221 btr_cur_t* cursor,
222 mtr_t* mtr)
223{
224 ulint mode;
225 ulint left_page_no;
226 ulint right_page_no;
227 buf_block_t* get_block;
228 page_t* page = buf_block_get_frame(block);
229 bool spatial;
230 btr_latch_leaves_t latch_leaves = {{NULL, NULL, NULL}, {0, 0, 0}};
231
232 compile_time_assert(int(MTR_MEMO_PAGE_S_FIX) == int(RW_S_LATCH));
233 compile_time_assert(int(MTR_MEMO_PAGE_X_FIX) == int(RW_X_LATCH));
234 compile_time_assert(int(MTR_MEMO_PAGE_SX_FIX) == int(RW_SX_LATCH));
235
236 spatial = dict_index_is_spatial(cursor->index) && cursor->rtr_info;
237 ut_ad(buf_page_in_file(&block->page));
238
239 switch (latch_mode) {
240 case BTR_SEARCH_LEAF:
241 case BTR_MODIFY_LEAF:
242 case BTR_SEARCH_TREE:
243 if (spatial) {
244 cursor->rtr_info->tree_savepoints[RTR_MAX_LEVELS]
245 = mtr_set_savepoint(mtr);
246 }
247
248 mode = latch_mode == BTR_MODIFY_LEAF ? RW_X_LATCH : RW_S_LATCH;
249 latch_leaves.savepoints[1] = mtr_set_savepoint(mtr);
250 get_block = btr_block_get(page_id, page_size, mode,
251 cursor->index, mtr);
252 latch_leaves.blocks[1] = get_block;
253#ifdef UNIV_BTR_DEBUG
254 ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
255#endif /* UNIV_BTR_DEBUG */
256 if (spatial) {
257 cursor->rtr_info->tree_blocks[RTR_MAX_LEVELS]
258 = get_block;
259 }
260
261 return(latch_leaves);
262 case BTR_MODIFY_TREE:
		/* The latch is exclusive against other operations which
		call btr_page_set_prev() */
265 ut_ad(mtr_memo_contains_flagged(
266 mtr,
267 dict_index_get_lock(cursor->index),
268 MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK));
269 /* x-latch also siblings from left to right */
270 left_page_no = btr_page_get_prev(page, mtr);
271 mode = latch_mode;
272
273 if (left_page_no != FIL_NULL) {
274
275 if (spatial) {
276 cursor->rtr_info->tree_savepoints[
277 RTR_MAX_LEVELS] = mtr_set_savepoint(mtr);
278 }
279
280 latch_leaves.savepoints[0] = mtr_set_savepoint(mtr);
281 get_block = btr_block_get(
282 page_id_t(page_id.space(), left_page_no),
283 page_size, RW_X_LATCH, cursor->index, mtr);
284 latch_leaves.blocks[0] = get_block;
285
286 if (spatial) {
287 cursor->rtr_info->tree_blocks[RTR_MAX_LEVELS]
288 = get_block;
289 }
290 }
291
292 if (spatial) {
293 cursor->rtr_info->tree_savepoints[RTR_MAX_LEVELS + 1]
294 = mtr_set_savepoint(mtr);
295 }
296
297 latch_leaves.savepoints[1] = mtr_set_savepoint(mtr);
298 get_block = btr_block_get(
299 page_id, page_size, RW_X_LATCH, cursor->index, mtr);
300 latch_leaves.blocks[1] = get_block;
301
302#ifdef UNIV_BTR_DEBUG
303 /* Sanity check only after both the blocks are latched. */
304 if (latch_leaves.blocks[0] != NULL) {
305 ut_a(page_is_comp(latch_leaves.blocks[0]->frame)
306 == page_is_comp(page));
307 ut_a(btr_page_get_next(
308 latch_leaves.blocks[0]->frame, mtr)
309 == page_get_page_no(page));
310 }
311 ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
312#endif /* UNIV_BTR_DEBUG */
313
314 if (spatial) {
315 cursor->rtr_info->tree_blocks[RTR_MAX_LEVELS + 1]
316 = get_block;
317 }
318
319 right_page_no = btr_page_get_next(page, mtr);
320
321 if (right_page_no != FIL_NULL) {
322 if (spatial) {
323 cursor->rtr_info->tree_savepoints[
324 RTR_MAX_LEVELS + 2] = mtr_set_savepoint(
325 mtr);
326 }
327 latch_leaves.savepoints[2] = mtr_set_savepoint(mtr);
328 get_block = btr_block_get(
329 page_id_t(page_id.space(), right_page_no),
330 page_size, RW_X_LATCH, cursor->index, mtr);
331 latch_leaves.blocks[2] = get_block;
332#ifdef UNIV_BTR_DEBUG
333 ut_a(page_is_comp(get_block->frame)
334 == page_is_comp(page));
335 ut_a(btr_page_get_prev(get_block->frame, mtr)
336 == page_get_page_no(page));
337#endif /* UNIV_BTR_DEBUG */
338 if (spatial) {
339 cursor->rtr_info->tree_blocks[
340 RTR_MAX_LEVELS + 2] = get_block;
341 }
342 }
343
344 return(latch_leaves);
345
346 case BTR_SEARCH_PREV:
347 case BTR_MODIFY_PREV:
348 mode = latch_mode == BTR_SEARCH_PREV ? RW_S_LATCH : RW_X_LATCH;
349 /* latch also left sibling */
350 rw_lock_s_lock(&block->lock);
351 left_page_no = btr_page_get_prev(page, mtr);
352 rw_lock_s_unlock(&block->lock);
353
354 if (left_page_no != FIL_NULL) {
355 latch_leaves.savepoints[0] = mtr_set_savepoint(mtr);
356 get_block = btr_block_get(
357 page_id_t(page_id.space(), left_page_no),
358 page_size, mode, cursor->index, mtr);
359 latch_leaves.blocks[0] = get_block;
360 cursor->left_block = get_block;
361#ifdef UNIV_BTR_DEBUG
362 ut_a(page_is_comp(get_block->frame)
363 == page_is_comp(page));
364 ut_a(btr_page_get_next(get_block->frame, mtr)
365 == page_get_page_no(page));
366#endif /* UNIV_BTR_DEBUG */
367 }
368
369 latch_leaves.savepoints[1] = mtr_set_savepoint(mtr);
370 get_block = btr_block_get(page_id, page_size, mode,
371 cursor->index, mtr);
372 latch_leaves.blocks[1] = get_block;
373#ifdef UNIV_BTR_DEBUG
374 ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
375#endif /* UNIV_BTR_DEBUG */
376 return(latch_leaves);
377 case BTR_CONT_MODIFY_TREE:
378 ut_ad(dict_index_is_spatial(cursor->index));
379 return(latch_leaves);
380 }
381
382 ut_error;
383 return(latch_leaves);
384}
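
/* Minimal usage sketch, not compiled (assumes a cursor that a tree search
has already positioned on "block"): slot 1 of the returned structure holds
the latched leaf requested via page_id; for BTR_MODIFY_TREE, slots 0 and 2
hold the x-latched left and right siblings (or NULL), and savepoints[]
records the mtr savepoints that can be used to release the blocks later. */
#if 0
	btr_latch_leaves_t	latch_leaves = btr_cur_latch_leaves(
		block, page_id, page_size, BTR_MODIFY_TREE, cursor, mtr);

	buf_block_t*	leaf = latch_leaves.blocks[1];
	ut_ad(leaf != NULL);
#endif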
385
386/** Load the instant ALTER TABLE metadata from the clustered index
387when loading a table definition.
388@param[in,out] index clustered index definition
389@param[in,out] mtr mini-transaction
390@return error code
391@retval DB_SUCCESS if no error occurred
392@retval DB_CORRUPTION if any corruption was noticed */
393static
394dberr_t
395btr_cur_instant_init_low(dict_index_t* index, mtr_t* mtr)
396{
397 ut_ad(index->is_primary());
398 ut_ad(index->n_core_null_bytes == dict_index_t::NO_CORE_NULL_BYTES);
399 ut_ad(index->table->supports_instant());
400 ut_ad(index->table->is_readable());
401
402 page_t* root = btr_root_get(index, mtr);
403
404 if (!root || btr_cur_instant_root_init(index, root)) {
405 ib::error() << "Table " << index->table->name
406 << " has an unreadable root page";
407 index->table->corrupted = true;
408 return DB_CORRUPTION;
409 }
410
411 ut_ad(index->n_core_null_bytes != dict_index_t::NO_CORE_NULL_BYTES);
412
413 if (!index->is_instant()) {
414 return DB_SUCCESS;
415 }
416
417 btr_cur_t cur;
418 dberr_t err = btr_cur_open_at_index_side(true, index, BTR_SEARCH_LEAF,
419 &cur, 0, mtr);
420 if (err != DB_SUCCESS) {
421 index->table->corrupted = true;
422 return err;
423 }
424
425 ut_ad(page_cur_is_before_first(&cur.page_cur));
426 ut_ad(page_is_leaf(cur.page_cur.block->frame));
427
428 page_cur_move_to_next(&cur.page_cur);
429
430 const rec_t* rec = cur.page_cur.rec;
431
432 if (page_rec_is_supremum(rec) || !rec_is_default_row(rec, index)) {
433 ib::error() << "Table " << index->table->name
434 << " is missing instant ALTER metadata";
435 index->table->corrupted = true;
436 return DB_CORRUPTION;
437 }
438
439 if (dict_table_is_comp(index->table)) {
440 if (rec_get_info_bits(rec, true) != REC_INFO_MIN_REC_FLAG
441 && rec_get_status(rec) != REC_STATUS_COLUMNS_ADDED) {
442incompatible:
443 ib::error() << "Table " << index->table->name
444 << " contains unrecognizable "
445 "instant ALTER metadata";
446 index->table->corrupted = true;
447 return DB_CORRUPTION;
448 }
449 } else if (rec_get_info_bits(rec, false) != REC_INFO_MIN_REC_FLAG) {
450 goto incompatible;
451 }
452
453 /* Read the 'default row'. We can get here on server restart
454 or when the table was evicted from the data dictionary cache
455 and is now being accessed again.
456
457 Here, READ COMMITTED and REPEATABLE READ should be equivalent.
458 Committing the ADD COLUMN operation would acquire
459 MDL_EXCLUSIVE and LOCK_X|LOCK_TABLE, which would prevent any
460 concurrent operations on the table, including table eviction
461 from the cache. */
462
463 mem_heap_t* heap = NULL;
464 ulint* offsets = rec_get_offsets(rec, index, NULL, true,
465 ULINT_UNDEFINED, &heap);
466 if (rec_offs_any_default(offsets)) {
467inconsistent:
468 mem_heap_free(heap);
469 goto incompatible;
470 }
471
472 /* In fact, because we only ever append fields to the 'default
473 value' record, it is also OK to perform READ UNCOMMITTED and
474 then ignore any extra fields, provided that
475 trx_sys.is_registered(DB_TRX_ID). */
476 if (rec_offs_n_fields(offsets) > index->n_fields
477 && !trx_sys.is_registered(current_trx(),
478 row_get_rec_trx_id(rec, index,
479 offsets))) {
480 goto inconsistent;
481 }
482
483 for (unsigned i = index->n_core_fields; i < index->n_fields; i++) {
484 ulint len;
485 const byte* data = rec_get_nth_field(rec, offsets, i, &len);
486 dict_col_t* col = index->fields[i].col;
487 ut_ad(!col->is_instant());
488 ut_ad(!col->def_val.data);
489 col->def_val.len = len;
490 switch (len) {
491 case UNIV_SQL_NULL:
492 continue;
493 case 0:
494 col->def_val.data = field_ref_zero;
495 continue;
496 }
497 ut_ad(len != UNIV_SQL_DEFAULT);
498 if (!rec_offs_nth_extern(offsets, i)) {
499 col->def_val.data = mem_heap_dup(
500 index->table->heap, data, len);
501 } else if (len < BTR_EXTERN_FIELD_REF_SIZE
502 || !memcmp(data + len - BTR_EXTERN_FIELD_REF_SIZE,
503 field_ref_zero,
504 BTR_EXTERN_FIELD_REF_SIZE)) {
505 col->def_val.len = UNIV_SQL_DEFAULT;
506 goto inconsistent;
507 } else {
508 col->def_val.data = btr_copy_externally_stored_field(
509 &col->def_val.len, data,
510 dict_table_page_size(index->table),
511 len, index->table->heap);
512 }
513 }
514
515 mem_heap_free(heap);
516 return DB_SUCCESS;
517}
518
519/** Load the instant ALTER TABLE metadata from the clustered index
520when loading a table definition.
521@param[in,out] table table definition from the data dictionary
522@return error code
523@retval DB_SUCCESS if no error occurred */
524dberr_t
525btr_cur_instant_init(dict_table_t* table)
526{
527 mtr_t mtr;
528 dict_index_t* index = dict_table_get_first_index(table);
529 mtr.start();
530 dberr_t err = index
531 ? btr_cur_instant_init_low(index, &mtr)
532 : DB_CORRUPTION;
533 mtr.commit();
534 return(err);
535}
536
537/** Initialize the n_core_null_bytes on first access to a clustered
538index root page.
539@param[in] index clustered index that is on its first access
540@param[in] page clustered index root page
541@return whether the page is corrupted */
542bool
543btr_cur_instant_root_init(dict_index_t* index, const page_t* page)
544{
545 ut_ad(page_is_root(page));
546 ut_ad(!page_is_comp(page) == !dict_table_is_comp(index->table));
547 ut_ad(index->is_primary());
548 ut_ad(!index->is_instant());
549 ut_ad(index->table->supports_instant());
550 /* This is normally executed as part of btr_cur_instant_init()
551 when dict_load_table_one() is loading a table definition.
552 Other threads should not access or modify the n_core_null_bytes,
553 n_core_fields before dict_load_table_one() returns.
554
555 This can also be executed during IMPORT TABLESPACE, where the
556 table definition is exclusively locked. */
557
558 switch (fil_page_get_type(page)) {
559 default:
560 ut_ad(!"wrong page type");
561 return true;
562 case FIL_PAGE_INDEX:
563 /* The field PAGE_INSTANT is guaranteed 0 on clustered
564 index root pages of ROW_FORMAT=COMPACT or
565 ROW_FORMAT=DYNAMIC when instant ADD COLUMN is not used. */
566 ut_ad(!page_is_comp(page) || !page_get_instant(page));
567 index->n_core_null_bytes = UT_BITS_IN_BYTES(
568 unsigned(index->n_nullable));
569 return false;
570 case FIL_PAGE_TYPE_INSTANT:
571 break;
572 }
573
574 uint16_t n = page_get_instant(page);
575 if (n < index->n_uniq + DATA_ROLL_PTR || n > index->n_fields) {
576 /* The PRIMARY KEY (or hidden DB_ROW_ID) and
577 DB_TRX_ID,DB_ROLL_PTR columns must always be present
578 as 'core' fields. All fields, including those for
579 instantly added columns, must be present in the data
580 dictionary. */
581 return true;
582 }
583 index->n_core_fields = n;
584 ut_ad(!index->is_dummy);
585 ut_d(index->is_dummy = true);
586 index->n_core_null_bytes = n == index->n_fields
587 ? UT_BITS_IN_BYTES(unsigned(index->n_nullable))
588 : UT_BITS_IN_BYTES(index->get_n_nullable(n));
589 ut_d(index->is_dummy = false);
590 return false;
591}
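
/* Worked example of the computation above: if the root page stores
PAGE_INSTANT = 7 core fields and 3 of those 7 fields are nullable, then
index->n_core_fields = 7 and index->n_core_null_bytes =
UT_BITS_IN_BYTES(3) = 1, i.e. a single null-flags byte covers the core
fields of each record. */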
592
593/** Optimistically latches the leaf page or pages requested.
594@param[in] block guessed buffer block
595@param[in] modify_clock modify clock value
596@param[in,out] latch_mode BTR_SEARCH_LEAF, ...
597@param[in,out] cursor cursor
598@param[in] file file name
599@param[in] line line where called
600@param[in] mtr mini-transaction
601@return true if success */
602bool
603btr_cur_optimistic_latch_leaves(
604 buf_block_t* block,
605 ib_uint64_t modify_clock,
606 ulint* latch_mode,
607 btr_cur_t* cursor,
608 const char* file,
609 unsigned line,
610 mtr_t* mtr)
611{
612 ulint mode;
613 ulint left_page_no;
614
615 switch (*latch_mode) {
616 case BTR_SEARCH_LEAF:
617 case BTR_MODIFY_LEAF:
618 return(buf_page_optimistic_get(*latch_mode, block,
619 modify_clock, file, line, mtr));
620 case BTR_SEARCH_PREV:
621 case BTR_MODIFY_PREV:
622 mode = *latch_mode == BTR_SEARCH_PREV
623 ? RW_S_LATCH : RW_X_LATCH;
624
625 buf_page_mutex_enter(block);
626 if (buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) {
627 buf_page_mutex_exit(block);
628 return(false);
629 }
		/* pin the block so that it cannot be relocated */
631 buf_block_buf_fix_inc(block, file, line);
632 buf_page_mutex_exit(block);
633
634 rw_lock_s_lock(&block->lock);
635 if (block->modify_clock != modify_clock) {
636 rw_lock_s_unlock(&block->lock);
637
638 goto unpin_failed;
639 }
640 left_page_no = btr_page_get_prev(
641 buf_block_get_frame(block), mtr);
642 rw_lock_s_unlock(&block->lock);
643
644 if (left_page_no != FIL_NULL) {
645 cursor->left_block = btr_block_get(
646 page_id_t(cursor->index->table->space->id,
647 left_page_no),
648 page_size_t(cursor->index->table->space
649 ->flags),
650 mode, cursor->index, mtr);
651 } else {
652 cursor->left_block = NULL;
653 }
654
655 if (buf_page_optimistic_get(mode, block, modify_clock,
656 file, line, mtr)) {
657 if (btr_page_get_prev(buf_block_get_frame(block), mtr)
658 == left_page_no) {
659 /* adjust buf_fix_count */
660 buf_page_mutex_enter(block);
661 buf_block_buf_fix_dec(block);
662 buf_page_mutex_exit(block);
663
664 *latch_mode = mode;
665 return(true);
666 } else {
667 /* release the block */
668 btr_leaf_page_release(block, mode, mtr);
669 }
670 }
671
672 /* release the left block */
673 if (cursor->left_block != NULL) {
674 btr_leaf_page_release(cursor->left_block,
675 mode, mtr);
676 }
677unpin_failed:
678 /* unpin the block */
679 buf_page_mutex_enter(block);
680 buf_block_buf_fix_dec(block);
681 buf_page_mutex_exit(block);
682
683 return(false);
684
685 default:
686 ut_error;
687 return(false);
688 }
689}
690
691/**
Gets the intention as btr_intention_t from latch_mode, and clears the
intention flags in latch_mode.
@param latch_mode in/out: pointer to latch_mode
@return intention for latching the tree */
696static
697btr_intention_t
698btr_cur_get_and_clear_intention(
699 ulint *latch_mode)
700{
701 btr_intention_t intention;
702
703 switch (*latch_mode & (BTR_LATCH_FOR_INSERT | BTR_LATCH_FOR_DELETE)) {
704 case BTR_LATCH_FOR_INSERT:
705 intention = BTR_INTENTION_INSERT;
706 break;
707 case BTR_LATCH_FOR_DELETE:
708 intention = BTR_INTENTION_DELETE;
709 break;
710 default:
711 /* both or unknown */
712 intention = BTR_INTENTION_BOTH;
713 }
714 *latch_mode &= ulint(~(BTR_LATCH_FOR_INSERT | BTR_LATCH_FOR_DELETE));
715
716 return(intention);
717}
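
/* Usage note: the intention flags are ORed into the latch mode by the
caller and stripped off here. For example, with
latch_mode = BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE, this function returns
BTR_INTENTION_DELETE and leaves latch_mode == BTR_MODIFY_TREE. */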
718
719/**
Gets the desired latch type for the root leaf (the root page is also a leaf)
for the given latch mode.
722@param latch_mode in: BTR_SEARCH_LEAF, ...
723@return latch type */
724static
725rw_lock_type_t
726btr_cur_latch_for_root_leaf(
727 ulint latch_mode)
728{
729 switch (latch_mode) {
730 case BTR_SEARCH_LEAF:
731 case BTR_SEARCH_TREE:
732 case BTR_SEARCH_PREV:
733 return(RW_S_LATCH);
734 case BTR_MODIFY_LEAF:
735 case BTR_MODIFY_TREE:
736 case BTR_MODIFY_PREV:
737 return(RW_X_LATCH);
738 case BTR_CONT_MODIFY_TREE:
739 case BTR_CONT_SEARCH_TREE:
		/* The root page should already be latched and
		does not need to be latched here.
		fall through (RW_NO_LATCH) */
743 case BTR_NO_LATCHES:
744 return(RW_NO_LATCH);
745 }
746
747 ut_error;
748 return(RW_NO_LATCH); /* avoid compiler warnings */
749}
750
/** Detects whether modifying the record might require modifying the tree structure.
752@param[in] index index
753@param[in] page page
754@param[in] lock_intention lock intention for the tree operation
755@param[in] rec record (current node_ptr)
756@param[in] rec_size size of the record or max size of node_ptr
757@param[in] page_size page size
758@param[in] mtr mtr
759@return true if tree modification is needed */
760static
761bool
762btr_cur_will_modify_tree(
763 dict_index_t* index,
764 const page_t* page,
765 btr_intention_t lock_intention,
766 const rec_t* rec,
767 ulint rec_size,
768 const page_size_t& page_size,
769 mtr_t* mtr)
770{
771 ut_ad(!page_is_leaf(page));
772 ut_ad(mtr_memo_contains_flagged(mtr, dict_index_get_lock(index),
773 MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK));
774
	/* A pessimistic delete of the first record causes a delete & insert
	of the node_ptr at the upper level, and a subsequent page shrink is
	possible, which again deletes a node_ptr at the upper level.
	So we should pay attention not only to the first and last records
	but also to the 2nd record: if the "delete & insert" end up on a
	different page, the 2nd record becomes the first record, and a
	following compress might delete it, causing another node_ptr
	modification at the upper level. */
783
784 if (lock_intention <= BTR_INTENTION_BOTH) {
785 ulint margin;
786
		/* Check what a delete could cause (BTR_INTENTION_BOTH
		or BTR_INTENTION_DELETE). */
		/* The first, 2nd, 2nd-last and last records make 4 records. */
790 if (page_get_n_recs(page) < 5) {
791 return(true);
792 }
793
794 /* is first, 2nd or last record */
795 if (page_rec_is_first(rec, page)
796 || (page_has_next(page)
797 && (page_rec_is_last(rec, page)
798 || page_rec_is_second_last(rec, page)))
799 || (page_has_prev(page)
800 && page_rec_is_second(rec, page))) {
801 return(true);
802 }
803
804 if (lock_intention == BTR_INTENTION_BOTH) {
805 /* Delete at leftmost record in a page causes delete
806 & insert at its parent page. After that, the delete
807 might cause btr_compress() and delete record at its
808 parent page. Thus we should consider max 2 deletes. */
809
810 margin = rec_size * 2;
811 } else {
812 ut_ad(lock_intention == BTR_INTENTION_DELETE);
813
814 margin = rec_size;
815 }
		/* NOTE: call mach_read_from_4() directly to avoid an assertion
		failure. It is safe because we already hold an SX latch on the
		index tree. */
819 if (page_get_data_size(page)
820 < margin + BTR_CUR_PAGE_COMPRESS_LIMIT(index)
821 || (mach_read_from_4(page + FIL_PAGE_NEXT)
822 == FIL_NULL
823 && mach_read_from_4(page + FIL_PAGE_PREV)
824 == FIL_NULL)) {
825 return(true);
826 }
827 }
828
829 if (lock_intention >= BTR_INTENTION_BOTH) {
		/* Check what an insert could cause (BTR_INTENTION_BOTH
		or BTR_INTENTION_INSERT). */
832
		/* When btr_cur_limit_optimistic_insert_debug is in effect,
		we must check it here in advance, since the maximum number
		of records allowed in a page is limited. */
836 LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page),
837 return(true));
838
		/* We need space for 2 records, for the case that a single
		split and insert cannot fit.
		page_get_max_insert_size_after_reorganize() already includes
		space for the page directory. */
843 ulint max_size
844 = page_get_max_insert_size_after_reorganize(page, 2);
845
846 if (max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT + rec_size
847 || max_size < rec_size * 2) {
848 return(true);
849 }
		/* TODO: optimize this condition for compressed pages.
		This is based on the worst compression rate.
		Currently we look only at the uncompressed page, but we could
		also consult the compressed page via page_zip_available()
		if it is already in the buffer pool. */
		/* We need space for 2 records also at the worst compression
		rate. */
856 if (page_size.is_compressed()
857 && page_zip_empty_size(index->n_fields,
858 page_size.physical())
859 < rec_size * 2 + page_get_data_size(page)
860 + page_dir_calc_reserved_space(
861 ulint(page_get_n_recs(page)) + 2) + 1) {
862 return(true);
863 }
864 }
865
866 return(false);
867}
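
/* Worked example of the insert-side check above: with the default 16KiB
page size, BTR_CUR_PAGE_REORGANIZE_LIMIT is 512 bytes. For a node pointer
of rec_size = 200 bytes, the function reports a possible tree modification
(so the caller keeps the parent latches) unless
page_get_max_insert_size_after_reorganize(page, 2) is at least
max(512 + 200, 2 * 200) = 712 bytes. */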
868
/** Detects whether modifying the record might require a modification
opposite to the intention.
@param[in]	page	page
@param[in]	lock_intention	lock intention for the tree operation
@param[in]	rec	record (current node_ptr)
@return true if a modification opposite to the intention is needed */
875static
876bool
877btr_cur_need_opposite_intention(
878 const page_t* page,
879 btr_intention_t lock_intention,
880 const rec_t* rec)
881{
882 switch (lock_intention) {
883 case BTR_INTENTION_DELETE:
884 return (page_has_prev(page) && page_rec_is_first(rec, page)) ||
885 (page_has_next(page) && page_rec_is_last(rec, page));
886 case BTR_INTENTION_INSERT:
887 return page_has_next(page) && page_rec_is_last(rec, page);
888 case BTR_INTENTION_BOTH:
889 return(false);
890 }
891
892 ut_error;
893 return(false);
894}
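
/* For example, with BTR_INTENTION_DELETE, deleting the first user record
of a page that has a left sibling changes the key of the node pointer that
points to the page, which requires a node pointer delete & insert at the
parent level, the opposite of the delete intention. The caller in
btr_cur_search_to_nth_level_func() then restarts the search with
BTR_INTENTION_BOTH (see the need_opposite_intention handling below). */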
895
896/********************************************************************//**
897Searches an index tree and positions a tree cursor on a given level.
898NOTE: n_fields_cmp in tuple must be set so that it cannot be compared
899to node pointer page number fields on the upper levels of the tree!
900Note that if mode is PAGE_CUR_LE, which is used in inserts, then
901cursor->up_match and cursor->low_match both will have sensible values.
If mode is PAGE_CUR_GE, then up_match will have a sensible value.

If mode is PAGE_CUR_LE, cursor is left at the place where an insert of the
905search tuple should be performed in the B-tree. InnoDB does an insert
906immediately after the cursor. Thus, the cursor may end up on a user record,
907or on a page infimum record. */
908dberr_t
909btr_cur_search_to_nth_level_func(
910 dict_index_t* index, /*!< in: index */
911 ulint level, /*!< in: the tree level of search */
912 const dtuple_t* tuple, /*!< in: data tuple; NOTE: n_fields_cmp in
913 tuple must be set so that it cannot get
914 compared to the node ptr page number field! */
915 page_cur_mode_t mode, /*!< in: PAGE_CUR_L, ...;
916 Inserts should always be made using
917 PAGE_CUR_LE to search the position! */
918 ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ..., ORed with
919 at most one of BTR_INSERT, BTR_DELETE_MARK,
920 BTR_DELETE, or BTR_ESTIMATE;
921 cursor->left_block is used to store a pointer
922 to the left neighbor page, in the cases
923 BTR_SEARCH_PREV and BTR_MODIFY_PREV;
924 NOTE that if ahi_latch, we might not have a
925 cursor page latch, we assume that ahi_latch
926 protects the record! */
927 btr_cur_t* cursor, /*!< in/out: tree cursor; the cursor page is
928 s- or x-latched, but see also above! */
929#ifdef BTR_CUR_HASH_ADAPT
930 rw_lock_t* ahi_latch,
931 /*!< in: currently held btr_search_latch
932 (in RW_S_LATCH mode), or NULL */
933#endif /* BTR_CUR_HASH_ADAPT */
934 const char* file, /*!< in: file name */
935 unsigned line, /*!< in: line where called */
936 mtr_t* mtr, /*!< in: mtr */
937 ib_uint64_t autoinc)/*!< in: PAGE_ROOT_AUTO_INC to be written
938 (0 if none) */
939{
940 page_t* page = NULL; /* remove warning */
941 buf_block_t* block;
942 buf_block_t* guess;
943 ulint height;
944 ulint up_match;
945 ulint up_bytes;
946 ulint low_match;
947 ulint low_bytes;
948 ulint savepoint;
949 ulint rw_latch;
950 page_cur_mode_t page_mode;
951 page_cur_mode_t search_mode = PAGE_CUR_UNSUPP;
952 ulint buf_mode;
953 ulint estimate;
954 ulint node_ptr_max_size = srv_page_size / 2;
955 page_cur_t* page_cursor;
956 btr_op_t btr_op;
957 ulint root_height = 0; /* remove warning */
958 dberr_t err = DB_SUCCESS;
959
960 ulint upper_rw_latch, root_leaf_rw_latch;
961 btr_intention_t lock_intention;
962 bool modify_external;
963 buf_block_t* tree_blocks[BTR_MAX_LEVELS];
964 ulint tree_savepoints[BTR_MAX_LEVELS];
965 ulint n_blocks = 0;
966 ulint n_releases = 0;
967 bool detected_same_key_root = false;
968
969 bool retrying_for_search_prev = false;
970 ulint leftmost_from_level = 0;
971 buf_block_t** prev_tree_blocks = NULL;
972 ulint* prev_tree_savepoints = NULL;
973 ulint prev_n_blocks = 0;
974 ulint prev_n_releases = 0;
975 bool need_path = true;
976 bool rtree_parent_modified = false;
977 bool mbr_adj = false;
978 bool found = false;
979
980 DBUG_ENTER("btr_cur_search_to_nth_level");
981
982#ifdef BTR_CUR_ADAPT
983 btr_search_t* info;
984#endif /* BTR_CUR_ADAPT */
985 mem_heap_t* heap = NULL;
986 ulint offsets_[REC_OFFS_NORMAL_SIZE];
987 ulint* offsets = offsets_;
988 ulint offsets2_[REC_OFFS_NORMAL_SIZE];
989 ulint* offsets2 = offsets2_;
990 rec_offs_init(offsets_);
991 rec_offs_init(offsets2_);
	/* Currently, PAGE_CUR_LE is the only search mode used for searches
	ending at the upper levels. */
994
995 ut_ad(level == 0 || mode == PAGE_CUR_LE
996 || RTREE_SEARCH_MODE(mode));
997 ut_ad(dict_index_check_search_tuple(index, tuple));
998 ut_ad(!dict_index_is_ibuf(index) || ibuf_inside(mtr));
999 ut_ad(dtuple_check_typed(tuple));
1000 ut_ad(!(index->type & DICT_FTS));
1001 ut_ad(index->page != FIL_NULL);
1002
1003 UNIV_MEM_INVALID(&cursor->up_match, sizeof cursor->up_match);
1004 UNIV_MEM_INVALID(&cursor->up_bytes, sizeof cursor->up_bytes);
1005 UNIV_MEM_INVALID(&cursor->low_match, sizeof cursor->low_match);
1006 UNIV_MEM_INVALID(&cursor->low_bytes, sizeof cursor->low_bytes);
1007#ifdef UNIV_DEBUG
1008 cursor->up_match = ULINT_UNDEFINED;
1009 cursor->low_match = ULINT_UNDEFINED;
1010#endif /* UNIV_DEBUG */
1011
1012 ibool s_latch_by_caller;
1013
1014 s_latch_by_caller = latch_mode & BTR_ALREADY_S_LATCHED;
1015
1016 ut_ad(!s_latch_by_caller
1017 || srv_read_only_mode
1018 || mtr_memo_contains_flagged(mtr,
1019 dict_index_get_lock(index),
1020 MTR_MEMO_S_LOCK
1021 | MTR_MEMO_SX_LOCK));
1022
1023 /* These flags are mutually exclusive, they are lumped together
1024 with the latch mode for historical reasons. It's possible for
1025 none of the flags to be set. */
1026 switch (UNIV_EXPECT(latch_mode
1027 & (BTR_INSERT | BTR_DELETE | BTR_DELETE_MARK),
1028 0)) {
1029 case 0:
1030 btr_op = BTR_NO_OP;
1031 break;
1032 case BTR_INSERT:
1033 btr_op = (latch_mode & BTR_IGNORE_SEC_UNIQUE)
1034 ? BTR_INSERT_IGNORE_UNIQUE_OP
1035 : BTR_INSERT_OP;
1036 break;
1037 case BTR_DELETE:
1038 btr_op = BTR_DELETE_OP;
1039 ut_a(cursor->purge_node);
1040 break;
1041 case BTR_DELETE_MARK:
1042 btr_op = BTR_DELMARK_OP;
1043 break;
1044 default:
1045 /* only one of BTR_INSERT, BTR_DELETE, BTR_DELETE_MARK
1046 should be specified at a time */
1047 ut_error;
1048 }
1049
1050 /* Operations on the insert buffer tree cannot be buffered. */
1051 ut_ad(btr_op == BTR_NO_OP || !dict_index_is_ibuf(index));
1052 /* Operations on the clustered index cannot be buffered. */
1053 ut_ad(btr_op == BTR_NO_OP || !dict_index_is_clust(index));
1054 /* Operations on the temporary table(indexes) cannot be buffered. */
1055 ut_ad(btr_op == BTR_NO_OP || !index->table->is_temporary());
1056 /* Operation on the spatial index cannot be buffered. */
1057 ut_ad(btr_op == BTR_NO_OP || !dict_index_is_spatial(index));
1058
1059 estimate = latch_mode & BTR_ESTIMATE;
1060
1061 lock_intention = btr_cur_get_and_clear_intention(&latch_mode);
1062
1063 modify_external = latch_mode & BTR_MODIFY_EXTERNAL;
1064
1065 /* Turn the flags unrelated to the latch mode off. */
1066 latch_mode = BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode);
1067
1068 ut_ad(!modify_external || latch_mode == BTR_MODIFY_LEAF);
1069
1070 ut_ad(!s_latch_by_caller
1071 || latch_mode == BTR_SEARCH_LEAF
1072 || latch_mode == BTR_SEARCH_TREE
1073 || latch_mode == BTR_MODIFY_LEAF);
1074
1075 ut_ad(autoinc == 0 || dict_index_is_clust(index));
1076 ut_ad(autoinc == 0
1077 || latch_mode == BTR_MODIFY_TREE
1078 || latch_mode == BTR_MODIFY_LEAF);
1079 ut_ad(autoinc == 0 || level == 0);
1080
1081 cursor->flag = BTR_CUR_BINARY;
1082 cursor->index = index;
1083
1084#ifndef BTR_CUR_ADAPT
1085 guess = NULL;
1086#else
1087 info = btr_search_get_info(index);
1088
1089 if (!buf_pool_is_obsolete(info->withdraw_clock)) {
1090 guess = info->root_guess;
1091 } else {
1092 guess = NULL;
1093 }
1094
1095#ifdef BTR_CUR_HASH_ADAPT
1096
1097# ifdef UNIV_SEARCH_PERF_STAT
1098 info->n_searches++;
1099# endif
1100 if (autoinc == 0
1101 && latch_mode <= BTR_MODIFY_LEAF
1102 && info->last_hash_succ
1103# ifdef MYSQL_INDEX_DISABLE_AHI
1104 && !index->disable_ahi
1105# endif
1106 && !estimate
1107# ifdef PAGE_CUR_LE_OR_EXTENDS
1108 && mode != PAGE_CUR_LE_OR_EXTENDS
1109# endif /* PAGE_CUR_LE_OR_EXTENDS */
1110 && !dict_index_is_spatial(index)
1111 /* If !ahi_latch, we do a dirty read of
1112 btr_search_enabled below, and btr_search_guess_on_hash()
1113 will have to check it again. */
1114 && btr_search_enabled
1115 && !modify_external
1116 && !(tuple->info_bits & REC_INFO_MIN_REC_FLAG)
1117 && btr_search_guess_on_hash(index, info, tuple, mode,
1118 latch_mode, cursor,
1119 ahi_latch, mtr)) {
1120
1121 /* Search using the hash index succeeded */
1122
1123 ut_ad(cursor->up_match != ULINT_UNDEFINED
1124 || mode != PAGE_CUR_GE);
1125 ut_ad(cursor->up_match != ULINT_UNDEFINED
1126 || mode != PAGE_CUR_LE);
1127 ut_ad(cursor->low_match != ULINT_UNDEFINED
1128 || mode != PAGE_CUR_LE);
1129 btr_cur_n_sea++;
1130
1131 DBUG_RETURN(err);
1132 }
1133# endif /* BTR_CUR_HASH_ADAPT */
1134#endif /* BTR_CUR_ADAPT */
1135 btr_cur_n_non_sea++;
1136
1137 /* If the hash search did not succeed, do binary search down the
1138 tree */
1139
1140#ifdef BTR_CUR_HASH_ADAPT
1141 if (ahi_latch) {
1142 /* Release possible search latch to obey latching order */
1143 rw_lock_s_unlock(ahi_latch);
1144 }
1145#endif /* BTR_CUR_HASH_ADAPT */
1146
1147 /* Store the position of the tree latch we push to mtr so that we
1148 know how to release it when we have latched leaf node(s) */
1149
1150 savepoint = mtr_set_savepoint(mtr);
1151
1152 switch (latch_mode) {
1153 case BTR_MODIFY_TREE:
		/* Most delete-intended operations are purges.
		Free blocks and read I/O bandwidth should be given to them
		with priority when the history list is growing huge. */
1157 if (lock_intention == BTR_INTENTION_DELETE
1158 && trx_sys.history_size() > BTR_CUR_FINE_HISTORY_LENGTH
1159 && buf_get_n_pending_read_ios()) {
1160 mtr_x_lock(dict_index_get_lock(index), mtr);
1161 } else if (dict_index_is_spatial(index)
1162 && lock_intention <= BTR_INTENTION_BOTH) {
			/* X-latch the index if there is a possibility of a
			pessimistic delete on the spatial index, as we could
			latch upward in the tree. */
1166
1167 mtr_x_lock(dict_index_get_lock(index), mtr);
1168 } else {
1169 mtr_sx_lock(dict_index_get_lock(index), mtr);
1170 }
1171 upper_rw_latch = RW_X_LATCH;
1172 break;
1173 case BTR_CONT_MODIFY_TREE:
1174 case BTR_CONT_SEARCH_TREE:
1175 /* Do nothing */
1176 ut_ad(srv_read_only_mode
1177 || mtr_memo_contains_flagged(mtr,
1178 dict_index_get_lock(index),
1179 MTR_MEMO_X_LOCK
1180 | MTR_MEMO_SX_LOCK));
1181 if (dict_index_is_spatial(index)
1182 && latch_mode == BTR_CONT_MODIFY_TREE) {
			/* If we are about to locate the parent page for a
			split and/or merge operation on an R-tree index,
			X-latch the parent. */
1186 upper_rw_latch = RW_X_LATCH;
1187 } else {
1188 upper_rw_latch = RW_NO_LATCH;
1189 }
1190 break;
1191 default:
1192 if (!srv_read_only_mode) {
1193 if (s_latch_by_caller) {
1194 ut_ad(rw_lock_own(dict_index_get_lock(index),
1195 RW_LOCK_S));
1196 } else if (!modify_external) {
1197 /* BTR_SEARCH_TREE is intended to be used with
1198 BTR_ALREADY_S_LATCHED */
1199 ut_ad(latch_mode != BTR_SEARCH_TREE);
1200
1201 mtr_s_lock(dict_index_get_lock(index), mtr);
1202 } else {
1203 /* BTR_MODIFY_EXTERNAL needs to be excluded */
1204 mtr_sx_lock(dict_index_get_lock(index), mtr);
1205 }
1206 upper_rw_latch = RW_S_LATCH;
1207 } else {
1208 upper_rw_latch = RW_NO_LATCH;
1209 }
1210 }
1211 root_leaf_rw_latch = btr_cur_latch_for_root_leaf(latch_mode);
1212
1213 page_cursor = btr_cur_get_page_cur(cursor);
1214
1215 const page_size_t page_size(index->table->space->flags);
1216
1217 /* Start with the root page. */
1218 page_id_t page_id(index->table->space->id, index->page);
1219
1220 if (root_leaf_rw_latch == RW_X_LATCH) {
1221 node_ptr_max_size = dict_index_node_ptr_max_size(index);
1222 }
1223
1224 up_match = 0;
1225 up_bytes = 0;
1226 low_match = 0;
1227 low_bytes = 0;
1228
1229 height = ULINT_UNDEFINED;
1230
1231 /* We use these modified search modes on non-leaf levels of the
1232 B-tree. These let us end up in the right B-tree leaf. In that leaf
1233 we use the original search mode. */
1234
1235 switch (mode) {
1236 case PAGE_CUR_GE:
1237 page_mode = PAGE_CUR_L;
1238 break;
1239 case PAGE_CUR_G:
1240 page_mode = PAGE_CUR_LE;
1241 break;
1242 default:
1243#ifdef PAGE_CUR_LE_OR_EXTENDS
1244 ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE
1245 || RTREE_SEARCH_MODE(mode)
1246 || mode == PAGE_CUR_LE_OR_EXTENDS);
1247#else /* PAGE_CUR_LE_OR_EXTENDS */
1248 ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE
1249 || RTREE_SEARCH_MODE(mode));
1250#endif /* PAGE_CUR_LE_OR_EXTENDS */
1251 page_mode = mode;
1252 break;
1253 }
1254
1255 /* Loop and search until we arrive at the desired level */
1256 btr_latch_leaves_t latch_leaves = {{NULL, NULL, NULL}, {0, 0, 0}};
1257
1258search_loop:
1259 buf_mode = BUF_GET;
1260 rw_latch = RW_NO_LATCH;
1261 rtree_parent_modified = false;
1262
1263 if (height != 0) {
1264 /* We are about to fetch the root or a non-leaf page. */
1265 if ((latch_mode != BTR_MODIFY_TREE || height == level)
1266 && !retrying_for_search_prev) {
			/* If we do not hold an SX or X latch on the index,
			each page should be latched before reading. */
1269 if (height == ULINT_UNDEFINED
1270 && upper_rw_latch == RW_S_LATCH
1271 && (modify_external || autoinc)) {
1272 /* needs sx-latch of root page
1273 for fseg operation or for writing
1274 PAGE_ROOT_AUTO_INC */
1275 rw_latch = RW_SX_LATCH;
1276 } else {
1277 rw_latch = upper_rw_latch;
1278 }
1279 }
1280 } else if (latch_mode <= BTR_MODIFY_LEAF) {
1281 rw_latch = latch_mode;
1282
1283 if (btr_op != BTR_NO_OP
1284 && ibuf_should_try(index, btr_op != BTR_INSERT_OP)) {
1285
1286 /* Try to buffer the operation if the leaf
1287 page is not in the buffer pool. */
1288
1289 buf_mode = btr_op == BTR_DELETE_OP
1290 ? BUF_GET_IF_IN_POOL_OR_WATCH
1291 : BUF_GET_IF_IN_POOL;
1292 }
1293 }
1294
1295retry_page_get:
1296 ut_ad(n_blocks < BTR_MAX_LEVELS);
1297 tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
1298 block = buf_page_get_gen(page_id, page_size, rw_latch, guess,
1299 buf_mode, file, line, mtr, &err);
1300 tree_blocks[n_blocks] = block;
1301
1302 /* Note that block==NULL signifies either an error or change
1303 buffering. */
1304
1305 if (err != DB_SUCCESS) {
1306 ut_ad(block == NULL);
1307 if (err == DB_DECRYPTION_FAILED) {
1308 ib_push_warning((void *)NULL,
1309 DB_DECRYPTION_FAILED,
1310 "Table %s is encrypted but encryption service or"
1311 " used key_id is not available. "
1312 " Can't continue reading table.",
1313 index->table->name);
1314 index->table->file_unreadable = true;
1315 }
1316
1317 goto func_exit;
1318 }
1319
1320 if (block == NULL) {
		/* This must be a search to perform an insert, delete
		mark, or delete; try using the insert/delete buffer. */
1323
1324 ut_ad(height == 0);
1325 ut_ad(cursor->thr);
1326
1327 switch (btr_op) {
1328 case BTR_INSERT_OP:
1329 case BTR_INSERT_IGNORE_UNIQUE_OP:
1330 ut_ad(buf_mode == BUF_GET_IF_IN_POOL);
1331 ut_ad(!dict_index_is_spatial(index));
1332
1333 if (ibuf_insert(IBUF_OP_INSERT, tuple, index,
1334 page_id, page_size, cursor->thr)) {
1335
1336 cursor->flag = BTR_CUR_INSERT_TO_IBUF;
1337
1338 goto func_exit;
1339 }
1340 break;
1341
1342 case BTR_DELMARK_OP:
1343 ut_ad(buf_mode == BUF_GET_IF_IN_POOL);
1344 ut_ad(!dict_index_is_spatial(index));
1345
1346 if (ibuf_insert(IBUF_OP_DELETE_MARK, tuple,
1347 index, page_id, page_size,
1348 cursor->thr)) {
1349
1350 cursor->flag = BTR_CUR_DEL_MARK_IBUF;
1351
1352 goto func_exit;
1353 }
1354
1355 break;
1356
1357 case BTR_DELETE_OP:
1358 ut_ad(buf_mode == BUF_GET_IF_IN_POOL_OR_WATCH);
1359 ut_ad(!dict_index_is_spatial(index));
1360
1361 if (!row_purge_poss_sec(cursor->purge_node,
1362 index, tuple)) {
1363
1364 /* The record cannot be purged yet. */
1365 cursor->flag = BTR_CUR_DELETE_REF;
1366 } else if (ibuf_insert(IBUF_OP_DELETE, tuple,
1367 index, page_id, page_size,
1368 cursor->thr)) {
1369
1370 /* The purge was buffered. */
1371 cursor->flag = BTR_CUR_DELETE_IBUF;
1372 } else {
1373 /* The purge could not be buffered. */
1374 buf_pool_watch_unset(page_id);
1375 break;
1376 }
1377
1378 buf_pool_watch_unset(page_id);
1379 goto func_exit;
1380
1381 default:
1382 ut_error;
1383 }
1384
1385 /* Insert to the insert/delete buffer did not succeed, we
1386 must read the page from disk. */
1387
1388 buf_mode = BUF_GET;
1389
1390 goto retry_page_get;
1391 }
1392
1393 if (retrying_for_search_prev && height != 0) {
1394 /* also latch left sibling */
1395 ulint left_page_no;
1396 buf_block_t* get_block;
1397
1398 ut_ad(rw_latch == RW_NO_LATCH);
1399
1400 rw_latch = upper_rw_latch;
1401
1402 rw_lock_s_lock(&block->lock);
1403 left_page_no = btr_page_get_prev(
1404 buf_block_get_frame(block), mtr);
1405 rw_lock_s_unlock(&block->lock);
1406
1407 if (left_page_no != FIL_NULL) {
1408 ut_ad(prev_n_blocks < leftmost_from_level);
1409
1410 prev_tree_savepoints[prev_n_blocks]
1411 = mtr_set_savepoint(mtr);
1412 get_block = buf_page_get_gen(
1413 page_id_t(page_id.space(), left_page_no),
1414 page_size, rw_latch, NULL, buf_mode,
1415 file, line, mtr, &err);
1416 prev_tree_blocks[prev_n_blocks] = get_block;
1417 prev_n_blocks++;
1418
1419 if (err != DB_SUCCESS) {
1420 if (err == DB_DECRYPTION_FAILED) {
1421 ib_push_warning((void *)NULL,
1422 DB_DECRYPTION_FAILED,
1423 "Table %s is encrypted but encryption service or"
1424 " used key_id is not available. "
1425 " Can't continue reading table.",
1426 index->table->name);
1427 index->table->file_unreadable = true;
1428 }
1429
1430 goto func_exit;
1431 }
1432
			/* BTR_MODIFY_TREE does not update prev/next_page_no
			without holding the parent page's lock. So there is
			no need to retry here, because we hold the parent
			page's lock. */
1436 }
1437
1438 /* release RW_NO_LATCH page and lock with RW_S_LATCH */
1439 mtr_release_block_at_savepoint(
1440 mtr, tree_savepoints[n_blocks],
1441 tree_blocks[n_blocks]);
1442
1443 tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
1444 block = buf_page_get_gen(page_id, page_size, rw_latch, NULL,
1445 buf_mode, file, line, mtr, &err);
1446 tree_blocks[n_blocks] = block;
1447
1448 if (err != DB_SUCCESS) {
1449 if (err == DB_DECRYPTION_FAILED) {
1450 ib_push_warning((void *)NULL,
1451 DB_DECRYPTION_FAILED,
1452 "Table %s is encrypted but encryption service or"
1453 " used key_id is not available. "
1454 " Can't continue reading table.",
1455 index->table->name);
1456 index->table->file_unreadable = true;
1457 }
1458
1459 goto func_exit;
1460 }
1461 }
1462
1463 page = buf_block_get_frame(block);
1464
1465 if (height == ULINT_UNDEFINED
1466 && page_is_leaf(page)
1467 && rw_latch != RW_NO_LATCH
1468 && rw_latch != root_leaf_rw_latch) {
1469 /* The root page is also a leaf page (root_leaf).
1470 We should reacquire the page, because the root page
1471 is latched differently from leaf pages. */
1472 ut_ad(root_leaf_rw_latch != RW_NO_LATCH);
1473 ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_SX_LATCH);
1474 ut_ad(rw_latch == RW_S_LATCH || modify_external || autoinc);
1475 ut_ad(!autoinc || root_leaf_rw_latch == RW_X_LATCH);
1476
1477 ut_ad(n_blocks == 0);
1478 mtr_release_block_at_savepoint(
1479 mtr, tree_savepoints[n_blocks],
1480 tree_blocks[n_blocks]);
1481
1482 upper_rw_latch = root_leaf_rw_latch;
1483 goto search_loop;
1484 }
1485
1486 if (rw_latch != RW_NO_LATCH) {
1487#ifdef UNIV_ZIP_DEBUG
1488 const page_zip_des_t* page_zip
1489 = buf_block_get_page_zip(block);
1490 ut_a(!page_zip || page_zip_validate(page_zip, page, index));
1491#endif /* UNIV_ZIP_DEBUG */
1492
1493 buf_block_dbg_add_level(
1494 block, dict_index_is_ibuf(index)
1495 ? SYNC_IBUF_TREE_NODE : SYNC_TREE_NODE);
1496 }
1497
1498 ut_ad(fil_page_index_page_check(page));
1499 ut_ad(index->id == btr_page_get_index_id(page));
1500
1501 if (height == ULINT_UNDEFINED) {
1502 /* We are in the root node */
1503
1504 height = btr_page_get_level(page);
1505 root_height = height;
1506 cursor->tree_height = root_height + 1;
1507
1508 if (dict_index_is_spatial(index)) {
1509 ut_ad(cursor->rtr_info);
1510
1511 node_seq_t seq_no = rtr_get_current_ssn_id(index);
1512
1513 /* If SSN in memory is not initialized, fetch
1514 it from root page */
1515 if (seq_no < 1) {
1516 node_seq_t root_seq_no;
1517
1518 root_seq_no = page_get_ssn_id(page);
1519
1520 mutex_enter(&(index->rtr_ssn.mutex));
1521 index->rtr_ssn.seq_no = root_seq_no + 1;
1522 mutex_exit(&(index->rtr_ssn.mutex));
1523 }
1524
1525 /* Save the MBR */
1526 cursor->rtr_info->thr = cursor->thr;
1527 rtr_get_mbr_from_tuple(tuple, &cursor->rtr_info->mbr);
1528 }
1529
1530#ifdef BTR_CUR_ADAPT
1531 if (block != guess) {
1532 info->root_guess = block;
1533 info->withdraw_clock = buf_withdraw_clock;
1534 }
1535#endif
1536 }
1537
1538 if (height == 0) {
1539 if (rw_latch == RW_NO_LATCH) {
1540
1541 latch_leaves = btr_cur_latch_leaves(
1542 block, page_id, page_size, latch_mode,
1543 cursor, mtr);
1544 }
1545
1546 switch (latch_mode) {
1547 case BTR_MODIFY_TREE:
1548 case BTR_CONT_MODIFY_TREE:
1549 case BTR_CONT_SEARCH_TREE:
1550 break;
1551 default:
1552 if (!s_latch_by_caller
1553 && !srv_read_only_mode
1554 && !modify_external) {
1555 /* Release the tree s-latch */
1556 /* NOTE: BTR_MODIFY_EXTERNAL
1557 needs to keep tree sx-latch */
1558 mtr_release_s_latch_at_savepoint(
1559 mtr, savepoint,
1560 dict_index_get_lock(index));
1561 }
1562
1563 /* release upper blocks */
1564 if (retrying_for_search_prev) {
1565 ut_ad(!autoinc);
1566 for (;
1567 prev_n_releases < prev_n_blocks;
1568 prev_n_releases++) {
1569 mtr_release_block_at_savepoint(
1570 mtr,
1571 prev_tree_savepoints[
1572 prev_n_releases],
1573 prev_tree_blocks[
1574 prev_n_releases]);
1575 }
1576 }
1577
1578 for (; n_releases < n_blocks; n_releases++) {
1579 if (n_releases == 0
1580 && (modify_external || autoinc)) {
1581 /* keep the root page latch */
1582 ut_ad(mtr_memo_contains_flagged(
1583 mtr, tree_blocks[n_releases],
1584 MTR_MEMO_PAGE_SX_FIX
1585 | MTR_MEMO_PAGE_X_FIX));
1586 continue;
1587 }
1588
1589 mtr_release_block_at_savepoint(
1590 mtr, tree_savepoints[n_releases],
1591 tree_blocks[n_releases]);
1592 }
1593 }
1594
1595 page_mode = mode;
1596 }
1597
1598 if (dict_index_is_spatial(index)) {
1599 /* Remember the page search mode */
1600 search_mode = page_mode;
1601
		/* Adjust the search mode when the page search mode is
		PAGE_CUR_RTREE_LOCATE or PAGE_CUR_RTREE_INSERT, as we are
		searching with MBRs. When we are not at the target level,
		we should search all sub-trees that "CONTAIN" the search
		range/MBR. At the target level, the search becomes
		PAGE_CUR_LE. */
1609 if (page_mode == PAGE_CUR_RTREE_LOCATE
1610 && level == height) {
1611 if (level == 0) {
1612 page_mode = PAGE_CUR_LE;
1613 } else {
1614 page_mode = PAGE_CUR_RTREE_GET_FATHER;
1615 }
1616 }
1617
1618 if (page_mode == PAGE_CUR_RTREE_INSERT) {
1619 page_mode = (level == height)
1620 ? PAGE_CUR_LE
1621 : PAGE_CUR_RTREE_INSERT;
1622
1623 ut_ad(!page_is_leaf(page) || page_mode == PAGE_CUR_LE);
1624 }
1625
		/* "need_path" indicates whether we need to track the parent
		pages; if it is not a spatial comparison, there is no need
		to track them. */
1629 if (page_mode < PAGE_CUR_CONTAIN) {
1630 need_path = false;
1631 }
1632
1633 up_match = 0;
1634 low_match = 0;
1635
1636 if (latch_mode == BTR_MODIFY_TREE
1637 || latch_mode == BTR_CONT_MODIFY_TREE
1638 || latch_mode == BTR_CONT_SEARCH_TREE) {
			/* The tree is locked; no page lock is needed to
			protect the "path". */
1641 cursor->rtr_info->need_page_lock = false;
1642 }
1643 }
1644
1645 if (dict_index_is_spatial(index) && page_mode >= PAGE_CUR_CONTAIN) {
1646 ut_ad(need_path);
1647 found = rtr_cur_search_with_match(
1648 block, index, tuple, page_mode, page_cursor,
1649 cursor->rtr_info);
1650
1651 /* Need to use BTR_MODIFY_TREE to do the MBR adjustment */
1652 if (search_mode == PAGE_CUR_RTREE_INSERT
1653 && cursor->rtr_info->mbr_adj) {
1654 if (latch_mode & BTR_MODIFY_LEAF) {
				/* The parent MBR needs to be updated;
				we should retry with BTR_MODIFY_TREE. */
1657 goto func_exit;
1658 } else if (latch_mode & BTR_MODIFY_TREE) {
1659 rtree_parent_modified = true;
1660 cursor->rtr_info->mbr_adj = false;
1661 mbr_adj = true;
1662 } else {
1663 ut_ad(0);
1664 }
1665 }
1666
1667 if (found && page_mode == PAGE_CUR_RTREE_GET_FATHER) {
1668 cursor->low_match =
1669 DICT_INDEX_SPATIAL_NODEPTR_SIZE + 1;
1670 }
1671#ifdef BTR_CUR_HASH_ADAPT
1672 } else if (height == 0 && btr_search_enabled
1673 && !(tuple->info_bits & REC_INFO_MIN_REC_FLAG)
1674 && !dict_index_is_spatial(index)) {
1675 /* The adaptive hash index is only used when searching
1676 for leaf pages (height==0), but not in r-trees.
1677 We only need the byte prefix comparison for the purpose
1678 of updating the adaptive hash index. */
1679 page_cur_search_with_match_bytes(
1680 block, index, tuple, page_mode, &up_match, &up_bytes,
1681 &low_match, &low_bytes, page_cursor);
1682#endif /* BTR_CUR_HASH_ADAPT */
1683 } else {
1684 /* Search for complete index fields. */
1685 up_bytes = low_bytes = 0;
1686 page_cur_search_with_match(
1687 block, index, tuple, page_mode, &up_match,
1688 &low_match, page_cursor,
1689 need_path ? cursor->rtr_info : NULL);
1690 }
1691
1692 if (estimate) {
1693 btr_cur_add_path_info(cursor, height, root_height);
1694 }
1695
1696 /* If this is the desired level, leave the loop */
1697
1698 ut_ad(height == btr_page_get_level(page_cur_get_page(page_cursor)));
1699
	/* Add a predicate lock if the isolation level is serializable,
	and only in the search case. */
1702 if (dict_index_is_spatial(index)
1703 && cursor->rtr_info->need_prdt_lock
1704 && mode != PAGE_CUR_RTREE_INSERT
1705 && mode != PAGE_CUR_RTREE_LOCATE
1706 && mode >= PAGE_CUR_CONTAIN) {
1707 trx_t* trx = thr_get_trx(cursor->thr);
1708 lock_prdt_t prdt;
1709
1710 lock_mutex_enter();
1711 lock_init_prdt_from_mbr(
1712 &prdt, &cursor->rtr_info->mbr, mode,
1713 trx->lock.lock_heap);
1714 lock_mutex_exit();
1715
1716 if (rw_latch == RW_NO_LATCH && height != 0) {
1717 rw_lock_s_lock(&(block->lock));
1718 }
1719
1720 lock_prdt_lock(block, &prdt, index, LOCK_S,
1721 LOCK_PREDICATE, cursor->thr);
1722
1723 if (rw_latch == RW_NO_LATCH && height != 0) {
1724 rw_lock_s_unlock(&(block->lock));
1725 }
1726 }
1727
1728 if (level != height) {
1729
1730 const rec_t* node_ptr;
1731 ut_ad(height > 0);
1732
1733 height--;
1734 guess = NULL;
1735
1736 node_ptr = page_cur_get_rec(page_cursor);
1737
1738 offsets = rec_get_offsets(node_ptr, index, offsets, false,
1739 ULINT_UNDEFINED, &heap);
1740
		/* If the rec is the first or last in the page and the
		intention is pessimistic delete, it might cause a node_ptr
		insert at the upper level. We should change the intention
		and retry. */
1745 if (latch_mode == BTR_MODIFY_TREE
1746 && btr_cur_need_opposite_intention(
1747 page, lock_intention, node_ptr)) {
1748
1749need_opposite_intention:
1750 ut_ad(upper_rw_latch == RW_X_LATCH);
1751
1752 if (n_releases > 0) {
1753 /* release root block */
1754 mtr_release_block_at_savepoint(
1755 mtr, tree_savepoints[0],
1756 tree_blocks[0]);
1757 }
1758
1759 /* release all blocks */
1760 for (; n_releases <= n_blocks; n_releases++) {
1761 mtr_release_block_at_savepoint(
1762 mtr, tree_savepoints[n_releases],
1763 tree_blocks[n_releases]);
1764 }
1765
1766 lock_intention = BTR_INTENTION_BOTH;
1767
1768 page_id.set_page_no(index->page);
1769 up_match = 0;
1770 low_match = 0;
1771 height = ULINT_UNDEFINED;
1772
1773 n_blocks = 0;
1774 n_releases = 0;
1775
1776 goto search_loop;
1777 }
1778
1779 if (dict_index_is_spatial(index)) {
1780 if (page_rec_is_supremum(node_ptr)) {
1781 cursor->low_match = 0;
1782 cursor->up_match = 0;
1783 goto func_exit;
1784 }
1785
1786 /* If we are doing insertion or record locating,
1787 remember the tree nodes we visited */
1788 if (page_mode == PAGE_CUR_RTREE_INSERT
1789 || (search_mode == PAGE_CUR_RTREE_LOCATE
1790 && (latch_mode != BTR_MODIFY_LEAF))) {
1791 bool add_latch = false;
1792
1793 if (latch_mode == BTR_MODIFY_TREE
1794 && rw_latch == RW_NO_LATCH) {
1795 ut_ad(mtr_memo_contains_flagged(
1796 mtr, dict_index_get_lock(index),
1797 MTR_MEMO_X_LOCK
1798 | MTR_MEMO_SX_LOCK));
1799 rw_lock_s_lock(&block->lock);
1800 add_latch = true;
1801 }
1802
1803 /* Store the parent cursor location */
1804#ifdef UNIV_DEBUG
1805 ulint num_stored = rtr_store_parent_path(
1806 block, cursor, latch_mode,
1807 height + 1, mtr);
1808#else
1809 rtr_store_parent_path(
1810 block, cursor, latch_mode,
1811 height + 1, mtr);
1812#endif
1813
1814 if (page_mode == PAGE_CUR_RTREE_INSERT) {
1815 btr_pcur_t* r_cursor =
1816 rtr_get_parent_cursor(
1817 cursor, height + 1,
1818 true);
1819 /* For an insert there should be
1820 exactly one stored parent per
1821 traversed level. */
1822#ifdef UNIV_DEBUG
1823 ut_ad(num_stored == 1);
1824#endif
1825
1826 node_ptr = btr_pcur_get_rec(r_cursor);
1827
1828 }
1829
1830 if (add_latch) {
1831 rw_lock_s_unlock(&block->lock);
1832 }
1833
1834 ut_ad(!page_rec_is_supremum(node_ptr));
1835 }
1836
1837 ut_ad(page_mode == search_mode
1838 || (page_mode == PAGE_CUR_WITHIN
1839 && search_mode == PAGE_CUR_RTREE_LOCATE));
1840
1841 page_mode = search_mode;
1842 }
1843
1844 /* If the node pointer is the first or the last record of the
1845 page, or shares its key value with the first or the last record,
1846 a different page might be chosen under BTR_CONT_MODIFY_TREE.
1847 The parent page must then not be released, to avoid a deadlock
1848 that would block another search on the same key value. */
1849 if (!detected_same_key_root
1850 && lock_intention == BTR_INTENTION_BOTH
1851 && !dict_index_is_unique(index)
1852 && latch_mode == BTR_MODIFY_TREE
1853 && (up_match >= rec_offs_n_fields(offsets) - 1
1854 || low_match >= rec_offs_n_fields(offsets) - 1)) {
1855 const rec_t* first_rec = page_rec_get_next_const(
1856 page_get_infimum_rec(page));
1857 ulint matched_fields;
1858
1859 ut_ad(upper_rw_latch == RW_X_LATCH);
1860
1861 if (node_ptr == first_rec
1862 || page_rec_is_last(node_ptr, page)) {
1863 detected_same_key_root = true;
1864 } else {
1865 matched_fields = 0;
1866
1867 offsets2 = rec_get_offsets(
1868 first_rec, index, offsets2,
1869 false, ULINT_UNDEFINED, &heap);
1870 cmp_rec_rec_with_match(node_ptr, first_rec,
1871 offsets, offsets2, index, FALSE,
1872 &matched_fields);
1873
1874 if (matched_fields
1875 >= rec_offs_n_fields(offsets) - 1) {
1876 detected_same_key_root = true;
1877 } else {
1878 const rec_t* last_rec;
1879
1880 last_rec = page_rec_get_prev_const(
1881 page_get_supremum_rec(page));
1882
1883 matched_fields = 0;
1884
1885 offsets2 = rec_get_offsets(
1886 last_rec, index, offsets2,
1887 false, ULINT_UNDEFINED, &heap);
1888 cmp_rec_rec_with_match(
1889 node_ptr, last_rec,
1890 offsets, offsets2, index,
1891 FALSE, &matched_fields);
1892 if (matched_fields
1893 >= rec_offs_n_fields(offsets) - 1) {
1894 detected_same_key_root = true;
1895 }
1896 }
1897 }
1898 }
1899
1900 /* If operating on this page might still modify the tree,
1901 we must not release the parent page's latch. */
1902 if (!detected_same_key_root
1903 && latch_mode == BTR_MODIFY_TREE
1904 && !btr_cur_will_modify_tree(
1905 index, page, lock_intention, node_ptr,
1906 node_ptr_max_size, page_size, mtr)
1907 && !rtree_parent_modified) {
1908 ut_ad(upper_rw_latch == RW_X_LATCH);
1909 ut_ad(n_releases <= n_blocks);
1910
1911 /* we can release upper blocks */
1912 for (; n_releases < n_blocks; n_releases++) {
1913 if (n_releases == 0) {
1914 /* never release the root page, so
1915 that it stays pinned to the same block. */
1916 continue;
1917 }
1918
1919 /* release unused blocks to unpin */
1920 mtr_release_block_at_savepoint(
1921 mtr, tree_savepoints[n_releases],
1922 tree_blocks[n_releases]);
1923 }
1924 }
1925
1926 if (height == level
1927 && latch_mode == BTR_MODIFY_TREE) {
1928 ut_ad(upper_rw_latch == RW_X_LATCH);
1929 /* sx-latch the root page again if it was already
1930 released; it contains the file segment headers. */
1931 if (n_releases > 0) {
1932 mtr_block_sx_latch_at_savepoint(
1933 mtr, tree_savepoints[0],
1934 tree_blocks[0]);
1935 }
1936
1937 /* x-latch the branch blocks not released yet. */
1938 for (ulint i = n_releases; i <= n_blocks; i++) {
1939 mtr_block_x_latch_at_savepoint(
1940 mtr, tree_savepoints[i],
1941 tree_blocks[i]);
1942 }
1943 }
1944
1945 /* Consider the prev_page of the parent page if the node_ptr is
1946 the leftmost record on the page, because BTR_SEARCH_PREV and
1947 BTR_MODIFY_PREV also latch the prev_page of the leaf page. */
1948 if ((latch_mode == BTR_SEARCH_PREV
1949 || latch_mode == BTR_MODIFY_PREV)
1950 && !retrying_for_search_prev) {
1951 /* block should be latched for consistent
1952 btr_page_get_prev() */
1953 ut_ad(mtr_memo_contains_flagged(mtr, block,
1954 MTR_MEMO_PAGE_S_FIX
1955 | MTR_MEMO_PAGE_X_FIX));
1956
1957 if (page_has_prev(page)
1958 && page_rec_is_first(node_ptr, page)) {
1959
1960 if (leftmost_from_level == 0) {
1961 leftmost_from_level = height + 1;
1962 }
1963 } else {
1964 leftmost_from_level = 0;
1965 }
1966
1967 if (height == 0 && leftmost_from_level > 0) {
1968 /* retry so that the prev_page is also latched
1969 from level == leftmost_from_level downwards. */
1970 retrying_for_search_prev = true;
1971
1972 prev_tree_blocks = static_cast<buf_block_t**>(
1973 ut_malloc_nokey(sizeof(buf_block_t*)
1974 * leftmost_from_level));
1975
1976 prev_tree_savepoints = static_cast<ulint*>(
1977 ut_malloc_nokey(sizeof(ulint)
1978 * leftmost_from_level));
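/* These arrays are meant to hold the left-sibling blocks and
savepoints latched during the retry pass; they are consumed by the
BTR_SEARCH_PREV / BTR_MODIFY_PREV handling earlier in this function. */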
1979
1980 /* back to the level (leftmost_from_level+1) */
1981 ulint idx = n_blocks
1982 - (leftmost_from_level - 1);
1983
1984 page_id.set_page_no(
1985 tree_blocks[idx]->page.id.page_no());
1986
1987 for (ulint i = n_blocks
1988 - (leftmost_from_level - 1);
1989 i <= n_blocks; i++) {
1990 mtr_release_block_at_savepoint(
1991 mtr, tree_savepoints[i],
1992 tree_blocks[i]);
1993 }
1994
1995 n_blocks -= (leftmost_from_level - 1);
1996 height = leftmost_from_level;
1997 ut_ad(n_releases == 0);
1998
1999 /* recompute up_match and low_match on the still-latched pages */
2000 up_match = 0;
2001 low_match = 0;
2002 rtr_info_t* rtr_info = need_path
2003 ? cursor->rtr_info : NULL;
2004
2005 for (ulint i = 0; i < n_blocks; i++) {
2006 page_cur_search_with_match(
2007 tree_blocks[i], index, tuple,
2008 page_mode, &up_match,
2009 &low_match, page_cursor,
2010 rtr_info);
2011 }
2012
2013 goto search_loop;
2014 }
2015 }
2016
2017 /* Go to the child node */
2018 page_id.set_page_no(
2019 btr_node_ptr_get_child_page_no(node_ptr, offsets));
2020
2021 n_blocks++;
2022
2023 if (UNIV_UNLIKELY(height == 0 && dict_index_is_ibuf(index))) {
2024 /* We're doing a search on an ibuf tree and we're one
2025 level above the leaf page. */
2026
2027 ut_ad(level == 0);
2028
2029 buf_mode = BUF_GET;
2030 rw_latch = RW_NO_LATCH;
2031 goto retry_page_get;
2032 }
2033
2034 if (dict_index_is_spatial(index)
2035 && page_mode >= PAGE_CUR_CONTAIN
2036 && page_mode != PAGE_CUR_RTREE_INSERT) {
2037 ut_ad(need_path);
2038 rtr_node_path_t* path =
2039 cursor->rtr_info->path;
2040
2041 if (!path->empty() && found) {
2042 ut_ad(path->back().page_no
2043 == page_id.page_no());
2044 path->pop_back();
2045#ifdef UNIV_DEBUG
2046 if (page_mode == PAGE_CUR_RTREE_LOCATE
2047 && (latch_mode != BTR_MODIFY_LEAF)) {
2048 btr_pcur_t* cur
2049 = cursor->rtr_info->parent_path->back(
2050 ).cursor;
2051 rec_t* my_node_ptr
2052 = btr_pcur_get_rec(cur);
2053
2054 offsets = rec_get_offsets(
2055 my_node_ptr, index, offsets,
2056 false, ULINT_UNDEFINED, &heap);
2057
2058 ulint my_page_no
2059 = btr_node_ptr_get_child_page_no(
2060 my_node_ptr, offsets);
2061
2062 ut_ad(page_id.page_no() == my_page_no);
2063 }
2064#endif
2065 }
2066 }
2067
2068 goto search_loop;
2069 } else if (!dict_index_is_spatial(index)
2070 && latch_mode == BTR_MODIFY_TREE
2071 && lock_intention == BTR_INTENTION_INSERT
2072 && page_has_next(page)
2073 && page_rec_is_last(page_cur_get_rec(page_cursor), page)) {
2074
2075 /* btr_insert_into_right_sibling() might cause the
2076 node_ptr at the upper level to be deleted */
2077
2078 guess = NULL;
2079
2080 if (height == 0) {
2081 /* release the leaf pages if latched */
2082 for (uint i = 0; i < 3; i++) {
2083 if (latch_leaves.blocks[i] != NULL) {
2084 mtr_release_block_at_savepoint(
2085 mtr, latch_leaves.savepoints[i],
2086 latch_leaves.blocks[i]);
2087 latch_leaves.blocks[i] = NULL;
2088 }
2089 }
2090 }
2091
2092 goto need_opposite_intention;
2093 }
2094
2095 if (level != 0) {
2096 ut_ad(!autoinc);
2097
2098 if (upper_rw_latch == RW_NO_LATCH) {
2099 /* latch the page */
2100 buf_block_t* child_block;
2101
2102 if (latch_mode == BTR_CONT_MODIFY_TREE) {
2103 child_block = btr_block_get(
2104 page_id, page_size, RW_X_LATCH,
2105 index, mtr);
2106 } else {
2107 ut_ad(latch_mode == BTR_CONT_SEARCH_TREE);
2108 child_block = btr_block_get(
2109 page_id, page_size, RW_SX_LATCH,
2110 index, mtr);
2111 }
2112
2113 btr_assert_not_corrupted(child_block, index);
2114 } else {
2115 ut_ad(mtr_memo_contains(mtr, block, upper_rw_latch));
2116 btr_assert_not_corrupted(block, index);
2117
2118 if (s_latch_by_caller) {
2119 ut_ad(latch_mode == BTR_SEARCH_TREE);
2120 /* the caller should have sx-latched the index
2121 to exclude tree-modifying operations. */
2122 ut_ad(mtr_memo_contains(
2123 mtr, dict_index_get_lock(index),
2124 MTR_MEMO_SX_LOCK));
2125 /* because the index is sx-latched,
2126 the upper blocks can be released. */
2127 for (; n_releases < n_blocks; n_releases++) {
2128 mtr_release_block_at_savepoint(
2129 mtr,
2130 tree_savepoints[n_releases],
2131 tree_blocks[n_releases]);
2132 }
2133 }
2134 }
2135
2136 if (page_mode <= PAGE_CUR_LE) {
2137 cursor->low_match = low_match;
2138 cursor->up_match = up_match;
2139 }
2140 } else {
2141 cursor->low_match = low_match;
2142 cursor->low_bytes = low_bytes;
2143 cursor->up_match = up_match;
2144 cursor->up_bytes = up_bytes;
2145
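/* Persist the requested AUTO_INCREMENT value in the root page
(tree_blocks[0]) as part of this mini-transaction. */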
2146 if (autoinc) {
2147 page_set_autoinc(tree_blocks[0],
2148 index, autoinc, mtr, false);
2149 }
2150
2151#ifdef BTR_CUR_HASH_ADAPT
2152 /* We do a dirty read of btr_search_enabled here. We
2153 will properly check btr_search_enabled again in
2154 btr_search_build_page_hash_index() before building a
2155 page hash index, while holding search latch. */
2156 if (!btr_search_enabled) {
2157# ifdef MYSQL_INDEX_DISABLE_AHI
2158 } else if (index->disable_ahi) {
2159# endif
2160 } else if (tuple->info_bits & REC_INFO_MIN_REC_FLAG) {
2161 ut_ad(index->is_instant());
2162 /* This may be a search tuple for
2163 btr_pcur_restore_position(). */
2164 ut_ad(tuple->info_bits == REC_INFO_DEFAULT_ROW
2165 || tuple->info_bits == REC_INFO_MIN_REC_FLAG);
2166 } else if (rec_is_default_row(btr_cur_get_rec(cursor),
2167 index)) {
2168 /* Only user records belong in the adaptive
2169 hash index. */
2170 } else {
2171 btr_search_info_update(index, cursor);
2172 }
2173#endif /* BTR_CUR_HASH_ADAPT */
2174 ut_ad(cursor->up_match != ULINT_UNDEFINED
2175 || mode != PAGE_CUR_GE);
2176 ut_ad(cursor->up_match != ULINT_UNDEFINED
2177 || mode != PAGE_CUR_LE);
2178 ut_ad(cursor->low_match != ULINT_UNDEFINED
2179 || mode != PAGE_CUR_LE);
2180 }
2181
2182 /* For spatial index, remember what blocks are still latched */
2183 if (dict_index_is_spatial(index)
2184 && (latch_mode == BTR_MODIFY_TREE
2185 || latch_mode == BTR_MODIFY_LEAF)) {
2186 for (ulint i = 0; i < n_releases; i++) {
2187 cursor->rtr_info->tree_blocks[i] = NULL;
2188 cursor->rtr_info->tree_savepoints[i] = 0;
2189 }
2190
2191 for (ulint i = n_releases; i <= n_blocks; i++) {
2192 cursor->rtr_info->tree_blocks[i] = tree_blocks[i];
2193 cursor->rtr_info->tree_savepoints[i] = tree_savepoints[i];
2194 }
2195 }
2196
2197func_exit:
2198
2199 if (UNIV_LIKELY_NULL(heap)) {
2200 mem_heap_free(heap);
2201 }
2202
2203 if (retrying_for_search_prev) {
2204 ut_free(prev_tree_blocks);
2205 ut_free(prev_tree_savepoints);
2206 }
2207
2208 if (mbr_adj) {
2209 /* remember that we will need to adjust parent MBR */
2210 cursor->rtr_info->mbr_adj = true;
2211 }
2212
2213#ifdef BTR_CUR_HASH_ADAPT
2214 if (ahi_latch) {
2215 rw_lock_s_lock(ahi_latch);
2216 }
2217#endif /* BTR_CUR_HASH_ADAPT */
2218
2219 DBUG_RETURN(err);
2220}
2221
2222/*****************************************************************//**
2223Opens a cursor at either end of an index. */
2224dberr_t
2225btr_cur_open_at_index_side_func(
2226/*============================*/
2227 bool from_left, /*!< in: true if open to the low end,
2228 false if to the high end */
2229 dict_index_t* index, /*!< in: index */
2230 ulint latch_mode, /*!< in: latch mode */
2231 btr_cur_t* cursor, /*!< in/out: cursor */
2232 ulint level, /*!< in: level to search for
2233 (0=leaf). */
2234 const char* file, /*!< in: file name */
2235 unsigned line, /*!< in: line where called */
2236 mtr_t* mtr) /*!< in/out: mini-transaction */
2237{
2238 page_cur_t* page_cursor;
2239 ulint node_ptr_max_size = srv_page_size / 2;
2240 ulint height;
2241 ulint root_height = 0; /* remove warning */
2242 rec_t* node_ptr;
2243 ulint estimate;
2244 ulint savepoint;
2245 ulint upper_rw_latch, root_leaf_rw_latch;
2246 btr_intention_t lock_intention;
2247 buf_block_t* tree_blocks[BTR_MAX_LEVELS];
2248 ulint tree_savepoints[BTR_MAX_LEVELS];
2249 ulint n_blocks = 0;
2250 ulint n_releases = 0;
2251 mem_heap_t* heap = NULL;
2252 ulint offsets_[REC_OFFS_NORMAL_SIZE];
2253 ulint* offsets = offsets_;
2254 dberr_t err = DB_SUCCESS;
2255
2256 rec_offs_init(offsets_);
2257
2258 estimate = latch_mode & BTR_ESTIMATE;
2259 latch_mode &= ulint(~BTR_ESTIMATE);
2260
2261 ut_ad(level != ULINT_UNDEFINED);
2262
2263 bool s_latch_by_caller;
2264
2265 s_latch_by_caller = latch_mode & BTR_ALREADY_S_LATCHED;
2266 latch_mode &= ulint(~BTR_ALREADY_S_LATCHED);
2267
2268 lock_intention = btr_cur_get_and_clear_intention(&latch_mode);
2269
2270 ut_ad(!(latch_mode & BTR_MODIFY_EXTERNAL));
2271
2272 /* This function does not need to latch the left sibling of the leaf page */
2273 if (latch_mode == BTR_SEARCH_PREV) {
2274 latch_mode = BTR_SEARCH_LEAF;
2275 } else if (latch_mode == BTR_MODIFY_PREV) {
2276 latch_mode = BTR_MODIFY_LEAF;
2277 }
2278
2279 /* Store the position of the tree latch we push to mtr so that we
2280 know how to release it when we have latched the leaf node */
2281
2282 savepoint = mtr_set_savepoint(mtr);
2283
2284 switch (latch_mode) {
2285 case BTR_CONT_MODIFY_TREE:
2286 case BTR_CONT_SEARCH_TREE:
2287 upper_rw_latch = RW_NO_LATCH;
2288 break;
2289 case BTR_MODIFY_TREE:
2290 /* Most delete-intended operations are purges.
2291 Free blocks and read I/O bandwidth should be given
2292 priority to them when the history list has grown huge. */
2293 if (lock_intention == BTR_INTENTION_DELETE
2294 && trx_sys.history_size() > BTR_CUR_FINE_HISTORY_LENGTH
2295 && buf_get_n_pending_read_ios()) {
2296 mtr_x_lock(dict_index_get_lock(index), mtr);
2297 } else {
2298 mtr_sx_lock(dict_index_get_lock(index), mtr);
2299 }
2300 upper_rw_latch = RW_X_LATCH;
2301 break;
2302 default:
2303 ut_ad(!s_latch_by_caller
2304 || mtr_memo_contains_flagged(mtr,
2305 dict_index_get_lock(index),
2306 MTR_MEMO_SX_LOCK
2307 | MTR_MEMO_S_LOCK));
2308 if (!srv_read_only_mode) {
2309 if (!s_latch_by_caller) {
2310 /* BTR_SEARCH_TREE is intended to be used with
2311 BTR_ALREADY_S_LATCHED */
2312 ut_ad(latch_mode != BTR_SEARCH_TREE);
2313
2314 mtr_s_lock(dict_index_get_lock(index), mtr);
2315 }
2316 upper_rw_latch = RW_S_LATCH;
2317 } else {
2318 upper_rw_latch = RW_NO_LATCH;
2319 }
2320 }
2321 root_leaf_rw_latch = btr_cur_latch_for_root_leaf(latch_mode);
2322
2323 page_cursor = btr_cur_get_page_cur(cursor);
2324 cursor->index = index;
2325
2326 page_id_t page_id(index->table->space->id, index->page);
2327 const page_size_t page_size(index->table->space->flags);
2328
2329 if (root_leaf_rw_latch == RW_X_LATCH) {
2330 node_ptr_max_size = dict_index_node_ptr_max_size(index);
2331 }
2332
2333 height = ULINT_UNDEFINED;
2334
2335 for (;;) {
2336 buf_block_t* block;
2337 ulint rw_latch;
2338
2339 ut_ad(n_blocks < BTR_MAX_LEVELS);
2340
2341 if (height != 0
2342 && (latch_mode != BTR_MODIFY_TREE
2343 || height == level)) {
2344 rw_latch = upper_rw_latch;
2345 } else {
2346 rw_latch = RW_NO_LATCH;
2347 }
2348
2349 tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
2350 block = buf_page_get_gen(page_id, page_size, rw_latch, NULL,
2351 BUF_GET, file, line, mtr, &err);
2352 ut_ad((block != NULL) == (err == DB_SUCCESS));
2353 tree_blocks[n_blocks] = block;
2354
2355 if (err != DB_SUCCESS) {
2356 if (err == DB_DECRYPTION_FAILED) {
2357 ib_push_warning((void *)NULL,
2358 DB_DECRYPTION_FAILED,
2359 "Table %s is encrypted but encryption service or"
2360 " used key_id is not available. "
2361 " Can't continue reading table.",
2362 index->table->name);
2363 index->table->file_unreadable = true;
2364 }
2365
2366 goto exit_loop;
2367 }
2368
2369 const page_t* page = buf_block_get_frame(block);
2370
2371 if (height == ULINT_UNDEFINED
2372 && page_is_leaf(page)
2373 && rw_latch != RW_NO_LATCH
2374 && rw_latch != root_leaf_rw_latch) {
2375 /* Retry to get the page, because the root page was
2376 latched with a different latch mode than a leaf page needs. */
2377 ut_ad(root_leaf_rw_latch != RW_NO_LATCH);
2378 ut_ad(rw_latch == RW_S_LATCH);
2379
2380 ut_ad(n_blocks == 0);
2381 mtr_release_block_at_savepoint(
2382 mtr, tree_savepoints[n_blocks],
2383 tree_blocks[n_blocks]);
2384
2385 upper_rw_latch = root_leaf_rw_latch;
2386 continue;
2387 }
2388
2389 ut_ad(fil_page_index_page_check(page));
2390 ut_ad(index->id == btr_page_get_index_id(page));
2391
2392 if (height == ULINT_UNDEFINED) {
2393 /* We are in the root node */
2394
2395 height = btr_page_get_level(page);
2396 root_height = height;
2397 ut_a(height >= level);
2398 } else {
2399 /* TODO: flag the index corrupted if this fails */
2400 ut_ad(height == btr_page_get_level(page));
2401 }
2402
2403 if (height == level) {
2404 if (srv_read_only_mode) {
2405 btr_cur_latch_leaves(
2406 block, page_id, page_size,
2407 latch_mode, cursor, mtr);
2408 } else if (height == 0) {
2409 if (rw_latch == RW_NO_LATCH) {
2410 btr_cur_latch_leaves(
2411 block, page_id, page_size,
2412 latch_mode, cursor, mtr);
2413 }
2414 /* In versions <= 3.23.52 we had
2415 forgotten to release the tree latch
2416 here. If in an index scan we had to
2417 scan far to find a record visible to
2418 the current transaction, that could
2419 starve others waiting for the tree
2420 latch. */
2421
2422 switch (latch_mode) {
2423 case BTR_MODIFY_TREE:
2424 case BTR_CONT_MODIFY_TREE:
2425 case BTR_CONT_SEARCH_TREE:
2426 break;
2427 default:
2428 if (!s_latch_by_caller) {
2429 /* Release the tree s-latch */
2430 mtr_release_s_latch_at_savepoint(
2431 mtr, savepoint,
2432 dict_index_get_lock(
2433 index));
2434 }
2435
2436 /* release upper blocks */
2437 for (; n_releases < n_blocks;
2438 n_releases++) {
2439 mtr_release_block_at_savepoint(
2440 mtr,
2441 tree_savepoints[
2442 n_releases],
2443 tree_blocks[
2444 n_releases]);
2445 }
2446 }
2447 } else { /* height != 0 */
2448 /* We already have the block latched. */
2449 ut_ad(latch_mode == BTR_SEARCH_TREE);
2450 ut_ad(s_latch_by_caller);
2451 ut_ad(upper_rw_latch == RW_S_LATCH);
2452
2453 ut_ad(mtr_memo_contains(mtr, block,
2454 upper_rw_latch));
2455
2456 if (s_latch_by_caller) {
2457 /* the caller should have sx-latched the index
2458 to exclude tree-modifying operations. */
2459 ut_ad(mtr_memo_contains(
2460 mtr,
2461 dict_index_get_lock(index),
2462 MTR_MEMO_SX_LOCK));
2463 /* because the index is sx-latched,
2464 the upper blocks can be released. */
2465 for (; n_releases < n_blocks;
2466 n_releases++) {
2467 mtr_release_block_at_savepoint(
2468 mtr,
2469 tree_savepoints[
2470 n_releases],
2471 tree_blocks[
2472 n_releases]);
2473 }
2474 }
2475 }
2476 }
2477
2478 if (from_left) {
2479 page_cur_set_before_first(block, page_cursor);
2480 } else {
2481 page_cur_set_after_last(block, page_cursor);
2482 }
2483
2484 if (height == level) {
2485 if (estimate) {
2486 btr_cur_add_path_info(cursor, height,
2487 root_height);
2488 }
2489
2490 break;
2491 }
2492
2493 ut_ad(height > 0);
2494
2495 if (from_left) {
2496 page_cur_move_to_next(page_cursor);
2497 } else {
2498 page_cur_move_to_prev(page_cursor);
2499 }
2500
2501 if (estimate) {
2502 btr_cur_add_path_info(cursor, height, root_height);
2503 }
2504
2505 height--;
2506
2507 node_ptr = page_cur_get_rec(page_cursor);
2508 offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
2509 false, ULINT_UNDEFINED, &heap);
2510
2511 /* If the record is the first or last on the page and the
2512 intention is a pessimistic delete, the operation might require
2513 inserting a node pointer at the upper level. Change the
2514 intention to BTR_INTENTION_BOTH and retry. */
2515 if (latch_mode == BTR_MODIFY_TREE
2516 && btr_cur_need_opposite_intention(
2517 page, lock_intention, node_ptr)) {
2518
2519 ut_ad(upper_rw_latch == RW_X_LATCH);
2520 /* release all blocks */
2521 for (; n_releases <= n_blocks; n_releases++) {
2522 mtr_release_block_at_savepoint(
2523 mtr, tree_savepoints[n_releases],
2524 tree_blocks[n_releases]);
2525 }
2526
2527 lock_intention = BTR_INTENTION_BOTH;
2528
2529 page_id.set_page_no(dict_index_get_page(index));
2530
2531 height = ULINT_UNDEFINED;
2532
2533 n_blocks = 0;
2534 n_releases = 0;
2535
2536 continue;
2537 }
2538
2539 if (latch_mode == BTR_MODIFY_TREE
2540 && !btr_cur_will_modify_tree(
2541 cursor->index, page, lock_intention, node_ptr,
2542 node_ptr_max_size, page_size, mtr)) {
2543 ut_ad(upper_rw_latch == RW_X_LATCH);
2544 ut_ad(n_releases <= n_blocks);
2545
2546 /* we can release upper blocks */
2547 for (; n_releases < n_blocks; n_releases++) {
2548 if (n_releases == 0) {
2549 /* never release the root page, so
2550 that it stays pinned to the same block. */
2551 continue;
2552 }
2553
2554 /* release unused blocks to unpin */
2555 mtr_release_block_at_savepoint(
2556 mtr, tree_savepoints[n_releases],
2557 tree_blocks[n_releases]);
2558 }
2559 }
2560
2561 if (height == level
2562 && latch_mode == BTR_MODIFY_TREE) {
2563 ut_ad(upper_rw_latch == RW_X_LATCH);
2564 /* sx-latch the root page again if it was already
2565 released; it contains the file segment headers. */
2566 if (n_releases > 0) {
2567 mtr_block_sx_latch_at_savepoint(
2568 mtr, tree_savepoints[0],
2569 tree_blocks[0]);
2570 }
2571
2572 /* x-latch the branch blocks not released yet. */
2573 for (ulint i = n_releases; i <= n_blocks; i++) {
2574 mtr_block_x_latch_at_savepoint(
2575 mtr, tree_savepoints[i],
2576 tree_blocks[i]);
2577 }
2578 }
2579
2580 /* Go to the child node */
2581 page_id.set_page_no(
2582 btr_node_ptr_get_child_page_no(node_ptr, offsets));
2583
2584 n_blocks++;
2585 }
2586
2587 exit_loop:
2588 if (heap) {
2589 mem_heap_free(heap);
2590 }
2591
2592 return err;
2593}
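/* Note: callers normally reach btr_cur_open_at_index_side_func()
through a wrapper that supplies __FILE__ and __LINE__ (the
btr_cur_open_at_index_side() macro in btr0cur.h). A minimal sketch,
assuming that wrapper:

	btr_cur_t	cursor;
	mtr_t		mtr;

	mtr_start(&mtr);
	btr_cur_open_at_index_side(true, index, BTR_SEARCH_LEAF,
				   &cursor, 0, &mtr);
	// ... read records via btr_cur_get_rec(&cursor) ...
	mtr_commit(&mtr);
*/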
2594
2595/**********************************************************************//**
2596Positions a cursor at a randomly chosen position within a B-tree.
2597@return true if the index is available and we have put the cursor, false
2598if the index is unavailable */
2599bool
2600btr_cur_open_at_rnd_pos_func(
2601/*=========================*/
2602 dict_index_t* index, /*!< in: index */
2603 ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
2604 btr_cur_t* cursor, /*!< in/out: B-tree cursor */
2605 const char* file, /*!< in: file name */
2606 unsigned line, /*!< in: line where called */
2607 mtr_t* mtr) /*!< in: mtr */
2608{
2609 page_cur_t* page_cursor;
2610 ulint node_ptr_max_size = srv_page_size / 2;
2611 ulint height;
2612 rec_t* node_ptr;
2613 ulint savepoint;
2614 ulint upper_rw_latch, root_leaf_rw_latch;
2615 btr_intention_t lock_intention;
2616 buf_block_t* tree_blocks[BTR_MAX_LEVELS];
2617 ulint tree_savepoints[BTR_MAX_LEVELS];
2618 ulint n_blocks = 0;
2619 ulint n_releases = 0;
2620 mem_heap_t* heap = NULL;
2621 ulint offsets_[REC_OFFS_NORMAL_SIZE];
2622 ulint* offsets = offsets_;
2623 rec_offs_init(offsets_);
2624
2625 ut_ad(!dict_index_is_spatial(index));
2626
2627 lock_intention = btr_cur_get_and_clear_intention(&latch_mode);
2628
2629 ut_ad(!(latch_mode & BTR_MODIFY_EXTERNAL));
2630
2631 savepoint = mtr_set_savepoint(mtr);
2632
2633 switch (latch_mode) {
2634 case BTR_MODIFY_TREE:
2635 /* Most delete-intended operations are purges.
2636 Free blocks and read I/O bandwidth should be given
2637 priority to them when the history list has grown huge. */
2638 if (lock_intention == BTR_INTENTION_DELETE
2639 && trx_sys.history_size() > BTR_CUR_FINE_HISTORY_LENGTH
2640 && buf_get_n_pending_read_ios()) {
2641 mtr_x_lock(dict_index_get_lock(index), mtr);
2642 } else {
2643 mtr_sx_lock(dict_index_get_lock(index), mtr);
2644 }
2645 upper_rw_latch = RW_X_LATCH;
2646 break;
2647 case BTR_SEARCH_PREV:
2648 case BTR_MODIFY_PREV:
2649 /* This function does not support latching the
2650 left uncle page when a latch on the left leaf
2651 sibling would be needed. */
2652 case BTR_SEARCH_TREE:
2653 case BTR_CONT_MODIFY_TREE:
2654 case BTR_CONT_SEARCH_TREE:
2655 ut_ad(0);
2656 /* fall through */
2657 default:
2658 if (!srv_read_only_mode) {
2659 mtr_s_lock(dict_index_get_lock(index), mtr);
2660 upper_rw_latch = RW_S_LATCH;
2661 } else {
2662 upper_rw_latch = RW_NO_LATCH;
2663 }
2664 }
2665
2666 DBUG_EXECUTE_IF("test_index_is_unavailable",
2667 return(false););
2668
2669 if (index->page == FIL_NULL) {
2670 /* Because we did not hold the index lock until just now, the
2671 index could have been modified by others. For example, if this
2672 is a statistics update on a referenced table, the index could
2673 have been marked unavailable by 'DROP TABLE' in the meantime,
2674 since no lock is held for the statistics updater. */
2675 return(false);
2676 }
2677
2678 root_leaf_rw_latch = btr_cur_latch_for_root_leaf(latch_mode);
2679
2680 page_cursor = btr_cur_get_page_cur(cursor);
2681 cursor->index = index;
2682
2683 page_id_t page_id(index->table->space->id, index->page);
2684 const page_size_t page_size(index->table->space->flags);
2685 dberr_t err = DB_SUCCESS;
2686
2687 if (root_leaf_rw_latch == RW_X_LATCH) {
2688 node_ptr_max_size = dict_index_node_ptr_max_size(index);
2689 }
2690
2691 height = ULINT_UNDEFINED;
2692
2693 for (;;) {
2694 buf_block_t* block;
2695 page_t* page;
2696 ulint rw_latch;
2697
2698 ut_ad(n_blocks < BTR_MAX_LEVELS);
2699
2700 if (height != 0
2701 && latch_mode != BTR_MODIFY_TREE) {
2702 rw_latch = upper_rw_latch;
2703 } else {
2704 rw_latch = RW_NO_LATCH;
2705 }
2706
2707 tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
2708 block = buf_page_get_gen(page_id, page_size, rw_latch, NULL,
2709 BUF_GET, file, line, mtr, &err);
2710 tree_blocks[n_blocks] = block;
2711
2712 ut_ad((block != NULL) == (err == DB_SUCCESS));
2713
2714 if (err != DB_SUCCESS) {
2715 if (err == DB_DECRYPTION_FAILED) {
2716 ib_push_warning((void *)NULL,
2717 DB_DECRYPTION_FAILED,
2718 "Table %s is encrypted but encryption service or"
2719 " used key_id is not available. "
2720 " Can't continue reading table.",
2721 index->table->name);
2722 index->table->file_unreadable = true;
2723 }
2724
2725 goto exit_loop;
2726 }
2727
2728 page = buf_block_get_frame(block);
2729
2730 if (height == ULINT_UNDEFINED
2731 && page_is_leaf(page)
2732 && rw_latch != RW_NO_LATCH
2733 && rw_latch != root_leaf_rw_latch) {
2734 /* Retry to get the page, because the root page was
2735 latched with a different latch mode than a leaf page needs. */
2736 ut_ad(root_leaf_rw_latch != RW_NO_LATCH);
2737 ut_ad(rw_latch == RW_S_LATCH);
2738
2739 ut_ad(n_blocks == 0);
2740 mtr_release_block_at_savepoint(
2741 mtr, tree_savepoints[n_blocks],
2742 tree_blocks[n_blocks]);
2743
2744 upper_rw_latch = root_leaf_rw_latch;
2745 continue;
2746 }
2747
2748 ut_ad(fil_page_index_page_check(page));
2749 ut_ad(index->id == btr_page_get_index_id(page));
2750
2751 if (height == ULINT_UNDEFINED) {
2752 /* We are in the root node */
2753
2754 height = btr_page_get_level(page);
2755 }
2756
2757 if (height == 0) {
2758 if (rw_latch == RW_NO_LATCH
2759 || srv_read_only_mode) {
2760 btr_cur_latch_leaves(
2761 block, page_id, page_size,
2762 latch_mode, cursor, mtr);
2763 }
2764
2765 /* btr_cur_open_at_index_side_func() and
2766 btr_cur_search_to_nth_level() release
2767 tree s-latch here. */
2768 switch (latch_mode) {
2769 case BTR_MODIFY_TREE:
2770 case BTR_CONT_MODIFY_TREE:
2771 case BTR_CONT_SEARCH_TREE:
2772 break;
2773 default:
2774 /* Release the tree s-latch */
2775 if (!srv_read_only_mode) {
2776 mtr_release_s_latch_at_savepoint(
2777 mtr, savepoint,
2778 dict_index_get_lock(index));
2779 }
2780
2781 /* release upper blocks */
2782 for (; n_releases < n_blocks; n_releases++) {
2783 mtr_release_block_at_savepoint(
2784 mtr,
2785 tree_savepoints[n_releases],
2786 tree_blocks[n_releases]);
2787 }
2788 }
2789 }
2790
2791 page_cur_open_on_rnd_user_rec(block, page_cursor);
2792
2793 if (height == 0) {
2794
2795 break;
2796 }
2797
2798 ut_ad(height > 0);
2799
2800 height--;
2801
2802 node_ptr = page_cur_get_rec(page_cursor);
2803 offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
2804 false, ULINT_UNDEFINED, &heap);
2805
2806 /* If the record is the first or last on the page and the
2807 intention is a pessimistic delete, the operation might require
2808 inserting a node pointer at the upper level. Change the
2809 intention to BTR_INTENTION_BOTH and retry. */
2810 if (latch_mode == BTR_MODIFY_TREE
2811 && btr_cur_need_opposite_intention(
2812 page, lock_intention, node_ptr)) {
2813
2814 ut_ad(upper_rw_latch == RW_X_LATCH);
2815 /* release all blocks */
2816 for (; n_releases <= n_blocks; n_releases++) {
2817 mtr_release_block_at_savepoint(
2818 mtr, tree_savepoints[n_releases],
2819 tree_blocks[n_releases]);
2820 }
2821
2822 lock_intention = BTR_INTENTION_BOTH;
2823
2824 page_id.set_page_no(dict_index_get_page(index));
2825
2826 height = ULINT_UNDEFINED;
2827
2828 n_blocks = 0;
2829 n_releases = 0;
2830
2831 continue;
2832 }
2833
2834 if (latch_mode == BTR_MODIFY_TREE
2835 && !btr_cur_will_modify_tree(
2836 cursor->index, page, lock_intention, node_ptr,
2837 node_ptr_max_size, page_size, mtr)) {
2838 ut_ad(upper_rw_latch == RW_X_LATCH);
2839 ut_ad(n_releases <= n_blocks);
2840
2841 /* we can release upper blocks */
2842 for (; n_releases < n_blocks; n_releases++) {
2843 if (n_releases == 0) {
2844 /* never release the root page, so
2845 that it stays pinned to the same block. */
2846 continue;
2847 }
2848
2849 /* release unused blocks to unpin */
2850 mtr_release_block_at_savepoint(
2851 mtr, tree_savepoints[n_releases],
2852 tree_blocks[n_releases]);
2853 }
2854 }
2855
2856 if (height == 0
2857 && latch_mode == BTR_MODIFY_TREE) {
2858 ut_ad(upper_rw_latch == RW_X_LATCH);
2859 /* sx-latch the root page again if it was already
2860 released; it contains the file segment headers. */
2861 if (n_releases > 0) {
2862 mtr_block_sx_latch_at_savepoint(
2863 mtr, tree_savepoints[0],
2864 tree_blocks[0]);
2865 }
2866
2867 /* x-latch the branch blocks not released yet. */
2868 for (ulint i = n_releases; i <= n_blocks; i++) {
2869 mtr_block_x_latch_at_savepoint(
2870 mtr, tree_savepoints[i],
2871 tree_blocks[i]);
2872 }
2873 }
2874
2875 /* Go to the child node */
2876 page_id.set_page_no(
2877 btr_node_ptr_get_child_page_no(node_ptr, offsets));
2878
2879 n_blocks++;
2880 }
2881
2882 exit_loop:
2883 if (UNIV_LIKELY_NULL(heap)) {
2884 mem_heap_free(heap);
2885 }
2886
2887 return(true);
2888}
2889
2890/*==================== B-TREE INSERT =========================*/
2891
2892/*************************************************************//**
2893 Inserts a record if there is enough space, or if enough space can
2894 be freed by reorganizing. Differs from btr_cur_optimistic_insert in
2895 that no heuristic is applied to decide whether it pays to spend CPU
2896 time on reorganizing the page.
2897
2898IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
2899if this is a compressed leaf page in a secondary index.
2900This has to be done either within the same mini-transaction,
2901or by invoking ibuf_reset_free_bits() before mtr_commit().
2902
2903 @return pointer to the inserted record if the insert succeeds, else NULL */
2904static MY_ATTRIBUTE((nonnull, warn_unused_result))
2905rec_t*
2906btr_cur_insert_if_possible(
2907/*=======================*/
2908 btr_cur_t* cursor, /*!< in: cursor on page after which to insert;
2909 cursor stays valid */
2910 const dtuple_t* tuple, /*!< in: tuple to insert; the size info need not
2911 have been stored to tuple */
2912 ulint** offsets,/*!< out: offsets on *rec */
2913 mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */
2914 ulint n_ext, /*!< in: number of externally stored columns */
2915 mtr_t* mtr) /*!< in/out: mini-transaction */
2916{
2917 page_cur_t* page_cursor;
2918 rec_t* rec;
2919
2920 ut_ad(dtuple_check_typed(tuple));
2921
2922 ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
2923 MTR_MEMO_PAGE_X_FIX));
2924 page_cursor = btr_cur_get_page_cur(cursor);
2925
2926 /* Now, try the insert */
2927 rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index,
2928 offsets, heap, n_ext, mtr);
2929
2930 /* If the record did not fit, reorganize.
2931 For compressed pages, page_cur_tuple_insert()
2932 attempted this already. */
2933 if (!rec && !page_cur_get_page_zip(page_cursor)
2934 && btr_page_reorganize(page_cursor, cursor->index, mtr)) {
2935 rec = page_cur_tuple_insert(
2936 page_cursor, tuple, cursor->index,
2937 offsets, heap, n_ext, mtr);
2938 }
2939
2940 ut_ad(!rec || rec_offs_validate(rec, cursor->index, *offsets));
2941 return(rec);
2942}
2943
2944/*************************************************************//**
2945For an insert, checks the locks and does the undo logging if desired.
2946@return DB_SUCCESS, DB_WAIT_LOCK, DB_FAIL, or error number */
2947UNIV_INLINE MY_ATTRIBUTE((warn_unused_result, nonnull(2,3,5,6)))
2948dberr_t
2949btr_cur_ins_lock_and_undo(
2950/*======================*/
2951 ulint flags, /*!< in: undo logging and locking flags: if
2952 not zero, the parameters index and thr
2953 should be specified */
2954 btr_cur_t* cursor, /*!< in: cursor on page after which to insert */
2955 dtuple_t* entry, /*!< in/out: entry to insert */
2956 que_thr_t* thr, /*!< in: query thread or NULL */
2957 mtr_t* mtr, /*!< in/out: mini-transaction */
2958 bool* inherit)/*!< out: true if the inserted record may have to
2959 inherit LOCK_GAP type locks from the
2960 successor record */
2961{
2962 dict_index_t* index;
2963 dberr_t err = DB_SUCCESS;
2964 rec_t* rec;
2965 roll_ptr_t roll_ptr;
2966
2967 /* Check if we have to wait for a lock: enqueue an explicit lock
2968 request if yes */
2969
2970 rec = btr_cur_get_rec(cursor);
2971 index = cursor->index;
2972
2973 ut_ad(!dict_index_is_online_ddl(index)
2974 || dict_index_is_clust(index)
2975 || (flags & BTR_CREATE_FLAG));
2976 ut_ad(mtr->is_named_space(index->table->space));
2977
2978 /* Check if there is predicate or GAP lock preventing the insertion */
2979 if (!(flags & BTR_NO_LOCKING_FLAG)) {
2980 if (dict_index_is_spatial(index)) {
2981 lock_prdt_t prdt;
2982 rtr_mbr_t mbr;
2983
2984 rtr_get_mbr_from_tuple(entry, &mbr);
2985
2986 /* Use on stack MBR variable to test if a lock is
2987 needed. If so, the predicate (MBR) will be allocated
2988 from lock heap in lock_prdt_insert_check_and_lock() */
2989 lock_init_prdt_from_mbr(
2990 &prdt, &mbr, 0, NULL);
2991
2992 err = lock_prdt_insert_check_and_lock(
2993 flags, rec, btr_cur_get_block(cursor),
2994 index, thr, mtr, &prdt);
2995 *inherit = false;
2996 } else {
2997 err = lock_rec_insert_check_and_lock(
2998 flags, rec, btr_cur_get_block(cursor),
2999 index, thr, mtr, inherit);
3000 }
3001 }
3002
3003 if (err != DB_SUCCESS
3004 || !(~flags | (BTR_NO_UNDO_LOG_FLAG | BTR_KEEP_SYS_FLAG))
3005 || !dict_index_is_clust(index) || dict_index_is_ibuf(index)) {
3006
3007 return(err);
3008 }
3009
3010 if (flags & BTR_NO_UNDO_LOG_FLAG) {
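/* No undo log record is written; use a dummy roll pointer that
only has the insert flag set. */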
3011 roll_ptr = roll_ptr_t(1) << ROLL_PTR_INSERT_FLAG_POS;
3012 if (!(flags & BTR_KEEP_SYS_FLAG)) {
3013upd_sys:
3014 row_upd_index_entry_sys_field(entry, index,
3015 DATA_ROLL_PTR, roll_ptr);
3016 }
3017 } else {
3018 err = trx_undo_report_row_operation(thr, index, entry,
3019 NULL, 0, NULL, NULL,
3020 &roll_ptr);
3021 if (err == DB_SUCCESS) {
3022 goto upd_sys;
3023 }
3024 }
3025
3026 return(err);
3027}
3028
3029/**
3030Prefetch siblings of the leaf for the pessimistic operation.
3031@param block leaf page */
3032static
3033void
3034btr_cur_prefetch_siblings(
3035 buf_block_t* block)
3036{
3037 page_t* page = buf_block_get_frame(block);
3038
3039 ut_ad(page_is_leaf(page));
3040
3041 ulint left_page_no = fil_page_get_prev(page);
3042 ulint right_page_no = fil_page_get_next(page);
3043
3044 if (left_page_no != FIL_NULL) {
3045 buf_read_page_background(
3046 page_id_t(block->page.id.space(), left_page_no),
3047 block->page.size, false);
3048 }
3049 if (right_page_no != FIL_NULL) {
3050 buf_read_page_background(
3051 page_id_t(block->page.id.space(), right_page_no),
3052 block->page.size, false);
3053 }
3054 if (left_page_no != FIL_NULL
3055 || right_page_no != FIL_NULL) {
3056 os_aio_simulated_wake_handler_threads();
3057 }
3058}
3059
3060/*************************************************************//**
3061Tries to perform an insert to a page in an index tree, next to cursor.
3062It is assumed that mtr holds an x-latch on the page. The operation does
3063not succeed if there is too little space on the page. If there is just
3064one record on the page, the insert will always succeed; this is to
3065prevent trying to split a page with just one record.
3066@return DB_SUCCESS, DB_WAIT_LOCK, DB_FAIL, or error number */
3067dberr_t
3068btr_cur_optimistic_insert(
3069/*======================*/
3070 ulint flags, /*!< in: undo logging and locking flags: if not
3071 zero, the parameters index and thr should be
3072 specified */
3073 btr_cur_t* cursor, /*!< in: cursor on page after which to insert;
3074 cursor stays valid */
3075 ulint** offsets,/*!< out: offsets on *rec */
3076 mem_heap_t** heap, /*!< in/out: pointer to memory heap */
3077 dtuple_t* entry, /*!< in/out: entry to insert */
3078 rec_t** rec, /*!< out: pointer to inserted record if
3079 succeed */
3080 big_rec_t** big_rec,/*!< out: big rec vector whose fields have to
3081 be stored externally by the caller */
3082 ulint n_ext, /*!< in: number of externally stored columns */
3083 que_thr_t* thr, /*!< in/out: query thread; can be NULL if
3084 !(~flags
3085 & (BTR_NO_LOCKING_FLAG
3086 | BTR_NO_UNDO_LOG_FLAG)) */
3087 mtr_t* mtr) /*!< in/out: mini-transaction;
3088 if this function returns DB_SUCCESS on
3089 a leaf page of a secondary index in a
3090 compressed tablespace, the caller must
3091 mtr_commit(mtr) before latching
3092 any further pages */
3093{
3094 big_rec_t* big_rec_vec = NULL;
3095 dict_index_t* index;
3096 page_cur_t* page_cursor;
3097 buf_block_t* block;
3098 page_t* page;
3099 rec_t* dummy;
3100 bool leaf;
3101 bool reorg;
3102 bool inherit = true;
3103 ulint rec_size;
3104 dberr_t err;
3105
3106 ut_ad(thr || !(~flags & (BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG)));
3107 *big_rec = NULL;
3108
3109 block = btr_cur_get_block(cursor);
3110 page = buf_block_get_frame(block);
3111 index = cursor->index;
3112
3113 ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
3114 ut_ad(!dict_index_is_online_ddl(index)
3115 || dict_index_is_clust(index)
3116 || (flags & BTR_CREATE_FLAG));
3117 ut_ad(dtuple_check_typed(entry));
3118
3119 const page_size_t& page_size = block->page.size;
3120
3121#ifdef UNIV_DEBUG_VALGRIND
3122 if (page_size.is_compressed()) {
3123 UNIV_MEM_ASSERT_RW(page, page_size.logical());
3124 UNIV_MEM_ASSERT_RW(block->page.zip.data, page_size.physical());
3125 }
3126#endif /* UNIV_DEBUG_VALGRIND */
3127
3128 leaf = page_is_leaf(page);
3129
3130 /* Calculate the record size when entry is converted to a record */
3131 rec_size = rec_get_converted_size(index, entry, n_ext);
3132
3133 if (page_zip_rec_needs_ext(rec_size, page_is_comp(page),
3134 dtuple_get_n_fields(entry), page_size)) {
3135
3136 /* The record is so big that we have to store some fields
3137 externally on separate database pages */
3138 big_rec_vec = dtuple_convert_big_rec(index, 0, entry, &n_ext);
3139
3140 if (UNIV_UNLIKELY(big_rec_vec == NULL)) {
3141
3142 return(DB_TOO_BIG_RECORD);
3143 }
3144
3145 rec_size = rec_get_converted_size(index, entry, n_ext);
3146 }
3147
3148 if (page_size.is_compressed() && page_zip_is_too_big(index, entry)) {
3149 if (big_rec_vec != NULL) {
3150 dtuple_convert_back_big_rec(index, entry, big_rec_vec);
3151 }
3152
3153 return(DB_TOO_BIG_RECORD);
3154 }
3155
3156 LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page),
3157 goto fail);
3158
3159 if (leaf && page_size.is_compressed()
3160 && (page_get_data_size(page) + rec_size
3161 >= dict_index_zip_pad_optimal_page_size(index))) {
3162 /* If the compression padding tells us that the insert
3163 would make the page too densely packed, i.e. likely to
3164 cause a compression failure, then do not attempt an
3165 optimistic insert. */
3166fail:
3167 err = DB_FAIL;
3168
3169 /* prefetch the siblings of the leaf page for the
3170 upcoming pessimistic operation, if this is a leaf page. */
3171 if (page_is_leaf(page)) {
3172 btr_cur_prefetch_siblings(block);
3173 }
3174fail_err:
3175
3176 if (big_rec_vec) {
3177 dtuple_convert_back_big_rec(index, entry, big_rec_vec);
3178 }
3179
3180 return(err);
3181 }
3182
3183 ulint max_size = page_get_max_insert_size_after_reorganize(page, 1);
3184
3185 if (page_has_garbage(page)) {
3186 if ((max_size < rec_size
3187 || max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT)
3188 && page_get_n_recs(page) > 1
3189 && page_get_max_insert_size(page, 1) < rec_size) {
3190
3191 goto fail;
3192 }
3193 } else if (max_size < rec_size) {
3194 goto fail;
3195 }
3196
3197 /* If there have been many consecutive inserts to the
3198 clustered index leaf page of an uncompressed table, check if
3199 we have to split the page to reserve enough free space for
3200 future updates of records. */
3201
3202 if (leaf && !page_size.is_compressed() && dict_index_is_clust(index)
3203 && page_get_n_recs(page) >= 2
3204 && dict_index_get_space_reserve() + rec_size > max_size
3205 && (btr_page_get_split_rec_to_right(cursor, &dummy)
3206 || btr_page_get_split_rec_to_left(cursor, &dummy))) {
3207 goto fail;
3208 }
3209
3210 page_cursor = btr_cur_get_page_cur(cursor);
3211
3212 DBUG_LOG("ib_cur",
3213 "insert " << index->name << " (" << index->id << ") by "
3214 << ib::hex(thr ? thr->graph->trx->id : 0)
3215 << ' ' << rec_printer(entry).str());
3216 DBUG_EXECUTE_IF("do_page_reorganize",
3217 btr_page_reorganize(page_cursor, index, mtr););
3218
3219 /* Now, try the insert */
3220 {
3221 const rec_t* page_cursor_rec = page_cur_get_rec(page_cursor);
3222
3223 /* Check locks and write to the undo log,
3224 if specified */
3225 err = btr_cur_ins_lock_and_undo(flags, cursor, entry,
3226 thr, mtr, &inherit);
3227 if (err != DB_SUCCESS) {
3228 goto fail_err;
3229 }
3230
3231#ifdef UNIV_DEBUG
3232 if (!(flags & BTR_CREATE_FLAG)
3233 && index->is_primary() && page_is_leaf(page)) {
3234 const dfield_t* trx_id = dtuple_get_nth_field(
3235 entry, dict_col_get_clust_pos(
3236 dict_table_get_sys_col(index->table,
3237 DATA_TRX_ID),
3238 index));
3239
3240 ut_ad(trx_id->len == DATA_TRX_ID_LEN);
3241 ut_ad(trx_id[1].len == DATA_ROLL_PTR_LEN);
3242 ut_ad(*static_cast<const byte*>
3243 (trx_id[1].data) & 0x80);
3244 if (flags & BTR_NO_UNDO_LOG_FLAG) {
3245 ut_ad(!memcmp(trx_id->data, reset_trx_id,
3246 DATA_TRX_ID_LEN));
3247 } else {
3248 ut_ad(thr->graph->trx->id);
3249 ut_ad(thr->graph->trx->id
3250 == trx_read_trx_id(
3251 static_cast<const byte*>(
3252 trx_id->data)));
3253 }
3254 }
3255#endif
3256
3257 *rec = page_cur_tuple_insert(
3258 page_cursor, entry, index, offsets, heap,
3259 n_ext, mtr);
3260
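/* page_cur_tuple_insert() may have reorganized a compressed page;
detect this by checking whether the record that the page cursor
points to has moved. */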
3261 reorg = page_cursor_rec != page_cur_get_rec(page_cursor);
3262 }
3263
3264 if (*rec) {
3265 } else if (page_size.is_compressed()) {
3266 ut_ad(!index->table->is_temporary());
3267 /* Reset the IBUF_BITMAP_FREE bits, because
3268 page_cur_tuple_insert() will have attempted page
3269 reorganize before failing. */
3270 if (leaf
3271 && !dict_index_is_clust(index)) {
3272 ibuf_reset_free_bits(block);
3273 }
3274
3275 goto fail;
3276 } else {
3277 ut_ad(!reorg);
3278
3279 /* If the record did not fit, reorganize */
3280 if (!btr_page_reorganize(page_cursor, index, mtr)) {
3281 ut_ad(0);
3282 goto fail;
3283 }
3284
3285 ut_ad(page_get_max_insert_size(page, 1) == max_size);
3286
3287 reorg = true;
3288
3289 *rec = page_cur_tuple_insert(page_cursor, entry, index,
3290 offsets, heap, n_ext, mtr);
3291
3292 if (UNIV_UNLIKELY(!*rec)) {
3293 ib::fatal() << "Cannot insert tuple " << *entry
3294 << " into index " << index->name
3295 << " of table " << index->table->name
3296 << ". Max size: " << max_size;
3297 }
3298 }
3299
3300#ifdef BTR_CUR_HASH_ADAPT
3301 if (!leaf) {
3302# ifdef MYSQL_INDEX_DISABLE_AHI
3303 } else if (index->disable_ahi) {
3304# endif
3305 } else if (entry->info_bits & REC_INFO_MIN_REC_FLAG) {
3306 ut_ad(entry->info_bits == REC_INFO_DEFAULT_ROW);
3307 ut_ad(index->is_instant());
3308 ut_ad(flags == BTR_NO_LOCKING_FLAG);
3309 } else {
3310 rw_lock_t* ahi_latch = btr_get_search_latch(index);
3311 if (!reorg && cursor->flag == BTR_CUR_HASH) {
3312 btr_search_update_hash_node_on_insert(
3313 cursor, ahi_latch);
3314 } else {
3315 btr_search_update_hash_on_insert(cursor, ahi_latch);
3316 }
3317 }
3318#endif /* BTR_CUR_HASH_ADAPT */
3319
3320 if (!(flags & BTR_NO_LOCKING_FLAG) && inherit) {
3321
3322 lock_update_insert(block, *rec);
3323 }
3324
3325 if (leaf
3326 && !dict_index_is_clust(index)
3327 && !index->table->is_temporary()) {
3328 /* Update the free bits of the B-tree page in the
3329 insert buffer bitmap. */
3330
3331 /* The free bits in the insert buffer bitmap must
3332 never exceed the free space on a page. It is safe to
3333 decrement or reset the bits in the bitmap in a
3334 mini-transaction that is committed before the
3335 mini-transaction that affects the free space. */
3336
3337 /* It is unsafe to increment the bits in a separately
3338 committed mini-transaction, because in crash recovery,
3339 the free bits could momentarily be set too high. */
3340
3341 if (page_size.is_compressed()) {
3342 /* Update the bits in the same mini-transaction. */
3343 ibuf_update_free_bits_zip(block, mtr);
3344 } else {
3345 /* Decrement the bits in a separate
3346 mini-transaction. */
3347 ibuf_update_free_bits_if_full(
3348 block, max_size,
3349 rec_size + PAGE_DIR_SLOT_SIZE);
3350 }
3351 }
3352
3353 *big_rec = big_rec_vec;
3354
3355 return(DB_SUCCESS);
3356}
3357
3358/*************************************************************//**
3359Performs an insert on a page of an index tree. It is assumed that mtr
3360holds an x-latch on the tree and on the cursor page. If the insert is
3361made on the leaf level, to avoid deadlocks, mtr must also own x-latches
3362to brothers of page, if those brothers exist.
3363@return DB_SUCCESS or error number */
3364dberr_t
3365btr_cur_pessimistic_insert(
3366/*=======================*/
3367 ulint flags, /*!< in: undo logging and locking flags: if not
3368 zero, the parameter thr should be
3369 specified; if no undo logging is specified,
3370 then the caller must have reserved enough
3371 free extents in the file space so that the
3372 insertion will certainly succeed */
3373 btr_cur_t* cursor, /*!< in: cursor after which to insert;
3374 cursor stays valid */
3375 ulint** offsets,/*!< out: offsets on *rec */
3376 mem_heap_t** heap, /*!< in/out: pointer to memory heap
3377 that can be emptied */
3378 dtuple_t* entry, /*!< in/out: entry to insert */
3379 rec_t** rec, /*!< out: pointer to inserted record if
3380 succeed */
3381 big_rec_t** big_rec,/*!< out: big rec vector whose fields have to
3382 be stored externally by the caller */
3383 ulint n_ext, /*!< in: number of externally stored columns */
3384 que_thr_t* thr, /*!< in/out: query thread; can be NULL if
3385 !(~flags
3386 & (BTR_NO_LOCKING_FLAG
3387 | BTR_NO_UNDO_LOG_FLAG)) */
3388 mtr_t* mtr) /*!< in/out: mini-transaction */
3389{
3390 dict_index_t* index = cursor->index;
3391 big_rec_t* big_rec_vec = NULL;
3392 dberr_t err;
3393 bool inherit = false;
3394 bool success;
3395 ulint n_reserved = 0;
3396
3397 ut_ad(dtuple_check_typed(entry));
3398 ut_ad(thr || !(~flags & (BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG)));
3399
3400 *big_rec = NULL;
3401
3402 ut_ad(mtr_memo_contains_flagged(
3403 mtr, dict_index_get_lock(btr_cur_get_index(cursor)),
3404 MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK));
3405 ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
3406 MTR_MEMO_PAGE_X_FIX));
3407 ut_ad(!dict_index_is_online_ddl(index)
3408 || dict_index_is_clust(index)
3409 || (flags & BTR_CREATE_FLAG));
3410
3411 cursor->flag = BTR_CUR_BINARY;
3412
3413 /* Check locks and write to undo log, if specified */
3414
3415 err = btr_cur_ins_lock_and_undo(flags, cursor, entry,
3416 thr, mtr, &inherit);
3417
3418 if (err != DB_SUCCESS) {
3419
3420 return(err);
3421 }
3422
3423 if (!(flags & BTR_NO_UNDO_LOG_FLAG)) {
3424 /* First reserve enough free space for the file segments
3425 of the index tree, so that the insert will not fail because
3426 of lack of space */
3427
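/* Reserve 3 extents plus roughly one more for every 16 levels of
tree height, so that a split that cascades up the tree cannot run
out of space in mid-operation. */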
3428 ulint n_extents = cursor->tree_height / 16 + 3;
3429
3430 success = fsp_reserve_free_extents(&n_reserved,
3431 index->table->space,
3432 n_extents, FSP_NORMAL, mtr);
3433 if (!success) {
3434 return(DB_OUT_OF_FILE_SPACE);
3435 }
3436 }
3437
3438 if (page_zip_rec_needs_ext(rec_get_converted_size(index, entry, n_ext),
3439 dict_table_is_comp(index->table),
3440 dtuple_get_n_fields(entry),
3441 dict_table_page_size(index->table))) {
3442 /* The record is so big that we have to store some fields
3443 externally on separate database pages */
3444
3445 if (UNIV_LIKELY_NULL(big_rec_vec)) {
3446 /* This should never happen, but we handle
3447 the situation in a robust manner. */
3448 ut_ad(0);
3449 dtuple_convert_back_big_rec(index, entry, big_rec_vec);
3450 }
3451
3452 big_rec_vec = dtuple_convert_big_rec(index, 0, entry, &n_ext);
3453
3454 if (big_rec_vec == NULL) {
3455
3456 index->table->space->release_free_extents(n_reserved);
3457 return(DB_TOO_BIG_RECORD);
3458 }
3459 }
3460
3461 if (dict_index_get_page(index)
3462 == btr_cur_get_block(cursor)->page.id.page_no()) {
3463
3464 /* The page is the root page */
3465 *rec = btr_root_raise_and_insert(
3466 flags, cursor, offsets, heap, entry, n_ext, mtr);
3467 } else {
3468 *rec = btr_page_split_and_insert(
3469 flags, cursor, offsets, heap, entry, n_ext, mtr);
3470 }
3471
3472 if (*rec == NULL && os_has_said_disk_full) {
3473 return(DB_OUT_OF_FILE_SPACE);
3474 }
3475
3476 ut_ad(page_rec_get_next(btr_cur_get_rec(cursor)) == *rec
3477 || dict_index_is_spatial(index));
3478
3479 if (!(flags & BTR_NO_LOCKING_FLAG)) {
3480 ut_ad(!index->table->is_temporary());
3481 if (dict_index_is_spatial(index)) {
3482 /* Do nothing */
3483 } else {
3484 /* The cursor might have been moved to another page,
3485 so the max trx id field must be updated after the
3486 cursor has been positioned. */
3487 if (!dict_index_is_clust(index)) {
3488 page_update_max_trx_id(
3489 btr_cur_get_block(cursor),
3490 btr_cur_get_page_zip(cursor),
3491 thr_get_trx(thr)->id, mtr);
3492 }
3493
3494 if (!page_rec_is_infimum(btr_cur_get_rec(cursor))
3495 || btr_page_get_prev(
3496 buf_block_get_frame(
3497 btr_cur_get_block(cursor)), mtr)
3498 == FIL_NULL) {
3499 /* after a split-and-insert we always
3500 need to call lock_update_insert(). */
3501 inherit = true;
3502 }
3503 }
3504 }
3505
3506 if (!page_is_leaf(btr_cur_get_page(cursor))) {
3507 ut_ad(!big_rec_vec);
3508 } else {
3509#ifdef BTR_CUR_HASH_ADAPT
3510# ifdef MYSQL_INDEX_DISABLE_AHI
3511 if (index->disable_ahi); else
3512# endif
3513 if (entry->info_bits & REC_INFO_MIN_REC_FLAG) {
3514 ut_ad(entry->info_bits == REC_INFO_DEFAULT_ROW);
3515 ut_ad(index->is_instant());
3516 ut_ad((flags & ulint(~BTR_KEEP_IBUF_BITMAP))
3517 == BTR_NO_LOCKING_FLAG);
3518 } else {
3519 btr_search_update_hash_on_insert(
3520 cursor, btr_get_search_latch(index));
3521 }
3522#endif /* BTR_CUR_HASH_ADAPT */
3523 if (inherit && !(flags & BTR_NO_LOCKING_FLAG)) {
3524
3525 lock_update_insert(btr_cur_get_block(cursor), *rec);
3526 }
3527 }
3528
3529 index->table->space->release_free_extents(n_reserved);
3530 *big_rec = big_rec_vec;
3531
3532 return(DB_SUCCESS);
3533}
3534
3535/*==================== B-TREE UPDATE =========================*/
3536
3537/*************************************************************//**
3538For an update, checks the locks and does the undo logging.
3539@return DB_SUCCESS, DB_WAIT_LOCK, or error number */
3540UNIV_INLINE MY_ATTRIBUTE((warn_unused_result))
3541dberr_t
3542btr_cur_upd_lock_and_undo(
3543/*======================*/
3544 ulint flags, /*!< in: undo logging and locking flags */
3545 btr_cur_t* cursor, /*!< in: cursor on record to update */
3546 const ulint* offsets,/*!< in: rec_get_offsets() on cursor */
3547 const upd_t* update, /*!< in: update vector */
3548 ulint cmpl_info,/*!< in: compiler info on secondary index
3549 updates */
3550 que_thr_t* thr, /*!< in: query thread
3551 (can be NULL if BTR_NO_LOCKING_FLAG) */
3552 mtr_t* mtr, /*!< in/out: mini-transaction */
3553 roll_ptr_t* roll_ptr)/*!< out: roll pointer */
3554{
3555 dict_index_t* index;
3556 const rec_t* rec;
3557 dberr_t err;
3558
3559 ut_ad((thr != NULL) || (flags & BTR_NO_LOCKING_FLAG));
3560
3561 rec = btr_cur_get_rec(cursor);
3562 index = cursor->index;
3563
3564 ut_ad(rec_offs_validate(rec, index, offsets));
3565 ut_ad(mtr->is_named_space(index->table->space));
3566
3567 if (!dict_index_is_clust(index)) {
3568 ut_ad(dict_index_is_online_ddl(index)
3569 == !!(flags & BTR_CREATE_FLAG));
3570
3571 /* We do undo logging only when we update a clustered index
3572 record */
3573 return(lock_sec_rec_modify_check_and_lock(
3574 flags, btr_cur_get_block(cursor), rec,
3575 index, thr, mtr));
3576 }
3577
3578 /* Check if we have to wait for a lock: enqueue an explicit lock
3579 request if yes */
3580
3581 if (!(flags & BTR_NO_LOCKING_FLAG)) {
3582 err = lock_clust_rec_modify_check_and_lock(
3583 flags, btr_cur_get_block(cursor), rec, index,
3584 offsets, thr);
3585 if (err != DB_SUCCESS) {
3586 return(err);
3587 }
3588 }
3589
3590 /* Append the info about the update in the undo log */
3591
3592 return((flags & BTR_NO_UNDO_LOG_FLAG)
3593 ? DB_SUCCESS
3594 : trx_undo_report_row_operation(
3595 thr, index, NULL, update,
3596 cmpl_info, rec, offsets, roll_ptr));
3597}
3598
3599/***********************************************************//**
3600Writes a redo log record of updating a record in-place. */
3601void
3602btr_cur_update_in_place_log(
3603/*========================*/
3604 ulint flags, /*!< in: flags */
3605 const rec_t* rec, /*!< in: record */
3606 dict_index_t* index, /*!< in: index of the record */
3607 const upd_t* update, /*!< in: update vector */
3608 trx_id_t trx_id, /*!< in: transaction id */
3609 roll_ptr_t roll_ptr, /*!< in: roll ptr */
3610 mtr_t* mtr) /*!< in: mtr */
3611{
3612 byte* log_ptr;
3613 const page_t* page = page_align(rec);
3614 ut_ad(flags < 256);
3615 ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table));
3616
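/* After the header written by mlog_open_and_write_index(), the body
of this log record consists of a one-byte flags field, the DB_TRX_ID
position, DB_ROLL_PTR and DB_TRX_ID values (dummies for a secondary
index), the two-byte page offset of the record, and the update
vector. */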
3617 log_ptr = mlog_open_and_write_index(mtr, rec, index, page_is_comp(page)
3618 ? MLOG_COMP_REC_UPDATE_IN_PLACE
3619 : MLOG_REC_UPDATE_IN_PLACE,
3620 1 + DATA_ROLL_PTR_LEN + 14 + 2
3621 + MLOG_BUF_MARGIN);
3622
3623 if (!log_ptr) {
3624 /* Logging in mtr is switched off during crash recovery */
3625 return;
3626 }
3627
3628 /* For secondary indexes, we could skip writing the dummy system fields
3629 to the redo log, but that would require changing the redo log parsing of
3630 MLOG_REC_UPDATE_IN_PLACE/MLOG_COMP_REC_UPDATE_IN_PLACE or adding a new
3631 redo log record type. For now, just write dummy system fields to the
3632 redo log when we are updating a secondary index record.
3633 */
3634 mach_write_to_1(log_ptr, flags);
3635 log_ptr++;
3636
3637 if (dict_index_is_clust(index)) {
3638 log_ptr = row_upd_write_sys_vals_to_log(
3639 index, trx_id, roll_ptr, log_ptr, mtr);
3640 } else {
3641 /* Dummy system fields for a secondary index */
3642 /* TRX_ID Position */
3643 log_ptr += mach_write_compressed(log_ptr, 0);
3644 /* ROLL_PTR */
3645 trx_write_roll_ptr(log_ptr, 0);
3646 log_ptr += DATA_ROLL_PTR_LEN;
3647 /* TRX_ID */
3648 log_ptr += mach_u64_write_compressed(log_ptr, 0);
3649 }
3650
3651 mach_write_to_2(log_ptr, page_offset(rec));
3652 log_ptr += 2;
3653
3654 row_upd_index_write_log(update, log_ptr, mtr);
3655}
3656
3657/***********************************************************//**
3658Parses a redo log record of updating a record in-place.
3659@return end of log record or NULL */
3660byte*
3661btr_cur_parse_update_in_place(
3662/*==========================*/
3663 byte* ptr, /*!< in: buffer */
3664 byte* end_ptr,/*!< in: buffer end */
3665 page_t* page, /*!< in/out: page or NULL */
3666 page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
3667 dict_index_t* index) /*!< in: index corresponding to page */
3668{
3669 ulint flags;
3670 rec_t* rec;
3671 upd_t* update;
3672 ulint pos;
3673 trx_id_t trx_id;
3674 roll_ptr_t roll_ptr;
3675 ulint rec_offset;
3676 mem_heap_t* heap;
3677 ulint* offsets;
3678
3679 if (end_ptr < ptr + 1) {
3680
3681 return(NULL);
3682 }
3683
3684 flags = mach_read_from_1(ptr);
3685 ptr++;
3686
3687 ptr = row_upd_parse_sys_vals(ptr, end_ptr, &pos, &trx_id, &roll_ptr);
3688
3689 if (ptr == NULL) {
3690
3691 return(NULL);
3692 }
3693
3694 if (end_ptr < ptr + 2) {
3695
3696 return(NULL);
3697 }
3698
3699 rec_offset = mach_read_from_2(ptr);
3700 ptr += 2;
3701
3702 ut_a(rec_offset <= srv_page_size);
3703
3704 heap = mem_heap_create(256);
3705
3706 ptr = row_upd_index_parse(ptr, end_ptr, heap, &update);
3707
3708 if (!ptr || !page) {
3709
3710 goto func_exit;
3711 }
3712
3713 ut_a((ibool)!!page_is_comp(page) == dict_table_is_comp(index->table));
3714 rec = page + rec_offset;
3715
3716 /* We do not need to reserve search latch, as the page is only
3717 being recovered, and there cannot be a hash index to it. */
3718
3719 /* The function rtr_update_mbr_field_in_place() is generating
3720 these records on node pointer pages; therefore we have to
3721 check if this is a leaf page. */
3722
3723 offsets = rec_get_offsets(rec, index, NULL,
3724 flags != (BTR_NO_UNDO_LOG_FLAG
3725 | BTR_NO_LOCKING_FLAG
3726 | BTR_KEEP_SYS_FLAG)
3727 || page_is_leaf(page),
3728 ULINT_UNDEFINED, &heap);
3729
3730 if (!(flags & BTR_KEEP_SYS_FLAG)) {
3731 row_upd_rec_sys_fields_in_recovery(rec, page_zip, offsets,
3732 pos, trx_id, roll_ptr);
3733 }
3734
3735 row_upd_rec_in_place(rec, index, offsets, update, page_zip);
3736
3737func_exit:
3738 mem_heap_free(heap);
3739
3740 return(ptr);
3741}
3742
3743/*************************************************************//**
See if there is enough space in the page modification log to log
an update-in-place.

@retval false if out of space; IBUF_BITMAP_FREE will be reset
outside mtr if the page was recompressed
@retval true if there is enough space
3750
3751IMPORTANT: The caller will have to update IBUF_BITMAP_FREE if this is
3752a secondary index leaf page. This has to be done either within the
3753same mini-transaction, or by invoking ibuf_reset_free_bits() before
3754mtr_commit(mtr). */
3755bool
3756btr_cur_update_alloc_zip_func(
3757/*==========================*/
3758 page_zip_des_t* page_zip,/*!< in/out: compressed page */
3759 page_cur_t* cursor, /*!< in/out: B-tree page cursor */
3760 dict_index_t* index, /*!< in: the index corresponding to cursor */
3761#ifdef UNIV_DEBUG
3762 ulint* offsets,/*!< in/out: offsets of the cursor record */
3763#endif /* UNIV_DEBUG */
3764 ulint length, /*!< in: size needed */
3765 bool create, /*!< in: true=delete-and-insert,
3766 false=update-in-place */
3767 mtr_t* mtr) /*!< in/out: mini-transaction */
3768{
3769
3770 /* Have a local copy of the variables as these can change
3771 dynamically. */
3772 const page_t* page = page_cur_get_page(cursor);
3773
3774 ut_ad(page_zip == page_cur_get_page_zip(cursor));
3775 ut_ad(!dict_index_is_ibuf(index));
3776 ut_ad(rec_offs_validate(page_cur_get_rec(cursor), index, offsets));
3777
3778 if (page_zip_available(page_zip, dict_index_is_clust(index),
3779 length, create)) {
3780 return(true);
3781 }
3782
3783 if (!page_zip->m_nonempty && !page_has_garbage(page)) {
3784 /* The page has been freshly compressed, so
3785 reorganizing it will not help. */
3786 return(false);
3787 }
3788
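	/* When the update would be applied as delete-and-insert on a
	leaf page, refuse to grow the data size past the compression
	padding target; reorganizing would not help, so report out of
	space and let the caller fall back to a pessimistic operation. */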
3789 if (create && page_is_leaf(page)
3790 && (length + page_get_data_size(page)
3791 >= dict_index_zip_pad_optimal_page_size(index))) {
3792 return(false);
3793 }
3794
3795 if (!btr_page_reorganize(cursor, index, mtr)) {
3796 goto out_of_space;
3797 }
3798
3799 rec_offs_make_valid(page_cur_get_rec(cursor), index,
3800 page_is_leaf(page), offsets);
3801
3802 /* After recompressing a page, we must make sure that the free
3803 bits in the insert buffer bitmap will not exceed the free
3804 space on the page. Because this function will not attempt
3805 recompression unless page_zip_available() fails above, it is
3806 safe to reset the free bits if page_zip_available() fails
3807 again, below. The free bits can safely be reset in a separate
3808 mini-transaction. If page_zip_available() succeeds below, we
3809 can be sure that the btr_page_reorganize() above did not reduce
3810 the free space available on the page. */
3811
3812 if (page_zip_available(page_zip, dict_index_is_clust(index),
3813 length, create)) {
3814 return(true);
3815 }
3816
3817out_of_space:
3818 ut_ad(rec_offs_validate(page_cur_get_rec(cursor), index, offsets));
3819
3820 /* Out of space: reset the free bits. */
3821 if (!dict_index_is_clust(index)
3822 && !index->table->is_temporary()
3823 && page_is_leaf(page)) {
3824 ibuf_reset_free_bits(page_cur_get_block(cursor));
3825 }
3826
3827 return(false);
3828}
3829
3830/*************************************************************//**
3831Updates a record when the update causes no size changes in its fields.
3832We assume here that the ordering fields of the record do not change.
3833@return locking or undo log related error code, or
3834@retval DB_SUCCESS on success
3835@retval DB_ZIP_OVERFLOW if there is not enough space left
3836on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */
3837dberr_t
3838btr_cur_update_in_place(
3839/*====================*/
3840 ulint flags, /*!< in: undo logging and locking flags */
3841 btr_cur_t* cursor, /*!< in: cursor on the record to update;
3842 cursor stays valid and positioned on the
3843 same record */
3844 ulint* offsets,/*!< in/out: offsets on cursor->page_cur.rec */
3845 const upd_t* update, /*!< in: update vector */
3846 ulint cmpl_info,/*!< in: compiler info on secondary index
3847 updates */
3848 que_thr_t* thr, /*!< in: query thread */
3849 trx_id_t trx_id, /*!< in: transaction id */
3850 mtr_t* mtr) /*!< in/out: mini-transaction; if this
3851 is a secondary index, the caller must
3852 mtr_commit(mtr) before latching any
3853 further pages */
3854{
3855 dict_index_t* index;
3856 buf_block_t* block;
3857 page_zip_des_t* page_zip;
3858 dberr_t err;
3859 rec_t* rec;
3860 roll_ptr_t roll_ptr = 0;
3861 ulint was_delete_marked;
3862
3863 ut_ad(page_is_leaf(cursor->page_cur.block->frame));
3864 rec = btr_cur_get_rec(cursor);
3865 index = cursor->index;
3866 ut_ad(rec_offs_validate(rec, index, offsets));
3867 ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
3868 ut_ad(trx_id > 0 || (flags & BTR_KEEP_SYS_FLAG));
3869 /* The insert buffer tree should never be updated in place. */
3870 ut_ad(!dict_index_is_ibuf(index));
3871 ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
3872 || dict_index_is_clust(index));
3873 ut_ad(thr_get_trx(thr)->id == trx_id
3874 || (flags & ulint(~(BTR_KEEP_POS_FLAG | BTR_KEEP_IBUF_BITMAP)))
3875 == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
3876 | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
3877 ut_ad(fil_page_index_page_check(btr_cur_get_page(cursor)));
3878 ut_ad(btr_page_get_index_id(btr_cur_get_page(cursor)) == index->id);
3879 ut_ad(!(update->info_bits & REC_INFO_MIN_REC_FLAG));
3880
3881 DBUG_LOG("ib_cur",
3882 "update-in-place " << index->name << " (" << index->id
3883 << ") by " << ib::hex(trx_id) << ": "
3884 << rec_printer(rec, offsets).str());
3885
3886 block = btr_cur_get_block(cursor);
3887 page_zip = buf_block_get_page_zip(block);
3888
3889 /* Check that enough space is available on the compressed page. */
3890 if (page_zip) {
3891 ut_ad(!index->table->is_temporary());
3892
3893 if (!btr_cur_update_alloc_zip(
3894 page_zip, btr_cur_get_page_cur(cursor),
3895 index, offsets, rec_offs_size(offsets),
3896 false, mtr)) {
3897 return(DB_ZIP_OVERFLOW);
3898 }
3899
3900 rec = btr_cur_get_rec(cursor);
3901 }
3902
3903 /* Do lock checking and undo logging */
3904 err = btr_cur_upd_lock_and_undo(flags, cursor, offsets,
3905 update, cmpl_info,
3906 thr, mtr, &roll_ptr);
3907 if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
3908 /* We may need to update the IBUF_BITMAP_FREE
3909 bits after a reorganize that was done in
3910 btr_cur_update_alloc_zip(). */
3911 goto func_exit;
3912 }
3913
3914 if (!(flags & BTR_KEEP_SYS_FLAG)) {
3915 row_upd_rec_sys_fields(rec, NULL, index, offsets,
3916 thr_get_trx(thr), roll_ptr);
3917 }
3918
3919 was_delete_marked = rec_get_deleted_flag(
3920 rec, page_is_comp(buf_block_get_frame(block)));
3921 /* In delete-marked records, DB_TRX_ID must always refer to an
3922 existing undo log record. */
3923 ut_ad(!was_delete_marked
3924 || !dict_index_is_clust(index)
3925 || row_get_rec_trx_id(rec, index, offsets));
3926
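	/* If the block is cached in the adaptive hash index, remove any
	hash pointer that could become stale and hold the AHI latch in
	exclusive mode while the record is modified in place, so that
	concurrent hash searches cannot observe a half-updated record. */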
3927#ifdef BTR_CUR_HASH_ADAPT
3928 {
3929 rw_lock_t* ahi_latch = block->index
3930 ? btr_get_search_latch(index) : NULL;
3931 if (ahi_latch) {
3932 /* TO DO: Can we skip this if none of the fields
3933 index->search_info->curr_n_fields
3934 are being updated? */
3935
3936 /* The function row_upd_changes_ord_field_binary
3937 does not work on a secondary index. */
3938
3939 if (!dict_index_is_clust(index)
3940 || row_upd_changes_ord_field_binary(
3941 index, update, thr, NULL, NULL)) {
3942 ut_ad(!(update->info_bits
3943 & REC_INFO_MIN_REC_FLAG));
3944 /* Remove possible hash index pointer
3945 to this record */
3946 btr_search_update_hash_on_delete(cursor);
3947 }
3948
3949 rw_lock_x_lock(ahi_latch);
3950 }
3951
3952 assert_block_ahi_valid(block);
3953#endif /* BTR_CUR_HASH_ADAPT */
3954
3955 row_upd_rec_in_place(rec, index, offsets, update, page_zip);
3956
3957#ifdef BTR_CUR_HASH_ADAPT
3958 if (ahi_latch) {
3959 rw_lock_x_unlock(ahi_latch);
3960 }
3961 }
3962#endif /* BTR_CUR_HASH_ADAPT */
3963
3964 btr_cur_update_in_place_log(flags, rec, index, update,
3965 trx_id, roll_ptr, mtr);
3966
3967 if (was_delete_marked
3968 && !rec_get_deleted_flag(
3969 rec, page_is_comp(buf_block_get_frame(block)))) {
3970 /* The new updated record owns its possible externally
3971 stored fields */
3972
3973 btr_cur_unmark_extern_fields(page_zip,
3974 rec, index, offsets, mtr);
3975 }
3976
3977 ut_ad(err == DB_SUCCESS);
3978
3979func_exit:
3980 if (page_zip
3981 && !(flags & BTR_KEEP_IBUF_BITMAP)
3982 && !dict_index_is_clust(index)
3983 && page_is_leaf(buf_block_get_frame(block))) {
3984 /* Update the free bits in the insert buffer. */
3985 ut_ad(!index->table->is_temporary());
3986 ibuf_update_free_bits_zip(block, mtr);
3987 }
3988
3989 return(err);
3990}
3991
3992/** Trim an update tuple due to instant ADD COLUMN, if needed.
3993For normal records, the trailing instantly added fields that match
3994the 'default row' are omitted.
3995
3996For the special 'default row' record on a table on which instant
3997ADD COLUMN has already been executed, both ADD COLUMN and the
3998rollback of ADD COLUMN need to be handled specially.
3999
4000@param[in,out] entry index entry
4001@param[in] index index
4002@param[in] update update vector
4003@param[in] thr execution thread */
4004static inline
4005void
4006btr_cur_trim(
4007 dtuple_t* entry,
4008 const dict_index_t* index,
4009 const upd_t* update,
4010 const que_thr_t* thr)
4011{
4012 if (!index->is_instant()) {
4013 } else if (UNIV_UNLIKELY(update->info_bits == REC_INFO_DEFAULT_ROW)) {
4014 /* We are either updating a 'default row'
4015 (instantly adding columns to a table where instant ADD was
4016 already executed) or rolling back such an operation. */
4017 ut_ad(!upd_get_nth_field(update, 0)->orig_len);
4018 ut_ad(upd_get_nth_field(update, 0)->field_no
4019 > index->n_core_fields);
4020
4021 if (thr->graph->trx->in_rollback) {
4022 /* This rollback can occur either as part of
4023 ha_innobase::commit_inplace_alter_table() rolling
4024 back after a failed innobase_add_instant_try(),
4025 or as part of crash recovery. Either way, the
4026 table will be in the data dictionary cache, with
4027 the instantly added columns going to be removed
4028 later in the rollback. */
4029 ut_ad(index->table->cached);
4030 /* The DB_TRX_ID,DB_ROLL_PTR are always last,
4031 and there should be some change to roll back.
4032 The first field in the update vector is the
4033 first instantly added column logged by
4034 innobase_add_instant_try(). */
4035 ut_ad(update->n_fields > 2);
4036 ulint n_fields = upd_get_nth_field(update, 0)
4037 ->field_no;
4038 ut_ad(n_fields + 1 >= entry->n_fields);
4039 entry->n_fields = n_fields;
4040 }
4041 } else {
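		/* For a normal record, omit the trailing instantly
		added fields that match the 'default row' values (see
		the function comment above). */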
4042 entry->trim(*index);
4043 }
4044}
4045
4046/*************************************************************//**
4047Tries to update a record on a page in an index tree. It is assumed that mtr
4048holds an x-latch on the page. The operation does not succeed if there is too
4049little space on the page or if the update would result in too empty a page,
4050so that tree compression is recommended. We assume here that the ordering
4051fields of the record do not change.
4052@return error code, including
4053@retval DB_SUCCESS on success
4054@retval DB_OVERFLOW if the updated record does not fit
4055@retval DB_UNDERFLOW if the page would become too empty
4056@retval DB_ZIP_OVERFLOW if there is not enough space left
4057on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */
4058dberr_t
4059btr_cur_optimistic_update(
4060/*======================*/
4061 ulint flags, /*!< in: undo logging and locking flags */
4062 btr_cur_t* cursor, /*!< in: cursor on the record to update;
4063 cursor stays valid and positioned on the
4064 same record */
4065 ulint** offsets,/*!< out: offsets on cursor->page_cur.rec */
4066 mem_heap_t** heap, /*!< in/out: pointer to NULL or memory heap */
4067 const upd_t* update, /*!< in: update vector; this must also
4068 contain trx id and roll ptr fields */
4069 ulint cmpl_info,/*!< in: compiler info on secondary index
4070 updates */
4071 que_thr_t* thr, /*!< in: query thread */
4072 trx_id_t trx_id, /*!< in: transaction id */
4073 mtr_t* mtr) /*!< in/out: mini-transaction; if this
4074 is a secondary index, the caller must
4075 mtr_commit(mtr) before latching any
4076 further pages */
4077{
4078 dict_index_t* index;
4079 page_cur_t* page_cursor;
4080 dberr_t err;
4081 buf_block_t* block;
4082 page_t* page;
4083 page_zip_des_t* page_zip;
4084 rec_t* rec;
4085 ulint max_size;
4086 ulint new_rec_size;
4087 ulint old_rec_size;
4088 ulint max_ins_size = 0;
4089 dtuple_t* new_entry;
4090 roll_ptr_t roll_ptr;
4091 ulint i;
4092 ulint n_ext;
4093
4094 block = btr_cur_get_block(cursor);
4095 page = buf_block_get_frame(block);
4096 rec = btr_cur_get_rec(cursor);
4097 index = cursor->index;
4098 ut_ad(trx_id > 0 || (flags & BTR_KEEP_SYS_FLAG));
4099 ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
4100 ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
4101 /* This is intended only for leaf page updates */
4102 ut_ad(page_is_leaf(page));
4103 /* The insert buffer tree should never be updated in place. */
4104 ut_ad(!dict_index_is_ibuf(index));
4105 ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
4106 || dict_index_is_clust(index));
4107 ut_ad(thr_get_trx(thr)->id == trx_id
4108 || (flags & ulint(~(BTR_KEEP_POS_FLAG | BTR_KEEP_IBUF_BITMAP)))
4109 == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
4110 | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
4111 ut_ad(fil_page_index_page_check(page));
4112 ut_ad(btr_page_get_index_id(page) == index->id);
4113
4114 *offsets = rec_get_offsets(rec, index, *offsets, true,
4115 ULINT_UNDEFINED, heap);
4116#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
4117 ut_a(!rec_offs_any_null_extern(rec, *offsets)
4118 || trx_is_recv(thr_get_trx(thr)));
4119#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
4120
4121 const bool is_default_row = update->info_bits == REC_INFO_DEFAULT_ROW;
4122
4123 if (UNIV_LIKELY(!is_default_row)
4124 && !row_upd_changes_field_size_or_external(index, *offsets,
4125 update)) {
4126
4127 /* The simplest and the most common case: the update does not
4128 change the size of any field and none of the updated fields is
4129 externally stored in rec or update, and there is enough space
4130 on the compressed page to log the update. */
4131
4132 return(btr_cur_update_in_place(
4133 flags, cursor, *offsets, update,
4134 cmpl_info, thr, trx_id, mtr));
4135 }
4136
4137 if (rec_offs_any_extern(*offsets)) {
4138any_extern:
4139 /* Externally stored fields are treated in pessimistic
4140 update */
4141
4142 /* prefetch siblings of the leaf for the pessimistic
4143 operation. */
4144 btr_cur_prefetch_siblings(block);
4145
4146 return(DB_OVERFLOW);
4147 }
4148
4149 for (i = 0; i < upd_get_n_fields(update); i++) {
4150 if (dfield_is_ext(&upd_get_nth_field(update, i)->new_val)) {
4151
4152 goto any_extern;
4153 }
4154 }
4155
4156 DBUG_LOG("ib_cur",
4157 "update " << index->name << " (" << index->id << ") by "
4158 << ib::hex(trx_id) << ": "
4159 << rec_printer(rec, *offsets).str());
4160
4161 page_cursor = btr_cur_get_page_cur(cursor);
4162
4163 if (!*heap) {
4164 *heap = mem_heap_create(
4165 rec_offs_size(*offsets)
4166 + DTUPLE_EST_ALLOC(rec_offs_n_fields(*offsets)));
4167 }
4168
4169 new_entry = row_rec_to_index_entry(rec, index, *offsets,
4170 &n_ext, *heap);
4171 /* We checked above that there are no externally stored fields. */
4172 ut_a(!n_ext);
4173
4174 /* The page containing the clustered index record
4175 corresponding to new_entry is latched in mtr.
4176 Thus the following call is safe. */
4177 row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,
4178 *heap);
4179 btr_cur_trim(new_entry, index, update, thr);
4180 old_rec_size = rec_offs_size(*offsets);
4181 new_rec_size = rec_get_converted_size(index, new_entry, 0);
4182
4183 page_zip = buf_block_get_page_zip(block);
4184#ifdef UNIV_ZIP_DEBUG
4185 ut_a(!page_zip || page_zip_validate(page_zip, page, index));
4186#endif /* UNIV_ZIP_DEBUG */
4187
4188 if (page_zip) {
4189 ut_ad(!index->table->is_temporary());
4190
4191 if (page_zip_rec_needs_ext(new_rec_size, page_is_comp(page),
4192 dict_index_get_n_fields(index),
4193 dict_table_page_size(index->table))) {
4194 goto any_extern;
4195 }
4196
4197 if (!btr_cur_update_alloc_zip(
4198 page_zip, page_cursor, index, *offsets,
4199 new_rec_size, true, mtr)) {
4200 return(DB_ZIP_OVERFLOW);
4201 }
4202
4203 rec = page_cur_get_rec(page_cursor);
4204 }
4205
4206 /* We limit max record size to 16k even for 64k page size. */
4207 if (new_rec_size >= COMPRESSED_REC_MAX_DATA_SIZE ||
4208 (!dict_table_is_comp(index->table)
4209 && new_rec_size >= REDUNDANT_REC_MAX_DATA_SIZE)) {
4210 err = DB_OVERFLOW;
4211
4212 goto func_exit;
4213 }
4214
4215 if (UNIV_UNLIKELY(new_rec_size
4216 >= (page_get_free_space_of_empty(page_is_comp(page))
4217 / 2))) {
4218 /* We may need to update the IBUF_BITMAP_FREE
4219 bits after a reorganize that was done in
4220 btr_cur_update_alloc_zip(). */
4221 err = DB_OVERFLOW;
4222 goto func_exit;
4223 }
4224
4225 if (UNIV_UNLIKELY(page_get_data_size(page)
4226 - old_rec_size + new_rec_size
4227 < BTR_CUR_PAGE_COMPRESS_LIMIT(index))) {
4228 /* We may need to update the IBUF_BITMAP_FREE
4229 bits after a reorganize that was done in
4230 btr_cur_update_alloc_zip(). */
4231
4232 /* The page would become too empty */
4233 err = DB_UNDERFLOW;
4234 goto func_exit;
4235 }
4236
4237 /* We do not attempt to reorganize if the page is compressed.
4238 This is because the page may fail to compress after reorganization. */
4239 max_size = page_zip
4240 ? page_get_max_insert_size(page, 1)
4241 : (old_rec_size
4242 + page_get_max_insert_size_after_reorganize(page, 1));
4243
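	/* Remember the free space that a reorganization would yield;
	it is needed at func_exit for updating the change buffer free
	bits of an uncompressed secondary index leaf page. */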
4244 if (!page_zip) {
4245 max_ins_size = page_get_max_insert_size_after_reorganize(
4246 page, 1);
4247 }
4248
4249 if (!(((max_size >= BTR_CUR_PAGE_REORGANIZE_LIMIT)
4250 && (max_size >= new_rec_size))
4251 || (page_get_n_recs(page) <= 1))) {
4252
4253 /* We may need to update the IBUF_BITMAP_FREE
4254 bits after a reorganize that was done in
4255 btr_cur_update_alloc_zip(). */
4256
4257 /* There was not enough space, or it did not pay to
4258 reorganize: for simplicity, we decide what to do assuming a
4259 reorganization is needed, though it might not be necessary */
4260
4261 err = DB_OVERFLOW;
4262 goto func_exit;
4263 }
4264
4265 /* Do lock checking and undo logging */
4266 err = btr_cur_upd_lock_and_undo(flags, cursor, *offsets,
4267 update, cmpl_info,
4268 thr, mtr, &roll_ptr);
4269 if (err != DB_SUCCESS) {
4270 /* We may need to update the IBUF_BITMAP_FREE
4271 bits after a reorganize that was done in
4272 btr_cur_update_alloc_zip(). */
4273 goto func_exit;
4274 }
4275
4276 /* Ok, we may do the replacement. Store on the page infimum the
4277 explicit locks on rec, before deleting rec (see the comment in
4278 btr_cur_pessimistic_update). */
4279 if (!dict_table_is_locking_disabled(index->table)) {
4280 lock_rec_store_on_page_infimum(block, rec);
4281 }
4282
4283 if (UNIV_UNLIKELY(is_default_row)) {
4284 ut_ad(new_entry->info_bits == REC_INFO_DEFAULT_ROW);
4285 ut_ad(index->is_instant());
4286 /* This can be innobase_add_instant_try() performing a
4287 subsequent instant ADD COLUMN, or its rollback by
4288 row_undo_mod_clust_low(). */
4289 ut_ad(flags & BTR_NO_LOCKING_FLAG);
4290 } else {
4291 btr_search_update_hash_on_delete(cursor);
4292 }
4293
4294 page_cur_delete_rec(page_cursor, index, *offsets, mtr);
4295
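	/* Step back to the predecessor record, so that the re-insert
	below places the updated record at the position of the deleted
	one. */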
4296 page_cur_move_to_prev(page_cursor);
4297
4298 if (!(flags & BTR_KEEP_SYS_FLAG)) {
4299 row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR,
4300 roll_ptr);
4301 row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID,
4302 trx_id);
4303 }
4304
4305 /* There are no externally stored columns in new_entry */
4306 rec = btr_cur_insert_if_possible(
4307 cursor, new_entry, offsets, heap, 0/*n_ext*/, mtr);
4308 ut_a(rec); /* <- We calculated above the insert would fit */
4309
4310 if (UNIV_UNLIKELY(is_default_row)) {
4311 /* We must empty the PAGE_FREE list, because if this
4312 was a rollback, the shortened 'default row' record
4313 would have too many fields, and we would be unable to
4314 know the size of the freed record. */
4315 btr_page_reorganize(page_cursor, index, mtr);
4316 } else if (!dict_table_is_locking_disabled(index->table)) {
4317 /* Restore the old explicit lock state on the record */
4318 lock_rec_restore_from_page_infimum(block, rec, block);
4319 }
4320
4321 page_cur_move_to_next(page_cursor);
4322 ut_ad(err == DB_SUCCESS);
4323
4324func_exit:
4325 if (!(flags & BTR_KEEP_IBUF_BITMAP)
4326 && !dict_index_is_clust(index)) {
4327 /* Update the free bits in the insert buffer. */
4328 if (page_zip) {
4329 ut_ad(!index->table->is_temporary());
4330 ibuf_update_free_bits_zip(block, mtr);
4331 } else if (!index->table->is_temporary()) {
4332 ibuf_update_free_bits_low(block, max_ins_size, mtr);
4333 }
4334 }
4335
4336 if (err != DB_SUCCESS) {
4337 /* prefetch siblings of the leaf for the pessimistic
4338 operation. */
4339 btr_cur_prefetch_siblings(block);
4340 }
4341
4342 return(err);
4343}
4344
4345/*************************************************************//**
4346If, in a split, a new supremum record was created as the predecessor of the
4347updated record, the supremum record must inherit exactly the locks on the
4348updated record. In the split it may have inherited locks from the successor
4349of the updated record, which is not correct. This function restores the
4350right locks for the new supremum. */
4351static
4352void
4353btr_cur_pess_upd_restore_supremum(
4354/*==============================*/
4355 buf_block_t* block, /*!< in: buffer block of rec */
4356 const rec_t* rec, /*!< in: updated record */
4357 mtr_t* mtr) /*!< in: mtr */
4358{
4359 page_t* page;
4360 buf_block_t* prev_block;
4361
4362 page = buf_block_get_frame(block);
4363
4364 if (page_rec_get_next(page_get_infimum_rec(page)) != rec) {
4365 /* Updated record is not the first user record on its page */
4366
4367 return;
4368 }
4369
4370 const ulint prev_page_no = btr_page_get_prev(page, mtr);
4371
4372 const page_id_t page_id(block->page.id.space(), prev_page_no);
4373
4374 ut_ad(prev_page_no != FIL_NULL);
4375 prev_block = buf_page_get_with_no_latch(page_id, block->page.size, mtr);
4376#ifdef UNIV_BTR_DEBUG
4377 ut_a(btr_page_get_next(prev_block->frame, mtr)
4378 == page_get_page_no(page));
4379#endif /* UNIV_BTR_DEBUG */
4380
4381 /* We must already have an x-latch on prev_block! */
4382 ut_ad(mtr_memo_contains(mtr, prev_block, MTR_MEMO_PAGE_X_FIX));
4383
4384 lock_rec_reset_and_inherit_gap_locks(prev_block, block,
4385 PAGE_HEAP_NO_SUPREMUM,
4386 page_rec_get_heap_no(rec));
4387}
4388
4389/*************************************************************//**
4390Performs an update of a record on a page of a tree. It is assumed
4391that mtr holds an x-latch on the tree and on the cursor page. If the
4392update is made on the leaf level, to avoid deadlocks, mtr must also
4393own x-latches to brothers of page, if those brothers exist. We assume
4394here that the ordering fields of the record do not change.
4395@return DB_SUCCESS or error code */
4396dberr_t
4397btr_cur_pessimistic_update(
4398/*=======================*/
4399 ulint flags, /*!< in: undo logging, locking, and rollback
4400 flags */
4401 btr_cur_t* cursor, /*!< in/out: cursor on the record to update;
4402 cursor may become invalid if *big_rec == NULL
4403 || !(flags & BTR_KEEP_POS_FLAG) */
4404 ulint** offsets,/*!< out: offsets on cursor->page_cur.rec */
4405 mem_heap_t** offsets_heap,
4406 /*!< in/out: pointer to memory heap
4407 that can be emptied */
4408 mem_heap_t* entry_heap,
4409 /*!< in/out: memory heap for allocating
4410 big_rec and the index tuple */
4411 big_rec_t** big_rec,/*!< out: big rec vector whose fields have to
4412 be stored externally by the caller */
4413 upd_t* update, /*!< in/out: update vector; this is allowed to
4414 also contain trx id and roll ptr fields.
4415 Non-updated columns that are moved offpage will
4416 be appended to this. */
4417 ulint cmpl_info,/*!< in: compiler info on secondary index
4418 updates */
4419 que_thr_t* thr, /*!< in: query thread */
4420 trx_id_t trx_id, /*!< in: transaction id */
4421 mtr_t* mtr) /*!< in/out: mini-transaction; must be
4422 committed before latching any further pages */
4423{
4424 big_rec_t* big_rec_vec = NULL;
4425 big_rec_t* dummy_big_rec;
4426 dict_index_t* index;
4427 buf_block_t* block;
4428 page_t* page;
4429 page_zip_des_t* page_zip;
4430 rec_t* rec;
4431 page_cur_t* page_cursor;
4432 dberr_t err;
4433 dberr_t optim_err;
4434 roll_ptr_t roll_ptr;
4435 ibool was_first;
4436 ulint n_reserved = 0;
4437 ulint n_ext;
4438 ulint max_ins_size = 0;
4439
4440 *offsets = NULL;
4441 *big_rec = NULL;
4442
4443 block = btr_cur_get_block(cursor);
4444 page = buf_block_get_frame(block);
4445 page_zip = buf_block_get_page_zip(block);
4446 index = cursor->index;
4447
4448 ut_ad(mtr_memo_contains_flagged(mtr, dict_index_get_lock(index),
4449 MTR_MEMO_X_LOCK |
4450 MTR_MEMO_SX_LOCK));
4451 ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
4452#ifdef UNIV_ZIP_DEBUG
4453 ut_a(!page_zip || page_zip_validate(page_zip, page, index));
4454#endif /* UNIV_ZIP_DEBUG */
4455 ut_ad(!page_zip || !index->table->is_temporary());
4456 /* The insert buffer tree should never be updated in place. */
4457 ut_ad(!dict_index_is_ibuf(index));
4458 ut_ad(trx_id > 0
4459 || (flags & BTR_KEEP_SYS_FLAG));
4460 ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
4461 || dict_index_is_clust(index));
4462 ut_ad(thr_get_trx(thr)->id == trx_id
4463 || (flags & ulint(~BTR_KEEP_POS_FLAG))
4464 == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
4465 | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
4466
4467 err = optim_err = btr_cur_optimistic_update(
4468 flags | BTR_KEEP_IBUF_BITMAP,
4469 cursor, offsets, offsets_heap, update,
4470 cmpl_info, thr, trx_id, mtr);
4471
4472 switch (err) {
4473 case DB_ZIP_OVERFLOW:
4474 case DB_UNDERFLOW:
4475 case DB_OVERFLOW:
4476 break;
4477 default:
4478 err_exit:
4479 /* We suppressed this with BTR_KEEP_IBUF_BITMAP.
4480 For DB_ZIP_OVERFLOW, the IBUF_BITMAP_FREE bits were
4481 already reset by btr_cur_update_alloc_zip() if the
4482 page was recompressed. */
4483 if (page_zip
4484 && optim_err != DB_ZIP_OVERFLOW
4485 && !dict_index_is_clust(index)
4486 && page_is_leaf(page)) {
4487 ut_ad(!index->table->is_temporary());
4488 ibuf_update_free_bits_zip(block, mtr);
4489 }
4490
4491 if (big_rec_vec != NULL) {
4492 dtuple_big_rec_free(big_rec_vec);
4493 }
4494
4495 return(err);
4496 }
4497
4498 rec = btr_cur_get_rec(cursor);
4499
4500 *offsets = rec_get_offsets(
4501 rec, index, *offsets, page_is_leaf(page),
4502 ULINT_UNDEFINED, offsets_heap);
4503
4504 dtuple_t* new_entry = row_rec_to_index_entry(
4505 rec, index, *offsets, &n_ext, entry_heap);
4506
4507 /* The page containing the clustered index record
4508 corresponding to new_entry is latched in mtr. If the
4509 clustered index record is delete-marked, then its externally
4510 stored fields cannot have been purged yet, because then the
4511 purge would also have removed the clustered index record
4512 itself. Thus the following call is safe. */
4513 row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,
4514 entry_heap);
4515 btr_cur_trim(new_entry, index, update, thr);
4516
4517 const bool is_default_row = new_entry->info_bits
4518 & REC_INFO_MIN_REC_FLAG;
4519
4520 /* We have to set appropriate extern storage bits in the new
4521 record to be inserted: we have to remember which fields were such */
4522
4523 ut_ad(!page_is_comp(page) || !rec_get_node_ptr_flag(rec));
4524 ut_ad(rec_offs_validate(rec, index, *offsets));
4525 n_ext += btr_push_update_extern_fields(new_entry, update, entry_heap);
4526
4527 if ((flags & BTR_NO_UNDO_LOG_FLAG)
4528 && rec_offs_any_extern(*offsets)) {
4529 /* We are in a transaction rollback undoing a row
4530 update: we must free possible externally stored fields
4531 which got new values in the update, if they are not
4532 inherited values. They can be inherited if we have
4533 updated the primary key to another value, and then
4534 update it back again. */
4535
4536 ut_ad(big_rec_vec == NULL);
4537 ut_ad(dict_index_is_clust(index));
4538 ut_ad(thr_get_trx(thr)->in_rollback);
4539
4540 DBUG_EXECUTE_IF("ib_blob_update_rollback", DBUG_SUICIDE(););
4541
4542 btr_rec_free_updated_extern_fields(
4543 index, rec, page_zip, *offsets, update, true, mtr);
4544 }
4545
4546 if (page_zip_rec_needs_ext(
4547 rec_get_converted_size(index, new_entry, n_ext),
4548 page_is_comp(page),
4549 dict_index_get_n_fields(index),
4550 block->page.size)) {
4551
4552 big_rec_vec = dtuple_convert_big_rec(index, update, new_entry, &n_ext);
4553 if (UNIV_UNLIKELY(big_rec_vec == NULL)) {
4554
			/* We cannot goto return_after_reservations,
			because we may need to update the
			IBUF_BITMAP_FREE bits, whose update was
			suppressed by BTR_KEEP_IBUF_BITMAP. */
4559#ifdef UNIV_ZIP_DEBUG
4560 ut_a(!page_zip
4561 || page_zip_validate(page_zip, page, index));
4562#endif /* UNIV_ZIP_DEBUG */
4563 index->table->space->release_free_extents(n_reserved);
4564 err = DB_TOO_BIG_RECORD;
4565 goto err_exit;
4566 }
4567
4568 ut_ad(page_is_leaf(page));
4569 ut_ad(dict_index_is_clust(index));
4570 ut_ad(flags & BTR_KEEP_POS_FLAG);
4571 }
4572
4573 /* Do lock checking and undo logging */
4574 err = btr_cur_upd_lock_and_undo(flags, cursor, *offsets,
4575 update, cmpl_info,
4576 thr, mtr, &roll_ptr);
4577 if (err != DB_SUCCESS) {
4578 goto err_exit;
4579 }
4580
4581 if (optim_err == DB_OVERFLOW) {
4582
4583 /* First reserve enough free space for the file segments
4584 of the index tree, so that the update will not fail because
4585 of lack of space */
4586
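		/* Reserving tree_height / 16 + 3 extents should leave
		enough room for the page splits and node pointer updates
		that the pessimistic insert below may cause. */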
4587 ulint n_extents = cursor->tree_height / 16 + 3;
4588
4589 if (!fsp_reserve_free_extents(
4590 &n_reserved, index->table->space, n_extents,
4591 flags & BTR_NO_UNDO_LOG_FLAG
4592 ? FSP_CLEANING : FSP_NORMAL,
4593 mtr)) {
4594 err = DB_OUT_OF_FILE_SPACE;
4595 goto err_exit;
4596 }
4597 }
4598
4599 if (!(flags & BTR_KEEP_SYS_FLAG)) {
4600 row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR,
4601 roll_ptr);
4602 row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID,
4603 trx_id);
4604 }
4605
4606 if (!page_zip) {
4607 max_ins_size = page_get_max_insert_size_after_reorganize(
4608 page, 1);
4609 }
4610
4611 if (UNIV_UNLIKELY(is_default_row)) {
4612 ut_ad(new_entry->info_bits == REC_INFO_DEFAULT_ROW);
4613 ut_ad(index->is_instant());
4614 /* This can be innobase_add_instant_try() performing a
4615 subsequent instant ADD COLUMN, or its rollback by
4616 row_undo_mod_clust_low(). */
4617 ut_ad(flags & BTR_NO_LOCKING_FLAG);
4618 } else {
4619 btr_search_update_hash_on_delete(cursor);
4620
4621 /* Store state of explicit locks on rec on the page
4622 infimum record, before deleting rec. The page infimum
4623 acts as a dummy carrier of the locks, taking care also
4624 of lock releases, before we can move the locks back on
4625 the actual record. There is a special case: if we are
4626 inserting on the root page and the insert causes a
4627 call of btr_root_raise_and_insert. Therefore we cannot
4628 in the lock system delete the lock structs set on the
4629 root page even if the root page carries just node
4630 pointers. */
4631 if (!dict_table_is_locking_disabled(index->table)) {
4632 lock_rec_store_on_page_infimum(block, rec);
4633 }
4634 }
4635
4636#ifdef UNIV_ZIP_DEBUG
4637 ut_a(!page_zip || page_zip_validate(page_zip, page, index));
4638#endif /* UNIV_ZIP_DEBUG */
4639 page_cursor = btr_cur_get_page_cur(cursor);
4640
4641 page_cur_delete_rec(page_cursor, index, *offsets, mtr);
4642
4643 page_cur_move_to_prev(page_cursor);
4644
4645 rec = btr_cur_insert_if_possible(cursor, new_entry,
4646 offsets, offsets_heap, n_ext, mtr);
4647
4648 if (rec) {
4649 page_cursor->rec = rec;
4650
4651 if (UNIV_UNLIKELY(is_default_row)) {
4652 /* We must empty the PAGE_FREE list, because if this
4653 was a rollback, the shortened 'default row' record
4654 would have too many fields, and we would be unable to
4655 know the size of the freed record. */
4656 btr_page_reorganize(page_cursor, index, mtr);
4657 rec = page_cursor->rec;
4658 } else if (!dict_table_is_locking_disabled(index->table)) {
4659 lock_rec_restore_from_page_infimum(
4660 btr_cur_get_block(cursor), rec, block);
4661 }
4662
4663 if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) {
4664 /* The new inserted record owns its possible externally
4665 stored fields */
4666 btr_cur_unmark_extern_fields(
4667 page_zip, rec, index, *offsets, mtr);
4668 } else {
4669 /* In delete-marked records, DB_TRX_ID must
4670 always refer to an existing undo log record. */
4671 ut_ad(row_get_rec_trx_id(rec, index, *offsets));
4672 }
4673
4674 bool adjust = big_rec_vec && (flags & BTR_KEEP_POS_FLAG);
4675 ut_ad(!adjust || page_is_leaf(page));
4676
4677 if (btr_cur_compress_if_useful(cursor, adjust, mtr)) {
4678 if (adjust) {
4679 rec_offs_make_valid(page_cursor->rec, index,
4680 true, *offsets);
4681 }
4682 } else if (!dict_index_is_clust(index)
4683 && page_is_leaf(page)) {
4684 /* Update the free bits in the insert buffer.
4685 This is the same block which was skipped by
4686 BTR_KEEP_IBUF_BITMAP. */
4687 if (page_zip) {
4688 ut_ad(!index->table->is_temporary());
4689 ibuf_update_free_bits_zip(block, mtr);
4690 } else if (!index->table->is_temporary()) {
4691 ibuf_update_free_bits_low(block, max_ins_size,
4692 mtr);
4693 }
4694 }
4695
4696 if (!srv_read_only_mode
4697 && !big_rec_vec
4698 && page_is_leaf(page)
4699 && !dict_index_is_online_ddl(index)) {
4700
4701 mtr_memo_release(mtr, dict_index_get_lock(index),
4702 MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK);
4703
			/* NOTE: We cannot release the root block latch here,
			because the root page contains the segment headers
			and has usually already been modified in this
			mini-transaction. */
4706 }
4707
4708 err = DB_SUCCESS;
4709 goto return_after_reservations;
4710 } else {
4711 /* If the page is compressed and it initially
4712 compresses very well, and there is a subsequent insert
4713 of a badly-compressing record, it is possible for
4714 btr_cur_optimistic_update() to return DB_UNDERFLOW and
4715 btr_cur_insert_if_possible() to return FALSE. */
4716 ut_a(page_zip || optim_err != DB_UNDERFLOW);
4717
4718 /* Out of space: reset the free bits.
4719 This is the same block which was skipped by
4720 BTR_KEEP_IBUF_BITMAP. */
4721 if (!dict_index_is_clust(index)
4722 && !index->table->is_temporary()
4723 && page_is_leaf(page)) {
4724 ibuf_reset_free_bits(block);
4725 }
4726 }
4727
4728 if (big_rec_vec != NULL) {
4729 ut_ad(page_is_leaf(page));
4730 ut_ad(dict_index_is_clust(index));
4731 ut_ad(flags & BTR_KEEP_POS_FLAG);
4732
4733 /* btr_page_split_and_insert() in
4734 btr_cur_pessimistic_insert() invokes
4735 mtr_memo_release(mtr, index->lock, MTR_MEMO_SX_LOCK).
4736 We must keep the index->lock when we created a
4737 big_rec, so that row_upd_clust_rec() can store the
4738 big_rec in the same mini-transaction. */
4739
4740 ut_ad(mtr_memo_contains_flagged(mtr,
4741 dict_index_get_lock(index),
4742 MTR_MEMO_X_LOCK |
4743 MTR_MEMO_SX_LOCK));
4744
4745 mtr_sx_lock(dict_index_get_lock(index), mtr);
4746 }
4747
4748 /* Was the record to be updated positioned as the first user
4749 record on its page? */
4750 was_first = page_cur_is_before_first(page_cursor);
4751
4752 /* Lock checks and undo logging were already performed by
4753 btr_cur_upd_lock_and_undo(). We do not try
4754 btr_cur_optimistic_insert() because
4755 btr_cur_insert_if_possible() already failed above. */
4756
4757 err = btr_cur_pessimistic_insert(BTR_NO_UNDO_LOG_FLAG
4758 | BTR_NO_LOCKING_FLAG
4759 | BTR_KEEP_SYS_FLAG,
4760 cursor, offsets, offsets_heap,
4761 new_entry, &rec,
4762 &dummy_big_rec, n_ext, NULL, mtr);
4763 ut_a(rec);
4764 ut_a(err == DB_SUCCESS);
4765 ut_a(dummy_big_rec == NULL);
4766 ut_ad(rec_offs_validate(rec, cursor->index, *offsets));
4767 page_cursor->rec = rec;
4768
	/* Multiple transactions cannot operate on the same temporary
	table in parallel.
	max_trx_id is ignored for temporary tables because it is not
	required for MVCC. */
4773 if (dict_index_is_sec_or_ibuf(index)
4774 && !index->table->is_temporary()) {
4775 /* Update PAGE_MAX_TRX_ID in the index page header.
4776 It was not updated by btr_cur_pessimistic_insert()
4777 because of BTR_NO_LOCKING_FLAG. */
4778 buf_block_t* rec_block;
4779
4780 rec_block = btr_cur_get_block(cursor);
4781
4782 page_update_max_trx_id(rec_block,
4783 buf_block_get_page_zip(rec_block),
4784 trx_id, mtr);
4785 }
4786
4787 if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) {
4788 /* The new inserted record owns its possible externally
4789 stored fields */
4790 buf_block_t* rec_block = btr_cur_get_block(cursor);
4791
4792#ifdef UNIV_ZIP_DEBUG
4793 ut_a(!page_zip || page_zip_validate(page_zip, page, index));
4794 page = buf_block_get_frame(rec_block);
4795#endif /* UNIV_ZIP_DEBUG */
4796 page_zip = buf_block_get_page_zip(rec_block);
4797
4798 btr_cur_unmark_extern_fields(page_zip,
4799 rec, index, *offsets, mtr);
4800 } else {
4801 /* In delete-marked records, DB_TRX_ID must
4802 always refer to an existing undo log record. */
4803 ut_ad(row_get_rec_trx_id(rec, index, *offsets));
4804 }
4805
4806 if (UNIV_UNLIKELY(is_default_row)) {
4807 /* We must empty the PAGE_FREE list, because if this
4808 was a rollback, the shortened 'default row' record
4809 would have too many fields, and we would be unable to
4810 know the size of the freed record. */
4811 btr_page_reorganize(page_cursor, index, mtr);
4812 rec = page_cursor->rec;
4813 } else if (!dict_table_is_locking_disabled(index->table)) {
4814 lock_rec_restore_from_page_infimum(
4815 btr_cur_get_block(cursor), rec, block);
4816 }
4817
4818 /* If necessary, restore also the correct lock state for a new,
4819 preceding supremum record created in a page split. While the old
4820 record was nonexistent, the supremum might have inherited its locks
4821 from a wrong record. */
4822
4823 if (!was_first && !dict_table_is_locking_disabled(index->table)) {
4824 btr_cur_pess_upd_restore_supremum(btr_cur_get_block(cursor),
4825 rec, mtr);
4826 }
4827
4828return_after_reservations:
4829#ifdef UNIV_ZIP_DEBUG
4830 ut_a(!page_zip || page_zip_validate(page_zip, page, index));
4831#endif /* UNIV_ZIP_DEBUG */
4832
4833 index->table->space->release_free_extents(n_reserved);
4834 *big_rec = big_rec_vec;
4835 return(err);
4836}
4837
4838/*==================== B-TREE DELETE MARK AND UNMARK ===============*/
4839
4840/****************************************************************//**
4841Writes the redo log record for delete marking or unmarking of an index
4842record. */
4843UNIV_INLINE
4844void
4845btr_cur_del_mark_set_clust_rec_log(
4846/*===============================*/
4847 rec_t* rec, /*!< in: record */
4848 dict_index_t* index, /*!< in: index of the record */
4849 trx_id_t trx_id, /*!< in: transaction id */
4850 roll_ptr_t roll_ptr,/*!< in: roll ptr to the undo log record */
4851 mtr_t* mtr) /*!< in: mtr */
4852{
4853 byte* log_ptr;
4854
4855 ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
4856 ut_ad(mtr->is_named_space(index->table->space));
4857
4858 log_ptr = mlog_open_and_write_index(mtr, rec, index,
4859 page_rec_is_comp(rec)
4860 ? MLOG_COMP_REC_CLUST_DELETE_MARK
4861 : MLOG_REC_CLUST_DELETE_MARK,
4862 1 + 1 + DATA_ROLL_PTR_LEN
4863 + 14 + 2);
4864
4865 if (!log_ptr) {
4866 /* Logging in mtr is switched off during crash recovery */
4867 return;
4868 }
4869
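	/* Write the flags byte (always 0 here) and the delete-mark
	value (always 1: this record is only written when setting the
	mark). btr_cur_parse_del_mark_set_clust_rec() reads these two
	bytes back before the system column values. */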
4870 *log_ptr++ = 0;
4871 *log_ptr++ = 1;
4872
4873 log_ptr = row_upd_write_sys_vals_to_log(
4874 index, trx_id, roll_ptr, log_ptr, mtr);
4875 mach_write_to_2(log_ptr, page_offset(rec));
4876 log_ptr += 2;
4877
4878 mlog_close(mtr, log_ptr);
4879}
4880
4881/****************************************************************//**
4882Parses the redo log record for delete marking or unmarking of a clustered
4883index record.
4884@return end of log record or NULL */
4885byte*
4886btr_cur_parse_del_mark_set_clust_rec(
4887/*=================================*/
4888 byte* ptr, /*!< in: buffer */
4889 byte* end_ptr,/*!< in: buffer end */
4890 page_t* page, /*!< in/out: page or NULL */
4891 page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
4892 dict_index_t* index) /*!< in: index corresponding to page */
4893{
4894 ulint flags;
4895 ulint val;
4896 ulint pos;
4897 trx_id_t trx_id;
4898 roll_ptr_t roll_ptr;
4899 ulint offset;
4900 rec_t* rec;
4901
4902 ut_ad(!page
4903 || !!page_is_comp(page) == dict_table_is_comp(index->table));
4904
4905 if (end_ptr < ptr + 2) {
4906
4907 return(NULL);
4908 }
4909
4910 flags = mach_read_from_1(ptr);
4911 ptr++;
4912 val = mach_read_from_1(ptr);
4913 ptr++;
4914
4915 ptr = row_upd_parse_sys_vals(ptr, end_ptr, &pos, &trx_id, &roll_ptr);
4916
4917 if (ptr == NULL) {
4918
4919 return(NULL);
4920 }
4921
4922 if (end_ptr < ptr + 2) {
4923
4924 return(NULL);
4925 }
4926
4927 offset = mach_read_from_2(ptr);
4928 ptr += 2;
4929
4930 ut_a(offset <= srv_page_size);
4931
4932 /* In delete-marked records, DB_TRX_ID must
4933 always refer to an existing undo log record. */
4934 ut_ad(trx_id || (flags & BTR_KEEP_SYS_FLAG));
4935
4936 if (page) {
4937 rec = page + offset;
4938
4939 /* We do not need to reserve search latch, as the page
4940 is only being recovered, and there cannot be a hash index to
4941 it. Besides, these fields are being updated in place
4942 and the adaptive hash index does not depend on them. */
4943
4944 btr_rec_set_deleted_flag(rec, page_zip, val);
4945 /* pos is the offset of DB_TRX_ID in the clustered index.
4946 Debug assertions may also access DB_ROLL_PTR at pos+1.
4947 Therefore, we must compute offsets for the first pos+2
4948 clustered index fields. */
4949 ut_ad(pos <= MAX_REF_PARTS);
4950
4951 ulint offsets[REC_OFFS_HEADER_SIZE + MAX_REF_PARTS + 2];
4952 rec_offs_init(offsets);
4953 mem_heap_t* heap = NULL;
4954
4955 if (!(flags & BTR_KEEP_SYS_FLAG)) {
4956 row_upd_rec_sys_fields_in_recovery(
4957 rec, page_zip,
4958 rec_get_offsets(rec, index, offsets, true,
4959 pos + 2, &heap),
4960 pos, trx_id, roll_ptr);
4961 } else {
4962 /* In delete-marked records, DB_TRX_ID must
4963 always refer to an existing undo log record. */
4964 ut_ad(memcmp(rec_get_nth_field(
4965 rec,
4966 rec_get_offsets(rec, index,
4967 offsets, true,
4968 pos, &heap),
4969 pos, &offset),
4970 field_ref_zero, DATA_TRX_ID_LEN));
4971 ut_ad(offset == DATA_TRX_ID_LEN);
4972 }
4973
4974 if (UNIV_LIKELY_NULL(heap)) {
4975 mem_heap_free(heap);
4976 }
4977 }
4978
4979 return(ptr);
4980}
4981
4982/***********************************************************//**
4983Marks a clustered index record deleted. Writes an undo log record to
4984undo log on this delete marking. Writes in the trx id field the id
4985of the deleting transaction, and in the roll ptr field pointer to the
4986undo log record created.
4987@return DB_SUCCESS, DB_LOCK_WAIT, or error number */
4988dberr_t
4989btr_cur_del_mark_set_clust_rec(
4990/*===========================*/
4991 buf_block_t* block, /*!< in/out: buffer block of the record */
4992 rec_t* rec, /*!< in/out: record */
4993 dict_index_t* index, /*!< in: clustered index of the record */
4994 const ulint* offsets,/*!< in: rec_get_offsets(rec) */
4995 que_thr_t* thr, /*!< in: query thread */
4996 const dtuple_t* entry, /*!< in: dtuple for the deleting record, also
4997 contains the virtual cols if there are any */
4998 mtr_t* mtr) /*!< in/out: mini-transaction */
4999{
5000 roll_ptr_t roll_ptr;
5001 dberr_t err;
5002 page_zip_des_t* page_zip;
5003 trx_t* trx;
5004
5005 ut_ad(dict_index_is_clust(index));
5006 ut_ad(rec_offs_validate(rec, index, offsets));
5007 ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
5008 ut_ad(buf_block_get_frame(block) == page_align(rec));
5009 ut_ad(page_rec_is_leaf(rec));
5010 ut_ad(mtr->is_named_space(index->table->space));
5011
5012 if (rec_get_deleted_flag(rec, rec_offs_comp(offsets))) {
5013 /* We may already have delete-marked this record
5014 when executing an ON DELETE CASCADE operation. */
5015 ut_ad(row_get_rec_trx_id(rec, index, offsets)
5016 == thr_get_trx(thr)->id);
5017 return(DB_SUCCESS);
5018 }
5019
5020 err = lock_clust_rec_modify_check_and_lock(BTR_NO_LOCKING_FLAG, block,
5021 rec, index, offsets, thr);
5022
5023 if (err != DB_SUCCESS) {
5024
5025 return(err);
5026 }
5027
5028 err = trx_undo_report_row_operation(thr, index,
5029 entry, NULL, 0, rec, offsets,
5030 &roll_ptr);
5031 if (err != DB_SUCCESS) {
5032
5033 return(err);
5034 }
5035
5036 /* The search latch is not needed here, because
5037 the adaptive hash index does not depend on the delete-mark
5038 and the delete-mark is being updated in place. */
5039
5040 page_zip = buf_block_get_page_zip(block);
5041
5042 btr_rec_set_deleted_flag(rec, page_zip, TRUE);
5043
5044 trx = thr_get_trx(thr);
5045
5046 DBUG_LOG("ib_cur",
5047 "delete-mark clust " << index->table->name
5048 << " (" << index->id << ") by "
5049 << ib::hex(trx_get_id_for_print(trx)) << ": "
5050 << rec_printer(rec, offsets).str());
5051
5052 if (dict_index_is_online_ddl(index)) {
5053 row_log_table_delete(rec, index, offsets, NULL);
5054 }
5055
5056 row_upd_rec_sys_fields(rec, page_zip, index, offsets, trx, roll_ptr);
5057
5058 btr_cur_del_mark_set_clust_rec_log(rec, index, trx->id,
5059 roll_ptr, mtr);
5060
5061 return(err);
5062}
5063
5064/****************************************************************//**
5065Writes the redo log record for a delete mark setting of a secondary
5066index record. */
5067UNIV_INLINE
5068void
5069btr_cur_del_mark_set_sec_rec_log(
5070/*=============================*/
5071 rec_t* rec, /*!< in: record */
5072 ibool val, /*!< in: value to set */
5073 mtr_t* mtr) /*!< in: mtr */
5074{
5075 byte* log_ptr;
5076 ut_ad(val <= 1);
5077
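	/* Reserve space for the initial log record header (at most
	11 bytes), the 1-byte delete-mark value and the 2-byte page
	offset of the record. */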
5078 log_ptr = mlog_open(mtr, 11 + 1 + 2);
5079
5080 if (!log_ptr) {
5081 /* Logging in mtr is switched off during crash recovery:
5082 in that case mlog_open returns NULL */
5083 return;
5084 }
5085
5086 log_ptr = mlog_write_initial_log_record_fast(
5087 rec, MLOG_REC_SEC_DELETE_MARK, log_ptr, mtr);
5088 mach_write_to_1(log_ptr, val);
5089 log_ptr++;
5090
5091 mach_write_to_2(log_ptr, page_offset(rec));
5092 log_ptr += 2;
5093
5094 mlog_close(mtr, log_ptr);
5095}
5096
5097/****************************************************************//**
5098Parses the redo log record for delete marking or unmarking of a secondary
5099index record.
5100@return end of log record or NULL */
5101byte*
5102btr_cur_parse_del_mark_set_sec_rec(
5103/*===============================*/
5104 byte* ptr, /*!< in: buffer */
5105 byte* end_ptr,/*!< in: buffer end */
5106 page_t* page, /*!< in/out: page or NULL */
5107 page_zip_des_t* page_zip)/*!< in/out: compressed page, or NULL */
5108{
5109 ulint val;
5110 ulint offset;
5111 rec_t* rec;
5112
5113 if (end_ptr < ptr + 3) {
5114
5115 return(NULL);
5116 }
5117
5118 val = mach_read_from_1(ptr);
5119 ptr++;
5120
5121 offset = mach_read_from_2(ptr);
5122 ptr += 2;
5123
5124 ut_a(offset <= srv_page_size);
5125
5126 if (page) {
5127 rec = page + offset;
5128
5129 /* We do not need to reserve search latch, as the page
5130 is only being recovered, and there cannot be a hash index to
5131 it. Besides, the delete-mark flag is being updated in place
5132 and the adaptive hash index does not depend on it. */
5133
5134 btr_rec_set_deleted_flag(rec, page_zip, val);
5135 }
5136
5137 return(ptr);
5138}
5139
5140/***********************************************************//**
5141Sets a secondary index record delete mark to TRUE or FALSE.
5142@return DB_SUCCESS, DB_LOCK_WAIT, or error number */
5143dberr_t
5144btr_cur_del_mark_set_sec_rec(
5145/*=========================*/
5146 ulint flags, /*!< in: locking flag */
5147 btr_cur_t* cursor, /*!< in: cursor */
5148 ibool val, /*!< in: value to set */
5149 que_thr_t* thr, /*!< in: query thread */
5150 mtr_t* mtr) /*!< in/out: mini-transaction */
5151{
5152 buf_block_t* block;
5153 rec_t* rec;
5154 dberr_t err;
5155
5156 block = btr_cur_get_block(cursor);
5157 rec = btr_cur_get_rec(cursor);
5158
5159 err = lock_sec_rec_modify_check_and_lock(flags,
5160 btr_cur_get_block(cursor),
5161 rec, cursor->index, thr, mtr);
5162 if (err != DB_SUCCESS) {
5163
5164 return(err);
5165 }
5166
5167 ut_ad(!!page_rec_is_comp(rec)
5168 == dict_table_is_comp(cursor->index->table));
5169
5170 DBUG_PRINT("ib_cur", ("delete-mark=%u sec %u:%u:%u in %s("
5171 IB_ID_FMT ") by " TRX_ID_FMT,
5172 unsigned(val),
5173 block->page.id.space(), block->page.id.page_no(),
5174 unsigned(page_rec_get_heap_no(rec)),
5175 cursor->index->name(), cursor->index->id,
5176 trx_get_id_for_print(thr_get_trx(thr))));
5177
5178 /* We do not need to reserve search latch, as the
5179 delete-mark flag is being updated in place and the adaptive
5180 hash index does not depend on it. */
5181 btr_rec_set_deleted_flag(rec, buf_block_get_page_zip(block), val);
5182
5183 btr_cur_del_mark_set_sec_rec_log(rec, val, mtr);
5184
5185 return(DB_SUCCESS);
5186}
5187
5188/***********************************************************//**
5189Sets a secondary index record's delete mark to the given value. This
5190function is only used by the insert buffer merge mechanism. */
5191void
5192btr_cur_set_deleted_flag_for_ibuf(
5193/*==============================*/
5194 rec_t* rec, /*!< in/out: record */
5195 page_zip_des_t* page_zip, /*!< in/out: compressed page
5196 corresponding to rec, or NULL
5197 when the tablespace is
5198 uncompressed */
5199 ibool val, /*!< in: value to set */
5200 mtr_t* mtr) /*!< in/out: mini-transaction */
5201{
5202 /* We do not need to reserve search latch, as the page
5203 has just been read to the buffer pool and there cannot be
5204 a hash index to it. Besides, the delete-mark flag is being
5205 updated in place and the adaptive hash index does not depend
5206 on it. */
5207
5208 btr_rec_set_deleted_flag(rec, page_zip, val);
5209
5210 btr_cur_del_mark_set_sec_rec_log(rec, val, mtr);
5211}
5212
5213/*==================== B-TREE RECORD REMOVE =========================*/
5214
5215/*************************************************************//**
5216Tries to compress a page of the tree if it seems useful. It is assumed
5217that mtr holds an x-latch on the tree and on the cursor page. To avoid
5218deadlocks, mtr must also own x-latches to brothers of page, if those
5219brothers exist. NOTE: it is assumed that the caller has reserved enough
5220free extents so that the compression will always succeed if done!
5221@return TRUE if compression occurred */
5222ibool
5223btr_cur_compress_if_useful(
5224/*=======================*/
5225 btr_cur_t* cursor, /*!< in/out: cursor on the page to compress;
5226 cursor does not stay valid if !adjust and
5227 compression occurs */
5228 ibool adjust, /*!< in: TRUE if should adjust the
5229 cursor position even if compression occurs */
5230 mtr_t* mtr) /*!< in/out: mini-transaction */
5231{
5232 ut_ad(mtr_memo_contains_flagged(
5233 mtr, dict_index_get_lock(btr_cur_get_index(cursor)),
5234 MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK));
5235 ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
5236 MTR_MEMO_PAGE_X_FIX));
5237
5238 if (dict_index_is_spatial(cursor->index)) {
5239 const page_t* page = btr_cur_get_page(cursor);
5240 const trx_t* trx = NULL;
5241
5242 if (cursor->rtr_info->thr != NULL) {
5243 trx = thr_get_trx(cursor->rtr_info->thr);
5244 }
5245
5246 /* Check whether page lock prevents the compression */
5247 if (!lock_test_prdt_page_lock(trx, page_get_space_id(page),
5248 page_get_page_no(page))) {
5249 return(false);
5250 }
5251 }
5252
5253 return(btr_cur_compress_recommendation(cursor, mtr)
5254 && btr_compress(cursor, adjust, mtr));
5255}
5256
5257/*******************************************************//**
5258Removes the record on which the tree cursor is positioned on a leaf page.
5259It is assumed that the mtr has an x-latch on the page where the cursor is
5260positioned, but no latch on the whole tree.
5261@return TRUE if success, i.e., the page did not become too empty */
5262ibool
5263btr_cur_optimistic_delete_func(
5264/*===========================*/
5265 btr_cur_t* cursor, /*!< in: cursor on leaf page, on the record to
5266 delete; cursor stays valid: if deletion
5267 succeeds, on function exit it points to the
5268 successor of the deleted record */
5269#ifdef UNIV_DEBUG
5270 ulint flags, /*!< in: BTR_CREATE_FLAG or 0 */
5271#endif /* UNIV_DEBUG */
5272 mtr_t* mtr) /*!< in: mtr; if this function returns
5273 TRUE on a leaf page of a secondary
5274 index, the mtr must be committed
5275 before latching any further pages */
5276{
5277 buf_block_t* block;
5278 rec_t* rec;
5279 mem_heap_t* heap = NULL;
5280 ulint offsets_[REC_OFFS_NORMAL_SIZE];
5281 ulint* offsets = offsets_;
5282 ibool no_compress_needed;
5283 rec_offs_init(offsets_);
5284
5285 ut_ad(flags == 0 || flags == BTR_CREATE_FLAG);
	ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
				MTR_MEMO_PAGE_X_FIX));
5290 ut_ad(mtr->is_named_space(cursor->index->table->space));
5291
5292 /* This is intended only for leaf page deletions */
5293
5294 block = btr_cur_get_block(cursor);
5295
5296 ut_ad(page_is_leaf(buf_block_get_frame(block)));
5297 ut_ad(!dict_index_is_online_ddl(cursor->index)
5298 || dict_index_is_clust(cursor->index)
5299 || (flags & BTR_CREATE_FLAG));
5300
5301 rec = btr_cur_get_rec(cursor);
5302
5303 if (UNIV_UNLIKELY(page_is_root(block->frame)
5304 && page_get_n_recs(block->frame) == 1
5305 + (cursor->index->is_instant()
5306 && !rec_is_default_row(rec, cursor->index)))) {
5307 /* The whole index (and table) becomes logically empty.
5308 Empty the whole page. That is, if we are deleting the
5309 only user record, also delete the 'default row' record
5310 if one exists (it exists if and only if is_instant()).
5311 If we are deleting the 'default row' record and the
5312 table becomes empty, clean up the whole page. */
5313 dict_index_t* index = cursor->index;
5314 ut_ad(!index->is_instant()
5315 || rec_is_default_row(
5316 page_rec_get_next_const(
5317 page_get_infimum_rec(block->frame)),
5318 index));
5319 if (UNIV_UNLIKELY(rec_get_info_bits(rec, page_rec_is_comp(rec))
5320 & REC_INFO_MIN_REC_FLAG)) {
5321 /* This should be rolling back instant ADD COLUMN.
5322 If this is a recovered transaction, then
5323 index->is_instant() will hold until the
5324 insert into SYS_COLUMNS is rolled back. */
5325 ut_ad(index->table->supports_instant());
5326 ut_ad(index->is_primary());
5327 } else {
5328 lock_update_delete(block, rec);
5329 }
5330 btr_page_empty(block, buf_block_get_page_zip(block),
5331 index, 0, mtr);
5332 page_cur_set_after_last(block, btr_cur_get_page_cur(cursor));
5333
5334 if (index->is_primary()) {
5335 /* Concurrent access is prevented by
5336 root_block->lock X-latch, so this should be
5337 safe. */
5338 index->remove_instant();
5339 }
5340
5341 return true;
5342 }
5343
5344 offsets = rec_get_offsets(rec, cursor->index, offsets, true,
5345 ULINT_UNDEFINED, &heap);
5346
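	/* The deletion can be carried out within this mini-transaction
	only if the record has no externally stored columns and removing
	it does not shrink the page enough to make it a candidate for
	merging with a sibling. */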
5347 no_compress_needed = !rec_offs_any_extern(offsets)
5348 && btr_cur_can_delete_without_compress(
5349 cursor, rec_offs_size(offsets), mtr);
5350
5351 if (no_compress_needed) {
5352
5353 page_t* page = buf_block_get_frame(block);
5354 page_zip_des_t* page_zip= buf_block_get_page_zip(block);
5355
5356 if (UNIV_UNLIKELY(rec_get_info_bits(rec, page_rec_is_comp(rec))
5357 & REC_INFO_MIN_REC_FLAG)) {
5358 /* This should be rolling back instant ADD COLUMN.
5359 If this is a recovered transaction, then
5360 index->is_instant() will hold until the
5361 insert into SYS_COLUMNS is rolled back. */
5362 ut_ad(cursor->index->table->supports_instant());
5363 ut_ad(cursor->index->is_primary());
5364 ut_ad(!page_zip);
5365 page_cur_delete_rec(btr_cur_get_page_cur(cursor),
5366 cursor->index, offsets, mtr);
5367 /* We must empty the PAGE_FREE list, because
5368 after rollback, this deleted 'default row' record
5369 would have too many fields, and we would be
5370 unable to know the size of the freed record. */
5371 btr_page_reorganize(btr_cur_get_page_cur(cursor),
5372 cursor->index, mtr);
5373 goto func_exit;
5374 } else {
5375 lock_update_delete(block, rec);
5376
5377 btr_search_update_hash_on_delete(cursor);
5378 }
5379
5380 if (page_zip) {
5381#ifdef UNIV_ZIP_DEBUG
5382 ut_a(page_zip_validate(page_zip, page, cursor->index));
5383#endif /* UNIV_ZIP_DEBUG */
5384 page_cur_delete_rec(btr_cur_get_page_cur(cursor),
5385 cursor->index, offsets, mtr);
5386#ifdef UNIV_ZIP_DEBUG
5387 ut_a(page_zip_validate(page_zip, page, cursor->index));
5388#endif /* UNIV_ZIP_DEBUG */
5389
5390 /* On compressed pages, the IBUF_BITMAP_FREE
5391 space is not affected by deleting (purging)
5392 records, because it is defined as the minimum
5393 of space available *without* reorganize, and
5394 space available in the modification log. */
5395 } else {
5396 const ulint max_ins
5397 = page_get_max_insert_size_after_reorganize(
5398 page, 1);
5399
5400 page_cur_delete_rec(btr_cur_get_page_cur(cursor),
5401 cursor->index, offsets, mtr);
5402
5403 			/* The change buffer does not handle inserts
5404 			into non-leaf pages, into clustered indexes,
5405 			into temporary tables, or into the change buffer itself. */
5406 if (!dict_index_is_clust(cursor->index)
5407 && !cursor->index->table->is_temporary()
5408 && !dict_index_is_ibuf(cursor->index)) {
5409 ibuf_update_free_bits_low(block, max_ins, mtr);
5410 }
5411 }
5412 } else {
5413 		/* Prefetch the siblings of the leaf page for the
5414 		pessimistic operation. */
5415 btr_cur_prefetch_siblings(block);
5416 }
5417
5418func_exit:
5419 if (UNIV_LIKELY_NULL(heap)) {
5420 mem_heap_free(heap);
5421 }
5422
5423 return(no_compress_needed);
5424}
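/* A minimal sketch (not compiled) of the usual calling pattern, for example in
purge: the optimistic delete is attempted first and the pessimistic variant is
the fallback. The cursor setup, latching mode change and extent reservation
that a real caller performs are omitted here. */
#if 0
	if (!btr_cur_optimistic_delete(&cursor, 0, &mtr)) {
		/* The page would become too empty, or the record has
		externally stored columns: the caller commits the mtr,
		repositions the cursor with tree-modifying latches,
		and deletes pessimistically, which may merge or
		discard pages. */
		dberr_t	err;
		btr_cur_pessimistic_delete(&err, FALSE, &cursor, 0,
					   false, &mtr);
		ut_a(err == DB_SUCCESS);
	}
#endif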
5425
5426/*************************************************************//**
5427 Removes the record on which the tree cursor is positioned. Tries
5428 to compress the page if its fillfactor drops below a threshold
5429 or if it is the only page on the level. It is assumed that mtr holds
5430 an x-latch on the tree and on the cursor page. To avoid deadlocks,
5431 mtr must also hold x-latches on the siblings of the page, if those
5432 siblings exist.
5433 @return TRUE if compression occurred, FALSE if it did not or if
5434 something went wrong. */
5435ibool
5436btr_cur_pessimistic_delete(
5437/*=======================*/
5438 dberr_t* err, /*!< out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE;
5439 the latter may occur because we may have
5440 to update node pointers on upper levels,
5441 and in the case of variable length keys
5442 these may actually grow in size */
5443 ibool has_reserved_extents, /*!< in: TRUE if the
5444 caller has already reserved enough free
5445 					extents so that the operation is known
5446 					to succeed */
5447 btr_cur_t* cursor, /*!< in: cursor on the record to delete;
5448 if compression does not occur, the cursor
5449 				stays valid: it points to the successor of
5450 				the deleted record on function exit */
5451 ulint flags, /*!< in: BTR_CREATE_FLAG or 0 */
5452 bool rollback,/*!< in: performing rollback? */
5453 mtr_t* mtr) /*!< in: mtr */
5454{
5455 buf_block_t* block;
5456 page_t* page;
5457 page_zip_des_t* page_zip;
5458 dict_index_t* index;
5459 rec_t* rec;
5460 ulint n_reserved = 0;
5461 bool success;
5462 ibool ret = FALSE;
5463 mem_heap_t* heap;
5464 ulint* offsets;
5465#ifdef UNIV_DEBUG
5466 bool parent_latched = false;
5467#endif /* UNIV_DEBUG */
5468
5469 block = btr_cur_get_block(cursor);
5470 page = buf_block_get_frame(block);
5471 index = btr_cur_get_index(cursor);
5472
5473 ut_ad(flags == 0 || flags == BTR_CREATE_FLAG);
5474 ut_ad(!dict_index_is_online_ddl(index)
5475 || dict_index_is_clust(index)
5476 || (flags & BTR_CREATE_FLAG));
5477 ut_ad(mtr_memo_contains_flagged(mtr, dict_index_get_lock(index),
5478 MTR_MEMO_X_LOCK
5479 | MTR_MEMO_SX_LOCK));
5480 ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
5481 ut_ad(mtr->is_named_space(index->table->space));
5482
5483 if (!has_reserved_extents) {
5484 /* First reserve enough free space for the file segments
5485 of the index tree, so that the node pointer updates will
5486 not fail because of lack of space */
5487
5488 ulint n_extents = cursor->tree_height / 32 + 1;
5489
5490 success = fsp_reserve_free_extents(&n_reserved,
5491 index->table->space,
5492 n_extents,
5493 FSP_CLEANING, mtr);
5494 if (!success) {
5495 *err = DB_OUT_OF_FILE_SPACE;
5496
5497 return(FALSE);
5498 }
5499 }
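	/* Worked example of the reservation above (illustrative numbers):
	for cursor->tree_height == 3 we reserve 3 / 32 + 1 = 1 extent
	(FSP_CLEANING marks this as a space-freeing operation); the
	reservation is given back via release_free_extents(n_reserved)
	at the end of this function. */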
5500
5501 heap = mem_heap_create(1024);
5502 rec = btr_cur_get_rec(cursor);
5503 page_zip = buf_block_get_page_zip(block);
5504#ifdef UNIV_ZIP_DEBUG
5505 ut_a(!page_zip || page_zip_validate(page_zip, page, index));
5506#endif /* UNIV_ZIP_DEBUG */
5507
5508 offsets = rec_get_offsets(rec, index, NULL, page_is_leaf(page),
5509 ULINT_UNDEFINED, &heap);
5510
5511 if (rec_offs_any_extern(offsets)) {
5512 btr_rec_free_externally_stored_fields(index,
5513 rec, offsets, page_zip,
5514 rollback, mtr);
5515#ifdef UNIV_ZIP_DEBUG
5516 ut_a(!page_zip || page_zip_validate(page_zip, page, index));
5517#endif /* UNIV_ZIP_DEBUG */
5518 }
5519
5520 if (page_is_leaf(page)) {
5521 const bool is_default_row = rec_get_info_bits(
5522 rec, page_rec_is_comp(rec)) & REC_INFO_MIN_REC_FLAG;
5523 if (UNIV_UNLIKELY(is_default_row)) {
5524 /* This should be rolling back instant ADD COLUMN.
5525 If this is a recovered transaction, then
5526 index->is_instant() will hold until the
5527 insert into SYS_COLUMNS is rolled back. */
5528 ut_ad(rollback);
5529 ut_ad(index->table->supports_instant());
5530 ut_ad(index->is_primary());
5531 } else if (flags == 0) {
5532 lock_update_delete(block, rec);
5533 }
5534
5535 if (!page_is_root(page)) {
5536 if (page_get_n_recs(page) < 2) {
5537 goto discard_page;
5538 }
5539 } else if (page_get_n_recs(page) == 1
5540 + (index->is_instant()
5541 && !rec_is_default_row(rec, index))) {
5542 /* The whole index (and table) becomes logically empty.
5543 Empty the whole page. That is, if we are deleting the
5544 only user record, also delete the 'default row' record
5545 if one exists (it exists if and only if is_instant()).
5546 If we are deleting the 'default row' record and the
5547 table becomes empty, clean up the whole page. */
5548 ut_ad(!index->is_instant()
5549 || rec_is_default_row(
5550 page_rec_get_next_const(
5551 page_get_infimum_rec(page)),
5552 index));
5553 btr_page_empty(block, page_zip, index, 0, mtr);
5554 page_cur_set_after_last(block,
5555 btr_cur_get_page_cur(cursor));
5556 if (index->is_primary()) {
5557 /* Concurrent access is prevented by
5558 index->lock and root_block->lock
5559 X-latch, so this should be safe. */
5560 index->remove_instant();
5561 }
5562 ret = TRUE;
5563 goto return_after_reservations;
5564 }
5565
5566 if (UNIV_LIKELY(!is_default_row)) {
5567 btr_search_update_hash_on_delete(cursor);
5568 } else {
5569 page_cur_delete_rec(btr_cur_get_page_cur(cursor),
5570 index, offsets, mtr);
5571 /* We must empty the PAGE_FREE list, because
5572 after rollback, this deleted 'default row' record
5573 would carry too many fields, and we would be
5574 unable to know the size of the freed record. */
5575 btr_page_reorganize(btr_cur_get_page_cur(cursor),
5576 index, mtr);
5577 ut_ad(!ret);
5578 goto return_after_reservations;
5579 }
5580 } else if (UNIV_UNLIKELY(page_rec_is_first(rec, page))) {
5581 if (page_rec_is_last(rec, page)) {
5582discard_page:
5583 ut_ad(page_get_n_recs(page) == 1);
5584 /* If there is only one record, drop
5585 the whole page. */
5586
5587 btr_discard_page(cursor, mtr);
5588
5589 ret = TRUE;
5590 goto return_after_reservations;
5591 }
5592
5593 rec_t* next_rec = page_rec_get_next(rec);
5594
5595 if (!page_has_prev(page)) {
5596
5597 /* If we delete the leftmost node pointer on a
5598 non-leaf level, we must mark the new leftmost node
5599 pointer as the predefined minimum record */
5600
5601 /* This will make page_zip_validate() fail until
5602 page_cur_delete_rec() completes. This is harmless,
5603 because everything will take place within a single
5604 mini-transaction and because writing to the redo log
5605 is an atomic operation (performed by mtr_commit()). */
5606 btr_set_min_rec_mark(next_rec, mtr);
5607 } else if (dict_index_is_spatial(index)) {
5608 			/* For an R-tree, if we delete the leftmost node
5609 			pointer, we need to update the parent page. */
5610 rtr_mbr_t father_mbr;
5611 rec_t* father_rec;
5612 btr_cur_t father_cursor;
5613 ulint* offsets;
5614 bool upd_ret;
5615 ulint len;
5616
5617 rtr_page_get_father_block(NULL, heap, index,
5618 block, mtr, NULL,
5619 &father_cursor);
5620 offsets = rec_get_offsets(
5621 btr_cur_get_rec(&father_cursor), index, NULL,
5622 false, ULINT_UNDEFINED, &heap);
5623
5624 father_rec = btr_cur_get_rec(&father_cursor);
5625 rtr_read_mbr(rec_get_nth_field(
5626 father_rec, offsets, 0, &len), &father_mbr);
5627
5628 upd_ret = rtr_update_mbr_field(&father_cursor, offsets,
5629 NULL, page, &father_mbr,
5630 next_rec, mtr);
5631
5632 if (!upd_ret) {
5633 *err = DB_ERROR;
5634
5635 mem_heap_free(heap);
5636 return(FALSE);
5637 }
5638
5639 ut_d(parent_latched = true);
5640 } else {
5641 /* Otherwise, if we delete the leftmost node pointer
5642 on a page, we have to change the parent node pointer
5643 so that it is equal to the new leftmost node pointer
5644 on the page */
5645
5646 btr_node_ptr_delete(index, block, mtr);
5647 const ulint level = btr_page_get_level(page);
5648
5649 dtuple_t* node_ptr = dict_index_build_node_ptr(
5650 index, next_rec, block->page.id.page_no(),
5651 heap, level);
5652
5653 btr_insert_on_non_leaf_level(
5654 flags, index, level + 1, node_ptr, mtr);
5655
5656 ut_d(parent_latched = true);
5657 }
5658 }
5659
5660 page_cur_delete_rec(btr_cur_get_page_cur(cursor), index, offsets, mtr);
5661#ifdef UNIV_ZIP_DEBUG
5662 ut_a(!page_zip || page_zip_validate(page_zip, page, index));
5663#endif /* UNIV_ZIP_DEBUG */
5664
5665return_after_reservations:
5666 /* btr_check_node_ptr() needs parent block latched */
5667 ut_ad(!parent_latched || btr_check_node_ptr(index, block, mtr));
5668
5669 *err = DB_SUCCESS;
5670
5671 mem_heap_free(heap);
5672
5673 if (ret == FALSE) {
5674 ret = btr_cur_compress_if_useful(cursor, FALSE, mtr);
5675 }
5676
5677 if (!srv_read_only_mode
5678 && page_is_leaf(page)
5679 && !dict_index_is_online_ddl(index)) {
5680
5681 mtr_memo_release(mtr, dict_index_get_lock(index),
5682 MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK);
5683
5684 		/* NOTE: We cannot release the root block latch here, because the
5685 		root contains the segment headers and has usually been modified already. */
5686 }
5687
5688 index->table->space->release_free_extents(n_reserved);
5689 return(ret);
5690}
5691
5692/*******************************************************************//**
5693Adds path information to the cursor for the current page, for which
5694the binary search has been performed. */
5695static
5696void
5697btr_cur_add_path_info(
5698/*==================*/
5699 btr_cur_t* cursor, /*!< in: cursor positioned on a page */
5700 ulint height, /*!< in: height of the page in tree;
5701 0 means leaf node */
5702 ulint root_height) /*!< in: root node height in tree */
5703{
5704 btr_path_t* slot;
5705 const rec_t* rec;
5706 const page_t* page;
5707
5708 ut_a(cursor->path_arr);
5709
5710 if (root_height >= BTR_PATH_ARRAY_N_SLOTS - 1) {
5711 /* Do nothing; return empty path */
5712
5713 slot = cursor->path_arr;
5714 slot->nth_rec = ULINT_UNDEFINED;
5715
5716 return;
5717 }
5718
5719 if (height == 0) {
5720 /* Mark end of slots for path */
5721 slot = cursor->path_arr + root_height + 1;
5722 slot->nth_rec = ULINT_UNDEFINED;
5723 }
5724
5725 rec = btr_cur_get_rec(cursor);
5726
5727 slot = cursor->path_arr + (root_height - height);
5728
5729 page = page_align(rec);
5730
5731 slot->nth_rec = page_rec_get_n_recs_before(rec);
5732 slot->n_recs = page_get_n_recs(page);
5733 slot->page_no = page_get_page_no(page);
5734 slot->page_level = btr_page_get_level(page);
5735}
5736
5737/*******************************************************************//**
5738Estimate the number of rows between slot1 and slot2 for any level on a
5739B-tree. This function starts from slot1->page and reads a few pages to
5740the right, counting their records. If we reach slot2->page quickly then
5741we know exactly how many records there are between slot1 and slot2 and
5742we set is_n_rows_exact to TRUE. If we cannot reach slot2->page quickly
5743 then we calculate the average number of records in the pages scanned
5744 so far, assume that all pages we did not scan up to slot2->page
5745 contain the same number of records, and multiply that average by
5746 the number of pages between slot1->page and slot2->page (which is
5747 n_rows_on_prev_level). In this case we set is_n_rows_exact to FALSE.
5748@return number of rows, not including the borders (exact or estimated) */
5749static
5750ha_rows
5751btr_estimate_n_rows_in_range_on_level(
5752/*==================================*/
5753 dict_index_t* index, /*!< in: index */
5754 btr_path_t* slot1, /*!< in: left border */
5755 btr_path_t* slot2, /*!< in: right border */
5756 ha_rows n_rows_on_prev_level, /*!< in: number of rows
5757 on the previous level for the
5758 same descend paths; used to
5759 determine the number of pages
5760 on this level */
5761 bool* is_n_rows_exact) /*!< out: TRUE if the returned
5762 value is exact i.e. not an
5763 estimation */
5764{
5765 ha_rows n_rows = 0;
5766 uint n_pages_read = 0;
5767 ulint level;
5768
5769 /* Assume by default that we will scan all pages between
5770 slot1->page_no and slot2->page_no. */
5771 *is_n_rows_exact = true;
5772
5773 /* Add records from slot1->page_no which are to the right of
5774 the record which serves as a left border of the range, if any
5775 (we don't include the record itself in this count). */
5776 if (slot1->nth_rec <= slot1->n_recs) {
5777 n_rows += slot1->n_recs - slot1->nth_rec;
5778 }
5779
5780 /* Add records from slot2->page_no which are to the left of
5781 	the record which serves as a right border of the range, if any
5782 (we don't include the record itself in this count). */
5783 if (slot2->nth_rec > 1) {
5784 n_rows += slot2->nth_rec - 1;
5785 }
5786
5787 /* Count the records in the pages between slot1->page_no and
5788 slot2->page_no (non inclusive), if any. */
5789
5790 	/* Do not read more than this number of pages, in order not to hurt
5791 	performance: this code only computes an estimate. If we read this
5792 	many pages before reaching slot2->page_no, then we estimate the
5793 	average from the pages scanned so far. */
5794# define N_PAGES_READ_LIMIT 10
5795
5796 const fil_space_t* space = index->table->space;
5797 page_id_t page_id(space->id, slot1->page_no);
5798 const page_size_t page_size(space->flags);
5799
5800 level = slot1->page_level;
5801
5802 do {
5803 mtr_t mtr;
5804 page_t* page;
5805 buf_block_t* block;
5806 dberr_t err=DB_SUCCESS;
5807
5808 mtr_start(&mtr);
5809
5810 /* Fetch the page. Because we are not holding the
5811 index->lock, the tree may have changed and we may be
5812 attempting to read a page that is no longer part of
5813 the B-tree. We pass BUF_GET_POSSIBLY_FREED in order to
5814 silence a debug assertion about this. */
5815 block = buf_page_get_gen(page_id, page_size, RW_S_LATCH,
5816 NULL, BUF_GET_POSSIBLY_FREED,
5817 __FILE__, __LINE__, &mtr, &err);
5818
5819 ut_ad((block != NULL) == (err == DB_SUCCESS));
5820
5821 if (err != DB_SUCCESS) {
5822 if (err == DB_DECRYPTION_FAILED) {
5823 ib_push_warning((void *)NULL,
5824 DB_DECRYPTION_FAILED,
5825 "Table %s is encrypted but encryption service or"
5826 " used key_id is not available. "
5827 " Can't continue reading table.",
5828 index->table->name);
5829 index->table->file_unreadable = true;
5830 }
5831
5832 mtr_commit(&mtr);
5833 goto inexact;
5834 }
5835
5836 page = buf_block_get_frame(block);
5837
5838 /* It is possible that the tree has been reorganized in the
5839 meantime and this is a different page. If this happens the
5840 calculated estimate will be bogus, which is not fatal as
5841 this is only an estimate. We are sure that a page with
5842 page_no exists because InnoDB never frees pages, only
5843 reuses them. */
5844 if (!fil_page_index_page_check(page)
5845 || btr_page_get_index_id(page) != index->id
5846 || btr_page_get_level(page) != level) {
5847
5848 /* The page got reused for something else */
5849 mtr_commit(&mtr);
5850 goto inexact;
5851 }
5852
5853 /* It is possible but highly unlikely that the page was
5854 originally written by an old version of InnoDB that did
5855 not initialize FIL_PAGE_TYPE on other than B-tree pages.
5856 For example, this could be an almost-empty BLOB page
5857 that happens to contain the magic values in the fields
5858 that we checked above. */
5859
5860 n_pages_read++;
5861
5862 if (page_id.page_no() != slot1->page_no) {
5863 /* Do not count the records on slot1->page_no,
5864 we already counted them before this loop. */
5865 n_rows += page_get_n_recs(page);
5866 }
5867
5868 page_id.set_page_no(btr_page_get_next(page, &mtr));
5869
5870 mtr_commit(&mtr);
5871
5872 if (n_pages_read == N_PAGES_READ_LIMIT
5873 || page_id.page_no() == FIL_NULL) {
5874 			/* Either we read too many pages, or
5875 			we reached the end of the level without passing
5876 			through slot2->page_no; in the latter case the
5877 			tree must have changed in the meantime */
5878 goto inexact;
5879 }
5880
5881 } while (page_id.page_no() != slot2->page_no);
5882
5883 return(n_rows);
5884
5885inexact:
5886
5887 *is_n_rows_exact = false;
5888
5889 	/* We stopped before reaching slot2->page_no */
5890
5891 if (n_pages_read > 0) {
5892 /* The number of pages on this level is
5893 n_rows_on_prev_level, multiply it by the
5894 average number of recs per page so far */
5895 n_rows = n_rows_on_prev_level * n_rows / n_pages_read;
5896 } else {
5897 /* The tree changed before we could even
5898 start with slot1->page_no */
5899 n_rows = 10;
5900 }
5901
5902 return(n_rows);
5903}
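/* Worked example of the extrapolation above (illustrative numbers): if the
previous level suggested that this level holds n_rows_on_prev_level = 100
pages, and after reading n_pages_read = 10 pages we have counted
n_rows = 950 records, the estimate becomes 100 * 950 / 10 = 9500 rows,
with is_n_rows_exact set to false. */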
5904
5905/** If the tree gets changed too much between the two dives for the left
5906and right boundary then btr_estimate_n_rows_in_range_low() will retry
5907that many times before giving up and returning the value stored in
5908rows_in_range_arbitrary_ret_val. */
5909static const unsigned rows_in_range_max_retries = 4;
5910
5911/** We pretend that a range has that many records if the tree keeps changing
5912for rows_in_range_max_retries retries while we try to estimate the records
5913in a given range. */
5914static const ha_rows rows_in_range_arbitrary_ret_val = 10;
5915
5916/** Estimates the number of rows in a given index range.
5917@param[in] index index
5918@param[in] tuple1 range start, may also be empty tuple
5919@param[in] mode1 search mode for range start
5920@param[in] tuple2 range end, may also be empty tuple
5921@param[in] mode2 search mode for range end
5922@param[in] nth_attempt if the tree gets modified too much while
5923we are trying to analyze it, then we will retry (this function will call
5924itself, incrementing this parameter)
5925@return estimated number of rows; if after rows_in_range_max_retries
5926retries the tree keeps changing, then we will just return
5927rows_in_range_arbitrary_ret_val as a result (if
5928nth_attempt >= rows_in_range_max_retries and the tree is modified between
5929the two dives). */
5930static
5931ha_rows
5932btr_estimate_n_rows_in_range_low(
5933 dict_index_t* index,
5934 const dtuple_t* tuple1,
5935 page_cur_mode_t mode1,
5936 const dtuple_t* tuple2,
5937 page_cur_mode_t mode2,
5938 unsigned nth_attempt)
5939{
5940 btr_path_t path1[BTR_PATH_ARRAY_N_SLOTS];
5941 btr_path_t path2[BTR_PATH_ARRAY_N_SLOTS];
5942 btr_cur_t cursor;
5943 btr_path_t* slot1;
5944 btr_path_t* slot2;
5945 bool diverged;
5946 bool diverged_lot;
5947 ulint divergence_level;
5948 ha_rows n_rows;
5949 bool is_n_rows_exact;
5950 ulint i;
5951 mtr_t mtr;
5952 ha_rows table_n_rows;
5953
5954 table_n_rows = dict_table_get_n_rows(index->table);
5955
5956 /* Below we dive to the two records specified by tuple1 and tuple2 and
5957 we remember the entire dive paths from the tree root. The place where
5958 the tuple1 path ends on the leaf level we call "left border" of our
5959 interval and the place where the tuple2 path ends on the leaf level -
5960 "right border". We take care to either include or exclude the interval
5961 boundaries depending on whether <, <=, > or >= was specified. For
5962 example if "5 < x AND x <= 10" then we should not include the left
5963 boundary, but should include the right one. */
5964
5965 mtr_start(&mtr);
5966
5967 cursor.path_arr = path1;
5968
5969 bool should_count_the_left_border;
5970
5971 if (dtuple_get_n_fields(tuple1) > 0) {
5972
5973 btr_cur_search_to_nth_level(index, 0, tuple1, mode1,
5974 BTR_SEARCH_LEAF | BTR_ESTIMATE,
5975 &cursor, 0,
5976 __FILE__, __LINE__, &mtr);
5977
5978 ut_ad(!page_rec_is_infimum(btr_cur_get_rec(&cursor)));
5979
5980 /* We should count the border if there are any records to
5981 match the criteria, i.e. if the maximum record on the tree is
5982 5 and x > 3 is specified then the cursor will be positioned at
5983 5 and we should count the border, but if x > 7 is specified,
5984 then the cursor will be positioned at 'sup' on the rightmost
5985 leaf page in the tree and we should not count the border. */
5986 should_count_the_left_border
5987 = !page_rec_is_supremum(btr_cur_get_rec(&cursor));
5988 } else {
5989 dberr_t err = DB_SUCCESS;
5990
5991 err = btr_cur_open_at_index_side(true, index,
5992 BTR_SEARCH_LEAF | BTR_ESTIMATE,
5993 &cursor, 0, &mtr);
5994
5995 if (err != DB_SUCCESS) {
5996 ib::warn() << " Error code: " << err
5997 << " btr_estimate_n_rows_in_range_low "
5998 << " called from file: "
5999 << __FILE__ << " line: " << __LINE__
6000 << " table: " << index->table->name
6001 << " index: " << index->name;
6002 }
6003
6004 ut_ad(page_rec_is_infimum(btr_cur_get_rec(&cursor)));
6005
6006 		/* The range specified is without a left border, just
6007 'x < 123' or 'x <= 123' and btr_cur_open_at_index_side()
6008 positioned the cursor on the infimum record on the leftmost
6009 page, which must not be counted. */
6010 should_count_the_left_border = false;
6011 }
6012
6013 mtr_commit(&mtr);
6014
6015 if (!index->is_readable()) {
6016 return 0;
6017 }
6018
6019 mtr_start(&mtr);
6020
6021 cursor.path_arr = path2;
6022
6023 bool should_count_the_right_border;
6024
6025 if (dtuple_get_n_fields(tuple2) > 0) {
6026
6027 btr_cur_search_to_nth_level(index, 0, tuple2, mode2,
6028 BTR_SEARCH_LEAF | BTR_ESTIMATE,
6029 &cursor, 0,
6030 __FILE__, __LINE__, &mtr);
6031
6032 const rec_t* rec = btr_cur_get_rec(&cursor);
6033
6034 ut_ad(!(mode2 == PAGE_CUR_L && page_rec_is_supremum(rec)));
6035
6036 should_count_the_right_border
6037 = (mode2 == PAGE_CUR_LE /* if the range is '<=' */
6038 /* and the record was found */
6039 && cursor.low_match >= dtuple_get_n_fields(tuple2))
6040 || (mode2 == PAGE_CUR_L /* or if the range is '<' */
6041 /* and there are any records to match the criteria,
6042 i.e. if the minimum record on the tree is 5 and
6043 x < 7 is specified then the cursor will be
6044 positioned at 5 and we should count the border, but
6045 if x < 2 is specified, then the cursor will be
6046 positioned at 'inf' and we should not count the
6047 border */
6048 && !page_rec_is_infimum(rec));
6049 /* Notice that for "WHERE col <= 'foo'" MySQL passes to
6050 ha_innobase::records_in_range():
6051 min_key=NULL (left-unbounded) which is expected
6052 max_key='foo' flag=HA_READ_AFTER_KEY (PAGE_CUR_G), which is
6053 unexpected - one would expect
6054 flag=HA_READ_KEY_OR_PREV (PAGE_CUR_LE). In this case the
6055 cursor will be positioned on the first record to the right of
6056 the requested one (can also be positioned on the 'sup') and
6057 we should not count the right border. */
6058 } else {
6059 dberr_t err = DB_SUCCESS;
6060
6061 err = btr_cur_open_at_index_side(false, index,
6062 BTR_SEARCH_LEAF | BTR_ESTIMATE,
6063 &cursor, 0, &mtr);
6064
6065 if (err != DB_SUCCESS) {
6066 ib::warn() << " Error code: " << err
6067 << " btr_estimate_n_rows_in_range_low "
6068 << " called from file: "
6069 << __FILE__ << " line: " << __LINE__
6070 << " table: " << index->table->name
6071 << " index: " << index->name;
6072 }
6073
6074 ut_ad(page_rec_is_supremum(btr_cur_get_rec(&cursor)));
6075
6076 		/* The range specified is without a right border, just
6077 'x > 123' or 'x >= 123' and btr_cur_open_at_index_side()
6078 positioned the cursor on the supremum record on the rightmost
6079 page, which must not be counted. */
6080 should_count_the_right_border = false;
6081 }
6082
6083 mtr_commit(&mtr);
6084
6085 /* We have the path information for the range in path1 and path2 */
6086
6087 n_rows = 0;
6088 is_n_rows_exact = true;
6089
6090 /* This becomes true when the two paths do not pass through the
6091 same pages anymore. */
6092 diverged = false;
6093
6094 	/* This becomes true when the paths no longer pass through the
6095 	same or neighboring-on-the-same-level pages, i.e. when they are
6096 	neither identical nor adjacent any more. */
6097 diverged_lot = false;
6098
6099 /* This is the level where paths diverged a lot. */
6100 divergence_level = 1000000;
6101
6102 for (i = 0; ; i++) {
6103 ut_ad(i < BTR_PATH_ARRAY_N_SLOTS);
6104
6105 slot1 = path1 + i;
6106 slot2 = path2 + i;
6107
6108 if (slot1->nth_rec == ULINT_UNDEFINED
6109 || slot2->nth_rec == ULINT_UNDEFINED) {
6110
6111 /* Here none of the borders were counted. For example,
6112 if on the leaf level we descended to:
6113 (inf, a, b, c, d, e, f, sup)
6114 ^ ^
6115 path1 path2
6116 then n_rows will be 2 (c and d). */
6117
6118 if (is_n_rows_exact) {
6119 /* Only fiddle to adjust this off-by-one
6120 if the number is exact, otherwise we do
6121 much grosser adjustments below. */
6122
6123 btr_path_t* last1 = &path1[i - 1];
6124 btr_path_t* last2 = &path2[i - 1];
6125
6126 /* If both paths end up on the same record on
6127 the leaf level. */
6128 if (last1->page_no == last2->page_no
6129 && last1->nth_rec == last2->nth_rec) {
6130
6131 /* n_rows can be > 0 here if the paths
6132 were first different and then converged
6133 to the same record on the leaf level.
6134 For example:
6135 SELECT ... LIKE 'wait/synch/rwlock%'
6136 mode1=PAGE_CUR_GE,
6137 tuple1="wait/synch/rwlock"
6138 path1[0]={nth_rec=58, n_recs=58,
6139 page_no=3, page_level=1}
6140 path1[1]={nth_rec=56, n_recs=55,
6141 page_no=119, page_level=0}
6142
6143 mode2=PAGE_CUR_G
6144 tuple2="wait/synch/rwlock"
6145 path2[0]={nth_rec=57, n_recs=57,
6146 page_no=3, page_level=1}
6147 path2[1]={nth_rec=56, n_recs=55,
6148 page_no=119, page_level=0} */
6149
6150 /* If the range is such that we should
6151 count both borders, then avoid
6152 counting that record twice - once as a
6153 left border and once as a right
6154 border. */
6155 if (should_count_the_left_border
6156 && should_count_the_right_border) {
6157
6158 n_rows = 1;
6159 } else {
6160 /* Some of the borders should
6161 not be counted, e.g. [3,3). */
6162 n_rows = 0;
6163 }
6164 } else {
6165 if (should_count_the_left_border) {
6166 n_rows++;
6167 }
6168
6169 if (should_count_the_right_border) {
6170 n_rows++;
6171 }
6172 }
6173 }
6174
6175 if (i > divergence_level + 1 && !is_n_rows_exact) {
6176 /* In trees whose height is > 1 our algorithm
6177 tends to underestimate: multiply the estimate
6178 by 2: */
6179
6180 n_rows = n_rows * 2;
6181 }
6182
6183 DBUG_EXECUTE_IF("bug14007649", return(n_rows););
6184
6185 		/* Do not let the estimate of the number of rows in the
6186 		range exceed 1 / 2 of the estimated rows in the whole
6187 		table */
6188
6189 if (n_rows > table_n_rows / 2 && !is_n_rows_exact) {
6190
6191 n_rows = table_n_rows / 2;
6192
6193 /* If there are just 0 or 1 rows in the table,
6194 then we estimate all rows are in the range */
6195
6196 if (n_rows == 0) {
6197 n_rows = table_n_rows;
6198 }
6199 }
6200
6201 return(n_rows);
6202 }
6203
6204 if (!diverged && slot1->nth_rec != slot2->nth_rec) {
6205
6206 /* If both slots do not point to the same page,
6207 this means that the tree must have changed between
6208 the dive for slot1 and the dive for slot2 at the
6209 beginning of this function. */
6210 if (slot1->page_no != slot2->page_no
6211 || slot1->page_level != slot2->page_level) {
6212
6213 /* If the tree keeps changing even after a
6214 few attempts, then just return some arbitrary
6215 number. */
6216 if (nth_attempt >= rows_in_range_max_retries) {
6217 return(rows_in_range_arbitrary_ret_val);
6218 }
6219
6220 return btr_estimate_n_rows_in_range_low(
6221 index, tuple1, mode1,
6222 tuple2, mode2, nth_attempt + 1);
6223 }
6224
6225 diverged = true;
6226
6227 if (slot1->nth_rec < slot2->nth_rec) {
6228 /* We do not count the borders (nor the left
6229 nor the right one), thus "- 1". */
6230 n_rows = slot2->nth_rec - slot1->nth_rec - 1;
6231
6232 if (n_rows > 0) {
6233 /* There is at least one row between
6234 the two borders pointed to by slot1
6235 and slot2, so on the level below the
6236 slots will point to non-adjacent
6237 pages. */
6238 diverged_lot = true;
6239 divergence_level = i;
6240 }
6241 } else {
6242 /* It is possible that
6243 slot1->nth_rec >= slot2->nth_rec
6244 if, for example, we have a single page
6245 tree which contains (inf, 5, 6, supr)
6246 and we select where x > 20 and x < 30;
6247 in this case slot1->nth_rec will point
6248 to the supr record and slot2->nth_rec
6249 will point to 6. */
6250 n_rows = 0;
6251 should_count_the_left_border = false;
6252 should_count_the_right_border = false;
6253 }
6254
6255 } else if (diverged && !diverged_lot) {
6256
6257 if (slot1->nth_rec < slot1->n_recs
6258 || slot2->nth_rec > 1) {
6259
6260 diverged_lot = true;
6261 divergence_level = i;
6262
6263 n_rows = 0;
6264
6265 if (slot1->nth_rec < slot1->n_recs) {
6266 n_rows += slot1->n_recs
6267 - slot1->nth_rec;
6268 }
6269
6270 if (slot2->nth_rec > 1) {
6271 n_rows += slot2->nth_rec - 1;
6272 }
6273 }
6274 } else if (diverged_lot) {
6275
6276 n_rows = btr_estimate_n_rows_in_range_on_level(
6277 index, slot1, slot2, n_rows,
6278 &is_n_rows_exact);
6279 }
6280 }
6281}
6282
6283/** Estimates the number of rows in a given index range.
6284@param[in] index index
6285@param[in] tuple1 range start, may also be empty tuple
6286@param[in] mode1 search mode for range start
6287@param[in] tuple2 range end, may also be empty tuple
6288@param[in] mode2 search mode for range end
6289@return estimated number of rows */
6290ha_rows
6291btr_estimate_n_rows_in_range(
6292 dict_index_t* index,
6293 const dtuple_t* tuple1,
6294 page_cur_mode_t mode1,
6295 const dtuple_t* tuple2,
6296 page_cur_mode_t mode2)
6297{
6298 return btr_estimate_n_rows_in_range_low(
6299 index, tuple1, mode1, tuple2, mode2, 1);
6300}
6301
6302/*******************************************************************//**
6303Record the number of non_null key values in a given index for
6304each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
6305The estimates are eventually stored in the array:
6306index->stat_n_non_null_key_vals[], which is indexed from 0 to n-1. */
6307static
6308void
6309btr_record_not_null_field_in_rec(
6310/*=============================*/
6311 ulint n_unique, /*!< in: dict_index_get_n_unique(index),
6312 					number of columns that uniquely determine
6313 an index entry */
6314 const ulint* offsets, /*!< in: rec_get_offsets(rec, index),
6315 its size could be for all fields or
6316 that of "n_unique" */
6317 ib_uint64_t* n_not_null) /*!< in/out: array to record number of
6318 not null rows for n-column prefix */
6319{
6320 ulint i;
6321
6322 ut_ad(rec_offs_n_fields(offsets) >= n_unique);
6323
6324 if (n_not_null == NULL) {
6325 return;
6326 }
6327
6328 for (i = 0; i < n_unique; i++) {
6329 if (rec_offs_nth_sql_null(offsets, i)) {
6330 break;
6331 }
6332
6333 n_not_null[i]++;
6334 }
6335}
6336
6337/*******************************************************************//**
6338Estimates the number of different key values in a given index, for
6339each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
6340The estimates are stored in the array index->stat_n_diff_key_vals[] (indexed
63410..n_uniq-1) and the number of pages that were sampled is saved in
6342index->stat_n_sample_sizes[].
6343If innodb_stats_method is nulls_ignored, we also record the number of
6344non-null values for each prefix and stored the estimates in
6345array index->stat_n_non_null_key_vals.
6346@return true if the index is available and we get the estimated numbers,
6347false if the index is unavailable. */
6348bool
6349btr_estimate_number_of_different_key_vals(
6350/*======================================*/
6351 dict_index_t* index) /*!< in: index */
6352{
6353 btr_cur_t cursor;
6354 page_t* page;
6355 rec_t* rec;
6356 ulint n_cols;
6357 ib_uint64_t* n_diff;
6358 ib_uint64_t* n_not_null;
6359 ibool stats_null_not_equal;
6360 uintmax_t n_sample_pages=1; /* number of pages to sample */
6361 ulint not_empty_flag = 0;
6362 ulint total_external_size = 0;
6363 ulint i;
6364 ulint j;
6365 uintmax_t add_on;
6366 mtr_t mtr;
6367 mem_heap_t* heap = NULL;
6368 ulint* offsets_rec = NULL;
6369 ulint* offsets_next_rec = NULL;
6370
6371 	/* For a spatial index, no such statistics can be
6372 	fetched. */
6373 if (dict_index_is_spatial(index)) {
6374 return(false);
6375 }
6376
6377 n_cols = dict_index_get_n_unique(index);
6378
6379 heap = mem_heap_create((sizeof *n_diff + sizeof *n_not_null)
6380 * n_cols
6381 + dict_index_get_n_fields(index)
6382 * (sizeof *offsets_rec
6383 + sizeof *offsets_next_rec));
6384
6385 n_diff = (ib_uint64_t*) mem_heap_zalloc(
6386 heap, n_cols * sizeof(n_diff[0]));
6387
6388 n_not_null = NULL;
6389
6390 /* Check srv_innodb_stats_method setting, and decide whether we
6391 need to record non-null value and also decide if NULL is
6392 considered equal (by setting stats_null_not_equal value) */
6393 switch (srv_innodb_stats_method) {
6394 case SRV_STATS_NULLS_IGNORED:
6395 n_not_null = (ib_uint64_t*) mem_heap_zalloc(
6396 heap, n_cols * sizeof *n_not_null);
6397 /* fall through */
6398
6399 case SRV_STATS_NULLS_UNEQUAL:
6400 /* for both SRV_STATS_NULLS_IGNORED and SRV_STATS_NULLS_UNEQUAL
6401 case, we will treat NULLs as unequal value */
6402 stats_null_not_equal = TRUE;
6403 break;
6404
6405 case SRV_STATS_NULLS_EQUAL:
6406 stats_null_not_equal = FALSE;
6407 break;
6408
6409 default:
6410 ut_error;
6411 }
6412
6413 if (srv_stats_sample_traditional) {
6414 /* It makes no sense to test more pages than are contained
6415 in the index, thus we lower the number if it is too high */
6416 if (srv_stats_transient_sample_pages > index->stat_index_size) {
6417 if (index->stat_index_size > 0) {
6418 n_sample_pages = index->stat_index_size;
6419 }
6420 } else {
6421 n_sample_pages = srv_stats_transient_sample_pages;
6422 }
6423 } else {
6424 		/* New logarithmic number of pages to estimate from.
6425 		The number of pages estimated should be between 1 and
6426 		index->stat_index_size.
6427
6428 		If we have only 0 or 1 index pages then we can only take 1
6429 		sample. We have already initialized n_sample_pages to 1.
6430
6431 		Taking the index size as I, the configured sample size as S
6432 		and log2(I)*S as L:
6433 		requirement 1) the result of the expression must not exceed I;
6434 		requirement 2) ideally we want to sample at least S pages;
6435 		so the expression is min(I, max(min(S, I), L)).
6436
6437 		Looking for simplifications:
6438
6439 		case 1: assume S < I
6440 		min(I, max(min(S, I), L)) -> min(I, max(S, L))
6441
6442 		but since L = log2(I)*S and log2(I) >= 1, L >= S always, so max(S, L) = L.
6443
6444 		so we have: min(I, L)
6445
6446 		case 2: assume I <= S
6447 		min(I, max(min(S, I), L)) -> min(I, max(I, L))
6448
6449 		case 2a: L > I
6450 		min(I, max(I, L)) -> min(I, L) -> I
6451
6452 		case 2b: L <= I
6453 		min(I, max(I, L)) -> min(I, I) -> I
6454
6455 		so every case 2 path yields I, and the expression becomes:
6456 		n_pages = S < I ? min(I, L) : I
6457 		*/
6458 if (index->stat_index_size > 1) {
6459 n_sample_pages = (srv_stats_transient_sample_pages < index->stat_index_size) ?
6460 ut_min(static_cast<ulint>(index->stat_index_size),
6461 static_cast<ulint>(log2(index->stat_index_size)*srv_stats_transient_sample_pages))
6462 : index->stat_index_size;
6463
6464 }
6465 }
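	/* A minimal sketch of the resulting sampling rule, with I denoting
	index->stat_index_size and S denoting srv_stats_transient_sample_pages
	(illustrative only, not compiled): */
#if 0
	ulint	I = ulint(index->stat_index_size);
	ulint	S = ulint(srv_stats_transient_sample_pages);
	ulint	n_pages = (S < I)
		? ut_min(I, ulint(log2(double(I)) * double(S)))
		: I;
	/* Example: I = 10000 pages, S = 8: log2(10000) is about 13.3,
	so n_pages = min(10000, 106) = 106 pages are sampled. */
#endif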
6466
6467 /* Sanity check */
6468 ut_ad(n_sample_pages > 0 && n_sample_pages <= (index->stat_index_size <= 1 ? 1 : index->stat_index_size));
6469
6470 /* We sample some pages in the index to get an estimate */
6471
6472 for (i = 0; i < n_sample_pages; i++) {
6473 mtr_start(&mtr);
6474
6475 bool available;
6476
6477 available = btr_cur_open_at_rnd_pos(index, BTR_SEARCH_LEAF,
6478 &cursor, &mtr);
6479
6480 if (!available) {
6481 mtr_commit(&mtr);
6482 mem_heap_free(heap);
6483
6484 return(false);
6485 }
6486
6487 /* Count the number of different key values for each prefix of
6488 the key on this index page. If the prefix does not determine
6489 the index record uniquely in the B-tree, then we subtract one
6490 because otherwise our algorithm would give a wrong estimate
6491 for an index where there is just one key value. */
6492
6493 if (!index->is_readable()) {
6494 mtr_commit(&mtr);
6495 goto exit_loop;
6496 }
6497
6498 page = btr_cur_get_page(&cursor);
6499
6500 rec = page_rec_get_next(page_get_infimum_rec(page));
6501 const bool is_leaf = page_is_leaf(page);
6502
6503 if (!page_rec_is_supremum(rec)) {
6504 not_empty_flag = 1;
6505 offsets_rec = rec_get_offsets(rec, index, offsets_rec,
6506 is_leaf,
6507 ULINT_UNDEFINED, &heap);
6508
6509 if (n_not_null != NULL) {
6510 btr_record_not_null_field_in_rec(
6511 n_cols, offsets_rec, n_not_null);
6512 }
6513 }
6514
6515 while (!page_rec_is_supremum(rec)) {
6516 ulint matched_fields;
6517 rec_t* next_rec = page_rec_get_next(rec);
6518 if (page_rec_is_supremum(next_rec)) {
6519 total_external_size +=
6520 btr_rec_get_externally_stored_len(
6521 rec, offsets_rec);
6522 break;
6523 }
6524
6525 offsets_next_rec = rec_get_offsets(next_rec, index,
6526 offsets_next_rec,
6527 is_leaf,
6528 ULINT_UNDEFINED,
6529 &heap);
6530
6531 cmp_rec_rec_with_match(rec, next_rec,
6532 offsets_rec, offsets_next_rec,
6533 index, stats_null_not_equal,
6534 &matched_fields);
6535
6536 for (j = matched_fields; j < n_cols; j++) {
6537 /* We add one if this index record has
6538 a different prefix from the previous */
6539
6540 n_diff[j]++;
6541 }
6542
6543 if (n_not_null != NULL) {
6544 btr_record_not_null_field_in_rec(
6545 n_cols, offsets_next_rec, n_not_null);
6546 }
6547
6548 total_external_size
6549 += btr_rec_get_externally_stored_len(
6550 rec, offsets_rec);
6551
6552 rec = next_rec;
6553 /* Initialize offsets_rec for the next round
6554 and assign the old offsets_rec buffer to
6555 offsets_next_rec. */
6556 {
6557 ulint* offsets_tmp = offsets_rec;
6558 offsets_rec = offsets_next_rec;
6559 offsets_next_rec = offsets_tmp;
6560 }
6561 }
6562
6563 if (n_cols == dict_index_get_n_unique_in_tree(index)
6564 && page_has_siblings(page)) {
6565
6566 /* If there is more than one leaf page in the tree,
6567 we add one because we know that the first record
6568 on the page certainly had a different prefix than the
6569 last record on the previous index page in the
6570 alphabetical order. Before this fix, if there was
6571 just one big record on each clustered index page, the
6572 algorithm grossly underestimated the number of rows
6573 in the table. */
6574
6575 n_diff[n_cols - 1]++;
6576 }
6577
6578 mtr_commit(&mtr);
6579 }
6580
6581exit_loop:
6582 /* If we saw k borders between different key values on
6583 n_sample_pages leaf pages, we can estimate how many
6584 there will be in index->stat_n_leaf_pages */
6585
6586 /* We must take into account that our sample actually represents
6587 also the pages used for external storage of fields (those pages are
6588 included in index->stat_n_leaf_pages) */
6589
6590 for (j = 0; j < n_cols; j++) {
6591 index->stat_n_diff_key_vals[j]
6592 = BTR_TABLE_STATS_FROM_SAMPLE(
6593 n_diff[j], index, n_sample_pages,
6594 total_external_size, not_empty_flag);
6595
6596 /* If the tree is small, smaller than
6597 10 * n_sample_pages + total_external_size, then
6598 the above estimate is ok. For bigger trees it is common that we
6599 do not see any borders between key values in the few pages
6600 we pick. But still there may be n_sample_pages
6601 different key values, or even more. Let us try to approximate
6602 that: */
6603
6604 add_on = index->stat_n_leaf_pages
6605 / (10 * (n_sample_pages
6606 + total_external_size));
6607
6608 if (add_on > n_sample_pages) {
6609 add_on = n_sample_pages;
6610 }
6611
6612 index->stat_n_diff_key_vals[j] += add_on;
6613
6614 index->stat_n_sample_sizes[j] = n_sample_pages;
6615
6616 /* Update the stat_n_non_null_key_vals[] with our
6617 sampled result. stat_n_non_null_key_vals[] is created
6618 and initialized to zero in dict_index_add_to_cache(),
6619 along with stat_n_diff_key_vals[] array */
6620 if (n_not_null != NULL) {
6621 index->stat_n_non_null_key_vals[j] =
6622 BTR_TABLE_STATS_FROM_SAMPLE(
6623 n_not_null[j], index, n_sample_pages,
6624 total_external_size, not_empty_flag);
6625 }
6626 }
6627
6628 mem_heap_free(heap);
6629
6630 return(true);
6631}
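/* Worked example of the add_on correction above (illustrative numbers):
with index->stat_n_leaf_pages = 10000, n_sample_pages = 20 and
total_external_size = 0, add_on = 10000 / (10 * 20) = 50, which is then
capped to n_sample_pages = 20 before being added to
index->stat_n_diff_key_vals[j]. */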
6632
6633/*================== EXTERNAL STORAGE OF BIG FIELDS ===================*/
6634
6635/***********************************************************//**
6636Gets the offset of the pointer to the externally stored part of a field.
6637@return offset of the pointer to the externally stored part */
6638static
6639ulint
6640btr_rec_get_field_ref_offs(
6641/*=======================*/
6642 const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
6643 ulint n) /*!< in: index of the external field */
6644{
6645 ulint field_ref_offs;
6646 ulint local_len;
6647
6648 ut_a(rec_offs_nth_extern(offsets, n));
6649 field_ref_offs = rec_get_nth_field_offs(offsets, n, &local_len);
6650 ut_a(len_is_stored(local_len));
6651 ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
6652
6653 return(field_ref_offs + local_len - BTR_EXTERN_FIELD_REF_SIZE);
6654}
6655
6656/** Gets a pointer to the externally stored part of a field.
6657@param rec record
6658@param offsets rec_get_offsets(rec)
6659@param n index of the externally stored field
6660@return pointer to the externally stored part */
6661#define btr_rec_get_field_ref(rec, offsets, n) \
6662 ((rec) + btr_rec_get_field_ref_offs(offsets, n))
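/* For orientation, the layout of the 20-byte (BTR_EXTERN_FIELD_REF_SIZE)
field reference that the macro above points to, summarized from the
constants used elsewhere in this file: 4 bytes of space id, 4 bytes of
page number of the first BLOB page, 4 bytes of byte offset of the BLOB
header on that page, and the 8-byte BTR_EXTERN_LEN field whose most
significant byte carries the BTR_EXTERN_OWNER_FLAG and
BTR_EXTERN_INHERITED_FLAG bits and whose low 4 bytes hold the externally
stored length (read below as BTR_EXTERN_LEN + 4). */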
6663
6664/** Gets the externally stored size of a record, in units of a database page.
6665@param[in] rec record
6666@param[in] offsets array returned by rec_get_offsets()
6667@return externally stored part, in units of a database page */
6668ulint
6669btr_rec_get_externally_stored_len(
6670 const rec_t* rec,
6671 const ulint* offsets)
6672{
6673 ulint n_fields;
6674 ulint total_extern_len = 0;
6675 ulint i;
6676
6677 ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
6678
6679 if (!rec_offs_any_extern(offsets)) {
6680 return(0);
6681 }
6682
6683 n_fields = rec_offs_n_fields(offsets);
6684
6685 for (i = 0; i < n_fields; i++) {
6686 if (rec_offs_nth_extern(offsets, i)) {
6687
6688 ulint extern_len = mach_read_from_4(
6689 btr_rec_get_field_ref(rec, offsets, i)
6690 + BTR_EXTERN_LEN + 4);
6691
6692 total_extern_len += ut_calc_align(
6693 extern_len, ulint(srv_page_size));
6694 }
6695 }
6696
6697 return total_extern_len >> srv_page_size_shift;
6698}
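/* Worked example (illustrative): with srv_page_size = 16384 and
srv_page_size_shift = 14, an externally stored column whose BLOB pointer
records extern_len = 100000 bytes contributes
ut_calc_align(100000, 16384) = 114688 bytes, i.e. 114688 >> 14 = 7 pages,
to the returned total. */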
6699
6700/*******************************************************************//**
6701Sets the ownership bit of an externally stored field in a record. */
6702static
6703void
6704btr_cur_set_ownership_of_extern_field(
6705/*==================================*/
6706 page_zip_des_t* page_zip,/*!< in/out: compressed page whose uncompressed
6707 part will be updated, or NULL */
6708 rec_t* rec, /*!< in/out: clustered index record */
6709 dict_index_t* index, /*!< in: index of the page */
6710 const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
6711 ulint i, /*!< in: field number */
6712 ibool val, /*!< in: value to set */
6713 mtr_t* mtr) /*!< in: mtr, or NULL if not logged */
6714{
6715 byte* data;
6716 ulint local_len;
6717 ulint byte_val;
6718
6719 data = rec_get_nth_field(rec, offsets, i, &local_len);
6720 ut_ad(rec_offs_nth_extern(offsets, i));
6721 ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
6722
6723 local_len -= BTR_EXTERN_FIELD_REF_SIZE;
6724
6725 byte_val = mach_read_from_1(data + local_len + BTR_EXTERN_LEN);
6726
6727 if (val) {
6728 byte_val &= ~BTR_EXTERN_OWNER_FLAG;
6729 } else {
6730#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
6731 ut_a(!(byte_val & BTR_EXTERN_OWNER_FLAG));
6732#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
6733 byte_val |= BTR_EXTERN_OWNER_FLAG;
6734 }
6735
6736 if (page_zip) {
6737 mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val);
6738 page_zip_write_blob_ptr(page_zip, rec, index, offsets, i, mtr);
6739 } else if (mtr != NULL) {
6740
6741 mlog_write_ulint(data + local_len + BTR_EXTERN_LEN, byte_val,
6742 MLOG_1BYTE, mtr);
6743 } else {
6744 mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val);
6745 }
6746}
6747
6748/*******************************************************************//**
6749Marks non-updated off-page fields as disowned by this record. The ownership
6750must be transferred to the updated record which is inserted elsewhere in the
6751 index tree. In purge, only the owner of an externally stored field is
6752 allowed to free the field. */
6753void
6754btr_cur_disown_inherited_fields(
6755/*============================*/
6756 page_zip_des_t* page_zip,/*!< in/out: compressed page whose uncompressed
6757 part will be updated, or NULL */
6758 rec_t* rec, /*!< in/out: record in a clustered index */
6759 dict_index_t* index, /*!< in: index of the page */
6760 const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
6761 const upd_t* update, /*!< in: update vector */
6762 mtr_t* mtr) /*!< in/out: mini-transaction */
6763{
6764 ulint i;
6765
6766 ut_ad(rec_offs_validate(rec, index, offsets));
6767 ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
6768 ut_ad(rec_offs_any_extern(offsets));
6769
6770 for (i = 0; i < rec_offs_n_fields(offsets); i++) {
6771 if (rec_offs_nth_extern(offsets, i)
6772 && !upd_get_field_by_field_no(update, i, false)) {
6773 btr_cur_set_ownership_of_extern_field(
6774 page_zip, rec, index, offsets, i, FALSE, mtr);
6775 }
6776 }
6777}
6778
6779/*******************************************************************//**
6780Marks all extern fields in a record as owned by the record. This function
6781 should be called if the delete mark of a record is removed: a record that
6782 is not delete-marked always owns all its extern fields. */
6783static
6784void
6785btr_cur_unmark_extern_fields(
6786/*=========================*/
6787 page_zip_des_t* page_zip,/*!< in/out: compressed page whose uncompressed
6788 part will be updated, or NULL */
6789 rec_t* rec, /*!< in/out: record in a clustered index */
6790 dict_index_t* index, /*!< in: index of the page */
6791 const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
6792 mtr_t* mtr) /*!< in: mtr, or NULL if not logged */
6793{
6794 ulint n;
6795 ulint i;
6796
6797 ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
6798 n = rec_offs_n_fields(offsets);
6799
6800 if (!rec_offs_any_extern(offsets)) {
6801
6802 return;
6803 }
6804
6805 for (i = 0; i < n; i++) {
6806 if (rec_offs_nth_extern(offsets, i)) {
6807
6808 btr_cur_set_ownership_of_extern_field(
6809 page_zip, rec, index, offsets, i, TRUE, mtr);
6810 }
6811 }
6812}
6813
6814/*******************************************************************//**
6815Flags the data tuple fields that are marked as extern storage in the
6816update vector. We use this function to remember which fields we must
6817mark as extern storage in a record inserted for an update.
6818@return number of flagged external columns */
6819ulint
6820btr_push_update_extern_fields(
6821/*==========================*/
6822 dtuple_t* tuple, /*!< in/out: data tuple */
6823 const upd_t* update, /*!< in: update vector */
6824 mem_heap_t* heap) /*!< in: memory heap */
6825{
6826 ulint n_pushed = 0;
6827 ulint n;
6828 const upd_field_t* uf;
6829
6830 uf = update->fields;
6831 n = upd_get_n_fields(update);
6832
6833 for (; n--; uf++) {
6834 if (dfield_is_ext(&uf->new_val)) {
6835 dfield_t* field
6836 = dtuple_get_nth_field(tuple, uf->field_no);
6837
6838 if (!dfield_is_ext(field)) {
6839 dfield_set_ext(field);
6840 n_pushed++;
6841 }
6842
6843 switch (uf->orig_len) {
6844 byte* data;
6845 ulint len;
6846 byte* buf;
6847 case 0:
6848 break;
6849 case BTR_EXTERN_FIELD_REF_SIZE:
6850 /* Restore the original locally stored
6851 part of the column. In the undo log,
6852 InnoDB writes a longer prefix of externally
6853 stored columns, so that column prefixes
6854 in secondary indexes can be reconstructed. */
6855 dfield_set_data(field,
6856 (byte*) dfield_get_data(field)
6857 + dfield_get_len(field)
6858 - BTR_EXTERN_FIELD_REF_SIZE,
6859 BTR_EXTERN_FIELD_REF_SIZE);
6860 dfield_set_ext(field);
6861 break;
6862 default:
6863 /* Reconstruct the original locally
6864 stored part of the column. The data
6865 will have to be copied. */
6866 ut_a(uf->orig_len > BTR_EXTERN_FIELD_REF_SIZE);
6867
6868 data = (byte*) dfield_get_data(field);
6869 len = dfield_get_len(field);
6870
6871 buf = (byte*) mem_heap_alloc(heap,
6872 uf->orig_len);
6873 /* Copy the locally stored prefix. */
6874 memcpy(buf, data,
6875 unsigned(uf->orig_len)
6876 - BTR_EXTERN_FIELD_REF_SIZE);
6877 /* Copy the BLOB pointer. */
6878 memcpy(buf + unsigned(uf->orig_len)
6879 - BTR_EXTERN_FIELD_REF_SIZE,
6880 data + len - BTR_EXTERN_FIELD_REF_SIZE,
6881 BTR_EXTERN_FIELD_REF_SIZE);
6882
6883 dfield_set_data(field, buf, uf->orig_len);
6884 dfield_set_ext(field);
6885 }
6886 }
6887 }
6888
6889 return(n_pushed);
6890}
6891
6892/*******************************************************************//**
6893Returns the length of a BLOB part stored on the header page.
6894@return part length */
6895static
6896ulint
6897btr_blob_get_part_len(
6898/*==================*/
6899 const byte* blob_header) /*!< in: blob header */
6900{
6901 return(mach_read_from_4(blob_header + BTR_BLOB_HDR_PART_LEN));
6902}
6903
6904/*******************************************************************//**
6905Returns the page number where the next BLOB part is stored.
6906@return page number or FIL_NULL if no more pages */
6907static
6908ulint
6909btr_blob_get_next_page_no(
6910/*======================*/
6911 const byte* blob_header) /*!< in: blob header */
6912{
6913 return(mach_read_from_4(blob_header + BTR_BLOB_HDR_NEXT_PAGE_NO));
6914}
6915
6916/*******************************************************************//**
6917Deallocate a buffer block that was reserved for a BLOB part. */
6918static
6919void
6920btr_blob_free(
6921/*==========*/
6922 buf_block_t* block, /*!< in: buffer block */
6923 ibool all, /*!< in: TRUE=remove also the compressed page
6924 if there is one */
6925 mtr_t* mtr) /*!< in: mini-transaction to commit */
6926{
6927 buf_pool_t* buf_pool = buf_pool_from_block(block);
6928 ulint space = block->page.id.space();
6929 ulint page_no = block->page.id.page_no();
6930
6931 ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
6932
6933 mtr_commit(mtr);
6934
6935 buf_pool_mutex_enter(buf_pool);
6936
6937 /* Only free the block if it is still allocated to
6938 the same file page. */
6939
6940 if (buf_block_get_state(block)
6941 == BUF_BLOCK_FILE_PAGE
6942 && block->page.id.space() == space
6943 && block->page.id.page_no() == page_no) {
6944
6945 if (!buf_LRU_free_page(&block->page, all)
6946 && all && block->page.zip.data) {
6947 /* Attempt to deallocate the uncompressed page
6948 			if the whole block cannot be deallocated. */
6949
6950 buf_LRU_free_page(&block->page, false);
6951 }
6952 }
6953
6954 buf_pool_mutex_exit(buf_pool);
6955}
6956
6957/** Helper class used while writing blob pages, during insert or update. */
6958struct btr_blob_log_check_t {
6959 	/** Persistent cursor on a clustered index record with blobs. */
6960 	btr_pcur_t*	m_pcur;
6961 	/** Mini-transaction holding the latches for m_pcur */
6962 	mtr_t*		m_mtr;
6963 	/** rec_get_offsets(rec, index); the offsets of clust_rec */
6964 const ulint* m_offsets;
6965 /** The block containing clustered record */
6966 buf_block_t** m_block;
6967 /** The clustered record pointer */
6968 rec_t** m_rec;
6969 /** The blob operation code */
6970 enum blob_op m_op;
6971
6972 /** Constructor
6973 @param[in] pcur persistent cursor on a clustered
6974 index record with blobs.
6975 @param[in] mtr mini-transaction holding latches for
6976 pcur.
6977 @param[in] offsets offsets of the clust_rec
6978 @param[in,out] block record block containing pcur record
6979 @param[in,out] rec the clustered record pointer
6980 @param[in] op the blob operation code */
6981 btr_blob_log_check_t(
6982 btr_pcur_t* pcur,
6983 mtr_t* mtr,
6984 const ulint* offsets,
6985 buf_block_t** block,
6986 rec_t** rec,
6987 enum blob_op op)
6988 : m_pcur(pcur),
6989 m_mtr(mtr),
6990 m_offsets(offsets),
6991 m_block(block),
6992 m_rec(rec),
6993 m_op(op)
6994 {
6995 ut_ad(rec_offs_validate(*m_rec, m_pcur->index(), m_offsets));
6996 ut_ad((*m_block)->frame == page_align(*m_rec));
6997 ut_ad(*m_rec == btr_pcur_get_rec(m_pcur));
6998 }
6999
7000 	/** Commit the mini-transaction, check that there is enough free
7001 	space in the redo log, and then restart the mini-transaction. */
7002 void check()
7003 {
7004 dict_index_t* index = m_pcur->index();
7005 ulint offs = 0;
7006 ulint page_no = ULINT_UNDEFINED;
7007 FlushObserver* observer = m_mtr->get_flush_observer();
7008
7009 if (m_op == BTR_STORE_INSERT_BULK) {
7010 offs = page_offset(*m_rec);
7011 page_no = page_get_page_no(
7012 buf_block_get_frame(*m_block));
7013
7014 buf_block_buf_fix_inc(*m_block, __FILE__, __LINE__);
7015 } else {
7016 btr_pcur_store_position(m_pcur, m_mtr);
7017 }
7018 m_mtr->commit();
7019
7020 DEBUG_SYNC_C("blob_write_middle");
7021
7022 log_free_check();
7023
7024 DEBUG_SYNC_C("blob_write_middle_after_check");
7025
7026 const mtr_log_t log_mode = m_mtr->get_log_mode();
7027 m_mtr->start();
7028 m_mtr->set_log_mode(log_mode);
7029 index->set_modified(*m_mtr);
7030 m_mtr->set_flush_observer(observer);
7031
7032 if (m_op == BTR_STORE_INSERT_BULK) {
7033 mtr_x_lock(dict_index_get_lock(index), m_mtr);
7034 m_pcur->btr_cur.page_cur.block = btr_block_get(
7035 page_id_t(index->table->space->id, page_no),
7036 page_size_t(index->table->space->flags),
7037 RW_X_LATCH, index, m_mtr);
7038 m_pcur->btr_cur.page_cur.rec
7039 = m_pcur->btr_cur.page_cur.block->frame
7040 + offs;
7041
7042 buf_block_buf_fix_dec(m_pcur->btr_cur.page_cur.block);
7043 } else {
7044 ut_ad(m_pcur->rel_pos == BTR_PCUR_ON);
7045 bool ret = btr_pcur_restore_position(
7046 BTR_MODIFY_LEAF | BTR_MODIFY_EXTERNAL,
7047 m_pcur, m_mtr);
7048
7049 ut_a(ret);
7050 }
7051
7052 *m_block = btr_pcur_get_block(m_pcur);
7053 *m_rec = btr_pcur_get_rec(m_pcur);
7054
7055 rec_offs_make_valid(*m_rec, index, true,
7056 const_cast<ulint*>(m_offsets));
7057
7058 ut_ad(m_mtr->memo_contains_page_flagged(
7059 *m_rec,
7060 MTR_MEMO_PAGE_X_FIX | MTR_MEMO_PAGE_SX_FIX));
7061
7062 ut_ad(mtr_memo_contains_flagged(m_mtr,
7063 dict_index_get_lock(index),
7064 MTR_MEMO_SX_LOCK | MTR_MEMO_X_LOCK));
7065 }
7066};
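/* A minimal sketch (not compiled) of how this helper is used; the real call
site is btr_store_big_rec_extern_fields() below, and the loop shown here is a
simplification of its per-page loop: */
#if 0
	btr_blob_log_check_t	redo_log(pcur, btr_mtr, offsets,
					 &rec_block, &rec, op);

	for (;;) {	/* for each BLOB page still to be written */
		/* Commit the mini-transaction, wait for redo log space
		if needed, then restart it and reposition the cursor. */
		redo_log.check();

		/* ... allocate and write one BLOB page in a separate
		mini-transaction ... */
	}
#endif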
7067
7068/*******************************************************************//**
7069Stores the fields in big_rec_vec to the tablespace and puts pointers to
7070them in rec. The extern flags in rec will have to be set beforehand.
7071 The fields are stored on pages allocated from the leaf node
7072 file segment of the index tree.
7073
7074 TODO: If the allocation extends the tablespace, it will not be redo-logged in
7075 any mini-transaction. Tablespace extension should be redo-logged, so that
7076recovery will not fail when the big_rec was written to the extended portion of
7077the file, in case the file was somehow truncated in the crash.
7078
7079@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
7080dberr_t
7081btr_store_big_rec_extern_fields(
7082/*============================*/
7083 	btr_pcur_t*	pcur,		/*!< in/out: a persistent cursor. If
7084 btr_mtr is restarted, then this can
7085 be repositioned. */
7086 ulint* offsets, /*!< in/out: rec_get_offsets() on
7087 pcur. the "external storage" flags
7088 in offsets will correctly correspond
7089 to rec when this function returns */
7090 const big_rec_t*big_rec_vec, /*!< in: vector containing fields
7091 to be stored externally */
7092 mtr_t* btr_mtr, /*!< in/out: mtr containing the
7093 latches to the clustered index. can be
7094 committed and restarted. */
7095	enum blob_op	op)	/*!< in: operation code */
7096{
7097 ulint rec_page_no;
7098 byte* field_ref;
7099 ulint extern_len;
7100 ulint store_len;
7101 ulint page_no;
7102 ulint space_id;
7103 ulint prev_page_no;
7104 ulint hint_page_no;
7105 ulint i;
7106 mtr_t mtr;
7107 mtr_t mtr_bulk;
7108 mem_heap_t* heap = NULL;
7109 page_zip_des_t* page_zip;
7110 z_stream c_stream;
7111 dberr_t error = DB_SUCCESS;
7112 dict_index_t* index = pcur->index();
7113 buf_block_t* rec_block = btr_pcur_get_block(pcur);
7114 rec_t* rec = btr_pcur_get_rec(pcur);
7115
7116 ut_ad(rec_offs_validate(rec, index, offsets));
7117 ut_ad(rec_offs_any_extern(offsets));
7118 ut_ad(mtr_memo_contains_flagged(btr_mtr, dict_index_get_lock(index),
7119 MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK));
7120 ut_ad(mtr_memo_contains(btr_mtr, rec_block, MTR_MEMO_PAGE_X_FIX));
7121 ut_ad(buf_block_get_frame(rec_block) == page_align(rec));
7122 ut_a(dict_index_is_clust(index));
7123
7124 ut_a(dict_table_page_size(index->table)
7125 .equals_to(rec_block->page.size));
7126
7127 btr_blob_log_check_t redo_log(pcur, btr_mtr, offsets, &rec_block,
7128 &rec, op);
7129 page_zip = buf_block_get_page_zip(rec_block);
7130 space_id = rec_block->page.id.space();
7131 rec_page_no = rec_block->page.id.page_no();
7132 ut_a(fil_page_index_page_check(page_align(rec))
7133 || op == BTR_STORE_INSERT_BULK);
7134
7135 if (page_zip) {
7136 int err;
7137
7138 /* Zlib deflate needs 128 kilobytes for the default
7139 window size, plus 512 << memLevel, plus a few
7140 kilobytes for small objects. We use reduced memLevel
7141 to limit the memory consumption, and preallocate the
7142 heap, hoping to avoid memory fragmentation. */
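		/* With the deflateInit2() parameters used below
		(windowBits = 15, memLevel = 7), zlib needs roughly
		(1 << (15 + 2)) + (1 << (7 + 9)) = 128 KiB + 64 KiB of
		work memory, so 250000 bytes leaves some slack for the
		smaller allocations. */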
7143 heap = mem_heap_create(250000);
7144 page_zip_set_alloc(&c_stream, heap);
7145
7146 err = deflateInit2(&c_stream, int(page_zip_level),
7147 Z_DEFLATED, 15, 7, Z_DEFAULT_STRATEGY);
7148 ut_a(err == Z_OK);
7149 }
7150
7151#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
7152 /* All pointers to externally stored columns in the record
7153 must either be zero or they must be pointers to inherited
7154 columns, owned by this record or an earlier record version. */
7155 for (i = 0; i < big_rec_vec->n_fields; i++) {
7156 field_ref = btr_rec_get_field_ref(
7157 rec, offsets, big_rec_vec->fields[i].field_no);
7158
7159 ut_a(!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG));
7160 /* Either this must be an update in place,
7161 or the BLOB must be inherited, or the BLOB pointer
7162 must be zero (will be written in this function). */
7163 ut_a(op == BTR_STORE_UPDATE
7164 || (field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_INHERITED_FLAG)
7165 || !memcmp(field_ref, field_ref_zero,
7166 BTR_EXTERN_FIELD_REF_SIZE));
7167 }
7168#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
7169
7170 const page_size_t page_size(dict_table_page_size(index->table));
7171
7172 /* Space available in compressed page to carry blob data */
7173 const ulint payload_size_zip = page_size.physical()
7174 - FIL_PAGE_DATA;
7175
7176 /* Space available in uncompressed page to carry blob data */
7177 const ulint payload_size = page_size.physical()
7178 - FIL_PAGE_DATA - BTR_BLOB_HDR_SIZE - FIL_PAGE_DATA_END;
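	/* An uncompressed BLOB page holds the FIL_PAGE_DATA header, a
	BTR_BLOB_HDR_SIZE part header (the length of the part stored on
	this page and the next BLOB page number), the payload, and the
	FIL_PAGE_DATA_END trailer. On a ROW_FORMAT=COMPRESSED BLOB page
	the zlib stream starts right after the FIL_PAGE_DATA header and
	may extend to the end of the page. */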
7179
7180	/* For each field, store the data on pages allocated from the leaf
7181	file segment of the index tree and put a pointer to it in rec */
7182
7183 for (i = 0; i < big_rec_vec->n_fields; i++) {
7184 const ulint field_no = big_rec_vec->fields[i].field_no;
7185
7186 field_ref = btr_rec_get_field_ref(rec, offsets, field_no);
7187#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
7188 /* A zero BLOB pointer should have been initially inserted. */
7189 ut_a(!memcmp(field_ref, field_ref_zero,
7190 BTR_EXTERN_FIELD_REF_SIZE));
7191#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
7192 extern_len = big_rec_vec->fields[i].len;
7193 UNIV_MEM_ASSERT_RW(big_rec_vec->fields[i].data,
7194 extern_len);
7195
7196 ut_a(extern_len > 0);
7197
7198 prev_page_no = FIL_NULL;
7199
7200 if (page_zip) {
7201 int err = deflateReset(&c_stream);
7202 ut_a(err == Z_OK);
7203
7204 c_stream.next_in = (Bytef*)
7205 big_rec_vec->fields[i].data;
7206 c_stream.avail_in = static_cast<uInt>(extern_len);
7207 }
7208
7209 for (ulint blob_npages = 0;; ++blob_npages) {
7210 buf_block_t* block;
7211 page_t* page;
7212 const ulint commit_freq = 4;
7213 ulint r_extents;
7214
7215 ut_ad(page_align(field_ref) == page_align(rec));
7216
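			/* After every commit_freq BLOB pages, commit the
			mini-transaction, make sure that there is enough
			space in the redo log, and restart the
			mini-transaction (see btr_blob_log_check_t::check()
			above). The record may have moved, so re-read the
			field reference afterwards. */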
7217 if (!(blob_npages % commit_freq)) {
7218
7219 redo_log.check();
7220
7221 field_ref = btr_rec_get_field_ref(
7222 rec, offsets, field_no);
7223
7224 page_zip = buf_block_get_page_zip(rec_block);
7225 rec_page_no = rec_block->page.id.page_no();
7226 }
7227
7228 mtr.start();
7229 index->set_modified(mtr);
7230 mtr.set_log_mode(btr_mtr->get_log_mode());
7231 mtr.set_flush_observer(btr_mtr->get_flush_observer());
7232
7233 buf_page_get(rec_block->page.id,
7234 rec_block->page.size, RW_X_LATCH, &mtr);
7235
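			/* Allocate the first BLOB page right after the page
			of the clustered index record, and every subsequent
			BLOB page right after the previous one. The page
			number is only a hint to btr_page_alloc(). */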
7236 if (prev_page_no == FIL_NULL) {
7237 hint_page_no = 1 + rec_page_no;
7238 } else {
7239 hint_page_no = prev_page_no + 1;
7240 }
7241
7242 mtr_t *alloc_mtr;
7243
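			/* In bulk-insert mode, perform the page allocation
			in a separate mini-transaction (mtr_bulk) that is
			committed as soon as the allocation is done, before
			the BLOB data is written in mtr. */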
7244 if (op == BTR_STORE_INSERT_BULK) {
7245 mtr_bulk.start();
7246 mtr_bulk.set_spaces(mtr);
7247 alloc_mtr = &mtr_bulk;
7248 } else {
7249 alloc_mtr = &mtr;
7250 }
7251
7252 if (!fsp_reserve_free_extents(&r_extents,
7253 index->table->space, 1,
7254 FSP_BLOB, alloc_mtr,
7255 1)) {
7256
7257 alloc_mtr->commit();
7258 error = DB_OUT_OF_FILE_SPACE;
7259 goto func_exit;
7260 }
7261
7262 block = btr_page_alloc(index, hint_page_no, FSP_NO_DIR,
7263 0, alloc_mtr, &mtr);
7264
7265 index->table->space->release_free_extents(r_extents);
7266
7267 if (op == BTR_STORE_INSERT_BULK) {
7268 mtr_bulk.commit();
7269 }
7270
7271 ut_a(block != NULL);
7272
7273 page_no = block->page.id.page_no();
7274 page = buf_block_get_frame(block);
7275
7276 if (prev_page_no != FIL_NULL) {
7277 buf_block_t* prev_block;
7278 page_t* prev_page;
7279
7280 prev_block = buf_page_get(
7281 page_id_t(space_id, prev_page_no),
7282 rec_block->page.size,
7283 RW_X_LATCH, &mtr);
7284
7285 buf_block_dbg_add_level(prev_block,
7286 SYNC_EXTERN_STORAGE);
7287 prev_page = buf_block_get_frame(prev_block);
7288
7289 if (page_zip) {
7290 mlog_write_ulint(
7291 prev_page + FIL_PAGE_NEXT,
7292 page_no, MLOG_4BYTES, &mtr);
7293 memcpy(buf_block_get_page_zip(
7294 prev_block)
7295 ->data + FIL_PAGE_NEXT,
7296 prev_page + FIL_PAGE_NEXT, 4);
7297 } else {
7298 mlog_write_ulint(
7299 prev_page + FIL_PAGE_DATA
7300 + BTR_BLOB_HDR_NEXT_PAGE_NO,
7301 page_no, MLOG_4BYTES, &mtr);
7302 }
7303
7304 } else if (dict_index_is_online_ddl(index)) {
7305 row_log_table_blob_alloc(index, page_no);
7306 }
7307
7308 if (page_zip) {
7309 int err;
7310 page_zip_des_t* blob_page_zip;
7311
7312 /* Write FIL_PAGE_TYPE to the redo log
7313 separately, before logging any other
7314 changes to the page, so that the debug
7315 assertions in
7316 recv_parse_or_apply_log_rec_body() can
7317 be made simpler. Before InnoDB Plugin
7318 1.0.4, the initialization of
7319 FIL_PAGE_TYPE was logged as part of
7320 the mlog_log_string() below. */
7321
7322 mlog_write_ulint(page + FIL_PAGE_TYPE,
7323 prev_page_no == FIL_NULL
7324 ? FIL_PAGE_TYPE_ZBLOB
7325 : FIL_PAGE_TYPE_ZBLOB2,
7326 MLOG_2BYTES, &mtr);
7327
7328 c_stream.next_out = page
7329 + FIL_PAGE_DATA;
7330 c_stream.avail_out = static_cast<uInt>(
7331 payload_size_zip);
7332
7333 err = deflate(&c_stream, Z_FINISH);
7334 ut_a(err == Z_OK || err == Z_STREAM_END);
7335 ut_a(err == Z_STREAM_END
7336 || c_stream.avail_out == 0);
7337
7338 /* Write the "next BLOB page" pointer */
7339 mlog_write_ulint(page + FIL_PAGE_NEXT,
7340 FIL_NULL, MLOG_4BYTES, &mtr);
7341 /* Initialize the unused "prev page" pointer */
7342 mlog_write_ulint(page + FIL_PAGE_PREV,
7343 FIL_NULL, MLOG_4BYTES, &mtr);
7344 /* Write a back pointer to the record
7345 into the otherwise unused area. This
7346 information could be useful in
7347 debugging. Later, we might want to
7348 implement the possibility to relocate
7349 BLOB pages. Then, we would need to be
7350 able to adjust the BLOB pointer in the
7351 record. We do not store the heap
7352 number of the record, because it can
7353 change in page_zip_reorganize() or
7354 btr_page_reorganize(). However, also
7355 the page number of the record may
7356 change when B-tree nodes are split or
7357 merged.
7358				NOTE: the FIL_PAGE_FILE_FLUSH_LSN field is
7359				used by the R-tree index for the Split
7360				Sequence Number. */
7361 ut_ad(!dict_index_is_spatial(index));
7362
7363 mlog_write_ulint(page
7364 + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION,
7365 space_id,
7366 MLOG_4BYTES, &mtr);
7367 mlog_write_ulint(page
7368 + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION + 4,
7369 rec_page_no,
7370 MLOG_4BYTES, &mtr);
7371
7372 /* Zero out the unused part of the page. */
7373 memset(page + page_zip_get_size(page_zip)
7374 - c_stream.avail_out,
7375 0, c_stream.avail_out);
7376 mlog_log_string(page
7377 + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION,
7378 page_zip_get_size(page_zip)
7379 - FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION,
7380 &mtr);
7381 /* Copy the page to compressed storage,
7382 because it will be flushed to disk
7383 from there. */
7384 blob_page_zip = buf_block_get_page_zip(block);
7385 ut_ad(blob_page_zip);
7386 ut_ad(page_zip_get_size(blob_page_zip)
7387 == page_zip_get_size(page_zip));
7388 memcpy(blob_page_zip->data, page,
7389 page_zip_get_size(page_zip));
7390
7391 if (err == Z_OK && prev_page_no != FIL_NULL) {
7392
7393 goto next_zip_page;
7394 }
7395
7396 if (err == Z_STREAM_END) {
7397 mach_write_to_4(field_ref
7398 + BTR_EXTERN_LEN, 0);
7399 mach_write_to_4(field_ref
7400 + BTR_EXTERN_LEN + 4,
7401 c_stream.total_in);
7402 } else {
7403 memset(field_ref + BTR_EXTERN_LEN,
7404 0, 8);
7405 }
7406
7407 if (prev_page_no == FIL_NULL) {
7408 ut_ad(blob_npages == 0);
7409 mach_write_to_4(field_ref
7410 + BTR_EXTERN_SPACE_ID,
7411 space_id);
7412
7413 mach_write_to_4(field_ref
7414 + BTR_EXTERN_PAGE_NO,
7415 page_no);
7416
7417 mach_write_to_4(field_ref
7418 + BTR_EXTERN_OFFSET,
7419 FIL_PAGE_NEXT);
7420 }
7421
7422				/* We compress the page only when the bulk insert finishes. */
7423 if (op != BTR_STORE_INSERT_BULK) {
7424 page_zip_write_blob_ptr(
7425 page_zip, rec, index, offsets,
7426 field_no, &mtr);
7427 }
7428
7429next_zip_page:
7430 prev_page_no = page_no;
7431
7432 /* Commit mtr and release the
7433 uncompressed page frame to save memory. */
7434 btr_blob_free(block, FALSE, &mtr);
7435
7436 if (err == Z_STREAM_END) {
7437 break;
7438 }
7439 } else {
7440 mlog_write_ulint(page + FIL_PAGE_TYPE,
7441 FIL_PAGE_TYPE_BLOB,
7442 MLOG_2BYTES, &mtr);
7443
7444 if (extern_len > payload_size) {
7445 store_len = payload_size;
7446 } else {
7447 store_len = extern_len;
7448 }
7449
7450 mlog_write_string(page + FIL_PAGE_DATA
7451 + BTR_BLOB_HDR_SIZE,
7452 (const byte*)
7453 big_rec_vec->fields[i].data
7454 + big_rec_vec->fields[i].len
7455 - extern_len,
7456 store_len, &mtr);
7457 mlog_write_ulint(page + FIL_PAGE_DATA
7458 + BTR_BLOB_HDR_PART_LEN,
7459 store_len, MLOG_4BYTES, &mtr);
7460 mlog_write_ulint(page + FIL_PAGE_DATA
7461 + BTR_BLOB_HDR_NEXT_PAGE_NO,
7462 FIL_NULL, MLOG_4BYTES, &mtr);
7463
7464 extern_len -= store_len;
7465
7466 mlog_write_ulint(field_ref + BTR_EXTERN_LEN, 0,
7467 MLOG_4BYTES, &mtr);
7468 mlog_write_ulint(field_ref
7469 + BTR_EXTERN_LEN + 4,
7470 big_rec_vec->fields[i].len
7471 - extern_len,
7472 MLOG_4BYTES, &mtr);
7473
7474 if (prev_page_no == FIL_NULL) {
7475 ut_ad(blob_npages == 0);
7476 mlog_write_ulint(field_ref
7477 + BTR_EXTERN_SPACE_ID,
7478 space_id, MLOG_4BYTES,
7479 &mtr);
7480
7481 mlog_write_ulint(field_ref
7482 + BTR_EXTERN_PAGE_NO,
7483 page_no, MLOG_4BYTES,
7484 &mtr);
7485
7486 mlog_write_ulint(field_ref
7487 + BTR_EXTERN_OFFSET,
7488 FIL_PAGE_DATA,
7489 MLOG_4BYTES,
7490 &mtr);
7491 }
7492
7493 prev_page_no = page_no;
7494
7495 mtr.commit();
7496
7497 if (extern_len == 0) {
7498 break;
7499 }
7500 }
7501 }
7502
7503 DBUG_EXECUTE_IF("btr_store_big_rec_extern",
7504 error = DB_OUT_OF_FILE_SPACE;
7505 goto func_exit;);
7506
7507 rec_offs_make_nth_extern(offsets, field_no);
7508 }
7509
7510func_exit:
7511 if (page_zip) {
7512 deflateEnd(&c_stream);
7513 }
7514
7515 if (heap != NULL) {
7516 mem_heap_free(heap);
7517 }
7518
7519#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
7520 /* All pointers to externally stored columns in the record
7521 must be valid. */
7522 for (i = 0; i < rec_offs_n_fields(offsets); i++) {
7523 if (!rec_offs_nth_extern(offsets, i)) {
7524 continue;
7525 }
7526
7527 field_ref = btr_rec_get_field_ref(rec, offsets, i);
7528
7529 /* The pointer must not be zero if the operation
7530 succeeded. */
7531 ut_a(0 != memcmp(field_ref, field_ref_zero,
7532 BTR_EXTERN_FIELD_REF_SIZE)
7533 || error != DB_SUCCESS);
7534 /* The column must not be disowned by this record. */
7535 ut_a(!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG));
7536 }
7537#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
7538 return(error);
7539}
7540
7541/*******************************************************************//**
7542Check the FIL_PAGE_TYPE on an uncompressed BLOB page. */
7543static
7544void
7545btr_check_blob_fil_page_type(
7546/*=========================*/
7547 ulint space_id, /*!< in: space id */
7548 ulint page_no, /*!< in: page number */
7549 const page_t* page, /*!< in: page */
7550 ibool read) /*!< in: TRUE=read, FALSE=purge */
7551{
7552 ulint type = fil_page_get_type(page);
7553
7554 ut_a(space_id == page_get_space_id(page));
7555 ut_a(page_no == page_get_page_no(page));
7556
7557 if (UNIV_UNLIKELY(type != FIL_PAGE_TYPE_BLOB)) {
7558 ulint flags = fil_space_get_flags(space_id);
7559
7560#ifndef UNIV_DEBUG /* Improve debug test coverage */
7561 if (!DICT_TF_HAS_ATOMIC_BLOBS(flags)) {
7562 /* Old versions of InnoDB did not initialize
7563 FIL_PAGE_TYPE on BLOB pages. Do not print
7564 anything about the type mismatch when reading
7565 a BLOB page that may be from old versions. */
7566 return;
7567 }
7568#endif /* !UNIV_DEBUG */
7569
7570 ib::fatal() << "FIL_PAGE_TYPE=" << type
7571 << " on BLOB " << (read ? "read" : "purge")
7572 << " space " << space_id << " page " << page_no
7573 << " flags " << flags;
7574 }
7575}
7576
7577/*******************************************************************//**
7578Frees the space occupied by an externally stored field, returning it to
7579the file space management, if the externally stored data is owned by this
7580record's field reference. During rollback there is the additional condition
7581that the field must not be inherited. */
7582void
7583btr_free_externally_stored_field(
7584/*=============================*/
7585 dict_index_t* index, /*!< in: index of the data, the index
7586 tree MUST be X-latched; if the tree
7587 height is 1, then also the root page
7588 must be X-latched! (this is relevant
7589 in the case this function is called
7590 from purge where 'data' is located on
7591 an undo log page, not an index
7592 page) */
7593 byte* field_ref, /*!< in/out: field reference */
7594 const rec_t* rec, /*!< in: record containing field_ref, for
7595 page_zip_write_blob_ptr(), or NULL */
7596 const ulint* offsets, /*!< in: rec_get_offsets(rec, index),
7597 or NULL */
7598 page_zip_des_t* page_zip, /*!< in: compressed page corresponding
7599 to rec, or NULL if rec == NULL */
7600 ulint i, /*!< in: field number of field_ref;
7601 ignored if rec == NULL */
7602 bool rollback, /*!< in: performing rollback? */
7603 mtr_t* local_mtr) /*!< in: mtr
7604					containing the latch to data and an
7605 X-latch to the index tree */
7606{
7607 page_t* page;
7608 const ulint space_id = mach_read_from_4(
7609 field_ref + BTR_EXTERN_SPACE_ID);
7610 const ulint start_page = mach_read_from_4(
7611 field_ref + BTR_EXTERN_PAGE_NO);
7612 ulint page_no;
7613 ulint next_page_no;
7614 mtr_t mtr;
7615
7616 ut_ad(index->is_primary());
7617 ut_ad(mtr_memo_contains_flagged(local_mtr, dict_index_get_lock(index),
7618 MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK));
7619 ut_ad(mtr_memo_contains_page(local_mtr, field_ref,
7620 MTR_MEMO_PAGE_X_FIX));
7621 ut_ad(!rec || rec_offs_validate(rec, index, offsets));
7622 ut_ad(!rec || field_ref == btr_rec_get_field_ref(rec, offsets, i));
7623 ut_ad(local_mtr->is_named_space(
7624 page_get_space_id(page_align(field_ref))));
7625
7626 if (UNIV_UNLIKELY(!memcmp(field_ref, field_ref_zero,
7627 BTR_EXTERN_FIELD_REF_SIZE))) {
7628 /* In the rollback, we may encounter a clustered index
7629 record with some unwritten off-page columns. There is
7630 nothing to free then. */
7631 ut_a(rollback);
7632 return;
7633 }
7634
7635 ut_ad(!(mach_read_from_4(field_ref + BTR_EXTERN_LEN)
7636 & ~((BTR_EXTERN_OWNER_FLAG
7637 | BTR_EXTERN_INHERITED_FLAG) << 24)));
7638 ut_ad(space_id == index->table->space->id);
7639
7640 const page_size_t ext_page_size(dict_table_page_size(index->table));
7641 const page_size_t& rec_page_size(rec == NULL
7642 ? univ_page_size
7643 : ext_page_size);
7644 if (rec == NULL) {
7645 /* This is a call from row_purge_upd_exist_or_extern(). */
7646 ut_ad(!page_zip);
7647 }
7648
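	/* Free the BLOB pages one at a time, starting from the first one.
	After a page has been freed, the BLOB pointer in the record is
	updated to point to the next page and the mini-transaction is
	committed, so that the rest of the chain remains reachable even if
	this loop is interrupted by a server crash. */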
7649 for (;;) {
7650#ifdef UNIV_DEBUG
7651 buf_block_t* rec_block;
7652#endif /* UNIV_DEBUG */
7653 buf_block_t* ext_block;
7654
7655 mtr_start(&mtr);
7656 mtr.set_spaces(*local_mtr);
7657 mtr.set_log_mode(local_mtr->get_log_mode());
7658
7659 ut_ad(!index->table->is_temporary()
7660 || local_mtr->get_log_mode() == MTR_LOG_NO_REDO);
7661
7662 const page_t* p = page_align(field_ref);
7663
7664 const page_id_t page_id(page_get_space_id(p),
7665 page_get_page_no(p));
7666
7667#ifdef UNIV_DEBUG
7668 rec_block =
7669#endif /* UNIV_DEBUG */
7670 buf_page_get(page_id, rec_page_size, RW_X_LATCH, &mtr);
7671
7672 buf_block_dbg_add_level(rec_block, SYNC_NO_ORDER_CHECK);
7673 page_no = mach_read_from_4(field_ref + BTR_EXTERN_PAGE_NO);
7674
7675 if (/* There is no external storage data */
7676 page_no == FIL_NULL
7677 /* This field does not own the externally stored field */
7678 || (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
7679 & BTR_EXTERN_OWNER_FLAG)
7680 /* Rollback and inherited field */
7681 || (rollback
7682 && (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
7683 & BTR_EXTERN_INHERITED_FLAG))) {
7684
7685 /* Do not free */
7686 mtr_commit(&mtr);
7687
7688 return;
7689 }
7690
7691 if (page_no == start_page && dict_index_is_online_ddl(index)) {
7692 row_log_table_blob_free(index, start_page);
7693 }
7694
7695 ext_block = buf_page_get(
7696 page_id_t(space_id, page_no), ext_page_size,
7697 RW_X_LATCH, &mtr);
7698
7699 buf_block_dbg_add_level(ext_block, SYNC_EXTERN_STORAGE);
7700 page = buf_block_get_frame(ext_block);
7701
7702 if (ext_page_size.is_compressed()) {
7703 /* Note that page_zip will be NULL
7704 in row_purge_upd_exist_or_extern(). */
7705 switch (fil_page_get_type(page)) {
7706 case FIL_PAGE_TYPE_ZBLOB:
7707 case FIL_PAGE_TYPE_ZBLOB2:
7708 break;
7709 default:
7710 ut_error;
7711 }
7712 next_page_no = mach_read_from_4(page + FIL_PAGE_NEXT);
7713
7714 btr_page_free_low(index, ext_block, 0,
7715 true, &mtr);
7716
7717 if (page_zip != NULL) {
7718 mach_write_to_4(field_ref + BTR_EXTERN_PAGE_NO,
7719 next_page_no);
7720 mach_write_to_4(field_ref + BTR_EXTERN_LEN + 4,
7721 0);
7722 page_zip_write_blob_ptr(page_zip, rec, index,
7723 offsets, i, &mtr);
7724 } else {
7725 mlog_write_ulint(field_ref
7726 + BTR_EXTERN_PAGE_NO,
7727 next_page_no,
7728 MLOG_4BYTES, &mtr);
7729 mlog_write_ulint(field_ref
7730 + BTR_EXTERN_LEN + 4, 0,
7731 MLOG_4BYTES, &mtr);
7732 }
7733 } else {
7734 ut_a(!page_zip);
7735 btr_check_blob_fil_page_type(space_id, page_no, page,
7736 FALSE);
7737
7738 next_page_no = mach_read_from_4(
7739 page + FIL_PAGE_DATA
7740 + BTR_BLOB_HDR_NEXT_PAGE_NO);
7741
7742 /* We must supply the page level (= 0) as an argument
7743 because we did not store it on the page (we save the
7744			space overhead of an index page header). */
7745 btr_page_free_low(index, ext_block, 0,
7746 true, &mtr);
7747
7748 mlog_write_ulint(field_ref + BTR_EXTERN_PAGE_NO,
7749 next_page_no,
7750 MLOG_4BYTES, &mtr);
7751 /* Zero out the BLOB length. If the server
7752 crashes during the execution of this function,
7753 trx_rollback_all_recovered() could
7754 dereference the half-deleted BLOB, fetching a
7755 wrong prefix for the BLOB. */
7756 mlog_write_ulint(field_ref + BTR_EXTERN_LEN + 4,
7757 0,
7758 MLOG_4BYTES, &mtr);
7759 }
7760
7761 /* Commit mtr and release the BLOB block to save memory. */
7762 btr_blob_free(ext_block, TRUE, &mtr);
7763 }
7764}
7765
7766/***********************************************************//**
7767Frees the externally stored fields for a record. */
7768static
7769void
7770btr_rec_free_externally_stored_fields(
7771/*==================================*/
7772 dict_index_t* index, /*!< in: index of the data, the index
7773 tree MUST be X-latched */
7774 rec_t* rec, /*!< in/out: record */
7775 const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
7776 page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed
7777 part will be updated, or NULL */
7778 bool rollback,/*!< in: performing rollback? */
7779 mtr_t* mtr) /*!< in: mini-transaction handle which contains
7780 an X-latch to record page and to the index
7781 tree */
7782{
7783 ulint n_fields;
7784 ulint i;
7785
7786 ut_ad(rec_offs_validate(rec, index, offsets));
7787 ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX));
7788 ut_ad(index->is_primary());
7789 ut_ad(page_rec_is_leaf(rec));
7790 /* Free possible externally stored fields in the record */
7791
7792 ut_ad(dict_table_is_comp(index->table) == !!rec_offs_comp(offsets));
7793 n_fields = rec_offs_n_fields(offsets);
7794
7795 for (i = 0; i < n_fields; i++) {
7796 if (rec_offs_nth_extern(offsets, i)) {
7797 btr_free_externally_stored_field(
7798 index, btr_rec_get_field_ref(rec, offsets, i),
7799 rec, offsets, page_zip, i, rollback, mtr);
7800 }
7801 }
7802}
7803
7804/***********************************************************//**
7805Frees the externally stored fields for a record, if the field is mentioned
7806in the update vector. */
7807static
7808void
7809btr_rec_free_updated_extern_fields(
7810/*===============================*/
7811 dict_index_t* index, /*!< in: index of rec; the index tree MUST be
7812 X-latched */
7813 rec_t* rec, /*!< in/out: record */
7814 page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed
7815 part will be updated, or NULL */
7816 const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
7817 const upd_t* update, /*!< in: update vector */
7818 bool rollback,/*!< in: performing rollback? */
7819 mtr_t* mtr) /*!< in: mini-transaction handle which contains
7820 an X-latch to record page and to the tree */
7821{
7822 ulint n_fields;
7823 ulint i;
7824
7825 ut_ad(rec_offs_validate(rec, index, offsets));
7826 ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX));
7827
7828 /* Free possible externally stored fields in the record */
7829
7830 n_fields = upd_get_n_fields(update);
7831
7832 for (i = 0; i < n_fields; i++) {
7833 const upd_field_t* ufield = upd_get_nth_field(update, i);
7834
7835 if (rec_offs_nth_extern(offsets, ufield->field_no)) {
7836 ulint len;
7837 byte* data = rec_get_nth_field(
7838 rec, offsets, ufield->field_no, &len);
7839 ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
7840
7841 btr_free_externally_stored_field(
7842 index, data + len - BTR_EXTERN_FIELD_REF_SIZE,
7843 rec, offsets, page_zip,
7844 ufield->field_no, rollback, mtr);
7845 }
7846 }
7847}
7848
7849/*******************************************************************//**
7850Copies the prefix of an uncompressed BLOB. The clustered index record
7851that points to this BLOB must be protected by a lock or a page latch.
7852@return number of bytes written to buf */
7853static
7854ulint
7855btr_copy_blob_prefix(
7856/*=================*/
7857 byte* buf, /*!< out: the externally stored part of
7858 the field, or a prefix of it */
7859 ulint len, /*!< in: length of buf, in bytes */
7860 ulint space_id,/*!< in: space id of the BLOB pages */
7861 ulint page_no,/*!< in: page number of the first BLOB page */
7862 ulint offset) /*!< in: offset on the first BLOB page */
7863{
7864 ulint copied_len = 0;
7865
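	/* Follow the singly-linked chain of BLOB pages, copying at most
	part_len bytes from each page, until the output buffer is full or
	the last page (next page number == FIL_NULL) has been copied. */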
7866 for (;;) {
7867 mtr_t mtr;
7868 buf_block_t* block;
7869 const page_t* page;
7870 const byte* blob_header;
7871 ulint part_len;
7872 ulint copy_len;
7873
7874 mtr_start(&mtr);
7875
7876 block = buf_page_get(page_id_t(space_id, page_no),
7877 univ_page_size, RW_S_LATCH, &mtr);
7878 buf_block_dbg_add_level(block, SYNC_EXTERN_STORAGE);
7879 page = buf_block_get_frame(block);
7880
7881 btr_check_blob_fil_page_type(space_id, page_no, page, TRUE);
7882
7883 blob_header = page + offset;
7884 part_len = btr_blob_get_part_len(blob_header);
7885 copy_len = ut_min(part_len, len - copied_len);
7886
7887 memcpy(buf + copied_len,
7888 blob_header + BTR_BLOB_HDR_SIZE, copy_len);
7889 copied_len += copy_len;
7890
7891 page_no = btr_blob_get_next_page_no(blob_header);
7892
7893 mtr_commit(&mtr);
7894
7895 if (page_no == FIL_NULL || copy_len != part_len) {
7896 UNIV_MEM_ASSERT_RW(buf, copied_len);
7897 return(copied_len);
7898 }
7899
7900		/* On all BLOB pages except the first, the BLOB header
7901		is at the start of the page data area (FIL_PAGE_DATA): */
7902
7903 offset = FIL_PAGE_DATA;
7904
7905 ut_ad(copied_len <= len);
7906 }
7907}
7908
7909/** Copies the prefix of a compressed BLOB.
7910The clustered index record that points to this BLOB must be protected
7911by a lock or a page latch.
7912@param[out] buf the externally stored part of the field,
7913or a prefix of it
7914@param[in] len length of buf, in bytes
7915@param[in] page_size compressed BLOB page size
7916@param[in]	space_id	space id of the BLOB pages
7917@param[in]	page_no		page number of the first BLOB page
@param[in]	offset		offset on the first BLOB page
7918@return number of bytes written to buf */
7919static
7920ulint
7921btr_copy_zblob_prefix(
7922 byte* buf,
7923 ulint len,
7924 const page_size_t& page_size,
7925 ulint space_id,
7926 ulint page_no,
7927 ulint offset)
7928{
7929 ulint page_type = FIL_PAGE_TYPE_ZBLOB;
7930 mem_heap_t* heap;
7931 int err;
7932 z_stream d_stream;
7933
7934 d_stream.next_out = buf;
7935 d_stream.avail_out = static_cast<uInt>(len);
7936 d_stream.next_in = Z_NULL;
7937 d_stream.avail_in = 0;
7938
7939 /* Zlib inflate needs 32 kilobytes for the default
7940 window size, plus a few kilobytes for small objects. */
7941 heap = mem_heap_create(40000);
7942 page_zip_set_alloc(&d_stream, heap);
7943
7944 ut_ad(page_size.is_compressed());
7945 ut_ad(space_id);
7946
7947 err = inflateInit(&d_stream);
7948 ut_a(err == Z_OK);
7949
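	/* The compressed BLOB is one zlib stream split across a chain of
	pages. Feed it to inflate() one page at a time, following the
	next-page pointers, until the stream ends or the output buffer is
	full. */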
7950 for (;;) {
7951 buf_page_t* bpage;
7952 ulint next_page_no;
7953
7954 /* There is no latch on bpage directly. Instead,
7955 bpage is protected by the B-tree page latch that
7956 is being held on the clustered index record, or,
7957 in row_merge_copy_blobs(), by an exclusive table lock. */
7958 bpage = buf_page_get_zip(page_id_t(space_id, page_no),
7959 page_size);
7960
7961 if (UNIV_UNLIKELY(!bpage)) {
7962 ib::error() << "Cannot load compressed BLOB "
7963 << page_id_t(space_id, page_no);
7964 goto func_exit;
7965 }
7966
7967 if (UNIV_UNLIKELY
7968 (fil_page_get_type(bpage->zip.data) != page_type)) {
7969
7970 ib::error() << "Unexpected type "
7971 << fil_page_get_type(bpage->zip.data)
7972 << " of compressed BLOB page "
7973 << page_id_t(space_id, page_no);
7974
7975 ut_ad(0);
7976 goto end_of_blob;
7977 }
7978
7979 next_page_no = mach_read_from_4(bpage->zip.data + offset);
7980
7981 if (UNIV_LIKELY(offset == FIL_PAGE_NEXT)) {
7982 /* When the BLOB begins at page header,
7983 the compressed data payload does not
7984 immediately follow the next page pointer. */
7985 offset = FIL_PAGE_DATA;
7986 } else {
7987 offset += 4;
7988 }
7989
7990 d_stream.next_in = bpage->zip.data + offset;
7991 d_stream.avail_in = static_cast<uInt>(page_size.physical()
7992 - offset);
7993
7994 err = inflate(&d_stream, Z_NO_FLUSH);
7995 switch (err) {
7996 case Z_OK:
7997 if (!d_stream.avail_out) {
7998 goto end_of_blob;
7999 }
8000 break;
8001 case Z_STREAM_END:
8002 if (next_page_no == FIL_NULL) {
8003 goto end_of_blob;
8004 }
8005 /* fall through */
8006 default:
8007inflate_error:
8008 ib::error() << "inflate() of compressed BLOB page "
8009 << page_id_t(space_id, page_no)
8010 << " returned " << err
8011 << " (" << d_stream.msg << ")";
8012
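			/* fall through */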
8013 case Z_BUF_ERROR:
8014 goto end_of_blob;
8015 }
8016
8017 if (next_page_no == FIL_NULL) {
8018 if (!d_stream.avail_in) {
8019 ib::error()
8020 << "Unexpected end of compressed "
8021 << "BLOB page "
8022 << page_id_t(space_id, page_no);
8023 } else {
8024 err = inflate(&d_stream, Z_FINISH);
8025 switch (err) {
8026 case Z_STREAM_END:
8027 case Z_BUF_ERROR:
8028 break;
8029 default:
8030 goto inflate_error;
8031 }
8032 }
8033
8034end_of_blob:
8035 buf_page_release_zip(bpage);
8036 goto func_exit;
8037 }
8038
8039 buf_page_release_zip(bpage);
8040
8041		/* On all BLOB pages except the first, the next-page
8042		pointer is at FIL_PAGE_NEXT in the page header: */
8043
8044 page_no = next_page_no;
8045 offset = FIL_PAGE_NEXT;
8046 page_type = FIL_PAGE_TYPE_ZBLOB2;
8047 }
8048
8049func_exit:
8050 inflateEnd(&d_stream);
8051 mem_heap_free(heap);
8052 UNIV_MEM_ASSERT_RW(buf, d_stream.total_out);
8053 return(d_stream.total_out);
8054}
8055
8056/** Copies the prefix of an externally stored field of a record.
8057The clustered index record that points to this BLOB must be protected
8058by a lock or a page latch.
8059@param[out] buf the externally stored part of the
8060field, or a prefix of it
8061@param[in] len length of buf, in bytes
8062@param[in] page_size BLOB page size
8063@param[in] space_id space id of the first BLOB page
8064@param[in] page_no page number of the first BLOB page
8065@param[in] offset offset on the first BLOB page
8066@return number of bytes written to buf */
8067static
8068ulint
8069btr_copy_externally_stored_field_prefix_low(
8070 byte* buf,
8071 ulint len,
8072 const page_size_t& page_size,
8073 ulint space_id,
8074 ulint page_no,
8075 ulint offset)
8076{
8077 if (len == 0) {
8078 return(0);
8079 }
8080
8081 if (page_size.is_compressed()) {
8082 return(btr_copy_zblob_prefix(buf, len, page_size,
8083 space_id, page_no, offset));
8084 } else {
8085 ut_ad(page_size.equals_to(univ_page_size));
8086 return(btr_copy_blob_prefix(buf, len, space_id,
8087 page_no, offset));
8088 }
8089}
8090
8091/** Copies the prefix of an externally stored field of a record.
8092The clustered index record must be protected by a lock or a page latch.
8093@param[out] buf the field, or a prefix of it
8094@param[in] len length of buf, in bytes
8095@param[in] page_size BLOB page size
8096@param[in] data 'internally' stored part of the field
8097containing also the reference to the external part; must be protected by
8098a lock or a page latch
8099@param[in] local_len length of data, in bytes
8100@return the length of the copied field, or 0 if the column was being
8101or has been deleted */
8102ulint
8103btr_copy_externally_stored_field_prefix(
8104 byte* buf,
8105 ulint len,
8106 const page_size_t& page_size,
8107 const byte* data,
8108 ulint local_len)
8109{
8110 ulint space_id;
8111 ulint page_no;
8112 ulint offset;
8113
8114 ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
8115
8116 local_len -= BTR_EXTERN_FIELD_REF_SIZE;
8117
8118 if (UNIV_UNLIKELY(local_len >= len)) {
8119 memcpy(buf, data, len);
8120 return(len);
8121 }
8122
8123 memcpy(buf, data, local_len);
8124 data += local_len;
8125
8126 ut_a(memcmp(data, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE));
8127
8128 if (!mach_read_from_4(data + BTR_EXTERN_LEN + 4)) {
8129 /* The externally stored part of the column has been
8130 (partially) deleted. Signal the half-deleted BLOB
8131 to the caller. */
8132
8133 return(0);
8134 }
8135
8136 space_id = mach_read_from_4(data + BTR_EXTERN_SPACE_ID);
8137
8138 page_no = mach_read_from_4(data + BTR_EXTERN_PAGE_NO);
8139
8140 offset = mach_read_from_4(data + BTR_EXTERN_OFFSET);
8141
8142 return(local_len
8143 + btr_copy_externally_stored_field_prefix_low(buf + local_len,
8144 len - local_len,
8145 page_size,
8146 space_id, page_no,
8147 offset));
8148}
8149
8150/** Copies an externally stored field of a record to mem heap.
8151The clustered index record must be protected by a lock or a page latch.
8152@param[out] len length of the whole field
8153@param[in] data 'internally' stored part of the field
8154containing also the reference to the external part; must be protected by
8155a lock or a page latch
8156@param[in] page_size BLOB page size
8157@param[in] local_len length of data
8158@param[in,out] heap mem heap
8159@return the whole field copied to heap */
8160byte*
8161btr_copy_externally_stored_field(
8162 ulint* len,
8163 const byte* data,
8164 const page_size_t& page_size,
8165 ulint local_len,
8166 mem_heap_t* heap)
8167{
8168 ulint space_id;
8169 ulint page_no;
8170 ulint offset;
8171 ulint extern_len;
8172 byte* buf;
8173
8174 ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
8175
8176 local_len -= BTR_EXTERN_FIELD_REF_SIZE;
8177
8178 space_id = mach_read_from_4(data + local_len + BTR_EXTERN_SPACE_ID);
8179
8180 page_no = mach_read_from_4(data + local_len + BTR_EXTERN_PAGE_NO);
8181
8182 offset = mach_read_from_4(data + local_len + BTR_EXTERN_OFFSET);
8183
8184 /* Currently a BLOB cannot be bigger than 4 GB; we
8185 leave the 4 upper bytes in the length field unused */
8186
8187 extern_len = mach_read_from_4(data + local_len + BTR_EXTERN_LEN + 4);
8188
8189 buf = (byte*) mem_heap_alloc(heap, local_len + extern_len);
8190
8191 memcpy(buf, data, local_len);
8192 *len = local_len
8193 + btr_copy_externally_stored_field_prefix_low(buf + local_len,
8194 extern_len,
8195 page_size,
8196 space_id,
8197 page_no, offset);
8198
8199 return(buf);
8200}
8201
8202/** Copies an externally stored field of a record to mem heap.
8203@param[in] rec record in a clustered index; must be
8204protected by a lock or a page latch
8205@param[in]	offsets		array returned by rec_get_offsets()
8206@param[in] page_size BLOB page size
8207@param[in] no field number
8208@param[out] len length of the field
8209@param[in,out] heap mem heap
8210@return the field copied to heap, or NULL if the field is incomplete */
8211byte*
8212btr_rec_copy_externally_stored_field(
8213 const rec_t* rec,
8214 const ulint* offsets,
8215 const page_size_t& page_size,
8216 ulint no,
8217 ulint* len,
8218 mem_heap_t* heap)
8219{
8220 ulint local_len;
8221 const byte* data;
8222
8223 ut_a(rec_offs_nth_extern(offsets, no));
8224
8225 /* An externally stored field can contain some initial
8226 data from the field, and in the last 20 bytes it has the
8227 space id, page number, and offset where the rest of the
8228 field data is stored, and the data length in addition to
8229 the data stored locally. We may need to store some data
8230 locally to get the local record length above the 128 byte
8231 limit so that field offsets are stored in two bytes, and
8232 the extern bit is available in those two bytes. */
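	/* The field reference occupies the last BTR_EXTERN_FIELD_REF_SIZE
	(20) bytes of the locally stored prefix: space id (4 bytes), page
	number of the first BLOB page (4 bytes), byte offset within that
	page (4 bytes), and the externally stored length (8 bytes, of which
	the upper 4 are currently unused); see the BTR_EXTERN_* offsets. */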
8233
8234 data = rec_get_nth_field(rec, offsets, no, &local_len);
8235
8236 ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
8237
8238 if (UNIV_UNLIKELY
8239 (!memcmp(data + local_len - BTR_EXTERN_FIELD_REF_SIZE,
8240 field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE))) {
8241 /* The externally stored field was not written yet.
8242 This record should only be seen by
8243 recv_recovery_rollback_active() or any
8244 TRX_ISO_READ_UNCOMMITTED transactions. */
8245 return(NULL);
8246 }
8247
8248 return(btr_copy_externally_stored_field(len, data,
8249 page_size, local_len, heap));
8250}
8251