1/*****************************************************************************
2
3Copyright (c) 1997, 2017, Oracle and/or its affiliates. All Rights Reserved.
4Copyright (c) 2008, Google Inc.
5Copyright (c) 2015, 2018, MariaDB Corporation.
6
7Portions of this file contain modifications contributed and copyrighted by
8Google, Inc. Those modifications are gratefully acknowledged and are described
9briefly in the InnoDB documentation. The contributions by Google are
10incorporated with their permission, and subject to the conditions contained in
11the file COPYING.Google.
12
13This program is free software; you can redistribute it and/or modify it under
14the terms of the GNU General Public License as published by the Free Software
15Foundation; version 2 of the License.
16
17This program is distributed in the hope that it will be useful, but WITHOUT
18ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
19FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
20
21You should have received a copy of the GNU General Public License along with
22this program; if not, write to the Free Software Foundation, Inc.,
2351 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
24
25*****************************************************************************/
26
27/***************************************************//**
28@file row/row0sel.cc
29Select
30
31Created 12/19/1997 Heikki Tuuri
32*******************************************************/
33
34#include "row0sel.h"
35#include "dict0dict.h"
36#include "dict0boot.h"
37#include "trx0undo.h"
38#include "trx0trx.h"
39#include "btr0btr.h"
40#include "btr0cur.h"
41#include "btr0sea.h"
42#include "gis0rtree.h"
43#include "mach0data.h"
44#include "que0que.h"
45#include "row0upd.h"
46#include "row0row.h"
47#include "row0vers.h"
48#include "rem0cmp.h"
49#include "lock0lock.h"
50#include "eval0eval.h"
51#include "pars0sym.h"
52#include "pars0pars.h"
53#include "row0mysql.h"
54#include "buf0lru.h"
55#include "srv0srv.h"
56#include "ha_prototypes.h"
57#include "srv0mon.h"
58#include "ut0new.h"
59
/* Upper bound on the number of rows kept in a column prefetch buffer;
the MySQL interface has a separate parameter for this */
#define SEL_MAX_N_PREFETCH	16

/* Number of fetched rows after which prefetching is started; the MySQL
interface has a separate parameter for this */
#define SEL_PREFETCH_LIMIT	1

/* Once a select has accessed roughly this many pages, it hands control
back to que_run_threads so that runaway queries can be canceled */
#define SEL_COST_LIMIT		100

/* Result codes of the search shortcut */
#define SEL_FOUND		0
#define SEL_EXHAUSTED		1
#define SEL_RETRY		2
76
77/********************************************************************//**
78Returns TRUE if the user-defined column in a secondary index record
79is alphabetically the same as the corresponding BLOB column in the clustered
80index record.
81NOTE: the comparison is NOT done as a binary comparison, but character
82fields are compared with collation!
83@return TRUE if the columns are equal */
84static
85ibool
86row_sel_sec_rec_is_for_blob(
87/*========================*/
88 ulint mtype, /*!< in: main type */
89 ulint prtype, /*!< in: precise type */
90 ulint mbminlen, /*!< in: minimum length of
91 a character, in bytes */
92 ulint mbmaxlen, /*!< in: maximum length of
93 a character, in bytes */
94 const byte* clust_field, /*!< in: the locally stored part of
95 the clustered index column, including
96 the BLOB pointer; the clustered
97 index record must be covered by
98 a lock or a page latch to protect it
99 against deletion (rollback or purge) */
100 ulint clust_len, /*!< in: length of clust_field */
101 const byte* sec_field, /*!< in: column in secondary index */
102 ulint sec_len, /*!< in: length of sec_field */
103 ulint prefix_len, /*!< in: index column prefix length
104 in bytes */
105 dict_table_t* table) /*!< in: table */
106{
107 ulint len;
108 byte buf[REC_VERSION_56_MAX_INDEX_COL_LEN];
109
110 /* This function should never be invoked on tables in
111 ROW_FORMAT=REDUNDANT or ROW_FORMAT=COMPACT, because they
112 should always contain enough prefix in the clustered index record. */
113 ut_ad(dict_table_has_atomic_blobs(table));
114 ut_a(clust_len >= BTR_EXTERN_FIELD_REF_SIZE);
115 ut_ad(prefix_len >= sec_len);
116 ut_ad(prefix_len > 0);
117 ut_a(prefix_len <= sizeof buf);
118
119 if (!memcmp(clust_field + clust_len - BTR_EXTERN_FIELD_REF_SIZE,
120 field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE)) {
121 /* The externally stored field was not written yet.
122 This record should only be seen by
123 recv_recovery_rollback_active() or any
124 TRX_ISO_READ_UNCOMMITTED transactions. */
125 return(FALSE);
126 }
127
128 len = btr_copy_externally_stored_field_prefix(
129 buf, prefix_len, page_size_t(table->space->flags),
130 clust_field, clust_len);
131
132 if (len == 0) {
133 /* The BLOB was being deleted as the server crashed.
134 There should not be any secondary index records
135 referring to this clustered index record, because
136 btr_free_externally_stored_field() is called after all
137 secondary index entries of the row have been purged. */
138 return(FALSE);
139 }
140
141 len = dtype_get_at_most_n_mbchars(prtype, mbminlen, mbmaxlen,
142 prefix_len, len, (const char*) buf);
143
144 return(!cmp_data_data(mtype, prtype, buf, len, sec_field, sec_len));
145}
146
147/** Returns TRUE if the user-defined column values in a secondary index record
148are alphabetically the same as the corresponding columns in the clustered
149index record.
150NOTE: the comparison is NOT done as a binary comparison, but character
151fields are compared with collation!
152@param[in] sec_rec secondary index record
153@param[in] sec_index secondary index
154@param[in] clust_rec clustered index record;
155 must be protected by a page s-latch
156@param[in] clust_index clustered index
157@param[in] thr query thread
158@return TRUE if the secondary record is equal to the corresponding
159fields in the clustered record, when compared with collation;
160FALSE if not equal or if the clustered record has been marked for deletion */
161static
162ibool
163row_sel_sec_rec_is_for_clust_rec(
164 const rec_t* sec_rec,
165 dict_index_t* sec_index,
166 const rec_t* clust_rec,
167 dict_index_t* clust_index,
168 que_thr_t* thr)
169{
170 const byte* sec_field;
171 ulint sec_len;
172 const byte* clust_field;
173 ulint n;
174 ulint i;
175 mem_heap_t* heap = NULL;
176 ulint clust_offsets_[REC_OFFS_NORMAL_SIZE];
177 ulint sec_offsets_[REC_OFFS_SMALL_SIZE];
178 ulint* clust_offs = clust_offsets_;
179 ulint* sec_offs = sec_offsets_;
180 ibool is_equal = TRUE;
181
182 rec_offs_init(clust_offsets_);
183 rec_offs_init(sec_offsets_);
184
185 if (rec_get_deleted_flag(clust_rec,
186 dict_table_is_comp(clust_index->table))) {
187 /* In delete-marked records, DB_TRX_ID must
188 always refer to an existing undo log record. */
189 ut_ad(rec_get_trx_id(clust_rec, clust_index));
190
191 /* The clustered index record is delete-marked;
192 it is not visible in the read view. Besides,
193 if there are any externally stored columns,
194 some of them may have already been purged. */
195 return(FALSE);
196 }
197
198 heap = mem_heap_create(256);
199
200 clust_offs = rec_get_offsets(clust_rec, clust_index, clust_offs,
201 true, ULINT_UNDEFINED, &heap);
202 sec_offs = rec_get_offsets(sec_rec, sec_index, sec_offs,
203 true, ULINT_UNDEFINED, &heap);
204
205 n = dict_index_get_n_ordering_defined_by_user(sec_index);
206
207 for (i = 0; i < n; i++) {
208 const dict_field_t* ifield;
209 const dict_col_t* col;
210 ulint clust_pos = 0;
211 ulint clust_len = 0;
212 ulint len;
213 bool is_virtual;
214
215 ifield = dict_index_get_nth_field(sec_index, i);
216 col = dict_field_get_col(ifield);
217
218 is_virtual = col->is_virtual();
219
220 /* For virtual column, its value will need to be
221 reconstructed from base column in cluster index */
222 if (is_virtual) {
223 const dict_v_col_t* v_col;
224 const dtuple_t* row;
225 dfield_t* vfield;
226 row_ext_t* ext;
227
228 v_col = reinterpret_cast<const dict_v_col_t*>(col);
229
230 row = row_build(ROW_COPY_POINTERS,
231 clust_index, clust_rec,
232 clust_offs,
233 NULL, NULL, NULL, &ext, heap);
234
235 vfield = innobase_get_computed_value(
236 row, v_col, clust_index,
237 &heap, NULL, NULL,
238 thr_get_trx(thr)->mysql_thd,
239 thr->prebuilt->m_mysql_table, NULL,
240 NULL, NULL);
241
242 clust_len = vfield->len;
243 clust_field = static_cast<byte*>(vfield->data);
244 } else {
245 clust_pos = dict_col_get_clust_pos(col, clust_index);
246 ut_ad(!rec_offs_nth_default(clust_offs, clust_pos));
247 clust_field = rec_get_nth_field(
248 clust_rec, clust_offs, clust_pos, &clust_len);
249 }
250
251 sec_field = rec_get_nth_field(sec_rec, sec_offs, i, &sec_len);
252
253 len = clust_len;
254
255 if (ifield->prefix_len > 0 && len != UNIV_SQL_NULL
256 && sec_len != UNIV_SQL_NULL && !is_virtual) {
257
258 if (rec_offs_nth_extern(clust_offs, clust_pos)) {
259 len -= BTR_EXTERN_FIELD_REF_SIZE;
260 }
261
262 len = dtype_get_at_most_n_mbchars(
263 col->prtype, col->mbminlen, col->mbmaxlen,
264 ifield->prefix_len, len, (char*) clust_field);
265
266 if (rec_offs_nth_extern(clust_offs, clust_pos)
267 && len < sec_len) {
268 if (!row_sel_sec_rec_is_for_blob(
269 col->mtype, col->prtype,
270 col->mbminlen, col->mbmaxlen,
271 clust_field, clust_len,
272 sec_field, sec_len,
273 ifield->prefix_len,
274 clust_index->table)) {
275 goto inequal;
276 }
277
278 continue;
279 }
280 }
281
282 /* For spatial index, the first field is MBR, we check
283 if the MBR is equal or not. */
284 if (dict_index_is_spatial(sec_index) && i == 0) {
285 rtr_mbr_t tmp_mbr;
286 rtr_mbr_t sec_mbr;
287 byte* dptr =
288 const_cast<byte*>(clust_field);
289
290 ut_ad(clust_len != UNIV_SQL_NULL);
291
292 /* For externally stored field, we need to get full
293 geo data to generate the MBR for comparing. */
294 if (rec_offs_nth_extern(clust_offs, clust_pos)) {
295 dptr = btr_copy_externally_stored_field(
296 &clust_len, dptr,
297 page_size_t(clust_index->table->space
298 ->flags),
299 len, heap);
300 }
301
302 rtree_mbr_from_wkb(dptr + GEO_DATA_HEADER_SIZE,
303 static_cast<uint>(clust_len
304 - GEO_DATA_HEADER_SIZE),
305 SPDIMS,
306 reinterpret_cast<double*>(
307 &tmp_mbr));
308 rtr_read_mbr(sec_field, &sec_mbr);
309
310 if (!MBR_EQUAL_CMP(&sec_mbr, &tmp_mbr)) {
311 is_equal = FALSE;
312 goto func_exit;
313 }
314 } else {
315
316 if (0 != cmp_data_data(col->mtype, col->prtype,
317 clust_field, len,
318 sec_field, sec_len)) {
319inequal:
320 is_equal = FALSE;
321 goto func_exit;
322 }
323 }
324 }
325
326func_exit:
327 if (UNIV_LIKELY_NULL(heap)) {
328 mem_heap_free(heap);
329 }
330 return(is_equal);
331}
332
333/*********************************************************************//**
334Creates a select node struct.
335@return own: select node struct */
336sel_node_t*
337sel_node_create(
338/*============*/
339 mem_heap_t* heap) /*!< in: memory heap where created */
340{
341 sel_node_t* node;
342
343 node = static_cast<sel_node_t*>(
344 mem_heap_alloc(heap, sizeof(sel_node_t)));
345
346 node->common.type = QUE_NODE_SELECT;
347 node->state = SEL_NODE_OPEN;
348
349 node->plans = NULL;
350
351 return(node);
352}
353
354/*********************************************************************//**
355Frees the memory private to a select node when a query graph is freed,
356does not free the heap where the node was originally created. */
357void
358sel_node_free_private(
359/*==================*/
360 sel_node_t* node) /*!< in: select node struct */
361{
362 ulint i;
363 plan_t* plan;
364
365 if (node->plans != NULL) {
366 for (i = 0; i < node->n_tables; i++) {
367 plan = sel_node_get_nth_plan(node, i);
368
369 btr_pcur_close(&(plan->pcur));
370 btr_pcur_close(&(plan->clust_pcur));
371
372 if (plan->old_vers_heap) {
373 mem_heap_free(plan->old_vers_heap);
374 }
375 }
376 }
377}
378
379/*********************************************************************//**
380Evaluates the values in a select list. If there are aggregate functions,
381their argument value is added to the aggregate total. */
382UNIV_INLINE
383void
384sel_eval_select_list(
385/*=================*/
386 sel_node_t* node) /*!< in: select node */
387{
388 que_node_t* exp;
389
390 exp = node->select_list;
391
392 while (exp) {
393 eval_exp(exp);
394
395 exp = que_node_get_next(exp);
396 }
397}
398
399/*********************************************************************//**
400Assigns the values in the select list to the possible into-variables in
401SELECT ... INTO ... */
402UNIV_INLINE
403void
404sel_assign_into_var_values(
405/*=======================*/
406 sym_node_t* var, /*!< in: first variable in a list of
407 variables */
408 sel_node_t* node) /*!< in: select node */
409{
410 que_node_t* exp;
411
412 if (var == NULL) {
413
414 return;
415 }
416
417 for (exp = node->select_list;
418 var != 0;
419 var = static_cast<sym_node_t*>(que_node_get_next(var))) {
420
421 ut_ad(exp);
422
423 eval_node_copy_val(var->alias, exp);
424
425 exp = que_node_get_next(exp);
426 }
427}
428
429/*********************************************************************//**
430Resets the aggregate value totals in the select list of an aggregate type
431query. */
432UNIV_INLINE
433void
434sel_reset_aggregate_vals(
435/*=====================*/
436 sel_node_t* node) /*!< in: select node */
437{
438 func_node_t* func_node;
439
440 ut_ad(node->is_aggregate);
441
442 for (func_node = static_cast<func_node_t*>(node->select_list);
443 func_node != 0;
444 func_node = static_cast<func_node_t*>(
445 que_node_get_next(func_node))) {
446
447 eval_node_set_int_val(func_node, 0);
448 }
449
450 node->aggregate_already_fetched = FALSE;
451}
452
453/*********************************************************************//**
454Copies the input variable values when an explicit cursor is opened. */
455UNIV_INLINE
456void
457row_sel_copy_input_variable_vals(
458/*=============================*/
459 sel_node_t* node) /*!< in: select node */
460{
461 sym_node_t* var;
462
463 var = UT_LIST_GET_FIRST(node->copy_variables);
464
465 while (var) {
466 eval_node_copy_val(var, var->alias);
467
468 var->indirection = NULL;
469
470 var = UT_LIST_GET_NEXT(col_var_list, var);
471 }
472}
473
474/*********************************************************************//**
475Fetches the column values from a record. */
476static
477void
478row_sel_fetch_columns(
479/*==================*/
480 dict_index_t* index, /*!< in: record index */
481 const rec_t* rec, /*!< in: record in a clustered or non-clustered
482 index; must be protected by a page latch */
483 const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
484 sym_node_t* column) /*!< in: first column in a column list, or
485 NULL */
486{
487 dfield_t* val;
488 ulint index_type;
489 ulint field_no;
490 const byte* data;
491 ulint len;
492
493 ut_ad(rec_offs_validate(rec, index, offsets));
494
495 if (dict_index_is_clust(index)) {
496 index_type = SYM_CLUST_FIELD_NO;
497 } else {
498 index_type = SYM_SEC_FIELD_NO;
499 }
500
501 while (column) {
502 mem_heap_t* heap = NULL;
503 ibool needs_copy;
504
505 field_no = column->field_nos[index_type];
506
507 if (field_no != ULINT_UNDEFINED) {
508
509 if (UNIV_UNLIKELY(rec_offs_nth_extern(
510 offsets, field_no) != 0)) {
511
512 /* Copy an externally stored field to the
513 temporary heap, if possible. */
514
515 heap = mem_heap_create(1);
516
517 data = btr_rec_copy_externally_stored_field(
518 rec, offsets,
519 dict_table_page_size(index->table),
520 field_no, &len, heap);
521
522 /* data == NULL means that the
523 externally stored field was not
524 written yet. This record
525 should only be seen by
526 recv_recovery_rollback_active() or any
527 TRX_ISO_READ_UNCOMMITTED
528 transactions. The InnoDB SQL parser
529 (the sole caller of this function)
530 does not implement READ UNCOMMITTED,
531 and it is not involved during rollback. */
532 ut_a(data);
533 ut_a(len != UNIV_SQL_NULL);
534
535 needs_copy = TRUE;
536 } else {
537 data = rec_get_nth_cfield(rec, index, offsets,
538 field_no, &len);
539 needs_copy = column->copy_val;
540 }
541
542 if (needs_copy) {
543 eval_node_copy_and_alloc_val(column, data,
544 len);
545 } else {
546 val = que_node_get_val(column);
547 dfield_set_data(val, data, len);
548 }
549
550 if (UNIV_LIKELY_NULL(heap)) {
551 mem_heap_free(heap);
552 }
553 }
554
555 column = UT_LIST_GET_NEXT(col_var_list, column);
556 }
557}
558
559/*********************************************************************//**
560Allocates a prefetch buffer for a column when prefetch is first time done. */
561static
562void
563sel_col_prefetch_buf_alloc(
564/*=======================*/
565 sym_node_t* column) /*!< in: symbol table node for a column */
566{
567 sel_buf_t* sel_buf;
568 ulint i;
569
570 ut_ad(que_node_get_type(column) == QUE_NODE_SYMBOL);
571
572 column->prefetch_buf = static_cast<sel_buf_t*>(
573 ut_malloc_nokey(SEL_MAX_N_PREFETCH * sizeof(sel_buf_t)));
574
575 for (i = 0; i < SEL_MAX_N_PREFETCH; i++) {
576 sel_buf = column->prefetch_buf + i;
577
578 sel_buf->data = NULL;
579 sel_buf->len = 0;
580 sel_buf->val_buf_size = 0;
581 }
582}
583
584/*********************************************************************//**
585Frees a prefetch buffer for a column, including the dynamically allocated
586memory for data stored there. */
587void
588sel_col_prefetch_buf_free(
589/*======================*/
590 sel_buf_t* prefetch_buf) /*!< in, own: prefetch buffer */
591{
592 sel_buf_t* sel_buf;
593 ulint i;
594
595 for (i = 0; i < SEL_MAX_N_PREFETCH; i++) {
596 sel_buf = prefetch_buf + i;
597
598 if (sel_buf->val_buf_size > 0) {
599
600 ut_free(sel_buf->data);
601 }
602 }
603
604 ut_free(prefetch_buf);
605}
606
607/*********************************************************************//**
608Pops the column values for a prefetched, cached row from the column prefetch
609buffers and places them to the val fields in the column nodes. */
610static
611void
612sel_dequeue_prefetched_row(
613/*=======================*/
614 plan_t* plan) /*!< in: plan node for a table */
615{
616 sym_node_t* column;
617 sel_buf_t* sel_buf;
618 dfield_t* val;
619 byte* data;
620 ulint len;
621 ulint val_buf_size;
622
623 ut_ad(plan->n_rows_prefetched > 0);
624
625 column = UT_LIST_GET_FIRST(plan->columns);
626
627 while (column) {
628 val = que_node_get_val(column);
629
630 if (!column->copy_val) {
631 /* We did not really push any value for the
632 column */
633
634 ut_ad(!column->prefetch_buf);
635 ut_ad(que_node_get_val_buf_size(column) == 0);
636 ut_d(dfield_set_null(val));
637
638 goto next_col;
639 }
640
641 ut_ad(column->prefetch_buf);
642 ut_ad(!dfield_is_ext(val));
643
644 sel_buf = column->prefetch_buf + plan->first_prefetched;
645
646 data = sel_buf->data;
647 len = sel_buf->len;
648 val_buf_size = sel_buf->val_buf_size;
649
650 /* We must keep track of the allocated memory for
651 column values to be able to free it later: therefore
652 we swap the values for sel_buf and val */
653
654 sel_buf->data = static_cast<byte*>(dfield_get_data(val));
655 sel_buf->len = dfield_get_len(val);
656 sel_buf->val_buf_size = que_node_get_val_buf_size(column);
657
658 dfield_set_data(val, data, len);
659 que_node_set_val_buf_size(column, val_buf_size);
660next_col:
661 column = UT_LIST_GET_NEXT(col_var_list, column);
662 }
663
664 plan->n_rows_prefetched--;
665
666 plan->first_prefetched++;
667}
668
669/*********************************************************************//**
670Pushes the column values for a prefetched, cached row to the column prefetch
671buffers from the val fields in the column nodes. */
672UNIV_INLINE
673void
674sel_enqueue_prefetched_row(
675/*=======================*/
676 plan_t* plan) /*!< in: plan node for a table */
677{
678 sym_node_t* column;
679 sel_buf_t* sel_buf;
680 dfield_t* val;
681 byte* data;
682 ulint len;
683 ulint pos;
684 ulint val_buf_size;
685
686 if (plan->n_rows_prefetched == 0) {
687 pos = 0;
688 plan->first_prefetched = 0;
689 } else {
690 pos = plan->n_rows_prefetched;
691
692 /* We have the convention that pushing new rows starts only
693 after the prefetch stack has been emptied: */
694
695 ut_ad(plan->first_prefetched == 0);
696 }
697
698 plan->n_rows_prefetched++;
699
700 ut_ad(pos < SEL_MAX_N_PREFETCH);
701
702 for (column = UT_LIST_GET_FIRST(plan->columns);
703 column != 0;
704 column = UT_LIST_GET_NEXT(col_var_list, column)) {
705
706 if (!column->copy_val) {
707 /* There is no sense to push pointers to database
708 page fields when we do not keep latch on the page! */
709 continue;
710 }
711
712 if (!column->prefetch_buf) {
713 /* Allocate a new prefetch buffer */
714
715 sel_col_prefetch_buf_alloc(column);
716 }
717
718 sel_buf = column->prefetch_buf + pos;
719
720 val = que_node_get_val(column);
721
722 data = static_cast<byte*>(dfield_get_data(val));
723 len = dfield_get_len(val);
724 val_buf_size = que_node_get_val_buf_size(column);
725
726 /* We must keep track of the allocated memory for
727 column values to be able to free it later: therefore
728 we swap the values for sel_buf and val */
729
730 dfield_set_data(val, sel_buf->data, sel_buf->len);
731 que_node_set_val_buf_size(column, sel_buf->val_buf_size);
732
733 sel_buf->data = data;
734 sel_buf->len = len;
735 sel_buf->val_buf_size = val_buf_size;
736 }
737}
738
739/*********************************************************************//**
740Builds a previous version of a clustered index record for a consistent read
741@return DB_SUCCESS or error code */
742static MY_ATTRIBUTE((nonnull, warn_unused_result))
743dberr_t
744row_sel_build_prev_vers(
745/*====================*/
746 ReadView* read_view, /*!< in: read view */
747 dict_index_t* index, /*!< in: plan node for table */
748 rec_t* rec, /*!< in: record in a clustered index */
749 ulint** offsets, /*!< in/out: offsets returned by
750 rec_get_offsets(rec, plan->index) */
751 mem_heap_t** offset_heap, /*!< in/out: memory heap from which
752 the offsets are allocated */
753 mem_heap_t** old_vers_heap, /*!< out: old version heap to use */
754 rec_t** old_vers, /*!< out: old version, or NULL if the
755 record does not exist in the view:
756 i.e., it was freshly inserted
757 afterwards */
758 mtr_t* mtr) /*!< in: mtr */
759{
760 dberr_t err;
761
762 if (*old_vers_heap) {
763 mem_heap_empty(*old_vers_heap);
764 } else {
765 *old_vers_heap = mem_heap_create(512);
766 }
767
768 err = row_vers_build_for_consistent_read(
769 rec, mtr, index, offsets, read_view, offset_heap,
770 *old_vers_heap, old_vers, NULL);
771 return(err);
772}
773
774/*********************************************************************//**
775Builds the last committed version of a clustered index record for a
776semi-consistent read. */
777static
778void
779row_sel_build_committed_vers_for_mysql(
780/*===================================*/
781 dict_index_t* clust_index, /*!< in: clustered index */
782 row_prebuilt_t* prebuilt, /*!< in: prebuilt struct */
783 const rec_t* rec, /*!< in: record in a clustered index */
784 ulint** offsets, /*!< in/out: offsets returned by
785 rec_get_offsets(rec, clust_index) */
786 mem_heap_t** offset_heap, /*!< in/out: memory heap from which
787 the offsets are allocated */
788 const rec_t** old_vers, /*!< out: old version, or NULL if the
789 record does not exist in the view:
790 i.e., it was freshly inserted
791 afterwards */
792 const dtuple_t**vrow, /*!< out: to be filled with old virtual
793 column version if any */
794 mtr_t* mtr) /*!< in: mtr */
795{
796 if (prebuilt->old_vers_heap) {
797 mem_heap_empty(prebuilt->old_vers_heap);
798 } else {
799 prebuilt->old_vers_heap = mem_heap_create(
800 rec_offs_size(*offsets));
801 }
802
803 row_vers_build_for_semi_consistent_read(prebuilt->trx,
804 rec, mtr, clust_index, offsets, offset_heap,
805 prebuilt->old_vers_heap, old_vers, vrow);
806}
807
808/*********************************************************************//**
809Tests the conditions which determine when the index segment we are searching
810through has been exhausted.
811@return TRUE if row passed the tests */
812UNIV_INLINE
813ibool
814row_sel_test_end_conds(
815/*===================*/
816 plan_t* plan) /*!< in: plan for the table; the column values must
817 already have been retrieved and the right sides of
818 comparisons evaluated */
819{
820 func_node_t* cond;
821
822 /* All conditions in end_conds are comparisons of a column to an
823 expression */
824
825 for (cond = UT_LIST_GET_FIRST(plan->end_conds);
826 cond != 0;
827 cond = UT_LIST_GET_NEXT(cond_list, cond)) {
828
829 /* Evaluate the left side of the comparison, i.e., get the
830 column value if there is an indirection */
831
832 eval_sym(static_cast<sym_node_t*>(cond->args));
833
834 /* Do the comparison */
835
836 if (!eval_cmp(cond)) {
837
838 return(FALSE);
839 }
840 }
841
842 return(TRUE);
843}
844
845/*********************************************************************//**
846Tests the other conditions.
847@return TRUE if row passed the tests */
848UNIV_INLINE
849ibool
850row_sel_test_other_conds(
851/*=====================*/
852 plan_t* plan) /*!< in: plan for the table; the column values must
853 already have been retrieved */
854{
855 func_node_t* cond;
856
857 cond = UT_LIST_GET_FIRST(plan->other_conds);
858
859 while (cond) {
860 eval_exp(cond);
861
862 if (!eval_node_get_ibool_val(cond)) {
863
864 return(FALSE);
865 }
866
867 cond = UT_LIST_GET_NEXT(cond_list, cond);
868 }
869
870 return(TRUE);
871}
872
873/*********************************************************************//**
874Retrieves the clustered index record corresponding to a record in a
875non-clustered index. Does the necessary locking.
876@return DB_SUCCESS or error code */
877static MY_ATTRIBUTE((nonnull, warn_unused_result))
878dberr_t
879row_sel_get_clust_rec(
880/*==================*/
881 sel_node_t* node, /*!< in: select_node */
882 plan_t* plan, /*!< in: plan node for table */
883 rec_t* rec, /*!< in: record in a non-clustered index */
884 que_thr_t* thr, /*!< in: query thread */
885 rec_t** out_rec,/*!< out: clustered record or an old version of
886 it, NULL if the old version did not exist
887 in the read view, i.e., it was a fresh
888 inserted version */
889 mtr_t* mtr) /*!< in: mtr used to get access to the
890 non-clustered record; the same mtr is used to
891 access the clustered index */
892{
893 dict_index_t* index;
894 rec_t* clust_rec;
895 rec_t* old_vers;
896 dberr_t err;
897 mem_heap_t* heap = NULL;
898 ulint offsets_[REC_OFFS_NORMAL_SIZE];
899 ulint* offsets = offsets_;
900 rec_offs_init(offsets_);
901
902 *out_rec = NULL;
903
904 offsets = rec_get_offsets(rec,
905 btr_pcur_get_btr_cur(&plan->pcur)->index,
906 offsets, true, ULINT_UNDEFINED, &heap);
907
908 row_build_row_ref_fast(plan->clust_ref, plan->clust_map, rec, offsets);
909
910 index = dict_table_get_first_index(plan->table);
911
912 btr_pcur_open_with_no_init(index, plan->clust_ref, PAGE_CUR_LE,
913 BTR_SEARCH_LEAF, &plan->clust_pcur,
914 0, mtr);
915
916 clust_rec = btr_pcur_get_rec(&(plan->clust_pcur));
917
918 /* Note: only if the search ends up on a non-infimum record is the
919 low_match value the real match to the search tuple */
920
921 if (!page_rec_is_user_rec(clust_rec)
922 || btr_pcur_get_low_match(&(plan->clust_pcur))
923 < dict_index_get_n_unique(index)) {
924
925 ut_a(rec_get_deleted_flag(rec,
926 dict_table_is_comp(plan->table)));
927 ut_a(node->read_view);
928
929 /* In a rare case it is possible that no clust rec is found
930 for a delete-marked secondary index record: if in row0umod.cc
931 in row_undo_mod_remove_clust_low() we have already removed
932 the clust rec, while purge is still cleaning and removing
933 secondary index records associated with earlier versions of
934 the clustered index record. In that case we know that the
935 clustered index record did not exist in the read view of
936 trx. */
937
938 goto func_exit;
939 }
940
941 offsets = rec_get_offsets(clust_rec, index, offsets, true,
942 ULINT_UNDEFINED, &heap);
943
944 if (!node->read_view) {
945 /* Try to place a lock on the index record */
946 ulint lock_type;
947 trx_t* trx;
948
949 trx = thr_get_trx(thr);
950
951 /* If innodb_locks_unsafe_for_binlog option is used
952 or this session is using READ COMMITTED or lower isolation level
953 we lock only the record, i.e., next-key locking is
954 not used. */
955 if (srv_locks_unsafe_for_binlog
956 || trx->isolation_level <= TRX_ISO_READ_COMMITTED) {
957 lock_type = LOCK_REC_NOT_GAP;
958 } else {
959 lock_type = LOCK_ORDINARY;
960 }
961
962 err = lock_clust_rec_read_check_and_lock(
963 0, btr_pcur_get_block(&plan->clust_pcur),
964 clust_rec, index, offsets,
965 static_cast<lock_mode>(node->row_lock_mode),
966 lock_type,
967 thr);
968
969 switch (err) {
970 case DB_SUCCESS:
971 case DB_SUCCESS_LOCKED_REC:
972 /* Declare the variable uninitialized in Valgrind.
973 It should be set to DB_SUCCESS at func_exit. */
974 UNIV_MEM_INVALID(&err, sizeof err);
975 break;
976 default:
977 goto err_exit;
978 }
979 } else {
980 /* This is a non-locking consistent read: if necessary, fetch
981 a previous version of the record */
982
983 old_vers = NULL;
984
985 if (!lock_clust_rec_cons_read_sees(clust_rec, index, offsets,
986 node->read_view)) {
987
988 err = row_sel_build_prev_vers(
989 node->read_view, index, clust_rec,
990 &offsets, &heap, &plan->old_vers_heap,
991 &old_vers, mtr);
992
993 if (err != DB_SUCCESS) {
994
995 goto err_exit;
996 }
997
998 clust_rec = old_vers;
999
1000 if (clust_rec == NULL) {
1001 goto func_exit;
1002 }
1003 }
1004
1005 /* If we had to go to an earlier version of row or the
1006 secondary index record is delete marked, then it may be that
1007 the secondary index record corresponding to clust_rec
1008 (or old_vers) is not rec; in that case we must ignore
1009 such row because in our snapshot rec would not have existed.
1010 Remember that from rec we cannot see directly which transaction
1011 id corresponds to it: we have to go to the clustered index
1012 record. A query where we want to fetch all rows where
1013 the secondary index value is in some interval would return
1014 a wrong result if we would not drop rows which we come to
1015 visit through secondary index records that would not really
1016 exist in our snapshot. */
1017
1018 if ((old_vers
1019 || rec_get_deleted_flag(rec, dict_table_is_comp(
1020 plan->table)))
1021 && !row_sel_sec_rec_is_for_clust_rec(rec, plan->index,
1022 clust_rec, index,
1023 thr)) {
1024 goto func_exit;
1025 }
1026 }
1027
1028 /* Fetch the columns needed in test conditions. The clustered
1029 index record is protected by a page latch that was acquired
1030 when plan->clust_pcur was positioned. The latch will not be
1031 released until mtr->commit(). */
1032
1033 ut_ad(!rec_get_deleted_flag(clust_rec, rec_offs_comp(offsets)));
1034 row_sel_fetch_columns(index, clust_rec, offsets,
1035 UT_LIST_GET_FIRST(plan->columns));
1036 *out_rec = clust_rec;
1037func_exit:
1038 err = DB_SUCCESS;
1039err_exit:
1040 if (UNIV_LIKELY_NULL(heap)) {
1041 mem_heap_free(heap);
1042 }
1043 return(err);
1044}
1045
/*********************************************************************//**
Sets a lock on a page of R-Tree record. This is an all-or-nothing action,
mostly because we cannot reposition a record in an R-Tree (due to the
nature of splitting).
@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */
1051UNIV_INLINE
1052dberr_t
1053sel_set_rtr_rec_lock(
1054/*=================*/
1055 btr_pcur_t* pcur, /*!< in: cursor */
1056 const rec_t* first_rec,/*!< in: record */
1057 dict_index_t* index, /*!< in: index */
1058 const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
1059 ulint mode, /*!< in: lock mode */
1060 ulint type, /*!< in: LOCK_ORDINARY, LOCK_GAP, or
1061 LOC_REC_NOT_GAP */
1062 que_thr_t* thr, /*!< in: query thread */
1063 mtr_t* mtr) /*!< in: mtr */
1064{
1065 matched_rec_t* match = pcur->btr_cur.rtr_info->matches;
1066 mem_heap_t* heap = NULL;
1067 dberr_t err = DB_SUCCESS;
1068 trx_t* trx = thr_get_trx(thr);
1069 buf_block_t* cur_block = btr_pcur_get_block(pcur);
1070 ulint offsets_[REC_OFFS_NORMAL_SIZE];
1071 ulint* my_offsets = const_cast<ulint*>(offsets);
1072 rec_t* rec = const_cast<rec_t*>(first_rec);
1073 rtr_rec_vector* match_rec;
1074 rtr_rec_vector::iterator end;
1075
1076 rec_offs_init(offsets_);
1077
1078 if (match->locked || page_rec_is_supremum(first_rec)) {
1079 return(DB_SUCCESS_LOCKED_REC);
1080 }
1081
1082 ut_ad(page_align(first_rec) == cur_block->frame);
1083 ut_ad(match->valid);
1084
1085 rw_lock_x_lock(&(match->block.lock));
1086retry:
1087 cur_block = btr_pcur_get_block(pcur);
1088 ut_ad(rw_lock_own(&(match->block.lock), RW_LOCK_X)
1089 || rw_lock_own(&(match->block.lock), RW_LOCK_S));
1090 ut_ad(page_is_leaf(buf_block_get_frame(cur_block)));
1091
1092 err = lock_sec_rec_read_check_and_lock(
1093 0, cur_block, rec, index, my_offsets,
1094 static_cast<lock_mode>(mode), type, thr);
1095
1096 if (err == DB_LOCK_WAIT) {
1097re_scan:
1098 mtr->commit();
1099 trx->error_state = err;
1100 que_thr_stop_for_mysql(thr);
1101 thr->lock_state = QUE_THR_LOCK_ROW;
1102 if (row_mysql_handle_errors(
1103 &err, trx, thr, NULL)) {
1104 thr->lock_state = QUE_THR_LOCK_NOLOCK;
1105 mtr->start();
1106
1107 mutex_enter(&match->rtr_match_mutex);
1108 if (!match->valid && match->matched_recs->empty()) {
1109 mutex_exit(&match->rtr_match_mutex);
1110 err = DB_RECORD_NOT_FOUND;
1111 goto func_end;
1112 }
1113 mutex_exit(&match->rtr_match_mutex);
1114
1115 /* MDEV-14059 FIXME: why re-latch the block?
1116 pcur is already positioned on it! */
1117 ulint page_no = page_get_page_no(
1118 btr_pcur_get_page(pcur));
1119
1120 cur_block = buf_page_get_gen(
1121 page_id_t(index->table->space->id, page_no),
1122 page_size_t(index->table->space->flags),
1123 RW_X_LATCH, NULL, BUF_GET,
1124 __FILE__, __LINE__, mtr, &err);
1125 } else {
1126 mtr->start();
1127 goto func_end;
1128 }
1129
1130 DEBUG_SYNC_C("rtr_set_lock_wait");
1131
1132 if (!match->valid) {
1133 /* Page got deleted */
1134 mtr->commit();
1135 mtr->start();
1136 err = DB_RECORD_NOT_FOUND;
1137 goto func_end;
1138 }
1139
1140 match->matched_recs->clear();
1141
1142 rtr_cur_search_with_match(
1143 cur_block, index,
1144 pcur->btr_cur.rtr_info->search_tuple,
1145 pcur->btr_cur.rtr_info->search_mode,
1146 &pcur->btr_cur.page_cur,
1147 pcur->btr_cur.rtr_info);
1148
1149 if (!page_is_leaf(buf_block_get_frame(cur_block))) {
1150 /* Page got splitted and promoted (only for
1151 root page it is possible). Release the
1152 page and ask for a re-search */
1153 mtr->commit();
1154 mtr->start();
1155 err = DB_RECORD_NOT_FOUND;
1156 goto func_end;
1157 }
1158
1159 rec = btr_pcur_get_rec(pcur);
1160 my_offsets = offsets_;
1161 my_offsets = rec_get_offsets(rec, index, my_offsets, true,
1162 ULINT_UNDEFINED, &heap);
1163
1164 /* No match record */
1165 if (page_rec_is_supremum(rec) || !match->valid) {
1166 mtr->commit();
1167 mtr->start();
1168 err = DB_RECORD_NOT_FOUND;
1169 goto func_end;
1170 }
1171
1172 goto retry;
1173 }
1174
1175 my_offsets = offsets_;
1176 match_rec = match->matched_recs;
1177 end = match_rec->end();
1178
1179 for (rtr_rec_vector::iterator it = match_rec->begin();
1180 it != end; ++it) {
1181 rtr_rec_t* rtr_rec = &(*it);
1182
1183 my_offsets = rec_get_offsets(
1184 rtr_rec->r_rec, index, my_offsets, true,
1185 ULINT_UNDEFINED, &heap);
1186
1187 err = lock_sec_rec_read_check_and_lock(
1188 0, &match->block, rtr_rec->r_rec, index,
1189 my_offsets, static_cast<lock_mode>(mode),
1190 type, thr);
1191
1192 if (err == DB_SUCCESS || err == DB_SUCCESS_LOCKED_REC) {
1193 rtr_rec->locked = true;
1194 } else if (err == DB_LOCK_WAIT) {
1195 goto re_scan;
1196 } else {
1197 goto func_end;
1198 }
1199 }
1200
1201 match->locked = true;
1202
1203func_end:
1204 rw_lock_x_unlock(&(match->block.lock));
1205 if (heap != NULL) {
1206 mem_heap_free(heap);
1207 }
1208
1209 ut_ad(err != DB_LOCK_WAIT);
1210
1211 return(err);
1212}
1213
1214/*********************************************************************//**
1215Sets a lock on a record.
1216@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */
1217UNIV_INLINE
1218dberr_t
1219sel_set_rec_lock(
1220/*=============*/
1221 btr_pcur_t* pcur, /*!< in: cursor */
1222 const rec_t* rec, /*!< in: record */
1223 dict_index_t* index, /*!< in: index */
1224 const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
1225 ulint mode, /*!< in: lock mode */
1226 ulint type, /*!< in: LOCK_ORDINARY, LOCK_GAP, or
1227 LOC_REC_NOT_GAP */
1228 que_thr_t* thr, /*!< in: query thread */
1229 mtr_t* mtr) /*!< in: mtr */
1230{
1231 trx_t* trx;
1232 dberr_t err = DB_SUCCESS;
1233 const buf_block_t* block;
1234
1235 block = btr_pcur_get_block(pcur);
1236
1237 trx = thr_get_trx(thr);
1238
1239 if (UT_LIST_GET_LEN(trx->lock.trx_locks) > 10000) {
1240 if (buf_LRU_buf_pool_running_out()) {
1241
1242 return(DB_LOCK_TABLE_FULL);
1243 }
1244 }
1245
1246 if (dict_index_is_clust(index)) {
1247 err = lock_clust_rec_read_check_and_lock(
1248 0, block, rec, index, offsets,
1249 static_cast<lock_mode>(mode), type, thr);
1250 } else {
1251
1252 if (dict_index_is_spatial(index)) {
1253 if (type == LOCK_GAP || type == LOCK_ORDINARY) {
1254 ut_ad(0);
1255 ib::error() << "Incorrectly request GAP lock "
1256 "on RTree";
1257 return(DB_SUCCESS);
1258 }
1259 err = sel_set_rtr_rec_lock(pcur, rec, index, offsets,
1260 mode, type, thr, mtr);
1261 } else {
1262 err = lock_sec_rec_read_check_and_lock(
1263 0, block, rec, index, offsets,
1264 static_cast<lock_mode>(mode), type, thr);
1265 }
1266 }
1267
1268 return(err);
1269}
1270
1271/*********************************************************************//**
1272Opens a pcur to a table index. */
1273static
1274void
1275row_sel_open_pcur(
1276/*==============*/
1277 plan_t* plan, /*!< in: table plan */
1278#ifdef BTR_CUR_HASH_ADAPT
1279 rw_lock_t* ahi_latch,
1280 /*!< in: the adaptive hash index latch */
1281#endif /* BTR_CUR_HASH_ADAPT */
1282 mtr_t* mtr) /*!< in/out: mini-transaction */
1283{
1284 dict_index_t* index;
1285 func_node_t* cond;
1286 que_node_t* exp;
1287 ulint n_fields;
1288 ulint i;
1289
1290 index = plan->index;
1291
1292 /* Calculate the value of the search tuple: the exact match columns
1293 get their expressions evaluated when we evaluate the right sides of
1294 end_conds */
1295
1296 cond = UT_LIST_GET_FIRST(plan->end_conds);
1297
1298 while (cond) {
1299 eval_exp(que_node_get_next(cond->args));
1300
1301 cond = UT_LIST_GET_NEXT(cond_list, cond);
1302 }
1303
1304 if (plan->tuple) {
1305 n_fields = dtuple_get_n_fields(plan->tuple);
1306
1307 if (plan->n_exact_match < n_fields) {
1308 /* There is a non-exact match field which must be
1309 evaluated separately */
1310
1311 eval_exp(plan->tuple_exps[n_fields - 1]);
1312 }
1313
1314 for (i = 0; i < n_fields; i++) {
1315 exp = plan->tuple_exps[i];
1316
1317 dfield_copy_data(dtuple_get_nth_field(plan->tuple, i),
1318 que_node_get_val(exp));
1319 }
1320
1321 /* Open pcur to the index */
1322
1323 btr_pcur_open_with_no_init(index, plan->tuple, plan->mode,
1324 BTR_SEARCH_LEAF, &plan->pcur,
1325 ahi_latch, mtr);
1326 } else {
1327 /* Open the cursor to the start or the end of the index
1328 (FALSE: no init) */
1329
1330 btr_pcur_open_at_index_side(plan->asc, index, BTR_SEARCH_LEAF,
1331 &(plan->pcur), false, 0, mtr);
1332 }
1333
1334 ut_ad(plan->n_rows_prefetched == 0);
1335 ut_ad(plan->n_rows_fetched == 0);
1336 ut_ad(plan->cursor_at_end == FALSE);
1337
1338 plan->pcur_is_open = TRUE;
1339}
1340
1341/*********************************************************************//**
1342Restores a stored pcur position to a table index.
1343@return TRUE if the cursor should be moved to the next record after we
1344return from this function (moved to the previous, in the case of a
1345descending cursor) without processing again the current cursor
1346record */
1347static
1348ibool
1349row_sel_restore_pcur_pos(
1350/*=====================*/
1351 plan_t* plan, /*!< in: table plan */
1352 mtr_t* mtr) /*!< in: mtr */
1353{
1354 ibool equal_position;
1355 ulint relative_position;
1356
1357 ut_ad(!plan->cursor_at_end);
1358
1359 relative_position = btr_pcur_get_rel_pos(&(plan->pcur));
1360
1361 equal_position = btr_pcur_restore_position(BTR_SEARCH_LEAF,
1362 &(plan->pcur), mtr);
1363
1364 /* If the cursor is traveling upwards, and relative_position is
1365
1366 (1) BTR_PCUR_BEFORE: this is not allowed, as we did not have a lock
1367 yet on the successor of the page infimum;
1368 (2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the
1369 first record GREATER than the predecessor of a page supremum; we have
1370 not yet processed the cursor record: no need to move the cursor to the
1371 next record;
1372 (3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the
1373 last record LESS or EQUAL to the old stored user record; (a) if
1374 equal_position is FALSE, this means that the cursor is now on a record
1375 less than the old user record, and we must move to the next record;
1376 (b) if equal_position is TRUE, then if
1377 plan->stored_cursor_rec_processed is TRUE, we must move to the next
1378 record, else there is no need to move the cursor. */
1379
1380 if (plan->asc) {
1381 if (relative_position == BTR_PCUR_ON) {
1382
1383 if (equal_position) {
1384
1385 return(plan->stored_cursor_rec_processed);
1386 }
1387
1388 return(TRUE);
1389 }
1390
1391 ut_ad(relative_position == BTR_PCUR_AFTER
1392 || relative_position == BTR_PCUR_AFTER_LAST_IN_TREE);
1393
1394 return(FALSE);
1395 }
1396
1397 /* If the cursor is traveling downwards, and relative_position is
1398
1399 (1) BTR_PCUR_BEFORE: btr_pcur_restore_position placed the cursor on
1400 the last record LESS than the successor of a page infimum; we have not
1401 processed the cursor record: no need to move the cursor;
1402 (2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the
1403 first record GREATER than the predecessor of a page supremum; we have
1404 processed the cursor record: we should move the cursor to the previous
1405 record;
1406 (3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the
1407 last record LESS or EQUAL to the old stored user record; (a) if
1408 equal_position is FALSE, this means that the cursor is now on a record
1409 less than the old user record, and we need not move to the previous
1410 record; (b) if equal_position is TRUE, then if
1411 plan->stored_cursor_rec_processed is TRUE, we must move to the previous
1412 record, else there is no need to move the cursor. */
1413
1414 if (relative_position == BTR_PCUR_BEFORE
1415 || relative_position == BTR_PCUR_BEFORE_FIRST_IN_TREE) {
1416
1417 return(FALSE);
1418 }
1419
1420 if (relative_position == BTR_PCUR_ON) {
1421
1422 if (equal_position) {
1423
1424 return(plan->stored_cursor_rec_processed);
1425 }
1426
1427 return(FALSE);
1428 }
1429
1430 ut_ad(relative_position == BTR_PCUR_AFTER
1431 || relative_position == BTR_PCUR_AFTER_LAST_IN_TREE);
1432
1433 return(TRUE);
1434}
1435
1436/*********************************************************************//**
1437Resets a plan cursor to a closed state. */
1438UNIV_INLINE
1439void
1440plan_reset_cursor(
1441/*==============*/
1442 plan_t* plan) /*!< in: plan */
1443{
1444 plan->pcur_is_open = FALSE;
1445 plan->cursor_at_end = FALSE;
1446 plan->n_rows_fetched = 0;
1447 plan->n_rows_prefetched = 0;
1448}
1449
1450#ifdef BTR_CUR_HASH_ADAPT
1451/*********************************************************************//**
1452Tries to do a shortcut to fetch a clustered index record with a unique key,
1453using the hash index if possible (not always).
1454@return SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */
1455static
1456ulint
1457row_sel_try_search_shortcut(
1458/*========================*/
1459 sel_node_t* node, /*!< in: select node for a consistent read */
1460 plan_t* plan, /*!< in: plan for a unique search in clustered
1461 index */
1462 mtr_t* mtr) /*!< in: mtr */
1463{
1464 dict_index_t* index = plan->index;
1465
1466 ut_ad(node->read_view);
1467 ut_ad(plan->unique_search);
1468 ut_ad(!plan->must_get_clust);
1469
1470 rw_lock_t* ahi_latch = btr_get_search_latch(index);
1471 rw_lock_s_lock(ahi_latch);
1472
1473 row_sel_open_pcur(plan, ahi_latch, mtr);
1474
1475 const rec_t* rec = btr_pcur_get_rec(&(plan->pcur));
1476
1477 if (!page_rec_is_user_rec(rec) || rec_is_default_row(rec, index)) {
1478retry:
1479 rw_lock_s_unlock(ahi_latch);
1480 return(SEL_RETRY);
1481 }
1482
1483 ut_ad(plan->mode == PAGE_CUR_GE);
1484
1485 /* As the cursor is now placed on a user record after a search with
1486 the mode PAGE_CUR_GE, the up_match field in the cursor tells how many
1487 fields in the user record matched to the search tuple */
1488
1489 if (btr_pcur_get_up_match(&(plan->pcur)) < plan->n_exact_match) {
1490exhausted:
1491 rw_lock_s_unlock(ahi_latch);
1492 return(SEL_EXHAUSTED);
1493 }
1494
1495 /* This is a non-locking consistent read: if necessary, fetch
1496 a previous version of the record */
1497
1498 mem_heap_t* heap = NULL;
1499 ulint offsets_[REC_OFFS_NORMAL_SIZE];
1500 ulint* offsets = offsets_;
1501 rec_offs_init(offsets_);
1502 offsets = rec_get_offsets(rec, index, offsets, true,
1503 ULINT_UNDEFINED, &heap);
1504
1505 if (dict_index_is_clust(index)) {
1506 if (!lock_clust_rec_cons_read_sees(rec, index, offsets,
1507 node->read_view)) {
1508 goto retry;
1509 }
1510 } else if (!srv_read_only_mode
1511 && !lock_sec_rec_cons_read_sees(
1512 rec, index, node->read_view)) {
1513 goto retry;
1514 }
1515
1516 if (rec_get_deleted_flag(rec, dict_table_is_comp(plan->table))) {
1517 goto exhausted;
1518 }
1519
1520 /* Fetch the columns needed in test conditions. The index
1521 record is protected by a page latch that was acquired when
1522 plan->pcur was positioned. The latch will not be released
1523 until mtr->commit(). */
1524
1525 row_sel_fetch_columns(index, rec, offsets,
1526 UT_LIST_GET_FIRST(plan->columns));
1527
1528 /* Test the rest of search conditions */
1529
1530 if (!row_sel_test_other_conds(plan)) {
1531 goto exhausted;
1532 }
1533
1534 ut_ad(plan->pcur.latch_mode == BTR_SEARCH_LEAF);
1535
1536 plan->n_rows_fetched++;
1537 rw_lock_s_unlock(ahi_latch);
1538
1539 if (UNIV_LIKELY_NULL(heap)) {
1540 mem_heap_free(heap);
1541 }
1542 return(SEL_FOUND);
1543}
1544#endif /* BTR_CUR_HASH_ADAPT */
1545
/*********************************************************************//**
Performs a select step.

The function is organized as a set of labelled sections connected by
gotos:
- table_loop: picks the plan of node->fetch_table and opens or restores
  its cursor;
- rec_loop: examines the cursor record in PHASEs 1 to 7;
- next_rec: advances the cursor in the direction given by node->asc;
- next_table / next_table_no_mtr: a qualifying row was found for the
  current table;
- table_exhausted / table_exhausted_no_mtr: the current table has no
  more rows, backtrack to the previous table;
- stop_for_a_while, commit_mtr_for_a_while, lock_wait_or_error: store
  the cursor position and commit the mini-transaction.

The function returns when a result row has been produced, when the
result set is exhausted, when the cost counter exceeds SEL_COST_LIMIT,
or when a lock wait or an error occurs.
@return DB_SUCCESS or error code */
static MY_ATTRIBUTE((warn_unused_result))
dberr_t
row_sel(
/*====*/
	sel_node_t*	node,	/*!< in: select node */
	que_thr_t*	thr)	/*!< in: query thread */
{
	dict_index_t*	index;
	plan_t*		plan;
	mtr_t		mtr;
	ibool		moved;
	rec_t*		rec;
	rec_t*		old_vers;
	rec_t*		clust_rec;
	ibool		consistent_read;

	/* The following flag becomes TRUE when we are doing a
	consistent read from a non-clustered index and we must look
	at the clustered index to find out the previous delete mark
	state of the non-clustered record: */

	ibool		cons_read_requires_clust_rec	= FALSE;
	ulint		cost_counter			= 0;
	ibool		cursor_just_opened;
	ibool		must_go_to_next;
	ibool		mtr_has_extra_clust_latch	= FALSE;
	/* TRUE if the search was made using
	a non-clustered index, and we had to
	access the clustered record: now &mtr
	contains a clustered index latch, and
	&mtr must be committed before we move
	to the next non-clustered record */
	dberr_t		err;
	mem_heap_t*	heap				= NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets				= offsets_;
	rec_offs_init(offsets_);

	ut_ad(thr->run_node == node);

	if (node->read_view) {
		/* In consistent reads, we try to do with the hash index and
		not to use the buffer page get. This is to reduce memory bus
		load resulting from semaphore operations. The search latch
		will be s-locked when we access an index with a unique search
		condition, but not locked when we access an index with a
		less selective search condition. */

		consistent_read = TRUE;
	} else {
		consistent_read = FALSE;
	}

table_loop:
	/* TABLE LOOP
	----------
	This is the outer major loop in calculating a join. We come here when
	node->fetch_table changes, and after adding a row to aggregate totals
	and, of course, when this function is called. */

	ut_ad(mtr_has_extra_clust_latch == FALSE);

	plan = sel_node_get_nth_plan(node, node->fetch_table);
	index = plan->index;

	if (plan->n_rows_prefetched > 0) {
		/* Serve the row from the prefetch buffer; no
		mini-transaction is needed for this. */
		sel_dequeue_prefetched_row(plan);

		goto next_table_no_mtr;
	}

	if (plan->cursor_at_end) {
		/* The cursor has already reached the result set end: no more
		rows to process for this table cursor, as also the prefetch
		stack was empty */

		ut_ad(plan->pcur_is_open);

		goto table_exhausted_no_mtr;
	}

	/* Open a cursor to index, or restore an open cursor position */

	mtr.start();

#ifdef BTR_CUR_HASH_ADAPT
	if (consistent_read && plan->unique_search && !plan->pcur_is_open
	    && !plan->must_get_clust) {
		switch (row_sel_try_search_shortcut(node, plan, &mtr)) {
		case SEL_FOUND:
			goto next_table;
		case SEL_EXHAUSTED:
			goto table_exhausted;
		default:
			ut_ad(0);
			/* fall through */
		case SEL_RETRY:
			break;
		}

		/* The shortcut did not succeed: reset the cursor state
		that the shortcut set and restart the mini-transaction
		before taking the normal path below. */
		plan_reset_cursor(plan);

		mtr.commit();
		mtr.start();
	}
#endif /* BTR_CUR_HASH_ADAPT */

	if (!plan->pcur_is_open) {
		/* Evaluate the expressions to build the search tuple and
		open the cursor */
		row_sel_open_pcur(plan,
#ifdef BTR_CUR_HASH_ADAPT
				  NULL,
#endif /* BTR_CUR_HASH_ADAPT */
				  &mtr);

		cursor_just_opened = TRUE;

		/* A new search was made: increment the cost counter */
		cost_counter++;
	} else {
		/* Restore pcur position to the index */

		must_go_to_next = row_sel_restore_pcur_pos(plan, &mtr);

		cursor_just_opened = FALSE;

		if (must_go_to_next) {
			/* We have already processed the cursor record: move
			to the next */

			goto next_rec;
		}
	}

rec_loop:
	/* RECORD LOOP
	-----------
	In this loop we use pcur and try to fetch a qualifying row, and
	also fill the prefetch buffer for this table if n_rows_fetched has
	exceeded a threshold. While we are inside this loop, the following
	holds:
	(1) &mtr is started,
	(2) pcur is positioned and open.

	NOTE that if cursor_just_opened is TRUE here, it means that we came
	to this point right after row_sel_open_pcur. */

	ut_ad(mtr_has_extra_clust_latch == FALSE);

	rec = btr_pcur_get_rec(&(plan->pcur));

	/* PHASE 1: Set a lock if specified */

	if (!node->asc && cursor_just_opened
	    && !page_rec_is_supremum(rec)) {

		/* Do not support "descending search" for Spatial index */
		ut_ad(!dict_index_is_spatial(index));

		/* When we open a cursor for a descending search, we must set
		a next-key lock on the successor record: otherwise it would
		be possible to insert new records next to the cursor position,
		and it might be that these new records should appear in the
		search result set, resulting in the phantom problem. */

		if (!consistent_read) {
			rec_t*	next_rec = page_rec_get_next(rec);
			ulint	lock_type;
			trx_t*	trx;

			trx = thr_get_trx(thr);

			offsets = rec_get_offsets(next_rec, index, offsets,
						  true,
						  ULINT_UNDEFINED, &heap);

			/* If innodb_locks_unsafe_for_binlog option is used
			or this session is using READ COMMITTED or lower isolation
			level, we lock only the record, i.e., next-key
			locking is not used. */
			if (srv_locks_unsafe_for_binlog
			    || trx->isolation_level
			    <= TRX_ISO_READ_COMMITTED) {

				if (page_rec_is_supremum(next_rec)) {

					goto skip_lock;
				}

				lock_type = LOCK_REC_NOT_GAP;
			} else {
				lock_type = LOCK_ORDINARY;
			}

			err = sel_set_rec_lock(&plan->pcur,
					       next_rec, index, offsets,
					       node->row_lock_mode,
					       lock_type, thr, &mtr);

			switch (err) {
			case DB_SUCCESS_LOCKED_REC:
				err = DB_SUCCESS;
				/* fall through */
			case DB_SUCCESS:
				break;
			default:
				/* Note that in this case we will store in pcur
				the PREDECESSOR of the record we are waiting
				the lock for */
				goto lock_wait_or_error;
			}
		}
	}

skip_lock:
	if (page_rec_is_infimum(rec)) {

		/* The infimum record on a page cannot be in the result set,
		and neither can a record lock be placed on it: we skip such
		a record. We also increment the cost counter as we may have
		processed yet another page of index. */

		cost_counter++;

		goto next_rec;
	}

	if (rec_is_default_row(rec, index)) {
		/* Skip the 'default row' pseudo-record. */
		cost_counter++;
		goto next_rec;
	}

	if (!consistent_read) {
		/* Try to place a lock on the index record */
		ulint	lock_type;
		trx_t*	trx;

		offsets = rec_get_offsets(rec, index, offsets, true,
					  ULINT_UNDEFINED, &heap);

		trx = thr_get_trx(thr);

		/* If innodb_locks_unsafe_for_binlog option is used
		or this session is using READ COMMITTED or lower isolation level,
		we lock only the record, i.e., next-key locking is
		not used. The same applies to spatial indexes. */
		if (srv_locks_unsafe_for_binlog
		    || trx->isolation_level <= TRX_ISO_READ_COMMITTED
		    || dict_index_is_spatial(index)) {

			if (page_rec_is_supremum(rec)) {

				goto next_rec;
			}

			lock_type = LOCK_REC_NOT_GAP;
		} else {
			lock_type = LOCK_ORDINARY;
		}

		err = sel_set_rec_lock(&plan->pcur,
				       rec, index, offsets,
				       node->row_lock_mode, lock_type,
				       thr, &mtr);

		switch (err) {
		case DB_SUCCESS_LOCKED_REC:
			err = DB_SUCCESS;
			/* fall through */
		case DB_SUCCESS:
			break;
		default:
			goto lock_wait_or_error;
		}
	}

	if (page_rec_is_supremum(rec)) {

		/* A page supremum record cannot be in the result set: skip
		it now when we have placed a possible lock on it */

		goto next_rec;
	}

	ut_ad(page_rec_is_user_rec(rec));

	if (cost_counter > SEL_COST_LIMIT) {

		/* Now that we have placed the necessary locks, we can stop
		for a while and store the cursor position; NOTE that if we
		would store the cursor position BEFORE placing a record lock,
		it might happen that the cursor would jump over some records
		that another transaction could meanwhile insert adjacent to
		the cursor: this would result in the phantom problem. */

		goto stop_for_a_while;
	}

	/* PHASE 2: Check a mixed index mix id if needed */

	if (plan->unique_search && cursor_just_opened) {

		ut_ad(plan->mode == PAGE_CUR_GE);

		/* As the cursor is now placed on a user record after a search
		with the mode PAGE_CUR_GE, the up_match field in the cursor
		tells how many fields in the user record matched to the search
		tuple */

		if (btr_pcur_get_up_match(&(plan->pcur))
		    < plan->n_exact_match) {
			goto table_exhausted;
		}

		/* Ok, no need to test end_conds or mix id */

	}

	/* We are ready to look at a possible new index entry in the result
	set: the cursor is now placed on a user record */

	/* PHASE 3: Get previous version in a consistent read */

	cons_read_requires_clust_rec = FALSE;
	offsets = rec_get_offsets(rec, index, offsets, true,
				  ULINT_UNDEFINED, &heap);

	if (consistent_read) {
		/* This is a non-locking consistent read: if necessary, fetch
		a previous version of the record */

		if (dict_index_is_clust(index)) {

			if (!lock_clust_rec_cons_read_sees(
					rec, index, offsets, node->read_view)) {

				err = row_sel_build_prev_vers(
					node->read_view, index, rec,
					&offsets, &heap, &plan->old_vers_heap,
					&old_vers, &mtr);

				if (err != DB_SUCCESS) {

					goto lock_wait_or_error;
				}

				if (old_vers == NULL) {
					/* The record does not exist
					in our read view. Skip it, but
					first attempt to determine
					whether the index segment we
					are searching through has been
					exhausted. */

					offsets = rec_get_offsets(
						rec, index, offsets, true,
						ULINT_UNDEFINED, &heap);

					/* Fetch the columns needed in
					test conditions. The clustered
					index record is protected by a
					page latch that was acquired
					by row_sel_open_pcur() or
					row_sel_restore_pcur_pos().
					The latch will not be released
					until mtr.commit(). */

					row_sel_fetch_columns(
						index, rec, offsets,
						UT_LIST_GET_FIRST(
							plan->columns));

					if (!row_sel_test_end_conds(plan)) {

						goto table_exhausted;
					}

					goto next_rec;
				}

				/* Continue with the version of the record
				that is visible in the read view. */
				rec = old_vers;
			}
		} else if (!srv_read_only_mode
			   && !lock_sec_rec_cons_read_sees(
				   rec, index, node->read_view)) {

			cons_read_requires_clust_rec = TRUE;
		}
	}

	/* PHASE 4: Test search end conditions and deleted flag */

	/* Fetch the columns needed in test conditions.  The record is
	protected by a page latch that was acquired by
	row_sel_open_pcur() or row_sel_restore_pcur_pos().  The latch
	will not be released until mtr.commit(). */

	row_sel_fetch_columns(index, rec, offsets,
			      UT_LIST_GET_FIRST(plan->columns));

	/* Test the selection end conditions: these can only contain columns
	which already are found in the index, even though the index might be
	non-clustered */

	if (plan->unique_search && cursor_just_opened) {

		/* No test necessary: the test was already made above */

	} else if (!row_sel_test_end_conds(plan)) {

		goto table_exhausted;
	}

	if (rec_get_deleted_flag(rec, dict_table_is_comp(plan->table))
	    && !cons_read_requires_clust_rec) {

		/* The record is delete marked: we can skip it if this is
		not a consistent read which might see an earlier version
		of a non-clustered index record */

		if (plan->unique_search) {

			goto table_exhausted;
		}

		goto next_rec;
	}

	/* PHASE 5: Get the clustered index record, if needed and if we did
	not do the search using the clustered index */

	if (plan->must_get_clust || cons_read_requires_clust_rec) {

		/* It was a non-clustered index and we must fetch also the
		clustered index record */

		err = row_sel_get_clust_rec(node, plan, rec, thr, &clust_rec,
					    &mtr);
		/* The flag is set regardless of err, so that next_rec
		commits &mtr before moving the cursor. */
		mtr_has_extra_clust_latch = TRUE;

		if (err != DB_SUCCESS) {

			goto lock_wait_or_error;
		}

		/* Retrieving the clustered record required a search:
		increment the cost counter */

		cost_counter++;

		if (clust_rec == NULL) {
			/* The record did not exist in the read view */
			ut_ad(consistent_read);

			goto next_rec;
		}

		if (rec_get_deleted_flag(clust_rec,
					 dict_table_is_comp(plan->table))) {
			/* In delete-marked records, DB_TRX_ID must
			always refer to an existing update_undo log record. */
			ut_ad(rec_get_trx_id(clust_rec,
					     dict_table_get_first_index(
						     plan->table)));

			/* The record is delete marked: we can skip it */

			goto next_rec;
		}

		if (node->can_get_updated) {

			btr_pcur_store_position(&(plan->clust_pcur), &mtr);
		}
	}

	/* PHASE 6: Test the rest of search conditions */

	if (!row_sel_test_other_conds(plan)) {

		if (plan->unique_search) {

			goto table_exhausted;
		}

		goto next_rec;
	}

	/* PHASE 7: We found a new qualifying row for the current table; push
	the row if prefetch is on, or move to the next table in the join */

	plan->n_rows_fetched++;

	ut_ad(plan->pcur.latch_mode == BTR_SEARCH_LEAF);

	if ((plan->n_rows_fetched <= SEL_PREFETCH_LIMIT)
	    || plan->unique_search || plan->no_prefetch) {

		/* No prefetch in operation: go to the next table */

		goto next_table;
	}

	sel_enqueue_prefetched_row(plan);

	if (plan->n_rows_prefetched == SEL_MAX_N_PREFETCH) {

		/* The prefetch buffer is now full */

		sel_dequeue_prefetched_row(plan);

		goto next_table;
	}

next_rec:
	if (mtr_has_extra_clust_latch) {

		/* We must commit &mtr if we are moving to the next
		non-clustered index record, because we could break the
		latching order if we would access a different clustered
		index page right away without releasing the previous. */

		goto commit_mtr_for_a_while;
	}

	if (node->asc) {
		moved = btr_pcur_move_to_next(&(plan->pcur), &mtr);
	} else {
		moved = btr_pcur_move_to_prev(&(plan->pcur), &mtr);
	}

	if (!moved) {

		goto table_exhausted;
	}

	cursor_just_opened = FALSE;

	/* END OF RECORD LOOP
	------------------ */
	goto rec_loop;

next_table:
	/* We found a record which satisfies the conditions: we can move to
	the next table or return a row in the result set */

	ut_ad(btr_pcur_is_on_user_rec(&plan->pcur));

	if (plan->unique_search && !node->can_get_updated) {

		/* A unique search yields no further rows, and the cursor
		position need not be stored when the row cannot get
		updated. */
		plan->cursor_at_end = TRUE;
	} else {
		plan->stored_cursor_rec_processed = TRUE;

		btr_pcur_store_position(&(plan->pcur), &mtr);
	}

	mtr.commit();

	mtr_has_extra_clust_latch = FALSE;

next_table_no_mtr:
	/* If we use 'goto' to this label, it means that the row was popped
	from the prefetched rows stack, and &mtr is already committed */

	if (node->fetch_table + 1 == node->n_tables) {

		/* This was the last table of the join: a complete row
		is available. */
		sel_eval_select_list(node);

		if (node->is_aggregate) {

			goto table_loop;
		}

		sel_assign_into_var_values(node->into_list, node);

		thr->run_node = que_node_get_parent(node);

		err = DB_SUCCESS;
		goto func_exit;
	}

	node->fetch_table++;

	/* When we move to the next table, we first reset the plan cursor:
	we do not care about resetting it when we backtrack from a table */

	plan_reset_cursor(sel_node_get_nth_plan(node, node->fetch_table));

	goto table_loop;

table_exhausted:
	/* The table cursor pcur reached the result set end: backtrack to the
	previous table in the join if we do not have cached prefetched rows */

	plan->cursor_at_end = TRUE;

	mtr.commit();

	mtr_has_extra_clust_latch = FALSE;

	if (plan->n_rows_prefetched > 0) {
		/* The table became exhausted during a prefetch */

		sel_dequeue_prefetched_row(plan);

		goto next_table_no_mtr;
	}

table_exhausted_no_mtr:
	if (node->fetch_table == 0) {
		/* The first table of the join is exhausted: the whole
		select is done. */
		err = DB_SUCCESS;

		if (node->is_aggregate && !node->aggregate_already_fetched) {

			node->aggregate_already_fetched = TRUE;

			sel_assign_into_var_values(node->into_list, node);

			thr->run_node = que_node_get_parent(node);
		} else {
			node->state = SEL_NODE_NO_MORE_ROWS;

			thr->run_node = que_node_get_parent(node);
		}

		goto func_exit;
	}

	node->fetch_table--;

	goto table_loop;

stop_for_a_while:
	/* Return control for a while to que_run_threads, so that runaway
	queries can be canceled. NOTE that when we come here, we must, in a
	locking read, have placed the necessary (possibly waiting request)
	record lock on the cursor record or its successor: when we reposition
	the cursor, this record lock guarantees that nobody can meanwhile have
	inserted new records which should have appeared in the result set,
	which would result in the phantom problem. */

	plan->stored_cursor_rec_processed = FALSE;
	btr_pcur_store_position(&(plan->pcur), &mtr);

	mtr.commit();
	ut_ad(!sync_check_iterate(sync_check()));

	err = DB_SUCCESS;
	goto func_exit;

commit_mtr_for_a_while:
	/* Stores the cursor position and commits &mtr; this is used if
	&mtr may contain latches which would break the latching order if
	&mtr would not be committed and the latches released. */

	plan->stored_cursor_rec_processed = TRUE;

	btr_pcur_store_position(&(plan->pcur), &mtr);

	mtr.commit();

	mtr_has_extra_clust_latch = FALSE;
	ut_ad(!sync_check_iterate(dict_sync_check()));

	goto table_loop;

lock_wait_or_error:
	/* See the note at stop_for_a_while: the same holds for this case */

	ut_ad(!btr_pcur_is_before_first_on_page(&plan->pcur) || !node->asc);

	plan->stored_cursor_rec_processed = FALSE;
	btr_pcur_store_position(&(plan->pcur), &mtr);

	mtr.commit();

func_exit:
	/* Common exit: &mtr has been committed on every path that
	reaches this point. */
	ut_ad(!sync_check_iterate(dict_sync_check()));

	if (heap != NULL) {
		mem_heap_free(heap);
	}
	return(err);
}
2236
2237/**********************************************************************//**
2238Performs a select step. This is a high-level function used in SQL execution
2239graphs.
2240@return query thread to run next or NULL */
2241que_thr_t*
2242row_sel_step(
2243/*=========*/
2244 que_thr_t* thr) /*!< in: query thread */
2245{
2246 sel_node_t* node;
2247
2248 ut_ad(thr);
2249
2250 node = static_cast<sel_node_t*>(thr->run_node);
2251
2252 ut_ad(que_node_get_type(node) == QUE_NODE_SELECT);
2253
2254 /* If this is a new time this node is executed (or when execution
2255 resumes after wait for a table intention lock), set intention locks
2256 on the tables, or assign a read view */
2257
2258 if (node->into_list && (thr->prev_node == que_node_get_parent(node))) {
2259
2260 node->state = SEL_NODE_OPEN;
2261 }
2262
2263 if (node->state == SEL_NODE_OPEN) {
2264
2265 /* It may be that the current session has not yet started
2266 its transaction, or it has been committed: */
2267
2268 trx_start_if_not_started_xa(thr_get_trx(thr), false);
2269
2270 plan_reset_cursor(sel_node_get_nth_plan(node, 0));
2271
2272 if (node->consistent_read) {
2273 trx_t *trx = thr_get_trx(thr);
2274 /* Assign a read view for the query */
2275 trx->read_view.open(trx);
2276 node->read_view = trx->read_view.is_open() ?
2277 &trx->read_view : NULL;
2278 } else {
2279 sym_node_t* table_node;
2280 lock_mode i_lock_mode;
2281
2282 if (node->set_x_locks) {
2283 i_lock_mode = LOCK_IX;
2284 } else {
2285 i_lock_mode = LOCK_IS;
2286 }
2287
2288 for (table_node = node->table_list;
2289 table_node != 0;
2290 table_node = static_cast<sym_node_t*>(
2291 que_node_get_next(table_node))) {
2292
2293 dberr_t err = lock_table(
2294 0, table_node->table, i_lock_mode,
2295 thr);
2296
2297 if (err != DB_SUCCESS) {
2298 trx_t* trx;
2299
2300 trx = thr_get_trx(thr);
2301 trx->error_state = err;
2302
2303 return(NULL);
2304 }
2305 }
2306 }
2307
2308 /* If this is an explicit cursor, copy stored procedure
2309 variable values, so that the values cannot change between
2310 fetches (currently, we copy them also for non-explicit
2311 cursors) */
2312
2313 if (node->explicit_cursor
2314 && UT_LIST_GET_FIRST(node->copy_variables)) {
2315
2316 row_sel_copy_input_variable_vals(node);
2317 }
2318
2319 node->state = SEL_NODE_FETCH;
2320 node->fetch_table = 0;
2321
2322 if (node->is_aggregate) {
2323 /* Reset the aggregate total values */
2324 sel_reset_aggregate_vals(node);
2325 }
2326 }
2327
2328 dberr_t err = row_sel(node, thr);
2329
2330 /* NOTE! if queries are parallelized, the following assignment may
2331 have problems; the assignment should be made only if thr is the
2332 only top-level thr in the graph: */
2333
2334 thr->graph->last_sel_node = node;
2335
2336 if (err != DB_SUCCESS) {
2337 thr_get_trx(thr)->error_state = err;
2338
2339 return(NULL);
2340 }
2341
2342 return(thr);
2343}
2344
2345/**********************************************************************//**
2346Performs a fetch for a cursor.
2347@return query thread to run next or NULL */
2348que_thr_t*
2349fetch_step(
2350/*=======*/
2351 que_thr_t* thr) /*!< in: query thread */
2352{
2353 sel_node_t* sel_node;
2354 fetch_node_t* node;
2355
2356 ut_ad(thr);
2357
2358 node = static_cast<fetch_node_t*>(thr->run_node);
2359 sel_node = node->cursor_def;
2360
2361 ut_ad(que_node_get_type(node) == QUE_NODE_FETCH);
2362
2363 if (thr->prev_node != que_node_get_parent(node)) {
2364
2365 if (sel_node->state != SEL_NODE_NO_MORE_ROWS) {
2366
2367 if (node->into_list) {
2368 sel_assign_into_var_values(node->into_list,
2369 sel_node);
2370 } else {
2371 ibool ret = (*node->func->func)(
2372 sel_node, node->func->arg);
2373
2374 if (!ret) {
2375 sel_node->state
2376 = SEL_NODE_NO_MORE_ROWS;
2377 }
2378 }
2379 }
2380
2381 thr->run_node = que_node_get_parent(node);
2382
2383 return(thr);
2384 }
2385
2386 /* Make the fetch node the parent of the cursor definition for
2387 the time of the fetch, so that execution knows to return to this
2388 fetch node after a row has been selected or we know that there is
2389 no row left */
2390
2391 sel_node->common.parent = node;
2392
2393 if (sel_node->state == SEL_NODE_CLOSED) {
2394 ib::error() << "fetch called on a closed cursor";
2395
2396 thr_get_trx(thr)->error_state = DB_ERROR;
2397
2398 return(NULL);
2399 }
2400
2401 thr->run_node = sel_node;
2402
2403 return(thr);
2404}
2405
2406/***********************************************************//**
2407Prints a row in a select result.
2408@return query thread to run next or NULL */
2409que_thr_t*
2410row_printf_step(
2411/*============*/
2412 que_thr_t* thr) /*!< in: query thread */
2413{
2414 row_printf_node_t* node;
2415 sel_node_t* sel_node;
2416 que_node_t* arg;
2417
2418 ut_ad(thr);
2419
2420 node = static_cast<row_printf_node_t*>(thr->run_node);
2421
2422 sel_node = node->sel_node;
2423
2424 ut_ad(que_node_get_type(node) == QUE_NODE_ROW_PRINTF);
2425
2426 if (thr->prev_node == que_node_get_parent(node)) {
2427
2428 /* Reset the cursor */
2429 sel_node->state = SEL_NODE_OPEN;
2430
2431 /* Fetch next row to print */
2432
2433 thr->run_node = sel_node;
2434
2435 return(thr);
2436 }
2437
2438 if (sel_node->state != SEL_NODE_FETCH) {
2439
2440 ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS);
2441
2442 /* No more rows to print */
2443
2444 thr->run_node = que_node_get_parent(node);
2445
2446 return(thr);
2447 }
2448
2449 arg = sel_node->select_list;
2450
2451 while (arg) {
2452 dfield_print_also_hex(que_node_get_val(arg));
2453
2454 fputs(" ::: ", stderr);
2455
2456 arg = que_node_get_next(arg);
2457 }
2458
2459 putc('\n', stderr);
2460
2461 /* Fetch next row to print */
2462
2463 thr->run_node = sel_node;
2464
2465 return(thr);
2466}
2467
2468/****************************************************************//**
2469Converts a key value stored in MySQL format to an Innobase dtuple. The last
2470field of the key value may be just a prefix of a fixed length field: hence
2471the parameter key_len. But currently we do not allow search keys where the
2472last field is only a prefix of the full key field len and print a warning if
2473such appears. A counterpart of this function is
2474ha_innobase::store_key_val_for_row() in ha_innodb.cc. */
2475void
2476row_sel_convert_mysql_key_to_innobase(
2477/*==================================*/
2478 dtuple_t* tuple, /*!< in/out: tuple where to build;
2479 NOTE: we assume that the type info
2480 in the tuple is already according
2481 to index! */
2482 byte* buf, /*!< in: buffer to use in field
2483 conversions; NOTE that dtuple->data
2484 may end up pointing inside buf so
2485 do not discard that buffer while
2486 the tuple is being used. See
2487 row_mysql_store_col_in_innobase_format()
2488 in the case of DATA_INT */
2489 ulint buf_len, /*!< in: buffer length */
2490 dict_index_t* index, /*!< in: index of the key value */
2491 const byte* key_ptr, /*!< in: MySQL key value */
2492 ulint key_len) /*!< in: MySQL key value length */
2493{
2494 byte* original_buf = buf;
2495 const byte* original_key_ptr = key_ptr;
2496 dict_field_t* field;
2497 dfield_t* dfield;
2498 ulint data_offset;
2499 ulint data_len;
2500 ulint data_field_len;
2501 ibool is_null;
2502 const byte* key_end;
2503 ulint n_fields = 0;
2504
2505 /* For documentation of the key value storage format in MySQL, see
2506 ha_innobase::store_key_val_for_row() in ha_innodb.cc. */
2507
2508 key_end = key_ptr + key_len;
2509
2510 /* Permit us to access any field in the tuple (ULINT_MAX): */
2511
2512 dtuple_set_n_fields(tuple, ULINT_MAX);
2513
2514 dfield = dtuple_get_nth_field(tuple, 0);
2515 field = dict_index_get_nth_field(index, 0);
2516
2517 if (UNIV_UNLIKELY(dfield_get_type(dfield)->mtype == DATA_SYS)) {
2518 /* A special case: we are looking for a position in the
2519 generated clustered index which InnoDB automatically added
2520 to a table with no primary key: the first and the only
2521 ordering column is ROW_ID which InnoDB stored to the key_ptr
2522 buffer. */
2523
2524 ut_a(key_len == DATA_ROW_ID_LEN);
2525
2526 dfield_set_data(dfield, key_ptr, DATA_ROW_ID_LEN);
2527
2528 dtuple_set_n_fields(tuple, 1);
2529
2530 return;
2531 }
2532
2533 while (key_ptr < key_end) {
2534
2535 ulint type = dfield_get_type(dfield)->mtype;
2536 ut_a(field->col->mtype == type);
2537
2538 data_offset = 0;
2539 is_null = FALSE;
2540
2541 if (!(dfield_get_type(dfield)->prtype & DATA_NOT_NULL)) {
2542 /* The first byte in the field tells if this is
2543 an SQL NULL value */
2544
2545 data_offset = 1;
2546
2547 if (*key_ptr != 0) {
2548 dfield_set_null(dfield);
2549
2550 is_null = TRUE;
2551 }
2552 }
2553
2554 /* Calculate data length and data field total length */
2555 if (DATA_LARGE_MTYPE(type) || DATA_GEOMETRY_MTYPE(type)) {
2556
2557 /* For R-tree index, data length should be the
2558 total size of the wkb data.*/
2559 if (dict_index_is_spatial(index)) {
2560 ut_ad(DATA_GEOMETRY_MTYPE(type));
2561 data_len = key_len;
2562 data_field_len = data_offset + data_len;
2563 } else {
2564 /* The key field is a column prefix of a BLOB
2565 or TEXT. */
2566
2567 ut_a(field->prefix_len > 0);
2568
2569 /* MySQL stores the actual data length to the
2570 first 2 bytes after the optional SQL NULL
2571 marker byte. The storage format is
2572 little-endian, that is, the most significant
2573 byte at a higher address. In UTF-8, MySQL
2574 seems to reserve field->prefix_len bytes for
2575 storing this field in the key value buffer,
2576 even though the actual value only takes data
2577 len bytes from the start. */
2578
2579 data_len = ulint(key_ptr[data_offset])
2580 | ulint(key_ptr[data_offset + 1]) << 8;
2581 data_field_len = data_offset + 2
2582 + field->prefix_len;
2583
2584 data_offset += 2;
2585
2586 /* Now that we know the length, we store the
2587 column value like it would be a fixed char
2588 field */
2589 }
2590
2591
2592 } else if (field->prefix_len > 0) {
2593 /* Looks like MySQL pads unused end bytes in the
2594 prefix with space. Therefore, also in UTF-8, it is ok
2595 to compare with a prefix containing full prefix_len
2596 bytes, and no need to take at most prefix_len / 3
2597 UTF-8 characters from the start.
2598 If the prefix is used as the upper end of a LIKE
2599 'abc%' query, then MySQL pads the end with chars
2600 0xff. TODO: in that case does it any harm to compare
2601 with the full prefix_len bytes. How do characters
2602 0xff in UTF-8 behave? */
2603
2604 data_len = field->prefix_len;
2605 data_field_len = data_offset + data_len;
2606 } else {
2607 data_len = dfield_get_type(dfield)->len;
2608 data_field_len = data_offset + data_len;
2609 }
2610
2611 if ((dtype_get_mysql_type(dfield_get_type(dfield))
2612 == DATA_MYSQL_TRUE_VARCHAR)
2613 && (type != DATA_INT)) {
2614 /* In a MySQL key value format, a true VARCHAR is
2615 always preceded by 2 bytes of a length field.
2616 dfield_get_type(dfield)->len returns the maximum
2617 'payload' len in bytes. That does not include the
2618 2 bytes that tell the actual data length.
2619
2620 We added the check != DATA_INT to make sure we do
2621 not treat MySQL ENUM or SET as a true VARCHAR! */
2622
2623 data_len += 2;
2624 data_field_len += 2;
2625 }
2626
2627 /* Storing may use at most data_len bytes of buf */
2628
2629 if (UNIV_LIKELY(!is_null)) {
2630 buf = row_mysql_store_col_in_innobase_format(
2631 dfield, buf,
2632 FALSE, /* MySQL key value format col */
2633 key_ptr + data_offset, data_len,
2634 dict_table_is_comp(index->table));
2635 ut_a(buf <= original_buf + buf_len);
2636 }
2637
2638 key_ptr += data_field_len;
2639
2640 if (UNIV_UNLIKELY(key_ptr > key_end)) {
2641 /* The last field in key was not a complete key field
2642 but a prefix of it.
2643
2644 Print a warning about this! HA_READ_PREFIX_LAST does
2645 not currently work in InnoDB with partial-field key
2646 value prefixes. Since MySQL currently uses a padding
2647 trick to calculate LIKE 'abc%' type queries there
2648 should never be partial-field prefixes in searches. */
2649
2650 ib::warn() << "Using a partial-field key prefix in"
2651 " search, index " << index->name
2652 << " of table " << index->table->name
2653 << ". Last data field length "
2654 << data_field_len << " bytes, key ptr now"
2655 " exceeds key end by " << (key_ptr - key_end)
2656 << " bytes. Key value in the MySQL format:";
2657
2658 ut_print_buf(stderr, original_key_ptr, key_len);
2659 putc('\n', stderr);
2660
2661 if (!is_null) {
2662 ulint len = dfield_get_len(dfield);
2663 dfield_set_len(dfield, len
2664 - (ulint) (key_ptr - key_end));
2665 }
2666 ut_ad(0);
2667 }
2668
2669 n_fields++;
2670 field++;
2671 dfield++;
2672 }
2673
2674 ut_a(buf <= original_buf + buf_len);
2675
2676 /* We set the length of tuple to n_fields: we assume that the memory
2677 area allocated for it is big enough (usually bigger than n_fields). */
2678
2679 dtuple_set_n_fields(tuple, n_fields);
2680}
2681
2682/**************************************************************//**
2683Stores the row id to the prebuilt struct. */
2684static
2685void
2686row_sel_store_row_id_to_prebuilt(
2687/*=============================*/
2688 row_prebuilt_t* prebuilt, /*!< in/out: prebuilt */
2689 const rec_t* index_rec, /*!< in: record */
2690 const dict_index_t* index, /*!< in: index of the record */
2691 const ulint* offsets) /*!< in: rec_get_offsets
2692 (index_rec, index) */
2693{
2694 const byte* data;
2695 ulint len;
2696
2697 ut_ad(rec_offs_validate(index_rec, index, offsets));
2698
2699 data = rec_get_nth_field(
2700 index_rec, offsets,
2701 dict_index_get_sys_col_pos(index, DATA_ROW_ID), &len);
2702
2703 if (UNIV_UNLIKELY(len != DATA_ROW_ID_LEN)) {
2704
2705 ib::error() << "Row id field is wrong length " << len << " in"
2706 " index " << index->name
2707 << " of table " << index->table->name
2708 << ", Field number "
2709 << dict_index_get_sys_col_pos(index, DATA_ROW_ID)
2710 << ", record:";
2711
2712 rec_print_new(stderr, index_rec, offsets);
2713 putc('\n', stderr);
2714 ut_error;
2715 }
2716
2717 ut_memcpy(prebuilt->row_id, data, len);
2718}
2719
/**************************************************************//**
Stores a non-SQL-NULL field in the MySQL format. The counterpart of this
function is row_mysql_store_col_in_innobase_format() in row0mysql.cc.
The conversion performed depends on templ->type; see the individual cases.
@param[out]	dest		buffer where the converted field is written;
				templ->mysql_col_len bytes are asserted to
				be writable
@param[in]	templ		MySQL column template that describes the
				column (type, lengths, charset widths)
@param[in]	index		index of the field; present in debug builds
				only and used solely by assertions
@param[in]	field_no	field number in index; present in debug
				builds only and used solely by assertions
@param[in]	data		field data in the InnoDB format
@param[in]	len		length of data in bytes; must not be
				UNIV_SQL_NULL */
void
row_sel_field_store_in_mysql_format_func(
	byte*				dest,
	const mysql_row_templ_t*	templ,
#ifdef UNIV_DEBUG
	const dict_index_t*		index,
	ulint				field_no,
#endif /* UNIV_DEBUG */
	const byte*			data,
	ulint				len)
{
	byte*			ptr;
#ifdef UNIV_DEBUG
	/* Index field descriptor used by the assertions below; NULL for
	a virtual column. */
	const dict_field_t*	field
		= templ->is_virtual
			 ? NULL : dict_index_get_nth_field(index, field_no);
#endif /* UNIV_DEBUG */

	ut_ad(len != UNIV_SQL_NULL);
	UNIV_MEM_ASSERT_RW(data, len);
	UNIV_MEM_ASSERT_W(dest, templ->mysql_col_len);
	UNIV_MEM_INVALID(dest, templ->mysql_col_len);

	switch (templ->type) {
		/* Declared ahead of the first case label so that they are
		visible in the cases below; both are assigned in the
		VARCHAR/BINARY case before they are read. */
		const byte*	field_end;
		byte*		pad;
	case DATA_INT:
		/* Convert integer data from Innobase to a little-endian
		format, sign bit restored to normal */

		/* Copy the bytes in reverse order: data[0] goes to
		dest[len - 1], data[1] to dest[len - 2], and so on.
		NOTE(review): the loop decrements ptr before comparing it
		with dest, so it relies on len > 0 -- confirm callers never
		pass an empty integer. */
		ptr = dest + len;

		for (;;) {
			ptr--;
			*ptr = *data;
			if (ptr == dest) {
				break;
			}
			data++;
		}

		if (!templ->is_unsigned) {
			/* Flip the top bit of the last destination byte */
			dest[len - 1] = (byte) (dest[len - 1] ^ 128);
		}

		ut_ad(templ->mysql_col_len == len);
		break;

	case DATA_VARCHAR:
	case DATA_VARMYSQL:
	case DATA_BINARY:
		field_end = dest + templ->mysql_col_len;

		if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR) {
			/* This is a >= 5.0.3 type true VARCHAR. Store the
			length of the data to the first byte or the first
			two bytes of dest. */

			/* dest continues from the pointer returned by the
			length-store call (presumably just past the length
			bytes -- see row_mysql_store_true_var_len()). */
			dest = row_mysql_store_true_var_len(
				dest, len, templ->mysql_length_bytes);
			/* Copy the actual data. Leave the rest of the
			buffer uninitialized. */
			memcpy(dest, data, len);
			break;
		}

		/* Copy the actual data */
		ut_memcpy(dest, data, len);

		/* Pad with trailing spaces. */

		pad = dest + len;

		ut_ad(templ->mbminlen <= templ->mbmaxlen);

		/* We treat some Unicode charset strings specially. */
		switch (templ->mbminlen) {
		case 4:
			/* InnoDB should never have stripped partial
			UTF-32 characters. */
			ut_a(!(len & 3));
			break;
		case 2:
			/* A space char is two bytes,
			0x0020 in UCS2 and UTF-16 */

			if (UNIV_UNLIKELY(len & 1)) {
				/* A 0x20 has been stripped from the column.
				Pad it back. */

				if (pad < field_end) {
					*pad++ = 0x20;
				}
			}
		}

		/* Fill the remainder of the column, up to field_end */
		row_mysql_pad_col(templ->mbminlen, pad,
				  ulint(field_end - pad));
		break;

	case DATA_BLOB:
		/* Store a pointer to the BLOB buffer to dest: the BLOB was
		already copied to the buffer in row_sel_store_mysql_rec */

		row_mysql_store_blob_ref(dest, templ->mysql_col_len, data,
					 len);
		break;

	case DATA_GEOMETRY:
		/* We store all geometry data as BLOB data at server layer. */
		row_mysql_store_geometry(dest, templ->mysql_col_len, data, len);
		break;

	case DATA_MYSQL:
		memcpy(dest, data, len);

		ut_ad(templ->mysql_col_len >= len);
		ut_ad(templ->mbmaxlen >= templ->mbminlen);

		/* If field_no equals to templ->icp_rec_field_no,
		we are examining a row pointed by "icp_rec_field_no".
		There is possibility that icp_rec_field_no refers to
		a field in a secondary index while templ->rec_field_no
		points to field in a primary index. The length
		should still be equal, unless the field pointed
		by icp_rec_field_no has a prefix */
		ut_ad(templ->mbmaxlen > templ->mbminlen
		      || templ->mysql_col_len == len
		      || (field_no == templ->icp_rec_field_no
			  && field->prefix_len > 0));

		/* The following assertion would fail for old tables
		containing UTF-8 ENUM columns due to Bug #9526. */
		ut_ad(!templ->mbmaxlen
		      || !(templ->mysql_col_len % templ->mbmaxlen));
		ut_ad(len * templ->mbmaxlen >= templ->mysql_col_len
		      || (field_no == templ->icp_rec_field_no
			  && field->prefix_len > 0)
		      || templ->rec_field_is_prefix);

		ut_ad(templ->is_virtual
		      || !(field->prefix_len % templ->mbmaxlen));

		if (templ->mbminlen == 1 && templ->mbmaxlen != 1) {
			/* Pad with spaces. This undoes the stripping
			done in row0mysql.cc, function
			row_mysql_store_col_in_innobase_format(). */

			memset(dest + len, 0x20, templ->mysql_col_len - len);
		}
		break;

	default:
		/* In non-debug builds only the default label exists here,
		so every remaining type is copied verbatim by the memcpy
		below. In debug builds the labels inside the #ifdef
		separate the types that must never reach this function
		(which assert and then fall through) from the valid
		fixed-format types. */
#ifdef UNIV_DEBUG
	case DATA_SYS_CHILD:
	case DATA_SYS:
		/* These column types should never be shipped to MySQL. */
		ut_ad(0);
		/* fall through */

	case DATA_CHAR:
	case DATA_FIXBINARY:
	case DATA_FLOAT:
	case DATA_DOUBLE:
	case DATA_DECIMAL:
		/* Above are the valid column types for MySQL data. */
#endif /* UNIV_DEBUG */
		/* The data length must equal the prefix length for a
		prefix field, or the full MySQL column length otherwise. */
		ut_ad((templ->is_virtual && !field)
		      || (field && field->prefix_len
			  ? field->prefix_len == len
			  : templ->mysql_col_len == len));
		memcpy(dest, data, len);
	}
}
2897
#ifdef UNIV_DEBUG
/** Convert a field from Innobase format to MySQL format.
Debug build: all seven arguments, including the index (i), are forwarded
to row_sel_store_mysql_field_func(). */
# define row_sel_store_mysql_field(m,p,r,i,o,f,t) \
	row_sel_store_mysql_field_func(m,p,r,i,o,f,t)
#else /* UNIV_DEBUG */
/** Convert a field from Innobase format to MySQL format.
Non-debug build: the index argument (i) is accepted by the macro but is
not forwarded to row_sel_store_mysql_field_func(). */
# define row_sel_store_mysql_field(m,p,r,i,o,f,t) \
	row_sel_store_mysql_field_func(m,p,r,o,f,t)
#endif /* UNIV_DEBUG */
2907/** Convert a field in the Innobase format to a field in the MySQL format.
2908@param[out] mysql_rec record in the MySQL format
2909@param[in,out] prebuilt prebuilt struct
2910@param[in] rec InnoDB record; must be protected
2911 by a page latch
2912@param[in] index index of rec
2913@param[in] offsets array returned by rec_get_offsets()
2914@param[in] field_no templ->rec_field_no or
2915 templ->clust_rec_field_no
2916 or templ->icp_rec_field_no
2917@param[in] templ row template
2918*/
2919static MY_ATTRIBUTE((warn_unused_result))
2920ibool
2921row_sel_store_mysql_field_func(
2922 byte* mysql_rec,
2923 row_prebuilt_t* prebuilt,
2924 const rec_t* rec,
2925#ifdef UNIV_DEBUG
2926 const dict_index_t* index,
2927#endif
2928 const ulint* offsets,
2929 ulint field_no,
2930 const mysql_row_templ_t*templ)
2931{
2932 DBUG_ENTER("row_sel_store_mysql_field_func");
2933
2934 const byte* data;
2935 ulint len;
2936
2937 ut_ad(prebuilt->default_rec);
2938 ut_ad(templ);
2939 ut_ad(templ >= prebuilt->mysql_template);
2940 ut_ad(templ < &prebuilt->mysql_template[prebuilt->n_template]);
2941 ut_ad(field_no == templ->clust_rec_field_no
2942 || field_no == templ->rec_field_no
2943 || field_no == templ->icp_rec_field_no);
2944 ut_ad(rec_offs_validate(rec, index, offsets));
2945
2946 if (UNIV_UNLIKELY(rec_offs_nth_extern(offsets, field_no) != 0)) {
2947
2948 mem_heap_t* heap;
2949 /* Copy an externally stored field to a temporary heap */
2950
2951 ut_ad(field_no == templ->clust_rec_field_no);
2952
2953 if (DATA_LARGE_MTYPE(templ->type)) {
2954 if (prebuilt->blob_heap == NULL) {
2955 prebuilt->blob_heap = mem_heap_create(
2956 srv_page_size);
2957 }
2958
2959 heap = prebuilt->blob_heap;
2960 } else {
2961 heap = mem_heap_create(srv_page_size);
2962 }
2963
2964 /* NOTE: if we are retrieving a big BLOB, we may
2965 already run out of memory in the next call, which
2966 causes an assert */
2967
2968 data = btr_rec_copy_externally_stored_field(
2969 rec, offsets,
2970 dict_table_page_size(prebuilt->table),
2971 field_no, &len, heap);
2972
2973 if (UNIV_UNLIKELY(!data)) {
2974 /* The externally stored field was not written
2975 yet. This record should only be seen by
2976 recv_recovery_rollback_active() or any
2977 TRX_ISO_READ_UNCOMMITTED transactions. */
2978
2979 if (heap != prebuilt->blob_heap) {
2980 mem_heap_free(heap);
2981 }
2982
2983 ut_a(prebuilt->trx->isolation_level
2984 == TRX_ISO_READ_UNCOMMITTED);
2985 DBUG_RETURN(FALSE);
2986 }
2987
2988 ut_a(len != UNIV_SQL_NULL);
2989
2990 row_sel_field_store_in_mysql_format(
2991 mysql_rec + templ->mysql_col_offset,
2992 templ, index, field_no, data, len);
2993
2994 if (heap != prebuilt->blob_heap) {
2995 mem_heap_free(heap);
2996 }
2997 } else {
2998 /* The field is stored in the index record, or
2999 in the 'default row' for instant ADD COLUMN. */
3000
3001 if (rec_offs_nth_default(offsets, field_no)) {
3002 ut_ad(dict_index_is_clust(index));
3003 ut_ad(index->is_instant());
3004 const dict_index_t* clust_index
3005 = dict_table_get_first_index(prebuilt->table);
3006 ut_ad(index == clust_index);
3007 data = clust_index->instant_field_value(field_no,&len);
3008 } else {
3009 data = rec_get_nth_field(rec, offsets, field_no, &len);
3010 }
3011
3012 if (len == UNIV_SQL_NULL) {
3013 /* MySQL assumes that the field for an SQL
3014 NULL value is set to the default value. */
3015 ut_ad(templ->mysql_null_bit_mask);
3016
3017 UNIV_MEM_ASSERT_RW(prebuilt->default_rec
3018 + templ->mysql_col_offset,
3019 templ->mysql_col_len);
3020 mysql_rec[templ->mysql_null_byte_offset]
3021 |= (byte) templ->mysql_null_bit_mask;
3022 memcpy(mysql_rec + templ->mysql_col_offset,
3023 (const byte*) prebuilt->default_rec
3024 + templ->mysql_col_offset,
3025 templ->mysql_col_len);
3026 DBUG_RETURN(TRUE);
3027 }
3028
3029 if (DATA_LARGE_MTYPE(templ->type)
3030 || DATA_GEOMETRY_MTYPE(templ->type)) {
3031
3032 /* It is a BLOB field locally stored in the
3033 InnoDB record: we MUST copy its contents to
3034 prebuilt->blob_heap here because
3035 row_sel_field_store_in_mysql_format() stores a
3036 pointer to the data, and the data passed to us
3037 will be invalid as soon as the
3038 mini-transaction is committed and the page
3039 latch on the clustered index page is
3040 released. */
3041
3042 if (prebuilt->blob_heap == NULL) {
3043 prebuilt->blob_heap = mem_heap_create(
3044 srv_page_size);
3045 DBUG_PRINT("anna", ("blob_heap allocated: %p",
3046 prebuilt->blob_heap));
3047 }
3048
3049 data = static_cast<byte*>(
3050 mem_heap_dup(prebuilt->blob_heap, data, len));
3051 }
3052
3053 row_sel_field_store_in_mysql_format(
3054 mysql_rec + templ->mysql_col_offset,
3055 templ, index, field_no, data, len);
3056 }
3057
3058 ut_ad(len != UNIV_SQL_NULL);
3059
3060 if (templ->mysql_null_bit_mask) {
3061 /* It is a nullable column with a non-NULL
3062 value */
3063 mysql_rec[templ->mysql_null_byte_offset]
3064 &= ~(byte) templ->mysql_null_bit_mask;
3065 }
3066
3067 DBUG_RETURN(TRUE);
3068}
3069
3070/** Convert a row in the Innobase format to a row in the MySQL format.
3071Note that the template in prebuilt may advise us to copy only a few
3072columns to mysql_rec, other columns are left blank. All columns may not
3073be needed in the query.
3074@param[out] mysql_rec row in the MySQL format
3075@param[in] prebuilt prebuilt structure
3076@param[in] rec Innobase record in the index
3077 which was described in prebuilt's
3078 template, or in the clustered index;
3079 must be protected by a page latch
3080@param[in] vrow virtual columns
3081@param[in] rec_clust whether the rec in the clustered index
3082@param[in] index index of rec
3083@param[in] offsets array returned by rec_get_offsets(rec)
3084@return TRUE on success, FALSE if not all columns could be retrieved */
3085static MY_ATTRIBUTE((warn_unused_result))
3086ibool
3087row_sel_store_mysql_rec(
3088 byte* mysql_rec,
3089 row_prebuilt_t* prebuilt,
3090 const rec_t* rec,
3091 const dtuple_t* vrow,
3092 bool rec_clust,
3093 const dict_index_t* index,
3094 const ulint* offsets)
3095{
3096 DBUG_ENTER("row_sel_store_mysql_rec");
3097
3098 ut_ad(rec_clust || index == prebuilt->index);
3099 ut_ad(!rec_clust || dict_index_is_clust(index));
3100
3101 if (UNIV_LIKELY_NULL(prebuilt->blob_heap)) {
3102 row_mysql_prebuilt_free_blob_heap(prebuilt);
3103 }
3104
3105 for (ulint i = 0; i < prebuilt->n_template; i++) {
3106 const mysql_row_templ_t*templ = &prebuilt->mysql_template[i];
3107
3108 if (templ->is_virtual && dict_index_is_clust(index)) {
3109
3110 /* Skip virtual columns if it is not a covered
3111 search or virtual key read is not requested. */
3112 if (!dict_index_has_virtual(prebuilt->index)
3113 || (!prebuilt->read_just_key
3114 && !prebuilt->m_read_virtual_key)
3115 || !rec_clust) {
3116 continue;
3117 }
3118
3119 dict_v_col_t* col;
3120 col = dict_table_get_nth_v_col(
3121 index->table, templ->clust_rec_field_no);
3122
3123 ut_ad(vrow);
3124
3125 const dfield_t* dfield = dtuple_get_nth_v_field(
3126 vrow, col->v_pos);
3127
3128 /* If this is a partitioned table, it might request
3129 InnoDB to fill out virtual column data for serach
3130 index key values while other non key columns are also
3131 getting selected. The non-key virtual columns may
3132 not be materialized and we should skip them. */
3133 if (dfield_get_type(dfield)->mtype == DATA_MISSING) {
3134#ifdef UNIV_DEBUG
3135 ulint prefix;
3136#endif /* UNIV_DEBUG */
3137 ut_ad(prebuilt->m_read_virtual_key);
3138
3139 /* If it is part of index key the data should
3140 have been materialized. */
3141 ut_ad(dict_index_get_nth_col_or_prefix_pos(
3142 prebuilt->index, col->v_pos, false,
3143 true, &prefix) == ULINT_UNDEFINED);
3144
3145 continue;
3146 }
3147
3148 if (dfield->len == UNIV_SQL_NULL) {
3149 mysql_rec[templ->mysql_null_byte_offset]
3150 |= (byte) templ->mysql_null_bit_mask;
3151 memcpy(mysql_rec
3152 + templ->mysql_col_offset,
3153 (const byte*) prebuilt->default_rec
3154 + templ->mysql_col_offset,
3155 templ->mysql_col_len);
3156 } else {
3157 row_sel_field_store_in_mysql_format(
3158 mysql_rec + templ->mysql_col_offset,
3159 templ, index, templ->clust_rec_field_no,
3160 (const byte*)dfield->data, dfield->len);
3161 if (templ->mysql_null_bit_mask) {
3162 mysql_rec[
3163 templ->mysql_null_byte_offset]
3164 &= ~(byte) templ->mysql_null_bit_mask;
3165 }
3166 }
3167
3168 continue;
3169 }
3170
3171 const ulint field_no
3172 = rec_clust
3173 ? templ->clust_rec_field_no
3174 : templ->rec_field_no;
3175 /* We should never deliver column prefixes to MySQL,
3176 except for evaluating innobase_index_cond(). */
3177 /* ...actually, we do want to do this in order to
3178 support the prefix query optimization.
3179
3180 ut_ad(dict_index_get_nth_field(index, field_no)->prefix_len
3181 == 0);
3182
3183 ...so we disable this assert. */
3184
3185 if (!row_sel_store_mysql_field(mysql_rec, prebuilt,
3186 rec, index, offsets,
3187 field_no, templ)) {
3188
3189 DBUG_RETURN(FALSE);
3190 }
3191 }
3192
3193 /* FIXME: We only need to read the doc_id if an FTS indexed
3194 column is being updated.
3195 NOTE, the record can be cluster or secondary index record.
3196 if secondary index is used then FTS_DOC_ID column should be part
3197 of this index. */
3198 if (dict_table_has_fts_index(prebuilt->table)) {
3199 if (dict_index_is_clust(index)
3200 || prebuilt->fts_doc_id_in_read_set) {
3201 prebuilt->fts_doc_id = fts_get_doc_id_from_rec(
3202 prebuilt->table, rec, index, NULL);
3203 }
3204 }
3205
3206 DBUG_RETURN(TRUE);
3207}
3208
3209/*********************************************************************//**
3210Builds a previous version of a clustered index record for a consistent read
3211@return DB_SUCCESS or error code */
3212static MY_ATTRIBUTE((warn_unused_result))
3213dberr_t
3214row_sel_build_prev_vers_for_mysql(
3215/*==============================*/
3216 ReadView* read_view, /*!< in: read view */
3217 dict_index_t* clust_index, /*!< in: clustered index */
3218 row_prebuilt_t* prebuilt, /*!< in: prebuilt struct */
3219 const rec_t* rec, /*!< in: record in a clustered index */
3220 ulint** offsets, /*!< in/out: offsets returned by
3221 rec_get_offsets(rec, clust_index) */
3222 mem_heap_t** offset_heap, /*!< in/out: memory heap from which
3223 the offsets are allocated */
3224 rec_t** old_vers, /*!< out: old version, or NULL if the
3225 record does not exist in the view:
3226 i.e., it was freshly inserted
3227 afterwards */
3228 const dtuple_t**vrow, /*!< out: dtuple to hold old virtual
3229 column data */
3230 mtr_t* mtr) /*!< in: mtr */
3231{
3232 dberr_t err;
3233
3234 if (prebuilt->old_vers_heap) {
3235 mem_heap_empty(prebuilt->old_vers_heap);
3236 } else {
3237 prebuilt->old_vers_heap = mem_heap_create(200);
3238 }
3239
3240 err = row_vers_build_for_consistent_read(
3241 rec, mtr, clust_index, offsets, read_view, offset_heap,
3242 prebuilt->old_vers_heap, old_vers, vrow);
3243 return(err);
3244}
3245
3246/*********************************************************************//**
3247Retrieves the clustered index record corresponding to a record in a
3248non-clustered index. Does the necessary locking. Used in the MySQL
3249interface.
3250@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */
3251static MY_ATTRIBUTE((warn_unused_result))
3252dberr_t
3253row_sel_get_clust_rec_for_mysql(
3254/*============================*/
3255 row_prebuilt_t* prebuilt,/*!< in: prebuilt struct in the handle */
3256 dict_index_t* sec_index,/*!< in: secondary index where rec resides */
3257 const rec_t* rec, /*!< in: record in a non-clustered index; if
3258 this is a locking read, then rec is not
3259 allowed to be delete-marked, and that would
3260 not make sense either */
3261 que_thr_t* thr, /*!< in: query thread */
3262 const rec_t** out_rec,/*!< out: clustered record or an old version of
3263 it, NULL if the old version did not exist
3264 in the read view, i.e., it was a fresh
3265 inserted version */
3266 ulint** offsets,/*!< in: offsets returned by
3267 rec_get_offsets(rec, sec_index);
3268 out: offsets returned by
3269 rec_get_offsets(out_rec, clust_index) */
3270 mem_heap_t** offset_heap,/*!< in/out: memory heap from which
3271 the offsets are allocated */
3272 const dtuple_t**vrow, /*!< out: virtual column to fill */
3273 mtr_t* mtr) /*!< in: mtr used to get access to the
3274 non-clustered record; the same mtr is used to
3275 access the clustered index */
3276{
3277 dict_index_t* clust_index;
3278 const rec_t* clust_rec;
3279 rec_t* old_vers;
3280 dberr_t err;
3281 trx_t* trx;
3282
3283 *out_rec = NULL;
3284 trx = thr_get_trx(thr);
3285
3286 srv_stats.n_sec_rec_cluster_reads.inc(
3287 thd_get_thread_id(trx->mysql_thd));
3288
3289 row_build_row_ref_in_tuple(prebuilt->clust_ref, rec,
3290 sec_index, *offsets);
3291
3292 clust_index = dict_table_get_first_index(sec_index->table);
3293
3294 btr_pcur_open_with_no_init(clust_index, prebuilt->clust_ref,
3295 PAGE_CUR_LE, BTR_SEARCH_LEAF,
3296 prebuilt->clust_pcur, 0, mtr);
3297
3298 clust_rec = btr_pcur_get_rec(prebuilt->clust_pcur);
3299
3300 prebuilt->clust_pcur->trx_if_known = trx;
3301
3302 /* Note: only if the search ends up on a non-infimum record is the
3303 low_match value the real match to the search tuple */
3304
3305 if (!page_rec_is_user_rec(clust_rec)
3306 || btr_pcur_get_low_match(prebuilt->clust_pcur)
3307 < dict_index_get_n_unique(clust_index)) {
3308 btr_cur_t* btr_cur = btr_pcur_get_btr_cur(prebuilt->pcur);
3309
3310 /* If this is a spatial index scan, and we are reading
3311 from a shadow buffer, the record could be already
3312 deleted (due to rollback etc.). So get the original
3313 page and verify that */
3314 if (dict_index_is_spatial(sec_index)
3315 && btr_cur->rtr_info->matches
3316 && (page_align(rec)
3317 == btr_cur->rtr_info->matches->block.frame
3318 || rec != btr_pcur_get_rec(prebuilt->pcur))) {
3319#ifdef UNIV_DEBUG
3320 rtr_info_t* rtr_info = btr_cur->rtr_info;
3321 mutex_enter(&rtr_info->matches->rtr_match_mutex);
3322 /* The page could be deallocated (by rollback etc.) */
3323 if (!rtr_info->matches->valid) {
3324 mutex_exit(&rtr_info->matches->rtr_match_mutex);
3325 clust_rec = NULL;
3326
3327 err = DB_SUCCESS;
3328 goto func_exit;
3329 }
3330 mutex_exit(&rtr_info->matches->rtr_match_mutex);
3331
3332 if (rec_get_deleted_flag(rec,
3333 dict_table_is_comp(sec_index->table))
3334 && prebuilt->select_lock_type == LOCK_NONE) {
3335
3336 clust_rec = NULL;
3337
3338 err = DB_SUCCESS;
3339 goto func_exit;
3340 }
3341
3342 if (rec != btr_pcur_get_rec(prebuilt->pcur)) {
3343 clust_rec = NULL;
3344
3345 err = DB_SUCCESS;
3346 goto func_exit;
3347 }
3348
3349 /* FIXME: Why is this block not the
3350 same as btr_pcur_get_block(prebuilt->pcur),
3351 and is it not unsafe to use RW_NO_LATCH here? */
3352 buf_block_t* block = buf_page_get_gen(
3353 btr_pcur_get_block(prebuilt->pcur)->page.id,
3354 dict_table_page_size(sec_index->table),
3355 RW_NO_LATCH, NULL, BUF_GET,
3356 __FILE__, __LINE__, mtr, &err);
3357 mem_heap_t* heap = mem_heap_create(256);
3358 dtuple_t* tuple = dict_index_build_data_tuple(
3359 rec, sec_index, true,
3360 sec_index->n_fields, heap);
3361 page_cur_t page_cursor;
3362
3363 ulint low_match = page_cur_search(
3364 block, sec_index, tuple,
3365 PAGE_CUR_LE, &page_cursor);
3366
3367 ut_ad(low_match < dtuple_get_n_fields_cmp(tuple));
3368 mem_heap_free(heap);
3369 clust_rec = NULL;
3370
3371 err = DB_SUCCESS;
3372 goto func_exit;
3373#endif /* UNIV_DEBUG */
3374 } else if (!rec_get_deleted_flag(rec,
3375 dict_table_is_comp(sec_index->table))
3376 || prebuilt->select_lock_type != LOCK_NONE) {
3377 /* In a rare case it is possible that no clust
3378 rec is found for a delete-marked secondary index
3379 record: if in row0umod.cc in
3380 row_undo_mod_remove_clust_low() we have already removed
3381 the clust rec, while purge is still cleaning and
3382 removing secondary index records associated with
3383 earlier versions of the clustered index record.
3384 In that case we know that the clustered index
3385 record did not exist in the read view of trx. */
3386 ib::error() << "Clustered record for sec rec not found"
3387 " index " << sec_index->name
3388 << " of table " << sec_index->table->name;
3389
3390 fputs("InnoDB: sec index record ", stderr);
3391 rec_print(stderr, rec, sec_index);
3392 fputs("\n"
3393 "InnoDB: clust index record ", stderr);
3394 rec_print(stderr, clust_rec, clust_index);
3395 putc('\n', stderr);
3396 trx_print(stderr, trx, 600);
3397 fputs("\n"
3398 "InnoDB: Submit a detailed bug report"
3399 " to https://jira.mariadb.org/\n", stderr);
3400 ut_ad(0);
3401 }
3402
3403 clust_rec = NULL;
3404
3405 err = DB_SUCCESS;
3406 goto func_exit;
3407 }
3408
3409 *offsets = rec_get_offsets(clust_rec, clust_index, *offsets, true,
3410 ULINT_UNDEFINED, offset_heap);
3411
3412 if (prebuilt->select_lock_type != LOCK_NONE) {
3413 /* Try to place a lock on the index record; we are searching
3414 the clust rec with a unique condition, hence
3415 we set a LOCK_REC_NOT_GAP type lock */
3416
3417 err = lock_clust_rec_read_check_and_lock(
3418 0, btr_pcur_get_block(prebuilt->clust_pcur),
3419 clust_rec, clust_index, *offsets,
3420 static_cast<lock_mode>(prebuilt->select_lock_type),
3421 LOCK_REC_NOT_GAP,
3422 thr);
3423
3424 switch (err) {
3425 case DB_SUCCESS:
3426 case DB_SUCCESS_LOCKED_REC:
3427 break;
3428 default:
3429 goto err_exit;
3430 }
3431 } else {
3432 /* This is a non-locking consistent read: if necessary, fetch
3433 a previous version of the record */
3434
3435 old_vers = NULL;
3436
3437 /* If the isolation level allows reading of uncommitted data,
3438 then we never look for an earlier version */
3439
3440 if (trx->isolation_level > TRX_ISO_READ_UNCOMMITTED
3441 && !lock_clust_rec_cons_read_sees(
3442 clust_rec, clust_index, *offsets,
3443 &trx->read_view)) {
3444
3445 /* The following call returns 'offsets' associated with
3446 'old_vers' */
3447 err = row_sel_build_prev_vers_for_mysql(
3448 &trx->read_view, clust_index, prebuilt,
3449 clust_rec, offsets, offset_heap, &old_vers,
3450 vrow, mtr);
3451
3452 if (err != DB_SUCCESS || old_vers == NULL) {
3453
3454 goto err_exit;
3455 }
3456
3457 clust_rec = old_vers;
3458 }
3459
3460 /* If we had to go to an earlier version of row or the
3461 secondary index record is delete marked, then it may be that
3462 the secondary index record corresponding to clust_rec
3463 (or old_vers) is not rec; in that case we must ignore
3464 such row because in our snapshot rec would not have existed.
3465 Remember that from rec we cannot see directly which transaction
3466 id corresponds to it: we have to go to the clustered index
3467 record. A query where we want to fetch all rows where
3468 the secondary index value is in some interval would return
3469 a wrong result if we would not drop rows which we come to
3470 visit through secondary index records that would not really
3471 exist in our snapshot. */
3472
3473 /* And for spatial index, since the rec is from shadow buffer,
3474 so we need to check if it's exactly match the clust_rec. */
3475 if (clust_rec
3476 && (old_vers
3477 || trx->isolation_level <= TRX_ISO_READ_UNCOMMITTED
3478 || dict_index_is_spatial(sec_index)
3479 || rec_get_deleted_flag(rec, dict_table_is_comp(
3480 sec_index->table)))
3481 && !row_sel_sec_rec_is_for_clust_rec(
3482 rec, sec_index, clust_rec, clust_index, thr)) {
3483 clust_rec = NULL;
3484 }
3485
3486 err = DB_SUCCESS;
3487 }
3488
3489func_exit:
3490 *out_rec = clust_rec;
3491
3492 if (prebuilt->select_lock_type != LOCK_NONE) {
3493 /* We may use the cursor in update or in unlock_row():
3494 store its position */
3495
3496 btr_pcur_store_position(prebuilt->clust_pcur, mtr);
3497 }
3498
3499err_exit:
3500 return(err);
3501}
3502
3503/********************************************************************//**
3504Restores cursor position after it has been stored. We have to take into
3505account that the record cursor was positioned on may have been deleted.
3506Then we may have to move the cursor one step up or down.
3507@return true if we may need to process the record the cursor is now
3508positioned on (i.e. we should not go to the next record yet) */
3509static
3510bool
3511sel_restore_position_for_mysql(
3512/*===========================*/
3513 ibool* same_user_rec, /*!< out: TRUE if we were able to restore
3514 the cursor on a user record with the
3515 same ordering prefix in in the
3516 B-tree index */
3517 ulint latch_mode, /*!< in: latch mode wished in
3518 restoration */
3519 btr_pcur_t* pcur, /*!< in: cursor whose position
3520 has been stored */
3521 ibool moves_up, /*!< in: TRUE if the cursor moves up
3522 in the index */
3523 mtr_t* mtr) /*!< in: mtr; CAUTION: may commit
3524 mtr temporarily! */
3525{
3526 ibool success;
3527
3528 success = btr_pcur_restore_position(latch_mode, pcur, mtr);
3529
3530 *same_user_rec = success;
3531
3532 ut_ad(!success || pcur->rel_pos == BTR_PCUR_ON);
3533#ifdef UNIV_DEBUG
3534 if (pcur->pos_state == BTR_PCUR_IS_POSITIONED_OPTIMISTIC) {
3535 ut_ad(pcur->rel_pos == BTR_PCUR_BEFORE
3536 || pcur->rel_pos == BTR_PCUR_AFTER);
3537 } else {
3538 ut_ad(pcur->pos_state == BTR_PCUR_IS_POSITIONED);
3539 ut_ad((pcur->rel_pos == BTR_PCUR_ON)
3540 == btr_pcur_is_on_user_rec(pcur));
3541 }
3542#endif /* UNIV_DEBUG */
3543
3544 /* The position may need be adjusted for rel_pos and moves_up. */
3545
3546 switch (pcur->rel_pos) {
3547 case BTR_PCUR_ON:
3548 if (!success && moves_up) {
3549next:
3550 if (btr_pcur_move_to_next(pcur, mtr)
3551 && rec_is_default_row(btr_pcur_get_rec(pcur),
3552 pcur->btr_cur.index)) {
3553 btr_pcur_move_to_next(pcur, mtr);
3554 }
3555
3556 return true;
3557 }
3558 return(!success);
3559 case BTR_PCUR_AFTER_LAST_IN_TREE:
3560 case BTR_PCUR_BEFORE_FIRST_IN_TREE:
3561 return true;
3562 case BTR_PCUR_AFTER:
3563 /* positioned to record after pcur->old_rec. */
3564 pcur->pos_state = BTR_PCUR_IS_POSITIONED;
3565prev:
3566 if (btr_pcur_is_on_user_rec(pcur) && !moves_up
3567 && !rec_is_default_row(btr_pcur_get_rec(pcur),
3568 pcur->btr_cur.index)) {
3569 btr_pcur_move_to_prev(pcur, mtr);
3570 }
3571 return true;
3572 case BTR_PCUR_BEFORE:
3573 /* For non optimistic restoration:
3574 The position is now set to the record before pcur->old_rec.
3575
3576 For optimistic restoration:
3577 The position also needs to take the previous search_mode into
3578 consideration. */
3579
3580 switch (pcur->pos_state) {
3581 case BTR_PCUR_IS_POSITIONED_OPTIMISTIC:
3582 pcur->pos_state = BTR_PCUR_IS_POSITIONED;
3583 if (pcur->search_mode == PAGE_CUR_GE) {
3584 /* Positioned during Greater or Equal search
3585 with BTR_PCUR_BEFORE. Optimistic restore to
3586 the same record. If scanning for lower then
3587 we must move to previous record.
3588 This can happen with:
3589 HANDLER READ idx a = (const);
3590 HANDLER READ idx PREV; */
3591 goto prev;
3592 }
3593 return true;
3594 case BTR_PCUR_IS_POSITIONED:
3595 if (moves_up && btr_pcur_is_on_user_rec(pcur)) {
3596 goto next;
3597 }
3598 return true;
3599 case BTR_PCUR_WAS_POSITIONED:
3600 case BTR_PCUR_NOT_POSITIONED:
3601 break;
3602 }
3603 }
3604 ut_ad(0);
3605 return true;
3606}
3607
3608/********************************************************************//**
3609Copies a cached field for MySQL from the fetch cache. */
3610static
3611void
3612row_sel_copy_cached_field_for_mysql(
3613/*================================*/
3614 byte* buf, /*!< in/out: row buffer */
3615 const byte* cache, /*!< in: cached row */
3616 const mysql_row_templ_t*templ) /*!< in: column template */
3617{
3618 ulint len;
3619
3620 buf += templ->mysql_col_offset;
3621 cache += templ->mysql_col_offset;
3622
3623 UNIV_MEM_ASSERT_W(buf, templ->mysql_col_len);
3624
3625 if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR
3626 && (templ->type != DATA_INT)) {
3627 /* Check for != DATA_INT to make sure we do
3628 not treat MySQL ENUM or SET as a true VARCHAR!
3629 Find the actual length of the true VARCHAR field. */
3630 row_mysql_read_true_varchar(
3631 &len, cache, templ->mysql_length_bytes);
3632 len += templ->mysql_length_bytes;
3633 UNIV_MEM_INVALID(buf, templ->mysql_col_len);
3634 } else {
3635 len = templ->mysql_col_len;
3636 }
3637
3638 ut_memcpy(buf, cache, len);
3639}
3640
3641/** Copy used fields from cached row.
3642Copy cache record field by field, don't touch fields that
3643are not covered by current key.
3644@param[out] buf Where to copy the MySQL row.
3645@param[in] cached_rec What to copy (in MySQL row format).
3646@param[in] prebuilt prebuilt struct. */
3647void
3648row_sel_copy_cached_fields_for_mysql(
3649 byte* buf,
3650 const byte* cached_rec,
3651 row_prebuilt_t* prebuilt)
3652{
3653 const mysql_row_templ_t*templ;
3654 ulint i;
3655 for (i = 0; i < prebuilt->n_template; i++) {
3656 templ = prebuilt->mysql_template + i;
3657
3658 /* Skip virtual columns */
3659 if (templ->is_virtual) {
3660 continue;
3661 }
3662
3663 row_sel_copy_cached_field_for_mysql(
3664 buf, cached_rec, templ);
3665 /* Copy NULL bit of the current field from cached_rec
3666 to buf */
3667 if (templ->mysql_null_bit_mask) {
3668 buf[templ->mysql_null_byte_offset]
3669 ^= (buf[templ->mysql_null_byte_offset]
3670 ^ cached_rec[templ->mysql_null_byte_offset])
3671 & (byte) templ->mysql_null_bit_mask;
3672 }
3673 }
3674}
3675
3676/********************************************************************//**
3677Pops a cached row for MySQL from the fetch cache. */
3678UNIV_INLINE
3679void
3680row_sel_dequeue_cached_row_for_mysql(
3681/*=================================*/
3682 byte* buf, /*!< in/out: buffer where to copy the
3683 row */
3684 row_prebuilt_t* prebuilt) /*!< in: prebuilt struct */
3685{
3686 ulint i;
3687 const mysql_row_templ_t*templ;
3688 const byte* cached_rec;
3689 ut_ad(prebuilt->n_fetch_cached > 0);
3690 ut_ad(prebuilt->mysql_prefix_len <= prebuilt->mysql_row_len);
3691
3692 UNIV_MEM_ASSERT_W(buf, prebuilt->mysql_row_len);
3693
3694 cached_rec = prebuilt->fetch_cache[prebuilt->fetch_cache_first];
3695
3696 if (UNIV_UNLIKELY(prebuilt->keep_other_fields_on_keyread)) {
3697 row_sel_copy_cached_fields_for_mysql(buf, cached_rec, prebuilt);
3698 } else if (prebuilt->mysql_prefix_len > 63) {
3699 /* The record is long. Copy it field by field, in case
3700 there are some long VARCHAR column of which only a
3701 small length is being used. */
3702 UNIV_MEM_INVALID(buf, prebuilt->mysql_prefix_len);
3703
3704 /* First copy the NULL bits. */
3705 ut_memcpy(buf, cached_rec, prebuilt->null_bitmap_len);
3706 /* Then copy the requested fields. */
3707
3708 for (i = 0; i < prebuilt->n_template; i++) {
3709 templ = prebuilt->mysql_template + i;
3710
3711 /* Skip virtual columns */
3712 if (templ->is_virtual
3713 && !(dict_index_has_virtual(prebuilt->index)
3714 && prebuilt->read_just_key)) {
3715 continue;
3716 }
3717
3718 row_sel_copy_cached_field_for_mysql(
3719 buf, cached_rec, templ);
3720 }
3721 } else {
3722 ut_memcpy(buf, cached_rec, prebuilt->mysql_prefix_len);
3723 }
3724
3725 prebuilt->n_fetch_cached--;
3726 prebuilt->fetch_cache_first++;
3727
3728 if (prebuilt->n_fetch_cached == 0) {
3729 prebuilt->fetch_cache_first = 0;
3730 }
3731}
3732
3733/********************************************************************//**
3734Initialise the prefetch cache. */
3735UNIV_INLINE
3736void
3737row_sel_prefetch_cache_init(
3738/*========================*/
3739 row_prebuilt_t* prebuilt) /*!< in/out: prebuilt struct */
3740{
3741 ulint i;
3742 ulint sz;
3743 byte* ptr;
3744
3745 /* Reserve space for the magic number. */
3746 sz = UT_ARR_SIZE(prebuilt->fetch_cache) * (prebuilt->mysql_row_len + 8);
3747 ptr = static_cast<byte*>(ut_malloc_nokey(sz));
3748
3749 for (i = 0; i < UT_ARR_SIZE(prebuilt->fetch_cache); i++) {
3750
3751 /* A user has reported memory corruption in these
3752 buffers in Linux. Put magic numbers there to help
3753 to track a possible bug. */
3754
3755 mach_write_to_4(ptr, ROW_PREBUILT_FETCH_MAGIC_N);
3756 ptr += 4;
3757
3758 prebuilt->fetch_cache[i] = ptr;
3759 ptr += prebuilt->mysql_row_len;
3760
3761 mach_write_to_4(ptr, ROW_PREBUILT_FETCH_MAGIC_N);
3762 ptr += 4;
3763 }
3764}
3765
3766/********************************************************************//**
3767Get the last fetch cache buffer from the queue.
3768@return pointer to buffer. */
3769UNIV_INLINE
3770byte*
3771row_sel_fetch_last_buf(
3772/*===================*/
3773 row_prebuilt_t* prebuilt) /*!< in/out: prebuilt struct */
3774{
3775 ut_ad(!prebuilt->templ_contains_blob);
3776 ut_ad(prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE);
3777
3778 if (prebuilt->fetch_cache[0] == NULL) {
3779 /* Allocate memory for the fetch cache */
3780 ut_ad(prebuilt->n_fetch_cached == 0);
3781
3782 row_sel_prefetch_cache_init(prebuilt);
3783 }
3784
3785 ut_ad(prebuilt->fetch_cache_first == 0);
3786 UNIV_MEM_INVALID(prebuilt->fetch_cache[prebuilt->n_fetch_cached],
3787 prebuilt->mysql_row_len);
3788
3789 return(prebuilt->fetch_cache[prebuilt->n_fetch_cached]);
3790}
3791
3792/********************************************************************//**
3793Pushes a row for MySQL to the fetch cache. */
3794UNIV_INLINE
3795void
3796row_sel_enqueue_cache_row_for_mysql(
3797/*================================*/
3798 byte* mysql_rec, /*!< in/out: MySQL record */
3799 row_prebuilt_t* prebuilt) /*!< in/out: prebuilt struct */
3800{
3801 /* For non ICP code path the row should already exist in the
3802 next fetch cache slot. */
3803
3804 if (prebuilt->idx_cond != NULL) {
3805 byte* dest = row_sel_fetch_last_buf(prebuilt);
3806
3807 ut_memcpy(dest, mysql_rec, prebuilt->mysql_row_len);
3808 }
3809
3810 ++prebuilt->n_fetch_cached;
3811}
3812
#ifdef BTR_CUR_HASH_ADAPT
/** Tries to do a shortcut to fetch a clustered index record with a unique
key, using the hash index if possible (not always). We assume that the
search mode is PAGE_CUR_GE, it is a consistent read, and there is a read
view in trx. The search latch of the index is S-locked for the duration of
the lookup and released before returning.
@param[out]	out_rec		record if found
@param[in]	prebuilt	prebuilt struct
@param[in,out]	offsets		for rec_get_offsets(*out_rec)
@param[in,out]	heap		heap for rec_get_offsets()
@param[in]	mtr		started mtr
@return SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */
static
ulint
row_sel_try_search_shortcut_for_mysql(
	const rec_t**	out_rec,
	row_prebuilt_t*	prebuilt,
	ulint**		offsets,
	mem_heap_t**	heap,
	mtr_t*		mtr)
{
	dict_index_t*	clust = prebuilt->index;
	const dtuple_t*	tuple = prebuilt->search_tuple;
	btr_pcur_t*	cursor = prebuilt->pcur;
	ulint		outcome;

	ut_ad(dict_index_is_clust(clust));
	ut_ad(!prebuilt->templ_contains_blob);

	rw_lock_t*	latch = btr_get_search_latch(clust);

	rw_lock_s_lock(latch);
	btr_pcur_open_with_no_init(clust, tuple, PAGE_CUR_GE,
				   BTR_SEARCH_LEAF, cursor, latch, mtr);

	const rec_t*	rec = btr_pcur_get_rec(cursor);

	if (!page_rec_is_user_rec(rec) || rec_is_default_row(rec, clust)) {
		outcome = SEL_RETRY;
	} else if (btr_pcur_get_up_match(cursor)
		   < dtuple_get_n_fields(tuple)) {
		/* As the cursor is now placed on a user record after a
		search with the mode PAGE_CUR_GE, the up_match field in
		the cursor tells how many fields in the user record
		matched to the search tuple; not all of them did. */
		outcome = SEL_EXHAUSTED;
	} else {
		/* This is a non-locking consistent read: if necessary,
		fetch a previous version of the record */

		*offsets = rec_get_offsets(rec, clust, *offsets, true,
					   ULINT_UNDEFINED, heap);

		if (!lock_clust_rec_cons_read_sees(
			    rec, clust, *offsets,
			    &prebuilt->trx->read_view)) {
			outcome = SEL_RETRY;
		} else if (rec_get_deleted_flag(
				   rec, dict_table_is_comp(clust->table))) {
			/* In delete-marked records, DB_TRX_ID must
			always refer to an existing undo log record. */
			ut_ad(row_get_rec_trx_id(rec, clust, *offsets));
			outcome = SEL_EXHAUSTED;
		} else {
			*out_rec = rec;
			outcome = SEL_FOUND;
		}
	}

	rw_lock_s_unlock(latch);
	return(outcome);
}
#endif /* BTR_CUR_HASH_ADAPT */
3885
3886/*********************************************************************//**
3887Check a pushed-down index condition.
3888@return ICP_NO_MATCH, ICP_MATCH, or ICP_OUT_OF_RANGE */
3889static
3890ICP_RESULT
3891row_search_idx_cond_check(
3892/*======================*/
3893 byte* mysql_rec, /*!< out: record
3894 in MySQL format (invalid unless
3895 prebuilt->idx_cond!=NULL and
3896 we return ICP_MATCH) */
3897 row_prebuilt_t* prebuilt, /*!< in/out: prebuilt struct
3898 for the table handle */
3899 const rec_t* rec, /*!< in: InnoDB record */
3900 const ulint* offsets) /*!< in: rec_get_offsets() */
3901{
3902 ICP_RESULT result;
3903 ulint i;
3904
3905 ut_ad(rec_offs_validate(rec, prebuilt->index, offsets));
3906
3907 if (!prebuilt->idx_cond) {
3908 return(ICP_MATCH);
3909 }
3910
3911 MONITOR_INC(MONITOR_ICP_ATTEMPTS);
3912
3913 /* Convert to MySQL format those fields that are needed for
3914 evaluating the index condition. */
3915
3916 if (UNIV_LIKELY_NULL(prebuilt->blob_heap)) {
3917 mem_heap_empty(prebuilt->blob_heap);
3918 }
3919
3920 for (i = 0; i < prebuilt->idx_cond_n_cols; i++) {
3921 const mysql_row_templ_t*templ = &prebuilt->mysql_template[i];
3922
3923 /* Skip virtual columns */
3924 if (templ->is_virtual) {
3925 continue;
3926 }
3927
3928 if (!row_sel_store_mysql_field(mysql_rec, prebuilt,
3929 rec, prebuilt->index, offsets,
3930 templ->icp_rec_field_no,
3931 templ)) {
3932 return(ICP_NO_MATCH);
3933 }
3934 }
3935
3936 /* We assume that the index conditions on
3937 case-insensitive columns are case-insensitive. The
3938 case of such columns may be wrong in a secondary
3939 index, if the case of the column has been updated in
3940 the past, or a record has been deleted and a record
3941 inserted in a different case. */
3942 result = innobase_index_cond(prebuilt->idx_cond);
3943 switch (result) {
3944 case ICP_MATCH:
3945 /* Convert the remaining fields to MySQL format.
3946 If this is a secondary index record, we must defer
3947 this until we have fetched the clustered index record. */
3948 if (!prebuilt->need_to_access_clustered
3949 || dict_index_is_clust(prebuilt->index)) {
3950 if (!row_sel_store_mysql_rec(
3951 mysql_rec, prebuilt, rec, NULL, false,
3952 prebuilt->index, offsets)) {
3953 ut_ad(dict_index_is_clust(prebuilt->index));
3954 return(ICP_NO_MATCH);
3955 }
3956 }
3957 MONITOR_INC(MONITOR_ICP_MATCH);
3958 return(result);
3959 case ICP_NO_MATCH:
3960 MONITOR_INC(MONITOR_ICP_NO_MATCH);
3961 return(result);
3962 case ICP_OUT_OF_RANGE:
3963 MONITOR_INC(MONITOR_ICP_OUT_OF_RANGE);
3964 return(result);
3965 case ICP_ERROR:
3966 case ICP_ABORTED_BY_USER:
3967 return(result);
3968 }
3969
3970 ut_error;
3971 return(result);
3972}
3973
3974/** Extract virtual column data from a virtual index record and fill a dtuple
3975@param[in] rec the virtual (secondary) index record
3976@param[in] index the virtual index
3977@param[in,out] vrow the dtuple where data extract to
3978@param[in] heap memory heap to allocate memory
3979*/
3980static
3981void
3982row_sel_fill_vrow(
3983 const rec_t* rec,
3984 dict_index_t* index,
3985 const dtuple_t** vrow,
3986 mem_heap_t* heap)
3987{
3988 ulint offsets_[REC_OFFS_NORMAL_SIZE];
3989 ulint* offsets = offsets_;
3990 rec_offs_init(offsets_);
3991
3992 ut_ad(!(*vrow));
3993 ut_ad(heap);
3994 ut_ad(!dict_index_is_clust(index));
3995 ut_ad(!index->is_instant());
3996 ut_ad(page_rec_is_leaf(rec));
3997
3998 offsets = rec_get_offsets(rec, index, offsets, true,
3999 ULINT_UNDEFINED, &heap);
4000
4001 *vrow = dtuple_create_with_vcol(
4002 heap, 0, dict_table_get_n_v_cols(index->table));
4003
4004 /* Initialize all virtual row's mtype to DATA_MISSING */
4005 dtuple_init_v_fld(*vrow);
4006
4007 for (ulint i = 0; i < dict_index_get_n_fields(index); i++) {
4008 const dict_field_t* field;
4009 const dict_col_t* col;
4010
4011 field = dict_index_get_nth_field(index, i);
4012 col = dict_field_get_col(field);
4013
4014 if (col->is_virtual()) {
4015 const byte* data;
4016 ulint len;
4017
4018 data = rec_get_nth_field(rec, offsets, i, &len);
4019
4020 const dict_v_col_t* vcol = reinterpret_cast<
4021 const dict_v_col_t*>(col);
4022
4023 dfield_t* dfield = dtuple_get_nth_v_field(
4024 *vrow, vcol->v_pos);
4025 dfield_set_data(dfield, data, len);
4026 dict_col_copy_type(col, dfield_get_type(dfield));
4027 }
4028 }
4029}
4030
4031/** Return the record field length in characters.
4032@param[in] col table column of the field
4033@param[in] field_no field number
4034@param[in] rec physical record
4035@param[in] offsets field offsets in the physical record
4036@return field length in characters. */
4037static
4038size_t
4039rec_field_len_in_chars(
4040 const dict_col_t* col,
4041 const ulint field_no,
4042 const rec_t* rec,
4043 const ulint* offsets)
4044{
4045 const ulint cset = dtype_get_charset_coll(col->prtype);
4046 const CHARSET_INFO* cs = all_charsets[cset];
4047 ulint rec_field_len;
4048 const char* rec_field = reinterpret_cast<const char *>(
4049 rec_get_nth_field(
4050 rec, offsets, field_no, &rec_field_len));
4051
4052 if (UNIV_UNLIKELY(!cs)) {
4053 ib::warn() << "Missing collation " << cset;
4054 return SIZE_T_MAX;
4055 }
4056
4057 return(cs->cset->numchars(cs, rec_field, rec_field + rec_field_len));
4058}
4059
4060/** Avoid the clustered index lookup if all the following conditions
4061are true:
40621) all columns are in secondary index
40632) all values for columns that are prefix-only indexes are shorter
4064than the prefix size. This optimization can avoid many IOs for certain schemas.
4065@return true, to avoid clustered index lookup. */
4066static
4067bool row_search_with_covering_prefix(
4068 row_prebuilt_t* prebuilt,
4069 const rec_t* rec,
4070 const ulint* offsets)
4071{
4072 const dict_index_t* index = prebuilt->index;
4073 ut_ad(!dict_index_is_clust(index));
4074
4075 if (!srv_prefix_index_cluster_optimization) {
4076 return false;
4077 }
4078
4079 /** Optimization only applicable if there the number of secondary index
4080 fields are greater than or equal to number of clustered index fields. */
4081 if (prebuilt->n_template > index->n_fields) {
4082 return false;
4083 }
4084
4085 for (ulint i = 0; i < prebuilt->n_template; i++) {
4086 mysql_row_templ_t* templ = prebuilt->mysql_template + i;
4087 ulint j = templ->rec_prefix_field_no;
4088
4089 /** Condition (1) : is the field in the index. */
4090 if (j == ULINT_UNDEFINED) {
4091 return false;
4092 }
4093
4094 /** Condition (2): If this is a prefix index then
4095 row's value size shorter than prefix length. */
4096
4097 if (!templ->rec_field_is_prefix) {
4098 continue;
4099 }
4100
4101 ulint rec_size = rec_offs_nth_size(offsets, j);
4102 const dict_field_t* field = dict_index_get_nth_field(index, j);
4103 ulint max_chars = field->prefix_len / templ->mbmaxlen;
4104
4105 ut_a(field->prefix_len > 0);
4106
4107 if (rec_size < max_chars) {
4108 /* Record in bytes shorter than the index
4109 prefix length in char. */
4110 continue;
4111 }
4112
4113 if (rec_size * templ->mbminlen >= field->prefix_len) {
4114 /* Shortest representation string by the
4115 byte length of the record is longer than the
4116 maximum possible index prefix. */
4117 return false;
4118 }
4119
4120 size_t num_chars = rec_field_len_in_chars(
4121 field->col, j, rec, offsets);
4122
4123 if (num_chars >= max_chars) {
4124 /* No of chars to store the record exceeds
4125 the index prefix character length. */
4126 return false;
4127 }
4128 }
4129
4130 /* If prefix index optimization condition satisfied then
4131 for all columns above, use rec_prefix_field_no instead of
4132 rec_field_no, and skip the clustered lookup below. */
4133 for (ulint i = 0; i < prebuilt->n_template; i++) {
4134 mysql_row_templ_t* templ = prebuilt->mysql_template + i;
4135 templ->rec_field_no = templ->rec_prefix_field_no;
4136 ut_a(templ->rec_field_no != ULINT_UNDEFINED);
4137 }
4138
4139 srv_stats.n_sec_rec_cluster_reads_avoided.inc();
4140 return true;
4141}
4142
/** Searches for rows in the database using cursor.
Function is mainly used for tables that are shared across connections and
so it employs techniques that can help re-construct the rows that
transaction is supposed to see.
It also has optimizations such as pre-caching the rows, using AHI, etc.

@param[out]	buf		buffer for the fetched row in MySQL format
@param[in]	mode		search mode PAGE_CUR_L
@param[in,out]	prebuilt	prebuilt struct for the table handler;
				this contains the info to search_tuple,
				index; if search tuple contains 0 field then
				we position the cursor at start or the end of
				index, depending on 'mode'
@param[in]	match_mode	0 or ROW_SEL_EXACT or ROW_SEL_EXACT_PREFIX
@param[in]	direction	0 or ROW_SEL_NEXT or ROW_SEL_PREV;
				Note: if this is != 0, then prebuilt must have a
				pcur with stored position! In opening of a
				cursor 'direction' should be 0.
@return DB_SUCCESS or error code */
4162dberr_t
4163row_search_mvcc(
4164 byte* buf,
4165 page_cur_mode_t mode,
4166 row_prebuilt_t* prebuilt,
4167 ulint match_mode,
4168 ulint direction)
4169{
4170 DBUG_ENTER("row_search_mvcc");
4171 DBUG_ASSERT(prebuilt->index->table == prebuilt->table);
4172
4173 dict_index_t* index = prebuilt->index;
4174 ibool comp = dict_table_is_comp(prebuilt->table);
4175 const dtuple_t* search_tuple = prebuilt->search_tuple;
4176 btr_pcur_t* pcur = prebuilt->pcur;
4177 trx_t* trx = prebuilt->trx;
4178 dict_index_t* clust_index;
4179 que_thr_t* thr;
4180 const rec_t* UNINIT_VAR(rec);
4181 const dtuple_t* vrow = NULL;
4182 const rec_t* result_rec = NULL;
4183 const rec_t* clust_rec;
4184 dberr_t err = DB_SUCCESS;
4185 ibool unique_search = FALSE;
4186 ibool mtr_has_extra_clust_latch = FALSE;
4187 ibool moves_up = FALSE;
4188 ibool set_also_gap_locks = TRUE;
4189 /* if the query is a plain locking SELECT, and the isolation level
4190 is <= TRX_ISO_READ_COMMITTED, then this is set to FALSE */
4191 ibool did_semi_consistent_read = FALSE;
4192 /* if the returned record was locked and we did a semi-consistent
4193 read (fetch the newest committed version), then this is set to
4194 TRUE */
4195 ulint next_offs;
4196 ibool same_user_rec;
4197 mtr_t mtr;
4198 mem_heap_t* heap = NULL;
4199 ulint offsets_[REC_OFFS_NORMAL_SIZE];
4200 ulint* offsets = offsets_;
4201 ibool table_lock_waited = FALSE;
4202 byte* next_buf = 0;
4203 bool spatial_search = false;
4204
4205 rec_offs_init(offsets_);
4206
4207 ut_ad(index && pcur && search_tuple);
4208 ut_a(prebuilt->magic_n == ROW_PREBUILT_ALLOCATED);
4209 ut_a(prebuilt->magic_n2 == ROW_PREBUILT_ALLOCATED);
4210
4211 /* We don't support FTS queries from the HANDLER interfaces, because
4212 we implemented FTS as reversed inverted index with auxiliary tables.
4213 So anything related to traditional index query would not apply to
4214 it. */
4215 if (prebuilt->index->type & DICT_FTS) {
4216 DBUG_RETURN(DB_END_OF_INDEX);
4217 }
4218
4219 ut_ad(!sync_check_iterate(sync_check()));
4220
4221 if (!prebuilt->table->space) {
4222 DBUG_RETURN(DB_TABLESPACE_DELETED);
4223 } else if (!prebuilt->table->is_readable()) {
4224 DBUG_RETURN(prebuilt->table->space
4225 ? DB_DECRYPTION_FAILED
4226 : DB_TABLESPACE_NOT_FOUND);
4227 } else if (!prebuilt->index_usable) {
4228 DBUG_RETURN(DB_MISSING_HISTORY);
4229 } else if (prebuilt->index->is_corrupted()) {
4230 DBUG_RETURN(DB_CORRUPTION);
4231 }
4232
4233 /* We need to get the virtual column values stored in secondary
4234 index key, if this is covered index scan or virtual key read is
4235 requested. */
4236 bool need_vrow = dict_index_has_virtual(prebuilt->index)
4237 && (prebuilt->read_just_key
4238 || prebuilt->m_read_virtual_key);
4239
4240 /* Reset the new record lock info if srv_locks_unsafe_for_binlog
4241 is set or session is using a READ COMMITTED isolation level. Then
4242 we are able to remove the record locks set here on an individual
4243 row. */
4244 prebuilt->new_rec_locks = 0;
4245
4246 /*-------------------------------------------------------------*/
4247 /* PHASE 1: Try to pop the row from the prefetch cache */
4248
4249 if (UNIV_UNLIKELY(direction == 0)) {
4250 trx->op_info = "starting index read";
4251
4252 prebuilt->n_rows_fetched = 0;
4253 prebuilt->n_fetch_cached = 0;
4254 prebuilt->fetch_cache_first = 0;
4255
4256 if (prebuilt->sel_graph == NULL) {
4257 /* Build a dummy select query graph */
4258 row_prebuild_sel_graph(prebuilt);
4259 }
4260 } else {
4261 trx->op_info = "fetching rows";
4262
4263 if (prebuilt->n_rows_fetched == 0) {
4264 prebuilt->fetch_direction = direction;
4265 }
4266
4267 if (UNIV_UNLIKELY(direction != prebuilt->fetch_direction)) {
4268 if (UNIV_UNLIKELY(prebuilt->n_fetch_cached > 0)) {
4269 ut_error;
4270 /* TODO: scrollable cursor: restore cursor to
4271 the place of the latest returned row,
4272 or better: prevent caching for a scroll
4273 cursor! */
4274 }
4275
4276 prebuilt->n_rows_fetched = 0;
4277 prebuilt->n_fetch_cached = 0;
4278 prebuilt->fetch_cache_first = 0;
4279
4280 } else if (UNIV_LIKELY(prebuilt->n_fetch_cached > 0)) {
4281 row_sel_dequeue_cached_row_for_mysql(buf, prebuilt);
4282
4283 prebuilt->n_rows_fetched++;
4284
4285 err = DB_SUCCESS;
4286 goto func_exit;
4287 }
4288
4289 if (prebuilt->fetch_cache_first > 0
4290 && prebuilt->fetch_cache_first < MYSQL_FETCH_CACHE_SIZE) {
4291
4292 /* The previous returned row was popped from the fetch
4293 cache, but the cache was not full at the time of the
4294 popping: no more rows can exist in the result set */
4295
4296 err = DB_RECORD_NOT_FOUND;
4297 goto func_exit;
4298 }
4299
4300 prebuilt->n_rows_fetched++;
4301
4302 if (prebuilt->n_rows_fetched > 1000000000) {
4303 /* Prevent wrap-over */
4304 prebuilt->n_rows_fetched = 500000000;
4305 }
4306
4307 mode = pcur->search_mode;
4308 }
4309
4310 /* In a search where at most one record in the index may match, we
4311 can use a LOCK_REC_NOT_GAP type record lock when locking a
4312 non-delete-marked matching record.
4313
4314 Note that in a unique secondary index there may be different
4315 delete-marked versions of a record where only the primary key
4316 values differ: thus in a secondary index we must use next-key
4317 locks when locking delete-marked records. */
4318
4319 if (match_mode == ROW_SEL_EXACT
4320 && dict_index_is_unique(index)
4321 && dtuple_get_n_fields(search_tuple)
4322 == dict_index_get_n_unique(index)
4323 && (dict_index_is_clust(index)
4324 || !dtuple_contains_null(search_tuple))) {
4325
4326 /* Note above that a UNIQUE secondary index can contain many
4327 rows with the same key value if one of the columns is the SQL
4328 null. A clustered index under MySQL can never contain null
4329 columns because we demand that all the columns in primary key
4330 are non-null. */
4331
4332 unique_search = TRUE;
4333
4334 /* Even if the condition is unique, MySQL seems to try to
4335 retrieve also a second row if a primary key contains more than
4336 1 column. Return immediately if this is not a HANDLER
4337 command. */
4338
4339 if (UNIV_UNLIKELY(direction != 0
4340 && !prebuilt->used_in_HANDLER)) {
4341
4342 err = DB_RECORD_NOT_FOUND;
4343 goto func_exit;
4344 }
4345 }
4346
4347 /* We don't support sequential scan for an R-tree index, because
4348 doing so would be meaningless. */
4349 if (dict_index_is_spatial(index)
4350 && !RTREE_SEARCH_MODE(mode)) {
4351 err = DB_END_OF_INDEX;
4352 goto func_exit;
4353 }
4354
4355 mtr.start();
4356
4357#ifdef BTR_CUR_HASH_ADAPT
4358 /*-------------------------------------------------------------*/
4359 /* PHASE 2: Try fast adaptive hash index search if possible */
4360
4361 /* Next test if this is the special case where we can use the fast
4362 adaptive hash index to try the search. Since we must release the
4363 search system latch when we retrieve an externally stored field, we
4364 cannot use the adaptive hash index in a search in the case the row
4365 may be long and there may be externally stored fields */
4366
4367 if (UNIV_UNLIKELY(direction == 0)
4368 && unique_search
4369 && btr_search_enabled
4370 && dict_index_is_clust(index)
4371 && !prebuilt->templ_contains_blob
4372 && !prebuilt->used_in_HANDLER
4373 && (prebuilt->mysql_row_len < srv_page_size / 8)) {
4374
4375 mode = PAGE_CUR_GE;
4376
4377 if (prebuilt->select_lock_type == LOCK_NONE
4378 && trx->isolation_level > TRX_ISO_READ_UNCOMMITTED
4379 && trx->read_view.is_open()) {
4380
4381 /* This is a SELECT query done as a consistent read,
4382 and the read view has already been allocated:
4383 let us try a search shortcut through the hash
4384 index. */
4385
4386 switch (row_sel_try_search_shortcut_for_mysql(
4387 &rec, prebuilt, &offsets, &heap,
4388 &mtr)) {
4389 case SEL_FOUND:
4390 /* At this point, rec is protected by
4391 a page latch that was acquired by
4392 row_sel_try_search_shortcut_for_mysql().
4393 The latch will not be released until
4394 mtr.commit(). */
4395 ut_ad(!rec_get_deleted_flag(rec, comp));
4396
4397 if (prebuilt->idx_cond) {
4398 switch (row_search_idx_cond_check(
4399 buf, prebuilt,
4400 rec, offsets)) {
4401 case ICP_NO_MATCH:
4402 case ICP_OUT_OF_RANGE:
4403 case ICP_ABORTED_BY_USER:
4404 case ICP_ERROR:
4405 goto shortcut_mismatch;
4406 case ICP_MATCH:
4407 goto shortcut_match;
4408 }
4409 }
4410
4411 if (!row_sel_store_mysql_rec(
4412 buf, prebuilt,
4413 rec, NULL, false, index,
4414 offsets)) {
4415 /* Only fresh inserts may contain
4416 incomplete externally stored
4417 columns. Pretend that such
4418 records do not exist. Such
4419 records may only be accessed
4420 at the READ UNCOMMITTED
4421 isolation level or when
4422 rolling back a recovered
4423 transaction. Rollback happens
4424 at a lower level, not here. */
4425
4426 /* Proceed as in case SEL_RETRY. */
4427 break;
4428 }
4429
4430 shortcut_match:
4431 mtr.commit();
4432
4433 /* NOTE that we do NOT store the cursor
4434 position */
4435 err = DB_SUCCESS;
4436 goto func_exit;
4437
4438 case SEL_EXHAUSTED:
4439 shortcut_mismatch:
4440 mtr.commit();
4441 /* NOTE that we do NOT store the cursor
4442 position */
4443 err = DB_RECORD_NOT_FOUND;
4444 goto func_exit;
4445
4446 case SEL_RETRY:
4447 break;
4448
4449 default:
4450 ut_ad(0);
4451 }
4452
4453 mtr.commit();
4454 mtr.start();
4455 }
4456 }
4457#endif /* BTR_CUR_HASH_ADAPT */
4458
4459 /*-------------------------------------------------------------*/
4460 /* PHASE 3: Open or restore index cursor position */
4461
4462 spatial_search = dict_index_is_spatial(index)
4463 && mode >= PAGE_CUR_CONTAIN;
4464
4465 /* The state of a running trx can only be changed by the
4466 thread that is currently serving the transaction. Because we
4467 are that thread, we can read trx->state without holding any
4468 mutex. */
4469 ut_ad(prebuilt->sql_stat_start
4470 || trx->state == TRX_STATE_ACTIVE
4471 || (prebuilt->table->no_rollback()
4472 && trx->state == TRX_STATE_NOT_STARTED));
4473
4474 ut_ad(!trx_is_started(trx) || trx->state == TRX_STATE_ACTIVE);
4475
4476 ut_ad(prebuilt->sql_stat_start
4477 || prebuilt->select_lock_type != LOCK_NONE
4478 || trx->read_view.is_open()
4479 || prebuilt->table->no_rollback()
4480 || srv_read_only_mode);
4481
4482 if (trx->isolation_level <= TRX_ISO_READ_COMMITTED
4483 && prebuilt->select_lock_type != LOCK_NONE
4484 && trx->mysql_thd != NULL
4485 && thd_is_select(trx->mysql_thd)) {
4486 /* It is a plain locking SELECT and the isolation
4487 level is low: do not lock gaps */
4488
4489 set_also_gap_locks = FALSE;
4490 }
4491
4492 /* Note that if the search mode was GE or G, then the cursor
4493 naturally moves upward (in fetch next) in alphabetical order,
4494 otherwise downward */
4495
4496 if (UNIV_UNLIKELY(direction == 0)) {
4497 if (mode == PAGE_CUR_GE || mode == PAGE_CUR_G
4498 || mode >= PAGE_CUR_CONTAIN) {
4499 moves_up = TRUE;
4500 }
4501 } else if (direction == ROW_SEL_NEXT) {
4502 moves_up = TRUE;
4503 }
4504
4505 thr = que_fork_get_first_thr(prebuilt->sel_graph);
4506
4507 que_thr_move_to_run_state_for_mysql(thr, trx);
4508
4509 clust_index = dict_table_get_first_index(prebuilt->table);
4510
4511 /* Do some start-of-statement preparations */
4512
4513 if (prebuilt->table->no_rollback()) {
4514 /* NO_ROLLBACK tables do not support MVCC or locking. */
4515 prebuilt->select_lock_type = LOCK_NONE;
4516 prebuilt->sql_stat_start = FALSE;
4517 } else if (!prebuilt->sql_stat_start) {
4518 /* No need to set an intention lock or assign a read view */
4519 ut_a(prebuilt->select_lock_type != LOCK_NONE
4520 || srv_read_only_mode || trx->read_view.is_open());
4521 } else {
4522 prebuilt->sql_stat_start = FALSE;
4523 trx_start_if_not_started(trx, false);
4524
4525 if (prebuilt->select_lock_type == LOCK_NONE) {
4526 trx->read_view.open(trx);
4527 } else {
4528wait_table_again:
4529 err = lock_table(0, prebuilt->table,
4530 prebuilt->select_lock_type == LOCK_S
4531 ? LOCK_IS : LOCK_IX, thr);
4532
4533 if (err != DB_SUCCESS) {
4534
4535 table_lock_waited = TRUE;
4536 goto lock_table_wait;
4537 }
4538 }
4539 }
4540
4541 /* Open or restore index cursor position */
4542
4543 if (UNIV_LIKELY(direction != 0)) {
4544 if (spatial_search) {
4545 /* R-Tree access does not need to store or
4546 restore (reposition) the cursor position */
4547 goto next_rec;
4548 }
4549
4550 bool need_to_process = sel_restore_position_for_mysql(
4551 &same_user_rec, BTR_SEARCH_LEAF,
4552 pcur, moves_up, &mtr);
4553
4554 if (UNIV_UNLIKELY(need_to_process)) {
4555 if (UNIV_UNLIKELY(prebuilt->row_read_type
4556 == ROW_READ_DID_SEMI_CONSISTENT)) {
4557 /* We did a semi-consistent read,
4558 but the record was removed in
4559 the meantime. */
4560 prebuilt->row_read_type
4561 = ROW_READ_TRY_SEMI_CONSISTENT;
4562 }
4563 } else if (UNIV_LIKELY(prebuilt->row_read_type
4564 != ROW_READ_DID_SEMI_CONSISTENT)) {
4565
4566 /* The cursor was positioned on the record
4567 that we returned previously. If we need
4568 to repeat a semi-consistent read as a
4569 pessimistic locking read, the record
4570 cannot be skipped. */
4571
4572 goto next_rec;
4573 }
4574
4575 } else if (dtuple_get_n_fields(search_tuple) > 0) {
4576 pcur->btr_cur.thr = thr;
4577
4578 if (dict_index_is_spatial(index)) {
4579 bool need_pred_lock;
4580
4581 need_pred_lock = (set_also_gap_locks
4582 && !(srv_locks_unsafe_for_binlog
4583 || trx->isolation_level
4584 <= TRX_ISO_READ_COMMITTED)
4585 && prebuilt->select_lock_type
4586 != LOCK_NONE);
4587
4588 if (!prebuilt->rtr_info) {
4589 prebuilt->rtr_info = rtr_create_rtr_info(
4590 need_pred_lock, true,
4591 btr_pcur_get_btr_cur(pcur), index);
4592 prebuilt->rtr_info->search_tuple = search_tuple;
4593 prebuilt->rtr_info->search_mode = mode;
4594 rtr_info_update_btr(btr_pcur_get_btr_cur(pcur),
4595 prebuilt->rtr_info);
4596 } else {
4597 rtr_info_reinit_in_cursor(
4598 btr_pcur_get_btr_cur(pcur),
4599 index, need_pred_lock);
4600 prebuilt->rtr_info->search_tuple = search_tuple;
4601 prebuilt->rtr_info->search_mode = mode;
4602 }
4603 }
4604
4605 err = btr_pcur_open_with_no_init(index, search_tuple, mode,
4606 BTR_SEARCH_LEAF,
4607 pcur, 0, &mtr);
4608
4609 if (err != DB_SUCCESS) {
4610 rec = NULL;
4611 goto lock_wait_or_error;
4612 }
4613
4614 pcur->trx_if_known = trx;
4615
4616 rec = btr_pcur_get_rec(pcur);
4617 ut_ad(page_rec_is_leaf(rec));
4618
4619 if (!moves_up
4620 && !page_rec_is_supremum(rec)
4621 && set_also_gap_locks
4622 && !(srv_locks_unsafe_for_binlog
4623 || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
4624 && prebuilt->select_lock_type != LOCK_NONE
4625 && !dict_index_is_spatial(index)) {
4626
4627 /* Try to place a gap lock on the next index record
4628 to prevent phantoms in ORDER BY ... DESC queries */
4629 const rec_t* next_rec = page_rec_get_next_const(rec);
4630
4631 offsets = rec_get_offsets(next_rec, index, offsets,
4632 true,
4633 ULINT_UNDEFINED, &heap);
4634 err = sel_set_rec_lock(pcur,
4635 next_rec, index, offsets,
4636 prebuilt->select_lock_type,
4637 LOCK_GAP, thr, &mtr);
4638
4639 switch (err) {
4640 case DB_SUCCESS_LOCKED_REC:
4641 err = DB_SUCCESS;
4642 /* fall through */
4643 case DB_SUCCESS:
4644 break;
4645 default:
4646 goto lock_wait_or_error;
4647 }
4648 }
4649 } else if (mode == PAGE_CUR_G || mode == PAGE_CUR_L) {
4650 err = btr_pcur_open_at_index_side(
4651 mode == PAGE_CUR_G, index, BTR_SEARCH_LEAF,
4652 pcur, false, 0, &mtr);
4653
4654 if (err != DB_SUCCESS) {
4655 if (err == DB_DECRYPTION_FAILED) {
4656 ib_push_warning(trx->mysql_thd,
4657 DB_DECRYPTION_FAILED,
4658 "Table %s is encrypted but encryption service or"
4659 " used key_id is not available. "
4660 " Can't continue reading table.",
4661 prebuilt->table->name);
4662 index->table->file_unreadable = true;
4663 }
4664 rec = NULL;
4665 goto lock_wait_or_error;
4666 }
4667 }
4668
4669rec_loop:
4670 DEBUG_SYNC_C("row_search_rec_loop");
4671 if (trx_is_interrupted(trx)) {
4672 if (!spatial_search) {
4673 btr_pcur_store_position(pcur, &mtr);
4674 }
4675 err = DB_INTERRUPTED;
4676 goto normal_return;
4677 }
4678
4679 /*-------------------------------------------------------------*/
4680 /* PHASE 4: Look for matching records in a loop */
4681
4682 rec = btr_pcur_get_rec(pcur);
4683
4684 if (!index->table->is_readable()) {
4685 err = DB_DECRYPTION_FAILED;
4686 goto lock_wait_or_error;
4687 }
4688
4689 ut_ad(!!page_rec_is_comp(rec) == comp);
4690 ut_ad(page_rec_is_leaf(rec));
4691
4692 if (page_rec_is_infimum(rec)) {
4693
4694 /* The infimum record on a page cannot be in the result set,
4695 and neither can a record lock be placed on it: we skip such
4696 a record. */
4697
4698 goto next_rec;
4699 }
4700
4701 if (page_rec_is_supremum(rec)) {
4702
4703 if (set_also_gap_locks
4704 && !(srv_locks_unsafe_for_binlog
4705 || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
4706 && prebuilt->select_lock_type != LOCK_NONE
4707 && !dict_index_is_spatial(index)) {
4708
4709 /* Try to place a lock on the index record */
4710
4711 /* If innodb_locks_unsafe_for_binlog option is used
4712 or this session is using a READ COMMITTED or lower isolation
4713 level we do not lock gaps. Supremum record is really
4714 a gap and therefore we do not set locks there. */
4715
4716 offsets = rec_get_offsets(rec, index, offsets, true,
4717 ULINT_UNDEFINED, &heap);
4718 err = sel_set_rec_lock(pcur,
4719 rec, index, offsets,
4720 prebuilt->select_lock_type,
4721 LOCK_ORDINARY, thr, &mtr);
4722
4723 switch (err) {
4724 case DB_SUCCESS_LOCKED_REC:
4725 err = DB_SUCCESS;
4726 /* fall through */
4727 case DB_SUCCESS:
4728 break;
4729 default:
4730 goto lock_wait_or_error;
4731 }
4732 }
4733
4734 /* A page supremum record cannot be in the result set: skip
4735 it now that we have placed a possible lock on it */
4736
4737 goto next_rec;
4738 }
4739
4740 /*-------------------------------------------------------------*/
4741 /* Do sanity checks in case our cursor has bumped into page
4742 corruption */
4743
4744 if (comp) {
4745 if (rec_get_info_bits(rec, true) & REC_INFO_MIN_REC_FLAG) {
4746 /* Skip the 'default row' pseudo-record. */
4747 ut_ad(index->is_instant());
4748 goto next_rec;
4749 }
4750
4751 next_offs = rec_get_next_offs(rec, TRUE);
4752 if (UNIV_UNLIKELY(next_offs < PAGE_NEW_SUPREMUM)) {
4753
4754 goto wrong_offs;
4755 }
4756 } else {
4757 if (rec_get_info_bits(rec, false) & REC_INFO_MIN_REC_FLAG) {
4758 /* Skip the 'default row' pseudo-record. */
4759 ut_ad(index->is_instant());
4760 goto next_rec;
4761 }
4762
4763 next_offs = rec_get_next_offs(rec, FALSE);
4764 if (UNIV_UNLIKELY(next_offs < PAGE_OLD_SUPREMUM)) {
4765
4766 goto wrong_offs;
4767 }
4768 }
4769
4770 if (UNIV_UNLIKELY(next_offs >= srv_page_size - PAGE_DIR)) {
4771
4772wrong_offs:
4773 if (srv_force_recovery == 0 || moves_up == FALSE) {
4774 ib::error() << "Rec address "
4775 << static_cast<const void*>(rec)
4776 << ", buf block fix count "
4777 << btr_cur_get_block(
4778 btr_pcur_get_btr_cur(pcur))->page
4779 .buf_fix_count;
4780
4781 ib::error() << "Index corruption: rec offs "
4782 << page_offset(rec) << " next offs "
4783 << next_offs << ", page no "
4784 << page_get_page_no(page_align(rec))
4785 << ", index " << index->name
4786 << " of table " << index->table->name
4787 << ". Run CHECK TABLE. You may need to"
4788 " restore from a backup, or dump + drop +"
4789 " reimport the table.";
4790 ut_ad(0);
4791 err = DB_CORRUPTION;
4792
4793 goto lock_wait_or_error;
4794 } else {
4795 /* The user may be dumping a corrupt table. Jump
4796 over the corruption to recover as much as possible. */
4797
4798 ib::info() << "Index corruption: rec offs "
4799 << page_offset(rec) << " next offs "
4800 << next_offs << ", page no "
4801 << page_get_page_no(page_align(rec))
4802 << ", index " << index->name
4803 << " of table " << index->table->name
4804 << ". We try to skip the rest of the page.";
4805
4806 btr_pcur_move_to_last_on_page(pcur, &mtr);
4807
4808 goto next_rec;
4809 }
4810 }
4811 /*-------------------------------------------------------------*/
4812
4813 /* Calculate the 'offsets' associated with 'rec' */
4814
4815 ut_ad(fil_page_index_page_check(btr_pcur_get_page(pcur)));
4816 ut_ad(btr_page_get_index_id(btr_pcur_get_page(pcur)) == index->id);
4817
4818 offsets = rec_get_offsets(rec, index, offsets, true,
4819 ULINT_UNDEFINED, &heap);
4820
4821 if (UNIV_UNLIKELY(srv_force_recovery > 0)) {
4822 if (!rec_validate(rec, offsets)
4823 || !btr_index_rec_validate(rec, index, FALSE)) {
4824
4825 ib::error() << "Index corruption: rec offs "
4826 << page_offset(rec) << " next offs "
4827 << next_offs << ", page no "
4828 << page_get_page_no(page_align(rec))
4829 << ", index " << index->name
4830 << " of table " << index->table->name
4831 << ". We try to skip the record.";
4832
4833 goto next_rec;
4834 }
4835 }
4836
4837 /* Note that we cannot trust the up_match value in the cursor at this
4838 place because we can arrive here after moving the cursor! Thus
4839 we have to recompare rec and search_tuple to determine if they
4840 match enough. */
4841
4842 if (match_mode == ROW_SEL_EXACT) {
4843 /* Test if the index record matches completely to search_tuple
4844 in prebuilt: if not, then we return with DB_RECORD_NOT_FOUND */
4845
4846 /* fputs("Comparing rec and search tuple\n", stderr); */
4847
4848 if (0 != cmp_dtuple_rec(search_tuple, rec, offsets)) {
4849
4850 if (set_also_gap_locks
4851 && !(srv_locks_unsafe_for_binlog
4852 || trx->isolation_level
4853 <= TRX_ISO_READ_COMMITTED)
4854 && prebuilt->select_lock_type != LOCK_NONE
4855 && !dict_index_is_spatial(index)) {
4856
4857 /* Try to place a gap lock on the index
4858 record only if innodb_locks_unsafe_for_binlog
4859 option is not set or this session is not
4860 using a READ COMMITTED or lower isolation level. */
4861
4862 err = sel_set_rec_lock(
4863 pcur,
4864 rec, index, offsets,
4865 prebuilt->select_lock_type, LOCK_GAP,
4866 thr, &mtr);
4867
4868 switch (err) {
4869 case DB_SUCCESS_LOCKED_REC:
4870 case DB_SUCCESS:
4871 break;
4872 default:
4873 goto lock_wait_or_error;
4874 }
4875 }
4876
4877 btr_pcur_store_position(pcur, &mtr);
4878
4879 /* The found record was not a match, but may be used
4880 as NEXT record (index_next). Set the relative position
4881 to BTR_PCUR_BEFORE, to reflect that the position of
4882 the persistent cursor is before the found/stored row
4883 (pcur->old_rec). */
4884 ut_ad(pcur->rel_pos == BTR_PCUR_ON);
4885 pcur->rel_pos = BTR_PCUR_BEFORE;
4886
4887 err = DB_RECORD_NOT_FOUND;
4888 goto normal_return;
4889 }
4890
4891 } else if (match_mode == ROW_SEL_EXACT_PREFIX) {
4892
4893 if (!cmp_dtuple_is_prefix_of_rec(search_tuple, rec, offsets)) {
4894
4895 if (set_also_gap_locks
4896 && !(srv_locks_unsafe_for_binlog
4897 || trx->isolation_level
4898 <= TRX_ISO_READ_COMMITTED)
4899 && prebuilt->select_lock_type != LOCK_NONE
4900 && !dict_index_is_spatial(index)) {
4901
4902 /* Try to place a gap lock on the index
4903 record only if innodb_locks_unsafe_for_binlog
4904 option is not set or this session is not
4905 using a READ COMMITTED or lower isolation level. */
4906
4907 err = sel_set_rec_lock(
4908 pcur,
4909 rec, index, offsets,
4910 prebuilt->select_lock_type, LOCK_GAP,
4911 thr, &mtr);
4912
4913 switch (err) {
4914 case DB_SUCCESS_LOCKED_REC:
4915 case DB_SUCCESS:
4916 break;
4917 default:
4918 goto lock_wait_or_error;
4919 }
4920 }
4921
4922 btr_pcur_store_position(pcur, &mtr);
4923
4924 /* The found record was not a match, but may be used
4925 as NEXT record (index_next). Set the relative position
4926 to BTR_PCUR_BEFORE, to reflect that the position of
4927 the persistent cursor is before the found/stored row
4928 (pcur->old_rec). */
4929 ut_ad(pcur->rel_pos == BTR_PCUR_ON);
4930 pcur->rel_pos = BTR_PCUR_BEFORE;
4931
4932 err = DB_RECORD_NOT_FOUND;
4933 goto normal_return;
4934 }
4935 }
4936
4937 /* We are ready to look at a possible new index entry in the result
4938 set: the cursor is now placed on a user record */
4939
4940 if (prebuilt->select_lock_type != LOCK_NONE) {
4941 /* Try to place a lock on the index record; note that delete
4942 marked records are a special case in a unique search. If there
4943 is a non-delete marked record, then it is enough to lock its
4944 existence with LOCK_REC_NOT_GAP. */
4945
4946 /* If innodb_locks_unsafe_for_binlog option is used
4947 or this session is using a READ COMMITTED isolation
4948 level we lock only the record, i.e., next-key locking is
4949 not used. */
4950
4951 ulint lock_type;
4952
4953 if (srv_locks_unsafe_for_binlog
4954 || trx->isolation_level <= TRX_ISO_READ_COMMITTED) {
4955 /* At READ COMMITTED or READ UNCOMMITTED
4956 isolation levels, do not lock committed
4957 delete-marked records. */
4958 if (!rec_get_deleted_flag(rec, comp)) {
4959 goto no_gap_lock;
4960 }
4961 if (index == clust_index) {
4962 trx_id_t trx_id = row_get_rec_trx_id(
4963 rec, index, offsets);
4964 /* In delete-marked records, DB_TRX_ID must
4965 always refer to an existing undo log record. */
4966 ut_ad(trx_id);
4967 if (!trx_sys.is_registered(trx, trx_id)) {
4968 /* The clustered index record
4969 was delete-marked in a committed
4970 transaction. Ignore the record. */
4971 goto locks_ok_del_marked;
4972 }
4973 } else if (trx_t* t = row_vers_impl_x_locked(
4974 trx, rec, index, offsets)) {
4975 /* The record belongs to an active
4976 transaction. We must acquire a lock. */
4977 t->release_reference();
4978 } else {
4979 /* The secondary index record does not
4980 point to a delete-marked clustered index
4981 record that belongs to an active transaction.
4982 Ignore the secondary index record, because
4983 it is not locked. */
4984 goto next_rec;
4985 }
4986
4987 goto no_gap_lock;
4988 }
4989
4990 if (!set_also_gap_locks
4991 || (unique_search && !rec_get_deleted_flag(rec, comp))
4992 || dict_index_is_spatial(index)) {
4993
4994 goto no_gap_lock;
4995 } else {
4996 lock_type = LOCK_ORDINARY;
4997 }
4998
4999 /* If we are doing a 'greater or equal than a primary key
5000 value' search from a clustered index, and we find a record
5001 that has that exact primary key value, then there is no need
5002 to lock the gap before the record, because no insert in the
5003 gap can be in our search range. That is, no phantom row can
5004 appear that way.
5005
5006 An example: if col1 is the primary key, the search is WHERE
5007 col1 >= 100, and we find a record where col1 = 100, then no
5008 need to lock the gap before that record. */
5009
5010 if (index == clust_index
5011 && mode == PAGE_CUR_GE
5012 && direction == 0
5013 && dtuple_get_n_fields_cmp(search_tuple)
5014 == dict_index_get_n_unique(index)
5015 && 0 == cmp_dtuple_rec(search_tuple, rec, offsets)) {
5016no_gap_lock:
5017 lock_type = LOCK_REC_NOT_GAP;
5018 }
5019
5020 err = sel_set_rec_lock(pcur,
5021 rec, index, offsets,
5022 prebuilt->select_lock_type,
5023 lock_type, thr, &mtr);
5024
5025 switch (err) {
5026 const rec_t* old_vers;
5027 case DB_SUCCESS_LOCKED_REC:
5028 if (srv_locks_unsafe_for_binlog
5029 || trx->isolation_level
5030 <= TRX_ISO_READ_COMMITTED) {
5031 /* Note that a record of
5032 prebuilt->index was locked. */
5033 prebuilt->new_rec_locks = 1;
5034 }
5035 err = DB_SUCCESS;
5036 /* fall through */
5037 case DB_SUCCESS:
5038 break;
5039 case DB_LOCK_WAIT:
5040 /* Lock wait for R-tree should already
5041 be handled in sel_set_rtr_rec_lock() */
5042 ut_ad(!dict_index_is_spatial(index));
5043 /* Never unlock rows that were part of a conflict. */
5044 prebuilt->new_rec_locks = 0;
5045
5046 if (UNIV_LIKELY(prebuilt->row_read_type
5047 != ROW_READ_TRY_SEMI_CONSISTENT)
5048 || unique_search
5049 || index != clust_index) {
5050
5051 goto lock_wait_or_error;
5052 }
5053
5054 /* The following call returns 'offsets'
5055 associated with 'old_vers' */
5056 row_sel_build_committed_vers_for_mysql(
5057 clust_index, prebuilt, rec,
5058 &offsets, &heap, &old_vers, need_vrow ? &vrow : NULL,
5059 &mtr);
5060
5061 /* Check whether it was a deadlock or not, if not
5062 a deadlock and the transaction had to wait then
5063 release the lock it is waiting on. */
5064
5065 err = lock_trx_handle_wait(trx);
5066
5067 switch (err) {
5068 case DB_SUCCESS:
5069 /* The lock was granted while we were
5070 searching for the last committed version.
5071 Do a normal locking read. */
5072
5073 offsets = rec_get_offsets(
5074 rec, index, offsets, true,
5075 ULINT_UNDEFINED, &heap);
5076 goto locks_ok;
5077 case DB_DEADLOCK:
5078 goto lock_wait_or_error;
5079 case DB_LOCK_WAIT:
5080 ut_ad(!dict_index_is_spatial(index));
5081 err = DB_SUCCESS;
5082 break;
5083 default:
5084 ut_error;
5085 }
5086
5087 if (old_vers == NULL) {
5088 /* The row was not yet committed */
5089
5090 goto next_rec;
5091 }
5092
5093 did_semi_consistent_read = TRUE;
5094 rec = old_vers;
5095 break;
5096 case DB_RECORD_NOT_FOUND:
5097 if (dict_index_is_spatial(index)) {
5098 goto next_rec;
5099 } else {
5100 goto lock_wait_or_error;
5101 }
5102
5103 default:
5104
5105 goto lock_wait_or_error;
5106 }
5107 } else {
5108 /* This is a non-locking consistent read: if necessary, fetch
5109 a previous version of the record */
5110
5111 if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED
5112 || prebuilt->table->no_rollback()) {
5113
5114 /* Do nothing: we let a non-locking SELECT read the
5115 latest version of the record */
5116
5117 } else if (index == clust_index) {
5118
5119 /* Fetch a previous version of the row if the current
5120 one is not visible in the snapshot; if we have a very
5121 high force recovery level set, we try to avoid crashes
5122 by skipping this lookup */
5123
5124 if (!lock_clust_rec_cons_read_sees(
5125 rec, index, offsets, &trx->read_view)) {
5126 ut_ad(srv_force_recovery
5127 < SRV_FORCE_NO_UNDO_LOG_SCAN);
5128 rec_t* old_vers;
5129 /* The following call returns 'offsets'
5130 associated with 'old_vers' */
5131 err = row_sel_build_prev_vers_for_mysql(
5132 &trx->read_view, clust_index,
5133 prebuilt, rec, &offsets, &heap,
5134 &old_vers, need_vrow ? &vrow : NULL,
5135 &mtr);
5136
5137 if (err != DB_SUCCESS) {
5138
5139 goto lock_wait_or_error;
5140 }
5141
5142 if (old_vers == NULL) {
5143 /* The row did not exist yet in
5144 the read view */
5145
5146 goto next_rec;
5147 }
5148
5149 rec = old_vers;
5150 }
5151 } else {
5152 /* We are looking into a non-clustered index,
5153 and to get the right version of the record we
5154 have to look also into the clustered index: this
5155 is necessary, because we can only get the undo
5156 information via the clustered index record. */
5157
5158 ut_ad(!dict_index_is_clust(index));
5159
5160 if (!srv_read_only_mode
5161 && !lock_sec_rec_cons_read_sees(
5162 rec, index, &trx->read_view)) {
5163 /* We should look at the clustered index.
5164 However, as this is a non-locking read,
5165 we can skip the clustered index lookup if
5166 the condition does not match the secondary
5167 index entry. */
5168 switch (row_search_idx_cond_check(
5169 buf, prebuilt, rec, offsets)) {
5170 case ICP_NO_MATCH:
5171 goto next_rec;
5172 case ICP_OUT_OF_RANGE:
5173 case ICP_ABORTED_BY_USER:
5174 case ICP_ERROR:
5175 err = DB_RECORD_NOT_FOUND;
5176 goto idx_cond_failed;
5177 case ICP_MATCH:
5178 goto requires_clust_rec;
5179 }
5180
5181 ut_error;
5182 }
5183 }
5184 }
5185
5186locks_ok:
5187 /* NOTE that at this point rec can be an old version of a clustered
5188 index record built for a consistent read. We cannot assume after this
5189 point that rec is on a buffer pool page. Functions like
5190 page_rec_is_comp() cannot be used! */
5191
5192 if (rec_get_deleted_flag(rec, comp)) {
5193locks_ok_del_marked:
5194 /* In delete-marked records, DB_TRX_ID must
5195 always refer to an existing undo log record. */
5196 ut_ad(index != clust_index
5197 || row_get_rec_trx_id(rec, index, offsets));
5198
5199 /* The record is delete-marked: we can skip it */
5200
5201 /* This is an optimization to skip setting the next key lock
5202 on the record that follows this delete-marked record. This
5203 optimization works because of the unique search criteria
5204 which precludes the presence of a range lock between this
5205 delete marked record and the record following it.
5206
5207 For now this is applicable only to clustered indexes while
5208 doing a unique search except for HANDLER queries because
5209 HANDLER allows NEXT and PREV even in unique search on
5210 clustered index. There is scope for further optimization
5211 applicable to unique secondary indexes. Current behaviour is
5212 to widen the scope of a lock on an already delete marked record
5213 if the same record is deleted twice by the same transaction */
5214 if (index == clust_index && unique_search
5215 && !prebuilt->used_in_HANDLER) {
5216
5217 err = DB_RECORD_NOT_FOUND;
5218
5219 goto normal_return;
5220 }
5221
5222 goto next_rec;
5223 }
5224
5225 /* Check if the record matches the index condition. */
5226 switch (row_search_idx_cond_check(buf, prebuilt, rec, offsets)) {
5227 case ICP_NO_MATCH:
5228 if (did_semi_consistent_read) {
5229 row_unlock_for_mysql(prebuilt, TRUE);
5230 }
5231 goto next_rec;
5232 case ICP_OUT_OF_RANGE:
5233 case ICP_ABORTED_BY_USER:
5234 case ICP_ERROR:
5235 err = DB_RECORD_NOT_FOUND;
5236 goto idx_cond_failed;
5237 case ICP_MATCH:
5238 break;
5239 }
5240
5241 if (index != clust_index && prebuilt->need_to_access_clustered) {
5242 if (row_search_with_covering_prefix(prebuilt, rec, offsets)) {
5243 goto use_covering_index;
5244 }
5245requires_clust_rec:
5246 ut_ad(index != clust_index);
5247 /* We use a 'goto' to the preceding label if a consistent
5248 read of a secondary index record requires us to look up old
5249 versions of the associated clustered index record. */
5250
5251 ut_ad(rec_offs_validate(rec, index, offsets));
5252
5253 /* It was a non-clustered index and we must fetch also the
5254 clustered index record */
5255
5256 mtr_has_extra_clust_latch = TRUE;
5257
5258 ut_ad(!vrow);
5259 /* The following call returns 'offsets' associated with
5260 'clust_rec'. Note that 'clust_rec' can be an old version
5261 built for a consistent read. */
5262
5263 err = row_sel_get_clust_rec_for_mysql(prebuilt, index, rec,
5264 thr, &clust_rec,
5265 &offsets, &heap,
5266 need_vrow ? &vrow : NULL,
5267 &mtr);
5268 switch (err) {
5269 case DB_SUCCESS:
5270 if (clust_rec == NULL) {
5271 /* The record did not exist in the read view */
5272 ut_ad(prebuilt->select_lock_type == LOCK_NONE
5273 || dict_index_is_spatial(index));
5274
5275 goto next_rec;
5276 }
5277 break;
5278 case DB_SUCCESS_LOCKED_REC:
5279 ut_a(clust_rec != NULL);
5280 if (srv_locks_unsafe_for_binlog
5281 || trx->isolation_level
5282 <= TRX_ISO_READ_COMMITTED) {
5283 /* Note that the clustered index record
5284 was locked. */
5285 prebuilt->new_rec_locks = 2;
5286 }
5287 err = DB_SUCCESS;
5288 break;
5289 default:
5290 vrow = NULL;
5291 goto lock_wait_or_error;
5292 }
5293
5294 if (rec_get_deleted_flag(clust_rec, comp)) {
5295
5296 /* The record is delete marked: we can skip it */
5297
5298 if ((srv_locks_unsafe_for_binlog
5299 || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
5300 && prebuilt->select_lock_type != LOCK_NONE) {
5301
5302 /* No need to keep a lock on a delete-marked
5303 record if we do not want to use next-key
5304 locking. */
5305
5306 row_unlock_for_mysql(prebuilt, TRUE);
5307 }
5308
5309 goto next_rec;
5310 }
5311
5312 if (need_vrow && !vrow) {
5313 if (!heap) {
5314 heap = mem_heap_create(100);
5315 }
5316 row_sel_fill_vrow(rec, index, &vrow, heap);
5317 }
5318
5319 result_rec = clust_rec;
5320 ut_ad(rec_offs_validate(result_rec, clust_index, offsets));
5321
5322 if (prebuilt->idx_cond) {
5323 /* Convert the record to MySQL format. We were
5324 unable to do this in row_search_idx_cond_check(),
5325 because the condition is on the secondary index
5326 and the requested column is in the clustered index.
5327 We convert all fields, including those that
5328 may have been used in ICP, because the
5329 secondary index may contain a column prefix
5330 rather than the full column. Also, as noted
5331 in Bug #56680, the column in the secondary
5332 index may be in the wrong case, and the
5333 authoritative case is in result_rec, the
5334 appropriate version of the clustered index record. */
5335 if (!row_sel_store_mysql_rec(
5336 buf, prebuilt, result_rec, vrow,
5337 true, clust_index, offsets)) {
5338 goto next_rec;
5339 }
5340 }
5341 } else {
5342use_covering_index:
5343 result_rec = rec;
5344 }
5345
5346 /* We found a qualifying record 'result_rec'. At this point,
5347 'offsets' are associated with 'result_rec'. */
5348
5349 ut_ad(rec_offs_validate(result_rec,
5350 result_rec != rec ? clust_index : index,
5351 offsets));
5352 ut_ad(!rec_get_deleted_flag(result_rec, comp));
5353
5354 /* Decide whether to prefetch extra rows.
5355 At this point, the clustered index record is protected
5356 by a page latch that was acquired when pcur was positioned.
5357 The latch will not be released until mtr.commit(). */
5358
5359 if ((match_mode == ROW_SEL_EXACT
5360 || prebuilt->n_rows_fetched >= MYSQL_FETCH_CACHE_THRESHOLD)
5361 && prebuilt->select_lock_type == LOCK_NONE
5362 && !prebuilt->m_no_prefetch
5363 && !prebuilt->templ_contains_blob
5364 && !prebuilt->clust_index_was_generated
5365 && !prebuilt->used_in_HANDLER
5366 && prebuilt->template_type != ROW_MYSQL_DUMMY_TEMPLATE
5367 && !prebuilt->in_fts_query) {
5368
5369 /* Inside an update, for example, we do not cache rows,
5370 since we may use the cursor position to do the actual
5371 update, that is why we require ...lock_type == LOCK_NONE.
5372 Since we keep space in prebuilt only for the BLOBs of
5373 a single row, we cannot cache rows in the case there
5374 are BLOBs in the fields to be fetched. In HANDLER we do
5375 not cache rows because there the cursor is a scrollable
5376 cursor. */
5377
5378 ut_a(prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE);
5379
5380 /* We only convert from InnoDB row format to MySQL row
5381 format when ICP is disabled. */
5382
5383 if (!prebuilt->idx_cond) {
5384
5385 /* We use next_buf to track the allocation of buffers
5386 where we store and enqueue the buffers for our
5387 pre-fetch optimisation.
5388
5389 If next_buf == 0 then we store the converted record
5390 directly into the MySQL record buffer (buf). If it is
5391 != 0 then we allocate a pre-fetch buffer and store the
5392 converted record there.
5393
5394 If the conversion fails and the MySQL record buffer
5395 was not written to then we reset next_buf so that
5396 we can re-use the MySQL record buffer in the next
5397 iteration. */
5398
5399 next_buf = next_buf
5400 ? row_sel_fetch_last_buf(prebuilt) : buf;
5401
5402 if (!row_sel_store_mysql_rec(
5403 next_buf, prebuilt, result_rec, vrow,
5404 result_rec != rec,
5405 result_rec != rec ? clust_index : index,
5406 offsets)) {
5407
5408 if (next_buf == buf) {
5409 ut_a(prebuilt->n_fetch_cached == 0);
5410 next_buf = 0;
5411 }
5412
5413 /* Only fresh inserts may contain incomplete
5414 externally stored columns. Pretend that such
5415 records do not exist. Such records may only be
5416 accessed at the READ UNCOMMITTED isolation
5417 level or when rolling back a recovered
5418 transaction. Rollback happens at a lower
5419 level, not here. */
5420 goto next_rec;
5421 }
5422
5423 if (next_buf != buf) {
5424 row_sel_enqueue_cache_row_for_mysql(
5425 next_buf, prebuilt);
5426 }
5427 } else {
5428 row_sel_enqueue_cache_row_for_mysql(buf, prebuilt);
5429 }
5430
5431 if (prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE) {
5432 goto next_rec;
5433 }
5434
5435 } else {
5436 if (UNIV_UNLIKELY
5437 (prebuilt->template_type == ROW_MYSQL_DUMMY_TEMPLATE)) {
5438 /* CHECK TABLE: fetch the row */
5439
5440 if (result_rec != rec
5441 && !prebuilt->need_to_access_clustered) {
5442 /* We used 'offsets' for the clust
5443 rec, recalculate them for 'rec' */
5444 offsets = rec_get_offsets(rec, index, offsets,
5445 true,
5446 ULINT_UNDEFINED,
5447 &heap);
5448 result_rec = rec;
5449 }
5450
5451 memcpy(buf + 4, result_rec
5452 - rec_offs_extra_size(offsets),
5453 rec_offs_size(offsets));
5454 mach_write_to_4(buf,
5455 rec_offs_extra_size(offsets) + 4);
5456 } else if (!prebuilt->idx_cond) {
5457 /* The record was not yet converted to MySQL format. */
5458 if (!row_sel_store_mysql_rec(
5459 buf, prebuilt, result_rec, vrow,
5460 result_rec != rec,
5461 result_rec != rec ? clust_index : index,
5462 offsets)) {
5463 /* Only fresh inserts may contain
5464 incomplete externally stored
5465 columns. Pretend that such records do
5466 not exist. Such records may only be
5467 accessed at the READ UNCOMMITTED
5468 isolation level or when rolling back a
5469 recovered transaction. Rollback
5470 happens at a lower level, not here. */
5471 goto next_rec;
5472 }
5473 }
5474
5475 if (prebuilt->clust_index_was_generated) {
5476 row_sel_store_row_id_to_prebuilt(
5477 prebuilt, result_rec,
5478 result_rec == rec ? index : clust_index,
5479 offsets);
5480 }
5481 }
5482
5483 /* From this point on, 'offsets' are invalid. */
5484
5485 /* We have an optimization to save CPU time: if this is a consistent
5486 read on a unique condition on the clustered index, then we do not
5487 store the pcur position, because any fetch next or prev will anyway
5488 return 'end of file'. Exceptions are locking reads and the MySQL
5489 HANDLER command where the user can move the cursor with PREV or NEXT
5490 even after a unique search. */
5491
5492 err = DB_SUCCESS;
5493
5494idx_cond_failed:
5495 if (!unique_search
5496 || !dict_index_is_clust(index)
5497 || direction != 0
5498 || prebuilt->select_lock_type != LOCK_NONE
5499 || prebuilt->used_in_HANDLER) {
5500
5501 /* Inside an update always store the cursor position */
5502
5503 if (!spatial_search) {
5504 btr_pcur_store_position(pcur, &mtr);
5505 }
5506 }
5507
5508 goto normal_return;
5509
5510next_rec:
5511 /* Reset the old and new "did semi-consistent read" flags. */
5512 if (UNIV_UNLIKELY(prebuilt->row_read_type
5513 == ROW_READ_DID_SEMI_CONSISTENT)) {
5514 prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
5515 }
5516 did_semi_consistent_read = FALSE;
5517 prebuilt->new_rec_locks = 0;
5518 vrow = NULL;
5519
5520 /*-------------------------------------------------------------*/
5521 /* PHASE 5: Move the cursor to the next index record */
5522
5523 /* NOTE: For moves_up==FALSE, the mini-transaction will be
5524 committed and restarted every time when switching b-tree
5525 pages. For moves_up==TRUE in index condition pushdown, we can
5526 scan an entire secondary index tree within a single
5527 mini-transaction. As long as the prebuilt->idx_cond does not
5528 match, we do not need to consult the clustered index or
5529 return records to MySQL, and thus we can avoid repositioning
5530 the cursor. What prevents us from buffer-fixing all leaf pages
5531 within the mini-transaction is the btr_leaf_page_release()
5532 call in btr_pcur_move_to_next_page(). Only the leaf page where
5533 the cursor is positioned will remain buffer-fixed.
5534 For R-tree spatial search, we also commit the mini-transaction
5535 each time */
5536
5537 if (spatial_search) {
5538 /* No need to do store restore for R-tree */
5539 mtr.commit();
5540 mtr.start();
5541 mtr_has_extra_clust_latch = FALSE;
5542 } else if (mtr_has_extra_clust_latch) {
5543 /* If we have extra cluster latch, we must commit
5544 mtr if we are moving to the next non-clustered
5545 index record, because we could break the latching
5546 order if we would access a different clustered
5547 index page right away without releasing the previous. */
5548
5549 btr_pcur_store_position(pcur, &mtr);
5550 mtr.commit();
5551 mtr_has_extra_clust_latch = FALSE;
5552
5553 mtr.start();
5554
5555 if (sel_restore_position_for_mysql(&same_user_rec,
5556 BTR_SEARCH_LEAF,
5557 pcur, moves_up, &mtr)) {
5558 goto rec_loop;
5559 }
5560 }
5561
5562 if (moves_up) {
5563 bool move;
5564
5565 if (spatial_search) {
5566 move = rtr_pcur_move_to_next(
5567 search_tuple, mode, pcur, 0, &mtr);
5568 } else {
5569 move = btr_pcur_move_to_next(pcur, &mtr);
5570 }
5571
5572 if (!move) {
5573not_moved:
5574 if (!spatial_search) {
5575 btr_pcur_store_position(pcur, &mtr);
5576 }
5577
5578 if (match_mode != 0) {
5579 err = DB_RECORD_NOT_FOUND;
5580 } else {
5581 err = DB_END_OF_INDEX;
5582 }
5583
5584 goto normal_return;
5585 }
5586 } else {
5587 if (UNIV_UNLIKELY(!btr_pcur_move_to_prev(pcur, &mtr))) {
5588 goto not_moved;
5589 }
5590 }
5591
5592 goto rec_loop;
5593
5594lock_wait_or_error:
5595 /* Reset the old and new "did semi-consistent read" flags. */
5596 if (UNIV_UNLIKELY(prebuilt->row_read_type
5597 == ROW_READ_DID_SEMI_CONSISTENT)) {
5598 prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
5599 }
5600 did_semi_consistent_read = FALSE;
5601
5602 /*-------------------------------------------------------------*/
5603 if (!dict_index_is_spatial(index)) {
5604 if (rec) {
5605 btr_pcur_store_position(pcur, &mtr);
5606 }
5607 }
5608
5609lock_table_wait:
5610 mtr.commit();
5611 mtr_has_extra_clust_latch = FALSE;
5612
5613 trx->error_state = err;
5614
5615 /* The following is a patch for MySQL */
5616
5617 if (thr->is_active) {
5618 que_thr_stop_for_mysql(thr);
5619 }
5620
5621 thr->lock_state = QUE_THR_LOCK_ROW;
5622
5623 if (row_mysql_handle_errors(&err, trx, thr, NULL)) {
5624 /* It was a lock wait, and it ended */
5625
5626 thr->lock_state = QUE_THR_LOCK_NOLOCK;
5627 mtr.start();
5628
5629 /* Table lock waited, go try to obtain table lock
5630 again */
5631 if (table_lock_waited) {
5632 table_lock_waited = FALSE;
5633
5634 goto wait_table_again;
5635 }
5636
5637 if (!dict_index_is_spatial(index)) {
5638 sel_restore_position_for_mysql(
5639 &same_user_rec, BTR_SEARCH_LEAF, pcur,
5640 moves_up, &mtr);
5641 }
5642
5643 if ((srv_locks_unsafe_for_binlog
5644 || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
5645 && !same_user_rec) {
5646
5647 /* Since we were not able to restore the cursor
5648 on the same user record, we cannot use
5649 row_unlock_for_mysql() to unlock any records, and
5650 we must thus reset the new rec lock info. Since
5651 in lock0lock.cc we have blocked the inheriting of gap
5652 X-locks, we actually do not have any new record locks
5653 set in this case.
5654
5655 Note that if we were able to restore on the 'same'
5656 user record, it is still possible that we were actually
5657 waiting on a delete-marked record, and meanwhile
5658 it was removed by purge and inserted again by some
5659 other user. But that is no problem, because in
5660 rec_loop we will again try to set a lock, and
5661 new_rec_lock_info in trx will be right at the end. */
5662
5663 prebuilt->new_rec_locks = 0;
5664 }
5665
5666 mode = pcur->search_mode;
5667
5668 goto rec_loop;
5669 }
5670
5671 thr->lock_state = QUE_THR_LOCK_NOLOCK;
5672
5673 goto func_exit;
5674
5675normal_return:
5676 /*-------------------------------------------------------------*/
5677 {
5678 /* handler_index_cond_check() may pull TR_table search
5679 which initates another row_search_mvcc(). */
5680 ulint n_active_thrs= trx->lock.n_active_thrs;
5681 trx->lock.n_active_thrs= 1;
5682 que_thr_stop_for_mysql_no_error(thr, trx);
5683 trx->lock.n_active_thrs= n_active_thrs - 1;
5684 }
5685
5686 mtr.commit();
5687
5688 DEBUG_SYNC_C("row_search_for_mysql_before_return");
5689
5690 if (prebuilt->idx_cond != 0) {
5691
5692 /* When ICP is active we don't write to the MySQL buffer
5693 directly, only to buffers that are enqueued in the pre-fetch
5694 queue. We need to dequeue the first buffer and copy the contents
5695 to the record buffer that was passed in by MySQL. */
5696
5697 if (prebuilt->n_fetch_cached > 0) {
5698 row_sel_dequeue_cached_row_for_mysql(buf, prebuilt);
5699 err = DB_SUCCESS;
5700 }
5701
5702 } else if (next_buf != 0) {
5703
5704 /* We may or may not have enqueued some buffers to the
5705 pre-fetch queue, but we definitely wrote to the record
5706 buffer passed to use by MySQL. */
5707
5708 DEBUG_SYNC_C("row_search_cached_row");
5709 err = DB_SUCCESS;
5710 }
5711
5712#ifdef UNIV_DEBUG
5713 if (dict_index_is_spatial(index) && err != DB_SUCCESS
5714 && err != DB_END_OF_INDEX && err != DB_INTERRUPTED) {
5715 rtr_node_path_t* path = pcur->btr_cur.rtr_info->path;
5716
5717 ut_ad(path->empty());
5718 }
5719#endif
5720
5721func_exit:
5722 trx->op_info = "";
5723 if (heap != NULL) {
5724 mem_heap_free(heap);
5725 }
5726
5727 /* Set or reset the "did semi-consistent read" flag on return.
5728 The flag did_semi_consistent_read is set if and only if
5729 the record being returned was fetched with a semi-consistent read. */
5730 ut_ad(prebuilt->row_read_type != ROW_READ_WITH_LOCKS
5731 || !did_semi_consistent_read);
5732
5733 if (prebuilt->row_read_type != ROW_READ_WITH_LOCKS) {
5734 if (did_semi_consistent_read) {
5735 prebuilt->row_read_type = ROW_READ_DID_SEMI_CONSISTENT;
5736 } else {
5737 prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
5738 }
5739 }
5740
5741 ut_ad(!sync_check_iterate(sync_check()));
5742
5743 DEBUG_SYNC_C("innodb_row_search_for_mysql_exit");
5744
5745 DBUG_RETURN(err);
5746}
5747
5748/********************************************************************//**
5749Count rows in a R-Tree leaf level.
5750@return DB_SUCCESS if successful */
5751dberr_t
5752row_count_rtree_recs(
5753/*=================*/
5754 row_prebuilt_t* prebuilt, /*!< in: prebuilt struct for the
5755 table handle; this contains the info
5756 of search_tuple, index; if search
5757 tuple contains 0 fields then we
5758 position the cursor at the start or
5759 the end of the index, depending on
5760 'mode' */
5761 ulint* n_rows) /*!< out: number of entries
5762 seen in the consistent read */
5763{
5764 dict_index_t* index = prebuilt->index;
5765 dberr_t ret = DB_SUCCESS;
5766 mtr_t mtr;
5767 mem_heap_t* heap;
5768 dtuple_t* entry;
5769 dtuple_t* search_entry = prebuilt->search_tuple;
5770 ulint entry_len;
5771 ulint i;
5772 byte* buf;
5773
5774 ut_a(dict_index_is_spatial(index));
5775
5776 *n_rows = 0;
5777
5778 heap = mem_heap_create(256);
5779
5780 /* Build a search tuple. */
5781 entry_len = dict_index_get_n_fields(index);
5782 entry = dtuple_create(heap, entry_len);
5783
5784 for (i = 0; i < entry_len; i++) {
5785 const dict_field_t* ind_field
5786 = dict_index_get_nth_field(index, i);
5787 const dict_col_t* col
5788 = ind_field->col;
5789 dfield_t* dfield
5790 = dtuple_get_nth_field(entry, i);
5791
5792 if (i == 0) {
5793 double* mbr;
5794 double tmp_mbr[SPDIMS * 2];
5795
5796 dfield->type.mtype = DATA_GEOMETRY;
5797 dfield->type.prtype |= DATA_GIS_MBR;
5798
5799 /* Allocate memory for mbr field */
5800 mbr = static_cast<double*>
5801 (mem_heap_alloc(heap, DATA_MBR_LEN));
5802
5803 /* Set mbr field data. */
5804 dfield_set_data(dfield, mbr, DATA_MBR_LEN);
5805
5806 for (uint j = 0; j < SPDIMS; j++) {
5807 tmp_mbr[j * 2] = DBL_MAX;
5808 tmp_mbr[j * 2 + 1] = -DBL_MAX;
5809 }
5810 dfield_write_mbr(dfield, tmp_mbr);
5811 continue;
5812 }
5813
5814 dfield->type.mtype = col->mtype;
5815 dfield->type.prtype = col->prtype;
5816
5817 }
5818
5819 prebuilt->search_tuple = entry;
5820
5821 ulint bufsize = std::max<ulint>(srv_page_size,
5822 prebuilt->mysql_row_len);
5823 buf = static_cast<byte*>(ut_malloc_nokey(bufsize));
5824
5825 ulint cnt = 1000;
5826
5827 ret = row_search_for_mysql(buf, PAGE_CUR_WITHIN, prebuilt, 0, 0);
5828loop:
5829 /* Check thd->killed every 1,000 scanned rows */
5830 if (--cnt == 0) {
5831 if (trx_is_interrupted(prebuilt->trx)) {
5832 ret = DB_INTERRUPTED;
5833 goto func_exit;
5834 }
5835 cnt = 1000;
5836 }
5837
5838 switch (ret) {
5839 case DB_SUCCESS:
5840 break;
5841 case DB_DEADLOCK:
5842 case DB_LOCK_TABLE_FULL:
5843 case DB_LOCK_WAIT_TIMEOUT:
5844 case DB_INTERRUPTED:
5845 goto func_exit;
5846 default:
5847 /* fall through (this error is ignored by CHECK TABLE) */
5848 case DB_END_OF_INDEX:
5849 ret = DB_SUCCESS;
5850func_exit:
5851 prebuilt->search_tuple = search_entry;
5852 ut_free(buf);
5853 mem_heap_free(heap);
5854
5855 return(ret);
5856 }
5857
5858 *n_rows = *n_rows + 1;
5859
5860 ret = row_search_for_mysql(
5861 buf, PAGE_CUR_WITHIN, prebuilt, 0, ROW_SEL_NEXT);
5862
5863 goto loop;
5864}
5865
5866/*******************************************************************//**
5867Checks if MySQL at the moment is allowed for this table to retrieve a
5868consistent read result, or store it to the query cache.
5869@return whether storing or retrieving from the query cache is permitted */
5870bool
5871row_search_check_if_query_cache_permitted(
5872/*======================================*/
5873 trx_t* trx, /*!< in: transaction object */
5874 const char* norm_name) /*!< in: concatenation of database name,
5875 '/' char, table name */
5876{
5877 dict_table_t* table = dict_table_open_on_name(
5878 norm_name, FALSE, FALSE, DICT_ERR_IGNORE_NONE);
5879
5880 if (table == NULL) {
5881
5882 return(false);
5883 }
5884
5885 /* Start the transaction if it is not started yet */
5886
5887 trx_start_if_not_started(trx, false);
5888
5889 /* If there are locks on the table or some trx has invalidated the
5890 cache before this transaction started then this transaction cannot
5891 read/write from/to the cache.
5892
5893 If a read view has not been created for the transaction then it doesn't
5894 really matter what this transaction sees. If a read view was created
5895 then the view low_limit_id is the max trx id that this transaction
5896 saw at the time of the read view creation. */
5897
5898 const bool ret = lock_table_get_n_locks(table) == 0
5899 && ((trx->id != 0 && trx->id >= table->query_cache_inv_id)
5900 || !trx->read_view.is_open()
5901 || trx->read_view.low_limit_id()
5902 >= table->query_cache_inv_id);
5903 if (ret) {
5904 /* If the isolation level is high, assign a read view for the
5905 transaction if it does not yet have one */
5906
5907 if (trx->isolation_level >= TRX_ISO_REPEATABLE_READ) {
5908 trx->read_view.open(trx);
5909 }
5910 }
5911
5912 dict_table_close(table, FALSE, FALSE);
5913
5914 return(ret);
5915}
5916
5917/*******************************************************************//**
5918Read the AUTOINC column from the current row. If the value is less than
59190 and the type is not unsigned then we reset the value to 0.
5920@return value read from the column */
5921static
5922ib_uint64_t
5923row_search_autoinc_read_column(
5924/*===========================*/
5925 dict_index_t* index, /*!< in: index to read from */
5926 const rec_t* rec, /*!< in: current rec */
5927 ulint col_no, /*!< in: column number */
5928 ulint mtype, /*!< in: column main type */
5929 ibool unsigned_type) /*!< in: signed or unsigned flag */
5930{
5931 ulint len;
5932 const byte* data;
5933 ib_uint64_t value;
5934 mem_heap_t* heap = NULL;
5935 ulint offsets_[REC_OFFS_NORMAL_SIZE];
5936 ulint* offsets = offsets_;
5937
5938 rec_offs_init(offsets_);
5939 ut_ad(page_rec_is_leaf(rec));
5940
5941 offsets = rec_get_offsets(rec, index, offsets, true,
5942 col_no + 1, &heap);
5943
5944 if (rec_offs_nth_sql_null(offsets, col_no)) {
5945 /* There is no non-NULL value in the auto-increment column. */
5946 value = 0;
5947 goto func_exit;
5948 }
5949
5950 data = rec_get_nth_field(rec, offsets, col_no, &len);
5951
5952 value = row_parse_int(data, len, mtype, unsigned_type);
5953
5954func_exit:
5955 if (UNIV_LIKELY_NULL(heap)) {
5956 mem_heap_free(heap);
5957 }
5958
5959 return(value);
5960}
5961
5962/** Get the maximum and non-delete-marked record in an index.
5963@param[in] index index tree
5964@param[in,out] mtr mini-transaction (may be committed and restarted)
5965@return maximum record, page s-latched in mtr
5966@retval NULL if there are no records, or if all of them are delete-marked */
5967static
5968const rec_t*
5969row_search_get_max_rec(
5970 dict_index_t* index,
5971 mtr_t* mtr)
5972{
5973 btr_pcur_t pcur;
5974 const rec_t* rec;
5975 /* Open at the high/right end (false), and init cursor */
5976 btr_pcur_open_at_index_side(
5977 false, index, BTR_SEARCH_LEAF, &pcur, true, 0, mtr);
5978
5979 do {
5980 const page_t* page;
5981
5982 page = btr_pcur_get_page(&pcur);
5983 rec = page_find_rec_max_not_deleted(page);
5984
5985 if (page_rec_is_user_rec(rec)) {
5986 break;
5987 } else {
5988 rec = NULL;
5989 }
5990 btr_pcur_move_before_first_on_page(&pcur);
5991 } while (btr_pcur_move_to_prev(&pcur, mtr));
5992
5993 btr_pcur_close(&pcur);
5994
5995 ut_ad(!rec
5996 || !(rec_get_info_bits(rec, dict_table_is_comp(index->table))
5997 & (REC_INFO_MIN_REC_FLAG | REC_INFO_DELETED_FLAG)));
5998 return(rec);
5999}
6000
6001/** Read the max AUTOINC value from an index.
6002@param[in] index index starting with an AUTO_INCREMENT column
6003@return the largest AUTO_INCREMENT value
6004@retval 0 if no records were found */
6005ib_uint64_t
6006row_search_max_autoinc(dict_index_t* index)
6007{
6008 const dict_field_t* dfield = dict_index_get_nth_field(index, 0);
6009
6010 ib_uint64_t value = 0;
6011
6012 mtr_t mtr;
6013 mtr.start();
6014
6015 if (const rec_t* rec = row_search_get_max_rec(index, &mtr)) {
6016 value = row_search_autoinc_read_column(
6017 index, rec, 0,
6018 dfield->col->mtype,
6019 dfield->col->prtype & DATA_UNSIGNED);
6020 }
6021
6022 mtr.commit();
6023 return(value);
6024}
6025