1/*****************************************************************************
2
3Copyright (c) 1997, 2017, Oracle and/or its affiliates. All Rights Reserved.
4Copyright (c) 2017, 2018, MariaDB Corporation.
5
6This program is free software; you can redistribute it and/or modify it under
7the terms of the GNU General Public License as published by the Free Software
8Foundation; version 2 of the License.
9
10This program is distributed in the hope that it will be useful, but WITHOUT
11ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13
14You should have received a copy of the GNU General Public License along with
15this program; if not, write to the Free Software Foundation, Inc.,
1651 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
17
18*****************************************************************************/
19
20/**************************************************//**
21@file row/row0uins.cc
22Fresh insert undo
23
24Created 2/25/1997 Heikki Tuuri
25*******************************************************/
26
27#include "row0uins.h"
28#include "dict0dict.h"
29#include "dict0stats.h"
30#include "dict0boot.h"
31#include "dict0crea.h"
32#include "trx0undo.h"
33#include "trx0roll.h"
34#include "btr0btr.h"
35#include "mach0data.h"
36#include "row0undo.h"
37#include "row0vers.h"
38#include "row0log.h"
39#include "trx0trx.h"
40#include "trx0rec.h"
41#include "row0row.h"
42#include "row0upd.h"
43#include "que0que.h"
44#include "ibuf0ibuf.h"
45#include "log0log.h"
46#include "fil0fil.h"
47
/*************************************************************************
IMPORTANT NOTE: Any operation that generates redo MUST check that there
is enough space in the redo log before starting that operation. This is
done by calling log_free_check(). The reason for checking the
availability of the redo log space before the start of the operation is
that we MUST NOT hold any synchronization objects when performing the
check.
If you make a change in this module make sure that no codepath is
introduced where a call to log_free_check() is bypassed. */
57
58/***************************************************************//**
59Removes a clustered index record. The pcur in node was positioned on the
60record, now it is detached.
61@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
62static MY_ATTRIBUTE((nonnull, warn_unused_result))
63dberr_t
64row_undo_ins_remove_clust_rec(
65/*==========================*/
66 undo_node_t* node) /*!< in: undo node */
67{
68 btr_cur_t* btr_cur;
69 ibool success;
70 dberr_t err;
71 ulint n_tries = 0;
72 mtr_t mtr;
73 dict_index_t* index = node->pcur.btr_cur.index;
74 bool online;
75
76 ut_ad(dict_index_is_clust(index));
77 ut_ad(node->trx->in_rollback);
78
79 mtr.start();
80 if (index->table->is_temporary()) {
81 ut_ad(node->rec_type == TRX_UNDO_INSERT_REC);
82 mtr.set_log_mode(MTR_LOG_NO_REDO);
83 } else {
84 index->set_modified(mtr);
85 }
86
87 /* This is similar to row_undo_mod_clust(). The DDL thread may
88 already have copied this row from the log to the new table.
89 We must log the removal, so that the row will be correctly
90 purged. However, we can log the removal out of sync with the
91 B-tree modification. */
92
93 online = dict_index_is_online_ddl(index);
94 if (online) {
95 ut_ad(node->trx->dict_operation_lock_mode
96 != RW_X_LATCH);
97 ut_ad(node->table->id != DICT_INDEXES_ID);
98 mtr_s_lock(dict_index_get_lock(index), &mtr);
99 }
100
101 success = btr_pcur_restore_position(
102 online
103 ? BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED
104 : BTR_MODIFY_LEAF, &node->pcur, &mtr);
105 ut_a(success);
106
107 btr_cur = btr_pcur_get_btr_cur(&node->pcur);
108
109 ut_ad(rec_get_trx_id(btr_cur_get_rec(btr_cur), btr_cur->index)
110 == node->trx->id);
111 ut_ad(!rec_get_deleted_flag(
112 btr_cur_get_rec(btr_cur),
113 dict_table_is_comp(btr_cur->index->table)));
114
115 if (online && dict_index_is_online_ddl(index)) {
116 const rec_t* rec = btr_cur_get_rec(btr_cur);
117 mem_heap_t* heap = NULL;
118 const ulint* offsets = rec_get_offsets(
119 rec, index, NULL, true, ULINT_UNDEFINED, &heap);
120 row_log_table_delete(rec, index, offsets, NULL);
121 mem_heap_free(heap);
122 }
123
124 switch (node->table->id) {
125 case DICT_INDEXES_ID:
126 ut_ad(!online);
127 ut_ad(node->trx->dict_operation_lock_mode == RW_X_LATCH);
128 ut_ad(node->rec_type == TRX_UNDO_INSERT_REC);
129
130 dict_drop_index_tree(
131 btr_pcur_get_rec(&node->pcur), &(node->pcur), &mtr);
132
133 mtr.commit();
134
135 mtr.start();
136
137 success = btr_pcur_restore_position(
138 BTR_MODIFY_LEAF, &node->pcur, &mtr);
139 ut_a(success);
140 break;
141 case DICT_COLUMNS_ID:
142 /* This is rolling back an INSERT into SYS_COLUMNS.
143 If it was part of an instant ADD COLUMN operation, we
144 must modify the table definition. At this point, any
145 corresponding operation to the 'default row' will have
146 been rolled back. */
147 ut_ad(!online);
148 ut_ad(node->trx->dict_operation_lock_mode == RW_X_LATCH);
149 ut_ad(node->rec_type == TRX_UNDO_INSERT_REC);
150 const rec_t* rec = btr_pcur_get_rec(&node->pcur);
151 if (rec_get_n_fields_old(rec)
152 != DICT_NUM_FIELDS__SYS_COLUMNS) {
153 break;
154 }
155 ulint len;
156 const byte* data = rec_get_nth_field_old(
157 rec, DICT_FLD__SYS_COLUMNS__TABLE_ID, &len);
158 if (len != 8) {
159 break;
160 }
161 const table_id_t table_id = mach_read_from_8(data);
162 data = rec_get_nth_field_old(rec, DICT_FLD__SYS_COLUMNS__POS,
163 &len);
164 if (len != 4) {
165 break;
166 }
167 const unsigned pos = mach_read_from_4(data);
168 if (pos == 0 || pos >= (1U << 16)) {
169 break;
170 }
171 dict_table_t* table = dict_table_open_on_id(
172 table_id, true, DICT_TABLE_OP_OPEN_ONLY_IF_CACHED);
173 if (!table) {
174 break;
175 }
176
177 dict_index_t* index = dict_table_get_first_index(table);
178
179 if (index && index->is_instant()
180 && DATA_N_SYS_COLS + 1 + pos == table->n_cols) {
181 /* This is the rollback of an instant ADD COLUMN.
182 Remove the column from the dictionary cache,
183 but keep the system columns. */
184 table->rollback_instant(pos);
185 }
186
187 dict_table_close(table, true, false);
188 }
189
190 if (btr_cur_optimistic_delete(btr_cur, 0, &mtr)) {
191 err = DB_SUCCESS;
192 goto func_exit;
193 }
194
195 btr_pcur_commit_specify_mtr(&node->pcur, &mtr);
196retry:
197 /* If did not succeed, try pessimistic descent to tree */
198 mtr.start();
199 if (index->table->is_temporary()) {
200 mtr.set_log_mode(MTR_LOG_NO_REDO);
201 } else {
202 index->set_modified(mtr);
203 }
204
205 success = btr_pcur_restore_position(
206 BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE,
207 &node->pcur, &mtr);
208 ut_a(success);
209
210 btr_cur_pessimistic_delete(&err, FALSE, btr_cur, 0, true, &mtr);
211
212 /* The delete operation may fail if we have little
213 file space left: TODO: easiest to crash the database
214 and restart with more file space */
215
216 if (err == DB_OUT_OF_FILE_SPACE
217 && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) {
218
219 btr_pcur_commit_specify_mtr(&(node->pcur), &mtr);
220
221 n_tries++;
222
223 os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
224
225 goto retry;
226 }
227
228func_exit:
229 btr_pcur_commit_specify_mtr(&node->pcur, &mtr);
230 if (err == DB_SUCCESS && node->rec_type == TRX_UNDO_INSERT_DEFAULT) {
231 /* When rolling back the very first instant ADD COLUMN
232 operation, reset the root page to the basic state. */
233 ut_ad(!index->table->is_temporary());
234 mtr.start();
235 if (page_t* root = btr_root_get(index, &mtr)) {
236 byte* page_type = root + FIL_PAGE_TYPE;
237 ut_ad(mach_read_from_2(page_type)
238 == FIL_PAGE_TYPE_INSTANT
239 || mach_read_from_2(page_type)
240 == FIL_PAGE_INDEX);
241 index->set_modified(mtr);
242 mlog_write_ulint(page_type, FIL_PAGE_INDEX,
243 MLOG_2BYTES, &mtr);
244 byte* instant = PAGE_INSTANT + PAGE_HEADER + root;
245 mlog_write_ulint(instant,
246 page_ptr_get_direction(instant + 1),
247 MLOG_2BYTES, &mtr);
248 }
249 mtr.commit();
250 }
251
252 return(err);
253}
254
255/***************************************************************//**
256Removes a secondary index entry if found.
257@return DB_SUCCESS, DB_FAIL, or DB_OUT_OF_FILE_SPACE */
258static MY_ATTRIBUTE((nonnull, warn_unused_result))
259dberr_t
260row_undo_ins_remove_sec_low(
261/*========================*/
262 ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
263 depending on whether we wish optimistic or
264 pessimistic descent down the index tree */
265 dict_index_t* index, /*!< in: index */
266 dtuple_t* entry, /*!< in: index entry to remove */
267 que_thr_t* thr) /*!< in: query thread */
268{
269 btr_pcur_t pcur;
270 btr_cur_t* btr_cur;
271 dberr_t err = DB_SUCCESS;
272 mtr_t mtr;
273 enum row_search_result search_result;
274 const bool modify_leaf = mode == BTR_MODIFY_LEAF;
275
276 memset(&pcur, 0, sizeof(pcur));
277
278 row_mtr_start(&mtr, index, !modify_leaf);
279
280 if (modify_leaf) {
281 mode = BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED;
282 mtr_s_lock(dict_index_get_lock(index), &mtr);
283 } else {
284 ut_ad(mode == (BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE));
285 mtr_sx_lock(dict_index_get_lock(index), &mtr);
286 }
287
288 if (row_log_online_op_try(index, entry, 0)) {
289 goto func_exit_no_pcur;
290 }
291
292 if (dict_index_is_spatial(index)) {
293 if (modify_leaf) {
294 mode |= BTR_RTREE_DELETE_MARK;
295 }
296 btr_pcur_get_btr_cur(&pcur)->thr = thr;
297 mode |= BTR_RTREE_UNDO_INS;
298 }
299
300 search_result = row_search_index_entry(index, entry, mode,
301 &pcur, &mtr);
302
303 switch (search_result) {
304 case ROW_NOT_FOUND:
305 goto func_exit;
306 case ROW_FOUND:
307 if (dict_index_is_spatial(index)
308 && rec_get_deleted_flag(
309 btr_pcur_get_rec(&pcur),
310 dict_table_is_comp(index->table))) {
311 ib::error() << "Record found in index " << index->name
312 << " is deleted marked on insert rollback.";
313 ut_ad(0);
314 }
315 break;
316
317 case ROW_BUFFERED:
318 case ROW_NOT_DELETED_REF:
319 /* These are invalid outcomes, because the mode passed
320 to row_search_index_entry() did not include any of the
321 flags BTR_INSERT, BTR_DELETE, or BTR_DELETE_MARK. */
322 ut_error;
323 }
324
325 btr_cur = btr_pcur_get_btr_cur(&pcur);
326
327 if (modify_leaf) {
328 err = btr_cur_optimistic_delete(btr_cur, 0, &mtr)
329 ? DB_SUCCESS : DB_FAIL;
330 } else {
331 /* Passing rollback=false here, because we are
332 deleting a secondary index record: the distinction
333 only matters when deleting a record that contains
334 externally stored columns. */
335 ut_ad(!dict_index_is_clust(index));
336 btr_cur_pessimistic_delete(&err, FALSE, btr_cur, 0,
337 false, &mtr);
338 }
339func_exit:
340 btr_pcur_close(&pcur);
341func_exit_no_pcur:
342 mtr_commit(&mtr);
343
344 return(err);
345}
346
347/***************************************************************//**
348Removes a secondary index entry from the index if found. Tries first
349optimistic, then pessimistic descent down the tree.
350@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
351static MY_ATTRIBUTE((nonnull, warn_unused_result))
352dberr_t
353row_undo_ins_remove_sec(
354/*====================*/
355 dict_index_t* index, /*!< in: index */
356 dtuple_t* entry, /*!< in: index entry to insert */
357 que_thr_t* thr) /*!< in: query thread */
358{
359 dberr_t err;
360 ulint n_tries = 0;
361
362 /* Try first optimistic descent to the B-tree */
363
364 err = row_undo_ins_remove_sec_low(BTR_MODIFY_LEAF, index, entry, thr);
365
366 if (err == DB_SUCCESS) {
367
368 return(err);
369 }
370
371 /* Try then pessimistic descent to the B-tree */
372retry:
373 err = row_undo_ins_remove_sec_low(
374 BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE,
375 index, entry, thr);
376
377 /* The delete operation may fail if we have little
378 file space left: TODO: easiest to crash the database
379 and restart with more file space */
380
381 if (err != DB_SUCCESS && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) {
382
383 n_tries++;
384
385 os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
386
387 goto retry;
388 }
389
390 return(err);
391}
392
393/***********************************************************//**
394Parses the row reference and other info in a fresh insert undo record. */
395static
396void
397row_undo_ins_parse_undo_rec(
398/*========================*/
399 undo_node_t* node, /*!< in/out: row undo node */
400 ibool dict_locked) /*!< in: TRUE if own dict_sys->mutex */
401{
402 dict_index_t* clust_index;
403 byte* ptr;
404 undo_no_t undo_no;
405 table_id_t table_id;
406 ulint dummy;
407 bool dummy_extern;
408
409 ut_ad(node);
410
411 ptr = trx_undo_rec_get_pars(node->undo_rec, &node->rec_type, &dummy,
412 &dummy_extern, &undo_no, &table_id);
413
414 node->update = NULL;
415 node->table = dict_table_open_on_id(
416 table_id, dict_locked, DICT_TABLE_OP_NORMAL);
417
418 /* Skip the UNDO if we can't find the table or the .ibd file. */
419 if (UNIV_UNLIKELY(node->table == NULL)) {
420 return;
421 }
422
423 switch (node->rec_type) {
424 default:
425 ut_ad(!"wrong undo record type");
426 goto close_table;
427 case TRX_UNDO_INSERT_DEFAULT:
428 case TRX_UNDO_INSERT_REC:
429 break;
430 case TRX_UNDO_RENAME_TABLE:
431 dict_table_t* table = node->table;
432 ut_ad(!table->is_temporary());
433 ut_ad(dict_table_is_file_per_table(table)
434 == !is_system_tablespace(table->space->id));
435 size_t len = mach_read_from_2(node->undo_rec)
436 + size_t(node->undo_rec - ptr) - 2;
437 ptr[len] = 0;
438 const char* name = reinterpret_cast<char*>(ptr);
439 if (strcmp(table->name.m_name, name)) {
440 dict_table_rename_in_cache(table, name, false);
441 }
442 goto close_table;
443 }
444
445 if (UNIV_UNLIKELY(!fil_table_accessible(node->table))) {
446close_table:
447 /* Normally, tables should not disappear or become
448 unaccessible during ROLLBACK, because they should be
449 protected by InnoDB table locks. TRUNCATE TABLE
450 or table corruption could be valid exceptions.
451
452 FIXME: When running out of temporary tablespace, it
453 would probably be better to just drop all temporary
454 tables (and temporary undo log records) of the current
455 connection, instead of doing this rollback. */
456 dict_table_close(node->table, dict_locked, FALSE);
457 node->table = NULL;
458 } else {
459 ut_ad(!node->table->skip_alter_undo);
460 clust_index = dict_table_get_first_index(node->table);
461
462 if (clust_index != NULL) {
463 if (node->rec_type == TRX_UNDO_INSERT_REC) {
464 ptr = trx_undo_rec_get_row_ref(
465 ptr, clust_index, &node->ref,
466 node->heap);
467 } else {
468 node->ref = &trx_undo_default_rec;
469 }
470
471 if (!row_undo_search_clust_to_pcur(node)) {
472 /* An error probably occurred during
473 an insert into the clustered index,
474 after we wrote the undo log record. */
475 goto close_table;
476 }
477 if (node->table->n_v_cols) {
478 trx_undo_read_v_cols(node->table, ptr,
479 node->row, false);
480 }
481
482 } else {
483 ib::warn() << "Table " << node->table->name
484 << " has no indexes,"
485 " ignoring the table";
486 goto close_table;
487 }
488 }
489}
490
491/***************************************************************//**
492Removes secondary index records.
493@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
494static MY_ATTRIBUTE((nonnull, warn_unused_result))
495dberr_t
496row_undo_ins_remove_sec_rec(
497/*========================*/
498 undo_node_t* node, /*!< in/out: row undo node */
499 que_thr_t* thr) /*!< in: query thread */
500{
501 dberr_t err = DB_SUCCESS;
502 dict_index_t* index = node->index;
503 mem_heap_t* heap;
504
505 heap = mem_heap_create(1024);
506
507 while (index != NULL) {
508 dtuple_t* entry;
509
510 if (index->type & DICT_FTS) {
511 dict_table_next_uncorrupted_index(index);
512 continue;
513 }
514
515 /* An insert undo record TRX_UNDO_INSERT_REC will
516 always contain all fields of the index. It does not
517 matter if any indexes were created afterwards; all
518 index entries can be reconstructed from the row. */
519 entry = row_build_index_entry(
520 node->row, node->ext, index, heap);
521 if (UNIV_UNLIKELY(!entry)) {
522 /* The database must have crashed after
523 inserting a clustered index record but before
524 writing all the externally stored columns of
525 that record, or a statement is being rolled
526 back because an error occurred while storing
527 off-page columns.
528
529 Because secondary index entries are inserted
530 after the clustered index record, we may
531 assume that the secondary index record does
532 not exist. */
533 } else {
534 err = row_undo_ins_remove_sec(index, entry, thr);
535
536 if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
537 goto func_exit;
538 }
539 }
540
541 mem_heap_empty(heap);
542 dict_table_next_uncorrupted_index(index);
543 }
544
545func_exit:
546 node->index = index;
547 mem_heap_free(heap);
548 return(err);
549}
550
551/***********************************************************//**
552Undoes a fresh insert of a row to a table. A fresh insert means that
553the same clustered index unique key did not have any record, even delete
554marked, at the time of the insert. InnoDB is eager in a rollback:
555if it figures out that an index record will be removed in the purge
556anyway, it will remove it in the rollback.
557@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
558dberr_t
559row_undo_ins(
560/*=========*/
561 undo_node_t* node, /*!< in: row undo node */
562 que_thr_t* thr) /*!< in: query thread */
563{
564 dberr_t err;
565 ibool dict_locked;
566
567 ut_ad(node->state == UNDO_NODE_INSERT);
568 ut_ad(node->trx->in_rollback);
569 ut_ad(trx_undo_roll_ptr_is_insert(node->roll_ptr));
570
571 dict_locked = node->trx->dict_operation_lock_mode == RW_X_LATCH;
572
573 row_undo_ins_parse_undo_rec(node, dict_locked);
574
575 if (node->table == NULL) {
576 return(DB_SUCCESS);
577 }
578
579 /* Iterate over all the indexes and undo the insert.*/
580
581 node->index = dict_table_get_first_index(node->table);
582 ut_ad(dict_index_is_clust(node->index));
583
584 switch (node->rec_type) {
585 default:
586 ut_ad(!"wrong undo record type");
587 case TRX_UNDO_INSERT_REC:
588 /* Skip the clustered index (the first index) */
589 node->index = dict_table_get_next_index(node->index);
590
591 dict_table_skip_corrupt_index(node->index);
592
593 err = row_undo_ins_remove_sec_rec(node, thr);
594
595 if (err != DB_SUCCESS) {
596 break;
597 }
598
599 /* fall through */
600 case TRX_UNDO_INSERT_DEFAULT:
601 log_free_check();
602
603 if (node->table->id == DICT_INDEXES_ID) {
604 ut_ad(node->rec_type == TRX_UNDO_INSERT_REC);
605
606 if (!dict_locked) {
607 mutex_enter(&dict_sys->mutex);
608 }
609 }
610
611 // FIXME: We need to update the dict_index_t::space and
612 // page number fields too.
613 err = row_undo_ins_remove_clust_rec(node);
614
615 if (node->table->id == DICT_INDEXES_ID
616 && !dict_locked) {
617
618 mutex_exit(&dict_sys->mutex);
619 }
620
621 if (err == DB_SUCCESS && node->table->stat_initialized) {
622 /* Not protected by dict_table_stats_lock() for
623 performance reasons, we would rather get garbage
624 in stat_n_rows (which is just an estimate anyway)
625 than protecting the following code with a latch. */
626 dict_table_n_rows_dec(node->table);
627
628 /* Do not attempt to update statistics when
629 executing ROLLBACK in the InnoDB SQL
630 interpreter, because in that case we would
631 already be holding dict_sys->mutex, which
632 would be acquired when updating statistics. */
633 if (!dict_locked) {
634 dict_stats_update_if_needed(node->table);
635 }
636 }
637 }
638
639 dict_table_close(node->table, dict_locked, FALSE);
640
641 node->table = NULL;
642
643 return(err);
644}
645