1/*****************************************************************************
2
3Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
4Copyright (c) 2017, MariaDB Corporation.
5
6This program is free software; you can redistribute it and/or modify it under
7the terms of the GNU General Public License as published by the Free Software
8Foundation; version 2 of the License.
9
10This program is distributed in the hope that it will be useful, but WITHOUT
11ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13
14You should have received a copy of the GNU General Public License along with
15this program; if not, write to the Free Software Foundation, Inc.,
1651 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
17
18*****************************************************************************/
19
20/**************************************************//**
21@file row/row0undo.cc
22Row undo
23
24Created 1/8/1997 Heikki Tuuri
25*******************************************************/
26
27#include "ha_prototypes.h"
28
29#include "row0undo.h"
30#include "fsp0fsp.h"
31#include "mach0data.h"
32#include "trx0rseg.h"
33#include "trx0trx.h"
34#include "trx0roll.h"
35#include "trx0undo.h"
36#include "trx0purge.h"
37#include "trx0rec.h"
38#include "que0que.h"
39#include "row0row.h"
40#include "row0uins.h"
41#include "row0umod.h"
42#include "row0upd.h"
43#include "row0mysql.h"
44#include "srv0srv.h"
45#include "srv0start.h"
46
47/* How to undo row operations?
48(1) For an insert, we have stored a prefix of the clustered index record
49in the undo log. Using it, we look for the clustered record, and using
50that we look for the records in the secondary indexes. The insert operation
51may have been left incomplete, if the database crashed, for example.
52We may have look at the trx id and roll ptr to make sure the record in the
53clustered index is really the one for which the undo log record was
54written. We can use the framework we get from the original insert op.
55(2) Delete marking: We can use the framework we get from the original
56delete mark op. We only have to check the trx id.
57(3) Update: This may be the most complicated. We have to use the framework
58we get from the original update op.
59
60What if the same trx repeatedly deletes and inserts an identical row.
61Then the row id changes and also roll ptr. What if the row id was not
62part of the ordering fields in the clustered index? Maybe we have to write
63it to undo log. Well, maybe not, because if we order the row id and trx id
64in descending order, then the only undeleted copy is the first in the
65index. Our searches in row operations always position the cursor before
66the first record in the result set. But, if there is no key defined for
67a table, then it would be desirable that row id is in ascending order.
68So, lets store row id in descending order only if it is not an ordering
69field in the clustered index.
70
71NOTE: Deletes and inserts may lead to situation where there are identical
72records in a secondary index. Is that a problem in the B-tree? Yes.
73Also updates can lead to this, unless trx id and roll ptr are included in
74ord fields.
75(1) Fix in clustered indexes: include row id, trx id, and roll ptr
76in node pointers of B-tree.
77(2) Fix in secondary indexes: include all fields in node pointers, and
78if an entry is inserted, check if it is equal to the right neighbor,
79in which case update the right neighbor: the neighbor must be delete
80marked, set it unmarked and write the trx id of the current transaction.
81
82What if the same trx repeatedly updates the same row, updating a secondary
83index field or not? Updating a clustered index ordering field?
84
85(1) If it does not update the secondary index and not the clustered index
86ord field. Then the secondary index record stays unchanged, but the
87trx id in the secondary index record may be smaller than in the clustered
88index record. This is no problem?
89(2) If it updates secondary index ord field but not clustered: then in
90secondary index there are delete marked records, which differ in an
91ord field. No problem.
92(3) Updates clustered ord field but not secondary, and secondary index
93is unique. Then the record in secondary index is just updated at the
94clustered ord field.
95(4)
96
97Problem with duplicate records:
98Fix 1: Add a trx op no field to all indexes. A problem: if a trx with a
99bigger trx id has inserted and delete marked a similar row, our trx inserts
100again a similar row, and a trx with an even bigger id delete marks it. Then
101the position of the row should change in the index if the trx id affects
102the alphabetical ordering.
103
104Fix 2: If an insert encounters a similar row marked deleted, we turn the
105insert into an 'update' of the row marked deleted. Then we must write undo
106info on the update. A problem: what if a purge operation tries to remove
107the delete marked row?
108
109We can think of the database row versions as a linked list which starts
110from the record in the clustered index, and is linked by roll ptrs
111through undo logs. The secondary index records are references which tell
112what kinds of records can be found in this linked list for a record
113in the clustered index.
114
115How to do the purge? A record can be removed from the clustered index
116if its linked list becomes empty, i.e., the row has been marked deleted
117and its roll ptr points to the record in the undo log we are going through,
118doing the purge. Similarly, during a rollback, a record can be removed
119if the stored roll ptr in the undo log points to a trx already (being) purged,
120or if the roll ptr is NULL, i.e., it was a fresh insert. */
121
122/********************************************************************//**
123Creates a row undo node to a query graph.
124@return own: undo node */
125undo_node_t*
126row_undo_node_create(
127/*=================*/
128 trx_t* trx, /*!< in/out: transaction */
129 que_thr_t* parent, /*!< in: parent node, i.e., a thr node */
130 mem_heap_t* heap) /*!< in: memory heap where created */
131{
132 undo_node_t* undo;
133
134 ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE)
135 || trx_state_eq(trx, TRX_STATE_PREPARED));
136 ut_ad(parent);
137
138 undo = static_cast<undo_node_t*>(
139 mem_heap_alloc(heap, sizeof(undo_node_t)));
140
141 undo->common.type = QUE_NODE_UNDO;
142 undo->common.parent = parent;
143
144 undo->state = UNDO_NODE_FETCH_NEXT;
145 undo->trx = trx;
146
147 btr_pcur_init(&(undo->pcur));
148
149 undo->heap = mem_heap_create(256);
150
151 return(undo);
152}
153
154/***********************************************************//**
155Looks for the clustered index record when node has the row reference.
156The pcur in node is used in the search. If found, stores the row to node,
157and stores the position of pcur, and detaches it. The pcur must be closed
158by the caller in any case.
159@return true if found; NOTE the node->pcur must be closed by the
160caller, regardless of the return value */
161bool
162row_undo_search_clust_to_pcur(
163/*==========================*/
164 undo_node_t* node) /*!< in/out: row undo node */
165{
166 dict_index_t* clust_index;
167 bool found;
168 mtr_t mtr;
169 row_ext_t** ext;
170 const rec_t* rec;
171 mem_heap_t* heap = NULL;
172 ulint offsets_[REC_OFFS_NORMAL_SIZE];
173 ulint* offsets = offsets_;
174 rec_offs_init(offsets_);
175
176 ut_ad(!node->table->skip_alter_undo);
177
178 mtr_start(&mtr);
179
180 clust_index = dict_table_get_first_index(node->table);
181
182 found = row_search_on_row_ref(&node->pcur, BTR_MODIFY_LEAF,
183 node->table, node->ref, &mtr);
184
185 if (!found) {
186 goto func_exit;
187 }
188
189 rec = btr_pcur_get_rec(&node->pcur);
190
191 offsets = rec_get_offsets(rec, clust_index, offsets, true,
192 ULINT_UNDEFINED, &heap);
193
194 found = row_get_rec_roll_ptr(rec, clust_index, offsets)
195 == node->roll_ptr;
196
197 if (found) {
198 ut_ad(row_get_rec_trx_id(rec, clust_index, offsets)
199 == node->trx->id);
200
201 if (dict_table_has_atomic_blobs(node->table)) {
202 /* There is no prefix of externally stored
203 columns in the clustered index record. Build a
204 cache of column prefixes. */
205 ext = &node->ext;
206 } else {
207 /* REDUNDANT and COMPACT formats store a local
208 768-byte prefix of each externally stored
209 column. No cache is needed. */
210 ext = NULL;
211 node->ext = NULL;
212 }
213
214 node->row = row_build(ROW_COPY_DATA, clust_index, rec,
215 offsets, NULL,
216 NULL, NULL, ext, node->heap);
217
218 /* We will need to parse out virtual column info from undo
219 log, first mark them DATA_MISSING. So we will know if the
220 value gets updated */
221 if (node->table->n_v_cols
222 && node->state != UNDO_NODE_INSERT
223 && !(node->cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
224 for (ulint i = 0;
225 i < dict_table_get_n_v_cols(node->table); i++) {
226 dfield_get_type(dtuple_get_nth_v_field(
227 node->row, i))->mtype = DATA_MISSING;
228 }
229 }
230
231 if (node->rec_type == TRX_UNDO_UPD_EXIST_REC) {
232 ut_ad(node->row->info_bits == REC_INFO_MIN_REC_FLAG
233 || node->row->info_bits == 0);
234 node->undo_row = dtuple_copy(node->row, node->heap);
235 row_upd_replace(node->undo_row, &node->undo_ext,
236 clust_index, node->update, node->heap);
237 } else {
238 ut_ad((node->row->info_bits == REC_INFO_MIN_REC_FLAG)
239 == (node->rec_type == TRX_UNDO_INSERT_DEFAULT));
240 node->undo_row = NULL;
241 node->undo_ext = NULL;
242 }
243
244 btr_pcur_store_position(&node->pcur, &mtr);
245 }
246
247 if (heap) {
248 mem_heap_free(heap);
249 }
250
251func_exit:
252 btr_pcur_commit_specify_mtr(&node->pcur, &mtr);
253 return(found);
254}
255
256/***********************************************************//**
257Fetches an undo log record and does the undo for the recorded operation.
258If none left, or a partial rollback completed, returns control to the
259parent node, which is always a query thread node.
260@return DB_SUCCESS if operation successfully completed, else error code */
261static MY_ATTRIBUTE((warn_unused_result))
262dberr_t
263row_undo(
264/*=====*/
265 undo_node_t* node, /*!< in: row undo node */
266 que_thr_t* thr) /*!< in: query thread */
267{
268 trx_t* trx = node->trx;
269 ut_ad(trx->in_rollback);
270
271 if (node->state == UNDO_NODE_FETCH_NEXT) {
272
273 node->undo_rec = trx_roll_pop_top_rec_of_trx(
274 trx, &node->roll_ptr, node->heap);
275
276 if (!node->undo_rec) {
277 /* Rollback completed for this query thread */
278 thr->run_node = que_node_get_parent(node);
279 return(DB_SUCCESS);
280 }
281
282 node->undo_no = trx_undo_rec_get_undo_no(node->undo_rec);
283 node->state = trx_undo_roll_ptr_is_insert(node->roll_ptr)
284 ? UNDO_NODE_INSERT : UNDO_NODE_MODIFY;
285 }
286
287 /* Prevent DROP TABLE etc. while we are rolling back this row.
288 If we are doing a TABLE CREATE or some other dictionary operation,
289 then we already have dict_operation_lock locked in x-mode. Do not
290 try to lock again, because that would cause a hang. */
291
292 const bool locked_data_dict = (trx->dict_operation_lock_mode == 0);
293
294 if (locked_data_dict) {
295
296 row_mysql_freeze_data_dictionary(trx);
297 }
298
299 dberr_t err;
300
301 if (node->state == UNDO_NODE_INSERT) {
302
303 err = row_undo_ins(node, thr);
304
305 node->state = UNDO_NODE_FETCH_NEXT;
306 } else {
307 ut_ad(node->state == UNDO_NODE_MODIFY);
308 err = row_undo_mod(node, thr);
309 }
310
311 if (locked_data_dict) {
312
313 row_mysql_unfreeze_data_dictionary(trx);
314 }
315
316 /* Do some cleanup */
317 btr_pcur_close(&(node->pcur));
318
319 mem_heap_empty(node->heap);
320
321 thr->run_node = node;
322
323 return(err);
324}
325
326/***********************************************************//**
327Undoes a row operation in a table. This is a high-level function used
328in SQL execution graphs.
329@return query thread to run next or NULL */
330que_thr_t*
331row_undo_step(
332/*==========*/
333 que_thr_t* thr) /*!< in: query thread */
334{
335 dberr_t err;
336 undo_node_t* node;
337 trx_t* trx;
338
339 ut_ad(thr);
340
341 srv_inc_activity_count();
342
343 trx = thr_get_trx(thr);
344
345 node = static_cast<undo_node_t*>(thr->run_node);
346
347 ut_ad(que_node_get_type(node) == QUE_NODE_UNDO);
348
349 if (UNIV_UNLIKELY(trx_get_dict_operation(trx) == TRX_DICT_OP_NONE
350 && !srv_undo_sources
351 && !srv_is_being_started)
352 && (srv_fast_shutdown == 3 || trx == trx_roll_crash_recv_trx)) {
353 /* Shutdown has been initiated. */
354 trx->error_state = DB_INTERRUPTED;
355 return NULL;
356 }
357
358 if (UNIV_UNLIKELY(trx == trx_roll_crash_recv_trx)) {
359 trx_roll_report_progress();
360 }
361
362 err = row_undo(node, thr);
363
364 trx->error_state = err;
365
366 if (err != DB_SUCCESS) {
367 /* SQL error detected */
368
369 if (err == DB_OUT_OF_FILE_SPACE) {
370 ib::fatal() << "Out of tablespace during rollback."
371 " Consider increasing your tablespace.";
372 }
373
374 ib::fatal() << "Error (" << ut_strerr(err) << ") in rollback.";
375 }
376
377 return(thr);
378}
379