row0undo.cc source code [MariaDB/storage/innobase/row/row0undo.cc]

1	/*****************************************************************************
2
3	Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
4	Copyright (c) 2017, MariaDB Corporation.
5
6	This program is free software; you can redistribute it and/or modify it under
7	the terms of the GNU General Public License as published by the Free Software
8	Foundation; version 2 of the License.
9
10	This program is distributed in the hope that it will be useful, but WITHOUT
11	ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12	FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13
14	You should have received a copy of the GNU General Public License along with
15	this program; if not, write to the Free Software Foundation, Inc.,
16	51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
17
18	*****************************************************************************/
19
20	/************************************************//**
21	@file row/row0undo.cc
22	Row undo
23
24	Created 1/8/1997 Heikki Tuuri
25	*******************************************************/
26
27	#include "ha_prototypes.h"
28
29	#include "row0undo.h"
30	#include "fsp0fsp.h"
31	#include "mach0data.h"
32	#include "trx0rseg.h"
33	#include "trx0trx.h"
34	#include "trx0roll.h"
35	#include "trx0undo.h"
36	#include "trx0purge.h"
37	#include "trx0rec.h"
38	#include "que0que.h"
39	#include "row0row.h"
40	#include "row0uins.h"
41	#include "row0umod.h"
42	#include "row0upd.h"
43	#include "row0mysql.h"
44	#include "srv0srv.h"
45	#include "srv0start.h"
46
47	/ How to undo row operations?*
48	(1) For an insert, we have stored a prefix of the clustered index record
49	in the undo log. Using it, we look for the clustered record, and using
50	that we look for the records in the secondary indexes. The insert operation
51	may have been left incomplete, if the database crashed, for example.
52	We may have look at the trx id and roll ptr to make sure the record in the
53	clustered index is really the one for which the undo log record was
54	written. We can use the framework we get from the original insert op.
55	(2) Delete marking: We can use the framework we get from the original
56	delete mark op. We only have to check the trx id.
57	(3) Update: This may be the most complicated. We have to use the framework
58	we get from the original update op.
59
60	What if the same trx repeatedly deletes and inserts an identical row.
61	Then the row id changes and also roll ptr. What if the row id was not
62	part of the ordering fields in the clustered index? Maybe we have to write
63	it to undo log. Well, maybe not, because if we order the row id and trx id
64	in descending order, then the only undeleted copy is the first in the
65	index. Our searches in row operations always position the cursor before
66	the first record in the result set. But, if there is no key defined for
67	a table, then it would be desirable that row id is in ascending order.
68	So, lets store row id in descending order only if it is not an ordering
69	field in the clustered index.
70
71	NOTE: Deletes and inserts may lead to situation where there are identical
72	records in a secondary index. Is that a problem in the B-tree? Yes.
73	Also updates can lead to this, unless trx id and roll ptr are included in
74	ord fields.
75	(1) Fix in clustered indexes: include row id, trx id, and roll ptr
76	in node pointers of B-tree.
77	(2) Fix in secondary indexes: include all fields in node pointers, and
78	if an entry is inserted, check if it is equal to the right neighbor,
79	in which case update the right neighbor: the neighbor must be delete
80	marked, set it unmarked and write the trx id of the current transaction.
81
82	What if the same trx repeatedly updates the same row, updating a secondary
83	index field or not? Updating a clustered index ordering field?
84
85	(1) If it does not update the secondary index and not the clustered index
86	ord field. Then the secondary index record stays unchanged, but the
87	trx id in the secondary index record may be smaller than in the clustered
88	index record. This is no problem?
89	(2) If it updates secondary index ord field but not clustered: then in
90	secondary index there are delete marked records, which differ in an
91	ord field. No problem.
92	(3) Updates clustered ord field but not secondary, and secondary index
93	is unique. Then the record in secondary index is just updated at the
94	clustered ord field.
95	(4)
96
97	Problem with duplicate records:
98	Fix 1: Add a trx op no field to all indexes. A problem: if a trx with a
99	bigger trx id has inserted and delete marked a similar row, our trx inserts
100	again a similar row, and a trx with an even bigger id delete marks it. Then
101	the position of the row should change in the index if the trx id affects
102	the alphabetical ordering.
103
104	Fix 2: If an insert encounters a similar row marked deleted, we turn the
105	insert into an 'update' of the row marked deleted. Then we must write undo
106	info on the update. A problem: what if a purge operation tries to remove
107	the delete marked row?
108
109	We can think of the database row versions as a linked list which starts
110	from the record in the clustered index, and is linked by roll ptrs
111	through undo logs. The secondary index records are references which tell
112	what kinds of records can be found in this linked list for a record
113	in the clustered index.
114
115	How to do the purge? A record can be removed from the clustered index
116	if its linked list becomes empty, i.e., the row has been marked deleted
117	and its roll ptr points to the record in the undo log we are going through,
118	doing the purge. Similarly, during a rollback, a record can be removed
119	if the stored roll ptr in the undo log points to a trx already (being) purged,
120	or if the roll ptr is NULL, i.e., it was a fresh insert. /*
121
122	/******************************************************************//**
123	Creates a row undo node to a query graph.
124	@return own: undo node /*
125	undo_node_t*
126	row_undo_node_create(
127	/=================/
128	trx_t* trx, /!< in/out: transaction /
129	que_thr_t* parent, /!< in: parent node, i.e., a thr node /
130	mem_heap_t* heap) /!< in: memory heap where created /
131	{
132	undo_node_t* undo;
133
134	ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE)
135	\|\| trx_state_eq(trx, TRX_STATE_PREPARED));
136	ut_ad(parent);
137
138	undo = static_cast<undo_node_t*>(
139	mem_heap_alloc(heap, sizeof(undo_node_t)));
140
141	undo->common.type = QUE_NODE_UNDO;
142	undo->common.parent = parent;
143
144	undo->state = UNDO_NODE_FETCH_NEXT;
145	undo->trx = trx;
146
147	btr_pcur_init(&(undo->pcur));
148
149	undo->heap = mem_heap_create(`256`);
150
151	return(undo);
152	}
153
154	/*********************************************************//**
155	Looks for the clustered index record when node has the row reference.
156	The pcur in node is used in the search. If found, stores the row to node,
157	and stores the position of pcur, and detaches it. The pcur must be closed
158	by the caller in any case.
159	@return true if found; NOTE the node->pcur must be closed by the
160	caller, regardless of the return value /*
161	bool
162	row_undo_search_clust_to_pcur(
163	/==========================/
164	undo_node_t* node) /!< in/out: row undo node /
165	{
166	dict_index_t* clust_index;
167	bool found;
168	mtr_t mtr;
169	row_ext_t** ext;
170	const rec_t* rec;
171	mem_heap_t* heap = NULL;
172	ulint offsets_[REC_OFFS_NORMAL_SIZE];
173	ulint* offsets = offsets_;
174	rec_offs_init(offsets_);
175
176	ut_ad(!node->table->skip_alter_undo);
177
178	mtr_start(&mtr);
179
180	clust_index = dict_table_get_first_index(node->table);
181
182	found = row_search_on_row_ref(&node->pcur, BTR_MODIFY_LEAF,
183	node->table, node->ref, &mtr);
184
185	if (!found) {
186	goto func_exit;
187	}
188
189	rec = btr_pcur_get_rec(&node->pcur);
190
191	offsets = rec_get_offsets(rec, clust_index, offsets, true,
192	ULINT_UNDEFINED, &heap);
193
194	found = row_get_rec_roll_ptr(rec, clust_index, offsets)
195	== node->roll_ptr;
196
197	if (found) {
198	ut_ad(row_get_rec_trx_id(rec, clust_index, offsets)
199	== node->trx->id);
200
201	if (dict_table_has_atomic_blobs(node->table)) {
202	/ There is no prefix of externally stored*
203	columns in the clustered index record. Build a
204	cache of column prefixes. /*
205	ext = &node->ext;
206	} else {
207	/ REDUNDANT and COMPACT formats store a local*
208	768-byte prefix of each externally stored
209	column. No cache is needed. /*
210	ext = NULL;
211	node->ext = NULL;
212	}
213
214	node->row = row_build(ROW_COPY_DATA, clust_index, rec,
215	offsets, NULL,
216	NULL, NULL, ext, node->heap);
217
218	/ We will need to parse out virtual column info from undo*
219	log, first mark them DATA_MISSING. So we will know if the
220	value gets updated /*
221	if (node->table->n_v_cols
222	&& node->state != UNDO_NODE_INSERT
223	&& !(node->cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
224	for (ulint i = `0`;
225	i < dict_table_get_n_v_cols(node->table); i++) {
226	dfield_get_type(dtuple_get_nth_v_field(
227	node->row, i))->mtype = DATA_MISSING;
228	}
229	}
230
231	if (node->rec_type == TRX_UNDO_UPD_EXIST_REC) {
232	ut_ad(node->row->info_bits == REC_INFO_MIN_REC_FLAG
233	\|\| node->row->info_bits == `0`);
234	node->undo_row = dtuple_copy(node->row, node->heap);
235	row_upd_replace(node->undo_row, &node->undo_ext,
236	clust_index, node->update, node->heap);
237	} else {
238	ut_ad((node->row->info_bits == REC_INFO_MIN_REC_FLAG)
239	== (node->rec_type == TRX_UNDO_INSERT_DEFAULT));
240	node->undo_row = NULL;
241	node->undo_ext = NULL;
242	}
243
244	btr_pcur_store_position(&node->pcur, &mtr);
245	}
246
247	if (heap) {
248	mem_heap_free(heap);
249	}
250
251	func_exit:
252	btr_pcur_commit_specify_mtr(&node->pcur, &mtr);
253	return(found);
254	}
255
256	/*********************************************************//**
257	Fetches an undo log record and does the undo for the recorded operation.
258	If none left, or a partial rollback completed, returns control to the
259	parent node, which is always a query thread node.
260	@return DB_SUCCESS if operation successfully completed, else error code /*
261	static MY_ATTRIBUTE((warn_unused_result))
262	dberr_t
263	row_undo(
264	/=====/
265	undo_node_t* node, /!< in: row undo node /
266	que_thr_t* thr) /!< in: query thread /
267	{
268	trx_t* trx = node->trx;
269	ut_ad(trx->in_rollback);
270
271	if (node->state == UNDO_NODE_FETCH_NEXT) {
272
273	node->undo_rec = trx_roll_pop_top_rec_of_trx(
274	trx, &node->roll_ptr, node->heap);
275
276	if (!node->undo_rec) {
277	/ Rollback completed for this query thread /
278	thr->run_node = que_node_get_parent(node);
279	return(DB_SUCCESS);
280	}
281
282	node->undo_no = trx_undo_rec_get_undo_no(node->undo_rec);
283	node->state = trx_undo_roll_ptr_is_insert(node->roll_ptr)
284	? UNDO_NODE_INSERT : UNDO_NODE_MODIFY;
285	}
286
287	/ Prevent DROP TABLE etc. while we are rolling back this row.*
288	If we are doing a TABLE CREATE or some other dictionary operation,
289	then we already have dict_operation_lock locked in x-mode. Do not
290	try to lock again, because that would cause a hang. /*
291
292	const bool locked_data_dict = (trx->dict_operation_lock_mode == `0`);
293
294	if (locked_data_dict) {
295
296	row_mysql_freeze_data_dictionary(trx);
297	}
298
299	dberr_t err;
300
301	if (node->state == UNDO_NODE_INSERT) {
302
303	err = row_undo_ins(node, thr);
304
305	node->state = UNDO_NODE_FETCH_NEXT;
306	} else {
307	ut_ad(node->state == UNDO_NODE_MODIFY);
308	err = row_undo_mod(node, thr);
309	}
310
311	if (locked_data_dict) {
312
313	row_mysql_unfreeze_data_dictionary(trx);
314	}
315
316	/ Do some cleanup /
317	btr_pcur_close(&(node->pcur));
318
319	mem_heap_empty(node->heap);
320
321	thr->run_node = node;
322
323	return(err);
324	}
325
326	/*********************************************************//**
327	Undoes a row operation in a table. This is a high-level function used
328	in SQL execution graphs.
329	@return query thread to run next or NULL /*
330	que_thr_t*
331	row_undo_step(
332	/==========/
333	que_thr_t* thr) /!< in: query thread /
334	{
335	dberr_t err;
336	undo_node_t* node;
337	trx_t* trx;
338
339	ut_ad(thr);
340
341	srv_inc_activity_count();
342
343	trx = thr_get_trx(thr);
344
345	node = static_cast<undo_node_t*>(thr->run_node);
346
347	ut_ad(que_node_get_type(node) == QUE_NODE_UNDO);
348
349	if (UNIV_UNLIKELY(trx_get_dict_operation(trx) == TRX_DICT_OP_NONE
350	&& !srv_undo_sources
351	&& !srv_is_being_started)
352	&& (srv_fast_shutdown == `3` \|\| trx == trx_roll_crash_recv_trx)) {
353	/ Shutdown has been initiated. /
354	trx->error_state = DB_INTERRUPTED;
355	return NULL;
356	}
357
358	if (UNIV_UNLIKELY(trx == trx_roll_crash_recv_trx)) {
359	trx_roll_report_progress();
360	}
361
362	err = row_undo(node, thr);
363
364	trx->error_state = err;
365
366	if (err != DB_SUCCESS) {
367	/ SQL error detected /
368
369	if (err == DB_OUT_OF_FILE_SPACE) {
370	ib::fatal () << "Out of tablespace during rollback."
371	" Consider increasing your tablespace.";
372	}
373
374	ib::fatal () << "Error (" << ut_strerr(err) << ") in rollback.";
375	}
376
377	return(thr);
378	}
379

Browse the source code of MariaDB/storage/innobase/row/row0undo.cc