1 | /***************************************************************************** |
2 | |
3 | Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. |
4 | Copyright (c) 2017, MariaDB Corporation. |
5 | |
6 | This program is free software; you can redistribute it and/or modify it under |
7 | the terms of the GNU General Public License as published by the Free Software |
8 | Foundation; version 2 of the License. |
9 | |
10 | This program is distributed in the hope that it will be useful, but WITHOUT |
11 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
12 | FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. |
13 | |
14 | You should have received a copy of the GNU General Public License along with |
15 | this program; if not, write to the Free Software Foundation, Inc., |
16 | 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA |
17 | |
18 | *****************************************************************************/ |
19 | |
20 | /**************************************************//** |
21 | @file row/row0undo.cc |
22 | Row undo |
23 | |
24 | Created 1/8/1997 Heikki Tuuri |
25 | *******************************************************/ |
26 | |
27 | #include "ha_prototypes.h" |
28 | |
29 | #include "row0undo.h" |
30 | #include "fsp0fsp.h" |
31 | #include "mach0data.h" |
32 | #include "trx0rseg.h" |
33 | #include "trx0trx.h" |
34 | #include "trx0roll.h" |
35 | #include "trx0undo.h" |
36 | #include "trx0purge.h" |
37 | #include "trx0rec.h" |
38 | #include "que0que.h" |
39 | #include "row0row.h" |
40 | #include "row0uins.h" |
41 | #include "row0umod.h" |
42 | #include "row0upd.h" |
43 | #include "row0mysql.h" |
44 | #include "srv0srv.h" |
45 | #include "srv0start.h" |
46 | |
47 | /* How to undo row operations? |
48 | (1) For an insert, we have stored a prefix of the clustered index record |
49 | in the undo log. Using it, we look for the clustered record, and using |
50 | that we look for the records in the secondary indexes. The insert operation |
51 | may have been left incomplete, if the database crashed, for example. |
52 | We may have look at the trx id and roll ptr to make sure the record in the |
53 | clustered index is really the one for which the undo log record was |
54 | written. We can use the framework we get from the original insert op. |
55 | (2) Delete marking: We can use the framework we get from the original |
56 | delete mark op. We only have to check the trx id. |
57 | (3) Update: This may be the most complicated. We have to use the framework |
58 | we get from the original update op. |
59 | |
60 | What if the same trx repeatedly deletes and inserts an identical row. |
61 | Then the row id changes and also roll ptr. What if the row id was not |
62 | part of the ordering fields in the clustered index? Maybe we have to write |
63 | it to undo log. Well, maybe not, because if we order the row id and trx id |
64 | in descending order, then the only undeleted copy is the first in the |
65 | index. Our searches in row operations always position the cursor before |
66 | the first record in the result set. But, if there is no key defined for |
67 | a table, then it would be desirable that row id is in ascending order. |
68 | So, lets store row id in descending order only if it is not an ordering |
69 | field in the clustered index. |
70 | |
71 | NOTE: Deletes and inserts may lead to situation where there are identical |
72 | records in a secondary index. Is that a problem in the B-tree? Yes. |
73 | Also updates can lead to this, unless trx id and roll ptr are included in |
74 | ord fields. |
75 | (1) Fix in clustered indexes: include row id, trx id, and roll ptr |
76 | in node pointers of B-tree. |
77 | (2) Fix in secondary indexes: include all fields in node pointers, and |
78 | if an entry is inserted, check if it is equal to the right neighbor, |
79 | in which case update the right neighbor: the neighbor must be delete |
80 | marked, set it unmarked and write the trx id of the current transaction. |
81 | |
82 | What if the same trx repeatedly updates the same row, updating a secondary |
83 | index field or not? Updating a clustered index ordering field? |
84 | |
85 | (1) If it does not update the secondary index and not the clustered index |
86 | ord field. Then the secondary index record stays unchanged, but the |
87 | trx id in the secondary index record may be smaller than in the clustered |
88 | index record. This is no problem? |
89 | (2) If it updates secondary index ord field but not clustered: then in |
90 | secondary index there are delete marked records, which differ in an |
91 | ord field. No problem. |
92 | (3) Updates clustered ord field but not secondary, and secondary index |
93 | is unique. Then the record in secondary index is just updated at the |
94 | clustered ord field. |
95 | (4) |
96 | |
97 | Problem with duplicate records: |
98 | Fix 1: Add a trx op no field to all indexes. A problem: if a trx with a |
99 | bigger trx id has inserted and delete marked a similar row, our trx inserts |
100 | again a similar row, and a trx with an even bigger id delete marks it. Then |
101 | the position of the row should change in the index if the trx id affects |
102 | the alphabetical ordering. |
103 | |
104 | Fix 2: If an insert encounters a similar row marked deleted, we turn the |
105 | insert into an 'update' of the row marked deleted. Then we must write undo |
106 | info on the update. A problem: what if a purge operation tries to remove |
107 | the delete marked row? |
108 | |
109 | We can think of the database row versions as a linked list which starts |
110 | from the record in the clustered index, and is linked by roll ptrs |
111 | through undo logs. The secondary index records are references which tell |
112 | what kinds of records can be found in this linked list for a record |
113 | in the clustered index. |
114 | |
115 | How to do the purge? A record can be removed from the clustered index |
116 | if its linked list becomes empty, i.e., the row has been marked deleted |
117 | and its roll ptr points to the record in the undo log we are going through, |
118 | doing the purge. Similarly, during a rollback, a record can be removed |
119 | if the stored roll ptr in the undo log points to a trx already (being) purged, |
120 | or if the roll ptr is NULL, i.e., it was a fresh insert. */ |
121 | |
122 | /********************************************************************//** |
123 | Creates a row undo node to a query graph. |
124 | @return own: undo node */ |
125 | undo_node_t* |
126 | row_undo_node_create( |
127 | /*=================*/ |
128 | trx_t* trx, /*!< in/out: transaction */ |
129 | que_thr_t* parent, /*!< in: parent node, i.e., a thr node */ |
130 | mem_heap_t* heap) /*!< in: memory heap where created */ |
131 | { |
132 | undo_node_t* undo; |
133 | |
134 | ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE) |
135 | || trx_state_eq(trx, TRX_STATE_PREPARED)); |
136 | ut_ad(parent); |
137 | |
138 | undo = static_cast<undo_node_t*>( |
139 | mem_heap_alloc(heap, sizeof(undo_node_t))); |
140 | |
141 | undo->common.type = QUE_NODE_UNDO; |
142 | undo->common.parent = parent; |
143 | |
144 | undo->state = UNDO_NODE_FETCH_NEXT; |
145 | undo->trx = trx; |
146 | |
147 | btr_pcur_init(&(undo->pcur)); |
148 | |
149 | undo->heap = mem_heap_create(256); |
150 | |
151 | return(undo); |
152 | } |
153 | |
154 | /***********************************************************//** |
155 | Looks for the clustered index record when node has the row reference. |
156 | The pcur in node is used in the search. If found, stores the row to node, |
157 | and stores the position of pcur, and detaches it. The pcur must be closed |
158 | by the caller in any case. |
159 | @return true if found; NOTE the node->pcur must be closed by the |
160 | caller, regardless of the return value */ |
161 | bool |
162 | row_undo_search_clust_to_pcur( |
163 | /*==========================*/ |
164 | undo_node_t* node) /*!< in/out: row undo node */ |
165 | { |
166 | dict_index_t* clust_index; |
167 | bool found; |
168 | mtr_t mtr; |
169 | row_ext_t** ext; |
170 | const rec_t* rec; |
171 | mem_heap_t* heap = NULL; |
172 | ulint offsets_[REC_OFFS_NORMAL_SIZE]; |
173 | ulint* offsets = offsets_; |
174 | rec_offs_init(offsets_); |
175 | |
176 | ut_ad(!node->table->skip_alter_undo); |
177 | |
178 | mtr_start(&mtr); |
179 | |
180 | clust_index = dict_table_get_first_index(node->table); |
181 | |
182 | found = row_search_on_row_ref(&node->pcur, BTR_MODIFY_LEAF, |
183 | node->table, node->ref, &mtr); |
184 | |
185 | if (!found) { |
186 | goto func_exit; |
187 | } |
188 | |
189 | rec = btr_pcur_get_rec(&node->pcur); |
190 | |
191 | offsets = rec_get_offsets(rec, clust_index, offsets, true, |
192 | ULINT_UNDEFINED, &heap); |
193 | |
194 | found = row_get_rec_roll_ptr(rec, clust_index, offsets) |
195 | == node->roll_ptr; |
196 | |
197 | if (found) { |
198 | ut_ad(row_get_rec_trx_id(rec, clust_index, offsets) |
199 | == node->trx->id); |
200 | |
201 | if (dict_table_has_atomic_blobs(node->table)) { |
202 | /* There is no prefix of externally stored |
203 | columns in the clustered index record. Build a |
204 | cache of column prefixes. */ |
205 | ext = &node->ext; |
206 | } else { |
207 | /* REDUNDANT and COMPACT formats store a local |
208 | 768-byte prefix of each externally stored |
209 | column. No cache is needed. */ |
210 | ext = NULL; |
211 | node->ext = NULL; |
212 | } |
213 | |
214 | node->row = row_build(ROW_COPY_DATA, clust_index, rec, |
215 | offsets, NULL, |
216 | NULL, NULL, ext, node->heap); |
217 | |
218 | /* We will need to parse out virtual column info from undo |
219 | log, first mark them DATA_MISSING. So we will know if the |
220 | value gets updated */ |
221 | if (node->table->n_v_cols |
222 | && node->state != UNDO_NODE_INSERT |
223 | && !(node->cmpl_info & UPD_NODE_NO_ORD_CHANGE)) { |
224 | for (ulint i = 0; |
225 | i < dict_table_get_n_v_cols(node->table); i++) { |
226 | dfield_get_type(dtuple_get_nth_v_field( |
227 | node->row, i))->mtype = DATA_MISSING; |
228 | } |
229 | } |
230 | |
231 | if (node->rec_type == TRX_UNDO_UPD_EXIST_REC) { |
232 | ut_ad(node->row->info_bits == REC_INFO_MIN_REC_FLAG |
233 | || node->row->info_bits == 0); |
234 | node->undo_row = dtuple_copy(node->row, node->heap); |
235 | row_upd_replace(node->undo_row, &node->undo_ext, |
236 | clust_index, node->update, node->heap); |
237 | } else { |
238 | ut_ad((node->row->info_bits == REC_INFO_MIN_REC_FLAG) |
239 | == (node->rec_type == TRX_UNDO_INSERT_DEFAULT)); |
240 | node->undo_row = NULL; |
241 | node->undo_ext = NULL; |
242 | } |
243 | |
244 | btr_pcur_store_position(&node->pcur, &mtr); |
245 | } |
246 | |
247 | if (heap) { |
248 | mem_heap_free(heap); |
249 | } |
250 | |
251 | func_exit: |
252 | btr_pcur_commit_specify_mtr(&node->pcur, &mtr); |
253 | return(found); |
254 | } |
255 | |
256 | /***********************************************************//** |
257 | Fetches an undo log record and does the undo for the recorded operation. |
258 | If none left, or a partial rollback completed, returns control to the |
259 | parent node, which is always a query thread node. |
260 | @return DB_SUCCESS if operation successfully completed, else error code */ |
261 | static MY_ATTRIBUTE((warn_unused_result)) |
262 | dberr_t |
263 | row_undo( |
264 | /*=====*/ |
265 | undo_node_t* node, /*!< in: row undo node */ |
266 | que_thr_t* thr) /*!< in: query thread */ |
267 | { |
268 | trx_t* trx = node->trx; |
269 | ut_ad(trx->in_rollback); |
270 | |
271 | if (node->state == UNDO_NODE_FETCH_NEXT) { |
272 | |
273 | node->undo_rec = trx_roll_pop_top_rec_of_trx( |
274 | trx, &node->roll_ptr, node->heap); |
275 | |
276 | if (!node->undo_rec) { |
277 | /* Rollback completed for this query thread */ |
278 | thr->run_node = que_node_get_parent(node); |
279 | return(DB_SUCCESS); |
280 | } |
281 | |
282 | node->undo_no = trx_undo_rec_get_undo_no(node->undo_rec); |
283 | node->state = trx_undo_roll_ptr_is_insert(node->roll_ptr) |
284 | ? UNDO_NODE_INSERT : UNDO_NODE_MODIFY; |
285 | } |
286 | |
287 | /* Prevent DROP TABLE etc. while we are rolling back this row. |
288 | If we are doing a TABLE CREATE or some other dictionary operation, |
289 | then we already have dict_operation_lock locked in x-mode. Do not |
290 | try to lock again, because that would cause a hang. */ |
291 | |
292 | const bool locked_data_dict = (trx->dict_operation_lock_mode == 0); |
293 | |
294 | if (locked_data_dict) { |
295 | |
296 | row_mysql_freeze_data_dictionary(trx); |
297 | } |
298 | |
299 | dberr_t err; |
300 | |
301 | if (node->state == UNDO_NODE_INSERT) { |
302 | |
303 | err = row_undo_ins(node, thr); |
304 | |
305 | node->state = UNDO_NODE_FETCH_NEXT; |
306 | } else { |
307 | ut_ad(node->state == UNDO_NODE_MODIFY); |
308 | err = row_undo_mod(node, thr); |
309 | } |
310 | |
311 | if (locked_data_dict) { |
312 | |
313 | row_mysql_unfreeze_data_dictionary(trx); |
314 | } |
315 | |
316 | /* Do some cleanup */ |
317 | btr_pcur_close(&(node->pcur)); |
318 | |
319 | mem_heap_empty(node->heap); |
320 | |
321 | thr->run_node = node; |
322 | |
323 | return(err); |
324 | } |
325 | |
326 | /***********************************************************//** |
327 | Undoes a row operation in a table. This is a high-level function used |
328 | in SQL execution graphs. |
329 | @return query thread to run next or NULL */ |
330 | que_thr_t* |
331 | row_undo_step( |
332 | /*==========*/ |
333 | que_thr_t* thr) /*!< in: query thread */ |
334 | { |
335 | dberr_t err; |
336 | undo_node_t* node; |
337 | trx_t* trx; |
338 | |
339 | ut_ad(thr); |
340 | |
341 | srv_inc_activity_count(); |
342 | |
343 | trx = thr_get_trx(thr); |
344 | |
345 | node = static_cast<undo_node_t*>(thr->run_node); |
346 | |
347 | ut_ad(que_node_get_type(node) == QUE_NODE_UNDO); |
348 | |
349 | if (UNIV_UNLIKELY(trx_get_dict_operation(trx) == TRX_DICT_OP_NONE |
350 | && !srv_undo_sources |
351 | && !srv_is_being_started) |
352 | && (srv_fast_shutdown == 3 || trx == trx_roll_crash_recv_trx)) { |
353 | /* Shutdown has been initiated. */ |
354 | trx->error_state = DB_INTERRUPTED; |
355 | return NULL; |
356 | } |
357 | |
358 | if (UNIV_UNLIKELY(trx == trx_roll_crash_recv_trx)) { |
359 | trx_roll_report_progress(); |
360 | } |
361 | |
362 | err = row_undo(node, thr); |
363 | |
364 | trx->error_state = err; |
365 | |
366 | if (err != DB_SUCCESS) { |
367 | /* SQL error detected */ |
368 | |
369 | if (err == DB_OUT_OF_FILE_SPACE) { |
370 | ib::fatal() << "Out of tablespace during rollback." |
371 | " Consider increasing your tablespace." ; |
372 | } |
373 | |
374 | ib::fatal() << "Error (" << ut_strerr(err) << ") in rollback." ; |
375 | } |
376 | |
377 | return(thr); |
378 | } |
379 | |