| 1 | /***************************************************************************** |
| 2 | |
| 3 | Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. |
| 4 | Copyright (c) 2017, MariaDB Corporation. |
| 5 | |
| 6 | This program is free software; you can redistribute it and/or modify it under |
| 7 | the terms of the GNU General Public License as published by the Free Software |
| 8 | Foundation; version 2 of the License. |
| 9 | |
| 10 | This program is distributed in the hope that it will be useful, but WITHOUT |
| 11 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| 12 | FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. |
| 13 | |
| 14 | You should have received a copy of the GNU General Public License along with |
| 15 | this program; if not, write to the Free Software Foundation, Inc., |
| 16 | 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA |
| 17 | |
| 18 | *****************************************************************************/ |
| 19 | |
| 20 | /**************************************************//** |
| 21 | @file row/row0undo.cc |
| 22 | Row undo |
| 23 | |
| 24 | Created 1/8/1997 Heikki Tuuri |
| 25 | *******************************************************/ |
| 26 | |
| 27 | #include "ha_prototypes.h" |
| 28 | |
| 29 | #include "row0undo.h" |
| 30 | #include "fsp0fsp.h" |
| 31 | #include "mach0data.h" |
| 32 | #include "trx0rseg.h" |
| 33 | #include "trx0trx.h" |
| 34 | #include "trx0roll.h" |
| 35 | #include "trx0undo.h" |
| 36 | #include "trx0purge.h" |
| 37 | #include "trx0rec.h" |
| 38 | #include "que0que.h" |
| 39 | #include "row0row.h" |
| 40 | #include "row0uins.h" |
| 41 | #include "row0umod.h" |
| 42 | #include "row0upd.h" |
| 43 | #include "row0mysql.h" |
| 44 | #include "srv0srv.h" |
| 45 | #include "srv0start.h" |
| 46 | |
| 47 | /* How to undo row operations? |
| 48 | (1) For an insert, we have stored a prefix of the clustered index record |
| 49 | in the undo log. Using it, we look for the clustered record, and using |
| 50 | that we look for the records in the secondary indexes. The insert operation |
| 51 | may have been left incomplete, if the database crashed, for example. |
| 52 | We may have look at the trx id and roll ptr to make sure the record in the |
| 53 | clustered index is really the one for which the undo log record was |
| 54 | written. We can use the framework we get from the original insert op. |
| 55 | (2) Delete marking: We can use the framework we get from the original |
| 56 | delete mark op. We only have to check the trx id. |
| 57 | (3) Update: This may be the most complicated. We have to use the framework |
| 58 | we get from the original update op. |
| 59 | |
| 60 | What if the same trx repeatedly deletes and inserts an identical row. |
| 61 | Then the row id changes and also roll ptr. What if the row id was not |
| 62 | part of the ordering fields in the clustered index? Maybe we have to write |
| 63 | it to undo log. Well, maybe not, because if we order the row id and trx id |
| 64 | in descending order, then the only undeleted copy is the first in the |
| 65 | index. Our searches in row operations always position the cursor before |
| 66 | the first record in the result set. But, if there is no key defined for |
| 67 | a table, then it would be desirable that row id is in ascending order. |
| 68 | So, lets store row id in descending order only if it is not an ordering |
| 69 | field in the clustered index. |
| 70 | |
| 71 | NOTE: Deletes and inserts may lead to situation where there are identical |
| 72 | records in a secondary index. Is that a problem in the B-tree? Yes. |
| 73 | Also updates can lead to this, unless trx id and roll ptr are included in |
| 74 | ord fields. |
| 75 | (1) Fix in clustered indexes: include row id, trx id, and roll ptr |
| 76 | in node pointers of B-tree. |
| 77 | (2) Fix in secondary indexes: include all fields in node pointers, and |
| 78 | if an entry is inserted, check if it is equal to the right neighbor, |
| 79 | in which case update the right neighbor: the neighbor must be delete |
| 80 | marked, set it unmarked and write the trx id of the current transaction. |
| 81 | |
| 82 | What if the same trx repeatedly updates the same row, updating a secondary |
| 83 | index field or not? Updating a clustered index ordering field? |
| 84 | |
| 85 | (1) If it does not update the secondary index and not the clustered index |
| 86 | ord field. Then the secondary index record stays unchanged, but the |
| 87 | trx id in the secondary index record may be smaller than in the clustered |
| 88 | index record. This is no problem? |
| 89 | (2) If it updates secondary index ord field but not clustered: then in |
| 90 | secondary index there are delete marked records, which differ in an |
| 91 | ord field. No problem. |
| 92 | (3) Updates clustered ord field but not secondary, and secondary index |
| 93 | is unique. Then the record in secondary index is just updated at the |
| 94 | clustered ord field. |
| 95 | (4) |
| 96 | |
| 97 | Problem with duplicate records: |
| 98 | Fix 1: Add a trx op no field to all indexes. A problem: if a trx with a |
| 99 | bigger trx id has inserted and delete marked a similar row, our trx inserts |
| 100 | again a similar row, and a trx with an even bigger id delete marks it. Then |
| 101 | the position of the row should change in the index if the trx id affects |
| 102 | the alphabetical ordering. |
| 103 | |
| 104 | Fix 2: If an insert encounters a similar row marked deleted, we turn the |
| 105 | insert into an 'update' of the row marked deleted. Then we must write undo |
| 106 | info on the update. A problem: what if a purge operation tries to remove |
| 107 | the delete marked row? |
| 108 | |
| 109 | We can think of the database row versions as a linked list which starts |
| 110 | from the record in the clustered index, and is linked by roll ptrs |
| 111 | through undo logs. The secondary index records are references which tell |
| 112 | what kinds of records can be found in this linked list for a record |
| 113 | in the clustered index. |
| 114 | |
| 115 | How to do the purge? A record can be removed from the clustered index |
| 116 | if its linked list becomes empty, i.e., the row has been marked deleted |
| 117 | and its roll ptr points to the record in the undo log we are going through, |
| 118 | doing the purge. Similarly, during a rollback, a record can be removed |
| 119 | if the stored roll ptr in the undo log points to a trx already (being) purged, |
| 120 | or if the roll ptr is NULL, i.e., it was a fresh insert. */ |
| 121 | |
| 122 | /********************************************************************//** |
| 123 | Creates a row undo node to a query graph. |
| 124 | @return own: undo node */ |
| 125 | undo_node_t* |
| 126 | row_undo_node_create( |
| 127 | /*=================*/ |
| 128 | trx_t* trx, /*!< in/out: transaction */ |
| 129 | que_thr_t* parent, /*!< in: parent node, i.e., a thr node */ |
| 130 | mem_heap_t* heap) /*!< in: memory heap where created */ |
| 131 | { |
| 132 | undo_node_t* undo; |
| 133 | |
| 134 | ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE) |
| 135 | || trx_state_eq(trx, TRX_STATE_PREPARED)); |
| 136 | ut_ad(parent); |
| 137 | |
| 138 | undo = static_cast<undo_node_t*>( |
| 139 | mem_heap_alloc(heap, sizeof(undo_node_t))); |
| 140 | |
| 141 | undo->common.type = QUE_NODE_UNDO; |
| 142 | undo->common.parent = parent; |
| 143 | |
| 144 | undo->state = UNDO_NODE_FETCH_NEXT; |
| 145 | undo->trx = trx; |
| 146 | |
| 147 | btr_pcur_init(&(undo->pcur)); |
| 148 | |
| 149 | undo->heap = mem_heap_create(256); |
| 150 | |
| 151 | return(undo); |
| 152 | } |
| 153 | |
| 154 | /***********************************************************//** |
| 155 | Looks for the clustered index record when node has the row reference. |
| 156 | The pcur in node is used in the search. If found, stores the row to node, |
| 157 | and stores the position of pcur, and detaches it. The pcur must be closed |
| 158 | by the caller in any case. |
| 159 | @return true if found; NOTE the node->pcur must be closed by the |
| 160 | caller, regardless of the return value */ |
| 161 | bool |
| 162 | row_undo_search_clust_to_pcur( |
| 163 | /*==========================*/ |
| 164 | undo_node_t* node) /*!< in/out: row undo node */ |
| 165 | { |
| 166 | dict_index_t* clust_index; |
| 167 | bool found; |
| 168 | mtr_t mtr; |
| 169 | row_ext_t** ext; |
| 170 | const rec_t* rec; |
| 171 | mem_heap_t* heap = NULL; |
| 172 | ulint offsets_[REC_OFFS_NORMAL_SIZE]; |
| 173 | ulint* offsets = offsets_; |
| 174 | rec_offs_init(offsets_); |
| 175 | |
| 176 | ut_ad(!node->table->skip_alter_undo); |
| 177 | |
| 178 | mtr_start(&mtr); |
| 179 | |
| 180 | clust_index = dict_table_get_first_index(node->table); |
| 181 | |
| 182 | found = row_search_on_row_ref(&node->pcur, BTR_MODIFY_LEAF, |
| 183 | node->table, node->ref, &mtr); |
| 184 | |
| 185 | if (!found) { |
| 186 | goto func_exit; |
| 187 | } |
| 188 | |
| 189 | rec = btr_pcur_get_rec(&node->pcur); |
| 190 | |
| 191 | offsets = rec_get_offsets(rec, clust_index, offsets, true, |
| 192 | ULINT_UNDEFINED, &heap); |
| 193 | |
| 194 | found = row_get_rec_roll_ptr(rec, clust_index, offsets) |
| 195 | == node->roll_ptr; |
| 196 | |
| 197 | if (found) { |
| 198 | ut_ad(row_get_rec_trx_id(rec, clust_index, offsets) |
| 199 | == node->trx->id); |
| 200 | |
| 201 | if (dict_table_has_atomic_blobs(node->table)) { |
| 202 | /* There is no prefix of externally stored |
| 203 | columns in the clustered index record. Build a |
| 204 | cache of column prefixes. */ |
| 205 | ext = &node->ext; |
| 206 | } else { |
| 207 | /* REDUNDANT and COMPACT formats store a local |
| 208 | 768-byte prefix of each externally stored |
| 209 | column. No cache is needed. */ |
| 210 | ext = NULL; |
| 211 | node->ext = NULL; |
| 212 | } |
| 213 | |
| 214 | node->row = row_build(ROW_COPY_DATA, clust_index, rec, |
| 215 | offsets, NULL, |
| 216 | NULL, NULL, ext, node->heap); |
| 217 | |
| 218 | /* We will need to parse out virtual column info from undo |
| 219 | log, first mark them DATA_MISSING. So we will know if the |
| 220 | value gets updated */ |
| 221 | if (node->table->n_v_cols |
| 222 | && node->state != UNDO_NODE_INSERT |
| 223 | && !(node->cmpl_info & UPD_NODE_NO_ORD_CHANGE)) { |
| 224 | for (ulint i = 0; |
| 225 | i < dict_table_get_n_v_cols(node->table); i++) { |
| 226 | dfield_get_type(dtuple_get_nth_v_field( |
| 227 | node->row, i))->mtype = DATA_MISSING; |
| 228 | } |
| 229 | } |
| 230 | |
| 231 | if (node->rec_type == TRX_UNDO_UPD_EXIST_REC) { |
| 232 | ut_ad(node->row->info_bits == REC_INFO_MIN_REC_FLAG |
| 233 | || node->row->info_bits == 0); |
| 234 | node->undo_row = dtuple_copy(node->row, node->heap); |
| 235 | row_upd_replace(node->undo_row, &node->undo_ext, |
| 236 | clust_index, node->update, node->heap); |
| 237 | } else { |
| 238 | ut_ad((node->row->info_bits == REC_INFO_MIN_REC_FLAG) |
| 239 | == (node->rec_type == TRX_UNDO_INSERT_DEFAULT)); |
| 240 | node->undo_row = NULL; |
| 241 | node->undo_ext = NULL; |
| 242 | } |
| 243 | |
| 244 | btr_pcur_store_position(&node->pcur, &mtr); |
| 245 | } |
| 246 | |
| 247 | if (heap) { |
| 248 | mem_heap_free(heap); |
| 249 | } |
| 250 | |
| 251 | func_exit: |
| 252 | btr_pcur_commit_specify_mtr(&node->pcur, &mtr); |
| 253 | return(found); |
| 254 | } |
| 255 | |
| 256 | /***********************************************************//** |
| 257 | Fetches an undo log record and does the undo for the recorded operation. |
| 258 | If none left, or a partial rollback completed, returns control to the |
| 259 | parent node, which is always a query thread node. |
| 260 | @return DB_SUCCESS if operation successfully completed, else error code */ |
| 261 | static MY_ATTRIBUTE((warn_unused_result)) |
| 262 | dberr_t |
| 263 | row_undo( |
| 264 | /*=====*/ |
| 265 | undo_node_t* node, /*!< in: row undo node */ |
| 266 | que_thr_t* thr) /*!< in: query thread */ |
| 267 | { |
| 268 | trx_t* trx = node->trx; |
| 269 | ut_ad(trx->in_rollback); |
| 270 | |
| 271 | if (node->state == UNDO_NODE_FETCH_NEXT) { |
| 272 | |
| 273 | node->undo_rec = trx_roll_pop_top_rec_of_trx( |
| 274 | trx, &node->roll_ptr, node->heap); |
| 275 | |
| 276 | if (!node->undo_rec) { |
| 277 | /* Rollback completed for this query thread */ |
| 278 | thr->run_node = que_node_get_parent(node); |
| 279 | return(DB_SUCCESS); |
| 280 | } |
| 281 | |
| 282 | node->undo_no = trx_undo_rec_get_undo_no(node->undo_rec); |
| 283 | node->state = trx_undo_roll_ptr_is_insert(node->roll_ptr) |
| 284 | ? UNDO_NODE_INSERT : UNDO_NODE_MODIFY; |
| 285 | } |
| 286 | |
| 287 | /* Prevent DROP TABLE etc. while we are rolling back this row. |
| 288 | If we are doing a TABLE CREATE or some other dictionary operation, |
| 289 | then we already have dict_operation_lock locked in x-mode. Do not |
| 290 | try to lock again, because that would cause a hang. */ |
| 291 | |
| 292 | const bool locked_data_dict = (trx->dict_operation_lock_mode == 0); |
| 293 | |
| 294 | if (locked_data_dict) { |
| 295 | |
| 296 | row_mysql_freeze_data_dictionary(trx); |
| 297 | } |
| 298 | |
| 299 | dberr_t err; |
| 300 | |
| 301 | if (node->state == UNDO_NODE_INSERT) { |
| 302 | |
| 303 | err = row_undo_ins(node, thr); |
| 304 | |
| 305 | node->state = UNDO_NODE_FETCH_NEXT; |
| 306 | } else { |
| 307 | ut_ad(node->state == UNDO_NODE_MODIFY); |
| 308 | err = row_undo_mod(node, thr); |
| 309 | } |
| 310 | |
| 311 | if (locked_data_dict) { |
| 312 | |
| 313 | row_mysql_unfreeze_data_dictionary(trx); |
| 314 | } |
| 315 | |
| 316 | /* Do some cleanup */ |
| 317 | btr_pcur_close(&(node->pcur)); |
| 318 | |
| 319 | mem_heap_empty(node->heap); |
| 320 | |
| 321 | thr->run_node = node; |
| 322 | |
| 323 | return(err); |
| 324 | } |
| 325 | |
| 326 | /***********************************************************//** |
| 327 | Undoes a row operation in a table. This is a high-level function used |
| 328 | in SQL execution graphs. |
| 329 | @return query thread to run next or NULL */ |
| 330 | que_thr_t* |
| 331 | row_undo_step( |
| 332 | /*==========*/ |
| 333 | que_thr_t* thr) /*!< in: query thread */ |
| 334 | { |
| 335 | dberr_t err; |
| 336 | undo_node_t* node; |
| 337 | trx_t* trx; |
| 338 | |
| 339 | ut_ad(thr); |
| 340 | |
| 341 | srv_inc_activity_count(); |
| 342 | |
| 343 | trx = thr_get_trx(thr); |
| 344 | |
| 345 | node = static_cast<undo_node_t*>(thr->run_node); |
| 346 | |
| 347 | ut_ad(que_node_get_type(node) == QUE_NODE_UNDO); |
| 348 | |
| 349 | if (UNIV_UNLIKELY(trx_get_dict_operation(trx) == TRX_DICT_OP_NONE |
| 350 | && !srv_undo_sources |
| 351 | && !srv_is_being_started) |
| 352 | && (srv_fast_shutdown == 3 || trx == trx_roll_crash_recv_trx)) { |
| 353 | /* Shutdown has been initiated. */ |
| 354 | trx->error_state = DB_INTERRUPTED; |
| 355 | return NULL; |
| 356 | } |
| 357 | |
| 358 | if (UNIV_UNLIKELY(trx == trx_roll_crash_recv_trx)) { |
| 359 | trx_roll_report_progress(); |
| 360 | } |
| 361 | |
| 362 | err = row_undo(node, thr); |
| 363 | |
| 364 | trx->error_state = err; |
| 365 | |
| 366 | if (err != DB_SUCCESS) { |
| 367 | /* SQL error detected */ |
| 368 | |
| 369 | if (err == DB_OUT_OF_FILE_SPACE) { |
| 370 | ib::fatal() << "Out of tablespace during rollback." |
| 371 | " Consider increasing your tablespace." ; |
| 372 | } |
| 373 | |
| 374 | ib::fatal() << "Error (" << ut_strerr(err) << ") in rollback." ; |
| 375 | } |
| 376 | |
| 377 | return(thr); |
| 378 | } |
| 379 | |