| 1 | /***************************************************************************** |
| 2 | |
| 3 | Copyright (c) 2005, 2016, Oracle and/or its affiliates. All Rights Reserved. |
| 4 | Copyright (c) 2015, 2017, MariaDB Corporation. |
| 5 | |
| 6 | This program is free software; you can redistribute it and/or modify it under |
| 7 | the terms of the GNU General Public License as published by the Free Software |
| 8 | Foundation; version 2 of the License. |
| 9 | |
| 10 | This program is distributed in the hope that it will be useful, but WITHOUT |
| 11 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| 12 | FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. |
| 13 | |
| 14 | You should have received a copy of the GNU General Public License along with |
| 15 | this program; if not, write to the Free Software Foundation, Inc., |
| 16 | 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA |
| 17 | |
| 18 | *****************************************************************************/ |
| 19 | |
| 20 | /**************************************************//** |
| 21 | @file include/row0merge.h |
| 22 | Index build routines using a merge sort |
| 23 | |
| 24 | Created 13/06/2005 Jan Lindstrom |
| 25 | *******************************************************/ |
| 26 | |
| 27 | #ifndef row0merge_h |
| 28 | #define row0merge_h |
| 29 | |
| 30 | #include "univ.i" |
| 31 | #include "data0data.h" |
| 32 | #include "dict0types.h" |
| 33 | #include "trx0types.h" |
| 34 | #include "que0types.h" |
| 35 | #include "mtr0mtr.h" |
| 36 | #include "rem0types.h" |
| 37 | #include "rem0rec.h" |
| 38 | #include "btr0types.h" |
| 39 | #include "row0mysql.h" |
| 40 | #include "lock0types.h" |
| 41 | #include "srv0srv.h" |
| 42 | #include "ut0stage.h" |
| 43 | |
| 44 | /* Reserve free space from every block for key_version */ |
| 45 | #define ROW_MERGE_RESERVE_SIZE 4 |
| 46 | |
| 47 | /* Cluster index read task is mandatory */ |
| 48 | #define COST_READ_CLUSTERED_INDEX 1.0 |
| 49 | |
| 50 | /* Basic fixed cost to build all type of index */ |
| 51 | #define COST_BUILD_INDEX_STATIC 0.5 |
| 52 | /* Dynamic cost to build all type of index, dynamic cost will be re-distributed based on page count ratio of each index */ |
| 53 | #define COST_BUILD_INDEX_DYNAMIC 0.5 |
| 54 | |
| 55 | /* Sum of below two must be 1.0 */ |
| 56 | #define PCT_COST_MERGESORT_INDEX 0.4 |
| 57 | #define PCT_COST_INSERT_INDEX 0.6 |
| 58 | |
| 59 | // Forward declaration |
| 60 | struct ib_sequence_t; |
| 61 | |
| 62 | /** @brief Block size for I/O operations in merge sort. |
| 63 | |
| 64 | The minimum is srv_page_size, or page_get_free_space_of_empty() |
| 65 | rounded to a power of 2. |
| 66 | |
| 67 | When not creating a PRIMARY KEY that contains column prefixes, this |
| 68 | can be set as small as srv_page_size / 2. */ |
| 69 | typedef byte row_merge_block_t; |
| 70 | |
| 71 | /** @brief Secondary buffer for I/O operations of merge records. |
| 72 | |
| 73 | This buffer is used for writing or reading a record that spans two |
| 74 | row_merge_block_t. Thus, it must be able to hold one merge record, |
| 75 | whose maximum size is the same as the minimum size of |
| 76 | row_merge_block_t. */ |
| 77 | typedef byte mrec_buf_t[UNIV_PAGE_SIZE_MAX]; |
| 78 | |
| 79 | /** @brief Merge record in row_merge_block_t. |
| 80 | |
| 81 | The format is the same as a record in ROW_FORMAT=COMPACT with the |
| 82 | exception that the REC_N_NEW_EXTRA_BYTES are omitted. */ |
| 83 | typedef byte mrec_t; |
| 84 | |
| 85 | /** Merge record in row_merge_buf_t */ |
| 86 | struct mtuple_t { |
| 87 | dfield_t* fields; /*!< data fields */ |
| 88 | }; |
| 89 | |
| 90 | /** Buffer for sorting in main memory. */ |
| 91 | struct row_merge_buf_t { |
| 92 | mem_heap_t* heap; /*!< memory heap where allocated */ |
| 93 | dict_index_t* index; /*!< the index the tuples belong to */ |
| 94 | ulint total_size; /*!< total amount of data bytes */ |
| 95 | ulint n_tuples; /*!< number of data tuples */ |
| 96 | ulint max_tuples; /*!< maximum number of data tuples */ |
| 97 | mtuple_t* tuples; /*!< array of data tuples */ |
| 98 | mtuple_t* tmp_tuples; /*!< temporary copy of tuples, |
| 99 | for sorting */ |
| 100 | }; |
| 101 | |
| 102 | /** Information about temporary files used in merge sort */ |
| 103 | struct merge_file_t { |
| 104 | pfs_os_file_t fd; /*!< file descriptor */ |
| 105 | ulint offset; /*!< file offset (end of file) */ |
| 106 | ib_uint64_t n_rec; /*!< number of records in the file */ |
| 107 | }; |
| 108 | |
| 109 | /** Index field definition */ |
| 110 | struct index_field_t { |
| 111 | ulint col_no; /*!< column offset */ |
| 112 | ulint prefix_len; /*!< column prefix length, or 0 |
| 113 | if indexing the whole column */ |
| 114 | bool is_v_col; /*!< whether this is a virtual column */ |
| 115 | }; |
| 116 | |
| 117 | /** Definition of an index being created */ |
| 118 | struct index_def_t { |
| 119 | const char* name; /*!< index name */ |
| 120 | bool rebuild; /*!< whether the table is rebuilt */ |
| 121 | ulint ind_type; /*!< 0, DICT_UNIQUE, |
| 122 | or DICT_CLUSTERED */ |
| 123 | ulint key_number; /*!< MySQL key number, |
| 124 | or ULINT_UNDEFINED if none */ |
| 125 | ulint n_fields; /*!< number of fields in index */ |
| 126 | index_field_t* fields; /*!< field definitions */ |
| 127 | st_mysql_ftparser* |
| 128 | parser; /*!< fulltext parser plugin */ |
| 129 | }; |
| 130 | |
| 131 | /** Structure for reporting duplicate records. */ |
| 132 | struct row_merge_dup_t { |
| 133 | dict_index_t* index; /*!< index being sorted */ |
| 134 | struct TABLE* table; /*!< MySQL table object */ |
| 135 | const ulint* col_map;/*!< mapping of column numbers |
| 136 | in table to the rebuilt table |
| 137 | (index->table), or NULL if not |
| 138 | rebuilding table */ |
| 139 | ulint n_dup; /*!< number of duplicates */ |
| 140 | }; |
| 141 | |
| 142 | /*************************************************************//** |
| 143 | Report a duplicate key. */ |
| 144 | void |
| 145 | row_merge_dup_report( |
| 146 | /*=================*/ |
| 147 | row_merge_dup_t* dup, /*!< in/out: for reporting duplicates */ |
| 148 | const dfield_t* entry) /*!< in: duplicate index entry */ |
| 149 | MY_ATTRIBUTE((nonnull)); |
| 150 | |
| 151 | /*********************************************************************//** |
| 152 | Sets an exclusive lock on a table, for the duration of creating indexes. |
| 153 | @return error code or DB_SUCCESS */ |
| 154 | dberr_t |
| 155 | row_merge_lock_table( |
| 156 | /*=================*/ |
| 157 | trx_t* trx, /*!< in/out: transaction */ |
| 158 | dict_table_t* table, /*!< in: table to lock */ |
| 159 | enum lock_mode mode) /*!< in: LOCK_X or LOCK_S */ |
| 160 | MY_ATTRIBUTE((nonnull(1,2), warn_unused_result)); |
| 161 | |
| 162 | /*********************************************************************//** |
| 163 | Drop indexes that were created before an error occurred. |
| 164 | The data dictionary must have been locked exclusively by the caller, |
| 165 | because the transaction will not be committed. */ |
| 166 | void |
| 167 | row_merge_drop_indexes_dict( |
| 168 | /*========================*/ |
| 169 | trx_t* trx, /*!< in/out: dictionary transaction */ |
| 170 | table_id_t table_id)/*!< in: table identifier */ |
| 171 | MY_ATTRIBUTE((nonnull)); |
| 172 | |
| 173 | /*********************************************************************//** |
| 174 | Drop those indexes which were created before an error occurred. |
| 175 | The data dictionary must have been locked exclusively by the caller, |
| 176 | because the transaction will not be committed. */ |
| 177 | void |
| 178 | row_merge_drop_indexes( |
| 179 | /*===================*/ |
| 180 | trx_t* trx, /*!< in/out: transaction */ |
| 181 | dict_table_t* table, /*!< in/out: table containing the indexes */ |
| 182 | ibool locked) /*!< in: TRUE=table locked, |
| 183 | FALSE=may need to do a lazy drop */ |
| 184 | MY_ATTRIBUTE((nonnull)); |
| 185 | |
| 186 | /*********************************************************************//** |
| 187 | Drop all partially created indexes during crash recovery. */ |
| 188 | void |
| 189 | row_merge_drop_temp_indexes(void); |
| 190 | /*=============================*/ |
| 191 | |
| 192 | /** Create temporary merge files in the given paramater path, and if |
| 193 | UNIV_PFS_IO defined, register the file descriptor with Performance Schema. |
| 194 | @param[in] path location for creating temporary merge files, or NULL |
| 195 | @return File descriptor */ |
| 196 | pfs_os_file_t |
| 197 | row_merge_file_create_low( |
| 198 | const char* path) |
| 199 | MY_ATTRIBUTE((warn_unused_result)); |
| 200 | /*********************************************************************//** |
| 201 | Destroy a merge file. And de-register the file from Performance Schema |
| 202 | if UNIV_PFS_IO is defined. */ |
| 203 | void |
| 204 | row_merge_file_destroy_low( |
| 205 | /*=======================*/ |
| 206 | const pfs_os_file_t& fd); /*!< in: merge file descriptor */ |
| 207 | |
| 208 | /*********************************************************************//** |
| 209 | Provide a new pathname for a table that is being renamed if it belongs to |
| 210 | a file-per-table tablespace. The caller is responsible for freeing the |
| 211 | memory allocated for the return value. |
| 212 | @return new pathname of tablespace file, or NULL if space = 0 */ |
| 213 | char* |
| 214 | row_make_new_pathname( |
| 215 | /*==================*/ |
| 216 | dict_table_t* table, /*!< in: table to be renamed */ |
| 217 | const char* new_name) /*!< in: new name */ |
| 218 | MY_ATTRIBUTE((nonnull, warn_unused_result)); |
| 219 | |
| 220 | /*********************************************************************//** |
| 221 | Rename the tables in the data dictionary. The data dictionary must |
| 222 | have been locked exclusively by the caller, because the transaction |
| 223 | will not be committed. |
| 224 | @return error code or DB_SUCCESS */ |
| 225 | dberr_t |
| 226 | row_merge_rename_tables_dict( |
| 227 | /*=========================*/ |
| 228 | dict_table_t* old_table, /*!< in/out: old table, renamed to |
| 229 | tmp_name */ |
| 230 | dict_table_t* new_table, /*!< in/out: new table, renamed to |
| 231 | old_table->name */ |
| 232 | const char* tmp_name, /*!< in: new name for old_table */ |
| 233 | trx_t* trx) /*!< in/out: dictionary transaction */ |
| 234 | MY_ATTRIBUTE((nonnull, warn_unused_result)); |
| 235 | |
| 236 | /*********************************************************************//** |
| 237 | Rename an index in the dictionary that was created. The data |
| 238 | dictionary must have been locked exclusively by the caller, because |
| 239 | the transaction will not be committed. |
| 240 | @return DB_SUCCESS if all OK */ |
| 241 | dberr_t |
| 242 | row_merge_rename_index_to_add( |
| 243 | /*==========================*/ |
| 244 | trx_t* trx, /*!< in/out: transaction */ |
| 245 | table_id_t table_id, /*!< in: table identifier */ |
| 246 | index_id_t index_id) /*!< in: index identifier */ |
| 247 | MY_ATTRIBUTE((nonnull(1), warn_unused_result)); |
| 248 | |
| 249 | /*********************************************************************//** |
| 250 | Rename an index in the dictionary that is to be dropped. The data |
| 251 | dictionary must have been locked exclusively by the caller, because |
| 252 | the transaction will not be committed. |
| 253 | @return DB_SUCCESS if all OK */ |
| 254 | dberr_t |
| 255 | row_merge_rename_index_to_drop( |
| 256 | /*===========================*/ |
| 257 | trx_t* trx, /*!< in/out: transaction */ |
| 258 | table_id_t table_id, /*!< in: table identifier */ |
| 259 | index_id_t index_id) /*!< in: index identifier */ |
| 260 | MY_ATTRIBUTE((nonnull(1), warn_unused_result)); |
| 261 | |
| 262 | /** Create the index and load in to the dictionary. |
| 263 | @param[in,out] table the index is on this table |
| 264 | @param[in] index_def the index definition |
| 265 | @param[in] add_v new virtual columns added along with add |
| 266 | index call |
| 267 | @return index, or NULL on error */ |
| 268 | dict_index_t* |
| 269 | row_merge_create_index( |
| 270 | dict_table_t* table, |
| 271 | const index_def_t* index_def, |
| 272 | const dict_add_v_col_t* add_v) |
| 273 | MY_ATTRIBUTE((warn_unused_result)); |
| 274 | |
| 275 | /*********************************************************************//** |
| 276 | Check if a transaction can use an index. |
| 277 | @return whether the index can be used by the transaction */ |
| 278 | bool |
| 279 | row_merge_is_index_usable( |
| 280 | /*======================*/ |
| 281 | const trx_t* trx, /*!< in: transaction */ |
| 282 | const dict_index_t* index) /*!< in: index to check */ |
| 283 | MY_ATTRIBUTE((nonnull, warn_unused_result)); |
| 284 | |
| 285 | /*********************************************************************//** |
| 286 | Drop a table. The caller must have ensured that the background stats |
| 287 | thread is not processing the table. This can be done by calling |
| 288 | dict_stats_wait_bg_to_stop_using_table() after locking the dictionary and |
| 289 | before calling this function. |
| 290 | @return DB_SUCCESS or error code */ |
| 291 | dberr_t |
| 292 | row_merge_drop_table( |
| 293 | /*=================*/ |
| 294 | trx_t* trx, /*!< in: transaction */ |
| 295 | dict_table_t* table) /*!< in: table instance to drop */ |
| 296 | MY_ATTRIBUTE((nonnull, warn_unused_result)); |
| 297 | |
| 298 | /** Build indexes on a table by reading a clustered index, creating a temporary |
| 299 | file containing index entries, merge sorting these index entries and inserting |
| 300 | sorted index entries to indexes. |
| 301 | @param[in] trx transaction |
| 302 | @param[in] old_table table where rows are read from |
| 303 | @param[in] new_table table where indexes are created; identical to |
| 304 | old_table unless creating a PRIMARY KEY |
| 305 | @param[in] online true if creating indexes online |
| 306 | @param[in] indexes indexes to be created |
| 307 | @param[in] key_numbers MySQL key numbers |
| 308 | @param[in] n_indexes size of indexes[] |
| 309 | @param[in,out] table MySQL table, for reporting erroneous key value |
| 310 | if applicable |
| 311 | @param[in] defaults default values of added, changed columns, or NULL |
| 312 | @param[in] col_map mapping of old column numbers to new ones, or |
| 313 | NULL if old_table == new_table |
| 314 | @param[in] add_autoinc number of added AUTO_INCREMENT columns, or |
| 315 | ULINT_UNDEFINED if none is added |
| 316 | @param[in,out] sequence autoinc sequence |
| 317 | @param[in] skip_pk_sort whether the new PRIMARY KEY will follow |
| 318 | existing order |
| 319 | @param[in,out] stage performance schema accounting object, used by |
| 320 | ALTER TABLE. stage->begin_phase_read_pk() will be called at the beginning of |
| 321 | this function and it will be passed to other functions for further accounting. |
| 322 | @param[in] add_v new virtual columns added along with indexes |
| 323 | @param[in] eval_table mysql table used to evaluate virtual column |
| 324 | value, see innobase_get_computed_value(). |
| 325 | @param[in] drop_historical whether to drop historical system rows |
| 326 | @return DB_SUCCESS or error code */ |
| 327 | dberr_t |
| 328 | row_merge_build_indexes( |
| 329 | trx_t* trx, |
| 330 | dict_table_t* old_table, |
| 331 | dict_table_t* new_table, |
| 332 | bool online, |
| 333 | dict_index_t** indexes, |
| 334 | const ulint* key_numbers, |
| 335 | ulint n_indexes, |
| 336 | struct TABLE* table, |
| 337 | const dtuple_t* defaults, |
| 338 | const ulint* col_map, |
| 339 | ulint add_autoinc, |
| 340 | ib_sequence_t& sequence, |
| 341 | bool skip_pk_sort, |
| 342 | ut_stage_alter_t* stage, |
| 343 | const dict_add_v_col_t* add_v, |
| 344 | struct TABLE* eval_table, |
| 345 | bool drop_historical) |
| 346 | MY_ATTRIBUTE((warn_unused_result)); |
| 347 | |
| 348 | /********************************************************************//** |
| 349 | Write a buffer to a block. */ |
| 350 | void |
| 351 | row_merge_buf_write( |
| 352 | /*================*/ |
| 353 | const row_merge_buf_t* buf, /*!< in: sorted buffer */ |
| 354 | const merge_file_t* of, /*!< in: output file */ |
| 355 | row_merge_block_t* block) /*!< out: buffer for writing to file */ |
| 356 | MY_ATTRIBUTE((nonnull)); |
| 357 | |
| 358 | /********************************************************************//** |
| 359 | Sort a buffer. */ |
| 360 | void |
| 361 | row_merge_buf_sort( |
| 362 | /*===============*/ |
| 363 | row_merge_buf_t* buf, /*!< in/out: sort buffer */ |
| 364 | row_merge_dup_t* dup) /*!< in/out: reporter of duplicates |
| 365 | (NULL if non-unique index) */ |
| 366 | MY_ATTRIBUTE((nonnull(1))); |
| 367 | |
| 368 | /********************************************************************//** |
| 369 | Write a merge block to the file system. |
| 370 | @return whether the request was completed successfully */ |
| 371 | UNIV_INTERN |
| 372 | bool |
| 373 | row_merge_write( |
| 374 | /*============*/ |
| 375 | const pfs_os_file_t& fd, /*!< in: file descriptor */ |
| 376 | ulint offset, /*!< in: offset where to write, |
| 377 | in number of row_merge_block_t elements */ |
| 378 | const void* buf, /*!< in: data */ |
| 379 | void* crypt_buf, /*!< in: crypt buf or NULL */ |
| 380 | ulint space) /*!< in: space id */ |
| 381 | MY_ATTRIBUTE((warn_unused_result)); |
| 382 | |
| 383 | /********************************************************************//** |
| 384 | Empty a sort buffer. |
| 385 | @return sort buffer */ |
| 386 | row_merge_buf_t* |
| 387 | row_merge_buf_empty( |
| 388 | /*================*/ |
| 389 | row_merge_buf_t* buf) /*!< in,own: sort buffer */ |
| 390 | MY_ATTRIBUTE((warn_unused_result, nonnull)); |
| 391 | |
| 392 | /** Create a merge file in the given location. |
| 393 | @param[out] merge_file merge file structure |
| 394 | @param[in] path location for creating temporary file, or NULL |
| 395 | @return file descriptor, or -1 on failure */ |
| 396 | pfs_os_file_t |
| 397 | row_merge_file_create( |
| 398 | merge_file_t* merge_file, |
| 399 | const char* path) |
| 400 | MY_ATTRIBUTE((warn_unused_result, nonnull(1))); |
| 401 | |
| 402 | /** Merge disk files. |
| 403 | @param[in] trx transaction |
| 404 | @param[in] dup descriptor of index being created |
| 405 | @param[in,out] file file containing index entries |
| 406 | @param[in,out] block 3 buffers |
| 407 | @param[in,out] tmpfd temporary file handle |
| 408 | @param[in] update_progress true, if we should update progress status |
| 409 | @param[in] pct_progress total progress percent until now |
| 410 | @param[in] pct_ocst current progress percent |
| 411 | @param[in] crypt_block crypt buf or NULL |
| 412 | @param[in] space space_id |
| 413 | @param[in,out] stage performance schema accounting object, used by |
| 414 | ALTER TABLE. If not NULL, stage->begin_phase_sort() will be called initially |
| 415 | and then stage->inc() will be called for each record processed. |
| 416 | @return DB_SUCCESS or error code */ |
| 417 | dberr_t |
| 418 | row_merge_sort( |
| 419 | /*===========*/ |
| 420 | trx_t* trx, |
| 421 | const row_merge_dup_t* dup, |
| 422 | merge_file_t* file, |
| 423 | row_merge_block_t* block, |
| 424 | pfs_os_file_t* tmpfd, |
| 425 | const bool update_progress, |
| 426 | const double pct_progress, |
| 427 | const double pct_cost, |
| 428 | row_merge_block_t* crypt_block, |
| 429 | ulint space, |
| 430 | ut_stage_alter_t* stage = NULL) |
| 431 | MY_ATTRIBUTE((warn_unused_result)); |
| 432 | |
| 433 | /*********************************************************************//** |
| 434 | Allocate a sort buffer. |
| 435 | @return own: sort buffer */ |
| 436 | row_merge_buf_t* |
| 437 | row_merge_buf_create( |
| 438 | /*=================*/ |
| 439 | dict_index_t* index) /*!< in: secondary index */ |
| 440 | MY_ATTRIBUTE((warn_unused_result, nonnull, malloc)); |
| 441 | |
| 442 | /*********************************************************************//** |
| 443 | Deallocate a sort buffer. */ |
| 444 | void |
| 445 | row_merge_buf_free( |
| 446 | /*===============*/ |
| 447 | row_merge_buf_t* buf) /*!< in,own: sort buffer to be freed */ |
| 448 | MY_ATTRIBUTE((nonnull)); |
| 449 | |
| 450 | /*********************************************************************//** |
| 451 | Destroy a merge file. */ |
| 452 | void |
| 453 | row_merge_file_destroy( |
| 454 | /*===================*/ |
| 455 | merge_file_t* merge_file) /*!< in/out: merge file structure */ |
| 456 | MY_ATTRIBUTE((nonnull)); |
| 457 | |
| 458 | /** Read a merge block from the file system. |
| 459 | @return whether the request was completed successfully */ |
| 460 | bool |
| 461 | row_merge_read( |
| 462 | /*===========*/ |
| 463 | const pfs_os_file_t& fd, /*!< in: file descriptor */ |
| 464 | ulint offset, /*!< in: offset where to read |
| 465 | in number of row_merge_block_t |
| 466 | elements */ |
| 467 | row_merge_block_t* buf, /*!< out: data */ |
| 468 | row_merge_block_t* crypt_buf, /*!< in: crypt buf or NULL */ |
| 469 | ulint space) /*!< in: space id */ |
| 470 | MY_ATTRIBUTE((warn_unused_result)); |
| 471 | |
| 472 | /********************************************************************//** |
| 473 | Read a merge record. |
| 474 | @return pointer to next record, or NULL on I/O error or end of list */ |
| 475 | const byte* |
| 476 | row_merge_read_rec( |
| 477 | /*===============*/ |
| 478 | row_merge_block_t* block, /*!< in/out: file buffer */ |
| 479 | mrec_buf_t* buf, /*!< in/out: secondary buffer */ |
| 480 | const byte* b, /*!< in: pointer to record */ |
| 481 | const dict_index_t* index, /*!< in: index of the record */ |
| 482 | const pfs_os_file_t& fd, /*!< in: file descriptor */ |
| 483 | ulint* foffs, /*!< in/out: file offset */ |
| 484 | const mrec_t** mrec, /*!< out: pointer to merge record, |
| 485 | or NULL on end of list |
| 486 | (non-NULL on I/O error) */ |
| 487 | ulint* offsets,/*!< out: offsets of mrec */ |
| 488 | row_merge_block_t* crypt_block, /*!< in: crypt buf or NULL */ |
| 489 | ulint space) /*!< in: space id */ |
| 490 | MY_ATTRIBUTE((warn_unused_result)); |
| 491 | #endif /* row0merge.h */ |
| 492 | |