1 | /***************************************************************************** |
2 | |
3 | Copyright (c) 2005, 2016, Oracle and/or its affiliates. All Rights Reserved. |
4 | Copyright (c) 2015, 2017, MariaDB Corporation. |
5 | |
6 | This program is free software; you can redistribute it and/or modify it under |
7 | the terms of the GNU General Public License as published by the Free Software |
8 | Foundation; version 2 of the License. |
9 | |
10 | This program is distributed in the hope that it will be useful, but WITHOUT |
11 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
12 | FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. |
13 | |
14 | You should have received a copy of the GNU General Public License along with |
15 | this program; if not, write to the Free Software Foundation, Inc., |
16 | 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA |
17 | |
18 | *****************************************************************************/ |
19 | |
20 | /**************************************************//** |
21 | @file include/row0merge.h |
22 | Index build routines using a merge sort |
23 | |
24 | Created 13/06/2005 Jan Lindstrom |
25 | *******************************************************/ |
26 | |
27 | #ifndef row0merge_h |
28 | #define row0merge_h |
29 | |
30 | #include "univ.i" |
31 | #include "data0data.h" |
32 | #include "dict0types.h" |
33 | #include "trx0types.h" |
34 | #include "que0types.h" |
35 | #include "mtr0mtr.h" |
36 | #include "rem0types.h" |
37 | #include "rem0rec.h" |
38 | #include "btr0types.h" |
39 | #include "row0mysql.h" |
40 | #include "lock0types.h" |
41 | #include "srv0srv.h" |
42 | #include "ut0stage.h" |
43 | |
44 | /* Reserve free space from every block for key_version */ |
45 | #define ROW_MERGE_RESERVE_SIZE 4 |
46 | |
47 | /* Cluster index read task is mandatory */ |
48 | #define COST_READ_CLUSTERED_INDEX 1.0 |
49 | |
50 | /* Basic fixed cost to build all type of index */ |
51 | #define COST_BUILD_INDEX_STATIC 0.5 |
52 | /* Dynamic cost to build all type of index, dynamic cost will be re-distributed based on page count ratio of each index */ |
53 | #define COST_BUILD_INDEX_DYNAMIC 0.5 |
54 | |
55 | /* Sum of below two must be 1.0 */ |
56 | #define PCT_COST_MERGESORT_INDEX 0.4 |
57 | #define PCT_COST_INSERT_INDEX 0.6 |
58 | |
59 | // Forward declaration |
60 | struct ib_sequence_t; |
61 | |
62 | /** @brief Block size for I/O operations in merge sort. |
63 | |
64 | The minimum is srv_page_size, or page_get_free_space_of_empty() |
65 | rounded to a power of 2. |
66 | |
67 | When not creating a PRIMARY KEY that contains column prefixes, this |
68 | can be set as small as srv_page_size / 2. */ |
69 | typedef byte row_merge_block_t; |
70 | |
71 | /** @brief Secondary buffer for I/O operations of merge records. |
72 | |
73 | This buffer is used for writing or reading a record that spans two |
74 | row_merge_block_t. Thus, it must be able to hold one merge record, |
75 | whose maximum size is the same as the minimum size of |
76 | row_merge_block_t. */ |
77 | typedef byte mrec_buf_t[UNIV_PAGE_SIZE_MAX]; |
78 | |
79 | /** @brief Merge record in row_merge_block_t. |
80 | |
81 | The format is the same as a record in ROW_FORMAT=COMPACT with the |
82 | exception that the REC_N_NEW_EXTRA_BYTES are omitted. */ |
83 | typedef byte mrec_t; |
84 | |
85 | /** Merge record in row_merge_buf_t */ |
86 | struct mtuple_t { |
87 | dfield_t* fields; /*!< data fields */ |
88 | }; |
89 | |
90 | /** Buffer for sorting in main memory. */ |
91 | struct row_merge_buf_t { |
92 | mem_heap_t* heap; /*!< memory heap where allocated */ |
93 | dict_index_t* index; /*!< the index the tuples belong to */ |
94 | ulint total_size; /*!< total amount of data bytes */ |
95 | ulint n_tuples; /*!< number of data tuples */ |
96 | ulint max_tuples; /*!< maximum number of data tuples */ |
97 | mtuple_t* tuples; /*!< array of data tuples */ |
98 | mtuple_t* tmp_tuples; /*!< temporary copy of tuples, |
99 | for sorting */ |
100 | }; |
101 | |
102 | /** Information about temporary files used in merge sort */ |
103 | struct merge_file_t { |
104 | pfs_os_file_t fd; /*!< file descriptor */ |
105 | ulint offset; /*!< file offset (end of file) */ |
106 | ib_uint64_t n_rec; /*!< number of records in the file */ |
107 | }; |
108 | |
109 | /** Index field definition */ |
110 | struct index_field_t { |
111 | ulint col_no; /*!< column offset */ |
112 | ulint prefix_len; /*!< column prefix length, or 0 |
113 | if indexing the whole column */ |
114 | bool is_v_col; /*!< whether this is a virtual column */ |
115 | }; |
116 | |
117 | /** Definition of an index being created */ |
118 | struct index_def_t { |
119 | const char* name; /*!< index name */ |
120 | bool rebuild; /*!< whether the table is rebuilt */ |
121 | ulint ind_type; /*!< 0, DICT_UNIQUE, |
122 | or DICT_CLUSTERED */ |
123 | ulint key_number; /*!< MySQL key number, |
124 | or ULINT_UNDEFINED if none */ |
125 | ulint n_fields; /*!< number of fields in index */ |
126 | index_field_t* fields; /*!< field definitions */ |
127 | st_mysql_ftparser* |
128 | parser; /*!< fulltext parser plugin */ |
129 | }; |
130 | |
131 | /** Structure for reporting duplicate records. */ |
132 | struct row_merge_dup_t { |
133 | dict_index_t* index; /*!< index being sorted */ |
134 | struct TABLE* table; /*!< MySQL table object */ |
135 | const ulint* col_map;/*!< mapping of column numbers |
136 | in table to the rebuilt table |
137 | (index->table), or NULL if not |
138 | rebuilding table */ |
139 | ulint n_dup; /*!< number of duplicates */ |
140 | }; |
141 | |
142 | /*************************************************************//** |
143 | Report a duplicate key. */ |
144 | void |
145 | row_merge_dup_report( |
146 | /*=================*/ |
147 | row_merge_dup_t* dup, /*!< in/out: for reporting duplicates */ |
148 | const dfield_t* entry) /*!< in: duplicate index entry */ |
149 | MY_ATTRIBUTE((nonnull)); |
150 | |
151 | /*********************************************************************//** |
152 | Sets an exclusive lock on a table, for the duration of creating indexes. |
153 | @return error code or DB_SUCCESS */ |
154 | dberr_t |
155 | row_merge_lock_table( |
156 | /*=================*/ |
157 | trx_t* trx, /*!< in/out: transaction */ |
158 | dict_table_t* table, /*!< in: table to lock */ |
159 | enum lock_mode mode) /*!< in: LOCK_X or LOCK_S */ |
160 | MY_ATTRIBUTE((nonnull(1,2), warn_unused_result)); |
161 | |
162 | /*********************************************************************//** |
163 | Drop indexes that were created before an error occurred. |
164 | The data dictionary must have been locked exclusively by the caller, |
165 | because the transaction will not be committed. */ |
166 | void |
167 | row_merge_drop_indexes_dict( |
168 | /*========================*/ |
169 | trx_t* trx, /*!< in/out: dictionary transaction */ |
170 | table_id_t table_id)/*!< in: table identifier */ |
171 | MY_ATTRIBUTE((nonnull)); |
172 | |
173 | /*********************************************************************//** |
174 | Drop those indexes which were created before an error occurred. |
175 | The data dictionary must have been locked exclusively by the caller, |
176 | because the transaction will not be committed. */ |
177 | void |
178 | row_merge_drop_indexes( |
179 | /*===================*/ |
180 | trx_t* trx, /*!< in/out: transaction */ |
181 | dict_table_t* table, /*!< in/out: table containing the indexes */ |
182 | ibool locked) /*!< in: TRUE=table locked, |
183 | FALSE=may need to do a lazy drop */ |
184 | MY_ATTRIBUTE((nonnull)); |
185 | |
186 | /*********************************************************************//** |
187 | Drop all partially created indexes during crash recovery. */ |
188 | void |
189 | row_merge_drop_temp_indexes(void); |
190 | /*=============================*/ |
191 | |
192 | /** Create temporary merge files in the given paramater path, and if |
193 | UNIV_PFS_IO defined, register the file descriptor with Performance Schema. |
194 | @param[in] path location for creating temporary merge files, or NULL |
195 | @return File descriptor */ |
196 | pfs_os_file_t |
197 | row_merge_file_create_low( |
198 | const char* path) |
199 | MY_ATTRIBUTE((warn_unused_result)); |
200 | /*********************************************************************//** |
201 | Destroy a merge file. And de-register the file from Performance Schema |
202 | if UNIV_PFS_IO is defined. */ |
203 | void |
204 | row_merge_file_destroy_low( |
205 | /*=======================*/ |
206 | const pfs_os_file_t& fd); /*!< in: merge file descriptor */ |
207 | |
208 | /*********************************************************************//** |
209 | Provide a new pathname for a table that is being renamed if it belongs to |
210 | a file-per-table tablespace. The caller is responsible for freeing the |
211 | memory allocated for the return value. |
212 | @return new pathname of tablespace file, or NULL if space = 0 */ |
213 | char* |
214 | row_make_new_pathname( |
215 | /*==================*/ |
216 | dict_table_t* table, /*!< in: table to be renamed */ |
217 | const char* new_name) /*!< in: new name */ |
218 | MY_ATTRIBUTE((nonnull, warn_unused_result)); |
219 | |
220 | /*********************************************************************//** |
221 | Rename the tables in the data dictionary. The data dictionary must |
222 | have been locked exclusively by the caller, because the transaction |
223 | will not be committed. |
224 | @return error code or DB_SUCCESS */ |
225 | dberr_t |
226 | row_merge_rename_tables_dict( |
227 | /*=========================*/ |
228 | dict_table_t* old_table, /*!< in/out: old table, renamed to |
229 | tmp_name */ |
230 | dict_table_t* new_table, /*!< in/out: new table, renamed to |
231 | old_table->name */ |
232 | const char* tmp_name, /*!< in: new name for old_table */ |
233 | trx_t* trx) /*!< in/out: dictionary transaction */ |
234 | MY_ATTRIBUTE((nonnull, warn_unused_result)); |
235 | |
236 | /*********************************************************************//** |
237 | Rename an index in the dictionary that was created. The data |
238 | dictionary must have been locked exclusively by the caller, because |
239 | the transaction will not be committed. |
240 | @return DB_SUCCESS if all OK */ |
241 | dberr_t |
242 | row_merge_rename_index_to_add( |
243 | /*==========================*/ |
244 | trx_t* trx, /*!< in/out: transaction */ |
245 | table_id_t table_id, /*!< in: table identifier */ |
246 | index_id_t index_id) /*!< in: index identifier */ |
247 | MY_ATTRIBUTE((nonnull(1), warn_unused_result)); |
248 | |
249 | /*********************************************************************//** |
250 | Rename an index in the dictionary that is to be dropped. The data |
251 | dictionary must have been locked exclusively by the caller, because |
252 | the transaction will not be committed. |
253 | @return DB_SUCCESS if all OK */ |
254 | dberr_t |
255 | row_merge_rename_index_to_drop( |
256 | /*===========================*/ |
257 | trx_t* trx, /*!< in/out: transaction */ |
258 | table_id_t table_id, /*!< in: table identifier */ |
259 | index_id_t index_id) /*!< in: index identifier */ |
260 | MY_ATTRIBUTE((nonnull(1), warn_unused_result)); |
261 | |
262 | /** Create the index and load in to the dictionary. |
263 | @param[in,out] table the index is on this table |
264 | @param[in] index_def the index definition |
265 | @param[in] add_v new virtual columns added along with add |
266 | index call |
267 | @return index, or NULL on error */ |
268 | dict_index_t* |
269 | row_merge_create_index( |
270 | dict_table_t* table, |
271 | const index_def_t* index_def, |
272 | const dict_add_v_col_t* add_v) |
273 | MY_ATTRIBUTE((warn_unused_result)); |
274 | |
275 | /*********************************************************************//** |
276 | Check if a transaction can use an index. |
277 | @return whether the index can be used by the transaction */ |
278 | bool |
279 | row_merge_is_index_usable( |
280 | /*======================*/ |
281 | const trx_t* trx, /*!< in: transaction */ |
282 | const dict_index_t* index) /*!< in: index to check */ |
283 | MY_ATTRIBUTE((nonnull, warn_unused_result)); |
284 | |
285 | /*********************************************************************//** |
286 | Drop a table. The caller must have ensured that the background stats |
287 | thread is not processing the table. This can be done by calling |
288 | dict_stats_wait_bg_to_stop_using_table() after locking the dictionary and |
289 | before calling this function. |
290 | @return DB_SUCCESS or error code */ |
291 | dberr_t |
292 | row_merge_drop_table( |
293 | /*=================*/ |
294 | trx_t* trx, /*!< in: transaction */ |
295 | dict_table_t* table) /*!< in: table instance to drop */ |
296 | MY_ATTRIBUTE((nonnull, warn_unused_result)); |
297 | |
298 | /** Build indexes on a table by reading a clustered index, creating a temporary |
299 | file containing index entries, merge sorting these index entries and inserting |
300 | sorted index entries to indexes. |
301 | @param[in] trx transaction |
302 | @param[in] old_table table where rows are read from |
303 | @param[in] new_table table where indexes are created; identical to |
304 | old_table unless creating a PRIMARY KEY |
305 | @param[in] online true if creating indexes online |
306 | @param[in] indexes indexes to be created |
307 | @param[in] key_numbers MySQL key numbers |
308 | @param[in] n_indexes size of indexes[] |
309 | @param[in,out] table MySQL table, for reporting erroneous key value |
310 | if applicable |
311 | @param[in] defaults default values of added, changed columns, or NULL |
312 | @param[in] col_map mapping of old column numbers to new ones, or |
313 | NULL if old_table == new_table |
314 | @param[in] add_autoinc number of added AUTO_INCREMENT columns, or |
315 | ULINT_UNDEFINED if none is added |
316 | @param[in,out] sequence autoinc sequence |
317 | @param[in] skip_pk_sort whether the new PRIMARY KEY will follow |
318 | existing order |
319 | @param[in,out] stage performance schema accounting object, used by |
320 | ALTER TABLE. stage->begin_phase_read_pk() will be called at the beginning of |
321 | this function and it will be passed to other functions for further accounting. |
322 | @param[in] add_v new virtual columns added along with indexes |
323 | @param[in] eval_table mysql table used to evaluate virtual column |
324 | value, see innobase_get_computed_value(). |
325 | @param[in] drop_historical whether to drop historical system rows |
326 | @return DB_SUCCESS or error code */ |
327 | dberr_t |
328 | row_merge_build_indexes( |
329 | trx_t* trx, |
330 | dict_table_t* old_table, |
331 | dict_table_t* new_table, |
332 | bool online, |
333 | dict_index_t** indexes, |
334 | const ulint* key_numbers, |
335 | ulint n_indexes, |
336 | struct TABLE* table, |
337 | const dtuple_t* defaults, |
338 | const ulint* col_map, |
339 | ulint add_autoinc, |
340 | ib_sequence_t& sequence, |
341 | bool skip_pk_sort, |
342 | ut_stage_alter_t* stage, |
343 | const dict_add_v_col_t* add_v, |
344 | struct TABLE* eval_table, |
345 | bool drop_historical) |
346 | MY_ATTRIBUTE((warn_unused_result)); |
347 | |
348 | /********************************************************************//** |
349 | Write a buffer to a block. */ |
350 | void |
351 | row_merge_buf_write( |
352 | /*================*/ |
353 | const row_merge_buf_t* buf, /*!< in: sorted buffer */ |
354 | const merge_file_t* of, /*!< in: output file */ |
355 | row_merge_block_t* block) /*!< out: buffer for writing to file */ |
356 | MY_ATTRIBUTE((nonnull)); |
357 | |
358 | /********************************************************************//** |
359 | Sort a buffer. */ |
360 | void |
361 | row_merge_buf_sort( |
362 | /*===============*/ |
363 | row_merge_buf_t* buf, /*!< in/out: sort buffer */ |
364 | row_merge_dup_t* dup) /*!< in/out: reporter of duplicates |
365 | (NULL if non-unique index) */ |
366 | MY_ATTRIBUTE((nonnull(1))); |
367 | |
368 | /********************************************************************//** |
369 | Write a merge block to the file system. |
370 | @return whether the request was completed successfully */ |
371 | UNIV_INTERN |
372 | bool |
373 | row_merge_write( |
374 | /*============*/ |
375 | const pfs_os_file_t& fd, /*!< in: file descriptor */ |
376 | ulint offset, /*!< in: offset where to write, |
377 | in number of row_merge_block_t elements */ |
378 | const void* buf, /*!< in: data */ |
379 | void* crypt_buf, /*!< in: crypt buf or NULL */ |
380 | ulint space) /*!< in: space id */ |
381 | MY_ATTRIBUTE((warn_unused_result)); |
382 | |
383 | /********************************************************************//** |
384 | Empty a sort buffer. |
385 | @return sort buffer */ |
386 | row_merge_buf_t* |
387 | row_merge_buf_empty( |
388 | /*================*/ |
389 | row_merge_buf_t* buf) /*!< in,own: sort buffer */ |
390 | MY_ATTRIBUTE((warn_unused_result, nonnull)); |
391 | |
392 | /** Create a merge file in the given location. |
393 | @param[out] merge_file merge file structure |
394 | @param[in] path location for creating temporary file, or NULL |
395 | @return file descriptor, or -1 on failure */ |
396 | pfs_os_file_t |
397 | row_merge_file_create( |
398 | merge_file_t* merge_file, |
399 | const char* path) |
400 | MY_ATTRIBUTE((warn_unused_result, nonnull(1))); |
401 | |
402 | /** Merge disk files. |
403 | @param[in] trx transaction |
404 | @param[in] dup descriptor of index being created |
405 | @param[in,out] file file containing index entries |
406 | @param[in,out] block 3 buffers |
407 | @param[in,out] tmpfd temporary file handle |
408 | @param[in] update_progress true, if we should update progress status |
409 | @param[in] pct_progress total progress percent until now |
410 | @param[in] pct_ocst current progress percent |
411 | @param[in] crypt_block crypt buf or NULL |
412 | @param[in] space space_id |
413 | @param[in,out] stage performance schema accounting object, used by |
414 | ALTER TABLE. If not NULL, stage->begin_phase_sort() will be called initially |
415 | and then stage->inc() will be called for each record processed. |
416 | @return DB_SUCCESS or error code */ |
417 | dberr_t |
418 | row_merge_sort( |
419 | /*===========*/ |
420 | trx_t* trx, |
421 | const row_merge_dup_t* dup, |
422 | merge_file_t* file, |
423 | row_merge_block_t* block, |
424 | pfs_os_file_t* tmpfd, |
425 | const bool update_progress, |
426 | const double pct_progress, |
427 | const double pct_cost, |
428 | row_merge_block_t* crypt_block, |
429 | ulint space, |
430 | ut_stage_alter_t* stage = NULL) |
431 | MY_ATTRIBUTE((warn_unused_result)); |
432 | |
433 | /*********************************************************************//** |
434 | Allocate a sort buffer. |
435 | @return own: sort buffer */ |
436 | row_merge_buf_t* |
437 | row_merge_buf_create( |
438 | /*=================*/ |
439 | dict_index_t* index) /*!< in: secondary index */ |
440 | MY_ATTRIBUTE((warn_unused_result, nonnull, malloc)); |
441 | |
442 | /*********************************************************************//** |
443 | Deallocate a sort buffer. */ |
444 | void |
445 | row_merge_buf_free( |
446 | /*===============*/ |
447 | row_merge_buf_t* buf) /*!< in,own: sort buffer to be freed */ |
448 | MY_ATTRIBUTE((nonnull)); |
449 | |
450 | /*********************************************************************//** |
451 | Destroy a merge file. */ |
452 | void |
453 | row_merge_file_destroy( |
454 | /*===================*/ |
455 | merge_file_t* merge_file) /*!< in/out: merge file structure */ |
456 | MY_ATTRIBUTE((nonnull)); |
457 | |
458 | /** Read a merge block from the file system. |
459 | @return whether the request was completed successfully */ |
460 | bool |
461 | row_merge_read( |
462 | /*===========*/ |
463 | const pfs_os_file_t& fd, /*!< in: file descriptor */ |
464 | ulint offset, /*!< in: offset where to read |
465 | in number of row_merge_block_t |
466 | elements */ |
467 | row_merge_block_t* buf, /*!< out: data */ |
468 | row_merge_block_t* crypt_buf, /*!< in: crypt buf or NULL */ |
469 | ulint space) /*!< in: space id */ |
470 | MY_ATTRIBUTE((warn_unused_result)); |
471 | |
472 | /********************************************************************//** |
473 | Read a merge record. |
474 | @return pointer to next record, or NULL on I/O error or end of list */ |
475 | const byte* |
476 | row_merge_read_rec( |
477 | /*===============*/ |
478 | row_merge_block_t* block, /*!< in/out: file buffer */ |
479 | mrec_buf_t* buf, /*!< in/out: secondary buffer */ |
480 | const byte* b, /*!< in: pointer to record */ |
481 | const dict_index_t* index, /*!< in: index of the record */ |
482 | const pfs_os_file_t& fd, /*!< in: file descriptor */ |
483 | ulint* foffs, /*!< in/out: file offset */ |
484 | const mrec_t** mrec, /*!< out: pointer to merge record, |
485 | or NULL on end of list |
486 | (non-NULL on I/O error) */ |
487 | ulint* offsets,/*!< out: offsets of mrec */ |
488 | row_merge_block_t* crypt_block, /*!< in: crypt buf or NULL */ |
489 | ulint space) /*!< in: space id */ |
490 | MY_ATTRIBUTE((warn_unused_result)); |
491 | #endif /* row0merge.h */ |
492 | |