1/*****************************************************************************
2
3Copyright (c) 2005, 2016, Oracle and/or its affiliates. All Rights Reserved.
4Copyright (c) 2015, 2017, MariaDB Corporation.
5
6This program is free software; you can redistribute it and/or modify it under
7the terms of the GNU General Public License as published by the Free Software
8Foundation; version 2 of the License.
9
10This program is distributed in the hope that it will be useful, but WITHOUT
11ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13
14You should have received a copy of the GNU General Public License along with
15this program; if not, write to the Free Software Foundation, Inc.,
1651 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
17
18*****************************************************************************/
19
20/**************************************************//**
21@file include/row0merge.h
22Index build routines using a merge sort
23
24Created 13/06/2005 Jan Lindstrom
25*******************************************************/
26
27#ifndef row0merge_h
28#define row0merge_h
29
30#include "univ.i"
31#include "data0data.h"
32#include "dict0types.h"
33#include "trx0types.h"
34#include "que0types.h"
35#include "mtr0mtr.h"
36#include "rem0types.h"
37#include "rem0rec.h"
38#include "btr0types.h"
39#include "row0mysql.h"
40#include "lock0types.h"
41#include "srv0srv.h"
42#include "ut0stage.h"
43
44/* Reserve free space from every block for key_version */
45#define ROW_MERGE_RESERVE_SIZE 4
46
47/* Cluster index read task is mandatory */
48#define COST_READ_CLUSTERED_INDEX 1.0
49
50/* Basic fixed cost to build all type of index */
51#define COST_BUILD_INDEX_STATIC 0.5
52/* Dynamic cost to build all type of index, dynamic cost will be re-distributed based on page count ratio of each index */
53#define COST_BUILD_INDEX_DYNAMIC 0.5
54
55/* Sum of below two must be 1.0 */
56#define PCT_COST_MERGESORT_INDEX 0.4
57#define PCT_COST_INSERT_INDEX 0.6
58
59// Forward declaration
60struct ib_sequence_t;
61
62/** @brief Block size for I/O operations in merge sort.
63
64The minimum is srv_page_size, or page_get_free_space_of_empty()
65rounded to a power of 2.
66
67When not creating a PRIMARY KEY that contains column prefixes, this
68can be set as small as srv_page_size / 2. */
69typedef byte row_merge_block_t;
70
71/** @brief Secondary buffer for I/O operations of merge records.
72
73This buffer is used for writing or reading a record that spans two
74row_merge_block_t. Thus, it must be able to hold one merge record,
75whose maximum size is the same as the minimum size of
76row_merge_block_t. */
77typedef byte mrec_buf_t[UNIV_PAGE_SIZE_MAX];
78
79/** @brief Merge record in row_merge_block_t.
80
81The format is the same as a record in ROW_FORMAT=COMPACT with the
82exception that the REC_N_NEW_EXTRA_BYTES are omitted. */
83typedef byte mrec_t;
84
85/** Merge record in row_merge_buf_t */
86struct mtuple_t {
87 dfield_t* fields; /*!< data fields */
88};
89
90/** Buffer for sorting in main memory. */
91struct row_merge_buf_t {
92 mem_heap_t* heap; /*!< memory heap where allocated */
93 dict_index_t* index; /*!< the index the tuples belong to */
94 ulint total_size; /*!< total amount of data bytes */
95 ulint n_tuples; /*!< number of data tuples */
96 ulint max_tuples; /*!< maximum number of data tuples */
97 mtuple_t* tuples; /*!< array of data tuples */
98 mtuple_t* tmp_tuples; /*!< temporary copy of tuples,
99 for sorting */
100};
101
102/** Information about temporary files used in merge sort */
103struct merge_file_t {
104 pfs_os_file_t fd; /*!< file descriptor */
105 ulint offset; /*!< file offset (end of file) */
106 ib_uint64_t n_rec; /*!< number of records in the file */
107};
108
109/** Index field definition */
110struct index_field_t {
111 ulint col_no; /*!< column offset */
112 ulint prefix_len; /*!< column prefix length, or 0
113 if indexing the whole column */
114 bool is_v_col; /*!< whether this is a virtual column */
115};
116
117/** Definition of an index being created */
118struct index_def_t {
119 const char* name; /*!< index name */
120 bool rebuild; /*!< whether the table is rebuilt */
121 ulint ind_type; /*!< 0, DICT_UNIQUE,
122 or DICT_CLUSTERED */
123 ulint key_number; /*!< MySQL key number,
124 or ULINT_UNDEFINED if none */
125 ulint n_fields; /*!< number of fields in index */
126 index_field_t* fields; /*!< field definitions */
127 st_mysql_ftparser*
128 parser; /*!< fulltext parser plugin */
129};
130
131/** Structure for reporting duplicate records. */
132struct row_merge_dup_t {
133 dict_index_t* index; /*!< index being sorted */
134 struct TABLE* table; /*!< MySQL table object */
135 const ulint* col_map;/*!< mapping of column numbers
136 in table to the rebuilt table
137 (index->table), or NULL if not
138 rebuilding table */
139 ulint n_dup; /*!< number of duplicates */
140};
141
142/*************************************************************//**
143Report a duplicate key. */
144void
145row_merge_dup_report(
146/*=================*/
147 row_merge_dup_t* dup, /*!< in/out: for reporting duplicates */
148 const dfield_t* entry) /*!< in: duplicate index entry */
149 MY_ATTRIBUTE((nonnull));
150
151/*********************************************************************//**
152Sets an exclusive lock on a table, for the duration of creating indexes.
153@return error code or DB_SUCCESS */
154dberr_t
155row_merge_lock_table(
156/*=================*/
157 trx_t* trx, /*!< in/out: transaction */
158 dict_table_t* table, /*!< in: table to lock */
159 enum lock_mode mode) /*!< in: LOCK_X or LOCK_S */
160 MY_ATTRIBUTE((nonnull(1,2), warn_unused_result));
161
162/*********************************************************************//**
163Drop indexes that were created before an error occurred.
164The data dictionary must have been locked exclusively by the caller,
165because the transaction will not be committed. */
166void
167row_merge_drop_indexes_dict(
168/*========================*/
169 trx_t* trx, /*!< in/out: dictionary transaction */
170 table_id_t table_id)/*!< in: table identifier */
171 MY_ATTRIBUTE((nonnull));
172
173/*********************************************************************//**
174Drop those indexes which were created before an error occurred.
175The data dictionary must have been locked exclusively by the caller,
176because the transaction will not be committed. */
177void
178row_merge_drop_indexes(
179/*===================*/
180 trx_t* trx, /*!< in/out: transaction */
181 dict_table_t* table, /*!< in/out: table containing the indexes */
182 ibool locked) /*!< in: TRUE=table locked,
183 FALSE=may need to do a lazy drop */
184 MY_ATTRIBUTE((nonnull));
185
186/*********************************************************************//**
187Drop all partially created indexes during crash recovery. */
188void
189row_merge_drop_temp_indexes(void);
190/*=============================*/
191
192/** Create temporary merge files in the given paramater path, and if
193UNIV_PFS_IO defined, register the file descriptor with Performance Schema.
194@param[in] path location for creating temporary merge files, or NULL
195@return File descriptor */
196pfs_os_file_t
197row_merge_file_create_low(
198 const char* path)
199 MY_ATTRIBUTE((warn_unused_result));
200/*********************************************************************//**
201Destroy a merge file. And de-register the file from Performance Schema
202if UNIV_PFS_IO is defined. */
203void
204row_merge_file_destroy_low(
205/*=======================*/
206 const pfs_os_file_t& fd); /*!< in: merge file descriptor */
207
208/*********************************************************************//**
209Provide a new pathname for a table that is being renamed if it belongs to
210a file-per-table tablespace. The caller is responsible for freeing the
211memory allocated for the return value.
212@return new pathname of tablespace file, or NULL if space = 0 */
213char*
214row_make_new_pathname(
215/*==================*/
216 dict_table_t* table, /*!< in: table to be renamed */
217 const char* new_name) /*!< in: new name */
218 MY_ATTRIBUTE((nonnull, warn_unused_result));
219
220/*********************************************************************//**
221Rename the tables in the data dictionary. The data dictionary must
222have been locked exclusively by the caller, because the transaction
223will not be committed.
224@return error code or DB_SUCCESS */
225dberr_t
226row_merge_rename_tables_dict(
227/*=========================*/
228 dict_table_t* old_table, /*!< in/out: old table, renamed to
229 tmp_name */
230 dict_table_t* new_table, /*!< in/out: new table, renamed to
231 old_table->name */
232 const char* tmp_name, /*!< in: new name for old_table */
233 trx_t* trx) /*!< in/out: dictionary transaction */
234 MY_ATTRIBUTE((nonnull, warn_unused_result));
235
236/*********************************************************************//**
237Rename an index in the dictionary that was created. The data
238dictionary must have been locked exclusively by the caller, because
239the transaction will not be committed.
240@return DB_SUCCESS if all OK */
241dberr_t
242row_merge_rename_index_to_add(
243/*==========================*/
244 trx_t* trx, /*!< in/out: transaction */
245 table_id_t table_id, /*!< in: table identifier */
246 index_id_t index_id) /*!< in: index identifier */
247 MY_ATTRIBUTE((nonnull(1), warn_unused_result));
248
249/*********************************************************************//**
250Rename an index in the dictionary that is to be dropped. The data
251dictionary must have been locked exclusively by the caller, because
252the transaction will not be committed.
253@return DB_SUCCESS if all OK */
254dberr_t
255row_merge_rename_index_to_drop(
256/*===========================*/
257 trx_t* trx, /*!< in/out: transaction */
258 table_id_t table_id, /*!< in: table identifier */
259 index_id_t index_id) /*!< in: index identifier */
260 MY_ATTRIBUTE((nonnull(1), warn_unused_result));
261
262/** Create the index and load in to the dictionary.
263@param[in,out] table the index is on this table
264@param[in] index_def the index definition
265@param[in] add_v new virtual columns added along with add
266 index call
267@return index, or NULL on error */
268dict_index_t*
269row_merge_create_index(
270 dict_table_t* table,
271 const index_def_t* index_def,
272 const dict_add_v_col_t* add_v)
273 MY_ATTRIBUTE((warn_unused_result));
274
275/*********************************************************************//**
276Check if a transaction can use an index.
277@return whether the index can be used by the transaction */
278bool
279row_merge_is_index_usable(
280/*======================*/
281 const trx_t* trx, /*!< in: transaction */
282 const dict_index_t* index) /*!< in: index to check */
283 MY_ATTRIBUTE((nonnull, warn_unused_result));
284
285/*********************************************************************//**
286Drop a table. The caller must have ensured that the background stats
287thread is not processing the table. This can be done by calling
288dict_stats_wait_bg_to_stop_using_table() after locking the dictionary and
289before calling this function.
290@return DB_SUCCESS or error code */
291dberr_t
292row_merge_drop_table(
293/*=================*/
294 trx_t* trx, /*!< in: transaction */
295 dict_table_t* table) /*!< in: table instance to drop */
296 MY_ATTRIBUTE((nonnull, warn_unused_result));
297
298/** Build indexes on a table by reading a clustered index, creating a temporary
299file containing index entries, merge sorting these index entries and inserting
300sorted index entries to indexes.
301@param[in] trx transaction
302@param[in] old_table table where rows are read from
303@param[in] new_table table where indexes are created; identical to
304old_table unless creating a PRIMARY KEY
305@param[in] online true if creating indexes online
306@param[in] indexes indexes to be created
307@param[in] key_numbers MySQL key numbers
308@param[in] n_indexes size of indexes[]
309@param[in,out] table MySQL table, for reporting erroneous key value
310if applicable
311@param[in] defaults default values of added, changed columns, or NULL
312@param[in] col_map mapping of old column numbers to new ones, or
313NULL if old_table == new_table
314@param[in] add_autoinc number of added AUTO_INCREMENT columns, or
315ULINT_UNDEFINED if none is added
316@param[in,out] sequence autoinc sequence
317@param[in] skip_pk_sort whether the new PRIMARY KEY will follow
318existing order
319@param[in,out] stage performance schema accounting object, used by
320ALTER TABLE. stage->begin_phase_read_pk() will be called at the beginning of
321this function and it will be passed to other functions for further accounting.
322@param[in] add_v new virtual columns added along with indexes
323@param[in] eval_table mysql table used to evaluate virtual column
324 value, see innobase_get_computed_value().
325@param[in] drop_historical whether to drop historical system rows
326@return DB_SUCCESS or error code */
327dberr_t
328row_merge_build_indexes(
329 trx_t* trx,
330 dict_table_t* old_table,
331 dict_table_t* new_table,
332 bool online,
333 dict_index_t** indexes,
334 const ulint* key_numbers,
335 ulint n_indexes,
336 struct TABLE* table,
337 const dtuple_t* defaults,
338 const ulint* col_map,
339 ulint add_autoinc,
340 ib_sequence_t& sequence,
341 bool skip_pk_sort,
342 ut_stage_alter_t* stage,
343 const dict_add_v_col_t* add_v,
344 struct TABLE* eval_table,
345 bool drop_historical)
346 MY_ATTRIBUTE((warn_unused_result));
347
348/********************************************************************//**
349Write a buffer to a block. */
350void
351row_merge_buf_write(
352/*================*/
353 const row_merge_buf_t* buf, /*!< in: sorted buffer */
354 const merge_file_t* of, /*!< in: output file */
355 row_merge_block_t* block) /*!< out: buffer for writing to file */
356 MY_ATTRIBUTE((nonnull));
357
358/********************************************************************//**
359Sort a buffer. */
360void
361row_merge_buf_sort(
362/*===============*/
363 row_merge_buf_t* buf, /*!< in/out: sort buffer */
364 row_merge_dup_t* dup) /*!< in/out: reporter of duplicates
365 (NULL if non-unique index) */
366 MY_ATTRIBUTE((nonnull(1)));
367
368/********************************************************************//**
369Write a merge block to the file system.
370@return whether the request was completed successfully */
371UNIV_INTERN
372bool
373row_merge_write(
374/*============*/
375 const pfs_os_file_t& fd, /*!< in: file descriptor */
376 ulint offset, /*!< in: offset where to write,
377 in number of row_merge_block_t elements */
378 const void* buf, /*!< in: data */
379 void* crypt_buf, /*!< in: crypt buf or NULL */
380 ulint space) /*!< in: space id */
381 MY_ATTRIBUTE((warn_unused_result));
382
383/********************************************************************//**
384Empty a sort buffer.
385@return sort buffer */
386row_merge_buf_t*
387row_merge_buf_empty(
388/*================*/
389 row_merge_buf_t* buf) /*!< in,own: sort buffer */
390 MY_ATTRIBUTE((warn_unused_result, nonnull));
391
392/** Create a merge file in the given location.
393@param[out] merge_file merge file structure
394@param[in] path location for creating temporary file, or NULL
395@return file descriptor, or -1 on failure */
396pfs_os_file_t
397row_merge_file_create(
398 merge_file_t* merge_file,
399 const char* path)
400 MY_ATTRIBUTE((warn_unused_result, nonnull(1)));
401
402/** Merge disk files.
403@param[in] trx transaction
404@param[in] dup descriptor of index being created
405@param[in,out] file file containing index entries
406@param[in,out] block 3 buffers
407@param[in,out] tmpfd temporary file handle
408@param[in] update_progress true, if we should update progress status
409@param[in] pct_progress total progress percent until now
410@param[in] pct_ocst current progress percent
411@param[in] crypt_block crypt buf or NULL
412@param[in] space space_id
413@param[in,out] stage performance schema accounting object, used by
414ALTER TABLE. If not NULL, stage->begin_phase_sort() will be called initially
415and then stage->inc() will be called for each record processed.
416@return DB_SUCCESS or error code */
417dberr_t
418row_merge_sort(
419/*===========*/
420 trx_t* trx,
421 const row_merge_dup_t* dup,
422 merge_file_t* file,
423 row_merge_block_t* block,
424 pfs_os_file_t* tmpfd,
425 const bool update_progress,
426 const double pct_progress,
427 const double pct_cost,
428 row_merge_block_t* crypt_block,
429 ulint space,
430 ut_stage_alter_t* stage = NULL)
431 MY_ATTRIBUTE((warn_unused_result));
432
433/*********************************************************************//**
434Allocate a sort buffer.
435@return own: sort buffer */
436row_merge_buf_t*
437row_merge_buf_create(
438/*=================*/
439 dict_index_t* index) /*!< in: secondary index */
440 MY_ATTRIBUTE((warn_unused_result, nonnull, malloc));
441
442/*********************************************************************//**
443Deallocate a sort buffer. */
444void
445row_merge_buf_free(
446/*===============*/
447 row_merge_buf_t* buf) /*!< in,own: sort buffer to be freed */
448 MY_ATTRIBUTE((nonnull));
449
450/*********************************************************************//**
451Destroy a merge file. */
452void
453row_merge_file_destroy(
454/*===================*/
455 merge_file_t* merge_file) /*!< in/out: merge file structure */
456 MY_ATTRIBUTE((nonnull));
457
458/** Read a merge block from the file system.
459@return whether the request was completed successfully */
460bool
461row_merge_read(
462/*===========*/
463 const pfs_os_file_t& fd, /*!< in: file descriptor */
464 ulint offset, /*!< in: offset where to read
465 in number of row_merge_block_t
466 elements */
467 row_merge_block_t* buf, /*!< out: data */
468 row_merge_block_t* crypt_buf, /*!< in: crypt buf or NULL */
469 ulint space) /*!< in: space id */
470 MY_ATTRIBUTE((warn_unused_result));
471
472/********************************************************************//**
473Read a merge record.
474@return pointer to next record, or NULL on I/O error or end of list */
475const byte*
476row_merge_read_rec(
477/*===============*/
478 row_merge_block_t* block, /*!< in/out: file buffer */
479 mrec_buf_t* buf, /*!< in/out: secondary buffer */
480 const byte* b, /*!< in: pointer to record */
481 const dict_index_t* index, /*!< in: index of the record */
482 const pfs_os_file_t& fd, /*!< in: file descriptor */
483 ulint* foffs, /*!< in/out: file offset */
484 const mrec_t** mrec, /*!< out: pointer to merge record,
485 or NULL on end of list
486 (non-NULL on I/O error) */
487 ulint* offsets,/*!< out: offsets of mrec */
488 row_merge_block_t* crypt_block, /*!< in: crypt buf or NULL */
489 ulint space) /*!< in: space id */
490 MY_ATTRIBUTE((warn_unused_result));
491#endif /* row0merge.h */
492