1/*****************************************************************************
2
3Copyright (c) 2014, 2015, Oracle and/or its affiliates. All Rights Reserved.
4Copyright (c) 2017, MariaDB Corporation.
5
6This program is free software; you can redistribute it and/or modify it under
7the terms of the GNU General Public License as published by the Free Software
8Foundation; version 2 of the License.
9
10This program is distributed in the hope that it will be useful, but WITHOUT
11ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13
14You should have received a copy of the GNU General Public License along with
15this program; if not, write to the Free Software Foundation, Inc.,
1651 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
17
18*****************************************************************************/
19
20/**************************************************//**
21@file ut/ut0stage.h
22Supplementary code to performance schema stage instrumentation.
23
24Created Nov 12, 2014 Vasil Dimov
25*******************************************************/
26
27#ifndef ut0stage_h
28#define ut0stage_h
29
30#include <algorithm>
31#include <math.h>
32
33#include "my_global.h" /* needed for headers from mysql/psi/ */
34
35#include "mysql/psi/mysql_stage.h" /* mysql_stage_inc_work_completed */
36#include "mysql/psi/psi.h" /* HAVE_PSI_STAGE_INTERFACE, PSI_stage_progress */
37
38#include "univ.i"
39
40#include "dict0mem.h" /* dict_index_t */
41#include "row0log.h" /* row_log_estimate_work() */
42#include "srv0srv.h" /* ut_stage_alter_t */
43
44#ifdef HAVE_PSI_STAGE_INTERFACE
45
46typedef void PSI_stage_progress;
47
48/** Class used to report ALTER TABLE progress via performance_schema.
49The only user of this class is the ALTER TABLE code and it calls the methods
50in the following order
51constructor
52begin_phase_read_pk()
53 multiple times:
54 n_pk_recs_inc() // once per record read
55 inc() // once per page read
56end_phase_read_pk()
57if any new indexes are being added, for each one:
58 begin_phase_sort()
59 multiple times:
60 inc() // once per record sorted
61 begin_phase_insert()
62 multiple times:
63 inc() // once per record inserted
64 being_phase_log_index()
65 multiple times:
66 inc() // once per log-block applied
67begin_phase_flush()
68 multiple times:
69 inc() // once per page flushed
70begin_phase_log_table()
71 multiple times:
72 inc() // once per log-block applied
73begin_phase_end()
74destructor
75
76This class knows the specifics of each phase and tries to increment the
77progress in an even manner across the entire ALTER TABLE lifetime. */
78class ut_stage_alter_t {
79public:
80 /** Constructor.
81 @param[in] pk primary key of the old table */
82 explicit
83 ut_stage_alter_t(
84 const dict_index_t* pk)
85 :
86 m_progress(NULL),
87 m_pk(pk),
88 m_n_pk_recs(0),
89 m_n_pk_pages(0),
90 m_n_recs_processed(0),
91 m_n_flush_pages(0),
92 m_cur_phase(NOT_STARTED)
93 {
94 }
95
96 /** Destructor. */
97 ~ut_stage_alter_t();
98
99 /** Flag an ALTER TABLE start (read primary key phase).
100 @param[in] n_sort_indexes number of indexes that will be sorted
101 during ALTER TABLE, used for estimating the total work to be done */
102 void
103 begin_phase_read_pk(
104 ulint n_sort_indexes);
105
106 /** Increment the number of records in PK (table) with 1.
107 This is used to get more accurate estimate about the number of
108 records per page which is needed because some phases work on
109 per-page basis while some work on per-record basis and we want
110 to get the progress as even as possible. */
111 void
112 n_pk_recs_inc();
113
114 /** Flag either one record or one page processed, depending on the
115 current phase.
116 @param[in] inc_val flag this many units processed at once */
117 void
118 inc(
119 ulint inc_val = 1);
120
121 /** Flag the end of reading of the primary key.
122 Here we know the exact number of pages and records and calculate
123 the number of records per page and refresh the estimate. */
124 void
125 end_phase_read_pk();
126
127 /** Flag the beginning of the sort phase.
128 @param[in] sort_multi_factor since merge sort processes
129 one page more than once we only update the estimate once per this
130 many pages processed. */
131 void
132 begin_phase_sort(
133 double sort_multi_factor);
134
135 /** Flag the beginning of the insert phase. */
136 void
137 begin_phase_insert();
138
139 /** Flag the beginning of the flush phase.
140 @param[in] n_flush_pages this many pages are going to be
141 flushed */
142 void
143 begin_phase_flush(
144 ulint n_flush_pages);
145
146 /** Flag the beginning of the log index phase. */
147 void
148 begin_phase_log_index();
149
150 /** Flag the beginning of the log table phase. */
151 void
152 begin_phase_log_table();
153
154 /** Flag the beginning of the end phase. */
155 void
156 begin_phase_end();
157
158private:
159
160 /** Update the estimate of total work to be done. */
161 void
162 reestimate();
163
164 /** Change the current phase.
165 @param[in] new_stage pointer to the new stage to change to */
166 void
167 change_phase(
168 const PSI_stage_info* new_stage);
169
170 /** Performance schema accounting object. */
171 /* TODO: MySQL 5.7 PSI */
172 PSI_stage_progress* m_progress;
173
174 /** Old table PK. Used for calculating the estimate. */
175 const dict_index_t* m_pk;
176
177 /** Number of records in the primary key (table), including delete
178 marked records. */
179 ulint m_n_pk_recs;
180
181 /** Number of leaf pages in the primary key. */
182 ulint m_n_pk_pages;
183
184 /** Estimated number of records per page in the primary key. */
185 double m_n_recs_per_page;
186
187 /** Number of indexes that are being added. */
188 ulint m_n_sort_indexes;
189
190 /** During the sort phase, increment the counter once per this
191 many pages processed. This is because sort processes one page more
192 than once. */
193 ulint m_sort_multi_factor;
194
195 /** Number of records processed during sort & insert phases. We
196 need to increment the counter only once page, or once per
197 recs-per-page records. */
198 ulint m_n_recs_processed;
199
200 /** Number of pages to flush. */
201 ulint m_n_flush_pages;
202
203 /** Current phase. */
204 enum {
205 NOT_STARTED = 0,
206 READ_PK = 1,
207 SORT = 2,
208 INSERT = 3,
209 FLUSH = 4,
210 /* JAN: TODO: MySQL 5.7 vrs. MariaDB sql/log.h
211 LOG_INDEX = 5,
212 LOG_TABLE = 6, */
213 LOG_INNODB_INDEX = 5,
214 LOG_INNODB_TABLE = 6,
215 END = 7,
216 } m_cur_phase;
217};
218
219/** Destructor. */
220inline
221ut_stage_alter_t::~ut_stage_alter_t()
222{
223 if (m_progress == NULL) {
224 return;
225 }
226
227 /* TODO: MySQL 5.7 PSI: Set completed = estimated before we quit.
228 mysql_stage_set_work_completed(
229 m_progress,
230 mysql_stage_get_work_estimated(m_progress));
231
232 mysql_end_stage();
233 */
234}
235
236/** Flag an ALTER TABLE start (read primary key phase).
237@param[in] n_sort_indexes number of indexes that will be sorted
238during ALTER TABLE, used for estimating the total work to be done */
239inline
240void
241ut_stage_alter_t::begin_phase_read_pk(
242 ulint n_sort_indexes)
243{
244 m_n_sort_indexes = n_sort_indexes;
245
246 m_cur_phase = READ_PK;
247
248 /* TODO: MySQL 5.7 PSI
249 m_progress = mysql_set_stage(
250 srv_stage_alter_table_read_pk_internal_sort.m_key);
251
252 mysql_stage_set_work_completed(m_progress, 0);
253 */
254 reestimate();
255}
256
257/** Increment the number of records in PK (table) with 1.
258This is used to get more accurate estimate about the number of
259records per page which is needed because some phases work on
260per-page basis while some work on per-record basis and we want
261to get the progress as even as possible. */
262inline
263void
264ut_stage_alter_t::n_pk_recs_inc()
265{
266 m_n_pk_recs++;
267}
268
269/** Flag either one record or one page processed, depending on the
270current phase. */
271inline
272void
273ut_stage_alter_t::inc(ulint)
274{
275 if (m_progress == NULL) {
276 return;
277 }
278
279 ulint multi_factor = 1;
280 bool should_proceed = true;
281
282 switch (m_cur_phase) {
283 case NOT_STARTED:
284 ut_error;
285 case READ_PK:
286 m_n_pk_pages++;
287#if 0 /* TODO: MySQL 5.7 PSI */
288 ut_ad(inc_val == 1);
289 /* Overall the read pk phase will read all the pages from the
290 PK and will do work, proportional to the number of added
291 indexes, thus when this is called once per read page we
292 increment with 1 + m_n_sort_indexes */
293 inc_val = 1 + m_n_sort_indexes;
294#endif
295 break;
296 case SORT:
297 multi_factor = m_sort_multi_factor;
298 /* fall through */
299 case INSERT: {
300 /* Increment the progress every nth record. During
301 sort and insert phases, this method is called once per
302 record processed. We need fractional point numbers here
303 because "records per page" is such a number naturally and
304 to avoid rounding skew we want, for example: if there are
305 (double) N records per page, then the work_completed
306 should be incremented on the inc() calls round(k*N),
307 for k=1,2,3... */
308 const double every_nth = m_n_recs_per_page * multi_factor;
309
310 const ulint k = static_cast<ulint>(
311 round(m_n_recs_processed / every_nth));
312
313 const ulint nth = static_cast<ulint>(
314 round(k * every_nth));
315
316 should_proceed = m_n_recs_processed == nth;
317
318 m_n_recs_processed++;
319
320 break;
321 }
322 case FLUSH:
323 break;
324 /* JAN: TODO: MySQL 5.7
325 case LOG_INDEX:
326 break;
327 case LOG_TABLE:
328 break; */
329 case LOG_INNODB_INDEX:
330 case LOG_INNODB_TABLE:
331 break;
332 case END:
333 break;
334 }
335
336 if (should_proceed) {
337 /* TODO: MySQL 5.7 PSI
338 mysql_stage_inc_work_completed(m_progress, inc_val);
339 */
340 reestimate();
341 }
342}
343
344/** Flag the end of reading of the primary key.
345Here we know the exact number of pages and records and calculate
346the number of records per page and refresh the estimate. */
347inline
348void
349ut_stage_alter_t::end_phase_read_pk()
350{
351 reestimate();
352
353 if (m_n_pk_pages == 0) {
354 /* The number of pages in the PK could be 0 if the tree is
355 empty. In this case we set m_n_recs_per_page to 1 to avoid
356 division by zero later. */
357 m_n_recs_per_page = 1.0;
358 } else {
359 m_n_recs_per_page = std::max(
360 static_cast<double>(m_n_pk_recs) / m_n_pk_pages,
361 1.0);
362 }
363}
364
365/** Flag the beginning of the sort phase.
366@param[in] sort_multi_factor since merge sort processes
367one page more than once we only update the estimate once per this
368many pages processed. */
369inline
370void
371ut_stage_alter_t::begin_phase_sort(
372 double sort_multi_factor)
373{
374 if (sort_multi_factor <= 1.0) {
375 m_sort_multi_factor = 1;
376 } else {
377 m_sort_multi_factor = static_cast<ulint>(
378 round(sort_multi_factor));
379 }
380
381 change_phase(&srv_stage_alter_table_merge_sort);
382}
383
384/** Flag the beginning of the insert phase. */
385inline
386void
387ut_stage_alter_t::begin_phase_insert()
388{
389 change_phase(&srv_stage_alter_table_insert);
390}
391
392/** Flag the beginning of the flush phase.
393@param[in] n_flush_pages this many pages are going to be
394flushed */
395inline
396void
397ut_stage_alter_t::begin_phase_flush(
398 ulint n_flush_pages)
399{
400 m_n_flush_pages = n_flush_pages;
401
402 reestimate();
403
404 change_phase(&srv_stage_alter_table_flush);
405}
406
407/** Flag the beginning of the log index phase. */
408inline
409void
410ut_stage_alter_t::begin_phase_log_index()
411{
412 change_phase(&srv_stage_alter_table_log_index);
413}
414
415/** Flag the beginning of the log table phase. */
416inline
417void
418ut_stage_alter_t::begin_phase_log_table()
419{
420 change_phase(&srv_stage_alter_table_log_table);
421}
422
423/** Flag the beginning of the end phase. */
424inline
425void
426ut_stage_alter_t::begin_phase_end()
427{
428 change_phase(&srv_stage_alter_table_end);
429}
430
431/** Update the estimate of total work to be done. */
432inline
433void
434ut_stage_alter_t::reestimate()
435{
436 if (m_progress == NULL) {
437 return;
438 }
439
440 /* During the log table phase we calculate the estimate as
441 work done so far + log size remaining. */
442 if (m_cur_phase == LOG_INNODB_TABLE) {
443 /* TODO: MySQL 5.7 PSI
444 mysql_stage_set_work_estimated(
445 m_progress,
446 mysql_stage_get_work_completed(m_progress)
447 + row_log_estimate_work(m_pk));
448 */
449 return;
450 }
451
452 /* During the other phases we use a formula, regardless of
453 how much work has been done so far. */
454
455 /* For number of pages in the PK - if the PK has not been
456 read yet, use stat_n_leaf_pages (approximate), otherwise
457 use the exact number we gathered. */
458 const ulint n_pk_pages
459 = m_cur_phase != READ_PK
460 ? m_n_pk_pages
461 : m_pk->stat_n_leaf_pages;
462
463 /* If flush phase has not started yet and we do not know how
464 many pages are to be flushed, then use a wild guess - the
465 number of pages in the PK / 2. */
466 if (m_n_flush_pages == 0) {
467 m_n_flush_pages = n_pk_pages / 2;
468 }
469
470 ulonglong estimate __attribute__((unused))
471 = n_pk_pages
472 * (1 /* read PK */
473 + m_n_sort_indexes /* row_merge_buf_sort() inside the
474 read PK per created index */
475 + m_n_sort_indexes * 2 /* sort & insert per created index */)
476 + m_n_flush_pages
477 + row_log_estimate_work(m_pk);
478
479 /* Prevent estimate < completed */
480 /* TODO: MySQL 5.7 PSI
481 estimate = std::max(estimate,
482 mysql_stage_get_work_completed(m_progress));
483
484 mysql_stage_set_work_estimated(m_progress, estimate);
485 */
486}
487
488/** Change the current phase.
489@param[in] new_stage pointer to the new stage to change to */
490inline
491void
492ut_stage_alter_t::change_phase(
493 const PSI_stage_info* new_stage)
494{
495 if (m_progress == NULL) {
496 return;
497 }
498
499 if (new_stage == &srv_stage_alter_table_read_pk_internal_sort) {
500 m_cur_phase = READ_PK;
501 } else if (new_stage == &srv_stage_alter_table_merge_sort) {
502 m_cur_phase = SORT;
503 } else if (new_stage == &srv_stage_alter_table_insert) {
504 m_cur_phase = INSERT;
505 } else if (new_stage == &srv_stage_alter_table_flush) {
506 m_cur_phase = FLUSH;
507 /* JAN: TODO: MySQL 5.7 used LOG_INDEX and LOG_TABLE */
508 } else if (new_stage == &srv_stage_alter_table_log_index) {
509 m_cur_phase = LOG_INNODB_INDEX;
510 } else if (new_stage == &srv_stage_alter_table_log_table) {
511 m_cur_phase = LOG_INNODB_TABLE;
512 } else if (new_stage == &srv_stage_alter_table_end) {
513 m_cur_phase = END;
514 } else {
515 ut_error;
516 }
517
518 /* TODO: MySQL 5.7 PSI
519 const ulonglong c = mysql_stage_get_work_completed(m_progress);
520 const ulonglong e = mysql_stage_get_work_estimated(m_progress);
521
522 m_progress = mysql_set_stage(new_stage->m_key);
523
524 mysql_stage_set_work_completed(m_progress, c);
525 mysql_stage_set_work_estimated(m_progress, e);
526 */
527}
528#else /* HAVE_PSI_STAGE_INTERFACE */
529
530class ut_stage_alter_t {
531public:
532 explicit ut_stage_alter_t(const dict_index_t*) {}
533
534 void begin_phase_read_pk(ulint) {}
535
536 void n_pk_recs_inc() {}
537
538 void inc() {}
539 void inc(ulint) {}
540
541 void end_phase_read_pk() {}
542
543 void begin_phase_sort(double) {}
544
545 void begin_phase_insert() {}
546
547 void begin_phase_flush(ulint) {}
548
549 void begin_phase_log_index() {}
550
551 void begin_phase_log_table() {}
552
553 void begin_phase_end() {}
554};
555
556#endif /* HAVE_PSI_STAGE_INTERFACE */
557
558#endif /* ut0stage_h */
559