/* Copyright (C) 2006,2007 MySQL AB

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; version 2 of the License.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111-1301 USA */

/*
  WL#3071 Maria checkpoint
  First version written by Guilhem Bichot on 2006-04-27.
*/

/* Here is the implementation of this module */

/** @todo RECOVERY BUG this is unreviewed code */
/*
  Summary:
  Checkpoints are done either by a background thread (one checkpoint every
  Nth second) or by a client.
  In ha_maria, checkpointing is not made available to clients; it will soon
  be done by a background thread (periodically taking checkpoints and
  flushing dirty pages).
*/

#include "maria_def.h"
#include "ma_pagecache.h"
#include "ma_blockrec.h"
#include "ma_checkpoint.h"
#include "ma_loghandler_lsn.h"
#include "ma_servicethread.h"
#include "ma_crypt.h"

/** @brief type of checkpoint currently running */
static CHECKPOINT_LEVEL checkpoint_in_progress= CHECKPOINT_NONE;
/** @brief protects checkpoint_in_progress */
static mysql_mutex_t LOCK_checkpoint;
/** @brief for killing the background checkpoint thread */
static mysql_cond_t COND_checkpoint;
/** @brief control structure for checkpoint background thread */
static MA_SERVICE_THREAD_CONTROL checkpoint_control=
  {0, FALSE, FALSE, &LOCK_checkpoint, &COND_checkpoint};
/* same unit as pagecache->blocks_changed (which is a ulong) */
static uint pages_to_flush_before_next_checkpoint;
static PAGECACHE_FILE *dfiles, /**< data files to flush in background */
  *dfiles_end; /**< list of data files ends here */
static PAGECACHE_FILE *kfiles, /**< index files to flush in background */
  *kfiles_end; /**< list of index files ends here */
/* the two statistics below could serve in SHOW GLOBAL STATUS */
static uint checkpoints_total= 0, /**< all checkpoint requests made */
  checkpoints_ok_total= 0; /**< all checkpoints which succeeded */

struct st_filter_param
{
  LSN up_to_lsn; /**< only pages with rec_lsn <= this LSN */
  uint max_pages; /**< stop after flushing this number of pages */
}; /**< information to determine which dirty pages should be flushed */

static enum pagecache_flush_filter_result
filter_flush_file_medium(enum pagecache_page_type type,
                         pgcache_page_no_t page,
                         LSN rec_lsn, void *arg);
static enum pagecache_flush_filter_result
filter_flush_file_full(enum pagecache_page_type type,
                       pgcache_page_no_t page,
                       LSN rec_lsn, void *arg);
static enum pagecache_flush_filter_result
filter_flush_file_evenly(enum pagecache_page_type type,
                         pgcache_page_no_t pageno,
                         LSN rec_lsn, void *arg);
static int really_execute_checkpoint(void);
pthread_handler_t ma_checkpoint_background(void *arg);
static int collect_tables(LEX_STRING *str, LSN checkpoint_start_log_horizon);

/**
  @brief Does a checkpoint

  @param level    what level of checkpoint to do
  @param no_wait  if another checkpoint of the same or a stronger level
                  is already running, consider our job done

  @note In ha_maria, there can never be two threads trying a checkpoint at
  the same time.

  @return Operation status
    @retval 0      ok
    @retval !=0    error
*/

int ma_checkpoint_execute(CHECKPOINT_LEVEL level, my_bool no_wait)
{
  int result= 0;
  DBUG_ENTER("ma_checkpoint_execute");

  if (!checkpoint_control.inited)
  {
    /*
      If ha_maria failed to start, maria_panic_hton is called and we end up
      here.
    */
    DBUG_RETURN(0);
  }
  DBUG_ASSERT(level > CHECKPOINT_NONE);

  /* look for already running checkpoints */
  mysql_mutex_lock(&LOCK_checkpoint);
  while (checkpoint_in_progress != CHECKPOINT_NONE)
  {
    if (no_wait && (checkpoint_in_progress >= level))
    {
      /*
        If we are the checkpoint background thread, we don't wait (it's
        smarter to flush pages instead of waiting here while the other thread
        finishes its checkpoint).
      */
      mysql_mutex_unlock(&LOCK_checkpoint);
      goto end;
    }
    mysql_cond_wait(&COND_checkpoint, &LOCK_checkpoint);
  }

  checkpoint_in_progress= level;
  mysql_mutex_unlock(&LOCK_checkpoint);
  /* from now on, we are sure to be and stay the only checkpointer */

  result= really_execute_checkpoint();
  DBUG_EXECUTE_IF("maria_crash_after_checkpoint",
                  { DBUG_PRINT("maria_crash", ("now")); DBUG_SUICIDE(); });

  mysql_cond_broadcast(&COND_checkpoint);
end:
  DBUG_RETURN(result);
}
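
/**
  Usage sketch (illustrative only; within this module the background thread
  below is the real caller): how the two modes of ma_checkpoint_execute()
  are meant to be combined.

  @code
    // Periodic, non-blocking request: if a checkpoint of the same or a
    // stronger level is already running, consider the job done.
    if (ma_checkpoint_execute(CHECKPOINT_MEDIUM, TRUE))
      ma_message_no_user(0, "periodic checkpoint failed");

    // At clean shutdown: wait for any running checkpoint, then take a
    // full one so that recovery has (almost) nothing to do.
    if (ma_checkpoint_execute(CHECKPOINT_FULL, FALSE))
      ma_message_no_user(0, "final checkpoint failed");
  @endcode
*/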


/**
  @brief Does a checkpoint, really; expects no other checkpoints
  running.

  Checkpoint level requested is read from checkpoint_in_progress.

  @return Operation status
    @retval 0      ok
    @retval !=0    error
*/

static int really_execute_checkpoint(void)
{
  uint i, error= 0;
  /** @brief used to peek into record_pieces[3] (the dirty-pages piece) */
  char *ptr;
  LEX_STRING record_pieces[4]; /**< only malloc-ed pieces */
  LSN min_page_rec_lsn, min_trn_rec_lsn, min_first_undo_lsn;
  TRANSLOG_ADDRESS checkpoint_start_log_horizon;
  char checkpoint_start_log_horizon_char[LSN_STORE_SIZE];
  DBUG_ENTER("really_execute_checkpoint");
  DBUG_PRINT("enter", ("level: %d", checkpoint_in_progress));
  bzero(&record_pieces, sizeof(record_pieces));

  /*
    STEP 1: record current end-of-log position using log's lock. It is
    critical for the correctness of Checkpoint (related to memory visibility
    rules, the log's lock is a mutex).
    "Horizon" is a lower bound of the LSN of the next log record.
  */
  checkpoint_start_log_horizon= translog_get_horizon();
  DBUG_PRINT("info",("checkpoint_start_log_horizon " LSN_FMT,
                     LSN_IN_PARTS(checkpoint_start_log_horizon)));
  lsn_store(checkpoint_start_log_horizon_char, checkpoint_start_log_horizon);

  /*
    STEP 2: fetch information about transactions.
    We must fetch transactions before dirty pages. Indeed, a transaction
    first sets its rec_lsn, then sets the page's rec_lsn, then sets its
    rec_lsn to 0. If we fetched pages first, we might see no dirty page yet,
    then fetch transactions, but the transaction has already reset its
    rec_lsn to 0 so we miss the rec_lsn again.
    For a similar reason (over-allocated bitmap pages) we have to fetch
    transactions before flushing bitmap pages.

    min_trn_rec_lsn will serve to lower the starting point of the REDO phase
    (down from checkpoint_start_log_horizon).
  */
  if (unlikely(trnman_collect_transactions(&record_pieces[0],
                                           &record_pieces[1],
                                           &min_trn_rec_lsn,
                                           &min_first_undo_lsn)))
    goto err;


  /* STEP 3: fetch information about table files */
  if (unlikely(collect_tables(&record_pieces[2],
                              checkpoint_start_log_horizon)))
    goto err;


  /* STEP 4: fetch information about dirty pages */
  /*
    It's better to do it _after_ having flushed some data pages (which
    collect_tables() may have done), because those are now non-dirty and so we
    have a more up-to-date dirty pages list to put into the checkpoint record,
    and thus we will have less work at Recovery.
  */
  /* Using default pagecache for now */
  if (unlikely(pagecache_collect_changed_blocks_with_lsn(maria_pagecache,
                                                         &record_pieces[3],
                                                         &min_page_rec_lsn)))
    goto err;


  /* LAST STEP: now write the checkpoint log record */
  {
    LSN lsn;
    translog_size_t total_rec_length;
    /*
      The log handler is allowed to modify "str" and "length" (but not "*str")
      of its argument, so we must not pass it record_pieces directly,
      otherwise we would later not know what memory pieces to my_free().
    */
    LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 5];
    log_array[TRANSLOG_INTERNAL_PARTS + 0].str=
      (uchar*) checkpoint_start_log_horizon_char;
    log_array[TRANSLOG_INTERNAL_PARTS + 0].length= total_rec_length=
      sizeof(checkpoint_start_log_horizon_char);
    for (i= 0; i < (sizeof(record_pieces)/sizeof(record_pieces[0])); i++)
    {
      log_array[TRANSLOG_INTERNAL_PARTS + 1 + i].str=
        (uchar*) record_pieces[i].str;
      log_array[TRANSLOG_INTERNAL_PARTS + 1 + i].length= record_pieces[i].length;
      total_rec_length+= (translog_size_t) record_pieces[i].length;
    }
    if (unlikely(translog_write_record(&lsn, LOGREC_CHECKPOINT,
                                       &dummy_transaction_object, NULL,
                                       total_rec_length,
                                       sizeof(log_array)/sizeof(log_array[0]),
                                       log_array, NULL, NULL) ||
                 translog_flush(lsn)))
      goto err;
    translog_lock();
    /*
      This cannot be done as an inwrite_rec_hook of LOGREC_CHECKPOINT, because
      such a hook would be called before translog_flush (and we must be sure
      that the log was flushed before we write to the control file).
    */
    if (unlikely(ma_control_file_write_and_force(lsn, last_logno,
                                                 max_trid_in_control_file,
                                                 recovery_failures)))
    {
      translog_unlock();
      goto err;
    }
    translog_unlock();
  }

  /*
    Note that we should not alter memory structures until we have successfully
    written the checkpoint record and control file.
  */
  /* checkpoint succeeded */
  ptr= record_pieces[3].str;
  pages_to_flush_before_next_checkpoint= uint4korr(ptr);
  DBUG_PRINT("checkpoint",("%u pages to flush before next checkpoint",
                           pages_to_flush_before_next_checkpoint));

  /* compute log's low-water mark */
  {
    TRANSLOG_ADDRESS log_low_water_mark= min_page_rec_lsn;
    set_if_smaller(log_low_water_mark, min_trn_rec_lsn);
    set_if_smaller(log_low_water_mark, min_first_undo_lsn);
    set_if_smaller(log_low_water_mark, checkpoint_start_log_horizon);
    /**
      Now purge unneeded logs.
      As some systems have an unreliable fsync (drive lying), we could try to
      be robust against that: remember a few previous checkpoints in the
      control file, and not purge logs immediately... Think about it.
    */
    if (translog_purge(log_low_water_mark))
      ma_message_no_user(0, "log purging failed");
  }

  goto end;

err:
  error= 1;
  ma_message_no_user(0, "checkpoint failed");
  /* we were possibly not able to determine what pages to flush */
  pages_to_flush_before_next_checkpoint= 0;

end:
  for (i= 0; i < (sizeof(record_pieces)/sizeof(record_pieces[0])); i++)
    my_free(record_pieces[i].str);
  mysql_mutex_lock(&LOCK_checkpoint);
  checkpoint_in_progress= CHECKPOINT_NONE;
  checkpoints_total++;
  checkpoints_ok_total+= !error;
  mysql_mutex_unlock(&LOCK_checkpoint);
  DBUG_RETURN(error);
}
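
/*
  For reference, the LOGREC_CHECKPOINT payload assembled above consists of,
  in this order (piece sizes are whatever the collectors returned):
  - LSN_STORE_SIZE bytes: checkpoint_start_log_horizon
  - record_pieces[0] and [1]: transaction information built by
    trnman_collect_transactions()
  - record_pieces[2]: open-table information built by collect_tables()
  - record_pieces[3]: dirty-page information built by
    pagecache_collect_changed_blocks_with_lsn(); its first 4 bytes hold the
    number of dirty pages, which is reused above as
    pages_to_flush_before_next_checkpoint.
  Recovery is expected to parse the record in the same order.
*/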


/**
  @brief Initializes the checkpoint module

  @param interval  If one wants the module to create a
                   thread which will periodically do
                   checkpoints, and flush dirty pages, in the
                   background, it should specify a non-zero
                   interval in seconds. The thread will then be
                   created and will take checkpoints separated by
                   approximately 'interval' seconds.

  @note A checkpoint is taken only if there has been some significant
  activity since the previous checkpoint. Between checkpoint N and N+1 the
  thread flushes all dirty pages which were already dirty at the time of
  checkpoint N.

  @return Operation status
    @retval 0      ok
    @retval !=0    error
*/

int ma_checkpoint_init(ulong interval)
{
  int res= 0;
  DBUG_ENTER("ma_checkpoint_init");
  if (ma_service_thread_control_init(&checkpoint_control))
    res= 1;
  else if (interval > 0)
  {
    size_t intv= interval;
    compile_time_assert(sizeof(void *) >= sizeof(ulong));
    if ((res= mysql_thread_create(key_thread_checkpoint,
                                  &checkpoint_control.thread, NULL,
                                  ma_checkpoint_background,
                                  (void*) intv)))
      checkpoint_control.killed= TRUE;
  }
  else
    checkpoint_control.killed= TRUE;
  DBUG_RETURN(res);
}
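
/**
  Lifecycle sketch (illustrative; the real calls live in ha_maria.cc): the
  module is initialized once at engine startup and destroyed at shutdown.
  The '30' below stands for a server setting such as a checkpoint-interval
  variable; the exact variable name is an assumption here.

  @code
    // engine startup: start the background thread, one checkpoint
    // attempt roughly every 30 seconds
    if (ma_checkpoint_init(30))
      return 1;                          // engine initialization fails

    // ... normal operation ...

    // engine shutdown: stops the background thread, which takes a final
    // CHECKPOINT_FULL before exiting (see ma_checkpoint_background())
    ma_checkpoint_end();
  @endcode
*/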


#ifndef DBUG_OFF
/**
  Function used to test recovery: flush some table pieces and then the
  caller crashes.

  @param what_to_flush  0: current bitmap and all data pages
                        1: state
                        2: all bitmap pages
*/
static void flush_all_tables(int what_to_flush)
{
  int res= 0;
  LIST *pos; /**< to iterate over open tables */
  mysql_mutex_lock(&THR_LOCK_maria);
  for (pos= maria_open_list; pos; pos= pos->next)
  {
    MARIA_HA *info= (MARIA_HA*)pos->data;
    if (info->s->now_transactional)
    {
      switch (what_to_flush)
      {
      case 0:
        res= _ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX,
                                   FLUSH_KEEP, FLUSH_KEEP);
        break;
      case 1:
        res= _ma_state_info_write(info->s,
                                  MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET|
                                  MA_STATE_INFO_WRITE_LOCK);
        DBUG_PRINT("maria_flush_states",
                   ("is_of_horizon: LSN " LSN_FMT,
                    LSN_IN_PARTS(info->s->state.is_of_horizon)));
        break;
      case 2:
        res= _ma_bitmap_flush_all(info->s);
        break;
      }
    }
    DBUG_ASSERT(res == 0);
  }
  mysql_mutex_unlock(&THR_LOCK_maria);
}
#endif


/**
  @brief Destroys the checkpoint module
*/

void ma_checkpoint_end(void)
{
  DBUG_ENTER("ma_checkpoint_end");
  /*
    Some intentional crash methods, usually triggered by
    SET MARIA_CHECKPOINT_INTERVAL=X
  */
  DBUG_EXECUTE_IF("maria_flush_bitmap",
                  {
                    DBUG_PRINT("maria_flush_bitmap", ("now"));
                    flush_all_tables(2);
                  });
  DBUG_EXECUTE_IF("maria_flush_whole_page_cache",
                  {
                    DBUG_PRINT("maria_flush_whole_page_cache", ("now"));
                    flush_all_tables(0);
                  });
  DBUG_EXECUTE_IF("maria_flush_whole_log",
                  {
                    DBUG_PRINT("maria_flush_whole_log", ("now"));
                    translog_flush(translog_get_horizon());
                  });
  /*
    Note that for WAL reasons, maria_flush_states requires
    maria_flush_whole_log.
  */
  DBUG_EXECUTE_IF("maria_flush_states",
                  {
                    DBUG_PRINT("maria_flush_states", ("now"));
                    flush_all_tables(1);
                  });
  DBUG_EXECUTE_IF("maria_crash",
                  { DBUG_PRINT("maria_crash", ("now")); DBUG_SUICIDE(); });

  if (checkpoint_control.inited)
  {
    ma_service_thread_control_end(&checkpoint_control);
    my_free(dfiles);
    my_free(kfiles);
    dfiles= kfiles= NULL;
  }
  DBUG_VOID_RETURN;
}


/**
  @brief dirty-page filtering criteria for MEDIUM checkpoint.

  We flush data/index pages which have been dirty since the previous
  checkpoint (this is the two-checkpoint rule: the REDO phase will not have
  to start from earlier than the next-to-last checkpoint).
  Bitmap pages are handled by _ma_bitmap_flush_all().

  @param type     Page's type
  @param pageno   Page's number
  @param rec_lsn  Page's rec_lsn
  @param arg      filter_param
*/

static enum pagecache_flush_filter_result
filter_flush_file_medium(enum pagecache_page_type type,
                         pgcache_page_no_t pageno __attribute__ ((unused)),
                         LSN rec_lsn, void *arg)
{
  struct st_filter_param *param= (struct st_filter_param *)arg;
  return (type == PAGECACHE_LSN_PAGE) &&
    (cmp_translog_addr(rec_lsn, param->up_to_lsn) <= 0);
}


/**
  @brief dirty-page filtering criteria for FULL checkpoint.

  We flush all dirty data/index pages.
  Bitmap pages are handled by _ma_bitmap_flush_all().

  @param type     Page's type
  @param pageno   Page's number
  @param rec_lsn  Page's rec_lsn
  @param arg      filter_param
*/

static enum pagecache_flush_filter_result
filter_flush_file_full(enum pagecache_page_type type,
                       pgcache_page_no_t pageno __attribute__ ((unused)),
                       LSN rec_lsn __attribute__ ((unused)),
                       void *arg __attribute__ ((unused)))
{
  return (type == PAGECACHE_LSN_PAGE);
}


/**
  @brief dirty-page filtering criteria for background flushing thread.

  We flush data/index pages which have been dirty since the previous
  checkpoint (this is the two-checkpoint rule: the REDO phase will not have
  to start from earlier than the next-to-last checkpoint), and no
  bitmap pages. But we flush no more than a certain number of pages (to have
  an even flushing, no write burst).
  The reason to not flush bitmap pages is that they may not be in a flushable
  state at this moment and we don't want to wait for them.

  @param type     Page's type
  @param pageno   Page's number
  @param rec_lsn  Page's rec_lsn
  @param arg      filter_param
*/

static enum pagecache_flush_filter_result
filter_flush_file_evenly(enum pagecache_page_type type,
                         pgcache_page_no_t pageno __attribute__ ((unused)),
                         LSN rec_lsn, void *arg)
{
  struct st_filter_param *param= (struct st_filter_param *)arg;
  if (unlikely(param->max_pages == 0)) /* all flushed already */
    return FLUSH_FILTER_SKIP_ALL;
  if ((type == PAGECACHE_LSN_PAGE) &&
      (cmp_translog_addr(rec_lsn, param->up_to_lsn) <= 0))
  {
    param->max_pages--;
    return FLUSH_FILTER_OK;
  }
  return FLUSH_FILTER_SKIP_TRY_NEXT;
}
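
/**
  How these filters plug in (sketch; the file and page counts are
  illustrative): the caller fills a st_filter_param and hands the chosen
  callback to flush_pagecache_blocks_with_filter(), which calls it for each
  candidate dirty block of the file and obeys the returned FLUSH_FILTER_*
  value.

  @code
    struct st_filter_param param;
    PAGECACHE_FILE *file= dfiles;           // hypothetical: first data file
    param.up_to_lsn= last_checkpoint_lsn;   // two-checkpoint rule
    param.max_pages= 128;                   // only used by the "evenly" filter

    if (flush_pagecache_blocks_with_filter(maria_pagecache, file,
                                           FLUSH_KEEP_LAZY,
                                           filter_flush_file_evenly,
                                           &param) & PCFLUSH_ERROR)
      ma_message_no_user(0, "background page flush failed");
  @endcode
*/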


/**
  @brief Background thread which does checkpoints and flushes periodically.

  Takes a checkpoint. After this, all pages dirty at the time of that
  checkpoint are flushed evenly until it is time to take another checkpoint.
  This ensures that the REDO phase starts at earliest (in LSN time) at the
  next-to-last checkpoint record ("two-checkpoint rule").

  @note MikaelR questioned why the same thread does two different jobs; the
  risk is that while a checkpoint happens no LRD flushing happens.
*/

static ulong maria_checkpoint_min_cache_activity= 10*1024*1024;
/* Set in ha_maria.cc */
ulong maria_checkpoint_min_log_activity= 1*1024*1024;

pthread_handler_t ma_checkpoint_background(void *arg)
{
  /*
    At least this many log/page bytes must have been written since the last
    checkpoint for a new one to be taken (see the two thresholds above).
  */
  /*
    If the interval could be changed by the user while we are in this thread,
    it could be annoying: for example it could cause "case 2" to be executed
    right after "case 0", thus having 'dfile' unset. So the thread cares only
    about the interval's value when it started.
  */
  const size_t interval= (size_t)arg;
  size_t sleeps, sleep_time;
  TRANSLOG_ADDRESS log_horizon_at_last_checkpoint=
    translog_get_horizon();
  ulonglong pagecache_flushes_at_last_checkpoint=
    maria_pagecache->global_cache_write;
  uint UNINIT_VAR(pages_bunch_size);
  struct st_filter_param filter_param;
  PAGECACHE_FILE *UNINIT_VAR(dfile); /**< data file currently being flushed */
  PAGECACHE_FILE *UNINIT_VAR(kfile); /**< index file currently being flushed */

  my_thread_init();
  DBUG_PRINT("info",("Maria background checkpoint thread starts"));
  DBUG_ASSERT(interval > 0);

  PSI_CALL_set_thread_user_host(0,0,0,0);

  /*
    Recovery ended with all tables closed and a checkpoint: no need to take
    one immediately.
  */
  sleeps= 1;
  pages_to_flush_before_next_checkpoint= 0;

  for(;;) /* iterations of checkpoints and dirty page flushing */
  {
#if 0 /* good for testing, to do a lot of checkpoints, finds a lot of bugs */
    sleeps=0;
#endif
    switch (sleeps % interval)
    {
    case 0:
    {
      /* If checkpoints are disabled, wait 1 second and try again */
      if (maria_checkpoint_disabled)
      {
        sleep_time= 1;
        break;
      }
      {
        TRANSLOG_ADDRESS horizon= translog_get_horizon();

        /*
          With background flushing evenly distributed over the time
          between two checkpoints, we should have only a little flushing to do
          in the checkpoint.
        */
        /*
          No checkpoint if little work of interest to recovery was done
          since the last checkpoint. Such work includes log writing (lengthens
          recovery; a checkpoint would shorten it) and page flushing (a
          checkpoint would decrease the number of pages read during recovery).
          In case of one short statement per minute (very low load), we don't
          want to checkpoint every minute, hence the positive
          maria_checkpoint_min_*_activity thresholds.
        */
        if ((ulonglong) (horizon - log_horizon_at_last_checkpoint) <=
            maria_checkpoint_min_log_activity &&
            ((ulonglong) (maria_pagecache->global_cache_write -
                          pagecache_flushes_at_last_checkpoint) *
             maria_pagecache->block_size) <=
            maria_checkpoint_min_cache_activity)
        {
          /*
            Not enough has happened since the last checkpoint.
            Sleep for a while and try again later.
          */
          sleep_time= interval;
          break;
        }
        sleep_time= 1;
        ma_checkpoint_execute(CHECKPOINT_MEDIUM, TRUE);
        /*
          Snapshot this kind of "state" of the engine. Note that the value
          below is possibly greater than last_checkpoint_lsn.
        */
        log_horizon_at_last_checkpoint= translog_get_horizon();
        pagecache_flushes_at_last_checkpoint=
          maria_pagecache->global_cache_write;
        /*
          If the checkpoint above succeeded it has set d|kfiles and
          d|kfiles_end. If it has failed, it has set
          pages_to_flush_before_next_checkpoint to 0 so we will skip flushing
          and sleep until the next checkpoint.
        */
      }
      break;
    }
    case 1:
      /* set up parameters for background page flushing */
      filter_param.up_to_lsn= last_checkpoint_lsn;
      pages_bunch_size= pages_to_flush_before_next_checkpoint / (uint)interval;
      dfile= dfiles;
      kfile= kfiles;
      /* fall through */
    default:
      if (pages_bunch_size > 0)
      {
        DBUG_PRINT("checkpoint",
                   ("Maria background checkpoint thread: %u pages",
                    pages_bunch_size));
        /* flush a bunch of dirty pages */
        filter_param.max_pages= pages_bunch_size;
        while (dfile != dfiles_end)
        {
          /*
            We use FLUSH_KEEP_LAZY: if a file is already in flush, it's
            smarter to move to the next file than wait for this one to be
            completely flushed, which may take long.
            StaleFilePointersInFlush: notice how below we use "dfile" which
            is an OS file descriptor plus some function and MARIA_SHARE
            pointers; this data dates from a previous checkpoint; since then,
            the table may have been closed (so MARIA_SHARE* became stale), and
            the file descriptor reassigned to another table which does not
            have the same CRC-read-set callbacks: it is thus important that
            flush_pagecache_blocks_with_filter() does not use the pointers,
            only the OS file descriptor.
          */
          int res=
            flush_pagecache_blocks_with_filter(maria_pagecache,
                                               dfile, FLUSH_KEEP_LAZY,
                                               filter_flush_file_evenly,
                                               &filter_param);
          if (unlikely(res & PCFLUSH_ERROR))
            ma_message_no_user(0, "background data page flush failed");
          if (filter_param.max_pages == 0) /* bunch all flushed, sleep */
            break; /* and we will continue with the same file */
          dfile++; /* otherwise all this file is flushed, move to next file */
          /*
            MikaelR noted that he observed that Linux's file cache may never
            fsync to disk until this cache is full, at which point it decides
            to empty the cache, making the machine very slow. A solution was
            to fsync after writing 2 MB. So we might want to fsync() here if
            we wrote enough pages.
          */
        }
        while (kfile != kfiles_end)
        {
          int res=
            flush_pagecache_blocks_with_filter(maria_pagecache,
                                               kfile, FLUSH_KEEP_LAZY,
                                               filter_flush_file_evenly,
                                               &filter_param);
          if (unlikely(res & PCFLUSH_ERROR))
            ma_message_no_user(0, "background index page flush failed");
          if (filter_param.max_pages == 0) /* bunch all flushed, sleep */
            break; /* and we will continue with the same file */
          kfile++; /* otherwise all this file is flushed, move to next file */
        }
        sleep_time= 1;
      }
      else
      {
        /* Can directly sleep until the next checkpoint moment */
        sleep_time= interval - (sleeps % interval);
      }
    }
    if (my_service_thread_sleep(&checkpoint_control,
                                sleep_time * 1000000000ULL))
      break;
    sleeps+= sleep_time;
  }
  DBUG_PRINT("info",("Maria background checkpoint thread ends"));
  {
    CHECKPOINT_LEVEL level= CHECKPOINT_FULL;
    /*
      That's the final one, which guarantees that a clean shutdown always ends
      with a checkpoint.
    */
    DBUG_EXECUTE_IF("maria_checkpoint_indirect", level= CHECKPOINT_INDIRECT;);
    ma_checkpoint_execute(level, FALSE);
  }
  my_thread_end();
  return 0;
}


/**
  @brief Allocates buffer and stores in it some info about open tables,
  does some flushing on those.

  Does the allocation because the caller cannot know the size itself.
  Memory freeing is to be done by the caller (if the "str" member of the
  LEX_STRING is not NULL).
  The caller is taking a checkpoint.

  @param[out] str    pointer to where the allocated buffer,
                     and its size, will be put; buffer will be filled
                     with info about open tables
  @param checkpoint_start_log_horizon  Of the in-progress checkpoint
                                       record.

  @return Operation status
    @retval 0      OK
    @retval 1      Error
*/

static int collect_tables(LEX_STRING *str, LSN checkpoint_start_log_horizon)
{
  MARIA_SHARE **distinct_shares= NULL;
  char *ptr;
  uint error= 1, sync_error= 0, nb, nb_stored, i;
  my_bool unmark_tables= TRUE;
  size_t total_names_length;
  LIST *pos; /**< to iterate over open tables */
  struct st_state_copy {
    uint index;
    MARIA_STATE_INFO state;
  };
  struct st_state_copy *state_copies= NULL, /**< fixed-size cache of states */
    *state_copies_end, /**< cache ends here */
    *state_copy; /**< iterator in cache */
  TRANSLOG_ADDRESS UNINIT_VAR(state_copies_horizon); /**< horizon of states' _copies_ */
  struct st_filter_param filter_param;
  PAGECACHE_FLUSH_FILTER filter;
  DBUG_ENTER("collect_tables");

  /* let's make a list of distinct shares */
  mysql_mutex_lock(&THR_LOCK_maria);
  for (nb= 0, pos= maria_open_list; pos; pos= pos->next)
  {
    MARIA_HA *info= (MARIA_HA*)pos->data;
    MARIA_SHARE *share= info->s;
    /* the first three variables below can never change */
    if (share->base.born_transactional && !share->temporary &&
        share->mode != O_RDONLY &&
        !(share->in_checkpoint & MARIA_CHECKPOINT_SEEN_IN_LOOP))
    {
      /*
        Apart from us, only maria_close() reads/sets in_checkpoint but it
        cannot run now as we hold THR_LOCK_maria.
      */
      /*
        This table is relevant for checkpoint and not already seen. Mark it,
        so that it is not seen again in the loop.
      */
      nb++;
      DBUG_ASSERT(share->in_checkpoint == 0);
      /* This flag ensures that we count only _distinct_ shares. */
      share->in_checkpoint= MARIA_CHECKPOINT_SEEN_IN_LOOP;
    }
  }
  if (unlikely((distinct_shares=
                (MARIA_SHARE **)my_malloc(nb * sizeof(MARIA_SHARE *),
                                          MYF(MY_WME))) == NULL))
    goto err;
  for (total_names_length= 0, i= 0, pos= maria_open_list; pos; pos= pos->next)
  {
    MARIA_HA *info= (MARIA_HA*)pos->data;
    MARIA_SHARE *share= info->s;
    if (share->in_checkpoint & MARIA_CHECKPOINT_SEEN_IN_LOOP)
    {
      distinct_shares[i++]= share;
      /*
        With this we prevent the share from going away while we later flush
        and force it without holding THR_LOCK_maria. For example if the share
        could be my_free()d by maria_close() we would have a problem when we
        access it to flush the table. We "pin" the share pointer.
        And we also take down MARIA_CHECKPOINT_SEEN_IN_LOOP, so that it is
        not seen again in the loop.
      */
      share->in_checkpoint= MARIA_CHECKPOINT_LOOKS_AT_ME;
      total_names_length+= share->open_file_name.length;
    }
  }

  DBUG_ASSERT(i == nb);
  mysql_mutex_unlock(&THR_LOCK_maria);
  DBUG_PRINT("info",("found %u table shares", nb));

  str->length=
    4 +               /* number of tables */
    (2 +              /* short id */
     LSN_STORE_SIZE + /* first_log_write_at_lsn */
     1                /* end-of-name 0 */
     ) * nb + total_names_length;
  if (unlikely((str->str= my_malloc(str->length, MYF(MY_WME))) == NULL))
    goto err;

  ptr= str->str;
  ptr+= 4; /* real number of stored tables is not yet known */

  /* only possible checkpointer, so can do the read below without mutex */
  filter_param.up_to_lsn= last_checkpoint_lsn;
  switch(checkpoint_in_progress)
  {
  case CHECKPOINT_MEDIUM:
    filter= &filter_flush_file_medium;
    break;
  case CHECKPOINT_FULL:
    filter= &filter_flush_file_full;
    break;
  case CHECKPOINT_INDIRECT:
    filter= NULL;
    break;
  default:
    DBUG_ASSERT(0);
    goto err;
  }

  /*
    The principle of reading/writing the state below is explained in
    ma_recovery.c, look for "Recovery of the state".
  */
#define STATE_COPIES 1024
  state_copies= (struct st_state_copy *)
    my_malloc(STATE_COPIES * sizeof(struct st_state_copy), MYF(MY_WME));
  dfiles= (PAGECACHE_FILE *)my_realloc((uchar *)dfiles,
                                       /* avoid size of 0 for my_realloc */
                                       MY_MAX(1, nb) * sizeof(PAGECACHE_FILE),
                                       MYF(MY_WME | MY_ALLOW_ZERO_PTR));
  kfiles= (PAGECACHE_FILE *)my_realloc((uchar *)kfiles,
                                       /* avoid size of 0 for my_realloc */
                                       MY_MAX(1, nb) * sizeof(PAGECACHE_FILE),
                                       MYF(MY_WME | MY_ALLOW_ZERO_PTR));
  if (unlikely((state_copies == NULL) ||
               (dfiles == NULL) || (kfiles == NULL)))
    goto err;
  state_copy= state_copies_end= NULL;
  dfiles_end= dfiles;
  kfiles_end= kfiles;

  for (nb_stored= 0, i= 0; i < nb; i++)
  {
    MARIA_SHARE *share= distinct_shares[i];
    PAGECACHE_FILE kfile, dfile;
    my_bool ignore_share;
    if (!(share->in_checkpoint & MARIA_CHECKPOINT_LOOKS_AT_ME))
    {
      /*
        No need for a mutex to read the above; only we can write *this* bit of
        the in_checkpoint bitmap
      */
      continue;
    }
    /**
      @todo We should not look at tables which didn't change since last
      checkpoint.
    */
    DBUG_PRINT("info",("looking at table '%s'", share->open_file_name.str));
    if (state_copy == state_copies_end) /* we have no more cached states */
    {
      /*
        Collect and cache a bunch of states. We do this for many states at a
        time, to not lock/unlock the log's lock too often.
      */
      uint j, bound= MY_MIN(nb, i + STATE_COPIES);
      state_copy= state_copies;
      /* part of the state is protected by log's lock */
      translog_lock();
      state_copies_horizon= translog_get_horizon_no_lock();
      for (j= i; j < bound; j++)
      {
        MARIA_SHARE *share2= distinct_shares[j];
        if (!(share2->in_checkpoint & MARIA_CHECKPOINT_LOOKS_AT_ME))
          continue;
        state_copy->index= j;
        state_copy->state= share2->state; /* we copy the state */
        state_copy++;
        /*
          data_file_length is not updated under log's lock by the bitmap
          code, but writing a wrong data_file_length is ok: a next
          maria_close() will correct it; if we crash before, Recovery will
          set it to the true physical size.
        */
      }
      translog_unlock();
      if (state_copy == state_copies)
        break; /* Nothing to do */

      /**
        We are going to flush these states.
        Before that, all records describing how to undo such state changes
        must be in the log (WAL). Usually this means UNDOs. In the special
        case of data|key_file_length, recovery just needs to open the table
        to fix the length, so any LOGREC_FILE_ID/REDO/UNDO which lets
        recovery know it must open the table is enough; so as long as
        data|key_file_length is updated after writing any log record it's ok:
        if we copied the new value above, it means the record was written
        before state_copies_horizon and we flush such records below.
        Apart from data|key_file_length, which are easily recoverable from
        the real file's size, all other state members must be updated only
        when writing the UNDO; otherwise, if they are updated before, their
        new value may be flushed by a checkpoint and, if there is a crash
        before the UNDO is written, their REDO group will be missing or at
        least incomplete and skipped by recovery, so a bad state value will
        stay. For example, setting key_root before writing the UNDO: the
        table would have old index pages (they were pinned at the time of the
        crash) and a new, thus wrong, key_root.
        @todo RECOVERY BUG check that all code honours that.
      */
      if (translog_flush(state_copies_horizon))
        goto err;
      /* now we have cached states and they are WAL-safe */
      state_copies_end= state_copy-1;
      state_copy= state_copies;
    }

    /* locate our state among these cached ones */
    for ( ; state_copy->index != i; state_copy++)
      DBUG_ASSERT(state_copy <= state_copies_end);

    /* OS file descriptors are ints which we stored in 4 bytes */
    compile_time_assert(sizeof(int) <= 4);
    /*
      Protect against maria_close() (which does some memory freeing in
      MARIA_FILE_BITMAP) with close_lock. intern_lock is not
      sufficient as we, as well as maria_close(), are going to unlock
      intern_lock in the middle of manipulating the table. Serializing us and
      maria_close() should help avoid problems.
    */
    mysql_mutex_lock(&share->close_lock);
    mysql_mutex_lock(&share->intern_lock);
    /*
      Tables in a normal state have their two file descriptors open.
      In some rare cases like REPAIR, some descriptor may be closed or even
      -1. If that happened, _ma_state_info_write() may fail. This is
      prevented by enclosing all places which close/change kfile.file with
      intern_lock.
    */
    kfile= share->kfile;
    dfile= share->bitmap.file;
    /*
      Ignore tables which have no logged writes (all their future log records
      will be found naturally by Recovery). Ignore obsolete shares (_before_
      setting themselves to last_version=0 they already did all flush and
      sync; if we flushed their state now we might be flushing an obsolete
      state onto a newer one, assuming the table has been reopened with a
      different share but of course the same physical index file).
    */
    ignore_share= (share->id == 0) | (share->last_version == 0);
    DBUG_PRINT("info", ("ignore_share: %d", ignore_share));
    if (!ignore_share)
    {
      size_t open_file_name_len= share->open_file_name.length + 1;
      /* remember the descriptors for background flush */
      *(dfiles_end++)= dfile;
      *(kfiles_end++)= kfile;
      /* we will store this table in the record */
      nb_stored++;
      int2store(ptr, share->id);
      ptr+= 2;
      lsn_store(ptr, share->lsn_of_file_id);
      ptr+= LSN_STORE_SIZE;
      /*
        first_bitmap_with_space is not updated under log's lock, and is
        important. We would need the bitmap's lock to get it right. Recovery
        of this is not clear, so we just play safe: write it out as
        unknown: if we crash, _ma_bitmap_init() at next open (for example in
        Recovery) will convert it to 0 and thus the first insertion will
        search for free space from the file's first bitmap (0) -
        under-optimal but safe.
        If no crash, maria_close() will write the exact value.
      */
      state_copy->state.first_bitmap_with_space= ~(ulonglong)0;
      memcpy(ptr, share->open_file_name.str, open_file_name_len);
      ptr+= open_file_name_len;
      if (cmp_translog_addr(share->state.is_of_horizon,
                            checkpoint_start_log_horizon) >= 0)
      {
        /*
          State was flushed recently; it does not hold down the log's
          low-water mark and will not give avoidable work to Recovery. So we
          needn't flush it. Also, it is possible that while we copied the
          state above (under log's lock, without intern_lock) it was being
          modified in memory or flushed to disk (without log's lock, under
          intern_lock, like in maria_extra()), so our copy may be incorrect
          and we should not flush it.
          It may also be a share which got last_version==0 since we checked
          last_version; in this case, it flushed its state and the LSN test
          above will catch it.
        */
      }
      else
      {
        /*
          We could do the state flush only if share->changed, but it's
          tricky.
          Consider a maria_write() which has written REDO,UNDO, and before it
          calls _ma_writeinfo() (setting share->changed=1), checkpoint
          happens and sees share->changed=0, does not flush state. It is
          possible that Recovery does not start from before the REDO and thus
          the state is not recovered. A solution may be to set
          share->changed=1 under log mutex when writing log records.

          The current solution is to keep a copy of the last saved state and
          not write the state if it is the same as last time. It's ok if
          is_of_horizon would be different on disk if all other data is
          the same.
        */
        DBUG_ASSERT(share->last_version != 0);
        state_copy->state.is_of_horizon= share->state.is_of_horizon=
          share->checkpoint_state.is_of_horizon= state_copies_horizon;
        if (kfile.file >= 0 && memcmp(&share->checkpoint_state,
                                      &state_copy->state,
                                      sizeof(state_copy->state)))
        {
          sync_error|=
            _ma_state_info_write_sub(kfile.file, &state_copy->state,
                                     MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET);
          memcpy(&share->checkpoint_state,
                 &state_copy->state, sizeof(state_copy->state));
        }
        /*
          We don't set share->changed=0 because it may interfere with a
          concurrent _ma_writeinfo() doing share->changed=1 (cancel its
          effect). The sad consequence is that we will flush the same state at
          each checkpoint if the table was once written and then not anymore.
        */
      }
    }
#ifdef EXTRA_DEBUG_BITMAP
    else
    {
      DBUG_ASSERT(share->bitmap.changed == 0 &&
                  share->bitmap.changed_not_flushed == 0);
    }
#endif

    /*
      _ma_bitmap_flush_all() may wait, so don't keep intern_lock as
      otherwise this would deadlock with allocate_and_write_block_record()
      calling _ma_set_share_data_file_length()
    */
    mysql_mutex_unlock(&share->intern_lock);

    if (!ignore_share)
    {
      /*
        share->bitmap is valid because it's destroyed under close_lock which
        we hold.
      */
      if (_ma_bitmap_flush_all(share))
      {
        sync_error= 1;
        /** @todo all write failures should mark table corrupted */
        ma_message_no_user(0, "checkpoint bitmap page flush failed");
      }
      DBUG_ASSERT(share->pagecache == maria_pagecache);
    }
    /*
      Clean up any unused states.
      TODO: Only do this call if there has been # (10?) ended transactions
      since last call.
      We had to release intern_lock to respect lock order with LOCK_trn_list.
    */
    _ma_remove_not_visible_states_with_lock(share, FALSE);

    if (share->in_checkpoint & MARIA_CHECKPOINT_SHOULD_FREE_ME)
    {
      /*
        maria_close() left us to free the share. When it ran it set share->id
        to 0. As it ran before we locked close_lock, we should have seen this
        and so this assertion should be true:
      */
      DBUG_ASSERT(ignore_share);
      mysql_mutex_destroy(&share->intern_lock);
      mysql_mutex_unlock(&share->close_lock);
      mysql_mutex_destroy(&share->close_lock);
      ma_crypt_free(share);
      my_free(share);
    }
    else
    {
      /* share goes back to normal state */
      share->in_checkpoint= 0;
      mysql_mutex_unlock(&share->close_lock);
    }

    /*
      We do the big disk writes out of intern_lock to not block other
      users of this table (intern_lock is taken at the start and end of
      every statement). This means that file descriptors may be invalid
      (files may have been closed for example by HA_EXTRA_PREPARE_FOR_*
      under Windows, or REPAIR). This should not be a problem as we use
      MY_IGNORE_BADFD. Descriptors may even point to other files but then
      the old blocks (of before the close) must have been flushed for sure,
      so our flush will flush new blocks (of after the latest open) and that
      should do no harm.
    */
    /*
      If CHECKPOINT_MEDIUM, this big flush below may result in a
      serious write burst. Realize that all pages dirtied between the
      last checkpoint and the one we are doing now, will be flushed at
      next checkpoint, except those evicted by LRU eviction (depending on
      the size of the page cache compared to the size of the working data
      set, eviction may be rare or frequent).
      We avoid that burst by anticipating: those pages are flushed
      in bunches spanned regularly over the time interval between now and
      the next checkpoint, by a background thread. Thus the next checkpoint
      will have only little flushing to do (CHECKPOINT_MEDIUM should thus be
      only a little slower than CHECKPOINT_INDIRECT).
    */

    /*
      PageCacheFlushConcurrencyBugs
      Inside the page cache, calls to flush_pagecache_blocks_int() on the same
      file are serialized. Examples of concurrency bugs which happened when we
      didn't have this serialization:
      - maria_chk_size() (via CHECK TABLE) happens concurrently with
      Checkpoint: Checkpoint is flushing a page: it pins the page and is
      pre-empted, maria_chk_size() wants to flush this page too so gets an
      error because Checkpoint pinned this page. Such an error makes
      maria_chk_size() mark the table as corrupted.
      - maria_close() happens concurrently with Checkpoint:
      Checkpoint is flushing a page: it registers a request on the page, is
      pre-empted ; maria_close() flushes this page too with FLUSH_RELEASE:
      FLUSH_RELEASE will cause a free_block() which assumes the page is in the
      LRU, but it is not (as Checkpoint registered a request). Crash.
      - one thread is evicting a page of the file out of the LRU: it marks it
      PCBLOCK_IN_SWITCH and is pre-empted. Then two other threads do flushes
      of the same file concurrently (like above). Then one flusher sees the
      page is in switch, removes it from changed_blocks[] and puts it in its
      first_in_switch, so the other flusher will not see the page at all and
      return too early. If it's maria_close() which returns too early, then
      maria_close() may close the file descriptor, and the other flusher and
      the evicter will fail to write their page: corruption.
    */

    if (!ignore_share)
    {
      if (filter != NULL)
      {
        if ((flush_pagecache_blocks_with_filter(maria_pagecache,
                                                &dfile, FLUSH_KEEP_LAZY,
                                                filter, &filter_param) &
             PCFLUSH_ERROR))
          ma_message_no_user(0, "checkpoint data page flush failed");
        if ((flush_pagecache_blocks_with_filter(maria_pagecache,
                                                &kfile, FLUSH_KEEP_LAZY,
                                                filter, &filter_param) &
             PCFLUSH_ERROR))
          ma_message_no_user(0, "checkpoint index page flush failed");
      }
      /*
        fsync of the fd is the long operation here (e.g. max 150 fsyncs
        per second, so if you have touched 1000 files it's 7 seconds).
      */
      sync_error|=
        mysql_file_sync(dfile.file, MYF(MY_WME | MY_IGNORE_BADFD)) |
        mysql_file_sync(kfile.file, MYF(MY_WME | MY_IGNORE_BADFD));
      /*
        in case of error, we continue because writing other tables to disk is
        still useful.
      */
    }
  }

  if (sync_error)
    goto err;
  /* We may have over-estimated (due to share->id==0 or last_version==0) */
  DBUG_ASSERT(str->length >= (uint)(ptr - str->str));
  str->length= (uint)(ptr - str->str);
  /*
    As we support max 65k tables open at a time (2-byte short id), we
    assume uint is enough for the cumulated length of table names; and
    LEX_STRING::length is uint.
  */
  int4store(str->str, nb_stored);
  error= unmark_tables= 0;

err:
  if (unlikely(unmark_tables))
  {
    /* maria_close() uses THR_LOCK_maria from start to end */
    mysql_mutex_lock(&THR_LOCK_maria);
    for (i= 0; i < nb; i++)
    {
      MARIA_SHARE *share= distinct_shares[i];
      if (share->in_checkpoint & MARIA_CHECKPOINT_SHOULD_FREE_ME)
      {
        /* maria_close() left us to free the share */
        mysql_mutex_destroy(&share->intern_lock);
        ma_crypt_free(share);
        my_free(share);
      }
      else
      {
        /* share goes back to normal state */
        share->in_checkpoint= 0;
      }
    }
    mysql_mutex_unlock(&THR_LOCK_maria);
  }
  my_free(distinct_shares);
  my_free(state_copies);
  DBUG_RETURN(error);
}
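
/**
  Layout of the piece built by collect_tables(), sketched as a reader (this
  mirrors what the writer code above stores; lsn_korr() is assumed here as
  the read counterpart of lsn_store()):

  @code
    uchar *p= (uchar *) str->str;
    uint ntables= uint4korr(p);                      // number of stored tables
    p+= 4;
    while (ntables--)
    {
      uint16 sid=          uint2korr(p);             // table's short id
      LSN first_write_lsn= lsn_korr(p + 2);          // lsn_of_file_id
      const char *name=    (const char *)(p + 2 + LSN_STORE_SIZE);
      p+= 2 + LSN_STORE_SIZE + strlen(name) + 1;     // name is zero-terminated
      // ... Recovery decides whether to reopen or skip the table ...
    }
  @endcode
*/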