/* Copyright (C) 2006,2007 MySQL AB

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; version 2 of the License.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111-1301 USA */

/*
  WL#3071 Maria checkpoint
  First version written by Guilhem Bichot on 2006-04-27.
*/

/* Here is the implementation of this module */

/** @todo RECOVERY BUG this is unreviewed code */
/*
  Summary:
  Checkpoints are done either by a background thread (a checkpoint every Nth
  second) or by a client.
  In ha_maria, checkpointing is not made available to clients; it is done by
  a background thread which periodically takes checkpoints and flushes dirty
  pages.
*/
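
/**
  Typical life cycle of this module, as a minimal sketch (the 30-second
  interval and the calling context are illustrative assumptions, not fixed
  by this file):

  @code
    /* at engine startup: spawn a background checkpointer */
    if (ma_checkpoint_init(30))
      return 1;                      /* could not start the module */

    /* optionally, run a checkpoint from a client thread, waiting for any
       already-running one to finish first */
    ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE);

    /* at engine shutdown: the background thread takes a final checkpoint,
       then the module is destroyed */
    ma_checkpoint_end();
  @endcode
*/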

#include "maria_def.h"
#include "ma_pagecache.h"
#include "ma_blockrec.h"
#include "ma_checkpoint.h"
#include "ma_loghandler_lsn.h"
#include "ma_servicethread.h"
#include "ma_crypt.h"

/** @brief type of checkpoint currently running */
static CHECKPOINT_LEVEL checkpoint_in_progress= CHECKPOINT_NONE;
/** @brief protects checkpoint_in_progress */
static mysql_mutex_t LOCK_checkpoint;
/** @brief for killing the background checkpoint thread */
static mysql_cond_t COND_checkpoint;
/** @brief control structure for checkpoint background thread */
static MA_SERVICE_THREAD_CONTROL checkpoint_control=
  {0, FALSE, FALSE, &LOCK_checkpoint, &COND_checkpoint};
/* same quantity as pagecache->blocks_changed, which is a ulong */
static uint pages_to_flush_before_next_checkpoint;
static PAGECACHE_FILE *dfiles, /**< data files to flush in background */
  *dfiles_end; /**< list of data files ends here */
static PAGECACHE_FILE *kfiles, /**< index files to flush in background */
  *kfiles_end; /**< list of index files ends here */
/* the two statistics below could serve in SHOW GLOBAL STATUS */
static uint checkpoints_total= 0, /**< all checkpoint requests made */
  checkpoints_ok_total= 0; /**< all checkpoints which succeeded */
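
/**
  If the two counters above were ever exposed in SHOW GLOBAL STATUS, a
  minimal sketch could look like the following (SHOW_VAR/SHOW_LONG come from
  the server's status-variable interface; the variable names, the exact type
  enum, and the idea of registering this array from ha_maria.cc are
  illustrative assumptions):

  @code
    static SHOW_VAR maria_checkpoint_status_vars[]=
    {
      {"Maria_checkpoints_requested", (char*) &checkpoints_total, SHOW_LONG},
      {"Maria_checkpoints_ok", (char*) &checkpoints_ok_total, SHOW_LONG},
      {NullS, NullS, SHOW_LONG}
    };
  @endcode
*/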

struct st_filter_param
{
  LSN up_to_lsn;  /**< only pages with rec_lsn < this LSN */
  uint max_pages; /**< stop after flushing this number of pages */
}; /**< information to determine which dirty pages should be flushed */

static enum pagecache_flush_filter_result
filter_flush_file_medium(enum pagecache_page_type type,
                         pgcache_page_no_t pageno,
                         LSN rec_lsn, void *arg);
static enum pagecache_flush_filter_result
filter_flush_file_full(enum pagecache_page_type type,
                       pgcache_page_no_t pageno,
                       LSN rec_lsn, void *arg);
static enum pagecache_flush_filter_result
filter_flush_file_evenly(enum pagecache_page_type type,
                         pgcache_page_no_t pageno,
                         LSN rec_lsn, void *arg);
static int really_execute_checkpoint(void);
pthread_handler_t ma_checkpoint_background(void *arg);
static int collect_tables(LEX_STRING *str, LSN checkpoint_start_log_horizon);

/**
  @brief Does a checkpoint

  @param level          what level of checkpoint to do
  @param no_wait        if another checkpoint of same or stronger level
                        is already running, consider our job done

  @note In ha_maria, there can never be two threads trying a checkpoint at
  the same time.

  @return Operation status
    @retval 0   ok
    @retval !=0 error
*/

int ma_checkpoint_execute(CHECKPOINT_LEVEL level, my_bool no_wait)
{
  int result= 0;
  DBUG_ENTER("ma_checkpoint_execute");

  if (!checkpoint_control.inited)
  {
    /*
      If ha_maria failed to start, maria_panic_hton is called and we come
      here.
    */
    DBUG_RETURN(0);
  }
  DBUG_ASSERT(level > CHECKPOINT_NONE);

  /* look for already running checkpoints */
  mysql_mutex_lock(&LOCK_checkpoint);
  while (checkpoint_in_progress != CHECKPOINT_NONE)
  {
    if (no_wait && (checkpoint_in_progress >= level))
    {
      /*
        If we are the checkpoint background thread, we don't wait (it's
        smarter to flush pages instead of waiting here while the other thread
        finishes its checkpoint).
      */
      mysql_mutex_unlock(&LOCK_checkpoint);
      goto end;
    }
    mysql_cond_wait(&COND_checkpoint, &LOCK_checkpoint);
  }

  checkpoint_in_progress= level;
  mysql_mutex_unlock(&LOCK_checkpoint);
  /* from then on, we are sure to be and stay the only checkpointer */

  result= really_execute_checkpoint();
  DBUG_EXECUTE_IF("maria_crash_after_checkpoint",
                  { DBUG_PRINT("maria_crash", ("now")); DBUG_SUICIDE(); });

  mysql_cond_broadcast(&COND_checkpoint);
end:
  DBUG_RETURN(result);
}


/**
  @brief Does a checkpoint, really; expects no other checkpoints
  running.

  Checkpoint level requested is read from checkpoint_in_progress.

  @return Operation status
    @retval 0   ok
    @retval !=0 error
*/

static int really_execute_checkpoint(void)
{
  uint i, error= 0;
  char *ptr;
  LEX_STRING record_pieces[4]; /**< only malloc-ed pieces */
  LSN min_page_rec_lsn, min_trn_rec_lsn, min_first_undo_lsn;
  TRANSLOG_ADDRESS checkpoint_start_log_horizon;
  /** @brief checkpoint_start_log_horizon will be stored there */
  char checkpoint_start_log_horizon_char[LSN_STORE_SIZE];
  DBUG_ENTER("really_execute_checkpoint");
  DBUG_PRINT("enter", ("level: %d", checkpoint_in_progress));
  bzero(&record_pieces, sizeof(record_pieces));

  /*
    STEP 1: record current end-of-log position using log's lock. It is
    critical for the correctness of Checkpoint (related to memory visibility
    rules, the log's lock is a mutex).
    "Horizon" is a lower bound of the LSN of the next log record.
  */
  checkpoint_start_log_horizon= translog_get_horizon();
  DBUG_PRINT("info",("checkpoint_start_log_horizon " LSN_FMT,
                     LSN_IN_PARTS(checkpoint_start_log_horizon)));
  lsn_store(checkpoint_start_log_horizon_char, checkpoint_start_log_horizon);

  /*
    STEP 2: fetch information about transactions.
    We must fetch transactions before dirty pages. Indeed, a transaction
    first sets its rec_lsn, then sets the page's rec_lsn, then resets its own
    rec_lsn to 0. If we fetched pages first, we might see no dirty page yet;
    when we then fetch transactions, the transaction may already have reset
    its rec_lsn to 0, so we would miss the rec_lsn on both sides.
    For a similar reason (over-allocated bitmap pages) we have to fetch
    transactions before flushing bitmap pages.

    min_trn_rec_lsn will serve to lower the starting point of the REDO phase
    (down from checkpoint_start_log_horizon).
  */
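  /*
    A concrete interleaving of the race described above (timestamps purely
    illustrative): at t1 the checkpointer scans pages and sees none dirty
    yet; at t2 a transaction sets its rec_lsn, dirties a page (setting the
    page's rec_lsn) and resets its own rec_lsn to 0; at t3 the checkpointer
    scans transactions and sees rec_lsn == 0, so both scans missed it.
    Scanning transactions first and pages second guarantees that at least
    one of the two scans sees the rec_lsn.
  */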
  if (unlikely(trnman_collect_transactions(&record_pieces[0],
                                           &record_pieces[1],
                                           &min_trn_rec_lsn,
                                           &min_first_undo_lsn)))
    goto err;


  /* STEP 3: fetch information about table files */
  if (unlikely(collect_tables(&record_pieces[2],
                              checkpoint_start_log_horizon)))
    goto err;


  /* STEP 4: fetch information about dirty pages */
  /*
    It's better to do it _after_ having flushed some data pages (which
    collect_tables() may have done), because those are now non-dirty and so we
    have a more up-to-date dirty pages list to put into the checkpoint record,
    and thus we will have less work at Recovery.
  */
  /* Using default pagecache for now */
  if (unlikely(pagecache_collect_changed_blocks_with_lsn(maria_pagecache,
                                                         &record_pieces[3],
                                                         &min_page_rec_lsn)))
    goto err;


  /* LAST STEP: now write the checkpoint log record */
  {
    LSN lsn;
    translog_size_t total_rec_length;
    /*
      the log handler is allowed to modify "str" and "length" (but not "*str")
      of its argument, so we must not pass it record_pieces directly,
      otherwise we would later not know what memory pieces to my_free().
    */
    LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 5];
    log_array[TRANSLOG_INTERNAL_PARTS + 0].str=
      (uchar*) checkpoint_start_log_horizon_char;
    log_array[TRANSLOG_INTERNAL_PARTS + 0].length= total_rec_length=
      sizeof(checkpoint_start_log_horizon_char);
    for (i= 0; i < (sizeof(record_pieces)/sizeof(record_pieces[0])); i++)
    {
      log_array[TRANSLOG_INTERNAL_PARTS + 1 + i].str=
        (uchar*) record_pieces[i].str;
      log_array[TRANSLOG_INTERNAL_PARTS + 1 + i].length=
        record_pieces[i].length;
      total_rec_length+= (translog_size_t) record_pieces[i].length;
    }
    if (unlikely(translog_write_record(&lsn, LOGREC_CHECKPOINT,
                                       &dummy_transaction_object, NULL,
                                       total_rec_length,
                                       sizeof(log_array)/sizeof(log_array[0]),
                                       log_array, NULL, NULL) ||
                 translog_flush(lsn)))
      goto err;
    translog_lock();
    /*
      This cannot be done as an inwrite_rec_hook of LOGREC_CHECKPOINT, because
      such a hook would be called before translog_flush (and we must be sure
      that the log was flushed before we write to the control file).
    */
    if (unlikely(ma_control_file_write_and_force(lsn, last_logno,
                                                 max_trid_in_control_file,
                                                 recovery_failures)))
    {
      translog_unlock();
      goto err;
    }
    translog_unlock();
  }
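
  /*
    For orientation, the record written above is the concatenation of five
    pieces (sizes as implied by the code above, not a separate on-disk
    specification):

      [0] checkpoint_start_log_horizon      LSN_STORE_SIZE bytes
      [1],[2] record_pieces[0..1]           transaction info collected by
                                            trnman_collect_transactions()
      [3] record_pieces[2]                  open-table info from
                                            collect_tables()
      [4] record_pieces[3]                  dirty-page info from
                                            pagecache_collect_changed_blocks_with_lsn();
                                            its first 4 bytes (a page count)
                                            are re-read just below
  */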

  /*
    Note that we should not alter memory structures until we have
    successfully written the checkpoint record and control file.
  */
  /* checkpoint succeeded */
  ptr= record_pieces[3].str;
  pages_to_flush_before_next_checkpoint= uint4korr(ptr);
  DBUG_PRINT("checkpoint", ("%u pages to flush before next checkpoint",
                            pages_to_flush_before_next_checkpoint));

  /* compute log's low-water mark */
  {
    TRANSLOG_ADDRESS log_low_water_mark= min_page_rec_lsn;
    set_if_smaller(log_low_water_mark, min_trn_rec_lsn);
    set_if_smaller(log_low_water_mark, min_first_undo_lsn);
    set_if_smaller(log_low_water_mark, checkpoint_start_log_horizon);
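    /*
      A worked example with illustrative values: if the four candidates are
      (3,0x2000), (2,0x5000), (4,0x1000) and a horizon of (5,0x800), the
      low-water mark is (2,0x5000), the smallest address; log files wholly
      below it are no longer needed for Recovery and can be purged.
    */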
    /**
      Now purge unneeded logs.
      As some systems have an unreliable fsync (drive lying), we could try to
      be robust against that: remember a few previous checkpoints in the
      control file, and not purge logs immediately... Think about it.
    */
    if (translog_purge(log_low_water_mark))
      ma_message_no_user(0, "log purging failed");
  }

  goto end;

err:
  error= 1;
  ma_message_no_user(0, "checkpoint failed");
  /* we were possibly not able to determine what pages to flush */
  pages_to_flush_before_next_checkpoint= 0;

end:
  for (i= 0; i < (sizeof(record_pieces)/sizeof(record_pieces[0])); i++)
    my_free(record_pieces[i].str);
  mysql_mutex_lock(&LOCK_checkpoint);
  checkpoint_in_progress= CHECKPOINT_NONE;
  checkpoints_total++;
  checkpoints_ok_total+= !error;
  mysql_mutex_unlock(&LOCK_checkpoint);
  DBUG_RETURN(error);
}


/**
  @brief Initializes the checkpoint module

  @param interval         If one wants the module to create a
                          thread which will periodically do
                          checkpoints, and flush dirty pages, in the
                          background, it should specify a non-zero
                          interval in seconds. The thread will then be
                          created and will take checkpoints separated by
                          approximately 'interval' seconds.

  @note A checkpoint is taken only if there has been some significant
  activity since the previous checkpoint. Between checkpoint N and N+1 the
  thread flushes all dirty pages which were already dirty at the time of
  checkpoint N.

  @return Operation status
    @retval 0   ok
    @retval !=0 error
*/

int ma_checkpoint_init(ulong interval)
{
  int res= 0;
  DBUG_ENTER("ma_checkpoint_init");
  if (ma_service_thread_control_init(&checkpoint_control))
    res= 1;
  else if (interval > 0)
  {
    size_t intv= interval;
    compile_time_assert(sizeof(void *) >= sizeof(ulong));
    if ((res= mysql_thread_create(key_thread_checkpoint,
                                  &checkpoint_control.thread, NULL,
                                  ma_checkpoint_background,
                                  (void*) intv)))
      checkpoint_control.killed= TRUE;
  }
  else
    checkpoint_control.killed= TRUE;
  DBUG_RETURN(res);
}


#ifndef DBUG_OFF
/**
  Function used to test recovery: flush some table pieces and then caller
  crashes.

  @param what_to_flush   0: current bitmap and all data pages
                         1: state
                         2: all bitmap pages
*/
static void flush_all_tables(int what_to_flush)
{
  int res= 0;
  LIST *pos; /**< to iterate over open tables */
  mysql_mutex_lock(&THR_LOCK_maria);
  for (pos= maria_open_list; pos; pos= pos->next)
  {
    MARIA_HA *info= (MARIA_HA*)pos->data;
    if (info->s->now_transactional)
    {
      switch (what_to_flush)
      {
      case 0:
        res= _ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX,
                                   FLUSH_KEEP, FLUSH_KEEP);
        break;
      case 1:
        res= _ma_state_info_write(info->s,
                                  MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET|
                                  MA_STATE_INFO_WRITE_LOCK);
        DBUG_PRINT("maria_flush_states",
                   ("is_of_horizon: LSN " LSN_FMT,
                    LSN_IN_PARTS(info->s->state.is_of_horizon)));
        break;
      case 2:
        res= _ma_bitmap_flush_all(info->s);
        break;
      }
    }
    DBUG_ASSERT(res == 0);
  }
  mysql_mutex_unlock(&THR_LOCK_maria);
}
#endif


/**
  @brief Destroys the checkpoint module
*/

void ma_checkpoint_end(void)
{
  DBUG_ENTER("ma_checkpoint_end");
  /*
    Some intentional crash methods, usually triggered by
    SET MARIA_CHECKPOINT_INTERVAL=X
  */
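  /*
    As an illustrative (not normative) way to drive these from a debug
    build's test: something like
    SET GLOBAL debug_dbug="+d,maria_flush_whole_log,maria_crash" before
    changing the checkpoint interval would activate the corresponding
    keywords below.
  */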
  DBUG_EXECUTE_IF("maria_flush_bitmap",
                  {
                    DBUG_PRINT("maria_flush_bitmap", ("now"));
                    flush_all_tables(2);
                  });
  DBUG_EXECUTE_IF("maria_flush_whole_page_cache",
                  {
                    DBUG_PRINT("maria_flush_whole_page_cache", ("now"));
                    flush_all_tables(0);
                  });
  DBUG_EXECUTE_IF("maria_flush_whole_log",
                  {
                    DBUG_PRINT("maria_flush_whole_log", ("now"));
                    translog_flush(translog_get_horizon());
                  });
  /*
    Note that for WAL reasons, maria_flush_states requires
    maria_flush_whole_log.
  */
  DBUG_EXECUTE_IF("maria_flush_states",
                  {
                    DBUG_PRINT("maria_flush_states", ("now"));
                    flush_all_tables(1);
                  });
  DBUG_EXECUTE_IF("maria_crash",
                  { DBUG_PRINT("maria_crash", ("now")); DBUG_SUICIDE(); });

  if (checkpoint_control.inited)
  {
    ma_service_thread_control_end(&checkpoint_control);
    my_free(dfiles);
    my_free(kfiles);
    dfiles= kfiles= NULL;
  }
  DBUG_VOID_RETURN;
}


/**
  @brief dirty-page filtering criteria for MEDIUM checkpoint.

  We flush data/index pages which have been dirty since the previous
  checkpoint (this is the two-checkpoint rule: the REDO phase will not have
  to start from earlier than the next-to-last checkpoint).
  Bitmap pages are handled by _ma_bitmap_flush_all().

  @param type        Page's type
  @param pageno      Page's number
  @param rec_lsn     Page's rec_lsn
  @param arg         filter_param
*/

static enum pagecache_flush_filter_result
filter_flush_file_medium(enum pagecache_page_type type,
                         pgcache_page_no_t pageno __attribute__ ((unused)),
                         LSN rec_lsn, void *arg)
{
  struct st_filter_param *param= (struct st_filter_param *)arg;
  return (type == PAGECACHE_LSN_PAGE) &&
    (cmp_translog_addr(rec_lsn, param->up_to_lsn) <= 0);
}


/**
  @brief dirty-page filtering criteria for FULL checkpoint.

  We flush all dirty data/index pages.
  Bitmap pages are handled by _ma_bitmap_flush_all().

  @param type        Page's type
  @param pageno      Page's number
  @param rec_lsn     Page's rec_lsn
  @param arg         filter_param
*/

static enum pagecache_flush_filter_result
filter_flush_file_full(enum pagecache_page_type type,
                       pgcache_page_no_t pageno __attribute__ ((unused)),
                       LSN rec_lsn __attribute__ ((unused)),
                       void *arg __attribute__ ((unused)))
{
  return (type == PAGECACHE_LSN_PAGE);
}


/**
  @brief dirty-page filtering criteria for background flushing thread.

  We flush data/index pages which have been dirty since the previous
  checkpoint (this is the two-checkpoint rule: the REDO phase will not have
  to start from earlier than the next-to-last checkpoint), and no
  bitmap pages. But we flush no more than a certain number of pages (to have
  an even flushing, no write burst).
  The reason to not flush bitmap pages is that they may not be in a flushable
  state at this moment and we don't want to wait for them.

  @param type        Page's type
  @param pageno      Page's number
  @param rec_lsn     Page's rec_lsn
  @param arg         filter_param
*/

static enum pagecache_flush_filter_result
filter_flush_file_evenly(enum pagecache_page_type type,
                         pgcache_page_no_t pageno __attribute__ ((unused)),
                         LSN rec_lsn, void *arg)
{
  struct st_filter_param *param= (struct st_filter_param *)arg;
  if (unlikely(param->max_pages == 0)) /* all flushed already */
    return FLUSH_FILTER_SKIP_ALL;
  if ((type == PAGECACHE_LSN_PAGE) &&
      (cmp_translog_addr(rec_lsn, param->up_to_lsn) <= 0))
  {
    param->max_pages--;
    return FLUSH_FILTER_OK;
  }
  return FLUSH_FILTER_SKIP_TRY_NEXT;
}
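
/*
  A small worked trace of filter_flush_file_evenly() above, assuming a
  budget of max_pages=2 and candidate pages A, B, C of which only A and C
  are LSN pages with rec_lsn <= up_to_lsn:

    call on A -> FLUSH_FILTER_OK             (budget 2 -> 1, A gets flushed)
    call on B -> FLUSH_FILTER_SKIP_TRY_NEXT  (not eligible, keep scanning)
    call on C -> FLUSH_FILTER_OK             (budget 1 -> 0, C gets flushed)
    any later call -> FLUSH_FILTER_SKIP_ALL  (budget exhausted, stop scan)
*/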


/**
  @brief Background thread which does checkpoints and flushes periodically.

  Takes a checkpoint. After this, all pages dirty at the time of that
  checkpoint are flushed evenly until it is time to take another checkpoint.
  This ensures that the REDO phase starts at earliest (in LSN time) at the
  next-to-last checkpoint record ("two-checkpoint rule").

  @note MikaelR questioned why the same thread does two different jobs; the
  risk could be that no LRD flushing happens while a checkpoint is running.
*/
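
/*
  A worked example of the cadence implemented below, assuming interval=30:
  at second 0 of each cycle ("case 0") the thread attempts a checkpoint; at
  second 1 ("case 1") it computes per-second bunches of
  pages_to_flush_before_next_checkpoint/30 pages (e.g. 3000 pages give
  bunches of 100) and falls through to flush the first bunch; each following
  second ("default") flushes another bunch, or the thread sleeps until the
  next cycle once nothing is left to flush.
*/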

/** @brief at least this many log/page bytes should be written between
    checkpoints */
static ulong maria_checkpoint_min_cache_activity= 10*1024*1024;
/* Set in ha_maria.cc */
ulong maria_checkpoint_min_log_activity= 1*1024*1024;

pthread_handler_t ma_checkpoint_background(void *arg)
{
  /*
    If the interval could be changed by the user while we are in this thread,
    it could be annoying: for example it could cause "case 2" to be executed
    right after "case 0", thus having 'dfile' unset. So the thread cares only
    about the interval's value when it started.
  */
  const size_t interval= (size_t)arg;
  size_t sleeps, sleep_time;
  TRANSLOG_ADDRESS log_horizon_at_last_checkpoint=
    translog_get_horizon();
  ulonglong pagecache_flushes_at_last_checkpoint=
    maria_pagecache->global_cache_write;
  uint UNINIT_VAR(pages_bunch_size);
  struct st_filter_param filter_param;
  PAGECACHE_FILE *UNINIT_VAR(dfile); /**< data file currently being flushed */
  PAGECACHE_FILE *UNINIT_VAR(kfile); /**< index file currently being flushed */

  my_thread_init();
  DBUG_PRINT("info",("Maria background checkpoint thread starts"));
  DBUG_ASSERT(interval > 0);

  PSI_CALL_set_thread_user_host(0,0,0,0);

  /*
    Recovery ended with all tables closed and a checkpoint: no need to take
    one immediately.
  */
  sleeps= 1;
  pages_to_flush_before_next_checkpoint= 0;

  for(;;) /* iterations of checkpoints and dirty page flushing */
  {
#if 0 /* good for testing, to do a lot of checkpoints, finds a lot of bugs */
    sleeps= 0;
#endif
    switch (sleeps % interval)
    {
    case 0:
    {
      /* If checkpoints are disabled, wait 1 second and try again */
      if (maria_checkpoint_disabled)
      {
        sleep_time= 1;
        break;
      }
      {
        TRANSLOG_ADDRESS horizon= translog_get_horizon();

        /*
          With background flushing evenly distributed over the time
          between two checkpoints, we should have only little flushing to do
          in the checkpoint.
        */
        /*
          No checkpoint if little work of interest for recovery was done
          since last checkpoint. Such work includes log writing (lengthens
          recovery, checkpoint would shorten it), page flushing (checkpoint
          would decrease the amount of read pages in recovery).
          In case of one short statement per minute (very low load), we don't
          want to checkpoint every minute, hence the positive
          maria_checkpoint_min_log_activity and
          maria_checkpoint_min_cache_activity.
        */
        if ((ulonglong) (horizon - log_horizon_at_last_checkpoint) <=
            maria_checkpoint_min_log_activity &&
            ((ulonglong) (maria_pagecache->global_cache_write -
                          pagecache_flushes_at_last_checkpoint) *
             maria_pagecache->block_size) <=
            maria_checkpoint_min_cache_activity)
        {
          /*
            Not enough has happened since last checkpoint.
            Sleep for a while and try again later.
          */
          sleep_time= interval;
          break;
        }
        sleep_time= 1;
        ma_checkpoint_execute(CHECKPOINT_MEDIUM, TRUE);
        /*
          Snapshot this kind of "state" of the engine. Note that the value
          below is possibly greater than last_checkpoint_lsn.
        */
        log_horizon_at_last_checkpoint= translog_get_horizon();
        pagecache_flushes_at_last_checkpoint=
          maria_pagecache->global_cache_write;
        /*
          If the checkpoint above succeeded it has set d|kfiles and
          d|kfiles_end. If it has failed, it has set
          pages_to_flush_before_next_checkpoint to 0 so we will skip flushing
          and sleep until the next checkpoint.
        */
      }
      break;
    }
    case 1:
      /* set up parameters for background page flushing */
      filter_param.up_to_lsn= last_checkpoint_lsn;
      pages_bunch_size= pages_to_flush_before_next_checkpoint /
        (uint) interval;
      dfile= dfiles;
      kfile= kfiles;
      /* fall through */
    default:
      if (pages_bunch_size > 0)
      {
        DBUG_PRINT("checkpoint",
                   ("Maria background checkpoint thread: %u pages",
                    pages_bunch_size));
        /* flush a bunch of dirty pages */
        filter_param.max_pages= pages_bunch_size;
        while (dfile != dfiles_end)
        {
          /*
            We use FLUSH_KEEP_LAZY: if a file is already in flush, it's
            smarter to move to the next file than wait for this one to be
            completely flushed, which may take long.
            StaleFilePointersInFlush: notice how below we use "dfile" which
            is an OS file descriptor plus some function and MARIA_SHARE
            pointers; this data dates from a previous checkpoint; since then,
            the table may have been closed (so MARIA_SHARE* became stale),
            and the file descriptor reassigned to another table which does
            not have the same CRC-read-set callbacks: it is thus important
            that flush_pagecache_blocks_with_filter() does not use the
            pointers, only the OS file descriptor.
          */
          int res=
            flush_pagecache_blocks_with_filter(maria_pagecache,
                                               dfile, FLUSH_KEEP_LAZY,
                                               filter_flush_file_evenly,
                                               &filter_param);
          if (unlikely(res & PCFLUSH_ERROR))
            ma_message_no_user(0, "background data page flush failed");
          if (filter_param.max_pages == 0) /* bunch all flushed, sleep */
            break; /* and we will continue with the same file */
          dfile++; /* otherwise all this file is flushed, move to next file */
          /*
            MikaelR noted that he observed that Linux's file cache may never
            fsync to disk until this cache is full, at which point it decides
            to empty the cache, making the machine very slow. A solution was
            to fsync after writing 2 MB. So we might want to fsync() here if
            we wrote enough pages.
          */
        }
        while (kfile != kfiles_end)
        {
          int res=
            flush_pagecache_blocks_with_filter(maria_pagecache,
                                               kfile, FLUSH_KEEP_LAZY,
                                               filter_flush_file_evenly,
                                               &filter_param);
          if (unlikely(res & PCFLUSH_ERROR))
            ma_message_no_user(0, "background index page flush failed");
          if (filter_param.max_pages == 0) /* bunch all flushed, sleep */
            break; /* and we will continue with the same file */
          kfile++; /* otherwise all this file is flushed, move to next file */
        }
        sleep_time= 1;
      }
      else
      {
        /* Can directly sleep until the next checkpoint moment */
        sleep_time= interval - (sleeps % interval);
      }
    }
    if (my_service_thread_sleep(&checkpoint_control,
                                sleep_time * 1000000000ULL))
      break;
    sleeps+= sleep_time;
  }
  DBUG_PRINT("info",("Maria background checkpoint thread ends"));
  {
    CHECKPOINT_LEVEL level= CHECKPOINT_FULL;
    /*
      That's the final one, which guarantees that a clean shutdown always
      ends with a checkpoint.
    */
    DBUG_EXECUTE_IF("maria_checkpoint_indirect", level= CHECKPOINT_INDIRECT;);
    ma_checkpoint_execute(level, FALSE);
  }
  my_thread_end();
  return 0;
}


/**
  @brief Allocates buffer and stores in it some info about open tables,
  does some flushing on those.

  Does the allocation because the caller cannot know the size itself.
  Memory freeing is to be done by the caller (if the "str" member of the
  LEX_STRING is not NULL).
  The caller is taking a checkpoint.

  @param[out] str                       pointer to where the allocated
                                        buffer, and its size, will be put;
                                        buffer will be filled with info
                                        about open tables
  @param checkpoint_start_log_horizon   Of the in-progress checkpoint record.

  @return Operation status
    @retval 0 OK
    @retval 1 Error
*/

static int collect_tables(LEX_STRING *str, LSN checkpoint_start_log_horizon)
{
  MARIA_SHARE **distinct_shares= NULL;
  char *ptr;
  uint error= 1, sync_error= 0, nb, nb_stored, i;
  my_bool unmark_tables= TRUE;
  size_t total_names_length;
  LIST *pos; /**< to iterate over open tables */
  struct st_state_copy {
    uint index;
    MARIA_STATE_INFO state;
  };
  struct st_state_copy *state_copies= NULL, /**< fixed-size cache of states */
    *state_copies_end, /**< cache ends here */
    *state_copy; /**< iterator in cache */
  TRANSLOG_ADDRESS UNINIT_VAR(state_copies_horizon); /**< horizon of states'
                                                          _copies_ */
  struct st_filter_param filter_param;
  PAGECACHE_FLUSH_FILTER filter;
  DBUG_ENTER("collect_tables");

  /* let's make a list of distinct shares */
  mysql_mutex_lock(&THR_LOCK_maria);
  for (nb= 0, pos= maria_open_list; pos; pos= pos->next)
  {
    MARIA_HA *info= (MARIA_HA*)pos->data;
    MARIA_SHARE *share= info->s;
    /* the first three variables below can never change */
    if (share->base.born_transactional && !share->temporary &&
        share->mode != O_RDONLY &&
        !(share->in_checkpoint & MARIA_CHECKPOINT_SEEN_IN_LOOP))
    {
      /*
        Apart from us, only maria_close() reads/sets in_checkpoint, but it
        cannot run now as we hold THR_LOCK_maria.
      */
      /*
        This table is relevant for checkpoint and not already seen. Mark it,
        so that it is not seen again in the loop.
      */
      nb++;
      DBUG_ASSERT(share->in_checkpoint == 0);
      /* This flag ensures that we count only _distinct_ shares. */
      share->in_checkpoint= MARIA_CHECKPOINT_SEEN_IN_LOOP;
    }
  }
  if (unlikely((distinct_shares=
                (MARIA_SHARE **)my_malloc(nb * sizeof(MARIA_SHARE *),
                                          MYF(MY_WME))) == NULL))
    goto err;
  for (total_names_length= 0, i= 0, pos= maria_open_list; pos; pos= pos->next)
  {
    MARIA_HA *info= (MARIA_HA*)pos->data;
    MARIA_SHARE *share= info->s;
    if (share->in_checkpoint & MARIA_CHECKPOINT_SEEN_IN_LOOP)
    {
      distinct_shares[i++]= share;
      /*
        With this we prevent the share from going away while we later flush
        and force it without holding THR_LOCK_maria. For example if the share
        could be my_free()d by maria_close() we would have a problem when we
        access it to flush the table. We "pin" the share pointer.
        And we also take down MARIA_CHECKPOINT_SEEN_IN_LOOP, so that it is
        not seen again in the loop.
      */
      share->in_checkpoint= MARIA_CHECKPOINT_LOOKS_AT_ME;
      total_names_length+= share->open_file_name.length;
    }
  }

  DBUG_ASSERT(i == nb);
  mysql_mutex_unlock(&THR_LOCK_maria);
  DBUG_PRINT("info",("found %u table shares", nb));

  str->length=
    4 +                /* number of tables */
    (2 +               /* short id */
     LSN_STORE_SIZE +  /* first_log_write_at_lsn */
     1                 /* end-of-name 0 */
    ) * nb + total_names_length;
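
  /*
    Illustrative layout of the piece being built here (this mirrors the size
    computation above and the stores done in the loop below; it is not a
    separate on-disk specification):

      4 bytes            number of tables actually stored (filled at the end)
      then, per stored table:
        2 bytes          share->id (short id)
        LSN_STORE_SIZE   share->lsn_of_file_id (first_log_write_at_lsn)
        N+1 bytes        zero-terminated open_file_name
  */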
  if (unlikely((str->str= my_malloc(str->length, MYF(MY_WME))) == NULL))
    goto err;

  ptr= str->str;
  ptr+= 4; /* real number of stored tables is not yet known */

  /* only possible checkpointer, so can do the read below without mutex */
  filter_param.up_to_lsn= last_checkpoint_lsn;
  switch (checkpoint_in_progress)
  {
  case CHECKPOINT_MEDIUM:
    filter= &filter_flush_file_medium;
    break;
  case CHECKPOINT_FULL:
    filter= &filter_flush_file_full;
    break;
  case CHECKPOINT_INDIRECT:
    filter= NULL;
    break;
  default:
    DBUG_ASSERT(0);
    goto err;
  }

  /*
    The principle of reading/writing the state below is explained in
    ma_recovery.c, look for "Recovery of the state".
  */
#define STATE_COPIES 1024
  state_copies= (struct st_state_copy *)
    my_malloc(STATE_COPIES * sizeof(struct st_state_copy), MYF(MY_WME));
  dfiles= (PAGECACHE_FILE *)my_realloc((uchar *)dfiles,
                                       /* avoid size of 0 for my_realloc */
                                       MY_MAX(1, nb) * sizeof(PAGECACHE_FILE),
                                       MYF(MY_WME | MY_ALLOW_ZERO_PTR));
  kfiles= (PAGECACHE_FILE *)my_realloc((uchar *)kfiles,
                                       /* avoid size of 0 for my_realloc */
                                       MY_MAX(1, nb) * sizeof(PAGECACHE_FILE),
                                       MYF(MY_WME | MY_ALLOW_ZERO_PTR));
  if (unlikely((state_copies == NULL) ||
               (dfiles == NULL) || (kfiles == NULL)))
    goto err;
  state_copy= state_copies_end= NULL;
  dfiles_end= dfiles;
  kfiles_end= kfiles;

  for (nb_stored= 0, i= 0; i < nb; i++)
  {
    MARIA_SHARE *share= distinct_shares[i];
    PAGECACHE_FILE kfile, dfile;
    my_bool ignore_share;
    if (!(share->in_checkpoint & MARIA_CHECKPOINT_LOOKS_AT_ME))
    {
      /*
        No need for a mutex to read the above; only we can write *this* bit
        of the in_checkpoint bitmap.
      */
      continue;
    }
    /**
      @todo We should not look at tables which didn't change since last
      checkpoint.
    */
    DBUG_PRINT("info",("looking at table '%s'", share->open_file_name.str));
    if (state_copy == state_copies_end) /* we have no more cached states */
    {
      /*
        Collect and cache a bunch of states. We do this for many states at a
        time, to not lock/unlock the log's lock too often.
      */
      uint j, bound= MY_MIN(nb, i + STATE_COPIES);
      state_copy= state_copies;
      /* part of the state is protected by log's lock */
      translog_lock();
      state_copies_horizon= translog_get_horizon_no_lock();
      for (j= i; j < bound; j++)
      {
        MARIA_SHARE *share2= distinct_shares[j];
        if (!(share2->in_checkpoint & MARIA_CHECKPOINT_LOOKS_AT_ME))
          continue;
        state_copy->index= j;
        state_copy->state= share2->state; /* we copy the state */
        state_copy++;
        /*
          data_file_length is not updated under log's lock by the bitmap
          code, but writing a wrong data_file_length is ok: a next
          maria_close() will correct it; if we crash before, Recovery will
          set it to the true physical size.
        */
      }
      translog_unlock();
      if (state_copy == state_copies)
        break; /* Nothing to do */

      /**
        We are going to flush these states.
        Before that, all records describing how to undo such state must be
        in the log (WAL). Usually this means UNDOs. In the special case of
        data|key_file_length, recovery just needs to open the table to fix
        the length, so any LOGREC_FILE_ID/REDO/UNDO which lets recovery
        understand that it must open the table is enough; so as long as
        data|key_file_length is updated after writing any log record it's
        ok: if we copied the new value above, it means the record was before
        state_copies_horizon and we flush such a record below.
        Apart from data|key_file_length, which are easily recoverable from
        the real file's size, all other state members must be updated only
        when writing the UNDO; otherwise, if updated before, and their new
        value is flushed by a checkpoint and there is a crash before the
        UNDO is written, their REDO group will be missing or at least
        incomplete and skipped by recovery, so a bad state value will stay.
        For example, setting key_root before writing the UNDO: the table
        would have old index pages (they were pinned at time of crash) and a
        new, thus wrong, key_root.
        @todo RECOVERY BUG check that all code honours that.
      */
      if (translog_flush(state_copies_horizon))
        goto err;
      /* now we have cached states and they are WAL-safe */
      state_copies_end= state_copy-1;
      state_copy= state_copies;
    }

    /* locate our state among these cached ones */
    for ( ; state_copy->index != i; state_copy++)
      DBUG_ASSERT(state_copy <= state_copies_end);

    /* OS file descriptors are ints which we stored in 4 bytes */
    compile_time_assert(sizeof(int) <= 4);
    /*
      Protect against maria_close() (which does some memory freeing in
      MARIA_FILE_BITMAP) with close_lock. intern_lock is not
      sufficient as we, as well as maria_close(), are going to unlock
      intern_lock in the middle of manipulating the table. Serializing us and
      maria_close() should help avoid problems.
    */
    mysql_mutex_lock(&share->close_lock);
    mysql_mutex_lock(&share->intern_lock);
    /*
      Tables in a normal state have their two file descriptors open.
      In some rare cases like REPAIR, some descriptor may be closed or even
      -1. If that happened, the _ma_state_info_write() may fail. This is
      prevented by enclosing all places which close/change kfile.file with
      intern_lock.
    */
    kfile= share->kfile;
    dfile= share->bitmap.file;
    /*
      Ignore tables which have no logged writes (all their future log
      records will be found naturally by Recovery). Ignore obsolete shares
      (_before_ setting themselves to last_version=0 they already did all
      flush and sync; if we flushed their state now we might be flushing an
      obsolete state onto a newer one, assuming the table has been reopened
      with a different share but of course the same physical index file).
    */
    ignore_share= (share->id == 0) | (share->last_version == 0);
    DBUG_PRINT("info", ("ignore_share: %d", ignore_share));
    if (!ignore_share)
    {
      size_t open_file_name_len= share->open_file_name.length + 1;
      /* remember the descriptors for background flush */
      *(dfiles_end++)= dfile;
      *(kfiles_end++)= kfile;
      /* we will store this table in the record */
      nb_stored++;
      int2store(ptr, share->id);
      ptr+= 2;
      lsn_store(ptr, share->lsn_of_file_id);
      ptr+= LSN_STORE_SIZE;
      /*
        first_bitmap_with_space is not updated under log's lock, and is
        important. We would need the bitmap's lock to get it right. Recovery
        of this is not clear, so we just play safe: write it out as
        unknown: if crash, _ma_bitmap_init() at next open (for example in
        Recovery) will convert it to 0 and thus the first insertion will
        search for free space from the file's first bitmap (0) -
        under-optimal but safe.
        If no crash, maria_close() will write the exact value.
      */
      state_copy->state.first_bitmap_with_space= ~(ulonglong)0;
      memcpy(ptr, share->open_file_name.str, open_file_name_len);
      ptr+= open_file_name_len;
      if (cmp_translog_addr(share->state.is_of_horizon,
                            checkpoint_start_log_horizon) >= 0)
      {
        /*
          State was flushed recently, it does not hold down the log's
          low-water mark and will not give avoidable work to Recovery. So we
          needn't flush it. Also, it is possible that while we copied the
          state above (under log's lock, without intern_lock) it was being
          modified in memory or flushed to disk (without log's lock, under
          intern_lock, like in maria_extra()), so our copy may be incorrect
          and we should not flush it.
          It may also be a share which got last_version==0 since we checked
          last_version; in this case, it flushed its state and the LSN test
          above will catch it.
        */
      }
      else
      {
        /*
          We could do the state flush only if share->changed, but it's
          tricky.
          Consider a maria_write() which has written REDO,UNDO, and before it
          calls _ma_writeinfo() (setting share->changed=1), checkpoint
          happens and sees share->changed=0, does not flush state. It is
          possible that Recovery does not start from before the REDO and thus
          the state is not recovered. A solution may be to set
          share->changed=1 under log mutex when writing log records.

          The current solution is to keep a copy of the last saved state and
          not write the state if it was the same as last time. It's ok if
          is_of_horizon would be different on disk if all other data is
          the same.
        */
        DBUG_ASSERT(share->last_version != 0);
        state_copy->state.is_of_horizon= share->state.is_of_horizon=
          share->checkpoint_state.is_of_horizon= state_copies_horizon;
        if (kfile.file >= 0 && memcmp(&share->checkpoint_state,
                                      &state_copy->state,
                                      sizeof(state_copy->state)))
        {
          sync_error|=
            _ma_state_info_write_sub(kfile.file, &state_copy->state,
                                     MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET);
          memcpy(&share->checkpoint_state,
                 &state_copy->state, sizeof(state_copy->state));
        }
        /*
          We don't set share->changed=0 because it may interfere with a
          concurrent _ma_writeinfo() doing share->changed=1 (cancel its
          effect). The sad consequence is that we will flush the same state
          at each checkpoint if the table was once written and then not
          anymore.
        */
      }
    }
#ifdef EXTRA_DEBUG_BITMAP
    else
    {
      DBUG_ASSERT(share->bitmap.changed == 0 &&
                  share->bitmap.changed_not_flushed == 0);
    }
#endif

    /*
      _ma_bitmap_flush_all() may wait, so don't keep intern_lock as
      otherwise this would deadlock with allocate_and_write_block_record()
      calling _ma_set_share_data_file_length()
    */
    mysql_mutex_unlock(&share->intern_lock);

    if (!ignore_share)
    {
      /*
        share->bitmap is valid because it's destroyed under close_lock which
        we hold.
      */
      if (_ma_bitmap_flush_all(share))
      {
        sync_error= 1;
        /** @todo all write failures should mark table corrupted */
        ma_message_no_user(0, "checkpoint bitmap page flush failed");
      }
      DBUG_ASSERT(share->pagecache == maria_pagecache);
    }
    /*
      Clean up any unused states.
      TODO: Only do this call if there have been # (10?) ended transactions
      since the last call.
      We had to release intern_lock to respect lock order with LOCK_trn_list.
    */
    _ma_remove_not_visible_states_with_lock(share, FALSE);

    if (share->in_checkpoint & MARIA_CHECKPOINT_SHOULD_FREE_ME)
    {
      /*
        maria_close() left us to free the share. When it ran, it set
        share->id to 0. As it ran before we locked close_lock, we should
        have seen this, and so this assertion should be true:
      */
      DBUG_ASSERT(ignore_share);
      mysql_mutex_destroy(&share->intern_lock);
      mysql_mutex_unlock(&share->close_lock);
      mysql_mutex_destroy(&share->close_lock);
      ma_crypt_free(share);
      my_free(share);
    }
    else
    {
      /* share goes back to normal state */
      share->in_checkpoint= 0;
      mysql_mutex_unlock(&share->close_lock);
    }

    /*
      We do the big disk writes out of intern_lock to not block other
      users of this table (intern_lock is taken at the start and end of
      every statement). This means that file descriptors may be invalid
      (files may have been closed for example by HA_EXTRA_PREPARE_FOR_*
      under Windows, or REPAIR). This should not be a problem as we use
      MY_IGNORE_BADFD. Descriptors may even point to other files but then
      the old blocks (of before the close) must have been flushed for sure,
      so our flush will flush new blocks (of after the latest open) and that
      should do no harm.
    */
    /*
      If CHECKPOINT_MEDIUM, this big flush below may result in a
      serious write burst. Realize that all pages dirtied between the
      last checkpoint and the one we are doing now, will be flushed at
      next checkpoint, except those evicted by LRU eviction (depending on
      the size of the page cache compared to the size of the working data
      set, eviction may be rare or frequent).
      We avoid that burst by anticipating: those pages are flushed
      in bunches spanned regularly over the time interval between now and
      the next checkpoint, by a background thread. Thus the next checkpoint
      will have only little flushing to do (CHECKPOINT_MEDIUM should thus be
      only a little slower than CHECKPOINT_INDIRECT).
    */

    /*
      PageCacheFlushConcurrencyBugs
      Inside the page cache, calls to flush_pagecache_blocks_int() on the
      same file are serialized. Examples of concurrency bugs which happened
      when we didn't have this serialization:
      - maria_chk_size() (via CHECK TABLE) happens concurrently with
      Checkpoint: Checkpoint is flushing a page: it pins the page and is
      pre-empted, maria_chk_size() wants to flush this page too so gets an
      error because Checkpoint pinned this page. Such an error makes
      maria_chk_size() mark the table as corrupted.
      - maria_close() happens concurrently with Checkpoint:
      Checkpoint is flushing a page: it registers a request on the page, is
      pre-empted; maria_close() flushes this page too with FLUSH_RELEASE:
      FLUSH_RELEASE will cause a free_block() which assumes the page is in
      the LRU, but it is not (as Checkpoint registered a request). Crash.
      - one thread is evicting a page of the file out of the LRU: it marks
      it with PCBLOCK_IN_SWITCH and is pre-empted. Then two other threads do
      flushes of the same file concurrently (like above). Then one flusher
      sees the page is in switch, removes it from changed_blocks[] and puts
      it in its first_in_switch, so the other flusher will not see the page
      at all and return too early. If it's maria_close() which returns too
      early, then maria_close() may close the file descriptor, and the other
      flusher, and the evicter, will fail to write their page: corruption.
    */

    if (!ignore_share)
    {
      if (filter != NULL)
      {
        if ((flush_pagecache_blocks_with_filter(maria_pagecache,
                                                &dfile, FLUSH_KEEP_LAZY,
                                                filter, &filter_param) &
             PCFLUSH_ERROR))
          ma_message_no_user(0, "checkpoint data page flush failed");
        if ((flush_pagecache_blocks_with_filter(maria_pagecache,
                                                &kfile, FLUSH_KEEP_LAZY,
                                                filter, &filter_param) &
             PCFLUSH_ERROR))
          ma_message_no_user(0, "checkpoint index page flush failed");
      }
      /*
        fsyncs the fd, that's the loooong operation (e.g. max 150 fsync
        per second, so if you have touched 1000 files it's 7 seconds).
      */
      sync_error|=
        mysql_file_sync(dfile.file, MYF(MY_WME | MY_IGNORE_BADFD)) |
        mysql_file_sync(kfile.file, MYF(MY_WME | MY_IGNORE_BADFD));
      /*
        in case of error, we continue because writing other tables to disk
        is still useful.
      */
    }
  }

  if (sync_error)
    goto err;
  /* We may have over-estimated (due to share->id==0 or last_version==0) */
  DBUG_ASSERT(str->length >= (uint)(ptr - str->str));
  str->length= (uint)(ptr - str->str);
  /*
    As we support max 65k tables open at a time (2-byte short id), we
    assume uint is enough for the cumulated length of table names; and
    LEX_STRING::length is uint.
  */
  int4store(str->str, nb_stored);
  error= unmark_tables= 0;

err:
  if (unlikely(unmark_tables))
  {
    /* maria_close() uses THR_LOCK_maria from start to end */
    mysql_mutex_lock(&THR_LOCK_maria);
    for (i= 0; i < nb; i++)
    {
      MARIA_SHARE *share= distinct_shares[i];
      if (share->in_checkpoint & MARIA_CHECKPOINT_SHOULD_FREE_ME)
      {
        /* maria_close() left us to free the share */
        mysql_mutex_destroy(&share->intern_lock);
        ma_crypt_free(share);
        my_free(share);
      }
      else
      {
        /* share goes back to normal state */
        share->in_checkpoint= 0;
      }
    }
    mysql_mutex_unlock(&THR_LOCK_maria);
  }
  my_free(distinct_shares);
  my_free(state_copies);
  DBUG_RETURN(error);
}
