1 | /* Copyright (C) 2006,2007 MySQL AB |
2 | |
3 | This program is free software; you can redistribute it and/or modify |
4 | it under the terms of the GNU General Public License as published by |
5 | the Free Software Foundation; version 2 of the License. |
6 | |
7 | This program is distributed in the hope that it will be useful, |
8 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
9 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
10 | GNU General Public License for more details. |
11 | |
12 | You should have received a copy of the GNU General Public License |
13 | along with this program; if not, write to the Free Software |
14 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111-1301 USA */ |
15 | |
16 | /* |
17 | WL#3071 Maria checkpoint |
18 | First version written by Guilhem Bichot on 2006-04-27. |
19 | */ |
20 | |
21 | /* Here is the implementation of this module */ |
22 | |
23 | /** @todo RECOVERY BUG this is unreviewed code */ |
24 | /* |
25 | Summary: |
26 | checkpoints are done either by a background thread (checkpoint every Nth |
27 | second) or by a client. |
28 | In ha_maria, it's not made available to clients, and will soon be done by a |
29 | background thread (periodically taking checkpoints and flushing dirty |
30 | pages). |
31 | */ |
32 | |
33 | #include "maria_def.h" |
34 | #include "ma_pagecache.h" |
35 | #include "ma_blockrec.h" |
36 | #include "ma_checkpoint.h" |
37 | #include "ma_loghandler_lsn.h" |
38 | #include "ma_servicethread.h" |
39 | #include "ma_crypt.h" |
40 | |
/** @brief type of checkpoint currently running (CHECKPOINT_NONE if none) */
static CHECKPOINT_LEVEL checkpoint_in_progress= CHECKPOINT_NONE;
/** @brief protects checkpoint_in_progress */
static mysql_mutex_t LOCK_checkpoint;
/**
  @brief broadcast when a checkpoint finishes (see ma_checkpoint_execute());
  also used by the service-thread control for killing the background thread
*/
static mysql_cond_t COND_checkpoint;
/** @brief control structure for checkpoint background thread */
static MA_SERVICE_THREAD_CONTROL checkpoint_control=
  {0, FALSE, FALSE, &LOCK_checkpoint, &COND_checkpoint};
/*
  Number of pages dirty at the last checkpoint, which the background thread
  flushes evenly before the next checkpoint.
  NOTE(review): pagecache->blocks_changed is ulong while this is uint —
  confirm the narrower type cannot overflow here.
*/
static uint pages_to_flush_before_next_checkpoint;
static PAGECACHE_FILE *dfiles, /**< data files to flush in background */
  *dfiles_end; /**< list of data files ends here */
static PAGECACHE_FILE *kfiles, /**< index files to flush in background */
  *kfiles_end; /**< list of index files ends here */
/* those two statistics below could serve in SHOW GLOBAL STATUS */
static uint checkpoints_total= 0, /**< all checkpoint requests made */
  checkpoints_ok_total= 0; /**< all checkpoints which succeeded */
59 | |
/**
  Information to determine which dirty pages should be flushed; passed as
  the 'arg' of the pagecache flush filters declared below.
*/
struct st_filter_param
{
  LSN up_to_lsn; /**< only pages with rec_lsn <= this LSN are flushed */
  uint max_pages; /**< stop after flushing this number pages */
};

/* dirty-page filters for the different checkpoint levels / background flush */
static enum pagecache_flush_filter_result
filter_flush_file_medium(enum pagecache_page_type type,
                         pgcache_page_no_t page,
                         LSN rec_lsn, void *arg);
static enum pagecache_flush_filter_result
filter_flush_file_full(enum pagecache_page_type type,
                       pgcache_page_no_t page,
                       LSN rec_lsn, void *arg);
static enum pagecache_flush_filter_result
filter_flush_file_evenly(enum pagecache_page_type type,
                         pgcache_page_no_t pageno,
                         LSN rec_lsn, void *arg);
static int really_execute_checkpoint(void);
pthread_handler_t ma_checkpoint_background(void *arg);
static int collect_tables(LEX_STRING *str, LSN checkpoint_start_log_horizon);
81 | |
82 | /** |
83 | @brief Does a checkpoint |
84 | |
85 | @param level what level of checkpoint to do |
86 | @param no_wait if another checkpoint of same or stronger level |
87 | is already running, consider our job done |
88 | |
89 | @note In ha_maria, there can never be two threads trying a checkpoint at |
90 | the same time. |
91 | |
92 | @return Operation status |
93 | @retval 0 ok |
94 | @retval !=0 error |
95 | */ |
96 | |
int ma_checkpoint_execute(CHECKPOINT_LEVEL level, my_bool no_wait)
{
  int result= 0;
  DBUG_ENTER("ma_checkpoint_execute");

  if (!checkpoint_control.inited)
  {
    /*
      If ha_maria failed to start, maria_panic_hton is called, we come here.
    */
    DBUG_RETURN(0);
  }
  DBUG_ASSERT(level > CHECKPOINT_NONE);

  /* look for already running checkpoints */
  mysql_mutex_lock(&LOCK_checkpoint);
  while (checkpoint_in_progress != CHECKPOINT_NONE)
  {
    if (no_wait && (checkpoint_in_progress >= level))
    {
      /*
        If we are the checkpoint background thread, we don't wait (it's
        smarter to flush pages instead of waiting here while the other thread
        finishes its checkpoint).
      */
      mysql_mutex_unlock(&LOCK_checkpoint);
      goto end;
    }
    mysql_cond_wait(&COND_checkpoint, &LOCK_checkpoint);
  }

  /* claim the checkpoint slot: we become the single running checkpointer */
  checkpoint_in_progress= level;
  mysql_mutex_unlock(&LOCK_checkpoint);
  /* from then on, we are sure to be and stay the only checkpointer */

  result= really_execute_checkpoint();
  DBUG_EXECUTE_IF("maria_crash_after_checkpoint",
                  { DBUG_PRINT("maria_crash", ("now")); DBUG_SUICIDE(); });

  /*
    Wake up any thread waiting in the loop above
    (really_execute_checkpoint() already reset checkpoint_in_progress).
  */
  mysql_cond_broadcast(&COND_checkpoint);
end:
  DBUG_RETURN(result);
}
140 | |
141 | |
142 | /** |
143 | @brief Does a checkpoint, really; expects no other checkpoints |
144 | running. |
145 | |
146 | Checkpoint level requested is read from checkpoint_in_progress. |
147 | |
148 | @return Operation status |
149 | @retval 0 ok |
150 | @retval !=0 error |
151 | */ |
152 | |
static int really_execute_checkpoint(void)
{
  uint i, error= 0;
  /* cursor into record_pieces[3] (the dirty-pages piece) after a success */
  char *ptr;
  LEX_STRING record_pieces[4]; /**< only malloc-ed pieces */
  LSN min_page_rec_lsn, min_trn_rec_lsn, min_first_undo_lsn;
  TRANSLOG_ADDRESS checkpoint_start_log_horizon;
  /* serialized (LSN_STORE_SIZE bytes) form of checkpoint_start_log_horizon */
  char checkpoint_start_log_horizon_char[LSN_STORE_SIZE];
  DBUG_ENTER("really_execute_checkpoint");
  DBUG_PRINT("enter", ("level: %d", checkpoint_in_progress));
  /* zero all pieces so the cleanup at 'end:' can my_free() them blindly */
  bzero(&record_pieces, sizeof(record_pieces));

  /*
    STEP 1: record current end-of-log position using log's lock. It is
    critical for the correctness of Checkpoint (related to memory visibility
    rules, the log's lock is a mutex).
    "Horizon" is a lower bound of the LSN of the next log record.
  */
  checkpoint_start_log_horizon= translog_get_horizon();
  DBUG_PRINT("info",("checkpoint_start_log_horizon " LSN_FMT,
                     LSN_IN_PARTS(checkpoint_start_log_horizon)));
  lsn_store(checkpoint_start_log_horizon_char, checkpoint_start_log_horizon);

  /*
    STEP 2: fetch information about transactions.
    We must fetch transactions before dirty pages. Indeed, a transaction
    first sets its rec_lsn then sets the page's rec_lsn then sets its rec_lsn
    to 0. If we fetched pages first, we may see no dirty page yet, then we
    fetch transactions but the transaction has already reset its rec_lsn to 0
    so we miss rec_lsn again.
    For a similar reason (over-allocated bitmap pages) we have to fetch
    transactions before flushing bitmap pages.

    min_trn_rec_lsn will serve to lower the starting point of the REDO phase
    (down from checkpoint_start_log_horizon).
  */
  if (unlikely(trnman_collect_transactions(&record_pieces[0],
                                           &record_pieces[1],
                                           &min_trn_rec_lsn,
                                           &min_first_undo_lsn)))
    goto err;


  /* STEP 3: fetch information about table files */
  if (unlikely(collect_tables(&record_pieces[2],
                              checkpoint_start_log_horizon)))
    goto err;


  /* STEP 4: fetch information about dirty pages */
  /*
    It's better to do it _after_ having flushed some data pages (which
    collect_tables() may have done), because those are now non-dirty and so we
    have a more up-to-date dirty pages list to put into the checkpoint record,
    and thus we will have less work at Recovery.
  */
  /* Using default pagecache for now */
  if (unlikely(pagecache_collect_changed_blocks_with_lsn(maria_pagecache,
                                                         &record_pieces[3],
                                                         &min_page_rec_lsn)))
    goto err;


  /* LAST STEP: now write the checkpoint log record */
  {
    LSN lsn;
    translog_size_t total_rec_length;
    /*
      the log handler is allowed to modify "str" and "length" (but not "*str")
      of its argument, so we must not pass it record_pieces directly,
      otherwise we would later not know what memory pieces to my_free().
    */
    LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 5];
    log_array[TRANSLOG_INTERNAL_PARTS + 0].str=
      (uchar*) checkpoint_start_log_horizon_char;
    log_array[TRANSLOG_INTERNAL_PARTS + 0].length= total_rec_length=
      sizeof(checkpoint_start_log_horizon_char);
    for (i= 0; i < (sizeof(record_pieces)/sizeof(record_pieces[0])); i++)
    {
      log_array[TRANSLOG_INTERNAL_PARTS + 1 + i].str= (uchar*)record_pieces[i].str;
      log_array[TRANSLOG_INTERNAL_PARTS + 1 + i].length= record_pieces[i].length;
      total_rec_length+= (translog_size_t) record_pieces[i].length;
    }
    if (unlikely(translog_write_record(&lsn, LOGREC_CHECKPOINT,
                                       &dummy_transaction_object, NULL,
                                       total_rec_length,
                                       sizeof(log_array)/sizeof(log_array[0]),
                                       log_array, NULL, NULL) ||
                 translog_flush(lsn)))
      goto err;
    translog_lock();
    /*
      This cannot be done as a inwrite_rec_hook of LOGREC_CHECKPOINT, because
      such hook would be called before translog_flush (and we must be sure
      that log was flushed before we write to the control file).
    */
    if (unlikely(ma_control_file_write_and_force(lsn, last_logno,
                                                 max_trid_in_control_file,
                                                 recovery_failures)))
    {
      translog_unlock();
      goto err;
    }
    translog_unlock();
  }

  /*
    Note that we should not alter memory structures until we have successfully
    written the checkpoint record and control file.
  */
  /* checkpoint succeeded */
  /*
    The first 4 bytes of the dirty-pages piece encode the number of pages
    the background thread should flush before the next checkpoint.
  */
  ptr= record_pieces[3].str;
  pages_to_flush_before_next_checkpoint= uint4korr(ptr);
  DBUG_PRINT("checkpoint",("%u pages to flush before next checkpoint",
                           pages_to_flush_before_next_checkpoint));

  /* compute log's low-water mark */
  {
    TRANSLOG_ADDRESS log_low_water_mark= min_page_rec_lsn;
    set_if_smaller(log_low_water_mark, min_trn_rec_lsn);
    set_if_smaller(log_low_water_mark, min_first_undo_lsn);
    set_if_smaller(log_low_water_mark, checkpoint_start_log_horizon);
    /**
      Now purge unneeded logs.
      As some systems have an unreliable fsync (drive lying), we could try to
      be robust against that: remember a few previous checkpoints in the
      control file, and not purge logs immediately... Think about it.
    */
    if (translog_purge(log_low_water_mark))
      ma_message_no_user(0, "log purging failed");
  }

  goto end;

err:
  error= 1;
  ma_message_no_user(0, "checkpoint failed");
  /* we were possibly not able to determine what pages to flush */
  pages_to_flush_before_next_checkpoint= 0;

end:
  /* free the collected pieces (safe: unset ones were zeroed at entry) */
  for (i= 0; i < (sizeof(record_pieces)/sizeof(record_pieces[0])); i++)
    my_free(record_pieces[i].str);
  mysql_mutex_lock(&LOCK_checkpoint);
  checkpoint_in_progress= CHECKPOINT_NONE;
  checkpoints_total++;
  checkpoints_ok_total+= !error;
  mysql_mutex_unlock(&LOCK_checkpoint);
  DBUG_RETURN(error);
}
304 | |
305 | |
306 | /** |
307 | @brief Initializes the checkpoint module |
308 | |
309 | @param interval If one wants the module to create a |
310 | thread which will periodically do |
311 | checkpoints, and flush dirty pages, in the |
312 | background, it should specify a non-zero |
313 | interval in seconds. The thread will then be |
314 | created and will take checkpoints separated by |
315 | approximately 'interval' second. |
316 | |
317 | @note A checkpoint is taken only if there has been some significant |
318 | activity since the previous checkpoint. Between checkpoint N and N+1 the |
319 | thread flushes all dirty pages which were already dirty at the time of |
320 | checkpoint N. |
321 | |
322 | @return Operation status |
323 | @retval 0 ok |
324 | @retval !=0 error |
325 | */ |
326 | |
327 | int ma_checkpoint_init(ulong interval) |
328 | { |
329 | int res= 0; |
330 | DBUG_ENTER("ma_checkpoint_init" ); |
331 | if (ma_service_thread_control_init(&checkpoint_control)) |
332 | res= 1; |
333 | else if (interval > 0) |
334 | { |
335 | size_t intv= interval; |
336 | compile_time_assert(sizeof(void *) >= sizeof(ulong)); |
337 | if ((res= mysql_thread_create(key_thread_checkpoint, |
338 | &checkpoint_control.thread, NULL, |
339 | ma_checkpoint_background, |
340 | (void*) intv))) |
341 | checkpoint_control.killed= TRUE; |
342 | } |
343 | else |
344 | checkpoint_control.killed= TRUE; |
345 | DBUG_RETURN(res); |
346 | } |
347 | |
348 | |
349 | #ifndef DBUG_OFF |
350 | /** |
351 | Function used to test recovery: flush some table pieces and then caller |
352 | crashes. |
353 | |
354 | @param what_to_flush 0: current bitmap and all data pages |
355 | 1: state |
356 | 2: all bitmap pages |
357 | */ |
358 | static void flush_all_tables(int what_to_flush) |
359 | { |
360 | int res= 0; |
361 | LIST *pos; /**< to iterate over open tables */ |
362 | mysql_mutex_lock(&THR_LOCK_maria); |
363 | for (pos= maria_open_list; pos; pos= pos->next) |
364 | { |
365 | MARIA_HA *info= (MARIA_HA*)pos->data; |
366 | if (info->s->now_transactional) |
367 | { |
368 | switch (what_to_flush) |
369 | { |
370 | case 0: |
371 | res= _ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX, |
372 | FLUSH_KEEP, FLUSH_KEEP); |
373 | break; |
374 | case 1: |
375 | res= _ma_state_info_write(info->s, |
376 | MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET| |
377 | MA_STATE_INFO_WRITE_LOCK); |
378 | DBUG_PRINT("maria_flush_states" , |
379 | ("is_of_horizon: LSN " LSN_FMT, |
380 | LSN_IN_PARTS(info->s->state.is_of_horizon))); |
381 | break; |
382 | case 2: |
383 | res= _ma_bitmap_flush_all(info->s); |
384 | break; |
385 | } |
386 | } |
387 | DBUG_ASSERT(res == 0); |
388 | } |
389 | mysql_mutex_unlock(&THR_LOCK_maria); |
390 | } |
391 | #endif |
392 | |
393 | |
394 | /** |
395 | @brief Destroys the checkpoint module |
396 | */ |
397 | |
void ma_checkpoint_end(void)
{
  DBUG_ENTER("ma_checkpoint_end");
  /*
    Some intentional crash methods, usually triggered by
    SET MARIA_CHECKPOINT_INTERVAL=X
  */
  DBUG_EXECUTE_IF("maria_flush_bitmap",
                  {
                    DBUG_PRINT("maria_flush_bitmap", ("now"));
                    flush_all_tables(2);
                  });
  DBUG_EXECUTE_IF("maria_flush_whole_page_cache",
                  {
                    DBUG_PRINT("maria_flush_whole_page_cache", ("now"));
                    flush_all_tables(0);
                  });
  DBUG_EXECUTE_IF("maria_flush_whole_log",
                  {
                    DBUG_PRINT("maria_flush_whole_log", ("now"));
                    translog_flush(translog_get_horizon());
                  });
  /*
    Note that for WAL reasons, maria_flush_states requires
    maria_flush_whole_log.
  */
  DBUG_EXECUTE_IF("maria_flush_states",
                  {
                    DBUG_PRINT("maria_flush_states", ("now"));
                    flush_all_tables(1);
                  });
  DBUG_EXECUTE_IF("maria_crash",
                  { DBUG_PRINT("maria_crash", ("now")); DBUG_SUICIDE(); });

  if (checkpoint_control.inited)
  {
    /* stop the background thread, then release its file lists */
    ma_service_thread_control_end(&checkpoint_control);
    my_free(dfiles);
    my_free(kfiles);
    dfiles= kfiles= NULL; /* guard against double free on repeated end */
  }
  DBUG_VOID_RETURN;
}
441 | |
442 | |
443 | /** |
444 | @brief dirty-page filtering criteria for MEDIUM checkpoint. |
445 | |
446 | We flush data/index pages which have been dirty since the previous |
447 | checkpoint (this is the two-checkpoint rule: the REDO phase will not have |
448 | to start from earlier than the next-to-last checkpoint). |
449 | Bitmap pages are handled by _ma_bitmap_flush_all(). |
450 | |
451 | @param type Page's type |
452 | @param pageno Page's number |
453 | @param rec_lsn Page's rec_lsn |
454 | @param arg filter_param |
455 | */ |
456 | |
457 | static enum pagecache_flush_filter_result |
458 | filter_flush_file_medium(enum pagecache_page_type type, |
459 | pgcache_page_no_t pageno __attribute__ ((unused)), |
460 | LSN rec_lsn, void *arg) |
461 | { |
462 | struct st_filter_param *param= (struct st_filter_param *)arg; |
463 | return (type == PAGECACHE_LSN_PAGE) && |
464 | (cmp_translog_addr(rec_lsn, param->up_to_lsn) <= 0); |
465 | } |
466 | |
467 | |
468 | /** |
469 | @brief dirty-page filtering criteria for FULL checkpoint. |
470 | |
471 | We flush all dirty data/index pages. |
472 | Bitmap pages are handled by _ma_bitmap_flush_all(). |
473 | |
474 | @param type Page's type |
475 | @param pageno Page's number |
476 | @param rec_lsn Page's rec_lsn |
477 | @param arg filter_param |
478 | */ |
479 | |
480 | static enum pagecache_flush_filter_result |
481 | filter_flush_file_full(enum pagecache_page_type type, |
482 | pgcache_page_no_t pageno __attribute__ ((unused)), |
483 | LSN rec_lsn __attribute__ ((unused)), |
484 | void *arg __attribute__ ((unused))) |
485 | { |
486 | return (type == PAGECACHE_LSN_PAGE); |
487 | } |
488 | |
489 | |
490 | /** |
491 | @brief dirty-page filtering criteria for background flushing thread. |
492 | |
493 | We flush data/index pages which have been dirty since the previous |
494 | checkpoint (this is the two-checkpoint rule: the REDO phase will not have |
495 | to start from earlier than the next-to-last checkpoint), and no |
496 | bitmap pages. But we flush no more than a certain number of pages (to have |
497 | an even flushing, no write burst). |
498 | The reason to not flush bitmap pages is that they may not be in a flushable |
499 | state at this moment and we don't want to wait for them. |
500 | |
501 | @param type Page's type |
502 | @param pageno Page's number |
503 | @param rec_lsn Page's rec_lsn |
504 | @param arg filter_param |
505 | */ |
506 | |
507 | static enum pagecache_flush_filter_result |
508 | filter_flush_file_evenly(enum pagecache_page_type type, |
509 | pgcache_page_no_t pageno __attribute__ ((unused)), |
510 | LSN rec_lsn, void *arg) |
511 | { |
512 | struct st_filter_param *param= (struct st_filter_param *)arg; |
513 | if (unlikely(param->max_pages == 0)) /* all flushed already */ |
514 | return FLUSH_FILTER_SKIP_ALL; |
515 | if ((type == PAGECACHE_LSN_PAGE) && |
516 | (cmp_translog_addr(rec_lsn, param->up_to_lsn) <= 0)) |
517 | { |
518 | param->max_pages--; |
519 | return FLUSH_FILTER_OK; |
520 | } |
521 | return FLUSH_FILTER_SKIP_TRY_NEXT; |
522 | } |
523 | |
524 | |
525 | /** |
526 | @brief Background thread which does checkpoints and flushes periodically. |
527 | |
528 | Takes a checkpoint. After this, all pages dirty at the time of that |
529 | checkpoint are flushed evenly until it is time to take another checkpoint. |
530 | This ensures that the REDO phase starts at earliest (in LSN time) at the |
531 | next-to-last checkpoint record ("two-checkpoint rule"). |
532 | |
533 | @note MikaelR questioned why the same thread does two different jobs, the |
534 | risk could be that while a checkpoint happens no LRD flushing happens. |
535 | */ |
536 | |
/* minimum pagecache write activity (bytes) before a new checkpoint is taken */
static ulong maria_checkpoint_min_cache_activity= 10*1024*1024;
/* minimum log write activity (bytes); default here, set in ha_maria.cc */
ulong maria_checkpoint_min_log_activity= 1*1024*1024;
540 | |
pthread_handler_t ma_checkpoint_background(void *arg)
{
  /*
    'arg' carries the checkpoint interval (seconds), cast to a pointer by
    ma_checkpoint_init().
    If the interval could be changed by the user while we are in this thread,
    it could be annoying: for example it could cause "case 2" to be executed
    right after "case 0", thus having 'dfile' unset. So the thread cares only
    about the interval's value when it started.
  */
  const size_t interval= (size_t)arg;
  size_t sleeps, sleep_time;
  TRANSLOG_ADDRESS log_horizon_at_last_checkpoint=
    translog_get_horizon();
  ulonglong pagecache_flushes_at_last_checkpoint=
    maria_pagecache->global_cache_write;
  uint UNINIT_VAR(pages_bunch_size);
  struct st_filter_param filter_param;
  PAGECACHE_FILE *UNINIT_VAR(dfile); /**< data file currently being flushed */
  PAGECACHE_FILE *UNINIT_VAR(kfile); /**< index file currently being flushed */

  my_thread_init();
  DBUG_PRINT("info",("Maria background checkpoint thread starts"));
  DBUG_ASSERT(interval > 0);

  PSI_CALL_set_thread_user_host(0,0,0,0);

  /*
    Recovery ended with all tables closed and a checkpoint: no need to take
    one immediately.
  */
  sleeps= 1;
  pages_to_flush_before_next_checkpoint= 0;

  for(;;) /* iterations of checkpoints and dirty page flushing */
  {
#if 0 /* good for testing, to do a lot of checkpoints, finds a lot of bugs */
    sleeps=0;
#endif
    /*
      'sleeps' counts seconds slept since start; the phase inside the
      interval decides the job: second 0 takes a checkpoint, second 1 sets up
      background flushing, the remaining seconds flush page bunches evenly.
    */
    switch (sleeps % interval)
    {
    case 0:
    {
      /* If checkpoints are disabled, wait 1 second and try again */
      if (maria_checkpoint_disabled)
      {
        sleep_time= 1;
        break;
      }
      {
        TRANSLOG_ADDRESS horizon= translog_get_horizon();

        /*
          With background flushing evenly distributed over the time
          between two checkpoints, we should have only little flushing to do
          in the checkpoint.
        */
        /*
          No checkpoint if little work of interest for recovery was done
          since last checkpoint. Such work includes log writing (lengthens
          recovery, checkpoint would shorten it), page flushing (checkpoint
          would decrease the amount of read pages in recovery).
          In case of one short statement per minute (very low load), we don't
          want to checkpoint every minute, hence the positive
          maria_checkpoint_min_activity.
        */
        if ((ulonglong) (horizon - log_horizon_at_last_checkpoint) <=
            maria_checkpoint_min_log_activity &&
            ((ulonglong) (maria_pagecache->global_cache_write -
                          pagecache_flushes_at_last_checkpoint) *
             maria_pagecache->block_size) <=
            maria_checkpoint_min_cache_activity)
        {
          /*
            Not enough has happened since last checkpoint.
            Sleep for a while and try again later
          */
          sleep_time= interval;
          break;
        }
        sleep_time= 1;
        ma_checkpoint_execute(CHECKPOINT_MEDIUM, TRUE);
        /*
          Snapshot this kind of "state" of the engine. Note that the value
          below is possibly greater than last_checkpoint_lsn.
        */
        log_horizon_at_last_checkpoint= translog_get_horizon();
        pagecache_flushes_at_last_checkpoint=
          maria_pagecache->global_cache_write;
        /*
          If the checkpoint above succeeded it has set d|kfiles and
          d|kfiles_end. If it has failed, it has set
          pages_to_flush_before_next_checkpoint to 0 so we will skip flushing
          and sleep until the next checkpoint.
        */
      }
      break;
    }
    case 1:
      /* set up parameters for background page flushing */
      filter_param.up_to_lsn= last_checkpoint_lsn;
      pages_bunch_size= pages_to_flush_before_next_checkpoint / (uint)interval;
      dfile= dfiles;
      kfile= kfiles;
      /* fall through */
    default:
      if (pages_bunch_size > 0)
      {
        DBUG_PRINT("checkpoint",
                   ("Maria background checkpoint thread: %u pages",
                    pages_bunch_size));
        /* flush a bunch of dirty pages */
        filter_param.max_pages= pages_bunch_size;
        while (dfile != dfiles_end)
        {
          /*
            We use FLUSH_KEEP_LAZY: if a file is already in flush, it's
            smarter to move to the next file than wait for this one to be
            completely flushed, which may take long.
            StaleFilePointersInFlush: notice how below we use "dfile" which
            is an OS file descriptor plus some function and MARIA_SHARE
            pointers; this data dates from a previous checkpoint; since then,
            the table may have been closed (so MARIA_SHARE* became stale), and
            the file descriptor reassigned to another table which does not
            have the same CRC-read-set callbacks: it is thus important that
            flush_pagecache_blocks_with_filter() does not use the pointers,
            only the OS file descriptor.
          */
          int res=
            flush_pagecache_blocks_with_filter(maria_pagecache,
                                               dfile, FLUSH_KEEP_LAZY,
                                               filter_flush_file_evenly,
                                               &filter_param);
          if (unlikely(res & PCFLUSH_ERROR))
            ma_message_no_user(0, "background data page flush failed");
          if (filter_param.max_pages == 0) /* bunch all flushed, sleep */
            break; /* and we will continue with the same file */
          dfile++; /* otherwise all this file is flushed, move to next file */
          /*
            MikaelR noted that he observed that Linux's file cache may never
            fsync to disk until this cache is full, at which point it decides
            to empty the cache, making the machine very slow. A solution was
            to fsync after writing 2 MB. So we might want to fsync() here if
            we wrote enough pages.
          */
        }
        while (kfile != kfiles_end)
        {
          int res=
            flush_pagecache_blocks_with_filter(maria_pagecache,
                                               kfile, FLUSH_KEEP_LAZY,
                                               filter_flush_file_evenly,
                                               &filter_param);
          if (unlikely(res & PCFLUSH_ERROR))
            ma_message_no_user(0, "background index page flush failed");
          if (filter_param.max_pages == 0) /* bunch all flushed, sleep */
            break; /* and we will continue with the same file */
          kfile++; /* otherwise all this file is flushed, move to next file */
        }
        sleep_time= 1;
      }
      else
      {
        /* Can directly sleep until the next checkpoint moment */
        sleep_time= interval - (sleeps % interval);
      }
    }
    /* returns non-zero when the service thread has been asked to stop */
    if (my_service_thread_sleep(&checkpoint_control,
                                sleep_time * 1000000000ULL))
      break;
    sleeps+= sleep_time;
  }
  DBUG_PRINT("info",("Maria background checkpoint thread ends"));
  {
    CHECKPOINT_LEVEL level= CHECKPOINT_FULL;
    /*
      That's the final one, which guarantees that a clean shutdown always ends
      with a checkpoint.
    */
    DBUG_EXECUTE_IF("maria_checkpoint_indirect", level= CHECKPOINT_INDIRECT;);
    ma_checkpoint_execute(level, FALSE);
  }
  my_thread_end();
  return 0;
}
725 | |
726 | |
727 | /** |
728 | @brief Allocates buffer and stores in it some info about open tables, |
729 | does some flushing on those. |
730 | |
731 | Does the allocation because the caller cannot know the size itself. |
732 | Memory freeing is to be done by the caller (if the "str" member of the |
733 | LEX_STRING is not NULL). |
734 | The caller is taking a checkpoint. |
735 | |
736 | @param[out] str pointer to where the allocated buffer, |
737 | and its size, will be put; buffer will be filled |
738 | with info about open tables |
739 | @param checkpoint_start_log_horizon Of the in-progress checkpoint |
740 | record. |
741 | |
742 | @return Operation status |
743 | @retval 0 OK |
744 | @retval 1 Error |
745 | */ |
746 | |
747 | static int collect_tables(LEX_STRING *str, LSN checkpoint_start_log_horizon) |
748 | { |
749 | MARIA_SHARE **distinct_shares= NULL; |
750 | char *ptr; |
751 | uint error= 1, sync_error= 0, nb, nb_stored, i; |
752 | my_bool unmark_tables= TRUE; |
753 | size_t total_names_length; |
754 | LIST *pos; /**< to iterate over open tables */ |
755 | struct st_state_copy { |
756 | uint index; |
757 | MARIA_STATE_INFO state; |
758 | }; |
759 | struct st_state_copy *state_copies= NULL, /**< fixed-size cache of states */ |
760 | *state_copies_end, /**< cache ends here */ |
761 | *state_copy; /**< iterator in cache */ |
762 | TRANSLOG_ADDRESS UNINIT_VAR(state_copies_horizon); /**< horizon of states' _copies_ */ |
763 | struct st_filter_param filter_param; |
764 | PAGECACHE_FLUSH_FILTER filter; |
765 | DBUG_ENTER("collect_tables" ); |
766 | |
767 | /* let's make a list of distinct shares */ |
768 | mysql_mutex_lock(&THR_LOCK_maria); |
769 | for (nb= 0, pos= maria_open_list; pos; pos= pos->next) |
770 | { |
771 | MARIA_HA *info= (MARIA_HA*)pos->data; |
772 | MARIA_SHARE *share= info->s; |
773 | /* the first three variables below can never change */ |
774 | if (share->base.born_transactional && !share->temporary && |
775 | share->mode != O_RDONLY && |
776 | !(share->in_checkpoint & MARIA_CHECKPOINT_SEEN_IN_LOOP)) |
777 | { |
778 | /* |
779 | Apart from us, only maria_close() reads/sets in_checkpoint but cannot |
780 | run now as we hold THR_LOCK_maria. |
781 | */ |
782 | /* |
783 | This table is relevant for checkpoint and not already seen. Mark it, |
784 | so that it is not seen again in the loop. |
785 | */ |
786 | nb++; |
787 | DBUG_ASSERT(share->in_checkpoint == 0); |
788 | /* This flag ensures that we count only _distinct_ shares. */ |
789 | share->in_checkpoint= MARIA_CHECKPOINT_SEEN_IN_LOOP; |
790 | } |
791 | } |
792 | if (unlikely((distinct_shares= |
793 | (MARIA_SHARE **)my_malloc(nb * sizeof(MARIA_SHARE *), |
794 | MYF(MY_WME))) == NULL)) |
795 | goto err; |
796 | for (total_names_length= 0, i= 0, pos= maria_open_list; pos; pos= pos->next) |
797 | { |
798 | MARIA_HA *info= (MARIA_HA*)pos->data; |
799 | MARIA_SHARE *share= info->s; |
800 | if (share->in_checkpoint & MARIA_CHECKPOINT_SEEN_IN_LOOP) |
801 | { |
802 | distinct_shares[i++]= share; |
803 | /* |
804 | With this we prevent the share from going away while we later flush |
805 | and force it without holding THR_LOCK_maria. For example if the share |
806 | could be my_free()d by maria_close() we would have a problem when we |
807 | access it to flush the table. We "pin" the share pointer. |
808 | And we also take down MARIA_CHECKPOINT_SEEN_IN_LOOP, so that it is |
809 | not seen again in the loop. |
810 | */ |
811 | share->in_checkpoint= MARIA_CHECKPOINT_LOOKS_AT_ME; |
812 | total_names_length+= share->open_file_name.length; |
813 | } |
814 | } |
815 | |
816 | DBUG_ASSERT(i == nb); |
817 | mysql_mutex_unlock(&THR_LOCK_maria); |
818 | DBUG_PRINT("info" ,("found %u table shares" , nb)); |
819 | |
820 | str->length= |
821 | 4 + /* number of tables */ |
822 | (2 + /* short id */ |
823 | LSN_STORE_SIZE + /* first_log_write_at_lsn */ |
824 | 1 /* end-of-name 0 */ |
825 | ) * nb + total_names_length; |
826 | if (unlikely((str->str= my_malloc(str->length, MYF(MY_WME))) == NULL)) |
827 | goto err; |
828 | |
829 | ptr= str->str; |
830 | ptr+= 4; /* real number of stored tables is not yet know */ |
831 | |
832 | /* only possible checkpointer, so can do the read below without mutex */ |
833 | filter_param.up_to_lsn= last_checkpoint_lsn; |
834 | switch(checkpoint_in_progress) |
835 | { |
836 | case CHECKPOINT_MEDIUM: |
837 | filter= &filter_flush_file_medium; |
838 | break; |
839 | case CHECKPOINT_FULL: |
840 | filter= &filter_flush_file_full; |
841 | break; |
842 | case CHECKPOINT_INDIRECT: |
843 | filter= NULL; |
844 | break; |
845 | default: |
846 | DBUG_ASSERT(0); |
847 | goto err; |
848 | } |
849 | |
850 | /* |
851 | The principle of reading/writing the state below is explained in |
852 | ma_recovery.c, look for "Recovery of the state". |
853 | */ |
854 | #define STATE_COPIES 1024 |
855 | state_copies= (struct st_state_copy *) |
856 | my_malloc(STATE_COPIES * sizeof(struct st_state_copy), MYF(MY_WME)); |
857 | dfiles= (PAGECACHE_FILE *)my_realloc((uchar *)dfiles, |
858 | /* avoid size of 0 for my_realloc */ |
859 | MY_MAX(1, nb) * sizeof(PAGECACHE_FILE), |
860 | MYF(MY_WME | MY_ALLOW_ZERO_PTR)); |
861 | kfiles= (PAGECACHE_FILE *)my_realloc((uchar *)kfiles, |
862 | /* avoid size of 0 for my_realloc */ |
863 | MY_MAX(1, nb) * sizeof(PAGECACHE_FILE), |
864 | MYF(MY_WME | MY_ALLOW_ZERO_PTR)); |
865 | if (unlikely((state_copies == NULL) || |
866 | (dfiles == NULL) || (kfiles == NULL))) |
867 | goto err; |
868 | state_copy= state_copies_end= NULL; |
869 | dfiles_end= dfiles; |
870 | kfiles_end= kfiles; |
871 | |
872 | for (nb_stored= 0, i= 0; i < nb; i++) |
873 | { |
874 | MARIA_SHARE *share= distinct_shares[i]; |
875 | PAGECACHE_FILE kfile, dfile; |
876 | my_bool ignore_share; |
877 | if (!(share->in_checkpoint & MARIA_CHECKPOINT_LOOKS_AT_ME)) |
878 | { |
879 | /* |
880 | No need for a mutex to read the above, only us can write *this* bit of |
881 | the in_checkpoint bitmap |
882 | */ |
883 | continue; |
884 | } |
885 | /** |
886 | @todo We should not look at tables which didn't change since last |
887 | checkpoint. |
888 | */ |
889 | DBUG_PRINT("info" ,("looking at table '%s'" , share->open_file_name.str)); |
890 | if (state_copy == state_copies_end) /* we have no more cached states */ |
891 | { |
892 | /* |
893 | Collect and cache a bunch of states. We do this for many states at a |
894 | time, to not lock/unlock the log's lock too often. |
895 | */ |
896 | uint j, bound= MY_MIN(nb, i + STATE_COPIES); |
897 | state_copy= state_copies; |
898 | /* part of the state is protected by log's lock */ |
899 | translog_lock(); |
900 | state_copies_horizon= translog_get_horizon_no_lock(); |
901 | for (j= i; j < bound; j++) |
902 | { |
903 | MARIA_SHARE *share2= distinct_shares[j]; |
904 | if (!(share2->in_checkpoint & MARIA_CHECKPOINT_LOOKS_AT_ME)) |
905 | continue; |
906 | state_copy->index= j; |
907 | state_copy->state= share2->state; /* we copy the state */ |
908 | state_copy++; |
909 | /* |
910 | data_file_length is not updated under log's lock by the bitmap |
911 | code, but writing a wrong data_file_length is ok: a next |
912 | maria_close() will correct it; if we crash before, Recovery will |
913 | set it to the true physical size. |
914 | */ |
915 | } |
916 | translog_unlock(); |
917 | if (state_copy == state_copies) |
918 | break; /* Nothing to do */ |
919 | |
920 | /** |
921 | We are going to flush these states. |
922 | Before, all records describing how to undo such state must be |
923 | in the log (WAL). Usually this means UNDOs. In the special case of |
924 | data|key_file_length, recovery just needs to open the table to fix the |
925 | length, so any LOGREC_FILE_ID/REDO/UNDO allowing recovery to |
926 | understand it must open a table, is enough; so as long as |
927 | data|key_file_length is updated after writing any log record it's ok: |
928 | if we copied new value above, it means the record was before |
929 | state_copies_horizon and we flush such record below. |
930 | Apart from data|key_file_length which are easily recoverable from the |
931 | real file's size, all other state members must be updated only when |
932 | writing the UNDO; otherwise, if updated before, if their new value is |
933 | flushed by a checkpoint and there is a crash before UNDO is written, |
934 | their REDO group will be missing or at least incomplete and skipped |
935 | by recovery, so bad state value will stay. For example, setting |
936 | key_root before writing the UNDO: the table would have old index |
937 | pages (they were pinned at time of crash) and a new, thus wrong, |
938 | key_root. |
939 | @todo RECOVERY BUG check that all code honours that. |
940 | */ |
941 | if (translog_flush(state_copies_horizon)) |
942 | goto err; |
      /* now we have cached states and they are WAL-safe */
944 | state_copies_end= state_copy-1; |
945 | state_copy= state_copies; |
946 | } |
947 | |
948 | /* locate our state among these cached ones */ |
949 | for ( ; state_copy->index != i; state_copy++) |
950 | DBUG_ASSERT(state_copy <= state_copies_end); |
951 | |
952 | /* OS file descriptors are ints which we stored in 4 bytes */ |
953 | compile_time_assert(sizeof(int) <= 4); |
954 | /* |
955 | Protect against maria_close() (which does some memory freeing in |
956 | MARIA_FILE_BITMAP) with close_lock. intern_lock is not |
957 | sufficient as we, as well as maria_close(), are going to unlock |
958 | intern_lock in the middle of manipulating the table. Serializing us and |
959 | maria_close() should help avoid problems. |
960 | */ |
961 | mysql_mutex_lock(&share->close_lock); |
962 | mysql_mutex_lock(&share->intern_lock); |
963 | /* |
964 | Tables in a normal state have their two file descriptors open. |
965 | In some rare cases like REPAIR, some descriptor may be closed or even |
966 | -1. If that happened, the _ma_state_info_write() may fail. This is |
967 | prevented by enclosing all all places which close/change kfile.file with |
968 | intern_lock. |
969 | */ |
970 | kfile= share->kfile; |
971 | dfile= share->bitmap.file; |
972 | /* |
973 | Ignore table which has no logged writes (all its future log records will |
974 | be found naturally by Recovery). Ignore obsolete shares (_before_ |
975 | setting themselves to last_version=0 they already did all flush and |
976 | sync; if we flush their state now we may be flushing an obsolete state |
977 | onto a newer one (assuming the table has been reopened with a different |
978 | share but of course same physical index file). |
979 | */ |
980 | ignore_share= (share->id == 0) | (share->last_version == 0); |
981 | DBUG_PRINT("info" , ("ignore_share: %d" , ignore_share)); |
982 | if (!ignore_share) |
983 | { |
984 | size_t open_file_name_len= share->open_file_name.length + 1; |
985 | /* remember the descriptors for background flush */ |
986 | *(dfiles_end++)= dfile; |
987 | *(kfiles_end++)= kfile; |
988 | /* we will store this table in the record */ |
989 | nb_stored++; |
990 | int2store(ptr, share->id); |
991 | ptr+= 2; |
992 | lsn_store(ptr, share->lsn_of_file_id); |
993 | ptr+= LSN_STORE_SIZE; |
994 | /* |
995 | first_bitmap_with_space is not updated under log's lock, and is |
996 | important. We would need the bitmap's lock to get it right. Recovery |
997 | of this is not clear, so we just play safe: write it out as |
998 | unknown: if crash, _ma_bitmap_init() at next open (for example in |
999 | Recovery) will convert it to 0 and thus the first insertion will |
1000 | search for free space from the file's first bitmap (0) - |
1001 | under-optimal but safe. |
1002 | If no crash, maria_close() will write the exact value. |
1003 | */ |
1004 | state_copy->state.first_bitmap_with_space= ~(ulonglong)0; |
1005 | memcpy(ptr, share->open_file_name.str, open_file_name_len); |
1006 | ptr+= open_file_name_len; |
1007 | if (cmp_translog_addr(share->state.is_of_horizon, |
1008 | checkpoint_start_log_horizon) >= 0) |
1009 | { |
1010 | /* |
1011 | State was flushed recently, it does not hold down the log's |
1012 | low-water mark and will not give avoidable work to Recovery. So we |
1013 | needn't flush it. Also, it is possible that while we copied the |
1014 | state above (under log's lock, without intern_lock) it was being |
1015 | modified in memory or flushed to disk (without log's lock, under |
1016 | intern_lock, like in maria_extra()), so our copy may be incorrect |
1017 | and we should not flush it. |
1018 | It may also be a share which got last_version==0 since we checked |
1019 | last_version; in this case, it flushed its state and the LSN test |
1020 | above will catch it. |
1021 | */ |
1022 | } |
1023 | else |
1024 | { |
1025 | /* |
1026 | We could do the state flush only if share->changed, but it's |
1027 | tricky. |
1028 | Consider a maria_write() which has written REDO,UNDO, and before it |
1029 | calls _ma_writeinfo() (setting share->changed=1), checkpoint |
1030 | happens and sees share->changed=0, does not flush state. It is |
1031 | possible that Recovery does not start from before the REDO and thus |
1032 | the state is not recovered. A solution may be to set |
1033 | share->changed=1 under log mutex when writing log records. |
1034 | |
          The current solution is to keep a copy of the last saved state and
1036 | not write the state if it was same as last time. It's ok if |
1037 | is_of_horizon would be different on disk if all other data is |
1038 | the same. |
1039 | */ |
1040 | DBUG_ASSERT(share->last_version != 0); |
1041 | state_copy->state.is_of_horizon= share->state.is_of_horizon= |
1042 | share->checkpoint_state.is_of_horizon= state_copies_horizon; |
1043 | if (kfile.file >= 0 && memcmp(&share->checkpoint_state, |
1044 | &state_copy->state, |
1045 | sizeof(state_copy->state))) |
1046 | { |
1047 | sync_error|= |
1048 | _ma_state_info_write_sub(kfile.file, &state_copy->state, |
1049 | MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET); |
1050 | memcpy(&share->checkpoint_state, |
1051 | &state_copy->state, sizeof(state_copy->state)); |
1052 | } |
1053 | /* |
1054 | We don't set share->changed=0 because it may interfere with a |
1055 | concurrent _ma_writeinfo() doing share->changed=1 (cancel its |
1056 | effect). The sad consequence is that we will flush the same state at |
1057 | each checkpoint if the table was once written and then not anymore. |
1058 | */ |
1059 | } |
1060 | } |
1061 | #ifdef EXTRA_DEBUG_BITMAP |
1062 | else |
1063 | { |
1064 | DBUG_ASSERT(share->bitmap.changed == 0 && |
1065 | share->bitmap.changed_not_flushed == 0); |
1066 | } |
1067 | #endif |
1068 | |
1069 | /* |
1070 | _ma_bitmap_flush_all() may wait, so don't keep intern_lock as |
1071 | otherwise this would deadlock with allocate_and_write_block_record() |
1072 | calling _ma_set_share_data_file_length() |
1073 | */ |
1074 | mysql_mutex_unlock(&share->intern_lock); |
1075 | |
1076 | if (!ignore_share) |
1077 | { |
1078 | /* |
1079 | share->bitmap is valid because it's destroyed under close_lock which |
1080 | we hold. |
1081 | */ |
1082 | if (_ma_bitmap_flush_all(share)) |
1083 | { |
1084 | sync_error= 1; |
1085 | /** @todo all write failures should mark table corrupted */ |
1086 | ma_message_no_user(0, "checkpoint bitmap page flush failed" ); |
1087 | } |
1088 | DBUG_ASSERT(share->pagecache == maria_pagecache); |
1089 | } |
1090 | /* |
1091 | Clean up any unused states. |
1092 | TODO: Only do this call if there has been # (10?) ended transactions |
1093 | since last call. |
1094 | We had to release intern_lock to respect lock order with LOCK_trn_list. |
1095 | */ |
1096 | _ma_remove_not_visible_states_with_lock(share, FALSE); |
1097 | |
1098 | if (share->in_checkpoint & MARIA_CHECKPOINT_SHOULD_FREE_ME) |
1099 | { |
1100 | /* |
1101 | maria_close() left us free the share. When it run it set share->id |
1102 | to 0. As it run before we locked close_lock, we should have seen this |
1103 | and so this assertion should be true: |
1104 | */ |
1105 | DBUG_ASSERT(ignore_share); |
1106 | mysql_mutex_destroy(&share->intern_lock); |
1107 | mysql_mutex_unlock(&share->close_lock); |
1108 | mysql_mutex_destroy(&share->close_lock); |
1109 | ma_crypt_free(share); |
1110 | my_free(share); |
1111 | } |
1112 | else |
1113 | { |
1114 | /* share goes back to normal state */ |
1115 | share->in_checkpoint= 0; |
1116 | mysql_mutex_unlock(&share->close_lock); |
1117 | } |
1118 | |
1119 | /* |
1120 | We do the big disk writes out of intern_lock to not block other |
1121 | users of this table (intern_lock is taken at the start and end of |
1122 | every statement). This means that file descriptors may be invalid |
1123 | (files may have been closed for example by HA_EXTRA_PREPARE_FOR_* |
1124 | under Windows, or REPAIR). This should not be a problem as we use |
1125 | MY_IGNORE_BADFD. Descriptors may even point to other files but then |
1126 | the old blocks (of before the close) must have been flushed for sure, |
1127 | so our flush will flush new blocks (of after the latest open) and that |
1128 | should do no harm. |
1129 | */ |
1130 | /* |
1131 | If CHECKPOINT_MEDIUM, this big flush below may result in a |
1132 | serious write burst. Realize that all pages dirtied between the |
1133 | last checkpoint and the one we are doing now, will be flushed at |
1134 | next checkpoint, except those evicted by LRU eviction (depending on |
1135 | the size of the page cache compared to the size of the working data |
1136 | set, eviction may be rare or frequent). |
1137 | We avoid that burst by anticipating: those pages are flushed |
1138 | in bunches spanned regularly over the time interval between now and |
1139 | the next checkpoint, by a background thread. Thus the next checkpoint |
1140 | will have only little flushing to do (CHECKPOINT_MEDIUM should thus be |
1141 | only a little slower than CHECKPOINT_INDIRECT). |
1142 | */ |
1143 | |
1144 | /* |
1145 | PageCacheFlushConcurrencyBugs |
1146 | Inside the page cache, calls to flush_pagecache_blocks_int() on the same |
1147 | file are serialized. Examples of concurrency bugs which happened when we |
1148 | didn't have this serialization: |
1149 | - maria_chk_size() (via CHECK TABLE) happens concurrently with |
1150 | Checkpoint: Checkpoint is flushing a page: it pins the page and is |
1151 | pre-empted, maria_chk_size() wants to flush this page too so gets an |
1152 | error because Checkpoint pinned this page. Such error makes |
1153 | maria_chk_size() mark the table as corrupted. |
1154 | - maria_close() happens concurrently with Checkpoint: |
1155 | Checkpoint is flushing a page: it registers a request on the page, is |
1156 | pre-empted ; maria_close() flushes this page too with FLUSH_RELEASE: |
1157 | FLUSH_RELEASE will cause a free_block() which assumes the page is in the |
1158 | LRU, but it is not (as Checkpoint registered a request). Crash. |
1159 | - one thread is evicting a page of the file out of the LRU: it marks it |
        PCBLOCK_IN_SWITCH and is pre-empted. Then two other threads do flushes
1161 | of the same file concurrently (like above). Then one flusher sees the |
1162 | page is in switch, removes it from changed_blocks[] and puts it in its |
1163 | first_in_switch, so the other flusher will not see the page at all and |
1164 | return too early. If it's maria_close() which returns too early, then |
1165 | maria_close() may close the file descriptor, and the other flusher, and |
1166 | the evicter will fail to write their page: corruption. |
1167 | */ |
1168 | |
1169 | if (!ignore_share) |
1170 | { |
1171 | if (filter != NULL) |
1172 | { |
1173 | if ((flush_pagecache_blocks_with_filter(maria_pagecache, |
1174 | &dfile, FLUSH_KEEP_LAZY, |
1175 | filter, &filter_param) & |
1176 | PCFLUSH_ERROR)) |
1177 | ma_message_no_user(0, "checkpoint data page flush failed" ); |
1178 | if ((flush_pagecache_blocks_with_filter(maria_pagecache, |
1179 | &kfile, FLUSH_KEEP_LAZY, |
1180 | filter, &filter_param) & |
1181 | PCFLUSH_ERROR)) |
1182 | ma_message_no_user(0, "checkpoint index page flush failed" ); |
1183 | } |
1184 | /* |
1185 | fsyncs the fd, that's the loooong operation (e.g. max 150 fsync |
1186 | per second, so if you have touched 1000 files it's 7 seconds). |
1187 | */ |
1188 | sync_error|= |
1189 | mysql_file_sync(dfile.file, MYF(MY_WME | MY_IGNORE_BADFD)) | |
1190 | mysql_file_sync(kfile.file, MYF(MY_WME | MY_IGNORE_BADFD)); |
1191 | /* |
1192 | in case of error, we continue because writing other tables to disk is |
1193 | still useful. |
1194 | */ |
1195 | } |
1196 | } |
1197 | |
1198 | if (sync_error) |
1199 | goto err; |
  /* We may have over-estimated (due to share->id==0 or last_version==0) */
1201 | DBUG_ASSERT(str->length >= (uint)(ptr - str->str)); |
1202 | str->length= (uint)(ptr - str->str); |
1203 | /* |
1204 | As we support max 65k tables open at a time (2-byte short id), we |
1205 | assume uint is enough for the cumulated length of table names; and |
1206 | LEX_STRING::length is uint. |
1207 | */ |
1208 | int4store(str->str, nb_stored); |
1209 | error= unmark_tables= 0; |
1210 | |
1211 | err: |
1212 | if (unlikely(unmark_tables)) |
1213 | { |
1214 | /* maria_close() uses THR_LOCK_maria from start to end */ |
1215 | mysql_mutex_lock(&THR_LOCK_maria); |
1216 | for (i= 0; i < nb; i++) |
1217 | { |
1218 | MARIA_SHARE *share= distinct_shares[i]; |
1219 | if (share->in_checkpoint & MARIA_CHECKPOINT_SHOULD_FREE_ME) |
1220 | { |
1221 | /* maria_close() left us to free the share */ |
1222 | mysql_mutex_destroy(&share->intern_lock); |
1223 | ma_crypt_free(share); |
1224 | my_free(share); |
1225 | } |
1226 | else |
1227 | { |
1228 | /* share goes back to normal state */ |
1229 | share->in_checkpoint= 0; |
1230 | } |
1231 | } |
1232 | mysql_mutex_unlock(&THR_LOCK_maria); |
1233 | } |
1234 | my_free(distinct_shares); |
1235 | my_free(state_copies); |
1236 | DBUG_RETURN(error); |
1237 | } |
1238 | |