1 | /***************************************************************************** |
2 | |
3 | Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved. |
4 | Copyright (c) 2009, Google Inc. |
5 | Copyright (c) 2014, 2018, MariaDB Corporation. |
6 | |
7 | Portions of this file contain modifications contributed and copyrighted by |
8 | Google, Inc. Those modifications are gratefully acknowledged and are described |
9 | briefly in the InnoDB documentation. The contributions by Google are |
10 | incorporated with their permission, and subject to the conditions contained in |
11 | the file COPYING.Google. |
12 | |
13 | This program is free software; you can redistribute it and/or modify it under |
14 | the terms of the GNU General Public License as published by the Free Software |
15 | Foundation; version 2 of the License. |
16 | |
17 | This program is distributed in the hope that it will be useful, but WITHOUT |
18 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
19 | FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. |
20 | |
21 | You should have received a copy of the GNU General Public License along with |
22 | this program; if not, write to the Free Software Foundation, Inc., |
23 | 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA |
24 | |
25 | *****************************************************************************/ |
26 | |
27 | /**************************************************//** |
28 | @file log/log0log.cc |
29 | Database log |
30 | |
31 | Created 12/9/1995 Heikki Tuuri |
32 | *******************************************************/ |
33 | |
34 | #include "ha_prototypes.h" |
35 | #include <debug_sync.h> |
36 | #include <my_service_manager.h> |
37 | |
38 | #include "log0log.h" |
39 | #include "log0crypt.h" |
40 | #include "mem0mem.h" |
41 | #include "buf0buf.h" |
42 | #include "buf0flu.h" |
43 | #include "lock0lock.h" |
44 | #include "log0recv.h" |
45 | #include "fil0fil.h" |
46 | #include "dict0boot.h" |
47 | #include "dict0stats_bg.h" |
48 | #include "btr0defragment.h" |
49 | #include "srv0srv.h" |
50 | #include "srv0start.h" |
51 | #include "trx0sys.h" |
52 | #include "trx0trx.h" |
53 | #include "trx0roll.h" |
54 | #include "srv0mon.h" |
55 | #include "sync0sync.h" |
56 | |
57 | /* |
58 | General philosophy of InnoDB redo-logs: |
59 | |
1) Every change to the contents of a data page must be made
through a mini-transaction (mtr), which in mtr_commit() writes
its log records to the InnoDB redo log.
63 | |
64 | 2) Normally these changes are performed using a mlog_write_ulint() |
65 | or similar function. |
66 | |
3) In some page-level operations, only a code number of a
C function and its parameters are written to the log, to
reduce the size of the log.

3a) You should not add parameters to these kinds of functions
(e.g. trx_undo_header_create()).

3b) You should not add functionality which either changes the
behaviour compared with the old one, or depends on data outside
of the page. Such functions should implement self-contained
page transformations, and they should be left unchanged unless
you have very essential reasons to change the log semantics or
format.
80 | |
81 | */ |
82 | |
83 | /** Redo log system */ |
84 | log_t log_sys; |
85 | |
86 | /** Whether to generate and require checksums on the redo log pages */ |
87 | my_bool innodb_log_checksums; |
88 | |
89 | /** Pointer to the log checksum calculation function */ |
90 | log_checksum_func_t log_checksum_algorithm_ptr; |
91 | |
/* Next log block number to do dummy record filling if no log records have
been written for a while */
94 | static ulint next_lbn_to_pad = 0; |
95 | |
96 | /* These control how often we print warnings if the last checkpoint is too |
97 | old */ |
98 | static bool log_has_printed_chkp_warning = false; |
99 | static time_t log_last_warning_time; |
100 | |
static bool	log_has_printed_chkp_margin_warning = false;
static time_t	log_last_margin_warning_time;
103 | |
104 | /* A margin for free space in the log buffer before a log entry is catenated */ |
105 | #define LOG_BUF_WRITE_MARGIN (4 * OS_FILE_LOG_BLOCK_SIZE) |
106 | |
107 | /* Margins for free space in the log buffer after a log entry is catenated */ |
108 | #define LOG_BUF_FLUSH_RATIO 2 |
109 | #define LOG_BUF_FLUSH_MARGIN (LOG_BUF_WRITE_MARGIN \ |
110 | + (4U << srv_page_size_shift)) |
111 | |
112 | /* This parameter controls asynchronous making of a new checkpoint; the value |
113 | should be bigger than LOG_POOL_PREFLUSH_RATIO_SYNC */ |
114 | |
115 | #define LOG_POOL_CHECKPOINT_RATIO_ASYNC 32 |
116 | |
117 | /* This parameter controls synchronous preflushing of modified buffer pages */ |
118 | #define LOG_POOL_PREFLUSH_RATIO_SYNC 16 |
119 | |
120 | /* The same ratio for asynchronous preflushing; this value should be less than |
121 | the previous */ |
122 | #define LOG_POOL_PREFLUSH_RATIO_ASYNC 8 |
123 | |
124 | /* Codes used in unlocking flush latches */ |
125 | #define LOG_UNLOCK_NONE_FLUSHED_LOCK 1 |
126 | #define LOG_UNLOCK_FLUSH_LOCK 2 |
127 | |
128 | /** Event to wake up log_scrub_thread */ |
129 | os_event_t log_scrub_event; |
130 | /** Whether log_scrub_thread is active */ |
131 | bool log_scrub_thread_active; |
132 | |
133 | extern "C" UNIV_INTERN |
134 | os_thread_ret_t |
135 | DECLARE_THREAD(log_scrub_thread)(void*); |
136 | |
137 | /****************************************************************//** |
138 | Returns the oldest modified block lsn in the pool, or log_sys.lsn if none |
139 | exists. |
140 | @return LSN of oldest modification */ |
141 | static |
142 | lsn_t |
143 | log_buf_pool_get_oldest_modification(void) |
144 | /*======================================*/ |
145 | { |
146 | lsn_t lsn; |
147 | |
148 | ut_ad(log_mutex_own()); |
149 | |
150 | lsn = buf_pool_get_oldest_modification(); |
151 | |
152 | if (!lsn) { |
153 | |
154 | lsn = log_sys.lsn; |
155 | } |
156 | |
157 | return(lsn); |
158 | } |
159 | |
160 | /** Extends the log buffer. |
161 | @param[in] len requested minimum size in bytes */ |
162 | void log_buffer_extend(ulong len) |
163 | { |
164 | byte tmp_buf[OS_FILE_LOG_BLOCK_SIZE]; |
165 | |
166 | log_mutex_enter_all(); |
167 | |
168 | while (log_sys.is_extending) { |
		/* Another thread is already extending the buffer.
		Wait for it to finish. */
171 | log_mutex_exit_all(); |
172 | |
173 | log_buffer_flush_to_disk(); |
174 | |
175 | log_mutex_enter_all(); |
176 | |
177 | if (srv_log_buffer_size > len) { |
178 | /* Already extended enough by the others */ |
179 | log_mutex_exit_all(); |
180 | return; |
181 | } |
182 | } |
183 | |
184 | if (len >= srv_log_buffer_size / 2) { |
185 | DBUG_EXECUTE_IF("ib_log_buffer_is_short_crash" , |
186 | DBUG_SUICIDE();); |
187 | |
		/* The log buffer is too small: try to extend it
		instead of crashing. */
		ib::warn() << "The redo log transaction size " << len
			<< " exceeds innodb_log_buffer_size="
			<< srv_log_buffer_size << " / 2. Trying to extend it.";
192 | } |
193 | |
194 | log_sys.is_extending = true; |
195 | |
196 | while (ut_calc_align_down(log_sys.buf_free, |
197 | OS_FILE_LOG_BLOCK_SIZE) |
198 | != ut_calc_align_down(log_sys.buf_next_to_write, |
199 | OS_FILE_LOG_BLOCK_SIZE)) { |
		/* The buffer may still contain more than one
		block to write. */
201 | log_mutex_exit_all(); |
202 | |
203 | log_buffer_flush_to_disk(); |
204 | |
205 | log_mutex_enter_all(); |
206 | } |
207 | |
208 | ulong move_start = ut_calc_align_down( |
209 | log_sys.buf_free, |
210 | OS_FILE_LOG_BLOCK_SIZE); |
211 | ulong move_end = log_sys.buf_free; |
212 | |
213 | /* store the last log block in buffer */ |
214 | ut_memcpy(tmp_buf, log_sys.buf + move_start, |
215 | move_end - move_start); |
216 | |
217 | log_sys.buf_free -= move_start; |
218 | log_sys.buf_next_to_write -= move_start; |
219 | |
220 | /* free previous after getting the right address */ |
221 | if (!log_sys.first_in_use) { |
222 | log_sys.buf -= srv_log_buffer_size; |
223 | } |
224 | ut_free_dodump(log_sys.buf, srv_log_buffer_size * 2); |
225 | |
226 | /* reallocate log buffer */ |
227 | srv_log_buffer_size = len; |
228 | |
229 | log_sys.buf = static_cast<byte*>( |
230 | ut_malloc_dontdump(srv_log_buffer_size * 2)); |
231 | TRASH_ALLOC(log_sys.buf, srv_log_buffer_size * 2); |
232 | |
233 | log_sys.first_in_use = true; |
234 | |
235 | log_sys.max_buf_free = srv_log_buffer_size / LOG_BUF_FLUSH_RATIO |
236 | - LOG_BUF_FLUSH_MARGIN; |
237 | |
238 | /* restore the last log block */ |
239 | ut_memcpy(log_sys.buf, tmp_buf, move_end - move_start); |
240 | |
241 | ut_ad(log_sys.is_extending); |
242 | log_sys.is_extending = false; |
243 | |
244 | log_mutex_exit_all(); |
245 | |
246 | ib::info() << "innodb_log_buffer_size was extended to " |
247 | << srv_log_buffer_size << "." ; |
248 | } |
249 | |
250 | /** Calculate actual length in redo buffer and file including |
251 | block header and trailer. |
252 | @param[in] len length to write |
253 | @return actual length to write including header and trailer. */ |
254 | static inline |
255 | ulint |
256 | log_calculate_actual_len( |
257 | ulint len) |
258 | { |
259 | ut_ad(log_mutex_own()); |
260 | |
261 | /* actual length stored per block */ |
262 | const ulint len_per_blk = OS_FILE_LOG_BLOCK_SIZE |
263 | - (LOG_BLOCK_HDR_SIZE + LOG_BLOCK_TRL_SIZE); |
264 | |
	/* actual data length in the last block already written */
	ulint	extra_len = (log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE);
267 | |
268 | ut_ad(extra_len >= LOG_BLOCK_HDR_SIZE); |
269 | extra_len -= LOG_BLOCK_HDR_SIZE; |
270 | |
271 | /* total extra length for block header and trailer */ |
272 | extra_len = ((len + extra_len) / len_per_blk) |
273 | * (LOG_BLOCK_HDR_SIZE + LOG_BLOCK_TRL_SIZE); |
274 | |
275 | return(len + extra_len); |
276 | } |
277 | |
/** Check the free-space margin so that we do not overwrite redo log
past the last checkpoint. If the estimated log write would exceed
log_group_capacity, wait until the checkpoint has advanced far enough.
@param[in]	len	length of the data to be written */
282 | |
283 | void |
284 | log_margin_checkpoint_age( |
285 | ulint len) |
286 | { |
287 | ulint margin = log_calculate_actual_len(len); |
288 | |
289 | ut_ad(log_mutex_own()); |
290 | |
291 | if (margin > log_sys.log_group_capacity) { |
292 | /* return with warning output to avoid deadlock */ |
		if (!log_has_printed_chkp_margin_warning
		    || difftime(time(NULL),
				log_last_margin_warning_time) > 15) {
			log_has_printed_chkp_margin_warning = true;
			log_last_margin_warning_time = time(NULL);
298 | |
299 | ib::error() << "The transaction log files are too" |
300 | " small for the single transaction log (size=" |
301 | << len << "). So, the last checkpoint age" |
302 | " might exceed the log group capacity " |
303 | << log_sys.log_group_capacity << "." ; |
304 | } |
305 | |
306 | return; |
307 | } |
308 | |
	/* Our margin check should ensure that we never reach this condition.
	Try to do a checkpoint once. We cannot keep waiting here, as that
	might result in a hang if the current mtr holds a latch on the page
	with the oldest lsn. */
	if (log_sys.lsn - log_sys.last_checkpoint_lsn + margin
	    > log_sys.log_group_capacity) {
		/* The log write of 'len' might overwrite the transaction log
		after the last checkpoint. Make a checkpoint. */
316 | |
317 | bool flushed_enough = false; |
318 | |
319 | if (log_sys.lsn - log_buf_pool_get_oldest_modification() |
320 | + margin |
321 | <= log_sys.log_group_capacity) { |
322 | flushed_enough = true; |
323 | } |
324 | |
325 | log_sys.check_flush_or_checkpoint = true; |
326 | log_mutex_exit(); |
327 | |
328 | DEBUG_SYNC_C("margin_checkpoint_age_rescue" ); |
329 | |
330 | if (!flushed_enough) { |
331 | os_thread_sleep(100000); |
332 | } |
333 | log_checkpoint(true, false); |
334 | |
335 | log_mutex_enter(); |
336 | } |
337 | |
338 | return; |
339 | } |
340 | |
341 | /** Open the log for log_write_low. The log must be closed with log_close. |
342 | @param[in] len length of the data to be written |
343 | @return start lsn of the log record */ |
344 | lsn_t |
345 | log_reserve_and_open( |
346 | ulint len) |
347 | { |
348 | ulint len_upper_limit; |
349 | #ifdef UNIV_DEBUG |
350 | ulint count = 0; |
351 | #endif /* UNIV_DEBUG */ |
352 | |
353 | loop: |
354 | ut_ad(log_mutex_own()); |
355 | |
356 | if (log_sys.is_extending) { |
357 | log_mutex_exit(); |
358 | |
		/* The log buffer is being extended. Wait for the
		extension to finish before writing up to the next block. */
361 | |
362 | os_thread_sleep(100000); |
363 | |
364 | ut_ad(++count < 50); |
365 | |
366 | log_mutex_enter(); |
367 | goto loop; |
368 | } |
369 | |
370 | /* Calculate an upper limit for the space the string may take in the |
371 | log buffer */ |
372 | |
373 | len_upper_limit = LOG_BUF_WRITE_MARGIN + srv_log_write_ahead_size |
374 | + (5 * len) / 4; |
375 | |
376 | if (log_sys.buf_free + len_upper_limit > srv_log_buffer_size) { |
377 | log_mutex_exit(); |
378 | |
379 | DEBUG_SYNC_C("log_buf_size_exceeded" ); |
380 | |
381 | /* Not enough free space, do a write of the log buffer */ |
382 | log_buffer_sync_in_background(false); |
383 | |
384 | srv_stats.log_waits.inc(); |
385 | |
386 | ut_ad(++count < 50); |
387 | |
388 | log_mutex_enter(); |
389 | goto loop; |
390 | } |
391 | |
392 | return(log_sys.lsn); |
393 | } |
394 | |
395 | /************************************************************//** |
396 | Writes to the log the string given. It is assumed that the caller holds the |
397 | log mutex. */ |
398 | void |
399 | log_write_low( |
400 | /*==========*/ |
401 | const byte* str, /*!< in: string */ |
402 | ulint str_len) /*!< in: string length */ |
403 | { |
404 | ulint len; |
405 | ulint data_len; |
406 | byte* log_block; |
407 | |
408 | ut_ad(log_mutex_own()); |
409 | part_loop: |
410 | /* Calculate a part length */ |
411 | |
412 | data_len = (log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE) + str_len; |
413 | |
414 | if (data_len <= OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE) { |
415 | |
416 | /* The string fits within the current log block */ |
417 | |
418 | len = str_len; |
419 | } else { |
420 | data_len = OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE; |
421 | |
422 | len = OS_FILE_LOG_BLOCK_SIZE |
423 | - (log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE) |
424 | - LOG_BLOCK_TRL_SIZE; |
425 | } |
426 | |
427 | memcpy(log_sys.buf + log_sys.buf_free, str, len); |
428 | |
429 | str_len -= len; |
430 | str = str + len; |
431 | |
432 | log_block = static_cast<byte*>( |
433 | ut_align_down(log_sys.buf + log_sys.buf_free, |
434 | OS_FILE_LOG_BLOCK_SIZE)); |
435 | |
436 | log_block_set_data_len(log_block, data_len); |
437 | |
438 | if (data_len == OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE) { |
439 | /* This block became full */ |
440 | log_block_set_data_len(log_block, OS_FILE_LOG_BLOCK_SIZE); |
441 | log_block_set_checkpoint_no(log_block, |
442 | log_sys.next_checkpoint_no); |
443 | len += LOG_BLOCK_HDR_SIZE + LOG_BLOCK_TRL_SIZE; |
444 | |
445 | log_sys.lsn += len; |
446 | |
447 | /* Initialize the next block header */ |
448 | log_block_init(log_block + OS_FILE_LOG_BLOCK_SIZE, |
449 | log_sys.lsn); |
450 | } else { |
451 | log_sys.lsn += len; |
452 | } |
453 | |
454 | log_sys.buf_free += ulong(len); |
455 | |
456 | ut_ad(log_sys.buf_free <= srv_log_buffer_size); |
457 | |
458 | if (str_len > 0) { |
459 | goto part_loop; |
460 | } |
461 | |
462 | srv_stats.log_write_requests.inc(); |
463 | } |
464 | |
465 | /************************************************************//** |
466 | Closes the log. |
467 | @return lsn */ |
468 | lsn_t |
469 | log_close(void) |
470 | /*===========*/ |
471 | { |
472 | byte* log_block; |
473 | ulint first_rec_group; |
474 | lsn_t oldest_lsn; |
475 | lsn_t lsn; |
476 | lsn_t checkpoint_age; |
477 | |
478 | ut_ad(log_mutex_own()); |
479 | |
480 | lsn = log_sys.lsn; |
481 | |
482 | log_block = static_cast<byte*>( |
483 | ut_align_down(log_sys.buf + log_sys.buf_free, |
484 | OS_FILE_LOG_BLOCK_SIZE)); |
485 | |
486 | first_rec_group = log_block_get_first_rec_group(log_block); |
487 | |
488 | if (first_rec_group == 0) { |
489 | /* We initialized a new log block which was not written |
490 | full by the current mtr: the next mtr log record group |
491 | will start within this block at the offset data_len */ |
492 | |
493 | log_block_set_first_rec_group( |
494 | log_block, log_block_get_data_len(log_block)); |
495 | } |
496 | |
497 | if (log_sys.buf_free > log_sys.max_buf_free) { |
498 | log_sys.check_flush_or_checkpoint = true; |
499 | } |
500 | |
501 | checkpoint_age = lsn - log_sys.last_checkpoint_lsn; |
502 | |
503 | if (checkpoint_age >= log_sys.log_group_capacity) { |
504 | DBUG_EXECUTE_IF( |
505 | "print_all_chkp_warnings" , |
506 | log_has_printed_chkp_warning = false;); |
507 | |
508 | if (!log_has_printed_chkp_warning |
509 | || difftime(time(NULL), log_last_warning_time) > 15) { |
510 | |
511 | log_has_printed_chkp_warning = true; |
512 | log_last_warning_time = time(NULL); |
513 | |
514 | ib::error() << "The age of the last checkpoint is " |
515 | << checkpoint_age << ", which exceeds the log" |
516 | " group capacity " |
517 | << log_sys.log_group_capacity |
518 | << "." ; |
519 | } |
520 | } |
521 | |
522 | if (checkpoint_age <= log_sys.max_modified_age_sync) { |
523 | goto function_exit; |
524 | } |
525 | |
526 | oldest_lsn = buf_pool_get_oldest_modification(); |
527 | |
528 | if (!oldest_lsn |
529 | || lsn - oldest_lsn > log_sys.max_modified_age_sync |
530 | || checkpoint_age > log_sys.max_checkpoint_age_async) { |
531 | log_sys.check_flush_or_checkpoint = true; |
532 | } |
533 | function_exit: |
534 | |
535 | return(lsn); |
536 | } |
537 | |
/** Calculate the recommended highest values for lsn - last_checkpoint_lsn
and lsn - buf_pool_get_oldest_modification().
540 | @param[in] file_size requested innodb_log_file_size |
541 | @retval true on success |
542 | @retval false if the smallest log group is too small to |
543 | accommodate the number of OS threads in the database server */ |
544 | bool |
545 | log_set_capacity(ulonglong file_size) |
546 | { |
547 | lsn_t margin; |
548 | ulint free; |
549 | |
550 | lsn_t smallest_capacity = (file_size - LOG_FILE_HDR_SIZE) |
551 | * srv_n_log_files; |
552 | /* Add extra safety */ |
553 | smallest_capacity -= smallest_capacity / 10; |
554 | |
555 | /* For each OS thread we must reserve so much free space in the |
556 | smallest log group that it can accommodate the log entries produced |
557 | by single query steps: running out of free log space is a serious |
558 | system error which requires rebooting the database. */ |
559 | |
560 | free = LOG_CHECKPOINT_FREE_PER_THREAD * (10 + srv_thread_concurrency) |
561 | + LOG_CHECKPOINT_EXTRA_FREE; |
562 | if (free >= smallest_capacity / 2) { |
563 | ib::error() << "Cannot continue operation. ib_logfiles are too" |
564 | " small for innodb_thread_concurrency=" |
565 | << srv_thread_concurrency << ". The combined size of" |
566 | " ib_logfiles should be bigger than" |
567 | " 200 kB * innodb_thread_concurrency. " |
568 | << INNODB_PARAMETERS_MSG; |
569 | return(false); |
570 | } |
571 | |
572 | margin = smallest_capacity - free; |
573 | margin = margin - margin / 10; /* Add still some extra safety */ |
574 | |
575 | log_mutex_enter(); |
576 | |
577 | log_sys.log_group_capacity = smallest_capacity; |
578 | |
579 | log_sys.max_modified_age_async = margin |
580 | - margin / LOG_POOL_PREFLUSH_RATIO_ASYNC; |
581 | log_sys.max_modified_age_sync = margin |
582 | - margin / LOG_POOL_PREFLUSH_RATIO_SYNC; |
583 | |
584 | log_sys.max_checkpoint_age_async = margin - margin |
585 | / LOG_POOL_CHECKPOINT_RATIO_ASYNC; |
586 | log_sys.max_checkpoint_age = margin; |
587 | |
588 | log_mutex_exit(); |
589 | |
590 | return(true); |
591 | } |
592 | |
593 | /** Initialize the redo log subsystem. */ |
594 | void log_t::create() |
595 | { |
596 | ut_ad(this == &log_sys); |
597 | ut_ad(!is_initialised()); |
598 | m_initialised= true; |
599 | |
600 | mutex_create(LATCH_ID_LOG_SYS, &mutex); |
601 | mutex_create(LATCH_ID_LOG_WRITE, &write_mutex); |
602 | mutex_create(LATCH_ID_LOG_FLUSH_ORDER, &log_flush_order_mutex); |
603 | |
  /* Start the lsn one log block away from zero: this way every
  log record has a non-zero start lsn, a fact which we will use */
606 | |
607 | lsn= LOG_START_LSN; |
608 | |
609 | ut_ad(srv_log_buffer_size >= 16 * OS_FILE_LOG_BLOCK_SIZE); |
610 | ut_ad(srv_log_buffer_size >= 4U << srv_page_size_shift); |
611 | |
612 | buf= static_cast<byte*>(ut_malloc_dontdump(srv_log_buffer_size * 2)); |
613 | TRASH_ALLOC(buf, srv_log_buffer_size * 2); |
614 | |
615 | first_in_use= true; |
616 | |
617 | max_buf_free= srv_log_buffer_size / LOG_BUF_FLUSH_RATIO - |
618 | LOG_BUF_FLUSH_MARGIN; |
619 | check_flush_or_checkpoint= true; |
620 | |
621 | n_log_ios_old= n_log_ios; |
622 | last_printout_time= time(NULL); |
623 | |
624 | buf_next_to_write= 0; |
625 | is_extending= false; |
626 | write_lsn= lsn; |
627 | flushed_to_disk_lsn= 0; |
628 | n_pending_flushes= 0; |
629 | flush_event = os_event_create("log_flush_event" ); |
630 | os_event_set(flush_event); |
631 | n_log_ios= 0; |
632 | n_log_ios_old= 0; |
633 | log_group_capacity= 0; |
634 | max_modified_age_async= 0; |
635 | max_modified_age_sync= 0; |
636 | max_checkpoint_age_async= 0; |
637 | max_checkpoint_age= 0; |
638 | next_checkpoint_no= 0; |
639 | next_checkpoint_lsn= 0; |
640 | append_on_checkpoint= NULL; |
641 | n_pending_checkpoint_writes= 0; |
642 | |
643 | last_checkpoint_lsn= lsn; |
644 | rw_lock_create(checkpoint_lock_key, &checkpoint_lock, SYNC_NO_ORDER_CHECK); |
645 | |
646 | log_block_init(buf, lsn); |
647 | log_block_set_first_rec_group(buf, LOG_BLOCK_HDR_SIZE); |
648 | |
649 | buf_free= LOG_BLOCK_HDR_SIZE; |
650 | lsn= LOG_START_LSN + LOG_BLOCK_HDR_SIZE; |
651 | |
652 | MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE, lsn - last_checkpoint_lsn); |
653 | |
654 | log_scrub_thread_active= !srv_read_only_mode && srv_scrub_log; |
655 | if (log_scrub_thread_active) { |
656 | log_scrub_event= os_event_create("log_scrub_event" ); |
657 | os_thread_create(log_scrub_thread, NULL, NULL); |
658 | } |
659 | } |
660 | |
661 | /** Initialize the redo log. |
662 | @param[in] n_files number of files */ |
663 | void log_t::files::create(ulint n_files) |
664 | { |
665 | ut_ad(n_files <= SRV_N_LOG_FILES_MAX); |
666 | ut_ad(this == &log_sys.log); |
667 | ut_ad(log_sys.is_initialised()); |
668 | |
669 | this->n_files= n_files; |
670 | format= srv_encrypt_log |
671 | ? LOG_HEADER_FORMAT_CURRENT | LOG_HEADER_FORMAT_ENCRYPTED |
672 | : LOG_HEADER_FORMAT_CURRENT; |
673 | file_size= srv_log_file_size; |
674 | state= LOG_GROUP_OK; |
675 | lsn= LOG_START_LSN; |
676 | lsn_offset= LOG_FILE_HDR_SIZE; |
677 | |
678 | byte* ptr= static_cast<byte*>(ut_zalloc_nokey(LOG_FILE_HDR_SIZE * n_files |
679 | + OS_FILE_LOG_BLOCK_SIZE)); |
680 | file_header_bufs_ptr= ptr; |
681 | ptr= static_cast<byte*>(ut_align(ptr, OS_FILE_LOG_BLOCK_SIZE)); |
682 | |
683 | memset(file_header_bufs, 0, sizeof file_header_bufs); |
684 | |
685 | for (ulint i = 0; i < n_files; i++, ptr += LOG_FILE_HDR_SIZE) |
686 | file_header_bufs[i] = ptr; |
687 | } |
688 | |
689 | /******************************************************//** |
690 | Writes a log file header to a log file space. */ |
691 | static |
692 | void |
693 | ( |
694 | ulint nth_file, /*!< in: header to the nth file in the |
695 | log file space */ |
696 | lsn_t start_lsn) /*!< in: log file data starts at this |
697 | lsn */ |
698 | { |
699 | byte* buf; |
700 | lsn_t dest_offset; |
701 | |
702 | ut_ad(log_write_mutex_own()); |
703 | ut_ad(!recv_no_log_write); |
704 | ut_a(nth_file < log_sys.log.n_files); |
705 | ut_ad((log_sys.log.format & ~LOG_HEADER_FORMAT_ENCRYPTED) |
706 | == LOG_HEADER_FORMAT_CURRENT); |
707 | |
708 | buf = log_sys.log.file_header_bufs[nth_file]; |
709 | |
710 | memset(buf, 0, OS_FILE_LOG_BLOCK_SIZE); |
711 | mach_write_to_4(buf + LOG_HEADER_FORMAT, log_sys.log.format); |
712 | mach_write_to_8(buf + LOG_HEADER_START_LSN, start_lsn); |
713 | strcpy(reinterpret_cast<char*>(buf) + LOG_HEADER_CREATOR, |
714 | LOG_HEADER_CREATOR_CURRENT); |
715 | ut_ad(LOG_HEADER_CREATOR_END - LOG_HEADER_CREATOR |
716 | >= sizeof LOG_HEADER_CREATOR_CURRENT); |
717 | log_block_set_checksum(buf, log_block_calc_checksum_crc32(buf)); |
718 | |
719 | dest_offset = nth_file * log_sys.log.file_size; |
720 | |
721 | DBUG_PRINT("ib_log" , ("write " LSN_PF |
722 | " file " ULINTPF " header" , |
723 | start_lsn, nth_file)); |
724 | |
725 | log_sys.n_log_ios++; |
726 | |
727 | MONITOR_INC(MONITOR_LOG_IO); |
728 | |
729 | srv_stats.os_log_pending_writes.inc(); |
730 | |
731 | const ulint page_no = ulint(dest_offset >> srv_page_size_shift); |
732 | |
733 | fil_io(IORequestLogWrite, true, |
734 | page_id_t(SRV_LOG_SPACE_FIRST_ID, page_no), |
735 | univ_page_size, |
736 | ulint(dest_offset & (srv_page_size - 1)), |
737 | OS_FILE_LOG_BLOCK_SIZE, buf, NULL); |
738 | |
739 | srv_stats.os_log_pending_writes.dec(); |
740 | } |
741 | |
742 | /******************************************************//** |
743 | Stores a 4-byte checksum to the trailer checksum field of a log block |
744 | before writing it to a log file. This checksum is used in recovery to |
745 | check the consistency of a log block. */ |
746 | static |
747 | void |
748 | log_block_store_checksum( |
749 | /*=====================*/ |
750 | byte* block) /*!< in/out: pointer to a log block */ |
751 | { |
752 | log_block_set_checksum(block, log_block_calc_checksum(block)); |
753 | } |
754 | |
755 | /******************************************************//** |
756 | Writes a buffer to a log file. */ |
757 | static |
758 | void |
759 | log_write_buf( |
760 | byte* buf, /*!< in: buffer */ |
761 | ulint len, /*!< in: buffer len; must be divisible |
762 | by OS_FILE_LOG_BLOCK_SIZE */ |
763 | #ifdef UNIV_DEBUG |
764 | ulint pad_len, /*!< in: pad len in the buffer len */ |
765 | #endif /* UNIV_DEBUG */ |
766 | lsn_t start_lsn, /*!< in: start lsn of the buffer; must |
767 | be divisible by |
768 | OS_FILE_LOG_BLOCK_SIZE */ |
769 | ulint new_data_offset)/*!< in: start offset of new data in |
770 | buf: this parameter is used to decide |
771 | if we have to write a new log file |
772 | header */ |
773 | { |
774 | ulint write_len; |
	bool		write_header = new_data_offset == 0;
776 | lsn_t next_offset; |
777 | ulint i; |
778 | |
779 | ut_ad(log_write_mutex_own()); |
780 | ut_ad(!recv_no_log_write); |
781 | ut_a(len % OS_FILE_LOG_BLOCK_SIZE == 0); |
782 | ut_a(start_lsn % OS_FILE_LOG_BLOCK_SIZE == 0); |
783 | |
784 | loop: |
785 | if (len == 0) { |
786 | |
787 | return; |
788 | } |
789 | |
790 | next_offset = log_sys.log.calc_lsn_offset(start_lsn); |
791 | |
792 | if (write_header |
793 | && next_offset % log_sys.log.file_size == LOG_FILE_HDR_SIZE) { |
794 | /* We start to write a new log file instance in the group */ |
795 | |
796 | ut_a(next_offset / log_sys.log.file_size <= ULINT_MAX); |
797 | |
798 | log_file_header_flush( |
799 | ulint(next_offset / log_sys.log.file_size), start_lsn); |
800 | srv_stats.os_log_written.add(OS_FILE_LOG_BLOCK_SIZE); |
801 | |
802 | srv_stats.log_writes.inc(); |
803 | } |
804 | |
805 | if ((next_offset % log_sys.log.file_size) + len |
806 | > log_sys.log.file_size) { |
807 | /* if the above condition holds, then the below expression |
808 | is < len which is ulint, so the typecast is ok */ |
809 | write_len = ulint(log_sys.log.file_size |
810 | - (next_offset % log_sys.log.file_size)); |
811 | } else { |
812 | write_len = len; |
813 | } |
814 | |
815 | DBUG_PRINT("ib_log" , |
816 | ("write " LSN_PF " to " LSN_PF |
817 | ": len " ULINTPF |
818 | " blocks " ULINTPF ".." ULINTPF, |
819 | start_lsn, next_offset, |
820 | write_len, |
821 | log_block_get_hdr_no(buf), |
822 | log_block_get_hdr_no( |
823 | buf + write_len |
824 | - OS_FILE_LOG_BLOCK_SIZE))); |
825 | |
826 | ut_ad(pad_len >= len |
827 | || log_block_get_hdr_no(buf) |
828 | == log_block_convert_lsn_to_no(start_lsn)); |
829 | |
830 | /* Calculate the checksums for each log block and write them to |
831 | the trailer fields of the log blocks */ |
832 | |
833 | for (i = 0; i < write_len / OS_FILE_LOG_BLOCK_SIZE; i++) { |
834 | ut_ad(pad_len >= len |
835 | || i * OS_FILE_LOG_BLOCK_SIZE >= len - pad_len |
836 | || log_block_get_hdr_no( |
837 | buf + i * OS_FILE_LOG_BLOCK_SIZE) |
838 | == log_block_get_hdr_no(buf) + i); |
839 | log_block_store_checksum(buf + i * OS_FILE_LOG_BLOCK_SIZE); |
840 | } |
841 | |
842 | log_sys.n_log_ios++; |
843 | |
844 | MONITOR_INC(MONITOR_LOG_IO); |
845 | |
846 | srv_stats.os_log_pending_writes.inc(); |
847 | |
848 | ut_a((next_offset >> srv_page_size_shift) <= ULINT_MAX); |
849 | |
850 | const ulint page_no = ulint(next_offset >> srv_page_size_shift); |
851 | |
852 | fil_io(IORequestLogWrite, true, |
853 | page_id_t(SRV_LOG_SPACE_FIRST_ID, page_no), |
854 | univ_page_size, |
855 | ulint(next_offset & (srv_page_size - 1)), write_len, buf, NULL); |
856 | |
857 | srv_stats.os_log_pending_writes.dec(); |
858 | |
859 | srv_stats.os_log_written.add(write_len); |
860 | srv_stats.log_writes.inc(); |
861 | |
862 | if (write_len < len) { |
863 | start_lsn += write_len; |
864 | len -= write_len; |
865 | buf += write_len; |
866 | |
867 | write_header = true; |
868 | |
869 | goto loop; |
870 | } |
871 | } |
872 | |
/** Flush the recently written changes to the log file
and invoke log_mutex_enter(). */
875 | static |
876 | void |
877 | log_write_flush_to_disk_low() |
878 | { |
879 | /* FIXME: This is not holding log_sys.mutex while |
880 | calling os_event_set()! */ |
881 | ut_a(log_sys.n_pending_flushes == 1); /* No other threads here */ |
882 | |
883 | bool do_flush = srv_file_flush_method != SRV_O_DSYNC; |
884 | |
885 | if (do_flush) { |
886 | fil_flush(SRV_LOG_SPACE_FIRST_ID); |
887 | } |
888 | |
889 | MONITOR_DEC(MONITOR_PENDING_LOG_FLUSH); |
890 | |
891 | log_mutex_enter(); |
892 | if (do_flush) { |
893 | log_sys.flushed_to_disk_lsn = log_sys.current_flush_lsn; |
894 | } |
895 | |
896 | log_sys.n_pending_flushes--; |
897 | |
898 | os_event_set(log_sys.flush_event); |
899 | } |
900 | |
/** Switch the log buffer in use, and copy the contents of the last block
from the old log buffer to the head of the new one. buf_free and
buf_next_to_write are adjusted accordingly. */
904 | static inline |
905 | void |
906 | log_buffer_switch() |
907 | { |
908 | ut_ad(log_mutex_own()); |
909 | ut_ad(log_write_mutex_own()); |
910 | |
911 | const byte* old_buf = log_sys.buf; |
912 | ulint area_end = ut_calc_align(log_sys.buf_free, |
913 | OS_FILE_LOG_BLOCK_SIZE); |
914 | |
915 | if (log_sys.first_in_use) { |
916 | log_sys.first_in_use = false; |
917 | ut_ad(log_sys.buf == ut_align(log_sys.buf, |
918 | OS_FILE_LOG_BLOCK_SIZE)); |
919 | log_sys.buf += srv_log_buffer_size; |
920 | } else { |
921 | log_sys.first_in_use = true; |
922 | log_sys.buf -= srv_log_buffer_size; |
923 | ut_ad(log_sys.buf == ut_align(log_sys.buf, |
924 | OS_FILE_LOG_BLOCK_SIZE)); |
925 | } |
926 | |
927 | /* Copy the last block to new buf */ |
928 | ut_memcpy(log_sys.buf, |
929 | old_buf + area_end - OS_FILE_LOG_BLOCK_SIZE, |
930 | OS_FILE_LOG_BLOCK_SIZE); |
931 | |
932 | log_sys.buf_free %= OS_FILE_LOG_BLOCK_SIZE; |
933 | log_sys.buf_next_to_write = log_sys.buf_free; |
934 | } |
935 | |
936 | /** Ensure that the log has been written to the log file up to a given |
937 | log entry (such as that of a transaction commit). Start a new write, or |
938 | wait and check if an already running write is covering the request. |
939 | @param[in] lsn log sequence number that should be |
940 | included in the redo log file write |
941 | @param[in] flush_to_disk whether the written log should also |
942 | be flushed to the file system */ |
943 | void |
944 | log_write_up_to( |
945 | lsn_t lsn, |
946 | bool flush_to_disk) |
947 | { |
948 | #ifdef UNIV_DEBUG |
949 | ulint loop_count = 0; |
950 | #endif /* UNIV_DEBUG */ |
951 | byte* write_buf; |
952 | lsn_t write_lsn; |
953 | |
954 | ut_ad(!srv_read_only_mode); |
955 | |
956 | if (recv_no_ibuf_operations) { |
957 | /* Recovery is running and no operations on the log files are |
958 | allowed yet (the variable name .._no_ibuf_.. is misleading) */ |
959 | |
960 | return; |
961 | } |
962 | |
963 | if (srv_shutdown_state != SRV_SHUTDOWN_NONE) { |
964 | service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL, |
965 | "log write up to: " LSN_PF, |
966 | lsn); |
967 | } |
968 | |
969 | loop: |
970 | ut_ad(++loop_count < 128); |
971 | |
972 | #if UNIV_WORD_SIZE > 7 |
973 | /* We can do a dirty read of LSN. */ |
974 | /* NOTE: Currently doesn't do dirty read for |
975 | (flush_to_disk == true) case, because the log_mutex |
976 | contention also works as the arbitrator for write-IO |
977 | (fsync) bandwidth between log files and data files. */ |
978 | if (!flush_to_disk && log_sys.write_lsn >= lsn) { |
979 | return; |
980 | } |
981 | #endif |
982 | |
983 | log_write_mutex_enter(); |
984 | ut_ad(!recv_no_log_write); |
985 | |
986 | lsn_t limit_lsn = flush_to_disk |
987 | ? log_sys.flushed_to_disk_lsn |
988 | : log_sys.write_lsn; |
989 | |
990 | if (limit_lsn >= lsn) { |
991 | log_write_mutex_exit(); |
992 | return; |
993 | } |
994 | |
995 | /* If it is a write call we should just go ahead and do it |
996 | as we checked that write_lsn is not where we'd like it to |
997 | be. If we have to flush as well then we check if there is a |
998 | pending flush and based on that we wait for it to finish |
999 | before proceeding further. */ |
1000 | if (flush_to_disk |
1001 | && (log_sys.n_pending_flushes > 0 |
1002 | || !os_event_is_set(log_sys.flush_event))) { |
1003 | /* Figure out if the current flush will do the job |
1004 | for us. */ |
1005 | bool work_done = log_sys.current_flush_lsn >= lsn; |
1006 | |
1007 | log_write_mutex_exit(); |
1008 | |
1009 | os_event_wait(log_sys.flush_event); |
1010 | |
1011 | if (work_done) { |
1012 | return; |
1013 | } else { |
1014 | goto loop; |
1015 | } |
1016 | } |
1017 | |
1018 | log_mutex_enter(); |
1019 | if (!flush_to_disk |
1020 | && log_sys.buf_free == log_sys.buf_next_to_write) { |
1021 | /* Nothing to write and no flush to disk requested */ |
1022 | log_mutex_exit_all(); |
1023 | return; |
1024 | } |
1025 | |
1026 | ulint start_offset; |
1027 | ulint end_offset; |
1028 | ulint area_start; |
1029 | ulint area_end; |
1030 | ulong write_ahead_size = srv_log_write_ahead_size; |
1031 | ulint pad_size; |
1032 | |
1033 | DBUG_PRINT("ib_log" , ("write " LSN_PF " to " LSN_PF, |
1034 | log_sys.write_lsn, |
1035 | log_sys.lsn)); |
1036 | if (flush_to_disk) { |
1037 | log_sys.n_pending_flushes++; |
1038 | log_sys.current_flush_lsn = log_sys.lsn; |
1039 | MONITOR_INC(MONITOR_PENDING_LOG_FLUSH); |
1040 | os_event_reset(log_sys.flush_event); |
1041 | |
1042 | if (log_sys.buf_free == log_sys.buf_next_to_write) { |
1043 | /* Nothing to write, flush only */ |
1044 | log_mutex_exit_all(); |
1045 | log_write_flush_to_disk_low(); |
1046 | log_mutex_exit(); |
1047 | return; |
1048 | } |
1049 | } |
1050 | |
1051 | start_offset = log_sys.buf_next_to_write; |
1052 | end_offset = log_sys.buf_free; |
1053 | |
1054 | area_start = ut_calc_align_down(start_offset, OS_FILE_LOG_BLOCK_SIZE); |
1055 | area_end = ut_calc_align(end_offset, OS_FILE_LOG_BLOCK_SIZE); |
1056 | |
1057 | ut_ad(area_end - area_start > 0); |
1058 | |
1059 | log_block_set_flush_bit(log_sys.buf + area_start, TRUE); |
1060 | log_block_set_checkpoint_no( |
1061 | log_sys.buf + area_end - OS_FILE_LOG_BLOCK_SIZE, |
1062 | log_sys.next_checkpoint_no); |
1063 | |
1064 | write_lsn = log_sys.lsn; |
1065 | write_buf = log_sys.buf; |
1066 | |
1067 | log_buffer_switch(); |
1068 | |
1069 | log_sys.log.set_fields(log_sys.write_lsn); |
1070 | |
1071 | log_mutex_exit(); |
1072 | /* Erase the end of the last log block. */ |
1073 | memset(write_buf + end_offset, 0, |
1074 | ~end_offset & (OS_FILE_LOG_BLOCK_SIZE - 1)); |
1075 | |
1076 | /* Calculate pad_size if needed. */ |
1077 | pad_size = 0; |
1078 | if (write_ahead_size > OS_FILE_LOG_BLOCK_SIZE) { |
1079 | ulint end_offset_in_unit; |
1080 | lsn_t end_offset = log_sys.log.calc_lsn_offset( |
1081 | ut_uint64_align_up(write_lsn, OS_FILE_LOG_BLOCK_SIZE)); |
1082 | end_offset_in_unit = (ulint) (end_offset % write_ahead_size); |
1083 | |
1084 | if (end_offset_in_unit > 0 |
1085 | && (area_end - area_start) > end_offset_in_unit) { |
			/* The first block in the write-ahead unit
			was initialized after the last write.
			The padding needs to be written out once. */
1089 | pad_size = std::min<ulint>( |
1090 | ulint(write_ahead_size) - end_offset_in_unit, |
1091 | srv_log_buffer_size - area_end); |
1092 | ::memset(write_buf + area_end, 0, pad_size); |
1093 | } |
1094 | } |
1095 | |
1096 | if (log_sys.is_encrypted()) { |
1097 | log_crypt(write_buf + area_start, log_sys.write_lsn, |
1098 | area_end - area_start); |
1099 | } |
1100 | |
1101 | /* Do the write to the log files */ |
1102 | log_write_buf( |
1103 | write_buf + area_start, area_end - area_start + pad_size, |
1104 | #ifdef UNIV_DEBUG |
1105 | pad_size, |
1106 | #endif /* UNIV_DEBUG */ |
1107 | ut_uint64_align_down(log_sys.write_lsn, |
1108 | OS_FILE_LOG_BLOCK_SIZE), |
1109 | start_offset - area_start); |
1110 | srv_stats.log_padded.add(pad_size); |
1111 | log_sys.write_lsn = write_lsn; |
1112 | |
1113 | |
1114 | if (srv_file_flush_method == SRV_O_DSYNC) { |
		/* O_DSYNC means the OS did not buffer the log file at all:
1116 | so we have also flushed to disk what we have written */ |
1117 | log_sys.flushed_to_disk_lsn = log_sys.write_lsn; |
1118 | } |
1119 | |
1120 | log_write_mutex_exit(); |
1121 | |
1122 | if (flush_to_disk) { |
1123 | log_write_flush_to_disk_low(); |
1124 | ib_uint64_t flush_lsn = log_sys.flushed_to_disk_lsn; |
1125 | log_mutex_exit(); |
1126 | |
1127 | innobase_mysql_log_notify(flush_lsn); |
1128 | } |
1129 | } |
1130 | |
/** Write to the log file up to the last log entry.
1132 | @param[in] sync whether we want the written log |
1133 | also to be flushed to disk. */ |
1134 | void |
1135 | log_buffer_flush_to_disk( |
1136 | bool sync) |
1137 | { |
1138 | ut_ad(!srv_read_only_mode); |
1139 | log_write_up_to(log_get_lsn(), sync); |
1140 | } |
1141 | |
1142 | /****************************************************************//** |
This function writes the log buffer to the log file and, if 'flush'
is set, it forces a flush of the log file as well. This is meant to be
called from the background master thread only, as it does not wait for
the write (+ possible flush) to finish. */
1147 | void |
1148 | log_buffer_sync_in_background( |
1149 | /*==========================*/ |
1150 | bool flush) /*!< in: flush the logs to disk */ |
1151 | { |
1152 | lsn_t lsn; |
1153 | |
1154 | log_mutex_enter(); |
1155 | |
1156 | lsn = log_sys.lsn; |
1157 | |
1158 | if (flush |
1159 | && log_sys.n_pending_flushes > 0 |
1160 | && log_sys.current_flush_lsn >= lsn) { |
1161 | /* The write + flush will write enough */ |
1162 | log_mutex_exit(); |
1163 | return; |
1164 | } |
1165 | |
1166 | log_mutex_exit(); |
1167 | |
1168 | log_write_up_to(lsn, flush); |
1169 | } |
1170 | |
1171 | /******************************************************************** |
1172 | |
1173 | Tries to establish a big enough margin of free space in the log buffer, such |
1174 | that a new log entry can be catenated without an immediate need for a flush. */ |
1175 | static |
1176 | void |
1177 | log_flush_margin(void) |
1178 | /*==================*/ |
1179 | { |
1180 | lsn_t lsn = 0; |
1181 | |
1182 | log_mutex_enter(); |
1183 | |
1184 | if (log_sys.buf_free > log_sys.max_buf_free) { |
1185 | /* We can write during flush */ |
1186 | lsn = log_sys.lsn; |
1187 | } |
1188 | |
1189 | log_mutex_exit(); |
1190 | |
1191 | if (lsn) { |
1192 | log_write_up_to(lsn, false); |
1193 | } |
1194 | } |
1195 | |
1196 | /** Advances the smallest lsn for which there are unflushed dirty blocks in the |
1197 | buffer pool. |
1198 | NOTE: this function may only be called if the calling thread owns no |
1199 | synchronization objects! |
1200 | @param[in] new_oldest try to advance oldest_modified_lsn at least to |
1201 | this lsn |
1202 | @return false if there was a flush batch of the same type running, |
1203 | which means that we could not start this flush batch */ |
1204 | static |
1205 | bool |
1206 | log_preflush_pool_modified_pages( |
1207 | lsn_t new_oldest) |
1208 | { |
1209 | bool success; |
1210 | |
1211 | if (recv_recovery_on) { |
1212 | /* If the recovery is running, we must first apply all |
1213 | log records to their respective file pages to get the |
1214 | right modify lsn values to these pages: otherwise, there |
1215 | might be pages on disk which are not yet recovered to the |
1216 | current lsn, and even after calling this function, we could |
1217 | not know how up-to-date the disk version of the database is, |
1218 | and we could not make a new checkpoint on the basis of the |
1219 | info on the buffer pool only. */ |
1220 | recv_apply_hashed_log_recs(true); |
1221 | } |
1222 | |
1223 | if (new_oldest == LSN_MAX |
1224 | || !buf_page_cleaner_is_active |
1225 | || srv_is_being_started) { |
1226 | |
1227 | ulint n_pages; |
1228 | |
1229 | success = buf_flush_lists(ULINT_MAX, new_oldest, &n_pages); |
1230 | |
1231 | buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST); |
1232 | |
1233 | if (!success) { |
1234 | MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS); |
1235 | } |
1236 | |
1237 | MONITOR_INC_VALUE_CUMULATIVE( |
1238 | MONITOR_FLUSH_SYNC_TOTAL_PAGE, |
1239 | MONITOR_FLUSH_SYNC_COUNT, |
1240 | MONITOR_FLUSH_SYNC_PAGES, |
1241 | n_pages); |
1242 | } else { |
		/* It is better to wait for the pages to be
		flushed by the page cleaner. */
1244 | |
1245 | if (srv_flush_sync) { |
1246 | /* wake page cleaner for IO burst */ |
1247 | buf_flush_request_force(new_oldest); |
1248 | } |
1249 | |
1250 | buf_flush_wait_flushed(new_oldest); |
1251 | |
1252 | success = true; |
1253 | } |
1254 | |
1255 | return(success); |
1256 | } |
1257 | |
1258 | /******************************************************//** |
1259 | Completes a checkpoint. */ |
1260 | static |
1261 | void |
1262 | log_complete_checkpoint(void) |
1263 | /*=========================*/ |
1264 | { |
1265 | ut_ad(log_mutex_own()); |
1266 | ut_ad(log_sys.n_pending_checkpoint_writes == 0); |
1267 | |
1268 | log_sys.next_checkpoint_no++; |
1269 | |
1270 | log_sys.last_checkpoint_lsn = log_sys.next_checkpoint_lsn; |
1271 | MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE, |
1272 | log_sys.lsn - log_sys.last_checkpoint_lsn); |
1273 | |
1274 | DBUG_PRINT("ib_log" , ("checkpoint ended at " LSN_PF |
1275 | ", flushed to " LSN_PF, |
1276 | log_sys.last_checkpoint_lsn, |
1277 | log_sys.flushed_to_disk_lsn)); |
1278 | |
1279 | rw_lock_x_unlock_gen(&(log_sys.checkpoint_lock), LOG_CHECKPOINT); |
1280 | } |
1281 | |
1282 | /** Complete an asynchronous checkpoint write. */ |
1283 | void log_t::complete_checkpoint() |
1284 | { |
1285 | ut_ad(this == &log_sys); |
1286 | MONITOR_DEC(MONITOR_PENDING_CHECKPOINT_WRITE); |
1287 | |
1288 | log_mutex_enter(); |
1289 | |
1290 | ut_ad(n_pending_checkpoint_writes > 0); |
1291 | |
1292 | if (!--n_pending_checkpoint_writes) { |
1293 | log_complete_checkpoint(); |
1294 | } |
1295 | |
1296 | log_mutex_exit(); |
1297 | } |
1298 | |
1299 | /** Write checkpoint info to the log header. |
1300 | @param[in] end_lsn start LSN of the MLOG_CHECKPOINT mini-transaction */ |
1301 | static |
1302 | void |
1303 | log_group_checkpoint(lsn_t end_lsn) |
1304 | { |
1305 | lsn_t lsn_offset; |
1306 | |
1307 | ut_ad(!srv_read_only_mode); |
1308 | ut_ad(log_mutex_own()); |
1309 | ut_ad(end_lsn == 0 || end_lsn >= log_sys.next_checkpoint_lsn); |
1310 | ut_ad(end_lsn <= log_sys.lsn); |
1311 | ut_ad(end_lsn + SIZE_OF_MLOG_CHECKPOINT <= log_sys.lsn |
1312 | || srv_shutdown_state != SRV_SHUTDOWN_NONE); |
1313 | |
1314 | DBUG_PRINT("ib_log" , ("checkpoint " UINT64PF " at " LSN_PF |
1315 | " written" , |
1316 | log_sys.next_checkpoint_no, |
1317 | log_sys.next_checkpoint_lsn)); |
1318 | |
1319 | byte* buf = log_sys.checkpoint_buf; |
1320 | memset(buf, 0, OS_FILE_LOG_BLOCK_SIZE); |
1321 | |
1322 | mach_write_to_8(buf + LOG_CHECKPOINT_NO, log_sys.next_checkpoint_no); |
1323 | mach_write_to_8(buf + LOG_CHECKPOINT_LSN, log_sys.next_checkpoint_lsn); |
1324 | |
1325 | if (log_sys.is_encrypted()) { |
1326 | log_crypt_write_checkpoint_buf(buf); |
1327 | } |
1328 | |
1329 | lsn_offset = log_sys.log.calc_lsn_offset(log_sys.next_checkpoint_lsn); |
1330 | mach_write_to_8(buf + LOG_CHECKPOINT_OFFSET, lsn_offset); |
1331 | mach_write_to_8(buf + LOG_CHECKPOINT_LOG_BUF_SIZE, |
1332 | srv_log_buffer_size); |
1333 | mach_write_to_8(buf + LOG_CHECKPOINT_END_LSN, end_lsn); |
1334 | |
1335 | log_block_set_checksum(buf, log_block_calc_checksum_crc32(buf)); |
1336 | |
1337 | MONITOR_INC(MONITOR_PENDING_CHECKPOINT_WRITE); |
1338 | |
1339 | log_sys.n_log_ios++; |
1340 | |
1341 | MONITOR_INC(MONITOR_LOG_IO); |
1342 | |
1343 | ut_ad(LOG_CHECKPOINT_1 < srv_page_size); |
1344 | ut_ad(LOG_CHECKPOINT_2 < srv_page_size); |
1345 | |
1346 | if (log_sys.n_pending_checkpoint_writes++ == 0) { |
1347 | rw_lock_x_lock_gen(&log_sys.checkpoint_lock, |
1348 | LOG_CHECKPOINT); |
1349 | } |
1350 | |
1351 | /* Note: We alternate the physical place of the checkpoint info. |
1352 | See the (next_checkpoint_no & 1) below. */ |
1353 | |
1354 | fil_io(IORequestLogWrite, false, |
1355 | page_id_t(SRV_LOG_SPACE_FIRST_ID, 0), |
1356 | univ_page_size, |
1357 | (log_sys.next_checkpoint_no & 1) |
1358 | ? LOG_CHECKPOINT_2 : LOG_CHECKPOINT_1, |
1359 | OS_FILE_LOG_BLOCK_SIZE, |
1360 | buf, reinterpret_cast<void*>(1) /* checkpoint write */); |
1361 | } |
1362 | |
1363 | /** Read a log group header page to log_sys.checkpoint_buf. |
@param[in]	header	0 or LOG_CHECKPOINT_1 or LOG_CHECKPOINT_2 */
void log_header_read(ulint header)
1366 | { |
1367 | ut_ad(log_mutex_own()); |
1368 | |
1369 | log_sys.n_log_ios++; |
1370 | |
1371 | MONITOR_INC(MONITOR_LOG_IO); |
1372 | |
1373 | fil_io(IORequestLogRead, true, |
1374 | page_id_t(SRV_LOG_SPACE_FIRST_ID, |
1375 | header >> srv_page_size_shift), |
1376 | univ_page_size, header & (srv_page_size - 1), |
1377 | OS_FILE_LOG_BLOCK_SIZE, log_sys.checkpoint_buf, NULL); |
1378 | } |
1379 | |
1380 | /** Write checkpoint info to the log header and invoke log_mutex_exit(). |
1381 | @param[in] sync whether to wait for the write to complete |
1382 | @param[in] end_lsn start LSN of the MLOG_CHECKPOINT mini-transaction */ |
1383 | void |
1384 | log_write_checkpoint_info(bool sync, lsn_t end_lsn) |
1385 | { |
1386 | ut_ad(log_mutex_own()); |
1387 | ut_ad(!srv_read_only_mode); |
1388 | |
1389 | log_group_checkpoint(end_lsn); |
1390 | |
1391 | log_mutex_exit(); |
1392 | |
1393 | MONITOR_INC(MONITOR_NUM_CHECKPOINT); |
1394 | |
1395 | if (sync) { |
1396 | /* Wait for the checkpoint write to complete */ |
1397 | rw_lock_s_lock(&log_sys.checkpoint_lock); |
1398 | rw_lock_s_unlock(&log_sys.checkpoint_lock); |
1399 | |
1400 | DBUG_EXECUTE_IF( |
1401 | "crash_after_checkpoint" , |
1402 | DBUG_SUICIDE();); |
1403 | } |
1404 | } |
1405 | |
1406 | /** Set extra data to be written to the redo log during checkpoint. |
1407 | @param[in] buf data to be appended on checkpoint, or NULL |
1408 | @return pointer to previous data to be appended on checkpoint */ |
1409 | mtr_buf_t* |
1410 | log_append_on_checkpoint( |
1411 | mtr_buf_t* buf) |
1412 | { |
1413 | log_mutex_enter(); |
1414 | mtr_buf_t* old = log_sys.append_on_checkpoint; |
1415 | log_sys.append_on_checkpoint = buf; |
1416 | log_mutex_exit(); |
1417 | return(old); |
1418 | } |
1419 | |
/** Make a checkpoint. Note that this function does not flush dirty
blocks from the buffer pool: it only checks the lsn of the oldest
modification in the pool, and writes information about that lsn to
the log files. Use log_make_checkpoint_at() to also flush the pool.
1424 | @param[in] sync whether to wait for the write to complete |
1425 | @param[in] write_always force a write even if no log |
1426 | has been generated since the latest checkpoint |
1427 | @return true if success, false if a checkpoint write was already running */ |
1428 | bool |
1429 | log_checkpoint( |
1430 | bool sync, |
1431 | bool write_always) |
1432 | { |
1433 | lsn_t oldest_lsn; |
1434 | |
1435 | ut_ad(!srv_read_only_mode); |
1436 | |
1437 | DBUG_EXECUTE_IF("no_checkpoint" , |
			/* Sleep for long enough to ensure that the
			checkpoint cannot happen any more. */
1440 | os_thread_sleep(360000000);); |
1441 | |
1442 | if (recv_recovery_is_on()) { |
1443 | recv_apply_hashed_log_recs(true); |
1444 | } |
1445 | |
1446 | switch (srv_file_flush_method) { |
1447 | case SRV_NOSYNC: |
1448 | break; |
1449 | case SRV_O_DSYNC: |
1450 | case SRV_FSYNC: |
1451 | case SRV_LITTLESYNC: |
1452 | case SRV_O_DIRECT: |
1453 | case SRV_O_DIRECT_NO_FSYNC: |
1454 | #ifdef _WIN32 |
1455 | case SRV_ALL_O_DIRECT_FSYNC: |
1456 | #endif |
1457 | fil_flush_file_spaces(FIL_TYPE_TABLESPACE); |
1458 | } |
1459 | |
1460 | log_mutex_enter(); |
1461 | |
1462 | ut_ad(!recv_no_log_write); |
1463 | oldest_lsn = log_buf_pool_get_oldest_modification(); |
1464 | |
1465 | /* Because log also contains headers and dummy log records, |
1466 | log_buf_pool_get_oldest_modification() will return log_sys.lsn |
1467 | if the buffer pool contains no dirty buffers. |
1468 | We must make sure that the log is flushed up to that lsn. |
1469 | If there are dirty buffers in the buffer pool, then our |
1470 | write-ahead-logging algorithm ensures that the log has been |
1471 | flushed up to oldest_lsn. */ |
1472 | |
1473 | ut_ad(oldest_lsn >= log_sys.last_checkpoint_lsn); |
1474 | if (!write_always |
1475 | && oldest_lsn |
1476 | <= log_sys.last_checkpoint_lsn + SIZE_OF_MLOG_CHECKPOINT) { |
1477 | /* Do nothing, because nothing was logged (other than |
1478 | a MLOG_CHECKPOINT marker) since the previous checkpoint. */ |
1479 | log_mutex_exit(); |
1480 | return(true); |
1481 | } |
1482 | /* Repeat the MLOG_FILE_NAME records after the checkpoint, in |
1483 | case some log records between the checkpoint and log_sys.lsn |
1484 | need them. Finally, write a MLOG_CHECKPOINT marker. Redo log |
1485 | apply expects to see a MLOG_CHECKPOINT after the checkpoint, |
1486 | except on clean shutdown, where the log will be empty after |
1487 | the checkpoint. |
1488 | It is important that we write out the redo log before any |
1489 | further dirty pages are flushed to the tablespace files. At |
1490 | this point, because log_mutex_own(), mtr_commit() in other |
1491 | threads will be blocked, and no pages can be added to the |
1492 | flush lists. */ |
1493 | lsn_t flush_lsn = oldest_lsn; |
1494 | const lsn_t end_lsn = log_sys.lsn; |
1495 | const bool do_write |
1496 | = srv_shutdown_state == SRV_SHUTDOWN_NONE |
1497 | || flush_lsn != end_lsn; |
1498 | |
1499 | if (fil_names_clear(flush_lsn, do_write)) { |
1500 | ut_ad(log_sys.lsn >= end_lsn + SIZE_OF_MLOG_CHECKPOINT); |
1501 | flush_lsn = log_sys.lsn; |
1502 | } |
1503 | |
1504 | log_mutex_exit(); |
1505 | |
1506 | log_write_up_to(flush_lsn, true); |
1507 | |
1508 | DBUG_EXECUTE_IF( |
1509 | "using_wa_checkpoint_middle" , |
1510 | if (write_always) { |
1511 | DEBUG_SYNC_C("wa_checkpoint_middle" ); |
1512 | |
1513 | const my_bool b = TRUE; |
1514 | buf_flush_page_cleaner_disabled_debug_update( |
1515 | NULL, NULL, NULL, &b); |
1516 | dict_stats_disabled_debug_update( |
1517 | NULL, NULL, NULL, &b); |
1518 | srv_master_thread_disabled_debug_update( |
1519 | NULL, NULL, NULL, &b); |
1520 | }); |
1521 | |
1522 | log_mutex_enter(); |
1523 | |
1524 | ut_ad(log_sys.flushed_to_disk_lsn >= flush_lsn); |
1525 | ut_ad(flush_lsn >= oldest_lsn); |
1526 | |
1527 | if (log_sys.last_checkpoint_lsn >= oldest_lsn) { |
1528 | log_mutex_exit(); |
1529 | return(true); |
1530 | } |
1531 | |
1532 | if (log_sys.n_pending_checkpoint_writes > 0) { |
1533 | /* A checkpoint write is running */ |
1534 | log_mutex_exit(); |
1535 | |
1536 | if (sync) { |
1537 | /* Wait for the checkpoint write to complete */ |
1538 | rw_lock_s_lock(&log_sys.checkpoint_lock); |
1539 | rw_lock_s_unlock(&log_sys.checkpoint_lock); |
1540 | } |
1541 | |
1542 | return(false); |
1543 | } |
1544 | |
1545 | log_sys.next_checkpoint_lsn = oldest_lsn; |
1546 | log_write_checkpoint_info(sync, end_lsn); |
1547 | ut_ad(!log_mutex_own()); |
1548 | |
1549 | return(true); |
1550 | } |
1551 | |
1552 | /** Make a checkpoint at or after a specified LSN. |
1553 | @param[in] lsn the log sequence number, or LSN_MAX |
1554 | for the latest LSN |
1555 | @param[in] write_always force a write even if no log |
1556 | has been generated since the latest checkpoint */ |
1557 | void |
1558 | log_make_checkpoint_at( |
1559 | lsn_t lsn, |
1560 | bool write_always) |
1561 | { |
1562 | /* Preflush pages synchronously */ |
1563 | |
1564 | while (!log_preflush_pool_modified_pages(lsn)) { |
1565 | /* Flush as much as we can */ |
1566 | } |
1567 | |
1568 | while (!log_checkpoint(true, write_always)) { |
1569 | /* Force a checkpoint */ |
1570 | } |
1571 | } |
1572 | |
1573 | /****************************************************************//** |
1574 | Tries to establish a big enough margin of free space in the log groups, such |
1575 | that a new log entry can be catenated without an immediate need for a |
1576 | checkpoint. NOTE: this function may only be called if the calling thread |
1577 | owns no synchronization objects! */ |
1578 | static |
1579 | void |
1580 | log_checkpoint_margin(void) |
1581 | /*=======================*/ |
1582 | { |
1583 | lsn_t age; |
1584 | lsn_t checkpoint_age; |
1585 | ib_uint64_t advance; |
1586 | lsn_t oldest_lsn; |
1587 | bool success; |
1588 | loop: |
1589 | advance = 0; |
1590 | |
1591 | log_mutex_enter(); |
1592 | ut_ad(!recv_no_log_write); |
1593 | |
1594 | if (!log_sys.check_flush_or_checkpoint) { |
1595 | log_mutex_exit(); |
1596 | return; |
1597 | } |
1598 | |
1599 | oldest_lsn = log_buf_pool_get_oldest_modification(); |
1600 | |
1601 | age = log_sys.lsn - oldest_lsn; |
1602 | |
1603 | if (age > log_sys.max_modified_age_sync) { |
1604 | |
1605 | /* A flush is urgent: we have to do a synchronous preflush */ |
1606 | advance = age - log_sys.max_modified_age_sync; |
1607 | } |
1608 | |
1609 | checkpoint_age = log_sys.lsn - log_sys.last_checkpoint_lsn; |
1610 | |
1611 | bool checkpoint_sync; |
1612 | bool do_checkpoint; |
1613 | |
1614 | if (checkpoint_age > log_sys.max_checkpoint_age) { |
1615 | /* A checkpoint is urgent: we do it synchronously */ |
1616 | checkpoint_sync = true; |
1617 | do_checkpoint = true; |
1618 | } else if (checkpoint_age > log_sys.max_checkpoint_age_async) { |
1619 | /* A checkpoint is not urgent: do it asynchronously */ |
1620 | do_checkpoint = true; |
1621 | checkpoint_sync = false; |
1622 | log_sys.check_flush_or_checkpoint = false; |
1623 | } else { |
1624 | do_checkpoint = false; |
1625 | checkpoint_sync = false; |
1626 | log_sys.check_flush_or_checkpoint = false; |
1627 | } |
1628 | |
1629 | log_mutex_exit(); |
1630 | |
1631 | if (advance) { |
1632 | lsn_t new_oldest = oldest_lsn + advance; |
1633 | |
1634 | success = log_preflush_pool_modified_pages(new_oldest); |
1635 | |
1636 | /* If the flush succeeded, this thread has done its part |
1637 | and can proceed. If it did not succeed, there was another |
1638 | thread doing a flush at the same time. */ |
1639 | if (!success) { |
1640 | log_mutex_enter(); |
1641 | log_sys.check_flush_or_checkpoint = true; |
1642 | log_mutex_exit(); |
1643 | goto loop; |
1644 | } |
1645 | } |
1646 | |
1647 | if (do_checkpoint) { |
		log_checkpoint(checkpoint_sync, false);
1649 | |
1650 | if (checkpoint_sync) { |
1651 | |
1652 | goto loop; |
1653 | } |
1654 | } |
1655 | } |
1656 | |
1657 | /** |
1658 | Checks that there is enough free space in the log to start a new query step. |
1659 | Flushes the log buffer or makes a new checkpoint if necessary. NOTE: this |
1660 | function may only be called if the calling thread owns no synchronization |
1661 | objects! */ |
1662 | void |
1663 | log_check_margins(void) |
1664 | { |
1665 | bool check; |
1666 | |
1667 | do { |
1668 | log_flush_margin(); |
1669 | log_checkpoint_margin(); |
1670 | log_mutex_enter(); |
1671 | ut_ad(!recv_no_log_write); |
1672 | check = log_sys.check_flush_or_checkpoint; |
1673 | log_mutex_exit(); |
1674 | } while (check); |
1675 | } |
1676 | |
1677 | /****************************************************************//** |
Makes a checkpoint at the latest lsn and writes it to the first page of each
data file in the database, so that we know that the file spaces contain
all modifications up to that lsn. This can only be called at database
shutdown. This function also writes all of the log in the log files to the
log archive. */
1682 | void |
1683 | logs_empty_and_mark_files_at_shutdown(void) |
1684 | /*=======================================*/ |
1685 | { |
1686 | lsn_t lsn; |
1687 | ulint count = 0; |
1688 | |
1689 | ib::info() << "Starting shutdown..." ; |
1690 | |
1691 | /* Wait until the master thread and all other operations are idle: our |
1692 | algorithm only works if the server is idle at shutdown */ |
1693 | |
1694 | srv_shutdown_state = SRV_SHUTDOWN_CLEANUP; |
1695 | loop: |
1696 | ut_ad(lock_sys.is_initialised() || !srv_was_started); |
1697 | ut_ad(log_sys.is_initialised() || !srv_was_started); |
1698 | ut_ad(fil_system.is_initialised() || !srv_was_started); |
1699 | os_event_set(srv_buf_resize_event); |
1700 | |
1701 | if (!srv_read_only_mode) { |
1702 | os_event_set(srv_error_event); |
1703 | os_event_set(srv_monitor_event); |
1704 | os_event_set(srv_buf_dump_event); |
1705 | if (lock_sys.timeout_thread_active) { |
1706 | os_event_set(lock_sys.timeout_event); |
1707 | } |
1708 | if (dict_stats_event) { |
1709 | os_event_set(dict_stats_event); |
1710 | } else { |
1711 | ut_ad(!srv_dict_stats_thread_active); |
1712 | } |
1713 | if (recv_sys && recv_sys->flush_start) { |
1714 | /* This is in case recv_writer_thread was never |
1715 | started, or buf_flush_page_cleaner_coordinator |
1716 | failed to notice its termination. */ |
1717 | os_event_set(recv_sys->flush_start); |
1718 | } |
1719 | } |
1720 | #define COUNT_INTERVAL 600U |
1721 | #define CHECK_INTERVAL 100000U |
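	/* With these values, each pass of the loop sleeps for
	CHECK_INTERVAL = 100000 us (100 ms), and the verbose-log
	messages below are emitted only after COUNT_INTERVAL = 600
	such passes, i.e. at most about once a minute. */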
1722 | os_thread_sleep(CHECK_INTERVAL); |
1723 | |
1724 | count++; |
1725 | |
	/* Check that there are no longer any active transactions, except
	for PREPARED ones. We need this wait even for the 'very fast'
	shutdown, because the InnoDB layer may have committed or
	prepared transactions and we don't want to lose them. */
1730 | |
1731 | if (ulint total_trx = srv_was_started && !srv_read_only_mode |
1732 | && srv_force_recovery < SRV_FORCE_NO_TRX_UNDO |
1733 | ? trx_sys.any_active_transactions() : 0) { |
1734 | |
1735 | if (srv_print_verbose_log && count > COUNT_INTERVAL) { |
1736 | service_manager_extend_timeout( |
1737 | COUNT_INTERVAL * CHECK_INTERVAL/1000000 * 2, |
1738 | "Waiting for %lu active transactions to finish" , |
1739 | (ulong) total_trx); |
1740 | ib::info() << "Waiting for " << total_trx << " active" |
1741 | << " transactions to finish" ; |
1742 | |
1743 | count = 0; |
1744 | } |
1745 | |
1746 | goto loop; |
1747 | } |
1748 | |
1749 | /* We need these threads to stop early in shutdown. */ |
1750 | const char* thread_name; |
1751 | |
	if (srv_error_monitor_active) {
		thread_name = "srv_error_monitor_thread";
	} else if (srv_monitor_active) {
		thread_name = "srv_monitor_thread";
	} else if (srv_buf_resize_thread_active) {
		thread_name = "buf_resize_thread";
		goto wait_suspend_loop;
	} else if (srv_dict_stats_thread_active) {
		thread_name = "dict_stats_thread";
	} else if (lock_sys.timeout_thread_active) {
		thread_name = "lock_wait_timeout_thread";
	} else if (srv_buf_dump_thread_active) {
		thread_name = "buf_dump_thread";
		goto wait_suspend_loop;
	} else if (btr_defragment_thread_active) {
		thread_name = "btr_defragment_thread";
	} else if (srv_fast_shutdown != 2 && trx_rollback_is_active) {
		thread_name = "rollback of recovered transactions";
	} else {
		thread_name = NULL;
	}
1773 | |
1774 | if (thread_name) { |
1775 | ut_ad(!srv_read_only_mode); |
1776 | wait_suspend_loop: |
	service_manager_extend_timeout(
		COUNT_INTERVAL * CHECK_INTERVAL/1000000 * 2,
		"Waiting for %s to exit", thread_name);
	if (srv_print_verbose_log && count > COUNT_INTERVAL) {
		ib::info() << "Waiting for " << thread_name
			<< " to exit";
		count = 0;
	}
1785 | goto loop; |
1786 | } |
1787 | |
1788 | /* Check that the background threads are suspended */ |
1789 | |
1790 | switch (srv_get_active_thread_type()) { |
1791 | case SRV_NONE: |
1792 | if (!srv_n_fil_crypt_threads_started) { |
1793 | srv_shutdown_state = SRV_SHUTDOWN_FLUSH_PHASE; |
1794 | break; |
1795 | } |
1796 | os_event_set(fil_crypt_threads_event); |
1797 | thread_name = "fil_crypt_thread" ; |
1798 | goto wait_suspend_loop; |
1799 | case SRV_PURGE: |
1800 | case SRV_WORKER: |
1801 | ut_ad(!"purge was not shut down" ); |
1802 | srv_purge_wakeup(); |
1803 | thread_name = "purge thread" ; |
1804 | goto wait_suspend_loop; |
1805 | case SRV_MASTER: |
1806 | thread_name = "master thread" ; |
1807 | goto wait_suspend_loop; |
1808 | } |
1809 | |
1810 | /* At this point only page_cleaner should be active. We wait |
1811 | here to let it complete the flushing of the buffer pools |
1812 | before proceeding further. */ |
1813 | |
1814 | count = 0; |
	service_manager_extend_timeout(COUNT_INTERVAL * CHECK_INTERVAL/1000000 * 2,
		"Waiting for page cleaner");
	while (buf_page_cleaner_is_active) {
		++count;
		os_thread_sleep(CHECK_INTERVAL);
		if (srv_print_verbose_log && count > COUNT_INTERVAL) {
			service_manager_extend_timeout(COUNT_INTERVAL * CHECK_INTERVAL/1000000 * 2,
				"Waiting for page cleaner");
			ib::info() << "Waiting for page_cleaner to "
				"finish flushing of buffer pool";
1825 | count = 0; |
1826 | } |
1827 | } |
1828 | |
1829 | if (log_scrub_thread_active) { |
1830 | ut_ad(!srv_read_only_mode); |
1831 | os_event_set(log_scrub_event); |
1832 | } |
1833 | |
1834 | if (log_sys.is_initialised()) { |
1835 | log_mutex_enter(); |
1836 | const ulint n_write = log_sys.n_pending_checkpoint_writes; |
1837 | const ulint n_flush = log_sys.n_pending_flushes; |
1838 | log_mutex_exit(); |
1839 | |
1840 | if (log_scrub_thread_active || n_write || n_flush) { |
			if (srv_print_verbose_log && count > COUNT_INTERVAL) {
1842 | ib::info() << "Pending checkpoint_writes: " |
1843 | << n_write |
1844 | << ". Pending log flush writes: " |
1845 | << n_flush; |
1846 | count = 0; |
1847 | } |
1848 | goto loop; |
1849 | } |
1850 | } |
1851 | |
1852 | ut_ad(!log_scrub_thread_active); |
1853 | |
1854 | if (!buf_pool_ptr) { |
1855 | ut_ad(!srv_was_started); |
1856 | } else if (ulint pending_io = buf_pool_check_no_pending_io()) { |
		if (srv_print_verbose_log && count > COUNT_INTERVAL) {
			ib::info() << "Waiting for " << pending_io << " buffer"
				" page I/Os to complete";
1860 | count = 0; |
1861 | } |
1862 | |
1863 | goto loop; |
1864 | } |
1865 | |
1866 | if (srv_fast_shutdown == 2 || !srv_was_started) { |
1867 | if (!srv_read_only_mode && srv_was_started) { |
1868 | ib::info() << "MySQL has requested a very fast" |
1869 | " shutdown without flushing the InnoDB buffer" |
1870 | " pool to data files. At the next mysqld" |
1871 | " startup InnoDB will do a crash recovery!" ; |
1872 | |
1873 | /* In this fastest shutdown we do not flush the |
1874 | buffer pool: |
1875 | |
1876 | it is essentially a 'crash' of the InnoDB server. |
1877 | Make sure that the log is all flushed to disk, so |
1878 | that we can recover all committed transactions in |
1879 | a crash recovery. We must not write the lsn stamps |
		to the data files, since at startup InnoDB deduces
		from the stamps whether the previous shutdown was clean. */
1882 | |
1883 | log_buffer_flush_to_disk(); |
1884 | } |
1885 | |
1886 | srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE; |
1887 | |
1888 | if (fil_system.is_initialised()) { |
1889 | fil_close_all_files(); |
1890 | } |
1891 | return; |
1892 | } |
1893 | |
1894 | if (!srv_read_only_mode) { |
		service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
			"ensuring dirty buffer pool pages are written to log");
1897 | log_make_checkpoint_at(LSN_MAX, TRUE); |
1898 | |
1899 | log_mutex_enter(); |
1900 | |
1901 | lsn = log_sys.lsn; |
1902 | |
1903 | const bool lsn_changed = lsn != log_sys.last_checkpoint_lsn; |
1904 | ut_ad(lsn >= log_sys.last_checkpoint_lsn); |
1905 | |
1906 | log_mutex_exit(); |
1907 | |
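		/* If the lsn advanced past the checkpoint that was just
		made, some thread generated new redo records while the
		checkpoint was in progress; repeat the wait-and-checkpoint
		cycle until the log stays quiescent. */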
1908 | if (lsn_changed) { |
1909 | goto loop; |
1910 | } |
1911 | |
1912 | /* Ensure that all buffered changes are written to the |
1913 | redo log before fil_close_all_files(). */ |
1914 | fil_flush_file_spaces(FIL_TYPE_LOG); |
1915 | } else { |
1916 | lsn = srv_start_lsn; |
1917 | } |
1918 | |
1919 | srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE; |
1920 | |
1921 | /* Make some checks that the server really is quiet */ |
1922 | ut_a(srv_get_active_thread_type() == SRV_NONE); |
1923 | |
1924 | service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL, |
1925 | "Free innodb buffer pool" ); |
1926 | buf_all_freed(); |
1927 | |
1928 | ut_a(lsn == log_sys.lsn |
1929 | || srv_force_recovery == SRV_FORCE_NO_LOG_REDO); |
1930 | |
1931 | if (lsn < srv_start_lsn) { |
1932 | ib::error() << "Shutdown LSN=" << lsn |
1933 | << " is less than start LSN=" << srv_start_lsn; |
1934 | } |
1935 | |
1936 | srv_shutdown_lsn = lsn; |
1937 | |
1938 | if (!srv_read_only_mode) { |
1939 | dberr_t err = fil_write_flushed_lsn(lsn); |
1940 | |
1941 | if (err != DB_SUCCESS) { |
1942 | ib::error() << "Writing flushed lsn " << lsn |
1943 | << " failed; error=" << err; |
1944 | } |
1945 | } |
1946 | |
1947 | fil_close_all_files(); |
1948 | |
1949 | /* Make some checks that the server really is quiet */ |
1950 | ut_a(srv_get_active_thread_type() == SRV_NONE); |
1951 | |
1952 | ut_a(lsn == log_sys.lsn |
1953 | || srv_force_recovery == SRV_FORCE_NO_LOG_REDO); |
1954 | } |
1955 | |
1956 | /******************************************************//** |
1957 | Peeks the current lsn. |
1958 | @return TRUE if success, FALSE if could not get the log system mutex */ |
1959 | ibool |
1960 | log_peek_lsn( |
1961 | /*=========*/ |
1962 | lsn_t* lsn) /*!< out: if returns TRUE, current lsn is here */ |
1963 | { |
1964 | if (0 == mutex_enter_nowait(&(log_sys.mutex))) { |
1965 | *lsn = log_sys.lsn; |
1966 | |
1967 | log_mutex_exit(); |
1968 | |
1969 | return(TRUE); |
1970 | } |
1971 | |
1972 | return(FALSE); |
1973 | } |
1974 | |
1975 | /******************************************************//** |
1976 | Prints info of the log. */ |
1977 | void |
1978 | log_print( |
1979 | /*======*/ |
1980 | FILE* file) /*!< in: file where to print */ |
1981 | { |
1982 | double time_elapsed; |
1983 | time_t current_time; |
1984 | |
1985 | log_mutex_enter(); |
1986 | |
1987 | fprintf(file, |
1988 | "Log sequence number " LSN_PF "\n" |
1989 | "Log flushed up to " LSN_PF "\n" |
1990 | "Pages flushed up to " LSN_PF "\n" |
1991 | "Last checkpoint at " LSN_PF "\n" , |
1992 | log_sys.lsn, |
1993 | log_sys.flushed_to_disk_lsn, |
1994 | log_buf_pool_get_oldest_modification(), |
1995 | log_sys.last_checkpoint_lsn); |
1996 | |
1997 | current_time = time(NULL); |
1998 | |
1999 | time_elapsed = difftime(current_time, |
2000 | log_sys.last_printout_time); |
2001 | |
2002 | if (time_elapsed <= 0) { |
2003 | time_elapsed = 1; |
2004 | } |
2005 | |
2006 | fprintf(file, |
2007 | ULINTPF " pending log flushes, " |
2008 | ULINTPF " pending chkp writes\n" |
2009 | ULINTPF " log i/o's done, %.2f log i/o's/second\n" , |
2010 | log_sys.n_pending_flushes, |
2011 | log_sys.n_pending_checkpoint_writes, |
2012 | log_sys.n_log_ios, |
2013 | static_cast<double>( |
2014 | log_sys.n_log_ios - log_sys.n_log_ios_old) |
2015 | / time_elapsed); |
2016 | |
2017 | log_sys.n_log_ios_old = log_sys.n_log_ios; |
2018 | log_sys.last_printout_time = current_time; |
2019 | |
2020 | log_mutex_exit(); |
2021 | } |
2022 | |
2023 | /**********************************************************************//** |
2024 | Refreshes the statistics used to print per-second averages. */ |
2025 | void |
2026 | log_refresh_stats(void) |
2027 | /*===================*/ |
2028 | { |
2029 | log_sys.n_log_ios_old = log_sys.n_log_ios; |
2030 | log_sys.last_printout_time = time(NULL); |
2031 | } |
2032 | |
2033 | /** Shut down the redo log subsystem. */ |
2034 | void log_t::close() |
2035 | { |
2036 | ut_ad(this == &log_sys); |
2037 | if (!is_initialised()) return; |
2038 | m_initialised = false; |
2039 | log.close(); |
2040 | |
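  /* The log buffer was allocated as one chunk of
  2 * srv_log_buffer_size bytes, and buf points to whichever half is
  currently in use; rewind it to the start of the allocation before
  freeing. */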
2041 | if (!first_in_use) |
2042 | buf -= srv_log_buffer_size; |
2043 | ut_free_dodump(buf, srv_log_buffer_size * 2); |
2044 | buf = NULL; |
2045 | |
2046 | os_event_destroy(flush_event); |
2047 | |
2048 | rw_lock_free(&checkpoint_lock); |
  /* rw_lock_free() already called checkpoint_lock.~rw_lock_t();
  tame the debug assertions so that the destructor can run once more. */
2051 | ut_ad(checkpoint_lock.magic_n == 0); |
2052 | ut_d(checkpoint_lock.magic_n = RW_LOCK_MAGIC_N); |
2053 | |
2054 | mutex_free(&mutex); |
2055 | mutex_free(&write_mutex); |
2056 | mutex_free(&log_flush_order_mutex); |
2057 | |
2058 | if (!srv_read_only_mode && srv_scrub_log) |
2059 | os_event_destroy(log_scrub_event); |
2060 | |
2061 | recv_sys_close(); |
2062 | } |
2063 | |
2064 | /******************************************************//** |
Pads the current log block full with dummy log records. Used in producing
consistent archived log files and in scrubbing the redo log. */
2067 | static |
2068 | void |
2069 | log_pad_current_log_block(void) |
2070 | /*===========================*/ |
2071 | { |
2072 | byte b = MLOG_DUMMY_RECORD; |
2073 | ulint pad_length; |
2074 | ulint i; |
2075 | lsn_t lsn; |
2076 | |
2077 | ut_ad(!recv_no_log_write); |
2078 | /* We retrieve lsn only because otherwise gcc crashed on HP-UX */ |
2079 | lsn = log_reserve_and_open(OS_FILE_LOG_BLOCK_SIZE); |
2080 | |
2081 | pad_length = OS_FILE_LOG_BLOCK_SIZE |
2082 | - (log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE) |
2083 | - LOG_BLOCK_TRL_SIZE; |
2084 | if (pad_length |
2085 | == (OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_HDR_SIZE |
2086 | - LOG_BLOCK_TRL_SIZE)) { |
2087 | |
2088 | pad_length = 0; |
2089 | } |
2090 | |
2091 | if (pad_length) { |
2092 | srv_stats.n_log_scrubs.inc(); |
2093 | } |
2094 | |
2095 | for (i = 0; i < pad_length; i++) { |
2096 | log_write_low(&b, 1); |
2097 | } |
2098 | |
2099 | lsn = log_sys.lsn; |
2100 | |
2101 | log_close(); |
2102 | |
2103 | ut_a(lsn % OS_FILE_LOG_BLOCK_SIZE == LOG_BLOCK_HDR_SIZE); |
2104 | } |
2105 | |
2106 | /*****************************************************************//* |
2107 | If no log record has been written for a while, fill current log |
2108 | block with dummy records. */ |
2109 | static |
2110 | void |
2111 | log_scrub() |
2112 | /*=========*/ |
2113 | { |
2114 | log_mutex_enter(); |
2115 | ulint cur_lbn = log_block_convert_lsn_to_no(log_sys.lsn); |
2116 | |
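	/* If the current block number is unchanged since the previous
	scrub pass, no new log records have been written in the
	interval; pad the partially filled block with dummy records so
	that it is written out in full. */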
2117 | if (next_lbn_to_pad == cur_lbn) |
2118 | { |
2119 | log_pad_current_log_block(); |
2120 | } |
2121 | |
2122 | next_lbn_to_pad = log_block_convert_lsn_to_no(log_sys.lsn); |
2123 | log_mutex_exit(); |
2124 | } |
2125 | |
2126 | /* log scrubbing speed, in bytes/sec */ |
2127 | UNIV_INTERN ulonglong innodb_scrub_log_speed; |
2128 | |
2129 | /*****************************************************************//** |
This is the main thread for log scrubbing. It waits for an event and,
when woken up, fills the current log block with dummy records and
sleeps again.
2133 | @return this function does not return, it calls os_thread_exit() */ |
2134 | extern "C" UNIV_INTERN |
2135 | os_thread_ret_t |
2136 | DECLARE_THREAD(log_scrub_thread)(void*) |
2137 | { |
2138 | ut_ad(!srv_read_only_mode); |
2139 | |
2140 | while (srv_shutdown_state < SRV_SHUTDOWN_FLUSH_PHASE) { |
2141 | /* log scrubbing interval in µs. */ |
2142 | ulonglong interval = 1000*1000*512/innodb_scrub_log_speed; |
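
		/* Each wakeup pads at most one 512-byte block, so a
		sleep of 512e6 / innodb_scrub_log_speed microseconds
		caps the padding rate at roughly innodb_scrub_log_speed
		bytes per second; e.g. at 16384 bytes/sec the interval
		is 31250 us, about 31 ms. */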
2143 | |
2144 | os_event_wait_time(log_scrub_event, static_cast<ulint>(interval)); |
2145 | |
2146 | log_scrub(); |
2147 | |
2148 | os_event_reset(log_scrub_event); |
2149 | } |
2150 | |
2151 | log_scrub_thread_active = false; |
2152 | |
2153 | /* We count the number of threads in os_thread_exit(). A created |
2154 | thread should always use that to exit and not use return() to exit. */ |
2155 | os_thread_exit(); |
2156 | |
2157 | OS_THREAD_DUMMY_RETURN; |
2158 | } |
2159 | |