1 | /*------------------------------------------------------------------------- |
2 | * |
3 | * checkpointer.c |
4 | * |
5 | * The checkpointer is new as of Postgres 9.2. It handles all checkpoints. |
6 | * Checkpoints are automatically dispatched after a certain amount of time has |
7 | * elapsed since the last one, and it can be signaled to perform requested |
8 | * checkpoints as well. (The GUC parameter that mandates a checkpoint every |
9 | * so many WAL segments is implemented by having backends signal when they |
10 | * fill WAL segments; the checkpointer itself doesn't watch for the |
11 | * condition.) |
12 | * |
13 | * The checkpointer is started by the postmaster as soon as the startup |
14 | * subprocess finishes, or as soon as recovery begins if we are doing archive |
15 | * recovery. It remains alive until the postmaster commands it to terminate. |
16 | * Normal termination is by SIGUSR2, which instructs the checkpointer to |
17 | * execute a shutdown checkpoint and then exit(0). (All backends must be |
18 | * stopped before SIGUSR2 is issued!) Emergency termination is by SIGQUIT; |
19 | * like any backend, the checkpointer will simply abort and exit on SIGQUIT. |
20 | * |
21 | * If the checkpointer exits unexpectedly, the postmaster treats that the same |
22 | * as a backend crash: shared memory may be corrupted, so remaining backends |
23 | * should be killed by SIGQUIT and then a recovery cycle started. (Even if |
24 | * shared memory isn't corrupted, we have lost information about which |
25 | * files need to be fsync'd for the next checkpoint, and so a system |
26 | * restart needs to be forced.) |
27 | * |
28 | * |
29 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
30 | * |
31 | * |
32 | * IDENTIFICATION |
33 | * src/backend/postmaster/checkpointer.c |
34 | * |
35 | *------------------------------------------------------------------------- |
36 | */ |
37 | #include "postgres.h" |
38 | |
39 | #include <signal.h> |
40 | #include <sys/time.h> |
41 | #include <time.h> |
42 | #include <unistd.h> |
43 | |
44 | #include "access/xlog.h" |
45 | #include "access/xlog_internal.h" |
46 | #include "libpq/pqsignal.h" |
47 | #include "miscadmin.h" |
48 | #include "pgstat.h" |
49 | #include "postmaster/bgwriter.h" |
50 | #include "replication/syncrep.h" |
51 | #include "storage/bufmgr.h" |
52 | #include "storage/condition_variable.h" |
53 | #include "storage/fd.h" |
54 | #include "storage/ipc.h" |
55 | #include "storage/lwlock.h" |
56 | #include "storage/proc.h" |
57 | #include "storage/shmem.h" |
58 | #include "storage/smgr.h" |
59 | #include "storage/spin.h" |
60 | #include "utils/guc.h" |
61 | #include "utils/memutils.h" |
62 | #include "utils/resowner.h" |
63 | |
64 | |
65 | /*---------- |
66 | * Shared memory area for communication between checkpointer and backends |
67 | * |
68 | * The ckpt counters allow backends to watch for completion of a checkpoint |
69 | * request they send. Here's how it works: |
70 | * * At start of a checkpoint, checkpointer reads (and clears) the request |
71 | * flags and increments ckpt_started, while holding ckpt_lck. |
72 | * * On completion of a checkpoint, checkpointer sets ckpt_done to |
73 | * equal ckpt_started. |
74 | * * On failure of a checkpoint, checkpointer increments ckpt_failed |
75 | * and sets ckpt_done to equal ckpt_started. |
76 | * |
77 | * The algorithm for backends is: |
78 | * 1. Record current values of ckpt_failed and ckpt_started, and |
79 | * set request flags, while holding ckpt_lck. |
80 | * 2. Send signal to request checkpoint. |
81 | * 3. Sleep until ckpt_started changes. Now you know a checkpoint has |
82 | * begun since you started this algorithm (although *not* that it was |
83 | * specifically initiated by your signal), and that it is using your flags. |
84 | * 4. Record new value of ckpt_started. |
85 | * 5. Sleep until ckpt_done >= saved value of ckpt_started. (Use modulo |
86 | * arithmetic here in case counters wrap around.) Now you know a |
87 | * checkpoint has started and completed, but not whether it was |
88 | * successful. |
89 | * 6. If ckpt_failed is different from the originally saved value, |
90 | * assume request failed; otherwise it was definitely successful. |
91 | * |
92 | * ckpt_flags holds the OR of the checkpoint request flags sent by all |
93 | * requesting backends since the last checkpoint start. The flags are |
94 | * chosen so that OR'ing is the correct way to combine multiple requests. |
95 | * |
96 | * num_backend_writes is used to count the number of buffer writes performed |
97 | * by user backend processes. This counter should be wide enough that it |
98 | * can't overflow during a single processing cycle. num_backend_fsync |
99 | * counts the subset of those writes that also had to do their own fsync, |
100 | * because the checkpointer failed to absorb their request. |
101 | * |
102 | * The requests array holds fsync requests sent by backends and not yet |
103 | * absorbed by the checkpointer. |
104 | * |
105 | * Unlike the checkpoint fields, num_backend_writes, num_backend_fsync, and |
106 | * the requests fields are protected by CheckpointerCommLock. |
107 | *---------- |
108 | */ |
/*
 * A single fsync request queued by a backend for the checkpointer to absorb.
 * Note: CheckpointerShmemInit zeroes the whole requests[] array so that
 * CompactCheckpointerRequestQueue can assume any pad bytes here are zeroes.
 */
typedef struct
{
	SyncRequestType type;		/* request type */
	FileTag		ftag;			/* file identifier */
} CheckpointerRequest;
114 | |
/*
 * Shared-memory state for checkpointer/backend communication; see the
 * protocol description in the long comment above.  The ckpt_* fields are
 * protected by ckpt_lck; num_backend_writes, num_backend_fsync, and the
 * requests queue are protected by CheckpointerCommLock instead.
 */
typedef struct
{
	pid_t		checkpointer_pid;	/* PID (0 if not started) */

	slock_t		ckpt_lck;		/* protects all the ckpt_* fields */

	int			ckpt_started;	/* advances when checkpoint starts */
	int			ckpt_done;		/* advances when checkpoint done */
	int			ckpt_failed;	/* advances when checkpoint fails */

	int			ckpt_flags;		/* checkpoint flags, as defined in xlog.h */

	ConditionVariable start_cv; /* signaled when ckpt_started advances */
	ConditionVariable done_cv;	/* signaled when ckpt_done advances */

	uint32		num_backend_writes; /* counts user backend buffer writes */
	uint32		num_backend_fsync;	/* counts user backend fsync calls */

	int			num_requests;	/* current # of requests */
	int			max_requests;	/* allocated array size (NBuffers at init) */
	CheckpointerRequest requests[FLEXIBLE_ARRAY_MEMBER];
} CheckpointerShmemStruct;
137 | |
/* Pointer to the shared-memory struct above; set by CheckpointerShmemInit. */
static CheckpointerShmemStruct *CheckpointerShmem;

/* interval for calling AbsorbSyncRequests in CheckpointWriteDelay */
#define WRITES_PER_ABSORB		1000

/*
 * GUC parameters
 */
int			CheckPointTimeout = 300;	/* checkpoint_timeout, in seconds */
int			CheckPointWarning = 30; /* checkpoint_warning, in seconds */
double		CheckPointCompletionTarget = 0.5;	/* fraction of the
												 * checkpoint interval to
												 * spread writes over */

/*
 * Flags set by interrupt handlers for later service in the main loop.
 */
static volatile sig_atomic_t got_SIGHUP = false;
static volatile sig_atomic_t shutdown_requested = false;

/*
 * Private state
 */
static bool ckpt_active = false;

/* these values are valid when ckpt_active is true: */
static pg_time_t ckpt_start_time;	/* wall-clock start of current ckpt */
static XLogRecPtr ckpt_start_recptr;	/* WAL insert/replay ptr at start */
static double ckpt_cached_elapsed;	/* last progress value we fell behind */

static pg_time_t last_checkpoint_time;	/* start time of last completed ckpt */
static pg_time_t last_xlog_switch_time; /* last xlog switch done/requested */

/* Prototypes for private functions */

static void CheckArchiveTimeout(void);
static bool IsCheckpointOnSchedule(double progress);
static bool ImmediateCheckpointRequested(void);
static bool CompactCheckpointerRequestQueue(void);
static void UpdateSharedMemoryConfig(void);

/* Signal handlers */

static void chkpt_quickdie(SIGNAL_ARGS);
static void ChkptSigHupHandler(SIGNAL_ARGS);
static void ReqCheckpointHandler(SIGNAL_ARGS);
static void chkpt_sigusr1_handler(SIGNAL_ARGS);
static void ReqShutdownHandler(SIGNAL_ARGS);
184 | |
185 | |
186 | /* |
187 | * Main entry point for checkpointer process |
188 | * |
189 | * This is invoked from AuxiliaryProcessMain, which has already created the |
190 | * basic execution environment, but not enabled signals yet. |
191 | */ |
void
CheckpointerMain(void)
{
	sigjmp_buf	local_sigjmp_buf;
	MemoryContext checkpointer_context;

	/* Advertise our PID so backends can signal us checkpoint requests. */
	CheckpointerShmem->checkpointer_pid = MyProcPid;

	/*
	 * Properly accept or ignore signals the postmaster might send us
	 *
	 * Note: we deliberately ignore SIGTERM, because during a standard Unix
	 * system shutdown cycle, init will SIGTERM all processes at once.  We
	 * want to wait for the backends to exit, whereupon the postmaster will
	 * tell us it's okay to shut down (via SIGUSR2).
	 */
	pqsignal(SIGHUP, ChkptSigHupHandler);	/* set flag to read config file */
	pqsignal(SIGINT, ReqCheckpointHandler); /* request checkpoint */
	pqsignal(SIGTERM, SIG_IGN); /* ignore SIGTERM */
	pqsignal(SIGQUIT, chkpt_quickdie);	/* hard crash time */
	pqsignal(SIGALRM, SIG_IGN);
	pqsignal(SIGPIPE, SIG_IGN);
	pqsignal(SIGUSR1, chkpt_sigusr1_handler);
	pqsignal(SIGUSR2, ReqShutdownHandler);	/* request shutdown */

	/*
	 * Reset some signals that are accepted by postmaster but not here
	 */
	pqsignal(SIGCHLD, SIG_DFL);

	/* We allow SIGQUIT (quickdie) at all times */
	sigdelset(&BlockSig, SIGQUIT);

	/*
	 * Initialize so that first time-driven event happens at the correct time.
	 */
	last_checkpoint_time = last_xlog_switch_time = (pg_time_t) time(NULL);

	/*
	 * Create a memory context that we will do all our work in.  We do this so
	 * that we can reset the context during error recovery and thereby avoid
	 * possible memory leaks.  Formerly this code just ran in
	 * TopMemoryContext, but resetting that would be a really bad idea.
	 */
	checkpointer_context = AllocSetContextCreate(TopMemoryContext,
												 "Checkpointer",
												 ALLOCSET_DEFAULT_SIZES);
	MemoryContextSwitchTo(checkpointer_context);

	/*
	 * If an exception is encountered, processing resumes here.
	 *
	 * See notes in postgres.c about the design of this coding.
	 */
	if (sigsetjmp(local_sigjmp_buf, 1) != 0)
	{
		/* Since not using PG_TRY, must reset error stack by hand */
		error_context_stack = NULL;

		/* Prevent interrupts while cleaning up */
		HOLD_INTERRUPTS();

		/* Report the error to the server log */
		EmitErrorReport();

		/*
		 * These operations are really just a minimal subset of
		 * AbortTransaction().  We don't have very many resources to worry
		 * about in checkpointer, but we do have LWLocks, buffers, and temp
		 * files.
		 */
		LWLockReleaseAll();
		ConditionVariableCancelSleep();
		pgstat_report_wait_end();
		AbortBufferIO();
		UnlockBuffers();
		ReleaseAuxProcessResources(false);
		AtEOXact_Buffers(false);
		AtEOXact_SMgr();
		AtEOXact_Files(false);
		AtEOXact_HashTables(false);

		/* Warn any waiting backends that the checkpoint failed. */
		if (ckpt_active)
		{
			/* Bump ckpt_failed and mark the attempt done, under the lock. */
			SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
			CheckpointerShmem->ckpt_failed++;
			CheckpointerShmem->ckpt_done = CheckpointerShmem->ckpt_started;
			SpinLockRelease(&CheckpointerShmem->ckpt_lck);

			ConditionVariableBroadcast(&CheckpointerShmem->done_cv);

			ckpt_active = false;
		}

		/*
		 * Now return to normal top-level context and clear ErrorContext for
		 * next time.
		 */
		MemoryContextSwitchTo(checkpointer_context);
		FlushErrorState();

		/* Flush any leaked data in the top-level context */
		MemoryContextResetAndDeleteChildren(checkpointer_context);

		/* Now we can allow interrupts again */
		RESUME_INTERRUPTS();

		/*
		 * Sleep at least 1 second after any error.  A write error is likely
		 * to be repeated, and we don't want to be filling the error logs as
		 * fast as we can.
		 */
		pg_usleep(1000000L);

		/*
		 * Close all open files after any error.  This is helpful on Windows,
		 * where holding deleted files open causes various strange errors.
		 * It's not clear we need it elsewhere, but shouldn't hurt.
		 */
		smgrcloseall();
	}

	/* We can now handle ereport(ERROR) */
	PG_exception_stack = &local_sigjmp_buf;

	/*
	 * Unblock signals (they were blocked when the postmaster forked us)
	 */
	PG_SETMASK(&UnBlockSig);

	/*
	 * Ensure all shared memory values are set correctly for the config. Doing
	 * this here ensures no race conditions from other concurrent updaters.
	 */
	UpdateSharedMemoryConfig();

	/*
	 * Advertise our latch that backends can use to wake us up while we're
	 * sleeping.
	 */
	ProcGlobal->checkpointerLatch = &MyProc->procLatch;

	/*
	 * Loop forever
	 */
	for (;;)
	{
		bool		do_checkpoint = false;
		int			flags = 0;
		pg_time_t	now;
		int			elapsed_secs;
		int			cur_timeout;

		/* Clear any already-pending wakeups */
		ResetLatch(MyLatch);

		/*
		 * Process any requests or signals received recently.
		 */
		AbsorbSyncRequests();

		if (got_SIGHUP)
		{
			got_SIGHUP = false;
			ProcessConfigFile(PGC_SIGHUP);

			/*
			 * Checkpointer is the last process to shut down, so we ask it to
			 * hold the keys for a range of other tasks required most of which
			 * have nothing to do with checkpointing at all.
			 *
			 * For various reasons, some config values can change dynamically
			 * so the primary copy of them is held in shared memory to make
			 * sure all backends see the same value.  We make Checkpointer
			 * responsible for updating the shared memory copy if the
			 * parameter setting changes because of SIGHUP.
			 */
			UpdateSharedMemoryConfig();
		}
		if (shutdown_requested)
		{
			/*
			 * From here on, elog(ERROR) should end with exit(1), not send
			 * control back to the sigsetjmp block above
			 */
			ExitOnAnyError = true;
			/* Close down the database */
			ShutdownXLOG(0, 0);
			/* Normal exit from the checkpointer is here */
			proc_exit(0);		/* done */
		}

		/*
		 * Detect a pending checkpoint request by checking whether the flags
		 * word in shared memory is nonzero.  We shouldn't need to acquire the
		 * ckpt_lck for this.
		 */
		if (((volatile CheckpointerShmemStruct *) CheckpointerShmem)->ckpt_flags)
		{
			do_checkpoint = true;
			BgWriterStats.m_requested_checkpoints++;
		}

		/*
		 * Force a checkpoint if too much time has elapsed since the last one.
		 * Note that we count a timed checkpoint in stats only when this
		 * occurs without an external request, but we set the CAUSE_TIME flag
		 * bit even if there is also an external request.
		 */
		now = (pg_time_t) time(NULL);
		elapsed_secs = now - last_checkpoint_time;
		if (elapsed_secs >= CheckPointTimeout)
		{
			if (!do_checkpoint)
				BgWriterStats.m_timed_checkpoints++;
			do_checkpoint = true;
			flags |= CHECKPOINT_CAUSE_TIME;
		}

		/*
		 * Do a checkpoint if requested.
		 */
		if (do_checkpoint)
		{
			bool		ckpt_performed = false;
			bool		do_restartpoint;

			/*
			 * Check if we should perform a checkpoint or a restartpoint. As a
			 * side-effect, RecoveryInProgress() initializes TimeLineID if
			 * it's not set yet.
			 */
			do_restartpoint = RecoveryInProgress();

			/*
			 * Atomically fetch the request flags to figure out what kind of a
			 * checkpoint we should perform, and increase the started-counter
			 * to acknowledge that we've started a new checkpoint.
			 */
			SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
			flags |= CheckpointerShmem->ckpt_flags;
			CheckpointerShmem->ckpt_flags = 0;
			CheckpointerShmem->ckpt_started++;
			SpinLockRelease(&CheckpointerShmem->ckpt_lck);

			ConditionVariableBroadcast(&CheckpointerShmem->start_cv);

			/*
			 * The end-of-recovery checkpoint is a real checkpoint that's
			 * performed while we're still in recovery.
			 */
			if (flags & CHECKPOINT_END_OF_RECOVERY)
				do_restartpoint = false;

			/*
			 * We will warn if (a) too soon since last checkpoint (whatever
			 * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag
			 * since the last checkpoint start.  Note in particular that this
			 * implementation will not generate warnings caused by
			 * CheckPointTimeout < CheckPointWarning.
			 */
			if (!do_restartpoint &&
				(flags & CHECKPOINT_CAUSE_XLOG) &&
				elapsed_secs < CheckPointWarning)
				ereport(LOG,
						(errmsg_plural("checkpoints are occurring too frequently (%d second apart)",
									   "checkpoints are occurring too frequently (%d seconds apart)",
									   elapsed_secs,
									   elapsed_secs),
						 errhint("Consider increasing the configuration parameter \"max_wal_size\".")));

			/*
			 * Initialize checkpointer-private variables used during
			 * checkpoint.
			 */
			ckpt_active = true;
			if (do_restartpoint)
				ckpt_start_recptr = GetXLogReplayRecPtr(NULL);
			else
				ckpt_start_recptr = GetInsertRecPtr();
			ckpt_start_time = now;
			ckpt_cached_elapsed = 0;

			/*
			 * Do the checkpoint.  (Checkpoints report failure by throwing
			 * ERROR, which lands in the sigsetjmp block above; restartpoints
			 * report it via their return value.)
			 */
			if (!do_restartpoint)
			{
				CreateCheckPoint(flags);
				ckpt_performed = true;
			}
			else
				ckpt_performed = CreateRestartPoint(flags);

			/*
			 * After any checkpoint, close all smgr files.  This is so we
			 * won't hang onto smgr references to deleted files indefinitely.
			 */
			smgrcloseall();

			/*
			 * Indicate checkpoint completion to any waiting backends.
			 */
			SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
			CheckpointerShmem->ckpt_done = CheckpointerShmem->ckpt_started;
			SpinLockRelease(&CheckpointerShmem->ckpt_lck);

			ConditionVariableBroadcast(&CheckpointerShmem->done_cv);

			if (ckpt_performed)
			{
				/*
				 * Note we record the checkpoint start time not end time as
				 * last_checkpoint_time.  This is so that time-driven
				 * checkpoints happen at a predictable spacing.
				 */
				last_checkpoint_time = now;
			}
			else
			{
				/*
				 * We were not able to perform the restartpoint (checkpoints
				 * throw an ERROR in case of error).  Most likely because we
				 * have not received any new checkpoint WAL records since the
				 * last restartpoint.  Try again in 15 s.
				 */
				last_checkpoint_time = now - CheckPointTimeout + 15;
			}

			ckpt_active = false;
		}

		/* Check for archive_timeout and switch xlog files if necessary. */
		CheckArchiveTimeout();

		/*
		 * Send off activity statistics to the stats collector.  (The reason
		 * why we re-use bgwriter-related code for this is that the bgwriter
		 * and checkpointer used to be just one process.  It's probably not
		 * worth the trouble to split the stats support into two independent
		 * stats message types.)
		 */
		pgstat_send_bgwriter();

		/*
		 * Sleep until we are signaled or it's time for another checkpoint or
		 * xlog file switch.
		 */
		now = (pg_time_t) time(NULL);
		elapsed_secs = now - last_checkpoint_time;
		if (elapsed_secs >= CheckPointTimeout)
			continue;			/* no sleep for us ... */
		cur_timeout = CheckPointTimeout - elapsed_secs;
		if (XLogArchiveTimeout > 0 && !RecoveryInProgress())
		{
			elapsed_secs = now - last_xlog_switch_time;
			if (elapsed_secs >= XLogArchiveTimeout)
				continue;		/* no sleep for us ... */
			cur_timeout = Min(cur_timeout, XLogArchiveTimeout - elapsed_secs);
		}

		(void) WaitLatch(MyLatch,
						 WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
						 cur_timeout * 1000L /* convert to ms */ ,
						 WAIT_EVENT_CHECKPOINTER_MAIN);
	}
}
560 | |
561 | /* |
562 | * CheckArchiveTimeout -- check for archive_timeout and switch xlog files |
563 | * |
564 | * This will switch to a new WAL file and force an archive file write if |
565 | * meaningful activity is recorded in the current WAL file. This includes most |
566 | * writes, including just a single checkpoint record, but excludes WAL records |
567 | * that were inserted with the XLOG_MARK_UNIMPORTANT flag being set (like |
568 | * snapshots of running transactions). Such records, depending on |
569 | * configuration, occur on regular intervals and don't contain important |
570 | * information. This avoids generating archives with a few unimportant |
571 | * records. |
572 | */ |
573 | static void |
574 | CheckArchiveTimeout(void) |
575 | { |
576 | pg_time_t now; |
577 | pg_time_t last_time; |
578 | XLogRecPtr last_switch_lsn; |
579 | |
580 | if (XLogArchiveTimeout <= 0 || RecoveryInProgress()) |
581 | return; |
582 | |
583 | now = (pg_time_t) time(NULL); |
584 | |
585 | /* First we do a quick check using possibly-stale local state. */ |
586 | if ((int) (now - last_xlog_switch_time) < XLogArchiveTimeout) |
587 | return; |
588 | |
589 | /* |
590 | * Update local state ... note that last_xlog_switch_time is the last time |
591 | * a switch was performed *or requested*. |
592 | */ |
593 | last_time = GetLastSegSwitchData(&last_switch_lsn); |
594 | |
595 | last_xlog_switch_time = Max(last_xlog_switch_time, last_time); |
596 | |
597 | /* Now we can do the real checks */ |
598 | if ((int) (now - last_xlog_switch_time) >= XLogArchiveTimeout) |
599 | { |
600 | /* |
601 | * Switch segment only when "important" WAL has been logged since the |
602 | * last segment switch (last_switch_lsn points to end of segment |
603 | * switch occurred in). |
604 | */ |
605 | if (GetLastImportantRecPtr() > last_switch_lsn) |
606 | { |
607 | XLogRecPtr switchpoint; |
608 | |
609 | /* mark switch as unimportant, avoids triggering checkpoints */ |
610 | switchpoint = RequestXLogSwitch(true); |
611 | |
612 | /* |
613 | * If the returned pointer points exactly to a segment boundary, |
614 | * assume nothing happened. |
615 | */ |
616 | if (XLogSegmentOffset(switchpoint, wal_segment_size) != 0) |
617 | elog(DEBUG1, "write-ahead log switch forced (archive_timeout=%d)" , |
618 | XLogArchiveTimeout); |
619 | } |
620 | |
621 | /* |
622 | * Update state in any case, so we don't retry constantly when the |
623 | * system is idle. |
624 | */ |
625 | last_xlog_switch_time = now; |
626 | } |
627 | } |
628 | |
629 | /* |
630 | * Returns true if an immediate checkpoint request is pending. (Note that |
631 | * this does not check the *current* checkpoint's IMMEDIATE flag, but whether |
632 | * there is one pending behind it.) |
633 | */ |
634 | static bool |
635 | ImmediateCheckpointRequested(void) |
636 | { |
637 | volatile CheckpointerShmemStruct *cps = CheckpointerShmem; |
638 | |
639 | /* |
640 | * We don't need to acquire the ckpt_lck in this case because we're only |
641 | * looking at a single flag bit. |
642 | */ |
643 | if (cps->ckpt_flags & CHECKPOINT_IMMEDIATE) |
644 | return true; |
645 | return false; |
646 | } |
647 | |
648 | /* |
649 | * CheckpointWriteDelay -- control rate of checkpoint |
650 | * |
651 | * This function is called after each page write performed by BufferSync(). |
652 | * It is responsible for throttling BufferSync()'s write rate to hit |
653 | * checkpoint_completion_target. |
654 | * |
655 | * The checkpoint request flags should be passed in; currently the only one |
656 | * examined is CHECKPOINT_IMMEDIATE, which disables delays between writes. |
657 | * |
658 | * 'progress' is an estimate of how much of the work has been done, as a |
659 | * fraction between 0.0 meaning none, and 1.0 meaning all done. |
660 | */ |
void
CheckpointWriteDelay(int flags, double progress)
{
	/* Persists across calls: writes remaining before next forced absorb. */
	static int	absorb_counter = WRITES_PER_ABSORB;

	/* Do nothing if checkpoint is being executed by non-checkpointer process */
	if (!AmCheckpointerProcess())
		return;

	/*
	 * Perform the usual duties and take a nap, unless we're behind schedule,
	 * in which case we just try to catch up as quickly as possible.
	 */
	if (!(flags & CHECKPOINT_IMMEDIATE) &&
		!shutdown_requested &&
		!ImmediateCheckpointRequested() &&
		IsCheckpointOnSchedule(progress))
	{
		/* Handle any pending SIGHUP before napping. */
		if (got_SIGHUP)
		{
			got_SIGHUP = false;
			ProcessConfigFile(PGC_SIGHUP);
			/* update shmem copies of config variables */
			UpdateSharedMemoryConfig();
		}

		AbsorbSyncRequests();
		absorb_counter = WRITES_PER_ABSORB;

		CheckArchiveTimeout();

		/*
		 * Report interim activity statistics to the stats collector.
		 */
		pgstat_send_bgwriter();

		/*
		 * This sleep used to be connected to bgwriter_delay, typically 200ms.
		 * That resulted in more frequent wakeups if not much work to do.
		 * Checkpointer and bgwriter are no longer related so take the Big
		 * Sleep.
		 */
		pg_usleep(100000L);
	}
	else if (--absorb_counter <= 0)
	{
		/*
		 * Absorb pending fsync requests after each WRITES_PER_ABSORB write
		 * operations even when we don't sleep, to prevent overflow of the
		 * fsync request queue.
		 */
		AbsorbSyncRequests();
		absorb_counter = WRITES_PER_ABSORB;
	}
}
716 | |
717 | /* |
718 | * IsCheckpointOnSchedule -- are we on schedule to finish this checkpoint |
719 | * (or restartpoint) in time? |
720 | * |
721 | * Compares the current progress against the time/segments elapsed since last |
722 | * checkpoint, and returns true if the progress we've made this far is greater |
723 | * than the elapsed time/segments. |
724 | */ |
static bool
IsCheckpointOnSchedule(double progress)
{
	XLogRecPtr	recptr;
	struct timeval now;
	double		elapsed_xlogs,
				elapsed_time;

	/* Only meaningful while a checkpoint/restartpoint is in progress. */
	Assert(ckpt_active);

	/* Scale progress according to checkpoint_completion_target. */
	progress *= CheckPointCompletionTarget;

	/*
	 * Check against the cached value first. Only do the more expensive
	 * calculations once we reach the target previously calculated. Since
	 * neither time or WAL insert pointer moves backwards, a freshly
	 * calculated value can only be greater than or equal to the cached value.
	 */
	if (progress < ckpt_cached_elapsed)
		return false;

	/*
	 * Check progress against WAL segments written and CheckPointSegments.
	 *
	 * We compare the current WAL insert location against the location
	 * computed before calling CreateCheckPoint. The code in XLogInsert that
	 * actually triggers a checkpoint when CheckPointSegments is exceeded
	 * compares against RedoRecptr, so this is not completely accurate.
	 * However, it's good enough for our purposes, we're only calculating an
	 * estimate anyway.
	 *
	 * During recovery, we compare last replayed WAL record's location with
	 * the location computed before calling CreateRestartPoint. That maintains
	 * the same pacing as we have during checkpoints in normal operation, but
	 * we might exceed max_wal_size by a fair amount. That's because there can
	 * be a large gap between a checkpoint's redo-pointer and the checkpoint
	 * record itself, and we only start the restartpoint after we've seen the
	 * checkpoint record. (The gap is typically up to CheckPointSegments *
	 * checkpoint_completion_target where checkpoint_completion_target is the
	 * value that was in effect when the WAL was generated).
	 */
	if (RecoveryInProgress())
		recptr = GetXLogReplayRecPtr(NULL);
	else
		recptr = GetInsertRecPtr();
	/* Fraction of the WAL budget consumed since the checkpoint started. */
	elapsed_xlogs = (((double) (recptr - ckpt_start_recptr)) /
					 wal_segment_size) / CheckPointSegments;

	if (progress < elapsed_xlogs)
	{
		/* Behind on WAL: remember how far, so cheap re-checks suffice. */
		ckpt_cached_elapsed = elapsed_xlogs;
		return false;
	}

	/*
	 * Check progress against time elapsed and checkpoint_timeout.
	 */
	gettimeofday(&now, NULL);
	elapsed_time = ((double) ((pg_time_t) now.tv_sec - ckpt_start_time) +
					now.tv_usec / 1000000.0) / CheckPointTimeout;

	if (progress < elapsed_time)
	{
		/* Behind on time: cache the shortfall for the fast path above. */
		ckpt_cached_elapsed = elapsed_time;
		return false;
	}

	/* It looks like we're on schedule. */
	return true;
}
796 | |
797 | |
798 | /* -------------------------------- |
799 | * signal handler routines |
800 | * -------------------------------- |
801 | */ |
802 | |
803 | /* |
804 | * chkpt_quickdie() occurs when signalled SIGQUIT by the postmaster. |
805 | * |
806 | * Some backend has bought the farm, |
807 | * so we need to stop what we're doing and exit. |
808 | */ |
/*
 * chkpt_quickdie() occurs when signalled SIGQUIT by the postmaster.
 *
 * Another backend has crashed, so the postmaster wants everybody out NOW.
 */
static void
chkpt_quickdie(SIGNAL_ARGS)
{
	/*
	 * Running proc_exit() or atexit() callbacks here would be wrong twice
	 * over: shared memory may be corrupted, so we must not touch it during
	 * cleanup, and the callbacks aren't async-signal-safe anyway.  Just bail
	 * out immediately.
	 *
	 * The exit status is _exit(2), not _exit(0), so that a manual SIGQUIT
	 * aimed at a random backend still drives the postmaster into a full
	 * system reset cycle — necessary precisely because our shared-memory
	 * state is left as-is.  (The "dead man switch" in pmsignal.c should make
	 * the postmaster treat this as a crash regardless, but belt and braces.)
	 */
	_exit(2);
}
828 | |
829 | /* SIGHUP: set flag to re-read config file at next convenient time */ |
830 | static void |
831 | ChkptSigHupHandler(SIGNAL_ARGS) |
832 | { |
833 | int save_errno = errno; |
834 | |
835 | got_SIGHUP = true; |
836 | SetLatch(MyLatch); |
837 | |
838 | errno = save_errno; |
839 | } |
840 | |
841 | /* SIGINT: set flag to run a normal checkpoint right away */ |
842 | static void |
843 | ReqCheckpointHandler(SIGNAL_ARGS) |
844 | { |
845 | int save_errno = errno; |
846 | |
847 | /* |
848 | * The signalling process should have set ckpt_flags nonzero, so all we |
849 | * need do is ensure that our main loop gets kicked out of any wait. |
850 | */ |
851 | SetLatch(MyLatch); |
852 | |
853 | errno = save_errno; |
854 | } |
855 | |
856 | /* SIGUSR1: used for latch wakeups */ |
/* SIGUSR1: used for latch wakeups */
static void
chkpt_sigusr1_handler(SIGNAL_ARGS)
{
	int			saved_errno = errno;

	/* Delegate to the generic latch wakeup machinery. */
	latch_sigusr1_handler();

	errno = saved_errno;
}
866 | |
/* SIGUSR2: set flag to run a shutdown checkpoint and exit */
static void
ReqShutdownHandler(SIGNAL_ARGS)
{
	/* Save errno so the interrupted code doesn't see it clobbered. */
	int			save_errno = errno;

	shutdown_requested = true;
	SetLatch(MyLatch);			/* wake the main loop so it notices the flag */

	errno = save_errno;
}
878 | |
879 | |
880 | /* -------------------------------- |
881 | * communication with backends |
882 | * -------------------------------- |
883 | */ |
884 | |
885 | /* |
886 | * CheckpointerShmemSize |
887 | * Compute space needed for checkpointer-related shared memory |
888 | */ |
889 | Size |
890 | CheckpointerShmemSize(void) |
891 | { |
892 | Size size; |
893 | |
894 | /* |
895 | * Currently, the size of the requests[] array is arbitrarily set equal to |
896 | * NBuffers. This may prove too large or small ... |
897 | */ |
898 | size = offsetof(CheckpointerShmemStruct, requests); |
899 | size = add_size(size, mul_size(NBuffers, sizeof(CheckpointerRequest))); |
900 | |
901 | return size; |
902 | } |
903 | |
904 | /* |
905 | * CheckpointerShmemInit |
906 | * Allocate and initialize checkpointer-related shared memory |
907 | */ |
908 | void |
909 | CheckpointerShmemInit(void) |
910 | { |
911 | Size size = CheckpointerShmemSize(); |
912 | bool found; |
913 | |
914 | CheckpointerShmem = (CheckpointerShmemStruct *) |
915 | ShmemInitStruct("Checkpointer Data" , |
916 | size, |
917 | &found); |
918 | |
919 | if (!found) |
920 | { |
921 | /* |
922 | * First time through, so initialize. Note that we zero the whole |
923 | * requests array; this is so that CompactCheckpointerRequestQueue can |
924 | * assume that any pad bytes in the request structs are zeroes. |
925 | */ |
926 | MemSet(CheckpointerShmem, 0, size); |
927 | SpinLockInit(&CheckpointerShmem->ckpt_lck); |
928 | CheckpointerShmem->max_requests = NBuffers; |
929 | ConditionVariableInit(&CheckpointerShmem->start_cv); |
930 | ConditionVariableInit(&CheckpointerShmem->done_cv); |
931 | } |
932 | } |
933 | |
934 | /* |
935 | * RequestCheckpoint |
936 | * Called in backend processes to request a checkpoint |
937 | * |
938 | * flags is a bitwise OR of the following: |
939 | * CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown. |
940 | * CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery. |
941 | * CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP, |
942 | * ignoring checkpoint_completion_target parameter. |
943 | * CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred |
944 | * since the last one (implied by CHECKPOINT_IS_SHUTDOWN or |
945 | * CHECKPOINT_END_OF_RECOVERY). |
946 | * CHECKPOINT_WAIT: wait for completion before returning (otherwise, |
947 | * just signal checkpointer to do it, and return). |
948 | * CHECKPOINT_CAUSE_XLOG: checkpoint is requested due to xlog filling. |
949 | * (This affects logging, and in particular enables CheckPointWarning.) |
950 | */ |
951 | void |
952 | RequestCheckpoint(int flags) |
953 | { |
954 | int ntries; |
955 | int old_failed, |
956 | old_started; |
957 | |
958 | /* |
959 | * If in a standalone backend, just do it ourselves. |
960 | */ |
961 | if (!IsPostmasterEnvironment) |
962 | { |
963 | /* |
964 | * There's no point in doing slow checkpoints in a standalone backend, |
965 | * because there's no other backends the checkpoint could disrupt. |
966 | */ |
967 | CreateCheckPoint(flags | CHECKPOINT_IMMEDIATE); |
968 | |
969 | /* |
970 | * After any checkpoint, close all smgr files. This is so we won't |
971 | * hang onto smgr references to deleted files indefinitely. |
972 | */ |
973 | smgrcloseall(); |
974 | |
975 | return; |
976 | } |
977 | |
978 | /* |
979 | * Atomically set the request flags, and take a snapshot of the counters. |
980 | * When we see ckpt_started > old_started, we know the flags we set here |
981 | * have been seen by checkpointer. |
982 | * |
983 | * Note that we OR the flags with any existing flags, to avoid overriding |
984 | * a "stronger" request by another backend. The flag senses must be |
985 | * chosen to make this work! |
986 | */ |
987 | SpinLockAcquire(&CheckpointerShmem->ckpt_lck); |
988 | |
989 | old_failed = CheckpointerShmem->ckpt_failed; |
990 | old_started = CheckpointerShmem->ckpt_started; |
991 | CheckpointerShmem->ckpt_flags |= (flags | CHECKPOINT_REQUESTED); |
992 | |
993 | SpinLockRelease(&CheckpointerShmem->ckpt_lck); |
994 | |
995 | /* |
996 | * Send signal to request checkpoint. It's possible that the checkpointer |
997 | * hasn't started yet, or is in process of restarting, so we will retry a |
998 | * few times if needed. (Actually, more than a few times, since on slow |
999 | * or overloaded buildfarm machines, it's been observed that the |
1000 | * checkpointer can take several seconds to start.) However, if not told |
1001 | * to wait for the checkpoint to occur, we consider failure to send the |
1002 | * signal to be nonfatal and merely LOG it. The checkpointer should see |
1003 | * the request when it does start, with or without getting a signal. |
1004 | */ |
1005 | #define MAX_SIGNAL_TRIES 600 /* max wait 60.0 sec */ |
1006 | for (ntries = 0;; ntries++) |
1007 | { |
1008 | if (CheckpointerShmem->checkpointer_pid == 0) |
1009 | { |
1010 | if (ntries >= MAX_SIGNAL_TRIES || !(flags & CHECKPOINT_WAIT)) |
1011 | { |
1012 | elog((flags & CHECKPOINT_WAIT) ? ERROR : LOG, |
1013 | "could not signal for checkpoint: checkpointer is not running" ); |
1014 | break; |
1015 | } |
1016 | } |
1017 | else if (kill(CheckpointerShmem->checkpointer_pid, SIGINT) != 0) |
1018 | { |
1019 | if (ntries >= MAX_SIGNAL_TRIES || !(flags & CHECKPOINT_WAIT)) |
1020 | { |
1021 | elog((flags & CHECKPOINT_WAIT) ? ERROR : LOG, |
1022 | "could not signal for checkpoint: %m" ); |
1023 | break; |
1024 | } |
1025 | } |
1026 | else |
1027 | break; /* signal sent successfully */ |
1028 | |
1029 | CHECK_FOR_INTERRUPTS(); |
1030 | pg_usleep(100000L); /* wait 0.1 sec, then retry */ |
1031 | } |
1032 | |
1033 | /* |
1034 | * If requested, wait for completion. We detect completion according to |
1035 | * the algorithm given above. |
1036 | */ |
1037 | if (flags & CHECKPOINT_WAIT) |
1038 | { |
1039 | int new_started, |
1040 | new_failed; |
1041 | |
1042 | /* Wait for a new checkpoint to start. */ |
1043 | ConditionVariablePrepareToSleep(&CheckpointerShmem->start_cv); |
1044 | for (;;) |
1045 | { |
1046 | SpinLockAcquire(&CheckpointerShmem->ckpt_lck); |
1047 | new_started = CheckpointerShmem->ckpt_started; |
1048 | SpinLockRelease(&CheckpointerShmem->ckpt_lck); |
1049 | |
1050 | if (new_started != old_started) |
1051 | break; |
1052 | |
1053 | ConditionVariableSleep(&CheckpointerShmem->start_cv, |
1054 | WAIT_EVENT_CHECKPOINT_START); |
1055 | } |
1056 | ConditionVariableCancelSleep(); |
1057 | |
1058 | /* |
1059 | * We are waiting for ckpt_done >= new_started, in a modulo sense. |
1060 | */ |
1061 | ConditionVariablePrepareToSleep(&CheckpointerShmem->done_cv); |
1062 | for (;;) |
1063 | { |
1064 | int new_done; |
1065 | |
1066 | SpinLockAcquire(&CheckpointerShmem->ckpt_lck); |
1067 | new_done = CheckpointerShmem->ckpt_done; |
1068 | new_failed = CheckpointerShmem->ckpt_failed; |
1069 | SpinLockRelease(&CheckpointerShmem->ckpt_lck); |
1070 | |
1071 | if (new_done - new_started >= 0) |
1072 | break; |
1073 | |
1074 | ConditionVariableSleep(&CheckpointerShmem->done_cv, |
1075 | WAIT_EVENT_CHECKPOINT_DONE); |
1076 | } |
1077 | ConditionVariableCancelSleep(); |
1078 | |
1079 | if (new_failed != old_failed) |
1080 | ereport(ERROR, |
1081 | (errmsg("checkpoint request failed" ), |
1082 | errhint("Consult recent messages in the server log for details." ))); |
1083 | } |
1084 | } |
1085 | |
1086 | /* |
1087 | * ForwardSyncRequest |
1088 | * Forward a file-fsync request from a backend to the checkpointer |
1089 | * |
1090 | * Whenever a backend is compelled to write directly to a relation |
1091 | * (which should be seldom, if the background writer is getting its job done), |
1092 | * the backend calls this routine to pass over knowledge that the relation |
1093 | * is dirty and must be fsync'd before next checkpoint. We also use this |
1094 | * opportunity to count such writes for statistical purposes. |
1095 | * |
1096 | * To avoid holding the lock for longer than necessary, we normally write |
1097 | * to the requests[] queue without checking for duplicates. The checkpointer |
1098 | * will have to eliminate dups internally anyway. However, if we discover |
1099 | * that the queue is full, we make a pass over the entire queue to compact |
1100 | * it. This is somewhat expensive, but the alternative is for the backend |
1101 | * to perform its own fsync, which is far more expensive in practice. It |
1102 | * is theoretically possible a backend fsync might still be necessary, if |
1103 | * the queue is full and contains no duplicate entries. In that case, we |
1104 | * let the backend know by returning false. |
1105 | */ |
1106 | bool |
1107 | ForwardSyncRequest(const FileTag *ftag, SyncRequestType type) |
1108 | { |
1109 | CheckpointerRequest *request; |
1110 | bool too_full; |
1111 | |
1112 | if (!IsUnderPostmaster) |
1113 | return false; /* probably shouldn't even get here */ |
1114 | |
1115 | if (AmCheckpointerProcess()) |
1116 | elog(ERROR, "ForwardSyncRequest must not be called in checkpointer" ); |
1117 | |
1118 | LWLockAcquire(CheckpointerCommLock, LW_EXCLUSIVE); |
1119 | |
1120 | /* Count all backend writes regardless of if they fit in the queue */ |
1121 | if (!AmBackgroundWriterProcess()) |
1122 | CheckpointerShmem->num_backend_writes++; |
1123 | |
1124 | /* |
1125 | * If the checkpointer isn't running or the request queue is full, the |
1126 | * backend will have to perform its own fsync request. But before forcing |
1127 | * that to happen, we can try to compact the request queue. |
1128 | */ |
1129 | if (CheckpointerShmem->checkpointer_pid == 0 || |
1130 | (CheckpointerShmem->num_requests >= CheckpointerShmem->max_requests && |
1131 | !CompactCheckpointerRequestQueue())) |
1132 | { |
1133 | /* |
1134 | * Count the subset of writes where backends have to do their own |
1135 | * fsync |
1136 | */ |
1137 | if (!AmBackgroundWriterProcess()) |
1138 | CheckpointerShmem->num_backend_fsync++; |
1139 | LWLockRelease(CheckpointerCommLock); |
1140 | return false; |
1141 | } |
1142 | |
1143 | /* OK, insert request */ |
1144 | request = &CheckpointerShmem->requests[CheckpointerShmem->num_requests++]; |
1145 | request->ftag = *ftag; |
1146 | request->type = type; |
1147 | |
1148 | /* If queue is more than half full, nudge the checkpointer to empty it */ |
1149 | too_full = (CheckpointerShmem->num_requests >= |
1150 | CheckpointerShmem->max_requests / 2); |
1151 | |
1152 | LWLockRelease(CheckpointerCommLock); |
1153 | |
1154 | /* ... but not till after we release the lock */ |
1155 | if (too_full && ProcGlobal->checkpointerLatch) |
1156 | SetLatch(ProcGlobal->checkpointerLatch); |
1157 | |
1158 | return true; |
1159 | } |
1160 | |
1161 | /* |
1162 | * CompactCheckpointerRequestQueue |
1163 | * Remove duplicates from the request queue to avoid backend fsyncs. |
1164 | * Returns "true" if any entries were removed. |
1165 | * |
1166 | * Although a full fsync request queue is not common, it can lead to severe |
1167 | * performance problems when it does happen. So far, this situation has |
1168 | * only been observed to occur when the system is under heavy write load, |
1169 | * and especially during the "sync" phase of a checkpoint. Without this |
1170 | * logic, each backend begins doing an fsync for every block written, which |
1171 | * gets very expensive and can slow down the whole system. |
1172 | * |
1173 | * Trying to do this every time the queue is full could lose if there |
1174 | * aren't any removable entries. But that should be vanishingly rare in |
1175 | * practice: there's one queue entry per shared buffer. |
1176 | */ |
1177 | static bool |
1178 | CompactCheckpointerRequestQueue(void) |
1179 | { |
1180 | struct CheckpointerSlotMapping |
1181 | { |
1182 | CheckpointerRequest request; |
1183 | int slot; |
1184 | }; |
1185 | |
1186 | int n, |
1187 | preserve_count; |
1188 | int num_skipped = 0; |
1189 | HASHCTL ctl; |
1190 | HTAB *htab; |
1191 | bool *skip_slot; |
1192 | |
1193 | /* must hold CheckpointerCommLock in exclusive mode */ |
1194 | Assert(LWLockHeldByMe(CheckpointerCommLock)); |
1195 | |
1196 | /* Initialize skip_slot array */ |
1197 | skip_slot = palloc0(sizeof(bool) * CheckpointerShmem->num_requests); |
1198 | |
1199 | /* Initialize temporary hash table */ |
1200 | MemSet(&ctl, 0, sizeof(ctl)); |
1201 | ctl.keysize = sizeof(CheckpointerRequest); |
1202 | ctl.entrysize = sizeof(struct CheckpointerSlotMapping); |
1203 | ctl.hcxt = CurrentMemoryContext; |
1204 | |
1205 | htab = hash_create("CompactCheckpointerRequestQueue" , |
1206 | CheckpointerShmem->num_requests, |
1207 | &ctl, |
1208 | HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); |
1209 | |
1210 | /* |
1211 | * The basic idea here is that a request can be skipped if it's followed |
1212 | * by a later, identical request. It might seem more sensible to work |
1213 | * backwards from the end of the queue and check whether a request is |
1214 | * *preceded* by an earlier, identical request, in the hopes of doing less |
1215 | * copying. But that might change the semantics, if there's an |
1216 | * intervening SYNC_FORGET_REQUEST or SYNC_FILTER_REQUEST, so we do it |
1217 | * this way. It would be possible to be even smarter if we made the code |
1218 | * below understand the specific semantics of such requests (it could blow |
1219 | * away preceding entries that would end up being canceled anyhow), but |
1220 | * it's not clear that the extra complexity would buy us anything. |
1221 | */ |
1222 | for (n = 0; n < CheckpointerShmem->num_requests; n++) |
1223 | { |
1224 | CheckpointerRequest *request; |
1225 | struct CheckpointerSlotMapping *slotmap; |
1226 | bool found; |
1227 | |
1228 | /* |
1229 | * We use the request struct directly as a hashtable key. This |
1230 | * assumes that any padding bytes in the structs are consistently the |
1231 | * same, which should be okay because we zeroed them in |
1232 | * CheckpointerShmemInit. Note also that RelFileNode had better |
1233 | * contain no pad bytes. |
1234 | */ |
1235 | request = &CheckpointerShmem->requests[n]; |
1236 | slotmap = hash_search(htab, request, HASH_ENTER, &found); |
1237 | if (found) |
1238 | { |
1239 | /* Duplicate, so mark the previous occurrence as skippable */ |
1240 | skip_slot[slotmap->slot] = true; |
1241 | num_skipped++; |
1242 | } |
1243 | /* Remember slot containing latest occurrence of this request value */ |
1244 | slotmap->slot = n; |
1245 | } |
1246 | |
1247 | /* Done with the hash table. */ |
1248 | hash_destroy(htab); |
1249 | |
1250 | /* If no duplicates, we're out of luck. */ |
1251 | if (!num_skipped) |
1252 | { |
1253 | pfree(skip_slot); |
1254 | return false; |
1255 | } |
1256 | |
1257 | /* We found some duplicates; remove them. */ |
1258 | preserve_count = 0; |
1259 | for (n = 0; n < CheckpointerShmem->num_requests; n++) |
1260 | { |
1261 | if (skip_slot[n]) |
1262 | continue; |
1263 | CheckpointerShmem->requests[preserve_count++] = CheckpointerShmem->requests[n]; |
1264 | } |
1265 | ereport(DEBUG1, |
1266 | (errmsg("compacted fsync request queue from %d entries to %d entries" , |
1267 | CheckpointerShmem->num_requests, preserve_count))); |
1268 | CheckpointerShmem->num_requests = preserve_count; |
1269 | |
1270 | /* Cleanup. */ |
1271 | pfree(skip_slot); |
1272 | return true; |
1273 | } |
1274 | |
1275 | /* |
1276 | * AbsorbSyncRequests |
1277 | * Retrieve queued sync requests and pass them to sync mechanism. |
1278 | * |
1279 | * This is exported because it must be called during CreateCheckPoint; |
1280 | * we have to be sure we have accepted all pending requests just before |
1281 | * we start fsync'ing. Since CreateCheckPoint sometimes runs in |
1282 | * non-checkpointer processes, do nothing if not checkpointer. |
1283 | */ |
1284 | void |
1285 | AbsorbSyncRequests(void) |
1286 | { |
1287 | CheckpointerRequest *requests = NULL; |
1288 | CheckpointerRequest *request; |
1289 | int n; |
1290 | |
1291 | if (!AmCheckpointerProcess()) |
1292 | return; |
1293 | |
1294 | LWLockAcquire(CheckpointerCommLock, LW_EXCLUSIVE); |
1295 | |
1296 | /* Transfer stats counts into pending pgstats message */ |
1297 | BgWriterStats.m_buf_written_backend += CheckpointerShmem->num_backend_writes; |
1298 | BgWriterStats.m_buf_fsync_backend += CheckpointerShmem->num_backend_fsync; |
1299 | |
1300 | CheckpointerShmem->num_backend_writes = 0; |
1301 | CheckpointerShmem->num_backend_fsync = 0; |
1302 | |
1303 | /* |
1304 | * We try to avoid holding the lock for a long time by copying the request |
1305 | * array, and processing the requests after releasing the lock. |
1306 | * |
1307 | * Once we have cleared the requests from shared memory, we have to PANIC |
1308 | * if we then fail to absorb them (eg, because our hashtable runs out of |
1309 | * memory). This is because the system cannot run safely if we are unable |
1310 | * to fsync what we have been told to fsync. Fortunately, the hashtable |
1311 | * is so small that the problem is quite unlikely to arise in practice. |
1312 | */ |
1313 | n = CheckpointerShmem->num_requests; |
1314 | if (n > 0) |
1315 | { |
1316 | requests = (CheckpointerRequest *) palloc(n * sizeof(CheckpointerRequest)); |
1317 | memcpy(requests, CheckpointerShmem->requests, n * sizeof(CheckpointerRequest)); |
1318 | } |
1319 | |
1320 | START_CRIT_SECTION(); |
1321 | |
1322 | CheckpointerShmem->num_requests = 0; |
1323 | |
1324 | LWLockRelease(CheckpointerCommLock); |
1325 | |
1326 | for (request = requests; n > 0; request++, n--) |
1327 | RememberSyncRequest(&request->ftag, request->type); |
1328 | |
1329 | END_CRIT_SECTION(); |
1330 | |
1331 | if (requests) |
1332 | pfree(requests); |
1333 | } |
1334 | |
1335 | /* |
1336 | * Update any shared memory configurations based on config parameters |
1337 | */ |
1338 | static void |
1339 | UpdateSharedMemoryConfig(void) |
1340 | { |
1341 | /* update global shmem state for sync rep */ |
1342 | SyncRepUpdateSyncStandbysDefined(); |
1343 | |
1344 | /* |
1345 | * If full_page_writes has been changed by SIGHUP, we update it in shared |
1346 | * memory and write an XLOG_FPW_CHANGE record. |
1347 | */ |
1348 | UpdateFullPageWrites(); |
1349 | |
1350 | elog(DEBUG2, "checkpointer updated shared memory configuration values" ); |
1351 | } |
1352 | |
1353 | /* |
1354 | * FirstCallSinceLastCheckpoint allows a process to take an action once |
1355 | * per checkpoint cycle by asynchronously checking for checkpoint completion. |
1356 | */ |
1357 | bool |
1358 | FirstCallSinceLastCheckpoint(void) |
1359 | { |
1360 | static int ckpt_done = 0; |
1361 | int new_done; |
1362 | bool FirstCall = false; |
1363 | |
1364 | SpinLockAcquire(&CheckpointerShmem->ckpt_lck); |
1365 | new_done = CheckpointerShmem->ckpt_done; |
1366 | SpinLockRelease(&CheckpointerShmem->ckpt_lck); |
1367 | |
1368 | if (new_done != ckpt_done) |
1369 | FirstCall = true; |
1370 | |
1371 | ckpt_done = new_done; |
1372 | |
1373 | return FirstCall; |
1374 | } |
1375 | |