| 1 | /*------------------------------------------------------------------------- |
| 2 | * |
| 3 | * checkpointer.c |
| 4 | * |
| 5 | * The checkpointer is new as of Postgres 9.2. It handles all checkpoints. |
| 6 | * Checkpoints are automatically dispatched after a certain amount of time has |
| 7 | * elapsed since the last one, and it can be signaled to perform requested |
| 8 | * checkpoints as well. (The GUC parameter that mandates a checkpoint every |
| 9 | * so many WAL segments is implemented by having backends signal when they |
| 10 | * fill WAL segments; the checkpointer itself doesn't watch for the |
| 11 | * condition.) |
| 12 | * |
| 13 | * The checkpointer is started by the postmaster as soon as the startup |
| 14 | * subprocess finishes, or as soon as recovery begins if we are doing archive |
| 15 | * recovery. It remains alive until the postmaster commands it to terminate. |
| 16 | * Normal termination is by SIGUSR2, which instructs the checkpointer to |
| 17 | * execute a shutdown checkpoint and then exit(0). (All backends must be |
| 18 | * stopped before SIGUSR2 is issued!) Emergency termination is by SIGQUIT; |
| 19 | * like any backend, the checkpointer will simply abort and exit on SIGQUIT. |
| 20 | * |
| 21 | * If the checkpointer exits unexpectedly, the postmaster treats that the same |
| 22 | * as a backend crash: shared memory may be corrupted, so remaining backends |
| 23 | * should be killed by SIGQUIT and then a recovery cycle started. (Even if |
| 24 | * shared memory isn't corrupted, we have lost information about which |
| 25 | * files need to be fsync'd for the next checkpoint, and so a system |
| 26 | * restart needs to be forced.) |
| 27 | * |
| 28 | * |
| 29 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
| 30 | * |
| 31 | * |
| 32 | * IDENTIFICATION |
| 33 | * src/backend/postmaster/checkpointer.c |
| 34 | * |
| 35 | *------------------------------------------------------------------------- |
| 36 | */ |
| 37 | #include "postgres.h" |
| 38 | |
| 39 | #include <signal.h> |
| 40 | #include <sys/time.h> |
| 41 | #include <time.h> |
| 42 | #include <unistd.h> |
| 43 | |
| 44 | #include "access/xlog.h" |
| 45 | #include "access/xlog_internal.h" |
| 46 | #include "libpq/pqsignal.h" |
| 47 | #include "miscadmin.h" |
| 48 | #include "pgstat.h" |
| 49 | #include "postmaster/bgwriter.h" |
| 50 | #include "replication/syncrep.h" |
| 51 | #include "storage/bufmgr.h" |
| 52 | #include "storage/condition_variable.h" |
| 53 | #include "storage/fd.h" |
| 54 | #include "storage/ipc.h" |
| 55 | #include "storage/lwlock.h" |
| 56 | #include "storage/proc.h" |
| 57 | #include "storage/shmem.h" |
| 58 | #include "storage/smgr.h" |
| 59 | #include "storage/spin.h" |
| 60 | #include "utils/guc.h" |
| 61 | #include "utils/memutils.h" |
| 62 | #include "utils/resowner.h" |
| 63 | |
| 64 | |
| 65 | /*---------- |
| 66 | * Shared memory area for communication between checkpointer and backends |
| 67 | * |
| 68 | * The ckpt counters allow backends to watch for completion of a checkpoint |
| 69 | * request they send. Here's how it works: |
| 70 | * * At start of a checkpoint, checkpointer reads (and clears) the request |
| 71 | * flags and increments ckpt_started, while holding ckpt_lck. |
| 72 | * * On completion of a checkpoint, checkpointer sets ckpt_done to |
| 73 | * equal ckpt_started. |
| 74 | * * On failure of a checkpoint, checkpointer increments ckpt_failed |
| 75 | * and sets ckpt_done to equal ckpt_started. |
| 76 | * |
| 77 | * The algorithm for backends is: |
| 78 | * 1. Record current values of ckpt_failed and ckpt_started, and |
| 79 | * set request flags, while holding ckpt_lck. |
| 80 | * 2. Send signal to request checkpoint. |
| 81 | * 3. Sleep until ckpt_started changes. Now you know a checkpoint has |
| 82 | * begun since you started this algorithm (although *not* that it was |
| 83 | * specifically initiated by your signal), and that it is using your flags. |
| 84 | * 4. Record new value of ckpt_started. |
| 85 | * 5. Sleep until ckpt_done >= saved value of ckpt_started. (Use modulo |
| 86 | * arithmetic here in case counters wrap around.) Now you know a |
| 87 | * checkpoint has started and completed, but not whether it was |
| 88 | * successful. |
| 89 | * 6. If ckpt_failed is different from the originally saved value, |
| 90 | * assume request failed; otherwise it was definitely successful. |
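| | * (In the code below, step 5's modulo test appears as the signed
| | * comparison "new_done - new_started >= 0", which stays correct
| | * across counter wraparound.)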
| 91 | * |
| 92 | * ckpt_flags holds the OR of the checkpoint request flags sent by all |
| 93 | * requesting backends since the last checkpoint start. The flags are |
| 94 | * chosen so that OR'ing is the correct way to combine multiple requests. |
| 95 | * |
| 96 | * num_backend_writes is used to count the number of buffer writes performed |
| 97 | * by user backend processes. This counter should be wide enough that it |
| 98 | * can't overflow during a single processing cycle. num_backend_fsync |
| 99 | * counts the subset of those writes that also had to do their own fsync, |
| 100 | * because the checkpointer failed to absorb their request. |
| 101 | * |
| 102 | * The requests array holds fsync requests sent by backends and not yet |
| 103 | * absorbed by the checkpointer. |
| 104 | * |
| 105 | * Unlike the checkpoint fields, num_backend_writes, num_backend_fsync, and |
| 106 | * the requests fields are protected by CheckpointerCommLock. |
| 107 | *---------- |
| 108 | */ |
| 109 | typedef struct |
| 110 | { |
| 111 | SyncRequestType type; /* request type */ |
| 112 | FileTag ftag; /* file identifier */ |
| 113 | } CheckpointerRequest; |
| 114 | |
| 115 | typedef struct |
| 116 | { |
| 117 | pid_t checkpointer_pid; /* PID (0 if not started) */ |
| 118 | |
| 119 | slock_t ckpt_lck; /* protects all the ckpt_* fields */ |
| 120 | |
| 121 | int ckpt_started; /* advances when checkpoint starts */ |
| 122 | int ckpt_done; /* advances when checkpoint done */ |
| 123 | int ckpt_failed; /* advances when checkpoint fails */ |
| 124 | |
| 125 | int ckpt_flags; /* checkpoint flags, as defined in xlog.h */ |
| 126 | |
| 127 | ConditionVariable start_cv; /* signaled when ckpt_started advances */ |
| 128 | ConditionVariable done_cv; /* signaled when ckpt_done advances */ |
| 129 | |
| 130 | uint32 num_backend_writes; /* counts user backend buffer writes */ |
| 131 | uint32 num_backend_fsync; /* counts user backend fsync calls */ |
| 132 | |
| 133 | int num_requests; /* current # of requests */ |
| 134 | int max_requests; /* allocated array size */ |
| 135 | CheckpointerRequest requests[FLEXIBLE_ARRAY_MEMBER]; |
| 136 | } CheckpointerShmemStruct; |
| 137 | |
| 138 | static CheckpointerShmemStruct *CheckpointerShmem; |
| 139 | |
| 140 | /* interval for calling AbsorbSyncRequests in CheckpointWriteDelay */ |
| 141 | #define WRITES_PER_ABSORB 1000 |
| 142 | |
| 143 | /* |
| 144 | * GUC parameters |
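| | *
| | * CheckPointTimeout and CheckPointWarning are expressed in seconds;
| | * CheckPointCompletionTarget is a fraction of the checkpoint interval.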
| 145 | */ |
| 146 | int CheckPointTimeout = 300; |
| 147 | int CheckPointWarning = 30; |
| 148 | double CheckPointCompletionTarget = 0.5; |
| 149 | |
| 150 | /* |
| 151 | * Flags set by interrupt handlers for later service in the main loop. |
| 152 | */ |
| 153 | static volatile sig_atomic_t got_SIGHUP = false; |
| 154 | static volatile sig_atomic_t shutdown_requested = false; |
| 155 | |
| 156 | /* |
| 157 | * Private state |
| 158 | */ |
| 159 | static bool ckpt_active = false; |
| 160 | |
| 161 | /* these values are valid when ckpt_active is true: */ |
| 162 | static pg_time_t ckpt_start_time; |
| 163 | static XLogRecPtr ckpt_start_recptr; |
| 164 | static double ckpt_cached_elapsed; |
| 165 | |
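| | /*
| | * last_checkpoint_time is the start time of the last checkpoint (or the
| | * fallback value set after a skipped restartpoint); last_xlog_switch_time
| | * is the time of the last known WAL-file switch, used for archive_timeout.
| | */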
| 166 | static pg_time_t last_checkpoint_time; |
| 167 | static pg_time_t last_xlog_switch_time; |
| 168 | |
| 169 | /* Prototypes for private functions */ |
| 170 | |
| 171 | static void CheckArchiveTimeout(void); |
| 172 | static bool IsCheckpointOnSchedule(double progress); |
| 173 | static bool ImmediateCheckpointRequested(void); |
| 174 | static bool CompactCheckpointerRequestQueue(void); |
| 175 | static void UpdateSharedMemoryConfig(void); |
| 176 | |
| 177 | /* Signal handlers */ |
| 178 | |
| 179 | static void chkpt_quickdie(SIGNAL_ARGS); |
| 180 | static void ChkptSigHupHandler(SIGNAL_ARGS); |
| 181 | static void ReqCheckpointHandler(SIGNAL_ARGS); |
| 182 | static void chkpt_sigusr1_handler(SIGNAL_ARGS); |
| 183 | static void ReqShutdownHandler(SIGNAL_ARGS); |
| 184 | |
| 185 | |
| 186 | /* |
| 187 | * Main entry point for checkpointer process |
| 188 | * |
| 189 | * This is invoked from AuxiliaryProcessMain, which has already created the |
| 190 | * basic execution environment, but not enabled signals yet. |
| 191 | */ |
| 192 | void |
| 193 | CheckpointerMain(void) |
| 194 | { |
| 195 | sigjmp_buf local_sigjmp_buf; |
| 196 | MemoryContext checkpointer_context; |
| 197 | |
| 198 | CheckpointerShmem->checkpointer_pid = MyProcPid; |
| 199 | |
| 200 | /* |
| 201 | * Properly accept or ignore signals the postmaster might send us |
| 202 | * |
| 203 | * Note: we deliberately ignore SIGTERM, because during a standard Unix |
| 204 | * system shutdown cycle, init will SIGTERM all processes at once. We |
| 205 | * want to wait for the backends to exit, whereupon the postmaster will |
| 206 | * tell us it's okay to shut down (via SIGUSR2). |
| 207 | */ |
| 208 | pqsignal(SIGHUP, ChkptSigHupHandler); /* set flag to read config file */ |
| 209 | pqsignal(SIGINT, ReqCheckpointHandler); /* request checkpoint */ |
| 210 | pqsignal(SIGTERM, SIG_IGN); /* ignore SIGTERM */ |
| 211 | pqsignal(SIGQUIT, chkpt_quickdie); /* hard crash time */ |
| 212 | pqsignal(SIGALRM, SIG_IGN); |
| 213 | pqsignal(SIGPIPE, SIG_IGN); |
| 214 | pqsignal(SIGUSR1, chkpt_sigusr1_handler); |
| 215 | pqsignal(SIGUSR2, ReqShutdownHandler); /* request shutdown */ |
| 216 | |
| 217 | /* |
| 218 | * Reset some signals that are accepted by postmaster but not here |
| 219 | */ |
| 220 | pqsignal(SIGCHLD, SIG_DFL); |
| 221 | |
| 222 | /* We allow SIGQUIT (quickdie) at all times */ |
| 223 | sigdelset(&BlockSig, SIGQUIT); |
| 224 | |
| 225 | /* |
| 226 | * Initialize so that the first time-driven event happens at the correct time.
| 227 | */ |
| 228 | last_checkpoint_time = last_xlog_switch_time = (pg_time_t) time(NULL); |
| 229 | |
| 230 | /* |
| 231 | * Create a memory context that we will do all our work in. We do this so |
| 232 | * that we can reset the context during error recovery and thereby avoid |
| 233 | * possible memory leaks. Formerly this code just ran in |
| 234 | * TopMemoryContext, but resetting that would be a really bad idea. |
| 235 | */ |
| 236 | checkpointer_context = AllocSetContextCreate(TopMemoryContext, |
| 237 | "Checkpointer" , |
| 238 | ALLOCSET_DEFAULT_SIZES); |
| 239 | MemoryContextSwitchTo(checkpointer_context); |
| 240 | |
| 241 | /* |
| 242 | * If an exception is encountered, processing resumes here. |
| 243 | * |
| 244 | * See notes in postgres.c about the design of this coding. |
| 245 | */ |
| 246 | if (sigsetjmp(local_sigjmp_buf, 1) != 0) |
| 247 | { |
| 248 | /* Since not using PG_TRY, must reset error stack by hand */ |
| 249 | error_context_stack = NULL; |
| 250 | |
| 251 | /* Prevent interrupts while cleaning up */ |
| 252 | HOLD_INTERRUPTS(); |
| 253 | |
| 254 | /* Report the error to the server log */ |
| 255 | EmitErrorReport(); |
| 256 | |
| 257 | /* |
| 258 | * These operations are really just a minimal subset of |
| 259 | * AbortTransaction(). We don't have very many resources to worry |
| 260 | * about in checkpointer, but we do have LWLocks, buffers, and temp |
| 261 | * files. |
| 262 | */ |
| 263 | LWLockReleaseAll(); |
| 264 | ConditionVariableCancelSleep(); |
| 265 | pgstat_report_wait_end(); |
| 266 | AbortBufferIO(); |
| 267 | UnlockBuffers(); |
| 268 | ReleaseAuxProcessResources(false); |
| 269 | AtEOXact_Buffers(false); |
| 270 | AtEOXact_SMgr(); |
| 271 | AtEOXact_Files(false); |
| 272 | AtEOXact_HashTables(false); |
| 273 | |
| 274 | /* Warn any waiting backends that the checkpoint failed. */ |
| 275 | if (ckpt_active) |
| 276 | { |
| 277 | SpinLockAcquire(&CheckpointerShmem->ckpt_lck); |
| 278 | CheckpointerShmem->ckpt_failed++; |
| 279 | CheckpointerShmem->ckpt_done = CheckpointerShmem->ckpt_started; |
| 280 | SpinLockRelease(&CheckpointerShmem->ckpt_lck); |
| 281 | |
| 282 | ConditionVariableBroadcast(&CheckpointerShmem->done_cv); |
| 283 | |
| 284 | ckpt_active = false; |
| 285 | } |
| 286 | |
| 287 | /* |
| 288 | * Now return to normal top-level context and clear ErrorContext for |
| 289 | * next time. |
| 290 | */ |
| 291 | MemoryContextSwitchTo(checkpointer_context); |
| 292 | FlushErrorState(); |
| 293 | |
| 294 | /* Flush any leaked data in the top-level context */ |
| 295 | MemoryContextResetAndDeleteChildren(checkpointer_context); |
| 296 | |
| 297 | /* Now we can allow interrupts again */ |
| 298 | RESUME_INTERRUPTS(); |
| 299 | |
| 300 | /* |
| 301 | * Sleep at least 1 second after any error. A write error is likely |
| 302 | * to be repeated, and we don't want to be filling the error logs as |
| 303 | * fast as we can. |
| 304 | */ |
| 305 | pg_usleep(1000000L); |
| 306 | |
| 307 | /* |
| 308 | * Close all open files after any error. This is helpful on Windows, |
| 309 | * where holding deleted files open causes various strange errors. |
| 310 | * It's not clear we need it elsewhere, but shouldn't hurt. |
| 311 | */ |
| 312 | smgrcloseall(); |
| 313 | } |
| 314 | |
| 315 | /* We can now handle ereport(ERROR) */ |
| 316 | PG_exception_stack = &local_sigjmp_buf; |
| 317 | |
| 318 | /* |
| 319 | * Unblock signals (they were blocked when the postmaster forked us) |
| 320 | */ |
| 321 | PG_SETMASK(&UnBlockSig); |
| 322 | |
| 323 | /* |
| 324 | * Ensure all shared memory values are set correctly for the config. Doing |
| 325 | * this here ensures no race conditions from other concurrent updaters. |
| 326 | */ |
| 327 | UpdateSharedMemoryConfig(); |
| 328 | |
| 329 | /* |
| 330 | * Advertise our latch that backends can use to wake us up while we're |
| 331 | * sleeping. |
| 332 | */ |
| 333 | ProcGlobal->checkpointerLatch = &MyProc->procLatch; |
| 334 | |
| 335 | /* |
| 336 | * Loop forever |
| 337 | */ |
| 338 | for (;;) |
| 339 | { |
| 340 | bool do_checkpoint = false; |
| 341 | int flags = 0; |
| 342 | pg_time_t now; |
| 343 | int elapsed_secs; |
| 344 | int cur_timeout; |
| 345 | |
| 346 | /* Clear any already-pending wakeups */ |
| 347 | ResetLatch(MyLatch); |
| 348 | |
| 349 | /* |
| 350 | * Process any requests or signals received recently. |
| 351 | */ |
| 352 | AbsorbSyncRequests(); |
| 353 | |
| 354 | if (got_SIGHUP) |
| 355 | { |
| 356 | got_SIGHUP = false; |
| 357 | ProcessConfigFile(PGC_SIGHUP); |
| 358 | |
| 359 | /* |
| 360 | * Checkpointer is the last process to shut down, so we ask it to |
| 361 | * hold the keys for a range of other tasks, most of which have
| 362 | * nothing to do with checkpointing at all.
| 363 | * |
| 364 | * For various reasons, some config values can change dynamically |
| 365 | * so the primary copy of them is held in shared memory to make |
| 366 | * sure all backends see the same value. We make Checkpointer |
| 367 | * responsible for updating the shared memory copy if the |
| 368 | * parameter setting changes because of SIGHUP. |
| 369 | */ |
| 370 | UpdateSharedMemoryConfig(); |
| 371 | } |
| 372 | if (shutdown_requested) |
| 373 | { |
| 374 | /* |
| 375 | * From here on, elog(ERROR) should end with exit(1), not send |
| 376 | * control back to the sigsetjmp block above |
| 377 | */ |
| 378 | ExitOnAnyError = true; |
| 379 | /* Close down the database */ |
| 380 | ShutdownXLOG(0, 0); |
| 381 | /* Normal exit from the checkpointer is here */ |
| 382 | proc_exit(0); /* done */ |
| 383 | } |
| 384 | |
| 385 | /* |
| 386 | * Detect a pending checkpoint request by checking whether the flags |
| 387 | * word in shared memory is nonzero. We shouldn't need to acquire the |
| 388 | * ckpt_lck for this. |
| 389 | */ |
| 390 | if (((volatile CheckpointerShmemStruct *) CheckpointerShmem)->ckpt_flags) |
| 391 | { |
| 392 | do_checkpoint = true; |
| 393 | BgWriterStats.m_requested_checkpoints++; |
| 394 | } |
| 395 | |
| 396 | /* |
| 397 | * Force a checkpoint if too much time has elapsed since the last one. |
| 398 | * Note that we count a timed checkpoint in stats only when this |
| 399 | * occurs without an external request, but we set the CAUSE_TIME flag |
| 400 | * bit even if there is also an external request. |
| 401 | */ |
| 402 | now = (pg_time_t) time(NULL); |
| 403 | elapsed_secs = now - last_checkpoint_time; |
| 404 | if (elapsed_secs >= CheckPointTimeout) |
| 405 | { |
| 406 | if (!do_checkpoint) |
| 407 | BgWriterStats.m_timed_checkpoints++; |
| 408 | do_checkpoint = true; |
| 409 | flags |= CHECKPOINT_CAUSE_TIME; |
| 410 | } |
| 411 | |
| 412 | /* |
| 413 | * Do a checkpoint if requested. |
| 414 | */ |
| 415 | if (do_checkpoint) |
| 416 | { |
| 417 | bool ckpt_performed = false; |
| 418 | bool do_restartpoint; |
| 419 | |
| 420 | /* |
| 421 | * Check if we should perform a checkpoint or a restartpoint. As a |
| 422 | * side-effect, RecoveryInProgress() initializes TimeLineID if |
| 423 | * it's not set yet. |
| 424 | */ |
| 425 | do_restartpoint = RecoveryInProgress(); |
| 426 | |
| 427 | /* |
| 428 | * Atomically fetch the request flags to figure out what kind of a |
| 429 | * checkpoint we should perform, and increase the started-counter |
| 430 | * to acknowledge that we've started a new checkpoint. |
| 431 | */ |
| 432 | SpinLockAcquire(&CheckpointerShmem->ckpt_lck); |
| 433 | flags |= CheckpointerShmem->ckpt_flags; |
| 434 | CheckpointerShmem->ckpt_flags = 0; |
| 435 | CheckpointerShmem->ckpt_started++; |
| 436 | SpinLockRelease(&CheckpointerShmem->ckpt_lck); |
| 437 | |
| 438 | ConditionVariableBroadcast(&CheckpointerShmem->start_cv); |
| 439 | |
| 440 | /* |
| 441 | * The end-of-recovery checkpoint is a real checkpoint that's |
| 442 | * performed while we're still in recovery. |
| 443 | */ |
| 444 | if (flags & CHECKPOINT_END_OF_RECOVERY) |
| 445 | do_restartpoint = false; |
| 446 | |
| 447 | /* |
| 448 | * We will warn if (a) too soon since last checkpoint (whatever |
| 449 | * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag |
| 450 | * since the last checkpoint start. Note in particular that this |
| 451 | * implementation will not generate warnings caused by |
| 452 | * CheckPointTimeout < CheckPointWarning. |
| 453 | */ |
| 454 | if (!do_restartpoint && |
| 455 | (flags & CHECKPOINT_CAUSE_XLOG) && |
| 456 | elapsed_secs < CheckPointWarning) |
| 457 | ereport(LOG, |
| 458 | (errmsg_plural("checkpoints are occurring too frequently (%d second apart)",
| 459 | "checkpoints are occurring too frequently (%d seconds apart)",
| 460 | elapsed_secs, |
| 461 | elapsed_secs), |
| 462 | errhint("Consider increasing the configuration parameter \"max_wal_size\".")));
| 463 | |
| 464 | /* |
| 465 | * Initialize checkpointer-private variables used during |
| 466 | * checkpoint. |
| 467 | */ |
| 468 | ckpt_active = true; |
| 469 | if (do_restartpoint) |
| 470 | ckpt_start_recptr = GetXLogReplayRecPtr(NULL); |
| 471 | else |
| 472 | ckpt_start_recptr = GetInsertRecPtr(); |
| 473 | ckpt_start_time = now; |
| 474 | ckpt_cached_elapsed = 0; |
| 475 | |
| 476 | /* |
| 477 | * Do the checkpoint. |
| 478 | */ |
| 479 | if (!do_restartpoint) |
| 480 | { |
| 481 | CreateCheckPoint(flags); |
| 482 | ckpt_performed = true; |
| 483 | } |
| 484 | else |
| 485 | ckpt_performed = CreateRestartPoint(flags); |
| 486 | |
| 487 | /* |
| 488 | * After any checkpoint, close all smgr files. This is so we |
| 489 | * won't hang onto smgr references to deleted files indefinitely. |
| 490 | */ |
| 491 | smgrcloseall(); |
| 492 | |
| 493 | /* |
| 494 | * Indicate checkpoint completion to any waiting backends. |
| 495 | */ |
| 496 | SpinLockAcquire(&CheckpointerShmem->ckpt_lck); |
| 497 | CheckpointerShmem->ckpt_done = CheckpointerShmem->ckpt_started; |
| 498 | SpinLockRelease(&CheckpointerShmem->ckpt_lck); |
| 499 | |
| 500 | ConditionVariableBroadcast(&CheckpointerShmem->done_cv); |
| 501 | |
| 502 | if (ckpt_performed) |
| 503 | { |
| 504 | /* |
| 505 | * Note we record the checkpoint start time not end time as |
| 506 | * last_checkpoint_time. This is so that time-driven |
| 507 | * checkpoints happen at a predictable spacing. |
| 508 | */ |
| 509 | last_checkpoint_time = now; |
| 510 | } |
| 511 | else |
| 512 | { |
| 513 | /* |
| 514 | * We were not able to perform the restartpoint (checkpoints |
| 515 | * throw an ERROR in case of error), most likely because we
| 516 | * have not received any new checkpoint WAL records since the |
| 517 | * last restartpoint. Try again in 15 s. |
| 518 | */ |
| 519 | last_checkpoint_time = now - CheckPointTimeout + 15; |
| 520 | } |
| 521 | |
| 522 | ckpt_active = false; |
| 523 | } |
| 524 | |
| 525 | /* Check for archive_timeout and switch xlog files if necessary. */ |
| 526 | CheckArchiveTimeout(); |
| 527 | |
| 528 | /* |
| 529 | * Send off activity statistics to the stats collector. (The reason |
| 530 | * why we re-use bgwriter-related code for this is that the bgwriter |
| 531 | * and checkpointer used to be just one process. It's probably not |
| 532 | * worth the trouble to split the stats support into two independent |
| 533 | * stats message types.) |
| 534 | */ |
| 535 | pgstat_send_bgwriter(); |
| 536 | |
| 537 | /* |
| 538 | * Sleep until we are signaled or it's time for another checkpoint or |
| 539 | * xlog file switch. |
| 540 | */ |
| 541 | now = (pg_time_t) time(NULL); |
| 542 | elapsed_secs = now - last_checkpoint_time; |
| 543 | if (elapsed_secs >= CheckPointTimeout) |
| 544 | continue; /* no sleep for us ... */ |
| 545 | cur_timeout = CheckPointTimeout - elapsed_secs; |
| 546 | if (XLogArchiveTimeout > 0 && !RecoveryInProgress()) |
| 547 | { |
| 548 | elapsed_secs = now - last_xlog_switch_time; |
| 549 | if (elapsed_secs >= XLogArchiveTimeout) |
| 550 | continue; /* no sleep for us ... */ |
| 551 | cur_timeout = Min(cur_timeout, XLogArchiveTimeout - elapsed_secs); |
| 552 | } |
| 553 | |
| 554 | (void) WaitLatch(MyLatch, |
| 555 | WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, |
| 556 | cur_timeout * 1000L /* convert to ms */ , |
| 557 | WAIT_EVENT_CHECKPOINTER_MAIN); |
| 558 | } |
| 559 | } |
| 560 | |
| 561 | /* |
| 562 | * CheckArchiveTimeout -- check for archive_timeout and switch xlog files |
| 563 | * |
| 564 | * This will switch to a new WAL file and force an archive file write if |
| 565 | * meaningful activity is recorded in the current WAL file. This includes most |
| 566 | * writes, including just a single checkpoint record, but excludes WAL records |
| 567 | * that were inserted with the XLOG_MARK_UNIMPORTANT flag being set (like |
| 568 | * snapshots of running transactions). Such records, depending on |
| 569 | * configuration, occur at regular intervals and don't contain important
| 570 | * information. This avoids generating archives with a few unimportant |
| 571 | * records. |
| 572 | */ |
| 573 | static void |
| 574 | CheckArchiveTimeout(void) |
| 575 | { |
| 576 | pg_time_t now; |
| 577 | pg_time_t last_time; |
| 578 | XLogRecPtr last_switch_lsn; |
| 579 | |
| 580 | if (XLogArchiveTimeout <= 0 || RecoveryInProgress()) |
| 581 | return; |
| 582 | |
| 583 | now = (pg_time_t) time(NULL); |
| 584 | |
| 585 | /* First we do a quick check using possibly-stale local state. */ |
| 586 | if ((int) (now - last_xlog_switch_time) < XLogArchiveTimeout) |
| 587 | return; |
| 588 | |
| 589 | /* |
| 590 | * Update local state ... note that last_xlog_switch_time is the last time |
| 591 | * a switch was performed *or requested*. |
| 592 | */ |
| 593 | last_time = GetLastSegSwitchData(&last_switch_lsn); |
| 594 | |
| 595 | last_xlog_switch_time = Max(last_xlog_switch_time, last_time); |
| 596 | |
| 597 | /* Now we can do the real checks */ |
| 598 | if ((int) (now - last_xlog_switch_time) >= XLogArchiveTimeout) |
| 599 | { |
| 600 | /* |
| 601 | * Switch segment only when "important" WAL has been logged since the |
| 602 | * last segment switch (last_switch_lsn points to the end of the
| 603 | * segment in which the last switch occurred).
| 604 | */ |
| 605 | if (GetLastImportantRecPtr() > last_switch_lsn) |
| 606 | { |
| 607 | XLogRecPtr switchpoint; |
| 608 | |
| 609 | /* mark switch as unimportant, avoids triggering checkpoints */ |
| 610 | switchpoint = RequestXLogSwitch(true); |
| 611 | |
| 612 | /* |
| 613 | * If the returned pointer points exactly to a segment boundary, |
| 614 | * assume nothing happened. |
| 615 | */ |
| 616 | if (XLogSegmentOffset(switchpoint, wal_segment_size) != 0) |
| 617 | elog(DEBUG1, "write-ahead log switch forced (archive_timeout=%d)",
| 618 | XLogArchiveTimeout); |
| 619 | } |
| 620 | |
| 621 | /* |
| 622 | * Update state in any case, so we don't retry constantly when the |
| 623 | * system is idle. |
| 624 | */ |
| 625 | last_xlog_switch_time = now; |
| 626 | } |
| 627 | } |
| 628 | |
| 629 | /* |
| 630 | * Returns true if an immediate checkpoint request is pending. (Note that |
| 631 | * this does not check the *current* checkpoint's IMMEDIATE flag, but whether |
| 632 | * there is one pending behind it.) |
| 633 | */ |
| 634 | static bool |
| 635 | ImmediateCheckpointRequested(void) |
| 636 | { |
| 637 | volatile CheckpointerShmemStruct *cps = CheckpointerShmem; |
| 638 | |
| 639 | /* |
| 640 | * We don't need to acquire the ckpt_lck in this case because we're only |
| 641 | * looking at a single flag bit. |
| 642 | */ |
| 643 | if (cps->ckpt_flags & CHECKPOINT_IMMEDIATE) |
| 644 | return true; |
| 645 | return false; |
| 646 | } |
| 647 | |
| 648 | /* |
| 649 | * CheckpointWriteDelay -- control rate of checkpoint |
| 650 | * |
| 651 | * This function is called after each page write performed by BufferSync(). |
| 652 | * It is responsible for throttling BufferSync()'s write rate to hit |
| 653 | * checkpoint_completion_target. |
| 654 | * |
| 655 | * The checkpoint request flags should be passed in; currently the only one |
| 656 | * examined is CHECKPOINT_IMMEDIATE, which disables delays between writes. |
| 657 | * |
| 658 | * 'progress' is an estimate of how much of the work has been done, as a |
| 659 | * fraction between 0.0 meaning none, and 1.0 meaning all done. |
| 660 | */ |
| 661 | void |
| 662 | CheckpointWriteDelay(int flags, double progress) |
| 663 | { |
| 664 | static int absorb_counter = WRITES_PER_ABSORB; |
| 665 | |
| 666 | /* Do nothing if checkpoint is being executed by non-checkpointer process */ |
| 667 | if (!AmCheckpointerProcess()) |
| 668 | return; |
| 669 | |
| 670 | /* |
| 671 | * Perform the usual duties and take a nap, unless we're behind schedule, |
| 672 | * in which case we just try to catch up as quickly as possible. |
| 673 | */ |
| 674 | if (!(flags & CHECKPOINT_IMMEDIATE) && |
| 675 | !shutdown_requested && |
| 676 | !ImmediateCheckpointRequested() && |
| 677 | IsCheckpointOnSchedule(progress)) |
| 678 | { |
| 679 | if (got_SIGHUP) |
| 680 | { |
| 681 | got_SIGHUP = false; |
| 682 | ProcessConfigFile(PGC_SIGHUP); |
| 683 | /* update shmem copies of config variables */ |
| 684 | UpdateSharedMemoryConfig(); |
| 685 | } |
| 686 | |
| 687 | AbsorbSyncRequests(); |
| 688 | absorb_counter = WRITES_PER_ABSORB; |
| 689 | |
| 690 | CheckArchiveTimeout(); |
| 691 | |
| 692 | /* |
| 693 | * Report interim activity statistics to the stats collector. |
| 694 | */ |
| 695 | pgstat_send_bgwriter(); |
| 696 | |
| 697 | /* |
| 698 | * This sleep used to be connected to bgwriter_delay, typically 200ms. |
| 699 | * That resulted in more frequent wakeups even when there was not much work to do.
| 700 | * Checkpointer and bgwriter are no longer related so take the Big |
| 701 | * Sleep. |
| 702 | */ |
| 703 | pg_usleep(100000L); |
| 704 | } |
| 705 | else if (--absorb_counter <= 0) |
| 706 | { |
| 707 | /* |
| 708 | * Absorb pending fsync requests after each WRITES_PER_ABSORB write |
| 709 | * operations even when we don't sleep, to prevent overflow of the |
| 710 | * fsync request queue. |
| 711 | */ |
| 712 | AbsorbSyncRequests(); |
| 713 | absorb_counter = WRITES_PER_ABSORB; |
| 714 | } |
| 715 | } |
| 716 | |
| 717 | /* |
| 718 | * IsCheckpointOnSchedule -- are we on schedule to finish this checkpoint |
| 719 | * (or restartpoint) in time? |
| 720 | * |
| 721 | * Compares the current progress against the time/segments elapsed since last |
| 722 | * checkpoint, and returns true if the progress we've made this far is greater |
| 723 | * than the elapsed time/segments. |
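| | *
| | * Roughly speaking, the test implemented below is:
| | *   progress * checkpoint_completion_target >=
| | *       Max(wal_since_start / (CheckPointSegments * wal_segment_size),
| | *           seconds_since_start / checkpoint_timeout)
| | * where "wal_since_start" and "seconds_since_start" are informal names for
| | * the quantities derived from ckpt_start_recptr and ckpt_start_time.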
| 724 | */ |
| 725 | static bool |
| 726 | IsCheckpointOnSchedule(double progress) |
| 727 | { |
| 728 | XLogRecPtr recptr; |
| 729 | struct timeval now; |
| 730 | double elapsed_xlogs, |
| 731 | elapsed_time; |
| 732 | |
| 733 | Assert(ckpt_active); |
| 734 | |
| 735 | /* Scale progress according to checkpoint_completion_target. */ |
| 736 | progress *= CheckPointCompletionTarget; |
| 737 | |
| 738 | /* |
| 739 | * Check against the cached value first. Only do the more expensive |
| 740 | * calculations once we reach the target previously calculated. Since |
| 741 | * neither time nor the WAL insert pointer moves backwards, a freshly
| 742 | * calculated value can only be greater than or equal to the cached value. |
| 743 | */ |
| 744 | if (progress < ckpt_cached_elapsed) |
| 745 | return false; |
| 746 | |
| 747 | /* |
| 748 | * Check progress against WAL segments written and CheckPointSegments. |
| 749 | * |
| 750 | * We compare the current WAL insert location against the location |
| 751 | * computed before calling CreateCheckPoint. The code in XLogInsert that |
| 752 | * actually triggers a checkpoint when CheckPointSegments is exceeded |
| 753 | * compares against RedoRecptr, so this is not completely accurate. |
| 754 | * However, it's good enough for our purposes; we're only calculating an
| 755 | * estimate anyway. |
| 756 | * |
| 757 | * During recovery, we compare last replayed WAL record's location with |
| 758 | * the location computed before calling CreateRestartPoint. That maintains |
| 759 | * the same pacing as we have during checkpoints in normal operation, but |
| 760 | * we might exceed max_wal_size by a fair amount. That's because there can |
| 761 | * be a large gap between a checkpoint's redo-pointer and the checkpoint |
| 762 | * record itself, and we only start the restartpoint after we've seen the |
| 763 | * checkpoint record. (The gap is typically up to CheckPointSegments * |
| 764 | * checkpoint_completion_target where checkpoint_completion_target is the |
| 765 | * value that was in effect when the WAL was generated). |
| 766 | */ |
| 767 | if (RecoveryInProgress()) |
| 768 | recptr = GetXLogReplayRecPtr(NULL); |
| 769 | else |
| 770 | recptr = GetInsertRecPtr(); |
| 771 | elapsed_xlogs = (((double) (recptr - ckpt_start_recptr)) / |
| 772 | wal_segment_size) / CheckPointSegments; |
| 773 | |
| 774 | if (progress < elapsed_xlogs) |
| 775 | { |
| 776 | ckpt_cached_elapsed = elapsed_xlogs; |
| 777 | return false; |
| 778 | } |
| 779 | |
| 780 | /* |
| 781 | * Check progress against time elapsed and checkpoint_timeout. |
| 782 | */ |
| 783 | gettimeofday(&now, NULL); |
| 784 | elapsed_time = ((double) ((pg_time_t) now.tv_sec - ckpt_start_time) + |
| 785 | now.tv_usec / 1000000.0) / CheckPointTimeout; |
| 786 | |
| 787 | if (progress < elapsed_time) |
| 788 | { |
| 789 | ckpt_cached_elapsed = elapsed_time; |
| 790 | return false; |
| 791 | } |
| 792 | |
| 793 | /* It looks like we're on schedule. */ |
| 794 | return true; |
| 795 | } |
| 796 | |
| 797 | |
| 798 | /* -------------------------------- |
| 799 | * signal handler routines |
| 800 | * -------------------------------- |
| 801 | */ |
| 802 | |
| 803 | /* |
| 804 | * chkpt_quickdie() occurs when signalled SIGQUIT by the postmaster. |
| 805 | * |
| 806 | * Some backend has bought the farm, |
| 807 | * so we need to stop what we're doing and exit. |
| 808 | */ |
| 809 | static void |
| 810 | chkpt_quickdie(SIGNAL_ARGS) |
| 811 | { |
| 812 | /* |
| 813 | * We DO NOT want to run proc_exit() or atexit() callbacks -- we're here |
| 814 | * because shared memory may be corrupted, so we don't want to try to |
| 815 | * clean up our transaction. Just nail the windows shut and get out of |
| 816 | * town. The callbacks wouldn't be safe to run from a signal handler, |
| 817 | * anyway. |
| 818 | * |
| 819 | * Note we do _exit(2) not _exit(0). This is to force the postmaster into |
| 820 | * a system reset cycle if someone sends a manual SIGQUIT to a random |
| 821 | * backend. This is necessary precisely because we don't clean up our |
| 822 | * shared memory state. (The "dead man switch" mechanism in pmsignal.c |
| 823 | * should ensure the postmaster sees this as a crash, too, but no harm in |
| 824 | * being doubly sure.) |
| 825 | */ |
| 826 | _exit(2); |
| 827 | } |
| 828 | |
| 829 | /* SIGHUP: set flag to re-read config file at next convenient time */ |
| 830 | static void |
| 831 | ChkptSigHupHandler(SIGNAL_ARGS) |
| 832 | { |
| 833 | int save_errno = errno; |
| 834 | |
| 835 | got_SIGHUP = true; |
| 836 | SetLatch(MyLatch); |
| 837 | |
| 838 | errno = save_errno; |
| 839 | } |
| 840 | |
| 841 | /* SIGINT: set flag to run a normal checkpoint right away */ |
| 842 | static void |
| 843 | ReqCheckpointHandler(SIGNAL_ARGS) |
| 844 | { |
| 845 | int save_errno = errno; |
| 846 | |
| 847 | /* |
| 848 | * The signalling process should have set ckpt_flags nonzero, so all we |
| 849 | * need do is ensure that our main loop gets kicked out of any wait. |
| 850 | */ |
| 851 | SetLatch(MyLatch); |
| 852 | |
| 853 | errno = save_errno; |
| 854 | } |
| 855 | |
| 856 | /* SIGUSR1: used for latch wakeups */ |
| 857 | static void |
| 858 | chkpt_sigusr1_handler(SIGNAL_ARGS) |
| 859 | { |
| 860 | int save_errno = errno; |
| 861 | |
| 862 | latch_sigusr1_handler(); |
| 863 | |
| 864 | errno = save_errno; |
| 865 | } |
| 866 | |
| 867 | /* SIGUSR2: set flag to run a shutdown checkpoint and exit */ |
| 868 | static void |
| 869 | ReqShutdownHandler(SIGNAL_ARGS) |
| 870 | { |
| 871 | int save_errno = errno; |
| 872 | |
| 873 | shutdown_requested = true; |
| 874 | SetLatch(MyLatch); |
| 875 | |
| 876 | errno = save_errno; |
| 877 | } |
| 878 | |
| 879 | |
| 880 | /* -------------------------------- |
| 881 | * communication with backends |
| 882 | * -------------------------------- |
| 883 | */ |
| 884 | |
| 885 | /* |
| 886 | * CheckpointerShmemSize |
| 887 | * Compute space needed for checkpointer-related shared memory |
| 888 | */ |
| 889 | Size |
| 890 | CheckpointerShmemSize(void) |
| 891 | { |
| 892 | Size size; |
| 893 | |
| 894 | /* |
| 895 | * Currently, the size of the requests[] array is arbitrarily set equal to |
| 896 | * NBuffers. This may prove too large or small ... |
| 897 | */ |
| 898 | size = offsetof(CheckpointerShmemStruct, requests); |
| 899 | size = add_size(size, mul_size(NBuffers, sizeof(CheckpointerRequest))); |
| 900 | |
| 901 | return size; |
| 902 | } |
| 903 | |
| 904 | /* |
| 905 | * CheckpointerShmemInit |
| 906 | * Allocate and initialize checkpointer-related shared memory |
| 907 | */ |
| 908 | void |
| 909 | CheckpointerShmemInit(void) |
| 910 | { |
| 911 | Size size = CheckpointerShmemSize(); |
| 912 | bool found; |
| 913 | |
| 914 | CheckpointerShmem = (CheckpointerShmemStruct *) |
| 915 | ShmemInitStruct("Checkpointer Data",
| 916 | size, |
| 917 | &found); |
| 918 | |
| 919 | if (!found) |
| 920 | { |
| 921 | /* |
| 922 | * First time through, so initialize. Note that we zero the whole |
| 923 | * requests array; this is so that CompactCheckpointerRequestQueue can |
| 924 | * assume that any pad bytes in the request structs are zeroes. |
| 925 | */ |
| 926 | MemSet(CheckpointerShmem, 0, size); |
| 927 | SpinLockInit(&CheckpointerShmem->ckpt_lck); |
| 928 | CheckpointerShmem->max_requests = NBuffers; |
| 929 | ConditionVariableInit(&CheckpointerShmem->start_cv); |
| 930 | ConditionVariableInit(&CheckpointerShmem->done_cv); |
| 931 | } |
| 932 | } |
| 933 | |
| 934 | /* |
| 935 | * RequestCheckpoint |
| 936 | * Called in backend processes to request a checkpoint |
| 937 | * |
| 938 | * flags is a bitwise OR of the following: |
| 939 | * CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown. |
| 940 | * CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery. |
| 941 | * CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP, |
| 942 | * ignoring checkpoint_completion_target parameter. |
| 943 | * CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred |
| 944 | * since the last one (implied by CHECKPOINT_IS_SHUTDOWN or |
| 945 | * CHECKPOINT_END_OF_RECOVERY). |
| 946 | * CHECKPOINT_WAIT: wait for completion before returning (otherwise, |
| 947 | * just signal checkpointer to do it, and return). |
| 948 | * CHECKPOINT_CAUSE_XLOG: checkpoint is requested due to xlog filling. |
| 949 | * (This affects logging, and in particular enables CheckPointWarning.) |
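| | *
| | * As an illustrative example (the authoritative flag combinations live in
| | * the callers, e.g. the CHECKPOINT utility command), an immediate, forced,
| | * waited-for checkpoint would be requested as:
| | *   RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | CHECKPOINT_WAIT);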
| 950 | */ |
| 951 | void |
| 952 | RequestCheckpoint(int flags) |
| 953 | { |
| 954 | int ntries; |
| 955 | int old_failed, |
| 956 | old_started; |
| 957 | |
| 958 | /* |
| 959 | * If in a standalone backend, just do it ourselves. |
| 960 | */ |
| 961 | if (!IsPostmasterEnvironment) |
| 962 | { |
| 963 | /* |
| 964 | * There's no point in doing slow checkpoints in a standalone backend, |
| 965 | * because there are no other backends the checkpoint could disrupt.
| 966 | */ |
| 967 | CreateCheckPoint(flags | CHECKPOINT_IMMEDIATE); |
| 968 | |
| 969 | /* |
| 970 | * After any checkpoint, close all smgr files. This is so we won't |
| 971 | * hang onto smgr references to deleted files indefinitely. |
| 972 | */ |
| 973 | smgrcloseall(); |
| 974 | |
| 975 | return; |
| 976 | } |
| 977 | |
| 978 | /* |
| 979 | * Atomically set the request flags, and take a snapshot of the counters. |
| 980 | * When we see ckpt_started > old_started, we know the flags we set here |
| 981 | * have been seen by checkpointer. |
| 982 | * |
| 983 | * Note that we OR the flags with any existing flags, to avoid overriding |
| 984 | * a "stronger" request by another backend. The flag senses must be |
| 985 | * chosen to make this work! |
| 986 | */ |
| 987 | SpinLockAcquire(&CheckpointerShmem->ckpt_lck); |
| 988 | |
| 989 | old_failed = CheckpointerShmem->ckpt_failed; |
| 990 | old_started = CheckpointerShmem->ckpt_started; |
| 991 | CheckpointerShmem->ckpt_flags |= (flags | CHECKPOINT_REQUESTED); |
| 992 | |
| 993 | SpinLockRelease(&CheckpointerShmem->ckpt_lck); |
| 994 | |
| 995 | /* |
| 996 | * Send signal to request checkpoint. It's possible that the checkpointer |
| 997 | * hasn't started yet, or is in process of restarting, so we will retry a |
| 998 | * few times if needed. (Actually, more than a few times, since on slow |
| 999 | * or overloaded buildfarm machines, it's been observed that the |
| 1000 | * checkpointer can take several seconds to start.) However, if not told |
| 1001 | * to wait for the checkpoint to occur, we consider failure to send the |
| 1002 | * signal to be nonfatal and merely LOG it. The checkpointer should see |
| 1003 | * the request when it does start, with or without getting a signal. |
| 1004 | */ |
| 1005 | #define MAX_SIGNAL_TRIES 600 /* max wait 60.0 sec */ |
| 1006 | for (ntries = 0;; ntries++) |
| 1007 | { |
| 1008 | if (CheckpointerShmem->checkpointer_pid == 0) |
| 1009 | { |
| 1010 | if (ntries >= MAX_SIGNAL_TRIES || !(flags & CHECKPOINT_WAIT)) |
| 1011 | { |
| 1012 | elog((flags & CHECKPOINT_WAIT) ? ERROR : LOG, |
| 1013 | "could not signal for checkpoint: checkpointer is not running" ); |
| 1014 | break; |
| 1015 | } |
| 1016 | } |
| 1017 | else if (kill(CheckpointerShmem->checkpointer_pid, SIGINT) != 0) |
| 1018 | { |
| 1019 | if (ntries >= MAX_SIGNAL_TRIES || !(flags & CHECKPOINT_WAIT)) |
| 1020 | { |
| 1021 | elog((flags & CHECKPOINT_WAIT) ? ERROR : LOG, |
| 1022 | "could not signal for checkpoint: %m" ); |
| 1023 | break; |
| 1024 | } |
| 1025 | } |
| 1026 | else |
| 1027 | break; /* signal sent successfully */ |
| 1028 | |
| 1029 | CHECK_FOR_INTERRUPTS(); |
| 1030 | pg_usleep(100000L); /* wait 0.1 sec, then retry */ |
| 1031 | } |
| 1032 | |
| 1033 | /* |
| 1034 | * If requested, wait for completion. We detect completion according to |
| 1035 | * the algorithm given above. |
| 1036 | */ |
| 1037 | if (flags & CHECKPOINT_WAIT) |
| 1038 | { |
| 1039 | int new_started, |
| 1040 | new_failed; |
| 1041 | |
| 1042 | /* Wait for a new checkpoint to start. */ |
| 1043 | ConditionVariablePrepareToSleep(&CheckpointerShmem->start_cv); |
| 1044 | for (;;) |
| 1045 | { |
| 1046 | SpinLockAcquire(&CheckpointerShmem->ckpt_lck); |
| 1047 | new_started = CheckpointerShmem->ckpt_started; |
| 1048 | SpinLockRelease(&CheckpointerShmem->ckpt_lck); |
| 1049 | |
| 1050 | if (new_started != old_started) |
| 1051 | break; |
| 1052 | |
| 1053 | ConditionVariableSleep(&CheckpointerShmem->start_cv, |
| 1054 | WAIT_EVENT_CHECKPOINT_START); |
| 1055 | } |
| 1056 | ConditionVariableCancelSleep(); |
| 1057 | |
| 1058 | /* |
| 1059 | * We are waiting for ckpt_done >= new_started, in a modulo sense. |
| 1060 | */ |
| 1061 | ConditionVariablePrepareToSleep(&CheckpointerShmem->done_cv); |
| 1062 | for (;;) |
| 1063 | { |
| 1064 | int new_done; |
| 1065 | |
| 1066 | SpinLockAcquire(&CheckpointerShmem->ckpt_lck); |
| 1067 | new_done = CheckpointerShmem->ckpt_done; |
| 1068 | new_failed = CheckpointerShmem->ckpt_failed; |
| 1069 | SpinLockRelease(&CheckpointerShmem->ckpt_lck); |
| 1070 | |
| 1071 | if (new_done - new_started >= 0) |
| 1072 | break; |
| 1073 | |
| 1074 | ConditionVariableSleep(&CheckpointerShmem->done_cv, |
| 1075 | WAIT_EVENT_CHECKPOINT_DONE); |
| 1076 | } |
| 1077 | ConditionVariableCancelSleep(); |
| 1078 | |
| 1079 | if (new_failed != old_failed) |
| 1080 | ereport(ERROR, |
| 1081 | (errmsg("checkpoint request failed"),
| 1082 | errhint("Consult recent messages in the server log for details.")));
| 1083 | } |
| 1084 | } |
| 1085 | |
| 1086 | /* |
| 1087 | * ForwardSyncRequest |
| 1088 | * Forward a file-fsync request from a backend to the checkpointer |
| 1089 | * |
| 1090 | * Whenever a backend is compelled to write directly to a relation |
| 1091 | * (which should be seldom, if the background writer is getting its job done), |
| 1092 | * the backend calls this routine to pass over knowledge that the relation |
| 1093 | * is dirty and must be fsync'd before next checkpoint. We also use this |
| 1094 | * opportunity to count such writes for statistical purposes. |
| 1095 | * |
| 1096 | * To avoid holding the lock for longer than necessary, we normally write |
| 1097 | * to the requests[] queue without checking for duplicates. The checkpointer |
| 1098 | * will have to eliminate dups internally anyway. However, if we discover |
| 1099 | * that the queue is full, we make a pass over the entire queue to compact |
| 1100 | * it. This is somewhat expensive, but the alternative is for the backend |
| 1101 | * to perform its own fsync, which is far more expensive in practice. It |
| 1102 | * is theoretically possible a backend fsync might still be necessary, if |
| 1103 | * the queue is full and contains no duplicate entries. In that case, we |
| 1104 | * let the backend know by returning false. |
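| | *
| | * The expected caller-side pattern (see RegisterSyncRequest in sync.c for
| | * the actual logic) is roughly: if ForwardSyncRequest returns false, the
| | * backend performs the fsync itself rather than deferring it to the
| | * checkpointer.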
| 1105 | */ |
| 1106 | bool |
| 1107 | ForwardSyncRequest(const FileTag *ftag, SyncRequestType type) |
| 1108 | { |
| 1109 | CheckpointerRequest *request; |
| 1110 | bool too_full; |
| 1111 | |
| 1112 | if (!IsUnderPostmaster) |
| 1113 | return false; /* probably shouldn't even get here */ |
| 1114 | |
| 1115 | if (AmCheckpointerProcess()) |
| 1116 | elog(ERROR, "ForwardSyncRequest must not be called in checkpointer");
| 1117 | |
| 1118 | LWLockAcquire(CheckpointerCommLock, LW_EXCLUSIVE); |
| 1119 | |
| 1120 | /* Count all backend writes regardless of whether they fit in the queue */
| 1121 | if (!AmBackgroundWriterProcess()) |
| 1122 | CheckpointerShmem->num_backend_writes++; |
| 1123 | |
| 1124 | /* |
| 1125 | * If the checkpointer isn't running or the request queue is full, the |
| 1126 | * backend will have to perform its own fsync request. But before forcing |
| 1127 | * that to happen, we can try to compact the request queue. |
| 1128 | */ |
| 1129 | if (CheckpointerShmem->checkpointer_pid == 0 || |
| 1130 | (CheckpointerShmem->num_requests >= CheckpointerShmem->max_requests && |
| 1131 | !CompactCheckpointerRequestQueue())) |
| 1132 | { |
| 1133 | /* |
| 1134 | * Count the subset of writes where backends have to do their own |
| 1135 | * fsync |
| 1136 | */ |
| 1137 | if (!AmBackgroundWriterProcess()) |
| 1138 | CheckpointerShmem->num_backend_fsync++; |
| 1139 | LWLockRelease(CheckpointerCommLock); |
| 1140 | return false; |
| 1141 | } |
| 1142 | |
| 1143 | /* OK, insert request */ |
| 1144 | request = &CheckpointerShmem->requests[CheckpointerShmem->num_requests++]; |
| 1145 | request->ftag = *ftag; |
| 1146 | request->type = type; |
| 1147 | |
| 1148 | /* If queue is more than half full, nudge the checkpointer to empty it */ |
| 1149 | too_full = (CheckpointerShmem->num_requests >= |
| 1150 | CheckpointerShmem->max_requests / 2); |
| 1151 | |
| 1152 | LWLockRelease(CheckpointerCommLock); |
| 1153 | |
| 1154 | /* ... but not till after we release the lock */ |
| 1155 | if (too_full && ProcGlobal->checkpointerLatch) |
| 1156 | SetLatch(ProcGlobal->checkpointerLatch); |
| 1157 | |
| 1158 | return true; |
| 1159 | } |
| 1160 | |
| 1161 | /* |
| 1162 | * CompactCheckpointerRequestQueue |
| 1163 | * Remove duplicates from the request queue to avoid backend fsyncs. |
| 1164 | * Returns "true" if any entries were removed. |
| 1165 | * |
| 1166 | * Although a full fsync request queue is not common, it can lead to severe |
| 1167 | * performance problems when it does happen. So far, this situation has |
| 1168 | * only been observed to occur when the system is under heavy write load, |
| 1169 | * and especially during the "sync" phase of a checkpoint. Without this |
| 1170 | * logic, each backend begins doing an fsync for every block written, which |
| 1171 | * gets very expensive and can slow down the whole system. |
| 1172 | * |
| 1173 | * Trying to do this every time the queue is full could lose if there |
| 1174 | * aren't any removable entries. But that should be vanishingly rare in |
| 1175 | * practice: there's one queue entry per shared buffer. |
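| | *
| | * As a small worked example of the dedup rule used below: for a queue
| | * containing A, B, A, C, A (letters standing for identical requests), the
| | * first two A entries are marked skippable and the queue compacts to
| | * B, C, A, preserving the relative order of the surviving entries.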
| 1176 | */ |
| 1177 | static bool |
| 1178 | CompactCheckpointerRequestQueue(void) |
| 1179 | { |
| 1180 | struct CheckpointerSlotMapping |
| 1181 | { |
| 1182 | CheckpointerRequest request; |
| 1183 | int slot; |
| 1184 | }; |
| 1185 | |
| 1186 | int n, |
| 1187 | preserve_count; |
| 1188 | int num_skipped = 0; |
| 1189 | HASHCTL ctl; |
| 1190 | HTAB *htab; |
| 1191 | bool *skip_slot; |
| 1192 | |
| 1193 | /* must hold CheckpointerCommLock in exclusive mode */ |
| 1194 | Assert(LWLockHeldByMe(CheckpointerCommLock)); |
| 1195 | |
| 1196 | /* Initialize skip_slot array */ |
| 1197 | skip_slot = palloc0(sizeof(bool) * CheckpointerShmem->num_requests); |
| 1198 | |
| 1199 | /* Initialize temporary hash table */ |
| 1200 | MemSet(&ctl, 0, sizeof(ctl)); |
| 1201 | ctl.keysize = sizeof(CheckpointerRequest); |
| 1202 | ctl.entrysize = sizeof(struct CheckpointerSlotMapping); |
| 1203 | ctl.hcxt = CurrentMemoryContext; |
| 1204 | |
| 1205 | htab = hash_create("CompactCheckpointerRequestQueue",
| 1206 | CheckpointerShmem->num_requests, |
| 1207 | &ctl, |
| 1208 | HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); |
| 1209 | |
| 1210 | /* |
| 1211 | * The basic idea here is that a request can be skipped if it's followed |
| 1212 | * by a later, identical request. It might seem more sensible to work |
| 1213 | * backwards from the end of the queue and check whether a request is |
| 1214 | * *preceded* by an earlier, identical request, in the hopes of doing less |
| 1215 | * copying. But that might change the semantics, if there's an |
| 1216 | * intervening SYNC_FORGET_REQUEST or SYNC_FILTER_REQUEST, so we do it |
| 1217 | * this way. It would be possible to be even smarter if we made the code |
| 1218 | * below understand the specific semantics of such requests (it could blow |
| 1219 | * away preceding entries that would end up being canceled anyhow), but |
| 1220 | * it's not clear that the extra complexity would buy us anything. |
| 1221 | */ |
| 1222 | for (n = 0; n < CheckpointerShmem->num_requests; n++) |
| 1223 | { |
| 1224 | CheckpointerRequest *request; |
| 1225 | struct CheckpointerSlotMapping *slotmap; |
| 1226 | bool found; |
| 1227 | |
| 1228 | /* |
| 1229 | * We use the request struct directly as a hashtable key. This |
| 1230 | * assumes that any padding bytes in the structs are consistently the |
| 1231 | * same, which should be okay because we zeroed them in |
| 1232 | * CheckpointerShmemInit. Note also that RelFileNode had better |
| 1233 | * contain no pad bytes. |
| 1234 | */ |
| 1235 | request = &CheckpointerShmem->requests[n]; |
| 1236 | slotmap = hash_search(htab, request, HASH_ENTER, &found); |
| 1237 | if (found) |
| 1238 | { |
| 1239 | /* Duplicate, so mark the previous occurrence as skippable */ |
| 1240 | skip_slot[slotmap->slot] = true; |
| 1241 | num_skipped++; |
| 1242 | } |
| 1243 | /* Remember slot containing latest occurrence of this request value */ |
| 1244 | slotmap->slot = n; |
| 1245 | } |
| 1246 | |
| 1247 | /* Done with the hash table. */ |
| 1248 | hash_destroy(htab); |
| 1249 | |
| 1250 | /* If no duplicates, we're out of luck. */ |
| 1251 | if (!num_skipped) |
| 1252 | { |
| 1253 | pfree(skip_slot); |
| 1254 | return false; |
| 1255 | } |
| 1256 | |
| 1257 | /* We found some duplicates; remove them. */ |
| 1258 | preserve_count = 0; |
| 1259 | for (n = 0; n < CheckpointerShmem->num_requests; n++) |
| 1260 | { |
| 1261 | if (skip_slot[n]) |
| 1262 | continue; |
| 1263 | CheckpointerShmem->requests[preserve_count++] = CheckpointerShmem->requests[n]; |
| 1264 | } |
| 1265 | ereport(DEBUG1, |
| 1266 | (errmsg("compacted fsync request queue from %d entries to %d entries",
| 1267 | CheckpointerShmem->num_requests, preserve_count))); |
| 1268 | CheckpointerShmem->num_requests = preserve_count; |
| 1269 | |
| 1270 | /* Cleanup. */ |
| 1271 | pfree(skip_slot); |
| 1272 | return true; |
| 1273 | } |
| 1274 | |
| 1275 | /* |
| 1276 | * AbsorbSyncRequests |
| 1277 | * Retrieve queued sync requests and pass them to sync mechanism. |
| 1278 | * |
| 1279 | * This is exported because it must be called during CreateCheckPoint; |
| 1280 | * we have to be sure we have accepted all pending requests just before |
| 1281 | * we start fsync'ing. Since CreateCheckPoint sometimes runs in |
| 1282 | * non-checkpointer processes, do nothing if not checkpointer. |
| 1283 | */ |
| 1284 | void |
| 1285 | AbsorbSyncRequests(void) |
| 1286 | { |
| 1287 | CheckpointerRequest *requests = NULL; |
| 1288 | CheckpointerRequest *request; |
| 1289 | int n; |
| 1290 | |
| 1291 | if (!AmCheckpointerProcess()) |
| 1292 | return; |
| 1293 | |
| 1294 | LWLockAcquire(CheckpointerCommLock, LW_EXCLUSIVE); |
| 1295 | |
| 1296 | /* Transfer stats counts into pending pgstats message */ |
| 1297 | BgWriterStats.m_buf_written_backend += CheckpointerShmem->num_backend_writes; |
| 1298 | BgWriterStats.m_buf_fsync_backend += CheckpointerShmem->num_backend_fsync; |
| 1299 | |
| 1300 | CheckpointerShmem->num_backend_writes = 0; |
| 1301 | CheckpointerShmem->num_backend_fsync = 0; |
| 1302 | |
| 1303 | /* |
| 1304 | * We try to avoid holding the lock for a long time by copying the request |
| 1305 | * array, and processing the requests after releasing the lock. |
| 1306 | * |
| 1307 | * Once we have cleared the requests from shared memory, we have to PANIC |
| 1308 | * if we then fail to absorb them (eg, because our hashtable runs out of |
| 1309 | * memory). This is because the system cannot run safely if we are unable |
| 1310 | * to fsync what we have been told to fsync. Fortunately, the hashtable |
| 1311 | * is so small that the problem is quite unlikely to arise in practice. |
| 1312 | */ |
| 1313 | n = CheckpointerShmem->num_requests; |
| 1314 | if (n > 0) |
| 1315 | { |
| 1316 | requests = (CheckpointerRequest *) palloc(n * sizeof(CheckpointerRequest)); |
| 1317 | memcpy(requests, CheckpointerShmem->requests, n * sizeof(CheckpointerRequest)); |
| 1318 | } |
| 1319 | |
| 1320 | START_CRIT_SECTION(); |
| 1321 | |
| 1322 | CheckpointerShmem->num_requests = 0; |
| 1323 | |
| 1324 | LWLockRelease(CheckpointerCommLock); |
| 1325 | |
| 1326 | for (request = requests; n > 0; request++, n--) |
| 1327 | RememberSyncRequest(&request->ftag, request->type); |
| 1328 | |
| 1329 | END_CRIT_SECTION(); |
| 1330 | |
| 1331 | if (requests) |
| 1332 | pfree(requests); |
| 1333 | } |
| 1334 | |
| 1335 | /* |
| 1336 | * Update any shared memory configurations based on config parameters |
| 1337 | */ |
| 1338 | static void |
| 1339 | UpdateSharedMemoryConfig(void) |
| 1340 | { |
| 1341 | /* update global shmem state for sync rep */ |
| 1342 | SyncRepUpdateSyncStandbysDefined(); |
| 1343 | |
| 1344 | /* |
| 1345 | * If full_page_writes has been changed by SIGHUP, we update it in shared |
| 1346 | * memory and write an XLOG_FPW_CHANGE record. |
| 1347 | */ |
| 1348 | UpdateFullPageWrites(); |
| 1349 | |
| 1350 | elog(DEBUG2, "checkpointer updated shared memory configuration values");
| 1351 | } |
| 1352 | |
| 1353 | /* |
| 1354 | * FirstCallSinceLastCheckpoint allows a process to take an action once |
| 1355 | * per checkpoint cycle by asynchronously checking for checkpoint completion. |
| 1356 | */ |
| 1357 | bool |
| 1358 | FirstCallSinceLastCheckpoint(void) |
| 1359 | { |
| 1360 | static int ckpt_done = 0; |
| 1361 | int new_done; |
| 1362 | bool FirstCall = false; |
| 1363 | |
| 1364 | SpinLockAcquire(&CheckpointerShmem->ckpt_lck); |
| 1365 | new_done = CheckpointerShmem->ckpt_done; |
| 1366 | SpinLockRelease(&CheckpointerShmem->ckpt_lck); |
| 1367 | |
| 1368 | if (new_done != ckpt_done) |
| 1369 | FirstCall = true; |
| 1370 | |
| 1371 | ckpt_done = new_done; |
| 1372 | |
| 1373 | return FirstCall; |
| 1374 | } |
| 1375 | |