/*-------------------------------------------------------------------------
 *
 * sync.c
 *    File synchronization management code.
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *    src/backend/storage/sync/sync.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <unistd.h>
#include <fcntl.h>
#include <sys/file.h>

#include "access/xlog.h"
#include "access/xlogutils.h"
#include "commands/tablespace.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "portability/instr_time.h"
#include "postmaster/bgwriter.h"
#include "storage/bufmgr.h"
#include "storage/ipc.h"
#include "storage/md.h"
#include "utils/hsearch.h"
#include "utils/inval.h"
#include "utils/memutils.h"

/*
 * In some contexts (currently, standalone backends and the checkpointer)
 * we keep track of pending fsync operations: we need to remember all relation
 * segments that have been written since the last checkpoint, so that we can
 * fsync them down to disk before completing the next checkpoint.  This hash
 * table remembers the pending operations.  We use a hash table mostly as
 * a convenient way of merging duplicate requests.
 *
 * We use a similar mechanism to remember no-longer-needed files that can
 * be deleted after the next checkpoint, but we use a linked list instead of
 * a hash table, because we don't expect there to be any duplicate requests.
 *
 * These mechanisms are only used for non-temp relations; we never fsync
 * temp rels, nor do we need to postpone their deletion (see comments in
 * mdunlink).
 *
 * (Regular backends do not track pending operations locally, but forward
 * them to the checkpointer.)
 */
typedef uint16 CycleCtr;        /* can be any convenient integer size */
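
/*
 * CycleCtr values are compared only for equality, so wraparound is expected
 * and tolerated; see the wraparound notes in SyncPostCheckpoint() and
 * ProcessSyncRequests() below.
 */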

typedef struct
{
    FileTag     tag;            /* identifies handler and file */
    CycleCtr    cycle_ctr;      /* sync_cycle_ctr of oldest request */
    bool        canceled;       /* canceled is true if we canceled "recently" */
} PendingFsyncEntry;

typedef struct
{
    FileTag     tag;            /* identifies handler and file */
    CycleCtr    cycle_ctr;      /* checkpoint_cycle_ctr when request was made */
} PendingUnlinkEntry;

static HTAB *pendingOps = NULL;
static List *pendingUnlinks = NIL;
static MemoryContext pendingOpsCxt; /* context for the above */

static CycleCtr sync_cycle_ctr = 0;
static CycleCtr checkpoint_cycle_ctr = 0;

/* Intervals for calling AbsorbSyncRequests */
#define FSYNCS_PER_ABSORB       10
#define UNLINKS_PER_ABSORB      10
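
/*
 * That is, ProcessSyncRequests() and SyncPostCheckpoint() call
 * AbsorbSyncRequests() after every ten fsyncs or unlinks, so that the
 * checkpointer keeps draining the shared fsync request queue even while a
 * long checkpoint is in progress (see the notes at the call sites below).
 */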

/*
 * Function pointers for handling sync and unlink requests.
 */
typedef struct SyncOps
{
    int         (*sync_syncfiletag) (const FileTag *ftag, char *path);
    int         (*sync_unlinkfiletag) (const FileTag *ftag, char *path);
    bool        (*sync_filetagmatches) (const FileTag *ftag,
                                        const FileTag *candidate);
} SyncOps;

static const SyncOps syncsw[] = {
    /* magnetic disk */
    {
        .sync_syncfiletag = mdsyncfiletag,
        .sync_unlinkfiletag = mdunlinkfiletag,
        .sync_filetagmatches = mdfiletagmatches
    }
};
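
/*
 * A FileTag's "handler" field is used as an index into syncsw (see e.g.
 * ProcessSyncRequests below), so the entries here must appear in the same
 * order as the corresponding handler values declared in sync.h.
 */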

/*
 * Initialize data structures for the file sync tracking.
 */
void
InitSync(void)
{
    /*
     * Create pending-operations hashtable if we need it.  Currently, we need
     * it if we are standalone (not under a postmaster) or if we are a startup
     * or checkpointer auxiliary process.
     */
    if (!IsUnderPostmaster || AmStartupProcess() || AmCheckpointerProcess())
    {
        HASHCTL     hash_ctl;

        /*
         * XXX: The checkpointer needs to add entries to the pending ops table
         * when absorbing fsync requests.  That is done within a critical
         * section, which isn't usually allowed, but we make an exception. It
         * means that there's a theoretical possibility that you run out of
         * memory while absorbing fsync requests, which leads to a PANIC.
         * Fortunately the hash table is small so that's unlikely to happen in
         * practice.
         */
        pendingOpsCxt = AllocSetContextCreate(TopMemoryContext,
                                              "Pending ops context",
                                              ALLOCSET_DEFAULT_SIZES);
        MemoryContextAllowInCriticalSection(pendingOpsCxt, true);

        MemSet(&hash_ctl, 0, sizeof(hash_ctl));
        hash_ctl.keysize = sizeof(FileTag);
        hash_ctl.entrysize = sizeof(PendingFsyncEntry);
        hash_ctl.hcxt = pendingOpsCxt;
        pendingOps = hash_create("Pending Ops Table",
                                 100L,
                                 &hash_ctl,
                                 HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
        pendingUnlinks = NIL;
    }
}

/*
 * SyncPreCheckpoint() -- Do pre-checkpoint work
 *
 * To distinguish unlink requests that arrived before this checkpoint
 * started from those that arrived during the checkpoint, we use a cycle
 * counter similar to the one we use for fsync requests.  That cycle
 * counter is incremented here.
 *
 * This must be called *before* the checkpoint REDO point is determined.
 * That ensures that we won't delete files too soon.
 *
 * Note that we can't do anything here that depends on the assumption
 * that the checkpoint will be completed.
 */
void
SyncPreCheckpoint(void)
{
    /*
     * Any unlink requests arriving after this point will be assigned the next
     * cycle counter, and won't be unlinked until next checkpoint.
     */
    checkpoint_cycle_ctr++;
}

/*
 * SyncPostCheckpoint() -- Do post-checkpoint work
 *
 * Remove any lingering files that can now be safely removed.
 */
void
SyncPostCheckpoint(void)
{
    int         absorb_counter;

    absorb_counter = UNLINKS_PER_ABSORB;
    while (pendingUnlinks != NIL)
    {
        PendingUnlinkEntry *entry = (PendingUnlinkEntry *) linitial(pendingUnlinks);
        char        path[MAXPGPATH];

        /*
         * New entries are appended to the end, so if the entry is new we've
         * reached the end of old entries.
         *
         * Note: if just the right number of consecutive checkpoints fail, we
         * could be fooled here by cycle_ctr wraparound.  However, the only
         * consequence is that we'd delay unlinking for one more checkpoint,
         * which is perfectly tolerable.
         */
        if (entry->cycle_ctr == checkpoint_cycle_ctr)
            break;

        /* Unlink the file */
        if (syncsw[entry->tag.handler].sync_unlinkfiletag(&entry->tag,
                                                          path) < 0)
        {
            /*
             * There's a race condition when the database is dropped at the
             * same time that we process the pending unlink requests.  If the
             * DROP DATABASE deletes the file before we do, we will get ENOENT
             * here.  rmtree() also has to ignore ENOENT errors, to deal with
             * the possibility that we delete the file first.
             */
            if (errno != ENOENT)
                ereport(WARNING,
                        (errcode_for_file_access(),
                         errmsg("could not remove file \"%s\": %m", path)));
        }

        /* And remove the list entry */
        pendingUnlinks = list_delete_first(pendingUnlinks);
        pfree(entry);

        /*
         * As in ProcessSyncRequests, we don't want to stop absorbing fsync
         * requests for a long time when there are many deletions to be done.
         * We can safely call AbsorbSyncRequests() at this point in the loop
         * (note it might try to delete list entries).
         */
        if (--absorb_counter <= 0)
        {
            AbsorbSyncRequests();
            absorb_counter = UNLINKS_PER_ABSORB;
        }
    }
}

/*
 * ProcessSyncRequests() -- Process queued fsync requests.
 */
void
ProcessSyncRequests(void)
{
    static bool sync_in_progress = false;

    HASH_SEQ_STATUS hstat;
    PendingFsyncEntry *entry;
    int         absorb_counter;

    /* Statistics on sync times */
    int         processed = 0;
    instr_time  sync_start,
                sync_end,
                sync_diff;
    uint64      elapsed;
    uint64      longest = 0;
    uint64      total_elapsed = 0;

    /*
     * This is only called during checkpoints, and checkpoints should only
     * occur in processes that have created a pendingOps.
     */
    if (!pendingOps)
        elog(ERROR, "cannot sync without a pendingOps table");

    /*
     * If we are in the checkpointer, the sync had better include all fsync
     * requests that were queued by backends up to this point.  The tightest
     * race condition that could occur is that a buffer that must be written
     * and fsync'd for the checkpoint could have been dumped by a backend just
     * before it was visited by BufferSync().  We know the backend will have
     * queued an fsync request before clearing the buffer's dirtybit, so we
     * are safe as long as we do an Absorb after completing BufferSync().
     */
    AbsorbSyncRequests();

    /*
     * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
     * checkpoint), we want to ignore fsync requests that are entered into the
     * hashtable after this point --- they should be processed next time,
     * instead.  We use sync_cycle_ctr to tell old entries apart from new
     * ones: new ones will have cycle_ctr equal to the incremented value of
     * sync_cycle_ctr.
     *
     * In normal circumstances, all entries present in the table at this point
     * will have cycle_ctr exactly equal to the current (about to be old)
     * value of sync_cycle_ctr.  However, if we fail partway through the
     * fsync'ing loop, then older values of cycle_ctr might remain when we
     * come back here to try again.  Repeated checkpoint failures would
     * eventually wrap the counter around to the point where an old entry
     * might appear new, causing us to skip it, possibly allowing a checkpoint
     * to succeed that should not have.  To forestall wraparound, any time the
     * previous ProcessSyncRequests() failed to complete, run through the
     * table and forcibly set cycle_ctr = sync_cycle_ctr.
     *
     * Think not to merge this loop with the main loop, as the problem is
     * exactly that that loop may fail before having visited all the entries.
     * From a performance point of view it doesn't matter anyway, as this path
     * will never be taken in a system that's functioning normally.
     */
    if (sync_in_progress)
    {
        /* prior try failed, so update any stale cycle_ctr values */
        hash_seq_init(&hstat, pendingOps);
        while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
        {
            entry->cycle_ctr = sync_cycle_ctr;
        }
    }

    /* Advance counter so that new hashtable entries are distinguishable */
    sync_cycle_ctr++;

    /* Set flag to detect failure if we don't reach the end of the loop */
    sync_in_progress = true;

    /* Now scan the hashtable for fsync requests to process */
    absorb_counter = FSYNCS_PER_ABSORB;
    hash_seq_init(&hstat, pendingOps);
    while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
    {
        int         failures;

        /*
         * If the entry is new then don't process it this time; it will be
         * handled by the next checkpoint.  Note "continue" bypasses the
         * hash-remove call at the bottom of the loop.
         */
        if (entry->cycle_ctr == sync_cycle_ctr)
            continue;

        /* Else assert we haven't missed it */
        Assert((CycleCtr) (entry->cycle_ctr + 1) == sync_cycle_ctr);

        /*
         * If fsync is off then we don't have to bother opening the file at
         * all.  (We delay checking until this point so that changing fsync on
         * the fly behaves sensibly.)  Note that we must not "continue" in
         * that case: that would bypass the hash-remove call at the bottom of
         * the loop and leave a stale entry behind.
         */
        if (enableFsync)
        {
            /*
             * If in checkpointer, we want to absorb pending requests every so
             * often to prevent overflow of the fsync request queue.  It is
             * unspecified whether newly-added entries will be visited by
             * hash_seq_search, but we don't care since we don't need to
             * process them anyway.
             */
            if (--absorb_counter <= 0)
            {
                AbsorbSyncRequests();
                absorb_counter = FSYNCS_PER_ABSORB;
            }

            /*
             * The fsync table could contain requests to fsync segments that
             * have been deleted (unlinked) by the time we get to them. Rather
             * than just hoping an ENOENT (or EACCES on Windows) error can be
             * ignored, what we do on error is absorb pending requests and
             * then retry.  Since mdunlink() queues a "cancel" message before
             * actually unlinking, the fsync request is guaranteed to be
             * marked canceled after the absorb if it really was this case.
             * DROP DATABASE likewise has to tell us to forget fsync requests
             * before it starts deletions.
             */
            for (failures = 0; !entry->canceled; failures++)
            {
                char        path[MAXPGPATH];

                INSTR_TIME_SET_CURRENT(sync_start);
                if (syncsw[entry->tag.handler].sync_syncfiletag(&entry->tag,
                                                                path) == 0)
                {
                    /* Success; update statistics about sync timing */
                    INSTR_TIME_SET_CURRENT(sync_end);
                    sync_diff = sync_end;
                    INSTR_TIME_SUBTRACT(sync_diff, sync_start);
                    elapsed = INSTR_TIME_GET_MICROSEC(sync_diff);
                    if (elapsed > longest)
                        longest = elapsed;
                    total_elapsed += elapsed;
                    processed++;

                    if (log_checkpoints)
                        elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f msec",
                             processed,
                             path,
                             (double) elapsed / 1000);

                    break;      /* out of retry loop */
                }

                /*
                 * It is possible that the relation has been dropped or
                 * truncated since the fsync request was entered.  Therefore,
                 * allow ENOENT, but only if we didn't fail already on this
                 * file.
                 */
                if (!FILE_POSSIBLY_DELETED(errno) || failures > 0)
                    ereport(data_sync_elevel(ERROR),
                            (errcode_for_file_access(),
                             errmsg("could not fsync file \"%s\": %m",
                                    path)));
                else
                    ereport(DEBUG1,
                            (errcode_for_file_access(),
                             errmsg("could not fsync file \"%s\" but retrying: %m",
                                    path)));

                /*
                 * Absorb incoming requests and check to see if a cancel
                 * arrived for this relation fork.
                 */
                AbsorbSyncRequests();
                absorb_counter = FSYNCS_PER_ABSORB; /* might as well... */
            }                   /* end retry loop */
        }

        /* We are done with this entry, remove it */
        if (hash_search(pendingOps, &entry->tag, HASH_REMOVE, NULL) == NULL)
            elog(ERROR, "pendingOps corrupted");
    }                           /* end loop over hashtable entries */

    /* Return sync performance metrics for report at checkpoint end */
    CheckpointStats.ckpt_sync_rels = processed;
    CheckpointStats.ckpt_longest_sync = longest;
    CheckpointStats.ckpt_agg_sync_time = total_elapsed;
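
    /*
     * (When log_checkpoints is enabled, these statistics are included in the
     * checkpoint-complete message that xlog.c logs at the end of each
     * checkpoint.)
     */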

    /* Flag successful completion of ProcessSyncRequests */
    sync_in_progress = false;
}

/*
 * RememberSyncRequest() -- callback from checkpointer side of sync request
 *
 * We stuff fsync requests into the local hash table for execution
 * during the checkpointer's next checkpoint.  UNLINK requests go into a
 * separate linked list, however, because they get processed separately.
 *
 * See sync.h for more information on the types of sync requests supported.
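 *
 * The requests handled here are SYNC_REQUEST (fsync a file segment),
 * SYNC_FORGET_REQUEST (cancel a previously entered fsync request),
 * SYNC_FILTER_REQUEST (cancel all pending fsync and unlink requests that
 * match a given filter tag), and SYNC_UNLINK_REQUEST (delete a file after
 * the next checkpoint).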
 */
void
RememberSyncRequest(const FileTag *ftag, SyncRequestType type)
{
    Assert(pendingOps);

    if (type == SYNC_FORGET_REQUEST)
    {
        PendingFsyncEntry *entry;

        /* Cancel previously entered request */
        entry = (PendingFsyncEntry *) hash_search(pendingOps,
                                                  (void *) ftag,
                                                  HASH_FIND,
                                                  NULL);
        if (entry != NULL)
            entry->canceled = true;
    }
    else if (type == SYNC_FILTER_REQUEST)
    {
        HASH_SEQ_STATUS hstat;
        PendingFsyncEntry *entry;
        ListCell   *cell,
                   *prev,
                   *next;

        /* Cancel matching fsync requests */
        hash_seq_init(&hstat, pendingOps);
        while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
        {
            if (entry->tag.handler == ftag->handler &&
                syncsw[ftag->handler].sync_filetagmatches(ftag, &entry->tag))
                entry->canceled = true;
        }

        /* Remove matching unlink requests */
        prev = NULL;
        for (cell = list_head(pendingUnlinks); cell; cell = next)
        {
            PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);

            next = lnext(cell);
            if (entry->tag.handler == ftag->handler &&
                syncsw[ftag->handler].sync_filetagmatches(ftag, &entry->tag))
            {
                pendingUnlinks = list_delete_cell(pendingUnlinks, cell, prev);
                pfree(entry);
            }
            else
                prev = cell;
        }
    }
    else if (type == SYNC_UNLINK_REQUEST)
    {
        /* Unlink request: put it in the linked list */
        MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
        PendingUnlinkEntry *entry;

        entry = palloc(sizeof(PendingUnlinkEntry));
        entry->tag = *ftag;
        entry->cycle_ctr = checkpoint_cycle_ctr;

        pendingUnlinks = lappend(pendingUnlinks, entry);

        MemoryContextSwitchTo(oldcxt);
    }
    else
    {
        /* Normal case: enter a request to fsync this segment */
        MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
        PendingFsyncEntry *entry;
        bool        found;

        Assert(type == SYNC_REQUEST);

        entry = (PendingFsyncEntry *) hash_search(pendingOps,
                                                  (void *) ftag,
                                                  HASH_ENTER,
                                                  &found);
        /* if new entry, initialize it */
        if (!found)
        {
            entry->cycle_ctr = sync_cycle_ctr;
            entry->canceled = false;
        }

        /*
         * NB: it's intentional that we don't change cycle_ctr if the entry
         * already exists.  The cycle_ctr must represent the oldest fsync
         * request that could be in the entry.
         */

        MemoryContextSwitchTo(oldcxt);
    }
}

/*
 * Register the sync request locally, or forward it to the checkpointer.
 *
 * If retryOnError is true, we'll keep trying if there is no space in the
 * queue.  Return true if we succeeded, or false if there wasn't space.
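 *
 * For example (judging from the callers in md.c): a dirty segment is
 * registered with type SYNC_REQUEST and retryOnError = false, and the caller
 * fsyncs the file itself if we return false; unlink requests are registered
 * with retryOnError = true, since giving up would leave the no-longer-used
 * file behind on disk.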
 */
bool
RegisterSyncRequest(const FileTag *ftag, SyncRequestType type,
                    bool retryOnError)
{
    bool        ret;

    if (pendingOps != NULL)
    {
        /* standalone backend or startup process: fsync state is local */
        RememberSyncRequest(ftag, type);
        return true;
    }

    for (;;)
    {
        /*
         * Notify the checkpointer about it.  If we fail to queue a message in
         * retryOnError mode, we have to sleep and try again ... ugly, but
         * hopefully won't happen often.
         *
         * XXX should we CHECK_FOR_INTERRUPTS in this loop?  Escaping with an
         * error in the case of SYNC_UNLINK_REQUEST would leave the
         * no-longer-used file still present on disk, which would be bad, so
         * I'm inclined to assume that the checkpointer will always empty the
         * queue soon.
         */
        ret = ForwardSyncRequest(ftag, type);

        /*
         * If we are successful in queueing the request, or we failed and were
         * instructed not to retry on error, break.
         */
        if (ret || (!ret && !retryOnError))
            break;

        pg_usleep(10000L);
    }

    return ret;
}

/*
 * In archive recovery, we rely on checkpointer to do fsyncs, but we will have
 * already created the pendingOps during initialization of the startup
 * process.  Calling this function drops the local pendingOps so that
 * subsequent requests will be forwarded to checkpointer.
 */
void
EnableSyncRequestForwarding(void)
{
    /* Perform any pending fsyncs we may have queued up, then drop table */
    if (pendingOps)
    {
        ProcessSyncRequests();
        hash_destroy(pendingOps);
    }
    pendingOps = NULL;

    /*
     * We should not have any pending unlink requests, since mdunlink doesn't
     * queue unlink requests when isRedo.
     */
    Assert(pendingUnlinks == NIL);
}