/*-------------------------------------------------------------------------
 *
 * sync.c
 *	  File synchronization management code.
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/storage/sync/sync.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <unistd.h>
#include <fcntl.h>
#include <sys/file.h>

#include "miscadmin.h"
#include "pgstat.h"
#include "access/xlog.h"
#include "access/xlogutils.h"
#include "commands/tablespace.h"
#include "portability/instr_time.h"
#include "postmaster/bgwriter.h"
#include "storage/bufmgr.h"
#include "storage/ipc.h"
#include "storage/md.h"
#include "utils/hsearch.h"
#include "utils/inval.h"
#include "utils/memutils.h"

/*
 * In some contexts (currently, standalone backends and the checkpointer)
 * we keep track of pending fsync operations: we need to remember all relation
 * segments that have been written since the last checkpoint, so that we can
 * fsync them down to disk before completing the next checkpoint. This hash
 * table remembers the pending operations. We use a hash table mostly as
 * a convenient way of merging duplicate requests.
 *
 * We use a similar mechanism to remember no-longer-needed files that can
 * be deleted after the next checkpoint, but we use a linked list instead of
 * a hash table, because we don't expect there to be any duplicate requests.
 *
 * These mechanisms are only used for non-temp relations; we never fsync
 * temp rels, nor do we need to postpone their deletion (see comments in
 * mdunlink).
 *
 * (Regular backends do not track pending operations locally, but forward
 * them to the checkpointer.)
 */
typedef uint16 CycleCtr;		/* can be any convenient integer size */

typedef struct
{
	FileTag		tag;			/* identifies handler and file */
	CycleCtr	cycle_ctr;		/* sync_cycle_ctr of oldest request */
	bool		canceled;		/* canceled is true if we canceled "recently" */
} PendingFsyncEntry;

typedef struct
{
	FileTag		tag;			/* identifies handler and file */
	CycleCtr	cycle_ctr;		/* checkpoint_cycle_ctr when request was made */
} PendingUnlinkEntry;

static HTAB *pendingOps = NULL;
static List *pendingUnlinks = NIL;
static MemoryContext pendingOpsCxt; /* context for the above */

static CycleCtr sync_cycle_ctr = 0;
static CycleCtr checkpoint_cycle_ctr = 0;

/* Intervals for calling AbsorbSyncRequests */
#define FSYNCS_PER_ABSORB		10
#define UNLINKS_PER_ABSORB		10

/*
 * Function pointers for handling sync and unlink requests.
 */
typedef struct SyncOps
{
	int			(*sync_syncfiletag) (const FileTag *ftag, char *path);
	int			(*sync_unlinkfiletag) (const FileTag *ftag, char *path);
	bool		(*sync_filetagmatches) (const FileTag *ftag,
										const FileTag *candidate);
} SyncOps;

static const SyncOps syncsw[] = {
	/* magnetic disk */
	{
		.sync_syncfiletag = mdsyncfiletag,
		.sync_unlinkfiletag = mdunlinkfiletag,
		.sync_filetagmatches = mdfiletagmatches
	}
};
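
/*
 * A hedged sketch, not compiled: supporting a second kind of file would mean
 * adding a value to the SyncRequestHandler enum (see sync.h) and appending a
 * matching row to syncsw[]; requests are then dispatched by indexing the
 * array with FileTag.handler, as done below in SyncPostCheckpoint() and
 * ProcessSyncRequests(). The "foo" callbacks are hypothetical names used
 * only to illustrate the shape of such an entry.
 */
#if 0
	/* hypothetical second handler entry */
	{
		.sync_syncfiletag = foo_syncfiletag,
		.sync_unlinkfiletag = foo_unlinkfiletag,
		.sync_filetagmatches = foo_filetagmatches
	}
#endif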

/*
 * Initialize data structures for the file sync tracking.
 */
void
InitSync(void)
{
	/*
	 * Create pending-operations hashtable if we need it. Currently, we need
	 * it if we are standalone (not under a postmaster) or if we are a startup
	 * or checkpointer auxiliary process.
	 */
	if (!IsUnderPostmaster || AmStartupProcess() || AmCheckpointerProcess())
	{
		HASHCTL		hash_ctl;

		/*
		 * XXX: The checkpointer needs to add entries to the pending ops table
		 * when absorbing fsync requests. That is done within a critical
		 * section, which isn't usually allowed, but we make an exception. It
		 * means that there's a theoretical possibility that you run out of
		 * memory while absorbing fsync requests, which leads to a PANIC.
		 * Fortunately the hash table is small so that's unlikely to happen in
		 * practice.
		 */
		pendingOpsCxt = AllocSetContextCreate(TopMemoryContext,
											  "Pending ops context",
											  ALLOCSET_DEFAULT_SIZES);
		MemoryContextAllowInCriticalSection(pendingOpsCxt, true);

		MemSet(&hash_ctl, 0, sizeof(hash_ctl));
		hash_ctl.keysize = sizeof(FileTag);
		hash_ctl.entrysize = sizeof(PendingFsyncEntry);
		hash_ctl.hcxt = pendingOpsCxt;
		pendingOps = hash_create("Pending Ops Table",
								 100L,
								 &hash_ctl,
								 HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
		pendingUnlinks = NIL;
	}
}

/*
 * SyncPreCheckpoint() -- Do pre-checkpoint work
 *
 * To distinguish unlink requests that arrived before this checkpoint
 * started from those that arrived during the checkpoint, we use a cycle
 * counter similar to the one we use for fsync requests. That cycle
 * counter is incremented here.
 *
 * This must be called *before* the checkpoint REDO point is determined.
 * That ensures that we won't delete files too soon.
 *
 * Note that we can't do anything here that depends on the assumption
 * that the checkpoint will be completed.
 */
void
SyncPreCheckpoint(void)
{
	/*
	 * Any unlink requests arriving after this point will be assigned the next
	 * cycle counter, and won't be unlinked until next checkpoint.
	 */
	checkpoint_cycle_ctr++;
}
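
/*
 * For orientation, a hedged sketch of how the checkpointer is expected to
 * drive these entry points (see CreateCheckPoint() and CheckPointBuffers()
 * for the authoritative sequence; the ordering below is approximate):
 *
 *		SyncPreCheckpoint();	-- bump checkpoint_cycle_ctr, before REDO point
 *		... establish the REDO point, write dirty buffers (BufferSync) ...
 *		ProcessSyncRequests();	-- fsync everything queued so far
 *		... write the checkpoint record, update the control file ...
 *		SyncPostCheckpoint();	-- old files are now safe to unlink
 */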

/*
 * SyncPostCheckpoint() -- Do post-checkpoint work
 *
 * Remove any lingering files that can now be safely removed.
 */
void
SyncPostCheckpoint(void)
{
	int			absorb_counter;

	absorb_counter = UNLINKS_PER_ABSORB;
	while (pendingUnlinks != NIL)
	{
		PendingUnlinkEntry *entry = (PendingUnlinkEntry *) linitial(pendingUnlinks);
		char		path[MAXPGPATH];

		/*
		 * New entries are appended to the end, so if the entry is new we've
		 * reached the end of old entries.
		 *
		 * Note: if just the right number of consecutive checkpoints fail, we
		 * could be fooled here by cycle_ctr wraparound. However, the only
		 * consequence is that we'd delay unlinking for one more checkpoint,
		 * which is perfectly tolerable.
		 */
		if (entry->cycle_ctr == checkpoint_cycle_ctr)
			break;

		/* Unlink the file */
		if (syncsw[entry->tag.handler].sync_unlinkfiletag(&entry->tag,
														  path) < 0)
		{
			/*
			 * There's a race condition, when the database is dropped at the
			 * same time that we process the pending unlink requests. If the
			 * DROP DATABASE deletes the file before we do, we will get ENOENT
			 * here. rmtree() also has to ignore ENOENT errors, to deal with
			 * the possibility that we delete the file first.
			 */
			if (errno != ENOENT)
				ereport(WARNING,
						(errcode_for_file_access(),
						 errmsg("could not remove file \"%s\": %m", path)));
		}

		/* And remove the list entry */
		pendingUnlinks = list_delete_first(pendingUnlinks);
		pfree(entry);

		/*
		 * As in ProcessSyncRequests, we don't want to stop absorbing fsync
		 * requests for a long time when there are many deletions to be done.
		 * We can safely call AbsorbSyncRequests() at this point in the loop
		 * (note it might try to delete list entries).
		 */
		if (--absorb_counter <= 0)
		{
			AbsorbSyncRequests();
			absorb_counter = UNLINKS_PER_ABSORB;
		}
	}
}

/*
 * ProcessSyncRequests() -- Process queued fsync requests.
 */
void
ProcessSyncRequests(void)
{
	static bool sync_in_progress = false;

	HASH_SEQ_STATUS hstat;
	PendingFsyncEntry *entry;
	int			absorb_counter;

	/* Statistics on sync times */
	int			processed = 0;
	instr_time	sync_start,
				sync_end,
				sync_diff;
	uint64		elapsed;
	uint64		longest = 0;
	uint64		total_elapsed = 0;

	/*
	 * This is only called during checkpoints, and checkpoints should only
	 * occur in processes that have created a pendingOps.
	 */
	if (!pendingOps)
		elog(ERROR, "cannot sync without a pendingOps table");

	/*
	 * If we are in the checkpointer, the sync had better include all fsync
	 * requests that were queued by backends up to this point. The tightest
	 * race condition that could occur is that a buffer that must be written
	 * and fsync'd for the checkpoint could have been dumped by a backend just
	 * before it was visited by BufferSync(). We know the backend will have
	 * queued an fsync request before clearing the buffer's dirtybit, so we
	 * are safe as long as we do an Absorb after completing BufferSync().
	 */
	AbsorbSyncRequests();

	/*
	 * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
	 * checkpoint), we want to ignore fsync requests that are entered into the
	 * hashtable after this point --- they should be processed next time,
	 * instead. We use sync_cycle_ctr to tell old entries apart from new
	 * ones: new ones will have cycle_ctr equal to the incremented value of
	 * sync_cycle_ctr.
	 *
	 * In normal circumstances, all entries present in the table at this point
	 * will have cycle_ctr exactly equal to the current (about to be old)
	 * value of sync_cycle_ctr. However, if we fail partway through the
	 * fsync'ing loop, then older values of cycle_ctr might remain when we
	 * come back here to try again. Repeated checkpoint failures would
	 * eventually wrap the counter around to the point where an old entry
	 * might appear new, causing us to skip it, possibly allowing a checkpoint
	 * to succeed that should not have. To forestall wraparound, any time the
	 * previous ProcessSyncRequests() failed to complete, run through the
	 * table and forcibly set cycle_ctr = sync_cycle_ctr.
	 *
	 * Think not to merge this loop with the main loop, as the problem is
	 * exactly that that loop may fail before having visited all the entries.
	 * From a performance point of view it doesn't matter anyway, as this path
	 * will never be taken in a system that's functioning normally.
	 */
	if (sync_in_progress)
	{
		/* prior try failed, so update any stale cycle_ctr values */
		hash_seq_init(&hstat, pendingOps);
		while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
		{
			entry->cycle_ctr = sync_cycle_ctr;
		}
	}

	/* Advance counter so that new hashtable entries are distinguishable */
	sync_cycle_ctr++;

	/* Set flag to detect failure if we don't reach the end of the loop */
	sync_in_progress = true;

	/* Now scan the hashtable for fsync requests to process */
	absorb_counter = FSYNCS_PER_ABSORB;
	hash_seq_init(&hstat, pendingOps);
	while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
	{
		int			failures;

		/*
		 * If fsync is off then we don't have to bother opening the file at
		 * all. (We delay checking until this point so that changing fsync on
		 * the fly behaves sensibly.)
		 */
		if (!enableFsync)
			continue;

		/*
		 * If the entry is new then don't process it this time; it will be
		 * handled by the next checkpoint's sync cycle. Note "continue"
		 * bypasses the hash-remove call at the bottom of the loop.
		 */
		if (entry->cycle_ctr == sync_cycle_ctr)
			continue;

		/* Else assert we haven't missed it */
		Assert((CycleCtr) (entry->cycle_ctr + 1) == sync_cycle_ctr);

		/*
		 * If in checkpointer, we want to absorb pending requests every so
		 * often to prevent overflow of the fsync request queue. It is
		 * unspecified whether newly-added entries will be visited by
		 * hash_seq_search, but we don't care since we don't need to process
		 * them anyway.
		 */
		if (--absorb_counter <= 0)
		{
			AbsorbSyncRequests();
			absorb_counter = FSYNCS_PER_ABSORB;
		}

		/*
		 * The fsync table could contain requests to fsync segments that have
		 * been deleted (unlinked) by the time we get to them. Rather than
		 * just hoping an ENOENT (or EACCES on Windows) error can be ignored,
		 * what we do on error is absorb pending requests and then retry.
		 * Since mdunlink() queues a "cancel" message before actually
		 * unlinking, the fsync request is guaranteed to be marked canceled
		 * after the absorb if that is indeed what happened. DROP DATABASE
		 * likewise has to tell us to forget fsync requests before it starts
		 * deletions.
		 */
		for (failures = 0; !entry->canceled; failures++)
		{
			char		path[MAXPGPATH];

			INSTR_TIME_SET_CURRENT(sync_start);
			if (syncsw[entry->tag.handler].sync_syncfiletag(&entry->tag,
															path) == 0)
			{
				/* Success; update statistics about sync timing */
				INSTR_TIME_SET_CURRENT(sync_end);
				sync_diff = sync_end;
				INSTR_TIME_SUBTRACT(sync_diff, sync_start);
				elapsed = INSTR_TIME_GET_MICROSEC(sync_diff);
				if (elapsed > longest)
					longest = elapsed;
				total_elapsed += elapsed;
				processed++;

				if (log_checkpoints)
					elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f msec",
						 processed,
						 path,
						 (double) elapsed / 1000);

				break;			/* out of retry loop */
			}

			/*
			 * It is possible that the relation has been dropped or truncated
			 * since the fsync request was entered. Therefore, allow ENOENT,
			 * but only if we didn't fail already on this file.
			 */
			if (!FILE_POSSIBLY_DELETED(errno) || failures > 0)
				ereport(data_sync_elevel(ERROR),
						(errcode_for_file_access(),
						 errmsg("could not fsync file \"%s\": %m",
								path)));
			else
				ereport(DEBUG1,
						(errcode_for_file_access(),
						 errmsg("could not fsync file \"%s\" but retrying: %m",
								path)));

			/*
			 * Absorb incoming requests and check to see if a cancel arrived
			 * for this relation fork.
			 */
			AbsorbSyncRequests();
			absorb_counter = FSYNCS_PER_ABSORB; /* might as well... */
		}						/* end retry loop */

		/* We are done with this entry, remove it */
		if (hash_search(pendingOps, &entry->tag, HASH_REMOVE, NULL) == NULL)
			elog(ERROR, "pendingOps corrupted");
	}							/* end loop over hashtable entries */

	/* Return sync performance metrics for report at checkpoint end */
	CheckpointStats.ckpt_sync_rels = processed;
	CheckpointStats.ckpt_longest_sync = longest;
	CheckpointStats.ckpt_agg_sync_time = total_elapsed;

	/* Flag successful completion of ProcessSyncRequests */
	sync_in_progress = false;
}
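
/*
 * Note on the error level used above: data_sync_elevel() (defined in fd.c)
 * escalates an fsync failure to PANIC unless the data_sync_retry GUC is set,
 * because a failed fsync cannot simply be retried once the kernel may have
 * discarded the dirty pages. A hedged sketch of its behavior, paraphrased
 * from fd.c:
 */
#if 0
int
data_sync_elevel(int elevel)
{
	return data_sync_retry ? elevel : PANIC;
}
#endif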

/*
 * RememberSyncRequest() -- callback from checkpointer side of sync request
 *
 * We stuff fsync requests into the local hash table for execution
 * during the checkpointer's next checkpoint. UNLINK requests go into a
 * separate linked list, however, because they get processed separately.
 *
 * See sync.h for more information on the types of sync requests supported.
 */
void
RememberSyncRequest(const FileTag *ftag, SyncRequestType type)
{
	Assert(pendingOps);

	if (type == SYNC_FORGET_REQUEST)
	{
		PendingFsyncEntry *entry;

		/* Cancel previously entered request */
		entry = (PendingFsyncEntry *) hash_search(pendingOps,
												  (void *) ftag,
												  HASH_FIND,
												  NULL);
		if (entry != NULL)
			entry->canceled = true;
	}
	else if (type == SYNC_FILTER_REQUEST)
	{
		HASH_SEQ_STATUS hstat;
		PendingFsyncEntry *entry;
		ListCell   *cell,
				   *prev,
				   *next;

		/* Cancel matching fsync requests */
		hash_seq_init(&hstat, pendingOps);
		while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
		{
			if (entry->tag.handler == ftag->handler &&
				syncsw[ftag->handler].sync_filetagmatches(ftag, &entry->tag))
				entry->canceled = true;
		}

		/* Remove matching unlink requests */
		prev = NULL;
		for (cell = list_head(pendingUnlinks); cell; cell = next)
		{
			PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);

			next = lnext(cell);
			if (entry->tag.handler == ftag->handler &&
				syncsw[ftag->handler].sync_filetagmatches(ftag, &entry->tag))
			{
				pendingUnlinks = list_delete_cell(pendingUnlinks, cell, prev);
				pfree(entry);
			}
			else
				prev = cell;
		}
	}
	else if (type == SYNC_UNLINK_REQUEST)
	{
		/* Unlink request: put it in the linked list */
		MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
		PendingUnlinkEntry *entry;

		entry = palloc(sizeof(PendingUnlinkEntry));
		entry->tag = *ftag;
		entry->cycle_ctr = checkpoint_cycle_ctr;

		pendingUnlinks = lappend(pendingUnlinks, entry);

		MemoryContextSwitchTo(oldcxt);
	}
	else
	{
		/* Normal case: enter a request to fsync this segment */
		MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
		PendingFsyncEntry *entry;
		bool		found;

		Assert(type == SYNC_REQUEST);

		entry = (PendingFsyncEntry *) hash_search(pendingOps,
												  (void *) ftag,
												  HASH_ENTER,
												  &found);
		/* if new entry, initialize it */
		if (!found)
		{
			entry->cycle_ctr = sync_cycle_ctr;
			entry->canceled = false;
		}

		/*
		 * NB: it's intentional that we don't change cycle_ctr if the entry
		 * already exists. The cycle_ctr must represent the oldest fsync
		 * request that could be in the entry.
		 */

		MemoryContextSwitchTo(oldcxt);
	}
}
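
/*
 * A hedged sketch of the cancel-before-unlink protocol that
 * ProcessSyncRequests relies on when it encounters a missing file. The names
 * are approximate, paraphrasing md.c's unlink path; each step reaches this
 * module through RegisterSyncRequest():
 */
#if 0
	register_forget_request(rnode, forknum, segno);	/* SYNC_FORGET_REQUEST */
	/* ... unlink or truncate the first segment, remove later segments ... */
	register_unlink(rnode);							/* SYNC_UNLINK_REQUEST */
#endif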

/*
 * Register the sync request locally, or forward it to the checkpointer.
 *
 * If retryOnError is true, we'll keep trying if there is no space in the
 * queue. Return true if we succeeded, or false if there wasn't space.
 */
bool
RegisterSyncRequest(const FileTag *ftag, SyncRequestType type,
					bool retryOnError)
{
	bool		ret;

	if (pendingOps != NULL)
	{
		/* standalone backend or startup process: fsync state is local */
		RememberSyncRequest(ftag, type);
		return true;
	}

	for (;;)
	{
		/*
		 * Notify the checkpointer about it. If we fail to queue a message in
		 * retryOnError mode, we have to sleep and try again ... ugly, but
		 * hopefully won't happen often.
		 *
		 * XXX should we CHECK_FOR_INTERRUPTS in this loop? Escaping with an
		 * error in the case of SYNC_UNLINK_REQUEST would leave the
		 * no-longer-used file still present on disk, which would be bad, so
		 * I'm inclined to assume that the checkpointer will always empty the
		 * queue soon.
		 */
		ret = ForwardSyncRequest(ftag, type);

		/*
		 * If we are successful in queueing the request, or we failed and were
		 * instructed not to retry on error, break.
		 */
		if (ret || (!ret && !retryOnError))
			break;

		pg_usleep(10000L);
	}

	return ret;
}
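
/*
 * A hedged usage sketch (not compiled): how a storage manager typically
 * queues an fsync request, modeled loosely on md.c's register_dirty_segment().
 * The reln/forknum/segno variables are assumed to be in the caller's scope;
 * the FileTag layout is the one declared in sync.h. The tag is zeroed first
 * because it is used as a hash key (HASH_BLOBS), so padding bytes must be
 * well-defined.
 */
#if 0
	FileTag		tag;

	memset(&tag, 0, sizeof(tag));
	tag.handler = SYNC_HANDLER_MD;
	tag.rnode = reln->smgr_rnode.node;
	tag.forknum = forknum;
	tag.segno = segno;

	if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ ))
	{
		/* Queue was full and we may not wait: fsync the segment ourselves. */
	}
#endif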

/*
 * In archive recovery, we rely on checkpointer to do fsyncs, but we will have
 * already created the pendingOps during initialization of the startup
 * process. Calling this function drops the local pendingOps so that
 * subsequent requests will be forwarded to checkpointer.
 */
void
EnableSyncRequestForwarding(void)
{
	/* Perform any pending fsyncs we may have queued up, then drop table */
	if (pendingOps)
	{
		ProcessSyncRequests();
		hash_destroy(pendingOps);
	}
	pendingOps = NULL;

	/*
	 * We should not have any pending unlink requests, since mdunlink doesn't
	 * queue unlink requests when isRedo.
	 */
	Assert(pendingUnlinks == NIL);
}