1 | /*------------------------------------------------------------------------- |
2 | * |
3 | * sync.c |
4 | * File synchronization management code. |
5 | * |
6 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
7 | * Portions Copyright (c) 1994, Regents of the University of California |
8 | * |
9 | * |
10 | * IDENTIFICATION |
11 | * src/backend/storage/sync/sync.c |
12 | * |
13 | *------------------------------------------------------------------------- |
14 | */ |
15 | #include "postgres.h" |
16 | |
17 | #include <unistd.h> |
18 | #include <fcntl.h> |
19 | #include <sys/file.h> |
20 | |
21 | #include "miscadmin.h" |
22 | #include "pgstat.h" |
23 | #include "access/xlogutils.h" |
24 | #include "access/xlog.h" |
25 | #include "commands/tablespace.h" |
26 | #include "portability/instr_time.h" |
27 | #include "postmaster/bgwriter.h" |
28 | #include "storage/bufmgr.h" |
29 | #include "storage/ipc.h" |
30 | #include "storage/md.h" |
31 | #include "utils/hsearch.h" |
32 | #include "utils/memutils.h" |
33 | #include "utils/inval.h" |
34 | |
35 | static MemoryContext pendingOpsCxt; /* context for the pending ops state */ |
36 | |
37 | /* |
38 | * In some contexts (currently, standalone backends and the checkpointer) |
39 | * we keep track of pending fsync operations: we need to remember all relation |
40 | * segments that have been written since the last checkpoint, so that we can |
41 | * fsync them down to disk before completing the next checkpoint. This hash |
42 | * table remembers the pending operations. We use a hash table mostly as |
43 | * a convenient way of merging duplicate requests. |
44 | * |
45 | * We use a similar mechanism to remember no-longer-needed files that can |
46 | * be deleted after the next checkpoint, but we use a linked list instead of |
47 | * a hash table, because we don't expect there to be any duplicate requests. |
48 | * |
49 | * These mechanisms are only used for non-temp relations; we never fsync |
50 | * temp rels, nor do we need to postpone their deletion (see comments in |
51 | * mdunlink). |
52 | * |
53 | * (Regular backends do not track pending operations locally, but forward |
54 | * them to the checkpointer.) |
55 | */ |
56 | typedef uint16 CycleCtr; /* can be any convenient integer size */ |
57 | |
58 | typedef struct |
59 | { |
60 | FileTag tag; /* identifies handler and file */ |
61 | CycleCtr cycle_ctr; /* sync_cycle_ctr of oldest request */ |
	bool		canceled;		/* true if request canceled, not yet removed */
63 | } PendingFsyncEntry; |
64 | |
65 | typedef struct |
66 | { |
67 | FileTag tag; /* identifies handler and file */ |
68 | CycleCtr cycle_ctr; /* checkpoint_cycle_ctr when request was made */ |
69 | } PendingUnlinkEntry; |
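
/*
 * Note: the whole FileTag is used as the hash key below (InitSync sets
 * keysize = sizeof(FileTag) with HASH_BLOBS), so tags are hashed and
 * compared byte-by-byte.  Code that builds tags is therefore expected to
 * zero the entire struct first (as md.c does); otherwise padding bytes
 * could make otherwise-identical requests look different.
 */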
70 | |
71 | static HTAB *pendingOps = NULL; |
72 | static List *pendingUnlinks = NIL; |
74 | |
75 | static CycleCtr sync_cycle_ctr = 0; |
76 | static CycleCtr checkpoint_cycle_ctr = 0; |
77 | |
78 | /* Intervals for calling AbsorbSyncRequests */ |
79 | #define FSYNCS_PER_ABSORB 10 |
80 | #define UNLINKS_PER_ABSORB 10 |
81 | |
82 | /* |
83 | * Function pointers for handling sync and unlink requests. |
84 | */ |
85 | typedef struct SyncOps |
86 | { |
87 | int (*sync_syncfiletag) (const FileTag *ftag, char *path); |
88 | int (*sync_unlinkfiletag) (const FileTag *ftag, char *path); |
89 | bool (*sync_filetagmatches) (const FileTag *ftag, |
90 | const FileTag *candidate); |
91 | } SyncOps; |
92 | |
93 | static const SyncOps syncsw[] = { |
94 | /* magnetic disk */ |
95 | { |
96 | .sync_syncfiletag = mdsyncfiletag, |
97 | .sync_unlinkfiletag = mdunlinkfiletag, |
98 | .sync_filetagmatches = mdfiletagmatches |
99 | } |
100 | }; |
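
/*
 * A FileTag's handler field is used directly as an index into syncsw[], so
 * this array must stay in step with the SyncRequestHandler values declared
 * in sync.h (currently only SYNC_HANDLER_MD).  A hypothetical additional
 * handler would plug in by appending its callbacks here, e.g. (sketch only,
 * these functions do not exist):
 *
 *	{
 *		.sync_syncfiletag = foo_syncfiletag,
 *		.sync_unlinkfiletag = foo_unlinkfiletag,
 *		.sync_filetagmatches = foo_filetagmatches
 *	}
 */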
101 | |
102 | /* |
103 | * Initialize data structures for the file sync tracking. |
104 | */ |
105 | void |
106 | InitSync(void) |
107 | { |
108 | /* |
109 | * Create pending-operations hashtable if we need it. Currently, we need |
110 | * it if we are standalone (not under a postmaster) or if we are a startup |
111 | * or checkpointer auxiliary process. |
112 | */ |
113 | if (!IsUnderPostmaster || AmStartupProcess() || AmCheckpointerProcess()) |
114 | { |
115 | HASHCTL hash_ctl; |
116 | |
117 | /* |
118 | * XXX: The checkpointer needs to add entries to the pending ops table |
119 | * when absorbing fsync requests. That is done within a critical |
120 | * section, which isn't usually allowed, but we make an exception. It |
121 | * means that there's a theoretical possibility that you run out of |
122 | * memory while absorbing fsync requests, which leads to a PANIC. |
123 | * Fortunately the hash table is small so that's unlikely to happen in |
124 | * practice. |
125 | */ |
126 | pendingOpsCxt = AllocSetContextCreate(TopMemoryContext, |
											  "Pending ops context",
128 | ALLOCSET_DEFAULT_SIZES); |
129 | MemoryContextAllowInCriticalSection(pendingOpsCxt, true); |
130 | |
131 | MemSet(&hash_ctl, 0, sizeof(hash_ctl)); |
132 | hash_ctl.keysize = sizeof(FileTag); |
133 | hash_ctl.entrysize = sizeof(PendingFsyncEntry); |
134 | hash_ctl.hcxt = pendingOpsCxt; |
		pendingOps = hash_create("Pending Ops Table",
136 | 100L, |
137 | &hash_ctl, |
138 | HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); |
139 | pendingUnlinks = NIL; |
140 | } |
}
143 | |
144 | /* |
145 | * SyncPreCheckpoint() -- Do pre-checkpoint work |
146 | * |
147 | * To distinguish unlink requests that arrived before this checkpoint |
148 | * started from those that arrived during the checkpoint, we use a cycle |
149 | * counter similar to the one we use for fsync requests. That cycle |
150 | * counter is incremented here. |
151 | * |
152 | * This must be called *before* the checkpoint REDO point is determined. |
153 | * That ensures that we won't delete files too soon. |
154 | * |
155 | * Note that we can't do anything here that depends on the assumption |
156 | * that the checkpoint will be completed. |
157 | */ |
158 | void |
159 | SyncPreCheckpoint(void) |
160 | { |
161 | /* |
162 | * Any unlink requests arriving after this point will be assigned the next |
163 | * cycle counter, and won't be unlinked until next checkpoint. |
164 | */ |
165 | checkpoint_cycle_ctr++; |
166 | } |
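
/*
 * Worked example (numbers arbitrary): if checkpoint_cycle_ctr goes from 41
 * to 42 here, any unlink request remembered from now on is stamped 42.
 * SyncPostCheckpoint() stops at the first entry stamped with the current
 * value, so such a file survives this checkpoint and is removed only after
 * the following one completes.
 */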
167 | |
168 | /* |
169 | * SyncPostCheckpoint() -- Do post-checkpoint work |
170 | * |
171 | * Remove any lingering files that can now be safely removed. |
172 | */ |
173 | void |
174 | SyncPostCheckpoint(void) |
175 | { |
176 | int absorb_counter; |
177 | |
178 | absorb_counter = UNLINKS_PER_ABSORB; |
179 | while (pendingUnlinks != NIL) |
180 | { |
181 | PendingUnlinkEntry *entry = (PendingUnlinkEntry *) linitial(pendingUnlinks); |
182 | char path[MAXPGPATH]; |
183 | |
184 | /* |
185 | * New entries are appended to the end, so if the entry is new we've |
186 | * reached the end of old entries. |
187 | * |
188 | * Note: if just the right number of consecutive checkpoints fail, we |
189 | * could be fooled here by cycle_ctr wraparound. However, the only |
190 | * consequence is that we'd delay unlinking for one more checkpoint, |
191 | * which is perfectly tolerable. |
192 | */ |
193 | if (entry->cycle_ctr == checkpoint_cycle_ctr) |
194 | break; |
195 | |
196 | /* Unlink the file */ |
197 | if (syncsw[entry->tag.handler].sync_unlinkfiletag(&entry->tag, |
198 | path) < 0) |
199 | { |
200 | /* |
201 | * There's a race condition, when the database is dropped at the |
202 | * same time that we process the pending unlink requests. If the |
203 | * DROP DATABASE deletes the file before we do, we will get ENOENT |
204 | * here. rmtree() also has to ignore ENOENT errors, to deal with |
205 | * the possibility that we delete the file first. |
206 | */ |
207 | if (errno != ENOENT) |
208 | ereport(WARNING, |
209 | (errcode_for_file_access(), |
						 errmsg("could not remove file \"%s\": %m", path)));
211 | } |
212 | |
213 | /* And remove the list entry */ |
214 | pendingUnlinks = list_delete_first(pendingUnlinks); |
215 | pfree(entry); |
216 | |
217 | /* |
218 | * As in ProcessSyncRequests, we don't want to stop absorbing fsync |
	 * requests for a long time when there are many deletions to be done.
220 | * We can safely call AbsorbSyncRequests() at this point in the loop |
221 | * (note it might try to delete list entries). |
222 | */ |
223 | if (--absorb_counter <= 0) |
224 | { |
225 | AbsorbSyncRequests(); |
226 | absorb_counter = UNLINKS_PER_ABSORB; |
227 | } |
228 | } |
229 | } |
230 | |
/*
 * ProcessSyncRequests() -- Process queued fsync requests.
 */
235 | void |
236 | ProcessSyncRequests(void) |
237 | { |
238 | static bool sync_in_progress = false; |
239 | |
240 | HASH_SEQ_STATUS hstat; |
241 | PendingFsyncEntry *entry; |
242 | int absorb_counter; |
243 | |
244 | /* Statistics on sync times */ |
245 | int processed = 0; |
246 | instr_time sync_start, |
247 | sync_end, |
248 | sync_diff; |
249 | uint64 elapsed; |
250 | uint64 longest = 0; |
251 | uint64 total_elapsed = 0; |
252 | |
253 | /* |
254 | * This is only called during checkpoints, and checkpoints should only |
255 | * occur in processes that have created a pendingOps. |
256 | */ |
257 | if (!pendingOps) |
		elog(ERROR, "cannot sync without a pendingOps table");
259 | |
260 | /* |
261 | * If we are in the checkpointer, the sync had better include all fsync |
262 | * requests that were queued by backends up to this point. The tightest |
263 | * race condition that could occur is that a buffer that must be written |
264 | * and fsync'd for the checkpoint could have been dumped by a backend just |
265 | * before it was visited by BufferSync(). We know the backend will have |
266 | * queued an fsync request before clearing the buffer's dirtybit, so we |
267 | * are safe as long as we do an Absorb after completing BufferSync(). |
268 | */ |
269 | AbsorbSyncRequests(); |
270 | |
271 | /* |
272 | * To avoid excess fsync'ing (in the worst case, maybe a never-terminating |
273 | * checkpoint), we want to ignore fsync requests that are entered into the |
274 | * hashtable after this point --- they should be processed next time, |
275 | * instead. We use sync_cycle_ctr to tell old entries apart from new |
276 | * ones: new ones will have cycle_ctr equal to the incremented value of |
277 | * sync_cycle_ctr. |
278 | * |
279 | * In normal circumstances, all entries present in the table at this point |
280 | * will have cycle_ctr exactly equal to the current (about to be old) |
281 | * value of sync_cycle_ctr. However, if we fail partway through the |
282 | * fsync'ing loop, then older values of cycle_ctr might remain when we |
283 | * come back here to try again. Repeated checkpoint failures would |
284 | * eventually wrap the counter around to the point where an old entry |
285 | * might appear new, causing us to skip it, possibly allowing a checkpoint |
286 | * to succeed that should not have. To forestall wraparound, any time the |
287 | * previous ProcessSyncRequests() failed to complete, run through the |
288 | * table and forcibly set cycle_ctr = sync_cycle_ctr. |
289 | * |
290 | * Think not to merge this loop with the main loop, as the problem is |
291 | * exactly that that loop may fail before having visited all the entries. |
292 | * From a performance point of view it doesn't matter anyway, as this path |
293 | * will never be taken in a system that's functioning normally. |
294 | */ |
295 | if (sync_in_progress) |
296 | { |
297 | /* prior try failed, so update any stale cycle_ctr values */ |
298 | hash_seq_init(&hstat, pendingOps); |
299 | while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL) |
300 | { |
301 | entry->cycle_ctr = sync_cycle_ctr; |
302 | } |
303 | } |
304 | |
305 | /* Advance counter so that new hashtable entries are distinguishable */ |
306 | sync_cycle_ctr++; |
307 | |
308 | /* Set flag to detect failure if we don't reach the end of the loop */ |
309 | sync_in_progress = true; |
310 | |
311 | /* Now scan the hashtable for fsync requests to process */ |
312 | absorb_counter = FSYNCS_PER_ABSORB; |
313 | hash_seq_init(&hstat, pendingOps); |
314 | while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL) |
315 | { |
316 | int failures; |
317 | |
318 | /* |
319 | * If fsync is off then we don't have to bother opening the file at |
320 | * all. (We delay checking until this point so that changing fsync on |
321 | * the fly behaves sensibly.) |
322 | */ |
323 | if (!enableFsync) |
324 | continue; |
325 | |
326 | /* |
		 * If the entry is new (i.e. added after this sync cycle began), don't
		 * process it this time; it will be handled in the next cycle.
328 | * Note "continue" bypasses the hash-remove call at the bottom of the |
329 | * loop. |
330 | */ |
331 | if (entry->cycle_ctr == sync_cycle_ctr) |
332 | continue; |
333 | |
334 | /* Else assert we haven't missed it */ |
335 | Assert((CycleCtr) (entry->cycle_ctr + 1) == sync_cycle_ctr); |
336 | |
337 | /* |
338 | * If in checkpointer, we want to absorb pending requests every so |
339 | * often to prevent overflow of the fsync request queue. It is |
340 | * unspecified whether newly-added entries will be visited by |
341 | * hash_seq_search, but we don't care since we don't need to process |
342 | * them anyway. |
343 | */ |
344 | if (--absorb_counter <= 0) |
345 | { |
346 | AbsorbSyncRequests(); |
347 | absorb_counter = FSYNCS_PER_ABSORB; |
348 | } |
349 | |
350 | /* |
351 | * The fsync table could contain requests to fsync segments that have |
352 | * been deleted (unlinked) by the time we get to them. Rather than |
353 | * just hoping an ENOENT (or EACCES on Windows) error can be ignored, |
354 | * what we do on error is absorb pending requests and then retry. |
355 | * Since mdunlink() queues a "cancel" message before actually |
356 | * unlinking, the fsync request is guaranteed to be marked canceled |
		 * after the absorb if the file really has been removed.  DROP
		 * DATABASE likewise has to tell us to forget fsync requests before it
		 * starts deletions.
359 | */ |
360 | for (failures = 0; !entry->canceled; failures++) |
361 | { |
362 | char path[MAXPGPATH]; |
363 | |
364 | INSTR_TIME_SET_CURRENT(sync_start); |
365 | if (syncsw[entry->tag.handler].sync_syncfiletag(&entry->tag, |
366 | path) == 0) |
367 | { |
368 | /* Success; update statistics about sync timing */ |
369 | INSTR_TIME_SET_CURRENT(sync_end); |
370 | sync_diff = sync_end; |
371 | INSTR_TIME_SUBTRACT(sync_diff, sync_start); |
372 | elapsed = INSTR_TIME_GET_MICROSEC(sync_diff); |
373 | if (elapsed > longest) |
374 | longest = elapsed; |
375 | total_elapsed += elapsed; |
376 | processed++; |
377 | |
378 | if (log_checkpoints) |
					elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f msec",
380 | processed, |
381 | path, |
382 | (double) elapsed / 1000); |
383 | |
384 | break; /* out of retry loop */ |
385 | } |
386 | |
387 | /* |
388 | * It is possible that the relation has been dropped or truncated |
389 | * since the fsync request was entered. Therefore, allow ENOENT, |
390 | * but only if we didn't fail already on this file. |
391 | */ |
392 | if (!FILE_POSSIBLY_DELETED(errno) || failures > 0) |
393 | ereport(data_sync_elevel(ERROR), |
394 | (errcode_for_file_access(), |
						 errmsg("could not fsync file \"%s\": %m",
396 | path))); |
397 | else |
398 | ereport(DEBUG1, |
399 | (errcode_for_file_access(), |
						 errmsg("could not fsync file \"%s\" but retrying: %m",
401 | path))); |
402 | |
403 | /* |
404 | * Absorb incoming requests and check to see if a cancel arrived |
405 | * for this relation fork. |
406 | */ |
407 | AbsorbSyncRequests(); |
408 | absorb_counter = FSYNCS_PER_ABSORB; /* might as well... */ |
409 | } /* end retry loop */ |
410 | |
411 | /* We are done with this entry, remove it */ |
412 | if (hash_search(pendingOps, &entry->tag, HASH_REMOVE, NULL) == NULL) |
			elog(ERROR, "pendingOps corrupted");
414 | } /* end loop over hashtable entries */ |
415 | |
416 | /* Return sync performance metrics for report at checkpoint end */ |
417 | CheckpointStats.ckpt_sync_rels = processed; |
418 | CheckpointStats.ckpt_longest_sync = longest; |
419 | CheckpointStats.ckpt_agg_sync_time = total_elapsed; |
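
	/*
	 * (These counters are reported at the end of the checkpoint by
	 * LogCheckpointEnd() in xlog.c when log_checkpoints is enabled.)
	 */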
420 | |
421 | /* Flag successful completion of ProcessSyncRequests */ |
422 | sync_in_progress = false; |
423 | } |
424 | |
425 | /* |
426 | * RememberSyncRequest() -- callback from checkpointer side of sync request |
427 | * |
428 | * We stuff fsync requests into the local hash table for execution |
429 | * during the checkpointer's next checkpoint. UNLINK requests go into a |
430 | * separate linked list, however, because they get processed separately. |
431 | * |
432 | * See sync.h for more information on the types of sync requests supported. |
433 | */ |
434 | void |
435 | RememberSyncRequest(const FileTag *ftag, SyncRequestType type) |
436 | { |
437 | Assert(pendingOps); |
438 | |
439 | if (type == SYNC_FORGET_REQUEST) |
440 | { |
441 | PendingFsyncEntry *entry; |
442 | |
443 | /* Cancel previously entered request */ |
444 | entry = (PendingFsyncEntry *) hash_search(pendingOps, |
445 | (void *) ftag, |
446 | HASH_FIND, |
447 | NULL); |
448 | if (entry != NULL) |
449 | entry->canceled = true; |
450 | } |
451 | else if (type == SYNC_FILTER_REQUEST) |
452 | { |
453 | HASH_SEQ_STATUS hstat; |
454 | PendingFsyncEntry *entry; |
455 | ListCell *cell, |
456 | *prev, |
457 | *next; |
458 | |
459 | /* Cancel matching fsync requests */ |
460 | hash_seq_init(&hstat, pendingOps); |
461 | while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL) |
462 | { |
463 | if (entry->tag.handler == ftag->handler && |
464 | syncsw[ftag->handler].sync_filetagmatches(ftag, &entry->tag)) |
465 | entry->canceled = true; |
466 | } |
467 | |
468 | /* Remove matching unlink requests */ |
469 | prev = NULL; |
470 | for (cell = list_head(pendingUnlinks); cell; cell = next) |
471 | { |
472 | PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell); |
473 | |
474 | next = lnext(cell); |
475 | if (entry->tag.handler == ftag->handler && |
476 | syncsw[ftag->handler].sync_filetagmatches(ftag, &entry->tag)) |
477 | { |
478 | pendingUnlinks = list_delete_cell(pendingUnlinks, cell, prev); |
479 | pfree(entry); |
480 | } |
481 | else |
482 | prev = cell; |
483 | } |
484 | } |
485 | else if (type == SYNC_UNLINK_REQUEST) |
486 | { |
487 | /* Unlink request: put it in the linked list */ |
488 | MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt); |
489 | PendingUnlinkEntry *entry; |
490 | |
491 | entry = palloc(sizeof(PendingUnlinkEntry)); |
492 | entry->tag = *ftag; |
493 | entry->cycle_ctr = checkpoint_cycle_ctr; |
494 | |
495 | pendingUnlinks = lappend(pendingUnlinks, entry); |
496 | |
497 | MemoryContextSwitchTo(oldcxt); |
498 | } |
499 | else |
500 | { |
501 | /* Normal case: enter a request to fsync this segment */ |
502 | MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt); |
503 | PendingFsyncEntry *entry; |
504 | bool found; |
505 | |
506 | Assert(type == SYNC_REQUEST); |
507 | |
508 | entry = (PendingFsyncEntry *) hash_search(pendingOps, |
509 | (void *) ftag, |
510 | HASH_ENTER, |
511 | &found); |
512 | /* if new entry, initialize it */ |
513 | if (!found) |
514 | { |
515 | entry->cycle_ctr = sync_cycle_ctr; |
516 | entry->canceled = false; |
517 | } |
518 | |
519 | /* |
520 | * NB: it's intentional that we don't change cycle_ctr if the entry |
521 | * already exists. The cycle_ctr must represent the oldest fsync |
522 | * request that could be in the entry. |
523 | */ |
524 | |
525 | MemoryContextSwitchTo(oldcxt); |
526 | } |
527 | } |
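
/*
 * For context (these callers live in md.c, not here): mdunlink() sends a
 * SYNC_FORGET_REQUEST for a segment just before unlinking it, which sets the
 * "canceled" flag tested by ProcessSyncRequests(); dropping a database sends
 * a SYNC_FILTER_REQUEST whose tag matches every file of that database, so
 * pending fsyncs and unlinks for it are canceled wholesale through the
 * sync_filetagmatches callback.
 */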
528 | |
529 | /* |
530 | * Register the sync request locally, or forward it to the checkpointer. |
531 | * |
532 | * If retryOnError is true, we'll keep trying if there is no space in the |
533 | * queue. Return true if we succeeded, or false if there wasn't space. |
534 | */ |
535 | bool |
536 | RegisterSyncRequest(const FileTag *ftag, SyncRequestType type, |
537 | bool retryOnError) |
538 | { |
539 | bool ret; |
540 | |
541 | if (pendingOps != NULL) |
542 | { |
543 | /* standalone backend or startup process: fsync state is local */ |
544 | RememberSyncRequest(ftag, type); |
545 | return true; |
546 | } |
547 | |
548 | for (;;) |
549 | { |
550 | /* |
551 | * Notify the checkpointer about it. If we fail to queue a message in |
552 | * retryOnError mode, we have to sleep and try again ... ugly, but |
553 | * hopefully won't happen often. |
554 | * |
555 | * XXX should we CHECK_FOR_INTERRUPTS in this loop? Escaping with an |
556 | * error in the case of SYNC_UNLINK_REQUEST would leave the |
557 | * no-longer-used file still present on disk, which would be bad, so |
558 | * I'm inclined to assume that the checkpointer will always empty the |
559 | * queue soon. |
560 | */ |
561 | ret = ForwardSyncRequest(ftag, type); |
562 | |
563 | /* |
564 | * If we are successful in queueing the request, or we failed and were |
565 | * instructed not to retry on error, break. |
566 | */ |
567 | if (ret || (!ret && !retryOnError)) |
568 | break; |
569 | |
570 | pg_usleep(10000L); |
571 | } |
572 | |
573 | return ret; |
574 | } |
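
/*
 * Typical use, sketched for illustration only (roughly what md.c's
 * register_dirty_segment() does after writing out a segment; the md-specific
 * tag setup below lives in md.c, not in this file):
 *
 *	FileTag		tag;
 *
 *	INIT_MD_FILETAG(tag, reln->smgr_rnode.node, forknum, segno);
 *	if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false))
 *		... queue was full: fsync the file directly as a fallback ...
 */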
575 | |
576 | /* |
577 | * In archive recovery, we rely on checkpointer to do fsyncs, but we will have |
578 | * already created the pendingOps during initialization of the startup |
579 | * process. Calling this function drops the local pendingOps so that |
580 | * subsequent requests will be forwarded to checkpointer. |
581 | */ |
582 | void |
583 | EnableSyncRequestForwarding(void) |
584 | { |
585 | /* Perform any pending fsyncs we may have queued up, then drop table */ |
586 | if (pendingOps) |
587 | { |
588 | ProcessSyncRequests(); |
589 | hash_destroy(pendingOps); |
590 | } |
591 | pendingOps = NULL; |
592 | |
593 | /* |
594 | * We should not have any pending unlink requests, since mdunlink doesn't |
595 | * queue unlink requests when isRedo. |
596 | */ |
597 | Assert(pendingUnlinks == NIL); |
598 | } |
599 | |