/*-------------------------------------------------------------------------
 *
 * sync.c
 *	  File synchronization management code.
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/storage/sync/sync.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <unistd.h>
#include <fcntl.h>
#include <sys/file.h>

#include "miscadmin.h"
#include "pgstat.h"
#include "access/xlog.h"
#include "access/xlogutils.h"
#include "commands/tablespace.h"
#include "portability/instr_time.h"
#include "postmaster/bgwriter.h"
#include "storage/bufmgr.h"
#include "storage/ipc.h"
#include "storage/md.h"
#include "utils/hsearch.h"
#include "utils/inval.h"
#include "utils/memutils.h"

/*
 * In some contexts (currently, standalone backends and the checkpointer)
 * we keep track of pending fsync operations: we need to remember all relation
 * segments that have been written since the last checkpoint, so that we can
 * fsync them down to disk before completing the next checkpoint. This hash
 * table remembers the pending operations. We use a hash table mostly as
 * a convenient way of merging duplicate requests.
 *
 * We use a similar mechanism to remember no-longer-needed files that can
 * be deleted after the next checkpoint, but we use a linked list instead of
 * a hash table, because we don't expect there to be any duplicate requests.
 *
 * These mechanisms are only used for non-temp relations; we never fsync
 * temp rels, nor do we need to postpone their deletion (see comments in
 * mdunlink).
 *
 * (Regular backends do not track pending operations locally, but forward
 * them to the checkpointer.)
 */
typedef uint16 CycleCtr;		/* can be any convenient integer size */

typedef struct
{
	FileTag		tag;			/* identifies handler and file */
	CycleCtr	cycle_ctr;		/* sync_cycle_ctr of oldest request */
	bool		canceled;		/* canceled is true if we canceled "recently" */
} PendingFsyncEntry;

typedef struct
{
	FileTag		tag;			/* identifies handler and file */
	CycleCtr	cycle_ctr;		/* checkpoint_cycle_ctr when request was made */
} PendingUnlinkEntry;

static HTAB *pendingOps = NULL;
static List *pendingUnlinks = NIL;
static MemoryContext pendingOpsCxt; /* context for the above */

static CycleCtr sync_cycle_ctr = 0;
static CycleCtr checkpoint_cycle_ctr = 0;

/* Intervals for calling AbsorbSyncRequests */
#define FSYNCS_PER_ABSORB		10
#define UNLINKS_PER_ABSORB		10

/*
 * Function pointers for handling sync and unlink requests.
 */
typedef struct SyncOps
{
	int			(*sync_syncfiletag) (const FileTag *ftag, char *path);
	int			(*sync_unlinkfiletag) (const FileTag *ftag, char *path);
	bool		(*sync_filetagmatches) (const FileTag *ftag,
										const FileTag *candidate);
} SyncOps;

static const SyncOps syncsw[] = {
	/* magnetic disk */
	{
		.sync_syncfiletag = mdsyncfiletag,
		.sync_unlinkfiletag = mdunlinkfiletag,
		.sync_filetagmatches = mdfiletagmatches
	}
};
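
/*
 * A hedged sketch, not compiled: supporting a second kind of file would mean
 * adding a value to the SyncRequestHandler enum (see sync.h) and appending a
 * matching row to syncsw[]; requests are then dispatched by indexing the
 * array with FileTag.handler, as done below in SyncPostCheckpoint() and
 * ProcessSyncRequests(). The "foo" callbacks are hypothetical names used
 * only to illustrate the shape of such an entry.
 */
#if 0
	/* hypothetical second handler entry */
	{
		.sync_syncfiletag = foo_syncfiletag,
		.sync_unlinkfiletag = foo_unlinkfiletag,
		.sync_filetagmatches = foo_filetagmatches
	}
#endif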

/*
 * Initialize data structures for the file sync tracking.
 */
void
InitSync(void)
{
	/*
	 * Create pending-operations hashtable if we need it. Currently, we need
	 * it if we are standalone (not under a postmaster) or if we are a startup
	 * or checkpointer auxiliary process.
	 */
	if (!IsUnderPostmaster || AmStartupProcess() || AmCheckpointerProcess())
	{
		HASHCTL		hash_ctl;

		/*
		 * XXX: The checkpointer needs to add entries to the pending ops table
		 * when absorbing fsync requests. That is done within a critical
		 * section, which isn't usually allowed, but we make an exception. It
		 * means that there's a theoretical possibility that you run out of
		 * memory while absorbing fsync requests, which leads to a PANIC.
		 * Fortunately the hash table is small so that's unlikely to happen in
		 * practice.
		 */
		pendingOpsCxt = AllocSetContextCreate(TopMemoryContext,
											  "Pending ops context",
											  ALLOCSET_DEFAULT_SIZES);
		MemoryContextAllowInCriticalSection(pendingOpsCxt, true);

		MemSet(&hash_ctl, 0, sizeof(hash_ctl));
		hash_ctl.keysize = sizeof(FileTag);
		hash_ctl.entrysize = sizeof(PendingFsyncEntry);
		hash_ctl.hcxt = pendingOpsCxt;
		pendingOps = hash_create("Pending Ops Table",
								 100L,
								 &hash_ctl,
								 HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
		pendingUnlinks = NIL;
	}
}

/*
 * SyncPreCheckpoint() -- Do pre-checkpoint work
 *
 * To distinguish unlink requests that arrived before this checkpoint
 * started from those that arrived during the checkpoint, we use a cycle
 * counter similar to the one we use for fsync requests. That cycle
 * counter is incremented here.
 *
 * This must be called *before* the checkpoint REDO point is determined.
 * That ensures that we won't delete files too soon.
 *
 * Note that we can't do anything here that depends on the assumption
 * that the checkpoint will be completed.
 */
void
SyncPreCheckpoint(void)
{
	/*
	 * Any unlink requests arriving after this point will be assigned the next
	 * cycle counter, and won't be unlinked until next checkpoint.
	 */
	checkpoint_cycle_ctr++;
}
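
/*
 * For orientation, a hedged sketch of how the checkpointer is expected to
 * drive these entry points (see CreateCheckPoint() and CheckPointBuffers()
 * for the authoritative sequence; the ordering below is approximate):
 *
 *		SyncPreCheckpoint();	-- bump checkpoint_cycle_ctr, before REDO point
 *		... establish the REDO point, write dirty buffers (BufferSync) ...
 *		ProcessSyncRequests();	-- fsync everything queued so far
 *		... write the checkpoint record, update the control file ...
 *		SyncPostCheckpoint();	-- old files are now safe to unlink
 */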

/*
 * SyncPostCheckpoint() -- Do post-checkpoint work
 *
 * Remove any lingering files that can now be safely removed.
 */
void
SyncPostCheckpoint(void)
{
	int			absorb_counter;

	absorb_counter = UNLINKS_PER_ABSORB;
	while (pendingUnlinks != NIL)
	{
		PendingUnlinkEntry *entry = (PendingUnlinkEntry *) linitial(pendingUnlinks);
		char		path[MAXPGPATH];

		/*
		 * New entries are appended to the end, so if the entry is new we've
		 * reached the end of old entries.
		 *
		 * Note: if just the right number of consecutive checkpoints fail, we
		 * could be fooled here by cycle_ctr wraparound. However, the only
		 * consequence is that we'd delay unlinking for one more checkpoint,
		 * which is perfectly tolerable.
		 */
		if (entry->cycle_ctr == checkpoint_cycle_ctr)
			break;

		/* Unlink the file */
		if (syncsw[entry->tag.handler].sync_unlinkfiletag(&entry->tag,
														  path) < 0)
		{
			/*
			 * There's a race condition, when the database is dropped at the
			 * same time that we process the pending unlink requests. If the
			 * DROP DATABASE deletes the file before we do, we will get ENOENT
			 * here. rmtree() also has to ignore ENOENT errors, to deal with
			 * the possibility that we delete the file first.
			 */
			if (errno != ENOENT)
				ereport(WARNING,
						(errcode_for_file_access(),
						 errmsg("could not remove file \"%s\": %m", path)));
		}

		/* And remove the list entry */
		pendingUnlinks = list_delete_first(pendingUnlinks);
		pfree(entry);

		/*
		 * As in ProcessSyncRequests, we don't want to stop absorbing fsync
		 * requests for a long time when there are many deletions to be done.
		 * We can safely call AbsorbSyncRequests() at this point in the loop
		 * (note it might try to delete list entries).
		 */
		if (--absorb_counter <= 0)
		{
			AbsorbSyncRequests();
			absorb_counter = UNLINKS_PER_ABSORB;
		}
	}
}

/*
 * ProcessSyncRequests() -- Process queued fsync requests.
 */
void
ProcessSyncRequests(void)
{
	static bool sync_in_progress = false;

	HASH_SEQ_STATUS hstat;
	PendingFsyncEntry *entry;
	int			absorb_counter;

	/* Statistics on sync times */
	int			processed = 0;
	instr_time	sync_start,
				sync_end,
				sync_diff;
	uint64		elapsed;
	uint64		longest = 0;
	uint64		total_elapsed = 0;

	/*
	 * This is only called during checkpoints, and checkpoints should only
	 * occur in processes that have created a pendingOps.
	 */
	if (!pendingOps)
		elog(ERROR, "cannot sync without a pendingOps table");

	/*
	 * If we are in the checkpointer, the sync had better include all fsync
	 * requests that were queued by backends up to this point. The tightest
	 * race condition that could occur is that a buffer that must be written
	 * and fsync'd for the checkpoint could have been dumped by a backend just
	 * before it was visited by BufferSync(). We know the backend will have
	 * queued an fsync request before clearing the buffer's dirtybit, so we
	 * are safe as long as we do an Absorb after completing BufferSync().
	 */
	AbsorbSyncRequests();

	/*
	 * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
	 * checkpoint), we want to ignore fsync requests that are entered into the
	 * hashtable after this point --- they should be processed next time,
	 * instead. We use sync_cycle_ctr to tell old entries apart from new
	 * ones: new ones will have cycle_ctr equal to the incremented value of
	 * sync_cycle_ctr.
	 *
	 * In normal circumstances, all entries present in the table at this point
	 * will have cycle_ctr exactly equal to the current (about to be old)
	 * value of sync_cycle_ctr. However, if we fail partway through the
	 * fsync'ing loop, then older values of cycle_ctr might remain when we
	 * come back here to try again. Repeated checkpoint failures would
	 * eventually wrap the counter around to the point where an old entry
	 * might appear new, causing us to skip it, possibly allowing a checkpoint
	 * to succeed that should not have. To forestall wraparound, any time the
	 * previous ProcessSyncRequests() failed to complete, run through the
	 * table and forcibly set cycle_ctr = sync_cycle_ctr.
	 *
	 * Think not to merge this loop with the main loop, as the problem is
	 * exactly that that loop may fail before having visited all the entries.
	 * From a performance point of view it doesn't matter anyway, as this path
	 * will never be taken in a system that's functioning normally.
	 */
	if (sync_in_progress)
	{
		/* prior try failed, so update any stale cycle_ctr values */
		hash_seq_init(&hstat, pendingOps);
		while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
		{
			entry->cycle_ctr = sync_cycle_ctr;
		}
	}

	/* Advance counter so that new hashtable entries are distinguishable */
	sync_cycle_ctr++;

	/* Set flag to detect failure if we don't reach the end of the loop */
	sync_in_progress = true;

	/* Now scan the hashtable for fsync requests to process */
	absorb_counter = FSYNCS_PER_ABSORB;
	hash_seq_init(&hstat, pendingOps);
	while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
	{
		int			failures;

		/*
		 * If fsync is off then we don't have to bother opening the file at
		 * all. (We delay checking until this point so that changing fsync on
		 * the fly behaves sensibly.)
		 */
		if (!enableFsync)
			continue;

		/*
		 * If the entry is new then don't process it this time; it will be
		 * handled by the next checkpoint's sync cycle. Note "continue"
		 * bypasses the hash-remove call at the bottom of the loop.
		 */
		if (entry->cycle_ctr == sync_cycle_ctr)
			continue;

		/* Else assert we haven't missed it */
		Assert((CycleCtr) (entry->cycle_ctr + 1) == sync_cycle_ctr);

		/*
		 * If in checkpointer, we want to absorb pending requests every so
		 * often to prevent overflow of the fsync request queue. It is
		 * unspecified whether newly-added entries will be visited by
		 * hash_seq_search, but we don't care since we don't need to process
		 * them anyway.
		 */
		if (--absorb_counter <= 0)
		{
			AbsorbSyncRequests();
			absorb_counter = FSYNCS_PER_ABSORB;
		}

		/*
		 * The fsync table could contain requests to fsync segments that have
		 * been deleted (unlinked) by the time we get to them. Rather than
		 * just hoping an ENOENT (or EACCES on Windows) error can be ignored,
		 * what we do on error is absorb pending requests and then retry.
		 * Since mdunlink() queues a "cancel" message before actually
		 * unlinking, the fsync request is guaranteed to be marked canceled
		 * after the absorb if that is indeed what happened. DROP DATABASE
		 * likewise has to tell us to forget fsync requests before it starts
		 * deletions.
		 */
		for (failures = 0; !entry->canceled; failures++)
		{
			char		path[MAXPGPATH];

			INSTR_TIME_SET_CURRENT(sync_start);
			if (syncsw[entry->tag.handler].sync_syncfiletag(&entry->tag,
															path) == 0)
			{
				/* Success; update statistics about sync timing */
				INSTR_TIME_SET_CURRENT(sync_end);
				sync_diff = sync_end;
				INSTR_TIME_SUBTRACT(sync_diff, sync_start);
				elapsed = INSTR_TIME_GET_MICROSEC(sync_diff);
				if (elapsed > longest)
					longest = elapsed;
				total_elapsed += elapsed;
				processed++;

				if (log_checkpoints)
					elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f msec",
						 processed,
						 path,
						 (double) elapsed / 1000);

				break;			/* out of retry loop */
			}

			/*
			 * It is possible that the relation has been dropped or truncated
			 * since the fsync request was entered. Therefore, allow ENOENT,
			 * but only if we didn't fail already on this file.
			 */
			if (!FILE_POSSIBLY_DELETED(errno) || failures > 0)
				ereport(data_sync_elevel(ERROR),
						(errcode_for_file_access(),
						 errmsg("could not fsync file \"%s\": %m",
								path)));
			else
				ereport(DEBUG1,
						(errcode_for_file_access(),
						 errmsg("could not fsync file \"%s\" but retrying: %m",
								path)));

			/*
			 * Absorb incoming requests and check to see if a cancel arrived
			 * for this relation fork.
			 */
			AbsorbSyncRequests();
			absorb_counter = FSYNCS_PER_ABSORB; /* might as well... */
		}						/* end retry loop */

		/* We are done with this entry, remove it */
		if (hash_search(pendingOps, &entry->tag, HASH_REMOVE, NULL) == NULL)
			elog(ERROR, "pendingOps corrupted");
	}							/* end loop over hashtable entries */

	/* Return sync performance metrics for report at checkpoint end */
	CheckpointStats.ckpt_sync_rels = processed;
	CheckpointStats.ckpt_longest_sync = longest;
	CheckpointStats.ckpt_agg_sync_time = total_elapsed;

	/* Flag successful completion of ProcessSyncRequests */
	sync_in_progress = false;
}
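
/*
 * Note on the error level used above: data_sync_elevel() (defined in fd.c)
 * escalates an fsync failure to PANIC unless the data_sync_retry GUC is set,
 * because a failed fsync cannot simply be retried once the kernel may have
 * discarded the dirty pages. A hedged sketch of its behavior, paraphrased
 * from fd.c:
 */
#if 0
int
data_sync_elevel(int elevel)
{
	return data_sync_retry ? elevel : PANIC;
}
#endif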

/*
 * RememberSyncRequest() -- callback from checkpointer side of sync request
 *
 * We stuff fsync requests into the local hash table for execution
 * during the checkpointer's next checkpoint. UNLINK requests go into a
 * separate linked list, however, because they get processed separately.
 *
 * See sync.h for more information on the types of sync requests supported.
 */
void
RememberSyncRequest(const FileTag *ftag, SyncRequestType type)
{
	Assert(pendingOps);

	if (type == SYNC_FORGET_REQUEST)
	{
		PendingFsyncEntry *entry;

		/* Cancel previously entered request */
		entry = (PendingFsyncEntry *) hash_search(pendingOps,
												  (void *) ftag,
												  HASH_FIND,
												  NULL);
		if (entry != NULL)
			entry->canceled = true;
	}
	else if (type == SYNC_FILTER_REQUEST)
	{
		HASH_SEQ_STATUS hstat;
		PendingFsyncEntry *entry;
		ListCell   *cell,
				   *prev,
				   *next;

		/* Cancel matching fsync requests */
		hash_seq_init(&hstat, pendingOps);
		while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
		{
			if (entry->tag.handler == ftag->handler &&
				syncsw[ftag->handler].sync_filetagmatches(ftag, &entry->tag))
				entry->canceled = true;
		}

		/* Remove matching unlink requests */
		prev = NULL;
		for (cell = list_head(pendingUnlinks); cell; cell = next)
		{
			PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);

			next = lnext(cell);
			if (entry->tag.handler == ftag->handler &&
				syncsw[ftag->handler].sync_filetagmatches(ftag, &entry->tag))
			{
				pendingUnlinks = list_delete_cell(pendingUnlinks, cell, prev);
				pfree(entry);
			}
			else
				prev = cell;
		}
	}
	else if (type == SYNC_UNLINK_REQUEST)
	{
		/* Unlink request: put it in the linked list */
		MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
		PendingUnlinkEntry *entry;

		entry = palloc(sizeof(PendingUnlinkEntry));
		entry->tag = *ftag;
		entry->cycle_ctr = checkpoint_cycle_ctr;

		pendingUnlinks = lappend(pendingUnlinks, entry);

		MemoryContextSwitchTo(oldcxt);
	}
	else
	{
		/* Normal case: enter a request to fsync this segment */
		MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
		PendingFsyncEntry *entry;
		bool		found;

		Assert(type == SYNC_REQUEST);

		entry = (PendingFsyncEntry *) hash_search(pendingOps,
												  (void *) ftag,
												  HASH_ENTER,
												  &found);
		/* if new entry, initialize it */
		if (!found)
		{
			entry->cycle_ctr = sync_cycle_ctr;
			entry->canceled = false;
		}

		/*
		 * NB: it's intentional that we don't change cycle_ctr if the entry
		 * already exists. The cycle_ctr must represent the oldest fsync
		 * request that could be in the entry.
		 */

		MemoryContextSwitchTo(oldcxt);
	}
}
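
/*
 * A hedged sketch of the cancel-before-unlink protocol that
 * ProcessSyncRequests relies on when it encounters a missing file. The names
 * are approximate, paraphrasing md.c's unlink path; each step reaches this
 * module through RegisterSyncRequest():
 */
#if 0
	register_forget_request(rnode, forknum, segno);	/* SYNC_FORGET_REQUEST */
	/* ... unlink or truncate the first segment, remove later segments ... */
	register_unlink(rnode);							/* SYNC_UNLINK_REQUEST */
#endif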

/*
 * Register the sync request locally, or forward it to the checkpointer.
 *
 * If retryOnError is true, we'll keep trying if there is no space in the
 * queue. Return true if we succeeded, or false if there wasn't space.
 */
bool
RegisterSyncRequest(const FileTag *ftag, SyncRequestType type,
					bool retryOnError)
{
	bool		ret;

	if (pendingOps != NULL)
	{
		/* standalone backend or startup process: fsync state is local */
		RememberSyncRequest(ftag, type);
		return true;
	}

	for (;;)
	{
		/*
		 * Notify the checkpointer about it. If we fail to queue a message in
		 * retryOnError mode, we have to sleep and try again ... ugly, but
		 * hopefully won't happen often.
		 *
		 * XXX should we CHECK_FOR_INTERRUPTS in this loop? Escaping with an
		 * error in the case of SYNC_UNLINK_REQUEST would leave the
		 * no-longer-used file still present on disk, which would be bad, so
		 * I'm inclined to assume that the checkpointer will always empty the
		 * queue soon.
		 */
		ret = ForwardSyncRequest(ftag, type);

		/*
		 * If we are successful in queueing the request, or we failed and were
		 * instructed not to retry on error, break.
		 */
		if (ret || (!ret && !retryOnError))
			break;

		pg_usleep(10000L);
	}

	return ret;
}
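
/*
 * A hedged usage sketch (not compiled): how a storage manager typically
 * queues an fsync request, modeled loosely on md.c's register_dirty_segment().
 * The reln/forknum/segno variables are assumed to be in the caller's scope;
 * the FileTag layout is the one declared in sync.h. The tag is zeroed first
 * because it is used as a hash key (HASH_BLOBS), so padding bytes must be
 * well-defined.
 */
#if 0
	FileTag		tag;

	memset(&tag, 0, sizeof(tag));
	tag.handler = SYNC_HANDLER_MD;
	tag.rnode = reln->smgr_rnode.node;
	tag.forknum = forknum;
	tag.segno = segno;

	if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ ))
	{
		/* Queue was full and we may not wait: fsync the segment ourselves. */
	}
#endif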

/*
 * In archive recovery, we rely on checkpointer to do fsyncs, but we will have
 * already created the pendingOps during initialization of the startup
 * process. Calling this function drops the local pendingOps so that
 * subsequent requests will be forwarded to checkpointer.
 */
void
EnableSyncRequestForwarding(void)
{
	/* Perform any pending fsyncs we may have queued up, then drop table */
	if (pendingOps)
	{
		ProcessSyncRequests();
		hash_destroy(pendingOps);
	}
	pendingOps = NULL;

	/*
	 * We should not have any pending unlink requests, since mdunlink doesn't
	 * queue unlink requests when isRedo.
	 */
	Assert(pendingUnlinks == NIL);
}