rewriteheap.c source code [PostgreSQL/src/backend/access/heap/rewriteheap.c]

/*-------------------------------------------------------------------------
2	*
3	* rewriteheap.c
4	* Support functions to rewrite tables.
5	*
6	* These functions provide a facility to completely rewrite a heap, while
7	* preserving visibility information and update chains.
8	*
9	* INTERFACE
10	*
11	* The caller is responsible for creating the new heap, all catalog
12	* changes, supplying the tuples to be written to the new heap, and
13	* rebuilding indexes. The caller must hold AccessExclusiveLock on the
14	* target table, because we assume no one else is writing into it.
15	*
16	* To use the facility:
17	*
 * begin_heap_rewrite
 * while (fetch next tuple)
 * {
 *	   if (tuple is dead)
 *		   rewrite_heap_dead_tuple
 *	   else
 *	   {
 *		   // do any transformations here if required
 *		   rewrite_heap_tuple
 *	   }
 * }
 * end_heap_rewrite
30	*
31	* The contents of the new relation shouldn't be relied on until after
32	* end_heap_rewrite is called.
33	*
34	*
35	* IMPLEMENTATION
36	*
37	* This would be a fairly trivial affair, except that we need to maintain
38	* the ctid chains that link versions of an updated tuple together.
39	* Since the newly stored tuples will have tids different from the original
40	* ones, if we just copied t_ctid fields to the new table the links would
41	* be wrong. When we are required to copy a (presumably recently-dead or
42	* delete-in-progress) tuple whose ctid doesn't point to itself, we have
43	* to substitute the correct ctid instead.
44	*
45	* For each ctid reference from A -> B, we might encounter either A first
46	* or B first. (Note that a tuple in the middle of a chain is both A and B
47	* of different pairs.)
48	*
49	* If we encounter A first, we'll store the tuple in the unresolved_tups
50	* hash table. When we later encounter B, we remove A from the hash table,
51	* fix the ctid to point to the new location of B, and insert both A and B
52	* to the new heap.
53	*
54	* If we encounter B first, we can insert B to the new heap right away.
55	* We then add an entry to the old_new_tid_map hash table showing B's
56	* original tid (in the old heap) and new tid (in the new heap).
57	* When we later encounter A, we get the new location of B from the table,
58	* and can write A immediately with the correct ctid.
59	*
60	* Entries in the hash tables can be removed as soon as the later tuple
61	* is encountered. That helps to keep the memory usage down. At the end,
62	* both tables are usually empty; we should have encountered both A and B
63	* of each pair. However, it's possible for A to be RECENTLY_DEAD and B
64	* entirely DEAD according to HeapTupleSatisfiesVacuum, because the test
65	* for deadness using OldestXmin is not exact. In such a case we might
66	* encounter B first, and skip it, and find A later. Then A would be added
67	* to unresolved_tups, and stay there until end of the rewrite. Since
68	* this case is very unusual, we don't worry about the memory usage.
69	*
70	* Using in-memory hash tables means that we use some memory for each live
71	* update chain in the table, from the time we find one end of the
72	* reference until we find the other end. That shouldn't be a problem in
73	* practice, but if you do something like an UPDATE without a where-clause
74	* on a large table, and then run CLUSTER in the same transaction, you
75	* could run out of memory. It doesn't seem worthwhile to add support for
76	* spill-to-disk, as there shouldn't be that many RECENTLY_DEAD tuples in a
77	* table under normal circumstances. Furthermore, in the typical scenario
78	* of CLUSTERing on an unchanging key column, we'll see all the versions
79	* of a given tuple together anyway, and so the peak memory usage is only
80	* proportional to the number of RECENTLY_DEAD versions of a single row, not
81	* in the whole table. Note that if we do fail halfway through a CLUSTER,
82	* the old table is still valid, so failure is not catastrophic.
83	*
84	* We can't use the normal heap_insert function to insert into the new
85	* heap, because heap_insert overwrites the visibility information.
86	* We use a special-purpose raw_heap_insert function instead, which
87	* is optimized for bulk inserting a lot of tuples, knowing that we have
88	* exclusive access to the heap. raw_heap_insert builds new pages in
89	* local storage. When a page is full, or at the end of the process,
90	* we insert it to WAL as a single record and then write it to disk
91	* directly through smgr. Note, however, that any data sent to the new
92	* heap's TOAST table will go through the normal bufmgr.
93	*
94	*
95	* Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
96	* Portions Copyright (c) 1994-5, Regents of the University of California
97	*
98	* IDENTIFICATION
99	* src/backend/access/heap/rewriteheap.c
100	*
101	*-------------------------------------------------------------------------
102	*/
103	#include "postgres.h"
104
105	#include <sys/stat.h>
106	#include <unistd.h>
107
108	#include "miscadmin.h"
109
110	#include "access/heapam.h"
111	#include "access/heapam_xlog.h"
112	#include "access/rewriteheap.h"
113	#include "access/transam.h"
114	#include "access/tuptoaster.h"
115	#include "access/xact.h"
116	#include "access/xloginsert.h"
117
118	#include "catalog/catalog.h"
119
120	#include "lib/ilist.h"
121
122	#include "pgstat.h"
123
124	#include "replication/logical.h"
125	#include "replication/slot.h"
126
127	#include "storage/bufmgr.h"
128	#include "storage/fd.h"
129	#include "storage/smgr.h"
130
131	#include "utils/memutils.h"
132	#include "utils/rel.h"
133
134	#include "storage/procarray.h"
135
136	/*
137	* State associated with a rewrite operation. This is opaque to the user
138	* of the rewrite facility.
139	*/
140	typedef struct RewriteStateData
141	{
142	Relation rs_old_rel; / source heap /
143	Relation rs_new_rel; / destination heap /
144	Page rs_buffer; / page currently being built /
145	BlockNumber rs_blockno; / block where page will go /
146	bool rs_buffer_valid; / T if any tuples in buffer /
147	bool rs_use_wal; / must we WAL-log inserts? /
148	bool rs_logical_rewrite; / do we need to do logical rewriting /
149	TransactionId rs_oldest_xmin; / oldest xmin used by caller to determine*
150	* tuple visibility */
151	TransactionId rs_freeze_xid; / Xid that will be used as freeze cutoff*
152	* point */
153	TransactionId rs_logical_xmin; / Xid that will be used as cutoff point*
154	* for logical rewrites */
155	MultiXactId rs_cutoff_multi; / MultiXactId that will be used as cutoff*
156	* point for multixacts */
157	MemoryContext rs_cxt; / for hash tables and entries and tuples in*
158	* them */
159	XLogRecPtr rs_begin_lsn; / XLogInsertLsn when starting the rewrite /
160	HTAB rs_unresolved_tups; /* unmatched A tuples /
161	HTAB rs_old_new_tid_map; /* unmatched B tuples /
162	HTAB rs_logical_mappings; /* logical remapping files /
163	uint32 rs_num_rewrite_mappings; / # in memory mappings /
164	} RewriteStateData;
165
166	/*
167	* The lookup keys for the hash tables are tuple TID and xmin (we must check
168	* both to avoid false matches from dead tuples). Beware that there is
169	* probably some padding space in this struct; it must be zeroed out for
170	* correct hashtable operation.
171	*/
172	typedef struct
173	{
174	TransactionId xmin; / tuple xmin /
175	ItemPointerData tid; / tuple location in old heap /
176	} TidHashKey;
177
178	/*
179	* Entry structures for the hash tables
180	*/
181	typedef struct
182	{
183	TidHashKey key; / expected xmin/old location of B tuple /
184	ItemPointerData old_tid; / A's location in the old heap /
185	HeapTuple tuple; / A's tuple contents /
186	} UnresolvedTupData;
187
188	typedef UnresolvedTupData *UnresolvedTup;
189
190	typedef struct
191	{
192	TidHashKey key; / actual xmin/old location of B tuple /
193	ItemPointerData new_tid; / where we put it in the new heap /
194	} OldToNewMappingData;
195
196	typedef OldToNewMappingData *OldToNewMapping;
197
198	/*
199	* In-Memory data for an xid that might need logical remapping entries
200	* to be logged.
201	*/
202	typedef struct RewriteMappingFile
203	{
204	TransactionId xid; / xid that might need to see the row /
205	int vfd; / fd of mappings file /
206	off_t off; / how far have we written yet /
207	uint32 num_mappings; / number of in-memory mappings /
208	dlist_head mappings; / list of in-memory mappings /
209	char path[MAXPGPATH]; / path, for error messages /
210	} RewriteMappingFile;
211
212	/*
213	* A single In-Memory logical rewrite mapping, hanging off
214	* RewriteMappingFile->mappings.
215	*/
216	typedef struct RewriteMappingDataEntry
217	{
218	LogicalRewriteMappingData map; / map between old and new location of the*
219	* tuple */
220	dlist_node node;
221	} RewriteMappingDataEntry;
222
223
224	/ prototypes for internal functions /
225	static void raw_heap_insert(RewriteState state, HeapTuple tup);
226
227	/ internal logical remapping prototypes /
228	static void logical_begin_heap_rewrite(RewriteState state);
229	static void logical_rewrite_heap_tuple(RewriteState state, ItemPointerData old_tid, HeapTuple new_tuple);
230	static void logical_end_heap_rewrite(RewriteState state);
231
232
233	/*
234	* Begin a rewrite of a table
235	*
236	* old_heap old, locked heap relation tuples will be read from
237	* new_heap new, locked heap relation to insert tuples to
238	* oldest_xmin xid used by the caller to determine which tuples are dead
239	* freeze_xid xid before which tuples will be frozen
240	* min_multi multixact before which multis will be removed
241	* use_wal should the inserts to the new heap be WAL-logged?
242	*
243	* Returns an opaque RewriteState, allocated in current memory context,
244	* to be used in subsequent calls to the other functions.
245	*/
246	RewriteState
247	begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xmin,
248	TransactionId freeze_xid, MultiXactId cutoff_multi,
249	bool use_wal)
250	{
251	RewriteState state;
252	MemoryContext rw_cxt;
253	MemoryContext old_cxt;
254	HASHCTL hash_ctl;
255
256	/*
257	* To ease cleanup, make a separate context that will contain the
258	* RewriteState struct itself plus all subsidiary data.
259	*/
260	rw_cxt = AllocSetContextCreate(CurrentMemoryContext,
261	"Table rewrite",
262	ALLOCSET_DEFAULT_SIZES);
263	old_cxt = MemoryContextSwitchTo(rw_cxt);
264
265	/ Create and fill in the state struct /
266	state = palloc0(sizeof(RewriteStateData));
267
268	state->rs_old_rel = old_heap;
269	state->rs_new_rel = new_heap;
270	state->rs_buffer = (Page) palloc(BLCKSZ);
271	/ new_heap needn't be empty, just locked /
272	state->rs_blockno = RelationGetNumberOfBlocks(new_heap);
273	state->rs_buffer_valid = false;
274	state->rs_use_wal = use_wal;
275	state->rs_oldest_xmin = oldest_xmin;
276	state->rs_freeze_xid = freeze_xid;
277	state->rs_cutoff_multi = cutoff_multi;
278	state->rs_cxt = rw_cxt;
279
280	/ Initialize hash tables used to track update chains /
281	memset(&hash_ctl, `0`, sizeof(hash_ctl));
282	hash_ctl.keysize = sizeof(TidHashKey);
283	hash_ctl.entrysize = sizeof(UnresolvedTupData);
284	hash_ctl.hcxt = state->rs_cxt;
285
286	state->rs_unresolved_tups =
287	hash_create("Rewrite / Unresolved ctids",
288	`128`, / arbitrary initial size /
289	&hash_ctl,
290	HASH_ELEM \| HASH_BLOBS \| HASH_CONTEXT);
291
292	hash_ctl.entrysize = sizeof(OldToNewMappingData);
293
294	state->rs_old_new_tid_map =
295	hash_create("Rewrite / Old to new tid map",
296	`128`, / arbitrary initial size /
297	&hash_ctl,
298	HASH_ELEM \| HASH_BLOBS \| HASH_CONTEXT);
299
300	MemoryContextSwitchTo(old_cxt);
301
302	logical_begin_heap_rewrite(state);
303
304	return state;
305	}
306
307	/*
308	* End a rewrite.
309	*
310	* state and any other resources are freed.
311	*/
312	void
313	end_heap_rewrite(RewriteState state)
314	{
315	HASH_SEQ_STATUS seq_status;
316	UnresolvedTup unresolved;
317
318	/*
319	* Write any remaining tuples in the UnresolvedTups table. If we have any
320	* left, they should in fact be dead, but let's err on the safe side.
321	*/
322	hash_seq_init(&seq_status, state->rs_unresolved_tups);
323
324	while ((unresolved = hash_seq_search(&seq_status)) != NULL)
325	{
326	ItemPointerSetInvalid(&unresolved->tuple->t_data->t_ctid);
327	raw_heap_insert(state, unresolved->tuple);
328	}
329
330	/ Write the last page, if any /
331	if (state->rs_buffer_valid)
332	{
333	if (state->rs_use_wal)
334	log_newpage(&state->rs_new_rel->rd_node,
335	MAIN_FORKNUM,
336	state->rs_blockno,
337	state->rs_buffer,
338	true);
339	RelationOpenSmgr(state->rs_new_rel);
340
341	PageSetChecksumInplace(state->rs_buffer, state->rs_blockno);
342
343	smgrextend(state->rs_new_rel->rd_smgr, MAIN_FORKNUM, state->rs_blockno,
344	(char *) state->rs_buffer, true);
345	}
346
347	/*
348	* If the rel is WAL-logged, must fsync before commit. We use heap_sync
349	* to ensure that the toast table gets fsync'd too.
350	*
351	* It's obvious that we must do this when not WAL-logging. It's less
352	* obvious that we have to do it even if we did WAL-log the pages. The
353	* reason is the same as in storage.c's RelationCopyStorage(): we're
354	* writing data that's not in shared buffers, and so a CHECKPOINT
355	* occurring during the rewriteheap operation won't have fsync'd data we
356	* wrote before the checkpoint.
357	*/
358	if (RelationNeedsWAL(state->rs_new_rel))
359	heap_sync(state->rs_new_rel);
360
361	logical_end_heap_rewrite(state);
362
363	/ Deleting the context frees everything /
364	MemoryContextDelete(state->rs_cxt);
365	}
366
367	/*
368	* Add a tuple to the new heap.
369	*
370	* Visibility information is copied from the original tuple, except that
371	* we "freeze" very-old tuples. Note that since we scribble on new_tuple,
372	* it had better be temp storage not a pointer to the original tuple.
373	*
374	* state opaque state as returned by begin_heap_rewrite
375	* old_tuple original tuple in the old heap
376	* new_tuple new, rewritten tuple to be inserted to new heap
377	*/
378	void
379	rewrite_heap_tuple(RewriteState state,
380	HeapTuple old_tuple, HeapTuple new_tuple)
381	{
382	MemoryContext old_cxt;
383	ItemPointerData old_tid;
384	TidHashKey hashkey;
385	bool found;
386	bool free_new;
387
388	old_cxt = MemoryContextSwitchTo(state->rs_cxt);
389
390	/*
391	* Copy the original tuple's visibility information into new_tuple.
392	*
393	* XXX we might later need to copy some t_infomask2 bits, too? Right now,
394	* we intentionally clear the HOT status bits.
395	*/
396	memcpy(&new_tuple->t_data->t_choice.t_heap,
397	&old_tuple->t_data->t_choice.t_heap,
398	sizeof(HeapTupleFields));
399
400	new_tuple->t_data->t_infomask &= ~HEAP_XACT_MASK;
401	new_tuple->t_data->t_infomask2 &= ~HEAP2_XACT_MASK;
402	new_tuple->t_data->t_infomask \|=
403	old_tuple->t_data->t_infomask & HEAP_XACT_MASK;
404
405	/*
406	* While we have our hands on the tuple, we may as well freeze any
407	* eligible xmin or xmax, so that future VACUUM effort can be saved.
408	*/
409	heap_freeze_tuple(new_tuple->t_data,
410	state->rs_old_rel->rd_rel->relfrozenxid,
411	state->rs_old_rel->rd_rel->relminmxid,
412	state->rs_freeze_xid,
413	state->rs_cutoff_multi);
414
415	/*
416	* Invalid ctid means that ctid should point to the tuple itself. We'll
417	* override it later if the tuple is part of an update chain.
418	*/
419	ItemPointerSetInvalid(&new_tuple->t_data->t_ctid);
420
421	/*
422	* If the tuple has been updated, check the old-to-new mapping hash table.
423	*/
424	if (!((old_tuple->t_data->t_infomask & HEAP_XMAX_INVALID) \|\|
425	HeapTupleHeaderIsOnlyLocked(old_tuple->t_data)) &&
426	!HeapTupleHeaderIndicatesMovedPartitions(old_tuple->t_data) &&
427	!(ItemPointerEquals(&(old_tuple->t_self),
428	&(old_tuple->t_data->t_ctid))))
429	{
430	OldToNewMapping mapping;
431
432	memset(&hashkey, `0`, sizeof(hashkey));
433	hashkey.xmin = HeapTupleHeaderGetUpdateXid(old_tuple->t_data);
434	hashkey.tid = old_tuple->t_data->t_ctid;
435
436	mapping = (OldToNewMapping)
437	hash_search(state->rs_old_new_tid_map, &hashkey,
438	HASH_FIND, NULL);
439
440	if (mapping != NULL)
441	{
442	/*
443	* We've already copied the tuple that t_ctid points to, so we can
444	* set the ctid of this tuple to point to the new location, and
445	* insert it right away.
446	*/
447	new_tuple->t_data->t_ctid = mapping->new_tid;
448
449	/ We don't need the mapping entry anymore /
450	hash_search(state->rs_old_new_tid_map, &hashkey,
451	HASH_REMOVE, &found);
452	Assert(found);
453	}
454	else
455	{
456	/*
457	* We haven't seen the tuple t_ctid points to yet. Stash this
458	* tuple into unresolved_tups to be written later.
459	*/
460	UnresolvedTup unresolved;
461
462	unresolved = hash_search(state->rs_unresolved_tups, &hashkey,
463	HASH_ENTER, &found);
464	Assert(!found);
465
466	unresolved->old_tid = old_tuple->t_self;
467	unresolved->tuple = heap_copytuple(new_tuple);
468
469	/*
470	* We can't do anything more now, since we don't know where the
471	* tuple will be written.
472	*/
473	MemoryContextSwitchTo(old_cxt);
474	return;
475	}
476	}
477
478	/*
479	* Now we will write the tuple, and then check to see if it is the B tuple
480	* in any new or known pair. When we resolve a known pair, we will be
481	* able to write that pair's A tuple, and then we have to check if it
482	* resolves some other pair. Hence, we need a loop here.
483	*/
484	old_tid = old_tuple->t_self;
485	free_new = false;
486
487	for (;;)
488	{
489	ItemPointerData new_tid;
490
491	/ Insert the tuple and find out where it's put in new_heap /
492	raw_heap_insert(state, new_tuple);
493	new_tid = new_tuple->t_self;
494
495	logical_rewrite_heap_tuple(state, old_tid, new_tuple);
496
497	/*
498	* If the tuple is the updated version of a row, and the prior version
499	* wouldn't be DEAD yet, then we need to either resolve the prior
500	* version (if it's waiting in rs_unresolved_tups), or make an entry
501	* in rs_old_new_tid_map (so we can resolve it when we do see it). The
502	* previous tuple's xmax would equal this one's xmin, so it's
503	* RECENTLY_DEAD if and only if the xmin is not before OldestXmin.
504	*/
505	if ((new_tuple->t_data->t_infomask & HEAP_UPDATED) &&
506	!TransactionIdPrecedes(HeapTupleHeaderGetXmin(new_tuple->t_data),
507	state->rs_oldest_xmin))
508	{
509	/*
510	* Okay, this is B in an update pair. See if we've seen A.
511	*/
512	UnresolvedTup unresolved;
513
514	memset(&hashkey, `0`, sizeof(hashkey));
515	hashkey.xmin = HeapTupleHeaderGetXmin(new_tuple->t_data);
516	hashkey.tid = old_tid;
517
518	unresolved = hash_search(state->rs_unresolved_tups, &hashkey,
519	HASH_FIND, NULL);
520
521	if (unresolved != NULL)
522	{
523	/*
524	* We have seen and memorized the previous tuple already. Now
525	* that we know where we inserted the tuple its t_ctid points
526	* to, fix its t_ctid and insert it to the new heap.
527	*/
528	if (free_new)
529	heap_freetuple(new_tuple);
530	new_tuple = unresolved->tuple;
531	free_new = true;
532	old_tid = unresolved->old_tid;
533	new_tuple->t_data->t_ctid = new_tid;
534
535	/*
536	* We don't need the hash entry anymore, but don't free its
537	* tuple just yet.
538	*/
539	hash_search(state->rs_unresolved_tups, &hashkey,
540	HASH_REMOVE, &found);
541	Assert(found);
542
543	/ loop back to insert the previous tuple in the chain /
544	continue;
545	}
546	else
547	{
548	/*
549	* Remember the new tid of this tuple. We'll use it to set the
550	* ctid when we find the previous tuple in the chain.
551	*/
552	OldToNewMapping mapping;
553
554	mapping = hash_search(state->rs_old_new_tid_map, &hashkey,
555	HASH_ENTER, &found);
556	Assert(!found);
557
558	mapping->new_tid = new_tid;
559	}
560	}
561
562	/ Done with this (chain of) tuples, for now /
563	if (free_new)
564	heap_freetuple(new_tuple);
565	break;
566	}
567
568	MemoryContextSwitchTo(old_cxt);
569	}
570
571	/*
572	* Register a dead tuple with an ongoing rewrite. Dead tuples are not
573	* copied to the new table, but we still make note of them so that we
574	* can release some resources earlier.
575	*
576	* Returns true if a tuple was removed from the unresolved_tups table.
577	* This indicates that that tuple, previously thought to be "recently dead",
578	* is now known really dead and won't be written to the output.
579	*/
580	bool
581	rewrite_heap_dead_tuple(RewriteState state, HeapTuple old_tuple)
582	{
583	/*
584	* If we have already seen an earlier tuple in the update chain that
585	* points to this tuple, let's forget about that earlier tuple. It's in
586	* fact dead as well, our simple xmax < OldestXmin test in
587	* HeapTupleSatisfiesVacuum just wasn't enough to detect it. It happens
588	* when xmin of a tuple is greater than xmax, which sounds
589	* counter-intuitive but is perfectly valid.
590	*
591	* We don't bother to try to detect the situation the other way round,
592	* when we encounter the dead tuple first and then the recently dead one
593	* that points to it. If that happens, we'll have some unmatched entries
594	* in the UnresolvedTups hash table at the end. That can happen anyway,
595	* because a vacuum might have removed the dead tuple in the chain before
596	* us.
597	*/
598	UnresolvedTup unresolved;
599	TidHashKey hashkey;
600	bool found;
601
602	memset(&hashkey, `0`, sizeof(hashkey));
603	hashkey.xmin = HeapTupleHeaderGetXmin(old_tuple->t_data);
604	hashkey.tid = old_tuple->t_self;
605
606	unresolved = hash_search(state->rs_unresolved_tups, &hashkey,
607	HASH_FIND, NULL);
608
609	if (unresolved != NULL)
610	{
611	/ Need to free the contained tuple as well as the hashtable entry /
612	heap_freetuple(unresolved->tuple);
613	hash_search(state->rs_unresolved_tups, &hashkey,
614	HASH_REMOVE, &found);
615	Assert(found);
616	return true;
617	}
618
619	return false;
620	}
621
622	/*
623	* Insert a tuple to the new relation. This has to track heap_insert
624	* and its subsidiary functions!
625	*
626	* t_self of the tuple is set to the new TID of the tuple. If t_ctid of the
627	* tuple is invalid on entry, it's replaced with the new TID as well (in
628	* the inserted data only, not in the caller's copy).
629	*/
630	static void
631	raw_heap_insert(RewriteState state, HeapTuple tup)
632	{
633	Page page = state->rs_buffer;
634	Size pageFreeSpace,
635	saveFreeSpace;
636	Size len;
637	OffsetNumber newoff;
638	HeapTuple heaptup;
639
640	/*
641	* If the new tuple is too big for storage or contains already toasted
642	* out-of-line attributes from some other relation, invoke the toaster.
643	*
644	* Note: below this point, heaptup is the data we actually intend to store
645	* into the relation; tup is the caller's original untoasted data.
646	*/
647	if (state->rs_new_rel->rd_rel->relkind == RELKIND_TOASTVALUE)
648	{
649	/ toast table entries should never be recursively toasted /
650	Assert(!HeapTupleHasExternal(tup));
651	heaptup = tup;
652	}
653	else if (HeapTupleHasExternal(tup) \|\| tup->t_len > TOAST_TUPLE_THRESHOLD)
654	{
655	int options = HEAP_INSERT_SKIP_FSM;
656
657	if (!state->rs_use_wal)
658	options \|= HEAP_INSERT_SKIP_WAL;
659
660	/*
661	* While rewriting the heap for VACUUM FULL / CLUSTER, make sure data
662	* for the TOAST table are not logically decoded. The main heap is
663	* WAL-logged as XLOG FPI records, which are not logically decoded.
664	*/
665	options \|= HEAP_INSERT_NO_LOGICAL;
666
667	heaptup = toast_insert_or_update(state->rs_new_rel, tup, NULL,
668	options);
669	}
670	else
671	heaptup = tup;
672
673	len = MAXALIGN(heaptup->t_len); / be conservative /
674
675	/*
676	* If we're gonna fail for oversize tuple, do it right away
677	*/
678	if (len > MaxHeapTupleSize)
679	ereport(ERROR,
680	(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
681	errmsg("row is too big: size %zu, maximum size %zu",
682	len, MaxHeapTupleSize)));
683
684	/ Compute desired extra freespace due to fillfactor option /
685	saveFreeSpace = RelationGetTargetPageFreeSpace(state->rs_new_rel,
686	HEAP_DEFAULT_FILLFACTOR);
687
688	/ Now we can check to see if there's enough free space already. /
689	if (state->rs_buffer_valid)
690	{
691	pageFreeSpace = PageGetHeapFreeSpace(page);
692
693	if (len + saveFreeSpace > pageFreeSpace)
694	{
695	/ Doesn't fit, so write out the existing page /
696
697	/ XLOG stuff /
698	if (state->rs_use_wal)
699	log_newpage(&state->rs_new_rel->rd_node,
700	MAIN_FORKNUM,
701	state->rs_blockno,
702	page,
703	true);
704
705	/*
706	* Now write the page. We say isTemp = true even if it's not a
707	* temp table, because there's no need for smgr to schedule an
708	* fsync for this write; we'll do it ourselves in
709	* end_heap_rewrite.
710	*/
711	RelationOpenSmgr(state->rs_new_rel);
712
713	PageSetChecksumInplace(page, state->rs_blockno);
714
715	smgrextend(state->rs_new_rel->rd_smgr, MAIN_FORKNUM,
716	state->rs_blockno, (char *) page, true);
717
718	state->rs_blockno++;
719	state->rs_buffer_valid = false;
720	}
721	}
722
723	if (!state->rs_buffer_valid)
724	{
725	/ Initialize a new empty page /
726	PageInit(page, BLCKSZ, `0`);
727	state->rs_buffer_valid = true;
728	}
729
730	/ And now we can insert the tuple into the page /
731	newoff = PageAddItem(page, (Item) heaptup->t_data, heaptup->t_len,
732	InvalidOffsetNumber, false, true);
733	if (newoff == InvalidOffsetNumber)
734	elog(ERROR, "failed to add tuple");
735
736	/ Update caller's t_self to the actual position where it was stored /
737	ItemPointerSet(&(tup->t_self), state->rs_blockno, newoff);
738
739	/*
740	* Insert the correct position into CTID of the stored tuple, too, if the
741	* caller didn't supply a valid CTID.
742	*/
743	if (!ItemPointerIsValid(&tup->t_data->t_ctid))
744	{
745	ItemId newitemid;
746	HeapTupleHeader onpage_tup;
747
748	newitemid = PageGetItemId(page, newoff);
749	onpage_tup = (HeapTupleHeader) PageGetItem(page, newitemid);
750
751	onpage_tup->t_ctid = tup->t_self;
752	}
753
754	/ If heaptup is a private copy, release it. /
755	if (heaptup != tup)
756	heap_freetuple(heaptup);
757	}
758
/* ------------------------------------------------------------------------
 * Logical rewrite support
 *
 * When doing logical decoding - which relies on using cmin/cmax of catalog
 * tuples, via xl_heap_new_cid records - heap rewrites have to log enough
 * information to allow the decoding backend to update its internal mapping
 * of (relfilenode,ctid) => (cmin, cmax) to be correct for the rewritten heap.
 *
 * For that, every time we find a tuple that's been modified in a catalog
 * relation within the xmin horizon of any decoding slot, we log a mapping
 * from the old to the new location.
 *
 * To deal with rewrites that abort, the filename of a mapping file contains
 * the xid of the transaction performing the rewrite, which then can be
 * checked before being read in.
 *
 * For efficiency we don't immediately spill every single mapping for a
 * row to disk but only do so in batches when we've collected several of them
 * in memory or when end_heap_rewrite() has been called.
 *
 * Crash-Safety: This module diverts from the usual patterns of doing WAL
 * since it cannot rely on checkpoint flushing out all buffers and thus
 * waiting for exclusive locks on buffers. Usually the XLogInsert() covering
 * buffer modifications is performed while the buffer(s) that are being
 * modified are exclusively locked guaranteeing that both the WAL record and
 * the modified heap are on either side of the checkpoint. But since the
 * mapping files we log aren't in shared_buffers that interlock doesn't work.
 *
 * Instead we simply write the mapping files out to disk, *before* the
 * XLogInsert() is performed. That guarantees that either the XLogInsert() is
 * inserted after the checkpoint's redo pointer or that the checkpoint (via
 * LogicalRewriteHeapCheckpoint()) has flushed the (partial) mapping file to
 * disk. That leaves the tail end that has not yet been flushed open to
 * corruption, which is solved by including the current offset in the
 * xl_heap_rewrite_mapping records and truncating the mapping file to it
 * during replay. Every time a rewrite is finished all generated mapping files
 * are synced to disk.
 *
 * Note that if we were only concerned about crash safety we wouldn't have to
 * deal with WAL logging at all - an fsync() at the end of a rewrite would be
 * sufficient for crash safety. Any mapping that hasn't been safely flushed to
 * disk has to be by an aborted (explicitly or via a crash) transaction and is
 * ignored by virtue of the xid in its name being subject to a
 * TransactionDidCommit() check. But we want to support having standbys via
 * physical replication, both for availability and to do logical decoding
 * there.
 * ------------------------------------------------------------------------
 */
807
808	/*
809	* Do preparations for logging logical mappings during a rewrite if
810	* necessary. If we detect that we don't need to log anything we'll prevent
811	* any further action by the various logical rewrite functions.
812	*/
813	static void
814	logical_begin_heap_rewrite(RewriteState state)
815	{
816	HASHCTL hash_ctl;
817	TransactionId logical_xmin;
818
819	/*
820	* We only need to persist these mappings if the rewritten table can be
821	* accessed during logical decoding, if not, we can skip doing any
822	* additional work.
823	*/
824	state->rs_logical_rewrite =
825	RelationIsAccessibleInLogicalDecoding(state->rs_old_rel);
826
827	if (!state->rs_logical_rewrite)
828	return;
829
830	ProcArrayGetReplicationSlotXmin(NULL, &logical_xmin);
831
832	/*
833	* If there are no logical slots in progress we don't need to do anything,
834	* there cannot be any remappings for relevant rows yet. The relation's
835	* lock protects us against races.
836	*/
837	if (logical_xmin == InvalidTransactionId)
838	{
839	state->rs_logical_rewrite = false;
840	return;
841	}
842
843	state->rs_logical_xmin = logical_xmin;
844	state->rs_begin_lsn = GetXLogInsertRecPtr();
845	state->rs_num_rewrite_mappings = `0`;
846
847	memset(&hash_ctl, `0`, sizeof(hash_ctl));
848	hash_ctl.keysize = sizeof(TransactionId);
849	hash_ctl.entrysize = sizeof(RewriteMappingFile);
850	hash_ctl.hcxt = state->rs_cxt;
851
852	state->rs_logical_mappings =
853	hash_create("Logical rewrite mapping",
854	`128`, / arbitrary initial size /
855	&hash_ctl,
856	HASH_ELEM \| HASH_BLOBS \| HASH_CONTEXT);
857	}
858
859	/*
860	* Flush all logical in-memory mappings to disk, but don't fsync them yet.
861	*/
862	static void
863	logical_heap_rewrite_flush_mappings(RewriteState state)
864	{
865	HASH_SEQ_STATUS seq_status;
866	RewriteMappingFile *src;
867	dlist_mutable_iter iter;
868
869	Assert(state->rs_logical_rewrite);
870
871	/ no logical rewrite in progress, no need to iterate over mappings /
872	if (state->rs_num_rewrite_mappings == `0`)
873	return;
874
875	elog(DEBUG1, "flushing %u logical rewrite mapping entries",
876	state->rs_num_rewrite_mappings);
877
878	hash_seq_init(&seq_status, state->rs_logical_mappings);
879	while ((src = (RewriteMappingFile *) hash_seq_search(&seq_status)) != NULL)
880	{
881	char *waldata;
882	char *waldata_start;
883	xl_heap_rewrite_mapping xlrec;
884	Oid dboid;
885	uint32 len;
886	int written;
887
888	/ this file hasn't got any new mappings /
889	if (src->num_mappings == `0`)
890	continue;
891
892	if (state->rs_old_rel->rd_rel->relisshared)
893	dboid = InvalidOid;
894	else
895	dboid = MyDatabaseId;
896
897	xlrec.num_mappings = src->num_mappings;
898	xlrec.mapped_rel = RelationGetRelid(state->rs_old_rel);
899	xlrec.mapped_xid = src->xid;
900	xlrec.mapped_db = dboid;
901	xlrec.offset = src->off;
902	xlrec.start_lsn = state->rs_begin_lsn;
903
904	/ write all mappings consecutively /
905	len = src->num_mappings * sizeof(LogicalRewriteMappingData);
906	waldata_start = waldata = palloc(len);
907
908	/*
909	* collect data we need to write out, but don't modify ondisk data yet
910	*/
911	dlist_foreach_modify(iter, &src->mappings)
912	{
913	RewriteMappingDataEntry *pmap;
914
915	pmap = dlist_container(RewriteMappingDataEntry, node, iter.cur);
916
917	memcpy(waldata, &pmap->map, sizeof(pmap->map));
918	waldata += sizeof(pmap->map);
919
920	/ remove from the list and free /
921	dlist_delete(&pmap->node);
922	pfree(pmap);
923
924	/ update bookkeeping /
925	state->rs_num_rewrite_mappings--;
926	src->num_mappings--;
927	}
928
929	Assert(src->num_mappings == `0`);
930	Assert(waldata == waldata_start + len);
931
932	/*
933	* Note that we deviate from the usual WAL coding practices here,
934	* check the above "Logical rewrite support" comment for reasoning.
935	*/
936	written = FileWrite(src->vfd, waldata_start, len, src->off,
937	WAIT_EVENT_LOGICAL_REWRITE_WRITE);
938	if (written != len)
939	ereport(ERROR,
940	(errcode_for_file_access(),
941	errmsg("could not write to file \"%s\", wrote %d of %d: %m", src->path,
942	written, len)));
943	src->off += len;
944
945	XLogBeginInsert();
946	XLogRegisterData((char ) (&xlrec), sizeof*(xlrec));
947	XLogRegisterData(waldata_start, len);
948
949	/ write xlog record /
950	XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_REWRITE);
951
952	pfree(waldata_start);
953	}
954	Assert(state->rs_num_rewrite_mappings == `0`);
955	}
956
957	/*
958	* Logical remapping part of end_heap_rewrite().
959	*/
960	static void
961	logical_end_heap_rewrite(RewriteState state)
962	{
963	HASH_SEQ_STATUS seq_status;
964	RewriteMappingFile *src;
965
966	/ done, no logical rewrite in progress /
967	if (!state->rs_logical_rewrite)
968	return;
969
970	/ writeout remaining in-memory entries /
971	if (state->rs_num_rewrite_mappings > `0`)
972	logical_heap_rewrite_flush_mappings(state);
973
974	/ Iterate over all mappings we have written and fsync the files. /
975	hash_seq_init(&seq_status, state->rs_logical_mappings);
976	while ((src = (RewriteMappingFile *) hash_seq_search(&seq_status)) != NULL)
977	{
978	if (FileSync(src->vfd, WAIT_EVENT_LOGICAL_REWRITE_SYNC) != `0`)
979	ereport(data_sync_elevel(ERROR),
980	(errcode_for_file_access(),
981	errmsg("could not fsync file \"%s\": %m", src->path)));
982	FileClose(src->vfd);
983	}
984	/ memory context cleanup will deal with the rest /
985	}
986
987	/*
988	* Log a single (old->new) mapping for 'xid'.
989	*/
990	static void
991	logical_rewrite_log_mapping(RewriteState state, TransactionId xid,
992	LogicalRewriteMappingData *map)
993	{
994	RewriteMappingFile *src;
995	RewriteMappingDataEntry *pmap;
996	Oid relid;
997	bool found;
998
999	relid = RelationGetRelid(state->rs_old_rel);
1000
1001	/ look for existing mappings for this 'mapped' xid /
1002	src = hash_search(state->rs_logical_mappings, &xid,
1003	HASH_ENTER, &found);
1004
1005	/*
1006	* We haven't yet had the need to map anything for this xid, create
1007	* per-xid data structures.
1008	*/
1009	if (!found)
1010	{
1011	char path[MAXPGPATH];
1012	Oid dboid;
1013
1014	if (state->rs_old_rel->rd_rel->relisshared)
1015	dboid = InvalidOid;
1016	else
1017	dboid = MyDatabaseId;
1018
1019	snprintf(path, MAXPGPATH,
1020	"pg_logical/mappings/" LOGICAL_REWRITE_FORMAT,
1021	dboid, relid,
1022	(uint32) (state->rs_begin_lsn >> `32`),
1023	(uint32) state->rs_begin_lsn,
1024	xid, GetCurrentTransactionId());
1025
1026	dlist_init(&src->mappings);
1027	src->num_mappings = `0`;
1028	src->off = `0`;
1029	memcpy(src->path, path, sizeof(path));
1030	src->vfd = PathNameOpenFile(path,
1031	O_CREAT \| O_EXCL \| O_WRONLY \| PG_BINARY);
1032	if (src->vfd < `0`)
1033	ereport(ERROR,
1034	(errcode_for_file_access(),
1035	errmsg("could not create file \"%s\": %m", path)));
1036	}
1037
1038	pmap = MemoryContextAlloc(state->rs_cxt,
1039	sizeof(RewriteMappingDataEntry));
1040	memcpy(&pmap->map, map, sizeof(LogicalRewriteMappingData));
1041	dlist_push_tail(&src->mappings, &pmap->node);
1042	src->num_mappings++;
1043	state->rs_num_rewrite_mappings++;
1044
1045	/*
1046	* Write out buffer every time we've too many in-memory entries across all
1047	* mapping files.
1048	*/
1049	if (state->rs_num_rewrite_mappings >= `1000` / arbitrary number / )
1050	logical_heap_rewrite_flush_mappings(state);
1051	}
1052
1053	/*
1054	* Perform logical remapping for a tuple that's mapped from old_tid to
1055	* new_tuple->t_self by rewrite_heap_tuple() if necessary for the tuple.
1056	*/
1057	static void
1058	logical_rewrite_heap_tuple(RewriteState state, ItemPointerData old_tid,
1059	HeapTuple new_tuple)
1060	{
1061	ItemPointerData new_tid = new_tuple->t_self;
1062	TransactionId cutoff = state->rs_logical_xmin;
1063	TransactionId xmin;
1064	TransactionId xmax;
1065	bool do_log_xmin = false;
1066	bool do_log_xmax = false;
1067	LogicalRewriteMappingData map;
1068
1069	/ no logical rewrite in progress, we don't need to log anything /
1070	if (!state->rs_logical_rewrite)
1071	return;
1072
1073	xmin = HeapTupleHeaderGetXmin(new_tuple->t_data);
1074	/ use GetUpdateXid to correctly deal with multixacts /*
1075	xmax = HeapTupleHeaderGetUpdateXid(new_tuple->t_data);
1076
1077	/*
1078	* Log the mapping iff the tuple has been created recently.
1079	*/
1080	if (TransactionIdIsNormal(xmin) && !TransactionIdPrecedes(xmin, cutoff))
1081	do_log_xmin = true;
1082
1083	if (!TransactionIdIsNormal(xmax))
1084	{
1085	/*
1086	* no xmax is set, can't have any permanent ones, so this check is
1087	* sufficient
1088	*/
1089	}
1090	else if (HEAP_XMAX_IS_LOCKED_ONLY(new_tuple->t_data->t_infomask))
1091	{
1092	/ only locked, we don't care /
1093	}
1094	else if (!TransactionIdPrecedes(xmax, cutoff))
1095	{
1096	/ tuple has been deleted recently, log /
1097	do_log_xmax = true;
1098	}
1099
1100	/ if neither needs to be logged, we're done /
1101	if (!do_log_xmin && !do_log_xmax)
1102	return;
1103
1104	/ fill out mapping information /
1105	map.old_node = state->rs_old_rel->rd_node;
1106	map.old_tid = old_tid;
1107	map.new_node = state->rs_new_rel->rd_node;
1108	map.new_tid = new_tid;
1109
1110	/ ---*
1111	* Now persist the mapping for the individual xids that are affected. We
1112	* need to log for both xmin and xmax if they aren't the same transaction
1113	* since the mapping files are per "affected" xid.
1114	* We don't muster all that much effort detecting whether xmin and xmax
1115	* are actually the same transaction, we just check whether the xid is the
1116	* same disregarding subtransactions. Logging too much is relatively
1117	* harmless and we could never do the check fully since subtransaction
1118	* data is thrown away during restarts.
1119	* ---
1120	*/
1121	if (do_log_xmin)
1122	logical_rewrite_log_mapping(state, xmin, &map);
1123	/ separately log mapping for xmax unless it'd be redundant /
1124	if (do_log_xmax && !TransactionIdEquals(xmin, xmax))
1125	logical_rewrite_log_mapping(state, xmax, &map);
1126	}
1127
1128	/*
1129	* Replay XLOG_HEAP2_REWRITE records
1130	*/
1131	void
1132	heap_xlog_logical_rewrite(XLogReaderState *r)
1133	{
1134	char path[MAXPGPATH];
1135	int fd;
1136	xl_heap_rewrite_mapping *xlrec;
1137	uint32 len;
1138	char *data;
1139
1140	xlrec = (xl_heap_rewrite_mapping *) XLogRecGetData(r);
1141
1142	snprintf(path, MAXPGPATH,
1143	"pg_logical/mappings/" LOGICAL_REWRITE_FORMAT,
1144	xlrec->mapped_db, xlrec->mapped_rel,
1145	(uint32) (xlrec->start_lsn >> `32`),
1146	(uint32) xlrec->start_lsn,
1147	xlrec->mapped_xid, XLogRecGetXid(r));
1148
1149	fd = OpenTransientFile(path,
1150	O_CREAT \| O_WRONLY \| PG_BINARY);
1151	if (fd < `0`)
1152	ereport(ERROR,
1153	(errcode_for_file_access(),
1154	errmsg("could not create file \"%s\": %m", path)));
1155
1156	/*
1157	* Truncate all data that's not guaranteed to have been safely fsynced (by
1158	* previous record or by the last checkpoint).
1159	*/
1160	pgstat_report_wait_start(WAIT_EVENT_LOGICAL_REWRITE_TRUNCATE);
1161	if (ftruncate(fd, xlrec->offset) != `0`)
1162	ereport(ERROR,
1163	(errcode_for_file_access(),
1164	errmsg("could not truncate file \"%s\" to %u: %m",
1165	path, (uint32) xlrec->offset)));
1166	pgstat_report_wait_end();
1167
1168	/ now seek to the position we want to write our data to /
1169	if (lseek(fd, xlrec->offset, SEEK_SET) != xlrec->offset)
1170	ereport(ERROR,
1171	(errcode_for_file_access(),
1172	errmsg("could not seek to end of file \"%s\": %m",
1173	path)));
1174
1175	data = XLogRecGetData(r) + sizeof(*xlrec);
1176
1177	len = xlrec->num_mappings * sizeof(LogicalRewriteMappingData);
1178
1179	/ write out tail end of mapping file (again) /
1180	errno = `0`;
1181	pgstat_report_wait_start(WAIT_EVENT_LOGICAL_REWRITE_MAPPING_WRITE);
1182	if (write(fd, data, len) != len)
1183	{
1184	/ if write didn't set errno, assume problem is no disk space /
1185	if (errno == `0`)
1186	errno = ENOSPC;
1187	ereport(ERROR,
1188	(errcode_for_file_access(),
1189	errmsg("could not write to file \"%s\": %m", path)));
1190	}
1191	pgstat_report_wait_end();
1192
1193	/*
1194	* Now fsync all previously written data. We could improve things and only
1195	* do this for the last write to a file, but the required bookkeeping
1196	* doesn't seem worth the trouble.
1197	*/
1198	pgstat_report_wait_start(WAIT_EVENT_LOGICAL_REWRITE_MAPPING_SYNC);
1199	if (pg_fsync(fd) != `0`)
1200	ereport(data_sync_elevel(ERROR),
1201	(errcode_for_file_access(),
1202	errmsg("could not fsync file \"%s\": %m", path)));
1203	pgstat_report_wait_end();
1204
1205	if (CloseTransientFile(fd))
1206	ereport(ERROR,
1207	(errcode_for_file_access(),
1208	errmsg("could not close file \"%s\": %m", path)));
1209	}
1210
1211	/ ---*
1212	* Perform a checkpoint for logical rewrite mappings
1213	*
1214	* This serves two tasks:
1215	* 1) Remove all mappings not needed anymore based on the logical restart LSN
1216	* 2) Flush all remaining mappings to disk, so that replay after a checkpoint
1217	* only has to deal with the parts of a mapping that have been written out
1218	* after the checkpoint started.
1219	* ---
1220	*/
1221	void
1222	CheckPointLogicalRewriteHeap(void)
1223	{
1224	XLogRecPtr cutoff;
1225	XLogRecPtr redo;
1226	DIR *mappings_dir;
1227	struct dirent *mapping_de;
1228	char path[MAXPGPATH + `20`];
1229
1230	/*
1231	* We start of with a minimum of the last redo pointer. No new decoding
1232	* slot will start before that, so that's a safe upper bound for removal.
1233	*/
1234	redo = GetRedoRecPtr();
1235
1236	/ now check for the restart ptrs from existing slots /
1237	cutoff = ReplicationSlotsComputeLogicalRestartLSN();
1238
1239	/ don't start earlier than the restart lsn /
1240	if (cutoff != InvalidXLogRecPtr && redo < cutoff)
1241	cutoff = redo;
1242
1243	mappings_dir = AllocateDir("pg_logical/mappings");
1244	while ((mapping_de = ReadDir(mappings_dir, "pg_logical/mappings")) != NULL)
1245	{
1246	struct stat statbuf;
1247	Oid dboid;
1248	Oid relid;
1249	XLogRecPtr lsn;
1250	TransactionId rewrite_xid;
1251	TransactionId create_xid;
1252	uint32 hi,
1253	lo;
1254
1255	if (strcmp(mapping_de->d_name, ".") == `0` \|\|
1256	strcmp(mapping_de->d_name, "..") == `0`)
1257	continue;
1258
1259	snprintf(path, sizeof(path), "pg_logical/mappings/%s", mapping_de->d_name);
1260	if (lstat(path, &statbuf) == `0` && !S_ISREG(statbuf.st_mode))
1261	continue;
1262
1263	/ Skip over files that cannot be ours. /
1264	if (strncmp(mapping_de->d_name, "map-", `4`) != `0`)
1265	continue;
1266
1267	if (sscanf(mapping_de->d_name, LOGICAL_REWRITE_FORMAT,
1268	&dboid, &relid, &hi, &lo, &rewrite_xid, &create_xid) != `6`)
1269	elog(ERROR, "could not parse filename \"%s\"", mapping_de->d_name);
1270
1271	lsn = ((uint64) hi) << `32` \| lo;
1272
1273	if (lsn < cutoff \|\| cutoff == InvalidXLogRecPtr)
1274	{
1275	elog(DEBUG1, "removing logical rewrite file \"%s\"", path);
1276	if (unlink(path) < `0`)
1277	ereport(ERROR,
1278	(errcode_for_file_access(),
1279	errmsg("could not remove file \"%s\": %m", path)));
1280	}
1281	else
1282	{
1283	int fd = OpenTransientFile(path, O_RDONLY \| PG_BINARY);
1284
1285	/*
1286	* The file cannot vanish due to concurrency since this function
1287	* is the only one removing logical mappings and it's run while
1288	* CheckpointLock is held exclusively.
1289	*/
1290	if (fd < `0`)
1291	ereport(ERROR,
1292	(errcode_for_file_access(),
1293	errmsg("could not open file \"%s\": %m", path)));
1294
1295	/*
1296	* We could try to avoid fsyncing files that either haven't
1297	* changed or have only been created since the checkpoint's start,
1298	* but it's currently not deemed worth the effort.
1299	*/
1300	pgstat_report_wait_start(WAIT_EVENT_LOGICAL_REWRITE_CHECKPOINT_SYNC);
1301	if (pg_fsync(fd) != `0`)
1302	ereport(data_sync_elevel(ERROR),
1303	(errcode_for_file_access(),
1304	errmsg("could not fsync file \"%s\": %m", path)));
1305	pgstat_report_wait_end();
1306
1307	if (CloseTransientFile(fd))
1308	ereport(ERROR,
1309	(errcode_for_file_access(),
1310	errmsg("could not close file \"%s\": %m", path)));
1311	}
1312	}
1313	FreeDir(mappings_dir);
1314	}
1315

Browse the source code of PostgreSQL/src/backend/access/heap/rewriteheap.c