1/*-------------------------------------------------------------------------
2 *
3 * heapam.c
4 * heap access method code
5 *
6 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 *
10 * IDENTIFICATION
11 * src/backend/access/heap/heapam.c
12 *
13 *
14 * INTERFACE ROUTINES
15 * heap_beginscan - begin relation scan
16 * heap_rescan - restart a relation scan
17 * heap_endscan - end relation scan
18 * heap_getnext - retrieve next tuple in scan
19 * heap_fetch - retrieve tuple with given tid
20 * heap_insert - insert tuple into a relation
21 * heap_multi_insert - insert multiple tuples into a relation
22 * heap_delete - delete a tuple from a relation
23 * heap_update - replace a tuple in a relation with another tuple
24 * heap_sync - sync heap, for when no WAL has been written
25 *
26 * NOTES
27 * This file contains the heap_ routines which implement
28 * the POSTGRES heap access method used for all POSTGRES
29 * relations.
30 *
31 *-------------------------------------------------------------------------
32 */
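/*
 * Typical use of the scan interface routines listed above (an illustrative
 * sketch only; real callers normally go through the table AM layer, and
 * relation open/close and snapshot management are omitted here):
 *
 *		TableScanDesc scan;
 *		HeapTuple	tuple;
 *
 *		scan = heap_beginscan(rel, GetActiveSnapshot(), 0, NULL, NULL,
 *							  SO_TYPE_SEQSCAN | SO_ALLOW_STRAT |
 *							  SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE);
 *		while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
 *		{
 *			... process tuple; it is only valid until the next fetch ...
 *		}
 *		heap_endscan(scan);
 */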
33#include "postgres.h"
34
35#include "access/bufmask.h"
36#include "access/genam.h"
37#include "access/heapam.h"
38#include "access/heapam_xlog.h"
39#include "access/hio.h"
40#include "access/multixact.h"
41#include "access/parallel.h"
42#include "access/relscan.h"
43#include "access/sysattr.h"
44#include "access/tableam.h"
45#include "access/transam.h"
46#include "access/tuptoaster.h"
47#include "access/valid.h"
48#include "access/visibilitymap.h"
49#include "access/xact.h"
50#include "access/xlog.h"
51#include "access/xloginsert.h"
52#include "access/xlogutils.h"
53#include "catalog/catalog.h"
54#include "miscadmin.h"
55#include "pgstat.h"
56#include "port/atomics.h"
57#include "storage/bufmgr.h"
58#include "storage/freespace.h"
59#include "storage/lmgr.h"
60#include "storage/predicate.h"
61#include "storage/procarray.h"
62#include "storage/smgr.h"
63#include "storage/spin.h"
64#include "storage/standby.h"
65#include "utils/datum.h"
66#include "utils/inval.h"
67#include "utils/lsyscache.h"
68#include "utils/relcache.h"
69#include "utils/snapmgr.h"
70#include "utils/spccache.h"
71
72
73static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup,
74 TransactionId xid, CommandId cid, int options);
75static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf,
76 Buffer newbuf, HeapTuple oldtup,
77 HeapTuple newtup, HeapTuple old_key_tup,
78 bool all_visible_cleared, bool new_all_visible_cleared);
79static Bitmapset *HeapDetermineModifiedColumns(Relation relation,
80 Bitmapset *interesting_cols,
81 HeapTuple oldtup, HeapTuple newtup);
82static bool heap_acquire_tuplock(Relation relation, ItemPointer tid,
83 LockTupleMode mode, LockWaitPolicy wait_policy,
84 bool *have_tuple_lock);
85static void compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
86 uint16 old_infomask2, TransactionId add_to_xmax,
87 LockTupleMode mode, bool is_update,
88 TransactionId *result_xmax, uint16 *result_infomask,
89 uint16 *result_infomask2);
90static TM_Result heap_lock_updated_tuple(Relation rel, HeapTuple tuple,
91 ItemPointer ctid, TransactionId xid,
92 LockTupleMode mode);
93static void GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask,
94 uint16 *new_infomask2);
95static TransactionId MultiXactIdGetUpdateXid(TransactionId xmax,
96 uint16 t_infomask);
97static bool DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask,
98 LockTupleMode lockmode, bool *current_is_member);
99static void MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask,
100 Relation rel, ItemPointer ctid, XLTW_Oper oper,
101 int *remaining);
102static bool ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status,
103 uint16 infomask, Relation rel, int *remaining);
104static XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup);
105static HeapTuple ExtractReplicaIdentity(Relation rel, HeapTuple tup, bool key_modified,
106 bool *copy);
107
108
109/*
110 * Each tuple lock mode has a corresponding heavyweight lock, and one or two
111 * corresponding MultiXactStatuses (one to merely lock tuples, another one to
112 * update them). This table (and the macros below) helps us determine the
113 * heavyweight lock mode and MultiXactStatus values to use for any particular
114 * tuple lock strength.
115 *
116 * Don't look at lockstatus/updstatus directly! Use get_mxact_status_for_lock
117 * instead.
118 */
119static const struct
120{
121 LOCKMODE hwlock;
122 int lockstatus;
123 int updstatus;
124}
125
126 tupleLockExtraInfo[MaxLockTupleMode + 1] =
127{
128 { /* LockTupleKeyShare */
129 AccessShareLock,
130 MultiXactStatusForKeyShare,
131 -1 /* KeyShare does not allow updating tuples */
132 },
133 { /* LockTupleShare */
134 RowShareLock,
135 MultiXactStatusForShare,
136 -1 /* Share does not allow updating tuples */
137 },
138 { /* LockTupleNoKeyExclusive */
139 ExclusiveLock,
140 MultiXactStatusForNoKeyUpdate,
141 MultiXactStatusNoKeyUpdate
142 },
143 { /* LockTupleExclusive */
144 AccessExclusiveLock,
145 MultiXactStatusForUpdate,
146 MultiXactStatusUpdate
147 }
148};
149
150/* Get the LOCKMODE for a given MultiXactStatus */
151#define LOCKMODE_from_mxstatus(status) \
152 (tupleLockExtraInfo[TUPLOCK_from_mxstatus((status))].hwlock)
153
154/*
155 * Acquire heavyweight locks on tuples, using a LockTupleMode strength value.
156 * This is more readable than having every caller translate it to lock.h's
157 * LOCKMODE.
158 */
159#define LockTupleTuplock(rel, tup, mode) \
160 LockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
161#define UnlockTupleTuplock(rel, tup, mode) \
162 UnlockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
163#define ConditionalLockTupleTuplock(rel, tup, mode) \
164 ConditionalLockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
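/*
 * For example (illustration only), LockTupleTuplock(relation, &tup->t_self,
 * LockTupleShare) acquires RowShareLock on that tuple via LockTuple(),
 * per the tupleLockExtraInfo table above, and the matching
 * UnlockTupleTuplock() call releases it again; see heap_acquire_tuplock()
 * for the real usage pattern.
 */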
165
166#ifdef USE_PREFETCH
167/*
168 * heap_compute_xid_horizon_for_tuples and xid_horizon_prefetch_buffer use
169 * this structure to coordinate prefetching activity.
170 */
171typedef struct
172{
173 BlockNumber cur_hblkno;
174 int next_item;
175 int nitems;
176 ItemPointerData *tids;
177} XidHorizonPrefetchState;
178#endif
179
180/*
181 * This table maps each particular MultiXactStatus value to the
182 * corresponding tuple lock strength.
183 */
184static const int MultiXactStatusLock[MaxMultiXactStatus + 1] =
185{
186 LockTupleKeyShare, /* ForKeyShare */
187 LockTupleShare, /* ForShare */
188 LockTupleNoKeyExclusive, /* ForNoKeyUpdate */
189 LockTupleExclusive, /* ForUpdate */
190 LockTupleNoKeyExclusive, /* NoKeyUpdate */
191 LockTupleExclusive /* Update */
192};
193
194/* Get the LockTupleMode for a given MultiXactStatus */
195#define TUPLOCK_from_mxstatus(status) \
196 (MultiXactStatusLock[(status)])
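/*
 * Worked example: TUPLOCK_from_mxstatus(MultiXactStatusForShare) yields
 * LockTupleShare, so LOCKMODE_from_mxstatus() resolves that status to
 * RowShareLock through tupleLockExtraInfo above.
 */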
197
198/* ----------------------------------------------------------------
199 * heap support routines
200 * ----------------------------------------------------------------
201 */
202
203/* ----------------
204 * initscan - scan code common to heap_beginscan and heap_rescan
205 * ----------------
206 */
207static void
208initscan(HeapScanDesc scan, ScanKey key, bool keep_startblock)
209{
210 ParallelBlockTableScanDesc bpscan = NULL;
211 bool allow_strat;
212 bool allow_sync;
213
214 /*
215 * Determine the number of blocks we have to scan.
216 *
217 * It is sufficient to do this once at scan start, since any tuples added
218 * while the scan is in progress will be invisible to my snapshot anyway.
219 * (That is not true when using a non-MVCC snapshot. However, we couldn't
220 * guarantee to return tuples added after scan start anyway, since they
221 * might go into pages we already scanned. To guarantee consistent
222 * results for a non-MVCC snapshot, the caller must hold some higher-level
223 * lock that ensures the interesting tuple(s) won't change.)
224 */
225 if (scan->rs_base.rs_parallel != NULL)
226 {
227 bpscan = (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
228 scan->rs_nblocks = bpscan->phs_nblocks;
229 }
230 else
231 scan->rs_nblocks = RelationGetNumberOfBlocks(scan->rs_base.rs_rd);
232
233 /*
234 * If the table is large relative to NBuffers, use a bulk-read access
235 * strategy and enable synchronized scanning (see syncscan.c). Although
236 * the thresholds for these features could be different, we make them the
237 * same so that there are only two behaviors to tune rather than four.
238 * (However, some callers need to be able to disable one or both of these
239 * behaviors, independently of the size of the table; also there is a GUC
240 * variable that can disable synchronized scanning.)
241 *
242 * Note that table_block_parallelscan_initialize has a very similar test;
243 * if you change this, consider changing that one, too.
244 */
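	/*
	 * As a worked example (assuming the default 8kB block size): with
	 * shared_buffers set to 128MB, NBuffers is 16384, so any relation larger
	 * than 4096 blocks (32MB) is read with the bulk-read strategy and is a
	 * candidate for synchronized scanning.
	 */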
245 if (!RelationUsesLocalBuffers(scan->rs_base.rs_rd) &&
246 scan->rs_nblocks > NBuffers / 4)
247 {
248 allow_strat = (scan->rs_base.rs_flags & SO_ALLOW_STRAT) != 0;
249 allow_sync = (scan->rs_base.rs_flags & SO_ALLOW_SYNC) != 0;
250 }
251 else
252 allow_strat = allow_sync = false;
253
254 if (allow_strat)
255 {
256 /* During a rescan, keep the previous strategy object. */
257 if (scan->rs_strategy == NULL)
258 scan->rs_strategy = GetAccessStrategy(BAS_BULKREAD);
259 }
260 else
261 {
262 if (scan->rs_strategy != NULL)
263 FreeAccessStrategy(scan->rs_strategy);
264 scan->rs_strategy = NULL;
265 }
266
267 if (scan->rs_base.rs_parallel != NULL)
268 {
269 /* For parallel scan, believe whatever ParallelTableScanDesc says. */
270 if (scan->rs_base.rs_parallel->phs_syncscan)
271 scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
272 else
273 scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
274 }
275 else if (keep_startblock)
276 {
277 /*
278 * When rescanning, we want to keep the previous startblock setting,
279 * so that rewinding a cursor doesn't generate surprising results.
280 * Reset the active syncscan setting, though.
281 */
282 if (allow_sync && synchronize_seqscans)
283 scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
284 else
285 scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
286 }
287 else if (allow_sync && synchronize_seqscans)
288 {
289 scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
290 scan->rs_startblock = ss_get_location(scan->rs_base.rs_rd, scan->rs_nblocks);
291 }
292 else
293 {
294 scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
295 scan->rs_startblock = 0;
296 }
297
298 scan->rs_numblocks = InvalidBlockNumber;
299 scan->rs_inited = false;
300 scan->rs_ctup.t_data = NULL;
301 ItemPointerSetInvalid(&scan->rs_ctup.t_self);
302 scan->rs_cbuf = InvalidBuffer;
303 scan->rs_cblock = InvalidBlockNumber;
304
305 /* page-at-a-time fields are always invalid when not rs_inited */
306
307 /*
308 * copy the scan key, if appropriate
309 */
310 if (key != NULL)
311 memcpy(scan->rs_base.rs_key, key, scan->rs_base.rs_nkeys * sizeof(ScanKeyData));
312
313 /*
314 * Currently, we only have a stats counter for sequential heap scans (but
315 * e.g. for bitmap scans the underlying bitmap index scans will be counted,
316 * and for sample scans we update stats for tuple fetches).
317 */
318 if (scan->rs_base.rs_flags & SO_TYPE_SEQSCAN)
319 pgstat_count_heap_scan(scan->rs_base.rs_rd);
320}
321
322/*
323 * heap_setscanlimits - restrict range of a heapscan
324 *
325 * startBlk is the page to start at
326 * numBlks is number of pages to scan (InvalidBlockNumber means "all")
327 */
328void
329heap_setscanlimits(TableScanDesc sscan, BlockNumber startBlk, BlockNumber numBlks)
330{
331 HeapScanDesc scan = (HeapScanDesc) sscan;
332
333 Assert(!scan->rs_inited); /* else too late to change */
334 /* else rs_startblock is significant */
335 Assert(!(scan->rs_base.rs_flags & SO_ALLOW_SYNC));
336
337 /* Check startBlk is valid (but allow case of zero blocks...) */
338 Assert(startBlk == 0 || startBlk < scan->rs_nblocks);
339
340 scan->rs_startblock = startBlk;
341 scan->rs_numblocks = numBlks;
342}
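/*
 * For example (a sketch), a caller that only needs the relation's first
 * block could issue
 *
 *		heap_setscanlimits(sscan, 0, 1);
 *
 * immediately after beginning the scan and before fetching any tuples, with
 * syncscan disabled as the asserts above require.
 */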
343
344/*
345 * heapgetpage - subroutine for heapgettup()
346 *
347 * This routine reads and pins the specified page of the relation.
348 * In page-at-a-time mode it performs additional work, namely determining
349 * which tuples on the page are visible.
350 */
351void
352heapgetpage(TableScanDesc sscan, BlockNumber page)
353{
354 HeapScanDesc scan = (HeapScanDesc) sscan;
355 Buffer buffer;
356 Snapshot snapshot;
357 Page dp;
358 int lines;
359 int ntup;
360 OffsetNumber lineoff;
361 ItemId lpp;
362 bool all_visible;
363
364 Assert(page < scan->rs_nblocks);
365
366 /* release previous scan buffer, if any */
367 if (BufferIsValid(scan->rs_cbuf))
368 {
369 ReleaseBuffer(scan->rs_cbuf);
370 scan->rs_cbuf = InvalidBuffer;
371 }
372
373 /*
374 * Be sure to check for interrupts at least once per page. Checks at
375 * higher code levels won't be able to stop a seqscan that encounters many
376 * pages' worth of consecutive dead tuples.
377 */
378 CHECK_FOR_INTERRUPTS();
379
380 /* read page using selected strategy */
381 scan->rs_cbuf = ReadBufferExtended(scan->rs_base.rs_rd, MAIN_FORKNUM, page,
382 RBM_NORMAL, scan->rs_strategy);
383 scan->rs_cblock = page;
384
385 if (!(scan->rs_base.rs_flags & SO_ALLOW_PAGEMODE))
386 return;
387
388 buffer = scan->rs_cbuf;
389 snapshot = scan->rs_base.rs_snapshot;
390
391 /*
392 * Prune and repair fragmentation for the whole page, if possible.
393 */
394 heap_page_prune_opt(scan->rs_base.rs_rd, buffer);
395
396 /*
397 * We must hold share lock on the buffer content while examining tuple
398 * visibility. Afterwards, however, the tuples we have found to be
399 * visible are guaranteed good as long as we hold the buffer pin.
400 */
401 LockBuffer(buffer, BUFFER_LOCK_SHARE);
402
403 dp = BufferGetPage(buffer);
404 TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, dp);
405 lines = PageGetMaxOffsetNumber(dp);
406 ntup = 0;
407
408 /*
409 * If the all-visible flag indicates that all tuples on the page are
410 * visible to everyone, we can skip the per-tuple visibility tests.
411 *
412 * Note: In hot standby, a tuple that's already visible to all
413 * transactions in the master might still be invisible to a read-only
414 * transaction in the standby. We partly handle this problem by tracking
415 * the minimum xmin of visible tuples as the cut-off XID while marking a
416 * page all-visible on the master and WAL-logging it along with the visibility
417 * map SET operation. In hot standby, we wait for (or abort) all
418 * transactions that potentially cannot see one or more tuples on the
419 * page. That's why index-only scans work fine in hot standby. A crucial
420 * difference between index-only scans and heap scans is that the
421 * index-only scan relies entirely on the visibility map, whereas a heap
422 * scan looks at the page-level PD_ALL_VISIBLE flag. We are not sure if
423 * the page-level flag can be trusted in the same way, because it might
424 * get propagated somehow without being explicitly WAL-logged, e.g. via a
425 * full page write. Until we can prove that beyond doubt, let's check each
426 * tuple for visibility the hard way.
427 */
428 all_visible = PageIsAllVisible(dp) && !snapshot->takenDuringRecovery;
429
430 for (lineoff = FirstOffsetNumber, lpp = PageGetItemId(dp, lineoff);
431 lineoff <= lines;
432 lineoff++, lpp++)
433 {
434 if (ItemIdIsNormal(lpp))
435 {
436 HeapTupleData loctup;
437 bool valid;
438
439 loctup.t_tableOid = RelationGetRelid(scan->rs_base.rs_rd);
440 loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
441 loctup.t_len = ItemIdGetLength(lpp);
442 ItemPointerSet(&(loctup.t_self), page, lineoff);
443
444 if (all_visible)
445 valid = true;
446 else
447 valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
448
449 CheckForSerializableConflictOut(valid, scan->rs_base.rs_rd,
450 &loctup, buffer, snapshot);
451
452 if (valid)
453 scan->rs_vistuples[ntup++] = lineoff;
454 }
455 }
456
457 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
458
459 Assert(ntup <= MaxHeapTuplesPerPage);
460 scan->rs_ntuples = ntup;
461}
462
463/* ----------------
464 * heapgettup - fetch next heap tuple
465 *
466 * Initialize the scan if not already done; then advance to the next
467 * tuple as indicated by "dir"; return the next tuple in scan->rs_ctup,
468 * or set scan->rs_ctup.t_data = NULL if no more tuples.
469 *
470 * dir == NoMovementScanDirection means "re-fetch the tuple indicated
471 * by scan->rs_ctup".
472 *
473 * Note: the reason nkeys/key are passed separately, even though they are
474 * kept in the scan descriptor, is that the caller may not want us to check
475 * the scankeys.
476 *
477 * Note: when we fall off the end of the scan in either direction, we
478 * reset rs_inited. This means that a further request with the same
479 * scan direction will restart the scan, which is a bit odd, but a
480 * request with the opposite scan direction will start a fresh scan
481 * in the proper direction. The latter is required behavior for cursors,
482 * while the former case is generally undefined behavior in Postgres
483 * so we don't care too much.
484 * ----------------
485 */
486static void
487heapgettup(HeapScanDesc scan,
488 ScanDirection dir,
489 int nkeys,
490 ScanKey key)
491{
492 HeapTuple tuple = &(scan->rs_ctup);
493 Snapshot snapshot = scan->rs_base.rs_snapshot;
494 bool backward = ScanDirectionIsBackward(dir);
495 BlockNumber page;
496 bool finished;
497 Page dp;
498 int lines;
499 OffsetNumber lineoff;
500 int linesleft;
501 ItemId lpp;
502
503 /*
504 * calculate next starting lineoff, given scan direction
505 */
506 if (ScanDirectionIsForward(dir))
507 {
508 if (!scan->rs_inited)
509 {
510 /*
511 * return null immediately if relation is empty
512 */
513 if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
514 {
515 Assert(!BufferIsValid(scan->rs_cbuf));
516 tuple->t_data = NULL;
517 return;
518 }
519 if (scan->rs_base.rs_parallel != NULL)
520 {
521 ParallelBlockTableScanDesc pbscan =
522 (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
523
524 table_block_parallelscan_startblock_init(scan->rs_base.rs_rd,
525 pbscan);
526
527 page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
528 pbscan);
529
530 /* Other processes might have already finished the scan. */
531 if (page == InvalidBlockNumber)
532 {
533 Assert(!BufferIsValid(scan->rs_cbuf));
534 tuple->t_data = NULL;
535 return;
536 }
537 }
538 else
539 page = scan->rs_startblock; /* first page */
540 heapgetpage((TableScanDesc) scan, page);
541 lineoff = FirstOffsetNumber; /* first offnum */
542 scan->rs_inited = true;
543 }
544 else
545 {
546 /* continue from previously returned page/tuple */
547 page = scan->rs_cblock; /* current page */
548 lineoff = /* next offnum */
549 OffsetNumberNext(ItemPointerGetOffsetNumber(&(tuple->t_self)));
550 }
551
552 LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
553
554 dp = BufferGetPage(scan->rs_cbuf);
555 TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, dp);
556 lines = PageGetMaxOffsetNumber(dp);
557 /* page and lineoff now reference the physically next tid */
558
559 linesleft = lines - lineoff + 1;
560 }
561 else if (backward)
562 {
563 /* backward parallel scan not supported */
564 Assert(scan->rs_base.rs_parallel == NULL);
565
566 if (!scan->rs_inited)
567 {
568 /*
569 * return null immediately if relation is empty
570 */
571 if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
572 {
573 Assert(!BufferIsValid(scan->rs_cbuf));
574 tuple->t_data = NULL;
575 return;
576 }
577
578 /*
579 * Disable reporting to syncscan logic in a backwards scan; it's
580 * not very likely anyone else is doing the same thing at the same
581 * time, and much more likely that we'll just bollix things for
582 * forward scanners.
583 */
584 scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
585 /* start from last page of the scan */
586 if (scan->rs_startblock > 0)
587 page = scan->rs_startblock - 1;
588 else
589 page = scan->rs_nblocks - 1;
590 heapgetpage((TableScanDesc) scan, page);
591 }
592 else
593 {
594 /* continue from previously returned page/tuple */
595 page = scan->rs_cblock; /* current page */
596 }
597
598 LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
599
600 dp = BufferGetPage(scan->rs_cbuf);
601 TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, dp);
602 lines = PageGetMaxOffsetNumber(dp);
603
604 if (!scan->rs_inited)
605 {
606 lineoff = lines; /* final offnum */
607 scan->rs_inited = true;
608 }
609 else
610 {
611 lineoff = /* previous offnum */
612 OffsetNumberPrev(ItemPointerGetOffsetNumber(&(tuple->t_self)));
613 }
614 /* page and lineoff now reference the physically previous tid */
615
616 linesleft = lineoff;
617 }
618 else
619 {
620 /*
621 * ``no movement'' scan direction: refetch prior tuple
622 */
623 if (!scan->rs_inited)
624 {
625 Assert(!BufferIsValid(scan->rs_cbuf));
626 tuple->t_data = NULL;
627 return;
628 }
629
630 page = ItemPointerGetBlockNumber(&(tuple->t_self));
631 if (page != scan->rs_cblock)
632 heapgetpage((TableScanDesc) scan, page);
633
634 /* Since the tuple was previously fetched, needn't lock page here */
635 dp = BufferGetPage(scan->rs_cbuf);
636 TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, dp);
637 lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self));
638 lpp = PageGetItemId(dp, lineoff);
639 Assert(ItemIdIsNormal(lpp));
640
641 tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
642 tuple->t_len = ItemIdGetLength(lpp);
643
644 return;
645 }
646
647 /*
648 * advance the scan until we find a qualifying tuple or run out of stuff
649 * to scan
650 */
651 lpp = PageGetItemId(dp, lineoff);
652 for (;;)
653 {
654 while (linesleft > 0)
655 {
656 if (ItemIdIsNormal(lpp))
657 {
658 bool valid;
659
660 tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
661 tuple->t_len = ItemIdGetLength(lpp);
662 ItemPointerSet(&(tuple->t_self), page, lineoff);
663
664 /*
665 * if current tuple qualifies, return it.
666 */
667 valid = HeapTupleSatisfiesVisibility(tuple,
668 snapshot,
669 scan->rs_cbuf);
670
671 CheckForSerializableConflictOut(valid, scan->rs_base.rs_rd,
672 tuple, scan->rs_cbuf,
673 snapshot);
674
675 if (valid && key != NULL)
676 HeapKeyTest(tuple, RelationGetDescr(scan->rs_base.rs_rd),
677 nkeys, key, valid);
678
679 if (valid)
680 {
681 LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
682 return;
683 }
684 }
685
686 /*
687 * otherwise move to the next item on the page
688 */
689 --linesleft;
690 if (backward)
691 {
692 --lpp; /* move back in this page's ItemId array */
693 --lineoff;
694 }
695 else
696 {
697 ++lpp; /* move forward in this page's ItemId array */
698 ++lineoff;
699 }
700 }
701
702 /*
703 * if we get here, it means we've exhausted the items on this page and
704 * it's time to move to the next.
705 */
706 LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
707
708 /*
709 * advance to next/prior page and detect end of scan
710 */
711 if (backward)
712 {
713 finished = (page == scan->rs_startblock) ||
714 (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
715 if (page == 0)
716 page = scan->rs_nblocks;
717 page--;
718 }
719 else if (scan->rs_base.rs_parallel != NULL)
720 {
721 ParallelBlockTableScanDesc pbscan =
722 (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
723
724 page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
725 pbscan);
726 finished = (page == InvalidBlockNumber);
727 }
728 else
729 {
730 page++;
731 if (page >= scan->rs_nblocks)
732 page = 0;
733 finished = (page == scan->rs_startblock) ||
734 (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
735
736 /*
737 * Report our new scan position for synchronization purposes. We
738 * don't do that when moving backwards, however. That would just
739 * mess up any other forward-moving scanners.
740 *
741 * Note: we do this before checking for end of scan so that the
742 * final state of the position hint is back at the start of the
743 * rel. That's not strictly necessary, but otherwise when you run
744 * the same query multiple times the starting position would shift
745 * a little bit backwards on every invocation, which is confusing.
746 * We don't guarantee any specific ordering in general, though.
747 */
748 if (scan->rs_base.rs_flags & SO_ALLOW_SYNC)
749 ss_report_location(scan->rs_base.rs_rd, page);
750 }
751
752 /*
753 * return NULL if we've exhausted all the pages
754 */
755 if (finished)
756 {
757 if (BufferIsValid(scan->rs_cbuf))
758 ReleaseBuffer(scan->rs_cbuf);
759 scan->rs_cbuf = InvalidBuffer;
760 scan->rs_cblock = InvalidBlockNumber;
761 tuple->t_data = NULL;
762 scan->rs_inited = false;
763 return;
764 }
765
766 heapgetpage((TableScanDesc) scan, page);
767
768 LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
769
770 dp = BufferGetPage(scan->rs_cbuf);
771 TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, dp);
772 lines = PageGetMaxOffsetNumber((Page) dp);
773 linesleft = lines;
774 if (backward)
775 {
776 lineoff = lines;
777 lpp = PageGetItemId(dp, lines);
778 }
779 else
780 {
781 lineoff = FirstOffsetNumber;
782 lpp = PageGetItemId(dp, FirstOffsetNumber);
783 }
784 }
785}
786
787/* ----------------
788 * heapgettup_pagemode - fetch next heap tuple in page-at-a-time mode
789 *
790 * Same API as heapgettup, but used in page-at-a-time mode
791 *
792 * The internal logic is much the same as heapgettup's too, but there are some
793 * differences: we do not take the buffer content lock (that only needs to
794 * happen inside heapgetpage), and we iterate through just the tuples listed
795 * in rs_vistuples[] rather than all tuples on the page. Notice that
796 * lineindex is 0-based, where the corresponding loop variable lineoff in
797 * heapgettup is 1-based.
798 * ----------------
799 */
800static void
801heapgettup_pagemode(HeapScanDesc scan,
802 ScanDirection dir,
803 int nkeys,
804 ScanKey key)
805{
806 HeapTuple tuple = &(scan->rs_ctup);
807 bool backward = ScanDirectionIsBackward(dir);
808 BlockNumber page;
809 bool finished;
810 Page dp;
811 int lines;
812 int lineindex;
813 OffsetNumber lineoff;
814 int linesleft;
815 ItemId lpp;
816
817 /*
818 * calculate next starting lineindex, given scan direction
819 */
820 if (ScanDirectionIsForward(dir))
821 {
822 if (!scan->rs_inited)
823 {
824 /*
825 * return null immediately if relation is empty
826 */
827 if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
828 {
829 Assert(!BufferIsValid(scan->rs_cbuf));
830 tuple->t_data = NULL;
831 return;
832 }
833 if (scan->rs_base.rs_parallel != NULL)
834 {
835 ParallelBlockTableScanDesc pbscan =
836 (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
837
838 table_block_parallelscan_startblock_init(scan->rs_base.rs_rd,
839 pbscan);
840
841 page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
842 pbscan);
843
844 /* Other processes might have already finished the scan. */
845 if (page == InvalidBlockNumber)
846 {
847 Assert(!BufferIsValid(scan->rs_cbuf));
848 tuple->t_data = NULL;
849 return;
850 }
851 }
852 else
853 page = scan->rs_startblock; /* first page */
854 heapgetpage((TableScanDesc) scan, page);
855 lineindex = 0;
856 scan->rs_inited = true;
857 }
858 else
859 {
860 /* continue from previously returned page/tuple */
861 page = scan->rs_cblock; /* current page */
862 lineindex = scan->rs_cindex + 1;
863 }
864
865 dp = BufferGetPage(scan->rs_cbuf);
866 TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, dp);
867 lines = scan->rs_ntuples;
868 /* page and lineindex now reference the next visible tid */
869
870 linesleft = lines - lineindex;
871 }
872 else if (backward)
873 {
874 /* backward parallel scan not supported */
875 Assert(scan->rs_base.rs_parallel == NULL);
876
877 if (!scan->rs_inited)
878 {
879 /*
880 * return null immediately if relation is empty
881 */
882 if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
883 {
884 Assert(!BufferIsValid(scan->rs_cbuf));
885 tuple->t_data = NULL;
886 return;
887 }
888
889 /*
890 * Disable reporting to syncscan logic in a backwards scan; it's
891 * not very likely anyone else is doing the same thing at the same
892 * time, and much more likely that we'll just bollix things for
893 * forward scanners.
894 */
895 scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
896 /* start from last page of the scan */
897 if (scan->rs_startblock > 0)
898 page = scan->rs_startblock - 1;
899 else
900 page = scan->rs_nblocks - 1;
901 heapgetpage((TableScanDesc) scan, page);
902 }
903 else
904 {
905 /* continue from previously returned page/tuple */
906 page = scan->rs_cblock; /* current page */
907 }
908
909 dp = BufferGetPage(scan->rs_cbuf);
910 TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, dp);
911 lines = scan->rs_ntuples;
912
913 if (!scan->rs_inited)
914 {
915 lineindex = lines - 1;
916 scan->rs_inited = true;
917 }
918 else
919 {
920 lineindex = scan->rs_cindex - 1;
921 }
922 /* page and lineindex now reference the previous visible tid */
923
924 linesleft = lineindex + 1;
925 }
926 else
927 {
928 /*
929 * ``no movement'' scan direction: refetch prior tuple
930 */
931 if (!scan->rs_inited)
932 {
933 Assert(!BufferIsValid(scan->rs_cbuf));
934 tuple->t_data = NULL;
935 return;
936 }
937
938 page = ItemPointerGetBlockNumber(&(tuple->t_self));
939 if (page != scan->rs_cblock)
940 heapgetpage((TableScanDesc) scan, page);
941
942 /* Since the tuple was previously fetched, needn't lock page here */
943 dp = BufferGetPage(scan->rs_cbuf);
944 TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, dp);
945 lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self));
946 lpp = PageGetItemId(dp, lineoff);
947 Assert(ItemIdIsNormal(lpp));
948
949 tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
950 tuple->t_len = ItemIdGetLength(lpp);
951
952 /* check that rs_cindex is in sync */
953 Assert(scan->rs_cindex < scan->rs_ntuples);
954 Assert(lineoff == scan->rs_vistuples[scan->rs_cindex]);
955
956 return;
957 }
958
959 /*
960 * advance the scan until we find a qualifying tuple or run out of stuff
961 * to scan
962 */
963 for (;;)
964 {
965 while (linesleft > 0)
966 {
967 lineoff = scan->rs_vistuples[lineindex];
968 lpp = PageGetItemId(dp, lineoff);
969 Assert(ItemIdIsNormal(lpp));
970
971 tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
972 tuple->t_len = ItemIdGetLength(lpp);
973 ItemPointerSet(&(tuple->t_self), page, lineoff);
974
975 /*
976 * if current tuple qualifies, return it.
977 */
978 if (key != NULL)
979 {
980 bool valid;
981
982 HeapKeyTest(tuple, RelationGetDescr(scan->rs_base.rs_rd),
983 nkeys, key, valid);
984 if (valid)
985 {
986 scan->rs_cindex = lineindex;
987 return;
988 }
989 }
990 else
991 {
992 scan->rs_cindex = lineindex;
993 return;
994 }
995
996 /*
997 * otherwise move to the next item on the page
998 */
999 --linesleft;
1000 if (backward)
1001 --lineindex;
1002 else
1003 ++lineindex;
1004 }
1005
1006 /*
1007 * if we get here, it means we've exhausted the items on this page and
1008 * it's time to move to the next.
1009 */
1010 if (backward)
1011 {
1012 finished = (page == scan->rs_startblock) ||
1013 (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
1014 if (page == 0)
1015 page = scan->rs_nblocks;
1016 page--;
1017 }
1018 else if (scan->rs_base.rs_parallel != NULL)
1019 {
1020 ParallelBlockTableScanDesc pbscan =
1021 (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
1022
1023 page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
1024 pbscan);
1025 finished = (page == InvalidBlockNumber);
1026 }
1027 else
1028 {
1029 page++;
1030 if (page >= scan->rs_nblocks)
1031 page = 0;
1032 finished = (page == scan->rs_startblock) ||
1033 (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
1034
1035 /*
1036 * Report our new scan position for synchronization purposes. We
1037 * don't do that when moving backwards, however. That would just
1038 * mess up any other forward-moving scanners.
1039 *
1040 * Note: we do this before checking for end of scan so that the
1041 * final state of the position hint is back at the start of the
1042 * rel. That's not strictly necessary, but otherwise when you run
1043 * the same query multiple times the starting position would shift
1044 * a little bit backwards on every invocation, which is confusing.
1045 * We don't guarantee any specific ordering in general, though.
1046 */
1047 if (scan->rs_base.rs_flags & SO_ALLOW_SYNC)
1048 ss_report_location(scan->rs_base.rs_rd, page);
1049 }
1050
1051 /*
1052 * return NULL if we've exhausted all the pages
1053 */
1054 if (finished)
1055 {
1056 if (BufferIsValid(scan->rs_cbuf))
1057 ReleaseBuffer(scan->rs_cbuf);
1058 scan->rs_cbuf = InvalidBuffer;
1059 scan->rs_cblock = InvalidBlockNumber;
1060 tuple->t_data = NULL;
1061 scan->rs_inited = false;
1062 return;
1063 }
1064
1065 heapgetpage((TableScanDesc) scan, page);
1066
1067 dp = BufferGetPage(scan->rs_cbuf);
1068 TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, dp);
1069 lines = scan->rs_ntuples;
1070 linesleft = lines;
1071 if (backward)
1072 lineindex = lines - 1;
1073 else
1074 lineindex = 0;
1075 }
1076}
1077
1078
1079#if defined(DISABLE_COMPLEX_MACRO)
1080/*
1081 * This is formatted so oddly so that the correspondence to the macro
1082 * definition in access/htup_details.h is maintained.
1083 */
1084Datum
1085fastgetattr(HeapTuple tup, int attnum, TupleDesc tupleDesc,
1086 bool *isnull)
1087{
1088 return (
1089 (attnum) > 0 ?
1090 (
1091 (*(isnull) = false),
1092 HeapTupleNoNulls(tup) ?
1093 (
1094 TupleDescAttr((tupleDesc), (attnum) - 1)->attcacheoff >= 0 ?
1095 (
1096 fetchatt(TupleDescAttr((tupleDesc), (attnum) - 1),
1097 (char *) (tup)->t_data + (tup)->t_data->t_hoff +
1098 TupleDescAttr((tupleDesc), (attnum) - 1)->attcacheoff)
1099 )
1100 :
1101 nocachegetattr((tup), (attnum), (tupleDesc))
1102 )
1103 :
1104 (
1105 att_isnull((attnum) - 1, (tup)->t_data->t_bits) ?
1106 (
1107 (*(isnull) = true),
1108 (Datum) NULL
1109 )
1110 :
1111 (
1112 nocachegetattr((tup), (attnum), (tupleDesc))
1113 )
1114 )
1115 )
1116 :
1117 (
1118 (Datum) NULL
1119 )
1120 );
1121}
1122#endif /* defined(DISABLE_COMPLEX_MACRO) */
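/*
 * Illustrative call of fastgetattr() (a sketch; "tuple" and "rel" are
 * assumed to be a valid HeapTuple and its Relation):
 *
 *		bool		isnull;
 *		Datum		value;
 *
 *		value = fastgetattr(tuple, 1, RelationGetDescr(rel), &isnull);
 *
 * fetches the first attribute of the tuple, setting isnull if it is NULL.
 */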
1123
1124
1125/* ----------------------------------------------------------------
1126 * heap access method interface
1127 * ----------------------------------------------------------------
1128 */
1129
1130
1131TableScanDesc
1132heap_beginscan(Relation relation, Snapshot snapshot,
1133 int nkeys, ScanKey key,
1134 ParallelTableScanDesc parallel_scan,
1135 uint32 flags)
1136{
1137 HeapScanDesc scan;
1138
1139 /*
1140 * increment relation ref count while scanning relation
1141 *
1142 * This is just to make really sure the relcache entry won't go away while
1143 * the scan has a pointer to it. Caller should be holding the rel open
1144 * anyway, so this is redundant in all normal scenarios...
1145 */
1146 RelationIncrementReferenceCount(relation);
1147
1148 /*
1149 * allocate and initialize scan descriptor
1150 */
1151 scan = (HeapScanDesc) palloc(sizeof(HeapScanDescData));
1152
1153 scan->rs_base.rs_rd = relation;
1154 scan->rs_base.rs_snapshot = snapshot;
1155 scan->rs_base.rs_nkeys = nkeys;
1156 scan->rs_base.rs_flags = flags;
1157 scan->rs_base.rs_parallel = parallel_scan;
1158 scan->rs_strategy = NULL; /* set in initscan */
1159
1160 /*
1161 * Disable page-at-a-time mode if it's not an MVCC-safe snapshot.
1162 */
1163 if (!(snapshot && IsMVCCSnapshot(snapshot)))
1164 scan->rs_base.rs_flags &= ~SO_ALLOW_PAGEMODE;
1165
1166 /*
1167 * For seqscan and sample scans in a serializable transaction, acquire a
1168 * predicate lock on the entire relation. This is required not only to
1169 * lock all the matching tuples, but also to conflict with new insertions
1170 * into the table. In an indexscan, we take page locks on the index pages
1171 * covering the range specified in the scan qual, but in a heap scan there
1172 * is nothing more fine-grained to lock. A bitmap scan is a different
1173 * story, there we have already scanned the index and locked the index
1174 * pages covering the predicate. But in that case we still have to lock
1175 * any matching heap tuples. For sample scan we could optimize the locking
1176 * to be at least page-level granularity, but we'd need to add per-tuple
1177 * locking for that.
1178 */
1179 if (scan->rs_base.rs_flags & (SO_TYPE_SEQSCAN | SO_TYPE_SAMPLESCAN))
1180 {
1181 /*
1182 * Ensure a missing snapshot is noticed reliably, even if the
1183 * isolation mode means predicate locking isn't performed (and
1184 * therefore the snapshot isn't used here).
1185 */
1186 Assert(snapshot);
1187 PredicateLockRelation(relation, snapshot);
1188 }
1189
1190 /* we only need to set this up once */
1191 scan->rs_ctup.t_tableOid = RelationGetRelid(relation);
1192
1193 /*
1194 * we do this here instead of in initscan() because heap_rescan also calls
1195 * initscan() and we don't want to allocate memory again
1196 */
1197 if (nkeys > 0)
1198 scan->rs_base.rs_key = (ScanKey) palloc(sizeof(ScanKeyData) * nkeys);
1199 else
1200 scan->rs_base.rs_key = NULL;
1201
1202 initscan(scan, key, false);
1203
1204 return (TableScanDesc) scan;
1205}
1206
1207void
1208heap_rescan(TableScanDesc sscan, ScanKey key, bool set_params,
1209 bool allow_strat, bool allow_sync, bool allow_pagemode)
1210{
1211 HeapScanDesc scan = (HeapScanDesc) sscan;
1212
1213 if (set_params)
1214 {
1215 if (allow_strat)
1216 scan->rs_base.rs_flags |= SO_ALLOW_STRAT;
1217 else
1218 scan->rs_base.rs_flags &= ~SO_ALLOW_STRAT;
1219
1220 if (allow_sync)
1221 scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
1222 else
1223 scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
1224
1225 if (allow_pagemode && scan->rs_base.rs_snapshot &&
1226 IsMVCCSnapshot(scan->rs_base.rs_snapshot))
1227 scan->rs_base.rs_flags |= SO_ALLOW_PAGEMODE;
1228 else
1229 scan->rs_base.rs_flags &= ~SO_ALLOW_PAGEMODE;
1230 }
1231
1232 /*
1233 * unpin scan buffers
1234 */
1235 if (BufferIsValid(scan->rs_cbuf))
1236 ReleaseBuffer(scan->rs_cbuf);
1237
1238 /*
1239 * reinitialize scan descriptor
1240 */
1241 initscan(scan, key, true);
1242}
1243
1244void
1245heap_endscan(TableScanDesc sscan)
1246{
1247 HeapScanDesc scan = (HeapScanDesc) sscan;
1248
1249 /* Note: no locking manipulations needed */
1250
1251 /*
1252 * unpin scan buffers
1253 */
1254 if (BufferIsValid(scan->rs_cbuf))
1255 ReleaseBuffer(scan->rs_cbuf);
1256
1257 /*
1258 * decrement relation reference count and free scan descriptor storage
1259 */
1260 RelationDecrementReferenceCount(scan->rs_base.rs_rd);
1261
1262 if (scan->rs_base.rs_key)
1263 pfree(scan->rs_base.rs_key);
1264
1265 if (scan->rs_strategy != NULL)
1266 FreeAccessStrategy(scan->rs_strategy);
1267
1268 if (scan->rs_base.rs_flags & SO_TEMP_SNAPSHOT)
1269 UnregisterSnapshot(scan->rs_base.rs_snapshot);
1270
1271 pfree(scan);
1272}
1273
1274#ifdef HEAPDEBUGALL
1275#define HEAPDEBUG_1 \
1276 elog(DEBUG2, "heap_getnext([%s,nkeys=%d],dir=%d) called", \
1277 RelationGetRelationName(scan->rs_rd), scan->rs_nkeys, (int) direction)
1278#define HEAPDEBUG_2 \
1279 elog(DEBUG2, "heap_getnext returning EOS")
1280#define HEAPDEBUG_3 \
1281 elog(DEBUG2, "heap_getnext returning tuple")
1282#else
1283#define HEAPDEBUG_1
1284#define HEAPDEBUG_2
1285#define HEAPDEBUG_3
1286#endif /* !defined(HEAPDEBUGALL) */
1287
1288
1289HeapTuple
1290heap_getnext(TableScanDesc sscan, ScanDirection direction)
1291{
1292 HeapScanDesc scan = (HeapScanDesc) sscan;
1293
1294 /*
1295 * This is still widely used directly, without going through table AM, so
1296 * add a safety check. It's possible we should, at a later point,
1297 * downgrade this to an assert. The reason for checking the AM routine,
1298 * rather than the AM oid, is that this allows writing regression tests
1299 * that create another AM reusing the heap handler.
1300 */
1301 if (unlikely(sscan->rs_rd->rd_tableam != GetHeapamTableAmRoutine()))
1302 ereport(ERROR,
1303 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1304 errmsg_internal("only heap AM is supported")));
1305
1306 /* Note: no locking manipulations needed */
1307
1308 HEAPDEBUG_1; /* heap_getnext( info ) */
1309
1310 if (scan->rs_base.rs_flags & SO_ALLOW_PAGEMODE)
1311 heapgettup_pagemode(scan, direction,
1312 scan->rs_base.rs_nkeys, scan->rs_base.rs_key);
1313 else
1314 heapgettup(scan, direction,
1315 scan->rs_base.rs_nkeys, scan->rs_base.rs_key);
1316
1317 if (scan->rs_ctup.t_data == NULL)
1318 {
1319 HEAPDEBUG_2; /* heap_getnext returning EOS */
1320 return NULL;
1321 }
1322
1323 /*
1324 * if we get here it means we have a new current scan tuple, so point to
1325 * the proper return buffer and return the tuple.
1326 */
1327 HEAPDEBUG_3; /* heap_getnext returning tuple */
1328
1329 pgstat_count_heap_getnext(scan->rs_base.rs_rd);
1330
1331 return &scan->rs_ctup;
1332}
1333
1334#ifdef HEAPAMSLOTDEBUGALL
1335#define HEAPAMSLOTDEBUG_1 \
1336 elog(DEBUG2, "heapam_getnextslot([%s,nkeys=%d],dir=%d) called", \
1337 RelationGetRelationName(scan->rs_base.rs_rd), scan->rs_base.rs_nkeys, (int) direction)
1338#define HEAPAMSLOTDEBUG_2 \
1339 elog(DEBUG2, "heapam_getnextslot returning EOS")
1340#define HEAPAMSLOTDEBUG_3 \
1341 elog(DEBUG2, "heapam_getnextslot returning tuple")
1342#else
1343#define HEAPAMSLOTDEBUG_1
1344#define HEAPAMSLOTDEBUG_2
1345#define HEAPAMSLOTDEBUG_3
1346#endif
1347
1348bool
1349heap_getnextslot(TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot)
1350{
1351 HeapScanDesc scan = (HeapScanDesc) sscan;
1352
1353 /* Note: no locking manipulations needed */
1354
1355 HEAPAMSLOTDEBUG_1; /* heap_getnextslot( info ) */
1356
1357 if (sscan->rs_flags & SO_ALLOW_PAGEMODE)
1358 heapgettup_pagemode(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1359 else
1360 heapgettup(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1361
1362 if (scan->rs_ctup.t_data == NULL)
1363 {
1364 HEAPAMSLOTDEBUG_2; /* heap_getnextslot returning EOS */
1365 ExecClearTuple(slot);
1366 return false;
1367 }
1368
1369 /*
1370 * if we get here it means we have a new current scan tuple, so point to
1371 * the proper return buffer and return the tuple.
1372 */
1373 HEAPAMSLOTDEBUG_3; /* heap_getnextslot returning tuple */
1374
1375 pgstat_count_heap_getnext(scan->rs_base.rs_rd);
1376
1377 ExecStoreBufferHeapTuple(&scan->rs_ctup, slot,
1378 scan->rs_cbuf);
1379 return true;
1380}
1381
1382/*
1383 * heap_fetch - retrieve tuple with given tid
1384 *
1385 * On entry, tuple->t_self is the TID to fetch. We pin the buffer holding
1386 * the tuple, fill in the remaining fields of *tuple, and check the tuple
1387 * against the specified snapshot.
1388 *
1389 * If successful (tuple found and passes snapshot time qual), then *userbuf
1390 * is set to the buffer holding the tuple and true is returned. The caller
1391 * must unpin the buffer when done with the tuple.
1392 *
1393 * If the tuple is not found (ie, item number references a deleted slot),
1394 * then tuple->t_data is set to NULL and false is returned.
1395 *
1396 * If the tuple is found but fails the time qual check, then false is returned
1397 * but tuple->t_data is left pointing to the tuple.
1398 *
1399 * heap_fetch does not follow HOT chains: only the exact TID requested will
1400 * be fetched.
1401 *
1402 * It is somewhat inconsistent that we ereport() on invalid block number but
1403 * return false on invalid item number. There are a couple of reasons though.
1404 * One is that the caller can relatively easily check the block number for
1405 * validity, but cannot check the item number without reading the page
1406 * himself. Another is that when we are following a t_ctid link, we can be
1407 * reasonably confident that the page number is valid (since VACUUM shouldn't
1408 * truncate off the destination page without having killed the referencing
1409 * tuple first), but the item number might well not be good.
1410 */
1411bool
1412heap_fetch(Relation relation,
1413 Snapshot snapshot,
1414 HeapTuple tuple,
1415 Buffer *userbuf)
1416{
1417 ItemPointer tid = &(tuple->t_self);
1418 ItemId lp;
1419 Buffer buffer;
1420 Page page;
1421 OffsetNumber offnum;
1422 bool valid;
1423
1424 /*
1425 * Fetch and pin the appropriate page of the relation.
1426 */
1427 buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
1428
1429 /*
1430 * Need share lock on buffer to examine tuple commit status.
1431 */
1432 LockBuffer(buffer, BUFFER_LOCK_SHARE);
1433 page = BufferGetPage(buffer);
1434 TestForOldSnapshot(snapshot, relation, page);
1435
1436 /*
1437 * We'd better check for an out-of-range offnum, in case VACUUM has run
1438 * since the TID was obtained.
1439 */
1440 offnum = ItemPointerGetOffsetNumber(tid);
1441 if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page))
1442 {
1443 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1444 ReleaseBuffer(buffer);
1445 *userbuf = InvalidBuffer;
1446 tuple->t_data = NULL;
1447 return false;
1448 }
1449
1450 /*
1451 * get the item line pointer corresponding to the requested tid
1452 */
1453 lp = PageGetItemId(page, offnum);
1454
1455 /*
1456 * Must check for deleted tuple.
1457 */
1458 if (!ItemIdIsNormal(lp))
1459 {
1460 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1461 ReleaseBuffer(buffer);
1462 *userbuf = InvalidBuffer;
1463 tuple->t_data = NULL;
1464 return false;
1465 }
1466
1467 /*
1468 * fill in *tuple fields
1469 */
1470 tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
1471 tuple->t_len = ItemIdGetLength(lp);
1472 tuple->t_tableOid = RelationGetRelid(relation);
1473
1474 /*
1475 * check tuple visibility, then release lock
1476 */
1477 valid = HeapTupleSatisfiesVisibility(tuple, snapshot, buffer);
1478
1479 if (valid)
1480 PredicateLockTuple(relation, tuple, snapshot);
1481
1482 CheckForSerializableConflictOut(valid, relation, tuple, buffer, snapshot);
1483
1484 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1485
1486 if (valid)
1487 {
1488 /*
1489 * All checks passed, so return the tuple as valid. Caller is now
1490 * responsible for releasing the buffer.
1491 */
1492 *userbuf = buffer;
1493
1494 return true;
1495 }
1496
1497 /* Tuple failed time qual */
1498 ReleaseBuffer(buffer);
1499 *userbuf = InvalidBuffer;
1500
1501 return false;
1502}
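/*
 * Illustrative caller pattern for heap_fetch (a sketch; error handling is
 * omitted and "relation", "snapshot", and "tid" are assumed valid):
 *
 *		HeapTupleData tuple;
 *		Buffer		buffer;
 *
 *		tuple.t_self = *tid;
 *		if (heap_fetch(relation, snapshot, &tuple, &buffer))
 *		{
 *			... use tuple.t_data while the buffer pin is held ...
 *			ReleaseBuffer(buffer);
 *		}
 */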
1503
1504/*
1505 * heap_hot_search_buffer - search HOT chain for tuple satisfying snapshot
1506 *
1507 * On entry, *tid is the TID of a tuple (either a simple tuple, or the root
1508 * of a HOT chain), and buffer is the buffer holding this tuple. We search
1509 * for the first chain member satisfying the given snapshot. If one is
1510 * found, we update *tid to reference that tuple's offset number, and
1511 * return true. If no match, return false without modifying *tid.
1512 *
1513 * heapTuple is a caller-supplied buffer. When a match is found, we return
1514 * the tuple here, in addition to updating *tid. If no match is found, the
1515 * contents of this buffer on return are undefined.
1516 *
1517 * If all_dead is not NULL, we check non-visible tuples to see if they are
1518 * globally dead; *all_dead is set true if all members of the HOT chain
1519 * are vacuumable, false if not.
1520 *
1521 * Unlike heap_fetch, the caller must already have pin and (at least) share
1522 * lock on the buffer; it is still pinned/locked at exit. Also unlike
1523 * heap_fetch, we do not report any pgstats count; caller may do so if wanted.
1524 */
1525bool
1526heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
1527 Snapshot snapshot, HeapTuple heapTuple,
1528 bool *all_dead, bool first_call)
1529{
1530 Page dp = (Page) BufferGetPage(buffer);
1531 TransactionId prev_xmax = InvalidTransactionId;
1532 BlockNumber blkno;
1533 OffsetNumber offnum;
1534 bool at_chain_start;
1535 bool valid;
1536 bool skip;
1537
1538 /* If this is not the first call, previous call returned a (live!) tuple */
1539 if (all_dead)
1540 *all_dead = first_call;
1541
1542 blkno = ItemPointerGetBlockNumber(tid);
1543 offnum = ItemPointerGetOffsetNumber(tid);
1544 at_chain_start = first_call;
1545 skip = !first_call;
1546
1547 Assert(TransactionIdIsValid(RecentGlobalXmin));
1548 Assert(BufferGetBlockNumber(buffer) == blkno);
1549
1550 /* Scan through possible multiple members of HOT-chain */
1551 for (;;)
1552 {
1553 ItemId lp;
1554
1555 /* check for bogus TID */
1556 if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(dp))
1557 break;
1558
1559 lp = PageGetItemId(dp, offnum);
1560
1561 /* check for unused, dead, or redirected items */
1562 if (!ItemIdIsNormal(lp))
1563 {
1564 /* We should only see a redirect at start of chain */
1565 if (ItemIdIsRedirected(lp) && at_chain_start)
1566 {
1567 /* Follow the redirect */
1568 offnum = ItemIdGetRedirect(lp);
1569 at_chain_start = false;
1570 continue;
1571 }
1572 /* else must be end of chain */
1573 break;
1574 }
1575
1576 /*
1577 * Update heapTuple to point to the element of the HOT chain we're
1578 * currently investigating. Having t_self set correctly is important
1579 * because the SSI checks and the *Satisfies routine for historical
1580 * MVCC snapshots need the correct tid to decide about the visibility.
1581 */
1582 heapTuple->t_data = (HeapTupleHeader) PageGetItem(dp, lp);
1583 heapTuple->t_len = ItemIdGetLength(lp);
1584 heapTuple->t_tableOid = RelationGetRelid(relation);
1585 ItemPointerSet(&heapTuple->t_self, blkno, offnum);
1586
1587 /*
1588 * Shouldn't see a HEAP_ONLY tuple at chain start.
1589 */
1590 if (at_chain_start && HeapTupleIsHeapOnly(heapTuple))
1591 break;
1592
1593 /*
1594 * The xmin should match the previous xmax value, else chain is
1595 * broken.
1596 */
1597 if (TransactionIdIsValid(prev_xmax) &&
1598 !TransactionIdEquals(prev_xmax,
1599 HeapTupleHeaderGetXmin(heapTuple->t_data)))
1600 break;
1601
1602 /*
1603 * When first_call is true (and thus, skip is initially false) we'll
1604 * return the first tuple we find. But on later passes, heapTuple
1605 * will initially be pointing to the tuple we returned last time.
1606 * Returning it again would be incorrect (and would loop forever), so
1607 * we skip it and return the next match we find.
1608 */
1609 if (!skip)
1610 {
1611 /* If it's visible per the snapshot, we must return it */
1612 valid = HeapTupleSatisfiesVisibility(heapTuple, snapshot, buffer);
1613 CheckForSerializableConflictOut(valid, relation, heapTuple,
1614 buffer, snapshot);
1615
1616 if (valid)
1617 {
1618 ItemPointerSetOffsetNumber(tid, offnum);
1619 PredicateLockTuple(relation, heapTuple, snapshot);
1620 if (all_dead)
1621 *all_dead = false;
1622 return true;
1623 }
1624 }
1625 skip = false;
1626
1627 /*
1628 * If we can't see it, maybe no one else can either. At caller
1629 * request, check whether all chain members are dead to all
1630 * transactions.
1631 *
1632 * Note: if you change the criterion here for what is "dead", fix the
1633 * planner's get_actual_variable_range() function to match.
1634 */
1635 if (all_dead && *all_dead &&
1636 !HeapTupleIsSurelyDead(heapTuple, RecentGlobalXmin))
1637 *all_dead = false;
1638
1639 /*
1640 * Check to see if HOT chain continues past this tuple; if so fetch
1641 * the next offnum and loop around.
1642 */
1643 if (HeapTupleIsHotUpdated(heapTuple))
1644 {
1645 Assert(ItemPointerGetBlockNumber(&heapTuple->t_data->t_ctid) ==
1646 blkno);
1647 offnum = ItemPointerGetOffsetNumber(&heapTuple->t_data->t_ctid);
1648 at_chain_start = false;
1649 prev_xmax = HeapTupleHeaderGetUpdateXid(heapTuple->t_data);
1650 }
1651 else
1652 break; /* end of chain */
1653 }
1654
1655 return false;
1656}
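/*
 * Illustrative use of heap_hot_search_buffer (a sketch modeled on
 * index-fetch callers; the caller must already hold a pin and at least
 * share lock on "buffer"):
 *
 *		HeapTupleData heapTuple;
 *		bool		all_dead;
 *		bool		found;
 *
 *		found = heap_hot_search_buffer(&tid, relation, buffer, snapshot,
 *									   &heapTuple, &all_dead, true);
 *
 * If found, "tid" now points at the visible chain member and heapTuple holds
 * its contents; passing first_call = false on a later call continues the
 * chain search past the previously returned member.
 */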
1657
1658/*
1659 * heap_get_latest_tid - get the latest tid of a specified tuple
1660 *
1661 * Actually, this gets the latest version that is visible according to the
1662 * scan's snapshot. Create a scan using SnapshotDirty to get the very latest,
1663 * possibly uncommitted version.
1664 *
1665 * *tid is both an input and an output parameter: it is updated to
1666 * show the latest version of the row. Note that it will not be changed
1667 * if no version of the row passes the snapshot test.
1668 */
1669void
1670heap_get_latest_tid(TableScanDesc sscan,
1671 ItemPointer tid)
1672{
1673 Relation relation = sscan->rs_rd;
1674 Snapshot snapshot = sscan->rs_snapshot;
1675 ItemPointerData ctid;
1676 TransactionId priorXmax;
1677
1678 /*
1679 * table_get_latest_tid has verified that the passed-in tid is valid. We
1680 * just assume that t_ctid links are valid, however; there shouldn't be
1681 * invalid ones in the table.
1682 */
1683 Assert(ItemPointerIsValid(tid));
1684
1685 /*
1686 * Loop to chase down t_ctid links. At top of loop, ctid is the tuple we
1687 * need to examine, and *tid is the TID we will return if ctid turns out
1688 * to be bogus.
1689 *
1690 * Note that we will loop until we reach the end of the t_ctid chain.
1691 * Depending on the snapshot passed, there might be at most one visible
1692 * version of the row, but we don't try to optimize for that.
1693 */
1694 ctid = *tid;
1695 priorXmax = InvalidTransactionId; /* cannot check first XMIN */
1696 for (;;)
1697 {
1698 Buffer buffer;
1699 Page page;
1700 OffsetNumber offnum;
1701 ItemId lp;
1702 HeapTupleData tp;
1703 bool valid;
1704
1705 /*
1706 * Read, pin, and lock the page.
1707 */
1708 buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&ctid));
1709 LockBuffer(buffer, BUFFER_LOCK_SHARE);
1710 page = BufferGetPage(buffer);
1711 TestForOldSnapshot(snapshot, relation, page);
1712
1713 /*
1714 * Check for bogus item number. This is not treated as an error
1715 * condition because it can happen while following a t_ctid link. We
1716 * just assume that the prior tid is OK and return it unchanged.
1717 */
1718 offnum = ItemPointerGetOffsetNumber(&ctid);
1719 if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page))
1720 {
1721 UnlockReleaseBuffer(buffer);
1722 break;
1723 }
1724 lp = PageGetItemId(page, offnum);
1725 if (!ItemIdIsNormal(lp))
1726 {
1727 UnlockReleaseBuffer(buffer);
1728 break;
1729 }
1730
1731 /* OK to access the tuple */
1732 tp.t_self = ctid;
1733 tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
1734 tp.t_len = ItemIdGetLength(lp);
1735 tp.t_tableOid = RelationGetRelid(relation);
1736
1737 /*
1738 * After following a t_ctid link, we might arrive at an unrelated
1739 * tuple. Check for XMIN match.
1740 */
1741 if (TransactionIdIsValid(priorXmax) &&
1742 !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(tp.t_data)))
1743 {
1744 UnlockReleaseBuffer(buffer);
1745 break;
1746 }
1747
1748 /*
1749 * Check tuple visibility; if visible, set it as the new result
1750 * candidate.
1751 */
1752 valid = HeapTupleSatisfiesVisibility(&tp, snapshot, buffer);
1753 CheckForSerializableConflictOut(valid, relation, &tp, buffer, snapshot);
1754 if (valid)
1755 *tid = ctid;
1756
1757 /*
1758 * If there's a valid t_ctid link, follow it, else we're done.
1759 */
1760 if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
1761 HeapTupleHeaderIsOnlyLocked(tp.t_data) ||
1762 HeapTupleHeaderIndicatesMovedPartitions(tp.t_data) ||
1763 ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid))
1764 {
1765 UnlockReleaseBuffer(buffer);
1766 break;
1767 }
1768
1769 ctid = tp.t_data->t_ctid;
1770 priorXmax = HeapTupleHeaderGetUpdateXid(tp.t_data);
1771 UnlockReleaseBuffer(buffer);
1772 } /* end of loop */
1773}
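/*
 * Illustrative call (a sketch; "scan" is an open TableScanDesc on the table
 * and "tid" initially holds some visible version's TID):
 *
 *		heap_get_latest_tid(scan, &tid);
 *
 * On return, "tid" has been advanced along the t_ctid chain to the latest
 * version visible to the scan's snapshot, or is unchanged if no later
 * version qualifies.
 */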
1774
1775
1776/*
1777 * UpdateXmaxHintBits - update tuple hint bits after xmax transaction ends
1778 *
1779 * This is called after we have waited for the XMAX transaction to terminate.
1780 * If the transaction aborted, we guarantee the XMAX_INVALID hint bit will
1781 * be set on exit. If the transaction committed, we set the XMAX_COMMITTED
1782 * hint bit if possible --- but beware that that may not yet be possible,
1783 * if the transaction committed asynchronously.
1784 *
1785 * Note that if the transaction was a locker only, we set HEAP_XMAX_INVALID
1786 * even if it commits.
1787 *
1788 * Hence callers should look only at XMAX_INVALID.
1789 *
1790 * Note this is not allowed for tuples whose xmax is a multixact.
1791 */
1792static void
1793UpdateXmaxHintBits(HeapTupleHeader tuple, Buffer buffer, TransactionId xid)
1794{
1795 Assert(TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple), xid));
1796 Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI));
1797
1798 if (!(tuple->t_infomask & (HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID)))
1799 {
1800 if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask) &&
1801 TransactionIdDidCommit(xid))
1802 HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED,
1803 xid);
1804 else
1805 HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
1806 InvalidTransactionId);
1807 }
1808}
1809
1810
1811/*
1812 * GetBulkInsertState - prepare status object for a bulk insert
1813 */
1814BulkInsertState
1815GetBulkInsertState(void)
1816{
1817 BulkInsertState bistate;
1818
1819 bistate = (BulkInsertState) palloc(sizeof(BulkInsertStateData));
1820 bistate->strategy = GetAccessStrategy(BAS_BULKWRITE);
1821 bistate->current_buf = InvalidBuffer;
1822 return bistate;
1823}
1824
1825/*
1826 * FreeBulkInsertState - clean up after finishing a bulk insert
1827 */
1828void
1829FreeBulkInsertState(BulkInsertState bistate)
1830{
1831 if (bistate->current_buf != InvalidBuffer)
1832 ReleaseBuffer(bistate->current_buf);
1833 FreeAccessStrategy(bistate->strategy);
1834 pfree(bistate);
1835}
1836
1837/*
1838 * ReleaseBulkInsertStatePin - release a buffer currently held in bistate
1839 */
1840void
1841ReleaseBulkInsertStatePin(BulkInsertState bistate)
1842{
1843 if (bistate->current_buf != InvalidBuffer)
1844 ReleaseBuffer(bistate->current_buf);
1845 bistate->current_buf = InvalidBuffer;
1846}
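
/*
 * Illustrative sketch (annotation added for this edit, not part of the
 * original file): how the three routines above are typically combined with
 * heap_insert() by a bulk loader such as COPY FROM.  "rel" and the loop that
 * produces tuples are assumed to be supplied by the caller.
 *
 *    BulkInsertState bistate = GetBulkInsertState();
 *    CommandId   cid = GetCurrentCommandId(true);
 *    HeapTuple   tup;
 *
 *    while ((tup = ...next tuple to load...) != NULL)
 *        heap_insert(rel, tup, cid, 0, bistate);
 *    FreeBulkInsertState(bistate);
 */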
1847
1848
1849/*
1850 * heap_insert - insert tuple into a heap
1851 *
1852 * The new tuple is stamped with current transaction ID and the specified
1853 * command ID.
1854 *
1855 * See table_tuple_insert for comments about most of the input flags, except
1856 * that this routine directly takes a tuple rather than a slot.
1857 *
1858 * There are corresponding HEAP_INSERT_ options for all the TABLE_INSERT_
1859 * options, and there is additionally HEAP_INSERT_SPECULATIVE, which is used to
1860 * implement table_tuple_insert_speculative().
1861 *
1862 * On return the header fields of *tup are updated to match the stored tuple;
1863 * in particular tup->t_self receives the actual TID where the tuple was
1864 * stored. But note that any toasting of fields within the tuple data is NOT
1865 * reflected into *tup.
1866 */
1867void
1868heap_insert(Relation relation, HeapTuple tup, CommandId cid,
1869 int options, BulkInsertState bistate)
1870{
1871 TransactionId xid = GetCurrentTransactionId();
1872 HeapTuple heaptup;
1873 Buffer buffer;
1874 Buffer vmbuffer = InvalidBuffer;
1875 bool all_visible_cleared = false;
1876
1877 /*
1878 * Fill in tuple header fields and toast the tuple if necessary.
1879 *
1880 * Note: below this point, heaptup is the data we actually intend to store
1881 * into the relation; tup is the caller's original untoasted data.
1882 */
1883 heaptup = heap_prepare_insert(relation, tup, xid, cid, options);
1884
1885 /*
1886 * Find buffer to insert this tuple into. If the page is all visible,
1887 * this will also pin the requisite visibility map page.
1888 */
1889 buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
1890 InvalidBuffer, options, bistate,
1891 &vmbuffer, NULL);
1892
1893 /*
1894 * We're about to do the actual insert -- but check for conflict first, to
1895 * avoid possibly having to roll back work we've just done.
1896 *
1897 * This is safe without a recheck as long as there is no possibility of
1898 * another process scanning the page between this check and the insert
1899 * being visible to the scan (i.e., an exclusive buffer content lock is
1900 * continuously held from this point until the tuple insert is visible).
1901 *
1902 * For a heap insert, we only need to check for table-level SSI locks. Our
1903 * new tuple can't possibly conflict with existing tuple locks, and heap
1904 * page locks are only consolidated versions of tuple locks; they do not
1905 * lock "gaps" as index page locks do. So we don't need to specify a
1906 * buffer when making the call, which makes for a faster check.
1907 */
1908 CheckForSerializableConflictIn(relation, NULL, InvalidBuffer);
1909
1910 /* NO EREPORT(ERROR) from here till changes are logged */
1911 START_CRIT_SECTION();
1912
1913 RelationPutHeapTuple(relation, buffer, heaptup,
1914 (options & HEAP_INSERT_SPECULATIVE) != 0);
1915
1916 if (PageIsAllVisible(BufferGetPage(buffer)))
1917 {
1918 all_visible_cleared = true;
1919 PageClearAllVisible(BufferGetPage(buffer));
1920 visibilitymap_clear(relation,
1921 ItemPointerGetBlockNumber(&(heaptup->t_self)),
1922 vmbuffer, VISIBILITYMAP_VALID_BITS);
1923 }
1924
1925 /*
1926 * XXX Should we set PageSetPrunable on this page ?
1927 *
1928 * The inserting transaction may eventually abort thus making this tuple
1929 * DEAD and hence available for pruning. Though we don't want to optimize
1930 * for aborts, if no other tuple in this page is UPDATEd/DELETEd, the
1931 * aborted tuple will never be pruned until the next vacuum is triggered.
1932 *
1933 * If you do add PageSetPrunable here, add it in heap_xlog_insert too.
1934 */
1935
1936 MarkBufferDirty(buffer);
1937
1938 /* XLOG stuff */
1939 if (!(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation))
1940 {
1941 xl_heap_insert xlrec;
1942 xl_heap_header xlhdr;
1943 XLogRecPtr recptr;
1944 Page page = BufferGetPage(buffer);
1945 uint8 info = XLOG_HEAP_INSERT;
1946 int bufflags = 0;
1947
1948 /*
1949 * If this is a catalog, logical decoding needs the combocids to
1950 * decode the change properly, so log them as well.
1951 */
1952 if (RelationIsAccessibleInLogicalDecoding(relation))
1953 log_heap_new_cid(relation, heaptup);
1954
1955 /*
1956 * If this is the first and only tuple on the page, we can reinit the
1957 * page instead of restoring the whole thing. Set flag, and hide
1958 * buffer references from XLogInsert.
1959 */
1960 if (ItemPointerGetOffsetNumber(&(heaptup->t_self)) == FirstOffsetNumber &&
1961 PageGetMaxOffsetNumber(page) == FirstOffsetNumber)
1962 {
1963 info |= XLOG_HEAP_INIT_PAGE;
1964 bufflags |= REGBUF_WILL_INIT;
1965 }
1966
1967 xlrec.offnum = ItemPointerGetOffsetNumber(&heaptup->t_self);
1968 xlrec.flags = 0;
1969 if (all_visible_cleared)
1970 xlrec.flags |= XLH_INSERT_ALL_VISIBLE_CLEARED;
1971 if (options & HEAP_INSERT_SPECULATIVE)
1972 xlrec.flags |= XLH_INSERT_IS_SPECULATIVE;
1973 Assert(ItemPointerGetBlockNumber(&heaptup->t_self) == BufferGetBlockNumber(buffer));
1974
1975 /*
1976 * For logical decoding, we need the tuple even if we're doing a full
1977 * page write, so make sure it's included even if we take a full-page
1978 * image. (XXX We could alternatively store a pointer into the FPW).
1979 */
1980 if (RelationIsLogicallyLogged(relation) &&
1981 !(options & HEAP_INSERT_NO_LOGICAL))
1982 {
1983 xlrec.flags |= XLH_INSERT_CONTAINS_NEW_TUPLE;
1984 bufflags |= REGBUF_KEEP_DATA;
1985 }
1986
1987 XLogBeginInsert();
1988 XLogRegisterData((char *) &xlrec, SizeOfHeapInsert);
1989
1990 xlhdr.t_infomask2 = heaptup->t_data->t_infomask2;
1991 xlhdr.t_infomask = heaptup->t_data->t_infomask;
1992 xlhdr.t_hoff = heaptup->t_data->t_hoff;
1993
1994 /*
1995 * note we mark xlhdr as belonging to buffer; if XLogInsert decides to
1996 * write the whole page to the xlog, we don't need to store
1997 * xl_heap_header in the xlog.
1998 */
1999 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags);
2000 XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader);
2001 /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
2002 XLogRegisterBufData(0,
2003 (char *) heaptup->t_data + SizeofHeapTupleHeader,
2004 heaptup->t_len - SizeofHeapTupleHeader);
2005
2006 /* filtering by origin on a row level is much more efficient */
2007 XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
2008
2009 recptr = XLogInsert(RM_HEAP_ID, info);
2010
2011 PageSetLSN(page, recptr);
2012 }
2013
2014 END_CRIT_SECTION();
2015
2016 UnlockReleaseBuffer(buffer);
2017 if (vmbuffer != InvalidBuffer)
2018 ReleaseBuffer(vmbuffer);
2019
2020 /*
2021 * If tuple is cachable, mark it for invalidation from the caches in case
2022 * we abort. Note it is OK to do this after releasing the buffer, because
2023 * the heaptup data structure is all in local memory, not in the shared
2024 * buffer.
2025 */
2026 CacheInvalidateHeapTuple(relation, heaptup, NULL);
2027
2028 /* Note: speculative insertions are counted too, even if aborted later */
2029 pgstat_count_heap_insert(relation, 1);
2030
2031 /*
2032 * If heaptup is a private copy, release it. Don't forget to copy t_self
2033 * back to the caller's image, too.
2034 */
2035 if (heaptup != tup)
2036 {
2037 tup->t_self = heaptup->t_self;
2038 heap_freetuple(heaptup);
2039 }
2040}
2041
2042/*
2043 * Subroutine for heap_insert(). Prepares a tuple for insertion. This sets the
2044 * tuple header fields and toasts the tuple if necessary. Returns a toasted
2045 * version of the tuple if it was toasted, or the original tuple if not. Note
2046 * that in any case, the header fields are also set in the original tuple.
2047 */
2048static HeapTuple
2049heap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid,
2050 CommandId cid, int options)
2051{
2052 /*
2053 * Parallel operations are required to be strictly read-only in a parallel
2054 * worker. Parallel inserts are not safe even in the leader in the
2055 * general case, because group locking means that heavyweight locks for
2056 * relation extension or GIN page locks will not conflict between members
2057 * of a lock group. We don't prohibit that case here, however, because
2058 * there are useful special cases that we can safely allow, such as CREATE TABLE AS.
2059 */
2060 if (IsParallelWorker())
2061 ereport(ERROR,
2062 (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
2063 errmsg("cannot insert tuples in a parallel worker")));
2064
2065 tup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
2066 tup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
2067 tup->t_data->t_infomask |= HEAP_XMAX_INVALID;
2068 HeapTupleHeaderSetXmin(tup->t_data, xid);
2069 if (options & HEAP_INSERT_FROZEN)
2070 HeapTupleHeaderSetXminFrozen(tup->t_data);
2071
2072 HeapTupleHeaderSetCmin(tup->t_data, cid);
2073 HeapTupleHeaderSetXmax(tup->t_data, 0); /* for cleanliness */
2074 tup->t_tableOid = RelationGetRelid(relation);
2075
2076 /*
2077 * If the new tuple is too big for storage or contains already toasted
2078 * out-of-line attributes from some other relation, invoke the toaster.
2079 */
2080 if (relation->rd_rel->relkind != RELKIND_RELATION &&
2081 relation->rd_rel->relkind != RELKIND_MATVIEW)
2082 {
2083 /* toast table entries should never be recursively toasted */
2084 Assert(!HeapTupleHasExternal(tup));
2085 return tup;
2086 }
2087 else if (HeapTupleHasExternal(tup) || tup->t_len > TOAST_TUPLE_THRESHOLD)
2088 return toast_insert_or_update(relation, tup, NULL, options);
2089 else
2090 return tup;
2091}
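
/*
 * Worked example (annotation added for this edit, not part of the original
 * file): with the default 8 kB block size, TOAST_TUPLE_THRESHOLD is roughly
 * 2 kB, so for a plain table or matview a tuple longer than about 2 kB, or
 * one that already carries out-of-line values, goes through
 * toast_insert_or_update() here, while smaller tuples are returned unchanged.
 */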
2092
2093/*
2094 * heap_multi_insert - insert multiple tuples into a heap
2095 *
2096 * This is like heap_insert(), but inserts multiple tuples in one operation.
2097 * That's faster than calling heap_insert() in a loop, because when multiple
2098 * tuples can be inserted on a single page, we can write just a single WAL
2099 * record covering all of them, and only need to lock/unlock the page once.
2100 *
2101 * Note: this leaks memory into the current memory context. You can create a
2102 * temporary context before calling this, if that's a problem.
2103 */
2104void
2105heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples,
2106 CommandId cid, int options, BulkInsertState bistate)
2107{
2108 TransactionId xid = GetCurrentTransactionId();
2109 HeapTuple *heaptuples;
2110 int i;
2111 int ndone;
2112 PGAlignedBlock scratch;
2113 Page page;
2114 bool needwal;
2115 Size saveFreeSpace;
2116 bool need_tuple_data = RelationIsLogicallyLogged(relation);
2117 bool need_cids = RelationIsAccessibleInLogicalDecoding(relation);
2118
2119 /* currently not needed (thus unsupported) for heap_multi_insert() */
2120 AssertArg(!(options & HEAP_INSERT_NO_LOGICAL));
2121
2122 needwal = !(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation);
2123 saveFreeSpace = RelationGetTargetPageFreeSpace(relation,
2124 HEAP_DEFAULT_FILLFACTOR);
2125
2126 /* Toast and set header data in all the slots */
2127 heaptuples = palloc(ntuples * sizeof(HeapTuple));
2128 for (i = 0; i < ntuples; i++)
2129 {
2130 HeapTuple tuple;
2131
2132 tuple = ExecFetchSlotHeapTuple(slots[i], true, NULL);
2133 slots[i]->tts_tableOid = RelationGetRelid(relation);
2134 tuple->t_tableOid = slots[i]->tts_tableOid;
2135 heaptuples[i] = heap_prepare_insert(relation, tuple, xid, cid,
2136 options);
2137 }
2138
2139 /*
2140 * We're about to do the actual inserts -- but check for conflict first,
2141 * to minimize the possibility of having to roll back work we've just
2142 * done.
2143 *
2144 * A check here does not definitively prevent a serialization anomaly;
2145 * that check MUST be done at least past the point of acquiring an
2146 * exclusive buffer content lock on every buffer that will be affected,
2147 * and MAY be done after all inserts are reflected in the buffers and
2148 * those locks are released; otherwise there is a race condition. Since
2149 * multiple buffers can be locked and unlocked in the loop below, and it
2150 * would not be feasible to identify and lock all of those buffers before
2151 * the loop, we must do a final check at the end.
2152 *
2153 * The check here could be omitted with no loss of correctness; it is
2154 * present strictly as an optimization.
2155 *
2156 * For heap inserts, we only need to check for table-level SSI locks. Our
2157 * new tuples can't possibly conflict with existing tuple locks, and heap
2158 * page locks are only consolidated versions of tuple locks; they do not
2159 * lock "gaps" as index page locks do. So we don't need to specify a
2160 * buffer when making the call, which makes for a faster check.
2161 */
2162 CheckForSerializableConflictIn(relation, NULL, InvalidBuffer);
2163
2164 ndone = 0;
2165 while (ndone < ntuples)
2166 {
2167 Buffer buffer;
2168 Buffer vmbuffer = InvalidBuffer;
2169 bool all_visible_cleared = false;
2170 int nthispage;
2171
2172 CHECK_FOR_INTERRUPTS();
2173
2174 /*
2175 * Find buffer where at least the next tuple will fit. If the page is
2176 * all-visible, this will also pin the requisite visibility map page.
2177 */
2178 buffer = RelationGetBufferForTuple(relation, heaptuples[ndone]->t_len,
2179 InvalidBuffer, options, bistate,
2180 &vmbuffer, NULL);
2181 page = BufferGetPage(buffer);
2182
2183 /* NO EREPORT(ERROR) from here till changes are logged */
2184 START_CRIT_SECTION();
2185
2186 /*
2187 * RelationGetBufferForTuple has ensured that the first tuple fits.
2188 * Put that on the page, and then as many other tuples as fit.
2189 */
2190 RelationPutHeapTuple(relation, buffer, heaptuples[ndone], false);
2191 for (nthispage = 1; ndone + nthispage < ntuples; nthispage++)
2192 {
2193 HeapTuple heaptup = heaptuples[ndone + nthispage];
2194
2195 if (PageGetHeapFreeSpace(page) < MAXALIGN(heaptup->t_len) + saveFreeSpace)
2196 break;
2197
2198 RelationPutHeapTuple(relation, buffer, heaptup, false);
2199
2200 /*
2201 * We don't use heap_multi_insert for catalog tuples yet, but
2202 * better be prepared...
2203 */
2204 if (needwal && need_cids)
2205 log_heap_new_cid(relation, heaptup);
2206 }
2207
2208 if (PageIsAllVisible(page))
2209 {
2210 all_visible_cleared = true;
2211 PageClearAllVisible(page);
2212 visibilitymap_clear(relation,
2213 BufferGetBlockNumber(buffer),
2214 vmbuffer, VISIBILITYMAP_VALID_BITS);
2215 }
2216
2217 /*
2218 * XXX Should we set PageSetPrunable on this page ? See heap_insert()
2219 */
2220
2221 MarkBufferDirty(buffer);
2222
2223 /* XLOG stuff */
2224 if (needwal)
2225 {
2226 XLogRecPtr recptr;
2227 xl_heap_multi_insert *xlrec;
2228 uint8 info = XLOG_HEAP2_MULTI_INSERT;
2229 char *tupledata;
2230 int totaldatalen;
2231 char *scratchptr = scratch.data;
2232 bool init;
2233 int bufflags = 0;
2234
2235 /*
2236 * If the page was previously empty, we can reinit the page
2237 * instead of restoring the whole thing.
2238 */
2239 init = (ItemPointerGetOffsetNumber(&(heaptuples[ndone]->t_self)) == FirstOffsetNumber &&
2240 PageGetMaxOffsetNumber(page) == FirstOffsetNumber + nthispage - 1);
2241
2242 /* allocate xl_heap_multi_insert struct from the scratch area */
2243 xlrec = (xl_heap_multi_insert *) scratchptr;
2244 scratchptr += SizeOfHeapMultiInsert;
2245
2246 /*
2247 * Allocate the offsets array, unless we're reinitializing the page;
2248 * in that case the tuples are stored in order starting at
2249 * FirstOffsetNumber, so we don't need to store the offsets
2250 * explicitly.
2251 */
2252 if (!init)
2253 scratchptr += nthispage * sizeof(OffsetNumber);
2254
2255 /* the rest of the scratch space is used for tuple data */
2256 tupledata = scratchptr;
2257
2258 xlrec->flags = all_visible_cleared ? XLH_INSERT_ALL_VISIBLE_CLEARED : 0;
2259 xlrec->ntuples = nthispage;
2260
2261 /*
2262 * Write out an xl_multi_insert_tuple and the tuple data itself
2263 * for each tuple.
2264 */
2265 for (i = 0; i < nthispage; i++)
2266 {
2267 HeapTuple heaptup = heaptuples[ndone + i];
2268 xl_multi_insert_tuple *tuphdr;
2269 int datalen;
2270
2271 if (!init)
2272 xlrec->offsets[i] = ItemPointerGetOffsetNumber(&heaptup->t_self);
2273 /* xl_multi_insert_tuple needs two-byte alignment. */
2274 tuphdr = (xl_multi_insert_tuple *) SHORTALIGN(scratchptr);
2275 scratchptr = ((char *) tuphdr) + SizeOfMultiInsertTuple;
2276
2277 tuphdr->t_infomask2 = heaptup->t_data->t_infomask2;
2278 tuphdr->t_infomask = heaptup->t_data->t_infomask;
2279 tuphdr->t_hoff = heaptup->t_data->t_hoff;
2280
2281 /* write bitmap [+ padding] [+ oid] + data */
2282 datalen = heaptup->t_len - SizeofHeapTupleHeader;
2283 memcpy(scratchptr,
2284 (char *) heaptup->t_data + SizeofHeapTupleHeader,
2285 datalen);
2286 tuphdr->datalen = datalen;
2287 scratchptr += datalen;
2288 }
2289 totaldatalen = scratchptr - tupledata;
2290 Assert((scratchptr - scratch.data) < BLCKSZ);
2291
2292 if (need_tuple_data)
2293 xlrec->flags |= XLH_INSERT_CONTAINS_NEW_TUPLE;
2294
2295 /*
2296 * Signal that this is the last xl_heap_multi_insert record
2297 * emitted by this call to heap_multi_insert(). Needed for logical
2298 * decoding so it knows when to clean up temporary data.
2299 */
2300 if (ndone + nthispage == ntuples)
2301 xlrec->flags |= XLH_INSERT_LAST_IN_MULTI;
2302
2303 if (init)
2304 {
2305 info |= XLOG_HEAP_INIT_PAGE;
2306 bufflags |= REGBUF_WILL_INIT;
2307 }
2308
2309 /*
2310 * If we're doing logical decoding, include the new tuple data
2311 * even if we take a full-page image of the page.
2312 */
2313 if (need_tuple_data)
2314 bufflags |= REGBUF_KEEP_DATA;
2315
2316 XLogBeginInsert();
2317 XLogRegisterData((char *) xlrec, tupledata - scratch.data);
2318 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags);
2319
2320 XLogRegisterBufData(0, tupledata, totaldatalen);
2321
2322 /* filtering by origin on a row level is much more efficient */
2323 XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
2324
2325 recptr = XLogInsert(RM_HEAP2_ID, info);
2326
2327 PageSetLSN(page, recptr);
2328 }
2329
2330 END_CRIT_SECTION();
2331
2332 UnlockReleaseBuffer(buffer);
2333 if (vmbuffer != InvalidBuffer)
2334 ReleaseBuffer(vmbuffer);
2335
2336 ndone += nthispage;
2337 }
2338
2339 /*
2340 * We're done with the actual inserts. Check for conflicts again, to
2341 * ensure that all rw-conflicts in to these inserts are detected. Without
2342 * this final check, a sequential scan of the heap may have locked the
2343 * table after the "before" check, missing one opportunity to detect the
2344 * conflict, and then scanned the table before the new tuples were there,
2345 * missing the other chance to detect the conflict.
2346 *
2347 * For heap inserts, we only need to check for table-level SSI locks. Our
2348 * new tuples can't possibly conflict with existing tuple locks, and heap
2349 * page locks are only consolidated versions of tuple locks; they do not
2350 * lock "gaps" as index page locks do. So we don't need to specify a
2351 * buffer when making the call.
2352 */
2353 CheckForSerializableConflictIn(relation, NULL, InvalidBuffer);
2354
2355 /*
2356 * If tuples are cachable, mark them for invalidation from the caches in
2357 * case we abort. Note it is OK to do this after releasing the buffer,
2358 * because the heaptuples data structure is all in local memory, not in
2359 * the shared buffer.
2360 */
2361 if (IsCatalogRelation(relation))
2362 {
2363 for (i = 0; i < ntuples; i++)
2364 CacheInvalidateHeapTuple(relation, heaptuples[i], NULL);
2365 }
2366
2367 /* copy t_self fields back to the caller's slots */
2368 for (i = 0; i < ntuples; i++)
2369 slots[i]->tts_tid = heaptuples[i]->t_self;
2370
2371 pgstat_count_heap_insert(relation, ntuples);
2372}
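
/*
 * Illustrative sketch (annotation added for this edit, not part of the
 * original file): a minimal caller of heap_multi_insert().  The source
 * tuples are first stored into heap-tuple slots; "rel", "tupdesc", "tuples"
 * and "ntups" are assumed to exist, and the slot routines come from
 * executor/tuptable.h.
 *
 *    TupleTableSlot **slots = palloc(ntups * sizeof(TupleTableSlot *));
 *
 *    for (i = 0; i < ntups; i++)
 *    {
 *        slots[i] = MakeSingleTupleTableSlot(tupdesc, &TTSOpsHeapTuple);
 *        ExecStoreHeapTuple(tuples[i], slots[i], false);
 *    }
 *    heap_multi_insert(rel, slots, ntups, GetCurrentCommandId(true), 0, NULL);
 *    for (i = 0; i < ntups; i++)
 *        ExecDropSingleTupleTableSlot(slots[i]);
 */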
2373
2374/*
2375 * simple_heap_insert - insert a tuple
2376 *
2377 * Currently, this routine differs from heap_insert only in supplying
2378 * a default command ID and not allowing access to the speedup options.
2379 *
2380 * This should be used rather than using heap_insert directly in most places
2381 * where we are modifying system catalogs.
2382 */
2383void
2384simple_heap_insert(Relation relation, HeapTuple tup)
2385{
2386 heap_insert(relation, tup, GetCurrentCommandId(true), 0, NULL);
2387}
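
/*
 * Illustrative sketch (annotation added for this edit, not part of the
 * original file): forming a tuple and inserting it with simple_heap_insert().
 * "rel", "values" and "nulls" are assumed to be set up by the caller; note
 * that the caller must also create any index entries (catalog code normally
 * uses the CatalogTupleInsert() wrapper, which handles that).
 *
 *    HeapTuple   tup;
 *
 *    tup = heap_form_tuple(RelationGetDescr(rel), values, nulls);
 *    simple_heap_insert(rel, tup);
 *    ... insert index entries for tup here ...
 *    heap_freetuple(tup);
 */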
2388
2389/*
2390 * Given infomask/infomask2, compute the bits that must be saved in the
2391 * "infobits" field of xl_heap_delete, xl_heap_update, xl_heap_lock,
2392 * xl_heap_lock_updated WAL records.
2393 *
2394 * See fix_infomask_from_infobits.
2395 */
2396static uint8
2397compute_infobits(uint16 infomask, uint16 infomask2)
2398{
2399 return
2400 ((infomask & HEAP_XMAX_IS_MULTI) != 0 ? XLHL_XMAX_IS_MULTI : 0) |
2401 ((infomask & HEAP_XMAX_LOCK_ONLY) != 0 ? XLHL_XMAX_LOCK_ONLY : 0) |
2402 ((infomask & HEAP_XMAX_EXCL_LOCK) != 0 ? XLHL_XMAX_EXCL_LOCK : 0) |
2403 /* note we ignore HEAP_XMAX_SHR_LOCK here */
2404 ((infomask & HEAP_XMAX_KEYSHR_LOCK) != 0 ? XLHL_XMAX_KEYSHR_LOCK : 0) |
2405 ((infomask2 & HEAP_KEYS_UPDATED) != 0 ?
2406 XLHL_KEYS_UPDATED : 0);
2407}
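
/*
 * Worked example (annotation added for this edit, not part of the original
 * file): for a tuple that is exclusively locked but not updated,
 *
 *    compute_infobits(HEAP_XMAX_EXCL_LOCK | HEAP_XMAX_LOCK_ONLY, 0)
 *
 * returns XLHL_XMAX_EXCL_LOCK | XLHL_XMAX_LOCK_ONLY;
 * fix_infomask_from_infobits() performs the reverse mapping at replay time.
 */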
2408
2409/*
2410 * Given two versions of the same t_infomask for a tuple, compare them and
2411 * return whether the relevant status for a tuple Xmax has changed. This is
2412 * used after a buffer lock has been released and reacquired: we want to ensure
2413 * that the tuple state continues to be the same as it was when we previously
2414 * examined it.
2415 *
2416 * Note the Xmax field itself must be compared separately.
2417 */
2418static inline bool
2419xmax_infomask_changed(uint16 new_infomask, uint16 old_infomask)
2420{
2421 const uint16 interesting =
2422 HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY | HEAP_LOCK_MASK;
2423
2424 if ((new_infomask & interesting) != (old_infomask & interesting))
2425 return true;
2426
2427 return false;
2428}
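
/*
 * Illustrative sketch (annotation added for this edit, not part of the
 * original file): the recheck pattern this helper supports, as used by
 * heap_delete(), heap_update() and heap_lock_tuple() after the buffer lock
 * has been dropped and re-taken.  "xwait" and "infomask" hold the values
 * copied before the lock was released.
 *
 *    if (xmax_infomask_changed(tup->t_data->t_infomask, infomask) ||
 *        !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tup->t_data), xwait))
 *        goto restart;    ... tuple state changed under us; start over ...
 */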
2429
2430/*
2431 * heap_delete - delete a tuple
2432 *
2433 * See table_tuple_delete() for an explanation of the parameters, except that
2434 * this routine directly takes a tuple rather than a slot.
2435 *
2436 * In the failure cases, the routine fills *tmfd with the tuple's t_ctid,
2437 * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last
2438 * only for TM_SelfModified, since we cannot obtain cmax from a combocid
2439 * generated by another transaction).
2440 */
2441TM_Result
2442heap_delete(Relation relation, ItemPointer tid,
2443 CommandId cid, Snapshot crosscheck, bool wait,
2444 TM_FailureData *tmfd, bool changingPart)
2445{
2446 TM_Result result;
2447 TransactionId xid = GetCurrentTransactionId();
2448 ItemId lp;
2449 HeapTupleData tp;
2450 Page page;
2451 BlockNumber block;
2452 Buffer buffer;
2453 Buffer vmbuffer = InvalidBuffer;
2454 TransactionId new_xmax;
2455 uint16 new_infomask,
2456 new_infomask2;
2457 bool have_tuple_lock = false;
2458 bool iscombo;
2459 bool all_visible_cleared = false;
2460 HeapTuple old_key_tuple = NULL; /* replica identity of the tuple */
2461 bool old_key_copied = false;
2462
2463 Assert(ItemPointerIsValid(tid));
2464
2465 /*
2466 * Forbid this during a parallel operation, lest it allocate a combocid.
2467 * Other workers might need that combocid for visibility checks, and we
2468 * have no provision for broadcasting it to them.
2469 */
2470 if (IsInParallelMode())
2471 ereport(ERROR,
2472 (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
2473 errmsg("cannot delete tuples during a parallel operation")));
2474
2475 block = ItemPointerGetBlockNumber(tid);
2476 buffer = ReadBuffer(relation, block);
2477 page = BufferGetPage(buffer);
2478
2479 /*
2480 * Before locking the buffer, pin the visibility map page if it appears to
2481 * be necessary. Since we haven't got the lock yet, someone else might be
2482 * in the middle of changing this, so we'll need to recheck after we have
2483 * the lock.
2484 */
2485 if (PageIsAllVisible(page))
2486 visibilitymap_pin(relation, block, &vmbuffer);
2487
2488 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2489
2490 /*
2491 * If we didn't pin the visibility map page and the page has become all
2492 * visible while we were busy locking the buffer, we'll have to unlock and
2493 * re-lock, to avoid holding the buffer lock across an I/O. That's a bit
2494 * unfortunate, but hopefully shouldn't happen often.
2495 */
2496 if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
2497 {
2498 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2499 visibilitymap_pin(relation, block, &vmbuffer);
2500 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2501 }
2502
2503 lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
2504 Assert(ItemIdIsNormal(lp));
2505
2506 tp.t_tableOid = RelationGetRelid(relation);
2507 tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
2508 tp.t_len = ItemIdGetLength(lp);
2509 tp.t_self = *tid;
2510
2511l1:
2512 result = HeapTupleSatisfiesUpdate(&tp, cid, buffer);
2513
2514 if (result == TM_Invisible)
2515 {
2516 UnlockReleaseBuffer(buffer);
2517 ereport(ERROR,
2518 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
2519 errmsg("attempted to delete invisible tuple")));
2520 }
2521 else if (result == TM_BeingModified && wait)
2522 {
2523 TransactionId xwait;
2524 uint16 infomask;
2525
2526 /* must copy state data before unlocking buffer */
2527 xwait = HeapTupleHeaderGetRawXmax(tp.t_data);
2528 infomask = tp.t_data->t_infomask;
2529
2530 /*
2531 * Sleep until concurrent transaction ends -- except when there's a
2532 * single locker and it's our own transaction. Note we don't care
2533 * which lock mode the locker has, because we need the strongest one.
2534 *
2535 * Before sleeping, we need to acquire tuple lock to establish our
2536 * priority for the tuple (see heap_lock_tuple). LockTuple will
2537 * release us when we are next-in-line for the tuple.
2538 *
2539 * If we are forced to "start over" below, we keep the tuple lock;
2540 * this arranges that we stay at the head of the line while rechecking
2541 * tuple state.
2542 */
2543 if (infomask & HEAP_XMAX_IS_MULTI)
2544 {
2545 bool current_is_member = false;
2546
2547 if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
2548 LockTupleExclusive, &current_is_member))
2549 {
2550 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2551
2552 /*
2553 * Acquire the lock, if necessary (but skip it when we're
2554 * requesting a lock and already have one; avoids deadlock).
2555 */
2556 if (!current_is_member)
2557 heap_acquire_tuplock(relation, &(tp.t_self), LockTupleExclusive,
2558 LockWaitBlock, &have_tuple_lock);
2559
2560 /* wait for multixact */
2561 MultiXactIdWait((MultiXactId) xwait, MultiXactStatusUpdate, infomask,
2562 relation, &(tp.t_self), XLTW_Delete,
2563 NULL);
2564 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2565
2566 /*
2567 * If xwait had just locked the tuple then some other xact
2568 * could update this tuple before we get to this point. Check
2569 * for xmax change, and start over if so.
2570 */
2571 if (xmax_infomask_changed(tp.t_data->t_infomask, infomask) ||
2572 !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data),
2573 xwait))
2574 goto l1;
2575 }
2576
2577 /*
2578 * You might think the multixact is necessarily done here, but not
2579 * so: it could have surviving members, namely our own xact or
2580 * other subxacts of this backend. It is legal for us to delete
2581 * the tuple in either case, however (the latter case is
2582 * essentially a situation of upgrading our former shared lock to
2583 * exclusive). We don't bother changing the on-disk hint bits
2584 * since we are about to overwrite the xmax altogether.
2585 */
2586 }
2587 else if (!TransactionIdIsCurrentTransactionId(xwait))
2588 {
2589 /*
2590 * Wait for regular transaction to end; but first, acquire tuple
2591 * lock.
2592 */
2593 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2594 heap_acquire_tuplock(relation, &(tp.t_self), LockTupleExclusive,
2595 LockWaitBlock, &have_tuple_lock);
2596 XactLockTableWait(xwait, relation, &(tp.t_self), XLTW_Delete);
2597 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2598
2599 /*
2600 * xwait is done, but if xwait had just locked the tuple then some
2601 * other xact could update this tuple before we get to this point.
2602 * Check for xmax change, and start over if so.
2603 */
2604 if (xmax_infomask_changed(tp.t_data->t_infomask, infomask) ||
2605 !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data),
2606 xwait))
2607 goto l1;
2608
2609 /* Otherwise check if it committed or aborted */
2610 UpdateXmaxHintBits(tp.t_data, buffer, xwait);
2611 }
2612
2613 /*
2614 * We may overwrite if previous xmax aborted, or if it committed but
2615 * only locked the tuple without updating it.
2616 */
2617 if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
2618 HEAP_XMAX_IS_LOCKED_ONLY(tp.t_data->t_infomask) ||
2619 HeapTupleHeaderIsOnlyLocked(tp.t_data))
2620 result = TM_Ok;
2621 else if (!ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid) ||
2622 HeapTupleHeaderIndicatesMovedPartitions(tp.t_data))
2623 result = TM_Updated;
2624 else
2625 result = TM_Deleted;
2626 }
2627
2628 if (crosscheck != InvalidSnapshot && result == TM_Ok)
2629 {
2630 /* Perform additional check for transaction-snapshot mode RI updates */
2631 if (!HeapTupleSatisfiesVisibility(&tp, crosscheck, buffer))
2632 result = TM_Updated;
2633 }
2634
2635 if (result != TM_Ok)
2636 {
2637 Assert(result == TM_SelfModified ||
2638 result == TM_Updated ||
2639 result == TM_Deleted ||
2640 result == TM_BeingModified);
2641 Assert(!(tp.t_data->t_infomask & HEAP_XMAX_INVALID));
2642 Assert(result != TM_Updated ||
2643 !ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid));
2644 tmfd->ctid = tp.t_data->t_ctid;
2645 tmfd->xmax = HeapTupleHeaderGetUpdateXid(tp.t_data);
2646 if (result == TM_SelfModified)
2647 tmfd->cmax = HeapTupleHeaderGetCmax(tp.t_data);
2648 else
2649 tmfd->cmax = InvalidCommandId;
2650 UnlockReleaseBuffer(buffer);
2651 if (have_tuple_lock)
2652 UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive);
2653 if (vmbuffer != InvalidBuffer)
2654 ReleaseBuffer(vmbuffer);
2655 return result;
2656 }
2657
2658 /*
2659 * We're about to do the actual delete -- check for conflict first, to
2660 * avoid possibly having to roll back work we've just done.
2661 *
2662 * This is safe without a recheck as long as there is no possibility of
2663 * another process scanning the page between this check and the delete
2664 * being visible to the scan (i.e., an exclusive buffer content lock is
2665 * continuously held from this point until the tuple delete is visible).
2666 */
2667 CheckForSerializableConflictIn(relation, &tp, buffer);
2668
2669 /* replace cid with a combo cid if necessary */
2670 HeapTupleHeaderAdjustCmax(tp.t_data, &cid, &iscombo);
2671
2672 /*
2673 * Compute replica identity tuple before entering the critical section so
2674 * we don't PANIC upon a memory allocation failure.
2675 */
2676 old_key_tuple = ExtractReplicaIdentity(relation, &tp, true, &old_key_copied);
2677
2678 /*
2679 * If this is the first possibly-multixact-able operation in the current
2680 * transaction, set my per-backend OldestMemberMXactId setting. We can be
2681 * certain that the transaction will never become a member of any older
2682 * MultiXactIds than that. (We have to do this even if we end up just
2683 * using our own TransactionId below, since some other backend could
2684 * incorporate our XID into a MultiXact immediately afterwards.)
2685 */
2686 MultiXactIdSetOldestMember();
2687
2688 compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(tp.t_data),
2689 tp.t_data->t_infomask, tp.t_data->t_infomask2,
2690 xid, LockTupleExclusive, true,
2691 &new_xmax, &new_infomask, &new_infomask2);
2692
2693 START_CRIT_SECTION();
2694
2695 /*
2696 * If this transaction commits, the tuple will become DEAD sooner or
2697 * later. Set flag that this page is a candidate for pruning once our xid
2698 * falls below the OldestXmin horizon. If the transaction finally aborts,
2699 * the subsequent page pruning will be a no-op and the hint will be
2700 * cleared.
2701 */
2702 PageSetPrunable(page, xid);
2703
2704 if (PageIsAllVisible(page))
2705 {
2706 all_visible_cleared = true;
2707 PageClearAllVisible(page);
2708 visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
2709 vmbuffer, VISIBILITYMAP_VALID_BITS);
2710 }
2711
2712 /* store transaction information of xact deleting the tuple */
2713 tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
2714 tp.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
2715 tp.t_data->t_infomask |= new_infomask;
2716 tp.t_data->t_infomask2 |= new_infomask2;
2717 HeapTupleHeaderClearHotUpdated(tp.t_data);
2718 HeapTupleHeaderSetXmax(tp.t_data, new_xmax);
2719 HeapTupleHeaderSetCmax(tp.t_data, cid, iscombo);
2720 /* Make sure there is no forward chain link in t_ctid */
2721 tp.t_data->t_ctid = tp.t_self;
2722
2723 /* Signal that this is actually a move into another partition */
2724 if (changingPart)
2725 HeapTupleHeaderSetMovedPartitions(tp.t_data);
2726
2727 MarkBufferDirty(buffer);
2728
2729 /*
2730 * XLOG stuff
2731 *
2732 * NB: heap_abort_speculative() uses the same xlog record and replay
2733 * routines.
2734 */
2735 if (RelationNeedsWAL(relation))
2736 {
2737 xl_heap_delete xlrec;
2738 xl_heap_header xlhdr;
2739 XLogRecPtr recptr;
2740
2741 /* For logical decoding we need combocids to properly decode the catalog */
2742 if (RelationIsAccessibleInLogicalDecoding(relation))
2743 log_heap_new_cid(relation, &tp);
2744
2745 xlrec.flags = 0;
2746 if (all_visible_cleared)
2747 xlrec.flags |= XLH_DELETE_ALL_VISIBLE_CLEARED;
2748 if (changingPart)
2749 xlrec.flags |= XLH_DELETE_IS_PARTITION_MOVE;
2750 xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask,
2751 tp.t_data->t_infomask2);
2752 xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self);
2753 xlrec.xmax = new_xmax;
2754
2755 if (old_key_tuple != NULL)
2756 {
2757 if (relation->rd_rel->relreplident == REPLICA_IDENTITY_FULL)
2758 xlrec.flags |= XLH_DELETE_CONTAINS_OLD_TUPLE;
2759 else
2760 xlrec.flags |= XLH_DELETE_CONTAINS_OLD_KEY;
2761 }
2762
2763 XLogBeginInsert();
2764 XLogRegisterData((char *) &xlrec, SizeOfHeapDelete);
2765
2766 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
2767
2768 /*
2769 * Log replica identity of the deleted tuple if there is one
2770 */
2771 if (old_key_tuple != NULL)
2772 {
2773 xlhdr.t_infomask2 = old_key_tuple->t_data->t_infomask2;
2774 xlhdr.t_infomask = old_key_tuple->t_data->t_infomask;
2775 xlhdr.t_hoff = old_key_tuple->t_data->t_hoff;
2776
2777 XLogRegisterData((char *) &xlhdr, SizeOfHeapHeader);
2778 XLogRegisterData((char *) old_key_tuple->t_data
2779 + SizeofHeapTupleHeader,
2780 old_key_tuple->t_len
2781 - SizeofHeapTupleHeader);
2782 }
2783
2784 /* filtering by origin on a row level is much more efficient */
2785 XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
2786
2787 recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE);
2788
2789 PageSetLSN(page, recptr);
2790 }
2791
2792 END_CRIT_SECTION();
2793
2794 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2795
2796 if (vmbuffer != InvalidBuffer)
2797 ReleaseBuffer(vmbuffer);
2798
2799 /*
2800 * If the tuple has toasted out-of-line attributes, we need to delete
2801 * those items too. We have to do this before releasing the buffer
2802 * because we need to look at the contents of the tuple, but it's OK to
2803 * release the content lock on the buffer first.
2804 */
2805 if (relation->rd_rel->relkind != RELKIND_RELATION &&
2806 relation->rd_rel->relkind != RELKIND_MATVIEW)
2807 {
2808 /* toast table entries should never be recursively toasted */
2809 Assert(!HeapTupleHasExternal(&tp));
2810 }
2811 else if (HeapTupleHasExternal(&tp))
2812 toast_delete(relation, &tp, false);
2813
2814 /*
2815 * Mark tuple for invalidation from system caches at next command
2816 * boundary. We have to do this before releasing the buffer because we
2817 * need to look at the contents of the tuple.
2818 */
2819 CacheInvalidateHeapTuple(relation, &tp, NULL);
2820
2821 /* Now we can release the buffer */
2822 ReleaseBuffer(buffer);
2823
2824 /*
2825 * Release the lmgr tuple lock, if we had it.
2826 */
2827 if (have_tuple_lock)
2828 UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive);
2829
2830 pgstat_count_heap_delete(relation);
2831
2832 if (old_key_tuple != NULL && old_key_copied)
2833 heap_freetuple(old_key_tuple);
2834
2835 return TM_Ok;
2836}
2837
2838/*
2839 * simple_heap_delete - delete a tuple
2840 *
2841 * This routine may be used to delete a tuple when concurrent updates of
2842 * the target tuple are not expected (for example, because we have a lock
2843 * on the relation associated with the tuple). Any failure is reported
2844 * via ereport().
2845 */
2846void
2847simple_heap_delete(Relation relation, ItemPointer tid)
2848{
2849 TM_Result result;
2850 TM_FailureData tmfd;
2851
2852 result = heap_delete(relation, tid,
2853 GetCurrentCommandId(true), InvalidSnapshot,
2854 true /* wait for commit */ ,
2855 &tmfd, false /* changingPart */ );
2856 switch (result)
2857 {
2858 case TM_SelfModified:
2859 /* Tuple was already updated in current command? */
2860 elog(ERROR, "tuple already updated by self");
2861 break;
2862
2863 case TM_Ok:
2864 /* done successfully */
2865 break;
2866
2867 case TM_Updated:
2868 elog(ERROR, "tuple concurrently updated");
2869 break;
2870
2871 case TM_Deleted:
2872 elog(ERROR, "tuple concurrently deleted");
2873 break;
2874
2875 default:
2876 elog(ERROR, "unrecognized heap_delete status: %u", result);
2877 break;
2878 }
2879}
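
/*
 * Illustrative sketch (annotation added for this edit, not part of the
 * original file): deleting a tuple by TID with simple_heap_delete().  The
 * lock taken when opening the relation is assumed to rule out concurrent
 * updates of the target tuple, as the comment above requires; "relid" and
 * the tuple "tup" are supplied by the caller.
 *
 *    Relation    rel = table_open(relid, RowExclusiveLock);
 *
 *    simple_heap_delete(rel, &tup->t_self);
 *    table_close(rel, RowExclusiveLock);
 */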
2880
2881/*
2882 * heap_update - replace a tuple
2883 *
2884 * See table_tuple_update() for an explanation of the parameters, except that
2885 * this routine directly takes a tuple rather than a slot.
2886 *
2887 * In the failure cases, the routine fills *tmfd with the tuple's t_ctid,
2888 * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last
2889 * only for TM_SelfModified, since we cannot obtain cmax from a combocid
2890 * generated by another transaction).
2891 */
2892TM_Result
2893heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
2894 CommandId cid, Snapshot crosscheck, bool wait,
2895 TM_FailureData *tmfd, LockTupleMode *lockmode)
2896{
2897 TM_Result result;
2898 TransactionId xid = GetCurrentTransactionId();
2899 Bitmapset *hot_attrs;
2900 Bitmapset *key_attrs;
2901 Bitmapset *id_attrs;
2902 Bitmapset *interesting_attrs;
2903 Bitmapset *modified_attrs;
2904 ItemId lp;
2905 HeapTupleData oldtup;
2906 HeapTuple heaptup;
2907 HeapTuple old_key_tuple = NULL;
2908 bool old_key_copied = false;
2909 Page page;
2910 BlockNumber block;
2911 MultiXactStatus mxact_status;
2912 Buffer buffer,
2913 newbuf,
2914 vmbuffer = InvalidBuffer,
2915 vmbuffer_new = InvalidBuffer;
2916 bool need_toast;
2917 Size newtupsize,
2918 pagefree;
2919 bool have_tuple_lock = false;
2920 bool iscombo;
2921 bool use_hot_update = false;
2922 bool hot_attrs_checked = false;
2923 bool key_intact;
2924 bool all_visible_cleared = false;
2925 bool all_visible_cleared_new = false;
2926 bool checked_lockers;
2927 bool locker_remains;
2928 TransactionId xmax_new_tuple,
2929 xmax_old_tuple;
2930 uint16 infomask_old_tuple,
2931 infomask2_old_tuple,
2932 infomask_new_tuple,
2933 infomask2_new_tuple;
2934
2935 Assert(ItemPointerIsValid(otid));
2936
2937 /*
2938 * Forbid this during a parallel operation, lest it allocate a combocid.
2939 * Other workers might need that combocid for visibility checks, and we
2940 * have no provision for broadcasting it to them.
2941 */
2942 if (IsInParallelMode())
2943 ereport(ERROR,
2944 (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
2945 errmsg("cannot update tuples during a parallel operation")));
2946
2947 /*
2948 * Fetch the list of attributes to be checked for various operations.
2949 *
2950 * For HOT considerations, this is wasted effort if we fail to update or
2951 * have to put the new tuple on a different page. But we must compute the
2952 * list before obtaining buffer lock --- in the worst case, if we are
2953 * doing an update on one of the relevant system catalogs, we could
2954 * deadlock if we try to fetch the list later. In any case, the relcache
2955 * caches the data so this is usually pretty cheap.
2956 *
2957 * We also need columns used by the replica identity and columns that are
2958 * considered the "key" of rows in the table.
2959 *
2960 * Note that we get copies of each bitmap, so we need not worry about
2961 * relcache flush happening midway through.
2962 */
2963 hot_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_ALL);
2964 key_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_KEY);
2965 id_attrs = RelationGetIndexAttrBitmap(relation,
2966 INDEX_ATTR_BITMAP_IDENTITY_KEY);
2967
2968
2969 block = ItemPointerGetBlockNumber(otid);
2970 buffer = ReadBuffer(relation, block);
2971 page = BufferGetPage(buffer);
2972
2973 interesting_attrs = NULL;
2974
2975 /*
2976 * If the page is already full, there is hardly any chance of doing a HOT
2977 * update on this page. It might be wasteful effort to look for index
2978 * column updates only to later reject HOT updates for lack of space in
2979 * the same page. So we are conservative and only fetch hot_attrs if the
2980 * page is not already full. Since we are already holding a pin on the
2981 * buffer, there is no chance that the buffer can get cleaned up
2982 * concurrently and even if that was possible, in the worst case we lose a
2983 * chance to do a HOT update.
2984 */
2985 if (!PageIsFull(page))
2986 {
2987 interesting_attrs = bms_add_members(interesting_attrs, hot_attrs);
2988 hot_attrs_checked = true;
2989 }
2990 interesting_attrs = bms_add_members(interesting_attrs, key_attrs);
2991 interesting_attrs = bms_add_members(interesting_attrs, id_attrs);
2992
2993 /*
2994 * Before locking the buffer, pin the visibility map page if it appears to
2995 * be necessary. Since we haven't got the lock yet, someone else might be
2996 * in the middle of changing this, so we'll need to recheck after we have
2997 * the lock.
2998 */
2999 if (PageIsAllVisible(page))
3000 visibilitymap_pin(relation, block, &vmbuffer);
3001
3002 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3003
3004 lp = PageGetItemId(page, ItemPointerGetOffsetNumber(otid));
3005 Assert(ItemIdIsNormal(lp));
3006
3007 /*
3008 * Fill in enough data in oldtup for HeapDetermineModifiedColumns to work
3009 * properly.
3010 */
3011 oldtup.t_tableOid = RelationGetRelid(relation);
3012 oldtup.t_data = (HeapTupleHeader) PageGetItem(page, lp);
3013 oldtup.t_len = ItemIdGetLength(lp);
3014 oldtup.t_self = *otid;
3015
3016 /* the new tuple is ready, except for this: */
3017 newtup->t_tableOid = RelationGetRelid(relation);
3018
3019 /* Determine columns modified by the update. */
3020 modified_attrs = HeapDetermineModifiedColumns(relation, interesting_attrs,
3021 &oldtup, newtup);
3022
3023 /*
3024 * If we're not updating any "key" column, we can grab a weaker lock type.
3025 * This allows for more concurrency when we are running simultaneously
3026 * with foreign key checks.
3027 *
3028 * Note that if a column gets detoasted while executing the update, but
3029 * the value ends up being the same, this test will fail and we will use
3030 * the stronger lock. This is acceptable; the important case to optimize
3031 * is updates that don't manipulate key columns, not those that
3032 * serendipitously arrive at the same key values.
3033 */
3034 if (!bms_overlap(modified_attrs, key_attrs))
3035 {
3036 *lockmode = LockTupleNoKeyExclusive;
3037 mxact_status = MultiXactStatusNoKeyUpdate;
3038 key_intact = true;
3039
3040 /*
3041 * If this is the first possibly-multixact-able operation in the
3042 * current transaction, set my per-backend OldestMemberMXactId
3043 * setting. We can be certain that the transaction will never become a
3044 * member of any older MultiXactIds than that. (We have to do this
3045 * even if we end up just using our own TransactionId below, since
3046 * some other backend could incorporate our XID into a MultiXact
3047 * immediately afterwards.)
3048 */
3049 MultiXactIdSetOldestMember();
3050 }
3051 else
3052 {
3053 *lockmode = LockTupleExclusive;
3054 mxact_status = MultiXactStatusUpdate;
3055 key_intact = false;
3056 }
3057
3058 /*
3059 * Note: beyond this point, use oldtup not otid to refer to old tuple.
3060 * otid may very well point at newtup->t_self, which we will overwrite
3061 * with the new tuple's location, so there's great risk of confusion if we
3062 * use otid anymore.
3063 */
3064
3065l2:
3066 checked_lockers = false;
3067 locker_remains = false;
3068 result = HeapTupleSatisfiesUpdate(&oldtup, cid, buffer);
3069
3070 /* see below about the "no wait" case */
3071 Assert(result != TM_BeingModified || wait);
3072
3073 if (result == TM_Invisible)
3074 {
3075 UnlockReleaseBuffer(buffer);
3076 ereport(ERROR,
3077 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
3078 errmsg("attempted to update invisible tuple")));
3079 }
3080 else if (result == TM_BeingModified && wait)
3081 {
3082 TransactionId xwait;
3083 uint16 infomask;
3084 bool can_continue = false;
3085
3086 /*
3087 * XXX note that we don't consider the "no wait" case here. This
3088 * isn't a problem currently because no caller uses that case, but it
3089 * should be fixed if such a caller is introduced. It wasn't a
3090 * problem previously because this code would always wait, but now
3091 * that some tuple locks do not conflict with one of the lock modes we
3092 * use, it is possible that this case is interesting to handle
3093 * specially.
3094 *
3095 * This may cause failures with third-party code that calls
3096 * heap_update directly.
3097 */
3098
3099 /* must copy state data before unlocking buffer */
3100 xwait = HeapTupleHeaderGetRawXmax(oldtup.t_data);
3101 infomask = oldtup.t_data->t_infomask;
3102
3103 /*
3104 * Now we have to do something about the existing locker. If it's a
3105 * multi, sleep on it; we might be awakened before it is completely
3106 * gone (or even not sleep at all in some cases); we need to preserve
3107 * it as locker, unless it is gone completely.
3108 *
3109 * If it's not a multi, we need to check for sleeping conditions
3110 * before actually going to sleep. If the update doesn't conflict
3111 * with the locks, we just continue without sleeping (but making sure
3112 * it is preserved).
3113 *
3114 * Before sleeping, we need to acquire tuple lock to establish our
3115 * priority for the tuple (see heap_lock_tuple). LockTuple will
3116 * release us when we are next-in-line for the tuple. Note we must
3117 * not acquire the tuple lock until we're sure we're going to sleep;
3118 * otherwise we're open for race conditions with other transactions
3119 * holding the tuple lock which sleep on us.
3120 *
3121 * If we are forced to "start over" below, we keep the tuple lock;
3122 * this arranges that we stay at the head of the line while rechecking
3123 * tuple state.
3124 */
3125 if (infomask & HEAP_XMAX_IS_MULTI)
3126 {
3127 TransactionId update_xact;
3128 int remain;
3129 bool current_is_member = false;
3130
3131 if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
3132 *lockmode, &current_is_member))
3133 {
3134 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3135
3136 /*
3137 * Acquire the lock, if necessary (but skip it when we're
3138 * requesting a lock and already have one; avoids deadlock).
3139 */
3140 if (!current_is_member)
3141 heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3142 LockWaitBlock, &have_tuple_lock);
3143
3144 /* wait for multixact */
3145 MultiXactIdWait((MultiXactId) xwait, mxact_status, infomask,
3146 relation, &oldtup.t_self, XLTW_Update,
3147 &remain);
3148 checked_lockers = true;
3149 locker_remains = remain != 0;
3150 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3151
3152 /*
3153 * If xwait had just locked the tuple then some other xact
3154 * could update this tuple before we get to this point. Check
3155 * for xmax change, and start over if so.
3156 */
3157 if (xmax_infomask_changed(oldtup.t_data->t_infomask,
3158 infomask) ||
3159 !TransactionIdEquals(HeapTupleHeaderGetRawXmax(oldtup.t_data),
3160 xwait))
3161 goto l2;
3162 }
3163
3164 /*
3165 * Note that the multixact may not be done by now. It could have
3166 * surviving members; our own xact or other subxacts of this
3167 * backend, and also any other concurrent transaction that locked
3168 * the tuple with LockTupleKeyShare if we only got
3169 * LockTupleNoKeyExclusive. If this is the case, we have to be
3170 * careful to mark the updated tuple with the surviving members in
3171 * Xmax.
3172 *
3173 * Note that there could have been another update in the
3174 * MultiXact. In that case, we need to check whether it committed
3175 * or aborted. If it aborted we are safe to update it again;
3176 * otherwise there is an update conflict, and we have to return
3177 * TM_Deleted or TM_Updated below.
3178 *
3179 * In the LockTupleExclusive case, we still need to preserve the
3180 * surviving members: those would include the tuple locks we had
3181 * before this one, which are important to keep in case this
3182 * subxact aborts.
3183 */
3184 if (!HEAP_XMAX_IS_LOCKED_ONLY(oldtup.t_data->t_infomask))
3185 update_xact = HeapTupleGetUpdateXid(oldtup.t_data);
3186 else
3187 update_xact = InvalidTransactionId;
3188
3189 /*
3190 * There was no UPDATE in the MultiXact; or it aborted. No
3191 * TransactionIdIsInProgress() call needed here, since we called
3192 * MultiXactIdWait() above.
3193 */
3194 if (!TransactionIdIsValid(update_xact) ||
3195 TransactionIdDidAbort(update_xact))
3196 can_continue = true;
3197 }
3198 else if (TransactionIdIsCurrentTransactionId(xwait))
3199 {
3200 /*
3201 * The only locker is ourselves; we can avoid grabbing the tuple
3202 * lock here, but must preserve our locking information.
3203 */
3204 checked_lockers = true;
3205 locker_remains = true;
3206 can_continue = true;
3207 }
3208 else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) && key_intact)
3209 {
3210 /*
3211 * If it's just a key-share locker, and we're not changing the key
3212 * columns, we don't need to wait for it to end; but we need to
3213 * preserve it as locker.
3214 */
3215 checked_lockers = true;
3216 locker_remains = true;
3217 can_continue = true;
3218 }
3219 else
3220 {
3221 /*
3222 * Wait for regular transaction to end; but first, acquire tuple
3223 * lock.
3224 */
3225 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3226 heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3227 LockWaitBlock, &have_tuple_lock);
3228 XactLockTableWait(xwait, relation, &oldtup.t_self,
3229 XLTW_Update);
3230 checked_lockers = true;
3231 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3232
3233 /*
3234 * xwait is done, but if xwait had just locked the tuple then some
3235 * other xact could update this tuple before we get to this point.
3236 * Check for xmax change, and start over if so.
3237 */
3238 if (xmax_infomask_changed(oldtup.t_data->t_infomask, infomask) ||
3239 !TransactionIdEquals(xwait,
3240 HeapTupleHeaderGetRawXmax(oldtup.t_data)))
3241 goto l2;
3242
3243 /* Otherwise check if it committed or aborted */
3244 UpdateXmaxHintBits(oldtup.t_data, buffer, xwait);
3245 if (oldtup.t_data->t_infomask & HEAP_XMAX_INVALID)
3246 can_continue = true;
3247 }
3248
3249 if (can_continue)
3250 result = TM_Ok;
3251 else if (!ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid) ||
3252 HeapTupleHeaderIndicatesMovedPartitions(oldtup.t_data))
3253 result = TM_Updated;
3254 else
3255 result = TM_Deleted;
3256 }
3257
3258 if (crosscheck != InvalidSnapshot && result == TM_Ok)
3259 {
3260 /* Perform additional check for transaction-snapshot mode RI updates */
3261 if (!HeapTupleSatisfiesVisibility(&oldtup, crosscheck, buffer))
3262 {
3263 result = TM_Updated;
3264 Assert(!ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid));
3265 }
3266 }
3267
3268 if (result != TM_Ok)
3269 {
3270 Assert(result == TM_SelfModified ||
3271 result == TM_Updated ||
3272 result == TM_Deleted ||
3273 result == TM_BeingModified);
3274 Assert(!(oldtup.t_data->t_infomask & HEAP_XMAX_INVALID));
3275 Assert(result != TM_Updated ||
3276 !ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid));
3277 tmfd->ctid = oldtup.t_data->t_ctid;
3278 tmfd->xmax = HeapTupleHeaderGetUpdateXid(oldtup.t_data);
3279 if (result == TM_SelfModified)
3280 tmfd->cmax = HeapTupleHeaderGetCmax(oldtup.t_data);
3281 else
3282 tmfd->cmax = InvalidCommandId;
3283 UnlockReleaseBuffer(buffer);
3284 if (have_tuple_lock)
3285 UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
3286 if (vmbuffer != InvalidBuffer)
3287 ReleaseBuffer(vmbuffer);
3288 bms_free(hot_attrs);
3289 bms_free(key_attrs);
3290 bms_free(id_attrs);
3291 bms_free(modified_attrs);
3292 bms_free(interesting_attrs);
3293 return result;
3294 }
3295
3296 /*
3297 * If we didn't pin the visibility map page and the page has become all
3298 * visible while we were busy locking the buffer, or during some
3299 * subsequent window during which we had it unlocked, we'll have to unlock
3300 * and re-lock, to avoid holding the buffer lock across an I/O. That's a
3301 * bit unfortunate, especially since we'll now have to recheck whether the
3302 * tuple has been locked or updated under us, but hopefully it won't
3303 * happen very often.
3304 */
3305 if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
3306 {
3307 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3308 visibilitymap_pin(relation, block, &vmbuffer);
3309 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3310 goto l2;
3311 }
3312
3313 /* Fill in transaction status data */
3314
3315 /*
3316 * If the tuple we're updating is locked, we need to preserve the locking
3317 * info in the old tuple's Xmax. Prepare a new Xmax value for this.
3318 */
3319 compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data),
3320 oldtup.t_data->t_infomask,
3321 oldtup.t_data->t_infomask2,
3322 xid, *lockmode, true,
3323 &xmax_old_tuple, &infomask_old_tuple,
3324 &infomask2_old_tuple);
3325
3326 /*
3327 * And also prepare an Xmax value for the new copy of the tuple. If there
3328 * was no xmax previously, or there was one but all lockers are now gone,
3329 * then use InvalidXid; otherwise, get the xmax from the old tuple. (In
3330 * rare cases that might also be InvalidXid and yet not have the
3331 * HEAP_XMAX_INVALID bit set; that's fine.)
3332 */
3333 if ((oldtup.t_data->t_infomask & HEAP_XMAX_INVALID) ||
3334 HEAP_LOCKED_UPGRADED(oldtup.t_data->t_infomask) ||
3335 (checked_lockers && !locker_remains))
3336 xmax_new_tuple = InvalidTransactionId;
3337 else
3338 xmax_new_tuple = HeapTupleHeaderGetRawXmax(oldtup.t_data);
3339
3340 if (!TransactionIdIsValid(xmax_new_tuple))
3341 {
3342 infomask_new_tuple = HEAP_XMAX_INVALID;
3343 infomask2_new_tuple = 0;
3344 }
3345 else
3346 {
3347 /*
3348 * If we found a valid Xmax for the new tuple, then the infomask bits
3349 * to use on the new tuple depend on what was there on the old one.
3350 * Note that since we're doing an update, the only possibility is that
3351 * the lockers had FOR KEY SHARE lock.
3352 */
3353 if (oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI)
3354 {
3355 GetMultiXactIdHintBits(xmax_new_tuple, &infomask_new_tuple,
3356 &infomask2_new_tuple);
3357 }
3358 else
3359 {
3360 infomask_new_tuple = HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_LOCK_ONLY;
3361 infomask2_new_tuple = 0;
3362 }
3363 }
3364
3365 /*
3366 * Prepare the new tuple with the appropriate initial values of Xmin and
3367 * Xmax, as well as initial infomask bits as computed above.
3368 */
3369 newtup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
3370 newtup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
3371 HeapTupleHeaderSetXmin(newtup->t_data, xid);
3372 HeapTupleHeaderSetCmin(newtup->t_data, cid);
3373 newtup->t_data->t_infomask |= HEAP_UPDATED | infomask_new_tuple;
3374 newtup->t_data->t_infomask2 |= infomask2_new_tuple;
3375 HeapTupleHeaderSetXmax(newtup->t_data, xmax_new_tuple);
3376
3377 /*
3378 * Replace cid with a combo cid if necessary. Note that we already put
3379 * the plain cid into the new tuple.
3380 */
3381 HeapTupleHeaderAdjustCmax(oldtup.t_data, &cid, &iscombo);
3382
3383 /*
3384 * If the toaster needs to be activated, OR if the new tuple will not fit
3385 * on the same page as the old, then we need to release the content lock
3386 * (but not the pin!) on the old tuple's buffer while we are off doing
3387 * TOAST and/or table-file-extension work. We must mark the old tuple to
3388 * show that it's locked, else other processes may try to update it
3389 * themselves.
3390 *
3391 * We need to invoke the toaster if there are already any out-of-line
3392 * toasted values present, or if the new tuple is over-threshold.
3393 */
3394 if (relation->rd_rel->relkind != RELKIND_RELATION &&
3395 relation->rd_rel->relkind != RELKIND_MATVIEW)
3396 {
3397 /* toast table entries should never be recursively toasted */
3398 Assert(!HeapTupleHasExternal(&oldtup));
3399 Assert(!HeapTupleHasExternal(newtup));
3400 need_toast = false;
3401 }
3402 else
3403 need_toast = (HeapTupleHasExternal(&oldtup) ||
3404 HeapTupleHasExternal(newtup) ||
3405 newtup->t_len > TOAST_TUPLE_THRESHOLD);
3406
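	/*
	 * Compute the free space remaining on the old page and the MAXALIGN'd
	 * size of the new tuple version, so we can decide below whether the
	 * update fits in place or whether we must release the content lock for
	 * TOAST work and/or placement of the tuple on another page.
	 */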
3407 pagefree = PageGetHeapFreeSpace(page);
3408
3409 newtupsize = MAXALIGN(newtup->t_len);
3410
3411 if (need_toast || newtupsize > pagefree)
3412 {
3413 TransactionId xmax_lock_old_tuple;
3414 uint16 infomask_lock_old_tuple,
3415 infomask2_lock_old_tuple;
3416 bool cleared_all_frozen = false;
3417
3418 /*
3419 * To prevent concurrent sessions from updating the tuple, we have to
3420 * temporarily mark it locked, while we release the page-level lock.
3421 *
3422 		 * To satisfy the rule that any xid potentially appearing in a buffer
3423 		 * written out to disk must first be WAL-logged, we unfortunately
3424 		 * have to WAL-log this temporary modification.  We can reuse
3425 		 * xl_heap_lock for this purpose.  If we crash/error before following
3426 		 * through with the actual update, xmax will be that of an aborted
3427 		 * transaction, allowing other sessions to proceed.
3428 */
3429
3430 /*
3431 * Compute xmax / infomask appropriate for locking the tuple. This has
3432 		 * to be done separately from the xmax/infomask combination that will
3433 		 * be used for the update itself, because the multixact potentially
3434 		 * created here would otherwise be wrong.
3435 */
3436 compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data),
3437 oldtup.t_data->t_infomask,
3438 oldtup.t_data->t_infomask2,
3439 xid, *lockmode, false,
3440 &xmax_lock_old_tuple, &infomask_lock_old_tuple,
3441 &infomask2_lock_old_tuple);
3442
3443 Assert(HEAP_XMAX_IS_LOCKED_ONLY(infomask_lock_old_tuple));
3444
3445 START_CRIT_SECTION();
3446
3447 /* Clear obsolete visibility flags ... */
3448 oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
3449 oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
3450 HeapTupleClearHotUpdated(&oldtup);
3451 /* ... and store info about transaction updating this tuple */
3452 Assert(TransactionIdIsValid(xmax_lock_old_tuple));
3453 HeapTupleHeaderSetXmax(oldtup.t_data, xmax_lock_old_tuple);
3454 oldtup.t_data->t_infomask |= infomask_lock_old_tuple;
3455 oldtup.t_data->t_infomask2 |= infomask2_lock_old_tuple;
3456 HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
3457
3458 /* temporarily make it look not-updated, but locked */
3459 oldtup.t_data->t_ctid = oldtup.t_self;
3460
3461 /*
3462 * Clear all-frozen bit on visibility map if needed. We could
3463 * immediately reset ALL_VISIBLE, but given that the WAL logging
3464 		 * overhead would be unchanged, that doesn't seem especially
3465 		 * worthwhile.
3466 */
3467 if (PageIsAllVisible(BufferGetPage(buffer)) &&
3468 visibilitymap_clear(relation, block, vmbuffer,
3469 VISIBILITYMAP_ALL_FROZEN))
3470 cleared_all_frozen = true;
3471
3472 MarkBufferDirty(buffer);
3473
3474 if (RelationNeedsWAL(relation))
3475 {
3476 xl_heap_lock xlrec;
3477 XLogRecPtr recptr;
3478
3479 XLogBeginInsert();
3480 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
3481
3482 xlrec.offnum = ItemPointerGetOffsetNumber(&oldtup.t_self);
3483 xlrec.locking_xid = xmax_lock_old_tuple;
3484 xlrec.infobits_set = compute_infobits(oldtup.t_data->t_infomask,
3485 oldtup.t_data->t_infomask2);
3486 xlrec.flags =
3487 cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
3488 XLogRegisterData((char *) &xlrec, SizeOfHeapLock);
3489 recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK);
3490 PageSetLSN(page, recptr);
3491 }
3492
3493 END_CRIT_SECTION();
3494
3495 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3496
3497 /*
3498 * Let the toaster do its thing, if needed.
3499 *
3500 * Note: below this point, heaptup is the data we actually intend to
3501 * store into the relation; newtup is the caller's original untoasted
3502 * data.
3503 */
3504 if (need_toast)
3505 {
3506 /* Note we always use WAL and FSM during updates */
3507 heaptup = toast_insert_or_update(relation, newtup, &oldtup, 0);
3508 newtupsize = MAXALIGN(heaptup->t_len);
3509 }
3510 else
3511 heaptup = newtup;
3512
3513 /*
3514 * Now, do we need a new page for the tuple, or not? This is a bit
3515 * tricky since someone else could have added tuples to the page while
3516 * we weren't looking. We have to recheck the available space after
3517 * reacquiring the buffer lock. But don't bother to do that if the
3518 * former amount of free space is still not enough; it's unlikely
3519 * there's more free now than before.
3520 *
3521 * What's more, if we need to get a new page, we will need to acquire
3522 * buffer locks on both old and new pages. To avoid deadlock against
3523 * some other backend trying to get the same two locks in the other
3524 * order, we must be consistent about the order we get the locks in.
3525 * We use the rule "lock the lower-numbered page of the relation
3526 * first". To implement this, we must do RelationGetBufferForTuple
3527 * while not holding the lock on the old page, and we must rely on it
3528 * to get the locks on both pages in the correct order.
3529 */
3530 if (newtupsize > pagefree)
3531 {
3532 /* Assume there's no chance to put heaptup on same page. */
3533 newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
3534 buffer, 0, NULL,
3535 &vmbuffer_new, &vmbuffer);
3536 }
3537 else
3538 {
3539 /* Re-acquire the lock on the old tuple's page. */
3540 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3541 /* Re-check using the up-to-date free space */
3542 pagefree = PageGetHeapFreeSpace(page);
3543 if (newtupsize > pagefree)
3544 {
3545 /*
3546 * Rats, it doesn't fit anymore. We must now unlock and
3547 * relock to avoid deadlock. Fortunately, this path should
3548 * seldom be taken.
3549 */
3550 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3551 newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
3552 buffer, 0, NULL,
3553 &vmbuffer_new, &vmbuffer);
3554 }
3555 else
3556 {
3557 /* OK, it fits here, so we're done. */
3558 newbuf = buffer;
3559 }
3560 }
3561 }
3562 else
3563 {
3564 /* No TOAST work needed, and it'll fit on same page */
3565 newbuf = buffer;
3566 heaptup = newtup;
3567 }
3568
3569 /*
3570 * We're about to do the actual update -- check for conflict first, to
3571 * avoid possibly having to roll back work we've just done.
3572 *
3573 * This is safe without a recheck as long as there is no possibility of
3574 * another process scanning the pages between this check and the update
3575 * being visible to the scan (i.e., exclusive buffer content lock(s) are
3576 * continuously held from this point until the tuple update is visible).
3577 *
3578 * For the new tuple the only check needed is at the relation level, but
3579 * since both tuples are in the same relation and the check for oldtup
3580 * will include checking the relation level, there is no benefit to a
3581 * separate check for the new tuple.
3582 */
3583 CheckForSerializableConflictIn(relation, &oldtup, buffer);
3584
3585 /*
3586 * At this point newbuf and buffer are both pinned and locked, and newbuf
3587 * has enough space for the new tuple. If they are the same buffer, only
3588 * one pin is held.
3589 */
3590
3591 if (newbuf == buffer)
3592 {
3593 /*
3594 * Since the new tuple is going into the same page, we might be able
3595 * to do a HOT update. Check if any of the index columns have been
3596 * changed. If the page was already full, we may have skipped checking
3597 * for index columns, and also can't do a HOT update.
3598 */
3599 if (hot_attrs_checked && !bms_overlap(modified_attrs, hot_attrs))
3600 use_hot_update = true;
3601 }
3602 else
3603 {
3604 /* Set a hint that the old page could use prune/defrag */
3605 PageSetFull(page);
3606 }
3607
3608 /*
3609 * Compute replica identity tuple before entering the critical section so
3610 * we don't PANIC upon a memory allocation failure.
3611 * ExtractReplicaIdentity() will return NULL if nothing needs to be
3612 * logged.
3613 */
3614 old_key_tuple = ExtractReplicaIdentity(relation, &oldtup,
3615 bms_overlap(modified_attrs, id_attrs),
3616 &old_key_copied);
3617
3618 /* NO EREPORT(ERROR) from here till changes are logged */
3619 START_CRIT_SECTION();
3620
3621 /*
3622 * If this transaction commits, the old tuple will become DEAD sooner or
3623 * later. Set flag that this page is a candidate for pruning once our xid
3624 * falls below the OldestXmin horizon. If the transaction finally aborts,
3625 * the subsequent page pruning will be a no-op and the hint will be
3626 * cleared.
3627 *
3628 * XXX Should we set hint on newbuf as well? If the transaction aborts,
3629 * there would be a prunable tuple in the newbuf; but for now we choose
3630 * not to optimize for aborts. Note that heap_xlog_update must be kept in
3631 * sync if this decision changes.
3632 */
3633 PageSetPrunable(page, xid);
3634
3635 if (use_hot_update)
3636 {
3637 /* Mark the old tuple as HOT-updated */
3638 HeapTupleSetHotUpdated(&oldtup);
3639 /* And mark the new tuple as heap-only */
3640 HeapTupleSetHeapOnly(heaptup);
3641 /* Mark the caller's copy too, in case different from heaptup */
3642 HeapTupleSetHeapOnly(newtup);
3643 }
3644 else
3645 {
3646 /* Make sure tuples are correctly marked as not-HOT */
3647 HeapTupleClearHotUpdated(&oldtup);
3648 HeapTupleClearHeapOnly(heaptup);
3649 HeapTupleClearHeapOnly(newtup);
3650 }
3651
3652 RelationPutHeapTuple(relation, newbuf, heaptup, false); /* insert new tuple */
3653
3654
3655 /* Clear obsolete visibility flags, possibly set by ourselves above... */
3656 oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
3657 oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
3658 /* ... and store info about transaction updating this tuple */
3659 Assert(TransactionIdIsValid(xmax_old_tuple));
3660 HeapTupleHeaderSetXmax(oldtup.t_data, xmax_old_tuple);
3661 oldtup.t_data->t_infomask |= infomask_old_tuple;
3662 oldtup.t_data->t_infomask2 |= infomask2_old_tuple;
3663 HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
3664
3665 /* record address of new tuple in t_ctid of old one */
3666 oldtup.t_data->t_ctid = heaptup->t_self;
3667
3668 /* clear PD_ALL_VISIBLE flags, reset all visibilitymap bits */
3669 if (PageIsAllVisible(BufferGetPage(buffer)))
3670 {
3671 all_visible_cleared = true;
3672 PageClearAllVisible(BufferGetPage(buffer));
3673 visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
3674 vmbuffer, VISIBILITYMAP_VALID_BITS);
3675 }
3676 if (newbuf != buffer && PageIsAllVisible(BufferGetPage(newbuf)))
3677 {
3678 all_visible_cleared_new = true;
3679 PageClearAllVisible(BufferGetPage(newbuf));
3680 visibilitymap_clear(relation, BufferGetBlockNumber(newbuf),
3681 vmbuffer_new, VISIBILITYMAP_VALID_BITS);
3682 }
3683
3684 if (newbuf != buffer)
3685 MarkBufferDirty(newbuf);
3686 MarkBufferDirty(buffer);
3687
3688 /* XLOG stuff */
3689 if (RelationNeedsWAL(relation))
3690 {
3691 XLogRecPtr recptr;
3692
3693 /*
3694 * For logical decoding we need combocids to properly decode the
3695 * catalog.
3696 */
3697 if (RelationIsAccessibleInLogicalDecoding(relation))
3698 {
3699 log_heap_new_cid(relation, &oldtup);
3700 log_heap_new_cid(relation, heaptup);
3701 }
3702
3703 recptr = log_heap_update(relation, buffer,
3704 newbuf, &oldtup, heaptup,
3705 old_key_tuple,
3706 all_visible_cleared,
3707 all_visible_cleared_new);
3708 if (newbuf != buffer)
3709 {
3710 PageSetLSN(BufferGetPage(newbuf), recptr);
3711 }
3712 PageSetLSN(BufferGetPage(buffer), recptr);
3713 }
3714
3715 END_CRIT_SECTION();
3716
3717 if (newbuf != buffer)
3718 LockBuffer(newbuf, BUFFER_LOCK_UNLOCK);
3719 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3720
3721 /*
3722 * Mark old tuple for invalidation from system caches at next command
3723 * boundary, and mark the new tuple for invalidation in case we abort. We
3724 * have to do this before releasing the buffer because oldtup is in the
3725 * buffer. (heaptup is all in local memory, but it's necessary to process
3726 * both tuple versions in one call to inval.c so we can avoid redundant
3727 * sinval messages.)
3728 */
3729 CacheInvalidateHeapTuple(relation, &oldtup, heaptup);
3730
3731 /* Now we can release the buffer(s) */
3732 if (newbuf != buffer)
3733 ReleaseBuffer(newbuf);
3734 ReleaseBuffer(buffer);
3735 if (BufferIsValid(vmbuffer_new))
3736 ReleaseBuffer(vmbuffer_new);
3737 if (BufferIsValid(vmbuffer))
3738 ReleaseBuffer(vmbuffer);
3739
3740 /*
3741 * Release the lmgr tuple lock, if we had it.
3742 */
3743 if (have_tuple_lock)
3744 UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
3745
3746 pgstat_count_heap_update(relation, use_hot_update);
3747
3748 /*
3749 * If heaptup is a private copy, release it. Don't forget to copy t_self
3750 * back to the caller's image, too.
3751 */
3752 if (heaptup != newtup)
3753 {
3754 newtup->t_self = heaptup->t_self;
3755 heap_freetuple(heaptup);
3756 }
3757
3758 if (old_key_tuple != NULL && old_key_copied)
3759 heap_freetuple(old_key_tuple);
3760
3761 bms_free(hot_attrs);
3762 bms_free(key_attrs);
3763 bms_free(id_attrs);
3764 bms_free(modified_attrs);
3765 bms_free(interesting_attrs);
3766
3767 return TM_Ok;
3768}
3769
3770/*
3771  * Check if the specified attribute's value is the same in both given tuples.
3772 * Subroutine for HeapDetermineModifiedColumns.
3773 */
3774static bool
3775heap_tuple_attr_equals(TupleDesc tupdesc, int attrnum,
3776 HeapTuple tup1, HeapTuple tup2)
3777{
3778 Datum value1,
3779 value2;
3780 bool isnull1,
3781 isnull2;
3782 Form_pg_attribute att;
3783
3784 /*
3785 * If it's a whole-tuple reference, say "not equal". It's not really
3786 * worth supporting this case, since it could only succeed after a no-op
3787 * update, which is hardly a case worth optimizing for.
3788 */
3789 if (attrnum == 0)
3790 return false;
3791
3792 /*
3793 * Likewise, automatically say "not equal" for any system attribute other
3794 * than tableOID; we cannot expect these to be consistent in a HOT chain,
3795 * or even to be set correctly yet in the new tuple.
3796 */
3797 if (attrnum < 0)
3798 {
3799 if (attrnum != TableOidAttributeNumber)
3800 return false;
3801 }
3802
3803 /*
3804 * Extract the corresponding values. XXX this is pretty inefficient if
3805 * there are many indexed columns. Should HeapDetermineModifiedColumns do
3806 * a single heap_deform_tuple call on each tuple, instead? But that
3807 * doesn't work for system columns ...
3808 */
3809 value1 = heap_getattr(tup1, attrnum, tupdesc, &isnull1);
3810 value2 = heap_getattr(tup2, attrnum, tupdesc, &isnull2);
3811
3812 /*
3813 	 * If one value is NULL and the other is not, then they are certainly not
3814 * equal
3815 */
3816 if (isnull1 != isnull2)
3817 return false;
3818
3819 /*
3820 * If both are NULL, they can be considered equal.
3821 */
3822 if (isnull1)
3823 return true;
3824
3825 /*
3826 * We do simple binary comparison of the two datums. This may be overly
3827 * strict because there can be multiple binary representations for the
3828 * same logical value. But we should be OK as long as there are no false
3829 * positives. Using a type-specific equality operator is messy because
3830 * there could be multiple notions of equality in different operator
3831 * classes; furthermore, we cannot safely invoke user-defined functions
3832 * while holding exclusive buffer lock.
3833 */
3834 if (attrnum <= 0)
3835 {
3836 /* The only allowed system columns are OIDs, so do this */
3837 return (DatumGetObjectId(value1) == DatumGetObjectId(value2));
3838 }
3839 else
3840 {
3841 Assert(attrnum <= tupdesc->natts);
3842 att = TupleDescAttr(tupdesc, attrnum - 1);
3843 return datumIsEqual(value1, value2, att->attbyval, att->attlen);
3844 }
3845}
3846
3847/*
3848 * Check which columns are being updated.
3849 *
3850 * Given an updated tuple, determine (and return into the output bitmapset),
3851 * from those listed as interesting, the set of columns that changed.
3852 *
3853 * The input bitmapset is destructively modified; that is OK since this is
3854 * invoked at most once in heap_update.
3855 */
3856static Bitmapset *
3857HeapDetermineModifiedColumns(Relation relation, Bitmapset *interesting_cols,
3858 HeapTuple oldtup, HeapTuple newtup)
3859{
3860 int attnum;
3861 Bitmapset *modified = NULL;
3862
3863 while ((attnum = bms_first_member(interesting_cols)) >= 0)
3864 {
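		/*
		 * Bitmapset members are stored offset by
		 * FirstLowInvalidHeapAttributeNumber so that system attributes
		 * (which have negative attribute numbers) can be represented; undo
		 * that offset to recover the real attribute number.
		 */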
3865 attnum += FirstLowInvalidHeapAttributeNumber;
3866
3867 if (!heap_tuple_attr_equals(RelationGetDescr(relation),
3868 attnum, oldtup, newtup))
3869 modified = bms_add_member(modified,
3870 attnum - FirstLowInvalidHeapAttributeNumber);
3871 }
3872
3873 return modified;
3874}
3875
3876/*
3877 * simple_heap_update - replace a tuple
3878 *
3879 * This routine may be used to update a tuple when concurrent updates of
3880 * the target tuple are not expected (for example, because we have a lock
3881 * on the relation associated with the tuple). Any failure is reported
3882 * via ereport().
3883 */
3884void
3885simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup)
3886{
3887 TM_Result result;
3888 TM_FailureData tmfd;
3889 LockTupleMode lockmode;
3890
3891 result = heap_update(relation, otid, tup,
3892 GetCurrentCommandId(true), InvalidSnapshot,
3893 true /* wait for commit */ ,
3894 &tmfd, &lockmode);
3895 switch (result)
3896 {
3897 case TM_SelfModified:
3898 /* Tuple was already updated in current command? */
3899 elog(ERROR, "tuple already updated by self");
3900 break;
3901
3902 case TM_Ok:
3903 /* done successfully */
3904 break;
3905
3906 case TM_Updated:
3907 elog(ERROR, "tuple concurrently updated");
3908 break;
3909
3910 case TM_Deleted:
3911 elog(ERROR, "tuple concurrently deleted");
3912 break;
3913
3914 default:
3915 elog(ERROR, "unrecognized heap_update status: %u", result);
3916 break;
3917 }
3918}
3919
3920
3921/*
3922 * Return the MultiXactStatus corresponding to the given tuple lock mode.
3923 */
3924static MultiXactStatus
3925get_mxact_status_for_lock(LockTupleMode mode, bool is_update)
3926{
3927 int retval;
3928
3929 if (is_update)
3930 retval = tupleLockExtraInfo[mode].updstatus;
3931 else
3932 retval = tupleLockExtraInfo[mode].lockstatus;
3933
3934 if (retval == -1)
3935 elog(ERROR, "invalid lock tuple mode %d/%s", mode,
3936 is_update ? "true" : "false");
3937
3938 return (MultiXactStatus) retval;
3939}
3940
3941/*
3942 * heap_lock_tuple - lock a tuple in shared or exclusive mode
3943 *
3944 * Note that this acquires a buffer pin, which the caller must release.
3945 *
3946 * Input parameters:
3947 * relation: relation containing tuple (caller must hold suitable lock)
3948 * tid: TID of tuple to lock
3949 * cid: current command ID (used for visibility test, and stored into
3950 * tuple's cmax if lock is successful)
3951 * mode: indicates if shared or exclusive tuple lock is desired
3952 * wait_policy: what to do if tuple lock is not available
3953 * follow_updates: if true, follow the update chain to also lock descendant
3954 * tuples.
3955 *
3956 * Output parameters:
3957 * *tuple: all fields filled in
3958 * *buffer: set to buffer holding tuple (pinned but not locked at exit)
3959 * *tmfd: filled in failure cases (see below)
3960 *
3961 * Function results are the same as the ones for table_tuple_lock().
3962 *
3963 * In the failure cases other than TM_Invisible, the routine fills
3964 * *tmfd with the tuple's t_ctid, t_xmax (resolving a possible MultiXact,
3965 * if necessary), and t_cmax (the last only for TM_SelfModified,
3966 * since we cannot obtain cmax from a combocid generated by another
3967 * transaction).
3968 * See comments for struct TM_FailureData for additional info.
3969 *
3970 * See README.tuplock for a thorough explanation of this mechanism.
3971 */
3972TM_Result
3973heap_lock_tuple(Relation relation, HeapTuple tuple,
3974 CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy,
3975 bool follow_updates,
3976 Buffer *buffer, TM_FailureData *tmfd)
3977{
3978 TM_Result result;
3979 ItemPointer tid = &(tuple->t_self);
3980 ItemId lp;
3981 Page page;
3982 Buffer vmbuffer = InvalidBuffer;
3983 BlockNumber block;
3984 TransactionId xid,
3985 xmax;
3986 uint16 old_infomask,
3987 new_infomask,
3988 new_infomask2;
3989 bool first_time = true;
3990 bool skip_tuple_lock = false;
3991 bool have_tuple_lock = false;
3992 bool cleared_all_frozen = false;
3993
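	/*
	 * Pin the buffer containing the target tuple.  We postpone taking the
	 * content lock until after we've (possibly) pinned the visibility map
	 * page below.
	 */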
3994 *buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
3995 block = ItemPointerGetBlockNumber(tid);
3996
3997 /*
3998 * Before locking the buffer, pin the visibility map page if it appears to
3999 * be necessary. Since we haven't got the lock yet, someone else might be
4000 * in the middle of changing this, so we'll need to recheck after we have
4001 * the lock.
4002 */
4003 if (PageIsAllVisible(BufferGetPage(*buffer)))
4004 visibilitymap_pin(relation, block, &vmbuffer);
4005
4006 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4007
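	/*
	 * Now that we hold the exclusive content lock, point the caller's tuple
	 * at the on-page item and fill in its length and table OID; the
	 * visibility check at l3 works on this in-buffer data.
	 */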
4008 page = BufferGetPage(*buffer);
4009 lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
4010 Assert(ItemIdIsNormal(lp));
4011
4012 tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
4013 tuple->t_len = ItemIdGetLength(lp);
4014 tuple->t_tableOid = RelationGetRelid(relation);
4015
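	/*
	 * l3 is our restart point: whenever the tuple's xmax or lock state may
	 * have changed while we didn't hold the buffer content lock, we jump
	 * back here (with the lock re-acquired) and redo the visibility check.
	 */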
4016l3:
4017 result = HeapTupleSatisfiesUpdate(tuple, cid, *buffer);
4018
4019 if (result == TM_Invisible)
4020 {
4021 /*
4022 * This is possible, but only when locking a tuple for ON CONFLICT
4023 * UPDATE. We return this value here rather than throwing an error in
4024 * order to give that case the opportunity to throw a more specific
4025 * error.
4026 */
4027 result = TM_Invisible;
4028 goto out_locked;
4029 }
4030 else if (result == TM_BeingModified ||
4031 result == TM_Updated ||
4032 result == TM_Deleted)
4033 {
4034 TransactionId xwait;
4035 uint16 infomask;
4036 uint16 infomask2;
4037 bool require_sleep;
4038 ItemPointerData t_ctid;
4039
4040 /* must copy state data before unlocking buffer */
4041 xwait = HeapTupleHeaderGetRawXmax(tuple->t_data);
4042 infomask = tuple->t_data->t_infomask;
4043 infomask2 = tuple->t_data->t_infomask2;
4044 ItemPointerCopy(&tuple->t_data->t_ctid, &t_ctid);
4045
4046 LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
4047
4048 /*
4049 * If any subtransaction of the current top transaction already holds
4050 * a lock as strong as or stronger than what we're requesting, we
4051 * effectively hold the desired lock already. We *must* succeed
4052 * without trying to take the tuple lock, else we will deadlock
4053 * against anyone wanting to acquire a stronger lock.
4054 *
4055 		 * Note we only do this the first time we loop on the result of
4056 		 * HeapTupleSatisfiesUpdate; there is no point in testing on
4057 		 * subsequent passes, because evidently our own transaction cannot
4058 		 * have acquired a new lock after the first time we checked.
4059 */
4060 if (first_time)
4061 {
4062 first_time = false;
4063
4064 if (infomask & HEAP_XMAX_IS_MULTI)
4065 {
4066 int i;
4067 int nmembers;
4068 MultiXactMember *members;
4069
4070 /*
4071 * We don't need to allow old multixacts here; if that had
4072 * been the case, HeapTupleSatisfiesUpdate would have returned
4073 				 * TM_Ok and we wouldn't be here.
4074 */
4075 nmembers =
4076 GetMultiXactIdMembers(xwait, &members, false,
4077 HEAP_XMAX_IS_LOCKED_ONLY(infomask));
4078
4079 for (i = 0; i < nmembers; i++)
4080 {
4081 /* only consider members of our own transaction */
4082 if (!TransactionIdIsCurrentTransactionId(members[i].xid))
4083 continue;
4084
4085 if (TUPLOCK_from_mxstatus(members[i].status) >= mode)
4086 {
4087 pfree(members);
4088 result = TM_Ok;
4089 goto out_unlocked;
4090 }
4091 else
4092 {
4093 /*
4094 * Disable acquisition of the heavyweight tuple lock.
4095 * Otherwise, when promoting a weaker lock, we might
4096 * deadlock with another locker that has acquired the
4097 * heavyweight tuple lock and is waiting for our
4098 * transaction to finish.
4099 *
4100 * Note that in this case we still need to wait for
4101 * the multixact if required, to avoid acquiring
4102 * conflicting locks.
4103 */
4104 skip_tuple_lock = true;
4105 }
4106 }
4107
4108 if (members)
4109 pfree(members);
4110 }
4111 else if (TransactionIdIsCurrentTransactionId(xwait))
4112 {
4113 switch (mode)
4114 {
4115 case LockTupleKeyShare:
4116 Assert(HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) ||
4117 HEAP_XMAX_IS_SHR_LOCKED(infomask) ||
4118 HEAP_XMAX_IS_EXCL_LOCKED(infomask));
4119 result = TM_Ok;
4120 goto out_unlocked;
4121 case LockTupleShare:
4122 if (HEAP_XMAX_IS_SHR_LOCKED(infomask) ||
4123 HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4124 {
4125 result = TM_Ok;
4126 goto out_unlocked;
4127 }
4128 break;
4129 case LockTupleNoKeyExclusive:
4130 if (HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4131 {
4132 result = TM_Ok;
4133 goto out_unlocked;
4134 }
4135 break;
4136 case LockTupleExclusive:
4137 if (HEAP_XMAX_IS_EXCL_LOCKED(infomask) &&
4138 infomask2 & HEAP_KEYS_UPDATED)
4139 {
4140 result = TM_Ok;
4141 goto out_unlocked;
4142 }
4143 break;
4144 }
4145 }
4146 }
4147
4148 /*
4149 * Initially assume that we will have to wait for the locking
4150 * transaction(s) to finish. We check various cases below in which
4151 * this can be turned off.
4152 */
4153 require_sleep = true;
4154 if (mode == LockTupleKeyShare)
4155 {
4156 /*
4157 * If we're requesting KeyShare, and there's no update present, we
4158 * don't need to wait. Even if there is an update, we can still
4159 * continue if the key hasn't been modified.
4160 *
4161 * However, if there are updates, we need to walk the update chain
4162 * to mark future versions of the row as locked, too. That way,
4163 * if somebody deletes that future version, we're protected
4164 * against the key going away. This locking of future versions
4165 * could block momentarily, if a concurrent transaction is
4166 * deleting a key; or it could return a value to the effect that
4167 * the transaction deleting the key has already committed. So we
4168 * do this before re-locking the buffer; otherwise this would be
4169 * prone to deadlocks.
4170 *
4171 * Note that the TID we're locking was grabbed before we unlocked
4172 * the buffer. For it to change while we're not looking, the
4173 * other properties we're testing for below after re-locking the
4174 * buffer would also change, in which case we would restart this
4175 * loop above.
4176 */
4177 if (!(infomask2 & HEAP_KEYS_UPDATED))
4178 {
4179 bool updated;
4180
4181 updated = !HEAP_XMAX_IS_LOCKED_ONLY(infomask);
4182
4183 /*
4184 * If there are updates, follow the update chain; bail out if
4185 * that cannot be done.
4186 */
4187 if (follow_updates && updated)
4188 {
4189 TM_Result res;
4190
4191 res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
4192 GetCurrentTransactionId(),
4193 mode);
4194 if (res != TM_Ok)
4195 {
4196 result = res;
4197 /* recovery code expects to have buffer lock held */
4198 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4199 goto failed;
4200 }
4201 }
4202
4203 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4204
4205 /*
4206 * Make sure it's still an appropriate lock, else start over.
4207 * Also, if it wasn't updated before we released the lock, but
4208 * is updated now, we start over too; the reason is that we
4209 * now need to follow the update chain to lock the new
4210 * versions.
4211 */
4212 if (!HeapTupleHeaderIsOnlyLocked(tuple->t_data) &&
4213 ((tuple->t_data->t_infomask2 & HEAP_KEYS_UPDATED) ||
4214 !updated))
4215 goto l3;
4216
4217 /* Things look okay, so we can skip sleeping */
4218 require_sleep = false;
4219
4220 /*
4221 * Note we allow Xmax to change here; other updaters/lockers
4222 * could have modified it before we grabbed the buffer lock.
4223 * However, this is not a problem, because with the recheck we
4224 * just did we ensure that they still don't conflict with the
4225 * lock we want.
4226 */
4227 }
4228 }
4229 else if (mode == LockTupleShare)
4230 {
4231 /*
4232 * If we're requesting Share, we can similarly avoid sleeping if
4233 * there's no update and no exclusive lock present.
4234 */
4235 if (HEAP_XMAX_IS_LOCKED_ONLY(infomask) &&
4236 !HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4237 {
4238 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4239
4240 /*
4241 * Make sure it's still an appropriate lock, else start over.
4242 * See above about allowing xmax to change.
4243 */
4244 if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) ||
4245 HEAP_XMAX_IS_EXCL_LOCKED(tuple->t_data->t_infomask))
4246 goto l3;
4247 require_sleep = false;
4248 }
4249 }
4250 else if (mode == LockTupleNoKeyExclusive)
4251 {
4252 /*
4253 * If we're requesting NoKeyExclusive, we might also be able to
4254 			 * avoid sleeping; just ensure that there is no conflicting lock
4255 * already acquired.
4256 */
4257 if (infomask & HEAP_XMAX_IS_MULTI)
4258 {
4259 if (!DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
4260 mode, NULL))
4261 {
4262 /*
4263 * No conflict, but if the xmax changed under us in the
4264 * meantime, start over.
4265 */
4266 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4267 if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4268 !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4269 xwait))
4270 goto l3;
4271
4272 /* otherwise, we're good */
4273 require_sleep = false;
4274 }
4275 }
4276 else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask))
4277 {
4278 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4279
4280 /* if the xmax changed in the meantime, start over */
4281 if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4282 !TransactionIdEquals(
4283 HeapTupleHeaderGetRawXmax(tuple->t_data),
4284 xwait))
4285 goto l3;
4286 /* otherwise, we're good */
4287 require_sleep = false;
4288 }
4289 }
4290
4291 /*
4292 * As a check independent from those above, we can also avoid sleeping
4293 * if the current transaction is the sole locker of the tuple. Note
4294 * that the strength of the lock already held is irrelevant; this is
4295 * not about recording the lock in Xmax (which will be done regardless
4296 * of this optimization, below). Also, note that the cases where we
4297 * hold a lock stronger than we are requesting are already handled
4298 * above by not doing anything.
4299 *
4300 * Note we only deal with the non-multixact case here; MultiXactIdWait
4301 * is well equipped to deal with this situation on its own.
4302 */
4303 if (require_sleep && !(infomask & HEAP_XMAX_IS_MULTI) &&
4304 TransactionIdIsCurrentTransactionId(xwait))
4305 {
4306 /* ... but if the xmax changed in the meantime, start over */
4307 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4308 if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4309 !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4310 xwait))
4311 goto l3;
4312 Assert(HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask));
4313 require_sleep = false;
4314 }
4315
4316 /*
4317 * Time to sleep on the other transaction/multixact, if necessary.
4318 *
4319 * If the other transaction is an update/delete that's already
4320 * committed, then sleeping cannot possibly do any good: if we're
4321 * required to sleep, get out to raise an error instead.
4322 *
4323 * By here, we either have already acquired the buffer exclusive lock,
4324 * or we must wait for the locking transaction or multixact; so below
4325 * we ensure that we grab buffer lock after the sleep.
4326 */
4327 if (require_sleep && (result == TM_Updated || result == TM_Deleted))
4328 {
4329 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4330 goto failed;
4331 }
4332 else if (require_sleep)
4333 {
4334 /*
4335 * Acquire tuple lock to establish our priority for the tuple, or
4336 * die trying. LockTuple will release us when we are next-in-line
4337 * for the tuple. We must do this even if we are share-locking,
4338 * but not if we already have a weaker lock on the tuple.
4339 *
4340 * If we are forced to "start over" below, we keep the tuple lock;
4341 * this arranges that we stay at the head of the line while
4342 * rechecking tuple state.
4343 */
4344 if (!skip_tuple_lock &&
4345 !heap_acquire_tuplock(relation, tid, mode, wait_policy,
4346 &have_tuple_lock))
4347 {
4348 /*
4349 * This can only happen if wait_policy is Skip and the lock
4350 * couldn't be obtained.
4351 */
4352 result = TM_WouldBlock;
4353 /* recovery code expects to have buffer lock held */
4354 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4355 goto failed;
4356 }
4357
4358 if (infomask & HEAP_XMAX_IS_MULTI)
4359 {
4360 MultiXactStatus status = get_mxact_status_for_lock(mode, false);
4361
4362 /* We only ever lock tuples, never update them */
4363 if (status >= MultiXactStatusNoKeyUpdate)
4364 elog(ERROR, "invalid lock mode in heap_lock_tuple");
4365
4366 /* wait for multixact to end, or die trying */
4367 switch (wait_policy)
4368 {
4369 case LockWaitBlock:
4370 MultiXactIdWait((MultiXactId) xwait, status, infomask,
4371 relation, &tuple->t_self, XLTW_Lock, NULL);
4372 break;
4373 case LockWaitSkip:
4374 if (!ConditionalMultiXactIdWait((MultiXactId) xwait,
4375 status, infomask, relation,
4376 NULL))
4377 {
4378 result = TM_WouldBlock;
4379 /* recovery code expects to have buffer lock held */
4380 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4381 goto failed;
4382 }
4383 break;
4384 case LockWaitError:
4385 if (!ConditionalMultiXactIdWait((MultiXactId) xwait,
4386 status, infomask, relation,
4387 NULL))
4388 ereport(ERROR,
4389 (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
4390 errmsg("could not obtain lock on row in relation \"%s\"",
4391 RelationGetRelationName(relation))));
4392
4393 break;
4394 }
4395
4396 /*
4397 * Of course, the multixact might not be done here: if we're
4398 * requesting a light lock mode, other transactions with light
4399 * locks could still be alive, as well as locks owned by our
4400 * own xact or other subxacts of this backend. We need to
4401 * preserve the surviving MultiXact members. Note that it
4402 * isn't absolutely necessary in the latter case, but doing so
4403 * is simpler.
4404 */
4405 }
4406 else
4407 {
4408 /* wait for regular transaction to end, or die trying */
4409 switch (wait_policy)
4410 {
4411 case LockWaitBlock:
4412 XactLockTableWait(xwait, relation, &tuple->t_self,
4413 XLTW_Lock);
4414 break;
4415 case LockWaitSkip:
4416 if (!ConditionalXactLockTableWait(xwait))
4417 {
4418 result = TM_WouldBlock;
4419 /* recovery code expects to have buffer lock held */
4420 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4421 goto failed;
4422 }
4423 break;
4424 case LockWaitError:
4425 if (!ConditionalXactLockTableWait(xwait))
4426 ereport(ERROR,
4427 (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
4428 errmsg("could not obtain lock on row in relation \"%s\"",
4429 RelationGetRelationName(relation))));
4430 break;
4431 }
4432 }
4433
4434 /* if there are updates, follow the update chain */
4435 if (follow_updates && !HEAP_XMAX_IS_LOCKED_ONLY(infomask))
4436 {
4437 TM_Result res;
4438
4439 res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
4440 GetCurrentTransactionId(),
4441 mode);
4442 if (res != TM_Ok)
4443 {
4444 result = res;
4445 /* recovery code expects to have buffer lock held */
4446 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4447 goto failed;
4448 }
4449 }
4450
4451 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4452
4453 /*
4454 * xwait is done, but if xwait had just locked the tuple then some
4455 * other xact could update this tuple before we get to this point.
4456 * Check for xmax change, and start over if so.
4457 */
4458 if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4459 !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4460 xwait))
4461 goto l3;
4462
4463 if (!(infomask & HEAP_XMAX_IS_MULTI))
4464 {
4465 /*
4466 * Otherwise check if it committed or aborted. Note we cannot
4467 * be here if the tuple was only locked by somebody who didn't
4468 * conflict with us; that would have been handled above. So
4469 * that transaction must necessarily be gone by now. But
4470 * don't check for this in the multixact case, because some
4471 * locker transactions might still be running.
4472 */
4473 UpdateXmaxHintBits(tuple->t_data, *buffer, xwait);
4474 }
4475 }
4476
4477 /* By here, we're certain that we hold buffer exclusive lock again */
4478
4479 /*
4480 * We may lock if previous xmax aborted, or if it committed but only
4481 * locked the tuple without updating it; or if we didn't have to wait
4482 * at all for whatever reason.
4483 */
4484 if (!require_sleep ||
4485 (tuple->t_data->t_infomask & HEAP_XMAX_INVALID) ||
4486 HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) ||
4487 HeapTupleHeaderIsOnlyLocked(tuple->t_data))
4488 result = TM_Ok;
4489 else if (!ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid) ||
4490 HeapTupleHeaderIndicatesMovedPartitions(tuple->t_data))
4491 result = TM_Updated;
4492 else
4493 result = TM_Deleted;
4494 }
4495
4496failed:
4497 if (result != TM_Ok)
4498 {
4499 Assert(result == TM_SelfModified || result == TM_Updated ||
4500 result == TM_Deleted || result == TM_WouldBlock);
4501 Assert(!(tuple->t_data->t_infomask & HEAP_XMAX_INVALID));
4502 Assert(result != TM_Updated ||
4503 !ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid));
4504 tmfd->ctid = tuple->t_data->t_ctid;
4505 tmfd->xmax = HeapTupleHeaderGetUpdateXid(tuple->t_data);
4506 if (result == TM_SelfModified)
4507 tmfd->cmax = HeapTupleHeaderGetCmax(tuple->t_data);
4508 else
4509 tmfd->cmax = InvalidCommandId;
4510 goto out_locked;
4511 }
4512
4513 /*
4514 * If we didn't pin the visibility map page and the page has become all
4515 * visible while we were busy locking the buffer, or during some
4516 	 * subsequent window while we had it unlocked, we'll have to unlock
4517 * and re-lock, to avoid holding the buffer lock across I/O. That's a bit
4518 * unfortunate, especially since we'll now have to recheck whether the
4519 * tuple has been locked or updated under us, but hopefully it won't
4520 * happen very often.
4521 */
4522 if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
4523 {
4524 LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
4525 visibilitymap_pin(relation, block, &vmbuffer);
4526 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4527 goto l3;
4528 }
4529
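	/*
	 * Capture the tuple's current raw xmax and infomask; together with the
	 * requested lock mode, these are the inputs from which
	 * compute_new_xmax_infomask derives the new lock state below.
	 */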
4530 xmax = HeapTupleHeaderGetRawXmax(tuple->t_data);
4531 old_infomask = tuple->t_data->t_infomask;
4532
4533 /*
4534 * If this is the first possibly-multixact-able operation in the current
4535 * transaction, set my per-backend OldestMemberMXactId setting. We can be
4536 * certain that the transaction will never become a member of any older
4537 * MultiXactIds than that. (We have to do this even if we end up just
4538 * using our own TransactionId below, since some other backend could
4539 * incorporate our XID into a MultiXact immediately afterwards.)
4540 */
4541 MultiXactIdSetOldestMember();
4542
4543 /*
4544 * Compute the new xmax and infomask to store into the tuple. Note we do
4545 * not modify the tuple just yet, because that would leave it in the wrong
4546 * state if multixact.c elogs.
4547 */
4548 compute_new_xmax_infomask(xmax, old_infomask, tuple->t_data->t_infomask2,
4549 GetCurrentTransactionId(), mode, false,
4550 &xid, &new_infomask, &new_infomask2);
4551
4552 START_CRIT_SECTION();
4553
4554 /*
4555 * Store transaction information of xact locking the tuple.
4556 *
4557 * Note: Cmax is meaningless in this context, so don't set it; this avoids
4558 * possibly generating a useless combo CID. Moreover, if we're locking a
4559 * previously updated tuple, it's important to preserve the Cmax.
4560 *
4561 * Also reset the HOT UPDATE bit, but only if there's no update; otherwise
4562 * we would break the HOT chain.
4563 */
4564 tuple->t_data->t_infomask &= ~HEAP_XMAX_BITS;
4565 tuple->t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
4566 tuple->t_data->t_infomask |= new_infomask;
4567 tuple->t_data->t_infomask2 |= new_infomask2;
4568 if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask))
4569 HeapTupleHeaderClearHotUpdated(tuple->t_data);
4570 HeapTupleHeaderSetXmax(tuple->t_data, xid);
4571
4572 /*
4573 * Make sure there is no forward chain link in t_ctid. Note that in the
4574 * cases where the tuple has been updated, we must not overwrite t_ctid,
4575 * because it was set by the updater. Moreover, if the tuple has been
4576 * updated, we need to follow the update chain to lock the new versions of
4577 * the tuple as well.
4578 */
4579 if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask))
4580 tuple->t_data->t_ctid = *tid;
4581
4582 /* Clear only the all-frozen bit on visibility map if needed */
4583 if (PageIsAllVisible(page) &&
4584 visibilitymap_clear(relation, block, vmbuffer,
4585 VISIBILITYMAP_ALL_FROZEN))
4586 cleared_all_frozen = true;
4587
4588
4589 MarkBufferDirty(*buffer);
4590
4591 /*
4592 * XLOG stuff. You might think that we don't need an XLOG record because
4593 * there is no state change worth restoring after a crash. You would be
4594 * wrong however: we have just written either a TransactionId or a
4595 * MultiXactId that may never have been seen on disk before, and we need
4596 * to make sure that there are XLOG entries covering those ID numbers.
4597 * Else the same IDs might be re-used after a crash, which would be
4598 * disastrous if this page made it to disk before the crash. Essentially
4599 * we have to enforce the WAL log-before-data rule even in this case.
4600 * (Also, in a PITR log-shipping or 2PC environment, we have to have XLOG
4601 * entries for everything anyway.)
4602 */
4603 if (RelationNeedsWAL(relation))
4604 {
4605 xl_heap_lock xlrec;
4606 XLogRecPtr recptr;
4607
4608 XLogBeginInsert();
4609 XLogRegisterBuffer(0, *buffer, REGBUF_STANDARD);
4610
4611 xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
4612 xlrec.locking_xid = xid;
4613 xlrec.infobits_set = compute_infobits(new_infomask,
4614 tuple->t_data->t_infomask2);
4615 xlrec.flags = cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
4616 XLogRegisterData((char *) &xlrec, SizeOfHeapLock);
4617
4618 /* we don't decode row locks atm, so no need to log the origin */
4619
4620 recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK);
4621
4622 PageSetLSN(page, recptr);
4623 }
4624
4625 END_CRIT_SECTION();
4626
4627 result = TM_Ok;
4628
4629out_locked:
4630 LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
4631
4632out_unlocked:
4633 if (BufferIsValid(vmbuffer))
4634 ReleaseBuffer(vmbuffer);
4635
4636 /*
4637 * Don't update the visibility map here. Locking a tuple doesn't change
4638 * visibility info.
4639 */
4640
4641 /*
4642 * Now that we have successfully marked the tuple as locked, we can
4643 * release the lmgr tuple lock, if we had it.
4644 */
4645 if (have_tuple_lock)
4646 UnlockTupleTuplock(relation, tid, mode);
4647
4648 return result;
4649}
4650
4651/*
4652 * Acquire heavyweight lock on the given tuple, in preparation for acquiring
4653 * its normal, Xmax-based tuple lock.
4654 *
4655 * have_tuple_lock is an input and output parameter: on input, it indicates
4656 * whether the lock has previously been acquired (and this function does
4657 * nothing in that case). If this function returns success, have_tuple_lock
4658 * has been flipped to true.
4659 *
4660 * Returns false if it was unable to obtain the lock; this can only happen if
4661 * wait_policy is Skip.
4662 */
4663static bool
4664heap_acquire_tuplock(Relation relation, ItemPointer tid, LockTupleMode mode,
4665 LockWaitPolicy wait_policy, bool *have_tuple_lock)
4666{
4667 if (*have_tuple_lock)
4668 return true;
4669
4670 switch (wait_policy)
4671 {
4672 case LockWaitBlock:
4673 LockTupleTuplock(relation, tid, mode);
4674 break;
4675
4676 case LockWaitSkip:
4677 if (!ConditionalLockTupleTuplock(relation, tid, mode))
4678 return false;
4679 break;
4680
4681 case LockWaitError:
4682 if (!ConditionalLockTupleTuplock(relation, tid, mode))
4683 ereport(ERROR,
4684 (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
4685 errmsg("could not obtain lock on row in relation \"%s\"",
4686 RelationGetRelationName(relation))));
4687 break;
4688 }
4689 *have_tuple_lock = true;
4690
4691 return true;
4692}
4693
4694/*
4695 * Given an original set of Xmax and infomask, and a transaction (identified by
4696 * add_to_xmax) acquiring a new lock of some mode, compute the new Xmax and
4697 * corresponding infomasks to use on the tuple.
4698 *
4699 * Note that this might have side effects such as creating a new MultiXactId.
4700 *
4701 * Most callers will have called HeapTupleSatisfiesUpdate before this function;
4702 * that will have set the HEAP_XMAX_INVALID bit if the xmax was a MultiXactId
4703 * but it was not running anymore. There is a race condition, which is that the
4704 * MultiXactId may have finished since then, but that uncommon case is handled
4705 * either here, or within MultiXactIdExpand.
4706 *
4707 * There is a similar race condition possible when the old xmax was a regular
4708 * TransactionId. We test TransactionIdIsInProgress again just to narrow the
4709 * window, but it's still possible to end up creating an unnecessary
4710 * MultiXactId. Fortunately this is harmless.
4711 */
4712static void
4713compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
4714 uint16 old_infomask2, TransactionId add_to_xmax,
4715 LockTupleMode mode, bool is_update,
4716 TransactionId *result_xmax, uint16 *result_infomask,
4717 uint16 *result_infomask2)
4718{
4719 TransactionId new_xmax;
4720 uint16 new_infomask,
4721 new_infomask2;
4722
4723 Assert(TransactionIdIsCurrentTransactionId(add_to_xmax));
4724
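	/*
	 * Several of the branches below simplify the situation (for instance by
	 * marking the old xmax invalid, or by strengthening the requested lock
	 * mode) and then jump back to l5 to redo the computation with the
	 * adjusted inputs.
	 */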
4725l5:
4726 new_infomask = 0;
4727 new_infomask2 = 0;
4728 if (old_infomask & HEAP_XMAX_INVALID)
4729 {
4730 /*
4731 * No previous locker; we just insert our own TransactionId.
4732 *
4733 * Note that it's critical that this case be the first one checked,
4734 * because there are several blocks below that come back to this one
4735 * to implement certain optimizations; old_infomask might contain
4736 * other dirty bits in those cases, but we don't really care.
4737 */
4738 if (is_update)
4739 {
4740 new_xmax = add_to_xmax;
4741 if (mode == LockTupleExclusive)
4742 new_infomask2 |= HEAP_KEYS_UPDATED;
4743 }
4744 else
4745 {
4746 new_infomask |= HEAP_XMAX_LOCK_ONLY;
4747 switch (mode)
4748 {
4749 case LockTupleKeyShare:
4750 new_xmax = add_to_xmax;
4751 new_infomask |= HEAP_XMAX_KEYSHR_LOCK;
4752 break;
4753 case LockTupleShare:
4754 new_xmax = add_to_xmax;
4755 new_infomask |= HEAP_XMAX_SHR_LOCK;
4756 break;
4757 case LockTupleNoKeyExclusive:
4758 new_xmax = add_to_xmax;
4759 new_infomask |= HEAP_XMAX_EXCL_LOCK;
4760 break;
4761 case LockTupleExclusive:
4762 new_xmax = add_to_xmax;
4763 new_infomask |= HEAP_XMAX_EXCL_LOCK;
4764 new_infomask2 |= HEAP_KEYS_UPDATED;
4765 break;
4766 default:
4767 new_xmax = InvalidTransactionId; /* silence compiler */
4768 elog(ERROR, "invalid lock mode");
4769 }
4770 }
4771 }
4772 else if (old_infomask & HEAP_XMAX_IS_MULTI)
4773 {
4774 MultiXactStatus new_status;
4775
4776 /*
4777 * Currently we don't allow XMAX_COMMITTED to be set for multis, so
4778 * cross-check.
4779 */
4780 Assert(!(old_infomask & HEAP_XMAX_COMMITTED));
4781
4782 /*
4783 * A multixact together with LOCK_ONLY set but neither lock bit set
4784 * (i.e. a pg_upgraded share locked tuple) cannot possibly be running
4785 * anymore. This check is critical for databases upgraded by
4786 * pg_upgrade; both MultiXactIdIsRunning and MultiXactIdExpand assume
4787 * that such multis are never passed.
4788 */
4789 if (HEAP_LOCKED_UPGRADED(old_infomask))
4790 {
4791 old_infomask &= ~HEAP_XMAX_IS_MULTI;
4792 old_infomask |= HEAP_XMAX_INVALID;
4793 goto l5;
4794 }
4795
4796 /*
4797 * If the XMAX is already a MultiXactId, then we need to expand it to
4798 * include add_to_xmax; but if all the members were lockers and are
4799 * all gone, we can do away with the IS_MULTI bit and just set
4800 * add_to_xmax as the only locker/updater. If all lockers are gone
4801 * and we have an updater that aborted, we can also do without a
4802 * multi.
4803 *
4804 * The cost of doing GetMultiXactIdMembers would be paid by
4805 * MultiXactIdExpand if we weren't to do this, so this check is not
4806 * incurring extra work anyhow.
4807 */
4808 if (!MultiXactIdIsRunning(xmax, HEAP_XMAX_IS_LOCKED_ONLY(old_infomask)))
4809 {
4810 if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) ||
4811 !TransactionIdDidCommit(MultiXactIdGetUpdateXid(xmax,
4812 old_infomask)))
4813 {
4814 /*
4815 * Reset these bits and restart; otherwise fall through to
4816 * create a new multi below.
4817 */
4818 old_infomask &= ~HEAP_XMAX_IS_MULTI;
4819 old_infomask |= HEAP_XMAX_INVALID;
4820 goto l5;
4821 }
4822 }
4823
4824 new_status = get_mxact_status_for_lock(mode, is_update);
4825
4826 new_xmax = MultiXactIdExpand((MultiXactId) xmax, add_to_xmax,
4827 new_status);
4828 GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
4829 }
4830 else if (old_infomask & HEAP_XMAX_COMMITTED)
4831 {
4832 /*
4833 		 * It's a committed update, so we need to preserve it as the updater
4834 		 * of the tuple.
4835 */
4836 MultiXactStatus status;
4837 MultiXactStatus new_status;
4838
4839 if (old_infomask2 & HEAP_KEYS_UPDATED)
4840 status = MultiXactStatusUpdate;
4841 else
4842 status = MultiXactStatusNoKeyUpdate;
4843
4844 new_status = get_mxact_status_for_lock(mode, is_update);
4845
4846 /*
4847 * since it's not running, it's obviously impossible for the old
4848 * updater to be identical to the current one, so we need not check
4849 * for that case as we do in the block above.
4850 */
4851 new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
4852 GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
4853 }
4854 else if (TransactionIdIsInProgress(xmax))
4855 {
4856 /*
4857 * If the XMAX is a valid, in-progress TransactionId, then we need to
4858 * create a new MultiXactId that includes both the old locker or
4859 * updater and our own TransactionId.
4860 */
4861 MultiXactStatus new_status;
4862 MultiXactStatus old_status;
4863 LockTupleMode old_mode;
4864
4865 if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))
4866 {
4867 if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask))
4868 old_status = MultiXactStatusForKeyShare;
4869 else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask))
4870 old_status = MultiXactStatusForShare;
4871 else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask))
4872 {
4873 if (old_infomask2 & HEAP_KEYS_UPDATED)
4874 old_status = MultiXactStatusForUpdate;
4875 else
4876 old_status = MultiXactStatusForNoKeyUpdate;
4877 }
4878 else
4879 {
4880 /*
4881 * LOCK_ONLY can be present alone only when a page has been
4882 * upgraded by pg_upgrade. But in that case,
4883 * TransactionIdIsInProgress() should have returned false. We
4884 * assume it's no longer locked in this case.
4885 */
4886 elog(WARNING, "LOCK_ONLY found for Xid in progress %u", xmax);
4887 old_infomask |= HEAP_XMAX_INVALID;
4888 old_infomask &= ~HEAP_XMAX_LOCK_ONLY;
4889 goto l5;
4890 }
4891 }
4892 else
4893 {
4894 /* it's an update, but which kind? */
4895 if (old_infomask2 & HEAP_KEYS_UPDATED)
4896 old_status = MultiXactStatusUpdate;
4897 else
4898 old_status = MultiXactStatusNoKeyUpdate;
4899 }
4900
4901 old_mode = TUPLOCK_from_mxstatus(old_status);
4902
4903 /*
4904 * If the lock to be acquired is for the same TransactionId as the
4905 * existing lock, there's an optimization possible: consider only the
4906 * strongest of both locks as the only one present, and restart.
4907 */
4908 if (xmax == add_to_xmax)
4909 {
4910 /*
4911 * Note that it's not possible for the original tuple to be
4912 * updated: we wouldn't be here because the tuple would have been
4913 * invisible and we wouldn't try to update it. As a subtlety,
4914 * this code can also run when traversing an update chain to lock
4915 * future versions of a tuple. But we wouldn't be here either,
4916 * because the add_to_xmax would be different from the original
4917 * updater.
4918 */
4919 Assert(HEAP_XMAX_IS_LOCKED_ONLY(old_infomask));
4920
4921 /* acquire the strongest of both */
4922 if (mode < old_mode)
4923 mode = old_mode;
4924 /* mustn't touch is_update */
4925
4926 old_infomask |= HEAP_XMAX_INVALID;
4927 goto l5;
4928 }
4929
4930 /* otherwise, just fall back to creating a new multixact */
4931 new_status = get_mxact_status_for_lock(mode, is_update);
4932 new_xmax = MultiXactIdCreate(xmax, old_status,
4933 add_to_xmax, new_status);
4934 GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
4935 }
4936 else if (!HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) &&
4937 TransactionIdDidCommit(xmax))
4938 {
4939 /*
4940 		 * It's a committed update, so we need to preserve it as the updater
4941 		 * of the tuple.
4942 */
4943 MultiXactStatus status;
4944 MultiXactStatus new_status;
4945
4946 if (old_infomask2 & HEAP_KEYS_UPDATED)
4947 status = MultiXactStatusUpdate;
4948 else
4949 status = MultiXactStatusNoKeyUpdate;
4950
4951 new_status = get_mxact_status_for_lock(mode, is_update);
4952
4953 /*
4954 * since it's not running, it's obviously impossible for the old
4955 * updater to be identical to the current one, so we need not check
4956 * for that case as we do in the block above.
4957 */
4958 new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
4959 GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
4960 }
4961 else
4962 {
4963 /*
4964 * Can get here iff the locking/updating transaction was running when
4965 * the infomask was extracted from the tuple, but finished before
4966 * TransactionIdIsInProgress got to run. Deal with it as if there was
4967 * no locker at all in the first place.
4968 */
4969 old_infomask |= HEAP_XMAX_INVALID;
4970 goto l5;
4971 }
4972
4973 *result_infomask = new_infomask;
4974 *result_infomask2 = new_infomask2;
4975 *result_xmax = new_xmax;
4976}
4977
4978/*
4979 * Subroutine for heap_lock_updated_tuple_rec.
4980 *
4981 * Given a hypothetical multixact status held by the transaction identified
4982 * with the given xid, does the current transaction need to wait, fail, or can
4983 * it continue if it wanted to acquire a lock of the given mode? "needwait"
4984 * is set to true if waiting is necessary; if it can continue, then TM_Ok is
4985 * returned. If the lock is already held by the current transaction, return
4986 * TM_SelfModified. In case of a conflict with another transaction, a
4987 * different HeapTupleSatisfiesUpdate return code is returned.
4988 *
4989 * The held status is said to be hypothetical because it might correspond to a
4990 * lock held by a single Xid, i.e. not a real MultiXactId; we express it this
4991 * way for simplicity of API.
4992 */
4993static TM_Result
4994test_lockmode_for_conflict(MultiXactStatus status, TransactionId xid,
4995 LockTupleMode mode, HeapTuple tup,
4996 bool *needwait)
4997{
4998 MultiXactStatus wantedstatus;
4999
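	/*
	 * Translate the requested tuple lock mode into the MultiXactStatus we
	 * would record if we went ahead, so it can be compared against the
	 * status already held by xid.
	 */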
5000 *needwait = false;
5001 wantedstatus = get_mxact_status_for_lock(mode, false);
5002
5003 /*
5004 * Note: we *must* check TransactionIdIsInProgress before
5005 * TransactionIdDidAbort/Commit; see comment at top of heapam_visibility.c
5006 * for an explanation.
5007 */
5008 if (TransactionIdIsCurrentTransactionId(xid))
5009 {
5010 /*
5011 * The tuple has already been locked by our own transaction. This is
5012 * very rare but can happen if multiple transactions are trying to
5013 * lock an ancient version of the same tuple.
5014 */
5015 return TM_SelfModified;
5016 }
5017 else if (TransactionIdIsInProgress(xid))
5018 {
5019 /*
5020 * If the locking transaction is running, what we do depends on
5021 * whether the lock modes conflict: if they do, then we must wait for
5022 * it to finish; otherwise we can fall through to lock this tuple
5023 * version without waiting.
5024 */
5025 if (DoLockModesConflict(LOCKMODE_from_mxstatus(status),
5026 LOCKMODE_from_mxstatus(wantedstatus)))
5027 {
5028 *needwait = true;
5029 }
5030
5031 /*
5032 * If we set needwait above, then this value doesn't matter;
5033 * otherwise, this value signals to caller that it's okay to proceed.
5034 */
5035 return TM_Ok;
5036 }
5037 else if (TransactionIdDidAbort(xid))
5038 return TM_Ok;
5039 else if (TransactionIdDidCommit(xid))
5040 {
5041 /*
5042 * The other transaction committed. If it was only a locker, then the
5043 * lock is completely gone now and we can return success; but if it
5044 * was an update, then what we do depends on whether the two lock
5045 * modes conflict. If they conflict, then we must report error to
5046 * caller. But if they don't, we can fall through to allow the current
5047 * transaction to lock the tuple.
5048 *
5049 * Note: the reason we worry about ISUPDATE here is because as soon as
5050 * a transaction ends, all its locks are gone and meaningless, and
5051 * thus we can ignore them; whereas its updates persist. In the
5052 * TransactionIdIsInProgress case, above, we don't need to check
5053 * because we know the lock is still "alive" and thus a conflict needs
5054 * always be checked.
5055 */
5056 if (!ISUPDATE_from_mxstatus(status))
5057 return TM_Ok;
5058
5059 if (DoLockModesConflict(LOCKMODE_from_mxstatus(status),
5060 LOCKMODE_from_mxstatus(wantedstatus)))
5061 {
5062 /* bummer */
5063 if (!ItemPointerEquals(&tup->t_self, &tup->t_data->t_ctid) ||
5064 HeapTupleHeaderIndicatesMovedPartitions(tup->t_data))
5065 return TM_Updated;
5066 else
5067 return TM_Deleted;
5068 }
5069
5070 return TM_Ok;
5071 }
5072
5073 /* Not in progress, not aborted, not committed -- must have crashed */
5074 return TM_Ok;
5075}
5076
5077
5078/*
5079 * Recursive part of heap_lock_updated_tuple
5080 *
5081 * Fetch the tuple pointed to by tid in rel, and mark it as locked by the given
5082 * xid with the given mode; if this tuple is updated, recurse to lock the new
5083 * version as well.
5084 */
5085static TM_Result
5086heap_lock_updated_tuple_rec(Relation rel, ItemPointer tid, TransactionId xid,
5087 LockTupleMode mode)
5088{
5089 TM_Result result;
5090 ItemPointerData tupid;
5091 HeapTupleData mytup;
5092 Buffer buf;
5093 uint16 new_infomask,
5094 new_infomask2,
5095 old_infomask,
5096 old_infomask2;
5097 TransactionId xmax,
5098 new_xmax;
5099 TransactionId priorXmax = InvalidTransactionId;
5100 bool cleared_all_frozen = false;
5101 bool pinned_desired_page;
5102 Buffer vmbuffer = InvalidBuffer;
5103 BlockNumber block;
5104
5105 ItemPointerCopy(tid, &tupid);
5106
5107 for (;;)
5108 {
5109 new_infomask = 0;
5110 new_xmax = InvalidTransactionId;
5111 block = ItemPointerGetBlockNumber(&tupid);
5112 ItemPointerCopy(&tupid, &(mytup.t_self));
5113
5114 if (!heap_fetch(rel, SnapshotAny, &mytup, &buf))
5115 {
5116 /*
5117 * if we fail to find the updated version of the tuple, it's
5118 * because it was vacuumed/pruned away after its creator
5119 * transaction aborted. So behave as if we got to the end of the
5120 * chain, and there's no further tuple to lock: return success to
5121 * caller.
5122 */
5123 result = TM_Ok;
5124 goto out_unlocked;
5125 }
5126
5127l4:
5128 CHECK_FOR_INTERRUPTS();
5129
5130 /*
5131 * Before locking the buffer, pin the visibility map page if it
5132 * appears to be necessary. Since we haven't got the lock yet,
5133 * someone else might be in the middle of changing this, so we'll need
5134 * to recheck after we have the lock.
5135 */
5136 if (PageIsAllVisible(BufferGetPage(buf)))
5137 {
5138 visibilitymap_pin(rel, block, &vmbuffer);
5139 pinned_desired_page = true;
5140 }
5141 else
5142 pinned_desired_page = false;
5143
5144 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
5145
5146 /*
5147 * If we didn't pin the visibility map page and the page has become
5148 * all visible while we were busy locking the buffer, we'll have to
5149 * unlock and re-lock, to avoid holding the buffer lock across I/O.
5150 * That's a bit unfortunate, but hopefully shouldn't happen often.
5151 *
5152 * Note: in some paths through this function, we will reach here
5153 * holding a pin on a vm page that may or may not be the one matching
5154 * this page. If this page isn't all-visible, we won't use the vm
5155 * page, but we hold onto such a pin till the end of the function.
5156 */
5157 if (!pinned_desired_page && PageIsAllVisible(BufferGetPage(buf)))
5158 {
5159 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
5160 visibilitymap_pin(rel, block, &vmbuffer);
5161 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
5162 }
5163
5164 /*
5165 * Check the tuple XMIN against prior XMAX, if any. If we reached the
5166 * end of the chain, we're done, so return success.
5167 */
5168 if (TransactionIdIsValid(priorXmax) &&
5169 !TransactionIdEquals(HeapTupleHeaderGetXmin(mytup.t_data),
5170 priorXmax))
5171 {
5172 result = TM_Ok;
5173 goto out_locked;
5174 }
5175
5176 /*
5177 * Also check Xmin: if this tuple was created by an aborted
5178 * (sub)transaction, then we already locked the last live one in the
5179 * chain, thus we're done, so return success.
5180 */
5181 if (TransactionIdDidAbort(HeapTupleHeaderGetXmin(mytup.t_data)))
5182 {
5183 result = TM_Ok;
5184 goto out_locked;
5185 }
5186
5187 old_infomask = mytup.t_data->t_infomask;
5188 old_infomask2 = mytup.t_data->t_infomask2;
5189 xmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
5190
5191 /*
5192 * If this tuple version has been updated or locked by some concurrent
5193 * transaction(s), what we do depends on whether our lock mode
5194 * conflicts with what those other transactions hold, and also on the
5195 * status of them.
5196 */
5197 if (!(old_infomask & HEAP_XMAX_INVALID))
5198 {
5199 TransactionId rawxmax;
5200 bool needwait;
5201
5202 rawxmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
5203 if (old_infomask & HEAP_XMAX_IS_MULTI)
5204 {
5205 int nmembers;
5206 int i;
5207 MultiXactMember *members;
5208
5209 /*
5210 * We don't need a test for pg_upgrade'd tuples: this is only
5211 * applied to tuples after the first in an update chain. Said
5212 * first tuple in the chain may well be locked-in-9.2-and-
5213 * pg_upgraded, but that one was already locked by our caller,
5214 * not us; and any subsequent ones cannot be because our
5215 * caller must necessarily have obtained a snapshot later than
5216 * the pg_upgrade itself.
5217 */
5218 Assert(!HEAP_LOCKED_UPGRADED(mytup.t_data->t_infomask));
5219
5220 nmembers = GetMultiXactIdMembers(rawxmax, &members, false,
5221 HEAP_XMAX_IS_LOCKED_ONLY(old_infomask));
5222 for (i = 0; i < nmembers; i++)
5223 {
5224 result = test_lockmode_for_conflict(members[i].status,
5225 members[i].xid,
5226 mode,
5227 &mytup,
5228 &needwait);
5229
5230 /*
5231 * If the tuple was already locked by ourselves in a
5232 * previous iteration of this (say heap_lock_tuple was
5233 * forced to restart the locking loop because of a change
5234 * in xmax), then we hold the lock already on this tuple
5235 * version and we don't need to do anything; and this is
5236 * not an error condition either. We just need to skip
5237 * this tuple and continue locking the next version in the
5238 * update chain.
5239 */
5240 if (result == TM_SelfModified)
5241 {
5242 pfree(members);
5243 goto next;
5244 }
5245
5246 if (needwait)
5247 {
5248 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
5249 XactLockTableWait(members[i].xid, rel,
5250 &mytup.t_self,
5251 XLTW_LockUpdated);
5252 pfree(members);
5253 goto l4;
5254 }
5255 if (result != TM_Ok)
5256 {
5257 pfree(members);
5258 goto out_locked;
5259 }
5260 }
5261 if (members)
5262 pfree(members);
5263 }
5264 else
5265 {
5266 MultiXactStatus status;
5267
5268 /*
5269 * For a non-multi Xmax, we first need to compute the
5270 * corresponding MultiXactStatus by using the infomask bits.
5271 */
5272 if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))
5273 {
5274 if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask))
5275 status = MultiXactStatusForKeyShare;
5276 else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask))
5277 status = MultiXactStatusForShare;
5278 else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask))
5279 {
5280 if (old_infomask2 & HEAP_KEYS_UPDATED)
5281 status = MultiXactStatusForUpdate;
5282 else
5283 status = MultiXactStatusForNoKeyUpdate;
5284 }
5285 else
5286 {
5287 /*
5288 * LOCK_ONLY present alone (a pg_upgraded tuple marked
5289 * as share-locked in the old cluster) shouldn't be
5290 * seen in the middle of an update chain.
5291 */
5292 elog(ERROR, "invalid lock status in tuple");
5293 }
5294 }
5295 else
5296 {
5297 /* it's an update, but which kind? */
5298 if (old_infomask2 & HEAP_KEYS_UPDATED)
5299 status = MultiXactStatusUpdate;
5300 else
5301 status = MultiXactStatusNoKeyUpdate;
5302 }
5303
5304 result = test_lockmode_for_conflict(status, rawxmax, mode,
5305 &mytup, &needwait);
5306
5307 /*
5308 * If the tuple was already locked by ourselves in a previous
5309 * iteration of this (say heap_lock_tuple was forced to
5310 * restart the locking loop because of a change in xmax), then
5311 * we hold the lock already on this tuple version and we don't
5312 * need to do anything; and this is not an error condition
5313 * either. We just need to skip this tuple and continue
5314 * locking the next version in the update chain.
5315 */
5316 if (result == TM_SelfModified)
5317 goto next;
5318
5319 if (needwait)
5320 {
5321 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
5322 XactLockTableWait(rawxmax, rel, &mytup.t_self,
5323 XLTW_LockUpdated);
5324 goto l4;
5325 }
5326 if (result != TM_Ok)
5327 {
5328 goto out_locked;
5329 }
5330 }
5331 }
5332
5333 /* compute the new Xmax and infomask values for the tuple ... */
5334 compute_new_xmax_infomask(xmax, old_infomask, mytup.t_data->t_infomask2,
5335 xid, mode, false,
5336 &new_xmax, &new_infomask, &new_infomask2);
5337
5338 if (PageIsAllVisible(BufferGetPage(buf)) &&
5339 visibilitymap_clear(rel, block, vmbuffer,
5340 VISIBILITYMAP_ALL_FROZEN))
5341 cleared_all_frozen = true;
5342
5343 START_CRIT_SECTION();
5344
5345 /* ... and set them */
5346 HeapTupleHeaderSetXmax(mytup.t_data, new_xmax);
5347 mytup.t_data->t_infomask &= ~HEAP_XMAX_BITS;
5348 mytup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
5349 mytup.t_data->t_infomask |= new_infomask;
5350 mytup.t_data->t_infomask2 |= new_infomask2;
5351
5352 MarkBufferDirty(buf);
5353
5354 /* XLOG stuff */
5355 if (RelationNeedsWAL(rel))
5356 {
5357 xl_heap_lock_updated xlrec;
5358 XLogRecPtr recptr;
5359 Page page = BufferGetPage(buf);
5360
5361 XLogBeginInsert();
5362 XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
5363
5364 xlrec.offnum = ItemPointerGetOffsetNumber(&mytup.t_self);
5365 xlrec.xmax = new_xmax;
5366 xlrec.infobits_set = compute_infobits(new_infomask, new_infomask2);
5367 xlrec.flags =
5368 cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
5369
5370 XLogRegisterData((char *) &xlrec, SizeOfHeapLockUpdated);
5371
5372 recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_LOCK_UPDATED);
5373
5374 PageSetLSN(page, recptr);
5375 }
5376
5377 END_CRIT_SECTION();
5378
5379next:
5380		/* if we find the end of the update chain, we're done. */
5381 if (mytup.t_data->t_infomask & HEAP_XMAX_INVALID ||
5382 HeapTupleHeaderIndicatesMovedPartitions(mytup.t_data) ||
5383 ItemPointerEquals(&mytup.t_self, &mytup.t_data->t_ctid) ||
5384 HeapTupleHeaderIsOnlyLocked(mytup.t_data))
5385 {
5386 result = TM_Ok;
5387 goto out_locked;
5388 }
5389
5390		/* "tail recursion": loop back to lock the next version in the chain */
5391 priorXmax = HeapTupleHeaderGetUpdateXid(mytup.t_data);
5392 ItemPointerCopy(&(mytup.t_data->t_ctid), &tupid);
5393 UnlockReleaseBuffer(buf);
5394 }
5395
5396 result = TM_Ok;
5397
5398out_locked:
5399 UnlockReleaseBuffer(buf);
5400
5401out_unlocked:
5402 if (vmbuffer != InvalidBuffer)
5403 ReleaseBuffer(vmbuffer);
5404
5405 return result;
5406}
5407
5408/*
5409 * heap_lock_updated_tuple
5410 * Follow update chain when locking an updated tuple, acquiring locks (row
5411 * marks) on the updated versions.
5412 *
5413 * The initial tuple is assumed to be already locked.
5414 *
5415 * This function doesn't check visibility; it just unconditionally marks the
5416 * tuple(s) as locked. If any tuple in the updated chain is being deleted
5417 * concurrently (or updated with the key being modified), sleep until the
5418 * transaction doing it is finished.
5419 *
5420 * Note that we don't acquire heavyweight tuple locks on the tuples we walk
5421 * when we have to wait for other transactions to release them, as opposed to
5422 * what heap_lock_tuple does. The reason is that having more than one
5423 * transaction walking the chain is probably uncommon enough that the risk of
5424 * starvation is not likely: one of the preconditions for being here is that
5425 * the snapshot in use predates the update that created this tuple (because we
5426 * started at an earlier version of the tuple), but at the same time such a
5427 * transaction cannot be using repeatable read or serializable isolation
5428 * levels, because that would lead to a serializability failure.
5429 */
5430static TM_Result
5431heap_lock_updated_tuple(Relation rel, HeapTuple tuple, ItemPointer ctid,
5432 TransactionId xid, LockTupleMode mode)
5433{
5434 /*
5435 * If the tuple has not been updated, or has moved into another partition
5436	 * (effectively a delete), stop here.
5437 */
5438 if (!HeapTupleHeaderIndicatesMovedPartitions(tuple->t_data) &&
5439 !ItemPointerEquals(&tuple->t_self, ctid))
5440 {
5441 /*
5442 * If this is the first possibly-multixact-able operation in the
5443 * current transaction, set my per-backend OldestMemberMXactId
5444 * setting. We can be certain that the transaction will never become a
5445 * member of any older MultiXactIds than that. (We have to do this
5446 * even if we end up just using our own TransactionId below, since
5447 * some other backend could incorporate our XID into a MultiXact
5448 * immediately afterwards.)
5449 */
5450 MultiXactIdSetOldestMember();
5451
5452 return heap_lock_updated_tuple_rec(rel, ctid, xid, mode);
5453 }
5454
5455 /* nothing to lock */
5456 return TM_Ok;
5457}
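
/*
 * Illustrative sketch (not compiled): the typical call site in
 * heap_lock_tuple, shown in simplified form.  The chain is only followed when
 * the just-locked tuple was updated, i.e. its t_ctid points elsewhere; the
 * surrounding retry logic and the t_ctid copy are omitted here.
 *
 *		if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) &&
 *			!ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid))
 *		{
 *			res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
 *										  GetCurrentTransactionId(), mode);
 *			if (res != TM_Ok)
 *				(propagate the failure, as for the original tuple)
 *		}
 */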
5458
5459/*
5460 * heap_finish_speculative - mark speculative insertion as successful
5461 *
5462 * To successfully finish a speculative insertion we have to clear the
5463 * speculative token from the tuple. To do so, the t_ctid field, which
5464 * contains a speculative token value, is modified in place to point to the
5465 * tuple itself, which is characteristic of a newly inserted ordinary tuple.
5466 *
5467 * NB: It is not ok to commit without either finishing or aborting a
5468 * speculative insertion. We could treat speculative tuples of committed
5469 * transactions implicitly as completed, but then we would have to be prepared
5470 * to deal with speculative tokens on committed tuples. That wouldn't be
5471 * difficult - no-one looks at the ctid field of a tuple with invalid xmax -
5472 * but clearing the token at completion isn't very expensive either.
5473 * An explicit confirmation WAL record also makes logical decoding simpler.
5474 */
5475void
5476heap_finish_speculative(Relation relation, ItemPointer tid)
5477{
5478 Buffer buffer;
5479 Page page;
5480 OffsetNumber offnum;
5481 ItemId lp = NULL;
5482 HeapTupleHeader htup;
5483
5484 buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
5485 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
5486 page = (Page) BufferGetPage(buffer);
5487
5488 offnum = ItemPointerGetOffsetNumber(tid);
5489 if (PageGetMaxOffsetNumber(page) >= offnum)
5490 lp = PageGetItemId(page, offnum);
5491
5492 if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
5493 elog(ERROR, "invalid lp");
5494
5495 htup = (HeapTupleHeader) PageGetItem(page, lp);
5496
5497 /* SpecTokenOffsetNumber should be distinguishable from any real offset */
5498 StaticAssertStmt(MaxOffsetNumber < SpecTokenOffsetNumber,
5499 "invalid speculative token constant");
5500
5501 /* NO EREPORT(ERROR) from here till changes are logged */
5502 START_CRIT_SECTION();
5503
5504 Assert(HeapTupleHeaderIsSpeculative(htup));
5505
5506 MarkBufferDirty(buffer);
5507
5508 /*
5509 * Replace the speculative insertion token with a real t_ctid, pointing to
5510	 * itself, as it does on regular tuples.
5511 */
5512 htup->t_ctid = *tid;
5513
5514 /* XLOG stuff */
5515 if (RelationNeedsWAL(relation))
5516 {
5517 xl_heap_confirm xlrec;
5518 XLogRecPtr recptr;
5519
5520 xlrec.offnum = ItemPointerGetOffsetNumber(tid);
5521
5522 XLogBeginInsert();
5523
5524 /* We want the same filtering on this as on a plain insert */
5525 XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
5526
5527 XLogRegisterData((char *) &xlrec, SizeOfHeapConfirm);
5528 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
5529
5530 recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_CONFIRM);
5531
5532 PageSetLSN(page, recptr);
5533 }
5534
5535 END_CRIT_SECTION();
5536
5537 UnlockReleaseBuffer(buffer);
5538}
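
/*
 * Illustrative sketch (not compiled): the overall speculative-insertion dance
 * as driven from above this layer, in simplified form.  The conflict check in
 * the middle is pseudo-code; the lock and token routines named here are the
 * real ones from lmgr.c and htup_details.h.
 *
 *		specToken = SpeculativeInsertionLockAcquire(GetCurrentTransactionId());
 *		HeapTupleHeaderSetSpeculativeToken(tup->t_data, specToken);
 *		heap_insert(rel, tup, cid, HEAP_INSERT_SPECULATIVE, bistate);
 *
 *		if (... no conflicting tuple appeared meanwhile ...)
 *			heap_finish_speculative(rel, &tup->t_self);
 *		else
 *			heap_abort_speculative(rel, &tup->t_self);
 *		SpeculativeInsertionLockRelease(GetCurrentTransactionId());
 */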
5539
5540/*
5541 * heap_abort_speculative - kill a speculatively inserted tuple
5542 *
5543 * Marks a tuple that was speculatively inserted in the same command as dead,
5544 * by setting its xmin as invalid. That makes it immediately appear as dead
5545 * to all transactions, including our own. In particular, it makes
5546 * HeapTupleSatisfiesDirty() regard the tuple as dead, so that another backend
5547 * inserting a duplicate key value won't unnecessarily wait for our whole
5548 * transaction to finish (it'll just wait for our speculative insertion to
5549 * finish).
5550 *
5551 * Killing the tuple prevents "unprincipled deadlocks", which are deadlocks
5552 * that arise due to a mutual dependency that is not user visible. By
5553 * definition, unprincipled deadlocks cannot be prevented by the user
5554 * reordering lock acquisition in client code, because the implementation-level
5555 * lock acquisitions are not under the user's direct control. If speculative
5556 * inserters did not take this precaution, then under high concurrency they
5557 * could deadlock with each other, which would not be acceptable.
5558 *
5559 * This is somewhat redundant with heap_delete, but we prefer to have a
5560 * dedicated routine with stripped-down requirements. Note that this is also
5561 * used to delete the TOAST tuples created during speculative insertion.
5562 *
5563 * This routine does not affect logical decoding as it only looks at
5564 * confirmation records.
5565 */
5566void
5567heap_abort_speculative(Relation relation, ItemPointer tid)
5568{
5569 TransactionId xid = GetCurrentTransactionId();
5570 ItemId lp;
5571 HeapTupleData tp;
5572 Page page;
5573 BlockNumber block;
5574 Buffer buffer;
5575
5576 Assert(ItemPointerIsValid(tid));
5577
5578 block = ItemPointerGetBlockNumber(tid);
5579 buffer = ReadBuffer(relation, block);
5580 page = BufferGetPage(buffer);
5581
5582 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
5583
5584 /*
5585 * Page can't be all visible, we just inserted into it, and are still
5586 * running.
5587 */
5588 Assert(!PageIsAllVisible(page));
5589
5590 lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
5591 Assert(ItemIdIsNormal(lp));
5592
5593 tp.t_tableOid = RelationGetRelid(relation);
5594 tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
5595 tp.t_len = ItemIdGetLength(lp);
5596 tp.t_self = *tid;
5597
5598 /*
5599 * Sanity check that the tuple really is a speculatively inserted tuple,
5600 * inserted by us.
5601 */
5602 if (tp.t_data->t_choice.t_heap.t_xmin != xid)
5603 elog(ERROR, "attempted to kill a tuple inserted by another transaction");
5604 if (!(IsToastRelation(relation) || HeapTupleHeaderIsSpeculative(tp.t_data)))
5605 elog(ERROR, "attempted to kill a non-speculative tuple");
5606 Assert(!HeapTupleHeaderIsHeapOnly(tp.t_data));
5607
5608 /*
5609 * No need to check for serializable conflicts here. There is never a
5610 * need for a combocid, either. No need to extract replica identity, or
5611 * do anything special with infomask bits.
5612 */
5613
5614 START_CRIT_SECTION();
5615
5616 /*
5617	 * The tuple will become DEAD immediately. Flag that this page is
5618	 * immediately a candidate for pruning by setting xmin to
5619 * RecentGlobalXmin. That's not pretty, but it doesn't seem worth
5620 * inventing a nicer API for this.
5621 */
5622 Assert(TransactionIdIsValid(RecentGlobalXmin));
5623 PageSetPrunable(page, RecentGlobalXmin);
5624
5625 /* store transaction information of xact deleting the tuple */
5626 tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
5627 tp.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
5628
5629 /*
5630 * Set the tuple header xmin to InvalidTransactionId. This makes the
5631	 * tuple immediately invisible to everyone. (In particular, to any
5632 * transactions waiting on the speculative token, woken up later.)
5633 */
5634 HeapTupleHeaderSetXmin(tp.t_data, InvalidTransactionId);
5635
5636 /* Clear the speculative insertion token too */
5637 tp.t_data->t_ctid = tp.t_self;
5638
5639 MarkBufferDirty(buffer);
5640
5641 /*
5642 * XLOG stuff
5643 *
5644 * The WAL records generated here match heap_delete(). The same recovery
5645 * routines are used.
5646 */
5647 if (RelationNeedsWAL(relation))
5648 {
5649 xl_heap_delete xlrec;
5650 XLogRecPtr recptr;
5651
5652 xlrec.flags = XLH_DELETE_IS_SUPER;
5653 xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask,
5654 tp.t_data->t_infomask2);
5655 xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self);
5656 xlrec.xmax = xid;
5657
5658 XLogBeginInsert();
5659 XLogRegisterData((char *) &xlrec, SizeOfHeapDelete);
5660 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
5661
5662 /* No replica identity & replication origin logged */
5663
5664 recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE);
5665
5666 PageSetLSN(page, recptr);
5667 }
5668
5669 END_CRIT_SECTION();
5670
5671 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
5672
5673 if (HeapTupleHasExternal(&tp))
5674 {
5675 Assert(!IsToastRelation(relation));
5676 toast_delete(relation, &tp, true);
5677 }
5678
5679 /*
5680 * Never need to mark tuple for invalidation, since catalogs don't support
5681 * speculative insertion
5682 */
5683
5684 /* Now we can release the buffer */
5685 ReleaseBuffer(buffer);
5686
5687 /* count deletion, as we counted the insertion too */
5688 pgstat_count_heap_delete(relation);
5689}
5690
5691/*
5692 * heap_inplace_update - update a tuple "in place" (ie, overwrite it)
5693 *
5694 * Overwriting violates both MVCC and transactional safety, so the uses
5695 * of this function in Postgres are extremely limited. Nonetheless we
5696 * find some places to use it.
5697 *
5698 * The tuple cannot change size, and therefore it's reasonable to assume
5699 * that its null bitmap (if any) doesn't change either. So we just
5700 * overwrite the data portion of the tuple without touching the null
5701 * bitmap or any of the header fields.
5702 *
5703 * tuple is an in-memory tuple structure containing the data to be written
5704 * over the target tuple. Also, tuple->t_self identifies the target tuple.
5705 */
5706void
5707heap_inplace_update(Relation relation, HeapTuple tuple)
5708{
5709 Buffer buffer;
5710 Page page;
5711 OffsetNumber offnum;
5712 ItemId lp = NULL;
5713 HeapTupleHeader htup;
5714 uint32 oldlen;
5715 uint32 newlen;
5716
5717 /*
5718 * For now, parallel operations are required to be strictly read-only.
5719 * Unlike a regular update, this should never create a combo CID, so it
5720 * might be possible to relax this restriction, but not without more
5721 * thought and testing. It's not clear that it would be useful, anyway.
5722 */
5723 if (IsInParallelMode())
5724 ereport(ERROR,
5725 (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
5726 errmsg("cannot update tuples during a parallel operation")));
5727
5728 buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&(tuple->t_self)));
5729 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
5730 page = (Page) BufferGetPage(buffer);
5731
5732 offnum = ItemPointerGetOffsetNumber(&(tuple->t_self));
5733 if (PageGetMaxOffsetNumber(page) >= offnum)
5734 lp = PageGetItemId(page, offnum);
5735
5736 if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
5737 elog(ERROR, "invalid lp");
5738
5739 htup = (HeapTupleHeader) PageGetItem(page, lp);
5740
5741 oldlen = ItemIdGetLength(lp) - htup->t_hoff;
5742 newlen = tuple->t_len - tuple->t_data->t_hoff;
5743 if (oldlen != newlen || htup->t_hoff != tuple->t_data->t_hoff)
5744 elog(ERROR, "wrong tuple length");
5745
5746 /* NO EREPORT(ERROR) from here till changes are logged */
5747 START_CRIT_SECTION();
5748
5749 memcpy((char *) htup + htup->t_hoff,
5750 (char *) tuple->t_data + tuple->t_data->t_hoff,
5751 newlen);
5752
5753 MarkBufferDirty(buffer);
5754
5755 /* XLOG stuff */
5756 if (RelationNeedsWAL(relation))
5757 {
5758 xl_heap_inplace xlrec;
5759 XLogRecPtr recptr;
5760
5761 xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
5762
5763 XLogBeginInsert();
5764 XLogRegisterData((char *) &xlrec, SizeOfHeapInplace);
5765
5766 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
5767 XLogRegisterBufData(0, (char *) htup + htup->t_hoff, newlen);
5768
5769 /* inplace updates aren't decoded atm, don't log the origin */
5770
5771 recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_INPLACE);
5772
5773 PageSetLSN(page, recptr);
5774 }
5775
5776 END_CRIT_SECTION();
5777
5778 UnlockReleaseBuffer(buffer);
5779
5780 /*
5781 * Send out shared cache inval if necessary. Note that because we only
5782 * pass the new version of the tuple, this mustn't be used for any
5783 * operations that could change catcache lookup keys. But we aren't
5784 * bothering with index updates either, so that's true a fortiori.
5785 */
5786 if (!IsBootstrapProcessingMode())
5787 CacheInvalidateHeapTuple(relation, tuple, NULL);
5788}
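
/*
 * Illustrative sketch (not compiled): the typical usage pattern, modeled on
 * how VACUUM updates pg_class statistics -- fetch a copy of the catalog
 * tuple, modify fixed-width fields, and write it back.  The particular fields
 * and variables shown are just examples.
 *
 *		HeapTuple	ctup;
 *		Form_pg_class pgcform;
 *
 *		ctup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(relid));
 *		pgcform = (Form_pg_class) GETSTRUCT(ctup);
 *		pgcform->relpages = (int32) num_pages;
 *		pgcform->reltuples = (float4) num_tuples;
 *		heap_inplace_update(relation, ctup);	(tuple size must not change)
 */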
5789
5790#define FRM_NOOP 0x0001
5791#define FRM_INVALIDATE_XMAX 0x0002
5792#define FRM_RETURN_IS_XID 0x0004
5793#define FRM_RETURN_IS_MULTI 0x0008
5794#define FRM_MARK_COMMITTED 0x0010
5795
5796/*
5797 * FreezeMultiXactId
5798 * Determine what to do during freezing when a tuple is marked by a
5799 * MultiXactId.
5800 *
5801 * NB -- this might have the side-effect of creating a new MultiXactId!
5802 *
5803 * "flags" is an output value; it's used to tell caller what to do on return.
5804 * Possible flags are:
5805 * FRM_NOOP
5806 * don't do anything -- keep existing Xmax
5807 * FRM_INVALIDATE_XMAX
5808 * mark Xmax as InvalidTransactionId and set XMAX_INVALID flag.
5809 * FRM_RETURN_IS_XID
5810 * The Xid return value is a single update Xid to set as xmax.
5811 * FRM_MARK_COMMITTED
5812 * Xmax can be marked as HEAP_XMAX_COMMITTED
5813 * FRM_RETURN_IS_MULTI
5814 * The return value is a new MultiXactId to set as new Xmax.
5815 * (caller must obtain proper infomask bits using GetMultiXactIdHintBits)
5816 */
5817static TransactionId
5818FreezeMultiXactId(MultiXactId multi, uint16 t_infomask,
5819 TransactionId relfrozenxid, TransactionId relminmxid,
5820 TransactionId cutoff_xid, MultiXactId cutoff_multi,
5821 uint16 *flags)
5822{
5823 TransactionId xid = InvalidTransactionId;
5824 int i;
5825 MultiXactMember *members;
5826 int nmembers;
5827 bool need_replace;
5828 int nnewmembers;
5829 MultiXactMember *newmembers;
5830 bool has_lockers;
5831 TransactionId update_xid;
5832 bool update_committed;
5833
5834 *flags = 0;
5835
5836	/* We should only be called when Xmax is a MultiXactId */
5837 Assert(t_infomask & HEAP_XMAX_IS_MULTI);
5838
5839 if (!MultiXactIdIsValid(multi) ||
5840 HEAP_LOCKED_UPGRADED(t_infomask))
5841 {
5842 /* Ensure infomask bits are appropriately set/reset */
5843 *flags |= FRM_INVALIDATE_XMAX;
5844 return InvalidTransactionId;
5845 }
5846 else if (MultiXactIdPrecedes(multi, relminmxid))
5847 ereport(ERROR,
5848 (errcode(ERRCODE_DATA_CORRUPTED),
5849 errmsg_internal("found multixact %u from before relminmxid %u",
5850 multi, relminmxid)));
5851 else if (MultiXactIdPrecedes(multi, cutoff_multi))
5852 {
5853 /*
5854 * This old multi cannot possibly have members still running, but
5855 * verify just in case. If it was a locker only, it can be removed
5856 * without any further consideration; but if it contained an update,
5857 * we might need to preserve it.
5858 */
5859 if (MultiXactIdIsRunning(multi,
5860 HEAP_XMAX_IS_LOCKED_ONLY(t_infomask)))
5861 ereport(ERROR,
5862 (errcode(ERRCODE_DATA_CORRUPTED),
5863 errmsg_internal("multixact %u from before cutoff %u found to be still running",
5864 multi, cutoff_multi)));
5865
5866 if (HEAP_XMAX_IS_LOCKED_ONLY(t_infomask))
5867 {
5868 *flags |= FRM_INVALIDATE_XMAX;
5869 xid = InvalidTransactionId; /* not strictly necessary */
5870 }
5871 else
5872 {
5873 /* replace multi by update xid */
5874 xid = MultiXactIdGetUpdateXid(multi, t_infomask);
5875
5876 /* wasn't only a lock, xid needs to be valid */
5877 Assert(TransactionIdIsValid(xid));
5878
5879 if (TransactionIdPrecedes(xid, relfrozenxid))
5880 ereport(ERROR,
5881 (errcode(ERRCODE_DATA_CORRUPTED),
5882 errmsg_internal("found update xid %u from before relfrozenxid %u",
5883 xid, relfrozenxid)));
5884
5885 /*
5886 * If the xid is older than the cutoff, it has to have aborted,
5887 * otherwise the tuple would have gotten pruned away.
5888 */
5889 if (TransactionIdPrecedes(xid, cutoff_xid))
5890 {
5891 if (TransactionIdDidCommit(xid))
5892 ereport(ERROR,
5893 (errcode(ERRCODE_DATA_CORRUPTED),
5894 errmsg_internal("cannot freeze committed update xid %u", xid)));
5895 *flags |= FRM_INVALIDATE_XMAX;
5896 xid = InvalidTransactionId; /* not strictly necessary */
5897 }
5898 else
5899 {
5900 *flags |= FRM_RETURN_IS_XID;
5901 }
5902 }
5903
5904 return xid;
5905 }
5906
5907 /*
5908 * This multixact might have or might not have members still running, but
5909 * we know it's valid and is newer than the cutoff point for multis.
5910 * However, some member(s) of it may be below the cutoff for Xids, so we
5911 * need to walk the whole members array to figure out what to do, if
5912 * anything.
5913 */
5914
5915 nmembers =
5916 GetMultiXactIdMembers(multi, &members, false,
5917 HEAP_XMAX_IS_LOCKED_ONLY(t_infomask));
5918 if (nmembers <= 0)
5919 {
5920 /* Nothing worth keeping */
5921 *flags |= FRM_INVALIDATE_XMAX;
5922 return InvalidTransactionId;
5923 }
5924
5925 /* is there anything older than the cutoff? */
5926 need_replace = false;
5927 for (i = 0; i < nmembers; i++)
5928 {
5929 if (TransactionIdPrecedes(members[i].xid, cutoff_xid))
5930 {
5931 need_replace = true;
5932 break;
5933 }
5934 }
5935
5936 /*
5937 * In the simplest case, there is no member older than the cutoff; we can
5938 * keep the existing MultiXactId as is.
5939 */
5940 if (!need_replace)
5941 {
5942 *flags |= FRM_NOOP;
5943 pfree(members);
5944 return InvalidTransactionId;
5945 }
5946
5947 /*
5948	 * If the multi needs to be updated, figure out which members we need to
5949	 * keep.
5950 */
5951 nnewmembers = 0;
5952 newmembers = palloc(sizeof(MultiXactMember) * nmembers);
5953 has_lockers = false;
5954 update_xid = InvalidTransactionId;
5955 update_committed = false;
5956
5957 for (i = 0; i < nmembers; i++)
5958 {
5959 /*
5960 * Determine whether to keep this member or ignore it.
5961 */
5962 if (ISUPDATE_from_mxstatus(members[i].status))
5963 {
5964 TransactionId xid = members[i].xid;
5965
5966 Assert(TransactionIdIsValid(xid));
5967 if (TransactionIdPrecedes(xid, relfrozenxid))
5968 ereport(ERROR,
5969 (errcode(ERRCODE_DATA_CORRUPTED),
5970 errmsg_internal("found update xid %u from before relfrozenxid %u",
5971 xid, relfrozenxid)));
5972
5973 /*
5974 * It's an update; should we keep it? If the transaction is known
5975 * aborted or crashed then it's okay to ignore it, otherwise not.
5976 * Note that an updater older than cutoff_xid cannot possibly be
5977 * committed, because HeapTupleSatisfiesVacuum would have returned
5978 * HEAPTUPLE_DEAD and we would not be trying to freeze the tuple.
5979 *
5980 * As with all tuple visibility routines, it's critical to test
5981 * TransactionIdIsInProgress before TransactionIdDidCommit,
5982 * because of race conditions explained in detail in
5983 * heapam_visibility.c.
5984 */
5985 if (TransactionIdIsCurrentTransactionId(xid) ||
5986 TransactionIdIsInProgress(xid))
5987 {
5988 Assert(!TransactionIdIsValid(update_xid));
5989 update_xid = xid;
5990 }
5991 else if (TransactionIdDidCommit(xid))
5992 {
5993 /*
5994 * The transaction committed, so we can tell caller to set
5995 * HEAP_XMAX_COMMITTED. (We can only do this because we know
5996 * the transaction is not running.)
5997 */
5998 Assert(!TransactionIdIsValid(update_xid));
5999 update_committed = true;
6000 update_xid = xid;
6001 }
6002 else
6003 {
6004 /*
6005 * Not in progress, not committed -- must be aborted or
6006 * crashed; we can ignore it.
6007 */
6008 }
6009
6010 /*
6011 * Since the tuple wasn't marked HEAPTUPLE_DEAD by vacuum, the
6012 * update Xid cannot possibly be older than the xid cutoff. The
6013 * presence of such a tuple would cause corruption, so be paranoid
6014 * and check.
6015 */
6016 if (TransactionIdIsValid(update_xid) &&
6017 TransactionIdPrecedes(update_xid, cutoff_xid))
6018 ereport(ERROR,
6019 (errcode(ERRCODE_DATA_CORRUPTED),
6020 errmsg_internal("found update xid %u from before xid cutoff %u",
6021 update_xid, cutoff_xid)));
6022
6023 /*
6024 * If we determined that it's an Xid corresponding to an update
6025 * that must be retained, additionally add it to the list of
6026 * members of the new Multi, in case we end up using that. (We
6027 * might still decide to use only an update Xid and not a multi,
6028 * but it's easier to maintain the list as we walk the old members
6029 * list.)
6030 */
6031 if (TransactionIdIsValid(update_xid))
6032 newmembers[nnewmembers++] = members[i];
6033 }
6034 else
6035 {
6036 /* We only keep lockers if they are still running */
6037 if (TransactionIdIsCurrentTransactionId(members[i].xid) ||
6038 TransactionIdIsInProgress(members[i].xid))
6039 {
6040 /* running locker cannot possibly be older than the cutoff */
6041 Assert(!TransactionIdPrecedes(members[i].xid, cutoff_xid));
6042 newmembers[nnewmembers++] = members[i];
6043 has_lockers = true;
6044 }
6045 }
6046 }
6047
6048 pfree(members);
6049
6050 if (nnewmembers == 0)
6051 {
6052 /* nothing worth keeping!? Tell caller to remove the whole thing */
6053 *flags |= FRM_INVALIDATE_XMAX;
6054 xid = InvalidTransactionId;
6055 }
6056 else if (TransactionIdIsValid(update_xid) && !has_lockers)
6057 {
6058 /*
6059 * If there's a single member and it's an update, pass it back alone
6060 * without creating a new Multi. (XXX we could do this when there's a
6061 * single remaining locker, too, but that would complicate the API too
6062 * much; moreover, the case with the single updater is more
6063 * interesting, because those are longer-lived.)
6064 */
6065 Assert(nnewmembers == 1);
6066 *flags |= FRM_RETURN_IS_XID;
6067 if (update_committed)
6068 *flags |= FRM_MARK_COMMITTED;
6069 xid = update_xid;
6070 }
6071 else
6072 {
6073 /*
6074 * Create a new multixact with the surviving members of the previous
6075 * one, to set as new Xmax in the tuple.
6076 */
6077 xid = MultiXactIdCreateFromMembers(nnewmembers, newmembers);
6078 *flags |= FRM_RETURN_IS_MULTI;
6079 }
6080
6081 pfree(newmembers);
6082
6083 return xid;
6084}
6085
6086/*
6087 * heap_prepare_freeze_tuple
6088 *
6089 * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
6090 * are older than the specified cutoff XID and cutoff MultiXactId. If so,
6091 * setup enough state (in the *frz output argument) to later execute and
6092 * WAL-log what we would need to do, and return true. Return false if nothing
6093 * is to be changed. In addition, set *totally_frozen_p to true if the tuple
6094 * will be totally frozen after these operations are performed and false if
6095 * more freezing will eventually be required.
6096 *
6097 * Caller is responsible for setting the offset field, if appropriate.
6098 *
6099 * It is assumed that the caller has checked the tuple with
6100 * HeapTupleSatisfiesVacuum() and determined that it is not HEAPTUPLE_DEAD
6101 * (else we should be removing the tuple, not freezing it).
6102 *
6103 * NB: cutoff_xid *must* be <= the current global xmin, to ensure that any
6104 * XID older than it could neither be running nor seen as running by any
6105 * open transaction. This ensures that the replacement will not change
6106 * anyone's idea of the tuple state.
6107 * Similarly, cutoff_multi must be less than or equal to the smallest
6108 * MultiXactId used by any transaction currently open.
6109 *
6110 * If the tuple is in a shared buffer, caller must hold an exclusive lock on
6111 * that buffer.
6112 *
6113 * NB: It is not enough to set hint bits to indicate something is
6114 * committed/invalid -- they might not be set on a standby, or after crash
6115 * recovery. We really need to remove old xids.
6116 */
6117bool
6118heap_prepare_freeze_tuple(HeapTupleHeader tuple,
6119 TransactionId relfrozenxid, TransactionId relminmxid,
6120 TransactionId cutoff_xid, TransactionId cutoff_multi,
6121 xl_heap_freeze_tuple *frz, bool *totally_frozen_p)
6122{
6123 bool changed = false;
6124 bool xmax_already_frozen = false;
6125 bool xmin_frozen;
6126 bool freeze_xmax;
6127 TransactionId xid;
6128
6129 frz->frzflags = 0;
6130 frz->t_infomask2 = tuple->t_infomask2;
6131 frz->t_infomask = tuple->t_infomask;
6132 frz->xmax = HeapTupleHeaderGetRawXmax(tuple);
6133
6134 /*
6135 * Process xmin. xmin_frozen has two slightly different meanings: in the
6136 * !XidIsNormal case, it means "the xmin doesn't need any freezing" (it's
6137 * already a permanent value), while in the block below it is set true to
6138 * mean "xmin won't need freezing after what we do to it here" (false
6139 * otherwise). In both cases we're allowed to set totally_frozen, as far
6140 * as xmin is concerned.
6141 */
6142 xid = HeapTupleHeaderGetXmin(tuple);
6143 if (!TransactionIdIsNormal(xid))
6144 xmin_frozen = true;
6145 else
6146 {
6147 if (TransactionIdPrecedes(xid, relfrozenxid))
6148 ereport(ERROR,
6149 (errcode(ERRCODE_DATA_CORRUPTED),
6150 errmsg_internal("found xmin %u from before relfrozenxid %u",
6151 xid, relfrozenxid)));
6152
6153 xmin_frozen = TransactionIdPrecedes(xid, cutoff_xid);
6154 if (xmin_frozen)
6155 {
6156 if (!TransactionIdDidCommit(xid))
6157 ereport(ERROR,
6158 (errcode(ERRCODE_DATA_CORRUPTED),
6159 errmsg_internal("uncommitted xmin %u from before xid cutoff %u needs to be frozen",
6160 xid, cutoff_xid)));
6161
6162 frz->t_infomask |= HEAP_XMIN_FROZEN;
6163 changed = true;
6164 }
6165 }
6166
6167 /*
6168 * Process xmax. To thoroughly examine the current Xmax value we need to
6169 * resolve a MultiXactId to its member Xids, in case some of them are
6170 * below the given cutoff for Xids. In that case, those values might need
6171 * freezing, too. Also, if a multi needs freezing, we cannot simply take
6172 * it out --- if there's a live updater Xid, it needs to be kept.
6173 *
6174 * Make sure to keep heap_tuple_needs_freeze in sync with this.
6175 */
6176 xid = HeapTupleHeaderGetRawXmax(tuple);
6177
6178 if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
6179 {
6180 TransactionId newxmax;
6181 uint16 flags;
6182
6183 newxmax = FreezeMultiXactId(xid, tuple->t_infomask,
6184 relfrozenxid, relminmxid,
6185 cutoff_xid, cutoff_multi, &flags);
6186
6187 freeze_xmax = (flags & FRM_INVALIDATE_XMAX);
6188
6189 if (flags & FRM_RETURN_IS_XID)
6190 {
6191 /*
6192 * NB -- some of these transformations are only valid because we
6193 * know the return Xid is a tuple updater (i.e. not merely a
6194 * locker.) Also note that the only reason we don't explicitly
6195 * worry about HEAP_KEYS_UPDATED is because it lives in
6196 * t_infomask2 rather than t_infomask.
6197 */
6198 frz->t_infomask &= ~HEAP_XMAX_BITS;
6199 frz->xmax = newxmax;
6200 if (flags & FRM_MARK_COMMITTED)
6201 frz->t_infomask |= HEAP_XMAX_COMMITTED;
6202 changed = true;
6203 }
6204 else if (flags & FRM_RETURN_IS_MULTI)
6205 {
6206 uint16 newbits;
6207 uint16 newbits2;
6208
6209 /*
6210 * We can't use GetMultiXactIdHintBits directly on the new multi
6211 * here; that routine initializes the masks to all zeroes, which
6212 * would lose other bits we need. Doing it this way ensures all
6213 * unrelated bits remain untouched.
6214 */
6215 frz->t_infomask &= ~HEAP_XMAX_BITS;
6216 frz->t_infomask2 &= ~HEAP_KEYS_UPDATED;
6217 GetMultiXactIdHintBits(newxmax, &newbits, &newbits2);
6218 frz->t_infomask |= newbits;
6219 frz->t_infomask2 |= newbits2;
6220
6221 frz->xmax = newxmax;
6222
6223 changed = true;
6224 }
6225 }
6226 else if (TransactionIdIsNormal(xid))
6227 {
6228 if (TransactionIdPrecedes(xid, relfrozenxid))
6229 ereport(ERROR,
6230 (errcode(ERRCODE_DATA_CORRUPTED),
6231 errmsg_internal("found xmax %u from before relfrozenxid %u",
6232 xid, relfrozenxid)));
6233
6234 if (TransactionIdPrecedes(xid, cutoff_xid))
6235 {
6236 /*
6237 * If we freeze xmax, make absolutely sure that it's not an XID
6238 * that is important. (Note, a lock-only xmax can be removed
6239 * independent of committedness, since a committed lock holder has
6240 * released the lock).
6241 */
6242 if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask) &&
6243 TransactionIdDidCommit(xid))
6244 ereport(ERROR,
6245 (errcode(ERRCODE_DATA_CORRUPTED),
6246 errmsg_internal("cannot freeze committed xmax %u",
6247 xid)));
6248 freeze_xmax = true;
6249 }
6250 else
6251 freeze_xmax = false;
6252 }
6253 else if ((tuple->t_infomask & HEAP_XMAX_INVALID) ||
6254 !TransactionIdIsValid(HeapTupleHeaderGetRawXmax(tuple)))
6255 {
6256 freeze_xmax = false;
6257 xmax_already_frozen = true;
6258 }
6259 else
6260 ereport(ERROR,
6261 (errcode(ERRCODE_DATA_CORRUPTED),
6262 errmsg_internal("found xmax %u (infomask 0x%04x) not frozen, not multi, not normal",
6263 xid, tuple->t_infomask)));
6264
6265 if (freeze_xmax)
6266 {
6267 Assert(!xmax_already_frozen);
6268
6269 frz->xmax = InvalidTransactionId;
6270
6271 /*
6272 * The tuple might be marked either XMAX_INVALID or XMAX_COMMITTED +
6273 * LOCKED. Normalize to INVALID just to be sure no one gets confused.
6274 * Also get rid of the HEAP_KEYS_UPDATED bit.
6275 */
6276 frz->t_infomask &= ~HEAP_XMAX_BITS;
6277 frz->t_infomask |= HEAP_XMAX_INVALID;
6278 frz->t_infomask2 &= ~HEAP_HOT_UPDATED;
6279 frz->t_infomask2 &= ~HEAP_KEYS_UPDATED;
6280 changed = true;
6281 }
6282
6283 /*
6284 * Old-style VACUUM FULL is gone, but we have to keep this code as long as
6285 * we support having MOVED_OFF/MOVED_IN tuples in the database.
6286 */
6287 if (tuple->t_infomask & HEAP_MOVED)
6288 {
6289 xid = HeapTupleHeaderGetXvac(tuple);
6290
6291 /*
6292 * For Xvac, we ignore the cutoff_xid and just always perform the
6293 * freeze operation. The oldest release in which such a value can
6294 * actually be set is PostgreSQL 8.4, because old-style VACUUM FULL
6295 * was removed in PostgreSQL 9.0. Note that if we were to respect
6296		 * cutoff_xid here, we'd need to make sure to clear totally_frozen
6297 * when we skipped freezing on that basis.
6298 */
6299 if (TransactionIdIsNormal(xid))
6300 {
6301 /*
6302 * If a MOVED_OFF tuple is not dead, the xvac transaction must
6303 * have failed; whereas a non-dead MOVED_IN tuple must mean the
6304 * xvac transaction succeeded.
6305 */
6306 if (tuple->t_infomask & HEAP_MOVED_OFF)
6307 frz->frzflags |= XLH_INVALID_XVAC;
6308 else
6309 frz->frzflags |= XLH_FREEZE_XVAC;
6310
6311 /*
6312 * Might as well fix the hint bits too; usually XMIN_COMMITTED
6313 * will already be set here, but there's a small chance not.
6314 */
6315 Assert(!(tuple->t_infomask & HEAP_XMIN_INVALID));
6316 frz->t_infomask |= HEAP_XMIN_COMMITTED;
6317 changed = true;
6318 }
6319 }
6320
6321 *totally_frozen_p = (xmin_frozen &&
6322 (freeze_xmax || xmax_already_frozen));
6323 return changed;
6324}
6325
6326/*
6327 * heap_execute_freeze_tuple
6328 * Execute the prepared freezing of a tuple.
6329 *
6330 * Caller is responsible for ensuring that no other backend can access the
6331 * storage underlying this tuple, either by holding an exclusive lock on the
6332 * buffer containing it (which is what lazy VACUUM does), or by having it be
6333 * in private storage (which is what CLUSTER and friends do).
6334 *
6335 * Note: it might seem we could make the changes without exclusive lock, since
6336 * TransactionId read/write is assumed atomic anyway. However, there is a race
6337 * condition: someone who just fetched an old XID that we overwrite here could
6338 * conceivably not finish checking the XID against pg_xact before we finish
6339 * the VACUUM and perhaps truncate off the part of pg_xact he needs. Getting
6340 * exclusive lock ensures no other backend is in process of checking the
6341 * tuple status. Also, getting exclusive lock makes it safe to adjust the
6342 * infomask bits.
6343 *
6344 * NB: All code in here must be safe to execute during crash recovery!
6345 */
6346void
6347heap_execute_freeze_tuple(HeapTupleHeader tuple, xl_heap_freeze_tuple *frz)
6348{
6349 HeapTupleHeaderSetXmax(tuple, frz->xmax);
6350
6351 if (frz->frzflags & XLH_FREEZE_XVAC)
6352 HeapTupleHeaderSetXvac(tuple, FrozenTransactionId);
6353
6354 if (frz->frzflags & XLH_INVALID_XVAC)
6355 HeapTupleHeaderSetXvac(tuple, InvalidTransactionId);
6356
6357 tuple->t_infomask = frz->t_infomask;
6358 tuple->t_infomask2 = frz->t_infomask2;
6359}
6360
6361/*
6362 * heap_freeze_tuple
6363 * Freeze tuple in place, without WAL logging.
6364 *
6365 * Useful for callers like CLUSTER that perform their own WAL logging.
6366 */
6367bool
6368heap_freeze_tuple(HeapTupleHeader tuple,
6369 TransactionId relfrozenxid, TransactionId relminmxid,
6370 TransactionId cutoff_xid, TransactionId cutoff_multi)
6371{
6372 xl_heap_freeze_tuple frz;
6373 bool do_freeze;
6374 bool tuple_totally_frozen;
6375
6376 do_freeze = heap_prepare_freeze_tuple(tuple,
6377 relfrozenxid, relminmxid,
6378 cutoff_xid, cutoff_multi,
6379 &frz, &tuple_totally_frozen);
6380
6381 /*
6382 * Note that because this is not a WAL-logged operation, we don't need to
6383 * fill in the offset in the freeze record.
6384 */
6385
6386 if (do_freeze)
6387 heap_execute_freeze_tuple(tuple, &frz);
6388 return do_freeze;
6389}
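
/*
 * Illustrative sketch (not compiled): the WAL-logged two-step variant, along
 * the lines of what lazy VACUUM does.  Tuples on a page are prepared first,
 * then executed and logged inside one critical section; the loop structure
 * and variable names are simplified.
 *
 *		if (heap_prepare_freeze_tuple(tuple.t_data, relfrozenxid, relminmxid,
 *									  cutoff_xid, cutoff_multi,
 *									  &frozen[nfrozen], &totally_frozen))
 *			frozen[nfrozen++].offset = offnum;
 *
 *		(... after all offsets on the page have been examined ...)
 *
 *		START_CRIT_SECTION();
 *		for (i = 0; i < nfrozen; i++)
 *		{
 *			itemid = PageGetItemId(page, frozen[i].offset);
 *			htup = (HeapTupleHeader) PageGetItem(page, itemid);
 *			heap_execute_freeze_tuple(htup, &frozen[i]);
 *		}
 *		MarkBufferDirty(buf);
 *		if (RelationNeedsWAL(rel))
 *			PageSetLSN(page, log_heap_freeze(rel, buf, cutoff_xid,
 *											 frozen, nfrozen));
 *		END_CRIT_SECTION();
 */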
6390
6391/*
6392 * For a given MultiXactId, return the hint bits that should be set in the
6393 * tuple's infomask.
6394 *
6395 * Normally this should be called for a multixact that was just created, and
6396 * so is on our local cache, so the GetMembers call is fast.
6397 */
6398static void
6399GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask,
6400 uint16 *new_infomask2)
6401{
6402 int nmembers;
6403 MultiXactMember *members;
6404 int i;
6405 uint16 bits = HEAP_XMAX_IS_MULTI;
6406 uint16 bits2 = 0;
6407 bool has_update = false;
6408 LockTupleMode strongest = LockTupleKeyShare;
6409
6410 /*
6411	 * We only use this in multis we just created, so they cannot be values
6412	 * from before pg_upgrade.
6413 */
6414 nmembers = GetMultiXactIdMembers(multi, &members, false, false);
6415
6416 for (i = 0; i < nmembers; i++)
6417 {
6418 LockTupleMode mode;
6419
6420 /*
6421 * Remember the strongest lock mode held by any member of the
6422 * multixact.
6423 */
6424 mode = TUPLOCK_from_mxstatus(members[i].status);
6425 if (mode > strongest)
6426 strongest = mode;
6427
6428 /* See what other bits we need */
6429 switch (members[i].status)
6430 {
6431 case MultiXactStatusForKeyShare:
6432 case MultiXactStatusForShare:
6433 case MultiXactStatusForNoKeyUpdate:
6434 break;
6435
6436 case MultiXactStatusForUpdate:
6437 bits2 |= HEAP_KEYS_UPDATED;
6438 break;
6439
6440 case MultiXactStatusNoKeyUpdate:
6441 has_update = true;
6442 break;
6443
6444 case MultiXactStatusUpdate:
6445 bits2 |= HEAP_KEYS_UPDATED;
6446 has_update = true;
6447 break;
6448 }
6449 }
6450
6451 if (strongest == LockTupleExclusive ||
6452 strongest == LockTupleNoKeyExclusive)
6453 bits |= HEAP_XMAX_EXCL_LOCK;
6454 else if (strongest == LockTupleShare)
6455 bits |= HEAP_XMAX_SHR_LOCK;
6456 else if (strongest == LockTupleKeyShare)
6457 bits |= HEAP_XMAX_KEYSHR_LOCK;
6458
6459 if (!has_update)
6460 bits |= HEAP_XMAX_LOCK_ONLY;
6461
6462 if (nmembers > 0)
6463 pfree(members);
6464
6465 *new_infomask = bits;
6466 *new_infomask2 = bits2;
6467}
6468
6469/*
6470 * MultiXactIdGetUpdateXid
6471 *
6472 * Given a multixact Xmax and corresponding infomask, which does not have the
6473 * HEAP_XMAX_LOCK_ONLY bit set, obtain and return the Xid of the updating
6474 * transaction.
6475 *
6476 * Caller is expected to check the status of the updating transaction, if
6477 * necessary.
6478 */
6479static TransactionId
6480MultiXactIdGetUpdateXid(TransactionId xmax, uint16 t_infomask)
6481{
6482 TransactionId update_xact = InvalidTransactionId;
6483 MultiXactMember *members;
6484 int nmembers;
6485
6486 Assert(!(t_infomask & HEAP_XMAX_LOCK_ONLY));
6487 Assert(t_infomask & HEAP_XMAX_IS_MULTI);
6488
6489 /*
6490 * Since we know the LOCK_ONLY bit is not set, this cannot be a multi from
6491 * pre-pg_upgrade.
6492 */
6493 nmembers = GetMultiXactIdMembers(xmax, &members, false, false);
6494
6495 if (nmembers > 0)
6496 {
6497 int i;
6498
6499 for (i = 0; i < nmembers; i++)
6500 {
6501 /* Ignore lockers */
6502 if (!ISUPDATE_from_mxstatus(members[i].status))
6503 continue;
6504
6505 /* there can be at most one updater */
6506 Assert(update_xact == InvalidTransactionId);
6507 update_xact = members[i].xid;
6508#ifndef USE_ASSERT_CHECKING
6509
6510 /*
6511			 * Without assertions, stop at the first updater; an assert-enabled
6512			 * build walks the whole array to ensure there's no other updater.
6513 */
6514 break;
6515#endif
6516 }
6517
6518 pfree(members);
6519 }
6520
6521 return update_xact;
6522}
6523
6524/*
6525 * HeapTupleGetUpdateXid
6526 * As above, but use a HeapTupleHeader
6527 *
6528 * See also HeapTupleHeaderGetUpdateXid, which can be used without previously
6529 * checking the hint bits.
6530 */
6531TransactionId
6532HeapTupleGetUpdateXid(HeapTupleHeader tuple)
6533{
6534 return MultiXactIdGetUpdateXid(HeapTupleHeaderGetRawXmax(tuple),
6535 tuple->t_infomask);
6536}
6537
6538/*
6539 * Does the given multixact conflict with the current transaction grabbing a
6540 * tuple lock of the given strength?
6541 *
6542 * The passed infomask pairs up with the given multixact in the tuple header.
6543 *
6544 * If current_is_member is not NULL, it is set to 'true' if the current
6545 * transaction is a member of the given multixact.
6546 */
6547static bool
6548DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask,
6549 LockTupleMode lockmode, bool *current_is_member)
6550{
6551 int nmembers;
6552 MultiXactMember *members;
6553 bool result = false;
6554 LOCKMODE wanted = tupleLockExtraInfo[lockmode].hwlock;
6555
6556 if (HEAP_LOCKED_UPGRADED(infomask))
6557 return false;
6558
6559 nmembers = GetMultiXactIdMembers(multi, &members, false,
6560 HEAP_XMAX_IS_LOCKED_ONLY(infomask));
6561 if (nmembers >= 0)
6562 {
6563 int i;
6564
6565 for (i = 0; i < nmembers; i++)
6566 {
6567 TransactionId memxid;
6568 LOCKMODE memlockmode;
6569
6570 if (result && (current_is_member == NULL || *current_is_member))
6571 break;
6572
6573 memlockmode = LOCKMODE_from_mxstatus(members[i].status);
6574
6575 /* ignore members from current xact (but track their presence) */
6576 memxid = members[i].xid;
6577 if (TransactionIdIsCurrentTransactionId(memxid))
6578 {
6579 if (current_is_member != NULL)
6580 *current_is_member = true;
6581 continue;
6582 }
6583 else if (result)
6584 continue;
6585
6586 /* ignore members that don't conflict with the lock we want */
6587 if (!DoLockModesConflict(memlockmode, wanted))
6588 continue;
6589
6590 if (ISUPDATE_from_mxstatus(members[i].status))
6591 {
6592 /* ignore aborted updaters */
6593 if (TransactionIdDidAbort(memxid))
6594 continue;
6595 }
6596 else
6597 {
6598 /* ignore lockers-only that are no longer in progress */
6599 if (!TransactionIdIsInProgress(memxid))
6600 continue;
6601 }
6602
6603 /*
6604			 * Whatever remains are either live lockers that conflict with our
6605			 * wanted lock, or updaters that are not aborted. Those conflict
6606			 * with what we want. Set up to return true, but keep going to
6607 * look for the current transaction among the multixact members,
6608 * if needed.
6609 */
6610 result = true;
6611 }
6612 pfree(members);
6613 }
6614
6615 return result;
6616}
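
/*
 * Illustrative sketch (not compiled): heap_lock_tuple uses the test above to
 * decide whether it can avoid sleeping on a multixact Xmax at all; the
 * surrounding code is abbreviated.
 *
 *		if (!DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
 *									 mode, &current_is_member))
 *		{
 *			(no conflicting member is live: no need to sleep, go ahead and
 *			 mark the tuple as locked by us as well)
 *		}
 *		else
 *			(sleep via MultiXactIdWait, then recheck Xmax from scratch)
 */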
6617
6618/*
6619 * Do_MultiXactIdWait
6620 * Actual implementation for the two functions below.
6621 *
6622 * 'multi', 'status' and 'infomask' indicate what to sleep on (the status is
6623 * needed to ensure we only sleep on conflicting members, and the infomask is
6624 * used to optimize multixact access in case it's a lock-only multi); 'nowait'
6625 * indicates whether to use conditional lock acquisition, to allow callers to
6626 * fail if lock is unavailable. 'rel', 'ctid' and 'oper' are used to set up
6627 * context information for error messages. 'remaining', if not NULL, receives
6628 * the number of members that are still running, including any (non-aborted)
6629 * subtransactions of our own transaction.
6630 *
6631 * We do this by sleeping on each member using XactLockTableWait. Any
6632 * members that belong to the current backend are *not* waited for, however;
6633 * this would not merely be useless but would lead to Assert failure inside
6634 * XactLockTableWait. By the time this returns, it is certain that all
6635 * transactions *of other backends* that were members of the MultiXactId
6636 * that conflict with the requested status are dead (and no new ones can have
6637 * been added, since it is not legal to add members to an existing
6638 * MultiXactId).
6639 *
6640 * But by the time we finish sleeping, someone else may have changed the Xmax
6641 * of the containing tuple, so the caller needs to iterate on us somehow.
6642 *
6643 * Note that in case we return false, the number of remaining members is
6644 * not to be trusted.
6645 */
6646static bool
6647Do_MultiXactIdWait(MultiXactId multi, MultiXactStatus status,
6648 uint16 infomask, bool nowait,
6649 Relation rel, ItemPointer ctid, XLTW_Oper oper,
6650 int *remaining)
6651{
6652 bool result = true;
6653 MultiXactMember *members;
6654 int nmembers;
6655 int remain = 0;
6656
6657 /* for pre-pg_upgrade tuples, no need to sleep at all */
6658 nmembers = HEAP_LOCKED_UPGRADED(infomask) ? -1 :
6659 GetMultiXactIdMembers(multi, &members, false,
6660 HEAP_XMAX_IS_LOCKED_ONLY(infomask));
6661
6662 if (nmembers >= 0)
6663 {
6664 int i;
6665
6666 for (i = 0; i < nmembers; i++)
6667 {
6668 TransactionId memxid = members[i].xid;
6669 MultiXactStatus memstatus = members[i].status;
6670
6671 if (TransactionIdIsCurrentTransactionId(memxid))
6672 {
6673 remain++;
6674 continue;
6675 }
6676
6677 if (!DoLockModesConflict(LOCKMODE_from_mxstatus(memstatus),
6678 LOCKMODE_from_mxstatus(status)))
6679 {
6680 if (remaining && TransactionIdIsInProgress(memxid))
6681 remain++;
6682 continue;
6683 }
6684
6685 /*
6686 * This member conflicts with our multi, so we have to sleep (or
6687 * return failure, if asked to avoid waiting.)
6688 *
6689 * Note that we don't set up an error context callback ourselves,
6690 * but instead we pass the info down to XactLockTableWait. This
6691 * might seem a bit wasteful because the context is set up and
6692			 * torn down for each member of the multixact, but in reality it
6693 * should be barely noticeable, and it avoids duplicate code.
6694 */
6695 if (nowait)
6696 {
6697 result = ConditionalXactLockTableWait(memxid);
6698 if (!result)
6699 break;
6700 }
6701 else
6702 XactLockTableWait(memxid, rel, ctid, oper);
6703 }
6704
6705 pfree(members);
6706 }
6707
6708 if (remaining)
6709 *remaining = remain;
6710
6711 return result;
6712}
6713
6714/*
6715 * MultiXactIdWait
6716 * Sleep on a MultiXactId.
6717 *
6718 * By the time we finish sleeping, someone else may have changed the Xmax
6719 * of the containing tuple, so the caller needs to iterate on us somehow.
6720 *
6721 * We return (in *remaining, if not NULL) the number of members that are still
6722 * running, including any (non-aborted) subtransactions of our own transaction.
6723 */
6724static void
6725MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask,
6726 Relation rel, ItemPointer ctid, XLTW_Oper oper,
6727 int *remaining)
6728{
6729 (void) Do_MultiXactIdWait(multi, status, infomask, false,
6730 rel, ctid, oper, remaining);
6731}
6732
6733/*
6734 * ConditionalMultiXactIdWait
6735 * As above, but only lock if we can get the lock without blocking.
6736 *
6737 * By the time we finish sleeping, someone else may have changed the Xmax
6738 * of the containing tuple, so the caller needs to iterate on us somehow.
6739 *
6740 * Returns true if the multixact is now all gone; returns false if some
6741 * transactions might still be running.
6742 *
6743 * We return (in *remaining, if not NULL) the number of members that are still
6744 * running, including any (non-aborted) subtransactions of our own transaction.
6745 */
6746static bool
6747ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status,
6748 uint16 infomask, Relation rel, int *remaining)
6749{
6750 return Do_MultiXactIdWait(multi, status, infomask, true,
6751 rel, NULL, XLTW_None, remaining);
6752}
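
/*
 * Illustrative sketch (not compiled): callers must not sleep while holding
 * the buffer lock, and must recheck Xmax afterwards because it may have
 * changed while we slept.  This mirrors the pattern in heap_delete and
 * heap_update; the recheck shown here is abbreviated.
 *
 *		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
 *		MultiXactIdWait((MultiXactId) xwait, mxact_status, infomask,
 *						relation, &tp.t_self, XLTW_Delete, NULL);
 *		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 *
 *		if (xmax_infomask_changed(tp.t_data->t_infomask, infomask) ||
 *			!TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data), xwait))
 *			goto l1;				(start over from the top)
 */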
6753
6754/*
6755 * heap_tuple_needs_eventual_freeze
6756 *
6757 * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
6758 * will eventually require freezing. Similar to heap_tuple_needs_freeze,
6759 * but there's no cutoff, since we're trying to figure out whether freezing
6760 * will ever be needed, not whether it's needed now.
6761 */
6762bool
6763heap_tuple_needs_eventual_freeze(HeapTupleHeader tuple)
6764{
6765 TransactionId xid;
6766
6767 /*
6768 * If xmin is a normal transaction ID, this tuple is definitely not
6769 * frozen.
6770 */
6771 xid = HeapTupleHeaderGetXmin(tuple);
6772 if (TransactionIdIsNormal(xid))
6773 return true;
6774
6775 /*
6776 * If xmax is a valid xact or multixact, this tuple is also not frozen.
6777 */
6778 if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
6779 {
6780 MultiXactId multi;
6781
6782 multi = HeapTupleHeaderGetRawXmax(tuple);
6783 if (MultiXactIdIsValid(multi))
6784 return true;
6785 }
6786 else
6787 {
6788 xid = HeapTupleHeaderGetRawXmax(tuple);
6789 if (TransactionIdIsNormal(xid))
6790 return true;
6791 }
6792
6793 if (tuple->t_infomask & HEAP_MOVED)
6794 {
6795 xid = HeapTupleHeaderGetXvac(tuple);
6796 if (TransactionIdIsNormal(xid))
6797 return true;
6798 }
6799
6800 return false;
6801}
6802
6803/*
6804 * heap_tuple_needs_freeze
6805 *
6806 * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
6807 * are older than the specified cutoff XID or MultiXactId. If so, return true.
6808 *
6809 * It doesn't matter whether the tuple is alive or dead, we are checking
6810 * to see if a tuple needs to be removed or frozen to avoid wraparound.
6811 *
6812 * NB: Cannot rely on hint bits here; they might not be set after a crash or
6813 * on a standby.
6814 */
6815bool
6816heap_tuple_needs_freeze(HeapTupleHeader tuple, TransactionId cutoff_xid,
6817 MultiXactId cutoff_multi, Buffer buf)
6818{
6819 TransactionId xid;
6820
6821 xid = HeapTupleHeaderGetXmin(tuple);
6822 if (TransactionIdIsNormal(xid) &&
6823 TransactionIdPrecedes(xid, cutoff_xid))
6824 return true;
6825
6826 /*
6827 * The considerations for multixacts are complicated; look at
6828 * heap_prepare_freeze_tuple for justifications. This routine had better
6829 * be in sync with that one!
6830 */
6831 if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
6832 {
6833 MultiXactId multi;
6834
6835 multi = HeapTupleHeaderGetRawXmax(tuple);
6836 if (!MultiXactIdIsValid(multi))
6837 {
6838 /* no xmax set, ignore */
6839 ;
6840 }
6841 else if (HEAP_LOCKED_UPGRADED(tuple->t_infomask))
6842 return true;
6843 else if (MultiXactIdPrecedes(multi, cutoff_multi))
6844 return true;
6845 else
6846 {
6847 MultiXactMember *members;
6848 int nmembers;
6849 int i;
6850
6851 /* need to check whether any member of the mxact is too old */
6852
6853 nmembers = GetMultiXactIdMembers(multi, &members, false,
6854 HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask));
6855
6856 for (i = 0; i < nmembers; i++)
6857 {
6858 if (TransactionIdPrecedes(members[i].xid, cutoff_xid))
6859 {
6860 pfree(members);
6861 return true;
6862 }
6863 }
6864 if (nmembers > 0)
6865 pfree(members);
6866 }
6867 }
6868 else
6869 {
6870 xid = HeapTupleHeaderGetRawXmax(tuple);
6871 if (TransactionIdIsNormal(xid) &&
6872 TransactionIdPrecedes(xid, cutoff_xid))
6873 return true;
6874 }
6875
6876 if (tuple->t_infomask & HEAP_MOVED)
6877 {
6878 xid = HeapTupleHeaderGetXvac(tuple);
6879 if (TransactionIdIsNormal(xid) &&
6880 TransactionIdPrecedes(xid, cutoff_xid))
6881 return true;
6882 }
6883
6884 return false;
6885}
6886
6887/*
6888 * If 'tuple' contains any visible XID greater than latestRemovedXid,
6889 * ratchet forwards latestRemovedXid to the greatest one found.
6890 * This is used as the basis for generating Hot Standby conflicts, so
6891 * if a tuple was never visible then removing it should not conflict
6892 * with queries.
6893 */
6894void
6895HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple,
6896 TransactionId *latestRemovedXid)
6897{
6898 TransactionId xmin = HeapTupleHeaderGetXmin(tuple);
6899 TransactionId xmax = HeapTupleHeaderGetUpdateXid(tuple);
6900 TransactionId xvac = HeapTupleHeaderGetXvac(tuple);
6901
6902 if (tuple->t_infomask & HEAP_MOVED)
6903 {
6904 if (TransactionIdPrecedes(*latestRemovedXid, xvac))
6905 *latestRemovedXid = xvac;
6906 }
6907
6908 /*
6909 * Ignore tuples inserted by an aborted transaction or if the tuple was
6910 * updated/deleted by the inserting transaction.
6911 *
6912 * Look for a committed hint bit, or if no xmin bit is set, check clog.
6913 * This needs to work on both primary and standby, where it is used to
6914 * assess btree delete records.
6915 */
6916 if (HeapTupleHeaderXminCommitted(tuple) ||
6917 (!HeapTupleHeaderXminInvalid(tuple) && TransactionIdDidCommit(xmin)))
6918 {
6919 if (xmax != xmin &&
6920 TransactionIdFollows(xmax, *latestRemovedXid))
6921 *latestRemovedXid = xmax;
6922 }
6923
6924 /* *latestRemovedXid may still be invalid at end */
6925}
6926
6927#ifdef USE_PREFETCH
6928/*
6929 * Helper function for heap_compute_xid_horizon_for_tuples. Issue prefetch
6930 * requests for the number of buffers indicated by prefetch_count. The
6931 * prefetch_state keeps track of all the buffers that we can prefetch and
6932 * which ones have already been prefetched; each call to this function picks
6933 * up where the previous call left off.
6934 */
6935static void
6936xid_horizon_prefetch_buffer(Relation rel,
6937 XidHorizonPrefetchState *prefetch_state,
6938 int prefetch_count)
6939{
6940 BlockNumber cur_hblkno = prefetch_state->cur_hblkno;
6941 int count = 0;
6942 int i;
6943 int nitems = prefetch_state->nitems;
6944 ItemPointerData *tids = prefetch_state->tids;
6945
6946 for (i = prefetch_state->next_item;
6947 i < nitems && count < prefetch_count;
6948 i++)
6949 {
6950 ItemPointer htid = &tids[i];
6951
6952 if (cur_hblkno == InvalidBlockNumber ||
6953 ItemPointerGetBlockNumber(htid) != cur_hblkno)
6954 {
6955 cur_hblkno = ItemPointerGetBlockNumber(htid);
6956 PrefetchBuffer(rel, MAIN_FORKNUM, cur_hblkno);
6957 count++;
6958 }
6959 }
6960
6961 /*
6962 * Save the prefetch position so that next time we can continue from that
6963 * position.
6964 */
6965 prefetch_state->next_item = i;
6966 prefetch_state->cur_hblkno = cur_hblkno;
6967}
6968#endif
6969
6970/*
6971 * Get the latestRemovedXid from the heap pages pointed at by the index
6972 * tuples being deleted.
6973 *
6974 * We used to do this during recovery rather than on the primary, but that
6975 * approach now appears inferior. It meant that the primary could generate
6976 * a lot of work for the standby without any back-pressure to slow down the
6977 * primary, and it required the standby to have reached consistency, whereas
6978 * we want to have correct information available even before that point.
6979 *
6980 * It's possible for this to generate a fair amount of I/O, since we may be
6981 * deleting hundreds of tuples from a single index block. To amortize that
6982 * cost to some degree, this uses prefetching and combines repeat accesses to
6983 * the same block.
6984 */
6985TransactionId
6986heap_compute_xid_horizon_for_tuples(Relation rel,
6987 ItemPointerData *tids,
6988 int nitems)
6989{
6990 TransactionId latestRemovedXid = InvalidTransactionId;
6991 BlockNumber hblkno;
6992 Buffer buf = InvalidBuffer;
6993 Page hpage;
6994#ifdef USE_PREFETCH
6995 XidHorizonPrefetchState prefetch_state;
6996 int io_concurrency;
6997 int prefetch_distance;
6998#endif
6999
7000 /*
7001 * Sort to avoid repeated lookups for the same page, and to make it more
7002 * likely to access items in an efficient order. In particular, this
7003 * ensures that if there are multiple pointers to the same page, they all
7004 * get processed while looking up and locking the page just once.
7005 */
7006 qsort((void *) tids, nitems, sizeof(ItemPointerData),
7007 (int (*) (const void *, const void *)) ItemPointerCompare);
7008
7009#ifdef USE_PREFETCH
7010 /* Initialize prefetch state. */
7011 prefetch_state.cur_hblkno = InvalidBlockNumber;
7012 prefetch_state.next_item = 0;
7013 prefetch_state.nitems = nitems;
7014 prefetch_state.tids = tids;
7015
7016 /*
7017 * Compute the prefetch distance that we will attempt to maintain.
7018 *
7019 * We don't use the regular formula to determine how much to prefetch
7020 * here, but instead just add a constant to effective_io_concurrency.
7021 * That's because it seems best to do some prefetching here even when
7022 * effective_io_concurrency is set to 0, but if the DBA thinks it's OK to
7023 * do more prefetching for other operations, then it's probably OK to do
7024 * more prefetching in this case, too. It may be that this formula is too
7025 * simplistic, but at the moment there is no evidence of that or any idea
7026 * about what would work better.
7027 *
7028 * Since the caller holds a buffer lock somewhere in rel, we'd better make
7029 * sure that isn't a catalog relation before we call code that does
7030 * syscache lookups, to avoid risk of deadlock.
7031 */
7032 if (IsCatalogRelation(rel))
7033 io_concurrency = effective_io_concurrency;
7034 else
7035 io_concurrency = get_tablespace_io_concurrency(rel->rd_rel->reltablespace);
7036 prefetch_distance = Min(io_concurrency + 10, MAX_IO_CONCURRENCY);
7037
7038 /* Start prefetching. */
7039 xid_horizon_prefetch_buffer(rel, &prefetch_state, prefetch_distance);
7040#endif
7041
7042 /* Iterate over all tids, and check their horizon */
7043 hblkno = InvalidBlockNumber;
7044 hpage = NULL;
7045 for (int i = 0; i < nitems; i++)
7046 {
7047 ItemPointer htid = &tids[i];
7048 ItemId hitemid;
7049 OffsetNumber hoffnum;
7050
7051 /*
7052 * Read heap buffer, but avoid refetching if it's the same block as
7053 * required for the last tid.
7054 */
7055 if (hblkno == InvalidBlockNumber ||
7056 ItemPointerGetBlockNumber(htid) != hblkno)
7057 {
7058 /* release old buffer */
7059 if (BufferIsValid(buf))
7060 {
7061 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
7062 ReleaseBuffer(buf);
7063 }
7064
7065 hblkno = ItemPointerGetBlockNumber(htid);
7066
7067 buf = ReadBuffer(rel, hblkno);
7068
7069#ifdef USE_PREFETCH
7070
7071 /*
7072 * To maintain the prefetch distance, prefetch one more page for
7073 * each page we read.
7074 */
7075 xid_horizon_prefetch_buffer(rel, &prefetch_state, 1);
7076#endif
7077
7078 hpage = BufferGetPage(buf);
7079
7080 LockBuffer(buf, BUFFER_LOCK_SHARE);
7081 }
7082
7083 hoffnum = ItemPointerGetOffsetNumber(htid);
7084 hitemid = PageGetItemId(hpage, hoffnum);
7085
7086 /*
7087 * Follow any redirections until we find something useful.
7088 */
7089 while (ItemIdIsRedirected(hitemid))
7090 {
7091 hoffnum = ItemIdGetRedirect(hitemid);
7092 hitemid = PageGetItemId(hpage, hoffnum);
7093 CHECK_FOR_INTERRUPTS();
7094 }
7095
7096 /*
7097 * If the heap item has storage, then read the header and use that to
7098 * set latestRemovedXid.
7099 *
7100 * Some LP_DEAD items may not be accessible, so we ignore them.
7101 */
7102 if (ItemIdHasStorage(hitemid))
7103 {
7104 HeapTupleHeader htuphdr;
7105
7106 htuphdr = (HeapTupleHeader) PageGetItem(hpage, hitemid);
7107
7108 HeapTupleHeaderAdvanceLatestRemovedXid(htuphdr, &latestRemovedXid);
7109 }
7110 else if (ItemIdIsDead(hitemid))
7111 {
7112 /*
7113 * Conjecture: if hitemid is dead then it had xids before the xids
7114 * marked on LP_NORMAL items. So we just ignore this item and move
7115 * on to the next, for the purposes of calculating
7116 * latestRemovedXid.
7117 */
7118 }
7119 else
7120 Assert(!ItemIdIsUsed(hitemid));
7121
7122 }
7123
7124 if (BufferIsValid(buf))
7125 {
7126 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
7127 ReleaseBuffer(buf);
7128 }
7129
7130 /*
7131 * If all heap tuples were LP_DEAD then we will be returning
7132 * InvalidTransactionId here, which avoids conflicts. This matches
7133 * existing logic which assumes that LP_DEAD tuples must already be older
7134 * than the latestRemovedXid on the cleanup record that set them as
7135 * LP_DEAD, hence must already have generated a conflict.
7136 */
7137
7138 return latestRemovedXid;
7139}
7140
7141/*
7142 * Perform XLogInsert to register a heap cleanup info message. These
7143 * messages are sent once per VACUUM and are required because
7144 * of the phasing of removal operations during a lazy VACUUM.
7145 * See the comments for vacuum_log_cleanup_info().
7146 */
7147XLogRecPtr
7148log_heap_cleanup_info(RelFileNode rnode, TransactionId latestRemovedXid)
7149{
7150 xl_heap_cleanup_info xlrec;
7151 XLogRecPtr recptr;
7152
7153 xlrec.node = rnode;
7154 xlrec.latestRemovedXid = latestRemovedXid;
7155
7156 XLogBeginInsert();
7157 XLogRegisterData((char *) &xlrec, SizeOfHeapCleanupInfo);
7158
7159 recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_CLEANUP_INFO);
7160
7161 return recptr;
7162}
7163
7164/*
7165 * Perform XLogInsert for a heap-clean operation. Caller must already
7166 * have modified the buffer and marked it dirty.
7167 *
7168 * Note: prior to Postgres 8.3, the entries in the nowunused[] array were
7169 * zero-based tuple indexes. Now they are one-based like other uses
7170 * of OffsetNumber.
7171 *
7172 * We also include latestRemovedXid, which is the greatest XID present in
7173 * the removed tuples. That allows recovery processing to cancel or wait
7174 * for long standby queries that can still see these tuples.
7175 */
7176XLogRecPtr
7177log_heap_clean(Relation reln, Buffer buffer,
7178 OffsetNumber *redirected, int nredirected,
7179 OffsetNumber *nowdead, int ndead,
7180 OffsetNumber *nowunused, int nunused,
7181 TransactionId latestRemovedXid)
7182{
7183 xl_heap_clean xlrec;
7184 XLogRecPtr recptr;
7185
7186 /* Caller should not call me on a non-WAL-logged relation */
7187 Assert(RelationNeedsWAL(reln));
7188
7189 xlrec.latestRemovedXid = latestRemovedXid;
7190 xlrec.nredirected = nredirected;
7191 xlrec.ndead = ndead;
7192
7193 XLogBeginInsert();
7194 XLogRegisterData((char *) &xlrec, SizeOfHeapClean);
7195
7196 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
7197
7198 /*
7199 * The OffsetNumber arrays are not actually in the buffer, but we pretend
7200 * that they are. When XLogInsert stores the whole buffer, the offset
7201 * arrays need not be stored too. Note that even if all three arrays are
7202 * empty, we want to expose the buffer as a candidate for whole-page
7203 * storage, since this record type implies a defragmentation operation
7204 * even if no line pointers changed state.
7205 */
7206 if (nredirected > 0)
7207 XLogRegisterBufData(0, (char *) redirected,
7208 nredirected * sizeof(OffsetNumber) * 2);
7209
7210 if (ndead > 0)
7211 XLogRegisterBufData(0, (char *) nowdead,
7212 ndead * sizeof(OffsetNumber));
7213
7214 if (nunused > 0)
7215 XLogRegisterBufData(0, (char *) nowunused,
7216 nunused * sizeof(OffsetNumber));
7217
7218 recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_CLEAN);
7219
7220 return recptr;
7221}
7222
7223/*
7224 * Perform XLogInsert for a heap-freeze operation. Caller must have already
7225 * modified the buffer and marked it dirty.
7226 */
7227XLogRecPtr
7228log_heap_freeze(Relation reln, Buffer buffer, TransactionId cutoff_xid,
7229 xl_heap_freeze_tuple *tuples, int ntuples)
7230{
7231 xl_heap_freeze_page xlrec;
7232 XLogRecPtr recptr;
7233
7234 /* Caller should not call me on a non-WAL-logged relation */
7235 Assert(RelationNeedsWAL(reln));
7236 /* nor when there are no tuples to freeze */
7237 Assert(ntuples > 0);
7238
7239 xlrec.cutoff_xid = cutoff_xid;
7240 xlrec.ntuples = ntuples;
7241
7242 XLogBeginInsert();
7243 XLogRegisterData((char *) &xlrec, SizeOfHeapFreezePage);
7244
7245 /*
7246 * The freeze plan array is not actually in the buffer, but pretend that
7247 * it is. When XLogInsert stores the whole buffer, the freeze plan need
7248 * not be stored too.
7249 */
7250 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
7251 XLogRegisterBufData(0, (char *) tuples,
7252 ntuples * sizeof(xl_heap_freeze_tuple));
7253
7254 recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_FREEZE_PAGE);
7255
7256 return recptr;
7257}
7258
7259/*
7260 * Perform XLogInsert for a heap-visible operation. 'block' is the block
7261 * being marked all-visible, and vm_buffer is the buffer containing the
7262 * corresponding visibility map block. Both should have already been modified
7263 * and dirtied.
7264 *
7265 * If checksums or wal_log_hints are enabled, we may also generate a
7266 * full-page image of heap_buffer, if necessary.
7267 */
7268XLogRecPtr
7269log_heap_visible(RelFileNode rnode, Buffer heap_buffer, Buffer vm_buffer,
7270 TransactionId cutoff_xid, uint8 vmflags)
7271{
7272 xl_heap_visible xlrec;
7273 XLogRecPtr recptr;
7274 uint8 flags;
7275
7276 Assert(BufferIsValid(heap_buffer));
7277 Assert(BufferIsValid(vm_buffer));
7278
7279 xlrec.cutoff_xid = cutoff_xid;
7280 xlrec.flags = vmflags;
7281 XLogBeginInsert();
7282 XLogRegisterData((char *) &xlrec, SizeOfHeapVisible);
7283
7284 XLogRegisterBuffer(0, vm_buffer, 0);
7285
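 /*
 * Register the heap buffer as well, but suppress its full-page image
 * unless checksums or wal_log_hints require torn-page protection for
 * this hint-style change (see XLogHintBitIsNeeded()).
 */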
7286 flags = REGBUF_STANDARD;
7287 if (!XLogHintBitIsNeeded())
7288 flags |= REGBUF_NO_IMAGE;
7289 XLogRegisterBuffer(1, heap_buffer, flags);
7290
7291 recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_VISIBLE);
7292
7293 return recptr;
7294}
7295
7296/*
7297 * Perform XLogInsert for a heap-update operation. Caller must already
7298 * have modified the buffer(s) and marked them dirty.
7299 */
7300static XLogRecPtr
7301log_heap_update(Relation reln, Buffer oldbuf,
7302 Buffer newbuf, HeapTuple oldtup, HeapTuple newtup,
7303 HeapTuple old_key_tuple,
7304 bool all_visible_cleared, bool new_all_visible_cleared)
7305{
7306 xl_heap_update xlrec;
7307 xl_heap_header xlhdr;
7308 xl_heap_header xlhdr_idx;
7309 uint8 info;
7310 uint16 prefix_suffix[2];
7311 uint16 prefixlen = 0,
7312 suffixlen = 0;
7313 XLogRecPtr recptr;
7314 Page page = BufferGetPage(newbuf);
7315 bool need_tuple_data = RelationIsLogicallyLogged(reln);
7316 bool init;
7317 int bufflags;
7318
7319 /* Caller should not call me on a non-WAL-logged relation */
7320 Assert(RelationNeedsWAL(reln));
7321
7322 XLogBeginInsert();
7323
7324 if (HeapTupleIsHeapOnly(newtup))
7325 info = XLOG_HEAP_HOT_UPDATE;
7326 else
7327 info = XLOG_HEAP_UPDATE;
7328
7329 /*
7330 * If the old and new tuple are on the same page, we only need to log the
7331 * parts of the new tuple that were changed. That saves on the amount of
7332 * WAL we need to write. Currently, we just count any unchanged bytes in
7333 * the beginning and end of the tuple. That's quick to check, and
7334 * perfectly covers the common case that only one field is updated.
7335 *
7336 * We could do this even if the old and new tuple are on different pages,
7337 * but only if we don't make a full-page image of the old page, which is
7338 * difficult to know in advance. Also, if the old tuple is corrupt for
7339 * some reason, it would allow the corruption to propagate to the new page,
7340 * so it seems best to avoid. Under the general assumption that most
7341 * updates tend to create the new tuple version on the same page, there
7342 * isn't much to be gained by doing this across pages anyway.
7343 *
7344 * Skip this if we're taking a full-page image of the new page, as we
7345 * don't include the new tuple in the WAL record in that case. Also
7346 * disable if wal_level='logical', as logical decoding needs to be able to
7347 * read the new tuple in whole from the WAL record alone.
7348 */
7349 if (oldbuf == newbuf && !need_tuple_data &&
7350 !XLogCheckBufferNeedsBackup(newbuf))
7351 {
7352 char *oldp = (char *) oldtup->t_data + oldtup->t_data->t_hoff;
7353 char *newp = (char *) newtup->t_data + newtup->t_data->t_hoff;
7354 int oldlen = oldtup->t_len - oldtup->t_data->t_hoff;
7355 int newlen = newtup->t_len - newtup->t_data->t_hoff;
7356
7357 /* Check for common prefix between old and new tuple */
7358 for (prefixlen = 0; prefixlen < Min(oldlen, newlen); prefixlen++)
7359 {
7360 if (newp[prefixlen] != oldp[prefixlen])
7361 break;
7362 }
7363
7364 /*
7365 * Storing the length of the prefix takes 2 bytes, so we need to save
7366 * at least 3 bytes or there's no point.
7367 */
7368 if (prefixlen < 3)
7369 prefixlen = 0;
7370
7371 /* Same for suffix */
7372 for (suffixlen = 0; suffixlen < Min(oldlen, newlen) - prefixlen; suffixlen++)
7373 {
7374 if (newp[newlen - suffixlen - 1] != oldp[oldlen - suffixlen - 1])
7375 break;
7376 }
7377 if (suffixlen < 3)
7378 suffixlen = 0;
7379 }
7380
7381 /* Prepare main WAL data chain */
7382 xlrec.flags = 0;
7383 if (all_visible_cleared)
7384 xlrec.flags |= XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED;
7385 if (new_all_visible_cleared)
7386 xlrec.flags |= XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED;
7387 if (prefixlen > 0)
7388 xlrec.flags |= XLH_UPDATE_PREFIX_FROM_OLD;
7389 if (suffixlen > 0)
7390 xlrec.flags |= XLH_UPDATE_SUFFIX_FROM_OLD;
7391 if (need_tuple_data)
7392 {
7393 xlrec.flags |= XLH_UPDATE_CONTAINS_NEW_TUPLE;
7394 if (old_key_tuple)
7395 {
7396 if (reln->rd_rel->relreplident == REPLICA_IDENTITY_FULL)
7397 xlrec.flags |= XLH_UPDATE_CONTAINS_OLD_TUPLE;
7398 else
7399 xlrec.flags |= XLH_UPDATE_CONTAINS_OLD_KEY;
7400 }
7401 }
7402
7403 /* If the new tuple is the first and only tuple on the page... */
7404 if (ItemPointerGetOffsetNumber(&(newtup->t_self)) == FirstOffsetNumber &&
7405 PageGetMaxOffsetNumber(page) == FirstOffsetNumber)
7406 {
7407 info |= XLOG_HEAP_INIT_PAGE;
7408 init = true;
7409 }
7410 else
7411 init = false;
7412
7413 /* Prepare WAL data for the old page */
7414 xlrec.old_offnum = ItemPointerGetOffsetNumber(&oldtup->t_self);
7415 xlrec.old_xmax = HeapTupleHeaderGetRawXmax(oldtup->t_data);
7416 xlrec.old_infobits_set = compute_infobits(oldtup->t_data->t_infomask,
7417 oldtup->t_data->t_infomask2);
7418
7419 /* Prepare WAL data for the new page */
7420 xlrec.new_offnum = ItemPointerGetOffsetNumber(&newtup->t_self);
7421 xlrec.new_xmax = HeapTupleHeaderGetRawXmax(newtup->t_data);
7422
7423 bufflags = REGBUF_STANDARD;
7424 if (init)
7425 bufflags |= REGBUF_WILL_INIT;
7426 if (need_tuple_data)
7427 bufflags |= REGBUF_KEEP_DATA;
7428
7429 XLogRegisterBuffer(0, newbuf, bufflags);
7430 if (oldbuf != newbuf)
7431 XLogRegisterBuffer(1, oldbuf, REGBUF_STANDARD);
7432
7433 XLogRegisterData((char *) &xlrec, SizeOfHeapUpdate);
7434
7435 /*
7436 * Prepare WAL data for the new tuple.
7437 */
7438 if (prefixlen > 0 || suffixlen > 0)
7439 {
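 /*
 * Prefix/suffix compression was applied above: store the lengths (one
 * or two uint16s) so that redo can recover the omitted bytes from the
 * old tuple version, which is on the same page.
 */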
7440 if (prefixlen > 0 && suffixlen > 0)
7441 {
7442 prefix_suffix[0] = prefixlen;
7443 prefix_suffix[1] = suffixlen;
7444 XLogRegisterBufData(0, (char *) &prefix_suffix, sizeof(uint16) * 2);
7445 }
7446 else if (prefixlen > 0)
7447 {
7448 XLogRegisterBufData(0, (char *) &prefixlen, sizeof(uint16));
7449 }
7450 else
7451 {
7452 XLogRegisterBufData(0, (char *) &suffixlen, sizeof(uint16));
7453 }
7454 }
7455
7456 xlhdr.t_infomask2 = newtup->t_data->t_infomask2;
7457 xlhdr.t_infomask = newtup->t_data->t_infomask;
7458 xlhdr.t_hoff = newtup->t_data->t_hoff;
7459 Assert(SizeofHeapTupleHeader + prefixlen + suffixlen <= newtup->t_len);
7460
7461 /*
7462 * PG73FORMAT: write bitmap [+ padding] [+ oid] + data
7463 *
7464 * The 'data' doesn't include the common prefix or suffix.
7465 */
7466 XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader);
7467 if (prefixlen == 0)
7468 {
7469 XLogRegisterBufData(0,
7470 ((char *) newtup->t_data) + SizeofHeapTupleHeader,
7471 newtup->t_len - SizeofHeapTupleHeader - suffixlen);
7472 }
7473 else
7474 {
7475 /*
7476 * Have to write the null bitmap and data after the common prefix as
7477 * two separate rdata entries.
7478 */
7479 /* bitmap [+ padding] [+ oid] */
7480 if (newtup->t_data->t_hoff - SizeofHeapTupleHeader > 0)
7481 {
7482 XLogRegisterBufData(0,
7483 ((char *) newtup->t_data) + SizeofHeapTupleHeader,
7484 newtup->t_data->t_hoff - SizeofHeapTupleHeader);
7485 }
7486
7487 /* data after common prefix */
7488 XLogRegisterBufData(0,
7489 ((char *) newtup->t_data) + newtup->t_data->t_hoff + prefixlen,
7490 newtup->t_len - newtup->t_data->t_hoff - prefixlen - suffixlen);
7491 }
7492
7493 /* We need to log a tuple identity */
7494 if (need_tuple_data && old_key_tuple)
7495 {
7496 /* don't really need this, but it's more convenient to decode */
7497 xlhdr_idx.t_infomask2 = old_key_tuple->t_data->t_infomask2;
7498 xlhdr_idx.t_infomask = old_key_tuple->t_data->t_infomask;
7499 xlhdr_idx.t_hoff = old_key_tuple->t_data->t_hoff;
7500
7501 XLogRegisterData((char *) &xlhdr_idx, SizeOfHeapHeader);
7502
7503 /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
7504 XLogRegisterData((char *) old_key_tuple->t_data + SizeofHeapTupleHeader,
7505 old_key_tuple->t_len - SizeofHeapTupleHeader);
7506 }
7507
7508 /* filtering by origin on a row level is much more efficient */
7509 XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
7510
7511 recptr = XLogInsert(RM_HEAP_ID, info);
7512
7513 return recptr;
7514}
7515
7516/*
7517 * Perform XLogInsert of an XLOG_HEAP2_NEW_CID record
7518 *
7519 * This is only used in wal_level >= WAL_LEVEL_LOGICAL, and only for catalog
7520 * tuples.
7521 */
7522static XLogRecPtr
7523log_heap_new_cid(Relation relation, HeapTuple tup)
7524{
7525 xl_heap_new_cid xlrec;
7526
7527 XLogRecPtr recptr;
7528 HeapTupleHeader hdr = tup->t_data;
7529
7530 Assert(ItemPointerIsValid(&tup->t_self));
7531 Assert(tup->t_tableOid != InvalidOid);
7532
7533 xlrec.top_xid = GetTopTransactionId();
7534 xlrec.target_node = relation->rd_node;
7535 xlrec.target_tid = tup->t_self;
7536
7537 /*
7538 * If the tuple got inserted & deleted in the same TX we definitely have a
7539 * combocid, so set both cmin and cmax.
7540 */
7541 if (hdr->t_infomask & HEAP_COMBOCID)
7542 {
7543 Assert(!(hdr->t_infomask & HEAP_XMAX_INVALID));
7544 Assert(!HeapTupleHeaderXminInvalid(hdr));
7545 xlrec.cmin = HeapTupleHeaderGetCmin(hdr);
7546 xlrec.cmax = HeapTupleHeaderGetCmax(hdr);
7547 xlrec.combocid = HeapTupleHeaderGetRawCommandId(hdr);
7548 }
7549 /* No combocid, so only cmin or cmax can be set by this TX */
7550 else
7551 {
7552 /*
7553 * Tuple inserted.
7554 *
7555 * We need to check for LOCK ONLY because multixacts might be
7556 * transferred to the new tuple in case of FOR KEY SHARE updates in
7557 * which case there will be an xmax, although the tuple just got
7558 * inserted.
7559 */
7560 if (hdr->t_infomask & HEAP_XMAX_INVALID ||
7561 HEAP_XMAX_IS_LOCKED_ONLY(hdr->t_infomask))
7562 {
7563 xlrec.cmin = HeapTupleHeaderGetRawCommandId(hdr);
7564 xlrec.cmax = InvalidCommandId;
7565 }
7566 /* Tuple from a different tx updated or deleted. */
7567 else
7568 {
7569 xlrec.cmin = InvalidCommandId;
7570 xlrec.cmax = HeapTupleHeaderGetRawCommandId(hdr);
7571
7572 }
7573 xlrec.combocid = InvalidCommandId;
7574 }
7575
7576 /*
7577 * Note that we don't need to register the buffer here, because this
7578 * operation does not modify the page. The insert/update/delete that
7579 * called us certainly did, but that's WAL-logged separately.
7580 */
7581 XLogBeginInsert();
7582 XLogRegisterData((char *) &xlrec, SizeOfHeapNewCid);
7583
7584 /* will be looked at irrespective of origin */
7585
7586 recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_NEW_CID);
7587
7588 return recptr;
7589}
7590
7591/*
7592 * Build a heap tuple representing the configured REPLICA IDENTITY to identify
7593 * the old tuple in an UPDATE or DELETE.
7594 *
7595 * Returns NULL if there's no need to log an identity or if there's no suitable
7596 * key defined.
7597 *
7598 * key_changed should be false if caller knows that no replica identity
7599 * columns changed value. It's always true in the DELETE case.
7600 *
7601 * *copy is set to true if the returned tuple is a modified copy rather than
7602 * the same tuple that was passed in.
7603 */
7604static HeapTuple
7605ExtractReplicaIdentity(Relation relation, HeapTuple tp, bool key_changed,
7606 bool *copy)
7607{
7608 TupleDesc desc = RelationGetDescr(relation);
7609 char replident = relation->rd_rel->relreplident;
7610 Bitmapset *idattrs;
7611 HeapTuple key_tuple;
7612 bool nulls[MaxHeapAttributeNumber];
7613 Datum values[MaxHeapAttributeNumber];
7614
7615 *copy = false;
7616
7617 if (!RelationIsLogicallyLogged(relation))
7618 return NULL;
7619
7620 if (replident == REPLICA_IDENTITY_NOTHING)
7621 return NULL;
7622
7623 if (replident == REPLICA_IDENTITY_FULL)
7624 {
7625 /*
7626 * When logging the entire old tuple, it very well could contain
7627 * toasted columns. If so, force them to be inlined.
7628 */
7629 if (HeapTupleHasExternal(tp))
7630 {
7631 *copy = true;
7632 tp = toast_flatten_tuple(tp, desc);
7633 }
7634 return tp;
7635 }
7636
7637 /* if the key hasn't changed and we're only logging the key, we're done */
7638 if (!key_changed)
7639 return NULL;
7640
7641 /* find out the replica identity columns */
7642 idattrs = RelationGetIndexAttrBitmap(relation,
7643 INDEX_ATTR_BITMAP_IDENTITY_KEY);
7644
7645 /*
7646 * If there are no defined replica identity columns, treat as !key_changed.
7647 * (This case should not be reachable from heap_update, since that should
7648 * calculate key_changed accurately. But heap_delete just passes constant
7649 * true for key_changed, so we can hit this case in deletes.)
7650 */
7651 if (bms_is_empty(idattrs))
7652 return NULL;
7653
7654 /*
7655 * Construct a new tuple containing only the replica identity columns,
7656 * with nulls elsewhere. While we're at it, assert that the replica
7657 * identity columns aren't null.
7658 */
7659 heap_deform_tuple(tp, desc, values, nulls);
7660
7661 for (int i = 0; i < desc->natts; i++)
7662 {
7663 if (bms_is_member(i + 1 - FirstLowInvalidHeapAttributeNumber,
7664 idattrs))
7665 Assert(!nulls[i]);
7666 else
7667 nulls[i] = true;
7668 }
7669
7670 key_tuple = heap_form_tuple(desc, values, nulls);
7671 *copy = true;
7672
7673 bms_free(idattrs);
7674
7675 /*
7676 * If the tuple, which by now contains only the replica identity columns,
7677 * still has toasted columns, force them to be inlined. This is somewhat
7678 * unlikely since there are limits on the size of indexed columns, so we
7679 * don't duplicate toast_flatten_tuple()'s functionality in the above loop
7680 * over the indexed columns, even if it would be more efficient.
7681 */
7682 if (HeapTupleHasExternal(key_tuple))
7683 {
7684 HeapTuple oldtup = key_tuple;
7685
7686 key_tuple = toast_flatten_tuple(oldtup, desc);
7687 heap_freetuple(oldtup);
7688 }
7689
7690 return key_tuple;
7691}
7692
7693/*
7694 * Handles CLEANUP_INFO
7695 */
7696static void
7697heap_xlog_cleanup_info(XLogReaderState *record)
7698{
7699 xl_heap_cleanup_info *xlrec = (xl_heap_cleanup_info *) XLogRecGetData(record);
7700
7701 if (InHotStandby)
7702 ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, xlrec->node);
7703
7704 /*
7705 * The actual operation is a no-op. The record type exists to provide a means
7706 * for conflict processing to occur before we begin index vacuum actions. See
7707 * vacuumlazy.c and also the comments in btvacuumpage().
7708 */
7709
7710 /* Backup blocks are not used in cleanup_info records */
7711 Assert(!XLogRecHasAnyBlockRefs(record));
7712}
7713
7714/*
7715 * Handles XLOG_HEAP2_CLEAN record type
7716 */
7717static void
7718heap_xlog_clean(XLogReaderState *record)
7719{
7720 XLogRecPtr lsn = record->EndRecPtr;
7721 xl_heap_clean *xlrec = (xl_heap_clean *) XLogRecGetData(record);
7722 Buffer buffer;
7723 RelFileNode rnode;
7724 BlockNumber blkno;
7725 XLogRedoAction action;
7726
7727 XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno);
7728
7729 /*
7730 * We're about to remove tuples. In Hot Standby mode, ensure that there are
7731 * no queries running for which the removed tuples are still visible.
7732 *
7733 * Not all HEAP2_CLEAN records remove tuples with xids, so we only want to
7734 * conflict on the records that cause MVCC failures for user queries. If
7735 * latestRemovedXid is invalid, skip conflict processing.
7736 */
7737 if (InHotStandby && TransactionIdIsValid(xlrec->latestRemovedXid))
7738 ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, rnode);
7739
7740 /*
7741 * If we have a full-page image, restore it (using a cleanup lock) and
7742 * we're done.
7743 */
7744 action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true,
7745 &buffer);
7746 if (action == BLK_NEEDS_REDO)
7747 {
7748 Page page = (Page) BufferGetPage(buffer);
7749 OffsetNumber *end;
7750 OffsetNumber *redirected;
7751 OffsetNumber *nowdead;
7752 OffsetNumber *nowunused;
7753 int nredirected;
7754 int ndead;
7755 int nunused;
7756 Size datalen;
7757
7758 redirected = (OffsetNumber *) XLogRecGetBlockData(record, 0, &datalen);
7759
7760 nredirected = xlrec->nredirected;
7761 ndead = xlrec->ndead;
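 /*
 * The registered block data consists of nredirected pairs of (from, to)
 * offsets, then ndead offsets, with the now-unused offsets filling the
 * rest; hence the pointer arithmetic below.
 */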
7762 end = (OffsetNumber *) ((char *) redirected + datalen);
7763 nowdead = redirected + (nredirected * 2);
7764 nowunused = nowdead + ndead;
7765 nunused = (end - nowunused);
7766 Assert(nunused >= 0);
7767
7768 /* Update all line pointers per the record, and repair fragmentation */
7769 heap_page_prune_execute(buffer,
7770 redirected, nredirected,
7771 nowdead, ndead,
7772 nowunused, nunused);
7773
7774 /*
7775 * Note: we don't worry about updating the page's prunability hints.
7776 * At worst this will cause an extra prune cycle to occur soon.
7777 */
7778
7779 PageSetLSN(page, lsn);
7780 MarkBufferDirty(buffer);
7781 }
7782
7783 if (BufferIsValid(buffer))
7784 {
7785 Size freespace = PageGetHeapFreeSpace(BufferGetPage(buffer));
7786
7787 UnlockReleaseBuffer(buffer);
7788
7789 /*
7790 * After cleaning records from a page, it's useful to update the FSM
7791 * about it, as that may cause the page to become a target for insertions
7792 * later even if vacuum decides not to visit it (which is possible if it
7793 * gets marked all-visible).
7794 *
7795 * Do this regardless of a full-page image being applied, since the
7796 * FSM data is not in the page anyway.
7797 */
7798 XLogRecordPageWithFreeSpace(rnode, blkno, freespace);
7799 }
7800}
7801
7802/*
7803 * Replay XLOG_HEAP2_VISIBLE record.
7804 *
7805 * The critical integrity requirement here is that we must never end up with
7806 * a situation where the visibility map bit is set, and the page-level
7807 * PD_ALL_VISIBLE bit is clear. If that were to occur, then a subsequent
7808 * page modification would fail to clear the visibility map bit.
7809 */
7810static void
7811heap_xlog_visible(XLogReaderState *record)
7812{
7813 XLogRecPtr lsn = record->EndRecPtr;
7814 xl_heap_visible *xlrec = (xl_heap_visible *) XLogRecGetData(record);
7815 Buffer vmbuffer = InvalidBuffer;
7816 Buffer buffer;
7817 Page page;
7818 RelFileNode rnode;
7819 BlockNumber blkno;
7820 XLogRedoAction action;
7821
7822 XLogRecGetBlockTag(record, 1, &rnode, NULL, &blkno);
7823
7824 /*
7825 * If there are any Hot Standby transactions running that have an xmin
7826 * horizon old enough that this page isn't all-visible for them, they
7827 * might incorrectly decide that an index-only scan can skip a heap fetch.
7828 *
7829 * NB: It might be better to throw some kind of "soft" conflict here that
7830 * forces any index-only scan that is in flight to perform heap fetches,
7831 * rather than killing the transaction outright.
7832 */
7833 if (InHotStandby)
7834 ResolveRecoveryConflictWithSnapshot(xlrec->cutoff_xid, rnode);
7835
7836 /*
7837 * Read the heap page, if it still exists. If the heap file has been dropped
7838 * or truncated later in recovery, we don't need to update the page, but we'd
7839 * better still update the visibility map.
7840 */
7841 action = XLogReadBufferForRedo(record, 1, &buffer);
7842 if (action == BLK_NEEDS_REDO)
7843 {
7844 /*
7845 * We don't bump the LSN of the heap page when setting the visibility
7846 * map bit (unless checksums or wal_log_hints are enabled, in which
7847 * case we must), because that would generate an unworkable volume of
7848 * full-page writes. This exposes us to torn page hazards, but since
7849 * we're not inspecting the existing page contents in any way, we
7850 * don't care.
7851 *
7852 * However, all operations that clear the visibility map bit *do* bump
7853 * the LSN, and those operations will only be replayed if the XLOG LSN
7854 * follows the page LSN. Thus, if the page LSN has advanced past our
7855 * XLOG record's LSN, we mustn't mark the page all-visible, because
7856 * the subsequent update won't be replayed to clear the flag.
7857 */
7858 page = BufferGetPage(buffer);
7859
7860 PageSetAllVisible(page);
7861
7862 MarkBufferDirty(buffer);
7863 }
7864 else if (action == BLK_RESTORED)
7865 {
7866 /*
7867 * If heap block was backed up, we already restored it and there's
7868 * nothing more to do. (This can only happen with checksums or
7869 * wal_log_hints enabled.)
7870 */
7871 }
7872
7873 if (BufferIsValid(buffer))
7874 {
7875 Size space = PageGetFreeSpace(BufferGetPage(buffer));
7876
7877 UnlockReleaseBuffer(buffer);
7878
7879 /*
7880 * Since FSM is not WAL-logged and only updated heuristically, it
7881 * easily becomes stale in standbys. If the standby is later promoted
7882 * and runs VACUUM, it will skip updating individual free space
7883 * figures for pages that became all-visible (or all-frozen, depending
7884 * on the vacuum mode), which is troublesome when FreeSpaceMapVacuum
7885 * propagates overly optimistic free-space values to upper FSM layers;
7886 * later inserters try to use such pages only to find out that they
7887 * are unusable. This can cause long stalls when there are many such
7888 * pages.
7889 *
7890 * Forestall those problems by updating FSM's idea about a page that
7891 * is becoming all-visible or all-frozen.
7892 *
7893 * Do this regardless of a full-page image being applied, since the
7894 * FSM data is not in the page anyway.
7895 */
7896 if (xlrec->flags & VISIBILITYMAP_VALID_BITS)
7897 XLogRecordPageWithFreeSpace(rnode, blkno, space);
7898 }
7899
7900 /*
7901 * Even if we skipped the heap page update due to the LSN interlock, it's
7902 * still safe to update the visibility map. Any WAL record that clears
7903 * the visibility map bit does so before checking the page LSN, so any
7904 * bits that need to be cleared will still be cleared.
7905 */
7906 if (XLogReadBufferForRedoExtended(record, 0, RBM_ZERO_ON_ERROR, false,
7907 &vmbuffer) == BLK_NEEDS_REDO)
7908 {
7909 Page vmpage = BufferGetPage(vmbuffer);
7910 Relation reln;
7911
7912 /* initialize the page if it was read as zeros */
7913 if (PageIsNew(vmpage))
7914 PageInit(vmpage, BLCKSZ, 0);
7915
7916 /*
7917 * XLogReadBufferForRedoExtended locked the buffer. But
7918 * visibilitymap_set will handle locking itself.
7919 */
7920 LockBuffer(vmbuffer, BUFFER_LOCK_UNLOCK);
7921
7922 reln = CreateFakeRelcacheEntry(rnode);
7923 visibilitymap_pin(reln, blkno, &vmbuffer);
7924
7925 /*
7926 * Don't set the bit if replay has already passed this point.
7927 *
7928 * It might be safe to do this unconditionally; if replay has passed
7929 * this point, we'll replay at least as far this time as we did
7930 * before, and if this bit needs to be cleared, the record responsible
7931 * for doing so will be replayed again and will clear it. For now,
7932 * out of an abundance of caution, we use the same test here that
7933 * we did for the heap page. If this results in a dropped bit, no
7934 * real harm is done; the next VACUUM will fix it.
7935 */
7936 if (lsn > PageGetLSN(vmpage))
7937 visibilitymap_set(reln, blkno, InvalidBuffer, lsn, vmbuffer,
7938 xlrec->cutoff_xid, xlrec->flags);
7939
7940 ReleaseBuffer(vmbuffer);
7941 FreeFakeRelcacheEntry(reln);
7942 }
7943 else if (BufferIsValid(vmbuffer))
7944 UnlockReleaseBuffer(vmbuffer);
7945}
7946
7947/*
7948 * Replay XLOG_HEAP2_FREEZE_PAGE records
7949 */
7950static void
7951heap_xlog_freeze_page(XLogReaderState *record)
7952{
7953 XLogRecPtr lsn = record->EndRecPtr;
7954 xl_heap_freeze_page *xlrec = (xl_heap_freeze_page *) XLogRecGetData(record);
7955 TransactionId cutoff_xid = xlrec->cutoff_xid;
7956 Buffer buffer;
7957 int ntup;
7958
7959 /*
7960 * In Hot Standby mode, ensure that there are no queries running which still
7961 * consider the frozen xids as running.
7962 */
7963 if (InHotStandby)
7964 {
7965 RelFileNode rnode;
7966 TransactionId latestRemovedXid = cutoff_xid;
7967
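 /*
 * Only xids preceding cutoff_xid are frozen, so the newest xid that
 * could have been frozen away is cutoff_xid - 1.
 */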
7968 TransactionIdRetreat(latestRemovedXid);
7969
7970 XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL);
7971 ResolveRecoveryConflictWithSnapshot(latestRemovedXid, rnode);
7972 }
7973
7974 if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
7975 {
7976 Page page = BufferGetPage(buffer);
7977 xl_heap_freeze_tuple *tuples;
7978
7979 tuples = (xl_heap_freeze_tuple *) XLogRecGetBlockData(record, 0, NULL);
7980
7981 /* now execute freeze plan for each frozen tuple */
7982 for (ntup = 0; ntup < xlrec->ntuples; ntup++)
7983 {
7984 xl_heap_freeze_tuple *xlrec_tp;
7985 ItemId lp;
7986 HeapTupleHeader tuple;
7987
7988 xlrec_tp = &tuples[ntup];
7989 lp = PageGetItemId(page, xlrec_tp->offset); /* offsets are one-based */
7990 tuple = (HeapTupleHeader) PageGetItem(page, lp);
7991
7992 heap_execute_freeze_tuple(tuple, xlrec_tp);
7993 }
7994
7995 PageSetLSN(page, lsn);
7996 MarkBufferDirty(buffer);
7997 }
7998 if (BufferIsValid(buffer))
7999 UnlockReleaseBuffer(buffer);
8000}
8001
8002/*
8003 * Given an "infobits" field from an XLog record, set the correct bits in the
8004 * given infomask and infomask2 for the tuple touched by the record.
8005 *
8006 * (This is the reverse of compute_infobits).
8007 */
8008static void
8009fix_infomask_from_infobits(uint8 infobits, uint16 *infomask, uint16 *infomask2)
8010{
8011 *infomask &= ~(HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY |
8012 HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_EXCL_LOCK);
8013 *infomask2 &= ~HEAP_KEYS_UPDATED;
8014
8015 if (infobits & XLHL_XMAX_IS_MULTI)
8016 *infomask |= HEAP_XMAX_IS_MULTI;
8017 if (infobits & XLHL_XMAX_LOCK_ONLY)
8018 *infomask |= HEAP_XMAX_LOCK_ONLY;
8019 if (infobits & XLHL_XMAX_EXCL_LOCK)
8020 *infomask |= HEAP_XMAX_EXCL_LOCK;
8021 /* note HEAP_XMAX_SHR_LOCK isn't considered here */
8022 if (infobits & XLHL_XMAX_KEYSHR_LOCK)
8023 *infomask |= HEAP_XMAX_KEYSHR_LOCK;
8024
8025 if (infobits & XLHL_KEYS_UPDATED)
8026 *infomask2 |= HEAP_KEYS_UPDATED;
8027}
8028
8029static void
8030heap_xlog_delete(XLogReaderState *record)
8031{
8032 XLogRecPtr lsn = record->EndRecPtr;
8033 xl_heap_delete *xlrec = (xl_heap_delete *) XLogRecGetData(record);
8034 Buffer buffer;
8035 Page page;
8036 ItemId lp = NULL;
8037 HeapTupleHeader htup;
8038 BlockNumber blkno;
8039 RelFileNode target_node;
8040 ItemPointerData target_tid;
8041
8042 XLogRecGetBlockTag(record, 0, &target_node, NULL, &blkno);
8043 ItemPointerSetBlockNumber(&target_tid, blkno);
8044 ItemPointerSetOffsetNumber(&target_tid, xlrec->offnum);
8045
8046 /*
8047 * The visibility map may need to be fixed even if the heap page is
8048 * already up-to-date.
8049 */
8050 if (xlrec->flags & XLH_DELETE_ALL_VISIBLE_CLEARED)
8051 {
8052 Relation reln = CreateFakeRelcacheEntry(target_node);
8053 Buffer vmbuffer = InvalidBuffer;
8054
8055 visibilitymap_pin(reln, blkno, &vmbuffer);
8056 visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS);
8057 ReleaseBuffer(vmbuffer);
8058 FreeFakeRelcacheEntry(reln);
8059 }
8060
8061 if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
8062 {
8063 page = BufferGetPage(buffer);
8064
8065 if (PageGetMaxOffsetNumber(page) >= xlrec->offnum)
8066 lp = PageGetItemId(page, xlrec->offnum);
8067
8068 if (PageGetMaxOffsetNumber(page) < xlrec->offnum || !ItemIdIsNormal(lp))
8069 elog(PANIC, "invalid lp");
8070
8071 htup = (HeapTupleHeader) PageGetItem(page, lp);
8072
8073 htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
8074 htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
8075 HeapTupleHeaderClearHotUpdated(htup);
8076 fix_infomask_from_infobits(xlrec->infobits_set,
8077 &htup->t_infomask, &htup->t_infomask2);
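 /*
 * A super-deletion (aborted speculative insertion) is replayed by
 * invalidating xmin rather than setting xmax, as in
 * heap_abort_speculative().
 */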
8078 if (!(xlrec->flags & XLH_DELETE_IS_SUPER))
8079 HeapTupleHeaderSetXmax(htup, xlrec->xmax);
8080 else
8081 HeapTupleHeaderSetXmin(htup, InvalidTransactionId);
8082 HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
8083
8084 /* Mark the page as a candidate for pruning */
8085 PageSetPrunable(page, XLogRecGetXid(record));
8086
8087 if (xlrec->flags & XLH_DELETE_ALL_VISIBLE_CLEARED)
8088 PageClearAllVisible(page);
8089
8090 /* Make sure t_ctid is set correctly */
8091 if (xlrec->flags & XLH_DELETE_IS_PARTITION_MOVE)
8092 HeapTupleHeaderSetMovedPartitions(htup);
8093 else
8094 htup->t_ctid = target_tid;
8095 PageSetLSN(page, lsn);
8096 MarkBufferDirty(buffer);
8097 }
8098 if (BufferIsValid(buffer))
8099 UnlockReleaseBuffer(buffer);
8100}
8101
8102static void
8103heap_xlog_insert(XLogReaderState *record)
8104{
8105 XLogRecPtr lsn = record->EndRecPtr;
8106 xl_heap_insert *xlrec = (xl_heap_insert *) XLogRecGetData(record);
8107 Buffer buffer;
8108 Page page;
8109 union
8110 {
8111 HeapTupleHeaderData hdr;
8112 char data[MaxHeapTupleSize];
8113 } tbuf;
8114 HeapTupleHeader htup;
8115 xl_heap_header xlhdr;
8116 uint32 newlen;
8117 Size freespace = 0;
8118 RelFileNode target_node;
8119 BlockNumber blkno;
8120 ItemPointerData target_tid;
8121 XLogRedoAction action;
8122
8123 XLogRecGetBlockTag(record, 0, &target_node, NULL, &blkno);
8124 ItemPointerSetBlockNumber(&target_tid, blkno);
8125 ItemPointerSetOffsetNumber(&target_tid, xlrec->offnum);
8126
8127 /*
8128 * The visibility map may need to be fixed even if the heap page is
8129 * already up-to-date.
8130 */
8131 if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
8132 {
8133 Relation reln = CreateFakeRelcacheEntry(target_node);
8134 Buffer vmbuffer = InvalidBuffer;
8135
8136 visibilitymap_pin(reln, blkno, &vmbuffer);
8137 visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS);
8138 ReleaseBuffer(vmbuffer);
8139 FreeFakeRelcacheEntry(reln);
8140 }
8141
8142 /*
8143 * If we inserted the first and only tuple on the page, re-initialize the
8144 * page from scratch.
8145 */
8146 if (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE)
8147 {
8148 buffer = XLogInitBufferForRedo(record, 0);
8149 page = BufferGetPage(buffer);
8150 PageInit(page, BufferGetPageSize(buffer), 0);
8151 action = BLK_NEEDS_REDO;
8152 }
8153 else
8154 action = XLogReadBufferForRedo(record, 0, &buffer);
8155 if (action == BLK_NEEDS_REDO)
8156 {
8157 Size datalen;
8158 char *data;
8159
8160 page = BufferGetPage(buffer);
8161
8162 if (PageGetMaxOffsetNumber(page) + 1 < xlrec->offnum)
8163 elog(PANIC, "invalid max offset number");
8164
8165 data = XLogRecGetBlockData(record, 0, &datalen);
8166
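 /* The block data is an xl_heap_header followed by the tuple data */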
8167 newlen = datalen - SizeOfHeapHeader;
8168 Assert(datalen > SizeOfHeapHeader && newlen <= MaxHeapTupleSize);
8169 memcpy((char *) &xlhdr, data, SizeOfHeapHeader);
8170 data += SizeOfHeapHeader;
8171
8172 htup = &tbuf.hdr;
8173 MemSet((char *) htup, 0, SizeofHeapTupleHeader);
8174 /* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */
8175 memcpy((char *) htup + SizeofHeapTupleHeader,
8176 data,
8177 newlen);
8178 newlen += SizeofHeapTupleHeader;
8179 htup->t_infomask2 = xlhdr.t_infomask2;
8180 htup->t_infomask = xlhdr.t_infomask;
8181 htup->t_hoff = xlhdr.t_hoff;
8182 HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record));
8183 HeapTupleHeaderSetCmin(htup, FirstCommandId);
8184 htup->t_ctid = target_tid;
8185
8186 if (PageAddItem(page, (Item) htup, newlen, xlrec->offnum,
8187 true, true) == InvalidOffsetNumber)
8188 elog(PANIC, "failed to add tuple");
8189
8190 freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */
8191
8192 PageSetLSN(page, lsn);
8193
8194 if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
8195 PageClearAllVisible(page);
8196
8197 MarkBufferDirty(buffer);
8198 }
8199 if (BufferIsValid(buffer))
8200 UnlockReleaseBuffer(buffer);
8201
8202 /*
8203 * If the page is running low on free space, update the FSM as well.
8204 * Arbitrarily, our definition of "low" is less than 20%. We can't do much
8205 * better than that without knowing the fill-factor for the table.
8206 *
8207 * XXX: Don't do this if the page was restored from full page image. We
8208 * don't bother to update the FSM in that case; it doesn't need to be
8209 * totally accurate anyway.
8210 */
8211 if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5)
8212 XLogRecordPageWithFreeSpace(target_node, blkno, freespace);
8213}
8214
8215/*
8216 * Handles MULTI_INSERT record type.
8217 */
8218static void
8219heap_xlog_multi_insert(XLogReaderState *record)
8220{
8221 XLogRecPtr lsn = record->EndRecPtr;
8222 xl_heap_multi_insert *xlrec;
8223 RelFileNode rnode;
8224 BlockNumber blkno;
8225 Buffer buffer;
8226 Page page;
8227 union
8228 {
8229 HeapTupleHeaderData hdr;
8230 char data[MaxHeapTupleSize];
8231 } tbuf;
8232 HeapTupleHeader htup;
8233 uint32 newlen;
8234 Size freespace = 0;
8235 int i;
8236 bool isinit = (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE) != 0;
8237 XLogRedoAction action;
8238
8239 /*
8240 * Insertion doesn't overwrite MVCC data, so no conflict processing is
8241 * required.
8242 */
8243 xlrec = (xl_heap_multi_insert *) XLogRecGetData(record);
8244
8245 XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno);
8246
8247 /*
8248 * The visibility map may need to be fixed even if the heap page is
8249 * already up-to-date.
8250 */
8251 if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
8252 {
8253 Relation reln = CreateFakeRelcacheEntry(rnode);
8254 Buffer vmbuffer = InvalidBuffer;
8255
8256 visibilitymap_pin(reln, blkno, &vmbuffer);
8257 visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS);
8258 ReleaseBuffer(vmbuffer);
8259 FreeFakeRelcacheEntry(reln);
8260 }
8261
8262 if (isinit)
8263 {
8264 buffer = XLogInitBufferForRedo(record, 0);
8265 page = BufferGetPage(buffer);
8266 PageInit(page, BufferGetPageSize(buffer), 0);
8267 action = BLK_NEEDS_REDO;
8268 }
8269 else
8270 action = XLogReadBufferForRedo(record, 0, &buffer);
8271 if (action == BLK_NEEDS_REDO)
8272 {
8273 char *tupdata;
8274 char *endptr;
8275 Size len;
8276
8277 /* Tuples are stored as block data */
8278 tupdata = XLogRecGetBlockData(record, 0, &len);
8279 endptr = tupdata + len;
8280
8281 page = (Page) BufferGetPage(buffer);
8282
8283 for (i = 0; i < xlrec->ntuples; i++)
8284 {
8285 OffsetNumber offnum;
8286 xl_multi_insert_tuple *xlhdr;
8287
8288 /*
8289 * If we're reinitializing the page, the tuples are stored in
8290 * order from FirstOffsetNumber. Otherwise there's an array of
8291 * offsets in the WAL record, and the tuples come after that.
8292 */
8293 if (isinit)
8294 offnum = FirstOffsetNumber + i;
8295 else
8296 offnum = xlrec->offsets[i];
8297 if (PageGetMaxOffsetNumber(page) + 1 < offnum)
8298 elog(PANIC, "invalid max offset number");
8299
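 /*
 * Each tuple is stored as a short-aligned xl_multi_insert_tuple
 * header immediately followed by that tuple's data.
 */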
8300 xlhdr = (xl_multi_insert_tuple *) SHORTALIGN(tupdata);
8301 tupdata = ((char *) xlhdr) + SizeOfMultiInsertTuple;
8302
8303 newlen = xlhdr->datalen;
8304 Assert(newlen <= MaxHeapTupleSize);
8305 htup = &tbuf.hdr;
8306 MemSet((char *) htup, 0, SizeofHeapTupleHeader);
8307 /* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */
8308 memcpy((char *) htup + SizeofHeapTupleHeader,
8309 (char *) tupdata,
8310 newlen);
8311 tupdata += newlen;
8312
8313 newlen += SizeofHeapTupleHeader;
8314 htup->t_infomask2 = xlhdr->t_infomask2;
8315 htup->t_infomask = xlhdr->t_infomask;
8316 htup->t_hoff = xlhdr->t_hoff;
8317 HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record));
8318 HeapTupleHeaderSetCmin(htup, FirstCommandId);
8319 ItemPointerSetBlockNumber(&htup->t_ctid, blkno);
8320 ItemPointerSetOffsetNumber(&htup->t_ctid, offnum);
8321
8322 offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true);
8323 if (offnum == InvalidOffsetNumber)
8324 elog(PANIC, "failed to add tuple");
8325 }
8326 if (tupdata != endptr)
8327 elog(PANIC, "total tuple length mismatch");
8328
8329 freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */
8330
8331 PageSetLSN(page, lsn);
8332
8333 if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
8334 PageClearAllVisible(page);
8335
8336 MarkBufferDirty(buffer);
8337 }
8338 if (BufferIsValid(buffer))
8339 UnlockReleaseBuffer(buffer);
8340
8341 /*
8342 * If the page is running low on free space, update the FSM as well.
8343 * Arbitrarily, our definition of "low" is less than 20%. We can't do much
8344 * better than that without knowing the fill-factor for the table.
8345 *
8346 * XXX: Don't do this if the page was restored from full page image. We
8347 * don't bother to update the FSM in that case; it doesn't need to be
8348 * totally accurate anyway.
8349 */
8350 if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5)
8351 XLogRecordPageWithFreeSpace(rnode, blkno, freespace);
8352}
8353
8354/*
8355 * Handles UPDATE and HOT_UPDATE
8356 */
8357static void
8358heap_xlog_update(XLogReaderState *record, bool hot_update)
8359{
8360 XLogRecPtr lsn = record->EndRecPtr;
8361 xl_heap_update *xlrec = (xl_heap_update *) XLogRecGetData(record);
8362 RelFileNode rnode;
8363 BlockNumber oldblk;
8364 BlockNumber newblk;
8365 ItemPointerData newtid;
8366 Buffer obuffer,
8367 nbuffer;
8368 Page page;
8369 OffsetNumber offnum;
8370 ItemId lp = NULL;
8371 HeapTupleData oldtup;
8372 HeapTupleHeader htup;
8373 uint16 prefixlen = 0,
8374 suffixlen = 0;
8375 char *newp;
8376 union
8377 {
8378 HeapTupleHeaderData hdr;
8379 char data[MaxHeapTupleSize];
8380 } tbuf;
8381 xl_heap_header xlhdr;
8382 uint32 newlen;
8383 Size freespace = 0;
8384 XLogRedoAction oldaction;
8385 XLogRedoAction newaction;
8386
8387 /* initialize to keep the compiler quiet */
8388 oldtup.t_data = NULL;
8389 oldtup.t_len = 0;
8390
8391 XLogRecGetBlockTag(record, 0, &rnode, NULL, &newblk);
8392 if (XLogRecGetBlockTag(record, 1, NULL, NULL, &oldblk))
8393 {
8394 /* HOT updates are never done across pages */
8395 Assert(!hot_update);
8396 }
8397 else
8398 oldblk = newblk;
8399
8400 ItemPointerSet(&newtid, newblk, xlrec->new_offnum);
8401
8402 /*
8403 * The visibility map may need to be fixed even if the heap page is
8404 * already up-to-date.
8405 */
8406 if (xlrec->flags & XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED)
8407 {
8408 Relation reln = CreateFakeRelcacheEntry(rnode);
8409 Buffer vmbuffer = InvalidBuffer;
8410
8411 visibilitymap_pin(reln, oldblk, &vmbuffer);
8412 visibilitymap_clear(reln, oldblk, vmbuffer, VISIBILITYMAP_VALID_BITS);
8413 ReleaseBuffer(vmbuffer);
8414 FreeFakeRelcacheEntry(reln);
8415 }
8416
8417 /*
8418 * In normal operation, it is important to lock the two pages in
8419 * page-number order, to avoid possible deadlocks against other update
8420 * operations going the other way. However, during WAL replay there can
8421 * be no other update happening, so we don't need to worry about that. But
8422 * we *do* need to worry that we don't expose an inconsistent state to Hot
8423 * Standby queries --- so the original page can't be unlocked before we've
8424 * added the new tuple to the new page.
8425 */
8426
8427 /* Deal with old tuple version */
8428 oldaction = XLogReadBufferForRedo(record, (oldblk == newblk) ? 0 : 1,
8429 &obuffer);
8430 if (oldaction == BLK_NEEDS_REDO)
8431 {
8432 page = BufferGetPage(obuffer);
8433 offnum = xlrec->old_offnum;
8434 if (PageGetMaxOffsetNumber(page) >= offnum)
8435 lp = PageGetItemId(page, offnum);
8436
8437 if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
8438 elog(PANIC, "invalid lp");
8439
8440 htup = (HeapTupleHeader) PageGetItem(page, lp);
8441
8442 oldtup.t_data = htup;
8443 oldtup.t_len = ItemIdGetLength(lp);
8444
8445 htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
8446 htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
8447 if (hot_update)
8448 HeapTupleHeaderSetHotUpdated(htup);
8449 else
8450 HeapTupleHeaderClearHotUpdated(htup);
8451 fix_infomask_from_infobits(xlrec->old_infobits_set, &htup->t_infomask,
8452 &htup->t_infomask2);
8453 HeapTupleHeaderSetXmax(htup, xlrec->old_xmax);
8454 HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
8455 /* Set forward chain link in t_ctid */
8456 htup->t_ctid = newtid;
8457
8458 /* Mark the page as a candidate for pruning */
8459 PageSetPrunable(page, XLogRecGetXid(record));
8460
8461 if (xlrec->flags & XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED)
8462 PageClearAllVisible(page);
8463
8464 PageSetLSN(page, lsn);
8465 MarkBufferDirty(obuffer);
8466 }
8467
8468 /*
8469 * Read the page the new tuple goes into, if different from old.
8470 */
8471 if (oldblk == newblk)
8472 {
8473 nbuffer = obuffer;
8474 newaction = oldaction;
8475 }
8476 else if (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE)
8477 {
8478 nbuffer = XLogInitBufferForRedo(record, 0);
8479 page = (Page) BufferGetPage(nbuffer);
8480 PageInit(page, BufferGetPageSize(nbuffer), 0);
8481 newaction = BLK_NEEDS_REDO;
8482 }
8483 else
8484 newaction = XLogReadBufferForRedo(record, 0, &nbuffer);
8485
8486 /*
8487 * The visibility map may need to be fixed even if the heap page is
8488 * already up-to-date.
8489 */
8490 if (xlrec->flags & XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED)
8491 {
8492 Relation reln = CreateFakeRelcacheEntry(rnode);
8493 Buffer vmbuffer = InvalidBuffer;
8494
8495 visibilitymap_pin(reln, newblk, &vmbuffer);
8496 visibilitymap_clear(reln, newblk, vmbuffer, VISIBILITYMAP_VALID_BITS);
8497 ReleaseBuffer(vmbuffer);
8498 FreeFakeRelcacheEntry(reln);
8499 }
8500
8501 /* Deal with new tuple */
8502 if (newaction == BLK_NEEDS_REDO)
8503 {
8504 char *recdata;
8505 char *recdata_end;
8506 Size datalen;
8507 Size tuplen;
8508
8509 recdata = XLogRecGetBlockData(record, 0, &datalen);
8510 recdata_end = recdata + datalen;
8511
8512 page = BufferGetPage(nbuffer);
8513
8514 offnum = xlrec->new_offnum;
8515 if (PageGetMaxOffsetNumber(page) + 1 < offnum)
8516 elog(PANIC, "invalid max offset number");
8517
8518 if (xlrec->flags & XLH_UPDATE_PREFIX_FROM_OLD)
8519 {
8520 Assert(newblk == oldblk);
8521 memcpy(&prefixlen, recdata, sizeof(uint16));
8522 recdata += sizeof(uint16);
8523 }
8524 if (xlrec->flags & XLH_UPDATE_SUFFIX_FROM_OLD)
8525 {
8526 Assert(newblk == oldblk);
8527 memcpy(&suffixlen, recdata, sizeof(uint16));
8528 recdata += sizeof(uint16);
8529 }
8530
8531 memcpy((char *) &xlhdr, recdata, SizeOfHeapHeader);
8532 recdata += SizeOfHeapHeader;
8533
8534 tuplen = recdata_end - recdata;
8535 Assert(tuplen <= MaxHeapTupleSize);
8536
8537 htup = &tbuf.hdr;
8538 MemSet((char *) htup, 0, SizeofHeapTupleHeader);
8539
8540 /*
8541 * Reconstruct the new tuple using the prefix and/or suffix from the
8542 * old tuple, and the data stored in the WAL record.
         */
        newp = (char *) htup + SizeofHeapTupleHeader;
        if (prefixlen > 0)
        {
            int         len;

            /* copy bitmap [+ padding] [+ oid] from WAL record */
            len = xlhdr.t_hoff - SizeofHeapTupleHeader;
            memcpy(newp, recdata, len);
            recdata += len;
            newp += len;

            /* copy prefix from old tuple */
            memcpy(newp, (char *) oldtup.t_data + oldtup.t_data->t_hoff, prefixlen);
            newp += prefixlen;

            /* copy new tuple data from WAL record */
            len = tuplen - (xlhdr.t_hoff - SizeofHeapTupleHeader);
            memcpy(newp, recdata, len);
            recdata += len;
            newp += len;
        }
        else
        {
            /*
             * copy bitmap [+ padding] [+ oid] + data from record, all in one
             * go
             */
            memcpy(newp, recdata, tuplen);
            recdata += tuplen;
            newp += tuplen;
        }
        Assert(recdata == recdata_end);

        /* copy suffix from old tuple */
        if (suffixlen > 0)
            memcpy(newp, (char *) oldtup.t_data + oldtup.t_len - suffixlen, suffixlen);

        newlen = SizeofHeapTupleHeader + tuplen + prefixlen + suffixlen;
        htup->t_infomask2 = xlhdr.t_infomask2;
        htup->t_infomask = xlhdr.t_infomask;
        htup->t_hoff = xlhdr.t_hoff;

        HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record));
        HeapTupleHeaderSetCmin(htup, FirstCommandId);
        HeapTupleHeaderSetXmax(htup, xlrec->new_xmax);
        /* Make sure there is no forward chain link in t_ctid */
        htup->t_ctid = newtid;

        offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true);
        if (offnum == InvalidOffsetNumber)
            elog(PANIC, "failed to add tuple");

        if (xlrec->flags & XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED)
            PageClearAllVisible(page);

        freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */

        PageSetLSN(page, lsn);
        MarkBufferDirty(nbuffer);
    }

    if (BufferIsValid(nbuffer) && nbuffer != obuffer)
        UnlockReleaseBuffer(nbuffer);
    if (BufferIsValid(obuffer))
        UnlockReleaseBuffer(obuffer);

    /*
     * If the new page is running low on free space, update the FSM as well.
     * Arbitrarily, our definition of "low" is less than 20%. We can't do much
     * better than that without knowing the fill-factor for the table.
     *
     * However, don't update the FSM on HOT updates, because after crash
     * recovery, either the old or the new tuple will certainly be dead and
     * prunable. After pruning, the page will have roughly as much free space
     * as it did before the update, assuming the new tuple is about the same
     * size as the old one.
     *
     * XXX: Don't do this if the page was restored from a full-page image. We
     * don't bother to update the FSM in that case; it doesn't need to be
     * totally accurate anyway.
     */
    if (newaction == BLK_NEEDS_REDO && !hot_update && freespace < BLCKSZ / 5)
        XLogRecordPageWithFreeSpace(rnode, newblk, freespace);
}

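/*
 * Replay XLOG_HEAP_CONFIRM: a speculative insertion was confirmed, so the
 * speculative token stored in the tuple's t_ctid is replaced with the
 * tuple's real TID.
 */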
static void
heap_xlog_confirm(XLogReaderState *record)
{
    XLogRecPtr  lsn = record->EndRecPtr;
    xl_heap_confirm *xlrec = (xl_heap_confirm *) XLogRecGetData(record);
    Buffer      buffer;
    Page        page;
    OffsetNumber offnum;
    ItemId      lp = NULL;
    HeapTupleHeader htup;

    if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
    {
        page = BufferGetPage(buffer);

        offnum = xlrec->offnum;
        if (PageGetMaxOffsetNumber(page) >= offnum)
            lp = PageGetItemId(page, offnum);

        if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
            elog(PANIC, "invalid lp");

        htup = (HeapTupleHeader) PageGetItem(page, lp);

        /*
         * Confirm tuple as actually inserted
         */
        ItemPointerSet(&htup->t_ctid, BufferGetBlockNumber(buffer), offnum);

        PageSetLSN(page, lsn);
        MarkBufferDirty(buffer);
    }
    if (BufferIsValid(buffer))
        UnlockReleaseBuffer(buffer);
}

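/*
 * Replay XLOG_HEAP_LOCK: a tuple was locked (e.g. by SELECT ... FOR
 * UPDATE/SHARE).  Reinstall the xmax and infomask bits that record the lock.
 */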
static void
heap_xlog_lock(XLogReaderState *record)
{
    XLogRecPtr  lsn = record->EndRecPtr;
    xl_heap_lock *xlrec = (xl_heap_lock *) XLogRecGetData(record);
    Buffer      buffer;
    Page        page;
    OffsetNumber offnum;
    ItemId      lp = NULL;
    HeapTupleHeader htup;

    /*
     * The visibility map may need to be fixed even if the heap page is
     * already up-to-date.
     */
    if (xlrec->flags & XLH_LOCK_ALL_FROZEN_CLEARED)
    {
        RelFileNode rnode;
        Buffer      vmbuffer = InvalidBuffer;
        BlockNumber block;
        Relation    reln;

        XLogRecGetBlockTag(record, 0, &rnode, NULL, &block);
        reln = CreateFakeRelcacheEntry(rnode);

        visibilitymap_pin(reln, block, &vmbuffer);
        visibilitymap_clear(reln, block, vmbuffer, VISIBILITYMAP_ALL_FROZEN);

        ReleaseBuffer(vmbuffer);
        FreeFakeRelcacheEntry(reln);
    }

    if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
    {
        page = (Page) BufferGetPage(buffer);

        offnum = xlrec->offnum;
        if (PageGetMaxOffsetNumber(page) >= offnum)
            lp = PageGetItemId(page, offnum);

        if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
            elog(PANIC, "invalid lp");

        htup = (HeapTupleHeader) PageGetItem(page, lp);

        htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
        htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
        fix_infomask_from_infobits(xlrec->infobits_set, &htup->t_infomask,
                                   &htup->t_infomask2);

        /*
         * Clear relevant update flags, but only if the modified infomask says
         * there's no update.
         */
        if (HEAP_XMAX_IS_LOCKED_ONLY(htup->t_infomask))
        {
            HeapTupleHeaderClearHotUpdated(htup);
            /* Make sure there is no forward chain link in t_ctid */
            ItemPointerSet(&htup->t_ctid,
                           BufferGetBlockNumber(buffer),
                           offnum);
        }
        HeapTupleHeaderSetXmax(htup, xlrec->locking_xid);
        HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
        PageSetLSN(page, lsn);
        MarkBufferDirty(buffer);
    }
    if (BufferIsValid(buffer))
        UnlockReleaseBuffer(buffer);
}

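/*
 * Replay XLOG_HEAP2_LOCK_UPDATED: a later version of an updated tuple was
 * locked while following its update chain, so install the new xmax and
 * infomask bits on that version as well.
 */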
static void
heap_xlog_lock_updated(XLogReaderState *record)
{
    XLogRecPtr  lsn = record->EndRecPtr;
    xl_heap_lock_updated *xlrec;
    Buffer      buffer;
    Page        page;
    OffsetNumber offnum;
    ItemId      lp = NULL;
    HeapTupleHeader htup;

    xlrec = (xl_heap_lock_updated *) XLogRecGetData(record);

    /*
     * The visibility map may need to be fixed even if the heap page is
     * already up-to-date.
     */
    if (xlrec->flags & XLH_LOCK_ALL_FROZEN_CLEARED)
    {
        RelFileNode rnode;
        Buffer      vmbuffer = InvalidBuffer;
        BlockNumber block;
        Relation    reln;

        XLogRecGetBlockTag(record, 0, &rnode, NULL, &block);
        reln = CreateFakeRelcacheEntry(rnode);

        visibilitymap_pin(reln, block, &vmbuffer);
        visibilitymap_clear(reln, block, vmbuffer, VISIBILITYMAP_ALL_FROZEN);

        ReleaseBuffer(vmbuffer);
        FreeFakeRelcacheEntry(reln);
    }

    if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
    {
        page = BufferGetPage(buffer);

        offnum = xlrec->offnum;
        if (PageGetMaxOffsetNumber(page) >= offnum)
            lp = PageGetItemId(page, offnum);

        if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
            elog(PANIC, "invalid lp");

        htup = (HeapTupleHeader) PageGetItem(page, lp);

        htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
        htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
        fix_infomask_from_infobits(xlrec->infobits_set, &htup->t_infomask,
                                   &htup->t_infomask2);
        HeapTupleHeaderSetXmax(htup, xlrec->xmax);

        PageSetLSN(page, lsn);
        MarkBufferDirty(buffer);
    }
    if (BufferIsValid(buffer))
        UnlockReleaseBuffer(buffer);
}

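/*
 * Replay XLOG_HEAP_INPLACE: a tuple was overwritten in place (used for
 * system catalog updates that must not change the tuple's length or TID).
 */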
static void
heap_xlog_inplace(XLogReaderState *record)
{
    XLogRecPtr  lsn = record->EndRecPtr;
    xl_heap_inplace *xlrec = (xl_heap_inplace *) XLogRecGetData(record);
    Buffer      buffer;
    Page        page;
    OffsetNumber offnum;
    ItemId      lp = NULL;
    HeapTupleHeader htup;
    uint32      oldlen;
    Size        newlen;

    if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
    {
        char       *newtup = XLogRecGetBlockData(record, 0, &newlen);

        page = BufferGetPage(buffer);

        offnum = xlrec->offnum;
        if (PageGetMaxOffsetNumber(page) >= offnum)
            lp = PageGetItemId(page, offnum);

        if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
            elog(PANIC, "invalid lp");

        htup = (HeapTupleHeader) PageGetItem(page, lp);

        oldlen = ItemIdGetLength(lp) - htup->t_hoff;
        if (oldlen != newlen)
            elog(PANIC, "wrong tuple length");

        memcpy((char *) htup + htup->t_hoff, newtup, newlen);

        PageSetLSN(page, lsn);
        MarkBufferDirty(buffer);
    }
    if (BufferIsValid(buffer))
        UnlockReleaseBuffer(buffer);
}

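/*
 * Redo handler for the "heap" resource manager: dispatch to the routine
 * matching the record's opcode.
 */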
void
heap_redo(XLogReaderState *record)
{
    uint8       info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;

    /*
     * These operations don't overwrite MVCC data, so no conflict processing
     * is required. The ones in the heap2 rmgr do.
     */

    switch (info & XLOG_HEAP_OPMASK)
    {
        case XLOG_HEAP_INSERT:
            heap_xlog_insert(record);
            break;
        case XLOG_HEAP_DELETE:
            heap_xlog_delete(record);
            break;
        case XLOG_HEAP_UPDATE:
            heap_xlog_update(record, false);
            break;
        case XLOG_HEAP_TRUNCATE:

            /*
             * TRUNCATE is a no-op because the actions are already logged as
             * SMGR WAL records. The TRUNCATE WAL record exists only for
             * logical decoding.
             */
            break;
        case XLOG_HEAP_HOT_UPDATE:
            heap_xlog_update(record, true);
            break;
        case XLOG_HEAP_CONFIRM:
            heap_xlog_confirm(record);
            break;
        case XLOG_HEAP_LOCK:
            heap_xlog_lock(record);
            break;
        case XLOG_HEAP_INPLACE:
            heap_xlog_inplace(record);
            break;
        default:
            elog(PANIC, "heap_redo: unknown op code %u", info);
    }
}

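/*
 * Redo handler for the "heap2" resource manager, which carries the heap
 * record types that can require standby conflict processing.
 */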
void
heap2_redo(XLogReaderState *record)
{
    uint8       info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;

    switch (info & XLOG_HEAP_OPMASK)
    {
        case XLOG_HEAP2_CLEAN:
            heap_xlog_clean(record);
            break;
        case XLOG_HEAP2_FREEZE_PAGE:
            heap_xlog_freeze_page(record);
            break;
        case XLOG_HEAP2_CLEANUP_INFO:
            heap_xlog_cleanup_info(record);
            break;
        case XLOG_HEAP2_VISIBLE:
            heap_xlog_visible(record);
            break;
        case XLOG_HEAP2_MULTI_INSERT:
            heap_xlog_multi_insert(record);
            break;
        case XLOG_HEAP2_LOCK_UPDATED:
            heap_xlog_lock_updated(record);
            break;
        case XLOG_HEAP2_NEW_CID:

            /*
             * Nothing to do on a real replay, only used during logical
             * decoding.
             */
            break;
        case XLOG_HEAP2_REWRITE:
            heap_xlog_logical_rewrite(record);
            break;
        default:
            elog(PANIC, "heap2_redo: unknown op code %u", info);
    }
}

/*
 * heap_sync - sync a heap, for use when no WAL has been written
 *
 * This forces the heap contents (including TOAST heap if any) down to disk.
 * If we skipped using WAL, and WAL is otherwise needed, we must force the
 * relation down to disk before it's safe to commit the transaction. This
 * requires writing out any dirty buffers and then doing a forced fsync.
 *
 * Indexes are not touched. (Currently, index operations associated with
 * the commands that use this are WAL-logged and so do not need fsync.
 * That behavior might change someday, but in any case it's likely that
 * any fsync decisions required would be per-index and hence not appropriate
 * to be done here.)
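 *
 * As an illustrative sketch only (not taken from any particular caller), a
 * bulk load that chose to skip WAL would end up doing roughly:
 *
 *		heap_insert(rel, tup, cid, HEAP_INSERT_SKIP_WAL, bistate);
 *		...
 *		heap_sync(rel);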
 */
void
heap_sync(Relation rel)
{
    /* non-WAL-logged tables never need fsync */
    if (!RelationNeedsWAL(rel))
        return;

    /* main heap */
    FlushRelationBuffers(rel);
    /* FlushRelationBuffers will have opened rd_smgr */
    smgrimmedsync(rel->rd_smgr, MAIN_FORKNUM);

    /* FSM is not critical, don't bother syncing it */

    /* toast heap, if any */
    if (OidIsValid(rel->rd_rel->reltoastrelid))
    {
        Relation    toastrel;

        toastrel = table_open(rel->rd_rel->reltoastrelid, AccessShareLock);
        FlushRelationBuffers(toastrel);
        smgrimmedsync(toastrel->rd_smgr, MAIN_FORKNUM);
        table_close(toastrel, AccessShareLock);
    }
}

/*
 * Mask a heap page before performing consistency checks on it.
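 *
 * This is the rm_mask callback for the heap resource managers: when
 * wal_consistency_checking is enabled, fields that may legitimately differ
 * between primary and standby (hint bits, unused space, command ids, and so
 * on) are overwritten with a fixed pattern before the page images are
 * compared.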
 */
void
heap_mask(char *pagedata, BlockNumber blkno)
{
    Page        page = (Page) pagedata;
    OffsetNumber off;

    mask_page_lsn_and_checksum(page);

    mask_page_hint_bits(page);
    mask_unused_space(page);

    for (off = 1; off <= PageGetMaxOffsetNumber(page); off++)
    {
        ItemId      iid = PageGetItemId(page, off);
        char       *page_item;

        page_item = (char *) (page + ItemIdGetOffset(iid));

        if (ItemIdIsNormal(iid))
        {
            HeapTupleHeader page_htup = (HeapTupleHeader) page_item;

            /*
             * If xmin of a tuple is not yet frozen, we should ignore
             * differences in hint bits, since they can be set without
             * emitting WAL.
             */
            if (!HeapTupleHeaderXminFrozen(page_htup))
                page_htup->t_infomask &= ~HEAP_XACT_MASK;
            else
            {
                /* We still need to mask the xmax hint bits. */
                page_htup->t_infomask &= ~HEAP_XMAX_INVALID;
                page_htup->t_infomask &= ~HEAP_XMAX_COMMITTED;
            }

            /*
             * During replay, we set Command Id to FirstCommandId. Hence, mask
             * it. See heap_xlog_insert() for details.
             */
            page_htup->t_choice.t_heap.t_field3.t_cid = MASK_MARKER;

            /*
             * For a speculative tuple, heap_insert() does not set ctid in the
             * caller-passed heap tuple itself, leaving the ctid field to
             * contain a speculative token value - a per-backend monotonically
             * increasing identifier. Besides, it does not WAL-log ctid under
             * any circumstances.
             *
             * During redo, heap_xlog_insert() sets t_ctid to the current
             * block number and self offset number. It doesn't care about any
             * speculative insertions on the master. Hence, we set t_ctid to
             * the current block number and self offset number to ignore any
             * inconsistency.
             */
            if (HeapTupleHeaderIsSpeculative(page_htup))
                ItemPointerSet(&page_htup->t_ctid, blkno, off);

            /*
             * NB: Not ignoring ctid changes due to the tuple having moved
             * (i.e. HeapTupleHeaderIndicatesMovedPartitions), because that's
             * important information that needs to be in-sync between primary
             * and standby, and thus is WAL logged.
             */
        }

        /*
         * Ignore any padding bytes after the tuple, when the length of the
         * item is not MAXALIGNed.
         */
        if (ItemIdHasStorage(iid))
        {
            int         len = ItemIdGetLength(iid);
            int         padlen = MAXALIGN(len) - len;

            if (padlen > 0)
                memset(page_item + len, MASK_MARKER, padlen);
        }
    }
}
