1 | /*------------------------------------------------------------------------- |
2 | * |
3 | * heapam_handler.c |
4 | * heap table access method code |
5 | * |
6 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
7 | * Portions Copyright (c) 1994, Regents of the University of California |
8 | * |
9 | * |
10 | * IDENTIFICATION |
11 | * src/backend/access/heap/heapam_handler.c |
12 | * |
13 | * |
14 | * NOTES |
15 | * This files wires up the lower level heapam.c et al routines with the |
16 | * tableam abstraction. |
17 | * |
18 | *------------------------------------------------------------------------- |
19 | */ |
20 | #include "postgres.h" |
21 | |
22 | #include <math.h> |
23 | |
24 | #include "miscadmin.h" |
25 | |
26 | #include "access/genam.h" |
27 | #include "access/heapam.h" |
28 | #include "access/multixact.h" |
29 | #include "access/rewriteheap.h" |
30 | #include "access/tableam.h" |
31 | #include "access/tsmapi.h" |
32 | #include "access/tuptoaster.h" |
33 | #include "access/xact.h" |
34 | #include "catalog/catalog.h" |
35 | #include "catalog/index.h" |
36 | #include "catalog/storage.h" |
37 | #include "catalog/storage_xlog.h" |
38 | #include "commands/progress.h" |
39 | #include "executor/executor.h" |
40 | #include "optimizer/plancat.h" |
41 | #include "pgstat.h" |
42 | #include "storage/bufmgr.h" |
43 | #include "storage/bufpage.h" |
45 | #include "storage/lmgr.h" |
46 | #include "storage/predicate.h" |
47 | #include "storage/procarray.h" |
48 | #include "storage/smgr.h" |
49 | #include "utils/builtins.h" |
50 | #include "utils/rel.h" |
51 | |
52 | |
53 | static void reform_and_rewrite_tuple(HeapTuple tuple, |
54 | Relation OldHeap, Relation NewHeap, |
55 | Datum *values, bool *isnull, RewriteState rwstate); |
56 | |
57 | static bool SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer, |
58 | HeapTuple tuple, |
59 | OffsetNumber tupoffset); |
60 | |
61 | static BlockNumber heapam_scan_get_blocks_done(HeapScanDesc hscan); |
62 | |
63 | static const TableAmRoutine heapam_methods; |
64 | |
65 | |
66 | /* ------------------------------------------------------------------------ |
67 | * Slot related callbacks for heap AM |
68 | * ------------------------------------------------------------------------ |
69 | */ |
70 | |
71 | static const TupleTableSlotOps * |
72 | heapam_slot_callbacks(Relation relation) |
73 | { |
74 | return &TTSOpsBufferHeapTuple; |
75 | } |
76 | |
77 | |
78 | /* ------------------------------------------------------------------------ |
79 | * Index Scan Callbacks for heap AM |
80 | * ------------------------------------------------------------------------ |
81 | */ |
82 | |
83 | static IndexFetchTableData * |
84 | heapam_index_fetch_begin(Relation rel) |
85 | { |
86 | IndexFetchHeapData *hscan = palloc0(sizeof(IndexFetchHeapData)); |
87 | |
88 | hscan->xs_base.rel = rel; |
89 | hscan->xs_cbuf = InvalidBuffer; |
90 | |
91 | return &hscan->xs_base; |
92 | } |
93 | |
94 | static void |
95 | heapam_index_fetch_reset(IndexFetchTableData *scan) |
96 | { |
97 | IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan; |
98 | |
99 | if (BufferIsValid(hscan->xs_cbuf)) |
100 | { |
101 | ReleaseBuffer(hscan->xs_cbuf); |
102 | hscan->xs_cbuf = InvalidBuffer; |
103 | } |
104 | } |
105 | |
106 | static void |
107 | heapam_index_fetch_end(IndexFetchTableData *scan) |
108 | { |
109 | IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan; |
110 | |
111 | heapam_index_fetch_reset(scan); |
112 | |
113 | pfree(hscan); |
114 | } |
115 | |
116 | static bool |
117 | heapam_index_fetch_tuple(struct IndexFetchTableData *scan, |
118 | ItemPointer tid, |
119 | Snapshot snapshot, |
120 | TupleTableSlot *slot, |
121 | bool *call_again, bool *all_dead) |
122 | { |
123 | IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan; |
124 | BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; |
125 | bool got_heap_tuple; |
126 | |
127 | Assert(TTS_IS_BUFFERTUPLE(slot)); |
128 | |
129 | /* We can skip the buffer-switching logic if we're in mid-HOT chain. */ |
130 | if (!*call_again) |
131 | { |
132 | /* Switch to correct buffer if we don't have it already */ |
133 | Buffer prev_buf = hscan->xs_cbuf; |
134 | |
135 | hscan->xs_cbuf = ReleaseAndReadBuffer(hscan->xs_cbuf, |
136 | hscan->xs_base.rel, |
137 | ItemPointerGetBlockNumber(tid)); |
138 | |
139 | /* |
140 | * Prune page, but only if we weren't already on this page |
141 | */ |
142 | if (prev_buf != hscan->xs_cbuf) |
143 | heap_page_prune_opt(hscan->xs_base.rel, hscan->xs_cbuf); |
144 | } |
145 | |
146 | /* Obtain share-lock on the buffer so we can examine visibility */ |
147 | LockBuffer(hscan->xs_cbuf, BUFFER_LOCK_SHARE); |
148 | got_heap_tuple = heap_hot_search_buffer(tid, |
149 | hscan->xs_base.rel, |
150 | hscan->xs_cbuf, |
151 | snapshot, |
152 | &bslot->base.tupdata, |
153 | all_dead, |
154 | !*call_again); |
155 | bslot->base.tupdata.t_self = *tid; |
156 | LockBuffer(hscan->xs_cbuf, BUFFER_LOCK_UNLOCK); |
157 | |
158 | if (got_heap_tuple) |
159 | { |
160 | /* |
161 | * Only in a non-MVCC snapshot can more than one member of the HOT |
162 | * chain be visible. |
163 | */ |
164 | *call_again = !IsMVCCSnapshot(snapshot); |
165 | |
166 | slot->tts_tableOid = RelationGetRelid(scan->rel); |
167 | ExecStoreBufferHeapTuple(&bslot->base.tupdata, slot, hscan->xs_cbuf); |
168 | } |
169 | else |
170 | { |
171 | /* We've reached the end of the HOT chain. */ |
172 | *call_again = false; |
173 | } |
174 | |
175 | return got_heap_tuple; |
176 | } |
177 | |
178 | |
179 | /* ------------------------------------------------------------------------ |
180 | * Callbacks for non-modifying operations on individual tuples for heap AM |
181 | * ------------------------------------------------------------------------ |
182 | */ |
183 | |
184 | static bool |
185 | heapam_fetch_row_version(Relation relation, |
186 | ItemPointer tid, |
187 | Snapshot snapshot, |
188 | TupleTableSlot *slot) |
189 | { |
190 | BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; |
191 | Buffer buffer; |
192 | |
193 | Assert(TTS_IS_BUFFERTUPLE(slot)); |
194 | |
195 | bslot->base.tupdata.t_self = *tid; |
196 | if (heap_fetch(relation, snapshot, &bslot->base.tupdata, &buffer)) |
197 | { |
198 | /* store in slot, transferring existing pin */ |
199 | ExecStorePinnedBufferHeapTuple(&bslot->base.tupdata, slot, buffer); |
200 | slot->tts_tableOid = RelationGetRelid(relation); |
201 | |
202 | return true; |
203 | } |
204 | |
205 | return false; |
206 | } |
207 | |
208 | static bool |
209 | heapam_tuple_tid_valid(TableScanDesc scan, ItemPointer tid) |
210 | { |
211 | HeapScanDesc hscan = (HeapScanDesc) scan; |
212 | |
213 | return ItemPointerIsValid(tid) && |
214 | ItemPointerGetBlockNumber(tid) < hscan->rs_nblocks; |
215 | } |
216 | |
217 | static bool |
218 | heapam_tuple_satisfies_snapshot(Relation rel, TupleTableSlot *slot, |
219 | Snapshot snapshot) |
220 | { |
221 | BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; |
222 | bool res; |
223 | |
224 | Assert(TTS_IS_BUFFERTUPLE(slot)); |
225 | Assert(BufferIsValid(bslot->buffer)); |
226 | |
227 | /* |
228 | * We need buffer pin and lock to call HeapTupleSatisfiesVisibility. |
229 | * Caller should be holding pin, but not lock. |
230 | */ |
231 | LockBuffer(bslot->buffer, BUFFER_LOCK_SHARE); |
232 | res = HeapTupleSatisfiesVisibility(bslot->base.tuple, snapshot, |
233 | bslot->buffer); |
234 | LockBuffer(bslot->buffer, BUFFER_LOCK_UNLOCK); |
235 | |
236 | return res; |
237 | } |
238 | |
239 | |
240 | /* ---------------------------------------------------------------------------- |
241 | * Functions for manipulations of physical tuples for heap AM. |
242 | * ---------------------------------------------------------------------------- |
243 | */ |
244 | |
245 | static void |
246 | heapam_tuple_insert(Relation relation, TupleTableSlot *slot, CommandId cid, |
247 | int options, BulkInsertState bistate) |
248 | { |
249 | bool shouldFree = true; |
250 | HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree); |
251 | |
252 | /* Update the tuple with table oid */ |
253 | slot->tts_tableOid = RelationGetRelid(relation); |
254 | tuple->t_tableOid = slot->tts_tableOid; |
255 | |
256 | /* Perform the insertion, and copy the resulting ItemPointer */ |
257 | heap_insert(relation, tuple, cid, options, bistate); |
258 | ItemPointerCopy(&tuple->t_self, &slot->tts_tid); |
259 | |
260 | if (shouldFree) |
261 | pfree(tuple); |
262 | } |
263 | |
264 | static void |
265 | heapam_tuple_insert_speculative(Relation relation, TupleTableSlot *slot, |
266 | CommandId cid, int options, |
267 | BulkInsertState bistate, uint32 specToken) |
268 | { |
269 | bool shouldFree = true; |
270 | HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree); |
271 | |
272 | /* Update the tuple with table oid */ |
273 | slot->tts_tableOid = RelationGetRelid(relation); |
274 | tuple->t_tableOid = slot->tts_tableOid; |
275 | |
276 | HeapTupleHeaderSetSpeculativeToken(tuple->t_data, specToken); |
277 | options |= HEAP_INSERT_SPECULATIVE; |
278 | |
279 | /* Perform the insertion, and copy the resulting ItemPointer */ |
280 | heap_insert(relation, tuple, cid, options, bistate); |
281 | ItemPointerCopy(&tuple->t_self, &slot->tts_tid); |
282 | |
283 | if (shouldFree) |
284 | pfree(tuple); |
285 | } |
286 | |
287 | static void |
288 | heapam_tuple_complete_speculative(Relation relation, TupleTableSlot *slot, |
289 | uint32 specToken, bool succeeded) |
290 | { |
291 | bool shouldFree = true; |
292 | HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree); |
293 | |
294 | /* adjust the tuple's state accordingly */ |
295 | if (succeeded) |
296 | heap_finish_speculative(relation, &slot->tts_tid); |
297 | else |
298 | heap_abort_speculative(relation, &slot->tts_tid); |
299 | |
300 | if (shouldFree) |
301 | pfree(tuple); |
302 | } |
303 | |
304 | static TM_Result |
305 | heapam_tuple_delete(Relation relation, ItemPointer tid, CommandId cid, |
306 | Snapshot snapshot, Snapshot crosscheck, bool wait, |
307 | TM_FailureData *tmfd, bool changingPart) |
308 | { |
309 | /* |
310 | * Currently Deleting of index tuples are handled at vacuum, in case if |
311 | * the storage itself is cleaning the dead tuples by itself, it is the |
312 | * time to call the index tuple deletion also. |
313 | */ |
314 | return heap_delete(relation, tid, cid, crosscheck, wait, tmfd, changingPart); |
315 | } |
316 | |
317 | |
318 | static TM_Result |
319 | heapam_tuple_update(Relation relation, ItemPointer otid, TupleTableSlot *slot, |
320 | CommandId cid, Snapshot snapshot, Snapshot crosscheck, |
321 | bool wait, TM_FailureData *tmfd, |
322 | LockTupleMode *lockmode, bool *update_indexes) |
323 | { |
324 | bool shouldFree = true; |
325 | HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree); |
326 | TM_Result result; |
327 | |
328 | /* Update the tuple with table oid */ |
329 | slot->tts_tableOid = RelationGetRelid(relation); |
330 | tuple->t_tableOid = slot->tts_tableOid; |
331 | |
332 | result = heap_update(relation, otid, tuple, cid, crosscheck, wait, |
333 | tmfd, lockmode); |
334 | ItemPointerCopy(&tuple->t_self, &slot->tts_tid); |
335 | |
336 | /* |
337 | * Decide whether new index entries are needed for the tuple |
338 | * |
339 | * Note: heap_update returns the tid (location) of the new tuple in the |
340 | * t_self field. |
341 | * |
342 | * If it's a HOT update, we mustn't insert new index entries. |
343 | */ |
344 | *update_indexes = result == TM_Ok && !HeapTupleIsHeapOnly(tuple); |
345 | |
346 | if (shouldFree) |
347 | pfree(tuple); |
348 | |
349 | return result; |
350 | } |
351 | |
352 | static TM_Result |
353 | heapam_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot, |
354 | TupleTableSlot *slot, CommandId cid, LockTupleMode mode, |
355 | LockWaitPolicy wait_policy, uint8 flags, |
356 | TM_FailureData *tmfd) |
357 | { |
358 | BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; |
359 | TM_Result result; |
360 | Buffer buffer; |
361 | HeapTuple tuple = &bslot->base.tupdata; |
362 | bool follow_updates; |
363 | |
364 | follow_updates = (flags & TUPLE_LOCK_FLAG_LOCK_UPDATE_IN_PROGRESS) != 0; |
365 | tmfd->traversed = false; |
366 | |
367 | Assert(TTS_IS_BUFFERTUPLE(slot)); |
368 | |
369 | tuple_lock_retry: |
370 | tuple->t_self = *tid; |
371 | result = heap_lock_tuple(relation, tuple, cid, mode, wait_policy, |
372 | follow_updates, &buffer, tmfd); |
373 | |
374 | if (result == TM_Updated && |
375 | (flags & TUPLE_LOCK_FLAG_FIND_LAST_VERSION)) |
376 | { |
377 | ReleaseBuffer(buffer); |
378 | /* Should not encounter speculative tuple on recheck */ |
379 | Assert(!HeapTupleHeaderIsSpeculative(tuple->t_data)); |
380 | |
381 | if (!ItemPointerEquals(&tmfd->ctid, &tuple->t_self)) |
382 | { |
383 | SnapshotData SnapshotDirty; |
384 | TransactionId priorXmax; |
385 | |
386 | /* it was updated, so look at the updated version */ |
387 | *tid = tmfd->ctid; |
388 | /* updated row should have xmin matching this xmax */ |
389 | priorXmax = tmfd->xmax; |
390 | |
391 | /* signal that a tuple later in the chain is getting locked */ |
392 | tmfd->traversed = true; |
393 | |
394 | /* |
395 | * fetch target tuple |
396 | * |
397 | * Loop here to deal with updated or busy tuples |
398 | */ |
399 | InitDirtySnapshot(SnapshotDirty); |
400 | for (;;) |
401 | { |
402 | if (ItemPointerIndicatesMovedPartitions(tid)) |
403 | ereport(ERROR, |
404 | (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), |
							 errmsg("tuple to be locked was already moved to another partition due to concurrent update")));
406 | |
407 | tuple->t_self = *tid; |
408 | if (heap_fetch(relation, &SnapshotDirty, tuple, &buffer)) |
409 | { |
410 | /* |
411 | * If xmin isn't what we're expecting, the slot must have |
412 | * been recycled and reused for an unrelated tuple. This |
413 | * implies that the latest version of the row was deleted, |
414 | * so we need do nothing. (Should be safe to examine xmin |
415 | * without getting buffer's content lock. We assume |
416 | * reading a TransactionId to be atomic, and Xmin never |
417 | * changes in an existing tuple, except to invalid or |
418 | * frozen, and neither of those can match priorXmax.) |
419 | */ |
420 | if (!TransactionIdEquals(HeapTupleHeaderGetXmin(tuple->t_data), |
421 | priorXmax)) |
422 | { |
423 | ReleaseBuffer(buffer); |
424 | return TM_Deleted; |
425 | } |
426 | |
427 | /* otherwise xmin should not be dirty... */ |
428 | if (TransactionIdIsValid(SnapshotDirty.xmin)) |
						elog(ERROR, "t_xmin is uncommitted in tuple to be updated");
430 | |
431 | /* |
432 | * If tuple is being updated by other transaction then we |
433 | * have to wait for its commit/abort, or die trying. |
434 | */ |
435 | if (TransactionIdIsValid(SnapshotDirty.xmax)) |
436 | { |
437 | ReleaseBuffer(buffer); |
438 | switch (wait_policy) |
439 | { |
440 | case LockWaitBlock: |
441 | XactLockTableWait(SnapshotDirty.xmax, |
442 | relation, &tuple->t_self, |
443 | XLTW_FetchUpdated); |
444 | break; |
445 | case LockWaitSkip: |
446 | if (!ConditionalXactLockTableWait(SnapshotDirty.xmax)) |
447 | /* skip instead of waiting */ |
448 | return TM_WouldBlock; |
449 | break; |
450 | case LockWaitError: |
451 | if (!ConditionalXactLockTableWait(SnapshotDirty.xmax)) |
452 | ereport(ERROR, |
453 | (errcode(ERRCODE_LOCK_NOT_AVAILABLE), |
											 errmsg("could not obtain lock on row in relation \"%s\"",
455 | RelationGetRelationName(relation)))); |
456 | break; |
457 | } |
458 | continue; /* loop back to repeat heap_fetch */ |
459 | } |
460 | |
461 | /* |
462 | * If tuple was inserted by our own transaction, we have |
463 | * to check cmin against cid: cmin >= current CID means |
464 | * our command cannot see the tuple, so we should ignore |
465 | * it. Otherwise heap_lock_tuple() will throw an error, |
466 | * and so would any later attempt to update or delete the |
467 | * tuple. (We need not check cmax because |
468 | * HeapTupleSatisfiesDirty will consider a tuple deleted |
469 | * by our transaction dead, regardless of cmax.) We just |
470 | * checked that priorXmax == xmin, so we can test that |
471 | * variable instead of doing HeapTupleHeaderGetXmin again. |
472 | */ |
473 | if (TransactionIdIsCurrentTransactionId(priorXmax) && |
474 | HeapTupleHeaderGetCmin(tuple->t_data) >= cid) |
475 | { |
476 | tmfd->xmax = priorXmax; |
477 | |
478 | /* |
479 | * Cmin is the problematic value, so store that. See |
480 | * above. |
481 | */ |
482 | tmfd->cmax = HeapTupleHeaderGetCmin(tuple->t_data); |
483 | ReleaseBuffer(buffer); |
484 | return TM_SelfModified; |
485 | } |
486 | |
487 | /* |
488 | * This is a live tuple, so try to lock it again. |
489 | */ |
490 | ReleaseBuffer(buffer); |
491 | goto tuple_lock_retry; |
492 | } |
493 | |
494 | /* |
495 | * If the referenced slot was actually empty, the latest |
496 | * version of the row must have been deleted, so we need do |
497 | * nothing. |
498 | */ |
499 | if (tuple->t_data == NULL) |
500 | { |
501 | return TM_Deleted; |
502 | } |
503 | |
504 | /* |
505 | * As above, if xmin isn't what we're expecting, do nothing. |
506 | */ |
507 | if (!TransactionIdEquals(HeapTupleHeaderGetXmin(tuple->t_data), |
508 | priorXmax)) |
509 | { |
510 | if (BufferIsValid(buffer)) |
511 | ReleaseBuffer(buffer); |
512 | return TM_Deleted; |
513 | } |
514 | |
515 | /* |
516 | * If we get here, the tuple was found but failed |
517 | * SnapshotDirty. Assuming the xmin is either a committed xact |
518 | * or our own xact (as it certainly should be if we're trying |
519 | * to modify the tuple), this must mean that the row was |
520 | * updated or deleted by either a committed xact or our own |
521 | * xact. If it was deleted, we can ignore it; if it was |
522 | * updated then chain up to the next version and repeat the |
523 | * whole process. |
524 | * |
525 | * As above, it should be safe to examine xmax and t_ctid |
526 | * without the buffer content lock, because they can't be |
527 | * changing. |
528 | */ |
529 | if (ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid)) |
530 | { |
531 | /* deleted, so forget about it */ |
532 | if (BufferIsValid(buffer)) |
533 | ReleaseBuffer(buffer); |
534 | return TM_Deleted; |
535 | } |
536 | |
537 | /* updated, so look at the updated row */ |
538 | *tid = tuple->t_data->t_ctid; |
539 | /* updated row should have xmin matching this xmax */ |
540 | priorXmax = HeapTupleHeaderGetUpdateXid(tuple->t_data); |
541 | if (BufferIsValid(buffer)) |
542 | ReleaseBuffer(buffer); |
543 | /* loop back to fetch next in chain */ |
544 | } |
545 | } |
546 | else |
547 | { |
548 | /* tuple was deleted, so give up */ |
549 | return TM_Deleted; |
550 | } |
551 | } |
552 | |
553 | slot->tts_tableOid = RelationGetRelid(relation); |
554 | tuple->t_tableOid = slot->tts_tableOid; |
555 | |
556 | /* store in slot, transferring existing pin */ |
557 | ExecStorePinnedBufferHeapTuple(tuple, slot, buffer); |
558 | |
559 | return result; |
560 | } |
561 | |
562 | static void |
563 | heapam_finish_bulk_insert(Relation relation, int options) |
564 | { |
565 | /* |
566 | * If we skipped writing WAL, then we need to sync the heap (but not |
567 | * indexes since those use WAL anyway / don't go through tableam) |
568 | */ |
569 | if (options & HEAP_INSERT_SKIP_WAL) |
570 | heap_sync(relation); |
571 | } |
572 | |
573 | |
574 | /* ------------------------------------------------------------------------ |
575 | * DDL related callbacks for heap AM. |
576 | * ------------------------------------------------------------------------ |
577 | */ |
578 | |
579 | static void |
580 | heapam_relation_set_new_filenode(Relation rel, |
581 | const RelFileNode *newrnode, |
582 | char persistence, |
583 | TransactionId *freezeXid, |
584 | MultiXactId *minmulti) |
585 | { |
586 | SMgrRelation srel; |
587 | |
588 | /* |
589 | * Initialize to the minimum XID that could put tuples in the table. We |
590 | * know that no xacts older than RecentXmin are still running, so that |
591 | * will do. |
592 | */ |
593 | *freezeXid = RecentXmin; |
594 | |
595 | /* |
596 | * Similarly, initialize the minimum Multixact to the first value that |
597 | * could possibly be stored in tuples in the table. Running transactions |
598 | * could reuse values from their local cache, so we are careful to |
599 | * consider all currently running multis. |
600 | * |
601 | * XXX this could be refined further, but is it worth the hassle? |
602 | */ |
603 | *minmulti = GetOldestMultiXactId(); |
604 | |
605 | srel = RelationCreateStorage(*newrnode, persistence); |
606 | |
607 | /* |
608 | * If required, set up an init fork for an unlogged table so that it can |
609 | * be correctly reinitialized on restart. An immediate sync is required |
610 | * even if the page has been logged, because the write did not go through |
611 | * shared_buffers and therefore a concurrent checkpoint may have moved the |
612 | * redo pointer past our xlog record. Recovery may as well remove it |
613 | * while replaying, for example, XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE |
614 | * record. Therefore, logging is necessary even if wal_level=minimal. |
615 | */ |
616 | if (persistence == RELPERSISTENCE_UNLOGGED) |
617 | { |
618 | Assert(rel->rd_rel->relkind == RELKIND_RELATION || |
619 | rel->rd_rel->relkind == RELKIND_MATVIEW || |
620 | rel->rd_rel->relkind == RELKIND_TOASTVALUE); |
621 | smgrcreate(srel, INIT_FORKNUM, false); |
622 | log_smgrcreate(newrnode, INIT_FORKNUM); |
623 | smgrimmedsync(srel, INIT_FORKNUM); |
624 | } |
625 | |
626 | smgrclose(srel); |
627 | } |
628 | |
629 | static void |
630 | heapam_relation_nontransactional_truncate(Relation rel) |
631 | { |
632 | RelationTruncate(rel, 0); |
633 | } |
634 | |
635 | static void |
636 | heapam_relation_copy_data(Relation rel, const RelFileNode *newrnode) |
637 | { |
638 | SMgrRelation dstrel; |
639 | |
640 | dstrel = smgropen(*newrnode, rel->rd_backend); |
641 | RelationOpenSmgr(rel); |
642 | |
643 | /* |
644 | * Since we copy the file directly without looking at the shared buffers, |
645 | * we'd better first flush out any pages of the source relation that are |
646 | * in shared buffers. We assume no new changes will be made while we are |
647 | * holding exclusive lock on the rel. |
648 | */ |
649 | FlushRelationBuffers(rel); |
650 | |
651 | /* |
652 | * Create and copy all forks of the relation, and schedule unlinking of |
653 | * old physical files. |
654 | * |
655 | * NOTE: any conflict in relfilenode value will be caught in |
656 | * RelationCreateStorage(). |
657 | */ |
658 | RelationCreateStorage(*newrnode, rel->rd_rel->relpersistence); |
659 | |
660 | /* copy main fork */ |
661 | RelationCopyStorage(rel->rd_smgr, dstrel, MAIN_FORKNUM, |
662 | rel->rd_rel->relpersistence); |
663 | |
664 | /* copy those extra forks that exist */ |
665 | for (ForkNumber forkNum = MAIN_FORKNUM + 1; |
666 | forkNum <= MAX_FORKNUM; forkNum++) |
667 | { |
668 | if (smgrexists(rel->rd_smgr, forkNum)) |
669 | { |
670 | smgrcreate(dstrel, forkNum, false); |
671 | |
672 | /* |
673 | * WAL log creation if the relation is persistent, or this is the |
674 | * init fork of an unlogged relation. |
675 | */ |
676 | if (rel->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT || |
677 | (rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED && |
678 | forkNum == INIT_FORKNUM)) |
679 | log_smgrcreate(newrnode, forkNum); |
680 | RelationCopyStorage(rel->rd_smgr, dstrel, forkNum, |
681 | rel->rd_rel->relpersistence); |
682 | } |
683 | } |
684 | |
685 | |
686 | /* drop old relation, and close new one */ |
687 | RelationDropStorage(rel); |
688 | smgrclose(dstrel); |
689 | } |
690 | |
691 | static void |
692 | heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, |
693 | Relation OldIndex, bool use_sort, |
694 | TransactionId OldestXmin, |
695 | TransactionId *xid_cutoff, |
696 | MultiXactId *multi_cutoff, |
697 | double *num_tuples, |
698 | double *tups_vacuumed, |
699 | double *tups_recently_dead) |
700 | { |
701 | RewriteState rwstate; |
702 | IndexScanDesc indexScan; |
703 | TableScanDesc tableScan; |
704 | HeapScanDesc heapScan; |
705 | bool use_wal; |
706 | bool is_system_catalog; |
707 | Tuplesortstate *tuplesort; |
708 | TupleDesc oldTupDesc = RelationGetDescr(OldHeap); |
709 | TupleDesc newTupDesc = RelationGetDescr(NewHeap); |
710 | TupleTableSlot *slot; |
711 | int natts; |
712 | Datum *values; |
713 | bool *isnull; |
714 | BufferHeapTupleTableSlot *hslot; |
715 | |
716 | /* Remember if it's a system catalog */ |
717 | is_system_catalog = IsSystemRelation(OldHeap); |
718 | |
719 | /* |
720 | * We need to log the copied data in WAL iff WAL archiving/streaming is |
721 | * enabled AND it's a WAL-logged rel. |
722 | */ |
723 | use_wal = XLogIsNeeded() && RelationNeedsWAL(NewHeap); |
724 | |
725 | /* use_wal off requires smgr_targblock be initially invalid */ |
726 | Assert(RelationGetTargetBlock(NewHeap) == InvalidBlockNumber); |
727 | |
728 | /* Preallocate values/isnull arrays */ |
729 | natts = newTupDesc->natts; |
730 | values = (Datum *) palloc(natts * sizeof(Datum)); |
731 | isnull = (bool *) palloc(natts * sizeof(bool)); |
732 | |
733 | /* Initialize the rewrite operation */ |
734 | rwstate = begin_heap_rewrite(OldHeap, NewHeap, OldestXmin, *xid_cutoff, |
735 | *multi_cutoff, use_wal); |
736 | |
737 | |
738 | /* Set up sorting if wanted */ |
739 | if (use_sort) |
740 | tuplesort = tuplesort_begin_cluster(oldTupDesc, OldIndex, |
741 | maintenance_work_mem, |
742 | NULL, false); |
743 | else |
744 | tuplesort = NULL; |
745 | |
746 | /* |
747 | * Prepare to scan the OldHeap. To ensure we see recently-dead tuples |
748 | * that still need to be copied, we scan with SnapshotAny and use |
749 | * HeapTupleSatisfiesVacuum for the visibility test. |
750 | */ |
751 | if (OldIndex != NULL && !use_sort) |
752 | { |
753 | const int ci_index[] = { |
754 | PROGRESS_CLUSTER_PHASE, |
755 | PROGRESS_CLUSTER_INDEX_RELID |
756 | }; |
757 | int64 ci_val[2]; |
758 | |
		/* Set the phase and the index-OID progress columns */
760 | ci_val[0] = PROGRESS_CLUSTER_PHASE_INDEX_SCAN_HEAP; |
761 | ci_val[1] = RelationGetRelid(OldIndex); |
762 | pgstat_progress_update_multi_param(2, ci_index, ci_val); |
763 | |
764 | tableScan = NULL; |
765 | heapScan = NULL; |
766 | indexScan = index_beginscan(OldHeap, OldIndex, SnapshotAny, 0, 0); |
767 | index_rescan(indexScan, NULL, 0, NULL, 0); |
768 | } |
769 | else |
770 | { |
771 | /* In scan-and-sort mode and also VACUUM FULL, set phase */ |
772 | pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE, |
773 | PROGRESS_CLUSTER_PHASE_SEQ_SCAN_HEAP); |
774 | |
775 | tableScan = table_beginscan(OldHeap, SnapshotAny, 0, (ScanKey) NULL); |
776 | heapScan = (HeapScanDesc) tableScan; |
777 | indexScan = NULL; |
778 | |
779 | /* Set total heap blocks */ |
780 | pgstat_progress_update_param(PROGRESS_CLUSTER_TOTAL_HEAP_BLKS, |
781 | heapScan->rs_nblocks); |
782 | } |
783 | |
784 | slot = table_slot_create(OldHeap, NULL); |
785 | hslot = (BufferHeapTupleTableSlot *) slot; |
786 | |
787 | /* |
788 | * Scan through the OldHeap, either in OldIndex order or sequentially; |
789 | * copy each tuple into the NewHeap, or transiently to the tuplesort |
790 | * module. Note that we don't bother sorting dead tuples (they won't get |
791 | * to the new table anyway). |
792 | */ |
793 | for (;;) |
794 | { |
795 | HeapTuple tuple; |
796 | Buffer buf; |
797 | bool isdead; |
798 | |
799 | CHECK_FOR_INTERRUPTS(); |
800 | |
801 | if (indexScan != NULL) |
802 | { |
803 | if (!index_getnext_slot(indexScan, ForwardScanDirection, slot)) |
804 | break; |
805 | |
806 | /* Since we used no scan keys, should never need to recheck */ |
807 | if (indexScan->xs_recheck) |
				elog(ERROR, "CLUSTER does not support lossy index conditions");
809 | } |
810 | else |
811 | { |
812 | if (!table_scan_getnextslot(tableScan, ForwardScanDirection, slot)) |
813 | break; |
814 | |
815 | /* |
816 | * In scan-and-sort mode and also VACUUM FULL, set heap blocks |
817 | * scanned |
818 | */ |
819 | pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_BLKS_SCANNED, |
820 | heapScan->rs_cblock + 1); |
821 | } |
822 | |
823 | tuple = ExecFetchSlotHeapTuple(slot, false, NULL); |
824 | buf = hslot->buffer; |
825 | |
826 | LockBuffer(buf, BUFFER_LOCK_SHARE); |
827 | |
828 | switch (HeapTupleSatisfiesVacuum(tuple, OldestXmin, buf)) |
829 | { |
830 | case HEAPTUPLE_DEAD: |
831 | /* Definitely dead */ |
832 | isdead = true; |
833 | break; |
834 | case HEAPTUPLE_RECENTLY_DEAD: |
835 | *tups_recently_dead += 1; |
836 | /* fall through */ |
837 | case HEAPTUPLE_LIVE: |
838 | /* Live or recently dead, must copy it */ |
839 | isdead = false; |
840 | break; |
841 | case HEAPTUPLE_INSERT_IN_PROGRESS: |
842 | |
843 | /* |
844 | * Since we hold exclusive lock on the relation, normally the |
845 | * only way to see this is if it was inserted earlier in our |
846 | * own transaction. However, it can happen in system |
847 | * catalogs, since we tend to release write lock before commit |
848 | * there. Give a warning if neither case applies; but in any |
849 | * case we had better copy it. |
850 | */ |
851 | if (!is_system_catalog && |
852 | !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(tuple->t_data))) |
					elog(WARNING, "concurrent insert in progress within table \"%s\"",
854 | RelationGetRelationName(OldHeap)); |
855 | /* treat as live */ |
856 | isdead = false; |
857 | break; |
858 | case HEAPTUPLE_DELETE_IN_PROGRESS: |
859 | |
860 | /* |
861 | * Similar situation to INSERT_IN_PROGRESS case. |
862 | */ |
863 | if (!is_system_catalog && |
864 | !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(tuple->t_data))) |
					elog(WARNING, "concurrent delete in progress within table \"%s\"",
866 | RelationGetRelationName(OldHeap)); |
867 | /* treat as recently dead */ |
868 | *tups_recently_dead += 1; |
869 | isdead = false; |
870 | break; |
871 | default: |
				elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
873 | isdead = false; /* keep compiler quiet */ |
874 | break; |
875 | } |
876 | |
877 | LockBuffer(buf, BUFFER_LOCK_UNLOCK); |
878 | |
879 | if (isdead) |
880 | { |
881 | *tups_vacuumed += 1; |
882 | /* heap rewrite module still needs to see it... */ |
883 | if (rewrite_heap_dead_tuple(rwstate, tuple)) |
884 | { |
885 | /* A previous recently-dead tuple is now known dead */ |
886 | *tups_vacuumed += 1; |
887 | *tups_recently_dead -= 1; |
888 | } |
889 | continue; |
890 | } |
891 | |
892 | *num_tuples += 1; |
893 | if (tuplesort != NULL) |
894 | { |
895 | tuplesort_putheaptuple(tuplesort, tuple); |
896 | |
897 | /* |
898 | * In scan-and-sort mode, report increase in number of tuples |
899 | * scanned |
900 | */ |
901 | pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_TUPLES_SCANNED, |
902 | *num_tuples); |
903 | } |
904 | else |
905 | { |
906 | const int ct_index[] = { |
907 | PROGRESS_CLUSTER_HEAP_TUPLES_SCANNED, |
908 | PROGRESS_CLUSTER_HEAP_TUPLES_WRITTEN |
909 | }; |
910 | int64 ct_val[2]; |
911 | |
912 | reform_and_rewrite_tuple(tuple, OldHeap, NewHeap, |
913 | values, isnull, rwstate); |
914 | |
915 | /* |
916 | * In indexscan mode and also VACUUM FULL, report increase in |
917 | * number of tuples scanned and written |
918 | */ |
919 | ct_val[0] = *num_tuples; |
920 | ct_val[1] = *num_tuples; |
921 | pgstat_progress_update_multi_param(2, ct_index, ct_val); |
922 | } |
923 | } |
924 | |
925 | if (indexScan != NULL) |
926 | index_endscan(indexScan); |
927 | if (tableScan != NULL) |
928 | table_endscan(tableScan); |
929 | if (slot) |
930 | ExecDropSingleTupleTableSlot(slot); |
931 | |
932 | /* |
933 | * In scan-and-sort mode, complete the sort, then read out all live tuples |
	 * from the tuplesort and write them to the new relation.
935 | */ |
936 | if (tuplesort != NULL) |
937 | { |
938 | double n_tuples = 0; |
939 | |
940 | /* Report that we are now sorting tuples */ |
941 | pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE, |
942 | PROGRESS_CLUSTER_PHASE_SORT_TUPLES); |
943 | |
944 | tuplesort_performsort(tuplesort); |
945 | |
946 | /* Report that we are now writing new heap */ |
947 | pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE, |
948 | PROGRESS_CLUSTER_PHASE_WRITE_NEW_HEAP); |
949 | |
950 | for (;;) |
951 | { |
952 | HeapTuple tuple; |
953 | |
954 | CHECK_FOR_INTERRUPTS(); |
955 | |
956 | tuple = tuplesort_getheaptuple(tuplesort, true); |
957 | if (tuple == NULL) |
958 | break; |
959 | |
960 | n_tuples += 1; |
961 | reform_and_rewrite_tuple(tuple, |
962 | OldHeap, NewHeap, |
963 | values, isnull, |
964 | rwstate); |
965 | /* Report n_tuples */ |
966 | pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_TUPLES_WRITTEN, |
967 | n_tuples); |
968 | } |
969 | |
970 | tuplesort_end(tuplesort); |
971 | } |
972 | |
973 | /* Write out any remaining tuples, and fsync if needed */ |
974 | end_heap_rewrite(rwstate); |
975 | |
976 | /* Clean up */ |
977 | pfree(values); |
978 | pfree(isnull); |
979 | } |
980 | |
981 | static bool |
982 | heapam_scan_analyze_next_block(TableScanDesc scan, BlockNumber blockno, |
983 | BufferAccessStrategy bstrategy) |
984 | { |
985 | HeapScanDesc hscan = (HeapScanDesc) scan; |
986 | |
987 | /* |
988 | * We must maintain a pin on the target page's buffer to ensure that |
989 | * concurrent activity - e.g. HOT pruning - doesn't delete tuples out from |
990 | * under us. Hence, pin the page until we are done looking at it. We |
991 | * also choose to hold sharelock on the buffer throughout --- we could |
992 | * release and re-acquire sharelock for each tuple, but since we aren't |
993 | * doing much work per tuple, the extra lock traffic is probably better |
994 | * avoided. |
995 | */ |
996 | hscan->rs_cblock = blockno; |
997 | hscan->rs_cindex = FirstOffsetNumber; |
998 | hscan->rs_cbuf = ReadBufferExtended(scan->rs_rd, MAIN_FORKNUM, |
999 | blockno, RBM_NORMAL, bstrategy); |
1000 | LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE); |
1001 | |
1002 | /* in heap all blocks can contain tuples, so always return true */ |
1003 | return true; |
1004 | } |
1005 | |
1006 | static bool |
1007 | heapam_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin, |
1008 | double *liverows, double *deadrows, |
1009 | TupleTableSlot *slot) |
1010 | { |
1011 | HeapScanDesc hscan = (HeapScanDesc) scan; |
1012 | Page targpage; |
1013 | OffsetNumber maxoffset; |
1014 | BufferHeapTupleTableSlot *hslot; |
1015 | |
1016 | Assert(TTS_IS_BUFFERTUPLE(slot)); |
1017 | |
1018 | hslot = (BufferHeapTupleTableSlot *) slot; |
1019 | targpage = BufferGetPage(hscan->rs_cbuf); |
1020 | maxoffset = PageGetMaxOffsetNumber(targpage); |
1021 | |
1022 | /* Inner loop over all tuples on the selected page */ |
1023 | for (; hscan->rs_cindex <= maxoffset; hscan->rs_cindex++) |
1024 | { |
1025 | ItemId itemid; |
1026 | HeapTuple targtuple = &hslot->base.tupdata; |
1027 | bool sample_it = false; |
1028 | |
1029 | itemid = PageGetItemId(targpage, hscan->rs_cindex); |
1030 | |
1031 | /* |
1032 | * We ignore unused and redirect line pointers. DEAD line pointers |
1033 | * should be counted as dead, because we need vacuum to run to get rid |
1034 | * of them. Note that this rule agrees with the way that |
1035 | * heap_page_prune() counts things. |
1036 | */ |
1037 | if (!ItemIdIsNormal(itemid)) |
1038 | { |
1039 | if (ItemIdIsDead(itemid)) |
1040 | *deadrows += 1; |
1041 | continue; |
1042 | } |
1043 | |
1044 | ItemPointerSet(&targtuple->t_self, hscan->rs_cblock, hscan->rs_cindex); |
1045 | |
1046 | targtuple->t_tableOid = RelationGetRelid(scan->rs_rd); |
1047 | targtuple->t_data = (HeapTupleHeader) PageGetItem(targpage, itemid); |
1048 | targtuple->t_len = ItemIdGetLength(itemid); |
1049 | |
1050 | switch (HeapTupleSatisfiesVacuum(targtuple, OldestXmin, |
1051 | hscan->rs_cbuf)) |
1052 | { |
1053 | case HEAPTUPLE_LIVE: |
1054 | sample_it = true; |
1055 | *liverows += 1; |
1056 | break; |
1057 | |
1058 | case HEAPTUPLE_DEAD: |
1059 | case HEAPTUPLE_RECENTLY_DEAD: |
1060 | /* Count dead and recently-dead rows */ |
1061 | *deadrows += 1; |
1062 | break; |
1063 | |
1064 | case HEAPTUPLE_INSERT_IN_PROGRESS: |
1065 | |
1066 | /* |
1067 | * Insert-in-progress rows are not counted. We assume that |
1068 | * when the inserting transaction commits or aborts, it will |
1069 | * send a stats message to increment the proper count. This |
1070 | * works right only if that transaction ends after we finish |
1071 | * analyzing the table; if things happen in the other order, |
1072 | * its stats update will be overwritten by ours. However, the |
1073 | * error will be large only if the other transaction runs long |
1074 | * enough to insert many tuples, so assuming it will finish |
1075 | * after us is the safer option. |
1076 | * |
1077 | * A special case is that the inserting transaction might be |
1078 | * our own. In this case we should count and sample the row, |
1079 | * to accommodate users who load a table and analyze it in one |
1080 | * transaction. (pgstat_report_analyze has to adjust the |
1081 | * numbers we send to the stats collector to make this come |
1082 | * out right.) |
1083 | */ |
1084 | if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(targtuple->t_data))) |
1085 | { |
1086 | sample_it = true; |
1087 | *liverows += 1; |
1088 | } |
1089 | break; |
1090 | |
1091 | case HEAPTUPLE_DELETE_IN_PROGRESS: |
1092 | |
1093 | /* |
1094 | * We count and sample delete-in-progress rows the same as |
1095 | * live ones, so that the stats counters come out right if the |
1096 | * deleting transaction commits after us, per the same |
1097 | * reasoning given above. |
1098 | * |
1099 | * If the delete was done by our own transaction, however, we |
1100 | * must count the row as dead to make pgstat_report_analyze's |
1101 | * stats adjustments come out right. (Note: this works out |
1102 | * properly when the row was both inserted and deleted in our |
1103 | * xact.) |
1104 | * |
1105 | * The net effect of these choices is that we act as though an |
1106 | * IN_PROGRESS transaction hasn't happened yet, except if it |
1107 | * is our own transaction, which we assume has happened. |
1108 | * |
1109 | * This approach ensures that we behave sanely if we see both |
1110 | * the pre-image and post-image rows for a row being updated |
1111 | * by a concurrent transaction: we will sample the pre-image |
1112 | * but not the post-image. We also get sane results if the |
1113 | * concurrent transaction never commits. |
1114 | */ |
1115 | if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(targtuple->t_data))) |
1116 | *deadrows += 1; |
1117 | else |
1118 | { |
1119 | sample_it = true; |
1120 | *liverows += 1; |
1121 | } |
1122 | break; |
1123 | |
1124 | default: |
				elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
1126 | break; |
1127 | } |
1128 | |
1129 | if (sample_it) |
1130 | { |
1131 | ExecStoreBufferHeapTuple(targtuple, slot, hscan->rs_cbuf); |
1132 | hscan->rs_cindex++; |
1133 | |
1134 | /* note that we leave the buffer locked here! */ |
1135 | return true; |
1136 | } |
1137 | } |
1138 | |
1139 | /* Now release the lock and pin on the page */ |
1140 | UnlockReleaseBuffer(hscan->rs_cbuf); |
1141 | hscan->rs_cbuf = InvalidBuffer; |
1142 | |
1143 | /* also prevent old slot contents from having pin on page */ |
1144 | ExecClearTuple(slot); |
1145 | |
1146 | return false; |
1147 | } |
1148 | |
1149 | static double |
1150 | heapam_index_build_range_scan(Relation heapRelation, |
1151 | Relation indexRelation, |
1152 | IndexInfo *indexInfo, |
1153 | bool allow_sync, |
1154 | bool anyvisible, |
1155 | bool progress, |
1156 | BlockNumber start_blockno, |
1157 | BlockNumber numblocks, |
1158 | IndexBuildCallback callback, |
1159 | void *callback_state, |
1160 | TableScanDesc scan) |
1161 | { |
1162 | HeapScanDesc hscan; |
1163 | bool is_system_catalog; |
1164 | bool checking_uniqueness; |
1165 | HeapTuple heapTuple; |
1166 | Datum values[INDEX_MAX_KEYS]; |
1167 | bool isnull[INDEX_MAX_KEYS]; |
1168 | double reltuples; |
1169 | ExprState *predicate; |
1170 | TupleTableSlot *slot; |
1171 | EState *estate; |
1172 | ExprContext *econtext; |
1173 | Snapshot snapshot; |
1174 | bool need_unregister_snapshot = false; |
1175 | TransactionId OldestXmin; |
1176 | BlockNumber previous_blkno = InvalidBlockNumber; |
1177 | BlockNumber root_blkno = InvalidBlockNumber; |
1178 | OffsetNumber root_offsets[MaxHeapTuplesPerPage]; |
1179 | |
1180 | /* |
1181 | * sanity checks |
1182 | */ |
1183 | Assert(OidIsValid(indexRelation->rd_rel->relam)); |
1184 | |
1185 | /* Remember if it's a system catalog */ |
1186 | is_system_catalog = IsSystemRelation(heapRelation); |
1187 | |
1188 | /* See whether we're verifying uniqueness/exclusion properties */ |
1189 | checking_uniqueness = (indexInfo->ii_Unique || |
1190 | indexInfo->ii_ExclusionOps != NULL); |
1191 | |
1192 | /* |
1193 | * "Any visible" mode is not compatible with uniqueness checks; make sure |
1194 | * only one of those is requested. |
1195 | */ |
1196 | Assert(!(anyvisible && checking_uniqueness)); |
1197 | |
1198 | /* |
1199 | * Need an EState for evaluation of index expressions and partial-index |
1200 | * predicates. Also a slot to hold the current tuple. |
1201 | */ |
1202 | estate = CreateExecutorState(); |
1203 | econtext = GetPerTupleExprContext(estate); |
1204 | slot = table_slot_create(heapRelation, NULL); |
1205 | |
1206 | /* Arrange for econtext's scan tuple to be the tuple under test */ |
1207 | econtext->ecxt_scantuple = slot; |
1208 | |
1209 | /* Set up execution state for predicate, if any. */ |
1210 | predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); |
1211 | |
1212 | /* |
1213 | * Prepare for scan of the base relation. In a normal index build, we use |
1214 | * SnapshotAny because we must retrieve all tuples and do our own time |
1215 | * qual checks (because we have to index RECENTLY_DEAD tuples). In a |
1216 | * concurrent build, or during bootstrap, we take a regular MVCC snapshot |
1217 | * and index whatever's live according to that. |
1218 | */ |
1219 | OldestXmin = InvalidTransactionId; |
1220 | |
1221 | /* okay to ignore lazy VACUUMs here */ |
1222 | if (!IsBootstrapProcessingMode() && !indexInfo->ii_Concurrent) |
1223 | OldestXmin = GetOldestXmin(heapRelation, PROCARRAY_FLAGS_VACUUM); |
1224 | |
1225 | if (!scan) |
1226 | { |
1227 | /* |
1228 | * Serial index build. |
1229 | * |
1230 | * Must begin our own heap scan in this case. We may also need to |
1231 | * register a snapshot whose lifetime is under our direct control. |
1232 | */ |
1233 | if (!TransactionIdIsValid(OldestXmin)) |
1234 | { |
1235 | snapshot = RegisterSnapshot(GetTransactionSnapshot()); |
1236 | need_unregister_snapshot = true; |
1237 | } |
1238 | else |
1239 | snapshot = SnapshotAny; |
1240 | |
1241 | scan = table_beginscan_strat(heapRelation, /* relation */ |
1242 | snapshot, /* snapshot */ |
1243 | 0, /* number of keys */ |
1244 | NULL, /* scan key */ |
1245 | true, /* buffer access strategy OK */ |
1246 | allow_sync); /* syncscan OK? */ |
1247 | } |
1248 | else |
1249 | { |
1250 | /* |
1251 | * Parallel index build. |
1252 | * |
1253 | * Parallel case never registers/unregisters own snapshot. Snapshot |
1254 | * is taken from parallel heap scan, and is SnapshotAny or an MVCC |
1255 | * snapshot, based on same criteria as serial case. |
1256 | */ |
1257 | Assert(!IsBootstrapProcessingMode()); |
1258 | Assert(allow_sync); |
1259 | snapshot = scan->rs_snapshot; |
1260 | } |
1261 | |
1262 | hscan = (HeapScanDesc) scan; |
1263 | |
1264 | /* Publish number of blocks to scan */ |
1265 | if (progress) |
1266 | { |
1267 | BlockNumber nblocks; |
1268 | |
1269 | if (hscan->rs_base.rs_parallel != NULL) |
1270 | { |
1271 | ParallelBlockTableScanDesc pbscan; |
1272 | |
1273 | pbscan = (ParallelBlockTableScanDesc) hscan->rs_base.rs_parallel; |
1274 | nblocks = pbscan->phs_nblocks; |
1275 | } |
1276 | else |
1277 | nblocks = hscan->rs_nblocks; |
1278 | |
1279 | pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL, |
1280 | nblocks); |
1281 | } |
1282 | |
1283 | /* |
1284 | * Must call GetOldestXmin() with SnapshotAny. Should never call |
1285 | * GetOldestXmin() with MVCC snapshot. (It's especially worth checking |
1286 | * this for parallel builds, since ambuild routines that support parallel |
1287 | * builds must work these details out for themselves.) |
1288 | */ |
1289 | Assert(snapshot == SnapshotAny || IsMVCCSnapshot(snapshot)); |
1290 | Assert(snapshot == SnapshotAny ? TransactionIdIsValid(OldestXmin) : |
1291 | !TransactionIdIsValid(OldestXmin)); |
1292 | Assert(snapshot == SnapshotAny || !anyvisible); |
1293 | |
1294 | /* set our scan endpoints */ |
1295 | if (!allow_sync) |
1296 | heap_setscanlimits(scan, start_blockno, numblocks); |
1297 | else |
1298 | { |
1299 | /* syncscan can only be requested on whole relation */ |
1300 | Assert(start_blockno == 0); |
1301 | Assert(numblocks == InvalidBlockNumber); |
1302 | } |
1303 | |
1304 | reltuples = 0; |
1305 | |
1306 | /* |
1307 | * Scan all tuples in the base relation. |
1308 | */ |
1309 | while ((heapTuple = heap_getnext(scan, ForwardScanDirection)) != NULL) |
1310 | { |
1311 | bool tupleIsAlive; |
1312 | |
1313 | CHECK_FOR_INTERRUPTS(); |
1314 | |
1315 | /* Report scan progress, if asked to. */ |
1316 | if (progress) |
1317 | { |
1318 | BlockNumber blocks_done = heapam_scan_get_blocks_done(hscan); |
1319 | |
1320 | if (blocks_done != previous_blkno) |
1321 | { |
1322 | pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE, |
1323 | blocks_done); |
1324 | previous_blkno = blocks_done; |
1325 | } |
1326 | } |
1327 | |
1328 | /* |
1329 | * When dealing with a HOT-chain of updated tuples, we want to index |
1330 | * the values of the live tuple (if any), but index it under the TID |
1331 | * of the chain's root tuple. This approach is necessary to preserve |
1332 | * the HOT-chain structure in the heap. So we need to be able to find |
1333 | * the root item offset for every tuple that's in a HOT-chain. When |
1334 | * first reaching a new page of the relation, call |
1335 | * heap_get_root_tuples() to build a map of root item offsets on the |
1336 | * page. |
1337 | * |
1338 | * It might look unsafe to use this information across buffer |
1339 | * lock/unlock. However, we hold ShareLock on the table so no |
1340 | * ordinary insert/update/delete should occur; and we hold pin on the |
1341 | * buffer continuously while visiting the page, so no pruning |
1342 | * operation can occur either. |
1343 | * |
1344 | * Also, although our opinions about tuple liveness could change while |
1345 | * we scan the page (due to concurrent transaction commits/aborts), |
1346 | * the chain root locations won't, so this info doesn't need to be |
1347 | * rebuilt after waiting for another transaction. |
1348 | * |
1349 | * Note the implied assumption that there is no more than one live |
1350 | * tuple per HOT-chain --- else we could create more than one index |
1351 | * entry pointing to the same root tuple. |
1352 | */ |
1353 | if (hscan->rs_cblock != root_blkno) |
1354 | { |
1355 | Page page = BufferGetPage(hscan->rs_cbuf); |
1356 | |
1357 | LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE); |
1358 | heap_get_root_tuples(page, root_offsets); |
1359 | LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); |
1360 | |
1361 | root_blkno = hscan->rs_cblock; |
1362 | } |
1363 | |
1364 | if (snapshot == SnapshotAny) |
1365 | { |
1366 | /* do our own time qual check */ |
1367 | bool indexIt; |
1368 | TransactionId xwait; |
1369 | |
1370 | recheck: |
1371 | |
1372 | /* |
1373 | * We could possibly get away with not locking the buffer here, |
1374 | * since caller should hold ShareLock on the relation, but let's |
1375 | * be conservative about it. (This remark is still correct even |
1376 | * with HOT-pruning: our pin on the buffer prevents pruning.) |
1377 | */ |
1378 | LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE); |
1379 | |
1380 | /* |
1381 | * The criteria for counting a tuple as live in this block need to |
1382 | * match what analyze.c's heapam_scan_analyze_next_tuple() does, |
1383 | * otherwise CREATE INDEX and ANALYZE may produce wildly different |
1384 | * reltuples values, e.g. when there are many recently-dead |
1385 | * tuples. |
1386 | */ |
1387 | switch (HeapTupleSatisfiesVacuum(heapTuple, OldestXmin, |
1388 | hscan->rs_cbuf)) |
1389 | { |
1390 | case HEAPTUPLE_DEAD: |
1391 | /* Definitely dead, we can ignore it */ |
1392 | indexIt = false; |
1393 | tupleIsAlive = false; |
1394 | break; |
1395 | case HEAPTUPLE_LIVE: |
1396 | /* Normal case, index and unique-check it */ |
1397 | indexIt = true; |
1398 | tupleIsAlive = true; |
1399 | /* Count it as live, too */ |
1400 | reltuples += 1; |
1401 | break; |
1402 | case HEAPTUPLE_RECENTLY_DEAD: |
1403 | |
1404 | /* |
1405 | * If tuple is recently deleted then we must index it |
1406 | * anyway to preserve MVCC semantics. (Pre-existing |
1407 | * transactions could try to use the index after we finish |
1408 | * building it, and may need to see such tuples.) |
1409 | * |
1410 | * However, if it was HOT-updated then we must only index |
1411 | * the live tuple at the end of the HOT-chain. Since this |
1412 | * breaks semantics for pre-existing snapshots, mark the |
1413 | * index as unusable for them. |
1414 | * |
1415 | * We don't count recently-dead tuples in reltuples, even |
1416 | * if we index them; see heapam_scan_analyze_next_tuple(). |
1417 | */ |
1418 | if (HeapTupleIsHotUpdated(heapTuple)) |
1419 | { |
1420 | indexIt = false; |
1421 | /* mark the index as unsafe for old snapshots */ |
1422 | indexInfo->ii_BrokenHotChain = true; |
1423 | } |
1424 | else |
1425 | indexIt = true; |
1426 | /* In any case, exclude the tuple from unique-checking */ |
1427 | tupleIsAlive = false; |
1428 | break; |
1429 | case HEAPTUPLE_INSERT_IN_PROGRESS: |
1430 | |
1431 | /* |
1432 | * In "anyvisible" mode, this tuple is visible and we |
1433 | * don't need any further checks. |
1434 | */ |
1435 | if (anyvisible) |
1436 | { |
1437 | indexIt = true; |
1438 | tupleIsAlive = true; |
1439 | reltuples += 1; |
1440 | break; |
1441 | } |
1442 | |
1443 | /* |
1444 | * Since caller should hold ShareLock or better, normally |
1445 | * the only way to see this is if it was inserted earlier |
1446 | * in our own transaction. However, it can happen in |
1447 | * system catalogs, since we tend to release write lock |
1448 | * before commit there. Give a warning if neither case |
1449 | * applies. |
1450 | */ |
1451 | xwait = HeapTupleHeaderGetXmin(heapTuple->t_data); |
1452 | if (!TransactionIdIsCurrentTransactionId(xwait)) |
1453 | { |
1454 | if (!is_system_catalog) |
							elog(WARNING, "concurrent insert in progress within table \"%s\"",
1456 | RelationGetRelationName(heapRelation)); |
1457 | |
1458 | /* |
1459 | * If we are performing uniqueness checks, indexing |
1460 | * such a tuple could lead to a bogus uniqueness |
1461 | * failure. In that case we wait for the inserting |
1462 | * transaction to finish and check again. |
1463 | */ |
1464 | if (checking_uniqueness) |
1465 | { |
1466 | /* |
1467 | * Must drop the lock on the buffer before we wait |
1468 | */ |
1469 | LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); |
1470 | XactLockTableWait(xwait, heapRelation, |
1471 | &heapTuple->t_self, |
1472 | XLTW_InsertIndexUnique); |
1473 | CHECK_FOR_INTERRUPTS(); |
1474 | goto recheck; |
1475 | } |
1476 | } |
1477 | else |
1478 | { |
1479 | /* |
1480 | * For consistency with |
1481 | * heapam_scan_analyze_next_tuple(), count |
1482 | * HEAPTUPLE_INSERT_IN_PROGRESS tuples as live only |
1483 | * when inserted by our own transaction. |
1484 | */ |
1485 | reltuples += 1; |
1486 | } |
1487 | |
1488 | /* |
1489 | * We must index such tuples, since if the index build |
1490 | * commits then they're good. |
1491 | */ |
1492 | indexIt = true; |
1493 | tupleIsAlive = true; |
1494 | break; |
1495 | case HEAPTUPLE_DELETE_IN_PROGRESS: |
1496 | |
1497 | /* |
1498 | * As with INSERT_IN_PROGRESS case, this is unexpected |
1499 | * unless it's our own deletion or a system catalog; but |
1500 | * in anyvisible mode, this tuple is visible. |
1501 | */ |
1502 | if (anyvisible) |
1503 | { |
1504 | indexIt = true; |
1505 | tupleIsAlive = false; |
1506 | reltuples += 1; |
1507 | break; |
1508 | } |
1509 | |
1510 | xwait = HeapTupleHeaderGetUpdateXid(heapTuple->t_data); |
1511 | if (!TransactionIdIsCurrentTransactionId(xwait)) |
1512 | { |
1513 | if (!is_system_catalog) |
							elog(WARNING, "concurrent delete in progress within table \"%s\"",
1515 | RelationGetRelationName(heapRelation)); |
1516 | |
1517 | /* |
1518 | * If we are performing uniqueness checks, assuming |
1519 | * the tuple is dead could lead to missing a |
1520 | * uniqueness violation. In that case we wait for the |
1521 | * deleting transaction to finish and check again. |
1522 | * |
1523 | * Also, if it's a HOT-updated tuple, we should not |
1524 | * index it but rather the live tuple at the end of |
1525 | * the HOT-chain. However, the deleting transaction |
1526 | * could abort, possibly leaving this tuple as live |
1527 | * after all, in which case it has to be indexed. The |
1528 | * only way to know what to do is to wait for the |
1529 | * deleting transaction to finish and check again. |
1530 | */ |
1531 | if (checking_uniqueness || |
1532 | HeapTupleIsHotUpdated(heapTuple)) |
1533 | { |
1534 | /* |
1535 | * Must drop the lock on the buffer before we wait |
1536 | */ |
1537 | LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); |
1538 | XactLockTableWait(xwait, heapRelation, |
1539 | &heapTuple->t_self, |
1540 | XLTW_InsertIndexUnique); |
1541 | CHECK_FOR_INTERRUPTS(); |
1542 | goto recheck; |
1543 | } |
1544 | |
1545 | /* |
1546 | * Otherwise index it but don't check for uniqueness, |
1547 | * the same as a RECENTLY_DEAD tuple. |
1548 | */ |
1549 | indexIt = true; |
1550 | |
1551 | /* |
1552 | * Count HEAPTUPLE_DELETE_IN_PROGRESS tuples as live, |
1553 | * if they were not deleted by the current |
1554 | * transaction. That's what |
1555 | * heapam_scan_analyze_next_tuple() does, and we want |
1556 | * the behavior to be consistent. |
1557 | */ |
1558 | reltuples += 1; |
1559 | } |
1560 | else if (HeapTupleIsHotUpdated(heapTuple)) |
1561 | { |
1562 | /* |
1563 | * It's a HOT-updated tuple deleted by our own xact. |
1564 | * We can assume the deletion will commit (else the |
1565 | * index contents don't matter), so treat the same as |
1566 | * RECENTLY_DEAD HOT-updated tuples. |
1567 | */ |
1568 | indexIt = false; |
1569 | /* mark the index as unsafe for old snapshots */ |
1570 | indexInfo->ii_BrokenHotChain = true; |
1571 | } |
1572 | else |
1573 | { |
1574 | /* |
1575 | * It's a regular tuple deleted by our own xact. Index |
1576 | * it, but don't check for uniqueness nor count in |
1577 | * reltuples, the same as a RECENTLY_DEAD tuple. |
1578 | */ |
1579 | indexIt = true; |
1580 | } |
1581 | /* In any case, exclude the tuple from unique-checking */ |
1582 | tupleIsAlive = false; |
1583 | break; |
1584 | default: |
					elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
1586 | indexIt = tupleIsAlive = false; /* keep compiler quiet */ |
1587 | break; |
1588 | } |
1589 | |
1590 | LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); |
1591 | |
1592 | if (!indexIt) |
1593 | continue; |
1594 | } |
1595 | else |
1596 | { |
1597 | /* heap_getnext did the time qual check */ |
1598 | tupleIsAlive = true; |
1599 | reltuples += 1; |
1600 | } |
1601 | |
1602 | MemoryContextReset(econtext->ecxt_per_tuple_memory); |
1603 | |
1604 | /* Set up for predicate or expression evaluation */ |
1605 | ExecStoreBufferHeapTuple(heapTuple, slot, hscan->rs_cbuf); |
1606 | |
1607 | /* |
1608 | * In a partial index, discard tuples that don't satisfy the |
1609 | * predicate. |
1610 | */ |
1611 | if (predicate != NULL) |
1612 | { |
1613 | if (!ExecQual(predicate, econtext)) |
1614 | continue; |
1615 | } |
1616 | |
1617 | /* |
1618 | * For the current heap tuple, extract all the attributes we use in |
1619 | * this index, and note which are null. This also performs evaluation |
1620 | * of any expressions needed. |
1621 | */ |
1622 | FormIndexDatum(indexInfo, |
1623 | slot, |
1624 | estate, |
1625 | values, |
1626 | isnull); |
1627 | |
1628 | /* |
1629 | * You'd think we should go ahead and build the index tuple here, but |
1630 | * some index AMs want to do further processing on the data first. So |
1631 | * pass the values[] and isnull[] arrays, instead. |
1632 | */ |
1633 | |
1634 | if (HeapTupleIsHeapOnly(heapTuple)) |
1635 | { |
1636 | /* |
1637 | * For a heap-only tuple, pretend its TID is that of the root. See |
1638 | * src/backend/access/heap/README.HOT for discussion. |
1639 | */ |
1640 | HeapTupleData rootTuple; |
1641 | OffsetNumber offnum; |
1642 | |
1643 | rootTuple = *heapTuple; |
1644 | offnum = ItemPointerGetOffsetNumber(&heapTuple->t_self); |
1645 | |
1646 | if (!OffsetNumberIsValid(root_offsets[offnum - 1])) |
1647 | ereport(ERROR, |
1648 | (errcode(ERRCODE_DATA_CORRUPTED), |
1649 | errmsg_internal("failed to find parent tuple for heap-only tuple at (%u,%u) in table \"%s\"" , |
1650 | ItemPointerGetBlockNumber(&heapTuple->t_self), |
1651 | offnum, |
1652 | RelationGetRelationName(heapRelation)))); |
1653 | |
1654 | ItemPointerSetOffsetNumber(&rootTuple.t_self, |
1655 | root_offsets[offnum - 1]); |
1656 | |
1657 | /* Call the AM's callback routine to process the tuple */ |
1658 | callback(indexRelation, &rootTuple, values, isnull, tupleIsAlive, |
1659 | callback_state); |
1660 | } |
1661 | else |
1662 | { |
1663 | /* Call the AM's callback routine to process the tuple */ |
1664 | callback(indexRelation, heapTuple, values, isnull, tupleIsAlive, |
1665 | callback_state); |
1666 | } |
1667 | } |
1668 | |
1669 | /* Report scan progress one last time. */ |
1670 | if (progress) |
1671 | { |
1672 | BlockNumber blks_done; |
1673 | |
1674 | if (hscan->rs_base.rs_parallel != NULL) |
1675 | { |
1676 | ParallelBlockTableScanDesc pbscan; |
1677 | |
1678 | pbscan = (ParallelBlockTableScanDesc) hscan->rs_base.rs_parallel; |
1679 | blks_done = pbscan->phs_nblocks; |
1680 | } |
1681 | else |
1682 | blks_done = hscan->rs_nblocks; |
1683 | |
1684 | pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE, |
1685 | blks_done); |
1686 | } |
1687 | |
1688 | table_endscan(scan); |
1689 | |
1690 | /* we can now forget our snapshot, if set and registered by us */ |
1691 | if (need_unregister_snapshot) |
1692 | UnregisterSnapshot(snapshot); |
1693 | |
1694 | ExecDropSingleTupleTableSlot(slot); |
1695 | |
1696 | FreeExecutorState(estate); |
1697 | |
1698 | /* These may have been pointing to the now-gone estate */ |
1699 | indexInfo->ii_ExpressionsState = NIL; |
1700 | indexInfo->ii_PredicateState = NULL; |
1701 | |
1702 | return reltuples; |
1703 | } |
1704 | |
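/*
 * Callback for the index_validate_scan table AM interface.
 *
 * Scan the heap for tuples satisfying the given reference snapshot and merge
 * that stream against the sorted TIDs already collected from the index (in
 * state->tuplesort), inserting index entries for any heap tuples that turn
 * out to be missing.  This is used by validate_index() during concurrent
 * index builds.
 */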
1705 | static void |
1706 | heapam_index_validate_scan(Relation heapRelation, |
1707 | Relation indexRelation, |
1708 | IndexInfo *indexInfo, |
1709 | Snapshot snapshot, |
1710 | ValidateIndexState *state) |
1711 | { |
1712 | TableScanDesc scan; |
1713 | HeapScanDesc hscan; |
1714 | HeapTuple heapTuple; |
1715 | Datum values[INDEX_MAX_KEYS]; |
1716 | bool isnull[INDEX_MAX_KEYS]; |
1717 | ExprState *predicate; |
1718 | TupleTableSlot *slot; |
1719 | EState *estate; |
1720 | ExprContext *econtext; |
1721 | BlockNumber root_blkno = InvalidBlockNumber; |
1722 | OffsetNumber root_offsets[MaxHeapTuplesPerPage]; |
1723 | bool in_index[MaxHeapTuplesPerPage]; |
1724 | BlockNumber previous_blkno = InvalidBlockNumber; |
1725 | |
1726 | /* state variables for the merge */ |
1727 | ItemPointer indexcursor = NULL; |
1728 | ItemPointerData decoded; |
1729 | bool tuplesort_empty = false; |
1730 | |
1731 | /* |
1732 | * sanity checks |
1733 | */ |
1734 | Assert(OidIsValid(indexRelation->rd_rel->relam)); |
1735 | |
1736 | /* |
1737 | * Need an EState for evaluation of index expressions and partial-index |
1738 | * predicates. Also a slot to hold the current tuple. |
1739 | */ |
1740 | estate = CreateExecutorState(); |
1741 | econtext = GetPerTupleExprContext(estate); |
1742 | slot = MakeSingleTupleTableSlot(RelationGetDescr(heapRelation), |
1743 | &TTSOpsHeapTuple); |
1744 | |
1745 | /* Arrange for econtext's scan tuple to be the tuple under test */ |
1746 | econtext->ecxt_scantuple = slot; |
1747 | |
1748 | /* Set up execution state for predicate, if any. */ |
1749 | predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); |
1750 | |
1751 | /* |
1752 | * Prepare for scan of the base relation. We need just those tuples |
1753 | * satisfying the passed-in reference snapshot. We must disable syncscan |
1754 | * here, because it's critical that we read from block zero forward to |
1755 | * match the sorted TIDs. |
1756 | */ |
1757 | scan = table_beginscan_strat(heapRelation, /* relation */ |
1758 | snapshot, /* snapshot */ |
1759 | 0, /* number of keys */ |
1760 | NULL, /* scan key */ |
1761 | true, /* buffer access strategy OK */ |
1762 | false); /* syncscan not OK */ |
1763 | hscan = (HeapScanDesc) scan; |
1764 | |
1765 | pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL, |
1766 | hscan->rs_nblocks); |
1767 | |
1768 | /* |
1769 | * Scan all tuples matching the snapshot. |
1770 | */ |
1771 | while ((heapTuple = heap_getnext(scan, ForwardScanDirection)) != NULL) |
1772 | { |
1773 | ItemPointer heapcursor = &heapTuple->t_self; |
1774 | ItemPointerData rootTuple; |
1775 | OffsetNumber root_offnum; |
1776 | |
1777 | CHECK_FOR_INTERRUPTS(); |
1778 | |
1779 | state->htups += 1; |
1780 | |
1781 | if ((previous_blkno == InvalidBlockNumber) || |
1782 | (hscan->rs_cblock != previous_blkno)) |
1783 | { |
1784 | pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE, |
1785 | hscan->rs_cblock); |
1786 | previous_blkno = hscan->rs_cblock; |
1787 | } |
1788 | |
1789 | /* |
1790 | * As commented in table_index_build_scan, we should index heap-only |
1791 | * tuples under the TIDs of their root tuples; so when we advance onto |
1792 | * a new heap page, build a map of root item offsets on the page. |
1793 | * |
1794 | * This complicates merging against the tuplesort output: we will |
1795 | * visit the live tuples in order by their offsets, but the root |
1796 | * offsets that we need to compare against the index contents might be |
1797 | * ordered differently. So we might have to "look back" within the |
1798 | * tuplesort output, but only within the current page. We handle that |
1799 | * by keeping a bool array in_index[] showing all the |
1800 | * already-passed-over tuplesort output TIDs of the current page. We |
1801 | * clear that array here, when advancing onto a new heap page. |
1802 | */ |
1803 | if (hscan->rs_cblock != root_blkno) |
1804 | { |
1805 | Page page = BufferGetPage(hscan->rs_cbuf); |
1806 | |
1807 | LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE); |
1808 | heap_get_root_tuples(page, root_offsets); |
1809 | LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); |
1810 | |
1811 | memset(in_index, 0, sizeof(in_index)); |
1812 | |
1813 | root_blkno = hscan->rs_cblock; |
1814 | } |
1815 | |
1816 | /* Convert actual tuple TID to root TID */ |
1817 | rootTuple = *heapcursor; |
1818 | root_offnum = ItemPointerGetOffsetNumber(heapcursor); |
1819 | |
1820 | if (HeapTupleIsHeapOnly(heapTuple)) |
1821 | { |
1822 | root_offnum = root_offsets[root_offnum - 1]; |
1823 | if (!OffsetNumberIsValid(root_offnum)) |
1824 | ereport(ERROR, |
1825 | (errcode(ERRCODE_DATA_CORRUPTED), |
1826 | errmsg_internal("failed to find parent tuple for heap-only tuple at (%u,%u) in table \"%s\"" , |
1827 | ItemPointerGetBlockNumber(heapcursor), |
1828 | ItemPointerGetOffsetNumber(heapcursor), |
1829 | RelationGetRelationName(heapRelation)))); |
1830 | ItemPointerSetOffsetNumber(&rootTuple, root_offnum); |
1831 | } |
1832 | |
1833 | /* |
1834 | * "merge" by skipping through the index tuples until we find or pass |
1835 | * the current root tuple. |
1836 | */ |
1837 | while (!tuplesort_empty && |
1838 | (!indexcursor || |
1839 | ItemPointerCompare(indexcursor, &rootTuple) < 0)) |
1840 | { |
1841 | Datum ts_val; |
1842 | bool ts_isnull; |
1843 | |
1844 | if (indexcursor) |
1845 | { |
1846 | /* |
1847 | * Remember index items seen earlier on the current heap page |
1848 | */ |
1849 | if (ItemPointerGetBlockNumber(indexcursor) == root_blkno) |
1850 | in_index[ItemPointerGetOffsetNumber(indexcursor) - 1] = true; |
1851 | } |
1852 | |
1853 | tuplesort_empty = !tuplesort_getdatum(state->tuplesort, true, |
1854 | &ts_val, &ts_isnull, NULL); |
1855 | Assert(tuplesort_empty || !ts_isnull); |
1856 | if (!tuplesort_empty) |
1857 | { |
1858 | itemptr_decode(&decoded, DatumGetInt64(ts_val)); |
1859 | indexcursor = &decoded; |
1860 | |
1861 | /* If int8 is pass-by-ref, free (encoded) TID Datum memory */ |
1862 | #ifndef USE_FLOAT8_BYVAL |
1863 | pfree(DatumGetPointer(ts_val)); |
1864 | #endif |
1865 | } |
1866 | else |
1867 | { |
1868 | /* Be tidy */ |
1869 | indexcursor = NULL; |
1870 | } |
1871 | } |
1872 | |
1873 | /* |
1874 | * If the tuplesort has overshot *and* we didn't see a match earlier, |
1875 | * then this tuple is missing from the index, so insert it. |
1876 | */ |
1877 | if ((tuplesort_empty || |
1878 | ItemPointerCompare(indexcursor, &rootTuple) > 0) && |
1879 | !in_index[root_offnum - 1]) |
1880 | { |
1881 | MemoryContextReset(econtext->ecxt_per_tuple_memory); |
1882 | |
1883 | /* Set up for predicate or expression evaluation */ |
1884 | ExecStoreHeapTuple(heapTuple, slot, false); |
1885 | |
1886 | /* |
1887 | * In a partial index, discard tuples that don't satisfy the |
1888 | * predicate. |
1889 | */ |
1890 | if (predicate != NULL) |
1891 | { |
1892 | if (!ExecQual(predicate, econtext)) |
1893 | continue; |
1894 | } |
1895 | |
1896 | /* |
1897 | * For the current heap tuple, extract all the attributes we use |
1898 | * in this index, and note which are null. This also performs |
1899 | * evaluation of any expressions needed. |
1900 | */ |
1901 | FormIndexDatum(indexInfo, |
1902 | slot, |
1903 | estate, |
1904 | values, |
1905 | isnull); |
1906 | |
1907 | /* |
1908 | * You'd think we should go ahead and build the index tuple here, |
1909 | * but some index AMs want to do further processing on the data |
1910 | * first. So pass the values[] and isnull[] arrays, instead. |
1911 | */ |
1912 | |
1913 | /* |
1914 | * If the tuple is already committed dead, you might think we |
1915 | * could suppress uniqueness checking, but this is no longer true |
1916 | * in the presence of HOT, because the insert is actually a proxy |
1917 | * for a uniqueness check on the whole HOT-chain. That is, the |
1918 | * tuple we have here could be dead because it was already |
1919 | * HOT-updated, and if so the updating transaction will not have |
1920 | * thought it should insert index entries. The index AM will |
1921 | * check the whole HOT-chain and correctly detect a conflict if |
1922 | * there is one. |
1923 | */ |
1924 | |
1925 | index_insert(indexRelation, |
1926 | values, |
1927 | isnull, |
1928 | &rootTuple, |
1929 | heapRelation, |
1930 | indexInfo->ii_Unique ? |
1931 | UNIQUE_CHECK_YES : UNIQUE_CHECK_NO, |
1932 | indexInfo); |
1933 | |
1934 | state->tups_inserted += 1; |
1935 | } |
1936 | } |
1937 | |
1938 | table_endscan(scan); |
1939 | |
1940 | ExecDropSingleTupleTableSlot(slot); |
1941 | |
1942 | FreeExecutorState(estate); |
1943 | |
1944 | /* These may have been pointing to the now-gone estate */ |
1945 | indexInfo->ii_ExpressionsState = NIL; |
1946 | indexInfo->ii_PredicateState = NULL; |
1947 | } |
1948 | |
1949 | /* |
1950 | * Return the number of blocks that have been read by this scan since |
 * starting.  This is meant for progress reporting rather than to be fully
1952 | * accurate: in a parallel scan, workers can be concurrently reading blocks |
1953 | * further ahead than what we report. |
1954 | */ |
1955 | static BlockNumber |
1956 | heapam_scan_get_blocks_done(HeapScanDesc hscan) |
1957 | { |
1958 | ParallelBlockTableScanDesc bpscan = NULL; |
1959 | BlockNumber startblock; |
1960 | BlockNumber blocks_done; |
1961 | |
1962 | if (hscan->rs_base.rs_parallel != NULL) |
1963 | { |
1964 | bpscan = (ParallelBlockTableScanDesc) hscan->rs_base.rs_parallel; |
1965 | startblock = bpscan->phs_startblock; |
1966 | } |
1967 | else |
1968 | startblock = hscan->rs_startblock; |
1969 | |
1970 | /* |
1971 | * Might have wrapped around the end of the relation, if startblock was |
1972 | * not zero. |
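	 *
	 * For example (illustrative numbers only): with nblocks = 100,
	 * startblock = 70 and rs_cblock = 10, the scan has wrapped around and
	 * blocks_done = 100 - 70 + 10 = 40.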
1973 | */ |
1974 | if (hscan->rs_cblock > startblock) |
1975 | blocks_done = hscan->rs_cblock - startblock; |
1976 | else |
1977 | { |
1978 | BlockNumber nblocks; |
1979 | |
1980 | nblocks = bpscan != NULL ? bpscan->phs_nblocks : hscan->rs_nblocks; |
1981 | blocks_done = nblocks - startblock + |
1982 | hscan->rs_cblock; |
1983 | } |
1984 | |
1985 | return blocks_done; |
1986 | } |
1987 | |
1988 | |
1989 | /* ------------------------------------------------------------------------ |
1990 | * Miscellaneous callbacks for the heap AM |
1991 | * ------------------------------------------------------------------------ |
1992 | */ |
1993 | |
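/*
 * Return the current on-disk size of the relation, in bytes, as reported by
 * smgr.  A specific fork can be requested; InvalidForkNumber asks for the
 * total over the relation's forks.
 */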
1994 | static uint64 |
1995 | heapam_relation_size(Relation rel, ForkNumber forkNumber) |
1996 | { |
1997 | uint64 nblocks = 0; |
1998 | |
1999 | /* Open it at the smgr level if not already done */ |
2000 | RelationOpenSmgr(rel); |
2001 | |
2002 | /* InvalidForkNumber indicates returning the size for all forks */ |
2003 | if (forkNumber == InvalidForkNumber) |
2004 | { |
		for (int i = 0; i <= MAX_FORKNUM; i++)
2006 | nblocks += smgrnblocks(rel->rd_smgr, i); |
2007 | } |
2008 | else |
2009 | nblocks = smgrnblocks(rel->rd_smgr, forkNumber); |
2010 | |
2011 | return nblocks * BLCKSZ; |
2012 | } |
2013 | |
2014 | /* |
2015 | * Check to see whether the table needs a TOAST table. It does only if |
2016 | * (1) there are any toastable attributes, and (2) the maximum length |
2017 | * of a tuple could exceed TOAST_TUPLE_THRESHOLD. (We don't want to |
2018 | * create a toast table for something like "f1 varchar(20)".) |
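 *
 * For example, a table whose only variable-width column is "f1 text" does
 * need one: text is toastable and has no bounded maximum length.  A table
 * consisting solely of fixed-width columns (int4, float8, ...) never does,
 * since fixed-length types are not toastable.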
2019 | */ |
2020 | static bool |
2021 | heapam_relation_needs_toast_table(Relation rel) |
2022 | { |
2023 | int32 data_length = 0; |
2024 | bool maxlength_unknown = false; |
2025 | bool has_toastable_attrs = false; |
2026 | TupleDesc tupdesc = rel->rd_att; |
2027 | int32 tuple_length; |
2028 | int i; |
2029 | |
2030 | for (i = 0; i < tupdesc->natts; i++) |
2031 | { |
2032 | Form_pg_attribute att = TupleDescAttr(tupdesc, i); |
2033 | |
2034 | if (att->attisdropped) |
2035 | continue; |
2036 | data_length = att_align_nominal(data_length, att->attalign); |
2037 | if (att->attlen > 0) |
2038 | { |
2039 | /* Fixed-length types are never toastable */ |
2040 | data_length += att->attlen; |
2041 | } |
2042 | else |
2043 | { |
2044 | int32 maxlen = type_maximum_size(att->atttypid, |
2045 | att->atttypmod); |
2046 | |
2047 | if (maxlen < 0) |
2048 | maxlength_unknown = true; |
2049 | else |
2050 | data_length += maxlen; |
2051 | if (att->attstorage != 'p') |
2052 | has_toastable_attrs = true; |
2053 | } |
2054 | } |
2055 | if (!has_toastable_attrs) |
2056 | return false; /* nothing to toast? */ |
2057 | if (maxlength_unknown) |
2058 | return true; /* any unlimited-length attrs? */ |
2059 | tuple_length = MAXALIGN(SizeofHeapTupleHeader + |
2060 | BITMAPLEN(tupdesc->natts)) + |
2061 | MAXALIGN(data_length); |
2062 | return (tuple_length > TOAST_TUPLE_THRESHOLD); |
2063 | } |
2064 | |
2065 | |
2066 | /* ------------------------------------------------------------------------ |
2067 | * Planner related callbacks for the heap AM |
2068 | * ------------------------------------------------------------------------ |
2069 | */ |
2070 | |
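/*
 * Estimate the current size of a heap relation for the planner: number of
 * pages and tuples, plus the fraction of pages that are all-visible.  This
 * combines the actual number of blocks on disk with the tuple density
 * implied by the relation's pg_class entry (or, lacking that, by the
 * attribute datatype widths).
 */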
2071 | static void |
2072 | heapam_estimate_rel_size(Relation rel, int32 *attr_widths, |
2073 | BlockNumber *pages, double *tuples, |
2074 | double *allvisfrac) |
2075 | { |
2076 | BlockNumber curpages; |
2077 | BlockNumber relpages; |
2078 | double reltuples; |
2079 | BlockNumber relallvisible; |
2080 | double density; |
2081 | |
2082 | /* it has storage, ok to call the smgr */ |
2083 | curpages = RelationGetNumberOfBlocks(rel); |
2084 | |
2085 | /* coerce values in pg_class to more desirable types */ |
2086 | relpages = (BlockNumber) rel->rd_rel->relpages; |
2087 | reltuples = (double) rel->rd_rel->reltuples; |
2088 | relallvisible = (BlockNumber) rel->rd_rel->relallvisible; |
2089 | |
2090 | /* |
2091 | * HACK: if the relation has never yet been vacuumed, use a minimum size |
2092 | * estimate of 10 pages. The idea here is to avoid assuming a |
2093 | * newly-created table is really small, even if it currently is, because |
2094 | * that may not be true once some data gets loaded into it. Once a vacuum |
2095 | * or analyze cycle has been done on it, it's more reasonable to believe |
2096 | * the size is somewhat stable. |
2097 | * |
2098 | * (Note that this is only an issue if the plan gets cached and used again |
2099 | * after the table has been filled. What we're trying to avoid is using a |
2100 | * nestloop-type plan on a table that has grown substantially since the |
2101 | * plan was made. Normally, autovacuum/autoanalyze will occur once enough |
2102 | * inserts have happened and cause cached-plan invalidation; but that |
2103 | * doesn't happen instantaneously, and it won't happen at all for cases |
2104 | * such as temporary tables.) |
2105 | * |
2106 | * We approximate "never vacuumed" by "has relpages = 0", which means this |
2107 | * will also fire on genuinely empty relations. Not great, but |
2108 | * fortunately that's a seldom-seen case in the real world, and it |
2109 | * shouldn't degrade the quality of the plan too much anyway to err in |
2110 | * this direction. |
2111 | * |
2112 | * If the table has inheritance children, we don't apply this heuristic. |
2113 | * Totally empty parent tables are quite common, so we should be willing |
2114 | * to believe that they are empty. |
2115 | */ |
2116 | if (curpages < 10 && |
2117 | relpages == 0 && |
2118 | !rel->rd_rel->relhassubclass) |
2119 | curpages = 10; |
2120 | |
2121 | /* report estimated # pages */ |
2122 | *pages = curpages; |
2123 | /* quick exit if rel is clearly empty */ |
2124 | if (curpages == 0) |
2125 | { |
2126 | *tuples = 0; |
2127 | *allvisfrac = 0; |
2128 | return; |
2129 | } |
2130 | |
2131 | /* estimate number of tuples from previous tuple density */ |
2132 | if (relpages > 0) |
2133 | density = reltuples / (double) relpages; |
2134 | else |
2135 | { |
2136 | /* |
2137 | * When we have no data because the relation was truncated, estimate |
2138 | * tuple width from attribute datatypes. We assume here that the |
2139 | * pages are completely full, which is OK for tables (since they've |
2140 | * presumably not been VACUUMed yet) but is probably an overestimate |
2141 | * for indexes. Fortunately get_relation_info() can clamp the |
2142 | * overestimate to the parent table's size. |
2143 | * |
2144 | * Note: this code intentionally disregards alignment considerations, |
2145 | * because (a) that would be gilding the lily considering how crude |
2146 | * the estimate is, and (b) it creates platform dependencies in the |
2147 | * default plans which are kind of a headache for regression testing. |
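		 *
		 * Illustration (numbers are examples only): with BLCKSZ = 8192,
		 * SizeOfPageHeaderData = 24 and a computed tuple_width of 100 bytes,
		 * the integer division below yields density = (8192 - 24) / 100 = 81
		 * tuples per page.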
2148 | */ |
2149 | int32 tuple_width; |
2150 | |
2151 | tuple_width = get_rel_data_width(rel, attr_widths); |
2152 | tuple_width += MAXALIGN(SizeofHeapTupleHeader); |
2153 | tuple_width += sizeof(ItemIdData); |
2154 | /* note: integer division is intentional here */ |
2155 | density = (BLCKSZ - SizeOfPageHeaderData) / tuple_width; |
2156 | } |
2157 | *tuples = rint(density * (double) curpages); |
2158 | |
2159 | /* |
2160 | * We use relallvisible as-is, rather than scaling it up like we do for |
2161 | * the pages and tuples counts, on the theory that any pages added since |
2162 | * the last VACUUM are most likely not marked all-visible. But costsize.c |
2163 | * wants it converted to a fraction. |
2164 | */ |
2165 | if (relallvisible == 0 || curpages <= 0) |
2166 | *allvisfrac = 0; |
2167 | else if ((double) relallvisible >= curpages) |
2168 | *allvisfrac = 1; |
2169 | else |
2170 | *allvisfrac = (double) relallvisible / curpages; |
2171 | } |
2172 | |
2173 | |
2174 | /* ------------------------------------------------------------------------ |
2175 | * Executor related callbacks for the heap AM |
2176 | * ------------------------------------------------------------------------ |
2177 | */ |
2178 | |
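/*
 * Prepare the next block of a bitmap heap scan: pin and (possibly) prune the
 * page named by tbmres, collect the offsets of its visible tuples into
 * rs_vistuples[], and return true iff at least one visible tuple was found.
 */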
2179 | static bool |
2180 | heapam_scan_bitmap_next_block(TableScanDesc scan, |
2181 | TBMIterateResult *tbmres) |
2182 | { |
2183 | HeapScanDesc hscan = (HeapScanDesc) scan; |
2184 | BlockNumber page = tbmres->blockno; |
2185 | Buffer buffer; |
2186 | Snapshot snapshot; |
2187 | int ntup; |
2188 | |
2189 | hscan->rs_cindex = 0; |
2190 | hscan->rs_ntuples = 0; |
2191 | |
2192 | /* |
2193 | * Ignore any claimed entries past what we think is the end of the |
2194 | * relation. It may have been extended after the start of our scan (we |
2195 | * only hold an AccessShareLock, and it could be inserts from this |
2196 | * backend). |
2197 | */ |
2198 | if (page >= hscan->rs_nblocks) |
2199 | return false; |
2200 | |
2201 | /* |
2202 | * Acquire pin on the target heap page, trading in any pin we held before. |
2203 | */ |
2204 | hscan->rs_cbuf = ReleaseAndReadBuffer(hscan->rs_cbuf, |
2205 | scan->rs_rd, |
2206 | page); |
2207 | hscan->rs_cblock = page; |
2208 | buffer = hscan->rs_cbuf; |
2209 | snapshot = scan->rs_snapshot; |
2210 | |
2211 | ntup = 0; |
2212 | |
2213 | /* |
2214 | * Prune and repair fragmentation for the whole page, if possible. |
2215 | */ |
2216 | heap_page_prune_opt(scan->rs_rd, buffer); |
2217 | |
2218 | /* |
2219 | * We must hold share lock on the buffer content while examining tuple |
2220 | * visibility. Afterwards, however, the tuples we have found to be |
2221 | * visible are guaranteed good as long as we hold the buffer pin. |
2222 | */ |
2223 | LockBuffer(buffer, BUFFER_LOCK_SHARE); |
2224 | |
2225 | /* |
2226 | * We need two separate strategies for lossy and non-lossy cases. |
2227 | */ |
2228 | if (tbmres->ntuples >= 0) |
2229 | { |
2230 | /* |
2231 | * Bitmap is non-lossy, so we just look through the offsets listed in |
2232 | * tbmres; but we have to follow any HOT chain starting at each such |
2233 | * offset. |
2234 | */ |
2235 | int curslot; |
2236 | |
2237 | for (curslot = 0; curslot < tbmres->ntuples; curslot++) |
2238 | { |
2239 | OffsetNumber offnum = tbmres->offsets[curslot]; |
2240 | ItemPointerData tid; |
2241 | HeapTupleData heapTuple; |
2242 | |
2243 | ItemPointerSet(&tid, page, offnum); |
2244 | if (heap_hot_search_buffer(&tid, scan->rs_rd, buffer, snapshot, |
2245 | &heapTuple, NULL, true)) |
2246 | hscan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid); |
2247 | } |
2248 | } |
2249 | else |
2250 | { |
2251 | /* |
2252 | * Bitmap is lossy, so we must examine each line pointer on the page. |
2253 | * But we can ignore HOT chains, since we'll check each tuple anyway. |
2254 | */ |
2255 | Page dp = (Page) BufferGetPage(buffer); |
2256 | OffsetNumber maxoff = PageGetMaxOffsetNumber(dp); |
2257 | OffsetNumber offnum; |
2258 | |
2259 | for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) |
2260 | { |
2261 | ItemId lp; |
2262 | HeapTupleData loctup; |
2263 | bool valid; |
2264 | |
2265 | lp = PageGetItemId(dp, offnum); |
2266 | if (!ItemIdIsNormal(lp)) |
2267 | continue; |
2268 | loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp); |
2269 | loctup.t_len = ItemIdGetLength(lp); |
2270 | loctup.t_tableOid = scan->rs_rd->rd_id; |
2271 | ItemPointerSet(&loctup.t_self, page, offnum); |
2272 | valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer); |
2273 | if (valid) |
2274 | { |
2275 | hscan->rs_vistuples[ntup++] = offnum; |
2276 | PredicateLockTuple(scan->rs_rd, &loctup, snapshot); |
2277 | } |
2278 | CheckForSerializableConflictOut(valid, scan->rs_rd, &loctup, |
2279 | buffer, snapshot); |
2280 | } |
2281 | } |
2282 | |
2283 | LockBuffer(buffer, BUFFER_LOCK_UNLOCK); |
2284 | |
2285 | Assert(ntup <= MaxHeapTuplesPerPage); |
2286 | hscan->rs_ntuples = ntup; |
2287 | |
2288 | return ntup > 0; |
2289 | } |
2290 | |
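/*
 * Return the next visible tuple from the page prepared by
 * heapam_scan_bitmap_next_block, storing it in the given slot; return false
 * once the page's visible tuples are exhausted.
 */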
2291 | static bool |
2292 | heapam_scan_bitmap_next_tuple(TableScanDesc scan, |
2293 | TBMIterateResult *tbmres, |
2294 | TupleTableSlot *slot) |
2295 | { |
2296 | HeapScanDesc hscan = (HeapScanDesc) scan; |
2297 | OffsetNumber targoffset; |
2298 | Page dp; |
2299 | ItemId lp; |
2300 | |
2301 | /* |
2302 | * Out of range? If so, nothing more to look at on this page |
2303 | */ |
2304 | if (hscan->rs_cindex < 0 || hscan->rs_cindex >= hscan->rs_ntuples) |
2305 | return false; |
2306 | |
2307 | targoffset = hscan->rs_vistuples[hscan->rs_cindex]; |
2308 | dp = (Page) BufferGetPage(hscan->rs_cbuf); |
2309 | lp = PageGetItemId(dp, targoffset); |
2310 | Assert(ItemIdIsNormal(lp)); |
2311 | |
2312 | hscan->rs_ctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp); |
2313 | hscan->rs_ctup.t_len = ItemIdGetLength(lp); |
2314 | hscan->rs_ctup.t_tableOid = scan->rs_rd->rd_id; |
2315 | ItemPointerSet(&hscan->rs_ctup.t_self, hscan->rs_cblock, targoffset); |
2316 | |
2317 | pgstat_count_heap_fetch(scan->rs_rd); |
2318 | |
2319 | /* |
2320 | * Set up the result slot to point to this tuple. Note that the slot |
2321 | * acquires a pin on the buffer. |
2322 | */ |
2323 | ExecStoreBufferHeapTuple(&hscan->rs_ctup, |
2324 | slot, |
2325 | hscan->rs_cbuf); |
2326 | |
2327 | hscan->rs_cindex++; |
2328 | |
2329 | return true; |
2330 | } |
2331 | |
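/*
 * Select and read the next block of a TABLESAMPLE scan, either by asking the
 * tablesample method (NextSampleBlock) or by advancing sequentially through
 * the relation.  Returns false, releasing the current buffer pin, once the
 * scan is complete.
 */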
2332 | static bool |
2333 | heapam_scan_sample_next_block(TableScanDesc scan, SampleScanState *scanstate) |
2334 | { |
2335 | HeapScanDesc hscan = (HeapScanDesc) scan; |
2336 | TsmRoutine *tsm = scanstate->tsmroutine; |
2337 | BlockNumber blockno; |
2338 | |
2339 | /* return false immediately if relation is empty */ |
2340 | if (hscan->rs_nblocks == 0) |
2341 | return false; |
2342 | |
2343 | if (tsm->NextSampleBlock) |
2344 | { |
2345 | blockno = tsm->NextSampleBlock(scanstate, hscan->rs_nblocks); |
2346 | hscan->rs_cblock = blockno; |
2347 | } |
2348 | else |
2349 | { |
2350 | /* scanning table sequentially */ |
2351 | |
2352 | if (hscan->rs_cblock == InvalidBlockNumber) |
2353 | { |
2354 | Assert(!hscan->rs_inited); |
2355 | blockno = hscan->rs_startblock; |
2356 | } |
2357 | else |
2358 | { |
2359 | Assert(hscan->rs_inited); |
2360 | |
2361 | blockno = hscan->rs_cblock + 1; |
2362 | |
2363 | if (blockno >= hscan->rs_nblocks) |
2364 | { |
2365 | /* wrap to beginning of rel, might not have started at 0 */ |
2366 | blockno = 0; |
2367 | } |
2368 | |
2369 | /* |
2370 | * Report our new scan position for synchronization purposes. |
2371 | * |
2372 | * Note: we do this before checking for end of scan so that the |
2373 | * final state of the position hint is back at the start of the |
2374 | * rel. That's not strictly necessary, but otherwise when you run |
2375 | * the same query multiple times the starting position would shift |
2376 | * a little bit backwards on every invocation, which is confusing. |
2377 | * We don't guarantee any specific ordering in general, though. |
2378 | */ |
2379 | if (scan->rs_flags & SO_ALLOW_SYNC) |
2380 | ss_report_location(scan->rs_rd, blockno); |
2381 | |
2382 | if (blockno == hscan->rs_startblock) |
2383 | { |
2384 | blockno = InvalidBlockNumber; |
2385 | } |
2386 | } |
2387 | } |
2388 | |
2389 | if (!BlockNumberIsValid(blockno)) |
2390 | { |
2391 | if (BufferIsValid(hscan->rs_cbuf)) |
2392 | ReleaseBuffer(hscan->rs_cbuf); |
2393 | hscan->rs_cbuf = InvalidBuffer; |
2394 | hscan->rs_cblock = InvalidBlockNumber; |
2395 | hscan->rs_inited = false; |
2396 | |
2397 | return false; |
2398 | } |
2399 | |
2400 | heapgetpage(scan, blockno); |
2401 | hscan->rs_inited = true; |
2402 | |
2403 | return true; |
2404 | } |
2405 | |
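/*
 * Fetch the next tuple chosen by the tablesample method from the current
 * sample block, check its visibility, and store it in the given slot.
 * Returns false when the tablesample method has no more tuples to return
 * from this block.
 */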
2406 | static bool |
2407 | heapam_scan_sample_next_tuple(TableScanDesc scan, SampleScanState *scanstate, |
2408 | TupleTableSlot *slot) |
2409 | { |
2410 | HeapScanDesc hscan = (HeapScanDesc) scan; |
2411 | TsmRoutine *tsm = scanstate->tsmroutine; |
2412 | BlockNumber blockno = hscan->rs_cblock; |
2413 | bool pagemode = (scan->rs_flags & SO_ALLOW_PAGEMODE) != 0; |
2414 | |
2415 | Page page; |
2416 | bool all_visible; |
2417 | OffsetNumber maxoffset; |
2418 | |
2419 | /* |
2420 | * When not using pagemode, we must lock the buffer during tuple |
2421 | * visibility checks. |
2422 | */ |
2423 | if (!pagemode) |
2424 | LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE); |
2425 | |
2426 | page = (Page) BufferGetPage(hscan->rs_cbuf); |
2427 | all_visible = PageIsAllVisible(page) && |
2428 | !scan->rs_snapshot->takenDuringRecovery; |
2429 | maxoffset = PageGetMaxOffsetNumber(page); |
2430 | |
2431 | for (;;) |
2432 | { |
2433 | OffsetNumber tupoffset; |
2434 | |
2435 | CHECK_FOR_INTERRUPTS(); |
2436 | |
2437 | /* Ask the tablesample method which tuples to check on this page. */ |
2438 | tupoffset = tsm->NextSampleTuple(scanstate, |
2439 | blockno, |
2440 | maxoffset); |
2441 | |
2442 | if (OffsetNumberIsValid(tupoffset)) |
2443 | { |
2444 | ItemId itemid; |
2445 | bool visible; |
2446 | HeapTuple tuple = &(hscan->rs_ctup); |
2447 | |
2448 | /* Skip invalid tuple pointers. */ |
2449 | itemid = PageGetItemId(page, tupoffset); |
2450 | if (!ItemIdIsNormal(itemid)) |
2451 | continue; |
2452 | |
2453 | tuple->t_data = (HeapTupleHeader) PageGetItem(page, itemid); |
2454 | tuple->t_len = ItemIdGetLength(itemid); |
2455 | ItemPointerSet(&(tuple->t_self), blockno, tupoffset); |
2456 | |
2457 | |
2458 | if (all_visible) |
2459 | visible = true; |
2460 | else |
2461 | visible = SampleHeapTupleVisible(scan, hscan->rs_cbuf, |
2462 | tuple, tupoffset); |
2463 | |
2464 | /* in pagemode, heapgetpage did this for us */ |
2465 | if (!pagemode) |
2466 | CheckForSerializableConflictOut(visible, scan->rs_rd, tuple, |
2467 | hscan->rs_cbuf, scan->rs_snapshot); |
2468 | |
2469 | /* Try next tuple from same page. */ |
2470 | if (!visible) |
2471 | continue; |
2472 | |
2473 | /* Found visible tuple, return it. */ |
2474 | if (!pagemode) |
2475 | LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); |
2476 | |
2477 | ExecStoreBufferHeapTuple(tuple, slot, hscan->rs_cbuf); |
2478 | |
2479 | /* Count successfully-fetched tuples as heap fetches */ |
2480 | pgstat_count_heap_getnext(scan->rs_rd); |
2481 | |
2482 | return true; |
2483 | } |
2484 | else |
2485 | { |
2486 | /* |
2487 | * If we get here, it means we've exhausted the items on this page |
2488 | * and it's time to move to the next. |
2489 | */ |
2490 | if (!pagemode) |
2491 | LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); |
2492 | |
2493 | ExecClearTuple(slot); |
2494 | return false; |
2495 | } |
2496 | } |
2497 | |
2498 | Assert(0); |
2499 | } |
2500 | |
2501 | |
2502 | /* ---------------------------------------------------------------------------- |
2503 | * Helper functions for the above. |
2504 | * ---------------------------------------------------------------------------- |
2505 | */ |
2506 | |
2507 | /* |
2508 | * Reconstruct and rewrite the given tuple |
2509 | * |
2510 | * We cannot simply copy the tuple as-is, for several reasons: |
2511 | * |
2512 | * 1. We'd like to squeeze out the values of any dropped columns, both |
2513 | * to save space and to ensure we have no corner-case failures. (It's |
2514 | * possible for example that the new table hasn't got a TOAST table |
2515 | * and so is unable to store any large values of dropped cols.) |
2516 | * |
2517 | * 2. The tuple might not even be legal for the new table; this is |
2518 | * currently only known to happen as an after-effect of ALTER TABLE |
2519 | * SET WITHOUT OIDS. |
2520 | * |
2521 | * So, we must reconstruct the tuple from component Datums. |
2522 | */ |
2523 | static void |
2524 | reform_and_rewrite_tuple(HeapTuple tuple, |
2525 | Relation OldHeap, Relation NewHeap, |
2526 | Datum *values, bool *isnull, RewriteState rwstate) |
2527 | { |
2528 | TupleDesc oldTupDesc = RelationGetDescr(OldHeap); |
2529 | TupleDesc newTupDesc = RelationGetDescr(NewHeap); |
2530 | HeapTuple copiedTuple; |
2531 | int i; |
2532 | |
2533 | heap_deform_tuple(tuple, oldTupDesc, values, isnull); |
2534 | |
2535 | /* Be sure to null out any dropped columns */ |
2536 | for (i = 0; i < newTupDesc->natts; i++) |
2537 | { |
2538 | if (TupleDescAttr(newTupDesc, i)->attisdropped) |
2539 | isnull[i] = true; |
2540 | } |
2541 | |
2542 | copiedTuple = heap_form_tuple(newTupDesc, values, isnull); |
2543 | |
2544 | /* The heap rewrite module does the rest */ |
2545 | rewrite_heap_tuple(rwstate, tuple, copiedTuple); |
2546 | |
2547 | heap_freetuple(copiedTuple); |
2548 | } |
2549 | |
2550 | /* |
2551 | * Check visibility of the tuple. |
2552 | */ |
2553 | static bool |
2554 | SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer, |
2555 | HeapTuple tuple, |
2556 | OffsetNumber tupoffset) |
2557 | { |
2558 | HeapScanDesc hscan = (HeapScanDesc) scan; |
2559 | |
2560 | if (scan->rs_flags & SO_ALLOW_PAGEMODE) |
2561 | { |
2562 | /* |
		 * In page-at-a-time mode, heapgetpage() already did visibility checks,
2564 | * so just look at the info it left in rs_vistuples[]. |
2565 | * |
2566 | * We use a binary search over the known-sorted array. Note: we could |
2567 | * save some effort if we insisted that NextSampleTuple select tuples |
2568 | * in increasing order, but it's not clear that there would be enough |
2569 | * gain to justify the restriction. |
2570 | */ |
2571 | int start = 0, |
2572 | end = hscan->rs_ntuples - 1; |
2573 | |
2574 | while (start <= end) |
2575 | { |
2576 | int mid = (start + end) / 2; |
2577 | OffsetNumber curoffset = hscan->rs_vistuples[mid]; |
2578 | |
2579 | if (tupoffset == curoffset) |
2580 | return true; |
2581 | else if (tupoffset < curoffset) |
2582 | end = mid - 1; |
2583 | else |
2584 | start = mid + 1; |
2585 | } |
2586 | |
2587 | return false; |
2588 | } |
2589 | else |
2590 | { |
2591 | /* Otherwise, we have to check the tuple individually. */ |
2592 | return HeapTupleSatisfiesVisibility(tuple, scan->rs_snapshot, |
2593 | buffer); |
2594 | } |
2595 | } |
2596 | |
2597 | |
2598 | /* ------------------------------------------------------------------------ |
2599 | * Definition of the heap table access method. |
2600 | * ------------------------------------------------------------------------ |
2601 | */ |
2602 | |
2603 | static const TableAmRoutine heapam_methods = { |
2604 | .type = T_TableAmRoutine, |
2605 | |
2606 | .slot_callbacks = heapam_slot_callbacks, |
2607 | |
2608 | .scan_begin = heap_beginscan, |
2609 | .scan_end = heap_endscan, |
2610 | .scan_rescan = heap_rescan, |
2611 | .scan_getnextslot = heap_getnextslot, |
2612 | |
2613 | .parallelscan_estimate = table_block_parallelscan_estimate, |
2614 | .parallelscan_initialize = table_block_parallelscan_initialize, |
2615 | .parallelscan_reinitialize = table_block_parallelscan_reinitialize, |
2616 | |
2617 | .index_fetch_begin = heapam_index_fetch_begin, |
2618 | .index_fetch_reset = heapam_index_fetch_reset, |
2619 | .index_fetch_end = heapam_index_fetch_end, |
2620 | .index_fetch_tuple = heapam_index_fetch_tuple, |
2621 | |
2622 | .tuple_insert = heapam_tuple_insert, |
2623 | .tuple_insert_speculative = heapam_tuple_insert_speculative, |
2624 | .tuple_complete_speculative = heapam_tuple_complete_speculative, |
2625 | .multi_insert = heap_multi_insert, |
2626 | .tuple_delete = heapam_tuple_delete, |
2627 | .tuple_update = heapam_tuple_update, |
2628 | .tuple_lock = heapam_tuple_lock, |
2629 | .finish_bulk_insert = heapam_finish_bulk_insert, |
2630 | |
2631 | .tuple_fetch_row_version = heapam_fetch_row_version, |
2632 | .tuple_get_latest_tid = heap_get_latest_tid, |
2633 | .tuple_tid_valid = heapam_tuple_tid_valid, |
2634 | .tuple_satisfies_snapshot = heapam_tuple_satisfies_snapshot, |
2635 | .compute_xid_horizon_for_tuples = heap_compute_xid_horizon_for_tuples, |
2636 | |
2637 | .relation_set_new_filenode = heapam_relation_set_new_filenode, |
2638 | .relation_nontransactional_truncate = heapam_relation_nontransactional_truncate, |
2639 | .relation_copy_data = heapam_relation_copy_data, |
2640 | .relation_copy_for_cluster = heapam_relation_copy_for_cluster, |
2641 | .relation_vacuum = heap_vacuum_rel, |
2642 | .scan_analyze_next_block = heapam_scan_analyze_next_block, |
2643 | .scan_analyze_next_tuple = heapam_scan_analyze_next_tuple, |
2644 | .index_build_range_scan = heapam_index_build_range_scan, |
2645 | .index_validate_scan = heapam_index_validate_scan, |
2646 | |
2647 | .relation_size = heapam_relation_size, |
2648 | .relation_needs_toast_table = heapam_relation_needs_toast_table, |
2649 | |
2650 | .relation_estimate_size = heapam_estimate_rel_size, |
2651 | |
2652 | .scan_bitmap_next_block = heapam_scan_bitmap_next_block, |
2653 | .scan_bitmap_next_tuple = heapam_scan_bitmap_next_tuple, |
2654 | .scan_sample_next_block = heapam_scan_sample_next_block, |
2655 | .scan_sample_next_tuple = heapam_scan_sample_next_tuple |
2656 | }; |
2657 | |
2658 | |
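/*
 * Return a pointer to the heap table access method's TableAmRoutine, for
 * code that needs it directly rather than via the pg_am handler lookup.
 */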
2659 | const TableAmRoutine * |
2660 | GetHeapamTableAmRoutine(void) |
2661 | { |
2662 | return &heapam_methods; |
2663 | } |
2664 | |
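/*
 * Handler function for the heap table access method; this is what the
 * "heap" row in pg_am points at.  Tables are directed at this AM either via
 * default_table_access_method or explicitly, for example:
 *
 *		CREATE TABLE t (id int) USING heap;
 */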
2665 | Datum |
2666 | heap_tableam_handler(PG_FUNCTION_ARGS) |
2667 | { |
2668 | PG_RETURN_POINTER(&heapam_methods); |
2669 | } |
2670 | |