1/*-------------------------------------------------------------------------
2 *
3 * heapam_handler.c
4 * heap table access method code
5 *
6 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 *
10 * IDENTIFICATION
11 * src/backend/access/heap/heapam_handler.c
12 *
13 *
14 * NOTES
 *	  This file wires up the lower level heapam.c et al routines with the
16 * tableam abstraction.
17 *
18 *-------------------------------------------------------------------------
19 */
20#include "postgres.h"
21
22#include <math.h>
23
24#include "miscadmin.h"
25
26#include "access/genam.h"
27#include "access/heapam.h"
28#include "access/multixact.h"
29#include "access/rewriteheap.h"
30#include "access/tableam.h"
31#include "access/tsmapi.h"
32#include "access/tuptoaster.h"
33#include "access/xact.h"
34#include "catalog/catalog.h"
35#include "catalog/index.h"
36#include "catalog/storage.h"
37#include "catalog/storage_xlog.h"
38#include "commands/progress.h"
39#include "executor/executor.h"
40#include "optimizer/plancat.h"
41#include "pgstat.h"
#include "storage/bufmgr.h"
#include "storage/bufpage.h"
45#include "storage/lmgr.h"
46#include "storage/predicate.h"
47#include "storage/procarray.h"
48#include "storage/smgr.h"
49#include "utils/builtins.h"
50#include "utils/rel.h"
51
52
53static void reform_and_rewrite_tuple(HeapTuple tuple,
54 Relation OldHeap, Relation NewHeap,
55 Datum *values, bool *isnull, RewriteState rwstate);
56
57static bool SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer,
58 HeapTuple tuple,
59 OffsetNumber tupoffset);
60
61static BlockNumber heapam_scan_get_blocks_done(HeapScanDesc hscan);
62
63static const TableAmRoutine heapam_methods;
64
65
66/* ------------------------------------------------------------------------
67 * Slot related callbacks for heap AM
68 * ------------------------------------------------------------------------
69 */
70
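/*
 * Return the slot implementation used by the heap AM.  Heap always uses
 * buffer heap tuple slots, which can reference a tuple pinned in a shared
 * buffer without copying it.
 */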
71static const TupleTableSlotOps *
72heapam_slot_callbacks(Relation relation)
73{
74 return &TTSOpsBufferHeapTuple;
75}
76
77
78/* ------------------------------------------------------------------------
79 * Index Scan Callbacks for heap AM
80 * ------------------------------------------------------------------------
81 */
82
83static IndexFetchTableData *
84heapam_index_fetch_begin(Relation rel)
85{
86 IndexFetchHeapData *hscan = palloc0(sizeof(IndexFetchHeapData));
87
88 hscan->xs_base.rel = rel;
89 hscan->xs_cbuf = InvalidBuffer;
90
91 return &hscan->xs_base;
92}
93
94static void
95heapam_index_fetch_reset(IndexFetchTableData *scan)
96{
97 IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan;
98
99 if (BufferIsValid(hscan->xs_cbuf))
100 {
101 ReleaseBuffer(hscan->xs_cbuf);
102 hscan->xs_cbuf = InvalidBuffer;
103 }
104}
105
106static void
107heapam_index_fetch_end(IndexFetchTableData *scan)
108{
109 IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan;
110
111 heapam_index_fetch_reset(scan);
112
113 pfree(hscan);
114}
115
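/*
 * Fetch the heap tuple at the given TID into the slot, if it is visible to
 * the snapshot.  HOT chains are followed via heap_hot_search_buffer();
 * *call_again is set when another member of the chain may still be visible
 * (possible only with non-MVCC snapshots), and *all_dead reports whether
 * the whole chain is known dead, allowing the caller to mark the index
 * entry as killed.
 */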
116static bool
117heapam_index_fetch_tuple(struct IndexFetchTableData *scan,
118 ItemPointer tid,
119 Snapshot snapshot,
120 TupleTableSlot *slot,
121 bool *call_again, bool *all_dead)
122{
123 IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan;
124 BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
125 bool got_heap_tuple;
126
127 Assert(TTS_IS_BUFFERTUPLE(slot));
128
129 /* We can skip the buffer-switching logic if we're in mid-HOT chain. */
130 if (!*call_again)
131 {
132 /* Switch to correct buffer if we don't have it already */
133 Buffer prev_buf = hscan->xs_cbuf;
134
135 hscan->xs_cbuf = ReleaseAndReadBuffer(hscan->xs_cbuf,
136 hscan->xs_base.rel,
137 ItemPointerGetBlockNumber(tid));
138
139 /*
140 * Prune page, but only if we weren't already on this page
141 */
142 if (prev_buf != hscan->xs_cbuf)
143 heap_page_prune_opt(hscan->xs_base.rel, hscan->xs_cbuf);
144 }
145
146 /* Obtain share-lock on the buffer so we can examine visibility */
147 LockBuffer(hscan->xs_cbuf, BUFFER_LOCK_SHARE);
148 got_heap_tuple = heap_hot_search_buffer(tid,
149 hscan->xs_base.rel,
150 hscan->xs_cbuf,
151 snapshot,
152 &bslot->base.tupdata,
153 all_dead,
154 !*call_again);
155 bslot->base.tupdata.t_self = *tid;
156 LockBuffer(hscan->xs_cbuf, BUFFER_LOCK_UNLOCK);
157
158 if (got_heap_tuple)
159 {
160 /*
161 * Only in a non-MVCC snapshot can more than one member of the HOT
162 * chain be visible.
163 */
164 *call_again = !IsMVCCSnapshot(snapshot);
165
166 slot->tts_tableOid = RelationGetRelid(scan->rel);
167 ExecStoreBufferHeapTuple(&bslot->base.tupdata, slot, hscan->xs_cbuf);
168 }
169 else
170 {
171 /* We've reached the end of the HOT chain. */
172 *call_again = false;
173 }
174
175 return got_heap_tuple;
176}
177
178
179/* ------------------------------------------------------------------------
180 * Callbacks for non-modifying operations on individual tuples for heap AM
181 * ------------------------------------------------------------------------
182 */
183
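/*
 * Fetch the specified tuple version into the slot, if it is visible to the
 * given snapshot.  On success the buffer pin acquired by heap_fetch() is
 * transferred to the slot.
 */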
184static bool
185heapam_fetch_row_version(Relation relation,
186 ItemPointer tid,
187 Snapshot snapshot,
188 TupleTableSlot *slot)
189{
190 BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
191 Buffer buffer;
192
193 Assert(TTS_IS_BUFFERTUPLE(slot));
194
195 bslot->base.tupdata.t_self = *tid;
196 if (heap_fetch(relation, snapshot, &bslot->base.tupdata, &buffer))
197 {
198 /* store in slot, transferring existing pin */
199 ExecStorePinnedBufferHeapTuple(&bslot->base.tupdata, slot, buffer);
200 slot->tts_tableOid = RelationGetRelid(relation);
201
202 return true;
203 }
204
205 return false;
206}
207
208static bool
209heapam_tuple_tid_valid(TableScanDesc scan, ItemPointer tid)
210{
211 HeapScanDesc hscan = (HeapScanDesc) scan;
212
213 return ItemPointerIsValid(tid) &&
214 ItemPointerGetBlockNumber(tid) < hscan->rs_nblocks;
215}
216
217static bool
218heapam_tuple_satisfies_snapshot(Relation rel, TupleTableSlot *slot,
219 Snapshot snapshot)
220{
221 BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
222 bool res;
223
224 Assert(TTS_IS_BUFFERTUPLE(slot));
225 Assert(BufferIsValid(bslot->buffer));
226
227 /*
228 * We need buffer pin and lock to call HeapTupleSatisfiesVisibility.
229 * Caller should be holding pin, but not lock.
230 */
231 LockBuffer(bslot->buffer, BUFFER_LOCK_SHARE);
232 res = HeapTupleSatisfiesVisibility(bslot->base.tuple, snapshot,
233 bslot->buffer);
234 LockBuffer(bslot->buffer, BUFFER_LOCK_UNLOCK);
235
236 return res;
237}
238
239
240/* ----------------------------------------------------------------------------
241 * Functions for manipulations of physical tuples for heap AM.
242 * ----------------------------------------------------------------------------
243 */
244
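/*
 * Insert the tuple contained in the slot via heap_insert(), and copy the
 * resulting item pointer back into slot->tts_tid.
 */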
245static void
246heapam_tuple_insert(Relation relation, TupleTableSlot *slot, CommandId cid,
247 int options, BulkInsertState bistate)
248{
249 bool shouldFree = true;
250 HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree);
251
252 /* Update the tuple with table oid */
253 slot->tts_tableOid = RelationGetRelid(relation);
254 tuple->t_tableOid = slot->tts_tableOid;
255
256 /* Perform the insertion, and copy the resulting ItemPointer */
257 heap_insert(relation, tuple, cid, options, bistate);
258 ItemPointerCopy(&tuple->t_self, &slot->tts_tid);
259
260 if (shouldFree)
261 pfree(tuple);
262}
263
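/*
 * As heapam_tuple_insert(), but mark the tuple header with the given
 * speculative-insertion token first.  Used for INSERT ... ON CONFLICT,
 * where the insertion is later completed or aborted.
 */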
264static void
265heapam_tuple_insert_speculative(Relation relation, TupleTableSlot *slot,
266 CommandId cid, int options,
267 BulkInsertState bistate, uint32 specToken)
268{
269 bool shouldFree = true;
270 HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree);
271
272 /* Update the tuple with table oid */
273 slot->tts_tableOid = RelationGetRelid(relation);
274 tuple->t_tableOid = slot->tts_tableOid;
275
276 HeapTupleHeaderSetSpeculativeToken(tuple->t_data, specToken);
277 options |= HEAP_INSERT_SPECULATIVE;
278
279 /* Perform the insertion, and copy the resulting ItemPointer */
280 heap_insert(relation, tuple, cid, options, bistate);
281 ItemPointerCopy(&tuple->t_self, &slot->tts_tid);
282
283 if (shouldFree)
284 pfree(tuple);
285}
286
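/*
 * Complete a speculative insertion started by
 * heapam_tuple_insert_speculative(): either make the tuple permanent or
 * kill it, depending on whether the insertion succeeded.
 */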
287static void
288heapam_tuple_complete_speculative(Relation relation, TupleTableSlot *slot,
289 uint32 specToken, bool succeeded)
290{
291 bool shouldFree = true;
292 HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree);
293
294 /* adjust the tuple's state accordingly */
295 if (succeeded)
296 heap_finish_speculative(relation, &slot->tts_tid);
297 else
298 heap_abort_speculative(relation, &slot->tts_tid);
299
300 if (shouldFree)
301 pfree(tuple);
302}
303
304static TM_Result
305heapam_tuple_delete(Relation relation, ItemPointer tid, CommandId cid,
306 Snapshot snapshot, Snapshot crosscheck, bool wait,
307 TM_FailureData *tmfd, bool changingPart)
308{
	/*
	 * Currently, deleting index tuples is left to VACUUM.  If a table AM
	 * were to clean up dead tuples itself, this would also be the place to
	 * remove the corresponding index entries.
	 */
314 return heap_delete(relation, tid, cid, crosscheck, wait, tmfd, changingPart);
315}
316
317
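/*
 * Update the tuple identified by otid with the contents of the slot.  The
 * new version's TID is returned in slot->tts_tid, and *update_indexes tells
 * the caller whether new index entries are required (they are not for HOT
 * updates or failed updates).
 */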
318static TM_Result
319heapam_tuple_update(Relation relation, ItemPointer otid, TupleTableSlot *slot,
320 CommandId cid, Snapshot snapshot, Snapshot crosscheck,
321 bool wait, TM_FailureData *tmfd,
322 LockTupleMode *lockmode, bool *update_indexes)
323{
324 bool shouldFree = true;
325 HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree);
326 TM_Result result;
327
328 /* Update the tuple with table oid */
329 slot->tts_tableOid = RelationGetRelid(relation);
330 tuple->t_tableOid = slot->tts_tableOid;
331
332 result = heap_update(relation, otid, tuple, cid, crosscheck, wait,
333 tmfd, lockmode);
334 ItemPointerCopy(&tuple->t_self, &slot->tts_tid);
335
336 /*
337 * Decide whether new index entries are needed for the tuple
338 *
339 * Note: heap_update returns the tid (location) of the new tuple in the
340 * t_self field.
341 *
342 * If it's a HOT update, we mustn't insert new index entries.
343 */
344 *update_indexes = result == TM_Ok && !HeapTupleIsHeapOnly(tuple);
345
346 if (shouldFree)
347 pfree(tuple);
348
349 return result;
350}
351
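/*
 * Lock the tuple identified by tid.  If TUPLE_LOCK_FLAG_FIND_LAST_VERSION
 * is set and the tuple turns out to have been concurrently updated, the
 * update chain is followed and the latest visible version is locked
 * instead.  On success the locked tuple is stored in the slot, transferring
 * the buffer pin obtained from heap_lock_tuple().
 */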
352static TM_Result
353heapam_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot,
354 TupleTableSlot *slot, CommandId cid, LockTupleMode mode,
355 LockWaitPolicy wait_policy, uint8 flags,
356 TM_FailureData *tmfd)
357{
358 BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
359 TM_Result result;
360 Buffer buffer;
361 HeapTuple tuple = &bslot->base.tupdata;
362 bool follow_updates;
363
364 follow_updates = (flags & TUPLE_LOCK_FLAG_LOCK_UPDATE_IN_PROGRESS) != 0;
365 tmfd->traversed = false;
366
367 Assert(TTS_IS_BUFFERTUPLE(slot));
368
369tuple_lock_retry:
370 tuple->t_self = *tid;
371 result = heap_lock_tuple(relation, tuple, cid, mode, wait_policy,
372 follow_updates, &buffer, tmfd);
373
374 if (result == TM_Updated &&
375 (flags & TUPLE_LOCK_FLAG_FIND_LAST_VERSION))
376 {
377 ReleaseBuffer(buffer);
378 /* Should not encounter speculative tuple on recheck */
379 Assert(!HeapTupleHeaderIsSpeculative(tuple->t_data));
380
381 if (!ItemPointerEquals(&tmfd->ctid, &tuple->t_self))
382 {
383 SnapshotData SnapshotDirty;
384 TransactionId priorXmax;
385
386 /* it was updated, so look at the updated version */
387 *tid = tmfd->ctid;
388 /* updated row should have xmin matching this xmax */
389 priorXmax = tmfd->xmax;
390
391 /* signal that a tuple later in the chain is getting locked */
392 tmfd->traversed = true;
393
394 /*
395 * fetch target tuple
396 *
397 * Loop here to deal with updated or busy tuples
398 */
399 InitDirtySnapshot(SnapshotDirty);
400 for (;;)
401 {
402 if (ItemPointerIndicatesMovedPartitions(tid))
403 ereport(ERROR,
404 (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
405 errmsg("tuple to be locked was already moved to another partition due to concurrent update")));
406
407 tuple->t_self = *tid;
408 if (heap_fetch(relation, &SnapshotDirty, tuple, &buffer))
409 {
410 /*
411 * If xmin isn't what we're expecting, the slot must have
412 * been recycled and reused for an unrelated tuple. This
413 * implies that the latest version of the row was deleted,
414 * so we need do nothing. (Should be safe to examine xmin
415 * without getting buffer's content lock. We assume
416 * reading a TransactionId to be atomic, and Xmin never
417 * changes in an existing tuple, except to invalid or
418 * frozen, and neither of those can match priorXmax.)
419 */
420 if (!TransactionIdEquals(HeapTupleHeaderGetXmin(tuple->t_data),
421 priorXmax))
422 {
423 ReleaseBuffer(buffer);
424 return TM_Deleted;
425 }
426
427 /* otherwise xmin should not be dirty... */
428 if (TransactionIdIsValid(SnapshotDirty.xmin))
429 elog(ERROR, "t_xmin is uncommitted in tuple to be updated");
430
					/*
					 * If the tuple is being updated by another transaction,
					 * we have to wait for its commit/abort, or die trying.
					 */
435 if (TransactionIdIsValid(SnapshotDirty.xmax))
436 {
437 ReleaseBuffer(buffer);
438 switch (wait_policy)
439 {
440 case LockWaitBlock:
441 XactLockTableWait(SnapshotDirty.xmax,
442 relation, &tuple->t_self,
443 XLTW_FetchUpdated);
444 break;
445 case LockWaitSkip:
446 if (!ConditionalXactLockTableWait(SnapshotDirty.xmax))
447 /* skip instead of waiting */
448 return TM_WouldBlock;
449 break;
450 case LockWaitError:
451 if (!ConditionalXactLockTableWait(SnapshotDirty.xmax))
452 ereport(ERROR,
453 (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
454 errmsg("could not obtain lock on row in relation \"%s\"",
455 RelationGetRelationName(relation))));
456 break;
457 }
458 continue; /* loop back to repeat heap_fetch */
459 }
460
461 /*
462 * If tuple was inserted by our own transaction, we have
463 * to check cmin against cid: cmin >= current CID means
464 * our command cannot see the tuple, so we should ignore
465 * it. Otherwise heap_lock_tuple() will throw an error,
466 * and so would any later attempt to update or delete the
467 * tuple. (We need not check cmax because
468 * HeapTupleSatisfiesDirty will consider a tuple deleted
469 * by our transaction dead, regardless of cmax.) We just
470 * checked that priorXmax == xmin, so we can test that
471 * variable instead of doing HeapTupleHeaderGetXmin again.
472 */
473 if (TransactionIdIsCurrentTransactionId(priorXmax) &&
474 HeapTupleHeaderGetCmin(tuple->t_data) >= cid)
475 {
476 tmfd->xmax = priorXmax;
477
478 /*
479 * Cmin is the problematic value, so store that. See
480 * above.
481 */
482 tmfd->cmax = HeapTupleHeaderGetCmin(tuple->t_data);
483 ReleaseBuffer(buffer);
484 return TM_SelfModified;
485 }
486
487 /*
488 * This is a live tuple, so try to lock it again.
489 */
490 ReleaseBuffer(buffer);
491 goto tuple_lock_retry;
492 }
493
494 /*
495 * If the referenced slot was actually empty, the latest
496 * version of the row must have been deleted, so we need do
497 * nothing.
498 */
499 if (tuple->t_data == NULL)
500 {
501 return TM_Deleted;
502 }
503
504 /*
505 * As above, if xmin isn't what we're expecting, do nothing.
506 */
507 if (!TransactionIdEquals(HeapTupleHeaderGetXmin(tuple->t_data),
508 priorXmax))
509 {
510 if (BufferIsValid(buffer))
511 ReleaseBuffer(buffer);
512 return TM_Deleted;
513 }
514
515 /*
516 * If we get here, the tuple was found but failed
517 * SnapshotDirty. Assuming the xmin is either a committed xact
518 * or our own xact (as it certainly should be if we're trying
519 * to modify the tuple), this must mean that the row was
520 * updated or deleted by either a committed xact or our own
521 * xact. If it was deleted, we can ignore it; if it was
522 * updated then chain up to the next version and repeat the
523 * whole process.
524 *
525 * As above, it should be safe to examine xmax and t_ctid
526 * without the buffer content lock, because they can't be
527 * changing.
528 */
529 if (ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid))
530 {
531 /* deleted, so forget about it */
532 if (BufferIsValid(buffer))
533 ReleaseBuffer(buffer);
534 return TM_Deleted;
535 }
536
537 /* updated, so look at the updated row */
538 *tid = tuple->t_data->t_ctid;
539 /* updated row should have xmin matching this xmax */
540 priorXmax = HeapTupleHeaderGetUpdateXid(tuple->t_data);
541 if (BufferIsValid(buffer))
542 ReleaseBuffer(buffer);
543 /* loop back to fetch next in chain */
544 }
545 }
546 else
547 {
548 /* tuple was deleted, so give up */
549 return TM_Deleted;
550 }
551 }
552
553 slot->tts_tableOid = RelationGetRelid(relation);
554 tuple->t_tableOid = slot->tts_tableOid;
555
556 /* store in slot, transferring existing pin */
557 ExecStorePinnedBufferHeapTuple(tuple, slot, buffer);
558
559 return result;
560}
561
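/*
 * Callback invoked at the end of a bulk insertion (e.g. COPY): if WAL
 * logging was skipped for the insertions, the heap must be synced to disk
 * here.
 */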
562static void
563heapam_finish_bulk_insert(Relation relation, int options)
564{
565 /*
566 * If we skipped writing WAL, then we need to sync the heap (but not
567 * indexes since those use WAL anyway / don't go through tableam)
568 */
569 if (options & HEAP_INSERT_SKIP_WAL)
570 heap_sync(relation);
571}
572
573
574/* ------------------------------------------------------------------------
575 * DDL related callbacks for heap AM.
576 * ------------------------------------------------------------------------
577 */
578
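/*
 * Create storage for a new or replacement relfilenode, and report the
 * initial XID and multixact horizons for the new heap.  For unlogged
 * relations an init fork is also created, WAL-logged, and synced so the
 * relation can be reinitialized correctly after a crash.
 */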
579static void
580heapam_relation_set_new_filenode(Relation rel,
581 const RelFileNode *newrnode,
582 char persistence,
583 TransactionId *freezeXid,
584 MultiXactId *minmulti)
585{
586 SMgrRelation srel;
587
588 /*
589 * Initialize to the minimum XID that could put tuples in the table. We
590 * know that no xacts older than RecentXmin are still running, so that
591 * will do.
592 */
593 *freezeXid = RecentXmin;
594
595 /*
596 * Similarly, initialize the minimum Multixact to the first value that
597 * could possibly be stored in tuples in the table. Running transactions
598 * could reuse values from their local cache, so we are careful to
599 * consider all currently running multis.
600 *
601 * XXX this could be refined further, but is it worth the hassle?
602 */
603 *minmulti = GetOldestMultiXactId();
604
605 srel = RelationCreateStorage(*newrnode, persistence);
606
607 /*
608 * If required, set up an init fork for an unlogged table so that it can
609 * be correctly reinitialized on restart. An immediate sync is required
610 * even if the page has been logged, because the write did not go through
611 * shared_buffers and therefore a concurrent checkpoint may have moved the
	 * redo pointer past our xlog record.  Recovery might also remove it
	 * while replaying an XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE record,
	 * for example.  Therefore, logging is necessary even if wal_level=minimal.
615 */
616 if (persistence == RELPERSISTENCE_UNLOGGED)
617 {
618 Assert(rel->rd_rel->relkind == RELKIND_RELATION ||
619 rel->rd_rel->relkind == RELKIND_MATVIEW ||
620 rel->rd_rel->relkind == RELKIND_TOASTVALUE);
621 smgrcreate(srel, INIT_FORKNUM, false);
622 log_smgrcreate(newrnode, INIT_FORKNUM);
623 smgrimmedsync(srel, INIT_FORKNUM);
624 }
625
626 smgrclose(srel);
627}
628
629static void
630heapam_relation_nontransactional_truncate(Relation rel)
631{
632 RelationTruncate(rel, 0);
633}
634
635static void
636heapam_relation_copy_data(Relation rel, const RelFileNode *newrnode)
637{
638 SMgrRelation dstrel;
639
640 dstrel = smgropen(*newrnode, rel->rd_backend);
641 RelationOpenSmgr(rel);
642
643 /*
644 * Since we copy the file directly without looking at the shared buffers,
645 * we'd better first flush out any pages of the source relation that are
646 * in shared buffers. We assume no new changes will be made while we are
647 * holding exclusive lock on the rel.
648 */
649 FlushRelationBuffers(rel);
650
651 /*
652 * Create and copy all forks of the relation, and schedule unlinking of
653 * old physical files.
654 *
655 * NOTE: any conflict in relfilenode value will be caught in
656 * RelationCreateStorage().
657 */
658 RelationCreateStorage(*newrnode, rel->rd_rel->relpersistence);
659
660 /* copy main fork */
661 RelationCopyStorage(rel->rd_smgr, dstrel, MAIN_FORKNUM,
662 rel->rd_rel->relpersistence);
663
664 /* copy those extra forks that exist */
665 for (ForkNumber forkNum = MAIN_FORKNUM + 1;
666 forkNum <= MAX_FORKNUM; forkNum++)
667 {
668 if (smgrexists(rel->rd_smgr, forkNum))
669 {
670 smgrcreate(dstrel, forkNum, false);
671
672 /*
673 * WAL log creation if the relation is persistent, or this is the
674 * init fork of an unlogged relation.
675 */
676 if (rel->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT ||
677 (rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED &&
678 forkNum == INIT_FORKNUM))
679 log_smgrcreate(newrnode, forkNum);
680 RelationCopyStorage(rel->rd_smgr, dstrel, forkNum,
681 rel->rd_rel->relpersistence);
682 }
683 }
684
685
686 /* drop old relation, and close new one */
687 RelationDropStorage(rel);
688 smgrclose(dstrel);
689}
690
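/*
 * Copy the contents of OldHeap into NewHeap for CLUSTER or VACUUM FULL,
 * either by scanning OldIndex, by sorting the tuples with a tuplesort, or
 * by a plain sequential scan.  Dead tuples are not copied, and the live,
 * vacuumed and recently-dead counts are returned to the caller.
 */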
691static void
692heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
693 Relation OldIndex, bool use_sort,
694 TransactionId OldestXmin,
695 TransactionId *xid_cutoff,
696 MultiXactId *multi_cutoff,
697 double *num_tuples,
698 double *tups_vacuumed,
699 double *tups_recently_dead)
700{
701 RewriteState rwstate;
702 IndexScanDesc indexScan;
703 TableScanDesc tableScan;
704 HeapScanDesc heapScan;
705 bool use_wal;
706 bool is_system_catalog;
707 Tuplesortstate *tuplesort;
708 TupleDesc oldTupDesc = RelationGetDescr(OldHeap);
709 TupleDesc newTupDesc = RelationGetDescr(NewHeap);
710 TupleTableSlot *slot;
711 int natts;
712 Datum *values;
713 bool *isnull;
714 BufferHeapTupleTableSlot *hslot;
715
716 /* Remember if it's a system catalog */
717 is_system_catalog = IsSystemRelation(OldHeap);
718
719 /*
720 * We need to log the copied data in WAL iff WAL archiving/streaming is
721 * enabled AND it's a WAL-logged rel.
722 */
723 use_wal = XLogIsNeeded() && RelationNeedsWAL(NewHeap);
724
725 /* use_wal off requires smgr_targblock be initially invalid */
726 Assert(RelationGetTargetBlock(NewHeap) == InvalidBlockNumber);
727
728 /* Preallocate values/isnull arrays */
729 natts = newTupDesc->natts;
730 values = (Datum *) palloc(natts * sizeof(Datum));
731 isnull = (bool *) palloc(natts * sizeof(bool));
732
733 /* Initialize the rewrite operation */
734 rwstate = begin_heap_rewrite(OldHeap, NewHeap, OldestXmin, *xid_cutoff,
735 *multi_cutoff, use_wal);
736
737
738 /* Set up sorting if wanted */
739 if (use_sort)
740 tuplesort = tuplesort_begin_cluster(oldTupDesc, OldIndex,
741 maintenance_work_mem,
742 NULL, false);
743 else
744 tuplesort = NULL;
745
746 /*
747 * Prepare to scan the OldHeap. To ensure we see recently-dead tuples
748 * that still need to be copied, we scan with SnapshotAny and use
749 * HeapTupleSatisfiesVacuum for the visibility test.
750 */
751 if (OldIndex != NULL && !use_sort)
752 {
753 const int ci_index[] = {
754 PROGRESS_CLUSTER_PHASE,
755 PROGRESS_CLUSTER_INDEX_RELID
756 };
757 int64 ci_val[2];
758
		/* Set the phase and the old index's OID in the progress columns */
760 ci_val[0] = PROGRESS_CLUSTER_PHASE_INDEX_SCAN_HEAP;
761 ci_val[1] = RelationGetRelid(OldIndex);
762 pgstat_progress_update_multi_param(2, ci_index, ci_val);
763
764 tableScan = NULL;
765 heapScan = NULL;
766 indexScan = index_beginscan(OldHeap, OldIndex, SnapshotAny, 0, 0);
767 index_rescan(indexScan, NULL, 0, NULL, 0);
768 }
769 else
770 {
771 /* In scan-and-sort mode and also VACUUM FULL, set phase */
772 pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
773 PROGRESS_CLUSTER_PHASE_SEQ_SCAN_HEAP);
774
775 tableScan = table_beginscan(OldHeap, SnapshotAny, 0, (ScanKey) NULL);
776 heapScan = (HeapScanDesc) tableScan;
777 indexScan = NULL;
778
779 /* Set total heap blocks */
780 pgstat_progress_update_param(PROGRESS_CLUSTER_TOTAL_HEAP_BLKS,
781 heapScan->rs_nblocks);
782 }
783
784 slot = table_slot_create(OldHeap, NULL);
785 hslot = (BufferHeapTupleTableSlot *) slot;
786
787 /*
788 * Scan through the OldHeap, either in OldIndex order or sequentially;
789 * copy each tuple into the NewHeap, or transiently to the tuplesort
790 * module. Note that we don't bother sorting dead tuples (they won't get
791 * to the new table anyway).
792 */
793 for (;;)
794 {
795 HeapTuple tuple;
796 Buffer buf;
797 bool isdead;
798
799 CHECK_FOR_INTERRUPTS();
800
801 if (indexScan != NULL)
802 {
803 if (!index_getnext_slot(indexScan, ForwardScanDirection, slot))
804 break;
805
806 /* Since we used no scan keys, should never need to recheck */
807 if (indexScan->xs_recheck)
808 elog(ERROR, "CLUSTER does not support lossy index conditions");
809 }
810 else
811 {
812 if (!table_scan_getnextslot(tableScan, ForwardScanDirection, slot))
813 break;
814
815 /*
816 * In scan-and-sort mode and also VACUUM FULL, set heap blocks
817 * scanned
818 */
819 pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_BLKS_SCANNED,
820 heapScan->rs_cblock + 1);
821 }
822
823 tuple = ExecFetchSlotHeapTuple(slot, false, NULL);
824 buf = hslot->buffer;
825
826 LockBuffer(buf, BUFFER_LOCK_SHARE);
827
828 switch (HeapTupleSatisfiesVacuum(tuple, OldestXmin, buf))
829 {
830 case HEAPTUPLE_DEAD:
831 /* Definitely dead */
832 isdead = true;
833 break;
834 case HEAPTUPLE_RECENTLY_DEAD:
835 *tups_recently_dead += 1;
836 /* fall through */
837 case HEAPTUPLE_LIVE:
838 /* Live or recently dead, must copy it */
839 isdead = false;
840 break;
841 case HEAPTUPLE_INSERT_IN_PROGRESS:
842
843 /*
844 * Since we hold exclusive lock on the relation, normally the
845 * only way to see this is if it was inserted earlier in our
846 * own transaction. However, it can happen in system
847 * catalogs, since we tend to release write lock before commit
848 * there. Give a warning if neither case applies; but in any
849 * case we had better copy it.
850 */
851 if (!is_system_catalog &&
852 !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(tuple->t_data)))
853 elog(WARNING, "concurrent insert in progress within table \"%s\"",
854 RelationGetRelationName(OldHeap));
855 /* treat as live */
856 isdead = false;
857 break;
858 case HEAPTUPLE_DELETE_IN_PROGRESS:
859
860 /*
861 * Similar situation to INSERT_IN_PROGRESS case.
862 */
863 if (!is_system_catalog &&
864 !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(tuple->t_data)))
865 elog(WARNING, "concurrent delete in progress within table \"%s\"",
866 RelationGetRelationName(OldHeap));
867 /* treat as recently dead */
868 *tups_recently_dead += 1;
869 isdead = false;
870 break;
871 default:
872 elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
873 isdead = false; /* keep compiler quiet */
874 break;
875 }
876
877 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
878
879 if (isdead)
880 {
881 *tups_vacuumed += 1;
882 /* heap rewrite module still needs to see it... */
883 if (rewrite_heap_dead_tuple(rwstate, tuple))
884 {
885 /* A previous recently-dead tuple is now known dead */
886 *tups_vacuumed += 1;
887 *tups_recently_dead -= 1;
888 }
889 continue;
890 }
891
892 *num_tuples += 1;
893 if (tuplesort != NULL)
894 {
895 tuplesort_putheaptuple(tuplesort, tuple);
896
897 /*
898 * In scan-and-sort mode, report increase in number of tuples
899 * scanned
900 */
901 pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_TUPLES_SCANNED,
902 *num_tuples);
903 }
904 else
905 {
906 const int ct_index[] = {
907 PROGRESS_CLUSTER_HEAP_TUPLES_SCANNED,
908 PROGRESS_CLUSTER_HEAP_TUPLES_WRITTEN
909 };
910 int64 ct_val[2];
911
912 reform_and_rewrite_tuple(tuple, OldHeap, NewHeap,
913 values, isnull, rwstate);
914
915 /*
916 * In indexscan mode and also VACUUM FULL, report increase in
917 * number of tuples scanned and written
918 */
919 ct_val[0] = *num_tuples;
920 ct_val[1] = *num_tuples;
921 pgstat_progress_update_multi_param(2, ct_index, ct_val);
922 }
923 }
924
925 if (indexScan != NULL)
926 index_endscan(indexScan);
927 if (tableScan != NULL)
928 table_endscan(tableScan);
929 if (slot)
930 ExecDropSingleTupleTableSlot(slot);
931
932 /*
933 * In scan-and-sort mode, complete the sort, then read out all live tuples
934 * from the tuplestore and write them to the new relation.
935 */
936 if (tuplesort != NULL)
937 {
938 double n_tuples = 0;
939
940 /* Report that we are now sorting tuples */
941 pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
942 PROGRESS_CLUSTER_PHASE_SORT_TUPLES);
943
944 tuplesort_performsort(tuplesort);
945
946 /* Report that we are now writing new heap */
947 pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
948 PROGRESS_CLUSTER_PHASE_WRITE_NEW_HEAP);
949
950 for (;;)
951 {
952 HeapTuple tuple;
953
954 CHECK_FOR_INTERRUPTS();
955
956 tuple = tuplesort_getheaptuple(tuplesort, true);
957 if (tuple == NULL)
958 break;
959
960 n_tuples += 1;
961 reform_and_rewrite_tuple(tuple,
962 OldHeap, NewHeap,
963 values, isnull,
964 rwstate);
965 /* Report n_tuples */
966 pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_TUPLES_WRITTEN,
967 n_tuples);
968 }
969
970 tuplesort_end(tuplesort);
971 }
972
973 /* Write out any remaining tuples, and fsync if needed */
974 end_heap_rewrite(rwstate);
975
976 /* Clean up */
977 pfree(values);
978 pfree(isnull);
979}
980
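/*
 * Prepare the next block chosen by ANALYZE's sampling: read the page and
 * take a share lock on it.  The pin and lock are held until
 * heapam_scan_analyze_next_tuple() has returned the last tuple of the page.
 */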
981static bool
982heapam_scan_analyze_next_block(TableScanDesc scan, BlockNumber blockno,
983 BufferAccessStrategy bstrategy)
984{
985 HeapScanDesc hscan = (HeapScanDesc) scan;
986
987 /*
988 * We must maintain a pin on the target page's buffer to ensure that
989 * concurrent activity - e.g. HOT pruning - doesn't delete tuples out from
990 * under us. Hence, pin the page until we are done looking at it. We
991 * also choose to hold sharelock on the buffer throughout --- we could
992 * release and re-acquire sharelock for each tuple, but since we aren't
993 * doing much work per tuple, the extra lock traffic is probably better
994 * avoided.
995 */
996 hscan->rs_cblock = blockno;
997 hscan->rs_cindex = FirstOffsetNumber;
998 hscan->rs_cbuf = ReadBufferExtended(scan->rs_rd, MAIN_FORKNUM,
999 blockno, RBM_NORMAL, bstrategy);
1000 LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
1001
1002 /* in heap all blocks can contain tuples, so always return true */
1003 return true;
1004}
1005
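/*
 * Return the next sampled tuple from the block prepared by
 * heapam_scan_analyze_next_block(), classifying each line pointer with
 * HeapTupleSatisfiesVacuum() and updating the live/dead row counters as it
 * goes.  Returns false, after releasing the buffer, once the page is
 * exhausted.
 */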
1006static bool
1007heapam_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin,
1008 double *liverows, double *deadrows,
1009 TupleTableSlot *slot)
1010{
1011 HeapScanDesc hscan = (HeapScanDesc) scan;
1012 Page targpage;
1013 OffsetNumber maxoffset;
1014 BufferHeapTupleTableSlot *hslot;
1015
1016 Assert(TTS_IS_BUFFERTUPLE(slot));
1017
1018 hslot = (BufferHeapTupleTableSlot *) slot;
1019 targpage = BufferGetPage(hscan->rs_cbuf);
1020 maxoffset = PageGetMaxOffsetNumber(targpage);
1021
1022 /* Inner loop over all tuples on the selected page */
1023 for (; hscan->rs_cindex <= maxoffset; hscan->rs_cindex++)
1024 {
1025 ItemId itemid;
1026 HeapTuple targtuple = &hslot->base.tupdata;
1027 bool sample_it = false;
1028
1029 itemid = PageGetItemId(targpage, hscan->rs_cindex);
1030
1031 /*
1032 * We ignore unused and redirect line pointers. DEAD line pointers
1033 * should be counted as dead, because we need vacuum to run to get rid
1034 * of them. Note that this rule agrees with the way that
1035 * heap_page_prune() counts things.
1036 */
1037 if (!ItemIdIsNormal(itemid))
1038 {
1039 if (ItemIdIsDead(itemid))
1040 *deadrows += 1;
1041 continue;
1042 }
1043
1044 ItemPointerSet(&targtuple->t_self, hscan->rs_cblock, hscan->rs_cindex);
1045
1046 targtuple->t_tableOid = RelationGetRelid(scan->rs_rd);
1047 targtuple->t_data = (HeapTupleHeader) PageGetItem(targpage, itemid);
1048 targtuple->t_len = ItemIdGetLength(itemid);
1049
1050 switch (HeapTupleSatisfiesVacuum(targtuple, OldestXmin,
1051 hscan->rs_cbuf))
1052 {
1053 case HEAPTUPLE_LIVE:
1054 sample_it = true;
1055 *liverows += 1;
1056 break;
1057
1058 case HEAPTUPLE_DEAD:
1059 case HEAPTUPLE_RECENTLY_DEAD:
1060 /* Count dead and recently-dead rows */
1061 *deadrows += 1;
1062 break;
1063
1064 case HEAPTUPLE_INSERT_IN_PROGRESS:
1065
1066 /*
1067 * Insert-in-progress rows are not counted. We assume that
1068 * when the inserting transaction commits or aborts, it will
1069 * send a stats message to increment the proper count. This
1070 * works right only if that transaction ends after we finish
1071 * analyzing the table; if things happen in the other order,
1072 * its stats update will be overwritten by ours. However, the
1073 * error will be large only if the other transaction runs long
1074 * enough to insert many tuples, so assuming it will finish
1075 * after us is the safer option.
1076 *
1077 * A special case is that the inserting transaction might be
1078 * our own. In this case we should count and sample the row,
1079 * to accommodate users who load a table and analyze it in one
1080 * transaction. (pgstat_report_analyze has to adjust the
1081 * numbers we send to the stats collector to make this come
1082 * out right.)
1083 */
1084 if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(targtuple->t_data)))
1085 {
1086 sample_it = true;
1087 *liverows += 1;
1088 }
1089 break;
1090
1091 case HEAPTUPLE_DELETE_IN_PROGRESS:
1092
1093 /*
1094 * We count and sample delete-in-progress rows the same as
1095 * live ones, so that the stats counters come out right if the
1096 * deleting transaction commits after us, per the same
1097 * reasoning given above.
1098 *
1099 * If the delete was done by our own transaction, however, we
1100 * must count the row as dead to make pgstat_report_analyze's
1101 * stats adjustments come out right. (Note: this works out
1102 * properly when the row was both inserted and deleted in our
1103 * xact.)
1104 *
1105 * The net effect of these choices is that we act as though an
1106 * IN_PROGRESS transaction hasn't happened yet, except if it
1107 * is our own transaction, which we assume has happened.
1108 *
1109 * This approach ensures that we behave sanely if we see both
1110 * the pre-image and post-image rows for a row being updated
1111 * by a concurrent transaction: we will sample the pre-image
1112 * but not the post-image. We also get sane results if the
1113 * concurrent transaction never commits.
1114 */
1115 if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(targtuple->t_data)))
1116 *deadrows += 1;
1117 else
1118 {
1119 sample_it = true;
1120 *liverows += 1;
1121 }
1122 break;
1123
1124 default:
1125 elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
1126 break;
1127 }
1128
1129 if (sample_it)
1130 {
1131 ExecStoreBufferHeapTuple(targtuple, slot, hscan->rs_cbuf);
1132 hscan->rs_cindex++;
1133
1134 /* note that we leave the buffer locked here! */
1135 return true;
1136 }
1137 }
1138
1139 /* Now release the lock and pin on the page */
1140 UnlockReleaseBuffer(hscan->rs_cbuf);
1141 hscan->rs_cbuf = InvalidBuffer;
1142
1143 /* also prevent old slot contents from having pin on page */
1144 ExecClearTuple(slot);
1145
1146 return false;
1147}
1148
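/*
 * Scan a range of the heap and invoke the index build callback for every
 * tuple that should be indexed, applying the HOT-chain and visibility rules
 * described in the body of the function.  Returns the number of tuples
 * counted as live.
 */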
1149static double
1150heapam_index_build_range_scan(Relation heapRelation,
1151 Relation indexRelation,
1152 IndexInfo *indexInfo,
1153 bool allow_sync,
1154 bool anyvisible,
1155 bool progress,
1156 BlockNumber start_blockno,
1157 BlockNumber numblocks,
1158 IndexBuildCallback callback,
1159 void *callback_state,
1160 TableScanDesc scan)
1161{
1162 HeapScanDesc hscan;
1163 bool is_system_catalog;
1164 bool checking_uniqueness;
1165 HeapTuple heapTuple;
1166 Datum values[INDEX_MAX_KEYS];
1167 bool isnull[INDEX_MAX_KEYS];
1168 double reltuples;
1169 ExprState *predicate;
1170 TupleTableSlot *slot;
1171 EState *estate;
1172 ExprContext *econtext;
1173 Snapshot snapshot;
1174 bool need_unregister_snapshot = false;
1175 TransactionId OldestXmin;
1176 BlockNumber previous_blkno = InvalidBlockNumber;
1177 BlockNumber root_blkno = InvalidBlockNumber;
1178 OffsetNumber root_offsets[MaxHeapTuplesPerPage];
1179
1180 /*
1181 * sanity checks
1182 */
1183 Assert(OidIsValid(indexRelation->rd_rel->relam));
1184
1185 /* Remember if it's a system catalog */
1186 is_system_catalog = IsSystemRelation(heapRelation);
1187
1188 /* See whether we're verifying uniqueness/exclusion properties */
1189 checking_uniqueness = (indexInfo->ii_Unique ||
1190 indexInfo->ii_ExclusionOps != NULL);
1191
1192 /*
1193 * "Any visible" mode is not compatible with uniqueness checks; make sure
1194 * only one of those is requested.
1195 */
1196 Assert(!(anyvisible && checking_uniqueness));
1197
1198 /*
1199 * Need an EState for evaluation of index expressions and partial-index
1200 * predicates. Also a slot to hold the current tuple.
1201 */
1202 estate = CreateExecutorState();
1203 econtext = GetPerTupleExprContext(estate);
1204 slot = table_slot_create(heapRelation, NULL);
1205
1206 /* Arrange for econtext's scan tuple to be the tuple under test */
1207 econtext->ecxt_scantuple = slot;
1208
1209 /* Set up execution state for predicate, if any. */
1210 predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate);
1211
1212 /*
1213 * Prepare for scan of the base relation. In a normal index build, we use
1214 * SnapshotAny because we must retrieve all tuples and do our own time
1215 * qual checks (because we have to index RECENTLY_DEAD tuples). In a
1216 * concurrent build, or during bootstrap, we take a regular MVCC snapshot
1217 * and index whatever's live according to that.
1218 */
1219 OldestXmin = InvalidTransactionId;
1220
1221 /* okay to ignore lazy VACUUMs here */
1222 if (!IsBootstrapProcessingMode() && !indexInfo->ii_Concurrent)
1223 OldestXmin = GetOldestXmin(heapRelation, PROCARRAY_FLAGS_VACUUM);
1224
1225 if (!scan)
1226 {
1227 /*
1228 * Serial index build.
1229 *
1230 * Must begin our own heap scan in this case. We may also need to
1231 * register a snapshot whose lifetime is under our direct control.
1232 */
1233 if (!TransactionIdIsValid(OldestXmin))
1234 {
1235 snapshot = RegisterSnapshot(GetTransactionSnapshot());
1236 need_unregister_snapshot = true;
1237 }
1238 else
1239 snapshot = SnapshotAny;
1240
1241 scan = table_beginscan_strat(heapRelation, /* relation */
1242 snapshot, /* snapshot */
1243 0, /* number of keys */
1244 NULL, /* scan key */
1245 true, /* buffer access strategy OK */
1246 allow_sync); /* syncscan OK? */
1247 }
1248 else
1249 {
1250 /*
1251 * Parallel index build.
1252 *
1253 * Parallel case never registers/unregisters own snapshot. Snapshot
1254 * is taken from parallel heap scan, and is SnapshotAny or an MVCC
1255 * snapshot, based on same criteria as serial case.
1256 */
1257 Assert(!IsBootstrapProcessingMode());
1258 Assert(allow_sync);
1259 snapshot = scan->rs_snapshot;
1260 }
1261
1262 hscan = (HeapScanDesc) scan;
1263
1264 /* Publish number of blocks to scan */
1265 if (progress)
1266 {
1267 BlockNumber nblocks;
1268
1269 if (hscan->rs_base.rs_parallel != NULL)
1270 {
1271 ParallelBlockTableScanDesc pbscan;
1272
1273 pbscan = (ParallelBlockTableScanDesc) hscan->rs_base.rs_parallel;
1274 nblocks = pbscan->phs_nblocks;
1275 }
1276 else
1277 nblocks = hscan->rs_nblocks;
1278
1279 pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
1280 nblocks);
1281 }
1282
1283 /*
1284 * Must call GetOldestXmin() with SnapshotAny. Should never call
1285 * GetOldestXmin() with MVCC snapshot. (It's especially worth checking
1286 * this for parallel builds, since ambuild routines that support parallel
1287 * builds must work these details out for themselves.)
1288 */
1289 Assert(snapshot == SnapshotAny || IsMVCCSnapshot(snapshot));
1290 Assert(snapshot == SnapshotAny ? TransactionIdIsValid(OldestXmin) :
1291 !TransactionIdIsValid(OldestXmin));
1292 Assert(snapshot == SnapshotAny || !anyvisible);
1293
1294 /* set our scan endpoints */
1295 if (!allow_sync)
1296 heap_setscanlimits(scan, start_blockno, numblocks);
1297 else
1298 {
1299 /* syncscan can only be requested on whole relation */
1300 Assert(start_blockno == 0);
1301 Assert(numblocks == InvalidBlockNumber);
1302 }
1303
1304 reltuples = 0;
1305
1306 /*
1307 * Scan all tuples in the base relation.
1308 */
1309 while ((heapTuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
1310 {
1311 bool tupleIsAlive;
1312
1313 CHECK_FOR_INTERRUPTS();
1314
1315 /* Report scan progress, if asked to. */
1316 if (progress)
1317 {
1318 BlockNumber blocks_done = heapam_scan_get_blocks_done(hscan);
1319
1320 if (blocks_done != previous_blkno)
1321 {
1322 pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1323 blocks_done);
1324 previous_blkno = blocks_done;
1325 }
1326 }
1327
1328 /*
1329 * When dealing with a HOT-chain of updated tuples, we want to index
1330 * the values of the live tuple (if any), but index it under the TID
1331 * of the chain's root tuple. This approach is necessary to preserve
1332 * the HOT-chain structure in the heap. So we need to be able to find
1333 * the root item offset for every tuple that's in a HOT-chain. When
1334 * first reaching a new page of the relation, call
1335 * heap_get_root_tuples() to build a map of root item offsets on the
1336 * page.
1337 *
1338 * It might look unsafe to use this information across buffer
1339 * lock/unlock. However, we hold ShareLock on the table so no
1340 * ordinary insert/update/delete should occur; and we hold pin on the
1341 * buffer continuously while visiting the page, so no pruning
1342 * operation can occur either.
1343 *
1344 * Also, although our opinions about tuple liveness could change while
1345 * we scan the page (due to concurrent transaction commits/aborts),
1346 * the chain root locations won't, so this info doesn't need to be
1347 * rebuilt after waiting for another transaction.
1348 *
1349 * Note the implied assumption that there is no more than one live
1350 * tuple per HOT-chain --- else we could create more than one index
1351 * entry pointing to the same root tuple.
1352 */
1353 if (hscan->rs_cblock != root_blkno)
1354 {
1355 Page page = BufferGetPage(hscan->rs_cbuf);
1356
1357 LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
1358 heap_get_root_tuples(page, root_offsets);
1359 LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
1360
1361 root_blkno = hscan->rs_cblock;
1362 }
1363
1364 if (snapshot == SnapshotAny)
1365 {
1366 /* do our own time qual check */
1367 bool indexIt;
1368 TransactionId xwait;
1369
1370 recheck:
1371
1372 /*
1373 * We could possibly get away with not locking the buffer here,
1374 * since caller should hold ShareLock on the relation, but let's
1375 * be conservative about it. (This remark is still correct even
1376 * with HOT-pruning: our pin on the buffer prevents pruning.)
1377 */
1378 LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
1379
1380 /*
1381 * The criteria for counting a tuple as live in this block need to
1382 * match what analyze.c's heapam_scan_analyze_next_tuple() does,
1383 * otherwise CREATE INDEX and ANALYZE may produce wildly different
1384 * reltuples values, e.g. when there are many recently-dead
1385 * tuples.
1386 */
1387 switch (HeapTupleSatisfiesVacuum(heapTuple, OldestXmin,
1388 hscan->rs_cbuf))
1389 {
1390 case HEAPTUPLE_DEAD:
1391 /* Definitely dead, we can ignore it */
1392 indexIt = false;
1393 tupleIsAlive = false;
1394 break;
1395 case HEAPTUPLE_LIVE:
1396 /* Normal case, index and unique-check it */
1397 indexIt = true;
1398 tupleIsAlive = true;
1399 /* Count it as live, too */
1400 reltuples += 1;
1401 break;
1402 case HEAPTUPLE_RECENTLY_DEAD:
1403
1404 /*
1405 * If tuple is recently deleted then we must index it
1406 * anyway to preserve MVCC semantics. (Pre-existing
1407 * transactions could try to use the index after we finish
1408 * building it, and may need to see such tuples.)
1409 *
1410 * However, if it was HOT-updated then we must only index
1411 * the live tuple at the end of the HOT-chain. Since this
1412 * breaks semantics for pre-existing snapshots, mark the
1413 * index as unusable for them.
1414 *
1415 * We don't count recently-dead tuples in reltuples, even
1416 * if we index them; see heapam_scan_analyze_next_tuple().
1417 */
1418 if (HeapTupleIsHotUpdated(heapTuple))
1419 {
1420 indexIt = false;
1421 /* mark the index as unsafe for old snapshots */
1422 indexInfo->ii_BrokenHotChain = true;
1423 }
1424 else
1425 indexIt = true;
1426 /* In any case, exclude the tuple from unique-checking */
1427 tupleIsAlive = false;
1428 break;
1429 case HEAPTUPLE_INSERT_IN_PROGRESS:
1430
1431 /*
1432 * In "anyvisible" mode, this tuple is visible and we
1433 * don't need any further checks.
1434 */
1435 if (anyvisible)
1436 {
1437 indexIt = true;
1438 tupleIsAlive = true;
1439 reltuples += 1;
1440 break;
1441 }
1442
1443 /*
1444 * Since caller should hold ShareLock or better, normally
1445 * the only way to see this is if it was inserted earlier
1446 * in our own transaction. However, it can happen in
1447 * system catalogs, since we tend to release write lock
1448 * before commit there. Give a warning if neither case
1449 * applies.
1450 */
1451 xwait = HeapTupleHeaderGetXmin(heapTuple->t_data);
1452 if (!TransactionIdIsCurrentTransactionId(xwait))
1453 {
1454 if (!is_system_catalog)
1455 elog(WARNING, "concurrent insert in progress within table \"%s\"",
1456 RelationGetRelationName(heapRelation));
1457
1458 /*
1459 * If we are performing uniqueness checks, indexing
1460 * such a tuple could lead to a bogus uniqueness
1461 * failure. In that case we wait for the inserting
1462 * transaction to finish and check again.
1463 */
1464 if (checking_uniqueness)
1465 {
1466 /*
1467 * Must drop the lock on the buffer before we wait
1468 */
1469 LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
1470 XactLockTableWait(xwait, heapRelation,
1471 &heapTuple->t_self,
1472 XLTW_InsertIndexUnique);
1473 CHECK_FOR_INTERRUPTS();
1474 goto recheck;
1475 }
1476 }
1477 else
1478 {
1479 /*
1480 * For consistency with
1481 * heapam_scan_analyze_next_tuple(), count
1482 * HEAPTUPLE_INSERT_IN_PROGRESS tuples as live only
1483 * when inserted by our own transaction.
1484 */
1485 reltuples += 1;
1486 }
1487
1488 /*
1489 * We must index such tuples, since if the index build
1490 * commits then they're good.
1491 */
1492 indexIt = true;
1493 tupleIsAlive = true;
1494 break;
1495 case HEAPTUPLE_DELETE_IN_PROGRESS:
1496
1497 /*
1498 * As with INSERT_IN_PROGRESS case, this is unexpected
1499 * unless it's our own deletion or a system catalog; but
1500 * in anyvisible mode, this tuple is visible.
1501 */
1502 if (anyvisible)
1503 {
1504 indexIt = true;
1505 tupleIsAlive = false;
1506 reltuples += 1;
1507 break;
1508 }
1509
1510 xwait = HeapTupleHeaderGetUpdateXid(heapTuple->t_data);
1511 if (!TransactionIdIsCurrentTransactionId(xwait))
1512 {
1513 if (!is_system_catalog)
1514 elog(WARNING, "concurrent delete in progress within table \"%s\"",
1515 RelationGetRelationName(heapRelation));
1516
1517 /*
1518 * If we are performing uniqueness checks, assuming
1519 * the tuple is dead could lead to missing a
1520 * uniqueness violation. In that case we wait for the
1521 * deleting transaction to finish and check again.
1522 *
1523 * Also, if it's a HOT-updated tuple, we should not
1524 * index it but rather the live tuple at the end of
1525 * the HOT-chain. However, the deleting transaction
1526 * could abort, possibly leaving this tuple as live
1527 * after all, in which case it has to be indexed. The
1528 * only way to know what to do is to wait for the
1529 * deleting transaction to finish and check again.
1530 */
1531 if (checking_uniqueness ||
1532 HeapTupleIsHotUpdated(heapTuple))
1533 {
1534 /*
1535 * Must drop the lock on the buffer before we wait
1536 */
1537 LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
1538 XactLockTableWait(xwait, heapRelation,
1539 &heapTuple->t_self,
1540 XLTW_InsertIndexUnique);
1541 CHECK_FOR_INTERRUPTS();
1542 goto recheck;
1543 }
1544
1545 /*
1546 * Otherwise index it but don't check for uniqueness,
1547 * the same as a RECENTLY_DEAD tuple.
1548 */
1549 indexIt = true;
1550
1551 /*
1552 * Count HEAPTUPLE_DELETE_IN_PROGRESS tuples as live,
1553 * if they were not deleted by the current
1554 * transaction. That's what
1555 * heapam_scan_analyze_next_tuple() does, and we want
1556 * the behavior to be consistent.
1557 */
1558 reltuples += 1;
1559 }
1560 else if (HeapTupleIsHotUpdated(heapTuple))
1561 {
1562 /*
1563 * It's a HOT-updated tuple deleted by our own xact.
1564 * We can assume the deletion will commit (else the
1565 * index contents don't matter), so treat the same as
1566 * RECENTLY_DEAD HOT-updated tuples.
1567 */
1568 indexIt = false;
1569 /* mark the index as unsafe for old snapshots */
1570 indexInfo->ii_BrokenHotChain = true;
1571 }
1572 else
1573 {
1574 /*
1575 * It's a regular tuple deleted by our own xact. Index
1576 * it, but don't check for uniqueness nor count in
1577 * reltuples, the same as a RECENTLY_DEAD tuple.
1578 */
1579 indexIt = true;
1580 }
1581 /* In any case, exclude the tuple from unique-checking */
1582 tupleIsAlive = false;
1583 break;
1584 default:
1585 elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
1586 indexIt = tupleIsAlive = false; /* keep compiler quiet */
1587 break;
1588 }
1589
1590 LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
1591
1592 if (!indexIt)
1593 continue;
1594 }
1595 else
1596 {
1597 /* heap_getnext did the time qual check */
1598 tupleIsAlive = true;
1599 reltuples += 1;
1600 }
1601
1602 MemoryContextReset(econtext->ecxt_per_tuple_memory);
1603
1604 /* Set up for predicate or expression evaluation */
1605 ExecStoreBufferHeapTuple(heapTuple, slot, hscan->rs_cbuf);
1606
1607 /*
1608 * In a partial index, discard tuples that don't satisfy the
1609 * predicate.
1610 */
1611 if (predicate != NULL)
1612 {
1613 if (!ExecQual(predicate, econtext))
1614 continue;
1615 }
1616
1617 /*
1618 * For the current heap tuple, extract all the attributes we use in
1619 * this index, and note which are null. This also performs evaluation
1620 * of any expressions needed.
1621 */
1622 FormIndexDatum(indexInfo,
1623 slot,
1624 estate,
1625 values,
1626 isnull);
1627
1628 /*
1629 * You'd think we should go ahead and build the index tuple here, but
1630 * some index AMs want to do further processing on the data first. So
1631 * pass the values[] and isnull[] arrays, instead.
1632 */
1633
1634 if (HeapTupleIsHeapOnly(heapTuple))
1635 {
1636 /*
1637 * For a heap-only tuple, pretend its TID is that of the root. See
1638 * src/backend/access/heap/README.HOT for discussion.
1639 */
1640 HeapTupleData rootTuple;
1641 OffsetNumber offnum;
1642
1643 rootTuple = *heapTuple;
1644 offnum = ItemPointerGetOffsetNumber(&heapTuple->t_self);
1645
1646 if (!OffsetNumberIsValid(root_offsets[offnum - 1]))
1647 ereport(ERROR,
1648 (errcode(ERRCODE_DATA_CORRUPTED),
1649 errmsg_internal("failed to find parent tuple for heap-only tuple at (%u,%u) in table \"%s\"",
1650 ItemPointerGetBlockNumber(&heapTuple->t_self),
1651 offnum,
1652 RelationGetRelationName(heapRelation))));
1653
1654 ItemPointerSetOffsetNumber(&rootTuple.t_self,
1655 root_offsets[offnum - 1]);
1656
1657 /* Call the AM's callback routine to process the tuple */
1658 callback(indexRelation, &rootTuple, values, isnull, tupleIsAlive,
1659 callback_state);
1660 }
1661 else
1662 {
1663 /* Call the AM's callback routine to process the tuple */
1664 callback(indexRelation, heapTuple, values, isnull, tupleIsAlive,
1665 callback_state);
1666 }
1667 }
1668
1669 /* Report scan progress one last time. */
1670 if (progress)
1671 {
1672 BlockNumber blks_done;
1673
1674 if (hscan->rs_base.rs_parallel != NULL)
1675 {
1676 ParallelBlockTableScanDesc pbscan;
1677
1678 pbscan = (ParallelBlockTableScanDesc) hscan->rs_base.rs_parallel;
1679 blks_done = pbscan->phs_nblocks;
1680 }
1681 else
1682 blks_done = hscan->rs_nblocks;
1683
1684 pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1685 blks_done);
1686 }
1687
1688 table_endscan(scan);
1689
1690 /* we can now forget our snapshot, if set and registered by us */
1691 if (need_unregister_snapshot)
1692 UnregisterSnapshot(snapshot);
1693
1694 ExecDropSingleTupleTableSlot(slot);
1695
1696 FreeExecutorState(estate);
1697
1698 /* These may have been pointing to the now-gone estate */
1699 indexInfo->ii_ExpressionsState = NIL;
1700 indexInfo->ii_PredicateState = NULL;
1701
1702 return reltuples;
1703}
1704
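/*
 * Second phase of a concurrent index build (validate_index): scan the heap
 * using the reference snapshot and insert index entries for any tuples that
 * are not already present in the sorted set of TIDs gathered from the index
 * during the first phase.
 */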
1705static void
1706heapam_index_validate_scan(Relation heapRelation,
1707 Relation indexRelation,
1708 IndexInfo *indexInfo,
1709 Snapshot snapshot,
1710 ValidateIndexState *state)
1711{
1712 TableScanDesc scan;
1713 HeapScanDesc hscan;
1714 HeapTuple heapTuple;
1715 Datum values[INDEX_MAX_KEYS];
1716 bool isnull[INDEX_MAX_KEYS];
1717 ExprState *predicate;
1718 TupleTableSlot *slot;
1719 EState *estate;
1720 ExprContext *econtext;
1721 BlockNumber root_blkno = InvalidBlockNumber;
1722 OffsetNumber root_offsets[MaxHeapTuplesPerPage];
1723 bool in_index[MaxHeapTuplesPerPage];
1724 BlockNumber previous_blkno = InvalidBlockNumber;
1725
1726 /* state variables for the merge */
1727 ItemPointer indexcursor = NULL;
1728 ItemPointerData decoded;
1729 bool tuplesort_empty = false;
1730
1731 /*
1732 * sanity checks
1733 */
1734 Assert(OidIsValid(indexRelation->rd_rel->relam));
1735
1736 /*
1737 * Need an EState for evaluation of index expressions and partial-index
1738 * predicates. Also a slot to hold the current tuple.
1739 */
1740 estate = CreateExecutorState();
1741 econtext = GetPerTupleExprContext(estate);
1742 slot = MakeSingleTupleTableSlot(RelationGetDescr(heapRelation),
1743 &TTSOpsHeapTuple);
1744
1745 /* Arrange for econtext's scan tuple to be the tuple under test */
1746 econtext->ecxt_scantuple = slot;
1747
1748 /* Set up execution state for predicate, if any. */
1749 predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate);
1750
1751 /*
1752 * Prepare for scan of the base relation. We need just those tuples
1753 * satisfying the passed-in reference snapshot. We must disable syncscan
1754 * here, because it's critical that we read from block zero forward to
1755 * match the sorted TIDs.
1756 */
1757 scan = table_beginscan_strat(heapRelation, /* relation */
1758 snapshot, /* snapshot */
1759 0, /* number of keys */
1760 NULL, /* scan key */
1761 true, /* buffer access strategy OK */
1762 false); /* syncscan not OK */
1763 hscan = (HeapScanDesc) scan;
1764
1765 pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
1766 hscan->rs_nblocks);
1767
1768 /*
1769 * Scan all tuples matching the snapshot.
1770 */
1771 while ((heapTuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
1772 {
1773 ItemPointer heapcursor = &heapTuple->t_self;
1774 ItemPointerData rootTuple;
1775 OffsetNumber root_offnum;
1776
1777 CHECK_FOR_INTERRUPTS();
1778
1779 state->htups += 1;
1780
1781 if ((previous_blkno == InvalidBlockNumber) ||
1782 (hscan->rs_cblock != previous_blkno))
1783 {
1784 pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1785 hscan->rs_cblock);
1786 previous_blkno = hscan->rs_cblock;
1787 }
1788
1789 /*
1790 * As commented in table_index_build_scan, we should index heap-only
1791 * tuples under the TIDs of their root tuples; so when we advance onto
1792 * a new heap page, build a map of root item offsets on the page.
1793 *
1794 * This complicates merging against the tuplesort output: we will
1795 * visit the live tuples in order by their offsets, but the root
1796 * offsets that we need to compare against the index contents might be
1797 * ordered differently. So we might have to "look back" within the
1798 * tuplesort output, but only within the current page. We handle that
1799 * by keeping a bool array in_index[] showing all the
1800 * already-passed-over tuplesort output TIDs of the current page. We
1801 * clear that array here, when advancing onto a new heap page.
1802 */
1803 if (hscan->rs_cblock != root_blkno)
1804 {
1805 Page page = BufferGetPage(hscan->rs_cbuf);
1806
1807 LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
1808 heap_get_root_tuples(page, root_offsets);
1809 LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
1810
1811 memset(in_index, 0, sizeof(in_index));
1812
1813 root_blkno = hscan->rs_cblock;
1814 }
1815
1816 /* Convert actual tuple TID to root TID */
1817 rootTuple = *heapcursor;
1818 root_offnum = ItemPointerGetOffsetNumber(heapcursor);
1819
1820 if (HeapTupleIsHeapOnly(heapTuple))
1821 {
1822 root_offnum = root_offsets[root_offnum - 1];
1823 if (!OffsetNumberIsValid(root_offnum))
1824 ereport(ERROR,
1825 (errcode(ERRCODE_DATA_CORRUPTED),
1826 errmsg_internal("failed to find parent tuple for heap-only tuple at (%u,%u) in table \"%s\"",
1827 ItemPointerGetBlockNumber(heapcursor),
1828 ItemPointerGetOffsetNumber(heapcursor),
1829 RelationGetRelationName(heapRelation))));
1830 ItemPointerSetOffsetNumber(&rootTuple, root_offnum);
1831 }
1832
1833 /*
1834 * "merge" by skipping through the index tuples until we find or pass
1835 * the current root tuple.
1836 */
1837 while (!tuplesort_empty &&
1838 (!indexcursor ||
1839 ItemPointerCompare(indexcursor, &rootTuple) < 0))
1840 {
1841 Datum ts_val;
1842 bool ts_isnull;
1843
1844 if (indexcursor)
1845 {
1846 /*
1847 * Remember index items seen earlier on the current heap page
1848 */
1849 if (ItemPointerGetBlockNumber(indexcursor) == root_blkno)
1850 in_index[ItemPointerGetOffsetNumber(indexcursor) - 1] = true;
1851 }
1852
1853 tuplesort_empty = !tuplesort_getdatum(state->tuplesort, true,
1854 &ts_val, &ts_isnull, NULL);
1855 Assert(tuplesort_empty || !ts_isnull);
1856 if (!tuplesort_empty)
1857 {
1858 itemptr_decode(&decoded, DatumGetInt64(ts_val));
1859 indexcursor = &decoded;
1860
1861 /* If int8 is pass-by-ref, free (encoded) TID Datum memory */
1862#ifndef USE_FLOAT8_BYVAL
1863 pfree(DatumGetPointer(ts_val));
1864#endif
1865 }
1866 else
1867 {
1868 /* Be tidy */
1869 indexcursor = NULL;
1870 }
1871 }
1872
1873 /*
1874 * If the tuplesort has overshot *and* we didn't see a match earlier,
1875 * then this tuple is missing from the index, so insert it.
1876 */
1877 if ((tuplesort_empty ||
1878 ItemPointerCompare(indexcursor, &rootTuple) > 0) &&
1879 !in_index[root_offnum - 1])
1880 {
1881 MemoryContextReset(econtext->ecxt_per_tuple_memory);
1882
1883 /* Set up for predicate or expression evaluation */
1884 ExecStoreHeapTuple(heapTuple, slot, false);
1885
1886 /*
1887 * In a partial index, discard tuples that don't satisfy the
1888 * predicate.
1889 */
1890 if (predicate != NULL)
1891 {
1892 if (!ExecQual(predicate, econtext))
1893 continue;
1894 }
1895
1896 /*
1897 * For the current heap tuple, extract all the attributes we use
1898 * in this index, and note which are null. This also performs
1899 * evaluation of any expressions needed.
1900 */
1901 FormIndexDatum(indexInfo,
1902 slot,
1903 estate,
1904 values,
1905 isnull);
1906
1907 /*
1908 * You'd think we should go ahead and build the index tuple here,
1909 * but some index AMs want to do further processing on the data
1910 * first. So pass the values[] and isnull[] arrays, instead.
1911 */
1912
1913 /*
1914 * If the tuple is already committed dead, you might think we
1915 * could suppress uniqueness checking, but this is no longer true
1916 * in the presence of HOT, because the insert is actually a proxy
1917 * for a uniqueness check on the whole HOT-chain. That is, the
1918 * tuple we have here could be dead because it was already
1919 * HOT-updated, and if so the updating transaction will not have
1920 * thought it should insert index entries. The index AM will
1921 * check the whole HOT-chain and correctly detect a conflict if
1922 * there is one.
1923 */
1924
1925 index_insert(indexRelation,
1926 values,
1927 isnull,
1928 &rootTuple,
1929 heapRelation,
1930 indexInfo->ii_Unique ?
1931 UNIQUE_CHECK_YES : UNIQUE_CHECK_NO,
1932 indexInfo);
1933
1934 state->tups_inserted += 1;
1935 }
1936 }
1937
1938 table_endscan(scan);
1939
1940 ExecDropSingleTupleTableSlot(slot);
1941
1942 FreeExecutorState(estate);
1943
1944 /* These may have been pointing to the now-gone estate */
1945 indexInfo->ii_ExpressionsState = NIL;
1946 indexInfo->ii_PredicateState = NULL;
1947}
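
/*
 * Worked example of the merge above (illustrative only; offsets are made
 * up).  Suppose the current heap page has root line pointers 1 and 2, and
 * offset 4 is a heap-only member of the HOT chain rooted at offset 1.  The
 * sorted tuplesort output for this page then contains the root TIDs
 * (page,1) and (page,2).  Scanning the heap in offset order we visit
 * offsets 2 and then 4; offset 4 maps to root offset 1, which the tuplesort
 * cursor has already passed while handling offset 2, so it is the in_index[]
 * array that tells us whether (page,1) was present in the index or still
 * needs to be inserted.
 */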
1948
/*
 * Return the number of blocks that have been read by this scan since
 * starting.  This is meant for progress reporting rather than being fully
 * accurate: in a parallel scan, workers can concurrently be reading blocks
 * further ahead than the block we report here.
 */
1955static BlockNumber
1956heapam_scan_get_blocks_done(HeapScanDesc hscan)
1957{
1958 ParallelBlockTableScanDesc bpscan = NULL;
1959 BlockNumber startblock;
1960 BlockNumber blocks_done;
1961
1962 if (hscan->rs_base.rs_parallel != NULL)
1963 {
1964 bpscan = (ParallelBlockTableScanDesc) hscan->rs_base.rs_parallel;
1965 startblock = bpscan->phs_startblock;
1966 }
1967 else
1968 startblock = hscan->rs_startblock;
1969
1970 /*
1971 * Might have wrapped around the end of the relation, if startblock was
1972 * not zero.
1973 */
1974 if (hscan->rs_cblock > startblock)
1975 blocks_done = hscan->rs_cblock - startblock;
1976 else
1977 {
1978 BlockNumber nblocks;
1979
1980 nblocks = bpscan != NULL ? bpscan->phs_nblocks : hscan->rs_nblocks;
1981 blocks_done = nblocks - startblock +
1982 hscan->rs_cblock;
1983 }
1984
1985 return blocks_done;
1986}
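
/*
 * Illustration of the wraparound arithmetic above (made-up numbers): with
 * rs_nblocks = 100 and a synchronized scan that started at block 70, being
 * positioned on block 25 means blocks 70..99 and 0..24 are behind us, and
 * the second branch yields 100 - 70 + 25 = 55.  When no wraparound has
 * happened yet (say rs_cblock = 90), the first branch simply yields
 * 90 - 70 = 20.
 */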
1987
1988
1989/* ------------------------------------------------------------------------
1990 * Miscellaneous callbacks for the heap AM
1991 * ------------------------------------------------------------------------
1992 */
1993
1994static uint64
1995heapam_relation_size(Relation rel, ForkNumber forkNumber)
1996{
1997 uint64 nblocks = 0;
1998
1999 /* Open it at the smgr level if not already done */
2000 RelationOpenSmgr(rel);
2001
2002 /* InvalidForkNumber indicates returning the size for all forks */
2003 if (forkNumber == InvalidForkNumber)
2004 {
2005 for (int i = 0; i < MAX_FORKNUM; i++)
2006 nblocks += smgrnblocks(rel->rd_smgr, i);
2007 }
2008 else
2009 nblocks = smgrnblocks(rel->rd_smgr, forkNumber);
2010
2011 return nblocks * BLCKSZ;
2012}
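
/*
 * For example (assuming the default BLCKSZ of 8192): a relation whose main
 * fork is 1000 blocks long reports 8192000 bytes for MAIN_FORKNUM; when
 * called with InvalidForkNumber, the free-space-map and visibility-map
 * forks are added in as well.
 */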
2013
2014/*
2015 * Check to see whether the table needs a TOAST table. It does only if
2016 * (1) there are any toastable attributes, and (2) the maximum length
2017 * of a tuple could exceed TOAST_TUPLE_THRESHOLD. (We don't want to
2018 * create a toast table for something like "f1 varchar(20)".)
2019 */
2020static bool
2021heapam_relation_needs_toast_table(Relation rel)
2022{
2023 int32 data_length = 0;
2024 bool maxlength_unknown = false;
2025 bool has_toastable_attrs = false;
2026 TupleDesc tupdesc = rel->rd_att;
2027 int32 tuple_length;
2028 int i;
2029
2030 for (i = 0; i < tupdesc->natts; i++)
2031 {
2032 Form_pg_attribute att = TupleDescAttr(tupdesc, i);
2033
2034 if (att->attisdropped)
2035 continue;
2036 data_length = att_align_nominal(data_length, att->attalign);
2037 if (att->attlen > 0)
2038 {
2039 /* Fixed-length types are never toastable */
2040 data_length += att->attlen;
2041 }
2042 else
2043 {
2044 int32 maxlen = type_maximum_size(att->atttypid,
2045 att->atttypmod);
2046
2047 if (maxlen < 0)
2048 maxlength_unknown = true;
2049 else
2050 data_length += maxlen;
2051 if (att->attstorage != 'p')
2052 has_toastable_attrs = true;
2053 }
2054 }
	if (!has_toastable_attrs)
		return false;			/* no toastable attributes */
	if (maxlength_unknown)
		return true;			/* some attribute has unbounded length */
2059 tuple_length = MAXALIGN(SizeofHeapTupleHeader +
2060 BITMAPLEN(tupdesc->natts)) +
2061 MAXALIGN(data_length);
2062 return (tuple_length > TOAST_TUPLE_THRESHOLD);
2063}
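
/*
 * Rough illustration (exact numbers depend on build options): with 8 kB
 * pages TOAST_TUPLE_THRESHOLD is about 2 kB, so a table such as
 *
 *		CREATE TABLE t (f1 varchar(20), f2 int);
 *
 * has a bounded maximum tuple length well under the threshold and gets no
 * TOAST table, whereas an unconstrained "f1 text" column makes
 * maxlength_unknown true and forces one to be created.
 */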
2064
2065
2066/* ------------------------------------------------------------------------
2067 * Planner related callbacks for the heap AM
2068 * ------------------------------------------------------------------------
2069 */
2070
2071static void
2072heapam_estimate_rel_size(Relation rel, int32 *attr_widths,
2073 BlockNumber *pages, double *tuples,
2074 double *allvisfrac)
2075{
2076 BlockNumber curpages;
2077 BlockNumber relpages;
2078 double reltuples;
2079 BlockNumber relallvisible;
2080 double density;
2081
2082 /* it has storage, ok to call the smgr */
2083 curpages = RelationGetNumberOfBlocks(rel);
2084
2085 /* coerce values in pg_class to more desirable types */
2086 relpages = (BlockNumber) rel->rd_rel->relpages;
2087 reltuples = (double) rel->rd_rel->reltuples;
2088 relallvisible = (BlockNumber) rel->rd_rel->relallvisible;
2089
2090 /*
2091 * HACK: if the relation has never yet been vacuumed, use a minimum size
2092 * estimate of 10 pages. The idea here is to avoid assuming a
2093 * newly-created table is really small, even if it currently is, because
2094 * that may not be true once some data gets loaded into it. Once a vacuum
2095 * or analyze cycle has been done on it, it's more reasonable to believe
2096 * the size is somewhat stable.
2097 *
2098 * (Note that this is only an issue if the plan gets cached and used again
2099 * after the table has been filled. What we're trying to avoid is using a
2100 * nestloop-type plan on a table that has grown substantially since the
2101 * plan was made. Normally, autovacuum/autoanalyze will occur once enough
2102 * inserts have happened and cause cached-plan invalidation; but that
2103 * doesn't happen instantaneously, and it won't happen at all for cases
2104 * such as temporary tables.)
2105 *
	 * We approximate "never vacuumed" by "has relpages = 0", which means this
	 * will also fire on genuinely empty relations.  Not great, but
	 * fortunately that's a seldom-seen case in the real world, and erring in
	 * this direction shouldn't degrade the quality of the plan too much
	 * anyway.
2111 *
2112 * If the table has inheritance children, we don't apply this heuristic.
2113 * Totally empty parent tables are quite common, so we should be willing
2114 * to believe that they are empty.
2115 */
2116 if (curpages < 10 &&
2117 relpages == 0 &&
2118 !rel->rd_rel->relhassubclass)
2119 curpages = 10;
2120
2121 /* report estimated # pages */
2122 *pages = curpages;
2123 /* quick exit if rel is clearly empty */
2124 if (curpages == 0)
2125 {
2126 *tuples = 0;
2127 *allvisfrac = 0;
2128 return;
2129 }
2130
2131 /* estimate number of tuples from previous tuple density */
2132 if (relpages > 0)
2133 density = reltuples / (double) relpages;
2134 else
2135 {
2136 /*
2137 * When we have no data because the relation was truncated, estimate
2138 * tuple width from attribute datatypes. We assume here that the
2139 * pages are completely full, which is OK for tables (since they've
2140 * presumably not been VACUUMed yet) but is probably an overestimate
2141 * for indexes. Fortunately get_relation_info() can clamp the
2142 * overestimate to the parent table's size.
2143 *
2144 * Note: this code intentionally disregards alignment considerations,
2145 * because (a) that would be gilding the lily considering how crude
2146 * the estimate is, and (b) it creates platform dependencies in the
2147 * default plans which are kind of a headache for regression testing.
2148 */
2149 int32 tuple_width;
2150
2151 tuple_width = get_rel_data_width(rel, attr_widths);
2152 tuple_width += MAXALIGN(SizeofHeapTupleHeader);
2153 tuple_width += sizeof(ItemIdData);
2154 /* note: integer division is intentional here */
2155 density = (BLCKSZ - SizeOfPageHeaderData) / tuple_width;
2156 }
2157 *tuples = rint(density * (double) curpages);
2158
2159 /*
2160 * We use relallvisible as-is, rather than scaling it up like we do for
2161 * the pages and tuples counts, on the theory that any pages added since
2162 * the last VACUUM are most likely not marked all-visible. But costsize.c
2163 * wants it converted to a fraction.
2164 */
2165 if (relallvisible == 0 || curpages <= 0)
2166 *allvisfrac = 0;
2167 else if ((double) relallvisible >= curpages)
2168 *allvisfrac = 1;
2169 else
2170 *allvisfrac = (double) relallvisible / curpages;
2171}
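
/*
 * Example of the extrapolation above (made-up pg_class numbers): if the
 * last VACUUM recorded relpages = 100 and reltuples = 10000, the density is
 * 100 tuples/page; if the relation has since grown to curpages = 150, we
 * report rint(100 * 150) = 15000 tuples.  If additionally
 * relallvisible = 60, *allvisfrac comes out as 60/150 = 0.4.
 */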
2172
2173
2174/* ------------------------------------------------------------------------
2175 * Executor related callbacks for the heap AM
2176 * ------------------------------------------------------------------------
2177 */
2178
2179static bool
2180heapam_scan_bitmap_next_block(TableScanDesc scan,
2181 TBMIterateResult *tbmres)
2182{
2183 HeapScanDesc hscan = (HeapScanDesc) scan;
2184 BlockNumber page = tbmres->blockno;
2185 Buffer buffer;
2186 Snapshot snapshot;
2187 int ntup;
2188
2189 hscan->rs_cindex = 0;
2190 hscan->rs_ntuples = 0;
2191
2192 /*
2193 * Ignore any claimed entries past what we think is the end of the
	 * relation.  It may have been extended after the start of our scan (we
	 * only hold an AccessShareLock, and the new blocks could even come from
	 * inserts done by our own backend).
2197 */
2198 if (page >= hscan->rs_nblocks)
2199 return false;
2200
2201 /*
2202 * Acquire pin on the target heap page, trading in any pin we held before.
2203 */
2204 hscan->rs_cbuf = ReleaseAndReadBuffer(hscan->rs_cbuf,
2205 scan->rs_rd,
2206 page);
2207 hscan->rs_cblock = page;
2208 buffer = hscan->rs_cbuf;
2209 snapshot = scan->rs_snapshot;
2210
2211 ntup = 0;
2212
2213 /*
2214 * Prune and repair fragmentation for the whole page, if possible.
2215 */
2216 heap_page_prune_opt(scan->rs_rd, buffer);
2217
2218 /*
2219 * We must hold share lock on the buffer content while examining tuple
2220 * visibility. Afterwards, however, the tuples we have found to be
2221 * visible are guaranteed good as long as we hold the buffer pin.
2222 */
2223 LockBuffer(buffer, BUFFER_LOCK_SHARE);
2224
2225 /*
2226 * We need two separate strategies for lossy and non-lossy cases.
2227 */
2228 if (tbmres->ntuples >= 0)
2229 {
2230 /*
2231 * Bitmap is non-lossy, so we just look through the offsets listed in
2232 * tbmres; but we have to follow any HOT chain starting at each such
2233 * offset.
2234 */
2235 int curslot;
2236
2237 for (curslot = 0; curslot < tbmres->ntuples; curslot++)
2238 {
2239 OffsetNumber offnum = tbmres->offsets[curslot];
2240 ItemPointerData tid;
2241 HeapTupleData heapTuple;
2242
2243 ItemPointerSet(&tid, page, offnum);
2244 if (heap_hot_search_buffer(&tid, scan->rs_rd, buffer, snapshot,
2245 &heapTuple, NULL, true))
2246 hscan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid);
2247 }
2248 }
2249 else
2250 {
2251 /*
2252 * Bitmap is lossy, so we must examine each line pointer on the page.
2253 * But we can ignore HOT chains, since we'll check each tuple anyway.
2254 */
2255 Page dp = (Page) BufferGetPage(buffer);
2256 OffsetNumber maxoff = PageGetMaxOffsetNumber(dp);
2257 OffsetNumber offnum;
2258
2259 for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
2260 {
2261 ItemId lp;
2262 HeapTupleData loctup;
2263 bool valid;
2264
2265 lp = PageGetItemId(dp, offnum);
2266 if (!ItemIdIsNormal(lp))
2267 continue;
2268 loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
2269 loctup.t_len = ItemIdGetLength(lp);
2270 loctup.t_tableOid = scan->rs_rd->rd_id;
2271 ItemPointerSet(&loctup.t_self, page, offnum);
2272 valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
2273 if (valid)
2274 {
2275 hscan->rs_vistuples[ntup++] = offnum;
2276 PredicateLockTuple(scan->rs_rd, &loctup, snapshot);
2277 }
2278 CheckForSerializableConflictOut(valid, scan->rs_rd, &loctup,
2279 buffer, snapshot);
2280 }
2281 }
2282
2283 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2284
2285 Assert(ntup <= MaxHeapTuplesPerPage);
2286 hscan->rs_ntuples = ntup;
2287
2288 return ntup > 0;
2289}
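
/*
 * Note on the two branches above: an exact TBMIterateResult carries the
 * specific offsets to check (tbmres->ntuples >= 0), e.g. offsets {2, 5, 9}
 * on this page, and only the HOT chains rooted at those offsets are
 * followed; a lossy result (ntuples == -1) only says "something on this
 * page matched", so every normal line pointer is tested for visibility
 * instead.
 */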
2290
2291static bool
2292heapam_scan_bitmap_next_tuple(TableScanDesc scan,
2293 TBMIterateResult *tbmres,
2294 TupleTableSlot *slot)
2295{
2296 HeapScanDesc hscan = (HeapScanDesc) scan;
2297 OffsetNumber targoffset;
2298 Page dp;
2299 ItemId lp;
2300
2301 /*
2302 * Out of range? If so, nothing more to look at on this page
2303 */
2304 if (hscan->rs_cindex < 0 || hscan->rs_cindex >= hscan->rs_ntuples)
2305 return false;
2306
2307 targoffset = hscan->rs_vistuples[hscan->rs_cindex];
2308 dp = (Page) BufferGetPage(hscan->rs_cbuf);
2309 lp = PageGetItemId(dp, targoffset);
2310 Assert(ItemIdIsNormal(lp));
2311
2312 hscan->rs_ctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
2313 hscan->rs_ctup.t_len = ItemIdGetLength(lp);
2314 hscan->rs_ctup.t_tableOid = scan->rs_rd->rd_id;
2315 ItemPointerSet(&hscan->rs_ctup.t_self, hscan->rs_cblock, targoffset);
2316
2317 pgstat_count_heap_fetch(scan->rs_rd);
2318
2319 /*
2320 * Set up the result slot to point to this tuple. Note that the slot
2321 * acquires a pin on the buffer.
2322 */
2323 ExecStoreBufferHeapTuple(&hscan->rs_ctup,
2324 slot,
2325 hscan->rs_cbuf);
2326
2327 hscan->rs_cindex++;
2328
2329 return true;
2330}
2331
2332static bool
2333heapam_scan_sample_next_block(TableScanDesc scan, SampleScanState *scanstate)
2334{
2335 HeapScanDesc hscan = (HeapScanDesc) scan;
2336 TsmRoutine *tsm = scanstate->tsmroutine;
2337 BlockNumber blockno;
2338
2339 /* return false immediately if relation is empty */
2340 if (hscan->rs_nblocks == 0)
2341 return false;
2342
2343 if (tsm->NextSampleBlock)
2344 {
2345 blockno = tsm->NextSampleBlock(scanstate, hscan->rs_nblocks);
2346 hscan->rs_cblock = blockno;
2347 }
2348 else
2349 {
2350 /* scanning table sequentially */
2351
2352 if (hscan->rs_cblock == InvalidBlockNumber)
2353 {
2354 Assert(!hscan->rs_inited);
2355 blockno = hscan->rs_startblock;
2356 }
2357 else
2358 {
2359 Assert(hscan->rs_inited);
2360
2361 blockno = hscan->rs_cblock + 1;
2362
2363 if (blockno >= hscan->rs_nblocks)
2364 {
2365 /* wrap to beginning of rel, might not have started at 0 */
2366 blockno = 0;
2367 }
2368
2369 /*
2370 * Report our new scan position for synchronization purposes.
2371 *
2372 * Note: we do this before checking for end of scan so that the
2373 * final state of the position hint is back at the start of the
2374 * rel. That's not strictly necessary, but otherwise when you run
2375 * the same query multiple times the starting position would shift
2376 * a little bit backwards on every invocation, which is confusing.
2377 * We don't guarantee any specific ordering in general, though.
2378 */
2379 if (scan->rs_flags & SO_ALLOW_SYNC)
2380 ss_report_location(scan->rs_rd, blockno);
2381
2382 if (blockno == hscan->rs_startblock)
2383 {
2384 blockno = InvalidBlockNumber;
2385 }
2386 }
2387 }
2388
2389 if (!BlockNumberIsValid(blockno))
2390 {
2391 if (BufferIsValid(hscan->rs_cbuf))
2392 ReleaseBuffer(hscan->rs_cbuf);
2393 hscan->rs_cbuf = InvalidBuffer;
2394 hscan->rs_cblock = InvalidBlockNumber;
2395 hscan->rs_inited = false;
2396
2397 return false;
2398 }
2399
2400 heapgetpage(scan, blockno);
2401 hscan->rs_inited = true;
2402
2403 return true;
2404}
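
/*
 * For instance, with rs_nblocks = 8 and a synchronized start at block 5, a
 * tablesample method that provides no NextSampleBlock callback visits
 * blocks 5, 6, 7, 0, 1, 2, 3, 4 and then stops once the next candidate
 * block equals rs_startblock again.
 */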
2405
2406static bool
2407heapam_scan_sample_next_tuple(TableScanDesc scan, SampleScanState *scanstate,
2408 TupleTableSlot *slot)
2409{
2410 HeapScanDesc hscan = (HeapScanDesc) scan;
2411 TsmRoutine *tsm = scanstate->tsmroutine;
2412 BlockNumber blockno = hscan->rs_cblock;
2413 bool pagemode = (scan->rs_flags & SO_ALLOW_PAGEMODE) != 0;
2414
2415 Page page;
2416 bool all_visible;
2417 OffsetNumber maxoffset;
2418
2419 /*
2420 * When not using pagemode, we must lock the buffer during tuple
2421 * visibility checks.
2422 */
2423 if (!pagemode)
2424 LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
2425
2426 page = (Page) BufferGetPage(hscan->rs_cbuf);
2427 all_visible = PageIsAllVisible(page) &&
2428 !scan->rs_snapshot->takenDuringRecovery;
2429 maxoffset = PageGetMaxOffsetNumber(page);
2430
2431 for (;;)
2432 {
2433 OffsetNumber tupoffset;
2434
2435 CHECK_FOR_INTERRUPTS();
2436
2437 /* Ask the tablesample method which tuples to check on this page. */
2438 tupoffset = tsm->NextSampleTuple(scanstate,
2439 blockno,
2440 maxoffset);
2441
2442 if (OffsetNumberIsValid(tupoffset))
2443 {
2444 ItemId itemid;
2445 bool visible;
2446 HeapTuple tuple = &(hscan->rs_ctup);
2447
2448 /* Skip invalid tuple pointers. */
2449 itemid = PageGetItemId(page, tupoffset);
2450 if (!ItemIdIsNormal(itemid))
2451 continue;
2452
2453 tuple->t_data = (HeapTupleHeader) PageGetItem(page, itemid);
2454 tuple->t_len = ItemIdGetLength(itemid);
2455 ItemPointerSet(&(tuple->t_self), blockno, tupoffset);
2456
2458 if (all_visible)
2459 visible = true;
2460 else
2461 visible = SampleHeapTupleVisible(scan, hscan->rs_cbuf,
2462 tuple, tupoffset);
2463
2464 /* in pagemode, heapgetpage did this for us */
2465 if (!pagemode)
2466 CheckForSerializableConflictOut(visible, scan->rs_rd, tuple,
2467 hscan->rs_cbuf, scan->rs_snapshot);
2468
2469 /* Try next tuple from same page. */
2470 if (!visible)
2471 continue;
2472
2473 /* Found visible tuple, return it. */
2474 if (!pagemode)
2475 LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
2476
2477 ExecStoreBufferHeapTuple(tuple, slot, hscan->rs_cbuf);
2478
2479 /* Count successfully-fetched tuples as heap fetches */
2480 pgstat_count_heap_getnext(scan->rs_rd);
2481
2482 return true;
2483 }
2484 else
2485 {
2486 /*
2487 * If we get here, it means we've exhausted the items on this page
2488 * and it's time to move to the next.
2489 */
2490 if (!pagemode)
2491 LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
2492
2493 ExecClearTuple(slot);
2494 return false;
2495 }
2496 }
2497
2498 Assert(0);
2499}
2500
2501
2502/* ----------------------------------------------------------------------------
2503 * Helper functions for the above.
2504 * ----------------------------------------------------------------------------
2505 */
2506
2507/*
2508 * Reconstruct and rewrite the given tuple
2509 *
2510 * We cannot simply copy the tuple as-is, for several reasons:
2511 *
2512 * 1. We'd like to squeeze out the values of any dropped columns, both
2513 * to save space and to ensure we have no corner-case failures. (It's
2514 * possible for example that the new table hasn't got a TOAST table
2515 * and so is unable to store any large values of dropped cols.)
2516 *
2517 * 2. The tuple might not even be legal for the new table; this is
2518 * currently only known to happen as an after-effect of ALTER TABLE
2519 * SET WITHOUT OIDS.
2520 *
2521 * So, we must reconstruct the tuple from component Datums.
2522 */
2523static void
2524reform_and_rewrite_tuple(HeapTuple tuple,
2525 Relation OldHeap, Relation NewHeap,
2526 Datum *values, bool *isnull, RewriteState rwstate)
2527{
2528 TupleDesc oldTupDesc = RelationGetDescr(OldHeap);
2529 TupleDesc newTupDesc = RelationGetDescr(NewHeap);
2530 HeapTuple copiedTuple;
2531 int i;
2532
2533 heap_deform_tuple(tuple, oldTupDesc, values, isnull);
2534
2535 /* Be sure to null out any dropped columns */
2536 for (i = 0; i < newTupDesc->natts; i++)
2537 {
2538 if (TupleDescAttr(newTupDesc, i)->attisdropped)
2539 isnull[i] = true;
2540 }
2541
2542 copiedTuple = heap_form_tuple(newTupDesc, values, isnull);
2543
2544 /* The heap rewrite module does the rest */
2545 rewrite_heap_tuple(rwstate, tuple, copiedTuple);
2546
2547 heap_freetuple(copiedTuple);
2548}
2549
2550/*
2551 * Check visibility of the tuple.
2552 */
2553static bool
2554SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer,
2555 HeapTuple tuple,
2556 OffsetNumber tupoffset)
2557{
2558 HeapScanDesc hscan = (HeapScanDesc) scan;
2559
2560 if (scan->rs_flags & SO_ALLOW_PAGEMODE)
2561 {
2562 /*
		 * In page-at-a-time mode, heapgetpage() already did visibility checks,
2564 * so just look at the info it left in rs_vistuples[].
2565 *
2566 * We use a binary search over the known-sorted array. Note: we could
2567 * save some effort if we insisted that NextSampleTuple select tuples
2568 * in increasing order, but it's not clear that there would be enough
2569 * gain to justify the restriction.
2570 */
2571 int start = 0,
2572 end = hscan->rs_ntuples - 1;
2573
2574 while (start <= end)
2575 {
2576 int mid = (start + end) / 2;
2577 OffsetNumber curoffset = hscan->rs_vistuples[mid];
2578
2579 if (tupoffset == curoffset)
2580 return true;
2581 else if (tupoffset < curoffset)
2582 end = mid - 1;
2583 else
2584 start = mid + 1;
2585 }
2586
2587 return false;
2588 }
2589 else
2590 {
2591 /* Otherwise, we have to check the tuple individually. */
2592 return HeapTupleSatisfiesVisibility(tuple, scan->rs_snapshot,
2593 buffer);
2594 }
2595}
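
/*
 * Illustration: if heapgetpage() left rs_vistuples[] = {1, 4, 7, 10} for
 * this page (rs_ntuples = 4), probing tupoffset = 7 homes in via the
 * midpoint entries 4 and then 7 and returns true, while tupoffset = 5
 * falls between entries and returns false once the search window empties.
 */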
2596
2597
2598/* ------------------------------------------------------------------------
2599 * Definition of the heap table access method.
2600 * ------------------------------------------------------------------------
2601 */
2602
2603static const TableAmRoutine heapam_methods = {
2604 .type = T_TableAmRoutine,
2605
2606 .slot_callbacks = heapam_slot_callbacks,
2607
2608 .scan_begin = heap_beginscan,
2609 .scan_end = heap_endscan,
2610 .scan_rescan = heap_rescan,
2611 .scan_getnextslot = heap_getnextslot,
2612
2613 .parallelscan_estimate = table_block_parallelscan_estimate,
2614 .parallelscan_initialize = table_block_parallelscan_initialize,
2615 .parallelscan_reinitialize = table_block_parallelscan_reinitialize,
2616
2617 .index_fetch_begin = heapam_index_fetch_begin,
2618 .index_fetch_reset = heapam_index_fetch_reset,
2619 .index_fetch_end = heapam_index_fetch_end,
2620 .index_fetch_tuple = heapam_index_fetch_tuple,
2621
2622 .tuple_insert = heapam_tuple_insert,
2623 .tuple_insert_speculative = heapam_tuple_insert_speculative,
2624 .tuple_complete_speculative = heapam_tuple_complete_speculative,
2625 .multi_insert = heap_multi_insert,
2626 .tuple_delete = heapam_tuple_delete,
2627 .tuple_update = heapam_tuple_update,
2628 .tuple_lock = heapam_tuple_lock,
2629 .finish_bulk_insert = heapam_finish_bulk_insert,
2630
2631 .tuple_fetch_row_version = heapam_fetch_row_version,
2632 .tuple_get_latest_tid = heap_get_latest_tid,
2633 .tuple_tid_valid = heapam_tuple_tid_valid,
2634 .tuple_satisfies_snapshot = heapam_tuple_satisfies_snapshot,
2635 .compute_xid_horizon_for_tuples = heap_compute_xid_horizon_for_tuples,
2636
2637 .relation_set_new_filenode = heapam_relation_set_new_filenode,
2638 .relation_nontransactional_truncate = heapam_relation_nontransactional_truncate,
2639 .relation_copy_data = heapam_relation_copy_data,
2640 .relation_copy_for_cluster = heapam_relation_copy_for_cluster,
2641 .relation_vacuum = heap_vacuum_rel,
2642 .scan_analyze_next_block = heapam_scan_analyze_next_block,
2643 .scan_analyze_next_tuple = heapam_scan_analyze_next_tuple,
2644 .index_build_range_scan = heapam_index_build_range_scan,
2645 .index_validate_scan = heapam_index_validate_scan,
2646
2647 .relation_size = heapam_relation_size,
2648 .relation_needs_toast_table = heapam_relation_needs_toast_table,
2649
2650 .relation_estimate_size = heapam_estimate_rel_size,
2651
2652 .scan_bitmap_next_block = heapam_scan_bitmap_next_block,
2653 .scan_bitmap_next_tuple = heapam_scan_bitmap_next_tuple,
2654 .scan_sample_next_block = heapam_scan_sample_next_block,
2655 .scan_sample_next_tuple = heapam_scan_sample_next_tuple
2656};
2657
2658
2659const TableAmRoutine *
2660GetHeapamTableAmRoutine(void)
2661{
2662 return &heapam_methods;
2663}
2664
2665Datum
2666heap_tableam_handler(PG_FUNCTION_ARGS)
2667{
2668 PG_RETURN_POINTER(&heapam_methods);
2669}
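
/*
 * heapam_methods is normally reached via pg_class.relam pointing at the
 * built-in "heap" table access method, but the handler can also be exposed
 * under another name, e.g. as done in the create_am regression test:
 *
 *		CREATE ACCESS METHOD heap2 TYPE TABLE HANDLER heap_tableam_handler;
 *		CREATE TABLE t (a int) USING heap2;
 */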
2670