1/*-------------------------------------------------------------------------
2 *
3 * tableam.h
4 * POSTGRES table access method definitions.
5 *
6 *
7 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
9 *
10 * src/include/access/tableam.h
11 *
12 * NOTES
13 * See tableam.sgml for higher level documentation.
14 *
15 *-------------------------------------------------------------------------
16 */
17#ifndef TABLEAM_H
18#define TABLEAM_H
19
20#include "access/relscan.h"
21#include "access/sdir.h"
22#include "utils/guc.h"
23#include "utils/rel.h"
24#include "utils/snapshot.h"
25
26
27#define DEFAULT_TABLE_ACCESS_METHOD "heap"
28
29/* GUCs */
30extern char *default_table_access_method;
31extern bool synchronize_seqscans;
32
33
34struct BulkInsertStateData;
35struct IndexInfo;
36struct SampleScanState;
37struct TBMIterateResult;
38struct VacuumParams;
39struct ValidateIndexState;
40
41/*
42 * Bitmask values for the flags argument to the scan_begin callback.
43 */
44typedef enum ScanOptions
45{
46 /* one of SO_TYPE_* may be specified */
47 SO_TYPE_SEQSCAN = 1 << 0,
48 SO_TYPE_BITMAPSCAN = 1 << 1,
49 SO_TYPE_SAMPLESCAN = 1 << 2,
50 SO_TYPE_ANALYZE = 1 << 3,
51
52 /* several of SO_ALLOW_* may be specified */
53 /* allow or disallow use of access strategy */
54 SO_ALLOW_STRAT = 1 << 4,
55 /* report location to syncscan logic? */
56 SO_ALLOW_SYNC = 1 << 5,
57 /* verify visibility page-at-a-time? */
58 SO_ALLOW_PAGEMODE = 1 << 6,
59
60 /* unregister snapshot at scan end? */
61 SO_TEMP_SNAPSHOT = 1 << 7
62} ScanOptions;
63
64/*
65 * Result codes for table_{update,delete,lock_tuple}, and for visibility
66 * routines inside table AMs.
67 */
68typedef enum TM_Result
69{
70 /*
71 * Signals that the action succeeded (i.e. update/delete performed, lock
72 * was acquired)
73 */
74 TM_Ok,
75
76 /* The affected tuple wasn't visible to the relevant snapshot */
77 TM_Invisible,
78
79 /* The affected tuple was already modified by the calling backend */
80 TM_SelfModified,
81
82 /*
	 * The affected tuple was updated by another transaction. This includes
	 * the case where the tuple was moved to another partition.
85 */
86 TM_Updated,
87
88 /* The affected tuple was deleted by another transaction */
89 TM_Deleted,
90
91 /*
92 * The affected tuple is currently being modified by another session. This
93 * will only be returned if table_(update/delete/lock_tuple) are
94 * instructed not to wait.
95 */
96 TM_BeingModified,
97
98 /* lock couldn't be acquired, action skipped. Only used by lock_tuple */
99 TM_WouldBlock
100} TM_Result;
101
102/*
103 * When table_tuple_update, table_tuple_delete, or table_tuple_lock fail
104 * because the target tuple is already outdated, they fill in this struct to
105 * provide information to the caller about what happened.
106 *
107 * ctid is the target's ctid link: it is the same as the target's TID if the
108 * target was deleted, or the location of the replacement tuple if the target
109 * was updated.
110 *
111 * xmax is the outdating transaction's XID. If the caller wants to visit the
112 * replacement tuple, it must check that this matches before believing the
113 * replacement is really a match.
114 *
115 * cmax is the outdating command's CID, but only when the failure code is
116 * TM_SelfModified (i.e., something in the current transaction outdated the
117 * tuple); otherwise cmax is zero. (We make this restriction because
118 * HeapTupleHeaderGetCmax doesn't work for tuples outdated in other
119 * transactions.)
120 */
121typedef struct TM_FailureData
122{
123 ItemPointerData ctid;
124 TransactionId xmax;
125 CommandId cmax;
126 bool traversed;
127} TM_FailureData;
128
129/* "options" flag bits for table_tuple_insert */
130#define TABLE_INSERT_SKIP_WAL 0x0001
131#define TABLE_INSERT_SKIP_FSM 0x0002
132#define TABLE_INSERT_FROZEN 0x0004
133#define TABLE_INSERT_NO_LOGICAL 0x0008
134
135/* flag bits for table_tuple_lock */
136/* Follow tuples whose update is in progress if lock modes don't conflict */
137#define TUPLE_LOCK_FLAG_LOCK_UPDATE_IN_PROGRESS (1 << 0)
138/* Follow update chain and lock latest version of tuple */
139#define TUPLE_LOCK_FLAG_FIND_LAST_VERSION (1 << 1)
140
141
142/* Typedef for callback function for table_index_build_scan */
143typedef void (*IndexBuildCallback) (Relation index,
144 HeapTuple htup,
145 Datum *values,
146 bool *isnull,
147 bool tupleIsAlive,
148 void *state);
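
/*
 * A minimal sketch of such a callback (hypothetical names, not part of this
 * API): the table AM invokes it once per tuple that should be entered into
 * the index being built, and the callback forms and inserts the index entry.
 *
 *	static void
 *	example_build_callback(Relation index, HeapTuple htup,
 *						   Datum *values, bool *isnull,
 *						   bool tupleIsAlive, void *state)
 *	{
 *		ExampleBuildState *buildstate = (ExampleBuildState *) state;
 *
 *		...form an index tuple from values/isnull and insert it...
 *		buildstate->indtuples += 1;
 *	}
 */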
149
150/*
151 * API struct for a table AM. Note this must be allocated in a
152 * server-lifetime manner, typically as a static const struct, which then gets
153 * returned by FormData_pg_am.amhandler.
154 *
155 * In most cases it's not appropriate to call the callbacks directly, use the
156 * table_* wrapper functions instead.
157 *
158 * GetTableAmRoutine() asserts that required callbacks are filled in, remember
159 * to update when adding a callback.
160 */
161typedef struct TableAmRoutine
162{
163 /* this must be set to T_TableAmRoutine */
164 NodeTag type;
165
166
167 /* ------------------------------------------------------------------------
168 * Slot related callbacks.
169 * ------------------------------------------------------------------------
170 */
171
172 /*
173 * Return slot implementation suitable for storing a tuple of this AM.
174 */
175 const TupleTableSlotOps *(*slot_callbacks) (Relation rel);
176
177
178 /* ------------------------------------------------------------------------
179 * Table scan callbacks.
180 * ------------------------------------------------------------------------
181 */
182
183 /*
184 * Start a scan of `rel`. The callback has to return a TableScanDesc,
185 * which will typically be embedded in a larger, AM specific, struct.
186 *
187 * If nkeys != 0, the results need to be filtered by those scan keys.
188 *
189 * pscan, if not NULL, will have already been initialized with
190 * parallelscan_initialize(), and has to be for the same relation. Will
191 * only be set coming from table_beginscan_parallel().
192 *
193 * `flags` is a bitmask indicating the type of scan (ScanOptions's
194 * SO_TYPE_*, currently only one may be specified), options controlling
195 * the scan's behaviour (ScanOptions's SO_ALLOW_*, several may be
196 * specified, an AM may ignore unsupported ones) and whether the snapshot
197 * needs to be deallocated at scan_end (ScanOptions's SO_TEMP_SNAPSHOT).
198 */
199 TableScanDesc (*scan_begin) (Relation rel,
200 Snapshot snapshot,
201 int nkeys, struct ScanKeyData *key,
202 ParallelTableScanDesc pscan,
203 uint32 flags);
204
205 /*
206 * Release resources and deallocate scan. If TableScanDesc.temp_snap,
207 * TableScanDesc.rs_snapshot needs to be unregistered.
208 */
209 void (*scan_end) (TableScanDesc scan);
210
211 /*
212 * Restart relation scan. If set_params is set to true, allow_{strat,
213 * sync, pagemode} (see scan_begin) changes should be taken into account.
214 */
215 void (*scan_rescan) (TableScanDesc scan, struct ScanKeyData *key,
216 bool set_params, bool allow_strat,
217 bool allow_sync, bool allow_pagemode);
218
219 /*
220 * Return next tuple from `scan`, store in slot.
221 */
222 bool (*scan_getnextslot) (TableScanDesc scan,
223 ScanDirection direction,
224 TupleTableSlot *slot);
225
226
227 /* ------------------------------------------------------------------------
228 * Parallel table scan related functions.
229 * ------------------------------------------------------------------------
230 */
231
232 /*
233 * Estimate the size of shared memory needed for a parallel scan of this
234 * relation. The snapshot does not need to be accounted for.
235 */
236 Size (*parallelscan_estimate) (Relation rel);
237
238 /*
239 * Initialize ParallelTableScanDesc for a parallel scan of this relation.
240 * `pscan` will be sized according to parallelscan_estimate() for the same
241 * relation.
242 */
243 Size (*parallelscan_initialize) (Relation rel,
244 ParallelTableScanDesc pscan);
245
246 /*
247 * Reinitialize `pscan` for a new scan. `rel` will be the same relation as
248 * when `pscan` was initialized by parallelscan_initialize.
249 */
250 void (*parallelscan_reinitialize) (Relation rel,
251 ParallelTableScanDesc pscan);
252
253
254 /* ------------------------------------------------------------------------
255 * Index Scan Callbacks
256 * ------------------------------------------------------------------------
257 */
258
259 /*
260 * Prepare to fetch tuples from the relation, as needed when fetching
261 * tuples for an index scan. The callback has to return an
262 * IndexFetchTableData, which the AM will typically embed in a larger
263 * structure with additional information.
264 *
265 * Tuples for an index scan can then be fetched via index_fetch_tuple.
266 */
267 struct IndexFetchTableData *(*index_fetch_begin) (Relation rel);
268
269 /*
270 * Reset index fetch. Typically this will release cross index fetch
271 * resources held in IndexFetchTableData.
272 */
273 void (*index_fetch_reset) (struct IndexFetchTableData *data);
274
275 /*
276 * Release resources and deallocate index fetch.
277 */
278 void (*index_fetch_end) (struct IndexFetchTableData *data);
279
280 /*
281 * Fetch tuple at `tid` into `slot`, after doing a visibility test
282 * according to `snapshot`. If a tuple was found and passed the visibility
283 * test, return true, false otherwise.
284 *
	 * Note that AMs that do not necessarily update indexes when indexed
	 * columns do not change need to return the current/correct version of
287 * the tuple that is visible to the snapshot, even if the tid points to an
288 * older version of the tuple.
289 *
290 * *call_again is false on the first call to index_fetch_tuple for a tid.
291 * If there potentially is another tuple matching the tid, *call_again
292 * needs be set to true by index_fetch_tuple, signalling to the caller
293 * that index_fetch_tuple should be called again for the same tid.
294 *
295 * *all_dead, if all_dead is not NULL, should be set to true by
296 * index_fetch_tuple iff it is guaranteed that no backend needs to see
297 * that tuple. Index AMs can use that to avoid returning that tid in
298 * future searches.
299 */
300 bool (*index_fetch_tuple) (struct IndexFetchTableData *scan,
301 ItemPointer tid,
302 Snapshot snapshot,
303 TupleTableSlot *slot,
304 bool *call_again, bool *all_dead);
305
306
307 /* ------------------------------------------------------------------------
308 * Callbacks for non-modifying operations on individual tuples
309 * ------------------------------------------------------------------------
310 */
311
312 /*
313 * Fetch tuple at `tid` into `slot`, after doing a visibility test
314 * according to `snapshot`. If a tuple was found and passed the visibility
315 * test, returns true, false otherwise.
316 */
317 bool (*tuple_fetch_row_version) (Relation rel,
318 ItemPointer tid,
319 Snapshot snapshot,
320 TupleTableSlot *slot);
321
322 /*
323 * Is tid valid for a scan of this relation.
324 */
325 bool (*tuple_tid_valid) (TableScanDesc scan,
326 ItemPointer tid);
327
328 /*
329 * Return the latest version of the tuple at `tid`, by updating `tid` to
330 * point at the newest version.
331 */
332 void (*tuple_get_latest_tid) (TableScanDesc scan,
333 ItemPointer tid);
334
335 /*
336 * Does the tuple in `slot` satisfy `snapshot`? The slot needs to be of
337 * the appropriate type for the AM.
338 */
339 bool (*tuple_satisfies_snapshot) (Relation rel,
340 TupleTableSlot *slot,
341 Snapshot snapshot);
342
343 /* see table_compute_xid_horizon_for_tuples() */
344 TransactionId (*compute_xid_horizon_for_tuples) (Relation rel,
345 ItemPointerData *items,
346 int nitems);
347
348
349 /* ------------------------------------------------------------------------
350 * Manipulations of physical tuples.
351 * ------------------------------------------------------------------------
352 */
353
354 /* see table_tuple_insert() for reference about parameters */
355 void (*tuple_insert) (Relation rel, TupleTableSlot *slot,
356 CommandId cid, int options,
357 struct BulkInsertStateData *bistate);
358
359 /* see table_tuple_insert_speculative() for reference about parameters */
360 void (*tuple_insert_speculative) (Relation rel,
361 TupleTableSlot *slot,
362 CommandId cid,
363 int options,
364 struct BulkInsertStateData *bistate,
365 uint32 specToken);
366
367 /* see table_tuple_complete_speculative() for reference about parameters */
368 void (*tuple_complete_speculative) (Relation rel,
369 TupleTableSlot *slot,
370 uint32 specToken,
371 bool succeeded);
372
373 /* see table_multi_insert() for reference about parameters */
374 void (*multi_insert) (Relation rel, TupleTableSlot **slots, int nslots,
375 CommandId cid, int options, struct BulkInsertStateData *bistate);
376
377 /* see table_tuple_delete() for reference about parameters */
378 TM_Result (*tuple_delete) (Relation rel,
379 ItemPointer tid,
380 CommandId cid,
381 Snapshot snapshot,
382 Snapshot crosscheck,
383 bool wait,
384 TM_FailureData *tmfd,
385 bool changingPart);
386
387 /* see table_tuple_update() for reference about parameters */
388 TM_Result (*tuple_update) (Relation rel,
389 ItemPointer otid,
390 TupleTableSlot *slot,
391 CommandId cid,
392 Snapshot snapshot,
393 Snapshot crosscheck,
394 bool wait,
395 TM_FailureData *tmfd,
396 LockTupleMode *lockmode,
397 bool *update_indexes);
398
399 /* see table_tuple_lock() for reference about parameters */
400 TM_Result (*tuple_lock) (Relation rel,
401 ItemPointer tid,
402 Snapshot snapshot,
403 TupleTableSlot *slot,
404 CommandId cid,
405 LockTupleMode mode,
406 LockWaitPolicy wait_policy,
407 uint8 flags,
408 TM_FailureData *tmfd);
409
410 /*
411 * Perform operations necessary to complete insertions made via
412 * tuple_insert and multi_insert with a BulkInsertState specified. This
413 * may for example be used to flush the relation, when the
414 * TABLE_INSERT_SKIP_WAL option was used.
415 *
416 * Typically callers of tuple_insert and multi_insert will just pass all
417 * the flags that apply to them, and each AM has to decide which of them
418 * make sense for it, and then only take actions in finish_bulk_insert for
419 * those flags, and ignore others.
420 *
421 * Optional callback.
422 */
423 void (*finish_bulk_insert) (Relation rel, int options);
424
425
426 /* ------------------------------------------------------------------------
427 * DDL related functionality.
428 * ------------------------------------------------------------------------
429 */
430
431 /*
432 * This callback needs to create a new relation filenode for `rel`, with
433 * appropriate durability behaviour for `persistence`.
434 *
435 * Note that only the subset of the relcache filled by
436 * RelationBuildLocalRelation() can be relied upon and that the relation's
437 * catalog entries will either not yet exist (new relation), or will still
438 * reference the old relfilenode.
439 *
440 * As output *freezeXid, *minmulti must be set to the values appropriate
441 * for pg_class.{relfrozenxid, relminmxid}. For AMs that don't need those
442 * fields to be filled they can be set to InvalidTransactionId and
443 * InvalidMultiXactId, respectively.
444 *
445 * See also table_relation_set_new_filenode().
446 */
447 void (*relation_set_new_filenode) (Relation rel,
448 const RelFileNode *newrnode,
449 char persistence,
450 TransactionId *freezeXid,
451 MultiXactId *minmulti);
452
453 /*
454 * This callback needs to remove all contents from `rel`'s current
455 * relfilenode. No provisions for transactional behaviour need to be made.
456 * Often this can be implemented by truncating the underlying storage to
457 * its minimal size.
458 *
459 * See also table_relation_nontransactional_truncate().
460 */
461 void (*relation_nontransactional_truncate) (Relation rel);
462
463 /*
464 * See table_relation_copy_data().
465 *
466 * This can typically be implemented by directly copying the underlying
467 * storage, unless it contains references to the tablespace internally.
468 */
469 void (*relation_copy_data) (Relation rel,
470 const RelFileNode *newrnode);
471
472 /* See table_relation_copy_for_cluster() */
473 void (*relation_copy_for_cluster) (Relation NewTable,
474 Relation OldTable,
475 Relation OldIndex,
476 bool use_sort,
477 TransactionId OldestXmin,
478 TransactionId *xid_cutoff,
479 MultiXactId *multi_cutoff,
480 double *num_tuples,
481 double *tups_vacuumed,
482 double *tups_recently_dead);
483
484 /*
485 * React to VACUUM command on the relation. The VACUUM can be
486 * triggered by a user or by autovacuum. The specific actions
487 * performed by the AM will depend heavily on the individual AM.
488 *
489 * On entry a transaction is already established, and the relation is
490 * locked with a ShareUpdateExclusive lock.
491 *
492 * Note that neither VACUUM FULL (and CLUSTER), nor ANALYZE go through
493 * this routine, even if (for ANALYZE) it is part of the same VACUUM
494 * command.
495 *
496 * There probably, in the future, needs to be a separate callback to
497 * integrate with autovacuum's scheduling.
498 */
499 void (*relation_vacuum) (Relation onerel,
500 struct VacuumParams *params,
501 BufferAccessStrategy bstrategy);
502
503 /*
504 * Prepare to analyze block `blockno` of `scan`. The scan has been started
505 * with table_beginscan_analyze(). See also
506 * table_scan_analyze_next_block().
507 *
508 * The callback may acquire resources like locks that are held until
	 * table_scan_analyze_next_tuple() returns false. For example, it can make
	 * sense to hold a lock until all tuples on a block have been analyzed by
	 * scan_analyze_next_tuple.
512 *
513 * The callback can return false if the block is not suitable for
514 * sampling, e.g. because it's a metapage that could never contain tuples.
515 *
516 * XXX: This obviously is primarily suited for block-based AMs. It's not
517 * clear what a good interface for non block based AMs would be, so there
518 * isn't one yet.
519 */
520 bool (*scan_analyze_next_block) (TableScanDesc scan,
521 BlockNumber blockno,
522 BufferAccessStrategy bstrategy);
523
524 /*
525 * See table_scan_analyze_next_tuple().
526 *
527 * Not every AM might have a meaningful concept of dead rows, in which
528 * case it's OK to not increment *deadrows - but note that that may
529 * influence autovacuum scheduling (see comment for relation_vacuum
530 * callback).
531 */
532 bool (*scan_analyze_next_tuple) (TableScanDesc scan,
533 TransactionId OldestXmin,
534 double *liverows,
535 double *deadrows,
536 TupleTableSlot *slot);
537
538 /* see table_index_build_range_scan for reference about parameters */
539 double (*index_build_range_scan) (Relation table_rel,
540 Relation index_rel,
541 struct IndexInfo *index_info,
542 bool allow_sync,
543 bool anyvisible,
544 bool progress,
545 BlockNumber start_blockno,
546 BlockNumber numblocks,
547 IndexBuildCallback callback,
548 void *callback_state,
549 TableScanDesc scan);
550
551 /* see table_index_validate_scan for reference about parameters */
552 void (*index_validate_scan) (Relation table_rel,
553 Relation index_rel,
554 struct IndexInfo *index_info,
555 Snapshot snapshot,
556 struct ValidateIndexState *state);
557
558
559 /* ------------------------------------------------------------------------
560 * Miscellaneous functions.
561 * ------------------------------------------------------------------------
562 */
563
564 /*
565 * See table_relation_size().
566 *
567 * Note that currently a few callers use the MAIN_FORKNUM size to figure
568 * out the range of potentially interesting blocks (brin, analyze). It's
569 * probable that we'll need to revise the interface for those at some
570 * point.
571 */
572 uint64 (*relation_size) (Relation rel, ForkNumber forkNumber);
573
574
575 /*
576 * This callback should return true if the relation requires a TOAST table
577 * and false if it does not. It may wish to examine the relation's tuple
578 * descriptor before making a decision, but if it uses some other method
579 * of storing large values (or if it does not support them) it can simply
580 * return false.
581 */
582 bool (*relation_needs_toast_table) (Relation rel);
583
584
585 /* ------------------------------------------------------------------------
586 * Planner related functions.
587 * ------------------------------------------------------------------------
588 */
589
590 /*
591 * See table_relation_estimate_size().
592 *
593 * While block oriented, it shouldn't be too hard for an AM that doesn't
594 * internally use blocks to convert into a usable representation.
595 *
596 * This differs from the relation_size callback by returning size
597 * estimates (both relation size and tuple count) for planning purposes,
598 * rather than returning a currently correct estimate.
599 */
600 void (*relation_estimate_size) (Relation rel, int32 *attr_widths,
601 BlockNumber *pages, double *tuples,
602 double *allvisfrac);
603
604
605 /* ------------------------------------------------------------------------
606 * Executor related functions.
607 * ------------------------------------------------------------------------
608 */
609
610 /*
611 * Prepare to fetch / check / return tuples from `tbmres->blockno` as part
612 * of a bitmap table scan. `scan` was started via table_beginscan_bm().
613 * Return false if there are no tuples to be found on the page, true
614 * otherwise.
615 *
616 * This will typically read and pin the target block, and do the necessary
617 * work to allow scan_bitmap_next_tuple() to return tuples (e.g. it might
618 * make sense to perform tuple visibility checks at this time). For some
619 * AMs it will make more sense to do all the work referencing `tbmres`
620 * contents here, for others it might be better to defer more work to
621 * scan_bitmap_next_tuple.
622 *
	 * If `tbmres->ntuples` is -1, the bitmap is lossy for this page and all
	 * visible tuples on the page have to be returned, otherwise only the
	 * tuples at the offsets in `tbmres->offsets` need to be returned.
626 *
627 * XXX: Currently this may only be implemented if the AM uses md.c as its
628 * storage manager, and uses ItemPointer->ip_blkid in a manner that maps
629 * blockids directly to the underlying storage. nodeBitmapHeapscan.c
630 * performs prefetching directly using that interface. This probably
631 * needs to be rectified at a later point.
632 *
633 * XXX: Currently this may only be implemented if the AM uses the
634 * visibilitymap, as nodeBitmapHeapscan.c unconditionally accesses it to
635 * perform prefetching. This probably needs to be rectified at a later
636 * point.
637 *
638 * Optional callback, but either both scan_bitmap_next_block and
639 * scan_bitmap_next_tuple need to exist, or neither.
640 */
641 bool (*scan_bitmap_next_block) (TableScanDesc scan,
642 struct TBMIterateResult *tbmres);
643
644 /*
645 * Fetch the next tuple of a bitmap table scan into `slot` and return true
646 * if a visible tuple was found, false otherwise.
647 *
648 * For some AMs it will make more sense to do all the work referencing
649 * `tbmres` contents in scan_bitmap_next_block, for others it might be
650 * better to defer more work to this callback.
651 *
652 * Optional callback, but either both scan_bitmap_next_block and
653 * scan_bitmap_next_tuple need to exist, or neither.
654 */
655 bool (*scan_bitmap_next_tuple) (TableScanDesc scan,
656 struct TBMIterateResult *tbmres,
657 TupleTableSlot *slot);
658
659 /*
660 * Prepare to fetch tuples from the next block in a sample scan. Return
661 * false if the sample scan is finished, true otherwise. `scan` was
662 * started via table_beginscan_sampling().
663 *
664 * Typically this will first determine the target block by calling the
665 * TsmRoutine's NextSampleBlock() callback if not NULL, or alternatively
666 * perform a sequential scan over all blocks. The determined block is
667 * then typically read and pinned.
668 *
669 * As the TsmRoutine interface is block based, a block needs to be passed
670 * to NextSampleBlock(). If that's not appropriate for an AM, it
671 * internally needs to perform mapping between the internal and a block
672 * based representation.
673 *
674 * Note that it's not acceptable to hold deadlock prone resources such as
675 * lwlocks until scan_sample_next_tuple() has exhausted the tuples on the
676 * block - the tuple is likely to be returned to an upper query node, and
677 * the next call could be off a long while. Holding buffer pins and such
678 * is obviously OK.
679 *
680 * Currently it is required to implement this interface, as there's no
681 * alternative way (contrary e.g. to bitmap scans) to implement sample
682 * scans. If infeasible to implement, the AM may raise an error.
683 */
684 bool (*scan_sample_next_block) (TableScanDesc scan,
685 struct SampleScanState *scanstate);
686
687 /*
688 * This callback, only called after scan_sample_next_block has returned
689 * true, should determine the next tuple to be returned from the selected
690 * block using the TsmRoutine's NextSampleTuple() callback.
691 *
692 * The callback needs to perform visibility checks, and only return
693 * visible tuples. That obviously can mean calling NextSampleTuple()
694 * multiple times.
695 *
696 * The TsmRoutine interface assumes that there's a maximum offset on a
697 * given page, so if that doesn't apply to an AM, it needs to emulate that
698 * assumption somehow.
699 */
700 bool (*scan_sample_next_tuple) (TableScanDesc scan,
701 struct SampleScanState *scanstate,
702 TupleTableSlot *slot);
703
704} TableAmRoutine;
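
/*
 * As a sketch of how an AM exposes this struct (hypothetical names; the heap
 * AM does the equivalent in heapam_handler.c): the struct is defined as a
 * server-lifetime constant and returned from the handler function recorded
 * in pg_am.amhandler.
 *
 *	static const TableAmRoutine example_methods = {
 *		.type = T_TableAmRoutine,
 *		.slot_callbacks = example_slot_callbacks,
 *		.scan_begin = example_scan_begin,
 *		...all other required callbacks...
 *	};
 *
 *	Datum
 *	example_tableam_handler(PG_FUNCTION_ARGS)
 *	{
 *		PG_RETURN_POINTER(&example_methods);
 *	}
 */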
705
706
707/* ----------------------------------------------------------------------------
708 * Slot functions.
709 * ----------------------------------------------------------------------------
710 */
711
712/*
713 * Returns slot callbacks suitable for holding tuples of the appropriate type
714 * for the relation. Works for tables, views, foreign tables and partitioned
715 * tables.
716 */
717extern const TupleTableSlotOps *table_slot_callbacks(Relation rel);
718
719/*
720 * Returns slot using the callbacks returned by table_slot_callbacks(), and
721 * registers it on *reglist.
722 */
723extern TupleTableSlot *table_slot_create(Relation rel, List **reglist);
724
725
726/* ----------------------------------------------------------------------------
727 * Table scan functions.
728 * ----------------------------------------------------------------------------
729 */
730
731/*
732 * Start a scan of `rel`. Returned tuples pass a visibility test of
733 * `snapshot`, and if nkeys != 0, the results are filtered by those scan keys.
734 */
735static inline TableScanDesc
736table_beginscan(Relation rel, Snapshot snapshot,
737 int nkeys, struct ScanKeyData *key)
738{
739 uint32 flags = SO_TYPE_SEQSCAN |
740 SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE;
741
742 return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, flags);
743}
744
745/*
746 * Like table_beginscan(), but for scanning catalog. It'll automatically use a
747 * snapshot appropriate for scanning catalog relations.
748 */
749extern TableScanDesc table_beginscan_catalog(Relation rel, int nkeys,
750 struct ScanKeyData *key);
751
752/*
753 * Like table_beginscan(), but table_beginscan_strat() offers an extended API
754 * that lets the caller control whether a nondefault buffer access strategy
755 * can be used, and whether syncscan can be chosen (possibly resulting in the
756 * scan not starting from block zero). Both of these default to true with
757 * plain table_beginscan.
758 */
759static inline TableScanDesc
760table_beginscan_strat(Relation rel, Snapshot snapshot,
761 int nkeys, struct ScanKeyData *key,
762 bool allow_strat, bool allow_sync)
763{
764 uint32 flags = SO_TYPE_SEQSCAN | SO_ALLOW_PAGEMODE;
765
766 if (allow_strat)
767 flags |= SO_ALLOW_STRAT;
768 if (allow_sync)
769 flags |= SO_ALLOW_SYNC;
770
771 return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, flags);
772}
773
774/*
775 * table_beginscan_bm is an alternative entry point for setting up a
776 * TableScanDesc for a bitmap heap scan. Although that scan technology is
777 * really quite unlike a standard seqscan, there is just enough commonality to
778 * make it worth using the same data structure.
779 */
780static inline TableScanDesc
781table_beginscan_bm(Relation rel, Snapshot snapshot,
782 int nkeys, struct ScanKeyData *key)
783{
784 uint32 flags = SO_TYPE_BITMAPSCAN | SO_ALLOW_PAGEMODE;
785
786 return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, flags);
787}
788
789/*
790 * table_beginscan_sampling is an alternative entry point for setting up a
791 * TableScanDesc for a TABLESAMPLE scan. As with bitmap scans, it's worth
792 * using the same data structure although the behavior is rather different.
793 * In addition to the options offered by table_beginscan_strat, this call
794 * also allows control of whether page-mode visibility checking is used.
795 */
796static inline TableScanDesc
797table_beginscan_sampling(Relation rel, Snapshot snapshot,
798 int nkeys, struct ScanKeyData *key,
799 bool allow_strat, bool allow_sync,
800 bool allow_pagemode)
801{
802 uint32 flags = SO_TYPE_SAMPLESCAN;
803
804 if (allow_strat)
805 flags |= SO_ALLOW_STRAT;
806 if (allow_sync)
807 flags |= SO_ALLOW_SYNC;
808 if (allow_pagemode)
809 flags |= SO_ALLOW_PAGEMODE;
810
811 return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, flags);
812}
813
814/*
815 * table_beginscan_analyze is an alternative entry point for setting up a
816 * TableScanDesc for an ANALYZE scan. As with bitmap scans, it's worth using
817 * the same data structure although the behavior is rather different.
818 */
819static inline TableScanDesc
820table_beginscan_analyze(Relation rel)
821{
822 uint32 flags = SO_TYPE_ANALYZE;
823
824 return rel->rd_tableam->scan_begin(rel, NULL, 0, NULL, NULL, flags);
825}
826
827/*
828 * End relation scan.
829 */
830static inline void
831table_endscan(TableScanDesc scan)
832{
833 scan->rs_rd->rd_tableam->scan_end(scan);
834}
835
836/*
837 * Restart a relation scan.
838 */
839static inline void
840table_rescan(TableScanDesc scan,
841 struct ScanKeyData *key)
842{
843 scan->rs_rd->rd_tableam->scan_rescan(scan, key, false, false, false, false);
844}
845
846/*
847 * Restart a relation scan after changing params.
848 *
849 * This call allows changing the buffer strategy, syncscan, and pagemode
850 * options before starting a fresh scan. Note that although the actual use of
851 * syncscan might change (effectively, enabling or disabling reporting), the
852 * previously selected startblock will be kept.
853 */
854static inline void
855table_rescan_set_params(TableScanDesc scan, struct ScanKeyData *key,
856 bool allow_strat, bool allow_sync, bool allow_pagemode)
857{
858 scan->rs_rd->rd_tableam->scan_rescan(scan, key, true,
859 allow_strat, allow_sync,
860 allow_pagemode);
861}
862
863/*
864 * Update snapshot used by the scan.
865 */
866extern void table_scan_update_snapshot(TableScanDesc scan, Snapshot snapshot);
867
868/*
869 * Return next tuple from `scan`, store in slot.
870 */
871static inline bool
872table_scan_getnextslot(TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot)
873{
874 slot->tts_tableOid = RelationGetRelid(sscan->rs_rd);
875 return sscan->rs_rd->rd_tableam->scan_getnextslot(sscan, direction, slot);
876}
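
/*
 * Taken together, a plain sequential scan is typically driven like this
 * (sketch with hypothetical variable names; the slot would come from
 * table_slot_create()):
 *
 *	TableScanDesc scan = table_beginscan(rel, snapshot, 0, NULL);
 *
 *	while (table_scan_getnextslot(scan, ForwardScanDirection, slot))
 *	{
 *		...process the tuple stored in slot...
 *	}
 *
 *	table_endscan(scan);
 */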
877
878
879/* ----------------------------------------------------------------------------
880 * Parallel table scan related functions.
881 * ----------------------------------------------------------------------------
882 */
883
884/*
885 * Estimate the size of shared memory needed for a parallel scan of this
886 * relation.
887 */
888extern Size table_parallelscan_estimate(Relation rel, Snapshot snapshot);
889
890/*
891 * Initialize ParallelTableScanDesc for a parallel scan of this
892 * relation. `pscan` needs to be sized according to parallelscan_estimate()
893 * for the same relation. Call this just once in the leader process; then,
894 * individual workers attach via table_beginscan_parallel.
895 */
896extern void table_parallelscan_initialize(Relation rel,
897 ParallelTableScanDesc pscan,
898 Snapshot snapshot);
899
900/*
901 * Begin a parallel scan. `pscan` needs to have been initialized with
902 * table_parallelscan_initialize(), for the same relation. The initialization
903 * does not need to have happened in this backend.
904 *
905 * Caller must hold a suitable lock on the relation.
906 */
907extern TableScanDesc table_beginscan_parallel(Relation rel,
908 ParallelTableScanDesc pscan);
909
910/*
911 * Restart a parallel scan. Call this in the leader process. Caller is
912 * responsible for making sure that all workers have finished the scan
913 * beforehand.
914 */
915static inline void
916table_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan)
917{
918 rel->rd_tableam->parallelscan_reinitialize(rel, pscan);
919}
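
/*
 * A sketch of the usual parallel scan sequence (hypothetical names; the
 * leader places `pscan` in dynamic shared memory, sized using
 * table_parallelscan_estimate()):
 *
 *	In the leader:
 *		Size		sz = table_parallelscan_estimate(rel, snapshot);
 *
 *		...allocate pscan of size sz in shared memory...
 *		table_parallelscan_initialize(rel, pscan, snapshot);
 *
 *	In each worker (and usually the leader as well):
 *		TableScanDesc scan = table_beginscan_parallel(rel, pscan);
 *
 *		...scan as usual, then table_endscan(scan)...
 */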
920
921
922/* ----------------------------------------------------------------------------
923 * Index scan related functions.
924 * ----------------------------------------------------------------------------
925 */
926
927/*
928 * Prepare to fetch tuples from the relation, as needed when fetching tuples
929 * for an index scan.
930 *
931 * Tuples for an index scan can then be fetched via table_index_fetch_tuple().
932 */
933static inline IndexFetchTableData *
934table_index_fetch_begin(Relation rel)
935{
936 return rel->rd_tableam->index_fetch_begin(rel);
937}
938
939/*
940 * Reset index fetch. Typically this will release cross index fetch resources
941 * held in IndexFetchTableData.
942 */
943static inline void
944table_index_fetch_reset(struct IndexFetchTableData *scan)
945{
946 scan->rel->rd_tableam->index_fetch_reset(scan);
947}
948
949/*
950 * Release resources and deallocate index fetch.
951 */
952static inline void
953table_index_fetch_end(struct IndexFetchTableData *scan)
954{
955 scan->rel->rd_tableam->index_fetch_end(scan);
956}
957
958/*
959 * Fetches, as part of an index scan, tuple at `tid` into `slot`, after doing
960 * a visibility test according to `snapshot`. If a tuple was found and passed
961 * the visibility test, returns true, false otherwise.
962 *
963 * *call_again needs to be false on the first call to table_index_fetch_tuple() for
964 * a tid. If there potentially is another tuple matching the tid, *call_again
965 * will be set to true, signalling that table_index_fetch_tuple() should be called
966 * again for the same tid.
967 *
968 * *all_dead, if all_dead is not NULL, will be set to true by
969 * table_index_fetch_tuple() iff it is guaranteed that no backend needs to see
970 * that tuple. Index AMs can use that to avoid returning that tid in future
971 * searches.
972 *
 * The difference between this function and table_tuple_fetch_row_version()
 * is that this function returns the currently visible version of a row if
 * the AM supports storing multiple row versions reachable via a single index
 * entry (like heap's HOT), whereas table_tuple_fetch_row_version() only
 * evaluates the tuple exactly at `tid`. Outside of index-entry-to-table-tuple
 * lookups, table_tuple_fetch_row_version() is what's usually needed.
979 */
980static inline bool
981table_index_fetch_tuple(struct IndexFetchTableData *scan,
982 ItemPointer tid,
983 Snapshot snapshot,
984 TupleTableSlot *slot,
985 bool *call_again, bool *all_dead)
986{
988 return scan->rel->rd_tableam->index_fetch_tuple(scan, tid, snapshot,
989 slot, call_again,
990 all_dead);
991}
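
/*
 * A sketch of how an index scan consumer drives the above (hypothetical
 * names): repeated calls for the same tid are made as long as *call_again
 * remains true, e.g. to walk a HOT chain.
 *
 *	IndexFetchTableData *fetch = table_index_fetch_begin(rel);
 *	bool		call_again = false;
 *	bool		all_dead = false;
 *
 *	do
 *	{
 *		if (table_index_fetch_tuple(fetch, tid, snapshot, slot,
 *									&call_again, &all_dead))
 *		{
 *			...found a visible tuple in slot...
 *			break;
 *		}
 *	} while (call_again);
 *
 *	table_index_fetch_end(fetch);
 */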
992
993/*
994 * This is a convenience wrapper around table_index_fetch_tuple() which
995 * returns whether there are table tuple items corresponding to an index
996 * entry. This likely is only useful to verify if there's a conflict in a
997 * unique index.
998 */
999extern bool table_index_fetch_tuple_check(Relation rel,
1000 ItemPointer tid,
1001 Snapshot snapshot,
1002 bool *all_dead);
1003
1004
1005/* ------------------------------------------------------------------------
1006 * Functions for non-modifying operations on individual tuples
1007 * ------------------------------------------------------------------------
1008 */
1009
1010
1011/*
1012 * Fetch tuple at `tid` into `slot`, after doing a visibility test according to
1013 * `snapshot`. If a tuple was found and passed the visibility test, returns
1014 * true, false otherwise.
1015 *
1016 * See table_index_fetch_tuple's comment about what the difference between
1017 * these functions is. It is correct to use this function outside of index
1018 * entry->table tuple lookups.
1019 */
1020static inline bool
1021table_tuple_fetch_row_version(Relation rel,
1022 ItemPointer tid,
1023 Snapshot snapshot,
1024 TupleTableSlot *slot)
1025{
1026 return rel->rd_tableam->tuple_fetch_row_version(rel, tid, snapshot, slot);
1027}
1028
1029/*
 * Verify that `tid` is a potentially valid tuple identifier. That doesn't
 * mean that the pointed-to row needs to exist or be visible, but that
 * attempting to fetch the row (e.g. with table_tuple_get_latest_tid() or
 * table_tuple_fetch_row_version()) should not error out if called with that
 * tid.
1034 *
1035 * `scan` needs to have been started via table_beginscan().
1036 */
1037static inline bool
1038table_tuple_tid_valid(TableScanDesc scan, ItemPointer tid)
1039{
1040 return scan->rs_rd->rd_tableam->tuple_tid_valid(scan, tid);
1041}
1042
1043/*
1044 * Return the latest version of the tuple at `tid`, by updating `tid` to
1045 * point at the newest version.
1046 */
1047extern void table_tuple_get_latest_tid(TableScanDesc scan, ItemPointer tid);
1048
1049/*
1050 * Return true iff tuple in slot satisfies the snapshot.
1051 *
1052 * This assumes the slot's tuple is valid, and of the appropriate type for the
1053 * AM.
1054 *
1055 * Some AMs might modify the data underlying the tuple as a side-effect. If so
1056 * they ought to mark the relevant buffer dirty.
1057 */
1058static inline bool
1059table_tuple_satisfies_snapshot(Relation rel, TupleTableSlot *slot,
1060 Snapshot snapshot)
1061{
1062 return rel->rd_tableam->tuple_satisfies_snapshot(rel, slot, snapshot);
1063}
1064
1065/*
1066 * Compute the newest xid among the tuples pointed to by items. This is used
1067 * to compute what snapshots to conflict with when replaying WAL records for
1068 * page-level index vacuums.
1069 */
1070static inline TransactionId
1071table_compute_xid_horizon_for_tuples(Relation rel,
1072 ItemPointerData *items,
1073 int nitems)
1074{
1075 return rel->rd_tableam->compute_xid_horizon_for_tuples(rel, items, nitems);
1076}
1077
1078
1079/* ----------------------------------------------------------------------------
1080 * Functions for manipulations of physical tuples.
1081 * ----------------------------------------------------------------------------
1082 */
1083
1084/*
1085 * Insert a tuple from a slot into table AM routine.
1086 *
1087 * The options bitmask allows the caller to specify options that may change the
1088 * behaviour of the AM. The AM will ignore options that it does not support.
1089 *
1090 * If the TABLE_INSERT_SKIP_WAL option is specified, the new tuple doesn't
 * need to be logged to WAL, even for a non-temp relation. It is the AM's
 * choice whether this optimization is supported.
1093 *
1094 * If the TABLE_INSERT_SKIP_FSM option is specified, AMs are free to not reuse
1095 * free space in the relation. This can save some cycles when we know the
1096 * relation is new and doesn't contain useful amounts of free space.
1097 * TABLE_INSERT_SKIP_FSM is commonly passed directly to
1098 * RelationGetBufferForTuple. See that method for more information.
1099 *
1100 * TABLE_INSERT_FROZEN should only be specified for inserts into
1101 * relfilenodes created during the current subtransaction and when
1102 * there are no prior snapshots or pre-existing portals open.
1103 * This causes rows to be frozen, which is an MVCC violation and
1104 * requires explicit options chosen by user.
1105 *
1106 * TABLE_INSERT_NO_LOGICAL force-disables the emitting of logical decoding
1107 * information for the tuple. This should solely be used during table rewrites
1108 * where RelationIsLogicallyLogged(relation) is not yet accurate for the new
1109 * relation.
1110 *
1111 * Note that most of these options will be applied when inserting into the
1112 * heap's TOAST table, too, if the tuple requires any out-of-line data.
1113 *
1114 * The BulkInsertState object (if any; bistate can be NULL for default
1115 * behavior) is also just passed through to RelationGetBufferForTuple. If
1116 * `bistate` is provided, table_finish_bulk_insert() needs to be called.
1117 *
1118 * On return the slot's tts_tid and tts_tableOid are updated to reflect the
 * insertion. But note that any toasting of fields within the slot is NOT
 * reflected in the slot's contents.
1121 */
1122static inline void
1123table_tuple_insert(Relation rel, TupleTableSlot *slot, CommandId cid,
1124 int options, struct BulkInsertStateData *bistate)
1125{
1126 rel->rd_tableam->tuple_insert(rel, slot, cid, options,
1127 bistate);
1128}
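
/*
 * For example, a bulk load into a newly created relation might look roughly
 * like this (sketch with hypothetical names; GetBulkInsertState() and
 * FreeBulkInsertState() are the heap AM's helpers):
 *
 *	int			options = TABLE_INSERT_SKIP_FSM;
 *	BulkInsertState bistate = GetBulkInsertState();
 *
 *	table_tuple_insert(rel, slot, GetCurrentCommandId(true), options, bistate);
 *	...more insertions...
 *	table_finish_bulk_insert(rel, options);
 *	FreeBulkInsertState(bistate);
 */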
1129
1130/*
1131 * Perform a "speculative insertion". These can be backed out afterwards
1132 * without aborting the whole transaction. Other sessions can wait for the
1133 * speculative insertion to be confirmed, turning it into a regular tuple, or
1134 * aborted, as if it never existed. Speculatively inserted tuples behave as
1135 * "value locks" of short duration, used to implement INSERT .. ON CONFLICT.
1136 *
1137 * A transaction having performed a speculative insertion has to either abort,
1138 * or finish the speculative insertion with
1139 * table_tuple_complete_speculative(succeeded = ...).
1140 */
1141static inline void
1142table_tuple_insert_speculative(Relation rel, TupleTableSlot *slot,
1143 CommandId cid, int options,
1144 struct BulkInsertStateData *bistate,
1145 uint32 specToken)
1146{
1147 rel->rd_tableam->tuple_insert_speculative(rel, slot, cid, options,
1148 bistate, specToken);
1149}
1150
1151/*
1152 * Complete "speculative insertion" started in the same transaction. If
1153 * succeeded is true, the tuple is fully inserted, if false, it's removed.
1154 */
1155static inline void
1156table_tuple_complete_speculative(Relation rel, TupleTableSlot *slot,
1157 uint32 specToken, bool succeeded)
1158{
1159 rel->rd_tableam->tuple_complete_speculative(rel, slot, specToken,
1160 succeeded);
1161}
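
/*
 * A sketch of the overall flow, loosely following what the executor does for
 * INSERT ... ON CONFLICT (hypothetical variable names; conflict detection is
 * elided):
 *
 *	uint32		specToken;
 *
 *	specToken = SpeculativeInsertionLockAcquire(GetCurrentTransactionId());
 *	table_tuple_insert_speculative(rel, slot, GetCurrentCommandId(true),
 *								   0, NULL, specToken);
 *	...insert index entries and check for conflicts...
 *	table_tuple_complete_speculative(rel, slot, specToken, !conflict_found);
 *	SpeculativeInsertionLockRelease(GetCurrentTransactionId());
 */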
1162
1163/*
1164 * Insert multiple tuples into a table.
1165 *
 * This is like table_tuple_insert(), but inserts multiple tuples in one
 * operation. That's often faster than calling table_tuple_insert() in a loop,
1168 * because e.g. the AM can reduce WAL logging and page locking overhead.
1169 *
1170 * Except for taking `nslots` tuples as input, as an array of TupleTableSlots
1171 * in `slots`, the parameters for table_multi_insert() are the same as for
1172 * table_tuple_insert().
1173 *
1174 * Note: this leaks memory into the current memory context. You can create a
1175 * temporary context before calling this, if that's a problem.
1176 */
1177static inline void
1178table_multi_insert(Relation rel, TupleTableSlot **slots, int nslots,
1179 CommandId cid, int options, struct BulkInsertStateData *bistate)
1180{
1181 rel->rd_tableam->multi_insert(rel, slots, nslots,
1182 cid, options, bistate);
1183}
1184
1185/*
1186 * Delete a tuple.
1187 *
1188 * NB: do not call this directly unless prepared to deal with
1189 * concurrent-update conditions. Use simple_table_tuple_delete instead.
1190 *
1191 * Input parameters:
1192 * relation - table to be modified (caller must hold suitable lock)
1193 * tid - TID of tuple to be deleted
1194 * cid - delete command ID (used for visibility test, and stored into
1195 * cmax if successful)
1196 * crosscheck - if not InvalidSnapshot, also check tuple against this
1197 * wait - true if should wait for any conflicting update to commit/abort
1198 * Output parameters:
1199 * tmfd - filled in failure cases (see below)
1200 * changingPart - true iff the tuple is being moved to another partition
1201 * table due to an update of the partition key. Otherwise, false.
1202 *
1203 * Normal, successful return value is TM_Ok, which means we did actually
1204 * delete it. Failure return codes are TM_SelfModified, TM_Updated, and
1205 * TM_BeingModified (the last only possible if wait == false).
1206 *
 * In the failure cases, the routine fills *tmfd with the tuple's t_ctid,
 * t_xmax, and, if possible, t_cmax. See comments for struct TM_FailureData
 * for additional info.
1210 */
1211static inline TM_Result
1212table_tuple_delete(Relation rel, ItemPointer tid, CommandId cid,
1213 Snapshot snapshot, Snapshot crosscheck, bool wait,
1214 TM_FailureData *tmfd, bool changingPart)
1215{
1216 return rel->rd_tableam->tuple_delete(rel, tid, cid,
1217 snapshot, crosscheck,
1218 wait, tmfd, changingPart);
1219}
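
/*
 * A sketch of how a caller typically inspects the result (hypothetical
 * names; simplified compared to what nodeModifyTable.c actually does):
 *
 *	TM_FailureData tmfd;
 *	TM_Result	result;
 *
 *	result = table_tuple_delete(rel, tid, cid, snapshot, InvalidSnapshot,
 *								true, &tmfd, false);
 *	switch (result)
 *	{
 *		case TM_Ok:
 *			break;
 *		case TM_SelfModified:
 *			...already deleted by the current transaction, usually skip...
 *			break;
 *		case TM_Updated:
 *			...concurrently updated, tmfd.ctid/tmfd.xmax identify the successor...
 *			break;
 *		case TM_Deleted:
 *			...concurrently deleted, nothing left to do...
 *			break;
 *		default:
 *			elog(ERROR, "unexpected table_tuple_delete status: %d", result);
 *	}
 */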
1220
1221/*
1222 * Update a tuple.
1223 *
1224 * NB: do not call this directly unless you are prepared to deal with
1225 * concurrent-update conditions. Use simple_table_tuple_update instead.
1226 *
1227 * Input parameters:
1228 * relation - table to be modified (caller must hold suitable lock)
1229 * otid - TID of old tuple to be replaced
1230 * slot - newly constructed tuple data to store
1231 * cid - update command ID (used for visibility test, and stored into
1232 * cmax/cmin if successful)
1233 * crosscheck - if not InvalidSnapshot, also check old tuple against this
1234 * wait - true if should wait for any conflicting update to commit/abort
1235 * Output parameters:
1236 * tmfd - filled in failure cases (see below)
1237 * lockmode - filled with lock mode acquired on tuple
1238 * update_indexes - in success cases this is set to true if new index entries
1239 * are required for this tuple
1240 *
1241 * Normal, successful return value is TM_Ok, which means we did actually
1242 * update it. Failure return codes are TM_SelfModified, TM_Updated, and
1243 * TM_BeingModified (the last only possible if wait == false).
1244 *
1245 * On success, the slot's tts_tid and tts_tableOid are updated to match the new
1246 * stored tuple; in particular, slot->tts_tid is set to the TID where the
1247 * new tuple was inserted, and its HEAP_ONLY_TUPLE flag is set iff a HOT
 * update was done. However, any TOAST changes in the new tuple's
 * data are not reflected in the slot's contents.
1250 *
1251 * In the failure cases, the routine fills *tmfd with the tuple's t_ctid,
1252 * t_xmax, and, if possible, t_cmax. See comments for struct TM_FailureData
1253 * for additional info.
1254 */
1255static inline TM_Result
1256table_tuple_update(Relation rel, ItemPointer otid, TupleTableSlot *slot,
1257 CommandId cid, Snapshot snapshot, Snapshot crosscheck,
1258 bool wait, TM_FailureData *tmfd, LockTupleMode *lockmode,
1259 bool *update_indexes)
1260{
1261 return rel->rd_tableam->tuple_update(rel, otid, slot,
1262 cid, snapshot, crosscheck,
1263 wait, tmfd,
1264 lockmode, update_indexes);
1265}
1266
1267/*
1268 * Lock a tuple in the specified mode.
1269 *
1270 * Input parameters:
1271 * relation: relation containing tuple (caller must hold suitable lock)
1272 * tid: TID of tuple to lock
1273 * snapshot: snapshot to use for visibility determinations
1274 * cid: current command ID (used for visibility test, and stored into
1275 * tuple's cmax if lock is successful)
1276 * mode: lock mode desired
1277 * wait_policy: what to do if tuple lock is not available
1278 * flags:
1279 * If TUPLE_LOCK_FLAG_LOCK_UPDATE_IN_PROGRESS, follow the update chain to
1280 * also lock descendant tuples if lock modes don't conflict.
1281 * If TUPLE_LOCK_FLAG_FIND_LAST_VERSION, follow the update chain and lock
1282 * latest version.
1283 *
1284 * Output parameters:
1285 * *slot: contains the target tuple
1286 * *tmfd: filled in failure cases (see below)
1287 *
1288 * Function result may be:
1289 * TM_Ok: lock was successfully acquired
1290 * TM_Invisible: lock failed because tuple was never visible to us
1291 * TM_SelfModified: lock failed because tuple updated by self
1292 * TM_Updated: lock failed because tuple updated by other xact
1293 * TM_Deleted: lock failed because tuple deleted by other xact
1294 * TM_WouldBlock: lock couldn't be acquired and wait_policy is skip
1295 *
1296 * In the failure cases other than TM_Invisible and TM_Deleted, the routine
1297 * fills *tmfd with the tuple's t_ctid, t_xmax, and, if possible, t_cmax. See
1298 * comments for struct TM_FailureData for additional info.
1299 */
1300static inline TM_Result
1301table_tuple_lock(Relation rel, ItemPointer tid, Snapshot snapshot,
1302 TupleTableSlot *slot, CommandId cid, LockTupleMode mode,
1303 LockWaitPolicy wait_policy, uint8 flags,
1304 TM_FailureData *tmfd)
1305{
1306 return rel->rd_tableam->tuple_lock(rel, tid, snapshot, slot,
1307 cid, mode, wait_policy,
1308 flags, tmfd);
1309}
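
/*
 * A sketch of locking the latest version of a possibly-updated row
 * (hypothetical names; roughly what EvalPlanQual-style rechecks do):
 *
 *	TM_FailureData tmfd;
 *	TM_Result	res;
 *
 *	res = table_tuple_lock(rel, tid, snapshot, slot, cid,
 *						   LockTupleExclusive, LockWaitBlock,
 *						   TUPLE_LOCK_FLAG_FIND_LAST_VERSION,
 *						   &tmfd);
 *	if (res == TM_Ok && tmfd.traversed)
 *		...the row was updated concurrently; slot now holds the latest version...
 */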
1310
1311/*
1312 * Perform operations necessary to complete insertions made via
 * tuple_insert and multi_insert with a BulkInsertState specified. This may,
 * for example, be used to flush the relation when inserting with
 * TABLE_INSERT_SKIP_WAL specified.
1316 */
1317static inline void
1318table_finish_bulk_insert(Relation rel, int options)
1319{
1320 /* optional callback */
1321 if (rel->rd_tableam && rel->rd_tableam->finish_bulk_insert)
1322 rel->rd_tableam->finish_bulk_insert(rel, options);
1323}
1324
1325
1326/* ------------------------------------------------------------------------
1327 * DDL related functionality.
1328 * ------------------------------------------------------------------------
1329 */
1330
1331/*
1332 * Create storage for `rel` in `newrnode`, with persistence set to
1333 * `persistence`.
1334 *
1335 * This is used both during relation creation and various DDL operations to
1336 * create a new relfilenode that can be filled from scratch. When creating
1337 * new storage for an existing relfilenode, this should be called before the
1338 * relcache entry has been updated.
1339 *
1340 * *freezeXid, *minmulti are set to the xid / multixact horizon for the table
1341 * that pg_class.{relfrozenxid, relminmxid} have to be set to.
1342 */
1343static inline void
1344table_relation_set_new_filenode(Relation rel,
1345 const RelFileNode *newrnode,
1346 char persistence,
1347 TransactionId *freezeXid,
1348 MultiXactId *minmulti)
1349{
1350 rel->rd_tableam->relation_set_new_filenode(rel, newrnode, persistence,
1351 freezeXid, minmulti);
1352}
1353
1354/*
1355 * Remove all table contents from `rel`, in a non-transactional manner.
 * Non-transactional meaning that there's no need to support rollbacks. This
 * is commonly only used to perform truncations for relfilenodes created in
 * the current transaction.
1359 */
1360static inline void
1361table_relation_nontransactional_truncate(Relation rel)
1362{
1363 rel->rd_tableam->relation_nontransactional_truncate(rel);
1364}
1365
1366/*
1367 * Copy data from `rel` into the new relfilenode `newrnode`. The new
1368 * relfilenode may not have storage associated before this function is
1369 * called. This is only supposed to be used for low level operations like
1370 * changing a relation's tablespace.
1371 */
1372static inline void
1373table_relation_copy_data(Relation rel, const RelFileNode *newrnode)
1374{
1375 rel->rd_tableam->relation_copy_data(rel, newrnode);
1376}
1377
1378/*
1379 * Copy data from `OldTable` into `NewTable`, as part of a CLUSTER or VACUUM
1380 * FULL.
1381 *
1382 * Additional Input parameters:
 * - use_sort - if true, the table contents are sorted appropriately for
1384 * `OldIndex`; if false and OldIndex is not InvalidOid, the data is copied
1385 * in that index's order; if false and OldIndex is InvalidOid, no sorting is
1386 * performed
1387 * - OldIndex - see use_sort
1388 * - OldestXmin - computed by vacuum_set_xid_limits(), even when
1389 * not needed for the relation's AM
1390 * - *xid_cutoff - ditto
1391 * - *multi_cutoff - ditto
1392 *
1393 * Output parameters:
1394 * - *xid_cutoff - rel's new relfrozenxid value, may be invalid
1395 * - *multi_cutoff - rel's new relminmxid value, may be invalid
1396 * - *tups_vacuumed - stats, for logging, if appropriate for AM
1397 * - *tups_recently_dead - stats, for logging, if appropriate for AM
1398 */
1399static inline void
1400table_relation_copy_for_cluster(Relation OldTable, Relation NewTable,
1401 Relation OldIndex,
1402 bool use_sort,
1403 TransactionId OldestXmin,
1404 TransactionId *xid_cutoff,
1405 MultiXactId *multi_cutoff,
1406 double *num_tuples,
1407 double *tups_vacuumed,
1408 double *tups_recently_dead)
1409{
1410 OldTable->rd_tableam->relation_copy_for_cluster(OldTable, NewTable, OldIndex,
1411 use_sort, OldestXmin,
1412 xid_cutoff, multi_cutoff,
1413 num_tuples, tups_vacuumed,
1414 tups_recently_dead);
1415}
1416
1417/*
1418 * Perform VACUUM on the relation. The VACUUM can be triggered by a user or by
1419 * autovacuum. The specific actions performed by the AM will depend heavily on
1420 * the individual AM.
1421 *
 * On entry a transaction needs to have already been established, and the
1423 * table is locked with a ShareUpdateExclusive lock.
1424 *
1425 * Note that neither VACUUM FULL (and CLUSTER), nor ANALYZE go through this
1426 * routine, even if (for ANALYZE) it is part of the same VACUUM command.
1427 */
1428static inline void
1429table_relation_vacuum(Relation rel, struct VacuumParams *params,
1430 BufferAccessStrategy bstrategy)
1431{
1432 rel->rd_tableam->relation_vacuum(rel, params, bstrategy);
1433}
1434
1435/*
1436 * Prepare to analyze block `blockno` of `scan`. The scan needs to have been
1437 * started with table_beginscan_analyze(). Note that this routine might
1438 * acquire resources like locks that are held until
1439 * table_scan_analyze_next_tuple() returns false.
1440 *
1441 * Returns false if block is unsuitable for sampling, true otherwise.
1442 */
1443static inline bool
1444table_scan_analyze_next_block(TableScanDesc scan, BlockNumber blockno,
1445 BufferAccessStrategy bstrategy)
1446{
1447 return scan->rs_rd->rd_tableam->scan_analyze_next_block(scan, blockno,
1448 bstrategy);
1449}
1450
1451/*
1452 * Iterate over tuples in the block selected with
1453 * table_scan_analyze_next_block() (which needs to have returned true, and
1454 * this routine may not have returned false for the same block before). If a
1455 * tuple that's suitable for sampling is found, true is returned and a tuple
1456 * is stored in `slot`.
1457 *
1458 * *liverows and *deadrows are incremented according to the encountered
1459 * tuples.
1460 */
1461static inline bool
1462table_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin,
1463 double *liverows, double *deadrows,
1464 TupleTableSlot *slot)
1465{
1466 return scan->rs_rd->rd_tableam->scan_analyze_next_tuple(scan, OldestXmin,
1467 liverows, deadrows,
1468 slot);
1469}
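
/*
 * A sketch of how acquire_sample_rows()-style code drives these two
 * callbacks (hypothetical names, bookkeeping elided):
 *
 *	TableScanDesc scan = table_beginscan_analyze(rel);
 *
 *	for (blockno = 0; blockno < totalblocks; blockno++)
 *	{
 *		if (!table_scan_analyze_next_block(scan, blockno, bstrategy))
 *			continue;
 *
 *		while (table_scan_analyze_next_tuple(scan, OldestXmin,
 *											 &liverows, &deadrows, slot))
 *		{
 *			...consider the tuple in slot for the sample...
 *		}
 *	}
 *
 *	table_endscan(scan);
 */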

/*
 * table_index_build_scan - scan the table to find tuples to be indexed
 *
 * This is called back from an access-method-specific index build procedure
 * after the AM has done whatever setup it needs. The parent table relation
 * is scanned to find tuples that should be entered into the index. Each
 * such tuple is passed to the AM's callback routine, which does the right
 * things to add it to the new index. After we return, the AM's index
 * build procedure does whatever cleanup it needs.
 *
 * The total count of live tuples is returned. This is for updating pg_class
 * statistics. (It's annoying not to be able to do that here, but we want to
 * merge that update with others; see index_update_stats.) Note that the
 * index AM itself must keep track of the number of index tuples; we don't do
 * so here because the AM might reject some of the tuples for its own reasons,
 * such as being unable to store NULLs.
 *
 * If 'progress', the PROGRESS_SCAN_BLOCKS_TOTAL counter is updated when
 * starting the scan, and PROGRESS_SCAN_BLOCKS_DONE is updated as we go along.
 *
 * A side effect is to set indexInfo->ii_BrokenHotChain to true if we detect
 * any potentially broken HOT chains. Currently, we set this if there are any
 * RECENTLY_DEAD or DELETE_IN_PROGRESS entries in a HOT chain, without trying
 * very hard to detect whether they're really incompatible with the chain tip.
 * This only really makes sense for the heap AM; it might need to be
 * generalized for other AMs later.
 */
static inline double
table_index_build_scan(Relation table_rel,
                       Relation index_rel,
                       struct IndexInfo *index_info,
                       bool allow_sync,
                       bool progress,
                       IndexBuildCallback callback,
                       void *callback_state,
                       TableScanDesc scan)
{
    return table_rel->rd_tableam->index_build_range_scan(table_rel,
                                                         index_rel,
                                                         index_info,
                                                         allow_sync,
                                                         false,
                                                         progress,
                                                         0,
                                                         InvalidBlockNumber,
                                                         callback,
                                                         callback_state,
                                                         scan);
}
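
/*
 * Illustrative sketch of a caller, as seen from an index AM's ambuild
 * routine. `heapRel`, `indexRel`, `indexInfo`, `buildstate` and
 * `my_build_callback` (a function matching IndexBuildCallback that forms and
 * inserts one index entry per call) are placeholders, not names this header
 * provides:
 *
 *    double      reltuples;
 *
 *    reltuples = table_index_build_scan(heapRel, indexRel, indexInfo,
 *                                       true, true,
 *                                       my_build_callback,
 *                                       (void *) buildstate,
 *                                       NULL);
 *
 * The two booleans request a synchronized scan and progress reporting;
 * passing NULL as `scan` lets the table AM start and end its own scan. The
 * returned live-tuple count is what ambuild would report back (e.g. in
 * IndexBuildResult.heap_tuples).
 */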

/*
 * As table_index_build_scan(), except that instead of scanning the complete
 * table, only the given number of blocks are scanned. Scan to end-of-rel can
 * be signalled by passing InvalidBlockNumber as numblocks. Note that
 * restricting the range to scan cannot be done when requesting syncscan.
 *
 * When "anyvisible" mode is requested, all tuples visible to any transaction
 * are indexed and counted as live, including those inserted or deleted by
 * transactions that are still in progress.
 */
static inline double
table_index_build_range_scan(Relation table_rel,
                             Relation index_rel,
                             struct IndexInfo *index_info,
                             bool allow_sync,
                             bool anyvisible,
                             bool progress,
                             BlockNumber start_blockno,
                             BlockNumber numblocks,
                             IndexBuildCallback callback,
                             void *callback_state,
                             TableScanDesc scan)
{
    return table_rel->rd_tableam->index_build_range_scan(table_rel,
                                                         index_rel,
                                                         index_info,
                                                         allow_sync,
                                                         anyvisible,
                                                         progress,
                                                         start_blockno,
                                                         numblocks,
                                                         callback,
                                                         callback_state,
                                                         scan);
}
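
/*
 * For instance, summarizing a single block range (in the spirit of what BRIN
 * does) might look like this sketch; `heapRel`, `indexRel`, `indexInfo`,
 * `heapBlk`, `nblocks`, `my_range_callback` and `state` are placeholders:
 *
 *    reltuples = table_index_build_range_scan(heapRel, indexRel, indexInfo,
 *                                             false, true, false,
 *                                             heapBlk, nblocks,
 *                                             my_range_callback,
 *                                             (void *) state,
 *                                             NULL);
 *
 * The booleans are allow_sync, anyvisible and progress respectively: syncscan
 * is disabled because the range is restricted, and anyvisible mode makes
 * in-progress tuples count as live, as described above.
 */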

/*
 * table_index_validate_scan - second table scan for concurrent index build
 *
 * See validate_index() for an explanation.
 */
static inline void
table_index_validate_scan(Relation table_rel,
                          Relation index_rel,
                          struct IndexInfo *index_info,
                          Snapshot snapshot,
                          struct ValidateIndexState *state)
{
    table_rel->rd_tableam->index_validate_scan(table_rel,
                                               index_rel,
                                               index_info,
                                               snapshot,
                                               state);
}


/* ----------------------------------------------------------------------------
 * Miscellaneous functionality
 * ----------------------------------------------------------------------------
 */

/*
 * Return the current size of `rel` in bytes. If `forkNumber` is
 * InvalidForkNumber, return the relation's overall size, otherwise the size
 * for the indicated fork.
 *
 * Note that the overall size might not be the equivalent of the sum of sizes
 * for the individual forks for some AMs, e.g. because the AM's storage does
 * not neatly map onto the builtin types of forks.
 */
static inline uint64
table_relation_size(Relation rel, ForkNumber forkNumber)
{
    return rel->rd_tableam->relation_size(rel, forkNumber);
}
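
/*
 * For example, a caller holding a suitable lock on `rel` could fetch the
 * overall size and the size of just the main fork like this (a sketch):
 *
 *    uint64      total_bytes = table_relation_size(rel, InvalidForkNumber);
 *    uint64      main_bytes = table_relation_size(rel, MAIN_FORKNUM);
 */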

/*
 * table_relation_needs_toast_table - does this relation need a toast table?
 */
static inline bool
table_relation_needs_toast_table(Relation rel)
{
    return rel->rd_tableam->relation_needs_toast_table(rel);
}


/* ----------------------------------------------------------------------------
 * Planner related functionality
 * ----------------------------------------------------------------------------
 */

/*
 * Estimate the current size of the relation, as an AM-specific workhorse for
 * estimate_rel_size(). Look there for an explanation of the parameters.
 */
static inline void
table_relation_estimate_size(Relation rel, int32 *attr_widths,
                             BlockNumber *pages, double *tuples,
                             double *allvisfrac)
{
    rel->rd_tableam->relation_estimate_size(rel, attr_widths, pages, tuples,
                                            allvisfrac);
}


/* ----------------------------------------------------------------------------
 * Executor related functionality
 * ----------------------------------------------------------------------------
 */

/*
 * Prepare to fetch / check / return tuples from `tbmres->blockno` as part of
 * a bitmap table scan. `scan` needs to have been started via
 * table_beginscan_bm(). Returns false if there are no tuples to be found on
 * the page, true otherwise.
 *
 * Note that this is an optional callback; it should only be used after
 * verifying that the AM provides it (at plan time or the like).
 */
static inline bool
table_scan_bitmap_next_block(TableScanDesc scan,
                             struct TBMIterateResult *tbmres)
{
    return scan->rs_rd->rd_tableam->scan_bitmap_next_block(scan,
                                                           tbmres);
}

/*
 * Fetch the next tuple of a bitmap table scan into `slot` and return true if
 * a visible tuple was found, false otherwise.
 * table_scan_bitmap_next_block() needs to previously have selected a block
 * (i.e. returned true), and no previous table_scan_bitmap_next_tuple() for
 * the same block may have returned false.
 */
static inline bool
table_scan_bitmap_next_tuple(TableScanDesc scan,
                             struct TBMIterateResult *tbmres,
                             TupleTableSlot *slot)
{
    return scan->rs_rd->rd_tableam->scan_bitmap_next_tuple(scan,
                                                           tbmres,
                                                           slot);
}
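
/*
 * Illustrative sketch of the executor-side loop using the two functions
 * above (modelled on bitmap heap scans); `tbmiterator`, `scan` and `slot`
 * are placeholders set up elsewhere:
 *
 *    for (;;)
 *    {
 *        TBMIterateResult *tbmres = tbm_iterate(tbmiterator);
 *
 *        if (tbmres == NULL)
 *            break;
 *        if (!table_scan_bitmap_next_block(scan, tbmres))
 *            continue;
 *        while (table_scan_bitmap_next_tuple(scan, tbmres, slot))
 *        {
 *            ... process the tuple in `slot` ...
 *        }
 *    }
 *
 * Since scan_bitmap_next_block is optional, code like this is only reached
 * when the plan was built for an AM that provides it.
 */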

/*
 * Prepare to fetch tuples from the next block in a sample scan. Returns false
 * if the sample scan is finished, true otherwise. `scan` needs to have been
 * started via table_beginscan_sampling().
 *
 * This will call the TsmRoutine's NextSampleBlock() callback if necessary
 * (i.e. NextSampleBlock is not NULL), or perform a sequential scan over the
 * underlying relation.
 */
static inline bool
table_scan_sample_next_block(TableScanDesc scan,
                             struct SampleScanState *scanstate)
{
    return scan->rs_rd->rd_tableam->scan_sample_next_block(scan, scanstate);
}

/*
 * Fetch the next sample tuple into `slot` and return true if a visible tuple
 * was found, false otherwise. table_scan_sample_next_block() needs to
 * previously have selected a block (i.e. returned true), and no previous
 * table_scan_sample_next_tuple() for the same block may have returned false.
 *
 * This will call the TsmRoutine's NextSampleTuple() callback.
 */
static inline bool
table_scan_sample_next_tuple(TableScanDesc scan,
                             struct SampleScanState *scanstate,
                             TupleTableSlot *slot)
{
    return scan->rs_rd->rd_tableam->scan_sample_next_tuple(scan, scanstate,
                                                           slot);
}
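
/*
 * Illustrative sketch of the corresponding executor loop (modelled on sample
 * scans); `scan`, `scanstate` and `slot` are placeholders set up elsewhere:
 *
 *    for (;;)
 *    {
 *        if (!table_scan_sample_next_block(scan, scanstate))
 *            break;
 *        while (table_scan_sample_next_tuple(scan, scanstate, slot))
 *        {
 *            ... return / process the tuple in `slot` ...
 *        }
 *    }
 */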


/* ----------------------------------------------------------------------------
 * Functions to make modifications a bit simpler.
 * ----------------------------------------------------------------------------
 */

extern void simple_table_tuple_insert(Relation rel, TupleTableSlot *slot);
extern void simple_table_tuple_delete(Relation rel, ItemPointer tid,
                                      Snapshot snapshot);
extern void simple_table_tuple_update(Relation rel, ItemPointer otid,
                                      TupleTableSlot *slot, Snapshot snapshot,
                                      bool *update_indexes);


/* ----------------------------------------------------------------------------
 * Helper functions to implement parallel scans for block oriented AMs.
 * ----------------------------------------------------------------------------
 */

extern Size table_block_parallelscan_estimate(Relation rel);
extern Size table_block_parallelscan_initialize(Relation rel,
                                                ParallelTableScanDesc pscan);
extern void table_block_parallelscan_reinitialize(Relation rel,
                                                  ParallelTableScanDesc pscan);
extern BlockNumber table_block_parallelscan_nextpage(Relation rel,
                                                     ParallelBlockTableScanDesc pbscan);
extern void table_block_parallelscan_startblock_init(Relation rel,
                                                     ParallelBlockTableScanDesc pbscan);
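
/*
 * A block-oriented AM will typically point its parallel-scan callbacks
 * straight at these helpers, e.g. (a sketch; my_am_methods is a placeholder
 * for the AM's TableAmRoutine):
 *
 *    static const TableAmRoutine my_am_methods = {
 *        ...
 *        .parallelscan_estimate = table_block_parallelscan_estimate,
 *        .parallelscan_initialize = table_block_parallelscan_initialize,
 *        .parallelscan_reinitialize = table_block_parallelscan_reinitialize,
 *        ...
 *    };
 *
 * Within its scan code, each participating backend would then call
 * table_block_parallelscan_startblock_init() once before fetching blocks
 * with table_block_parallelscan_nextpage(), which returns InvalidBlockNumber
 * once the relation has been handed out completely.
 */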


/* ----------------------------------------------------------------------------
 * Functions in tableamapi.c
 * ----------------------------------------------------------------------------
 */

extern const TableAmRoutine *GetTableAmRoutine(Oid amhandler);
extern const TableAmRoutine *GetHeapamTableAmRoutine(void);
extern bool check_default_table_access_method(char **newval, void **extra,
                                              GucSource source);

#endif							/* TABLEAM_H */