1/*-------------------------------------------------------------------------
2 *
3 * heapam.c
4 * heap access method code
5 *
6 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 *
10 * IDENTIFICATION
11 * src/backend/access/heap/heapam.c
12 *
13 *
14 * INTERFACE ROUTINES
15 * heap_beginscan - begin relation scan
16 * heap_rescan - restart a relation scan
17 * heap_endscan - end relation scan
18 * heap_getnext - retrieve next tuple in scan
19 * heap_fetch - retrieve tuple with given tid
20 * heap_insert - insert tuple into a relation
21 * heap_multi_insert - insert multiple tuples into a relation
22 * heap_delete - delete a tuple from a relation
23 * heap_update - replace a tuple in a relation with another tuple
24 * heap_sync - sync heap, for when no WAL has been written
25 *
26 * NOTES
27 * This file contains the heap_ routines which implement
28 * the POSTGRES heap access method used for all POSTGRES
29 * relations.
30 *
31 *-------------------------------------------------------------------------
32 */
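/*
 * Typical use of the scan interface routines listed above (an illustrative
 * sketch only; real callers normally go through the table AM layer, and
 * relation open/close and snapshot management are omitted here):
 *
 *		TableScanDesc scan;
 *		HeapTuple	tuple;
 *
 *		scan = heap_beginscan(rel, GetActiveSnapshot(), 0, NULL, NULL,
 *							  SO_TYPE_SEQSCAN | SO_ALLOW_STRAT |
 *							  SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE);
 *		while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
 *		{
 *			... process tuple; it is only valid until the next fetch ...
 *		}
 *		heap_endscan(scan);
 */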
33#include "postgres.h"
34
35#include "access/bufmask.h"
36#include "access/genam.h"
37#include "access/heapam.h"
38#include "access/heapam_xlog.h"
39#include "access/hio.h"
40#include "access/multixact.h"
41#include "access/parallel.h"
42#include "access/relscan.h"
43#include "access/sysattr.h"
44#include "access/tableam.h"
45#include "access/transam.h"
46#include "access/tuptoaster.h"
47#include "access/valid.h"
48#include "access/visibilitymap.h"
49#include "access/xact.h"
50#include "access/xlog.h"
51#include "access/xloginsert.h"
52#include "access/xlogutils.h"
53#include "catalog/catalog.h"
54#include "miscadmin.h"
55#include "pgstat.h"
56#include "port/atomics.h"
57#include "storage/bufmgr.h"
58#include "storage/freespace.h"
59#include "storage/lmgr.h"
60#include "storage/predicate.h"
61#include "storage/procarray.h"
62#include "storage/smgr.h"
63#include "storage/spin.h"
64#include "storage/standby.h"
65#include "utils/datum.h"
66#include "utils/inval.h"
67#include "utils/lsyscache.h"
68#include "utils/relcache.h"
69#include "utils/snapmgr.h"
70#include "utils/spccache.h"
71
72
73static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup,
74 TransactionId xid, CommandId cid, int options);
75static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf,
76 Buffer newbuf, HeapTuple oldtup,
77 HeapTuple newtup, HeapTuple old_key_tup,
78 bool all_visible_cleared, bool new_all_visible_cleared);
79static Bitmapset *HeapDetermineModifiedColumns(Relation relation,
80 Bitmapset *interesting_cols,
81 HeapTuple oldtup, HeapTuple newtup);
82static bool heap_acquire_tuplock(Relation relation, ItemPointer tid,
83 LockTupleMode mode, LockWaitPolicy wait_policy,
84 bool *have_tuple_lock);
85static void compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
86 uint16 old_infomask2, TransactionId add_to_xmax,
87 LockTupleMode mode, bool is_update,
88 TransactionId *result_xmax, uint16 *result_infomask,
89 uint16 *result_infomask2);
90static TM_Result heap_lock_updated_tuple(Relation rel, HeapTuple tuple,
91 ItemPointer ctid, TransactionId xid,
92 LockTupleMode mode);
93static void GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask,
94 uint16 *new_infomask2);
95static TransactionId MultiXactIdGetUpdateXid(TransactionId xmax,
96 uint16 t_infomask);
97static bool DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask,
98 LockTupleMode lockmode, bool *current_is_member);
99static void MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask,
100 Relation rel, ItemPointer ctid, XLTW_Oper oper,
101 int *remaining);
102static bool ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status,
103 uint16 infomask, Relation rel, int *remaining);
104static XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup);
105static HeapTuple ExtractReplicaIdentity(Relation rel, HeapTuple tup, bool key_modified,
106 bool *copy);
107
108
109/*
110 * Each tuple lock mode has a corresponding heavyweight lock, and one or two
111 * corresponding MultiXactStatuses (one to merely lock tuples, another one to
112 * update them). This table (and the macros below) helps us determine the
113 * heavyweight lock mode and MultiXactStatus values to use for any particular
114 * tuple lock strength.
115 *
116 * Don't look at lockstatus/updstatus directly! Use get_mxact_status_for_lock
117 * instead.
118 */
119static const struct
120{
121 LOCKMODE hwlock;
122 int lockstatus;
123 int updstatus;
124}
125
126 tupleLockExtraInfo[MaxLockTupleMode + 1] =
127{
128 { /* LockTupleKeyShare */
129 AccessShareLock,
130 MultiXactStatusForKeyShare,
131 -1 /* KeyShare does not allow updating tuples */
132 },
133 { /* LockTupleShare */
134 RowShareLock,
135 MultiXactStatusForShare,
136 -1 /* Share does not allow updating tuples */
137 },
138 { /* LockTupleNoKeyExclusive */
139 ExclusiveLock,
140 MultiXactStatusForNoKeyUpdate,
141 MultiXactStatusNoKeyUpdate
142 },
143 { /* LockTupleExclusive */
144 AccessExclusiveLock,
145 MultiXactStatusForUpdate,
146 MultiXactStatusUpdate
147 }
148};
149
150/* Get the LOCKMODE for a given MultiXactStatus */
151#define LOCKMODE_from_mxstatus(status) \
152 (tupleLockExtraInfo[TUPLOCK_from_mxstatus((status))].hwlock)
153
154/*
155 * Acquire heavyweight locks on tuples, using a LockTupleMode strength value.
156 * This is more readable than having every caller translate it to lock.h's
157 * LOCKMODE.
158 */
159#define LockTupleTuplock(rel, tup, mode) \
160 LockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
161#define UnlockTupleTuplock(rel, tup, mode) \
162 UnlockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
163#define ConditionalLockTupleTuplock(rel, tup, mode) \
164 ConditionalLockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
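/*
 * For example (illustration only), LockTupleTuplock(relation, &tup->t_self,
 * LockTupleShare) acquires RowShareLock on that tuple via LockTuple(),
 * per the tupleLockExtraInfo table above, and the matching
 * UnlockTupleTuplock() call releases it again; see heap_acquire_tuplock()
 * for the real usage pattern.
 */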
165
166#ifdef USE_PREFETCH
167/*
168 * heap_compute_xid_horizon_for_tuples and xid_horizon_prefetch_buffer use
169 * this structure to coordinate prefetching activity.
170 */
171typedef struct
172{
173 BlockNumber cur_hblkno;
174 int next_item;
175 int nitems;
176 ItemPointerData *tids;
177} XidHorizonPrefetchState;
178#endif
179
180/*
181 * This table maps each particular MultiXactStatus value to the
182 * corresponding tuple lock strength.
183 */
184static const int MultiXactStatusLock[MaxMultiXactStatus + 1] =
185{
186 LockTupleKeyShare, /* ForKeyShare */
187 LockTupleShare, /* ForShare */
188 LockTupleNoKeyExclusive, /* ForNoKeyUpdate */
189 LockTupleExclusive, /* ForUpdate */
190 LockTupleNoKeyExclusive, /* NoKeyUpdate */
191 LockTupleExclusive /* Update */
192};
193
194/* Get the LockTupleMode for a given MultiXactStatus */
195#define TUPLOCK_from_mxstatus(status) \
196 (MultiXactStatusLock[(status)])
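/*
 * Worked example: TUPLOCK_from_mxstatus(MultiXactStatusForShare) yields
 * LockTupleShare, so LOCKMODE_from_mxstatus() resolves that status to
 * RowShareLock through tupleLockExtraInfo above.
 */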
197
198/* ----------------------------------------------------------------
199 * heap support routines
200 * ----------------------------------------------------------------
201 */
202
203/* ----------------
204 * initscan - scan code common to heap_beginscan and heap_rescan
205 * ----------------
206 */
207static void
208initscan(HeapScanDesc scan, ScanKey key, bool keep_startblock)
209{
210 ParallelBlockTableScanDesc bpscan = NULL;
211 bool allow_strat;
212 bool allow_sync;
213
214 /*
215 * Determine the number of blocks we have to scan.
216 *
217 * It is sufficient to do this once at scan start, since any tuples added
218 * while the scan is in progress will be invisible to my snapshot anyway.
219 * (That is not true when using a non-MVCC snapshot. However, we couldn't
220 * guarantee to return tuples added after scan start anyway, since they
221 * might go into pages we already scanned. To guarantee consistent
222 * results for a non-MVCC snapshot, the caller must hold some higher-level
223 * lock that ensures the interesting tuple(s) won't change.)
224 */
225 if (scan->rs_base.rs_parallel != NULL)
226 {
227 bpscan = (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
228 scan->rs_nblocks = bpscan->phs_nblocks;
229 }
230 else
231 scan->rs_nblocks = RelationGetNumberOfBlocks(scan->rs_base.rs_rd);
232
233 /*
234 * If the table is large relative to NBuffers, use a bulk-read access
235 * strategy and enable synchronized scanning (see syncscan.c). Although
236 * the thresholds for these features could be different, we make them the
237 * same so that there are only two behaviors to tune rather than four.
238 * (However, some callers need to be able to disable one or both of these
239 * behaviors, independently of the size of the table; also there is a GUC
240 * variable that can disable synchronized scanning.)
241 *
242 * Note that table_block_parallelscan_initialize has a very similar test;
243 * if you change this, consider changing that one, too.
244 */
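	/*
	 * As a worked example (assuming the default 8kB block size): with
	 * shared_buffers set to 128MB, NBuffers is 16384, so any relation larger
	 * than 4096 blocks (32MB) is read with the bulk-read strategy and is a
	 * candidate for synchronized scanning.
	 */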
245 if (!RelationUsesLocalBuffers(scan->rs_base.rs_rd) &&
246 scan->rs_nblocks > NBuffers / 4)
247 {
248 allow_strat = (scan->rs_base.rs_flags & SO_ALLOW_STRAT) != 0;
249 allow_sync = (scan->rs_base.rs_flags & SO_ALLOW_SYNC) != 0;
250 }
251 else
252 allow_strat = allow_sync = false;
253
254 if (allow_strat)
255 {
256 /* During a rescan, keep the previous strategy object. */
257 if (scan->rs_strategy == NULL)
258 scan->rs_strategy = GetAccessStrategy(BAS_BULKREAD);
259 }
260 else
261 {
262 if (scan->rs_strategy != NULL)
263 FreeAccessStrategy(scan->rs_strategy);
264 scan->rs_strategy = NULL;
265 }
266
267 if (scan->rs_base.rs_parallel != NULL)
268 {
269 /* For parallel scan, believe whatever ParallelTableScanDesc says. */
270 if (scan->rs_base.rs_parallel->phs_syncscan)
271 scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
272 else
273 scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
274 }
275 else if (keep_startblock)
276 {
277 /*
278 * When rescanning, we want to keep the previous startblock setting,
279 * so that rewinding a cursor doesn't generate surprising results.
280 * Reset the active syncscan setting, though.
281 */
282 if (allow_sync && synchronize_seqscans)
283 scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
284 else
285 scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
286 }
287 else if (allow_sync && synchronize_seqscans)
288 {
289 scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
290 scan->rs_startblock = ss_get_location(scan->rs_base.rs_rd, scan->rs_nblocks);
291 }
292 else
293 {
294 scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
295 scan->rs_startblock = 0;
296 }
297
298 scan->rs_numblocks = InvalidBlockNumber;
299 scan->rs_inited = false;
300 scan->rs_ctup.t_data = NULL;
301 ItemPointerSetInvalid(&scan->rs_ctup.t_self);
302 scan->rs_cbuf = InvalidBuffer;
303 scan->rs_cblock = InvalidBlockNumber;
304
305 /* page-at-a-time fields are always invalid when not rs_inited */
306
307 /*
308 * copy the scan key, if appropriate
309 */
310 if (key != NULL)
311 memcpy(scan->rs_base.rs_key, key, scan->rs_base.rs_nkeys * sizeof(ScanKeyData));
312
313 /*
314 * Currently, we only have a stats counter for sequential heap scans (but
315 * e.g. for bitmap scans the underlying bitmap index scans will be counted,
316 * and for sample scans we update stats for tuple fetches).
317 */
318 if (scan->rs_base.rs_flags & SO_TYPE_SEQSCAN)
319 pgstat_count_heap_scan(scan->rs_base.rs_rd);
320}
321
322/*
323 * heap_setscanlimits - restrict range of a heapscan
324 *
325 * startBlk is the page to start at
326 * numBlks is number of pages to scan (InvalidBlockNumber means "all")
327 */
328void
329heap_setscanlimits(TableScanDesc sscan, BlockNumber startBlk, BlockNumber numBlks)
330{
331 HeapScanDesc scan = (HeapScanDesc) sscan;
332
333 Assert(!scan->rs_inited); /* else too late to change */
334 /* else rs_startblock is significant */
335 Assert(!(scan->rs_base.rs_flags & SO_ALLOW_SYNC));
336
337 /* Check startBlk is valid (but allow case of zero blocks...) */
338 Assert(startBlk == 0 || startBlk < scan->rs_nblocks);
339
340 scan->rs_startblock = startBlk;
341 scan->rs_numblocks = numBlks;
342}
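/*
 * For example (a sketch), a caller that only needs the relation's first
 * block could issue
 *
 *		heap_setscanlimits(sscan, 0, 1);
 *
 * immediately after beginning the scan and before fetching any tuples, with
 * syncscan disabled as the asserts above require.
 */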
343
344/*
345 * heapgetpage - subroutine for heapgettup()
346 *
347 * This routine reads and pins the specified page of the relation.
348 * In page-at-a-time mode it performs additional work, namely determining
349 * which tuples on the page are visible.
350 */
351void
352heapgetpage(TableScanDesc sscan, BlockNumber page)
353{
354 HeapScanDesc scan = (HeapScanDesc) sscan;
355 Buffer buffer;
356 Snapshot snapshot;
357 Page dp;
358 int lines;
359 int ntup;
360 OffsetNumber lineoff;
361 ItemId lpp;
362 bool all_visible;
363
364 Assert(page < scan->rs_nblocks);
365
366 /* release previous scan buffer, if any */
367 if (BufferIsValid(scan->rs_cbuf))
368 {
369 ReleaseBuffer(scan->rs_cbuf);
370 scan->rs_cbuf = InvalidBuffer;
371 }
372
373 /*
374 * Be sure to check for interrupts at least once per page. Checks at
375 * higher code levels won't be able to stop a seqscan that encounters many
376 * pages' worth of consecutive dead tuples.
377 */
378 CHECK_FOR_INTERRUPTS();
379
380 /* read page using selected strategy */
381 scan->rs_cbuf = ReadBufferExtended(scan->rs_base.rs_rd, MAIN_FORKNUM, page,
382 RBM_NORMAL, scan->rs_strategy);
383 scan->rs_cblock = page;
384
385 if (!(scan->rs_base.rs_flags & SO_ALLOW_PAGEMODE))
386 return;
387
388 buffer = scan->rs_cbuf;
389 snapshot = scan->rs_base.rs_snapshot;
390
391 /*
392 * Prune and repair fragmentation for the whole page, if possible.
393 */
394 heap_page_prune_opt(scan->rs_base.rs_rd, buffer);
395
396 /*
397 * We must hold share lock on the buffer content while examining tuple
398 * visibility. Afterwards, however, the tuples we have found to be
399 * visible are guaranteed good as long as we hold the buffer pin.
400 */
401 LockBuffer(buffer, BUFFER_LOCK_SHARE);
402
403 dp = BufferGetPage(buffer);
404 TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, dp);
405 lines = PageGetMaxOffsetNumber(dp);
406 ntup = 0;
407
408 /*
409 * If the all-visible flag indicates that all tuples on the page are
410 * visible to everyone, we can skip the per-tuple visibility tests.
411 *
412 * Note: In hot standby, a tuple that's already visible to all
413 * transactions in the master might still be invisible to a read-only
414 * transaction in the standby. We partly handle this problem by tracking
415 * the minimum xmin of visible tuples as the cut-off XID while marking a
416 * page all-visible on the master and WAL-logging it along with the visibility
417 * map SET operation. In hot standby, we wait for (or abort) all
418 * transactions that potentially cannot see one or more tuples on the
419 * page. That's why index-only scans work fine in hot standby. A crucial
420 * difference between index-only scans and heap scans is that the
421 * index-only scan relies entirely on the visibility map, whereas a heap
422 * scan looks at the page-level PD_ALL_VISIBLE flag. We are not sure if
423 * the page-level flag can be trusted in the same way, because it might
424 * get propagated somehow without being explicitly WAL-logged, e.g. via a
425 * full page write. Until we can prove that beyond doubt, let's check each
426 * tuple for visibility the hard way.
427 */
428 all_visible = PageIsAllVisible(dp) && !snapshot->takenDuringRecovery;
429
430 for (lineoff = FirstOffsetNumber, lpp = PageGetItemId(dp, lineoff);
431 lineoff <= lines;
432 lineoff++, lpp++)
433 {
434 if (ItemIdIsNormal(lpp))
435 {
436 HeapTupleData loctup;
437 bool valid;
438
439 loctup.t_tableOid = RelationGetRelid(scan->rs_base.rs_rd);
440 loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
441 loctup.t_len = ItemIdGetLength(lpp);
442 ItemPointerSet(&(loctup.t_self), page, lineoff);
443
444 if (all_visible)
445 valid = true;
446 else
447 valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
448
449 CheckForSerializableConflictOut(valid, scan->rs_base.rs_rd,
450 &loctup, buffer, snapshot);
451
452 if (valid)
453 scan->rs_vistuples[ntup++] = lineoff;
454 }
455 }
456
457 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
458
459 Assert(ntup <= MaxHeapTuplesPerPage);
460 scan->rs_ntuples = ntup;
461}
462
463/* ----------------
464 * heapgettup - fetch next heap tuple
465 *
466 * Initialize the scan if not already done; then advance to the next
467 * tuple as indicated by "dir"; return the next tuple in scan->rs_ctup,
468 * or set scan->rs_ctup.t_data = NULL if no more tuples.
469 *
470 * dir == NoMovementScanDirection means "re-fetch the tuple indicated
471 * by scan->rs_ctup".
472 *
473 * Note: the reason nkeys/key are passed separately, even though they are
474 * kept in the scan descriptor, is that the caller may not want us to check
475 * the scankeys.
476 *
477 * Note: when we fall off the end of the scan in either direction, we
478 * reset rs_inited. This means that a further request with the same
479 * scan direction will restart the scan, which is a bit odd, but a
480 * request with the opposite scan direction will start a fresh scan
481 * in the proper direction. The latter is required behavior for cursors,
482 * while the former case is generally undefined behavior in Postgres
483 * so we don't care too much.
484 * ----------------
485 */
486static void
487heapgettup(HeapScanDesc scan,
488 ScanDirection dir,
489 int nkeys,
490 ScanKey key)
491{
492 HeapTuple tuple = &(scan->rs_ctup);
493 Snapshot snapshot = scan->rs_base.rs_snapshot;
494 bool backward = ScanDirectionIsBackward(dir);
495 BlockNumber page;
496 bool finished;
497 Page dp;
498 int lines;
499 OffsetNumber lineoff;
500 int linesleft;
501 ItemId lpp;
502
503 /*
504 * calculate next starting lineoff, given scan direction
505 */
506 if (ScanDirectionIsForward(dir))
507 {
508 if (!scan->rs_inited)
509 {
510 /*
511 * return null immediately if relation is empty
512 */
513 if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
514 {
515 Assert(!BufferIsValid(scan->rs_cbuf));
516 tuple->t_data = NULL;
517 return;
518 }
519 if (scan->rs_base.rs_parallel != NULL)
520 {
521 ParallelBlockTableScanDesc pbscan =
522 (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
523
524 table_block_parallelscan_startblock_init(scan->rs_base.rs_rd,
525 pbscan);
526
527 page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
528 pbscan);
529
530 /* Other processes might have already finished the scan. */
531 if (page == InvalidBlockNumber)
532 {
533 Assert(!BufferIsValid(scan->rs_cbuf));
534 tuple->t_data = NULL;
535 return;
536 }
537 }
538 else
539 page = scan->rs_startblock; /* first page */
540 heapgetpage((TableScanDesc) scan, page);
541 lineoff = FirstOffsetNumber; /* first offnum */
542 scan->rs_inited = true;
543 }
544 else
545 {
546 /* continue from previously returned page/tuple */
547 page = scan->rs_cblock; /* current page */
548 lineoff = /* next offnum */
549 OffsetNumberNext(ItemPointerGetOffsetNumber(&(tuple->t_self)));
550 }
551
552 LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
553
554 dp = BufferGetPage(scan->rs_cbuf);
555 TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, dp);
556 lines = PageGetMaxOffsetNumber(dp);
557 /* page and lineoff now reference the physically next tid */
558
559 linesleft = lines - lineoff + 1;
560 }
561 else if (backward)
562 {
563 /* backward parallel scan not supported */
564 Assert(scan->rs_base.rs_parallel == NULL);
565
566 if (!scan->rs_inited)
567 {
568 /*
569 * return null immediately if relation is empty
570 */
571 if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
572 {
573 Assert(!BufferIsValid(scan->rs_cbuf));
574 tuple->t_data = NULL;
575 return;
576 }
577
578 /*
579 * Disable reporting to syncscan logic in a backwards scan; it's
580 * not very likely anyone else is doing the same thing at the same
581 * time, and much more likely that we'll just bollix things for
582 * forward scanners.
583 */
584 scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
585 /* start from last page of the scan */
586 if (scan->rs_startblock > 0)
587 page = scan->rs_startblock - 1;
588 else
589 page = scan->rs_nblocks - 1;
590 heapgetpage((TableScanDesc) scan, page);
591 }
592 else
593 {
594 /* continue from previously returned page/tuple */
595 page = scan->rs_cblock; /* current page */
596 }
597
598 LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
599
600 dp = BufferGetPage(scan->rs_cbuf);
601 TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, dp);
602 lines = PageGetMaxOffsetNumber(dp);
603
604 if (!scan->rs_inited)
605 {
606 lineoff = lines; /* final offnum */
607 scan->rs_inited = true;
608 }
609 else
610 {
611 lineoff = /* previous offnum */
612 OffsetNumberPrev(ItemPointerGetOffsetNumber(&(tuple->t_self)));
613 }
614 /* page and lineoff now reference the physically previous tid */
615
616 linesleft = lineoff;
617 }
618 else
619 {
620 /*
621 * ``no movement'' scan direction: refetch prior tuple
622 */
623 if (!scan->rs_inited)
624 {
625 Assert(!BufferIsValid(scan->rs_cbuf));
626 tuple->t_data = NULL;
627 return;
628 }
629
630 page = ItemPointerGetBlockNumber(&(tuple->t_self));
631 if (page != scan->rs_cblock)
632 heapgetpage((TableScanDesc) scan, page);
633
634 /* Since the tuple was previously fetched, needn't lock page here */
635 dp = BufferGetPage(scan->rs_cbuf);
636 TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, dp);
637 lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self));
638 lpp = PageGetItemId(dp, lineoff);
639 Assert(ItemIdIsNormal(lpp));
640
641 tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
642 tuple->t_len = ItemIdGetLength(lpp);
643
644 return;
645 }
646
647 /*
648 * advance the scan until we find a qualifying tuple or run out of stuff
649 * to scan
650 */
651 lpp = PageGetItemId(dp, lineoff);
652 for (;;)
653 {
654 while (linesleft > 0)
655 {
656 if (ItemIdIsNormal(lpp))
657 {
658 bool valid;
659
660 tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
661 tuple->t_len = ItemIdGetLength(lpp);
662 ItemPointerSet(&(tuple->t_self), page, lineoff);
663
664 /*
665 * if current tuple qualifies, return it.
666 */
667 valid = HeapTupleSatisfiesVisibility(tuple,
668 snapshot,
669 scan->rs_cbuf);
670
671 CheckForSerializableConflictOut(valid, scan->rs_base.rs_rd,
672 tuple, scan->rs_cbuf,
673 snapshot);
674
675 if (valid && key != NULL)
676 HeapKeyTest(tuple, RelationGetDescr(scan->rs_base.rs_rd),
677 nkeys, key, valid);
678
679 if (valid)
680 {
681 LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
682 return;
683 }
684 }
685
686 /*
687 * otherwise move to the next item on the page
688 */
689 --linesleft;
690 if (backward)
691 {
692 --lpp; /* move back in this page's ItemId array */
693 --lineoff;
694 }
695 else
696 {
697 ++lpp; /* move forward in this page's ItemId array */
698 ++lineoff;
699 }
700 }
701
702 /*
703 * if we get here, it means we've exhausted the items on this page and
704 * it's time to move to the next.
705 */
706 LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
707
708 /*
709 * advance to next/prior page and detect end of scan
710 */
711 if (backward)
712 {
713 finished = (page == scan->rs_startblock) ||
714 (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
715 if (page == 0)
716 page = scan->rs_nblocks;
717 page--;
718 }
719 else if (scan->rs_base.rs_parallel != NULL)
720 {
721 ParallelBlockTableScanDesc pbscan =
722 (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
723
724 page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
725 pbscan);
726 finished = (page == InvalidBlockNumber);
727 }
728 else
729 {
730 page++;
731 if (page >= scan->rs_nblocks)
732 page = 0;
733 finished = (page == scan->rs_startblock) ||
734 (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
735
736 /*
737 * Report our new scan position for synchronization purposes. We
738 * don't do that when moving backwards, however. That would just
739 * mess up any other forward-moving scanners.
740 *
741 * Note: we do this before checking for end of scan so that the
742 * final state of the position hint is back at the start of the
743 * rel. That's not strictly necessary, but otherwise when you run
744 * the same query multiple times the starting position would shift
745 * a little bit backwards on every invocation, which is confusing.
746 * We don't guarantee any specific ordering in general, though.
747 */
748 if (scan->rs_base.rs_flags & SO_ALLOW_SYNC)
749 ss_report_location(scan->rs_base.rs_rd, page);
750 }
751
752 /*
753 * return NULL if we've exhausted all the pages
754 */
755 if (finished)
756 {
757 if (BufferIsValid(scan->rs_cbuf))
758 ReleaseBuffer(scan->rs_cbuf);
759 scan->rs_cbuf = InvalidBuffer;
760 scan->rs_cblock = InvalidBlockNumber;
761 tuple->t_data = NULL;
762 scan->rs_inited = false;
763 return;
764 }
765
766 heapgetpage((TableScanDesc) scan, page);
767
768 LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
769
770 dp = BufferGetPage(scan->rs_cbuf);
771 TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, dp);
772 lines = PageGetMaxOffsetNumber((Page) dp);
773 linesleft = lines;
774 if (backward)
775 {
776 lineoff = lines;
777 lpp = PageGetItemId(dp, lines);
778 }
779 else
780 {
781 lineoff = FirstOffsetNumber;
782 lpp = PageGetItemId(dp, FirstOffsetNumber);
783 }
784 }
785}
786
787/* ----------------
788 * heapgettup_pagemode - fetch next heap tuple in page-at-a-time mode
789 *
790 * Same API as heapgettup, but used in page-at-a-time mode
791 *
792 * The internal logic is much the same as heapgettup's too, but there are some
793 * differences: we do not take the buffer content lock (that only needs to
794 * happen inside heapgetpage), and we iterate through just the tuples listed
795 * in rs_vistuples[] rather than all tuples on the page. Notice that
796 * lineindex is 0-based, where the corresponding loop variable lineoff in
797 * heapgettup is 1-based.
798 * ----------------
799 */
800static void
801heapgettup_pagemode(HeapScanDesc scan,
802 ScanDirection dir,
803 int nkeys,
804 ScanKey key)
805{
806 HeapTuple tuple = &(scan->rs_ctup);
807 bool backward = ScanDirectionIsBackward(dir);
808 BlockNumber page;
809 bool finished;
810 Page dp;
811 int lines;
812 int lineindex;
813 OffsetNumber lineoff;
814 int linesleft;
815 ItemId lpp;
816
817 /*
818 * calculate next starting lineindex, given scan direction
819 */
820 if (ScanDirectionIsForward(dir))
821 {
822 if (!scan->rs_inited)
823 {
824 /*
825 * return null immediately if relation is empty
826 */
827 if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
828 {
829 Assert(!BufferIsValid(scan->rs_cbuf));
830 tuple->t_data = NULL;
831 return;
832 }
833 if (scan->rs_base.rs_parallel != NULL)
834 {
835 ParallelBlockTableScanDesc pbscan =
836 (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
837
838 table_block_parallelscan_startblock_init(scan->rs_base.rs_rd,
839 pbscan);
840
841 page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
842 pbscan);
843
844 /* Other processes might have already finished the scan. */
845 if (page == InvalidBlockNumber)
846 {
847 Assert(!BufferIsValid(scan->rs_cbuf));
848 tuple->t_data = NULL;
849 return;
850 }
851 }
852 else
853 page = scan->rs_startblock; /* first page */
854 heapgetpage((TableScanDesc) scan, page);
855 lineindex = 0;
856 scan->rs_inited = true;
857 }
858 else
859 {
860 /* continue from previously returned page/tuple */
861 page = scan->rs_cblock; /* current page */
862 lineindex = scan->rs_cindex + 1;
863 }
864
865 dp = BufferGetPage(scan->rs_cbuf);
866 TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, dp);
867 lines = scan->rs_ntuples;
868 /* page and lineindex now reference the next visible tid */
869
870 linesleft = lines - lineindex;
871 }
872 else if (backward)
873 {
874 /* backward parallel scan not supported */
875 Assert(scan->rs_base.rs_parallel == NULL);
876
877 if (!scan->rs_inited)
878 {
879 /*
880 * return null immediately if relation is empty
881 */
882 if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
883 {
884 Assert(!BufferIsValid(scan->rs_cbuf));
885 tuple->t_data = NULL;
886 return;
887 }
888
889 /*
890 * Disable reporting to syncscan logic in a backwards scan; it's
891 * not very likely anyone else is doing the same thing at the same
892 * time, and much more likely that we'll just bollix things for
893 * forward scanners.
894 */
895 scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
896 /* start from last page of the scan */
897 if (scan->rs_startblock > 0)
898 page = scan->rs_startblock - 1;
899 else
900 page = scan->rs_nblocks - 1;
901 heapgetpage((TableScanDesc) scan, page);
902 }
903 else
904 {
905 /* continue from previously returned page/tuple */
906 page = scan->rs_cblock; /* current page */
907 }
908
909 dp = BufferGetPage(scan->rs_cbuf);
910 TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, dp);
911 lines = scan->rs_ntuples;
912
913 if (!scan->rs_inited)
914 {
915 lineindex = lines - 1;
916 scan->rs_inited = true;
917 }
918 else
919 {
920 lineindex = scan->rs_cindex - 1;
921 }
922 /* page and lineindex now reference the previous visible tid */
923
924 linesleft = lineindex + 1;
925 }
926 else
927 {
928 /*
929 * ``no movement'' scan direction: refetch prior tuple
930 */
931 if (!scan->rs_inited)
932 {
933 Assert(!BufferIsValid(scan->rs_cbuf));
934 tuple->t_data = NULL;
935 return;
936 }
937
938 page = ItemPointerGetBlockNumber(&(tuple->t_self));
939 if (page != scan->rs_cblock)
940 heapgetpage((TableScanDesc) scan, page);
941
942 /* Since the tuple was previously fetched, needn't lock page here */
943 dp = BufferGetPage(scan->rs_cbuf);
944 TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, dp);
945 lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self));
946 lpp = PageGetItemId(dp, lineoff);
947 Assert(ItemIdIsNormal(lpp));
948
949 tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
950 tuple->t_len = ItemIdGetLength(lpp);
951
952 /* check that rs_cindex is in sync */
953 Assert(scan->rs_cindex < scan->rs_ntuples);
954 Assert(lineoff == scan->rs_vistuples[scan->rs_cindex]);
955
956 return;
957 }
958
959 /*
960 * advance the scan until we find a qualifying tuple or run out of stuff
961 * to scan
962 */
963 for (;;)
964 {
965 while (linesleft > 0)
966 {
967 lineoff = scan->rs_vistuples[lineindex];
968 lpp = PageGetItemId(dp, lineoff);
969 Assert(ItemIdIsNormal(lpp));
970
971 tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
972 tuple->t_len = ItemIdGetLength(lpp);
973 ItemPointerSet(&(tuple->t_self), page, lineoff);
974
975 /*
976 * if current tuple qualifies, return it.
977 */
978 if (key != NULL)
979 {
980 bool valid;
981
982 HeapKeyTest(tuple, RelationGetDescr(scan->rs_base.rs_rd),
983 nkeys, key, valid);
984 if (valid)
985 {
986 scan->rs_cindex = lineindex;
987 return;
988 }
989 }
990 else
991 {
992 scan->rs_cindex = lineindex;
993 return;
994 }
995
996 /*
997 * otherwise move to the next item on the page
998 */
999 --linesleft;
1000 if (backward)
1001 --lineindex;
1002 else
1003 ++lineindex;
1004 }
1005
1006 /*
1007 * if we get here, it means we've exhausted the items on this page and
1008 * it's time to move to the next.
1009 */
1010 if (backward)
1011 {
1012 finished = (page == scan->rs_startblock) ||
1013 (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
1014 if (page == 0)
1015 page = scan->rs_nblocks;
1016 page--;
1017 }
1018 else if (scan->rs_base.rs_parallel != NULL)
1019 {
1020 ParallelBlockTableScanDesc pbscan =
1021 (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
1022
1023 page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
1024 pbscan);
1025 finished = (page == InvalidBlockNumber);
1026 }
1027 else
1028 {
1029 page++;
1030 if (page >= scan->rs_nblocks)
1031 page = 0;
1032 finished = (page == scan->rs_startblock) ||
1033 (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
1034
1035 /*
1036 * Report our new scan position for synchronization purposes. We
1037 * don't do that when moving backwards, however. That would just
1038 * mess up any other forward-moving scanners.
1039 *
1040 * Note: we do this before checking for end of scan so that the
1041 * final state of the position hint is back at the start of the
1042 * rel. That's not strictly necessary, but otherwise when you run
1043 * the same query multiple times the starting position would shift
1044 * a little bit backwards on every invocation, which is confusing.
1045 * We don't guarantee any specific ordering in general, though.
1046 */
1047 if (scan->rs_base.rs_flags & SO_ALLOW_SYNC)
1048 ss_report_location(scan->rs_base.rs_rd, page);
1049 }
1050
1051 /*
1052 * return NULL if we've exhausted all the pages
1053 */
1054 if (finished)
1055 {
1056 if (BufferIsValid(scan->rs_cbuf))
1057 ReleaseBuffer(scan->rs_cbuf);
1058 scan->rs_cbuf = InvalidBuffer;
1059 scan->rs_cblock = InvalidBlockNumber;
1060 tuple->t_data = NULL;
1061 scan->rs_inited = false;
1062 return;
1063 }
1064
1065 heapgetpage((TableScanDesc) scan, page);
1066
1067 dp = BufferGetPage(scan->rs_cbuf);
1068 TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, dp);
1069 lines = scan->rs_ntuples;
1070 linesleft = lines;
1071 if (backward)
1072 lineindex = lines - 1;
1073 else
1074 lineindex = 0;
1075 }
1076}
1077
1078
1079#if defined(DISABLE_COMPLEX_MACRO)
1080/*
1081 * This is formatted so oddly so that the correspondence to the macro
1082 * definition in access/htup_details.h is maintained.
1083 */
1084Datum
1085fastgetattr(HeapTuple tup, int attnum, TupleDesc tupleDesc,
1086 bool *isnull)
1087{
1088 return (
1089 (attnum) > 0 ?
1090 (
1091 (*(isnull) = false),
1092 HeapTupleNoNulls(tup) ?
1093 (
1094 TupleDescAttr((tupleDesc), (attnum) - 1)->attcacheoff >= 0 ?
1095 (
1096 fetchatt(TupleDescAttr((tupleDesc), (attnum) - 1),
1097 (char *) (tup)->t_data + (tup)->t_data->t_hoff +
1098 TupleDescAttr((tupleDesc), (attnum) - 1)->attcacheoff)
1099 )
1100 :
1101 nocachegetattr((tup), (attnum), (tupleDesc))
1102 )
1103 :
1104 (
1105 att_isnull((attnum) - 1, (tup)->t_data->t_bits) ?
1106 (
1107 (*(isnull) = true),
1108 (Datum) NULL
1109 )
1110 :
1111 (
1112 nocachegetattr((tup), (attnum), (tupleDesc))
1113 )
1114 )
1115 )
1116 :
1117 (
1118 (Datum) NULL
1119 )
1120 );
1121}
1122#endif /* defined(DISABLE_COMPLEX_MACRO) */
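/*
 * Illustrative call of fastgetattr() (a sketch; "tuple" and "rel" are
 * assumed to be a valid HeapTuple and its Relation):
 *
 *		bool		isnull;
 *		Datum		value;
 *
 *		value = fastgetattr(tuple, 1, RelationGetDescr(rel), &isnull);
 *
 * fetches the first attribute of the tuple, setting isnull if it is NULL.
 */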
1123
1124
1125/* ----------------------------------------------------------------
1126 * heap access method interface
1127 * ----------------------------------------------------------------
1128 */
1129
1130
1131TableScanDesc
1132heap_beginscan(Relation relation, Snapshot snapshot,
1133 int nkeys, ScanKey key,
1134 ParallelTableScanDesc parallel_scan,
1135 uint32 flags)
1136{
1137 HeapScanDesc scan;
1138
1139 /*
1140 * increment relation ref count while scanning relation
1141 *
1142 * This is just to make really sure the relcache entry won't go away while
1143 * the scan has a pointer to it. Caller should be holding the rel open
1144 * anyway, so this is redundant in all normal scenarios...
1145 */
1146 RelationIncrementReferenceCount(relation);
1147
1148 /*
1149 * allocate and initialize scan descriptor
1150 */
1151 scan = (HeapScanDesc) palloc(sizeof(HeapScanDescData));
1152
1153 scan->rs_base.rs_rd = relation;
1154 scan->rs_base.rs_snapshot = snapshot;
1155 scan->rs_base.rs_nkeys = nkeys;
1156 scan->rs_base.rs_flags = flags;
1157 scan->rs_base.rs_parallel = parallel_scan;
1158 scan->rs_strategy = NULL; /* set in initscan */
1159
1160 /*
1161 * Disable page-at-a-time mode if it's not an MVCC-safe snapshot.
1162 */
1163 if (!(snapshot && IsMVCCSnapshot(snapshot)))
1164 scan->rs_base.rs_flags &= ~SO_ALLOW_PAGEMODE;
1165
1166 /*
1167 * For seqscan and sample scans in a serializable transaction, acquire a
1168 * predicate lock on the entire relation. This is required not only to
1169 * lock all the matching tuples, but also to conflict with new insertions
1170 * into the table. In an indexscan, we take page locks on the index pages
1171 * covering the range specified in the scan qual, but in a heap scan there
1172 * is nothing more fine-grained to lock. A bitmap scan is a different
1173 * story, there we have already scanned the index and locked the index
1174 * pages covering the predicate. But in that case we still have to lock
1175 * any matching heap tuples. For sample scan we could optimize the locking
1176 * to be at least page-level granularity, but we'd need to add per-tuple
1177 * locking for that.
1178 */
1179 if (scan->rs_base.rs_flags & (SO_TYPE_SEQSCAN | SO_TYPE_SAMPLESCAN))
1180 {
1181 /*
1182 * Ensure a missing snapshot is noticed reliably, even if the
1183 * isolation mode means predicate locking isn't performed (and
1184 * therefore the snapshot isn't used here).
1185 */
1186 Assert(snapshot);
1187 PredicateLockRelation(relation, snapshot);
1188 }
1189
1190 /* we only need to set this up once */
1191 scan->rs_ctup.t_tableOid = RelationGetRelid(relation);
1192
1193 /*
1194 * we do this here instead of in initscan() because heap_rescan also calls
1195 * initscan() and we don't want to allocate memory again
1196 */
1197 if (nkeys > 0)
1198 scan->rs_base.rs_key = (ScanKey) palloc(sizeof(ScanKeyData) * nkeys);
1199 else
1200 scan->rs_base.rs_key = NULL;
1201
1202 initscan(scan, key, false);
1203
1204 return (TableScanDesc) scan;
1205}
1206
1207void
1208heap_rescan(TableScanDesc sscan, ScanKey key, bool set_params,
1209 bool allow_strat, bool allow_sync, bool allow_pagemode)
1210{
1211 HeapScanDesc scan = (HeapScanDesc) sscan;
1212
1213 if (set_params)
1214 {
1215 if (allow_strat)
1216 scan->rs_base.rs_flags |= SO_ALLOW_STRAT;
1217 else
1218 scan->rs_base.rs_flags &= ~SO_ALLOW_STRAT;
1219
1220 if (allow_sync)
1221 scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
1222 else
1223 scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
1224
1225 if (allow_pagemode && scan->rs_base.rs_snapshot &&
1226 IsMVCCSnapshot(scan->rs_base.rs_snapshot))
1227 scan->rs_base.rs_flags |= SO_ALLOW_PAGEMODE;
1228 else
1229 scan->rs_base.rs_flags &= ~SO_ALLOW_PAGEMODE;
1230 }
1231
1232 /*
1233 * unpin scan buffers
1234 */
1235 if (BufferIsValid(scan->rs_cbuf))
1236 ReleaseBuffer(scan->rs_cbuf);
1237
1238 /*
1239 * reinitialize scan descriptor
1240 */
1241 initscan(scan, key, true);
1242}
1243
1244void
1245heap_endscan(TableScanDesc sscan)
1246{
1247 HeapScanDesc scan = (HeapScanDesc) sscan;
1248
1249 /* Note: no locking manipulations needed */
1250
1251 /*
1252 * unpin scan buffers
1253 */
1254 if (BufferIsValid(scan->rs_cbuf))
1255 ReleaseBuffer(scan->rs_cbuf);
1256
1257 /*
1258 * decrement relation reference count and free scan descriptor storage
1259 */
1260 RelationDecrementReferenceCount(scan->rs_base.rs_rd);
1261
1262 if (scan->rs_base.rs_key)
1263 pfree(scan->rs_base.rs_key);
1264
1265 if (scan->rs_strategy != NULL)
1266 FreeAccessStrategy(scan->rs_strategy);
1267
1268 if (scan->rs_base.rs_flags & SO_TEMP_SNAPSHOT)
1269 UnregisterSnapshot(scan->rs_base.rs_snapshot);
1270
1271 pfree(scan);
1272}
1273
1274#ifdef HEAPDEBUGALL
1275#define HEAPDEBUG_1 \
1276 elog(DEBUG2, "heap_getnext([%s,nkeys=%d],dir=%d) called", \
1277 RelationGetRelationName(scan->rs_rd), scan->rs_nkeys, (int) direction)
1278#define HEAPDEBUG_2 \
1279 elog(DEBUG2, "heap_getnext returning EOS")
1280#define HEAPDEBUG_3 \
1281 elog(DEBUG2, "heap_getnext returning tuple")
1282#else
1283#define HEAPDEBUG_1
1284#define HEAPDEBUG_2
1285#define HEAPDEBUG_3
1286#endif /* !defined(HEAPDEBUGALL) */
1287
1288
1289HeapTuple
1290heap_getnext(TableScanDesc sscan, ScanDirection direction)
1291{
1292 HeapScanDesc scan = (HeapScanDesc) sscan;
1293
1294 /*
1295 * This is still widely used directly, without going through table AM, so
1296 * add a safety check. It's possible we should, at a later point,
1297 * downgrade this to an assert. The reason for checking the AM routine,
1298 * rather than the AM oid, is that this allows writing regression tests
1299 * that create another AM reusing the heap handler.
1300 */
1301 if (unlikely(sscan->rs_rd->rd_tableam != GetHeapamTableAmRoutine()))
1302 ereport(ERROR,
1303 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1304 errmsg_internal("only heap AM is supported")));
1305
1306 /* Note: no locking manipulations needed */
1307
1308 HEAPDEBUG_1; /* heap_getnext( info ) */
1309
1310 if (scan->rs_base.rs_flags & SO_ALLOW_PAGEMODE)
1311 heapgettup_pagemode(scan, direction,
1312 scan->rs_base.rs_nkeys, scan->rs_base.rs_key);
1313 else
1314 heapgettup(scan, direction,
1315 scan->rs_base.rs_nkeys, scan->rs_base.rs_key);
1316
1317 if (scan->rs_ctup.t_data == NULL)
1318 {
1319 HEAPDEBUG_2; /* heap_getnext returning EOS */
1320 return NULL;
1321 }
1322
1323 /*
1324 * if we get here it means we have a new current scan tuple, so point to
1325 * the proper return buffer and return the tuple.
1326 */
1327 HEAPDEBUG_3; /* heap_getnext returning tuple */
1328
1329 pgstat_count_heap_getnext(scan->rs_base.rs_rd);
1330
1331 return &scan->rs_ctup;
1332}
1333
1334#ifdef HEAPAMSLOTDEBUGALL
1335#define HEAPAMSLOTDEBUG_1 \
1336 elog(DEBUG2, "heapam_getnextslot([%s,nkeys=%d],dir=%d) called", \
1337 RelationGetRelationName(scan->rs_base.rs_rd), scan->rs_base.rs_nkeys, (int) direction)
1338#define HEAPAMSLOTDEBUG_2 \
1339 elog(DEBUG2, "heapam_getnextslot returning EOS")
1340#define HEAPAMSLOTDEBUG_3 \
1341 elog(DEBUG2, "heapam_getnextslot returning tuple")
1342#else
1343#define HEAPAMSLOTDEBUG_1
1344#define HEAPAMSLOTDEBUG_2
1345#define HEAPAMSLOTDEBUG_3
1346#endif
1347
1348bool
1349heap_getnextslot(TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot)
1350{
1351 HeapScanDesc scan = (HeapScanDesc) sscan;
1352
1353 /* Note: no locking manipulations needed */
1354
1355 HEAPAMSLOTDEBUG_1; /* heap_getnextslot( info ) */
1356
1357 if (sscan->rs_flags & SO_ALLOW_PAGEMODE)
1358 heapgettup_pagemode(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1359 else
1360 heapgettup(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1361
1362 if (scan->rs_ctup.t_data == NULL)
1363 {
1364 HEAPAMSLOTDEBUG_2; /* heap_getnextslot returning EOS */
1365 ExecClearTuple(slot);
1366 return false;
1367 }
1368
1369 /*
1370 * if we get here it means we have a new current scan tuple, so point to
1371 * the proper return buffer and return the tuple.
1372 */
1373 HEAPAMSLOTDEBUG_3; /* heap_getnextslot returning tuple */
1374
1375 pgstat_count_heap_getnext(scan->rs_base.rs_rd);
1376
1377 ExecStoreBufferHeapTuple(&scan->rs_ctup, slot,
1378 scan->rs_cbuf);
1379 return true;
1380}
1381
1382/*
1383 * heap_fetch - retrieve tuple with given tid
1384 *
1385 * On entry, tuple->t_self is the TID to fetch. We pin the buffer holding
1386 * the tuple, fill in the remaining fields of *tuple, and check the tuple
1387 * against the specified snapshot.
1388 *
1389 * If successful (tuple found and passes snapshot time qual), then *userbuf
1390 * is set to the buffer holding the tuple and true is returned. The caller
1391 * must unpin the buffer when done with the tuple.
1392 *
1393 * If the tuple is not found (ie, item number references a deleted slot),
1394 * then tuple->t_data is set to NULL and false is returned.
1395 *
1396 * If the tuple is found but fails the time qual check, then false is returned
1397 * but tuple->t_data is left pointing to the tuple.
1398 *
1399 * heap_fetch does not follow HOT chains: only the exact TID requested will
1400 * be fetched.
1401 *
1402 * It is somewhat inconsistent that we ereport() on invalid block number but
1403 * return false on invalid item number. There are a couple of reasons though.
1404 * One is that the caller can relatively easily check the block number for
1405 * validity, but cannot check the item number without reading the page
1406 * himself. Another is that when we are following a t_ctid link, we can be
1407 * reasonably confident that the page number is valid (since VACUUM shouldn't
1408 * truncate off the destination page without having killed the referencing
1409 * tuple first), but the item number might well not be good.
1410 */
1411bool
1412heap_fetch(Relation relation,
1413 Snapshot snapshot,
1414 HeapTuple tuple,
1415 Buffer *userbuf)
1416{
1417 ItemPointer tid = &(tuple->t_self);
1418 ItemId lp;
1419 Buffer buffer;
1420 Page page;
1421 OffsetNumber offnum;
1422 bool valid;
1423
1424 /*
1425 * Fetch and pin the appropriate page of the relation.
1426 */
1427 buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
1428
1429 /*
1430 * Need share lock on buffer to examine tuple commit status.
1431 */
1432 LockBuffer(buffer, BUFFER_LOCK_SHARE);
1433 page = BufferGetPage(buffer);
1434 TestForOldSnapshot(snapshot, relation, page);
1435
1436 /*
1437 * We'd better check for an out-of-range offnum, in case VACUUM has run
1438 * since the TID was obtained.
1439 */
1440 offnum = ItemPointerGetOffsetNumber(tid);
1441 if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page))
1442 {
1443 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1444 ReleaseBuffer(buffer);
1445 *userbuf = InvalidBuffer;
1446 tuple->t_data = NULL;
1447 return false;
1448 }
1449
1450 /*
1451 * get the item line pointer corresponding to the requested tid
1452 */
1453 lp = PageGetItemId(page, offnum);
1454
1455 /*
1456 * Must check for deleted tuple.
1457 */
1458 if (!ItemIdIsNormal(lp))
1459 {
1460 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1461 ReleaseBuffer(buffer);
1462 *userbuf = InvalidBuffer;
1463 tuple->t_data = NULL;
1464 return false;
1465 }
1466
1467 /*
1468 * fill in *tuple fields
1469 */
1470 tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
1471 tuple->t_len = ItemIdGetLength(lp);
1472 tuple->t_tableOid = RelationGetRelid(relation);
1473
1474 /*
1475 * check tuple visibility, then release lock
1476 */
1477 valid = HeapTupleSatisfiesVisibility(tuple, snapshot, buffer);
1478
1479 if (valid)
1480 PredicateLockTuple(relation, tuple, snapshot);
1481
1482 CheckForSerializableConflictOut(valid, relation, tuple, buffer, snapshot);
1483
1484 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1485
1486 if (valid)
1487 {
1488 /*
1489 * All checks passed, so return the tuple as valid. Caller is now
1490 * responsible for releasing the buffer.
1491 */
1492 *userbuf = buffer;
1493
1494 return true;
1495 }
1496
1497 /* Tuple failed time qual */
1498 ReleaseBuffer(buffer);
1499 *userbuf = InvalidBuffer;
1500
1501 return false;
1502}
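/*
 * Illustrative caller pattern for heap_fetch (a sketch; error handling is
 * omitted and "relation", "snapshot", and "tid" are assumed valid):
 *
 *		HeapTupleData tuple;
 *		Buffer		buffer;
 *
 *		tuple.t_self = *tid;
 *		if (heap_fetch(relation, snapshot, &tuple, &buffer))
 *		{
 *			... use tuple.t_data while the buffer pin is held ...
 *			ReleaseBuffer(buffer);
 *		}
 */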
1503
1504/*
1505 * heap_hot_search_buffer - search HOT chain for tuple satisfying snapshot
1506 *
1507 * On entry, *tid is the TID of a tuple (either a simple tuple, or the root
1508 * of a HOT chain), and buffer is the buffer holding this tuple. We search
1509 * for the first chain member satisfying the given snapshot. If one is
1510 * found, we update *tid to reference that tuple's offset number, and
1511 * return true. If no match, return false without modifying *tid.
1512 *
1513 * heapTuple is a caller-supplied buffer. When a match is found, we return
1514 * the tuple here, in addition to updating *tid. If no match is found, the
1515 * contents of this buffer on return are undefined.
1516 *
1517 * If all_dead is not NULL, we check non-visible tuples to see if they are
1518 * globally dead; *all_dead is set true if all members of the HOT chain
1519 * are vacuumable, false if not.
1520 *
1521 * Unlike heap_fetch, the caller must already have pin and (at least) share
1522 * lock on the buffer; it is still pinned/locked at exit. Also unlike
1523 * heap_fetch, we do not report any pgstats count; caller may do so if wanted.
1524 */
1525bool
1526heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
1527 Snapshot snapshot, HeapTuple heapTuple,
1528 bool *all_dead, bool first_call)
1529{
1530 Page dp = (Page) BufferGetPage(buffer);
1531 TransactionId prev_xmax = InvalidTransactionId;
1532 BlockNumber blkno;
1533 OffsetNumber offnum;
1534 bool at_chain_start;
1535 bool valid;
1536 bool skip;
1537
1538 /* If this is not the first call, previous call returned a (live!) tuple */
1539 if (all_dead)
1540 *all_dead = first_call;
1541
1542 blkno = ItemPointerGetBlockNumber(tid);
1543 offnum = ItemPointerGetOffsetNumber(tid);
1544 at_chain_start = first_call;
1545 skip = !first_call;
1546
1547 Assert(TransactionIdIsValid(RecentGlobalXmin));
1548 Assert(BufferGetBlockNumber(buffer) == blkno);
1549
1550 /* Scan through possible multiple members of HOT-chain */
1551 for (;;)
1552 {
1553 ItemId lp;
1554
1555 /* check for bogus TID */
1556 if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(dp))
1557 break;
1558
1559 lp = PageGetItemId(dp, offnum);
1560
1561 /* check for unused, dead, or redirected items */
1562 if (!ItemIdIsNormal(lp))
1563 {
1564 /* We should only see a redirect at start of chain */
1565 if (ItemIdIsRedirected(lp) && at_chain_start)
1566 {
1567 /* Follow the redirect */
1568 offnum = ItemIdGetRedirect(lp);
1569 at_chain_start = false;
1570 continue;
1571 }
1572 /* else must be end of chain */
1573 break;
1574 }
1575
1576 /*
1577 * Update heapTuple to point to the element of the HOT chain we're
1578 * currently investigating. Having t_self set correctly is important
1579 * because the SSI checks and the *Satisfies routine for historical
1580 * MVCC snapshots need the correct tid to decide about the visibility.
1581 */
1582 heapTuple->t_data = (HeapTupleHeader) PageGetItem(dp, lp);
1583 heapTuple->t_len = ItemIdGetLength(lp);
1584 heapTuple->t_tableOid = RelationGetRelid(relation);
1585 ItemPointerSet(&heapTuple->t_self, blkno, offnum);
1586
1587 /*
1588 * Shouldn't see a HEAP_ONLY tuple at chain start.
1589 */
1590 if (at_chain_start && HeapTupleIsHeapOnly(heapTuple))
1591 break;
1592
1593 /*
1594 * The xmin should match the previous xmax value, else chain is
1595 * broken.
1596 */
1597 if (TransactionIdIsValid(prev_xmax) &&
1598 !TransactionIdEquals(prev_xmax,
1599 HeapTupleHeaderGetXmin(heapTuple->t_data)))
1600 break;
1601
1602 /*
1603 * When first_call is true (and thus, skip is initially false) we'll
1604 * return the first tuple we find. But on later passes, heapTuple
1605 * will initially be pointing to the tuple we returned last time.
1606 * Returning it again would be incorrect (and would loop forever), so
1607 * we skip it and return the next match we find.
1608 */
1609 if (!skip)
1610 {
1611 /* If it's visible per the snapshot, we must return it */
1612 valid = HeapTupleSatisfiesVisibility(heapTuple, snapshot, buffer);
1613 CheckForSerializableConflictOut(valid, relation, heapTuple,
1614 buffer, snapshot);
1615
1616 if (valid)
1617 {
1618 ItemPointerSetOffsetNumber(tid, offnum);
1619 PredicateLockTuple(relation, heapTuple, snapshot);
1620 if (all_dead)
1621 *all_dead = false;
1622 return true;
1623 }
1624 }
1625 skip = false;
1626
1627 /*
1628 * If we can't see it, maybe no one else can either. At caller
1629 * request, check whether all chain members are dead to all
1630 * transactions.
1631 *
1632 * Note: if you change the criterion here for what is "dead", fix the
1633 * planner's get_actual_variable_range() function to match.
1634 */
1635 if (all_dead && *all_dead &&
1636 !HeapTupleIsSurelyDead(heapTuple, RecentGlobalXmin))
1637 *all_dead = false;
1638
1639 /*
1640 * Check to see if HOT chain continues past this tuple; if so fetch
1641 * the next offnum and loop around.
1642 */
1643 if (HeapTupleIsHotUpdated(heapTuple))
1644 {
1645 Assert(ItemPointerGetBlockNumber(&heapTuple->t_data->t_ctid) ==
1646 blkno);
1647 offnum = ItemPointerGetOffsetNumber(&heapTuple->t_data->t_ctid);
1648 at_chain_start = false;
1649 prev_xmax = HeapTupleHeaderGetUpdateXid(heapTuple->t_data);
1650 }
1651 else
1652 break; /* end of chain */
1653 }
1654
1655 return false;
1656}
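/*
 * Illustrative use of heap_hot_search_buffer (a sketch modeled on
 * index-fetch callers; the caller must already hold a pin and at least
 * share lock on "buffer"):
 *
 *		HeapTupleData heapTuple;
 *		bool		all_dead;
 *		bool		found;
 *
 *		found = heap_hot_search_buffer(&tid, relation, buffer, snapshot,
 *									   &heapTuple, &all_dead, true);
 *
 * If found, "tid" now points at the visible chain member and heapTuple holds
 * its contents; passing first_call = false on a later call continues the
 * chain search past the previously returned member.
 */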
1657
1658/*
1659 * heap_get_latest_tid - get the latest tid of a specified tuple
1660 *
1661 * Actually, this gets the latest version that is visible according to the
1662 * scan's snapshot. Create a scan using SnapshotDirty to get the very latest,
1663 * possibly uncommitted version.
1664 *
1665 * *tid is both an input and an output parameter: it is updated to
1666 * show the latest version of the row. Note that it will not be changed
1667 * if no version of the row passes the snapshot test.
1668 */
1669void
1670heap_get_latest_tid(TableScanDesc sscan,
1671 ItemPointer tid)
1672{
1673 Relation relation = sscan->rs_rd;
1674 Snapshot snapshot = sscan->rs_snapshot;
1675 ItemPointerData ctid;
1676 TransactionId priorXmax;
1677
1678 /*
1679 * table_get_latest_tid has verified that the passed-in tid is valid. We
1680 * just assume that t_ctid links are valid, however; there shouldn't be
1681 * invalid ones in the table.
1682 */
1683 Assert(ItemPointerIsValid(tid));
1684
1685 /*
1686 * Loop to chase down t_ctid links. At top of loop, ctid is the tuple we
1687 * need to examine, and *tid is the TID we will return if ctid turns out
1688 * to be bogus.
1689 *
1690 * Note that we will loop until we reach the end of the t_ctid chain.
1691 * Depending on the snapshot passed, there might be at most one visible
1692 * version of the row, but we don't try to optimize for that.
1693 */
1694 ctid = *tid;
1695 priorXmax = InvalidTransactionId; /* cannot check first XMIN */
1696 for (;;)
1697 {
1698 Buffer buffer;
1699 Page page;
1700 OffsetNumber offnum;
1701 ItemId lp;
1702 HeapTupleData tp;
1703 bool valid;
1704
1705 /*
1706 * Read, pin, and lock the page.
1707 */
1708 buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&ctid));
1709 LockBuffer(buffer, BUFFER_LOCK_SHARE);
1710 page = BufferGetPage(buffer);
1711 TestForOldSnapshot(snapshot, relation, page);
1712
1713 /*
1714 * Check for bogus item number. This is not treated as an error
1715 * condition because it can happen while following a t_ctid link. We
1716 * just assume that the prior tid is OK and return it unchanged.
1717 */
1718 offnum = ItemPointerGetOffsetNumber(&ctid);
1719 if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page))
1720 {
1721 UnlockReleaseBuffer(buffer);
1722 break;
1723 }
1724 lp = PageGetItemId(page, offnum);
1725 if (!ItemIdIsNormal(lp))
1726 {
1727 UnlockReleaseBuffer(buffer);
1728 break;
1729 }
1730
1731 /* OK to access the tuple */
1732 tp.t_self = ctid;
1733 tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
1734 tp.t_len = ItemIdGetLength(lp);
1735 tp.t_tableOid = RelationGetRelid(relation);
1736
1737 /*
1738 * After following a t_ctid link, we might arrive at an unrelated
1739 * tuple. Check for XMIN match.
1740 */
1741 if (TransactionIdIsValid(priorXmax) &&
1742 !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(tp.t_data)))
1743 {
1744 UnlockReleaseBuffer(buffer);
1745 break;
1746 }
1747
1748 /*
1749 * Check tuple visibility; if visible, set it as the new result
1750 * candidate.
1751 */
1752 valid = HeapTupleSatisfiesVisibility(&tp, snapshot, buffer);
1753 CheckForSerializableConflictOut(valid, relation, &tp, buffer, snapshot);
1754 if (valid)
1755 *tid = ctid;
1756
1757 /*
1758 * If there's a valid t_ctid link, follow it, else we're done.
1759 */
1760 if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
1761 HeapTupleHeaderIsOnlyLocked(tp.t_data) ||
1762 HeapTupleHeaderIndicatesMovedPartitions(tp.t_data) ||
1763 ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid))
1764 {
1765 UnlockReleaseBuffer(buffer);
1766 break;
1767 }
1768
1769 ctid = tp.t_data->t_ctid;
1770 priorXmax = HeapTupleHeaderGetUpdateXid(tp.t_data);
1771 UnlockReleaseBuffer(buffer);
1772 } /* end of loop */
1773}
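/*
 * Illustrative call (a sketch; "scan" is an open TableScanDesc on the table
 * and "tid" initially holds some visible version's TID):
 *
 *		heap_get_latest_tid(scan, &tid);
 *
 * On return, "tid" has been advanced along the t_ctid chain to the latest
 * version visible to the scan's snapshot, or is unchanged if no later
 * version qualifies.
 */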
1774
1775
1776/*
1777 * UpdateXmaxHintBits - update tuple hint bits after xmax transaction ends
1778 *
1779 * This is called after we have waited for the XMAX transaction to terminate.
1780 * If the transaction aborted, we guarantee the XMAX_INVALID hint bit will
1781 * be set on exit. If the transaction committed, we set the XMAX_COMMITTED
1782 * hint bit if possible --- but beware that that may not yet be possible,
1783 * if the transaction committed asynchronously.
1784 *
1785 * Note that if the transaction was a locker only, we set HEAP_XMAX_INVALID
1786 * even if it commits.
1787 *
1788 * Hence callers should look only at XMAX_INVALID.
1789 *
1790 * Note this is not allowed for tuples whose xmax is a multixact.
1791 */
1792static void
1793UpdateXmaxHintBits(HeapTupleHeader tuple, Buffer buffer, TransactionId xid)
1794{
1795 Assert(TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple), xid));
1796 Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI));
1797
1798 if (!(tuple->t_infomask & (HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID)))
1799 {
1800 if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask) &&
1801 TransactionIdDidCommit(xid))
1802 HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED,
1803 xid);
1804 else
1805 HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
1806 InvalidTransactionId);
1807 }
1808}
1809
1810
1811/*
1812 * GetBulkInsertState - prepare status object for a bulk insert
1813 */
1814BulkInsertState
1815GetBulkInsertState(void)
1816{
1817 BulkInsertState bistate;
1818
1819 bistate = (BulkInsertState) palloc(sizeof(BulkInsertStateData));
1820 bistate->strategy = GetAccessStrategy(BAS_BULKWRITE);
1821 bistate->current_buf = InvalidBuffer;
1822 return bistate;
1823}
1824
1825/*
1826 * FreeBulkInsertState - clean up after finishing a bulk insert
1827 */
1828void
1829FreeBulkInsertState(BulkInsertState bistate)
1830{
1831 if (bistate->current_buf != InvalidBuffer)
1832 ReleaseBuffer(bistate->current_buf);
1833 FreeAccessStrategy(bistate->strategy);
1834 pfree(bistate);
1835}
1836
1837/*
1838 * ReleaseBulkInsertStatePin - release a buffer currently held in bistate
1839 */
1840void
1841ReleaseBulkInsertStatePin(BulkInsertState bistate)
1842{
1843 if (bistate->current_buf != InvalidBuffer)
1844 ReleaseBuffer(bistate->current_buf);
1845 bistate->current_buf = InvalidBuffer;
1846}
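
/*
 * Illustrative sketch (annotation added for this edit, not part of the
 * original file): how the three routines above are typically combined with
 * heap_insert() by a bulk loader such as COPY FROM.  "rel" and the loop that
 * produces tuples are assumed to be supplied by the caller.
 *
 *    BulkInsertState bistate = GetBulkInsertState();
 *    CommandId   cid = GetCurrentCommandId(true);
 *    HeapTuple   tup;
 *
 *    while ((tup = ...next tuple to load...) != NULL)
 *        heap_insert(rel, tup, cid, 0, bistate);
 *    FreeBulkInsertState(bistate);
 */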
1847
1848
1849/*
1850 * heap_insert - insert tuple into a heap
1851 *
1852 * The new tuple is stamped with current transaction ID and the specified
1853 * command ID.
1854 *
1855 * See table_tuple_insert for comments about most of the input flags, except
1856 * that this routine directly takes a tuple rather than a slot.
1857 *
1858 * There are corresponding HEAP_INSERT_ options for all the TABLE_INSERT_
1859 * options, and there is additionally HEAP_INSERT_SPECULATIVE, which is used to
1860 * implement table_tuple_insert_speculative().
1861 *
1862 * On return the header fields of *tup are updated to match the stored tuple;
1863 * in particular tup->t_self receives the actual TID where the tuple was
1864 * stored. But note that any toasting of fields within the tuple data is NOT
1865 * reflected into *tup.
1866 */
1867void
1868heap_insert(Relation relation, HeapTuple tup, CommandId cid,
1869 int options, BulkInsertState bistate)
1870{
1871 TransactionId xid = GetCurrentTransactionId();
1872 HeapTuple heaptup;
1873 Buffer buffer;
1874 Buffer vmbuffer = InvalidBuffer;
1875 bool all_visible_cleared = false;
1876
1877 /*
1878 * Fill in tuple header fields and toast the tuple if necessary.
1879 *
1880 * Note: below this point, heaptup is the data we actually intend to store
1881 * into the relation; tup is the caller's original untoasted data.
1882 */
1883 heaptup = heap_prepare_insert(relation, tup, xid, cid, options);
1884
1885 /*
1886 * Find buffer to insert this tuple into. If the page is all visible,
1887 * this will also pin the requisite visibility map page.
1888 */
1889 buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
1890 InvalidBuffer, options, bistate,
1891 &vmbuffer, NULL);
1892
1893 /*
1894 * We're about to do the actual insert -- but check for conflict first, to
1895 * avoid possibly having to roll back work we've just done.
1896 *
1897 * This is safe without a recheck as long as there is no possibility of
1898 * another process scanning the page between this check and the insert
1899 * being visible to the scan (i.e., an exclusive buffer content lock is
1900 * continuously held from this point until the tuple insert is visible).
1901 *
1902 * For a heap insert, we only need to check for table-level SSI locks. Our
1903 * new tuple can't possibly conflict with existing tuple locks, and heap
1904 * page locks are only consolidated versions of tuple locks; they do not
1905 * lock "gaps" as index page locks do. So we don't need to specify a
1906 * buffer when making the call, which makes for a faster check.
1907 */
1908 CheckForSerializableConflictIn(relation, NULL, InvalidBuffer);
1909
1910 /* NO EREPORT(ERROR) from here till changes are logged */
1911 START_CRIT_SECTION();
1912
1913 RelationPutHeapTuple(relation, buffer, heaptup,
1914 (options & HEAP_INSERT_SPECULATIVE) != 0);
1915
1916 if (PageIsAllVisible(BufferGetPage(buffer)))
1917 {
1918 all_visible_cleared = true;
1919 PageClearAllVisible(BufferGetPage(buffer));
1920 visibilitymap_clear(relation,
1921 ItemPointerGetBlockNumber(&(heaptup->t_self)),
1922 vmbuffer, VISIBILITYMAP_VALID_BITS);
1923 }
1924
1925 /*
1926 * XXX Should we set PageSetPrunable on this page ?
1927 *
1928 * The inserting transaction may eventually abort thus making this tuple
1929 * DEAD and hence available for pruning. Though we don't want to optimize
1930 * for aborts, if no other tuple in this page is UPDATEd/DELETEd, the
1931 * aborted tuple will never be pruned until the next vacuum is triggered.
1932 *
1933 * If you do add PageSetPrunable here, add it in heap_xlog_insert too.
1934 */
1935
1936 MarkBufferDirty(buffer);
1937
1938 /* XLOG stuff */
1939 if (!(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation))
1940 {
1941 xl_heap_insert xlrec;
1942 xl_heap_header xlhdr;
1943 XLogRecPtr recptr;
1944 Page page = BufferGetPage(buffer);
1945 uint8 info = XLOG_HEAP_INSERT;
1946 int bufflags = 0;
1947
1948 /*
1949 * If this is a catalog, logical decoding needs the combocids to
1950 * decode the change properly, so log them as well.
1951 */
1952 if (RelationIsAccessibleInLogicalDecoding(relation))
1953 log_heap_new_cid(relation, heaptup);
1954
1955 /*
1956 * If this is the first and only tuple on the page, we can reinit the
1957 * page instead of restoring the whole thing. Set flag, and hide
1958 * buffer references from XLogInsert.
1959 */
1960 if (ItemPointerGetOffsetNumber(&(heaptup->t_self)) == FirstOffsetNumber &&
1961 PageGetMaxOffsetNumber(page) == FirstOffsetNumber)
1962 {
1963 info |= XLOG_HEAP_INIT_PAGE;
1964 bufflags |= REGBUF_WILL_INIT;
1965 }
1966
1967 xlrec.offnum = ItemPointerGetOffsetNumber(&heaptup->t_self);
1968 xlrec.flags = 0;
1969 if (all_visible_cleared)
1970 xlrec.flags |= XLH_INSERT_ALL_VISIBLE_CLEARED;
1971 if (options & HEAP_INSERT_SPECULATIVE)
1972 xlrec.flags |= XLH_INSERT_IS_SPECULATIVE;
1973 Assert(ItemPointerGetBlockNumber(&heaptup->t_self) == BufferGetBlockNumber(buffer));
1974
1975 /*
1976 * For logical decoding, we need the tuple even if we're doing a full
1977 * page write, so make sure it's included even if we take a full-page
1978 * image. (XXX We could alternatively store a pointer into the FPW).
1979 */
1980 if (RelationIsLogicallyLogged(relation) &&
1981 !(options & HEAP_INSERT_NO_LOGICAL))
1982 {
1983 xlrec.flags |= XLH_INSERT_CONTAINS_NEW_TUPLE;
1984 bufflags |= REGBUF_KEEP_DATA;
1985 }
1986
1987 XLogBeginInsert();
1988 XLogRegisterData((char *) &xlrec, SizeOfHeapInsert);
1989
1990 xlhdr.t_infomask2 = heaptup->t_data->t_infomask2;
1991 xlhdr.t_infomask = heaptup->t_data->t_infomask;
1992 xlhdr.t_hoff = heaptup->t_data->t_hoff;
1993
1994 /*
1995 * note we mark xlhdr as belonging to buffer; if XLogInsert decides to
1996 * write the whole page to the xlog, we don't need to store
1997 * xl_heap_header in the xlog.
1998 */
1999 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags);
2000 XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader);
2001 /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
2002 XLogRegisterBufData(0,
2003 (char *) heaptup->t_data + SizeofHeapTupleHeader,
2004 heaptup->t_len - SizeofHeapTupleHeader);
2005
2006 /* filtering by origin on a row level is much more efficient */
2007 XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
2008
2009 recptr = XLogInsert(RM_HEAP_ID, info);
2010
2011 PageSetLSN(page, recptr);
2012 }
2013
2014 END_CRIT_SECTION();
2015
2016 UnlockReleaseBuffer(buffer);
2017 if (vmbuffer != InvalidBuffer)
2018 ReleaseBuffer(vmbuffer);
2019
2020 /*
2021 * If tuple is cachable, mark it for invalidation from the caches in case
2022 * we abort. Note it is OK to do this after releasing the buffer, because
2023 * the heaptup data structure is all in local memory, not in the shared
2024 * buffer.
2025 */
2026 CacheInvalidateHeapTuple(relation, heaptup, NULL);
2027
2028 /* Note: speculative insertions are counted too, even if aborted later */
2029 pgstat_count_heap_insert(relation, 1);
2030
2031 /*
2032 * If heaptup is a private copy, release it. Don't forget to copy t_self
2033 * back to the caller's image, too.
2034 */
2035 if (heaptup != tup)
2036 {
2037 tup->t_self = heaptup->t_self;
2038 heap_freetuple(heaptup);
2039 }
2040}
2041
2042/*
2043 * Subroutine for heap_insert(). Prepares a tuple for insertion. This sets the
2044 * tuple header fields and toasts the tuple if necessary. Returns a toasted
2045 * version of the tuple if it was toasted, or the original tuple if not. Note
2046 * that in any case, the header fields are also set in the original tuple.
2047 */
2048static HeapTuple
2049heap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid,
2050 CommandId cid, int options)
2051{
2052 /*
2053 * Parallel operations are required to be strictly read-only in a parallel
2054 * worker. Parallel inserts are not safe even in the leader in the
2055 * general case, because group locking means that heavyweight locks for
2056 * relation extension or GIN page locks will not conflict between members
2057 * of a lock group. We don't prohibit that case here, however, because
2058 * there are useful special cases that we can safely allow, such as CREATE TABLE AS.
2059 */
2060 if (IsParallelWorker())
2061 ereport(ERROR,
2062 (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
2063 errmsg("cannot insert tuples in a parallel worker")));
2064
2065 tup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
2066 tup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
2067 tup->t_data->t_infomask |= HEAP_XMAX_INVALID;
2068 HeapTupleHeaderSetXmin(tup->t_data, xid);
2069 if (options & HEAP_INSERT_FROZEN)
2070 HeapTupleHeaderSetXminFrozen(tup->t_data);
2071
2072 HeapTupleHeaderSetCmin(tup->t_data, cid);
2073 HeapTupleHeaderSetXmax(tup->t_data, 0); /* for cleanliness */
2074 tup->t_tableOid = RelationGetRelid(relation);
2075
2076 /*
2077 * If the new tuple is too big for storage or contains already toasted
2078 * out-of-line attributes from some other relation, invoke the toaster.
2079 */
2080 if (relation->rd_rel->relkind != RELKIND_RELATION &&
2081 relation->rd_rel->relkind != RELKIND_MATVIEW)
2082 {
2083 /* toast table entries should never be recursively toasted */
2084 Assert(!HeapTupleHasExternal(tup));
2085 return tup;
2086 }
2087 else if (HeapTupleHasExternal(tup) || tup->t_len > TOAST_TUPLE_THRESHOLD)
2088 return toast_insert_or_update(relation, tup, NULL, options);
2089 else
2090 return tup;
2091}
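
/*
 * Worked example (annotation added for this edit, not part of the original
 * file): with the default 8 kB block size, TOAST_TUPLE_THRESHOLD is roughly
 * 2 kB, so for a plain table or matview a tuple longer than about 2 kB, or
 * one that already carries out-of-line values, goes through
 * toast_insert_or_update() here, while smaller tuples are returned unchanged.
 */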
2092
2093/*
2094 * heap_multi_insert - insert multiple tuples into a heap
2095 *
2096 * This is like heap_insert(), but inserts multiple tuples in one operation.
2097 * That's faster than calling heap_insert() in a loop, because when multiple
2098 * tuples can be inserted on a single page, we can write just a single WAL
2099 * record covering all of them, and only need to lock/unlock the page once.
2100 *
2101 * Note: this leaks memory into the current memory context. You can create a
2102 * temporary context before calling this, if that's a problem.
2103 */
2104void
2105heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples,
2106 CommandId cid, int options, BulkInsertState bistate)
2107{
2108 TransactionId xid = GetCurrentTransactionId();
2109 HeapTuple *heaptuples;
2110 int i;
2111 int ndone;
2112 PGAlignedBlock scratch;
2113 Page page;
2114 bool needwal;
2115 Size saveFreeSpace;
2116 bool need_tuple_data = RelationIsLogicallyLogged(relation);
2117 bool need_cids = RelationIsAccessibleInLogicalDecoding(relation);
2118
2119 /* currently not needed (thus unsupported) for heap_multi_insert() */
2120 AssertArg(!(options & HEAP_INSERT_NO_LOGICAL));
2121
2122 needwal = !(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation);
2123 saveFreeSpace = RelationGetTargetPageFreeSpace(relation,
2124 HEAP_DEFAULT_FILLFACTOR);
2125
2126 /* Toast and set header data in all the slots */
2127 heaptuples = palloc(ntuples * sizeof(HeapTuple));
2128 for (i = 0; i < ntuples; i++)
2129 {
2130 HeapTuple tuple;
2131
2132 tuple = ExecFetchSlotHeapTuple(slots[i], true, NULL);
2133 slots[i]->tts_tableOid = RelationGetRelid(relation);
2134 tuple->t_tableOid = slots[i]->tts_tableOid;
2135 heaptuples[i] = heap_prepare_insert(relation, tuple, xid, cid,
2136 options);
2137 }
2138
2139 /*
2140 * We're about to do the actual inserts -- but check for conflict first,
2141 * to minimize the possibility of having to roll back work we've just
2142 * done.
2143 *
2144 * A check here does not definitively prevent a serialization anomaly;
2145 * that check MUST be done at least past the point of acquiring an
2146 * exclusive buffer content lock on every buffer that will be affected,
2147 * and MAY be done after all inserts are reflected in the buffers and
2148 * those locks are released; otherwise there is a race condition. Since
2149 * multiple buffers can be locked and unlocked in the loop below, and it
2150 * would not be feasible to identify and lock all of those buffers before
2151 * the loop, we must do a final check at the end.
2152 *
2153 * The check here could be omitted with no loss of correctness; it is
2154 * present strictly as an optimization.
2155 *
2156 * For heap inserts, we only need to check for table-level SSI locks. Our
2157 * new tuples can't possibly conflict with existing tuple locks, and heap
2158 * page locks are only consolidated versions of tuple locks; they do not
2159 * lock "gaps" as index page locks do. So we don't need to specify a
2160 * buffer when making the call, which makes for a faster check.
2161 */
2162 CheckForSerializableConflictIn(relation, NULL, InvalidBuffer);
2163
2164 ndone = 0;
2165 while (ndone < ntuples)
2166 {
2167 Buffer buffer;
2168 Buffer vmbuffer = InvalidBuffer;
2169 bool all_visible_cleared = false;
2170 int nthispage;
2171
2172 CHECK_FOR_INTERRUPTS();
2173
2174 /*
2175 * Find buffer where at least the next tuple will fit. If the page is
2176 * all-visible, this will also pin the requisite visibility map page.
2177 */
2178 buffer = RelationGetBufferForTuple(relation, heaptuples[ndone]->t_len,
2179 InvalidBuffer, options, bistate,
2180 &vmbuffer, NULL);
2181 page = BufferGetPage(buffer);
2182
2183 /* NO EREPORT(ERROR) from here till changes are logged */
2184 START_CRIT_SECTION();
2185
2186 /*
2187 * RelationGetBufferForTuple has ensured that the first tuple fits.
2188 * Put that on the page, and then as many other tuples as fit.
2189 */
2190 RelationPutHeapTuple(relation, buffer, heaptuples[ndone], false);
2191 for (nthispage = 1; ndone + nthispage < ntuples; nthispage++)
2192 {
2193 HeapTuple heaptup = heaptuples[ndone + nthispage];
2194
2195 if (PageGetHeapFreeSpace(page) < MAXALIGN(heaptup->t_len) + saveFreeSpace)
2196 break;
2197
2198 RelationPutHeapTuple(relation, buffer, heaptup, false);
2199
2200 /*
2201 * We don't use heap_multi_insert for catalog tuples yet, but
2202 * better be prepared...
2203 */
2204 if (needwal && need_cids)
2205 log_heap_new_cid(relation, heaptup);
2206 }
2207
2208 if (PageIsAllVisible(page))
2209 {
2210 all_visible_cleared = true;
2211 PageClearAllVisible(page);
2212 visibilitymap_clear(relation,
2213 BufferGetBlockNumber(buffer),
2214 vmbuffer, VISIBILITYMAP_VALID_BITS);
2215 }
2216
2217 /*
2218 * XXX Should we set PageSetPrunable on this page ? See heap_insert()
2219 */
2220
2221 MarkBufferDirty(buffer);
2222
2223 /* XLOG stuff */
2224 if (needwal)
2225 {
2226 XLogRecPtr recptr;
2227 xl_heap_multi_insert *xlrec;
2228 uint8 info = XLOG_HEAP2_MULTI_INSERT;
2229 char *tupledata;
2230 int totaldatalen;
2231 char *scratchptr = scratch.data;
2232 bool init;
2233 int bufflags = 0;
2234
2235 /*
2236 * If the page was previously empty, we can reinit the page
2237 * instead of restoring the whole thing.
2238 */
2239 init = (ItemPointerGetOffsetNumber(&(heaptuples[ndone]->t_self)) == FirstOffsetNumber &&
2240 PageGetMaxOffsetNumber(page) == FirstOffsetNumber + nthispage - 1);
2241
2242 /* allocate xl_heap_multi_insert struct from the scratch area */
2243 xlrec = (xl_heap_multi_insert *) scratchptr;
2244 scratchptr += SizeOfHeapMultiInsert;
2245
2246 /*
2247 * Allocate the offsets array, unless we're reinitializing the page;
2248 * in that case the tuples are stored in order starting at
2249 * FirstOffsetNumber, so we don't need to store the offsets
2250 * explicitly.
2251 */
2252 if (!init)
2253 scratchptr += nthispage * sizeof(OffsetNumber);
2254
2255 /* the rest of the scratch space is used for tuple data */
2256 tupledata = scratchptr;
2257
2258 xlrec->flags = all_visible_cleared ? XLH_INSERT_ALL_VISIBLE_CLEARED : 0;
2259 xlrec->ntuples = nthispage;
2260
2261 /*
2262 * Write out an xl_multi_insert_tuple and the tuple data itself
2263 * for each tuple.
2264 */
2265 for (i = 0; i < nthispage; i++)
2266 {
2267 HeapTuple heaptup = heaptuples[ndone + i];
2268 xl_multi_insert_tuple *tuphdr;
2269 int datalen;
2270
2271 if (!init)
2272 xlrec->offsets[i] = ItemPointerGetOffsetNumber(&heaptup->t_self);
2273 /* xl_multi_insert_tuple needs two-byte alignment. */
2274 tuphdr = (xl_multi_insert_tuple *) SHORTALIGN(scratchptr);
2275 scratchptr = ((char *) tuphdr) + SizeOfMultiInsertTuple;
2276
2277 tuphdr->t_infomask2 = heaptup->t_data->t_infomask2;
2278 tuphdr->t_infomask = heaptup->t_data->t_infomask;
2279 tuphdr->t_hoff = heaptup->t_data->t_hoff;
2280
2281 /* write bitmap [+ padding] [+ oid] + data */
2282 datalen = heaptup->t_len - SizeofHeapTupleHeader;
2283 memcpy(scratchptr,
2284 (char *) heaptup->t_data + SizeofHeapTupleHeader,
2285 datalen);
2286 tuphdr->datalen = datalen;
2287 scratchptr += datalen;
2288 }
2289 totaldatalen = scratchptr - tupledata;
2290 Assert((scratchptr - scratch.data) < BLCKSZ);
2291
2292 if (need_tuple_data)
2293 xlrec->flags |= XLH_INSERT_CONTAINS_NEW_TUPLE;
2294
2295 /*
2296 * Signal that this is the last xl_heap_multi_insert record
2297 * emitted by this call to heap_multi_insert(). Needed for logical
2298 * decoding so it knows when to clean up temporary data.
2299 */
2300 if (ndone + nthispage == ntuples)
2301 xlrec->flags |= XLH_INSERT_LAST_IN_MULTI;
2302
2303 if (init)
2304 {
2305 info |= XLOG_HEAP_INIT_PAGE;
2306 bufflags |= REGBUF_WILL_INIT;
2307 }
2308
2309 /*
2310 * If we're doing logical decoding, include the new tuple data
2311 * even if we take a full-page image of the page.
2312 */
2313 if (need_tuple_data)
2314 bufflags |= REGBUF_KEEP_DATA;
2315
2316 XLogBeginInsert();
2317 XLogRegisterData((char *) xlrec, tupledata - scratch.data);
2318 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags);
2319
2320 XLogRegisterBufData(0, tupledata, totaldatalen);
2321
2322 /* filtering by origin on a row level is much more efficient */
2323 XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
2324
2325 recptr = XLogInsert(RM_HEAP2_ID, info);
2326
2327 PageSetLSN(page, recptr);
2328 }
2329
2330 END_CRIT_SECTION();
2331
2332 UnlockReleaseBuffer(buffer);
2333 if (vmbuffer != InvalidBuffer)
2334 ReleaseBuffer(vmbuffer);
2335
2336 ndone += nthispage;
2337 }
2338
2339 /*
2340 * We're done with the actual inserts. Check for conflicts again, to
2341 * ensure that all rw-conflicts in to these inserts are detected. Without
2342 * this final check, a sequential scan of the heap may have locked the
2343 * table after the "before" check, missing one opportunity to detect the
2344 * conflict, and then scanned the table before the new tuples were there,
2345 * missing the other chance to detect the conflict.
2346 *
2347 * For heap inserts, we only need to check for table-level SSI locks. Our
2348 * new tuples can't possibly conflict with existing tuple locks, and heap
2349 * page locks are only consolidated versions of tuple locks; they do not
2350 * lock "gaps" as index page locks do. So we don't need to specify a
2351 * buffer when making the call.
2352 */
2353 CheckForSerializableConflictIn(relation, NULL, InvalidBuffer);
2354
2355 /*
2356 * If tuples are cachable, mark them for invalidation from the caches in
2357 * case we abort. Note it is OK to do this after releasing the buffer,
2358 * because the heaptuples data structure is all in local memory, not in
2359 * the shared buffer.
2360 */
2361 if (IsCatalogRelation(relation))
2362 {
2363 for (i = 0; i < ntuples; i++)
2364 CacheInvalidateHeapTuple(relation, heaptuples[i], NULL);
2365 }
2366
2367 /* copy t_self fields back to the caller's slots */
2368 for (i = 0; i < ntuples; i++)
2369 slots[i]->tts_tid = heaptuples[i]->t_self;
2370
2371 pgstat_count_heap_insert(relation, ntuples);
2372}
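
/*
 * Illustrative sketch (annotation added for this edit, not part of the
 * original file): a minimal caller of heap_multi_insert().  The source
 * tuples are first stored into heap-tuple slots; "rel", "tupdesc", "tuples"
 * and "ntups" are assumed to exist, and the slot routines come from
 * executor/tuptable.h.
 *
 *    TupleTableSlot **slots = palloc(ntups * sizeof(TupleTableSlot *));
 *
 *    for (i = 0; i < ntups; i++)
 *    {
 *        slots[i] = MakeSingleTupleTableSlot(tupdesc, &TTSOpsHeapTuple);
 *        ExecStoreHeapTuple(tuples[i], slots[i], false);
 *    }
 *    heap_multi_insert(rel, slots, ntups, GetCurrentCommandId(true), 0, NULL);
 *    for (i = 0; i < ntups; i++)
 *        ExecDropSingleTupleTableSlot(slots[i]);
 */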
2373
2374/*
2375 * simple_heap_insert - insert a tuple
2376 *
2377 * Currently, this routine differs from heap_insert only in supplying
2378 * a default command ID and not allowing access to the speedup options.
2379 *
2380 * This should be used rather than using heap_insert directly in most places
2381 * where we are modifying system catalogs.
2382 */
2383void
2384simple_heap_insert(Relation relation, HeapTuple tup)
2385{
2386 heap_insert(relation, tup, GetCurrentCommandId(true), 0, NULL);
2387}
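
/*
 * Illustrative sketch (annotation added for this edit, not part of the
 * original file): forming a tuple and inserting it with simple_heap_insert().
 * "rel", "values" and "nulls" are assumed to be set up by the caller; note
 * that the caller must also create any index entries (catalog code normally
 * uses the CatalogTupleInsert() wrapper, which handles that).
 *
 *    HeapTuple   tup;
 *
 *    tup = heap_form_tuple(RelationGetDescr(rel), values, nulls);
 *    simple_heap_insert(rel, tup);
 *    ... insert index entries for tup here ...
 *    heap_freetuple(tup);
 */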
2388
2389/*
2390 * Given infomask/infomask2, compute the bits that must be saved in the
2391 * "infobits" field of xl_heap_delete, xl_heap_update, xl_heap_lock,
2392 * xl_heap_lock_updated WAL records.
2393 *
2394 * See fix_infomask_from_infobits.
2395 */
2396static uint8
2397compute_infobits(uint16 infomask, uint16 infomask2)
2398{
2399 return
2400 ((infomask & HEAP_XMAX_IS_MULTI) != 0 ? XLHL_XMAX_IS_MULTI : 0) |
2401 ((infomask & HEAP_XMAX_LOCK_ONLY) != 0 ? XLHL_XMAX_LOCK_ONLY : 0) |
2402 ((infomask & HEAP_XMAX_EXCL_LOCK) != 0 ? XLHL_XMAX_EXCL_LOCK : 0) |
2403 /* note we ignore HEAP_XMAX_SHR_LOCK here */
2404 ((infomask & HEAP_XMAX_KEYSHR_LOCK) != 0 ? XLHL_XMAX_KEYSHR_LOCK : 0) |
2405 ((infomask2 & HEAP_KEYS_UPDATED) != 0 ?
2406 XLHL_KEYS_UPDATED : 0);
2407}
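
/*
 * Worked example (annotation added for this edit, not part of the original
 * file): for a tuple that is exclusively locked but not updated,
 *
 *    compute_infobits(HEAP_XMAX_EXCL_LOCK | HEAP_XMAX_LOCK_ONLY, 0)
 *
 * returns XLHL_XMAX_EXCL_LOCK | XLHL_XMAX_LOCK_ONLY;
 * fix_infomask_from_infobits() performs the reverse mapping at replay time.
 */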
2408
2409/*
2410 * Given two versions of the same t_infomask for a tuple, compare them and
2411 * return whether the relevant status for a tuple Xmax has changed. This is
2412 * used after a buffer lock has been released and reacquired: we want to ensure
2413 * that the tuple state continues to be the same as it was when we previously
2414 * examined it.
2415 *
2416 * Note the Xmax field itself must be compared separately.
2417 */
2418static inline bool
2419xmax_infomask_changed(uint16 new_infomask, uint16 old_infomask)
2420{
2421 const uint16 interesting =
2422 HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY | HEAP_LOCK_MASK;
2423
2424 if ((new_infomask & interesting) != (old_infomask & interesting))
2425 return true;
2426
2427 return false;
2428}
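
/*
 * Illustrative sketch (annotation added for this edit, not part of the
 * original file): the recheck pattern this helper supports, as used by
 * heap_delete(), heap_update() and heap_lock_tuple() after the buffer lock
 * has been dropped and re-taken.  "xwait" and "infomask" hold the values
 * copied before the lock was released.
 *
 *    if (xmax_infomask_changed(tup->t_data->t_infomask, infomask) ||
 *        !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tup->t_data), xwait))
 *        goto restart;    ... tuple state changed under us; start over ...
 */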
2429
2430/*
2431 * heap_delete - delete a tuple
2432 *
2433 * See table_tuple_delete() for an explanation of the parameters, except that
2434 * this routine directly takes a tuple rather than a slot.
2435 *
2436 * In the failure cases, the routine fills *tmfd with the tuple's t_ctid,
2437 * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last
2438 * only for TM_SelfModified, since we cannot obtain cmax from a combocid
2439 * generated by another transaction).
2440 */
2441TM_Result
2442heap_delete(Relation relation, ItemPointer tid,
2443 CommandId cid, Snapshot crosscheck, bool wait,
2444 TM_FailureData *tmfd, bool changingPart)
2445{
2446 TM_Result result;
2447 TransactionId xid = GetCurrentTransactionId();
2448 ItemId lp;
2449 HeapTupleData tp;
2450 Page page;
2451 BlockNumber block;
2452 Buffer buffer;
2453 Buffer vmbuffer = InvalidBuffer;
2454 TransactionId new_xmax;
2455 uint16 new_infomask,
2456 new_infomask2;
2457 bool have_tuple_lock = false;
2458 bool iscombo;
2459 bool all_visible_cleared = false;
2460 HeapTuple old_key_tuple = NULL; /* replica identity of the tuple */
2461 bool old_key_copied = false;
2462
2463 Assert(ItemPointerIsValid(tid));
2464
2465 /*
2466 * Forbid this during a parallel operation, lest it allocate a combocid.
2467 * Other workers might need that combocid for visibility checks, and we
2468 * have no provision for broadcasting it to them.
2469 */
2470 if (IsInParallelMode())
2471 ereport(ERROR,
2472 (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
2473 errmsg("cannot delete tuples during a parallel operation")));
2474
2475 block = ItemPointerGetBlockNumber(tid);
2476 buffer = ReadBuffer(relation, block);
2477 page = BufferGetPage(buffer);
2478
2479 /*
2480 * Before locking the buffer, pin the visibility map page if it appears to
2481 * be necessary. Since we haven't got the lock yet, someone else might be
2482 * in the middle of changing this, so we'll need to recheck after we have
2483 * the lock.
2484 */
2485 if (PageIsAllVisible(page))
2486 visibilitymap_pin(relation, block, &vmbuffer);
2487
2488 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2489
2490 /*
2491 * If we didn't pin the visibility map page and the page has become all
2492 * visible while we were busy locking the buffer, we'll have to unlock and
2493 * re-lock, to avoid holding the buffer lock across an I/O. That's a bit
2494 * unfortunate, but hopefully shouldn't happen often.
2495 */
2496 if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
2497 {
2498 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2499 visibilitymap_pin(relation, block, &vmbuffer);
2500 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2501 }
2502
2503 lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
2504 Assert(ItemIdIsNormal(lp));
2505
2506 tp.t_tableOid = RelationGetRelid(relation);
2507 tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
2508 tp.t_len = ItemIdGetLength(lp);
2509 tp.t_self = *tid;
2510
2511l1:
2512 result = HeapTupleSatisfiesUpdate(&tp, cid, buffer);
2513
2514 if (result == TM_Invisible)
2515 {
2516 UnlockReleaseBuffer(buffer);
2517 ereport(ERROR,
2518 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
2519 errmsg("attempted to delete invisible tuple")));
2520 }
2521 else if (result == TM_BeingModified && wait)
2522 {
2523 TransactionId xwait;
2524 uint16 infomask;
2525
2526 /* must copy state data before unlocking buffer */
2527 xwait = HeapTupleHeaderGetRawXmax(tp.t_data);
2528 infomask = tp.t_data->t_infomask;
2529
2530 /*
2531 * Sleep until concurrent transaction ends -- except when there's a
2532 * single locker and it's our own transaction. Note we don't care
2533 * which lock mode the locker has, because we need the strongest one.
2534 *
2535 * Before sleeping, we need to acquire tuple lock to establish our
2536 * priority for the tuple (see heap_lock_tuple). LockTuple will
2537 * release us when we are next-in-line for the tuple.
2538 *
2539 * If we are forced to "start over" below, we keep the tuple lock;
2540 * this arranges that we stay at the head of the line while rechecking
2541 * tuple state.
2542 */
2543 if (infomask & HEAP_XMAX_IS_MULTI)
2544 {
2545 bool current_is_member = false;
2546
2547 if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
2548 LockTupleExclusive, &current_is_member))
2549 {
2550 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2551
2552 /*
2553 * Acquire the lock, if necessary (but skip it when we're
2554 * requesting a lock and already have one; avoids deadlock).
2555 */
2556 if (!current_is_member)
2557 heap_acquire_tuplock(relation, &(tp.t_self), LockTupleExclusive,
2558 LockWaitBlock, &have_tuple_lock);
2559
2560 /* wait for multixact */
2561 MultiXactIdWait((MultiXactId) xwait, MultiXactStatusUpdate, infomask,
2562 relation, &(tp.t_self), XLTW_Delete,
2563 NULL);
2564 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2565
2566 /*
2567 * If xwait had just locked the tuple then some other xact
2568 * could update this tuple before we get to this point. Check
2569 * for xmax change, and start over if so.
2570 */
2571 if (xmax_infomask_changed(tp.t_data->t_infomask, infomask) ||
2572 !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data),
2573 xwait))
2574 goto l1;
2575 }
2576
2577 /*
2578 * You might think the multixact is necessarily done here, but not
2579 * so: it could have surviving members, namely our own xact or
2580 * other subxacts of this backend. It is legal for us to delete
2581 * the tuple in either case, however (the latter case is
2582 * essentially a situation of upgrading our former shared lock to
2583 * exclusive). We don't bother changing the on-disk hint bits
2584 * since we are about to overwrite the xmax altogether.
2585 */
2586 }
2587 else if (!TransactionIdIsCurrentTransactionId(xwait))
2588 {
2589 /*
2590 * Wait for regular transaction to end; but first, acquire tuple
2591 * lock.
2592 */
2593 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2594 heap_acquire_tuplock(relation, &(tp.t_self), LockTupleExclusive,
2595 LockWaitBlock, &have_tuple_lock);
2596 XactLockTableWait(xwait, relation, &(tp.t_self), XLTW_Delete);
2597 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2598
2599 /*
2600 * xwait is done, but if xwait had just locked the tuple then some
2601 * other xact could update this tuple before we get to this point.
2602 * Check for xmax change, and start over if so.
2603 */
2604 if (xmax_infomask_changed(tp.t_data->t_infomask, infomask) ||
2605 !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data),
2606 xwait))
2607 goto l1;
2608
2609 /* Otherwise check if it committed or aborted */
2610 UpdateXmaxHintBits(tp.t_data, buffer, xwait);
2611 }
2612
2613 /*
2614 * We may overwrite if previous xmax aborted, or if it committed but
2615 * only locked the tuple without updating it.
2616 */
2617 if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
2618 HEAP_XMAX_IS_LOCKED_ONLY(tp.t_data->t_infomask) ||
2619 HeapTupleHeaderIsOnlyLocked(tp.t_data))
2620 result = TM_Ok;
2621 else if (!ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid) ||
2622 HeapTupleHeaderIndicatesMovedPartitions(tp.t_data))
2623 result = TM_Updated;
2624 else
2625 result = TM_Deleted;
2626 }
2627
2628 if (crosscheck != InvalidSnapshot && result == TM_Ok)
2629 {
2630 /* Perform additional check for transaction-snapshot mode RI updates */
2631 if (!HeapTupleSatisfiesVisibility(&tp, crosscheck, buffer))
2632 result = TM_Updated;
2633 }
2634
2635 if (result != TM_Ok)
2636 {
2637 Assert(result == TM_SelfModified ||
2638 result == TM_Updated ||
2639 result == TM_Deleted ||
2640 result == TM_BeingModified);
2641 Assert(!(tp.t_data->t_infomask & HEAP_XMAX_INVALID));
2642 Assert(result != TM_Updated ||
2643 !ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid));
2644 tmfd->ctid = tp.t_data->t_ctid;
2645 tmfd->xmax = HeapTupleHeaderGetUpdateXid(tp.t_data);
2646 if (result == TM_SelfModified)
2647 tmfd->cmax = HeapTupleHeaderGetCmax(tp.t_data);
2648 else
2649 tmfd->cmax = InvalidCommandId;
2650 UnlockReleaseBuffer(buffer);
2651 if (have_tuple_lock)
2652 UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive);
2653 if (vmbuffer != InvalidBuffer)
2654 ReleaseBuffer(vmbuffer);
2655 return result;
2656 }
2657
2658 /*
2659 * We're about to do the actual delete -- check for conflict first, to
2660 * avoid possibly having to roll back work we've just done.
2661 *
2662 * This is safe without a recheck as long as there is no possibility of
2663 * another process scanning the page between this check and the delete
2664 * being visible to the scan (i.e., an exclusive buffer content lock is
2665 * continuously held from this point until the tuple delete is visible).
2666 */
2667 CheckForSerializableConflictIn(relation, &tp, buffer);
2668
2669 /* replace cid with a combo cid if necessary */
2670 HeapTupleHeaderAdjustCmax(tp.t_data, &cid, &iscombo);
2671
2672 /*
2673 * Compute replica identity tuple before entering the critical section so
2674 * we don't PANIC upon a memory allocation failure.
2675 */
2676 old_key_tuple = ExtractReplicaIdentity(relation, &tp, true, &old_key_copied);
2677
2678 /*
2679 * If this is the first possibly-multixact-able operation in the current
2680 * transaction, set my per-backend OldestMemberMXactId setting. We can be
2681 * certain that the transaction will never become a member of any older
2682 * MultiXactIds than that. (We have to do this even if we end up just
2683 * using our own TransactionId below, since some other backend could
2684 * incorporate our XID into a MultiXact immediately afterwards.)
2685 */
2686 MultiXactIdSetOldestMember();
2687
2688 compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(tp.t_data),
2689 tp.t_data->t_infomask, tp.t_data->t_infomask2,
2690 xid, LockTupleExclusive, true,
2691 &new_xmax, &new_infomask, &new_infomask2);
2692
2693 START_CRIT_SECTION();
2694
2695 /*
2696 * If this transaction commits, the tuple will become DEAD sooner or
2697 * later. Set flag that this page is a candidate for pruning once our xid
2698 * falls below the OldestXmin horizon. If the transaction finally aborts,
2699 * the subsequent page pruning will be a no-op and the hint will be
2700 * cleared.
2701 */
2702 PageSetPrunable(page, xid);
2703
2704 if (PageIsAllVisible(page))
2705 {
2706 all_visible_cleared = true;
2707 PageClearAllVisible(page);
2708 visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
2709 vmbuffer, VISIBILITYMAP_VALID_BITS);
2710 }
2711
2712 /* store transaction information of xact deleting the tuple */
2713 tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
2714 tp.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
2715 tp.t_data->t_infomask |= new_infomask;
2716 tp.t_data->t_infomask2 |= new_infomask2;
2717 HeapTupleHeaderClearHotUpdated(tp.t_data);
2718 HeapTupleHeaderSetXmax(tp.t_data, new_xmax);
2719 HeapTupleHeaderSetCmax(tp.t_data, cid, iscombo);
2720 /* Make sure there is no forward chain link in t_ctid */
2721 tp.t_data->t_ctid = tp.t_self;
2722
2723 /* Signal that this is actually a move into another partition */
2724 if (changingPart)
2725 HeapTupleHeaderSetMovedPartitions(tp.t_data);
2726
2727 MarkBufferDirty(buffer);
2728
2729 /*
2730 * XLOG stuff
2731 *
2732 * NB: heap_abort_speculative() uses the same xlog record and replay
2733 * routines.
2734 */
2735 if (RelationNeedsWAL(relation))
2736 {
2737 xl_heap_delete xlrec;
2738 xl_heap_header xlhdr;
2739 XLogRecPtr recptr;
2740
2741 /* For logical decoding we need combocids to properly decode the catalog */
2742 if (RelationIsAccessibleInLogicalDecoding(relation))
2743 log_heap_new_cid(relation, &tp);
2744
2745 xlrec.flags = 0;
2746 if (all_visible_cleared)
2747 xlrec.flags |= XLH_DELETE_ALL_VISIBLE_CLEARED;
2748 if (changingPart)
2749 xlrec.flags |= XLH_DELETE_IS_PARTITION_MOVE;
2750 xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask,
2751 tp.t_data->t_infomask2);
2752 xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self);
2753 xlrec.xmax = new_xmax;
2754
2755 if (old_key_tuple != NULL)
2756 {
2757 if (relation->rd_rel->relreplident == REPLICA_IDENTITY_FULL)
2758 xlrec.flags |= XLH_DELETE_CONTAINS_OLD_TUPLE;
2759 else
2760 xlrec.flags |= XLH_DELETE_CONTAINS_OLD_KEY;
2761 }
2762
2763 XLogBeginInsert();
2764 XLogRegisterData((char *) &xlrec, SizeOfHeapDelete);
2765
2766 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
2767
2768 /*
2769 * Log replica identity of the deleted tuple if there is one
2770 */
2771 if (old_key_tuple != NULL)
2772 {
2773 xlhdr.t_infomask2 = old_key_tuple->t_data->t_infomask2;
2774 xlhdr.t_infomask = old_key_tuple->t_data->t_infomask;
2775 xlhdr.t_hoff = old_key_tuple->t_data->t_hoff;
2776
2777 XLogRegisterData((char *) &xlhdr, SizeOfHeapHeader);
2778 XLogRegisterData((char *) old_key_tuple->t_data
2779 + SizeofHeapTupleHeader,
2780 old_key_tuple->t_len
2781 - SizeofHeapTupleHeader);
2782 }
2783
2784 /* filtering by origin on a row level is much more efficient */
2785 XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
2786
2787 recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE);
2788
2789 PageSetLSN(page, recptr);
2790 }
2791
2792 END_CRIT_SECTION();
2793
2794 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2795
2796 if (vmbuffer != InvalidBuffer)
2797 ReleaseBuffer(vmbuffer);
2798
2799 /*
2800 * If the tuple has toasted out-of-line attributes, we need to delete
2801 * those items too. We have to do this before releasing the buffer
2802 * because we need to look at the contents of the tuple, but it's OK to
2803 * release the content lock on the buffer first.
2804 */
2805 if (relation->rd_rel->relkind != RELKIND_RELATION &&
2806 relation->rd_rel->relkind != RELKIND_MATVIEW)
2807 {
2808 /* toast table entries should never be recursively toasted */
2809 Assert(!HeapTupleHasExternal(&tp));
2810 }
2811 else if (HeapTupleHasExternal(&tp))
2812 toast_delete(relation, &tp, false);
2813
2814 /*
2815 * Mark tuple for invalidation from system caches at next command
2816 * boundary. We have to do this before releasing the buffer because we
2817 * need to look at the contents of the tuple.
2818 */
2819 CacheInvalidateHeapTuple(relation, &tp, NULL);
2820
2821 /* Now we can release the buffer */
2822 ReleaseBuffer(buffer);
2823
2824 /*
2825 * Release the lmgr tuple lock, if we had it.
2826 */
2827 if (have_tuple_lock)
2828 UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive);
2829
2830 pgstat_count_heap_delete(relation);
2831
2832 if (old_key_tuple != NULL && old_key_copied)
2833 heap_freetuple(old_key_tuple);
2834
2835 return TM_Ok;
2836}
2837
2838/*
2839 * simple_heap_delete - delete a tuple
2840 *
2841 * This routine may be used to delete a tuple when concurrent updates of
2842 * the target tuple are not expected (for example, because we have a lock
2843 * on the relation associated with the tuple). Any failure is reported
2844 * via ereport().
2845 */
2846void
2847simple_heap_delete(Relation relation, ItemPointer tid)
2848{
2849 TM_Result result;
2850 TM_FailureData tmfd;
2851
2852 result = heap_delete(relation, tid,
2853 GetCurrentCommandId(true), InvalidSnapshot,
2854 true /* wait for commit */ ,
2855 &tmfd, false /* changingPart */ );
2856 switch (result)
2857 {
2858 case TM_SelfModified:
2859 /* Tuple was already updated in current command? */
2860 elog(ERROR, "tuple already updated by self");
2861 break;
2862
2863 case TM_Ok:
2864 /* done successfully */
2865 break;
2866
2867 case TM_Updated:
2868 elog(ERROR, "tuple concurrently updated");
2869 break;
2870
2871 case TM_Deleted:
2872 elog(ERROR, "tuple concurrently deleted");
2873 break;
2874
2875 default:
2876 elog(ERROR, "unrecognized heap_delete status: %u", result);
2877 break;
2878 }
2879}
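
/*
 * Illustrative sketch (annotation added for this edit, not part of the
 * original file): deleting a tuple by TID with simple_heap_delete().  The
 * lock taken when opening the relation is assumed to rule out concurrent
 * updates of the target tuple, as the comment above requires; "relid" and
 * the tuple "tup" are supplied by the caller.
 *
 *    Relation    rel = table_open(relid, RowExclusiveLock);
 *
 *    simple_heap_delete(rel, &tup->t_self);
 *    table_close(rel, RowExclusiveLock);
 */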
2880
2881/*
2882 * heap_update - replace a tuple
2883 *
2884 * See table_tuple_update() for an explanation of the parameters, except that
2885 * this routine directly takes a tuple rather than a slot.
2886 *
2887 * In the failure cases, the routine fills *tmfd with the tuple's t_ctid,
2888 * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last
2889 * only for TM_SelfModified, since we cannot obtain cmax from a combocid
2890 * generated by another transaction).
2891 */
2892TM_Result
2893heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
2894 CommandId cid, Snapshot crosscheck, bool wait,
2895 TM_FailureData *tmfd, LockTupleMode *lockmode)
2896{
2897 TM_Result result;
2898 TransactionId xid = GetCurrentTransactionId();
2899 Bitmapset *hot_attrs;
2900 Bitmapset *key_attrs;
2901 Bitmapset *id_attrs;
2902 Bitmapset *interesting_attrs;
2903 Bitmapset *modified_attrs;
2904 ItemId lp;
2905 HeapTupleData oldtup;
2906 HeapTuple heaptup;
2907 HeapTuple old_key_tuple = NULL;
2908 bool old_key_copied = false;
2909 Page page;
2910 BlockNumber block;
2911 MultiXactStatus mxact_status;
2912 Buffer buffer,
2913 newbuf,
2914 vmbuffer = InvalidBuffer,
2915 vmbuffer_new = InvalidBuffer;
2916 bool need_toast;
2917 Size newtupsize,
2918 pagefree;
2919 bool have_tuple_lock = false;
2920 bool iscombo;
2921 bool use_hot_update = false;
2922 bool hot_attrs_checked = false;
2923 bool key_intact;
2924 bool all_visible_cleared = false;
2925 bool all_visible_cleared_new = false;
2926 bool checked_lockers;
2927 bool locker_remains;
2928 TransactionId xmax_new_tuple,
2929 xmax_old_tuple;
2930 uint16 infomask_old_tuple,
2931 infomask2_old_tuple,
2932 infomask_new_tuple,
2933 infomask2_new_tuple;
2934
2935 Assert(ItemPointerIsValid(otid));
2936
2937 /*
2938 * Forbid this during a parallel operation, lest it allocate a combocid.
2939 * Other workers might need that combocid for visibility checks, and we
2940 * have no provision for broadcasting it to them.
2941 */
2942 if (IsInParallelMode())
2943 ereport(ERROR,
2944 (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
2945 errmsg("cannot update tuples during a parallel operation")));
2946
2947 /*
2948 * Fetch the list of attributes to be checked for various operations.
2949 *
2950 * For HOT considerations, this is wasted effort if we fail to update or
2951 * have to put the new tuple on a different page. But we must compute the
2952 * list before obtaining buffer lock --- in the worst case, if we are
2953 * doing an update on one of the relevant system catalogs, we could
2954 * deadlock if we try to fetch the list later. In any case, the relcache
2955 * caches the data so this is usually pretty cheap.
2956 *
2957 * We also need columns used by the replica identity and columns that are
2958 * considered the "key" of rows in the table.
2959 *
2960 * Note that we get copies of each bitmap, so we need not worry about
2961 * relcache flush happening midway through.
2962 */
2963 hot_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_ALL);
2964 key_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_KEY);
2965 id_attrs = RelationGetIndexAttrBitmap(relation,
2966 INDEX_ATTR_BITMAP_IDENTITY_KEY);
2967
2968
2969 block = ItemPointerGetBlockNumber(otid);
2970 buffer = ReadBuffer(relation, block);
2971 page = BufferGetPage(buffer);
2972
2973 interesting_attrs = NULL;
2974
2975 /*
2976 * If the page is already full, there is hardly any chance of doing a HOT
2977 * update on this page. It might be wasteful effort to look for index
2978 * column updates only to later reject HOT updates for lack of space in
2979 * the same page. So we are conservative and only fetch hot_attrs if the
2980 * page is not already full. Since we are already holding a pin on the
2981 * buffer, there is no chance that the buffer can get cleaned up
2982 * concurrently and even if that was possible, in the worst case we lose a
2983 * chance to do a HOT update.
2984 */
2985 if (!PageIsFull(page))
2986 {
2987 interesting_attrs = bms_add_members(interesting_attrs, hot_attrs);
2988 hot_attrs_checked = true;
2989 }
2990 interesting_attrs = bms_add_members(interesting_attrs, key_attrs);
2991 interesting_attrs = bms_add_members(interesting_attrs, id_attrs);
2992
2993 /*
2994 * Before locking the buffer, pin the visibility map page if it appears to
2995 * be necessary. Since we haven't got the lock yet, someone else might be
2996 * in the middle of changing this, so we'll need to recheck after we have
2997 * the lock.
2998 */
2999 if (PageIsAllVisible(page))
3000 visibilitymap_pin(relation, block, &vmbuffer);
3001
3002 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3003
3004 lp = PageGetItemId(page, ItemPointerGetOffsetNumber(otid));
3005 Assert(ItemIdIsNormal(lp));
3006
3007 /*
3008 * Fill in enough data in oldtup for HeapDetermineModifiedColumns to work
3009 * properly.
3010 */
3011 oldtup.t_tableOid = RelationGetRelid(relation);
3012 oldtup.t_data = (HeapTupleHeader) PageGetItem(page, lp);
3013 oldtup.t_len = ItemIdGetLength(lp);
3014 oldtup.t_self = *otid;
3015
3016 /* the new tuple is ready, except for this: */
3017 newtup->t_tableOid = RelationGetRelid(relation);
3018
3019 /* Determine columns modified by the update. */
3020 modified_attrs = HeapDetermineModifiedColumns(relation, interesting_attrs,
3021 &oldtup, newtup);
3022
3023 /*
3024 * If we're not updating any "key" column, we can grab a weaker lock type.
3025 * This allows for more concurrency when we are running simultaneously
3026 * with foreign key checks.
3027 *
3028 * Note that if a column gets detoasted while executing the update, but
3029 * the value ends up being the same, this test will fail and we will use
3030 * the stronger lock. This is acceptable; the important case to optimize
3031 * is updates that don't manipulate key columns, not those that
3032 * serendipitously arrive at the same key values.
3033 */
3034 if (!bms_overlap(modified_attrs, key_attrs))
3035 {
3036 *lockmode = LockTupleNoKeyExclusive;
3037 mxact_status = MultiXactStatusNoKeyUpdate;
3038 key_intact = true;
3039
3040 /*
3041 * If this is the first possibly-multixact-able operation in the
3042 * current transaction, set my per-backend OldestMemberMXactId
3043 * setting. We can be certain that the transaction will never become a
3044 * member of any older MultiXactIds than that. (We have to do this
3045 * even if we end up just using our own TransactionId below, since
3046 * some other backend could incorporate our XID into a MultiXact
3047 * immediately afterwards.)
3048 */
3049 MultiXactIdSetOldestMember();
3050 }
3051 else
3052 {
3053 *lockmode = LockTupleExclusive;
3054 mxact_status = MultiXactStatusUpdate;
3055 key_intact = false;
3056 }
3057
3058 /*
3059 * Note: beyond this point, use oldtup not otid to refer to old tuple.
3060 * otid may very well point at newtup->t_self, which we will overwrite
3061 * with the new tuple's location, so there's great risk of confusion if we
3062 * use otid anymore.
3063 */
3064
3065l2:
3066 checked_lockers = false;
3067 locker_remains = false;
3068 result = HeapTupleSatisfiesUpdate(&oldtup, cid, buffer);
3069
3070 /* see below about the "no wait" case */
3071 Assert(result != TM_BeingModified || wait);
3072
3073 if (result == TM_Invisible)
3074 {
3075 UnlockReleaseBuffer(buffer);
3076 ereport(ERROR,
3077 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
3078 errmsg("attempted to update invisible tuple")));
3079 }
3080 else if (result == TM_BeingModified && wait)
3081 {
3082 TransactionId xwait;
3083 uint16 infomask;
3084 bool can_continue = false;
3085
3086 /*
3087 * XXX note that we don't consider the "no wait" case here. This
3088 * isn't a problem currently because no caller uses that case, but it
3089 * should be fixed if such a caller is introduced. It wasn't a
3090 * problem previously because this code would always wait, but now
3091 * that some tuple locks do not conflict with one of the lock modes we
3092 * use, it is possible that this case is interesting to handle
3093 * specially.
3094 *
3095 * This may cause failures with third-party code that calls
3096 * heap_update directly.
3097 */
3098
3099 /* must copy state data before unlocking buffer */
3100 xwait = HeapTupleHeaderGetRawXmax(oldtup.t_data);
3101 infomask = oldtup.t_data->t_infomask;
3102
3103 /*
3104 * Now we have to do something about the existing locker. If it's a
3105 * multi, sleep on it; we might be awakened before it is completely
3106 * gone (or even not sleep at all in some cases); we need to preserve
3107 * it as locker, unless it is gone completely.
3108 *
3109 * If it's not a multi, we need to check for sleeping conditions
3110 * before actually going to sleep. If the update doesn't conflict
3111 * with the locks, we just continue without sleeping (but making sure
3112 * it is preserved).
3113 *
3114 * Before sleeping, we need to acquire tuple lock to establish our
3115 * priority for the tuple (see heap_lock_tuple). LockTuple will
3116 * release us when we are next-in-line for the tuple. Note we must
3117 * not acquire the tuple lock until we're sure we're going to sleep;
3118 * otherwise we're open for race conditions with other transactions
3119 * holding the tuple lock which sleep on us.
3120 *
3121 * If we are forced to "start over" below, we keep the tuple lock;
3122 * this arranges that we stay at the head of the line while rechecking
3123 * tuple state.
3124 */
3125 if (infomask & HEAP_XMAX_IS_MULTI)
3126 {
3127 TransactionId update_xact;
3128 int remain;
3129 bool current_is_member = false;
3130
3131 if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
3132 *lockmode, &current_is_member))
3133 {
3134 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3135
3136 /*
3137 * Acquire the lock, if necessary (but skip it when we're
3138 * requesting a lock and already have one; avoids deadlock).
3139 */
3140 if (!current_is_member)
3141 heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3142 LockWaitBlock, &have_tuple_lock);
3143
3144 /* wait for multixact */
3145 MultiXactIdWait((MultiXactId) xwait, mxact_status, infomask,
3146 relation, &oldtup.t_self, XLTW_Update,
3147 &remain);
3148 checked_lockers = true;
3149 locker_remains = remain != 0;
3150 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3151
3152 /*
3153 * If xwait had just locked the tuple then some other xact
3154 * could update this tuple before we get to this point. Check
3155 * for xmax change, and start over if so.
3156 */
3157 if (xmax_infomask_changed(oldtup.t_data->t_infomask,
3158 infomask) ||
3159 !TransactionIdEquals(HeapTupleHeaderGetRawXmax(oldtup.t_data),
3160 xwait))
3161 goto l2;
3162 }
3163
3164 /*
3165 * Note that the multixact may not be done by now. It could have
3166 * surviving members; our own xact or other subxacts of this
3167 * backend, and also any other concurrent transaction that locked
3168 * the tuple with LockTupleKeyShare if we only got
3169 * LockTupleNoKeyExclusive. If this is the case, we have to be
3170 * careful to mark the updated tuple with the surviving members in
3171 * Xmax.
3172 *
3173 * Note that there could have been another update in the
3174 * MultiXact. In that case, we need to check whether it committed
3175 * or aborted. If it aborted we are safe to update it again;
3176 * otherwise there is an update conflict, and we have to return
3177 * TM_Deleted or TM_Updated below.
3178 *
3179 * In the LockTupleExclusive case, we still need to preserve the
3180 * surviving members: those would include the tuple locks we had
3181 * before this one, which are important to keep in case this
3182 * subxact aborts.
3183 */
3184 if (!HEAP_XMAX_IS_LOCKED_ONLY(oldtup.t_data->t_infomask))
3185 update_xact = HeapTupleGetUpdateXid(oldtup.t_data);
3186 else
3187 update_xact = InvalidTransactionId;
3188
3189 /*
3190 * There was no UPDATE in the MultiXact; or it aborted. No
3191 * TransactionIdIsInProgress() call needed here, since we called
3192 * MultiXactIdWait() above.
3193 */
3194 if (!TransactionIdIsValid(update_xact) ||
3195 TransactionIdDidAbort(update_xact))
3196 can_continue = true;
3197 }
3198 else if (TransactionIdIsCurrentTransactionId(xwait))
3199 {
3200 /*
3201 * The only locker is ourselves; we can avoid grabbing the tuple
3202 * lock here, but must preserve our locking information.
3203 */
3204 checked_lockers = true;
3205 locker_remains = true;
3206 can_continue = true;
3207 }
3208 else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) && key_intact)
3209 {
3210 /*
3211 * If it's just a key-share locker, and we're not changing the key
3212 * columns, we don't need to wait for it to end; but we need to
3213 * preserve it as locker.
3214 */
3215 checked_lockers = true;
3216 locker_remains = true;
3217 can_continue = true;
3218 }
3219 else
3220 {
3221 /*
3222 * Wait for regular transaction to end; but first, acquire tuple
3223 * lock.
3224 */
3225 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3226 heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3227 LockWaitBlock, &have_tuple_lock);
3228 XactLockTableWait(xwait, relation, &oldtup.t_self,
3229 XLTW_Update);
3230 checked_lockers = true;
3231 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3232
3233 /*
3234 * xwait is done, but if xwait had just locked the tuple then some
3235 * other xact could update this tuple before we get to this point.
3236 * Check for xmax change, and start over if so.
3237 */
3238 if (xmax_infomask_changed(oldtup.t_data->t_infomask, infomask) ||
3239 !TransactionIdEquals(xwait,
3240 HeapTupleHeaderGetRawXmax(oldtup.t_data)))
3241 goto l2;
3242
3243 /* Otherwise check if it committed or aborted */
3244 UpdateXmaxHintBits(oldtup.t_data, buffer, xwait);
3245 if (oldtup.t_data->t_infomask & HEAP_XMAX_INVALID)
3246 can_continue = true;
3247 }
3248
3249 if (can_continue)
3250 result = TM_Ok;
3251 else if (!ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid) ||
3252 HeapTupleHeaderIndicatesMovedPartitions(oldtup.t_data))
3253 result = TM_Updated;
3254 else
3255 result = TM_Deleted;
3256 }
3257
3258 if (crosscheck != InvalidSnapshot && result == TM_Ok)
3259 {
3260 /* Perform additional check for transaction-snapshot mode RI updates */
3261 if (!HeapTupleSatisfiesVisibility(&oldtup, crosscheck, buffer))
3262 {
3263 result = TM_Updated;
3264 Assert(!ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid));
3265 }
3266 }
3267
3268 if (result != TM_Ok)
3269 {
3270 Assert(result == TM_SelfModified ||
3271 result == TM_Updated ||
3272 result == TM_Deleted ||
3273 result == TM_BeingModified);
3274 Assert(!(oldtup.t_data->t_infomask & HEAP_XMAX_INVALID));
3275 Assert(result != TM_Updated ||
3276 !ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid));
3277 tmfd->ctid = oldtup.t_data->t_ctid;
3278 tmfd->xmax = HeapTupleHeaderGetUpdateXid(oldtup.t_data);
3279 if (result == TM_SelfModified)
3280 tmfd->cmax = HeapTupleHeaderGetCmax(oldtup.t_data);
3281 else
3282 tmfd->cmax = InvalidCommandId;
3283 UnlockReleaseBuffer(buffer);
3284 if (have_tuple_lock)
3285 UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
3286 if (vmbuffer != InvalidBuffer)
3287 ReleaseBuffer(vmbuffer);
3288 bms_free(hot_attrs);
3289 bms_free(key_attrs);
3290 bms_free(id_attrs);
3291 bms_free(modified_attrs);
3292 bms_free(interesting_attrs);
3293 return result;
3294 }
3295
3296 /*
3297 * If we didn't pin the visibility map page and the page has become all
3298 * visible while we were busy locking the buffer, or during some
3299 * subsequent window during which we had it unlocked, we'll have to unlock
3300 * and re-lock, to avoid holding the buffer lock across an I/O. That's a
3301 * bit unfortunate, especially since we'll now have to recheck whether the
3302 * tuple has been locked or updated under us, but hopefully it won't
3303 * happen very often.
3304 */
3305 if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
3306 {
3307 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3308 visibilitymap_pin(relation, block, &vmbuffer);
3309 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3310 goto l2;
3311 }
3312
3313 /* Fill in transaction status data */
3314
3315 /*
3316 * If the tuple we're updating is locked, we need to preserve the locking
3317 * info in the old tuple's Xmax. Prepare a new Xmax value for this.
3318 */
3319 compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data),
3320 oldtup.t_data->t_infomask,
3321 oldtup.t_data->t_infomask2,
3322 xid, *lockmode, true,
3323 &xmax_old_tuple, &infomask_old_tuple,
3324 &infomask2_old_tuple);
3325
3326 /*
3327 * And also prepare an Xmax value for the new copy of the tuple. If there
3328 * was no xmax previously, or there was one but all lockers are now gone,
3329 * then use InvalidXid; otherwise, get the xmax from the old tuple. (In
3330 * rare cases that might also be InvalidXid and yet not have the
3331 * HEAP_XMAX_INVALID bit set; that's fine.)
3332 */
3333 if ((oldtup.t_data->t_infomask & HEAP_XMAX_INVALID) ||
3334 HEAP_LOCKED_UPGRADED(oldtup.t_data->t_infomask) ||
3335 (checked_lockers && !locker_remains))
3336 xmax_new_tuple = InvalidTransactionId;
3337 else
3338 xmax_new_tuple = HeapTupleHeaderGetRawXmax(oldtup.t_data);
3339
3340 if (!TransactionIdIsValid(xmax_new_tuple))
3341 {
3342 infomask_new_tuple = HEAP_XMAX_INVALID;
3343 infomask2_new_tuple = 0;
3344 }
3345 else
3346 {
3347 /*
3348 * If we found a valid Xmax for the new tuple, then the infomask bits
3349 * to use on the new tuple depend on what was there on the old one.
3350 * Note that since we're doing an update, the only possibility is that
3351 * the lockers had FOR KEY SHARE lock.
3352 */
3353 if (oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI)
3354 {
3355 GetMultiXactIdHintBits(xmax_new_tuple, &infomask_new_tuple,
3356 &infomask2_new_tuple);
3357 }
3358 else
3359 {
3360 infomask_new_tuple = HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_LOCK_ONLY;
3361 infomask2_new_tuple = 0;
3362 }
3363 }
3364
3365 /*
3366 * Prepare the new tuple with the appropriate initial values of Xmin and
3367 * Xmax, as well as initial infomask bits as computed above.
3368 */
3369 newtup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
3370 newtup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
3371 HeapTupleHeaderSetXmin(newtup->t_data, xid);
3372 HeapTupleHeaderSetCmin(newtup->t_data, cid);
3373 newtup->t_data->t_infomask |= HEAP_UPDATED | infomask_new_tuple;
3374 newtup->t_data->t_infomask2 |= infomask2_new_tuple;
3375 HeapTupleHeaderSetXmax(newtup->t_data, xmax_new_tuple);
3376
3377 /*
3378 * Replace cid with a combo cid if necessary. Note that we already put
3379 * the plain cid into the new tuple.
3380 */
3381 HeapTupleHeaderAdjustCmax(oldtup.t_data, &cid, &iscombo);
3382
3383 /*
3384 * If the toaster needs to be activated, OR if the new tuple will not fit
3385 * on the same page as the old, then we need to release the content lock
3386 * (but not the pin!) on the old tuple's buffer while we are off doing
3387 * TOAST and/or table-file-extension work. We must mark the old tuple to
3388 * show that it's locked, else other processes may try to update it
3389 * themselves.
3390 *
3391 * We need to invoke the toaster if there are already any out-of-line
3392 * toasted values present, or if the new tuple is over-threshold.
3393 */
3394 if (relation->rd_rel->relkind != RELKIND_RELATION &&
3395 relation->rd_rel->relkind != RELKIND_MATVIEW)
3396 {
3397 /* toast table entries should never be recursively toasted */
3398 Assert(!HeapTupleHasExternal(&oldtup));
3399 Assert(!HeapTupleHasExternal(newtup));
3400 need_toast = false;
3401 }
3402 else
3403 need_toast = (HeapTupleHasExternal(&oldtup) ||
3404 HeapTupleHasExternal(newtup) ||
3405 newtup->t_len > TOAST_TUPLE_THRESHOLD);
3406
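	/*
	 * Compute the free space remaining on the old page and the MAXALIGN'd
	 * size of the new tuple version, so we can decide below whether the
	 * update fits in place or whether we must release the content lock for
	 * TOAST work and/or placement of the tuple on another page.
	 */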
3407 pagefree = PageGetHeapFreeSpace(page);
3408
3409 newtupsize = MAXALIGN(newtup->t_len);
3410
3411 if (need_toast || newtupsize > pagefree)
3412 {
3413 TransactionId xmax_lock_old_tuple;
3414 uint16 infomask_lock_old_tuple,
3415 infomask2_lock_old_tuple;
3416 bool cleared_all_frozen = false;
3417
3418 /*
3419 * To prevent concurrent sessions from updating the tuple, we have to
3420 * temporarily mark it locked, while we release the page-level lock.
3421 *
3422 		 * To satisfy the rule that any xid potentially appearing in a buffer
3423 		 * written out to disk must first be WAL-logged, we unfortunately
3424 		 * have to WAL-log this temporary modification.  We can reuse
3425 		 * xl_heap_lock for this purpose.  If we crash/error before following
3426 		 * through with the actual update, xmax will be that of an aborted
3427 		 * transaction, allowing other sessions to proceed.
3428 */
3429
3430 /*
3431 * Compute xmax / infomask appropriate for locking the tuple. This has
3432 		 * to be done separately from the xmax/infomask combination that will
3433 		 * be used for the update itself, because the multixact potentially
3434 		 * created here would otherwise be wrong.
3435 */
3436 compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data),
3437 oldtup.t_data->t_infomask,
3438 oldtup.t_data->t_infomask2,
3439 xid, *lockmode, false,
3440 &xmax_lock_old_tuple, &infomask_lock_old_tuple,
3441 &infomask2_lock_old_tuple);
3442
3443 Assert(HEAP_XMAX_IS_LOCKED_ONLY(infomask_lock_old_tuple));
3444
3445 START_CRIT_SECTION();
3446
3447 /* Clear obsolete visibility flags ... */
3448 oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
3449 oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
3450 HeapTupleClearHotUpdated(&oldtup);
3451 /* ... and store info about transaction updating this tuple */
3452 Assert(TransactionIdIsValid(xmax_lock_old_tuple));
3453 HeapTupleHeaderSetXmax(oldtup.t_data, xmax_lock_old_tuple);
3454 oldtup.t_data->t_infomask |= infomask_lock_old_tuple;
3455 oldtup.t_data->t_infomask2 |= infomask2_lock_old_tuple;
3456 HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
3457
3458 /* temporarily make it look not-updated, but locked */
3459 oldtup.t_data->t_ctid = oldtup.t_self;
3460
3461 /*
3462 * Clear all-frozen bit on visibility map if needed. We could
3463 * immediately reset ALL_VISIBLE, but given that the WAL logging
3464 		 * overhead would be unchanged, that doesn't seem especially
3465 		 * worthwhile.
3466 */
3467 if (PageIsAllVisible(BufferGetPage(buffer)) &&
3468 visibilitymap_clear(relation, block, vmbuffer,
3469 VISIBILITYMAP_ALL_FROZEN))
3470 cleared_all_frozen = true;
3471
3472 MarkBufferDirty(buffer);
3473
3474 if (RelationNeedsWAL(relation))
3475 {
3476 xl_heap_lock xlrec;
3477 XLogRecPtr recptr;
3478
3479 XLogBeginInsert();
3480 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
3481
3482 xlrec.offnum = ItemPointerGetOffsetNumber(&oldtup.t_self);
3483 xlrec.locking_xid = xmax_lock_old_tuple;
3484 xlrec.infobits_set = compute_infobits(oldtup.t_data->t_infomask,
3485 oldtup.t_data->t_infomask2);
3486 xlrec.flags =
3487 cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
3488 XLogRegisterData((char *) &xlrec, SizeOfHeapLock);
3489 recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK);
3490 PageSetLSN(page, recptr);
3491 }
3492
3493 END_CRIT_SECTION();
3494
3495 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3496
3497 /*
3498 * Let the toaster do its thing, if needed.
3499 *
3500 * Note: below this point, heaptup is the data we actually intend to
3501 * store into the relation; newtup is the caller's original untoasted
3502 * data.
3503 */
3504 if (need_toast)
3505 {
3506 /* Note we always use WAL and FSM during updates */
3507 heaptup = toast_insert_or_update(relation, newtup, &oldtup, 0);
3508 newtupsize = MAXALIGN(heaptup->t_len);
3509 }
3510 else
3511 heaptup = newtup;
3512
3513 /*
3514 * Now, do we need a new page for the tuple, or not? This is a bit
3515 * tricky since someone else could have added tuples to the page while
3516 * we weren't looking. We have to recheck the available space after
3517 * reacquiring the buffer lock. But don't bother to do that if the
3518 * former amount of free space is still not enough; it's unlikely
3519 * there's more free now than before.
3520 *
3521 * What's more, if we need to get a new page, we will need to acquire
3522 * buffer locks on both old and new pages. To avoid deadlock against
3523 * some other backend trying to get the same two locks in the other
3524 * order, we must be consistent about the order we get the locks in.
3525 * We use the rule "lock the lower-numbered page of the relation
3526 * first". To implement this, we must do RelationGetBufferForTuple
3527 * while not holding the lock on the old page, and we must rely on it
3528 * to get the locks on both pages in the correct order.
3529 */
3530 if (newtupsize > pagefree)
3531 {
3532 /* Assume there's no chance to put heaptup on same page. */
3533 newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
3534 buffer, 0, NULL,
3535 &vmbuffer_new, &vmbuffer);
3536 }
3537 else
3538 {
3539 /* Re-acquire the lock on the old tuple's page. */
3540 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3541 /* Re-check using the up-to-date free space */
3542 pagefree = PageGetHeapFreeSpace(page);
3543 if (newtupsize > pagefree)
3544 {
3545 /*
3546 * Rats, it doesn't fit anymore. We must now unlock and
3547 * relock to avoid deadlock. Fortunately, this path should
3548 * seldom be taken.
3549 */
3550 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3551 newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
3552 buffer, 0, NULL,
3553 &vmbuffer_new, &vmbuffer);
3554 }
3555 else
3556 {
3557 /* OK, it fits here, so we're done. */
3558 newbuf = buffer;
3559 }
3560 }
3561 }
3562 else
3563 {
3564 /* No TOAST work needed, and it'll fit on same page */
3565 newbuf = buffer;
3566 heaptup = newtup;
3567 }
3568
3569 /*
3570 * We're about to do the actual update -- check for conflict first, to
3571 * avoid possibly having to roll back work we've just done.
3572 *
3573 * This is safe without a recheck as long as there is no possibility of
3574 * another process scanning the pages between this check and the update
3575 * being visible to the scan (i.e., exclusive buffer content lock(s) are
3576 * continuously held from this point until the tuple update is visible).
3577 *
3578 * For the new tuple the only check needed is at the relation level, but
3579 * since both tuples are in the same relation and the check for oldtup
3580 * will include checking the relation level, there is no benefit to a
3581 * separate check for the new tuple.
3582 */
3583 CheckForSerializableConflictIn(relation, &oldtup, buffer);
3584
3585 /*
3586 * At this point newbuf and buffer are both pinned and locked, and newbuf
3587 * has enough space for the new tuple. If they are the same buffer, only
3588 * one pin is held.
3589 */
3590
3591 if (newbuf == buffer)
3592 {
3593 /*
3594 * Since the new tuple is going into the same page, we might be able
3595 * to do a HOT update. Check if any of the index columns have been
3596 * changed. If the page was already full, we may have skipped checking
3597 * for index columns, and also can't do a HOT update.
3598 */
3599 if (hot_attrs_checked && !bms_overlap(modified_attrs, hot_attrs))
3600 use_hot_update = true;
3601 }
3602 else
3603 {
3604 /* Set a hint that the old page could use prune/defrag */
3605 PageSetFull(page);
3606 }
3607
3608 /*
3609 * Compute replica identity tuple before entering the critical section so
3610 * we don't PANIC upon a memory allocation failure.
3611 * ExtractReplicaIdentity() will return NULL if nothing needs to be
3612 * logged.
3613 */
3614 old_key_tuple = ExtractReplicaIdentity(relation, &oldtup,
3615 bms_overlap(modified_attrs, id_attrs),
3616 &old_key_copied);
3617
3618 /* NO EREPORT(ERROR) from here till changes are logged */
3619 START_CRIT_SECTION();
3620
3621 /*
3622 * If this transaction commits, the old tuple will become DEAD sooner or
3623 * later. Set flag that this page is a candidate for pruning once our xid
3624 * falls below the OldestXmin horizon. If the transaction finally aborts,
3625 * the subsequent page pruning will be a no-op and the hint will be
3626 * cleared.
3627 *
3628 * XXX Should we set hint on newbuf as well? If the transaction aborts,
3629 * there would be a prunable tuple in the newbuf; but for now we choose
3630 * not to optimize for aborts. Note that heap_xlog_update must be kept in
3631 * sync if this decision changes.
3632 */
3633 PageSetPrunable(page, xid);
3634
3635 if (use_hot_update)
3636 {
3637 /* Mark the old tuple as HOT-updated */
3638 HeapTupleSetHotUpdated(&oldtup);
3639 /* And mark the new tuple as heap-only */
3640 HeapTupleSetHeapOnly(heaptup);
3641 /* Mark the caller's copy too, in case different from heaptup */
3642 HeapTupleSetHeapOnly(newtup);
3643 }
3644 else
3645 {
3646 /* Make sure tuples are correctly marked as not-HOT */
3647 HeapTupleClearHotUpdated(&oldtup);
3648 HeapTupleClearHeapOnly(heaptup);
3649 HeapTupleClearHeapOnly(newtup);
3650 }
3651
3652 RelationPutHeapTuple(relation, newbuf, heaptup, false); /* insert new tuple */
3653
3654
3655 /* Clear obsolete visibility flags, possibly set by ourselves above... */
3656 oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
3657 oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
3658 /* ... and store info about transaction updating this tuple */
3659 Assert(TransactionIdIsValid(xmax_old_tuple));
3660 HeapTupleHeaderSetXmax(oldtup.t_data, xmax_old_tuple);
3661 oldtup.t_data->t_infomask |= infomask_old_tuple;
3662 oldtup.t_data->t_infomask2 |= infomask2_old_tuple;
3663 HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
3664
3665 /* record address of new tuple in t_ctid of old one */
3666 oldtup.t_data->t_ctid = heaptup->t_self;
3667
3668 /* clear PD_ALL_VISIBLE flags, reset all visibilitymap bits */
3669 if (PageIsAllVisible(BufferGetPage(buffer)))
3670 {
3671 all_visible_cleared = true;
3672 PageClearAllVisible(BufferGetPage(buffer));
3673 visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
3674 vmbuffer, VISIBILITYMAP_VALID_BITS);
3675 }
3676 if (newbuf != buffer && PageIsAllVisible(BufferGetPage(newbuf)))
3677 {
3678 all_visible_cleared_new = true;
3679 PageClearAllVisible(BufferGetPage(newbuf));
3680 visibilitymap_clear(relation, BufferGetBlockNumber(newbuf),
3681 vmbuffer_new, VISIBILITYMAP_VALID_BITS);
3682 }
3683
3684 if (newbuf != buffer)
3685 MarkBufferDirty(newbuf);
3686 MarkBufferDirty(buffer);
3687
3688 /* XLOG stuff */
3689 if (RelationNeedsWAL(relation))
3690 {
3691 XLogRecPtr recptr;
3692
3693 /*
3694 * For logical decoding we need combocids to properly decode the
3695 * catalog.
3696 */
3697 if (RelationIsAccessibleInLogicalDecoding(relation))
3698 {
3699 log_heap_new_cid(relation, &oldtup);
3700 log_heap_new_cid(relation, heaptup);
3701 }
3702
3703 recptr = log_heap_update(relation, buffer,
3704 newbuf, &oldtup, heaptup,
3705 old_key_tuple,
3706 all_visible_cleared,
3707 all_visible_cleared_new);
3708 if (newbuf != buffer)
3709 {
3710 PageSetLSN(BufferGetPage(newbuf), recptr);
3711 }
3712 PageSetLSN(BufferGetPage(buffer), recptr);
3713 }
3714
3715 END_CRIT_SECTION();
3716
3717 if (newbuf != buffer)
3718 LockBuffer(newbuf, BUFFER_LOCK_UNLOCK);
3719 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3720
3721 /*
3722 * Mark old tuple for invalidation from system caches at next command
3723 * boundary, and mark the new tuple for invalidation in case we abort. We
3724 * have to do this before releasing the buffer because oldtup is in the
3725 * buffer. (heaptup is all in local memory, but it's necessary to process
3726 * both tuple versions in one call to inval.c so we can avoid redundant
3727 * sinval messages.)
3728 */
3729 CacheInvalidateHeapTuple(relation, &oldtup, heaptup);
3730
3731 /* Now we can release the buffer(s) */
3732 if (newbuf != buffer)
3733 ReleaseBuffer(newbuf);
3734 ReleaseBuffer(buffer);
3735 if (BufferIsValid(vmbuffer_new))
3736 ReleaseBuffer(vmbuffer_new);
3737 if (BufferIsValid(vmbuffer))
3738 ReleaseBuffer(vmbuffer);
3739
3740 /*
3741 * Release the lmgr tuple lock, if we had it.
3742 */
3743 if (have_tuple_lock)
3744 UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
3745
3746 pgstat_count_heap_update(relation, use_hot_update);
3747
3748 /*
3749 * If heaptup is a private copy, release it. Don't forget to copy t_self
3750 * back to the caller's image, too.
3751 */
3752 if (heaptup != newtup)
3753 {
3754 newtup->t_self = heaptup->t_self;
3755 heap_freetuple(heaptup);
3756 }
3757
3758 if (old_key_tuple != NULL && old_key_copied)
3759 heap_freetuple(old_key_tuple);
3760
3761 bms_free(hot_attrs);
3762 bms_free(key_attrs);
3763 bms_free(id_attrs);
3764 bms_free(modified_attrs);
3765 bms_free(interesting_attrs);
3766
3767 return TM_Ok;
3768}
3769
3770/*
3771  * Check if the specified attribute's value is the same in both given tuples.
3772 * Subroutine for HeapDetermineModifiedColumns.
3773 */
3774static bool
3775heap_tuple_attr_equals(TupleDesc tupdesc, int attrnum,
3776 HeapTuple tup1, HeapTuple tup2)
3777{
3778 Datum value1,
3779 value2;
3780 bool isnull1,
3781 isnull2;
3782 Form_pg_attribute att;
3783
3784 /*
3785 * If it's a whole-tuple reference, say "not equal". It's not really
3786 * worth supporting this case, since it could only succeed after a no-op
3787 * update, which is hardly a case worth optimizing for.
3788 */
3789 if (attrnum == 0)
3790 return false;
3791
3792 /*
3793 * Likewise, automatically say "not equal" for any system attribute other
3794 * than tableOID; we cannot expect these to be consistent in a HOT chain,
3795 * or even to be set correctly yet in the new tuple.
3796 */
3797 if (attrnum < 0)
3798 {
3799 if (attrnum != TableOidAttributeNumber)
3800 return false;
3801 }
3802
3803 /*
3804 * Extract the corresponding values. XXX this is pretty inefficient if
3805 * there are many indexed columns. Should HeapDetermineModifiedColumns do
3806 * a single heap_deform_tuple call on each tuple, instead? But that
3807 * doesn't work for system columns ...
3808 */
3809 value1 = heap_getattr(tup1, attrnum, tupdesc, &isnull1);
3810 value2 = heap_getattr(tup2, attrnum, tupdesc, &isnull2);
3811
3812 /*
3813 	 * If one value is NULL and the other is not, then they are certainly not
3814 * equal
3815 */
3816 if (isnull1 != isnull2)
3817 return false;
3818
3819 /*
3820 * If both are NULL, they can be considered equal.
3821 */
3822 if (isnull1)
3823 return true;
3824
3825 /*
3826 * We do simple binary comparison of the two datums. This may be overly
3827 * strict because there can be multiple binary representations for the
3828 * same logical value. But we should be OK as long as there are no false
3829 * positives. Using a type-specific equality operator is messy because
3830 * there could be multiple notions of equality in different operator
3831 * classes; furthermore, we cannot safely invoke user-defined functions
3832 * while holding exclusive buffer lock.
3833 */
3834 if (attrnum <= 0)
3835 {
3836 /* The only allowed system columns are OIDs, so do this */
3837 return (DatumGetObjectId(value1) == DatumGetObjectId(value2));
3838 }
3839 else
3840 {
3841 Assert(attrnum <= tupdesc->natts);
3842 att = TupleDescAttr(tupdesc, attrnum - 1);
3843 return datumIsEqual(value1, value2, att->attbyval, att->attlen);
3844 }
3845}
3846
3847/*
3848 * Check which columns are being updated.
3849 *
3850 * Given an updated tuple, determine (and return into the output bitmapset),
3851 * from those listed as interesting, the set of columns that changed.
3852 *
3853 * The input bitmapset is destructively modified; that is OK since this is
3854 * invoked at most once in heap_update.
3855 */
3856static Bitmapset *
3857HeapDetermineModifiedColumns(Relation relation, Bitmapset *interesting_cols,
3858 HeapTuple oldtup, HeapTuple newtup)
3859{
3860 int attnum;
3861 Bitmapset *modified = NULL;
3862
3863 while ((attnum = bms_first_member(interesting_cols)) >= 0)
3864 {
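		/*
		 * Bitmapset members are stored offset by
		 * FirstLowInvalidHeapAttributeNumber so that system attributes
		 * (which have negative attribute numbers) can be represented; undo
		 * that offset to recover the real attribute number.
		 */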
3865 attnum += FirstLowInvalidHeapAttributeNumber;
3866
3867 if (!heap_tuple_attr_equals(RelationGetDescr(relation),
3868 attnum, oldtup, newtup))
3869 modified = bms_add_member(modified,
3870 attnum - FirstLowInvalidHeapAttributeNumber);
3871 }
3872
3873 return modified;
3874}
3875
3876/*
3877 * simple_heap_update - replace a tuple
3878 *
3879 * This routine may be used to update a tuple when concurrent updates of
3880 * the target tuple are not expected (for example, because we have a lock
3881 * on the relation associated with the tuple). Any failure is reported
3882 * via ereport().
3883 */
3884void
3885simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup)
3886{
3887 TM_Result result;
3888 TM_FailureData tmfd;
3889 LockTupleMode lockmode;
3890
3891 result = heap_update(relation, otid, tup,
3892 GetCurrentCommandId(true), InvalidSnapshot,
3893 true /* wait for commit */ ,
3894 &tmfd, &lockmode);
3895 switch (result)
3896 {
3897 case TM_SelfModified:
3898 /* Tuple was already updated in current command? */
3899 elog(ERROR, "tuple already updated by self");
3900 break;
3901
3902 case TM_Ok:
3903 /* done successfully */
3904 break;
3905
3906 case TM_Updated:
3907 elog(ERROR, "tuple concurrently updated");
3908 break;
3909
3910 case TM_Deleted:
3911 elog(ERROR, "tuple concurrently deleted");
3912 break;
3913
3914 default:
3915 elog(ERROR, "unrecognized heap_update status: %u", result);
3916 break;
3917 }
3918}
3919
3920
3921/*
3922 * Return the MultiXactStatus corresponding to the given tuple lock mode.
3923 */
3924static MultiXactStatus
3925get_mxact_status_for_lock(LockTupleMode mode, bool is_update)
3926{
3927 int retval;
3928
3929 if (is_update)
3930 retval = tupleLockExtraInfo[mode].updstatus;
3931 else
3932 retval = tupleLockExtraInfo[mode].lockstatus;
3933
3934 if (retval == -1)
3935 elog(ERROR, "invalid lock tuple mode %d/%s", mode,
3936 is_update ? "true" : "false");
3937
3938 return (MultiXactStatus) retval;
3939}
3940
3941/*
3942 * heap_lock_tuple - lock a tuple in shared or exclusive mode
3943 *
3944 * Note that this acquires a buffer pin, which the caller must release.
3945 *
3946 * Input parameters:
3947 * relation: relation containing tuple (caller must hold suitable lock)
3948 * tid: TID of tuple to lock
3949 * cid: current command ID (used for visibility test, and stored into
3950 * tuple's cmax if lock is successful)
3951 * mode: indicates if shared or exclusive tuple lock is desired
3952 * wait_policy: what to do if tuple lock is not available
3953 * follow_updates: if true, follow the update chain to also lock descendant
3954 * tuples.
3955 *
3956 * Output parameters:
3957 * *tuple: all fields filled in
3958 * *buffer: set to buffer holding tuple (pinned but not locked at exit)
3959 * *tmfd: filled in failure cases (see below)
3960 *
3961 * Function results are the same as the ones for table_tuple_lock().
3962 *
3963 * In the failure cases other than TM_Invisible, the routine fills
3964 * *tmfd with the tuple's t_ctid, t_xmax (resolving a possible MultiXact,
3965 * if necessary), and t_cmax (the last only for TM_SelfModified,
3966 * since we cannot obtain cmax from a combocid generated by another
3967 * transaction).
3968 * See comments for struct TM_FailureData for additional info.
3969 *
3970 * See README.tuplock for a thorough explanation of this mechanism.
3971 */
3972TM_Result
3973heap_lock_tuple(Relation relation, HeapTuple tuple,
3974 CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy,
3975 bool follow_updates,
3976 Buffer *buffer, TM_FailureData *tmfd)
3977{
3978 TM_Result result;
3979 ItemPointer tid = &(tuple->t_self);
3980 ItemId lp;
3981 Page page;
3982 Buffer vmbuffer = InvalidBuffer;
3983 BlockNumber block;
3984 TransactionId xid,
3985 xmax;
3986 uint16 old_infomask,
3987 new_infomask,
3988 new_infomask2;
3989 bool first_time = true;
3990 bool skip_tuple_lock = false;
3991 bool have_tuple_lock = false;
3992 bool cleared_all_frozen = false;
3993
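	/*
	 * Pin the buffer containing the target tuple.  We postpone taking the
	 * content lock until after we've (possibly) pinned the visibility map
	 * page below.
	 */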
3994 *buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
3995 block = ItemPointerGetBlockNumber(tid);
3996
3997 /*
3998 * Before locking the buffer, pin the visibility map page if it appears to
3999 * be necessary. Since we haven't got the lock yet, someone else might be
4000 * in the middle of changing this, so we'll need to recheck after we have
4001 * the lock.
4002 */
4003 if (PageIsAllVisible(BufferGetPage(*buffer)))
4004 visibilitymap_pin(relation, block, &vmbuffer);
4005
4006 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4007
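	/*
	 * Now that we hold the exclusive content lock, point the caller's tuple
	 * at the on-page item and fill in its length and table OID; the
	 * visibility check at l3 works on this in-buffer data.
	 */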
4008 page = BufferGetPage(*buffer);
4009 lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
4010 Assert(ItemIdIsNormal(lp));
4011
4012 tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
4013 tuple->t_len = ItemIdGetLength(lp);
4014 tuple->t_tableOid = RelationGetRelid(relation);
4015
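	/*
	 * l3 is our restart point: whenever the tuple's xmax or lock state may
	 * have changed while we didn't hold the buffer content lock, we jump
	 * back here (with the lock re-acquired) and redo the visibility check.
	 */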
4016l3:
4017 result = HeapTupleSatisfiesUpdate(tuple, cid, *buffer);
4018
4019 if (result == TM_Invisible)
4020 {
4021 /*
4022 * This is possible, but only when locking a tuple for ON CONFLICT
4023 * UPDATE. We return this value here rather than throwing an error in
4024 * order to give that case the opportunity to throw a more specific
4025 * error.
4026 */
4027 result = TM_Invisible;
4028 goto out_locked;
4029 }
4030 else if (result == TM_BeingModified ||
4031 result == TM_Updated ||
4032 result == TM_Deleted)
4033 {
4034 TransactionId xwait;
4035 uint16 infomask;
4036 uint16 infomask2;
4037 bool require_sleep;
4038 ItemPointerData t_ctid;
4039
4040 /* must copy state data before unlocking buffer */
4041 xwait = HeapTupleHeaderGetRawXmax(tuple->t_data);
4042 infomask = tuple->t_data->t_infomask;
4043 infomask2 = tuple->t_data->t_infomask2;
4044 ItemPointerCopy(&tuple->t_data->t_ctid, &t_ctid);
4045
4046 LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
4047
4048 /*
4049 * If any subtransaction of the current top transaction already holds
4050 * a lock as strong as or stronger than what we're requesting, we
4051 * effectively hold the desired lock already. We *must* succeed
4052 * without trying to take the tuple lock, else we will deadlock
4053 * against anyone wanting to acquire a stronger lock.
4054 *
4055 		 * Note we only do this the first time we loop on the result of
4056 		 * HeapTupleSatisfiesUpdate; there is no point in testing on
4057 		 * subsequent passes, because evidently our own transaction cannot
4058 		 * have acquired a new lock after the first time we checked.
4059 */
4060 if (first_time)
4061 {
4062 first_time = false;
4063
4064 if (infomask & HEAP_XMAX_IS_MULTI)
4065 {
4066 int i;
4067 int nmembers;
4068 MultiXactMember *members;
4069
4070 /*
4071 * We don't need to allow old multixacts here; if that had
4072 * been the case, HeapTupleSatisfiesUpdate would have returned
4073 				 * TM_Ok and we wouldn't be here.
4074 */
4075 nmembers =
4076 GetMultiXactIdMembers(xwait, &members, false,
4077 HEAP_XMAX_IS_LOCKED_ONLY(infomask));
4078
4079 for (i = 0; i < nmembers; i++)
4080 {
4081 /* only consider members of our own transaction */
4082 if (!TransactionIdIsCurrentTransactionId(members[i].xid))
4083 continue;
4084
4085 if (TUPLOCK_from_mxstatus(members[i].status) >= mode)
4086 {
4087 pfree(members);
4088 result = TM_Ok;
4089 goto out_unlocked;
4090 }
4091 else
4092 {
4093 /*
4094 * Disable acquisition of the heavyweight tuple lock.
4095 * Otherwise, when promoting a weaker lock, we might
4096 * deadlock with another locker that has acquired the
4097 * heavyweight tuple lock and is waiting for our
4098 * transaction to finish.
4099 *
4100 * Note that in this case we still need to wait for
4101 * the multixact if required, to avoid acquiring
4102 * conflicting locks.
4103 */
4104 skip_tuple_lock = true;
4105 }
4106 }
4107
4108 if (members)
4109 pfree(members);
4110 }
4111 else if (TransactionIdIsCurrentTransactionId(xwait))
4112 {
4113 switch (mode)
4114 {
4115 case LockTupleKeyShare:
4116 Assert(HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) ||
4117 HEAP_XMAX_IS_SHR_LOCKED(infomask) ||
4118 HEAP_XMAX_IS_EXCL_LOCKED(infomask));
4119 result = TM_Ok;
4120 goto out_unlocked;
4121 case LockTupleShare:
4122 if (HEAP_XMAX_IS_SHR_LOCKED(infomask) ||
4123 HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4124 {
4125 result = TM_Ok;
4126 goto out_unlocked;
4127 }
4128 break;
4129 case LockTupleNoKeyExclusive:
4130 if (HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4131 {
4132 result = TM_Ok;
4133 goto out_unlocked;
4134 }
4135 break;
4136 case LockTupleExclusive:
4137 if (HEAP_XMAX_IS_EXCL_LOCKED(infomask) &&
4138 infomask2 & HEAP_KEYS_UPDATED)
4139 {
4140 result = TM_Ok;
4141 goto out_unlocked;
4142 }
4143 break;
4144 }
4145 }
4146 }
4147
4148 /*
4149 * Initially assume that we will have to wait for the locking
4150 * transaction(s) to finish. We check various cases below in which
4151 * this can be turned off.
4152 */
4153 require_sleep = true;
4154 if (mode == LockTupleKeyShare)
4155 {
4156 /*
4157 * If we're requesting KeyShare, and there's no update present, we
4158 * don't need to wait. Even if there is an update, we can still
4159 * continue if the key hasn't been modified.
4160 *
4161 * However, if there are updates, we need to walk the update chain
4162 * to mark future versions of the row as locked, too. That way,
4163 * if somebody deletes that future version, we're protected
4164 * against the key going away. This locking of future versions
4165 * could block momentarily, if a concurrent transaction is
4166 * deleting a key; or it could return a value to the effect that
4167 * the transaction deleting the key has already committed. So we
4168 * do this before re-locking the buffer; otherwise this would be
4169 * prone to deadlocks.
4170 *
4171 * Note that the TID we're locking was grabbed before we unlocked
4172 * the buffer. For it to change while we're not looking, the
4173 * other properties we're testing for below after re-locking the
4174 * buffer would also change, in which case we would restart this
4175 * loop above.
4176 */
4177 if (!(infomask2 & HEAP_KEYS_UPDATED))
4178 {
4179 bool updated;
4180
4181 updated = !HEAP_XMAX_IS_LOCKED_ONLY(infomask);
4182
4183 /*
4184 * If there are updates, follow the update chain; bail out if
4185 * that cannot be done.
4186 */
4187 if (follow_updates && updated)
4188 {
4189 TM_Result res;
4190
4191 res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
4192 GetCurrentTransactionId(),
4193 mode);
4194 if (res != TM_Ok)
4195 {
4196 result = res;
4197 /* recovery code expects to have buffer lock held */
4198 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4199 goto failed;
4200 }
4201 }
4202
4203 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4204
4205 /*
4206 * Make sure it's still an appropriate lock, else start over.
4207 * Also, if it wasn't updated before we released the lock, but
4208 * is updated now, we start over too; the reason is that we
4209 * now need to follow the update chain to lock the new
4210 * versions.
4211 */
4212 if (!HeapTupleHeaderIsOnlyLocked(tuple->t_data) &&
4213 ((tuple->t_data->t_infomask2 & HEAP_KEYS_UPDATED) ||
4214 !updated))
4215 goto l3;
4216
4217 /* Things look okay, so we can skip sleeping */
4218 require_sleep = false;
4219
4220 /*
4221 * Note we allow Xmax to change here; other updaters/lockers
4222 * could have modified it before we grabbed the buffer lock.
4223 * However, this is not a problem, because with the recheck we
4224 * just did we ensure that they still don't conflict with the
4225 * lock we want.
4226 */
4227 }
4228 }
4229 else if (mode == LockTupleShare)
4230 {
4231 /*
4232 * If we're requesting Share, we can similarly avoid sleeping if
4233 * there's no update and no exclusive lock present.
4234 */
4235 if (HEAP_XMAX_IS_LOCKED_ONLY(infomask) &&
4236 !HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4237 {
4238 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4239
4240 /*
4241 * Make sure it's still an appropriate lock, else start over.
4242 * See above about allowing xmax to change.
4243 */
4244 if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) ||
4245 HEAP_XMAX_IS_EXCL_LOCKED(tuple->t_data->t_infomask))
4246 goto l3;
4247 require_sleep = false;
4248 }
4249 }
4250 else if (mode == LockTupleNoKeyExclusive)
4251 {
4252 /*
4253 * If we're requesting NoKeyExclusive, we might also be able to
4254 			 * avoid sleeping; just ensure that there is no conflicting lock
4255 * already acquired.
4256 */
4257 if (infomask & HEAP_XMAX_IS_MULTI)
4258 {
4259 if (!DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
4260 mode, NULL))
4261 {
4262 /*
4263 * No conflict, but if the xmax changed under us in the
4264 * meantime, start over.
4265 */
4266 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4267 if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4268 !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4269 xwait))
4270 goto l3;
4271
4272 /* otherwise, we're good */
4273 require_sleep = false;
4274 }
4275 }
4276 else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask))
4277 {
4278 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4279
4280 /* if the xmax changed in the meantime, start over */
4281 if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4282 !TransactionIdEquals(
4283 HeapTupleHeaderGetRawXmax(tuple->t_data),
4284 xwait))
4285 goto l3;
4286 /* otherwise, we're good */
4287 require_sleep = false;
4288 }
4289 }
4290
4291 /*
4292 * As a check independent from those above, we can also avoid sleeping
4293 * if the current transaction is the sole locker of the tuple. Note
4294 * that the strength of the lock already held is irrelevant; this is
4295 * not about recording the lock in Xmax (which will be done regardless
4296 * of this optimization, below). Also, note that the cases where we
4297 * hold a lock stronger than we are requesting are already handled
4298 * above by not doing anything.
4299 *
4300 * Note we only deal with the non-multixact case here; MultiXactIdWait
4301 * is well equipped to deal with this situation on its own.
4302 */
4303 if (require_sleep && !(infomask & HEAP_XMAX_IS_MULTI) &&
4304 TransactionIdIsCurrentTransactionId(xwait))
4305 {
4306 /* ... but if the xmax changed in the meantime, start over */
4307 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4308 if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4309 !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4310 xwait))
4311 goto l3;
4312 Assert(HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask));
4313 require_sleep = false;
4314 }
4315
4316 /*
4317 * Time to sleep on the other transaction/multixact, if necessary.
4318 *
4319 * If the other transaction is an update/delete that's already
4320 * committed, then sleeping cannot possibly do any good: if we're
4321 * required to sleep, get out to raise an error instead.
4322 *
4323 * By here, we either have already acquired the buffer exclusive lock,
4324 * or we must wait for the locking transaction or multixact; so below
4325 * we ensure that we grab buffer lock after the sleep.
4326 */
4327 if (require_sleep && (result == TM_Updated || result == TM_Deleted))
4328 {
4329 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4330 goto failed;
4331 }
4332 else if (require_sleep)
4333 {
4334 /*
4335 * Acquire tuple lock to establish our priority for the tuple, or
4336 * die trying. LockTuple will release us when we are next-in-line
4337 * for the tuple. We must do this even if we are share-locking,
4338 * but not if we already have a weaker lock on the tuple.
4339 *
4340 * If we are forced to "start over" below, we keep the tuple lock;
4341 * this arranges that we stay at the head of the line while
4342 * rechecking tuple state.
4343 */
4344 if (!skip_tuple_lock &&
4345 !heap_acquire_tuplock(relation, tid, mode, wait_policy,
4346 &have_tuple_lock))
4347 {
4348 /*
4349 * This can only happen if wait_policy is Skip and the lock
4350 * couldn't be obtained.
4351 */
4352 result = TM_WouldBlock;
4353 /* recovery code expects to have buffer lock held */
4354 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4355 goto failed;
4356 }
4357
4358 if (infomask & HEAP_XMAX_IS_MULTI)
4359 {
4360 MultiXactStatus status = get_mxact_status_for_lock(mode, false);
4361
4362 /* We only ever lock tuples, never update them */
4363 if (status >= MultiXactStatusNoKeyUpdate)
4364 elog(ERROR, "invalid lock mode in heap_lock_tuple");
4365
4366 /* wait for multixact to end, or die trying */
4367 switch (wait_policy)
4368 {
4369 case LockWaitBlock:
4370 MultiXactIdWait((MultiXactId) xwait, status, infomask,
4371 relation, &tuple->t_self, XLTW_Lock, NULL);
4372 break;
4373 case LockWaitSkip:
4374 if (!ConditionalMultiXactIdWait((MultiXactId) xwait,
4375 status, infomask, relation,
4376 NULL))
4377 {
4378 result = TM_WouldBlock;
4379 /* recovery code expects to have buffer lock held */
4380 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4381 goto failed;
4382 }
4383 break;
4384 case LockWaitError:
4385 if (!ConditionalMultiXactIdWait((MultiXactId) xwait,
4386 status, infomask, relation,
4387 NULL))
4388 ereport(ERROR,
4389 (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
4390 errmsg("could not obtain lock on row in relation \"%s\"",
4391 RelationGetRelationName(relation))));
4392
4393 break;
4394 }
4395
4396 /*
4397 * Of course, the multixact might not be done here: if we're
4398 * requesting a light lock mode, other transactions with light
4399 * locks could still be alive, as well as locks owned by our
4400 * own xact or other subxacts of this backend. We need to
4401 * preserve the surviving MultiXact members. Note that it
4402 * isn't absolutely necessary in the latter case, but doing so
4403 * is simpler.
4404 */
4405 }
4406 else
4407 {
4408 /* wait for regular transaction to end, or die trying */
4409 switch (wait_policy)
4410 {
4411 case LockWaitBlock:
4412 XactLockTableWait(xwait, relation, &tuple->t_self,
4413 XLTW_Lock);
4414 break;
4415 case LockWaitSkip:
4416 if (!ConditionalXactLockTableWait(xwait))
4417 {
4418 result = TM_WouldBlock;
4419 /* recovery code expects to have buffer lock held */
4420 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4421 goto failed;
4422 }
4423 break;
4424 case LockWaitError:
4425 if (!ConditionalXactLockTableWait(xwait))
4426 ereport(ERROR,
4427 (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
4428 errmsg("could not obtain lock on row in relation \"%s\"",
4429 RelationGetRelationName(relation))));
4430 break;
4431 }
4432 }
4433
4434 /* if there are updates, follow the update chain */
4435 if (follow_updates && !HEAP_XMAX_IS_LOCKED_ONLY(infomask))
4436 {
4437 TM_Result res;
4438
4439 res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
4440 GetCurrentTransactionId(),
4441 mode);
4442 if (res != TM_Ok)
4443 {
4444 result = res;
4445 /* recovery code expects to have buffer lock held */
4446 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4447 goto failed;
4448 }
4449 }
4450
4451 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4452
4453 /*
4454 * xwait is done, but if xwait had just locked the tuple then some
4455 * other xact could update this tuple before we get to this point.
4456 * Check for xmax change, and start over if so.
4457 */
4458 if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4459 !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4460 xwait))
4461 goto l3;
4462
4463 if (!(infomask & HEAP_XMAX_IS_MULTI))
4464 {
4465 /*
4466 * Otherwise check if it committed or aborted. Note we cannot
4467 * be here if the tuple was only locked by somebody who didn't
4468 * conflict with us; that would have been handled above. So
4469 * that transaction must necessarily be gone by now. But
4470 * don't check for this in the multixact case, because some
4471 * locker transactions might still be running.
4472 */
4473 UpdateXmaxHintBits(tuple->t_data, *buffer, xwait);
4474 }
4475 }
4476
4477 /* By here, we're certain that we hold buffer exclusive lock again */
4478
4479 /*
4480 * We may lock if previous xmax aborted, or if it committed but only
4481 * locked the tuple without updating it; or if we didn't have to wait
4482 * at all for whatever reason.
4483 */
4484 if (!require_sleep ||
4485 (tuple->t_data->t_infomask & HEAP_XMAX_INVALID) ||
4486 HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) ||
4487 HeapTupleHeaderIsOnlyLocked(tuple->t_data))
4488 result = TM_Ok;
4489 else if (!ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid) ||
4490 HeapTupleHeaderIndicatesMovedPartitions(tuple->t_data))
4491 result = TM_Updated;
4492 else
4493 result = TM_Deleted;
4494 }
4495
4496failed:
4497 if (result != TM_Ok)
4498 {
4499 Assert(result == TM_SelfModified || result == TM_Updated ||
4500 result == TM_Deleted || result == TM_WouldBlock);
4501 Assert(!(tuple->t_data->t_infomask & HEAP_XMAX_INVALID));
4502 Assert(result != TM_Updated ||
4503 !ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid));
4504 tmfd->ctid = tuple->t_data->t_ctid;
4505 tmfd->xmax = HeapTupleHeaderGetUpdateXid(tuple->t_data);
4506 if (result == TM_SelfModified)
4507 tmfd->cmax = HeapTupleHeaderGetCmax(tuple->t_data);
4508 else
4509 tmfd->cmax = InvalidCommandId;
4510 goto out_locked;
4511 }
4512
4513 /*
4514 * If we didn't pin the visibility map page and the page has become all
4515 * visible while we were busy locking the buffer, or during some
4516 	 * subsequent window while we had it unlocked, we'll have to unlock
4517 * and re-lock, to avoid holding the buffer lock across I/O. That's a bit
4518 * unfortunate, especially since we'll now have to recheck whether the
4519 * tuple has been locked or updated under us, but hopefully it won't
4520 * happen very often.
4521 */
4522 if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
4523 {
4524 LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
4525 visibilitymap_pin(relation, block, &vmbuffer);
4526 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4527 goto l3;
4528 }
4529
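	/*
	 * Capture the tuple's current raw xmax and infomask; together with the
	 * requested lock mode, these are the inputs from which
	 * compute_new_xmax_infomask derives the new lock state below.
	 */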
4530 xmax = HeapTupleHeaderGetRawXmax(tuple->t_data);
4531 old_infomask = tuple->t_data->t_infomask;
4532
4533 /*
4534 * If this is the first possibly-multixact-able operation in the current
4535 * transaction, set my per-backend OldestMemberMXactId setting. We can be
4536 * certain that the transaction will never become a member of any older
4537 * MultiXactIds than that. (We have to do this even if we end up just
4538 * using our own TransactionId below, since some other backend could
4539 * incorporate our XID into a MultiXact immediately afterwards.)
4540 */
4541 MultiXactIdSetOldestMember();
4542
4543 /*
4544 * Compute the new xmax and infomask to store into the tuple. Note we do
4545 * not modify the tuple just yet, because that would leave it in the wrong
4546 * state if multixact.c elogs.
4547 */
4548 compute_new_xmax_infomask(xmax, old_infomask, tuple->t_data->t_infomask2,
4549 GetCurrentTransactionId(), mode, false,
4550 &xid, &new_infomask, &new_infomask2);
4551
4552 START_CRIT_SECTION();
4553
4554 /*
4555 * Store transaction information of xact locking the tuple.
4556 *
4557 * Note: Cmax is meaningless in this context, so don't set it; this avoids
4558 * possibly generating a useless combo CID. Moreover, if we're locking a
4559 * previously updated tuple, it's important to preserve the Cmax.
4560 *
4561 * Also reset the HOT UPDATE bit, but only if there's no update; otherwise
4562 * we would break the HOT chain.
4563 */
4564 tuple->t_data->t_infomask &= ~HEAP_XMAX_BITS;
4565 tuple->t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
4566 tuple->t_data->t_infomask |= new_infomask;
4567 tuple->t_data->t_infomask2 |= new_infomask2;
4568 if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask))
4569 HeapTupleHeaderClearHotUpdated(tuple->t_data);
4570 HeapTupleHeaderSetXmax(tuple->t_data, xid);
4571
4572 /*
4573 * Make sure there is no forward chain link in t_ctid. Note that in the
4574 * cases where the tuple has been updated, we must not overwrite t_ctid,
4575 * because it was set by the updater. Moreover, if the tuple has been
4576 * updated, we need to follow the update chain to lock the new versions of
4577 * the tuple as well.
4578 */
4579 if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask))
4580 tuple->t_data->t_ctid = *tid;
4581
4582 /* Clear only the all-frozen bit on visibility map if needed */
4583 if (PageIsAllVisible(page) &&
4584 visibilitymap_clear(relation, block, vmbuffer,
4585 VISIBILITYMAP_ALL_FROZEN))
4586 cleared_all_frozen = true;
4587
4588
4589 MarkBufferDirty(*buffer);
4590
4591 /*
4592 * XLOG stuff. You might think that we don't need an XLOG record because
4593 * there is no state change worth restoring after a crash. You would be
4594 * wrong however: we have just written either a TransactionId or a
4595 * MultiXactId that may never have been seen on disk before, and we need
4596 * to make sure that there are XLOG entries covering those ID numbers.
4597 * Else the same IDs might be re-used after a crash, which would be
4598 * disastrous if this page made it to disk before the crash. Essentially
4599 * we have to enforce the WAL log-before-data rule even in this case.
4600 * (Also, in a PITR log-shipping or 2PC environment, we have to have XLOG
4601 * entries for everything anyway.)
4602 */
4603 if (RelationNeedsWAL(relation))
4604 {
4605 xl_heap_lock xlrec;
4606 XLogRecPtr recptr;
4607
4608 XLogBeginInsert();
4609 XLogRegisterBuffer(0, *buffer, REGBUF_STANDARD);
4610
4611 xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
4612 xlrec.locking_xid = xid;
4613 xlrec.infobits_set = compute_infobits(new_infomask,
4614 tuple->t_data->t_infomask2);
4615 xlrec.flags = cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
4616 XLogRegisterData((char *) &xlrec, SizeOfHeapLock);
4617
4618 /* we don't decode row locks atm, so no need to log the origin */
4619
4620 recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK);
4621
4622 PageSetLSN(page, recptr);
4623 }
4624
4625 END_CRIT_SECTION();
4626
4627 result = TM_Ok;
4628
4629out_locked:
4630 LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
4631
4632out_unlocked:
4633 if (BufferIsValid(vmbuffer))
4634 ReleaseBuffer(vmbuffer);
4635
4636 /*
4637 * Don't update the visibility map here. Locking a tuple doesn't change
4638 * visibility info.
4639 */
4640
4641 /*
4642 * Now that we have successfully marked the tuple as locked, we can
4643 * release the lmgr tuple lock, if we had it.
4644 */
4645 if (have_tuple_lock)
4646 UnlockTupleTuplock(relation, tid, mode);
4647
4648 return result;
4649}
4650
4651/*
4652 * Acquire heavyweight lock on the given tuple, in preparation for acquiring
4653 * its normal, Xmax-based tuple lock.
4654 *
4655 * have_tuple_lock is an input and output parameter: on input, it indicates
4656 * whether the lock has previously been acquired (and this function does
4657 * nothing in that case). If this function returns success, have_tuple_lock
4658 * has been flipped to true.
4659 *
4660 * Returns false if it was unable to obtain the lock; this can only happen if
4661 * wait_policy is Skip.
4662 */
4663static bool
4664heap_acquire_tuplock(Relation relation, ItemPointer tid, LockTupleMode mode,
4665 LockWaitPolicy wait_policy, bool *have_tuple_lock)
4666{
4667 if (*have_tuple_lock)
4668 return true;
4669
4670 switch (wait_policy)
4671 {
4672 case LockWaitBlock:
4673 LockTupleTuplock(relation, tid, mode);
4674 break;
4675
4676 case LockWaitSkip:
4677 if (!ConditionalLockTupleTuplock(relation, tid, mode))
4678 return false;
4679 break;
4680
4681 case LockWaitError:
4682 if (!ConditionalLockTupleTuplock(relation, tid, mode))
4683 ereport(ERROR,
4684 (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
4685 errmsg("could not obtain lock on row in relation \"%s\"",
4686 RelationGetRelationName(relation))));
4687 break;
4688 }
4689 *have_tuple_lock = true;
4690
4691 return true;
4692}
4693
4694/*
4695 * Given an original set of Xmax and infomask, and a transaction (identified by
4696 * add_to_xmax) acquiring a new lock of some mode, compute the new Xmax and
4697 * corresponding infomasks to use on the tuple.
4698 *
4699 * Note that this might have side effects such as creating a new MultiXactId.
4700 *
4701 * Most callers will have called HeapTupleSatisfiesUpdate before this function;
4702 * that will have set the HEAP_XMAX_INVALID bit if the xmax was a MultiXactId
4703 * but it was not running anymore. There is a race condition, which is that the
4704 * MultiXactId may have finished since then, but that uncommon case is handled
4705 * either here, or within MultiXactIdExpand.
4706 *
4707 * There is a similar race condition possible when the old xmax was a regular
4708 * TransactionId. We test TransactionIdIsInProgress again just to narrow the
4709 * window, but it's still possible to end up creating an unnecessary
4710 * MultiXactId. Fortunately this is harmless.
4711 */
4712static void
4713compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
4714 uint16 old_infomask2, TransactionId add_to_xmax,
4715 LockTupleMode mode, bool is_update,
4716 TransactionId *result_xmax, uint16 *result_infomask,
4717 uint16 *result_infomask2)
4718{
4719 TransactionId new_xmax;
4720 uint16 new_infomask,
4721 new_infomask2;
4722
4723 Assert(TransactionIdIsCurrentTransactionId(add_to_xmax));
4724
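	/*
	 * Several of the branches below simplify the situation (for instance by
	 * marking the old xmax invalid, or by strengthening the requested lock
	 * mode) and then jump back to l5 to redo the computation with the
	 * adjusted inputs.
	 */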
4725l5:
4726 new_infomask = 0;
4727 new_infomask2 = 0;
4728 if (old_infomask & HEAP_XMAX_INVALID)
4729 {
4730 /*
4731 * No previous locker; we just insert our own TransactionId.
4732 *
4733 * Note that it's critical that this case be the first one checked,
4734 * because there are several blocks below that come back to this one
4735 * to implement certain optimizations; old_infomask might contain
4736 * other dirty bits in those cases, but we don't really care.
4737 */
4738 if (is_update)
4739 {
4740 new_xmax = add_to_xmax;
4741 if (mode == LockTupleExclusive)
4742 new_infomask2 |= HEAP_KEYS_UPDATED;
4743 }
4744 else
4745 {
4746 new_infomask |= HEAP_XMAX_LOCK_ONLY;
4747 switch (mode)
4748 {
4749 case LockTupleKeyShare:
4750 new_xmax = add_to_xmax;
4751 new_infomask |= HEAP_XMAX_KEYSHR_LOCK;
4752 break;
4753 case LockTupleShare:
4754 new_xmax = add_to_xmax;
4755 new_infomask |= HEAP_XMAX_SHR_LOCK;
4756 break;
4757 case LockTupleNoKeyExclusive:
4758 new_xmax = add_to_xmax;
4759 new_infomask |= HEAP_XMAX_EXCL_LOCK;
4760 break;
4761 case LockTupleExclusive:
4762 new_xmax = add_to_xmax;
4763 new_infomask |= HEAP_XMAX_EXCL_LOCK;
4764 new_infomask2 |= HEAP_KEYS_UPDATED;
4765 break;
4766 default:
4767 new_xmax = InvalidTransactionId; /* silence compiler */
4768 elog(ERROR, "invalid lock mode");
4769 }
4770 }
4771 }
4772 else if (old_infomask & HEAP_XMAX_IS_MULTI)
4773 {
4774 MultiXactStatus new_status;
4775
4776 /*
4777 * Currently we don't allow XMAX_COMMITTED to be set for multis, so
4778 * cross-check.
4779 */
4780 Assert(!(old_infomask & HEAP_XMAX_COMMITTED));
4781
4782 /*
4783 * A multixact together with LOCK_ONLY set but neither lock bit set
4784 * (i.e. a pg_upgraded share locked tuple) cannot possibly be running
4785 * anymore. This check is critical for databases upgraded by
4786 * pg_upgrade; both MultiXactIdIsRunning and MultiXactIdExpand assume
4787 * that such multis are never passed.
4788 */
4789 if (HEAP_LOCKED_UPGRADED(old_infomask))
4790 {
4791 old_infomask &= ~HEAP_XMAX_IS_MULTI;
4792 old_infomask |= HEAP_XMAX_INVALID;
4793 goto l5;
4794 }
4795
4796 /*
4797 * If the XMAX is already a MultiXactId, then we need to expand it to
4798 * include add_to_xmax; but if all the members were lockers and are
4799 * all gone, we can do away with the IS_MULTI bit and just set
4800 * add_to_xmax as the only locker/updater. If all lockers are gone
4801 * and we have an updater that aborted, we can also do without a
4802 * multi.
4803 *
4804 * The cost of doing GetMultiXactIdMembers would be paid by
4805 * MultiXactIdExpand if we weren't to do this, so this check is not
4806 * incurring extra work anyhow.
4807 */
4808 if (!MultiXactIdIsRunning(xmax, HEAP_XMAX_IS_LOCKED_ONLY(old_infomask)))
4809 {
4810 if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) ||
4811 !TransactionIdDidCommit(MultiXactIdGetUpdateXid(xmax,
4812 old_infomask)))
4813 {
4814 /*
4815 * Reset these bits and restart; otherwise fall through to
4816 * create a new multi below.
4817 */
4818 old_infomask &= ~HEAP_XMAX_IS_MULTI;
4819 old_infomask |= HEAP_XMAX_INVALID;
4820 goto l5;
4821 }
4822 }
4823
4824 new_status = get_mxact_status_for_lock(mode, is_update);
4825
4826 new_xmax = MultiXactIdExpand((MultiXactId) xmax, add_to_xmax,
4827 new_status);
4828 GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
4829 }
4830 else if (old_infomask & HEAP_XMAX_COMMITTED)
4831 {
4832 /*
4833 		 * It's a committed update, so we need to preserve it as the updater
4834 		 * of the tuple.
4835 */
4836 MultiXactStatus status;
4837 MultiXactStatus new_status;
4838
4839 if (old_infomask2 & HEAP_KEYS_UPDATED)
4840 status = MultiXactStatusUpdate;
4841 else
4842 status = MultiXactStatusNoKeyUpdate;
4843
4844 new_status = get_mxact_status_for_lock(mode, is_update);
4845
4846 /*
4847 * since it's not running, it's obviously impossible for the old
4848 * updater to be identical to the current one, so we need not check
4849 * for that case as we do in the block above.
4850 */
4851 new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
4852 GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
4853 }
4854 else if (TransactionIdIsInProgress(xmax))
4855 {
4856 /*
4857 * If the XMAX is a valid, in-progress TransactionId, then we need to
4858 * create a new MultiXactId that includes both the old locker or
4859 * updater and our own TransactionId.
4860 */
4861 MultiXactStatus new_status;
4862 MultiXactStatus old_status;
4863 LockTupleMode old_mode;
4864
4865 if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))
4866 {
4867 if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask))
4868 old_status = MultiXactStatusForKeyShare;
4869 else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask))
4870 old_status = MultiXactStatusForShare;
4871 else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask))
4872 {
4873 if (old_infomask2 & HEAP_KEYS_UPDATED)
4874 old_status = MultiXactStatusForUpdate;
4875 else
4876 old_status = MultiXactStatusForNoKeyUpdate;
4877 }
4878 else
4879 {
4880 /*
4881 * LOCK_ONLY can be present alone only when a page has been
4882 * upgraded by pg_upgrade. But in that case,
4883 * TransactionIdIsInProgress() should have returned false. We
4884 * assume it's no longer locked in this case.
4885 */
4886 elog(WARNING, "LOCK_ONLY found for Xid in progress %u", xmax);
4887 old_infomask |= HEAP_XMAX_INVALID;
4888 old_infomask &= ~HEAP_XMAX_LOCK_ONLY;
4889 goto l5;
4890 }
4891 }
4892 else
4893 {
4894 /* it's an update, but which kind? */
4895 if (old_infomask2 & HEAP_KEYS_UPDATED)
4896 old_status = MultiXactStatusUpdate;
4897 else
4898 old_status = MultiXactStatusNoKeyUpdate;
4899 }
4900
4901 old_mode = TUPLOCK_from_mxstatus(old_status);
4902
4903 /*
4904 * If the lock to be acquired is for the same TransactionId as the
4905 * existing lock, there's an optimization possible: consider only the
4906 * strongest of both locks as the only one present, and restart.
4907 */
4908 if (xmax == add_to_xmax)
4909 {
4910 /*
4911 * Note that it's not possible for the original tuple to be
4912 * updated: we wouldn't be here because the tuple would have been
4913 * invisible and we wouldn't try to update it. As a subtlety,
4914 * this code can also run when traversing an update chain to lock
4915 * future versions of a tuple. But we wouldn't be here either,
4916 * because the add_to_xmax would be different from the original
4917 * updater.
4918 */
4919 Assert(HEAP_XMAX_IS_LOCKED_ONLY(old_infomask));
4920
4921 /* acquire the strongest of both */
4922 if (mode < old_mode)
4923 mode = old_mode;
4924 /* mustn't touch is_update */
4925
4926 old_infomask |= HEAP_XMAX_INVALID;
4927 goto l5;
4928 }
4929
4930 /* otherwise, just fall back to creating a new multixact */
4931 new_status = get_mxact_status_for_lock(mode, is_update);
4932 new_xmax = MultiXactIdCreate(xmax, old_status,
4933 add_to_xmax, new_status);
4934 GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
4935 }
4936 else if (!HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) &&
4937 TransactionIdDidCommit(xmax))
4938 {
4939 /*
4940 		 * It's a committed update, so we need to preserve it as the updater
4941 		 * of the tuple.
4942 */
4943 MultiXactStatus status;
4944 MultiXactStatus new_status;
4945
4946 if (old_infomask2 & HEAP_KEYS_UPDATED)
4947 status = MultiXactStatusUpdate;
4948 else
4949 status = MultiXactStatusNoKeyUpdate;
4950
4951 new_status = get_mxact_status_for_lock(mode, is_update);
4952
4953 /*
4954 * since it's not running, it's obviously impossible for the old
4955 * updater to be identical to the current one, so we need not check
4956 * for that case as we do in the block above.
4957 */
4958 new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
4959 GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
4960 }
4961 else
4962 {
4963 /*
4964 * Can get here iff the locking/updating transaction was running when
4965 * the infomask was extracted from the tuple, but finished before
4966 * TransactionIdIsInProgress got to run. Deal with it as if there was
4967 * no locker at all in the first place.
4968 */
4969 old_infomask |= HEAP_XMAX_INVALID;
4970 goto l5;
4971 }
4972
4973 *result_infomask = new_infomask;
4974 *result_infomask2 = new_infomask2;
4975 *result_xmax = new_xmax;
4976}
4977
4978/*
4979 * Subroutine for heap_lock_updated_tuple_rec.
4980 *
4981 * Given a hypothetical multixact status held by the transaction identified
4982 * with the given xid, does the current transaction need to wait, fail, or can
4983 * it continue if it wanted to acquire a lock of the given mode? "needwait"
4984 * is set to true if waiting is necessary; if it can continue, then TM_Ok is
4985 * returned. If the lock is already held by the current transaction, return
4986 * TM_SelfModified. In case of a conflict with another transaction, a
4987 * different HeapTupleSatisfiesUpdate return code is returned.
4988 *
4989 * The held status is said to be hypothetical because it might correspond to a
4990 * lock held by a single Xid, i.e. not a real MultiXactId; we express it this
4991 * way for simplicity of API.
4992 */
4993static TM_Result
4994test_lockmode_for_conflict(MultiXactStatus status, TransactionId xid,
4995 LockTupleMode mode, HeapTuple tup,
4996 bool *needwait)
4997{
4998 MultiXactStatus wantedstatus;
4999
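	/*
	 * Translate the requested tuple lock mode into the MultiXactStatus we
	 * would record if we went ahead, so it can be compared against the
	 * status already held by xid.
	 */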
5000 *needwait = false;
5001 wantedstatus = get_mxact_status_for_lock(mode, false);
5002
5003 /*
5004 * Note: we *must* check TransactionIdIsInProgress before
5005 * TransactionIdDidAbort/Commit; see comment at top of heapam_visibility.c
5006 * for an explanation.
5007 */
5008 if (TransactionIdIsCurrentTransactionId(xid))
5009 {
5010 /*
5011 * The tuple has already been locked by our own transaction. This is
5012 * very rare but can happen if multiple transactions are trying to
5013 * lock an ancient version of the same tuple.
5014 */
5015 return TM_SelfModified;
5016 }
5017 else if (TransactionIdIsInProgress(xid))
5018 {
5019 /*
5020 * If the locking transaction is running, what we do depends on
5021 * whether the lock modes conflict: if they do, then we must wait for
5022 * it to finish; otherwise we can fall through to lock this tuple
5023 * version without waiting.
5024 */
5025 if (DoLockModesConflict(LOCKMODE_from_mxstatus(status),
5026 LOCKMODE_from_mxstatus(wantedstatus)))
5027 {
5028 *needwait = true;
5029 }
5030
5031 /*
5032 * If we set needwait above, then this value doesn't matter;
5033 * otherwise, this value signals to caller that it's okay to proceed.
5034 */
5035 return TM_Ok;
5036 }
5037 else if (TransactionIdDidAbort(xid))
5038 return TM_Ok;
5039 else if (TransactionIdDidCommit(xid))
5040 {
5041 /*
5042 * The other transaction committed. If it was only a locker, then the
5043 * lock is completely gone now and we can return success; but if it
5044 * was an update, then what we do depends on whether the two lock
5045 * modes conflict. If they conflict, then we must report error to
5046 * caller. But if they don't, we can fall through to allow the current
5047 * transaction to lock the tuple.
5048 *
5049 * Note: the reason we worry about ISUPDATE here is because as soon as
5050 * a transaction ends, all its locks are gone and meaningless, and
5051 * thus we can ignore them; whereas its updates persist. In the
5052 * TransactionIdIsInProgress case, above, we don't need to check
5053 * because we know the lock is still "alive" and thus a conflict needs
5054 * always be checked.
5055 */
5056 if (!ISUPDATE_from_mxstatus(status))
5057 return TM_Ok;
5058
5059 if (DoLockModesConflict(LOCKMODE_from_mxstatus(status),
5060 LOCKMODE_from_mxstatus(wantedstatus)))
5061 {
5062 /* bummer */
5063 if (!ItemPointerEquals(&tup->t_self, &tup->t_data->t_ctid) ||
5064 HeapTupleHeaderIndicatesMovedPartitions(tup->t_data))
5065 return TM_Updated;
5066 else
5067 return TM_Deleted;
5068 }
5069
5070 return TM_Ok;
5071 }
5072
5073 /* Not in progress, not aborted, not committed -- must have crashed */
5074 return TM_Ok;
5075}
5076
5077
5078/*
5079 * Recursive part of heap_lock_updated_tuple
5080 *
5081 * Fetch the tuple pointed to by tid in rel, and mark it as locked by the given
5082 * xid with the given mode; if this tuple is updated, recurse to lock the new
5083 * version as well.
5084 */
5085static TM_Result
5086heap_lock_updated_tuple_rec(Relation rel, ItemPointer tid, TransactionId xid,
5087 LockTupleMode mode)
5088{
5089 TM_Result result;
5090 ItemPointerData tupid;
5091 HeapTupleData mytup;
5092 Buffer buf;
5093 uint16 new_infomask,
5094 new_infomask2,
5095 old_infomask,
5096 old_infomask2;
5097 TransactionId xmax,
5098 new_xmax;
5099 TransactionId priorXmax = InvalidTransactionId;
5100 bool cleared_all_frozen = false;
5101 bool pinned_desired_page;
5102 Buffer vmbuffer = InvalidBuffer;
5103 BlockNumber block;
5104
5105 ItemPointerCopy(tid, &tupid);
5106
5107 for (;;)
5108 {
5109 new_infomask = 0;
5110 new_xmax = InvalidTransactionId;
5111 block = ItemPointerGetBlockNumber(&tupid);
5112 ItemPointerCopy(&tupid, &(mytup.t_self));
5113
5114 if (!heap_fetch(rel, SnapshotAny, &mytup, &buf))
5115 {
5116 /*
5117 * if we fail to find the updated version of the tuple, it's
5118 * because it was vacuumed/pruned away after its creator
5119 * transaction aborted. So behave as if we got to the end of the
5120 * chain, and there's no further tuple to lock: return success to
5121 * caller.
5122 */
5123 result = TM_Ok;
5124 goto out_unlocked;
5125 }
5126
5127l4:
5128 CHECK_FOR_INTERRUPTS();
5129
5130 /*
5131 * Before locking the buffer, pin the visibility map page if it
5132 * appears to be necessary. Since we haven't got the lock yet,
5133 * someone else might be in the middle of changing this, so we'll need
5134 * to recheck after we have the lock.
5135 */
5136 if (PageIsAllVisible(BufferGetPage(buf)))
5137 {
5138 visibilitymap_pin(rel, block, &vmbuffer);
5139 pinned_desired_page = true;
5140 }
5141 else
5142 pinned_desired_page = false;
5143
5144 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
5145
5146 /*
5147 * If we didn't pin the visibility map page and the page has become
5148 * all visible while we were busy locking the buffer, we'll have to
5149 * unlock and re-lock, to avoid holding the buffer lock across I/O.
5150 * That's a bit unfortunate, but hopefully shouldn't happen often.
5151 *
5152 * Note: in some paths through this function, we will reach here
5153 * holding a pin on a vm page that may or may not be the one matching
5154 * this page. If this page isn't all-visible, we won't use the vm
5155 * page, but we hold onto such a pin till the end of the function.
5156 */
5157 if (!pinned_desired_page && PageIsAllVisible(BufferGetPage(buf)))
5158 {
5159 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
5160 visibilitymap_pin(rel, block, &vmbuffer);
5161 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
5162 }
5163
5164 /*
5165 * Check the tuple XMIN against prior XMAX, if any. If we reached the
5166 * end of the chain, we're done, so return success.
5167 */
5168 if (TransactionIdIsValid(priorXmax) &&
5169 !TransactionIdEquals(HeapTupleHeaderGetXmin(mytup.t_data),
5170 priorXmax))
5171 {
5172 result = TM_Ok;
5173 goto out_locked;
5174 }
5175
5176 /*
5177 * Also check Xmin: if this tuple was created by an aborted
5178 * (sub)transaction, then we already locked the last live one in the
5179 * chain, thus we're done, so return success.
5180 */
5181 if (TransactionIdDidAbort(HeapTupleHeaderGetXmin(mytup.t_data)))
5182 {
5183 result = TM_Ok;
5184 goto out_locked;
5185 }
5186
5187 old_infomask = mytup.t_data->t_infomask;
5188 old_infomask2 = mytup.t_data->t_infomask2;
5189 xmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
5190
5191 /*
5192 * If this tuple version has been updated or locked by some concurrent
5193 * transaction(s), what we do depends on whether our lock mode
5194 * conflicts with what those other transactions hold, and also on the
5195 * status of them.
5196 */
5197 if (!(old_infomask & HEAP_XMAX_INVALID))
5198 {
5199 TransactionId rawxmax;
5200 bool needwait;
5201
5202 rawxmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
5203 if (old_infomask & HEAP_XMAX_IS_MULTI)
5204 {
5205 int nmembers;
5206 int i;
5207 MultiXactMember *members;
5208
5209 /*
5210 * We don't need a test for pg_upgrade'd tuples: this is only
5211 * applied to tuples after the first in an update chain. Said
5212 * first tuple in the chain may well be locked-in-9.2-and-
5213 * pg_upgraded, but that one was already locked by our caller,
5214 * not us; and any subsequent ones cannot be because our
5215 * caller must necessarily have obtained a snapshot later than
5216 * the pg_upgrade itself.
5217 */
5218 Assert(!HEAP_LOCKED_UPGRADED(mytup.t_data->t_infomask));
5219
5220 nmembers = GetMultiXactIdMembers(rawxmax, &members, false,
5221 HEAP_XMAX_IS_LOCKED_ONLY(old_infomask));
5222 for (i = 0; i < nmembers; i++)
5223 {
5224 result = test_lockmode_for_conflict(members[i].status,
5225 members[i].xid,
5226 mode,
5227 &mytup,
5228 &needwait);
5229
5230 /*
5231 * If the tuple was already locked by ourselves in a
5232 * previous iteration of this (say heap_lock_tuple was
5233 * forced to restart the locking loop because of a change
5234 * in xmax), then we hold the lock already on this tuple
5235 * version and we don't need to do anything; and this is
5236 * not an error condition either. We just need to skip
5237 * this tuple and continue locking the next version in the
5238 * update chain.
5239 */
5240 if (result == TM_SelfModified)
5241 {
5242 pfree(members);
5243 goto next;
5244 }
5245
5246 if (needwait)
5247 {
5248 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
5249 XactLockTableWait(members[i].xid, rel,
5250 &mytup.t_self,
5251 XLTW_LockUpdated);
5252 pfree(members);
5253 goto l4;
5254 }
5255 if (result != TM_Ok)
5256 {
5257 pfree(members);
5258 goto out_locked;
5259 }
5260 }
5261 if (members)
5262 pfree(members);
5263 }
5264 else
5265 {
5266 MultiXactStatus status;
5267
5268 /*
5269 * For a non-multi Xmax, we first need to compute the
5270 * corresponding MultiXactStatus by using the infomask bits.
5271 */
5272 if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))
5273 {
5274 if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask))
5275 status = MultiXactStatusForKeyShare;
5276 else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask))
5277 status = MultiXactStatusForShare;
5278 else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask))
5279 {
5280 if (old_infomask2 & HEAP_KEYS_UPDATED)
5281 status = MultiXactStatusForUpdate;
5282 else
5283 status = MultiXactStatusForNoKeyUpdate;
5284 }
5285 else
5286 {
5287 /*
5288 * LOCK_ONLY present alone (a pg_upgraded tuple marked
5289 * as share-locked in the old cluster) shouldn't be
5290 * seen in the middle of an update chain.
5291 */
5292 elog(ERROR, "invalid lock status in tuple");
5293 }
5294 }
5295 else
5296 {
5297 /* it's an update, but which kind? */
5298 if (old_infomask2 & HEAP_KEYS_UPDATED)
5299 status = MultiXactStatusUpdate;
5300 else
5301 status = MultiXactStatusNoKeyUpdate;
5302 }
5303
5304 result = test_lockmode_for_conflict(status, rawxmax, mode,
5305 &mytup, &needwait);
5306
5307 /*
5308 * If the tuple was already locked by ourselves in a previous
5309 * iteration of this (say heap_lock_tuple was forced to
5310 * restart the locking loop because of a change in xmax), then
5311 * we hold the lock already on this tuple version and we don't
5312 * need to do anything; and this is not an error condition
5313 * either. We just need to skip this tuple and continue
5314 * locking the next version in the update chain.
5315 */
5316 if (result == TM_SelfModified)
5317 goto next;
5318
5319 if (needwait)
5320 {
5321 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
5322 XactLockTableWait(rawxmax, rel, &mytup.t_self,
5323 XLTW_LockUpdated);
5324 goto l4;
5325 }
5326 if (result != TM_Ok)
5327 {
5328 goto out_locked;
5329 }
5330 }
5331 }
5332
5333 /* compute the new Xmax and infomask values for the tuple ... */
5334 compute_new_xmax_infomask(xmax, old_infomask, mytup.t_data->t_infomask2,
5335 xid, mode, false,
5336 &new_xmax, &new_infomask, &new_infomask2);
5337
5338 if (PageIsAllVisible(BufferGetPage(buf)) &&
5339 visibilitymap_clear(rel, block, vmbuffer,
5340 VISIBILITYMAP_ALL_FROZEN))
5341 cleared_all_frozen = true;
5342
5343 START_CRIT_SECTION();
5344
5345 /* ... and set them */
5346 HeapTupleHeaderSetXmax(mytup.t_data, new_xmax);
5347 mytup.t_data->t_infomask &= ~HEAP_XMAX_BITS;
5348 mytup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
5349 mytup.t_data->t_infomask |= new_infomask;
5350 mytup.t_data->t_infomask2 |= new_infomask2;
5351
5352 MarkBufferDirty(buf);
5353
5354 /* XLOG stuff */
5355 if (RelationNeedsWAL(rel))
5356 {
5357 xl_heap_lock_updated xlrec;
5358 XLogRecPtr recptr;
5359 Page page = BufferGetPage(buf);
5360
5361 XLogBeginInsert();
5362 XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
5363
5364 xlrec.offnum = ItemPointerGetOffsetNumber(&mytup.t_self);
5365 xlrec.xmax = new_xmax;
5366 xlrec.infobits_set = compute_infobits(new_infomask, new_infomask2);
5367 xlrec.flags =
5368 cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
5369
5370 XLogRegisterData((char *) &xlrec, SizeOfHeapLockUpdated);
5371
5372 recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_LOCK_UPDATED);
5373
5374 PageSetLSN(page, recptr);
5375 }
5376
5377 END_CRIT_SECTION();
5378
5379next:
5380		/* if we find the end of the update chain, we're done. */
5381 if (mytup.t_data->t_infomask & HEAP_XMAX_INVALID ||
5382 HeapTupleHeaderIndicatesMovedPartitions(mytup.t_data) ||
5383 ItemPointerEquals(&mytup.t_self, &mytup.t_data->t_ctid) ||
5384 HeapTupleHeaderIsOnlyLocked(mytup.t_data))
5385 {
5386 result = TM_Ok;
5387 goto out_locked;
5388 }
5389
5390		/* "tail recursion": loop back to lock the next version in the chain */
5391 priorXmax = HeapTupleHeaderGetUpdateXid(mytup.t_data);
5392 ItemPointerCopy(&(mytup.t_data->t_ctid), &tupid);
5393 UnlockReleaseBuffer(buf);
5394 }
5395
5396 result = TM_Ok;
5397
5398out_locked:
5399 UnlockReleaseBuffer(buf);
5400
5401out_unlocked:
5402 if (vmbuffer != InvalidBuffer)
5403 ReleaseBuffer(vmbuffer);
5404
5405 return result;
5406}
5407
5408/*
5409 * heap_lock_updated_tuple
5410 * Follow update chain when locking an updated tuple, acquiring locks (row
5411 * marks) on the updated versions.
5412 *
5413 * The initial tuple is assumed to be already locked.
5414 *
5415 * This function doesn't check visibility; it just unconditionally marks the
5416 * tuple(s) as locked. If any tuple in the updated chain is being deleted
5417 * concurrently (or updated with the key being modified), sleep until the
5418 * transaction doing it is finished.
5419 *
5420 * Note that we don't acquire heavyweight tuple locks on the tuples we walk
5421 * when we have to wait for other transactions to release them, as opposed to
5422 * what heap_lock_tuple does. The reason is that having more than one
5423 * transaction walking the chain is probably uncommon enough that the risk of
5424 * starvation is not likely: one of the preconditions for being here is that
5425 * the snapshot in use predates the update that created this tuple (because we
5426 * started at an earlier version of the tuple), but at the same time such a
5427 * transaction cannot be using repeatable read or serializable isolation
5428 * levels, because that would lead to a serializability failure.
5429 */
5430static TM_Result
5431heap_lock_updated_tuple(Relation rel, HeapTuple tuple, ItemPointer ctid,
5432 TransactionId xid, LockTupleMode mode)
5433{
5434 /*
5435 * If the tuple has not been updated, or has moved into another partition
5436	 * (effectively a delete), stop here.
5437 */
5438 if (!HeapTupleHeaderIndicatesMovedPartitions(tuple->t_data) &&
5439 !ItemPointerEquals(&tuple->t_self, ctid))
5440 {
5441 /*
5442 * If this is the first possibly-multixact-able operation in the
5443 * current transaction, set my per-backend OldestMemberMXactId
5444 * setting. We can be certain that the transaction will never become a
5445 * member of any older MultiXactIds than that. (We have to do this
5446 * even if we end up just using our own TransactionId below, since
5447 * some other backend could incorporate our XID into a MultiXact
5448 * immediately afterwards.)
5449 */
5450 MultiXactIdSetOldestMember();
5451
5452 return heap_lock_updated_tuple_rec(rel, ctid, xid, mode);
5453 }
5454
5455 /* nothing to lock */
5456 return TM_Ok;
5457}
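
/*
 * Illustrative sketch (not compiled): the typical call site in
 * heap_lock_tuple, shown in simplified form.  The chain is only followed when
 * the just-locked tuple was updated, i.e. its t_ctid points elsewhere; the
 * surrounding retry logic and the t_ctid copy are omitted here.
 *
 *		if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) &&
 *			!ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid))
 *		{
 *			res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
 *										  GetCurrentTransactionId(), mode);
 *			if (res != TM_Ok)
 *				(propagate the failure, as for the original tuple)
 *		}
 */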
5458
5459/*
5460 * heap_finish_speculative - mark speculative insertion as successful
5461 *
5462 * To successfully finish a speculative insertion we have to clear the
5463 * speculative token from the tuple. To do so, the t_ctid field, which
5464 * contains a speculative token value, is modified in place to point to the
5465 * tuple itself, which is characteristic of a newly inserted ordinary tuple.
5466 *
5467 * NB: It is not ok to commit without either finishing or aborting a
5468 * speculative insertion. We could treat speculative tuples of committed
5469 * transactions implicitly as completed, but then we would have to be prepared
5470 * to deal with speculative tokens on committed tuples. That wouldn't be
5471 * difficult - no-one looks at the ctid field of a tuple with invalid xmax -
5472 * but clearing the token at completion isn't very expensive either.
5473 * An explicit confirmation WAL record also makes logical decoding simpler.
5474 */
5475void
5476heap_finish_speculative(Relation relation, ItemPointer tid)
5477{
5478 Buffer buffer;
5479 Page page;
5480 OffsetNumber offnum;
5481 ItemId lp = NULL;
5482 HeapTupleHeader htup;
5483
5484 buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
5485 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
5486 page = (Page) BufferGetPage(buffer);
5487
5488 offnum = ItemPointerGetOffsetNumber(tid);
5489 if (PageGetMaxOffsetNumber(page) >= offnum)
5490 lp = PageGetItemId(page, offnum);
5491
5492 if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
5493 elog(ERROR, "invalid lp");
5494
5495 htup = (HeapTupleHeader) PageGetItem(page, lp);
5496
5497 /* SpecTokenOffsetNumber should be distinguishable from any real offset */
5498 StaticAssertStmt(MaxOffsetNumber < SpecTokenOffsetNumber,
5499 "invalid speculative token constant");
5500
5501 /* NO EREPORT(ERROR) from here till changes are logged */
5502 START_CRIT_SECTION();
5503
5504 Assert(HeapTupleHeaderIsSpeculative(htup));
5505
5506 MarkBufferDirty(buffer);
5507
5508 /*
5509 * Replace the speculative insertion token with a real t_ctid, pointing to
5510	 * itself, as it does on regular tuples.
5511 */
5512 htup->t_ctid = *tid;
5513
5514 /* XLOG stuff */
5515 if (RelationNeedsWAL(relation))
5516 {
5517 xl_heap_confirm xlrec;
5518 XLogRecPtr recptr;
5519
5520 xlrec.offnum = ItemPointerGetOffsetNumber(tid);
5521
5522 XLogBeginInsert();
5523
5524 /* We want the same filtering on this as on a plain insert */
5525 XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
5526
5527 XLogRegisterData((char *) &xlrec, SizeOfHeapConfirm);
5528 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
5529
5530 recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_CONFIRM);
5531
5532 PageSetLSN(page, recptr);
5533 }
5534
5535 END_CRIT_SECTION();
5536
5537 UnlockReleaseBuffer(buffer);
5538}
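
/*
 * Illustrative sketch (not compiled): the overall speculative-insertion dance
 * as driven from above this layer, in simplified form.  The conflict check in
 * the middle is pseudo-code; the lock and token routines named here are the
 * real ones from lmgr.c and htup_details.h.
 *
 *		specToken = SpeculativeInsertionLockAcquire(GetCurrentTransactionId());
 *		HeapTupleHeaderSetSpeculativeToken(tup->t_data, specToken);
 *		heap_insert(rel, tup, cid, HEAP_INSERT_SPECULATIVE, bistate);
 *
 *		if (... no conflicting tuple appeared meanwhile ...)
 *			heap_finish_speculative(rel, &tup->t_self);
 *		else
 *			heap_abort_speculative(rel, &tup->t_self);
 *		SpeculativeInsertionLockRelease(GetCurrentTransactionId());
 */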
5539
5540/*
5541 * heap_abort_speculative - kill a speculatively inserted tuple
5542 *
5543 * Marks a tuple that was speculatively inserted in the same command as dead,
5544 * by setting its xmin as invalid. That makes it immediately appear as dead
5545 * to all transactions, including our own. In particular, it makes
5546 * HeapTupleSatisfiesDirty() regard the tuple as dead, so that another backend
5547 * inserting a duplicate key value won't unnecessarily wait for our whole
5548 * transaction to finish (it'll just wait for our speculative insertion to
5549 * finish).
5550 *
5551 * Killing the tuple prevents "unprincipled deadlocks", which are deadlocks
5552 * that arise due to a mutual dependency that is not user visible. By
5553 * definition, unprincipled deadlocks cannot be prevented by the user
5554 * reordering lock acquisition in client code, because the implementation-level
5555 * lock acquisitions are not under the user's direct control. If speculative
5556 * inserters did not take this precaution, then under high concurrency they
5557 * could deadlock with each other, which would not be acceptable.
5558 *
5559 * This is somewhat redundant with heap_delete, but we prefer to have a
5560 * dedicated routine with stripped-down requirements. Note that this is also
5561 * used to delete the TOAST tuples created during speculative insertion.
5562 *
5563 * This routine does not affect logical decoding as it only looks at
5564 * confirmation records.
5565 */
5566void
5567heap_abort_speculative(Relation relation, ItemPointer tid)
5568{
5569 TransactionId xid = GetCurrentTransactionId();
5570 ItemId lp;
5571 HeapTupleData tp;
5572 Page page;
5573 BlockNumber block;
5574 Buffer buffer;
5575
5576 Assert(ItemPointerIsValid(tid));
5577
5578 block = ItemPointerGetBlockNumber(tid);
5579 buffer = ReadBuffer(relation, block);
5580 page = BufferGetPage(buffer);
5581
5582 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
5583
5584 /*
5585 * Page can't be all visible, we just inserted into it, and are still
5586 * running.
5587 */
5588 Assert(!PageIsAllVisible(page));
5589
5590 lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
5591 Assert(ItemIdIsNormal(lp));
5592
5593 tp.t_tableOid = RelationGetRelid(relation);
5594 tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
5595 tp.t_len = ItemIdGetLength(lp);
5596 tp.t_self = *tid;
5597
5598 /*
5599 * Sanity check that the tuple really is a speculatively inserted tuple,
5600 * inserted by us.
5601 */
5602 if (tp.t_data->t_choice.t_heap.t_xmin != xid)
5603 elog(ERROR, "attempted to kill a tuple inserted by another transaction");
5604 if (!(IsToastRelation(relation) || HeapTupleHeaderIsSpeculative(tp.t_data)))
5605 elog(ERROR, "attempted to kill a non-speculative tuple");
5606 Assert(!HeapTupleHeaderIsHeapOnly(tp.t_data));
5607
5608 /*
5609 * No need to check for serializable conflicts here. There is never a
5610 * need for a combocid, either. No need to extract replica identity, or
5611 * do anything special with infomask bits.
5612 */
5613
5614 START_CRIT_SECTION();
5615
5616 /*
5617	 * The tuple will become DEAD immediately. Flag that this page is
5618	 * immediately a candidate for pruning by setting xmin to
5619 * RecentGlobalXmin. That's not pretty, but it doesn't seem worth
5620 * inventing a nicer API for this.
5621 */
5622 Assert(TransactionIdIsValid(RecentGlobalXmin));
5623 PageSetPrunable(page, RecentGlobalXmin);
5624
5625 /* store transaction information of xact deleting the tuple */
5626 tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
5627 tp.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
5628
5629 /*
5630 * Set the tuple header xmin to InvalidTransactionId. This makes the
5631	 * tuple immediately invisible to everyone. (In particular, to any
5632 * transactions waiting on the speculative token, woken up later.)
5633 */
5634 HeapTupleHeaderSetXmin(tp.t_data, InvalidTransactionId);
5635
5636 /* Clear the speculative insertion token too */
5637 tp.t_data->t_ctid = tp.t_self;
5638
5639 MarkBufferDirty(buffer);
5640
5641 /*
5642 * XLOG stuff
5643 *
5644 * The WAL records generated here match heap_delete(). The same recovery
5645 * routines are used.
5646 */
5647 if (RelationNeedsWAL(relation))
5648 {
5649 xl_heap_delete xlrec;
5650 XLogRecPtr recptr;
5651
5652 xlrec.flags = XLH_DELETE_IS_SUPER;
5653 xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask,
5654 tp.t_data->t_infomask2);
5655 xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self);
5656 xlrec.xmax = xid;
5657
5658 XLogBeginInsert();
5659 XLogRegisterData((char *) &xlrec, SizeOfHeapDelete);
5660 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
5661
5662 /* No replica identity & replication origin logged */
5663
5664 recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE);
5665
5666 PageSetLSN(page, recptr);
5667 }
5668
5669 END_CRIT_SECTION();
5670
5671 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
5672
5673 if (HeapTupleHasExternal(&tp))
5674 {
5675 Assert(!IsToastRelation(relation));
5676 toast_delete(relation, &tp, true);
5677 }
5678
5679 /*
5680 * Never need to mark tuple for invalidation, since catalogs don't support
5681 * speculative insertion
5682 */
5683
5684 /* Now we can release the buffer */
5685 ReleaseBuffer(buffer);
5686
5687 /* count deletion, as we counted the insertion too */
5688 pgstat_count_heap_delete(relation);
5689}
5690
5691/*
5692 * heap_inplace_update - update a tuple "in place" (ie, overwrite it)
5693 *
5694 * Overwriting violates both MVCC and transactional safety, so the uses
5695 * of this function in Postgres are extremely limited. Nonetheless we
5696 * find some places to use it.
5697 *
5698 * The tuple cannot change size, and therefore it's reasonable to assume
5699 * that its null bitmap (if any) doesn't change either. So we just
5700 * overwrite the data portion of the tuple without touching the null
5701 * bitmap or any of the header fields.
5702 *
5703 * tuple is an in-memory tuple structure containing the data to be written
5704 * over the target tuple. Also, tuple->t_self identifies the target tuple.
5705 */
5706void
5707heap_inplace_update(Relation relation, HeapTuple tuple)
5708{
5709 Buffer buffer;
5710 Page page;
5711 OffsetNumber offnum;
5712 ItemId lp = NULL;
5713 HeapTupleHeader htup;
5714 uint32 oldlen;
5715 uint32 newlen;
5716
5717 /*
5718 * For now, parallel operations are required to be strictly read-only.
5719 * Unlike a regular update, this should never create a combo CID, so it
5720 * might be possible to relax this restriction, but not without more
5721 * thought and testing. It's not clear that it would be useful, anyway.
5722 */
5723 if (IsInParallelMode())
5724 ereport(ERROR,
5725 (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
5726 errmsg("cannot update tuples during a parallel operation")));
5727
5728 buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&(tuple->t_self)));
5729 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
5730 page = (Page) BufferGetPage(buffer);
5731
5732 offnum = ItemPointerGetOffsetNumber(&(tuple->t_self));
5733 if (PageGetMaxOffsetNumber(page) >= offnum)
5734 lp = PageGetItemId(page, offnum);
5735
5736 if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
5737 elog(ERROR, "invalid lp");
5738
5739 htup = (HeapTupleHeader) PageGetItem(page, lp);
5740
5741 oldlen = ItemIdGetLength(lp) - htup->t_hoff;
5742 newlen = tuple->t_len - tuple->t_data->t_hoff;
5743 if (oldlen != newlen || htup->t_hoff != tuple->t_data->t_hoff)
5744 elog(ERROR, "wrong tuple length");
5745
5746 /* NO EREPORT(ERROR) from here till changes are logged */
5747 START_CRIT_SECTION();
5748
5749 memcpy((char *) htup + htup->t_hoff,
5750 (char *) tuple->t_data + tuple->t_data->t_hoff,
5751 newlen);
5752
5753 MarkBufferDirty(buffer);
5754
5755 /* XLOG stuff */
5756 if (RelationNeedsWAL(relation))
5757 {
5758 xl_heap_inplace xlrec;
5759 XLogRecPtr recptr;
5760
5761 xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
5762
5763 XLogBeginInsert();
5764 XLogRegisterData((char *) &xlrec, SizeOfHeapInplace);
5765
5766 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
5767 XLogRegisterBufData(0, (char *) htup + htup->t_hoff, newlen);
5768
5769 /* inplace updates aren't decoded atm, don't log the origin */
5770
5771 recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_INPLACE);
5772
5773 PageSetLSN(page, recptr);
5774 }
5775
5776 END_CRIT_SECTION();
5777
5778 UnlockReleaseBuffer(buffer);
5779
5780 /*
5781 * Send out shared cache inval if necessary. Note that because we only
5782 * pass the new version of the tuple, this mustn't be used for any
5783 * operations that could change catcache lookup keys. But we aren't
5784 * bothering with index updates either, so that's true a fortiori.
5785 */
5786 if (!IsBootstrapProcessingMode())
5787 CacheInvalidateHeapTuple(relation, tuple, NULL);
5788}
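
/*
 * Illustrative sketch (not compiled): the typical usage pattern, modeled on
 * how VACUUM updates pg_class statistics -- fetch a copy of the catalog
 * tuple, modify fixed-width fields, and write it back.  The particular fields
 * and variables shown are just examples.
 *
 *		HeapTuple	ctup;
 *		Form_pg_class pgcform;
 *
 *		ctup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(relid));
 *		pgcform = (Form_pg_class) GETSTRUCT(ctup);
 *		pgcform->relpages = (int32) num_pages;
 *		pgcform->reltuples = (float4) num_tuples;
 *		heap_inplace_update(relation, ctup);	(tuple size must not change)
 */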
5789
5790#define FRM_NOOP 0x0001
5791#define FRM_INVALIDATE_XMAX 0x0002
5792#define FRM_RETURN_IS_XID 0x0004
5793#define FRM_RETURN_IS_MULTI 0x0008
5794#define FRM_MARK_COMMITTED 0x0010
5795
5796/*
5797 * FreezeMultiXactId
5798 * Determine what to do during freezing when a tuple is marked by a
5799 * MultiXactId.
5800 *
5801 * NB -- this might have the side-effect of creating a new MultiXactId!
5802 *
5803 * "flags" is an output value; it's used to tell caller what to do on return.
5804 * Possible flags are:
5805 * FRM_NOOP
5806 * don't do anything -- keep existing Xmax
5807 * FRM_INVALIDATE_XMAX
5808 * mark Xmax as InvalidTransactionId and set XMAX_INVALID flag.
5809 * FRM_RETURN_IS_XID
5810 * The Xid return value is a single update Xid to set as xmax.
5811 * FRM_MARK_COMMITTED
5812 * Xmax can be marked as HEAP_XMAX_COMMITTED
5813 * FRM_RETURN_IS_MULTI
5814 * The return value is a new MultiXactId to set as new Xmax.
5815 * (caller must obtain proper infomask bits using GetMultiXactIdHintBits)
5816 */
5817static TransactionId
5818FreezeMultiXactId(MultiXactId multi, uint16 t_infomask,
5819 TransactionId relfrozenxid, TransactionId relminmxid,
5820 TransactionId cutoff_xid, MultiXactId cutoff_multi,
5821 uint16 *flags)
5822{
5823 TransactionId xid = InvalidTransactionId;
5824 int i;
5825 MultiXactMember *members;
5826 int nmembers;
5827 bool need_replace;
5828 int nnewmembers;
5829 MultiXactMember *newmembers;
5830 bool has_lockers;
5831 TransactionId update_xid;
5832 bool update_committed;
5833
5834 *flags = 0;
5835
5836	/* We should only be called when Xmax is a MultiXactId */
5837 Assert(t_infomask & HEAP_XMAX_IS_MULTI);
5838
5839 if (!MultiXactIdIsValid(multi) ||
5840 HEAP_LOCKED_UPGRADED(t_infomask))
5841 {
5842 /* Ensure infomask bits are appropriately set/reset */
5843 *flags |= FRM_INVALIDATE_XMAX;
5844 return InvalidTransactionId;
5845 }
5846 else if (MultiXactIdPrecedes(multi, relminmxid))
5847 ereport(ERROR,
5848 (errcode(ERRCODE_DATA_CORRUPTED),
5849 errmsg_internal("found multixact %u from before relminmxid %u",
5850 multi, relminmxid)));
5851 else if (MultiXactIdPrecedes(multi, cutoff_multi))
5852 {
5853 /*
5854 * This old multi cannot possibly have members still running, but
5855 * verify just in case. If it was a locker only, it can be removed
5856 * without any further consideration; but if it contained an update,
5857 * we might need to preserve it.
5858 */
5859 if (MultiXactIdIsRunning(multi,
5860 HEAP_XMAX_IS_LOCKED_ONLY(t_infomask)))
5861 ereport(ERROR,
5862 (errcode(ERRCODE_DATA_CORRUPTED),
5863 errmsg_internal("multixact %u from before cutoff %u found to be still running",
5864 multi, cutoff_multi)));
5865
5866 if (HEAP_XMAX_IS_LOCKED_ONLY(t_infomask))
5867 {
5868 *flags |= FRM_INVALIDATE_XMAX;
5869 xid = InvalidTransactionId; /* not strictly necessary */
5870 }
5871 else
5872 {
5873 /* replace multi by update xid */
5874 xid = MultiXactIdGetUpdateXid(multi, t_infomask);
5875
5876 /* wasn't only a lock, xid needs to be valid */
5877 Assert(TransactionIdIsValid(xid));
5878
5879 if (TransactionIdPrecedes(xid, relfrozenxid))
5880 ereport(ERROR,
5881 (errcode(ERRCODE_DATA_CORRUPTED),
5882 errmsg_internal("found update xid %u from before relfrozenxid %u",
5883 xid, relfrozenxid)));
5884
5885 /*
5886 * If the xid is older than the cutoff, it has to have aborted,
5887 * otherwise the tuple would have gotten pruned away.
5888 */
5889 if (TransactionIdPrecedes(xid, cutoff_xid))
5890 {
5891 if (TransactionIdDidCommit(xid))
5892 ereport(ERROR,
5893 (errcode(ERRCODE_DATA_CORRUPTED),
5894 errmsg_internal("cannot freeze committed update xid %u", xid)));
5895 *flags |= FRM_INVALIDATE_XMAX;
5896 xid = InvalidTransactionId; /* not strictly necessary */
5897 }
5898 else
5899 {
5900 *flags |= FRM_RETURN_IS_XID;
5901 }
5902 }
5903
5904 return xid;
5905 }
5906
5907 /*
5908 * This multixact might have or might not have members still running, but
5909 * we know it's valid and is newer than the cutoff point for multis.
5910 * However, some member(s) of it may be below the cutoff for Xids, so we
5911 * need to walk the whole members array to figure out what to do, if
5912 * anything.
5913 */
5914
5915 nmembers =
5916 GetMultiXactIdMembers(multi, &members, false,
5917 HEAP_XMAX_IS_LOCKED_ONLY(t_infomask));
5918 if (nmembers <= 0)
5919 {
5920 /* Nothing worth keeping */
5921 *flags |= FRM_INVALIDATE_XMAX;
5922 return InvalidTransactionId;
5923 }
5924
5925 /* is there anything older than the cutoff? */
5926 need_replace = false;
5927 for (i = 0; i < nmembers; i++)
5928 {
5929 if (TransactionIdPrecedes(members[i].xid, cutoff_xid))
5930 {
5931 need_replace = true;
5932 break;
5933 }
5934 }
5935
5936 /*
5937 * In the simplest case, there is no member older than the cutoff; we can
5938 * keep the existing MultiXactId as is.
5939 */
5940 if (!need_replace)
5941 {
5942 *flags |= FRM_NOOP;
5943 pfree(members);
5944 return InvalidTransactionId;
5945 }
5946
5947 /*
5948	 * If the multi needs to be updated, figure out which members we need to
5949	 * keep.
5950 */
5951 nnewmembers = 0;
5952 newmembers = palloc(sizeof(MultiXactMember) * nmembers);
5953 has_lockers = false;
5954 update_xid = InvalidTransactionId;
5955 update_committed = false;
5956
5957 for (i = 0; i < nmembers; i++)
5958 {
5959 /*
5960 * Determine whether to keep this member or ignore it.
5961 */
5962 if (ISUPDATE_from_mxstatus(members[i].status))
5963 {
5964 TransactionId xid = members[i].xid;
5965
5966 Assert(TransactionIdIsValid(xid));
5967 if (TransactionIdPrecedes(xid, relfrozenxid))
5968 ereport(ERROR,
5969 (errcode(ERRCODE_DATA_CORRUPTED),
5970 errmsg_internal("found update xid %u from before relfrozenxid %u",
5971 xid, relfrozenxid)));
5972
5973 /*
5974 * It's an update; should we keep it? If the transaction is known
5975 * aborted or crashed then it's okay to ignore it, otherwise not.
5976 * Note that an updater older than cutoff_xid cannot possibly be
5977 * committed, because HeapTupleSatisfiesVacuum would have returned
5978 * HEAPTUPLE_DEAD and we would not be trying to freeze the tuple.
5979 *
5980 * As with all tuple visibility routines, it's critical to test
5981 * TransactionIdIsInProgress before TransactionIdDidCommit,
5982 * because of race conditions explained in detail in
5983 * heapam_visibility.c.
5984 */
5985 if (TransactionIdIsCurrentTransactionId(xid) ||
5986 TransactionIdIsInProgress(xid))
5987 {
5988 Assert(!TransactionIdIsValid(update_xid));
5989 update_xid = xid;
5990 }
5991 else if (TransactionIdDidCommit(xid))
5992 {
5993 /*
5994 * The transaction committed, so we can tell caller to set
5995 * HEAP_XMAX_COMMITTED. (We can only do this because we know
5996 * the transaction is not running.)
5997 */
5998 Assert(!TransactionIdIsValid(update_xid));
5999 update_committed = true;
6000 update_xid = xid;
6001 }
6002 else
6003 {
6004 /*
6005 * Not in progress, not committed -- must be aborted or
6006 * crashed; we can ignore it.
6007 */
6008 }
6009
6010 /*
6011 * Since the tuple wasn't marked HEAPTUPLE_DEAD by vacuum, the
6012 * update Xid cannot possibly be older than the xid cutoff. The
6013 * presence of such a tuple would cause corruption, so be paranoid
6014 * and check.
6015 */
6016 if (TransactionIdIsValid(update_xid) &&
6017 TransactionIdPrecedes(update_xid, cutoff_xid))
6018 ereport(ERROR,
6019 (errcode(ERRCODE_DATA_CORRUPTED),
6020 errmsg_internal("found update xid %u from before xid cutoff %u",
6021 update_xid, cutoff_xid)));
6022
6023 /*
6024 * If we determined that it's an Xid corresponding to an update
6025 * that must be retained, additionally add it to the list of
6026 * members of the new Multi, in case we end up using that. (We
6027 * might still decide to use only an update Xid and not a multi,
6028 * but it's easier to maintain the list as we walk the old members
6029 * list.)
6030 */
6031 if (TransactionIdIsValid(update_xid))
6032 newmembers[nnewmembers++] = members[i];
6033 }
6034 else
6035 {
6036 /* We only keep lockers if they are still running */
6037 if (TransactionIdIsCurrentTransactionId(members[i].xid) ||
6038 TransactionIdIsInProgress(members[i].xid))
6039 {
6040 /* running locker cannot possibly be older than the cutoff */
6041 Assert(!TransactionIdPrecedes(members[i].xid, cutoff_xid));
6042 newmembers[nnewmembers++] = members[i];
6043 has_lockers = true;
6044 }
6045 }
6046 }
6047
6048 pfree(members);
6049
6050 if (nnewmembers == 0)
6051 {
6052 /* nothing worth keeping!? Tell caller to remove the whole thing */
6053 *flags |= FRM_INVALIDATE_XMAX;
6054 xid = InvalidTransactionId;
6055 }
6056 else if (TransactionIdIsValid(update_xid) && !has_lockers)
6057 {
6058 /*
6059 * If there's a single member and it's an update, pass it back alone
6060 * without creating a new Multi. (XXX we could do this when there's a
6061 * single remaining locker, too, but that would complicate the API too
6062 * much; moreover, the case with the single updater is more
6063 * interesting, because those are longer-lived.)
6064 */
6065 Assert(nnewmembers == 1);
6066 *flags |= FRM_RETURN_IS_XID;
6067 if (update_committed)
6068 *flags |= FRM_MARK_COMMITTED;
6069 xid = update_xid;
6070 }
6071 else
6072 {
6073 /*
6074 * Create a new multixact with the surviving members of the previous
6075 * one, to set as new Xmax in the tuple.
6076 */
6077 xid = MultiXactIdCreateFromMembers(nnewmembers, newmembers);
6078 *flags |= FRM_RETURN_IS_MULTI;
6079 }
6080
6081 pfree(newmembers);
6082
6083 return xid;
6084}
6085
6086/*
6087 * heap_prepare_freeze_tuple
6088 *
6089 * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
6090 * are older than the specified cutoff XID and cutoff MultiXactId. If so,
6091 * setup enough state (in the *frz output argument) to later execute and
6092 * WAL-log what we would need to do, and return true. Return false if nothing
6093 * is to be changed. In addition, set *totally_frozen_p to true if the tuple
6094 * will be totally frozen after these operations are performed and false if
6095 * more freezing will eventually be required.
6096 *
6097 * Caller is responsible for setting the offset field, if appropriate.
6098 *
6099 * It is assumed that the caller has checked the tuple with
6100 * HeapTupleSatisfiesVacuum() and determined that it is not HEAPTUPLE_DEAD
6101 * (else we should be removing the tuple, not freezing it).
6102 *
6103 * NB: cutoff_xid *must* be <= the current global xmin, to ensure that any
6104 * XID older than it could neither be running nor seen as running by any
6105 * open transaction. This ensures that the replacement will not change
6106 * anyone's idea of the tuple state.
6107 * Similarly, cutoff_multi must be less than or equal to the smallest
6108 * MultiXactId used by any transaction currently open.
6109 *
6110 * If the tuple is in a shared buffer, caller must hold an exclusive lock on
6111 * that buffer.
6112 *
6113 * NB: It is not enough to set hint bits to indicate something is
6114 * committed/invalid -- they might not be set on a standby, or after crash
6115 * recovery. We really need to remove old xids.
6116 */
6117bool
6118heap_prepare_freeze_tuple(HeapTupleHeader tuple,
6119 TransactionId relfrozenxid, TransactionId relminmxid,
6120 TransactionId cutoff_xid, TransactionId cutoff_multi,
6121 xl_heap_freeze_tuple *frz, bool *totally_frozen_p)
6122{
6123 bool changed = false;
6124 bool xmax_already_frozen = false;
6125 bool xmin_frozen;
6126 bool freeze_xmax;
6127 TransactionId xid;
6128
6129 frz->frzflags = 0;
6130 frz->t_infomask2 = tuple->t_infomask2;
6131 frz->t_infomask = tuple->t_infomask;
6132 frz->xmax = HeapTupleHeaderGetRawXmax(tuple);
6133
6134 /*
6135 * Process xmin. xmin_frozen has two slightly different meanings: in the
6136 * !XidIsNormal case, it means "the xmin doesn't need any freezing" (it's
6137 * already a permanent value), while in the block below it is set true to
6138 * mean "xmin won't need freezing after what we do to it here" (false
6139 * otherwise). In both cases we're allowed to set totally_frozen, as far
6140 * as xmin is concerned.
6141 */
6142 xid = HeapTupleHeaderGetXmin(tuple);
6143 if (!TransactionIdIsNormal(xid))
6144 xmin_frozen = true;
6145 else
6146 {
6147 if (TransactionIdPrecedes(xid, relfrozenxid))
6148 ereport(ERROR,
6149 (errcode(ERRCODE_DATA_CORRUPTED),
6150 errmsg_internal("found xmin %u from before relfrozenxid %u",
6151 xid, relfrozenxid)));
6152
6153 xmin_frozen = TransactionIdPrecedes(xid, cutoff_xid);
6154 if (xmin_frozen)
6155 {
6156 if (!TransactionIdDidCommit(xid))
6157 ereport(ERROR,
6158 (errcode(ERRCODE_DATA_CORRUPTED),
6159 errmsg_internal("uncommitted xmin %u from before xid cutoff %u needs to be frozen",
6160 xid, cutoff_xid)));
6161
6162 frz->t_infomask |= HEAP_XMIN_FROZEN;
6163 changed = true;
6164 }
6165 }
6166
6167 /*
6168 * Process xmax. To thoroughly examine the current Xmax value we need to
6169 * resolve a MultiXactId to its member Xids, in case some of them are
6170 * below the given cutoff for Xids. In that case, those values might need
6171 * freezing, too. Also, if a multi needs freezing, we cannot simply take
6172 * it out --- if there's a live updater Xid, it needs to be kept.
6173 *
6174 * Make sure to keep heap_tuple_needs_freeze in sync with this.
6175 */
6176 xid = HeapTupleHeaderGetRawXmax(tuple);
6177
6178 if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
6179 {
6180 TransactionId newxmax;
6181 uint16 flags;
6182
6183 newxmax = FreezeMultiXactId(xid, tuple->t_infomask,
6184 relfrozenxid, relminmxid,
6185 cutoff_xid, cutoff_multi, &flags);
6186
6187 freeze_xmax = (flags & FRM_INVALIDATE_XMAX);
6188
6189 if (flags & FRM_RETURN_IS_XID)
6190 {
6191 /*
6192 * NB -- some of these transformations are only valid because we
6193 * know the return Xid is a tuple updater (i.e. not merely a
6194 * locker.) Also note that the only reason we don't explicitly
6195 * worry about HEAP_KEYS_UPDATED is because it lives in
6196 * t_infomask2 rather than t_infomask.
6197 */
6198 frz->t_infomask &= ~HEAP_XMAX_BITS;
6199 frz->xmax = newxmax;
6200 if (flags & FRM_MARK_COMMITTED)
6201 frz->t_infomask |= HEAP_XMAX_COMMITTED;
6202 changed = true;
6203 }
6204 else if (flags & FRM_RETURN_IS_MULTI)
6205 {
6206 uint16 newbits;
6207 uint16 newbits2;
6208
6209 /*
6210 * We can't use GetMultiXactIdHintBits directly on the new multi
6211 * here; that routine initializes the masks to all zeroes, which
6212 * would lose other bits we need. Doing it this way ensures all
6213 * unrelated bits remain untouched.
6214 */
6215 frz->t_infomask &= ~HEAP_XMAX_BITS;
6216 frz->t_infomask2 &= ~HEAP_KEYS_UPDATED;
6217 GetMultiXactIdHintBits(newxmax, &newbits, &newbits2);
6218 frz->t_infomask |= newbits;
6219 frz->t_infomask2 |= newbits2;
6220
6221 frz->xmax = newxmax;
6222
6223 changed = true;
6224 }
6225 }
6226 else if (TransactionIdIsNormal(xid))
6227 {
6228 if (TransactionIdPrecedes(xid, relfrozenxid))
6229 ereport(ERROR,
6230 (errcode(ERRCODE_DATA_CORRUPTED),
6231 errmsg_internal("found xmax %u from before relfrozenxid %u",
6232 xid, relfrozenxid)));
6233
6234 if (TransactionIdPrecedes(xid, cutoff_xid))
6235 {
6236 /*
6237 * If we freeze xmax, make absolutely sure that it's not an XID
6238 * that is important. (Note, a lock-only xmax can be removed
6239 * independent of committedness, since a committed lock holder has
6240 * released the lock).
6241 */
6242 if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask) &&
6243 TransactionIdDidCommit(xid))
6244 ereport(ERROR,
6245 (errcode(ERRCODE_DATA_CORRUPTED),
6246 errmsg_internal("cannot freeze committed xmax %u",
6247 xid)));
6248 freeze_xmax = true;
6249 }
6250 else
6251 freeze_xmax = false;
6252 }
6253 else if ((tuple->t_infomask & HEAP_XMAX_INVALID) ||
6254 !TransactionIdIsValid(HeapTupleHeaderGetRawXmax(tuple)))
6255 {
6256 freeze_xmax = false;
6257 xmax_already_frozen = true;
6258 }
6259 else
6260 ereport(ERROR,
6261 (errcode(ERRCODE_DATA_CORRUPTED),
6262 errmsg_internal("found xmax %u (infomask 0x%04x) not frozen, not multi, not normal",
6263 xid, tuple->t_infomask)));
6264
6265 if (freeze_xmax)
6266 {
6267 Assert(!xmax_already_frozen);
6268
6269 frz->xmax = InvalidTransactionId;
6270
6271 /*
6272 * The tuple might be marked either XMAX_INVALID or XMAX_COMMITTED +
6273 * LOCKED. Normalize to INVALID just to be sure no one gets confused.
6274 * Also get rid of the HEAP_KEYS_UPDATED bit.
6275 */
6276 frz->t_infomask &= ~HEAP_XMAX_BITS;
6277 frz->t_infomask |= HEAP_XMAX_INVALID;
6278 frz->t_infomask2 &= ~HEAP_HOT_UPDATED;
6279 frz->t_infomask2 &= ~HEAP_KEYS_UPDATED;
6280 changed = true;
6281 }
6282
6283 /*
6284 * Old-style VACUUM FULL is gone, but we have to keep this code as long as
6285 * we support having MOVED_OFF/MOVED_IN tuples in the database.
6286 */
6287 if (tuple->t_infomask & HEAP_MOVED)
6288 {
6289 xid = HeapTupleHeaderGetXvac(tuple);
6290
6291 /*
6292 * For Xvac, we ignore the cutoff_xid and just always perform the
6293 * freeze operation. The oldest release in which such a value can
6294 * actually be set is PostgreSQL 8.4, because old-style VACUUM FULL
6295 * was removed in PostgreSQL 9.0. Note that if we were to respect
6296		 * cutoff_xid here, we'd need to make sure to clear totally_frozen
6297 * when we skipped freezing on that basis.
6298 */
6299 if (TransactionIdIsNormal(xid))
6300 {
6301 /*
6302 * If a MOVED_OFF tuple is not dead, the xvac transaction must
6303 * have failed; whereas a non-dead MOVED_IN tuple must mean the
6304 * xvac transaction succeeded.
6305 */
6306 if (tuple->t_infomask & HEAP_MOVED_OFF)
6307 frz->frzflags |= XLH_INVALID_XVAC;
6308 else
6309 frz->frzflags |= XLH_FREEZE_XVAC;
6310
6311 /*
6312 * Might as well fix the hint bits too; usually XMIN_COMMITTED
6313 * will already be set here, but there's a small chance not.
6314 */
6315 Assert(!(tuple->t_infomask & HEAP_XMIN_INVALID));
6316 frz->t_infomask |= HEAP_XMIN_COMMITTED;
6317 changed = true;
6318 }
6319 }
6320
6321 *totally_frozen_p = (xmin_frozen &&
6322 (freeze_xmax || xmax_already_frozen));
6323 return changed;
6324}
6325
6326/*
6327 * heap_execute_freeze_tuple
6328 * Execute the prepared freezing of a tuple.
6329 *
6330 * Caller is responsible for ensuring that no other backend can access the
6331 * storage underlying this tuple, either by holding an exclusive lock on the
6332 * buffer containing it (which is what lazy VACUUM does), or by having it be
6333 * in private storage (which is what CLUSTER and friends do).
6334 *
6335 * Note: it might seem we could make the changes without exclusive lock, since
6336 * TransactionId read/write is assumed atomic anyway. However, there is a race
6337 * condition: someone who just fetched an old XID that we overwrite here could
6338 * conceivably not finish checking the XID against pg_xact before we finish
6339 * the VACUUM and perhaps truncate off the part of pg_xact he needs. Getting
6340 * exclusive lock ensures no other backend is in process of checking the
6341 * tuple status. Also, getting exclusive lock makes it safe to adjust the
6342 * infomask bits.
6343 *
6344 * NB: All code in here must be safe to execute during crash recovery!
6345 */
6346void
6347heap_execute_freeze_tuple(HeapTupleHeader tuple, xl_heap_freeze_tuple *frz)
6348{
6349 HeapTupleHeaderSetXmax(tuple, frz->xmax);
6350
6351 if (frz->frzflags & XLH_FREEZE_XVAC)
6352 HeapTupleHeaderSetXvac(tuple, FrozenTransactionId);
6353
6354 if (frz->frzflags & XLH_INVALID_XVAC)
6355 HeapTupleHeaderSetXvac(tuple, InvalidTransactionId);
6356
6357 tuple->t_infomask = frz->t_infomask;
6358 tuple->t_infomask2 = frz->t_infomask2;
6359}
6360
6361/*
6362 * heap_freeze_tuple
6363 * Freeze tuple in place, without WAL logging.
6364 *
6365 * Useful for callers like CLUSTER that perform their own WAL logging.
6366 */
6367bool
6368heap_freeze_tuple(HeapTupleHeader tuple,
6369 TransactionId relfrozenxid, TransactionId relminmxid,
6370 TransactionId cutoff_xid, TransactionId cutoff_multi)
6371{
6372 xl_heap_freeze_tuple frz;
6373 bool do_freeze;
6374 bool tuple_totally_frozen;
6375
6376 do_freeze = heap_prepare_freeze_tuple(tuple,
6377 relfrozenxid, relminmxid,
6378 cutoff_xid, cutoff_multi,
6379 &frz, &tuple_totally_frozen);
6380
6381 /*
6382 * Note that because this is not a WAL-logged operation, we don't need to
6383 * fill in the offset in the freeze record.
6384 */
6385
6386 if (do_freeze)
6387 heap_execute_freeze_tuple(tuple, &frz);
6388 return do_freeze;
6389}
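
/*
 * Illustrative sketch (not compiled): the WAL-logged two-step variant, along
 * the lines of what lazy VACUUM does.  Tuples on a page are prepared first,
 * then executed and logged inside one critical section; the loop structure
 * and variable names are simplified.
 *
 *		if (heap_prepare_freeze_tuple(tuple.t_data, relfrozenxid, relminmxid,
 *									  cutoff_xid, cutoff_multi,
 *									  &frozen[nfrozen], &totally_frozen))
 *			frozen[nfrozen++].offset = offnum;
 *
 *		(... after all offsets on the page have been examined ...)
 *
 *		START_CRIT_SECTION();
 *		for (i = 0; i < nfrozen; i++)
 *		{
 *			itemid = PageGetItemId(page, frozen[i].offset);
 *			htup = (HeapTupleHeader) PageGetItem(page, itemid);
 *			heap_execute_freeze_tuple(htup, &frozen[i]);
 *		}
 *		MarkBufferDirty(buf);
 *		if (RelationNeedsWAL(rel))
 *			PageSetLSN(page, log_heap_freeze(rel, buf, cutoff_xid,
 *											 frozen, nfrozen));
 *		END_CRIT_SECTION();
 */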
6390
6391/*
6392 * For a given MultiXactId, return the hint bits that should be set in the
6393 * tuple's infomask.
6394 *
6395 * Normally this should be called for a multixact that was just created, and
6396 * so is on our local cache, so the GetMembers call is fast.
6397 */
6398static void
6399GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask,
6400 uint16 *new_infomask2)
6401{
6402 int nmembers;
6403 MultiXactMember *members;
6404 int i;
6405 uint16 bits = HEAP_XMAX_IS_MULTI;
6406 uint16 bits2 = 0;
6407 bool has_update = false;
6408 LockTupleMode strongest = LockTupleKeyShare;
6409
6410 /*
6411	 * We only use this in multis we just created, so they cannot be values
6412	 * from before pg_upgrade.
6413 */
6414 nmembers = GetMultiXactIdMembers(multi, &members, false, false);
6415
6416 for (i = 0; i < nmembers; i++)
6417 {
6418 LockTupleMode mode;
6419
6420 /*
6421 * Remember the strongest lock mode held by any member of the
6422 * multixact.
6423 */
6424 mode = TUPLOCK_from_mxstatus(members[i].status);
6425 if (mode > strongest)
6426 strongest = mode;
6427
6428 /* See what other bits we need */
6429 switch (members[i].status)
6430 {
6431 case MultiXactStatusForKeyShare:
6432 case MultiXactStatusForShare:
6433 case MultiXactStatusForNoKeyUpdate:
6434 break;
6435
6436 case MultiXactStatusForUpdate:
6437 bits2 |= HEAP_KEYS_UPDATED;
6438 break;
6439
6440 case MultiXactStatusNoKeyUpdate:
6441 has_update = true;
6442 break;
6443
6444 case MultiXactStatusUpdate:
6445 bits2 |= HEAP_KEYS_UPDATED;
6446 has_update = true;
6447 break;
6448 }
6449 }
6450
6451 if (strongest == LockTupleExclusive ||
6452 strongest == LockTupleNoKeyExclusive)
6453 bits |= HEAP_XMAX_EXCL_LOCK;
6454 else if (strongest == LockTupleShare)
6455 bits |= HEAP_XMAX_SHR_LOCK;
6456 else if (strongest == LockTupleKeyShare)
6457 bits |= HEAP_XMAX_KEYSHR_LOCK;
6458
6459 if (!has_update)
6460 bits |= HEAP_XMAX_LOCK_ONLY;
6461
6462 if (nmembers > 0)
6463 pfree(members);
6464
6465 *new_infomask = bits;
6466 *new_infomask2 = bits2;
6467}
6468
6469/*
6470 * MultiXactIdGetUpdateXid
6471 *
6472 * Given a multixact Xmax and corresponding infomask, which does not have the
6473 * HEAP_XMAX_LOCK_ONLY bit set, obtain and return the Xid of the updating
6474 * transaction.
6475 *
6476 * Caller is expected to check the status of the updating transaction, if
6477 * necessary.
6478 */
6479static TransactionId
6480MultiXactIdGetUpdateXid(TransactionId xmax, uint16 t_infomask)
6481{
6482 TransactionId update_xact = InvalidTransactionId;
6483 MultiXactMember *members;
6484 int nmembers;
6485
6486 Assert(!(t_infomask & HEAP_XMAX_LOCK_ONLY));
6487 Assert(t_infomask & HEAP_XMAX_IS_MULTI);
6488
6489 /*
6490 * Since we know the LOCK_ONLY bit is not set, this cannot be a multi from
6491 * pre-pg_upgrade.
6492 */
6493 nmembers = GetMultiXactIdMembers(xmax, &members, false, false);
6494
6495 if (nmembers > 0)
6496 {
6497 int i;
6498
6499 for (i = 0; i < nmembers; i++)
6500 {
6501 /* Ignore lockers */
6502 if (!ISUPDATE_from_mxstatus(members[i].status))
6503 continue;
6504
6505 /* there can be at most one updater */
6506 Assert(update_xact == InvalidTransactionId);
6507 update_xact = members[i].xid;
6508#ifndef USE_ASSERT_CHECKING
6509
6510 /*
6511			 * Without assertions, stop at the first updater; an assert-enabled
6512			 * build walks the whole array to ensure there's no other updater.
6513 */
6514 break;
6515#endif
6516 }
6517
6518 pfree(members);
6519 }
6520
6521 return update_xact;
6522}
6523
6524/*
6525 * HeapTupleGetUpdateXid
6526 * As above, but use a HeapTupleHeader
6527 *
6528 * See also HeapTupleHeaderGetUpdateXid, which can be used without previously
6529 * checking the hint bits.
6530 */
6531TransactionId
6532HeapTupleGetUpdateXid(HeapTupleHeader tuple)
6533{
6534 return MultiXactIdGetUpdateXid(HeapTupleHeaderGetRawXmax(tuple),
6535 tuple->t_infomask);
6536}
6537
6538/*
6539 * Does the given multixact conflict with the current transaction grabbing a
6540 * tuple lock of the given strength?
6541 *
6542 * The passed infomask pairs up with the given multixact in the tuple header.
6543 *
6544 * If current_is_member is not NULL, it is set to 'true' if the current
6545 * transaction is a member of the given multixact.
6546 */
6547static bool
6548DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask,
6549 LockTupleMode lockmode, bool *current_is_member)
6550{
6551 int nmembers;
6552 MultiXactMember *members;
6553 bool result = false;
6554 LOCKMODE wanted = tupleLockExtraInfo[lockmode].hwlock;
6555
6556 if (HEAP_LOCKED_UPGRADED(infomask))
6557 return false;
6558
6559 nmembers = GetMultiXactIdMembers(multi, &members, false,
6560 HEAP_XMAX_IS_LOCKED_ONLY(infomask));
6561 if (nmembers >= 0)
6562 {
6563 int i;
6564
6565 for (i = 0; i < nmembers; i++)
6566 {
6567 TransactionId memxid;
6568 LOCKMODE memlockmode;
6569
6570 if (result && (current_is_member == NULL || *current_is_member))
6571 break;
6572
6573 memlockmode = LOCKMODE_from_mxstatus(members[i].status);
6574
6575 /* ignore members from current xact (but track their presence) */
6576 memxid = members[i].xid;
6577 if (TransactionIdIsCurrentTransactionId(memxid))
6578 {
6579 if (current_is_member != NULL)
6580 *current_is_member = true;
6581 continue;
6582 }
6583 else if (result)
6584 continue;
6585
6586 /* ignore members that don't conflict with the lock we want */
6587 if (!DoLockModesConflict(memlockmode, wanted))
6588 continue;
6589
6590 if (ISUPDATE_from_mxstatus(members[i].status))
6591 {
6592 /* ignore aborted updaters */
6593 if (TransactionIdDidAbort(memxid))
6594 continue;
6595 }
6596 else
6597 {
6598 /* ignore lockers-only that are no longer in progress */
6599 if (!TransactionIdIsInProgress(memxid))
6600 continue;
6601 }
6602
6603 /*
6604			 * Whatever remains are either live lockers that conflict with our
6605			 * wanted lock, or updaters that are not aborted. Those conflict
6606			 * with what we want. Set up to return true, but keep going to
6607 * look for the current transaction among the multixact members,
6608 * if needed.
6609 */
6610 result = true;
6611 }
6612 pfree(members);
6613 }
6614
6615 return result;
6616}
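
/*
 * Illustrative sketch (not compiled): heap_lock_tuple uses the test above to
 * decide whether it can avoid sleeping on a multixact Xmax at all; the
 * surrounding code is abbreviated.
 *
 *		if (!DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
 *									 mode, &current_is_member))
 *		{
 *			(no conflicting member is live: no need to sleep, go ahead and
 *			 mark the tuple as locked by us as well)
 *		}
 *		else
 *			(sleep via MultiXactIdWait, then recheck Xmax from scratch)
 */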
6617
6618/*
6619 * Do_MultiXactIdWait
6620 * Actual implementation for the two functions below.
6621 *
6622 * 'multi', 'status' and 'infomask' indicate what to sleep on (the status is
6623 * needed to ensure we only sleep on conflicting members, and the infomask is
6624 * used to optimize multixact access in case it's a lock-only multi); 'nowait'
6625 * indicates whether to use conditional lock acquisition, to allow callers to
6626 * fail if lock is unavailable. 'rel', 'ctid' and 'oper' are used to set up
6627 * context information for error messages. 'remaining', if not NULL, receives
6628 * the number of members that are still running, including any (non-aborted)
6629 * subtransactions of our own transaction.
6630 *
6631 * We do this by sleeping on each member using XactLockTableWait. Any
6632 * members that belong to the current backend are *not* waited for, however;
6633 * this would not merely be useless but would lead to Assert failure inside
6634 * XactLockTableWait. By the time this returns, it is certain that all
6635 * transactions *of other backends* that were members of the MultiXactId
6636 * that conflict with the requested status are dead (and no new ones can have
6637 * been added, since it is not legal to add members to an existing
6638 * MultiXactId).
6639 *
6640 * But by the time we finish sleeping, someone else may have changed the Xmax
6641 * of the containing tuple, so the caller needs to iterate on us somehow.
6642 *
6643 * Note that in case we return false, the number of remaining members is
6644 * not to be trusted.
6645 */
6646static bool
6647Do_MultiXactIdWait(MultiXactId multi, MultiXactStatus status,
6648 uint16 infomask, bool nowait,
6649 Relation rel, ItemPointer ctid, XLTW_Oper oper,
6650 int *remaining)
6651{
6652 bool result = true;
6653 MultiXactMember *members;
6654 int nmembers;
6655 int remain = 0;
6656
6657 /* for pre-pg_upgrade tuples, no need to sleep at all */
6658 nmembers = HEAP_LOCKED_UPGRADED(infomask) ? -1 :
6659 GetMultiXactIdMembers(multi, &members, false,
6660 HEAP_XMAX_IS_LOCKED_ONLY(infomask));
6661
6662 if (nmembers >= 0)
6663 {
6664 int i;
6665
6666 for (i = 0; i < nmembers; i++)
6667 {
6668 TransactionId memxid = members[i].xid;
6669 MultiXactStatus memstatus = members[i].status;
6670
6671 if (TransactionIdIsCurrentTransactionId(memxid))
6672 {
6673 remain++;
6674 continue;
6675 }
6676
6677 if (!DoLockModesConflict(LOCKMODE_from_mxstatus(memstatus),
6678 LOCKMODE_from_mxstatus(status)))
6679 {
6680 if (remaining && TransactionIdIsInProgress(memxid))
6681 remain++;
6682 continue;
6683 }
6684
6685 /*
6686 * This member conflicts with our multi, so we have to sleep (or
6687 * return failure, if asked to avoid waiting.)
6688 *
6689 * Note that we don't set up an error context callback ourselves,
6690 * but instead we pass the info down to XactLockTableWait. This
6691 * might seem a bit wasteful because the context is set up and
6692			 * torn down for each member of the multixact, but in reality it
6693 * should be barely noticeable, and it avoids duplicate code.
6694 */
6695 if (nowait)
6696 {
6697 result = ConditionalXactLockTableWait(memxid);
6698 if (!result)
6699 break;
6700 }
6701 else
6702 XactLockTableWait(memxid, rel, ctid, oper);
6703 }
6704
6705 pfree(members);
6706 }
6707
6708 if (remaining)
6709 *remaining = remain;
6710
6711 return result;
6712}
6713
6714/*
6715 * MultiXactIdWait
6716 * Sleep on a MultiXactId.
6717 *
6718 * By the time we finish sleeping, someone else may have changed the Xmax
6719 * of the containing tuple, so the caller needs to iterate on us somehow.
6720 *
6721 * We return (in *remaining, if not NULL) the number of members that are still
6722 * running, including any (non-aborted) subtransactions of our own transaction.
6723 */
6724static void
6725MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask,
6726 Relation rel, ItemPointer ctid, XLTW_Oper oper,
6727 int *remaining)
6728{
6729 (void) Do_MultiXactIdWait(multi, status, infomask, false,
6730 rel, ctid, oper, remaining);
6731}
6732
6733/*
6734 * ConditionalMultiXactIdWait
6735 * As above, but only lock if we can get the lock without blocking.
6736 *
6737 * By the time we finish sleeping, someone else may have changed the Xmax
6738 * of the containing tuple, so the caller needs to iterate on us somehow.
6739 *
6740 * Returns true if the multixact is now all gone; returns false if some
6741 * transactions might still be running.
6742 *
6743 * We return (in *remaining, if not NULL) the number of members that are still
6744 * running, including any (non-aborted) subtransactions of our own transaction.
6745 */
6746static bool
6747ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status,
6748 uint16 infomask, Relation rel, int *remaining)
6749{
6750 return Do_MultiXactIdWait(multi, status, infomask, true,
6751 rel, NULL, XLTW_None, remaining);
6752}
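
/*
 * Illustrative sketch (not compiled): callers must not sleep while holding
 * the buffer lock, and must recheck Xmax afterwards because it may have
 * changed while we slept.  This mirrors the pattern in heap_delete and
 * heap_update; the recheck shown here is abbreviated.
 *
 *		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
 *		MultiXactIdWait((MultiXactId) xwait, mxact_status, infomask,
 *						relation, &tp.t_self, XLTW_Delete, NULL);
 *		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 *
 *		if (xmax_infomask_changed(tp.t_data->t_infomask, infomask) ||
 *			!TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data), xwait))
 *			goto l1;				(start over from the top)
 */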
6753
6754/*
6755 * heap_tuple_needs_eventual_freeze
6756 *
6757 * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
6758 * will eventually require freezing. Similar to heap_tuple_needs_freeze,
6759 * but there's no cutoff, since we're trying to figure out whether freezing
6760 * will ever be needed, not whether it's needed now.
6761 */
6762bool
6763heap_tuple_needs_eventual_freeze(HeapTupleHeader tuple)
6764{
6765 TransactionId xid;
6766
6767 /*
6768 * If xmin is a normal transaction ID, this tuple is definitely not
6769 * frozen.
6770 */
6771 xid = HeapTupleHeaderGetXmin(tuple);
6772 if (TransactionIdIsNormal(xid))
6773 return true;
6774
6775 /*
6776 * If xmax is a valid xact or multixact, this tuple is also not frozen.
6777 */
6778 if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
6779 {
6780 MultiXactId multi;
6781
6782 multi = HeapTupleHeaderGetRawXmax(tuple);
6783 if (MultiXactIdIsValid(multi))
6784 return true;
6785 }
6786 else
6787 {
6788 xid = HeapTupleHeaderGetRawXmax(tuple);
6789 if (TransactionIdIsNormal(xid))
6790 return true;
6791 }
6792
6793 if (tuple->t_infomask & HEAP_MOVED)
6794 {
6795 xid = HeapTupleHeaderGetXvac(tuple);
6796 if (TransactionIdIsNormal(xid))
6797 return true;
6798 }
6799
6800 return false;
6801}
6802
6803/*
6804 * heap_tuple_needs_freeze
6805 *
6806 * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
6807 * are older than the specified cutoff XID or MultiXactId. If so, return true.
6808 *
6809 * It doesn't matter whether the tuple is alive or dead, we are checking
6810 * to see if a tuple needs to be removed or frozen to avoid wraparound.
6811 *
6812 * NB: Cannot rely on hint bits here; they might not be set after a crash or
6813 * on a standby.
6814 */
6815bool
6816heap_tuple_needs_freeze(HeapTupleHeader tuple, TransactionId cutoff_xid,
6817 MultiXactId cutoff_multi, Buffer buf)
6818{
6819 TransactionId xid;
6820
6821 xid = HeapTupleHeaderGetXmin(tuple);
6822 if (TransactionIdIsNormal(xid) &&
6823 TransactionIdPrecedes(xid, cutoff_xid))
6824 return true;
6825
6826 /*
6827 * The considerations for multixacts are complicated; look at
6828 * heap_prepare_freeze_tuple for justifications. This routine had better
6829 * be in sync with that one!
6830 */
6831 if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
6832 {
6833 MultiXactId multi;
6834
6835 multi = HeapTupleHeaderGetRawXmax(tuple);
6836 if (!MultiXactIdIsValid(multi))
6837 {
6838 /* no xmax set, ignore */
6839 ;
6840 }
6841 else if (HEAP_LOCKED_UPGRADED(tuple->t_infomask))
6842 return true;
6843 else if (MultiXactIdPrecedes(multi, cutoff_multi))
6844 return true;
6845 else
6846 {
6847 MultiXactMember *members;
6848 int nmembers;
6849 int i;
6850
6851 /* need to check whether any member of the mxact is too old */
6852
6853 nmembers = GetMultiXactIdMembers(multi, &members, false,
6854 HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask));
6855
6856 for (i = 0; i < nmembers; i++)
6857 {
6858 if (TransactionIdPrecedes(members[i].xid, cutoff_xid))
6859 {
6860 pfree(members);
6861 return true;
6862 }
6863 }
6864 if (nmembers > 0)
6865 pfree(members);
6866 }
6867 }
6868 else
6869 {
6870 xid = HeapTupleHeaderGetRawXmax(tuple);
6871 if (TransactionIdIsNormal(xid) &&
6872 TransactionIdPrecedes(xid, cutoff_xid))
6873 return true;
6874 }
6875
6876 if (tuple->t_infomask & HEAP_MOVED)
6877 {
6878 xid = HeapTupleHeaderGetXvac(tuple);
6879 if (TransactionIdIsNormal(xid) &&
6880 TransactionIdPrecedes(xid, cutoff_xid))
6881 return true;
6882 }
6883
6884 return false;
6885}
6886
6887/*
6888 * If 'tuple' contains any visible XID greater than latestRemovedXid,
6889 * ratchet forwards latestRemovedXid to the greatest one found.
6890 * This is used as the basis for generating Hot Standby conflicts, so
6891 * if a tuple was never visible then removing it should not conflict
6892 * with queries.
6893 */
6894void
6895HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple,
6896 TransactionId *latestRemovedXid)
6897{
6898 TransactionId xmin = HeapTupleHeaderGetXmin(tuple);
6899 TransactionId xmax = HeapTupleHeaderGetUpdateXid(tuple);
6900 TransactionId xvac = HeapTupleHeaderGetXvac(tuple);
6901
6902 if (tuple->t_infomask & HEAP_MOVED)
6903 {
6904 if (TransactionIdPrecedes(*latestRemovedXid, xvac))
6905 *latestRemovedXid = xvac;
6906 }
6907
6908 /*
6909 * Ignore tuples inserted by an aborted transaction or if the tuple was
6910 * updated/deleted by the inserting transaction.
6911 *
6912 * Look for a committed hint bit, or if no xmin bit is set, check clog.
6913 * This needs to work on both primary and standby, where it is used to
6914 * assess btree delete records.
6915 */
6916 if (HeapTupleHeaderXminCommitted(tuple) ||
6917 (!HeapTupleHeaderXminInvalid(tuple) && TransactionIdDidCommit(xmin)))
6918 {
6919 if (xmax != xmin &&
6920 TransactionIdFollows(xmax, *latestRemovedXid))
6921 *latestRemovedXid = xmax;
6922 }
6923
6924 /* *latestRemovedXid may still be invalid at end */
6925}
6926
6927#ifdef USE_PREFETCH
6928/*
6929 * Helper function for heap_compute_xid_horizon_for_tuples. Issue prefetch
6930 * requests for the number of buffers indicated by prefetch_count. The
6931 * prefetch_state keeps track of all the buffers that we can prefetch and
6932 * which ones have already been prefetched; each call to this function picks
6933 * up where the previous call left off.
6934 */
6935static void
6936xid_horizon_prefetch_buffer(Relation rel,
6937 XidHorizonPrefetchState *prefetch_state,
6938 int prefetch_count)
6939{
6940 BlockNumber cur_hblkno = prefetch_state->cur_hblkno;
6941 int count = 0;
6942 int i;
6943 int nitems = prefetch_state->nitems;
6944 ItemPointerData *tids = prefetch_state->tids;
6945
6946 for (i = prefetch_state->next_item;
6947 i < nitems && count < prefetch_count;
6948 i++)
6949 {
6950 ItemPointer htid = &tids[i];
6951
6952 if (cur_hblkno == InvalidBlockNumber ||
6953 ItemPointerGetBlockNumber(htid) != cur_hblkno)
6954 {
6955 cur_hblkno = ItemPointerGetBlockNumber(htid);
6956 PrefetchBuffer(rel, MAIN_FORKNUM, cur_hblkno);
6957 count++;
6958 }
6959 }
6960
6961 /*
6962 * Save the prefetch position so that next time we can continue from that
6963 * position.
6964 */
6965 prefetch_state->next_item = i;
6966 prefetch_state->cur_hblkno = cur_hblkno;
6967}
6968#endif
6969
6970/*
6971 * Get the latestRemovedXid from the heap pages pointed at by the index
6972 * tuples being deleted.
6973 *
6974 * We used to do this during recovery rather than on the primary, but that
6975 * approach now appears inferior. It meant that the primary could generate
6976 * a lot of work for the standby without any back-pressure to slow down the
6977 * primary, and it required the standby to have reached consistency, whereas
6978 * we want to have correct information available even before that point.
6979 *
6980 * It's possible for this to generate a fair amount of I/O, since we may be
6981 * deleting hundreds of tuples from a single index block. To amortize that
6982 * cost to some degree, this uses prefetching and combines repeat accesses to
6983 * the same block.
6984 */
6985TransactionId
6986heap_compute_xid_horizon_for_tuples(Relation rel,
6987 ItemPointerData *tids,
6988 int nitems)
6989{
6990 TransactionId latestRemovedXid = InvalidTransactionId;
6991 BlockNumber hblkno;
6992 Buffer buf = InvalidBuffer;
6993 Page hpage;
6994#ifdef USE_PREFETCH
6995 XidHorizonPrefetchState prefetch_state;
6996 int io_concurrency;
6997 int prefetch_distance;
6998#endif
6999
7000 /*
7001 * Sort to avoid repeated lookups for the same page, and to make it more
7002 * likely to access items in an efficient order. In particular, this
7003 * ensures that if there are multiple pointers to the same page, they all
7004 * get processed while looking up and locking the page just once.
7005 */
7006 qsort((void *) tids, nitems, sizeof(ItemPointerData),
7007 (int (*) (const void *, const void *)) ItemPointerCompare);
7008
7009#ifdef USE_PREFETCH
7010 /* Initialize prefetch state. */
7011 prefetch_state.cur_hblkno = InvalidBlockNumber;
7012 prefetch_state.next_item = 0;
7013 prefetch_state.nitems = nitems;
7014 prefetch_state.tids = tids;
7015
7016 /*
7017 * Compute the prefetch distance that we will attempt to maintain.
7018 *
7019 * We don't use the regular formula to determine how much to prefetch
7020 * here, but instead just add a constant to effective_io_concurrency.
7021 * That's because it seems best to do some prefetching here even when
7022 * effective_io_concurrency is set to 0, but if the DBA thinks it's OK to
7023 * do more prefetching for other operations, then it's probably OK to do
7024 * more prefetching in this case, too. It may be that this formula is too
7025 * simplistic, but at the moment there is no evidence of that or any idea
7026 * about what would work better.
7027 *
7028 * Since the caller holds a buffer lock somewhere in rel, we'd better make
7029 * sure that isn't a catalog relation before we call code that does
7030 * syscache lookups, to avoid risk of deadlock.
7031 */
7032 if (IsCatalogRelation(rel))
7033 io_concurrency = effective_io_concurrency;
7034 else
7035 io_concurrency = get_tablespace_io_concurrency(rel->rd_rel->reltablespace);
7036 prefetch_distance = Min(io_concurrency + 10, MAX_IO_CONCURRENCY);
7037
7038 /* Start prefetching. */
7039 xid_horizon_prefetch_buffer(rel, &prefetch_state, prefetch_distance);
7040#endif
7041
7042 /* Iterate over all tids, and check their horizon */
7043 hblkno = InvalidBlockNumber;
7044 hpage = NULL;
7045 for (int i = 0; i < nitems; i++)
7046 {
7047 ItemPointer htid = &tids[i];
7048 ItemId hitemid;
7049 OffsetNumber hoffnum;
7050
7051 /*
7052 * Read heap buffer, but avoid refetching if it's the same block as
7053 * required for the last tid.
7054 */
7055 if (hblkno == InvalidBlockNumber ||
7056 ItemPointerGetBlockNumber(htid) != hblkno)
7057 {
7058 /* release old buffer */
7059 if (BufferIsValid(buf))
7060 {
7061 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
7062 ReleaseBuffer(buf);
7063 }
7064
7065 hblkno = ItemPointerGetBlockNumber(htid);
7066
7067 buf = ReadBuffer(rel, hblkno);
7068
7069#ifdef USE_PREFETCH
7070
7071 /*
7072 * To maintain the prefetch distance, prefetch one more page for
7073 * each page we read.
7074 */
7075 xid_horizon_prefetch_buffer(rel, &prefetch_state, 1);
7076#endif
7077
7078 hpage = BufferGetPage(buf);
7079
7080 LockBuffer(buf, BUFFER_LOCK_SHARE);
7081 }
7082
7083 hoffnum = ItemPointerGetOffsetNumber(htid);
7084 hitemid = PageGetItemId(hpage, hoffnum);
7085
7086 /*
7087 * Follow any redirections until we find something useful.
7088 */
7089 while (ItemIdIsRedirected(hitemid))
7090 {
7091 hoffnum = ItemIdGetRedirect(hitemid);
7092 hitemid = PageGetItemId(hpage, hoffnum);
7093 CHECK_FOR_INTERRUPTS();
7094 }
7095
7096 /*
7097 * If the heap item has storage, then read the header and use that to
7098 * set latestRemovedXid.
7099 *
7100 * Some LP_DEAD items may not be accessible, so we ignore them.
7101 */
7102 if (ItemIdHasStorage(hitemid))
7103 {
7104 HeapTupleHeader htuphdr;
7105
7106 htuphdr = (HeapTupleHeader) PageGetItem(hpage, hitemid);
7107
7108 HeapTupleHeaderAdvanceLatestRemovedXid(htuphdr, &latestRemovedXid);
7109 }
7110 else if (ItemIdIsDead(hitemid))
7111 {
7112 /*
7113 * Conjecture: if hitemid is dead then it had xids before the xids
7114 * marked on LP_NORMAL items. So we just ignore this item and move
7115 * on to the next, for the purposes of calculating
7116 * latestRemovedXid.
7117 */
7118 }
7119 else
7120 Assert(!ItemIdIsUsed(hitemid));
7121
7122 }
7123
7124 if (BufferIsValid(buf))
7125 {
7126 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
7127 ReleaseBuffer(buf);
7128 }
7129
7130 /*
7131 * If all heap tuples were LP_DEAD then we will be returning
7132 * InvalidTransactionId here, which avoids conflicts. This matches
7133 * existing logic which assumes that LP_DEAD tuples must already be older
7134 * than the latestRemovedXid on the cleanup record that set them as
7135 * LP_DEAD, hence must already have generated a conflict.
7136 */
7137
7138 return latestRemovedXid;
7139}
7140
7141/*
7142 * Perform XLogInsert to register a heap cleanup info message. These
7143 * messages are sent once per VACUUM and are required because
7144 * of the phasing of removal operations during a lazy VACUUM.
7145 * See the comments for vacuum_log_cleanup_info().
7146 */
7147XLogRecPtr
7148log_heap_cleanup_info(RelFileNode rnode, TransactionId latestRemovedXid)
7149{
7150 xl_heap_cleanup_info xlrec;
7151 XLogRecPtr recptr;
7152
7153 xlrec.node = rnode;
7154 xlrec.latestRemovedXid = latestRemovedXid;
7155
7156 XLogBeginInsert();
7157 XLogRegisterData((char *) &xlrec, SizeOfHeapCleanupInfo);
7158
7159 recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_CLEANUP_INFO);
7160
7161 return recptr;
7162}
7163
7164/*
7165 * Perform XLogInsert for a heap-clean operation. Caller must already
7166 * have modified the buffer and marked it dirty.
7167 *
7168 * Note: prior to Postgres 8.3, the entries in the nowunused[] array were
7169 * zero-based tuple indexes. Now they are one-based like other uses
7170 * of OffsetNumber.
7171 *
7172 * We also include latestRemovedXid, which is the greatest XID present in
7173 * the removed tuples. That allows recovery processing to cancel or wait
7174 * for long standby queries that can still see these tuples.
7175 */
7176XLogRecPtr
7177log_heap_clean(Relation reln, Buffer buffer,
7178 OffsetNumber *redirected, int nredirected,
7179 OffsetNumber *nowdead, int ndead,
7180 OffsetNumber *nowunused, int nunused,
7181 TransactionId latestRemovedXid)
7182{
7183 xl_heap_clean xlrec;
7184 XLogRecPtr recptr;
7185
7186 /* Caller should not call me on a non-WAL-logged relation */
7187 Assert(RelationNeedsWAL(reln));
7188
7189 xlrec.latestRemovedXid = latestRemovedXid;
7190 xlrec.nredirected = nredirected;
7191 xlrec.ndead = ndead;
7192
7193 XLogBeginInsert();
7194 XLogRegisterData((char *) &xlrec, SizeOfHeapClean);
7195
7196 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
7197
7198 /*
7199 * The OffsetNumber arrays are not actually in the buffer, but we pretend
7200 * that they are. When XLogInsert stores the whole buffer, the offset
7201 * arrays need not be stored too. Note that even if all three arrays are
7202 * empty, we want to expose the buffer as a candidate for whole-page
7203 * storage, since this record type implies a defragmentation operation
7204 * even if no line pointers changed state.
7205 */
7206 if (nredirected > 0)
7207 XLogRegisterBufData(0, (char *) redirected,
7208 nredirected * sizeof(OffsetNumber) * 2);
7209
7210 if (ndead > 0)
7211 XLogRegisterBufData(0, (char *) nowdead,
7212 ndead * sizeof(OffsetNumber));
7213
7214 if (nunused > 0)
7215 XLogRegisterBufData(0, (char *) nowunused,
7216 nunused * sizeof(OffsetNumber));
7217
7218 recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_CLEAN);
7219
7220 return recptr;
7221}
7222
7223/*
7224 * Perform XLogInsert for a heap-freeze operation. Caller must have already
7225 * modified the buffer and marked it dirty.
7226 */
7227XLogRecPtr
7228log_heap_freeze(Relation reln, Buffer buffer, TransactionId cutoff_xid,
7229 xl_heap_freeze_tuple *tuples, int ntuples)
7230{
7231 xl_heap_freeze_page xlrec;
7232 XLogRecPtr recptr;
7233
7234 /* Caller should not call me on a non-WAL-logged relation */
7235 Assert(RelationNeedsWAL(reln));
7236 /* nor when there are no tuples to freeze */
7237 Assert(ntuples > 0);
7238
7239 xlrec.cutoff_xid = cutoff_xid;
7240 xlrec.ntuples = ntuples;
7241
7242 XLogBeginInsert();
7243 XLogRegisterData((char *) &xlrec, SizeOfHeapFreezePage);
7244
7245 /*
7246 * The freeze plan array is not actually in the buffer, but pretend that
7247 * it is. When XLogInsert stores the whole buffer, the freeze plan need
7248 * not be stored too.
7249 */
7250 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
7251 XLogRegisterBufData(0, (char *) tuples,
7252 ntuples * sizeof(xl_heap_freeze_tuple));
7253
7254 recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_FREEZE_PAGE);
7255
7256 return recptr;
7257}
7258
7259/*
7260 * Perform XLogInsert for a heap-visible operation. 'block' is the block
7261 * being marked all-visible, and vm_buffer is the buffer containing the
7262 * corresponding visibility map block. Both should have already been modified
7263 * and dirtied.
7264 *
7265 * If checksums or wal_log_hints are enabled, we may also generate a
7266 * full-page image of heap_buffer, if necessary.
7267 */
7268XLogRecPtr
7269log_heap_visible(RelFileNode rnode, Buffer heap_buffer, Buffer vm_buffer,
7270 TransactionId cutoff_xid, uint8 vmflags)
7271{
7272 xl_heap_visible xlrec;
7273 XLogRecPtr recptr;
7274 uint8 flags;
7275
7276 Assert(BufferIsValid(heap_buffer));
7277 Assert(BufferIsValid(vm_buffer));
7278
7279 xlrec.cutoff_xid = cutoff_xid;
7280 xlrec.flags = vmflags;
7281 XLogBeginInsert();
7282 XLogRegisterData((char *) &xlrec, SizeOfHeapVisible);
7283
7284 XLogRegisterBuffer(0, vm_buffer, 0);
7285
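 /*
 * Register the heap buffer as well, but suppress its full-page image
 * unless checksums or wal_log_hints require torn-page protection for
 * this hint-style change (see XLogHintBitIsNeeded()).
 */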
7286 flags = REGBUF_STANDARD;
7287 if (!XLogHintBitIsNeeded())
7288 flags |= REGBUF_NO_IMAGE;
7289 XLogRegisterBuffer(1, heap_buffer, flags);
7290
7291 recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_VISIBLE);
7292
7293 return recptr;
7294}
7295
7296/*
7297 * Perform XLogInsert for a heap-update operation. Caller must already
7298 * have modified the buffer(s) and marked them dirty.
7299 */
7300static XLogRecPtr
7301log_heap_update(Relation reln, Buffer oldbuf,
7302 Buffer newbuf, HeapTuple oldtup, HeapTuple newtup,
7303 HeapTuple old_key_tuple,
7304 bool all_visible_cleared, bool new_all_visible_cleared)
7305{
7306 xl_heap_update xlrec;
7307 xl_heap_header xlhdr;
7308 xl_heap_header xlhdr_idx;
7309 uint8 info;
7310 uint16 prefix_suffix[2];
7311 uint16 prefixlen = 0,
7312 suffixlen = 0;
7313 XLogRecPtr recptr;
7314 Page page = BufferGetPage(newbuf);
7315 bool need_tuple_data = RelationIsLogicallyLogged(reln);
7316 bool init;
7317 int bufflags;
7318
7319 /* Caller should not call me on a non-WAL-logged relation */
7320 Assert(RelationNeedsWAL(reln));
7321
7322 XLogBeginInsert();
7323
7324 if (HeapTupleIsHeapOnly(newtup))
7325 info = XLOG_HEAP_HOT_UPDATE;
7326 else
7327 info = XLOG_HEAP_UPDATE;
7328
7329 /*
7330 * If the old and new tuple are on the same page, we only need to log the
7331 * parts of the new tuple that were changed. That saves on the amount of
7332 * WAL we need to write. Currently, we just count any unchanged bytes in
7333 * the beginning and end of the tuple. That's quick to check, and
7334 * perfectly covers the common case that only one field is updated.
7335 *
7336 * We could do this even if the old and new tuple are on different pages,
7337 * but only if we don't make a full-page image of the old page, which is
7338 * difficult to know in advance. Also, if the old tuple is corrupt for
7339 * some reason, it would allow the corruption to propagate to the new page,
7340 * so it seems best to avoid. Under the general assumption that most
7341 * updates tend to create the new tuple version on the same page, there
7342 * isn't much to be gained by doing this across pages anyway.
7343 *
7344 * Skip this if we're taking a full-page image of the new page, as we
7345 * don't include the new tuple in the WAL record in that case. Also
7346 * disable if wal_level='logical', as logical decoding needs to be able to
7347 * read the new tuple in whole from the WAL record alone.
7348 */
7349 if (oldbuf == newbuf && !need_tuple_data &&
7350 !XLogCheckBufferNeedsBackup(newbuf))
7351 {
7352 char *oldp = (char *) oldtup->t_data + oldtup->t_data->t_hoff;
7353 char *newp = (char *) newtup->t_data + newtup->t_data->t_hoff;
7354 int oldlen = oldtup->t_len - oldtup->t_data->t_hoff;
7355 int newlen = newtup->t_len - newtup->t_data->t_hoff;
7356
7357 /* Check for common prefix between old and new tuple */
7358 for (prefixlen = 0; prefixlen < Min(oldlen, newlen); prefixlen++)
7359 {
7360 if (newp[prefixlen] != oldp[prefixlen])
7361 break;
7362 }
7363
7364 /*
7365 * Storing the length of the prefix takes 2 bytes, so we need to save
7366 * at least 3 bytes or there's no point.
7367 */
7368 if (prefixlen < 3)
7369 prefixlen = 0;
7370
7371 /* Same for suffix */
7372 for (suffixlen = 0; suffixlen < Min(oldlen, newlen) - prefixlen; suffixlen++)
7373 {
7374 if (newp[newlen - suffixlen - 1] != oldp[oldlen - suffixlen - 1])
7375 break;
7376 }
7377 if (suffixlen < 3)
7378 suffixlen = 0;
7379 }
7380
7381 /* Prepare main WAL data chain */
7382 xlrec.flags = 0;
7383 if (all_visible_cleared)
7384 xlrec.flags |= XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED;
7385 if (new_all_visible_cleared)
7386 xlrec.flags |= XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED;
7387 if (prefixlen > 0)
7388 xlrec.flags |= XLH_UPDATE_PREFIX_FROM_OLD;
7389 if (suffixlen > 0)
7390 xlrec.flags |= XLH_UPDATE_SUFFIX_FROM_OLD;
7391 if (need_tuple_data)
7392 {
7393 xlrec.flags |= XLH_UPDATE_CONTAINS_NEW_TUPLE;
7394 if (old_key_tuple)
7395 {
7396 if (reln->rd_rel->relreplident == REPLICA_IDENTITY_FULL)
7397 xlrec.flags |= XLH_UPDATE_CONTAINS_OLD_TUPLE;
7398 else
7399 xlrec.flags |= XLH_UPDATE_CONTAINS_OLD_KEY;
7400 }
7401 }
7402
7403 /* If the new tuple is the first and only tuple on the page... */
7404 if (ItemPointerGetOffsetNumber(&(newtup->t_self)) == FirstOffsetNumber &&
7405 PageGetMaxOffsetNumber(page) == FirstOffsetNumber)
7406 {
7407 info |= XLOG_HEAP_INIT_PAGE;
7408 init = true;
7409 }
7410 else
7411 init = false;
7412
7413 /* Prepare WAL data for the old page */
7414 xlrec.old_offnum = ItemPointerGetOffsetNumber(&oldtup->t_self);
7415 xlrec.old_xmax = HeapTupleHeaderGetRawXmax(oldtup->t_data);
7416 xlrec.old_infobits_set = compute_infobits(oldtup->t_data->t_infomask,
7417 oldtup->t_data->t_infomask2);
7418
7419 /* Prepare WAL data for the new page */
7420 xlrec.new_offnum = ItemPointerGetOffsetNumber(&newtup->t_self);
7421 xlrec.new_xmax = HeapTupleHeaderGetRawXmax(newtup->t_data);
7422
7423 bufflags = REGBUF_STANDARD;
7424 if (init)
7425 bufflags |= REGBUF_WILL_INIT;
7426 if (need_tuple_data)
7427 bufflags |= REGBUF_KEEP_DATA;
7428
7429 XLogRegisterBuffer(0, newbuf, bufflags);
7430 if (oldbuf != newbuf)
7431 XLogRegisterBuffer(1, oldbuf, REGBUF_STANDARD);
7432
7433 XLogRegisterData((char *) &xlrec, SizeOfHeapUpdate);
7434
7435 /*
7436 * Prepare WAL data for the new tuple.
7437 */
7438 if (prefixlen > 0 || suffixlen > 0)
7439 {
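 /*
 * Prefix/suffix compression was applied above: store the lengths (one
 * or two uint16s) so that redo can recover the omitted bytes from the
 * old tuple version, which is on the same page.
 */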
7440 if (prefixlen > 0 && suffixlen > 0)
7441 {
7442 prefix_suffix[0] = prefixlen;
7443 prefix_suffix[1] = suffixlen;
7444 XLogRegisterBufData(0, (char *) &prefix_suffix, sizeof(uint16) * 2);
7445 }
7446 else if (prefixlen > 0)
7447 {
7448 XLogRegisterBufData(0, (char *) &prefixlen, sizeof(uint16));
7449 }
7450 else
7451 {
7452 XLogRegisterBufData(0, (char *) &suffixlen, sizeof(uint16));
7453 }
7454 }
7455
7456 xlhdr.t_infomask2 = newtup->t_data->t_infomask2;
7457 xlhdr.t_infomask = newtup->t_data->t_infomask;
7458 xlhdr.t_hoff = newtup->t_data->t_hoff;
7459 Assert(SizeofHeapTupleHeader + prefixlen + suffixlen <= newtup->t_len);
7460
7461 /*
7462 * PG73FORMAT: write bitmap [+ padding] [+ oid] + data
7463 *
7464 * The 'data' doesn't include the common prefix or suffix.
7465 */
7466 XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader);
7467 if (prefixlen == 0)
7468 {
7469 XLogRegisterBufData(0,
7470 ((char *) newtup->t_data) + SizeofHeapTupleHeader,
7471 newtup->t_len - SizeofHeapTupleHeader - suffixlen);
7472 }
7473 else
7474 {
7475 /*
7476 * Have to write the null bitmap and data after the common prefix as
7477 * two separate rdata entries.
7478 */
7479 /* bitmap [+ padding] [+ oid] */
7480 if (newtup->t_data->t_hoff - SizeofHeapTupleHeader > 0)
7481 {
7482 XLogRegisterBufData(0,
7483 ((char *) newtup->t_data) + SizeofHeapTupleHeader,
7484 newtup->t_data->t_hoff - SizeofHeapTupleHeader);
7485 }
7486
7487 /* data after common prefix */
7488 XLogRegisterBufData(0,
7489 ((char *) newtup->t_data) + newtup->t_data->t_hoff + prefixlen,
7490 newtup->t_len - newtup->t_data->t_hoff - prefixlen - suffixlen);
7491 }
7492
7493 /* We need to log a tuple identity */
7494 if (need_tuple_data && old_key_tuple)
7495 {
7496 /* don't really need this, but it's more convenient to decode */
7497 xlhdr_idx.t_infomask2 = old_key_tuple->t_data->t_infomask2;
7498 xlhdr_idx.t_infomask = old_key_tuple->t_data->t_infomask;
7499 xlhdr_idx.t_hoff = old_key_tuple->t_data->t_hoff;
7500
7501 XLogRegisterData((char *) &xlhdr_idx, SizeOfHeapHeader);
7502
7503 /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
7504 XLogRegisterData((char *) old_key_tuple->t_data + SizeofHeapTupleHeader,
7505 old_key_tuple->t_len - SizeofHeapTupleHeader);
7506 }
7507
7508 /* filtering by origin on a row level is much more efficient */
7509 XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
7510
7511 recptr = XLogInsert(RM_HEAP_ID, info);
7512
7513 return recptr;
7514}
7515
7516/*
7517 * Perform XLogInsert of an XLOG_HEAP2_NEW_CID record
7518 *
7519 * This is only used in wal_level >= WAL_LEVEL_LOGICAL, and only for catalog
7520 * tuples.
7521 */
7522static XLogRecPtr
7523log_heap_new_cid(Relation relation, HeapTuple tup)
7524{
7525 xl_heap_new_cid xlrec;
7526
7527 XLogRecPtr recptr;
7528 HeapTupleHeader hdr = tup->t_data;
7529
7530 Assert(ItemPointerIsValid(&tup->t_self));
7531 Assert(tup->t_tableOid != InvalidOid);
7532
7533 xlrec.top_xid = GetTopTransactionId();
7534 xlrec.target_node = relation->rd_node;
7535 xlrec.target_tid = tup->t_self;
7536
7537 /*
7538 * If the tuple got inserted & deleted in the same TX we definitely have a
7539 * combocid, so set both cmin and cmax.
7540 */
7541 if (hdr->t_infomask & HEAP_COMBOCID)
7542 {
7543 Assert(!(hdr->t_infomask & HEAP_XMAX_INVALID));
7544 Assert(!HeapTupleHeaderXminInvalid(hdr));
7545 xlrec.cmin = HeapTupleHeaderGetCmin(hdr);
7546 xlrec.cmax = HeapTupleHeaderGetCmax(hdr);
7547 xlrec.combocid = HeapTupleHeaderGetRawCommandId(hdr);
7548 }
7549 /* No combocid, so only cmin or cmax can be set by this TX */
7550 else
7551 {
7552 /*
7553 * Tuple inserted.
7554 *
7555 * We need to check for LOCK ONLY because multixacts might be
7556 * transferred to the new tuple in case of FOR KEY SHARE updates in
7557 * which case there will be an xmax, although the tuple just got
7558 * inserted.
7559 */
7560 if (hdr->t_infomask & HEAP_XMAX_INVALID ||
7561 HEAP_XMAX_IS_LOCKED_ONLY(hdr->t_infomask))
7562 {
7563 xlrec.cmin = HeapTupleHeaderGetRawCommandId(hdr);
7564 xlrec.cmax = InvalidCommandId;
7565 }
7566 /* Tuple from a different tx updated or deleted. */
7567 else
7568 {
7569 xlrec.cmin = InvalidCommandId;
7570 xlrec.cmax = HeapTupleHeaderGetRawCommandId(hdr);
7571
7572 }
7573 xlrec.combocid = InvalidCommandId;
7574 }
7575
7576 /*
7577 * Note that we don't need to register the buffer here, because this
7578 * operation does not modify the page. The insert/update/delete that
7579 * called us certainly did, but that's WAL-logged separately.
7580 */
7581 XLogBeginInsert();
7582 XLogRegisterData((char *) &xlrec, SizeOfHeapNewCid);
7583
7584 /* will be looked at irrespective of origin */
7585
7586 recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_NEW_CID);
7587
7588 return recptr;
7589}
7590
7591/*
7592 * Build a heap tuple representing the configured REPLICA IDENTITY to identify
7593 * the old tuple in an UPDATE or DELETE.
7594 *
7595 * Returns NULL if there's no need to log an identity or if there's no suitable
7596 * key defined.
7597 *
7598 * key_changed should be false if caller knows that no replica identity
7599 * columns changed value. It's always true in the DELETE case.
7600 *
7601 * *copy is set to true if the returned tuple is a modified copy rather than
7602 * the same tuple that was passed in.
7603 */
7604static HeapTuple
7605ExtractReplicaIdentity(Relation relation, HeapTuple tp, bool key_changed,
7606 bool *copy)
7607{
7608 TupleDesc desc = RelationGetDescr(relation);
7609 char replident = relation->rd_rel->relreplident;
7610 Bitmapset *idattrs;
7611 HeapTuple key_tuple;
7612 bool nulls[MaxHeapAttributeNumber];
7613 Datum values[MaxHeapAttributeNumber];
7614
7615 *copy = false;
7616
7617 if (!RelationIsLogicallyLogged(relation))
7618 return NULL;
7619
7620 if (replident == REPLICA_IDENTITY_NOTHING)
7621 return NULL;
7622
7623 if (replident == REPLICA_IDENTITY_FULL)
7624 {
7625 /*
7626 * When logging the entire old tuple, it very well could contain
7627 * toasted columns. If so, force them to be inlined.
7628 */
7629 if (HeapTupleHasExternal(tp))
7630 {
7631 *copy = true;
7632 tp = toast_flatten_tuple(tp, desc);
7633 }
7634 return tp;
7635 }
7636
7637 /* if the key hasn't changed and we're only logging the key, we're done */
7638 if (!key_changed)
7639 return NULL;
7640
7641 /* find out the replica identity columns */
7642 idattrs = RelationGetIndexAttrBitmap(relation,
7643 INDEX_ATTR_BITMAP_IDENTITY_KEY);
7644
7645 /*
7646 * If there are no defined replica identity columns, treat as !key_changed.
7647 * (This case should not be reachable from heap_update, since that should
7648 * calculate key_changed accurately. But heap_delete just passes constant
7649 * true for key_changed, so we can hit this case in deletes.)
7650 */
7651 if (bms_is_empty(idattrs))
7652 return NULL;
7653
7654 /*
7655 * Construct a new tuple containing only the replica identity columns,
7656 * with nulls elsewhere. While we're at it, assert that the replica
7657 * identity columns aren't null.
7658 */
7659 heap_deform_tuple(tp, desc, values, nulls);
7660
7661 for (int i = 0; i < desc->natts; i++)
7662 {
7663 if (bms_is_member(i + 1 - FirstLowInvalidHeapAttributeNumber,
7664 idattrs))
7665 Assert(!nulls[i]);
7666 else
7667 nulls[i] = true;
7668 }
7669
7670 key_tuple = heap_form_tuple(desc, values, nulls);
7671 *copy = true;
7672
7673 bms_free(idattrs);
7674
7675 /*
7676 * If the tuple, which by now contains only the replica identity columns,
7677 * still has toasted columns, force them to be inlined. This is somewhat
7678 * unlikely since there are limits on the size of indexed columns, so we
7679 * don't duplicate toast_flatten_tuple()'s functionality in the above loop
7680 * over the indexed columns, even if it would be more efficient.
7681 */
7682 if (HeapTupleHasExternal(key_tuple))
7683 {
7684 HeapTuple oldtup = key_tuple;
7685
7686 key_tuple = toast_flatten_tuple(oldtup, desc);
7687 heap_freetuple(oldtup);
7688 }
7689
7690 return key_tuple;
7691}
7692
7693/*
7694 * Handles CLEANUP_INFO
7695 */
7696static void
7697heap_xlog_cleanup_info(XLogReaderState *record)
7698{
7699 xl_heap_cleanup_info *xlrec = (xl_heap_cleanup_info *) XLogRecGetData(record);
7700
7701 if (InHotStandby)
7702 ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, xlrec->node);
7703
7704 /*
7705 * The actual operation is a no-op. The record type exists to provide a means
7706 * for conflict processing to occur before we begin index vacuum actions. See
7707 * vacuumlazy.c and also the comments in btvacuumpage().
7708 */
7709
7710 /* Backup blocks are not used in cleanup_info records */
7711 Assert(!XLogRecHasAnyBlockRefs(record));
7712}
7713
7714/*
7715 * Handles XLOG_HEAP2_CLEAN record type
7716 */
7717static void
7718heap_xlog_clean(XLogReaderState *record)
7719{
7720 XLogRecPtr lsn = record->EndRecPtr;
7721 xl_heap_clean *xlrec = (xl_heap_clean *) XLogRecGetData(record);
7722 Buffer buffer;
7723 RelFileNode rnode;
7724 BlockNumber blkno;
7725 XLogRedoAction action;
7726
7727 XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno);
7728
7729 /*
7730 * We're about to remove tuples. In Hot Standby mode, ensure that there are
7731 * no queries running for which the removed tuples are still visible.
7732 *
7733 * Not all HEAP2_CLEAN records remove tuples with xids, so we only want to
7734 * conflict on the records that cause MVCC failures for user queries. If
7735 * latestRemovedXid is invalid, skip conflict processing.
7736 */
7737 if (InHotStandby && TransactionIdIsValid(xlrec->latestRemovedXid))
7738 ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, rnode);
7739
7740 /*
7741 * If we have a full-page image, restore it (using a cleanup lock) and
7742 * we're done.
7743 */
7744 action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true,
7745 &buffer);
7746 if (action == BLK_NEEDS_REDO)
7747 {
7748 Page page = (Page) BufferGetPage(buffer);
7749 OffsetNumber *end;
7750 OffsetNumber *redirected;
7751 OffsetNumber *nowdead;
7752 OffsetNumber *nowunused;
7753 int nredirected;
7754 int ndead;
7755 int nunused;
7756 Size datalen;
7757
7758 redirected = (OffsetNumber *) XLogRecGetBlockData(record, 0, &datalen);
7759
7760 nredirected = xlrec->nredirected;
7761 ndead = xlrec->ndead;
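 /*
 * The registered block data consists of nredirected pairs of (from, to)
 * offsets, then ndead offsets, with the now-unused offsets filling the
 * rest; hence the pointer arithmetic below.
 */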
7762 end = (OffsetNumber *) ((char *) redirected + datalen);
7763 nowdead = redirected + (nredirected * 2);
7764 nowunused = nowdead + ndead;
7765 nunused = (end - nowunused);
7766 Assert(nunused >= 0);
7767
7768 /* Update all line pointers per the record, and repair fragmentation */
7769 heap_page_prune_execute(buffer,
7770 redirected, nredirected,
7771 nowdead, ndead,
7772 nowunused, nunused);
7773
7774 /*
7775 * Note: we don't worry about updating the page's prunability hints.
7776 * At worst this will cause an extra prune cycle to occur soon.
7777 */
7778
7779 PageSetLSN(page, lsn);
7780 MarkBufferDirty(buffer);
7781 }
7782
7783 if (BufferIsValid(buffer))
7784 {
7785 Size freespace = PageGetHeapFreeSpace(BufferGetPage(buffer));
7786
7787 UnlockReleaseBuffer(buffer);
7788
7789 /*
7790 * After cleaning records from a page, it's useful to update the FSM
7791 * about it, as that may cause the page to become a target for insertions
7792 * later even if vacuum decides not to visit it (which is possible if it
7793 * gets marked all-visible).
7794 *
7795 * Do this regardless of a full-page image being applied, since the
7796 * FSM data is not in the page anyway.
7797 */
7798 XLogRecordPageWithFreeSpace(rnode, blkno, freespace);
7799 }
7800}
7801
7802/*
7803 * Replay XLOG_HEAP2_VISIBLE record.
7804 *
7805 * The critical integrity requirement here is that we must never end up with
7806 * a situation where the visibility map bit is set, and the page-level
7807 * PD_ALL_VISIBLE bit is clear. If that were to occur, then a subsequent
7808 * page modification would fail to clear the visibility map bit.
7809 */
7810static void
7811heap_xlog_visible(XLogReaderState *record)
7812{
7813 XLogRecPtr lsn = record->EndRecPtr;
7814 xl_heap_visible *xlrec = (xl_heap_visible *) XLogRecGetData(record);
7815 Buffer vmbuffer = InvalidBuffer;
7816 Buffer buffer;
7817 Page page;
7818 RelFileNode rnode;
7819 BlockNumber blkno;
7820 XLogRedoAction action;
7821
7822 XLogRecGetBlockTag(record, 1, &rnode, NULL, &blkno);
7823
7824 /*
7825 * If there are any Hot Standby transactions running that have an xmin
7826 * horizon old enough that this page isn't all-visible for them, they
7827 * might incorrectly decide that an index-only scan can skip a heap fetch.
7828 *
7829 * NB: It might be better to throw some kind of "soft" conflict here that
7830 * forces any index-only scan that is in flight to perform heap fetches,
7831 * rather than killing the transaction outright.
7832 */
7833 if (InHotStandby)
7834 ResolveRecoveryConflictWithSnapshot(xlrec->cutoff_xid, rnode);
7835
7836 /*
7837 * Read the heap page, if it still exists. If the heap file has been dropped
7838 * or truncated later in recovery, we don't need to update the page, but we'd
7839 * better still update the visibility map.
7840 */
7841 action = XLogReadBufferForRedo(record, 1, &buffer);
7842 if (action == BLK_NEEDS_REDO)
7843 {
7844 /*
7845 * We don't bump the LSN of the heap page when setting the visibility
7846 * map bit (unless checksums or wal_log_hints are enabled, in which
7847 * case we must), because that would generate an unworkable volume of
7848 * full-page writes. This exposes us to torn page hazards, but since
7849 * we're not inspecting the existing page contents in any way, we
7850 * don't care.
7851 *
7852 * However, all operations that clear the visibility map bit *do* bump
7853 * the LSN, and those operations will only be replayed if the XLOG LSN
7854 * follows the page LSN. Thus, if the page LSN has advanced past our
7855 * XLOG record's LSN, we mustn't mark the page all-visible, because
7856 * the subsequent update won't be replayed to clear the flag.
7857 */
7858 page = BufferGetPage(buffer);
7859
7860 PageSetAllVisible(page);
7861
7862 MarkBufferDirty(buffer);
7863 }
7864 else if (action == BLK_RESTORED)
7865 {
7866 /*
7867 * If heap block was backed up, we already restored it and there's
7868 * nothing more to do. (This can only happen with checksums or
7869 * wal_log_hints enabled.)
7870 */
7871 }
7872
7873 if (BufferIsValid(buffer))
7874 {
7875 Size space = PageGetFreeSpace(BufferGetPage(buffer));
7876
7877 UnlockReleaseBuffer(buffer);
7878
7879 /*
7880 * Since FSM is not WAL-logged and only updated heuristically, it
7881 * easily becomes stale in standbys. If the standby is later promoted
7882 * and runs VACUUM, it will skip updating individual free space
7883 * figures for pages that became all-visible (or all-frozen, depending
7884 * on the vacuum mode), which is troublesome when FreeSpaceMapVacuum
7885 * propagates overly optimistic free-space values to upper FSM layers;
7886 * later inserters try to use such pages only to find out that they
7887 * are unusable. This can cause long stalls when there are many such
7888 * pages.
7889 *
7890 * Forestall those problems by updating FSM's idea about a page that
7891 * is becoming all-visible or all-frozen.
7892 *
7893 * Do this regardless of a full-page image being applied, since the
7894 * FSM data is not in the page anyway.
7895 */
7896 if (xlrec->flags & VISIBILITYMAP_VALID_BITS)
7897 XLogRecordPageWithFreeSpace(rnode, blkno, space);
7898 }
7899
7900 /*
7901 * Even if we skipped the heap page update due to the LSN interlock, it's
7902 * still safe to update the visibility map. Any WAL record that clears
7903 * the visibility map bit does so before checking the page LSN, so any
7904 * bits that need to be cleared will still be cleared.
7905 */
7906 if (XLogReadBufferForRedoExtended(record, 0, RBM_ZERO_ON_ERROR, false,
7907 &vmbuffer) == BLK_NEEDS_REDO)
7908 {
7909 Page vmpage = BufferGetPage(vmbuffer);
7910 Relation reln;
7911
7912 /* initialize the page if it was read as zeros */
7913 if (PageIsNew(vmpage))
7914 PageInit(vmpage, BLCKSZ, 0);
7915
7916 /*
7917 * XLogReadBufferForRedoExtended locked the buffer. But
7918 * visibilitymap_set will handle locking itself.
7919 */
7920 LockBuffer(vmbuffer, BUFFER_LOCK_UNLOCK);
7921
7922 reln = CreateFakeRelcacheEntry(rnode);
7923 visibilitymap_pin(reln, blkno, &vmbuffer);
7924
7925 /*
7926 * Don't set the bit if replay has already passed this point.
7927 *
7928 * It might be safe to do this unconditionally; if replay has passed
7929 * this point, we'll replay at least as far this time as we did
7930 * before, and if this bit needs to be cleared, the record responsible
7931 * for doing so will be replayed again and will clear it. For now,
7932 * out of an abundance of caution, we use the same test here that
7933 * we did for the heap page. If this results in a dropped bit, no
7934 * real harm is done; the next VACUUM will fix it.
7935 */
7936 if (lsn > PageGetLSN(vmpage))
7937 visibilitymap_set(reln, blkno, InvalidBuffer, lsn, vmbuffer,
7938 xlrec->cutoff_xid, xlrec->flags);
7939
7940 ReleaseBuffer(vmbuffer);
7941 FreeFakeRelcacheEntry(reln);
7942 }
7943 else if (BufferIsValid(vmbuffer))
7944 UnlockReleaseBuffer(vmbuffer);
7945}
7946
7947/*
7948 * Replay XLOG_HEAP2_FREEZE_PAGE records
7949 */
7950static void
7951heap_xlog_freeze_page(XLogReaderState *record)
7952{
7953 XLogRecPtr lsn = record->EndRecPtr;
7954 xl_heap_freeze_page *xlrec = (xl_heap_freeze_page *) XLogRecGetData(record);
7955 TransactionId cutoff_xid = xlrec->cutoff_xid;
7956 Buffer buffer;
7957 int ntup;
7958
7959 /*
7960 * In Hot Standby mode, ensure that there are no queries running which still
7961 * consider the frozen xids as running.
7962 */
7963 if (InHotStandby)
7964 {
7965 RelFileNode rnode;
7966 TransactionId latestRemovedXid = cutoff_xid;
7967
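 /*
 * Only xids preceding cutoff_xid are frozen, so the newest xid that
 * could have been frozen away is cutoff_xid - 1.
 */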
7968 TransactionIdRetreat(latestRemovedXid);
7969
7970 XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL);
7971 ResolveRecoveryConflictWithSnapshot(latestRemovedXid, rnode);
7972 }
7973
7974 if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
7975 {
7976 Page page = BufferGetPage(buffer);
7977 xl_heap_freeze_tuple *tuples;
7978
7979 tuples = (xl_heap_freeze_tuple *) XLogRecGetBlockData(record, 0, NULL);
7980
7981 /* now execute freeze plan for each frozen tuple */
7982 for (ntup = 0; ntup < xlrec->ntuples; ntup++)
7983 {
7984 xl_heap_freeze_tuple *xlrec_tp;
7985 ItemId lp;
7986 HeapTupleHeader tuple;
7987
7988 xlrec_tp = &tuples[ntup];
7989 lp = PageGetItemId(page, xlrec_tp->offset); /* offsets are one-based */
7990 tuple = (HeapTupleHeader) PageGetItem(page, lp);
7991
7992 heap_execute_freeze_tuple(tuple, xlrec_tp);
7993 }
7994
7995 PageSetLSN(page, lsn);
7996 MarkBufferDirty(buffer);
7997 }
7998 if (BufferIsValid(buffer))
7999 UnlockReleaseBuffer(buffer);
8000}
8001
8002/*
8003 * Given an "infobits" field from an XLog record, set the correct bits in the
8004 * given infomask and infomask2 for the tuple touched by the record.
8005 *
8006 * (This is the reverse of compute_infobits).
8007 */
8008static void
8009fix_infomask_from_infobits(uint8 infobits, uint16 *infomask, uint16 *infomask2)
8010{
8011 *infomask &= ~(HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY |
8012 HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_EXCL_LOCK);
8013 *infomask2 &= ~HEAP_KEYS_UPDATED;
8014
8015 if (infobits & XLHL_XMAX_IS_MULTI)
8016 *infomask |= HEAP_XMAX_IS_MULTI;
8017 if (infobits & XLHL_XMAX_LOCK_ONLY)
8018 *infomask |= HEAP_XMAX_LOCK_ONLY;
8019 if (infobits & XLHL_XMAX_EXCL_LOCK)
8020 *infomask |= HEAP_XMAX_EXCL_LOCK;
8021 /* note HEAP_XMAX_SHR_LOCK isn't considered here */
8022 if (infobits & XLHL_XMAX_KEYSHR_LOCK)
8023 *infomask |= HEAP_XMAX_KEYSHR_LOCK;
8024
8025 if (infobits & XLHL_KEYS_UPDATED)
8026 *infomask2 |= HEAP_KEYS_UPDATED;
8027}
8028
8029static void
8030heap_xlog_delete(XLogReaderState *record)
8031{
8032 XLogRecPtr lsn = record->EndRecPtr;
8033 xl_heap_delete *xlrec = (xl_heap_delete *) XLogRecGetData(record);
8034 Buffer buffer;
8035 Page page;
8036 ItemId lp = NULL;
8037 HeapTupleHeader htup;
8038 BlockNumber blkno;
8039 RelFileNode target_node;
8040 ItemPointerData target_tid;
8041
8042 XLogRecGetBlockTag(record, 0, &target_node, NULL, &blkno);
8043 ItemPointerSetBlockNumber(&target_tid, blkno);
8044 ItemPointerSetOffsetNumber(&target_tid, xlrec->offnum);
8045
8046 /*
8047 * The visibility map may need to be fixed even if the heap page is
8048 * already up-to-date.
8049 */
8050 if (xlrec->flags & XLH_DELETE_ALL_VISIBLE_CLEARED)
8051 {
8052 Relation reln = CreateFakeRelcacheEntry(target_node);
8053 Buffer vmbuffer = InvalidBuffer;
8054
8055 visibilitymap_pin(reln, blkno, &vmbuffer);
8056 visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS);
8057 ReleaseBuffer(vmbuffer);
8058 FreeFakeRelcacheEntry(reln);
8059 }
8060
8061 if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
8062 {
8063 page = BufferGetPage(buffer);
8064
8065 if (PageGetMaxOffsetNumber(page) >= xlrec->offnum)
8066 lp = PageGetItemId(page, xlrec->offnum);
8067
8068 if (PageGetMaxOffsetNumber(page) < xlrec->offnum || !ItemIdIsNormal(lp))
8069 elog(PANIC, "invalid lp");
8070
8071 htup = (HeapTupleHeader) PageGetItem(page, lp);
8072
8073 htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
8074 htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
8075 HeapTupleHeaderClearHotUpdated(htup);
8076 fix_infomask_from_infobits(xlrec->infobits_set,
8077 &htup->t_infomask, &htup->t_infomask2);
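 /*
 * A super-deletion (aborted speculative insertion) is replayed by
 * invalidating xmin rather than setting xmax, as in
 * heap_abort_speculative().
 */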
8078 if (!(xlrec->flags & XLH_DELETE_IS_SUPER))
8079 HeapTupleHeaderSetXmax(htup, xlrec->xmax);
8080 else
8081 HeapTupleHeaderSetXmin(htup, InvalidTransactionId);
8082 HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
8083
8084 /* Mark the page as a candidate for pruning */
8085 PageSetPrunable(page, XLogRecGetXid(record));
8086
8087 if (xlrec->flags & XLH_DELETE_ALL_VISIBLE_CLEARED)
8088 PageClearAllVisible(page);
8089
8090 /* Make sure t_ctid is set correctly */
8091 if (xlrec->flags & XLH_DELETE_IS_PARTITION_MOVE)
8092 HeapTupleHeaderSetMovedPartitions(htup);
8093 else
8094 htup->t_ctid = target_tid;
8095 PageSetLSN(page, lsn);
8096 MarkBufferDirty(buffer);
8097 }
8098 if (BufferIsValid(buffer))
8099 UnlockReleaseBuffer(buffer);
8100}
8101
8102static void
8103heap_xlog_insert(XLogReaderState *record)
8104{
8105 XLogRecPtr lsn = record->EndRecPtr;
8106 xl_heap_insert *xlrec = (xl_heap_insert *) XLogRecGetData(record);
8107 Buffer buffer;
8108 Page page;
8109 union
8110 {
8111 HeapTupleHeaderData hdr;
8112 char data[MaxHeapTupleSize];
8113 } tbuf;
8114 HeapTupleHeader htup;
8115 xl_heap_header xlhdr;
8116 uint32 newlen;
8117 Size freespace = 0;
8118 RelFileNode target_node;
8119 BlockNumber blkno;
8120 ItemPointerData target_tid;
8121 XLogRedoAction action;
8122
8123 XLogRecGetBlockTag(record, 0, &target_node, NULL, &blkno);
8124 ItemPointerSetBlockNumber(&target_tid, blkno);
8125 ItemPointerSetOffsetNumber(&target_tid, xlrec->offnum);
8126
8127 /*
8128 * The visibility map may need to be fixed even if the heap page is
8129 * already up-to-date.
8130 */
8131 if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
8132 {
8133 Relation reln = CreateFakeRelcacheEntry(target_node);
8134 Buffer vmbuffer = InvalidBuffer;
8135
8136 visibilitymap_pin(reln, blkno, &vmbuffer);
8137 visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS);
8138 ReleaseBuffer(vmbuffer);
8139 FreeFakeRelcacheEntry(reln);
8140 }
8141
8142 /*
8143 * If we inserted the first and only tuple on the page, re-initialize the
8144 * page from scratch.
8145 */
8146 if (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE)
8147 {
8148 buffer = XLogInitBufferForRedo(record, 0);
8149 page = BufferGetPage(buffer);
8150 PageInit(page, BufferGetPageSize(buffer), 0);
8151 action = BLK_NEEDS_REDO;
8152 }
8153 else
8154 action = XLogReadBufferForRedo(record, 0, &buffer);
8155 if (action == BLK_NEEDS_REDO)
8156 {
8157 Size datalen;
8158 char *data;
8159
8160 page = BufferGetPage(buffer);
8161
8162 if (PageGetMaxOffsetNumber(page) + 1 < xlrec->offnum)
8163 elog(PANIC, "invalid max offset number");
8164
8165 data = XLogRecGetBlockData(record, 0, &datalen);
8166
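 /* The block data is an xl_heap_header followed by the tuple data */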
8167 newlen = datalen - SizeOfHeapHeader;
8168 Assert(datalen > SizeOfHeapHeader && newlen <= MaxHeapTupleSize);
8169 memcpy((char *) &xlhdr, data, SizeOfHeapHeader);
8170 data += SizeOfHeapHeader;
8171
8172 htup = &tbuf.hdr;
8173 MemSet((char *) htup, 0, SizeofHeapTupleHeader);
8174 /* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */
8175 memcpy((char *) htup + SizeofHeapTupleHeader,
8176 data,
8177 newlen);
8178 newlen += SizeofHeapTupleHeader;
8179 htup->t_infomask2 = xlhdr.t_infomask2;
8180 htup->t_infomask = xlhdr.t_infomask;
8181 htup->t_hoff = xlhdr.t_hoff;
8182 HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record));
8183 HeapTupleHeaderSetCmin(htup, FirstCommandId);
8184 htup->t_ctid = target_tid;
8185
8186 if (PageAddItem(page, (Item) htup, newlen, xlrec->offnum,
8187 true, true) == InvalidOffsetNumber)
8188 elog(PANIC, "failed to add tuple");
8189
8190 freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */
8191
8192 PageSetLSN(page, lsn);
8193
8194 if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
8195 PageClearAllVisible(page);
8196
8197 MarkBufferDirty(buffer);
8198 }
8199 if (BufferIsValid(buffer))
8200 UnlockReleaseBuffer(buffer);
8201
8202 /*
8203 * If the page is running low on free space, update the FSM as well.
8204 * Arbitrarily, our definition of "low" is less than 20%. We can't do much
8205 * better than that without knowing the fill-factor for the table.
8206 *
8207 * XXX: Don't do this if the page was restored from full page image. We
8208 * don't bother to update the FSM in that case; it doesn't need to be
8209 * totally accurate anyway.
8210 */
8211 if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5)
8212 XLogRecordPageWithFreeSpace(target_node, blkno, freespace);
8213}
8214
8215/*
8216 * Handles MULTI_INSERT record type.
8217 */
8218static void
8219heap_xlog_multi_insert(XLogReaderState *record)
8220{
8221 XLogRecPtr lsn = record->EndRecPtr;
8222 xl_heap_multi_insert *xlrec;
8223 RelFileNode rnode;
8224 BlockNumber blkno;
8225 Buffer buffer;
8226 Page page;
8227 union
8228 {
8229 HeapTupleHeaderData hdr;
8230 char data[MaxHeapTupleSize];
8231 } tbuf;
8232 HeapTupleHeader htup;
8233 uint32 newlen;
8234 Size freespace = 0;
8235 int i;
8236 bool isinit = (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE) != 0;
8237 XLogRedoAction action;
8238
8239 /*
8240 * Insertion doesn't overwrite MVCC data, so no conflict processing is
8241 * required.
8242 */
8243 xlrec = (xl_heap_multi_insert *) XLogRecGetData(record);
8244
8245 XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno);
8246
8247 /*
8248 * The visibility map may need to be fixed even if the heap page is
8249 * already up-to-date.
8250 */
8251 if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
8252 {
8253 Relation reln = CreateFakeRelcacheEntry(rnode);
8254 Buffer vmbuffer = InvalidBuffer;
8255
8256 visibilitymap_pin(reln, blkno, &vmbuffer);
8257 visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS);
8258 ReleaseBuffer(vmbuffer);
8259 FreeFakeRelcacheEntry(reln);
8260 }
8261
8262 if (isinit)
8263 {
8264 buffer = XLogInitBufferForRedo(record, 0);
8265 page = BufferGetPage(buffer);
8266 PageInit(page, BufferGetPageSize(buffer), 0);
8267 action = BLK_NEEDS_REDO;
8268 }
8269 else
8270 action = XLogReadBufferForRedo(record, 0, &buffer);
8271 if (action == BLK_NEEDS_REDO)
8272 {
8273 char *tupdata;
8274 char *endptr;
8275 Size len;
8276
8277 /* Tuples are stored as block data */
8278 tupdata = XLogRecGetBlockData(record, 0, &len);
8279 endptr = tupdata + len;
8280
8281 page = (Page) BufferGetPage(buffer);
8282
8283 for (i = 0; i < xlrec->ntuples; i++)
8284 {
8285 OffsetNumber offnum;
8286 xl_multi_insert_tuple *xlhdr;
8287
8288 /*
8289 * If we're reinitializing the page, the tuples are stored in
8290 * order from FirstOffsetNumber. Otherwise there's an array of
8291 * offsets in the WAL record, and the tuples come after that.
8292 */
8293 if (isinit)
8294 offnum = FirstOffsetNumber + i;
8295 else
8296 offnum = xlrec->offsets[i];
8297 if (PageGetMaxOffsetNumber(page) + 1 < offnum)
8298 elog(PANIC, "invalid max offset number");
8299
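 /*
 * Each tuple is stored as a short-aligned xl_multi_insert_tuple
 * header immediately followed by that tuple's data.
 */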
8300 xlhdr = (xl_multi_insert_tuple *) SHORTALIGN(tupdata);
8301 tupdata = ((char *) xlhdr) + SizeOfMultiInsertTuple;
8302
8303 newlen = xlhdr->datalen;
8304 Assert(newlen <= MaxHeapTupleSize);
8305 htup = &tbuf.hdr;
8306 MemSet((char *) htup, 0, SizeofHeapTupleHeader);
8307 /* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */
8308 memcpy((char *) htup + SizeofHeapTupleHeader,
8309 (char *) tupdata,
8310 newlen);
8311 tupdata += newlen;
8312
8313 newlen += SizeofHeapTupleHeader;
8314 htup->t_infomask2 = xlhdr->t_infomask2;
8315 htup->t_infomask = xlhdr->t_infomask;
8316 htup->t_hoff = xlhdr->t_hoff;
8317 HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record));
8318 HeapTupleHeaderSetCmin(htup, FirstCommandId);
8319 ItemPointerSetBlockNumber(&htup->t_ctid, blkno);
8320 ItemPointerSetOffsetNumber(&htup->t_ctid, offnum);
8321
8322 offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true);
8323 if (offnum == InvalidOffsetNumber)
8324 elog(PANIC, "failed to add tuple");
8325 }
8326 if (tupdata != endptr)
8327 elog(PANIC, "total tuple length mismatch");
8328
8329 freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */
8330
8331 PageSetLSN(page, lsn);
8332
8333 if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
8334 PageClearAllVisible(page);
8335
8336 MarkBufferDirty(buffer);
8337 }
8338 if (BufferIsValid(buffer))
8339 UnlockReleaseBuffer(buffer);
8340
8341 /*
8342 * If the page is running low on free space, update the FSM as well.
8343 * Arbitrarily, our definition of "low" is less than 20%. We can't do much
8344 * better than that without knowing the fill-factor for the table.
8345 *
8346 * XXX: Don't do this if the page was restored from full page image. We
8347 * don't bother to update the FSM in that case; it doesn't need to be
8348 * totally accurate anyway.
8349 */
8350 if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5)
8351 XLogRecordPageWithFreeSpace(rnode, blkno, freespace);
8352}
8353
8354/*
8355 * Handles UPDATE and HOT_UPDATE
8356 */
8357static void
8358heap_xlog_update(XLogReaderState *record, bool hot_update)
8359{
8360 XLogRecPtr lsn = record->EndRecPtr;
8361 xl_heap_update *xlrec = (xl_heap_update *) XLogRecGetData(record);
8362 RelFileNode rnode;
8363 BlockNumber oldblk;
8364 BlockNumber newblk;
8365 ItemPointerData newtid;
8366 Buffer obuffer,
8367 nbuffer;
8368 Page page;
8369 OffsetNumber offnum;
8370 ItemId lp = NULL;
8371 HeapTupleData oldtup;
8372 HeapTupleHeader htup;
8373 uint16 prefixlen = 0,
8374 suffixlen = 0;
8375 char *newp;
8376 union
8377 {
8378 HeapTupleHeaderData hdr;
8379 char data[MaxHeapTupleSize];
8380 } tbuf;
8381 xl_heap_header xlhdr;
8382 uint32 newlen;
8383 Size freespace = 0;
8384 XLogRedoAction oldaction;
8385 XLogRedoAction newaction;
8386
8387 /* initialize to keep the compiler quiet */
8388 oldtup.t_data = NULL;
8389 oldtup.t_len = 0;
8390
8391 XLogRecGetBlockTag(record, 0, &rnode, NULL, &newblk);
8392 if (XLogRecGetBlockTag(record, 1, NULL, NULL, &oldblk))
8393 {
8394 /* HOT updates are never done across pages */
8395 Assert(!hot_update);
8396 }
8397 else
8398 oldblk = newblk;
8399
8400 ItemPointerSet(&newtid, newblk, xlrec->new_offnum);
8401
8402 /*
8403 * The visibility map may need to be fixed even if the heap page is
8404 * already up-to-date.
8405 */
8406 if (xlrec->flags & XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED)
8407 {
8408 Relation reln = CreateFakeRelcacheEntry(rnode);
8409 Buffer vmbuffer = InvalidBuffer;
8410
8411 visibilitymap_pin(reln, oldblk, &vmbuffer);
8412 visibilitymap_clear(reln, oldblk, vmbuffer, VISIBILITYMAP_VALID_BITS);
8413 ReleaseBuffer(vmbuffer);
8414 FreeFakeRelcacheEntry(reln);
8415 }
8416
8417 /*
8418 * In normal operation, it is important to lock the two pages in
8419 * page-number order, to avoid possible deadlocks against other update
8420 * operations going the other way. However, during WAL replay there can
8421 * be no other update happening, so we don't need to worry about that. But
8422 * we *do* need to worry that we don't expose an inconsistent state to Hot
8423 * Standby queries --- so the original page can't be unlocked before we've
8424 * added the new tuple to the new page.
8425 */
8426
8427 /* Deal with old tuple version */
8428 oldaction = XLogReadBufferForRedo(record, (oldblk == newblk) ? 0 : 1,
8429 &obuffer);
8430 if (oldaction == BLK_NEEDS_REDO)
8431 {
8432 page = BufferGetPage(obuffer);
8433 offnum = xlrec->old_offnum;
8434 if (PageGetMaxOffsetNumber(page) >= offnum)
8435 lp = PageGetItemId(page, offnum);
8436
8437 if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
8438 elog(PANIC, "invalid lp");
8439
8440 htup = (HeapTupleHeader) PageGetItem(page, lp);
8441
8442 oldtup.t_data = htup;
8443 oldtup.t_len = ItemIdGetLength(lp);
8444
8445 htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
8446 htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
8447 if (hot_update)
8448 HeapTupleHeaderSetHotUpdated(htup);
8449 else
8450 HeapTupleHeaderClearHotUpdated(htup);
8451 fix_infomask_from_infobits(xlrec->old_infobits_set, &htup->t_infomask,
8452 &htup->t_infomask2);
8453 HeapTupleHeaderSetXmax(htup, xlrec->old_xmax);
8454 HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
8455 /* Set forward chain link in t_ctid */
8456 htup->t_ctid = newtid;
8457
8458 /* Mark the page as a candidate for pruning */
8459 PageSetPrunable(page, XLogRecGetXid(record));
8460
8461 if (xlrec->flags & XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED)
8462 PageClearAllVisible(page);
8463
8464 PageSetLSN(page, lsn);
8465 MarkBufferDirty(obuffer);
8466 }
8467
8468 /*
8469 * Read the page the new tuple goes into, if different from old.
8470 */
8471 if (oldblk == newblk)
8472 {
8473 nbuffer = obuffer;
8474 newaction = oldaction;
8475 }
8476 else if (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE)
8477 {
8478 nbuffer = XLogInitBufferForRedo(record, 0);
8479 page = (Page) BufferGetPage(nbuffer);
8480 PageInit(page, BufferGetPageSize(nbuffer), 0);
8481 newaction = BLK_NEEDS_REDO;
8482 }
8483 else
8484 newaction = XLogReadBufferForRedo(record, 0, &nbuffer);
8485
8486 /*
8487 * The visibility map may need to be fixed even if the heap page is
8488 * already up-to-date.
8489 */
8490 if (xlrec->flags & XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED)
8491 {
8492 Relation reln = CreateFakeRelcacheEntry(rnode);
8493 Buffer vmbuffer = InvalidBuffer;
8494
8495 visibilitymap_pin(reln, newblk, &vmbuffer);
8496 visibilitymap_clear(reln, newblk, vmbuffer, VISIBILITYMAP_VALID_BITS);
8497 ReleaseBuffer(vmbuffer);
8498 FreeFakeRelcacheEntry(reln);
8499 }
8500
8501 /* Deal with new tuple */
8502 if (newaction == BLK_NEEDS_REDO)
8503 {
8504 char *recdata;
8505 char *recdata_end;
8506 Size datalen;
8507 Size tuplen;
8508
8509 recdata = XLogRecGetBlockData(record, 0, &datalen);
8510 recdata_end = recdata + datalen;
8511
8512 page = BufferGetPage(nbuffer);
8513
8514 offnum = xlrec->new_offnum;
8515 if (PageGetMaxOffsetNumber(page) + 1 < offnum)
8516 elog(PANIC, "invalid max offset number");
8517
8518 if (xlrec->flags & XLH_UPDATE_PREFIX_FROM_OLD)
8519 {
8520 Assert(newblk == oldblk);
8521 memcpy(&prefixlen, recdata, sizeof(uint16));
8522 recdata += sizeof(uint16);
8523 }
8524 if (xlrec->flags & XLH_UPDATE_SUFFIX_FROM_OLD)
8525 {
8526 Assert(newblk == oldblk);
8527 memcpy(&suffixlen, recdata, sizeof(uint16));
8528 recdata += sizeof(uint16);
8529 }
8530
8531 memcpy((char *) &xlhdr, recdata, SizeOfHeapHeader);
8532 recdata += SizeOfHeapHeader;
8533
8534 tuplen = recdata_end - recdata;
8535 Assert(tuplen <= MaxHeapTupleSize);
8536
8537 htup = &tbuf.hdr;
8538 MemSet((char *) htup, 0, SizeofHeapTupleHeader);
8539
8540 /*
8541 * Reconstruct the new tuple using the prefix and/or suffix from the
8542 * old tuple, and the data stored in the WAL record.
         */
        newp = (char *) htup + SizeofHeapTupleHeader;
        if (prefixlen > 0)
        {
            int         len;

            /* copy bitmap [+ padding] [+ oid] from WAL record */
            len = xlhdr.t_hoff - SizeofHeapTupleHeader;
            memcpy(newp, recdata, len);
            recdata += len;
            newp += len;

            /* copy prefix from old tuple */
            memcpy(newp, (char *) oldtup.t_data + oldtup.t_data->t_hoff, prefixlen);
            newp += prefixlen;

            /* copy new tuple data from WAL record */
            len = tuplen - (xlhdr.t_hoff - SizeofHeapTupleHeader);
            memcpy(newp, recdata, len);
            recdata += len;
            newp += len;
        }
        else
        {
            /*
             * copy bitmap [+ padding] [+ oid] + data from record, all in one
             * go
             */
            memcpy(newp, recdata, tuplen);
            recdata += tuplen;
            newp += tuplen;
        }
        Assert(recdata == recdata_end);

        /* copy suffix from old tuple */
        if (suffixlen > 0)
            memcpy(newp, (char *) oldtup.t_data + oldtup.t_len - suffixlen, suffixlen);

        newlen = SizeofHeapTupleHeader + tuplen + prefixlen + suffixlen;
        htup->t_infomask2 = xlhdr.t_infomask2;
        htup->t_infomask = xlhdr.t_infomask;
        htup->t_hoff = xlhdr.t_hoff;

        HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record));
        HeapTupleHeaderSetCmin(htup, FirstCommandId);
        HeapTupleHeaderSetXmax(htup, xlrec->new_xmax);
        /* Make sure there is no forward chain link in t_ctid */
        htup->t_ctid = newtid;

        offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true);
        if (offnum == InvalidOffsetNumber)
            elog(PANIC, "failed to add tuple");

        if (xlrec->flags & XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED)
            PageClearAllVisible(page);

        freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */

        PageSetLSN(page, lsn);
        MarkBufferDirty(nbuffer);
    }

    if (BufferIsValid(nbuffer) && nbuffer != obuffer)
        UnlockReleaseBuffer(nbuffer);
    if (BufferIsValid(obuffer))
        UnlockReleaseBuffer(obuffer);

    /*
     * If the new page is running low on free space, update the FSM as well.
     * Arbitrarily, our definition of "low" is less than 20%. We can't do much
     * better than that without knowing the fill-factor for the table.
     *
     * However, don't update the FSM on HOT updates, because after crash
     * recovery, either the old or the new tuple will certainly be dead and
     * prunable. After pruning, the page will have roughly as much free space
     * as it did before the update, assuming the new tuple is about the same
     * size as the old one.
     *
     * XXX: Don't do this if the page was restored from a full-page image. We
     * don't bother to update the FSM in that case; it doesn't need to be
     * totally accurate anyway.
     */
    if (newaction == BLK_NEEDS_REDO && !hot_update && freespace < BLCKSZ / 5)
        XLogRecordPageWithFreeSpace(rnode, newblk, freespace);
}

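/*
 * Replay XLOG_HEAP_CONFIRM: a speculative insertion was confirmed, so the
 * speculative token stored in the tuple's t_ctid is replaced with the
 * tuple's real TID.
 */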
static void
heap_xlog_confirm(XLogReaderState *record)
{
    XLogRecPtr  lsn = record->EndRecPtr;
    xl_heap_confirm *xlrec = (xl_heap_confirm *) XLogRecGetData(record);
    Buffer      buffer;
    Page        page;
    OffsetNumber offnum;
    ItemId      lp = NULL;
    HeapTupleHeader htup;

    if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
    {
        page = BufferGetPage(buffer);

        offnum = xlrec->offnum;
        if (PageGetMaxOffsetNumber(page) >= offnum)
            lp = PageGetItemId(page, offnum);

        if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
            elog(PANIC, "invalid lp");

        htup = (HeapTupleHeader) PageGetItem(page, lp);

        /*
         * Confirm tuple as actually inserted
         */
        ItemPointerSet(&htup->t_ctid, BufferGetBlockNumber(buffer), offnum);

        PageSetLSN(page, lsn);
        MarkBufferDirty(buffer);
    }
    if (BufferIsValid(buffer))
        UnlockReleaseBuffer(buffer);
}

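/*
 * Replay XLOG_HEAP_LOCK: a tuple was locked (e.g. by SELECT ... FOR
 * UPDATE/SHARE).  Reinstall the xmax and infomask bits that record the lock.
 */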
static void
heap_xlog_lock(XLogReaderState *record)
{
    XLogRecPtr  lsn = record->EndRecPtr;
    xl_heap_lock *xlrec = (xl_heap_lock *) XLogRecGetData(record);
    Buffer      buffer;
    Page        page;
    OffsetNumber offnum;
    ItemId      lp = NULL;
    HeapTupleHeader htup;

    /*
     * The visibility map may need to be fixed even if the heap page is
     * already up-to-date.
     */
    if (xlrec->flags & XLH_LOCK_ALL_FROZEN_CLEARED)
    {
        RelFileNode rnode;
        Buffer      vmbuffer = InvalidBuffer;
        BlockNumber block;
        Relation    reln;

        XLogRecGetBlockTag(record, 0, &rnode, NULL, &block);
        reln = CreateFakeRelcacheEntry(rnode);

        visibilitymap_pin(reln, block, &vmbuffer);
        visibilitymap_clear(reln, block, vmbuffer, VISIBILITYMAP_ALL_FROZEN);

        ReleaseBuffer(vmbuffer);
        FreeFakeRelcacheEntry(reln);
    }

    if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
    {
        page = (Page) BufferGetPage(buffer);

        offnum = xlrec->offnum;
        if (PageGetMaxOffsetNumber(page) >= offnum)
            lp = PageGetItemId(page, offnum);

        if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
            elog(PANIC, "invalid lp");

        htup = (HeapTupleHeader) PageGetItem(page, lp);

        htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
        htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
        fix_infomask_from_infobits(xlrec->infobits_set, &htup->t_infomask,
                                   &htup->t_infomask2);

        /*
         * Clear relevant update flags, but only if the modified infomask says
         * there's no update.
         */
        if (HEAP_XMAX_IS_LOCKED_ONLY(htup->t_infomask))
        {
            HeapTupleHeaderClearHotUpdated(htup);
            /* Make sure there is no forward chain link in t_ctid */
            ItemPointerSet(&htup->t_ctid,
                           BufferGetBlockNumber(buffer),
                           offnum);
        }
        HeapTupleHeaderSetXmax(htup, xlrec->locking_xid);
        HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
        PageSetLSN(page, lsn);
        MarkBufferDirty(buffer);
    }
    if (BufferIsValid(buffer))
        UnlockReleaseBuffer(buffer);
}

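/*
 * Replay XLOG_HEAP2_LOCK_UPDATED: a later version of an updated tuple was
 * locked while following its update chain, so install the new xmax and
 * infomask bits on that version as well.
 */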
static void
heap_xlog_lock_updated(XLogReaderState *record)
{
    XLogRecPtr  lsn = record->EndRecPtr;
    xl_heap_lock_updated *xlrec;
    Buffer      buffer;
    Page        page;
    OffsetNumber offnum;
    ItemId      lp = NULL;
    HeapTupleHeader htup;

    xlrec = (xl_heap_lock_updated *) XLogRecGetData(record);

    /*
     * The visibility map may need to be fixed even if the heap page is
     * already up-to-date.
     */
    if (xlrec->flags & XLH_LOCK_ALL_FROZEN_CLEARED)
    {
        RelFileNode rnode;
        Buffer      vmbuffer = InvalidBuffer;
        BlockNumber block;
        Relation    reln;

        XLogRecGetBlockTag(record, 0, &rnode, NULL, &block);
        reln = CreateFakeRelcacheEntry(rnode);

        visibilitymap_pin(reln, block, &vmbuffer);
        visibilitymap_clear(reln, block, vmbuffer, VISIBILITYMAP_ALL_FROZEN);

        ReleaseBuffer(vmbuffer);
        FreeFakeRelcacheEntry(reln);
    }

    if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
    {
        page = BufferGetPage(buffer);

        offnum = xlrec->offnum;
        if (PageGetMaxOffsetNumber(page) >= offnum)
            lp = PageGetItemId(page, offnum);

        if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
            elog(PANIC, "invalid lp");

        htup = (HeapTupleHeader) PageGetItem(page, lp);

        htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
        htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
        fix_infomask_from_infobits(xlrec->infobits_set, &htup->t_infomask,
                                   &htup->t_infomask2);
        HeapTupleHeaderSetXmax(htup, xlrec->xmax);

        PageSetLSN(page, lsn);
        MarkBufferDirty(buffer);
    }
    if (BufferIsValid(buffer))
        UnlockReleaseBuffer(buffer);
}

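/*
 * Replay XLOG_HEAP_INPLACE: a tuple was overwritten in place (used for
 * system catalog updates that must not change the tuple's length or TID).
 */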
static void
heap_xlog_inplace(XLogReaderState *record)
{
    XLogRecPtr  lsn = record->EndRecPtr;
    xl_heap_inplace *xlrec = (xl_heap_inplace *) XLogRecGetData(record);
    Buffer      buffer;
    Page        page;
    OffsetNumber offnum;
    ItemId      lp = NULL;
    HeapTupleHeader htup;
    uint32      oldlen;
    Size        newlen;

    if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
    {
        char       *newtup = XLogRecGetBlockData(record, 0, &newlen);

        page = BufferGetPage(buffer);

        offnum = xlrec->offnum;
        if (PageGetMaxOffsetNumber(page) >= offnum)
            lp = PageGetItemId(page, offnum);

        if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
            elog(PANIC, "invalid lp");

        htup = (HeapTupleHeader) PageGetItem(page, lp);

        oldlen = ItemIdGetLength(lp) - htup->t_hoff;
        if (oldlen != newlen)
            elog(PANIC, "wrong tuple length");

        memcpy((char *) htup + htup->t_hoff, newtup, newlen);

        PageSetLSN(page, lsn);
        MarkBufferDirty(buffer);
    }
    if (BufferIsValid(buffer))
        UnlockReleaseBuffer(buffer);
}

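/*
 * Redo handler for the "heap" resource manager: dispatch to the routine
 * matching the record's opcode.
 */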
void
heap_redo(XLogReaderState *record)
{
    uint8       info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;

    /*
     * These operations don't overwrite MVCC data, so no conflict processing
     * is required. The ones in the heap2 rmgr do.
     */

    switch (info & XLOG_HEAP_OPMASK)
    {
        case XLOG_HEAP_INSERT:
            heap_xlog_insert(record);
            break;
        case XLOG_HEAP_DELETE:
            heap_xlog_delete(record);
            break;
        case XLOG_HEAP_UPDATE:
            heap_xlog_update(record, false);
            break;
        case XLOG_HEAP_TRUNCATE:

            /*
             * TRUNCATE is a no-op because the actions are already logged as
             * SMGR WAL records. The TRUNCATE WAL record exists only for
             * logical decoding.
             */
            break;
        case XLOG_HEAP_HOT_UPDATE:
            heap_xlog_update(record, true);
            break;
        case XLOG_HEAP_CONFIRM:
            heap_xlog_confirm(record);
            break;
        case XLOG_HEAP_LOCK:
            heap_xlog_lock(record);
            break;
        case XLOG_HEAP_INPLACE:
            heap_xlog_inplace(record);
            break;
        default:
            elog(PANIC, "heap_redo: unknown op code %u", info);
    }
}

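/*
 * Redo handler for the "heap2" resource manager, which carries the heap
 * record types that can require standby conflict processing.
 */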
void
heap2_redo(XLogReaderState *record)
{
    uint8       info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;

    switch (info & XLOG_HEAP_OPMASK)
    {
        case XLOG_HEAP2_CLEAN:
            heap_xlog_clean(record);
            break;
        case XLOG_HEAP2_FREEZE_PAGE:
            heap_xlog_freeze_page(record);
            break;
        case XLOG_HEAP2_CLEANUP_INFO:
            heap_xlog_cleanup_info(record);
            break;
        case XLOG_HEAP2_VISIBLE:
            heap_xlog_visible(record);
            break;
        case XLOG_HEAP2_MULTI_INSERT:
            heap_xlog_multi_insert(record);
            break;
        case XLOG_HEAP2_LOCK_UPDATED:
            heap_xlog_lock_updated(record);
            break;
        case XLOG_HEAP2_NEW_CID:

            /*
             * Nothing to do on a real replay, only used during logical
             * decoding.
             */
            break;
        case XLOG_HEAP2_REWRITE:
            heap_xlog_logical_rewrite(record);
            break;
        default:
            elog(PANIC, "heap2_redo: unknown op code %u", info);
    }
}

/*
 * heap_sync - sync a heap, for use when no WAL has been written
 *
 * This forces the heap contents (including TOAST heap if any) down to disk.
 * If we skipped using WAL, and WAL is otherwise needed, we must force the
 * relation down to disk before it's safe to commit the transaction. This
 * requires writing out any dirty buffers and then doing a forced fsync.
 *
 * Indexes are not touched. (Currently, index operations associated with
 * the commands that use this are WAL-logged and so do not need fsync.
 * That behavior might change someday, but in any case it's likely that
 * any fsync decisions required would be per-index and hence not appropriate
 * to be done here.)
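 *
 * As an illustrative sketch only (not taken from any particular caller), a
 * bulk load that chose to skip WAL would end up doing roughly:
 *
 *		heap_insert(rel, tup, cid, HEAP_INSERT_SKIP_WAL, bistate);
 *		...
 *		heap_sync(rel);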
 */
void
heap_sync(Relation rel)
{
    /* non-WAL-logged tables never need fsync */
    if (!RelationNeedsWAL(rel))
        return;

    /* main heap */
    FlushRelationBuffers(rel);
    /* FlushRelationBuffers will have opened rd_smgr */
    smgrimmedsync(rel->rd_smgr, MAIN_FORKNUM);

    /* FSM is not critical, don't bother syncing it */

    /* toast heap, if any */
    if (OidIsValid(rel->rd_rel->reltoastrelid))
    {
        Relation    toastrel;

        toastrel = table_open(rel->rd_rel->reltoastrelid, AccessShareLock);
        FlushRelationBuffers(toastrel);
        smgrimmedsync(toastrel->rd_smgr, MAIN_FORKNUM);
        table_close(toastrel, AccessShareLock);
    }
}

/*
 * Mask a heap page before performing consistency checks on it.
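 *
 * This is the rm_mask callback for the heap resource managers: when
 * wal_consistency_checking is enabled, fields that may legitimately differ
 * between primary and standby (hint bits, unused space, command ids, and so
 * on) are overwritten with a fixed pattern before the page images are
 * compared.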
 */
void
heap_mask(char *pagedata, BlockNumber blkno)
{
    Page        page = (Page) pagedata;
    OffsetNumber off;

    mask_page_lsn_and_checksum(page);

    mask_page_hint_bits(page);
    mask_unused_space(page);

    for (off = 1; off <= PageGetMaxOffsetNumber(page); off++)
    {
        ItemId      iid = PageGetItemId(page, off);
        char       *page_item;

        page_item = (char *) (page + ItemIdGetOffset(iid));

        if (ItemIdIsNormal(iid))
        {
            HeapTupleHeader page_htup = (HeapTupleHeader) page_item;

            /*
             * If xmin of a tuple is not yet frozen, we should ignore
             * differences in hint bits, since they can be set without
             * emitting WAL.
             */
            if (!HeapTupleHeaderXminFrozen(page_htup))
                page_htup->t_infomask &= ~HEAP_XACT_MASK;
            else
            {
                /* We still need to mask the xmax hint bits. */
                page_htup->t_infomask &= ~HEAP_XMAX_INVALID;
                page_htup->t_infomask &= ~HEAP_XMAX_COMMITTED;
            }

            /*
             * During replay, we set Command Id to FirstCommandId. Hence, mask
             * it. See heap_xlog_insert() for details.
             */
            page_htup->t_choice.t_heap.t_field3.t_cid = MASK_MARKER;

            /*
             * For a speculative tuple, heap_insert() does not set ctid in the
             * caller-passed heap tuple itself, leaving the ctid field to
             * contain a speculative token value - a per-backend monotonically
             * increasing identifier. Besides, it does not WAL-log ctid under
             * any circumstances.
             *
             * During redo, heap_xlog_insert() sets t_ctid to the current
             * block number and self offset number. It doesn't care about any
             * speculative insertions on the master. Hence, we set t_ctid to
             * the current block number and self offset number to ignore any
             * inconsistency.
             */
            if (HeapTupleHeaderIsSpeculative(page_htup))
                ItemPointerSet(&page_htup->t_ctid, blkno, off);

            /*
             * NB: Not ignoring ctid changes due to the tuple having moved
             * (i.e. HeapTupleHeaderIndicatesMovedPartitions), because that's
             * important information that needs to be in-sync between primary
             * and standby, and thus is WAL logged.
             */
        }

        /*
         * Ignore any padding bytes after the tuple, when the length of the
         * item is not MAXALIGNed.
         */
        if (ItemIdHasStorage(iid))
        {
            int         len = ItemIdGetLength(iid);
            int         padlen = MAXALIGN(len) - len;

            if (padlen > 0)
                memset(page_item + len, MASK_MARKER, padlen);
        }
    }
}
