1 | /*------------------------------------------------------------------------- |
2 | * |
3 | * heapam_handler.c |
4 | * heap table access method code |
5 | * |
6 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
7 | * Portions Copyright (c) 1994, Regents of the University of California |
8 | * |
9 | * |
10 | * IDENTIFICATION |
11 | * src/backend/access/heap/heapam_handler.c |
12 | * |
13 | * |
14 | * NOTES |
15 | * This files wires up the lower level heapam.c et al routines with the |
16 | * tableam abstraction. |
17 | * |
18 | *------------------------------------------------------------------------- |
19 | */ |
20 | #include "postgres.h" |
21 | |
22 | #include <math.h> |
23 | |
24 | #include "miscadmin.h" |
25 | |
26 | #include "access/genam.h" |
27 | #include "access/heapam.h" |
28 | #include "access/multixact.h" |
29 | #include "access/rewriteheap.h" |
30 | #include "access/tableam.h" |
31 | #include "access/tsmapi.h" |
32 | #include "access/tuptoaster.h" |
33 | #include "access/xact.h" |
34 | #include "catalog/catalog.h" |
35 | #include "catalog/index.h" |
36 | #include "catalog/storage.h" |
37 | #include "catalog/storage_xlog.h" |
38 | #include "commands/progress.h" |
39 | #include "executor/executor.h" |
40 | #include "optimizer/plancat.h" |
41 | #include "pgstat.h" |
42 | #include "storage/bufmgr.h" |
43 | #include "storage/bufpage.h" |
45 | #include "storage/lmgr.h" |
46 | #include "storage/predicate.h" |
47 | #include "storage/procarray.h" |
48 | #include "storage/smgr.h" |
49 | #include "utils/builtins.h" |
50 | #include "utils/rel.h" |
51 | |
52 | |
53 | static void reform_and_rewrite_tuple(HeapTuple tuple, |
54 | Relation OldHeap, Relation NewHeap, |
55 | Datum *values, bool *isnull, RewriteState rwstate); |
56 | |
57 | static bool SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer, |
58 | HeapTuple tuple, |
59 | OffsetNumber tupoffset); |
60 | |
61 | static BlockNumber heapam_scan_get_blocks_done(HeapScanDesc hscan); |
62 | |
63 | static const TableAmRoutine heapam_methods; |
64 | |
65 | |
66 | /* ------------------------------------------------------------------------ |
67 | * Slot related callbacks for heap AM |
68 | * ------------------------------------------------------------------------ |
69 | */ |
70 | |
71 | static const TupleTableSlotOps * |
72 | heapam_slot_callbacks(Relation relation) |
73 | { |
74 | return &TTSOpsBufferHeapTuple; |
75 | } |
76 | |
77 | |
78 | /* ------------------------------------------------------------------------ |
79 | * Index Scan Callbacks for heap AM |
80 | * ------------------------------------------------------------------------ |
81 | */ |
82 | |
83 | static IndexFetchTableData * |
84 | heapam_index_fetch_begin(Relation rel) |
85 | { |
86 | IndexFetchHeapData *hscan = palloc0(sizeof(IndexFetchHeapData)); |
87 | |
88 | hscan->xs_base.rel = rel; |
89 | hscan->xs_cbuf = InvalidBuffer; |
90 | |
91 | return &hscan->xs_base; |
92 | } |
93 | |
94 | static void |
95 | heapam_index_fetch_reset(IndexFetchTableData *scan) |
96 | { |
97 | IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan; |
98 | |
99 | if (BufferIsValid(hscan->xs_cbuf)) |
100 | { |
101 | ReleaseBuffer(hscan->xs_cbuf); |
102 | hscan->xs_cbuf = InvalidBuffer; |
103 | } |
104 | } |
105 | |
106 | static void |
107 | heapam_index_fetch_end(IndexFetchTableData *scan) |
108 | { |
109 | IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan; |
110 | |
111 | heapam_index_fetch_reset(scan); |
112 | |
113 | pfree(hscan); |
114 | } |
115 | |
116 | static bool |
117 | heapam_index_fetch_tuple(struct IndexFetchTableData *scan, |
118 | ItemPointer tid, |
119 | Snapshot snapshot, |
120 | TupleTableSlot *slot, |
121 | bool *call_again, bool *all_dead) |
122 | { |
123 | IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan; |
124 | BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; |
125 | bool got_heap_tuple; |
126 | |
127 | Assert(TTS_IS_BUFFERTUPLE(slot)); |
128 | |
129 | /* We can skip the buffer-switching logic if we're in mid-HOT chain. */ |
130 | if (!*call_again) |
131 | { |
132 | /* Switch to correct buffer if we don't have it already */ |
133 | Buffer prev_buf = hscan->xs_cbuf; |
134 | |
135 | hscan->xs_cbuf = ReleaseAndReadBuffer(hscan->xs_cbuf, |
136 | hscan->xs_base.rel, |
137 | ItemPointerGetBlockNumber(tid)); |
138 | |
139 | /* |
140 | * Prune page, but only if we weren't already on this page |
141 | */ |
142 | if (prev_buf != hscan->xs_cbuf) |
143 | heap_page_prune_opt(hscan->xs_base.rel, hscan->xs_cbuf); |
144 | } |
145 | |
146 | /* Obtain share-lock on the buffer so we can examine visibility */ |
147 | LockBuffer(hscan->xs_cbuf, BUFFER_LOCK_SHARE); |
148 | got_heap_tuple = heap_hot_search_buffer(tid, |
149 | hscan->xs_base.rel, |
150 | hscan->xs_cbuf, |
151 | snapshot, |
152 | &bslot->base.tupdata, |
153 | all_dead, |
154 | !*call_again); |
155 | bslot->base.tupdata.t_self = *tid; |
156 | LockBuffer(hscan->xs_cbuf, BUFFER_LOCK_UNLOCK); |
157 | |
158 | if (got_heap_tuple) |
159 | { |
160 | /* |
161 | * Only in a non-MVCC snapshot can more than one member of the HOT |
162 | * chain be visible. |
163 | */ |
164 | *call_again = !IsMVCCSnapshot(snapshot); |
165 | |
166 | slot->tts_tableOid = RelationGetRelid(scan->rel); |
167 | ExecStoreBufferHeapTuple(&bslot->base.tupdata, slot, hscan->xs_cbuf); |
168 | } |
169 | else |
170 | { |
171 | /* We've reached the end of the HOT chain. */ |
172 | *call_again = false; |
173 | } |
174 | |
175 | return got_heap_tuple; |
176 | } |
177 | |
178 | |
179 | /* ------------------------------------------------------------------------ |
180 | * Callbacks for non-modifying operations on individual tuples for heap AM |
181 | * ------------------------------------------------------------------------ |
182 | */ |
183 | |
184 | static bool |
185 | heapam_fetch_row_version(Relation relation, |
186 | ItemPointer tid, |
187 | Snapshot snapshot, |
188 | TupleTableSlot *slot) |
189 | { |
190 | BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; |
191 | Buffer buffer; |
192 | |
193 | Assert(TTS_IS_BUFFERTUPLE(slot)); |
194 | |
195 | bslot->base.tupdata.t_self = *tid; |
196 | if (heap_fetch(relation, snapshot, &bslot->base.tupdata, &buffer)) |
197 | { |
198 | /* store in slot, transferring existing pin */ |
199 | ExecStorePinnedBufferHeapTuple(&bslot->base.tupdata, slot, buffer); |
200 | slot->tts_tableOid = RelationGetRelid(relation); |
201 | |
202 | return true; |
203 | } |
204 | |
205 | return false; |
206 | } |
207 | |
208 | static bool |
209 | heapam_tuple_tid_valid(TableScanDesc scan, ItemPointer tid) |
210 | { |
211 | HeapScanDesc hscan = (HeapScanDesc) scan; |
212 | |
213 | return ItemPointerIsValid(tid) && |
214 | ItemPointerGetBlockNumber(tid) < hscan->rs_nblocks; |
215 | } |
216 | |
217 | static bool |
218 | heapam_tuple_satisfies_snapshot(Relation rel, TupleTableSlot *slot, |
219 | Snapshot snapshot) |
220 | { |
221 | BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; |
222 | bool res; |
223 | |
224 | Assert(TTS_IS_BUFFERTUPLE(slot)); |
225 | Assert(BufferIsValid(bslot->buffer)); |
226 | |
227 | /* |
228 | * We need buffer pin and lock to call HeapTupleSatisfiesVisibility. |
229 | * Caller should be holding pin, but not lock. |
230 | */ |
231 | LockBuffer(bslot->buffer, BUFFER_LOCK_SHARE); |
232 | res = HeapTupleSatisfiesVisibility(bslot->base.tuple, snapshot, |
233 | bslot->buffer); |
234 | LockBuffer(bslot->buffer, BUFFER_LOCK_UNLOCK); |
235 | |
236 | return res; |
237 | } |
238 | |
239 | |
240 | /* ---------------------------------------------------------------------------- |
241 | * Functions for manipulations of physical tuples for heap AM. |
242 | * ---------------------------------------------------------------------------- |
243 | */ |
244 | |
245 | static void |
246 | heapam_tuple_insert(Relation relation, TupleTableSlot *slot, CommandId cid, |
247 | int options, BulkInsertState bistate) |
248 | { |
249 | bool shouldFree = true; |
250 | HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree); |
251 | |
252 | /* Update the tuple with table oid */ |
253 | slot->tts_tableOid = RelationGetRelid(relation); |
254 | tuple->t_tableOid = slot->tts_tableOid; |
255 | |
256 | /* Perform the insertion, and copy the resulting ItemPointer */ |
257 | heap_insert(relation, tuple, cid, options, bistate); |
258 | ItemPointerCopy(&tuple->t_self, &slot->tts_tid); |
259 | |
260 | if (shouldFree) |
261 | pfree(tuple); |
262 | } |
263 | |
264 | static void |
265 | heapam_tuple_insert_speculative(Relation relation, TupleTableSlot *slot, |
266 | CommandId cid, int options, |
267 | BulkInsertState bistate, uint32 specToken) |
268 | { |
269 | bool shouldFree = true; |
270 | HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree); |
271 | |
272 | /* Update the tuple with table oid */ |
273 | slot->tts_tableOid = RelationGetRelid(relation); |
274 | tuple->t_tableOid = slot->tts_tableOid; |
275 | |
276 | HeapTupleHeaderSetSpeculativeToken(tuple->t_data, specToken); |
277 | options |= HEAP_INSERT_SPECULATIVE; |
278 | |
279 | /* Perform the insertion, and copy the resulting ItemPointer */ |
280 | heap_insert(relation, tuple, cid, options, bistate); |
281 | ItemPointerCopy(&tuple->t_self, &slot->tts_tid); |
282 | |
283 | if (shouldFree) |
284 | pfree(tuple); |
285 | } |
286 | |
287 | static void |
288 | heapam_tuple_complete_speculative(Relation relation, TupleTableSlot *slot, |
289 | uint32 specToken, bool succeeded) |
290 | { |
291 | bool shouldFree = true; |
292 | HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree); |
293 | |
294 | /* adjust the tuple's state accordingly */ |
295 | if (succeeded) |
296 | heap_finish_speculative(relation, &slot->tts_tid); |
297 | else |
298 | heap_abort_speculative(relation, &slot->tts_tid); |
299 | |
300 | if (shouldFree) |
301 | pfree(tuple); |
302 | } |
303 | |
304 | static TM_Result |
305 | heapam_tuple_delete(Relation relation, ItemPointer tid, CommandId cid, |
306 | Snapshot snapshot, Snapshot crosscheck, bool wait, |
307 | TM_FailureData *tmfd, bool changingPart) |
308 | { |
309 | /* |
310 | * Currently Deleting of index tuples are handled at vacuum, in case if |
311 | * the storage itself is cleaning the dead tuples by itself, it is the |
312 | * time to call the index tuple deletion also. |
313 | */ |
314 | return heap_delete(relation, tid, cid, crosscheck, wait, tmfd, changingPart); |
315 | } |
316 | |
317 | |
318 | static TM_Result |
319 | heapam_tuple_update(Relation relation, ItemPointer otid, TupleTableSlot *slot, |
320 | CommandId cid, Snapshot snapshot, Snapshot crosscheck, |
321 | bool wait, TM_FailureData *tmfd, |
322 | LockTupleMode *lockmode, bool *update_indexes) |
323 | { |
324 | bool shouldFree = true; |
325 | HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree); |
326 | TM_Result result; |
327 | |
328 | /* Update the tuple with table oid */ |
329 | slot->tts_tableOid = RelationGetRelid(relation); |
330 | tuple->t_tableOid = slot->tts_tableOid; |
331 | |
332 | result = heap_update(relation, otid, tuple, cid, crosscheck, wait, |
333 | tmfd, lockmode); |
334 | ItemPointerCopy(&tuple->t_self, &slot->tts_tid); |
335 | |
336 | /* |
337 | * Decide whether new index entries are needed for the tuple |
338 | * |
339 | * Note: heap_update returns the tid (location) of the new tuple in the |
340 | * t_self field. |
341 | * |
342 | * If it's a HOT update, we mustn't insert new index entries. |
343 | */ |
344 | *update_indexes = result == TM_Ok && !HeapTupleIsHeapOnly(tuple); |
345 | |
346 | if (shouldFree) |
347 | pfree(tuple); |
348 | |
349 | return result; |
350 | } |
351 | |
352 | static TM_Result |
353 | heapam_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot, |
354 | TupleTableSlot *slot, CommandId cid, LockTupleMode mode, |
355 | LockWaitPolicy wait_policy, uint8 flags, |
356 | TM_FailureData *tmfd) |
357 | { |
358 | BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; |
359 | TM_Result result; |
360 | Buffer buffer; |
361 | HeapTuple tuple = &bslot->base.tupdata; |
362 | bool follow_updates; |
363 | |
364 | follow_updates = (flags & TUPLE_LOCK_FLAG_LOCK_UPDATE_IN_PROGRESS) != 0; |
365 | tmfd->traversed = false; |
366 | |
367 | Assert(TTS_IS_BUFFERTUPLE(slot)); |
368 | |
369 | tuple_lock_retry: |
370 | tuple->t_self = *tid; |
371 | result = heap_lock_tuple(relation, tuple, cid, mode, wait_policy, |
372 | follow_updates, &buffer, tmfd); |
373 | |
374 | if (result == TM_Updated && |
375 | (flags & TUPLE_LOCK_FLAG_FIND_LAST_VERSION)) |
376 | { |
377 | ReleaseBuffer(buffer); |
378 | /* Should not encounter speculative tuple on recheck */ |
379 | Assert(!HeapTupleHeaderIsSpeculative(tuple->t_data)); |
380 | |
381 | if (!ItemPointerEquals(&tmfd->ctid, &tuple->t_self)) |
382 | { |
383 | SnapshotData SnapshotDirty; |
384 | TransactionId priorXmax; |
385 | |
386 | /* it was updated, so look at the updated version */ |
387 | *tid = tmfd->ctid; |
388 | /* updated row should have xmin matching this xmax */ |
389 | priorXmax = tmfd->xmax; |
390 | |
391 | /* signal that a tuple later in the chain is getting locked */ |
392 | tmfd->traversed = true; |
393 | |
394 | /* |
395 | * fetch target tuple |
396 | * |
397 | * Loop here to deal with updated or busy tuples |
398 | */ |
399 | InitDirtySnapshot(SnapshotDirty); |
400 | for (;;) |
401 | { |
402 | if (ItemPointerIndicatesMovedPartitions(tid)) |
403 | ereport(ERROR, |
404 | (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), |
							 errmsg("tuple to be locked was already moved to another partition due to concurrent update")));
406 | |
407 | tuple->t_self = *tid; |
408 | if (heap_fetch(relation, &SnapshotDirty, tuple, &buffer)) |
409 | { |
410 | /* |
411 | * If xmin isn't what we're expecting, the slot must have |
412 | * been recycled and reused for an unrelated tuple. This |
413 | * implies that the latest version of the row was deleted, |
414 | * so we need do nothing. (Should be safe to examine xmin |
415 | * without getting buffer's content lock. We assume |
416 | * reading a TransactionId to be atomic, and Xmin never |
417 | * changes in an existing tuple, except to invalid or |
418 | * frozen, and neither of those can match priorXmax.) |
419 | */ |
420 | if (!TransactionIdEquals(HeapTupleHeaderGetXmin(tuple->t_data), |
421 | priorXmax)) |
422 | { |
423 | ReleaseBuffer(buffer); |
424 | return TM_Deleted; |
425 | } |
426 | |
427 | /* otherwise xmin should not be dirty... */ |
428 | if (TransactionIdIsValid(SnapshotDirty.xmin)) |
						elog(ERROR, "t_xmin is uncommitted in tuple to be updated");
430 | |
431 | /* |
432 | * If tuple is being updated by other transaction then we |
433 | * have to wait for its commit/abort, or die trying. |
434 | */ |
435 | if (TransactionIdIsValid(SnapshotDirty.xmax)) |
436 | { |
437 | ReleaseBuffer(buffer); |
438 | switch (wait_policy) |
439 | { |
440 | case LockWaitBlock: |
441 | XactLockTableWait(SnapshotDirty.xmax, |
442 | relation, &tuple->t_self, |
443 | XLTW_FetchUpdated); |
444 | break; |
445 | case LockWaitSkip: |
446 | if (!ConditionalXactLockTableWait(SnapshotDirty.xmax)) |
447 | /* skip instead of waiting */ |
448 | return TM_WouldBlock; |
449 | break; |
450 | case LockWaitError: |
451 | if (!ConditionalXactLockTableWait(SnapshotDirty.xmax)) |
452 | ereport(ERROR, |
453 | (errcode(ERRCODE_LOCK_NOT_AVAILABLE), |
											 errmsg("could not obtain lock on row in relation \"%s\"",
455 | RelationGetRelationName(relation)))); |
456 | break; |
457 | } |
458 | continue; /* loop back to repeat heap_fetch */ |
459 | } |
460 | |
461 | /* |
462 | * If tuple was inserted by our own transaction, we have |
463 | * to check cmin against cid: cmin >= current CID means |
464 | * our command cannot see the tuple, so we should ignore |
465 | * it. Otherwise heap_lock_tuple() will throw an error, |
466 | * and so would any later attempt to update or delete the |
467 | * tuple. (We need not check cmax because |
468 | * HeapTupleSatisfiesDirty will consider a tuple deleted |
469 | * by our transaction dead, regardless of cmax.) We just |
470 | * checked that priorXmax == xmin, so we can test that |
471 | * variable instead of doing HeapTupleHeaderGetXmin again. |
472 | */ |
473 | if (TransactionIdIsCurrentTransactionId(priorXmax) && |
474 | HeapTupleHeaderGetCmin(tuple->t_data) >= cid) |
475 | { |
476 | tmfd->xmax = priorXmax; |
477 | |
478 | /* |
479 | * Cmin is the problematic value, so store that. See |
480 | * above. |
481 | */ |
482 | tmfd->cmax = HeapTupleHeaderGetCmin(tuple->t_data); |
483 | ReleaseBuffer(buffer); |
484 | return TM_SelfModified; |
485 | } |
486 | |
487 | /* |
488 | * This is a live tuple, so try to lock it again. |
489 | */ |
490 | ReleaseBuffer(buffer); |
491 | goto tuple_lock_retry; |
492 | } |
493 | |
494 | /* |
495 | * If the referenced slot was actually empty, the latest |
496 | * version of the row must have been deleted, so we need do |
497 | * nothing. |
498 | */ |
499 | if (tuple->t_data == NULL) |
500 | { |
501 | return TM_Deleted; |
502 | } |
503 | |
504 | /* |
505 | * As above, if xmin isn't what we're expecting, do nothing. |
506 | */ |
507 | if (!TransactionIdEquals(HeapTupleHeaderGetXmin(tuple->t_data), |
508 | priorXmax)) |
509 | { |
510 | if (BufferIsValid(buffer)) |
511 | ReleaseBuffer(buffer); |
512 | return TM_Deleted; |
513 | } |
514 | |
515 | /* |
516 | * If we get here, the tuple was found but failed |
517 | * SnapshotDirty. Assuming the xmin is either a committed xact |
518 | * or our own xact (as it certainly should be if we're trying |
519 | * to modify the tuple), this must mean that the row was |
520 | * updated or deleted by either a committed xact or our own |
521 | * xact. If it was deleted, we can ignore it; if it was |
522 | * updated then chain up to the next version and repeat the |
523 | * whole process. |
524 | * |
525 | * As above, it should be safe to examine xmax and t_ctid |
526 | * without the buffer content lock, because they can't be |
527 | * changing. |
528 | */ |
529 | if (ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid)) |
530 | { |
531 | /* deleted, so forget about it */ |
532 | if (BufferIsValid(buffer)) |
533 | ReleaseBuffer(buffer); |
534 | return TM_Deleted; |
535 | } |
536 | |
537 | /* updated, so look at the updated row */ |
538 | *tid = tuple->t_data->t_ctid; |
539 | /* updated row should have xmin matching this xmax */ |
540 | priorXmax = HeapTupleHeaderGetUpdateXid(tuple->t_data); |
541 | if (BufferIsValid(buffer)) |
542 | ReleaseBuffer(buffer); |
543 | /* loop back to fetch next in chain */ |
544 | } |
545 | } |
546 | else |
547 | { |
548 | /* tuple was deleted, so give up */ |
549 | return TM_Deleted; |
550 | } |
551 | } |
552 | |
553 | slot->tts_tableOid = RelationGetRelid(relation); |
554 | tuple->t_tableOid = slot->tts_tableOid; |
555 | |
556 | /* store in slot, transferring existing pin */ |
557 | ExecStorePinnedBufferHeapTuple(tuple, slot, buffer); |
558 | |
559 | return result; |
560 | } |
561 | |
562 | static void |
563 | heapam_finish_bulk_insert(Relation relation, int options) |
564 | { |
565 | /* |
566 | * If we skipped writing WAL, then we need to sync the heap (but not |
567 | * indexes since those use WAL anyway / don't go through tableam) |
568 | */ |
569 | if (options & HEAP_INSERT_SKIP_WAL) |
570 | heap_sync(relation); |
571 | } |
572 | |
573 | |
574 | /* ------------------------------------------------------------------------ |
575 | * DDL related callbacks for heap AM. |
576 | * ------------------------------------------------------------------------ |
577 | */ |
578 | |
579 | static void |
580 | heapam_relation_set_new_filenode(Relation rel, |
581 | const RelFileNode *newrnode, |
582 | char persistence, |
583 | TransactionId *freezeXid, |
584 | MultiXactId *minmulti) |
585 | { |
586 | SMgrRelation srel; |
587 | |
588 | /* |
589 | * Initialize to the minimum XID that could put tuples in the table. We |
590 | * know that no xacts older than RecentXmin are still running, so that |
591 | * will do. |
592 | */ |
593 | *freezeXid = RecentXmin; |
594 | |
595 | /* |
596 | * Similarly, initialize the minimum Multixact to the first value that |
597 | * could possibly be stored in tuples in the table. Running transactions |
598 | * could reuse values from their local cache, so we are careful to |
599 | * consider all currently running multis. |
600 | * |
601 | * XXX this could be refined further, but is it worth the hassle? |
602 | */ |
603 | *minmulti = GetOldestMultiXactId(); |
604 | |
605 | srel = RelationCreateStorage(*newrnode, persistence); |
606 | |
607 | /* |
608 | * If required, set up an init fork for an unlogged table so that it can |
609 | * be correctly reinitialized on restart. An immediate sync is required |
610 | * even if the page has been logged, because the write did not go through |
611 | * shared_buffers and therefore a concurrent checkpoint may have moved the |
612 | * redo pointer past our xlog record. Recovery may as well remove it |
613 | * while replaying, for example, XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE |
614 | * record. Therefore, logging is necessary even if wal_level=minimal. |
615 | */ |
616 | if (persistence == RELPERSISTENCE_UNLOGGED) |
617 | { |
618 | Assert(rel->rd_rel->relkind == RELKIND_RELATION || |
619 | rel->rd_rel->relkind == RELKIND_MATVIEW || |
620 | rel->rd_rel->relkind == RELKIND_TOASTVALUE); |
621 | smgrcreate(srel, INIT_FORKNUM, false); |
622 | log_smgrcreate(newrnode, INIT_FORKNUM); |
623 | smgrimmedsync(srel, INIT_FORKNUM); |
624 | } |
625 | |
626 | smgrclose(srel); |
627 | } |
628 | |
629 | static void |
630 | heapam_relation_nontransactional_truncate(Relation rel) |
631 | { |
632 | RelationTruncate(rel, 0); |
633 | } |
634 | |
635 | static void |
636 | heapam_relation_copy_data(Relation rel, const RelFileNode *newrnode) |
637 | { |
638 | SMgrRelation dstrel; |
639 | |
640 | dstrel = smgropen(*newrnode, rel->rd_backend); |
641 | RelationOpenSmgr(rel); |
642 | |
643 | /* |
644 | * Since we copy the file directly without looking at the shared buffers, |
645 | * we'd better first flush out any pages of the source relation that are |
646 | * in shared buffers. We assume no new changes will be made while we are |
647 | * holding exclusive lock on the rel. |
648 | */ |
649 | FlushRelationBuffers(rel); |
650 | |
651 | /* |
652 | * Create and copy all forks of the relation, and schedule unlinking of |
653 | * old physical files. |
654 | * |
655 | * NOTE: any conflict in relfilenode value will be caught in |
656 | * RelationCreateStorage(). |
657 | */ |
658 | RelationCreateStorage(*newrnode, rel->rd_rel->relpersistence); |
659 | |
660 | /* copy main fork */ |
661 | RelationCopyStorage(rel->rd_smgr, dstrel, MAIN_FORKNUM, |
662 | rel->rd_rel->relpersistence); |
663 | |
664 | /* copy those extra forks that exist */ |
665 | for (ForkNumber forkNum = MAIN_FORKNUM + 1; |
666 | forkNum <= MAX_FORKNUM; forkNum++) |
667 | { |
668 | if (smgrexists(rel->rd_smgr, forkNum)) |
669 | { |
670 | smgrcreate(dstrel, forkNum, false); |
671 | |
672 | /* |
673 | * WAL log creation if the relation is persistent, or this is the |
674 | * init fork of an unlogged relation. |
675 | */ |
676 | if (rel->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT || |
677 | (rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED && |
678 | forkNum == INIT_FORKNUM)) |
679 | log_smgrcreate(newrnode, forkNum); |
680 | RelationCopyStorage(rel->rd_smgr, dstrel, forkNum, |
681 | rel->rd_rel->relpersistence); |
682 | } |
683 | } |
684 | |
685 | |
686 | /* drop old relation, and close new one */ |
687 | RelationDropStorage(rel); |
688 | smgrclose(dstrel); |
689 | } |
690 | |
691 | static void |
692 | heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, |
693 | Relation OldIndex, bool use_sort, |
694 | TransactionId OldestXmin, |
695 | TransactionId *xid_cutoff, |
696 | MultiXactId *multi_cutoff, |
697 | double *num_tuples, |
698 | double *tups_vacuumed, |
699 | double *tups_recently_dead) |
700 | { |
701 | RewriteState rwstate; |
702 | IndexScanDesc indexScan; |
703 | TableScanDesc tableScan; |
704 | HeapScanDesc heapScan; |
705 | bool use_wal; |
706 | bool is_system_catalog; |
707 | Tuplesortstate *tuplesort; |
708 | TupleDesc oldTupDesc = RelationGetDescr(OldHeap); |
709 | TupleDesc newTupDesc = RelationGetDescr(NewHeap); |
710 | TupleTableSlot *slot; |
711 | int natts; |
712 | Datum *values; |
713 | bool *isnull; |
714 | BufferHeapTupleTableSlot *hslot; |
715 | |
716 | /* Remember if it's a system catalog */ |
717 | is_system_catalog = IsSystemRelation(OldHeap); |
718 | |
719 | /* |
720 | * We need to log the copied data in WAL iff WAL archiving/streaming is |
721 | * enabled AND it's a WAL-logged rel. |
722 | */ |
723 | use_wal = XLogIsNeeded() && RelationNeedsWAL(NewHeap); |
724 | |
725 | /* use_wal off requires smgr_targblock be initially invalid */ |
726 | Assert(RelationGetTargetBlock(NewHeap) == InvalidBlockNumber); |
727 | |
728 | /* Preallocate values/isnull arrays */ |
729 | natts = newTupDesc->natts; |
730 | values = (Datum *) palloc(natts * sizeof(Datum)); |
731 | isnull = (bool *) palloc(natts * sizeof(bool)); |
732 | |
733 | /* Initialize the rewrite operation */ |
734 | rwstate = begin_heap_rewrite(OldHeap, NewHeap, OldestXmin, *xid_cutoff, |
735 | *multi_cutoff, use_wal); |
736 | |
737 | |
738 | /* Set up sorting if wanted */ |
739 | if (use_sort) |
740 | tuplesort = tuplesort_begin_cluster(oldTupDesc, OldIndex, |
741 | maintenance_work_mem, |
742 | NULL, false); |
743 | else |
744 | tuplesort = NULL; |
745 | |
746 | /* |
747 | * Prepare to scan the OldHeap. To ensure we see recently-dead tuples |
748 | * that still need to be copied, we scan with SnapshotAny and use |
749 | * HeapTupleSatisfiesVacuum for the visibility test. |
750 | */ |
751 | if (OldIndex != NULL && !use_sort) |
752 | { |
753 | const int ci_index[] = { |
754 | PROGRESS_CLUSTER_PHASE, |
755 | PROGRESS_CLUSTER_INDEX_RELID |
756 | }; |
757 | int64 ci_val[2]; |
758 | |
		/* Set the phase and the index-OID progress columns */
760 | ci_val[0] = PROGRESS_CLUSTER_PHASE_INDEX_SCAN_HEAP; |
761 | ci_val[1] = RelationGetRelid(OldIndex); |
762 | pgstat_progress_update_multi_param(2, ci_index, ci_val); |
763 | |
764 | tableScan = NULL; |
765 | heapScan = NULL; |
766 | indexScan = index_beginscan(OldHeap, OldIndex, SnapshotAny, 0, 0); |
767 | index_rescan(indexScan, NULL, 0, NULL, 0); |
768 | } |
769 | else |
770 | { |
771 | /* In scan-and-sort mode and also VACUUM FULL, set phase */ |
772 | pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE, |
773 | PROGRESS_CLUSTER_PHASE_SEQ_SCAN_HEAP); |
774 | |
775 | tableScan = table_beginscan(OldHeap, SnapshotAny, 0, (ScanKey) NULL); |
776 | heapScan = (HeapScanDesc) tableScan; |
777 | indexScan = NULL; |
778 | |
779 | /* Set total heap blocks */ |
780 | pgstat_progress_update_param(PROGRESS_CLUSTER_TOTAL_HEAP_BLKS, |
781 | heapScan->rs_nblocks); |
782 | } |
783 | |
784 | slot = table_slot_create(OldHeap, NULL); |
785 | hslot = (BufferHeapTupleTableSlot *) slot; |
786 | |
787 | /* |
788 | * Scan through the OldHeap, either in OldIndex order or sequentially; |
789 | * copy each tuple into the NewHeap, or transiently to the tuplesort |
790 | * module. Note that we don't bother sorting dead tuples (they won't get |
791 | * to the new table anyway). |
792 | */ |
793 | for (;;) |
794 | { |
795 | HeapTuple tuple; |
796 | Buffer buf; |
797 | bool isdead; |
798 | |
799 | CHECK_FOR_INTERRUPTS(); |
800 | |
801 | if (indexScan != NULL) |
802 | { |
803 | if (!index_getnext_slot(indexScan, ForwardScanDirection, slot)) |
804 | break; |
805 | |
806 | /* Since we used no scan keys, should never need to recheck */ |
807 | if (indexScan->xs_recheck) |
				elog(ERROR, "CLUSTER does not support lossy index conditions");
809 | } |
810 | else |
811 | { |
812 | if (!table_scan_getnextslot(tableScan, ForwardScanDirection, slot)) |
813 | break; |
814 | |
815 | /* |
816 | * In scan-and-sort mode and also VACUUM FULL, set heap blocks |
817 | * scanned |
818 | */ |
819 | pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_BLKS_SCANNED, |
820 | heapScan->rs_cblock + 1); |
821 | } |
822 | |
823 | tuple = ExecFetchSlotHeapTuple(slot, false, NULL); |
824 | buf = hslot->buffer; |
825 | |
826 | LockBuffer(buf, BUFFER_LOCK_SHARE); |
827 | |
828 | switch (HeapTupleSatisfiesVacuum(tuple, OldestXmin, buf)) |
829 | { |
830 | case HEAPTUPLE_DEAD: |
831 | /* Definitely dead */ |
832 | isdead = true; |
833 | break; |
834 | case HEAPTUPLE_RECENTLY_DEAD: |
835 | *tups_recently_dead += 1; |
836 | /* fall through */ |
837 | case HEAPTUPLE_LIVE: |
838 | /* Live or recently dead, must copy it */ |
839 | isdead = false; |
840 | break; |
841 | case HEAPTUPLE_INSERT_IN_PROGRESS: |
842 | |
843 | /* |
844 | * Since we hold exclusive lock on the relation, normally the |
845 | * only way to see this is if it was inserted earlier in our |
846 | * own transaction. However, it can happen in system |
847 | * catalogs, since we tend to release write lock before commit |
848 | * there. Give a warning if neither case applies; but in any |
849 | * case we had better copy it. |
850 | */ |
851 | if (!is_system_catalog && |
852 | !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(tuple->t_data))) |
					elog(WARNING, "concurrent insert in progress within table \"%s\"",
854 | RelationGetRelationName(OldHeap)); |
855 | /* treat as live */ |
856 | isdead = false; |
857 | break; |
858 | case HEAPTUPLE_DELETE_IN_PROGRESS: |
859 | |
860 | /* |
861 | * Similar situation to INSERT_IN_PROGRESS case. |
862 | */ |
863 | if (!is_system_catalog && |
864 | !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(tuple->t_data))) |
					elog(WARNING, "concurrent delete in progress within table \"%s\"",
866 | RelationGetRelationName(OldHeap)); |
867 | /* treat as recently dead */ |
868 | *tups_recently_dead += 1; |
869 | isdead = false; |
870 | break; |
871 | default: |
				elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
873 | isdead = false; /* keep compiler quiet */ |
874 | break; |
875 | } |
876 | |
877 | LockBuffer(buf, BUFFER_LOCK_UNLOCK); |
878 | |
879 | if (isdead) |
880 | { |
881 | *tups_vacuumed += 1; |
882 | /* heap rewrite module still needs to see it... */ |
883 | if (rewrite_heap_dead_tuple(rwstate, tuple)) |
884 | { |
885 | /* A previous recently-dead tuple is now known dead */ |
886 | *tups_vacuumed += 1; |
887 | *tups_recently_dead -= 1; |
888 | } |
889 | continue; |
890 | } |
891 | |
892 | *num_tuples += 1; |
893 | if (tuplesort != NULL) |
894 | { |
895 | tuplesort_putheaptuple(tuplesort, tuple); |
896 | |
897 | /* |
898 | * In scan-and-sort mode, report increase in number of tuples |
899 | * scanned |
900 | */ |
901 | pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_TUPLES_SCANNED, |
902 | *num_tuples); |
903 | } |
904 | else |
905 | { |
906 | const int ct_index[] = { |
907 | PROGRESS_CLUSTER_HEAP_TUPLES_SCANNED, |
908 | PROGRESS_CLUSTER_HEAP_TUPLES_WRITTEN |
909 | }; |
910 | int64 ct_val[2]; |
911 | |
912 | reform_and_rewrite_tuple(tuple, OldHeap, NewHeap, |
913 | values, isnull, rwstate); |
914 | |
915 | /* |
916 | * In indexscan mode and also VACUUM FULL, report increase in |
917 | * number of tuples scanned and written |
918 | */ |
919 | ct_val[0] = *num_tuples; |
920 | ct_val[1] = *num_tuples; |
921 | pgstat_progress_update_multi_param(2, ct_index, ct_val); |
922 | } |
923 | } |
924 | |
925 | if (indexScan != NULL) |
926 | index_endscan(indexScan); |
927 | if (tableScan != NULL) |
928 | table_endscan(tableScan); |
929 | if (slot) |
930 | ExecDropSingleTupleTableSlot(slot); |
931 | |
932 | /* |
933 | * In scan-and-sort mode, complete the sort, then read out all live tuples |
	 * from the tuplesort and write them to the new relation.
935 | */ |
936 | if (tuplesort != NULL) |
937 | { |
938 | double n_tuples = 0; |
939 | |
940 | /* Report that we are now sorting tuples */ |
941 | pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE, |
942 | PROGRESS_CLUSTER_PHASE_SORT_TUPLES); |
943 | |
944 | tuplesort_performsort(tuplesort); |
945 | |
946 | /* Report that we are now writing new heap */ |
947 | pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE, |
948 | PROGRESS_CLUSTER_PHASE_WRITE_NEW_HEAP); |
949 | |
950 | for (;;) |
951 | { |
952 | HeapTuple tuple; |
953 | |
954 | CHECK_FOR_INTERRUPTS(); |
955 | |
956 | tuple = tuplesort_getheaptuple(tuplesort, true); |
957 | if (tuple == NULL) |
958 | break; |
959 | |
960 | n_tuples += 1; |
961 | reform_and_rewrite_tuple(tuple, |
962 | OldHeap, NewHeap, |
963 | values, isnull, |
964 | rwstate); |
965 | /* Report n_tuples */ |
966 | pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_TUPLES_WRITTEN, |
967 | n_tuples); |
968 | } |
969 | |
970 | tuplesort_end(tuplesort); |
971 | } |
972 | |
973 | /* Write out any remaining tuples, and fsync if needed */ |
974 | end_heap_rewrite(rwstate); |
975 | |
976 | /* Clean up */ |
977 | pfree(values); |
978 | pfree(isnull); |
979 | } |
980 | |
981 | static bool |
982 | heapam_scan_analyze_next_block(TableScanDesc scan, BlockNumber blockno, |
983 | BufferAccessStrategy bstrategy) |
984 | { |
985 | HeapScanDesc hscan = (HeapScanDesc) scan; |
986 | |
987 | /* |
988 | * We must maintain a pin on the target page's buffer to ensure that |
989 | * concurrent activity - e.g. HOT pruning - doesn't delete tuples out from |
990 | * under us. Hence, pin the page until we are done looking at it. We |
991 | * also choose to hold sharelock on the buffer throughout --- we could |
992 | * release and re-acquire sharelock for each tuple, but since we aren't |
993 | * doing much work per tuple, the extra lock traffic is probably better |
994 | * avoided. |
995 | */ |
996 | hscan->rs_cblock = blockno; |
997 | hscan->rs_cindex = FirstOffsetNumber; |
998 | hscan->rs_cbuf = ReadBufferExtended(scan->rs_rd, MAIN_FORKNUM, |
999 | blockno, RBM_NORMAL, bstrategy); |
1000 | LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE); |
1001 | |
1002 | /* in heap all blocks can contain tuples, so always return true */ |
1003 | return true; |
1004 | } |
1005 | |
1006 | static bool |
1007 | heapam_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin, |
1008 | double *liverows, double *deadrows, |
1009 | TupleTableSlot *slot) |
1010 | { |
1011 | HeapScanDesc hscan = (HeapScanDesc) scan; |
1012 | Page targpage; |
1013 | OffsetNumber maxoffset; |
1014 | BufferHeapTupleTableSlot *hslot; |
1015 | |
1016 | Assert(TTS_IS_BUFFERTUPLE(slot)); |
1017 | |
1018 | hslot = (BufferHeapTupleTableSlot *) slot; |
1019 | targpage = BufferGetPage(hscan->rs_cbuf); |
1020 | maxoffset = PageGetMaxOffsetNumber(targpage); |
1021 | |
1022 | /* Inner loop over all tuples on the selected page */ |
1023 | for (; hscan->rs_cindex <= maxoffset; hscan->rs_cindex++) |
1024 | { |
1025 | ItemId itemid; |
1026 | HeapTuple targtuple = &hslot->base.tupdata; |
1027 | bool sample_it = false; |
1028 | |
1029 | itemid = PageGetItemId(targpage, hscan->rs_cindex); |
1030 | |
1031 | /* |
1032 | * We ignore unused and redirect line pointers. DEAD line pointers |
1033 | * should be counted as dead, because we need vacuum to run to get rid |
1034 | * of them. Note that this rule agrees with the way that |
1035 | * heap_page_prune() counts things. |
1036 | */ |
1037 | if (!ItemIdIsNormal(itemid)) |
1038 | { |
1039 | if (ItemIdIsDead(itemid)) |
1040 | *deadrows += 1; |
1041 | continue; |
1042 | } |
1043 | |
1044 | ItemPointerSet(&targtuple->t_self, hscan->rs_cblock, hscan->rs_cindex); |
1045 | |
1046 | targtuple->t_tableOid = RelationGetRelid(scan->rs_rd); |
1047 | targtuple->t_data = (HeapTupleHeader) PageGetItem(targpage, itemid); |
1048 | targtuple->t_len = ItemIdGetLength(itemid); |
1049 | |
1050 | switch (HeapTupleSatisfiesVacuum(targtuple, OldestXmin, |
1051 | hscan->rs_cbuf)) |
1052 | { |
1053 | case HEAPTUPLE_LIVE: |
1054 | sample_it = true; |
1055 | *liverows += 1; |
1056 | break; |
1057 | |
1058 | case HEAPTUPLE_DEAD: |
1059 | case HEAPTUPLE_RECENTLY_DEAD: |
1060 | /* Count dead and recently-dead rows */ |
1061 | *deadrows += 1; |
1062 | break; |
1063 | |
1064 | case HEAPTUPLE_INSERT_IN_PROGRESS: |
1065 | |
1066 | /* |
1067 | * Insert-in-progress rows are not counted. We assume that |
1068 | * when the inserting transaction commits or aborts, it will |
1069 | * send a stats message to increment the proper count. This |
1070 | * works right only if that transaction ends after we finish |
1071 | * analyzing the table; if things happen in the other order, |
1072 | * its stats update will be overwritten by ours. However, the |
1073 | * error will be large only if the other transaction runs long |
1074 | * enough to insert many tuples, so assuming it will finish |
1075 | * after us is the safer option. |
1076 | * |
1077 | * A special case is that the inserting transaction might be |
1078 | * our own. In this case we should count and sample the row, |
1079 | * to accommodate users who load a table and analyze it in one |
1080 | * transaction. (pgstat_report_analyze has to adjust the |
1081 | * numbers we send to the stats collector to make this come |
1082 | * out right.) |
1083 | */ |
1084 | if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(targtuple->t_data))) |
1085 | { |
1086 | sample_it = true; |
1087 | *liverows += 1; |
1088 | } |
1089 | break; |
1090 | |
1091 | case HEAPTUPLE_DELETE_IN_PROGRESS: |
1092 | |
1093 | /* |
1094 | * We count and sample delete-in-progress rows the same as |
1095 | * live ones, so that the stats counters come out right if the |
1096 | * deleting transaction commits after us, per the same |
1097 | * reasoning given above. |
1098 | * |
1099 | * If the delete was done by our own transaction, however, we |
1100 | * must count the row as dead to make pgstat_report_analyze's |
1101 | * stats adjustments come out right. (Note: this works out |
1102 | * properly when the row was both inserted and deleted in our |
1103 | * xact.) |
1104 | * |
1105 | * The net effect of these choices is that we act as though an |
1106 | * IN_PROGRESS transaction hasn't happened yet, except if it |
1107 | * is our own transaction, which we assume has happened. |
1108 | * |
1109 | * This approach ensures that we behave sanely if we see both |
1110 | * the pre-image and post-image rows for a row being updated |
1111 | * by a concurrent transaction: we will sample the pre-image |
1112 | * but not the post-image. We also get sane results if the |
1113 | * concurrent transaction never commits. |
1114 | */ |
1115 | if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(targtuple->t_data))) |
1116 | *deadrows += 1; |
1117 | else |
1118 | { |
1119 | sample_it = true; |
1120 | *liverows += 1; |
1121 | } |
1122 | break; |
1123 | |
1124 | default: |
				elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
1126 | break; |
1127 | } |
1128 | |
1129 | if (sample_it) |
1130 | { |
1131 | ExecStoreBufferHeapTuple(targtuple, slot, hscan->rs_cbuf); |
1132 | hscan->rs_cindex++; |
1133 | |
1134 | /* note that we leave the buffer locked here! */ |
1135 | return true; |
1136 | } |
1137 | } |
1138 | |
1139 | /* Now release the lock and pin on the page */ |
1140 | UnlockReleaseBuffer(hscan->rs_cbuf); |
1141 | hscan->rs_cbuf = InvalidBuffer; |
1142 | |
1143 | /* also prevent old slot contents from having pin on page */ |
1144 | ExecClearTuple(slot); |
1145 | |
1146 | return false; |
1147 | } |
1148 | |
1149 | static double |
1150 | heapam_index_build_range_scan(Relation heapRelation, |
1151 | Relation indexRelation, |
1152 | IndexInfo *indexInfo, |
1153 | bool allow_sync, |
1154 | bool anyvisible, |
1155 | bool progress, |
1156 | BlockNumber start_blockno, |
1157 | BlockNumber numblocks, |
1158 | IndexBuildCallback callback, |
1159 | void *callback_state, |
1160 | TableScanDesc scan) |
1161 | { |
1162 | HeapScanDesc hscan; |
1163 | bool is_system_catalog; |
1164 | bool checking_uniqueness; |
1165 | HeapTuple heapTuple; |
1166 | Datum values[INDEX_MAX_KEYS]; |
1167 | bool isnull[INDEX_MAX_KEYS]; |
1168 | double reltuples; |
1169 | ExprState *predicate; |
1170 | TupleTableSlot *slot; |
1171 | EState *estate; |
1172 | ExprContext *econtext; |
1173 | Snapshot snapshot; |
1174 | bool need_unregister_snapshot = false; |
1175 | TransactionId OldestXmin; |
1176 | BlockNumber previous_blkno = InvalidBlockNumber; |
1177 | BlockNumber root_blkno = InvalidBlockNumber; |
1178 | OffsetNumber root_offsets[MaxHeapTuplesPerPage]; |
1179 | |
1180 | /* |
1181 | * sanity checks |
1182 | */ |
1183 | Assert(OidIsValid(indexRelation->rd_rel->relam)); |
1184 | |
1185 | /* Remember if it's a system catalog */ |
1186 | is_system_catalog = IsSystemRelation(heapRelation); |
1187 | |
1188 | /* See whether we're verifying uniqueness/exclusion properties */ |
1189 | checking_uniqueness = (indexInfo->ii_Unique || |
1190 | indexInfo->ii_ExclusionOps != NULL); |
1191 | |
1192 | /* |
1193 | * "Any visible" mode is not compatible with uniqueness checks; make sure |
1194 | * only one of those is requested. |
1195 | */ |
1196 | Assert(!(anyvisible && checking_uniqueness)); |
1197 | |
1198 | /* |
1199 | * Need an EState for evaluation of index expressions and partial-index |
1200 | * predicates. Also a slot to hold the current tuple. |
1201 | */ |
1202 | estate = CreateExecutorState(); |
1203 | econtext = GetPerTupleExprContext(estate); |
1204 | slot = table_slot_create(heapRelation, NULL); |
1205 | |
1206 | /* Arrange for econtext's scan tuple to be the tuple under test */ |
1207 | econtext->ecxt_scantuple = slot; |
1208 | |
1209 | /* Set up execution state for predicate, if any. */ |
1210 | predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); |
1211 | |
1212 | /* |
1213 | * Prepare for scan of the base relation. In a normal index build, we use |
1214 | * SnapshotAny because we must retrieve all tuples and do our own time |
1215 | * qual checks (because we have to index RECENTLY_DEAD tuples). In a |
1216 | * concurrent build, or during bootstrap, we take a regular MVCC snapshot |
1217 | * and index whatever's live according to that. |
1218 | */ |
1219 | OldestXmin = InvalidTransactionId; |
1220 | |
1221 | /* okay to ignore lazy VACUUMs here */ |
1222 | if (!IsBootstrapProcessingMode() && !indexInfo->ii_Concurrent) |
1223 | OldestXmin = GetOldestXmin(heapRelation, PROCARRAY_FLAGS_VACUUM); |
1224 | |
1225 | if (!scan) |
1226 | { |
1227 | /* |
1228 | * Serial index build. |
1229 | * |
1230 | * Must begin our own heap scan in this case. We may also need to |
1231 | * register a snapshot whose lifetime is under our direct control. |
1232 | */ |
1233 | if (!TransactionIdIsValid(OldestXmin)) |
1234 | { |
1235 | snapshot = RegisterSnapshot(GetTransactionSnapshot()); |
1236 | need_unregister_snapshot = true; |
1237 | } |
1238 | else |
1239 | snapshot = SnapshotAny; |
1240 | |
1241 | scan = table_beginscan_strat(heapRelation, /* relation */ |
1242 | snapshot, /* snapshot */ |
1243 | 0, /* number of keys */ |
1244 | NULL, /* scan key */ |
1245 | true, /* buffer access strategy OK */ |
1246 | allow_sync); /* syncscan OK? */ |
1247 | } |
1248 | else |
1249 | { |
1250 | /* |
1251 | * Parallel index build. |
1252 | * |
1253 | * Parallel case never registers/unregisters own snapshot. Snapshot |
1254 | * is taken from parallel heap scan, and is SnapshotAny or an MVCC |
1255 | * snapshot, based on same criteria as serial case. |
1256 | */ |
1257 | Assert(!IsBootstrapProcessingMode()); |
1258 | Assert(allow_sync); |
1259 | snapshot = scan->rs_snapshot; |
1260 | } |
1261 | |
1262 | hscan = (HeapScanDesc) scan; |
1263 | |
1264 | /* Publish number of blocks to scan */ |
1265 | if (progress) |
1266 | { |
1267 | BlockNumber nblocks; |
1268 | |
1269 | if (hscan->rs_base.rs_parallel != NULL) |
1270 | { |
1271 | ParallelBlockTableScanDesc pbscan; |
1272 | |
1273 | pbscan = (ParallelBlockTableScanDesc) hscan->rs_base.rs_parallel; |
1274 | nblocks = pbscan->phs_nblocks; |
1275 | } |
1276 | else |
1277 | nblocks = hscan->rs_nblocks; |
1278 | |
1279 | pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL, |
1280 | nblocks); |
1281 | } |
1282 | |
1283 | /* |
1284 | * Must call GetOldestXmin() with SnapshotAny. Should never call |
1285 | * GetOldestXmin() with MVCC snapshot. (It's especially worth checking |
1286 | * this for parallel builds, since ambuild routines that support parallel |
1287 | * builds must work these details out for themselves.) |
1288 | */ |
1289 | Assert(snapshot == SnapshotAny || IsMVCCSnapshot(snapshot)); |
1290 | Assert(snapshot == SnapshotAny ? TransactionIdIsValid(OldestXmin) : |
1291 | !TransactionIdIsValid(OldestXmin)); |
1292 | Assert(snapshot == SnapshotAny || !anyvisible); |
1293 | |
1294 | /* set our scan endpoints */ |
1295 | if (!allow_sync) |
1296 | heap_setscanlimits(scan, start_blockno, numblocks); |
1297 | else |
1298 | { |
1299 | /* syncscan can only be requested on whole relation */ |
1300 | Assert(start_blockno == 0); |
1301 | Assert(numblocks == InvalidBlockNumber); |
1302 | } |
1303 | |
1304 | reltuples = 0; |
1305 | |
1306 | /* |
1307 | * Scan all tuples in the base relation. |
1308 | */ |
1309 | while ((heapTuple = heap_getnext(scan, ForwardScanDirection)) != NULL) |
1310 | { |
1311 | bool tupleIsAlive; |
1312 | |
1313 | CHECK_FOR_INTERRUPTS(); |
1314 | |
1315 | /* Report scan progress, if asked to. */ |
1316 | if (progress) |
1317 | { |
1318 | BlockNumber blocks_done = heapam_scan_get_blocks_done(hscan); |
1319 | |
1320 | if (blocks_done != previous_blkno) |
1321 | { |
1322 | pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE, |
1323 | blocks_done); |
1324 | previous_blkno = blocks_done; |
1325 | } |
1326 | } |
1327 | |
1328 | /* |
1329 | * When dealing with a HOT-chain of updated tuples, we want to index |
1330 | * the values of the live tuple (if any), but index it under the TID |
1331 | * of the chain's root tuple. This approach is necessary to preserve |
1332 | * the HOT-chain structure in the heap. So we need to be able to find |
1333 | * the root item offset for every tuple that's in a HOT-chain. When |
1334 | * first reaching a new page of the relation, call |
1335 | * heap_get_root_tuples() to build a map of root item offsets on the |
1336 | * page. |
1337 | * |
1338 | * It might look unsafe to use this information across buffer |
1339 | * lock/unlock. However, we hold ShareLock on the table so no |
1340 | * ordinary insert/update/delete should occur; and we hold pin on the |
1341 | * buffer continuously while visiting the page, so no pruning |
1342 | * operation can occur either. |
1343 | * |
1344 | * Also, although our opinions about tuple liveness could change while |
1345 | * we scan the page (due to concurrent transaction commits/aborts), |
1346 | * the chain root locations won't, so this info doesn't need to be |
1347 | * rebuilt after waiting for another transaction. |
1348 | * |
1349 | * Note the implied assumption that there is no more than one live |
1350 | * tuple per HOT-chain --- else we could create more than one index |
1351 | * entry pointing to the same root tuple. |
1352 | */ |
1353 | if (hscan->rs_cblock != root_blkno) |
1354 | { |
1355 | Page page = BufferGetPage(hscan->rs_cbuf); |
1356 | |
1357 | LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE); |
1358 | heap_get_root_tuples(page, root_offsets); |
1359 | LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); |
1360 | |
1361 | root_blkno = hscan->rs_cblock; |
1362 | } |
1363 | |
1364 | if (snapshot == SnapshotAny) |
1365 | { |
1366 | /* do our own time qual check */ |
1367 | bool indexIt; |
1368 | TransactionId xwait; |
1369 | |
1370 | recheck: |
1371 | |
1372 | /* |
1373 | * We could possibly get away with not locking the buffer here, |
1374 | * since caller should hold ShareLock on the relation, but let's |
1375 | * be conservative about it. (This remark is still correct even |
1376 | * with HOT-pruning: our pin on the buffer prevents pruning.) |
1377 | */ |
1378 | LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE); |
1379 | |
1380 | /* |
1381 | * The criteria for counting a tuple as live in this block need to |
1382 | * match what analyze.c's heapam_scan_analyze_next_tuple() does, |
1383 | * otherwise CREATE INDEX and ANALYZE may produce wildly different |
1384 | * reltuples values, e.g. when there are many recently-dead |
1385 | * tuples. |
1386 | */ |
1387 | switch (HeapTupleSatisfiesVacuum(heapTuple, OldestXmin, |
1388 | hscan->rs_cbuf)) |
1389 | { |
1390 | case HEAPTUPLE_DEAD: |
1391 | /* Definitely dead, we can ignore it */ |
1392 | indexIt = false; |
1393 | tupleIsAlive = false; |
1394 | break; |
1395 | case HEAPTUPLE_LIVE: |
1396 | /* Normal case, index and unique-check it */ |
1397 | indexIt = true; |
1398 | tupleIsAlive = true; |
1399 | /* Count it as live, too */ |
1400 | reltuples += 1; |
1401 | break; |
1402 | case HEAPTUPLE_RECENTLY_DEAD: |
1403 | |
1404 | /* |
1405 | * If tuple is recently deleted then we must index it |
1406 | * anyway to preserve MVCC semantics. (Pre-existing |
1407 | * transactions could try to use the index after we finish |
1408 | * building it, and may need to see such tuples.) |
1409 | * |
1410 | * However, if it was HOT-updated then we must only index |
1411 | * the live tuple at the end of the HOT-chain. Since this |
1412 | * breaks semantics for pre-existing snapshots, mark the |
1413 | * index as unusable for them. |
1414 | * |
1415 | * We don't count recently-dead tuples in reltuples, even |
1416 | * if we index them; see heapam_scan_analyze_next_tuple(). |
1417 | */ |
1418 | if (HeapTupleIsHotUpdated(heapTuple)) |
1419 | { |
1420 | indexIt = false; |
1421 | /* mark the index as unsafe for old snapshots */ |
1422 | indexInfo->ii_BrokenHotChain = true; |
1423 | } |
1424 | else |
1425 | indexIt = true; |
1426 | /* In any case, exclude the tuple from unique-checking */ |
1427 | tupleIsAlive = false; |
1428 | break; |
1429 | case HEAPTUPLE_INSERT_IN_PROGRESS: |
1430 | |
1431 | /* |
1432 | * In "anyvisible" mode, this tuple is visible and we |
1433 | * don't need any further checks. |
1434 | */ |
1435 | if (anyvisible) |
1436 | { |
1437 | indexIt = true; |
1438 | tupleIsAlive = true; |
1439 | reltuples += 1; |
1440 | break; |
1441 | } |
1442 | |
1443 | /* |
1444 | * Since caller should hold ShareLock or better, normally |
1445 | * the only way to see this is if it was inserted earlier |
1446 | * in our own transaction. However, it can happen in |
1447 | * system catalogs, since we tend to release write lock |
1448 | * before commit there. Give a warning if neither case |
1449 | * applies. |
1450 | */ |
1451 | xwait = HeapTupleHeaderGetXmin(heapTuple->t_data); |
1452 | if (!TransactionIdIsCurrentTransactionId(xwait)) |
1453 | { |
1454 | if (!is_system_catalog) |
							elog(WARNING, "concurrent insert in progress within table \"%s\"",
1456 | RelationGetRelationName(heapRelation)); |
1457 | |
1458 | /* |
1459 | * If we are performing uniqueness checks, indexing |
1460 | * such a tuple could lead to a bogus uniqueness |
1461 | * failure. In that case we wait for the inserting |
1462 | * transaction to finish and check again. |
1463 | */ |
1464 | if (checking_uniqueness) |
1465 | { |
1466 | /* |
1467 | * Must drop the lock on the buffer before we wait |
1468 | */ |
1469 | LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); |
1470 | XactLockTableWait(xwait, heapRelation, |
1471 | &heapTuple->t_self, |
1472 | XLTW_InsertIndexUnique); |
1473 | CHECK_FOR_INTERRUPTS(); |
1474 | goto recheck; |
1475 | } |
1476 | } |
1477 | else |
1478 | { |
1479 | /* |
1480 | * For consistency with |
1481 | * heapam_scan_analyze_next_tuple(), count |
1482 | * HEAPTUPLE_INSERT_IN_PROGRESS tuples as live only |
1483 | * when inserted by our own transaction. |
1484 | */ |
1485 | reltuples += 1; |
1486 | } |
1487 | |
1488 | /* |
1489 | * We must index such tuples, since if the index build |
1490 | * commits then they're good. |
1491 | */ |
1492 | indexIt = true; |
1493 | tupleIsAlive = true; |
1494 | break; |
1495 | case HEAPTUPLE_DELETE_IN_PROGRESS: |
1496 | |
1497 | /* |
1498 | * As with INSERT_IN_PROGRESS case, this is unexpected |
1499 | * unless it's our own deletion or a system catalog; but |
1500 | * in anyvisible mode, this tuple is visible. |
1501 | */ |
1502 | if (anyvisible) |
1503 | { |
1504 | indexIt = true; |
1505 | tupleIsAlive = false; |
1506 | reltuples += 1; |
1507 | break; |
1508 | } |
1509 | |
1510 | xwait = HeapTupleHeaderGetUpdateXid(heapTuple->t_data); |
1511 | if (!TransactionIdIsCurrentTransactionId(xwait)) |
1512 | { |
1513 | if (!is_system_catalog) |
							elog(WARNING, "concurrent delete in progress within table \"%s\"",
1515 | RelationGetRelationName(heapRelation)); |
1516 | |
1517 | /* |
1518 | * If we are performing uniqueness checks, assuming |
1519 | * the tuple is dead could lead to missing a |
1520 | * uniqueness violation. In that case we wait for the |
1521 | * deleting transaction to finish and check again. |
1522 | * |
1523 | * Also, if it's a HOT-updated tuple, we should not |
1524 | * index it but rather the live tuple at the end of |
1525 | * the HOT-chain. However, the deleting transaction |
1526 | * could abort, possibly leaving this tuple as live |
1527 | * after all, in which case it has to be indexed. The |
1528 | * only way to know what to do is to wait for the |
1529 | * deleting transaction to finish and check again. |
1530 | */ |
1531 | if (checking_uniqueness || |
1532 | HeapTupleIsHotUpdated(heapTuple)) |
1533 | { |
1534 | /* |
1535 | * Must drop the lock on the buffer before we wait |
1536 | */ |
1537 | LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); |
1538 | XactLockTableWait(xwait, heapRelation, |
1539 | &heapTuple->t_self, |
1540 | XLTW_InsertIndexUnique); |
1541 | CHECK_FOR_INTERRUPTS(); |
1542 | goto recheck; |
1543 | } |
1544 | |
1545 | /* |
1546 | * Otherwise index it but don't check for uniqueness, |
1547 | * the same as a RECENTLY_DEAD tuple. |
1548 | */ |
1549 | indexIt = true; |
1550 | |
1551 | /* |
1552 | * Count HEAPTUPLE_DELETE_IN_PROGRESS tuples as live, |
1553 | * if they were not deleted by the current |
1554 | * transaction. That's what |
1555 | * heapam_scan_analyze_next_tuple() does, and we want |
1556 | * the behavior to be consistent. |
1557 | */ |
1558 | reltuples += 1; |
1559 | } |
1560 | else if (HeapTupleIsHotUpdated(heapTuple)) |
1561 | { |
1562 | /* |
1563 | * It's a HOT-updated tuple deleted by our own xact. |
1564 | * We can assume the deletion will commit (else the |
1565 | * index contents don't matter), so treat the same as |
1566 | * RECENTLY_DEAD HOT-updated tuples. |
1567 | */ |
1568 | indexIt = false; |
1569 | /* mark the index as unsafe for old snapshots */ |
1570 | indexInfo->ii_BrokenHotChain = true; |
1571 | } |
1572 | else |
1573 | { |
1574 | /* |
1575 | * It's a regular tuple deleted by our own xact. Index |
1576 | * it, but don't check for uniqueness nor count in |
1577 | * reltuples, the same as a RECENTLY_DEAD tuple. |
1578 | */ |
1579 | indexIt = true; |
1580 | } |
1581 | /* In any case, exclude the tuple from unique-checking */ |
1582 | tupleIsAlive = false; |
1583 | break; |
1584 | default: |
					elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
1586 | indexIt = tupleIsAlive = false; /* keep compiler quiet */ |
1587 | break; |
1588 | } |
1589 | |
1590 | LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); |
1591 | |
1592 | if (!indexIt) |
1593 | continue; |
1594 | } |
1595 | else |
1596 | { |
1597 | /* heap_getnext did the time qual check */ |
1598 | tupleIsAlive = true; |
1599 | reltuples += 1; |
1600 | } |
1601 | |
1602 | MemoryContextReset(econtext->ecxt_per_tuple_memory); |
1603 | |
1604 | /* Set up for predicate or expression evaluation */ |
1605 | ExecStoreBufferHeapTuple(heapTuple, slot, hscan->rs_cbuf); |
1606 | |
1607 | /* |
1608 | * In a partial index, discard tuples that don't satisfy the |
1609 | * predicate. |
1610 | */ |
1611 | if (predicate != NULL) |
1612 | { |
1613 | if (!ExecQual(predicate, econtext)) |
1614 | continue; |
1615 | } |
1616 | |
1617 | /* |
1618 | * For the current heap tuple, extract all the attributes we use in |
1619 | * this index, and note which are null. This also performs evaluation |
1620 | * of any expressions needed. |
1621 | */ |
1622 | FormIndexDatum(indexInfo, |
1623 | slot, |
1624 | estate, |
1625 | values, |
1626 | isnull); |
1627 | |
1628 | /* |
1629 | * You'd think we should go ahead and build the index tuple here, but |
1630 | * some index AMs want to do further processing on the data first. So |
1631 | * pass the values[] and isnull[] arrays, instead. |
1632 | */ |
1633 | |
1634 | if (HeapTupleIsHeapOnly(heapTuple)) |
1635 | { |
1636 | /* |
1637 | * For a heap-only tuple, pretend its TID is that of the root. See |
1638 | * src/backend/access/heap/README.HOT for discussion. |
1639 | */ |
1640 | HeapTupleData rootTuple; |
1641 | OffsetNumber offnum; |
1642 | |
1643 | rootTuple = *heapTuple; |
1644 | offnum = ItemPointerGetOffsetNumber(&heapTuple->t_self); |
1645 | |
1646 | if (!OffsetNumberIsValid(root_offsets[offnum - 1])) |
1647 | ereport(ERROR, |
1648 | (errcode(ERRCODE_DATA_CORRUPTED), |
1649 | errmsg_internal("failed to find parent tuple for heap-only tuple at (%u,%u) in table \"%s\"" , |
1650 | ItemPointerGetBlockNumber(&heapTuple->t_self), |
1651 | offnum, |
1652 | RelationGetRelationName(heapRelation)))); |
1653 | |
1654 | ItemPointerSetOffsetNumber(&rootTuple.t_self, |
1655 | root_offsets[offnum - 1]); |
1656 | |
1657 | /* Call the AM's callback routine to process the tuple */ |
1658 | callback(indexRelation, &rootTuple, values, isnull, tupleIsAlive, |
1659 | callback_state); |
1660 | } |
1661 | else |
1662 | { |
1663 | /* Call the AM's callback routine to process the tuple */ |
1664 | callback(indexRelation, heapTuple, values, isnull, tupleIsAlive, |
1665 | callback_state); |
1666 | } |
1667 | } |
1668 | |
1669 | /* Report scan progress one last time. */ |
1670 | if (progress) |
1671 | { |
1672 | BlockNumber blks_done; |
1673 | |
1674 | if (hscan->rs_base.rs_parallel != NULL) |
1675 | { |
1676 | ParallelBlockTableScanDesc pbscan; |
1677 | |
1678 | pbscan = (ParallelBlockTableScanDesc) hscan->rs_base.rs_parallel; |
1679 | blks_done = pbscan->phs_nblocks; |
1680 | } |
1681 | else |
1682 | blks_done = hscan->rs_nblocks; |
1683 | |
1684 | pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE, |
1685 | blks_done); |
1686 | } |
1687 | |
1688 | table_endscan(scan); |
1689 | |
1690 | /* we can now forget our snapshot, if set and registered by us */ |
1691 | if (need_unregister_snapshot) |
1692 | UnregisterSnapshot(snapshot); |
1693 | |
1694 | ExecDropSingleTupleTableSlot(slot); |
1695 | |
1696 | FreeExecutorState(estate); |
1697 | |
1698 | /* These may have been pointing to the now-gone estate */ |
1699 | indexInfo->ii_ExpressionsState = NIL; |
1700 | indexInfo->ii_PredicateState = NULL; |
1701 | |
1702 | return reltuples; |
1703 | } |
1704 | |
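/*
 * Callback for the index_validate_scan table AM interface.
 *
 * Scan the heap for tuples satisfying the given reference snapshot and merge
 * that stream against the sorted TIDs already collected from the index (in
 * state->tuplesort), inserting index entries for any heap tuples that turn
 * out to be missing.  This is used by validate_index() during concurrent
 * index builds.
 */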
1705 | static void |
1706 | heapam_index_validate_scan(Relation heapRelation, |
1707 | Relation indexRelation, |
1708 | IndexInfo *indexInfo, |
1709 | Snapshot snapshot, |
1710 | ValidateIndexState *state) |
1711 | { |
1712 | TableScanDesc scan; |
1713 | HeapScanDesc hscan; |
1714 | HeapTuple heapTuple; |
1715 | Datum values[INDEX_MAX_KEYS]; |
1716 | bool isnull[INDEX_MAX_KEYS]; |
1717 | ExprState *predicate; |
1718 | TupleTableSlot *slot; |
1719 | EState *estate; |
1720 | ExprContext *econtext; |
1721 | BlockNumber root_blkno = InvalidBlockNumber; |
1722 | OffsetNumber root_offsets[MaxHeapTuplesPerPage]; |
1723 | bool in_index[MaxHeapTuplesPerPage]; |
1724 | BlockNumber previous_blkno = InvalidBlockNumber; |
1725 | |
1726 | /* state variables for the merge */ |
1727 | ItemPointer indexcursor = NULL; |
1728 | ItemPointerData decoded; |
1729 | bool tuplesort_empty = false; |
1730 | |
1731 | /* |
1732 | * sanity checks |
1733 | */ |
1734 | Assert(OidIsValid(indexRelation->rd_rel->relam)); |
1735 | |
1736 | /* |
1737 | * Need an EState for evaluation of index expressions and partial-index |
1738 | * predicates. Also a slot to hold the current tuple. |
1739 | */ |
1740 | estate = CreateExecutorState(); |
1741 | econtext = GetPerTupleExprContext(estate); |
1742 | slot = MakeSingleTupleTableSlot(RelationGetDescr(heapRelation), |
1743 | &TTSOpsHeapTuple); |
1744 | |
1745 | /* Arrange for econtext's scan tuple to be the tuple under test */ |
1746 | econtext->ecxt_scantuple = slot; |
1747 | |
1748 | /* Set up execution state for predicate, if any. */ |
1749 | predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); |
1750 | |
1751 | /* |
1752 | * Prepare for scan of the base relation. We need just those tuples |
1753 | * satisfying the passed-in reference snapshot. We must disable syncscan |
1754 | * here, because it's critical that we read from block zero forward to |
1755 | * match the sorted TIDs. |
1756 | */ |
1757 | scan = table_beginscan_strat(heapRelation, /* relation */ |
1758 | snapshot, /* snapshot */ |
1759 | 0, /* number of keys */ |
1760 | NULL, /* scan key */ |
1761 | true, /* buffer access strategy OK */ |
1762 | false); /* syncscan not OK */ |
1763 | hscan = (HeapScanDesc) scan; |
1764 | |
1765 | pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL, |
1766 | hscan->rs_nblocks); |
1767 | |
1768 | /* |
1769 | * Scan all tuples matching the snapshot. |
1770 | */ |
1771 | while ((heapTuple = heap_getnext(scan, ForwardScanDirection)) != NULL) |
1772 | { |
1773 | ItemPointer heapcursor = &heapTuple->t_self; |
1774 | ItemPointerData rootTuple; |
1775 | OffsetNumber root_offnum; |
1776 | |
1777 | CHECK_FOR_INTERRUPTS(); |
1778 | |
1779 | state->htups += 1; |
1780 | |
1781 | if ((previous_blkno == InvalidBlockNumber) || |
1782 | (hscan->rs_cblock != previous_blkno)) |
1783 | { |
1784 | pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE, |
1785 | hscan->rs_cblock); |
1786 | previous_blkno = hscan->rs_cblock; |
1787 | } |
1788 | |
1789 | /* |
1790 | * As commented in table_index_build_scan, we should index heap-only |
1791 | * tuples under the TIDs of their root tuples; so when we advance onto |
1792 | * a new heap page, build a map of root item offsets on the page. |
1793 | * |
1794 | * This complicates merging against the tuplesort output: we will |
1795 | * visit the live tuples in order by their offsets, but the root |
1796 | * offsets that we need to compare against the index contents might be |
1797 | * ordered differently. So we might have to "look back" within the |
1798 | * tuplesort output, but only within the current page. We handle that |
1799 | * by keeping a bool array in_index[] showing all the |
1800 | * already-passed-over tuplesort output TIDs of the current page. We |
1801 | * clear that array here, when advancing onto a new heap page. |
1802 | */ |
1803 | if (hscan->rs_cblock != root_blkno) |
1804 | { |
1805 | Page page = BufferGetPage(hscan->rs_cbuf); |
1806 | |
1807 | LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE); |
1808 | heap_get_root_tuples(page, root_offsets); |
1809 | LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); |
1810 | |
1811 | memset(in_index, 0, sizeof(in_index)); |
1812 | |
1813 | root_blkno = hscan->rs_cblock; |
1814 | } |
1815 | |
1816 | /* Convert actual tuple TID to root TID */ |
1817 | rootTuple = *heapcursor; |
1818 | root_offnum = ItemPointerGetOffsetNumber(heapcursor); |
1819 | |
1820 | if (HeapTupleIsHeapOnly(heapTuple)) |
1821 | { |
1822 | root_offnum = root_offsets[root_offnum - 1]; |
1823 | if (!OffsetNumberIsValid(root_offnum)) |
1824 | ereport(ERROR, |
1825 | (errcode(ERRCODE_DATA_CORRUPTED), |
1826 | errmsg_internal("failed to find parent tuple for heap-only tuple at (%u,%u) in table \"%s\"" , |
1827 | ItemPointerGetBlockNumber(heapcursor), |
1828 | ItemPointerGetOffsetNumber(heapcursor), |
1829 | RelationGetRelationName(heapRelation)))); |
1830 | ItemPointerSetOffsetNumber(&rootTuple, root_offnum); |
1831 | } |
1832 | |
1833 | /* |
1834 | * "merge" by skipping through the index tuples until we find or pass |
1835 | * the current root tuple. |
1836 | */ |
1837 | while (!tuplesort_empty && |
1838 | (!indexcursor || |
1839 | ItemPointerCompare(indexcursor, &rootTuple) < 0)) |
1840 | { |
1841 | Datum ts_val; |
1842 | bool ts_isnull; |
1843 | |
1844 | if (indexcursor) |
1845 | { |
1846 | /* |
1847 | * Remember index items seen earlier on the current heap page |
1848 | */ |
1849 | if (ItemPointerGetBlockNumber(indexcursor) == root_blkno) |
1850 | in_index[ItemPointerGetOffsetNumber(indexcursor) - 1] = true; |
1851 | } |
1852 | |
1853 | tuplesort_empty = !tuplesort_getdatum(state->tuplesort, true, |
1854 | &ts_val, &ts_isnull, NULL); |
1855 | Assert(tuplesort_empty || !ts_isnull); |
1856 | if (!tuplesort_empty) |
1857 | { |
1858 | itemptr_decode(&decoded, DatumGetInt64(ts_val)); |
1859 | indexcursor = &decoded; |
1860 | |
1861 | /* If int8 is pass-by-ref, free (encoded) TID Datum memory */ |
1862 | #ifndef USE_FLOAT8_BYVAL |
1863 | pfree(DatumGetPointer(ts_val)); |
1864 | #endif |
1865 | } |
1866 | else |
1867 | { |
1868 | /* Be tidy */ |
1869 | indexcursor = NULL; |
1870 | } |
1871 | } |
1872 | |
1873 | /* |
1874 | * If the tuplesort has overshot *and* we didn't see a match earlier, |
1875 | * then this tuple is missing from the index, so insert it. |
1876 | */ |
1877 | if ((tuplesort_empty || |
1878 | ItemPointerCompare(indexcursor, &rootTuple) > 0) && |
1879 | !in_index[root_offnum - 1]) |
1880 | { |
1881 | MemoryContextReset(econtext->ecxt_per_tuple_memory); |
1882 | |
1883 | /* Set up for predicate or expression evaluation */ |
1884 | ExecStoreHeapTuple(heapTuple, slot, false); |
1885 | |
1886 | /* |
1887 | * In a partial index, discard tuples that don't satisfy the |
1888 | * predicate. |
1889 | */ |
1890 | if (predicate != NULL) |
1891 | { |
1892 | if (!ExecQual(predicate, econtext)) |
1893 | continue; |
1894 | } |
1895 | |
1896 | /* |
1897 | * For the current heap tuple, extract all the attributes we use |
1898 | * in this index, and note which are null. This also performs |
1899 | * evaluation of any expressions needed. |
1900 | */ |
1901 | FormIndexDatum(indexInfo, |
1902 | slot, |
1903 | estate, |
1904 | values, |
1905 | isnull); |
1906 | |
1907 | /* |
1908 | * You'd think we should go ahead and build the index tuple here, |
1909 | * but some index AMs want to do further processing on the data |
1910 | * first. So pass the values[] and isnull[] arrays, instead. |
1911 | */ |
1912 | |
1913 | /* |
1914 | * If the tuple is already committed dead, you might think we |
1915 | * could suppress uniqueness checking, but this is no longer true |
1916 | * in the presence of HOT, because the insert is actually a proxy |
1917 | * for a uniqueness check on the whole HOT-chain. That is, the |
1918 | * tuple we have here could be dead because it was already |
1919 | * HOT-updated, and if so the updating transaction will not have |
1920 | * thought it should insert index entries. The index AM will |
1921 | * check the whole HOT-chain and correctly detect a conflict if |
1922 | * there is one. |
1923 | */ |
1924 | |
1925 | index_insert(indexRelation, |
1926 | values, |
1927 | isnull, |
1928 | &rootTuple, |
1929 | heapRelation, |
1930 | indexInfo->ii_Unique ? |
1931 | UNIQUE_CHECK_YES : UNIQUE_CHECK_NO, |
1932 | indexInfo); |
1933 | |
1934 | state->tups_inserted += 1; |
1935 | } |
1936 | } |
1937 | |
1938 | table_endscan(scan); |
1939 | |
1940 | ExecDropSingleTupleTableSlot(slot); |
1941 | |
1942 | FreeExecutorState(estate); |
1943 | |
1944 | /* These may have been pointing to the now-gone estate */ |
1945 | indexInfo->ii_ExpressionsState = NIL; |
1946 | indexInfo->ii_PredicateState = NULL; |
1947 | } |
1948 | |
1949 | /* |
1950 | * Return the number of blocks that have been read by this scan since |
 * starting.  This is meant for progress reporting rather than to be fully
1952 | * accurate: in a parallel scan, workers can be concurrently reading blocks |
1953 | * further ahead than what we report. |
1954 | */ |
1955 | static BlockNumber |
1956 | heapam_scan_get_blocks_done(HeapScanDesc hscan) |
1957 | { |
1958 | ParallelBlockTableScanDesc bpscan = NULL; |
1959 | BlockNumber startblock; |
1960 | BlockNumber blocks_done; |
1961 | |
1962 | if (hscan->rs_base.rs_parallel != NULL) |
1963 | { |
1964 | bpscan = (ParallelBlockTableScanDesc) hscan->rs_base.rs_parallel; |
1965 | startblock = bpscan->phs_startblock; |
1966 | } |
1967 | else |
1968 | startblock = hscan->rs_startblock; |
1969 | |
1970 | /* |
1971 | * Might have wrapped around the end of the relation, if startblock was |
1972 | * not zero. |
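	 *
	 * For example (illustrative numbers only): with nblocks = 100,
	 * startblock = 70 and rs_cblock = 10, the scan has wrapped around and
	 * blocks_done = 100 - 70 + 10 = 40.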
1973 | */ |
1974 | if (hscan->rs_cblock > startblock) |
1975 | blocks_done = hscan->rs_cblock - startblock; |
1976 | else |
1977 | { |
1978 | BlockNumber nblocks; |
1979 | |
1980 | nblocks = bpscan != NULL ? bpscan->phs_nblocks : hscan->rs_nblocks; |
1981 | blocks_done = nblocks - startblock + |
1982 | hscan->rs_cblock; |
1983 | } |
1984 | |
1985 | return blocks_done; |
1986 | } |
1987 | |
1988 | |
1989 | /* ------------------------------------------------------------------------ |
1990 | * Miscellaneous callbacks for the heap AM |
1991 | * ------------------------------------------------------------------------ |
1992 | */ |
1993 | |
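/*
 * Return the current on-disk size of the relation, in bytes, as reported by
 * smgr.  A specific fork can be requested; InvalidForkNumber asks for the
 * total over the relation's forks.
 */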
1994 | static uint64 |
1995 | heapam_relation_size(Relation rel, ForkNumber forkNumber) |
1996 | { |
1997 | uint64 nblocks = 0; |
1998 | |
1999 | /* Open it at the smgr level if not already done */ |
2000 | RelationOpenSmgr(rel); |
2001 | |
2002 | /* InvalidForkNumber indicates returning the size for all forks */ |
2003 | if (forkNumber == InvalidForkNumber) |
2004 | { |
		for (int i = 0; i <= MAX_FORKNUM; i++)
2006 | nblocks += smgrnblocks(rel->rd_smgr, i); |
2007 | } |
2008 | else |
2009 | nblocks = smgrnblocks(rel->rd_smgr, forkNumber); |
2010 | |
2011 | return nblocks * BLCKSZ; |
2012 | } |
2013 | |
2014 | /* |
2015 | * Check to see whether the table needs a TOAST table. It does only if |
2016 | * (1) there are any toastable attributes, and (2) the maximum length |
2017 | * of a tuple could exceed TOAST_TUPLE_THRESHOLD. (We don't want to |
2018 | * create a toast table for something like "f1 varchar(20)".) |
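 *
 * For example, a table whose only variable-width column is "f1 text" does
 * need one: text is toastable and has no bounded maximum length.  A table
 * consisting solely of fixed-width columns (int4, float8, ...) never does,
 * since fixed-length types are not toastable.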
2019 | */ |
2020 | static bool |
2021 | heapam_relation_needs_toast_table(Relation rel) |
2022 | { |
2023 | int32 data_length = 0; |
2024 | bool maxlength_unknown = false; |
2025 | bool has_toastable_attrs = false; |
2026 | TupleDesc tupdesc = rel->rd_att; |
2027 | int32 tuple_length; |
2028 | int i; |
2029 | |
2030 | for (i = 0; i < tupdesc->natts; i++) |
2031 | { |
2032 | Form_pg_attribute att = TupleDescAttr(tupdesc, i); |
2033 | |
2034 | if (att->attisdropped) |
2035 | continue; |
2036 | data_length = att_align_nominal(data_length, att->attalign); |
2037 | if (att->attlen > 0) |
2038 | { |
2039 | /* Fixed-length types are never toastable */ |
2040 | data_length += att->attlen; |
2041 | } |
2042 | else |
2043 | { |
2044 | int32 maxlen = type_maximum_size(att->atttypid, |
2045 | att->atttypmod); |
2046 | |
2047 | if (maxlen < 0) |
2048 | maxlength_unknown = true; |
2049 | else |
2050 | data_length += maxlen; |
2051 | if (att->attstorage != 'p') |
2052 | has_toastable_attrs = true; |
2053 | } |
2054 | } |
2055 | if (!has_toastable_attrs) |
2056 | return false; /* nothing to toast? */ |
2057 | if (maxlength_unknown) |
2058 | return true; /* any unlimited-length attrs? */ |
2059 | tuple_length = MAXALIGN(SizeofHeapTupleHeader + |
2060 | BITMAPLEN(tupdesc->natts)) + |
2061 | MAXALIGN(data_length); |
2062 | return (tuple_length > TOAST_TUPLE_THRESHOLD); |
2063 | } |
2064 | |
2065 | |
2066 | /* ------------------------------------------------------------------------ |
2067 | * Planner related callbacks for the heap AM |
2068 | * ------------------------------------------------------------------------ |
2069 | */ |
2070 | |
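/*
 * Estimate the current size of a heap relation for the planner: number of
 * pages and tuples, plus the fraction of pages that are all-visible.  This
 * combines the actual number of blocks on disk with the tuple density
 * implied by the relation's pg_class entry (or, lacking that, by the
 * attribute datatype widths).
 */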
2071 | static void |
2072 | heapam_estimate_rel_size(Relation rel, int32 *attr_widths, |
2073 | BlockNumber *pages, double *tuples, |
2074 | double *allvisfrac) |
2075 | { |
2076 | BlockNumber curpages; |
2077 | BlockNumber relpages; |
2078 | double reltuples; |
2079 | BlockNumber relallvisible; |
2080 | double density; |
2081 | |
2082 | /* it has storage, ok to call the smgr */ |
2083 | curpages = RelationGetNumberOfBlocks(rel); |
2084 | |
2085 | /* coerce values in pg_class to more desirable types */ |
2086 | relpages = (BlockNumber) rel->rd_rel->relpages; |
2087 | reltuples = (double) rel->rd_rel->reltuples; |
2088 | relallvisible = (BlockNumber) rel->rd_rel->relallvisible; |
2089 | |
2090 | /* |
2091 | * HACK: if the relation has never yet been vacuumed, use a minimum size |
2092 | * estimate of 10 pages. The idea here is to avoid assuming a |
2093 | * newly-created table is really small, even if it currently is, because |
2094 | * that may not be true once some data gets loaded into it. Once a vacuum |
2095 | * or analyze cycle has been done on it, it's more reasonable to believe |
2096 | * the size is somewhat stable. |
2097 | * |
2098 | * (Note that this is only an issue if the plan gets cached and used again |
2099 | * after the table has been filled. What we're trying to avoid is using a |
2100 | * nestloop-type plan on a table that has grown substantially since the |
2101 | * plan was made. Normally, autovacuum/autoanalyze will occur once enough |
2102 | * inserts have happened and cause cached-plan invalidation; but that |
2103 | * doesn't happen instantaneously, and it won't happen at all for cases |
2104 | * such as temporary tables.) |
2105 | * |
2106 | * We approximate "never vacuumed" by "has relpages = 0", which means this |
2107 | * will also fire on genuinely empty relations. Not great, but |
2108 | * fortunately that's a seldom-seen case in the real world, and it |
2109 | * shouldn't degrade the quality of the plan too much anyway to err in |
2110 | * this direction. |
2111 | * |
2112 | * If the table has inheritance children, we don't apply this heuristic. |
2113 | * Totally empty parent tables are quite common, so we should be willing |
2114 | * to believe that they are empty. |
2115 | */ |
2116 | if (curpages < 10 && |
2117 | relpages == 0 && |
2118 | !rel->rd_rel->relhassubclass) |
2119 | curpages = 10; |
2120 | |
2121 | /* report estimated # pages */ |
2122 | *pages = curpages; |
2123 | /* quick exit if rel is clearly empty */ |
2124 | if (curpages == 0) |
2125 | { |
2126 | *tuples = 0; |
2127 | *allvisfrac = 0; |
2128 | return; |
2129 | } |
2130 | |
2131 | /* estimate number of tuples from previous tuple density */ |
2132 | if (relpages > 0) |
2133 | density = reltuples / (double) relpages; |
2134 | else |
2135 | { |
2136 | /* |
2137 | * When we have no data because the relation was truncated, estimate |
2138 | * tuple width from attribute datatypes. We assume here that the |
2139 | * pages are completely full, which is OK for tables (since they've |
2140 | * presumably not been VACUUMed yet) but is probably an overestimate |
2141 | * for indexes. Fortunately get_relation_info() can clamp the |
2142 | * overestimate to the parent table's size. |
2143 | * |
2144 | * Note: this code intentionally disregards alignment considerations, |
2145 | * because (a) that would be gilding the lily considering how crude |
2146 | * the estimate is, and (b) it creates platform dependencies in the |
2147 | * default plans which are kind of a headache for regression testing. |
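		 *
		 * Illustration (numbers are examples only): with BLCKSZ = 8192,
		 * SizeOfPageHeaderData = 24 and a computed tuple_width of 100 bytes,
		 * the integer division below yields density = (8192 - 24) / 100 = 81
		 * tuples per page.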
2148 | */ |
2149 | int32 tuple_width; |
2150 | |
2151 | tuple_width = get_rel_data_width(rel, attr_widths); |
2152 | tuple_width += MAXALIGN(SizeofHeapTupleHeader); |
2153 | tuple_width += sizeof(ItemIdData); |
2154 | /* note: integer division is intentional here */ |
2155 | density = (BLCKSZ - SizeOfPageHeaderData) / tuple_width; |
2156 | } |
2157 | *tuples = rint(density * (double) curpages); |
2158 | |
2159 | /* |
2160 | * We use relallvisible as-is, rather than scaling it up like we do for |
2161 | * the pages and tuples counts, on the theory that any pages added since |
2162 | * the last VACUUM are most likely not marked all-visible. But costsize.c |
2163 | * wants it converted to a fraction. |
2164 | */ |
2165 | if (relallvisible == 0 || curpages <= 0) |
2166 | *allvisfrac = 0; |
2167 | else if ((double) relallvisible >= curpages) |
2168 | *allvisfrac = 1; |
2169 | else |
2170 | *allvisfrac = (double) relallvisible / curpages; |
2171 | } |
2172 | |
2173 | |
2174 | /* ------------------------------------------------------------------------ |
2175 | * Executor related callbacks for the heap AM |
2176 | * ------------------------------------------------------------------------ |
2177 | */ |
2178 | |
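/*
 * Prepare the next block of a bitmap heap scan: pin and (possibly) prune the
 * page named by tbmres, collect the offsets of its visible tuples into
 * rs_vistuples[], and return true iff at least one visible tuple was found.
 */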
2179 | static bool |
2180 | heapam_scan_bitmap_next_block(TableScanDesc scan, |
2181 | TBMIterateResult *tbmres) |
2182 | { |
2183 | HeapScanDesc hscan = (HeapScanDesc) scan; |
2184 | BlockNumber page = tbmres->blockno; |
2185 | Buffer buffer; |
2186 | Snapshot snapshot; |
2187 | int ntup; |
2188 | |
2189 | hscan->rs_cindex = 0; |
2190 | hscan->rs_ntuples = 0; |
2191 | |
2192 | /* |
2193 | * Ignore any claimed entries past what we think is the end of the |
2194 | * relation. It may have been extended after the start of our scan (we |
2195 | * only hold an AccessShareLock, and it could be inserts from this |
2196 | * backend). |
2197 | */ |
2198 | if (page >= hscan->rs_nblocks) |
2199 | return false; |
2200 | |
2201 | /* |
2202 | * Acquire pin on the target heap page, trading in any pin we held before. |
2203 | */ |
2204 | hscan->rs_cbuf = ReleaseAndReadBuffer(hscan->rs_cbuf, |
2205 | scan->rs_rd, |
2206 | page); |
2207 | hscan->rs_cblock = page; |
2208 | buffer = hscan->rs_cbuf; |
2209 | snapshot = scan->rs_snapshot; |
2210 | |
2211 | ntup = 0; |
2212 | |
2213 | /* |
2214 | * Prune and repair fragmentation for the whole page, if possible. |
2215 | */ |
2216 | heap_page_prune_opt(scan->rs_rd, buffer); |
2217 | |
2218 | /* |
2219 | * We must hold share lock on the buffer content while examining tuple |
2220 | * visibility. Afterwards, however, the tuples we have found to be |
2221 | * visible are guaranteed good as long as we hold the buffer pin. |
2222 | */ |
2223 | LockBuffer(buffer, BUFFER_LOCK_SHARE); |
2224 | |
2225 | /* |
2226 | * We need two separate strategies for lossy and non-lossy cases. |
2227 | */ |
2228 | if (tbmres->ntuples >= 0) |
2229 | { |
2230 | /* |
2231 | * Bitmap is non-lossy, so we just look through the offsets listed in |
2232 | * tbmres; but we have to follow any HOT chain starting at each such |
2233 | * offset. |
2234 | */ |
2235 | int curslot; |
2236 | |
2237 | for (curslot = 0; curslot < tbmres->ntuples; curslot++) |
2238 | { |
2239 | OffsetNumber offnum = tbmres->offsets[curslot]; |
2240 | ItemPointerData tid; |
2241 | HeapTupleData heapTuple; |
2242 | |
2243 | ItemPointerSet(&tid, page, offnum); |
2244 | if (heap_hot_search_buffer(&tid, scan->rs_rd, buffer, snapshot, |
2245 | &heapTuple, NULL, true)) |
2246 | hscan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid); |
2247 | } |
2248 | } |
2249 | else |
2250 | { |
2251 | /* |
2252 | * Bitmap is lossy, so we must examine each line pointer on the page. |
2253 | * But we can ignore HOT chains, since we'll check each tuple anyway. |
2254 | */ |
2255 | Page dp = (Page) BufferGetPage(buffer); |
2256 | OffsetNumber maxoff = PageGetMaxOffsetNumber(dp); |
2257 | OffsetNumber offnum; |
2258 | |
2259 | for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) |
2260 | { |
2261 | ItemId lp; |
2262 | HeapTupleData loctup; |
2263 | bool valid; |
2264 | |
2265 | lp = PageGetItemId(dp, offnum); |
2266 | if (!ItemIdIsNormal(lp)) |
2267 | continue; |
2268 | loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp); |
2269 | loctup.t_len = ItemIdGetLength(lp); |
2270 | loctup.t_tableOid = scan->rs_rd->rd_id; |
2271 | ItemPointerSet(&loctup.t_self, page, offnum); |
2272 | valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer); |
2273 | if (valid) |
2274 | { |
2275 | hscan->rs_vistuples[ntup++] = offnum; |
2276 | PredicateLockTuple(scan->rs_rd, &loctup, snapshot); |
2277 | } |
2278 | CheckForSerializableConflictOut(valid, scan->rs_rd, &loctup, |
2279 | buffer, snapshot); |
2280 | } |
2281 | } |
2282 | |
2283 | LockBuffer(buffer, BUFFER_LOCK_UNLOCK); |
2284 | |
2285 | Assert(ntup <= MaxHeapTuplesPerPage); |
2286 | hscan->rs_ntuples = ntup; |
2287 | |
2288 | return ntup > 0; |
2289 | } |
2290 | |
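/*
 * Return the next visible tuple from the page prepared by
 * heapam_scan_bitmap_next_block, storing it in the given slot; return false
 * once the page's visible tuples are exhausted.
 */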
2291 | static bool |
2292 | heapam_scan_bitmap_next_tuple(TableScanDesc scan, |
2293 | TBMIterateResult *tbmres, |
2294 | TupleTableSlot *slot) |
2295 | { |
2296 | HeapScanDesc hscan = (HeapScanDesc) scan; |
2297 | OffsetNumber targoffset; |
2298 | Page dp; |
2299 | ItemId lp; |
2300 | |
2301 | /* |
2302 | * Out of range? If so, nothing more to look at on this page |
2303 | */ |
2304 | if (hscan->rs_cindex < 0 || hscan->rs_cindex >= hscan->rs_ntuples) |
2305 | return false; |
2306 | |
2307 | targoffset = hscan->rs_vistuples[hscan->rs_cindex]; |
2308 | dp = (Page) BufferGetPage(hscan->rs_cbuf); |
2309 | lp = PageGetItemId(dp, targoffset); |
2310 | Assert(ItemIdIsNormal(lp)); |
2311 | |
2312 | hscan->rs_ctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp); |
2313 | hscan->rs_ctup.t_len = ItemIdGetLength(lp); |
2314 | hscan->rs_ctup.t_tableOid = scan->rs_rd->rd_id; |
2315 | ItemPointerSet(&hscan->rs_ctup.t_self, hscan->rs_cblock, targoffset); |
2316 | |
2317 | pgstat_count_heap_fetch(scan->rs_rd); |
2318 | |
2319 | /* |
2320 | * Set up the result slot to point to this tuple. Note that the slot |
2321 | * acquires a pin on the buffer. |
2322 | */ |
2323 | ExecStoreBufferHeapTuple(&hscan->rs_ctup, |
2324 | slot, |
2325 | hscan->rs_cbuf); |
2326 | |
2327 | hscan->rs_cindex++; |
2328 | |
2329 | return true; |
2330 | } |
2331 | |
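/*
 * Select and read the next block of a TABLESAMPLE scan, either by asking the
 * tablesample method (NextSampleBlock) or by advancing sequentially through
 * the relation.  Returns false, releasing the current buffer pin, once the
 * scan is complete.
 */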
2332 | static bool |
2333 | heapam_scan_sample_next_block(TableScanDesc scan, SampleScanState *scanstate) |
2334 | { |
2335 | HeapScanDesc hscan = (HeapScanDesc) scan; |
2336 | TsmRoutine *tsm = scanstate->tsmroutine; |
2337 | BlockNumber blockno; |
2338 | |
2339 | /* return false immediately if relation is empty */ |
2340 | if (hscan->rs_nblocks == 0) |
2341 | return false; |
2342 | |
2343 | if (tsm->NextSampleBlock) |
2344 | { |
2345 | blockno = tsm->NextSampleBlock(scanstate, hscan->rs_nblocks); |
2346 | hscan->rs_cblock = blockno; |
2347 | } |
2348 | else |
2349 | { |
2350 | /* scanning table sequentially */ |
2351 | |
2352 | if (hscan->rs_cblock == InvalidBlockNumber) |
2353 | { |
2354 | Assert(!hscan->rs_inited); |
2355 | blockno = hscan->rs_startblock; |
2356 | } |
2357 | else |
2358 | { |
2359 | Assert(hscan->rs_inited); |
2360 | |
2361 | blockno = hscan->rs_cblock + 1; |
2362 | |
2363 | if (blockno >= hscan->rs_nblocks) |
2364 | { |
2365 | /* wrap to beginning of rel, might not have started at 0 */ |
2366 | blockno = 0; |
2367 | } |
2368 | |
2369 | /* |
2370 | * Report our new scan position for synchronization purposes. |
2371 | * |
2372 | * Note: we do this before checking for end of scan so that the |
2373 | * final state of the position hint is back at the start of the |
2374 | * rel. That's not strictly necessary, but otherwise when you run |
2375 | * the same query multiple times the starting position would shift |
2376 | * a little bit backwards on every invocation, which is confusing. |
2377 | * We don't guarantee any specific ordering in general, though. |
2378 | */ |
2379 | if (scan->rs_flags & SO_ALLOW_SYNC) |
2380 | ss_report_location(scan->rs_rd, blockno); |
2381 | |
2382 | if (blockno == hscan->rs_startblock) |
2383 | { |
2384 | blockno = InvalidBlockNumber; |
2385 | } |
2386 | } |
2387 | } |
2388 | |
2389 | if (!BlockNumberIsValid(blockno)) |
2390 | { |
2391 | if (BufferIsValid(hscan->rs_cbuf)) |
2392 | ReleaseBuffer(hscan->rs_cbuf); |
2393 | hscan->rs_cbuf = InvalidBuffer; |
2394 | hscan->rs_cblock = InvalidBlockNumber; |
2395 | hscan->rs_inited = false; |
2396 | |
2397 | return false; |
2398 | } |
2399 | |
2400 | heapgetpage(scan, blockno); |
2401 | hscan->rs_inited = true; |
2402 | |
2403 | return true; |
2404 | } |
2405 | |
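/*
 * Fetch the next tuple chosen by the tablesample method from the current
 * sample block, check its visibility, and store it in the given slot.
 * Returns false when the tablesample method has no more tuples to return
 * from this block.
 */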
2406 | static bool |
2407 | heapam_scan_sample_next_tuple(TableScanDesc scan, SampleScanState *scanstate, |
2408 | TupleTableSlot *slot) |
2409 | { |
2410 | HeapScanDesc hscan = (HeapScanDesc) scan; |
2411 | TsmRoutine *tsm = scanstate->tsmroutine; |
2412 | BlockNumber blockno = hscan->rs_cblock; |
2413 | bool pagemode = (scan->rs_flags & SO_ALLOW_PAGEMODE) != 0; |
2414 | |
2415 | Page page; |
2416 | bool all_visible; |
2417 | OffsetNumber maxoffset; |
2418 | |
2419 | /* |
2420 | * When not using pagemode, we must lock the buffer during tuple |
2421 | * visibility checks. |
2422 | */ |
2423 | if (!pagemode) |
2424 | LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE); |
2425 | |
2426 | page = (Page) BufferGetPage(hscan->rs_cbuf); |
2427 | all_visible = PageIsAllVisible(page) && |
2428 | !scan->rs_snapshot->takenDuringRecovery; |
2429 | maxoffset = PageGetMaxOffsetNumber(page); |
2430 | |
2431 | for (;;) |
2432 | { |
2433 | OffsetNumber tupoffset; |
2434 | |
2435 | CHECK_FOR_INTERRUPTS(); |
2436 | |
2437 | /* Ask the tablesample method which tuples to check on this page. */ |
2438 | tupoffset = tsm->NextSampleTuple(scanstate, |
2439 | blockno, |
2440 | maxoffset); |
2441 | |
2442 | if (OffsetNumberIsValid(tupoffset)) |
2443 | { |
2444 | ItemId itemid; |
2445 | bool visible; |
2446 | HeapTuple tuple = &(hscan->rs_ctup); |
2447 | |
2448 | /* Skip invalid tuple pointers. */ |
2449 | itemid = PageGetItemId(page, tupoffset); |
2450 | if (!ItemIdIsNormal(itemid)) |
2451 | continue; |
2452 | |
2453 | tuple->t_data = (HeapTupleHeader) PageGetItem(page, itemid); |
2454 | tuple->t_len = ItemIdGetLength(itemid); |
2455 | ItemPointerSet(&(tuple->t_self), blockno, tupoffset); |
2456 | |
2457 | |
2458 | if (all_visible) |
2459 | visible = true; |
2460 | else |
2461 | visible = SampleHeapTupleVisible(scan, hscan->rs_cbuf, |
2462 | tuple, tupoffset); |
2463 | |
2464 | /* in pagemode, heapgetpage did this for us */ |
2465 | if (!pagemode) |
2466 | CheckForSerializableConflictOut(visible, scan->rs_rd, tuple, |
2467 | hscan->rs_cbuf, scan->rs_snapshot); |
2468 | |
2469 | /* Try next tuple from same page. */ |
2470 | if (!visible) |
2471 | continue; |
2472 | |
2473 | /* Found visible tuple, return it. */ |
2474 | if (!pagemode) |
2475 | LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); |
2476 | |
2477 | ExecStoreBufferHeapTuple(tuple, slot, hscan->rs_cbuf); |
2478 | |
2479 | /* Count successfully-fetched tuples as heap fetches */ |
2480 | pgstat_count_heap_getnext(scan->rs_rd); |
2481 | |
2482 | return true; |
2483 | } |
2484 | else |
2485 | { |
2486 | /* |
2487 | * If we get here, it means we've exhausted the items on this page |
2488 | * and it's time to move to the next. |
2489 | */ |
2490 | if (!pagemode) |
2491 | LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); |
2492 | |
2493 | ExecClearTuple(slot); |
2494 | return false; |
2495 | } |
2496 | } |
2497 | |
2498 | Assert(0); |
2499 | } |
2500 | |
2501 | |
2502 | /* ---------------------------------------------------------------------------- |
2503 | * Helper functions for the above. |
2504 | * ---------------------------------------------------------------------------- |
2505 | */ |
2506 | |
2507 | /* |
2508 | * Reconstruct and rewrite the given tuple |
2509 | * |
2510 | * We cannot simply copy the tuple as-is, for several reasons: |
2511 | * |
2512 | * 1. We'd like to squeeze out the values of any dropped columns, both |
2513 | * to save space and to ensure we have no corner-case failures. (It's |
2514 | * possible for example that the new table hasn't got a TOAST table |
2515 | * and so is unable to store any large values of dropped cols.) |
2516 | * |
2517 | * 2. The tuple might not even be legal for the new table; this is |
2518 | * currently only known to happen as an after-effect of ALTER TABLE |
2519 | * SET WITHOUT OIDS. |
2520 | * |
2521 | * So, we must reconstruct the tuple from component Datums. |
2522 | */ |
2523 | static void |
2524 | reform_and_rewrite_tuple(HeapTuple tuple, |
2525 | Relation OldHeap, Relation NewHeap, |
2526 | Datum *values, bool *isnull, RewriteState rwstate) |
2527 | { |
2528 | TupleDesc oldTupDesc = RelationGetDescr(OldHeap); |
2529 | TupleDesc newTupDesc = RelationGetDescr(NewHeap); |
2530 | HeapTuple copiedTuple; |
2531 | int i; |
2532 | |
2533 | heap_deform_tuple(tuple, oldTupDesc, values, isnull); |
2534 | |
2535 | /* Be sure to null out any dropped columns */ |
2536 | for (i = 0; i < newTupDesc->natts; i++) |
2537 | { |
2538 | if (TupleDescAttr(newTupDesc, i)->attisdropped) |
2539 | isnull[i] = true; |
2540 | } |
2541 | |
2542 | copiedTuple = heap_form_tuple(newTupDesc, values, isnull); |
2543 | |
2544 | /* The heap rewrite module does the rest */ |
2545 | rewrite_heap_tuple(rwstate, tuple, copiedTuple); |
2546 | |
2547 | heap_freetuple(copiedTuple); |
2548 | } |
2549 | |
2550 | /* |
2551 | * Check visibility of the tuple. |
2552 | */ |
2553 | static bool |
2554 | SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer, |
2555 | HeapTuple tuple, |
2556 | OffsetNumber tupoffset) |
2557 | { |
2558 | HeapScanDesc hscan = (HeapScanDesc) scan; |
2559 | |
2560 | if (scan->rs_flags & SO_ALLOW_PAGEMODE) |
2561 | { |
2562 | /* |
		 * In page-at-a-time mode, heapgetpage() already did visibility checks,
2564 | * so just look at the info it left in rs_vistuples[]. |
2565 | * |
2566 | * We use a binary search over the known-sorted array. Note: we could |
2567 | * save some effort if we insisted that NextSampleTuple select tuples |
2568 | * in increasing order, but it's not clear that there would be enough |
2569 | * gain to justify the restriction. |
2570 | */ |
2571 | int start = 0, |
2572 | end = hscan->rs_ntuples - 1; |
2573 | |
2574 | while (start <= end) |
2575 | { |
2576 | int mid = (start + end) / 2; |
2577 | OffsetNumber curoffset = hscan->rs_vistuples[mid]; |
2578 | |
2579 | if (tupoffset == curoffset) |
2580 | return true; |
2581 | else if (tupoffset < curoffset) |
2582 | end = mid - 1; |
2583 | else |
2584 | start = mid + 1; |
2585 | } |
2586 | |
2587 | return false; |
2588 | } |
2589 | else |
2590 | { |
2591 | /* Otherwise, we have to check the tuple individually. */ |
2592 | return HeapTupleSatisfiesVisibility(tuple, scan->rs_snapshot, |
2593 | buffer); |
2594 | } |
2595 | } |
2596 | |
2597 | |
2598 | /* ------------------------------------------------------------------------ |
2599 | * Definition of the heap table access method. |
2600 | * ------------------------------------------------------------------------ |
2601 | */ |
2602 | |
2603 | static const TableAmRoutine heapam_methods = { |
2604 | .type = T_TableAmRoutine, |
2605 | |
2606 | .slot_callbacks = heapam_slot_callbacks, |
2607 | |
2608 | .scan_begin = heap_beginscan, |
2609 | .scan_end = heap_endscan, |
2610 | .scan_rescan = heap_rescan, |
2611 | .scan_getnextslot = heap_getnextslot, |
2612 | |
2613 | .parallelscan_estimate = table_block_parallelscan_estimate, |
2614 | .parallelscan_initialize = table_block_parallelscan_initialize, |
2615 | .parallelscan_reinitialize = table_block_parallelscan_reinitialize, |
2616 | |
2617 | .index_fetch_begin = heapam_index_fetch_begin, |
2618 | .index_fetch_reset = heapam_index_fetch_reset, |
2619 | .index_fetch_end = heapam_index_fetch_end, |
2620 | .index_fetch_tuple = heapam_index_fetch_tuple, |
2621 | |
2622 | .tuple_insert = heapam_tuple_insert, |
2623 | .tuple_insert_speculative = heapam_tuple_insert_speculative, |
2624 | .tuple_complete_speculative = heapam_tuple_complete_speculative, |
2625 | .multi_insert = heap_multi_insert, |
2626 | .tuple_delete = heapam_tuple_delete, |
2627 | .tuple_update = heapam_tuple_update, |
2628 | .tuple_lock = heapam_tuple_lock, |
2629 | .finish_bulk_insert = heapam_finish_bulk_insert, |
2630 | |
2631 | .tuple_fetch_row_version = heapam_fetch_row_version, |
2632 | .tuple_get_latest_tid = heap_get_latest_tid, |
2633 | .tuple_tid_valid = heapam_tuple_tid_valid, |
2634 | .tuple_satisfies_snapshot = heapam_tuple_satisfies_snapshot, |
2635 | .compute_xid_horizon_for_tuples = heap_compute_xid_horizon_for_tuples, |
2636 | |
2637 | .relation_set_new_filenode = heapam_relation_set_new_filenode, |
2638 | .relation_nontransactional_truncate = heapam_relation_nontransactional_truncate, |
2639 | .relation_copy_data = heapam_relation_copy_data, |
2640 | .relation_copy_for_cluster = heapam_relation_copy_for_cluster, |
2641 | .relation_vacuum = heap_vacuum_rel, |
2642 | .scan_analyze_next_block = heapam_scan_analyze_next_block, |
2643 | .scan_analyze_next_tuple = heapam_scan_analyze_next_tuple, |
2644 | .index_build_range_scan = heapam_index_build_range_scan, |
2645 | .index_validate_scan = heapam_index_validate_scan, |
2646 | |
2647 | .relation_size = heapam_relation_size, |
2648 | .relation_needs_toast_table = heapam_relation_needs_toast_table, |
2649 | |
2650 | .relation_estimate_size = heapam_estimate_rel_size, |
2651 | |
2652 | .scan_bitmap_next_block = heapam_scan_bitmap_next_block, |
2653 | .scan_bitmap_next_tuple = heapam_scan_bitmap_next_tuple, |
2654 | .scan_sample_next_block = heapam_scan_sample_next_block, |
2655 | .scan_sample_next_tuple = heapam_scan_sample_next_tuple |
2656 | }; |
2657 | |
2658 | |
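/*
 * Return a pointer to the heap table access method's TableAmRoutine, for
 * code that needs it directly rather than via the pg_am handler lookup.
 */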
2659 | const TableAmRoutine * |
2660 | GetHeapamTableAmRoutine(void) |
2661 | { |
2662 | return &heapam_methods; |
2663 | } |
2664 | |
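/*
 * Handler function for the heap table access method; this is what the
 * "heap" row in pg_am points at.  Tables are directed at this AM either via
 * default_table_access_method or explicitly, for example:
 *
 *		CREATE TABLE t (id int) USING heap;
 */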
2665 | Datum |
2666 | heap_tableam_handler(PG_FUNCTION_ARGS) |
2667 | { |
2668 | PG_RETURN_POINTER(&heapam_methods); |
2669 | } |
2670 | |