1/*-------------------------------------------------------------------------
2 *
3 * cluster.c
4 * CLUSTER a table on an index. This is now also used for VACUUM FULL.
5 *
6 * There is hardly anything left of Paul Brown's original implementation...
7 *
8 *
9 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
10 * Portions Copyright (c) 1994-5, Regents of the University of California
11 *
12 *
13 * IDENTIFICATION
14 * src/backend/commands/cluster.c
15 *
16 *-------------------------------------------------------------------------
17 */
18#include "postgres.h"
19
20#include "access/amapi.h"
21#include "access/heapam.h"
22#include "access/multixact.h"
23#include "access/relscan.h"
24#include "access/tableam.h"
25#include "access/transam.h"
26#include "access/tuptoaster.h"
27#include "access/xact.h"
28#include "access/xlog.h"
29#include "catalog/pg_am.h"
30#include "catalog/catalog.h"
31#include "catalog/dependency.h"
32#include "catalog/heap.h"
33#include "catalog/index.h"
34#include "catalog/namespace.h"
35#include "catalog/objectaccess.h"
36#include "catalog/toasting.h"
37#include "commands/cluster.h"
38#include "commands/progress.h"
39#include "commands/tablecmds.h"
40#include "commands/vacuum.h"
41#include "miscadmin.h"
42#include "optimizer/optimizer.h"
43#include "pgstat.h"
44#include "storage/bufmgr.h"
45#include "storage/lmgr.h"
46#include "storage/predicate.h"
47#include "utils/acl.h"
48#include "utils/fmgroids.h"
49#include "utils/inval.h"
50#include "utils/lsyscache.h"
51#include "utils/memutils.h"
52#include "utils/pg_rusage.h"
53#include "utils/relmapper.h"
54#include "utils/snapmgr.h"
55#include "utils/syscache.h"
56#include "utils/tuplesort.h"
57
58
59/*
60 * This struct is used to pass around the information on tables to be
61 * clustered. We need this so we can make a list of them when invoked without
62 * a specific table/index pair.
63 */
typedef struct
{
	Oid			tableOid;		/* OID of the table to be clustered */
	Oid			indexOid;		/* OID of the index to cluster that table on */
} RelToCluster;
69
70
71static void rebuild_relation(Relation OldHeap, Oid indexOid, bool verbose);
72static void copy_table_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex,
73 bool verbose, bool *pSwapToastByContent,
74 TransactionId *pFreezeXid, MultiXactId *pCutoffMulti);
75static List *get_tables_to_cluster(MemoryContext cluster_context);
76
77
78/*---------------------------------------------------------------------------
79 * This cluster code allows for clustering multiple tables at once. Because
 * of this, we cannot just run everything in a single transaction, or we
81 * would be forced to acquire exclusive locks on all the tables being
82 * clustered, simultaneously --- very likely leading to deadlock.
83 *
84 * To solve this we follow a similar strategy to VACUUM code,
85 * clustering each relation in a separate transaction. For this to work,
86 * we need to:
87 * - provide a separate memory context so that we can pass information in
88 * a way that survives across transactions
89 * - start a new transaction every time a new relation is clustered
90 * - check for validity of the information on to-be-clustered relations,
91 * as someone might have deleted a relation behind our back, or
92 * clustered one on a different index
93 * - end the transaction
94 *
95 * The single-relation case does not have any such overhead.
96 *
97 * We also allow a relation to be specified without index. In that case,
98 * the indisclustered bit will be looked up, and an ERROR will be thrown
99 * if there is no index with the bit set.
100 *---------------------------------------------------------------------------
101 */
102void
103cluster(ClusterStmt *stmt, bool isTopLevel)
104{
105 if (stmt->relation != NULL)
106 {
107 /* This is the single-relation case. */
108 Oid tableOid,
109 indexOid = InvalidOid;
110 Relation rel;
111
112 /* Find, lock, and check permissions on the table */
113 tableOid = RangeVarGetRelidExtended(stmt->relation,
114 AccessExclusiveLock,
115 0,
116 RangeVarCallbackOwnsTable, NULL);
117 rel = table_open(tableOid, NoLock);
118
119 /*
120 * Reject clustering a remote temp table ... their local buffer
121 * manager is not going to cope.
122 */
123 if (RELATION_IS_OTHER_TEMP(rel))
124 ereport(ERROR,
125 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
126 errmsg("cannot cluster temporary tables of other sessions")));
127
128 /*
129 * Reject clustering a partitioned table.
130 */
131 if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
132 ereport(ERROR,
133 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
134 errmsg("cannot cluster a partitioned table")));
135
136 if (stmt->indexname == NULL)
137 {
138 ListCell *index;
139
140 /* We need to find the index that has indisclustered set. */
141 foreach(index, RelationGetIndexList(rel))
142 {
143 HeapTuple idxtuple;
144 Form_pg_index indexForm;
145
146 indexOid = lfirst_oid(index);
147 idxtuple = SearchSysCache1(INDEXRELID,
148 ObjectIdGetDatum(indexOid));
149 if (!HeapTupleIsValid(idxtuple))
150 elog(ERROR, "cache lookup failed for index %u", indexOid);
151 indexForm = (Form_pg_index) GETSTRUCT(idxtuple);
152 if (indexForm->indisclustered)
153 {
154 ReleaseSysCache(idxtuple);
155 break;
156 }
157 ReleaseSysCache(idxtuple);
158 indexOid = InvalidOid;
159 }
160
161 if (!OidIsValid(indexOid))
162 ereport(ERROR,
163 (errcode(ERRCODE_UNDEFINED_OBJECT),
164 errmsg("there is no previously clustered index for table \"%s\"",
165 stmt->relation->relname)));
166 }
167 else
168 {
169 /*
170 * The index is expected to be in the same namespace as the
171 * relation.
172 */
173 indexOid = get_relname_relid(stmt->indexname,
174 rel->rd_rel->relnamespace);
175 if (!OidIsValid(indexOid))
176 ereport(ERROR,
177 (errcode(ERRCODE_UNDEFINED_OBJECT),
178 errmsg("index \"%s\" for table \"%s\" does not exist",
179 stmt->indexname, stmt->relation->relname)));
180 }
181
182 /* close relation, keep lock till commit */
183 table_close(rel, NoLock);
184
185 /* Do the job. */
186 cluster_rel(tableOid, indexOid, stmt->options);
187 }
188 else
189 {
190 /*
191 * This is the "multi relation" case. We need to cluster all tables
192 * that have some index with indisclustered set.
193 */
194 MemoryContext cluster_context;
195 List *rvs;
196 ListCell *rv;
197
198 /*
199 * We cannot run this form of CLUSTER inside a user transaction block;
200 * we'd be holding locks way too long.
201 */
202 PreventInTransactionBlock(isTopLevel, "CLUSTER");
203
204 /*
205 * Create special memory context for cross-transaction storage.
206 *
207 * Since it is a child of PortalContext, it will go away even in case
208 * of error.
209 */
210 cluster_context = AllocSetContextCreate(PortalContext,
211 "Cluster",
212 ALLOCSET_DEFAULT_SIZES);
213
214 /*
215 * Build the list of relations to cluster. Note that this lives in
216 * cluster_context.
217 */
218 rvs = get_tables_to_cluster(cluster_context);
219
220 /* Commit to get out of starting transaction */
221 PopActiveSnapshot();
222 CommitTransactionCommand();
223
224 /* Ok, now that we've got them all, cluster them one by one */
225 foreach(rv, rvs)
226 {
227 RelToCluster *rvtc = (RelToCluster *) lfirst(rv);
228
229 /* Start a new transaction for each relation. */
230 StartTransactionCommand();
231 /* functions in indexes may want a snapshot set */
232 PushActiveSnapshot(GetTransactionSnapshot());
233 /* Do the job. */
234 cluster_rel(rvtc->tableOid, rvtc->indexOid,
235 stmt->options | CLUOPT_RECHECK);
236 PopActiveSnapshot();
237 CommitTransactionCommand();
238 }
239
240 /* Start a new transaction for the cleanup work. */
241 StartTransactionCommand();
242
243 /* Clean up working storage */
244 MemoryContextDelete(cluster_context);
245 }
246}
247
248/*
249 * cluster_rel
250 *
251 * This clusters the table by creating a new, clustered table and
252 * swapping the relfilenodes of the new table and the old table, so
253 * the OID of the original table is preserved. Thus we do not lose
254 * GRANT, inheritance nor references to this table (this was a bug
255 * in releases through 7.3).
256 *
257 * Indexes are rebuilt too, via REINDEX. Since we are effectively bulk-loading
258 * the new table, it's better to create the indexes afterwards than to fill
259 * them incrementally while we load the table.
260 *
261 * If indexOid is InvalidOid, the table will be rewritten in physical order
262 * instead of index order. This is the new implementation of VACUUM FULL,
263 * and error messages should refer to the operation as VACUUM not CLUSTER.
264 */
265void
266cluster_rel(Oid tableOid, Oid indexOid, int options)
267{
268 Relation OldHeap;
269 bool verbose = ((options & CLUOPT_VERBOSE) != 0);
270 bool recheck = ((options & CLUOPT_RECHECK) != 0);
271
272 /* Check for user-requested abort. */
273 CHECK_FOR_INTERRUPTS();
274
275 pgstat_progress_start_command(PROGRESS_COMMAND_CLUSTER, tableOid);
276 if (OidIsValid(indexOid))
277 pgstat_progress_update_param(PROGRESS_CLUSTER_COMMAND,
278 PROGRESS_CLUSTER_COMMAND_CLUSTER);
279 else
280 pgstat_progress_update_param(PROGRESS_CLUSTER_COMMAND,
281 PROGRESS_CLUSTER_COMMAND_VACUUM_FULL);
282
283 /*
284 * We grab exclusive access to the target rel and index for the duration
285 * of the transaction. (This is redundant for the single-transaction
286 * case, since cluster() already did it.) The index lock is taken inside
287 * check_index_is_clusterable.
288 */
289 OldHeap = try_relation_open(tableOid, AccessExclusiveLock);
290
291 /* If the table has gone away, we can skip processing it */
292 if (!OldHeap)
293 {
294 pgstat_progress_end_command();
295 return;
296 }
297
298 /*
299 * Since we may open a new transaction for each relation, we have to check
300 * that the relation still is what we think it is.
301 *
302 * If this is a single-transaction CLUSTER, we can skip these tests. We
303 * *must* skip the one on indisclustered since it would reject an attempt
304 * to cluster a not-previously-clustered index.
305 */
306 if (recheck)
307 {
308 HeapTuple tuple;
309 Form_pg_index indexForm;
310
311 /* Check that the user still owns the relation */
312 if (!pg_class_ownercheck(tableOid, GetUserId()))
313 {
314 relation_close(OldHeap, AccessExclusiveLock);
315 pgstat_progress_end_command();
316 return;
317 }
318
319 /*
320 * Silently skip a temp table for a remote session. Only doing this
321 * check in the "recheck" case is appropriate (which currently means
322 * somebody is executing a database-wide CLUSTER), because there is
323 * another check in cluster() which will stop any attempt to cluster
324 * remote temp tables by name. There is another check in cluster_rel
325 * which is redundant, but we leave it for extra safety.
326 */
327 if (RELATION_IS_OTHER_TEMP(OldHeap))
328 {
329 relation_close(OldHeap, AccessExclusiveLock);
330 pgstat_progress_end_command();
331 return;
332 }
333
334 if (OidIsValid(indexOid))
335 {
336 /*
337 * Check that the index still exists
338 */
339 if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(indexOid)))
340 {
341 relation_close(OldHeap, AccessExclusiveLock);
342 pgstat_progress_end_command();
343 return;
344 }
345
346 /*
347 * Check that the index is still the one with indisclustered set.
348 */
349 tuple = SearchSysCache1(INDEXRELID, ObjectIdGetDatum(indexOid));
350 if (!HeapTupleIsValid(tuple)) /* probably can't happen */
351 {
352 relation_close(OldHeap, AccessExclusiveLock);
353 pgstat_progress_end_command();
354 return;
355 }
356 indexForm = (Form_pg_index) GETSTRUCT(tuple);
357 if (!indexForm->indisclustered)
358 {
359 ReleaseSysCache(tuple);
360 relation_close(OldHeap, AccessExclusiveLock);
361 pgstat_progress_end_command();
362 return;
363 }
364 ReleaseSysCache(tuple);
365 }
366 }
367
368 /*
369 * We allow VACUUM FULL, but not CLUSTER, on shared catalogs. CLUSTER
370 * would work in most respects, but the index would only get marked as
371 * indisclustered in the current database, leading to unexpected behavior
372 * if CLUSTER were later invoked in another database.
373 */
374 if (OidIsValid(indexOid) && OldHeap->rd_rel->relisshared)
375 ereport(ERROR,
376 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
377 errmsg("cannot cluster a shared catalog")));
378
379 /*
380 * Don't process temp tables of other backends ... their local buffer
381 * manager is not going to cope.
382 */
383 if (RELATION_IS_OTHER_TEMP(OldHeap))
384 {
385 if (OidIsValid(indexOid))
386 ereport(ERROR,
387 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
388 errmsg("cannot cluster temporary tables of other sessions")));
389 else
390 ereport(ERROR,
391 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
392 errmsg("cannot vacuum temporary tables of other sessions")));
393 }
394
395 /*
396 * Also check for active uses of the relation in the current transaction,
397 * including open scans and pending AFTER trigger events.
398 */
399 CheckTableNotInUse(OldHeap, OidIsValid(indexOid) ? "CLUSTER" : "VACUUM");
400
401 /* Check heap and index are valid to cluster on */
402 if (OidIsValid(indexOid))
403 check_index_is_clusterable(OldHeap, indexOid, recheck, AccessExclusiveLock);
404
405 /*
406 * Quietly ignore the request if this is a materialized view which has not
407 * been populated from its query. No harm is done because there is no data
408 * to deal with, and we don't want to throw an error if this is part of a
409 * multi-relation request -- for example, CLUSTER was run on the entire
410 * database.
411 */
412 if (OldHeap->rd_rel->relkind == RELKIND_MATVIEW &&
413 !RelationIsPopulated(OldHeap))
414 {
415 relation_close(OldHeap, AccessExclusiveLock);
416 pgstat_progress_end_command();
417 return;
418 }
419
420 /*
421 * All predicate locks on the tuples or pages are about to be made
422 * invalid, because we move tuples around. Promote them to relation
423 * locks. Predicate locks on indexes will be promoted when they are
424 * reindexed.
425 */
426 TransferPredicateLocksToHeapRelation(OldHeap);
427
428 /* rebuild_relation does all the dirty work */
429 rebuild_relation(OldHeap, indexOid, verbose);
430
431 /* NB: rebuild_relation does table_close() on OldHeap */
432
433 pgstat_progress_end_command();
434}
435
436/*
437 * Verify that the specified heap and index are valid to cluster on
438 *
439 * Side effect: obtains lock on the index. The caller may
440 * in some cases already have AccessExclusiveLock on the table, but
441 * not in all cases so we can't rely on the table-level lock for
442 * protection here.
443 */
444void
445check_index_is_clusterable(Relation OldHeap, Oid indexOid, bool recheck, LOCKMODE lockmode)
446{
447 Relation OldIndex;
448
449 OldIndex = index_open(indexOid, lockmode);
450
451 /*
452 * Check that index is in fact an index on the given relation
453 */
454 if (OldIndex->rd_index == NULL ||
455 OldIndex->rd_index->indrelid != RelationGetRelid(OldHeap))
456 ereport(ERROR,
457 (errcode(ERRCODE_WRONG_OBJECT_TYPE),
458 errmsg("\"%s\" is not an index for table \"%s\"",
459 RelationGetRelationName(OldIndex),
460 RelationGetRelationName(OldHeap))));
461
462 /* Index AM must allow clustering */
463 if (!OldIndex->rd_indam->amclusterable)
464 ereport(ERROR,
465 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
466 errmsg("cannot cluster on index \"%s\" because access method does not support clustering",
467 RelationGetRelationName(OldIndex))));
468
469 /*
470 * Disallow clustering on incomplete indexes (those that might not index
471 * every row of the relation). We could relax this by making a separate
472 * seqscan pass over the table to copy the missing rows, but that seems
473 * expensive and tedious.
474 */
475 if (!heap_attisnull(OldIndex->rd_indextuple, Anum_pg_index_indpred, NULL))
476 ereport(ERROR,
477 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
478 errmsg("cannot cluster on partial index \"%s\"",
479 RelationGetRelationName(OldIndex))));
480
481 /*
482 * Disallow if index is left over from a failed CREATE INDEX CONCURRENTLY;
483 * it might well not contain entries for every heap row, or might not even
484 * be internally consistent. (But note that we don't check indcheckxmin;
485 * the worst consequence of following broken HOT chains would be that we
486 * might put recently-dead tuples out-of-order in the new table, and there
487 * is little harm in that.)
488 */
489 if (!OldIndex->rd_index->indisvalid)
490 ereport(ERROR,
491 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
492 errmsg("cannot cluster on invalid index \"%s\"",
493 RelationGetRelationName(OldIndex))));
494
495 /* Drop relcache refcnt on OldIndex, but keep lock */
496 index_close(OldIndex, NoLock);
497}
498
499/*
500 * mark_index_clustered: mark the specified index as the one clustered on
501 *
502 * With indexOid == InvalidOid, will mark all indexes of rel not-clustered.
503 */
504void
505mark_index_clustered(Relation rel, Oid indexOid, bool is_internal)
506{
507 HeapTuple indexTuple;
508 Form_pg_index indexForm;
509 Relation pg_index;
510 ListCell *index;
511
512 /* Disallow applying to a partitioned table */
513 if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
514 ereport(ERROR,
515 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
516 errmsg("cannot mark index clustered in partitioned table")));
517
518 /*
519 * If the index is already marked clustered, no need to do anything.
520 */
521 if (OidIsValid(indexOid))
522 {
523 indexTuple = SearchSysCache1(INDEXRELID, ObjectIdGetDatum(indexOid));
524 if (!HeapTupleIsValid(indexTuple))
525 elog(ERROR, "cache lookup failed for index %u", indexOid);
526 indexForm = (Form_pg_index) GETSTRUCT(indexTuple);
527
528 if (indexForm->indisclustered)
529 {
530 ReleaseSysCache(indexTuple);
531 return;
532 }
533
534 ReleaseSysCache(indexTuple);
535 }
536
537 /*
538 * Check each index of the relation and set/clear the bit as needed.
539 */
540 pg_index = table_open(IndexRelationId, RowExclusiveLock);
541
542 foreach(index, RelationGetIndexList(rel))
543 {
544 Oid thisIndexOid = lfirst_oid(index);
545
546 indexTuple = SearchSysCacheCopy1(INDEXRELID,
547 ObjectIdGetDatum(thisIndexOid));
548 if (!HeapTupleIsValid(indexTuple))
549 elog(ERROR, "cache lookup failed for index %u", thisIndexOid);
550 indexForm = (Form_pg_index) GETSTRUCT(indexTuple);
551
552 /*
553 * Unset the bit if set. We know it's wrong because we checked this
554 * earlier.
555 */
556 if (indexForm->indisclustered)
557 {
558 indexForm->indisclustered = false;
559 CatalogTupleUpdate(pg_index, &indexTuple->t_self, indexTuple);
560 }
561 else if (thisIndexOid == indexOid)
562 {
563 /* this was checked earlier, but let's be real sure */
564 if (!indexForm->indisvalid)
565 elog(ERROR, "cannot cluster on invalid index %u", indexOid);
566 indexForm->indisclustered = true;
567 CatalogTupleUpdate(pg_index, &indexTuple->t_self, indexTuple);
568 }
569
570 InvokeObjectPostAlterHookArg(IndexRelationId, thisIndexOid, 0,
571 InvalidOid, is_internal);
572
573 heap_freetuple(indexTuple);
574 }
575
576 table_close(pg_index, RowExclusiveLock);
577}
578
579/*
580 * rebuild_relation: rebuild an existing relation in index or physical order
581 *
582 * OldHeap: table to rebuild --- must be opened and exclusive-locked!
583 * indexOid: index to cluster by, or InvalidOid to rewrite in physical order.
584 *
585 * NB: this routine closes OldHeap at the right time; caller should not.
586 */
587static void
588rebuild_relation(Relation OldHeap, Oid indexOid, bool verbose)
589{
590 Oid tableOid = RelationGetRelid(OldHeap);
591 Oid tableSpace = OldHeap->rd_rel->reltablespace;
592 Oid OIDNewHeap;
593 char relpersistence;
594 bool is_system_catalog;
595 bool swap_toast_by_content;
596 TransactionId frozenXid;
597 MultiXactId cutoffMulti;
598
599 /* Mark the correct index as clustered */
600 if (OidIsValid(indexOid))
601 mark_index_clustered(OldHeap, indexOid, true);
602
603 /* Remember info about rel before closing OldHeap */
604 relpersistence = OldHeap->rd_rel->relpersistence;
605 is_system_catalog = IsSystemRelation(OldHeap);
606
607 /* Close relcache entry, but keep lock until transaction commit */
608 table_close(OldHeap, NoLock);
609
610 /* Create the transient table that will receive the re-ordered data */
611 OIDNewHeap = make_new_heap(tableOid, tableSpace,
612 relpersistence,
613 AccessExclusiveLock);
614
615 /* Copy the heap data into the new table in the desired order */
616 copy_table_data(OIDNewHeap, tableOid, indexOid, verbose,
617 &swap_toast_by_content, &frozenXid, &cutoffMulti);
618
619 /*
620 * Swap the physical files of the target and transient tables, then
621 * rebuild the target's indexes and throw away the transient table.
622 */
623 finish_heap_swap(tableOid, OIDNewHeap, is_system_catalog,
624 swap_toast_by_content, false, true,
625 frozenXid, cutoffMulti,
626 relpersistence);
627}
628
629
630/*
631 * Create the transient table that will be filled with new data during
632 * CLUSTER, ALTER TABLE, and similar operations. The transient table
633 * duplicates the logical structure of the OldHeap, but is placed in
634 * NewTableSpace which might be different from OldHeap's. Also, it's built
635 * with the specified persistence, which might differ from the original's.
636 *
637 * After this, the caller should load the new heap with transferred/modified
638 * data, then call finish_heap_swap to complete the operation.
639 */
640Oid
641make_new_heap(Oid OIDOldHeap, Oid NewTableSpace, char relpersistence,
642 LOCKMODE lockmode)
643{
644 TupleDesc OldHeapDesc;
645 char NewHeapName[NAMEDATALEN];
646 Oid OIDNewHeap;
647 Oid toastid;
648 Relation OldHeap;
649 HeapTuple tuple;
650 Datum reloptions;
651 bool isNull;
652 Oid namespaceid;
653
654 OldHeap = table_open(OIDOldHeap, lockmode);
655 OldHeapDesc = RelationGetDescr(OldHeap);
656
657 /*
658 * Note that the NewHeap will not receive any of the defaults or
659 * constraints associated with the OldHeap; we don't need 'em, and there's
660 * no reason to spend cycles inserting them into the catalogs only to
661 * delete them.
662 */
663
664 /*
665 * But we do want to use reloptions of the old heap for new heap.
666 */
667 tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(OIDOldHeap));
668 if (!HeapTupleIsValid(tuple))
669 elog(ERROR, "cache lookup failed for relation %u", OIDOldHeap);
670 reloptions = SysCacheGetAttr(RELOID, tuple, Anum_pg_class_reloptions,
671 &isNull);
672 if (isNull)
673 reloptions = (Datum) 0;
674
675 if (relpersistence == RELPERSISTENCE_TEMP)
676 namespaceid = LookupCreationNamespace("pg_temp");
677 else
678 namespaceid = RelationGetNamespace(OldHeap);
679
680 /*
681 * Create the new heap, using a temporary name in the same namespace as
682 * the existing table. NOTE: there is some risk of collision with user
683 * relnames. Working around this seems more trouble than it's worth; in
684 * particular, we can't create the new heap in a different namespace from
685 * the old, or we will have problems with the TEMP status of temp tables.
686 *
687 * Note: the new heap is not a shared relation, even if we are rebuilding
688 * a shared rel. However, we do make the new heap mapped if the source is
689 * mapped. This simplifies swap_relation_files, and is absolutely
690 * necessary for rebuilding pg_class, for reasons explained there.
691 */
692 snprintf(NewHeapName, sizeof(NewHeapName), "pg_temp_%u", OIDOldHeap);
693
694 OIDNewHeap = heap_create_with_catalog(NewHeapName,
695 namespaceid,
696 NewTableSpace,
697 InvalidOid,
698 InvalidOid,
699 InvalidOid,
700 OldHeap->rd_rel->relowner,
701 OldHeap->rd_rel->relam,
702 OldHeapDesc,
703 NIL,
704 RELKIND_RELATION,
705 relpersistence,
706 false,
707 RelationIsMapped(OldHeap),
708 ONCOMMIT_NOOP,
709 reloptions,
710 false,
711 true,
712 true,
713 OIDOldHeap,
714 NULL);
715 Assert(OIDNewHeap != InvalidOid);
716
717 ReleaseSysCache(tuple);
718
719 /*
720 * Advance command counter so that the newly-created relation's catalog
721 * tuples will be visible to table_open.
722 */
723 CommandCounterIncrement();
724
725 /*
726 * If necessary, create a TOAST table for the new relation.
727 *
728 * If the relation doesn't have a TOAST table already, we can't need one
729 * for the new relation. The other way around is possible though: if some
730 * wide columns have been dropped, NewHeapCreateToastTable can decide that
731 * no TOAST table is needed for the new table.
732 *
733 * Note that NewHeapCreateToastTable ends with CommandCounterIncrement, so
734 * that the TOAST table will be visible for insertion.
735 */
736 toastid = OldHeap->rd_rel->reltoastrelid;
737 if (OidIsValid(toastid))
738 {
739 /* keep the existing toast table's reloptions, if any */
740 tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(toastid));
741 if (!HeapTupleIsValid(tuple))
742 elog(ERROR, "cache lookup failed for relation %u", toastid);
743 reloptions = SysCacheGetAttr(RELOID, tuple, Anum_pg_class_reloptions,
744 &isNull);
745 if (isNull)
746 reloptions = (Datum) 0;
747
748 NewHeapCreateToastTable(OIDNewHeap, reloptions, lockmode);
749
750 ReleaseSysCache(tuple);
751 }
752
753 table_close(OldHeap, NoLock);
754
755 return OIDNewHeap;
756}
757
758/*
759 * Do the physical copying of table data.
760 *
761 * There are three output parameters:
762 * *pSwapToastByContent is set true if toast tables must be swapped by content.
763 * *pFreezeXid receives the TransactionId used as freeze cutoff point.
764 * *pCutoffMulti receives the MultiXactId used as a cutoff point.
765 */
static void
copy_table_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose,
				bool *pSwapToastByContent, TransactionId *pFreezeXid,
				MultiXactId *pCutoffMulti)
{
	Relation	NewHeap,
				OldHeap,
				OldIndex;
	Relation	relRelation;
	HeapTuple	reltup;
	Form_pg_class relform;
	TupleDesc	oldTupDesc PG_USED_FOR_ASSERTS_ONLY;
	TupleDesc	newTupDesc PG_USED_FOR_ASSERTS_ONLY;
	TransactionId OldestXmin;
	TransactionId FreezeXid;
	MultiXactId MultiXactCutoff;
	bool		use_sort;
	double		num_tuples = 0,
				tups_vacuumed = 0,
				tups_recently_dead = 0;
	BlockNumber num_pages;
	/* Progress messages go out at INFO when verbose, otherwise at DEBUG2 */
	int			elevel = verbose ? INFO : DEBUG2;
	PGRUsage	ru0;

	/* Start resource-usage accounting; it is reported in the final message */
	pg_rusage_init(&ru0);

	/*
	 * Open the relations we need.  OldIndex stays NULL when no index OID was
	 * supplied (OIDOldIndex is invalid).
	 */
	NewHeap = table_open(OIDNewHeap, AccessExclusiveLock);
	OldHeap = table_open(OIDOldHeap, AccessExclusiveLock);
	if (OidIsValid(OIDOldIndex))
		OldIndex = index_open(OIDOldIndex, AccessExclusiveLock);
	else
		OldIndex = NULL;

	/*
	 * Their tuple descriptors should be exactly alike, but here we only need
	 * assume that they have the same number of columns.
	 */
	oldTupDesc = RelationGetDescr(OldHeap);
	newTupDesc = RelationGetDescr(NewHeap);
	Assert(newTupDesc->natts == oldTupDesc->natts);

	/*
	 * If the OldHeap has a toast table, get lock on the toast table to keep
	 * it from being vacuumed.  This is needed because autovacuum processes
	 * toast tables independently of their main tables, with no lock on the
	 * latter.  If an autovacuum were to start on the toast table after we
	 * compute our OldestXmin below, it would use a later OldestXmin, and then
	 * possibly remove as DEAD toast tuples belonging to main tuples we think
	 * are only RECENTLY_DEAD.  Then we'd fail while trying to copy those
	 * tuples.
	 *
	 * We don't need to open the toast relation here, just lock it.  The lock
	 * will be held till end of transaction.
	 */
	if (OldHeap->rd_rel->reltoastrelid)
		LockRelationOid(OldHeap->rd_rel->reltoastrelid, AccessExclusiveLock);

	/*
	 * If both tables have TOAST tables, perform toast swap by content.  It is
	 * possible that the old table has a toast table but the new one doesn't,
	 * if toastable columns have been dropped.  In that case we have to do
	 * swap by links.  This is okay because swap by content is only essential
	 * for system catalogs, and we don't support schema changes for them.
	 */
	if (OldHeap->rd_rel->reltoastrelid && NewHeap->rd_rel->reltoastrelid)
	{
		*pSwapToastByContent = true;

		/*
		 * When doing swap by content, any toast pointers written into NewHeap
		 * must use the old toast table's OID, because that's where the toast
		 * data will eventually be found.  Set this up by setting rd_toastoid.
		 * This also tells toast_save_datum() to preserve the toast value
		 * OIDs, which we want so as not to invalidate toast pointers in
		 * system catalog caches, and to avoid making multiple copies of a
		 * single toast value.
		 *
		 * Note that we must hold NewHeap open until we are done writing data,
		 * since the relcache will not guarantee to remember this setting once
		 * the relation is closed.  Also, this technique depends on the fact
		 * that no one will try to read from the NewHeap until after we've
		 * finished writing it and swapping the rels --- otherwise they could
		 * follow the toast pointers to the wrong place.  (It would actually
		 * work for values copied over from the old toast table, but not for
		 * any values that we toast which were previously not toasted.)
		 */
		NewHeap->rd_toastoid = OldHeap->rd_rel->reltoastrelid;
	}
	else
		*pSwapToastByContent = false;

	/*
	 * Compute xids used to freeze and weed out dead tuples and multixacts.
	 * Since we're going to rewrite the whole table anyway, there's no reason
	 * not to be aggressive about this.
	 */
	vacuum_set_xid_limits(OldHeap, 0, 0, 0, 0,
						  &OldestXmin, &FreezeXid, NULL, &MultiXactCutoff,
						  NULL);

	/*
	 * FreezeXid will become the table's new relfrozenxid, and that mustn't go
	 * backwards, so take the max.
	 */
	if (TransactionIdIsValid(OldHeap->rd_rel->relfrozenxid) &&
		TransactionIdPrecedes(FreezeXid, OldHeap->rd_rel->relfrozenxid))
		FreezeXid = OldHeap->rd_rel->relfrozenxid;

	/*
	 * MultiXactCutoff, similarly, shouldn't go backwards either.
	 */
	if (MultiXactIdIsValid(OldHeap->rd_rel->relminmxid) &&
		MultiXactIdPrecedes(MultiXactCutoff, OldHeap->rd_rel->relminmxid))
		MultiXactCutoff = OldHeap->rd_rel->relminmxid;

	/*
	 * Decide whether to use an indexscan or seqscan-and-optional-sort to scan
	 * the OldHeap.  We know how to use a sort to duplicate the ordering of a
	 * btree index, and will use seqscan-and-sort for that case if the planner
	 * tells us it's cheaper.  Otherwise, always indexscan if an index is
	 * provided, else plain seqscan.
	 */
	if (OldIndex != NULL && OldIndex->rd_rel->relam == BTREE_AM_OID)
		use_sort = plan_cluster_use_sort(OIDOldHeap, OIDOldIndex);
	else
		use_sort = false;

	/*
	 * Log what we're doing.  use_sort can only be true when an index was
	 * given, so the three branches are: index scan, seqscan plus sort, and
	 * the no-index case, which is reported as "vacuuming".
	 */
	if (OldIndex != NULL && !use_sort)
		ereport(elevel,
				(errmsg("clustering \"%s.%s\" using index scan on \"%s\"",
						get_namespace_name(RelationGetNamespace(OldHeap)),
						RelationGetRelationName(OldHeap),
						RelationGetRelationName(OldIndex))));
	else if (use_sort)
		ereport(elevel,
				(errmsg("clustering \"%s.%s\" using sequential scan and sort",
						get_namespace_name(RelationGetNamespace(OldHeap)),
						RelationGetRelationName(OldHeap))));
	else
		ereport(elevel,
				(errmsg("vacuuming \"%s.%s\"",
						get_namespace_name(RelationGetNamespace(OldHeap)),
						RelationGetRelationName(OldHeap))));

	/*
	 * Hand off the actual copying to the AM-specific function; the generic
	 * code cannot know how to deal with visibility across AMs.  Note that
	 * this routine is allowed to set FreezeXid / MultiXactCutoff to different
	 * values (e.g. because the AM doesn't use freezing).
	 */
	table_relation_copy_for_cluster(OldHeap, NewHeap, OldIndex, use_sort,
									OldestXmin, &FreezeXid, &MultiXactCutoff,
									&num_tuples, &tups_vacuumed,
									&tups_recently_dead);

	/* return selected values to caller, get set as relfrozenxid/minmxid */
	*pFreezeXid = FreezeXid;
	*pCutoffMulti = MultiXactCutoff;

	/* Reset rd_toastoid just to be tidy --- it shouldn't be looked at again */
	NewHeap->rd_toastoid = InvalidOid;

	/* Size of the new heap; stored into its pg_class row further down */
	num_pages = RelationGetNumberOfBlocks(NewHeap);

	/*
	 * Log what we did.  Note that the page count in this message is taken
	 * from the old heap, not the new one.
	 */
	ereport(elevel,
			(errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u pages",
					RelationGetRelationName(OldHeap),
					tups_vacuumed, num_tuples,
					RelationGetNumberOfBlocks(OldHeap)),
			 errdetail("%.0f dead row versions cannot be removed yet.\n"
					   "%s.",
					   tups_recently_dead,
					   pg_rusage_show(&ru0))));

	/* Close the relations; NoLock means the locks taken above are kept */
	if (OldIndex != NULL)
		index_close(OldIndex, NoLock);
	table_close(OldHeap, NoLock);
	table_close(NewHeap, NoLock);

	/* Update pg_class to reflect the correct values of pages and tuples. */
	relRelation = table_open(RelationRelationId, RowExclusiveLock);

	/* Work on a modifiable copy of the new heap's pg_class row */
	reltup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(OIDNewHeap));
	if (!HeapTupleIsValid(reltup))
		elog(ERROR, "cache lookup failed for relation %u", OIDNewHeap);
	relform = (Form_pg_class) GETSTRUCT(reltup);

	relform->relpages = num_pages;
	relform->reltuples = num_tuples;

	/*
	 * Don't update the stats for pg_class.  See swap_relation_files.  When
	 * the table being rebuilt is pg_class itself, only a relcache
	 * invalidation is issued instead of a catalog update.
	 */
	if (OIDOldHeap != RelationRelationId)
		CatalogTupleUpdate(relRelation, &reltup->t_self, reltup);
	else
		CacheInvalidateRelcacheByTuple(reltup);

	/* Clean up. */
	heap_freetuple(reltup);
	table_close(relRelation, RowExclusiveLock);

	/* Make the update visible */
	CommandCounterIncrement();
}
974
975/*
976 * Swap the physical files of two given relations.
977 *
978 * We swap the physical identity (reltablespace, relfilenode) while keeping the
979 * same logical identities of the two relations. relpersistence is also
980 * swapped, which is critical since it determines where buffers live for each
981 * relation.
982 *
983 * We can swap associated TOAST data in either of two ways: recursively swap
984 * the physical content of the toast tables (and their indexes), or swap the
985 * TOAST links in the given relations' pg_class entries. The former is needed
986 * to manage rewrites of shared catalogs (where we cannot change the pg_class
987 * links) while the latter is the only way to handle cases in which a toast
988 * table is added or removed altogether.
989 *
990 * Additionally, the first relation is marked with relfrozenxid set to
991 * frozenXid. It seems a bit ugly to have this here, but the caller would
992 * have to do it anyway, so having it here saves a heap_update. Note: in
993 * the swap-toast-links case, we assume we don't need to change the toast
994 * table's relfrozenxid: the new version of the toast table should already
995 * have relfrozenxid set to RecentXmin, which is good enough.
996 *
997 * Lastly, if r2 and its toast table and toast index (if any) are mapped,
998 * their OIDs are emitted into mapped_tables[]. This is hacky but beats
999 * having to look the information up again later in finish_heap_swap.
1000 */
1001static void
1002swap_relation_files(Oid r1, Oid r2, bool target_is_pg_class,
1003 bool swap_toast_by_content,
1004 bool is_internal,
1005 TransactionId frozenXid,
1006 MultiXactId cutoffMulti,
1007 Oid *mapped_tables)
1008{
1009 Relation relRelation;
1010 HeapTuple reltup1,
1011 reltup2;
1012 Form_pg_class relform1,
1013 relform2;
1014 Oid relfilenode1,
1015 relfilenode2;
1016 Oid swaptemp;
1017 char swptmpchr;
1018
1019 /* We need writable copies of both pg_class tuples. */
1020 relRelation = table_open(RelationRelationId, RowExclusiveLock);
1021
1022 reltup1 = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(r1));
1023 if (!HeapTupleIsValid(reltup1))
1024 elog(ERROR, "cache lookup failed for relation %u", r1);
1025 relform1 = (Form_pg_class) GETSTRUCT(reltup1);
1026
1027 reltup2 = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(r2));
1028 if (!HeapTupleIsValid(reltup2))
1029 elog(ERROR, "cache lookup failed for relation %u", r2);
1030 relform2 = (Form_pg_class) GETSTRUCT(reltup2);
1031
1032 relfilenode1 = relform1->relfilenode;
1033 relfilenode2 = relform2->relfilenode;
1034
1035 if (OidIsValid(relfilenode1) && OidIsValid(relfilenode2))
1036 {
1037 /*
1038 * Normal non-mapped relations: swap relfilenodes, reltablespaces,
1039 * relpersistence
1040 */
1041 Assert(!target_is_pg_class);
1042
1043 swaptemp = relform1->relfilenode;
1044 relform1->relfilenode = relform2->relfilenode;
1045 relform2->relfilenode = swaptemp;
1046
1047 swaptemp = relform1->reltablespace;
1048 relform1->reltablespace = relform2->reltablespace;
1049 relform2->reltablespace = swaptemp;
1050
1051 swptmpchr = relform1->relpersistence;
1052 relform1->relpersistence = relform2->relpersistence;
1053 relform2->relpersistence = swptmpchr;
1054
1055 /* Also swap toast links, if we're swapping by links */
1056 if (!swap_toast_by_content)
1057 {
1058 swaptemp = relform1->reltoastrelid;
1059 relform1->reltoastrelid = relform2->reltoastrelid;
1060 relform2->reltoastrelid = swaptemp;
1061 }
1062 }
1063 else
1064 {
1065 /*
1066 * Mapped-relation case. Here we have to swap the relation mappings
1067 * instead of modifying the pg_class columns. Both must be mapped.
1068 */
1069 if (OidIsValid(relfilenode1) || OidIsValid(relfilenode2))
1070 elog(ERROR, "cannot swap mapped relation \"%s\" with non-mapped relation",
1071 NameStr(relform1->relname));
1072
1073 /*
1074 * We can't change the tablespace nor persistence of a mapped rel, and
1075 * we can't handle toast link swapping for one either, because we must
1076 * not apply any critical changes to its pg_class row. These cases
1077 * should be prevented by upstream permissions tests, so these checks
1078 * are non-user-facing emergency backstop.
1079 */
1080 if (relform1->reltablespace != relform2->reltablespace)
1081 elog(ERROR, "cannot change tablespace of mapped relation \"%s\"",
1082 NameStr(relform1->relname));
1083 if (relform1->relpersistence != relform2->relpersistence)
1084 elog(ERROR, "cannot change persistence of mapped relation \"%s\"",
1085 NameStr(relform1->relname));
1086 if (!swap_toast_by_content &&
1087 (relform1->reltoastrelid || relform2->reltoastrelid))
1088 elog(ERROR, "cannot swap toast by links for mapped relation \"%s\"",
1089 NameStr(relform1->relname));
1090
1091 /*
1092 * Fetch the mappings --- shouldn't fail, but be paranoid
1093 */
1094 relfilenode1 = RelationMapOidToFilenode(r1, relform1->relisshared);
1095 if (!OidIsValid(relfilenode1))
1096 elog(ERROR, "could not find relation mapping for relation \"%s\", OID %u",
1097 NameStr(relform1->relname), r1);
1098 relfilenode2 = RelationMapOidToFilenode(r2, relform2->relisshared);
1099 if (!OidIsValid(relfilenode2))
1100 elog(ERROR, "could not find relation mapping for relation \"%s\", OID %u",
1101 NameStr(relform2->relname), r2);
1102
1103 /*
1104 * Send replacement mappings to relmapper. Note these won't actually
1105 * take effect until CommandCounterIncrement.
1106 */
1107 RelationMapUpdateMap(r1, relfilenode2, relform1->relisshared, false);
1108 RelationMapUpdateMap(r2, relfilenode1, relform2->relisshared, false);
1109
1110 /* Pass OIDs of mapped r2 tables back to caller */
1111 *mapped_tables++ = r2;
1112 }
1113
1114 /*
1115 * In the case of a shared catalog, these next few steps will only affect
1116 * our own database's pg_class row; but that's okay, because they are all
1117 * noncritical updates. That's also an important fact for the case of a
1118 * mapped catalog, because it's possible that we'll commit the map change
1119 * and then fail to commit the pg_class update.
1120 */
1121
1122 /* set rel1's frozen Xid and minimum MultiXid */
1123 if (relform1->relkind != RELKIND_INDEX)
1124 {
1125 Assert(!TransactionIdIsValid(frozenXid) ||
1126 TransactionIdIsNormal(frozenXid));
1127 relform1->relfrozenxid = frozenXid;
1128 relform1->relminmxid = cutoffMulti;
1129 }
1130
1131 /* swap size statistics too, since new rel has freshly-updated stats */
1132 {
1133 int32 swap_pages;
1134 float4 swap_tuples;
1135 int32 swap_allvisible;
1136
1137 swap_pages = relform1->relpages;
1138 relform1->relpages = relform2->relpages;
1139 relform2->relpages = swap_pages;
1140
1141 swap_tuples = relform1->reltuples;
1142 relform1->reltuples = relform2->reltuples;
1143 relform2->reltuples = swap_tuples;
1144
1145 swap_allvisible = relform1->relallvisible;
1146 relform1->relallvisible = relform2->relallvisible;
1147 relform2->relallvisible = swap_allvisible;
1148 }
1149
1150 /*
1151 * Update the tuples in pg_class --- unless the target relation of the
1152 * swap is pg_class itself. In that case, there is zero point in making
1153 * changes because we'd be updating the old data that we're about to throw
1154 * away. Because the real work being done here for a mapped relation is
1155 * just to change the relation map settings, it's all right to not update
1156 * the pg_class rows in this case. The most important changes will instead
1157 * performed later, in finish_heap_swap() itself.
1158 */
1159 if (!target_is_pg_class)
1160 {
1161 CatalogIndexState indstate;
1162
1163 indstate = CatalogOpenIndexes(relRelation);
1164 CatalogTupleUpdateWithInfo(relRelation, &reltup1->t_self, reltup1,
1165 indstate);
1166 CatalogTupleUpdateWithInfo(relRelation, &reltup2->t_self, reltup2,
1167 indstate);
1168 CatalogCloseIndexes(indstate);
1169 }
1170 else
1171 {
1172 /* no update ... but we do still need relcache inval */
1173 CacheInvalidateRelcacheByTuple(reltup1);
1174 CacheInvalidateRelcacheByTuple(reltup2);
1175 }
1176
1177 /*
1178 * Post alter hook for modified relations. The change to r2 is always
1179 * internal, but r1 depends on the invocation context.
1180 */
1181 InvokeObjectPostAlterHookArg(RelationRelationId, r1, 0,
1182 InvalidOid, is_internal);
1183 InvokeObjectPostAlterHookArg(RelationRelationId, r2, 0,
1184 InvalidOid, true);
1185
1186 /*
1187 * If we have toast tables associated with the relations being swapped,
1188 * deal with them too.
1189 */
1190 if (relform1->reltoastrelid || relform2->reltoastrelid)
1191 {
1192 if (swap_toast_by_content)
1193 {
1194 if (relform1->reltoastrelid && relform2->reltoastrelid)
1195 {
1196 /* Recursively swap the contents of the toast tables */
1197 swap_relation_files(relform1->reltoastrelid,
1198 relform2->reltoastrelid,
1199 target_is_pg_class,
1200 swap_toast_by_content,
1201 is_internal,
1202 frozenXid,
1203 cutoffMulti,
1204 mapped_tables);
1205 }
1206 else
1207 {
1208 /* caller messed up */
1209 elog(ERROR, "cannot swap toast files by content when there's only one");
1210 }
1211 }
1212 else
1213 {
1214 /*
1215 * We swapped the ownership links, so we need to change dependency
1216 * data to match.
1217 *
1218 * NOTE: it is possible that only one table has a toast table.
1219 *
1220 * NOTE: at present, a TOAST table's only dependency is the one on
1221 * its owning table. If more are ever created, we'd need to use
1222 * something more selective than deleteDependencyRecordsFor() to
1223 * get rid of just the link we want.
1224 */
1225 ObjectAddress baseobject,
1226 toastobject;
1227 long count;
1228
1229 /*
1230 * We disallow this case for system catalogs, to avoid the
1231 * possibility that the catalog we're rebuilding is one of the
1232 * ones the dependency changes would change. It's too late to be
1233 * making any data changes to the target catalog.
1234 */
1235 if (IsSystemClass(r1, relform1))
1236 elog(ERROR, "cannot swap toast files by links for system catalogs");
1237
1238 /* Delete old dependencies */
1239 if (relform1->reltoastrelid)
1240 {
1241 count = deleteDependencyRecordsFor(RelationRelationId,
1242 relform1->reltoastrelid,
1243 false);
1244 if (count != 1)
1245 elog(ERROR, "expected one dependency record for TOAST table, found %ld",
1246 count);
1247 }
1248 if (relform2->reltoastrelid)
1249 {
1250 count = deleteDependencyRecordsFor(RelationRelationId,
1251 relform2->reltoastrelid,
1252 false);
1253 if (count != 1)
1254 elog(ERROR, "expected one dependency record for TOAST table, found %ld",
1255 count);
1256 }
1257
1258 /* Register new dependencies */
1259 baseobject.classId = RelationRelationId;
1260 baseobject.objectSubId = 0;
1261 toastobject.classId = RelationRelationId;
1262 toastobject.objectSubId = 0;
1263
1264 if (relform1->reltoastrelid)
1265 {
1266 baseobject.objectId = r1;
1267 toastobject.objectId = relform1->reltoastrelid;
1268 recordDependencyOn(&toastobject, &baseobject,
1269 DEPENDENCY_INTERNAL);
1270 }
1271
1272 if (relform2->reltoastrelid)
1273 {
1274 baseobject.objectId = r2;
1275 toastobject.objectId = relform2->reltoastrelid;
1276 recordDependencyOn(&toastobject, &baseobject,
1277 DEPENDENCY_INTERNAL);
1278 }
1279 }
1280 }
1281
1282 /*
1283 * If we're swapping two toast tables by content, do the same for their
1284 * valid index. The swap can actually be safely done only if the relations
1285 * have indexes.
1286 */
1287 if (swap_toast_by_content &&
1288 relform1->relkind == RELKIND_TOASTVALUE &&
1289 relform2->relkind == RELKIND_TOASTVALUE)
1290 {
1291 Oid toastIndex1,
1292 toastIndex2;
1293
1294 /* Get valid index for each relation */
1295 toastIndex1 = toast_get_valid_index(r1,
1296 AccessExclusiveLock);
1297 toastIndex2 = toast_get_valid_index(r2,
1298 AccessExclusiveLock);
1299
1300 swap_relation_files(toastIndex1,
1301 toastIndex2,
1302 target_is_pg_class,
1303 swap_toast_by_content,
1304 is_internal,
1305 InvalidTransactionId,
1306 InvalidMultiXactId,
1307 mapped_tables);
1308 }
1309
1310 /* Clean up. */
1311 heap_freetuple(reltup1);
1312 heap_freetuple(reltup2);
1313
1314 table_close(relRelation, RowExclusiveLock);
1315
1316 /*
1317 * Close both relcache entries' smgr links. We need this kluge because
1318 * both links will be invalidated during upcoming CommandCounterIncrement.
1319 * Whichever of the rels is the second to be cleared will have a dangling
1320 * reference to the other's smgr entry. Rather than trying to avoid this
1321 * by ordering operations just so, it's easiest to close the links first.
1322 * (Fortunately, since one of the entries is local in our transaction,
1323 * it's sufficient to clear out our own relcache this way; the problem
1324 * cannot arise for other backends when they see our update on the
1325 * non-transient relation.)
1326 *
1327 * Caution: the placement of this step interacts with the decision to
1328 * handle toast rels by recursion. When we are trying to rebuild pg_class
1329 * itself, the smgr close on pg_class must happen after all accesses in
1330 * this function.
1331 */
1332 RelationCloseSmgrByOid(r1);
1333 RelationCloseSmgrByOid(r2);
1334}
1335
1336/*
1337 * Remove the transient table that was built by make_new_heap, and finish
1338 * cleaning up (including rebuilding all indexes on the old heap).
1339 */
1340void
1341finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap,
1342 bool is_system_catalog,
1343 bool swap_toast_by_content,
1344 bool check_constraints,
1345 bool is_internal,
1346 TransactionId frozenXid,
1347 MultiXactId cutoffMulti,
1348 char newrelpersistence)
1349{
1350 ObjectAddress object;
1351 Oid mapped_tables[4];
1352 int reindex_flags;
1353 int i;
1354
1355 /* Report that we are now swapping relation files */
1356 pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
1357 PROGRESS_CLUSTER_PHASE_SWAP_REL_FILES);
1358
1359 /* Zero out possible results from swapped_relation_files */
1360 memset(mapped_tables, 0, sizeof(mapped_tables));
1361
1362 /*
1363 * Swap the contents of the heap relations (including any toast tables).
1364 * Also set old heap's relfrozenxid to frozenXid.
1365 */
1366 swap_relation_files(OIDOldHeap, OIDNewHeap,
1367 (OIDOldHeap == RelationRelationId),
1368 swap_toast_by_content, is_internal,
1369 frozenXid, cutoffMulti, mapped_tables);
1370
1371 /*
1372 * If it's a system catalog, queue a sinval message to flush all catcaches
1373 * on the catalog when we reach CommandCounterIncrement.
1374 */
1375 if (is_system_catalog)
1376 CacheInvalidateCatalog(OIDOldHeap);
1377
1378 /*
1379 * Rebuild each index on the relation (but not the toast table, which is
1380 * all-new at this point). It is important to do this before the DROP
1381 * step because if we are processing a system catalog that will be used
1382 * during DROP, we want to have its indexes available. There is no
1383 * advantage to the other order anyway because this is all transactional,
1384 * so no chance to reclaim disk space before commit. We do not need a
1385 * final CommandCounterIncrement() because reindex_relation does it.
1386 *
1387 * Note: because index_build is called via reindex_relation, it will never
1388 * set indcheckxmin true for the indexes. This is OK even though in some
1389 * sense we are building new indexes rather than rebuilding existing ones,
1390 * because the new heap won't contain any HOT chains at all, let alone
1391 * broken ones, so it can't be necessary to set indcheckxmin.
1392 */
1393 reindex_flags = REINDEX_REL_SUPPRESS_INDEX_USE;
1394 if (check_constraints)
1395 reindex_flags |= REINDEX_REL_CHECK_CONSTRAINTS;
1396
1397 /*
1398 * Ensure that the indexes have the same persistence as the parent
1399 * relation.
1400 */
1401 if (newrelpersistence == RELPERSISTENCE_UNLOGGED)
1402 reindex_flags |= REINDEX_REL_FORCE_INDEXES_UNLOGGED;
1403 else if (newrelpersistence == RELPERSISTENCE_PERMANENT)
1404 reindex_flags |= REINDEX_REL_FORCE_INDEXES_PERMANENT;
1405
1406 /* Report that we are now reindexing relations */
1407 pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
1408 PROGRESS_CLUSTER_PHASE_REBUILD_INDEX);
1409
1410 reindex_relation(OIDOldHeap, reindex_flags, 0);
1411
1412 /* Report that we are now doing clean up */
1413 pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
1414 PROGRESS_CLUSTER_PHASE_FINAL_CLEANUP);
1415
1416 /*
1417 * If the relation being rebuild is pg_class, swap_relation_files()
1418 * couldn't update pg_class's own pg_class entry (check comments in
1419 * swap_relation_files()), thus relfrozenxid was not updated. That's
1420 * annoying because a potential reason for doing a VACUUM FULL is a
1421 * imminent or actual anti-wraparound shutdown. So, now that we can
1422 * access the new relation using its indices, update relfrozenxid.
1423 * pg_class doesn't have a toast relation, so we don't need to update the
1424 * corresponding toast relation. Not that there's little point moving all
1425 * relfrozenxid updates here since swap_relation_files() needs to write to
1426 * pg_class for non-mapped relations anyway.
1427 */
1428 if (OIDOldHeap == RelationRelationId)
1429 {
1430 Relation relRelation;
1431 HeapTuple reltup;
1432 Form_pg_class relform;
1433
1434 relRelation = table_open(RelationRelationId, RowExclusiveLock);
1435
1436 reltup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(OIDOldHeap));
1437 if (!HeapTupleIsValid(reltup))
1438 elog(ERROR, "cache lookup failed for relation %u", OIDOldHeap);
1439 relform = (Form_pg_class) GETSTRUCT(reltup);
1440
1441 relform->relfrozenxid = frozenXid;
1442 relform->relminmxid = cutoffMulti;
1443
1444 CatalogTupleUpdate(relRelation, &reltup->t_self, reltup);
1445
1446 table_close(relRelation, RowExclusiveLock);
1447 }
1448
1449 /* Destroy new heap with old filenode */
1450 object.classId = RelationRelationId;
1451 object.objectId = OIDNewHeap;
1452 object.objectSubId = 0;
1453
1454 /*
1455 * The new relation is local to our transaction and we know nothing
1456 * depends on it, so DROP_RESTRICT should be OK.
1457 */
1458 performDeletion(&object, DROP_RESTRICT, PERFORM_DELETION_INTERNAL);
1459
1460 /* performDeletion does CommandCounterIncrement at end */
1461
1462 /*
1463 * Now we must remove any relation mapping entries that we set up for the
1464 * transient table, as well as its toast table and toast index if any. If
1465 * we fail to do this before commit, the relmapper will complain about new
1466 * permanent map entries being added post-bootstrap.
1467 */
1468 for (i = 0; OidIsValid(mapped_tables[i]); i++)
1469 RelationMapRemoveMapping(mapped_tables[i]);
1470
1471 /*
1472 * At this point, everything is kosher except that, if we did toast swap
1473 * by links, the toast table's name corresponds to the transient table.
1474 * The name is irrelevant to the backend because it's referenced by OID,
1475 * but users looking at the catalogs could be confused. Rename it to
1476 * prevent this problem.
1477 *
1478 * Note no lock required on the relation, because we already hold an
1479 * exclusive lock on it.
1480 */
1481 if (!swap_toast_by_content)
1482 {
1483 Relation newrel;
1484
1485 newrel = table_open(OIDOldHeap, NoLock);
1486 if (OidIsValid(newrel->rd_rel->reltoastrelid))
1487 {
1488 Oid toastidx;
1489 char NewToastName[NAMEDATALEN];
1490
1491 /* Get the associated valid index to be renamed */
1492 toastidx = toast_get_valid_index(newrel->rd_rel->reltoastrelid,
1493 AccessShareLock);
1494
1495 /* rename the toast table ... */
1496 snprintf(NewToastName, NAMEDATALEN, "pg_toast_%u",
1497 OIDOldHeap);
1498 RenameRelationInternal(newrel->rd_rel->reltoastrelid,
1499 NewToastName, true, false);
1500
1501 /* ... and its valid index too. */
1502 snprintf(NewToastName, NAMEDATALEN, "pg_toast_%u_index",
1503 OIDOldHeap);
1504
1505 RenameRelationInternal(toastidx,
1506 NewToastName, true, true);
1507 }
1508 relation_close(newrel, NoLock);
1509 }
1510
1511 /* if it's not a catalog table, clear any missing attribute settings */
1512 if (!is_system_catalog)
1513 {
1514 Relation newrel;
1515
1516 newrel = table_open(OIDOldHeap, NoLock);
1517 RelationClearMissing(newrel);
1518 relation_close(newrel, NoLock);
1519 }
1520}
1521
1522
1523/*
1524 * Get a list of tables that the current user owns and
1525 * have indisclustered set. Return the list in a List * of rvsToCluster
1526 * with the tableOid and the indexOid on which the table is already
1527 * clustered.
1528 */
1529static List *
1530get_tables_to_cluster(MemoryContext cluster_context)
1531{
1532 Relation indRelation;
1533 TableScanDesc scan;
1534 ScanKeyData entry;
1535 HeapTuple indexTuple;
1536 Form_pg_index index;
1537 MemoryContext old_context;
1538 RelToCluster *rvtc;
1539 List *rvs = NIL;
1540
1541 /*
1542 * Get all indexes that have indisclustered set and are owned by
1543 * appropriate user. System relations or nailed-in relations cannot ever
1544 * have indisclustered set, because CLUSTER will refuse to set it when
1545 * called with one of them as argument.
1546 */
1547 indRelation = table_open(IndexRelationId, AccessShareLock);
1548 ScanKeyInit(&entry,
1549 Anum_pg_index_indisclustered,
1550 BTEqualStrategyNumber, F_BOOLEQ,
1551 BoolGetDatum(true));
1552 scan = table_beginscan_catalog(indRelation, 1, &entry);
1553 while ((indexTuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
1554 {
1555 index = (Form_pg_index) GETSTRUCT(indexTuple);
1556
1557 if (!pg_class_ownercheck(index->indrelid, GetUserId()))
1558 continue;
1559
1560 /*
1561 * We have to build the list in a different memory context so it will
1562 * survive the cross-transaction processing
1563 */
1564 old_context = MemoryContextSwitchTo(cluster_context);
1565
1566 rvtc = (RelToCluster *) palloc(sizeof(RelToCluster));
1567 rvtc->tableOid = index->indrelid;
1568 rvtc->indexOid = index->indexrelid;
1569 rvs = lcons(rvtc, rvs);
1570
1571 MemoryContextSwitchTo(old_context);
1572 }
1573 table_endscan(scan);
1574
1575 relation_close(indRelation, AccessShareLock);
1576
1577 return rvs;
1578}
1579