cluster.c source code [PostgreSQL/src/backend/commands/cluster.c]

/*-------------------------------------------------------------------------
 *
 * cluster.c
 *	  CLUSTER a table on an index.  This is now also used for VACUUM FULL.
 *
 * There is hardly anything left of Paul Brown's original implementation...
 *
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994-5, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/commands/cluster.c
 *
 *-------------------------------------------------------------------------
 */
18	#include "postgres.h"
19
20	#include "access/amapi.h"
21	#include "access/heapam.h"
22	#include "access/multixact.h"
23	#include "access/relscan.h"
24	#include "access/tableam.h"
25	#include "access/transam.h"
26	#include "access/tuptoaster.h"
27	#include "access/xact.h"
28	#include "access/xlog.h"
29	#include "catalog/pg_am.h"
30	#include "catalog/catalog.h"
31	#include "catalog/dependency.h"
32	#include "catalog/heap.h"
33	#include "catalog/index.h"
34	#include "catalog/namespace.h"
35	#include "catalog/objectaccess.h"
36	#include "catalog/toasting.h"
37	#include "commands/cluster.h"
38	#include "commands/progress.h"
39	#include "commands/tablecmds.h"
40	#include "commands/vacuum.h"
41	#include "miscadmin.h"
42	#include "optimizer/optimizer.h"
43	#include "pgstat.h"
44	#include "storage/bufmgr.h"
45	#include "storage/lmgr.h"
46	#include "storage/predicate.h"
47	#include "utils/acl.h"
48	#include "utils/fmgroids.h"
49	#include "utils/inval.h"
50	#include "utils/lsyscache.h"
51	#include "utils/memutils.h"
52	#include "utils/pg_rusage.h"
53	#include "utils/relmapper.h"
54	#include "utils/snapmgr.h"
55	#include "utils/syscache.h"
56	#include "utils/tuplesort.h"
57
58
59	/*
60	* This struct is used to pass around the information on tables to be
61	* clustered. We need this so we can make a list of them when invoked without
62	* a specific table/index pair.
63	*/
64	typedef struct
65	{
66	Oid tableOid;
67	Oid indexOid;
68	} RelToCluster;
69
70
71	static void rebuild_relation(Relation OldHeap, Oid indexOid, bool verbose);
72	static void copy_table_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex,
73	bool verbose, bool *pSwapToastByContent,
74	TransactionId pFreezeXid, MultiXactId pCutoffMulti);
75	static List *get_tables_to_cluster(MemoryContext cluster_context);
76
77
78	/---------------------------------------------------------------------------*
79	* This cluster code allows for clustering multiple tables at once. Because
80	* of this, we cannot just run everything on a single transaction, or we
81	* would be forced to acquire exclusive locks on all the tables being
82	* clustered, simultaneously --- very likely leading to deadlock.
83	*
84	* To solve this we follow a similar strategy to VACUUM code,
85	* clustering each relation in a separate transaction. For this to work,
86	* we need to:
87	* - provide a separate memory context so that we can pass information in
88	* a way that survives across transactions
89	* - start a new transaction every time a new relation is clustered
90	* - check for validity of the information on to-be-clustered relations,
91	* as someone might have deleted a relation behind our back, or
92	* clustered one on a different index
93	* - end the transaction
94	*
95	* The single-relation case does not have any such overhead.
96	*
97	* We also allow a relation to be specified without index. In that case,
98	* the indisclustered bit will be looked up, and an ERROR will be thrown
99	* if there is no index with the bit set.
100	*---------------------------------------------------------------------------
101	*/
102	void
103	cluster(ClusterStmt *stmt, bool isTopLevel)
104	{
105	if (stmt->relation != NULL)
106	{
107	/ This is the single-relation case. /
108	Oid tableOid,
109	indexOid = InvalidOid;
110	Relation rel;
111
112	/ Find, lock, and check permissions on the table /
113	tableOid = RangeVarGetRelidExtended(stmt->relation,
114	AccessExclusiveLock,
115	`0`,
116	RangeVarCallbackOwnsTable, NULL);
117	rel = table_open(tableOid, NoLock);
118
119	/*
120	* Reject clustering a remote temp table ... their local buffer
121	* manager is not going to cope.
122	*/
123	if (RELATION_IS_OTHER_TEMP(rel))
124	ereport(ERROR,
125	(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
126	errmsg("cannot cluster temporary tables of other sessions")));
127
128	/*
129	* Reject clustering a partitioned table.
130	*/
131	if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
132	ereport(ERROR,
133	(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
134	errmsg("cannot cluster a partitioned table")));
135
136	if (stmt->indexname == NULL)
137	{
138	ListCell *index;
139
140	/ We need to find the index that has indisclustered set. /
141	foreach(index, RelationGetIndexList(rel))
142	{
143	HeapTuple idxtuple;
144	Form_pg_index indexForm;
145
146	indexOid = lfirst_oid(index);
147	idxtuple = SearchSysCache1(INDEXRELID,
148	ObjectIdGetDatum(indexOid));
149	if (!HeapTupleIsValid(idxtuple))
150	elog(ERROR, "cache lookup failed for index %u", indexOid);
151	indexForm = (Form_pg_index) GETSTRUCT(idxtuple);
152	if (indexForm->indisclustered)
153	{
154	ReleaseSysCache(idxtuple);
155	break;
156	}
157	ReleaseSysCache(idxtuple);
158	indexOid = InvalidOid;
159	}
160
161	if (!OidIsValid(indexOid))
162	ereport(ERROR,
163	(errcode(ERRCODE_UNDEFINED_OBJECT),
164	errmsg("there is no previously clustered index for table \"%s\"",
165	stmt->relation->relname)));
166	}
167	else
168	{
169	/*
170	* The index is expected to be in the same namespace as the
171	* relation.
172	*/
173	indexOid = get_relname_relid(stmt->indexname,
174	rel->rd_rel->relnamespace);
175	if (!OidIsValid(indexOid))
176	ereport(ERROR,
177	(errcode(ERRCODE_UNDEFINED_OBJECT),
178	errmsg("index \"%s\" for table \"%s\" does not exist",
179	stmt->indexname, stmt->relation->relname)));
180	}
181
182	/ close relation, keep lock till commit /
183	table_close(rel, NoLock);
184
185	/ Do the job. /
186	cluster_rel(tableOid, indexOid, stmt->options);
187	}
188	else
189	{
190	/*
191	* This is the "multi relation" case. We need to cluster all tables
192	* that have some index with indisclustered set.
193	*/
194	MemoryContext cluster_context;
195	List *rvs;
196	ListCell *rv;
197
198	/*
199	* We cannot run this form of CLUSTER inside a user transaction block;
200	* we'd be holding locks way too long.
201	*/
202	PreventInTransactionBlock(isTopLevel, "CLUSTER");
203
204	/*
205	* Create special memory context for cross-transaction storage.
206	*
207	* Since it is a child of PortalContext, it will go away even in case
208	* of error.
209	*/
210	cluster_context = AllocSetContextCreate(PortalContext,
211	"Cluster",
212	ALLOCSET_DEFAULT_SIZES);
213
214	/*
215	* Build the list of relations to cluster. Note that this lives in
216	* cluster_context.
217	*/
218	rvs = get_tables_to_cluster(cluster_context);
219
220	/ Commit to get out of starting transaction /
221	PopActiveSnapshot();
222	CommitTransactionCommand();
223
224	/ Ok, now that we've got them all, cluster them one by one /
225	foreach(rv, rvs)
226	{
227	RelToCluster rvtc = (RelToCluster ) lfirst(rv);
228
229	/ Start a new transaction for each relation. /
230	StartTransactionCommand();
231	/ functions in indexes may want a snapshot set /
232	PushActiveSnapshot(GetTransactionSnapshot());
233	/ Do the job. /
234	cluster_rel(rvtc->tableOid, rvtc->indexOid,
235	stmt->options \| CLUOPT_RECHECK);
236	PopActiveSnapshot();
237	CommitTransactionCommand();
238	}
239
240	/ Start a new transaction for the cleanup work. /
241	StartTransactionCommand();
242
243	/ Clean up working storage /
244	MemoryContextDelete(cluster_context);
245	}
246	}
247
248	/*
249	* cluster_rel
250	*
251	* This clusters the table by creating a new, clustered table and
252	* swapping the relfilenodes of the new table and the old table, so
253	* the OID of the original table is preserved. Thus we do not lose
254	* GRANT, inheritance nor references to this table (this was a bug
255	* in releases through 7.3).
256	*
257	* Indexes are rebuilt too, via REINDEX. Since we are effectively bulk-loading
258	* the new table, it's better to create the indexes afterwards than to fill
259	* them incrementally while we load the table.
260	*
261	* If indexOid is InvalidOid, the table will be rewritten in physical order
262	* instead of index order. This is the new implementation of VACUUM FULL,
263	* and error messages should refer to the operation as VACUUM not CLUSTER.
264	*/
265	void
266	cluster_rel(Oid tableOid, Oid indexOid, int options)
267	{
268	Relation OldHeap;
269	bool verbose = ((options & CLUOPT_VERBOSE) != `0`);
270	bool recheck = ((options & CLUOPT_RECHECK) != `0`);
271
272	/ Check for user-requested abort. /
273	CHECK_FOR_INTERRUPTS();
274
275	pgstat_progress_start_command(PROGRESS_COMMAND_CLUSTER, tableOid);
276	if (OidIsValid(indexOid))
277	pgstat_progress_update_param(PROGRESS_CLUSTER_COMMAND,
278	PROGRESS_CLUSTER_COMMAND_CLUSTER);
279	else
280	pgstat_progress_update_param(PROGRESS_CLUSTER_COMMAND,
281	PROGRESS_CLUSTER_COMMAND_VACUUM_FULL);
282
283	/*
284	* We grab exclusive access to the target rel and index for the duration
285	* of the transaction. (This is redundant for the single-transaction
286	* case, since cluster() already did it.) The index lock is taken inside
287	* check_index_is_clusterable.
288	*/
289	OldHeap = try_relation_open(tableOid, AccessExclusiveLock);
290
291	/ If the table has gone away, we can skip processing it /
292	if (!OldHeap)
293	{
294	pgstat_progress_end_command();
295	return;
296	}
297
298	/*
299	* Since we may open a new transaction for each relation, we have to check
300	* that the relation still is what we think it is.
301	*
302	* If this is a single-transaction CLUSTER, we can skip these tests. We
303	* must skip the one on indisclustered since it would reject an attempt
304	* to cluster a not-previously-clustered index.
305	*/
306	if (recheck)
307	{
308	HeapTuple tuple;
309	Form_pg_index indexForm;
310
311	/ Check that the user still owns the relation /
312	if (!pg_class_ownercheck(tableOid, GetUserId()))
313	{
314	relation_close(OldHeap, AccessExclusiveLock);
315	pgstat_progress_end_command();
316	return;
317	}
318
319	/*
320	* Silently skip a temp table for a remote session. Only doing this
321	* check in the "recheck" case is appropriate (which currently means
322	* somebody is executing a database-wide CLUSTER), because there is
323	* another check in cluster() which will stop any attempt to cluster
324	* remote temp tables by name. There is another check in cluster_rel
325	* which is redundant, but we leave it for extra safety.
326	*/
327	if (RELATION_IS_OTHER_TEMP(OldHeap))
328	{
329	relation_close(OldHeap, AccessExclusiveLock);
330	pgstat_progress_end_command();
331	return;
332	}
333
334	if (OidIsValid(indexOid))
335	{
336	/*
337	* Check that the index still exists
338	*/
339	if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(indexOid)))
340	{
341	relation_close(OldHeap, AccessExclusiveLock);
342	pgstat_progress_end_command();
343	return;
344	}
345
346	/*
347	* Check that the index is still the one with indisclustered set.
348	*/
349	tuple = SearchSysCache1(INDEXRELID, ObjectIdGetDatum(indexOid));
350	if (!HeapTupleIsValid(tuple)) / probably can't happen /
351	{
352	relation_close(OldHeap, AccessExclusiveLock);
353	pgstat_progress_end_command();
354	return;
355	}
356	indexForm = (Form_pg_index) GETSTRUCT(tuple);
357	if (!indexForm->indisclustered)
358	{
359	ReleaseSysCache(tuple);
360	relation_close(OldHeap, AccessExclusiveLock);
361	pgstat_progress_end_command();
362	return;
363	}
364	ReleaseSysCache(tuple);
365	}
366	}
367
368	/*
369	* We allow VACUUM FULL, but not CLUSTER, on shared catalogs. CLUSTER
370	* would work in most respects, but the index would only get marked as
371	* indisclustered in the current database, leading to unexpected behavior
372	* if CLUSTER were later invoked in another database.
373	*/
374	if (OidIsValid(indexOid) && OldHeap->rd_rel->relisshared)
375	ereport(ERROR,
376	(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
377	errmsg("cannot cluster a shared catalog")));
378
379	/*
380	* Don't process temp tables of other backends ... their local buffer
381	* manager is not going to cope.
382	*/
383	if (RELATION_IS_OTHER_TEMP(OldHeap))
384	{
385	if (OidIsValid(indexOid))
386	ereport(ERROR,
387	(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
388	errmsg("cannot cluster temporary tables of other sessions")));
389	else
390	ereport(ERROR,
391	(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
392	errmsg("cannot vacuum temporary tables of other sessions")));
393	}
394
395	/*
396	* Also check for active uses of the relation in the current transaction,
397	* including open scans and pending AFTER trigger events.
398	*/
399	CheckTableNotInUse(OldHeap, OidIsValid(indexOid) ? "CLUSTER" : "VACUUM");
400
401	/ Check heap and index are valid to cluster on /
402	if (OidIsValid(indexOid))
403	check_index_is_clusterable(OldHeap, indexOid, recheck, AccessExclusiveLock);
404
405	/*
406	* Quietly ignore the request if this is a materialized view which has not
407	* been populated from its query. No harm is done because there is no data
408	* to deal with, and we don't want to throw an error if this is part of a
409	* multi-relation request -- for example, CLUSTER was run on the entire
410	* database.
411	*/
412	if (OldHeap->rd_rel->relkind == RELKIND_MATVIEW &&
413	!RelationIsPopulated(OldHeap))
414	{
415	relation_close(OldHeap, AccessExclusiveLock);
416	pgstat_progress_end_command();
417	return;
418	}
419
420	/*
421	* All predicate locks on the tuples or pages are about to be made
422	* invalid, because we move tuples around. Promote them to relation
423	* locks. Predicate locks on indexes will be promoted when they are
424	* reindexed.
425	*/
426	TransferPredicateLocksToHeapRelation(OldHeap);
427
428	/ rebuild_relation does all the dirty work /
429	rebuild_relation(OldHeap, indexOid, verbose);
430
431	/ NB: rebuild_relation does table_close() on OldHeap /
432
433	pgstat_progress_end_command();
434	}
435
436	/*
437	* Verify that the specified heap and index are valid to cluster on
438	*
439	* Side effect: obtains lock on the index. The caller may
440	* in some cases already have AccessExclusiveLock on the table, but
441	* not in all cases so we can't rely on the table-level lock for
442	* protection here.
443	*/
444	void
445	check_index_is_clusterable(Relation OldHeap, Oid indexOid, bool recheck, LOCKMODE lockmode)
446	{
447	Relation OldIndex;
448
449	OldIndex = index_open(indexOid, lockmode);
450
451	/*
452	* Check that index is in fact an index on the given relation
453	*/
454	if (OldIndex->rd_index == NULL \|\|
455	OldIndex->rd_index->indrelid != RelationGetRelid(OldHeap))
456	ereport(ERROR,
457	(errcode(ERRCODE_WRONG_OBJECT_TYPE),
458	errmsg("\"%s\" is not an index for table \"%s\"",
459	RelationGetRelationName(OldIndex),
460	RelationGetRelationName(OldHeap))));
461
462	/ Index AM must allow clustering /
463	if (!OldIndex->rd_indam->amclusterable)
464	ereport(ERROR,
465	(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
466	errmsg("cannot cluster on index \"%s\" because access method does not support clustering",
467	RelationGetRelationName(OldIndex))));
468
469	/*
470	* Disallow clustering on incomplete indexes (those that might not index
471	* every row of the relation). We could relax this by making a separate
472	* seqscan pass over the table to copy the missing rows, but that seems
473	* expensive and tedious.
474	*/
475	if (!heap_attisnull(OldIndex->rd_indextuple, Anum_pg_index_indpred, NULL))
476	ereport(ERROR,
477	(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
478	errmsg("cannot cluster on partial index \"%s\"",
479	RelationGetRelationName(OldIndex))));
480
481	/*
482	* Disallow if index is left over from a failed CREATE INDEX CONCURRENTLY;
483	* it might well not contain entries for every heap row, or might not even
484	* be internally consistent. (But note that we don't check indcheckxmin;
485	* the worst consequence of following broken HOT chains would be that we
486	* might put recently-dead tuples out-of-order in the new table, and there
487	* is little harm in that.)
488	*/
489	if (!OldIndex->rd_index->indisvalid)
490	ereport(ERROR,
491	(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
492	errmsg("cannot cluster on invalid index \"%s\"",
493	RelationGetRelationName(OldIndex))));
494
495	/ Drop relcache refcnt on OldIndex, but keep lock /
496	index_close(OldIndex, NoLock);
497	}
498
499	/*
500	* mark_index_clustered: mark the specified index as the one clustered on
501	*
502	* With indexOid == InvalidOid, will mark all indexes of rel not-clustered.
503	*/
504	void
505	mark_index_clustered(Relation rel, Oid indexOid, bool is_internal)
506	{
507	HeapTuple indexTuple;
508	Form_pg_index indexForm;
509	Relation pg_index;
510	ListCell *index;
511
512	/ Disallow applying to a partitioned table /
513	if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
514	ereport(ERROR,
515	(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
516	errmsg("cannot mark index clustered in partitioned table")));
517
518	/*
519	* If the index is already marked clustered, no need to do anything.
520	*/
521	if (OidIsValid(indexOid))
522	{
523	indexTuple = SearchSysCache1(INDEXRELID, ObjectIdGetDatum(indexOid));
524	if (!HeapTupleIsValid(indexTuple))
525	elog(ERROR, "cache lookup failed for index %u", indexOid);
526	indexForm = (Form_pg_index) GETSTRUCT(indexTuple);
527
528	if (indexForm->indisclustered)
529	{
530	ReleaseSysCache(indexTuple);
531	return;
532	}
533
534	ReleaseSysCache(indexTuple);
535	}
536
537	/*
538	* Check each index of the relation and set/clear the bit as needed.
539	*/
540	pg_index = table_open(IndexRelationId, RowExclusiveLock);
541
542	foreach(index, RelationGetIndexList(rel))
543	{
544	Oid thisIndexOid = lfirst_oid(index);
545
546	indexTuple = SearchSysCacheCopy1(INDEXRELID,
547	ObjectIdGetDatum(thisIndexOid));
548	if (!HeapTupleIsValid(indexTuple))
549	elog(ERROR, "cache lookup failed for index %u", thisIndexOid);
550	indexForm = (Form_pg_index) GETSTRUCT(indexTuple);
551
552	/*
553	* Unset the bit if set. We know it's wrong because we checked this
554	* earlier.
555	*/
556	if (indexForm->indisclustered)
557	{
558	indexForm->indisclustered = false;
559	CatalogTupleUpdate(pg_index, &indexTuple->t_self, indexTuple);
560	}
561	else if (thisIndexOid == indexOid)
562	{
563	/ this was checked earlier, but let's be real sure /
564	if (!indexForm->indisvalid)
565	elog(ERROR, "cannot cluster on invalid index %u", indexOid);
566	indexForm->indisclustered = true;
567	CatalogTupleUpdate(pg_index, &indexTuple->t_self, indexTuple);
568	}
569
570	InvokeObjectPostAlterHookArg(IndexRelationId, thisIndexOid, `0`,
571	InvalidOid, is_internal);
572
573	heap_freetuple(indexTuple);
574	}
575
576	table_close(pg_index, RowExclusiveLock);
577	}
578
579	/*
580	* rebuild_relation: rebuild an existing relation in index or physical order
581	*
582	* OldHeap: table to rebuild --- must be opened and exclusive-locked!
583	* indexOid: index to cluster by, or InvalidOid to rewrite in physical order.
584	*
585	* NB: this routine closes OldHeap at the right time; caller should not.
586	*/
587	static void
588	rebuild_relation(Relation OldHeap, Oid indexOid, bool verbose)
589	{
590	Oid tableOid = RelationGetRelid(OldHeap);
591	Oid tableSpace = OldHeap->rd_rel->reltablespace;
592	Oid OIDNewHeap;
593	char relpersistence;
594	bool is_system_catalog;
595	bool swap_toast_by_content;
596	TransactionId frozenXid;
597	MultiXactId cutoffMulti;
598
599	/ Mark the correct index as clustered /
600	if (OidIsValid(indexOid))
601	mark_index_clustered(OldHeap, indexOid, true);
602
603	/ Remember info about rel before closing OldHeap /
604	relpersistence = OldHeap->rd_rel->relpersistence;
605	is_system_catalog = IsSystemRelation(OldHeap);
606
607	/ Close relcache entry, but keep lock until transaction commit /
608	table_close(OldHeap, NoLock);
609
610	/ Create the transient table that will receive the re-ordered data /
611	OIDNewHeap = make_new_heap(tableOid, tableSpace,
612	relpersistence,
613	AccessExclusiveLock);
614
615	/ Copy the heap data into the new table in the desired order /
616	copy_table_data(OIDNewHeap, tableOid, indexOid, verbose,
617	&swap_toast_by_content, &frozenXid, &cutoffMulti);
618
619	/*
620	* Swap the physical files of the target and transient tables, then
621	* rebuild the target's indexes and throw away the transient table.
622	*/
623	finish_heap_swap(tableOid, OIDNewHeap, is_system_catalog,
624	swap_toast_by_content, false, true,
625	frozenXid, cutoffMulti,
626	relpersistence);
627	}
628
629
630	/*
631	* Create the transient table that will be filled with new data during
632	* CLUSTER, ALTER TABLE, and similar operations. The transient table
633	* duplicates the logical structure of the OldHeap, but is placed in
634	* NewTableSpace which might be different from OldHeap's. Also, it's built
635	* with the specified persistence, which might differ from the original's.
636	*
637	* After this, the caller should load the new heap with transferred/modified
638	* data, then call finish_heap_swap to complete the operation.
639	*/
640	Oid
641	make_new_heap(Oid OIDOldHeap, Oid NewTableSpace, char relpersistence,
642	LOCKMODE lockmode)
643	{
644	TupleDesc OldHeapDesc;
645	char NewHeapName[NAMEDATALEN];
646	Oid OIDNewHeap;
647	Oid toastid;
648	Relation OldHeap;
649	HeapTuple tuple;
650	Datum reloptions;
651	bool isNull;
652	Oid namespaceid;
653
654	OldHeap = table_open(OIDOldHeap, lockmode);
655	OldHeapDesc = RelationGetDescr(OldHeap);
656
657	/*
658	* Note that the NewHeap will not receive any of the defaults or
659	* constraints associated with the OldHeap; we don't need 'em, and there's
660	* no reason to spend cycles inserting them into the catalogs only to
661	* delete them.
662	*/
663
664	/*
665	* But we do want to use reloptions of the old heap for new heap.
666	*/
667	tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(OIDOldHeap));
668	if (!HeapTupleIsValid(tuple))
669	elog(ERROR, "cache lookup failed for relation %u", OIDOldHeap);
670	reloptions = SysCacheGetAttr(RELOID, tuple, Anum_pg_class_reloptions,
671	&isNull);
672	if (isNull)
673	reloptions = (Datum) `0`;
674
675	if (relpersistence == RELPERSISTENCE_TEMP)
676	namespaceid = LookupCreationNamespace("pg_temp");
677	else
678	namespaceid = RelationGetNamespace(OldHeap);
679
680	/*
681	* Create the new heap, using a temporary name in the same namespace as
682	* the existing table. NOTE: there is some risk of collision with user
683	* relnames. Working around this seems more trouble than it's worth; in
684	* particular, we can't create the new heap in a different namespace from
685	* the old, or we will have problems with the TEMP status of temp tables.
686	*
687	* Note: the new heap is not a shared relation, even if we are rebuilding
688	* a shared rel. However, we do make the new heap mapped if the source is
689	* mapped. This simplifies swap_relation_files, and is absolutely
690	* necessary for rebuilding pg_class, for reasons explained there.
691	*/
692	snprintf(NewHeapName, sizeof(NewHeapName), "pg_temp_%u", OIDOldHeap);
693
694	OIDNewHeap = heap_create_with_catalog(NewHeapName,
695	namespaceid,
696	NewTableSpace,
697	InvalidOid,
698	InvalidOid,
699	InvalidOid,
700	OldHeap->rd_rel->relowner,
701	OldHeap->rd_rel->relam,
702	OldHeapDesc,
703	NIL,
704	RELKIND_RELATION,
705	relpersistence,
706	false,
707	RelationIsMapped(OldHeap),
708	ONCOMMIT_NOOP,
709	reloptions,
710	false,
711	true,
712	true,
713	OIDOldHeap,
714	NULL);
715	Assert(OIDNewHeap != InvalidOid);
716
717	ReleaseSysCache(tuple);
718
719	/*
720	* Advance command counter so that the newly-created relation's catalog
721	* tuples will be visible to table_open.
722	*/
723	CommandCounterIncrement();
724
725	/*
726	* If necessary, create a TOAST table for the new relation.
727	*
728	* If the relation doesn't have a TOAST table already, we can't need one
729	* for the new relation. The other way around is possible though: if some
730	* wide columns have been dropped, NewHeapCreateToastTable can decide that
731	* no TOAST table is needed for the new table.
732	*
733	* Note that NewHeapCreateToastTable ends with CommandCounterIncrement, so
734	* that the TOAST table will be visible for insertion.
735	*/
736	toastid = OldHeap->rd_rel->reltoastrelid;
737	if (OidIsValid(toastid))
738	{
739	/ keep the existing toast table's reloptions, if any /
740	tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(toastid));
741	if (!HeapTupleIsValid(tuple))
742	elog(ERROR, "cache lookup failed for relation %u", toastid);
743	reloptions = SysCacheGetAttr(RELOID, tuple, Anum_pg_class_reloptions,
744	&isNull);
745	if (isNull)
746	reloptions = (Datum) `0`;
747
748	NewHeapCreateToastTable(OIDNewHeap, reloptions, lockmode);
749
750	ReleaseSysCache(tuple);
751	}
752
753	table_close(OldHeap, NoLock);
754
755	return OIDNewHeap;
756	}
757
758	/*
759	* Do the physical copying of table data.
760	*
761	* There are three output parameters:
762	* *pSwapToastByContent is set true if toast tables must be swapped by content.
763	* *pFreezeXid receives the TransactionId used as freeze cutoff point.
764	* *pCutoffMulti receives the MultiXactId used as a cutoff point.
765	*/
766	static void
767	copy_table_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose,
768	bool pSwapToastByContent, TransactionId pFreezeXid,
769	MultiXactId *pCutoffMulti)
770	{
771	Relation NewHeap,
772	OldHeap,
773	OldIndex;
774	Relation relRelation;
775	HeapTuple reltup;
776	Form_pg_class relform;
777	TupleDesc oldTupDesc PG_USED_FOR_ASSERTS_ONLY;
778	TupleDesc newTupDesc PG_USED_FOR_ASSERTS_ONLY;
779	TransactionId OldestXmin;
780	TransactionId FreezeXid;
781	MultiXactId MultiXactCutoff;
782	bool use_sort;
783	double num_tuples = `0`,
784	tups_vacuumed = `0`,
785	tups_recently_dead = `0`;
786	BlockNumber num_pages;
787	int elevel = verbose ? INFO : DEBUG2;
788	PGRUsage ru0;
789
790	pg_rusage_init(&ru0);
791
792	/*
793	* Open the relations we need.
794	*/
795	NewHeap = table_open(OIDNewHeap, AccessExclusiveLock);
796	OldHeap = table_open(OIDOldHeap, AccessExclusiveLock);
797	if (OidIsValid(OIDOldIndex))
798	OldIndex = index_open(OIDOldIndex, AccessExclusiveLock);
799	else
800	OldIndex = NULL;
801
802	/*
803	* Their tuple descriptors should be exactly alike, but here we only need
804	* assume that they have the same number of columns.
805	*/
806	oldTupDesc = RelationGetDescr(OldHeap);
807	newTupDesc = RelationGetDescr(NewHeap);
808	Assert(newTupDesc->natts == oldTupDesc->natts);
809
810	/*
811	* If the OldHeap has a toast table, get lock on the toast table to keep
812	* it from being vacuumed. This is needed because autovacuum processes
813	* toast tables independently of their main tables, with no lock on the
814	* latter. If an autovacuum were to start on the toast table after we
815	* compute our OldestXmin below, it would use a later OldestXmin, and then
816	* possibly remove as DEAD toast tuples belonging to main tuples we think
817	* are only RECENTLY_DEAD. Then we'd fail while trying to copy those
818	* tuples.
819	*
820	* We don't need to open the toast relation here, just lock it. The lock
821	* will be held till end of transaction.
822	*/
823	if (OldHeap->rd_rel->reltoastrelid)
824	LockRelationOid(OldHeap->rd_rel->reltoastrelid, AccessExclusiveLock);
825
826	/*
827	* If both tables have TOAST tables, perform toast swap by content. It is
828	* possible that the old table has a toast table but the new one doesn't,
829	* if toastable columns have been dropped. In that case we have to do
830	* swap by links. This is okay because swap by content is only essential
831	* for system catalogs, and we don't support schema changes for them.
832	*/
833	if (OldHeap->rd_rel->reltoastrelid && NewHeap->rd_rel->reltoastrelid)
834	{
835	*pSwapToastByContent = true;
836
837	/*
838	* When doing swap by content, any toast pointers written into NewHeap
839	* must use the old toast table's OID, because that's where the toast
840	* data will eventually be found. Set this up by setting rd_toastoid.
841	* This also tells toast_save_datum() to preserve the toast value
842	* OIDs, which we want so as not to invalidate toast pointers in
843	* system catalog caches, and to avoid making multiple copies of a
844	* single toast value.
845	*
846	* Note that we must hold NewHeap open until we are done writing data,
847	* since the relcache will not guarantee to remember this setting once
848	* the relation is closed. Also, this technique depends on the fact
849	* that no one will try to read from the NewHeap until after we've
850	* finished writing it and swapping the rels --- otherwise they could
851	* follow the toast pointers to the wrong place. (It would actually
852	* work for values copied over from the old toast table, but not for
853	* any values that we toast which were previously not toasted.)
854	*/
855	NewHeap->rd_toastoid = OldHeap->rd_rel->reltoastrelid;
856	}
857	else
858	*pSwapToastByContent = false;
859
860	/*
861	* Compute xids used to freeze and weed out dead tuples and multixacts.
862	* Since we're going to rewrite the whole table anyway, there's no reason
863	* not to be aggressive about this.
864	*/
865	vacuum_set_xid_limits(OldHeap, `0`, `0`, `0`, `0`,
866	&OldestXmin, &FreezeXid, NULL, &MultiXactCutoff,
867	NULL);
868
869	/*
870	* FreezeXid will become the table's new relfrozenxid, and that mustn't go
871	* backwards, so take the max.
872	*/
873	if (TransactionIdIsValid(OldHeap->rd_rel->relfrozenxid) &&
874	TransactionIdPrecedes(FreezeXid, OldHeap->rd_rel->relfrozenxid))
875	FreezeXid = OldHeap->rd_rel->relfrozenxid;
876
877	/*
878	* MultiXactCutoff, similarly, shouldn't go backwards either.
879	*/
880	if (MultiXactIdIsValid(OldHeap->rd_rel->relminmxid) &&
881	MultiXactIdPrecedes(MultiXactCutoff, OldHeap->rd_rel->relminmxid))
882	MultiXactCutoff = OldHeap->rd_rel->relminmxid;
883
884	/*
885	* Decide whether to use an indexscan or seqscan-and-optional-sort to scan
886	* the OldHeap. We know how to use a sort to duplicate the ordering of a
887	* btree index, and will use seqscan-and-sort for that case if the planner
888	* tells us it's cheaper. Otherwise, always indexscan if an index is
889	* provided, else plain seqscan.
890	*/
891	if (OldIndex != NULL && OldIndex->rd_rel->relam == BTREE_AM_OID)
892	use_sort = plan_cluster_use_sort(OIDOldHeap, OIDOldIndex);
893	else
894	use_sort = false;
895
896	/ Log what we're doing /
897	if (OldIndex != NULL && !use_sort)
898	ereport(elevel,
899	(errmsg("clustering \"%s.%s\" using index scan on \"%s\"",
900	get_namespace_name(RelationGetNamespace(OldHeap)),
901	RelationGetRelationName(OldHeap),
902	RelationGetRelationName(OldIndex))));
903	else if (use_sort)
904	ereport(elevel,
905	(errmsg("clustering \"%s.%s\" using sequential scan and sort",
906	get_namespace_name(RelationGetNamespace(OldHeap)),
907	RelationGetRelationName(OldHeap))));
908	else
909	ereport(elevel,
910	(errmsg("vacuuming \"%s.%s\"",
911	get_namespace_name(RelationGetNamespace(OldHeap)),
912	RelationGetRelationName(OldHeap))));
913
914	/*
915	* Hand of the actual copying to AM specific function, the generic code
916	* cannot know how to deal with visibility across AMs. Note that this
917	* routine is allowed to set FreezeXid / MultiXactCutoff to different
918	* values (e.g. because the AM doesn't use freezing).
919	*/
920	table_relation_copy_for_cluster(OldHeap, NewHeap, OldIndex, use_sort,
921	OldestXmin, &FreezeXid, &MultiXactCutoff,
922	&num_tuples, &tups_vacuumed,
923	&tups_recently_dead);
924
925	/ return selected values to caller, get set as relfrozenxid/minmxid /
926	*pFreezeXid = FreezeXid;
927	*pCutoffMulti = MultiXactCutoff;
928
929	/ Reset rd_toastoid just to be tidy --- it shouldn't be looked at again /
930	NewHeap->rd_toastoid = InvalidOid;
931
932	num_pages = RelationGetNumberOfBlocks(NewHeap);
933
934	/ Log what we did /
935	ereport(elevel,
936	(errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u pages",
937	RelationGetRelationName(OldHeap),
938	tups_vacuumed, num_tuples,
939	RelationGetNumberOfBlocks(OldHeap)),
940	errdetail("%.0f dead row versions cannot be removed yet.\n"
941	"%s.",
942	tups_recently_dead,
943	pg_rusage_show(&ru0))));
944
945	if (OldIndex != NULL)
946	index_close(OldIndex, NoLock);
947	table_close(OldHeap, NoLock);
948	table_close(NewHeap, NoLock);
949
950	/ Update pg_class to reflect the correct values of pages and tuples. /
951	relRelation = table_open(RelationRelationId, RowExclusiveLock);
952
953	reltup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(OIDNewHeap));
954	if (!HeapTupleIsValid(reltup))
955	elog(ERROR, "cache lookup failed for relation %u", OIDNewHeap);
956	relform = (Form_pg_class) GETSTRUCT(reltup);
957
958	relform->relpages = num_pages;
959	relform->reltuples = num_tuples;
960
961	/ Don't update the stats for pg_class. See swap_relation_files. /
962	if (OIDOldHeap != RelationRelationId)
963	CatalogTupleUpdate(relRelation, &reltup->t_self, reltup);
964	else
965	CacheInvalidateRelcacheByTuple(reltup);
966
967	/ Clean up. /
968	heap_freetuple(reltup);
969	table_close(relRelation, RowExclusiveLock);
970
971	/ Make the update visible /
972	CommandCounterIncrement();
973	}
974
975	/*
976	* Swap the physical files of two given relations.
977	*
978	* We swap the physical identity (reltablespace, relfilenode) while keeping the
979	* same logical identities of the two relations. relpersistence is also
980	* swapped, which is critical since it determines where buffers live for each
981	* relation.
982	*
983	* We can swap associated TOAST data in either of two ways: recursively swap
984	* the physical content of the toast tables (and their indexes), or swap the
985	* TOAST links in the given relations' pg_class entries. The former is needed
986	* to manage rewrites of shared catalogs (where we cannot change the pg_class
987	* links) while the latter is the only way to handle cases in which a toast
988	* table is added or removed altogether.
989	*
990	* Additionally, the first relation is marked with relfrozenxid set to
991	* frozenXid. It seems a bit ugly to have this here, but the caller would
992	* have to do it anyway, so having it here saves a heap_update. Note: in
993	* the swap-toast-links case, we assume we don't need to change the toast
994	* table's relfrozenxid: the new version of the toast table should already
995	* have relfrozenxid set to RecentXmin, which is good enough.
996	*
997	* Lastly, if r2 and its toast table and toast index (if any) are mapped,
998	* their OIDs are emitted into mapped_tables[]. This is hacky but beats
999	* having to look the information up again later in finish_heap_swap.
1000	*/
1001	static void
1002	swap_relation_files(Oid r1, Oid r2, bool target_is_pg_class,
1003	bool swap_toast_by_content,
1004	bool is_internal,
1005	TransactionId frozenXid,
1006	MultiXactId cutoffMulti,
1007	Oid *mapped_tables)
1008	{
1009	Relation relRelation;
1010	HeapTuple reltup1,
1011	reltup2;
1012	Form_pg_class relform1,
1013	relform2;
1014	Oid relfilenode1,
1015	relfilenode2;
1016	Oid swaptemp;
1017	char swptmpchr;
1018
1019	/ We need writable copies of both pg_class tuples. /
1020	relRelation = table_open(RelationRelationId, RowExclusiveLock);
1021
1022	reltup1 = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(r1));
1023	if (!HeapTupleIsValid(reltup1))
1024	elog(ERROR, "cache lookup failed for relation %u", r1);
1025	relform1 = (Form_pg_class) GETSTRUCT(reltup1);
1026
1027	reltup2 = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(r2));
1028	if (!HeapTupleIsValid(reltup2))
1029	elog(ERROR, "cache lookup failed for relation %u", r2);
1030	relform2 = (Form_pg_class) GETSTRUCT(reltup2);
1031
1032	relfilenode1 = relform1->relfilenode;
1033	relfilenode2 = relform2->relfilenode;
1034
1035	if (OidIsValid(relfilenode1) && OidIsValid(relfilenode2))
1036	{
1037	/*
1038	* Normal non-mapped relations: swap relfilenodes, reltablespaces,
1039	* relpersistence
1040	*/
1041	Assert(!target_is_pg_class);
1042
1043	swaptemp = relform1->relfilenode;
1044	relform1->relfilenode = relform2->relfilenode;
1045	relform2->relfilenode = swaptemp;
1046
1047	swaptemp = relform1->reltablespace;
1048	relform1->reltablespace = relform2->reltablespace;
1049	relform2->reltablespace = swaptemp;
1050
1051	swptmpchr = relform1->relpersistence;
1052	relform1->relpersistence = relform2->relpersistence;
1053	relform2->relpersistence = swptmpchr;
1054
1055	/ Also swap toast links, if we're swapping by links /
1056	if (!swap_toast_by_content)
1057	{
1058	swaptemp = relform1->reltoastrelid;
1059	relform1->reltoastrelid = relform2->reltoastrelid;
1060	relform2->reltoastrelid = swaptemp;
1061	}
1062	}
1063	else
1064	{
1065	/*
1066	* Mapped-relation case. Here we have to swap the relation mappings
1067	* instead of modifying the pg_class columns. Both must be mapped.
1068	*/
1069	if (OidIsValid(relfilenode1) \|\| OidIsValid(relfilenode2))
1070	elog(ERROR, "cannot swap mapped relation \"%s\" with non-mapped relation",
1071	NameStr(relform1->relname));
1072
1073	/*
1074	* We can't change the tablespace nor persistence of a mapped rel, and
1075	* we can't handle toast link swapping for one either, because we must
1076	* not apply any critical changes to its pg_class row. These cases
1077	* should be prevented by upstream permissions tests, so these checks
1078	* are non-user-facing emergency backstop.
1079	*/
1080	if (relform1->reltablespace != relform2->reltablespace)
1081	elog(ERROR, "cannot change tablespace of mapped relation \"%s\"",
1082	NameStr(relform1->relname));
1083	if (relform1->relpersistence != relform2->relpersistence)
1084	elog(ERROR, "cannot change persistence of mapped relation \"%s\"",
1085	NameStr(relform1->relname));
1086	if (!swap_toast_by_content &&
1087	(relform1->reltoastrelid \|\| relform2->reltoastrelid))
1088	elog(ERROR, "cannot swap toast by links for mapped relation \"%s\"",
1089	NameStr(relform1->relname));
1090
1091	/*
1092	* Fetch the mappings --- shouldn't fail, but be paranoid
1093	*/
1094	relfilenode1 = RelationMapOidToFilenode(r1, relform1->relisshared);
1095	if (!OidIsValid(relfilenode1))
1096	elog(ERROR, "could not find relation mapping for relation \"%s\", OID %u",
1097	NameStr(relform1->relname), r1);
1098	relfilenode2 = RelationMapOidToFilenode(r2, relform2->relisshared);
1099	if (!OidIsValid(relfilenode2))
1100	elog(ERROR, "could not find relation mapping for relation \"%s\", OID %u",
1101	NameStr(relform2->relname), r2);
1102
1103	/*
1104	* Send replacement mappings to relmapper. Note these won't actually
1105	* take effect until CommandCounterIncrement.
1106	*/
1107	RelationMapUpdateMap(r1, relfilenode2, relform1->relisshared, false);
1108	RelationMapUpdateMap(r2, relfilenode1, relform2->relisshared, false);
1109
1110	/ Pass OIDs of mapped r2 tables back to caller /
1111	*mapped_tables++ = r2;
1112	}
1113
1114	/*
1115	* In the case of a shared catalog, these next few steps will only affect
1116	* our own database's pg_class row; but that's okay, because they are all
1117	* noncritical updates. That's also an important fact for the case of a
1118	* mapped catalog, because it's possible that we'll commit the map change
1119	* and then fail to commit the pg_class update.
1120	*/
1121
1122	/ set rel1's frozen Xid and minimum MultiXid /
1123	if (relform1->relkind != RELKIND_INDEX)
1124	{
1125	Assert(!TransactionIdIsValid(frozenXid) \|\|
1126	TransactionIdIsNormal(frozenXid));
1127	relform1->relfrozenxid = frozenXid;
1128	relform1->relminmxid = cutoffMulti;
1129	}
1130
1131	/ swap size statistics too, since new rel has freshly-updated stats /
1132	{
1133	int32 swap_pages;
1134	float4 swap_tuples;
1135	int32 swap_allvisible;
1136
1137	swap_pages = relform1->relpages;
1138	relform1->relpages = relform2->relpages;
1139	relform2->relpages = swap_pages;
1140
1141	swap_tuples = relform1->reltuples;
1142	relform1->reltuples = relform2->reltuples;
1143	relform2->reltuples = swap_tuples;
1144
1145	swap_allvisible = relform1->relallvisible;
1146	relform1->relallvisible = relform2->relallvisible;
1147	relform2->relallvisible = swap_allvisible;
1148	}
1149
1150	/*
1151	* Update the tuples in pg_class --- unless the target relation of the
1152	* swap is pg_class itself. In that case, there is zero point in making
1153	* changes because we'd be updating the old data that we're about to throw
1154	* away. Because the real work being done here for a mapped relation is
1155	* just to change the relation map settings, it's all right to not update
1156	* the pg_class rows in this case. The most important changes will instead
1157	* performed later, in finish_heap_swap() itself.
1158	*/
1159	if (!target_is_pg_class)
1160	{
1161	CatalogIndexState indstate;
1162
1163	indstate = CatalogOpenIndexes(relRelation);
1164	CatalogTupleUpdateWithInfo(relRelation, &reltup1->t_self, reltup1,
1165	indstate);
1166	CatalogTupleUpdateWithInfo(relRelation, &reltup2->t_self, reltup2,
1167	indstate);
1168	CatalogCloseIndexes(indstate);
1169	}
1170	else
1171	{
1172	/ no update ... but we do still need relcache inval /
1173	CacheInvalidateRelcacheByTuple(reltup1);
1174	CacheInvalidateRelcacheByTuple(reltup2);
1175	}
1176
1177	/*
1178	* Post alter hook for modified relations. The change to r2 is always
1179	* internal, but r1 depends on the invocation context.
1180	*/
1181	InvokeObjectPostAlterHookArg(RelationRelationId, r1, `0`,
1182	InvalidOid, is_internal);
1183	InvokeObjectPostAlterHookArg(RelationRelationId, r2, `0`,
1184	InvalidOid, true);
1185
1186	/*
1187	* If we have toast tables associated with the relations being swapped,
1188	* deal with them too.
1189	*/
1190	if (relform1->reltoastrelid \|\| relform2->reltoastrelid)
1191	{
1192	if (swap_toast_by_content)
1193	{
1194	if (relform1->reltoastrelid && relform2->reltoastrelid)
1195	{
1196	/ Recursively swap the contents of the toast tables /
1197	swap_relation_files(relform1->reltoastrelid,
1198	relform2->reltoastrelid,
1199	target_is_pg_class,
1200	swap_toast_by_content,
1201	is_internal,
1202	frozenXid,
1203	cutoffMulti,
1204	mapped_tables);
1205	}
1206	else
1207	{
1208	/ caller messed up /
1209	elog(ERROR, "cannot swap toast files by content when there's only one");
1210	}
1211	}
1212	else
1213	{
1214	/*
1215	* We swapped the ownership links, so we need to change dependency
1216	* data to match.
1217	*
1218	* NOTE: it is possible that only one table has a toast table.
1219	*
1220	* NOTE: at present, a TOAST table's only dependency is the one on
1221	* its owning table. If more are ever created, we'd need to use
1222	* something more selective than deleteDependencyRecordsFor() to
1223	* get rid of just the link we want.
1224	*/
1225	ObjectAddress baseobject,
1226	toastobject;
1227	long count;
1228
1229	/*
1230	* We disallow this case for system catalogs, to avoid the
1231	* possibility that the catalog we're rebuilding is one of the
1232	* ones the dependency changes would change. It's too late to be
1233	* making any data changes to the target catalog.
1234	*/
1235	if (IsSystemClass(r1, relform1))
1236	elog(ERROR, "cannot swap toast files by links for system catalogs");
1237
1238	/ Delete old dependencies /
1239	if (relform1->reltoastrelid)
1240	{
1241	count = deleteDependencyRecordsFor(RelationRelationId,
1242	relform1->reltoastrelid,
1243	false);
1244	if (count != `1`)
1245	elog(ERROR, "expected one dependency record for TOAST table, found %ld",
1246	count);
1247	}
1248	if (relform2->reltoastrelid)
1249	{
1250	count = deleteDependencyRecordsFor(RelationRelationId,
1251	relform2->reltoastrelid,
1252	false);
1253	if (count != `1`)
1254	elog(ERROR, "expected one dependency record for TOAST table, found %ld",
1255	count);
1256	}
1257
1258	/ Register new dependencies /
1259	baseobject.classId = RelationRelationId;
1260	baseobject.objectSubId = `0`;
1261	toastobject.classId = RelationRelationId;
1262	toastobject.objectSubId = `0`;
1263
1264	if (relform1->reltoastrelid)
1265	{
1266	baseobject.objectId = r1;
1267	toastobject.objectId = relform1->reltoastrelid;
1268	recordDependencyOn(&toastobject, &baseobject,
1269	DEPENDENCY_INTERNAL);
1270	}
1271
1272	if (relform2->reltoastrelid)
1273	{
1274	baseobject.objectId = r2;
1275	toastobject.objectId = relform2->reltoastrelid;
1276	recordDependencyOn(&toastobject, &baseobject,
1277	DEPENDENCY_INTERNAL);
1278	}
1279	}
1280	}
1281
1282	/*
1283	* If we're swapping two toast tables by content, do the same for their
1284	* valid index. The swap can actually be safely done only if the relations
1285	* have indexes.
1286	*/
1287	if (swap_toast_by_content &&
1288	relform1->relkind == RELKIND_TOASTVALUE &&
1289	relform2->relkind == RELKIND_TOASTVALUE)
1290	{
1291	Oid toastIndex1,
1292	toastIndex2;
1293
1294	/ Get valid index for each relation /
1295	toastIndex1 = toast_get_valid_index(r1,
1296	AccessExclusiveLock);
1297	toastIndex2 = toast_get_valid_index(r2,
1298	AccessExclusiveLock);
1299
1300	swap_relation_files(toastIndex1,
1301	toastIndex2,
1302	target_is_pg_class,
1303	swap_toast_by_content,
1304	is_internal,
1305	InvalidTransactionId,
1306	InvalidMultiXactId,
1307	mapped_tables);
1308	}
1309
1310	/ Clean up. /
1311	heap_freetuple(reltup1);
1312	heap_freetuple(reltup2);
1313
1314	table_close(relRelation, RowExclusiveLock);
1315
1316	/*
1317	* Close both relcache entries' smgr links. We need this kluge because
1318	* both links will be invalidated during upcoming CommandCounterIncrement.
1319	* Whichever of the rels is the second to be cleared will have a dangling
1320	* reference to the other's smgr entry. Rather than trying to avoid this
1321	* by ordering operations just so, it's easiest to close the links first.
1322	* (Fortunately, since one of the entries is local in our transaction,
1323	* it's sufficient to clear out our own relcache this way; the problem
1324	* cannot arise for other backends when they see our update on the
1325	* non-transient relation.)
1326	*
1327	* Caution: the placement of this step interacts with the decision to
1328	* handle toast rels by recursion. When we are trying to rebuild pg_class
1329	* itself, the smgr close on pg_class must happen after all accesses in
1330	* this function.
1331	*/
1332	RelationCloseSmgrByOid(r1);
1333	RelationCloseSmgrByOid(r2);
1334	}
1335
1336	/*
1337	* Remove the transient table that was built by make_new_heap, and finish
1338	* cleaning up (including rebuilding all indexes on the old heap).
1339	*/
1340	void
1341	finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap,
1342	bool is_system_catalog,
1343	bool swap_toast_by_content,
1344	bool check_constraints,
1345	bool is_internal,
1346	TransactionId frozenXid,
1347	MultiXactId cutoffMulti,
1348	char newrelpersistence)
1349	{
1350	ObjectAddress object;
1351	Oid mapped_tables[`4`];
1352	int reindex_flags;
1353	int i;
1354
1355	/ Report that we are now swapping relation files /
1356	pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
1357	PROGRESS_CLUSTER_PHASE_SWAP_REL_FILES);
1358
1359	/ Zero out possible results from swapped_relation_files /
1360	memset(mapped_tables, `0`, sizeof(mapped_tables));
1361
1362	/*
1363	* Swap the contents of the heap relations (including any toast tables).
1364	* Also set old heap's relfrozenxid to frozenXid.
1365	*/
1366	swap_relation_files(OIDOldHeap, OIDNewHeap,
1367	(OIDOldHeap == RelationRelationId),
1368	swap_toast_by_content, is_internal,
1369	frozenXid, cutoffMulti, mapped_tables);
1370
1371	/*
1372	* If it's a system catalog, queue a sinval message to flush all catcaches
1373	* on the catalog when we reach CommandCounterIncrement.
1374	*/
1375	if (is_system_catalog)
1376	CacheInvalidateCatalog(OIDOldHeap);
1377
1378	/*
1379	* Rebuild each index on the relation (but not the toast table, which is
1380	* all-new at this point). It is important to do this before the DROP
1381	* step because if we are processing a system catalog that will be used
1382	* during DROP, we want to have its indexes available. There is no
1383	* advantage to the other order anyway because this is all transactional,
1384	* so no chance to reclaim disk space before commit. We do not need a
1385	* final CommandCounterIncrement() because reindex_relation does it.
1386	*
1387	* Note: because index_build is called via reindex_relation, it will never
1388	* set indcheckxmin true for the indexes. This is OK even though in some
1389	* sense we are building new indexes rather than rebuilding existing ones,
1390	* because the new heap won't contain any HOT chains at all, let alone
1391	* broken ones, so it can't be necessary to set indcheckxmin.
1392	*/
1393	reindex_flags = REINDEX_REL_SUPPRESS_INDEX_USE;
1394	if (check_constraints)
1395	reindex_flags \|= REINDEX_REL_CHECK_CONSTRAINTS;
1396
1397	/*
1398	* Ensure that the indexes have the same persistence as the parent
1399	* relation.
1400	*/
1401	if (newrelpersistence == RELPERSISTENCE_UNLOGGED)
1402	reindex_flags \|= REINDEX_REL_FORCE_INDEXES_UNLOGGED;
1403	else if (newrelpersistence == RELPERSISTENCE_PERMANENT)
1404	reindex_flags \|= REINDEX_REL_FORCE_INDEXES_PERMANENT;
1405
1406	/ Report that we are now reindexing relations /
1407	pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
1408	PROGRESS_CLUSTER_PHASE_REBUILD_INDEX);
1409
1410	reindex_relation(OIDOldHeap, reindex_flags, `0`);
1411
1412	/ Report that we are now doing clean up /
1413	pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
1414	PROGRESS_CLUSTER_PHASE_FINAL_CLEANUP);
1415
1416	/*
1417	* If the relation being rebuild is pg_class, swap_relation_files()
1418	* couldn't update pg_class's own pg_class entry (check comments in
1419	* swap_relation_files()), thus relfrozenxid was not updated. That's
1420	* annoying because a potential reason for doing a VACUUM FULL is a
1421	* imminent or actual anti-wraparound shutdown. So, now that we can
1422	* access the new relation using its indices, update relfrozenxid.
1423	* pg_class doesn't have a toast relation, so we don't need to update the
1424	* corresponding toast relation. Not that there's little point moving all
1425	* relfrozenxid updates here since swap_relation_files() needs to write to
1426	* pg_class for non-mapped relations anyway.
1427	*/
1428	if (OIDOldHeap == RelationRelationId)
1429	{
1430	Relation relRelation;
1431	HeapTuple reltup;
1432	Form_pg_class relform;
1433
1434	relRelation = table_open(RelationRelationId, RowExclusiveLock);
1435
1436	reltup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(OIDOldHeap));
1437	if (!HeapTupleIsValid(reltup))
1438	elog(ERROR, "cache lookup failed for relation %u", OIDOldHeap);
1439	relform = (Form_pg_class) GETSTRUCT(reltup);
1440
1441	relform->relfrozenxid = frozenXid;
1442	relform->relminmxid = cutoffMulti;
1443
1444	CatalogTupleUpdate(relRelation, &reltup->t_self, reltup);
1445
1446	table_close(relRelation, RowExclusiveLock);
1447	}
1448
1449	/ Destroy new heap with old filenode /
1450	object.classId = RelationRelationId;
1451	object.objectId = OIDNewHeap;
1452	object.objectSubId = `0`;
1453
1454	/*
1455	* The new relation is local to our transaction and we know nothing
1456	* depends on it, so DROP_RESTRICT should be OK.
1457	*/
1458	performDeletion(&object, DROP_RESTRICT, PERFORM_DELETION_INTERNAL);
1459
1460	/ performDeletion does CommandCounterIncrement at end /
1461
1462	/*
1463	* Now we must remove any relation mapping entries that we set up for the
1464	* transient table, as well as its toast table and toast index if any. If
1465	* we fail to do this before commit, the relmapper will complain about new
1466	* permanent map entries being added post-bootstrap.
1467	*/
1468	for (i = `0`; OidIsValid(mapped_tables[i]); i++)
1469	RelationMapRemoveMapping(mapped_tables[i]);
1470
1471	/*
1472	* At this point, everything is kosher except that, if we did toast swap
1473	* by links, the toast table's name corresponds to the transient table.
1474	* The name is irrelevant to the backend because it's referenced by OID,
1475	* but users looking at the catalogs could be confused. Rename it to
1476	* prevent this problem.
1477	*
1478	* Note no lock required on the relation, because we already hold an
1479	* exclusive lock on it.
1480	*/
1481	if (!swap_toast_by_content)
1482	{
1483	Relation newrel;
1484
1485	newrel = table_open(OIDOldHeap, NoLock);
1486	if (OidIsValid(newrel->rd_rel->reltoastrelid))
1487	{
1488	Oid toastidx;
1489	char NewToastName[NAMEDATALEN];
1490
1491	/ Get the associated valid index to be renamed /
1492	toastidx = toast_get_valid_index(newrel->rd_rel->reltoastrelid,
1493	AccessShareLock);
1494
1495	/ rename the toast table ... /
1496	snprintf(NewToastName, NAMEDATALEN, "pg_toast_%u",
1497	OIDOldHeap);
1498	RenameRelationInternal(newrel->rd_rel->reltoastrelid,
1499	NewToastName, true, false);
1500
1501	/ ... and its valid index too. /
1502	snprintf(NewToastName, NAMEDATALEN, "pg_toast_%u_index",
1503	OIDOldHeap);
1504
1505	RenameRelationInternal(toastidx,
1506	NewToastName, true, true);
1507	}
1508	relation_close(newrel, NoLock);
1509	}
1510
1511	/ if it's not a catalog table, clear any missing attribute settings /
1512	if (!is_system_catalog)
1513	{
1514	Relation newrel;
1515
1516	newrel = table_open(OIDOldHeap, NoLock);
1517	RelationClearMissing(newrel);
1518	relation_close(newrel, NoLock);
1519	}
1520	}
1521
1522
1523	/*
1524	* Get a list of tables that the current user owns and
1525	* have indisclustered set. Return the list in a List * of rvsToCluster
1526	* with the tableOid and the indexOid on which the table is already
1527	* clustered.
1528	*/
1529	static List *
1530	get_tables_to_cluster(MemoryContext cluster_context)
1531	{
1532	Relation indRelation;
1533	TableScanDesc scan;
1534	ScanKeyData entry;
1535	HeapTuple indexTuple;
1536	Form_pg_index index;
1537	MemoryContext old_context;
1538	RelToCluster *rvtc;
1539	List *rvs = NIL;
1540
1541	/*
1542	* Get all indexes that have indisclustered set and are owned by
1543	* appropriate user. System relations or nailed-in relations cannot ever
1544	* have indisclustered set, because CLUSTER will refuse to set it when
1545	* called with one of them as argument.
1546	*/
1547	indRelation = table_open(IndexRelationId, AccessShareLock);
1548	ScanKeyInit(&entry,
1549	Anum_pg_index_indisclustered,
1550	BTEqualStrategyNumber, F_BOOLEQ,
1551	BoolGetDatum(true));
1552	scan = table_beginscan_catalog(indRelation, `1`, &entry);
1553	while ((indexTuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
1554	{
1555	index = (Form_pg_index) GETSTRUCT(indexTuple);
1556
1557	if (!pg_class_ownercheck(index->indrelid, GetUserId()))
1558	continue;
1559
1560	/*
1561	* We have to build the list in a different memory context so it will
1562	* survive the cross-transaction processing
1563	*/
1564	old_context = MemoryContextSwitchTo(cluster_context);
1565
1566	rvtc = (RelToCluster ) palloc(sizeof*(RelToCluster));
1567	rvtc->tableOid = index->indrelid;
1568	rvtc->indexOid = index->indexrelid;
1569	rvs = lcons(rvtc, rvs);
1570
1571	MemoryContextSwitchTo(old_context);
1572	}
1573	table_endscan(scan);
1574
1575	relation_close(indRelation, AccessShareLock);
1576
1577	return rvs;
1578	}
1579

Browse the source code of PostgreSQL/src/backend/commands/cluster.c