/*-------------------------------------------------------------------------
 *
 * cluster.c
 *    CLUSTER a table on an index.  This is now also used for VACUUM FULL.
 *
 * There is hardly anything left of Paul Brown's original implementation...
 *
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994-5, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *    src/backend/commands/cluster.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/amapi.h"
#include "access/heapam.h"
#include "access/multixact.h"
#include "access/relscan.h"
#include "access/tableam.h"
#include "access/transam.h"
#include "access/tuptoaster.h"
#include "access/xact.h"
#include "access/xlog.h"
#include "catalog/pg_am.h"
#include "catalog/catalog.h"
#include "catalog/dependency.h"
#include "catalog/heap.h"
#include "catalog/index.h"
#include "catalog/namespace.h"
#include "catalog/objectaccess.h"
#include "catalog/toasting.h"
#include "commands/cluster.h"
#include "commands/progress.h"
#include "commands/tablecmds.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
#include "optimizer/optimizer.h"
#include "pgstat.h"
#include "storage/bufmgr.h"
#include "storage/lmgr.h"
#include "storage/predicate.h"
#include "utils/acl.h"
#include "utils/fmgroids.h"
#include "utils/inval.h"
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/pg_rusage.h"
#include "utils/relmapper.h"
#include "utils/snapmgr.h"
#include "utils/syscache.h"
#include "utils/tuplesort.h"


/*
 * This struct is used to pass around the information on tables to be
 * clustered.  We need this so we can make a list of them when invoked
 * without a specific table/index pair.
 */
typedef struct
{
    Oid         tableOid;
    Oid         indexOid;
} RelToCluster;


static void rebuild_relation(Relation OldHeap, Oid indexOid, bool verbose);
static void copy_table_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex,
                            bool verbose, bool *pSwapToastByContent,
                            TransactionId *pFreezeXid, MultiXactId *pCutoffMulti);
static List *get_tables_to_cluster(MemoryContext cluster_context);


/*---------------------------------------------------------------------------
 * This cluster code allows for clustering multiple tables at once.  Because
 * of this, we cannot just run everything on a single transaction, or we
 * would be forced to acquire exclusive locks on all the tables being
 * clustered, simultaneously --- very likely leading to deadlock.
 *
 * To solve this we follow a similar strategy to VACUUM code,
 * clustering each relation in a separate transaction.  For this to work,
 * we need to:
 *  - provide a separate memory context so that we can pass information in
 *    a way that survives across transactions
 *  - start a new transaction every time a new relation is clustered
 *  - check for validity of the information on to-be-clustered relations,
 *    as someone might have deleted a relation behind our back, or
 *    clustered one on a different index
 *  - end the transaction
 *
 * The single-relation case does not have any such overhead.
 *
 * We also allow a relation to be specified without index.  In that case,
 * the indisclustered bit will be looked up, and an ERROR will be thrown
 * if there is no index with the bit set.
 *---------------------------------------------------------------------------
 */
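/*
 * For orientation, a hedged sketch of the SQL forms that arrive here
 * (relation and index names are placeholders, not taken from this file):
 *
 *     CLUSTER;                       -- multi-relation form
 *     CLUSTER mytab;                 -- reuse the previously clustered index
 *     CLUSTER mytab USING myidx;     -- explicit table/index pair
 */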
void
cluster(ClusterStmt *stmt, bool isTopLevel)
{
    if (stmt->relation != NULL)
    {
        /* This is the single-relation case. */
        Oid         tableOid,
                    indexOid = InvalidOid;
        Relation    rel;

        /* Find, lock, and check permissions on the table */
        tableOid = RangeVarGetRelidExtended(stmt->relation,
                                            AccessExclusiveLock,
                                            0,
                                            RangeVarCallbackOwnsTable, NULL);
        rel = table_open(tableOid, NoLock);

        /*
         * Reject clustering a remote temp table ... their local buffer
         * manager is not going to cope.
         */
        if (RELATION_IS_OTHER_TEMP(rel))
            ereport(ERROR,
                    (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                     errmsg("cannot cluster temporary tables of other sessions")));

        /*
         * Reject clustering a partitioned table.
         */
        if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
            ereport(ERROR,
                    (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                     errmsg("cannot cluster a partitioned table")));

        if (stmt->indexname == NULL)
        {
            ListCell   *index;

            /* We need to find the index that has indisclustered set. */
            foreach(index, RelationGetIndexList(rel))
            {
                HeapTuple   idxtuple;
                Form_pg_index indexForm;

                indexOid = lfirst_oid(index);
                idxtuple = SearchSysCache1(INDEXRELID,
                                           ObjectIdGetDatum(indexOid));
                if (!HeapTupleIsValid(idxtuple))
                    elog(ERROR, "cache lookup failed for index %u", indexOid);
                indexForm = (Form_pg_index) GETSTRUCT(idxtuple);
                if (indexForm->indisclustered)
                {
                    ReleaseSysCache(idxtuple);
                    break;
                }
                ReleaseSysCache(idxtuple);
                indexOid = InvalidOid;
            }

            if (!OidIsValid(indexOid))
                ereport(ERROR,
                        (errcode(ERRCODE_UNDEFINED_OBJECT),
                         errmsg("there is no previously clustered index for table \"%s\"",
                                stmt->relation->relname)));
        }
        else
        {
            /*
             * The index is expected to be in the same namespace as the
             * relation.
             */
            indexOid = get_relname_relid(stmt->indexname,
                                         rel->rd_rel->relnamespace);
            if (!OidIsValid(indexOid))
                ereport(ERROR,
                        (errcode(ERRCODE_UNDEFINED_OBJECT),
                         errmsg("index \"%s\" for table \"%s\" does not exist",
                                stmt->indexname, stmt->relation->relname)));
        }

        /* close relation, keep lock till commit */
        table_close(rel, NoLock);

        /* Do the job. */
        cluster_rel(tableOid, indexOid, stmt->options);
    }
    else
    {
        /*
         * This is the "multi relation" case.  We need to cluster all tables
         * that have some index with indisclustered set.
         */
        MemoryContext cluster_context;
        List       *rvs;
        ListCell   *rv;

        /*
         * We cannot run this form of CLUSTER inside a user transaction block;
         * we'd be holding locks way too long.
         */
        PreventInTransactionBlock(isTopLevel, "CLUSTER");

        /*
         * Create special memory context for cross-transaction storage.
         *
         * Since it is a child of PortalContext, it will go away even in case
         * of error.
         */
        cluster_context = AllocSetContextCreate(PortalContext,
                                                "Cluster",
                                                ALLOCSET_DEFAULT_SIZES);

        /*
         * Build the list of relations to cluster.  Note that this lives in
         * cluster_context.
         */
        rvs = get_tables_to_cluster(cluster_context);

        /* Commit to get out of starting transaction */
        PopActiveSnapshot();
        CommitTransactionCommand();

        /* Ok, now that we've got them all, cluster them one by one */
        foreach(rv, rvs)
        {
            RelToCluster *rvtc = (RelToCluster *) lfirst(rv);

            /* Start a new transaction for each relation. */
            StartTransactionCommand();
            /* functions in indexes may want a snapshot set */
            PushActiveSnapshot(GetTransactionSnapshot());
            /* Do the job. */
            cluster_rel(rvtc->tableOid, rvtc->indexOid,
                        stmt->options | CLUOPT_RECHECK);
            PopActiveSnapshot();
            CommitTransactionCommand();
        }

        /* Start a new transaction for the cleanup work. */
        StartTransactionCommand();

        /* Clean up working storage */
        MemoryContextDelete(cluster_context);
    }
}

/*
 * cluster_rel
 *
 * This clusters the table by creating a new, clustered table and
 * swapping the relfilenodes of the new table and the old table, so
 * the OID of the original table is preserved.  Thus we do not lose
 * GRANT, inheritance nor references to this table (this was a bug
 * in releases through 7.3).
 *
 * Indexes are rebuilt too, via REINDEX.  Since we are effectively
 * bulk-loading the new table, it's better to create the indexes afterwards
 * than to fill them incrementally while we load the table.
 *
 * If indexOid is InvalidOid, the table will be rewritten in physical order
 * instead of index order.  This is the new implementation of VACUUM FULL,
 * and error messages should refer to the operation as VACUUM not CLUSTER.
 */
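/*
 * Hedged orientation note: besides the cluster() paths above, the VACUUM
 * FULL code reaches this function with an invalid index OID, conceptually
 *
 *     cluster_rel(relid, InvalidOid, options);
 *
 * which selects the rewrite-in-physical-order behavior described above.
 * The argument expressions are illustrative, not quoted from the caller.
 */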
void
cluster_rel(Oid tableOid, Oid indexOid, int options)
{
    Relation    OldHeap;
    bool        verbose = ((options & CLUOPT_VERBOSE) != 0);
    bool        recheck = ((options & CLUOPT_RECHECK) != 0);

    /* Check for user-requested abort. */
    CHECK_FOR_INTERRUPTS();

    pgstat_progress_start_command(PROGRESS_COMMAND_CLUSTER, tableOid);
    if (OidIsValid(indexOid))
        pgstat_progress_update_param(PROGRESS_CLUSTER_COMMAND,
                                     PROGRESS_CLUSTER_COMMAND_CLUSTER);
    else
        pgstat_progress_update_param(PROGRESS_CLUSTER_COMMAND,
                                     PROGRESS_CLUSTER_COMMAND_VACUUM_FULL);

    /*
     * We grab exclusive access to the target rel and index for the duration
     * of the transaction.  (This is redundant for the single-transaction
     * case, since cluster() already did it.)  The index lock is taken inside
     * check_index_is_clusterable.
     */
    OldHeap = try_relation_open(tableOid, AccessExclusiveLock);

    /* If the table has gone away, we can skip processing it */
    if (!OldHeap)
    {
        pgstat_progress_end_command();
        return;
    }

    /*
     * Since we may open a new transaction for each relation, we have to
     * check that the relation still is what we think it is.
     *
     * If this is a single-transaction CLUSTER, we can skip these tests.  We
     * *must* skip the one on indisclustered since it would reject an attempt
     * to cluster a not-previously-clustered index.
     */
    if (recheck)
    {
        HeapTuple   tuple;
        Form_pg_index indexForm;

        /* Check that the user still owns the relation */
        if (!pg_class_ownercheck(tableOid, GetUserId()))
        {
            relation_close(OldHeap, AccessExclusiveLock);
            pgstat_progress_end_command();
            return;
        }

        /*
         * Silently skip a temp table for a remote session.  Only doing this
         * check in the "recheck" case is appropriate (which currently means
         * somebody is executing a database-wide CLUSTER), because there is
         * another check in cluster() which will stop any attempt to cluster
         * remote temp tables by name.  There is another check in cluster_rel
         * which is redundant, but we leave it for extra safety.
         */
        if (RELATION_IS_OTHER_TEMP(OldHeap))
        {
            relation_close(OldHeap, AccessExclusiveLock);
            pgstat_progress_end_command();
            return;
        }

        if (OidIsValid(indexOid))
        {
            /*
             * Check that the index still exists
             */
            if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(indexOid)))
            {
                relation_close(OldHeap, AccessExclusiveLock);
                pgstat_progress_end_command();
                return;
            }

            /*
             * Check that the index is still the one with indisclustered set.
             */
            tuple = SearchSysCache1(INDEXRELID, ObjectIdGetDatum(indexOid));
            if (!HeapTupleIsValid(tuple))   /* probably can't happen */
            {
                relation_close(OldHeap, AccessExclusiveLock);
                pgstat_progress_end_command();
                return;
            }
            indexForm = (Form_pg_index) GETSTRUCT(tuple);
            if (!indexForm->indisclustered)
            {
                ReleaseSysCache(tuple);
                relation_close(OldHeap, AccessExclusiveLock);
                pgstat_progress_end_command();
                return;
            }
            ReleaseSysCache(tuple);
        }
    }

    /*
     * We allow VACUUM FULL, but not CLUSTER, on shared catalogs.  CLUSTER
     * would work in most respects, but the index would only get marked as
     * indisclustered in the current database, leading to unexpected behavior
     * if CLUSTER were later invoked in another database.
     */
    if (OidIsValid(indexOid) && OldHeap->rd_rel->relisshared)
        ereport(ERROR,
                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                 errmsg("cannot cluster a shared catalog")));

    /*
     * Don't process temp tables of other backends ... their local buffer
     * manager is not going to cope.
     */
    if (RELATION_IS_OTHER_TEMP(OldHeap))
    {
        if (OidIsValid(indexOid))
            ereport(ERROR,
                    (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                     errmsg("cannot cluster temporary tables of other sessions")));
        else
            ereport(ERROR,
                    (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                     errmsg("cannot vacuum temporary tables of other sessions")));
    }

    /*
     * Also check for active uses of the relation in the current transaction,
     * including open scans and pending AFTER trigger events.
     */
    CheckTableNotInUse(OldHeap, OidIsValid(indexOid) ? "CLUSTER" : "VACUUM");

    /* Check heap and index are valid to cluster on */
    if (OidIsValid(indexOid))
        check_index_is_clusterable(OldHeap, indexOid, recheck,
                                   AccessExclusiveLock);

    /*
     * Quietly ignore the request if this is a materialized view which has
     * not been populated from its query.  No harm is done because there is
     * no data to deal with, and we don't want to throw an error if this is
     * part of a multi-relation request -- for example, CLUSTER was run on
     * the entire database.
     */
    if (OldHeap->rd_rel->relkind == RELKIND_MATVIEW &&
        !RelationIsPopulated(OldHeap))
    {
        relation_close(OldHeap, AccessExclusiveLock);
        pgstat_progress_end_command();
        return;
    }

    /*
     * All predicate locks on the tuples or pages are about to be made
     * invalid, because we move tuples around.  Promote them to relation
     * locks.  Predicate locks on indexes will be promoted when they are
     * reindexed.
     */
    TransferPredicateLocksToHeapRelation(OldHeap);

    /* rebuild_relation does all the dirty work */
    rebuild_relation(OldHeap, indexOid, verbose);

    /* NB: rebuild_relation does table_close() on OldHeap */

    pgstat_progress_end_command();
}

/*
 * Verify that the specified heap and index are valid to cluster on
 *
 * Side effect: obtains lock on the index.  The caller may
 * in some cases already have AccessExclusiveLock on the table, but
 * not in all cases so we can't rely on the table-level lock for
 * protection here.
 */
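/*
 * Illustrative only: cluster_rel above calls this as
 *
 *     check_index_is_clusterable(OldHeap, indexOid, recheck,
 *                                AccessExclusiveLock);
 *
 * relying on the lock acquired here being held until end of transaction,
 * even though the index relcache entry is closed again before returning.
 */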
void
check_index_is_clusterable(Relation OldHeap, Oid indexOid, bool recheck,
                           LOCKMODE lockmode)
{
    Relation    OldIndex;

    OldIndex = index_open(indexOid, lockmode);

    /*
     * Check that index is in fact an index on the given relation
     */
    if (OldIndex->rd_index == NULL ||
        OldIndex->rd_index->indrelid != RelationGetRelid(OldHeap))
        ereport(ERROR,
                (errcode(ERRCODE_WRONG_OBJECT_TYPE),
                 errmsg("\"%s\" is not an index for table \"%s\"",
                        RelationGetRelationName(OldIndex),
                        RelationGetRelationName(OldHeap))));

    /* Index AM must allow clustering */
    if (!OldIndex->rd_indam->amclusterable)
        ereport(ERROR,
                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                 errmsg("cannot cluster on index \"%s\" because access method does not support clustering",
                        RelationGetRelationName(OldIndex))));

    /*
     * Disallow clustering on incomplete indexes (those that might not index
     * every row of the relation).  We could relax this by making a separate
     * seqscan pass over the table to copy the missing rows, but that seems
     * expensive and tedious.
     */
    if (!heap_attisnull(OldIndex->rd_indextuple, Anum_pg_index_indpred, NULL))
        ereport(ERROR,
                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                 errmsg("cannot cluster on partial index \"%s\"",
                        RelationGetRelationName(OldIndex))));

    /*
     * Disallow if index is left over from a failed CREATE INDEX
     * CONCURRENTLY; it might well not contain entries for every heap row,
     * or might not even be internally consistent.  (But note that we don't
     * check indcheckxmin; the worst consequence of following broken HOT
     * chains would be that we might put recently-dead tuples out-of-order
     * in the new table, and there is little harm in that.)
     */
    if (!OldIndex->rd_index->indisvalid)
        ereport(ERROR,
                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                 errmsg("cannot cluster on invalid index \"%s\"",
                        RelationGetRelationName(OldIndex))));

    /* Drop relcache refcnt on OldIndex, but keep lock */
    index_close(OldIndex, NoLock);
}

/*
 * mark_index_clustered: mark the specified index as the one clustered on
 *
 * With indexOid == InvalidOid, will mark all indexes of rel not-clustered.
 */
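/*
 * Hedged context note: apart from CLUSTER itself, ALTER TABLE reaches this
 * function, e.g. (placeholder names)
 *
 *     ALTER TABLE mytab CLUSTER ON myidx;      -- sets the bit on myidx
 *     ALTER TABLE mytab SET WITHOUT CLUSTER;   -- passes InvalidOid
 */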
void
mark_index_clustered(Relation rel, Oid indexOid, bool is_internal)
{
    HeapTuple   indexTuple;
    Form_pg_index indexForm;
    Relation    pg_index;
    ListCell   *index;

    /* Disallow applying to a partitioned table */
    if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
        ereport(ERROR,
                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                 errmsg("cannot mark index clustered in partitioned table")));

    /*
     * If the index is already marked clustered, no need to do anything.
     */
    if (OidIsValid(indexOid))
    {
        indexTuple = SearchSysCache1(INDEXRELID, ObjectIdGetDatum(indexOid));
        if (!HeapTupleIsValid(indexTuple))
            elog(ERROR, "cache lookup failed for index %u", indexOid);
        indexForm = (Form_pg_index) GETSTRUCT(indexTuple);

        if (indexForm->indisclustered)
        {
            ReleaseSysCache(indexTuple);
            return;
        }

        ReleaseSysCache(indexTuple);
    }

    /*
     * Check each index of the relation and set/clear the bit as needed.
     */
    pg_index = table_open(IndexRelationId, RowExclusiveLock);

    foreach(index, RelationGetIndexList(rel))
    {
        Oid         thisIndexOid = lfirst_oid(index);

        indexTuple = SearchSysCacheCopy1(INDEXRELID,
                                         ObjectIdGetDatum(thisIndexOid));
        if (!HeapTupleIsValid(indexTuple))
            elog(ERROR, "cache lookup failed for index %u", thisIndexOid);
        indexForm = (Form_pg_index) GETSTRUCT(indexTuple);

        /*
         * Unset the bit if set.  We know it's wrong because we checked this
         * earlier.
         */
        if (indexForm->indisclustered)
        {
            indexForm->indisclustered = false;
            CatalogTupleUpdate(pg_index, &indexTuple->t_self, indexTuple);
        }
        else if (thisIndexOid == indexOid)
        {
            /* this was checked earlier, but let's be real sure */
            if (!indexForm->indisvalid)
                elog(ERROR, "cannot cluster on invalid index %u", indexOid);
            indexForm->indisclustered = true;
            CatalogTupleUpdate(pg_index, &indexTuple->t_self, indexTuple);
        }

        InvokeObjectPostAlterHookArg(IndexRelationId, thisIndexOid, 0,
                                     InvalidOid, is_internal);

        heap_freetuple(indexTuple);
    }

    table_close(pg_index, RowExclusiveLock);
}

/*
 * rebuild_relation: rebuild an existing relation in index or physical order
 *
 * OldHeap: table to rebuild --- must be opened and exclusive-locked!
 * indexOid: index to cluster by, or InvalidOid to rewrite in physical order.
 *
 * NB: this routine closes OldHeap at the right time; caller should not.
 */
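/*
 * Condensed sketch of the pipeline implemented below (argument lists
 * abbreviated):
 *
 *     OIDNewHeap = make_new_heap(tableOid, tableSpace, ...);
 *     copy_table_data(OIDNewHeap, tableOid, indexOid, ...);
 *     finish_heap_swap(tableOid, OIDNewHeap, ...);
 *
 * i.e. create an empty transient heap, fill it in the desired order, then
 * swap the physical files and drop the transient heap.
 */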
static void
rebuild_relation(Relation OldHeap, Oid indexOid, bool verbose)
{
    Oid         tableOid = RelationGetRelid(OldHeap);
    Oid         tableSpace = OldHeap->rd_rel->reltablespace;
    Oid         OIDNewHeap;
    char        relpersistence;
    bool        is_system_catalog;
    bool        swap_toast_by_content;
    TransactionId frozenXid;
    MultiXactId cutoffMulti;

    /* Mark the correct index as clustered */
    if (OidIsValid(indexOid))
        mark_index_clustered(OldHeap, indexOid, true);

    /* Remember info about rel before closing OldHeap */
    relpersistence = OldHeap->rd_rel->relpersistence;
    is_system_catalog = IsSystemRelation(OldHeap);

    /* Close relcache entry, but keep lock until transaction commit */
    table_close(OldHeap, NoLock);

    /* Create the transient table that will receive the re-ordered data */
    OIDNewHeap = make_new_heap(tableOid, tableSpace,
                               relpersistence,
                               AccessExclusiveLock);

    /* Copy the heap data into the new table in the desired order */
    copy_table_data(OIDNewHeap, tableOid, indexOid, verbose,
                    &swap_toast_by_content, &frozenXid, &cutoffMulti);

    /*
     * Swap the physical files of the target and transient tables, then
     * rebuild the target's indexes and throw away the transient table.
     */
    finish_heap_swap(tableOid, OIDNewHeap, is_system_catalog,
                     swap_toast_by_content, false, true,
                     frozenXid, cutoffMulti,
                     relpersistence);
}


/*
 * Create the transient table that will be filled with new data during
 * CLUSTER, ALTER TABLE, and similar operations.  The transient table
 * duplicates the logical structure of the OldHeap, but is placed in
 * NewTableSpace which might be different from OldHeap's.  Also, it's built
 * with the specified persistence, which might differ from the original's.
 *
 * After this, the caller should load the new heap with transferred/modified
 * data, then call finish_heap_swap to complete the operation.
 */
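/*
 * A hedged usage sketch, mirroring rebuild_relation above; other callers
 * (e.g. ALTER TABLE rewrites) follow the same shape with their own
 * arguments:
 *
 *     OIDNewHeap = make_new_heap(tableOid, tableSpace, relpersistence,
 *                                AccessExclusiveLock);
 *     ... load rows into the new heap ...
 *     finish_heap_swap(tableOid, OIDNewHeap, ...);
 */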
Oid
make_new_heap(Oid OIDOldHeap, Oid NewTableSpace, char relpersistence,
              LOCKMODE lockmode)
{
    TupleDesc   OldHeapDesc;
    char        NewHeapName[NAMEDATALEN];
    Oid         OIDNewHeap;
    Oid         toastid;
    Relation    OldHeap;
    HeapTuple   tuple;
    Datum       reloptions;
    bool        isNull;
    Oid         namespaceid;

    OldHeap = table_open(OIDOldHeap, lockmode);
    OldHeapDesc = RelationGetDescr(OldHeap);

    /*
     * Note that the NewHeap will not receive any of the defaults or
     * constraints associated with the OldHeap; we don't need 'em, and there's
     * no reason to spend cycles inserting them into the catalogs only to
     * delete them.
     */

    /*
     * But we do want to use reloptions of the old heap for new heap.
     */
    tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(OIDOldHeap));
    if (!HeapTupleIsValid(tuple))
        elog(ERROR, "cache lookup failed for relation %u", OIDOldHeap);
    reloptions = SysCacheGetAttr(RELOID, tuple, Anum_pg_class_reloptions,
                                 &isNull);
    if (isNull)
        reloptions = (Datum) 0;

    if (relpersistence == RELPERSISTENCE_TEMP)
        namespaceid = LookupCreationNamespace("pg_temp");
    else
        namespaceid = RelationGetNamespace(OldHeap);

    /*
     * Create the new heap, using a temporary name in the same namespace as
     * the existing table.  NOTE: there is some risk of collision with user
     * relnames.  Working around this seems more trouble than it's worth; in
     * particular, we can't create the new heap in a different namespace from
     * the old, or we will have problems with the TEMP status of temp tables.
     *
     * Note: the new heap is not a shared relation, even if we are rebuilding
     * a shared rel.  However, we do make the new heap mapped if the source is
     * mapped.  This simplifies swap_relation_files, and is absolutely
     * necessary for rebuilding pg_class, for reasons explained there.
     */
    snprintf(NewHeapName, sizeof(NewHeapName), "pg_temp_%u", OIDOldHeap);

    OIDNewHeap = heap_create_with_catalog(NewHeapName,
                                          namespaceid,
                                          NewTableSpace,
                                          InvalidOid,
                                          InvalidOid,
                                          InvalidOid,
                                          OldHeap->rd_rel->relowner,
                                          OldHeap->rd_rel->relam,
                                          OldHeapDesc,
                                          NIL,
                                          RELKIND_RELATION,
                                          relpersistence,
                                          false,
                                          RelationIsMapped(OldHeap),
                                          ONCOMMIT_NOOP,
                                          reloptions,
                                          false,
                                          true,
                                          true,
                                          OIDOldHeap,
                                          NULL);
    Assert(OIDNewHeap != InvalidOid);

    ReleaseSysCache(tuple);

    /*
     * Advance command counter so that the newly-created relation's catalog
     * tuples will be visible to table_open.
     */
    CommandCounterIncrement();

    /*
     * If necessary, create a TOAST table for the new relation.
     *
     * If the relation doesn't have a TOAST table already, we can't need one
     * for the new relation.  The other way around is possible though: if some
     * wide columns have been dropped, NewHeapCreateToastTable can decide that
     * no TOAST table is needed for the new table.
     *
     * Note that NewHeapCreateToastTable ends with CommandCounterIncrement, so
     * that the TOAST table will be visible for insertion.
     */
    toastid = OldHeap->rd_rel->reltoastrelid;
    if (OidIsValid(toastid))
    {
        /* keep the existing toast table's reloptions, if any */
        tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(toastid));
        if (!HeapTupleIsValid(tuple))
            elog(ERROR, "cache lookup failed for relation %u", toastid);
        reloptions = SysCacheGetAttr(RELOID, tuple, Anum_pg_class_reloptions,
                                     &isNull);
        if (isNull)
            reloptions = (Datum) 0;

        NewHeapCreateToastTable(OIDNewHeap, reloptions, lockmode);

        ReleaseSysCache(tuple);
    }

    table_close(OldHeap, NoLock);

    return OIDNewHeap;
}

/*
 * Do the physical copying of table data.
 *
 * There are three output parameters:
 * *pSwapToastByContent is set true if toast tables must be swapped by content.
 * *pFreezeXid receives the TransactionId used as freeze cutoff point.
 * *pCutoffMulti receives the MultiXactId used as a cutoff point.
 */
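/*
 * Illustrative only: rebuild_relation consumes the outputs as
 *
 *     copy_table_data(OIDNewHeap, tableOid, indexOid, verbose,
 *                     &swap_toast_by_content, &frozenXid, &cutoffMulti);
 *
 * and then feeds the three values straight into finish_heap_swap, where
 * frozenXid/cutoffMulti become the rebuilt table's relfrozenxid/relminmxid.
 */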
static void
copy_table_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose,
                bool *pSwapToastByContent, TransactionId *pFreezeXid,
                MultiXactId *pCutoffMulti)
{
    Relation    NewHeap,
                OldHeap,
                OldIndex;
    Relation    relRelation;
    HeapTuple   reltup;
    Form_pg_class relform;
    TupleDesc   oldTupDesc PG_USED_FOR_ASSERTS_ONLY;
    TupleDesc   newTupDesc PG_USED_FOR_ASSERTS_ONLY;
    TransactionId OldestXmin;
    TransactionId FreezeXid;
    MultiXactId MultiXactCutoff;
    bool        use_sort;
    double      num_tuples = 0,
                tups_vacuumed = 0,
                tups_recently_dead = 0;
    BlockNumber num_pages;
    int         elevel = verbose ? INFO : DEBUG2;
    PGRUsage    ru0;

    pg_rusage_init(&ru0);

    /*
     * Open the relations we need.
     */
    NewHeap = table_open(OIDNewHeap, AccessExclusiveLock);
    OldHeap = table_open(OIDOldHeap, AccessExclusiveLock);
    if (OidIsValid(OIDOldIndex))
        OldIndex = index_open(OIDOldIndex, AccessExclusiveLock);
    else
        OldIndex = NULL;

    /*
     * Their tuple descriptors should be exactly alike, but here we only need
     * assume that they have the same number of columns.
     */
    oldTupDesc = RelationGetDescr(OldHeap);
    newTupDesc = RelationGetDescr(NewHeap);
    Assert(newTupDesc->natts == oldTupDesc->natts);

    /*
     * If the OldHeap has a toast table, get lock on the toast table to keep
     * it from being vacuumed.  This is needed because autovacuum processes
     * toast tables independently of their main tables, with no lock on the
     * latter.  If an autovacuum were to start on the toast table after we
     * compute our OldestXmin below, it would use a later OldestXmin, and then
     * possibly remove as DEAD toast tuples belonging to main tuples we think
     * are only RECENTLY_DEAD.  Then we'd fail while trying to copy those
     * tuples.
     *
     * We don't need to open the toast relation here, just lock it.  The lock
     * will be held till end of transaction.
     */
    if (OldHeap->rd_rel->reltoastrelid)
        LockRelationOid(OldHeap->rd_rel->reltoastrelid, AccessExclusiveLock);

    /*
     * If both tables have TOAST tables, perform toast swap by content.  It is
     * possible that the old table has a toast table but the new one doesn't,
     * if toastable columns have been dropped.  In that case we have to do
     * swap by links.  This is okay because swap by content is only essential
     * for system catalogs, and we don't support schema changes for them.
     */
    if (OldHeap->rd_rel->reltoastrelid && NewHeap->rd_rel->reltoastrelid)
    {
        *pSwapToastByContent = true;

        /*
         * When doing swap by content, any toast pointers written into NewHeap
         * must use the old toast table's OID, because that's where the toast
         * data will eventually be found.  Set this up by setting rd_toastoid.
         * This also tells toast_save_datum() to preserve the toast value
         * OIDs, which we want so as not to invalidate toast pointers in
         * system catalog caches, and to avoid making multiple copies of a
         * single toast value.
         *
         * Note that we must hold NewHeap open until we are done writing data,
         * since the relcache will not guarantee to remember this setting once
         * the relation is closed.  Also, this technique depends on the fact
         * that no one will try to read from the NewHeap until after we've
         * finished writing it and swapping the rels --- otherwise they could
         * follow the toast pointers to the wrong place.  (It would actually
         * work for values copied over from the old toast table, but not for
         * any values that we toast which were previously not toasted.)
         */
        NewHeap->rd_toastoid = OldHeap->rd_rel->reltoastrelid;
    }
    else
        *pSwapToastByContent = false;

    /*
     * Compute xids used to freeze and weed out dead tuples and multixacts.
     * Since we're going to rewrite the whole table anyway, there's no reason
     * not to be aggressive about this.
     */
    vacuum_set_xid_limits(OldHeap, 0, 0, 0, 0,
                          &OldestXmin, &FreezeXid, NULL, &MultiXactCutoff,
                          NULL);

    /*
     * FreezeXid will become the table's new relfrozenxid, and that mustn't go
     * backwards, so take the max.
     */
    if (TransactionIdIsValid(OldHeap->rd_rel->relfrozenxid) &&
        TransactionIdPrecedes(FreezeXid, OldHeap->rd_rel->relfrozenxid))
        FreezeXid = OldHeap->rd_rel->relfrozenxid;

    /*
     * MultiXactCutoff, similarly, shouldn't go backwards either.
     */
    if (MultiXactIdIsValid(OldHeap->rd_rel->relminmxid) &&
        MultiXactIdPrecedes(MultiXactCutoff, OldHeap->rd_rel->relminmxid))
        MultiXactCutoff = OldHeap->rd_rel->relminmxid;

    /*
     * Decide whether to use an indexscan or seqscan-and-optional-sort to scan
     * the OldHeap.  We know how to use a sort to duplicate the ordering of a
     * btree index, and will use seqscan-and-sort for that case if the planner
     * tells us it's cheaper.  Otherwise, always indexscan if an index is
     * provided, else plain seqscan.
     */
    if (OldIndex != NULL && OldIndex->rd_rel->relam == BTREE_AM_OID)
        use_sort = plan_cluster_use_sort(OIDOldHeap, OIDOldIndex);
    else
        use_sort = false;

    /* Log what we're doing */
    if (OldIndex != NULL && !use_sort)
        ereport(elevel,
                (errmsg("clustering \"%s.%s\" using index scan on \"%s\"",
                        get_namespace_name(RelationGetNamespace(OldHeap)),
                        RelationGetRelationName(OldHeap),
                        RelationGetRelationName(OldIndex))));
    else if (use_sort)
        ereport(elevel,
                (errmsg("clustering \"%s.%s\" using sequential scan and sort",
                        get_namespace_name(RelationGetNamespace(OldHeap)),
                        RelationGetRelationName(OldHeap))));
    else
        ereport(elevel,
                (errmsg("vacuuming \"%s.%s\"",
                        get_namespace_name(RelationGetNamespace(OldHeap)),
                        RelationGetRelationName(OldHeap))));
    /*
     * Hand off the actual copying to the AM-specific function; the generic
     * code cannot know how to deal with visibility across AMs.  Note that
     * this routine is allowed to set FreezeXid / MultiXactCutoff to
     * different values (e.g. because the AM doesn't use freezing).
     */
    table_relation_copy_for_cluster(OldHeap, NewHeap, OldIndex, use_sort,
                                    OldestXmin, &FreezeXid, &MultiXactCutoff,
                                    &num_tuples, &tups_vacuumed,
                                    &tups_recently_dead);

    /* return selected values to caller, get set as relfrozenxid/minmxid */
    *pFreezeXid = FreezeXid;
    *pCutoffMulti = MultiXactCutoff;

    /* Reset rd_toastoid just to be tidy --- it shouldn't be looked at again */
    NewHeap->rd_toastoid = InvalidOid;

    num_pages = RelationGetNumberOfBlocks(NewHeap);

    /* Log what we did */
    ereport(elevel,
            (errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u pages",
                    RelationGetRelationName(OldHeap),
                    tups_vacuumed, num_tuples,
                    RelationGetNumberOfBlocks(OldHeap)),
             errdetail("%.0f dead row versions cannot be removed yet.\n"
                       "%s.",
                       tups_recently_dead,
                       pg_rusage_show(&ru0))));

    if (OldIndex != NULL)
        index_close(OldIndex, NoLock);
    table_close(OldHeap, NoLock);
    table_close(NewHeap, NoLock);

    /* Update pg_class to reflect the correct values of pages and tuples. */
    relRelation = table_open(RelationRelationId, RowExclusiveLock);

    reltup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(OIDNewHeap));
    if (!HeapTupleIsValid(reltup))
        elog(ERROR, "cache lookup failed for relation %u", OIDNewHeap);
    relform = (Form_pg_class) GETSTRUCT(reltup);

    relform->relpages = num_pages;
    relform->reltuples = num_tuples;

    /* Don't update the stats for pg_class.  See swap_relation_files. */
    if (OIDOldHeap != RelationRelationId)
        CatalogTupleUpdate(relRelation, &reltup->t_self, reltup);
    else
        CacheInvalidateRelcacheByTuple(reltup);

    /* Clean up. */
    heap_freetuple(reltup);
    table_close(relRelation, RowExclusiveLock);

    /* Make the update visible */
    CommandCounterIncrement();
}

/*
 * Swap the physical files of two given relations.
 *
 * We swap the physical identity (reltablespace, relfilenode) while keeping
 * the same logical identities of the two relations.  relpersistence is also
 * swapped, which is critical since it determines where buffers live for each
 * relation.
 *
 * We can swap associated TOAST data in either of two ways: recursively swap
 * the physical content of the toast tables (and their indexes), or swap the
 * TOAST links in the given relations' pg_class entries.  The former is needed
 * to manage rewrites of shared catalogs (where we cannot change the pg_class
 * links) while the latter is the only way to handle cases in which a toast
 * table is added or removed altogether.
 *
 * Additionally, the first relation is marked with relfrozenxid set to
 * frozenXid.  It seems a bit ugly to have this here, but the caller would
 * have to do it anyway, so having it here saves a heap_update.  Note: in
 * the swap-toast-links case, we assume we don't need to change the toast
 * table's relfrozenxid: the new version of the toast table should already
 * have relfrozenxid set to RecentXmin, which is good enough.
 *
 * Lastly, if r2 and its toast table and toast index (if any) are mapped,
 * their OIDs are emitted into mapped_tables[].  This is hacky but beats
 * having to look the information up again later in finish_heap_swap.
 */
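/*
 * A hedged call sketch: finish_heap_swap below invokes this as
 *
 *     swap_relation_files(OIDOldHeap, OIDNewHeap,
 *                         (OIDOldHeap == RelationRelationId),
 *                         swap_toast_by_content, is_internal,
 *                         frozenXid, cutoffMulti, mapped_tables);
 *
 * and the function also recurses on itself for toast tables and their
 * indexes when swapping by content.
 */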
static void
swap_relation_files(Oid r1, Oid r2, bool target_is_pg_class,
                    bool swap_toast_by_content,
                    bool is_internal,
                    TransactionId frozenXid,
                    MultiXactId cutoffMulti,
                    Oid *mapped_tables)
{
    Relation    relRelation;
    HeapTuple   reltup1,
                reltup2;
    Form_pg_class relform1,
                relform2;
    Oid         relfilenode1,
                relfilenode2;
    Oid         swaptemp;
    char        swptmpchr;

    /* We need writable copies of both pg_class tuples. */
    relRelation = table_open(RelationRelationId, RowExclusiveLock);

    reltup1 = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(r1));
    if (!HeapTupleIsValid(reltup1))
        elog(ERROR, "cache lookup failed for relation %u", r1);
    relform1 = (Form_pg_class) GETSTRUCT(reltup1);

    reltup2 = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(r2));
    if (!HeapTupleIsValid(reltup2))
        elog(ERROR, "cache lookup failed for relation %u", r2);
    relform2 = (Form_pg_class) GETSTRUCT(reltup2);

    relfilenode1 = relform1->relfilenode;
    relfilenode2 = relform2->relfilenode;

    if (OidIsValid(relfilenode1) && OidIsValid(relfilenode2))
    {
        /*
         * Normal non-mapped relations: swap relfilenodes, reltablespaces,
         * relpersistence
         */
        Assert(!target_is_pg_class);

        swaptemp = relform1->relfilenode;
        relform1->relfilenode = relform2->relfilenode;
        relform2->relfilenode = swaptemp;

        swaptemp = relform1->reltablespace;
        relform1->reltablespace = relform2->reltablespace;
        relform2->reltablespace = swaptemp;

        swptmpchr = relform1->relpersistence;
        relform1->relpersistence = relform2->relpersistence;
        relform2->relpersistence = swptmpchr;

        /* Also swap toast links, if we're swapping by links */
        if (!swap_toast_by_content)
        {
            swaptemp = relform1->reltoastrelid;
            relform1->reltoastrelid = relform2->reltoastrelid;
            relform2->reltoastrelid = swaptemp;
        }
    }
    else
    {
        /*
         * Mapped-relation case.  Here we have to swap the relation mappings
         * instead of modifying the pg_class columns.  Both must be mapped.
         */
        if (OidIsValid(relfilenode1) || OidIsValid(relfilenode2))
            elog(ERROR, "cannot swap mapped relation \"%s\" with non-mapped relation",
                 NameStr(relform1->relname));
        /*
         * We can't change the tablespace nor persistence of a mapped rel,
         * and we can't handle toast link swapping for one either, because we
         * must not apply any critical changes to its pg_class row.  These
         * cases should be prevented by upstream permissions tests, so these
         * checks are just a non-user-facing emergency backstop.
         */
        if (relform1->reltablespace != relform2->reltablespace)
            elog(ERROR, "cannot change tablespace of mapped relation \"%s\"",
                 NameStr(relform1->relname));
        if (relform1->relpersistence != relform2->relpersistence)
            elog(ERROR, "cannot change persistence of mapped relation \"%s\"",
                 NameStr(relform1->relname));
        if (!swap_toast_by_content &&
            (relform1->reltoastrelid || relform2->reltoastrelid))
            elog(ERROR, "cannot swap toast by links for mapped relation \"%s\"",
                 NameStr(relform1->relname));

        /*
         * Fetch the mappings --- shouldn't fail, but be paranoid
         */
        relfilenode1 = RelationMapOidToFilenode(r1, relform1->relisshared);
        if (!OidIsValid(relfilenode1))
            elog(ERROR, "could not find relation mapping for relation \"%s\", OID %u",
                 NameStr(relform1->relname), r1);
        relfilenode2 = RelationMapOidToFilenode(r2, relform2->relisshared);
        if (!OidIsValid(relfilenode2))
            elog(ERROR, "could not find relation mapping for relation \"%s\", OID %u",
                 NameStr(relform2->relname), r2);

        /*
         * Send replacement mappings to relmapper.  Note these won't actually
         * take effect until CommandCounterIncrement.
         */
        RelationMapUpdateMap(r1, relfilenode2, relform1->relisshared, false);
        RelationMapUpdateMap(r2, relfilenode1, relform2->relisshared, false);

        /* Pass OIDs of mapped r2 tables back to caller */
        *mapped_tables++ = r2;
    }

    /*
     * In the case of a shared catalog, these next few steps will only affect
     * our own database's pg_class row; but that's okay, because they are all
     * noncritical updates.  That's also an important fact for the case of a
     * mapped catalog, because it's possible that we'll commit the map change
     * and then fail to commit the pg_class update.
     */

    /* set rel1's frozen Xid and minimum MultiXid */
    if (relform1->relkind != RELKIND_INDEX)
    {
        Assert(!TransactionIdIsValid(frozenXid) ||
               TransactionIdIsNormal(frozenXid));
        relform1->relfrozenxid = frozenXid;
        relform1->relminmxid = cutoffMulti;
    }

    /* swap size statistics too, since new rel has freshly-updated stats */
    {
        int32       swap_pages;
        float4      swap_tuples;
        int32       swap_allvisible;

        swap_pages = relform1->relpages;
        relform1->relpages = relform2->relpages;
        relform2->relpages = swap_pages;

        swap_tuples = relform1->reltuples;
        relform1->reltuples = relform2->reltuples;
        relform2->reltuples = swap_tuples;

        swap_allvisible = relform1->relallvisible;
        relform1->relallvisible = relform2->relallvisible;
        relform2->relallvisible = swap_allvisible;
    }
    /*
     * Update the tuples in pg_class --- unless the target relation of the
     * swap is pg_class itself.  In that case, there is zero point in making
     * changes because we'd be updating the old data that we're about to
     * throw away.  Because the real work being done here for a mapped
     * relation is just to change the relation map settings, it's all right
     * to not update the pg_class rows in this case.  The most important
     * changes will instead be performed later, in finish_heap_swap() itself.
     */
    if (!target_is_pg_class)
    {
        CatalogIndexState indstate;

        indstate = CatalogOpenIndexes(relRelation);
        CatalogTupleUpdateWithInfo(relRelation, &reltup1->t_self, reltup1,
                                   indstate);
        CatalogTupleUpdateWithInfo(relRelation, &reltup2->t_self, reltup2,
                                   indstate);
        CatalogCloseIndexes(indstate);
    }
    else
    {
        /* no update ... but we do still need relcache inval */
        CacheInvalidateRelcacheByTuple(reltup1);
        CacheInvalidateRelcacheByTuple(reltup2);
    }

    /*
     * Post alter hook for modified relations.  The change to r2 is always
     * internal, but r1 depends on the invocation context.
     */
    InvokeObjectPostAlterHookArg(RelationRelationId, r1, 0,
                                 InvalidOid, is_internal);
    InvokeObjectPostAlterHookArg(RelationRelationId, r2, 0,
                                 InvalidOid, true);

    /*
     * If we have toast tables associated with the relations being swapped,
     * deal with them too.
     */
    if (relform1->reltoastrelid || relform2->reltoastrelid)
    {
        if (swap_toast_by_content)
        {
            if (relform1->reltoastrelid && relform2->reltoastrelid)
            {
                /* Recursively swap the contents of the toast tables */
                swap_relation_files(relform1->reltoastrelid,
                                    relform2->reltoastrelid,
                                    target_is_pg_class,
                                    swap_toast_by_content,
                                    is_internal,
                                    frozenXid,
                                    cutoffMulti,
                                    mapped_tables);
            }
            else
            {
                /* caller messed up */
                elog(ERROR, "cannot swap toast files by content when there's only one");
            }
        }
        else
        {
            /*
             * We swapped the ownership links, so we need to change dependency
             * data to match.
             *
             * NOTE: it is possible that only one table has a toast table.
             *
             * NOTE: at present, a TOAST table's only dependency is the one on
             * its owning table.  If more are ever created, we'd need to use
             * something more selective than deleteDependencyRecordsFor() to
             * get rid of just the link we want.
             */
            ObjectAddress baseobject,
                        toastobject;
            long        count;

            /*
             * We disallow this case for system catalogs, to avoid the
             * possibility that the catalog we're rebuilding is one of the
             * ones the dependency changes would change.  It's too late to be
             * making any data changes to the target catalog.
             */
            if (IsSystemClass(r1, relform1))
                elog(ERROR, "cannot swap toast files by links for system catalogs");

            /* Delete old dependencies */
            if (relform1->reltoastrelid)
            {
                count = deleteDependencyRecordsFor(RelationRelationId,
                                                   relform1->reltoastrelid,
                                                   false);
                if (count != 1)
                    elog(ERROR, "expected one dependency record for TOAST table, found %ld",
                         count);
            }
            if (relform2->reltoastrelid)
            {
                count = deleteDependencyRecordsFor(RelationRelationId,
                                                   relform2->reltoastrelid,
                                                   false);
                if (count != 1)
                    elog(ERROR, "expected one dependency record for TOAST table, found %ld",
                         count);
            }

            /* Register new dependencies */
            baseobject.classId = RelationRelationId;
            baseobject.objectSubId = 0;
            toastobject.classId = RelationRelationId;
            toastobject.objectSubId = 0;

            if (relform1->reltoastrelid)
            {
                baseobject.objectId = r1;
                toastobject.objectId = relform1->reltoastrelid;
                recordDependencyOn(&toastobject, &baseobject,
                                   DEPENDENCY_INTERNAL);
            }

            if (relform2->reltoastrelid)
            {
                baseobject.objectId = r2;
                toastobject.objectId = relform2->reltoastrelid;
                recordDependencyOn(&toastobject, &baseobject,
                                   DEPENDENCY_INTERNAL);
            }
        }
    }

    /*
     * If we're swapping two toast tables by content, do the same for their
     * valid index.  The swap can actually be safely done only if the
     * relations have indexes.
     */
    if (swap_toast_by_content &&
        relform1->relkind == RELKIND_TOASTVALUE &&
        relform2->relkind == RELKIND_TOASTVALUE)
    {
        Oid         toastIndex1,
                    toastIndex2;

        /* Get valid index for each relation */
        toastIndex1 = toast_get_valid_index(r1,
                                            AccessExclusiveLock);
        toastIndex2 = toast_get_valid_index(r2,
                                            AccessExclusiveLock);

        swap_relation_files(toastIndex1,
                            toastIndex2,
                            target_is_pg_class,
                            swap_toast_by_content,
                            is_internal,
                            InvalidTransactionId,
                            InvalidMultiXactId,
                            mapped_tables);
    }

    /* Clean up. */
    heap_freetuple(reltup1);
    heap_freetuple(reltup2);

    table_close(relRelation, RowExclusiveLock);

    /*
     * Close both relcache entries' smgr links.  We need this kluge because
     * both links will be invalidated during upcoming CommandCounterIncrement.
     * Whichever of the rels is the second to be cleared will have a dangling
     * reference to the other's smgr entry.  Rather than trying to avoid this
     * by ordering operations just so, it's easiest to close the links first.
     * (Fortunately, since one of the entries is local in our transaction,
     * it's sufficient to clear out our own relcache this way; the problem
     * cannot arise for other backends when they see our update on the
     * non-transient relation.)
     *
     * Caution: the placement of this step interacts with the decision to
     * handle toast rels by recursion.  When we are trying to rebuild pg_class
     * itself, the smgr close on pg_class must happen after all accesses in
     * this function.
     */
    RelationCloseSmgrByOid(r1);
    RelationCloseSmgrByOid(r2);
}

/*
 * Remove the transient table that was built by make_new_heap, and finish
 * cleaning up (including rebuilding all indexes on the old heap).
 */
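/*
 * A hedged sketch of the call made from rebuild_relation above:
 *
 *     finish_heap_swap(tableOid, OIDNewHeap, is_system_catalog,
 *                      swap_toast_by_content, false, true,
 *                      frozenXid, cutoffMulti, relpersistence);
 *
 * i.e. check_constraints = false and is_internal = true for the CLUSTER /
 * VACUUM FULL path; other callers (e.g. ALTER TABLE rewrites) pass their
 * own flag combinations.
 */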
void
finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap,
                 bool is_system_catalog,
                 bool swap_toast_by_content,
                 bool check_constraints,
                 bool is_internal,
                 TransactionId frozenXid,
                 MultiXactId cutoffMulti,
                 char newrelpersistence)
{
    ObjectAddress object;
    Oid         mapped_tables[4];
    int         reindex_flags;
    int         i;

    /* Report that we are now swapping relation files */
    pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
                                 PROGRESS_CLUSTER_PHASE_SWAP_REL_FILES);

    /* Zero out possible results from swap_relation_files */
    memset(mapped_tables, 0, sizeof(mapped_tables));

    /*
     * Swap the contents of the heap relations (including any toast tables).
     * Also set old heap's relfrozenxid to frozenXid.
     */
    swap_relation_files(OIDOldHeap, OIDNewHeap,
                        (OIDOldHeap == RelationRelationId),
                        swap_toast_by_content, is_internal,
                        frozenXid, cutoffMulti, mapped_tables);

    /*
     * If it's a system catalog, queue a sinval message to flush all catcaches
     * on the catalog when we reach CommandCounterIncrement.
     */
    if (is_system_catalog)
        CacheInvalidateCatalog(OIDOldHeap);

    /*
     * Rebuild each index on the relation (but not the toast table, which is
     * all-new at this point).  It is important to do this before the DROP
     * step because if we are processing a system catalog that will be used
     * during DROP, we want to have its indexes available.  There is no
     * advantage to the other order anyway because this is all transactional,
     * so no chance to reclaim disk space before commit.  We do not need a
     * final CommandCounterIncrement() because reindex_relation does it.
     *
     * Note: because index_build is called via reindex_relation, it will never
     * set indcheckxmin true for the indexes.  This is OK even though in some
     * sense we are building new indexes rather than rebuilding existing ones,
     * because the new heap won't contain any HOT chains at all, let alone
     * broken ones, so it can't be necessary to set indcheckxmin.
     */
    reindex_flags = REINDEX_REL_SUPPRESS_INDEX_USE;
    if (check_constraints)
        reindex_flags |= REINDEX_REL_CHECK_CONSTRAINTS;

    /*
     * Ensure that the indexes have the same persistence as the parent
     * relation.
     */
    if (newrelpersistence == RELPERSISTENCE_UNLOGGED)
        reindex_flags |= REINDEX_REL_FORCE_INDEXES_UNLOGGED;
    else if (newrelpersistence == RELPERSISTENCE_PERMANENT)
        reindex_flags |= REINDEX_REL_FORCE_INDEXES_PERMANENT;

    /* Report that we are now reindexing relations */
    pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
                                 PROGRESS_CLUSTER_PHASE_REBUILD_INDEX);

    reindex_relation(OIDOldHeap, reindex_flags, 0);

    /* Report that we are now doing clean up */
    pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
                                 PROGRESS_CLUSTER_PHASE_FINAL_CLEANUP);

    /*
     * If the relation being rebuilt is pg_class, swap_relation_files()
     * couldn't update pg_class's own pg_class entry (check comments in
     * swap_relation_files()), thus relfrozenxid was not updated.  That's
     * annoying because a potential reason for doing a VACUUM FULL is an
     * imminent or actual anti-wraparound shutdown.  So, now that we can
     * access the new relation using its indices, update relfrozenxid.
     * pg_class doesn't have a toast relation, so we don't need to update the
     * corresponding toast relation.  Note that there's little point moving
     * all relfrozenxid updates here since swap_relation_files() needs to
     * write to pg_class for non-mapped relations anyway.
     */
    if (OIDOldHeap == RelationRelationId)
    {
        Relation    relRelation;
        HeapTuple   reltup;
        Form_pg_class relform;

        relRelation = table_open(RelationRelationId, RowExclusiveLock);

        reltup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(OIDOldHeap));
        if (!HeapTupleIsValid(reltup))
            elog(ERROR, "cache lookup failed for relation %u", OIDOldHeap);
        relform = (Form_pg_class) GETSTRUCT(reltup);

        relform->relfrozenxid = frozenXid;
        relform->relminmxid = cutoffMulti;

        CatalogTupleUpdate(relRelation, &reltup->t_self, reltup);

        table_close(relRelation, RowExclusiveLock);
    }

    /* Destroy new heap with old filenode */
    object.classId = RelationRelationId;
    object.objectId = OIDNewHeap;
    object.objectSubId = 0;

    /*
     * The new relation is local to our transaction and we know nothing
     * depends on it, so DROP_RESTRICT should be OK.
     */
    performDeletion(&object, DROP_RESTRICT, PERFORM_DELETION_INTERNAL);

    /* performDeletion does CommandCounterIncrement at end */

    /*
     * Now we must remove any relation mapping entries that we set up for the
     * transient table, as well as its toast table and toast index if any.  If
     * we fail to do this before commit, the relmapper will complain about new
     * permanent map entries being added post-bootstrap.
     */
    for (i = 0; OidIsValid(mapped_tables[i]); i++)
        RelationMapRemoveMapping(mapped_tables[i]);

    /*
     * At this point, everything is kosher except that, if we did toast swap
     * by links, the toast table's name corresponds to the transient table.
     * The name is irrelevant to the backend because it's referenced by OID,
     * but users looking at the catalogs could be confused.  Rename it to
     * prevent this problem.
     *
     * Note no lock required on the relation, because we already hold an
     * exclusive lock on it.
     */
    if (!swap_toast_by_content)
    {
        Relation    newrel;

        newrel = table_open(OIDOldHeap, NoLock);
        if (OidIsValid(newrel->rd_rel->reltoastrelid))
        {
            Oid         toastidx;
            char        NewToastName[NAMEDATALEN];

            /* Get the associated valid index to be renamed */
            toastidx = toast_get_valid_index(newrel->rd_rel->reltoastrelid,
                                             AccessShareLock);

            /* rename the toast table ... */
            snprintf(NewToastName, NAMEDATALEN, "pg_toast_%u",
                     OIDOldHeap);
            RenameRelationInternal(newrel->rd_rel->reltoastrelid,
                                   NewToastName, true, false);

            /* ... and its valid index too. */
            snprintf(NewToastName, NAMEDATALEN, "pg_toast_%u_index",
                     OIDOldHeap);

            RenameRelationInternal(toastidx,
                                   NewToastName, true, true);
        }
        relation_close(newrel, NoLock);
    }

    /* if it's not a catalog table, clear any missing attribute settings */
    if (!is_system_catalog)
    {
        Relation    newrel;

        newrel = table_open(OIDOldHeap, NoLock);
        RelationClearMissing(newrel);
        relation_close(newrel, NoLock);
    }
}

/*
 * Get a list of tables that the current user owns and
 * have indisclustered set.  Return the list in a List * of RelToCluster
 * structs with the tableOid and the indexOid on which the table is already
 * clustered.
 */
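/*
 * Illustrative only: the multi-relation branch of cluster() consumes the
 * result as
 *
 *     rvs = get_tables_to_cluster(cluster_context);
 *     foreach(rv, rvs)
 *         cluster_rel(((RelToCluster *) lfirst(rv))->tableOid, ...);
 *
 * processing each relation in its own transaction, as described at the top
 * of this file.
 */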
static List *
get_tables_to_cluster(MemoryContext cluster_context)
{
    Relation    indRelation;
    TableScanDesc scan;
    ScanKeyData entry;
    HeapTuple   indexTuple;
    Form_pg_index index;
    MemoryContext old_context;
    RelToCluster *rvtc;
    List       *rvs = NIL;

    /*
     * Get all indexes that have indisclustered set and are owned by
     * appropriate user.  System relations or nailed-in relations cannot ever
     * have indisclustered set, because CLUSTER will refuse to set it when
     * called with one of them as argument.
     */
    indRelation = table_open(IndexRelationId, AccessShareLock);
    ScanKeyInit(&entry,
                Anum_pg_index_indisclustered,
                BTEqualStrategyNumber, F_BOOLEQ,
                BoolGetDatum(true));
    scan = table_beginscan_catalog(indRelation, 1, &entry);
    while ((indexTuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
    {
        index = (Form_pg_index) GETSTRUCT(indexTuple);

        if (!pg_class_ownercheck(index->indrelid, GetUserId()))
            continue;

        /*
         * We have to build the list in a different memory context so it will
         * survive the cross-transaction processing
         */
        old_context = MemoryContextSwitchTo(cluster_context);

        rvtc = (RelToCluster *) palloc(sizeof(RelToCluster));
        rvtc->tableOid = index->indrelid;
        rvtc->indexOid = index->indexrelid;
        rvs = lcons(rvtc, rvs);

        MemoryContextSwitchTo(old_context);
    }
    table_endscan(scan);

    relation_close(indRelation, AccessShareLock);

    return rvs;
}