1/*-------------------------------------------------------------------------
2 *
3 * storage.c
4 * code to create and destroy physical storage for relations
5 *
6 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 *
10 * IDENTIFICATION
11 * src/backend/catalog/storage.c
12 *
13 * NOTES
14 * Some of this code used to be in storage/smgr/smgr.c, and the
15 * function names still reflect that.
16 *
17 *-------------------------------------------------------------------------
18 */
19
20#include "postgres.h"
21
22#include "miscadmin.h"
23
24#include "access/visibilitymap.h"
25#include "access/xact.h"
26#include "access/xlog.h"
27#include "access/xloginsert.h"
28#include "access/xlogutils.h"
29#include "catalog/storage.h"
30#include "catalog/storage_xlog.h"
31#include "storage/freespace.h"
32#include "storage/smgr.h"
33#include "utils/memutils.h"
34#include "utils/rel.h"
35
36/*
37 * We keep a list of all relations (represented as RelFileNode values)
38 * that have been created or deleted in the current transaction. When
39 * a relation is created, we create the physical file immediately, but
40 * remember it so that we can delete the file again if the current
41 * transaction is aborted. Conversely, a deletion request is NOT
42 * executed immediately, but is just entered in the list. When and if
43 * the transaction commits, we can delete the physical file.
44 *
45 * To handle subtransactions, every entry is marked with its transaction
46 * nesting level. At subtransaction commit, we reassign the subtransaction's
47 * entries to the parent nesting level. At subtransaction abort, we can
48 * immediately execute the abort-time actions for all entries of the current
49 * nesting level.
50 *
 * NOTE: the list is kept in TopMemoryContext to be sure it won't disappear
 * prematurely.  It'd probably be OK to keep it in TopTransactionContext,
 * but I'm being paranoid.
54 */
55
56typedef struct PendingRelDelete
57{
58 RelFileNode relnode; /* relation that may need to be deleted */
59 BackendId backend; /* InvalidBackendId if not a temp rel */
60 bool atCommit; /* T=delete at commit; F=delete at abort */
61 int nestLevel; /* xact nesting level of request */
62 struct PendingRelDelete *next; /* linked-list link */
63} PendingRelDelete;
64
65static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */
66
67/*
68 * RelationCreateStorage
69 * Create physical storage for a relation.
70 *
71 * Create the underlying disk file storage for the relation. This only
72 * creates the main fork; additional forks are created lazily by the
73 * modules that need them.
74 *
75 * This function is transactional. The creation is WAL-logged, and if the
76 * transaction aborts later on, the storage will be destroyed.
77 */
78SMgrRelation
79RelationCreateStorage(RelFileNode rnode, char relpersistence)
80{
81 PendingRelDelete *pending;
82 SMgrRelation srel;
83 BackendId backend;
84 bool needs_wal;
85
86 switch (relpersistence)
87 {
88 case RELPERSISTENCE_TEMP:
89 backend = BackendIdForTempRelations();
90 needs_wal = false;
91 break;
92 case RELPERSISTENCE_UNLOGGED:
93 backend = InvalidBackendId;
94 needs_wal = false;
95 break;
96 case RELPERSISTENCE_PERMANENT:
97 backend = InvalidBackendId;
98 needs_wal = true;
99 break;
100 default:
101 elog(ERROR, "invalid relpersistence: %c", relpersistence);
102 return NULL; /* placate compiler */
103 }
104
105 srel = smgropen(rnode, backend);
106 smgrcreate(srel, MAIN_FORKNUM, false);
107
108 if (needs_wal)
109 log_smgrcreate(&srel->smgr_rnode.node, MAIN_FORKNUM);
110
111 /* Add the relation to the list of stuff to delete at abort */
112 pending = (PendingRelDelete *)
113 MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
114 pending->relnode = rnode;
115 pending->backend = backend;
116 pending->atCommit = false; /* delete if abort */
117 pending->nestLevel = GetCurrentTransactionNestLevel();
118 pending->next = pendingDeletes;
119 pendingDeletes = pending;
120
121 return srel;
122}
123
124/*
125 * Perform XLogInsert of an XLOG_SMGR_CREATE record to WAL.
126 */
127void
128log_smgrcreate(const RelFileNode *rnode, ForkNumber forkNum)
129{
130 xl_smgr_create xlrec;
131
132 /*
133 * Make an XLOG entry reporting the file creation.
134 */
135 xlrec.rnode = *rnode;
136 xlrec.forkNum = forkNum;
137
138 XLogBeginInsert();
139 XLogRegisterData((char *) &xlrec, sizeof(xlrec));
140 XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE | XLR_SPECIAL_REL_UPDATE);
141}
142
143/*
144 * RelationDropStorage
145 * Schedule unlinking of physical storage at transaction commit.
146 */
147void
148RelationDropStorage(Relation rel)
149{
150 PendingRelDelete *pending;
151
152 /* Add the relation to the list of stuff to delete at commit */
153 pending = (PendingRelDelete *)
154 MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
155 pending->relnode = rel->rd_node;
156 pending->backend = rel->rd_backend;
157 pending->atCommit = true; /* delete if commit */
158 pending->nestLevel = GetCurrentTransactionNestLevel();
159 pending->next = pendingDeletes;
160 pendingDeletes = pending;
161
162 /*
163 * NOTE: if the relation was created in this transaction, it will now be
164 * present in the pending-delete list twice, once with atCommit true and
165 * once with atCommit false. Hence, it will be physically deleted at end
166 * of xact in either case (and the other entry will be ignored by
167 * smgrDoPendingDeletes, so no error will occur). We could instead remove
168 * the existing list entry and delete the physical file immediately, but
169 * for now I'll keep the logic simple.
170 */
171
172 RelationCloseSmgr(rel);
173}
174
175/*
176 * RelationPreserveStorage
177 * Mark a relation as not to be deleted after all.
178 *
179 * We need this function because relation mapping changes are committed
180 * separately from commit of the whole transaction, so it's still possible
181 * for the transaction to abort after the mapping update is done.
182 * When a new physical relation is installed in the map, it would be
183 * scheduled for delete-on-abort, so we'd delete it, and be in trouble.
184 * The relation mapper fixes this by telling us to not delete such relations
185 * after all as part of its commit.
186 *
187 * We also use this to reuse an old build of an index during ALTER TABLE, this
188 * time removing the delete-at-commit entry.
189 *
190 * No-op if the relation is not among those scheduled for deletion.
191 */
192void
193RelationPreserveStorage(RelFileNode rnode, bool atCommit)
194{
195 PendingRelDelete *pending;
196 PendingRelDelete *prev;
197 PendingRelDelete *next;
198
199 prev = NULL;
200 for (pending = pendingDeletes; pending != NULL; pending = next)
201 {
202 next = pending->next;
203 if (RelFileNodeEquals(rnode, pending->relnode)
204 && pending->atCommit == atCommit)
205 {
206 /* unlink and delete list entry */
207 if (prev)
208 prev->next = next;
209 else
210 pendingDeletes = next;
211 pfree(pending);
212 /* prev does not change */
213 }
214 else
215 {
216 /* unrelated entry, don't touch it */
217 prev = pending;
218 }
219 }
220}
221
222/*
223 * RelationTruncate
224 * Physically truncate a relation to the specified number of blocks.
225 *
226 * This includes getting rid of any buffers for the blocks that are to be
227 * dropped.
228 */
229void
230RelationTruncate(Relation rel, BlockNumber nblocks)
231{
232 bool fsm;
233 bool vm;
234
235 /* Open it at the smgr level if not already done */
236 RelationOpenSmgr(rel);
237
238 /*
239 * Make sure smgr_targblock etc aren't pointing somewhere past new end
240 */
241 rel->rd_smgr->smgr_targblock = InvalidBlockNumber;
242 rel->rd_smgr->smgr_fsm_nblocks = InvalidBlockNumber;
243 rel->rd_smgr->smgr_vm_nblocks = InvalidBlockNumber;
244
245 /* Truncate the FSM first if it exists */
246 fsm = smgrexists(rel->rd_smgr, FSM_FORKNUM);
247 if (fsm)
248 FreeSpaceMapTruncateRel(rel, nblocks);
249
250 /* Truncate the visibility map too if it exists. */
251 vm = smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM);
252 if (vm)
253 visibilitymap_truncate(rel, nblocks);
254
255 /*
256 * We WAL-log the truncation before actually truncating, which means
257 * trouble if the truncation fails. If we then crash, the WAL replay
258 * likely isn't going to succeed in the truncation either, and cause a
259 * PANIC. It's tempting to put a critical section here, but that cure
260 * would be worse than the disease. It would turn a usually harmless
261 * failure to truncate, that might spell trouble at WAL replay, into a
262 * certain PANIC.
263 */
264 if (RelationNeedsWAL(rel))
265 {
266 /*
267 * Make an XLOG entry reporting the file truncation.
268 */
269 XLogRecPtr lsn;
270 xl_smgr_truncate xlrec;
271
272 xlrec.blkno = nblocks;
273 xlrec.rnode = rel->rd_node;
274 xlrec.flags = SMGR_TRUNCATE_ALL;
275
276 XLogBeginInsert();
277 XLogRegisterData((char *) &xlrec, sizeof(xlrec));
278
279 lsn = XLogInsert(RM_SMGR_ID,
280 XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE);
281
282 /*
283 * Flush, because otherwise the truncation of the main relation might
284 * hit the disk before the WAL record, and the truncation of the FSM
285 * or visibility map. If we crashed during that window, we'd be left
286 * with a truncated heap, but the FSM or visibility map would still
287 * contain entries for the non-existent heap pages.
288 */
289 if (fsm || vm)
290 XLogFlush(lsn);
291 }
292
293 /* Do the real work */
294 smgrtruncate(rel->rd_smgr, MAIN_FORKNUM, nblocks);
295}
296
297/*
298 * Copy a fork's data, block by block.
299 *
300 * Note that this requires that there is no dirty data in shared buffers. If
301 * it's possible that there are, callers need to flush those using
302 * e.g. FlushRelationBuffers(rel).
303 */
304void
305RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
306 ForkNumber forkNum, char relpersistence)
307{
308 PGAlignedBlock buf;
309 Page page;
310 bool use_wal;
311 bool copying_initfork;
312 BlockNumber nblocks;
313 BlockNumber blkno;
314
315 page = (Page) buf.data;
316
317 /*
318 * The init fork for an unlogged relation in many respects has to be
319 * treated the same as normal relation, changes need to be WAL logged and
320 * it needs to be synced to disk.
321 */
322 copying_initfork = relpersistence == RELPERSISTENCE_UNLOGGED &&
323 forkNum == INIT_FORKNUM;
324
325 /*
326 * We need to log the copied data in WAL iff WAL archiving/streaming is
327 * enabled AND it's a permanent relation.
328 */
329 use_wal = XLogIsNeeded() &&
330 (relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork);
331
332 nblocks = smgrnblocks(src, forkNum);
333
334 for (blkno = 0; blkno < nblocks; blkno++)
335 {
336 /* If we got a cancel signal during the copy of the data, quit */
337 CHECK_FOR_INTERRUPTS();
338
339 smgrread(src, forkNum, blkno, buf.data);
340
341 if (!PageIsVerified(page, blkno))
342 ereport(ERROR,
343 (errcode(ERRCODE_DATA_CORRUPTED),
344 errmsg("invalid page in block %u of relation %s",
345 blkno,
346 relpathbackend(src->smgr_rnode.node,
347 src->smgr_rnode.backend,
348 forkNum))));
349
350 /*
351 * WAL-log the copied page. Unfortunately we don't know what kind of a
352 * page this is, so we have to log the full page including any unused
353 * space.
354 */
355 if (use_wal)
356 log_newpage(&dst->smgr_rnode.node, forkNum, blkno, page, false);
357
358 PageSetChecksumInplace(page, blkno);
359
360 /*
361 * Now write the page. We say isTemp = true even if it's not a temp
362 * rel, because there's no need for smgr to schedule an fsync for this
363 * write; we'll do it ourselves below.
364 */
365 smgrextend(dst, forkNum, blkno, buf.data, true);
366 }
367
368 /*
369 * If the rel is WAL-logged, must fsync before commit. We use heap_sync
370 * to ensure that the toast table gets fsync'd too. (For a temp or
371 * unlogged rel we don't care since the data will be gone after a crash
372 * anyway.)
373 *
374 * It's obvious that we must do this when not WAL-logging the copy. It's
375 * less obvious that we have to do it even if we did WAL-log the copied
376 * pages. The reason is that since we're copying outside shared buffers, a
377 * CHECKPOINT occurring during the copy has no way to flush the previously
378 * written data to disk (indeed it won't know the new rel even exists). A
379 * crash later on would replay WAL from the checkpoint, therefore it
380 * wouldn't replay our earlier WAL entries. If we do not fsync those pages
381 * here, they might still not be on disk when the crash occurs.
382 */
383 if (relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork)
384 smgrimmedsync(dst, forkNum);
385}
386
387/*
388 * smgrDoPendingDeletes() -- Take care of relation deletes at end of xact.
389 *
390 * This also runs when aborting a subxact; we want to clean up a failed
391 * subxact immediately.
392 *
393 * Note: It's possible that we're being asked to remove a relation that has
394 * no physical storage in any fork. In particular, it's possible that we're
395 * cleaning up an old temporary relation for which RemovePgTempFiles has
396 * already recovered the physical storage.
397 */
398void
399smgrDoPendingDeletes(bool isCommit)
400{
401 int nestLevel = GetCurrentTransactionNestLevel();
402 PendingRelDelete *pending;
403 PendingRelDelete *prev;
404 PendingRelDelete *next;
405 int nrels = 0,
406 i = 0,
407 maxrels = 0;
408 SMgrRelation *srels = NULL;
409
410 prev = NULL;
411 for (pending = pendingDeletes; pending != NULL; pending = next)
412 {
413 next = pending->next;
414 if (pending->nestLevel < nestLevel)
415 {
416 /* outer-level entries should not be processed yet */
417 prev = pending;
418 }
419 else
420 {
421 /* unlink list entry first, so we don't retry on failure */
422 if (prev)
423 prev->next = next;
424 else
425 pendingDeletes = next;
426 /* do deletion if called for */
427 if (pending->atCommit == isCommit)
428 {
429 SMgrRelation srel;
430
431 srel = smgropen(pending->relnode, pending->backend);
432
433 /* allocate the initial array, or extend it, if needed */
434 if (maxrels == 0)
435 {
436 maxrels = 8;
437 srels = palloc(sizeof(SMgrRelation) * maxrels);
438 }
439 else if (maxrels <= nrels)
440 {
441 maxrels *= 2;
442 srels = repalloc(srels, sizeof(SMgrRelation) * maxrels);
443 }
444
445 srels[nrels++] = srel;
446 }
447 /* must explicitly free the list entry */
448 pfree(pending);
449 /* prev does not change */
450 }
451 }
452
453 if (nrels > 0)
454 {
455 smgrdounlinkall(srels, nrels, false);
456
457 for (i = 0; i < nrels; i++)
458 smgrclose(srels[i]);
459
460 pfree(srels);
461 }
462}
463
464/*
465 * smgrGetPendingDeletes() -- Get a list of non-temp relations to be deleted.
466 *
467 * The return value is the number of relations scheduled for termination.
468 * *ptr is set to point to a freshly-palloc'd array of RelFileNodes.
469 * If there are no relations to be deleted, *ptr is set to NULL.
470 *
471 * Only non-temporary relations are included in the returned list. This is OK
472 * because the list is used only in contexts where temporary relations don't
473 * matter: we're either writing to the two-phase state file (and transactions
474 * that have touched temp tables can't be prepared) or we're writing to xlog
475 * (and all temporary files will be zapped if we restart anyway, so no need
476 * for redo to do it also).
477 *
478 * Note that the list does not include anything scheduled for termination
479 * by upper-level transactions.
480 */
481int
482smgrGetPendingDeletes(bool forCommit, RelFileNode **ptr)
483{
484 int nestLevel = GetCurrentTransactionNestLevel();
485 int nrels;
486 RelFileNode *rptr;
487 PendingRelDelete *pending;
488
489 nrels = 0;
490 for (pending = pendingDeletes; pending != NULL; pending = pending->next)
491 {
492 if (pending->nestLevel >= nestLevel && pending->atCommit == forCommit
493 && pending->backend == InvalidBackendId)
494 nrels++;
495 }
496 if (nrels == 0)
497 {
498 *ptr = NULL;
499 return 0;
500 }
501 rptr = (RelFileNode *) palloc(nrels * sizeof(RelFileNode));
502 *ptr = rptr;
503 for (pending = pendingDeletes; pending != NULL; pending = pending->next)
504 {
505 if (pending->nestLevel >= nestLevel && pending->atCommit == forCommit
506 && pending->backend == InvalidBackendId)
507 {
508 *rptr = pending->relnode;
509 rptr++;
510 }
511 }
512 return nrels;
513}
514
515/*
516 * PostPrepare_smgr -- Clean up after a successful PREPARE
517 *
518 * What we have to do here is throw away the in-memory state about pending
519 * relation deletes. It's all been recorded in the 2PC state file and
520 * it's no longer smgr's job to worry about it.
521 */
522void
523PostPrepare_smgr(void)
524{
525 PendingRelDelete *pending;
526 PendingRelDelete *next;
527
528 for (pending = pendingDeletes; pending != NULL; pending = next)
529 {
530 next = pending->next;
531 pendingDeletes = next;
532 /* must explicitly free the list entry */
533 pfree(pending);
534 }
535}
536
537
538/*
539 * AtSubCommit_smgr() --- Take care of subtransaction commit.
540 *
541 * Reassign all items in the pending-deletes list to the parent transaction.
542 */
543void
544AtSubCommit_smgr(void)
545{
546 int nestLevel = GetCurrentTransactionNestLevel();
547 PendingRelDelete *pending;
548
549 for (pending = pendingDeletes; pending != NULL; pending = pending->next)
550 {
551 if (pending->nestLevel >= nestLevel)
552 pending->nestLevel = nestLevel - 1;
553 }
554}
555
/*
 * AtSubAbort_smgr() --- subtransaction abort processing.
 *
 * Runs the pending deletes in abort mode: relations created by the
 * subtransaction are removed, and its deletion requests are forgotten.
 * This can be done at once, since the subtransaction is known not to
 * commit.
 */
void
AtSubAbort_smgr(void)
{
	const bool	isCommit = false;

	smgrDoPendingDeletes(isCommit);
}
568
569void
570smgr_redo(XLogReaderState *record)
571{
572 XLogRecPtr lsn = record->EndRecPtr;
573 uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
574
575 /* Backup blocks are not used in smgr records */
576 Assert(!XLogRecHasAnyBlockRefs(record));
577
578 if (info == XLOG_SMGR_CREATE)
579 {
580 xl_smgr_create *xlrec = (xl_smgr_create *) XLogRecGetData(record);
581 SMgrRelation reln;
582
583 reln = smgropen(xlrec->rnode, InvalidBackendId);
584 smgrcreate(reln, xlrec->forkNum, true);
585 }
586 else if (info == XLOG_SMGR_TRUNCATE)
587 {
588 xl_smgr_truncate *xlrec = (xl_smgr_truncate *) XLogRecGetData(record);
589 SMgrRelation reln;
590 Relation rel;
591
592 reln = smgropen(xlrec->rnode, InvalidBackendId);
593
594 /*
595 * Forcibly create relation if it doesn't exist (which suggests that
596 * it was dropped somewhere later in the WAL sequence). As in
597 * XLogReadBufferForRedo, we prefer to recreate the rel and replay the
598 * log as best we can until the drop is seen.
599 */
600 smgrcreate(reln, MAIN_FORKNUM, true);
601
602 /*
603 * Before we perform the truncation, update minimum recovery point to
604 * cover this WAL record. Once the relation is truncated, there's no
605 * going back. The buffer manager enforces the WAL-first rule for
606 * normal updates to relation files, so that the minimum recovery
607 * point is always updated before the corresponding change in the data
608 * file is flushed to disk. We have to do the same manually here.
609 *
610 * Doing this before the truncation means that if the truncation fails
611 * for some reason, you cannot start up the system even after restart,
612 * until you fix the underlying situation so that the truncation will
613 * succeed. Alternatively, we could update the minimum recovery point
614 * after truncation, but that would leave a small window where the
615 * WAL-first rule could be violated.
616 */
617 XLogFlush(lsn);
618
619 if ((xlrec->flags & SMGR_TRUNCATE_HEAP) != 0)
620 {
621 smgrtruncate(reln, MAIN_FORKNUM, xlrec->blkno);
622
623 /* Also tell xlogutils.c about it */
624 XLogTruncateRelation(xlrec->rnode, MAIN_FORKNUM, xlrec->blkno);
625 }
626
627 /* Truncate FSM and VM too */
628 rel = CreateFakeRelcacheEntry(xlrec->rnode);
629
630 if ((xlrec->flags & SMGR_TRUNCATE_FSM) != 0 &&
631 smgrexists(reln, FSM_FORKNUM))
632 FreeSpaceMapTruncateRel(rel, xlrec->blkno);
633 if ((xlrec->flags & SMGR_TRUNCATE_VM) != 0 &&
634 smgrexists(reln, VISIBILITYMAP_FORKNUM))
635 visibilitymap_truncate(rel, xlrec->blkno);
636
637 FreeFakeRelcacheEntry(rel);
638 }
639 else
640 elog(PANIC, "smgr_redo: unknown op code %u", info);
641}
642