storage.c source code [PostgreSQL/src/backend/catalog/storage.c]

/*-------------------------------------------------------------------------
 *
 * storage.c
 *	  code to create and destroy physical storage for relations
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/catalog/storage.c
 *
 * NOTES
 *	  Some of this code used to be in storage/smgr/smgr.c, and the
 *	  function names still reflect that.
 *
 *-------------------------------------------------------------------------
 */
19
20	#include "postgres.h"
21
22	#include "miscadmin.h"
23
24	#include "access/visibilitymap.h"
25	#include "access/xact.h"
26	#include "access/xlog.h"
27	#include "access/xloginsert.h"
28	#include "access/xlogutils.h"
29	#include "catalog/storage.h"
30	#include "catalog/storage_xlog.h"
31	#include "storage/freespace.h"
32	#include "storage/smgr.h"
33	#include "utils/memutils.h"
34	#include "utils/rel.h"
35
36	/*
37	* We keep a list of all relations (represented as RelFileNode values)
38	* that have been created or deleted in the current transaction. When
39	* a relation is created, we create the physical file immediately, but
40	* remember it so that we can delete the file again if the current
41	* transaction is aborted. Conversely, a deletion request is NOT
42	* executed immediately, but is just entered in the list. When and if
43	* the transaction commits, we can delete the physical file.
44	*
45	* To handle subtransactions, every entry is marked with its transaction
46	* nesting level. At subtransaction commit, we reassign the subtransaction's
47	* entries to the parent nesting level. At subtransaction abort, we can
48	* immediately execute the abort-time actions for all entries of the current
49	* nesting level.
50	*
51	* NOTE: the list is kept in TopMemoryContext to be sure it won't disappear
52	* unbetimes. It'd probably be OK to keep it in TopTransactionContext,
53	* but I'm being paranoid.
54	*/
55
56	typedef struct PendingRelDelete
57	{
58	RelFileNode relnode; / relation that may need to be deleted /
59	BackendId backend; / InvalidBackendId if not a temp rel /
60	bool atCommit; / T=delete at commit; F=delete at abort /
61	int nestLevel; / xact nesting level of request /
62	struct PendingRelDelete next; /* linked-list link /
63	} PendingRelDelete;
64
65	static PendingRelDelete pendingDeletes = NULL; /* head of linked list /
66
67	/*
68	* RelationCreateStorage
69	* Create physical storage for a relation.
70	*
71	* Create the underlying disk file storage for the relation. This only
72	* creates the main fork; additional forks are created lazily by the
73	* modules that need them.
74	*
75	* This function is transactional. The creation is WAL-logged, and if the
76	* transaction aborts later on, the storage will be destroyed.
77	*/
78	SMgrRelation
79	RelationCreateStorage(RelFileNode rnode, char relpersistence)
80	{
81	PendingRelDelete *pending;
82	SMgrRelation srel;
83	BackendId backend;
84	bool needs_wal;
85
86	switch (relpersistence)
87	{
88	case RELPERSISTENCE_TEMP:
89	backend = BackendIdForTempRelations();
90	needs_wal = false;
91	break;
92	case RELPERSISTENCE_UNLOGGED:
93	backend = InvalidBackendId;
94	needs_wal = false;
95	break;
96	case RELPERSISTENCE_PERMANENT:
97	backend = InvalidBackendId;
98	needs_wal = true;
99	break;
100	default:
101	elog(ERROR, "invalid relpersistence: %c", relpersistence);
102	return NULL; / placate compiler /
103	}
104
105	srel = smgropen(rnode, backend);
106	smgrcreate(srel, MAIN_FORKNUM, false);
107
108	if (needs_wal)
109	log_smgrcreate(&srel->smgr_rnode.node, MAIN_FORKNUM);
110
111	/ Add the relation to the list of stuff to delete at abort /
112	pending = (PendingRelDelete *)
113	MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
114	pending->relnode = rnode;
115	pending->backend = backend;
116	pending->atCommit = false; / delete if abort /
117	pending->nestLevel = GetCurrentTransactionNestLevel();
118	pending->next = pendingDeletes;
119	pendingDeletes = pending;
120
121	return srel;
122	}
123
124	/*
125	* Perform XLogInsert of an XLOG_SMGR_CREATE record to WAL.
126	*/
127	void
128	log_smgrcreate(const RelFileNode *rnode, ForkNumber forkNum)
129	{
130	xl_smgr_create xlrec;
131
132	/*
133	* Make an XLOG entry reporting the file creation.
134	*/
135	xlrec.rnode = *rnode;
136	xlrec.forkNum = forkNum;
137
138	XLogBeginInsert();
139	XLogRegisterData((char ) &xlrec, sizeof*(xlrec));
140	XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE \| XLR_SPECIAL_REL_UPDATE);
141	}
142
143	/*
144	* RelationDropStorage
145	* Schedule unlinking of physical storage at transaction commit.
146	*/
147	void
148	RelationDropStorage(Relation rel)
149	{
150	PendingRelDelete *pending;
151
152	/ Add the relation to the list of stuff to delete at commit /
153	pending = (PendingRelDelete *)
154	MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
155	pending->relnode = rel->rd_node;
156	pending->backend = rel->rd_backend;
157	pending->atCommit = true; / delete if commit /
158	pending->nestLevel = GetCurrentTransactionNestLevel();
159	pending->next = pendingDeletes;
160	pendingDeletes = pending;
161
162	/*
163	* NOTE: if the relation was created in this transaction, it will now be
164	* present in the pending-delete list twice, once with atCommit true and
165	* once with atCommit false. Hence, it will be physically deleted at end
166	* of xact in either case (and the other entry will be ignored by
167	* smgrDoPendingDeletes, so no error will occur). We could instead remove
168	* the existing list entry and delete the physical file immediately, but
169	* for now I'll keep the logic simple.
170	*/
171
172	RelationCloseSmgr(rel);
173	}
174
175	/*
176	* RelationPreserveStorage
177	* Mark a relation as not to be deleted after all.
178	*
179	* We need this function because relation mapping changes are committed
180	* separately from commit of the whole transaction, so it's still possible
181	* for the transaction to abort after the mapping update is done.
182	* When a new physical relation is installed in the map, it would be
183	* scheduled for delete-on-abort, so we'd delete it, and be in trouble.
184	* The relation mapper fixes this by telling us to not delete such relations
185	* after all as part of its commit.
186	*
187	* We also use this to reuse an old build of an index during ALTER TABLE, this
188	* time removing the delete-at-commit entry.
189	*
190	* No-op if the relation is not among those scheduled for deletion.
191	*/
192	void
193	RelationPreserveStorage(RelFileNode rnode, bool atCommit)
194	{
195	PendingRelDelete *pending;
196	PendingRelDelete *prev;
197	PendingRelDelete *next;
198
199	prev = NULL;
200	for (pending = pendingDeletes; pending != NULL; pending = next)
201	{
202	next = pending->next;
203	if (RelFileNodeEquals(rnode, pending->relnode)
204	&& pending->atCommit == atCommit)
205	{
206	/ unlink and delete list entry /
207	if (prev)
208	prev->next = next;
209	else
210	pendingDeletes = next;
211	pfree(pending);
212	/ prev does not change /
213	}
214	else
215	{
216	/ unrelated entry, don't touch it /
217	prev = pending;
218	}
219	}
220	}
221
222	/*
223	* RelationTruncate
224	* Physically truncate a relation to the specified number of blocks.
225	*
226	* This includes getting rid of any buffers for the blocks that are to be
227	* dropped.
228	*/
229	void
230	RelationTruncate(Relation rel, BlockNumber nblocks)
231	{
232	bool fsm;
233	bool vm;
234
235	/ Open it at the smgr level if not already done /
236	RelationOpenSmgr(rel);
237
238	/*
239	* Make sure smgr_targblock etc aren't pointing somewhere past new end
240	*/
241	rel->rd_smgr->smgr_targblock = InvalidBlockNumber;
242	rel->rd_smgr->smgr_fsm_nblocks = InvalidBlockNumber;
243	rel->rd_smgr->smgr_vm_nblocks = InvalidBlockNumber;
244
245	/ Truncate the FSM first if it exists /
246	fsm = smgrexists(rel->rd_smgr, FSM_FORKNUM);
247	if (fsm)
248	FreeSpaceMapTruncateRel(rel, nblocks);
249
250	/ Truncate the visibility map too if it exists. /
251	vm = smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM);
252	if (vm)
253	visibilitymap_truncate(rel, nblocks);
254
255	/*
256	* We WAL-log the truncation before actually truncating, which means
257	* trouble if the truncation fails. If we then crash, the WAL replay
258	* likely isn't going to succeed in the truncation either, and cause a
259	* PANIC. It's tempting to put a critical section here, but that cure
260	* would be worse than the disease. It would turn a usually harmless
261	* failure to truncate, that might spell trouble at WAL replay, into a
262	* certain PANIC.
263	*/
264	if (RelationNeedsWAL(rel))
265	{
266	/*
267	* Make an XLOG entry reporting the file truncation.
268	*/
269	XLogRecPtr lsn;
270	xl_smgr_truncate xlrec;
271
272	xlrec.blkno = nblocks;
273	xlrec.rnode = rel->rd_node;
274	xlrec.flags = SMGR_TRUNCATE_ALL;
275
276	XLogBeginInsert();
277	XLogRegisterData((char ) &xlrec, sizeof*(xlrec));
278
279	lsn = XLogInsert(RM_SMGR_ID,
280	XLOG_SMGR_TRUNCATE \| XLR_SPECIAL_REL_UPDATE);
281
282	/*
283	* Flush, because otherwise the truncation of the main relation might
284	* hit the disk before the WAL record, and the truncation of the FSM
285	* or visibility map. If we crashed during that window, we'd be left
286	* with a truncated heap, but the FSM or visibility map would still
287	* contain entries for the non-existent heap pages.
288	*/
289	if (fsm \|\| vm)
290	XLogFlush(lsn);
291	}
292
293	/ Do the real work /
294	smgrtruncate(rel->rd_smgr, MAIN_FORKNUM, nblocks);
295	}
296
297	/*
298	* Copy a fork's data, block by block.
299	*
300	* Note that this requires that there is no dirty data in shared buffers. If
301	* it's possible that there are, callers need to flush those using
302	* e.g. FlushRelationBuffers(rel).
303	*/
304	void
305	RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
306	ForkNumber forkNum, char relpersistence)
307	{
308	PGAlignedBlock buf;
309	Page page;
310	bool use_wal;
311	bool copying_initfork;
312	BlockNumber nblocks;
313	BlockNumber blkno;
314
315	page = (Page) buf.data;
316
317	/*
318	* The init fork for an unlogged relation in many respects has to be
319	* treated the same as normal relation, changes need to be WAL logged and
320	* it needs to be synced to disk.
321	*/
322	copying_initfork = relpersistence == RELPERSISTENCE_UNLOGGED &&
323	forkNum == INIT_FORKNUM;
324
325	/*
326	* We need to log the copied data in WAL iff WAL archiving/streaming is
327	* enabled AND it's a permanent relation.
328	*/
329	use_wal = XLogIsNeeded() &&
330	(relpersistence == RELPERSISTENCE_PERMANENT \|\| copying_initfork);
331
332	nblocks = smgrnblocks(src, forkNum);
333
334	for (blkno = `0`; blkno < nblocks; blkno++)
335	{
336	/ If we got a cancel signal during the copy of the data, quit /
337	CHECK_FOR_INTERRUPTS();
338
339	smgrread(src, forkNum, blkno, buf.data);
340
341	if (!PageIsVerified(page, blkno))
342	ereport(ERROR,
343	(errcode(ERRCODE_DATA_CORRUPTED),
344	errmsg("invalid page in block %u of relation %s",
345	blkno,
346	relpathbackend(src->smgr_rnode.node,
347	src->smgr_rnode.backend,
348	forkNum))));
349
350	/*
351	* WAL-log the copied page. Unfortunately we don't know what kind of a
352	* page this is, so we have to log the full page including any unused
353	* space.
354	*/
355	if (use_wal)
356	log_newpage(&dst->smgr_rnode.node, forkNum, blkno, page, false);
357
358	PageSetChecksumInplace(page, blkno);
359
360	/*
361	* Now write the page. We say isTemp = true even if it's not a temp
362	* rel, because there's no need for smgr to schedule an fsync for this
363	* write; we'll do it ourselves below.
364	*/
365	smgrextend(dst, forkNum, blkno, buf.data, true);
366	}
367
368	/*
369	* If the rel is WAL-logged, must fsync before commit. We use heap_sync
370	* to ensure that the toast table gets fsync'd too. (For a temp or
371	* unlogged rel we don't care since the data will be gone after a crash
372	* anyway.)
373	*
374	* It's obvious that we must do this when not WAL-logging the copy. It's
375	* less obvious that we have to do it even if we did WAL-log the copied
376	* pages. The reason is that since we're copying outside shared buffers, a
377	* CHECKPOINT occurring during the copy has no way to flush the previously
378	* written data to disk (indeed it won't know the new rel even exists). A
379	* crash later on would replay WAL from the checkpoint, therefore it
380	* wouldn't replay our earlier WAL entries. If we do not fsync those pages
381	* here, they might still not be on disk when the crash occurs.
382	*/
383	if (relpersistence == RELPERSISTENCE_PERMANENT \|\| copying_initfork)
384	smgrimmedsync(dst, forkNum);
385	}
386
387	/*
388	* smgrDoPendingDeletes() -- Take care of relation deletes at end of xact.
389	*
390	* This also runs when aborting a subxact; we want to clean up a failed
391	* subxact immediately.
392	*
393	* Note: It's possible that we're being asked to remove a relation that has
394	* no physical storage in any fork. In particular, it's possible that we're
395	* cleaning up an old temporary relation for which RemovePgTempFiles has
396	* already recovered the physical storage.
397	*/
398	void
399	smgrDoPendingDeletes(bool isCommit)
400	{
401	int nestLevel = GetCurrentTransactionNestLevel();
402	PendingRelDelete *pending;
403	PendingRelDelete *prev;
404	PendingRelDelete *next;
405	int nrels = `0`,
406	i = `0`,
407	maxrels = `0`;
408	SMgrRelation *srels = NULL;
409
410	prev = NULL;
411	for (pending = pendingDeletes; pending != NULL; pending = next)
412	{
413	next = pending->next;
414	if (pending->nestLevel < nestLevel)
415	{
416	/ outer-level entries should not be processed yet /
417	prev = pending;
418	}
419	else
420	{
421	/ unlink list entry first, so we don't retry on failure /
422	if (prev)
423	prev->next = next;
424	else
425	pendingDeletes = next;
426	/ do deletion if called for /
427	if (pending->atCommit == isCommit)
428	{
429	SMgrRelation srel;
430
431	srel = smgropen(pending->relnode, pending->backend);
432
433	/ allocate the initial array, or extend it, if needed /
434	if (maxrels == `0`)
435	{
436	maxrels = `8`;
437	srels = palloc(sizeof(SMgrRelation) * maxrels);
438	}
439	else if (maxrels <= nrels)
440	{
441	maxrels *= `2`;
442	srels = repalloc(srels, sizeof(SMgrRelation) * maxrels);
443	}
444
445	srels[nrels++] = srel;
446	}
447	/ must explicitly free the list entry /
448	pfree(pending);
449	/ prev does not change /
450	}
451	}
452
453	if (nrels > `0`)
454	{
455	smgrdounlinkall(srels, nrels, false);
456
457	for (i = `0`; i < nrels; i++)
458	smgrclose(srels[i]);
459
460	pfree(srels);
461	}
462	}
463
464	/*
465	* smgrGetPendingDeletes() -- Get a list of non-temp relations to be deleted.
466	*
467	* The return value is the number of relations scheduled for termination.
468	* *ptr is set to point to a freshly-palloc'd array of RelFileNodes.
469	* If there are no relations to be deleted, *ptr is set to NULL.
470	*
471	* Only non-temporary relations are included in the returned list. This is OK
472	* because the list is used only in contexts where temporary relations don't
473	* matter: we're either writing to the two-phase state file (and transactions
474	* that have touched temp tables can't be prepared) or we're writing to xlog
475	* (and all temporary files will be zapped if we restart anyway, so no need
476	* for redo to do it also).
477	*
478	* Note that the list does not include anything scheduled for termination
479	* by upper-level transactions.
480	*/
481	int
482	smgrGetPendingDeletes(bool forCommit, RelFileNode **ptr)
483	{
484	int nestLevel = GetCurrentTransactionNestLevel();
485	int nrels;
486	RelFileNode *rptr;
487	PendingRelDelete *pending;
488
489	nrels = `0`;
490	for (pending = pendingDeletes; pending != NULL; pending = pending->next)
491	{
492	if (pending->nestLevel >= nestLevel && pending->atCommit == forCommit
493	&& pending->backend == InvalidBackendId)
494	nrels++;
495	}
496	if (nrels == `0`)
497	{
498	*ptr = NULL;
499	return `0`;
500	}
501	rptr = (RelFileNode ) palloc(nrels sizeof(RelFileNode));
502	*ptr = rptr;
503	for (pending = pendingDeletes; pending != NULL; pending = pending->next)
504	{
505	if (pending->nestLevel >= nestLevel && pending->atCommit == forCommit
506	&& pending->backend == InvalidBackendId)
507	{
508	*rptr = pending->relnode;
509	rptr++;
510	}
511	}
512	return nrels;
513	}
514
515	/*
516	* PostPrepare_smgr -- Clean up after a successful PREPARE
517	*
518	* What we have to do here is throw away the in-memory state about pending
519	* relation deletes. It's all been recorded in the 2PC state file and
520	* it's no longer smgr's job to worry about it.
521	*/
522	void
523	PostPrepare_smgr(void)
524	{
525	PendingRelDelete *pending;
526	PendingRelDelete *next;
527
528	for (pending = pendingDeletes; pending != NULL; pending = next)
529	{
530	next = pending->next;
531	pendingDeletes = next;
532	/ must explicitly free the list entry /
533	pfree(pending);
534	}
535	}
536
537
538	/*
539	* AtSubCommit_smgr() --- Take care of subtransaction commit.
540	*
541	* Reassign all items in the pending-deletes list to the parent transaction.
542	*/
543	void
544	AtSubCommit_smgr(void)
545	{
546	int nestLevel = GetCurrentTransactionNestLevel();
547	PendingRelDelete *pending;
548
549	for (pending = pendingDeletes; pending != NULL; pending = pending->next)
550	{
551	if (pending->nestLevel >= nestLevel)
552	pending->nestLevel = nestLevel - `1`;
553	}
554	}
555
556	/*
557	* AtSubAbort_smgr() --- Take care of subtransaction abort.
558	*
559	* Delete created relations and forget about deleted relations.
560	* We can execute these operations immediately because we know this
561	* subtransaction will not commit.
562	*/
563	void
564	AtSubAbort_smgr(void)
565	{
566	smgrDoPendingDeletes(false);
567	}
568
569	void
570	smgr_redo(XLogReaderState *record)
571	{
572	XLogRecPtr lsn = record->EndRecPtr;
573	uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
574
575	/ Backup blocks are not used in smgr records /
576	Assert(!XLogRecHasAnyBlockRefs(record));
577
578	if (info == XLOG_SMGR_CREATE)
579	{
580	xl_smgr_create xlrec = (xl_smgr_create ) XLogRecGetData(record);
581	SMgrRelation reln;
582
583	reln = smgropen(xlrec->rnode, InvalidBackendId);
584	smgrcreate(reln, xlrec->forkNum, true);
585	}
586	else if (info == XLOG_SMGR_TRUNCATE)
587	{
588	xl_smgr_truncate xlrec = (xl_smgr_truncate ) XLogRecGetData(record);
589	SMgrRelation reln;
590	Relation rel;
591
592	reln = smgropen(xlrec->rnode, InvalidBackendId);
593
594	/*
595	* Forcibly create relation if it doesn't exist (which suggests that
596	* it was dropped somewhere later in the WAL sequence). As in
597	* XLogReadBufferForRedo, we prefer to recreate the rel and replay the
598	* log as best we can until the drop is seen.
599	*/
600	smgrcreate(reln, MAIN_FORKNUM, true);
601
602	/*
603	* Before we perform the truncation, update minimum recovery point to
604	* cover this WAL record. Once the relation is truncated, there's no
605	* going back. The buffer manager enforces the WAL-first rule for
606	* normal updates to relation files, so that the minimum recovery
607	* point is always updated before the corresponding change in the data
608	* file is flushed to disk. We have to do the same manually here.
609	*
610	* Doing this before the truncation means that if the truncation fails
611	* for some reason, you cannot start up the system even after restart,
612	* until you fix the underlying situation so that the truncation will
613	* succeed. Alternatively, we could update the minimum recovery point
614	* after truncation, but that would leave a small window where the
615	* WAL-first rule could be violated.
616	*/
617	XLogFlush(lsn);
618
619	if ((xlrec->flags & SMGR_TRUNCATE_HEAP) != `0`)
620	{
621	smgrtruncate(reln, MAIN_FORKNUM, xlrec->blkno);
622
623	/ Also tell xlogutils.c about it /
624	XLogTruncateRelation(xlrec->rnode, MAIN_FORKNUM, xlrec->blkno);
625	}
626
627	/ Truncate FSM and VM too /
628	rel = CreateFakeRelcacheEntry(xlrec->rnode);
629
630	if ((xlrec->flags & SMGR_TRUNCATE_FSM) != `0` &&
631	smgrexists(reln, FSM_FORKNUM))
632	FreeSpaceMapTruncateRel(rel, xlrec->blkno);
633	if ((xlrec->flags & SMGR_TRUNCATE_VM) != `0` &&
634	smgrexists(reln, VISIBILITYMAP_FORKNUM))
635	visibilitymap_truncate(rel, xlrec->blkno);
636
637	FreeFakeRelcacheEntry(rel);
638	}
639	else
640	elog(PANIC, "smgr_redo: unknown op code %u", info);
641	}
642

Browse the source code of PostgreSQL/src/backend/catalog/storage.c