relmapper.c source code [PostgreSQL/src/backend/utils/cache/relmapper.c]

/*-------------------------------------------------------------------------
 *
 * relmapper.c
 *	  Catalog-to-filenode mapping
 *
 * For most tables, the physical file underlying the table is specified by
 * pg_class.relfilenode.  However, that obviously won't work for pg_class
 * itself, nor for the other "nailed" catalogs for which we have to be able
 * to set up working Relation entries without access to pg_class.  It also
 * does not work for shared catalogs, since there is no practical way to
 * update other databases' pg_class entries when relocating a shared catalog.
 * Therefore, for these special catalogs (henceforth referred to as "mapped
 * catalogs") we rely on a separately maintained file that shows the mapping
 * from catalog OIDs to filenode numbers.  Each database has a map file for
 * its local mapped catalogs, and there is a separate map file for shared
 * catalogs.  Mapped catalogs have zero in their pg_class.relfilenode entries.
 *
 * Relocation of a normal table is committed (ie, the new physical file becomes
 * authoritative) when the pg_class row update commits.  For mapped catalogs,
 * the act of updating the map file is effectively commit of the relocation.
 * We postpone the file update till just before commit of the transaction
 * doing the rewrite, but there is necessarily a window between.  Therefore
 * mapped catalogs can only be relocated by operations such as VACUUM FULL
 * and CLUSTER, which make no transactionally-significant changes: it must be
 * safe for the new file to replace the old, even if the transaction itself
 * aborts.  An important factor here is that the indexes and toast table of
 * a mapped catalog must also be mapped, so that the rewrites/relocations of
 * all these files commit in a single map file update rather than being tied
 * to transaction commit.
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/utils/cache/relmapper.c
 *
 *-------------------------------------------------------------------------
 */
40	#include "postgres.h"
41
42	#include <fcntl.h>
43	#include <sys/stat.h>
44	#include <unistd.h>
45
46	#include "access/xact.h"
47	#include "access/xlog.h"
48	#include "access/xloginsert.h"
49	#include "catalog/catalog.h"
50	#include "catalog/pg_tablespace.h"
51	#include "catalog/storage.h"
52	#include "miscadmin.h"
53	#include "pgstat.h"
54	#include "storage/fd.h"
55	#include "storage/lwlock.h"
56	#include "utils/inval.h"
57	#include "utils/relmapper.h"
58
59
/*
 * The map file is critical data: we have no automatic method for recovering
 * from loss or corruption of it.  We use a CRC so that we can detect
 * corruption.  To minimize the risk of failed updates, the map file should
 * be kept to no more than one standard-size disk sector (ie 512 bytes),
 * and we use overwrite-in-place rather than playing renaming games.
 * The struct layout below is designed to occupy exactly 512 bytes, which
 * might make filesystem updates a bit more efficient.
 *
 * Entries in the mappings[] array are in no particular order.  We could
 * speed searching by insisting on OID order, but it really shouldn't be
 * worth the trouble given the intended size of the mapping sets.
 */
#define RELMAPPER_FILENAME		"pg_filenode.map"

#define RELMAPPER_FILEMAGIC		0x592717	/* version ID value */

#define MAX_MAPPINGS			62	/* 62 * 8 + 16 = 512 */
78
79	typedef struct RelMapping
80	{
81	Oid mapoid; / OID of a catalog /
82	Oid mapfilenode; / its filenode number /
83	} RelMapping;
84
85	typedef struct RelMapFile
86	{
87	int32 magic; / always RELMAPPER_FILEMAGIC /
88	int32 num_mappings; / number of valid RelMapping entries /
89	RelMapping mappings[MAX_MAPPINGS];
90	pg_crc32c crc; / CRC of all above /
91	int32 pad; / to make the struct size be 512 exactly /
92	} RelMapFile;
93
94	/*
95	* State for serializing local and shared relmappings for parallel workers
96	* (active states only). See notes on active_* and pending_* updates state.
97	*/
98	typedef struct SerializedActiveRelMaps
99	{
100	RelMapFile active_shared_updates;
101	RelMapFile active_local_updates;
102	} SerializedActiveRelMaps;
103
104	/*
105	* The currently known contents of the shared map file and our database's
106	* local map file are stored here. These can be reloaded from disk
107	* immediately whenever we receive an update sinval message.
108	*/
109	static RelMapFile shared_map;
110	static RelMapFile local_map;
111
112	/*
113	* We use the same RelMapFile data structure to track uncommitted local
114	* changes in the mappings (but note the magic and crc fields are not made
115	* valid in these variables). Currently, map updates are not allowed within
116	* subtransactions, so one set of transaction-level changes is sufficient.
117	*
118	* The active_xxx variables contain updates that are valid in our transaction
119	* and should be honored by RelationMapOidToFilenode. The pending_xxx
120	* variables contain updates we have been told about that aren't active yet;
121	* they will become active at the next CommandCounterIncrement. This setup
122	* lets map updates act similarly to updates of pg_class rows, ie, they
123	* become visible only at the next CommandCounterIncrement boundary.
124	*
125	* Active shared and active local updates are serialized by the parallel
126	* infrastructure, and deserialized within parallel workers.
127	*/
128	static RelMapFile active_shared_updates;
129	static RelMapFile active_local_updates;
130	static RelMapFile pending_shared_updates;
131	static RelMapFile pending_local_updates;
132
133
134	/ non-export function prototypes /
135	static void apply_map_update(RelMapFile *map, Oid relationId, Oid fileNode,
136	bool add_okay);
137	static void merge_map_updates(RelMapFile map, const* RelMapFile *updates,
138	bool add_okay);
139	static void load_relmap_file(bool shared);
140	static void write_relmap_file(bool shared, RelMapFile *newmap,
141	bool write_wal, bool send_sinval, bool preserve_files,
142	Oid dbid, Oid tsid, const char *dbpath);
143	static void perform_relmap_update(bool shared, const RelMapFile *updates);
144
145
146	/*
147	* RelationMapOidToFilenode
148	*
149	* The raison d' etre ... given a relation OID, look up its filenode.
150	*
151	* Although shared and local relation OIDs should never overlap, the caller
152	* always knows which we need --- so pass that information to avoid useless
153	* searching.
154	*
155	* Returns InvalidOid if the OID is not known (which should never happen,
156	* but the caller is in a better position to report a meaningful error).
157	*/
158	Oid
159	RelationMapOidToFilenode(Oid relationId, bool shared)
160	{
161	const RelMapFile *map;
162	int32 i;
163
164	/ If there are active updates, believe those over the main maps /
165	if (shared)
166	{
167	map = &active_shared_updates;
168	for (i = `0`; i < map->num_mappings; i++)
169	{
170	if (relationId == map->mappings[i].mapoid)
171	return map->mappings[i].mapfilenode;
172	}
173	map = &shared_map;
174	for (i = `0`; i < map->num_mappings; i++)
175	{
176	if (relationId == map->mappings[i].mapoid)
177	return map->mappings[i].mapfilenode;
178	}
179	}
180	else
181	{
182	map = &active_local_updates;
183	for (i = `0`; i < map->num_mappings; i++)
184	{
185	if (relationId == map->mappings[i].mapoid)
186	return map->mappings[i].mapfilenode;
187	}
188	map = &local_map;
189	for (i = `0`; i < map->num_mappings; i++)
190	{
191	if (relationId == map->mappings[i].mapoid)
192	return map->mappings[i].mapfilenode;
193	}
194	}
195
196	return InvalidOid;
197	}
198
199	/*
200	* RelationMapFilenodeToOid
201	*
202	* Do the reverse of the normal direction of mapping done in
203	* RelationMapOidToFilenode.
204	*
205	* This is not supposed to be used during normal running but rather for
206	* information purposes when looking at the filesystem or xlog.
207	*
208	* Returns InvalidOid if the OID is not known; this can easily happen if the
209	* relfilenode doesn't pertain to a mapped relation.
210	*/
211	Oid
212	RelationMapFilenodeToOid(Oid filenode, bool shared)
213	{
214	const RelMapFile *map;
215	int32 i;
216
217	/ If there are active updates, believe those over the main maps /
218	if (shared)
219	{
220	map = &active_shared_updates;
221	for (i = `0`; i < map->num_mappings; i++)
222	{
223	if (filenode == map->mappings[i].mapfilenode)
224	return map->mappings[i].mapoid;
225	}
226	map = &shared_map;
227	for (i = `0`; i < map->num_mappings; i++)
228	{
229	if (filenode == map->mappings[i].mapfilenode)
230	return map->mappings[i].mapoid;
231	}
232	}
233	else
234	{
235	map = &active_local_updates;
236	for (i = `0`; i < map->num_mappings; i++)
237	{
238	if (filenode == map->mappings[i].mapfilenode)
239	return map->mappings[i].mapoid;
240	}
241	map = &local_map;
242	for (i = `0`; i < map->num_mappings; i++)
243	{
244	if (filenode == map->mappings[i].mapfilenode)
245	return map->mappings[i].mapoid;
246	}
247	}
248
249	return InvalidOid;
250	}
251
252	/*
253	* RelationMapUpdateMap
254	*
255	* Install a new relfilenode mapping for the specified relation.
256	*
257	* If immediate is true (or we're bootstrapping), the mapping is activated
258	* immediately. Otherwise it is made pending until CommandCounterIncrement.
259	*/
260	void
261	RelationMapUpdateMap(Oid relationId, Oid fileNode, bool shared,
262	bool immediate)
263	{
264	RelMapFile *map;
265
266	if (IsBootstrapProcessingMode())
267	{
268	/*
269	* In bootstrap mode, the mapping gets installed in permanent map.
270	*/
271	if (shared)
272	map = &shared_map;
273	else
274	map = &local_map;
275	}
276	else
277	{
278	/*
279	* We don't currently support map changes within subtransactions, or
280	* when in parallel mode. This could be done with more bookkeeping
281	* infrastructure, but it doesn't presently seem worth it.
282	*/
283	if (GetCurrentTransactionNestLevel() > `1`)
284	elog(ERROR, "cannot change relation mapping within subtransaction");
285
286	if (IsInParallelMode())
287	elog(ERROR, "cannot change relation mapping in parallel mode");
288
289	if (immediate)
290	{
291	/ Make it active, but only locally /
292	if (shared)
293	map = &active_shared_updates;
294	else
295	map = &active_local_updates;
296	}
297	else
298	{
299	/ Make it pending /
300	if (shared)
301	map = &pending_shared_updates;
302	else
303	map = &pending_local_updates;
304	}
305	}
306	apply_map_update(map, relationId, fileNode, true);
307	}
308
309	/*
310	* apply_map_update
311	*
312	* Insert a new mapping into the given map variable, replacing any existing
313	* mapping for the same relation.
314	*
315	* In some cases the caller knows there must be an existing mapping; pass
316	* add_okay = false to draw an error if not.
317	*/
318	static void
319	apply_map_update(RelMapFile *map, Oid relationId, Oid fileNode, bool add_okay)
320	{
321	int32 i;
322
323	/ Replace any existing mapping /
324	for (i = `0`; i < map->num_mappings; i++)
325	{
326	if (relationId == map->mappings[i].mapoid)
327	{
328	map->mappings[i].mapfilenode = fileNode;
329	return;
330	}
331	}
332
333	/ Nope, need to add a new mapping /
334	if (!add_okay)
335	elog(ERROR, "attempt to apply a mapping to unmapped relation %u",
336	relationId);
337	if (map->num_mappings >= MAX_MAPPINGS)
338	elog(ERROR, "ran out of space in relation map");
339	map->mappings[map->num_mappings].mapoid = relationId;
340	map->mappings[map->num_mappings].mapfilenode = fileNode;
341	map->num_mappings++;
342	}
343
344	/*
345	* merge_map_updates
346	*
347	* Merge all the updates in the given pending-update map into the target map.
348	* This is just a bulk form of apply_map_update.
349	*/
350	static void
351	merge_map_updates(RelMapFile map, const* RelMapFile *updates, bool add_okay)
352	{
353	int32 i;
354
355	for (i = `0`; i < updates->num_mappings; i++)
356	{
357	apply_map_update(map,
358	updates->mappings[i].mapoid,
359	updates->mappings[i].mapfilenode,
360	add_okay);
361	}
362	}
363
364	/*
365	* RelationMapRemoveMapping
366	*
367	* Remove a relation's entry in the map. This is only allowed for "active"
368	* (but not committed) local mappings. We need it so we can back out the
369	* entry for the transient target file when doing VACUUM FULL/CLUSTER on
370	* a mapped relation.
371	*/
372	void
373	RelationMapRemoveMapping(Oid relationId)
374	{
375	RelMapFile *map = &active_local_updates;
376	int32 i;
377
378	for (i = `0`; i < map->num_mappings; i++)
379	{
380	if (relationId == map->mappings[i].mapoid)
381	{
382	/ Found it, collapse it out /
383	map->mappings[i] = map->mappings[map->num_mappings - `1`];
384	map->num_mappings--;
385	return;
386	}
387	}
388	elog(ERROR, "could not find temporary mapping for relation %u",
389	relationId);
390	}
391
392	/*
393	* RelationMapInvalidate
394	*
395	* This routine is invoked for SI cache flush messages. We must re-read
396	* the indicated map file. However, we might receive a SI message in a
397	* process that hasn't yet, and might never, load the mapping files;
398	* for example the autovacuum launcher, which must not try to read
399	* a local map since it is attached to no particular database.
400	* So, re-read only if the map is valid now.
401	*/
402	void
403	RelationMapInvalidate(bool shared)
404	{
405	if (shared)
406	{
407	if (shared_map.magic == RELMAPPER_FILEMAGIC)
408	load_relmap_file(true);
409	}
410	else
411	{
412	if (local_map.magic == RELMAPPER_FILEMAGIC)
413	load_relmap_file(false);
414	}
415	}
416
417	/*
418	* RelationMapInvalidateAll
419	*
420	* Reload all map files. This is used to recover from SI message buffer
421	* overflow: we can't be sure if we missed an inval message.
422	* Again, reload only currently-valid maps.
423	*/
424	void
425	RelationMapInvalidateAll(void)
426	{
427	if (shared_map.magic == RELMAPPER_FILEMAGIC)
428	load_relmap_file(true);
429	if (local_map.magic == RELMAPPER_FILEMAGIC)
430	load_relmap_file(false);
431	}
432
433	/*
434	* AtCCI_RelationMap
435	*
436	* Activate any "pending" relation map updates at CommandCounterIncrement time.
437	*/
438	void
439	AtCCI_RelationMap(void)
440	{
441	if (pending_shared_updates.num_mappings != `0`)
442	{
443	merge_map_updates(&active_shared_updates,
444	&pending_shared_updates,
445	true);
446	pending_shared_updates.num_mappings = `0`;
447	}
448	if (pending_local_updates.num_mappings != `0`)
449	{
450	merge_map_updates(&active_local_updates,
451	&pending_local_updates,
452	true);
453	pending_local_updates.num_mappings = `0`;
454	}
455	}
456
457	/*
458	* AtEOXact_RelationMap
459	*
460	* Handle relation mapping at main-transaction commit or abort.
461	*
462	* During commit, this must be called as late as possible before the actual
463	* transaction commit, so as to minimize the window where the transaction
464	* could still roll back after committing map changes. Although nothing
465	* critically bad happens in such a case, we still would prefer that it
466	* not happen, since we'd possibly be losing useful updates to the relations'
467	* pg_class row(s).
468	*
469	* During abort, we just have to throw away any pending map changes.
470	* Normal post-abort cleanup will take care of fixing relcache entries.
471	* Parallel worker commit/abort is handled by resetting active mappings
472	* that may have been received from the leader process. (There should be
473	* no pending updates in parallel workers.)
474	*/
475	void
476	AtEOXact_RelationMap(bool isCommit, bool isParallelWorker)
477	{
478	if (isCommit && !isParallelWorker)
479	{
480	/*
481	* We should not get here with any "pending" updates. (We could
482	* logically choose to treat such as committed, but in the current
483	* code this should never happen.)
484	*/
485	Assert(pending_shared_updates.num_mappings == `0`);
486	Assert(pending_local_updates.num_mappings == `0`);
487
488	/*
489	* Write any active updates to the actual map files, then reset them.
490	*/
491	if (active_shared_updates.num_mappings != `0`)
492	{
493	perform_relmap_update(true, &active_shared_updates);
494	active_shared_updates.num_mappings = `0`;
495	}
496	if (active_local_updates.num_mappings != `0`)
497	{
498	perform_relmap_update(false, &active_local_updates);
499	active_local_updates.num_mappings = `0`;
500	}
501	}
502	else
503	{
504	/ Abort or parallel worker --- drop all local and pending updates /
505	Assert(!isParallelWorker \|\| pending_shared_updates.num_mappings == `0`);
506	Assert(!isParallelWorker \|\| pending_local_updates.num_mappings == `0`);
507
508	active_shared_updates.num_mappings = `0`;
509	active_local_updates.num_mappings = `0`;
510	pending_shared_updates.num_mappings = `0`;
511	pending_local_updates.num_mappings = `0`;
512	}
513	}
514
515	/*
516	* AtPrepare_RelationMap
517	*
518	* Handle relation mapping at PREPARE.
519	*
520	* Currently, we don't support preparing any transaction that changes the map.
521	*/
522	void
523	AtPrepare_RelationMap(void)
524	{
525	if (active_shared_updates.num_mappings != `0` \|\|
526	active_local_updates.num_mappings != `0` \|\|
527	pending_shared_updates.num_mappings != `0` \|\|
528	pending_local_updates.num_mappings != `0`)
529	ereport(ERROR,
530	(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
531	errmsg("cannot PREPARE a transaction that modified relation mapping")));
532	}
533
534	/*
535	* CheckPointRelationMap
536	*
537	* This is called during a checkpoint. It must ensure that any relation map
538	* updates that were WAL-logged before the start of the checkpoint are
539	* securely flushed to disk and will not need to be replayed later. This
540	* seems unlikely to be a performance-critical issue, so we use a simple
541	* method: we just take and release the RelationMappingLock. This ensures
542	* that any already-logged map update is complete, because write_relmap_file
543	* will fsync the map file before the lock is released.
544	*/
545	void
546	CheckPointRelationMap(void)
547	{
548	LWLockAcquire(RelationMappingLock, LW_SHARED);
549	LWLockRelease(RelationMappingLock);
550	}
551
552	/*
553	* RelationMapFinishBootstrap
554	*
555	* Write out the initial relation mapping files at the completion of
556	* bootstrap. All the mapped files should have been made known to us
557	* via RelationMapUpdateMap calls.
558	*/
559	void
560	RelationMapFinishBootstrap(void)
561	{
562	Assert(IsBootstrapProcessingMode());
563
564	/ Shouldn't be anything "pending" ... /
565	Assert(active_shared_updates.num_mappings == `0`);
566	Assert(active_local_updates.num_mappings == `0`);
567	Assert(pending_shared_updates.num_mappings == `0`);
568	Assert(pending_local_updates.num_mappings == `0`);
569
570	/ Write the files; no WAL or sinval needed /
571	write_relmap_file(true, &shared_map, false, false, false,
572	InvalidOid, GLOBALTABLESPACE_OID, NULL);
573	write_relmap_file(false, &local_map, false, false, false,
574	MyDatabaseId, MyDatabaseTableSpace, DatabasePath);
575	}
576
577	/*
578	* RelationMapInitialize
579	*
580	* This initializes the mapper module at process startup. We can't access the
581	* database yet, so just make sure the maps are empty.
582	*/
583	void
584	RelationMapInitialize(void)
585	{
586	/ The static variables should initialize to zeroes, but let's be sure /
587	shared_map.magic = `0`; / mark it not loaded /
588	local_map.magic = `0`;
589	shared_map.num_mappings = `0`;
590	local_map.num_mappings = `0`;
591	active_shared_updates.num_mappings = `0`;
592	active_local_updates.num_mappings = `0`;
593	pending_shared_updates.num_mappings = `0`;
594	pending_local_updates.num_mappings = `0`;
595	}
596
597	/*
598	* RelationMapInitializePhase2
599	*
600	* This is called to prepare for access to pg_database during startup.
601	* We should be able to read the shared map file now.
602	*/
603	void
604	RelationMapInitializePhase2(void)
605	{
606	/*
607	* In bootstrap mode, the map file isn't there yet, so do nothing.
608	*/
609	if (IsBootstrapProcessingMode())
610	return;
611
612	/*
613	* Load the shared map file, die on error.
614	*/
615	load_relmap_file(true);
616	}
617
618	/*
619	* RelationMapInitializePhase3
620	*
621	* This is called as soon as we have determined MyDatabaseId and set up
622	* DatabasePath. At this point we should be able to read the local map file.
623	*/
624	void
625	RelationMapInitializePhase3(void)
626	{
627	/*
628	* In bootstrap mode, the map file isn't there yet, so do nothing.
629	*/
630	if (IsBootstrapProcessingMode())
631	return;
632
633	/*
634	* Load the local map file, die on error.
635	*/
636	load_relmap_file(false);
637	}
638
639	/*
640	* EstimateRelationMapSpace
641	*
642	* Estimate space needed to pass active shared and local relmaps to parallel
643	* workers.
644	*/
645	Size
646	EstimateRelationMapSpace(void)
647	{
648	return sizeof(SerializedActiveRelMaps);
649	}
650
651	/*
652	* SerializeRelationMap
653	*
654	* Serialize active shared and local relmap state for parallel workers.
655	*/
656	void
657	SerializeRelationMap(Size maxSize, char *startAddress)
658	{
659	SerializedActiveRelMaps *relmaps;
660
661	Assert(maxSize >= EstimateRelationMapSpace());
662
663	relmaps = (SerializedActiveRelMaps *) startAddress;
664	relmaps->active_shared_updates = active_shared_updates;
665	relmaps->active_local_updates = active_local_updates;
666	}
667
668	/*
669	* RestoreRelationMap
670	*
671	* Restore active shared and local relmap state within a parallel worker.
672	*/
673	void
674	RestoreRelationMap(char *startAddress)
675	{
676	SerializedActiveRelMaps *relmaps;
677
678	if (active_shared_updates.num_mappings != `0` \|\|
679	active_local_updates.num_mappings != `0` \|\|
680	pending_shared_updates.num_mappings != `0` \|\|
681	pending_local_updates.num_mappings != `0`)
682	elog(ERROR, "parallel worker has existing mappings");
683
684	relmaps = (SerializedActiveRelMaps *) startAddress;
685	active_shared_updates = relmaps->active_shared_updates;
686	active_local_updates = relmaps->active_local_updates;
687	}
688
689	/*
690	* load_relmap_file -- load data from the shared or local map file
691	*
692	* Because the map file is essential for access to core system catalogs,
693	* failure to read it is a fatal error.
694	*
695	* Note that the local case requires DatabasePath to be set up.
696	*/
697	static void
698	load_relmap_file(bool shared)
699	{
700	RelMapFile *map;
701	char mapfilename[MAXPGPATH];
702	pg_crc32c crc;
703	int fd;
704	int r;
705
706	if (shared)
707	{
708	snprintf(mapfilename, sizeof(mapfilename), "global/%s",
709	RELMAPPER_FILENAME);
710	map = &shared_map;
711	}
712	else
713	{
714	snprintf(mapfilename, sizeof(mapfilename), "%s/%s",
715	DatabasePath, RELMAPPER_FILENAME);
716	map = &local_map;
717	}
718
719	/ Read data ... /
720	fd = OpenTransientFile(mapfilename, O_RDONLY \| PG_BINARY);
721	if (fd < `0`)
722	ereport(FATAL,
723	(errcode_for_file_access(),
724	errmsg("could not open file \"%s\": %m",
725	mapfilename)));
726
727	/*
728	* Note: we could take RelationMappingLock in shared mode here, but it
729	* seems unnecessary since our read() should be atomic against any
730	* concurrent updater's write(). If the file is updated shortly after we
731	* look, the sinval signaling mechanism will make us re-read it before we
732	* are able to access any relation that's affected by the change.
733	*/
734	pgstat_report_wait_start(WAIT_EVENT_RELATION_MAP_READ);
735	r = read(fd, map, sizeof(RelMapFile));
736	if (r != sizeof(RelMapFile))
737	{
738	if (r < `0`)
739	ereport(FATAL,
740	(errcode_for_file_access(),
741	errmsg("could not read file \"%s\": %m", mapfilename)));
742	else
743	ereport(FATAL,
744	(errcode(ERRCODE_DATA_CORRUPTED),
745	errmsg("could not read file \"%s\": read %d of %zu",
746	mapfilename, r, sizeof(RelMapFile))));
747	}
748	pgstat_report_wait_end();
749
750	if (CloseTransientFile(fd))
751	ereport(FATAL,
752	(errcode_for_file_access(),
753	errmsg("could not close file \"%s\": %m",
754	mapfilename)));
755
756	/ check for correct magic number, etc /
757	if (map->magic != RELMAPPER_FILEMAGIC \|\|
758	map->num_mappings < `0` \|\|
759	map->num_mappings > MAX_MAPPINGS)
760	ereport(FATAL,
761	(errmsg("relation mapping file \"%s\" contains invalid data",
762	mapfilename)));
763
764	/ verify the CRC /
765	INIT_CRC32C(crc);
766	COMP_CRC32C(crc, (char *) map, offsetof(RelMapFile, crc));
767	FIN_CRC32C(crc);
768
769	if (!EQ_CRC32C(crc, map->crc))
770	ereport(FATAL,
771	(errmsg("relation mapping file \"%s\" contains incorrect checksum",
772	mapfilename)));
773	}
774
775	/*
776	* Write out a new shared or local map file with the given contents.
777	*
778	* The magic number and CRC are automatically updated in *newmap. On
779	* success, we copy the data to the appropriate permanent static variable.
780	*
781	* If write_wal is true then an appropriate WAL message is emitted.
782	* (It will be false for bootstrap and WAL replay cases.)
783	*
784	* If send_sinval is true then a SI invalidation message is sent.
785	* (This should be true except in bootstrap case.)
786	*
787	* If preserve_files is true then the storage manager is warned not to
788	* delete the files listed in the map.
789	*
790	* Because this may be called during WAL replay when MyDatabaseId,
791	* DatabasePath, etc aren't valid, we require the caller to pass in suitable
792	* values. The caller is also responsible for being sure no concurrent
793	* map update could be happening.
794	*/
795	static void
796	write_relmap_file(bool shared, RelMapFile *newmap,
797	bool write_wal, bool send_sinval, bool preserve_files,
798	Oid dbid, Oid tsid, const char *dbpath)
799	{
800	int fd;
801	RelMapFile *realmap;
802	char mapfilename[MAXPGPATH];
803
804	/*
805	* Fill in the overhead fields and update CRC.
806	*/
807	newmap->magic = RELMAPPER_FILEMAGIC;
808	if (newmap->num_mappings < `0` \|\| newmap->num_mappings > MAX_MAPPINGS)
809	elog(ERROR, "attempt to write bogus relation mapping");
810
811	INIT_CRC32C(newmap->crc);
812	COMP_CRC32C(newmap->crc, (char *) newmap, offsetof(RelMapFile, crc));
813	FIN_CRC32C(newmap->crc);
814
815	/*
816	* Open the target file. We prefer to do this before entering the
817	* critical section, so that an open() failure need not force PANIC.
818	*/
819	if (shared)
820	{
821	snprintf(mapfilename, sizeof(mapfilename), "global/%s",
822	RELMAPPER_FILENAME);
823	realmap = &shared_map;
824	}
825	else
826	{
827	snprintf(mapfilename, sizeof(mapfilename), "%s/%s",
828	dbpath, RELMAPPER_FILENAME);
829	realmap = &local_map;
830	}
831
832	fd = OpenTransientFile(mapfilename, O_WRONLY \| O_CREAT \| PG_BINARY);
833	if (fd < `0`)
834	ereport(ERROR,
835	(errcode_for_file_access(),
836	errmsg("could not open file \"%s\": %m",
837	mapfilename)));
838
839	if (write_wal)
840	{
841	xl_relmap_update xlrec;
842	XLogRecPtr lsn;
843
844	/ now errors are fatal ... /
845	START_CRIT_SECTION();
846
847	xlrec.dbid = dbid;
848	xlrec.tsid = tsid;
849	xlrec.nbytes = sizeof(RelMapFile);
850
851	XLogBeginInsert();
852	XLogRegisterData((char *) (&xlrec), MinSizeOfRelmapUpdate);
853	XLogRegisterData((char ) newmap, sizeof*(RelMapFile));
854
855	lsn = XLogInsert(RM_RELMAP_ID, XLOG_RELMAP_UPDATE);
856
857	/ As always, WAL must hit the disk before the data update does /
858	XLogFlush(lsn);
859	}
860
861	errno = `0`;
862	pgstat_report_wait_start(WAIT_EVENT_RELATION_MAP_WRITE);
863	if (write(fd, newmap, sizeof(RelMapFile)) != sizeof(RelMapFile))
864	{
865	/ if write didn't set errno, assume problem is no disk space /
866	if (errno == `0`)
867	errno = ENOSPC;
868	ereport(ERROR,
869	(errcode_for_file_access(),
870	errmsg("could not write file \"%s\": %m",
871	mapfilename)));
872	}
873	pgstat_report_wait_end();
874
875	/*
876	* We choose to fsync the data to disk before considering the task done.
877	* It would be possible to relax this if it turns out to be a performance
878	* issue, but it would complicate checkpointing --- see notes for
879	* CheckPointRelationMap.
880	*/
881	pgstat_report_wait_start(WAIT_EVENT_RELATION_MAP_SYNC);
882	if (pg_fsync(fd) != `0`)
883	ereport(data_sync_elevel(ERROR),
884	(errcode_for_file_access(),
885	errmsg("could not fsync file \"%s\": %m",
886	mapfilename)));
887	pgstat_report_wait_end();
888
889	if (CloseTransientFile(fd))
890	ereport(ERROR,
891	(errcode_for_file_access(),
892	errmsg("could not close file \"%s\": %m",
893	mapfilename)));
894
895	/*
896	* Now that the file is safely on disk, send sinval message to let other
897	* backends know to re-read it. We must do this inside the critical
898	* section: if for some reason we fail to send the message, we have to
899	* force a database-wide PANIC. Otherwise other backends might continue
900	* execution with stale mapping information, which would be catastrophic
901	* as soon as others began to use the now-committed data.
902	*/
903	if (send_sinval)
904	CacheInvalidateRelmap(dbid);
905
906	/*
907	* Make sure that the files listed in the map are not deleted if the outer
908	* transaction aborts. This had better be within the critical section
909	* too: it's not likely to fail, but if it did, we'd arrive at transaction
910	* abort with the files still vulnerable. PANICing will leave things in a
911	* good state on-disk.
912	*
913	* Note: we're cheating a little bit here by assuming that mapped files
914	* are either in pg_global or the database's default tablespace.
915	*/
916	if (preserve_files)
917	{
918	int32 i;
919
920	for (i = `0`; i < newmap->num_mappings; i++)
921	{
922	RelFileNode rnode;
923
924	rnode.spcNode = tsid;
925	rnode.dbNode = dbid;
926	rnode.relNode = newmap->mappings[i].mapfilenode;
927	RelationPreserveStorage(rnode, false);
928	}
929	}
930
931	/ Success, update permanent copy /
932	memcpy(realmap, newmap, sizeof(RelMapFile));
933
934	/ Critical section done /
935	if (write_wal)
936	END_CRIT_SECTION();
937	}
938
939	/*
940	* Merge the specified updates into the appropriate "real" map,
941	* and write out the changes. This function must be used for committing
942	* updates during normal multiuser operation.
943	*/
944	static void
945	perform_relmap_update(bool shared, const RelMapFile *updates)
946	{
947	RelMapFile newmap;
948
949	/*
950	* Anyone updating a relation's mapping info should take exclusive lock on
951	* that rel and hold it until commit. This ensures that there will not be
952	* concurrent updates on the same mapping value; but there could easily be
953	* concurrent updates on different values in the same file. We cover that
954	* by acquiring the RelationMappingLock, re-reading the target file to
955	* ensure it's up to date, applying the updates, and writing the data
956	* before releasing RelationMappingLock.
957	*
958	* There is only one RelationMappingLock. In principle we could try to
959	* have one per mapping file, but it seems unlikely to be worth the
960	* trouble.
961	*/
962	LWLockAcquire(RelationMappingLock, LW_EXCLUSIVE);
963
964	/ Be certain we see any other updates just made /
965	load_relmap_file(shared);
966
967	/ Prepare updated data in a local variable /
968	if (shared)
969	memcpy(&newmap, &shared_map, sizeof(RelMapFile));
970	else
971	memcpy(&newmap, &local_map, sizeof(RelMapFile));
972
973	/*
974	* Apply the updates to newmap. No new mappings should appear, unless
975	* somebody is adding indexes to system catalogs.
976	*/
977	merge_map_updates(&newmap, updates, allowSystemTableMods);
978
979	/ Write out the updated map and do other necessary tasks /
980	write_relmap_file(shared, &newmap, true, true, true,
981	(shared ? InvalidOid : MyDatabaseId),
982	(shared ? GLOBALTABLESPACE_OID : MyDatabaseTableSpace),
983	DatabasePath);
984
985	/ Now we can release the lock /
986	LWLockRelease(RelationMappingLock);
987	}
988
989	/*
990	* RELMAP resource manager's routines
991	*/
992	void
993	relmap_redo(XLogReaderState *record)
994	{
995	uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
996
997	/ Backup blocks are not used in relmap records /
998	Assert(!XLogRecHasAnyBlockRefs(record));
999
1000	if (info == XLOG_RELMAP_UPDATE)
1001	{
1002	xl_relmap_update xlrec = (xl_relmap_update ) XLogRecGetData(record);
1003	RelMapFile newmap;
1004	char *dbpath;
1005
1006	if (xlrec->nbytes != sizeof(RelMapFile))
1007	elog(PANIC, "relmap_redo: wrong size %u in relmap update record",
1008	xlrec->nbytes);
1009	memcpy(&newmap, xlrec->data, sizeof(newmap));
1010
1011	/ We need to construct the pathname for this database /
1012	dbpath = GetDatabasePath(xlrec->dbid, xlrec->tsid);
1013
1014	/*
1015	* Write out the new map and send sinval, but of course don't write a
1016	* new WAL entry. There's no surrounding transaction to tell to
1017	* preserve files, either.
1018	*
1019	* There shouldn't be anyone else updating relmaps during WAL replay,
1020	* so we don't bother to take the RelationMappingLock. We would need
1021	* to do so if load_relmap_file needed to interlock against writers.
1022	*/
1023	write_relmap_file((xlrec->dbid == InvalidOid), &newmap,
1024	false, true, false,
1025	xlrec->dbid, xlrec->tsid, dbpath);
1026
1027	pfree(dbpath);
1028	}
1029	else
1030	elog(PANIC, "relmap_redo: unknown op code %u", info);
1031	}
1032

Browse the source code of PostgreSQL/src/backend/utils/cache/relmapper.c