fd.c source code [PostgreSQL/src/backend/storage/file/fd.c]

1	/-------------------------------------------------------------------------*
2	*
3	* fd.c
4	* Virtual file descriptor code.
5	*
6	* Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7	* Portions Copyright (c) 1994, Regents of the University of California
8	*
9	* IDENTIFICATION
10	* src/backend/storage/file/fd.c
11	*
12	* NOTES:
13	*
14	* This code manages a cache of 'virtual' file descriptors (VFDs).
15	* The server opens many file descriptors for a variety of reasons,
16	* including base tables, scratch files (e.g., sort and hash spool
17	* files), and random calls to C library routines like system(3); it
18	* is quite easy to exceed system limits on the number of open files a
19	* single process can have. (This is around 1024 on many modern
20	* operating systems, but may be lower on others.)
21	*
22	* VFDs are managed as an LRU pool, with actual OS file descriptors
23	* being opened and closed as needed. Obviously, if a routine is
24	* opened using these interfaces, all subsequent operations must also
25	* be through these interfaces (the File type is not a real file
26	* descriptor).
27	*
28	* For this scheme to work, most (if not all) routines throughout the
29	* server should use these interfaces instead of calling the C library
30	* routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
31	* may find ourselves short of real file descriptors anyway.
32	*
33	* INTERFACE ROUTINES
34	*
35	* PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
36	* A File opened with OpenTemporaryFile is automatically deleted when the
37	* File is closed, either explicitly or implicitly at end of transaction or
38	* process exit. PathNameOpenFile is intended for files that are held open
39	* for a long time, like relation files. It is the caller's responsibility
40	* to close them; there is no automatic mechanism in fd.c for that.
41	*
42	* PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage
43	* temporary files that have names so that they can be shared between
44	* backends. Such files are automatically closed and count against the
45	* temporary file limit of the backend that creates them, but unlike anonymous
46	* files they are not automatically deleted. See sharedfileset.c for a shared
47	* ownership mechanism that provides automatic cleanup for shared files when
48	* the last of a group of backends detaches.
49	*
50	* AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
51	* wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
52	* They behave like the corresponding native functions, except that the handle
53	* is registered with the current subtransaction, and will be automatically
54	* closed at abort. These are intended mainly for short operations like
55	* reading a configuration file; there is a limit on the number of files that
56	* can be opened using these functions at any one time.
57	*
58	* Finally, BasicOpenFile is just a thin wrapper around open() that can
59	* release file descriptors in use by the virtual file descriptors if
60	* necessary. There is no automatic cleanup of file descriptors returned by
61	* BasicOpenFile; it is solely the caller's responsibility to close the file
62	* descriptor by calling close(2).
63	*
64	*-------------------------------------------------------------------------
65	*/
66
67	#include "postgres.h"
68
69	#include <sys/file.h>
70	#include <sys/param.h>
71	#include <sys/stat.h>
72	#ifndef WIN32
73	#include <sys/mman.h>
74	#endif
75	#include <limits.h>
76	#include <unistd.h>
77	#include <fcntl.h>
78	#ifdef HAVE_SYS_RESOURCE_H
79	#include <sys/resource.h> /* for getrlimit */
80	#endif
81
82	#include "miscadmin.h"
83	#include "access/xact.h"
84	#include "access/xlog.h"
85	#include "catalog/pg_tablespace.h"
86	#include "common/file_perm.h"
87	#include "pgstat.h"
88	#include "portability/mem.h"
89	#include "storage/fd.h"
90	#include "storage/ipc.h"
91	#include "utils/guc.h"
92	#include "utils/resowner_private.h"
93
94
/*
 * PG_FLUSH_DATA_WORKS is defined whenever at least one of the flushing
 * mechanisms that pg_flush_data() can use (sync_file_range, msync with
 * MS_ASYNC, or posix_fadvise with POSIX_FADV_DONTNEED) is available.
 */
#if defined(HAVE_SYNC_FILE_RANGE) || \
	(!defined(WIN32) && defined(MS_ASYNC)) || \
	(defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED))
#define PG_FLUSH_DATA_WORKS 1
#endif
103
/*
 * Number of kernel file descriptors that fd.c deliberately leaves unused.
 *
 * system(), the dynamic loader, and other code open files without asking
 * fd.c first, so some headroom has to be kept for them.  (This makes EMFILE
 * unlikely, but ENFILE caused by other processes consuming descriptors can
 * never be ruled out; bypassing fd.c is therefore a bad idea, even though
 * not all code can be controlled.)
 *
 * Since this is a fixed allowance, it effectively assumes such code never
 * holds descriptors open over the long term; otherwise the slop is likely
 * to be too small.  In particular, loading a shared library is expected not
 * to cause a permanent increase in the number of open files.  (This appears
 * to be true on most if not all platforms as of Feb 2004.)
 */
#define NUM_RESERVED_FDS		10

/*
 * Minimum number of usable descriptors that must remain once the reserved
 * ones have been subtracted; with fewer than this, we choke.
 */
#define FD_MINFREE				10
126
/*
 * GUC parameter: upper bound on files per process.
 *
 * Many platforms let a single process open far more files than the system
 * can really sustain when many processes do the same.  This setting lets
 * the DBA cap max_safe_fds below what the postmaster's initial probe says
 * would work.
 */
int			max_files_per_process = 1000;

/*
 * Upper bound on the number of file descriptors opened for VFD entries or
 * for AllocateFile/AllocateDir/OpenTransientFile operations.
 *
 * It starts out at a conservative value and stays there indefinitely in
 * bootstrap or standalone-backend operation.  Under a normal postmaster,
 * set_max_safe_fds() is called late in initialization to update it, and
 * forked subprocesses inherit the result.
 *
 * Note: max_files_per_process is already taken into account when this
 * variable is set, so it need not be tested separately.
 */
int			max_safe_fds = 32;	/* default if not changed */

/* Whether it is safe to continue running after fsync() fails. */
bool		data_sync_retry = false;
150
/*
 * Debugging support.
 *
 * With FDDEBUG defined, DO_DB(A) executes A while preserving errno across
 * it; otherwise it expands to a no-op expression.
 */
#ifndef FDDEBUG
#define DO_DB(A) \
	((void) 0)
#else
#define DO_DB(A) \
	do { \
		int			_do_db_save_errno = errno; \
		A; \
		errno = _do_db_save_errno; \
	} while (0)
#endif

/* Value stored in Vfd.fd while no kernel descriptor is assigned */
#define VFD_CLOSED (-1)

/* True if "file" indexes a VFD slot that currently has a file name */
#define FileIsValid(file) \
	((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)

/* True if the VFD currently has no kernel descriptor */
#define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)

/* these are the assigned bits in fdstate below: */
#define FD_DELETE_AT_CLOSE	(1 << 0)	/* T = delete when closed */
#define FD_CLOSE_AT_EOXACT	(1 << 1)	/* T = close at eoXact */
#define FD_TEMP_FILE_LIMIT	(1 << 2)	/* T = respect temp_file_limit */
176
177	typedef struct vfd
178	{
179	int fd; / current FD, or VFD_CLOSED if none /
180	unsigned short fdstate; / bitflags for VFD's state /
181	ResourceOwner resowner; / owner, for automatic cleanup /
182	File nextFree; / link to next free VFD, if in freelist /
183	File lruMoreRecently; / doubly linked recency-of-use list /
184	File lruLessRecently;
185	off_t fileSize; / current size of file (0 if not temporary) /
186	char fileName; /* name of file, or NULL for unused VFD /
187	/ NB: fileName is malloc'd, and must be free'd when closing the VFD /
188	int fileFlags; / open(2) flags for (re)opening the file /
189	mode_t fileMode; / mode to pass to open(2) /
190	} Vfd;
191
192	/*
193	* Virtual File Descriptor array pointer and size. This grows as
194	* needed. 'File' values are indexes into this array.
195	* Note that VfdCache[0] is not a usable VFD, just a list header.
196	*/
197	static Vfd *VfdCache;
198	static Size SizeVfdCache = `0`;
199
200	/*
201	* Number of file descriptors known to be in use by VFD entries.
202	*/
203	static int nfile = `0`;
204
205	/*
206	* Flag to tell whether it's worth scanning VfdCache looking for temp files
207	* to close
208	*/
209	static bool have_xact_temporary_files = false;
210
211	/*
212	* Tracks the total size of all temporary files. Note: when temp_file_limit
213	* is being enforced, this cannot overflow since the limit cannot be more
214	* than INT_MAX kilobytes. When not enforcing, it could theoretically
215	* overflow, but we don't care.
216	*/
217	static uint64 temporary_files_size = `0`;
218
219	/*
220	* List of OS handles opened with AllocateFile, AllocateDir and
221	* OpenTransientFile.
222	*/
223	typedef enum
224	{
225	AllocateDescFile,
226	AllocateDescPipe,
227	AllocateDescDir,
228	AllocateDescRawFD
229	} AllocateDescKind;
230
231	typedef struct
232	{
233	AllocateDescKind kind;
234	SubTransactionId create_subid;
235	union
236	{
237	FILE *file;
238	DIR *dir;
239	int fd;
240	} desc;
241	} AllocateDesc;
242
243	static int numAllocatedDescs = `0`;
244	static int maxAllocatedDescs = `0`;
245	static AllocateDesc *allocatedDescs = NULL;
246
247	/*
248	* Number of temporary files opened during the current session;
249	* this is used in generation of tempfile names.
250	*/
251	static long tempFileCounter = `0`;
252
253	/*
254	* Array of OIDs of temp tablespaces. When numTempTableSpaces is -1,
255	* this has not been set in the current transaction.
256	*/
257	static Oid *tempTableSpaces = NULL;
258	static int numTempTableSpaces = -`1`;
259	static int nextTempTableSpace = `0`;
260
261
262	/--------------------*
263	*
264	* Private Routines
265	*
266	* Delete - delete a file from the Lru ring
267	* LruDelete - remove a file from the Lru ring and close its FD
268	* Insert - put a file at the front of the Lru ring
269	* LruInsert - put a file at the front of the Lru ring and open it
270	* ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
271	* ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
272	* AllocateVfd - grab a free (or new) file record (from VfdArray)
273	* FreeVfd - free a file record
274	*
275	* The Least Recently Used ring is a doubly linked list that begins and
276	* ends on element zero. Element zero is special -- it doesn't represent
277	* a file and its "fd" field always == VFD_CLOSED. Element zero is just an
278	* anchor that shows us the beginning/end of the ring.
279	* Only VFD elements that are currently really open (have an FD assigned) are
280	* in the Lru ring. Elements that are "virtually" open can be recognized
281	* by having a non-null fileName field.
282	*
283	* example:
284	*
285	* /--less----\ /---------\
286	* v \ v \
287	* #0 --more---> LeastRecentlyUsed --more-\ \
288	* ^\ \| \|
289	* \\less--> MostRecentlyUsedFile <---/ \|
290	* \more---/ \--less--/
291	*
292	*--------------------
293	*/
294	static void Delete(File file);
295	static void LruDelete(File file);
296	static void Insert(File file);
297	static int LruInsert(File file);
298	static bool ReleaseLruFile(void);
299	static void ReleaseLruFiles(void);
300	static File AllocateVfd(void);
301	static void FreeVfd(File file);
302
303	static int FileAccess(File file);
304	static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
305	static bool reserveAllocatedDesc(void);
306	static int FreeDesc(AllocateDesc *desc);
307
308	static void AtProcExit_Files(int code, Datum arg);
309	static void CleanupTempFiles(bool isCommit, bool isProcExit);
310	static void RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok,
311	bool unlink_all);
312	static void RemovePgTempRelationFiles(const char *tsdirname);
313	static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
314
315	static void walkdir(const char *path,
316	void (action) (const* char fname, bool isdir, int* elevel),
317	bool process_symlinks,
318	int elevel);
319	#ifdef PG_FLUSH_DATA_WORKS
320	static void pre_sync_fname(const char fname, bool isdir, int* elevel);
321	#endif
322	static void datadir_fsync_fname(const char fname, bool isdir, int* elevel);
323	static void unlink_if_exists_fname(const char fname, bool isdir, int* elevel);
324
325	static int fsync_fname_ext(const char fname, bool isdir, bool ignore_perm, int* elevel);
326	static int fsync_parent_path(const char fname, int* elevel);
327
328
/*
 * pg_fsync --- fsync a descriptor, with or without writethrough
 *
 * Dispatches to pg_fsync_writethrough() when sync_method asks for it and
 * the platform offers a distinct writethrough fsync; otherwise uses
 * pg_fsync_no_writethrough().  Returns whatever the chosen routine returns.
 */
int
pg_fsync(int fd)
{
	/* the #if skips the sync_method test where it can never matter */
#if defined(HAVE_FSYNC_WRITETHROUGH) && !defined(FSYNC_WRITETHROUGH_IS_FSYNC)
	if (sync_method == SYNC_METHOD_FSYNC_WRITETHROUGH)
		return pg_fsync_writethrough(fd);
#endif

	return pg_fsync_no_writethrough(fd);
}
343
344
345	/*
346	* pg_fsync_no_writethrough --- same as fsync except does nothing if
347	* enableFsync is off
348	*/
349	int
350	pg_fsync_no_writethrough(int fd)
351	{
352	if (enableFsync)
353	return fsync(fd);
354	else
355	return `0`;
356	}
357
358	/*
359	* pg_fsync_writethrough
360	*/
361	int
362	pg_fsync_writethrough(int fd)
363	{
364	if (enableFsync)
365	{
366	#ifdef WIN32
367	return _commit(fd);
368	#elif defined(F_FULLFSYNC)
369	return (fcntl(fd, F_FULLFSYNC, `0`) == -`1`) ? -`1` : `0`;
370	#else
371	errno = ENOSYS;
372	return -`1`;
373	#endif
374	}
375	else
376	return `0`;
377	}
378
379	/*
380	* pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
381	*
382	* Not all platforms have fdatasync; treat as fsync if not available.
383	*/
384	int
385	pg_fdatasync(int fd)
386	{
387	if (enableFsync)
388	{
389	#ifdef HAVE_FDATASYNC
390	return fdatasync(fd);
391	#else
392	return fsync(fd);
393	#endif
394	}
395	else
396	return `0`;
397	}
398
399	/*
400	* pg_flush_data --- advise OS that the described dirty data should be flushed
401	*
402	* offset of 0 with nbytes 0 means that the entire file should be flushed
403	*/
404	void
405	pg_flush_data(int fd, off_t offset, off_t nbytes)
406	{
407	/*
408	* Right now file flushing is primarily used to avoid making later
409	* fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
410	* if fsyncs are disabled - that's a decision we might want to make
411	* configurable at some point.
412	*/
413	if (!enableFsync)
414	return;
415
416	/*
417	* We compile all alternatives that are supported on the current platform,
418	* to find portability problems more easily.
419	*/
420	#if defined(HAVE_SYNC_FILE_RANGE)
421	{
422	int rc;
423	static bool not_implemented_by_kernel = false;
424
425	if (not_implemented_by_kernel)
426	return;
427
428	/*
429	* sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
430	* tells the OS that writeback for the specified blocks should be
431	* started, but that we don't want to wait for completion. Note that
432	* this call might block if too much dirty data exists in the range.
433	* This is the preferable method on OSs supporting it, as it works
434	* reliably when available (contrast to msync()) and doesn't flush out
435	* clean data (like FADV_DONTNEED).
436	*/
437	rc = sync_file_range(fd, offset, nbytes,
438	SYNC_FILE_RANGE_WRITE);
439	if (rc != `0`)
440	{
441	int elevel;
442
443	/*
444	* For systems that don't have an implementation of
445	* sync_file_range() such as Windows WSL, generate only one
446	* warning and then suppress all further attempts by this process.
447	*/
448	if (errno == ENOSYS)
449	{
450	elevel = WARNING;
451	not_implemented_by_kernel = true;
452	}
453	else
454	elevel = data_sync_elevel(WARNING);
455
456	ereport(elevel,
457	(errcode_for_file_access(),
458	errmsg("could not flush dirty data: %m")));
459	}
460
461	return;
462	}
463	#endif
464	#if !defined(WIN32) && defined(MS_ASYNC)
465	{
466	void *p;
467	static int pagesize = `0`;
468
469	/*
470	* On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
471	* writeback. On linux it only does so if MS_SYNC is specified, but
472	* then it does the writeback synchronously. Luckily all common linux
473	* systems have sync_file_range(). This is preferable over
474	* FADV_DONTNEED because it doesn't flush out clean data.
475	*
476	* We map the file (mmap()), tell the kernel to sync back the contents
477	* (msync()), and then remove the mapping again (munmap()).
478	*/
479
480	/ mmap() needs actual length if we want to map whole file /
481	if (offset == `0` && nbytes == `0`)
482	{
483	nbytes = lseek(fd, `0`, SEEK_END);
484	if (nbytes < `0`)
485	{
486	ereport(WARNING,
487	(errcode_for_file_access(),
488	errmsg("could not determine dirty data size: %m")));
489	return;
490	}
491	}
492
493	/*
494	* Some platforms reject partial-page mmap() attempts. To deal with
495	* that, just truncate the request to a page boundary. If any extra
496	* bytes don't get flushed, well, it's only a hint anyway.
497	*/
498
499	/ fetch pagesize only once /
500	if (pagesize == `0`)
501	pagesize = sysconf(_SC_PAGESIZE);
502
503	/ align length to pagesize, dropping any fractional page /
504	if (pagesize > `0`)
505	nbytes = (nbytes / pagesize) * pagesize;
506
507	/ fractional-page request is a no-op /
508	if (nbytes <= `0`)
509	return;
510
511	/*
512	* mmap could well fail, particularly on 32-bit platforms where there
513	* may simply not be enough address space. If so, silently fall
514	* through to the next implementation.
515	*/
516	if (nbytes <= (off_t) SSIZE_MAX)
517	p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
518	else
519	p = MAP_FAILED;
520
521	if (p != MAP_FAILED)
522	{
523	int rc;
524
525	rc = msync(p, (size_t) nbytes, MS_ASYNC);
526	if (rc != `0`)
527	{
528	ereport(data_sync_elevel(WARNING),
529	(errcode_for_file_access(),
530	errmsg("could not flush dirty data: %m")));
531	/ NB: need to fall through to munmap()! /
532	}
533
534	rc = munmap(p, (size_t) nbytes);
535	if (rc != `0`)
536	{
537	/ FATAL error because mapping would remain /
538	ereport(FATAL,
539	(errcode_for_file_access(),
540	errmsg("could not munmap() while flushing data: %m")));
541	}
542
543	return;
544	}
545	}
546	#endif
547	#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
548	{
549	int rc;
550
551	/*
552	* Signal the kernel that the passed in range should not be cached
553	* anymore. This has the, desired, side effect of writing out dirty
554	* data, and the, undesired, side effect of likely discarding useful
555	* clean cached blocks. For the latter reason this is the least
556	* preferable method.
557	*/
558
559	rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
560
561	if (rc != `0`)
562	{
563	/ don't error out, this is just a performance optimization /
564	ereport(WARNING,
565	(errcode_for_file_access(),
566	errmsg("could not flush dirty data: %m")));
567	}
568
569	return;
570	}
571	#endif
572	}
573
574
575	/*
576	* fsync_fname -- fsync a file or directory, handling errors properly
577	*
578	* Try to fsync a file or directory. When doing the latter, ignore errors that
579	* indicate the OS just doesn't allow/require fsyncing directories.
580	*/
581	void
582	fsync_fname(const char *fname, bool isdir)
583	{
584	fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
585	}
586
587	/*
588	* durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
589	*
590	* This routine ensures that, after returning, the effect of renaming file
591	* persists in case of a crash. A crash while this routine is running will
592	* leave you with either the pre-existing or the moved file in place of the
593	* new file; no mixed state or truncated files are possible.
594	*
595	* It does so by using fsync on the old filename and the possibly existing
596	* target filename before the rename, and the target file and directory after.
597	*
598	* Note that rename() cannot be used across arbitrary directories, as they
599	* might not be on the same filesystem. Therefore this routine does not
600	* support renaming across directories.
601	*
602	* Log errors with the caller specified severity.
603	*
604	* Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
605	* valid upon return.
606	*/
607	int
608	durable_rename(const char oldfile, const* char newfile, int* elevel)
609	{
610	int fd;
611
612	/*
613	* First fsync the old and target path (if it exists), to ensure that they
614	* are properly persistent on disk. Syncing the target file is not
615	* strictly necessary, but it makes it easier to reason about crashes;
616	* because it's then guaranteed that either source or target file exists
617	* after a crash.
618	*/
619	if (fsync_fname_ext(oldfile, false, false, elevel) != `0`)
620	return -`1`;
621
622	fd = OpenTransientFile(newfile, PG_BINARY \| O_RDWR);
623	if (fd < `0`)
624	{
625	if (errno != ENOENT)
626	{
627	ereport(elevel,
628	(errcode_for_file_access(),
629	errmsg("could not open file \"%s\": %m", newfile)));
630	return -`1`;
631	}
632	}
633	else
634	{
635	if (pg_fsync(fd) != `0`)
636	{
637	int save_errno;
638
639	/ close file upon error, might not be in transaction context /
640	save_errno = errno;
641	CloseTransientFile(fd);
642	errno = save_errno;
643
644	ereport(elevel,
645	(errcode_for_file_access(),
646	errmsg("could not fsync file \"%s\": %m", newfile)));
647	return -`1`;
648	}
649
650	if (CloseTransientFile(fd))
651	{
652	ereport(elevel,
653	(errcode_for_file_access(),
654	errmsg("could not close file \"%s\": %m", newfile)));
655	return -`1`;
656	}
657	}
658
659	/ Time to do the real deal... /
660	if (rename(oldfile, newfile) < `0`)
661	{
662	ereport(elevel,
663	(errcode_for_file_access(),
664	errmsg("could not rename file \"%s\" to \"%s\": %m",
665	oldfile, newfile)));
666	return -`1`;
667	}
668
669	/*
670	* To guarantee renaming the file is persistent, fsync the file with its
671	* new name, and its containing directory.
672	*/
673	if (fsync_fname_ext(newfile, false, false, elevel) != `0`)
674	return -`1`;
675
676	if (fsync_parent_path(newfile, elevel) != `0`)
677	return -`1`;
678
679	return `0`;
680	}
681
/*
 * durable_unlink -- remove a file in a durable manner
 *
 * This routine ensures that, after returning, the effect of removing file
 * persists in case of a crash.  A crash while this routine is running will
 * leave the system in no mixed state.
 *
 * It does so by using fsync on the parent directory of the file after the
 * actual removal is done.
 *
 * fname:	path of the file to remove
 * elevel:	severity used to log any failure
 *
 * Returns 0 if the operation succeeded, -1 otherwise.  Note that errno is not
 * valid upon return.
 */
int
durable_unlink(const char *fname, int elevel)
{
	if (unlink(fname) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not remove file \"%s\": %m",
						fname)));
		return -1;
	}

	/*
	 * To guarantee that the removal of the file is persistent, fsync its
	 * parent directory.
	 */
	if (fsync_parent_path(fname, elevel) != 0)
		return -1;

	return 0;
}
718
/*
 * durable_link_or_rename -- rename a file in a durable manner.
 *
 * Similar to durable_rename(), except that this routine tries (but does not
 * guarantee) not to overwrite the target file.
 *
 * Note that a crash in an unfortunate moment can leave you with two links to
 * the target file.
 *
 * oldfile: existing path
 * newfile: destination path
 * elevel:	severity used to log any failure
 *
 * Returns 0 if the operation succeeded, -1 otherwise.  Note that errno is not
 * valid upon return.
 */
int
durable_link_or_rename(const char *oldfile, const char *newfile, int elevel)
{
	/*
	 * Ensure that, if we crash directly after the rename/link, a file with
	 * valid contents is moved into place.
	 */
	if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
		return -1;

#if HAVE_WORKING_LINK
	if (link(oldfile, newfile) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not link file \"%s\" to \"%s\": %m",
						oldfile, newfile)));
		return -1;
	}
	/* the result of removing the old name is not checked */
	unlink(oldfile);
#else
	/* XXX: Add racy file existence check? */
	if (rename(oldfile, newfile) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not rename file \"%s\" to \"%s\": %m",
						oldfile, newfile)));
		return -1;
	}
#endif

	/*
	 * Make change persistent in case of an OS crash, both the new entry and
	 * its parent directory need to be flushed.
	 */
	if (fsync_fname_ext(newfile, false, false, elevel) != 0)
		return -1;

	/* Same for parent directory */
	if (fsync_parent_path(newfile, elevel) != 0)
		return -1;

	return 0;
}
778
779	/*
780	* InitFileAccess --- initialize this module during backend startup
781	*
782	* This is called during either normal or standalone backend start.
783	* It is not called in the postmaster.
784	*/
785	void
786	InitFileAccess(void)
787	{
788	Assert(SizeVfdCache == `0`); / call me only once /
789
790	/ initialize cache header entry /
791	VfdCache = (Vfd ) malloc(sizeof*(Vfd));
792	if (VfdCache == NULL)
793	ereport(FATAL,
794	(errcode(ERRCODE_OUT_OF_MEMORY),
795	errmsg("out of memory")));
796
797	MemSet((char ) &(VfdCache[`0`]), `0`, sizeof*(Vfd));
798	VfdCache->fd = VFD_CLOSED;
799
800	SizeVfdCache = `1`;
801
802	/ register proc-exit hook to ensure temp files are dropped at exit /
803	on_proc_exit(AtProcExit_Files, `0`);
804	}
805
806	/*
807	* count_usable_fds --- count how many FDs the system will let us open,
808	* and estimate how many are already open.
809	*
810	* We stop counting if usable_fds reaches max_to_probe. Note: a small
811	* value of max_to_probe might result in an underestimate of already_open;
812	* we must fill in any "gaps" in the set of used FDs before the calculation
813	* of already_open will give the right answer. In practice, max_to_probe
814	* of a couple of dozen should be enough to ensure good results.
815	*
816	* We assume stdin (FD 0) is available for dup'ing
817	*/
818	static void
819	count_usable_fds(int max_to_probe, int usable_fds, int* *already_open)
820	{
821	int *fd;
822	int size;
823	int used = `0`;
824	int highestfd = `0`;
825	int j;
826
827	#ifdef HAVE_GETRLIMIT
828	struct rlimit rlim;
829	int getrlimit_status;
830	#endif
831
832	size = `1024`;
833	fd = (int ) palloc(size sizeof(int));
834
835	#ifdef HAVE_GETRLIMIT
836	#ifdef RLIMIT_NOFILE /* most platforms use RLIMIT_NOFILE */
837	getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
838	#else /* but BSD doesn't ... */
839	getrlimit_status = getrlimit(RLIMIT_OFILE, &rlim);
840	#endif /* RLIMIT_NOFILE */
841	if (getrlimit_status != `0`)
842	ereport(WARNING, (errmsg("getrlimit failed: %m")));
843	#endif /* HAVE_GETRLIMIT */
844
845	/ dup until failure or probe limit reached /
846	for (;;)
847	{
848	int thisfd;
849
850	#ifdef HAVE_GETRLIMIT
851
852	/*
853	* don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
854	* some platforms
855	*/
856	if (getrlimit_status == `0` && highestfd >= rlim.rlim_cur - `1`)
857	break;
858	#endif
859
860	thisfd = dup(`0`);
861	if (thisfd < `0`)
862	{
863	/ Expect EMFILE or ENFILE, else it's fishy /
864	if (errno != EMFILE && errno != ENFILE)
865	elog(WARNING, "dup(0) failed after %d successes: %m", used);
866	break;
867	}
868
869	if (used >= size)
870	{
871	size *= `2`;
872	fd = (int ) repalloc(fd, size sizeof(int));
873	}
874	fd[used++] = thisfd;
875
876	if (highestfd < thisfd)
877	highestfd = thisfd;
878
879	if (used >= max_to_probe)
880	break;
881	}
882
883	/ release the files we opened /
884	for (j = `0`; j < used; j++)
885	close(fd[j]);
886
887	pfree(fd);
888
889	/*
890	* Return results. usable_fds is just the number of successful dups. We
891	* assume that the system limit is highestfd+1 (remember 0 is a legal FD
892	* number) and so already_open is highestfd+1 - usable_fds.
893	*/
894	*usable_fds = used;
895	*already_open = highestfd + `1` - used;
896	}
897
898	/*
899	* set_max_safe_fds
900	* Determine number of filedescriptors that fd.c is allowed to use
901	*/
902	void
903	set_max_safe_fds(void)
904	{
905	int usable_fds;
906	int already_open;
907
908	/----------*
909	* We want to set max_safe_fds to
910	* MIN(usable_fds, max_files_per_process - already_open)
911	* less the slop factor for files that are opened without consulting
912	* fd.c. This ensures that we won't exceed either max_files_per_process
913	* or the experimentally-determined EMFILE limit.
914	*----------
915	*/
916	count_usable_fds(max_files_per_process,
917	&usable_fds, &already_open);
918
919	max_safe_fds = Min(usable_fds, max_files_per_process - already_open);
920
921	/*
922	* Take off the FDs reserved for system() etc.
923	*/
924	max_safe_fds -= NUM_RESERVED_FDS;
925
926	/*
927	* Make sure we still have enough to get by.
928	*/
929	if (max_safe_fds < FD_MINFREE)
930	ereport(FATAL,
931	(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
932	errmsg("insufficient file descriptors available to start server process"),
933	errdetail("System allows %d, we need at least %d.",
934	max_safe_fds + NUM_RESERVED_FDS,
935	FD_MINFREE + NUM_RESERVED_FDS)));
936
937	elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
938	max_safe_fds, usable_fds, already_open);
939	}
940
941	/*
942	* Open a file with BasicOpenFilePerm() and pass default file mode for the
943	* fileMode parameter.
944	*/
945	int
946	BasicOpenFile(const char fileName, int* fileFlags)
947	{
948	return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
949	}
950
951	/*
952	* BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
953	*
954	* This is exported for use by places that really want a plain kernel FD,
955	* but need to be proof against running out of FDs. Once an FD has been
956	* successfully returned, it is the caller's responsibility to ensure that
957	* it will not be leaked on ereport()! Most users should not call this
958	* routine directly, but instead use the VFD abstraction level, which
959	* provides protection against descriptor leaks as well as management of
960	* files that need to be open for more than a short period of time.
961	*
962	* Ideally this should be the only direct call of open() in the backend.
963	* In practice, the postmaster calls open() directly, and there are some
964	* direct open() calls done early in backend startup. Those are OK since
965	* this module wouldn't have any open files to close at that point anyway.
966	*/
967	int
968	BasicOpenFilePerm(const char fileName, int* fileFlags, mode_t fileMode)
969	{
970	int fd;
971
972	tryAgain:
973	fd = open(fileName, fileFlags, fileMode);
974
975	if (fd >= `0`)
976	return fd; / success! /
977
978	if (errno == EMFILE \|\| errno == ENFILE)
979	{
980	int save_errno = errno;
981
982	ereport(LOG,
983	(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
984	errmsg("out of file descriptors: %m; release and retry")));
985	errno = `0`;
986	if (ReleaseLruFile())
987	goto tryAgain;
988	errno = save_errno;
989	}
990
991	return -`1`; / failure /
992	}
993
#if defined(FDDEBUG)

/*
 * _dump_lru --- log the LRU ring contents (debug builds only)
 *
 * Starts at the header's lruLessRecently link and follows lruLessRecently
 * links until it is back at element 0, then emits the collected indexes as
 * a single LOG line.
 */
static void
_dump_lru(void)
{
	int			cur = VfdCache[0].lruLessRecently;
	char		buf[2048];

	snprintf(buf, sizeof(buf), "LRU: MOST %d ", cur);
	while (cur != 0)
	{
		size_t		len;

		cur = VfdCache[cur].lruLessRecently;
		len = strlen(buf);
		snprintf(buf + len, sizeof(buf) - len, "%d ", cur);
	}
	snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "LEAST");
	elog(LOG, "%s", buf);
}
#endif							/* FDDEBUG */
1014
1015	static void
1016	Delete(File file)
1017	{
1018	Vfd *vfdP;
1019
1020	Assert(file != `0`);
1021
1022	DO_DB(elog(LOG, "Delete %d (%s)",
1023	file, VfdCache[file].fileName));
1024	DO_DB(_dump_lru());
1025
1026	vfdP = &VfdCache[file];
1027
1028	VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
1029	VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
1030
1031	DO_DB(_dump_lru());
1032	}
1033
1034	static void
1035	LruDelete(File file)
1036	{
1037	Vfd *vfdP;
1038
1039	Assert(file != `0`);
1040
1041	DO_DB(elog(LOG, "LruDelete %d (%s)",
1042	file, VfdCache[file].fileName));
1043
1044	vfdP = &VfdCache[file];
1045
1046	/*
1047	* Close the file. We aren't expecting this to fail; if it does, better
1048	* to leak the FD than to mess up our internal state.
1049	*/
1050	if (close(vfdP->fd))
1051	elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
1052	"could not close file \"%s\": %m", vfdP->fileName);
1053	vfdP->fd = VFD_CLOSED;
1054	--nfile;
1055
1056	/ delete the vfd record from the LRU ring /
1057	Delete(file);
1058	}
1059
1060	static void
1061	Insert(File file)
1062	{
1063	Vfd *vfdP;
1064
1065	Assert(file != `0`);
1066
1067	DO_DB(elog(LOG, "Insert %d (%s)",
1068	file, VfdCache[file].fileName));
1069	DO_DB(_dump_lru());
1070
1071	vfdP = &VfdCache[file];
1072
1073	vfdP->lruMoreRecently = `0`;
1074	vfdP->lruLessRecently = VfdCache[`0`].lruLessRecently;
1075	VfdCache[`0`].lruLessRecently = file;
1076	VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
1077
1078	DO_DB(_dump_lru());
1079	}
1080
1081	/ returns 0 on success, -1 on re-open failure (with errno set) /
1082	static int
1083	LruInsert(File file)
1084	{
1085	Vfd *vfdP;
1086
1087	Assert(file != `0`);
1088
1089	DO_DB(elog(LOG, "LruInsert %d (%s)",
1090	file, VfdCache[file].fileName));
1091
1092	vfdP = &VfdCache[file];
1093
1094	if (FileIsNotOpen(file))
1095	{
1096	/ Close excess kernel FDs. /
1097	ReleaseLruFiles();
1098
1099	/*
1100	* The open could still fail for lack of file descriptors, eg due to
1101	* overall system file table being full. So, be prepared to release
1102	* another FD if necessary...
1103	*/
1104	vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
1105	vfdP->fileMode);
1106	if (vfdP->fd < `0`)
1107	{
1108	DO_DB(elog(LOG, "re-open failed: %m"));
1109	return -`1`;
1110	}
1111	else
1112	{
1113	++nfile;
1114	}
1115	}
1116
1117	/*
1118	* put it at the head of the Lru ring
1119	*/
1120
1121	Insert(file);
1122
1123	return `0`;
1124	}
1125
1126	/*
1127	* Release one kernel FD by closing the least-recently-used VFD.
1128	*/
1129	static bool
1130	ReleaseLruFile(void)
1131	{
1132	DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
1133
1134	if (nfile > `0`)
1135	{
1136	/*
1137	* There are opened files and so there should be at least one used vfd
1138	* in the ring.
1139	*/
1140	Assert(VfdCache[`0`].lruMoreRecently != `0`);
1141	LruDelete(VfdCache[`0`].lruMoreRecently);
1142	return true; / freed a file /
1143	}
1144	return false; / no files available to free /
1145	}
1146
1147	/*
1148	* Release kernel FDs as needed to get under the max_safe_fds limit.
1149	* After calling this, it's OK to try to open another file.
1150	*/
1151	static void
1152	ReleaseLruFiles(void)
1153	{
1154	while (nfile + numAllocatedDescs >= max_safe_fds)
1155	{
1156	if (!ReleaseLruFile())
1157	break;
1158	}
1159	}
1160
1161	static File
1162	AllocateVfd(void)
1163	{
1164	Index i;
1165	File file;
1166
1167	DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
1168
1169	Assert(SizeVfdCache > `0`); / InitFileAccess not called? /
1170
1171	if (VfdCache[`0`].nextFree == `0`)
1172	{
1173	/*
1174	* The free list is empty so it is time to increase the size of the
1175	* array. We choose to double it each time this happens. However,
1176	* there's not much point in starting real small.
1177	*/
1178	Size newCacheSize = SizeVfdCache * `2`;
1179	Vfd *newVfdCache;
1180
1181	if (newCacheSize < `32`)
1182	newCacheSize = `32`;
1183
1184	/*
1185	* Be careful not to clobber VfdCache ptr if realloc fails.
1186	*/
1187	newVfdCache = (Vfd ) realloc(VfdCache, sizeof(Vfd) newCacheSize);
1188	if (newVfdCache == NULL)
1189	ereport(ERROR,
1190	(errcode(ERRCODE_OUT_OF_MEMORY),
1191	errmsg("out of memory")));
1192	VfdCache = newVfdCache;
1193
1194	/*
1195	* Initialize the new entries and link them into the free list.
1196	*/
1197	for (i = SizeVfdCache; i < newCacheSize; i++)
1198	{
1199	MemSet((char ) &(VfdCache[i]), `0`, sizeof*(Vfd));
1200	VfdCache[i].nextFree = i + `1`;
1201	VfdCache[i].fd = VFD_CLOSED;
1202	}
1203	VfdCache[newCacheSize - `1`].nextFree = `0`;
1204	VfdCache[`0`].nextFree = SizeVfdCache;
1205
1206	/*
1207	* Record the new size
1208	*/
1209	SizeVfdCache = newCacheSize;
1210	}
1211
1212	file = VfdCache[`0`].nextFree;
1213
1214	VfdCache[`0`].nextFree = VfdCache[file].nextFree;
1215
1216	return file;
1217	}
1218
1219	static void
1220	FreeVfd(File file)
1221	{
1222	Vfd *vfdP = &VfdCache[file];
1223
1224	DO_DB(elog(LOG, "FreeVfd: %d (%s)",
1225	file, vfdP->fileName ? vfdP->fileName : ""));
1226
1227	if (vfdP->fileName != NULL)
1228	{
1229	free(vfdP->fileName);
1230	vfdP->fileName = NULL;
1231	}
1232	vfdP->fdstate = `0x0`;
1233
1234	vfdP->nextFree = VfdCache[`0`].nextFree;
1235	VfdCache[`0`].nextFree = file;
1236	}
1237
1238	/ returns 0 on success, -1 on re-open failure (with errno set) /
1239	static int
1240	FileAccess(File file)
1241	{
1242	int returnValue;
1243
1244	DO_DB(elog(LOG, "FileAccess %d (%s)",
1245	file, VfdCache[file].fileName));
1246
1247	/*
1248	* Is the file open? If not, open it and put it at the head of the LRU
1249	* ring (possibly closing the least recently used file to get an FD).
1250	*/
1251
1252	if (FileIsNotOpen(file))
1253	{
1254	returnValue = LruInsert(file);
1255	if (returnValue != `0`)
1256	return returnValue;
1257	}
1258	else if (VfdCache[`0`].lruLessRecently != file)
1259	{
1260	/*
1261	* We now know that the file is open and that it is not the last one
1262	* accessed, so we need to move it to the head of the Lru ring.
1263	*/
1264
1265	Delete(file);
1266	Insert(file);
1267	}
1268
1269	return `0`;
1270	}
1271
1272	/*
1273	* Called whenever a temporary file is deleted to report its size.
1274	*/
1275	static void
1276	ReportTemporaryFileUsage(const char *path, off_t size)
1277	{
1278	pgstat_report_tempfile(size);
1279
1280	if (log_temp_files >= `0`)
1281	{
1282	if ((size / `1024`) >= log_temp_files)
1283	ereport(LOG,
1284	(errmsg("temporary file: path \"%s\", size %lu",
1285	path, (unsigned long) size)));
1286	}
1287	}
1288
1289	/*
1290	* Called to register a temporary file for automatic close.
1291	* ResourceOwnerEnlargeFiles(CurrentResourceOwner) must have been called
1292	* before the file was opened.
1293	*/
1294	static void
1295	RegisterTemporaryFile(File file)
1296	{
1297	ResourceOwnerRememberFile(CurrentResourceOwner, file);
1298	VfdCache[file].resowner = CurrentResourceOwner;
1299
1300	/ Backup mechanism for closing at end of xact. /
1301	VfdCache[file].fdstate \|= FD_CLOSE_AT_EOXACT;
1302	have_xact_temporary_files = true;
1303	}
1304
1305	/*
1306	* Called when we get a shared invalidation message on some relation.
1307	*/
1308	#ifdef NOT_USED
1309	void
1310	FileInvalidate(File file)
1311	{
1312	Assert(FileIsValid(file));
1313	if (!FileIsNotOpen(file))
1314	LruDelete(file);
1315	}
1316	#endif
1317
1318	/*
1319	* Open a file with PathNameOpenFilePerm() and pass default file mode for the
1320	* fileMode parameter.
1321	*/
1322	File
1323	PathNameOpenFile(const char fileName, int* fileFlags)
1324	{
1325	return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1326	}
1327
1328	/*
1329	* open a file in an arbitrary directory
1330	*
1331	* NB: if the passed pathname is relative (which it usually is),
1332	* it will be interpreted relative to the process' working directory
1333	* (which should always be $PGDATA when this code is running).
1334	*/
1335	File
1336	PathNameOpenFilePerm(const char fileName, int* fileFlags, mode_t fileMode)
1337	{
1338	char *fnamecopy;
1339	File file;
1340	Vfd *vfdP;
1341
1342	DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
1343	fileName, fileFlags, fileMode));
1344
1345	/*
1346	* We need a malloc'd copy of the file name; fail cleanly if no room.
1347	*/
1348	fnamecopy = strdup(fileName);
1349	if (fnamecopy == NULL)
1350	ereport(ERROR,
1351	(errcode(ERRCODE_OUT_OF_MEMORY),
1352	errmsg("out of memory")));
1353
1354	file = AllocateVfd();
1355	vfdP = &VfdCache[file];
1356
1357	/ Close excess kernel FDs. /
1358	ReleaseLruFiles();
1359
1360	vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
1361
1362	if (vfdP->fd < `0`)
1363	{
1364	int save_errno = errno;
1365
1366	FreeVfd(file);
1367	free(fnamecopy);
1368	errno = save_errno;
1369	return -`1`;
1370	}
1371	++nfile;
1372	DO_DB(elog(LOG, "PathNameOpenFile: success %d",
1373	vfdP->fd));
1374
1375	Insert(file);
1376
1377	vfdP->fileName = fnamecopy;
1378	/ Saved flags are adjusted to be OK for re-opening file /
1379	vfdP->fileFlags = fileFlags & ~(O_CREAT \| O_TRUNC \| O_EXCL);
1380	vfdP->fileMode = fileMode;
1381	vfdP->fileSize = `0`;
1382	vfdP->fdstate = `0x0`;
1383	vfdP->resowner = NULL;
1384
1385	return file;
1386	}
1387
1388	/*
1389	* Create directory 'directory'. If necessary, create 'basedir', which must
1390	* be the directory above it. This is designed for creating the top-level
1391	* temporary directory on demand before creating a directory underneath it.
1392	* Do nothing if the directory already exists.
1393	*
1394	* Directories created within the top-level temporary directory should begin
1395	* with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
1396	* deleted at startup by RemovePgTempFiles(). Further subdirectories below
1397	* that do not need any particular prefix.
1398	*/
1399	void
1400	PathNameCreateTemporaryDir(const char basedir, const* char *directory)
1401	{
1402	if (MakePGDirectory(directory) < `0`)
1403	{
1404	if (errno == EEXIST)
1405	return;
1406
1407	/*
1408	* Failed. Try to create basedir first in case it's missing. Tolerate
1409	* EEXIST to close a race against another process following the same
1410	* algorithm.
1411	*/
1412	if (MakePGDirectory(basedir) < `0` && errno != EEXIST)
1413	ereport(ERROR,
1414	(errcode_for_file_access(),
1415	errmsg("cannot create temporary directory \"%s\": %m",
1416	basedir)));
1417
1418	/ Try again. /
1419	if (MakePGDirectory(directory) < `0` && errno != EEXIST)
1420	ereport(ERROR,
1421	(errcode_for_file_access(),
1422	errmsg("cannot create temporary subdirectory \"%s\": %m",
1423	directory)));
1424	}
1425	}
1426
1427	/*
1428	* Delete a directory and everything in it, if it exists.
1429	*/
1430	void
1431	PathNameDeleteTemporaryDir(const char *dirname)
1432	{
1433	struct stat statbuf;
1434
1435	/ Silently ignore missing directory. /
1436	if (stat(dirname, &statbuf) != `0` && errno == ENOENT)
1437	return;
1438
1439	/*
1440	* Currently, walkdir doesn't offer a way for our passed in function to
1441	* maintain state. Perhaps it should, so that we could tell the caller
1442	* whether this operation succeeded or failed. Since this operation is
1443	* used in a cleanup path, we wouldn't actually behave differently: we'll
1444	* just log failures.
1445	*/
1446	walkdir(dirname, unlink_if_exists_fname, false, LOG);
1447	}
1448
1449	/*
1450	* Open a temporary file that will disappear when we close it.
1451	*
1452	* This routine takes care of generating an appropriate tempfile name.
1453	* There's no need to pass in fileFlags or fileMode either, since only
1454	* one setting makes any sense for a temp file.
1455	*
1456	* Unless interXact is true, the file is remembered by CurrentResourceOwner
1457	* to ensure it's closed and deleted when it's no longer needed, typically at
1458	* the end-of-transaction. In most cases, you don't want temporary files to
1459	* outlive the transaction that created them, so this should be false -- but
1460	* if you need "somewhat" temporary storage, this might be useful. In either
1461	* case, the file is removed when the File is explicitly closed.
1462	*/
1463	File
1464	OpenTemporaryFile(bool interXact)
1465	{
1466	File file = `0`;
1467
1468	/*
1469	* Make sure the current resource owner has space for this File before we
1470	* open it, if we'll be registering it below.
1471	*/
1472	if (!interXact)
1473	ResourceOwnerEnlargeFiles(CurrentResourceOwner);
1474
1475	/*
1476	* If some temp tablespace(s) have been given to us, try to use the next
1477	* one. If a given tablespace can't be found, we silently fall back to
1478	* the database's default tablespace.
1479	*
1480	* BUT: if the temp file is slated to outlive the current transaction,
1481	* force it into the database's default tablespace, so that it will not
1482	* pose a threat to possible tablespace drop attempts.
1483	*/
1484	if (numTempTableSpaces > `0` && !interXact)
1485	{
1486	Oid tblspcOid = GetNextTempTableSpace();
1487
1488	if (OidIsValid(tblspcOid))
1489	file = OpenTemporaryFileInTablespace(tblspcOid, false);
1490	}
1491
1492	/*
1493	* If not, or if tablespace is bad, create in database's default
1494	* tablespace. MyDatabaseTableSpace should normally be set before we get
1495	* here, but just in case it isn't, fall back to pg_default tablespace.
1496	*/
1497	if (file <= `0`)
1498	file = OpenTemporaryFileInTablespace(MyDatabaseTableSpace ?
1499	MyDatabaseTableSpace :
1500	DEFAULTTABLESPACE_OID,
1501	true);
1502
1503	/ Mark it for deletion at close and temporary file size limit /
1504	VfdCache[file].fdstate \|= FD_DELETE_AT_CLOSE \| FD_TEMP_FILE_LIMIT;
1505
1506	/ Register it with the current resource owner /
1507	if (!interXact)
1508	RegisterTemporaryFile(file);
1509
1510	return file;
1511	}
1512
1513	/*
1514	* Return the path of the temp directory in a given tablespace.
1515	*/
1516	void
1517	TempTablespacePath(char *path, Oid tablespace)
1518	{
1519	/*
1520	* Identify the tempfile directory for this tablespace.
1521	*
1522	* If someone tries to specify pg_global, use pg_default instead.
1523	*/
1524	if (tablespace == InvalidOid \|\|
1525	tablespace == DEFAULTTABLESPACE_OID \|\|
1526	tablespace == GLOBALTABLESPACE_OID)
1527	snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
1528	else
1529	{
1530	/ All other tablespaces are accessed via symlinks /
1531	snprintf(path, MAXPGPATH, "pg_tblspc/%u/%s/%s",
1532	tablespace, TABLESPACE_VERSION_DIRECTORY,
1533	PG_TEMP_FILES_DIR);
1534	}
1535	}
1536
1537	/*
1538	* Open a temporary file in a specific tablespace.
1539	* Subroutine for OpenTemporaryFile, which see for details.
1540	*/
1541	static File
1542	OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
1543	{
1544	char tempdirpath[MAXPGPATH];
1545	char tempfilepath[MAXPGPATH];
1546	File file;
1547
1548	TempTablespacePath(tempdirpath, tblspcOid);
1549
1550	/*
1551	* Generate a tempfile name that should be unique within the current
1552	* database instance.
1553	*/
1554	snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
1555	tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
1556
1557	/*
1558	* Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1559	* temp file that can be reused.
1560	*/
1561	file = PathNameOpenFile(tempfilepath,
1562	O_RDWR \| O_CREAT \| O_TRUNC \| PG_BINARY);
1563	if (file <= `0`)
1564	{
1565	/*
1566	* We might need to create the tablespace's tempfile directory, if no
1567	* one has yet done so.
1568	*
1569	* Don't check for an error from MakePGDirectory; it could fail if
1570	* someone else just did the same thing. If it doesn't work then
1571	* we'll bomb out on the second create attempt, instead.
1572	*/
1573	(void) MakePGDirectory(tempdirpath);
1574
1575	file = PathNameOpenFile(tempfilepath,
1576	O_RDWR \| O_CREAT \| O_TRUNC \| PG_BINARY);
1577	if (file <= `0` && rejectError)
1578	elog(ERROR, "could not create temporary file \"%s\": %m",
1579	tempfilepath);
1580	}
1581
1582	return file;
1583	}
1584
1585
1586	/*
1587	* Create a new file. The directory containing it must already exist. Files
1588	* created this way are subject to temp_file_limit and are automatically
1589	* closed at end of transaction, but are not automatically deleted on close
1590	* because they are intended to be shared between cooperating backends.
1591	*
1592	* If the file is inside the top-level temporary directory, its name should
1593	* begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
1594	* and deleted at startup by RemovePgTempFiles(). Alternatively, it can be
1595	* inside a directory created with PathNameCreateTemporaryDir(), in which case
1596	* the prefix isn't needed.
1597	*/
1598	File
1599	PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
1600	{
1601	File file;
1602
1603	ResourceOwnerEnlargeFiles(CurrentResourceOwner);
1604
1605	/*
1606	* Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1607	* temp file that can be reused.
1608	*/
1609	file = PathNameOpenFile(path, O_RDWR \| O_CREAT \| O_TRUNC \| PG_BINARY);
1610	if (file <= `0`)
1611	{
1612	if (error_on_failure)
1613	ereport(ERROR,
1614	(errcode_for_file_access(),
1615	errmsg("could not create temporary file \"%s\": %m",
1616	path)));
1617	else
1618	return file;
1619	}
1620
1621	/ Mark it for temp_file_limit accounting. /
1622	VfdCache[file].fdstate \|= FD_TEMP_FILE_LIMIT;
1623
1624	/ Register it for automatic close. /
1625	RegisterTemporaryFile(file);
1626
1627	return file;
1628	}
1629
1630	/*
1631	* Open a file that was created with PathNameCreateTemporaryFile, possibly in
1632	* another backend. Files opened this way don't count against the
1633	* temp_file_limit of the caller, are read-only and are automatically closed
1634	* at the end of the transaction but are not deleted on close.
1635	*/
1636	File
1637	PathNameOpenTemporaryFile(const char *path)
1638	{
1639	File file;
1640
1641	ResourceOwnerEnlargeFiles(CurrentResourceOwner);
1642
1643	/ We open the file read-only. /
1644	file = PathNameOpenFile(path, O_RDONLY \| PG_BINARY);
1645
1646	/ If no such file, then we don't raise an error. /
1647	if (file <= `0` && errno != ENOENT)
1648	ereport(ERROR,
1649	(errcode_for_file_access(),
1650	errmsg("could not open temporary file \"%s\": %m",
1651	path)));
1652
1653	if (file > `0`)
1654	{
1655	/ Register it for automatic close. /
1656	RegisterTemporaryFile(file);
1657	}
1658
1659	return file;
1660	}
1661
1662	/*
1663	* Delete a file by pathname. Return true if the file existed, false if it
1664	* didn't.
1665	*/
1666	bool
1667	PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
1668	{
1669	struct stat filestats;
1670	int stat_errno;
1671
1672	/ Get the final size for pgstat reporting. /
1673	if (stat(path, &filestats) != `0`)
1674	stat_errno = errno;
1675	else
1676	stat_errno = `0`;
1677
1678	/*
1679	* Unlike FileClose's automatic file deletion code, we tolerate
1680	* non-existence to support BufFileDeleteShared which doesn't know how
1681	* many segments it has to delete until it runs out.
1682	*/
1683	if (stat_errno == ENOENT)
1684	return false;
1685
1686	if (unlink(path) < `0`)
1687	{
1688	if (errno != ENOENT)
1689	ereport(error_on_failure ? ERROR : LOG,
1690	(errcode_for_file_access(),
1691	errmsg("could not unlink temporary file \"%s\": %m",
1692	path)));
1693	return false;
1694	}
1695
1696	if (stat_errno == `0`)
1697	ReportTemporaryFileUsage(path, filestats.st_size);
1698	else
1699	{
1700	errno = stat_errno;
1701	ereport(LOG,
1702	(errcode_for_file_access(),
1703	errmsg("could not stat file \"%s\": %m", path)));
1704	}
1705
1706	return true;
1707	}
1708
1709	/*
1710	* close a file when done with it
1711	*/
1712	void
1713	FileClose(File file)
1714	{
1715	Vfd *vfdP;
1716
1717	Assert(FileIsValid(file));
1718
1719	DO_DB(elog(LOG, "FileClose: %d (%s)",
1720	file, VfdCache[file].fileName));
1721
1722	vfdP = &VfdCache[file];
1723
1724	if (!FileIsNotOpen(file))
1725	{
1726	/ close the file /
1727	if (close(vfdP->fd))
1728	{
1729	/*
1730	* We may need to panic on failure to close non-temporary files;
1731	* see LruDelete.
1732	*/
1733	elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
1734	"could not close file \"%s\": %m", vfdP->fileName);
1735	}
1736
1737	--nfile;
1738	vfdP->fd = VFD_CLOSED;
1739
1740	/ remove the file from the lru ring /
1741	Delete(file);
1742	}
1743
1744	if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
1745	{
1746	/ Subtract its size from current usage (do first in case of error) /
1747	temporary_files_size -= vfdP->fileSize;
1748	vfdP->fileSize = `0`;
1749	}
1750
1751	/*
1752	* Delete the file if it was temporary, and make a log entry if wanted
1753	*/
1754	if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
1755	{
1756	struct stat filestats;
1757	int stat_errno;
1758
1759	/*
1760	* If we get an error, as could happen within the ereport/elog calls,
1761	* we'll come right back here during transaction abort. Reset the
1762	* flag to ensure that we can't get into an infinite loop. This code
1763	* is arranged to ensure that the worst-case consequence is failing to
1764	* emit log message(s), not failing to attempt the unlink.
1765	*/
1766	vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
1767
1768
1769	/ first try the stat() /
1770	if (stat(vfdP->fileName, &filestats))
1771	stat_errno = errno;
1772	else
1773	stat_errno = `0`;
1774
1775	/ in any case do the unlink /
1776	if (unlink(vfdP->fileName))
1777	elog(LOG, "could not unlink file \"%s\": %m", vfdP->fileName);
1778
1779	/ and last report the stat results /
1780	if (stat_errno == `0`)
1781	ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
1782	else
1783	{
1784	errno = stat_errno;
1785	elog(LOG, "could not stat file \"%s\": %m", vfdP->fileName);
1786	}
1787	}
1788
1789	/ Unregister it from the resource owner /
1790	if (vfdP->resowner)
1791	ResourceOwnerForgetFile(vfdP->resowner, file);
1792
1793	/*
1794	* Return the Vfd slot to the free list
1795	*/
1796	FreeVfd(file);
1797	}
1798
1799	/*
1800	* FilePrefetch - initiate asynchronous read of a given range of the file.
1801	*
1802	* Currently the only implementation of this function is using posix_fadvise
1803	* which is the simplest standardized interface that accomplishes this.
1804	* We could add an implementation using libaio in the future; but note that
1805	* this API is inappropriate for libaio, which wants to have a buffer provided
1806	* to read into.
1807	*/
1808	int
1809	FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info)
1810	{
1811	#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
1812	int returnCode;
1813
1814	Assert(FileIsValid(file));
1815
1816	DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " %d",
1817	file, VfdCache[file].fileName,
1818	(int64) offset, amount));
1819
1820	returnCode = FileAccess(file);
1821	if (returnCode < `0`)
1822	return returnCode;
1823
1824	pgstat_report_wait_start(wait_event_info);
1825	returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
1826	POSIX_FADV_WILLNEED);
1827	pgstat_report_wait_end();
1828
1829	return returnCode;
1830	#else
1831	Assert(FileIsValid(file));
1832	return `0`;
1833	#endif
1834	}
1835
1836	void
1837	FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
1838	{
1839	int returnCode;
1840
1841	Assert(FileIsValid(file));
1842
1843	DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
1844	file, VfdCache[file].fileName,
1845	(int64) offset, (int64) nbytes));
1846
1847	if (nbytes <= `0`)
1848	return;
1849
1850	returnCode = FileAccess(file);
1851	if (returnCode < `0`)
1852	return;
1853
1854	pgstat_report_wait_start(wait_event_info);
1855	pg_flush_data(VfdCache[file].fd, offset, nbytes);
1856	pgstat_report_wait_end();
1857	}
1858
1859	int
1860	FileRead(File file, char buffer, int* amount, off_t offset,
1861	uint32 wait_event_info)
1862	{
1863	int returnCode;
1864	Vfd *vfdP;
1865
1866	Assert(FileIsValid(file));
1867
1868	DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %d %p",
1869	file, VfdCache[file].fileName,
1870	(int64) offset,
1871	amount, buffer));
1872
1873	returnCode = FileAccess(file);
1874	if (returnCode < `0`)
1875	return returnCode;
1876
1877	vfdP = &VfdCache[file];
1878
1879	retry:
1880	pgstat_report_wait_start(wait_event_info);
1881	returnCode = pg_pread(vfdP->fd, buffer, amount, offset);
1882	pgstat_report_wait_end();
1883
1884	if (returnCode < `0`)
1885	{
1886	/*
1887	* Windows may run out of kernel buffers and return "Insufficient
1888	* system resources" error. Wait a bit and retry to solve it.
1889	*
1890	* It is rumored that EINTR is also possible on some Unix filesystems,
1891	* in which case immediate retry is indicated.
1892	*/
1893	#ifdef WIN32
1894	DWORD error = GetLastError();
1895
1896	switch (error)
1897	{
1898	case ERROR_NO_SYSTEM_RESOURCES:
1899	pg_usleep(`1000L`);
1900	errno = EINTR;
1901	break;
1902	default:
1903	_dosmaperr(error);
1904	break;
1905	}
1906	#endif
1907	/ OK to retry if interrupted /
1908	if (errno == EINTR)
1909	goto retry;
1910	}
1911
1912	return returnCode;
1913	}
1914
1915	int
1916	FileWrite(File file, char buffer, int* amount, off_t offset,
1917	uint32 wait_event_info)
1918	{
1919	int returnCode;
1920	Vfd *vfdP;
1921
1922	Assert(FileIsValid(file));
1923
1924	DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p",
1925	file, VfdCache[file].fileName,
1926	(int64) offset,
1927	amount, buffer));
1928
1929	returnCode = FileAccess(file);
1930	if (returnCode < `0`)
1931	return returnCode;
1932
1933	vfdP = &VfdCache[file];
1934
1935	/*
1936	* If enforcing temp_file_limit and it's a temp file, check to see if the
1937	* write would overrun temp_file_limit, and throw error if so. Note: it's
1938	* really a modularity violation to throw error here; we should set errno
1939	* and return -1. However, there's no way to report a suitable error
1940	* message if we do that. All current callers would just throw error
1941	* immediately anyway, so this is safe at present.
1942	*/
1943	if (temp_file_limit >= `0` && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
1944	{
1945	off_t past_write = offset + amount;
1946
1947	if (past_write > vfdP->fileSize)
1948	{
1949	uint64 newTotal = temporary_files_size;
1950
1951	newTotal += past_write - vfdP->fileSize;
1952	if (newTotal > (uint64) temp_file_limit * (uint64) `1024`)
1953	ereport(ERROR,
1954	(errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
1955	errmsg("temporary file size exceeds temp_file_limit (%dkB)",
1956	temp_file_limit)));
1957	}
1958	}
1959
1960	retry:
1961	errno = `0`;
1962	pgstat_report_wait_start(wait_event_info);
1963	returnCode = pg_pwrite(VfdCache[file].fd, buffer, amount, offset);
1964	pgstat_report_wait_end();
1965
1966	/ if write didn't set errno, assume problem is no disk space /
1967	if (returnCode != amount && errno == `0`)
1968	errno = ENOSPC;
1969
1970	if (returnCode >= `0`)
1971	{
1972	/*
1973	* Maintain fileSize and temporary_files_size if it's a temp file.
1974	*
1975	* If seekPos is -1 (unknown), this will do nothing; but we could only
1976	* get here in that state if we're not enforcing temporary_files_size,
1977	* so we don't care.
1978	*/
1979	if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
1980	{
1981	off_t past_write = offset + amount;
1982
1983	if (past_write > vfdP->fileSize)
1984	{
1985	temporary_files_size += past_write - vfdP->fileSize;
1986	vfdP->fileSize = past_write;
1987	}
1988	}
1989	}
1990	else
1991	{
1992	/*
1993	* See comments in FileRead()
1994	*/
1995	#ifdef WIN32
1996	DWORD error = GetLastError();
1997
1998	switch (error)
1999	{
2000	case ERROR_NO_SYSTEM_RESOURCES:
2001	pg_usleep(`1000L`);
2002	errno = EINTR;
2003	break;
2004	default:
2005	_dosmaperr(error);
2006	break;
2007	}
2008	#endif
2009	/ OK to retry if interrupted /
2010	if (errno == EINTR)
2011	goto retry;
2012	}
2013
2014	return returnCode;
2015	}
2016
2017	int
2018	FileSync(File file, uint32 wait_event_info)
2019	{
2020	int returnCode;
2021
2022	Assert(FileIsValid(file));
2023
2024	DO_DB(elog(LOG, "FileSync: %d (%s)",
2025	file, VfdCache[file].fileName));
2026
2027	returnCode = FileAccess(file);
2028	if (returnCode < `0`)
2029	return returnCode;
2030
2031	pgstat_report_wait_start(wait_event_info);
2032	returnCode = pg_fsync(VfdCache[file].fd);
2033	pgstat_report_wait_end();
2034
2035	return returnCode;
2036	}
2037
2038	off_t
2039	FileSize(File file)
2040	{
2041	Assert(FileIsValid(file));
2042
2043	DO_DB(elog(LOG, "FileSize %d (%s)",
2044	file, VfdCache[file].fileName));
2045
2046	if (FileIsNotOpen(file))
2047	{
2048	if (FileAccess(file) < `0`)
2049	return (off_t) -`1`;
2050	}
2051
2052	return lseek(VfdCache[file].fd, `0`, SEEK_END);
2053	}
2054
2055	int
2056	FileTruncate(File file, off_t offset, uint32 wait_event_info)
2057	{
2058	int returnCode;
2059
2060	Assert(FileIsValid(file));
2061
2062	DO_DB(elog(LOG, "FileTruncate %d (%s)",
2063	file, VfdCache[file].fileName));
2064
2065	returnCode = FileAccess(file);
2066	if (returnCode < `0`)
2067	return returnCode;
2068
2069	pgstat_report_wait_start(wait_event_info);
2070	returnCode = ftruncate(VfdCache[file].fd, offset);
2071	pgstat_report_wait_end();
2072
2073	if (returnCode == `0` && VfdCache[file].fileSize > offset)
2074	{
2075	/ adjust our state for truncation of a temp file /
2076	Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
2077	temporary_files_size -= VfdCache[file].fileSize - offset;
2078	VfdCache[file].fileSize = offset;
2079	}
2080
2081	return returnCode;
2082	}
2083
2084	/*
2085	* Return the pathname associated with an open file.
2086	*
2087	* The returned string points to an internal buffer, which is valid until
2088	* the file is closed.
2089	*/
2090	char *
2091	FilePathName(File file)
2092	{
2093	Assert(FileIsValid(file));
2094
2095	return VfdCache[file].fileName;
2096	}
2097
2098	/*
2099	* Return the raw file descriptor of an opened file.
2100	*
2101	* The returned file descriptor will be valid until the file is closed, but
2102	* there are a lot of things that can make that happen. So the caller should
2103	* be careful not to do much of anything else before it finishes using the
2104	* returned file descriptor.
2105	*/
2106	int
2107	FileGetRawDesc(File file)
2108	{
2109	Assert(FileIsValid(file));
2110	return VfdCache[file].fd;
2111	}
2112
2113	/*
2114	* FileGetRawFlags - returns the file flags on open(2)
2115	*/
2116	int
2117	FileGetRawFlags(File file)
2118	{
2119	Assert(FileIsValid(file));
2120	return VfdCache[file].fileFlags;
2121	}
2122
2123	/*
2124	* FileGetRawMode - returns the mode bitmask passed to open(2)
2125	*/
2126	mode_t
2127	FileGetRawMode(File file)
2128	{
2129	Assert(FileIsValid(file));
2130	return VfdCache[file].fileMode;
2131	}
2132
2133	/*
2134	* Make room for another allocatedDescs[] array entry if needed and possible.
2135	* Returns true if an array element is available.
2136	*/
2137	static bool
2138	reserveAllocatedDesc(void)
2139	{
2140	AllocateDesc *newDescs;
2141	int newMax;
2142
2143	/ Quick out if array already has a free slot. /
2144	if (numAllocatedDescs < maxAllocatedDescs)
2145	return true;
2146
2147	/*
2148	* If the array hasn't yet been created in the current process, initialize
2149	* it with FD_MINFREE / 2 elements. In many scenarios this is as many as
2150	* we will ever need, anyway. We don't want to look at max_safe_fds
2151	* immediately because set_max_safe_fds() may not have run yet.
2152	*/
2153	if (allocatedDescs == NULL)
2154	{
2155	newMax = FD_MINFREE / `2`;
2156	newDescs = (AllocateDesc ) malloc(newMax sizeof(AllocateDesc));
2157	/ Out of memory already? Treat as fatal error. /
2158	if (newDescs == NULL)
2159	ereport(ERROR,
2160	(errcode(ERRCODE_OUT_OF_MEMORY),
2161	errmsg("out of memory")));
2162	allocatedDescs = newDescs;
2163	maxAllocatedDescs = newMax;
2164	return true;
2165	}
2166
2167	/*
2168	* Consider enlarging the array beyond the initial allocation used above.
2169	* By the time this happens, max_safe_fds should be known accurately.
2170	*
2171	* We mustn't let allocated descriptors hog all the available FDs, and in
2172	* practice we'd better leave a reasonable number of FDs for VFD use. So
2173	* set the maximum to max_safe_fds / 2. (This should certainly be at
2174	* least as large as the initial size, FD_MINFREE / 2.)
2175	*/
2176	newMax = max_safe_fds / `2`;
2177	if (newMax > maxAllocatedDescs)
2178	{
2179	newDescs = (AllocateDesc *) realloc(allocatedDescs,
2180	newMax * sizeof(AllocateDesc));
2181	/ Treat out-of-memory as a non-fatal error. /
2182	if (newDescs == NULL)
2183	return false;
2184	allocatedDescs = newDescs;
2185	maxAllocatedDescs = newMax;
2186	return true;
2187	}
2188
2189	/ Can't enlarge allocatedDescs[] any more. /
2190	return false;
2191	}
2192
2193	/*
2194	* Routines that want to use stdio (ie, FILE*) should use AllocateFile
2195	* rather than plain fopen(). This lets fd.c deal with freeing FDs if
2196	* necessary to open the file. When done, call FreeFile rather than fclose.
2197	*
2198	* Note that files that will be open for any significant length of time
2199	* should NOT be handled this way, since they cannot share kernel file
2200	* descriptors with other files; there is grave risk of running out of FDs
2201	* if anyone locks down too many FDs. Most callers of this routine are
2202	* simply reading a config file that they will read and close immediately.
2203	*
2204	* fd.c will automatically close all files opened with AllocateFile at
2205	* transaction commit or abort; this prevents FD leakage if a routine
2206	* that calls AllocateFile is terminated prematurely by ereport(ERROR).
2207	*
2208	* Ideally this should be the only direct call of fopen() in the backend.
2209	*/
2210	FILE *
2211	AllocateFile(const char name, const* char *mode)
2212	{
2213	FILE *file;
2214
2215	DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
2216	numAllocatedDescs, name));
2217
2218	/ Can we allocate another non-virtual FD? /
2219	if (!reserveAllocatedDesc())
2220	ereport(ERROR,
2221	(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2222	errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2223	maxAllocatedDescs, name)));
2224
2225	/ Close excess kernel FDs. /
2226	ReleaseLruFiles();
2227
2228	TryAgain:
2229	if ((file = fopen(name, mode)) != NULL)
2230	{
2231	AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2232
2233	desc->kind = AllocateDescFile;
2234	desc->desc.file = file;
2235	desc->create_subid = GetCurrentSubTransactionId();
2236	numAllocatedDescs++;
2237	return desc->desc.file;
2238	}
2239
2240	if (errno == EMFILE \|\| errno == ENFILE)
2241	{
2242	int save_errno = errno;
2243
2244	ereport(LOG,
2245	(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2246	errmsg("out of file descriptors: %m; release and retry")));
2247	errno = `0`;
2248	if (ReleaseLruFile())
2249	goto TryAgain;
2250	errno = save_errno;
2251	}
2252
2253	return NULL;
2254	}
2255
2256	/*
2257	* Open a file with OpenTransientFilePerm() and pass default file mode for
2258	* the fileMode parameter.
2259	*/
2260	int
2261	OpenTransientFile(const char fileName, int* fileFlags)
2262	{
2263	return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode);
2264	}
2265
2266	/*
2267	* Like AllocateFile, but returns an unbuffered fd like open(2)
2268	*/
2269	int
2270	OpenTransientFilePerm(const char fileName, int* fileFlags, mode_t fileMode)
2271	{
2272	int fd;
2273
2274	DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
2275	numAllocatedDescs, fileName));
2276
2277	/ Can we allocate another non-virtual FD? /
2278	if (!reserveAllocatedDesc())
2279	ereport(ERROR,
2280	(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2281	errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2282	maxAllocatedDescs, fileName)));
2283
2284	/ Close excess kernel FDs. /
2285	ReleaseLruFiles();
2286
2287	fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
2288
2289	if (fd >= `0`)
2290	{
2291	AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2292
2293	desc->kind = AllocateDescRawFD;
2294	desc->desc.fd = fd;
2295	desc->create_subid = GetCurrentSubTransactionId();
2296	numAllocatedDescs++;
2297
2298	return fd;
2299	}
2300
2301	return -`1`; / failure /
2302	}
2303
2304	/*
2305	* Routines that want to initiate a pipe stream should use OpenPipeStream
2306	* rather than plain popen(). This lets fd.c deal with freeing FDs if
2307	* necessary. When done, call ClosePipeStream rather than pclose.
2308	*
2309	* This function also ensures that the popen'd program is run with default
2310	* SIGPIPE processing, rather than the SIG_IGN setting the backend normally
2311	* uses. This ensures desirable response to, eg, closing a read pipe early.
2312	*/
2313	FILE *
2314	OpenPipeStream(const char command, const* char *mode)
2315	{
2316	FILE *file;
2317	int save_errno;
2318
2319	DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
2320	numAllocatedDescs, command));
2321
2322	/ Can we allocate another non-virtual FD? /
2323	if (!reserveAllocatedDesc())
2324	ereport(ERROR,
2325	(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2326	errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
2327	maxAllocatedDescs, command)));
2328
2329	/ Close excess kernel FDs. /
2330	ReleaseLruFiles();
2331
2332	TryAgain:
2333	fflush(stdout);
2334	fflush(stderr);
2335	pqsignal(SIGPIPE, SIG_DFL);
2336	errno = `0`;
2337	file = popen(command, mode);
2338	save_errno = errno;
2339	pqsignal(SIGPIPE, SIG_IGN);
2340	errno = save_errno;
2341	if (file != NULL)
2342	{
2343	AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2344
2345	desc->kind = AllocateDescPipe;
2346	desc->desc.file = file;
2347	desc->create_subid = GetCurrentSubTransactionId();
2348	numAllocatedDescs++;
2349	return desc->desc.file;
2350	}
2351
2352	if (errno == EMFILE \|\| errno == ENFILE)
2353	{
2354	ereport(LOG,
2355	(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2356	errmsg("out of file descriptors: %m; release and retry")));
2357	if (ReleaseLruFile())
2358	goto TryAgain;
2359	errno = save_errno;
2360	}
2361
2362	return NULL;
2363	}
2364
2365	/*
2366	* Free an AllocateDesc of any type.
2367	*
2368	* The argument must point into the allocatedDescs[] array.
2369	*/
2370	static int
2371	FreeDesc(AllocateDesc *desc)
2372	{
2373	int result;
2374
2375	/ Close the underlying object /
2376	switch (desc->kind)
2377	{
2378	case AllocateDescFile:
2379	result = fclose(desc->desc.file);
2380	break;
2381	case AllocateDescPipe:
2382	result = pclose(desc->desc.file);
2383	break;
2384	case AllocateDescDir:
2385	result = closedir(desc->desc.dir);
2386	break;
2387	case AllocateDescRawFD:
2388	result = close(desc->desc.fd);
2389	break;
2390	default:
2391	elog(ERROR, "AllocateDesc kind not recognized");
2392	result = `0`; / keep compiler quiet /
2393	break;
2394	}
2395
2396	/ Compact storage in the allocatedDescs array /
2397	numAllocatedDescs--;
2398	*desc = allocatedDescs[numAllocatedDescs];
2399
2400	return result;
2401	}
2402
2403	/*
2404	* Close a file returned by AllocateFile.
2405	*
2406	* Note we do not check fclose's return value --- it is up to the caller
2407	* to handle close errors.
2408	*/
2409	int
2410	FreeFile(FILE *file)
2411	{
2412	int i;
2413
2414	DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
2415
2416	/ Remove file from list of allocated files, if it's present /
2417	for (i = numAllocatedDescs; --i >= `0`;)
2418	{
2419	AllocateDesc *desc = &allocatedDescs[i];
2420
2421	if (desc->kind == AllocateDescFile && desc->desc.file == file)
2422	return FreeDesc(desc);
2423	}
2424
2425	/ Only get here if someone passes us a file not in allocatedDescs /
2426	elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
2427
2428	return fclose(file);
2429	}
2430
2431	/*
2432	* Close a file returned by OpenTransientFile.
2433	*
2434	* Note we do not check close's return value --- it is up to the caller
2435	* to handle close errors.
2436	*/
2437	int
2438	CloseTransientFile(int fd)
2439	{
2440	int i;
2441
2442	DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
2443
2444	/ Remove fd from list of allocated files, if it's present /
2445	for (i = numAllocatedDescs; --i >= `0`;)
2446	{
2447	AllocateDesc *desc = &allocatedDescs[i];
2448
2449	if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
2450	return FreeDesc(desc);
2451	}
2452
2453	/ Only get here if someone passes us a file not in allocatedDescs /
2454	elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
2455
2456	return close(fd);
2457	}
2458
2459	/*
2460	* Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
2461	* rather than plain opendir(). This lets fd.c deal with freeing FDs if
2462	* necessary to open the directory, and with closing it after an elog.
2463	* When done, call FreeDir rather than closedir.
2464	*
2465	* Returns NULL, with errno set, on failure. Note that failure detection
2466	* is commonly left to the following call of ReadDir or ReadDirExtended;
2467	* see the comments for ReadDir.
2468	*
2469	* Ideally this should be the only direct call of opendir() in the backend.
2470	*/
2471	DIR *
2472	AllocateDir(const char *dirname)
2473	{
2474	DIR *dir;
2475
2476	DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
2477	numAllocatedDescs, dirname));
2478
2479	/ Can we allocate another non-virtual FD? /
2480	if (!reserveAllocatedDesc())
2481	ereport(ERROR,
2482	(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2483	errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
2484	maxAllocatedDescs, dirname)));
2485
2486	/ Close excess kernel FDs. /
2487	ReleaseLruFiles();
2488
2489	TryAgain:
2490	if ((dir = opendir(dirname)) != NULL)
2491	{
2492	AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2493
2494	desc->kind = AllocateDescDir;
2495	desc->desc.dir = dir;
2496	desc->create_subid = GetCurrentSubTransactionId();
2497	numAllocatedDescs++;
2498	return desc->desc.dir;
2499	}
2500
2501	if (errno == EMFILE \|\| errno == ENFILE)
2502	{
2503	int save_errno = errno;
2504
2505	ereport(LOG,
2506	(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2507	errmsg("out of file descriptors: %m; release and retry")));
2508	errno = `0`;
2509	if (ReleaseLruFile())
2510	goto TryAgain;
2511	errno = save_errno;
2512	}
2513
2514	return NULL;
2515	}
2516
2517	/*
2518	* Read a directory opened with AllocateDir, ereport'ing any error.
2519	*
2520	* This is easier to use than raw readdir() since it takes care of some
2521	* otherwise rather tedious and error-prone manipulation of errno. Also,
2522	* if you are happy with a generic error message for AllocateDir failure,
2523	* you can just do
2524	*
2525	* dir = AllocateDir(path);
2526	* while ((dirent = ReadDir(dir, path)) != NULL)
2527	* process dirent;
2528	* FreeDir(dir);
2529	*
2530	* since a NULL dir parameter is taken as indicating AllocateDir failed.
2531	* (Make sure errno isn't changed between AllocateDir and ReadDir if you
2532	* use this shortcut.)
2533	*
2534	* The pathname passed to AllocateDir must be passed to this routine too,
2535	* but it is only used for error reporting.
2536	*/
2537	struct dirent *
2538	ReadDir(DIR dir, const* char *dirname)
2539	{
2540	return ReadDirExtended(dir, dirname, ERROR);
2541	}
2542
2543	/*
2544	* Alternate version of ReadDir that allows caller to specify the elevel
2545	* for any error report (whether it's reporting an initial failure of
2546	* AllocateDir or a subsequent directory read failure).
2547	*
2548	* If elevel < ERROR, returns NULL after any error. With the normal coding
2549	* pattern, this will result in falling out of the loop immediately as
2550	* though the directory contained no (more) entries.
2551	*/
2552	struct dirent *
2553	ReadDirExtended(DIR dir, const* char dirname, int* elevel)
2554	{
2555	struct dirent *dent;
2556
2557	/ Give a generic message for AllocateDir failure, if caller didn't /
2558	if (dir == NULL)
2559	{
2560	ereport(elevel,
2561	(errcode_for_file_access(),
2562	errmsg("could not open directory \"%s\": %m",
2563	dirname)));
2564	return NULL;
2565	}
2566
2567	errno = `0`;
2568	if ((dent = readdir(dir)) != NULL)
2569	return dent;
2570
2571	if (errno)
2572	ereport(elevel,
2573	(errcode_for_file_access(),
2574	errmsg("could not read directory \"%s\": %m",
2575	dirname)));
2576	return NULL;
2577	}
2578
2579	/*
2580	* Close a directory opened with AllocateDir.
2581	*
2582	* Returns closedir's return value (with errno set if it's not 0).
2583	* Note we do not check the return value --- it is up to the caller
2584	* to handle close errors if wanted.
2585	*
2586	* Does nothing if dir == NULL; we assume that directory open failure was
2587	* already reported if desired.
2588	*/
2589	int
2590	FreeDir(DIR *dir)
2591	{
2592	int i;
2593
2594	/ Nothing to do if AllocateDir failed /
2595	if (dir == NULL)
2596	return `0`;
2597
2598	DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
2599
2600	/ Remove dir from list of allocated dirs, if it's present /
2601	for (i = numAllocatedDescs; --i >= `0`;)
2602	{
2603	AllocateDesc *desc = &allocatedDescs[i];
2604
2605	if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
2606	return FreeDesc(desc);
2607	}
2608
2609	/ Only get here if someone passes us a dir not in allocatedDescs /
2610	elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
2611
2612	return closedir(dir);
2613	}
2614
2615
2616	/*
2617	* Close a pipe stream returned by OpenPipeStream.
2618	*/
2619	int
2620	ClosePipeStream(FILE *file)
2621	{
2622	int i;
2623
2624	DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
2625
2626	/ Remove file from list of allocated files, if it's present /
2627	for (i = numAllocatedDescs; --i >= `0`;)
2628	{
2629	AllocateDesc *desc = &allocatedDescs[i];
2630
2631	if (desc->kind == AllocateDescPipe && desc->desc.file == file)
2632	return FreeDesc(desc);
2633	}
2634
2635	/ Only get here if someone passes us a file not in allocatedDescs /
2636	elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
2637
2638	return pclose(file);
2639	}
2640
2641	/*
2642	* closeAllVfds
2643	*
2644	* Force all VFDs into the physically-closed state, so that the fewest
2645	* possible number of kernel file descriptors are in use. There is no
2646	* change in the logical state of the VFDs.
2647	*/
2648	void
2649	closeAllVfds(void)
2650	{
2651	Index i;
2652
2653	if (SizeVfdCache > `0`)
2654	{
2655	Assert(FileIsNotOpen(`0`)); / Make sure ring not corrupted /
2656	for (i = `1`; i < SizeVfdCache; i++)
2657	{
2658	if (!FileIsNotOpen(i))
2659	LruDelete(i);
2660	}
2661	}
2662	}
2663
2664
2665	/*
2666	* SetTempTablespaces
2667	*
2668	* Define a list (actually an array) of OIDs of tablespaces to use for
2669	* temporary files. This list will be used until end of transaction,
2670	* unless this function is called again before then. It is caller's
2671	* responsibility that the passed-in array has adequate lifespan (typically
2672	* it'd be allocated in TopTransactionContext).
2673	*/
2674	void
2675	SetTempTablespaces(Oid tableSpaces, int* numSpaces)
2676	{
2677	Assert(numSpaces >= `0`);
2678	tempTableSpaces = tableSpaces;
2679	numTempTableSpaces = numSpaces;
2680
2681	/*
2682	* Select a random starting point in the list. This is to minimize
2683	* conflicts between backends that are most likely sharing the same list
2684	* of temp tablespaces. Note that if we create multiple temp files in the
2685	* same transaction, we'll advance circularly through the list --- this
2686	* ensures that large temporary sort files are nicely spread across all
2687	* available tablespaces.
2688	*/
2689	if (numSpaces > `1`)
2690	nextTempTableSpace = random() % numSpaces;
2691	else
2692	nextTempTableSpace = `0`;
2693	}
2694
2695	/*
2696	* TempTablespacesAreSet
2697	*
2698	* Returns true if SetTempTablespaces has been called in current transaction.
2699	* (This is just so that tablespaces.c doesn't need its own per-transaction
2700	* state.)
2701	*/
2702	bool
2703	TempTablespacesAreSet(void)
2704	{
2705	return (numTempTableSpaces >= `0`);
2706	}
2707
2708	/*
2709	* GetTempTablespaces
2710	*
2711	* Populate an array with the OIDs of the tablespaces that should be used for
2712	* temporary files. Return the number that were copied into the output array.
2713	*/
2714	int
2715	GetTempTablespaces(Oid tableSpaces, int* numSpaces)
2716	{
2717	int i;
2718
2719	Assert(TempTablespacesAreSet());
2720	for (i = `0`; i < numTempTableSpaces && i < numSpaces; ++i)
2721	tableSpaces[i] = tempTableSpaces[i];
2722
2723	return i;
2724	}
2725
2726	/*
2727	* GetNextTempTableSpace
2728	*
2729	* Select the next temp tablespace to use. A result of InvalidOid means
2730	* to use the current database's default tablespace.
2731	*/
2732	Oid
2733	GetNextTempTableSpace(void)
2734	{
2735	if (numTempTableSpaces > `0`)
2736	{
2737	/ Advance nextTempTableSpace counter with wraparound /
2738	if (++nextTempTableSpace >= numTempTableSpaces)
2739	nextTempTableSpace = `0`;
2740	return tempTableSpaces[nextTempTableSpace];
2741	}
2742	return InvalidOid;
2743	}
2744
2745
2746	/*
2747	* AtEOSubXact_Files
2748	*
2749	* Take care of subtransaction commit/abort. At abort, we close temp files
2750	* that the subtransaction may have opened. At commit, we reassign the
2751	* files that were opened to the parent subtransaction.
2752	*/
2753	void
2754	AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
2755	SubTransactionId parentSubid)
2756	{
2757	Index i;
2758
2759	for (i = `0`; i < numAllocatedDescs; i++)
2760	{
2761	if (allocatedDescs[i].create_subid == mySubid)
2762	{
2763	if (isCommit)
2764	allocatedDescs[i].create_subid = parentSubid;
2765	else
2766	{
2767	/ have to recheck the item after FreeDesc (ugly) /
2768	FreeDesc(&allocatedDescs[i--]);
2769	}
2770	}
2771	}
2772	}
2773
2774	/*
2775	* AtEOXact_Files
2776	*
2777	* This routine is called during transaction commit or abort. All still-open
2778	* per-transaction temporary file VFDs are closed, which also causes the
2779	* underlying files to be deleted (although they should've been closed already
2780	* by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files are
2781	* closed. We also forget any transaction-local temp tablespace list.
2782	*
2783	* The isCommit flag is used only to decide whether to emit warnings about
2784	* unclosed files.
2785	*/
2786	void
2787	AtEOXact_Files(bool isCommit)
2788	{
2789	CleanupTempFiles(isCommit, false);
2790	tempTableSpaces = NULL;
2791	numTempTableSpaces = -`1`;
2792	}
2793
2794	/*
2795	* AtProcExit_Files
2796	*
2797	* on_proc_exit hook to clean up temp files during backend shutdown.
2798	* Here, we want to clean up all temp files including interXact ones.
2799	*/
2800	static void
2801	AtProcExit_Files(int code, Datum arg)
2802	{
2803	CleanupTempFiles(false, true);
2804	}
2805
2806	/*
2807	* Close temporary files and delete their underlying files.
2808	*
2809	* isCommit: if true, this is normal transaction commit, and we don't
2810	* expect any remaining files; warn if there are some.
2811	*
2812	* isProcExit: if true, this is being called as the backend process is
2813	* exiting. If that's the case, we should remove all temporary files; if
2814	* that's not the case, we are being called for transaction commit/abort
2815	* and should only remove transaction-local temp files. In either case,
2816	* also clean up "allocated" stdio files, dirs and fds.
2817	*/
2818	static void
2819	CleanupTempFiles(bool isCommit, bool isProcExit)
2820	{
2821	Index i;
2822
2823	/*
2824	* Careful here: at proc_exit we need extra cleanup, not just
2825	* xact_temporary files.
2826	*/
2827	if (isProcExit \|\| have_xact_temporary_files)
2828	{
2829	Assert(FileIsNotOpen(`0`)); / Make sure ring not corrupted /
2830	for (i = `1`; i < SizeVfdCache; i++)
2831	{
2832	unsigned short fdstate = VfdCache[i].fdstate;
2833
2834	if (((fdstate & FD_DELETE_AT_CLOSE) \|\| (fdstate & FD_CLOSE_AT_EOXACT)) &&
2835	VfdCache[i].fileName != NULL)
2836	{
2837	/*
2838	* If we're in the process of exiting a backend process, close
2839	* all temporary files. Otherwise, only close temporary files
2840	* local to the current transaction. They should be closed by
2841	* the ResourceOwner mechanism already, so this is just a
2842	* debugging cross-check.
2843	*/
2844	if (isProcExit)
2845	FileClose(i);
2846	else if (fdstate & FD_CLOSE_AT_EOXACT)
2847	{
2848	elog(WARNING,
2849	"temporary file %s not closed at end-of-transaction",
2850	VfdCache[i].fileName);
2851	FileClose(i);
2852	}
2853	}
2854	}
2855
2856	have_xact_temporary_files = false;
2857	}
2858
2859	/ Complain if any allocated files remain open at commit. /
2860	if (isCommit && numAllocatedDescs > `0`)
2861	elog(WARNING, "%d temporary files and directories not closed at end-of-transaction",
2862	numAllocatedDescs);
2863
2864	/ Clean up "allocated" stdio files, dirs and fds. /
2865	while (numAllocatedDescs > `0`)
2866	FreeDesc(&allocatedDescs[`0`]);
2867	}
2868
2869
2870	/*
2871	* Remove temporary and temporary relation files left over from a prior
2872	* postmaster session
2873	*
2874	* This should be called during postmaster startup. It will forcibly
2875	* remove any leftover files created by OpenTemporaryFile and any leftover
2876	* temporary relation files created by mdcreate.
2877	*
2878	* NOTE: we could, but don't, call this during a post-backend-crash restart
2879	* cycle. The argument for not doing it is that someone might want to examine
2880	* the temp files for debugging purposes. This does however mean that
2881	* OpenTemporaryFile had better allow for collision with an existing temp
2882	* file name.
2883	*
2884	* NOTE: this function and its subroutines generally report syscall failures
2885	* with ereport(LOG) and keep going. Removing temp files is not so critical
2886	* that we should fail to start the database when we can't do it.
2887	*/
2888	void
2889	RemovePgTempFiles(void)
2890	{
2891	char temp_path[MAXPGPATH + `10` + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
2892	DIR *spc_dir;
2893	struct dirent *spc_de;
2894
2895	/*
2896	* First process temp files in pg_default ($PGDATA/base)
2897	*/
2898	snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
2899	RemovePgTempFilesInDir(temp_path, true, false);
2900	RemovePgTempRelationFiles("base");
2901
2902	/*
2903	* Cycle through temp directories for all non-default tablespaces.
2904	*/
2905	spc_dir = AllocateDir("pg_tblspc");
2906
2907	while ((spc_de = ReadDirExtended(spc_dir, "pg_tblspc", LOG)) != NULL)
2908	{
2909	if (strcmp(spc_de->d_name, ".") == `0` \|\|
2910	strcmp(spc_de->d_name, "..") == `0`)
2911	continue;
2912
2913	snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s",
2914	spc_de->d_name, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR);
2915	RemovePgTempFilesInDir(temp_path, true, false);
2916
2917	snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
2918	spc_de->d_name, TABLESPACE_VERSION_DIRECTORY);
2919	RemovePgTempRelationFiles(temp_path);
2920	}
2921
2922	FreeDir(spc_dir);
2923
2924	/*
2925	* In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
2926	* DataDir as well.
2927	*/
2928	#ifdef EXEC_BACKEND
2929	RemovePgTempFilesInDir(PG_TEMP_FILES_DIR, true, false);
2930	#endif
2931	}
2932
2933	/*
2934	* Process one pgsql_tmp directory for RemovePgTempFiles.
2935	*
2936	* If missing_ok is true, it's all right for the named directory to not exist.
2937	* Any other problem results in a LOG message. (missing_ok should be true at
2938	* the top level, since pgsql_tmp directories are not created until needed.)
2939	*
2940	* At the top level, this should be called with unlink_all = false, so that
2941	* only files matching the temporary name prefix will be unlinked. When
2942	* recursing it will be called with unlink_all = true to unlink everything
2943	* under a top-level temporary directory.
2944	*
2945	* (These two flags could be replaced by one, but it seems clearer to keep
2946	* them separate.)
2947	*/
2948	static void
2949	RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
2950	{
2951	DIR *temp_dir;
2952	struct dirent *temp_de;
2953	char rm_path[MAXPGPATH * `2`];
2954
2955	temp_dir = AllocateDir(tmpdirname);
2956
2957	if (temp_dir == NULL && errno == ENOENT && missing_ok)
2958	return;
2959
2960	while ((temp_de = ReadDirExtended(temp_dir, tmpdirname, LOG)) != NULL)
2961	{
2962	if (strcmp(temp_de->d_name, ".") == `0` \|\|
2963	strcmp(temp_de->d_name, "..") == `0`)
2964	continue;
2965
2966	snprintf(rm_path, sizeof(rm_path), "%s/%s",
2967	tmpdirname, temp_de->d_name);
2968
2969	if (unlink_all \|\|
2970	strncmp(temp_de->d_name,
2971	PG_TEMP_FILE_PREFIX,
2972	strlen(PG_TEMP_FILE_PREFIX)) == `0`)
2973	{
2974	struct stat statbuf;
2975
2976	if (lstat(rm_path, &statbuf) < `0`)
2977	{
2978	ereport(LOG,
2979	(errcode_for_file_access(),
2980	errmsg("could not stat file \"%s\": %m", rm_path)));
2981	continue;
2982	}
2983
2984	if (S_ISDIR(statbuf.st_mode))
2985	{
2986	/ recursively remove contents, then directory itself /
2987	RemovePgTempFilesInDir(rm_path, false, true);
2988
2989	if (rmdir(rm_path) < `0`)
2990	ereport(LOG,
2991	(errcode_for_file_access(),
2992	errmsg("could not remove directory \"%s\": %m",
2993	rm_path)));
2994	}
2995	else
2996	{
2997	if (unlink(rm_path) < `0`)
2998	ereport(LOG,
2999	(errcode_for_file_access(),
3000	errmsg("could not remove file \"%s\": %m",
3001	rm_path)));
3002	}
3003	}
3004	else
3005	ereport(LOG,
3006	(errmsg("unexpected file found in temporary-files directory: \"%s\"",
3007	rm_path)));
3008	}
3009
3010	FreeDir(temp_dir);
3011	}
3012
3013	/ Process one tablespace directory, look for per-DB subdirectories /
3014	static void
3015	RemovePgTempRelationFiles(const char *tsdirname)
3016	{
3017	DIR *ts_dir;
3018	struct dirent *de;
3019	char dbspace_path[MAXPGPATH * `2`];
3020
3021	ts_dir = AllocateDir(tsdirname);
3022
3023	while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
3024	{
3025	/*
3026	* We're only interested in the per-database directories, which have
3027	* numeric names. Note that this code will also (properly) ignore "."
3028	* and "..".
3029	*/
3030	if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
3031	continue;
3032
3033	snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
3034	tsdirname, de->d_name);
3035	RemovePgTempRelationFilesInDbspace(dbspace_path);
3036	}
3037
3038	FreeDir(ts_dir);
3039	}
3040
3041	/ Process one per-dbspace directory for RemovePgTempRelationFiles /
3042	static void
3043	RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
3044	{
3045	DIR *dbspace_dir;
3046	struct dirent *de;
3047	char rm_path[MAXPGPATH * `2`];
3048
3049	dbspace_dir = AllocateDir(dbspacedirname);
3050
3051	while ((de = ReadDirExtended(dbspace_dir, dbspacedirname, LOG)) != NULL)
3052	{
3053	if (!looks_like_temp_rel_name(de->d_name))
3054	continue;
3055
3056	snprintf(rm_path, sizeof(rm_path), "%s/%s",
3057	dbspacedirname, de->d_name);
3058
3059	if (unlink(rm_path) < `0`)
3060	ereport(LOG,
3061	(errcode_for_file_access(),
3062	errmsg("could not remove file \"%s\": %m",
3063	rm_path)));
3064	}
3065
3066	FreeDir(dbspace_dir);
3067	}
3068
/*
 * t<digits>_<digits>, or t<digits>_<digits>_<forkname>
 *
 * Returns true if the whole of "name" has that shape, optionally followed
 * by ".<digits>"; returns false otherwise.
 */
bool
looks_like_temp_rel_name(const char *name)
{
	const char *cursor = name;
	const char *runStart;

	/* Must start with "t". */
	if (*cursor != 't')
		return false;
	cursor++;

	/* Followed by a non-empty string of digits and then an underscore. */
	runStart = cursor;
	while (isdigit((unsigned char) *cursor))
		cursor++;
	if (cursor == runStart || *cursor != '_')
		return false;
	cursor++;

	/* Followed by another nonempty string of digits. */
	runStart = cursor;
	while (isdigit((unsigned char) *cursor))
		cursor++;
	if (cursor == runStart)
		return false;

	/* We might have _forkname or .segment or both. */
	if (*cursor == '_')
	{
		int			forkchar = forkname_chars(cursor + 1, NULL);

		if (forkchar <= 0)
			return false;
		/* step over the underscore plus the characters forkname_chars counted */
		cursor += forkchar + 1;
	}
	if (*cursor == '.')
	{
		const char *segStart = cursor + 1;
		const char *segEnd = segStart;

		while (isdigit((unsigned char) *segEnd))
			segEnd++;
		/* a bare "." with no digits after it doesn't qualify */
		if (segEnd == segStart)
			return false;
		cursor = segEnd;
	}

	/* Now we should be at the end. */
	return *cursor == '\0';
}
3117
3118
3119	/*
3120	* Issue fsync recursively on PGDATA and all its contents.
3121	*
3122	* We fsync regular files and directories wherever they are, but we
3123	* follow symlinks only for pg_wal and immediately under pg_tblspc.
3124	* Other symlinks are presumed to point at files we're not responsible
3125	* for fsyncing, and might not have privileges to write at all.
3126	*
3127	* Errors are logged but not considered fatal; that's because this is used
3128	* only during database startup, to deal with the possibility that there are
3129	* issued-but-unsynced writes pending against the data directory. We want to
3130	* ensure that such writes reach disk before anything that's done in the new
3131	* run. However, aborting on error would result in failure to start for
3132	* harmless cases such as read-only files in the data directory, and that's
3133	* not good either.
3134	*
3135	* Note that if we previously crashed due to a PANIC on fsync(), we'll be
3136	* rewriting all changes again during recovery.
3137	*
3138	* Note we assume we're chdir'd into PGDATA to begin with.
3139	*/
3140	void
3141	SyncDataDirectory(void)
3142	{
3143	bool xlog_is_symlink;
3144
3145	/ We can skip this whole thing if fsync is disabled. /
3146	if (!enableFsync)
3147	return;
3148
3149	/*
3150	* If pg_wal is a symlink, we'll need to recurse into it separately,
3151	* because the first walkdir below will ignore it.
3152	*/
3153	xlog_is_symlink = false;
3154
3155	#ifndef WIN32
3156	{
3157	struct stat st;
3158
3159	if (lstat("pg_wal", &st) < `0`)
3160	ereport(LOG,
3161	(errcode_for_file_access(),
3162	errmsg("could not stat file \"%s\": %m",
3163	"pg_wal")));
3164	else if (S_ISLNK(st.st_mode))
3165	xlog_is_symlink = true;
3166	}
3167	#else
3168	if (pgwin32_is_junction("pg_wal"))
3169	xlog_is_symlink = true;
3170	#endif
3171
3172	/*
3173	* If possible, hint to the kernel that we're soon going to fsync the data
3174	* directory and its contents. Errors in this step are even less
3175	* interesting than normal, so log them only at DEBUG1.
3176	*/
3177	#ifdef PG_FLUSH_DATA_WORKS
3178	walkdir(".", pre_sync_fname, false, DEBUG1);
3179	if (xlog_is_symlink)
3180	walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
3181	walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1);
3182	#endif
3183
3184	/*
3185	* Now we do the fsync()s in the same order.
3186	*
3187	* The main call ignores symlinks, so in addition to specially processing
3188	* pg_wal if it's a symlink, pg_tblspc has to be visited separately with
3189	* process_symlinks = true. Note that if there are any plain directories
3190	* in pg_tblspc, they'll get fsync'd twice. That's not an expected case
3191	* so we don't worry about optimizing it.
3192	*/
3193	walkdir(".", datadir_fsync_fname, false, LOG);
3194	if (xlog_is_symlink)
3195	walkdir("pg_wal", datadir_fsync_fname, false, LOG);
3196	walkdir("pg_tblspc", datadir_fsync_fname, true, LOG);
3197	}
3198
3199	/*
3200	* walkdir: recursively walk a directory, applying the action to each
3201	* regular file and directory (including the named directory itself).
3202	*
3203	* If process_symlinks is true, the action and recursion are also applied
3204	* to regular files and directories that are pointed to by symlinks in the
3205	* given directory; otherwise symlinks are ignored. Symlinks are always
3206	* ignored in subdirectories, ie we intentionally don't pass down the
3207	* process_symlinks flag to recursive calls.
3208	*
3209	* Errors are reported at level elevel, which might be ERROR or less.
3210	*
3211	* See also walkdir in initdb.c, which is a frontend version of this logic.
3212	*/
3213	static void
3214	walkdir(const char *path,
3215	void (action) (const* char fname, bool isdir, int* elevel),
3216	bool process_symlinks,
3217	int elevel)
3218	{
3219	DIR *dir;
3220	struct dirent *de;
3221
3222	dir = AllocateDir(path);
3223
3224	while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
3225	{
3226	char subpath[MAXPGPATH * `2`];
3227	struct stat fst;
3228	int sret;
3229
3230	CHECK_FOR_INTERRUPTS();
3231
3232	if (strcmp(de->d_name, ".") == `0` \|\|
3233	strcmp(de->d_name, "..") == `0`)
3234	continue;
3235
3236	snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
3237
3238	if (process_symlinks)
3239	sret = stat(subpath, &fst);
3240	else
3241	sret = lstat(subpath, &fst);
3242
3243	if (sret < `0`)
3244	{
3245	ereport(elevel,
3246	(errcode_for_file_access(),
3247	errmsg("could not stat file \"%s\": %m", subpath)));
3248	continue;
3249	}
3250
3251	if (S_ISREG(fst.st_mode))
3252	(*action) (subpath, false, elevel);
3253	else if (S_ISDIR(fst.st_mode))
3254	walkdir(subpath, action, false, elevel);
3255	}
3256
3257	FreeDir(dir); / we ignore any error here /
3258
3259	/*
3260	* It's important to fsync the destination directory itself as individual
3261	* file fsyncs don't guarantee that the directory entry for the file is
3262	* synced. However, skip this if AllocateDir failed; the action function
3263	* might not be robust against that.
3264	*/
3265	if (dir)
3266	(*action) (path, true, elevel);
3267	}
3268
3269
/*
 * Hint to the OS that it should get ready to fsync() this file.
 *
 * fname is the path to flush; isdir tells whether it names a directory, in
 * which case nothing is done; elevel is the level at which errors are
 * reported.
 *
 * Ignores errors trying to open unreadable files, and logs other errors at a
 * caller-specified level.
 */
#ifdef PG_FLUSH_DATA_WORKS

static void
pre_sync_fname(const char *fname, bool isdir, int elevel)
{
	int			fd;

	/* Don't try to flush directories, it'll likely just fail */
	if (isdir)
		return;

	fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY);

	if (fd < 0)
	{
		/* Unreadable files are silently skipped */
		if (errno == EACCES)
			return;
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", fname)));
		return;
	}

	/*
	 * pg_flush_data() ignores errors, which is ok because this is only a
	 * hint.
	 */
	pg_flush_data(fd, 0, 0);

	if (CloseTransientFile(fd))
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", fname)));
}

#endif							/* PG_FLUSH_DATA_WORKS */
3312
/*
 * walkdir action: fsync one file or directory of the data directory.
 *
 * fname is the path to sync, isdir tells whether it is a directory, and
 * elevel is the level at which errors are reported.  The result of
 * fsync_fname_ext() is deliberately discarded.
 */
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
	/*
	 * We want to silently ignore errors about unreadable files.  Pass that
	 * desire on to fsync_fname_ext().
	 */
	(void) fsync_fname_ext(fname, isdir, true, elevel);
}
3322
/*
 * walkdir action: remove one file or directory, tolerating its absence.
 *
 * For a directory (isdir = true), rmdir() is attempted and any failure other
 * than ENOENT is reported at level elevel.  For a file, removal is delegated
 * to PathNameDeleteTemporaryFile(); elevel is not used on that path.
 */
static void
unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
{
	if (isdir)
	{
		if (rmdir(fname) != 0 && errno != ENOENT)
			ereport(elevel,
					(errcode_for_file_access(),
					 errmsg("could not remove directory \"%s\": %m", fname)));
	}
	else
	{
		/* Use PathNameDeleteTemporaryFile to report filesize */
		PathNameDeleteTemporaryFile(fname, false);
	}
}
3339
3340	/*
3341	* fsync_fname_ext -- Try to fsync a file or directory
3342	*
3343	* If ignore_perm is true, ignore errors upon trying to open unreadable
3344	* files. Logs other errors at a caller-specified level.
3345	*
3346	* Returns 0 if the operation succeeded, -1 otherwise.
3347	*/
3348	static int
3349	fsync_fname_ext(const char fname, bool isdir, bool ignore_perm, int* elevel)
3350	{
3351	int fd;
3352	int flags;
3353	int returncode;
3354
3355	/*
3356	* Some OSs require directories to be opened read-only whereas other
3357	* systems don't allow us to fsync files opened read-only; so we need both
3358	* cases here. Using O_RDWR will cause us to fail to fsync files that are
3359	* not writable by our userid, but we assume that's OK.
3360	*/
3361	flags = PG_BINARY;
3362	if (!isdir)
3363	flags \|= O_RDWR;
3364	else
3365	flags \|= O_RDONLY;
3366
3367	fd = OpenTransientFile(fname, flags);
3368
3369	/*
3370	* Some OSs don't allow us to open directories at all (Windows returns
3371	* EACCES), just ignore the error in that case. If desired also silently
3372	* ignoring errors about unreadable files. Log others.
3373	*/
3374	if (fd < `0` && isdir && (errno == EISDIR \|\| errno == EACCES))
3375	return `0`;
3376	else if (fd < `0` && ignore_perm && errno == EACCES)
3377	return `0`;
3378	else if (fd < `0`)
3379	{
3380	ereport(elevel,
3381	(errcode_for_file_access(),
3382	errmsg("could not open file \"%s\": %m", fname)));
3383	return -`1`;
3384	}
3385
3386	returncode = pg_fsync(fd);
3387
3388	/*
3389	* Some OSes don't allow us to fsync directories at all, so we can ignore
3390	* those errors. Anything else needs to be logged.
3391	*/
3392	if (returncode != `0` && !(isdir && (errno == EBADF \|\| errno == EINVAL)))
3393	{
3394	int save_errno;
3395
3396	/ close file upon error, might not be in transaction context /
3397	save_errno = errno;
3398	(void) CloseTransientFile(fd);
3399	errno = save_errno;
3400
3401	ereport(elevel,
3402	(errcode_for_file_access(),
3403	errmsg("could not fsync file \"%s\": %m", fname)));
3404	return -`1`;
3405	}
3406
3407	if (CloseTransientFile(fd))
3408	{
3409	ereport(elevel,
3410	(errcode_for_file_access(),
3411	errmsg("could not close file \"%s\": %m", fname)));
3412	return -`1`;
3413	}
3414
3415	return `0`;
3416	}
3417
3418	/*
3419	* fsync_parent_path -- fsync the parent path of a file or directory
3420	*
3421	* This is aimed at making file operations persistent on disk in case of
3422	* an OS crash or power failure.
3423	*/
3424	static int
3425	fsync_parent_path(const char fname, int* elevel)
3426	{
3427	char parentpath[MAXPGPATH];
3428
3429	strlcpy(parentpath, fname, MAXPGPATH);
3430	get_parent_directory(parentpath);
3431
3432	/*
3433	* get_parent_directory() returns an empty string if the input argument is
3434	* just a file name (see comments in path.c), so handle that as being the
3435	* current directory.
3436	*/
3437	if (strlen(parentpath) == `0`)
3438	strlcpy(parentpath, ".", MAXPGPATH);
3439
3440	if (fsync_fname_ext(parentpath, true, false, elevel) != `0`)
3441	return -`1`;
3442
3443	return `0`;
3444	}
3445
3446	/*
3447	* Create a PostgreSQL data sub-directory
3448	*
3449	* The data directory itself, and most of its sub-directories, are created at
3450	* initdb time, but we do have some occasions when we create directories in
3451	* the backend (CREATE TABLESPACE, for example). In those cases, we want to
3452	* make sure that those directories are created consistently. Today, that means
3453	* making sure that the created directory has the correct permissions, which is
3454	* what pg_dir_create_mode tracks for us.
3455	*
3456	* Note that we also set the umask() based on what we understand the correct
3457	* permissions to be (see file_perm.c).
3458	*
3459	* For permissions other than the default, mkdir() can be used directly, but
3460	* be sure to consider carefully such cases -- a sub-directory with incorrect
3461	* permissions in a PostgreSQL data directory could cause backups and other
3462	* processes to fail.
3463	*/
3464	int
3465	MakePGDirectory(const char *directoryName)
3466	{
3467	return mkdir(directoryName, pg_dir_create_mode);
3468	}
3469
3470	/*
3471	* Return the passed-in error level, or PANIC if data_sync_retry is off.
3472	*
3473	* Failure to fsync any data file is cause for immediate panic, unless
3474	* data_sync_retry is enabled. Data may have been written to the operating
3475	* system and removed from our buffer pool already, and if we are running on
3476	* an operating system that forgets dirty data on write-back failure, there
3477	* may be only one copy of the data remaining: in the WAL. A later attempt to
3478	* fsync again might falsely report success. Therefore we must not allow any
3479	* further checkpoints to be attempted. data_sync_retry can in theory be
3480	* enabled on systems known not to drop dirty buffered data on write-back
3481	* failure (with the likely outcome that checkpoints will continue to fail
3482	* until the underlying problem is fixed).
3483	*
3484	* Any code that reports a failure from fsync() or related functions should
3485	* filter the error level with this function.
3486	*/
3487	int
3488	data_sync_elevel(int elevel)
3489	{
3490	return data_sync_retry ? elevel : PANIC;
3491	}
3492

Browse the source code of PostgreSQL/src/backend/storage/file/fd.c