1/*-------------------------------------------------------------------------
2 *
3 * fd.c
4 * Virtual file descriptor code.
5 *
6 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 * IDENTIFICATION
10 * src/backend/storage/file/fd.c
11 *
12 * NOTES:
13 *
14 * This code manages a cache of 'virtual' file descriptors (VFDs).
15 * The server opens many file descriptors for a variety of reasons,
16 * including base tables, scratch files (e.g., sort and hash spool
17 * files), and random calls to C library routines like system(3); it
18 * is quite easy to exceed system limits on the number of open files a
19 * single process can have. (This is around 1024 on many modern
20 * operating systems, but may be lower on others.)
21 *
 * VFDs are managed as an LRU pool, with actual OS file descriptors
 * being opened and closed as needed. Obviously, if a file is
 * opened using these interfaces, all subsequent operations on it must
 * also go through these interfaces (the File type is not a real file
 * descriptor).
27 *
28 * For this scheme to work, most (if not all) routines throughout the
29 * server should use these interfaces instead of calling the C library
30 * routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
31 * may find ourselves short of real file descriptors anyway.
32 *
33 * INTERFACE ROUTINES
34 *
35 * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
36 * A File opened with OpenTemporaryFile is automatically deleted when the
37 * File is closed, either explicitly or implicitly at end of transaction or
38 * process exit. PathNameOpenFile is intended for files that are held open
39 * for a long time, like relation files. It is the caller's responsibility
40 * to close them, there is no automatic mechanism in fd.c for that.
41 *
42 * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage
43 * temporary files that have names so that they can be shared between
44 * backends. Such files are automatically closed and count against the
45 * temporary file limit of the backend that creates them, but unlike anonymous
46 * files they are not automatically deleted. See sharedfileset.c for a shared
47 * ownership mechanism that provides automatic cleanup for shared files when
48 * the last of a group of backends detaches.
49 *
50 * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
51 * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
52 * They behave like the corresponding native functions, except that the handle
53 * is registered with the current subtransaction, and will be automatically
54 * closed at abort. These are intended mainly for short operations like
55 * reading a configuration file; there is a limit on the number of files that
56 * can be opened using these functions at any one time.
57 *
58 * Finally, BasicOpenFile is just a thin wrapper around open() that can
59 * release file descriptors in use by the virtual file descriptors if
60 * necessary. There is no automatic cleanup of file descriptors returned by
61 * BasicOpenFile, it is solely the caller's responsibility to close the file
62 * descriptor by calling close(2).
63 *
64 *-------------------------------------------------------------------------
65 */
66
67#include "postgres.h"
68
69#include <sys/file.h>
70#include <sys/param.h>
71#include <sys/stat.h>
72#ifndef WIN32
73#include <sys/mman.h>
74#endif
75#include <limits.h>
76#include <unistd.h>
77#include <fcntl.h>
78#ifdef HAVE_SYS_RESOURCE_H
79#include <sys/resource.h> /* for getrlimit */
80#endif
81
82#include "miscadmin.h"
83#include "access/xact.h"
84#include "access/xlog.h"
85#include "catalog/pg_tablespace.h"
86#include "common/file_perm.h"
87#include "pgstat.h"
88#include "portability/mem.h"
89#include "storage/fd.h"
90#include "storage/ipc.h"
91#include "utils/guc.h"
92#include "utils/resowner_private.h"
93
94
/*
 * PG_FLUSH_DATA_WORKS is defined (as 1) whenever at least one of the
 * pg_flush_data() implementations can be compiled on this platform:
 * sync_file_range(), msync(MS_ASYNC) on non-Windows systems, or
 * posix_fadvise(POSIX_FADV_DONTNEED).
 */
#if defined(HAVE_SYNC_FILE_RANGE) || \
    (!defined(WIN32) && defined(MS_ASYNC)) || \
    (defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED))
#define PG_FLUSH_DATA_WORKS 1
#endif
103
/*
 * We must leave some file descriptors free for system(), the dynamic loader,
 * and other code that tries to open files without consulting fd.c. This
 * is the number left free. (While we can be pretty sure we won't get
 * EMFILE, there's never any guarantee that we won't get ENFILE due to
 * other processes chewing up FDs. So it's a bad idea to try to open files
 * without consulting fd.c. Nonetheless we cannot control all code.)
 *
 * Because this is just a fixed setting, we are effectively assuming that
 * no such code will leave FDs open over the long term; otherwise the slop
 * is likely to be insufficient. Note in particular that we expect that
 * loading a shared library does not result in any permanent increase in
 * the number of open files. (This appears to be true on most if not
 * all platforms as of Feb 2004.)
 *
 * set_max_safe_fds() subtracts this amount from the computed FD budget.
 */
#define NUM_RESERVED_FDS 10

/*
 * If we have fewer than this many usable FDs after allowing for the reserved
 * ones, choke: set_max_safe_fds() reports a FATAL error in that case.
 */
#define FD_MINFREE 10
126
/*
 * A number of platforms allow individual processes to open many more files
 * than they can really support when *many* processes do the same thing.
 * This GUC parameter lets the DBA limit max_safe_fds to something less than
 * what the postmaster's initial probe suggests will work.
 *
 * It is also used as the probe limit passed to count_usable_fds().
 */
int max_files_per_process = 1000;

/*
 * Maximum number of file descriptors to open for either VFD entries or
 * AllocateFile/AllocateDir/OpenTransientFile operations. This is initialized
 * to a conservative value, and remains that way indefinitely in bootstrap or
 * standalone-backend cases. In normal postmaster operation, the postmaster
 * calls set_max_safe_fds() late in initialization to update the value, and
 * that value is then inherited by forked subprocesses.
 *
 * Note: the value of max_files_per_process is taken into account while
 * setting this variable, and so need not be tested separately.
 */
int max_safe_fds = 32; /* default if not changed */

/* Whether it is safe to continue running after fsync() fails. */
bool data_sync_retry = false;
150
/*
 * Debugging support.
 *
 * DO_DB(A) runs the statement A only when FDDEBUG is defined, saving errno
 * beforehand and restoring it afterwards so that the debugging code cannot
 * disturb the errno seen by the surrounding code. Without FDDEBUG the macro
 * expands to a no-op and A is not evaluated at all.
 */

#ifdef FDDEBUG
#define DO_DB(A) \
    do { \
        int _do_db_save_errno = errno; \
        A; \
        errno = _do_db_save_errno; \
    } while (0)
#else
#define DO_DB(A) \
    ((void) 0)
#endif
164
/* Value stored in Vfd.fd when the VFD has no kernel FD assigned */
#define VFD_CLOSED (-1)

/*
 * A File is valid if it is a non-zero index inside the VfdCache array
 * (slot 0 is only a list header) whose entry has a file name assigned.
 */
#define FileIsValid(file) \
    ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)

/* True if the VFD currently has no kernel FD; does not validate "file" */
#define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)

/* these are the assigned bits in fdstate below: */
#define FD_DELETE_AT_CLOSE (1 << 0) /* T = delete when closed */
#define FD_CLOSE_AT_EOXACT (1 << 1) /* T = close at eoXact */
#define FD_TEMP_FILE_LIMIT (1 << 2) /* T = respect temp_file_limit */
176
/*
 * One entry of the virtual file descriptor cache. A "File" value is an
 * index into the array of these entries.
 */
typedef struct vfd
{
    int fd; /* current FD, or VFD_CLOSED if none */
    unsigned short fdstate; /* bitflags for VFD's state (FD_* bits) */
    ResourceOwner resowner; /* owner, for automatic cleanup */
    File nextFree; /* link to next free VFD, if in freelist */
    File lruMoreRecently; /* doubly linked recency-of-use list */
    File lruLessRecently;
    off_t fileSize; /* current size of file (0 if not temporary) */
    char *fileName; /* name of file, or NULL for unused VFD */
    /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
    int fileFlags; /* open(2) flags for (re)opening the file */
    mode_t fileMode; /* mode to pass to open(2) */
} Vfd;
191
/*
 * Virtual File Descriptor array pointer and size. This grows as
 * needed. 'File' values are indexes into this array.
 * Note that VfdCache[0] is not a usable VFD, just a list header.
 * SizeVfdCache stays 0 until InitFileAccess() has created the header entry.
 */
static Vfd *VfdCache;
static Size SizeVfdCache = 0;

/*
 * Number of file descriptors known to be in use by VFD entries.
 * Incremented when LruInsert() opens a kernel FD and decremented when
 * LruDelete() closes one.
 */
static int nfile = 0;

/*
 * Flag to tell whether it's worth scanning VfdCache looking for temp files
 * to close
 */
static bool have_xact_temporary_files = false;

/*
 * Tracks the total size of all temporary files. Note: when temp_file_limit
 * is being enforced, this cannot overflow since the limit cannot be more
 * than INT_MAX kilobytes. When not enforcing, it could theoretically
 * overflow, but we don't care.
 */
static uint64 temporary_files_size = 0;
218
/*
 * List of OS handles opened with AllocateFile, AllocateDir and
 * OpenTransientFile.
 */

/* Discriminator telling which member of AllocateDesc.desc is in use */
typedef enum
{
    AllocateDescFile,
    AllocateDescPipe,
    AllocateDescDir,
    AllocateDescRawFD
} AllocateDescKind;

/* One tracked OS handle, tagged with the subtransaction that created it */
typedef struct
{
    AllocateDescKind kind;
    SubTransactionId create_subid;
    union
    {
        FILE *file;
        DIR *dir;
        int fd;
    } desc;
} AllocateDesc;

/* Bookkeeping for the allocatedDescs array: entries in use, capacity, storage */
static int numAllocatedDescs = 0;
static int maxAllocatedDescs = 0;
static AllocateDesc *allocatedDescs = NULL;
246
/*
 * Number of temporary files opened during the current session;
 * this is used in generation of tempfile names.
 */
static long tempFileCounter = 0;

/*
 * Array of OIDs of temp tablespaces. When numTempTableSpaces is -1,
 * this has not been set in the current transaction.
 */
static Oid *tempTableSpaces = NULL;
static int numTempTableSpaces = -1;
static int nextTempTableSpace = 0;
260
261
/*--------------------
 *
 * Private Routines
 *
 * Delete          - delete a file from the Lru ring
 * LruDelete       - remove a file from the Lru ring and close its FD
 * Insert          - put a file at the front of the Lru ring
 * LruInsert       - put a file at the front of the Lru ring and open it
 * ReleaseLruFile  - Release an fd by closing the last entry in the Lru ring
 * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
 * AllocateVfd     - grab a free (or new) file record (from VfdCache)
 * FreeVfd         - free a file record
 *
 * The Least Recently Used ring is a doubly linked list that begins and
 * ends on element zero. Element zero is special -- it doesn't represent
 * a file and its "fd" field always == VFD_CLOSED. Element zero is just an
 * anchor that shows us the beginning/end of the ring.
 * Only VFD elements that are currently really open (have an FD assigned) are
 * in the Lru ring. Elements that are "virtually" open can be recognized
 * by having a non-null fileName field.
 *
 * example:
 *
 *     /--less----\                /---------\
 *     v           \               v          \
 *   #0 --more---> LeastRecentlyUsed --more-\  \
 *    ^\                                    |  |
 *     \\less--> MostRecentlyUsedFile   <---/  |
 *      \more---/                    \--less--/
 *
 *--------------------
 */
static void Delete(File file);
static void LruDelete(File file);
static void Insert(File file);
static int LruInsert(File file);
static bool ReleaseLruFile(void);
static void ReleaseLruFiles(void);
static File AllocateVfd(void);
static void FreeVfd(File file);

/* Other file-local helpers, defined further down in this file */
static int FileAccess(File file);
static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
static bool reserveAllocatedDesc(void);
static int FreeDesc(AllocateDesc *desc);

static void AtProcExit_Files(int code, Datum arg);
static void CleanupTempFiles(bool isCommit, bool isProcExit);
static void RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok,
                                   bool unlink_all);
static void RemovePgTempRelationFiles(const char *tsdirname);
static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);

/* Directory walker and the per-entry actions that can be passed to it */
static void walkdir(const char *path,
                    void (*action) (const char *fname, bool isdir, int elevel),
                    bool process_symlinks,
                    int elevel);
#ifdef PG_FLUSH_DATA_WORKS
static void pre_sync_fname(const char *fname, bool isdir, int elevel);
#endif
static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);

/* fsync helpers that return a status code and log at a caller-chosen level */
static int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel);
static int fsync_parent_path(const char *fname, int elevel);
327
328
/*
 * pg_fsync --- do fsync with or without writethrough
 *
 * Dispatches to pg_fsync_writethrough() when the platform offers a distinct
 * write-through flush and sync_method selects it; in every other case the
 * request goes to pg_fsync_no_writethrough(). Returns whatever the chosen
 * routine returns.
 */
int
pg_fsync(int fd)
{
    /* The sync_method test is compiled only where it can make a difference */
#if defined(HAVE_FSYNC_WRITETHROUGH) && !defined(FSYNC_WRITETHROUGH_IS_FSYNC)
    if (sync_method == SYNC_METHOD_FSYNC_WRITETHROUGH)
        return pg_fsync_writethrough(fd);
#endif

    return pg_fsync_no_writethrough(fd);
}
343
344
345/*
346 * pg_fsync_no_writethrough --- same as fsync except does nothing if
347 * enableFsync is off
348 */
349int
350pg_fsync_no_writethrough(int fd)
351{
352 if (enableFsync)
353 return fsync(fd);
354 else
355 return 0;
356}
357
358/*
359 * pg_fsync_writethrough
360 */
361int
362pg_fsync_writethrough(int fd)
363{
364 if (enableFsync)
365 {
366#ifdef WIN32
367 return _commit(fd);
368#elif defined(F_FULLFSYNC)
369 return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
370#else
371 errno = ENOSYS;
372 return -1;
373#endif
374 }
375 else
376 return 0;
377}
378
379/*
380 * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
381 *
382 * Not all platforms have fdatasync; treat as fsync if not available.
383 */
384int
385pg_fdatasync(int fd)
386{
387 if (enableFsync)
388 {
389#ifdef HAVE_FDATASYNC
390 return fdatasync(fd);
391#else
392 return fsync(fd);
393#endif
394 }
395 else
396 return 0;
397}
398
399/*
400 * pg_flush_data --- advise OS that the described dirty data should be flushed
401 *
402 * offset of 0 with nbytes 0 means that the entire file should be flushed
403 */
404void
405pg_flush_data(int fd, off_t offset, off_t nbytes)
406{
407 /*
408 * Right now file flushing is primarily used to avoid making later
409 * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
410 * if fsyncs are disabled - that's a decision we might want to make
411 * configurable at some point.
412 */
413 if (!enableFsync)
414 return;
415
416 /*
417 * We compile all alternatives that are supported on the current platform,
418 * to find portability problems more easily.
419 */
420#if defined(HAVE_SYNC_FILE_RANGE)
421 {
422 int rc;
423 static bool not_implemented_by_kernel = false;
424
425 if (not_implemented_by_kernel)
426 return;
427
428 /*
429 * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
430 * tells the OS that writeback for the specified blocks should be
431 * started, but that we don't want to wait for completion. Note that
432 * this call might block if too much dirty data exists in the range.
433 * This is the preferable method on OSs supporting it, as it works
434 * reliably when available (contrast to msync()) and doesn't flush out
435 * clean data (like FADV_DONTNEED).
436 */
437 rc = sync_file_range(fd, offset, nbytes,
438 SYNC_FILE_RANGE_WRITE);
439 if (rc != 0)
440 {
441 int elevel;
442
443 /*
444 * For systems that don't have an implementation of
445 * sync_file_range() such as Windows WSL, generate only one
446 * warning and then suppress all further attempts by this process.
447 */
448 if (errno == ENOSYS)
449 {
450 elevel = WARNING;
451 not_implemented_by_kernel = true;
452 }
453 else
454 elevel = data_sync_elevel(WARNING);
455
456 ereport(elevel,
457 (errcode_for_file_access(),
458 errmsg("could not flush dirty data: %m")));
459 }
460
461 return;
462 }
463#endif
464#if !defined(WIN32) && defined(MS_ASYNC)
465 {
466 void *p;
467 static int pagesize = 0;
468
469 /*
470 * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
471 * writeback. On linux it only does so if MS_SYNC is specified, but
472 * then it does the writeback synchronously. Luckily all common linux
473 * systems have sync_file_range(). This is preferable over
474 * FADV_DONTNEED because it doesn't flush out clean data.
475 *
476 * We map the file (mmap()), tell the kernel to sync back the contents
477 * (msync()), and then remove the mapping again (munmap()).
478 */
479
480 /* mmap() needs actual length if we want to map whole file */
481 if (offset == 0 && nbytes == 0)
482 {
483 nbytes = lseek(fd, 0, SEEK_END);
484 if (nbytes < 0)
485 {
486 ereport(WARNING,
487 (errcode_for_file_access(),
488 errmsg("could not determine dirty data size: %m")));
489 return;
490 }
491 }
492
493 /*
494 * Some platforms reject partial-page mmap() attempts. To deal with
495 * that, just truncate the request to a page boundary. If any extra
496 * bytes don't get flushed, well, it's only a hint anyway.
497 */
498
499 /* fetch pagesize only once */
500 if (pagesize == 0)
501 pagesize = sysconf(_SC_PAGESIZE);
502
503 /* align length to pagesize, dropping any fractional page */
504 if (pagesize > 0)
505 nbytes = (nbytes / pagesize) * pagesize;
506
507 /* fractional-page request is a no-op */
508 if (nbytes <= 0)
509 return;
510
511 /*
512 * mmap could well fail, particularly on 32-bit platforms where there
513 * may simply not be enough address space. If so, silently fall
514 * through to the next implementation.
515 */
516 if (nbytes <= (off_t) SSIZE_MAX)
517 p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
518 else
519 p = MAP_FAILED;
520
521 if (p != MAP_FAILED)
522 {
523 int rc;
524
525 rc = msync(p, (size_t) nbytes, MS_ASYNC);
526 if (rc != 0)
527 {
528 ereport(data_sync_elevel(WARNING),
529 (errcode_for_file_access(),
530 errmsg("could not flush dirty data: %m")));
531 /* NB: need to fall through to munmap()! */
532 }
533
534 rc = munmap(p, (size_t) nbytes);
535 if (rc != 0)
536 {
537 /* FATAL error because mapping would remain */
538 ereport(FATAL,
539 (errcode_for_file_access(),
540 errmsg("could not munmap() while flushing data: %m")));
541 }
542
543 return;
544 }
545 }
546#endif
547#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
548 {
549 int rc;
550
551 /*
552 * Signal the kernel that the passed in range should not be cached
553 * anymore. This has the, desired, side effect of writing out dirty
554 * data, and the, undesired, side effect of likely discarding useful
555 * clean cached blocks. For the latter reason this is the least
556 * preferable method.
557 */
558
559 rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
560
561 if (rc != 0)
562 {
563 /* don't error out, this is just a performance optimization */
564 ereport(WARNING,
565 (errcode_for_file_access(),
566 errmsg("could not flush dirty data: %m")));
567 }
568
569 return;
570 }
571#endif
572}
573
574
575/*
576 * fsync_fname -- fsync a file or directory, handling errors properly
577 *
578 * Try to fsync a file or directory. When doing the latter, ignore errors that
579 * indicate the OS just doesn't allow/require fsyncing directories.
580 */
581void
582fsync_fname(const char *fname, bool isdir)
583{
584 fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
585}
586
587/*
588 * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
589 *
590 * This routine ensures that, after returning, the effect of renaming file
591 * persists in case of a crash. A crash while this routine is running will
592 * leave you with either the pre-existing or the moved file in place of the
593 * new file; no mixed state or truncated files are possible.
594 *
595 * It does so by using fsync on the old filename and the possibly existing
596 * target filename before the rename, and the target file and directory after.
597 *
598 * Note that rename() cannot be used across arbitrary directories, as they
599 * might not be on the same filesystem. Therefore this routine does not
600 * support renaming across directories.
601 *
602 * Log errors with the caller specified severity.
603 *
604 * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
605 * valid upon return.
606 */
607int
608durable_rename(const char *oldfile, const char *newfile, int elevel)
609{
610 int fd;
611
612 /*
613 * First fsync the old and target path (if it exists), to ensure that they
614 * are properly persistent on disk. Syncing the target file is not
615 * strictly necessary, but it makes it easier to reason about crashes;
616 * because it's then guaranteed that either source or target file exists
617 * after a crash.
618 */
619 if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
620 return -1;
621
622 fd = OpenTransientFile(newfile, PG_BINARY | O_RDWR);
623 if (fd < 0)
624 {
625 if (errno != ENOENT)
626 {
627 ereport(elevel,
628 (errcode_for_file_access(),
629 errmsg("could not open file \"%s\": %m", newfile)));
630 return -1;
631 }
632 }
633 else
634 {
635 if (pg_fsync(fd) != 0)
636 {
637 int save_errno;
638
639 /* close file upon error, might not be in transaction context */
640 save_errno = errno;
641 CloseTransientFile(fd);
642 errno = save_errno;
643
644 ereport(elevel,
645 (errcode_for_file_access(),
646 errmsg("could not fsync file \"%s\": %m", newfile)));
647 return -1;
648 }
649
650 if (CloseTransientFile(fd))
651 {
652 ereport(elevel,
653 (errcode_for_file_access(),
654 errmsg("could not close file \"%s\": %m", newfile)));
655 return -1;
656 }
657 }
658
659 /* Time to do the real deal... */
660 if (rename(oldfile, newfile) < 0)
661 {
662 ereport(elevel,
663 (errcode_for_file_access(),
664 errmsg("could not rename file \"%s\" to \"%s\": %m",
665 oldfile, newfile)));
666 return -1;
667 }
668
669 /*
670 * To guarantee renaming the file is persistent, fsync the file with its
671 * new name, and its containing directory.
672 */
673 if (fsync_fname_ext(newfile, false, false, elevel) != 0)
674 return -1;
675
676 if (fsync_parent_path(newfile, elevel) != 0)
677 return -1;
678
679 return 0;
680}
681
/*
 * durable_unlink -- remove a file in a durable manner
 *
 * After this routine returns successfully, the removal survives a crash.
 * This is achieved by fsyncing the parent directory of the file once the
 * unlink itself has been done.
 *
 * Errors are logged with the severity specified by the caller (elevel).
 *
 * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
 * valid upon return.
 */
int
durable_unlink(const char *fname, int elevel)
{
    int rc = unlink(fname);

    if (rc < 0)
    {
        ereport(elevel,
                (errcode_for_file_access(),
                 errmsg("could not remove file \"%s\": %m",
                        fname)));
        return -1;
    }

    /* The directory entry is gone; persist that by syncing the parent */
    return (fsync_parent_path(fname, elevel) != 0) ? -1 : 0;
}
718
/*
 * durable_link_or_rename -- rename a file in a durable manner.
 *
 * Similar to durable_rename(), except that this routine tries (but does not
 * guarantee) not to overwrite the target file: where HAVE_WORKING_LINK is
 * set it uses link() followed by unlink() of the old name, otherwise it
 * falls back to a plain rename().
 *
 * Note that a crash in an unfortunate moment can leave you with two links to
 * the target file.
 *
 * Errors are logged with the caller-specified severity (elevel).
 *
 * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
 * valid upon return.
 */
int
durable_link_or_rename(const char *oldfile, const char *newfile, int elevel)
{
    /*
     * Flush the source first so that, should we crash right after the
     * rename/link, a file with valid contents has been moved into place.
     */
    if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
        return -1;

#if HAVE_WORKING_LINK
    if (link(oldfile, newfile) < 0)
    {
        ereport(elevel,
                (errcode_for_file_access(),
                 errmsg("could not link file \"%s\" to \"%s\": %m",
                        oldfile, newfile)));
        return -1;
    }
    /* drop the old name; the result of unlink() is not checked */
    unlink(oldfile);
#else
    /* XXX: Add racy file existence check? */
    if (rename(oldfile, newfile) < 0)
    {
        ereport(elevel,
                (errcode_for_file_access(),
                 errmsg("could not rename file \"%s\" to \"%s\": %m",
                        oldfile, newfile)));
        return -1;
    }
#endif

    /*
     * Make the change persistent in case of an OS crash: flush the new
     * entry first ...
     */
    if (fsync_fname_ext(newfile, false, false, elevel) != 0)
        return -1;

    /* ... and then its parent directory */
    return (fsync_parent_path(newfile, elevel) != 0) ? -1 : 0;
}
778
779/*
780 * InitFileAccess --- initialize this module during backend startup
781 *
782 * This is called during either normal or standalone backend start.
783 * It is *not* called in the postmaster.
784 */
785void
786InitFileAccess(void)
787{
788 Assert(SizeVfdCache == 0); /* call me only once */
789
790 /* initialize cache header entry */
791 VfdCache = (Vfd *) malloc(sizeof(Vfd));
792 if (VfdCache == NULL)
793 ereport(FATAL,
794 (errcode(ERRCODE_OUT_OF_MEMORY),
795 errmsg("out of memory")));
796
797 MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
798 VfdCache->fd = VFD_CLOSED;
799
800 SizeVfdCache = 1;
801
802 /* register proc-exit hook to ensure temp files are dropped at exit */
803 on_proc_exit(AtProcExit_Files, 0);
804}
805
806/*
807 * count_usable_fds --- count how many FDs the system will let us open,
808 * and estimate how many are already open.
809 *
810 * We stop counting if usable_fds reaches max_to_probe. Note: a small
811 * value of max_to_probe might result in an underestimate of already_open;
812 * we must fill in any "gaps" in the set of used FDs before the calculation
813 * of already_open will give the right answer. In practice, max_to_probe
814 * of a couple of dozen should be enough to ensure good results.
815 *
816 * We assume stdin (FD 0) is available for dup'ing
817 */
818static void
819count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
820{
821 int *fd;
822 int size;
823 int used = 0;
824 int highestfd = 0;
825 int j;
826
827#ifdef HAVE_GETRLIMIT
828 struct rlimit rlim;
829 int getrlimit_status;
830#endif
831
832 size = 1024;
833 fd = (int *) palloc(size * sizeof(int));
834
835#ifdef HAVE_GETRLIMIT
836#ifdef RLIMIT_NOFILE /* most platforms use RLIMIT_NOFILE */
837 getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
838#else /* but BSD doesn't ... */
839 getrlimit_status = getrlimit(RLIMIT_OFILE, &rlim);
840#endif /* RLIMIT_NOFILE */
841 if (getrlimit_status != 0)
842 ereport(WARNING, (errmsg("getrlimit failed: %m")));
843#endif /* HAVE_GETRLIMIT */
844
845 /* dup until failure or probe limit reached */
846 for (;;)
847 {
848 int thisfd;
849
850#ifdef HAVE_GETRLIMIT
851
852 /*
853 * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
854 * some platforms
855 */
856 if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
857 break;
858#endif
859
860 thisfd = dup(0);
861 if (thisfd < 0)
862 {
863 /* Expect EMFILE or ENFILE, else it's fishy */
864 if (errno != EMFILE && errno != ENFILE)
865 elog(WARNING, "dup(0) failed after %d successes: %m", used);
866 break;
867 }
868
869 if (used >= size)
870 {
871 size *= 2;
872 fd = (int *) repalloc(fd, size * sizeof(int));
873 }
874 fd[used++] = thisfd;
875
876 if (highestfd < thisfd)
877 highestfd = thisfd;
878
879 if (used >= max_to_probe)
880 break;
881 }
882
883 /* release the files we opened */
884 for (j = 0; j < used; j++)
885 close(fd[j]);
886
887 pfree(fd);
888
889 /*
890 * Return results. usable_fds is just the number of successful dups. We
891 * assume that the system limit is highestfd+1 (remember 0 is a legal FD
892 * number) and so already_open is highestfd+1 - usable_fds.
893 */
894 *usable_fds = used;
895 *already_open = highestfd + 1 - used;
896}
897
898/*
899 * set_max_safe_fds
900 * Determine number of filedescriptors that fd.c is allowed to use
901 */
902void
903set_max_safe_fds(void)
904{
905 int usable_fds;
906 int already_open;
907
908 /*----------
909 * We want to set max_safe_fds to
910 * MIN(usable_fds, max_files_per_process - already_open)
911 * less the slop factor for files that are opened without consulting
912 * fd.c. This ensures that we won't exceed either max_files_per_process
913 * or the experimentally-determined EMFILE limit.
914 *----------
915 */
916 count_usable_fds(max_files_per_process,
917 &usable_fds, &already_open);
918
919 max_safe_fds = Min(usable_fds, max_files_per_process - already_open);
920
921 /*
922 * Take off the FDs reserved for system() etc.
923 */
924 max_safe_fds -= NUM_RESERVED_FDS;
925
926 /*
927 * Make sure we still have enough to get by.
928 */
929 if (max_safe_fds < FD_MINFREE)
930 ereport(FATAL,
931 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
932 errmsg("insufficient file descriptors available to start server process"),
933 errdetail("System allows %d, we need at least %d.",
934 max_safe_fds + NUM_RESERVED_FDS,
935 FD_MINFREE + NUM_RESERVED_FDS)));
936
937 elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
938 max_safe_fds, usable_fds, already_open);
939}
940
941/*
942 * Open a file with BasicOpenFilePerm() and pass default file mode for the
943 * fileMode parameter.
944 */
945int
946BasicOpenFile(const char *fileName, int fileFlags)
947{
948 return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
949}
950
951/*
952 * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
953 *
954 * This is exported for use by places that really want a plain kernel FD,
955 * but need to be proof against running out of FDs. Once an FD has been
956 * successfully returned, it is the caller's responsibility to ensure that
957 * it will not be leaked on ereport()! Most users should *not* call this
958 * routine directly, but instead use the VFD abstraction level, which
959 * provides protection against descriptor leaks as well as management of
960 * files that need to be open for more than a short period of time.
961 *
962 * Ideally this should be the *only* direct call of open() in the backend.
963 * In practice, the postmaster calls open() directly, and there are some
964 * direct open() calls done early in backend startup. Those are OK since
965 * this module wouldn't have any open files to close at that point anyway.
966 */
967int
968BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
969{
970 int fd;
971
972tryAgain:
973 fd = open(fileName, fileFlags, fileMode);
974
975 if (fd >= 0)
976 return fd; /* success! */
977
978 if (errno == EMFILE || errno == ENFILE)
979 {
980 int save_errno = errno;
981
982 ereport(LOG,
983 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
984 errmsg("out of file descriptors: %m; release and retry")));
985 errno = 0;
986 if (ReleaseLruFile())
987 goto tryAgain;
988 errno = save_errno;
989 }
990
991 return -1; /* failure */
992}
993
#if defined(FDDEBUG)

/*
 * _dump_lru --- debugging aid: log the LRU ring as a list of VFD indexes,
 * walking from the most recently used entry towards the least recently
 * used one until the header (index 0) is reached again.
 */
static void
_dump_lru(void)
{
    int cur = VfdCache[0].lruLessRecently;
    char buf[2048];
    size_t len;

    snprintf(buf, sizeof(buf), "LRU: MOST %d ", cur);
    while (cur != 0)
    {
        cur = VfdCache[cur].lruLessRecently;
        len = strlen(buf);
        snprintf(buf + len, sizeof(buf) - len, "%d ", cur);
    }
    len = strlen(buf);
    snprintf(buf + len, sizeof(buf) - len, "LEAST");
    elog(LOG, "%s", buf);
}
#endif /* FDDEBUG */
1014
1015static void
1016Delete(File file)
1017{
1018 Vfd *vfdP;
1019
1020 Assert(file != 0);
1021
1022 DO_DB(elog(LOG, "Delete %d (%s)",
1023 file, VfdCache[file].fileName));
1024 DO_DB(_dump_lru());
1025
1026 vfdP = &VfdCache[file];
1027
1028 VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
1029 VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
1030
1031 DO_DB(_dump_lru());
1032}
1033
1034static void
1035LruDelete(File file)
1036{
1037 Vfd *vfdP;
1038
1039 Assert(file != 0);
1040
1041 DO_DB(elog(LOG, "LruDelete %d (%s)",
1042 file, VfdCache[file].fileName));
1043
1044 vfdP = &VfdCache[file];
1045
1046 /*
1047 * Close the file. We aren't expecting this to fail; if it does, better
1048 * to leak the FD than to mess up our internal state.
1049 */
1050 if (close(vfdP->fd))
1051 elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
1052 "could not close file \"%s\": %m", vfdP->fileName);
1053 vfdP->fd = VFD_CLOSED;
1054 --nfile;
1055
1056 /* delete the vfd record from the LRU ring */
1057 Delete(file);
1058}
1059
1060static void
1061Insert(File file)
1062{
1063 Vfd *vfdP;
1064
1065 Assert(file != 0);
1066
1067 DO_DB(elog(LOG, "Insert %d (%s)",
1068 file, VfdCache[file].fileName));
1069 DO_DB(_dump_lru());
1070
1071 vfdP = &VfdCache[file];
1072
1073 vfdP->lruMoreRecently = 0;
1074 vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
1075 VfdCache[0].lruLessRecently = file;
1076 VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
1077
1078 DO_DB(_dump_lru());
1079}
1080
1081/* returns 0 on success, -1 on re-open failure (with errno set) */
1082static int
1083LruInsert(File file)
1084{
1085 Vfd *vfdP;
1086
1087 Assert(file != 0);
1088
1089 DO_DB(elog(LOG, "LruInsert %d (%s)",
1090 file, VfdCache[file].fileName));
1091
1092 vfdP = &VfdCache[file];
1093
1094 if (FileIsNotOpen(file))
1095 {
1096 /* Close excess kernel FDs. */
1097 ReleaseLruFiles();
1098
1099 /*
1100 * The open could still fail for lack of file descriptors, eg due to
1101 * overall system file table being full. So, be prepared to release
1102 * another FD if necessary...
1103 */
1104 vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
1105 vfdP->fileMode);
1106 if (vfdP->fd < 0)
1107 {
1108 DO_DB(elog(LOG, "re-open failed: %m"));
1109 return -1;
1110 }
1111 else
1112 {
1113 ++nfile;
1114 }
1115 }
1116
1117 /*
1118 * put it at the head of the Lru ring
1119 */
1120
1121 Insert(file);
1122
1123 return 0;
1124}
1125
1126/*
1127 * Release one kernel FD by closing the least-recently-used VFD.
1128 */
1129static bool
1130ReleaseLruFile(void)
1131{
1132 DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
1133
1134 if (nfile > 0)
1135 {
1136 /*
1137 * There are opened files and so there should be at least one used vfd
1138 * in the ring.
1139 */
1140 Assert(VfdCache[0].lruMoreRecently != 0);
1141 LruDelete(VfdCache[0].lruMoreRecently);
1142 return true; /* freed a file */
1143 }
1144 return false; /* no files available to free */
1145}
1146
1147/*
1148 * Release kernel FDs as needed to get under the max_safe_fds limit.
1149 * After calling this, it's OK to try to open another file.
1150 */
1151static void
1152ReleaseLruFiles(void)
1153{
1154 while (nfile + numAllocatedDescs >= max_safe_fds)
1155 {
1156 if (!ReleaseLruFile())
1157 break;
1158 }
1159}
1160
1161static File
1162AllocateVfd(void)
1163{
1164 Index i;
1165 File file;
1166
1167 DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
1168
1169 Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
1170
1171 if (VfdCache[0].nextFree == 0)
1172 {
1173 /*
1174 * The free list is empty so it is time to increase the size of the
1175 * array. We choose to double it each time this happens. However,
1176 * there's not much point in starting *real* small.
1177 */
1178 Size newCacheSize = SizeVfdCache * 2;
1179 Vfd *newVfdCache;
1180
1181 if (newCacheSize < 32)
1182 newCacheSize = 32;
1183
1184 /*
1185 * Be careful not to clobber VfdCache ptr if realloc fails.
1186 */
1187 newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
1188 if (newVfdCache == NULL)
1189 ereport(ERROR,
1190 (errcode(ERRCODE_OUT_OF_MEMORY),
1191 errmsg("out of memory")));
1192 VfdCache = newVfdCache;
1193
1194 /*
1195 * Initialize the new entries and link them into the free list.
1196 */
1197 for (i = SizeVfdCache; i < newCacheSize; i++)
1198 {
1199 MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
1200 VfdCache[i].nextFree = i + 1;
1201 VfdCache[i].fd = VFD_CLOSED;
1202 }
1203 VfdCache[newCacheSize - 1].nextFree = 0;
1204 VfdCache[0].nextFree = SizeVfdCache;
1205
1206 /*
1207 * Record the new size
1208 */
1209 SizeVfdCache = newCacheSize;
1210 }
1211
1212 file = VfdCache[0].nextFree;
1213
1214 VfdCache[0].nextFree = VfdCache[file].nextFree;
1215
1216 return file;
1217}
1218
1219static void
1220FreeVfd(File file)
1221{
1222 Vfd *vfdP = &VfdCache[file];
1223
1224 DO_DB(elog(LOG, "FreeVfd: %d (%s)",
1225 file, vfdP->fileName ? vfdP->fileName : ""));
1226
1227 if (vfdP->fileName != NULL)
1228 {
1229 free(vfdP->fileName);
1230 vfdP->fileName = NULL;
1231 }
1232 vfdP->fdstate = 0x0;
1233
1234 vfdP->nextFree = VfdCache[0].nextFree;
1235 VfdCache[0].nextFree = file;
1236}
1237
1238/* returns 0 on success, -1 on re-open failure (with errno set) */
1239static int
1240FileAccess(File file)
1241{
1242 int returnValue;
1243
1244 DO_DB(elog(LOG, "FileAccess %d (%s)",
1245 file, VfdCache[file].fileName));
1246
1247 /*
1248 * Is the file open? If not, open it and put it at the head of the LRU
1249 * ring (possibly closing the least recently used file to get an FD).
1250 */
1251
1252 if (FileIsNotOpen(file))
1253 {
1254 returnValue = LruInsert(file);
1255 if (returnValue != 0)
1256 return returnValue;
1257 }
1258 else if (VfdCache[0].lruLessRecently != file)
1259 {
1260 /*
1261 * We now know that the file is open and that it is not the last one
1262 * accessed, so we need to move it to the head of the Lru ring.
1263 */
1264
1265 Delete(file);
1266 Insert(file);
1267 }
1268
1269 return 0;
1270}
1271
1272/*
1273 * Called whenever a temporary file is deleted to report its size.
1274 */
1275static void
1276ReportTemporaryFileUsage(const char *path, off_t size)
1277{
1278 pgstat_report_tempfile(size);
1279
1280 if (log_temp_files >= 0)
1281 {
1282 if ((size / 1024) >= log_temp_files)
1283 ereport(LOG,
1284 (errmsg("temporary file: path \"%s\", size %lu",
1285 path, (unsigned long) size)));
1286 }
1287}
1288
1289/*
1290 * Called to register a temporary file for automatic close.
1291 * ResourceOwnerEnlargeFiles(CurrentResourceOwner) must have been called
1292 * before the file was opened.
1293 */
1294static void
1295RegisterTemporaryFile(File file)
1296{
1297 ResourceOwnerRememberFile(CurrentResourceOwner, file);
1298 VfdCache[file].resowner = CurrentResourceOwner;
1299
1300 /* Backup mechanism for closing at end of xact. */
1301 VfdCache[file].fdstate |= FD_CLOSE_AT_EOXACT;
1302 have_xact_temporary_files = true;
1303}
1304
/*
 * Called when we get a shared invalidation message on some relation.
 *
 * Closes the kernel descriptor of the VFD if it has one; the VFD itself
 * stays valid.  (Compiled only when NOT_USED is defined.)
 */
#ifdef NOT_USED
void
FileInvalidate(File file)
{
    Assert(FileIsValid(file));

    if (FileIsNotOpen(file))
        return;                 /* no kernel descriptor to give up */

    LruDelete(file);
}
#endif
1317
1318/*
1319 * Open a file with PathNameOpenFilePerm() and pass default file mode for the
1320 * fileMode parameter.
1321 */
1322File
1323PathNameOpenFile(const char *fileName, int fileFlags)
1324{
1325 return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1326}
1327
1328/*
1329 * open a file in an arbitrary directory
1330 *
1331 * NB: if the passed pathname is relative (which it usually is),
1332 * it will be interpreted relative to the process' working directory
1333 * (which should always be $PGDATA when this code is running).
1334 */
1335File
1336PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1337{
1338 char *fnamecopy;
1339 File file;
1340 Vfd *vfdP;
1341
1342 DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
1343 fileName, fileFlags, fileMode));
1344
1345 /*
1346 * We need a malloc'd copy of the file name; fail cleanly if no room.
1347 */
1348 fnamecopy = strdup(fileName);
1349 if (fnamecopy == NULL)
1350 ereport(ERROR,
1351 (errcode(ERRCODE_OUT_OF_MEMORY),
1352 errmsg("out of memory")));
1353
1354 file = AllocateVfd();
1355 vfdP = &VfdCache[file];
1356
1357 /* Close excess kernel FDs. */
1358 ReleaseLruFiles();
1359
1360 vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
1361
1362 if (vfdP->fd < 0)
1363 {
1364 int save_errno = errno;
1365
1366 FreeVfd(file);
1367 free(fnamecopy);
1368 errno = save_errno;
1369 return -1;
1370 }
1371 ++nfile;
1372 DO_DB(elog(LOG, "PathNameOpenFile: success %d",
1373 vfdP->fd));
1374
1375 Insert(file);
1376
1377 vfdP->fileName = fnamecopy;
1378 /* Saved flags are adjusted to be OK for re-opening file */
1379 vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
1380 vfdP->fileMode = fileMode;
1381 vfdP->fileSize = 0;
1382 vfdP->fdstate = 0x0;
1383 vfdP->resowner = NULL;
1384
1385 return file;
1386}
1387
1388/*
1389 * Create directory 'directory'. If necessary, create 'basedir', which must
1390 * be the directory above it. This is designed for creating the top-level
1391 * temporary directory on demand before creating a directory underneath it.
1392 * Do nothing if the directory already exists.
1393 *
1394 * Directories created within the top-level temporary directory should begin
1395 * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
1396 * deleted at startup by RemovePgTempFiles(). Further subdirectories below
1397 * that do not need any particular prefix.
1398*/
1399void
1400PathNameCreateTemporaryDir(const char *basedir, const char *directory)
1401{
1402 if (MakePGDirectory(directory) < 0)
1403 {
1404 if (errno == EEXIST)
1405 return;
1406
1407 /*
1408 * Failed. Try to create basedir first in case it's missing. Tolerate
1409 * EEXIST to close a race against another process following the same
1410 * algorithm.
1411 */
1412 if (MakePGDirectory(basedir) < 0 && errno != EEXIST)
1413 ereport(ERROR,
1414 (errcode_for_file_access(),
1415 errmsg("cannot create temporary directory \"%s\": %m",
1416 basedir)));
1417
1418 /* Try again. */
1419 if (MakePGDirectory(directory) < 0 && errno != EEXIST)
1420 ereport(ERROR,
1421 (errcode_for_file_access(),
1422 errmsg("cannot create temporary subdirectory \"%s\": %m",
1423 directory)));
1424 }
1425}
1426
1427/*
1428 * Delete a directory and everything in it, if it exists.
1429 */
1430void
1431PathNameDeleteTemporaryDir(const char *dirname)
1432{
1433 struct stat statbuf;
1434
1435 /* Silently ignore missing directory. */
1436 if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
1437 return;
1438
1439 /*
1440 * Currently, walkdir doesn't offer a way for our passed in function to
1441 * maintain state. Perhaps it should, so that we could tell the caller
1442 * whether this operation succeeded or failed. Since this operation is
1443 * used in a cleanup path, we wouldn't actually behave differently: we'll
1444 * just log failures.
1445 */
1446 walkdir(dirname, unlink_if_exists_fname, false, LOG);
1447}
1448
1449/*
1450 * Open a temporary file that will disappear when we close it.
1451 *
1452 * This routine takes care of generating an appropriate tempfile name.
1453 * There's no need to pass in fileFlags or fileMode either, since only
1454 * one setting makes any sense for a temp file.
1455 *
1456 * Unless interXact is true, the file is remembered by CurrentResourceOwner
1457 * to ensure it's closed and deleted when it's no longer needed, typically at
1458 * the end-of-transaction. In most cases, you don't want temporary files to
1459 * outlive the transaction that created them, so this should be false -- but
1460 * if you need "somewhat" temporary storage, this might be useful. In either
1461 * case, the file is removed when the File is explicitly closed.
1462 */
1463File
1464OpenTemporaryFile(bool interXact)
1465{
1466 File file = 0;
1467
1468 /*
1469 * Make sure the current resource owner has space for this File before we
1470 * open it, if we'll be registering it below.
1471 */
1472 if (!interXact)
1473 ResourceOwnerEnlargeFiles(CurrentResourceOwner);
1474
1475 /*
1476 * If some temp tablespace(s) have been given to us, try to use the next
1477 * one. If a given tablespace can't be found, we silently fall back to
1478 * the database's default tablespace.
1479 *
1480 * BUT: if the temp file is slated to outlive the current transaction,
1481 * force it into the database's default tablespace, so that it will not
1482 * pose a threat to possible tablespace drop attempts.
1483 */
1484 if (numTempTableSpaces > 0 && !interXact)
1485 {
1486 Oid tblspcOid = GetNextTempTableSpace();
1487
1488 if (OidIsValid(tblspcOid))
1489 file = OpenTemporaryFileInTablespace(tblspcOid, false);
1490 }
1491
1492 /*
1493 * If not, or if tablespace is bad, create in database's default
1494 * tablespace. MyDatabaseTableSpace should normally be set before we get
1495 * here, but just in case it isn't, fall back to pg_default tablespace.
1496 */
1497 if (file <= 0)
1498 file = OpenTemporaryFileInTablespace(MyDatabaseTableSpace ?
1499 MyDatabaseTableSpace :
1500 DEFAULTTABLESPACE_OID,
1501 true);
1502
1503 /* Mark it for deletion at close and temporary file size limit */
1504 VfdCache[file].fdstate |= FD_DELETE_AT_CLOSE | FD_TEMP_FILE_LIMIT;
1505
1506 /* Register it with the current resource owner */
1507 if (!interXact)
1508 RegisterTemporaryFile(file);
1509
1510 return file;
1511}
1512
1513/*
1514 * Return the path of the temp directory in a given tablespace.
1515 */
1516void
1517TempTablespacePath(char *path, Oid tablespace)
1518{
1519 /*
1520 * Identify the tempfile directory for this tablespace.
1521 *
1522 * If someone tries to specify pg_global, use pg_default instead.
1523 */
1524 if (tablespace == InvalidOid ||
1525 tablespace == DEFAULTTABLESPACE_OID ||
1526 tablespace == GLOBALTABLESPACE_OID)
1527 snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
1528 else
1529 {
1530 /* All other tablespaces are accessed via symlinks */
1531 snprintf(path, MAXPGPATH, "pg_tblspc/%u/%s/%s",
1532 tablespace, TABLESPACE_VERSION_DIRECTORY,
1533 PG_TEMP_FILES_DIR);
1534 }
1535}
1536
1537/*
1538 * Open a temporary file in a specific tablespace.
1539 * Subroutine for OpenTemporaryFile, which see for details.
1540 */
1541static File
1542OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
1543{
1544 char tempdirpath[MAXPGPATH];
1545 char tempfilepath[MAXPGPATH];
1546 File file;
1547
1548 TempTablespacePath(tempdirpath, tblspcOid);
1549
1550 /*
1551 * Generate a tempfile name that should be unique within the current
1552 * database instance.
1553 */
1554 snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
1555 tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
1556
1557 /*
1558 * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1559 * temp file that can be reused.
1560 */
1561 file = PathNameOpenFile(tempfilepath,
1562 O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1563 if (file <= 0)
1564 {
1565 /*
1566 * We might need to create the tablespace's tempfile directory, if no
1567 * one has yet done so.
1568 *
1569 * Don't check for an error from MakePGDirectory; it could fail if
1570 * someone else just did the same thing. If it doesn't work then
1571 * we'll bomb out on the second create attempt, instead.
1572 */
1573 (void) MakePGDirectory(tempdirpath);
1574
1575 file = PathNameOpenFile(tempfilepath,
1576 O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1577 if (file <= 0 && rejectError)
1578 elog(ERROR, "could not create temporary file \"%s\": %m",
1579 tempfilepath);
1580 }
1581
1582 return file;
1583}
1584
1585
1586/*
1587 * Create a new file. The directory containing it must already exist. Files
1588 * created this way are subject to temp_file_limit and are automatically
1589 * closed at end of transaction, but are not automatically deleted on close
1590 * because they are intended to be shared between cooperating backends.
1591 *
1592 * If the file is inside the top-level temporary directory, its name should
1593 * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
1594 * and deleted at startup by RemovePgTempFiles(). Alternatively, it can be
1595 * inside a directory created with PathNameCreateTemporaryDir(), in which case
1596 * the prefix isn't needed.
1597 */
1598File
1599PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
1600{
1601 File file;
1602
1603 ResourceOwnerEnlargeFiles(CurrentResourceOwner);
1604
1605 /*
1606 * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1607 * temp file that can be reused.
1608 */
1609 file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1610 if (file <= 0)
1611 {
1612 if (error_on_failure)
1613 ereport(ERROR,
1614 (errcode_for_file_access(),
1615 errmsg("could not create temporary file \"%s\": %m",
1616 path)));
1617 else
1618 return file;
1619 }
1620
1621 /* Mark it for temp_file_limit accounting. */
1622 VfdCache[file].fdstate |= FD_TEMP_FILE_LIMIT;
1623
1624 /* Register it for automatic close. */
1625 RegisterTemporaryFile(file);
1626
1627 return file;
1628}
1629
1630/*
1631 * Open a file that was created with PathNameCreateTemporaryFile, possibly in
1632 * another backend. Files opened this way don't count against the
1633 * temp_file_limit of the caller, are read-only and are automatically closed
1634 * at the end of the transaction but are not deleted on close.
1635 */
1636File
1637PathNameOpenTemporaryFile(const char *path)
1638{
1639 File file;
1640
1641 ResourceOwnerEnlargeFiles(CurrentResourceOwner);
1642
1643 /* We open the file read-only. */
1644 file = PathNameOpenFile(path, O_RDONLY | PG_BINARY);
1645
1646 /* If no such file, then we don't raise an error. */
1647 if (file <= 0 && errno != ENOENT)
1648 ereport(ERROR,
1649 (errcode_for_file_access(),
1650 errmsg("could not open temporary file \"%s\": %m",
1651 path)));
1652
1653 if (file > 0)
1654 {
1655 /* Register it for automatic close. */
1656 RegisterTemporaryFile(file);
1657 }
1658
1659 return file;
1660}
1661
1662/*
1663 * Delete a file by pathname. Return true if the file existed, false if
1664 * didn't.
1665 */
1666bool
1667PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
1668{
1669 struct stat filestats;
1670 int stat_errno;
1671
1672 /* Get the final size for pgstat reporting. */
1673 if (stat(path, &filestats) != 0)
1674 stat_errno = errno;
1675 else
1676 stat_errno = 0;
1677
1678 /*
1679 * Unlike FileClose's automatic file deletion code, we tolerate
1680 * non-existence to support BufFileDeleteShared which doesn't know how
1681 * many segments it has to delete until it runs out.
1682 */
1683 if (stat_errno == ENOENT)
1684 return false;
1685
1686 if (unlink(path) < 0)
1687 {
1688 if (errno != ENOENT)
1689 ereport(error_on_failure ? ERROR : LOG,
1690 (errcode_for_file_access(),
1691 errmsg("could not unlink temporary file \"%s\": %m",
1692 path)));
1693 return false;
1694 }
1695
1696 if (stat_errno == 0)
1697 ReportTemporaryFileUsage(path, filestats.st_size);
1698 else
1699 {
1700 errno = stat_errno;
1701 ereport(LOG,
1702 (errcode_for_file_access(),
1703 errmsg("could not stat file \"%s\": %m", path)));
1704 }
1705
1706 return true;
1707}
1708
/*
 * close a file when done with it
 *
 * Closes the kernel descriptor if the VFD currently has one, undoes the
 * temp_file_limit accounting for the file, unlinks the file if it is marked
 * FD_DELETE_AT_CLOSE, detaches it from its resource owner if it has one, and
 * finally returns the VFD slot to the free list.  After this call 'file' no
 * longer refers to a usable VFD.
 */
void
FileClose(File file)
{
    Vfd        *vfdP;

    Assert(FileIsValid(file));

    DO_DB(elog(LOG, "FileClose: %d (%s)",
               file, VfdCache[file].fileName));

    vfdP = &VfdCache[file];

    if (!FileIsNotOpen(file))
    {
        /* close the file */
        if (close(vfdP->fd))
        {
            /*
             * We may need to panic on failure to close non-temporary files;
             * see LruDelete.  Files subject to temp_file_limit only rate a
             * LOG message; for all others the level is whatever
             * data_sync_elevel(LOG) returns.
             */
            elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
                 "could not close file \"%s\": %m", vfdP->fileName);
        }

        /* the kernel descriptor is gone as far as our accounting goes */
        --nfile;
        vfdP->fd = VFD_CLOSED;

        /* remove the file from the lru ring */
        Delete(file);
    }

    if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
    {
        /* Subtract its size from current usage (do first in case of error) */
        temporary_files_size -= vfdP->fileSize;
        vfdP->fileSize = 0;
    }

    /*
     * Delete the file if it was temporary, and make a log entry if wanted
     */
    if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
    {
        struct stat filestats;
        int         stat_errno;

        /*
         * If we get an error, as could happen within the ereport/elog calls,
         * we'll come right back here during transaction abort.  Reset the
         * flag to ensure that we can't get into an infinite loop.  This code
         * is arranged to ensure that the worst-case consequence is failing to
         * emit log message(s), not failing to attempt the unlink.
         */
        vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;


        /*
         * first try the stat(); a failure is only remembered here and is
         * reported after the unlink
         */
        if (stat(vfdP->fileName, &filestats))
            stat_errno = errno;
        else
            stat_errno = 0;

        /* in any case do the unlink */
        if (unlink(vfdP->fileName))
            elog(LOG, "could not unlink file \"%s\": %m", vfdP->fileName);

        /* and last report the stat results */
        if (stat_errno == 0)
            ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
        else
        {
            /* restore the saved errno so that %m describes the stat failure */
            errno = stat_errno;
            elog(LOG, "could not stat file \"%s\": %m", vfdP->fileName);
        }
    }

    /* Unregister it from the resource owner */
    if (vfdP->resowner)
        ResourceOwnerForgetFile(vfdP->resowner, file);

    /*
     * Return the Vfd slot to the free list
     */
    FreeVfd(file);
}
1798
1799/*
1800 * FilePrefetch - initiate asynchronous read of a given range of the file.
1801 *
1802 * Currently the only implementation of this function is using posix_fadvise
1803 * which is the simplest standardized interface that accomplishes this.
1804 * We could add an implementation using libaio in the future; but note that
1805 * this API is inappropriate for libaio, which wants to have a buffer provided
1806 * to read into.
1807 */
1808int
1809FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info)
1810{
1811#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
1812 int returnCode;
1813
1814 Assert(FileIsValid(file));
1815
1816 DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " %d",
1817 file, VfdCache[file].fileName,
1818 (int64) offset, amount));
1819
1820 returnCode = FileAccess(file);
1821 if (returnCode < 0)
1822 return returnCode;
1823
1824 pgstat_report_wait_start(wait_event_info);
1825 returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
1826 POSIX_FADV_WILLNEED);
1827 pgstat_report_wait_end();
1828
1829 return returnCode;
1830#else
1831 Assert(FileIsValid(file));
1832 return 0;
1833#endif
1834}
1835
1836void
1837FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
1838{
1839 int returnCode;
1840
1841 Assert(FileIsValid(file));
1842
1843 DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
1844 file, VfdCache[file].fileName,
1845 (int64) offset, (int64) nbytes));
1846
1847 if (nbytes <= 0)
1848 return;
1849
1850 returnCode = FileAccess(file);
1851 if (returnCode < 0)
1852 return;
1853
1854 pgstat_report_wait_start(wait_event_info);
1855 pg_flush_data(VfdCache[file].fd, offset, nbytes);
1856 pgstat_report_wait_end();
1857}
1858
1859int
1860FileRead(File file, char *buffer, int amount, off_t offset,
1861 uint32 wait_event_info)
1862{
1863 int returnCode;
1864 Vfd *vfdP;
1865
1866 Assert(FileIsValid(file));
1867
1868 DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %d %p",
1869 file, VfdCache[file].fileName,
1870 (int64) offset,
1871 amount, buffer));
1872
1873 returnCode = FileAccess(file);
1874 if (returnCode < 0)
1875 return returnCode;
1876
1877 vfdP = &VfdCache[file];
1878
1879retry:
1880 pgstat_report_wait_start(wait_event_info);
1881 returnCode = pg_pread(vfdP->fd, buffer, amount, offset);
1882 pgstat_report_wait_end();
1883
1884 if (returnCode < 0)
1885 {
1886 /*
1887 * Windows may run out of kernel buffers and return "Insufficient
1888 * system resources" error. Wait a bit and retry to solve it.
1889 *
1890 * It is rumored that EINTR is also possible on some Unix filesystems,
1891 * in which case immediate retry is indicated.
1892 */
1893#ifdef WIN32
1894 DWORD error = GetLastError();
1895
1896 switch (error)
1897 {
1898 case ERROR_NO_SYSTEM_RESOURCES:
1899 pg_usleep(1000L);
1900 errno = EINTR;
1901 break;
1902 default:
1903 _dosmaperr(error);
1904 break;
1905 }
1906#endif
1907 /* OK to retry if interrupted */
1908 if (errno == EINTR)
1909 goto retry;
1910 }
1911
1912 return returnCode;
1913}
1914
/*
 * Write 'amount' bytes from 'buffer' at position 'offset' of the file.
 *
 * Returns the result of pg_pwrite(), or the negative result of FileAccess if
 * the file could not be made accessible.  If fewer than 'amount' bytes were
 * written and errno was not set by the write, errno is set to ENOSPC.  A
 * write that fails with EINTR is repeated.  For files marked
 * FD_TEMP_FILE_LIMIT, an error is thrown if the write would push
 * temporary-file usage beyond temp_file_limit.
 */
int
FileWrite(File file, char *buffer, int amount, off_t offset,
          uint32 wait_event_info)
{
    int         returnCode;
    Vfd        *vfdP;

    Assert(FileIsValid(file));

    DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p",
               file, VfdCache[file].fileName,
               (int64) offset,
               amount, buffer));

    returnCode = FileAccess(file);
    if (returnCode < 0)
        return returnCode;

    vfdP = &VfdCache[file];

    /*
     * If enforcing temp_file_limit and it's a temp file, check to see if the
     * write would overrun temp_file_limit, and throw error if so.  Note: it's
     * really a modularity violation to throw error here; we should set errno
     * and return -1.  However, there's no way to report a suitable error
     * message if we do that.  All current callers would just throw error
     * immediately anyway, so this is safe at present.
     */
    if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
    {
        /* first byte position past the end of this write */
        off_t       past_write = offset + amount;

        if (past_write > vfdP->fileSize)
        {
            uint64      newTotal = temporary_files_size;

            /* only the part extending the file adds to the usage total */
            newTotal += past_write - vfdP->fileSize;
            /* temp_file_limit is multiplied by 1024 to compare with bytes */
            if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
                ereport(ERROR,
                        (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
                         errmsg("temporary file size exceeds temp_file_limit (%dkB)",
                                temp_file_limit)));
        }
    }

retry:
    /* clear errno so that we can tell below whether the write set it */
    errno = 0;
    pgstat_report_wait_start(wait_event_info);
    returnCode = pg_pwrite(VfdCache[file].fd, buffer, amount, offset);
    pgstat_report_wait_end();

    /* if write didn't set errno, assume problem is no disk space */
    if (returnCode != amount && errno == 0)
        errno = ENOSPC;

    if (returnCode >= 0)
    {
        /*
         * Maintain fileSize and temporary_files_size if it's a temp file.
         *
         * The end position is computed from the caller-supplied offset, so
         * it is always known here.
         *
         * NOTE(review): the accounting uses the requested 'amount' even when
         * the write was short (0 <= returnCode < amount) -- confirm that
         * this is intended.
         */
        if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
        {
            off_t       past_write = offset + amount;

            if (past_write > vfdP->fileSize)
            {
                temporary_files_size += past_write - vfdP->fileSize;
                vfdP->fileSize = past_write;
            }
        }
    }
    else
    {
        /*
         * See comments in FileRead()
         */
#ifdef WIN32
        DWORD       error = GetLastError();

        switch (error)
        {
            case ERROR_NO_SYSTEM_RESOURCES:
                pg_usleep(1000L);
                errno = EINTR;
                break;
            default:
                _dosmaperr(error);
                break;
        }
#endif
        /* OK to retry if interrupted */
        if (errno == EINTR)
            goto retry;
    }

    return returnCode;
}
2016
2017int
2018FileSync(File file, uint32 wait_event_info)
2019{
2020 int returnCode;
2021
2022 Assert(FileIsValid(file));
2023
2024 DO_DB(elog(LOG, "FileSync: %d (%s)",
2025 file, VfdCache[file].fileName));
2026
2027 returnCode = FileAccess(file);
2028 if (returnCode < 0)
2029 return returnCode;
2030
2031 pgstat_report_wait_start(wait_event_info);
2032 returnCode = pg_fsync(VfdCache[file].fd);
2033 pgstat_report_wait_end();
2034
2035 return returnCode;
2036}
2037
2038off_t
2039FileSize(File file)
2040{
2041 Assert(FileIsValid(file));
2042
2043 DO_DB(elog(LOG, "FileSize %d (%s)",
2044 file, VfdCache[file].fileName));
2045
2046 if (FileIsNotOpen(file))
2047 {
2048 if (FileAccess(file) < 0)
2049 return (off_t) -1;
2050 }
2051
2052 return lseek(VfdCache[file].fd, 0, SEEK_END);
2053}
2054
2055int
2056FileTruncate(File file, off_t offset, uint32 wait_event_info)
2057{
2058 int returnCode;
2059
2060 Assert(FileIsValid(file));
2061
2062 DO_DB(elog(LOG, "FileTruncate %d (%s)",
2063 file, VfdCache[file].fileName));
2064
2065 returnCode = FileAccess(file);
2066 if (returnCode < 0)
2067 return returnCode;
2068
2069 pgstat_report_wait_start(wait_event_info);
2070 returnCode = ftruncate(VfdCache[file].fd, offset);
2071 pgstat_report_wait_end();
2072
2073 if (returnCode == 0 && VfdCache[file].fileSize > offset)
2074 {
2075 /* adjust our state for truncation of a temp file */
2076 Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
2077 temporary_files_size -= VfdCache[file].fileSize - offset;
2078 VfdCache[file].fileSize = offset;
2079 }
2080
2081 return returnCode;
2082}
2083
2084/*
2085 * Return the pathname associated with an open file.
2086 *
2087 * The returned string points to an internal buffer, which is valid until
2088 * the file is closed.
2089 */
2090char *
2091FilePathName(File file)
2092{
2093 Assert(FileIsValid(file));
2094
2095 return VfdCache[file].fileName;
2096}
2097
2098/*
2099 * Return the raw file descriptor of an opened file.
2100 *
2101 * The returned file descriptor will be valid until the file is closed, but
2102 * there are a lot of things that can make that happen. So the caller should
2103 * be careful not to do much of anything else before it finishes using the
2104 * returned file descriptor.
2105 */
2106int
2107FileGetRawDesc(File file)
2108{
2109 Assert(FileIsValid(file));
2110 return VfdCache[file].fd;
2111}
2112
2113/*
2114 * FileGetRawFlags - returns the file flags on open(2)
2115 */
2116int
2117FileGetRawFlags(File file)
2118{
2119 Assert(FileIsValid(file));
2120 return VfdCache[file].fileFlags;
2121}
2122
2123/*
2124 * FileGetRawMode - returns the mode bitmask passed to open(2)
2125 */
2126mode_t
2127FileGetRawMode(File file)
2128{
2129 Assert(FileIsValid(file));
2130 return VfdCache[file].fileMode;
2131}
2132
2133/*
2134 * Make room for another allocatedDescs[] array entry if needed and possible.
2135 * Returns true if an array element is available.
2136 */
2137static bool
2138reserveAllocatedDesc(void)
2139{
2140 AllocateDesc *newDescs;
2141 int newMax;
2142
2143 /* Quick out if array already has a free slot. */
2144 if (numAllocatedDescs < maxAllocatedDescs)
2145 return true;
2146
2147 /*
2148 * If the array hasn't yet been created in the current process, initialize
2149 * it with FD_MINFREE / 2 elements. In many scenarios this is as many as
2150 * we will ever need, anyway. We don't want to look at max_safe_fds
2151 * immediately because set_max_safe_fds() may not have run yet.
2152 */
2153 if (allocatedDescs == NULL)
2154 {
2155 newMax = FD_MINFREE / 2;
2156 newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
2157 /* Out of memory already? Treat as fatal error. */
2158 if (newDescs == NULL)
2159 ereport(ERROR,
2160 (errcode(ERRCODE_OUT_OF_MEMORY),
2161 errmsg("out of memory")));
2162 allocatedDescs = newDescs;
2163 maxAllocatedDescs = newMax;
2164 return true;
2165 }
2166
2167 /*
2168 * Consider enlarging the array beyond the initial allocation used above.
2169 * By the time this happens, max_safe_fds should be known accurately.
2170 *
2171 * We mustn't let allocated descriptors hog all the available FDs, and in
2172 * practice we'd better leave a reasonable number of FDs for VFD use. So
2173 * set the maximum to max_safe_fds / 2. (This should certainly be at
2174 * least as large as the initial size, FD_MINFREE / 2.)
2175 */
2176 newMax = max_safe_fds / 2;
2177 if (newMax > maxAllocatedDescs)
2178 {
2179 newDescs = (AllocateDesc *) realloc(allocatedDescs,
2180 newMax * sizeof(AllocateDesc));
2181 /* Treat out-of-memory as a non-fatal error. */
2182 if (newDescs == NULL)
2183 return false;
2184 allocatedDescs = newDescs;
2185 maxAllocatedDescs = newMax;
2186 return true;
2187 }
2188
2189 /* Can't enlarge allocatedDescs[] any more. */
2190 return false;
2191}
2192
2193/*
2194 * Routines that want to use stdio (ie, FILE*) should use AllocateFile
2195 * rather than plain fopen(). This lets fd.c deal with freeing FDs if
2196 * necessary to open the file. When done, call FreeFile rather than fclose.
2197 *
2198 * Note that files that will be open for any significant length of time
2199 * should NOT be handled this way, since they cannot share kernel file
2200 * descriptors with other files; there is grave risk of running out of FDs
2201 * if anyone locks down too many FDs. Most callers of this routine are
2202 * simply reading a config file that they will read and close immediately.
2203 *
2204 * fd.c will automatically close all files opened with AllocateFile at
2205 * transaction commit or abort; this prevents FD leakage if a routine
2206 * that calls AllocateFile is terminated prematurely by ereport(ERROR).
2207 *
2208 * Ideally this should be the *only* direct call of fopen() in the backend.
2209 */
2210FILE *
2211AllocateFile(const char *name, const char *mode)
2212{
2213 FILE *file;
2214
2215 DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
2216 numAllocatedDescs, name));
2217
2218 /* Can we allocate another non-virtual FD? */
2219 if (!reserveAllocatedDesc())
2220 ereport(ERROR,
2221 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2222 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2223 maxAllocatedDescs, name)));
2224
2225 /* Close excess kernel FDs. */
2226 ReleaseLruFiles();
2227
2228TryAgain:
2229 if ((file = fopen(name, mode)) != NULL)
2230 {
2231 AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2232
2233 desc->kind = AllocateDescFile;
2234 desc->desc.file = file;
2235 desc->create_subid = GetCurrentSubTransactionId();
2236 numAllocatedDescs++;
2237 return desc->desc.file;
2238 }
2239
2240 if (errno == EMFILE || errno == ENFILE)
2241 {
2242 int save_errno = errno;
2243
2244 ereport(LOG,
2245 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2246 errmsg("out of file descriptors: %m; release and retry")));
2247 errno = 0;
2248 if (ReleaseLruFile())
2249 goto TryAgain;
2250 errno = save_errno;
2251 }
2252
2253 return NULL;
2254}
2255
2256/*
2257 * Open a file with OpenTransientFilePerm() and pass default file mode for
2258 * the fileMode parameter.
2259 */
2260int
2261OpenTransientFile(const char *fileName, int fileFlags)
2262{
2263 return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode);
2264}
2265
2266/*
2267 * Like AllocateFile, but returns an unbuffered fd like open(2)
2268 */
2269int
2270OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
2271{
2272 int fd;
2273
2274 DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
2275 numAllocatedDescs, fileName));
2276
2277 /* Can we allocate another non-virtual FD? */
2278 if (!reserveAllocatedDesc())
2279 ereport(ERROR,
2280 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2281 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2282 maxAllocatedDescs, fileName)));
2283
2284 /* Close excess kernel FDs. */
2285 ReleaseLruFiles();
2286
2287 fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
2288
2289 if (fd >= 0)
2290 {
2291 AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2292
2293 desc->kind = AllocateDescRawFD;
2294 desc->desc.fd = fd;
2295 desc->create_subid = GetCurrentSubTransactionId();
2296 numAllocatedDescs++;
2297
2298 return fd;
2299 }
2300
2301 return -1; /* failure */
2302}
2303
2304/*
2305 * Routines that want to initiate a pipe stream should use OpenPipeStream
2306 * rather than plain popen(). This lets fd.c deal with freeing FDs if
2307 * necessary. When done, call ClosePipeStream rather than pclose.
2308 *
2309 * This function also ensures that the popen'd program is run with default
2310 * SIGPIPE processing, rather than the SIG_IGN setting the backend normally
2311 * uses. This ensures desirable response to, eg, closing a read pipe early.
2312 */
2313FILE *
2314OpenPipeStream(const char *command, const char *mode)
2315{
2316 FILE *file;
2317 int save_errno;
2318
2319 DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
2320 numAllocatedDescs, command));
2321
2322 /* Can we allocate another non-virtual FD? */
2323 if (!reserveAllocatedDesc())
2324 ereport(ERROR,
2325 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2326 errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
2327 maxAllocatedDescs, command)));
2328
2329 /* Close excess kernel FDs. */
2330 ReleaseLruFiles();
2331
2332TryAgain:
2333 fflush(stdout);
2334 fflush(stderr);
2335 pqsignal(SIGPIPE, SIG_DFL);
2336 errno = 0;
2337 file = popen(command, mode);
2338 save_errno = errno;
2339 pqsignal(SIGPIPE, SIG_IGN);
2340 errno = save_errno;
2341 if (file != NULL)
2342 {
2343 AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2344
2345 desc->kind = AllocateDescPipe;
2346 desc->desc.file = file;
2347 desc->create_subid = GetCurrentSubTransactionId();
2348 numAllocatedDescs++;
2349 return desc->desc.file;
2350 }
2351
2352 if (errno == EMFILE || errno == ENFILE)
2353 {
2354 ereport(LOG,
2355 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2356 errmsg("out of file descriptors: %m; release and retry")));
2357 if (ReleaseLruFile())
2358 goto TryAgain;
2359 errno = save_errno;
2360 }
2361
2362 return NULL;
2363}
2364
2365/*
2366 * Free an AllocateDesc of any type.
2367 *
2368 * The argument *must* point into the allocatedDescs[] array.
2369 */
2370static int
2371FreeDesc(AllocateDesc *desc)
2372{
2373 int result;
2374
2375 /* Close the underlying object */
2376 switch (desc->kind)
2377 {
2378 case AllocateDescFile:
2379 result = fclose(desc->desc.file);
2380 break;
2381 case AllocateDescPipe:
2382 result = pclose(desc->desc.file);
2383 break;
2384 case AllocateDescDir:
2385 result = closedir(desc->desc.dir);
2386 break;
2387 case AllocateDescRawFD:
2388 result = close(desc->desc.fd);
2389 break;
2390 default:
2391 elog(ERROR, "AllocateDesc kind not recognized");
2392 result = 0; /* keep compiler quiet */
2393 break;
2394 }
2395
2396 /* Compact storage in the allocatedDescs array */
2397 numAllocatedDescs--;
2398 *desc = allocatedDescs[numAllocatedDescs];
2399
2400 return result;
2401}
2402
2403/*
2404 * Close a file returned by AllocateFile.
2405 *
2406 * Note we do not check fclose's return value --- it is up to the caller
2407 * to handle close errors.
2408 */
2409int
2410FreeFile(FILE *file)
2411{
2412 int i;
2413
2414 DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
2415
2416 /* Remove file from list of allocated files, if it's present */
2417 for (i = numAllocatedDescs; --i >= 0;)
2418 {
2419 AllocateDesc *desc = &allocatedDescs[i];
2420
2421 if (desc->kind == AllocateDescFile && desc->desc.file == file)
2422 return FreeDesc(desc);
2423 }
2424
2425 /* Only get here if someone passes us a file not in allocatedDescs */
2426 elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
2427
2428 return fclose(file);
2429}
2430
2431/*
2432 * Close a file returned by OpenTransientFile.
2433 *
2434 * Note we do not check close's return value --- it is up to the caller
2435 * to handle close errors.
2436 */
2437int
2438CloseTransientFile(int fd)
2439{
2440 int i;
2441
2442 DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
2443
2444 /* Remove fd from list of allocated files, if it's present */
2445 for (i = numAllocatedDescs; --i >= 0;)
2446 {
2447 AllocateDesc *desc = &allocatedDescs[i];
2448
2449 if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
2450 return FreeDesc(desc);
2451 }
2452
2453 /* Only get here if someone passes us a file not in allocatedDescs */
2454 elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
2455
2456 return close(fd);
2457}
2458
2459/*
2460 * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
2461 * rather than plain opendir(). This lets fd.c deal with freeing FDs if
2462 * necessary to open the directory, and with closing it after an elog.
2463 * When done, call FreeDir rather than closedir.
2464 *
2465 * Returns NULL, with errno set, on failure. Note that failure detection
2466 * is commonly left to the following call of ReadDir or ReadDirExtended;
2467 * see the comments for ReadDir.
2468 *
2469 * Ideally this should be the *only* direct call of opendir() in the backend.
2470 */
2471DIR *
2472AllocateDir(const char *dirname)
2473{
2474 DIR *dir;
2475
2476 DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
2477 numAllocatedDescs, dirname));
2478
2479 /* Can we allocate another non-virtual FD? */
2480 if (!reserveAllocatedDesc())
2481 ereport(ERROR,
2482 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2483 errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
2484 maxAllocatedDescs, dirname)));
2485
2486 /* Close excess kernel FDs. */
2487 ReleaseLruFiles();
2488
2489TryAgain:
2490 if ((dir = opendir(dirname)) != NULL)
2491 {
2492 AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2493
2494 desc->kind = AllocateDescDir;
2495 desc->desc.dir = dir;
2496 desc->create_subid = GetCurrentSubTransactionId();
2497 numAllocatedDescs++;
2498 return desc->desc.dir;
2499 }
2500
2501 if (errno == EMFILE || errno == ENFILE)
2502 {
2503 int save_errno = errno;
2504
2505 ereport(LOG,
2506 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2507 errmsg("out of file descriptors: %m; release and retry")));
2508 errno = 0;
2509 if (ReleaseLruFile())
2510 goto TryAgain;
2511 errno = save_errno;
2512 }
2513
2514 return NULL;
2515}
2516
2517/*
2518 * Read a directory opened with AllocateDir, ereport'ing any error.
2519 *
2520 * This is easier to use than raw readdir() since it takes care of some
2521 * otherwise rather tedious and error-prone manipulation of errno. Also,
2522 * if you are happy with a generic error message for AllocateDir failure,
2523 * you can just do
2524 *
2525 * dir = AllocateDir(path);
2526 * while ((dirent = ReadDir(dir, path)) != NULL)
2527 * process dirent;
2528 * FreeDir(dir);
2529 *
2530 * since a NULL dir parameter is taken as indicating AllocateDir failed.
2531 * (Make sure errno isn't changed between AllocateDir and ReadDir if you
2532 * use this shortcut.)
2533 *
2534 * The pathname passed to AllocateDir must be passed to this routine too,
2535 * but it is only used for error reporting.
2536 */
2537struct dirent *
2538ReadDir(DIR *dir, const char *dirname)
2539{
2540 return ReadDirExtended(dir, dirname, ERROR);
2541}
2542
2543/*
2544 * Alternate version of ReadDir that allows caller to specify the elevel
2545 * for any error report (whether it's reporting an initial failure of
2546 * AllocateDir or a subsequent directory read failure).
2547 *
2548 * If elevel < ERROR, returns NULL after any error. With the normal coding
2549 * pattern, this will result in falling out of the loop immediately as
2550 * though the directory contained no (more) entries.
2551 */
2552struct dirent *
2553ReadDirExtended(DIR *dir, const char *dirname, int elevel)
2554{
2555 struct dirent *dent;
2556
2557 /* Give a generic message for AllocateDir failure, if caller didn't */
2558 if (dir == NULL)
2559 {
2560 ereport(elevel,
2561 (errcode_for_file_access(),
2562 errmsg("could not open directory \"%s\": %m",
2563 dirname)));
2564 return NULL;
2565 }
2566
2567 errno = 0;
2568 if ((dent = readdir(dir)) != NULL)
2569 return dent;
2570
2571 if (errno)
2572 ereport(elevel,
2573 (errcode_for_file_access(),
2574 errmsg("could not read directory \"%s\": %m",
2575 dirname)));
2576 return NULL;
2577}
2578
2579/*
2580 * Close a directory opened with AllocateDir.
2581 *
2582 * Returns closedir's return value (with errno set if it's not 0).
2583 * Note we do not check the return value --- it is up to the caller
2584 * to handle close errors if wanted.
2585 *
2586 * Does nothing if dir == NULL; we assume that directory open failure was
2587 * already reported if desired.
2588 */
2589int
2590FreeDir(DIR *dir)
2591{
2592 int i;
2593
2594 /* Nothing to do if AllocateDir failed */
2595 if (dir == NULL)
2596 return 0;
2597
2598 DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
2599
2600 /* Remove dir from list of allocated dirs, if it's present */
2601 for (i = numAllocatedDescs; --i >= 0;)
2602 {
2603 AllocateDesc *desc = &allocatedDescs[i];
2604
2605 if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
2606 return FreeDesc(desc);
2607 }
2608
2609 /* Only get here if someone passes us a dir not in allocatedDescs */
2610 elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
2611
2612 return closedir(dir);
2613}
2614
2615
2616/*
2617 * Close a pipe stream returned by OpenPipeStream.
2618 */
2619int
2620ClosePipeStream(FILE *file)
2621{
2622 int i;
2623
2624 DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
2625
2626 /* Remove file from list of allocated files, if it's present */
2627 for (i = numAllocatedDescs; --i >= 0;)
2628 {
2629 AllocateDesc *desc = &allocatedDescs[i];
2630
2631 if (desc->kind == AllocateDescPipe && desc->desc.file == file)
2632 return FreeDesc(desc);
2633 }
2634
2635 /* Only get here if someone passes us a file not in allocatedDescs */
2636 elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
2637
2638 return pclose(file);
2639}
2640
2641/*
2642 * closeAllVfds
2643 *
2644 * Force all VFDs into the physically-closed state, so that the fewest
2645 * possible number of kernel file descriptors are in use. There is no
2646 * change in the logical state of the VFDs.
2647 */
2648void
2649closeAllVfds(void)
2650{
2651 Index i;
2652
2653 if (SizeVfdCache > 0)
2654 {
2655 Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
2656 for (i = 1; i < SizeVfdCache; i++)
2657 {
2658 if (!FileIsNotOpen(i))
2659 LruDelete(i);
2660 }
2661 }
2662}
2663
2664
2665/*
2666 * SetTempTablespaces
2667 *
2668 * Define a list (actually an array) of OIDs of tablespaces to use for
2669 * temporary files. This list will be used until end of transaction,
2670 * unless this function is called again before then. It is caller's
2671 * responsibility that the passed-in array has adequate lifespan (typically
2672 * it'd be allocated in TopTransactionContext).
2673 */
2674void
2675SetTempTablespaces(Oid *tableSpaces, int numSpaces)
2676{
2677 Assert(numSpaces >= 0);
2678 tempTableSpaces = tableSpaces;
2679 numTempTableSpaces = numSpaces;
2680
2681 /*
2682 * Select a random starting point in the list. This is to minimize
2683 * conflicts between backends that are most likely sharing the same list
2684 * of temp tablespaces. Note that if we create multiple temp files in the
2685 * same transaction, we'll advance circularly through the list --- this
2686 * ensures that large temporary sort files are nicely spread across all
2687 * available tablespaces.
2688 */
2689 if (numSpaces > 1)
2690 nextTempTableSpace = random() % numSpaces;
2691 else
2692 nextTempTableSpace = 0;
2693}
2694
2695/*
2696 * TempTablespacesAreSet
2697 *
2698 * Returns true if SetTempTablespaces has been called in current transaction.
2699 * (This is just so that tablespaces.c doesn't need its own per-transaction
2700 * state.)
2701 */
2702bool
2703TempTablespacesAreSet(void)
2704{
2705 return (numTempTableSpaces >= 0);
2706}
2707
2708/*
2709 * GetTempTablespaces
2710 *
2711 * Populate an array with the OIDs of the tablespaces that should be used for
2712 * temporary files. Return the number that were copied into the output array.
2713 */
2714int
2715GetTempTablespaces(Oid *tableSpaces, int numSpaces)
2716{
2717 int i;
2718
2719 Assert(TempTablespacesAreSet());
2720 for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
2721 tableSpaces[i] = tempTableSpaces[i];
2722
2723 return i;
2724}
2725
2726/*
2727 * GetNextTempTableSpace
2728 *
2729 * Select the next temp tablespace to use. A result of InvalidOid means
2730 * to use the current database's default tablespace.
2731 */
2732Oid
2733GetNextTempTableSpace(void)
2734{
2735 if (numTempTableSpaces > 0)
2736 {
2737 /* Advance nextTempTableSpace counter with wraparound */
2738 if (++nextTempTableSpace >= numTempTableSpaces)
2739 nextTempTableSpace = 0;
2740 return tempTableSpaces[nextTempTableSpace];
2741 }
2742 return InvalidOid;
2743}
2744
2745
2746/*
2747 * AtEOSubXact_Files
2748 *
2749 * Take care of subtransaction commit/abort. At abort, we close temp files
2750 * that the subtransaction may have opened. At commit, we reassign the
2751 * files that were opened to the parent subtransaction.
2752 */
2753void
2754AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
2755 SubTransactionId parentSubid)
2756{
2757 Index i;
2758
2759 for (i = 0; i < numAllocatedDescs; i++)
2760 {
2761 if (allocatedDescs[i].create_subid == mySubid)
2762 {
2763 if (isCommit)
2764 allocatedDescs[i].create_subid = parentSubid;
2765 else
2766 {
2767 /* have to recheck the item after FreeDesc (ugly) */
2768 FreeDesc(&allocatedDescs[i--]);
2769 }
2770 }
2771 }
2772}
2773
2774/*
2775 * AtEOXact_Files
2776 *
2777 * This routine is called during transaction commit or abort. All still-open
2778 * per-transaction temporary file VFDs are closed, which also causes the
2779 * underlying files to be deleted (although they should've been closed already
2780 * by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files are
2781 * closed. We also forget any transaction-local temp tablespace list.
2782 *
2783 * The isCommit flag is used only to decide whether to emit warnings about
2784 * unclosed files.
2785 */
2786void
2787AtEOXact_Files(bool isCommit)
2788{
2789 CleanupTempFiles(isCommit, false);
2790 tempTableSpaces = NULL;
2791 numTempTableSpaces = -1;
2792}
2793
2794/*
2795 * AtProcExit_Files
2796 *
2797 * on_proc_exit hook to clean up temp files during backend shutdown.
2798 * Here, we want to clean up *all* temp files including interXact ones.
2799 */
2800static void
2801AtProcExit_Files(int code, Datum arg)
2802{
2803 CleanupTempFiles(false, true);
2804}
2805
2806/*
2807 * Close temporary files and delete their underlying files.
2808 *
2809 * isCommit: if true, this is normal transaction commit, and we don't
2810 * expect any remaining files; warn if there are some.
2811 *
2812 * isProcExit: if true, this is being called as the backend process is
2813 * exiting. If that's the case, we should remove all temporary files; if
2814 * that's not the case, we are being called for transaction commit/abort
2815 * and should only remove transaction-local temp files. In either case,
2816 * also clean up "allocated" stdio files, dirs and fds.
2817 */
2818static void
2819CleanupTempFiles(bool isCommit, bool isProcExit)
2820{
2821 Index i;
2822
2823 /*
2824 * Careful here: at proc_exit we need extra cleanup, not just
2825 * xact_temporary files.
2826 */
2827 if (isProcExit || have_xact_temporary_files)
2828 {
2829 Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
2830 for (i = 1; i < SizeVfdCache; i++)
2831 {
2832 unsigned short fdstate = VfdCache[i].fdstate;
2833
2834 if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
2835 VfdCache[i].fileName != NULL)
2836 {
2837 /*
2838 * If we're in the process of exiting a backend process, close
2839 * all temporary files. Otherwise, only close temporary files
2840 * local to the current transaction. They should be closed by
2841 * the ResourceOwner mechanism already, so this is just a
2842 * debugging cross-check.
2843 */
2844 if (isProcExit)
2845 FileClose(i);
2846 else if (fdstate & FD_CLOSE_AT_EOXACT)
2847 {
2848 elog(WARNING,
2849 "temporary file %s not closed at end-of-transaction",
2850 VfdCache[i].fileName);
2851 FileClose(i);
2852 }
2853 }
2854 }
2855
2856 have_xact_temporary_files = false;
2857 }
2858
2859 /* Complain if any allocated files remain open at commit. */
2860 if (isCommit && numAllocatedDescs > 0)
2861 elog(WARNING, "%d temporary files and directories not closed at end-of-transaction",
2862 numAllocatedDescs);
2863
2864 /* Clean up "allocated" stdio files, dirs and fds. */
2865 while (numAllocatedDescs > 0)
2866 FreeDesc(&allocatedDescs[0]);
2867}
2868
2869
2870/*
2871 * Remove temporary and temporary relation files left over from a prior
2872 * postmaster session
2873 *
2874 * This should be called during postmaster startup. It will forcibly
2875 * remove any leftover files created by OpenTemporaryFile and any leftover
2876 * temporary relation files created by mdcreate.
2877 *
2878 * NOTE: we could, but don't, call this during a post-backend-crash restart
2879 * cycle. The argument for not doing it is that someone might want to examine
2880 * the temp files for debugging purposes. This does however mean that
2881 * OpenTemporaryFile had better allow for collision with an existing temp
2882 * file name.
2883 *
2884 * NOTE: this function and its subroutines generally report syscall failures
2885 * with ereport(LOG) and keep going. Removing temp files is not so critical
2886 * that we should fail to start the database when we can't do it.
2887 */
2888void
2889RemovePgTempFiles(void)
2890{
2891 char temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
2892 DIR *spc_dir;
2893 struct dirent *spc_de;
2894
2895 /*
2896 * First process temp files in pg_default ($PGDATA/base)
2897 */
2898 snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
2899 RemovePgTempFilesInDir(temp_path, true, false);
2900 RemovePgTempRelationFiles("base");
2901
2902 /*
2903 * Cycle through temp directories for all non-default tablespaces.
2904 */
2905 spc_dir = AllocateDir("pg_tblspc");
2906
2907 while ((spc_de = ReadDirExtended(spc_dir, "pg_tblspc", LOG)) != NULL)
2908 {
2909 if (strcmp(spc_de->d_name, ".") == 0 ||
2910 strcmp(spc_de->d_name, "..") == 0)
2911 continue;
2912
2913 snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s",
2914 spc_de->d_name, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR);
2915 RemovePgTempFilesInDir(temp_path, true, false);
2916
2917 snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
2918 spc_de->d_name, TABLESPACE_VERSION_DIRECTORY);
2919 RemovePgTempRelationFiles(temp_path);
2920 }
2921
2922 FreeDir(spc_dir);
2923
2924 /*
2925 * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
2926 * DataDir as well.
2927 */
2928#ifdef EXEC_BACKEND
2929 RemovePgTempFilesInDir(PG_TEMP_FILES_DIR, true, false);
2930#endif
2931}
2932
2933/*
2934 * Process one pgsql_tmp directory for RemovePgTempFiles.
2935 *
2936 * If missing_ok is true, it's all right for the named directory to not exist.
2937 * Any other problem results in a LOG message. (missing_ok should be true at
2938 * the top level, since pgsql_tmp directories are not created until needed.)
2939 *
2940 * At the top level, this should be called with unlink_all = false, so that
2941 * only files matching the temporary name prefix will be unlinked. When
2942 * recursing it will be called with unlink_all = true to unlink everything
2943 * under a top-level temporary directory.
2944 *
2945 * (These two flags could be replaced by one, but it seems clearer to keep
2946 * them separate.)
2947 */
2948static void
2949RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
2950{
2951 DIR *temp_dir;
2952 struct dirent *temp_de;
2953 char rm_path[MAXPGPATH * 2];
2954
2955 temp_dir = AllocateDir(tmpdirname);
2956
2957 if (temp_dir == NULL && errno == ENOENT && missing_ok)
2958 return;
2959
2960 while ((temp_de = ReadDirExtended(temp_dir, tmpdirname, LOG)) != NULL)
2961 {
2962 if (strcmp(temp_de->d_name, ".") == 0 ||
2963 strcmp(temp_de->d_name, "..") == 0)
2964 continue;
2965
2966 snprintf(rm_path, sizeof(rm_path), "%s/%s",
2967 tmpdirname, temp_de->d_name);
2968
2969 if (unlink_all ||
2970 strncmp(temp_de->d_name,
2971 PG_TEMP_FILE_PREFIX,
2972 strlen(PG_TEMP_FILE_PREFIX)) == 0)
2973 {
2974 struct stat statbuf;
2975
2976 if (lstat(rm_path, &statbuf) < 0)
2977 {
2978 ereport(LOG,
2979 (errcode_for_file_access(),
2980 errmsg("could not stat file \"%s\": %m", rm_path)));
2981 continue;
2982 }
2983
2984 if (S_ISDIR(statbuf.st_mode))
2985 {
2986 /* recursively remove contents, then directory itself */
2987 RemovePgTempFilesInDir(rm_path, false, true);
2988
2989 if (rmdir(rm_path) < 0)
2990 ereport(LOG,
2991 (errcode_for_file_access(),
2992 errmsg("could not remove directory \"%s\": %m",
2993 rm_path)));
2994 }
2995 else
2996 {
2997 if (unlink(rm_path) < 0)
2998 ereport(LOG,
2999 (errcode_for_file_access(),
3000 errmsg("could not remove file \"%s\": %m",
3001 rm_path)));
3002 }
3003 }
3004 else
3005 ereport(LOG,
3006 (errmsg("unexpected file found in temporary-files directory: \"%s\"",
3007 rm_path)));
3008 }
3009
3010 FreeDir(temp_dir);
3011}
3012
3013/* Process one tablespace directory, look for per-DB subdirectories */
3014static void
3015RemovePgTempRelationFiles(const char *tsdirname)
3016{
3017 DIR *ts_dir;
3018 struct dirent *de;
3019 char dbspace_path[MAXPGPATH * 2];
3020
3021 ts_dir = AllocateDir(tsdirname);
3022
3023 while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
3024 {
3025 /*
3026 * We're only interested in the per-database directories, which have
3027 * numeric names. Note that this code will also (properly) ignore "."
3028 * and "..".
3029 */
3030 if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
3031 continue;
3032
3033 snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
3034 tsdirname, de->d_name);
3035 RemovePgTempRelationFilesInDbspace(dbspace_path);
3036 }
3037
3038 FreeDir(ts_dir);
3039}
3040
3041/* Process one per-dbspace directory for RemovePgTempRelationFiles */
3042static void
3043RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
3044{
3045 DIR *dbspace_dir;
3046 struct dirent *de;
3047 char rm_path[MAXPGPATH * 2];
3048
3049 dbspace_dir = AllocateDir(dbspacedirname);
3050
3051 while ((de = ReadDirExtended(dbspace_dir, dbspacedirname, LOG)) != NULL)
3052 {
3053 if (!looks_like_temp_rel_name(de->d_name))
3054 continue;
3055
3056 snprintf(rm_path, sizeof(rm_path), "%s/%s",
3057 dbspacedirname, de->d_name);
3058
3059 if (unlink(rm_path) < 0)
3060 ereport(LOG,
3061 (errcode_for_file_access(),
3062 errmsg("could not remove file \"%s\": %m",
3063 rm_path)));
3064 }
3065
3066 FreeDir(dbspace_dir);
3067}
3068
/*
 * looks_like_temp_rel_name
 *
 * Does the name have the form t<digits>_<digits>, or
 * t<digits>_<digits>_<forkname>, in either case optionally followed by
 * .<digits> (a segment suffix)?  Returns true only if the whole string
 * matches.
 */
bool
looks_like_temp_rel_name(const char *name)
{
	const char *cursor = name;
	const char *run_start;

	/* Must start with "t". */
	if (*cursor != 't')
		return false;
	cursor++;

	/* Followed by a non-empty string of digits and then an underscore. */
	run_start = cursor;
	while (isdigit((unsigned char) *cursor))
		cursor++;
	if (cursor == run_start || *cursor != '_')
		return false;
	cursor++;

	/* Followed by another nonempty string of digits. */
	run_start = cursor;
	while (isdigit((unsigned char) *cursor))
		cursor++;
	if (cursor == run_start)
		return false;

	/* We might have _forkname or .segment or both. */
	if (*cursor == '_')
	{
		int			forkchar = forkname_chars(cursor + 1, NULL);

		if (forkchar <= 0)
			return false;
		/* Step over the underscore plus the fork name characters. */
		cursor += forkchar + 1;
	}
	if (*cursor == '.')
	{
		/* The dot must be followed by at least one digit. */
		run_start = cursor + 1;
		cursor = run_start;
		while (isdigit((unsigned char) *cursor))
			cursor++;
		if (cursor == run_start)
			return false;
	}

	/* Now we should be at the end. */
	return *cursor == '\0';
}
3117
3118
3119/*
3120 * Issue fsync recursively on PGDATA and all its contents.
3121 *
3122 * We fsync regular files and directories wherever they are, but we
3123 * follow symlinks only for pg_wal and immediately under pg_tblspc.
3124 * Other symlinks are presumed to point at files we're not responsible
3125 * for fsyncing, and might not have privileges to write at all.
3126 *
3127 * Errors are logged but not considered fatal; that's because this is used
3128 * only during database startup, to deal with the possibility that there are
3129 * issued-but-unsynced writes pending against the data directory. We want to
3130 * ensure that such writes reach disk before anything that's done in the new
3131 * run. However, aborting on error would result in failure to start for
3132 * harmless cases such as read-only files in the data directory, and that's
3133 * not good either.
3134 *
3135 * Note that if we previously crashed due to a PANIC on fsync(), we'll be
3136 * rewriting all changes again during recovery.
3137 *
3138 * Note we assume we're chdir'd into PGDATA to begin with.
3139 */
3140void
3141SyncDataDirectory(void)
3142{
3143 bool xlog_is_symlink;
3144
3145 /* We can skip this whole thing if fsync is disabled. */
3146 if (!enableFsync)
3147 return;
3148
3149 /*
3150 * If pg_wal is a symlink, we'll need to recurse into it separately,
3151 * because the first walkdir below will ignore it.
3152 */
3153 xlog_is_symlink = false;
3154
3155#ifndef WIN32
3156 {
3157 struct stat st;
3158
3159 if (lstat("pg_wal", &st) < 0)
3160 ereport(LOG,
3161 (errcode_for_file_access(),
3162 errmsg("could not stat file \"%s\": %m",
3163 "pg_wal")));
3164 else if (S_ISLNK(st.st_mode))
3165 xlog_is_symlink = true;
3166 }
3167#else
3168 if (pgwin32_is_junction("pg_wal"))
3169 xlog_is_symlink = true;
3170#endif
3171
3172 /*
3173 * If possible, hint to the kernel that we're soon going to fsync the data
3174 * directory and its contents. Errors in this step are even less
3175 * interesting than normal, so log them only at DEBUG1.
3176 */
3177#ifdef PG_FLUSH_DATA_WORKS
3178 walkdir(".", pre_sync_fname, false, DEBUG1);
3179 if (xlog_is_symlink)
3180 walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
3181 walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1);
3182#endif
3183
3184 /*
3185 * Now we do the fsync()s in the same order.
3186 *
3187 * The main call ignores symlinks, so in addition to specially processing
3188 * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
3189 * process_symlinks = true. Note that if there are any plain directories
3190 * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
3191 * so we don't worry about optimizing it.
3192 */
3193 walkdir(".", datadir_fsync_fname, false, LOG);
3194 if (xlog_is_symlink)
3195 walkdir("pg_wal", datadir_fsync_fname, false, LOG);
3196 walkdir("pg_tblspc", datadir_fsync_fname, true, LOG);
3197}
3198
3199/*
3200 * walkdir: recursively walk a directory, applying the action to each
3201 * regular file and directory (including the named directory itself).
3202 *
3203 * If process_symlinks is true, the action and recursion are also applied
3204 * to regular files and directories that are pointed to by symlinks in the
3205 * given directory; otherwise symlinks are ignored. Symlinks are always
3206 * ignored in subdirectories, ie we intentionally don't pass down the
3207 * process_symlinks flag to recursive calls.
3208 *
3209 * Errors are reported at level elevel, which might be ERROR or less.
3210 *
3211 * See also walkdir in initdb.c, which is a frontend version of this logic.
3212 */
3213static void
3214walkdir(const char *path,
3215 void (*action) (const char *fname, bool isdir, int elevel),
3216 bool process_symlinks,
3217 int elevel)
3218{
3219 DIR *dir;
3220 struct dirent *de;
3221
3222 dir = AllocateDir(path);
3223
3224 while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
3225 {
3226 char subpath[MAXPGPATH * 2];
3227 struct stat fst;
3228 int sret;
3229
3230 CHECK_FOR_INTERRUPTS();
3231
3232 if (strcmp(de->d_name, ".") == 0 ||
3233 strcmp(de->d_name, "..") == 0)
3234 continue;
3235
3236 snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
3237
3238 if (process_symlinks)
3239 sret = stat(subpath, &fst);
3240 else
3241 sret = lstat(subpath, &fst);
3242
3243 if (sret < 0)
3244 {
3245 ereport(elevel,
3246 (errcode_for_file_access(),
3247 errmsg("could not stat file \"%s\": %m", subpath)));
3248 continue;
3249 }
3250
3251 if (S_ISREG(fst.st_mode))
3252 (*action) (subpath, false, elevel);
3253 else if (S_ISDIR(fst.st_mode))
3254 walkdir(subpath, action, false, elevel);
3255 }
3256
3257 FreeDir(dir); /* we ignore any error here */
3258
3259 /*
3260 * It's important to fsync the destination directory itself as individual
3261 * file fsyncs don't guarantee that the directory entry for the file is
3262 * synced. However, skip this if AllocateDir failed; the action function
3263 * might not be robust against that.
3264 */
3265 if (dir)
3266 (*action) (path, true, elevel);
3267}
3268
3269
/*
 * pre_sync_fname -- hint to the OS that it should get ready to fsync() the
 * named file.
 *
 * Takes the same arguments as a walkdir() action callback.  Directories are
 * skipped.  A file that cannot be opened because of EACCES is passed over
 * silently; any other failure to open or close the file is logged at the
 * caller-specified level.
 */
#ifdef PG_FLUSH_DATA_WORKS

static void
pre_sync_fname(const char *fname, bool isdir, int elevel)
{
	int			filedes;

	/* Flushing a directory would likely just fail, so don't even try */
	if (isdir)
		return;

	filedes = OpenTransientFile(fname, O_RDONLY | PG_BINARY);
	if (filedes < 0)
	{
		/* Unreadable files are not worth a message; everything else is */
		if (errno != EACCES)
			ereport(elevel,
					(errcode_for_file_access(),
					 errmsg("could not open file \"%s\": %m", fname)));
		return;
	}

	/*
	 * This is only a hint, so it is fine that pg_flush_data() ignores
	 * errors.
	 */
	pg_flush_data(filedes, 0, 0);

	if (CloseTransientFile(filedes) != 0)
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", fname)));
}

#endif							/* PG_FLUSH_DATA_WORKS */
3312
/*
 * datadir_fsync_fname -- fsync one file or directory through
 * fsync_fname_ext(), in the form walkdir() expects of an action callback.
 *
 * The status returned by fsync_fname_ext() is discarded; any failures are
 * reported by that function at level elevel.
 */
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
	/*
	 * Errors about unreadable files should be ignored silently here, so ask
	 * fsync_fname_ext() to do that.
	 */
	const bool	ignore_perm = true;

	(void) fsync_fname_ext(fname, isdir, ignore_perm, elevel);
}
3322
/*
 * unlink_if_exists_fname -- remove the named file or directory.
 *
 * Files are handed to PathNameDeleteTemporaryFile(); directories are removed
 * with rmdir(), and a directory that is already gone (ENOENT) is not treated
 * as an error.  elevel is used only for reporting a failed rmdir().
 */
static void
unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
{
	if (!isdir)
	{
		/* Use PathNameDeleteTemporaryFile to report filesize */
		PathNameDeleteTemporaryFile(fname, false);
		return;
	}

	/* Success, or nothing there to remove: either way we are done */
	if (rmdir(fname) == 0 || errno == ENOENT)
		return;

	ereport(elevel,
			(errcode_for_file_access(),
			 errmsg("could not remove directory \"%s\": %m", fname)));
}
3339
3340/*
3341 * fsync_fname_ext -- Try to fsync a file or directory
3342 *
3343 * If ignore_perm is true, ignore errors upon trying to open unreadable
3344 * files. Logs other errors at a caller-specified level.
3345 *
3346 * Returns 0 if the operation succeeded, -1 otherwise.
3347 */
3348static int
3349fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
3350{
3351 int fd;
3352 int flags;
3353 int returncode;
3354
3355 /*
3356 * Some OSs require directories to be opened read-only whereas other
3357 * systems don't allow us to fsync files opened read-only; so we need both
3358 * cases here. Using O_RDWR will cause us to fail to fsync files that are
3359 * not writable by our userid, but we assume that's OK.
3360 */
3361 flags = PG_BINARY;
3362 if (!isdir)
3363 flags |= O_RDWR;
3364 else
3365 flags |= O_RDONLY;
3366
3367 fd = OpenTransientFile(fname, flags);
3368
3369 /*
3370 * Some OSs don't allow us to open directories at all (Windows returns
3371 * EACCES), just ignore the error in that case. If desired also silently
3372 * ignoring errors about unreadable files. Log others.
3373 */
3374 if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
3375 return 0;
3376 else if (fd < 0 && ignore_perm && errno == EACCES)
3377 return 0;
3378 else if (fd < 0)
3379 {
3380 ereport(elevel,
3381 (errcode_for_file_access(),
3382 errmsg("could not open file \"%s\": %m", fname)));
3383 return -1;
3384 }
3385
3386 returncode = pg_fsync(fd);
3387
3388 /*
3389 * Some OSes don't allow us to fsync directories at all, so we can ignore
3390 * those errors. Anything else needs to be logged.
3391 */
3392 if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
3393 {
3394 int save_errno;
3395
3396 /* close file upon error, might not be in transaction context */
3397 save_errno = errno;
3398 (void) CloseTransientFile(fd);
3399 errno = save_errno;
3400
3401 ereport(elevel,
3402 (errcode_for_file_access(),
3403 errmsg("could not fsync file \"%s\": %m", fname)));
3404 return -1;
3405 }
3406
3407 if (CloseTransientFile(fd))
3408 {
3409 ereport(elevel,
3410 (errcode_for_file_access(),
3411 errmsg("could not close file \"%s\": %m", fname)));
3412 return -1;
3413 }
3414
3415 return 0;
3416}
3417
3418/*
3419 * fsync_parent_path -- fsync the parent path of a file or directory
3420 *
3421 * This is aimed at making file operations persistent on disk in case of
3422 * an OS crash or power failure.
3423 */
3424static int
3425fsync_parent_path(const char *fname, int elevel)
3426{
3427 char parentpath[MAXPGPATH];
3428
3429 strlcpy(parentpath, fname, MAXPGPATH);
3430 get_parent_directory(parentpath);
3431
3432 /*
3433 * get_parent_directory() returns an empty string if the input argument is
3434 * just a file name (see comments in path.c), so handle that as being the
3435 * current directory.
3436 */
3437 if (strlen(parentpath) == 0)
3438 strlcpy(parentpath, ".", MAXPGPATH);
3439
3440 if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
3441 return -1;
3442
3443 return 0;
3444}
3445
3446/*
3447 * Create a PostgreSQL data sub-directory
3448 *
3449 * The data directory itself, and most of its sub-directories, are created at
3450 * initdb time, but we do have some occasions when we create directories in
3451 * the backend (CREATE TABLESPACE, for example). In those cases, we want to
3452 * make sure that those directories are created consistently. Today, that means
3453 * making sure that the created directory has the correct permissions, which is
3454 * what pg_dir_create_mode tracks for us.
3455 *
3456 * Note that we also set the umask() based on what we understand the correct
3457 * permissions to be (see file_perm.c).
3458 *
3459 * For permissions other than the default, mkdir() can be used directly, but
3460 * be sure to consider carefully such cases -- a sub-directory with incorrect
3461 * permissions in a PostgreSQL data directory could cause backups and other
3462 * processes to fail.
3463 */
3464int
3465MakePGDirectory(const char *directoryName)
3466{
3467 return mkdir(directoryName, pg_dir_create_mode);
3468}
3469
3470/*
3471 * Return the passed-in error level, or PANIC if data_sync_retry is off.
3472 *
3473 * Failure to fsync any data file is cause for immediate panic, unless
3474 * data_sync_retry is enabled. Data may have been written to the operating
3475 * system and removed from our buffer pool already, and if we are running on
3476 * an operating system that forgets dirty data on write-back failure, there
3477 * may be only one copy of the data remaining: in the WAL. A later attempt to
3478 * fsync again might falsely report success. Therefore we must not allow any
3479 * further checkpoints to be attempted. data_sync_retry can in theory be
3480 * enabled on systems known not to drop dirty buffered data on write-back
3481 * failure (with the likely outcome that checkpoints will continue to fail
3482 * until the underlying problem is fixed).
3483 *
3484 * Any code that reports a failure from fsync() or related functions should
3485 * filter the error level with this function.
3486 */
3487int
3488data_sync_elevel(int elevel)
3489{
3490 return data_sync_retry ? elevel : PANIC;
3491}
3492