/*-------------------------------------------------------------------------
 *
 * File-processing utility routines.
 *
 * Assorted utility functions to work on files.
 *
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/common/file_utils.c
 *
 *-------------------------------------------------------------------------
 */
15#include "postgres_fe.h"
16
17#include <dirent.h>
18#include <fcntl.h>
19#include <sys/stat.h>
20#include <unistd.h>
21
22#include "common/file_utils.h"
23#include "common/logging.h"
24
25
/*
 * PG_FLUSH_DATA_WORKS is defined whenever this platform gives us a way to
 * implement pg_flush_data: either sync_file_range(), or posix_fadvise()
 * together with POSIX_FADV_DONTNEED.
 */
#if defined(HAVE_SYNC_FILE_RANGE) || \
	(defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED))
#define PG_FLUSH_DATA_WORKS 1
#endif

/*
 * First server version number in which the WAL directory is called pg_wal;
 * it was named pg_xlog before the rename in version 10.
 */
#define MINIMUM_VERSION_FOR_PG_WAL 100000

/* Forward declarations of the file-local helpers defined further down. */
#ifdef PG_FLUSH_DATA_WORKS
static int	pre_sync_fname(const char *fname, bool isdir);
#endif
static void walkdir(const char *path,
					int (*action) (const char *fname, bool isdir),
					bool process_symlinks);
44
45/*
46 * Issue fsync recursively on PGDATA and all its contents.
47 *
48 * We fsync regular files and directories wherever they are, but we follow
49 * symlinks only for pg_wal (or pg_xlog) and immediately under pg_tblspc.
50 * Other symlinks are presumed to point at files we're not responsible for
51 * fsyncing, and might not have privileges to write at all.
52 *
53 * serverVersion indicates the version of the server to be fsync'd.
54 *
55 * Errors are reported but not considered fatal.
56 */
57void
58fsync_pgdata(const char *pg_data,
59 int serverVersion)
60{
61 bool xlog_is_symlink;
62 char pg_wal[MAXPGPATH];
63 char pg_tblspc[MAXPGPATH];
64
65 /* handle renaming of pg_xlog to pg_wal in post-10 clusters */
66 snprintf(pg_wal, MAXPGPATH, "%s/%s", pg_data,
67 serverVersion < MINIMUM_VERSION_FOR_PG_WAL ? "pg_xlog" : "pg_wal");
68 snprintf(pg_tblspc, MAXPGPATH, "%s/pg_tblspc", pg_data);
69
70 /*
71 * If pg_wal is a symlink, we'll need to recurse into it separately,
72 * because the first walkdir below will ignore it.
73 */
74 xlog_is_symlink = false;
75
76#ifndef WIN32
77 {
78 struct stat st;
79
80 if (lstat(pg_wal, &st) < 0)
81 pg_log_error("could not stat file \"%s\": %m", pg_wal);
82 else if (S_ISLNK(st.st_mode))
83 xlog_is_symlink = true;
84 }
85#else
86 if (pgwin32_is_junction(pg_wal))
87 xlog_is_symlink = true;
88#endif
89
90 /*
91 * If possible, hint to the kernel that we're soon going to fsync the data
92 * directory and its contents.
93 */
94#ifdef PG_FLUSH_DATA_WORKS
95 walkdir(pg_data, pre_sync_fname, false);
96 if (xlog_is_symlink)
97 walkdir(pg_wal, pre_sync_fname, false);
98 walkdir(pg_tblspc, pre_sync_fname, true);
99#endif
100
101 /*
102 * Now we do the fsync()s in the same order.
103 *
104 * The main call ignores symlinks, so in addition to specially processing
105 * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
106 * process_symlinks = true. Note that if there are any plain directories
107 * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
108 * so we don't worry about optimizing it.
109 */
110 walkdir(pg_data, fsync_fname, false);
111 if (xlog_is_symlink)
112 walkdir(pg_wal, fsync_fname, false);
113 walkdir(pg_tblspc, fsync_fname, true);
114}
115
116/*
117 * Issue fsync recursively on the given directory and all its contents.
118 *
119 * This is a convenient wrapper on top of walkdir().
120 */
121void
122fsync_dir_recurse(const char *dir)
123{
124 /*
125 * If possible, hint to the kernel that we're soon going to fsync the data
126 * directory and its contents.
127 */
128#ifdef PG_FLUSH_DATA_WORKS
129 walkdir(dir, pre_sync_fname, false);
130#endif
131
132 walkdir(dir, fsync_fname, false);
133}
134
135/*
136 * walkdir: recursively walk a directory, applying the action to each
137 * regular file and directory (including the named directory itself).
138 *
139 * If process_symlinks is true, the action and recursion are also applied
140 * to regular files and directories that are pointed to by symlinks in the
141 * given directory; otherwise symlinks are ignored. Symlinks are always
142 * ignored in subdirectories, ie we intentionally don't pass down the
143 * process_symlinks flag to recursive calls.
144 *
145 * Errors are reported but not considered fatal.
146 *
147 * See also walkdir in fd.c, which is a backend version of this logic.
148 */
149static void
150walkdir(const char *path,
151 int (*action) (const char *fname, bool isdir),
152 bool process_symlinks)
153{
154 DIR *dir;
155 struct dirent *de;
156
157 dir = opendir(path);
158 if (dir == NULL)
159 {
160 pg_log_error("could not open directory \"%s\": %m", path);
161 return;
162 }
163
164 while (errno = 0, (de = readdir(dir)) != NULL)
165 {
166 char subpath[MAXPGPATH * 2];
167 struct stat fst;
168 int sret;
169
170 if (strcmp(de->d_name, ".") == 0 ||
171 strcmp(de->d_name, "..") == 0)
172 continue;
173
174 snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
175
176 if (process_symlinks)
177 sret = stat(subpath, &fst);
178 else
179 sret = lstat(subpath, &fst);
180
181 if (sret < 0)
182 {
183 pg_log_error("could not stat file \"%s\": %m", subpath);
184 continue;
185 }
186
187 if (S_ISREG(fst.st_mode))
188 (*action) (subpath, false);
189 else if (S_ISDIR(fst.st_mode))
190 walkdir(subpath, action, false);
191 }
192
193 if (errno)
194 pg_log_error("could not read directory \"%s\": %m", path);
195
196 (void) closedir(dir);
197
198 /*
199 * It's important to fsync the destination directory itself as individual
200 * file fsyncs don't guarantee that the directory entry for the file is
201 * synced. Recent versions of ext4 have made the window much wider but
202 * it's been an issue for ext3 and other filesystems in the past.
203 */
204 (*action) (path, true);
205}
206
/*
 * pre_sync_fname -- hint to the OS that this file is about to be fsync'd.
 *
 * Returns 0 on success, and also when the file cannot be opened because of
 * EACCES (or EISDIR, for a directory); those failures are ignored silently.
 * Any other failure to open is reported non-fatally and yields -1.  Errors
 * from the hint call itself are ignored.
 *
 * Only compiled when PG_FLUSH_DATA_WORKS is defined.
 */
#ifdef PG_FLUSH_DATA_WORKS

static int
pre_sync_fname(const char *fname, bool isdir)
{
	int			fd = open(fname, O_RDONLY | PG_BINARY, 0);

	if (fd < 0)
	{
		/* unreadable files and unopenable directories are skipped quietly */
		bool		ignorable = (errno == EACCES) ||
			(isdir && errno == EISDIR);

		if (!ignorable)
		{
			pg_log_error("could not open file \"%s\": %m", fname);
			return -1;
		}
		return 0;
	}

	/*
	 * Mirror what pg_flush_data() would do in the backend: sync_file_range
	 * is the preferred mechanism, with posix_fadvise as the fallback.  The
	 * result is discarded because this is only a hint.
	 */
#if defined(HAVE_SYNC_FILE_RANGE)
	(void) sync_file_range(fd, 0, 0, SYNC_FILE_RANGE_WRITE);
#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
	(void) posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
#else
#error PG_FLUSH_DATA_WORKS should not have been defined
#endif

	(void) close(fd);
	return 0;
}

#endif							/* PG_FLUSH_DATA_WORKS */
248
249/*
250 * fsync_fname -- Try to fsync a file or directory
251 *
252 * Ignores errors trying to open unreadable files, or trying to fsync
253 * directories on systems where that isn't allowed/required. Reports
254 * other errors non-fatally.
255 */
256int
257fsync_fname(const char *fname, bool isdir)
258{
259 int fd;
260 int flags;
261 int returncode;
262
263 /*
264 * Some OSs require directories to be opened read-only whereas other
265 * systems don't allow us to fsync files opened read-only; so we need both
266 * cases here. Using O_RDWR will cause us to fail to fsync files that are
267 * not writable by our userid, but we assume that's OK.
268 */
269 flags = PG_BINARY;
270 if (!isdir)
271 flags |= O_RDWR;
272 else
273 flags |= O_RDONLY;
274
275 /*
276 * Open the file, silently ignoring errors about unreadable files (or
277 * unsupported operations, e.g. opening a directory under Windows), and
278 * logging others.
279 */
280 fd = open(fname, flags, 0);
281 if (fd < 0)
282 {
283 if (errno == EACCES || (isdir && errno == EISDIR))
284 return 0;
285 pg_log_error("could not open file \"%s\": %m", fname);
286 return -1;
287 }
288
289 returncode = fsync(fd);
290
291 /*
292 * Some OSes don't allow us to fsync directories at all, so we can ignore
293 * those errors. Anything else needs to be reported.
294 */
295 if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
296 {
297 pg_log_error("could not fsync file \"%s\": %m", fname);
298 (void) close(fd);
299 return -1;
300 }
301
302 (void) close(fd);
303 return 0;
304}
305
306/*
307 * fsync_parent_path -- fsync the parent path of a file or directory
308 *
309 * This is aimed at making file operations persistent on disk in case of
310 * an OS crash or power failure.
311 */
312int
313fsync_parent_path(const char *fname)
314{
315 char parentpath[MAXPGPATH];
316
317 strlcpy(parentpath, fname, MAXPGPATH);
318 get_parent_directory(parentpath);
319
320 /*
321 * get_parent_directory() returns an empty string if the input argument is
322 * just a file name (see comments in path.c), so handle that as being the
323 * current directory.
324 */
325 if (strlen(parentpath) == 0)
326 strlcpy(parentpath, ".", MAXPGPATH);
327
328 if (fsync_fname(parentpath, true) != 0)
329 return -1;
330
331 return 0;
332}
333
334/*
335 * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
336 *
337 * Wrapper around rename, similar to the backend version.
338 */
339int
340durable_rename(const char *oldfile, const char *newfile)
341{
342 int fd;
343
344 /*
345 * First fsync the old and target path (if it exists), to ensure that they
346 * are properly persistent on disk. Syncing the target file is not
347 * strictly necessary, but it makes it easier to reason about crashes;
348 * because it's then guaranteed that either source or target file exists
349 * after a crash.
350 */
351 if (fsync_fname(oldfile, false) != 0)
352 return -1;
353
354 fd = open(newfile, PG_BINARY | O_RDWR, 0);
355 if (fd < 0)
356 {
357 if (errno != ENOENT)
358 {
359 pg_log_error("could not open file \"%s\": %m", newfile);
360 return -1;
361 }
362 }
363 else
364 {
365 if (fsync(fd) != 0)
366 {
367 pg_log_error("could not fsync file \"%s\": %m", newfile);
368 close(fd);
369 return -1;
370 }
371 close(fd);
372 }
373
374 /* Time to do the real deal... */
375 if (rename(oldfile, newfile) != 0)
376 {
377 pg_log_error("could not rename file \"%s\" to \"%s\": %m",
378 oldfile, newfile);
379 return -1;
380 }
381
382 /*
383 * To guarantee renaming the file is persistent, fsync the file with its
384 * new name, and its containing directory.
385 */
386 if (fsync_fname(newfile, false) != 0)
387 return -1;
388
389 if (fsync_parent_path(newfile) != 0)
390 return -1;
391
392 return 0;
393}
394