1 | /*------------------------------------------------------------------------- |
2 | * |
3 | * File-processing utility routines. |
4 | * |
5 | * Assorted utility functions to work on files. |
6 | * |
7 | * |
8 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
9 | * Portions Copyright (c) 1994, Regents of the University of California |
10 | * |
11 | * src/common/file_utils.c |
12 | * |
13 | *------------------------------------------------------------------------- |
14 | */ |
15 | #include "postgres_fe.h" |
16 | |
17 | #include <dirent.h> |
18 | #include <fcntl.h> |
19 | #include <sys/stat.h> |
20 | #include <unistd.h> |
21 | |
22 | #include "common/file_utils.h" |
23 | #include "common/logging.h" |
24 | |
25 | |
/*
 * PG_FLUSH_DATA_WORKS is defined when this build has some way to implement
 * the "get ready to fsync" hint: either sync_file_range(), or
 * posix_fadvise() with POSIX_FADV_DONTNEED.
 */
#if defined(HAVE_SYNC_FILE_RANGE) || \
	(defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED))
#define PG_FLUSH_DATA_WORKS 1
#endif

/*
 * First server version number whose WAL directory is named pg_wal; servers
 * older than this (i.e. before version 10) use pg_xlog instead.
 */
#define MINIMUM_VERSION_FOR_PG_WAL	100000

/* Forward declarations of the file-local helpers. */
static void walkdir(const char *path,
					int (*action) (const char *fname, bool isdir),
					bool process_symlinks);
#ifdef PG_FLUSH_DATA_WORKS
static int	pre_sync_fname(const char *fname, bool isdir);
#endif
44 | |
45 | /* |
46 | * Issue fsync recursively on PGDATA and all its contents. |
47 | * |
48 | * We fsync regular files and directories wherever they are, but we follow |
49 | * symlinks only for pg_wal (or pg_xlog) and immediately under pg_tblspc. |
50 | * Other symlinks are presumed to point at files we're not responsible for |
51 | * fsyncing, and might not have privileges to write at all. |
52 | * |
53 | * serverVersion indicates the version of the server to be fsync'd. |
54 | * |
55 | * Errors are reported but not considered fatal. |
56 | */ |
57 | void |
58 | fsync_pgdata(const char *pg_data, |
59 | int serverVersion) |
60 | { |
61 | bool xlog_is_symlink; |
62 | char pg_wal[MAXPGPATH]; |
63 | char pg_tblspc[MAXPGPATH]; |
64 | |
65 | /* handle renaming of pg_xlog to pg_wal in post-10 clusters */ |
66 | snprintf(pg_wal, MAXPGPATH, "%s/%s" , pg_data, |
67 | serverVersion < MINIMUM_VERSION_FOR_PG_WAL ? "pg_xlog" : "pg_wal" ); |
68 | snprintf(pg_tblspc, MAXPGPATH, "%s/pg_tblspc" , pg_data); |
69 | |
70 | /* |
71 | * If pg_wal is a symlink, we'll need to recurse into it separately, |
72 | * because the first walkdir below will ignore it. |
73 | */ |
74 | xlog_is_symlink = false; |
75 | |
76 | #ifndef WIN32 |
77 | { |
78 | struct stat st; |
79 | |
80 | if (lstat(pg_wal, &st) < 0) |
81 | pg_log_error("could not stat file \"%s\": %m" , pg_wal); |
82 | else if (S_ISLNK(st.st_mode)) |
83 | xlog_is_symlink = true; |
84 | } |
85 | #else |
86 | if (pgwin32_is_junction(pg_wal)) |
87 | xlog_is_symlink = true; |
88 | #endif |
89 | |
90 | /* |
91 | * If possible, hint to the kernel that we're soon going to fsync the data |
92 | * directory and its contents. |
93 | */ |
94 | #ifdef PG_FLUSH_DATA_WORKS |
95 | walkdir(pg_data, pre_sync_fname, false); |
96 | if (xlog_is_symlink) |
97 | walkdir(pg_wal, pre_sync_fname, false); |
98 | walkdir(pg_tblspc, pre_sync_fname, true); |
99 | #endif |
100 | |
101 | /* |
102 | * Now we do the fsync()s in the same order. |
103 | * |
104 | * The main call ignores symlinks, so in addition to specially processing |
105 | * pg_wal if it's a symlink, pg_tblspc has to be visited separately with |
106 | * process_symlinks = true. Note that if there are any plain directories |
107 | * in pg_tblspc, they'll get fsync'd twice. That's not an expected case |
108 | * so we don't worry about optimizing it. |
109 | */ |
110 | walkdir(pg_data, fsync_fname, false); |
111 | if (xlog_is_symlink) |
112 | walkdir(pg_wal, fsync_fname, false); |
113 | walkdir(pg_tblspc, fsync_fname, true); |
114 | } |
115 | |
116 | /* |
117 | * Issue fsync recursively on the given directory and all its contents. |
118 | * |
119 | * This is a convenient wrapper on top of walkdir(). |
120 | */ |
121 | void |
122 | fsync_dir_recurse(const char *dir) |
123 | { |
124 | /* |
125 | * If possible, hint to the kernel that we're soon going to fsync the data |
126 | * directory and its contents. |
127 | */ |
128 | #ifdef PG_FLUSH_DATA_WORKS |
129 | walkdir(dir, pre_sync_fname, false); |
130 | #endif |
131 | |
132 | walkdir(dir, fsync_fname, false); |
133 | } |
134 | |
135 | /* |
136 | * walkdir: recursively walk a directory, applying the action to each |
137 | * regular file and directory (including the named directory itself). |
138 | * |
139 | * If process_symlinks is true, the action and recursion are also applied |
140 | * to regular files and directories that are pointed to by symlinks in the |
141 | * given directory; otherwise symlinks are ignored. Symlinks are always |
142 | * ignored in subdirectories, ie we intentionally don't pass down the |
143 | * process_symlinks flag to recursive calls. |
144 | * |
145 | * Errors are reported but not considered fatal. |
146 | * |
147 | * See also walkdir in fd.c, which is a backend version of this logic. |
148 | */ |
149 | static void |
150 | walkdir(const char *path, |
151 | int (*action) (const char *fname, bool isdir), |
152 | bool process_symlinks) |
153 | { |
154 | DIR *dir; |
155 | struct dirent *de; |
156 | |
157 | dir = opendir(path); |
158 | if (dir == NULL) |
159 | { |
160 | pg_log_error("could not open directory \"%s\": %m" , path); |
161 | return; |
162 | } |
163 | |
164 | while (errno = 0, (de = readdir(dir)) != NULL) |
165 | { |
166 | char subpath[MAXPGPATH * 2]; |
167 | struct stat fst; |
168 | int sret; |
169 | |
170 | if (strcmp(de->d_name, "." ) == 0 || |
171 | strcmp(de->d_name, ".." ) == 0) |
172 | continue; |
173 | |
174 | snprintf(subpath, sizeof(subpath), "%s/%s" , path, de->d_name); |
175 | |
176 | if (process_symlinks) |
177 | sret = stat(subpath, &fst); |
178 | else |
179 | sret = lstat(subpath, &fst); |
180 | |
181 | if (sret < 0) |
182 | { |
183 | pg_log_error("could not stat file \"%s\": %m" , subpath); |
184 | continue; |
185 | } |
186 | |
187 | if (S_ISREG(fst.st_mode)) |
188 | (*action) (subpath, false); |
189 | else if (S_ISDIR(fst.st_mode)) |
190 | walkdir(subpath, action, false); |
191 | } |
192 | |
193 | if (errno) |
194 | pg_log_error("could not read directory \"%s\": %m" , path); |
195 | |
196 | (void) closedir(dir); |
197 | |
198 | /* |
199 | * It's important to fsync the destination directory itself as individual |
200 | * file fsyncs don't guarantee that the directory entry for the file is |
201 | * synced. Recent versions of ext4 have made the window much wider but |
202 | * it's been an issue for ext3 and other filesystems in the past. |
203 | */ |
204 | (*action) (path, true); |
205 | } |
206 | |
/*
 * Tell the OS that the file "fname" is about to be fsync()'d, so that it can
 * start getting ready.
 *
 * A file that cannot be opened for lack of permission is silently skipped,
 * as is a directory whose open fails with EISDIR.  Any other open failure is
 * logged and reported through the return value, without being fatal.
 *
 * Returns 0 on success or for an ignored failure, -1 otherwise.
 */
#ifdef PG_FLUSH_DATA_WORKS

static int
pre_sync_fname(const char *fname, bool isdir)
{
	int			hint_fd = open(fname, O_RDONLY | PG_BINARY, 0);

	if (hint_fd >= 0)
	{
		/*
		 * Mirror what pg_flush_data() would do in the backend:
		 * sync_file_range is the first choice, posix_fadvise the fallback.
		 * The result is discarded since this is nothing more than a hint.
		 */
#if defined(HAVE_SYNC_FILE_RANGE)
		(void) sync_file_range(hint_fd, 0, 0, SYNC_FILE_RANGE_WRITE);
#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
		(void) posix_fadvise(hint_fd, 0, 0, POSIX_FADV_DONTNEED);
#else
#error PG_FLUSH_DATA_WORKS should not have been defined
#endif

		(void) close(hint_fd);
		return 0;
	}

	/* The open failed: decide whether that is worth reporting. */
	if (errno == EACCES)
		return 0;
	if (isdir && errno == EISDIR)
		return 0;

	pg_log_error("could not open file \"%s\": %m", fname);
	return -1;
}

#endif							/* PG_FLUSH_DATA_WORKS */
248 | |
249 | /* |
250 | * fsync_fname -- Try to fsync a file or directory |
251 | * |
252 | * Ignores errors trying to open unreadable files, or trying to fsync |
253 | * directories on systems where that isn't allowed/required. Reports |
254 | * other errors non-fatally. |
255 | */ |
256 | int |
257 | fsync_fname(const char *fname, bool isdir) |
258 | { |
259 | int fd; |
260 | int flags; |
261 | int returncode; |
262 | |
263 | /* |
264 | * Some OSs require directories to be opened read-only whereas other |
265 | * systems don't allow us to fsync files opened read-only; so we need both |
266 | * cases here. Using O_RDWR will cause us to fail to fsync files that are |
267 | * not writable by our userid, but we assume that's OK. |
268 | */ |
269 | flags = PG_BINARY; |
270 | if (!isdir) |
271 | flags |= O_RDWR; |
272 | else |
273 | flags |= O_RDONLY; |
274 | |
275 | /* |
276 | * Open the file, silently ignoring errors about unreadable files (or |
277 | * unsupported operations, e.g. opening a directory under Windows), and |
278 | * logging others. |
279 | */ |
280 | fd = open(fname, flags, 0); |
281 | if (fd < 0) |
282 | { |
283 | if (errno == EACCES || (isdir && errno == EISDIR)) |
284 | return 0; |
285 | pg_log_error("could not open file \"%s\": %m" , fname); |
286 | return -1; |
287 | } |
288 | |
289 | returncode = fsync(fd); |
290 | |
291 | /* |
292 | * Some OSes don't allow us to fsync directories at all, so we can ignore |
293 | * those errors. Anything else needs to be reported. |
294 | */ |
295 | if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL))) |
296 | { |
297 | pg_log_error("could not fsync file \"%s\": %m" , fname); |
298 | (void) close(fd); |
299 | return -1; |
300 | } |
301 | |
302 | (void) close(fd); |
303 | return 0; |
304 | } |
305 | |
306 | /* |
307 | * fsync_parent_path -- fsync the parent path of a file or directory |
308 | * |
309 | * This is aimed at making file operations persistent on disk in case of |
310 | * an OS crash or power failure. |
311 | */ |
312 | int |
313 | fsync_parent_path(const char *fname) |
314 | { |
315 | char parentpath[MAXPGPATH]; |
316 | |
317 | strlcpy(parentpath, fname, MAXPGPATH); |
318 | get_parent_directory(parentpath); |
319 | |
320 | /* |
321 | * get_parent_directory() returns an empty string if the input argument is |
322 | * just a file name (see comments in path.c), so handle that as being the |
323 | * current directory. |
324 | */ |
325 | if (strlen(parentpath) == 0) |
326 | strlcpy(parentpath, "." , MAXPGPATH); |
327 | |
328 | if (fsync_fname(parentpath, true) != 0) |
329 | return -1; |
330 | |
331 | return 0; |
332 | } |
333 | |
334 | /* |
335 | * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability |
336 | * |
337 | * Wrapper around rename, similar to the backend version. |
338 | */ |
339 | int |
340 | durable_rename(const char *oldfile, const char *newfile) |
341 | { |
342 | int fd; |
343 | |
344 | /* |
345 | * First fsync the old and target path (if it exists), to ensure that they |
346 | * are properly persistent on disk. Syncing the target file is not |
347 | * strictly necessary, but it makes it easier to reason about crashes; |
348 | * because it's then guaranteed that either source or target file exists |
349 | * after a crash. |
350 | */ |
351 | if (fsync_fname(oldfile, false) != 0) |
352 | return -1; |
353 | |
354 | fd = open(newfile, PG_BINARY | O_RDWR, 0); |
355 | if (fd < 0) |
356 | { |
357 | if (errno != ENOENT) |
358 | { |
359 | pg_log_error("could not open file \"%s\": %m" , newfile); |
360 | return -1; |
361 | } |
362 | } |
363 | else |
364 | { |
365 | if (fsync(fd) != 0) |
366 | { |
367 | pg_log_error("could not fsync file \"%s\": %m" , newfile); |
368 | close(fd); |
369 | return -1; |
370 | } |
371 | close(fd); |
372 | } |
373 | |
374 | /* Time to do the real deal... */ |
375 | if (rename(oldfile, newfile) != 0) |
376 | { |
377 | pg_log_error("could not rename file \"%s\" to \"%s\": %m" , |
378 | oldfile, newfile); |
379 | return -1; |
380 | } |
381 | |
382 | /* |
383 | * To guarantee renaming the file is persistent, fsync the file with its |
384 | * new name, and its containing directory. |
385 | */ |
386 | if (fsync_fname(newfile, false) != 0) |
387 | return -1; |
388 | |
389 | if (fsync_parent_path(newfile) != 0) |
390 | return -1; |
391 | |
392 | return 0; |
393 | } |
394 | |