| 1 | /*------------------------------------------------------------------------- |
| 2 | * |
| 3 | * File-processing utility routines. |
| 4 | * |
| 5 | * Assorted utility functions to work on files. |
| 6 | * |
| 7 | * |
| 8 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
| 9 | * Portions Copyright (c) 1994, Regents of the University of California |
| 10 | * |
| 11 | * src/common/file_utils.c |
| 12 | * |
| 13 | *------------------------------------------------------------------------- |
| 14 | */ |
| 15 | #include "postgres_fe.h" |
| 16 | |
| 17 | #include <dirent.h> |
| 18 | #include <fcntl.h> |
| 19 | #include <sys/stat.h> |
| 20 | #include <unistd.h> |
| 21 | |
| 22 | #include "common/file_utils.h" |
| 23 | #include "common/logging.h" |
| 24 | |
| 25 | |
/*
 * PG_FLUSH_DATA_WORKS is defined whenever one of the mechanisms used by
 * pre_sync_fname() is available: sync_file_range(), or posix_fadvise()
 * together with POSIX_FADV_DONTNEED.
 */
#if defined(HAVE_SYNC_FILE_RANGE) || \
	(defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED))
#define PG_FLUSH_DATA_WORKS 1
#endif

/*
 * Lowest server version number whose WAL directory is named pg_wal; older
 * servers call it pg_xlog (the rename happened in version 10).
 */
#define MINIMUM_VERSION_FOR_PG_WAL 100000

/* Forward declarations of the file-local helpers. */
static void walkdir(const char *path,
					int (*action) (const char *fname, bool isdir),
					bool process_symlinks);
#ifdef PG_FLUSH_DATA_WORKS
static int	pre_sync_fname(const char *fname, bool isdir);
#endif
| 44 | |
| 45 | /* |
| 46 | * Issue fsync recursively on PGDATA and all its contents. |
| 47 | * |
| 48 | * We fsync regular files and directories wherever they are, but we follow |
| 49 | * symlinks only for pg_wal (or pg_xlog) and immediately under pg_tblspc. |
| 50 | * Other symlinks are presumed to point at files we're not responsible for |
| 51 | * fsyncing, and might not have privileges to write at all. |
| 52 | * |
| 53 | * serverVersion indicates the version of the server to be fsync'd. |
| 54 | * |
| 55 | * Errors are reported but not considered fatal. |
| 56 | */ |
| 57 | void |
| 58 | fsync_pgdata(const char *pg_data, |
| 59 | int serverVersion) |
| 60 | { |
| 61 | bool xlog_is_symlink; |
| 62 | char pg_wal[MAXPGPATH]; |
| 63 | char pg_tblspc[MAXPGPATH]; |
| 64 | |
| 65 | /* handle renaming of pg_xlog to pg_wal in post-10 clusters */ |
| 66 | snprintf(pg_wal, MAXPGPATH, "%s/%s" , pg_data, |
| 67 | serverVersion < MINIMUM_VERSION_FOR_PG_WAL ? "pg_xlog" : "pg_wal" ); |
| 68 | snprintf(pg_tblspc, MAXPGPATH, "%s/pg_tblspc" , pg_data); |
| 69 | |
| 70 | /* |
| 71 | * If pg_wal is a symlink, we'll need to recurse into it separately, |
| 72 | * because the first walkdir below will ignore it. |
| 73 | */ |
| 74 | xlog_is_symlink = false; |
| 75 | |
| 76 | #ifndef WIN32 |
| 77 | { |
| 78 | struct stat st; |
| 79 | |
| 80 | if (lstat(pg_wal, &st) < 0) |
| 81 | pg_log_error("could not stat file \"%s\": %m" , pg_wal); |
| 82 | else if (S_ISLNK(st.st_mode)) |
| 83 | xlog_is_symlink = true; |
| 84 | } |
| 85 | #else |
| 86 | if (pgwin32_is_junction(pg_wal)) |
| 87 | xlog_is_symlink = true; |
| 88 | #endif |
| 89 | |
| 90 | /* |
| 91 | * If possible, hint to the kernel that we're soon going to fsync the data |
| 92 | * directory and its contents. |
| 93 | */ |
| 94 | #ifdef PG_FLUSH_DATA_WORKS |
| 95 | walkdir(pg_data, pre_sync_fname, false); |
| 96 | if (xlog_is_symlink) |
| 97 | walkdir(pg_wal, pre_sync_fname, false); |
| 98 | walkdir(pg_tblspc, pre_sync_fname, true); |
| 99 | #endif |
| 100 | |
| 101 | /* |
| 102 | * Now we do the fsync()s in the same order. |
| 103 | * |
| 104 | * The main call ignores symlinks, so in addition to specially processing |
| 105 | * pg_wal if it's a symlink, pg_tblspc has to be visited separately with |
| 106 | * process_symlinks = true. Note that if there are any plain directories |
| 107 | * in pg_tblspc, they'll get fsync'd twice. That's not an expected case |
| 108 | * so we don't worry about optimizing it. |
| 109 | */ |
| 110 | walkdir(pg_data, fsync_fname, false); |
| 111 | if (xlog_is_symlink) |
| 112 | walkdir(pg_wal, fsync_fname, false); |
| 113 | walkdir(pg_tblspc, fsync_fname, true); |
| 114 | } |
| 115 | |
| 116 | /* |
| 117 | * Issue fsync recursively on the given directory and all its contents. |
| 118 | * |
| 119 | * This is a convenient wrapper on top of walkdir(). |
| 120 | */ |
| 121 | void |
| 122 | fsync_dir_recurse(const char *dir) |
| 123 | { |
| 124 | /* |
| 125 | * If possible, hint to the kernel that we're soon going to fsync the data |
| 126 | * directory and its contents. |
| 127 | */ |
| 128 | #ifdef PG_FLUSH_DATA_WORKS |
| 129 | walkdir(dir, pre_sync_fname, false); |
| 130 | #endif |
| 131 | |
| 132 | walkdir(dir, fsync_fname, false); |
| 133 | } |
| 134 | |
| 135 | /* |
| 136 | * walkdir: recursively walk a directory, applying the action to each |
| 137 | * regular file and directory (including the named directory itself). |
| 138 | * |
| 139 | * If process_symlinks is true, the action and recursion are also applied |
| 140 | * to regular files and directories that are pointed to by symlinks in the |
| 141 | * given directory; otherwise symlinks are ignored. Symlinks are always |
| 142 | * ignored in subdirectories, ie we intentionally don't pass down the |
| 143 | * process_symlinks flag to recursive calls. |
| 144 | * |
| 145 | * Errors are reported but not considered fatal. |
| 146 | * |
| 147 | * See also walkdir in fd.c, which is a backend version of this logic. |
| 148 | */ |
| 149 | static void |
| 150 | walkdir(const char *path, |
| 151 | int (*action) (const char *fname, bool isdir), |
| 152 | bool process_symlinks) |
| 153 | { |
| 154 | DIR *dir; |
| 155 | struct dirent *de; |
| 156 | |
| 157 | dir = opendir(path); |
| 158 | if (dir == NULL) |
| 159 | { |
| 160 | pg_log_error("could not open directory \"%s\": %m" , path); |
| 161 | return; |
| 162 | } |
| 163 | |
| 164 | while (errno = 0, (de = readdir(dir)) != NULL) |
| 165 | { |
| 166 | char subpath[MAXPGPATH * 2]; |
| 167 | struct stat fst; |
| 168 | int sret; |
| 169 | |
| 170 | if (strcmp(de->d_name, "." ) == 0 || |
| 171 | strcmp(de->d_name, ".." ) == 0) |
| 172 | continue; |
| 173 | |
| 174 | snprintf(subpath, sizeof(subpath), "%s/%s" , path, de->d_name); |
| 175 | |
| 176 | if (process_symlinks) |
| 177 | sret = stat(subpath, &fst); |
| 178 | else |
| 179 | sret = lstat(subpath, &fst); |
| 180 | |
| 181 | if (sret < 0) |
| 182 | { |
| 183 | pg_log_error("could not stat file \"%s\": %m" , subpath); |
| 184 | continue; |
| 185 | } |
| 186 | |
| 187 | if (S_ISREG(fst.st_mode)) |
| 188 | (*action) (subpath, false); |
| 189 | else if (S_ISDIR(fst.st_mode)) |
| 190 | walkdir(subpath, action, false); |
| 191 | } |
| 192 | |
| 193 | if (errno) |
| 194 | pg_log_error("could not read directory \"%s\": %m" , path); |
| 195 | |
| 196 | (void) closedir(dir); |
| 197 | |
| 198 | /* |
| 199 | * It's important to fsync the destination directory itself as individual |
| 200 | * file fsyncs don't guarantee that the directory entry for the file is |
| 201 | * synced. Recent versions of ext4 have made the window much wider but |
| 202 | * it's been an issue for ext3 and other filesystems in the past. |
| 203 | */ |
| 204 | (*action) (path, true); |
| 205 | } |
| 206 | |
#ifdef PG_FLUSH_DATA_WORKS

/*
 * pre_sync_fname -- hint to the OS that this file is about to be fsync()'d.
 *
 * Returns 0 on success, and also when the file could not be opened for a
 * reason we deliberately ignore (EACCES, or EISDIR when isdir is true).
 * Any other failure to open is reported non-fatally and yields -1.
 */
static int
pre_sync_fname(const char *fname, bool isdir)
{
	int			fd = open(fname, O_RDONLY | PG_BINARY, 0);

	if (fd < 0)
	{
		bool		ignorable;

		ignorable = (errno == EACCES) || (isdir && errno == EISDIR);
		if (!ignorable)
		{
			pg_log_error("could not open file \"%s\": %m", fname);
			return -1;
		}
		return 0;
	}

	/*
	 * Do what pg_flush_data() would do in the backend: use sync_file_range
	 * when available, else fall back to posix_fadvise.  The result is
	 * ignored since this is only a hint.
	 */
#if defined(HAVE_SYNC_FILE_RANGE)
	(void) sync_file_range(fd, 0, 0, SYNC_FILE_RANGE_WRITE);
#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
	(void) posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
#else
#error PG_FLUSH_DATA_WORKS should not have been defined
#endif

	(void) close(fd);
	return 0;
}

#endif							/* PG_FLUSH_DATA_WORKS */
| 248 | |
| 249 | /* |
| 250 | * fsync_fname -- Try to fsync a file or directory |
| 251 | * |
| 252 | * Ignores errors trying to open unreadable files, or trying to fsync |
| 253 | * directories on systems where that isn't allowed/required. Reports |
| 254 | * other errors non-fatally. |
| 255 | */ |
| 256 | int |
| 257 | fsync_fname(const char *fname, bool isdir) |
| 258 | { |
| 259 | int fd; |
| 260 | int flags; |
| 261 | int returncode; |
| 262 | |
| 263 | /* |
| 264 | * Some OSs require directories to be opened read-only whereas other |
| 265 | * systems don't allow us to fsync files opened read-only; so we need both |
| 266 | * cases here. Using O_RDWR will cause us to fail to fsync files that are |
| 267 | * not writable by our userid, but we assume that's OK. |
| 268 | */ |
| 269 | flags = PG_BINARY; |
| 270 | if (!isdir) |
| 271 | flags |= O_RDWR; |
| 272 | else |
| 273 | flags |= O_RDONLY; |
| 274 | |
| 275 | /* |
| 276 | * Open the file, silently ignoring errors about unreadable files (or |
| 277 | * unsupported operations, e.g. opening a directory under Windows), and |
| 278 | * logging others. |
| 279 | */ |
| 280 | fd = open(fname, flags, 0); |
| 281 | if (fd < 0) |
| 282 | { |
| 283 | if (errno == EACCES || (isdir && errno == EISDIR)) |
| 284 | return 0; |
| 285 | pg_log_error("could not open file \"%s\": %m" , fname); |
| 286 | return -1; |
| 287 | } |
| 288 | |
| 289 | returncode = fsync(fd); |
| 290 | |
| 291 | /* |
| 292 | * Some OSes don't allow us to fsync directories at all, so we can ignore |
| 293 | * those errors. Anything else needs to be reported. |
| 294 | */ |
| 295 | if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL))) |
| 296 | { |
| 297 | pg_log_error("could not fsync file \"%s\": %m" , fname); |
| 298 | (void) close(fd); |
| 299 | return -1; |
| 300 | } |
| 301 | |
| 302 | (void) close(fd); |
| 303 | return 0; |
| 304 | } |
| 305 | |
| 306 | /* |
| 307 | * fsync_parent_path -- fsync the parent path of a file or directory |
| 308 | * |
| 309 | * This is aimed at making file operations persistent on disk in case of |
| 310 | * an OS crash or power failure. |
| 311 | */ |
| 312 | int |
| 313 | fsync_parent_path(const char *fname) |
| 314 | { |
| 315 | char parentpath[MAXPGPATH]; |
| 316 | |
| 317 | strlcpy(parentpath, fname, MAXPGPATH); |
| 318 | get_parent_directory(parentpath); |
| 319 | |
| 320 | /* |
| 321 | * get_parent_directory() returns an empty string if the input argument is |
| 322 | * just a file name (see comments in path.c), so handle that as being the |
| 323 | * current directory. |
| 324 | */ |
| 325 | if (strlen(parentpath) == 0) |
| 326 | strlcpy(parentpath, "." , MAXPGPATH); |
| 327 | |
| 328 | if (fsync_fname(parentpath, true) != 0) |
| 329 | return -1; |
| 330 | |
| 331 | return 0; |
| 332 | } |
| 333 | |
| 334 | /* |
| 335 | * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability |
| 336 | * |
| 337 | * Wrapper around rename, similar to the backend version. |
| 338 | */ |
| 339 | int |
| 340 | durable_rename(const char *oldfile, const char *newfile) |
| 341 | { |
| 342 | int fd; |
| 343 | |
| 344 | /* |
| 345 | * First fsync the old and target path (if it exists), to ensure that they |
| 346 | * are properly persistent on disk. Syncing the target file is not |
| 347 | * strictly necessary, but it makes it easier to reason about crashes; |
| 348 | * because it's then guaranteed that either source or target file exists |
| 349 | * after a crash. |
| 350 | */ |
| 351 | if (fsync_fname(oldfile, false) != 0) |
| 352 | return -1; |
| 353 | |
| 354 | fd = open(newfile, PG_BINARY | O_RDWR, 0); |
| 355 | if (fd < 0) |
| 356 | { |
| 357 | if (errno != ENOENT) |
| 358 | { |
| 359 | pg_log_error("could not open file \"%s\": %m" , newfile); |
| 360 | return -1; |
| 361 | } |
| 362 | } |
| 363 | else |
| 364 | { |
| 365 | if (fsync(fd) != 0) |
| 366 | { |
| 367 | pg_log_error("could not fsync file \"%s\": %m" , newfile); |
| 368 | close(fd); |
| 369 | return -1; |
| 370 | } |
| 371 | close(fd); |
| 372 | } |
| 373 | |
| 374 | /* Time to do the real deal... */ |
| 375 | if (rename(oldfile, newfile) != 0) |
| 376 | { |
| 377 | pg_log_error("could not rename file \"%s\" to \"%s\": %m" , |
| 378 | oldfile, newfile); |
| 379 | return -1; |
| 380 | } |
| 381 | |
| 382 | /* |
| 383 | * To guarantee renaming the file is persistent, fsync the file with its |
| 384 | * new name, and its containing directory. |
| 385 | */ |
| 386 | if (fsync_fname(newfile, false) != 0) |
| 387 | return -1; |
| 388 | |
| 389 | if (fsync_parent_path(newfile) != 0) |
| 390 | return -1; |
| 391 | |
| 392 | return 0; |
| 393 | } |
| 394 | |