| 1 | /*------------------------------------------------------------------------- |
| 2 | * |
| 3 | * reinit.c |
| 4 | * Reinitialization of unlogged relations |
| 5 | * |
| 6 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
| 7 | * Portions Copyright (c) 1994, Regents of the University of California |
| 8 | * |
| 9 | * IDENTIFICATION |
| 10 | * src/backend/storage/file/reinit.c |
| 11 | * |
| 12 | *------------------------------------------------------------------------- |
| 13 | */ |
| 14 | |
| 15 | #include "postgres.h" |
| 16 | |
| 17 | #include <unistd.h> |
| 18 | |
| 19 | #include "common/relpath.h" |
| 20 | #include "storage/copydir.h" |
| 21 | #include "storage/fd.h" |
| 22 | #include "storage/reinit.h" |
| 23 | #include "utils/hsearch.h" |
| 24 | #include "utils/memutils.h" |
| 25 | |
/*
 * Forward declarations of the file-local workers: the first handles one
 * tablespace directory, the second one per-database directory inside it.
 * Both take the same "op" bitmask that ResetUnloggedRelations() receives.
 */
static void ResetUnloggedRelationsInTablespaceDir(const char *tsdirname,
                                                  int op);
static void ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname,
                                               int op);
| 30 | |
/*
 * Hash table element used while cleaning up one database directory.  The
 * whole struct serves as both key and entry (keysize and entrysize are set
 * to its size).  "oid" holds the leading digit string of a relation file
 * name; users zero the array before copying the digits in, so the value is
 * always NUL-terminated and zero-padded.
 */
typedef struct
{
    char        oid[OIDCHARS + 1];
} unlogged_relation_entry;
| 35 | |
| 36 | /* |
| 37 | * Reset unlogged relations from before the last restart. |
| 38 | * |
| 39 | * If op includes UNLOGGED_RELATION_CLEANUP, we remove all forks of any |
| 40 | * relation with an "init" fork, except for the "init" fork itself. |
| 41 | * |
| 42 | * If op includes UNLOGGED_RELATION_INIT, we copy the "init" fork to the main |
| 43 | * fork. |
| 44 | */ |
| 45 | void |
| 46 | ResetUnloggedRelations(int op) |
| 47 | { |
| 48 | char temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY)]; |
| 49 | DIR *spc_dir; |
| 50 | struct dirent *spc_de; |
| 51 | MemoryContext tmpctx, |
| 52 | oldctx; |
| 53 | |
| 54 | /* Log it. */ |
| 55 | elog(DEBUG1, "resetting unlogged relations: cleanup %d init %d" , |
| 56 | (op & UNLOGGED_RELATION_CLEANUP) != 0, |
| 57 | (op & UNLOGGED_RELATION_INIT) != 0); |
| 58 | |
| 59 | /* |
| 60 | * Just to be sure we don't leak any memory, let's create a temporary |
| 61 | * memory context for this operation. |
| 62 | */ |
| 63 | tmpctx = AllocSetContextCreate(CurrentMemoryContext, |
| 64 | "ResetUnloggedRelations" , |
| 65 | ALLOCSET_DEFAULT_SIZES); |
| 66 | oldctx = MemoryContextSwitchTo(tmpctx); |
| 67 | |
| 68 | /* |
| 69 | * First process unlogged files in pg_default ($PGDATA/base) |
| 70 | */ |
| 71 | ResetUnloggedRelationsInTablespaceDir("base" , op); |
| 72 | |
| 73 | /* |
| 74 | * Cycle through directories for all non-default tablespaces. |
| 75 | */ |
| 76 | spc_dir = AllocateDir("pg_tblspc" ); |
| 77 | |
| 78 | while ((spc_de = ReadDir(spc_dir, "pg_tblspc" )) != NULL) |
| 79 | { |
| 80 | if (strcmp(spc_de->d_name, "." ) == 0 || |
| 81 | strcmp(spc_de->d_name, ".." ) == 0) |
| 82 | continue; |
| 83 | |
| 84 | snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s" , |
| 85 | spc_de->d_name, TABLESPACE_VERSION_DIRECTORY); |
| 86 | ResetUnloggedRelationsInTablespaceDir(temp_path, op); |
| 87 | } |
| 88 | |
| 89 | FreeDir(spc_dir); |
| 90 | |
| 91 | /* |
| 92 | * Restore memory context. |
| 93 | */ |
| 94 | MemoryContextSwitchTo(oldctx); |
| 95 | MemoryContextDelete(tmpctx); |
| 96 | } |
| 97 | |
| 98 | /* |
| 99 | * Process one tablespace directory for ResetUnloggedRelations |
| 100 | */ |
| 101 | static void |
| 102 | ResetUnloggedRelationsInTablespaceDir(const char *tsdirname, int op) |
| 103 | { |
| 104 | DIR *ts_dir; |
| 105 | struct dirent *de; |
| 106 | char dbspace_path[MAXPGPATH * 2]; |
| 107 | |
| 108 | ts_dir = AllocateDir(tsdirname); |
| 109 | |
| 110 | /* |
| 111 | * If we get ENOENT on a tablespace directory, log it and return. This |
| 112 | * can happen if a previous DROP TABLESPACE crashed between removing the |
| 113 | * tablespace directory and removing the symlink in pg_tblspc. We don't |
| 114 | * really want to prevent database startup in that scenario, so let it |
| 115 | * pass instead. Any other type of error will be reported by ReadDir |
| 116 | * (causing a startup failure). |
| 117 | */ |
| 118 | if (ts_dir == NULL && errno == ENOENT) |
| 119 | { |
| 120 | ereport(LOG, |
| 121 | (errcode_for_file_access(), |
| 122 | errmsg("could not open directory \"%s\": %m" , |
| 123 | tsdirname))); |
| 124 | return; |
| 125 | } |
| 126 | |
| 127 | while ((de = ReadDir(ts_dir, tsdirname)) != NULL) |
| 128 | { |
| 129 | /* |
| 130 | * We're only interested in the per-database directories, which have |
| 131 | * numeric names. Note that this code will also (properly) ignore "." |
| 132 | * and "..". |
| 133 | */ |
| 134 | if (strspn(de->d_name, "0123456789" ) != strlen(de->d_name)) |
| 135 | continue; |
| 136 | |
| 137 | snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s" , |
| 138 | tsdirname, de->d_name); |
| 139 | ResetUnloggedRelationsInDbspaceDir(dbspace_path, op); |
| 140 | } |
| 141 | |
| 142 | FreeDir(ts_dir); |
| 143 | } |
| 144 | |
| 145 | /* |
| 146 | * Process one per-dbspace directory for ResetUnloggedRelations |
| 147 | */ |
| 148 | static void |
| 149 | ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op) |
| 150 | { |
| 151 | DIR *dbspace_dir; |
| 152 | struct dirent *de; |
| 153 | char rm_path[MAXPGPATH * 2]; |
| 154 | |
| 155 | /* Caller must specify at least one operation. */ |
| 156 | Assert((op & (UNLOGGED_RELATION_CLEANUP | UNLOGGED_RELATION_INIT)) != 0); |
| 157 | |
| 158 | /* |
| 159 | * Cleanup is a two-pass operation. First, we go through and identify all |
| 160 | * the files with init forks. Then, we go through again and nuke |
| 161 | * everything with the same OID except the init fork. |
| 162 | */ |
| 163 | if ((op & UNLOGGED_RELATION_CLEANUP) != 0) |
| 164 | { |
| 165 | HTAB *hash; |
| 166 | HASHCTL ctl; |
| 167 | |
| 168 | /* |
| 169 | * It's possible that someone could create a ton of unlogged relations |
| 170 | * in the same database & tablespace, so we'd better use a hash table |
| 171 | * rather than an array or linked list to keep track of which files |
| 172 | * need to be reset. Otherwise, this cleanup operation would be |
| 173 | * O(n^2). |
| 174 | */ |
| 175 | memset(&ctl, 0, sizeof(ctl)); |
| 176 | ctl.keysize = sizeof(unlogged_relation_entry); |
| 177 | ctl.entrysize = sizeof(unlogged_relation_entry); |
| 178 | hash = hash_create("unlogged hash" , 32, &ctl, HASH_ELEM); |
| 179 | |
| 180 | /* Scan the directory. */ |
| 181 | dbspace_dir = AllocateDir(dbspacedirname); |
| 182 | while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL) |
| 183 | { |
| 184 | ForkNumber forkNum; |
| 185 | int oidchars; |
| 186 | unlogged_relation_entry ent; |
| 187 | |
| 188 | /* Skip anything that doesn't look like a relation data file. */ |
| 189 | if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars, |
| 190 | &forkNum)) |
| 191 | continue; |
| 192 | |
| 193 | /* Also skip it unless this is the init fork. */ |
| 194 | if (forkNum != INIT_FORKNUM) |
| 195 | continue; |
| 196 | |
| 197 | /* |
| 198 | * Put the OID portion of the name into the hash table, if it |
| 199 | * isn't already. |
| 200 | */ |
| 201 | memset(ent.oid, 0, sizeof(ent.oid)); |
| 202 | memcpy(ent.oid, de->d_name, oidchars); |
| 203 | hash_search(hash, &ent, HASH_ENTER, NULL); |
| 204 | } |
| 205 | |
| 206 | /* Done with the first pass. */ |
| 207 | FreeDir(dbspace_dir); |
| 208 | |
| 209 | /* |
| 210 | * If we didn't find any init forks, there's no point in continuing; |
| 211 | * we can bail out now. |
| 212 | */ |
| 213 | if (hash_get_num_entries(hash) == 0) |
| 214 | { |
| 215 | hash_destroy(hash); |
| 216 | return; |
| 217 | } |
| 218 | |
| 219 | /* |
| 220 | * Now, make a second pass and remove anything that matches. |
| 221 | */ |
| 222 | dbspace_dir = AllocateDir(dbspacedirname); |
| 223 | while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL) |
| 224 | { |
| 225 | ForkNumber forkNum; |
| 226 | int oidchars; |
| 227 | bool found; |
| 228 | unlogged_relation_entry ent; |
| 229 | |
| 230 | /* Skip anything that doesn't look like a relation data file. */ |
| 231 | if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars, |
| 232 | &forkNum)) |
| 233 | continue; |
| 234 | |
| 235 | /* We never remove the init fork. */ |
| 236 | if (forkNum == INIT_FORKNUM) |
| 237 | continue; |
| 238 | |
| 239 | /* |
| 240 | * See whether the OID portion of the name shows up in the hash |
| 241 | * table. |
| 242 | */ |
| 243 | memset(ent.oid, 0, sizeof(ent.oid)); |
| 244 | memcpy(ent.oid, de->d_name, oidchars); |
| 245 | hash_search(hash, &ent, HASH_FIND, &found); |
| 246 | |
| 247 | /* If so, nuke it! */ |
| 248 | if (found) |
| 249 | { |
| 250 | snprintf(rm_path, sizeof(rm_path), "%s/%s" , |
| 251 | dbspacedirname, de->d_name); |
| 252 | if (unlink(rm_path) < 0) |
| 253 | ereport(ERROR, |
| 254 | (errcode_for_file_access(), |
| 255 | errmsg("could not remove file \"%s\": %m" , |
| 256 | rm_path))); |
| 257 | else |
| 258 | elog(DEBUG2, "unlinked file \"%s\"" , rm_path); |
| 259 | } |
| 260 | } |
| 261 | |
| 262 | /* Cleanup is complete. */ |
| 263 | FreeDir(dbspace_dir); |
| 264 | hash_destroy(hash); |
| 265 | } |
| 266 | |
| 267 | /* |
| 268 | * Initialization happens after cleanup is complete: we copy each init |
| 269 | * fork file to the corresponding main fork file. Note that if we are |
| 270 | * asked to do both cleanup and init, we may never get here: if the |
| 271 | * cleanup code determines that there are no init forks in this dbspace, |
| 272 | * it will return before we get to this point. |
| 273 | */ |
| 274 | if ((op & UNLOGGED_RELATION_INIT) != 0) |
| 275 | { |
| 276 | /* Scan the directory. */ |
| 277 | dbspace_dir = AllocateDir(dbspacedirname); |
| 278 | while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL) |
| 279 | { |
| 280 | ForkNumber forkNum; |
| 281 | int oidchars; |
| 282 | char oidbuf[OIDCHARS + 1]; |
| 283 | char srcpath[MAXPGPATH * 2]; |
| 284 | char dstpath[MAXPGPATH]; |
| 285 | |
| 286 | /* Skip anything that doesn't look like a relation data file. */ |
| 287 | if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars, |
| 288 | &forkNum)) |
| 289 | continue; |
| 290 | |
| 291 | /* Also skip it unless this is the init fork. */ |
| 292 | if (forkNum != INIT_FORKNUM) |
| 293 | continue; |
| 294 | |
| 295 | /* Construct source pathname. */ |
| 296 | snprintf(srcpath, sizeof(srcpath), "%s/%s" , |
| 297 | dbspacedirname, de->d_name); |
| 298 | |
| 299 | /* Construct destination pathname. */ |
| 300 | memcpy(oidbuf, de->d_name, oidchars); |
| 301 | oidbuf[oidchars] = '\0'; |
| 302 | snprintf(dstpath, sizeof(dstpath), "%s/%s%s" , |
| 303 | dbspacedirname, oidbuf, de->d_name + oidchars + 1 + |
| 304 | strlen(forkNames[INIT_FORKNUM])); |
| 305 | |
| 306 | /* OK, we're ready to perform the actual copy. */ |
| 307 | elog(DEBUG2, "copying %s to %s" , srcpath, dstpath); |
| 308 | copy_file(srcpath, dstpath); |
| 309 | } |
| 310 | |
| 311 | FreeDir(dbspace_dir); |
| 312 | |
| 313 | /* |
| 314 | * copy_file() above has already called pg_flush_data() on the files |
| 315 | * it created. Now we need to fsync those files, because a checkpoint |
| 316 | * won't do it for us while we're in recovery. We do this in a |
| 317 | * separate pass to allow the kernel to perform all the flushes |
| 318 | * (especially the metadata ones) at once. |
| 319 | */ |
| 320 | dbspace_dir = AllocateDir(dbspacedirname); |
| 321 | while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL) |
| 322 | { |
| 323 | ForkNumber forkNum; |
| 324 | int oidchars; |
| 325 | char oidbuf[OIDCHARS + 1]; |
| 326 | char mainpath[MAXPGPATH]; |
| 327 | |
| 328 | /* Skip anything that doesn't look like a relation data file. */ |
| 329 | if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars, |
| 330 | &forkNum)) |
| 331 | continue; |
| 332 | |
| 333 | /* Also skip it unless this is the init fork. */ |
| 334 | if (forkNum != INIT_FORKNUM) |
| 335 | continue; |
| 336 | |
| 337 | /* Construct main fork pathname. */ |
| 338 | memcpy(oidbuf, de->d_name, oidchars); |
| 339 | oidbuf[oidchars] = '\0'; |
| 340 | snprintf(mainpath, sizeof(mainpath), "%s/%s%s" , |
| 341 | dbspacedirname, oidbuf, de->d_name + oidchars + 1 + |
| 342 | strlen(forkNames[INIT_FORKNUM])); |
| 343 | |
| 344 | fsync_fname(mainpath, false); |
| 345 | } |
| 346 | |
| 347 | FreeDir(dbspace_dir); |
| 348 | |
| 349 | /* |
| 350 | * Lastly, fsync the database directory itself, ensuring the |
| 351 | * filesystem remembers the file creations and deletions we've done. |
| 352 | * We don't bother with this during a call that does only |
| 353 | * UNLOGGED_RELATION_CLEANUP, because if recovery crashes before we |
| 354 | * get to doing UNLOGGED_RELATION_INIT, we'll redo the cleanup step |
| 355 | * too at the next startup attempt. |
| 356 | */ |
| 357 | fsync_fname(dbspacedirname, true); |
| 358 | } |
| 359 | } |
| 360 | |
| 361 | /* |
| 362 | * Basic parsing of putative relation filenames. |
| 363 | * |
| 364 | * This function returns true if the file appears to be in the correct format |
| 365 | * for a non-temporary relation and false otherwise. |
| 366 | * |
| 367 | * NB: If this function returns true, the caller is entitled to assume that |
| 368 | * *oidchars has been set to the a value no more than OIDCHARS, and thus |
| 369 | * that a buffer of OIDCHARS+1 characters is sufficient to hold the OID |
| 370 | * portion of the filename. This is critical to protect against a possible |
| 371 | * buffer overrun. |
| 372 | */ |
| 373 | bool |
| 374 | parse_filename_for_nontemp_relation(const char *name, int *oidchars, |
| 375 | ForkNumber *fork) |
| 376 | { |
| 377 | int pos; |
| 378 | |
| 379 | /* Look for a non-empty string of digits (that isn't too long). */ |
| 380 | for (pos = 0; isdigit((unsigned char) name[pos]); ++pos) |
| 381 | ; |
| 382 | if (pos == 0 || pos > OIDCHARS) |
| 383 | return false; |
| 384 | *oidchars = pos; |
| 385 | |
| 386 | /* Check for a fork name. */ |
| 387 | if (name[pos] != '_') |
| 388 | *fork = MAIN_FORKNUM; |
| 389 | else |
| 390 | { |
| 391 | int forkchar; |
| 392 | |
| 393 | forkchar = forkname_chars(&name[pos + 1], fork); |
| 394 | if (forkchar <= 0) |
| 395 | return false; |
| 396 | pos += forkchar + 1; |
| 397 | } |
| 398 | |
| 399 | /* Check for a segment number. */ |
| 400 | if (name[pos] == '.') |
| 401 | { |
| 402 | int segchar; |
| 403 | |
| 404 | for (segchar = 1; isdigit((unsigned char) name[pos + segchar]); ++segchar) |
| 405 | ; |
| 406 | if (segchar <= 1) |
| 407 | return false; |
| 408 | pos += segchar; |
| 409 | } |
| 410 | |
| 411 | /* Now we should be at the end. */ |
| 412 | if (name[pos] != '\0') |
| 413 | return false; |
| 414 | return true; |
| 415 | } |
| 416 | |