1/*-------------------------------------------------------------------------
2 *
3 * reinit.c
4 * Reinitialization of unlogged relations
5 *
6 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 * IDENTIFICATION
10 * src/backend/storage/file/reinit.c
11 *
12 *-------------------------------------------------------------------------
13 */
14
15#include "postgres.h"
16
17#include <unistd.h>
18
19#include "common/relpath.h"
20#include "storage/copydir.h"
21#include "storage/fd.h"
22#include "storage/reinit.h"
23#include "utils/hsearch.h"
24#include "utils/memutils.h"
25
/*
 * Forward declarations of the file-local workers defined further down.
 * ResetUnloggedRelations() calls the tablespace-level routine once per
 * tablespace directory, which in turn calls the dbspace-level routine once
 * per numerically-named subdirectory.  Both receive the same "op" bitmask.
 */
static void ResetUnloggedRelationsInTablespaceDir(const char *tsdirname,
                                                   int op);
static void ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname,
                                                int op);
30
/*
 * Hash table entry used while cleaning up one dbspace directory.
 *
 * The entire struct serves as the hash key (the table is created with
 * keysize == entrysize == sizeof(unlogged_relation_entry)).  It holds the
 * OID portion of a relation file name as text; users zero-fill the buffer
 * before copying the digits in, so the string is always NUL-terminated.
 */
typedef struct
{
    char        oid[OIDCHARS + 1];  /* leading digits of the file name */
} unlogged_relation_entry;
35
36/*
37 * Reset unlogged relations from before the last restart.
38 *
39 * If op includes UNLOGGED_RELATION_CLEANUP, we remove all forks of any
40 * relation with an "init" fork, except for the "init" fork itself.
41 *
42 * If op includes UNLOGGED_RELATION_INIT, we copy the "init" fork to the main
43 * fork.
44 */
45void
46ResetUnloggedRelations(int op)
47{
48 char temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY)];
49 DIR *spc_dir;
50 struct dirent *spc_de;
51 MemoryContext tmpctx,
52 oldctx;
53
54 /* Log it. */
55 elog(DEBUG1, "resetting unlogged relations: cleanup %d init %d",
56 (op & UNLOGGED_RELATION_CLEANUP) != 0,
57 (op & UNLOGGED_RELATION_INIT) != 0);
58
59 /*
60 * Just to be sure we don't leak any memory, let's create a temporary
61 * memory context for this operation.
62 */
63 tmpctx = AllocSetContextCreate(CurrentMemoryContext,
64 "ResetUnloggedRelations",
65 ALLOCSET_DEFAULT_SIZES);
66 oldctx = MemoryContextSwitchTo(tmpctx);
67
68 /*
69 * First process unlogged files in pg_default ($PGDATA/base)
70 */
71 ResetUnloggedRelationsInTablespaceDir("base", op);
72
73 /*
74 * Cycle through directories for all non-default tablespaces.
75 */
76 spc_dir = AllocateDir("pg_tblspc");
77
78 while ((spc_de = ReadDir(spc_dir, "pg_tblspc")) != NULL)
79 {
80 if (strcmp(spc_de->d_name, ".") == 0 ||
81 strcmp(spc_de->d_name, "..") == 0)
82 continue;
83
84 snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
85 spc_de->d_name, TABLESPACE_VERSION_DIRECTORY);
86 ResetUnloggedRelationsInTablespaceDir(temp_path, op);
87 }
88
89 FreeDir(spc_dir);
90
91 /*
92 * Restore memory context.
93 */
94 MemoryContextSwitchTo(oldctx);
95 MemoryContextDelete(tmpctx);
96}
97
98/*
99 * Process one tablespace directory for ResetUnloggedRelations
100 */
101static void
102ResetUnloggedRelationsInTablespaceDir(const char *tsdirname, int op)
103{
104 DIR *ts_dir;
105 struct dirent *de;
106 char dbspace_path[MAXPGPATH * 2];
107
108 ts_dir = AllocateDir(tsdirname);
109
110 /*
111 * If we get ENOENT on a tablespace directory, log it and return. This
112 * can happen if a previous DROP TABLESPACE crashed between removing the
113 * tablespace directory and removing the symlink in pg_tblspc. We don't
114 * really want to prevent database startup in that scenario, so let it
115 * pass instead. Any other type of error will be reported by ReadDir
116 * (causing a startup failure).
117 */
118 if (ts_dir == NULL && errno == ENOENT)
119 {
120 ereport(LOG,
121 (errcode_for_file_access(),
122 errmsg("could not open directory \"%s\": %m",
123 tsdirname)));
124 return;
125 }
126
127 while ((de = ReadDir(ts_dir, tsdirname)) != NULL)
128 {
129 /*
130 * We're only interested in the per-database directories, which have
131 * numeric names. Note that this code will also (properly) ignore "."
132 * and "..".
133 */
134 if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
135 continue;
136
137 snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
138 tsdirname, de->d_name);
139 ResetUnloggedRelationsInDbspaceDir(dbspace_path, op);
140 }
141
142 FreeDir(ts_dir);
143}
144
145/*
146 * Process one per-dbspace directory for ResetUnloggedRelations
147 */
148static void
149ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op)
150{
151 DIR *dbspace_dir;
152 struct dirent *de;
153 char rm_path[MAXPGPATH * 2];
154
155 /* Caller must specify at least one operation. */
156 Assert((op & (UNLOGGED_RELATION_CLEANUP | UNLOGGED_RELATION_INIT)) != 0);
157
158 /*
159 * Cleanup is a two-pass operation. First, we go through and identify all
160 * the files with init forks. Then, we go through again and nuke
161 * everything with the same OID except the init fork.
162 */
163 if ((op & UNLOGGED_RELATION_CLEANUP) != 0)
164 {
165 HTAB *hash;
166 HASHCTL ctl;
167
168 /*
169 * It's possible that someone could create a ton of unlogged relations
170 * in the same database & tablespace, so we'd better use a hash table
171 * rather than an array or linked list to keep track of which files
172 * need to be reset. Otherwise, this cleanup operation would be
173 * O(n^2).
174 */
175 memset(&ctl, 0, sizeof(ctl));
176 ctl.keysize = sizeof(unlogged_relation_entry);
177 ctl.entrysize = sizeof(unlogged_relation_entry);
178 hash = hash_create("unlogged hash", 32, &ctl, HASH_ELEM);
179
180 /* Scan the directory. */
181 dbspace_dir = AllocateDir(dbspacedirname);
182 while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
183 {
184 ForkNumber forkNum;
185 int oidchars;
186 unlogged_relation_entry ent;
187
188 /* Skip anything that doesn't look like a relation data file. */
189 if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars,
190 &forkNum))
191 continue;
192
193 /* Also skip it unless this is the init fork. */
194 if (forkNum != INIT_FORKNUM)
195 continue;
196
197 /*
198 * Put the OID portion of the name into the hash table, if it
199 * isn't already.
200 */
201 memset(ent.oid, 0, sizeof(ent.oid));
202 memcpy(ent.oid, de->d_name, oidchars);
203 hash_search(hash, &ent, HASH_ENTER, NULL);
204 }
205
206 /* Done with the first pass. */
207 FreeDir(dbspace_dir);
208
209 /*
210 * If we didn't find any init forks, there's no point in continuing;
211 * we can bail out now.
212 */
213 if (hash_get_num_entries(hash) == 0)
214 {
215 hash_destroy(hash);
216 return;
217 }
218
219 /*
220 * Now, make a second pass and remove anything that matches.
221 */
222 dbspace_dir = AllocateDir(dbspacedirname);
223 while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
224 {
225 ForkNumber forkNum;
226 int oidchars;
227 bool found;
228 unlogged_relation_entry ent;
229
230 /* Skip anything that doesn't look like a relation data file. */
231 if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars,
232 &forkNum))
233 continue;
234
235 /* We never remove the init fork. */
236 if (forkNum == INIT_FORKNUM)
237 continue;
238
239 /*
240 * See whether the OID portion of the name shows up in the hash
241 * table.
242 */
243 memset(ent.oid, 0, sizeof(ent.oid));
244 memcpy(ent.oid, de->d_name, oidchars);
245 hash_search(hash, &ent, HASH_FIND, &found);
246
247 /* If so, nuke it! */
248 if (found)
249 {
250 snprintf(rm_path, sizeof(rm_path), "%s/%s",
251 dbspacedirname, de->d_name);
252 if (unlink(rm_path) < 0)
253 ereport(ERROR,
254 (errcode_for_file_access(),
255 errmsg("could not remove file \"%s\": %m",
256 rm_path)));
257 else
258 elog(DEBUG2, "unlinked file \"%s\"", rm_path);
259 }
260 }
261
262 /* Cleanup is complete. */
263 FreeDir(dbspace_dir);
264 hash_destroy(hash);
265 }
266
267 /*
268 * Initialization happens after cleanup is complete: we copy each init
269 * fork file to the corresponding main fork file. Note that if we are
270 * asked to do both cleanup and init, we may never get here: if the
271 * cleanup code determines that there are no init forks in this dbspace,
272 * it will return before we get to this point.
273 */
274 if ((op & UNLOGGED_RELATION_INIT) != 0)
275 {
276 /* Scan the directory. */
277 dbspace_dir = AllocateDir(dbspacedirname);
278 while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
279 {
280 ForkNumber forkNum;
281 int oidchars;
282 char oidbuf[OIDCHARS + 1];
283 char srcpath[MAXPGPATH * 2];
284 char dstpath[MAXPGPATH];
285
286 /* Skip anything that doesn't look like a relation data file. */
287 if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars,
288 &forkNum))
289 continue;
290
291 /* Also skip it unless this is the init fork. */
292 if (forkNum != INIT_FORKNUM)
293 continue;
294
295 /* Construct source pathname. */
296 snprintf(srcpath, sizeof(srcpath), "%s/%s",
297 dbspacedirname, de->d_name);
298
299 /* Construct destination pathname. */
300 memcpy(oidbuf, de->d_name, oidchars);
301 oidbuf[oidchars] = '\0';
302 snprintf(dstpath, sizeof(dstpath), "%s/%s%s",
303 dbspacedirname, oidbuf, de->d_name + oidchars + 1 +
304 strlen(forkNames[INIT_FORKNUM]));
305
306 /* OK, we're ready to perform the actual copy. */
307 elog(DEBUG2, "copying %s to %s", srcpath, dstpath);
308 copy_file(srcpath, dstpath);
309 }
310
311 FreeDir(dbspace_dir);
312
313 /*
314 * copy_file() above has already called pg_flush_data() on the files
315 * it created. Now we need to fsync those files, because a checkpoint
316 * won't do it for us while we're in recovery. We do this in a
317 * separate pass to allow the kernel to perform all the flushes
318 * (especially the metadata ones) at once.
319 */
320 dbspace_dir = AllocateDir(dbspacedirname);
321 while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
322 {
323 ForkNumber forkNum;
324 int oidchars;
325 char oidbuf[OIDCHARS + 1];
326 char mainpath[MAXPGPATH];
327
328 /* Skip anything that doesn't look like a relation data file. */
329 if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars,
330 &forkNum))
331 continue;
332
333 /* Also skip it unless this is the init fork. */
334 if (forkNum != INIT_FORKNUM)
335 continue;
336
337 /* Construct main fork pathname. */
338 memcpy(oidbuf, de->d_name, oidchars);
339 oidbuf[oidchars] = '\0';
340 snprintf(mainpath, sizeof(mainpath), "%s/%s%s",
341 dbspacedirname, oidbuf, de->d_name + oidchars + 1 +
342 strlen(forkNames[INIT_FORKNUM]));
343
344 fsync_fname(mainpath, false);
345 }
346
347 FreeDir(dbspace_dir);
348
349 /*
350 * Lastly, fsync the database directory itself, ensuring the
351 * filesystem remembers the file creations and deletions we've done.
352 * We don't bother with this during a call that does only
353 * UNLOGGED_RELATION_CLEANUP, because if recovery crashes before we
354 * get to doing UNLOGGED_RELATION_INIT, we'll redo the cleanup step
355 * too at the next startup attempt.
356 */
357 fsync_fname(dbspacedirname, true);
358 }
359}
360
361/*
362 * Basic parsing of putative relation filenames.
363 *
364 * This function returns true if the file appears to be in the correct format
365 * for a non-temporary relation and false otherwise.
366 *
367 * NB: If this function returns true, the caller is entitled to assume that
368 * *oidchars has been set to the a value no more than OIDCHARS, and thus
369 * that a buffer of OIDCHARS+1 characters is sufficient to hold the OID
370 * portion of the filename. This is critical to protect against a possible
371 * buffer overrun.
372 */
373bool
374parse_filename_for_nontemp_relation(const char *name, int *oidchars,
375 ForkNumber *fork)
376{
377 int pos;
378
379 /* Look for a non-empty string of digits (that isn't too long). */
380 for (pos = 0; isdigit((unsigned char) name[pos]); ++pos)
381 ;
382 if (pos == 0 || pos > OIDCHARS)
383 return false;
384 *oidchars = pos;
385
386 /* Check for a fork name. */
387 if (name[pos] != '_')
388 *fork = MAIN_FORKNUM;
389 else
390 {
391 int forkchar;
392
393 forkchar = forkname_chars(&name[pos + 1], fork);
394 if (forkchar <= 0)
395 return false;
396 pos += forkchar + 1;
397 }
398
399 /* Check for a segment number. */
400 if (name[pos] == '.')
401 {
402 int segchar;
403
404 for (segchar = 1; isdigit((unsigned char) name[pos + segchar]); ++segchar)
405 ;
406 if (segchar <= 1)
407 return false;
408 pos += segchar;
409 }
410
411 /* Now we should be at the end. */
412 if (name[pos] != '\0')
413 return false;
414 return true;
415}
416