1/*-------------------------------------------------------------------------
2 *
3 * filemap.c
4 * A data structure for keeping track of files that have changed.
5 *
6 * Copyright (c) 2013-2019, PostgreSQL Global Development Group
7 *
8 *-------------------------------------------------------------------------
9 */
10
11#include "postgres_fe.h"
12
13#include <sys/stat.h>
14#include <unistd.h>
15
16#include "datapagemap.h"
17#include "filemap.h"
18#include "pg_rewind.h"
19
20#include "common/string.h"
21#include "catalog/pg_tablespace_d.h"
22#include "storage/fd.h"
23
/* The file map currently being built; set up by filemap_create(). */
filemap_t  *filemap = NULL;

/* Prototypes for the file-local helpers defined further down in this file. */
static bool isRelDataFile(const char *path);
static char *datasegpath(RelFileNode rnode, ForkNumber forknum,
						 BlockNumber segno);
static int	path_cmp(const void *a, const void *b);
static int	final_filemap_cmp(const void *a, const void *b);
static void filemap_list_to_array(filemap_t *map);
static bool check_file_excluded(const char *path, bool is_source);
33
/*
 * The contents of these directories are removed or recreated during server
 * start, so they are not included in the data processed by pg_rewind.  Only
 * paths located inside one of these directories are filtered out, see
 * check_file_excluded().
 *
 * Note: these lists should be kept in sync with what basebackup.c provides.
 * Some of the values, contrary to what basebackup.c uses, are hardcoded as
 * they are defined in backend-only headers.  So this list is maintained
 * with a best effort in mind.
 */
static const char *excludeDirContents[] =
{
	/*
	 * Skip temporary statistics files. PG_STAT_TMP_DIR must be skipped even
	 * when stats_temp_directory is set because PGSS_TEXT_FILE is always
	 * created there.
	 */
	"pg_stat_tmp",				/* defined as PG_STAT_TMP_DIR */

	/*
	 * It is generally not useful to back up the contents of this directory
	 * even if the intention is to restore to another master. See backup.sgml
	 * for a more detailed description.
	 */
	"pg_replslot",

	/* Contents removed on startup, see dsm_cleanup_for_mmap(). */
	"pg_dynshmem",				/* defined as PG_DYNSHMEM_DIR */

	/* Contents removed on startup, see AsyncShmemInit(). */
	"pg_notify",

	/*
	 * Old contents are loaded for possible debugging but are not required for
	 * normal operation, see OldSerXidInit().
	 */
	"pg_serial",

	/* Contents removed on startup, see DeleteAllExportedSnapshotFiles(). */
	"pg_snapshots",

	/* Contents zeroed on startup, see StartupSUBTRANS(). */
	"pg_subtrans",

	/* end of list */
	NULL
};
80
/*
 * List of files excluded from filemap processing.
 *
 * Entries are compared against the last component of a path, so a file with
 * one of these names is skipped in whatever directory it appears; see
 * check_file_excluded().
 */
static const char *excludeFiles[] =
{
	/* Skip auto conf temporary file. */
	"postgresql.auto.conf.tmp", /* defined as PG_AUTOCONF_FILENAME */

	/* Skip current log file temporary file */
	"current_logfiles.tmp",		/* defined as LOG_METAINFO_DATAFILE_TMP */

	/* Skip relation cache because it is rebuilt on startup */
	"pg_internal.init",			/* defined as RELCACHE_INIT_FILENAME */

	/*
	 * If there's a backup_label or tablespace_map file, it belongs to a
	 * backup started by the user with pg_start_backup().  It is *not* correct
	 * for this backup.  Our backup_label is written later on separately.
	 */
	"backup_label",				/* defined as BACKUP_LABEL_FILE */
	"tablespace_map",			/* defined as TABLESPACE_MAP */

	/*
	 * NOTE(review): presumably these describe the running server instance
	 * rather than the cluster's data -- confirm against basebackup.c.
	 */
	"postmaster.pid",
	"postmaster.opts",

	/* end of list */
	NULL
};
109
110/*
111 * Create a new file map (stored in the global pointer "filemap").
112 */
113void
114filemap_create(void)
115{
116 filemap_t *map;
117
118 map = pg_malloc(sizeof(filemap_t));
119 map->first = map->last = NULL;
120 map->nlist = 0;
121 map->array = NULL;
122 map->narray = 0;
123
124 Assert(filemap == NULL);
125 filemap = map;
126}
127
128/*
129 * Callback for processing source file list.
130 *
131 * This is called once for every file in the source server. We decide what
132 * action needs to be taken for the file, depending on whether the file
133 * exists in the target and whether the size matches.
134 */
135void
136process_source_file(const char *path, file_type_t type, size_t newsize,
137 const char *link_target)
138{
139 bool exists;
140 char localpath[MAXPGPATH];
141 struct stat statbuf;
142 filemap_t *map = filemap;
143 file_action_t action = FILE_ACTION_NONE;
144 size_t oldsize = 0;
145 file_entry_t *entry;
146
147 Assert(map->array == NULL);
148
149 /*
150 * Skip any files matching the exclusion filters. This has the effect to
151 * remove all those files on the target.
152 */
153 if (check_file_excluded(path, true))
154 return;
155
156 /*
157 * Pretend that pg_wal is a directory, even if it's really a symlink. We
158 * don't want to mess with the symlink itself, nor complain if it's a
159 * symlink in source but not in target or vice versa.
160 */
161 if (strcmp(path, "pg_wal") == 0 && type == FILE_TYPE_SYMLINK)
162 type = FILE_TYPE_DIRECTORY;
163
164 /*
165 * Skip temporary files, .../pgsql_tmp/... and .../pgsql_tmp.* in source.
166 * This has the effect that all temporary files in the destination will be
167 * removed.
168 */
169 if (strstr(path, "/" PG_TEMP_FILE_PREFIX) != NULL)
170 return;
171 if (strstr(path, "/" PG_TEMP_FILES_DIR "/") != NULL)
172 return;
173
174 /*
175 * sanity check: a filename that looks like a data file better be a
176 * regular file
177 */
178 if (type != FILE_TYPE_REGULAR && isRelDataFile(path))
179 pg_fatal("data file \"%s\" in source is not a regular file", path);
180
181 snprintf(localpath, sizeof(localpath), "%s/%s", datadir_target, path);
182
183 /* Does the corresponding file exist in the target data dir? */
184 if (lstat(localpath, &statbuf) < 0)
185 {
186 if (errno != ENOENT)
187 pg_fatal("could not stat file \"%s\": %m",
188 localpath);
189
190 exists = false;
191 }
192 else
193 exists = true;
194
195 switch (type)
196 {
197 case FILE_TYPE_DIRECTORY:
198 if (exists && !S_ISDIR(statbuf.st_mode) && strcmp(path, "pg_wal") != 0)
199 {
200 /* it's a directory in source, but not in target. Strange.. */
201 pg_fatal("\"%s\" is not a directory", localpath);
202 }
203
204 if (!exists)
205 action = FILE_ACTION_CREATE;
206 else
207 action = FILE_ACTION_NONE;
208 oldsize = 0;
209 break;
210
211 case FILE_TYPE_SYMLINK:
212 if (exists &&
213#ifndef WIN32
214 !S_ISLNK(statbuf.st_mode)
215#else
216 !pgwin32_is_junction(localpath)
217#endif
218 )
219 {
220 /*
221 * It's a symbolic link in source, but not in target.
222 * Strange..
223 */
224 pg_fatal("\"%s\" is not a symbolic link", localpath);
225 }
226
227 if (!exists)
228 action = FILE_ACTION_CREATE;
229 else
230 action = FILE_ACTION_NONE;
231 oldsize = 0;
232 break;
233
234 case FILE_TYPE_REGULAR:
235 if (exists && !S_ISREG(statbuf.st_mode))
236 pg_fatal("\"%s\" is not a regular file", localpath);
237
238 if (!exists || !isRelDataFile(path))
239 {
240 /*
241 * File exists in source, but not in target. Or it's a
242 * non-data file that we have no special processing for. Copy
243 * it in toto.
244 *
245 * An exception: PG_VERSIONs should be identical, but avoid
246 * overwriting it for paranoia.
247 */
248 if (pg_str_endswith(path, "PG_VERSION"))
249 {
250 action = FILE_ACTION_NONE;
251 oldsize = statbuf.st_size;
252 }
253 else
254 {
255 action = FILE_ACTION_COPY;
256 oldsize = 0;
257 }
258 }
259 else
260 {
261 /*
262 * It's a data file that exists in both.
263 *
264 * If it's larger in target, we can truncate it. There will
265 * also be a WAL record of the truncation in the source
266 * system, so WAL replay would eventually truncate the target
267 * too, but we might as well do it now.
268 *
269 * If it's smaller in the target, it means that it has been
270 * truncated in the target, or enlarged in the source, or
271 * both. If it was truncated in the target, we need to copy
272 * the missing tail from the source system. If it was enlarged
273 * in the source system, there will be WAL records in the
274 * source system for the new blocks, so we wouldn't need to
275 * copy them here. But we don't know which scenario we're
276 * dealing with, and there's no harm in copying the missing
277 * blocks now, so do it now.
278 *
279 * If it's the same size, do nothing here. Any blocks modified
280 * in the target will be copied based on parsing the target
281 * system's WAL, and any blocks modified in the source will be
282 * updated after rewinding, when the source system's WAL is
283 * replayed.
284 */
285 oldsize = statbuf.st_size;
286 if (oldsize < newsize)
287 action = FILE_ACTION_COPY_TAIL;
288 else if (oldsize > newsize)
289 action = FILE_ACTION_TRUNCATE;
290 else
291 action = FILE_ACTION_NONE;
292 }
293 break;
294 }
295
296 /* Create a new entry for this file */
297 entry = pg_malloc(sizeof(file_entry_t));
298 entry->path = pg_strdup(path);
299 entry->type = type;
300 entry->action = action;
301 entry->oldsize = oldsize;
302 entry->newsize = newsize;
303 entry->link_target = link_target ? pg_strdup(link_target) : NULL;
304 entry->next = NULL;
305 entry->pagemap.bitmap = NULL;
306 entry->pagemap.bitmapsize = 0;
307 entry->isrelfile = isRelDataFile(path);
308
309 if (map->last)
310 {
311 map->last->next = entry;
312 map->last = entry;
313 }
314 else
315 map->first = map->last = entry;
316 map->nlist++;
317}
318
319/*
320 * Callback for processing target file list.
321 *
322 * All source files must be already processed before calling this. This only
323 * marks target data directory's files that didn't exist in the source for
324 * deletion.
325 */
326void
327process_target_file(const char *path, file_type_t type, size_t oldsize,
328 const char *link_target)
329{
330 bool exists;
331 char localpath[MAXPGPATH];
332 struct stat statbuf;
333 file_entry_t key;
334 file_entry_t *key_ptr;
335 filemap_t *map = filemap;
336 file_entry_t *entry;
337
338 /*
339 * Do not apply any exclusion filters here. This has advantage to remove
340 * from the target data folder all paths which have been filtered out from
341 * the source data folder when processing the source files.
342 */
343
344 snprintf(localpath, sizeof(localpath), "%s/%s", datadir_target, path);
345 if (lstat(localpath, &statbuf) < 0)
346 {
347 if (errno != ENOENT)
348 pg_fatal("could not stat file \"%s\": %m",
349 localpath);
350
351 exists = false;
352 }
353
354 if (map->array == NULL)
355 {
356 /* on first call, initialize lookup array */
357 if (map->nlist == 0)
358 {
359 /* should not happen */
360 pg_fatal("source file list is empty");
361 }
362
363 filemap_list_to_array(map);
364
365 Assert(map->array != NULL);
366
367 qsort(map->array, map->narray, sizeof(file_entry_t *), path_cmp);
368 }
369
370 /*
371 * Like in process_source_file, pretend that xlog is always a directory.
372 */
373 if (strcmp(path, "pg_wal") == 0 && type == FILE_TYPE_SYMLINK)
374 type = FILE_TYPE_DIRECTORY;
375
376 key.path = (char *) path;
377 key_ptr = &key;
378 exists = (bsearch(&key_ptr, map->array, map->narray, sizeof(file_entry_t *),
379 path_cmp) != NULL);
380
381 /* Remove any file or folder that doesn't exist in the source system. */
382 if (!exists)
383 {
384 entry = pg_malloc(sizeof(file_entry_t));
385 entry->path = pg_strdup(path);
386 entry->type = type;
387 entry->action = FILE_ACTION_REMOVE;
388 entry->oldsize = oldsize;
389 entry->newsize = 0;
390 entry->link_target = link_target ? pg_strdup(link_target) : NULL;
391 entry->next = NULL;
392 entry->pagemap.bitmap = NULL;
393 entry->pagemap.bitmapsize = 0;
394 entry->isrelfile = isRelDataFile(path);
395
396 if (map->last == NULL)
397 map->first = entry;
398 else
399 map->last->next = entry;
400 map->last = entry;
401 map->nlist++;
402 }
403 else
404 {
405 /*
406 * We already handled all files that exist in the source system in
407 * process_source_file().
408 */
409 }
410}
411
412/*
413 * This callback gets called while we read the WAL in the target, for every
414 * block that have changed in the target system. It makes note of all the
415 * changed blocks in the pagemap of the file.
416 */
417void
418process_block_change(ForkNumber forknum, RelFileNode rnode, BlockNumber blkno)
419{
420 char *path;
421 file_entry_t key;
422 file_entry_t *key_ptr;
423 file_entry_t *entry;
424 BlockNumber blkno_inseg;
425 int segno;
426 filemap_t *map = filemap;
427 file_entry_t **e;
428
429 Assert(map->array);
430
431 segno = blkno / RELSEG_SIZE;
432 blkno_inseg = blkno % RELSEG_SIZE;
433
434 path = datasegpath(rnode, forknum, segno);
435
436 key.path = (char *) path;
437 key_ptr = &key;
438
439 e = bsearch(&key_ptr, map->array, map->narray, sizeof(file_entry_t *),
440 path_cmp);
441 if (e)
442 entry = *e;
443 else
444 entry = NULL;
445 pfree(path);
446
447 if (entry)
448 {
449 Assert(entry->isrelfile);
450
451 switch (entry->action)
452 {
453 case FILE_ACTION_NONE:
454 case FILE_ACTION_TRUNCATE:
455 /* skip if we're truncating away the modified block anyway */
456 if ((blkno_inseg + 1) * BLCKSZ <= entry->newsize)
457 datapagemap_add(&entry->pagemap, blkno_inseg);
458 break;
459
460 case FILE_ACTION_COPY_TAIL:
461
462 /*
463 * skip the modified block if it is part of the "tail" that
464 * we're copying anyway.
465 */
466 if ((blkno_inseg + 1) * BLCKSZ <= entry->oldsize)
467 datapagemap_add(&entry->pagemap, blkno_inseg);
468 break;
469
470 case FILE_ACTION_COPY:
471 case FILE_ACTION_REMOVE:
472 break;
473
474 case FILE_ACTION_CREATE:
475 pg_fatal("unexpected page modification for directory or symbolic link \"%s\"", entry->path);
476 }
477 }
478 else
479 {
480 /*
481 * If we don't have any record of this file in the file map, it means
482 * that it's a relation that doesn't exist in the source system, and
483 * it was subsequently removed in the target system, too. We can
484 * safely ignore it.
485 */
486 }
487}
488
489/*
490 * Is this the path of file that pg_rewind can skip copying?
491 */
492static bool
493check_file_excluded(const char *path, bool is_source)
494{
495 char localpath[MAXPGPATH];
496 int excludeIdx;
497 const char *filename;
498
499 /* check individual files... */
500 for (excludeIdx = 0; excludeFiles[excludeIdx] != NULL; excludeIdx++)
501 {
502 filename = last_dir_separator(path);
503 if (filename == NULL)
504 filename = path;
505 else
506 filename++;
507 if (strcmp(filename, excludeFiles[excludeIdx]) == 0)
508 {
509 if (is_source)
510 pg_log_debug("entry \"%s\" excluded from source file list",
511 path);
512 else
513 pg_log_debug("entry \"%s\" excluded from target file list",
514 path);
515 return true;
516 }
517 }
518
519 /*
520 * ... And check some directories. Note that this includes any contents
521 * within the directories themselves.
522 */
523 for (excludeIdx = 0; excludeDirContents[excludeIdx] != NULL; excludeIdx++)
524 {
525 snprintf(localpath, sizeof(localpath), "%s/",
526 excludeDirContents[excludeIdx]);
527 if (strstr(path, localpath) == path)
528 {
529 if (is_source)
530 pg_log_debug("entry \"%s\" excluded from source file list",
531 path);
532 else
533 pg_log_debug("entry \"%s\" excluded from target file list",
534 path);
535 return true;
536 }
537 }
538
539 return false;
540}
541
542/*
543 * Convert the linked list of entries in map->first/last to the array,
544 * map->array.
545 */
546static void
547filemap_list_to_array(filemap_t *map)
548{
549 int narray;
550 file_entry_t *entry,
551 *next;
552
553 map->array = (file_entry_t **)
554 pg_realloc(map->array,
555 (map->nlist + map->narray) * sizeof(file_entry_t *));
556
557 narray = map->narray;
558 for (entry = map->first; entry != NULL; entry = next)
559 {
560 map->array[narray++] = entry;
561 next = entry->next;
562 entry->next = NULL;
563 }
564 Assert(narray == map->nlist + map->narray);
565 map->narray = narray;
566 map->nlist = 0;
567 map->first = map->last = NULL;
568}
569
570void
571filemap_finalize(void)
572{
573 filemap_t *map = filemap;
574
575 filemap_list_to_array(map);
576 qsort(map->array, map->narray, sizeof(file_entry_t *),
577 final_filemap_cmp);
578}
579
580static const char *
581action_to_str(file_action_t action)
582{
583 switch (action)
584 {
585 case FILE_ACTION_NONE:
586 return "NONE";
587 case FILE_ACTION_COPY:
588 return "COPY";
589 case FILE_ACTION_TRUNCATE:
590 return "TRUNCATE";
591 case FILE_ACTION_COPY_TAIL:
592 return "COPY_TAIL";
593 case FILE_ACTION_CREATE:
594 return "CREATE";
595 case FILE_ACTION_REMOVE:
596 return "REMOVE";
597
598 default:
599 return "unknown";
600 }
601}
602
603/*
604 * Calculate the totals needed for progress reports.
605 */
606void
607calculate_totals(void)
608{
609 file_entry_t *entry;
610 int i;
611 filemap_t *map = filemap;
612
613 map->total_size = 0;
614 map->fetch_size = 0;
615
616 for (i = 0; i < map->narray; i++)
617 {
618 entry = map->array[i];
619
620 if (entry->type != FILE_TYPE_REGULAR)
621 continue;
622
623 map->total_size += entry->newsize;
624
625 if (entry->action == FILE_ACTION_COPY)
626 {
627 map->fetch_size += entry->newsize;
628 continue;
629 }
630
631 if (entry->action == FILE_ACTION_COPY_TAIL)
632 map->fetch_size += (entry->newsize - entry->oldsize);
633
634 if (entry->pagemap.bitmapsize > 0)
635 {
636 datapagemap_iterator_t *iter;
637 BlockNumber blk;
638
639 iter = datapagemap_iterate(&entry->pagemap);
640 while (datapagemap_next(iter, &blk))
641 map->fetch_size += BLCKSZ;
642
643 pg_free(iter);
644 }
645 }
646}
647
648void
649print_filemap(void)
650{
651 filemap_t *map = filemap;
652 file_entry_t *entry;
653 int i;
654
655 for (i = 0; i < map->narray; i++)
656 {
657 entry = map->array[i];
658 if (entry->action != FILE_ACTION_NONE ||
659 entry->pagemap.bitmapsize > 0)
660 {
661 pg_log_debug("%s (%s)", entry->path,
662 action_to_str(entry->action));
663
664 if (entry->pagemap.bitmapsize > 0)
665 datapagemap_print(&entry->pagemap);
666 }
667 }
668 fflush(stdout);
669}
670
671/*
672 * Does it look like a relation data file?
673 *
674 * For our purposes, only files belonging to the main fork are considered
675 * relation files. Other forks are always copied in toto, because we cannot
676 * reliably track changes to them, because WAL only contains block references
677 * for the main fork.
678 */
679static bool
680isRelDataFile(const char *path)
681{
682 RelFileNode rnode;
683 unsigned int segNo;
684 int nmatch;
685 bool matched;
686
687 /*----
688 * Relation data files can be in one of the following directories:
689 *
690 * global/
691 * shared relations
692 *
693 * base/<db oid>/
694 * regular relations, default tablespace
695 *
696 * pg_tblspc/<tblspc oid>/<tblspc version>/
697 * within a non-default tablespace (the name of the directory
698 * depends on version)
699 *
700 * And the relation data files themselves have a filename like:
701 *
702 * <oid>.<segment number>
703 *
704 *----
705 */
706 rnode.spcNode = InvalidOid;
707 rnode.dbNode = InvalidOid;
708 rnode.relNode = InvalidOid;
709 segNo = 0;
710 matched = false;
711
712 nmatch = sscanf(path, "global/%u.%u", &rnode.relNode, &segNo);
713 if (nmatch == 1 || nmatch == 2)
714 {
715 rnode.spcNode = GLOBALTABLESPACE_OID;
716 rnode.dbNode = 0;
717 matched = true;
718 }
719 else
720 {
721 nmatch = sscanf(path, "base/%u/%u.%u",
722 &rnode.dbNode, &rnode.relNode, &segNo);
723 if (nmatch == 2 || nmatch == 3)
724 {
725 rnode.spcNode = DEFAULTTABLESPACE_OID;
726 matched = true;
727 }
728 else
729 {
730 nmatch = sscanf(path, "pg_tblspc/%u/" TABLESPACE_VERSION_DIRECTORY "/%u/%u.%u",
731 &rnode.spcNode, &rnode.dbNode, &rnode.relNode,
732 &segNo);
733 if (nmatch == 3 || nmatch == 4)
734 matched = true;
735 }
736 }
737
738 /*
739 * The sscanf tests above can match files that have extra characters at
740 * the end. To eliminate such cases, cross-check that GetRelationPath
741 * creates the exact same filename, when passed the RelFileNode
742 * information we extracted from the filename.
743 */
744 if (matched)
745 {
746 char *check_path = datasegpath(rnode, MAIN_FORKNUM, segNo);
747
748 if (strcmp(check_path, path) != 0)
749 matched = false;
750
751 pfree(check_path);
752 }
753
754 return matched;
755}
756
757/*
758 * A helper function to create the path of a relation file and segment.
759 *
760 * The returned path is palloc'd
761 */
762static char *
763datasegpath(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
764{
765 char *path;
766 char *segpath;
767
768 path = relpathperm(rnode, forknum);
769 if (segno > 0)
770 {
771 segpath = psprintf("%s.%u", path, segno);
772 pfree(path);
773 return segpath;
774 }
775 else
776 return path;
777}
778
779static int
780path_cmp(const void *a, const void *b)
781{
782 file_entry_t *fa = *((file_entry_t **) a);
783 file_entry_t *fb = *((file_entry_t **) b);
784
785 return strcmp(fa->path, fb->path);
786}
787
788/*
789 * In the final stage, the filemap is sorted so that removals come last.
790 * From disk space usage point of view, it would be better to do removals
791 * first, but for now, safety first. If a whole directory is deleted, all
792 * files and subdirectories inside it need to removed first. On creation,
793 * parent directory needs to be created before files and directories inside
794 * it. To achieve that, the file_action_t enum is ordered so that we can
795 * just sort on that first. Furthermore, sort REMOVE entries in reverse
796 * path order, so that "foo/bar" subdirectory is removed before "foo".
797 */
798static int
799final_filemap_cmp(const void *a, const void *b)
800{
801 file_entry_t *fa = *((file_entry_t **) a);
802 file_entry_t *fb = *((file_entry_t **) b);
803
804 if (fa->action > fb->action)
805 return 1;
806 if (fa->action < fb->action)
807 return -1;
808
809 if (fa->action == FILE_ACTION_REMOVE)
810 return strcmp(fb->path, fa->path);
811 else
812 return strcmp(fa->path, fb->path);
813}
814