1/*
2 * file.c
3 *
4 * file system operations
5 *
6 * Copyright (c) 2010-2019, PostgreSQL Global Development Group
7 * src/bin/pg_upgrade/file.c
8 */
9
10#include "postgres_fe.h"
11
12#include "access/visibilitymap.h"
13#include "common/file_perm.h"
14#include "pg_upgrade.h"
15#include "storage/bufpage.h"
16#include "storage/checksum.h"
17#include "storage/checksum_impl.h"
18
19#include <sys/stat.h>
20#include <fcntl.h>
21#ifdef HAVE_COPYFILE_H
22#include <copyfile.h>
23#endif
24#ifdef __linux__
25#include <sys/ioctl.h>
26#include <linux/fs.h>
27#endif
28
29
30#ifdef WIN32
31static int win32_pghardlink(const char *src, const char *dst);
32#endif
33
34
/*
 * cloneFile()
 *
 * Clones/reflinks a relation file from src to dst.
 *
 * schemaName/relName are relation's SQL name (used for error messages only).
 *
 * The cloning primitive is selected at compile time: copyfile() with
 * COPYFILE_CLONE_FORCE where that is available, otherwise the Linux FICLONE
 * ioctl.  When neither is available the function body is empty.
 *
 * Every failure is reported through pg_fatal(); the code below is written
 * on the expectation that pg_fatal() does not return.
 */
void
cloneFile(const char *src, const char *dst,
		  const char *schemaName, const char *relName)
{
#if defined(HAVE_COPYFILE) && defined(COPYFILE_CLONE_FORCE)
	if (copyfile(src, dst, NULL, COPYFILE_CLONE_FORCE) < 0)
		pg_fatal("error while cloning relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n",
				 schemaName, relName, src, dst, strerror(errno));
#elif defined(__linux__) && defined(FICLONE)
	int			src_fd;
	int			dest_fd;

	if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0)
		pg_fatal("error while cloning relation \"%s.%s\": could not open file \"%s\": %s\n",
				 schemaName, relName, src, strerror(errno));

	/* O_EXCL: the destination must not exist yet */
	if ((dest_fd = open(dst, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
						pg_file_create_mode)) < 0)
		pg_fatal("error while cloning relation \"%s.%s\": could not create file \"%s\": %s\n",
				 schemaName, relName, dst, strerror(errno));

	if (ioctl(dest_fd, FICLONE, src_fd) < 0)
	{
		/*
		 * Capture the reason the clone failed before cleaning up: unlink()
		 * can fail too, and would then overwrite errno with an unrelated
		 * error code.
		 */
		int			save_errno = errno;

		/* remove the (still empty) destination file we just created */
		unlink(dst);
		pg_fatal("error while cloning relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n",
				 schemaName, relName, src, dst, strerror(save_errno));
	}

	close(src_fd);
	close(dest_fd);
#endif
}
74
75
76/*
77 * copyFile()
78 *
79 * Copies a relation file from src to dst.
80 * schemaName/relName are relation's SQL name (used for error messages only).
81 */
82void
83copyFile(const char *src, const char *dst,
84 const char *schemaName, const char *relName)
85{
86#ifndef WIN32
87 int src_fd;
88 int dest_fd;
89 char *buffer;
90
91 if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0)
92 pg_fatal("error while copying relation \"%s.%s\": could not open file \"%s\": %s\n",
93 schemaName, relName, src, strerror(errno));
94
95 if ((dest_fd = open(dst, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
96 pg_file_create_mode)) < 0)
97 pg_fatal("error while copying relation \"%s.%s\": could not create file \"%s\": %s\n",
98 schemaName, relName, dst, strerror(errno));
99
100 /* copy in fairly large chunks for best efficiency */
101#define COPY_BUF_SIZE (50 * BLCKSZ)
102
103 buffer = (char *) pg_malloc(COPY_BUF_SIZE);
104
105 /* perform data copying i.e read src source, write to destination */
106 while (true)
107 {
108 ssize_t nbytes = read(src_fd, buffer, COPY_BUF_SIZE);
109
110 if (nbytes < 0)
111 pg_fatal("error while copying relation \"%s.%s\": could not read file \"%s\": %s\n",
112 schemaName, relName, src, strerror(errno));
113
114 if (nbytes == 0)
115 break;
116
117 errno = 0;
118 if (write(dest_fd, buffer, nbytes) != nbytes)
119 {
120 /* if write didn't set errno, assume problem is no disk space */
121 if (errno == 0)
122 errno = ENOSPC;
123 pg_fatal("error while copying relation \"%s.%s\": could not write file \"%s\": %s\n",
124 schemaName, relName, dst, strerror(errno));
125 }
126 }
127
128 pg_free(buffer);
129 close(src_fd);
130 close(dest_fd);
131
132#else /* WIN32 */
133
134 if (CopyFile(src, dst, true) == 0)
135 {
136 _dosmaperr(GetLastError());
137 pg_fatal("error while copying relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n",
138 schemaName, relName, src, dst, strerror(errno));
139 }
140
141#endif /* WIN32 */
142}
143
144
/*
 * linkFile()
 *
 * Creates dst as a hard link to the relation file src, using pg_link_file().
 * schemaName/relName are relation's SQL name (used for error messages only).
 *
 * A negative result from pg_link_file() is reported through pg_fatal(),
 * together with the errno text.
 */
void
linkFile(const char *src, const char *dst,
		 const char *schemaName, const char *relName)
{
	int			rc = pg_link_file(src, dst);

	/* nothing more to do when the link was created */
	if (rc >= 0)
		return;

	pg_fatal("error while creating link for relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n",
			 schemaName, relName, src, dst, strerror(errno));
}
159
160
161/*
162 * rewriteVisibilityMap()
163 *
164 * Transform a visibility map file, copying from src to dst.
165 * schemaName/relName are relation's SQL name (used for error messages only).
166 *
167 * In versions of PostgreSQL prior to catversion 201603011, PostgreSQL's
168 * visibility map included one bit per heap page; it now includes two.
169 * When upgrading a cluster from before that time to a current PostgreSQL
170 * version, we could refuse to copy visibility maps from the old cluster
171 * to the new cluster; the next VACUUM would recreate them, but at the
172 * price of scanning the entire table. So, instead, we rewrite the old
173 * visibility maps in the new format. That way, the all-visible bits
174 * remain set for the pages for which they were set previously. The
175 * all-frozen bits are never set by this conversion; we leave that to VACUUM.
176 */
177void
178rewriteVisibilityMap(const char *fromfile, const char *tofile,
179 const char *schemaName, const char *relName)
180{
181 int src_fd;
182 int dst_fd;
183 PGAlignedBlock buffer;
184 PGAlignedBlock new_vmbuf;
185 ssize_t totalBytesRead = 0;
186 ssize_t src_filesize;
187 int rewriteVmBytesPerPage;
188 BlockNumber new_blkno = 0;
189 struct stat statbuf;
190
191 /* Compute number of old-format bytes per new page */
192 rewriteVmBytesPerPage = (BLCKSZ - SizeOfPageHeaderData) / 2;
193
194 if ((src_fd = open(fromfile, O_RDONLY | PG_BINARY, 0)) < 0)
195 pg_fatal("error while copying relation \"%s.%s\": could not open file \"%s\": %s\n",
196 schemaName, relName, fromfile, strerror(errno));
197
198 if (fstat(src_fd, &statbuf) != 0)
199 pg_fatal("error while copying relation \"%s.%s\": could not stat file \"%s\": %s\n",
200 schemaName, relName, fromfile, strerror(errno));
201
202 if ((dst_fd = open(tofile, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
203 pg_file_create_mode)) < 0)
204 pg_fatal("error while copying relation \"%s.%s\": could not create file \"%s\": %s\n",
205 schemaName, relName, tofile, strerror(errno));
206
207 /* Save old file size */
208 src_filesize = statbuf.st_size;
209
210 /*
211 * Turn each visibility map page into 2 pages one by one. Each new page
212 * has the same page header as the old one. If the last section of the
213 * last page is empty, we skip it, mostly to avoid turning one-page
214 * visibility maps for small relations into two pages needlessly.
215 */
216 while (totalBytesRead < src_filesize)
217 {
218 ssize_t bytesRead;
219 char *old_cur;
220 char *old_break;
221 char *old_blkend;
222 PageHeaderData pageheader;
223 bool old_lastblk;
224
225 if ((bytesRead = read(src_fd, buffer.data, BLCKSZ)) != BLCKSZ)
226 {
227 if (bytesRead < 0)
228 pg_fatal("error while copying relation \"%s.%s\": could not read file \"%s\": %s\n",
229 schemaName, relName, fromfile, strerror(errno));
230 else
231 pg_fatal("error while copying relation \"%s.%s\": partial page found in file \"%s\"\n",
232 schemaName, relName, fromfile);
233 }
234
235 totalBytesRead += BLCKSZ;
236 old_lastblk = (totalBytesRead == src_filesize);
237
238 /* Save the page header data */
239 memcpy(&pageheader, buffer.data, SizeOfPageHeaderData);
240
241 /*
242 * These old_* variables point to old visibility map page. old_cur
243 * points to current position on old page. old_blkend points to end of
244 * old block. old_break is the end+1 position on the old page for the
245 * data that will be transferred to the current new page.
246 */
247 old_cur = buffer.data + SizeOfPageHeaderData;
248 old_blkend = buffer.data + bytesRead;
249 old_break = old_cur + rewriteVmBytesPerPage;
250
251 while (old_break <= old_blkend)
252 {
253 char *new_cur;
254 bool empty = true;
255 bool old_lastpart;
256
257 /* First, copy old page header to new page */
258 memcpy(new_vmbuf.data, &pageheader, SizeOfPageHeaderData);
259
260 /* Rewriting the last part of the last old page? */
261 old_lastpart = old_lastblk && (old_break == old_blkend);
262
263 new_cur = new_vmbuf.data + SizeOfPageHeaderData;
264
265 /* Process old page bytes one by one, and turn it into new page. */
266 while (old_cur < old_break)
267 {
268 uint8 byte = *(uint8 *) old_cur;
269 uint16 new_vmbits = 0;
270 int i;
271
272 /* Generate new format bits while keeping old information */
273 for (i = 0; i < BITS_PER_BYTE; i++)
274 {
275 if (byte & (1 << i))
276 {
277 empty = false;
278 new_vmbits |=
279 VISIBILITYMAP_ALL_VISIBLE << (BITS_PER_HEAPBLOCK * i);
280 }
281 }
282
283 /* Copy new visibility map bytes to new-format page */
284 new_cur[0] = (char) (new_vmbits & 0xFF);
285 new_cur[1] = (char) (new_vmbits >> 8);
286
287 old_cur++;
288 new_cur += BITS_PER_HEAPBLOCK;
289 }
290
291 /* If the last part of the last page is empty, skip writing it */
292 if (old_lastpart && empty)
293 break;
294
295 /* Set new checksum for visibility map page, if enabled */
296 if (new_cluster.controldata.data_checksum_version != 0)
297 ((PageHeader) new_vmbuf.data)->pd_checksum =
298 pg_checksum_page(new_vmbuf.data, new_blkno);
299
300 errno = 0;
301 if (write(dst_fd, new_vmbuf.data, BLCKSZ) != BLCKSZ)
302 {
303 /* if write didn't set errno, assume problem is no disk space */
304 if (errno == 0)
305 errno = ENOSPC;
306 pg_fatal("error while copying relation \"%s.%s\": could not write file \"%s\": %s\n",
307 schemaName, relName, tofile, strerror(errno));
308 }
309
310 /* Advance for next new page */
311 old_break += rewriteVmBytesPerPage;
312 new_blkno++;
313 }
314 }
315
316 /* Clean up */
317 close(dst_fd);
318 close(src_fd);
319}
320
321void
322check_file_clone(void)
323{
324 char existing_file[MAXPGPATH];
325 char new_link_file[MAXPGPATH];
326
327 snprintf(existing_file, sizeof(existing_file), "%s/PG_VERSION", old_cluster.pgdata);
328 snprintf(new_link_file, sizeof(new_link_file), "%s/PG_VERSION.clonetest", new_cluster.pgdata);
329 unlink(new_link_file); /* might fail */
330
331#if defined(HAVE_COPYFILE) && defined(COPYFILE_CLONE_FORCE)
332 if (copyfile(existing_file, new_link_file, NULL, COPYFILE_CLONE_FORCE) < 0)
333 pg_fatal("could not clone file between old and new data directories: %s\n",
334 strerror(errno));
335#elif defined(__linux__) && defined(FICLONE)
336 {
337 int src_fd;
338 int dest_fd;
339
340 if ((src_fd = open(existing_file, O_RDONLY | PG_BINARY, 0)) < 0)
341 pg_fatal("could not open file \"%s\": %s\n",
342 existing_file, strerror(errno));
343
344 if ((dest_fd = open(new_link_file, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
345 pg_file_create_mode)) < 0)
346 pg_fatal("could not create file \"%s\": %s\n",
347 new_link_file, strerror(errno));
348
349 if (ioctl(dest_fd, FICLONE, src_fd) < 0)
350 pg_fatal("could not clone file between old and new data directories: %s\n",
351 strerror(errno));
352
353 close(src_fd);
354 close(dest_fd);
355 }
356#else
357 pg_fatal("file cloning not supported on this platform\n");
358#endif
359
360 unlink(new_link_file);
361}
362
363void
364check_hard_link(void)
365{
366 char existing_file[MAXPGPATH];
367 char new_link_file[MAXPGPATH];
368
369 snprintf(existing_file, sizeof(existing_file), "%s/PG_VERSION", old_cluster.pgdata);
370 snprintf(new_link_file, sizeof(new_link_file), "%s/PG_VERSION.linktest", new_cluster.pgdata);
371 unlink(new_link_file); /* might fail */
372
373 if (pg_link_file(existing_file, new_link_file) < 0)
374 pg_fatal("could not create hard link between old and new data directories: %s\n"
375 "In link mode the old and new data directories must be on the same file system.\n",
376 strerror(errno));
377
378 unlink(new_link_file);
379}
380
#ifdef WIN32
/*
 * win32_pghardlink()
 *
 * Implementation of pg_link_file() on Windows: creates dst as a hard link
 * to the existing file src.
 *
 * Returns 0 on success.  On failure returns -1 after passing the Windows
 * error code to _dosmaperr() (presumably so that errno is set for the
 * caller -- confirm against _dosmaperr()'s definition).
 */
static int
win32_pghardlink(const char *src, const char *dst)
{
	/*
	 * CreateHardLinkA takes the new link name first and the existing file
	 * second, and returns zero for failure
	 * http://msdn.microsoft.com/en-us/library/aa363860(VS.85).aspx
	 */
	if (CreateHardLinkA(dst, src, NULL) != 0)
		return 0;

	_dosmaperr(GetLastError());
	return -1;
}
#endif
399