1/*
2 * pg_test_fsync.c
3 * tests all supported fsync() methods
4 */
5
6#include "postgres_fe.h"
7
#include <sys/stat.h>
#include <sys/time.h>
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <signal.h>
#include <time.h>
#include <unistd.h>
14
15#include "getopt_long.h"
16#include "access/xlogdefs.h"
17#include "common/logging.h"
18
19
/*
 * put the temp files in the local directory
 * unless the user specifies otherwise
 */
#define FSYNC_FILENAME "./pg_test_fsync.out"

/* XLOG_BLCKSZ expressed in kB; used in the printed test headings */
#define XLOG_BLCKSZ_K (XLOG_BLCKSZ / 1024)

/* printf format for the label that starts each result line */
#define LABEL_FORMAT " %-30s"
/* printf format for the "n/a" marker printed when a method is not tested */
#define NA_FORMAT "%21s\n"
/* translator: maintain alignment with NA_FORMAT */
#define OPS_FORMAT gettext_noop("%13.3f ops/sec %6.0f usecs/op\n")
/* Number of microseconds in one second; used to scale per-op timings */
#define USECS_SEC 1000000
33
/*
 * Timing macros.  These are macros to avoid timing the function call
 * overhead.
 *
 * START_TIMER clears alarm_triggered, arranges for it to be set again once
 * secs_per_test seconds have passed, and stores the start time in start_t.
 *
 * STOP_TIMER stores the stop time in stop_t and prints the result.  It
 * expects an int variable named "ops" holding the number of completed
 * operations to be in scope at the point of use.
 */
#ifndef WIN32
#define START_TIMER \
do { \
	alarm_triggered = false; \
	alarm(secs_per_test); \
	gettimeofday(&start_t, NULL); \
} while (0)
#else
/*
 * WIN32 doesn't support alarm, so we create a thread and sleep there.
 *
 * CreateThread() reports failure by returning NULL, not
 * INVALID_HANDLE_VALUE, so that is what must be tested.  Nothing ever waits
 * on the thread, so its handle is closed immediately; closing the handle
 * does not terminate the thread, it only avoids leaking one handle per test.
 */
#define START_TIMER \
do { \
	HANDLE		alarm_thread; \
	alarm_triggered = false; \
	alarm_thread = CreateThread(NULL, 0, process_alarm, NULL, 0, NULL); \
	if (alarm_thread == NULL) \
	{ \
		pg_log_error("could not create thread for alarm"); \
		exit(1); \
	} \
	CloseHandle(alarm_thread); \
	gettimeofday(&start_t, NULL); \
} while (0)
#endif

#define STOP_TIMER \
do { \
	gettimeofday(&stop_t, NULL); \
	print_elapse(start_t, stop_t, ops); \
} while (0)
62
63
64static const char *progname;
65
66static int secs_per_test = 5;
67static int needs_unlink = 0;
68static char full_buf[DEFAULT_XLOG_SEG_SIZE],
69 *buf,
70 *filename = FSYNC_FILENAME;
71static struct timeval start_t,
72 stop_t;
73static bool alarm_triggered = false;
74
75
/* Option parsing, buffer setup, and the individual test groups */
static void handle_args(int argc, char *argv[]);
static void prepare_buf(void);
static void test_open(void);
static void test_non_sync(void);
static void test_sync(int writes_per_op);
static void test_open_syncs(void);
static void test_open_sync(const char *msg, int writes_size);
static void test_file_descriptor_sync(void);

/*
 * Timer expiry callback: a SIGALRM handler on non-WIN32 builds, a thread
 * entry point on WIN32 (see START_TIMER).
 */
#ifndef WIN32
static void process_alarm(int sig);
#else
static DWORD WINAPI process_alarm(LPVOID param);
#endif
/* Handler for termination signals; removes the test file and exits */
static void signal_cleanup(int sig);

#ifdef HAVE_FSYNC_WRITETHROUGH
static int pg_fsync_writethrough(int fd);
#endif
/* Prints the ops/sec and usecs/op figures for one finished test */
static void print_elapse(struct timeval start_t, struct timeval stop_t, int ops);

/*
 * Log the (translated) message followed by the current errno text via %m,
 * then exit with status 1.
 */
#define die(msg) do { pg_log_error("%s: %m", _(msg)); exit(1); } while(0)
98
99
100int
101main(int argc, char *argv[])
102{
103 pg_logging_init(argv[0]);
104 set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_test_fsync"));
105 progname = get_progname(argv[0]);
106
107 handle_args(argc, argv);
108
109 /* Prevent leaving behind the test file */
110 pqsignal(SIGINT, signal_cleanup);
111 pqsignal(SIGTERM, signal_cleanup);
112#ifndef WIN32
113 pqsignal(SIGALRM, process_alarm);
114#endif
115#ifdef SIGHUP
116 /* Not defined on win32 */
117 pqsignal(SIGHUP, signal_cleanup);
118#endif
119
120 prepare_buf();
121
122 test_open();
123
124 /* Test using 1 XLOG_BLCKSZ write */
125 test_sync(1);
126
127 /* Test using 2 XLOG_BLCKSZ writes */
128 test_sync(2);
129
130 test_open_syncs();
131
132 test_file_descriptor_sync();
133
134 test_non_sync();
135
136 unlink(filename);
137
138 return 0;
139}
140
141static void
142handle_args(int argc, char *argv[])
143{
144 static struct option long_options[] = {
145 {"filename", required_argument, NULL, 'f'},
146 {"secs-per-test", required_argument, NULL, 's'},
147 {NULL, 0, NULL, 0}
148 };
149
150 int option; /* Command line option */
151 int optindex = 0; /* used by getopt_long */
152
153 if (argc > 1)
154 {
155 if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0)
156 {
157 printf(_("Usage: %s [-f FILENAME] [-s SECS-PER-TEST]\n"), progname);
158 exit(0);
159 }
160 if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0)
161 {
162 puts("pg_test_fsync (PostgreSQL) " PG_VERSION);
163 exit(0);
164 }
165 }
166
167 while ((option = getopt_long(argc, argv, "f:s:",
168 long_options, &optindex)) != -1)
169 {
170 switch (option)
171 {
172 case 'f':
173 filename = pg_strdup(optarg);
174 break;
175
176 case 's':
177 secs_per_test = atoi(optarg);
178 break;
179
180 default:
181 fprintf(stderr, _("Try \"%s --help\" for more information.\n"),
182 progname);
183 exit(1);
184 break;
185 }
186 }
187
188 if (argc > optind)
189 {
190 pg_log_error("too many command-line arguments (first is \"%s\")",
191 argv[optind]);
192 fprintf(stderr, _("Try \"%s --help\" for more information.\n"),
193 progname);
194 exit(1);
195 }
196
197 printf(ngettext("%d second per test\n",
198 "%d seconds per test\n",
199 secs_per_test),
200 secs_per_test);
201#if PG_O_DIRECT != 0
202 printf(_("O_DIRECT supported on this platform for open_datasync and open_sync.\n"));
203#else
204 printf(_("Direct I/O is not supported on this platform.\n"));
205#endif
206}
207
208static void
209prepare_buf(void)
210{
211 int ops;
212
213 /* write random data into buffer */
214 for (ops = 0; ops < DEFAULT_XLOG_SEG_SIZE; ops++)
215 full_buf[ops] = random();
216
217 buf = (char *) TYPEALIGN(XLOG_BLCKSZ, full_buf);
218}
219
220static void
221test_open(void)
222{
223 int tmpfile;
224
225 /*
226 * test if we can open the target file
227 */
228 if ((tmpfile = open(filename, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR)) == -1)
229 die("could not open output file");
230 needs_unlink = 1;
231 if (write(tmpfile, full_buf, DEFAULT_XLOG_SEG_SIZE) !=
232 DEFAULT_XLOG_SEG_SIZE)
233 die("write failed");
234
235 /* fsync now so that dirty buffers don't skew later tests */
236 if (fsync(tmpfile) != 0)
237 die("fsync failed");
238
239 close(tmpfile);
240}
241
242static void
243test_sync(int writes_per_op)
244{
245 int tmpfile,
246 ops,
247 writes;
248 bool fs_warning = false;
249
250 if (writes_per_op == 1)
251 printf(_("\nCompare file sync methods using one %dkB write:\n"), XLOG_BLCKSZ_K);
252 else
253 printf(_("\nCompare file sync methods using two %dkB writes:\n"), XLOG_BLCKSZ_K);
254 printf(_("(in wal_sync_method preference order, except fdatasync is Linux's default)\n"));
255
256 /*
257 * Test open_datasync if available
258 */
259 printf(LABEL_FORMAT, "open_datasync");
260 fflush(stdout);
261
262#ifdef OPEN_DATASYNC_FLAG
263 if ((tmpfile = open(filename, O_RDWR | O_DSYNC | PG_O_DIRECT, 0)) == -1)
264 {
265 printf(NA_FORMAT, _("n/a*"));
266 fs_warning = true;
267 }
268 else
269 {
270 START_TIMER;
271 for (ops = 0; alarm_triggered == false; ops++)
272 {
273 for (writes = 0; writes < writes_per_op; writes++)
274 if (write(tmpfile, buf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
275 die("write failed");
276 if (lseek(tmpfile, 0, SEEK_SET) == -1)
277 die("seek failed");
278 }
279 STOP_TIMER;
280 close(tmpfile);
281 }
282#else
283 printf(NA_FORMAT, _("n/a"));
284#endif
285
286/*
287 * Test fdatasync if available
288 */
289 printf(LABEL_FORMAT, "fdatasync");
290 fflush(stdout);
291
292#ifdef HAVE_FDATASYNC
293 if ((tmpfile = open(filename, O_RDWR, 0)) == -1)
294 die("could not open output file");
295 START_TIMER;
296 for (ops = 0; alarm_triggered == false; ops++)
297 {
298 for (writes = 0; writes < writes_per_op; writes++)
299 if (write(tmpfile, buf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
300 die("write failed");
301 fdatasync(tmpfile);
302 if (lseek(tmpfile, 0, SEEK_SET) == -1)
303 die("seek failed");
304 }
305 STOP_TIMER;
306 close(tmpfile);
307#else
308 printf(NA_FORMAT, _("n/a"));
309#endif
310
311/*
312 * Test fsync
313 */
314 printf(LABEL_FORMAT, "fsync");
315 fflush(stdout);
316
317 if ((tmpfile = open(filename, O_RDWR, 0)) == -1)
318 die("could not open output file");
319 START_TIMER;
320 for (ops = 0; alarm_triggered == false; ops++)
321 {
322 for (writes = 0; writes < writes_per_op; writes++)
323 if (write(tmpfile, buf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
324 die("write failed");
325 if (fsync(tmpfile) != 0)
326 die("fsync failed");
327 if (lseek(tmpfile, 0, SEEK_SET) == -1)
328 die("seek failed");
329 }
330 STOP_TIMER;
331 close(tmpfile);
332
333/*
334 * If fsync_writethrough is available, test as well
335 */
336 printf(LABEL_FORMAT, "fsync_writethrough");
337 fflush(stdout);
338
339#ifdef HAVE_FSYNC_WRITETHROUGH
340 if ((tmpfile = open(filename, O_RDWR, 0)) == -1)
341 die("could not open output file");
342 START_TIMER;
343 for (ops = 0; alarm_triggered == false; ops++)
344 {
345 for (writes = 0; writes < writes_per_op; writes++)
346 if (write(tmpfile, buf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
347 die("write failed");
348 if (pg_fsync_writethrough(tmpfile) != 0)
349 die("fsync failed");
350 if (lseek(tmpfile, 0, SEEK_SET) == -1)
351 die("seek failed");
352 }
353 STOP_TIMER;
354 close(tmpfile);
355#else
356 printf(NA_FORMAT, _("n/a"));
357#endif
358
359/*
360 * Test open_sync if available
361 */
362 printf(LABEL_FORMAT, "open_sync");
363 fflush(stdout);
364
365#ifdef OPEN_SYNC_FLAG
366 if ((tmpfile = open(filename, O_RDWR | OPEN_SYNC_FLAG | PG_O_DIRECT, 0)) == -1)
367 {
368 printf(NA_FORMAT, _("n/a*"));
369 fs_warning = true;
370 }
371 else
372 {
373 START_TIMER;
374 for (ops = 0; alarm_triggered == false; ops++)
375 {
376 for (writes = 0; writes < writes_per_op; writes++)
377 if (write(tmpfile, buf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
378
379 /*
380 * This can generate write failures if the filesystem has
381 * a large block size, e.g. 4k, and there is no support
382 * for O_DIRECT writes smaller than the file system block
383 * size, e.g. XFS.
384 */
385 die("write failed");
386 if (lseek(tmpfile, 0, SEEK_SET) == -1)
387 die("seek failed");
388 }
389 STOP_TIMER;
390 close(tmpfile);
391 }
392#else
393 printf(NA_FORMAT, _("n/a"));
394#endif
395
396 if (fs_warning)
397 {
398 printf(_("* This file system and its mount options do not support direct\n"
399 " I/O, e.g. ext4 in journaled mode.\n"));
400 }
401}
402
/*
 * Print the heading for the open_sync write-size comparison and run
 * test_open_sync() once per write size, from one 16kB write down to
 * sixteen 1kB writes.
 */
static void
test_open_syncs(void)
{
	/* section heading */
	printf(_("\nCompare open_sync with different write sizes:\n"));
	printf(_("(This is designed to compare the cost of writing 16kB in different write\n"
			 "open_sync sizes.)\n"));

	/* second argument is the size of each individual write, in kB */
	test_open_sync(_(" 1 * 16kB open_sync write"), 16);
	test_open_sync(_(" 2 * 8kB open_sync writes"), 8);
	test_open_sync(_(" 4 * 4kB open_sync writes"), 4);
	test_open_sync(_(" 8 * 2kB open_sync writes"), 2);
	test_open_sync(_("16 * 1kB open_sync writes"), 1);
}
416
417/*
418 * Test open_sync with different size files
419 */
420static void
421test_open_sync(const char *msg, int writes_size)
422{
423#ifdef OPEN_SYNC_FLAG
424 int tmpfile,
425 ops,
426 writes;
427#endif
428
429 printf(LABEL_FORMAT, msg);
430 fflush(stdout);
431
432#ifdef OPEN_SYNC_FLAG
433 if ((tmpfile = open(filename, O_RDWR | OPEN_SYNC_FLAG | PG_O_DIRECT, 0)) == -1)
434 printf(NA_FORMAT, _("n/a*"));
435 else
436 {
437 START_TIMER;
438 for (ops = 0; alarm_triggered == false; ops++)
439 {
440 for (writes = 0; writes < 16 / writes_size; writes++)
441 if (write(tmpfile, buf, writes_size * 1024) !=
442 writes_size * 1024)
443 die("write failed");
444 if (lseek(tmpfile, 0, SEEK_SET) == -1)
445 die("seek failed");
446 }
447 STOP_TIMER;
448 close(tmpfile);
449 }
450#else
451 printf(NA_FORMAT, _("n/a"));
452#endif
453}
454
455static void
456test_file_descriptor_sync(void)
457{
458 int tmpfile,
459 ops;
460
461 /*
462 * Test whether fsync can sync data written on a different descriptor for
463 * the same file. This checks the efficiency of multi-process fsyncs
464 * against the same file. Possibly this should be done with writethrough
465 * on platforms which support it.
466 */
467 printf(_("\nTest if fsync on non-write file descriptor is honored:\n"));
468 printf(_("(If the times are similar, fsync() can sync data written on a different\n"
469 "descriptor.)\n"));
470
471 /*
472 * first write, fsync and close, which is the normal behavior without
473 * multiple descriptors
474 */
475 printf(LABEL_FORMAT, "write, fsync, close");
476 fflush(stdout);
477
478 START_TIMER;
479 for (ops = 0; alarm_triggered == false; ops++)
480 {
481 if ((tmpfile = open(filename, O_RDWR, 0)) == -1)
482 die("could not open output file");
483 if (write(tmpfile, buf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
484 die("write failed");
485 if (fsync(tmpfile) != 0)
486 die("fsync failed");
487 close(tmpfile);
488
489 /*
490 * open and close the file again to be consistent with the following
491 * test
492 */
493 if ((tmpfile = open(filename, O_RDWR, 0)) == -1)
494 die("could not open output file");
495 close(tmpfile);
496 }
497 STOP_TIMER;
498
499 /*
500 * Now open, write, close, open again and fsync This simulates processes
501 * fsyncing each other's writes.
502 */
503 printf(LABEL_FORMAT, "write, close, fsync");
504 fflush(stdout);
505
506 START_TIMER;
507 for (ops = 0; alarm_triggered == false; ops++)
508 {
509 if ((tmpfile = open(filename, O_RDWR, 0)) == -1)
510 die("could not open output file");
511 if (write(tmpfile, buf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
512 die("write failed");
513 close(tmpfile);
514 /* reopen file */
515 if ((tmpfile = open(filename, O_RDWR, 0)) == -1)
516 die("could not open output file");
517 if (fsync(tmpfile) != 0)
518 die("fsync failed");
519 close(tmpfile);
520 }
521 STOP_TIMER;
522}
523
524static void
525test_non_sync(void)
526{
527 int tmpfile,
528 ops;
529
530 /*
531 * Test a simple write without fsync
532 */
533 printf(_("\nNon-sync'ed %dkB writes:\n"), XLOG_BLCKSZ_K);
534 printf(LABEL_FORMAT, "write");
535 fflush(stdout);
536
537 START_TIMER;
538 for (ops = 0; alarm_triggered == false; ops++)
539 {
540 if ((tmpfile = open(filename, O_RDWR, 0)) == -1)
541 die("could not open output file");
542 if (write(tmpfile, buf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
543 die("write failed");
544 close(tmpfile);
545 }
546 STOP_TIMER;
547}
548
549static void
550signal_cleanup(int signum)
551{
552 /* Delete the file if it exists. Ignore errors */
553 if (needs_unlink)
554 unlink(filename);
555 /* Finish incomplete line on stdout */
556 puts("");
557 exit(signum);
558}
559
#ifdef HAVE_FSYNC_WRITETHROUGH

/*
 * Sync fd using the platform's write-through primitive: _commit() on WIN32,
 * fcntl(F_FULLFSYNC) where that is defined.  Returns 0 on success and -1 on
 * failure.  When neither primitive is available it fails with errno set to
 * ENOSYS.
 */
static int
pg_fsync_writethrough(int fd)
{
#if defined(WIN32)
	return _commit(fd);
#elif defined(F_FULLFSYNC)
	if (fcntl(fd, F_FULLFSYNC, 0) == -1)
		return -1;
	return 0;
#else
	errno = ENOSYS;
	return -1;
#endif
}
#endif
575
576/*
577 * print out the writes per second for tests
578 */
579static void
580print_elapse(struct timeval start_t, struct timeval stop_t, int ops)
581{
582 double total_time = (stop_t.tv_sec - start_t.tv_sec) +
583 (stop_t.tv_usec - start_t.tv_usec) * 0.000001;
584 double per_second = ops / total_time;
585 double avg_op_time_us = (total_time / ops) * USECS_SEC;
586
587 printf(_(OPS_FORMAT), per_second, avg_op_time_us);
588}
589
590#ifndef WIN32
591static void
592process_alarm(int sig)
593{
594 alarm_triggered = true;
595}
596#else
597static DWORD WINAPI
598process_alarm(LPVOID param)
599{
600 /* WIN32 doesn't support alarm, so we create a thread and sleep here */
601 Sleep(secs_per_test * 1000);
602 alarm_triggered = true;
603 ExitThread(0);
604}
605#endif
606