1/*
2 * This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
5 *
6 * Copyright 1997 - July 2008 CWI, August 2008 - 2019 MonetDB B.V.
7 */
8
9/*
10 * @a Niels Nes, Peter Boncz
11 * @* System Independent Layer
12 *
13 * GDK is built on Posix. Exceptions are made for memory mapped files
14 * and anonymous virtual memory, for which somewhat higher-level
15 * functions are defined here. Most of this file concerns itself with
16 * emulation of Posix functionality on the WIN32 native platform.
17 */
18#include "monetdb_config.h"
19#include "gdk.h" /* includes gdk_posix.h */
20#include "gdk_private.h"
21#include "mutils.h"
22#include <unistd.h>
23#include <string.h> /* strncpy */
24
25#ifdef HAVE_FCNTL_H
26# include <fcntl.h>
27#endif
28#ifdef HAVE_PROCFS_H
29# include <procfs.h>
30#endif
31#ifdef HAVE_MACH_TASK_H
32# include <mach/task.h>
33#endif
34#ifdef HAVE_MACH_MACH_INIT_H
35# include <mach/mach_init.h>
36#endif
37#if defined(HAVE_KVM_H)
38# include <kvm.h>
39# include <sys/param.h>
40# include <sys/sysctl.h>
41# include <sys/user.h>
42#endif
43
44#ifdef NDEBUG
45#ifndef NVALGRIND
46#define NVALGRIND NDEBUG
47#endif
48#endif
49
50#if defined(__GNUC__) && defined(HAVE_VALGRIND)
51#include <valgrind.h>
52#else
53#define VALGRIND_MALLOCLIKE_BLOCK(addr, sizeB, rzB, is_zeroed)
54#define VALGRIND_FREELIKE_BLOCK(addr, rzB)
55#define VALGRIND_RESIZEINPLACE_BLOCK(addr, oldSizeB, newSizeB, rzB)
56#endif
57
58#ifndef MAP_NORESERVE
59# define MAP_NORESERVE MAP_PRIVATE
60#endif
61#if defined(MAP_ANON) && !defined(MAP_ANONYMOUS)
62#define MAP_ANONYMOUS MAP_ANON
63#endif
64
65#define MMAP_ADVISE 7
66#define MMAP_WRITABLE (MMAP_WRITE|MMAP_COPY)
67
68#ifndef O_CLOEXEC
69#define O_CLOEXEC 0
70#endif
71
72/* DDALERT: AIX4.X 64bits needs HAVE_SETENV==0 due to a AIX bug, but
73 * it probably isn't detected so by configure */
74
75#ifndef HAVE_SETENV
/* Replacement setenv() for platforms that lack one, built on putenv().
 *
 * NAME is set to VALUE in the environment.  When NAME already has a
 * value it is only replaced if OVERWRITE is non-zero.
 *
 * Returns 0 when nothing had to be done, -1 when the "NAME=VALUE"
 * string could not be allocated, and the result of putenv()
 * otherwise. */
int
setenv(const char *name, const char *value, int overwrite)
{
	size_t namelen, vallen;
	char *entry;

	/* keep an existing value unless asked to replace it */
	if (!overwrite && getenv(name) != NULL)
		return 0;

	namelen = strlen(name);
	vallen = strlen(value);
	/* room for name, '=', value and the terminating NUL */
	entry = GDKmalloc(2 + namelen + vallen);
	if (entry == NULL)
		return -1;
	memcpy(entry, name, namelen);
	entry[namelen] = '=';
	memcpy(entry + namelen + 1, value, vallen + 1);

	/* the string is deliberately never freed: releasing it led to
	 * crashes (putenv() is documented to make the string itself
	 * part of the environment) */
	return putenv(entry);
}
94#endif
95
96/* Crude VM buffer management that keeps a list of all memory mapped
97 * regions.
98 *
99 * a.k.a. "helping stupid VM implementations that ignore VM advice"
100 *
101 * The main goal is to be able to tell the OS to please stop buffering
102 * all memory mapped pages when under pressure. A major problem is
103 * materialization of large results in newly created memory mapped
104 * files. Operating systems tend to cache all dirty pages, such that
105 * when memory is out, all pages are dirty and cannot be unloaded
106 * quickly. The VM panic occurs and comatose OS states may be
107 * observed. This is in spite of our use of
108 * madvise(MADV_SEQUENTIAL). That is; we would want that the OS drops
109 * pages after we've passed them. That does not happen; pages are
110 * retained and pollute the buffer cache.
111 *
112 * Regrettably, at this level, we don't know anything about how Monet
113 * is using the mmapped regions. Monet code is totally oblivious of
114 * any I/O; that's why it is so easy to create CPU efficient code in
115 * Monet.
116 *
117 * The current solution focuses on large writable maps. These often
118 * represent newly created BATs, that are the result of some (running)
119 * operator. We assume two things here:
120 * - the BAT is created in sequential fashion (always almost true)
121 * - afterwards, this BAT is used in sequential fashion (often true)
122 *
123 * A VMtrim thread keeps an eye on the RSS (memory pressure) and large
124 * writable memory maps. If RSS approaches mem_maxsize(), it starts to
125 * *worry*, and starts to write dirty data from these writable maps to
126 * disk in 128MB tiles. So, if memory pressure rises further in the
127 * near future, the OS has some option to release memory pages cheaply
128 * (i.e. without needing I/O). This is also done explicitly by the
129 * VM-thread: when RSS exceeds mem_maxsize() it explicitly asks the OS
130 * to release pages. The reason is that Linux is not smart enough to
131 * do even this. Anyway..
132 *
133 * The way to free pages explicitly in Linux is to call
134 * posix_fadvise(..,POSIX_FADV_DONTNEED). Particularly,
135 * posix_madvise(..,POSIX_MADV_DONTNEED) which is supported and
136 * documented doesn't work on Linux. But we do both posix_madvise and
137 * posix_fadvise, so on other unix systems that don't support
138 * posix_fadvise, posix_madvise still might work. On Windows, to our
139 * knowledge, there is no way to tell it stop buffering a memory
140 * mapped region. msync (FlushViewOfFile) does work, though. So let's
141 * hope the VM paging algorithm behaves better than Linux which just
142 * runs off the cliff and if MonetDB does not prevent RSS from being
143 * too high, enters coma.
144 *
145 * We will only be able to sensibly test this on Windows64. On
146 * Windows32, mmap sizes do not significantly exceed RAM sizes so
147 * MonetDB swapping actually will not happen (of course, you've got
148 * this nasty problem of VM fragmentation and failing mmaps instead).
149 *
150 * In principle, page tiles are saved sequentially, and behind it, but
151 * never overtaking it, is an "unload-cursor" that frees the pages if
152 * that is needed to keep RSS down. There is a tweak in the
153 * algorithm, that re-sets the unload-cursor if it seems that all
154 * tiles to the end have been saved (whether a tile is actually saved
155 * is determined by timing the sync action). This means that the
156 * producing operator is ready creating the BAT, and we assume it is
157 * going to be used sequentially afterwards. In that case, we should
158 * start unloading right after the 'read-cursor', that is, from the
159 * start.
160 *
161 * EXAMPLE
162 * D = dirty tile
163 * s = saved tile (i.e. clean)
164 * u = unloaded tile
165 * L = tile that is being loaded
166 *
167 * +--> operator produces BAT
168 * (1) DDDDDD|......................................| end of reserved mmap
169 * ____|RSS
170 * |
171 * | at 3/4 of RSS consumed we start to worry
172 * +--> operator produces BAT
173 * (2) DDDDDDDDDDDDDDDD|............................|
174 * s<----------------------------- VM backwards save thread
175 * |
176 * + first tile of which saving costs anything
177 *
178 * +--> operator produces BAT
179 * (3) DDDDDDDDDDDDDDDss|D|.........................|
180 * VM-thread save ->|
181 *
182 * When the RSS target is exceeded, we start unloading tiles..
183 *
184 * +--> VM-thread unload starts at *second* 's'
185 * |
186 * | +--> operator produces BAT
187 * (4) DDDDDDDDDDDDDDDsus|DD|........................|
188 * VM-thread save -->| | RSS = Full!
189 *
190 * +-- 0 => save costs nothing!!
191 * VM-thread save ------------->| assume bat complete
192 * (5) DDDDDDDDDDDDDDDsuuuuuuuuussss0................|
193 * |<-------- re-set unload cursor
194 * +--- first tile was not unloaded.
195 *
196 * later.. some other operator sequentially reads the bat
197 * first part is 'D', that is, nicely cached.
198 *
199 * ---read------->|
200 * (6) DDDDDDDDDDDDDDDsuuuuuuuuussss0................|
201 *
202 * now we're hitting the unloaded region. the query becomes
203 * I/O read bound here (typically 20% CPU utilization).
204 *
205 * ---read-------->|
206 * (7) DDDDDDDDDDDDDDDuLuuuuuuuussss0................|
207 * / \
208 * unload cursor load cursor
209 *
210 * ---read---------------->|
211 * (8) DDDDDDDDDDDDDDDuuuuuuuuuLssss0................|
212 * / \
213 * unload cursor load cursor
214 *
215 * ---read--------------------->| done
216 * (9) DDDDDDDDDDDDDDDuuuuuuuuuLssss0................|
217 * ****
218 * last part still cached
219 *
220 * note: if we had not reset the unload cursor (5)
221 * the last part would have been lost due to continuing
222 * RSS pressure from the 'L' read-cursor.
223 *
224 * If multiple write-mmaps exist, we do unload-tile and save-tile
225 * selection on a round-robin basis among them.
226 *
227 * Of course, this is a simple solution for simple cases only.
228 * (a) if the bat is produced too fast, (or your disk is too slow)
229 * RSS will exceed its limit and Linux will go into swapping.
230 * (b) if your data is not produced and read sequentially.
231 * Examples are sorting or clustering on huge datasets.
232 * (c) if RSS pressure is due to large read-maps, rather than
233 * intermediate results.
234 *
235 * Two crude suggestions:
236 * - If we are under RSS pressure without unloadable tiles and with
237 * savable tiles, we should consider suspending *all* other threads
238 * until we manage to unload a tile.
239 * - if there are no savable tiles (or in case of read-only maps)
240 * we could resort to saving and unloading random tiles.
241 *
242 * To do better, our BAT algorithms should provide even more detailed
243 * advice on their access patterns, which may even consist of pointers
244 * to the cursors (i.e. pointers to b->batBuns->free or the cursors
245 * in radix-cluster), which an enhanced version of this thread might
246 * take into account.
247 *
248 * [Kersten] The memory map table should be aligned to the number of
249 * mapped files. In more recent applications, such as the SkyServer
250 * this may be around 2000 BATs easily.
251 */
252
253#ifdef HAVE_PTHREAD_H
254/* pthread.h on Windows includes config.h if HAVE_CONFIG_H is set */
255#undef HAVE_CONFIG_H
256#include <sched.h>
257#include <pthread.h>
258#endif
259#ifdef HAVE_SEMAPHORE_H
260#include <semaphore.h>
261#endif
262
263#ifndef NATIVE_WIN32
264#ifdef HAVE_POSIX_FADVISE
265#ifdef HAVE_UNAME
266#include <sys/utsname.h>
267#endif
268#endif
269
/* Platform initialization hook; the POSIX build has nothing to set
 * up, so this is a no-op. */
void
MT_init_posix(void)
{
	/* intentionally empty */
	return;
}
274
275/* return RSS in bytes */
276size_t
277MT_getrss(void)
278{
279#if defined(HAVE_PROCFS_H) && defined(__sun__)
280 /* retrieve RSS the Solaris way (2.6+) */
281 int fd;
282 psinfo_t psbuff;
283
284 fd = open("/proc/self/psinfo", O_RDONLY | O_CLOEXEC);
285 if (fd >= 0) {
286 if (read(fd, &psbuff, sizeof(psbuff)) == sizeof(psbuff)) {
287 close(fd);
288 return psbuff.pr_rssize * 1024;
289 }
290 close(fd);
291 }
292#elif defined(HAVE_TASK_INFO)
293 /* Darwin/MACH call for process' RSS */
294 task_t task = mach_task_self();
295 struct task_basic_info_64 t_info;
296 mach_msg_type_number_t t_info_count = TASK_BASIC_INFO_64_COUNT;
297
298 if (task_info(task, TASK_BASIC_INFO_64, (task_info_t)&t_info, &t_info_count) != KERN_INVALID_POLICY)
299 return t_info.resident_size; /* bytes */
300#elif defined(HAVE_KVM_H)
301 /* get RSS on FreeBSD and NetBSD */
302 struct kinfo_proc *ki;
303 int ski = 1;
304 kvm_t *kd;
305 size_t rss = 0;
306
307 kd = kvm_open(NULL, "/dev/null", NULL, O_RDONLY, "kvm_open");
308 if (kd != NULL) {
309 ki = kvm_getprocs(kd, KERN_PROC_PID, getpid(), &ski);
310 if (ki != NULL) {
311#ifdef __NetBSD__ /* should we use configure for this? */
312 /* see bug 3217 */
313 rss = ki->kp_eproc.e_vm.vm_rssize;
314#else
315 rss = ki->ki_rssize;
316#endif
317 kvm_close(kd);
318
319 return rss * MT_pagesize();
320 } else {
321 kvm_close(kd);
322 }
323 }
324#elif defined(__linux__)
325 /* get RSS on Linux */
326 int fd;
327
328 fd = open("/proc/self/stat", O_RDONLY | O_CLOEXEC);
329 if (fd >= 0) {
330 char buf[1024], *r = buf;
331 ssize_t i, sz = read(fd, buf, 1024);
332
333 close(fd);
334 if (sz > 0) {
335 for (i = 0; i < 23; i++) {
336 while (*r && (*r == ' ' || *r == '\t'))
337 r++;
338 while (*r && (*r != ' ' && *r != '\t'))
339 r++;
340 }
341 while (*r && (*r == ' ' || *r == '\t'))
342 r++;
343 return ((size_t) atol(r)) * MT_pagesize();
344 }
345 }
346#endif
347 return 0;
348}
349
350void *
351MT_mmap(const char *path, int mode, size_t len)
352{
353 int fd;
354 void *ret;
355
356 fd = open(path, O_CREAT | ((mode & MMAP_WRITE) ? O_RDWR : O_RDONLY) | O_CLOEXEC, MONETDB_MODE);
357 if (fd < 0) {
358 GDKsyserror("MT_mmap: open %s failed\n", path);
359 return MAP_FAILED;
360 }
361 ret = mmap(NULL,
362 len,
363 ((mode & MMAP_WRITABLE) ? PROT_WRITE : 0) | PROT_READ,
364 (mode & MMAP_COPY) ? (MAP_PRIVATE | MAP_NORESERVE) : MAP_SHARED,
365 fd,
366 0);
367 if (ret == MAP_FAILED) {
368 GDKsyserror("MT_mmap: mmap(%s,%zu) failed\n", path, len);
369 ret = NULL;
370 }
371 close(fd);
372 VALGRIND_MALLOCLIKE_BLOCK(ret, len, 0, 1);
373 return ret;
374}
375
/* Remove the memory mapping of LEN bytes starting at P.
 *
 * Returns the result of munmap(): 0 on success, a negative value on
 * failure (which is also reported through GDKsyserror). */
int
MT_munmap(void *p, size_t len)
{
	int rc;

	rc = munmap(p, len);
	if (rc < 0) {
		GDKsyserror("MT_munmap: munmap(%p,%zu) failed\n",
			    p, len);
	}
	VALGRIND_FREELIKE_BLOCK(p, 0);
#ifdef MMAP_DEBUG
	fprintf(stderr, "#munmap(%p,%zu) = %d\n", p, len, rc);
#endif
	return rc;
}
390
391/* expand or shrink a memory map (ala realloc).
392 * the address returned may be different from the address going in.
393 * in case of failure, the old address is still mapped and NULL is returned.
394 */
395void *
396MT_mremap(const char *path, int mode, void *old_address, size_t old_size, size_t *new_size)
397{
398 void *p;
399 int fd = -1;
400 int flags = mode & MMAP_COPY ? MAP_PRIVATE : MAP_SHARED;
401 int prot = PROT_WRITE | PROT_READ;
402
403 /* round up to multiple of page size */
404 *new_size = (*new_size + GDK_mmap_pagesize - 1) & ~(GDK_mmap_pagesize - 1);
405
406 /* doesn't make sense for us to extend read-only memory map */
407 assert(mode & MMAP_WRITABLE);
408
409 if (*new_size < old_size) {
410#ifndef STATIC_CODE_ANALYSIS /* hide this from static code analyzer */
411 /* shrink */
412 VALGRIND_RESIZEINPLACE_BLOCK(old_address, old_size, *new_size, 0);
413 if (munmap((char *) old_address + *new_size,
414 old_size - *new_size) < 0) {
415 GDKsyserror("MT_mremap: munmap(%p,%zu) failed\n",
416 ((char *) old_address + *new_size),
417 old_size - *new_size);
418 fprintf(stderr, "= %s:%d: MT_mremap(%s,%p,%zu,%zu): munmap() failed\n", __FILE__, __LINE__, path?path:"NULL", old_address, old_size, *new_size);
419 /* even though the system call failed, we
420 * don't need to propagate the error up: the
421 * address should still work in the same way
422 * as it did before */
423 return old_address;
424 }
425 if (path && truncate(path, *new_size) < 0)
426 fprintf(stderr, "#MT_mremap(%s): truncate failed\n", path);
427#ifdef MMAP_DEBUG
428 fprintf(stderr, "MT_mremap(%s,%p,%zu,%zu) -> shrinking\n", path?path:"NULL", old_address, old_size, *new_size);
429#endif
430#endif /* !STATIC_CODE_ANALYSIS */
431 return old_address;
432 }
433 if (*new_size == old_size) {
434 /* do nothing */
435#ifdef MMAP_DEBUG
436 fprintf(stderr, "MT_mremap(%s,%p,%zu,%zu) -> unchanged\n", path?path:"NULL", old_address, old_size, *new_size);
437#endif
438 return old_address;
439 }
440
441 if (!(mode & MMAP_COPY) && path != NULL) {
442 /* "normal" memory map */
443
444 if ((fd = open(path, O_RDWR | O_CLOEXEC)) < 0) {
445 GDKsyserror("MT_mremap: open(%s) failed\n", path);
446 fprintf(stderr, "= %s:%d: MT_mremap(%s,%p,%zu,%zu): open() failed\n", __FILE__, __LINE__, path, old_address, old_size, *new_size);
447 return NULL;
448 }
449 if (GDKextendf(fd, *new_size, path) != GDK_SUCCEED) {
450 close(fd);
451 fprintf(stderr, "= %s:%d: MT_mremap(%s,%p,%zu,%zu): GDKextendf() failed\n", __FILE__, __LINE__, path, old_address, old_size, *new_size);
452 return NULL;
453 }
454#ifdef HAVE_MREMAP
455 /* on Linux it's easy */
456 p = mremap(old_address, old_size, *new_size, MREMAP_MAYMOVE);
457 if (p == MAP_FAILED)
458 GDKsyserror("MT_mremap: mremap(%p,%zu,%zu) failed\n",
459 old_address, old_size,
460 *new_size);
461#ifdef HAVE_VALGRIND
462 if (p == old_address) {
463 VALGRIND_RESIZEINPLACE_BLOCK(old_address, old_size, *new_size, 0);
464 } else {
465 VALGRIND_FREELIKE_BLOCK(old_address, 0);
466 VALGRIND_MALLOCLIKE_BLOCK(p, *new_size, 0, 1);
467 }
468#endif
469#else
470 /* try to map extension at end of current map */
471 p = mmap((char *) old_address + old_size, *new_size - old_size,
472 prot, flags, fd, old_size);
473 /* if it failed, there is no point trying a full mmap:
474 * that too won't fit */
475 if (p != MAP_FAILED) {
476 if (p == (char *) old_address + old_size) {
477 /* we got the requested address, make
478 * sure we return the correct (old)
479 * address */
480 VALGRIND_RESIZEINPLACE_BLOCK(old_address, old_size, *new_size, 0);
481 p = old_address;
482 } else {
483 /* we got some other address: discard
484 * it and make full mmap */
485 munmap(p, *new_size - old_size);
486#ifdef NO_MMAP_ALIASING
487 msync(old_address, old_size, MS_SYNC);
488#endif
489 /* first create full mmap, then, if
490 * successful, remove old mmap */
491 p = mmap(NULL, *new_size, prot, flags, fd, 0);
492 if (p != MAP_FAILED) {
493 VALGRIND_MALLOCLIKE_BLOCK(p, *new_size, 0, 1);
494 munmap(old_address, old_size);
495 VALGRIND_FREELIKE_BLOCK(old_address, 0);
496 }
497 }
498 }
499 if (p == MAP_FAILED)
500 GDKsyserror("MT_mremap: mmap failed\n");
501#endif /* HAVE_MREMAP */
502 close(fd);
503 } else {
504 /* "copy-on-write" or "anonymous" memory map */
505#ifdef MAP_ANONYMOUS
506 flags |= MAP_ANONYMOUS;
507#else
508 if ((fd = open("/dev/zero", O_RDWR | O_CLOEXEC)) < 0) {
509 GDKsyserror("MT_mremap: open(/dev/zero) failed\n");
510 fprintf(stderr, "= %s:%d: MT_mremap(%s,%p,%zu,%zu): open('/dev/zero') failed\n", __FILE__, __LINE__, path?path:"NULL", old_address, old_size, *new_size);
511 return NULL;
512 }
513#endif
514 /* try to map an anonymous area as extent to the
515 * current map */
516 p = mmap((char *) old_address + old_size, *new_size - old_size,
517 prot, flags, fd, 0);
518 /* no point trying a full map if this didn't work:
519 * there isn't enough space */
520 if (p != MAP_FAILED) {
521 if (p == (char *) old_address + old_size) {
522 /* we got the requested address, make
523 * sure we return the correct (old)
524 * address */
525 VALGRIND_RESIZEINPLACE_BLOCK(old_address, old_size, *new_size, 0);
526 p = old_address;
527 } else {
528 /* we got some other address: discard
529 * it and make full mmap */
530 munmap(p, *new_size - old_size);
531#ifdef HAVE_MREMAP
532 /* first get an area large enough for
533 * *new_size */
534 p = mmap(NULL, *new_size, prot, flags, fd, 0);
535 if (p != MAP_FAILED) {
536 /* then overlay old mmap over new */
537 void *q;
538
539 q = mremap(old_address, old_size,
540 old_size,
541 MREMAP_FIXED | MREMAP_MAYMOVE,
542 p);
543 assert(q == p || q == MAP_FAILED);
544 if (q == MAP_FAILED) {
545 /* we didn't expect this... */
546 munmap(p, *new_size);
547 p = MAP_FAILED;
548 }
549#ifdef HAVE_VALGRIND
550 else {
551 VALGRIND_FREELIKE_BLOCK(old_size, 0);
552 VALGRIND_MALLOCLIKE_BLOCK(p, *new_size, 0, 1);
553 }
554#endif
555 }
556#else
557 p = MAP_FAILED;
558 if (path == NULL ||
559 *new_size <= GDK_mmap_minsize_persistent) {
560 /* size not too big yet or
561 * anonymous, try to make new
562 * anonymous mmap and copy
563 * data over */
564 p = mmap(NULL, *new_size, prot, flags,
565 fd, 0);
566 if (p != MAP_FAILED) {
567 VALGRIND_MALLOCLIKE_BLOCK(p, *new_size, 0, 0);
568 memcpy(p, old_address,
569 old_size);
570 munmap(old_address, old_size);
571 VALGRIND_FREELIKE_BLOCK(old_address, 0);
572 }
573 /* if it failed, try alternative */
574 }
575 if (p == MAP_FAILED && path != NULL) {
576#ifdef HAVE_POSIX_FALLOCATE
577 int rt;
578#endif
579 /* write data to disk, then
580 * mmap it to new address */
581 if (fd >= 0)
582 close(fd);
583 p = malloc(strlen(path) + 5);
584 if (p == NULL){
585 GDKsyserror("MT_mremap: malloc() failed\n");
586 fprintf(stderr, "= %s:%d: MT_mremap(%s,%p,%zu,%zu): fd < 0\n", __FILE__, __LINE__, path, old_address, old_size, *new_size);
587 return NULL;
588 }
589
590 strcat(strcpy(p, path), ".tmp");
591 fd = open(p, O_RDWR | O_CREAT | O_CLOEXEC,
592 MONETDB_MODE);
593 if (fd < 0) {
594 GDKsyserror("MT_mremap: open(%s) failed\n", (char *) p);
595 free(p);
596 fprintf(stderr, "= %s:%d: MT_mremap(%s,%p,%zu,%zu): fd < 0\n", __FILE__, __LINE__, path, old_address, old_size, *new_size);
597 return NULL;
598 }
599 free(p);
600 if (write(fd, old_address,
601 old_size) < 0 ||
602#ifdef HAVE_FALLOCATE
603 /* prefer Linux-specific
604 * fallocate over standard
605 * posix_fallocate, since
606 * glibc uses a rather
607 * slow method of
608 * allocating the file if
609 * the file system doesn't
610 * support the operation,
611 * we just use ftruncate
612 * in that case */
613 (fallocate(fd, 0, (off_t) old_size, (off_t) *new_size - (off_t) old_size) < 0 && (errno != EOPNOTSUPP || ftruncate(fd, (off_t) *new_size) < 0))
614#else
615#ifdef HAVE_POSIX_FALLOCATE
616 /* posix_fallocate returns
617 * error number on
618 * failure, not -1, and if
619 * it returns EINVAL, the
620 * underlying file system
621 * may not support the
622 * operation, so we then
623 * need to try
624 * ftruncate */
625 ((rt = posix_fallocate(fd, (off_t) old_size, (off_t) *new_size - (off_t) old_size)) == EINVAL ? ftruncate(fd, (off_t) *new_size) < 0 : rt != 0)
626#else
627 ftruncate(fd, (off_t) *new_size) < 0
628#endif
629#endif
630 ) {
631 int err = errno, other;
632 /* extending failed:
633 * free any disk space
634 * allocated in the
635 * process */
636 other = ftruncate(fd, (off_t) old_size);
637 (void) other; /* silence compiler warning for ignoring result of ftruncate */
638 errno = err; /* restore for error message */
639 GDKsyserror("MT_mremap: growing file failed\n");
640 close(fd);
641 fprintf(stderr,
642 "= %s:%d: MT_mremap(%s,%p,%zu,%zu): write() or "
643#ifdef HAVE_FALLOCATE
644 "fallocate()"
645#else
646#ifdef HAVE_POSIX_FALLOCATE
647 "posix_fallocate()"
648#else
649 "ftruncate()"
650#endif
651#endif
652 " failed\n", __FILE__, __LINE__, path, old_address, old_size, *new_size);
653 return NULL;
654 }
655 p = mmap(NULL, *new_size, prot, flags,
656 fd, 0);
657 if (p != MAP_FAILED) {
658 VALGRIND_MALLOCLIKE_BLOCK(p, *new_size, 0, 1);
659 munmap(old_address, old_size);
660 VALGRIND_FREELIKE_BLOCK(old_address, 0);
661 }
662 }
663#endif /* HAVE_MREMAP */
664 }
665 }
666 if (p == MAP_FAILED)
667 GDKsyserror("MT_mremap: mmap failed\n");
668 if (fd >= 0)
669 close(fd);
670 }
671#ifdef MMAP_DEBUG
672 fprintf(stderr, "MT_mremap(%s,%p,%zu,%zu) -> %p%s\n", path?path:"NULL", old_address, old_size, *new_size, p, path && mode & MMAP_COPY ? " private" : "");
673#endif
674 if (p == MAP_FAILED)
675 fprintf(stderr, "= %s:%d: MT_mremap(%s,%p,%zu,%zu): p == MAP_FAILED\n", __FILE__, __LINE__, path?path:"NULL", old_address, old_size, *new_size);
676 return p == MAP_FAILED ? NULL : p;
677}
678
/* Synchronously write the LEN bytes of the memory map starting at P
 * back to its file.
 *
 * Returns the result of msync(): 0 on success, a negative value on
 * failure (which is also reported through GDKsyserror). */
int
MT_msync(void *p, size_t len)
{
	const int rc = msync(p, len, MS_SYNC);

	if (rc < 0) {
		GDKsyserror("MT_msync: msync failed\n");
	}
#ifdef MMAP_DEBUG
	fprintf(stderr,
		"#msync(%p,%zu,MS_SYNC) = %d\n",
		p, len, rc);
#endif
	return rc;
}
693
694bool
695MT_path_absolute(const char *pathname)
696{
697 return (*pathname == DIR_SEP);
698}
699
700#ifdef HAVE_DLFCN_H
701# include <dlfcn.h>
702#endif
703
704void *
705mdlopen(const char *library, int mode)
706{
707 (void) library;
708 return dlopen(NULL, mode);
709}
710
711#else /* WIN32 native */
712
713#ifndef BUFSIZ
714#define BUFSIZ 1024
715#endif
716
717#undef _errno
718#undef stat
719#undef rmdir
720#undef mkdir
721
722#include <windows.h>
723
724#ifdef _MSC_VER
725#include <io.h>
726#endif /* _MSC_VER */
727#include <Psapi.h>
728
729#define MT_SMALLBLOCK 256
730
731static LONG WINAPI
732MT_ignore_exceptions(struct _EXCEPTION_POINTERS *ExceptionInfo)
733{
734 (void) ExceptionInfo;
735 return EXCEPTION_EXECUTE_HANDLER;
736}
737
738void
739MT_init_posix(void)
740{
741 SetUnhandledExceptionFilter(MT_ignore_exceptions);
742}
743
744size_t
745MT_getrss(void)
746{
747 PROCESS_MEMORY_COUNTERS ctr;
748 if (GetProcessMemoryInfo(GetCurrentProcess(), &ctr, sizeof(ctr)))
749 return ctr.WorkingSetSize;
750 return 0;
751}
752
753/* Windows mmap keeps a global list of base addresses for complex
754 * (remapped) memory maps; the reason is that each remapped segment
755 * needs to be unmapped separately in the end. */
756
757void *
758MT_mmap(const char *path, int mode, size_t len)
759{
760 DWORD mode0 = FILE_READ_ATTRIBUTES | FILE_READ_DATA;
761 DWORD mode1 = FILE_SHARE_READ | FILE_SHARE_WRITE;
762 DWORD mode2 = mode & MMAP_ADVISE;
763 DWORD mode3 = PAGE_READONLY;
764 int mode4 = FILE_MAP_READ;
765 SECURITY_ATTRIBUTES sa;
766 HANDLE h1, h2;
767 void *ret;
768
769 if (mode & MMAP_WRITE) {
770 mode0 |= FILE_APPEND_DATA | FILE_WRITE_ATTRIBUTES | FILE_WRITE_DATA;
771 }
772 if (mode2 == MMAP_RANDOM || mode2 == MMAP_DONTNEED) {
773 mode2 = FILE_FLAG_RANDOM_ACCESS;
774 } else if (mode2 == MMAP_SEQUENTIAL || mode2 == MMAP_WILLNEED) {
775 mode2 = FILE_FLAG_SEQUENTIAL_SCAN;
776 } else {
777 mode2 = FILE_FLAG_NO_BUFFERING;
778 }
779 if (mode & MMAP_SYNC) {
780 mode2 |= FILE_FLAG_WRITE_THROUGH;
781 }
782 if (mode & MMAP_COPY) {
783 mode3 = PAGE_WRITECOPY;
784 mode4 = FILE_MAP_COPY;
785 } else if (mode & MMAP_WRITE) {
786 mode3 = PAGE_READWRITE;
787 mode4 = FILE_MAP_WRITE;
788 }
789 sa.nLength = sizeof(SECURITY_ATTRIBUTES);
790 sa.bInheritHandle = TRUE;
791 sa.lpSecurityDescriptor = 0;
792
793 h1 = CreateFile(path, mode0, mode1, &sa, OPEN_ALWAYS, mode2, NULL);
794 if (h1 == INVALID_HANDLE_VALUE) {
795 (void) SetFileAttributes(path, FILE_ATTRIBUTE_NORMAL);
796 h1 = CreateFile(path, mode0, mode1, &sa, OPEN_ALWAYS, mode2, NULL);
797 if (h1 == INVALID_HANDLE_VALUE) {
798 GDKwinerror("MT_mmap: CreateFile('%s', %lu, %lu, &sa, %lu, %lu, NULL) failed\n",
799 path, mode0, mode1, (DWORD) OPEN_ALWAYS, mode2);
800 return NULL;
801 }
802 }
803
804 h2 = CreateFileMapping(h1, &sa, mode3, (DWORD) (((__int64) len >> 32) & LL_CONSTANT(0xFFFFFFFF)), (DWORD) (len & LL_CONSTANT(0xFFFFFFFF)), NULL);
805 if (h2 == NULL) {
806 GDKwinerror("MT_mmap: CreateFileMapping(%p, &sa, %lu, %lu, %lu, NULL) failed\n",
807 h1, mode3,
808 (DWORD) (((__int64) len >> 32) & LL_CONSTANT(0xFFFFFFFF)),
809 (DWORD) (len & LL_CONSTANT(0xFFFFFFFF)));
810 CloseHandle(h1);
811 return NULL;
812 }
813 CloseHandle(h1);
814
815 ret = MapViewOfFileEx(h2, mode4, (DWORD) 0, (DWORD) 0, len, NULL);
816 if (ret == NULL)
817 errno = winerror(GetLastError());
818 CloseHandle(h2);
819
820 return ret;
821}
822
/* Unmap the view starting at P; the size argument is not used on
 * Windows.
 *
 * Returns 0 on success and -1 on failure, mirroring Unix munmap
 * (UnmapViewOfFile itself reports success as non-zero). */
int
MT_munmap(void *p, size_t dummy)
{
	(void) dummy;
	if (UnmapViewOfFile(p))
		return 0;
	GDKwinerror("MT_munmap failed\n");
	return -1;
}
838
839void *
840MT_mremap(const char *path, int mode, void *old_address, size_t old_size, size_t *new_size)
841{
842 void *p;
843
844 /* doesn't make sense for us to extend read-only memory map */
845 assert(mode & MMAP_WRITABLE);
846
847 /* round up to multiple of page size */
848 *new_size = (*new_size + GDK_mmap_pagesize - 1) & ~(GDK_mmap_pagesize - 1);
849
850 if (old_size >= *new_size) {
851 *new_size = old_size;
852 return old_address; /* don't bother shrinking */
853 }
854 if (GDKextend(path, *new_size) != GDK_SUCCEED) {
855 fprintf(stderr, "= %s:%d: MT_mremap(%s,%p,%zu,%zu): GDKextend() failed\n", __FILE__, __LINE__, path?path:"NULL", old_address, old_size, *new_size);
856 return NULL;
857 }
858 if (path && !(mode & MMAP_COPY))
859 MT_munmap(old_address, old_size);
860 p = MT_mmap(path, mode, *new_size);
861 if (p != NULL && (path == NULL || (mode & MMAP_COPY))) {
862 memcpy(p, old_address, old_size);
863 MT_munmap(old_address, old_size);
864 }
865#ifdef MMAP_DEBUG
866 fprintf(stderr, "MT_mremap(%s,%p,%zu,%zu) -> %p\n", path?path:"NULL", old_address, old_size, *new_size, p);
867#endif
868 if (p == NULL)
869 fprintf(stderr, "= %s:%d: MT_mremap(%s,%p,%zu,%zu): p == NULL\n", __FILE__, __LINE__, path?path:"NULL", old_address, old_size, *new_size);
870 return p;
871}
872
/* Flush the LEN bytes of the view starting at P to disk.
 *
 * Returns 0 on success and -1 on failure, mirroring Unix msync
 * (FlushViewOfFile itself reports success as non-zero). */
int
MT_msync(void *p, size_t len)
{
	if (FlushViewOfFile(p, len))
		return 0;
	GDKwinerror("MT_msync: FlushViewOfFile failed\n");
	return -1;
}
887
/* Return whether PATHNAME is an absolute Windows path: either a
 * drive letter followed by a colon and a (forward or backward) slash,
 * or a name starting with two backslashes. */
bool
MT_path_absolute(const char *pathname)
{
	const char first = pathname[0];

	/* two leading backslashes */
	if (first == '\\')
		return pathname[1] == '\\';

	/* drive letter, colon, directory separator */
	if (!(('a' <= first && first <= 'z') ||
	      ('A' <= first && first <= 'Z')))
		return false;
	if (pathname[1] != ':')
		return false;
	return pathname[2] == '/' || pathname[2] == '\\';
}
898
899#ifndef HAVE_GETTIMEOFDAY
900static int nodays[12] = { 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 };
901
902#define LEAPYEAR(y) ((((y)%4)==0 && ((y)%100)!=0) || ((y)%400)==0)
903#define NODAYS(m,y) (((m)!=2)?nodays[(m)-1]:LEAPYEAR(y)?29:28)
904
905int
906gettimeofday(struct timeval *tv, int *ignore_zone)
907{
908 unsigned int year, day, month;
909 SYSTEMTIME st;
910
911 (void) ignore_zone;
912 GetSystemTime(&st);
913 day = 0;
914 for (year = 1970; year < st.wYear; year++)
915 day += LEAPYEAR(year) ? 366 : 365;
916
917 for (month = 1; month < st.wMonth; month++)
918 day += NODAYS(month, st.wYear);
919
920 day += st.wDay;
921 tv->tv_sec = 60 * (day * 24 * 60 + st.wMinute) + st.wSecond;
922 tv->tv_usec = 1000 * st.wMilliseconds;
923 return 0;
924}
925#endif
926
/* Open the shared library FILE by handing both arguments to the
 * dlopen() emulation defined in this file and returning its result. */
void *
mdlopen(const char *file, int mode)
{
	void *handle = dlopen(file, mode);

	return handle;
}
932
/* dlopen() emulation for Windows.
 *
 * With a NULL FILE the module handle of the running executable is
 * returned, otherwise the result of LoadLibrary(FILE).  MODE is
 * ignored. */
void *
dlopen(const char *file, int mode)
{
	(void) mode;
	if (file == NULL)
		return GetModuleHandle(NULL);
	return (void *) LoadLibrary(file);
}
942
943int
944dlclose(void *handle)
945{
946 if (handle != NULL) {
947 return FreeLibrary((HINSTANCE) handle);
948 }
949 return -1;
950}
951
952void *
953dlsym(void *handle, const char *name)
954{
955 if (handle != NULL) {
956 return (void *) GetProcAddress((HINSTANCE) handle, name);
957 }
958 return NULL;
959}
960
961char *
962dlerror(void)
963{
964 static char msg[1024];
965
966 FormatMessage(FORMAT_MESSAGE_FROM_SYSTEM, NULL, GetLastError(), 0, msg, sizeof(msg), NULL);
967 return msg;
968}
969
970/* dir manipulations fail in WIN32 if file name contains trailing
971 * slashes; work around this */
972static char *
973reduce_dir_name(const char *src, char *dst, size_t cap)
974{
975 size_t len = strlen(src);
976 char *buf = dst;
977
978 if (len >= cap)
979 buf = malloc(len + 1);
980 if (buf == NULL)
981 return NULL;
982 while (--len > 0 && src[len - 1] != ':' && src[len] == DIR_SEP)
983 ;
984 for (buf[++len] = 0; len > 0; buf[len] = src[len])
985 len--;
986 return buf;
987}
988
989#undef _stat64
/* stat() wrapper for Windows: calls _stat64 on PATHNAME after
 * reduce_dir_name has removed trailing separators.
 *
 * Returns the result of _stat64, or -1 when the reduced name could
 * not be produced. */
int
win_stat(const char *pathname, struct _stat64 *st)
{
	char stackbuf[128];
	char *name = reduce_dir_name(pathname, stackbuf, sizeof(stackbuf));
	int rc;

	if (name == NULL)
		return -1;
	rc = _stat64(name, st);
	/* the name was heap allocated when it did not fit on the stack */
	if (name != stackbuf)
		free(name);
	return rc;
}
1003
1004int
1005win_rmdir(const char *pathname)
1006{
1007 char buf[128], *p = reduce_dir_name(pathname, buf, sizeof(buf));
1008 int ret;
1009
1010 if (p == NULL)
1011 return -1;
1012 ret = _rmdir(p);
1013 if (ret < 0 && errno != ENOENT) {
1014 /* it could be the <expletive deleted> indexing
1015 * service which prevents us from doing what we have a
1016 * right to do, so try again (once) */
1017 IODEBUG fprintf(stderr, "retry rmdir %s\n", pathname);
1018 MT_sleep_ms(100); /* wait a little */
1019 ret = _rmdir(p);
1020 }
1021 if (p != buf)
1022 free(p);
1023 return ret;
1024}
1025
1026int
1027win_unlink(const char *pathname)
1028{
1029 int ret = _unlink(pathname);
1030 if (ret < 0) {
1031 /* Vista is paranoid: we cannot delete read-only files
1032 * owned by ourselves. Vista somehow also sets these
1033 * files to read-only.
1034 */
1035 (void) SetFileAttributes(pathname, FILE_ATTRIBUTE_NORMAL);
1036 ret = _unlink(pathname);
1037 }
1038 if (ret < 0 && errno != ENOENT) {
1039 /* it could be the <expletive deleted> indexing
1040 * service which prevents us from doing what we have a
1041 * right to do, so try again (once) */
1042 IODEBUG fprintf(stderr, "retry unlink %s\n", pathname);
1043 MT_sleep_ms(100); /* wait a little */
1044 ret = _unlink(pathname);
1045 }
1046 return ret;
1047}
1048
1049#undef rename
1050int
1051win_rename(const char *old, const char *dst)
1052{
1053 int ret;
1054
1055 ret = rename(old, dst);
1056 if (ret == 0 || (ret < 0 && errno == ENOENT))
1057 return ret;
1058 if (ret < 0 && errno == EEXIST) {
1059 (void) win_unlink(dst);
1060 ret = rename(old, dst);
1061 }
1062
1063 if (ret < 0 && errno != ENOENT) {
1064 /* it could be the <expletive deleted> indexing
1065 * service which prevents us from doing what we have a
1066 * right to do, so try again (once) */
1067 IODEBUG fprintf(stderr, "#retry rename %s %s\n", old, dst);
1068 MT_sleep_ms(100); /* wait a little */
1069 ret = rename(old, dst);
1070 }
1071 return ret;
1072}
1073
/* mkdir() wrapper for Windows: calls _mkdir on PATHNAME after
 * reduce_dir_name has removed trailing separators.  MODE is ignored.
 *
 * Returns the result of _mkdir, or -1 when the reduced name could not
 * be produced. */
int
win_mkdir(const char *pathname, const int mode)
{
	char stackbuf[128];
	char *name = reduce_dir_name(pathname, stackbuf, sizeof(stackbuf));
	int rc;

	(void) mode;
	if (name == NULL)
		return -1;
	rc = _mkdir(name);
	/* the name was heap allocated when it did not fit on the stack */
	if (name != stackbuf)
		free(name);
	return rc;
}
1088#endif
1089
/* Suspend the calling thread for about MS milliseconds.
 *
 * Uses Sleep() on Windows, nanosleep() where available, and select()
 * with an empty descriptor set otherwise.  A request of exactly 1 is
 * special-cased on the non-Windows paths: it waits 1000 ns with
 * nanosleep or 1 us with select.  The result of the system call is
 * ignored. */
void
MT_sleep_ms(unsigned int ms)
{
#ifdef NATIVE_WIN32
	Sleep(ms);
#else
#ifdef HAVE_NANOSLEEP
	struct timespec delay = {0};

	delay.tv_sec = ms / 1000;
	if (ms == 1)
		delay.tv_nsec = 1000;
	else
		delay.tv_nsec = (long) (ms % 1000) * 1000000;
	(void) nanosleep(&delay, NULL);
#else
	struct timeval delay = {0};

	delay.tv_sec = ms / 1000;
	if (ms == 1)
		delay.tv_usec = 1;
	else
		delay.tv_usec = (ms % 1000) * 1000;
	(void) select(0, NULL, NULL, NULL, &delay);
#endif
#endif
}
1107