1 | /* |
2 | * This Source Code Form is subject to the terms of the Mozilla Public |
3 | * License, v. 2.0. If a copy of the MPL was not distributed with this |
4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. |
5 | * |
6 | * Copyright 1997 - July 2008 CWI, August 2008 - 2019 MonetDB B.V. |
7 | */ |
8 | |
9 | /* |
10 | * @a M. L. Kersten, P. Boncz, N. Nes |
11 | * |
12 | * @* Database Storage Management |
13 | * Contains routines for writing and reading GDK data to and from |
14 | * disk. This section contains the primitives to manage the |
15 | * disk-based images of the BATs. It relies on the existence of a UNIX |
16 | * file system, including memory mapped files. Solaris and IRIX have |
17 | * different implementations of madvise(). |
18 | * |
19 | * The current version assumes that all BATs are stored on a single |
20 | * disk partition. This simplistic assumption should be replaced in |
21 | * the near future by a multi-volume version. The intention is to use |
22 | * several BAT home locations. The files should be owned by the |
23 | * database server. Otherwise, IO operations are likely to fail. This |
24 | * is accomplished by setting the GID and UID upon system start. |
25 | */ |
26 | #include "monetdb_config.h" |
27 | #include "gdk.h" |
28 | #include "gdk_private.h" |
29 | #include "gdk_storage.h" |
30 | #include "mutils.h" |
31 | #ifdef HAVE_FCNTL_H |
32 | #include <fcntl.h> |
33 | #endif |
34 | |
35 | #ifndef O_CLOEXEC |
36 | #define O_CLOEXEC 0 |
37 | #endif |
38 | |
39 | /* GDKfilepath returns a newly allocated string containing the path |
40 | * name of a database farm. |
41 | * The arguments are the farmID or -1, the name of a subdirectory |
42 | * within the farm (i.e., something like BATDIR or BAKDIR -- see |
43 | * gdk.h) or NULL, the name of a BAT (i.e. the name that is stored in |
44 | * BBP.dir -- something like 07/714), and finally the file extension. |
45 | * |
46 | * If farmid is >= 0, GDKfilepath returns the complete path to the |
47 | * specified farm concatenated with the other arguments with |
48 | * appropriate separators. If farmid is -1, it returns the |
49 | * concatenation of its other arguments (in this case, the result |
50 | * cannot be used to access a file directly -- the farm needs to be |
51 | * prepended in some other place). */ |
52 | char * |
53 | GDKfilepath(int farmid, const char *dir, const char *name, const char *ext) |
54 | { |
55 | const char *sep; |
56 | size_t pathlen; |
57 | char *path; |
58 | |
59 | if (GDKinmemory()) |
60 | return GDKstrdup(":inmemory" ); |
61 | |
62 | assert(dir == NULL || *dir != DIR_SEP); |
63 | assert(farmid == NOFARM || |
64 | (farmid >= 0 && farmid < MAXFARMS && BBPfarms[farmid].dirname)); |
65 | if (MT_path_absolute(name)) { |
66 | GDKerror("GDKfilepath: name should not be absolute\n" ); |
67 | return NULL; |
68 | } |
69 | if (dir && *dir == DIR_SEP) |
70 | dir++; |
71 | if (dir == NULL || dir[0] == 0 || dir[strlen(dir) - 1] == DIR_SEP) { |
72 | sep = "" ; |
73 | } else { |
74 | sep = DIR_SEP_STR; |
75 | } |
76 | pathlen = (farmid == NOFARM ? 0 : strlen(BBPfarms[farmid].dirname) + 1) + |
77 | (dir ? strlen(dir) : 0) + strlen(sep) + strlen(name) + |
78 | (ext ? strlen(ext) + 1 : 0) + 1; |
79 | path = GDKmalloc(pathlen); |
80 | if (path == NULL) |
81 | return NULL; |
82 | if (farmid == NOFARM) { |
83 | strconcat_len(path, pathlen, |
84 | dir ? dir : "" , sep, name, |
85 | ext ? "." : NULL, ext, NULL); |
86 | } else { |
87 | strconcat_len(path, pathlen, |
88 | BBPfarms[farmid].dirname, DIR_SEP_STR, |
89 | dir ? dir : "" , sep, name, |
90 | ext ? "." : NULL, ext, NULL); |
91 | } |
92 | return path; |
93 | } |
94 | |
95 | /* make sure the parent directory of DIR exists (the argument itself |
96 | * is usually a file that is to be created) */ |
97 | gdk_return |
98 | GDKcreatedir(const char *dir) |
99 | { |
100 | char path[FILENAME_MAX]; |
101 | char *r; |
102 | DIR *dirp; |
103 | |
104 | IODEBUG fprintf(stderr, "#GDKcreatedir(%s)\n" , dir); |
105 | assert(!GDKinmemory()); |
106 | assert(MT_path_absolute(dir)); |
107 | if (strlen(dir) >= FILENAME_MAX) { |
108 | GDKerror("GDKcreatedir: directory name too long\n" ); |
109 | return GDK_FAIL; |
110 | } |
111 | strcpy(path, dir); /* we know this fits (see above) */ |
112 | /* skip initial /, if any */ |
113 | for (r = strchr(path + 1, DIR_SEP); r; r = strchr(r, DIR_SEP)) { |
114 | *r = 0; |
115 | if ( |
116 | #ifdef WIN32 |
117 | strlen(path) > 3 && |
118 | #endif |
119 | mkdir(path, MONETDB_DIRMODE) < 0) { |
120 | if (errno != EEXIST) { |
121 | GDKsyserror("GDKcreatedir: cannot create directory %s\n" , path); |
122 | IODEBUG fprintf(stderr, "#GDKcreatedir: mkdir(%s) failed\n" , path); |
123 | return GDK_FAIL; |
124 | } |
125 | if ((dirp = opendir(path)) == NULL) { |
126 | GDKsyserror("GDKcreatedir: %s not a directory\n" , path); |
127 | IODEBUG fprintf(stderr, "#GDKcreatedir: opendir(%s) failed\n" , path); |
128 | return GDK_FAIL; |
129 | } |
130 | /* it's a directory, we can continue */ |
131 | closedir(dirp); |
132 | } |
133 | *r++ = DIR_SEP; |
134 | } |
135 | return GDK_SUCCEED; |
136 | } |
137 | |
138 | /* remove the directory DIRNAME with its file contents; does not |
139 | * recurse into subdirectories */ |
140 | gdk_return |
141 | GDKremovedir(int farmid, const char *dirname) |
142 | { |
143 | str dirnamestr; |
144 | DIR *dirp; |
145 | char *path; |
146 | struct dirent *dent; |
147 | int ret; |
148 | |
149 | assert(!GDKinmemory()); |
150 | if ((dirnamestr = GDKfilepath(farmid, NULL, dirname, NULL)) == NULL) |
151 | return GDK_FAIL; |
152 | |
153 | IODEBUG fprintf(stderr, "#GDKremovedir(%s)\n" , dirnamestr); |
154 | |
155 | if ((dirp = opendir(dirnamestr)) == NULL) { |
156 | GDKfree(dirnamestr); |
157 | return GDK_SUCCEED; |
158 | } |
159 | while ((dent = readdir(dirp)) != NULL) { |
160 | if (dent->d_name[0] == '.' && |
161 | (dent->d_name[1] == 0 || |
162 | (dent->d_name[1] == '.' && dent->d_name[2] == 0))) { |
163 | /* skip . and .. */ |
164 | continue; |
165 | } |
166 | path = GDKfilepath(farmid, dirname, dent->d_name, NULL); |
167 | ret = remove(path); |
168 | IODEBUG fprintf(stderr, "#remove %s = %d\n" , path, ret); |
169 | GDKfree(path); |
170 | } |
171 | closedir(dirp); |
172 | ret = rmdir(dirnamestr); |
173 | if (ret != 0) |
174 | GDKsyserror("GDKremovedir: rmdir(%s) failed.\n" , dirnamestr); |
175 | IODEBUG fprintf(stderr, "#rmdir %s = %d\n" , dirnamestr, ret); |
176 | GDKfree(dirnamestr); |
177 | return ret ? GDK_FAIL : GDK_SUCCEED; |
178 | } |
179 | |
180 | #define _FUNBUF 0x040000 |
181 | #define _FWRTHR 0x080000 |
182 | #define _FRDSEQ 0x100000 |
183 | |
184 | /* open a file and return its file descriptor; the file is specified |
185 | * using farmid, name and extension; if opening for writing, we create |
186 | * the parent directory if necessary */ |
187 | int |
188 | GDKfdlocate(int farmid, const char *nme, const char *mode, const char *extension) |
189 | { |
190 | char *path = NULL; |
191 | int fd, flags = O_CLOEXEC; |
192 | |
193 | assert(!GDKinmemory()); |
194 | if (nme == NULL || *nme == 0) |
195 | return -1; |
196 | |
197 | assert(farmid != NOFARM || extension == NULL); |
198 | if (farmid != NOFARM) { |
199 | path = GDKfilepath(farmid, BATDIR, nme, extension); |
200 | if (path == NULL) |
201 | return -1; |
202 | nme = path; |
203 | } |
204 | |
205 | if (*mode == 'm') { /* file open for mmap? */ |
206 | mode++; |
207 | #ifdef _CYGNUS_H_ |
208 | } else { |
209 | flags |= _FRDSEQ; /* WIN32 CreateFile(FILE_FLAG_SEQUENTIAL_SCAN) */ |
210 | #endif |
211 | } |
212 | |
213 | if (strchr(mode, 'w')) { |
214 | flags |= O_WRONLY | O_CREAT; |
215 | } else if (!strchr(mode, '+')) { |
216 | flags |= O_RDONLY; |
217 | } else { |
218 | flags |= O_RDWR; |
219 | } |
220 | #ifdef WIN32 |
221 | flags |= strchr(mode, 'b') ? O_BINARY : O_TEXT; |
222 | #endif |
223 | fd = open(nme, flags, MONETDB_MODE); |
224 | if (fd < 0 && *mode == 'w') { |
225 | /* try to create the directory, in case that was the problem */ |
226 | if (GDKcreatedir(nme) == GDK_SUCCEED) { |
227 | fd = open(nme, flags, MONETDB_MODE); |
228 | if (fd < 0) |
229 | GDKsyserror("GDKfdlocate: cannot open file %s\n" , nme); |
230 | } |
231 | } |
232 | /* don't generate error if we can't open a file for reading */ |
233 | GDKfree(path); |
234 | return fd; |
235 | } |
236 | |
/* Like GDKfdlocate, except that a FILE pointer is returned.
 * The descriptor obtained from GDKfdlocate() is wrapped with
 * fdopen(), using MODE without its optional leading 'm'.
 * Returns NULL when the file cannot be opened or wrapped; in the
 * latter case the descriptor is closed again. */
FILE *
GDKfilelocate(int farmid, const char *nme, const char *mode, const char *extension)
{
	FILE *fp;
	int fd = GDKfdlocate(farmid, nme, mode, extension);

	if (fd < 0)
		return NULL;
	/* fdopen() does not know about the mmap marker */
	if (*mode == 'm')
		mode++;
	fp = fdopen(fd, mode);
	if (fp != NULL)
		return fp;

	GDKsyserror("GDKfilelocate: cannot fdopen file\n");
	close(fd);
	return NULL;
}
255 | |
256 | FILE * |
257 | GDKfileopen(int farmid, const char *dir, const char *name, const char *extension, const char *mode) |
258 | { |
259 | char *path; |
260 | |
261 | /* if name is null, try to get one from dir (in case it was a path) */ |
262 | path = GDKfilepath(farmid, dir, name, extension); |
263 | |
264 | if (path != NULL) { |
265 | FILE *f; |
266 | IODEBUG fprintf(stderr, "#GDKfileopen(%s)\n" , path); |
267 | f = fopen(path, mode); |
268 | GDKfree(path); |
269 | return f; |
270 | } |
271 | return NULL; |
272 | } |
273 | |
274 | /* remove the file */ |
275 | gdk_return |
276 | GDKunlink(int farmid, const char *dir, const char *nme, const char *ext) |
277 | { |
278 | if (nme && *nme) { |
279 | char *path; |
280 | |
281 | path = GDKfilepath(farmid, dir, nme, ext); |
282 | if (path == NULL) |
283 | return GDK_FAIL; |
284 | /* if file already doesn't exist, we don't care */ |
285 | if (remove(path) != 0 && errno != ENOENT) { |
286 | GDKsyserror("GDKunlink(%s)\n" , path); |
287 | IODEBUG fprintf(stderr, "#remove %s = -1\n" , path); |
288 | GDKfree(path); |
289 | return GDK_FAIL; |
290 | } |
291 | GDKfree(path); |
292 | return GDK_SUCCEED; |
293 | } |
294 | return GDK_FAIL; |
295 | } |
296 | |
297 | /* |
298 | * A move routine is overloaded to deal with extensions. |
299 | */ |
300 | gdk_return |
301 | GDKmove(int farmid, const char *dir1, const char *nme1, const char *ext1, const char *dir2, const char *nme2, const char *ext2) |
302 | { |
303 | char *path1; |
304 | char *path2; |
305 | int ret, t0 = 0; |
306 | |
307 | IODEBUG t0 = GDKms(); |
308 | |
309 | if ((nme1 == NULL) || (*nme1 == 0)) { |
310 | GDKerror("GDKmove: no file specified\n" ); |
311 | return GDK_FAIL; |
312 | } |
313 | path1 = GDKfilepath(farmid, dir1, nme1, ext1); |
314 | path2 = GDKfilepath(farmid, dir2, nme2, ext2); |
315 | if (path1 && path2) { |
316 | ret = rename(path1, path2); |
317 | if (ret < 0) |
318 | GDKsyserror("GDKmove: cannot rename %s to %s\n" , path1, path2); |
319 | |
320 | IODEBUG fprintf(stderr, "#move %s %s = %d (%dms)\n" , path1, path2, ret, GDKms() - t0); |
321 | } else { |
322 | ret = -1; |
323 | } |
324 | GDKfree(path1); |
325 | GDKfree(path2); |
326 | return ret < 0 ? GDK_FAIL : GDK_SUCCEED; |
327 | } |
328 | |
329 | gdk_return |
330 | GDKextendf(int fd, size_t size, const char *fn) |
331 | { |
332 | struct stat stb; |
333 | int rt = 0; |
334 | int t0 = 0; |
335 | |
336 | assert(!GDKinmemory()); |
337 | #ifdef STATIC_CODE_ANALYSIS |
338 | if (fd < 0) /* in real life, if fd < 0, fstat will fail */ |
339 | return GDK_FAIL; |
340 | #endif |
341 | if (fstat(fd, &stb) < 0) { |
342 | /* shouldn't happen */ |
343 | GDKsyserror("GDKextendf: fstat unexpectedly failed\n" ); |
344 | return GDK_FAIL; |
345 | } |
346 | /* if necessary, extend the underlying file */ |
347 | IODEBUG t0 = GDKms(); |
348 | if (stb.st_size < (off_t) size) { |
349 | #ifdef HAVE_FALLOCATE |
350 | if ((rt = fallocate(fd, 0, stb.st_size, (off_t) size - stb.st_size)) < 0 && |
351 | errno == EOPNOTSUPP) |
352 | /* on Linux, posix_fallocate uses a slow |
353 | * method to allocate blocks if the underlying |
354 | * file system doesn't support the operation, |
355 | * so use fallocate instead and just resize |
356 | * the file if it fails */ |
357 | #else |
358 | #ifdef HAVE_POSIX_FALLOCATE |
359 | /* posix_fallocate returns error number on failure, |
360 | * not -1 :-( */ |
361 | if ((rt = posix_fallocate(fd, stb.st_size, (off_t) size - stb.st_size)) == EINVAL) |
362 | /* on Solaris/OpenIndiana, this may mean that |
363 | * the underlying file system doesn't support |
364 | * the operation, so just resize the file */ |
365 | #endif |
366 | #endif |
367 | /* we get here when (posix_)fallocate fails because it |
368 | * is not supported on the file system, or if neither |
369 | * function exists */ |
370 | rt = ftruncate(fd, (off_t) size); |
371 | if (rt != 0) { |
372 | /* extending failed, try to reduce file size |
373 | * back to original */ |
374 | int err = errno; |
375 | if (ftruncate(fd, stb.st_size)) |
376 | perror("ftruncate" ); |
377 | errno = err; /* restore for error message */ |
378 | GDKsyserror("GDKextendf: could not extend file\n" ); |
379 | } |
380 | } |
381 | IODEBUG fprintf(stderr, "#GDKextend %s %zu -> %zu %dms%s\n" , |
382 | fn, (size_t) stb.st_size, size, |
383 | GDKms() - t0, rt != 0 ? " (failed)" : "" ); |
384 | /* posix_fallocate returns != 0 on failure, fallocate and |
385 | * ftruncate return -1 on failure, but all three return 0 on |
386 | * success */ |
387 | return rt != 0 ? GDK_FAIL : GDK_SUCCEED; |
388 | } |
389 | |
390 | gdk_return |
391 | GDKextend(const char *fn, size_t size) |
392 | { |
393 | int fd, flags = O_RDWR; |
394 | gdk_return rt = GDK_FAIL; |
395 | |
396 | assert(!GDKinmemory()); |
397 | #ifdef O_BINARY |
398 | /* On Windows, open() fails if the file is bigger than 2^32 |
399 | * bytes without O_BINARY. */ |
400 | flags |= O_BINARY; |
401 | #endif |
402 | if ((fd = open(fn, flags | O_CLOEXEC)) >= 0) { |
403 | rt = GDKextendf(fd, size, fn); |
404 | close(fd); |
405 | } else { |
406 | GDKsyserror("GDKextend: cannot open file %s\n" , fn); |
407 | } |
408 | return rt; |
409 | } |
410 | |
411 | /* |
412 | * @+ Save and load. |
413 | * The BAT is saved on disk in several files. The extension DESC |
414 | * denotes the descriptor, BUNs the bun heap, and HHEAP and THEAP the |
415 | * other heaps. The storage mechanism off a file can be memory mapped |
416 | * (STORE_MMAP) or malloced (STORE_MEM). |
417 | * |
418 | * These modes indicates the disk-layout and the intended mapping. |
419 | * The primary concern here is to handle STORE_MMAP and STORE_MEM. |
420 | */ |
421 | gdk_return |
422 | GDKsave(int farmid, const char *nme, const char *ext, void *buf, size_t size, storage_t mode, bool dosync) |
423 | { |
424 | int err = 0; |
425 | |
426 | IODEBUG fprintf(stderr, "#GDKsave: name=%s, ext=%s, mode %d, dosync=%d\n" , nme, ext ? ext : "" , (int) mode, dosync); |
427 | |
428 | assert(!GDKinmemory()); |
429 | if (mode == STORE_MMAP) { |
430 | if (dosync && size && !(GDKdebug & NOSYNCMASK) && MT_msync(buf, size) < 0) |
431 | err = -1; |
432 | if (err) |
433 | GDKsyserror("GDKsave: error on: name=%s, ext=%s, " |
434 | "mode=%d\n" , nme, ext ? ext : "" , |
435 | (int) mode); |
436 | IODEBUG fprintf(stderr, |
437 | "#MT_msync(buf %p, size %zu" |
438 | ") = %d\n" , |
439 | buf, size, err); |
440 | } else { |
441 | int fd; |
442 | |
443 | if ((fd = GDKfdlocate(farmid, nme, "wb" , ext)) >= 0) { |
444 | /* write() on 64-bits Redhat for IA64 returns |
445 | * 32-bits signed result (= OS BUG)! write() |
446 | * on Windows only takes unsigned int as |
447 | * size */ |
448 | while (size > 0) { |
449 | /* circumvent problems by writing huge |
450 | * buffers in chunks <= 1GiB */ |
451 | ssize_t ret; |
452 | |
453 | ret = write(fd, buf, |
454 | (unsigned) MIN(1 << 30, size)); |
455 | if (ret < 0) { |
456 | err = -1; |
457 | GDKsyserror("GDKsave: error %zd" |
458 | " on: name=%s, ext=%s, " |
459 | "mode=%d\n" , ret, nme, |
460 | ext ? ext : "" , (int) mode); |
461 | break; |
462 | } |
463 | size -= ret; |
464 | buf = (void *) ((char *) buf + ret); |
465 | IODEBUG fprintf(stderr, |
466 | "#write(fd %d, buf %p" |
467 | ", size %u) = %zd\n" , |
468 | fd, buf, |
469 | (unsigned) MIN(1 << 30, size), |
470 | ret); |
471 | } |
472 | if (dosync && !(GDKdebug & NOSYNCMASK) |
473 | #if defined(NATIVE_WIN32) |
474 | && _commit(fd) < 0 |
475 | #elif defined(HAVE_FDATASYNC) |
476 | && fdatasync(fd) < 0 |
477 | #elif defined(HAVE_FSYNC) |
478 | && fsync(fd) < 0 |
479 | #endif |
480 | ) { |
481 | GDKsyserror("GDKsave: error on: name=%s, " |
482 | "ext=%s, mode=%d\n" , nme, |
483 | ext ? ext : "" , (int) mode); |
484 | err = -1; |
485 | } |
486 | err |= close(fd); |
487 | if (err && GDKunlink(farmid, BATDIR, nme, ext) != GDK_SUCCEED) { |
488 | /* do not tolerate corrupt heap images |
489 | * (BBPrecover on restart will kill |
490 | * them) */ |
491 | GDKerror("GDKsave: could not remove: name=%s, " |
492 | "ext=%s, mode %d\n" , nme, |
493 | ext ? ext : "" , (int) mode); |
494 | return GDK_FAIL; |
495 | } |
496 | } else { |
497 | err = -1; |
498 | GDKerror("GDKsave: failed name=%s, ext=%s, mode %d\n" , |
499 | nme, ext ? ext : "" , (int) mode); |
500 | } |
501 | } |
502 | return err ? GDK_FAIL : GDK_SUCCEED; |
503 | } |
504 | |
505 | /* |
506 | * Space for the load is directly allocated and the heaps are mapped. |
507 | * Further initialization of the atom heaps require a separate action |
508 | * defined in their implementation. |
509 | * |
510 | * size -- how much to read |
511 | * *maxsize -- (in/out) how much to allocate / how much was allocated |
512 | */ |
513 | char * |
514 | GDKload(int farmid, const char *nme, const char *ext, size_t size, size_t *maxsize, storage_t mode) |
515 | { |
516 | char *ret = NULL; |
517 | |
518 | assert(!GDKinmemory()); |
519 | assert(size <= *maxsize); |
520 | assert(farmid != NOFARM || ext == NULL); |
521 | IODEBUG { |
522 | fprintf(stderr, "#GDKload: name=%s, ext=%s, mode %d\n" , nme, ext ? ext : "" , (int) mode); |
523 | } |
524 | if (mode == STORE_MEM) { |
525 | int fd = GDKfdlocate(farmid, nme, "rb" , ext); |
526 | |
527 | if (fd >= 0) { |
528 | char *dst = ret = GDKmalloc(*maxsize); |
529 | ssize_t n_expected, n = 0; |
530 | |
531 | if (ret) { |
532 | /* read in chunks, some OSs do not |
533 | * give you all at once and Windows |
534 | * only accepts int */ |
535 | for (n_expected = (ssize_t) size; n_expected > 0; n_expected -= n) { |
536 | n = read(fd, dst, (unsigned) MIN(1 << 30, n_expected)); |
537 | if (n < 0) |
538 | GDKsyserror("GDKload: cannot read: name=%s, ext=%s, %zu bytes missing.\n" , nme, ext ? ext : "" , (size_t) n_expected); |
539 | #ifndef STATIC_CODE_ANALYSIS |
540 | /* Coverity doesn't seem to |
541 | * recognize that we're just |
542 | * printing the value of ptr, |
543 | * not its contents */ |
544 | IODEBUG fprintf(stderr, "#read(dst %p, n_expected %zd, fd %d) = %zd\n" , (void *)dst, n_expected, fd, n); |
545 | #endif |
546 | |
547 | if (n <= 0) |
548 | break; |
549 | dst += n; |
550 | } |
551 | if (n_expected > 0) { |
552 | /* we couldn't read all, error |
553 | * already generated */ |
554 | GDKfree(ret); |
555 | ret = NULL; |
556 | } |
557 | #ifndef NDEBUG |
558 | /* just to make valgrind happy, we |
559 | * initialize the whole thing */ |
560 | if (ret && *maxsize > size) |
561 | memset(ret + size, 0, *maxsize - size); |
562 | #endif |
563 | } |
564 | close(fd); |
565 | } else { |
566 | GDKerror("GDKload: cannot open: name=%s, ext=%s\n" , nme, ext ? ext : "" ); |
567 | } |
568 | } else { |
569 | char *path = NULL; |
570 | |
571 | /* round up to multiple of GDK_mmap_pagesize with a |
572 | * minimum of one */ |
573 | size = (*maxsize + GDK_mmap_pagesize - 1) & ~(GDK_mmap_pagesize - 1); |
574 | if (size == 0) |
575 | size = GDK_mmap_pagesize; |
576 | if (farmid != NOFARM) { |
577 | path = GDKfilepath(farmid, BATDIR, nme, ext); |
578 | nme = path; |
579 | } |
580 | if (nme != NULL && GDKextend(nme, size) == GDK_SUCCEED) { |
581 | int mod = MMAP_READ | MMAP_WRITE | MMAP_SEQUENTIAL; |
582 | |
583 | if (mode == STORE_PRIV) |
584 | mod |= MMAP_COPY; |
585 | else |
586 | mod |= MMAP_SYNC; |
587 | ret = GDKmmap(nme, mod, size); |
588 | if (ret != NULL) { |
589 | /* success: update allocated size */ |
590 | *maxsize = size; |
591 | } |
592 | IODEBUG fprintf(stderr, "#mmap(NULL, 0, maxsize %zu, mod %d, path %s, 0) = %p\n" , size, mod, nme, (void *)ret); |
593 | } |
594 | GDKfree(path); |
595 | } |
596 | return ret; |
597 | } |
598 | |
599 | /* |
600 | * @+ BAT disk storage |
601 | * |
602 | * Between sessions the BATs comprising the database are saved on |
603 | * disk. To simplify code, we assume a UNIX directory called its |
604 | * physical @%home@ where they are to be located. The subdirectories |
605 | * BAT and PRG contain what its name says. |
606 | * |
607 | * A BAT created by @%COLnew@ is considered temporary until one calls |
608 | * the routine @%BATsave@. This routine reserves disk space and checks |
609 | * for name clashes. |
610 | * |
611 | * Saving and restoring BATs is left to the upper layers. The library |
612 | * merely copies the data into place. Failure to read or write the |
613 | * BAT results in a NULL, otherwise it returns the BAT pointer. |
614 | */ |
615 | static BAT * |
616 | DESCload(int i) |
617 | { |
618 | const char *s, *nme = BBP_physical(i); |
619 | BAT *b = NULL; |
620 | int tt; |
621 | |
622 | IODEBUG { |
623 | fprintf(stderr, "#DESCload %s\n" , nme ? nme : "<noname>" ); |
624 | } |
625 | b = BBP_desc(i); |
626 | |
627 | if (b == NULL) |
628 | return 0; |
629 | |
630 | tt = b->ttype; |
631 | if ((tt < 0 && (tt = ATOMindex(s = ATOMunknown_name(tt))) < 0)) { |
632 | GDKerror("DESCload: atom '%s' unknown, in BAT '%s'.\n" , s, nme); |
633 | return NULL; |
634 | } |
635 | b->ttype = tt; |
636 | |
637 | /* reconstruct mode from BBP status (BATmode doesn't flush |
638 | * descriptor, so loaded mode may be stale) */ |
639 | b->batTransient = (BBP_status(b->batCacheid) & BBPPERSISTENT) == 0; |
640 | b->batCopiedtodisk = true; |
641 | DESCclean(b); |
642 | return b; |
643 | } |
644 | |
645 | void |
646 | DESCclean(BAT *b) |
647 | { |
648 | b->batDirtyflushed = DELTAdirty(b); |
649 | b->batDirtydesc = false; |
650 | b->theap.dirty = false; |
651 | if (b->tvheap) |
652 | b->tvheap->dirty = false; |
653 | } |
654 | |
655 | /* spawning the background msync should be done carefully |
656 | * because there is a (small) chance that the BAT has been |
657 | * deleted by the time you issue the msync. |
658 | * This leaves you with possibly deadbeef BAT descriptors. |
659 | */ |
660 | |
661 | /* #define DISABLE_MSYNC */ |
662 | #define MSYNC_BACKGROUND |
663 | |
664 | #ifndef DISABLE_MSYNC |
665 | #ifndef MS_ASYNC |
666 | struct msync { |
667 | bat id; |
668 | Heap *h; |
669 | }; |
670 | |
671 | static void |
672 | BATmsyncImplementation(void *arg) |
673 | { |
674 | Heap *h = ((struct msync *) arg)->h; |
675 | |
676 | (void) MT_msync(h->base, h->size); |
677 | BBPunfix(((struct msync *) arg)->id); |
678 | GDKfree(arg); |
679 | } |
680 | #endif |
681 | #endif |
682 | |
683 | void |
684 | BATmsync(BAT *b) |
685 | { |
686 | /* we don't sync views or if we're told not to */ |
687 | if (GDKinmemory() || isVIEW(b) || (GDKdebug & NOSYNCMASK)) |
688 | return; |
689 | /* we don't sync transients */ |
690 | if (b->theap.farmid != 0 || |
691 | (b->tvheap != NULL && b->tvheap->farmid != 0)) |
692 | return; |
693 | #ifndef DISABLE_MSYNC |
694 | #ifdef MS_ASYNC |
695 | if (b->theap.storage == STORE_MMAP) |
696 | (void) msync(b->theap.base, b->theap.free, MS_ASYNC); |
697 | if (b->tvheap && b->tvheap->storage == STORE_MMAP) |
698 | (void) msync(b->tvheap->base, b->tvheap->free, MS_ASYNC); |
699 | #else |
700 | { |
701 | #ifdef MSYNC_BACKGROUND |
702 | MT_Id tid; |
703 | #endif |
704 | struct msync *arg; |
705 | |
706 | assert(!b->batTransient); |
707 | if (b->theap.storage == STORE_MMAP && |
708 | (arg = GDKmalloc(sizeof(*arg))) != NULL) { |
709 | arg->id = b->batCacheid; |
710 | arg->h = &b->theap; |
711 | BBPfix(b->batCacheid); |
712 | #ifdef MSYNC_BACKGROUND |
713 | char name[16]; |
714 | snprintf(name, sizeof(name), "msync%d" , b->batCacheid); |
715 | if (MT_create_thread(&tid, BATmsyncImplementation, arg, |
716 | MT_THR_DETACHED, name) < 0) { |
717 | /* don't bother if we can't create a thread */ |
718 | BBPunfix(b->batCacheid); |
719 | GDKfree(arg); |
720 | } |
721 | #else |
722 | BATmsyncImplementation(arg); |
723 | #endif |
724 | } |
725 | |
726 | if (b->tvheap && b->tvheap->storage == STORE_MMAP && |
727 | (arg = GDKmalloc(sizeof(*arg))) != NULL) { |
728 | arg->id = b->batCacheid; |
729 | arg->h = b->tvheap; |
730 | BBPfix(b->batCacheid); |
731 | #ifdef MSYNC_BACKGROUND |
732 | char name[16]; |
733 | snprintf(name, sizeof(name), "msync%d" , b->batCacheid); |
734 | if (MT_create_thread(&tid, BATmsyncImplementation, arg, |
735 | MT_THR_DETACHED, name) < 0) { |
736 | /* don't bother if we can't create a thread */ |
737 | BBPunfix(b->batCacheid); |
738 | GDKfree(arg); |
739 | } |
740 | #else |
741 | BATmsyncImplementation(arg); |
742 | #endif |
743 | } |
744 | } |
745 | #endif |
746 | #else |
747 | (void) b; |
748 | #endif /* DISABLE_MSYNC */ |
749 | } |
750 | |
751 | gdk_return |
752 | BATsave(BAT *bd) |
753 | { |
754 | gdk_return err = GDK_SUCCEED; |
755 | const char *nme; |
756 | BAT bs; |
757 | Heap vhs; |
758 | BAT *b = bd; |
759 | |
760 | assert(!GDKinmemory()); |
761 | BATcheck(b, "BATsave" , GDK_FAIL); |
762 | |
763 | assert(b->batCacheid > 0); |
764 | /* views cannot be saved, but make an exception for |
765 | * force-remapped views */ |
766 | if (isVIEW(b) && |
767 | !(b->theap.copied && b->theap.storage == STORE_MMAP)) { |
768 | GDKerror("BATsave: %s is a view on %s; cannot be saved\n" , BATgetId(b), BBPname(VIEWtparent(b))); |
769 | return GDK_FAIL; |
770 | } |
771 | if (!BATdirty(b)) { |
772 | return GDK_SUCCEED; |
773 | } |
774 | |
775 | /* copy the descriptor to a local variable in order to let our |
776 | * messing in the BAT descriptor not affect other threads that |
777 | * only read it. */ |
778 | bs = *b; |
779 | b = &bs; |
780 | |
781 | if (b->tvheap) { |
782 | vhs = *bd->tvheap; |
783 | b->tvheap = &vhs; |
784 | } |
785 | |
786 | /* start saving data */ |
787 | nme = BBP_physical(b->batCacheid); |
788 | if (!b->batCopiedtodisk || b->batDirtydesc || b->theap.dirty) |
789 | if (err == GDK_SUCCEED && b->ttype) |
790 | err = HEAPsave(&b->theap, nme, "tail" ); |
791 | if (b->tvheap |
792 | && (!b->batCopiedtodisk || b->batDirtydesc || b->tvheap->dirty) |
793 | && b->ttype |
794 | && b->tvarsized |
795 | && err == GDK_SUCCEED) |
796 | err = HEAPsave(b->tvheap, nme, "theap" ); |
797 | |
798 | if (err == GDK_SUCCEED) { |
799 | bd->batCopiedtodisk = true; |
800 | DESCclean(bd); |
801 | return GDK_SUCCEED; |
802 | } |
803 | return err; |
804 | } |
805 | |
806 | |
807 | /* |
808 | * TODO: move to gdk_bbp.c |
809 | */ |
810 | BAT * |
811 | BATload_intern(bat bid, bool lock) |
812 | { |
813 | const char *nme; |
814 | BAT *b; |
815 | |
816 | assert(!GDKinmemory()); |
817 | assert(bid > 0); |
818 | |
819 | nme = BBP_physical(bid); |
820 | b = DESCload(bid); |
821 | |
822 | if (b == NULL) { |
823 | return NULL; |
824 | } |
825 | |
826 | /* LOAD bun heap */ |
827 | if (b->ttype != TYPE_void) { |
828 | if (HEAPload(&b->theap, nme, "tail" , b->batRestricted == BAT_READ) != GDK_SUCCEED) { |
829 | HEAPfree(&b->theap, false); |
830 | return NULL; |
831 | } |
832 | assert(b->theap.size >> b->tshift <= BUN_MAX); |
833 | b->batCapacity = (BUN) (b->theap.size >> b->tshift); |
834 | } else { |
835 | b->theap.base = NULL; |
836 | } |
837 | |
838 | /* LOAD tail heap */ |
839 | if (ATOMvarsized(b->ttype)) { |
840 | if (HEAPload(b->tvheap, nme, "theap" , b->batRestricted == BAT_READ) != GDK_SUCCEED) { |
841 | HEAPfree(&b->theap, false); |
842 | HEAPfree(b->tvheap, false); |
843 | return NULL; |
844 | } |
845 | if (ATOMstorage(b->ttype) == TYPE_str) { |
846 | strCleanHash(b->tvheap, false); /* ensure consistency */ |
847 | } else { |
848 | HEAP_recover(b->tvheap, (const var_t *) Tloc(b, 0), |
849 | BATcount(b)); |
850 | } |
851 | } |
852 | |
853 | /* initialize descriptor */ |
854 | b->batDirtydesc = false; |
855 | b->theap.parentid = 0; |
856 | |
857 | /* load succeeded; register it in BBP */ |
858 | if (BBPcacheit(b, lock) != GDK_SUCCEED) { |
859 | HEAPfree(&b->theap, false); |
860 | if (b->tvheap) |
861 | HEAPfree(b->tvheap, false); |
862 | return NULL; |
863 | } |
864 | return b; |
865 | } |
866 | |
867 | /* |
868 | * @- BATdelete |
869 | * The new behavior is to let the routine produce warnings but always |
870 | * succeed. rationale: on a delete, we must get rid of *all* the |
871 | * files. We do not have to care about preserving them or be too much |
872 | * concerned if a file that had to be deleted was not found (end |
873 | * result is still that it does not exist). The past behavior to |
874 | * delete some files and then fail was erroneous. The BAT would |
875 | * continue to exist with an incorrect disk status, causing havoc |
876 | * later on. |
877 | * |
878 | * NT forces us to close all files before deleting them; in case of |
879 | * memory mapped files this means that we have to unload the BATs |
880 | * before deleting. This is enforced now. |
881 | */ |
882 | void |
883 | BATdelete(BAT *b) |
884 | { |
885 | bat bid = b->batCacheid; |
886 | const char *o = BBP_physical(bid); |
887 | BAT *loaded = BBP_cache(bid); |
888 | |
889 | assert(bid > 0); |
890 | if (loaded) { |
891 | b = loaded; |
892 | HASHdestroy(b); |
893 | IMPSdestroy(b); |
894 | OIDXdestroy(b); |
895 | } |
896 | if (b->batCopiedtodisk || (b->theap.storage != STORE_MEM)) { |
897 | if (b->ttype != TYPE_void && |
898 | HEAPdelete(&b->theap, o, "tail" ) != GDK_SUCCEED && |
899 | b->batCopiedtodisk) |
900 | IODEBUG fprintf(stderr, "#BATdelete(%s): bun heap\n" , BATgetId(b)); |
901 | } else if (b->theap.base) { |
902 | HEAPfree(&b->theap, true); |
903 | } |
904 | if (b->tvheap) { |
905 | assert(b->tvheap->parentid == bid); |
906 | if (b->batCopiedtodisk || (b->tvheap->storage != STORE_MEM)) { |
907 | if (HEAPdelete(b->tvheap, o, "theap" ) != GDK_SUCCEED && |
908 | b->batCopiedtodisk) |
909 | IODEBUG fprintf(stderr, "#BATdelete(%s): tail heap\n" , BATgetId(b)); |
910 | } else { |
911 | HEAPfree(b->tvheap, true); |
912 | } |
913 | } |
914 | b->batCopiedtodisk = false; |
915 | } |
916 | |
917 | /* |
918 | * BAT specific printing |
919 | */ |
920 | |
921 | gdk_return |
922 | BATprintcolumns(stream *s, int argc, BAT *argv[]) |
923 | { |
924 | int i; |
925 | BUN n, cnt; |
926 | struct colinfo { |
927 | ssize_t (*s) (str *, size_t *, const void *, bool); |
928 | BATiter i; |
929 | } *colinfo; |
930 | char *buf; |
931 | size_t buflen = 0; |
932 | ssize_t len; |
933 | |
934 | /* error checking */ |
935 | for (i = 0; i < argc; i++) { |
936 | if (argv[i] == NULL) { |
937 | GDKerror("Columns missing\n" ); |
938 | return GDK_FAIL; |
939 | } |
940 | if (BATcount(argv[0]) != BATcount(argv[i])) { |
941 | GDKerror("Columns must be the same size\n" ); |
942 | return GDK_FAIL; |
943 | } |
944 | } |
945 | |
946 | if ((colinfo = GDKmalloc(argc * sizeof(*colinfo))) == NULL) { |
947 | GDKerror("Cannot allocate memory\n" ); |
948 | return GDK_FAIL; |
949 | } |
950 | |
951 | for (i = 0; i < argc; i++) { |
952 | colinfo[i].i = bat_iterator(argv[i]); |
953 | colinfo[i].s = BATatoms[argv[i]->ttype].atomToStr; |
954 | } |
955 | |
956 | mnstr_write(s, "#--------------------------#\n" , 1, 29); |
957 | mnstr_write(s, "# " , 1, 2); |
958 | for (i = 0; i < argc; i++) { |
959 | if (i > 0) |
960 | mnstr_write(s, "\t" , 1, 1); |
961 | buf = argv[i]->tident; |
962 | mnstr_write(s, buf, 1, strlen(buf)); |
963 | } |
964 | mnstr_write(s, " # name\n" , 1, 9); |
965 | mnstr_write(s, "# " , 1, 2); |
966 | for (i = 0; i < argc; i++) { |
967 | if (i > 0) |
968 | mnstr_write(s, "\t" , 1, 1); |
969 | buf = ATOMname(argv[i]->ttype); |
970 | mnstr_write(s, buf, 1, strlen(buf)); |
971 | } |
972 | mnstr_write(s, " # type\n" , 1, 9); |
973 | mnstr_write(s, "#--------------------------#\n" , 1, 29); |
974 | buf = NULL; |
975 | |
976 | for (n = 0, cnt = BATcount(argv[0]); n < cnt; n++) { |
977 | mnstr_write(s, "[ " , 1, 2); |
978 | for (i = 0; i < argc; i++) { |
979 | len = colinfo[i].s(&buf, &buflen, BUNtail(colinfo[i].i, n), true); |
980 | if (len < 0) { |
981 | GDKfree(buf); |
982 | GDKfree(colinfo); |
983 | return GDK_FAIL; |
984 | } |
985 | if (i > 0) |
986 | mnstr_write(s, ",\t" , 1, 2); |
987 | mnstr_write(s, buf, 1, len); |
988 | } |
989 | mnstr_write(s, " ]\n" , 1, 4); |
990 | } |
991 | |
992 | GDKfree(buf); |
993 | GDKfree(colinfo); |
994 | |
995 | return GDK_SUCCEED; |
996 | } |
997 | |
998 | gdk_return |
999 | BATprint(stream *fdout, BAT *b) |
1000 | { |
1001 | BAT *argv[2]; |
1002 | gdk_return ret = GDK_FAIL; |
1003 | |
1004 | argv[0] = BATdense(b->hseqbase, b->hseqbase, BATcount(b)); |
1005 | argv[1] = b; |
1006 | if (argv[0] && argv[1]) { |
1007 | ret = BATroles(argv[0], "h" ); |
1008 | if (ret == GDK_SUCCEED) |
1009 | ret = BATprintcolumns(fdout, 2, argv); |
1010 | } |
1011 | if (argv[0]) |
1012 | BBPunfix(argv[0]->batCacheid); |
1013 | return ret; |
1014 | } |
1015 | |