| 1 | /* |
| 2 | * This Source Code Form is subject to the terms of the Mozilla Public |
| 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this |
| 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. |
| 5 | * |
| 6 | * Copyright 1997 - July 2008 CWI, August 2008 - 2019 MonetDB B.V. |
| 7 | */ |
| 8 | |
| 9 | /* |
| 10 | * @a M. L. Kersten, P. Boncz, N. Nes |
| 11 | * |
| 12 | * @* Database Storage Management |
| 13 | * Contains routines for writing and reading GDK data to and from |
| 14 | * disk. This section contains the primitives to manage the |
| 15 | * disk-based images of the BATs. It relies on the existence of a UNIX |
| 16 | * file system, including memory mapped files. Solaris and IRIX have |
| 17 | * different implementations of madvise(). |
| 18 | * |
| 19 | * The current version assumes that all BATs are stored on a single |
| 20 | * disk partition. This simplistic assumption should be replaced in |
| 21 | * the near future by a multi-volume version. The intention is to use |
| 22 | * several BAT home locations. The files should be owned by the |
| 23 | * database server. Otherwise, IO operations are likely to fail. This |
| 24 | * is accomplished by setting the GID and UID upon system start. |
| 25 | */ |
| 26 | #include "monetdb_config.h" |
| 27 | #include "gdk.h" |
| 28 | #include "gdk_private.h" |
| 29 | #include "gdk_storage.h" |
| 30 | #include "mutils.h" |
| 31 | #ifdef HAVE_FCNTL_H |
| 32 | #include <fcntl.h> |
| 33 | #endif |
| 34 | |
| 35 | #ifndef O_CLOEXEC |
| 36 | #define O_CLOEXEC 0 |
| 37 | #endif |
| 38 | |
| 39 | /* GDKfilepath returns a newly allocated string containing the path |
| 40 | * name of a database farm. |
| 41 | * The arguments are the farmID or -1, the name of a subdirectory |
| 42 | * within the farm (i.e., something like BATDIR or BAKDIR -- see |
| 43 | * gdk.h) or NULL, the name of a BAT (i.e. the name that is stored in |
| 44 | * BBP.dir -- something like 07/714), and finally the file extension. |
| 45 | * |
| 46 | * If farmid is >= 0, GDKfilepath returns the complete path to the |
| 47 | * specified farm concatenated with the other arguments with |
| 48 | * appropriate separators. If farmid is -1, it returns the |
| 49 | * concatenation of its other arguments (in this case, the result |
| 50 | * cannot be used to access a file directly -- the farm needs to be |
| 51 | * prepended in some other place). */ |
| 52 | char * |
| 53 | GDKfilepath(int farmid, const char *dir, const char *name, const char *ext) |
| 54 | { |
| 55 | const char *sep; |
| 56 | size_t pathlen; |
| 57 | char *path; |
| 58 | |
| 59 | if (GDKinmemory()) |
| 60 | return GDKstrdup(":inmemory" ); |
| 61 | |
| 62 | assert(dir == NULL || *dir != DIR_SEP); |
| 63 | assert(farmid == NOFARM || |
| 64 | (farmid >= 0 && farmid < MAXFARMS && BBPfarms[farmid].dirname)); |
| 65 | if (MT_path_absolute(name)) { |
| 66 | GDKerror("GDKfilepath: name should not be absolute\n" ); |
| 67 | return NULL; |
| 68 | } |
| 69 | if (dir && *dir == DIR_SEP) |
| 70 | dir++; |
| 71 | if (dir == NULL || dir[0] == 0 || dir[strlen(dir) - 1] == DIR_SEP) { |
| 72 | sep = "" ; |
| 73 | } else { |
| 74 | sep = DIR_SEP_STR; |
| 75 | } |
| 76 | pathlen = (farmid == NOFARM ? 0 : strlen(BBPfarms[farmid].dirname) + 1) + |
| 77 | (dir ? strlen(dir) : 0) + strlen(sep) + strlen(name) + |
| 78 | (ext ? strlen(ext) + 1 : 0) + 1; |
| 79 | path = GDKmalloc(pathlen); |
| 80 | if (path == NULL) |
| 81 | return NULL; |
| 82 | if (farmid == NOFARM) { |
| 83 | strconcat_len(path, pathlen, |
| 84 | dir ? dir : "" , sep, name, |
| 85 | ext ? "." : NULL, ext, NULL); |
| 86 | } else { |
| 87 | strconcat_len(path, pathlen, |
| 88 | BBPfarms[farmid].dirname, DIR_SEP_STR, |
| 89 | dir ? dir : "" , sep, name, |
| 90 | ext ? "." : NULL, ext, NULL); |
| 91 | } |
| 92 | return path; |
| 93 | } |
| 94 | |
| 95 | /* make sure the parent directory of DIR exists (the argument itself |
| 96 | * is usually a file that is to be created) */ |
| 97 | gdk_return |
| 98 | GDKcreatedir(const char *dir) |
| 99 | { |
| 100 | char path[FILENAME_MAX]; |
| 101 | char *r; |
| 102 | DIR *dirp; |
| 103 | |
| 104 | IODEBUG fprintf(stderr, "#GDKcreatedir(%s)\n" , dir); |
| 105 | assert(!GDKinmemory()); |
| 106 | assert(MT_path_absolute(dir)); |
| 107 | if (strlen(dir) >= FILENAME_MAX) { |
| 108 | GDKerror("GDKcreatedir: directory name too long\n" ); |
| 109 | return GDK_FAIL; |
| 110 | } |
| 111 | strcpy(path, dir); /* we know this fits (see above) */ |
| 112 | /* skip initial /, if any */ |
| 113 | for (r = strchr(path + 1, DIR_SEP); r; r = strchr(r, DIR_SEP)) { |
| 114 | *r = 0; |
| 115 | if ( |
| 116 | #ifdef WIN32 |
| 117 | strlen(path) > 3 && |
| 118 | #endif |
| 119 | mkdir(path, MONETDB_DIRMODE) < 0) { |
| 120 | if (errno != EEXIST) { |
| 121 | GDKsyserror("GDKcreatedir: cannot create directory %s\n" , path); |
| 122 | IODEBUG fprintf(stderr, "#GDKcreatedir: mkdir(%s) failed\n" , path); |
| 123 | return GDK_FAIL; |
| 124 | } |
| 125 | if ((dirp = opendir(path)) == NULL) { |
| 126 | GDKsyserror("GDKcreatedir: %s not a directory\n" , path); |
| 127 | IODEBUG fprintf(stderr, "#GDKcreatedir: opendir(%s) failed\n" , path); |
| 128 | return GDK_FAIL; |
| 129 | } |
| 130 | /* it's a directory, we can continue */ |
| 131 | closedir(dirp); |
| 132 | } |
| 133 | *r++ = DIR_SEP; |
| 134 | } |
| 135 | return GDK_SUCCEED; |
| 136 | } |
| 137 | |
| 138 | /* remove the directory DIRNAME with its file contents; does not |
| 139 | * recurse into subdirectories */ |
| 140 | gdk_return |
| 141 | GDKremovedir(int farmid, const char *dirname) |
| 142 | { |
| 143 | str dirnamestr; |
| 144 | DIR *dirp; |
| 145 | char *path; |
| 146 | struct dirent *dent; |
| 147 | int ret; |
| 148 | |
| 149 | assert(!GDKinmemory()); |
| 150 | if ((dirnamestr = GDKfilepath(farmid, NULL, dirname, NULL)) == NULL) |
| 151 | return GDK_FAIL; |
| 152 | |
| 153 | IODEBUG fprintf(stderr, "#GDKremovedir(%s)\n" , dirnamestr); |
| 154 | |
| 155 | if ((dirp = opendir(dirnamestr)) == NULL) { |
| 156 | GDKfree(dirnamestr); |
| 157 | return GDK_SUCCEED; |
| 158 | } |
| 159 | while ((dent = readdir(dirp)) != NULL) { |
| 160 | if (dent->d_name[0] == '.' && |
| 161 | (dent->d_name[1] == 0 || |
| 162 | (dent->d_name[1] == '.' && dent->d_name[2] == 0))) { |
| 163 | /* skip . and .. */ |
| 164 | continue; |
| 165 | } |
| 166 | path = GDKfilepath(farmid, dirname, dent->d_name, NULL); |
| 167 | ret = remove(path); |
| 168 | IODEBUG fprintf(stderr, "#remove %s = %d\n" , path, ret); |
| 169 | GDKfree(path); |
| 170 | } |
| 171 | closedir(dirp); |
| 172 | ret = rmdir(dirnamestr); |
| 173 | if (ret != 0) |
| 174 | GDKsyserror("GDKremovedir: rmdir(%s) failed.\n" , dirnamestr); |
| 175 | IODEBUG fprintf(stderr, "#rmdir %s = %d\n" , dirnamestr, ret); |
| 176 | GDKfree(dirnamestr); |
| 177 | return ret ? GDK_FAIL : GDK_SUCCEED; |
| 178 | } |
| 179 | |
/* Extra flag bits for open().  Of these, only _FRDSEQ is referenced
 * in the code visible here: GDKfdlocate ORs it into the open flags
 * when _CYGNUS_H_ is defined and the file is not opened for mmap.
 * NOTE(review): _FUNBUF and _FWRTHR look unused in this part of the
 * file -- confirm against the rest of the code base before removing. */
#define _FUNBUF 0x040000
#define _FWRTHR 0x080000
#define _FRDSEQ 0x100000
| 183 | |
| 184 | /* open a file and return its file descriptor; the file is specified |
| 185 | * using farmid, name and extension; if opening for writing, we create |
| 186 | * the parent directory if necessary */ |
| 187 | int |
| 188 | GDKfdlocate(int farmid, const char *nme, const char *mode, const char *extension) |
| 189 | { |
| 190 | char *path = NULL; |
| 191 | int fd, flags = O_CLOEXEC; |
| 192 | |
| 193 | assert(!GDKinmemory()); |
| 194 | if (nme == NULL || *nme == 0) |
| 195 | return -1; |
| 196 | |
| 197 | assert(farmid != NOFARM || extension == NULL); |
| 198 | if (farmid != NOFARM) { |
| 199 | path = GDKfilepath(farmid, BATDIR, nme, extension); |
| 200 | if (path == NULL) |
| 201 | return -1; |
| 202 | nme = path; |
| 203 | } |
| 204 | |
| 205 | if (*mode == 'm') { /* file open for mmap? */ |
| 206 | mode++; |
| 207 | #ifdef _CYGNUS_H_ |
| 208 | } else { |
| 209 | flags |= _FRDSEQ; /* WIN32 CreateFile(FILE_FLAG_SEQUENTIAL_SCAN) */ |
| 210 | #endif |
| 211 | } |
| 212 | |
| 213 | if (strchr(mode, 'w')) { |
| 214 | flags |= O_WRONLY | O_CREAT; |
| 215 | } else if (!strchr(mode, '+')) { |
| 216 | flags |= O_RDONLY; |
| 217 | } else { |
| 218 | flags |= O_RDWR; |
| 219 | } |
| 220 | #ifdef WIN32 |
| 221 | flags |= strchr(mode, 'b') ? O_BINARY : O_TEXT; |
| 222 | #endif |
| 223 | fd = open(nme, flags, MONETDB_MODE); |
| 224 | if (fd < 0 && *mode == 'w') { |
| 225 | /* try to create the directory, in case that was the problem */ |
| 226 | if (GDKcreatedir(nme) == GDK_SUCCEED) { |
| 227 | fd = open(nme, flags, MONETDB_MODE); |
| 228 | if (fd < 0) |
| 229 | GDKsyserror("GDKfdlocate: cannot open file %s\n" , nme); |
| 230 | } |
| 231 | } |
| 232 | /* don't generate error if we can't open a file for reading */ |
| 233 | GDKfree(path); |
| 234 | return fd; |
| 235 | } |
| 236 | |
/* Like GDKfdlocate, but wrap the descriptor in a stdio stream.
 * Returns NULL if the file cannot be opened or if fdopen fails (in
 * the latter case an error is generated and the descriptor closed). */
FILE *
GDKfilelocate(int farmid, const char *nme, const char *mode, const char *extension)
{
	FILE *fp;
	int fd = GDKfdlocate(farmid, nme, mode, extension);

	if (fd < 0)
		return NULL;
	/* the leading 'm' (open for mmap) only means something to
	 * GDKfdlocate; hide it from fdopen */
	fp = fdopen(fd, *mode == 'm' ? mode + 1 : mode);
	if (fp != NULL)
		return fp;

	GDKsyserror("GDKfilelocate: cannot fdopen file\n");
	close(fd);
	return NULL;
}
| 255 | |
| 256 | FILE * |
| 257 | GDKfileopen(int farmid, const char *dir, const char *name, const char *extension, const char *mode) |
| 258 | { |
| 259 | char *path; |
| 260 | |
| 261 | /* if name is null, try to get one from dir (in case it was a path) */ |
| 262 | path = GDKfilepath(farmid, dir, name, extension); |
| 263 | |
| 264 | if (path != NULL) { |
| 265 | FILE *f; |
| 266 | IODEBUG fprintf(stderr, "#GDKfileopen(%s)\n" , path); |
| 267 | f = fopen(path, mode); |
| 268 | GDKfree(path); |
| 269 | return f; |
| 270 | } |
| 271 | return NULL; |
| 272 | } |
| 273 | |
| 274 | /* remove the file */ |
| 275 | gdk_return |
| 276 | GDKunlink(int farmid, const char *dir, const char *nme, const char *ext) |
| 277 | { |
| 278 | if (nme && *nme) { |
| 279 | char *path; |
| 280 | |
| 281 | path = GDKfilepath(farmid, dir, nme, ext); |
| 282 | if (path == NULL) |
| 283 | return GDK_FAIL; |
| 284 | /* if file already doesn't exist, we don't care */ |
| 285 | if (remove(path) != 0 && errno != ENOENT) { |
| 286 | GDKsyserror("GDKunlink(%s)\n" , path); |
| 287 | IODEBUG fprintf(stderr, "#remove %s = -1\n" , path); |
| 288 | GDKfree(path); |
| 289 | return GDK_FAIL; |
| 290 | } |
| 291 | GDKfree(path); |
| 292 | return GDK_SUCCEED; |
| 293 | } |
| 294 | return GDK_FAIL; |
| 295 | } |
| 296 | |
| 297 | /* |
| 298 | * A move routine is overloaded to deal with extensions. |
| 299 | */ |
| 300 | gdk_return |
| 301 | GDKmove(int farmid, const char *dir1, const char *nme1, const char *ext1, const char *dir2, const char *nme2, const char *ext2) |
| 302 | { |
| 303 | char *path1; |
| 304 | char *path2; |
| 305 | int ret, t0 = 0; |
| 306 | |
| 307 | IODEBUG t0 = GDKms(); |
| 308 | |
| 309 | if ((nme1 == NULL) || (*nme1 == 0)) { |
| 310 | GDKerror("GDKmove: no file specified\n" ); |
| 311 | return GDK_FAIL; |
| 312 | } |
| 313 | path1 = GDKfilepath(farmid, dir1, nme1, ext1); |
| 314 | path2 = GDKfilepath(farmid, dir2, nme2, ext2); |
| 315 | if (path1 && path2) { |
| 316 | ret = rename(path1, path2); |
| 317 | if (ret < 0) |
| 318 | GDKsyserror("GDKmove: cannot rename %s to %s\n" , path1, path2); |
| 319 | |
| 320 | IODEBUG fprintf(stderr, "#move %s %s = %d (%dms)\n" , path1, path2, ret, GDKms() - t0); |
| 321 | } else { |
| 322 | ret = -1; |
| 323 | } |
| 324 | GDKfree(path1); |
| 325 | GDKfree(path2); |
| 326 | return ret < 0 ? GDK_FAIL : GDK_SUCCEED; |
| 327 | } |
| 328 | |
| 329 | gdk_return |
| 330 | GDKextendf(int fd, size_t size, const char *fn) |
| 331 | { |
| 332 | struct stat stb; |
| 333 | int rt = 0; |
| 334 | int t0 = 0; |
| 335 | |
| 336 | assert(!GDKinmemory()); |
| 337 | #ifdef STATIC_CODE_ANALYSIS |
| 338 | if (fd < 0) /* in real life, if fd < 0, fstat will fail */ |
| 339 | return GDK_FAIL; |
| 340 | #endif |
| 341 | if (fstat(fd, &stb) < 0) { |
| 342 | /* shouldn't happen */ |
| 343 | GDKsyserror("GDKextendf: fstat unexpectedly failed\n" ); |
| 344 | return GDK_FAIL; |
| 345 | } |
| 346 | /* if necessary, extend the underlying file */ |
| 347 | IODEBUG t0 = GDKms(); |
| 348 | if (stb.st_size < (off_t) size) { |
| 349 | #ifdef HAVE_FALLOCATE |
| 350 | if ((rt = fallocate(fd, 0, stb.st_size, (off_t) size - stb.st_size)) < 0 && |
| 351 | errno == EOPNOTSUPP) |
| 352 | /* on Linux, posix_fallocate uses a slow |
| 353 | * method to allocate blocks if the underlying |
| 354 | * file system doesn't support the operation, |
| 355 | * so use fallocate instead and just resize |
| 356 | * the file if it fails */ |
| 357 | #else |
| 358 | #ifdef HAVE_POSIX_FALLOCATE |
| 359 | /* posix_fallocate returns error number on failure, |
| 360 | * not -1 :-( */ |
| 361 | if ((rt = posix_fallocate(fd, stb.st_size, (off_t) size - stb.st_size)) == EINVAL) |
| 362 | /* on Solaris/OpenIndiana, this may mean that |
| 363 | * the underlying file system doesn't support |
| 364 | * the operation, so just resize the file */ |
| 365 | #endif |
| 366 | #endif |
| 367 | /* we get here when (posix_)fallocate fails because it |
| 368 | * is not supported on the file system, or if neither |
| 369 | * function exists */ |
| 370 | rt = ftruncate(fd, (off_t) size); |
| 371 | if (rt != 0) { |
| 372 | /* extending failed, try to reduce file size |
| 373 | * back to original */ |
| 374 | int err = errno; |
| 375 | if (ftruncate(fd, stb.st_size)) |
| 376 | perror("ftruncate" ); |
| 377 | errno = err; /* restore for error message */ |
| 378 | GDKsyserror("GDKextendf: could not extend file\n" ); |
| 379 | } |
| 380 | } |
| 381 | IODEBUG fprintf(stderr, "#GDKextend %s %zu -> %zu %dms%s\n" , |
| 382 | fn, (size_t) stb.st_size, size, |
| 383 | GDKms() - t0, rt != 0 ? " (failed)" : "" ); |
| 384 | /* posix_fallocate returns != 0 on failure, fallocate and |
| 385 | * ftruncate return -1 on failure, but all three return 0 on |
| 386 | * success */ |
| 387 | return rt != 0 ? GDK_FAIL : GDK_SUCCEED; |
| 388 | } |
| 389 | |
| 390 | gdk_return |
| 391 | GDKextend(const char *fn, size_t size) |
| 392 | { |
| 393 | int fd, flags = O_RDWR; |
| 394 | gdk_return rt = GDK_FAIL; |
| 395 | |
| 396 | assert(!GDKinmemory()); |
| 397 | #ifdef O_BINARY |
| 398 | /* On Windows, open() fails if the file is bigger than 2^32 |
| 399 | * bytes without O_BINARY. */ |
| 400 | flags |= O_BINARY; |
| 401 | #endif |
| 402 | if ((fd = open(fn, flags | O_CLOEXEC)) >= 0) { |
| 403 | rt = GDKextendf(fd, size, fn); |
| 404 | close(fd); |
| 405 | } else { |
| 406 | GDKsyserror("GDKextend: cannot open file %s\n" , fn); |
| 407 | } |
| 408 | return rt; |
| 409 | } |
| 410 | |
| 411 | /* |
| 412 | * @+ Save and load. |
| 413 | * The BAT is saved on disk in several files. The extension DESC |
| 414 | * denotes the descriptor, BUNs the bun heap, and HHEAP and THEAP the |
| 415 | * other heaps. The storage mechanism off a file can be memory mapped |
| 416 | * (STORE_MMAP) or malloced (STORE_MEM). |
| 417 | * |
| 418 | * These modes indicates the disk-layout and the intended mapping. |
| 419 | * The primary concern here is to handle STORE_MMAP and STORE_MEM. |
| 420 | */ |
| 421 | gdk_return |
| 422 | GDKsave(int farmid, const char *nme, const char *ext, void *buf, size_t size, storage_t mode, bool dosync) |
| 423 | { |
| 424 | int err = 0; |
| 425 | |
| 426 | IODEBUG fprintf(stderr, "#GDKsave: name=%s, ext=%s, mode %d, dosync=%d\n" , nme, ext ? ext : "" , (int) mode, dosync); |
| 427 | |
| 428 | assert(!GDKinmemory()); |
| 429 | if (mode == STORE_MMAP) { |
| 430 | if (dosync && size && !(GDKdebug & NOSYNCMASK) && MT_msync(buf, size) < 0) |
| 431 | err = -1; |
| 432 | if (err) |
| 433 | GDKsyserror("GDKsave: error on: name=%s, ext=%s, " |
| 434 | "mode=%d\n" , nme, ext ? ext : "" , |
| 435 | (int) mode); |
| 436 | IODEBUG fprintf(stderr, |
| 437 | "#MT_msync(buf %p, size %zu" |
| 438 | ") = %d\n" , |
| 439 | buf, size, err); |
| 440 | } else { |
| 441 | int fd; |
| 442 | |
| 443 | if ((fd = GDKfdlocate(farmid, nme, "wb" , ext)) >= 0) { |
| 444 | /* write() on 64-bits Redhat for IA64 returns |
| 445 | * 32-bits signed result (= OS BUG)! write() |
| 446 | * on Windows only takes unsigned int as |
| 447 | * size */ |
| 448 | while (size > 0) { |
| 449 | /* circumvent problems by writing huge |
| 450 | * buffers in chunks <= 1GiB */ |
| 451 | ssize_t ret; |
| 452 | |
| 453 | ret = write(fd, buf, |
| 454 | (unsigned) MIN(1 << 30, size)); |
| 455 | if (ret < 0) { |
| 456 | err = -1; |
| 457 | GDKsyserror("GDKsave: error %zd" |
| 458 | " on: name=%s, ext=%s, " |
| 459 | "mode=%d\n" , ret, nme, |
| 460 | ext ? ext : "" , (int) mode); |
| 461 | break; |
| 462 | } |
| 463 | size -= ret; |
| 464 | buf = (void *) ((char *) buf + ret); |
| 465 | IODEBUG fprintf(stderr, |
| 466 | "#write(fd %d, buf %p" |
| 467 | ", size %u) = %zd\n" , |
| 468 | fd, buf, |
| 469 | (unsigned) MIN(1 << 30, size), |
| 470 | ret); |
| 471 | } |
| 472 | if (dosync && !(GDKdebug & NOSYNCMASK) |
| 473 | #if defined(NATIVE_WIN32) |
| 474 | && _commit(fd) < 0 |
| 475 | #elif defined(HAVE_FDATASYNC) |
| 476 | && fdatasync(fd) < 0 |
| 477 | #elif defined(HAVE_FSYNC) |
| 478 | && fsync(fd) < 0 |
| 479 | #endif |
| 480 | ) { |
| 481 | GDKsyserror("GDKsave: error on: name=%s, " |
| 482 | "ext=%s, mode=%d\n" , nme, |
| 483 | ext ? ext : "" , (int) mode); |
| 484 | err = -1; |
| 485 | } |
| 486 | err |= close(fd); |
| 487 | if (err && GDKunlink(farmid, BATDIR, nme, ext) != GDK_SUCCEED) { |
| 488 | /* do not tolerate corrupt heap images |
| 489 | * (BBPrecover on restart will kill |
| 490 | * them) */ |
| 491 | GDKerror("GDKsave: could not remove: name=%s, " |
| 492 | "ext=%s, mode %d\n" , nme, |
| 493 | ext ? ext : "" , (int) mode); |
| 494 | return GDK_FAIL; |
| 495 | } |
| 496 | } else { |
| 497 | err = -1; |
| 498 | GDKerror("GDKsave: failed name=%s, ext=%s, mode %d\n" , |
| 499 | nme, ext ? ext : "" , (int) mode); |
| 500 | } |
| 501 | } |
| 502 | return err ? GDK_FAIL : GDK_SUCCEED; |
| 503 | } |
| 504 | |
| 505 | /* |
| 506 | * Space for the load is directly allocated and the heaps are mapped. |
| 507 | * Further initialization of the atom heaps require a separate action |
| 508 | * defined in their implementation. |
| 509 | * |
| 510 | * size -- how much to read |
| 511 | * *maxsize -- (in/out) how much to allocate / how much was allocated |
| 512 | */ |
| 513 | char * |
| 514 | GDKload(int farmid, const char *nme, const char *ext, size_t size, size_t *maxsize, storage_t mode) |
| 515 | { |
| 516 | char *ret = NULL; |
| 517 | |
| 518 | assert(!GDKinmemory()); |
| 519 | assert(size <= *maxsize); |
| 520 | assert(farmid != NOFARM || ext == NULL); |
| 521 | IODEBUG { |
| 522 | fprintf(stderr, "#GDKload: name=%s, ext=%s, mode %d\n" , nme, ext ? ext : "" , (int) mode); |
| 523 | } |
| 524 | if (mode == STORE_MEM) { |
| 525 | int fd = GDKfdlocate(farmid, nme, "rb" , ext); |
| 526 | |
| 527 | if (fd >= 0) { |
| 528 | char *dst = ret = GDKmalloc(*maxsize); |
| 529 | ssize_t n_expected, n = 0; |
| 530 | |
| 531 | if (ret) { |
| 532 | /* read in chunks, some OSs do not |
| 533 | * give you all at once and Windows |
| 534 | * only accepts int */ |
| 535 | for (n_expected = (ssize_t) size; n_expected > 0; n_expected -= n) { |
| 536 | n = read(fd, dst, (unsigned) MIN(1 << 30, n_expected)); |
| 537 | if (n < 0) |
| 538 | GDKsyserror("GDKload: cannot read: name=%s, ext=%s, %zu bytes missing.\n" , nme, ext ? ext : "" , (size_t) n_expected); |
| 539 | #ifndef STATIC_CODE_ANALYSIS |
| 540 | /* Coverity doesn't seem to |
| 541 | * recognize that we're just |
| 542 | * printing the value of ptr, |
| 543 | * not its contents */ |
| 544 | IODEBUG fprintf(stderr, "#read(dst %p, n_expected %zd, fd %d) = %zd\n" , (void *)dst, n_expected, fd, n); |
| 545 | #endif |
| 546 | |
| 547 | if (n <= 0) |
| 548 | break; |
| 549 | dst += n; |
| 550 | } |
| 551 | if (n_expected > 0) { |
| 552 | /* we couldn't read all, error |
| 553 | * already generated */ |
| 554 | GDKfree(ret); |
| 555 | ret = NULL; |
| 556 | } |
| 557 | #ifndef NDEBUG |
| 558 | /* just to make valgrind happy, we |
| 559 | * initialize the whole thing */ |
| 560 | if (ret && *maxsize > size) |
| 561 | memset(ret + size, 0, *maxsize - size); |
| 562 | #endif |
| 563 | } |
| 564 | close(fd); |
| 565 | } else { |
| 566 | GDKerror("GDKload: cannot open: name=%s, ext=%s\n" , nme, ext ? ext : "" ); |
| 567 | } |
| 568 | } else { |
| 569 | char *path = NULL; |
| 570 | |
| 571 | /* round up to multiple of GDK_mmap_pagesize with a |
| 572 | * minimum of one */ |
| 573 | size = (*maxsize + GDK_mmap_pagesize - 1) & ~(GDK_mmap_pagesize - 1); |
| 574 | if (size == 0) |
| 575 | size = GDK_mmap_pagesize; |
| 576 | if (farmid != NOFARM) { |
| 577 | path = GDKfilepath(farmid, BATDIR, nme, ext); |
| 578 | nme = path; |
| 579 | } |
| 580 | if (nme != NULL && GDKextend(nme, size) == GDK_SUCCEED) { |
| 581 | int mod = MMAP_READ | MMAP_WRITE | MMAP_SEQUENTIAL; |
| 582 | |
| 583 | if (mode == STORE_PRIV) |
| 584 | mod |= MMAP_COPY; |
| 585 | else |
| 586 | mod |= MMAP_SYNC; |
| 587 | ret = GDKmmap(nme, mod, size); |
| 588 | if (ret != NULL) { |
| 589 | /* success: update allocated size */ |
| 590 | *maxsize = size; |
| 591 | } |
| 592 | IODEBUG fprintf(stderr, "#mmap(NULL, 0, maxsize %zu, mod %d, path %s, 0) = %p\n" , size, mod, nme, (void *)ret); |
| 593 | } |
| 594 | GDKfree(path); |
| 595 | } |
| 596 | return ret; |
| 597 | } |
| 598 | |
| 599 | /* |
| 600 | * @+ BAT disk storage |
| 601 | * |
| 602 | * Between sessions the BATs comprising the database are saved on |
| 603 | * disk. To simplify code, we assume a UNIX directory called its |
| 604 | * physical @%home@ where they are to be located. The subdirectories |
| 605 | * BAT and PRG contain what its name says. |
| 606 | * |
| 607 | * A BAT created by @%COLnew@ is considered temporary until one calls |
| 608 | * the routine @%BATsave@. This routine reserves disk space and checks |
| 609 | * for name clashes. |
| 610 | * |
| 611 | * Saving and restoring BATs is left to the upper layers. The library |
| 612 | * merely copies the data into place. Failure to read or write the |
| 613 | * BAT results in a NULL, otherwise it returns the BAT pointer. |
| 614 | */ |
| 615 | static BAT * |
| 616 | DESCload(int i) |
| 617 | { |
| 618 | const char *s, *nme = BBP_physical(i); |
| 619 | BAT *b = NULL; |
| 620 | int tt; |
| 621 | |
| 622 | IODEBUG { |
| 623 | fprintf(stderr, "#DESCload %s\n" , nme ? nme : "<noname>" ); |
| 624 | } |
| 625 | b = BBP_desc(i); |
| 626 | |
| 627 | if (b == NULL) |
| 628 | return 0; |
| 629 | |
| 630 | tt = b->ttype; |
| 631 | if ((tt < 0 && (tt = ATOMindex(s = ATOMunknown_name(tt))) < 0)) { |
| 632 | GDKerror("DESCload: atom '%s' unknown, in BAT '%s'.\n" , s, nme); |
| 633 | return NULL; |
| 634 | } |
| 635 | b->ttype = tt; |
| 636 | |
| 637 | /* reconstruct mode from BBP status (BATmode doesn't flush |
| 638 | * descriptor, so loaded mode may be stale) */ |
| 639 | b->batTransient = (BBP_status(b->batCacheid) & BBPPERSISTENT) == 0; |
| 640 | b->batCopiedtodisk = true; |
| 641 | DESCclean(b); |
| 642 | return b; |
| 643 | } |
| 644 | |
| 645 | void |
| 646 | DESCclean(BAT *b) |
| 647 | { |
| 648 | b->batDirtyflushed = DELTAdirty(b); |
| 649 | b->batDirtydesc = false; |
| 650 | b->theap.dirty = false; |
| 651 | if (b->tvheap) |
| 652 | b->tvheap->dirty = false; |
| 653 | } |
| 654 | |
| 655 | /* spawning the background msync should be done carefully |
| 656 | * because there is a (small) chance that the BAT has been |
| 657 | * deleted by the time you issue the msync. |
| 658 | * This leaves you with possibly deadbeef BAT descriptors. |
| 659 | */ |
| 660 | |
| 661 | /* #define DISABLE_MSYNC */ |
| 662 | #define MSYNC_BACKGROUND |
| 663 | |
| 664 | #ifndef DISABLE_MSYNC |
| 665 | #ifndef MS_ASYNC |
| 666 | struct msync { |
| 667 | bat id; |
| 668 | Heap *h; |
| 669 | }; |
| 670 | |
| 671 | static void |
| 672 | BATmsyncImplementation(void *arg) |
| 673 | { |
| 674 | Heap *h = ((struct msync *) arg)->h; |
| 675 | |
| 676 | (void) MT_msync(h->base, h->size); |
| 677 | BBPunfix(((struct msync *) arg)->id); |
| 678 | GDKfree(arg); |
| 679 | } |
| 680 | #endif |
| 681 | #endif |
| 682 | |
/* Ask for the memory-mapped heaps of BAT B to be written to disk
 * without waiting for the result.
 *
 * Nothing is done in in-memory mode, for views, when syncing is
 * disabled through NOSYNCMASK, or when one of the heaps is not in
 * farm 0.  When DISABLE_MSYNC is defined the function is a no-op.
 *
 * If the platform defines MS_ASYNC, msync is called directly with
 * that flag on the used part (free bytes) of each STORE_MMAP heap.
 * Otherwise each STORE_MMAP heap is handed to BATmsyncImplementation,
 * in a detached thread when MSYNC_BACKGROUND is defined, else
 * synchronously.  Failures (allocation, thread creation, msync) are
 * silently ignored. */
void
BATmsync(BAT *b)
{
	/* we don't sync views or if we're told not to */
	if (GDKinmemory() || isVIEW(b) || (GDKdebug & NOSYNCMASK))
		return;
	/* we don't sync transients (recognized here by a heap that is
	 * not in farm 0) */
	if (b->theap.farmid != 0 ||
	    (b->tvheap != NULL && b->tvheap->farmid != 0))
		return;
#ifndef DISABLE_MSYNC
#ifdef MS_ASYNC
	/* the system can schedule the write-back itself */
	if (b->theap.storage == STORE_MMAP)
		(void) msync(b->theap.base, b->theap.free, MS_ASYNC);
	if (b->tvheap && b->tvheap->storage == STORE_MMAP)
		(void) msync(b->tvheap->base, b->tvheap->free, MS_ASYNC);
#else
	{
#ifdef MSYNC_BACKGROUND
		MT_Id tid;
#endif
		struct msync *arg;

		assert(!b->batTransient);
		/* tail heap */
		if (b->theap.storage == STORE_MMAP &&
		    (arg = GDKmalloc(sizeof(*arg))) != NULL) {
			arg->id = b->batCacheid;
			arg->h = &b->theap;
			/* take a reference on behalf of the sync; it
			 * is released by BATmsyncImplementation */
			BBPfix(b->batCacheid);
#ifdef MSYNC_BACKGROUND
			char name[16];
			snprintf(name, sizeof(name), "msync%d", b->batCacheid);
			if (MT_create_thread(&tid, BATmsyncImplementation, arg,
					     MT_THR_DETACHED, name) < 0) {
				/* don't bother if we can't create a thread */
				BBPunfix(b->batCacheid);
				GDKfree(arg);
			}
#else
			BATmsyncImplementation(arg);
#endif
		}

		/* variable-sized heap, if any: same procedure */
		if (b->tvheap && b->tvheap->storage == STORE_MMAP &&
		    (arg = GDKmalloc(sizeof(*arg))) != NULL) {
			arg->id = b->batCacheid;
			arg->h = b->tvheap;
			BBPfix(b->batCacheid);
#ifdef MSYNC_BACKGROUND
			char name[16];
			snprintf(name, sizeof(name), "msync%d", b->batCacheid);
			if (MT_create_thread(&tid, BATmsyncImplementation, arg,
					     MT_THR_DETACHED, name) < 0) {
				/* don't bother if we can't create a thread */
				BBPunfix(b->batCacheid);
				GDKfree(arg);
			}
#else
			BATmsyncImplementation(arg);
#endif
		}
	}
#endif
#else
	(void) b;
#endif /* DISABLE_MSYNC */
}
| 750 | |
| 751 | gdk_return |
| 752 | BATsave(BAT *bd) |
| 753 | { |
| 754 | gdk_return err = GDK_SUCCEED; |
| 755 | const char *nme; |
| 756 | BAT bs; |
| 757 | Heap vhs; |
| 758 | BAT *b = bd; |
| 759 | |
| 760 | assert(!GDKinmemory()); |
| 761 | BATcheck(b, "BATsave" , GDK_FAIL); |
| 762 | |
| 763 | assert(b->batCacheid > 0); |
| 764 | /* views cannot be saved, but make an exception for |
| 765 | * force-remapped views */ |
| 766 | if (isVIEW(b) && |
| 767 | !(b->theap.copied && b->theap.storage == STORE_MMAP)) { |
| 768 | GDKerror("BATsave: %s is a view on %s; cannot be saved\n" , BATgetId(b), BBPname(VIEWtparent(b))); |
| 769 | return GDK_FAIL; |
| 770 | } |
| 771 | if (!BATdirty(b)) { |
| 772 | return GDK_SUCCEED; |
| 773 | } |
| 774 | |
| 775 | /* copy the descriptor to a local variable in order to let our |
| 776 | * messing in the BAT descriptor not affect other threads that |
| 777 | * only read it. */ |
| 778 | bs = *b; |
| 779 | b = &bs; |
| 780 | |
| 781 | if (b->tvheap) { |
| 782 | vhs = *bd->tvheap; |
| 783 | b->tvheap = &vhs; |
| 784 | } |
| 785 | |
| 786 | /* start saving data */ |
| 787 | nme = BBP_physical(b->batCacheid); |
| 788 | if (!b->batCopiedtodisk || b->batDirtydesc || b->theap.dirty) |
| 789 | if (err == GDK_SUCCEED && b->ttype) |
| 790 | err = HEAPsave(&b->theap, nme, "tail" ); |
| 791 | if (b->tvheap |
| 792 | && (!b->batCopiedtodisk || b->batDirtydesc || b->tvheap->dirty) |
| 793 | && b->ttype |
| 794 | && b->tvarsized |
| 795 | && err == GDK_SUCCEED) |
| 796 | err = HEAPsave(b->tvheap, nme, "theap" ); |
| 797 | |
| 798 | if (err == GDK_SUCCEED) { |
| 799 | bd->batCopiedtodisk = true; |
| 800 | DESCclean(bd); |
| 801 | return GDK_SUCCEED; |
| 802 | } |
| 803 | return err; |
| 804 | } |
| 805 | |
| 806 | |
| 807 | /* |
| 808 | * TODO: move to gdk_bbp.c |
| 809 | */ |
| 810 | BAT * |
| 811 | BATload_intern(bat bid, bool lock) |
| 812 | { |
| 813 | const char *nme; |
| 814 | BAT *b; |
| 815 | |
| 816 | assert(!GDKinmemory()); |
| 817 | assert(bid > 0); |
| 818 | |
| 819 | nme = BBP_physical(bid); |
| 820 | b = DESCload(bid); |
| 821 | |
| 822 | if (b == NULL) { |
| 823 | return NULL; |
| 824 | } |
| 825 | |
| 826 | /* LOAD bun heap */ |
| 827 | if (b->ttype != TYPE_void) { |
| 828 | if (HEAPload(&b->theap, nme, "tail" , b->batRestricted == BAT_READ) != GDK_SUCCEED) { |
| 829 | HEAPfree(&b->theap, false); |
| 830 | return NULL; |
| 831 | } |
| 832 | assert(b->theap.size >> b->tshift <= BUN_MAX); |
| 833 | b->batCapacity = (BUN) (b->theap.size >> b->tshift); |
| 834 | } else { |
| 835 | b->theap.base = NULL; |
| 836 | } |
| 837 | |
| 838 | /* LOAD tail heap */ |
| 839 | if (ATOMvarsized(b->ttype)) { |
| 840 | if (HEAPload(b->tvheap, nme, "theap" , b->batRestricted == BAT_READ) != GDK_SUCCEED) { |
| 841 | HEAPfree(&b->theap, false); |
| 842 | HEAPfree(b->tvheap, false); |
| 843 | return NULL; |
| 844 | } |
| 845 | if (ATOMstorage(b->ttype) == TYPE_str) { |
| 846 | strCleanHash(b->tvheap, false); /* ensure consistency */ |
| 847 | } else { |
| 848 | HEAP_recover(b->tvheap, (const var_t *) Tloc(b, 0), |
| 849 | BATcount(b)); |
| 850 | } |
| 851 | } |
| 852 | |
| 853 | /* initialize descriptor */ |
| 854 | b->batDirtydesc = false; |
| 855 | b->theap.parentid = 0; |
| 856 | |
| 857 | /* load succeeded; register it in BBP */ |
| 858 | if (BBPcacheit(b, lock) != GDK_SUCCEED) { |
| 859 | HEAPfree(&b->theap, false); |
| 860 | if (b->tvheap) |
| 861 | HEAPfree(b->tvheap, false); |
| 862 | return NULL; |
| 863 | } |
| 864 | return b; |
| 865 | } |
| 866 | |
| 867 | /* |
| 868 | * @- BATdelete |
| 869 | * The new behavior is to let the routine produce warnings but always |
| 870 | * succeed. rationale: on a delete, we must get rid of *all* the |
| 871 | * files. We do not have to care about preserving them or be too much |
| 872 | * concerned if a file that had to be deleted was not found (end |
| 873 | * result is still that it does not exist). The past behavior to |
| 874 | * delete some files and then fail was erroneous. The BAT would |
| 875 | * continue to exist with an incorrect disk status, causing havoc |
| 876 | * later on. |
| 877 | * |
| 878 | * NT forces us to close all files before deleting them; in case of |
| 879 | * memory mapped files this means that we have to unload the BATs |
| 880 | * before deleting. This is enforced now. |
| 881 | */ |
| 882 | void |
| 883 | BATdelete(BAT *b) |
| 884 | { |
| 885 | bat bid = b->batCacheid; |
| 886 | const char *o = BBP_physical(bid); |
| 887 | BAT *loaded = BBP_cache(bid); |
| 888 | |
| 889 | assert(bid > 0); |
| 890 | if (loaded) { |
| 891 | b = loaded; |
| 892 | HASHdestroy(b); |
| 893 | IMPSdestroy(b); |
| 894 | OIDXdestroy(b); |
| 895 | } |
| 896 | if (b->batCopiedtodisk || (b->theap.storage != STORE_MEM)) { |
| 897 | if (b->ttype != TYPE_void && |
| 898 | HEAPdelete(&b->theap, o, "tail" ) != GDK_SUCCEED && |
| 899 | b->batCopiedtodisk) |
| 900 | IODEBUG fprintf(stderr, "#BATdelete(%s): bun heap\n" , BATgetId(b)); |
| 901 | } else if (b->theap.base) { |
| 902 | HEAPfree(&b->theap, true); |
| 903 | } |
| 904 | if (b->tvheap) { |
| 905 | assert(b->tvheap->parentid == bid); |
| 906 | if (b->batCopiedtodisk || (b->tvheap->storage != STORE_MEM)) { |
| 907 | if (HEAPdelete(b->tvheap, o, "theap" ) != GDK_SUCCEED && |
| 908 | b->batCopiedtodisk) |
| 909 | IODEBUG fprintf(stderr, "#BATdelete(%s): tail heap\n" , BATgetId(b)); |
| 910 | } else { |
| 911 | HEAPfree(b->tvheap, true); |
| 912 | } |
| 913 | } |
| 914 | b->batCopiedtodisk = false; |
| 915 | } |
| 916 | |
| 917 | /* |
| 918 | * BAT specific printing |
| 919 | */ |
| 920 | |
| 921 | gdk_return |
| 922 | BATprintcolumns(stream *s, int argc, BAT *argv[]) |
| 923 | { |
| 924 | int i; |
| 925 | BUN n, cnt; |
| 926 | struct colinfo { |
| 927 | ssize_t (*s) (str *, size_t *, const void *, bool); |
| 928 | BATiter i; |
| 929 | } *colinfo; |
| 930 | char *buf; |
| 931 | size_t buflen = 0; |
| 932 | ssize_t len; |
| 933 | |
| 934 | /* error checking */ |
| 935 | for (i = 0; i < argc; i++) { |
| 936 | if (argv[i] == NULL) { |
| 937 | GDKerror("Columns missing\n" ); |
| 938 | return GDK_FAIL; |
| 939 | } |
| 940 | if (BATcount(argv[0]) != BATcount(argv[i])) { |
| 941 | GDKerror("Columns must be the same size\n" ); |
| 942 | return GDK_FAIL; |
| 943 | } |
| 944 | } |
| 945 | |
| 946 | if ((colinfo = GDKmalloc(argc * sizeof(*colinfo))) == NULL) { |
| 947 | GDKerror("Cannot allocate memory\n" ); |
| 948 | return GDK_FAIL; |
| 949 | } |
| 950 | |
| 951 | for (i = 0; i < argc; i++) { |
| 952 | colinfo[i].i = bat_iterator(argv[i]); |
| 953 | colinfo[i].s = BATatoms[argv[i]->ttype].atomToStr; |
| 954 | } |
| 955 | |
| 956 | mnstr_write(s, "#--------------------------#\n" , 1, 29); |
| 957 | mnstr_write(s, "# " , 1, 2); |
| 958 | for (i = 0; i < argc; i++) { |
| 959 | if (i > 0) |
| 960 | mnstr_write(s, "\t" , 1, 1); |
| 961 | buf = argv[i]->tident; |
| 962 | mnstr_write(s, buf, 1, strlen(buf)); |
| 963 | } |
| 964 | mnstr_write(s, " # name\n" , 1, 9); |
| 965 | mnstr_write(s, "# " , 1, 2); |
| 966 | for (i = 0; i < argc; i++) { |
| 967 | if (i > 0) |
| 968 | mnstr_write(s, "\t" , 1, 1); |
| 969 | buf = ATOMname(argv[i]->ttype); |
| 970 | mnstr_write(s, buf, 1, strlen(buf)); |
| 971 | } |
| 972 | mnstr_write(s, " # type\n" , 1, 9); |
| 973 | mnstr_write(s, "#--------------------------#\n" , 1, 29); |
| 974 | buf = NULL; |
| 975 | |
| 976 | for (n = 0, cnt = BATcount(argv[0]); n < cnt; n++) { |
| 977 | mnstr_write(s, "[ " , 1, 2); |
| 978 | for (i = 0; i < argc; i++) { |
| 979 | len = colinfo[i].s(&buf, &buflen, BUNtail(colinfo[i].i, n), true); |
| 980 | if (len < 0) { |
| 981 | GDKfree(buf); |
| 982 | GDKfree(colinfo); |
| 983 | return GDK_FAIL; |
| 984 | } |
| 985 | if (i > 0) |
| 986 | mnstr_write(s, ",\t" , 1, 2); |
| 987 | mnstr_write(s, buf, 1, len); |
| 988 | } |
| 989 | mnstr_write(s, " ]\n" , 1, 4); |
| 990 | } |
| 991 | |
| 992 | GDKfree(buf); |
| 993 | GDKfree(colinfo); |
| 994 | |
| 995 | return GDK_SUCCEED; |
| 996 | } |
| 997 | |
| 998 | gdk_return |
| 999 | BATprint(stream *fdout, BAT *b) |
| 1000 | { |
| 1001 | BAT *argv[2]; |
| 1002 | gdk_return ret = GDK_FAIL; |
| 1003 | |
| 1004 | argv[0] = BATdense(b->hseqbase, b->hseqbase, BATcount(b)); |
| 1005 | argv[1] = b; |
| 1006 | if (argv[0] && argv[1]) { |
| 1007 | ret = BATroles(argv[0], "h" ); |
| 1008 | if (ret == GDK_SUCCEED) |
| 1009 | ret = BATprintcolumns(fdout, 2, argv); |
| 1010 | } |
| 1011 | if (argv[0]) |
| 1012 | BBPunfix(argv[0]->batCacheid); |
| 1013 | return ret; |
| 1014 | } |
| 1015 | |