| 1 | /*------------------------------------------------------------------------- |
| 2 | * |
| 3 | * sysv_shmem.c |
| 4 | * Implement shared memory using SysV facilities |
| 5 | * |
| 6 | * These routines used to be a fairly thin layer on top of SysV shared |
| 7 | * memory functionality. With the addition of anonymous-shmem logic, |
| 8 | * they're a bit fatter now. We still require a SysV shmem block to |
| 9 | * exist, though, because mmap'd shmem provides no way to find out how |
| 10 | * many processes are attached, which we need for interlocking purposes. |
| 11 | * |
| 12 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
| 13 | * Portions Copyright (c) 1994, Regents of the University of California |
| 14 | * |
| 15 | * IDENTIFICATION |
| 16 | * src/backend/port/sysv_shmem.c |
| 17 | * |
| 18 | *------------------------------------------------------------------------- |
| 19 | */ |
| 20 | #include "postgres.h" |
| 21 | |
| 22 | #include <signal.h> |
| 23 | #include <unistd.h> |
| 24 | #include <sys/file.h> |
| 25 | #include <sys/mman.h> |
| 26 | #include <sys/stat.h> |
| 27 | #ifdef HAVE_SYS_IPC_H |
| 28 | #include <sys/ipc.h> |
| 29 | #endif |
| 30 | #ifdef HAVE_SYS_SHM_H |
| 31 | #include <sys/shm.h> |
| 32 | #endif |
| 33 | |
| 34 | #include "miscadmin.h" |
| 35 | #include "portability/mem.h" |
| 36 | #include "storage/dsm.h" |
| 37 | #include "storage/fd.h" |
| 38 | #include "storage/ipc.h" |
| 39 | #include "storage/pg_shmem.h" |
| 40 | #include "utils/guc.h" |
| 41 | #include "utils/pidfile.h" |
| 42 | |
| 43 | |
| 44 | /* |
| 45 | * As of PostgreSQL 9.3, we normally allocate only a very small amount of |
| 46 | * System V shared memory, and only for the purposes of providing an |
| 47 | * interlock to protect the data directory. The real shared memory block |
| 48 | * is allocated using mmap(). This works around the problem that many |
| 49 | * systems have very low limits on the amount of System V shared memory |
| 50 | * that can be allocated. Even a limit of a few megabytes will be enough |
| 51 | * to run many copies of PostgreSQL without needing to adjust system settings. |
| 52 | * |
| 53 | * We assume that no one will attempt to run PostgreSQL 9.3 or later on |
| 54 | * systems that are ancient enough that anonymous shared memory is not |
| 55 | * supported, such as pre-2.4 versions of Linux. If that turns out to be |
| 56 | * false, we might need to add compile and/or run-time tests here and do this |
| 57 | * only if the running kernel supports it. |
| 58 | * |
| 59 | * However, we must always disable this logic in the EXEC_BACKEND case, and |
| 60 | * fall back to the old method of allocating the entire segment using System V |
| 61 | * shared memory, because there's no way to attach an anonymous mmap'd segment |
| 62 | * to a process after exec(). Since EXEC_BACKEND is intended only for |
| 63 | * developer use, this shouldn't be a big problem. Because of this, we do |
| 64 | * not worry about supporting anonymous shmem in the EXEC_BACKEND cases below. |
| 65 | * |
| 66 | * As of PostgreSQL 12, we regained the ability to use a large System V shared |
| 67 | * memory region even in non-EXEC_BACKEND builds, if shared_memory_type is set |
| 68 | * to sysv (though this is not the default). |
| 69 | */ |
| 70 | |
| 71 | |
/*
 * Local names for the two SysV identifiers this file deals in: the key that
 * is handed to shmget(2), and the segment ID that shmget(2) hands back.
 */
typedef key_t IpcMemoryKey;		/* shared memory key passed to shmget(2) */
typedef int IpcMemoryId;		/* shared memory ID returned by shmget(2) */
| 74 | |
/*
 * How does a given IpcMemoryId relate to this PostgreSQL process?
 *
 * This is the result type of PGSharedMemoryAttach(), which classifies a
 * segment ID into exactly one of the states below.
 *
 * One could recycle unattached segments of different data directories if we
 * distinguished that case from other SHMSTATE_FOREIGN cases.  Doing so would
 * cause us to visit less of the key space, making us less likely to detect a
 * SHMSTATE_ATTACHED key.  It would also complicate the concurrency analysis,
 * in that postmasters of different data directories could simultaneously
 * attempt to recycle a given key.  We'll waste keys longer in some cases, but
 * avoiding the problems of the alternative justifies that loss.
 */
typedef enum
{
	SHMSTATE_ANALYSIS_FAILURE,	/* unexpected failure to analyze the ID */
	SHMSTATE_ATTACHED,			/* pertinent to DataDir, has attached PIDs */
	SHMSTATE_ENOENT,			/* no segment of that ID */
	SHMSTATE_FOREIGN,			/* exists, but not pertinent to DataDir */
	SHMSTATE_UNATTACHED			/* pertinent to DataDir, no attached PIDs */
} IpcMemoryState;
| 94 | |
| 95 | |
/*
 * Key and attach address of the SysV segment created by
 * PGSharedMemoryCreate(), which records them after a successful create.
 * PGSharedMemoryReAttach() treats both as implicit parameters.
 */
unsigned long UsedShmemSegID = 0;
void	   *UsedShmemSegAddr = NULL;

/*
 * Address and size of the anonymous mmap'd block.  PGSharedMemoryCreate()
 * fills these in when shared_memory_type is SHMEM_TYPE_MMAP; otherwise
 * AnonymousShmem stays NULL.  AnonymousShmemDetach() unmaps the block and
 * resets the pointer.
 */
static Size AnonymousShmemSize;
static void *AnonymousShmem = NULL;

/* Forward declarations of file-local routines defined below. */
static void *InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size);
static void IpcMemoryDetach(int status, Datum shmaddr);
static void IpcMemoryDelete(int status, Datum shmId);
static IpcMemoryState PGSharedMemoryAttach(IpcMemoryId shmId,
										   void *attachAt,
										   PGShmemHeader **addr);
| 108 | |
| 109 | |
| 110 | /* |
| 111 | * InternalIpcMemoryCreate(memKey, size) |
| 112 | * |
| 113 | * Attempt to create a new shared memory segment with the specified key. |
| 114 | * Will fail (return NULL) if such a segment already exists. If successful, |
| 115 | * attach the segment to the current process and return its attached address. |
| 116 | * On success, callbacks are registered with on_shmem_exit to detach and |
| 117 | * delete the segment when on_shmem_exit is called. |
| 118 | * |
| 119 | * If we fail with a failure code other than collision-with-existing-segment, |
| 120 | * print out an error and abort. Other types of errors are not recoverable. |
| 121 | */ |
| 122 | static void * |
| 123 | InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size) |
| 124 | { |
| 125 | IpcMemoryId shmid; |
| 126 | void *requestedAddress = NULL; |
| 127 | void *memAddress; |
| 128 | |
| 129 | /* |
| 130 | * Normally we just pass requestedAddress = NULL to shmat(), allowing the |
| 131 | * system to choose where the segment gets mapped. But in an EXEC_BACKEND |
| 132 | * build, it's possible for whatever is chosen in the postmaster to not |
| 133 | * work for backends, due to variations in address space layout. As a |
| 134 | * rather klugy workaround, allow the user to specify the address to use |
| 135 | * via setting the environment variable PG_SHMEM_ADDR. (If this were of |
| 136 | * interest for anything except debugging, we'd probably create a cleaner |
| 137 | * and better-documented way to set it, such as a GUC.) |
| 138 | */ |
| 139 | #ifdef EXEC_BACKEND |
| 140 | { |
| 141 | char *pg_shmem_addr = getenv("PG_SHMEM_ADDR" ); |
| 142 | |
| 143 | if (pg_shmem_addr) |
| 144 | requestedAddress = (void *) strtoul(pg_shmem_addr, NULL, 0); |
| 145 | } |
| 146 | #endif |
| 147 | |
| 148 | shmid = shmget(memKey, size, IPC_CREAT | IPC_EXCL | IPCProtection); |
| 149 | |
| 150 | if (shmid < 0) |
| 151 | { |
| 152 | int shmget_errno = errno; |
| 153 | |
| 154 | /* |
| 155 | * Fail quietly if error indicates a collision with existing segment. |
| 156 | * One would expect EEXIST, given that we said IPC_EXCL, but perhaps |
| 157 | * we could get a permission violation instead? Also, EIDRM might |
| 158 | * occur if an old seg is slated for destruction but not gone yet. |
| 159 | */ |
| 160 | if (shmget_errno == EEXIST || shmget_errno == EACCES |
| 161 | #ifdef EIDRM |
| 162 | || shmget_errno == EIDRM |
| 163 | #endif |
| 164 | ) |
| 165 | return NULL; |
| 166 | |
| 167 | /* |
| 168 | * Some BSD-derived kernels are known to return EINVAL, not EEXIST, if |
| 169 | * there is an existing segment but it's smaller than "size" (this is |
| 170 | * a result of poorly-thought-out ordering of error tests). To |
| 171 | * distinguish between collision and invalid size in such cases, we |
| 172 | * make a second try with size = 0. These kernels do not test size |
| 173 | * against SHMMIN in the preexisting-segment case, so we will not get |
| 174 | * EINVAL a second time if there is such a segment. |
| 175 | */ |
| 176 | if (shmget_errno == EINVAL) |
| 177 | { |
| 178 | shmid = shmget(memKey, 0, IPC_CREAT | IPC_EXCL | IPCProtection); |
| 179 | |
| 180 | if (shmid < 0) |
| 181 | { |
| 182 | /* As above, fail quietly if we verify a collision */ |
| 183 | if (errno == EEXIST || errno == EACCES |
| 184 | #ifdef EIDRM |
| 185 | || errno == EIDRM |
| 186 | #endif |
| 187 | ) |
| 188 | return NULL; |
| 189 | /* Otherwise, fall through to report the original error */ |
| 190 | } |
| 191 | else |
| 192 | { |
| 193 | /* |
| 194 | * On most platforms we cannot get here because SHMMIN is |
| 195 | * greater than zero. However, if we do succeed in creating a |
| 196 | * zero-size segment, free it and then fall through to report |
| 197 | * the original error. |
| 198 | */ |
| 199 | if (shmctl(shmid, IPC_RMID, NULL) < 0) |
| 200 | elog(LOG, "shmctl(%d, %d, 0) failed: %m" , |
| 201 | (int) shmid, IPC_RMID); |
| 202 | } |
| 203 | } |
| 204 | |
| 205 | /* |
| 206 | * Else complain and abort. |
| 207 | * |
| 208 | * Note: at this point EINVAL should mean that either SHMMIN or SHMMAX |
| 209 | * is violated. SHMALL violation might be reported as either ENOMEM |
| 210 | * (BSDen) or ENOSPC (Linux); the Single Unix Spec fails to say which |
| 211 | * it should be. SHMMNI violation is ENOSPC, per spec. Just plain |
| 212 | * not-enough-RAM is ENOMEM. |
| 213 | */ |
| 214 | errno = shmget_errno; |
| 215 | ereport(FATAL, |
| 216 | (errmsg("could not create shared memory segment: %m" ), |
| 217 | errdetail("Failed system call was shmget(key=%lu, size=%zu, 0%o)." , |
| 218 | (unsigned long) memKey, size, |
| 219 | IPC_CREAT | IPC_EXCL | IPCProtection), |
| 220 | (shmget_errno == EINVAL) ? |
| 221 | errhint("This error usually means that PostgreSQL's request for a shared memory " |
| 222 | "segment exceeded your kernel's SHMMAX parameter, or possibly that " |
| 223 | "it is less than " |
| 224 | "your kernel's SHMMIN parameter.\n" |
| 225 | "The PostgreSQL documentation contains more information about shared " |
| 226 | "memory configuration." ) : 0, |
| 227 | (shmget_errno == ENOMEM) ? |
| 228 | errhint("This error usually means that PostgreSQL's request for a shared " |
| 229 | "memory segment exceeded your kernel's SHMALL parameter. You might need " |
| 230 | "to reconfigure the kernel with larger SHMALL.\n" |
| 231 | "The PostgreSQL documentation contains more information about shared " |
| 232 | "memory configuration." ) : 0, |
| 233 | (shmget_errno == ENOSPC) ? |
| 234 | errhint("This error does *not* mean that you have run out of disk space. " |
| 235 | "It occurs either if all available shared memory IDs have been taken, " |
| 236 | "in which case you need to raise the SHMMNI parameter in your kernel, " |
| 237 | "or because the system's overall limit for shared memory has been " |
| 238 | "reached.\n" |
| 239 | "The PostgreSQL documentation contains more information about shared " |
| 240 | "memory configuration." ) : 0)); |
| 241 | } |
| 242 | |
| 243 | /* Register on-exit routine to delete the new segment */ |
| 244 | on_shmem_exit(IpcMemoryDelete, Int32GetDatum(shmid)); |
| 245 | |
| 246 | /* OK, should be able to attach to the segment */ |
| 247 | memAddress = shmat(shmid, requestedAddress, PG_SHMAT_FLAGS); |
| 248 | |
| 249 | if (memAddress == (void *) -1) |
| 250 | elog(FATAL, "shmat(id=%d, addr=%p, flags=0x%x) failed: %m" , |
| 251 | shmid, requestedAddress, PG_SHMAT_FLAGS); |
| 252 | |
| 253 | /* Register on-exit routine to detach new segment before deleting */ |
| 254 | on_shmem_exit(IpcMemoryDetach, PointerGetDatum(memAddress)); |
| 255 | |
| 256 | /* |
| 257 | * Store shmem key and ID in data directory lockfile. Format to try to |
| 258 | * keep it the same length always (trailing junk in the lockfile won't |
| 259 | * hurt, but might confuse humans). |
| 260 | */ |
| 261 | { |
| 262 | char line[64]; |
| 263 | |
| 264 | sprintf(line, "%9lu %9lu" , |
| 265 | (unsigned long) memKey, (unsigned long) shmid); |
| 266 | AddToDataDirLockFile(LOCK_FILE_LINE_SHMEM_KEY, line); |
| 267 | } |
| 268 | |
| 269 | return memAddress; |
| 270 | } |
| 271 | |
| 272 | /****************************************************************************/ |
| 273 | /* IpcMemoryDetach(status, shmaddr) removes a shared memory segment */ |
| 274 | /* from process' address space */ |
| 275 | /* (called as an on_shmem_exit callback, hence funny argument list) */ |
| 276 | /****************************************************************************/ |
| 277 | static void |
| 278 | IpcMemoryDetach(int status, Datum shmaddr) |
| 279 | { |
| 280 | /* Detach System V shared memory block. */ |
| 281 | if (shmdt(DatumGetPointer(shmaddr)) < 0) |
| 282 | elog(LOG, "shmdt(%p) failed: %m" , DatumGetPointer(shmaddr)); |
| 283 | } |
| 284 | |
| 285 | /****************************************************************************/ |
| 286 | /* IpcMemoryDelete(status, shmId) deletes a shared memory segment */ |
| 287 | /* (called as an on_shmem_exit callback, hence funny argument list) */ |
| 288 | /****************************************************************************/ |
| 289 | static void |
| 290 | IpcMemoryDelete(int status, Datum shmId) |
| 291 | { |
| 292 | if (shmctl(DatumGetInt32(shmId), IPC_RMID, NULL) < 0) |
| 293 | elog(LOG, "shmctl(%d, %d, 0) failed: %m" , |
| 294 | DatumGetInt32(shmId), IPC_RMID); |
| 295 | } |
| 296 | |
| 297 | /* |
| 298 | * PGSharedMemoryIsInUse |
| 299 | * |
| 300 | * Is a previously-existing shmem segment still existing and in use? |
| 301 | * |
| 302 | * The point of this exercise is to detect the case where a prior postmaster |
| 303 | * crashed, but it left child backends that are still running. Therefore |
| 304 | * we only care about shmem segments that are associated with the intended |
| 305 | * DataDir. This is an important consideration since accidental matches of |
| 306 | * shmem segment IDs are reasonably common. |
| 307 | */ |
| 308 | bool |
| 309 | PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2) |
| 310 | { |
| 311 | PGShmemHeader *memAddress; |
| 312 | IpcMemoryState state; |
| 313 | |
| 314 | state = PGSharedMemoryAttach((IpcMemoryId) id2, NULL, &memAddress); |
| 315 | if (memAddress && shmdt(memAddress) < 0) |
| 316 | elog(LOG, "shmdt(%p) failed: %m" , memAddress); |
| 317 | switch (state) |
| 318 | { |
| 319 | case SHMSTATE_ENOENT: |
| 320 | case SHMSTATE_FOREIGN: |
| 321 | case SHMSTATE_UNATTACHED: |
| 322 | return false; |
| 323 | case SHMSTATE_ANALYSIS_FAILURE: |
| 324 | case SHMSTATE_ATTACHED: |
| 325 | return true; |
| 326 | } |
| 327 | return true; |
| 328 | } |
| 329 | |
| 330 | /* |
| 331 | * Test for a segment with id shmId; see comment at IpcMemoryState. |
| 332 | * |
| 333 | * If the segment exists, we'll attempt to attach to it, using attachAt |
| 334 | * if that's not NULL (but it's best to pass NULL if possible). |
| 335 | * |
| 336 | * *addr is set to the segment memory address if we attached to it, else NULL. |
| 337 | */ |
| 338 | static IpcMemoryState |
| 339 | PGSharedMemoryAttach(IpcMemoryId shmId, |
| 340 | void *attachAt, |
| 341 | PGShmemHeader **addr) |
| 342 | { |
| 343 | struct shmid_ds shmStat; |
| 344 | struct stat statbuf; |
| 345 | PGShmemHeader *hdr; |
| 346 | |
| 347 | *addr = NULL; |
| 348 | |
| 349 | /* |
| 350 | * First, try to stat the shm segment ID, to see if it exists at all. |
| 351 | */ |
| 352 | if (shmctl(shmId, IPC_STAT, &shmStat) < 0) |
| 353 | { |
| 354 | /* |
| 355 | * EINVAL actually has multiple possible causes documented in the |
| 356 | * shmctl man page, but we assume it must mean the segment no longer |
| 357 | * exists. |
| 358 | */ |
| 359 | if (errno == EINVAL) |
| 360 | return SHMSTATE_ENOENT; |
| 361 | |
| 362 | /* |
| 363 | * EACCES implies we have no read permission, which means it is not a |
| 364 | * Postgres shmem segment (or at least, not one that is relevant to |
| 365 | * our data directory). |
| 366 | */ |
| 367 | if (errno == EACCES) |
| 368 | return SHMSTATE_FOREIGN; |
| 369 | |
| 370 | /* |
| 371 | * Some Linux kernel versions (in fact, all of them as of July 2007) |
| 372 | * sometimes return EIDRM when EINVAL is correct. The Linux kernel |
| 373 | * actually does not have any internal state that would justify |
| 374 | * returning EIDRM, so we can get away with assuming that EIDRM is |
| 375 | * equivalent to EINVAL on that platform. |
| 376 | */ |
| 377 | #ifdef HAVE_LINUX_EIDRM_BUG |
| 378 | if (errno == EIDRM) |
| 379 | return SHMSTATE_ENOENT; |
| 380 | #endif |
| 381 | |
| 382 | /* |
| 383 | * Otherwise, we had better assume that the segment is in use. The |
| 384 | * only likely case is (non-Linux, assumed spec-compliant) EIDRM, |
| 385 | * which implies that the segment has been IPC_RMID'd but there are |
| 386 | * still processes attached to it. |
| 387 | */ |
| 388 | return SHMSTATE_ANALYSIS_FAILURE; |
| 389 | } |
| 390 | |
| 391 | /* |
| 392 | * Try to attach to the segment and see if it matches our data directory. |
| 393 | * This avoids key-conflict problems on machines that are running several |
| 394 | * postmasters under the same userid and port number. (That would not |
| 395 | * ordinarily happen in production, but it can happen during parallel |
| 396 | * testing. Since our test setups don't open any TCP ports on Unix, such |
| 397 | * cases don't conflict otherwise.) |
| 398 | */ |
| 399 | if (stat(DataDir, &statbuf) < 0) |
| 400 | return SHMSTATE_ANALYSIS_FAILURE; /* can't stat; be conservative */ |
| 401 | |
| 402 | hdr = (PGShmemHeader *) shmat(shmId, attachAt, PG_SHMAT_FLAGS); |
| 403 | if (hdr == (PGShmemHeader *) -1) |
| 404 | { |
| 405 | /* |
| 406 | * Attachment failed. The cases we're interested in are the same as |
| 407 | * for the shmctl() call above. In particular, note that the owning |
| 408 | * postmaster could have terminated and removed the segment between |
| 409 | * shmctl() and shmat(). |
| 410 | * |
| 411 | * If attachAt isn't NULL, it's possible that EINVAL reflects a |
| 412 | * problem with that address not a vanished segment, so it's best to |
| 413 | * pass NULL when probing for conflicting segments. |
| 414 | */ |
| 415 | if (errno == EINVAL) |
| 416 | return SHMSTATE_ENOENT; /* segment disappeared */ |
| 417 | if (errno == EACCES) |
| 418 | return SHMSTATE_FOREIGN; /* must be non-Postgres */ |
| 419 | #ifdef HAVE_LINUX_EIDRM_BUG |
| 420 | if (errno == EIDRM) |
| 421 | return SHMSTATE_ENOENT; /* segment disappeared */ |
| 422 | #endif |
| 423 | /* Otherwise, be conservative. */ |
| 424 | return SHMSTATE_ANALYSIS_FAILURE; |
| 425 | } |
| 426 | *addr = hdr; |
| 427 | |
| 428 | if (hdr->magic != PGShmemMagic || |
| 429 | hdr->device != statbuf.st_dev || |
| 430 | hdr->inode != statbuf.st_ino) |
| 431 | { |
| 432 | /* |
| 433 | * It's either not a Postgres segment, or not one for my data |
| 434 | * directory. |
| 435 | */ |
| 436 | return SHMSTATE_FOREIGN; |
| 437 | } |
| 438 | |
| 439 | /* |
| 440 | * It does match our data directory, so now test whether any processes are |
| 441 | * still attached to it. (We are, now, but the shm_nattch result is from |
| 442 | * before we attached to it.) |
| 443 | */ |
| 444 | return shmStat.shm_nattch == 0 ? SHMSTATE_UNATTACHED : SHMSTATE_ATTACHED; |
| 445 | } |
| 446 | |
| 447 | #ifdef MAP_HUGETLB |
| 448 | |
| 449 | /* |
| 450 | * Identify the huge page size to use. |
| 451 | * |
| 452 | * Some Linux kernel versions have a bug causing mmap() to fail on requests |
| 453 | * that are not a multiple of the hugepage size. Versions without that bug |
| 454 | * instead silently round the request up to the next hugepage multiple --- |
| 455 | * and then munmap() fails when we give it a size different from that. |
| 456 | * So we have to round our request up to a multiple of the actual hugepage |
| 457 | * size to avoid trouble. |
| 458 | * |
| 459 | * Doing the round-up ourselves also lets us make use of the extra memory, |
| 460 | * rather than just wasting it. Currently, we just increase the available |
| 461 | * space recorded in the shmem header, which will make the extra usable for |
| 462 | * purposes such as additional locktable entries. Someday, for very large |
| 463 | * hugepage sizes, we might want to think about more invasive strategies, |
| 464 | * such as increasing shared_buffers to absorb the extra space. |
| 465 | * |
| 466 | * Returns the (real or assumed) page size into *hugepagesize, |
| 467 | * and the hugepage-related mmap flags to use into *mmap_flags. |
| 468 | * |
| 469 | * Currently *mmap_flags is always just MAP_HUGETLB. Someday, on systems |
| 470 | * that support it, we might OR in additional bits to specify a particular |
| 471 | * non-default huge page size. |
| 472 | */ |
| 473 | static void |
| 474 | GetHugePageSize(Size *hugepagesize, int *mmap_flags) |
| 475 | { |
| 476 | /* |
| 477 | * If we fail to find out the system's default huge page size, assume it |
| 478 | * is 2MB. This will work fine when the actual size is less. If it's |
| 479 | * more, we might get mmap() or munmap() failures due to unaligned |
| 480 | * requests; but at this writing, there are no reports of any non-Linux |
| 481 | * systems being picky about that. |
| 482 | */ |
| 483 | *hugepagesize = 2 * 1024 * 1024; |
| 484 | *mmap_flags = MAP_HUGETLB; |
| 485 | |
| 486 | /* |
| 487 | * System-dependent code to find out the default huge page size. |
| 488 | * |
| 489 | * On Linux, read /proc/meminfo looking for a line like "Hugepagesize: |
| 490 | * nnnn kB". Ignore any failures, falling back to the preset default. |
| 491 | */ |
| 492 | #ifdef __linux__ |
| 493 | { |
| 494 | FILE *fp = AllocateFile("/proc/meminfo" , "r" ); |
| 495 | char buf[128]; |
| 496 | unsigned int sz; |
| 497 | char ch; |
| 498 | |
| 499 | if (fp) |
| 500 | { |
| 501 | while (fgets(buf, sizeof(buf), fp)) |
| 502 | { |
| 503 | if (sscanf(buf, "Hugepagesize: %u %c" , &sz, &ch) == 2) |
| 504 | { |
| 505 | if (ch == 'k') |
| 506 | { |
| 507 | *hugepagesize = sz * (Size) 1024; |
| 508 | break; |
| 509 | } |
| 510 | /* We could accept other units besides kB, if needed */ |
| 511 | } |
| 512 | } |
| 513 | FreeFile(fp); |
| 514 | } |
| 515 | } |
| 516 | #endif /* __linux__ */ |
| 517 | } |
| 518 | |
| 519 | #endif /* MAP_HUGETLB */ |
| 520 | |
| 521 | /* |
| 522 | * Creates an anonymous mmap()ed shared memory segment. |
| 523 | * |
| 524 | * Pass the requested size in *size. This function will modify *size to the |
| 525 | * actual size of the allocation, if it ends up allocating a segment that is |
| 526 | * larger than requested. |
| 527 | */ |
| 528 | static void * |
| 529 | CreateAnonymousSegment(Size *size) |
| 530 | { |
| 531 | Size allocsize = *size; |
| 532 | void *ptr = MAP_FAILED; |
| 533 | int mmap_errno = 0; |
| 534 | |
| 535 | #ifndef MAP_HUGETLB |
| 536 | /* PGSharedMemoryCreate should have dealt with this case */ |
| 537 | Assert(huge_pages != HUGE_PAGES_ON); |
| 538 | #else |
| 539 | if (huge_pages == HUGE_PAGES_ON || huge_pages == HUGE_PAGES_TRY) |
| 540 | { |
| 541 | /* |
| 542 | * Round up the request size to a suitable large value. |
| 543 | */ |
| 544 | Size hugepagesize; |
| 545 | int mmap_flags; |
| 546 | |
| 547 | GetHugePageSize(&hugepagesize, &mmap_flags); |
| 548 | |
| 549 | if (allocsize % hugepagesize != 0) |
| 550 | allocsize += hugepagesize - (allocsize % hugepagesize); |
| 551 | |
| 552 | ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE, |
| 553 | PG_MMAP_FLAGS | mmap_flags, -1, 0); |
| 554 | mmap_errno = errno; |
| 555 | if (huge_pages == HUGE_PAGES_TRY && ptr == MAP_FAILED) |
| 556 | elog(DEBUG1, "mmap(%zu) with MAP_HUGETLB failed, huge pages disabled: %m" , |
| 557 | allocsize); |
| 558 | } |
| 559 | #endif |
| 560 | |
| 561 | if (ptr == MAP_FAILED && huge_pages != HUGE_PAGES_ON) |
| 562 | { |
| 563 | /* |
| 564 | * Use the original size, not the rounded-up value, when falling back |
| 565 | * to non-huge pages. |
| 566 | */ |
| 567 | allocsize = *size; |
| 568 | ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE, |
| 569 | PG_MMAP_FLAGS, -1, 0); |
| 570 | mmap_errno = errno; |
| 571 | } |
| 572 | |
| 573 | if (ptr == MAP_FAILED) |
| 574 | { |
| 575 | errno = mmap_errno; |
| 576 | ereport(FATAL, |
| 577 | (errmsg("could not map anonymous shared memory: %m" ), |
| 578 | (mmap_errno == ENOMEM) ? |
| 579 | errhint("This error usually means that PostgreSQL's request " |
| 580 | "for a shared memory segment exceeded available memory, " |
| 581 | "swap space, or huge pages. To reduce the request size " |
| 582 | "(currently %zu bytes), reduce PostgreSQL's shared " |
| 583 | "memory usage, perhaps by reducing shared_buffers or " |
| 584 | "max_connections." , |
| 585 | *size) : 0)); |
| 586 | } |
| 587 | |
| 588 | *size = allocsize; |
| 589 | return ptr; |
| 590 | } |
| 591 | |
| 592 | /* |
| 593 | * AnonymousShmemDetach --- detach from an anonymous mmap'd block |
| 594 | * (called as an on_shmem_exit callback, hence funny argument list) |
| 595 | */ |
| 596 | static void |
| 597 | AnonymousShmemDetach(int status, Datum arg) |
| 598 | { |
| 599 | /* Release anonymous shared memory block, if any. */ |
| 600 | if (AnonymousShmem != NULL) |
| 601 | { |
| 602 | if (munmap(AnonymousShmem, AnonymousShmemSize) < 0) |
| 603 | elog(LOG, "munmap(%p, %zu) failed: %m" , |
| 604 | AnonymousShmem, AnonymousShmemSize); |
| 605 | AnonymousShmem = NULL; |
| 606 | } |
| 607 | } |
| 608 | |
| 609 | /* |
| 610 | * PGSharedMemoryCreate |
| 611 | * |
| 612 | * Create a shared memory segment of the given size and initialize its |
| 613 | * standard header. Also, register an on_shmem_exit callback to release |
| 614 | * the storage. |
| 615 | * |
| 616 | * Dead Postgres segments pertinent to this DataDir are recycled if found, but |
| 617 | * we do not fail upon collision with foreign shmem segments. The idea here |
| 618 | * is to detect and re-use keys that may have been assigned by a crashed |
| 619 | * postmaster or backend. |
| 620 | * |
| 621 | * The port number is passed for possible use as a key (for SysV, we use |
| 622 | * it to generate the starting shmem key). |
| 623 | */ |
| 624 | PGShmemHeader * |
| 625 | PGSharedMemoryCreate(Size size, int port, |
| 626 | PGShmemHeader **shim) |
| 627 | { |
| 628 | IpcMemoryKey NextShmemSegID; |
| 629 | void *memAddress; |
| 630 | PGShmemHeader *hdr; |
| 631 | struct stat statbuf; |
| 632 | Size sysvsize; |
| 633 | |
| 634 | /* Complain if hugepages demanded but we can't possibly support them */ |
| 635 | #if !defined(MAP_HUGETLB) |
| 636 | if (huge_pages == HUGE_PAGES_ON) |
| 637 | ereport(ERROR, |
| 638 | (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), |
| 639 | errmsg("huge pages not supported on this platform" ))); |
| 640 | #endif |
| 641 | |
| 642 | /* Room for a header? */ |
| 643 | Assert(size > MAXALIGN(sizeof(PGShmemHeader))); |
| 644 | |
| 645 | if (shared_memory_type == SHMEM_TYPE_MMAP) |
| 646 | { |
| 647 | AnonymousShmem = CreateAnonymousSegment(&size); |
| 648 | AnonymousShmemSize = size; |
| 649 | |
| 650 | /* Register on-exit routine to unmap the anonymous segment */ |
| 651 | on_shmem_exit(AnonymousShmemDetach, (Datum) 0); |
| 652 | |
| 653 | /* Now we need only allocate a minimal-sized SysV shmem block. */ |
| 654 | sysvsize = sizeof(PGShmemHeader); |
| 655 | } |
| 656 | else |
| 657 | sysvsize = size; |
| 658 | |
| 659 | /* |
| 660 | * Loop till we find a free IPC key. Trust CreateDataDirLockFile() to |
| 661 | * ensure no more than one postmaster per data directory can enter this |
| 662 | * loop simultaneously. (CreateDataDirLockFile() does not ensure that, |
| 663 | * but prefer fixing it over coping here.) |
| 664 | */ |
| 665 | NextShmemSegID = 1 + port * 1000; |
| 666 | |
| 667 | for (;;) |
| 668 | { |
| 669 | IpcMemoryId shmid; |
| 670 | PGShmemHeader *oldhdr; |
| 671 | IpcMemoryState state; |
| 672 | |
| 673 | /* Try to create new segment */ |
| 674 | memAddress = InternalIpcMemoryCreate(NextShmemSegID, sysvsize); |
| 675 | if (memAddress) |
| 676 | break; /* successful create and attach */ |
| 677 | |
| 678 | /* Check shared memory and possibly remove and recreate */ |
| 679 | |
| 680 | /* |
| 681 | * shmget() failure is typically EACCES, hence SHMSTATE_FOREIGN. |
| 682 | * ENOENT, a narrow possibility, implies SHMSTATE_ENOENT, but one can |
| 683 | * safely treat SHMSTATE_ENOENT like SHMSTATE_FOREIGN. |
| 684 | */ |
| 685 | shmid = shmget(NextShmemSegID, sizeof(PGShmemHeader), 0); |
| 686 | if (shmid < 0) |
| 687 | { |
| 688 | oldhdr = NULL; |
| 689 | state = SHMSTATE_FOREIGN; |
| 690 | } |
| 691 | else |
| 692 | state = PGSharedMemoryAttach(shmid, NULL, &oldhdr); |
| 693 | |
| 694 | switch (state) |
| 695 | { |
| 696 | case SHMSTATE_ANALYSIS_FAILURE: |
| 697 | case SHMSTATE_ATTACHED: |
| 698 | ereport(FATAL, |
| 699 | (errcode(ERRCODE_LOCK_FILE_EXISTS), |
| 700 | errmsg("pre-existing shared memory block (key %lu, ID %lu) is still in use" , |
| 701 | (unsigned long) NextShmemSegID, |
| 702 | (unsigned long) shmid), |
| 703 | errhint("Terminate any old server processes associated with data directory \"%s\"." , |
| 704 | DataDir))); |
| 705 | break; |
| 706 | case SHMSTATE_ENOENT: |
| 707 | |
| 708 | /* |
| 709 | * To our surprise, some other process deleted since our last |
| 710 | * InternalIpcMemoryCreate(). Moments earlier, we would have |
| 711 | * seen SHMSTATE_FOREIGN. Try that same ID again. |
| 712 | */ |
| 713 | elog(LOG, |
| 714 | "shared memory block (key %lu, ID %lu) deleted during startup" , |
| 715 | (unsigned long) NextShmemSegID, |
| 716 | (unsigned long) shmid); |
| 717 | break; |
| 718 | case SHMSTATE_FOREIGN: |
| 719 | NextShmemSegID++; |
| 720 | break; |
| 721 | case SHMSTATE_UNATTACHED: |
| 722 | |
| 723 | /* |
| 724 | * The segment pertains to DataDir, and every process that had |
| 725 | * used it has died or detached. Zap it, if possible, and any |
| 726 | * associated dynamic shared memory segments, as well. This |
| 727 | * shouldn't fail, but if it does, assume the segment belongs |
| 728 | * to someone else after all, and try the next candidate. |
| 729 | * Otherwise, try again to create the segment. That may fail |
| 730 | * if some other process creates the same shmem key before we |
| 731 | * do, in which case we'll try the next key. |
| 732 | */ |
| 733 | if (oldhdr->dsm_control != 0) |
| 734 | dsm_cleanup_using_control_segment(oldhdr->dsm_control); |
| 735 | if (shmctl(shmid, IPC_RMID, NULL) < 0) |
| 736 | NextShmemSegID++; |
| 737 | break; |
| 738 | } |
| 739 | |
| 740 | if (oldhdr && shmdt(oldhdr) < 0) |
| 741 | elog(LOG, "shmdt(%p) failed: %m" , oldhdr); |
| 742 | } |
| 743 | |
| 744 | /* Initialize new segment. */ |
| 745 | hdr = (PGShmemHeader *) memAddress; |
| 746 | hdr->creatorPID = getpid(); |
| 747 | hdr->magic = PGShmemMagic; |
| 748 | hdr->dsm_control = 0; |
| 749 | |
| 750 | /* Fill in the data directory ID info, too */ |
| 751 | if (stat(DataDir, &statbuf) < 0) |
| 752 | ereport(FATAL, |
| 753 | (errcode_for_file_access(), |
| 754 | errmsg("could not stat data directory \"%s\": %m" , |
| 755 | DataDir))); |
| 756 | hdr->device = statbuf.st_dev; |
| 757 | hdr->inode = statbuf.st_ino; |
| 758 | |
| 759 | /* |
| 760 | * Initialize space allocation status for segment. |
| 761 | */ |
| 762 | hdr->totalsize = size; |
| 763 | hdr->freeoffset = MAXALIGN(sizeof(PGShmemHeader)); |
| 764 | *shim = hdr; |
| 765 | |
| 766 | /* Save info for possible future use */ |
| 767 | UsedShmemSegAddr = memAddress; |
| 768 | UsedShmemSegID = (unsigned long) NextShmemSegID; |
| 769 | |
| 770 | /* |
| 771 | * If AnonymousShmem is NULL here, then we're not using anonymous shared |
| 772 | * memory, and should return a pointer to the System V shared memory |
| 773 | * block. Otherwise, the System V shared memory block is only a shim, and |
| 774 | * we must return a pointer to the real block. |
| 775 | */ |
| 776 | if (AnonymousShmem == NULL) |
| 777 | return hdr; |
| 778 | memcpy(AnonymousShmem, hdr, sizeof(PGShmemHeader)); |
| 779 | return (PGShmemHeader *) AnonymousShmem; |
| 780 | } |
| 781 | |
| 782 | #ifdef EXEC_BACKEND |
| 783 | |
| 784 | /* |
| 785 | * PGSharedMemoryReAttach |
| 786 | * |
| 787 | * This is called during startup of a postmaster child process to re-attach to |
| 788 | * an already existing shared memory segment. This is needed only in the |
| 789 | * EXEC_BACKEND case; otherwise postmaster children inherit the shared memory |
| 790 | * segment attachment via fork(). |
| 791 | * |
| 792 | * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this |
| 793 | * routine. The caller must have already restored them to the postmaster's |
| 794 | * values. |
| 795 | */ |
| 796 | void |
| 797 | PGSharedMemoryReAttach(void) |
| 798 | { |
| 799 | IpcMemoryId shmid; |
| 800 | PGShmemHeader *hdr; |
| 801 | IpcMemoryState state; |
| 802 | void *origUsedShmemSegAddr = UsedShmemSegAddr; |
| 803 | |
| 804 | Assert(UsedShmemSegAddr != NULL); |
| 805 | Assert(IsUnderPostmaster); |
| 806 | |
| 807 | #ifdef __CYGWIN__ |
| 808 | /* cygipc (currently) appears to not detach on exec. */ |
| 809 | PGSharedMemoryDetach(); |
| 810 | UsedShmemSegAddr = origUsedShmemSegAddr; |
| 811 | #endif |
| 812 | |
| 813 | elog(DEBUG3, "attaching to %p" , UsedShmemSegAddr); |
| 814 | shmid = shmget(UsedShmemSegID, sizeof(PGShmemHeader), 0); |
| 815 | if (shmid < 0) |
| 816 | state = SHMSTATE_FOREIGN; |
| 817 | else |
| 818 | state = PGSharedMemoryAttach(shmid, UsedShmemSegAddr, &hdr); |
| 819 | if (state != SHMSTATE_ATTACHED) |
| 820 | elog(FATAL, "could not reattach to shared memory (key=%d, addr=%p): %m" , |
| 821 | (int) UsedShmemSegID, UsedShmemSegAddr); |
| 822 | if (hdr != origUsedShmemSegAddr) |
| 823 | elog(FATAL, "reattaching to shared memory returned unexpected address (got %p, expected %p)" , |
| 824 | hdr, origUsedShmemSegAddr); |
| 825 | dsm_set_control_handle(hdr->dsm_control); |
| 826 | |
| 827 | UsedShmemSegAddr = hdr; /* probably redundant */ |
| 828 | } |
| 829 | |
| 830 | /* |
| 831 | * PGSharedMemoryNoReAttach |
| 832 | * |
| 833 | * This is called during startup of a postmaster child process when we choose |
| 834 | * *not* to re-attach to the existing shared memory segment. We must clean up |
| 835 | * to leave things in the appropriate state. This is not used in the non |
| 836 | * EXEC_BACKEND case, either. |
| 837 | * |
| 838 | * The child process startup logic might or might not call PGSharedMemoryDetach |
| 839 | * after this; make sure that it will be a no-op if called. |
| 840 | * |
| 841 | * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this |
| 842 | * routine. The caller must have already restored them to the postmaster's |
| 843 | * values. |
| 844 | */ |
| 845 | void |
| 846 | PGSharedMemoryNoReAttach(void) |
| 847 | { |
| 848 | Assert(UsedShmemSegAddr != NULL); |
| 849 | Assert(IsUnderPostmaster); |
| 850 | |
| 851 | #ifdef __CYGWIN__ |
| 852 | /* cygipc (currently) appears to not detach on exec. */ |
| 853 | PGSharedMemoryDetach(); |
| 854 | #endif |
| 855 | |
| 856 | /* For cleanliness, reset UsedShmemSegAddr to show we're not attached. */ |
| 857 | UsedShmemSegAddr = NULL; |
| 858 | /* And the same for UsedShmemSegID. */ |
| 859 | UsedShmemSegID = 0; |
| 860 | } |
| 861 | |
| 862 | #endif /* EXEC_BACKEND */ |
| 863 | |
| 864 | /* |
| 865 | * PGSharedMemoryDetach |
| 866 | * |
| 867 | * Detach from the shared memory segment, if still attached. This is not |
| 868 | * intended to be called explicitly by the process that originally created the |
| 869 | * segment (it will have on_shmem_exit callback(s) registered to do that). |
| 870 | * Rather, this is for subprocesses that have inherited an attachment and want |
| 871 | * to get rid of it. |
| 872 | * |
| 873 | * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this |
| 874 | * routine, also AnonymousShmem and AnonymousShmemSize. |
| 875 | */ |
| 876 | void |
| 877 | PGSharedMemoryDetach(void) |
| 878 | { |
| 879 | if (UsedShmemSegAddr != NULL) |
| 880 | { |
| 881 | if ((shmdt(UsedShmemSegAddr) < 0) |
| 882 | #if defined(EXEC_BACKEND) && defined(__CYGWIN__) |
| 883 | /* Work-around for cygipc exec bug */ |
| 884 | && shmdt(NULL) < 0 |
| 885 | #endif |
| 886 | ) |
| 887 | elog(LOG, "shmdt(%p) failed: %m" , UsedShmemSegAddr); |
| 888 | UsedShmemSegAddr = NULL; |
| 889 | } |
| 890 | |
| 891 | if (AnonymousShmem != NULL) |
| 892 | { |
| 893 | if (munmap(AnonymousShmem, AnonymousShmemSize) < 0) |
| 894 | elog(LOG, "munmap(%p, %zu) failed: %m" , |
| 895 | AnonymousShmem, AnonymousShmemSize); |
| 896 | AnonymousShmem = NULL; |
| 897 | } |
| 898 | } |
| 899 | |