1/*-------------------------------------------------------------------------
2 *
3 * sysv_shmem.c
4 * Implement shared memory using SysV facilities
5 *
6 * These routines used to be a fairly thin layer on top of SysV shared
7 * memory functionality. With the addition of anonymous-shmem logic,
8 * they're a bit fatter now. We still require a SysV shmem block to
9 * exist, though, because mmap'd shmem provides no way to find out how
10 * many processes are attached, which we need for interlocking purposes.
11 *
12 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
13 * Portions Copyright (c) 1994, Regents of the University of California
14 *
15 * IDENTIFICATION
16 * src/backend/port/sysv_shmem.c
17 *
18 *-------------------------------------------------------------------------
19 */
20#include "postgres.h"
21
22#include <signal.h>
23#include <unistd.h>
24#include <sys/file.h>
25#include <sys/mman.h>
26#include <sys/stat.h>
27#ifdef HAVE_SYS_IPC_H
28#include <sys/ipc.h>
29#endif
30#ifdef HAVE_SYS_SHM_H
31#include <sys/shm.h>
32#endif
33
34#include "miscadmin.h"
35#include "portability/mem.h"
36#include "storage/dsm.h"
37#include "storage/fd.h"
38#include "storage/ipc.h"
39#include "storage/pg_shmem.h"
40#include "utils/guc.h"
41#include "utils/pidfile.h"
42
43
44/*
45 * As of PostgreSQL 9.3, we normally allocate only a very small amount of
46 * System V shared memory, and only for the purposes of providing an
47 * interlock to protect the data directory. The real shared memory block
48 * is allocated using mmap(). This works around the problem that many
49 * systems have very low limits on the amount of System V shared memory
50 * that can be allocated. Even a limit of a few megabytes will be enough
51 * to run many copies of PostgreSQL without needing to adjust system settings.
52 *
53 * We assume that no one will attempt to run PostgreSQL 9.3 or later on
54 * systems that are ancient enough that anonymous shared memory is not
55 * supported, such as pre-2.4 versions of Linux. If that turns out to be
56 * false, we might need to add compile and/or run-time tests here and do this
57 * only if the running kernel supports it.
58 *
59 * However, we must always disable this logic in the EXEC_BACKEND case, and
60 * fall back to the old method of allocating the entire segment using System V
61 * shared memory, because there's no way to attach an anonymous mmap'd segment
62 * to a process after exec(). Since EXEC_BACKEND is intended only for
63 * developer use, this shouldn't be a big problem. Because of this, we do
64 * not worry about supporting anonymous shmem in the EXEC_BACKEND cases below.
65 *
66 * As of PostgreSQL 12, we regained the ability to use a large System V shared
67 * memory region even in non-EXEC_BACKEND builds, if shared_memory_type is set
68 * to sysv (though this is not the default).
69 */
70
71
/* Segment identifier, as handed back by shmget(2) */
typedef int IpcMemoryId;

/* Segment key, as handed to shmget(2) */
typedef key_t IpcMemoryKey;
74
75/*
76 * How does a given IpcMemoryId relate to this PostgreSQL process?
77 *
78 * One could recycle unattached segments of different data directories if we
79 * distinguished that case from other SHMSTATE_FOREIGN cases. Doing so would
80 * cause us to visit less of the key space, making us less likely to detect a
81 * SHMSTATE_ATTACHED key. It would also complicate the concurrency analysis,
82 * in that postmasters of different data directories could simultaneously
83 * attempt to recycle a given key. We'll waste keys longer in some cases, but
84 * avoiding the problems of the alternative justifies that loss.
85 */
typedef enum IpcMemoryState
{
    /* the ID could not be analyzed because of an unexpected failure */
    SHMSTATE_ANALYSIS_FAILURE,

    /* segment is pertinent to DataDir and has attached PIDs */
    SHMSTATE_ATTACHED,

    /* there is no segment with that ID */
    SHMSTATE_ENOENT,

    /* a segment exists, but it is not pertinent to DataDir */
    SHMSTATE_FOREIGN,

    /* segment is pertinent to DataDir and has no attached PIDs */
    SHMSTATE_UNATTACHED
} IpcMemoryState;
94
95
96unsigned long UsedShmemSegID = 0;
97void *UsedShmemSegAddr = NULL;
98
99static Size AnonymousShmemSize;
100static void *AnonymousShmem = NULL;
101
102static void *InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size);
103static void IpcMemoryDetach(int status, Datum shmaddr);
104static void IpcMemoryDelete(int status, Datum shmId);
105static IpcMemoryState PGSharedMemoryAttach(IpcMemoryId shmId,
106 void *attachAt,
107 PGShmemHeader **addr);
108
109
110/*
111 * InternalIpcMemoryCreate(memKey, size)
112 *
113 * Attempt to create a new shared memory segment with the specified key.
114 * Will fail (return NULL) if such a segment already exists. If successful,
115 * attach the segment to the current process and return its attached address.
116 * On success, callbacks are registered with on_shmem_exit to detach and
117 * delete the segment when on_shmem_exit is called.
118 *
119 * If we fail with a failure code other than collision-with-existing-segment,
120 * print out an error and abort. Other types of errors are not recoverable.
121 */
122static void *
123InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size)
124{
125 IpcMemoryId shmid;
126 void *requestedAddress = NULL;
127 void *memAddress;
128
129 /*
130 * Normally we just pass requestedAddress = NULL to shmat(), allowing the
131 * system to choose where the segment gets mapped. But in an EXEC_BACKEND
132 * build, it's possible for whatever is chosen in the postmaster to not
133 * work for backends, due to variations in address space layout. As a
134 * rather klugy workaround, allow the user to specify the address to use
135 * via setting the environment variable PG_SHMEM_ADDR. (If this were of
136 * interest for anything except debugging, we'd probably create a cleaner
137 * and better-documented way to set it, such as a GUC.)
138 */
139#ifdef EXEC_BACKEND
140 {
141 char *pg_shmem_addr = getenv("PG_SHMEM_ADDR");
142
143 if (pg_shmem_addr)
144 requestedAddress = (void *) strtoul(pg_shmem_addr, NULL, 0);
145 }
146#endif
147
148 shmid = shmget(memKey, size, IPC_CREAT | IPC_EXCL | IPCProtection);
149
150 if (shmid < 0)
151 {
152 int shmget_errno = errno;
153
154 /*
155 * Fail quietly if error indicates a collision with existing segment.
156 * One would expect EEXIST, given that we said IPC_EXCL, but perhaps
157 * we could get a permission violation instead? Also, EIDRM might
158 * occur if an old seg is slated for destruction but not gone yet.
159 */
160 if (shmget_errno == EEXIST || shmget_errno == EACCES
161#ifdef EIDRM
162 || shmget_errno == EIDRM
163#endif
164 )
165 return NULL;
166
167 /*
168 * Some BSD-derived kernels are known to return EINVAL, not EEXIST, if
169 * there is an existing segment but it's smaller than "size" (this is
170 * a result of poorly-thought-out ordering of error tests). To
171 * distinguish between collision and invalid size in such cases, we
172 * make a second try with size = 0. These kernels do not test size
173 * against SHMMIN in the preexisting-segment case, so we will not get
174 * EINVAL a second time if there is such a segment.
175 */
176 if (shmget_errno == EINVAL)
177 {
178 shmid = shmget(memKey, 0, IPC_CREAT | IPC_EXCL | IPCProtection);
179
180 if (shmid < 0)
181 {
182 /* As above, fail quietly if we verify a collision */
183 if (errno == EEXIST || errno == EACCES
184#ifdef EIDRM
185 || errno == EIDRM
186#endif
187 )
188 return NULL;
189 /* Otherwise, fall through to report the original error */
190 }
191 else
192 {
193 /*
194 * On most platforms we cannot get here because SHMMIN is
195 * greater than zero. However, if we do succeed in creating a
196 * zero-size segment, free it and then fall through to report
197 * the original error.
198 */
199 if (shmctl(shmid, IPC_RMID, NULL) < 0)
200 elog(LOG, "shmctl(%d, %d, 0) failed: %m",
201 (int) shmid, IPC_RMID);
202 }
203 }
204
205 /*
206 * Else complain and abort.
207 *
208 * Note: at this point EINVAL should mean that either SHMMIN or SHMMAX
209 * is violated. SHMALL violation might be reported as either ENOMEM
210 * (BSDen) or ENOSPC (Linux); the Single Unix Spec fails to say which
211 * it should be. SHMMNI violation is ENOSPC, per spec. Just plain
212 * not-enough-RAM is ENOMEM.
213 */
214 errno = shmget_errno;
215 ereport(FATAL,
216 (errmsg("could not create shared memory segment: %m"),
217 errdetail("Failed system call was shmget(key=%lu, size=%zu, 0%o).",
218 (unsigned long) memKey, size,
219 IPC_CREAT | IPC_EXCL | IPCProtection),
220 (shmget_errno == EINVAL) ?
221 errhint("This error usually means that PostgreSQL's request for a shared memory "
222 "segment exceeded your kernel's SHMMAX parameter, or possibly that "
223 "it is less than "
224 "your kernel's SHMMIN parameter.\n"
225 "The PostgreSQL documentation contains more information about shared "
226 "memory configuration.") : 0,
227 (shmget_errno == ENOMEM) ?
228 errhint("This error usually means that PostgreSQL's request for a shared "
229 "memory segment exceeded your kernel's SHMALL parameter. You might need "
230 "to reconfigure the kernel with larger SHMALL.\n"
231 "The PostgreSQL documentation contains more information about shared "
232 "memory configuration.") : 0,
233 (shmget_errno == ENOSPC) ?
234 errhint("This error does *not* mean that you have run out of disk space. "
235 "It occurs either if all available shared memory IDs have been taken, "
236 "in which case you need to raise the SHMMNI parameter in your kernel, "
237 "or because the system's overall limit for shared memory has been "
238 "reached.\n"
239 "The PostgreSQL documentation contains more information about shared "
240 "memory configuration.") : 0));
241 }
242
243 /* Register on-exit routine to delete the new segment */
244 on_shmem_exit(IpcMemoryDelete, Int32GetDatum(shmid));
245
246 /* OK, should be able to attach to the segment */
247 memAddress = shmat(shmid, requestedAddress, PG_SHMAT_FLAGS);
248
249 if (memAddress == (void *) -1)
250 elog(FATAL, "shmat(id=%d, addr=%p, flags=0x%x) failed: %m",
251 shmid, requestedAddress, PG_SHMAT_FLAGS);
252
253 /* Register on-exit routine to detach new segment before deleting */
254 on_shmem_exit(IpcMemoryDetach, PointerGetDatum(memAddress));
255
256 /*
257 * Store shmem key and ID in data directory lockfile. Format to try to
258 * keep it the same length always (trailing junk in the lockfile won't
259 * hurt, but might confuse humans).
260 */
261 {
262 char line[64];
263
264 sprintf(line, "%9lu %9lu",
265 (unsigned long) memKey, (unsigned long) shmid);
266 AddToDataDirLockFile(LOCK_FILE_LINE_SHMEM_KEY, line);
267 }
268
269 return memAddress;
270}
271
272/****************************************************************************/
273/* IpcMemoryDetach(status, shmaddr) removes a shared memory segment */
274/* from process' address space */
275/* (called as an on_shmem_exit callback, hence funny argument list) */
276/****************************************************************************/
277static void
278IpcMemoryDetach(int status, Datum shmaddr)
279{
280 /* Detach System V shared memory block. */
281 if (shmdt(DatumGetPointer(shmaddr)) < 0)
282 elog(LOG, "shmdt(%p) failed: %m", DatumGetPointer(shmaddr));
283}
284
285/****************************************************************************/
286/* IpcMemoryDelete(status, shmId) deletes a shared memory segment */
287/* (called as an on_shmem_exit callback, hence funny argument list) */
288/****************************************************************************/
289static void
290IpcMemoryDelete(int status, Datum shmId)
291{
292 if (shmctl(DatumGetInt32(shmId), IPC_RMID, NULL) < 0)
293 elog(LOG, "shmctl(%d, %d, 0) failed: %m",
294 DatumGetInt32(shmId), IPC_RMID);
295}
296
297/*
298 * PGSharedMemoryIsInUse
299 *
300 * Is a previously-existing shmem segment still existing and in use?
301 *
302 * The point of this exercise is to detect the case where a prior postmaster
303 * crashed, but it left child backends that are still running. Therefore
304 * we only care about shmem segments that are associated with the intended
305 * DataDir. This is an important consideration since accidental matches of
306 * shmem segment IDs are reasonably common.
307 */
308bool
309PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2)
310{
311 PGShmemHeader *memAddress;
312 IpcMemoryState state;
313
314 state = PGSharedMemoryAttach((IpcMemoryId) id2, NULL, &memAddress);
315 if (memAddress && shmdt(memAddress) < 0)
316 elog(LOG, "shmdt(%p) failed: %m", memAddress);
317 switch (state)
318 {
319 case SHMSTATE_ENOENT:
320 case SHMSTATE_FOREIGN:
321 case SHMSTATE_UNATTACHED:
322 return false;
323 case SHMSTATE_ANALYSIS_FAILURE:
324 case SHMSTATE_ATTACHED:
325 return true;
326 }
327 return true;
328}
329
330/*
331 * Test for a segment with id shmId; see comment at IpcMemoryState.
332 *
333 * If the segment exists, we'll attempt to attach to it, using attachAt
334 * if that's not NULL (but it's best to pass NULL if possible).
335 *
336 * *addr is set to the segment memory address if we attached to it, else NULL.
337 */
338static IpcMemoryState
339PGSharedMemoryAttach(IpcMemoryId shmId,
340 void *attachAt,
341 PGShmemHeader **addr)
342{
343 struct shmid_ds shmStat;
344 struct stat statbuf;
345 PGShmemHeader *hdr;
346
347 *addr = NULL;
348
349 /*
350 * First, try to stat the shm segment ID, to see if it exists at all.
351 */
352 if (shmctl(shmId, IPC_STAT, &shmStat) < 0)
353 {
354 /*
355 * EINVAL actually has multiple possible causes documented in the
356 * shmctl man page, but we assume it must mean the segment no longer
357 * exists.
358 */
359 if (errno == EINVAL)
360 return SHMSTATE_ENOENT;
361
362 /*
363 * EACCES implies we have no read permission, which means it is not a
364 * Postgres shmem segment (or at least, not one that is relevant to
365 * our data directory).
366 */
367 if (errno == EACCES)
368 return SHMSTATE_FOREIGN;
369
370 /*
371 * Some Linux kernel versions (in fact, all of them as of July 2007)
372 * sometimes return EIDRM when EINVAL is correct. The Linux kernel
373 * actually does not have any internal state that would justify
374 * returning EIDRM, so we can get away with assuming that EIDRM is
375 * equivalent to EINVAL on that platform.
376 */
377#ifdef HAVE_LINUX_EIDRM_BUG
378 if (errno == EIDRM)
379 return SHMSTATE_ENOENT;
380#endif
381
382 /*
383 * Otherwise, we had better assume that the segment is in use. The
384 * only likely case is (non-Linux, assumed spec-compliant) EIDRM,
385 * which implies that the segment has been IPC_RMID'd but there are
386 * still processes attached to it.
387 */
388 return SHMSTATE_ANALYSIS_FAILURE;
389 }
390
391 /*
392 * Try to attach to the segment and see if it matches our data directory.
393 * This avoids key-conflict problems on machines that are running several
394 * postmasters under the same userid and port number. (That would not
395 * ordinarily happen in production, but it can happen during parallel
396 * testing. Since our test setups don't open any TCP ports on Unix, such
397 * cases don't conflict otherwise.)
398 */
399 if (stat(DataDir, &statbuf) < 0)
400 return SHMSTATE_ANALYSIS_FAILURE; /* can't stat; be conservative */
401
402 hdr = (PGShmemHeader *) shmat(shmId, attachAt, PG_SHMAT_FLAGS);
403 if (hdr == (PGShmemHeader *) -1)
404 {
405 /*
406 * Attachment failed. The cases we're interested in are the same as
407 * for the shmctl() call above. In particular, note that the owning
408 * postmaster could have terminated and removed the segment between
409 * shmctl() and shmat().
410 *
411 * If attachAt isn't NULL, it's possible that EINVAL reflects a
412 * problem with that address not a vanished segment, so it's best to
413 * pass NULL when probing for conflicting segments.
414 */
415 if (errno == EINVAL)
416 return SHMSTATE_ENOENT; /* segment disappeared */
417 if (errno == EACCES)
418 return SHMSTATE_FOREIGN; /* must be non-Postgres */
419#ifdef HAVE_LINUX_EIDRM_BUG
420 if (errno == EIDRM)
421 return SHMSTATE_ENOENT; /* segment disappeared */
422#endif
423 /* Otherwise, be conservative. */
424 return SHMSTATE_ANALYSIS_FAILURE;
425 }
426 *addr = hdr;
427
428 if (hdr->magic != PGShmemMagic ||
429 hdr->device != statbuf.st_dev ||
430 hdr->inode != statbuf.st_ino)
431 {
432 /*
433 * It's either not a Postgres segment, or not one for my data
434 * directory.
435 */
436 return SHMSTATE_FOREIGN;
437 }
438
439 /*
440 * It does match our data directory, so now test whether any processes are
441 * still attached to it. (We are, now, but the shm_nattch result is from
442 * before we attached to it.)
443 */
444 return shmStat.shm_nattch == 0 ? SHMSTATE_UNATTACHED : SHMSTATE_ATTACHED;
445}
446
447#ifdef MAP_HUGETLB
448
449/*
450 * Identify the huge page size to use.
451 *
452 * Some Linux kernel versions have a bug causing mmap() to fail on requests
453 * that are not a multiple of the hugepage size. Versions without that bug
454 * instead silently round the request up to the next hugepage multiple ---
455 * and then munmap() fails when we give it a size different from that.
456 * So we have to round our request up to a multiple of the actual hugepage
457 * size to avoid trouble.
458 *
459 * Doing the round-up ourselves also lets us make use of the extra memory,
460 * rather than just wasting it. Currently, we just increase the available
461 * space recorded in the shmem header, which will make the extra usable for
462 * purposes such as additional locktable entries. Someday, for very large
463 * hugepage sizes, we might want to think about more invasive strategies,
464 * such as increasing shared_buffers to absorb the extra space.
465 *
466 * Returns the (real or assumed) page size into *hugepagesize,
467 * and the hugepage-related mmap flags to use into *mmap_flags.
468 *
469 * Currently *mmap_flags is always just MAP_HUGETLB. Someday, on systems
470 * that support it, we might OR in additional bits to specify a particular
471 * non-default huge page size.
472 */
473static void
474GetHugePageSize(Size *hugepagesize, int *mmap_flags)
475{
476 /*
477 * If we fail to find out the system's default huge page size, assume it
478 * is 2MB. This will work fine when the actual size is less. If it's
479 * more, we might get mmap() or munmap() failures due to unaligned
480 * requests; but at this writing, there are no reports of any non-Linux
481 * systems being picky about that.
482 */
483 *hugepagesize = 2 * 1024 * 1024;
484 *mmap_flags = MAP_HUGETLB;
485
486 /*
487 * System-dependent code to find out the default huge page size.
488 *
489 * On Linux, read /proc/meminfo looking for a line like "Hugepagesize:
490 * nnnn kB". Ignore any failures, falling back to the preset default.
491 */
492#ifdef __linux__
493 {
494 FILE *fp = AllocateFile("/proc/meminfo", "r");
495 char buf[128];
496 unsigned int sz;
497 char ch;
498
499 if (fp)
500 {
501 while (fgets(buf, sizeof(buf), fp))
502 {
503 if (sscanf(buf, "Hugepagesize: %u %c", &sz, &ch) == 2)
504 {
505 if (ch == 'k')
506 {
507 *hugepagesize = sz * (Size) 1024;
508 break;
509 }
510 /* We could accept other units besides kB, if needed */
511 }
512 }
513 FreeFile(fp);
514 }
515 }
516#endif /* __linux__ */
517}
518
519#endif /* MAP_HUGETLB */
520
521/*
522 * Creates an anonymous mmap()ed shared memory segment.
523 *
524 * Pass the requested size in *size. This function will modify *size to the
525 * actual size of the allocation, if it ends up allocating a segment that is
526 * larger than requested.
527 */
528static void *
529CreateAnonymousSegment(Size *size)
530{
531 Size allocsize = *size;
532 void *ptr = MAP_FAILED;
533 int mmap_errno = 0;
534
535#ifndef MAP_HUGETLB
536 /* PGSharedMemoryCreate should have dealt with this case */
537 Assert(huge_pages != HUGE_PAGES_ON);
538#else
539 if (huge_pages == HUGE_PAGES_ON || huge_pages == HUGE_PAGES_TRY)
540 {
541 /*
542 * Round up the request size to a suitable large value.
543 */
544 Size hugepagesize;
545 int mmap_flags;
546
547 GetHugePageSize(&hugepagesize, &mmap_flags);
548
549 if (allocsize % hugepagesize != 0)
550 allocsize += hugepagesize - (allocsize % hugepagesize);
551
552 ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
553 PG_MMAP_FLAGS | mmap_flags, -1, 0);
554 mmap_errno = errno;
555 if (huge_pages == HUGE_PAGES_TRY && ptr == MAP_FAILED)
556 elog(DEBUG1, "mmap(%zu) with MAP_HUGETLB failed, huge pages disabled: %m",
557 allocsize);
558 }
559#endif
560
561 if (ptr == MAP_FAILED && huge_pages != HUGE_PAGES_ON)
562 {
563 /*
564 * Use the original size, not the rounded-up value, when falling back
565 * to non-huge pages.
566 */
567 allocsize = *size;
568 ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
569 PG_MMAP_FLAGS, -1, 0);
570 mmap_errno = errno;
571 }
572
573 if (ptr == MAP_FAILED)
574 {
575 errno = mmap_errno;
576 ereport(FATAL,
577 (errmsg("could not map anonymous shared memory: %m"),
578 (mmap_errno == ENOMEM) ?
579 errhint("This error usually means that PostgreSQL's request "
580 "for a shared memory segment exceeded available memory, "
581 "swap space, or huge pages. To reduce the request size "
582 "(currently %zu bytes), reduce PostgreSQL's shared "
583 "memory usage, perhaps by reducing shared_buffers or "
584 "max_connections.",
585 *size) : 0));
586 }
587
588 *size = allocsize;
589 return ptr;
590}
591
592/*
593 * AnonymousShmemDetach --- detach from an anonymous mmap'd block
594 * (called as an on_shmem_exit callback, hence funny argument list)
595 */
596static void
597AnonymousShmemDetach(int status, Datum arg)
598{
599 /* Release anonymous shared memory block, if any. */
600 if (AnonymousShmem != NULL)
601 {
602 if (munmap(AnonymousShmem, AnonymousShmemSize) < 0)
603 elog(LOG, "munmap(%p, %zu) failed: %m",
604 AnonymousShmem, AnonymousShmemSize);
605 AnonymousShmem = NULL;
606 }
607}
608
609/*
610 * PGSharedMemoryCreate
611 *
612 * Create a shared memory segment of the given size and initialize its
613 * standard header. Also, register an on_shmem_exit callback to release
614 * the storage.
615 *
616 * Dead Postgres segments pertinent to this DataDir are recycled if found, but
617 * we do not fail upon collision with foreign shmem segments. The idea here
618 * is to detect and re-use keys that may have been assigned by a crashed
619 * postmaster or backend.
620 *
621 * The port number is passed for possible use as a key (for SysV, we use
622 * it to generate the starting shmem key).
623 */
624PGShmemHeader *
625PGSharedMemoryCreate(Size size, int port,
626 PGShmemHeader **shim)
627{
628 IpcMemoryKey NextShmemSegID;
629 void *memAddress;
630 PGShmemHeader *hdr;
631 struct stat statbuf;
632 Size sysvsize;
633
634 /* Complain if hugepages demanded but we can't possibly support them */
635#if !defined(MAP_HUGETLB)
636 if (huge_pages == HUGE_PAGES_ON)
637 ereport(ERROR,
638 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
639 errmsg("huge pages not supported on this platform")));
640#endif
641
642 /* Room for a header? */
643 Assert(size > MAXALIGN(sizeof(PGShmemHeader)));
644
645 if (shared_memory_type == SHMEM_TYPE_MMAP)
646 {
647 AnonymousShmem = CreateAnonymousSegment(&size);
648 AnonymousShmemSize = size;
649
650 /* Register on-exit routine to unmap the anonymous segment */
651 on_shmem_exit(AnonymousShmemDetach, (Datum) 0);
652
653 /* Now we need only allocate a minimal-sized SysV shmem block. */
654 sysvsize = sizeof(PGShmemHeader);
655 }
656 else
657 sysvsize = size;
658
659 /*
660 * Loop till we find a free IPC key. Trust CreateDataDirLockFile() to
661 * ensure no more than one postmaster per data directory can enter this
662 * loop simultaneously. (CreateDataDirLockFile() does not ensure that,
663 * but prefer fixing it over coping here.)
664 */
665 NextShmemSegID = 1 + port * 1000;
666
667 for (;;)
668 {
669 IpcMemoryId shmid;
670 PGShmemHeader *oldhdr;
671 IpcMemoryState state;
672
673 /* Try to create new segment */
674 memAddress = InternalIpcMemoryCreate(NextShmemSegID, sysvsize);
675 if (memAddress)
676 break; /* successful create and attach */
677
678 /* Check shared memory and possibly remove and recreate */
679
680 /*
681 * shmget() failure is typically EACCES, hence SHMSTATE_FOREIGN.
682 * ENOENT, a narrow possibility, implies SHMSTATE_ENOENT, but one can
683 * safely treat SHMSTATE_ENOENT like SHMSTATE_FOREIGN.
684 */
685 shmid = shmget(NextShmemSegID, sizeof(PGShmemHeader), 0);
686 if (shmid < 0)
687 {
688 oldhdr = NULL;
689 state = SHMSTATE_FOREIGN;
690 }
691 else
692 state = PGSharedMemoryAttach(shmid, NULL, &oldhdr);
693
694 switch (state)
695 {
696 case SHMSTATE_ANALYSIS_FAILURE:
697 case SHMSTATE_ATTACHED:
698 ereport(FATAL,
699 (errcode(ERRCODE_LOCK_FILE_EXISTS),
700 errmsg("pre-existing shared memory block (key %lu, ID %lu) is still in use",
701 (unsigned long) NextShmemSegID,
702 (unsigned long) shmid),
703 errhint("Terminate any old server processes associated with data directory \"%s\".",
704 DataDir)));
705 break;
706 case SHMSTATE_ENOENT:
707
708 /*
709 * To our surprise, some other process deleted since our last
710 * InternalIpcMemoryCreate(). Moments earlier, we would have
711 * seen SHMSTATE_FOREIGN. Try that same ID again.
712 */
713 elog(LOG,
714 "shared memory block (key %lu, ID %lu) deleted during startup",
715 (unsigned long) NextShmemSegID,
716 (unsigned long) shmid);
717 break;
718 case SHMSTATE_FOREIGN:
719 NextShmemSegID++;
720 break;
721 case SHMSTATE_UNATTACHED:
722
723 /*
724 * The segment pertains to DataDir, and every process that had
725 * used it has died or detached. Zap it, if possible, and any
726 * associated dynamic shared memory segments, as well. This
727 * shouldn't fail, but if it does, assume the segment belongs
728 * to someone else after all, and try the next candidate.
729 * Otherwise, try again to create the segment. That may fail
730 * if some other process creates the same shmem key before we
731 * do, in which case we'll try the next key.
732 */
733 if (oldhdr->dsm_control != 0)
734 dsm_cleanup_using_control_segment(oldhdr->dsm_control);
735 if (shmctl(shmid, IPC_RMID, NULL) < 0)
736 NextShmemSegID++;
737 break;
738 }
739
740 if (oldhdr && shmdt(oldhdr) < 0)
741 elog(LOG, "shmdt(%p) failed: %m", oldhdr);
742 }
743
744 /* Initialize new segment. */
745 hdr = (PGShmemHeader *) memAddress;
746 hdr->creatorPID = getpid();
747 hdr->magic = PGShmemMagic;
748 hdr->dsm_control = 0;
749
750 /* Fill in the data directory ID info, too */
751 if (stat(DataDir, &statbuf) < 0)
752 ereport(FATAL,
753 (errcode_for_file_access(),
754 errmsg("could not stat data directory \"%s\": %m",
755 DataDir)));
756 hdr->device = statbuf.st_dev;
757 hdr->inode = statbuf.st_ino;
758
759 /*
760 * Initialize space allocation status for segment.
761 */
762 hdr->totalsize = size;
763 hdr->freeoffset = MAXALIGN(sizeof(PGShmemHeader));
764 *shim = hdr;
765
766 /* Save info for possible future use */
767 UsedShmemSegAddr = memAddress;
768 UsedShmemSegID = (unsigned long) NextShmemSegID;
769
770 /*
771 * If AnonymousShmem is NULL here, then we're not using anonymous shared
772 * memory, and should return a pointer to the System V shared memory
773 * block. Otherwise, the System V shared memory block is only a shim, and
774 * we must return a pointer to the real block.
775 */
776 if (AnonymousShmem == NULL)
777 return hdr;
778 memcpy(AnonymousShmem, hdr, sizeof(PGShmemHeader));
779 return (PGShmemHeader *) AnonymousShmem;
780}
781
782#ifdef EXEC_BACKEND
783
784/*
785 * PGSharedMemoryReAttach
786 *
787 * This is called during startup of a postmaster child process to re-attach to
788 * an already existing shared memory segment. This is needed only in the
789 * EXEC_BACKEND case; otherwise postmaster children inherit the shared memory
790 * segment attachment via fork().
791 *
792 * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this
793 * routine. The caller must have already restored them to the postmaster's
794 * values.
795 */
796void
797PGSharedMemoryReAttach(void)
798{
799 IpcMemoryId shmid;
800 PGShmemHeader *hdr;
801 IpcMemoryState state;
802 void *origUsedShmemSegAddr = UsedShmemSegAddr;
803
804 Assert(UsedShmemSegAddr != NULL);
805 Assert(IsUnderPostmaster);
806
807#ifdef __CYGWIN__
808 /* cygipc (currently) appears to not detach on exec. */
809 PGSharedMemoryDetach();
810 UsedShmemSegAddr = origUsedShmemSegAddr;
811#endif
812
813 elog(DEBUG3, "attaching to %p", UsedShmemSegAddr);
814 shmid = shmget(UsedShmemSegID, sizeof(PGShmemHeader), 0);
815 if (shmid < 0)
816 state = SHMSTATE_FOREIGN;
817 else
818 state = PGSharedMemoryAttach(shmid, UsedShmemSegAddr, &hdr);
819 if (state != SHMSTATE_ATTACHED)
820 elog(FATAL, "could not reattach to shared memory (key=%d, addr=%p): %m",
821 (int) UsedShmemSegID, UsedShmemSegAddr);
822 if (hdr != origUsedShmemSegAddr)
823 elog(FATAL, "reattaching to shared memory returned unexpected address (got %p, expected %p)",
824 hdr, origUsedShmemSegAddr);
825 dsm_set_control_handle(hdr->dsm_control);
826
827 UsedShmemSegAddr = hdr; /* probably redundant */
828}
829
830/*
831 * PGSharedMemoryNoReAttach
832 *
833 * This is called during startup of a postmaster child process when we choose
834 * *not* to re-attach to the existing shared memory segment. We must clean up
835 * to leave things in the appropriate state. This is not used in the non
836 * EXEC_BACKEND case, either.
837 *
838 * The child process startup logic might or might not call PGSharedMemoryDetach
839 * after this; make sure that it will be a no-op if called.
840 *
841 * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this
842 * routine. The caller must have already restored them to the postmaster's
843 * values.
844 */
845void
846PGSharedMemoryNoReAttach(void)
847{
848 Assert(UsedShmemSegAddr != NULL);
849 Assert(IsUnderPostmaster);
850
851#ifdef __CYGWIN__
852 /* cygipc (currently) appears to not detach on exec. */
853 PGSharedMemoryDetach();
854#endif
855
856 /* For cleanliness, reset UsedShmemSegAddr to show we're not attached. */
857 UsedShmemSegAddr = NULL;
858 /* And the same for UsedShmemSegID. */
859 UsedShmemSegID = 0;
860}
861
862#endif /* EXEC_BACKEND */
863
864/*
865 * PGSharedMemoryDetach
866 *
867 * Detach from the shared memory segment, if still attached. This is not
868 * intended to be called explicitly by the process that originally created the
869 * segment (it will have on_shmem_exit callback(s) registered to do that).
870 * Rather, this is for subprocesses that have inherited an attachment and want
871 * to get rid of it.
872 *
873 * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this
874 * routine, also AnonymousShmem and AnonymousShmemSize.
875 */
876void
877PGSharedMemoryDetach(void)
878{
879 if (UsedShmemSegAddr != NULL)
880 {
881 if ((shmdt(UsedShmemSegAddr) < 0)
882#if defined(EXEC_BACKEND) && defined(__CYGWIN__)
883 /* Work-around for cygipc exec bug */
884 && shmdt(NULL) < 0
885#endif
886 )
887 elog(LOG, "shmdt(%p) failed: %m", UsedShmemSegAddr);
888 UsedShmemSegAddr = NULL;
889 }
890
891 if (AnonymousShmem != NULL)
892 {
893 if (munmap(AnonymousShmem, AnonymousShmemSize) < 0)
894 elog(LOG, "munmap(%p, %zu) failed: %m",
895 AnonymousShmem, AnonymousShmemSize);
896 AnonymousShmem = NULL;
897 }
898}
899