1 | /*------------------------------------------------------------------------- |
2 | * |
3 | * sysv_shmem.c |
4 | * Implement shared memory using SysV facilities |
5 | * |
6 | * These routines used to be a fairly thin layer on top of SysV shared |
7 | * memory functionality. With the addition of anonymous-shmem logic, |
8 | * they're a bit fatter now. We still require a SysV shmem block to |
9 | * exist, though, because mmap'd shmem provides no way to find out how |
10 | * many processes are attached, which we need for interlocking purposes. |
11 | * |
12 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
13 | * Portions Copyright (c) 1994, Regents of the University of California |
14 | * |
15 | * IDENTIFICATION |
16 | * src/backend/port/sysv_shmem.c |
17 | * |
18 | *------------------------------------------------------------------------- |
19 | */ |
20 | #include "postgres.h" |
21 | |
22 | #include <signal.h> |
23 | #include <unistd.h> |
24 | #include <sys/file.h> |
25 | #include <sys/mman.h> |
26 | #include <sys/stat.h> |
27 | #ifdef HAVE_SYS_IPC_H |
28 | #include <sys/ipc.h> |
29 | #endif |
30 | #ifdef HAVE_SYS_SHM_H |
31 | #include <sys/shm.h> |
32 | #endif |
33 | |
34 | #include "miscadmin.h" |
35 | #include "portability/mem.h" |
36 | #include "storage/dsm.h" |
37 | #include "storage/fd.h" |
38 | #include "storage/ipc.h" |
39 | #include "storage/pg_shmem.h" |
40 | #include "utils/guc.h" |
41 | #include "utils/pidfile.h" |
42 | |
43 | |
44 | /* |
45 | * As of PostgreSQL 9.3, we normally allocate only a very small amount of |
46 | * System V shared memory, and only for the purposes of providing an |
47 | * interlock to protect the data directory. The real shared memory block |
48 | * is allocated using mmap(). This works around the problem that many |
49 | * systems have very low limits on the amount of System V shared memory |
50 | * that can be allocated. Even a limit of a few megabytes will be enough |
51 | * to run many copies of PostgreSQL without needing to adjust system settings. |
52 | * |
53 | * We assume that no one will attempt to run PostgreSQL 9.3 or later on |
54 | * systems that are ancient enough that anonymous shared memory is not |
55 | * supported, such as pre-2.4 versions of Linux. If that turns out to be |
56 | * false, we might need to add compile and/or run-time tests here and do this |
57 | * only if the running kernel supports it. |
58 | * |
59 | * However, we must always disable this logic in the EXEC_BACKEND case, and |
60 | * fall back to the old method of allocating the entire segment using System V |
61 | * shared memory, because there's no way to attach an anonymous mmap'd segment |
62 | * to a process after exec(). Since EXEC_BACKEND is intended only for |
63 | * developer use, this shouldn't be a big problem. Because of this, we do |
64 | * not worry about supporting anonymous shmem in the EXEC_BACKEND cases below. |
65 | * |
66 | * As of PostgreSQL 12, we regained the ability to use a large System V shared |
67 | * memory region even in non-EXEC_BACKEND builds, if shared_memory_type is set |
68 | * to sysv (though this is not the default). |
69 | */ |
70 | |
71 | |
/*
 * Local aliases for the two System V identifiers this file deals in: the
 * key we hand to shmget(2), and the segment ID it gives back.
 */
typedef key_t IpcMemoryKey;		/* shared memory key passed to shmget(2) */
typedef int IpcMemoryId;		/* shared memory ID returned by shmget(2) */
74 | |
/*
 * How does a given IpcMemoryId relate to this PostgreSQL process?
 *
 * PGSharedMemoryAttach() reports one of these states for the segment ID it
 * is asked to probe.
 *
 * One could recycle unattached segments of different data directories if we
 * distinguished that case from other SHMSTATE_FOREIGN cases.  Doing so would
 * cause us to visit less of the key space, making us less likely to detect a
 * SHMSTATE_ATTACHED key.  It would also complicate the concurrency analysis,
 * in that postmasters of different data directories could simultaneously
 * attempt to recycle a given key.  We'll waste keys longer in some cases, but
 * avoiding the problems of the alternative justifies that loss.
 */
typedef enum
{
	SHMSTATE_ANALYSIS_FAILURE,	/* unexpected failure to analyze the ID */
	SHMSTATE_ATTACHED,			/* pertinent to DataDir, has attached PIDs */
	SHMSTATE_ENOENT,			/* no segment of that ID */
	SHMSTATE_FOREIGN,			/* exists, but not pertinent to DataDir */
	SHMSTATE_UNATTACHED			/* pertinent to DataDir, no attached PIDs */
} IpcMemoryState;
94 | |
95 | |
/*
 * Key and attach address of the System V segment this process is using.
 * PGSharedMemoryCreate() fills both in; PGSharedMemoryDetach() resets the
 * address to NULL once the segment has been detached.
 */
unsigned long UsedShmemSegID = 0;
void	   *UsedShmemSegAddr = NULL;

/*
 * Anonymous mmap'd region and its length.  These are set by
 * PGSharedMemoryCreate() when shared_memory_type is SHMEM_TYPE_MMAP;
 * AnonymousShmem remains NULL when no anonymous region is in use.
 */
static Size AnonymousShmemSize;
static void *AnonymousShmem = NULL;

/* Forward declarations for file-local routines defined further down */
static void *InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size);
static void IpcMemoryDetach(int status, Datum shmaddr);
static void IpcMemoryDelete(int status, Datum shmId);
static IpcMemoryState PGSharedMemoryAttach(IpcMemoryId shmId,
										   void *attachAt,
										   PGShmemHeader **addr);
108 | |
109 | |
110 | /* |
111 | * InternalIpcMemoryCreate(memKey, size) |
112 | * |
113 | * Attempt to create a new shared memory segment with the specified key. |
114 | * Will fail (return NULL) if such a segment already exists. If successful, |
115 | * attach the segment to the current process and return its attached address. |
116 | * On success, callbacks are registered with on_shmem_exit to detach and |
117 | * delete the segment when on_shmem_exit is called. |
118 | * |
119 | * If we fail with a failure code other than collision-with-existing-segment, |
120 | * print out an error and abort. Other types of errors are not recoverable. |
121 | */ |
122 | static void * |
123 | InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size) |
124 | { |
125 | IpcMemoryId shmid; |
126 | void *requestedAddress = NULL; |
127 | void *memAddress; |
128 | |
129 | /* |
130 | * Normally we just pass requestedAddress = NULL to shmat(), allowing the |
131 | * system to choose where the segment gets mapped. But in an EXEC_BACKEND |
132 | * build, it's possible for whatever is chosen in the postmaster to not |
133 | * work for backends, due to variations in address space layout. As a |
134 | * rather klugy workaround, allow the user to specify the address to use |
135 | * via setting the environment variable PG_SHMEM_ADDR. (If this were of |
136 | * interest for anything except debugging, we'd probably create a cleaner |
137 | * and better-documented way to set it, such as a GUC.) |
138 | */ |
139 | #ifdef EXEC_BACKEND |
140 | { |
141 | char *pg_shmem_addr = getenv("PG_SHMEM_ADDR" ); |
142 | |
143 | if (pg_shmem_addr) |
144 | requestedAddress = (void *) strtoul(pg_shmem_addr, NULL, 0); |
145 | } |
146 | #endif |
147 | |
148 | shmid = shmget(memKey, size, IPC_CREAT | IPC_EXCL | IPCProtection); |
149 | |
150 | if (shmid < 0) |
151 | { |
152 | int shmget_errno = errno; |
153 | |
154 | /* |
155 | * Fail quietly if error indicates a collision with existing segment. |
156 | * One would expect EEXIST, given that we said IPC_EXCL, but perhaps |
157 | * we could get a permission violation instead? Also, EIDRM might |
158 | * occur if an old seg is slated for destruction but not gone yet. |
159 | */ |
160 | if (shmget_errno == EEXIST || shmget_errno == EACCES |
161 | #ifdef EIDRM |
162 | || shmget_errno == EIDRM |
163 | #endif |
164 | ) |
165 | return NULL; |
166 | |
167 | /* |
168 | * Some BSD-derived kernels are known to return EINVAL, not EEXIST, if |
169 | * there is an existing segment but it's smaller than "size" (this is |
170 | * a result of poorly-thought-out ordering of error tests). To |
171 | * distinguish between collision and invalid size in such cases, we |
172 | * make a second try with size = 0. These kernels do not test size |
173 | * against SHMMIN in the preexisting-segment case, so we will not get |
174 | * EINVAL a second time if there is such a segment. |
175 | */ |
176 | if (shmget_errno == EINVAL) |
177 | { |
178 | shmid = shmget(memKey, 0, IPC_CREAT | IPC_EXCL | IPCProtection); |
179 | |
180 | if (shmid < 0) |
181 | { |
182 | /* As above, fail quietly if we verify a collision */ |
183 | if (errno == EEXIST || errno == EACCES |
184 | #ifdef EIDRM |
185 | || errno == EIDRM |
186 | #endif |
187 | ) |
188 | return NULL; |
189 | /* Otherwise, fall through to report the original error */ |
190 | } |
191 | else |
192 | { |
193 | /* |
194 | * On most platforms we cannot get here because SHMMIN is |
195 | * greater than zero. However, if we do succeed in creating a |
196 | * zero-size segment, free it and then fall through to report |
197 | * the original error. |
198 | */ |
199 | if (shmctl(shmid, IPC_RMID, NULL) < 0) |
200 | elog(LOG, "shmctl(%d, %d, 0) failed: %m" , |
201 | (int) shmid, IPC_RMID); |
202 | } |
203 | } |
204 | |
205 | /* |
206 | * Else complain and abort. |
207 | * |
208 | * Note: at this point EINVAL should mean that either SHMMIN or SHMMAX |
209 | * is violated. SHMALL violation might be reported as either ENOMEM |
210 | * (BSDen) or ENOSPC (Linux); the Single Unix Spec fails to say which |
211 | * it should be. SHMMNI violation is ENOSPC, per spec. Just plain |
212 | * not-enough-RAM is ENOMEM. |
213 | */ |
214 | errno = shmget_errno; |
215 | ereport(FATAL, |
216 | (errmsg("could not create shared memory segment: %m" ), |
217 | errdetail("Failed system call was shmget(key=%lu, size=%zu, 0%o)." , |
218 | (unsigned long) memKey, size, |
219 | IPC_CREAT | IPC_EXCL | IPCProtection), |
220 | (shmget_errno == EINVAL) ? |
221 | errhint("This error usually means that PostgreSQL's request for a shared memory " |
222 | "segment exceeded your kernel's SHMMAX parameter, or possibly that " |
223 | "it is less than " |
224 | "your kernel's SHMMIN parameter.\n" |
225 | "The PostgreSQL documentation contains more information about shared " |
226 | "memory configuration." ) : 0, |
227 | (shmget_errno == ENOMEM) ? |
228 | errhint("This error usually means that PostgreSQL's request for a shared " |
229 | "memory segment exceeded your kernel's SHMALL parameter. You might need " |
230 | "to reconfigure the kernel with larger SHMALL.\n" |
231 | "The PostgreSQL documentation contains more information about shared " |
232 | "memory configuration." ) : 0, |
233 | (shmget_errno == ENOSPC) ? |
234 | errhint("This error does *not* mean that you have run out of disk space. " |
235 | "It occurs either if all available shared memory IDs have been taken, " |
236 | "in which case you need to raise the SHMMNI parameter in your kernel, " |
237 | "or because the system's overall limit for shared memory has been " |
238 | "reached.\n" |
239 | "The PostgreSQL documentation contains more information about shared " |
240 | "memory configuration." ) : 0)); |
241 | } |
242 | |
243 | /* Register on-exit routine to delete the new segment */ |
244 | on_shmem_exit(IpcMemoryDelete, Int32GetDatum(shmid)); |
245 | |
246 | /* OK, should be able to attach to the segment */ |
247 | memAddress = shmat(shmid, requestedAddress, PG_SHMAT_FLAGS); |
248 | |
249 | if (memAddress == (void *) -1) |
250 | elog(FATAL, "shmat(id=%d, addr=%p, flags=0x%x) failed: %m" , |
251 | shmid, requestedAddress, PG_SHMAT_FLAGS); |
252 | |
253 | /* Register on-exit routine to detach new segment before deleting */ |
254 | on_shmem_exit(IpcMemoryDetach, PointerGetDatum(memAddress)); |
255 | |
256 | /* |
257 | * Store shmem key and ID in data directory lockfile. Format to try to |
258 | * keep it the same length always (trailing junk in the lockfile won't |
259 | * hurt, but might confuse humans). |
260 | */ |
261 | { |
262 | char line[64]; |
263 | |
264 | sprintf(line, "%9lu %9lu" , |
265 | (unsigned long) memKey, (unsigned long) shmid); |
266 | AddToDataDirLockFile(LOCK_FILE_LINE_SHMEM_KEY, line); |
267 | } |
268 | |
269 | return memAddress; |
270 | } |
271 | |
272 | /****************************************************************************/ |
273 | /* IpcMemoryDetach(status, shmaddr) removes a shared memory segment */ |
274 | /* from process' address space */ |
275 | /* (called as an on_shmem_exit callback, hence funny argument list) */ |
276 | /****************************************************************************/ |
277 | static void |
278 | IpcMemoryDetach(int status, Datum shmaddr) |
279 | { |
280 | /* Detach System V shared memory block. */ |
281 | if (shmdt(DatumGetPointer(shmaddr)) < 0) |
282 | elog(LOG, "shmdt(%p) failed: %m" , DatumGetPointer(shmaddr)); |
283 | } |
284 | |
285 | /****************************************************************************/ |
286 | /* IpcMemoryDelete(status, shmId) deletes a shared memory segment */ |
287 | /* (called as an on_shmem_exit callback, hence funny argument list) */ |
288 | /****************************************************************************/ |
289 | static void |
290 | IpcMemoryDelete(int status, Datum shmId) |
291 | { |
292 | if (shmctl(DatumGetInt32(shmId), IPC_RMID, NULL) < 0) |
293 | elog(LOG, "shmctl(%d, %d, 0) failed: %m" , |
294 | DatumGetInt32(shmId), IPC_RMID); |
295 | } |
296 | |
297 | /* |
298 | * PGSharedMemoryIsInUse |
299 | * |
300 | * Is a previously-existing shmem segment still existing and in use? |
301 | * |
302 | * The point of this exercise is to detect the case where a prior postmaster |
303 | * crashed, but it left child backends that are still running. Therefore |
304 | * we only care about shmem segments that are associated with the intended |
305 | * DataDir. This is an important consideration since accidental matches of |
306 | * shmem segment IDs are reasonably common. |
307 | */ |
308 | bool |
309 | PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2) |
310 | { |
311 | PGShmemHeader *memAddress; |
312 | IpcMemoryState state; |
313 | |
314 | state = PGSharedMemoryAttach((IpcMemoryId) id2, NULL, &memAddress); |
315 | if (memAddress && shmdt(memAddress) < 0) |
316 | elog(LOG, "shmdt(%p) failed: %m" , memAddress); |
317 | switch (state) |
318 | { |
319 | case SHMSTATE_ENOENT: |
320 | case SHMSTATE_FOREIGN: |
321 | case SHMSTATE_UNATTACHED: |
322 | return false; |
323 | case SHMSTATE_ANALYSIS_FAILURE: |
324 | case SHMSTATE_ATTACHED: |
325 | return true; |
326 | } |
327 | return true; |
328 | } |
329 | |
330 | /* |
331 | * Test for a segment with id shmId; see comment at IpcMemoryState. |
332 | * |
333 | * If the segment exists, we'll attempt to attach to it, using attachAt |
334 | * if that's not NULL (but it's best to pass NULL if possible). |
335 | * |
336 | * *addr is set to the segment memory address if we attached to it, else NULL. |
337 | */ |
338 | static IpcMemoryState |
339 | PGSharedMemoryAttach(IpcMemoryId shmId, |
340 | void *attachAt, |
341 | PGShmemHeader **addr) |
342 | { |
343 | struct shmid_ds shmStat; |
344 | struct stat statbuf; |
345 | PGShmemHeader *hdr; |
346 | |
347 | *addr = NULL; |
348 | |
349 | /* |
350 | * First, try to stat the shm segment ID, to see if it exists at all. |
351 | */ |
352 | if (shmctl(shmId, IPC_STAT, &shmStat) < 0) |
353 | { |
354 | /* |
355 | * EINVAL actually has multiple possible causes documented in the |
356 | * shmctl man page, but we assume it must mean the segment no longer |
357 | * exists. |
358 | */ |
359 | if (errno == EINVAL) |
360 | return SHMSTATE_ENOENT; |
361 | |
362 | /* |
363 | * EACCES implies we have no read permission, which means it is not a |
364 | * Postgres shmem segment (or at least, not one that is relevant to |
365 | * our data directory). |
366 | */ |
367 | if (errno == EACCES) |
368 | return SHMSTATE_FOREIGN; |
369 | |
370 | /* |
371 | * Some Linux kernel versions (in fact, all of them as of July 2007) |
372 | * sometimes return EIDRM when EINVAL is correct. The Linux kernel |
373 | * actually does not have any internal state that would justify |
374 | * returning EIDRM, so we can get away with assuming that EIDRM is |
375 | * equivalent to EINVAL on that platform. |
376 | */ |
377 | #ifdef HAVE_LINUX_EIDRM_BUG |
378 | if (errno == EIDRM) |
379 | return SHMSTATE_ENOENT; |
380 | #endif |
381 | |
382 | /* |
383 | * Otherwise, we had better assume that the segment is in use. The |
384 | * only likely case is (non-Linux, assumed spec-compliant) EIDRM, |
385 | * which implies that the segment has been IPC_RMID'd but there are |
386 | * still processes attached to it. |
387 | */ |
388 | return SHMSTATE_ANALYSIS_FAILURE; |
389 | } |
390 | |
391 | /* |
392 | * Try to attach to the segment and see if it matches our data directory. |
393 | * This avoids key-conflict problems on machines that are running several |
394 | * postmasters under the same userid and port number. (That would not |
395 | * ordinarily happen in production, but it can happen during parallel |
396 | * testing. Since our test setups don't open any TCP ports on Unix, such |
397 | * cases don't conflict otherwise.) |
398 | */ |
399 | if (stat(DataDir, &statbuf) < 0) |
400 | return SHMSTATE_ANALYSIS_FAILURE; /* can't stat; be conservative */ |
401 | |
402 | hdr = (PGShmemHeader *) shmat(shmId, attachAt, PG_SHMAT_FLAGS); |
403 | if (hdr == (PGShmemHeader *) -1) |
404 | { |
405 | /* |
406 | * Attachment failed. The cases we're interested in are the same as |
407 | * for the shmctl() call above. In particular, note that the owning |
408 | * postmaster could have terminated and removed the segment between |
409 | * shmctl() and shmat(). |
410 | * |
411 | * If attachAt isn't NULL, it's possible that EINVAL reflects a |
412 | * problem with that address not a vanished segment, so it's best to |
413 | * pass NULL when probing for conflicting segments. |
414 | */ |
415 | if (errno == EINVAL) |
416 | return SHMSTATE_ENOENT; /* segment disappeared */ |
417 | if (errno == EACCES) |
418 | return SHMSTATE_FOREIGN; /* must be non-Postgres */ |
419 | #ifdef HAVE_LINUX_EIDRM_BUG |
420 | if (errno == EIDRM) |
421 | return SHMSTATE_ENOENT; /* segment disappeared */ |
422 | #endif |
423 | /* Otherwise, be conservative. */ |
424 | return SHMSTATE_ANALYSIS_FAILURE; |
425 | } |
426 | *addr = hdr; |
427 | |
428 | if (hdr->magic != PGShmemMagic || |
429 | hdr->device != statbuf.st_dev || |
430 | hdr->inode != statbuf.st_ino) |
431 | { |
432 | /* |
433 | * It's either not a Postgres segment, or not one for my data |
434 | * directory. |
435 | */ |
436 | return SHMSTATE_FOREIGN; |
437 | } |
438 | |
439 | /* |
440 | * It does match our data directory, so now test whether any processes are |
441 | * still attached to it. (We are, now, but the shm_nattch result is from |
442 | * before we attached to it.) |
443 | */ |
444 | return shmStat.shm_nattch == 0 ? SHMSTATE_UNATTACHED : SHMSTATE_ATTACHED; |
445 | } |
446 | |
#ifdef MAP_HUGETLB

/*
 * Identify the huge page size to use.
 *
 * Some Linux kernel versions have a bug causing mmap() to fail on requests
 * that are not a multiple of the hugepage size.  Versions without that bug
 * instead silently round the request up to the next hugepage multiple ---
 * and then munmap() fails when we give it a size different from that.
 * So we have to round our request up to a multiple of the actual hugepage
 * size to avoid trouble.
 *
 * Doing the round-up ourselves also lets us make use of the extra memory,
 * rather than just wasting it.  Currently, we just increase the available
 * space recorded in the shmem header, which will make the extra usable for
 * purposes such as additional locktable entries.  Someday, for very large
 * hugepage sizes, we might want to think about more invasive strategies,
 * such as increasing shared_buffers to absorb the extra space.
 *
 * Returns the (real or assumed) page size into *hugepagesize,
 * and the hugepage-related mmap flags to use into *mmap_flags.
 * The size stored in *hugepagesize is never zero, so callers may safely
 * divide or take a modulus by it.
 *
 * Currently *mmap_flags is always just MAP_HUGETLB.  Someday, on systems
 * that support it, we might OR in additional bits to specify a particular
 * non-default huge page size.
 */
static void
GetHugePageSize(Size *hugepagesize, int *mmap_flags)
{
	/*
	 * If we fail to find out the system's default huge page size, assume it
	 * is 2MB.  This will work fine when the actual size is less.  If it's
	 * more, we might get mmap() or munmap() failures due to unaligned
	 * requests; but at this writing, there are no reports of any non-Linux
	 * systems being picky about that.
	 */
	*hugepagesize = 2 * 1024 * 1024;
	*mmap_flags = MAP_HUGETLB;

	/*
	 * System-dependent code to find out the default huge page size.
	 *
	 * On Linux, read /proc/meminfo looking for a line like "Hugepagesize:
	 * nnnn kB".  Ignore any failures, falling back to the preset default.
	 */
#ifdef __linux__
	{
		FILE	   *fp = AllocateFile("/proc/meminfo", "r");
		char		buf[128];
		unsigned int sz;
		char		ch;

		if (fp)
		{
			while (fgets(buf, sizeof(buf), fp))
			{
				if (sscanf(buf, "Hugepagesize: %u %c", &sz, &ch) == 2)
				{
					/*
					 * Accept only a nonzero size.  A reported size of zero
					 * is treated like any other parse failure, because the
					 * caller rounds its request using "% hugepagesize" and
					 * a zero here would make that a division by zero.
					 */
					if (ch == 'k' && sz > 0)
					{
						*hugepagesize = sz * (Size) 1024;
						break;
					}
					/* We could accept other units besides kB, if needed */
				}
			}
			FreeFile(fp);
		}
	}
#endif							/* __linux__ */
}

#endif							/* MAP_HUGETLB */
520 | |
521 | /* |
522 | * Creates an anonymous mmap()ed shared memory segment. |
523 | * |
524 | * Pass the requested size in *size. This function will modify *size to the |
525 | * actual size of the allocation, if it ends up allocating a segment that is |
526 | * larger than requested. |
527 | */ |
528 | static void * |
529 | CreateAnonymousSegment(Size *size) |
530 | { |
531 | Size allocsize = *size; |
532 | void *ptr = MAP_FAILED; |
533 | int mmap_errno = 0; |
534 | |
535 | #ifndef MAP_HUGETLB |
536 | /* PGSharedMemoryCreate should have dealt with this case */ |
537 | Assert(huge_pages != HUGE_PAGES_ON); |
538 | #else |
539 | if (huge_pages == HUGE_PAGES_ON || huge_pages == HUGE_PAGES_TRY) |
540 | { |
541 | /* |
542 | * Round up the request size to a suitable large value. |
543 | */ |
544 | Size hugepagesize; |
545 | int mmap_flags; |
546 | |
547 | GetHugePageSize(&hugepagesize, &mmap_flags); |
548 | |
549 | if (allocsize % hugepagesize != 0) |
550 | allocsize += hugepagesize - (allocsize % hugepagesize); |
551 | |
552 | ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE, |
553 | PG_MMAP_FLAGS | mmap_flags, -1, 0); |
554 | mmap_errno = errno; |
555 | if (huge_pages == HUGE_PAGES_TRY && ptr == MAP_FAILED) |
556 | elog(DEBUG1, "mmap(%zu) with MAP_HUGETLB failed, huge pages disabled: %m" , |
557 | allocsize); |
558 | } |
559 | #endif |
560 | |
561 | if (ptr == MAP_FAILED && huge_pages != HUGE_PAGES_ON) |
562 | { |
563 | /* |
564 | * Use the original size, not the rounded-up value, when falling back |
565 | * to non-huge pages. |
566 | */ |
567 | allocsize = *size; |
568 | ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE, |
569 | PG_MMAP_FLAGS, -1, 0); |
570 | mmap_errno = errno; |
571 | } |
572 | |
573 | if (ptr == MAP_FAILED) |
574 | { |
575 | errno = mmap_errno; |
576 | ereport(FATAL, |
577 | (errmsg("could not map anonymous shared memory: %m" ), |
578 | (mmap_errno == ENOMEM) ? |
579 | errhint("This error usually means that PostgreSQL's request " |
580 | "for a shared memory segment exceeded available memory, " |
581 | "swap space, or huge pages. To reduce the request size " |
582 | "(currently %zu bytes), reduce PostgreSQL's shared " |
583 | "memory usage, perhaps by reducing shared_buffers or " |
584 | "max_connections." , |
585 | *size) : 0)); |
586 | } |
587 | |
588 | *size = allocsize; |
589 | return ptr; |
590 | } |
591 | |
592 | /* |
593 | * AnonymousShmemDetach --- detach from an anonymous mmap'd block |
594 | * (called as an on_shmem_exit callback, hence funny argument list) |
595 | */ |
596 | static void |
597 | AnonymousShmemDetach(int status, Datum arg) |
598 | { |
599 | /* Release anonymous shared memory block, if any. */ |
600 | if (AnonymousShmem != NULL) |
601 | { |
602 | if (munmap(AnonymousShmem, AnonymousShmemSize) < 0) |
603 | elog(LOG, "munmap(%p, %zu) failed: %m" , |
604 | AnonymousShmem, AnonymousShmemSize); |
605 | AnonymousShmem = NULL; |
606 | } |
607 | } |
608 | |
609 | /* |
610 | * PGSharedMemoryCreate |
611 | * |
612 | * Create a shared memory segment of the given size and initialize its |
613 | * standard header. Also, register an on_shmem_exit callback to release |
614 | * the storage. |
615 | * |
616 | * Dead Postgres segments pertinent to this DataDir are recycled if found, but |
617 | * we do not fail upon collision with foreign shmem segments. The idea here |
618 | * is to detect and re-use keys that may have been assigned by a crashed |
619 | * postmaster or backend. |
620 | * |
621 | * The port number is passed for possible use as a key (for SysV, we use |
622 | * it to generate the starting shmem key). |
623 | */ |
624 | PGShmemHeader * |
625 | PGSharedMemoryCreate(Size size, int port, |
626 | PGShmemHeader **shim) |
627 | { |
628 | IpcMemoryKey NextShmemSegID; |
629 | void *memAddress; |
630 | PGShmemHeader *hdr; |
631 | struct stat statbuf; |
632 | Size sysvsize; |
633 | |
634 | /* Complain if hugepages demanded but we can't possibly support them */ |
635 | #if !defined(MAP_HUGETLB) |
636 | if (huge_pages == HUGE_PAGES_ON) |
637 | ereport(ERROR, |
638 | (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), |
639 | errmsg("huge pages not supported on this platform" ))); |
640 | #endif |
641 | |
642 | /* Room for a header? */ |
643 | Assert(size > MAXALIGN(sizeof(PGShmemHeader))); |
644 | |
645 | if (shared_memory_type == SHMEM_TYPE_MMAP) |
646 | { |
647 | AnonymousShmem = CreateAnonymousSegment(&size); |
648 | AnonymousShmemSize = size; |
649 | |
650 | /* Register on-exit routine to unmap the anonymous segment */ |
651 | on_shmem_exit(AnonymousShmemDetach, (Datum) 0); |
652 | |
653 | /* Now we need only allocate a minimal-sized SysV shmem block. */ |
654 | sysvsize = sizeof(PGShmemHeader); |
655 | } |
656 | else |
657 | sysvsize = size; |
658 | |
659 | /* |
660 | * Loop till we find a free IPC key. Trust CreateDataDirLockFile() to |
661 | * ensure no more than one postmaster per data directory can enter this |
662 | * loop simultaneously. (CreateDataDirLockFile() does not ensure that, |
663 | * but prefer fixing it over coping here.) |
664 | */ |
665 | NextShmemSegID = 1 + port * 1000; |
666 | |
667 | for (;;) |
668 | { |
669 | IpcMemoryId shmid; |
670 | PGShmemHeader *oldhdr; |
671 | IpcMemoryState state; |
672 | |
673 | /* Try to create new segment */ |
674 | memAddress = InternalIpcMemoryCreate(NextShmemSegID, sysvsize); |
675 | if (memAddress) |
676 | break; /* successful create and attach */ |
677 | |
678 | /* Check shared memory and possibly remove and recreate */ |
679 | |
680 | /* |
681 | * shmget() failure is typically EACCES, hence SHMSTATE_FOREIGN. |
682 | * ENOENT, a narrow possibility, implies SHMSTATE_ENOENT, but one can |
683 | * safely treat SHMSTATE_ENOENT like SHMSTATE_FOREIGN. |
684 | */ |
685 | shmid = shmget(NextShmemSegID, sizeof(PGShmemHeader), 0); |
686 | if (shmid < 0) |
687 | { |
688 | oldhdr = NULL; |
689 | state = SHMSTATE_FOREIGN; |
690 | } |
691 | else |
692 | state = PGSharedMemoryAttach(shmid, NULL, &oldhdr); |
693 | |
694 | switch (state) |
695 | { |
696 | case SHMSTATE_ANALYSIS_FAILURE: |
697 | case SHMSTATE_ATTACHED: |
698 | ereport(FATAL, |
699 | (errcode(ERRCODE_LOCK_FILE_EXISTS), |
700 | errmsg("pre-existing shared memory block (key %lu, ID %lu) is still in use" , |
701 | (unsigned long) NextShmemSegID, |
702 | (unsigned long) shmid), |
703 | errhint("Terminate any old server processes associated with data directory \"%s\"." , |
704 | DataDir))); |
705 | break; |
706 | case SHMSTATE_ENOENT: |
707 | |
708 | /* |
709 | * To our surprise, some other process deleted since our last |
710 | * InternalIpcMemoryCreate(). Moments earlier, we would have |
711 | * seen SHMSTATE_FOREIGN. Try that same ID again. |
712 | */ |
713 | elog(LOG, |
714 | "shared memory block (key %lu, ID %lu) deleted during startup" , |
715 | (unsigned long) NextShmemSegID, |
716 | (unsigned long) shmid); |
717 | break; |
718 | case SHMSTATE_FOREIGN: |
719 | NextShmemSegID++; |
720 | break; |
721 | case SHMSTATE_UNATTACHED: |
722 | |
723 | /* |
724 | * The segment pertains to DataDir, and every process that had |
725 | * used it has died or detached. Zap it, if possible, and any |
726 | * associated dynamic shared memory segments, as well. This |
727 | * shouldn't fail, but if it does, assume the segment belongs |
728 | * to someone else after all, and try the next candidate. |
729 | * Otherwise, try again to create the segment. That may fail |
730 | * if some other process creates the same shmem key before we |
731 | * do, in which case we'll try the next key. |
732 | */ |
733 | if (oldhdr->dsm_control != 0) |
734 | dsm_cleanup_using_control_segment(oldhdr->dsm_control); |
735 | if (shmctl(shmid, IPC_RMID, NULL) < 0) |
736 | NextShmemSegID++; |
737 | break; |
738 | } |
739 | |
740 | if (oldhdr && shmdt(oldhdr) < 0) |
741 | elog(LOG, "shmdt(%p) failed: %m" , oldhdr); |
742 | } |
743 | |
744 | /* Initialize new segment. */ |
745 | hdr = (PGShmemHeader *) memAddress; |
746 | hdr->creatorPID = getpid(); |
747 | hdr->magic = PGShmemMagic; |
748 | hdr->dsm_control = 0; |
749 | |
750 | /* Fill in the data directory ID info, too */ |
751 | if (stat(DataDir, &statbuf) < 0) |
752 | ereport(FATAL, |
753 | (errcode_for_file_access(), |
754 | errmsg("could not stat data directory \"%s\": %m" , |
755 | DataDir))); |
756 | hdr->device = statbuf.st_dev; |
757 | hdr->inode = statbuf.st_ino; |
758 | |
759 | /* |
760 | * Initialize space allocation status for segment. |
761 | */ |
762 | hdr->totalsize = size; |
763 | hdr->freeoffset = MAXALIGN(sizeof(PGShmemHeader)); |
764 | *shim = hdr; |
765 | |
766 | /* Save info for possible future use */ |
767 | UsedShmemSegAddr = memAddress; |
768 | UsedShmemSegID = (unsigned long) NextShmemSegID; |
769 | |
770 | /* |
771 | * If AnonymousShmem is NULL here, then we're not using anonymous shared |
772 | * memory, and should return a pointer to the System V shared memory |
773 | * block. Otherwise, the System V shared memory block is only a shim, and |
774 | * we must return a pointer to the real block. |
775 | */ |
776 | if (AnonymousShmem == NULL) |
777 | return hdr; |
778 | memcpy(AnonymousShmem, hdr, sizeof(PGShmemHeader)); |
779 | return (PGShmemHeader *) AnonymousShmem; |
780 | } |
781 | |
782 | #ifdef EXEC_BACKEND |
783 | |
784 | /* |
785 | * PGSharedMemoryReAttach |
786 | * |
787 | * This is called during startup of a postmaster child process to re-attach to |
788 | * an already existing shared memory segment. This is needed only in the |
789 | * EXEC_BACKEND case; otherwise postmaster children inherit the shared memory |
790 | * segment attachment via fork(). |
791 | * |
792 | * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this |
793 | * routine. The caller must have already restored them to the postmaster's |
794 | * values. |
795 | */ |
796 | void |
797 | PGSharedMemoryReAttach(void) |
798 | { |
799 | IpcMemoryId shmid; |
800 | PGShmemHeader *hdr; |
801 | IpcMemoryState state; |
802 | void *origUsedShmemSegAddr = UsedShmemSegAddr; |
803 | |
804 | Assert(UsedShmemSegAddr != NULL); |
805 | Assert(IsUnderPostmaster); |
806 | |
807 | #ifdef __CYGWIN__ |
808 | /* cygipc (currently) appears to not detach on exec. */ |
809 | PGSharedMemoryDetach(); |
810 | UsedShmemSegAddr = origUsedShmemSegAddr; |
811 | #endif |
812 | |
813 | elog(DEBUG3, "attaching to %p" , UsedShmemSegAddr); |
814 | shmid = shmget(UsedShmemSegID, sizeof(PGShmemHeader), 0); |
815 | if (shmid < 0) |
816 | state = SHMSTATE_FOREIGN; |
817 | else |
818 | state = PGSharedMemoryAttach(shmid, UsedShmemSegAddr, &hdr); |
819 | if (state != SHMSTATE_ATTACHED) |
820 | elog(FATAL, "could not reattach to shared memory (key=%d, addr=%p): %m" , |
821 | (int) UsedShmemSegID, UsedShmemSegAddr); |
822 | if (hdr != origUsedShmemSegAddr) |
823 | elog(FATAL, "reattaching to shared memory returned unexpected address (got %p, expected %p)" , |
824 | hdr, origUsedShmemSegAddr); |
825 | dsm_set_control_handle(hdr->dsm_control); |
826 | |
827 | UsedShmemSegAddr = hdr; /* probably redundant */ |
828 | } |
829 | |
830 | /* |
831 | * PGSharedMemoryNoReAttach |
832 | * |
833 | * This is called during startup of a postmaster child process when we choose |
834 | * *not* to re-attach to the existing shared memory segment. We must clean up |
835 | * to leave things in the appropriate state. This is not used in the non |
836 | * EXEC_BACKEND case, either. |
837 | * |
838 | * The child process startup logic might or might not call PGSharedMemoryDetach |
839 | * after this; make sure that it will be a no-op if called. |
840 | * |
841 | * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this |
842 | * routine. The caller must have already restored them to the postmaster's |
843 | * values. |
844 | */ |
845 | void |
846 | PGSharedMemoryNoReAttach(void) |
847 | { |
848 | Assert(UsedShmemSegAddr != NULL); |
849 | Assert(IsUnderPostmaster); |
850 | |
851 | #ifdef __CYGWIN__ |
852 | /* cygipc (currently) appears to not detach on exec. */ |
853 | PGSharedMemoryDetach(); |
854 | #endif |
855 | |
856 | /* For cleanliness, reset UsedShmemSegAddr to show we're not attached. */ |
857 | UsedShmemSegAddr = NULL; |
858 | /* And the same for UsedShmemSegID. */ |
859 | UsedShmemSegID = 0; |
860 | } |
861 | |
862 | #endif /* EXEC_BACKEND */ |
863 | |
864 | /* |
865 | * PGSharedMemoryDetach |
866 | * |
867 | * Detach from the shared memory segment, if still attached. This is not |
868 | * intended to be called explicitly by the process that originally created the |
869 | * segment (it will have on_shmem_exit callback(s) registered to do that). |
870 | * Rather, this is for subprocesses that have inherited an attachment and want |
871 | * to get rid of it. |
872 | * |
873 | * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this |
874 | * routine, also AnonymousShmem and AnonymousShmemSize. |
875 | */ |
876 | void |
877 | PGSharedMemoryDetach(void) |
878 | { |
879 | if (UsedShmemSegAddr != NULL) |
880 | { |
881 | if ((shmdt(UsedShmemSegAddr) < 0) |
882 | #if defined(EXEC_BACKEND) && defined(__CYGWIN__) |
883 | /* Work-around for cygipc exec bug */ |
884 | && shmdt(NULL) < 0 |
885 | #endif |
886 | ) |
887 | elog(LOG, "shmdt(%p) failed: %m" , UsedShmemSegAddr); |
888 | UsedShmemSegAddr = NULL; |
889 | } |
890 | |
891 | if (AnonymousShmem != NULL) |
892 | { |
893 | if (munmap(AnonymousShmem, AnonymousShmemSize) < 0) |
894 | elog(LOG, "munmap(%p, %zu) failed: %m" , |
895 | AnonymousShmem, AnonymousShmemSize); |
896 | AnonymousShmem = NULL; |
897 | } |
898 | } |
899 | |