/*-------------------------------------------------------------------------
 *
 * dsm.c
 *	  manage dynamic shared memory segments
 *
 * This file provides a set of services to make programming with dynamic
 * shared memory segments more convenient. Unlike the low-level
 * facilities provided by dsm_impl.h and dsm_impl.c, mappings and segments
 * created using this module will be cleaned up automatically. Mappings
 * will be removed when the resource owner under which they were created
 * is cleaned up, unless dsm_pin_mapping() is used, in which case they
 * have session lifespan. Segments will be removed when there are no
 * remaining mappings, or at postmaster shutdown in any case. After a
 * hard postmaster crash, remaining segments will be removed, if they
 * still exist, at the next postmaster startup.
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/storage/ipc/dsm.c
 *
 *-------------------------------------------------------------------------
 */
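
/*
 * A minimal lifecycle sketch (illustrative only; the segment size and the
 * surrounding code are assumptions, not part of this module's API):
 *
 *		dsm_segment *seg = dsm_create(65536, 0);
 *
 *		// By default the mapping is owned by CurrentResourceOwner and goes
 *		// away when that owner is released; keep it for the session instead:
 *		dsm_pin_mapping(seg);
 *
 *		// ... use dsm_segment_address(seg) ...
 *
 *		// Drop our mapping; the segment itself is destroyed once the last
 *		// mapping is gone, unless dsm_pin_segment() was called.
 *		dsm_detach(seg);
 */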

#include "postgres.h"

#include <fcntl.h>
#include <unistd.h>
#ifndef WIN32
#include <sys/mman.h>
#endif
#include <sys/stat.h>

#include "lib/ilist.h"
#include "miscadmin.h"
#include "storage/dsm.h"
#include "storage/ipc.h"
#include "storage/lwlock.h"
#include "storage/pg_shmem.h"
#include "utils/guc.h"
#include "utils/memutils.h"
#include "utils/resowner_private.h"

#define PG_DYNSHMEM_CONTROL_MAGIC	0x9a503d32

/*
 * There's no point in getting too cheap here, because the minimum allocation
 * is one OS page, which is probably at least 4KB and could easily be as high
 * as 64KB. Each slot costs only sizeof(dsm_control_item) bytes, so sizing
 * the slot array generously is cheap.
 */
#define PG_DYNSHMEM_FIXED_SLOTS		64
#define PG_DYNSHMEM_SLOTS_PER_BACKEND	2

#define INVALID_CONTROL_SLOT		((uint32) -1)

/* Backend-local tracking for on-detach callbacks. */
typedef struct dsm_segment_detach_callback
{
	on_dsm_detach_callback function;
	Datum		arg;
	slist_node	node;
} dsm_segment_detach_callback;

/* Backend-local state for a dynamic shared memory segment. */
struct dsm_segment
{
	dlist_node	node;			/* List link in dsm_segment_list. */
	ResourceOwner resowner;		/* Resource owner. */
	dsm_handle	handle;			/* Segment name. */
	uint32		control_slot;	/* Slot in control segment. */
	void	   *impl_private;	/* Implementation-specific private data. */
	void	   *mapped_address; /* Mapping address, or NULL if unmapped. */
	Size		mapped_size;	/* Size of our mapping. */
	slist_head	on_detach;		/* On-detach callbacks. */
};

/* Shared-memory state for a dynamic shared memory segment. */
typedef struct dsm_control_item
{
	dsm_handle	handle;
	uint32		refcnt;			/* 2+ = active, 1 = moribund, 0 = gone */
	void	   *impl_private_pm_handle; /* only needed on Windows */
	bool		pinned;
} dsm_control_item;

/* Layout of the dynamic shared memory control segment. */
typedef struct dsm_control_header
{
	uint32		magic;
	uint32		nitems;
	uint32		maxitems;
	dsm_control_item item[FLEXIBLE_ARRAY_MEMBER];
} dsm_control_header;

static void dsm_cleanup_for_mmap(void);
static void dsm_postmaster_shutdown(int code, Datum arg);
static dsm_segment *dsm_create_descriptor(void);
static bool dsm_control_segment_sane(dsm_control_header *control,
									 Size mapped_size);
static uint64 dsm_control_bytes_needed(uint32 nitems);

/* Has this backend initialized the dynamic shared memory system yet? */
static bool dsm_init_done = false;

/*
 * List of dynamic shared memory segments used by this backend.
 *
 * At process exit time, we must decrement the reference count of each
 * segment we have attached; this list makes it possible to find all such
 * segments.
 *
 * This list should always be empty in the postmaster. We could probably
 * allow the postmaster to map dynamic shared memory segments before it
 * begins to start child processes, provided that each process adjusted
 * the reference counts for those segments in the control segment at
 * startup time, but there's no obvious need for such a facility, which
 * would also be complex to handle in the EXEC_BACKEND case. Once the
 * postmaster has begun spawning children, there's an additional problem:
 * each new mapping would require an update to the control segment,
 * which requires locking, in which the postmaster must not be involved.
 */
static dlist_head dsm_segment_list = DLIST_STATIC_INIT(dsm_segment_list);

/*
 * Control segment information.
 *
 * Unlike ordinary shared memory segments, the control segment is not
 * reference counted; instead, it lasts for the postmaster's entire
 * life cycle. For simplicity, it doesn't have a dsm_segment object either.
 */
static dsm_handle dsm_control_handle;
static dsm_control_header *dsm_control;
static Size dsm_control_mapped_size = 0;
static void *dsm_control_impl_private = NULL;

/*
 * Start up the dynamic shared memory system.
 *
 * This is called just once during each cluster lifetime, at postmaster
 * startup time.
 */
void
dsm_postmaster_startup(PGShmemHeader *shim)
{
	void	   *dsm_control_address = NULL;
	uint32		maxitems;
	Size		segsize;

	Assert(!IsUnderPostmaster);

	/*
	 * If we're using the mmap implementations, clean up any leftovers.
	 * Cleanup isn't needed on Windows, and happens earlier in startup for
	 * POSIX and System V shared memory, via a direct call to
	 * dsm_cleanup_using_control_segment.
	 */
	if (dynamic_shared_memory_type == DSM_IMPL_MMAP)
		dsm_cleanup_for_mmap();

	/* Determine size for new control segment. */
	maxitems = PG_DYNSHMEM_FIXED_SLOTS
		+ PG_DYNSHMEM_SLOTS_PER_BACKEND * MaxBackends;
	elog(DEBUG2, "dynamic shared memory system will support %u segments",
		 maxitems);
	segsize = dsm_control_bytes_needed(maxitems);

	/*
	 * Loop until we find an unused identifier for the new control segment. We
	 * sometimes use 0 as a sentinel value indicating that no control segment
	 * is known to exist, so avoid using that value for a real control
	 * segment.
	 */
	for (;;)
	{
		Assert(dsm_control_address == NULL);
		Assert(dsm_control_mapped_size == 0);
		dsm_control_handle = random();
		if (dsm_control_handle == DSM_HANDLE_INVALID)
			continue;
		if (dsm_impl_op(DSM_OP_CREATE, dsm_control_handle, segsize,
						&dsm_control_impl_private, &dsm_control_address,
						&dsm_control_mapped_size, ERROR))
			break;
	}
	dsm_control = dsm_control_address;
	on_shmem_exit(dsm_postmaster_shutdown, PointerGetDatum(shim));
	elog(DEBUG2,
		 "created dynamic shared memory control segment %u (%zu bytes)",
		 dsm_control_handle, segsize);
	shim->dsm_control = dsm_control_handle;

	/* Initialize control segment. */
	dsm_control->magic = PG_DYNSHMEM_CONTROL_MAGIC;
	dsm_control->nitems = 0;
	dsm_control->maxitems = maxitems;
}

/*
 * Determine whether the control segment from the previous postmaster
 * invocation still exists. If so, remove the dynamic shared memory
 * segments to which it refers, and then the control segment itself.
 */
void
dsm_cleanup_using_control_segment(dsm_handle old_control_handle)
{
	void	   *mapped_address = NULL;
	void	   *junk_mapped_address = NULL;
	void	   *impl_private = NULL;
	void	   *junk_impl_private = NULL;
	Size		mapped_size = 0;
	Size		junk_mapped_size = 0;
	uint32		nitems;
	uint32		i;
	dsm_control_header *old_control;

	/*
	 * Try to attach the segment. If this fails, it probably just means that
	 * the operating system has been rebooted and the segment no longer
	 * exists, or an unrelated process has used the same shm ID. So just fall
	 * out quietly.
	 */
	if (!dsm_impl_op(DSM_OP_ATTACH, old_control_handle, 0, &impl_private,
					 &mapped_address, &mapped_size, DEBUG1))
		return;

	/*
	 * We've managed to reattach it, but the contents might not be sane. If
	 * they aren't, we disregard the segment after all.
	 */
	old_control = (dsm_control_header *) mapped_address;
	if (!dsm_control_segment_sane(old_control, mapped_size))
	{
		dsm_impl_op(DSM_OP_DETACH, old_control_handle, 0, &impl_private,
					&mapped_address, &mapped_size, LOG);
		return;
	}

	/*
	 * OK, the control segment looks basically valid, so we can use it to get
	 * a list of segments that need to be removed.
	 */
	nitems = old_control->nitems;
	for (i = 0; i < nitems; ++i)
	{
		dsm_handle	handle;
		uint32		refcnt;

		/* If the reference count is 0, the slot is actually unused. */
		refcnt = old_control->item[i].refcnt;
		if (refcnt == 0)
			continue;

		/* Log debugging information. */
		handle = old_control->item[i].handle;
		elog(DEBUG2, "cleaning up orphaned dynamic shared memory with ID %u (reference count %u)",
			 handle, refcnt);

		/* Destroy the referenced segment. */
		dsm_impl_op(DSM_OP_DESTROY, handle, 0, &junk_impl_private,
					&junk_mapped_address, &junk_mapped_size, LOG);
	}

	/* Destroy the old control segment, too. */
	elog(DEBUG2,
		 "cleaning up dynamic shared memory control segment with ID %u",
		 old_control_handle);
	dsm_impl_op(DSM_OP_DESTROY, old_control_handle, 0, &impl_private,
				&mapped_address, &mapped_size, LOG);
}

/*
 * When we're using the mmap shared memory implementation, "shared memory"
 * segments might even manage to survive an operating system reboot.
 * But there's no guarantee as to exactly what will survive: some segments
 * may survive, and others may not, and the contents of some may be out
 * of date. In particular, the control segment may be out of date, so we
 * can't rely on it to figure out what to remove. However, since we know
 * what directory contains the files we used as shared memory, we can simply
 * scan the directory and blow everything away that shouldn't be there.
 */
static void
dsm_cleanup_for_mmap(void)
{
	DIR		   *dir;
	struct dirent *dent;

	/* Scan the directory for something with a name of the correct format. */
	dir = AllocateDir(PG_DYNSHMEM_DIR);

	while ((dent = ReadDir(dir, PG_DYNSHMEM_DIR)) != NULL)
	{
		if (strncmp(dent->d_name, PG_DYNSHMEM_MMAP_FILE_PREFIX,
					strlen(PG_DYNSHMEM_MMAP_FILE_PREFIX)) == 0)
		{
			char		buf[MAXPGPATH + sizeof(PG_DYNSHMEM_DIR)];

			snprintf(buf, sizeof(buf), PG_DYNSHMEM_DIR "/%s", dent->d_name);

			elog(DEBUG2, "removing file \"%s\"", buf);

			/* We found a matching file, so remove it. */
			if (unlink(buf) != 0)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not remove file \"%s\": %m", buf)));
		}
	}

	/* Cleanup complete. */
	FreeDir(dir);
}

/*
 * At shutdown time, we iterate over the control segment and remove all
 * remaining dynamic shared memory segments. We avoid throwing errors here;
 * the postmaster is shutting down either way, and this is just non-critical
 * resource cleanup.
 */
static void
dsm_postmaster_shutdown(int code, Datum arg)
{
	uint32		nitems;
	uint32		i;
	void	   *dsm_control_address;
	void	   *junk_mapped_address = NULL;
	void	   *junk_impl_private = NULL;
	Size		junk_mapped_size = 0;
	PGShmemHeader *shim = (PGShmemHeader *) DatumGetPointer(arg);

	/*
	 * If some other backend exited uncleanly, it might have corrupted the
	 * control segment while it was dying. In that case, we warn and ignore
	 * the contents of the control segment. This may end up leaving behind
	 * stray shared memory segments, but there's not much we can do about that
	 * if the metadata is gone.
	 */
	nitems = dsm_control->nitems;
	if (!dsm_control_segment_sane(dsm_control, dsm_control_mapped_size))
	{
		ereport(LOG,
				(errmsg("dynamic shared memory control segment is corrupt")));
		return;
	}

	/* Remove any remaining segments. */
	for (i = 0; i < nitems; ++i)
	{
		dsm_handle	handle;

		/* If the reference count is 0, the slot is actually unused. */
		if (dsm_control->item[i].refcnt == 0)
			continue;

		/* Log debugging information. */
		handle = dsm_control->item[i].handle;
		elog(DEBUG2, "cleaning up orphaned dynamic shared memory with ID %u",
			 handle);

		/* Destroy the segment. */
		dsm_impl_op(DSM_OP_DESTROY, handle, 0, &junk_impl_private,
					&junk_mapped_address, &junk_mapped_size, LOG);
	}

	/* Remove the control segment itself. */
	elog(DEBUG2,
		 "cleaning up dynamic shared memory control segment with ID %u",
		 dsm_control_handle);
	dsm_control_address = dsm_control;
	dsm_impl_op(DSM_OP_DESTROY, dsm_control_handle, 0,
				&dsm_control_impl_private, &dsm_control_address,
				&dsm_control_mapped_size, LOG);
	dsm_control = dsm_control_address;
	shim->dsm_control = 0;
}

/*
 * Prepare this backend for dynamic shared memory usage. Under EXEC_BACKEND,
 * we must reread the state file and map the control segment; in other cases,
 * we'll have inherited the postmaster's mapping and global variables.
 */
static void
dsm_backend_startup(void)
{
#ifdef EXEC_BACKEND
	{
		void	   *control_address = NULL;

		/* Attach control segment. */
		Assert(dsm_control_handle != 0);
		dsm_impl_op(DSM_OP_ATTACH, dsm_control_handle, 0,
					&dsm_control_impl_private, &control_address,
					&dsm_control_mapped_size, ERROR);
		dsm_control = control_address;
		/* If control segment doesn't look sane, something is badly wrong. */
		if (!dsm_control_segment_sane(dsm_control, dsm_control_mapped_size))
		{
			dsm_impl_op(DSM_OP_DETACH, dsm_control_handle, 0,
						&dsm_control_impl_private, &control_address,
						&dsm_control_mapped_size, WARNING);
			ereport(FATAL,
					(errcode(ERRCODE_INTERNAL_ERROR),
					 errmsg("dynamic shared memory control segment is not valid")));
		}
	}
#endif

	dsm_init_done = true;
}

#ifdef EXEC_BACKEND
/*
 * When running under EXEC_BACKEND, we get a callback here when the main
 * shared memory segment is re-attached, so that we can record the control
 * handle retrieved from it.
 */
void
dsm_set_control_handle(dsm_handle h)
{
	Assert(dsm_control_handle == 0 && h != 0);
	dsm_control_handle = h;
}
#endif

/*
 * Create a new dynamic shared memory segment.
 *
 * If there is a non-NULL CurrentResourceOwner, the new segment is associated
 * with it and must be detached before the resource owner releases, or a
 * warning will be logged. If CurrentResourceOwner is NULL, the segment
 * remains attached until explicitly detached or the session ends.
 * Creating with a NULL CurrentResourceOwner is equivalent to creating
 * with a non-NULL CurrentResourceOwner and then calling dsm_pin_mapping.
 */
dsm_segment *
dsm_create(Size size, int flags)
{
	dsm_segment *seg;
	uint32		i;
	uint32		nitems;

	/* Unsafe in postmaster (and pointless in a stand-alone backend). */
	Assert(IsUnderPostmaster);

	if (!dsm_init_done)
		dsm_backend_startup();

	/* Create a new segment descriptor. */
	seg = dsm_create_descriptor();

	/* Loop until we find an unused segment identifier. */
	for (;;)
	{
		Assert(seg->mapped_address == NULL && seg->mapped_size == 0);
		seg->handle = random();
		if (seg->handle == DSM_HANDLE_INVALID) /* Reserve sentinel */
			continue;
		if (dsm_impl_op(DSM_OP_CREATE, seg->handle, size, &seg->impl_private,
						&seg->mapped_address, &seg->mapped_size, ERROR))
			break;
	}

	/* Lock the control segment so we can register the new segment. */
	LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);

	/* Search the control segment for an unused slot. */
	nitems = dsm_control->nitems;
	for (i = 0; i < nitems; ++i)
	{
		if (dsm_control->item[i].refcnt == 0)
		{
			dsm_control->item[i].handle = seg->handle;
			/* refcnt of 1 triggers destruction, so start at 2 */
			dsm_control->item[i].refcnt = 2;
			dsm_control->item[i].impl_private_pm_handle = NULL;
			dsm_control->item[i].pinned = false;
			seg->control_slot = i;
			LWLockRelease(DynamicSharedMemoryControlLock);
			return seg;
		}
	}

	/* Verify that we can support an additional mapping. */
	if (nitems >= dsm_control->maxitems)
	{
		if ((flags & DSM_CREATE_NULL_IF_MAXSEGMENTS) != 0)
		{
			LWLockRelease(DynamicSharedMemoryControlLock);
			dsm_impl_op(DSM_OP_DESTROY, seg->handle, 0, &seg->impl_private,
						&seg->mapped_address, &seg->mapped_size, WARNING);
			if (seg->resowner != NULL)
				ResourceOwnerForgetDSM(seg->resowner, seg);
			dlist_delete(&seg->node);
			pfree(seg);
			return NULL;
		}
		ereport(ERROR,
				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
				 errmsg("too many dynamic shared memory segments")));
	}

	/* Enter the handle into a new array slot. */
	dsm_control->item[nitems].handle = seg->handle;
	/* refcnt of 1 triggers destruction, so start at 2 */
	dsm_control->item[nitems].refcnt = 2;
	dsm_control->item[nitems].impl_private_pm_handle = NULL;
	dsm_control->item[nitems].pinned = false;
	seg->control_slot = nitems;
	dsm_control->nitems++;
	LWLockRelease(DynamicSharedMemoryControlLock);

	return seg;
}
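
/*
 * Hedged usage sketch for the flag handled above: a caller that prefers to
 * degrade gracefully rather than error out when the control segment is full
 * might do something like the following. The request_size variable and the
 * fallback strategy are assumptions of this example, not something this
 * module prescribes.
 *
 *		dsm_segment *seg = dsm_create(request_size,
 *									  DSM_CREATE_NULL_IF_MAXSEGMENTS);
 *
 *		if (seg == NULL)
 *		{
 *			// No slot was available; fall back to a non-shared code path.
 *		}
 */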

/*
 * Attach a dynamic shared memory segment.
 *
 * See comments for dsm_segment_handle() for an explanation of how this
 * is intended to be used.
 *
 * This function will return NULL if the segment isn't known to the system.
 * This can happen if we're asked to attach the segment, but then everyone
 * else detaches it (causing it to be destroyed) before we get around to
 * attaching it.
 *
 * If there is a non-NULL CurrentResourceOwner, the attached segment is
 * associated with it and must be detached before the resource owner releases,
 * or a warning will be logged. Otherwise the segment remains attached until
 * explicitly detached or the session ends. See the note atop dsm_create().
 */
dsm_segment *
dsm_attach(dsm_handle h)
{
	dsm_segment *seg;
	dlist_iter	iter;
	uint32		i;
	uint32		nitems;

	/* Unsafe in postmaster (and pointless in a stand-alone backend). */
	Assert(IsUnderPostmaster);

	if (!dsm_init_done)
		dsm_backend_startup();

	/*
	 * Since this is just a debugging cross-check, we could leave it out
	 * altogether, or include it only in assert-enabled builds. But since the
	 * list of attached segments should normally be very short, let's include
	 * it always for right now.
	 *
	 * If you're hitting this error, you probably want to attempt to find an
	 * existing mapping via dsm_find_mapping() before calling dsm_attach() to
	 * create a new one.
	 */
	dlist_foreach(iter, &dsm_segment_list)
	{
		seg = dlist_container(dsm_segment, node, iter.cur);
		if (seg->handle == h)
			elog(ERROR, "can't attach the same segment more than once");
	}

	/* Create a new segment descriptor. */
	seg = dsm_create_descriptor();
	seg->handle = h;

	/* Bump reference count for this segment in shared memory. */
	LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
	nitems = dsm_control->nitems;
	for (i = 0; i < nitems; ++i)
	{
		/*
		 * If the reference count is 0, the slot is actually unused. If the
		 * reference count is 1, the slot is still in use, but the segment is
		 * in the process of going away; even if the handle matches, another
		 * slot may already have started using the same handle value by
		 * coincidence, so we have to keep searching.
		 */
		if (dsm_control->item[i].refcnt <= 1)
			continue;

		/* If the handle doesn't match, it's not the slot we want. */
		if (dsm_control->item[i].handle != seg->handle)
			continue;

		/* Otherwise we've found a match. */
		dsm_control->item[i].refcnt++;
		seg->control_slot = i;
		break;
	}
	LWLockRelease(DynamicSharedMemoryControlLock);

	/*
	 * If we didn't find the handle we're looking for in the control segment,
	 * it probably means that everyone else who had it mapped, including the
	 * original creator, died before we got to this point. It's up to the
	 * caller to decide what to do about that.
	 */
	if (seg->control_slot == INVALID_CONTROL_SLOT)
	{
		dsm_detach(seg);
		return NULL;
	}

	/* Here's where we actually try to map the segment. */
	dsm_impl_op(DSM_OP_ATTACH, seg->handle, 0, &seg->impl_private,
				&seg->mapped_address, &seg->mapped_size, ERROR);

	return seg;
}
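
/*
 * Hedged sketch of the pattern suggested by the cross-check above: look for
 * an existing mapping before attaching, and be prepared for the segment to
 * have vanished. The variable names are illustrative only.
 *
 *		dsm_segment *seg = dsm_find_mapping(handle);
 *
 *		if (seg == NULL)
 *			seg = dsm_attach(handle);
 *		if (seg == NULL)
 *			elog(ERROR, "could not map dynamic shared memory segment");
 */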

/*
 * At backend shutdown time, detach any segments that are still attached.
 * (This is similar to dsm_detach_all, except that there's no reason to
 * unmap the control segment before exiting, so we don't bother.)
 */
void
dsm_backend_shutdown(void)
{
	while (!dlist_is_empty(&dsm_segment_list))
	{
		dsm_segment *seg;

		seg = dlist_head_element(dsm_segment, node, &dsm_segment_list);
		dsm_detach(seg);
	}
}

/*
 * Detach all shared memory segments, including the control segment. This
 * should be called, along with PGSharedMemoryDetach, in processes that
 * might inherit mappings but are not intended to be connected to dynamic
 * shared memory.
 */
void
dsm_detach_all(void)
{
	void	   *control_address = dsm_control;

	while (!dlist_is_empty(&dsm_segment_list))
	{
		dsm_segment *seg;

		seg = dlist_head_element(dsm_segment, node, &dsm_segment_list);
		dsm_detach(seg);
	}

	if (control_address != NULL)
		dsm_impl_op(DSM_OP_DETACH, dsm_control_handle, 0,
					&dsm_control_impl_private, &control_address,
					&dsm_control_mapped_size, ERROR);
}

/*
 * Detach from a shared memory segment, destroying the segment if we
 * remove the last reference.
 *
 * This function should never fail. It will often be invoked when aborting
 * a transaction, and a further error won't serve any purpose. It's not a
 * complete disaster if we fail to unmap or destroy the segment; it means a
 * resource leak, but that doesn't necessarily preclude further operations.
 */
void
dsm_detach(dsm_segment *seg)
{
	/*
	 * Invoke registered callbacks. Just in case one of those callbacks
	 * throws a further error that brings us back here, pop the callback
	 * before invoking it, to avoid infinite error recursion.
	 */
	while (!slist_is_empty(&seg->on_detach))
	{
		slist_node *node;
		dsm_segment_detach_callback *cb;
		on_dsm_detach_callback function;
		Datum		arg;

		node = slist_pop_head_node(&seg->on_detach);
		cb = slist_container(dsm_segment_detach_callback, node, node);
		function = cb->function;
		arg = cb->arg;
		pfree(cb);

		function(seg, arg);
	}

	/*
	 * Try to remove the mapping, if one exists. Normally, there will be, but
	 * maybe not, if we failed partway through a create or attach operation.
	 * We remove the mapping before decrementing the reference count so that
	 * the process that sees a zero reference count can be certain that no
	 * remaining mappings exist. Even if this fails, we pretend that it
	 * works, because retrying is likely to fail in the same way.
	 */
	if (seg->mapped_address != NULL)
	{
		dsm_impl_op(DSM_OP_DETACH, seg->handle, 0, &seg->impl_private,
					&seg->mapped_address, &seg->mapped_size, WARNING);
		seg->impl_private = NULL;
		seg->mapped_address = NULL;
		seg->mapped_size = 0;
	}

	/* Reduce reference count, if we previously increased it. */
	if (seg->control_slot != INVALID_CONTROL_SLOT)
	{
		uint32		refcnt;
		uint32		control_slot = seg->control_slot;

		LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
		Assert(dsm_control->item[control_slot].handle == seg->handle);
		Assert(dsm_control->item[control_slot].refcnt > 1);
		refcnt = --dsm_control->item[control_slot].refcnt;
		seg->control_slot = INVALID_CONTROL_SLOT;
		LWLockRelease(DynamicSharedMemoryControlLock);

		/* If new reference count is 1, try to destroy the segment. */
		if (refcnt == 1)
		{
			/* A pinned segment should never reach 1. */
			Assert(!dsm_control->item[control_slot].pinned);

			/*
			 * If we fail to destroy the segment here, or are killed before we
			 * finish doing so, the reference count will remain at 1, which
			 * will mean that nobody else can attach to the segment. At
			 * postmaster shutdown time, or when a new postmaster is started
			 * after a hard kill, another attempt will be made to remove the
			 * segment.
			 *
			 * The main case we're worried about here is being killed by a
			 * signal before we can finish removing the segment. In that
			 * case, it's important to be sure that the segment still gets
			 * removed. If we actually fail to remove the segment for some
			 * other reason, the postmaster may not have any better luck than
			 * we did. There's not much we can do about that, though.
			 */
			if (dsm_impl_op(DSM_OP_DESTROY, seg->handle, 0, &seg->impl_private,
							&seg->mapped_address, &seg->mapped_size, WARNING))
			{
				LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
				Assert(dsm_control->item[control_slot].handle == seg->handle);
				Assert(dsm_control->item[control_slot].refcnt == 1);
				dsm_control->item[control_slot].refcnt = 0;
				LWLockRelease(DynamicSharedMemoryControlLock);
			}
		}
	}

	/* Clean up our remaining backend-private data structures. */
	if (seg->resowner != NULL)
		ResourceOwnerForgetDSM(seg->resowner, seg);
	dlist_delete(&seg->node);
	pfree(seg);
}

/*
 * Keep a dynamic shared memory mapping until end of session.
 *
 * By default, mappings are owned by the current resource owner, which
 * typically means they stick around for the duration of the current query
 * only.
 */
void
dsm_pin_mapping(dsm_segment *seg)
{
	if (seg->resowner != NULL)
	{
		ResourceOwnerForgetDSM(seg->resowner, seg);
		seg->resowner = NULL;
	}
}

/*
 * Arrange to remove a dynamic shared memory mapping at cleanup time.
 *
 * dsm_pin_mapping() can be used to preserve a mapping for the entire
 * lifetime of a process; this function reverses that decision, making
 * the segment owned by the current resource owner. This may be useful
 * just before performing some operation that will invalidate the segment
 * for future use by this backend.
 */
void
dsm_unpin_mapping(dsm_segment *seg)
{
	Assert(seg->resowner == NULL);
	ResourceOwnerEnlargeDSMs(CurrentResourceOwner);
	seg->resowner = CurrentResourceOwner;
	ResourceOwnerRememberDSM(seg->resowner, seg);
}

/*
 * Keep a dynamic shared memory segment until postmaster shutdown, or until
 * dsm_unpin_segment is called.
 *
 * This function should not be called more than once per segment, unless the
 * segment is explicitly unpinned with dsm_unpin_segment in between calls.
 *
 * Note that this function does not arrange for the current process to
 * keep the segment mapped indefinitely; if that behavior is desired,
 * dsm_pin_mapping() should be used from each process that needs to
 * retain the mapping.
 */
void
dsm_pin_segment(dsm_segment *seg)
{
	void	   *handle;

	/*
	 * Bump reference count for this segment in shared memory. This will
	 * ensure that even if there is no session which is attached to this
	 * segment, it will remain until postmaster shutdown or an explicit call
	 * to unpin.
	 */
	LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
	if (dsm_control->item[seg->control_slot].pinned)
		elog(ERROR, "cannot pin a segment that is already pinned");
	dsm_impl_pin_segment(seg->handle, seg->impl_private, &handle);
	dsm_control->item[seg->control_slot].pinned = true;
	dsm_control->item[seg->control_slot].refcnt++;
	dsm_control->item[seg->control_slot].impl_private_pm_handle = handle;
	LWLockRelease(DynamicSharedMemoryControlLock);
}
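
/*
 * Hedged sketch of segment pinning: a segment meant to outlive every
 * individual backend can be pinned once at creation time and unpinned when
 * it is no longer needed. Stashing the handle in a shared_state structure,
 * as shown here, is an assumption of this example.
 *
 *		dsm_segment *seg = dsm_create(size, 0);
 *
 *		dsm_pin_segment(seg);				// survives with zero mappings
 *		shared_state->registry_handle = dsm_segment_handle(seg);
 *		dsm_detach(seg);					// our mapping goes away; segment stays
 *
 *		// ... much later, from any backend ...
 *		dsm_unpin_segment(shared_state->registry_handle);
 */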

/*
 * Unpin a dynamic shared memory segment that was previously pinned with
 * dsm_pin_segment. This function should not be called unless dsm_pin_segment
 * was previously called for this segment.
 *
 * The argument is a dsm_handle rather than a dsm_segment in case you want
 * to unpin a segment to which you haven't attached. This turns out to be
 * useful if, for example, a reference to one shared memory segment is stored
 * within another shared memory segment. You might want to unpin the
 * referenced segment before destroying the referencing segment.
 */
void
dsm_unpin_segment(dsm_handle handle)
{
	uint32		control_slot = INVALID_CONTROL_SLOT;
	bool		destroy = false;
	uint32		i;

	/* Find the control slot for the given handle. */
	LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
	for (i = 0; i < dsm_control->nitems; ++i)
	{
		/* Skip unused slots and segments that are concurrently going away. */
		if (dsm_control->item[i].refcnt <= 1)
			continue;

		/* If we've found our handle, we can stop searching. */
		if (dsm_control->item[i].handle == handle)
		{
			control_slot = i;
			break;
		}
	}

	/*
	 * We should definitely have found the slot, and it should not already be
	 * in the process of going away, because this function should only be
	 * called on a segment which is pinned.
	 */
	if (control_slot == INVALID_CONTROL_SLOT)
		elog(ERROR, "cannot unpin unknown segment handle");
	if (!dsm_control->item[control_slot].pinned)
		elog(ERROR, "cannot unpin a segment that is not pinned");
	Assert(dsm_control->item[control_slot].refcnt > 1);

	/*
	 * Allow implementation-specific code to run. We have to do this before
	 * releasing the lock, because impl_private_pm_handle may get modified by
	 * dsm_impl_unpin_segment.
	 */
	dsm_impl_unpin_segment(handle,
						   &dsm_control->item[control_slot].impl_private_pm_handle);

	/* Note that 1 means no references (0 means unused slot). */
	if (--dsm_control->item[control_slot].refcnt == 1)
		destroy = true;
	dsm_control->item[control_slot].pinned = false;

	/* Now we can release the lock. */
	LWLockRelease(DynamicSharedMemoryControlLock);

	/* Clean up resources if that was the last reference. */
	if (destroy)
	{
		void	   *junk_impl_private = NULL;
		void	   *junk_mapped_address = NULL;
		Size		junk_mapped_size = 0;

		/*
		 * For an explanation of how error handling works in this case, see
		 * comments in dsm_detach. Note that if we reach this point, the
		 * current process certainly does not have the segment mapped, because
		 * if it did, the reference count would have still been greater than 1
		 * even after releasing the reference count held by the pin. The fact
		 * that there can't be a dsm_segment for this handle makes it OK to
		 * pass the mapped size, mapped address, and private data as NULL
		 * here.
		 */
		if (dsm_impl_op(DSM_OP_DESTROY, handle, 0, &junk_impl_private,
						&junk_mapped_address, &junk_mapped_size, WARNING))
		{
			LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
			Assert(dsm_control->item[control_slot].handle == handle);
			Assert(dsm_control->item[control_slot].refcnt == 1);
			dsm_control->item[control_slot].refcnt = 0;
			LWLockRelease(DynamicSharedMemoryControlLock);
		}
	}
}

/*
 * Find an existing mapping for a shared memory segment, if there is one.
 */
dsm_segment *
dsm_find_mapping(dsm_handle h)
{
	dlist_iter	iter;
	dsm_segment *seg;

	dlist_foreach(iter, &dsm_segment_list)
	{
		seg = dlist_container(dsm_segment, node, iter.cur);
		if (seg->handle == h)
			return seg;
	}

	return NULL;
}

/*
 * Get the address at which a dynamic shared memory segment is mapped.
 */
void *
dsm_segment_address(dsm_segment *seg)
{
	Assert(seg->mapped_address != NULL);
	return seg->mapped_address;
}

/*
 * Get the size of a mapping.
 */
Size
dsm_segment_map_length(dsm_segment *seg)
{
	Assert(seg->mapped_address != NULL);
	return seg->mapped_size;
}

/*
 * Get a handle for a mapping.
 *
 * To establish communication via dynamic shared memory between two backends,
 * one of them should first call dsm_create() to establish a new shared
 * memory mapping. That process should then call dsm_segment_handle() to
 * obtain a handle for the mapping, and pass that handle to the
 * coordinating backend via some means (e.g. bgw_main_arg, or via the
 * main shared memory segment). The recipient, once in possession of the
 * handle, should call dsm_attach().
 */
dsm_handle
dsm_segment_handle(dsm_segment *seg)
{
	return seg->handle;
}
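
/*
 * Hedged illustration of the handshake described above, assuming the handle
 * is passed through a background worker's bgw_main_arg (other transports
 * work equally well); the worker and main_arg variables are illustrative,
 * and error handling is abbreviated.
 *
 *		// In the backend that creates the segment:
 *		dsm_segment *seg = dsm_create(size, 0);
 *		worker.bgw_main_arg = UInt32GetDatum(dsm_segment_handle(seg));
 *		RegisterDynamicBackgroundWorker(&worker, &bgwhandle);
 *
 *		// In the worker's main function:
 *		dsm_segment *seg = dsm_attach(DatumGetUInt32(main_arg));
 *		if (seg == NULL)
 *			elog(ERROR, "could not map dynamic shared memory segment");
 *		void	   *base = dsm_segment_address(seg);
 */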

/*
 * Register an on-detach callback for a dynamic shared memory segment.
 */
void
on_dsm_detach(dsm_segment *seg, on_dsm_detach_callback function, Datum arg)
{
	dsm_segment_detach_callback *cb;

	cb = MemoryContextAlloc(TopMemoryContext,
							sizeof(dsm_segment_detach_callback));
	cb->function = function;
	cb->arg = arg;
	slist_push_head(&seg->on_detach, &cb->node);
}
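
/*
 * Hedged sketch of callback registration: because callbacks are pushed onto
 * the head of the list and popped from the head at detach time, they run in
 * reverse order of registration. The cleanup function below is hypothetical.
 *
 *		static void
 *		cleanup_my_state(dsm_segment *seg, Datum arg)
 *		{
 *			// release whatever backend-local state referenced the segment
 *		}
 *
 *		on_dsm_detach(seg, cleanup_my_state, PointerGetDatum(NULL));
 */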

/*
 * Unregister an on-detach callback for a dynamic shared memory segment.
 */
void
cancel_on_dsm_detach(dsm_segment *seg, on_dsm_detach_callback function,
					 Datum arg)
{
	slist_mutable_iter iter;

	slist_foreach_modify(iter, &seg->on_detach)
	{
		dsm_segment_detach_callback *cb;

		cb = slist_container(dsm_segment_detach_callback, node, iter.cur);
		if (cb->function == function && cb->arg == arg)
		{
			slist_delete_current(&iter);
			pfree(cb);
			break;
		}
	}
}

/*
 * Discard all registered on-detach callbacks without executing them.
 */
void
reset_on_dsm_detach(void)
{
	dlist_iter	iter;

	dlist_foreach(iter, &dsm_segment_list)
	{
		dsm_segment *seg = dlist_container(dsm_segment, node, iter.cur);

		/* Throw away explicit on-detach actions one by one. */
		while (!slist_is_empty(&seg->on_detach))
		{
			slist_node *node;
			dsm_segment_detach_callback *cb;

			node = slist_pop_head_node(&seg->on_detach);
			cb = slist_container(dsm_segment_detach_callback, node, node);
			pfree(cb);
		}

		/*
		 * Decrementing the reference count is a sort of implicit on-detach
		 * action; make sure we don't do that, either.
		 */
		seg->control_slot = INVALID_CONTROL_SLOT;
	}
}

/*
 * Create a segment descriptor.
 */
static dsm_segment *
dsm_create_descriptor(void)
{
	dsm_segment *seg;

	if (CurrentResourceOwner)
		ResourceOwnerEnlargeDSMs(CurrentResourceOwner);

	seg = MemoryContextAlloc(TopMemoryContext, sizeof(dsm_segment));
	dlist_push_head(&dsm_segment_list, &seg->node);

	/* seg->handle must be initialized by the caller */
	seg->control_slot = INVALID_CONTROL_SLOT;
	seg->impl_private = NULL;
	seg->mapped_address = NULL;
	seg->mapped_size = 0;

	seg->resowner = CurrentResourceOwner;
	if (CurrentResourceOwner)
		ResourceOwnerRememberDSM(CurrentResourceOwner, seg);

	slist_init(&seg->on_detach);

	return seg;
}

/*
 * Sanity check a control segment.
 *
 * The goal here isn't to detect everything that could possibly be wrong with
 * the control segment; there's not enough information for that. Rather, the
 * goal is to make sure that someone can iterate over the items in the segment
 * without overrunning the end of the mapping and crashing. We also check
 * the magic number since, if that's messed up, this may not even be one of
 * our segments at all.
 */
static bool
dsm_control_segment_sane(dsm_control_header *control, Size mapped_size)
{
	if (mapped_size < offsetof(dsm_control_header, item))
		return false;			/* Mapped size too short to read header. */
	if (control->magic != PG_DYNSHMEM_CONTROL_MAGIC)
		return false;			/* Magic number doesn't match. */
	if (dsm_control_bytes_needed(control->maxitems) > mapped_size)
		return false;			/* Max item count won't fit in map. */
	if (control->nitems > control->maxitems)
		return false;			/* Overfull. */
	return true;
}

/*
 * Compute the number of control-segment bytes needed to store a given
 * number of items.
 */
static uint64
dsm_control_bytes_needed(uint32 nitems)
{
	return offsetof(dsm_control_header, item)
		+ sizeof(dsm_control_item) * (uint64) nitems;
}
1103