1/*-------------------------------------------------------------------------
2 *
3 * dsm_impl.c
4 * manage dynamic shared memory segments
5 *
6 * This file provides low-level APIs for creating and destroying shared
7 * memory segments using several different possible techniques. We refer
8 * to these segments as dynamic because they can be created, altered, and
9 * destroyed at any point during the server life cycle. This is unlike
10 * the main shared memory segment, of which there is always exactly one
11 * and which is always mapped at a fixed address in every PostgreSQL
12 * background process.
13 *
14 * Because not all systems provide the same primitives in this area, nor
15 * do all primitives behave the same way on all systems, we provide
16 * several implementations of this facility. Many systems implement
17 * POSIX shared memory (shm_open etc.), which is well-suited to our needs
18 * in this area, with the exception that shared memory identifiers live
19 * in a flat system-wide namespace, raising the uncomfortable prospect of
20 * name collisions with other processes (including other copies of
21 * PostgreSQL) running on the same system. Some systems only support
22 * the older System V shared memory interface (shmget etc.) which is
23 * also usable; however, the default allocation limits are often quite
24 * small, and the namespace is even more restricted.
25 *
26 * We also provide an mmap-based shared memory implementation. This may
27 * be useful on systems that provide shared memory via a special-purpose
28 * filesystem; by opting for this implementation, the user can even
29 * control precisely where their shared memory segments are placed. It
30 * can also be used as a fallback for systems where shm_open and shmget
31 * are not available or can't be used for some reason. Of course,
32 * mapping a file residing on an actual spinning disk is a fairly poor
33 * approximation for shared memory because writeback may hurt performance
34 * substantially, but there should be few systems where we must make do
35 * with such poor tools.
36 *
37 * As ever, Windows requires its own implementation.
38 *
39 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
40 * Portions Copyright (c) 1994, Regents of the University of California
41 *
42 *
43 * IDENTIFICATION
44 * src/backend/storage/ipc/dsm_impl.c
45 *
46 *-------------------------------------------------------------------------
47 */
48
49#include "postgres.h"
50#include "miscadmin.h"
51
52#include <fcntl.h>
53#include <unistd.h>
54#ifndef WIN32
55#include <sys/mman.h>
56#endif
57#include <sys/stat.h>
58#ifdef HAVE_SYS_IPC_H
59#include <sys/ipc.h>
60#endif
61#ifdef HAVE_SYS_SHM_H
62#include <sys/shm.h>
63#endif
64#include "common/file_perm.h"
65#include "pgstat.h"
66
67#include "portability/mem.h"
68#include "storage/dsm_impl.h"
69#include "storage/fd.h"
70#include "utils/guc.h"
71#include "utils/memutils.h"
72#include "postmaster/postmaster.h"
73
74#ifdef USE_DSM_POSIX
75static bool dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
76 void **impl_private, void **mapped_address,
77 Size *mapped_size, int elevel);
78static int dsm_impl_posix_resize(int fd, off_t size);
79#endif
80#ifdef USE_DSM_SYSV
81static bool dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size,
82 void **impl_private, void **mapped_address,
83 Size *mapped_size, int elevel);
84#endif
85#ifdef USE_DSM_WINDOWS
86static bool dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size,
87 void **impl_private, void **mapped_address,
88 Size *mapped_size, int elevel);
89#endif
90#ifdef USE_DSM_MMAP
91static bool dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size,
92 void **impl_private, void **mapped_address,
93 Size *mapped_size, int elevel);
94#endif
95static int errcode_for_dynamic_shared_memory(void);
96
97const struct config_enum_entry dynamic_shared_memory_options[] = {
98#ifdef USE_DSM_POSIX
99 {"posix", DSM_IMPL_POSIX, false},
100#endif
101#ifdef USE_DSM_SYSV
102 {"sysv", DSM_IMPL_SYSV, false},
103#endif
104#ifdef USE_DSM_WINDOWS
105 {"windows", DSM_IMPL_WINDOWS, false},
106#endif
107#ifdef USE_DSM_MMAP
108 {"mmap", DSM_IMPL_MMAP, false},
109#endif
110 {NULL, 0, false}
111};
112
113/* Implementation selector. */
114int dynamic_shared_memory_type;
115
116/* Size of buffer to be used for zero-filling. */
117#define ZBUFFER_SIZE 8192
118
119#define SEGMENT_NAME_PREFIX "Global/PostgreSQL"
120
121/*------
122 * Perform a low-level shared memory operation in a platform-specific way,
123 * as dictated by the selected implementation. Each implementation is
124 * required to implement the following primitives.
125 *
126 * DSM_OP_CREATE. Create a segment whose size is the request_size and
127 * map it.
128 *
129 * DSM_OP_ATTACH. Map the segment, whose size must be the request_size.
130 *
131 * DSM_OP_DETACH. Unmap the segment.
132 *
133 * DSM_OP_DESTROY. Unmap the segment, if it is mapped. Destroy the
134 * segment.
135 *
136 * Arguments:
137 * op: The operation to be performed.
 * handle: The handle of an existing object, or for DSM_OP_CREATE, the
 * new handle the caller wants created.
140 * request_size: For DSM_OP_CREATE, the requested size. Otherwise, 0.
141 * impl_private: Private, implementation-specific data. Will be a pointer
142 * to NULL for the first operation on a shared memory segment within this
143 * backend; thereafter, it will point to the value to which it was set
144 * on the previous call.
145 * mapped_address: Pointer to start of current mapping; pointer to NULL
146 * if none. Updated with new mapping address.
147 * mapped_size: Pointer to size of current mapping; pointer to 0 if none.
148 * Updated with new mapped size.
149 * elevel: Level at which to log errors.
150 *
151 * Return value: true on success, false on failure. When false is returned,
152 * a message should first be logged at the specified elevel, except in the
153 * case where DSM_OP_CREATE experiences a name collision, which should
154 * silently return false.
155 *-----
156 */
157bool
158dsm_impl_op(dsm_op op, dsm_handle handle, Size request_size,
159 void **impl_private, void **mapped_address, Size *mapped_size,
160 int elevel)
161{
162 Assert(op == DSM_OP_CREATE || request_size == 0);
163 Assert((op != DSM_OP_CREATE && op != DSM_OP_ATTACH) ||
164 (*mapped_address == NULL && *mapped_size == 0));
165
166 switch (dynamic_shared_memory_type)
167 {
168#ifdef USE_DSM_POSIX
169 case DSM_IMPL_POSIX:
170 return dsm_impl_posix(op, handle, request_size, impl_private,
171 mapped_address, mapped_size, elevel);
172#endif
173#ifdef USE_DSM_SYSV
174 case DSM_IMPL_SYSV:
175 return dsm_impl_sysv(op, handle, request_size, impl_private,
176 mapped_address, mapped_size, elevel);
177#endif
178#ifdef USE_DSM_WINDOWS
179 case DSM_IMPL_WINDOWS:
180 return dsm_impl_windows(op, handle, request_size, impl_private,
181 mapped_address, mapped_size, elevel);
182#endif
183#ifdef USE_DSM_MMAP
184 case DSM_IMPL_MMAP:
185 return dsm_impl_mmap(op, handle, request_size, impl_private,
186 mapped_address, mapped_size, elevel);
187#endif
188 default:
189 elog(ERROR, "unexpected dynamic shared memory type: %d",
190 dynamic_shared_memory_type);
191 return false;
192 }
193}
194
195#ifdef USE_DSM_POSIX
196/*
197 * Operating system primitives to support POSIX shared memory.
198 *
199 * POSIX shared memory segments are created and attached using shm_open()
200 * and shm_unlink(); other operations, such as sizing or mapping the
201 * segment, are performed as if the shared memory segments were files.
202 *
203 * Indeed, on some platforms, they may be implemented that way. While
204 * POSIX shared memory segments seem intended to exist in a flat namespace,
 * some operating systems may implement them as files, even going so far
 * as to treat a request for /xyz as a request to create a file by that name
207 * in the root directory. Users of such broken platforms should select
208 * a different shared memory implementation.
209 */
static bool
dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
			   void **impl_private, void **mapped_address, Size *mapped_size,
			   int elevel)
{
	char		name[64];
	int			flags;
	int			fd;
	char	   *address;

	/*
	 * POSIX shm names live in a flat, system-wide namespace; embed the
	 * handle so each segment gets a distinct name.
	 */
	snprintf(name, 64, "/PostgreSQL.%u", handle);

	/* Handle teardown cases. */
	if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
	{
		/* Unmap first; for DESTROY we additionally unlink the name below. */
		if (*mapped_address != NULL
			&& munmap(*mapped_address, *mapped_size) != 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not unmap shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		*mapped_address = NULL;
		*mapped_size = 0;
		if (op == DSM_OP_DESTROY && shm_unlink(name) != 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not remove shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		return true;
	}

	/*
	 * Create new segment or open an existing one for attach.
	 *
	 * Even though we're not going through fd.c, we should be safe against
	 * running out of file descriptors, because of NUM_RESERVED_FDS.  We're
	 * only opening one extra descriptor here, and we'll close it before
	 * returning.
	 */
	/* O_EXCL makes creation fail (EEXIST) if the name is already in use. */
	flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0);
	if ((fd = shm_open(name, flags, PG_FILE_MODE_OWNER)) == -1)
	{
		/* Per the dsm_impl_op contract, a create-time collision is silent. */
		if (errno != EEXIST)
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not open shared memory segment \"%s\": %m",
							name)));
		return false;
	}

	/*
	 * If we're attaching the segment, determine the current size; if we are
	 * creating the segment, set the size to the requested value.
	 */
	if (op == DSM_OP_ATTACH)
	{
		struct stat st;

		if (fstat(fd, &st) != 0)
		{
			int			save_errno;

			/* Back out what's already been done. */
			save_errno = errno;
			close(fd);
			errno = save_errno;

			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not stat shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		request_size = st.st_size;
	}
	else if (dsm_impl_posix_resize(fd, request_size) != 0)
	{
		int			save_errno;

		/* Back out what's already been done. */
		save_errno = errno;
		close(fd);
		shm_unlink(name);		/* we created it above, so remove it */
		errno = save_errno;

		/*
		 * If we received a query cancel or termination signal, we will have
		 * EINTR set here.  If the caller said that errors are OK here, check
		 * for interrupts immediately.
		 */
		if (errno == EINTR && elevel >= ERROR)
			CHECK_FOR_INTERRUPTS();

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m",
						name, request_size)));
		return false;
	}

	/* Map it. */
	address = mmap(NULL, request_size, PROT_READ | PROT_WRITE,
				   MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0);
	if (address == MAP_FAILED)
	{
		int			save_errno;

		/* Back out what's already been done. */
		save_errno = errno;
		close(fd);
		if (op == DSM_OP_CREATE)
			shm_unlink(name);
		errno = save_errno;

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not map shared memory segment \"%s\": %m",
						name)));
		return false;
	}
	*mapped_address = address;
	*mapped_size = request_size;
	/* The mapping survives closing the descriptor; don't leak the fd. */
	close(fd);

	return true;
}
342
343/*
344 * Set the size of a virtual memory region associated with a file descriptor.
345 * If necessary, also ensure that virtual memory is actually allocated by the
346 * operating system, to avoid nasty surprises later.
347 *
348 * Returns non-zero if either truncation or allocation fails, and sets errno.
349 */
static int
dsm_impl_posix_resize(int fd, off_t size)
{
	int			rc;

	/* Truncate (or extend) the file to the requested size. */
	rc = ftruncate(fd, size);

	/*
	 * On Linux, a shm_open fd is backed by a tmpfs file.  After resizing with
	 * ftruncate, the file may contain a hole.  Accessing memory backed by a
	 * hole causes tmpfs to allocate pages, which fails with SIGBUS if there
	 * is no more tmpfs space available.  So we ask tmpfs to allocate pages
	 * here, so we can fail gracefully with ENOSPC now rather than risking
	 * SIGBUS later.
	 */
#if defined(HAVE_POSIX_FALLOCATE) && defined(__linux__)
	/* Only attempt allocation if the truncation itself succeeded. */
	if (rc == 0)
	{
		/*
		 * We may get interrupted.  If so, just retry unless there is an
		 * interrupt pending.  This avoids the possibility of looping forever
		 * if another backend is repeatedly trying to interrupt us.
		 */
		do
		{
			rc = posix_fallocate(fd, 0, size);
		} while (rc == EINTR && !(ProcDiePending || QueryCancelPending));

		/*
		 * The caller expects errno to be set, but posix_fallocate() doesn't
		 * set it.  Instead it returns error numbers directly.  So set errno,
		 * even though we'll also return rc to indicate success or failure.
		 */
		errno = rc;
	}
#endif							/* HAVE_POSIX_FALLOCATE && __linux__ */

	return rc;
}
390
391#endif /* USE_DSM_POSIX */
392
393#ifdef USE_DSM_SYSV
394/*
395 * Operating system primitives to support System V shared memory.
396 *
397 * System V shared memory segments are manipulated using shmget(), shmat(),
398 * shmdt(), and shmctl(). As the default allocation limits for System V
399 * shared memory are usually quite low, the POSIX facilities may be
400 * preferable; but those are not supported everywhere.
401 */
static bool
dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size,
			  void **impl_private, void **mapped_address, Size *mapped_size,
			  int elevel)
{
	key_t		key;
	int			ident;
	char	   *address;
	char		name[64];
	int		   *ident_cache;

	/*
	 * POSIX shared memory and mmap-based shared memory identify segments with
	 * names.  To avoid needless error message variation, we use the handle as
	 * the name.  (It is used only in error messages below, not as a key.)
	 */
	snprintf(name, 64, "%u", handle);

	/*
	 * The System V shared memory namespace is very restricted; names are of
	 * type key_t, which is expected to be some sort of integer data type, but
	 * not necessarily the same one as dsm_handle.  Since we use dsm_handle to
	 * identify shared memory segments across processes, this might seem like
	 * a problem, but it's really not.  If dsm_handle is bigger than key_t,
	 * the cast below might truncate away some bits from the handle the
	 * user-provided, but it'll truncate exactly the same bits away in exactly
	 * the same fashion every time we use that handle, which is all that
	 * really matters.  Conversely, if dsm_handle is smaller than key_t, we
	 * won't use the full range of available key space, but that's no big deal
	 * either.
	 *
	 * We do make sure that the key isn't negative, because that might not be
	 * portable.
	 */
	key = (key_t) handle;
	if (key < 1)				/* avoid compiler warning if type is unsigned */
		key = -key;

	/*
	 * There's one special key, IPC_PRIVATE, which can't be used.  If we end
	 * up with that value by chance during a create operation, just pretend it
	 * already exists, so that caller will retry.  If we run into it anywhere
	 * else, the caller has passed a handle that doesn't correspond to
	 * anything we ever created, which should not happen.
	 */
	if (key == IPC_PRIVATE)
	{
		if (op != DSM_OP_CREATE)
			elog(DEBUG4, "System V shared memory key may not be IPC_PRIVATE");
		errno = EEXIST;			/* fake a collision so CREATE callers retry */
		return false;
	}

	/*
	 * Before we can do anything with a shared memory segment, we have to map
	 * the shared memory key to a shared memory identifier using shmget().  To
	 * avoid repeated lookups, we store the key using impl_private.
	 */
	if (*impl_private != NULL)
	{
		/* Cached from a previous operation on this segment. */
		ident_cache = *impl_private;
		ident = *ident_cache;
	}
	else
	{
		int			flags = IPCProtection;
		size_t		segsize;

		/*
		 * Allocate the memory BEFORE acquiring the resource, so that we don't
		 * leak the resource if memory allocation fails.
		 */
		ident_cache = MemoryContextAlloc(TopMemoryContext, sizeof(int));

		/*
		 * When using shmget to find an existing segment, we must pass the
		 * size as 0.  Passing a non-zero size which is greater than the
		 * actual size will result in EINVAL.
		 */
		segsize = 0;

		if (op == DSM_OP_CREATE)
		{
			flags |= IPC_CREAT | IPC_EXCL;
			segsize = request_size;
		}

		if ((ident = shmget(key, segsize, flags)) == -1)
		{
			/* EEXIST on create is silent, per the dsm_impl_op contract. */
			if (errno != EEXIST)
			{
				int			save_errno = errno;

				pfree(ident_cache);
				errno = save_errno;
				ereport(elevel,
						(errcode_for_dynamic_shared_memory(),
						 errmsg("could not get shared memory segment: %m")));
			}
			return false;
		}

		*ident_cache = ident;
		*impl_private = ident_cache;
	}

	/* Handle teardown cases. */
	if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
	{
		/*
		 * Discard the identifier cache first; "ident" was already copied out
		 * of it above, and a later operation on this handle will redo the
		 * shmget() lookup.
		 */
		pfree(ident_cache);
		*impl_private = NULL;
		if (*mapped_address != NULL && shmdt(*mapped_address) != 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not unmap shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		*mapped_address = NULL;
		*mapped_size = 0;
		if (op == DSM_OP_DESTROY && shmctl(ident, IPC_RMID, NULL) < 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not remove shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		return true;
	}

	/* If we're attaching it, we must use IPC_STAT to determine the size. */
	if (op == DSM_OP_ATTACH)
	{
		struct shmid_ds shm;

		if (shmctl(ident, IPC_STAT, &shm) != 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not stat shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		request_size = shm.shm_segsz;
	}

	/* Map it. */
	address = shmat(ident, NULL, PG_SHMAT_FLAGS);
	if (address == (void *) -1)
	{
		int			save_errno;

		/* Back out what's already been done. */
		save_errno = errno;
		if (op == DSM_OP_CREATE)
			shmctl(ident, IPC_RMID, NULL);
		errno = save_errno;

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not map shared memory segment \"%s\": %m",
						name)));
		return false;
	}
	*mapped_address = address;
	*mapped_size = request_size;

	return true;
}
573#endif
574
575#ifdef USE_DSM_WINDOWS
576/*
577 * Operating system primitives to support Windows shared memory.
578 *
579 * Windows shared memory implementation is done using file mapping
580 * which can be backed by either physical file or system paging file.
581 * Current implementation uses system paging file as other effects
582 * like performance are not clear for physical file and it is used in similar
583 * way for main shared memory in windows.
584 *
585 * A memory mapping object is a kernel object - they always get deleted when
586 * the last reference to them goes away, either explicitly via a CloseHandle or
587 * when the process containing the reference exits.
588 */
static bool
dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size,
				 void **impl_private, void **mapped_address,
				 Size *mapped_size, int elevel)
{
	char	   *address;
	HANDLE		hmap;
	char		name[64];
	MEMORY_BASIC_INFORMATION info;

	/*
	 * Storing the shared memory segment in the Global\ namespace, can allow
	 * any process running in any session to access that file mapping object
	 * provided that the caller has the required access rights.  But to avoid
	 * issues faced in main shared memory, we are using the naming convention
	 * similar to main shared memory.  We can change here once issue mentioned
	 * in GetSharedMemName is resolved.
	 */
	snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);

	/*
	 * Handle teardown cases.  Since Windows automatically destroys the object
	 * when no references remain, we can treat it the same as detach.
	 */
	if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
	{
		if (*mapped_address != NULL
			&& UnmapViewOfFile(*mapped_address) == 0)
		{
			_dosmaperr(GetLastError());
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not unmap shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		/* impl_private holds the mapping HANDLE; release our reference. */
		if (*impl_private != NULL
			&& CloseHandle(*impl_private) == 0)
		{
			_dosmaperr(GetLastError());
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not remove shared memory segment \"%s\": %m",
							name)));
			return false;
		}

		*impl_private = NULL;
		*mapped_address = NULL;
		*mapped_size = 0;
		return true;
	}

	/* Create new segment or open an existing one for attach. */
	if (op == DSM_OP_CREATE)
	{
		DWORD		size_high;
		DWORD		size_low;
		DWORD		errcode;

		/* Shifts >= the width of the type are undefined. */
#ifdef _WIN64
		size_high = request_size >> 32;
#else
		size_high = 0;
#endif
		size_low = (DWORD) request_size;

		/* CreateFileMapping might not clear the error code on success */
		SetLastError(0);

		hmap = CreateFileMapping(INVALID_HANDLE_VALUE,	/* Use the pagefile */
								 NULL,	/* Default security attrs */
								 PAGE_READWRITE,	/* Memory is read/write */
								 size_high, /* Upper 32 bits of size */
								 size_low,	/* Lower 32 bits of size */
								 name);

		/* Fetch the error code before any other call can overwrite it. */
		errcode = GetLastError();
		if (errcode == ERROR_ALREADY_EXISTS || errcode == ERROR_ACCESS_DENIED)
		{
			/*
			 * On Windows, when the segment already exists, a handle for the
			 * existing segment is returned.  We must close it before
			 * returning.  However, if the existing segment is created by a
			 * service, then it returns ERROR_ACCESS_DENIED.  We don't do
			 * _dosmaperr here, so errno won't be modified.
			 */
			if (hmap)
				CloseHandle(hmap);
			return false;
		}

		if (!hmap)
		{
			_dosmaperr(errcode);
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not create shared memory segment \"%s\": %m",
							name)));
			return false;
		}
	}
	else
	{
		hmap = OpenFileMapping(FILE_MAP_WRITE | FILE_MAP_READ,
							   FALSE,	/* do not inherit the name */
							   name);	/* name of mapping object */
		if (!hmap)
		{
			_dosmaperr(GetLastError());
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not open shared memory segment \"%s\": %m",
							name)));
			return false;
		}
	}

	/* Map it.  Size 0 maps the entire file mapping object. */
	address = MapViewOfFile(hmap, FILE_MAP_WRITE | FILE_MAP_READ,
							0, 0, 0);
	if (!address)
	{
		int			save_errno;

		_dosmaperr(GetLastError());
		/* Back out what's already been done. */
		save_errno = errno;
		CloseHandle(hmap);
		errno = save_errno;

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not map shared memory segment \"%s\": %m",
						name)));
		return false;
	}

	/*
	 * VirtualQuery gives size in page_size units, which is 4K for Windows. We
	 * need size only when we are attaching, but it's better to get the size
	 * when creating new segment to keep size consistent both for
	 * DSM_OP_CREATE and DSM_OP_ATTACH.
	 */
	if (VirtualQuery(address, &info, sizeof(info)) == 0)
	{
		int			save_errno;

		_dosmaperr(GetLastError());
		/* Back out what's already been done. */
		save_errno = errno;
		UnmapViewOfFile(address);
		CloseHandle(hmap);
		errno = save_errno;

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not stat shared memory segment \"%s\": %m",
						name)));
		return false;
	}

	*mapped_address = address;
	*mapped_size = info.RegionSize;
	*impl_private = hmap;		/* keep the handle for later teardown */

	return true;
}
758#endif
759
760#ifdef USE_DSM_MMAP
761/*
762 * Operating system primitives to support mmap-based shared memory.
763 *
764 * Calling this "shared memory" is somewhat of a misnomer, because what
765 * we're really doing is creating a bunch of files and mapping them into
766 * our address space. The operating system may feel obliged to
767 * synchronize the contents to disk even if nothing is being paged out,
768 * which will not serve us well. The user can relocate the pg_dynshmem
769 * directory to a ramdisk to avoid this problem, if available.
770 */
771static bool
772dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size,
773 void **impl_private, void **mapped_address, Size *mapped_size,
774 int elevel)
775{
776 char name[64];
777 int flags;
778 int fd;
779 char *address;
780
781 snprintf(name, 64, PG_DYNSHMEM_DIR "/" PG_DYNSHMEM_MMAP_FILE_PREFIX "%u",
782 handle);
783
784 /* Handle teardown cases. */
785 if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
786 {
787 if (*mapped_address != NULL
788 && munmap(*mapped_address, *mapped_size) != 0)
789 {
790 ereport(elevel,
791 (errcode_for_dynamic_shared_memory(),
792 errmsg("could not unmap shared memory segment \"%s\": %m",
793 name)));
794 return false;
795 }
796 *mapped_address = NULL;
797 *mapped_size = 0;
798 if (op == DSM_OP_DESTROY && unlink(name) != 0)
799 {
800 ereport(elevel,
801 (errcode_for_dynamic_shared_memory(),
802 errmsg("could not remove shared memory segment \"%s\": %m",
803 name)));
804 return false;
805 }
806 return true;
807 }
808
809 /* Create new segment or open an existing one for attach. */
810 flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0);
811 if ((fd = OpenTransientFile(name, flags)) == -1)
812 {
813 if (errno != EEXIST)
814 ereport(elevel,
815 (errcode_for_dynamic_shared_memory(),
816 errmsg("could not open shared memory segment \"%s\": %m",
817 name)));
818 return false;
819 }
820
821 /*
822 * If we're attaching the segment, determine the current size; if we are
823 * creating the segment, set the size to the requested value.
824 */
825 if (op == DSM_OP_ATTACH)
826 {
827 struct stat st;
828
829 if (fstat(fd, &st) != 0)
830 {
831 int save_errno;
832
833 /* Back out what's already been done. */
834 save_errno = errno;
835 CloseTransientFile(fd);
836 errno = save_errno;
837
838 ereport(elevel,
839 (errcode_for_dynamic_shared_memory(),
840 errmsg("could not stat shared memory segment \"%s\": %m",
841 name)));
842 return false;
843 }
844 request_size = st.st_size;
845 }
846 else
847 {
848 /*
849 * Allocate a buffer full of zeros.
850 *
851 * Note: palloc zbuffer, instead of just using a local char array, to
852 * ensure it is reasonably well-aligned; this may save a few cycles
853 * transferring data to the kernel.
854 */
855 char *zbuffer = (char *) palloc0(ZBUFFER_SIZE);
856 uint32 remaining = request_size;
857 bool success = true;
858
859 /*
860 * Zero-fill the file. We have to do this the hard way to ensure that
861 * all the file space has really been allocated, so that we don't
862 * later seg fault when accessing the memory mapping. This is pretty
863 * pessimal.
864 */
865 while (success && remaining > 0)
866 {
867 Size goal = remaining;
868
869 if (goal > ZBUFFER_SIZE)
870 goal = ZBUFFER_SIZE;
871 pgstat_report_wait_start(WAIT_EVENT_DSM_FILL_ZERO_WRITE);
872 if (write(fd, zbuffer, goal) == goal)
873 remaining -= goal;
874 else
875 success = false;
876 pgstat_report_wait_end();
877 }
878
879 if (!success)
880 {
881 int save_errno;
882
883 /* Back out what's already been done. */
884 save_errno = errno;
885 CloseTransientFile(fd);
886 unlink(name);
887 errno = save_errno ? save_errno : ENOSPC;
888
889 ereport(elevel,
890 (errcode_for_dynamic_shared_memory(),
891 errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m",
892 name, request_size)));
893 return false;
894 }
895 }
896
897 /* Map it. */
898 address = mmap(NULL, request_size, PROT_READ | PROT_WRITE,
899 MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0);
900 if (address == MAP_FAILED)
901 {
902 int save_errno;
903
904 /* Back out what's already been done. */
905 save_errno = errno;
906 CloseTransientFile(fd);
907 if (op == DSM_OP_CREATE)
908 unlink(name);
909 errno = save_errno;
910
911 ereport(elevel,
912 (errcode_for_dynamic_shared_memory(),
913 errmsg("could not map shared memory segment \"%s\": %m",
914 name)));
915 return false;
916 }
917 *mapped_address = address;
918 *mapped_size = request_size;
919
920 if (CloseTransientFile(fd))
921 {
922 ereport(elevel,
923 (errcode_for_file_access(),
924 errmsg("could not close shared memory segment \"%s\": %m",
925 name)));
926 return false;
927 }
928
929 return true;
930}
931#endif
932
933/*
934 * Implementation-specific actions that must be performed when a segment is to
935 * be preserved even when no backend has it attached.
936 *
937 * Except on Windows, we don't need to do anything at all. But since Windows
938 * cleans up segments automatically when no references remain, we duplicate
939 * the segment handle into the postmaster process. The postmaster needn't
940 * do anything to receive the handle; Windows transfers it automatically.
941 */
void
dsm_impl_pin_segment(dsm_handle handle, void *impl_private,
					 void **impl_private_pm_handle)
{
	switch (dynamic_shared_memory_type)
	{
#ifdef USE_DSM_WINDOWS
		case DSM_IMPL_WINDOWS:
			{
				HANDLE		hmap;

				/*
				 * Duplicate our mapping handle (impl_private) into the
				 * postmaster process so the segment outlives this backend.
				 */
				if (!DuplicateHandle(GetCurrentProcess(), impl_private,
									 PostmasterHandle, &hmap, 0, FALSE,
									 DUPLICATE_SAME_ACCESS))
				{
					char		name[64];

					snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
					_dosmaperr(GetLastError());
					ereport(ERROR,
							(errcode_for_dynamic_shared_memory(),
							 errmsg("could not duplicate handle for \"%s\": %m",
									name)));
				}

				/*
				 * Here, we remember the handle that we created in the
				 * postmaster process.  This handle isn't actually usable in
				 * any process other than the postmaster, but that doesn't
				 * matter.  We're just holding onto it so that, if the segment
				 * is unpinned, dsm_impl_unpin_segment can close it.
				 */
				*impl_private_pm_handle = hmap;
				break;
			}
#endif
		default:
			/* Other implementations need no action to pin a segment. */
			break;
	}
}
982
983/*
984 * Implementation-specific actions that must be performed when a segment is no
985 * longer to be preserved, so that it will be cleaned up when all backends
986 * have detached from it.
987 *
988 * Except on Windows, we don't need to do anything at all. For Windows, we
989 * close the extra handle that dsm_impl_pin_segment created in the
990 * postmaster's process space.
991 */
void
dsm_impl_unpin_segment(dsm_handle handle, void **impl_private)
{
	switch (dynamic_shared_memory_type)
	{
#ifdef USE_DSM_WINDOWS
		case DSM_IMPL_WINDOWS:
			{
				/*
				 * DUPLICATE_CLOSE_SOURCE with NULL target closes the handle
				 * held in the postmaster's process space, releasing the
				 * reference that dsm_impl_pin_segment created.
				 */
				if (*impl_private &&
					!DuplicateHandle(PostmasterHandle, *impl_private,
									 NULL, NULL, 0, FALSE,
									 DUPLICATE_CLOSE_SOURCE))
				{
					char		name[64];

					snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
					_dosmaperr(GetLastError());
					ereport(ERROR,
							(errcode_for_dynamic_shared_memory(),
							 errmsg("could not duplicate handle for \"%s\": %m",
									name)));
				}

				*impl_private = NULL;
				break;
			}
#endif
		default:
			/* Other implementations need no action to unpin a segment. */
			break;
	}
}
1023
1024static int
1025errcode_for_dynamic_shared_memory(void)
1026{
1027 if (errno == EFBIG || errno == ENOMEM)
1028 return errcode(ERRCODE_OUT_OF_MEMORY);
1029 else
1030 return errcode_for_file_access();
1031}
1032