1 | /*------------------------------------------------------------------------- |
2 | * |
3 | * dsm_impl.c |
4 | * manage dynamic shared memory segments |
5 | * |
6 | * This file provides low-level APIs for creating and destroying shared |
7 | * memory segments using several different possible techniques. We refer |
8 | * to these segments as dynamic because they can be created, altered, and |
9 | * destroyed at any point during the server life cycle. This is unlike |
10 | * the main shared memory segment, of which there is always exactly one |
11 | * and which is always mapped at a fixed address in every PostgreSQL |
12 | * background process. |
13 | * |
14 | * Because not all systems provide the same primitives in this area, nor |
15 | * do all primitives behave the same way on all systems, we provide |
16 | * several implementations of this facility. Many systems implement |
17 | * POSIX shared memory (shm_open etc.), which is well-suited to our needs |
18 | * in this area, with the exception that shared memory identifiers live |
19 | * in a flat system-wide namespace, raising the uncomfortable prospect of |
20 | * name collisions with other processes (including other copies of |
21 | * PostgreSQL) running on the same system. Some systems only support |
22 | * the older System V shared memory interface (shmget etc.) which is |
23 | * also usable; however, the default allocation limits are often quite |
24 | * small, and the namespace is even more restricted. |
25 | * |
26 | * We also provide an mmap-based shared memory implementation. This may |
27 | * be useful on systems that provide shared memory via a special-purpose |
28 | * filesystem; by opting for this implementation, the user can even |
29 | * control precisely where their shared memory segments are placed. It |
30 | * can also be used as a fallback for systems where shm_open and shmget |
31 | * are not available or can't be used for some reason. Of course, |
32 | * mapping a file residing on an actual spinning disk is a fairly poor |
33 | * approximation for shared memory because writeback may hurt performance |
34 | * substantially, but there should be few systems where we must make do |
35 | * with such poor tools. |
36 | * |
37 | * As ever, Windows requires its own implementation. |
38 | * |
39 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
40 | * Portions Copyright (c) 1994, Regents of the University of California |
41 | * |
42 | * |
43 | * IDENTIFICATION |
44 | * src/backend/storage/ipc/dsm_impl.c |
45 | * |
46 | *------------------------------------------------------------------------- |
47 | */ |
48 | |
49 | #include "postgres.h" |
50 | #include "miscadmin.h" |
51 | |
52 | #include <fcntl.h> |
53 | #include <unistd.h> |
54 | #ifndef WIN32 |
55 | #include <sys/mman.h> |
56 | #endif |
57 | #include <sys/stat.h> |
58 | #ifdef HAVE_SYS_IPC_H |
59 | #include <sys/ipc.h> |
60 | #endif |
61 | #ifdef HAVE_SYS_SHM_H |
62 | #include <sys/shm.h> |
63 | #endif |
64 | #include "common/file_perm.h" |
65 | #include "pgstat.h" |
66 | |
67 | #include "portability/mem.h" |
68 | #include "storage/dsm_impl.h" |
69 | #include "storage/fd.h" |
70 | #include "utils/guc.h" |
71 | #include "utils/memutils.h" |
72 | #include "postmaster/postmaster.h" |
73 | |
/*
 * Forward declarations for the platform-specific implementations.  Each one
 * is compiled only when the corresponding USE_DSM_* symbol is defined for
 * this build, and each obeys the common contract described above dsm_impl_op.
 */
#ifdef USE_DSM_POSIX
static bool dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
			   void **impl_private, void **mapped_address,
			   Size *mapped_size, int elevel);
static int	dsm_impl_posix_resize(int fd, off_t size);
#endif
#ifdef USE_DSM_SYSV
static bool dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size,
			  void **impl_private, void **mapped_address,
			  Size *mapped_size, int elevel);
#endif
#ifdef USE_DSM_WINDOWS
static bool dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size,
				 void **impl_private, void **mapped_address,
				 Size *mapped_size, int elevel);
#endif
#ifdef USE_DSM_MMAP
static bool dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size,
			  void **impl_private, void **mapped_address,
			  Size *mapped_size, int elevel);
#endif
/* Translate the current errno into a suitable ereport errcode. */
static int	errcode_for_dynamic_shared_memory(void);
96 | |
/*
 * GUC support: the set of legal values for dynamic_shared_memory_type.
 * Only the implementations actually compiled into this build are offered.
 */
const struct config_enum_entry dynamic_shared_memory_options[] = {
#ifdef USE_DSM_POSIX
	{"posix", DSM_IMPL_POSIX, false},
#endif
#ifdef USE_DSM_SYSV
	{"sysv", DSM_IMPL_SYSV, false},
#endif
#ifdef USE_DSM_WINDOWS
	{"windows", DSM_IMPL_WINDOWS, false},
#endif
#ifdef USE_DSM_MMAP
	{"mmap", DSM_IMPL_MMAP, false},
#endif
	{NULL, 0, false}			/* list terminator */
};
112 | |
/* Implementation selector; set from the dynamic_shared_memory_type GUC. */
int			dynamic_shared_memory_type;

/* Size of buffer to be used for zero-filling (mmap implementation). */
#define ZBUFFER_SIZE			8192

/* Name prefix for Windows file mapping objects (see dsm_impl_windows). */
#define SEGMENT_NAME_PREFIX		"Global/PostgreSQL"
120 | |
121 | /*------ |
122 | * Perform a low-level shared memory operation in a platform-specific way, |
123 | * as dictated by the selected implementation. Each implementation is |
124 | * required to implement the following primitives. |
125 | * |
126 | * DSM_OP_CREATE. Create a segment whose size is the request_size and |
127 | * map it. |
128 | * |
129 | * DSM_OP_ATTACH. Map the segment, whose size must be the request_size. |
130 | * |
131 | * DSM_OP_DETACH. Unmap the segment. |
132 | * |
133 | * DSM_OP_DESTROY. Unmap the segment, if it is mapped. Destroy the |
134 | * segment. |
135 | * |
136 | * Arguments: |
137 | * op: The operation to be performed. |
 * handle: The handle of an existing object, or, for DSM_OP_CREATE, the
 * new handle the caller wants created.
140 | * request_size: For DSM_OP_CREATE, the requested size. Otherwise, 0. |
141 | * impl_private: Private, implementation-specific data. Will be a pointer |
142 | * to NULL for the first operation on a shared memory segment within this |
143 | * backend; thereafter, it will point to the value to which it was set |
144 | * on the previous call. |
145 | * mapped_address: Pointer to start of current mapping; pointer to NULL |
146 | * if none. Updated with new mapping address. |
147 | * mapped_size: Pointer to size of current mapping; pointer to 0 if none. |
148 | * Updated with new mapped size. |
149 | * elevel: Level at which to log errors. |
150 | * |
151 | * Return value: true on success, false on failure. When false is returned, |
152 | * a message should first be logged at the specified elevel, except in the |
153 | * case where DSM_OP_CREATE experiences a name collision, which should |
154 | * silently return false. |
155 | *----- |
156 | */ |
157 | bool |
158 | dsm_impl_op(dsm_op op, dsm_handle handle, Size request_size, |
159 | void **impl_private, void **mapped_address, Size *mapped_size, |
160 | int elevel) |
161 | { |
162 | Assert(op == DSM_OP_CREATE || request_size == 0); |
163 | Assert((op != DSM_OP_CREATE && op != DSM_OP_ATTACH) || |
164 | (*mapped_address == NULL && *mapped_size == 0)); |
165 | |
166 | switch (dynamic_shared_memory_type) |
167 | { |
168 | #ifdef USE_DSM_POSIX |
169 | case DSM_IMPL_POSIX: |
170 | return dsm_impl_posix(op, handle, request_size, impl_private, |
171 | mapped_address, mapped_size, elevel); |
172 | #endif |
173 | #ifdef USE_DSM_SYSV |
174 | case DSM_IMPL_SYSV: |
175 | return dsm_impl_sysv(op, handle, request_size, impl_private, |
176 | mapped_address, mapped_size, elevel); |
177 | #endif |
178 | #ifdef USE_DSM_WINDOWS |
179 | case DSM_IMPL_WINDOWS: |
180 | return dsm_impl_windows(op, handle, request_size, impl_private, |
181 | mapped_address, mapped_size, elevel); |
182 | #endif |
183 | #ifdef USE_DSM_MMAP |
184 | case DSM_IMPL_MMAP: |
185 | return dsm_impl_mmap(op, handle, request_size, impl_private, |
186 | mapped_address, mapped_size, elevel); |
187 | #endif |
188 | default: |
189 | elog(ERROR, "unexpected dynamic shared memory type: %d" , |
190 | dynamic_shared_memory_type); |
191 | return false; |
192 | } |
193 | } |
194 | |
195 | #ifdef USE_DSM_POSIX |
196 | /* |
197 | * Operating system primitives to support POSIX shared memory. |
198 | * |
199 | * POSIX shared memory segments are created and attached using shm_open() |
200 | * and shm_unlink(); other operations, such as sizing or mapping the |
201 | * segment, are performed as if the shared memory segments were files. |
202 | * |
203 | * Indeed, on some platforms, they may be implemented that way. While |
204 | * POSIX shared memory segments seem intended to exist in a flat namespace, |
205 | * some operating systems may implement them as files, even going so far |
206 | * to treat a request for /xyz as a request to create a file by that name |
207 | * in the root directory. Users of such broken platforms should select |
208 | * a different shared memory implementation. |
209 | */ |
210 | static bool |
211 | dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size, |
212 | void **impl_private, void **mapped_address, Size *mapped_size, |
213 | int elevel) |
214 | { |
215 | char name[64]; |
216 | int flags; |
217 | int fd; |
218 | char *address; |
219 | |
220 | snprintf(name, 64, "/PostgreSQL.%u" , handle); |
221 | |
222 | /* Handle teardown cases. */ |
223 | if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY) |
224 | { |
225 | if (*mapped_address != NULL |
226 | && munmap(*mapped_address, *mapped_size) != 0) |
227 | { |
228 | ereport(elevel, |
229 | (errcode_for_dynamic_shared_memory(), |
230 | errmsg("could not unmap shared memory segment \"%s\": %m" , |
231 | name))); |
232 | return false; |
233 | } |
234 | *mapped_address = NULL; |
235 | *mapped_size = 0; |
236 | if (op == DSM_OP_DESTROY && shm_unlink(name) != 0) |
237 | { |
238 | ereport(elevel, |
239 | (errcode_for_dynamic_shared_memory(), |
240 | errmsg("could not remove shared memory segment \"%s\": %m" , |
241 | name))); |
242 | return false; |
243 | } |
244 | return true; |
245 | } |
246 | |
247 | /* |
248 | * Create new segment or open an existing one for attach. |
249 | * |
250 | * Even though we're not going through fd.c, we should be safe against |
251 | * running out of file descriptors, because of NUM_RESERVED_FDS. We're |
252 | * only opening one extra descriptor here, and we'll close it before |
253 | * returning. |
254 | */ |
255 | flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0); |
256 | if ((fd = shm_open(name, flags, PG_FILE_MODE_OWNER)) == -1) |
257 | { |
258 | if (errno != EEXIST) |
259 | ereport(elevel, |
260 | (errcode_for_dynamic_shared_memory(), |
261 | errmsg("could not open shared memory segment \"%s\": %m" , |
262 | name))); |
263 | return false; |
264 | } |
265 | |
266 | /* |
267 | * If we're attaching the segment, determine the current size; if we are |
268 | * creating the segment, set the size to the requested value. |
269 | */ |
270 | if (op == DSM_OP_ATTACH) |
271 | { |
272 | struct stat st; |
273 | |
274 | if (fstat(fd, &st) != 0) |
275 | { |
276 | int save_errno; |
277 | |
278 | /* Back out what's already been done. */ |
279 | save_errno = errno; |
280 | close(fd); |
281 | errno = save_errno; |
282 | |
283 | ereport(elevel, |
284 | (errcode_for_dynamic_shared_memory(), |
285 | errmsg("could not stat shared memory segment \"%s\": %m" , |
286 | name))); |
287 | return false; |
288 | } |
289 | request_size = st.st_size; |
290 | } |
291 | else if (dsm_impl_posix_resize(fd, request_size) != 0) |
292 | { |
293 | int save_errno; |
294 | |
295 | /* Back out what's already been done. */ |
296 | save_errno = errno; |
297 | close(fd); |
298 | shm_unlink(name); |
299 | errno = save_errno; |
300 | |
301 | /* |
302 | * If we received a query cancel or termination signal, we will have |
303 | * EINTR set here. If the caller said that errors are OK here, check |
304 | * for interrupts immediately. |
305 | */ |
306 | if (errno == EINTR && elevel >= ERROR) |
307 | CHECK_FOR_INTERRUPTS(); |
308 | |
309 | ereport(elevel, |
310 | (errcode_for_dynamic_shared_memory(), |
311 | errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m" , |
312 | name, request_size))); |
313 | return false; |
314 | } |
315 | |
316 | /* Map it. */ |
317 | address = mmap(NULL, request_size, PROT_READ | PROT_WRITE, |
318 | MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0); |
319 | if (address == MAP_FAILED) |
320 | { |
321 | int save_errno; |
322 | |
323 | /* Back out what's already been done. */ |
324 | save_errno = errno; |
325 | close(fd); |
326 | if (op == DSM_OP_CREATE) |
327 | shm_unlink(name); |
328 | errno = save_errno; |
329 | |
330 | ereport(elevel, |
331 | (errcode_for_dynamic_shared_memory(), |
332 | errmsg("could not map shared memory segment \"%s\": %m" , |
333 | name))); |
334 | return false; |
335 | } |
336 | *mapped_address = address; |
337 | *mapped_size = request_size; |
338 | close(fd); |
339 | |
340 | return true; |
341 | } |
342 | |
343 | /* |
344 | * Set the size of a virtual memory region associated with a file descriptor. |
345 | * If necessary, also ensure that virtual memory is actually allocated by the |
346 | * operating system, to avoid nasty surprises later. |
347 | * |
348 | * Returns non-zero if either truncation or allocation fails, and sets errno. |
349 | */ |
350 | static int |
351 | dsm_impl_posix_resize(int fd, off_t size) |
352 | { |
353 | int rc; |
354 | |
355 | /* Truncate (or extend) the file to the requested size. */ |
356 | rc = ftruncate(fd, size); |
357 | |
358 | /* |
359 | * On Linux, a shm_open fd is backed by a tmpfs file. After resizing with |
360 | * ftruncate, the file may contain a hole. Accessing memory backed by a |
361 | * hole causes tmpfs to allocate pages, which fails with SIGBUS if there |
362 | * is no more tmpfs space available. So we ask tmpfs to allocate pages |
363 | * here, so we can fail gracefully with ENOSPC now rather than risking |
364 | * SIGBUS later. |
365 | */ |
366 | #if defined(HAVE_POSIX_FALLOCATE) && defined(__linux__) |
367 | if (rc == 0) |
368 | { |
369 | /* |
370 | * We may get interrupted. If so, just retry unless there is an |
371 | * interrupt pending. This avoids the possibility of looping forever |
372 | * if another backend is repeatedly trying to interrupt us. |
373 | */ |
374 | do |
375 | { |
376 | rc = posix_fallocate(fd, 0, size); |
377 | } while (rc == EINTR && !(ProcDiePending || QueryCancelPending)); |
378 | |
379 | /* |
380 | * The caller expects errno to be set, but posix_fallocate() doesn't |
381 | * set it. Instead it returns error numbers directly. So set errno, |
382 | * even though we'll also return rc to indicate success or failure. |
383 | */ |
384 | errno = rc; |
385 | } |
386 | #endif /* HAVE_POSIX_FALLOCATE && __linux__ */ |
387 | |
388 | return rc; |
389 | } |
390 | |
391 | #endif /* USE_DSM_POSIX */ |
392 | |
393 | #ifdef USE_DSM_SYSV |
394 | /* |
395 | * Operating system primitives to support System V shared memory. |
396 | * |
397 | * System V shared memory segments are manipulated using shmget(), shmat(), |
398 | * shmdt(), and shmctl(). As the default allocation limits for System V |
399 | * shared memory are usually quite low, the POSIX facilities may be |
400 | * preferable; but those are not supported everywhere. |
401 | */ |
402 | static bool |
403 | dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size, |
404 | void **impl_private, void **mapped_address, Size *mapped_size, |
405 | int elevel) |
406 | { |
407 | key_t key; |
408 | int ident; |
409 | char *address; |
410 | char name[64]; |
411 | int *ident_cache; |
412 | |
413 | /* |
414 | * POSIX shared memory and mmap-based shared memory identify segments with |
415 | * names. To avoid needless error message variation, we use the handle as |
416 | * the name. |
417 | */ |
418 | snprintf(name, 64, "%u" , handle); |
419 | |
420 | /* |
421 | * The System V shared memory namespace is very restricted; names are of |
422 | * type key_t, which is expected to be some sort of integer data type, but |
423 | * not necessarily the same one as dsm_handle. Since we use dsm_handle to |
424 | * identify shared memory segments across processes, this might seem like |
425 | * a problem, but it's really not. If dsm_handle is bigger than key_t, |
426 | * the cast below might truncate away some bits from the handle the |
427 | * user-provided, but it'll truncate exactly the same bits away in exactly |
428 | * the same fashion every time we use that handle, which is all that |
429 | * really matters. Conversely, if dsm_handle is smaller than key_t, we |
430 | * won't use the full range of available key space, but that's no big deal |
431 | * either. |
432 | * |
433 | * We do make sure that the key isn't negative, because that might not be |
434 | * portable. |
435 | */ |
436 | key = (key_t) handle; |
437 | if (key < 1) /* avoid compiler warning if type is unsigned */ |
438 | key = -key; |
439 | |
440 | /* |
441 | * There's one special key, IPC_PRIVATE, which can't be used. If we end |
442 | * up with that value by chance during a create operation, just pretend it |
443 | * already exists, so that caller will retry. If we run into it anywhere |
444 | * else, the caller has passed a handle that doesn't correspond to |
445 | * anything we ever created, which should not happen. |
446 | */ |
447 | if (key == IPC_PRIVATE) |
448 | { |
449 | if (op != DSM_OP_CREATE) |
450 | elog(DEBUG4, "System V shared memory key may not be IPC_PRIVATE" ); |
451 | errno = EEXIST; |
452 | return false; |
453 | } |
454 | |
455 | /* |
456 | * Before we can do anything with a shared memory segment, we have to map |
457 | * the shared memory key to a shared memory identifier using shmget(). To |
458 | * avoid repeated lookups, we store the key using impl_private. |
459 | */ |
460 | if (*impl_private != NULL) |
461 | { |
462 | ident_cache = *impl_private; |
463 | ident = *ident_cache; |
464 | } |
465 | else |
466 | { |
467 | int flags = IPCProtection; |
468 | size_t segsize; |
469 | |
470 | /* |
471 | * Allocate the memory BEFORE acquiring the resource, so that we don't |
472 | * leak the resource if memory allocation fails. |
473 | */ |
474 | ident_cache = MemoryContextAlloc(TopMemoryContext, sizeof(int)); |
475 | |
476 | /* |
477 | * When using shmget to find an existing segment, we must pass the |
478 | * size as 0. Passing a non-zero size which is greater than the |
479 | * actual size will result in EINVAL. |
480 | */ |
481 | segsize = 0; |
482 | |
483 | if (op == DSM_OP_CREATE) |
484 | { |
485 | flags |= IPC_CREAT | IPC_EXCL; |
486 | segsize = request_size; |
487 | } |
488 | |
489 | if ((ident = shmget(key, segsize, flags)) == -1) |
490 | { |
491 | if (errno != EEXIST) |
492 | { |
493 | int save_errno = errno; |
494 | |
495 | pfree(ident_cache); |
496 | errno = save_errno; |
497 | ereport(elevel, |
498 | (errcode_for_dynamic_shared_memory(), |
499 | errmsg("could not get shared memory segment: %m" ))); |
500 | } |
501 | return false; |
502 | } |
503 | |
504 | *ident_cache = ident; |
505 | *impl_private = ident_cache; |
506 | } |
507 | |
508 | /* Handle teardown cases. */ |
509 | if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY) |
510 | { |
511 | pfree(ident_cache); |
512 | *impl_private = NULL; |
513 | if (*mapped_address != NULL && shmdt(*mapped_address) != 0) |
514 | { |
515 | ereport(elevel, |
516 | (errcode_for_dynamic_shared_memory(), |
517 | errmsg("could not unmap shared memory segment \"%s\": %m" , |
518 | name))); |
519 | return false; |
520 | } |
521 | *mapped_address = NULL; |
522 | *mapped_size = 0; |
523 | if (op == DSM_OP_DESTROY && shmctl(ident, IPC_RMID, NULL) < 0) |
524 | { |
525 | ereport(elevel, |
526 | (errcode_for_dynamic_shared_memory(), |
527 | errmsg("could not remove shared memory segment \"%s\": %m" , |
528 | name))); |
529 | return false; |
530 | } |
531 | return true; |
532 | } |
533 | |
534 | /* If we're attaching it, we must use IPC_STAT to determine the size. */ |
535 | if (op == DSM_OP_ATTACH) |
536 | { |
537 | struct shmid_ds shm; |
538 | |
539 | if (shmctl(ident, IPC_STAT, &shm) != 0) |
540 | { |
541 | ereport(elevel, |
542 | (errcode_for_dynamic_shared_memory(), |
543 | errmsg("could not stat shared memory segment \"%s\": %m" , |
544 | name))); |
545 | return false; |
546 | } |
547 | request_size = shm.shm_segsz; |
548 | } |
549 | |
550 | /* Map it. */ |
551 | address = shmat(ident, NULL, PG_SHMAT_FLAGS); |
552 | if (address == (void *) -1) |
553 | { |
554 | int save_errno; |
555 | |
556 | /* Back out what's already been done. */ |
557 | save_errno = errno; |
558 | if (op == DSM_OP_CREATE) |
559 | shmctl(ident, IPC_RMID, NULL); |
560 | errno = save_errno; |
561 | |
562 | ereport(elevel, |
563 | (errcode_for_dynamic_shared_memory(), |
564 | errmsg("could not map shared memory segment \"%s\": %m" , |
565 | name))); |
566 | return false; |
567 | } |
568 | *mapped_address = address; |
569 | *mapped_size = request_size; |
570 | |
571 | return true; |
572 | } |
573 | #endif |
574 | |
575 | #ifdef USE_DSM_WINDOWS |
576 | /* |
577 | * Operating system primitives to support Windows shared memory. |
578 | * |
579 | * Windows shared memory implementation is done using file mapping |
580 | * which can be backed by either physical file or system paging file. |
581 | * Current implementation uses system paging file as other effects |
582 | * like performance are not clear for physical file and it is used in similar |
583 | * way for main shared memory in windows. |
584 | * |
585 | * A memory mapping object is a kernel object - they always get deleted when |
586 | * the last reference to them goes away, either explicitly via a CloseHandle or |
587 | * when the process containing the reference exits. |
588 | */ |
589 | static bool |
590 | dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size, |
591 | void **impl_private, void **mapped_address, |
592 | Size *mapped_size, int elevel) |
593 | { |
594 | char *address; |
595 | HANDLE hmap; |
596 | char name[64]; |
597 | MEMORY_BASIC_INFORMATION info; |
598 | |
599 | /* |
600 | * Storing the shared memory segment in the Global\ namespace, can allow |
601 | * any process running in any session to access that file mapping object |
602 | * provided that the caller has the required access rights. But to avoid |
603 | * issues faced in main shared memory, we are using the naming convention |
604 | * similar to main shared memory. We can change here once issue mentioned |
605 | * in GetSharedMemName is resolved. |
606 | */ |
607 | snprintf(name, 64, "%s.%u" , SEGMENT_NAME_PREFIX, handle); |
608 | |
609 | /* |
610 | * Handle teardown cases. Since Windows automatically destroys the object |
611 | * when no references remain, we can treat it the same as detach. |
612 | */ |
613 | if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY) |
614 | { |
615 | if (*mapped_address != NULL |
616 | && UnmapViewOfFile(*mapped_address) == 0) |
617 | { |
618 | _dosmaperr(GetLastError()); |
619 | ereport(elevel, |
620 | (errcode_for_dynamic_shared_memory(), |
621 | errmsg("could not unmap shared memory segment \"%s\": %m" , |
622 | name))); |
623 | return false; |
624 | } |
625 | if (*impl_private != NULL |
626 | && CloseHandle(*impl_private) == 0) |
627 | { |
628 | _dosmaperr(GetLastError()); |
629 | ereport(elevel, |
630 | (errcode_for_dynamic_shared_memory(), |
631 | errmsg("could not remove shared memory segment \"%s\": %m" , |
632 | name))); |
633 | return false; |
634 | } |
635 | |
636 | *impl_private = NULL; |
637 | *mapped_address = NULL; |
638 | *mapped_size = 0; |
639 | return true; |
640 | } |
641 | |
642 | /* Create new segment or open an existing one for attach. */ |
643 | if (op == DSM_OP_CREATE) |
644 | { |
645 | DWORD size_high; |
646 | DWORD size_low; |
647 | DWORD errcode; |
648 | |
649 | /* Shifts >= the width of the type are undefined. */ |
650 | #ifdef _WIN64 |
651 | size_high = request_size >> 32; |
652 | #else |
653 | size_high = 0; |
654 | #endif |
655 | size_low = (DWORD) request_size; |
656 | |
657 | /* CreateFileMapping might not clear the error code on success */ |
658 | SetLastError(0); |
659 | |
660 | hmap = CreateFileMapping(INVALID_HANDLE_VALUE, /* Use the pagefile */ |
661 | NULL, /* Default security attrs */ |
662 | PAGE_READWRITE, /* Memory is read/write */ |
663 | size_high, /* Upper 32 bits of size */ |
664 | size_low, /* Lower 32 bits of size */ |
665 | name); |
666 | |
667 | errcode = GetLastError(); |
668 | if (errcode == ERROR_ALREADY_EXISTS || errcode == ERROR_ACCESS_DENIED) |
669 | { |
670 | /* |
671 | * On Windows, when the segment already exists, a handle for the |
672 | * existing segment is returned. We must close it before |
673 | * returning. However, if the existing segment is created by a |
674 | * service, then it returns ERROR_ACCESS_DENIED. We don't do |
675 | * _dosmaperr here, so errno won't be modified. |
676 | */ |
677 | if (hmap) |
678 | CloseHandle(hmap); |
679 | return false; |
680 | } |
681 | |
682 | if (!hmap) |
683 | { |
684 | _dosmaperr(errcode); |
685 | ereport(elevel, |
686 | (errcode_for_dynamic_shared_memory(), |
687 | errmsg("could not create shared memory segment \"%s\": %m" , |
688 | name))); |
689 | return false; |
690 | } |
691 | } |
692 | else |
693 | { |
694 | hmap = OpenFileMapping(FILE_MAP_WRITE | FILE_MAP_READ, |
695 | FALSE, /* do not inherit the name */ |
696 | name); /* name of mapping object */ |
697 | if (!hmap) |
698 | { |
699 | _dosmaperr(GetLastError()); |
700 | ereport(elevel, |
701 | (errcode_for_dynamic_shared_memory(), |
702 | errmsg("could not open shared memory segment \"%s\": %m" , |
703 | name))); |
704 | return false; |
705 | } |
706 | } |
707 | |
708 | /* Map it. */ |
709 | address = MapViewOfFile(hmap, FILE_MAP_WRITE | FILE_MAP_READ, |
710 | 0, 0, 0); |
711 | if (!address) |
712 | { |
713 | int save_errno; |
714 | |
715 | _dosmaperr(GetLastError()); |
716 | /* Back out what's already been done. */ |
717 | save_errno = errno; |
718 | CloseHandle(hmap); |
719 | errno = save_errno; |
720 | |
721 | ereport(elevel, |
722 | (errcode_for_dynamic_shared_memory(), |
723 | errmsg("could not map shared memory segment \"%s\": %m" , |
724 | name))); |
725 | return false; |
726 | } |
727 | |
728 | /* |
729 | * VirtualQuery gives size in page_size units, which is 4K for Windows. We |
730 | * need size only when we are attaching, but it's better to get the size |
731 | * when creating new segment to keep size consistent both for |
732 | * DSM_OP_CREATE and DSM_OP_ATTACH. |
733 | */ |
734 | if (VirtualQuery(address, &info, sizeof(info)) == 0) |
735 | { |
736 | int save_errno; |
737 | |
738 | _dosmaperr(GetLastError()); |
739 | /* Back out what's already been done. */ |
740 | save_errno = errno; |
741 | UnmapViewOfFile(address); |
742 | CloseHandle(hmap); |
743 | errno = save_errno; |
744 | |
745 | ereport(elevel, |
746 | (errcode_for_dynamic_shared_memory(), |
747 | errmsg("could not stat shared memory segment \"%s\": %m" , |
748 | name))); |
749 | return false; |
750 | } |
751 | |
752 | *mapped_address = address; |
753 | *mapped_size = info.RegionSize; |
754 | *impl_private = hmap; |
755 | |
756 | return true; |
757 | } |
758 | #endif |
759 | |
760 | #ifdef USE_DSM_MMAP |
761 | /* |
762 | * Operating system primitives to support mmap-based shared memory. |
763 | * |
764 | * Calling this "shared memory" is somewhat of a misnomer, because what |
765 | * we're really doing is creating a bunch of files and mapping them into |
766 | * our address space. The operating system may feel obliged to |
767 | * synchronize the contents to disk even if nothing is being paged out, |
768 | * which will not serve us well. The user can relocate the pg_dynshmem |
769 | * directory to a ramdisk to avoid this problem, if available. |
770 | */ |
771 | static bool |
772 | dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size, |
773 | void **impl_private, void **mapped_address, Size *mapped_size, |
774 | int elevel) |
775 | { |
776 | char name[64]; |
777 | int flags; |
778 | int fd; |
779 | char *address; |
780 | |
781 | snprintf(name, 64, PG_DYNSHMEM_DIR "/" PG_DYNSHMEM_MMAP_FILE_PREFIX "%u" , |
782 | handle); |
783 | |
784 | /* Handle teardown cases. */ |
785 | if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY) |
786 | { |
787 | if (*mapped_address != NULL |
788 | && munmap(*mapped_address, *mapped_size) != 0) |
789 | { |
790 | ereport(elevel, |
791 | (errcode_for_dynamic_shared_memory(), |
792 | errmsg("could not unmap shared memory segment \"%s\": %m" , |
793 | name))); |
794 | return false; |
795 | } |
796 | *mapped_address = NULL; |
797 | *mapped_size = 0; |
798 | if (op == DSM_OP_DESTROY && unlink(name) != 0) |
799 | { |
800 | ereport(elevel, |
801 | (errcode_for_dynamic_shared_memory(), |
802 | errmsg("could not remove shared memory segment \"%s\": %m" , |
803 | name))); |
804 | return false; |
805 | } |
806 | return true; |
807 | } |
808 | |
809 | /* Create new segment or open an existing one for attach. */ |
810 | flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0); |
811 | if ((fd = OpenTransientFile(name, flags)) == -1) |
812 | { |
813 | if (errno != EEXIST) |
814 | ereport(elevel, |
815 | (errcode_for_dynamic_shared_memory(), |
816 | errmsg("could not open shared memory segment \"%s\": %m" , |
817 | name))); |
818 | return false; |
819 | } |
820 | |
821 | /* |
822 | * If we're attaching the segment, determine the current size; if we are |
823 | * creating the segment, set the size to the requested value. |
824 | */ |
825 | if (op == DSM_OP_ATTACH) |
826 | { |
827 | struct stat st; |
828 | |
829 | if (fstat(fd, &st) != 0) |
830 | { |
831 | int save_errno; |
832 | |
833 | /* Back out what's already been done. */ |
834 | save_errno = errno; |
835 | CloseTransientFile(fd); |
836 | errno = save_errno; |
837 | |
838 | ereport(elevel, |
839 | (errcode_for_dynamic_shared_memory(), |
840 | errmsg("could not stat shared memory segment \"%s\": %m" , |
841 | name))); |
842 | return false; |
843 | } |
844 | request_size = st.st_size; |
845 | } |
846 | else |
847 | { |
848 | /* |
849 | * Allocate a buffer full of zeros. |
850 | * |
851 | * Note: palloc zbuffer, instead of just using a local char array, to |
852 | * ensure it is reasonably well-aligned; this may save a few cycles |
853 | * transferring data to the kernel. |
854 | */ |
855 | char *zbuffer = (char *) palloc0(ZBUFFER_SIZE); |
856 | uint32 remaining = request_size; |
857 | bool success = true; |
858 | |
859 | /* |
860 | * Zero-fill the file. We have to do this the hard way to ensure that |
861 | * all the file space has really been allocated, so that we don't |
862 | * later seg fault when accessing the memory mapping. This is pretty |
863 | * pessimal. |
864 | */ |
865 | while (success && remaining > 0) |
866 | { |
867 | Size goal = remaining; |
868 | |
869 | if (goal > ZBUFFER_SIZE) |
870 | goal = ZBUFFER_SIZE; |
871 | pgstat_report_wait_start(WAIT_EVENT_DSM_FILL_ZERO_WRITE); |
872 | if (write(fd, zbuffer, goal) == goal) |
873 | remaining -= goal; |
874 | else |
875 | success = false; |
876 | pgstat_report_wait_end(); |
877 | } |
878 | |
879 | if (!success) |
880 | { |
881 | int save_errno; |
882 | |
883 | /* Back out what's already been done. */ |
884 | save_errno = errno; |
885 | CloseTransientFile(fd); |
886 | unlink(name); |
887 | errno = save_errno ? save_errno : ENOSPC; |
888 | |
889 | ereport(elevel, |
890 | (errcode_for_dynamic_shared_memory(), |
891 | errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m" , |
892 | name, request_size))); |
893 | return false; |
894 | } |
895 | } |
896 | |
897 | /* Map it. */ |
898 | address = mmap(NULL, request_size, PROT_READ | PROT_WRITE, |
899 | MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0); |
900 | if (address == MAP_FAILED) |
901 | { |
902 | int save_errno; |
903 | |
904 | /* Back out what's already been done. */ |
905 | save_errno = errno; |
906 | CloseTransientFile(fd); |
907 | if (op == DSM_OP_CREATE) |
908 | unlink(name); |
909 | errno = save_errno; |
910 | |
911 | ereport(elevel, |
912 | (errcode_for_dynamic_shared_memory(), |
913 | errmsg("could not map shared memory segment \"%s\": %m" , |
914 | name))); |
915 | return false; |
916 | } |
917 | *mapped_address = address; |
918 | *mapped_size = request_size; |
919 | |
920 | if (CloseTransientFile(fd)) |
921 | { |
922 | ereport(elevel, |
923 | (errcode_for_file_access(), |
924 | errmsg("could not close shared memory segment \"%s\": %m" , |
925 | name))); |
926 | return false; |
927 | } |
928 | |
929 | return true; |
930 | } |
931 | #endif |
932 | |
933 | /* |
934 | * Implementation-specific actions that must be performed when a segment is to |
935 | * be preserved even when no backend has it attached. |
936 | * |
937 | * Except on Windows, we don't need to do anything at all. But since Windows |
938 | * cleans up segments automatically when no references remain, we duplicate |
939 | * the segment handle into the postmaster process. The postmaster needn't |
940 | * do anything to receive the handle; Windows transfers it automatically. |
941 | */ |
942 | void |
943 | dsm_impl_pin_segment(dsm_handle handle, void *impl_private, |
944 | void **impl_private_pm_handle) |
945 | { |
946 | switch (dynamic_shared_memory_type) |
947 | { |
948 | #ifdef USE_DSM_WINDOWS |
949 | case DSM_IMPL_WINDOWS: |
950 | { |
951 | HANDLE hmap; |
952 | |
953 | if (!DuplicateHandle(GetCurrentProcess(), impl_private, |
954 | PostmasterHandle, &hmap, 0, FALSE, |
955 | DUPLICATE_SAME_ACCESS)) |
956 | { |
957 | char name[64]; |
958 | |
959 | snprintf(name, 64, "%s.%u" , SEGMENT_NAME_PREFIX, handle); |
960 | _dosmaperr(GetLastError()); |
961 | ereport(ERROR, |
962 | (errcode_for_dynamic_shared_memory(), |
963 | errmsg("could not duplicate handle for \"%s\": %m" , |
964 | name))); |
965 | } |
966 | |
967 | /* |
968 | * Here, we remember the handle that we created in the |
969 | * postmaster process. This handle isn't actually usable in |
970 | * any process other than the postmaster, but that doesn't |
971 | * matter. We're just holding onto it so that, if the segment |
972 | * is unpinned, dsm_impl_unpin_segment can close it. |
973 | */ |
974 | *impl_private_pm_handle = hmap; |
975 | break; |
976 | } |
977 | #endif |
978 | default: |
979 | break; |
980 | } |
981 | } |
982 | |
983 | /* |
984 | * Implementation-specific actions that must be performed when a segment is no |
985 | * longer to be preserved, so that it will be cleaned up when all backends |
986 | * have detached from it. |
987 | * |
988 | * Except on Windows, we don't need to do anything at all. For Windows, we |
989 | * close the extra handle that dsm_impl_pin_segment created in the |
990 | * postmaster's process space. |
991 | */ |
992 | void |
993 | dsm_impl_unpin_segment(dsm_handle handle, void **impl_private) |
994 | { |
995 | switch (dynamic_shared_memory_type) |
996 | { |
997 | #ifdef USE_DSM_WINDOWS |
998 | case DSM_IMPL_WINDOWS: |
999 | { |
1000 | if (*impl_private && |
1001 | !DuplicateHandle(PostmasterHandle, *impl_private, |
1002 | NULL, NULL, 0, FALSE, |
1003 | DUPLICATE_CLOSE_SOURCE)) |
1004 | { |
1005 | char name[64]; |
1006 | |
1007 | snprintf(name, 64, "%s.%u" , SEGMENT_NAME_PREFIX, handle); |
1008 | _dosmaperr(GetLastError()); |
1009 | ereport(ERROR, |
1010 | (errcode_for_dynamic_shared_memory(), |
1011 | errmsg("could not duplicate handle for \"%s\": %m" , |
1012 | name))); |
1013 | } |
1014 | |
1015 | *impl_private = NULL; |
1016 | break; |
1017 | } |
1018 | #endif |
1019 | default: |
1020 | break; |
1021 | } |
1022 | } |
1023 | |
1024 | static int |
1025 | errcode_for_dynamic_shared_memory(void) |
1026 | { |
1027 | if (errno == EFBIG || errno == ENOMEM) |
1028 | return errcode(ERRCODE_OUT_OF_MEMORY); |
1029 | else |
1030 | return errcode_for_file_access(); |
1031 | } |
1032 | |