1/*-------------------------------------------------------------------------
2 *
3 * latch.c
4 * Routines for inter-process latches
5 *
6 * The Unix implementation uses the so-called self-pipe trick to overcome the
7 * race condition involved with poll() (or epoll_wait() on linux) and setting
8 * a global flag in the signal handler. When a latch is set and the current
9 * process is waiting for it, the signal handler wakes up the poll() in
10 * WaitLatch by writing a byte to a pipe. A signal by itself doesn't interrupt
11 * poll() on all platforms, and even on platforms where it does, a signal that
12 * arrives just before the poll() call does not prevent poll() from entering
13 * sleep. An incoming byte on a pipe however reliably interrupts the sleep,
14 * and causes poll() to return immediately even if the signal arrives before
15 * poll() begins.
16 *
17 * When SetLatch is called from the same process that owns the latch,
18 * SetLatch writes the byte directly to the pipe. If it's owned by another
19 * process, SIGUSR1 is sent and the signal handler in the waiting process
20 * writes the byte to the pipe on behalf of the signaling process.
21 *
22 * The Windows implementation uses Windows events that are inherited by all
23 * postmaster child processes. There's no need for the self-pipe trick there.
24 *
25 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
26 * Portions Copyright (c) 1994, Regents of the University of California
27 *
28 * IDENTIFICATION
29 * src/backend/storage/ipc/latch.c
30 *
31 *-------------------------------------------------------------------------
32 */
33#include "postgres.h"
34
35#include <fcntl.h>
36#include <limits.h>
37#include <signal.h>
38#include <unistd.h>
39#ifdef HAVE_SYS_EPOLL_H
40#include <sys/epoll.h>
41#endif
42#ifdef HAVE_POLL_H
43#include <poll.h>
44#endif
45
46#include "miscadmin.h"
47#include "pgstat.h"
48#include "port/atomics.h"
49#include "portability/instr_time.h"
50#include "postmaster/postmaster.h"
51#include "storage/ipc.h"
52#include "storage/latch.h"
53#include "storage/pmsignal.h"
54#include "storage/shmem.h"
55
56/*
57 * Select the fd readiness primitive to use. Normally the "most modern"
58 * primitive supported by the OS will be used, but for testing it can be
59 * useful to manually specify the used primitive. If desired, just add a
60 * define somewhere before this block.
61 */
62#if defined(WAIT_USE_EPOLL) || defined(WAIT_USE_POLL) || \
63 defined(WAIT_USE_WIN32)
64/* don't overwrite manual choice */
65#elif defined(HAVE_SYS_EPOLL_H)
66#define WAIT_USE_EPOLL
67#elif defined(HAVE_POLL)
68#define WAIT_USE_POLL
69#elif WIN32
70#define WAIT_USE_WIN32
71#else
72#error "no wait set implementation available"
73#endif
74
/* typedef in latch.h */
struct WaitEventSet
{
	int			nevents;		/* number of registered events */
	int			nevents_space;	/* maximum number of events in this set */

	/*
	 * Array, of nevents_space length, storing the definition of events this
	 * set is waiting for.
	 */
	WaitEvent  *events;

	/*
	 * If WL_LATCH_SET is specified in any wait event, latch is a pointer to
	 * said latch, and latch_pos the offset in the ->events array. This is
	 * useful because we check the state of the latch before performing any
	 * syscalls related to waiting.
	 */
	Latch	   *latch;
	int			latch_pos;

	/*
	 * WL_EXIT_ON_PM_DEATH is converted to WL_POSTMASTER_DEATH, but this flag
	 * is set so that we'll exit immediately if postmaster death is detected,
	 * instead of returning.
	 */
	bool		exit_on_postmaster_death;

#if defined(WAIT_USE_EPOLL)
	int			epoll_fd;		/* epoll instance owned by this set */
	/* epoll_wait returns events in a user provided arrays, allocate once */
	struct epoll_event *epoll_ret_events;
#elif defined(WAIT_USE_POLL)
	/* poll expects events to be waited on every poll() call, prepare once */
	struct pollfd *pollfds;
#elif defined(WAIT_USE_WIN32)

	/*
	 * Array of windows events. The first element always contains
	 * pgwin32_signal_event, so the remaining elements are offset by one (i.e.
	 * event->pos + 1).
	 */
	HANDLE	   *handles;
#endif
};
120
#ifndef WIN32
/* Are we currently in WaitLatch? The signal handler would like to know. */
static volatile sig_atomic_t waiting = false;

/* Read and write ends of the self-pipe */
static int	selfpipe_readfd = -1;
static int	selfpipe_writefd = -1;

/* Process owning the self-pipe --- needed for checking purposes */
static int	selfpipe_owner_pid = 0;

/* Private function prototypes */
static void sendSelfPipeByte(void);
static void drainSelfPipe(void);
#endif							/* WIN32 */

/* Readiness-primitive-specific helpers for (re)registering one event */
#if defined(WAIT_USE_EPOLL)
static void WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action);
#elif defined(WAIT_USE_POLL)
static void WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event);
#elif defined(WAIT_USE_WIN32)
static void WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event);
#endif

/* One blocking wait; returns -1 on timeout, 0 to retry, else event count */
static inline int WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
										WaitEvent *occurred_events, int nevents);
147
/*
 * Initialize the process-local latch infrastructure.
 *
 * This must be called once during startup of any process that can wait on
 * latches, before it issues any InitLatch() or OwnLatch() calls.
 *
 * On Unix this creates the process's self-pipe; on Windows there is nothing
 * to do.  Any failure here is FATAL, since latches cannot work without it.
 */
void
InitializeLatchSupport(void)
{
#ifndef WIN32
	int			pipefd[2];

	if (IsUnderPostmaster)
	{
		/*
		 * We might have inherited connections to a self-pipe created by the
		 * postmaster. It's critical that child processes create their own
		 * self-pipes, of course, and we really want them to close the
		 * inherited FDs for safety's sake.
		 */
		if (selfpipe_owner_pid != 0)
		{
			/* Assert we go through here but once in a child process */
			Assert(selfpipe_owner_pid != MyProcPid);
			/* Release postmaster's pipe FDs; ignore any error */
			(void) close(selfpipe_readfd);
			(void) close(selfpipe_writefd);
			/* Clean up, just for safety's sake; we'll set these below */
			selfpipe_readfd = selfpipe_writefd = -1;
			selfpipe_owner_pid = 0;
		}
		else
		{
			/*
			 * Postmaster didn't create a self-pipe ... or else we're in an
			 * EXEC_BACKEND build, in which case it doesn't matter since the
			 * postmaster's pipe FDs were closed by the action of FD_CLOEXEC.
			 */
			Assert(selfpipe_readfd == -1);
		}
	}
	else
	{
		/* In postmaster or standalone backend, assert we do this but once */
		Assert(selfpipe_readfd == -1);
		Assert(selfpipe_owner_pid == 0);
	}

	/*
	 * Set up the self-pipe that allows a signal handler to wake up the
	 * poll()/epoll_wait() in WaitLatch. Make the write-end non-blocking, so
	 * that SetLatch won't block if the event has already been set many times
	 * filling the kernel buffer. Make the read-end non-blocking too, so that
	 * we can easily clear the pipe by reading until EAGAIN or EWOULDBLOCK.
	 * Also, make both FDs close-on-exec, since we surely do not want any
	 * child processes messing with them.
	 */
	if (pipe(pipefd) < 0)
		elog(FATAL, "pipe() failed: %m");
	if (fcntl(pipefd[0], F_SETFL, O_NONBLOCK) == -1)
		elog(FATAL, "fcntl(F_SETFL) failed on read-end of self-pipe: %m");
	if (fcntl(pipefd[1], F_SETFL, O_NONBLOCK) == -1)
		elog(FATAL, "fcntl(F_SETFL) failed on write-end of self-pipe: %m");
	if (fcntl(pipefd[0], F_SETFD, FD_CLOEXEC) == -1)
		elog(FATAL, "fcntl(F_SETFD) failed on read-end of self-pipe: %m");
	if (fcntl(pipefd[1], F_SETFD, FD_CLOEXEC) == -1)
		elog(FATAL, "fcntl(F_SETFD) failed on write-end of self-pipe: %m");

	/* Remember the pipe FDs, and which process owns them, for sanity checks */
	selfpipe_readfd = pipefd[0];
	selfpipe_writefd = pipefd[1];
	selfpipe_owner_pid = MyProcPid;
#else
	/* currently, nothing to do here for Windows */
#endif
}
223
224/*
225 * Initialize a process-local latch.
226 */
227void
228InitLatch(Latch *latch)
229{
230 latch->is_set = false;
231 latch->owner_pid = MyProcPid;
232 latch->is_shared = false;
233
234#ifndef WIN32
235 /* Assert InitializeLatchSupport has been called in this process */
236 Assert(selfpipe_readfd >= 0 && selfpipe_owner_pid == MyProcPid);
237#else
238 latch->event = CreateEvent(NULL, TRUE, FALSE, NULL);
239 if (latch->event == NULL)
240 elog(ERROR, "CreateEvent failed: error code %lu", GetLastError());
241#endif /* WIN32 */
242}
243
244/*
245 * Initialize a shared latch that can be set from other processes. The latch
246 * is initially owned by no-one; use OwnLatch to associate it with the
247 * current process.
248 *
249 * InitSharedLatch needs to be called in postmaster before forking child
250 * processes, usually right after allocating the shared memory block
251 * containing the latch with ShmemInitStruct. (The Unix implementation
252 * doesn't actually require that, but the Windows one does.) Because of
253 * this restriction, we have no concurrency issues to worry about here.
254 *
255 * Note that other handles created in this module are never marked as
256 * inheritable. Thus we do not need to worry about cleaning up child
257 * process references to postmaster-private latches or WaitEventSets.
258 */
259void
260InitSharedLatch(Latch *latch)
261{
262#ifdef WIN32
263 SECURITY_ATTRIBUTES sa;
264
265 /*
266 * Set up security attributes to specify that the events are inherited.
267 */
268 ZeroMemory(&sa, sizeof(sa));
269 sa.nLength = sizeof(sa);
270 sa.bInheritHandle = TRUE;
271
272 latch->event = CreateEvent(&sa, TRUE, FALSE, NULL);
273 if (latch->event == NULL)
274 elog(ERROR, "CreateEvent failed: error code %lu", GetLastError());
275#endif
276
277 latch->is_set = false;
278 latch->owner_pid = 0;
279 latch->is_shared = true;
280}
281
282/*
283 * Associate a shared latch with the current process, allowing it to
284 * wait on the latch.
285 *
286 * Although there is a sanity check for latch-already-owned, we don't do
287 * any sort of locking here, meaning that we could fail to detect the error
288 * if two processes try to own the same latch at about the same time. If
289 * there is any risk of that, caller must provide an interlock to prevent it.
290 *
291 * In any process that calls OwnLatch(), make sure that
292 * latch_sigusr1_handler() is called from the SIGUSR1 signal handler,
293 * as shared latches use SIGUSR1 for inter-process communication.
294 */
295void
296OwnLatch(Latch *latch)
297{
298 /* Sanity checks */
299 Assert(latch->is_shared);
300
301#ifndef WIN32
302 /* Assert InitializeLatchSupport has been called in this process */
303 Assert(selfpipe_readfd >= 0 && selfpipe_owner_pid == MyProcPid);
304#endif
305
306 if (latch->owner_pid != 0)
307 elog(ERROR, "latch already owned");
308
309 latch->owner_pid = MyProcPid;
310}
311
312/*
313 * Disown a shared latch currently owned by the current process.
314 */
315void
316DisownLatch(Latch *latch)
317{
318 Assert(latch->is_shared);
319 Assert(latch->owner_pid == MyProcPid);
320
321 latch->owner_pid = 0;
322}
323
324/*
325 * Wait for a given latch to be set, or for postmaster death, or until timeout
326 * is exceeded. 'wakeEvents' is a bitmask that specifies which of those events
327 * to wait for. If the latch is already set (and WL_LATCH_SET is given), the
328 * function returns immediately.
329 *
330 * The "timeout" is given in milliseconds. It must be >= 0 if WL_TIMEOUT flag
331 * is given. Although it is declared as "long", we don't actually support
332 * timeouts longer than INT_MAX milliseconds. Note that some extra overhead
333 * is incurred when WL_TIMEOUT is given, so avoid using a timeout if possible.
334 *
335 * The latch must be owned by the current process, ie. it must be a
336 * process-local latch initialized with InitLatch, or a shared latch
337 * associated with the current process by calling OwnLatch.
338 *
339 * Returns bit mask indicating which condition(s) caused the wake-up. Note
340 * that if multiple wake-up conditions are true, there is no guarantee that
341 * we return all of them in one call, but we will return at least one.
342 */
343int
344WaitLatch(Latch *latch, int wakeEvents, long timeout,
345 uint32 wait_event_info)
346{
347 return WaitLatchOrSocket(latch, wakeEvents, PGINVALID_SOCKET, timeout,
348 wait_event_info);
349}
350
351/*
352 * Like WaitLatch, but with an extra socket argument for WL_SOCKET_*
353 * conditions.
354 *
355 * When waiting on a socket, EOF and error conditions always cause the socket
356 * to be reported as readable/writable/connected, so that the caller can deal
357 * with the condition.
358 *
359 * wakeEvents must include either WL_EXIT_ON_PM_DEATH for automatic exit
360 * if the postmaster dies or WL_POSTMASTER_DEATH for a flag set in the
361 * return value if the postmaster dies. The latter is useful for rare cases
362 * where some behavior other than immediate exit is needed.
363 *
364 * NB: These days this is just a wrapper around the WaitEventSet API. When
365 * using a latch very frequently, consider creating a longer living
366 * WaitEventSet instead; that's more efficient.
367 */
368int
369WaitLatchOrSocket(Latch *latch, int wakeEvents, pgsocket sock,
370 long timeout, uint32 wait_event_info)
371{
372 int ret = 0;
373 int rc;
374 WaitEvent event;
375 WaitEventSet *set = CreateWaitEventSet(CurrentMemoryContext, 3);
376
377 if (wakeEvents & WL_TIMEOUT)
378 Assert(timeout >= 0);
379 else
380 timeout = -1;
381
382 if (wakeEvents & WL_LATCH_SET)
383 AddWaitEventToSet(set, WL_LATCH_SET, PGINVALID_SOCKET,
384 latch, NULL);
385
386 /* Postmaster-managed callers must handle postmaster death somehow. */
387 Assert(!IsUnderPostmaster ||
388 (wakeEvents & WL_EXIT_ON_PM_DEATH) ||
389 (wakeEvents & WL_POSTMASTER_DEATH));
390
391 if ((wakeEvents & WL_POSTMASTER_DEATH) && IsUnderPostmaster)
392 AddWaitEventToSet(set, WL_POSTMASTER_DEATH, PGINVALID_SOCKET,
393 NULL, NULL);
394
395 if ((wakeEvents & WL_EXIT_ON_PM_DEATH) && IsUnderPostmaster)
396 AddWaitEventToSet(set, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
397 NULL, NULL);
398
399 if (wakeEvents & WL_SOCKET_MASK)
400 {
401 int ev;
402
403 ev = wakeEvents & WL_SOCKET_MASK;
404 AddWaitEventToSet(set, ev, sock, NULL, NULL);
405 }
406
407 rc = WaitEventSetWait(set, timeout, &event, 1, wait_event_info);
408
409 if (rc == 0)
410 ret |= WL_TIMEOUT;
411 else
412 {
413 ret |= event.events & (WL_LATCH_SET |
414 WL_POSTMASTER_DEATH |
415 WL_SOCKET_MASK);
416 }
417
418 FreeWaitEventSet(set);
419
420 return ret;
421}
422
/*
 * Sets a latch and wakes up anyone waiting on it.
 *
 * This is cheap if the latch is already set, otherwise not so much.
 *
 * NB: when calling this in a signal handler, be sure to save and restore
 * errno around it. (That's standard practice in most signal handlers, of
 * course, but we used to omit it in handlers that only set a flag.)
 *
 * NB: this function is called from critical sections and signal handlers so
 * throwing an error is not a good idea.
 */
void
SetLatch(Latch *latch)
{
#ifndef WIN32
	pid_t		owner_pid;
#else
	HANDLE		handle;
#endif

	/*
	 * The memory barrier has to be placed here to ensure that any flag
	 * variables possibly changed by this process have been flushed to main
	 * memory, before we check/set is_set.
	 */
	pg_memory_barrier();

	/* Quick exit if already set */
	if (latch->is_set)
		return;

	latch->is_set = true;

#ifndef WIN32

	/*
	 * See if anyone's waiting for the latch. It can be the current process if
	 * we're in a signal handler. We use the self-pipe to wake up the
	 * poll()/epoll_wait() in that case. If it's another process, send a
	 * signal.
	 *
	 * Fetch owner_pid only once, in case the latch is concurrently getting
	 * owned or disowned. XXX: This assumes that pid_t is atomic, which isn't
	 * guaranteed to be true! In practice, the effective range of pid_t fits
	 * in a 32 bit integer, and so should be atomic. In the worst case, we
	 * might end up signaling the wrong process. Even then, you're very
	 * unlucky if a process with that bogus pid exists and belongs to
	 * Postgres; and PG database processes should handle excess SIGUSR1
	 * interrupts without a problem anyhow.
	 *
	 * Another sort of race condition that's possible here is for a new
	 * process to own the latch immediately after we look, so we don't signal
	 * it. This is okay so long as all callers of ResetLatch/WaitLatch follow
	 * the standard coding convention of waiting at the bottom of their loops,
	 * not the top, so that they'll correctly process latch-setting events
	 * that happen before they enter the loop.
	 */
	owner_pid = latch->owner_pid;
	if (owner_pid == 0)
		return;					/* no owner, nobody to wake up */
	else if (owner_pid == MyProcPid)
	{
		/* We own it; write to the self-pipe, but only if we're waiting */
		if (waiting)
			sendSelfPipeByte();
	}
	else
		kill(owner_pid, SIGUSR1);	/* wake the owning process */
#else

	/*
	 * See if anyone's waiting for the latch. It can be the current process if
	 * we're in a signal handler.
	 *
	 * Use a local variable here just in case somebody changes the event field
	 * concurrently (which really should not happen).
	 */
	handle = latch->event;
	if (handle)
	{
		SetEvent(handle);

		/*
		 * Note that we silently ignore any errors. We might be in a signal
		 * handler or other critical path where it's not safe to call elog().
		 */
	}
#endif

}
513
/*
 * Clear the latch. Calling WaitLatch after this will sleep, unless
 * the latch is set again before the WaitLatch call.
 *
 * Callers should reset at the top of their wait loop and then re-check
 * their work conditions, so that a SetLatch arriving in between is not
 * lost (see the race-condition discussion in SetLatch).
 */
void
ResetLatch(Latch *latch)
{
	/* Only the owner should reset the latch */
	Assert(latch->owner_pid == MyProcPid);

	latch->is_set = false;

	/*
	 * Ensure that the write to is_set gets flushed to main memory before we
	 * examine any flag variables. Otherwise a concurrent SetLatch might
	 * falsely conclude that it needn't signal us, even though we have missed
	 * seeing some flag updates that SetLatch was supposed to inform us of.
	 */
	pg_memory_barrier();
}
534
535/*
536 * Create a WaitEventSet with space for nevents different events to wait for.
537 *
538 * These events can then be efficiently waited upon together, using
539 * WaitEventSetWait().
540 */
541WaitEventSet *
542CreateWaitEventSet(MemoryContext context, int nevents)
543{
544 WaitEventSet *set;
545 char *data;
546 Size sz = 0;
547
548 /*
549 * Use MAXALIGN size/alignment to guarantee that later uses of memory are
550 * aligned correctly. E.g. epoll_event might need 8 byte alignment on some
551 * platforms, but earlier allocations like WaitEventSet and WaitEvent
552 * might not sized to guarantee that when purely using sizeof().
553 */
554 sz += MAXALIGN(sizeof(WaitEventSet));
555 sz += MAXALIGN(sizeof(WaitEvent) * nevents);
556
557#if defined(WAIT_USE_EPOLL)
558 sz += MAXALIGN(sizeof(struct epoll_event) * nevents);
559#elif defined(WAIT_USE_POLL)
560 sz += MAXALIGN(sizeof(struct pollfd) * nevents);
561#elif defined(WAIT_USE_WIN32)
562 /* need space for the pgwin32_signal_event */
563 sz += MAXALIGN(sizeof(HANDLE) * (nevents + 1));
564#endif
565
566 data = (char *) MemoryContextAllocZero(context, sz);
567
568 set = (WaitEventSet *) data;
569 data += MAXALIGN(sizeof(WaitEventSet));
570
571 set->events = (WaitEvent *) data;
572 data += MAXALIGN(sizeof(WaitEvent) * nevents);
573
574#if defined(WAIT_USE_EPOLL)
575 set->epoll_ret_events = (struct epoll_event *) data;
576 data += MAXALIGN(sizeof(struct epoll_event) * nevents);
577#elif defined(WAIT_USE_POLL)
578 set->pollfds = (struct pollfd *) data;
579 data += MAXALIGN(sizeof(struct pollfd) * nevents);
580#elif defined(WAIT_USE_WIN32)
581 set->handles = (HANDLE) data;
582 data += MAXALIGN(sizeof(HANDLE) * nevents);
583#endif
584
585 set->latch = NULL;
586 set->nevents_space = nevents;
587 set->exit_on_postmaster_death = false;
588
589#if defined(WAIT_USE_EPOLL)
590#ifdef EPOLL_CLOEXEC
591 set->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
592 if (set->epoll_fd < 0)
593 elog(ERROR, "epoll_create1 failed: %m");
594#else
595 /* cope with ancient glibc lacking epoll_create1 (e.g., RHEL5) */
596 set->epoll_fd = epoll_create(nevents);
597 if (set->epoll_fd < 0)
598 elog(ERROR, "epoll_create failed: %m");
599 if (fcntl(set->epoll_fd, F_SETFD, FD_CLOEXEC) == -1)
600 elog(ERROR, "fcntl(F_SETFD) failed on epoll descriptor: %m");
601#endif /* EPOLL_CLOEXEC */
602#elif defined(WAIT_USE_WIN32)
603
604 /*
605 * To handle signals while waiting, we need to add a win32 specific event.
606 * We accounted for the additional event at the top of this routine. See
607 * port/win32/signal.c for more details.
608 *
609 * Note: pgwin32_signal_event should be first to ensure that it will be
610 * reported when multiple events are set. We want to guarantee that
611 * pending signals are serviced.
612 */
613 set->handles[0] = pgwin32_signal_event;
614 StaticAssertStmt(WSA_INVALID_EVENT == NULL, "");
615#endif
616
617 return set;
618}
619
620/*
621 * Free a previously created WaitEventSet.
622 *
623 * Note: preferably, this shouldn't have to free any resources that could be
624 * inherited across an exec(). If it did, we'd likely leak those resources in
625 * many scenarios. For the epoll case, we ensure that by setting FD_CLOEXEC
626 * when the FD is created. For the Windows case, we assume that the handles
627 * involved are non-inheritable.
628 */
629void
630FreeWaitEventSet(WaitEventSet *set)
631{
632#if defined(WAIT_USE_EPOLL)
633 close(set->epoll_fd);
634#elif defined(WAIT_USE_WIN32)
635 WaitEvent *cur_event;
636
637 for (cur_event = set->events;
638 cur_event < (set->events + set->nevents);
639 cur_event++)
640 {
641 if (cur_event->events & WL_LATCH_SET)
642 {
643 /* uses the latch's HANDLE */
644 }
645 else if (cur_event->events & WL_POSTMASTER_DEATH)
646 {
647 /* uses PostmasterHandle */
648 }
649 else
650 {
651 /* Clean up the event object we created for the socket */
652 WSAEventSelect(cur_event->fd, NULL, 0);
653 WSACloseEvent(set->handles[cur_event->pos + 1]);
654 }
655 }
656#endif
657
658 pfree(set);
659}
660
661/* ---
662 * Add an event to the set. Possible events are:
663 * - WL_LATCH_SET: Wait for the latch to be set
664 * - WL_POSTMASTER_DEATH: Wait for postmaster to die
665 * - WL_SOCKET_READABLE: Wait for socket to become readable,
666 * can be combined in one event with other WL_SOCKET_* events
667 * - WL_SOCKET_WRITEABLE: Wait for socket to become writeable,
668 * can be combined with other WL_SOCKET_* events
669 * - WL_SOCKET_CONNECTED: Wait for socket connection to be established,
670 * can be combined with other WL_SOCKET_* events (on non-Windows
671 * platforms, this is the same as WL_SOCKET_WRITEABLE)
672 * - WL_EXIT_ON_PM_DEATH: Exit immediately if the postmaster dies
673 *
674 * Returns the offset in WaitEventSet->events (starting from 0), which can be
675 * used to modify previously added wait events using ModifyWaitEvent().
676 *
677 * In the WL_LATCH_SET case the latch must be owned by the current process,
678 * i.e. it must be a process-local latch initialized with InitLatch, or a
679 * shared latch associated with the current process by calling OwnLatch.
680 *
681 * In the WL_SOCKET_READABLE/WRITEABLE/CONNECTED cases, EOF and error
682 * conditions cause the socket to be reported as readable/writable/connected,
683 * so that the caller can deal with the condition.
684 *
685 * The user_data pointer specified here will be set for the events returned
686 * by WaitEventSetWait(), allowing to easily associate additional data with
687 * events.
688 */
689int
690AddWaitEventToSet(WaitEventSet *set, uint32 events, pgsocket fd, Latch *latch,
691 void *user_data)
692{
693 WaitEvent *event;
694
695 /* not enough space */
696 Assert(set->nevents < set->nevents_space);
697
698 if (events == WL_EXIT_ON_PM_DEATH)
699 {
700 events = WL_POSTMASTER_DEATH;
701 set->exit_on_postmaster_death = true;
702 }
703
704 if (latch)
705 {
706 if (latch->owner_pid != MyProcPid)
707 elog(ERROR, "cannot wait on a latch owned by another process");
708 if (set->latch)
709 elog(ERROR, "cannot wait on more than one latch");
710 if ((events & WL_LATCH_SET) != WL_LATCH_SET)
711 elog(ERROR, "latch events only support being set");
712 }
713 else
714 {
715 if (events & WL_LATCH_SET)
716 elog(ERROR, "cannot wait on latch without a specified latch");
717 }
718
719 /* waiting for socket readiness without a socket indicates a bug */
720 if (fd == PGINVALID_SOCKET && (events & WL_SOCKET_MASK))
721 elog(ERROR, "cannot wait on socket event without a socket");
722
723 event = &set->events[set->nevents];
724 event->pos = set->nevents++;
725 event->fd = fd;
726 event->events = events;
727 event->user_data = user_data;
728#ifdef WIN32
729 event->reset = false;
730#endif
731
732 if (events == WL_LATCH_SET)
733 {
734 set->latch = latch;
735 set->latch_pos = event->pos;
736#ifndef WIN32
737 event->fd = selfpipe_readfd;
738#endif
739 }
740 else if (events == WL_POSTMASTER_DEATH)
741 {
742#ifndef WIN32
743 event->fd = postmaster_alive_fds[POSTMASTER_FD_WATCH];
744#endif
745 }
746
747 /* perform wait primitive specific initialization, if needed */
748#if defined(WAIT_USE_EPOLL)
749 WaitEventAdjustEpoll(set, event, EPOLL_CTL_ADD);
750#elif defined(WAIT_USE_POLL)
751 WaitEventAdjustPoll(set, event);
752#elif defined(WAIT_USE_WIN32)
753 WaitEventAdjustWin32(set, event);
754#endif
755
756 return event->pos;
757}
758
759/*
760 * Change the event mask and, in the WL_LATCH_SET case, the latch associated
761 * with the WaitEvent.
762 *
763 * 'pos' is the id returned by AddWaitEventToSet.
764 */
765void
766ModifyWaitEvent(WaitEventSet *set, int pos, uint32 events, Latch *latch)
767{
768 WaitEvent *event;
769
770 Assert(pos < set->nevents);
771
772 event = &set->events[pos];
773
774 /*
775 * If neither the event mask nor the associated latch changes, return
776 * early. That's an important optimization for some sockets, where
777 * ModifyWaitEvent is frequently used to switch from waiting for reads to
778 * waiting on writes.
779 */
780 if (events == event->events &&
781 (!(event->events & WL_LATCH_SET) || set->latch == latch))
782 return;
783
784 if (event->events & WL_LATCH_SET &&
785 events != event->events)
786 {
787 /* we could allow to disable latch events for a while */
788 elog(ERROR, "cannot modify latch event");
789 }
790
791 if (event->events & WL_POSTMASTER_DEATH)
792 {
793 elog(ERROR, "cannot modify postmaster death event");
794 }
795
796 /* FIXME: validate event mask */
797 event->events = events;
798
799 if (events == WL_LATCH_SET)
800 {
801 set->latch = latch;
802 }
803
804#if defined(WAIT_USE_EPOLL)
805 WaitEventAdjustEpoll(set, event, EPOLL_CTL_MOD);
806#elif defined(WAIT_USE_POLL)
807 WaitEventAdjustPoll(set, event);
808#elif defined(WAIT_USE_WIN32)
809 WaitEventAdjustWin32(set, event);
810#endif
811}
812
#if defined(WAIT_USE_EPOLL)
/*
 * Register or update one event in the set's epoll instance.
 *
 * action can be one of EPOLL_CTL_ADD | EPOLL_CTL_MOD | EPOLL_CTL_DEL
 */
static void
WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action)
{
	struct epoll_event epoll_ev;
	int			rc;

	/* epoll_wait hands this pointer back to us when the event fires */
	epoll_ev.data.ptr = event;
	/* always wait for errors */
	epoll_ev.events = EPOLLERR | EPOLLHUP;

	if (event->events == WL_LATCH_SET ||
		event->events == WL_POSTMASTER_DEATH)
	{
		/* both cases watch a pipe's read end for readability */
		Assert(event->events == WL_POSTMASTER_DEATH || set->latch != NULL);
		epoll_ev.events |= EPOLLIN;
	}
	else
	{
		/* a socket event; translate the WL_* mask to epoll flags */
		Assert(event->fd != PGINVALID_SOCKET);
		Assert(event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE));

		if (event->events & WL_SOCKET_READABLE)
			epoll_ev.events |= EPOLLIN;
		if (event->events & WL_SOCKET_WRITEABLE)
			epoll_ev.events |= EPOLLOUT;
	}

	/*
	 * Even though unused, we also pass epoll_ev as the data argument if
	 * EPOLL_CTL_DEL is passed as action. There used to be an epoll bug
	 * requiring that, and actually it makes the code simpler...
	 */
	rc = epoll_ctl(set->epoll_fd, action, event->fd, &epoll_ev);

	if (rc < 0)
		ereport(ERROR,
				(errcode_for_socket_access(),
		/* translator: %s is a syscall name, such as "poll()" */
				 errmsg("%s failed: %m",
						"epoll_ctl()")));
}
#endif
864
#if defined(WAIT_USE_POLL)
/*
 * (Re)build the pollfd entry for one event; poll() rereads the array on
 * every call, so this only needs to run when the event changes.
 */
static void
WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event)
{
	struct pollfd *pfd = &set->pollfds[event->pos];

	pfd->revents = 0;
	pfd->fd = event->fd;

	if (event->events == WL_LATCH_SET)
	{
		/* self-pipe read end becoming readable signals the latch */
		Assert(set->latch != NULL);
		pfd->events = POLLIN;
	}
	else if (event->events == WL_POSTMASTER_DEATH)
	{
		/* postmaster's life-sign pipe becoming readable signals death */
		pfd->events = POLLIN;
	}
	else
	{
		/* a socket event; translate the WL_* mask to poll flags */
		Assert(event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE));
		pfd->events = 0;
		if (event->events & WL_SOCKET_READABLE)
			pfd->events |= POLLIN;
		if (event->events & WL_SOCKET_WRITEABLE)
			pfd->events |= POLLOUT;
	}

	Assert(event->fd != PGINVALID_SOCKET);
}
#endif
897
#if defined(WAIT_USE_WIN32)
/*
 * Install the Windows HANDLE for one event into the set's handles array
 * (offset by one; slot 0 holds pgwin32_signal_event).
 */
static void
WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event)
{
	HANDLE	   *hptr = &set->handles[event->pos + 1];

	if (event->events == WL_LATCH_SET)
	{
		/* latch wakeups reuse the latch's own event object */
		Assert(set->latch != NULL);
		*hptr = set->latch->event;
		return;
	}

	if (event->events == WL_POSTMASTER_DEATH)
	{
		/* postmaster death is reported via its process handle */
		*hptr = PostmasterHandle;
		return;
	}

	/* A socket event: build the WSA flag mask and (re)register the socket */
	{
		int			flags = FD_CLOSE;	/* always check for errors/EOF */

		if (event->events & WL_SOCKET_READABLE)
			flags |= FD_READ;
		if (event->events & WL_SOCKET_WRITEABLE)
			flags |= FD_WRITE;
		if (event->events & WL_SOCKET_CONNECTED)
			flags |= FD_CONNECT;

		/* create the socket's event object lazily, on first registration */
		if (*hptr == WSA_INVALID_EVENT)
		{
			*hptr = WSACreateEvent();
			if (*hptr == WSA_INVALID_EVENT)
				elog(ERROR, "failed to create event for socket: error code %u",
					 WSAGetLastError());
		}
		if (WSAEventSelect(event->fd, *hptr, flags) != 0)
			elog(ERROR, "failed to set up event for socket: error code %u",
				 WSAGetLastError());

		Assert(event->fd != PGINVALID_SOCKET);
	}
}
#endif
939
/*
 * Wait for events added to the set to happen, or until the timeout is
 * reached.  At most nevents occurred events are returned.
 *
 * If timeout = -1, block until an event occurs; if 0, check sockets for
 * readiness, but don't block; if > 0, block for at most timeout milliseconds.
 *
 * Returns the number of events occurred, or 0 if the timeout was reached.
 *
 * Returned events will have the fd, pos, user_data fields set to the
 * values associated with the registered event.
 */
int
WaitEventSetWait(WaitEventSet *set, long timeout,
				 WaitEvent *occurred_events, int nevents,
				 uint32 wait_event_info)
{
	int			returned_events = 0;
	instr_time	start_time;
	instr_time	cur_time;
	long		cur_timeout = -1;	/* -1 means block indefinitely */

	Assert(nevents > 0);

	/*
	 * Initialize timeout if requested.  We must record the current time so
	 * that we can determine the remaining timeout if interrupted.
	 */
	if (timeout >= 0)
	{
		INSTR_TIME_SET_CURRENT(start_time);
		Assert(timeout >= 0 && timeout <= INT_MAX);
		cur_timeout = timeout;
	}

	/* Report the wait reason to the cumulative-statistics machinery. */
	pgstat_report_wait_start(wait_event_info);

#ifndef WIN32
	/*
	 * Tell latch_sigusr1_handler() that we want self-pipe wakeup bytes from
	 * now on.  Cleared again below, before returning.
	 */
	waiting = true;
#else
	/* Ensure that signals are serviced even if latch is already set */
	pgwin32_dispatch_queued_signals();
#endif
	while (returned_events == 0)
	{
		int			rc;

		/*
		 * Check if the latch is set already. If so, leave the loop
		 * immediately, avoid blocking again. We don't attempt to report any
		 * other events that might also be satisfied.
		 *
		 * If someone sets the latch between this and the
		 * WaitEventSetWaitBlock() below, the setter will write a byte to the
		 * pipe (or signal us and the signal handler will do that), and the
		 * readiness routine will return immediately.
		 *
		 * On unix, if there's a pending byte in the self pipe, we'll notice
		 * whenever blocking. Only clearing the pipe in that case avoids
		 * having to drain it every time WaitLatchOrSocket() is used. Should
		 * the pipe-buffer fill up we're still ok, because the pipe is in
		 * nonblocking mode. It's unlikely for that to happen, because the
		 * self pipe isn't filled unless we're blocking (waiting = true), or
		 * from inside a signal handler in latch_sigusr1_handler().
		 *
		 * On windows, we'll also notice if there's a pending event for the
		 * latch when blocking, but there's no danger of anything filling up,
		 * as "Setting an event that is already set has no effect.".
		 *
		 * Note: we assume that the kernel calls involved in latch management
		 * will provide adequate synchronization on machines with weak memory
		 * ordering, so that we cannot miss seeing is_set if a notification
		 * has already been queued.
		 */
		if (set->latch && set->latch->is_set)
		{
			occurred_events->fd = PGINVALID_SOCKET;
			occurred_events->pos = set->latch_pos;
			occurred_events->user_data =
				set->events[set->latch_pos].user_data;
			occurred_events->events = WL_LATCH_SET;
			occurred_events++;
			returned_events++;

			break;
		}

		/*
		 * Wait for events using the readiness primitive chosen at the top of
		 * this file. If -1 is returned, a timeout has occurred, if 0 we have
		 * to retry, everything >= 1 is the number of returned events.
		 */
		rc = WaitEventSetWaitBlock(set, cur_timeout,
								   occurred_events, nevents);

		if (rc == -1)
			break;				/* timeout occurred */
		else
			returned_events = rc;

		/* If we're not done, update cur_timeout for next iteration */
		if (returned_events == 0 && timeout >= 0)
		{
			INSTR_TIME_SET_CURRENT(cur_time);
			INSTR_TIME_SUBTRACT(cur_time, start_time);
			cur_timeout = timeout - (long) INSTR_TIME_GET_MILLISEC(cur_time);
			if (cur_timeout <= 0)
				break;			/* timeout already expired */
		}
	}
#ifndef WIN32
	/* No longer interested in self-pipe wakeups. */
	waiting = false;
#endif

	pgstat_report_wait_end();

	return returned_events;
}
1058
1059
1060#if defined(WAIT_USE_EPOLL)
1061
/*
 * Wait using linux's epoll_wait(2).
 *
 * This is the preferable wait method, as several readiness notifications are
 * delivered, without having to iterate through all of set->events. The
 * returned epoll_event structs contain a pointer to our events, making
 * association easy.
 */
static inline int
WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
					  WaitEvent *occurred_events, int nevents)
{
	int			returned_events = 0;
	int			rc;
	WaitEvent  *cur_event;
	struct epoll_event *cur_epoll_event;

	/* Sleep */
	rc = epoll_wait(set->epoll_fd, set->epoll_ret_events,
					nevents, cur_timeout);

	/* Check return code */
	if (rc < 0)
	{
		/* EINTR is okay, otherwise complain */
		if (errno != EINTR)
		{
			/* must clear the flag before throwing; we won't return here */
			waiting = false;
			ereport(ERROR,
					(errcode_for_socket_access(),
					 /* translator: %s is a syscall name, such as "poll()" */
					 errmsg("%s failed: %m",
							"epoll_wait()")));
		}
		return 0;				/* interrupted: tell caller to retry */
	}
	else if (rc == 0)
	{
		/* timeout exceeded */
		return -1;
	}

	/*
	 * At least one event occurred, iterate over the returned epoll events
	 * until they're either all processed, or we've returned all the events
	 * the caller desired.
	 */
	for (cur_epoll_event = set->epoll_ret_events;
		 cur_epoll_event < (set->epoll_ret_events + rc) &&
		 returned_events < nevents;
		 cur_epoll_event++)
	{
		/* epoll's data pointer is set to the associated WaitEvent */
		cur_event = (WaitEvent *) cur_epoll_event->data.ptr;

		occurred_events->pos = cur_event->pos;
		occurred_events->user_data = cur_event->user_data;
		occurred_events->events = 0;

		if (cur_event->events == WL_LATCH_SET &&
			cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP))
		{
			/* There's data in the self-pipe, clear it. */
			drainSelfPipe();

			/* Only report a latch event if the latch is really set. */
			if (set->latch->is_set)
			{
				occurred_events->fd = PGINVALID_SOCKET;
				occurred_events->events = WL_LATCH_SET;
				occurred_events++;
				returned_events++;
			}
		}
		else if (cur_event->events == WL_POSTMASTER_DEATH &&
				 cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP))
		{
			/*
			 * We expect an EPOLLHUP when the remote end is closed, but
			 * because we don't expect the pipe to become readable or to have
			 * any errors either, treat those cases as postmaster death, too.
			 *
			 * Be paranoid about a spurious event signalling the postmaster as
			 * being dead. There have been reports about that happening with
			 * older primitives (select(2) to be specific), and a spurious
			 * WL_POSTMASTER_DEATH event would be painful. Re-checking doesn't
			 * cost much.
			 */
			if (!PostmasterIsAliveInternal())
			{
				if (set->exit_on_postmaster_death)
					proc_exit(1);
				occurred_events->fd = PGINVALID_SOCKET;
				occurred_events->events = WL_POSTMASTER_DEATH;
				occurred_events++;
				returned_events++;
			}
		}
		else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
		{
			Assert(cur_event->fd != PGINVALID_SOCKET);

			if ((cur_event->events & WL_SOCKET_READABLE) &&
				(cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP)))
			{
				/* data available in socket, or EOF */
				occurred_events->events |= WL_SOCKET_READABLE;
			}

			if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
				(cur_epoll_event->events & (EPOLLOUT | EPOLLERR | EPOLLHUP)))
			{
				/* writable, or EOF */
				occurred_events->events |= WL_SOCKET_WRITEABLE;
			}

			/* Report the socket only if a requested condition was met. */
			if (occurred_events->events != 0)
			{
				occurred_events->fd = cur_event->fd;
				occurred_events++;
				returned_events++;
			}
		}
	}

	return returned_events;
}
1188
1189#elif defined(WAIT_USE_POLL)
1190
/*
 * Wait using poll(2).
 *
 * This allows to receive readiness notifications for several events at once,
 * but requires iterating through all of set->pollfds.
 */
static inline int
WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
					  WaitEvent *occurred_events, int nevents)
{
	int			returned_events = 0;
	int			rc;
	WaitEvent  *cur_event;
	struct pollfd *cur_pollfd;

	/* Sleep */
	rc = poll(set->pollfds, set->nevents, (int) cur_timeout);

	/* Check return code */
	if (rc < 0)
	{
		/* EINTR is okay, otherwise complain */
		if (errno != EINTR)
		{
			/* must clear the flag before throwing; we won't return here */
			waiting = false;
			ereport(ERROR,
					(errcode_for_socket_access(),
					 /* translator: %s is a syscall name, such as "poll()" */
					 errmsg("%s failed: %m",
							"poll()")));
		}
		return 0;				/* interrupted: tell caller to retry */
	}
	else if (rc == 0)
	{
		/* timeout exceeded */
		return -1;
	}

	/*
	 * The WaitEvent and pollfd arrays are kept in step, so walk them in
	 * parallel looking for fds with reported activity.
	 */
	for (cur_event = set->events, cur_pollfd = set->pollfds;
		 cur_event < (set->events + set->nevents) &&
		 returned_events < nevents;
		 cur_event++, cur_pollfd++)
	{
		/* no activity on this FD, skip */
		if (cur_pollfd->revents == 0)
			continue;

		occurred_events->pos = cur_event->pos;
		occurred_events->user_data = cur_event->user_data;
		occurred_events->events = 0;

		if (cur_event->events == WL_LATCH_SET &&
			(cur_pollfd->revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL)))
		{
			/* There's data in the self-pipe, clear it. */
			drainSelfPipe();

			/* Only report a latch event if the latch is really set. */
			if (set->latch->is_set)
			{
				occurred_events->fd = PGINVALID_SOCKET;
				occurred_events->events = WL_LATCH_SET;
				occurred_events++;
				returned_events++;
			}
		}
		else if (cur_event->events == WL_POSTMASTER_DEATH &&
				 (cur_pollfd->revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL)))
		{
			/*
			 * We expect a POLLHUP when the remote end is closed, but because
			 * we don't expect the pipe to become readable or to have any
			 * errors either, treat those cases as postmaster death, too.
			 *
			 * Be paranoid about a spurious event signalling the postmaster as
			 * being dead. There have been reports about that happening with
			 * older primitives (select(2) to be specific), and a spurious
			 * WL_POSTMASTER_DEATH event would be painful. Re-checking doesn't
			 * cost much.
			 */
			if (!PostmasterIsAliveInternal())
			{
				if (set->exit_on_postmaster_death)
					proc_exit(1);
				occurred_events->fd = PGINVALID_SOCKET;
				occurred_events->events = WL_POSTMASTER_DEATH;
				occurred_events++;
				returned_events++;
			}
		}
		else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
		{
			int			errflags = POLLHUP | POLLERR | POLLNVAL;

			Assert(cur_event->fd >= PGINVALID_SOCKET);

			if ((cur_event->events & WL_SOCKET_READABLE) &&
				(cur_pollfd->revents & (POLLIN | errflags)))
			{
				/* data available in socket, or EOF */
				occurred_events->events |= WL_SOCKET_READABLE;
			}

			if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
				(cur_pollfd->revents & (POLLOUT | errflags)))
			{
				/* writeable, or EOF */
				occurred_events->events |= WL_SOCKET_WRITEABLE;
			}

			/* Report the socket only if a requested condition was met. */
			if (occurred_events->events != 0)
			{
				occurred_events->fd = cur_event->fd;
				occurred_events++;
				returned_events++;
			}
		}
	}
	return returned_events;
}
1311
1312#elif defined(WAIT_USE_WIN32)
1313
/*
 * Wait using Windows' WaitForMultipleObjects().
 *
 * Unfortunately this will only ever return a single readiness notification at
 * a time.  Note that while the official documentation for
 * WaitForMultipleObjects is ambiguous about multiple events being "consumed"
 * with a single bWaitAll = FALSE call,
 * https://blogs.msdn.microsoft.com/oldnewthing/20150409-00/?p=44273 confirms
 * that only one event is "consumed".
 */
static inline int
WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
					  WaitEvent *occurred_events, int nevents)
{
	int			returned_events = 0;
	DWORD		rc;
	WaitEvent  *cur_event;

	/* Reset any wait events that need it */
	for (cur_event = set->events;
		 cur_event < (set->events + set->nevents);
		 cur_event++)
	{
		if (cur_event->reset)
		{
			WaitEventAdjustWin32(set, cur_event);
			cur_event->reset = false;
		}

		/*
		 * Windows does not guarantee to log an FD_WRITE network event
		 * indicating that more data can be sent unless the previous send()
		 * failed with WSAEWOULDBLOCK.  While our caller might well have made
		 * such a call, we cannot assume that here.  Therefore, if waiting for
		 * write-ready, force the issue by doing a dummy send().  If the dummy
		 * send() succeeds, assume that the socket is in fact write-ready, and
		 * return immediately.  Also, if it fails with something other than
		 * WSAEWOULDBLOCK, return a write-ready indication to let our caller
		 * deal with the error condition.
		 */
		if (cur_event->events & WL_SOCKET_WRITEABLE)
		{
			char		c;
			WSABUF		buf;
			DWORD		sent;
			int			r;

			/* zero-length send: probes writability without sending data */
			buf.buf = &c;
			buf.len = 0;

			r = WSASend(cur_event->fd, &buf, 1, &sent, 0, NULL, NULL);
			if (r == 0 || WSAGetLastError() != WSAEWOULDBLOCK)
			{
				occurred_events->pos = cur_event->pos;
				occurred_events->user_data = cur_event->user_data;
				occurred_events->events = WL_SOCKET_WRITEABLE;
				occurred_events->fd = cur_event->fd;
				return 1;
			}
		}
	}

	/*
	 * Sleep.
	 *
	 * Need to wait for ->nevents + 1, because signal handle is in [0].
	 */
	rc = WaitForMultipleObjects(set->nevents + 1, set->handles, FALSE,
								cur_timeout);

	/* Check return code */
	if (rc == WAIT_FAILED)
		elog(ERROR, "WaitForMultipleObjects() failed: error code %lu",
			 GetLastError());
	else if (rc == WAIT_TIMEOUT)
	{
		/* timeout exceeded */
		return -1;
	}

	if (rc == WAIT_OBJECT_0)
	{
		/* Service newly-arrived signals */
		pgwin32_dispatch_queued_signals();
		return 0;				/* retry */
	}

	/*
	 * With an offset of one, due to the always present pgwin32_signal_event,
	 * the handle offset directly corresponds to a wait event.
	 */
	cur_event = (WaitEvent *) &set->events[rc - WAIT_OBJECT_0 - 1];

	occurred_events->pos = cur_event->pos;
	occurred_events->user_data = cur_event->user_data;
	occurred_events->events = 0;

	if (cur_event->events == WL_LATCH_SET)
	{
		/* Reset the event so future waits block until the latch is re-set. */
		if (!ResetEvent(set->latch->event))
			elog(ERROR, "ResetEvent failed: error code %lu", GetLastError());

		/* Only report a latch event if the latch is really set. */
		if (set->latch->is_set)
		{
			occurred_events->fd = PGINVALID_SOCKET;
			occurred_events->events = WL_LATCH_SET;
			occurred_events++;
			returned_events++;
		}
	}
	else if (cur_event->events == WL_POSTMASTER_DEATH)
	{
		/*
		 * Postmaster apparently died.  Since the consequences of falsely
		 * returning WL_POSTMASTER_DEATH could be pretty unpleasant, we take
		 * the trouble to positively verify this with PostmasterIsAlive(),
		 * even though there is no known reason to think that the event could
		 * be falsely set on Windows.
		 */
		if (!PostmasterIsAliveInternal())
		{
			if (set->exit_on_postmaster_death)
				proc_exit(1);
			occurred_events->fd = PGINVALID_SOCKET;
			occurred_events->events = WL_POSTMASTER_DEATH;
			occurred_events++;
			returned_events++;
		}
	}
	else if (cur_event->events & WL_SOCKET_MASK)
	{
		WSANETWORKEVENTS resEvents;
		/* handles[] is offset by one relative to events[], see above */
		HANDLE		handle = set->handles[cur_event->pos + 1];

		Assert(cur_event->fd);

		occurred_events->fd = cur_event->fd;

		ZeroMemory(&resEvents, sizeof(resEvents));
		if (WSAEnumNetworkEvents(cur_event->fd, handle, &resEvents) != 0)
			elog(ERROR, "failed to enumerate network events: error code %u",
				 WSAGetLastError());
		if ((cur_event->events & WL_SOCKET_READABLE) &&
			(resEvents.lNetworkEvents & FD_READ))
		{
			/* data available in socket */
			occurred_events->events |= WL_SOCKET_READABLE;

			/*------
			 * WaitForMultipleObjects doesn't guarantee that a read event will
			 * be returned if the latch is set at the same time.  Even if it
			 * did, the caller might drop that event expecting it to reoccur
			 * on next call.  So, we must force the event to be reset if this
			 * WaitEventSet is used again in order to avoid an indefinite
			 * hang.  Refer https://msdn.microsoft.com/en-us/library/windows/desktop/ms741576(v=vs.85).aspx
			 * for the behavior of socket events.
			 *------
			 */
			cur_event->reset = true;
		}
		if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
			(resEvents.lNetworkEvents & FD_WRITE))
		{
			/* writeable */
			occurred_events->events |= WL_SOCKET_WRITEABLE;
		}
		if ((cur_event->events & WL_SOCKET_CONNECTED) &&
			(resEvents.lNetworkEvents & FD_CONNECT))
		{
			/* connected */
			occurred_events->events |= WL_SOCKET_CONNECTED;
		}
		if (resEvents.lNetworkEvents & FD_CLOSE)
		{
			/* EOF/error, so signal all caller-requested socket flags */
			occurred_events->events |= (cur_event->events & WL_SOCKET_MASK);
		}

		/* Report the socket only if a requested condition was met. */
		if (occurred_events->events != 0)
		{
			occurred_events++;
			returned_events++;
		}
	}

	return returned_events;
}
1501#endif
1502
1503/*
1504 * SetLatch uses SIGUSR1 to wake up the process waiting on the latch.
1505 *
1506 * Wake up WaitLatch, if we're waiting. (We might not be, since SIGUSR1 is
1507 * overloaded for multiple purposes; or we might not have reached WaitLatch
1508 * yet, in which case we don't need to fill the pipe either.)
1509 *
1510 * NB: when calling this in a signal handler, be sure to save and restore
1511 * errno around it.
1512 */
1513#ifndef WIN32
1514void
1515latch_sigusr1_handler(void)
1516{
1517 if (waiting)
1518 sendSelfPipeByte();
1519}
1520#endif /* !WIN32 */
1521
1522/* Send one byte to the self-pipe, to wake up WaitLatch */
1523#ifndef WIN32
1524static void
1525sendSelfPipeByte(void)
1526{
1527 int rc;
1528 char dummy = 0;
1529
1530retry:
1531 rc = write(selfpipe_writefd, &dummy, 1);
1532 if (rc < 0)
1533 {
1534 /* If interrupted by signal, just retry */
1535 if (errno == EINTR)
1536 goto retry;
1537
1538 /*
1539 * If the pipe is full, we don't need to retry, the data that's there
1540 * already is enough to wake up WaitLatch.
1541 */
1542 if (errno == EAGAIN || errno == EWOULDBLOCK)
1543 return;
1544
1545 /*
1546 * Oops, the write() failed for some other reason. We might be in a
1547 * signal handler, so it's not safe to elog(). We have no choice but
1548 * silently ignore the error.
1549 */
1550 return;
1551 }
1552}
1553#endif /* !WIN32 */
1554
1555/*
1556 * Read all available data from the self-pipe
1557 *
1558 * Note: this is only called when waiting = true. If it fails and doesn't
1559 * return, it must reset that flag first (though ideally, this will never
1560 * happen).
1561 */
1562#ifndef WIN32
1563static void
1564drainSelfPipe(void)
1565{
1566 /*
1567 * There shouldn't normally be more than one byte in the pipe, or maybe a
1568 * few bytes if multiple processes run SetLatch at the same instant.
1569 */
1570 char buf[16];
1571 int rc;
1572
1573 for (;;)
1574 {
1575 rc = read(selfpipe_readfd, buf, sizeof(buf));
1576 if (rc < 0)
1577 {
1578 if (errno == EAGAIN || errno == EWOULDBLOCK)
1579 break; /* the pipe is empty */
1580 else if (errno == EINTR)
1581 continue; /* retry */
1582 else
1583 {
1584 waiting = false;
1585 elog(ERROR, "read() on self-pipe failed: %m");
1586 }
1587 }
1588 else if (rc == 0)
1589 {
1590 waiting = false;
1591 elog(ERROR, "unexpected EOF on self-pipe");
1592 }
1593 else if (rc < sizeof(buf))
1594 {
1595 /* we successfully drained the pipe; no need to read() again */
1596 break;
1597 }
1598 /* else buffer wasn't big enough, so read again */
1599 }
1600}
1601#endif /* !WIN32 */
1602