/*-------------------------------------------------------------------------
 *
 * latch.c
 *	  Routines for inter-process latches
 *
 * The Unix implementation uses the so-called self-pipe trick to overcome the
 * race condition involved with poll() (or epoll_wait() on linux) and setting
 * a global flag in the signal handler. When a latch is set and the current
 * process is waiting for it, the signal handler wakes up the poll() in
 * WaitLatch by writing a byte to a pipe. A signal by itself doesn't interrupt
 * poll() on all platforms, and even on platforms where it does, a signal that
 * arrives just before the poll() call does not prevent poll() from entering
 * sleep. An incoming byte on a pipe however reliably interrupts the sleep,
 * and causes poll() to return immediately even if the signal arrives before
 * poll() begins.
 *
 * When SetLatch is called from the same process that owns the latch,
 * SetLatch writes the byte directly to the pipe. If it's owned by another
 * process, SIGUSR1 is sent and the signal handler in the waiting process
 * writes the byte to the pipe on behalf of the signaling process.
 *
 * The Windows implementation uses Windows events that are inherited by all
 * postmaster child processes. There's no need for the self-pipe trick there.
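 *
 * The canonical way to use a latch is the wait loop documented in latch.h.
 * A minimal sketch (work_available stands in for whatever condition the
 * caller actually waits on; the final 0 is a placeholder for one of the
 * WAIT_EVENT_* constants reported in pg_stat_activity):
 *
 *		for (;;)
 *		{
 *			ResetLatch(MyLatch);
 *			if (work_available)
 *				break;
 *			(void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH,
 *							 -1, 0);
 *		}
 *
 * Resetting the latch before checking the condition is what makes this
 * race-free: a SetLatch that happens after the check is seen by the next
 * WaitLatch call.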
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/storage/ipc/latch.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <fcntl.h>
#include <limits.h>
#include <signal.h>
#include <unistd.h>
#ifdef HAVE_SYS_EPOLL_H
#include <sys/epoll.h>
#endif
#ifdef HAVE_POLL_H
#include <poll.h>
#endif

#include "miscadmin.h"
#include "pgstat.h"
#include "port/atomics.h"
#include "portability/instr_time.h"
#include "postmaster/postmaster.h"
#include "storage/ipc.h"
#include "storage/latch.h"
#include "storage/pmsignal.h"
#include "storage/shmem.h"

/*
 * Select the fd readiness primitive to use. Normally the "most modern"
 * primitive supported by the OS will be used, but for testing it can be
 * useful to manually specify the used primitive.  If desired, just add a
 * define somewhere before this block.
 */
#if defined(WAIT_USE_EPOLL) || defined(WAIT_USE_POLL) || \
    defined(WAIT_USE_WIN32)
/* don't overwrite manual choice */
#elif defined(HAVE_SYS_EPOLL_H)
#define WAIT_USE_EPOLL
#elif defined(HAVE_POLL)
#define WAIT_USE_POLL
#elif WIN32
#define WAIT_USE_WIN32
#else
#error "no wait set implementation available"
#endif

/* typedef in latch.h */
struct WaitEventSet
{
    int         nevents;        /* number of registered events */
    int         nevents_space;  /* maximum number of events in this set */

    /*
     * Array, of nevents_space length, storing the definition of events this
     * set is waiting for.
     */
    WaitEvent  *events;

    /*
     * If WL_LATCH_SET is specified in any wait event, latch is a pointer to
     * said latch, and latch_pos the offset in the ->events array. This is
     * useful because we check the state of the latch before performing any
     * syscalls related to waiting.
     */
    Latch      *latch;
    int         latch_pos;

    /*
     * WL_EXIT_ON_PM_DEATH is converted to WL_POSTMASTER_DEATH, but this flag
     * is set so that we'll exit immediately if postmaster death is detected,
     * instead of returning.
     */
    bool        exit_on_postmaster_death;

#if defined(WAIT_USE_EPOLL)
    int         epoll_fd;
    /* epoll_wait returns events in a user-provided array; allocate once */
    struct epoll_event *epoll_ret_events;
#elif defined(WAIT_USE_POLL)
    /* poll() must be passed the full set of events on every call; prepare once */
    struct pollfd *pollfds;
#elif defined(WAIT_USE_WIN32)

    /*
     * Array of windows events. The first element always contains
     * pgwin32_signal_event, so the remaining elements are offset by one (i.e.
     * event->pos + 1).
     */
    HANDLE     *handles;
#endif
};

#ifndef WIN32
/* Are we currently in WaitLatch? The signal handler would like to know. */
static volatile sig_atomic_t waiting = false;

/* Read and write ends of the self-pipe */
static int  selfpipe_readfd = -1;
static int  selfpipe_writefd = -1;

/* Process owning the self-pipe --- needed for checking purposes */
static int  selfpipe_owner_pid = 0;

/* Private function prototypes */
static void sendSelfPipeByte(void);
static void drainSelfPipe(void);
#endif                          /* WIN32 */

#if defined(WAIT_USE_EPOLL)
static void WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action);
#elif defined(WAIT_USE_POLL)
static void WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event);
#elif defined(WAIT_USE_WIN32)
static void WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event);
#endif

static inline int WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
                                        WaitEvent *occurred_events, int nevents);

/*
 * Initialize the process-local latch infrastructure.
 *
 * This must be called once during startup of any process that can wait on
 * latches, before it issues any InitLatch() or OwnLatch() calls.
 */
void
InitializeLatchSupport(void)
{
#ifndef WIN32
    int         pipefd[2];

    if (IsUnderPostmaster)
    {
        /*
         * We might have inherited connections to a self-pipe created by the
         * postmaster.  It's critical that child processes create their own
         * self-pipes, of course, and we really want them to close the
         * inherited FDs for safety's sake.
         */
        if (selfpipe_owner_pid != 0)
        {
            /* Assert we go through here but once in a child process */
            Assert(selfpipe_owner_pid != MyProcPid);
            /* Release postmaster's pipe FDs; ignore any error */
            (void) close(selfpipe_readfd);
            (void) close(selfpipe_writefd);
            /* Clean up, just for safety's sake; we'll set these below */
            selfpipe_readfd = selfpipe_writefd = -1;
            selfpipe_owner_pid = 0;
        }
        else
        {
            /*
             * Postmaster didn't create a self-pipe ... or else we're in an
             * EXEC_BACKEND build, in which case it doesn't matter since the
             * postmaster's pipe FDs were closed by the action of FD_CLOEXEC.
             */
            Assert(selfpipe_readfd == -1);
        }
    }
    else
    {
        /* In postmaster or standalone backend, assert we do this but once */
        Assert(selfpipe_readfd == -1);
        Assert(selfpipe_owner_pid == 0);
    }

    /*
     * Set up the self-pipe that allows a signal handler to wake up the
     * poll()/epoll_wait() in WaitLatch. Make the write-end non-blocking, so
     * that SetLatch won't block if the event has already been set many times
     * filling the kernel buffer. Make the read-end non-blocking too, so that
     * we can easily clear the pipe by reading until EAGAIN or EWOULDBLOCK.
     * Also, make both FDs close-on-exec, since we surely do not want any
     * child processes messing with them.
     */
    if (pipe(pipefd) < 0)
        elog(FATAL, "pipe() failed: %m");
    if (fcntl(pipefd[0], F_SETFL, O_NONBLOCK) == -1)
        elog(FATAL, "fcntl(F_SETFL) failed on read-end of self-pipe: %m");
    if (fcntl(pipefd[1], F_SETFL, O_NONBLOCK) == -1)
        elog(FATAL, "fcntl(F_SETFL) failed on write-end of self-pipe: %m");
    if (fcntl(pipefd[0], F_SETFD, FD_CLOEXEC) == -1)
        elog(FATAL, "fcntl(F_SETFD) failed on read-end of self-pipe: %m");
    if (fcntl(pipefd[1], F_SETFD, FD_CLOEXEC) == -1)
        elog(FATAL, "fcntl(F_SETFD) failed on write-end of self-pipe: %m");

    selfpipe_readfd = pipefd[0];
    selfpipe_writefd = pipefd[1];
    selfpipe_owner_pid = MyProcPid;
#else
    /* currently, nothing to do here for Windows */
#endif
}

/*
 * Initialize a process-local latch.
 */
void
InitLatch(Latch *latch)
{
    latch->is_set = false;
    latch->owner_pid = MyProcPid;
    latch->is_shared = false;

#ifndef WIN32
    /* Assert InitializeLatchSupport has been called in this process */
    Assert(selfpipe_readfd >= 0 && selfpipe_owner_pid == MyProcPid);
#else
    latch->event = CreateEvent(NULL, TRUE, FALSE, NULL);
    if (latch->event == NULL)
        elog(ERROR, "CreateEvent failed: error code %lu", GetLastError());
#endif                          /* WIN32 */
}

/*
 * Initialize a shared latch that can be set from other processes. The latch
 * is initially owned by no-one; use OwnLatch to associate it with the
 * current process.
 *
 * InitSharedLatch needs to be called in postmaster before forking child
 * processes, usually right after allocating the shared memory block
 * containing the latch with ShmemInitStruct. (The Unix implementation
 * doesn't actually require that, but the Windows one does.) Because of
 * this restriction, we have no concurrency issues to worry about here.
 *
 * Note that other handles created in this module are never marked as
 * inheritable.  Thus we do not need to worry about cleaning up child
 * process references to postmaster-private latches or WaitEventSets.
 */
void
InitSharedLatch(Latch *latch)
{
#ifdef WIN32
    SECURITY_ATTRIBUTES sa;

    /*
     * Set up security attributes to specify that the events are inherited.
     */
    ZeroMemory(&sa, sizeof(sa));
    sa.nLength = sizeof(sa);
    sa.bInheritHandle = TRUE;

    latch->event = CreateEvent(&sa, TRUE, FALSE, NULL);
    if (latch->event == NULL)
        elog(ERROR, "CreateEvent failed: error code %lu", GetLastError());
#endif

    latch->is_set = false;
    latch->owner_pid = 0;
    latch->is_shared = true;
}
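
/*
 * Example (a sketch; MyShared and ss are illustrative names): the typical
 * shared-latch lifecycle.  The postmaster initializes the latch in shared
 * memory before any children fork:
 *
 *		ss = (MyShared *) ShmemInitStruct("my struct", sizeof(MyShared), &found);
 *		InitSharedLatch(&ss->latch);
 *
 * and the child process that will wait on it later claims it with
 *
 *		OwnLatch(&ss->latch);
 */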

/*
 * Associate a shared latch with the current process, allowing it to
 * wait on the latch.
 *
 * Although there is a sanity check for latch-already-owned, we don't do
 * any sort of locking here, meaning that we could fail to detect the error
 * if two processes try to own the same latch at about the same time.  If
 * there is any risk of that, caller must provide an interlock to prevent it.
 *
 * In any process that calls OwnLatch(), make sure that
 * latch_sigusr1_handler() is called from the SIGUSR1 signal handler,
 * as shared latches use SIGUSR1 for inter-process communication.
 */
void
OwnLatch(Latch *latch)
{
    /* Sanity checks */
    Assert(latch->is_shared);

#ifndef WIN32
    /* Assert InitializeLatchSupport has been called in this process */
    Assert(selfpipe_readfd >= 0 && selfpipe_owner_pid == MyProcPid);
#endif

    if (latch->owner_pid != 0)
        elog(ERROR, "latch already owned");

    latch->owner_pid = MyProcPid;
}

/*
 * Disown a shared latch currently owned by the current process.
 */
void
DisownLatch(Latch *latch)
{
    Assert(latch->is_shared);
    Assert(latch->owner_pid == MyProcPid);

    latch->owner_pid = 0;
}

/*
 * Wait for a given latch to be set, or for postmaster death, or until timeout
 * is exceeded. 'wakeEvents' is a bitmask that specifies which of those events
 * to wait for. If the latch is already set (and WL_LATCH_SET is given), the
 * function returns immediately.
 *
 * The "timeout" is given in milliseconds. It must be >= 0 if the WL_TIMEOUT
 * flag is given.  Although it is declared as "long", we don't actually support
 * timeouts longer than INT_MAX milliseconds.  Note that some extra overhead
 * is incurred when WL_TIMEOUT is given, so avoid using a timeout if possible.
 *
 * The latch must be owned by the current process, i.e. it must be a
 * process-local latch initialized with InitLatch, or a shared latch
 * associated with the current process by calling OwnLatch.
 *
 * Returns a bit mask indicating which condition(s) caused the wake-up. Note
 * that if multiple wake-up conditions are true, there is no guarantee that
 * we return all of them in one call, but we will return at least one.
 */
int
WaitLatch(Latch *latch, int wakeEvents, long timeout,
          uint32 wait_event_info)
{
    return WaitLatchOrSocket(latch, wakeEvents, PGINVALID_SOCKET, timeout,
                             wait_event_info);
}
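
/*
 * Example (a sketch): a timed wait, decoding the wake-up reason from the
 * returned bitmask.  The 0 passed as wait_event_info is a placeholder for
 * one of the WAIT_EVENT_* constants shown in pg_stat_activity:
 *
 *		int		rc;
 *
 *		rc = WaitLatch(MyLatch,
 *					   WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
 *					   10 * 1000L, 0);
 *		if (rc & WL_LATCH_SET)
 *			ResetLatch(MyLatch);
 *
 * If WL_TIMEOUT is set in the result instead, the ten seconds elapsed
 * without the latch being set.
 */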

/*
 * Like WaitLatch, but with an extra socket argument for WL_SOCKET_*
 * conditions.
 *
 * When waiting on a socket, EOF and error conditions always cause the socket
 * to be reported as readable/writable/connected, so that the caller can deal
 * with the condition.
 *
 * wakeEvents must include either WL_EXIT_ON_PM_DEATH for automatic exit
 * if the postmaster dies or WL_POSTMASTER_DEATH for a flag set in the
 * return value if the postmaster dies.  The latter is useful for rare cases
 * where some behavior other than immediate exit is needed.
 *
 * NB: These days this is just a wrapper around the WaitEventSet API. When
 * using a latch very frequently, consider creating a longer living
 * WaitEventSet instead; that's more efficient.
 */
int
WaitLatchOrSocket(Latch *latch, int wakeEvents, pgsocket sock,
                  long timeout, uint32 wait_event_info)
{
    int         ret = 0;
    int         rc;
    WaitEvent   event;
    WaitEventSet *set = CreateWaitEventSet(CurrentMemoryContext, 3);

    if (wakeEvents & WL_TIMEOUT)
        Assert(timeout >= 0);
    else
        timeout = -1;

    if (wakeEvents & WL_LATCH_SET)
        AddWaitEventToSet(set, WL_LATCH_SET, PGINVALID_SOCKET,
                          latch, NULL);

    /* Postmaster-managed callers must handle postmaster death somehow. */
    Assert(!IsUnderPostmaster ||
           (wakeEvents & WL_EXIT_ON_PM_DEATH) ||
           (wakeEvents & WL_POSTMASTER_DEATH));

    if ((wakeEvents & WL_POSTMASTER_DEATH) && IsUnderPostmaster)
        AddWaitEventToSet(set, WL_POSTMASTER_DEATH, PGINVALID_SOCKET,
                          NULL, NULL);

    if ((wakeEvents & WL_EXIT_ON_PM_DEATH) && IsUnderPostmaster)
        AddWaitEventToSet(set, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
                          NULL, NULL);

    if (wakeEvents & WL_SOCKET_MASK)
    {
        int         ev;

        ev = wakeEvents & WL_SOCKET_MASK;
        AddWaitEventToSet(set, ev, sock, NULL, NULL);
    }

    rc = WaitEventSetWait(set, timeout, &event, 1, wait_event_info);

    if (rc == 0)
        ret |= WL_TIMEOUT;
    else
    {
        ret |= event.events & (WL_LATCH_SET |
                               WL_POSTMASTER_DEATH |
                               WL_SOCKET_MASK);
    }

    FreeWaitEventSet(set);

    return ret;
}

/*
 * Sets a latch and wakes up anyone waiting on it.
 *
 * This is cheap if the latch is already set, otherwise not so much.
 *
 * NB: when calling this in a signal handler, be sure to save and restore
 * errno around it.  (That's standard practice in most signal handlers, of
 * course, but we used to omit it in handlers that only set a flag.)
 *
 * NB: this function is called from critical sections and signal handlers so
 * throwing an error is not a good idea.
 */
void
SetLatch(Latch *latch)
{
#ifndef WIN32
    pid_t       owner_pid;
#else
    HANDLE      handle;
#endif

    /*
     * The memory barrier has to be placed here to ensure that any flag
     * variables possibly changed by this process have been flushed to main
     * memory, before we check/set is_set.
     */
    pg_memory_barrier();

    /* Quick exit if already set */
    if (latch->is_set)
        return;

    latch->is_set = true;

#ifndef WIN32

    /*
     * See if anyone's waiting for the latch. It can be the current process if
     * we're in a signal handler. We use the self-pipe to wake up the
     * poll()/epoll_wait() in that case. If it's another process, send a
     * signal.
     *
     * Fetch owner_pid only once, in case the latch is concurrently getting
     * owned or disowned. XXX: This assumes that pid_t is atomic, which isn't
     * guaranteed to be true! In practice, the effective range of pid_t fits
     * in a 32 bit integer, and so should be atomic. In the worst case, we
     * might end up signaling the wrong process. Even then, you're very
     * unlucky if a process with that bogus pid exists and belongs to
     * Postgres; and PG database processes should handle excess SIGUSR1
     * interrupts without a problem anyhow.
     *
     * Another sort of race condition that's possible here is for a new
     * process to own the latch immediately after we look, so we don't signal
     * it. This is okay so long as all callers of ResetLatch/WaitLatch follow
     * the standard coding convention of waiting at the bottom of their loops,
     * not the top, so that they'll correctly process latch-setting events
     * that happen before they enter the loop.
     */
    owner_pid = latch->owner_pid;
    if (owner_pid == 0)
        return;
    else if (owner_pid == MyProcPid)
    {
        if (waiting)
            sendSelfPipeByte();
    }
    else
        kill(owner_pid, SIGUSR1);
#else

    /*
     * See if anyone's waiting for the latch. It can be the current process if
     * we're in a signal handler.
     *
     * Use a local variable here just in case somebody changes the event field
     * concurrently (which really should not happen).
     */
    handle = latch->event;
    if (handle)
    {
        SetEvent(handle);

        /*
         * Note that we silently ignore any errors. We might be in a signal
         * handler or other critical path where it's not safe to call elog().
         */
    }
#endif
}
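
/*
 * Example (a sketch): a signal handler that communicates through a latch
 * must save and restore errno around SetLatch, since the underlying
 * write()/kill() may clobber it (wakeup_requested stands in for whatever
 * flag the handler sets):
 *
 *		static void
 *		handle_wakeup(SIGNAL_ARGS)
 *		{
 *			int		save_errno = errno;
 *
 *			wakeup_requested = true;
 *			SetLatch(MyLatch);
 *
 *			errno = save_errno;
 *		}
 */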

/*
 * Clear the latch. Calling WaitLatch after this will sleep, unless
 * the latch is set again before the WaitLatch call.
 */
void
ResetLatch(Latch *latch)
{
    /* Only the owner should reset the latch */
    Assert(latch->owner_pid == MyProcPid);

    latch->is_set = false;

    /*
     * Ensure that the write to is_set gets flushed to main memory before we
     * examine any flag variables.  Otherwise a concurrent SetLatch might
     * falsely conclude that it needn't signal us, even though we have missed
     * seeing some flag updates that SetLatch was supposed to inform us of.
     */
    pg_memory_barrier();
}

/*
 * Create a WaitEventSet with space for nevents different events to wait for.
 *
 * These events can then be efficiently waited upon together, using
 * WaitEventSetWait().
 */
WaitEventSet *
CreateWaitEventSet(MemoryContext context, int nevents)
{
    WaitEventSet *set;
    char       *data;
    Size        sz = 0;

    /*
     * Use MAXALIGN size/alignment to guarantee that later uses of memory are
     * aligned correctly. E.g. epoll_event might need 8 byte alignment on some
     * platforms, but earlier allocations like WaitEventSet and WaitEvent
     * might not be sized to guarantee that when purely using sizeof().
     */
    sz += MAXALIGN(sizeof(WaitEventSet));
    sz += MAXALIGN(sizeof(WaitEvent) * nevents);

#if defined(WAIT_USE_EPOLL)
    sz += MAXALIGN(sizeof(struct epoll_event) * nevents);
#elif defined(WAIT_USE_POLL)
    sz += MAXALIGN(sizeof(struct pollfd) * nevents);
#elif defined(WAIT_USE_WIN32)
    /* need space for the pgwin32_signal_event */
    sz += MAXALIGN(sizeof(HANDLE) * (nevents + 1));
#endif

    data = (char *) MemoryContextAllocZero(context, sz);

    set = (WaitEventSet *) data;
    data += MAXALIGN(sizeof(WaitEventSet));

    set->events = (WaitEvent *) data;
    data += MAXALIGN(sizeof(WaitEvent) * nevents);

#if defined(WAIT_USE_EPOLL)
    set->epoll_ret_events = (struct epoll_event *) data;
    data += MAXALIGN(sizeof(struct epoll_event) * nevents);
#elif defined(WAIT_USE_POLL)
    set->pollfds = (struct pollfd *) data;
    data += MAXALIGN(sizeof(struct pollfd) * nevents);
#elif defined(WAIT_USE_WIN32)
    set->handles = (HANDLE *) data;
    data += MAXALIGN(sizeof(HANDLE) * nevents);
#endif

    set->latch = NULL;
    set->nevents_space = nevents;
    set->exit_on_postmaster_death = false;

#if defined(WAIT_USE_EPOLL)
#ifdef EPOLL_CLOEXEC
    set->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
    if (set->epoll_fd < 0)
        elog(ERROR, "epoll_create1 failed: %m");
#else
    /* cope with ancient glibc lacking epoll_create1 (e.g., RHEL5) */
    set->epoll_fd = epoll_create(nevents);
    if (set->epoll_fd < 0)
        elog(ERROR, "epoll_create failed: %m");
    if (fcntl(set->epoll_fd, F_SETFD, FD_CLOEXEC) == -1)
        elog(ERROR, "fcntl(F_SETFD) failed on epoll descriptor: %m");
#endif                          /* EPOLL_CLOEXEC */
#elif defined(WAIT_USE_WIN32)

    /*
     * To handle signals while waiting, we need to add a win32 specific event.
     * We accounted for the additional event at the top of this routine. See
     * port/win32/signal.c for more details.
     *
     * Note: pgwin32_signal_event should be first to ensure that it will be
     * reported when multiple events are set.  We want to guarantee that
     * pending signals are serviced.
     */
    set->handles[0] = pgwin32_signal_event;
    StaticAssertStmt(WSA_INVALID_EVENT == NULL, "");
#endif

    return set;
}
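
/*
 * Example (a sketch of the longer-lived usage pattern recommended in
 * WaitLatchOrSocket's comment; sock and the event handling are
 * placeholders):
 *
 *		WaitEventSet *set = CreateWaitEventSet(TopMemoryContext, 3);
 *		WaitEvent	event;
 *
 *		AddWaitEventToSet(set, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL);
 *		AddWaitEventToSet(set, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
 *						  NULL, NULL);
 *		AddWaitEventToSet(set, WL_SOCKET_READABLE, sock, NULL, NULL);
 *
 *		for (;;)
 *		{
 *			(void) WaitEventSetWait(set, -1, &event, 1, 0);
 *			if (event.events & WL_LATCH_SET)
 *				ResetLatch(MyLatch);
 *			else if (event.events & WL_SOCKET_READABLE)
 *				...read from sock...
 *		}
 */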

/*
 * Free a previously created WaitEventSet.
 *
 * Note: preferably, this shouldn't have to free any resources that could be
 * inherited across an exec(). If it did, we'd likely leak those resources in
 * many scenarios. For the epoll case, we ensure that by setting FD_CLOEXEC
 * when the FD is created. For the Windows case, we assume that the handles
 * involved are non-inheritable.
 */
void
FreeWaitEventSet(WaitEventSet *set)
{
#if defined(WAIT_USE_EPOLL)
    close(set->epoll_fd);
#elif defined(WAIT_USE_WIN32)
    WaitEvent  *cur_event;

    for (cur_event = set->events;
         cur_event < (set->events + set->nevents);
         cur_event++)
    {
        if (cur_event->events & WL_LATCH_SET)
        {
            /* uses the latch's HANDLE */
        }
        else if (cur_event->events & WL_POSTMASTER_DEATH)
        {
            /* uses PostmasterHandle */
        }
        else
        {
            /* Clean up the event object we created for the socket */
            WSAEventSelect(cur_event->fd, NULL, 0);
            WSACloseEvent(set->handles[cur_event->pos + 1]);
        }
    }
#endif

    pfree(set);
}

/* ---
 * Add an event to the set. Possible events are:
 * - WL_LATCH_SET: Wait for the latch to be set
 * - WL_POSTMASTER_DEATH: Wait for postmaster to die
 * - WL_SOCKET_READABLE: Wait for socket to become readable,
 *	 can be combined in one event with other WL_SOCKET_* events
 * - WL_SOCKET_WRITEABLE: Wait for socket to become writeable,
 *	 can be combined with other WL_SOCKET_* events
 * - WL_SOCKET_CONNECTED: Wait for socket connection to be established,
 *	 can be combined with other WL_SOCKET_* events (on non-Windows
 *	 platforms, this is the same as WL_SOCKET_WRITEABLE)
 * - WL_EXIT_ON_PM_DEATH: Exit immediately if the postmaster dies
 *
 * Returns the offset in WaitEventSet->events (starting from 0), which can be
 * used to modify previously added wait events using ModifyWaitEvent().
 *
 * In the WL_LATCH_SET case the latch must be owned by the current process,
 * i.e. it must be a process-local latch initialized with InitLatch, or a
 * shared latch associated with the current process by calling OwnLatch.
 *
 * In the WL_SOCKET_READABLE/WRITEABLE/CONNECTED cases, EOF and error
 * conditions cause the socket to be reported as readable/writable/connected,
 * so that the caller can deal with the condition.
 *
 * The user_data pointer specified here will be set for the events returned
 * by WaitEventSetWait(), allowing the caller to easily associate additional
 * data with events.
 */
int
AddWaitEventToSet(WaitEventSet *set, uint32 events, pgsocket fd, Latch *latch,
                  void *user_data)
{
    WaitEvent  *event;

    /* not enough space */
    Assert(set->nevents < set->nevents_space);

    if (events == WL_EXIT_ON_PM_DEATH)
    {
        events = WL_POSTMASTER_DEATH;
        set->exit_on_postmaster_death = true;
    }

    if (latch)
    {
        if (latch->owner_pid != MyProcPid)
            elog(ERROR, "cannot wait on a latch owned by another process");
        if (set->latch)
            elog(ERROR, "cannot wait on more than one latch");
        if ((events & WL_LATCH_SET) != WL_LATCH_SET)
            elog(ERROR, "latch events only support being set");
    }
    else
    {
        if (events & WL_LATCH_SET)
            elog(ERROR, "cannot wait on latch without a specified latch");
    }

    /* waiting for socket readiness without a socket indicates a bug */
    if (fd == PGINVALID_SOCKET && (events & WL_SOCKET_MASK))
        elog(ERROR, "cannot wait on socket event without a socket");

    event = &set->events[set->nevents];
    event->pos = set->nevents++;
    event->fd = fd;
    event->events = events;
    event->user_data = user_data;
#ifdef WIN32
    event->reset = false;
#endif

    if (events == WL_LATCH_SET)
    {
        set->latch = latch;
        set->latch_pos = event->pos;
#ifndef WIN32
        event->fd = selfpipe_readfd;
#endif
    }
    else if (events == WL_POSTMASTER_DEATH)
    {
#ifndef WIN32
        event->fd = postmaster_alive_fds[POSTMASTER_FD_WATCH];
#endif
    }

    /* perform wait primitive specific initialization, if needed */
#if defined(WAIT_USE_EPOLL)
    WaitEventAdjustEpoll(set, event, EPOLL_CTL_ADD);
#elif defined(WAIT_USE_POLL)
    WaitEventAdjustPoll(set, event);
#elif defined(WAIT_USE_WIN32)
    WaitEventAdjustWin32(set, event);
#endif

    return event->pos;
}

/*
 * Change the event mask and, in the WL_LATCH_SET case, the latch associated
 * with the WaitEvent.
 *
 * 'pos' is the id returned by AddWaitEventToSet.
 */
void
ModifyWaitEvent(WaitEventSet *set, int pos, uint32 events, Latch *latch)
{
    WaitEvent  *event;

    Assert(pos < set->nevents);

    event = &set->events[pos];

    /*
     * If neither the event mask nor the associated latch changes, return
     * early. That's an important optimization for some sockets, where
     * ModifyWaitEvent is frequently used to switch from waiting for reads to
     * waiting on writes.
     */
    if (events == event->events &&
        (!(event->events & WL_LATCH_SET) || set->latch == latch))
        return;

    if (event->events & WL_LATCH_SET &&
        events != event->events)
    {
        /* we could allow disabling latch events for a while */
        elog(ERROR, "cannot modify latch event");
    }

    if (event->events & WL_POSTMASTER_DEATH)
    {
        elog(ERROR, "cannot modify postmaster death event");
    }

    /* FIXME: validate event mask */
    event->events = events;

    if (events == WL_LATCH_SET)
    {
        set->latch = latch;
    }

#if defined(WAIT_USE_EPOLL)
    WaitEventAdjustEpoll(set, event, EPOLL_CTL_MOD);
#elif defined(WAIT_USE_POLL)
    WaitEventAdjustPoll(set, event);
#elif defined(WAIT_USE_WIN32)
    WaitEventAdjustWin32(set, event);
#endif
}
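
/*
 * Example (a sketch): the read<->write switch that the early-exit check
 * above is optimized for, driven by the position AddWaitEventToSet returned:
 *
 *		int		sock_pos;
 *
 *		sock_pos = AddWaitEventToSet(set, WL_SOCKET_READABLE, sock,
 *									 NULL, NULL);
 *		...
 *		ModifyWaitEvent(set, sock_pos, WL_SOCKET_WRITEABLE, NULL);
 */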

#if defined(WAIT_USE_EPOLL)
/*
 * action can be one of EPOLL_CTL_ADD | EPOLL_CTL_MOD | EPOLL_CTL_DEL
 */
static void
WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action)
{
    struct epoll_event epoll_ev;
    int         rc;

    /* pointer to our event, returned by epoll_wait */
    epoll_ev.data.ptr = event;
    /* always wait for errors */
    epoll_ev.events = EPOLLERR | EPOLLHUP;

    /* prepare pollfd entry once */
    if (event->events == WL_LATCH_SET)
    {
        Assert(set->latch != NULL);
        epoll_ev.events |= EPOLLIN;
    }
    else if (event->events == WL_POSTMASTER_DEATH)
    {
        epoll_ev.events |= EPOLLIN;
    }
    else
    {
        Assert(event->fd != PGINVALID_SOCKET);
        Assert(event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE));

        if (event->events & WL_SOCKET_READABLE)
            epoll_ev.events |= EPOLLIN;
        if (event->events & WL_SOCKET_WRITEABLE)
            epoll_ev.events |= EPOLLOUT;
    }

    /*
     * Even though unused, we also pass epoll_ev as the data argument if
     * EPOLL_CTL_DEL is passed as action.  There used to be an epoll bug
     * requiring that, and actually it makes the code simpler...
     */
    rc = epoll_ctl(set->epoll_fd, action, event->fd, &epoll_ev);

    if (rc < 0)
        ereport(ERROR,
                (errcode_for_socket_access(),
                 /* translator: %s is a syscall name, such as "poll()" */
                 errmsg("%s failed: %m",
                        "epoll_ctl()")));
}
#endif

#if defined(WAIT_USE_POLL)
static void
WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event)
{
    struct pollfd *pollfd = &set->pollfds[event->pos];

    pollfd->revents = 0;
    pollfd->fd = event->fd;

    /* prepare pollfd entry once */
    if (event->events == WL_LATCH_SET)
    {
        Assert(set->latch != NULL);
        pollfd->events = POLLIN;
    }
    else if (event->events == WL_POSTMASTER_DEATH)
    {
        pollfd->events = POLLIN;
    }
    else
    {
        Assert(event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE));
        pollfd->events = 0;
        if (event->events & WL_SOCKET_READABLE)
            pollfd->events |= POLLIN;
        if (event->events & WL_SOCKET_WRITEABLE)
            pollfd->events |= POLLOUT;
    }

    Assert(event->fd != PGINVALID_SOCKET);
}
#endif

#if defined(WAIT_USE_WIN32)
static void
WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event)
{
    HANDLE     *handle = &set->handles[event->pos + 1];

    if (event->events == WL_LATCH_SET)
    {
        Assert(set->latch != NULL);
        *handle = set->latch->event;
    }
    else if (event->events == WL_POSTMASTER_DEATH)
    {
        *handle = PostmasterHandle;
    }
    else
    {
        int         flags = FD_CLOSE;   /* always check for errors/EOF */

        if (event->events & WL_SOCKET_READABLE)
            flags |= FD_READ;
        if (event->events & WL_SOCKET_WRITEABLE)
            flags |= FD_WRITE;
        if (event->events & WL_SOCKET_CONNECTED)
            flags |= FD_CONNECT;

        if (*handle == WSA_INVALID_EVENT)
        {
            *handle = WSACreateEvent();
            if (*handle == WSA_INVALID_EVENT)
                elog(ERROR, "failed to create event for socket: error code %u",
                     WSAGetLastError());
        }
        if (WSAEventSelect(event->fd, *handle, flags) != 0)
            elog(ERROR, "failed to set up event for socket: error code %u",
                 WSAGetLastError());

        Assert(event->fd != PGINVALID_SOCKET);
    }
}
#endif

/*
 * Wait for events added to the set to happen, or until the timeout is
 * reached.  At most nevents occurred events are returned.
 *
 * If timeout = -1, block until an event occurs; if 0, check sockets for
 * readiness, but don't block; if > 0, block for at most timeout milliseconds.
 *
 * Returns the number of events that occurred, or 0 if the timeout was
 * reached.
 *
 * Returned events will have the fd, pos, user_data fields set to the
 * values associated with the registered event.
 */
int
WaitEventSetWait(WaitEventSet *set, long timeout,
                 WaitEvent *occurred_events, int nevents,
                 uint32 wait_event_info)
{
    int         returned_events = 0;
    instr_time  start_time;
    instr_time  cur_time;
    long        cur_timeout = -1;

    Assert(nevents > 0);

    /*
     * Initialize timeout if requested.  We must record the current time so
     * that we can determine the remaining timeout if interrupted.
     */
    if (timeout >= 0)
    {
        INSTR_TIME_SET_CURRENT(start_time);
        Assert(timeout >= 0 && timeout <= INT_MAX);
        cur_timeout = timeout;
    }

    pgstat_report_wait_start(wait_event_info);

#ifndef WIN32
    waiting = true;
#else
    /* Ensure that signals are serviced even if latch is already set */
    pgwin32_dispatch_queued_signals();
#endif
    while (returned_events == 0)
    {
        int         rc;

        /*
         * Check if the latch is set already. If so, leave the loop
         * immediately, avoiding blocking again. We don't attempt to report
         * any other events that might also be satisfied.
         *
         * If someone sets the latch between this and the
         * WaitEventSetWaitBlock() below, the setter will write a byte to the
         * pipe (or signal us and the signal handler will do that), and the
         * readiness routine will return immediately.
         *
         * On unix, if there's a pending byte in the self pipe, we'll notice
         * whenever blocking. Only clearing the pipe in that case avoids
         * having to drain it every time WaitLatchOrSocket() is used. Should
         * the pipe-buffer fill up we're still ok, because the pipe is in
         * nonblocking mode. It's unlikely for that to happen, because the
         * self pipe isn't filled unless we're blocking (waiting = true), or
         * from inside a signal handler in latch_sigusr1_handler().
         *
         * On windows, we'll also notice if there's a pending event for the
         * latch when blocking, but there's no danger of anything filling up,
         * as "Setting an event that is already set has no effect.".
         *
         * Note: we assume that the kernel calls involved in latch management
         * will provide adequate synchronization on machines with weak memory
         * ordering, so that we cannot miss seeing is_set if a notification
         * has already been queued.
         */
        if (set->latch && set->latch->is_set)
        {
            occurred_events->fd = PGINVALID_SOCKET;
            occurred_events->pos = set->latch_pos;
            occurred_events->user_data =
                set->events[set->latch_pos].user_data;
            occurred_events->events = WL_LATCH_SET;
            occurred_events++;
            returned_events++;

            break;
        }

        /*
         * Wait for events using the readiness primitive chosen at the top of
         * this file. If -1 is returned, a timeout has occurred, if 0 we have
         * to retry, everything >= 1 is the number of returned events.
         */
        rc = WaitEventSetWaitBlock(set, cur_timeout,
                                   occurred_events, nevents);

        if (rc == -1)
            break;              /* timeout occurred */
        else
            returned_events = rc;

        /* If we're not done, update cur_timeout for next iteration */
        if (returned_events == 0 && timeout >= 0)
        {
            INSTR_TIME_SET_CURRENT(cur_time);
            INSTR_TIME_SUBTRACT(cur_time, start_time);
            cur_timeout = timeout - (long) INSTR_TIME_GET_MILLISEC(cur_time);
            if (cur_timeout <= 0)
                break;
        }
    }
#ifndef WIN32
    waiting = false;
#endif

    pgstat_report_wait_end();

    return returned_events;
}
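
/*
 * Example (a sketch): retrieving several events per call.  occurred[] is
 * filled with up to lengthof(occurred) events, and the return value says
 * how many were filled in; 0 means the timeout elapsed (handle_read is a
 * placeholder):
 *
 *		WaitEvent	occurred[4];
 *		int			nevents;
 *
 *		nevents = WaitEventSetWait(set, 1000L, occurred,
 *								   lengthof(occurred), 0);
 *		for (int i = 0; i < nevents; i++)
 *		{
 *			if (occurred[i].events & WL_SOCKET_READABLE)
 *				handle_read(occurred[i].fd, occurred[i].user_data);
 *		}
 */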

#if defined(WAIT_USE_EPOLL)

/*
 * Wait using linux's epoll_wait(2).
 *
 * This is the preferable wait method, as several readiness notifications are
 * delivered, without having to iterate through all of set->events. The
 * returned epoll_event structs contain a pointer to our events, making
 * association easy.
 */
static inline int
WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
                      WaitEvent *occurred_events, int nevents)
{
    int         returned_events = 0;
    int         rc;
    WaitEvent  *cur_event;
    struct epoll_event *cur_epoll_event;

    /* Sleep */
    rc = epoll_wait(set->epoll_fd, set->epoll_ret_events,
                    nevents, cur_timeout);

    /* Check return code */
    if (rc < 0)
    {
        /* EINTR is okay, otherwise complain */
        if (errno != EINTR)
        {
            waiting = false;
            ereport(ERROR,
                    (errcode_for_socket_access(),
                     /* translator: %s is a syscall name, such as "poll()" */
                     errmsg("%s failed: %m",
                            "epoll_wait()")));
        }
        return 0;
    }
    else if (rc == 0)
    {
        /* timeout exceeded */
        return -1;
    }

    /*
     * At least one event occurred, iterate over the returned epoll events
     * until they're either all processed, or we've returned all the events
     * the caller desired.
     */
    for (cur_epoll_event = set->epoll_ret_events;
         cur_epoll_event < (set->epoll_ret_events + rc) &&
         returned_events < nevents;
         cur_epoll_event++)
    {
        /* epoll's data pointer is set to the associated WaitEvent */
        cur_event = (WaitEvent *) cur_epoll_event->data.ptr;

        occurred_events->pos = cur_event->pos;
        occurred_events->user_data = cur_event->user_data;
        occurred_events->events = 0;

        if (cur_event->events == WL_LATCH_SET &&
            cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP))
        {
            /* There's data in the self-pipe, clear it. */
            drainSelfPipe();

            if (set->latch->is_set)
            {
                occurred_events->fd = PGINVALID_SOCKET;
                occurred_events->events = WL_LATCH_SET;
                occurred_events++;
                returned_events++;
            }
        }
        else if (cur_event->events == WL_POSTMASTER_DEATH &&
                 cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP))
        {
            /*
             * We expect an EPOLLHUP when the remote end is closed, but
             * because we don't expect the pipe to become readable or to have
             * any errors either, treat those cases as postmaster death, too.
             *
             * Be paranoid about a spurious event signalling the postmaster as
             * being dead.  There have been reports about that happening with
             * older primitives (select(2) to be specific), and a spurious
             * WL_POSTMASTER_DEATH event would be painful. Re-checking doesn't
             * cost much.
             */
            if (!PostmasterIsAliveInternal())
            {
                if (set->exit_on_postmaster_death)
                    proc_exit(1);
                occurred_events->fd = PGINVALID_SOCKET;
                occurred_events->events = WL_POSTMASTER_DEATH;
                occurred_events++;
                returned_events++;
            }
        }
        else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
        {
            Assert(cur_event->fd != PGINVALID_SOCKET);

            if ((cur_event->events & WL_SOCKET_READABLE) &&
                (cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP)))
            {
                /* data available in socket, or EOF */
                occurred_events->events |= WL_SOCKET_READABLE;
            }

            if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
                (cur_epoll_event->events & (EPOLLOUT | EPOLLERR | EPOLLHUP)))
            {
                /* writable, or EOF */
                occurred_events->events |= WL_SOCKET_WRITEABLE;
            }

            if (occurred_events->events != 0)
            {
                occurred_events->fd = cur_event->fd;
                occurred_events++;
                returned_events++;
            }
        }
    }

    return returned_events;
}

#elif defined(WAIT_USE_POLL)

/*
 * Wait using poll(2).
 *
 * This allows receiving readiness notifications for several events at once,
 * but requires iterating through all of set->pollfds.
 */
static inline int
WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
                      WaitEvent *occurred_events, int nevents)
{
    int         returned_events = 0;
    int         rc;
    WaitEvent  *cur_event;
    struct pollfd *cur_pollfd;

    /* Sleep */
    rc = poll(set->pollfds, set->nevents, (int) cur_timeout);

    /* Check return code */
    if (rc < 0)
    {
        /* EINTR is okay, otherwise complain */
        if (errno != EINTR)
        {
            waiting = false;
            ereport(ERROR,
                    (errcode_for_socket_access(),
                     /* translator: %s is a syscall name, such as "poll()" */
                     errmsg("%s failed: %m",
                            "poll()")));
        }
        return 0;
    }
    else if (rc == 0)
    {
        /* timeout exceeded */
        return -1;
    }

    for (cur_event = set->events, cur_pollfd = set->pollfds;
         cur_event < (set->events + set->nevents) &&
         returned_events < nevents;
         cur_event++, cur_pollfd++)
    {
        /* no activity on this FD, skip */
        if (cur_pollfd->revents == 0)
            continue;

        occurred_events->pos = cur_event->pos;
        occurred_events->user_data = cur_event->user_data;
        occurred_events->events = 0;

        if (cur_event->events == WL_LATCH_SET &&
            (cur_pollfd->revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL)))
        {
            /* There's data in the self-pipe, clear it. */
            drainSelfPipe();

            if (set->latch->is_set)
            {
                occurred_events->fd = PGINVALID_SOCKET;
                occurred_events->events = WL_LATCH_SET;
                occurred_events++;
                returned_events++;
            }
        }
        else if (cur_event->events == WL_POSTMASTER_DEATH &&
                 (cur_pollfd->revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL)))
        {
            /*
             * We expect a POLLHUP when the remote end is closed, but because
             * we don't expect the pipe to become readable or to have any
             * errors either, treat those cases as postmaster death, too.
             *
             * Be paranoid about a spurious event signalling the postmaster as
             * being dead.  There have been reports about that happening with
             * older primitives (select(2) to be specific), and a spurious
             * WL_POSTMASTER_DEATH event would be painful. Re-checking doesn't
             * cost much.
             */
            if (!PostmasterIsAliveInternal())
            {
                if (set->exit_on_postmaster_death)
                    proc_exit(1);
                occurred_events->fd = PGINVALID_SOCKET;
                occurred_events->events = WL_POSTMASTER_DEATH;
                occurred_events++;
                returned_events++;
            }
        }
        else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
        {
            int         errflags = POLLHUP | POLLERR | POLLNVAL;

            Assert(cur_event->fd >= PGINVALID_SOCKET);

            if ((cur_event->events & WL_SOCKET_READABLE) &&
                (cur_pollfd->revents & (POLLIN | errflags)))
            {
                /* data available in socket, or EOF */
                occurred_events->events |= WL_SOCKET_READABLE;
            }

            if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
                (cur_pollfd->revents & (POLLOUT | errflags)))
            {
                /* writeable, or EOF */
                occurred_events->events |= WL_SOCKET_WRITEABLE;
            }

            if (occurred_events->events != 0)
            {
                occurred_events->fd = cur_event->fd;
                occurred_events++;
                returned_events++;
            }
        }
    }
    return returned_events;
}

#elif defined(WAIT_USE_WIN32)

/*
 * Wait using Windows' WaitForMultipleObjects().
 *
 * Unfortunately this will only ever return a single readiness notification at
 * a time.  Note that while the official documentation for
 * WaitForMultipleObjects is ambiguous about multiple events being "consumed"
 * with a single bWaitAll = FALSE call,
 * https://blogs.msdn.microsoft.com/oldnewthing/20150409-00/?p=44273 confirms
 * that only one event is "consumed".
 */
static inline int
WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
                      WaitEvent *occurred_events, int nevents)
{
    int         returned_events = 0;
    DWORD       rc;
    WaitEvent  *cur_event;

    /* Reset any wait events that need it */
    for (cur_event = set->events;
         cur_event < (set->events + set->nevents);
         cur_event++)
    {
        if (cur_event->reset)
        {
            WaitEventAdjustWin32(set, cur_event);
            cur_event->reset = false;
        }

        /*
         * Windows does not guarantee to log an FD_WRITE network event
         * indicating that more data can be sent unless the previous send()
         * failed with WSAEWOULDBLOCK.  While our caller might well have made
         * such a call, we cannot assume that here.  Therefore, if waiting for
         * write-ready, force the issue by doing a dummy send().  If the dummy
         * send() succeeds, assume that the socket is in fact write-ready, and
         * return immediately.  Also, if it fails with something other than
         * WSAEWOULDBLOCK, return a write-ready indication to let our caller
         * deal with the error condition.
         */
        if (cur_event->events & WL_SOCKET_WRITEABLE)
        {
            char        c;
            WSABUF      buf;
            DWORD       sent;
            int         r;

            buf.buf = &c;
            buf.len = 0;

            r = WSASend(cur_event->fd, &buf, 1, &sent, 0, NULL, NULL);
            if (r == 0 || WSAGetLastError() != WSAEWOULDBLOCK)
            {
                occurred_events->pos = cur_event->pos;
                occurred_events->user_data = cur_event->user_data;
                occurred_events->events = WL_SOCKET_WRITEABLE;
                occurred_events->fd = cur_event->fd;
                return 1;
            }
        }
    }

    /*
     * Sleep.
     *
     * Need to wait for ->nevents + 1, because the signal event is in [0].
     */
    rc = WaitForMultipleObjects(set->nevents + 1, set->handles, FALSE,
                                cur_timeout);

    /* Check return code */
    if (rc == WAIT_FAILED)
        elog(ERROR, "WaitForMultipleObjects() failed: error code %lu",
             GetLastError());
    else if (rc == WAIT_TIMEOUT)
    {
        /* timeout exceeded */
        return -1;
    }

    if (rc == WAIT_OBJECT_0)
    {
        /* Service newly-arrived signals */
        pgwin32_dispatch_queued_signals();
        return 0;               /* retry */
    }

    /*
     * With an offset of one, due to the always present pgwin32_signal_event,
     * the handle offset directly corresponds to a wait event.
     */
    cur_event = (WaitEvent *) &set->events[rc - WAIT_OBJECT_0 - 1];

    occurred_events->pos = cur_event->pos;
    occurred_events->user_data = cur_event->user_data;
    occurred_events->events = 0;

    if (cur_event->events == WL_LATCH_SET)
    {
        if (!ResetEvent(set->latch->event))
            elog(ERROR, "ResetEvent failed: error code %lu", GetLastError());

        if (set->latch->is_set)
        {
            occurred_events->fd = PGINVALID_SOCKET;
            occurred_events->events = WL_LATCH_SET;
            occurred_events++;
            returned_events++;
        }
    }
    else if (cur_event->events == WL_POSTMASTER_DEATH)
    {
        /*
         * Postmaster apparently died.  Since the consequences of falsely
         * returning WL_POSTMASTER_DEATH could be pretty unpleasant, we take
         * the trouble to positively verify this with PostmasterIsAlive(),
         * even though there is no known reason to think that the event could
         * be falsely set on Windows.
         */
        if (!PostmasterIsAliveInternal())
        {
            if (set->exit_on_postmaster_death)
                proc_exit(1);
            occurred_events->fd = PGINVALID_SOCKET;
            occurred_events->events = WL_POSTMASTER_DEATH;
            occurred_events++;
            returned_events++;
        }
    }
    else if (cur_event->events & WL_SOCKET_MASK)
    {
        WSANETWORKEVENTS resEvents;
        HANDLE      handle = set->handles[cur_event->pos + 1];

        Assert(cur_event->fd);

        occurred_events->fd = cur_event->fd;

        ZeroMemory(&resEvents, sizeof(resEvents));
        if (WSAEnumNetworkEvents(cur_event->fd, handle, &resEvents) != 0)
            elog(ERROR, "failed to enumerate network events: error code %u",
                 WSAGetLastError());
        if ((cur_event->events & WL_SOCKET_READABLE) &&
            (resEvents.lNetworkEvents & FD_READ))
        {
            /* data available in socket */
            occurred_events->events |= WL_SOCKET_READABLE;

            /*------
             * WaitForMultipleObjects doesn't guarantee that a read event will
             * be returned if the latch is set at the same time.  Even if it
             * did, the caller might drop that event expecting it to reoccur
             * on next call.  So, we must force the event to be reset if this
             * WaitEventSet is used again in order to avoid an indefinite
             * hang.  Refer https://msdn.microsoft.com/en-us/library/windows/desktop/ms741576(v=vs.85).aspx
             * for the behavior of socket events.
             *------
             */
            cur_event->reset = true;
        }
        if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
            (resEvents.lNetworkEvents & FD_WRITE))
        {
            /* writeable */
            occurred_events->events |= WL_SOCKET_WRITEABLE;
        }
        if ((cur_event->events & WL_SOCKET_CONNECTED) &&
            (resEvents.lNetworkEvents & FD_CONNECT))
        {
            /* connected */
            occurred_events->events |= WL_SOCKET_CONNECTED;
        }
        if (resEvents.lNetworkEvents & FD_CLOSE)
        {
            /* EOF/error, so signal all caller-requested socket flags */
            occurred_events->events |= (cur_event->events & WL_SOCKET_MASK);
        }

        if (occurred_events->events != 0)
        {
            occurred_events++;
            returned_events++;
        }
    }

    return returned_events;
}
#endif

/*
 * SetLatch uses SIGUSR1 to wake up the process waiting on the latch.
 *
 * Wake up WaitLatch, if we're waiting.  (We might not be, since SIGUSR1 is
 * overloaded for multiple purposes; or we might not have reached WaitLatch
 * yet, in which case we don't need to fill the pipe either.)
 *
 * NB: when calling this in a signal handler, be sure to save and restore
 * errno around it.
 */
#ifndef WIN32
void
latch_sigusr1_handler(void)
{
    if (waiting)
        sendSelfPipeByte();
}
#endif                          /* !WIN32 */
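
/*
 * Example (a sketch): per the OwnLatch contract, a process's SIGUSR1 handler
 * must end up calling latch_sigusr1_handler(); procsignal_sigusr1_handler
 * does essentially this:
 *
 *		static void
 *		sigusr1_handler(SIGNAL_ARGS)
 *		{
 *			int		save_errno = errno;
 *
 *			...handle the other SIGUSR1 reasons...
 *			latch_sigusr1_handler();
 *
 *			errno = save_errno;
 *		}
 */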

/* Send one byte to the self-pipe, to wake up WaitLatch */
#ifndef WIN32
static void
sendSelfPipeByte(void)
{
    int         rc;
    char        dummy = 0;

retry:
    rc = write(selfpipe_writefd, &dummy, 1);
    if (rc < 0)
    {
        /* If interrupted by signal, just retry */
        if (errno == EINTR)
            goto retry;

        /*
         * If the pipe is full, we don't need to retry, the data that's there
         * already is enough to wake up WaitLatch.
         */
        if (errno == EAGAIN || errno == EWOULDBLOCK)
            return;

        /*
         * Oops, the write() failed for some other reason.  We might be in a
         * signal handler, so it's not safe to elog().  We have no choice but
         * to silently ignore the error.
         */
        return;
    }
}
#endif                          /* !WIN32 */

/*
 * Read all available data from the self-pipe
 *
 * Note: this is only called when waiting = true.  If it fails and doesn't
 * return, it must reset that flag first (though ideally, this will never
 * happen).
 */
#ifndef WIN32
static void
drainSelfPipe(void)
{
    /*
     * There shouldn't normally be more than one byte in the pipe, or maybe a
     * few bytes if multiple processes run SetLatch at the same instant.
     */
    char        buf[16];
    int         rc;

    for (;;)
    {
        rc = read(selfpipe_readfd, buf, sizeof(buf));
        if (rc < 0)
        {
            if (errno == EAGAIN || errno == EWOULDBLOCK)
                break;          /* the pipe is empty */
            else if (errno == EINTR)
                continue;       /* retry */
            else
            {
                waiting = false;
                elog(ERROR, "read() on self-pipe failed: %m");
            }
        }
        else if (rc == 0)
        {
            waiting = false;
            elog(ERROR, "unexpected EOF on self-pipe");
        }
        else if (rc < sizeof(buf))
        {
            /* we successfully drained the pipe; no need to read() again */
            break;
        }
        /* else buffer wasn't big enough, so read again */
    }
}
#endif                          /* !WIN32 */