1 | /*------------------------------------------------------------------------- |
2 | * |
3 | * standby.c |
4 | * Misc functions used in Hot Standby mode. |
5 | * |
6 | * All functions for handling RM_STANDBY_ID, which relate to |
7 | * AccessExclusiveLocks and starting snapshots for Hot Standby mode. |
8 | * Plus conflict recovery processing. |
9 | * |
10 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
11 | * Portions Copyright (c) 1994, Regents of the University of California |
12 | * |
13 | * IDENTIFICATION |
14 | * src/backend/storage/ipc/standby.c |
15 | * |
16 | *------------------------------------------------------------------------- |
17 | */ |
18 | #include "postgres.h" |
19 | #include "access/transam.h" |
20 | #include "access/twophase.h" |
21 | #include "access/xact.h" |
22 | #include "access/xlog.h" |
23 | #include "access/xloginsert.h" |
24 | #include "miscadmin.h" |
25 | #include "pgstat.h" |
26 | #include "storage/bufmgr.h" |
27 | #include "storage/lmgr.h" |
28 | #include "storage/proc.h" |
29 | #include "storage/procarray.h" |
30 | #include "storage/sinvaladt.h" |
31 | #include "storage/standby.h" |
32 | #include "utils/hsearch.h" |
33 | #include "utils/memutils.h" |
34 | #include "utils/ps_status.h" |
35 | #include "utils/timeout.h" |
36 | #include "utils/timestamp.h" |
37 | |
/* User-settable GUC parameters */
int vacuum_defer_cleanup_age;	/* number of xacts by which VACUUM on the
								 * primary defers cleanup */
int max_standby_archive_delay = 30 * 1000;	/* max conflict wait (ms) when
											 * replaying from archive; -1 =
											 * wait forever */
int max_standby_streaming_delay = 30 * 1000;	/* max conflict wait (ms) when
												 * replaying streamed WAL; -1
												 * = wait forever */

/* Hash table mapping xid -> list of AccessExclusiveLocks held for it */
static HTAB *RecoveryLockLists;

static void ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
												   ProcSignalReason reason);
static void SendRecoveryConflictWithBufferPin(ProcSignalReason reason);
static XLogRecPtr LogCurrentRunningXacts(RunningTransactions CurrRunningXacts);
static void LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks);

/*
 * Keep track of all the locks owned by a given transaction.
 */
typedef struct RecoveryLockListsEntry
{
	TransactionId xid;			/* hash key; must be first field */
	List	   *locks;			/* list of xl_standby_lock entries */
} RecoveryLockListsEntry;
59 | |
60 | /* |
61 | * InitRecoveryTransactionEnvironment |
62 | * Initialize tracking of in-progress transactions in master |
63 | * |
64 | * We need to issue shared invalidations and hold locks. Holding locks |
65 | * means others may want to wait on us, so we need to make a lock table |
66 | * vxact entry like a real transaction. We could create and delete |
67 | * lock table entries for each transaction but its simpler just to create |
68 | * one permanent entry and leave it there all the time. Locks are then |
69 | * acquired and released as needed. Yes, this means you can see the |
70 | * Startup process in pg_locks once we have run this. |
71 | */ |
72 | void |
73 | InitRecoveryTransactionEnvironment(void) |
74 | { |
75 | VirtualTransactionId vxid; |
76 | HASHCTL hash_ctl; |
77 | |
78 | /* |
79 | * Initialize the hash table for tracking the list of locks held by each |
80 | * transaction. |
81 | */ |
82 | memset(&hash_ctl, 0, sizeof(hash_ctl)); |
83 | hash_ctl.keysize = sizeof(TransactionId); |
84 | hash_ctl.entrysize = sizeof(RecoveryLockListsEntry); |
85 | RecoveryLockLists = hash_create("RecoveryLockLists" , |
86 | 64, |
87 | &hash_ctl, |
88 | HASH_ELEM | HASH_BLOBS); |
89 | |
90 | /* |
91 | * Initialize shared invalidation management for Startup process, being |
92 | * careful to register ourselves as a sendOnly process so we don't need to |
93 | * read messages, nor will we get signalled when the queue starts filling |
94 | * up. |
95 | */ |
96 | SharedInvalBackendInit(true); |
97 | |
98 | /* |
99 | * Lock a virtual transaction id for Startup process. |
100 | * |
101 | * We need to do GetNextLocalTransactionId() because |
102 | * SharedInvalBackendInit() leaves localTransactionid invalid and the lock |
103 | * manager doesn't like that at all. |
104 | * |
105 | * Note that we don't need to run XactLockTableInsert() because nobody |
106 | * needs to wait on xids. That sounds a little strange, but table locks |
107 | * are held by vxids and row level locks are held by xids. All queries |
108 | * hold AccessShareLocks so never block while we write or lock new rows. |
109 | */ |
110 | vxid.backendId = MyBackendId; |
111 | vxid.localTransactionId = GetNextLocalTransactionId(); |
112 | VirtualXactLockTableInsert(vxid); |
113 | |
114 | standbyState = STANDBY_INITIALIZED; |
115 | } |
116 | |
117 | /* |
118 | * ShutdownRecoveryTransactionEnvironment |
119 | * Shut down transaction tracking |
120 | * |
121 | * Prepare to switch from hot standby mode to normal operation. Shut down |
122 | * recovery-time transaction tracking. |
123 | */ |
124 | void |
125 | ShutdownRecoveryTransactionEnvironment(void) |
126 | { |
127 | /* Mark all tracked in-progress transactions as finished. */ |
128 | ExpireAllKnownAssignedTransactionIds(); |
129 | |
130 | /* Release all locks the tracked transactions were holding */ |
131 | StandbyReleaseAllLocks(); |
132 | |
133 | /* Destroy the hash table of locks. */ |
134 | hash_destroy(RecoveryLockLists); |
135 | RecoveryLockLists = NULL; |
136 | |
137 | /* Cleanup our VirtualTransaction */ |
138 | VirtualXactLockTableCleanup(); |
139 | } |
140 | |
141 | |
142 | /* |
143 | * ----------------------------------------------------- |
144 | * Standby wait timers and backend cancel logic |
145 | * ----------------------------------------------------- |
146 | */ |
147 | |
148 | /* |
149 | * Determine the cutoff time at which we want to start canceling conflicting |
150 | * transactions. Returns zero (a time safely in the past) if we are willing |
151 | * to wait forever. |
152 | */ |
153 | static TimestampTz |
154 | GetStandbyLimitTime(void) |
155 | { |
156 | TimestampTz rtime; |
157 | bool fromStream; |
158 | |
159 | /* |
160 | * The cutoff time is the last WAL data receipt time plus the appropriate |
161 | * delay variable. Delay of -1 means wait forever. |
162 | */ |
163 | GetXLogReceiptTime(&rtime, &fromStream); |
164 | if (fromStream) |
165 | { |
166 | if (max_standby_streaming_delay < 0) |
167 | return 0; /* wait forever */ |
168 | return TimestampTzPlusMilliseconds(rtime, max_standby_streaming_delay); |
169 | } |
170 | else |
171 | { |
172 | if (max_standby_archive_delay < 0) |
173 | return 0; /* wait forever */ |
174 | return TimestampTzPlusMilliseconds(rtime, max_standby_archive_delay); |
175 | } |
176 | } |
177 | |
178 | #define STANDBY_INITIAL_WAIT_US 1000 |
179 | static int standbyWait_us = STANDBY_INITIAL_WAIT_US; |
180 | |
181 | /* |
182 | * Standby wait logic for ResolveRecoveryConflictWithVirtualXIDs. |
183 | * We wait here for a while then return. If we decide we can't wait any |
184 | * more then we return true, if we can wait some more return false. |
185 | */ |
186 | static bool |
187 | WaitExceedsMaxStandbyDelay(void) |
188 | { |
189 | TimestampTz ltime; |
190 | |
191 | CHECK_FOR_INTERRUPTS(); |
192 | |
193 | /* Are we past the limit time? */ |
194 | ltime = GetStandbyLimitTime(); |
195 | if (ltime && GetCurrentTimestamp() >= ltime) |
196 | return true; |
197 | |
198 | /* |
199 | * Sleep a bit (this is essential to avoid busy-waiting). |
200 | */ |
201 | pg_usleep(standbyWait_us); |
202 | |
203 | /* |
204 | * Progressively increase the sleep times, but not to more than 1s, since |
205 | * pg_usleep isn't interruptible on some platforms. |
206 | */ |
207 | standbyWait_us *= 2; |
208 | if (standbyWait_us > 1000000) |
209 | standbyWait_us = 1000000; |
210 | |
211 | return false; |
212 | } |
213 | |
214 | /* |
215 | * This is the main executioner for any query backend that conflicts with |
216 | * recovery processing. Judgement has already been passed on it within |
217 | * a specific rmgr. Here we just issue the orders to the procs. The procs |
218 | * then throw the required error as instructed. |
219 | */ |
/*
 * This is the main executioner for any query backend that conflicts with
 * recovery processing. Judgement has already been passed on it within
 * a specific rmgr. Here we just issue the orders to the procs. The procs
 * then throw the required error as instructed.
 *
 * waitlist is an array of virtual xids terminated by an invalid
 * VirtualTransactionId.  For each entry we first wait (up to the standby
 * delay limit) for the transaction to end of its own accord, then cancel it.
 */
static void
ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
									   ProcSignalReason reason)
{
	TimestampTz waitStart;
	char	   *new_status;

	/* Fast exit, to avoid a kernel call if there's no work to be done. */
	if (!VirtualTransactionIdIsValid(*waitlist))
		return;

	waitStart = GetCurrentTimestamp();
	new_status = NULL;			/* we haven't changed the ps display */

	while (VirtualTransactionIdIsValid(*waitlist))
	{
		/* reset standbyWait_us for each xact we wait for */
		standbyWait_us = STANDBY_INITIAL_WAIT_US;

		/* wait until the virtual xid is gone */
		while (!VirtualXactLock(*waitlist, false))
		{
			/*
			 * Report via ps if we have been waiting for more than 500 msec
			 * (should that be configurable?)
			 */
			if (update_process_title && new_status == NULL &&
				TimestampDifferenceExceeds(waitStart, GetCurrentTimestamp(),
										   500))
			{
				const char *old_status;
				int			len;

				/*
				 * Save a copy of the current ps display so we can restore it
				 * when done; append " waiting" for display, then truncate the
				 * saved copy back to the original text.
				 */
				old_status = get_ps_display(&len);
				new_status = (char *) palloc(len + 8 + 1);
				memcpy(new_status, old_status, len);
				strcpy(new_status + len, " waiting");
				set_ps_display(new_status, false);
				new_status[len] = '\0'; /* truncate off " waiting" */
			}

			/* Is it time to kill it? */
			if (WaitExceedsMaxStandbyDelay())
			{
				pid_t		pid;

				/*
				 * Now find out who to throw out of the balloon.
				 */
				Assert(VirtualTransactionIdIsValid(*waitlist));
				pid = CancelVirtualTransaction(*waitlist, reason);

				/*
				 * Wait a little bit for it to die so that we avoid flooding
				 * an unresponsive backend when system is heavily loaded.
				 */
				if (pid != 0)
					pg_usleep(5000L);
			}
		}

		/* The virtual transaction is gone now, wait for the next one */
		waitlist++;
	}

	/* Reset ps display if we changed it */
	if (new_status)
	{
		set_ps_display(new_status, false);
		pfree(new_status);
	}
}
292 | |
293 | void |
294 | ResolveRecoveryConflictWithSnapshot(TransactionId latestRemovedXid, RelFileNode node) |
295 | { |
296 | VirtualTransactionId *backends; |
297 | |
298 | /* |
299 | * If we get passed InvalidTransactionId then we are a little surprised, |
300 | * but it is theoretically possible in normal running. It also happens |
301 | * when replaying already applied WAL records after a standby crash or |
302 | * restart, or when replaying an XLOG_HEAP2_VISIBLE record that marks as |
303 | * frozen a page which was already all-visible. If latestRemovedXid is |
304 | * invalid then there is no conflict. That rule applies across all record |
305 | * types that suffer from this conflict. |
306 | */ |
307 | if (!TransactionIdIsValid(latestRemovedXid)) |
308 | return; |
309 | |
310 | backends = GetConflictingVirtualXIDs(latestRemovedXid, |
311 | node.dbNode); |
312 | |
313 | ResolveRecoveryConflictWithVirtualXIDs(backends, |
314 | PROCSIG_RECOVERY_CONFLICT_SNAPSHOT); |
315 | } |
316 | |
317 | void |
318 | ResolveRecoveryConflictWithTablespace(Oid tsid) |
319 | { |
320 | VirtualTransactionId *temp_file_users; |
321 | |
322 | /* |
323 | * Standby users may be currently using this tablespace for their |
324 | * temporary files. We only care about current users because |
325 | * temp_tablespace parameter will just ignore tablespaces that no longer |
326 | * exist. |
327 | * |
328 | * Ask everybody to cancel their queries immediately so we can ensure no |
329 | * temp files remain and we can remove the tablespace. Nuke the entire |
330 | * site from orbit, it's the only way to be sure. |
331 | * |
332 | * XXX: We could work out the pids of active backends using this |
333 | * tablespace by examining the temp filenames in the directory. We would |
334 | * then convert the pids into VirtualXIDs before attempting to cancel |
335 | * them. |
336 | * |
337 | * We don't wait for commit because drop tablespace is non-transactional. |
338 | */ |
339 | temp_file_users = GetConflictingVirtualXIDs(InvalidTransactionId, |
340 | InvalidOid); |
341 | ResolveRecoveryConflictWithVirtualXIDs(temp_file_users, |
342 | PROCSIG_RECOVERY_CONFLICT_TABLESPACE); |
343 | } |
344 | |
345 | void |
346 | ResolveRecoveryConflictWithDatabase(Oid dbid) |
347 | { |
348 | /* |
349 | * We don't do ResolveRecoveryConflictWithVirtualXIDs() here since that |
350 | * only waits for transactions and completely idle sessions would block |
351 | * us. This is rare enough that we do this as simply as possible: no wait, |
352 | * just force them off immediately. |
353 | * |
354 | * No locking is required here because we already acquired |
355 | * AccessExclusiveLock. Anybody trying to connect while we do this will |
356 | * block during InitPostgres() and then disconnect when they see the |
357 | * database has been removed. |
358 | */ |
359 | while (CountDBBackends(dbid) > 0) |
360 | { |
361 | CancelDBBackends(dbid, PROCSIG_RECOVERY_CONFLICT_DATABASE, true); |
362 | |
363 | /* |
364 | * Wait awhile for them to die so that we avoid flooding an |
365 | * unresponsive backend when system is heavily loaded. |
366 | */ |
367 | pg_usleep(10000); |
368 | } |
369 | } |
370 | |
371 | /* |
372 | * ResolveRecoveryConflictWithLock is called from ProcSleep() |
373 | * to resolve conflicts with other backends holding relation locks. |
374 | * |
375 | * The WaitLatch sleep normally done in ProcSleep() |
376 | * (when not InHotStandby) is performed here, for code clarity. |
377 | * |
378 | * We either resolve conflicts immediately or set a timeout to wake us at |
379 | * the limit of our patience. |
380 | * |
381 | * Resolve conflicts by canceling to all backends holding a conflicting |
382 | * lock. As we are already queued to be granted the lock, no new lock |
383 | * requests conflicting with ours will be granted in the meantime. |
384 | * |
385 | * Deadlocks involving the Startup process and an ordinary backend process |
386 | * will be detected by the deadlock detector within the ordinary backend. |
387 | */ |
388 | void |
389 | ResolveRecoveryConflictWithLock(LOCKTAG locktag) |
390 | { |
391 | TimestampTz ltime; |
392 | |
393 | Assert(InHotStandby); |
394 | |
395 | ltime = GetStandbyLimitTime(); |
396 | |
397 | if (GetCurrentTimestamp() >= ltime) |
398 | { |
399 | /* |
400 | * We're already behind, so clear a path as quickly as possible. |
401 | */ |
402 | VirtualTransactionId *backends; |
403 | |
404 | backends = GetLockConflicts(&locktag, AccessExclusiveLock, NULL); |
405 | ResolveRecoveryConflictWithVirtualXIDs(backends, |
406 | PROCSIG_RECOVERY_CONFLICT_LOCK); |
407 | } |
408 | else |
409 | { |
410 | /* |
411 | * Wait (or wait again) until ltime |
412 | */ |
413 | EnableTimeoutParams timeouts[1]; |
414 | |
415 | timeouts[0].id = STANDBY_LOCK_TIMEOUT; |
416 | timeouts[0].type = TMPARAM_AT; |
417 | timeouts[0].fin_time = ltime; |
418 | enable_timeouts(timeouts, 1); |
419 | } |
420 | |
421 | /* Wait to be signaled by the release of the Relation Lock */ |
422 | ProcWaitForSignal(PG_WAIT_LOCK | locktag.locktag_type); |
423 | |
424 | /* |
425 | * Clear any timeout requests established above. We assume here that the |
426 | * Startup process doesn't have any other outstanding timeouts than those |
427 | * used by this function. If that stops being true, we could cancel the |
428 | * timeouts individually, but that'd be slower. |
429 | */ |
430 | disable_all_timeouts(false); |
431 | } |
432 | |
433 | /* |
434 | * ResolveRecoveryConflictWithBufferPin is called from LockBufferForCleanup() |
435 | * to resolve conflicts with other backends holding buffer pins. |
436 | * |
437 | * The ProcWaitForSignal() sleep normally done in LockBufferForCleanup() |
438 | * (when not InHotStandby) is performed here, for code clarity. |
439 | * |
440 | * We either resolve conflicts immediately or set a timeout to wake us at |
441 | * the limit of our patience. |
442 | * |
443 | * Resolve conflicts by sending a PROCSIG signal to all backends to check if |
444 | * they hold one of the buffer pins that is blocking Startup process. If so, |
445 | * those backends will take an appropriate error action, ERROR or FATAL. |
446 | * |
447 | * We also must check for deadlocks. Deadlocks occur because if queries |
448 | * wait on a lock, that must be behind an AccessExclusiveLock, which can only |
449 | * be cleared if the Startup process replays a transaction completion record. |
450 | * If Startup process is also waiting then that is a deadlock. The deadlock |
451 | * can occur if the query is waiting and then the Startup sleeps, or if |
452 | * Startup is sleeping and the query waits on a lock. We protect against |
453 | * only the former sequence here, the latter sequence is checked prior to |
454 | * the query sleeping, in CheckRecoveryConflictDeadlock(). |
455 | * |
456 | * Deadlocks are extremely rare, and relatively expensive to check for, |
457 | * so we don't do a deadlock check right away ... only if we have had to wait |
458 | * at least deadlock_timeout. |
459 | */ |
/*
 * ResolveRecoveryConflictWithBufferPin is called from LockBufferForCleanup()
 * to resolve conflicts with other backends holding buffer pins.
 *
 * The ProcWaitForSignal() sleep normally done in LockBufferForCleanup()
 * (when not InHotStandby) is performed here, for code clarity.
 *
 * We either resolve conflicts immediately or set a timeout to wake us at
 * the limit of our patience.
 *
 * Resolve conflicts by sending a PROCSIG signal to all backends to check if
 * they hold one of the buffer pins that is blocking Startup process. If so,
 * those backends will take an appropriate error action, ERROR or FATAL.
 *
 * We also must check for deadlocks.  Deadlocks occur because if queries
 * wait on a lock, that must be behind an AccessExclusiveLock, which can only
 * be cleared if the Startup process replays a transaction completion record.
 * If Startup process is also waiting then that is a deadlock. The deadlock
 * can occur if the query is waiting and then the Startup sleeps, or if
 * Startup is sleeping and the query waits on a lock. We protect against
 * only the former sequence here, the latter sequence is checked prior to
 * the query sleeping, in CheckRecoveryConflictDeadlock().
 *
 * Deadlocks are extremely rare, and relatively expensive to check for,
 * so we don't do a deadlock check right away ... only if we have had to wait
 * at least deadlock_timeout.
 */
void
ResolveRecoveryConflictWithBufferPin(void)
{
	TimestampTz ltime;

	Assert(InHotStandby);

	ltime = GetStandbyLimitTime();

	if (ltime == 0)
	{
		/*
		 * We're willing to wait forever for conflicts, so set timeout for
		 * deadlock check only
		 */
		enable_timeout_after(STANDBY_DEADLOCK_TIMEOUT, DeadlockTimeout);
	}
	else if (GetCurrentTimestamp() >= ltime)
	{
		/*
		 * We're already behind, so clear a path as quickly as possible.
		 */
		SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN);
	}
	else
	{
		/*
		 * Wake up at ltime, and check for deadlocks as well if we will be
		 * waiting longer than deadlock_timeout
		 */
		EnableTimeoutParams timeouts[2];

		timeouts[0].id = STANDBY_TIMEOUT;
		timeouts[0].type = TMPARAM_AT;
		timeouts[0].fin_time = ltime;
		timeouts[1].id = STANDBY_DEADLOCK_TIMEOUT;
		timeouts[1].type = TMPARAM_AFTER;
		timeouts[1].delay_ms = DeadlockTimeout;
		enable_timeouts(timeouts, 2);
	}

	/* Wait to be signaled by UnpinBuffer() */
	ProcWaitForSignal(PG_WAIT_BUFFER_PIN);

	/*
	 * Clear any timeout requests established above. We assume here that the
	 * Startup process doesn't have any other timeouts than what this function
	 * uses. If that stops being true, we could cancel the timeouts
	 * individually, but that'd be slower.
	 */
	disable_all_timeouts(false);
}
512 | |
513 | static void |
514 | SendRecoveryConflictWithBufferPin(ProcSignalReason reason) |
515 | { |
516 | Assert(reason == PROCSIG_RECOVERY_CONFLICT_BUFFERPIN || |
517 | reason == PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK); |
518 | |
519 | /* |
520 | * We send signal to all backends to ask them if they are holding the |
521 | * buffer pin which is delaying the Startup process. We must not set the |
522 | * conflict flag yet, since most backends will be innocent. Let the |
523 | * SIGUSR1 handling in each backend decide their own fate. |
524 | */ |
525 | CancelDBBackends(InvalidOid, reason, false); |
526 | } |
527 | |
528 | /* |
529 | * In Hot Standby perform early deadlock detection. We abort the lock |
530 | * wait if we are about to sleep while holding the buffer pin that Startup |
531 | * process is waiting for. |
532 | * |
533 | * Note: this code is pessimistic, because there is no way for it to |
534 | * determine whether an actual deadlock condition is present: the lock we |
535 | * need to wait for might be unrelated to any held by the Startup process. |
536 | * Sooner or later, this mechanism should get ripped out in favor of somehow |
537 | * accounting for buffer locks in DeadLockCheck(). However, errors here |
538 | * seem to be very low-probability in practice, so for now it's not worth |
539 | * the trouble. |
540 | */ |
541 | void |
542 | CheckRecoveryConflictDeadlock(void) |
543 | { |
544 | Assert(!InRecovery); /* do not call in Startup process */ |
545 | |
546 | if (!HoldingBufferPinThatDelaysRecovery()) |
547 | return; |
548 | |
549 | /* |
550 | * Error message should match ProcessInterrupts() but we avoid calling |
551 | * that because we aren't handling an interrupt at this point. Note that |
552 | * we only cancel the current transaction here, so if we are in a |
553 | * subtransaction and the pin is held by a parent, then the Startup |
554 | * process will continue to wait even though we have avoided deadlock. |
555 | */ |
556 | ereport(ERROR, |
557 | (errcode(ERRCODE_T_R_DEADLOCK_DETECTED), |
558 | errmsg("canceling statement due to conflict with recovery" ), |
559 | errdetail("User transaction caused buffer deadlock with recovery." ))); |
560 | } |
561 | |
562 | |
563 | /* -------------------------------- |
564 | * timeout handler routines |
565 | * -------------------------------- |
566 | */ |
567 | |
568 | /* |
569 | * StandbyDeadLockHandler() will be called if STANDBY_DEADLOCK_TIMEOUT |
570 | * occurs before STANDBY_TIMEOUT. Send out a request for hot-standby |
571 | * backends to check themselves for deadlocks. |
572 | */ |
573 | void |
574 | StandbyDeadLockHandler(void) |
575 | { |
576 | SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK); |
577 | } |
578 | |
579 | /* |
580 | * StandbyTimeoutHandler() will be called if STANDBY_TIMEOUT is exceeded. |
581 | * Send out a request to release conflicting buffer pins unconditionally, |
582 | * so we can press ahead with applying changes in recovery. |
583 | */ |
584 | void |
585 | StandbyTimeoutHandler(void) |
586 | { |
587 | /* forget any pending STANDBY_DEADLOCK_TIMEOUT request */ |
588 | disable_timeout(STANDBY_DEADLOCK_TIMEOUT, false); |
589 | |
590 | SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN); |
591 | } |
592 | |
593 | /* |
594 | * StandbyLockTimeoutHandler() will be called if STANDBY_LOCK_TIMEOUT is exceeded. |
595 | * This doesn't need to do anything, simply waking up is enough. |
596 | */ |
/*
 * StandbyLockTimeoutHandler() will be called if STANDBY_LOCK_TIMEOUT is exceeded.
 * This doesn't need to do anything, simply waking up is enough.
 */
void
StandbyLockTimeoutHandler(void)
{
	/* intentionally empty: the timeout's wakeup is the whole effect */
}
601 | |
602 | /* |
603 | * ----------------------------------------------------- |
604 | * Locking in Recovery Mode |
605 | * ----------------------------------------------------- |
606 | * |
607 | * All locks are held by the Startup process using a single virtual |
608 | * transaction. This implementation is both simpler and in some senses, |
609 | * more correct. The locks held mean "some original transaction held |
610 | * this lock, so query access is not allowed at this time". So the Startup |
611 | * process is the proxy by which the original locks are implemented. |
612 | * |
613 | * We only keep track of AccessExclusiveLocks, which are only ever held by |
614 | * one transaction on one relation. |
615 | * |
616 | * We keep a hash table of lists of locks in local memory keyed by xid, |
617 | * RecoveryLockLists, so we can keep track of the various entries made by |
618 | * the Startup process's virtual xid in the shared lock table. |
619 | * |
620 | * List elements use type xl_standby_lock, since the WAL record type exactly |
621 | * matches the information that we need to keep track of. |
622 | * |
623 | * We use session locks rather than normal locks so we don't need |
624 | * ResourceOwners. |
625 | */ |
626 | |
627 | |
628 | void |
629 | StandbyAcquireAccessExclusiveLock(TransactionId xid, Oid dbOid, Oid relOid) |
630 | { |
631 | RecoveryLockListsEntry *entry; |
632 | xl_standby_lock *newlock; |
633 | LOCKTAG locktag; |
634 | bool found; |
635 | |
636 | /* Already processed? */ |
637 | if (!TransactionIdIsValid(xid) || |
638 | TransactionIdDidCommit(xid) || |
639 | TransactionIdDidAbort(xid)) |
640 | return; |
641 | |
642 | elog(trace_recovery(DEBUG4), |
643 | "adding recovery lock: db %u rel %u" , dbOid, relOid); |
644 | |
645 | /* dbOid is InvalidOid when we are locking a shared relation. */ |
646 | Assert(OidIsValid(relOid)); |
647 | |
648 | /* Create a new list for this xid, if we don't have one already. */ |
649 | entry = hash_search(RecoveryLockLists, &xid, HASH_ENTER, &found); |
650 | if (!found) |
651 | { |
652 | entry->xid = xid; |
653 | entry->locks = NIL; |
654 | } |
655 | |
656 | newlock = palloc(sizeof(xl_standby_lock)); |
657 | newlock->xid = xid; |
658 | newlock->dbOid = dbOid; |
659 | newlock->relOid = relOid; |
660 | entry->locks = lappend(entry->locks, newlock); |
661 | |
662 | SET_LOCKTAG_RELATION(locktag, newlock->dbOid, newlock->relOid); |
663 | |
664 | (void) LockAcquire(&locktag, AccessExclusiveLock, true, false); |
665 | } |
666 | |
667 | static void |
668 | StandbyReleaseLockList(List *locks) |
669 | { |
670 | while (locks) |
671 | { |
672 | xl_standby_lock *lock = (xl_standby_lock *) linitial(locks); |
673 | LOCKTAG locktag; |
674 | |
675 | elog(trace_recovery(DEBUG4), |
676 | "releasing recovery lock: xid %u db %u rel %u" , |
677 | lock->xid, lock->dbOid, lock->relOid); |
678 | SET_LOCKTAG_RELATION(locktag, lock->dbOid, lock->relOid); |
679 | if (!LockRelease(&locktag, AccessExclusiveLock, true)) |
680 | { |
681 | elog(LOG, |
682 | "RecoveryLockLists contains entry for lock no longer recorded by lock manager: xid %u database %u relation %u" , |
683 | lock->xid, lock->dbOid, lock->relOid); |
684 | Assert(false); |
685 | } |
686 | pfree(lock); |
687 | locks = list_delete_first(locks); |
688 | } |
689 | } |
690 | |
691 | static void |
692 | StandbyReleaseLocks(TransactionId xid) |
693 | { |
694 | RecoveryLockListsEntry *entry; |
695 | |
696 | if (TransactionIdIsValid(xid)) |
697 | { |
698 | if ((entry = hash_search(RecoveryLockLists, &xid, HASH_FIND, NULL))) |
699 | { |
700 | StandbyReleaseLockList(entry->locks); |
701 | hash_search(RecoveryLockLists, entry, HASH_REMOVE, NULL); |
702 | } |
703 | } |
704 | else |
705 | StandbyReleaseAllLocks(); |
706 | } |
707 | |
708 | /* |
709 | * Release locks for a transaction tree, starting at xid down, from |
710 | * RecoveryLockLists. |
711 | * |
712 | * Called during WAL replay of COMMIT/ROLLBACK when in hot standby mode, |
713 | * to remove any AccessExclusiveLocks requested by a transaction. |
714 | */ |
715 | void |
716 | StandbyReleaseLockTree(TransactionId xid, int nsubxids, TransactionId *subxids) |
717 | { |
718 | int i; |
719 | |
720 | StandbyReleaseLocks(xid); |
721 | |
722 | for (i = 0; i < nsubxids; i++) |
723 | StandbyReleaseLocks(subxids[i]); |
724 | } |
725 | |
726 | /* |
727 | * Called at end of recovery and when we see a shutdown checkpoint. |
728 | */ |
/*
 * Called at end of recovery and when we see a shutdown checkpoint.
 * Releases every tracked lock and empties the RecoveryLockLists hash table.
 */
void
StandbyReleaseAllLocks(void)
{
	HASH_SEQ_STATUS status;
	RecoveryLockListsEntry *entry;

	elog(trace_recovery(DEBUG2), "release all standby locks");

	hash_seq_init(&status, RecoveryLockLists);
	while ((entry = hash_seq_search(&status)))
	{
		StandbyReleaseLockList(entry->locks);
		/* passing entry works as key pointer: xid is the entry's first field */
		hash_search(RecoveryLockLists, entry, HASH_REMOVE, NULL);
	}
}
744 | |
745 | /* |
746 | * StandbyReleaseOldLocks |
747 | * Release standby locks held by top-level XIDs that aren't running, |
748 | * as long as they're not prepared transactions. |
749 | */ |
750 | void |
751 | StandbyReleaseOldLocks(TransactionId oldxid) |
752 | { |
753 | HASH_SEQ_STATUS status; |
754 | RecoveryLockListsEntry *entry; |
755 | |
756 | hash_seq_init(&status, RecoveryLockLists); |
757 | while ((entry = hash_seq_search(&status))) |
758 | { |
759 | Assert(TransactionIdIsValid(entry->xid)); |
760 | |
761 | /* Skip if prepared transaction. */ |
762 | if (StandbyTransactionIdIsPrepared(entry->xid)) |
763 | continue; |
764 | |
765 | /* Skip if >= oldxid. */ |
766 | if (!TransactionIdPrecedes(entry->xid, oldxid)) |
767 | continue; |
768 | |
769 | /* Remove all locks and hash table entry. */ |
770 | StandbyReleaseLockList(entry->locks); |
771 | hash_search(RecoveryLockLists, entry, HASH_REMOVE, NULL); |
772 | } |
773 | } |
774 | |
775 | /* |
776 | * -------------------------------------------------------------------- |
777 | * Recovery handling for Rmgr RM_STANDBY_ID |
778 | * |
779 | * These record types will only be created if XLogStandbyInfoActive() |
780 | * -------------------------------------------------------------------- |
781 | */ |
782 | |
/*
 * Redo handler for RM_STANDBY_ID WAL records: replays AccessExclusiveLocks,
 * running-xacts snapshots, and invalidation messages on a hot standby.
 */
void
standby_redo(XLogReaderState *record)
{
	uint8		info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;

	/* Backup blocks are not used in standby records */
	Assert(!XLogRecHasAnyBlockRefs(record));

	/* Do nothing if we're not in hot standby mode */
	if (standbyState == STANDBY_DISABLED)
		return;

	if (info == XLOG_STANDBY_LOCK)
	{
		/* Re-acquire each AccessExclusiveLock recorded on the primary. */
		xl_standby_locks *xlrec = (xl_standby_locks *) XLogRecGetData(record);
		int			i;

		for (i = 0; i < xlrec->nlocks; i++)
			StandbyAcquireAccessExclusiveLock(xlrec->locks[i].xid,
											  xlrec->locks[i].dbOid,
											  xlrec->locks[i].relOid);
	}
	else if (info == XLOG_RUNNING_XACTS)
	{
		/* Copy the snapshot fields into local form and apply to procarray. */
		xl_running_xacts *xlrec = (xl_running_xacts *) XLogRecGetData(record);
		RunningTransactionsData running;

		running.xcnt = xlrec->xcnt;
		running.subxcnt = xlrec->subxcnt;
		running.subxid_overflow = xlrec->subxid_overflow;
		running.nextXid = xlrec->nextXid;
		running.latestCompletedXid = xlrec->latestCompletedXid;
		running.oldestRunningXid = xlrec->oldestRunningXid;
		running.xids = xlrec->xids;

		ProcArrayApplyRecoveryInfo(&running);
	}
	else if (info == XLOG_INVALIDATIONS)
	{
		/* Replay invalidation messages logged at command/transaction end. */
		xl_invalidations *xlrec = (xl_invalidations *) XLogRecGetData(record);

		ProcessCommittedInvalidationMessages(xlrec->msgs,
											 xlrec->nmsgs,
											 xlrec->relcacheInitFileInval,
											 xlrec->dbId,
											 xlrec->tsId);
	}
	else
		elog(PANIC, "standby_redo: unknown op code %u", info);
}
833 | |
834 | /* |
835 | * Log details of the current snapshot to WAL. This allows the snapshot state |
836 | * to be reconstructed on the standby and for logical decoding. |
837 | * |
838 | * This is used for Hot Standby as follows: |
839 | * |
840 | * We can move directly to STANDBY_SNAPSHOT_READY at startup if we |
841 | * start from a shutdown checkpoint because we know nothing was running |
842 | * at that time and our recovery snapshot is known empty. In the more |
843 | * typical case of an online checkpoint we need to jump through a few |
844 | * hoops to get a correct recovery snapshot and this requires a two or |
845 | * sometimes a three stage process. |
846 | * |
847 | * The initial snapshot must contain all running xids and all current |
848 | * AccessExclusiveLocks at a point in time on the standby. Assembling |
849 | * that information while the server is running requires many and |
850 | * various LWLocks, so we choose to derive that information piece by |
851 | * piece and then re-assemble that info on the standby. When that |
852 | * information is fully assembled we move to STANDBY_SNAPSHOT_READY. |
853 | * |
854 | * Since locking on the primary when we derive the information is not |
855 | * strict, we note that there is a time window between the derivation and |
856 | * writing to WAL of the derived information. That allows race conditions |
857 | * that we must resolve, since xids and locks may enter or leave the |
858 | * snapshot during that window. This creates the issue that an xid or |
859 | * lock may start *after* the snapshot has been derived yet *before* the |
860 | * snapshot is logged in the running xacts WAL record. We resolve this by |
861 | * starting to accumulate changes at a point just prior to when we derive |
862 | * the snapshot on the primary, then ignore duplicates when we later apply |
863 | * the snapshot from the running xacts record. This is implemented during |
864 | * CreateCheckpoint() where we use the logical checkpoint location as |
865 | * our starting point and then write the running xacts record immediately |
866 | * before writing the main checkpoint WAL record. Since we always start |
867 | * up from a checkpoint and are immediately at our starting point, we |
868 | * unconditionally move to STANDBY_INITIALIZED. After this point we |
869 | * must do 4 things: |
870 | * * move shared nextFullXid forwards as we see new xids |
871 | * * extend the clog and subtrans with each new xid |
872 | * * keep track of uncommitted known assigned xids |
873 | * * keep track of uncommitted AccessExclusiveLocks |
874 | * |
875 | * When we see a commit/abort we must remove known assigned xids and locks |
876 | * from the completing transaction. Attempted removals that cannot locate |
877 | * an entry are expected and must not cause an error when we are in state |
878 | * STANDBY_INITIALIZED. This is implemented in StandbyReleaseLocks() and |
879 | * KnownAssignedXidsRemove(). |
880 | * |
881 | * Later, when we apply the running xact data we must be careful to ignore |
882 | * transactions already committed, since those commits raced ahead when |
883 | * making WAL entries. |
884 | * |
885 | * The loose timing also means that locks may be recorded that have a |
886 | * zero xid, since xids are removed from procs before locks are removed. |
887 | * So we must prune the lock list down to ensure we hold locks only for |
888 | * currently running xids, performed by StandbyReleaseOldLocks(). |
889 | * Zero xids should no longer be possible, but we may be replaying WAL |
890 | * from a time when they were possible. |
891 | * |
892 | * For logical decoding only the running xacts information is needed; |
893 | * there's no need to look at the locking information, but it's logged anyway, |
894 | * as there's no independent knob to just enable logical decoding. For |
895 | * details of how this is used, check snapbuild.c's introductory comment. |
896 | * |
897 | * |
898 | * Returns the RecPtr of the last inserted record. |
899 | */ |
XLogRecPtr
LogStandbySnapshot(void)
{
	XLogRecPtr	recptr;
	RunningTransactions running;
	xl_standby_lock *locks;
	int			nlocks;

	/* Caller must only invoke this when standby info is being generated */
	Assert(XLogStandbyInfoActive());

	/*
	 * Get details of any AccessExclusiveLocks being held at the moment.
	 */
	locks = GetRunningTransactionLocks(&nlocks);
	if (nlocks > 0)
		LogAccessExclusiveLocks(nlocks, locks);
	/* GetRunningTransactionLocks() palloc'd the array; we own and free it */
	pfree(locks);

	/*
	 * Log details of all in-progress transactions. This should be the last
	 * record we write, because standby will open up when it sees this.
	 */
	running = GetRunningTransactionData();

	/*
	 * GetRunningTransactionData() acquired ProcArrayLock, we must release it.
	 * For Hot Standby this can be done before inserting the WAL record
	 * because ProcArrayApplyRecoveryInfo() rechecks the commit status using
	 * the clog. For logical decoding, though, the lock can't be released
	 * early because the clog might be "in the future" from the POV of the
	 * historic snapshot. This would allow for situations where we're waiting
	 * for the end of a transaction listed in the xl_running_xacts record
	 * which, according to the WAL, has committed before the xl_running_xacts
	 * record. Fortunately this routine isn't executed frequently, and it's
	 * only a shared lock.
	 */
	if (wal_level < WAL_LEVEL_LOGICAL)
		LWLockRelease(ProcArrayLock);

	recptr = LogCurrentRunningXacts(running);

	/* Release lock if we kept it longer ... */
	if (wal_level >= WAL_LEVEL_LOGICAL)
		LWLockRelease(ProcArrayLock);

	/* GetRunningTransactionData() acquired XidGenLock, we must release it */
	LWLockRelease(XidGenLock);

	/* LSN of the xl_running_xacts record, per this function's contract */
	return recptr;
}
950 | |
951 | /* |
952 | * Record an enhanced snapshot of running transactions into WAL. |
953 | * |
954 | * The definitions of RunningTransactionsData and xl_xact_running_xacts are |
955 | * similar. We keep them separate because xl_xact_running_xacts is a |
956 | * contiguous chunk of memory and never exists fully until it is assembled in |
957 | * WAL. The inserted records are marked as not being important for durability, |
958 | * to avoid triggering superfluous checkpoint / archiving activity. |
959 | */ |
960 | static XLogRecPtr |
961 | LogCurrentRunningXacts(RunningTransactions CurrRunningXacts) |
962 | { |
963 | xl_running_xacts xlrec; |
964 | XLogRecPtr recptr; |
965 | |
966 | xlrec.xcnt = CurrRunningXacts->xcnt; |
967 | xlrec.subxcnt = CurrRunningXacts->subxcnt; |
968 | xlrec.subxid_overflow = CurrRunningXacts->subxid_overflow; |
969 | xlrec.nextXid = CurrRunningXacts->nextXid; |
970 | xlrec.oldestRunningXid = CurrRunningXacts->oldestRunningXid; |
971 | xlrec.latestCompletedXid = CurrRunningXacts->latestCompletedXid; |
972 | |
973 | /* Header */ |
974 | XLogBeginInsert(); |
975 | XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT); |
976 | XLogRegisterData((char *) (&xlrec), MinSizeOfXactRunningXacts); |
977 | |
978 | /* array of TransactionIds */ |
979 | if (xlrec.xcnt > 0) |
980 | XLogRegisterData((char *) CurrRunningXacts->xids, |
981 | (xlrec.xcnt + xlrec.subxcnt) * sizeof(TransactionId)); |
982 | |
983 | recptr = XLogInsert(RM_STANDBY_ID, XLOG_RUNNING_XACTS); |
984 | |
985 | if (CurrRunningXacts->subxid_overflow) |
986 | elog(trace_recovery(DEBUG2), |
987 | "snapshot of %u running transactions overflowed (lsn %X/%X oldest xid %u latest complete %u next xid %u)" , |
988 | CurrRunningXacts->xcnt, |
989 | (uint32) (recptr >> 32), (uint32) recptr, |
990 | CurrRunningXacts->oldestRunningXid, |
991 | CurrRunningXacts->latestCompletedXid, |
992 | CurrRunningXacts->nextXid); |
993 | else |
994 | elog(trace_recovery(DEBUG2), |
995 | "snapshot of %u+%u running transaction ids (lsn %X/%X oldest xid %u latest complete %u next xid %u)" , |
996 | CurrRunningXacts->xcnt, CurrRunningXacts->subxcnt, |
997 | (uint32) (recptr >> 32), (uint32) recptr, |
998 | CurrRunningXacts->oldestRunningXid, |
999 | CurrRunningXacts->latestCompletedXid, |
1000 | CurrRunningXacts->nextXid); |
1001 | |
1002 | /* |
1003 | * Ensure running_xacts information is synced to disk not too far in the |
1004 | * future. We don't want to stall anything though (i.e. use XLogFlush()), |
1005 | * so we let the wal writer do it during normal operation. |
1006 | * XLogSetAsyncXactLSN() conveniently will mark the LSN as to-be-synced |
1007 | * and nudge the WALWriter into action if sleeping. Check |
1008 | * XLogBackgroundFlush() for details why a record might not be flushed |
1009 | * without it. |
1010 | */ |
1011 | XLogSetAsyncXactLSN(recptr); |
1012 | |
1013 | return recptr; |
1014 | } |
1015 | |
1016 | /* |
1017 | * Wholesale logging of AccessExclusiveLocks. Other lock types need not be |
1018 | * logged, as described in backend/storage/lmgr/README. |
1019 | */ |
1020 | static void |
1021 | LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks) |
1022 | { |
1023 | xl_standby_locks xlrec; |
1024 | |
1025 | xlrec.nlocks = nlocks; |
1026 | |
1027 | XLogBeginInsert(); |
1028 | XLogRegisterData((char *) &xlrec, offsetof(xl_standby_locks, locks)); |
1029 | XLogRegisterData((char *) locks, nlocks * sizeof(xl_standby_lock)); |
1030 | XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT); |
1031 | |
1032 | (void) XLogInsert(RM_STANDBY_ID, XLOG_STANDBY_LOCK); |
1033 | } |
1034 | |
1035 | /* |
1036 | * Individual logging of AccessExclusiveLocks for use during LockAcquire() |
1037 | */ |
1038 | void |
1039 | LogAccessExclusiveLock(Oid dbOid, Oid relOid) |
1040 | { |
1041 | xl_standby_lock xlrec; |
1042 | |
1043 | xlrec.xid = GetCurrentTransactionId(); |
1044 | |
1045 | xlrec.dbOid = dbOid; |
1046 | xlrec.relOid = relOid; |
1047 | |
1048 | LogAccessExclusiveLocks(1, &xlrec); |
1049 | MyXactFlags |= XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK; |
1050 | } |
1051 | |
1052 | /* |
1053 | * Prepare to log an AccessExclusiveLock, for use during LockAcquire() |
1054 | */ |
1055 | void |
1056 | LogAccessExclusiveLockPrepare(void) |
1057 | { |
1058 | /* |
1059 | * Ensure that a TransactionId has been assigned to this transaction, for |
1060 | * two reasons, both related to lock release on the standby. First, we |
1061 | * must assign an xid so that RecordTransactionCommit() and |
1062 | * RecordTransactionAbort() do not optimise away the transaction |
1063 | * completion record which recovery relies upon to release locks. It's a |
1064 | * hack, but for a corner case not worth adding code for into the main |
1065 | * commit path. Second, we must assign an xid before the lock is recorded |
1066 | * in shared memory, otherwise a concurrently executing |
1067 | * GetRunningTransactionLocks() might see a lock associated with an |
1068 | * InvalidTransactionId which we later assert cannot happen. |
1069 | */ |
1070 | (void) GetCurrentTransactionId(); |
1071 | } |
1072 | |
1073 | /* |
1074 | * Emit WAL for invalidations. This currently is only used for commits without |
1075 | * an xid but which contain invalidations. |
1076 | */ |
1077 | void |
1078 | LogStandbyInvalidations(int nmsgs, SharedInvalidationMessage *msgs, |
1079 | bool relcacheInitFileInval) |
1080 | { |
1081 | xl_invalidations xlrec; |
1082 | |
1083 | /* prepare record */ |
1084 | memset(&xlrec, 0, sizeof(xlrec)); |
1085 | xlrec.dbId = MyDatabaseId; |
1086 | xlrec.tsId = MyDatabaseTableSpace; |
1087 | xlrec.relcacheInitFileInval = relcacheInitFileInval; |
1088 | xlrec.nmsgs = nmsgs; |
1089 | |
1090 | /* perform insertion */ |
1091 | XLogBeginInsert(); |
1092 | XLogRegisterData((char *) (&xlrec), MinSizeOfInvalidations); |
1093 | XLogRegisterData((char *) msgs, |
1094 | nmsgs * sizeof(SharedInvalidationMessage)); |
1095 | XLogInsert(RM_STANDBY_ID, XLOG_INVALIDATIONS); |
1096 | } |
1097 | |