xlog.c source code [PostgreSQL/src/backend/access/transam/xlog.c]

/*-------------------------------------------------------------------------
 *
 * xlog.c
 *		PostgreSQL write-ahead log manager
 *
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/backend/access/transam/xlog.c
 *
 *-------------------------------------------------------------------------
 */
14
15	#include "postgres.h"
16
17	#include <ctype.h>
18	#include <math.h>
19	#include <time.h>
20	#include <fcntl.h>
21	#include <sys/stat.h>
22	#include <sys/time.h>
23	#include <unistd.h>
24
25	#include "access/clog.h"
26	#include "access/commit_ts.h"
27	#include "access/multixact.h"
28	#include "access/rewriteheap.h"
29	#include "access/subtrans.h"
30	#include "access/timeline.h"
31	#include "access/transam.h"
32	#include "access/tuptoaster.h"
33	#include "access/twophase.h"
34	#include "access/xact.h"
35	#include "access/xlog_internal.h"
36	#include "access/xloginsert.h"
37	#include "access/xlogreader.h"
38	#include "access/xlogutils.h"
39	#include "catalog/catversion.h"
40	#include "catalog/pg_control.h"
41	#include "catalog/pg_database.h"
42	#include "commands/tablespace.h"
43	#include "common/controldata_utils.h"
44	#include "miscadmin.h"
45	#include "pgstat.h"
46	#include "port/atomics.h"
47	#include "postmaster/bgwriter.h"
48	#include "postmaster/walwriter.h"
49	#include "postmaster/startup.h"
50	#include "replication/basebackup.h"
51	#include "replication/logical.h"
52	#include "replication/slot.h"
53	#include "replication/origin.h"
54	#include "replication/snapbuild.h"
55	#include "replication/walreceiver.h"
56	#include "replication/walsender.h"
57	#include "storage/bufmgr.h"
58	#include "storage/fd.h"
59	#include "storage/ipc.h"
60	#include "storage/large_object.h"
61	#include "storage/latch.h"
62	#include "storage/pmsignal.h"
63	#include "storage/predicate.h"
64	#include "storage/proc.h"
65	#include "storage/procarray.h"
66	#include "storage/reinit.h"
67	#include "storage/smgr.h"
68	#include "storage/spin.h"
69	#include "storage/sync.h"
70	#include "utils/builtins.h"
71	#include "utils/guc.h"
72	#include "utils/memutils.h"
73	#include "utils/ps_status.h"
74	#include "utils/relmapper.h"
75	#include "utils/snapmgr.h"
76	#include "utils/timestamp.h"
77	#include "pg_trace.h"
78
79	extern uint32 bootstrap_data_checksum_version;
80
/* Unsupported old recovery command file names (relative to $PGDATA) */
#define RECOVERY_COMMAND_FILE	"recovery.conf"
#define RECOVERY_COMMAND_DONE	"recovery.done"
84
85	/ User-settable parameters /
86	int max_wal_size_mb = `1024`; / 1 GB /
87	int min_wal_size_mb = `80`; / 80 MB /
88	int wal_keep_segments = `0`;
89	int XLOGbuffers = -`1`;
90	int XLogArchiveTimeout = `0`;
91	int XLogArchiveMode = ARCHIVE_MODE_OFF;
92	char *XLogArchiveCommand = NULL;
93	bool EnableHotStandby = false;
94	bool fullPageWrites = true;
95	bool wal_log_hints = false;
96	bool wal_compression = false;
97	char *wal_consistency_checking_string = NULL;
98	bool *wal_consistency_checking = NULL;
99	bool wal_init_zero = true;
100	bool wal_recycle = true;
101	bool log_checkpoints = false;
102	int sync_method = DEFAULT_SYNC_METHOD;
103	int wal_level = WAL_LEVEL_MINIMAL;
104	int CommitDelay = `0`; / precommit delay in microseconds /
105	int CommitSiblings = `5`; / # concurrent xacts needed to sleep /
106	int wal_retrieve_retry_interval = `5000`;
107
108	#ifdef WAL_DEBUG
109	bool XLOG_DEBUG = false;
110	#endif
111
112	int wal_segment_size = DEFAULT_XLOG_SEG_SIZE;
113
/*
 * Number of WAL insertion locks to use. A higher value allows more insertions
 * to happen concurrently, but adds some CPU overhead to flushing the WAL,
 * which needs to iterate all the locks.
 */
#define NUM_XLOGINSERT_LOCKS  8
120
/*
 * Max distance from last checkpoint, before triggering a new xlog-based
 * checkpoint.
 */
int CheckPointSegments;

/* Estimated distance between checkpoints, in bytes */
static double CheckPointDistanceEstimate = 0;
static double PrevCheckPointDistance = 0;
130
131	/*
132	* GUC support
133	*/
134	const struct config_enum_entry sync_method_options[] = {
135	{"fsync", SYNC_METHOD_FSYNC, false},
136	#ifdef HAVE_FSYNC_WRITETHROUGH
137	{"fsync_writethrough", SYNC_METHOD_FSYNC_WRITETHROUGH, false},
138	#endif
139	#ifdef HAVE_FDATASYNC
140	{"fdatasync", SYNC_METHOD_FDATASYNC, false},
141	#endif
142	#ifdef OPEN_SYNC_FLAG
143	{"open_sync", SYNC_METHOD_OPEN, false},
144	#endif
145	#ifdef OPEN_DATASYNC_FLAG
146	{"open_datasync", SYNC_METHOD_OPEN_DSYNC, false},
147	#endif
148	{NULL, `0`, false}
149	};
150
151
152	/*
153	* Although only "on", "off", and "always" are documented,
154	* we accept all the likely variants of "on" and "off".
155	*/
156	const struct config_enum_entry archive_mode_options[] = {
157	{"always", ARCHIVE_MODE_ALWAYS, false},
158	{"on", ARCHIVE_MODE_ON, false},
159	{"off", ARCHIVE_MODE_OFF, false},
160	{"true", ARCHIVE_MODE_ON, true},
161	{"false", ARCHIVE_MODE_OFF, true},
162	{"yes", ARCHIVE_MODE_ON, true},
163	{"no", ARCHIVE_MODE_OFF, true},
164	{"1", ARCHIVE_MODE_ON, true},
165	{"0", ARCHIVE_MODE_OFF, true},
166	{NULL, `0`, false}
167	};
168
169	const struct config_enum_entry recovery_target_action_options[] = {
170	{"pause", RECOVERY_TARGET_ACTION_PAUSE, false},
171	{"promote", RECOVERY_TARGET_ACTION_PROMOTE, false},
172	{"shutdown", RECOVERY_TARGET_ACTION_SHUTDOWN, false},
173	{NULL, `0`, false}
174	};
175
176	/*
177	* Statistics for current checkpoint are collected in this global struct.
178	* Because only the checkpointer or a stand-alone backend can perform
179	* checkpoints, this will be unused in normal backends.
180	*/
181	CheckpointStatsData CheckpointStats;
182
183	/*
184	* ThisTimeLineID will be same in all backends --- it identifies current
185	* WAL timeline for the database system.
186	*/
187	TimeLineID ThisTimeLineID = `0`;
188
189	/*
190	* Are we doing recovery from XLOG?
191	*
192	* This is only ever true in the startup process; it should be read as meaning
193	* "this process is replaying WAL records", rather than "the system is in
194	* recovery mode". It should be examined primarily by functions that need
195	* to act differently when called from a WAL redo function (e.g., to skip WAL
196	* logging). To check whether the system is in recovery regardless of which
197	* process you're running in, use RecoveryInProgress() but only after shared
198	* memory startup and lock initialization.
199	*/
200	bool InRecovery = false;
201
202	/ Are we in Hot Standby mode? Only valid in startup process, see xlog.h /
203	HotStandbyState standbyState = STANDBY_DISABLED;
204
205	static XLogRecPtr LastRec;
206
207	/ Local copy of WalRcv->receivedUpto /
208	static XLogRecPtr receivedUpto = `0`;
209	static TimeLineID receiveTLI = `0`;
210
211	/*
212	* During recovery, lastFullPageWrites keeps track of full_page_writes that
213	* the replayed WAL records indicate. It's initialized with full_page_writes
214	* that the recovery starting checkpoint record indicates, and then updated
215	* each time XLOG_FPW_CHANGE record is replayed.
216	*/
217	static bool lastFullPageWrites;
218
/*
 * Local copy of SharedRecoveryInProgress variable. True actually means "not
 * known, need to check the shared state".
 */
static bool LocalRecoveryInProgress = true;

/*
 * Local copy of SharedHotStandbyActive variable. False actually means "not
 * known, need to check the shared state".
 */
static bool LocalHotStandbyActive = false;

/*
 * Local state for XLogInsertAllowed():
 *		1: unconditionally allowed to insert XLOG
 *		0: unconditionally not allowed to insert XLOG
 *		-1: must check RecoveryInProgress(); disallow until it is false
 * Most processes start with -1 and transition to 1 after seeing that recovery
 * is not in progress.  But we can also force the value for special cases.
 * The coding in XLogInsertAllowed() depends on the first two of these states
 * being numerically the same as bool true and false.
 */
static int LocalXLogInsertAllowed = -1;
242
/*
 * When ArchiveRecoveryRequested is set, archive recovery was requested,
 * ie. signal files were present. When InArchiveRecovery is set, we are
 * currently recovering using offline XLOG archives. These variables are only
 * valid in the startup process.
 *
 * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
 * currently performing crash recovery using only XLOG files in pg_wal, but
 * will switch to using offline XLOG archives as soon as we reach the end of
 * WAL in pg_wal.
 */
bool ArchiveRecoveryRequested = false;
bool InArchiveRecovery = false;

static bool standby_signal_file_found = false;
static bool recovery_signal_file_found = false;

/* Was the last xlog file restored from archive, or local? */
static bool restoredFromArchive = false;

/* Buffers dedicated to consistency checks of size BLCKSZ */
static char *replay_image_masked = NULL;
static char *master_image_masked = NULL;
266
267	/ options formerly taken from recovery.conf for archive recovery /
268	char *recoveryRestoreCommand = NULL;
269	char *recoveryEndCommand = NULL;
270	char *archiveCleanupCommand = NULL;
271	RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
272	bool recoveryTargetInclusive = true;
273	int recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE;
274	TransactionId recoveryTargetXid;
275	char *recovery_target_time_string;
276	static TimestampTz recoveryTargetTime;
277	const char *recoveryTargetName;
278	XLogRecPtr recoveryTargetLSN;
279	int recovery_min_apply_delay = `0`;
280	TimestampTz recoveryDelayUntilTime;
281
282	/ options formerly taken from recovery.conf for XLOG streaming /
283	bool StandbyModeRequested = false;
284	char *PrimaryConnInfo = NULL;
285	char *PrimarySlotName = NULL;
286	char *PromoteTriggerFile = NULL;
287
288	/ are we currently in standby mode? /
289	bool StandbyMode = false;
290
291	/ whether request for fast promotion has been made yet /
292	static bool fast_promote = false;
293
294	/*
295	* if recoveryStopsBefore/After returns true, it saves information of the stop
296	* point here
297	*/
298	static TransactionId recoveryStopXid;
299	static TimestampTz recoveryStopTime;
300	static XLogRecPtr recoveryStopLSN;
301	static char recoveryStopName[MAXFNAMELEN];
302	static bool recoveryStopAfter;
303
304	/*
305	* During normal operation, the only timeline we care about is ThisTimeLineID.
306	* During recovery, however, things are more complicated. To simplify life
307	* for rmgr code, we keep ThisTimeLineID set to the "current" timeline as we
308	* scan through the WAL history (that is, it is the line that was active when
309	* the currently-scanned WAL record was generated). We also need these
310	* timeline values:
311	*
312	* recoveryTargetTimeLineGoal: what the user requested, if any
313	*
314	* recoveryTargetTLIRequested: numeric value of requested timeline, if constant
315	*
316	* recoveryTargetTLI: the currently understood target timeline; changes
317	*
318	* expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and the timelines of
319	* its known parents, newest first (so recoveryTargetTLI is always the
320	* first list member). Only these TLIs are expected to be seen in the WAL
321	* segments we read, and indeed only these TLIs will be considered as
322	* candidate WAL files to open at all.
323	*
324	* curFileTLI: the TLI appearing in the name of the current input WAL file.
325	* (This is not necessarily the same as ThisTimeLineID, because we could
326	* be scanning data that was copied from an ancestor timeline when the current
327	* file was created.) During a sequential scan we do not allow this value
328	* to decrease.
329	*/
330	RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal = RECOVERY_TARGET_TIMELINE_LATEST;
331	TimeLineID recoveryTargetTLIRequested = `0`;
332	TimeLineID recoveryTargetTLI = `0`;
333	static List *expectedTLEs;
334	static TimeLineID curFileTLI;
335
336	/*
337	* ProcLastRecPtr points to the start of the last XLOG record inserted by the
338	* current backend. It is updated for all inserts. XactLastRecEnd points to
339	* end+1 of the last record, and is reset when we end a top-level transaction,
340	* or start a new one; so it can be used to tell if the current transaction has
341	* created any XLOG records.
342	*
343	* While in parallel mode, this may not be fully up to date. When committing,
344	* a transaction can assume this covers all xlog records written either by the
345	* user backend or by any parallel worker which was present at any point during
346	* the transaction. But when aborting, or when still in parallel mode, other
347	* parallel backends may have written WAL records at later LSNs than the value
348	* stored here. The parallel leader advances its own copy, when necessary,
349	* in WaitForParallelWorkersToFinish.
350	*/
351	XLogRecPtr ProcLastRecPtr = InvalidXLogRecPtr;
352	XLogRecPtr XactLastRecEnd = InvalidXLogRecPtr;
353	XLogRecPtr XactLastCommitEnd = InvalidXLogRecPtr;
354
355	/*
356	* RedoRecPtr is this backend's local copy of the REDO record pointer
357	* (which is almost but not quite the same as a pointer to the most recent
358	* CHECKPOINT record). We update this from the shared-memory copy,
359	* XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
360	* hold an insertion lock). See XLogInsertRecord for details. We are also
361	* allowed to update from XLogCtl->RedoRecPtr if we hold the info_lck;
362	* see GetRedoRecPtr. A freshly spawned backend obtains the value during
363	* InitXLOGAccess.
364	*/
365	static XLogRecPtr RedoRecPtr;
366
367	/*
368	* doPageWrites is this backend's local copy of (forcePageWrites \|\|
369	* fullPageWrites). It is used together with RedoRecPtr to decide whether
370	* a full-page image of a page need to be taken.
371	*/
372	static bool doPageWrites;
373
374	/ Has the recovery code requested a walreceiver wakeup? /
375	static bool doRequestWalReceiverReply;
376
377	/*
378	* RedoStartLSN points to the checkpoint's REDO location which is specified
379	* in a backup label file, backup history file or control file. In standby
380	* mode, XLOG streaming usually starts from the position where an invalid
381	* record was found. But if we fail to read even the initial checkpoint
382	* record, we use the REDO location instead of the checkpoint location as
383	* the start position of XLOG streaming. Otherwise we would have to jump
384	* backwards to the REDO location after reading the checkpoint record,
385	* because the REDO record can precede the checkpoint record.
386	*/
387	static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr;
388
389	/----------*
390	* Shared-memory data structures for XLOG control
391	*
392	* LogwrtRqst indicates a byte position that we need to write and/or fsync
393	* the log up to (all records before that point must be written or fsynced).
394	* LogwrtResult indicates the byte positions we have already written/fsynced.
395	* These structs are identical but are declared separately to indicate their
396	* slightly different functions.
397	*
398	* To read XLogCtl->LogwrtResult, you must hold either info_lck or
399	* WALWriteLock. To update it, you need to hold both locks. The point of
400	* this arrangement is that the value can be examined by code that already
401	* holds WALWriteLock without needing to grab info_lck as well. In addition
402	* to the shared variable, each backend has a private copy of LogwrtResult,
403	* which is updated when convenient.
404	*
405	* The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
406	* (protected by info_lck), but we don't need to cache any copies of it.
407	*
408	* info_lck is only held long enough to read/update the protected variables,
409	* so it's a plain spinlock. The other locks are held longer (potentially
410	* over I/O operations), so we use LWLocks for them. These locks are:
411	*
412	* WALBufMappingLock: must be held to replace a page in the WAL buffer cache.
413	* It is only held while initializing and changing the mapping. If the
414	* contents of the buffer being replaced haven't been written yet, the mapping
415	* lock is released while the write is done, and reacquired afterwards.
416	*
417	* WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
418	* XLogFlush).
419	*
420	* ControlFileLock: must be held to read/update control file or create
421	* new log file.
422	*
423	* CheckpointLock: must be held to do a checkpoint or restartpoint (ensures
424	* only one checkpointer at a time; currently, with all checkpoints done by
425	* the checkpointer, this is just pro forma).
426	*
427	*----------
428	*/
429
430	typedef struct XLogwrtRqst
431	{
432	XLogRecPtr Write; / last byte + 1 to write out /
433	XLogRecPtr Flush; / last byte + 1 to flush /
434	} XLogwrtRqst;
435
436	typedef struct XLogwrtResult
437	{
438	XLogRecPtr Write; / last byte + 1 written out /
439	XLogRecPtr Flush; / last byte + 1 flushed /
440	} XLogwrtResult;
441
442	/*
443	* Inserting to WAL is protected by a small fixed number of WAL insertion
444	* locks. To insert to the WAL, you must hold one of the locks - it doesn't
445	* matter which one. To lock out other concurrent insertions, you must hold
446	* of them. Each WAL insertion lock consists of a lightweight lock, plus an
447	* indicator of how far the insertion has progressed (insertingAt).
448	*
449	* The insertingAt values are read when a process wants to flush WAL from
450	* the in-memory buffers to disk, to check that all the insertions to the
451	* region the process is about to write out have finished. You could simply
452	* wait for all currently in-progress insertions to finish, but the
453	* insertingAt indicator allows you to ignore insertions to later in the WAL,
454	* so that you only wait for the insertions that are modifying the buffers
455	* you're about to write out.
456	*
457	* This isn't just an optimization. If all the WAL buffers are dirty, an
458	* inserter that's holding a WAL insert lock might need to evict an old WAL
459	* buffer, which requires flushing the WAL. If it's possible for an inserter
460	* to block on another inserter unnecessarily, deadlock can arise when two
461	* inserters holding a WAL insert lock wait for each other to finish their
462	* insertion.
463	*
464	* Small WAL records that don't cross a page boundary never update the value,
465	* the WAL record is just copied to the page and the lock is released. But
466	* to avoid the deadlock-scenario explained above, the indicator is always
467	* updated before sleeping while holding an insertion lock.
468	*
469	* lastImportantAt contains the LSN of the last important WAL record inserted
470	* using a given lock. This value is used to detect if there has been
471	* important WAL activity since the last time some action, like a checkpoint,
472	* was performed - allowing to not repeat the action if not. The LSN is
473	* updated for all insertions, unless the XLOG_MARK_UNIMPORTANT flag was
474	* set. lastImportantAt is never cleared, only overwritten by the LSN of newer
475	* records. Tracking the WAL activity directly in WALInsertLock has the
476	* advantage of not needing any additional locks to update the value.
477	*/
478	typedef struct
479	{
480	LWLock lock;
481	XLogRecPtr insertingAt;
482	XLogRecPtr lastImportantAt;
483	} WALInsertLock;
484
485	/*
486	* All the WAL insertion locks are allocated as an array in shared memory. We
487	* force the array stride to be a power of 2, which saves a few cycles in
488	* indexing, but more importantly also ensures that individual slots don't
489	* cross cache line boundaries. (Of course, we have to also ensure that the
490	* array start address is suitably aligned.)
491	*/
492	typedef union WALInsertLockPadded
493	{
494	WALInsertLock l;
495	char pad[PG_CACHE_LINE_SIZE];
496	} WALInsertLockPadded;
497
/*
 * State of an exclusive backup, necessary to control concurrent activities
 * across sessions when working on exclusive backups.
 *
 * EXCLUSIVE_BACKUP_NONE means that there is no exclusive backup actually
 * running, to be more precise pg_start_backup() is not being executed for
 * an exclusive backup and there is no exclusive backup in progress.
 * EXCLUSIVE_BACKUP_STARTING means that pg_start_backup() is starting an
 * exclusive backup.
 * EXCLUSIVE_BACKUP_IN_PROGRESS means that pg_start_backup() has finished
 * running and an exclusive backup is in progress. pg_stop_backup() is
 * needed to finish it.
 * EXCLUSIVE_BACKUP_STOPPING means that pg_stop_backup() is stopping an
 * exclusive backup.
 */
typedef enum ExclusiveBackupState
{
	EXCLUSIVE_BACKUP_NONE = 0,
	EXCLUSIVE_BACKUP_STARTING,
	EXCLUSIVE_BACKUP_IN_PROGRESS,
	EXCLUSIVE_BACKUP_STOPPING
} ExclusiveBackupState;
520
521	/*
522	* Session status of running backup, used for sanity checks in SQL-callable
523	* functions to start and stop backups.
524	*/
525	static SessionBackupState sessionBackupState = SESSION_BACKUP_NONE;
526
527	/*
528	* Shared state data for WAL insertion.
529	*/
530	typedef struct XLogCtlInsert
531	{
532	slock_t insertpos_lck; / protects CurrBytePos and PrevBytePos /
533
534	/*
535	* CurrBytePos is the end of reserved WAL. The next record will be
536	* inserted at that position. PrevBytePos is the start position of the
537	* previously inserted (or rather, reserved) record - it is copied to the
538	* prev-link of the next record. These are stored as "usable byte
539	* positions" rather than XLogRecPtrs (see XLogBytePosToRecPtr()).
540	*/
541	uint64 CurrBytePos;
542	uint64 PrevBytePos;
543
544	/*
545	* Make sure the above heavily-contended spinlock and byte positions are
546	* on their own cache line. In particular, the RedoRecPtr and full page
547	* write variables below should be on a different cache line. They are
548	* read on every WAL insertion, but updated rarely, and we don't want
549	* those reads to steal the cache line containing Curr/PrevBytePos.
550	*/
551	char pad[PG_CACHE_LINE_SIZE];
552
553	/*
554	* fullPageWrites is the master copy used by all backends to determine
555	* whether to write full-page to WAL, instead of using process-local one.
556	* This is required because, when full_page_writes is changed by SIGHUP,
557	* we must WAL-log it before it actually affects WAL-logging by backends.
558	* Checkpointer sets at startup or after SIGHUP.
559	*
560	* To read these fields, you must hold an insertion lock. To modify them,
561	* you must hold ALL the locks.
562	*/
563	XLogRecPtr RedoRecPtr; / current redo point for insertions /
564	bool forcePageWrites; / forcing full-page writes for PITR? /
565	bool fullPageWrites;
566
567	/*
568	* exclusiveBackupState indicates the state of an exclusive backup (see
569	* comments of ExclusiveBackupState for more details). nonExclusiveBackups
570	* is a counter indicating the number of streaming base backups currently
571	* in progress. forcePageWrites is set to true when either of these is
572	* non-zero. lastBackupStart is the latest checkpoint redo location used
573	* as a starting point for an online backup.
574	*/
575	ExclusiveBackupState exclusiveBackupState;
576	int nonExclusiveBackups;
577	XLogRecPtr lastBackupStart;
578
579	/*
580	* WAL insertion locks.
581	*/
582	WALInsertLockPadded *WALInsertLocks;
583	} XLogCtlInsert;
584
585	/*
586	* Total shared-memory state for XLOG.
587	*/
588	typedef struct XLogCtlData
589	{
590	XLogCtlInsert Insert;
591
592	/ Protected by info_lck: /
593	XLogwrtRqst LogwrtRqst;
594	XLogRecPtr RedoRecPtr; / a recent copy of Insert->RedoRecPtr /
595	FullTransactionId ckptFullXid; / nextFullXid of latest checkpoint /
596	XLogRecPtr asyncXactLSN; / LSN of newest async commit/abort /
597	XLogRecPtr replicationSlotMinLSN; / oldest LSN needed by any slot /
598
599	XLogSegNo lastRemovedSegNo; / latest removed/recycled XLOG segment /
600
601	/ Fake LSN counter, for unlogged relations. Protected by ulsn_lck. /
602	XLogRecPtr unloggedLSN;
603	slock_t ulsn_lck;
604
605	/ Time and LSN of last xlog segment switch. Protected by WALWriteLock. /
606	pg_time_t lastSegSwitchTime;
607	XLogRecPtr lastSegSwitchLSN;
608
609	/*
610	* Protected by info_lck and WALWriteLock (you must hold either lock to
611	* read it, but both to update)
612	*/
613	XLogwrtResult LogwrtResult;
614
615	/*
616	* Latest initialized page in the cache (last byte position + 1).
617	*
618	* To change the identity of a buffer (and InitializedUpTo), you need to
619	* hold WALBufMappingLock. To change the identity of a buffer that's
620	* still dirty, the old page needs to be written out first, and for that
621	* you need WALWriteLock, and you need to ensure that there are no
622	* in-progress insertions to the page by calling
623	* WaitXLogInsertionsToFinish().
624	*/
625	XLogRecPtr InitializedUpTo;
626
627	/*
628	* These values do not change after startup, although the pointed-to pages
629	* and xlblocks values certainly do. xlblock values are protected by
630	* WALBufMappingLock.
631	*/
632	char pages; /* buffers for unwritten XLOG pages /
633	XLogRecPtr xlblocks; /* 1st byte ptr-s + XLOG_BLCKSZ /
634	int XLogCacheBlck; / highest allocated xlog buffer index /
635
636	/*
637	* Shared copy of ThisTimeLineID. Does not change after end-of-recovery.
638	* If we created a new timeline when the system was started up,
639	* PrevTimeLineID is the old timeline's ID that we forked off from.
640	* Otherwise it's equal to ThisTimeLineID.
641	*/
642	TimeLineID ThisTimeLineID;
643	TimeLineID PrevTimeLineID;
644
645	/*
646	* SharedRecoveryInProgress indicates if we're still in crash or archive
647	* recovery. Protected by info_lck.
648	*/
649	bool SharedRecoveryInProgress;
650
651	/*
652	* SharedHotStandbyActive indicates if we're still in crash or archive
653	* recovery. Protected by info_lck.
654	*/
655	bool SharedHotStandbyActive;
656
657	/*
658	* WalWriterSleeping indicates whether the WAL writer is currently in
659	* low-power mode (and hence should be nudged if an async commit occurs).
660	* Protected by info_lck.
661	*/
662	bool WalWriterSleeping;
663
664	/*
665	* recoveryWakeupLatch is used to wake up the startup process to continue
666	* WAL replay, if it is waiting for WAL to arrive or failover trigger file
667	* to appear.
668	*/
669	Latch recoveryWakeupLatch;
670
671	/*
672	* During recovery, we keep a copy of the latest checkpoint record here.
673	* lastCheckPointRecPtr points to start of checkpoint record and
674	* lastCheckPointEndPtr points to end+1 of checkpoint record. Used by the
675	* checkpointer when it wants to create a restartpoint.
676	*
677	* Protected by info_lck.
678	*/
679	XLogRecPtr lastCheckPointRecPtr;
680	XLogRecPtr lastCheckPointEndPtr;
681	CheckPoint lastCheckPoint;
682
683	/*
684	* lastReplayedEndRecPtr points to end+1 of the last record successfully
685	* replayed. When we're currently replaying a record, ie. in a redo
686	* function, replayEndRecPtr points to the end+1 of the record being
687	* replayed, otherwise it's equal to lastReplayedEndRecPtr.
688	*/
689	XLogRecPtr lastReplayedEndRecPtr;
690	TimeLineID lastReplayedTLI;
691	XLogRecPtr replayEndRecPtr;
692	TimeLineID replayEndTLI;
693	/ timestamp of last COMMIT/ABORT record replayed (or being replayed) /
694	TimestampTz recoveryLastXTime;
695
696	/*
697	* timestamp of when we started replaying the current chunk of WAL data,
698	* only relevant for replication or archive recovery
699	*/
700	TimestampTz currentChunkStartTime;
701	/ Are we requested to pause recovery? /
702	bool recoveryPause;
703
704	/*
705	* lastFpwDisableRecPtr points to the start of the last replayed
706	* XLOG_FPW_CHANGE record that instructs full_page_writes is disabled.
707	*/
708	XLogRecPtr lastFpwDisableRecPtr;
709
710	slock_t info_lck; / locks shared variables shown above /
711	} XLogCtlData;
712
713	static XLogCtlData *XLogCtl = NULL;
714
715	/ a private copy of XLogCtl->Insert.WALInsertLocks, for convenience /
716	static WALInsertLockPadded *WALInsertLocks = NULL;
717
718	/*
719	* We maintain an image of pg_control in shared memory.
720	*/
721	static ControlFileData *ControlFile = NULL;
722
/*
 * Calculate the amount of space left on the page after 'endptr'. Beware
 * multiple evaluation!
 *
 * An 'endptr' that sits exactly on a page boundary yields 0, not
 * XLOG_BLCKSZ.
 */
#define INSERT_FREESPACE(endptr)	\
	(((endptr) % XLOG_BLCKSZ == 0) ? 0 : (XLOG_BLCKSZ - (endptr) % XLOG_BLCKSZ))
729
/*
 * Macro to advance to next buffer index.  Wraps around to 0 after the
 * highest index, XLogCtl->XLogCacheBlck.  Evaluates 'idx' more than once.
 */
#define NextBufIdx(idx)		\
	(((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1))
733
/*
 * XLogRecPtrToBufIdx returns the index of the WAL buffer that holds, or
 * would hold if it was in cache, the page containing 'recptr'.
 */
#define XLogRecPtrToBufIdx(recptr)	\
	(((recptr) / XLOG_BLCKSZ) % (XLogCtl->XLogCacheBlck + 1))
740
/*
 * These are the number of bytes in a WAL page usable for WAL data.
 */
#define UsableBytesInPage (XLOG_BLCKSZ - SizeOfXLogShortPHD)
745
/*
 * Convert min_wal_size_mb and max_wal_size_mb to equivalent segment count.
 *
 * 'x' is a size in megabytes and 'segsize' a segment size in bytes.  Both
 * arguments are parenthesized so that compound expressions such as
 * ConvertToXSegs(a + b, segsize) divide the whole sum.  For an integer
 * 'segsize' below 1 MB the divisor would be zero.
 */
#define ConvertToXSegs(x, segsize)	\
	((x) / ((segsize) / (1024 * 1024)))
749
750	/ The number of bytes in a WAL segment usable for WAL data. /
751	static int UsableBytesInSegment;
752
753	/*
754	* Private, possibly out-of-date copy of shared LogwrtResult.
755	* See discussion above.
756	*/
757	static XLogwrtResult LogwrtResult = {`0`, `0`};
758
/*
 * Codes indicating where we got a WAL file from during recovery, or where
 * to attempt to get one.
 */
typedef enum
{
	XLOG_FROM_ANY = 0,			/* request to read WAL from any source */
	XLOG_FROM_ARCHIVE,			/* restored using restore_command */
	XLOG_FROM_PG_WAL,			/* existing file in pg_wal */
	XLOG_FROM_STREAM			/* streamed from master */
} XLogSource;

/*
 * human-readable names for XLogSources, for debugging output
 *
 * Indexed by XLogSource value, so the order must match the enum above.
 * Both the strings and the table itself are read-only.
 */
static const char *const xlogSourceNames[] = {"any", "archive", "pg_wal", "stream"};
773
774	/*
775	* openLogFile is -1 or a kernel FD for an open log file segment.
776	* openLogSegNo identifies the segment. These variables are only used to
777	* write the XLOG, and so will normally refer to the active segment.
778	*/
779	static int openLogFile = -`1`;
780	static XLogSegNo openLogSegNo = `0`;
781
782	/*
783	* These variables are used similarly to the ones above, but for reading
784	* the XLOG. Note, however, that readOff generally represents the offset
785	* of the page just read, not the seek position of the FD itself, which
786	* will be just past that page. readLen indicates how much of the current
787	* page has been read into readBuf, and readSource indicates where we got
788	* the currently open file from.
789	*/
790	static int readFile = -`1`;
791	static XLogSegNo readSegNo = `0`;
792	static uint32 readOff = `0`;
793	static uint32 readLen = `0`;
794	static XLogSource readSource = `0`; / XLOG_FROM_* code /
795
796	/*
797	* Keeps track of which source we're currently reading from. This is
798	* different from readSource in that this is always set, even when we don't
799	* currently have a WAL file open. If lastSourceFailed is set, our last
800	* attempt to read from currentSource failed, and we should try another source
801	* next.
802	*/
803	static XLogSource currentSource = `0`; / XLOG_FROM_* code /
804	static bool lastSourceFailed = false;
805
/*
 * Private state for the XLOG page-read code.
 * NOTE(review): presumably handed to the xlogreader page-read callback --
 * confirm against its users.
 */
typedef struct XLogPageReadPrivate
{
	int			emode;
	bool		fetching_ckpt;	/* are we fetching a checkpoint record? */
	bool		randAccess;
} XLogPageReadPrivate;
812
813	/*
814	* These variables track when we last obtained some WAL data to process,
815	* and where we got it from. (XLogReceiptSource is initially the same as
816	* readSource, but readSource gets reset to zero when we don't have data
817	* to process right now. It is also different from currentSource, which
818	* also changes when we try to read from a source and fail, while
819	* XLogReceiptSource tracks where we last successfully read some WAL.)
820	*/
821	static TimestampTz XLogReceiptTime = `0`;
822	static XLogSource XLogReceiptSource = `0`; / XLOG_FROM_* code /
823
824	/ State information for XLOG reading /
825	static XLogRecPtr ReadRecPtr; / start of last record read /
826	static XLogRecPtr EndRecPtr; / end+1 of last record read /
827
828	/*
829	* Local copies of equivalent fields in the control file. When running
830	* crash recovery, minRecoveryPoint is set to InvalidXLogRecPtr as we
831	* expect to replay all the WAL available, and updateMinRecoveryPoint is
832	* switched to false to prevent any updates while replaying records.
833	* Those values are kept consistent as long as crash recovery runs.
834	*/
835	static XLogRecPtr minRecoveryPoint;
836	static TimeLineID minRecoveryPointTLI;
837	static bool updateMinRecoveryPoint = true;
838
839	/*
840	* Have we reached a consistent database state? In crash recovery, we have
841	* to replay all the WAL, so reachedConsistency is never set. During archive
842	* recovery, the database is consistent once minRecoveryPoint is reached.
843	*/
844	bool reachedConsistency = false;
845
846	static bool InRedo = false;
847
848	/ Have we launched bgwriter during recovery? /
849	static bool bgwriterLaunched = false;
850
851	/ For WALInsertLockAcquire/Release functions /
852	static int MyLockNo = `0`;
853	static bool holdingAllLocks = false;
854
855	#ifdef WAL_DEBUG
856	static MemoryContext walDebugCxt = NULL;
857	#endif
858
859	static void readRecoverySignalFile(void);
860	static void validateRecoveryParameters(void);
861	static void exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog);
862	static bool recoveryStopsBefore(XLogReaderState *record);
863	static bool recoveryStopsAfter(XLogReaderState *record);
864	static void recoveryPausesHere(void);
865	static bool recoveryApplyDelay(XLogReaderState *record);
866	static void SetLatestXTime(TimestampTz xtime);
867	static void SetCurrentChunkStartTime(TimestampTz xtime);
868	static void CheckRequiredParameterValues(void);
869	static void XLogReportParameters(void);
870	static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
871	TimeLineID prevTLI);
872	static void LocalSetXLogInsertAllowed(void);
873	static void CreateEndOfRecoveryRecord(void);
874	static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
875	static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
876	static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void);
877
878	static void AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic);
879	static bool XLogCheckpointNeeded(XLogSegNo new_segno);
880	static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible);
881	static bool InstallXLogFileSegment(XLogSegNo segno, char* *tmppath,
882	bool find_free, XLogSegNo max_segno,
883	bool use_lock);
884	static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
885	int source, bool notfoundOk);
886	static int XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source);
887	static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
888	int reqLen, XLogRecPtr targetRecPtr, char *readBuf,
889	TimeLineID *readTLI);
890	static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
891	bool fetching_ckpt, XLogRecPtr tliRecPtr);
892	static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
893	static void XLogFileClose(void);
894	static void PreallocXlogFiles(XLogRecPtr endptr);
895	static void RemoveTempXlogFiles(void);
896	static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr RedoRecPtr, XLogRecPtr endptr);
897	static void RemoveXlogFile(const char *segname, XLogRecPtr RedoRecPtr, XLogRecPtr endptr);
898	static void UpdateLastRemovedPtr(char *filename);
899	static void ValidateXLOGDirectoryStructure(void);
900	static void CleanupBackupHistory(void);
901	static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
902	static XLogRecord ReadRecord(XLogReaderState xlogreader, XLogRecPtr RecPtr,
903	int emode, bool fetching_ckpt);
904	static void CheckRecoveryConsistency(void);
905	static XLogRecord ReadCheckpointRecord(XLogReaderState xlogreader,
906	XLogRecPtr RecPtr, int whichChkpti, bool report);
907	static bool rescanLatestTimeLine(void);
908	static void WriteControlFile(void);
909	static void ReadControlFile(void);
910	static char *str_time(pg_time_t tnow);
911	static bool CheckForStandbyTrigger(void);
912
913	#ifdef WAL_DEBUG
914	static void xlog_outrec(StringInfo buf, XLogReaderState *record);
915	#endif
916	static void xlog_outdesc(StringInfo buf, XLogReaderState *record);
917	static void pg_start_backup_callback(int code, Datum arg);
918	static void pg_stop_backup_callback(int code, Datum arg);
919	static bool read_backup_label(XLogRecPtr *checkPointLoc,
920	bool backupEndRequired, bool backupFromStandby);
921	static bool read_tablespace_map(List **tablespaces);
922
923	static void rm_redo_error_callback(void *arg);
924	static int get_sync_bit(int method);
925
926	static void CopyXLogRecordToWAL(int write_len, bool isLogSwitch,
927	XLogRecData *rdata,
928	XLogRecPtr StartPos, XLogRecPtr EndPos);
929	static void ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos,
930	XLogRecPtr EndPos, XLogRecPtr PrevPtr);
931	static bool ReserveXLogSwitch(XLogRecPtr StartPos, XLogRecPtr EndPos,
932	XLogRecPtr *PrevPtr);
933	static XLogRecPtr WaitXLogInsertionsToFinish(XLogRecPtr upto);
934	static char *GetXLogBuffer(XLogRecPtr ptr);
935	static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos);
936	static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos);
937	static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr);
938	static void checkXLogConsistency(XLogReaderState *record);
939
940	static void WALInsertLockAcquire(void);
941	static void WALInsertLockAcquireExclusive(void);
942	static void WALInsertLockRelease(void);
943	static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt);
944
945	/*
946	* Insert an XLOG record represented by an already-constructed chain of data
947	* chunks. This is a low-level routine; to construct the WAL record header
948	* and data, use the higher-level routines in xloginsert.c.
949	*
950	* If 'fpw_lsn' is valid, it is the oldest LSN among the pages that this
951	* WAL record applies to, that were not included in the record as full page
952	* images. If fpw_lsn <= RedoRecPtr, the function does not perform the
953	* insertion and returns InvalidXLogRecPtr. The caller can then recalculate
954	* which pages need a full-page image, and retry. If fpw_lsn is invalid, the
955	* record is always inserted.
956	*
957	* 'flags' gives more in-depth control on the record being inserted. See
958	* XLogSetRecordFlags() for details.
959	*
960	* The first XLogRecData in the chain must be for the record header, and its
961	* data must be MAXALIGNed. XLogInsertRecord fills in the xl_prev and
962	* xl_crc fields in the header, the rest of the header must already be filled
963	* by the caller.
964	*
965	* Returns XLOG pointer to end of record (beginning of next record).
966	* This can be used as LSN for data pages affected by the logged action.
967	* (LSN is the XLOG point up to which the XLOG must be flushed to disk
968	* before the data page can be written out. This implements the basic
969	* WAL rule "write the log before the data".)
970	*/
971	XLogRecPtr
972	XLogInsertRecord(XLogRecData *rdata,
973	XLogRecPtr fpw_lsn,
974	uint8 flags)
975	{
976	XLogCtlInsert *Insert = &XLogCtl->Insert;
977	pg_crc32c rdata_crc;
978	bool inserted;
979	XLogRecord rechdr = (XLogRecord ) rdata->data;
980	uint8 info = rechdr->xl_info & ~XLR_INFO_MASK;
981	bool isLogSwitch = (rechdr->xl_rmid == RM_XLOG_ID &&
982	info == XLOG_SWITCH);
983	XLogRecPtr StartPos;
984	XLogRecPtr EndPos;
985	bool prevDoPageWrites = doPageWrites;
986
987	/ we assume that all of the record header is in the first chunk /
988	Assert(rdata->len >= SizeOfXLogRecord);
989
990	/ cross-check on whether we should be here or not /
991	if (!XLogInsertAllowed())
992	elog(ERROR, "cannot make new WAL entries during recovery");
993
994	/----------*
995	*
996	* We have now done all the preparatory work we can without holding a
997	* lock or modifying shared state. From here on, inserting the new WAL
998	* record to the shared WAL buffer cache is a two-step process:
999	*
1000	* 1. Reserve the right amount of space from the WAL. The current head of
1001	* reserved space is kept in Insert->CurrBytePos, and is protected by
1002	* insertpos_lck.
1003	*
1004	* 2. Copy the record to the reserved WAL space. This involves finding the
1005	* correct WAL buffer containing the reserved space, and copying the
1006	* record in place. This can be done concurrently in multiple processes.
1007	*
1008	* To keep track of which insertions are still in-progress, each concurrent
1009	* inserter acquires an insertion lock. In addition to just indicating that
1010	* an insertion is in progress, the lock tells others how far the inserter
1011	* has progressed. There is a small fixed number of insertion locks,
1012	* determined by NUM_XLOGINSERT_LOCKS. When an inserter crosses a page
1013	* boundary, it updates the value stored in the lock to the how far it has
1014	* inserted, to allow the previous buffer to be flushed.
1015	*
1016	* Holding onto an insertion lock also protects RedoRecPtr and
1017	* fullPageWrites from changing until the insertion is finished.
1018	*
1019	* Step 2 can usually be done completely in parallel. If the required WAL
1020	* page is not initialized yet, you have to grab WALBufMappingLock to
1021	* initialize it, but the WAL writer tries to do that ahead of insertions
1022	* to avoid that from happening in the critical path.
1023	*
1024	*----------
1025	*/
1026	START_CRIT_SECTION();
1027	if (isLogSwitch)
1028	WALInsertLockAcquireExclusive();
1029	else
1030	WALInsertLockAcquire();
1031
1032	/*
1033	* Check to see if my copy of RedoRecPtr is out of date. If so, may have
1034	* to go back and have the caller recompute everything. This can only
1035	* happen just after a checkpoint, so it's better to be slow in this case
1036	* and fast otherwise.
1037	*
1038	* Also check to see if fullPageWrites or forcePageWrites was just turned
1039	* on; if we weren't already doing full-page writes then go back and
1040	* recompute.
1041	*
1042	* If we aren't doing full-page writes then RedoRecPtr doesn't actually
1043	* affect the contents of the XLOG record, so we'll update our local copy
1044	* but not force a recomputation. (If doPageWrites was just turned off,
1045	* we could recompute the record without full pages, but we choose not to
1046	* bother.)
1047	*/
1048	if (RedoRecPtr != Insert->RedoRecPtr)
1049	{
1050	Assert(RedoRecPtr < Insert->RedoRecPtr);
1051	RedoRecPtr = Insert->RedoRecPtr;
1052	}
1053	doPageWrites = (Insert->fullPageWrites \|\| Insert->forcePageWrites);
1054
1055	if (doPageWrites &&
1056	(!prevDoPageWrites \|\|
1057	(fpw_lsn != InvalidXLogRecPtr && fpw_lsn <= RedoRecPtr)))
1058	{
1059	/*
1060	* Oops, some buffer now needs to be backed up that the caller didn't
1061	* back up. Start over.
1062	*/
1063	WALInsertLockRelease();
1064	END_CRIT_SECTION();
1065	return InvalidXLogRecPtr;
1066	}
1067
1068	/*
1069	* Reserve space for the record in the WAL. This also sets the xl_prev
1070	* pointer.
1071	*/
1072	if (isLogSwitch)
1073	inserted = ReserveXLogSwitch(&StartPos, &EndPos, &rechdr->xl_prev);
1074	else
1075	{
1076	ReserveXLogInsertLocation(rechdr->xl_tot_len, &StartPos, &EndPos,
1077	&rechdr->xl_prev);
1078	inserted = true;
1079	}
1080
1081	if (inserted)
1082	{
1083	/*
1084	* Now that xl_prev has been filled in, calculate CRC of the record
1085	* header.
1086	*/
1087	rdata_crc = rechdr->xl_crc;
1088	COMP_CRC32C(rdata_crc, rechdr, offsetof(XLogRecord, xl_crc));
1089	FIN_CRC32C(rdata_crc);
1090	rechdr->xl_crc = rdata_crc;
1091
1092	/*
1093	* All the record data, including the header, is now ready to be
1094	* inserted. Copy the record in the space reserved.
1095	*/
1096	CopyXLogRecordToWAL(rechdr->xl_tot_len, isLogSwitch, rdata,
1097	StartPos, EndPos);
1098
1099	/*
1100	* Unless record is flagged as not important, update LSN of last
1101	* important record in the current slot. When holding all locks, just
1102	* update the first one.
1103	*/
1104	if ((flags & XLOG_MARK_UNIMPORTANT) == `0`)
1105	{
1106	int lockno = holdingAllLocks ? `0` : MyLockNo;
1107
1108	WALInsertLocks[lockno].l.lastImportantAt = StartPos;
1109	}
1110	}
1111	else
1112	{
1113	/*
1114	* This was an xlog-switch record, but the current insert location was
1115	* already exactly at the beginning of a segment, so there was no need
1116	* to do anything.
1117	*/
1118	}
1119
1120	/*
1121	* Done! Let others know that we're finished.
1122	*/
1123	WALInsertLockRelease();
1124
1125	MarkCurrentTransactionIdLoggedIfAny();
1126
1127	END_CRIT_SECTION();
1128
1129	/*
1130	* Update shared LogwrtRqst.Write, if we crossed page boundary.
1131	*/
1132	if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
1133	{
1134	SpinLockAcquire(&XLogCtl->info_lck);
1135	/ advance global request to include new block(s) /
1136	if (XLogCtl->LogwrtRqst.Write < EndPos)
1137	XLogCtl->LogwrtRqst.Write = EndPos;
1138	/ update local result copy while I have the chance /
1139	LogwrtResult = XLogCtl->LogwrtResult;
1140	SpinLockRelease(&XLogCtl->info_lck);
1141	}
1142
1143	/*
1144	* If this was an XLOG_SWITCH record, flush the record and the empty
1145	* padding space that fills the rest of the segment, and perform
1146	* end-of-segment actions (eg, notifying archiver).
1147	*/
1148	if (isLogSwitch)
1149	{
1150	TRACE_POSTGRESQL_WAL_SWITCH();
1151	XLogFlush(EndPos);
1152
1153	/*
1154	* Even though we reserved the rest of the segment for us, which is
1155	* reflected in EndPos, we return a pointer to just the end of the
1156	* xlog-switch record.
1157	*/
1158	if (inserted)
1159	{
1160	EndPos = StartPos + SizeOfXLogRecord;
1161	if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
1162	{
1163	uint64 offset = XLogSegmentOffset(EndPos, wal_segment_size);
1164
1165	if (offset == EndPos % XLOG_BLCKSZ)
1166	EndPos += SizeOfXLogLongPHD;
1167	else
1168	EndPos += SizeOfXLogShortPHD;
1169	}
1170	}
1171	}
1172
1173	#ifdef WAL_DEBUG
1174	if (XLOG_DEBUG)
1175	{
1176	static XLogReaderState *debug_reader = NULL;
1177	StringInfoData buf;
1178	StringInfoData recordBuf;
1179	char *errormsg = NULL;
1180	MemoryContext oldCxt;
1181
1182	oldCxt = MemoryContextSwitchTo(walDebugCxt);
1183
1184	initStringInfo(&buf);
1185	appendStringInfo(&buf, "INSERT @ %X/%X: ",
1186	(uint32) (EndPos >> `32`), (uint32) EndPos);
1187
1188	/*
1189	* We have to piece together the WAL record data from the XLogRecData
1190	* entries, so that we can pass it to the rm_desc function as one
1191	* contiguous chunk.
1192	*/
1193	initStringInfo(&recordBuf);
1194	for (; rdata != NULL; rdata = rdata->next)
1195	appendBinaryStringInfo(&recordBuf, rdata->data, rdata->len);
1196
1197	if (!debug_reader)
1198	debug_reader = XLogReaderAllocate(wal_segment_size, NULL, NULL);
1199
1200	if (!debug_reader)
1201	{
1202	appendStringInfoString(&buf, "error decoding record: out of memory");
1203	}
1204	else if (!DecodeXLogRecord(debug_reader, (XLogRecord *) recordBuf.data,
1205	&errormsg))
1206	{
1207	appendStringInfo(&buf, "error decoding record: %s",
1208	errormsg ? errormsg : "no error message");
1209	}
1210	else
1211	{
1212	appendStringInfoString(&buf, " - ");
1213	xlog_outdesc(&buf, debug_reader);
1214	}
1215	elog(LOG, "%s", buf.data);
1216
1217	pfree(buf.data);
1218	pfree(recordBuf.data);
1219	MemoryContextSwitchTo(oldCxt);
1220	}
1221	#endif
1222
1223	/*
1224	* Update our global variables
1225	*/
1226	ProcLastRecPtr = StartPos;
1227	XactLastRecEnd = EndPos;
1228
1229	return EndPos;
1230	}
1231
1232	/*
1233	* Reserves the right amount of space for a record of given size from the WAL.
1234	* StartPos is set to the beginning of the reserved section, EndPos to
1235	* its end+1. *PrevPtr is set to the beginning of the previous record; it is
1236	* used to set the xl_prev of this record.
1237	*
1238	* This is the performance critical part of XLogInsert that must be serialized
1239	* across backends. The rest can happen mostly in parallel. Try to keep this
1240	* section as short as possible, insertpos_lck can be heavily contended on a
1241	* busy system.
1242	*
1243	* NB: The space calculation here must match the code in CopyXLogRecordToWAL,
1244	* where we actually copy the record to the reserved space.
1245	*/
1246	static void
1247	ReserveXLogInsertLocation(int size, XLogRecPtr StartPos, XLogRecPtr EndPos,
1248	XLogRecPtr *PrevPtr)
1249	{
1250	XLogCtlInsert *Insert = &XLogCtl->Insert;
1251	uint64 startbytepos;
1252	uint64 endbytepos;
1253	uint64 prevbytepos;
1254
1255	size = MAXALIGN(size);
1256
1257	/ All (non xlog-switch) records should contain data. /
1258	Assert(size > SizeOfXLogRecord);
1259
1260	/*
1261	* The duration the spinlock needs to be held is minimized by minimizing
1262	* the calculations that have to be done while holding the lock. The
1263	* current tip of reserved WAL is kept in CurrBytePos, as a byte position
1264	* that only counts "usable" bytes in WAL, that is, it excludes all WAL
1265	* page headers. The mapping between "usable" byte positions and physical
1266	* positions (XLogRecPtrs) can be done outside the locked region, and
1267	* because the usable byte position doesn't include any headers, reserving
1268	* X bytes from WAL is almost as simple as "CurrBytePos += X".
1269	*/
1270	SpinLockAcquire(&Insert->insertpos_lck);
1271
1272	startbytepos = Insert->CurrBytePos;
1273	endbytepos = startbytepos + size;
1274	prevbytepos = Insert->PrevBytePos;
1275	Insert->CurrBytePos = endbytepos;
1276	Insert->PrevBytePos = startbytepos;
1277
1278	SpinLockRelease(&Insert->insertpos_lck);
1279
1280	*StartPos = XLogBytePosToRecPtr(startbytepos);
1281	*EndPos = XLogBytePosToEndRecPtr(endbytepos);
1282	*PrevPtr = XLogBytePosToRecPtr(prevbytepos);
1283
1284	/*
1285	* Check that the conversions between "usable byte positions" and
1286	* XLogRecPtrs work consistently in both directions.
1287	*/
1288	Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
1289	Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
1290	Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
1291	}
1292
1293	/*
1294	* Like ReserveXLogInsertLocation(), but for an xlog-switch record.
1295	*
1296	* A log-switch record is handled slightly differently. The rest of the
1297	* segment will be reserved for this insertion, as indicated by the returned
1298	* *EndPos value. However, if we are already at the beginning of the current
1299	* segment, StartPos and EndPos are set to the current location without
1300	* reserving any space, and the function returns false.
1301	*/
1302	static bool
1303	ReserveXLogSwitch(XLogRecPtr StartPos, XLogRecPtr EndPos, XLogRecPtr *PrevPtr)
1304	{
1305	XLogCtlInsert *Insert = &XLogCtl->Insert;
1306	uint64 startbytepos;
1307	uint64 endbytepos;
1308	uint64 prevbytepos;
1309	uint32 size = MAXALIGN(SizeOfXLogRecord);
1310	XLogRecPtr ptr;
1311	uint32 segleft;
1312
1313	/*
1314	* These calculations are a bit heavy-weight to be done while holding a
1315	* spinlock, but since we're holding all the WAL insertion locks, there
1316	* are no other inserters competing for it. GetXLogInsertRecPtr() does
1317	* compete for it, but that's not called very frequently.
1318	*/
1319	SpinLockAcquire(&Insert->insertpos_lck);
1320
1321	startbytepos = Insert->CurrBytePos;
1322
1323	ptr = XLogBytePosToEndRecPtr(startbytepos);
1324	if (XLogSegmentOffset(ptr, wal_segment_size) == `0`)
1325	{
1326	SpinLockRelease(&Insert->insertpos_lck);
1327	EndPos = StartPos = ptr;
1328	return false;
1329	}
1330
1331	endbytepos = startbytepos + size;
1332	prevbytepos = Insert->PrevBytePos;
1333
1334	*StartPos = XLogBytePosToRecPtr(startbytepos);
1335	*EndPos = XLogBytePosToEndRecPtr(endbytepos);
1336
1337	segleft = wal_segment_size - XLogSegmentOffset(*EndPos, wal_segment_size);
1338	if (segleft != wal_segment_size)
1339	{
1340	/ consume the rest of the segment /
1341	*EndPos += segleft;
1342	endbytepos = XLogRecPtrToBytePos(*EndPos);
1343	}
1344	Insert->CurrBytePos = endbytepos;
1345	Insert->PrevBytePos = startbytepos;
1346
1347	SpinLockRelease(&Insert->insertpos_lck);
1348
1349	*PrevPtr = XLogBytePosToRecPtr(prevbytepos);
1350
1351	Assert(XLogSegmentOffset(*EndPos, wal_segment_size) == `0`);
1352	Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
1353	Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
1354	Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
1355
1356	return true;
1357	}
1358
1359	/*
1360	* Checks whether the current buffer page and backup page stored in the
1361	* WAL record are consistent or not. Before comparing the two pages, a
1362	* masking can be applied to the pages to ignore certain areas like hint bits,
1363	* unused space between pd_lower and pd_upper among other things. This
1364	* function should be called once WAL replay has been completed for a
1365	* given record.
1366	*/
1367	static void
1368	checkXLogConsistency(XLogReaderState *record)
1369	{
1370	RmgrId rmid = XLogRecGetRmid(record);
1371	RelFileNode rnode;
1372	ForkNumber forknum;
1373	BlockNumber blkno;
1374	int block_id;
1375
1376	/ Records with no backup blocks have no need for consistency checks. /
1377	if (!XLogRecHasAnyBlockRefs(record))
1378	return;
1379
1380	Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != `0`);
1381
1382	for (block_id = `0`; block_id <= record->max_block_id; block_id++)
1383	{
1384	Buffer buf;
1385	Page page;
1386
1387	if (!XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno))
1388	{
1389	/*
1390	* WAL record doesn't contain a block reference with the given id.
1391	* Do nothing.
1392	*/
1393	continue;
1394	}
1395
1396	Assert(XLogRecHasBlockImage(record, block_id));
1397
1398	if (XLogRecBlockImageApply(record, block_id))
1399	{
1400	/*
1401	* WAL record has already applied the page, so bypass the
1402	* consistency check as that would result in comparing the full
1403	* page stored in the record with itself.
1404	*/
1405	continue;
1406	}
1407
1408	/*
1409	* Read the contents from the current buffer and store it in a
1410	* temporary page.
1411	*/
1412	buf = XLogReadBufferExtended(rnode, forknum, blkno,
1413	RBM_NORMAL_NO_LOG);
1414	if (!BufferIsValid(buf))
1415	continue;
1416
1417	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
1418	page = BufferGetPage(buf);
1419
1420	/*
1421	* Take a copy of the local page where WAL has been applied to have a
1422	* comparison base before masking it...
1423	*/
1424	memcpy(replay_image_masked, page, BLCKSZ);
1425
1426	/ No need for this page anymore now that a copy is in. /
1427	UnlockReleaseBuffer(buf);
1428
1429	/*
1430	* If the block LSN is already ahead of this WAL record, we can't
1431	* expect contents to match. This can happen if recovery is
1432	* restarted.
1433	*/
1434	if (PageGetLSN(replay_image_masked) > record->EndRecPtr)
1435	continue;
1436
1437	/*
1438	* Read the contents from the backup copy, stored in WAL record and
1439	* store it in a temporary page. There is no need to allocate a new
1440	* page here, a local buffer is fine to hold its contents and a mask
1441	* can be directly applied on it.
1442	*/
1443	if (!RestoreBlockImage(record, block_id, master_image_masked))
1444	elog(ERROR, "failed to restore block image");
1445
1446	/*
1447	* If masking function is defined, mask both the master and replay
1448	* images
1449	*/
1450	if (RmgrTable[rmid].rm_mask != NULL)
1451	{
1452	RmgrTable[rmid].rm_mask(replay_image_masked, blkno);
1453	RmgrTable[rmid].rm_mask(master_image_masked, blkno);
1454	}
1455
1456	/ Time to compare the master and replay images. /
1457	if (memcmp(replay_image_masked, master_image_masked, BLCKSZ) != `0`)
1458	{
1459	elog(FATAL,
1460	"inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
1461	rnode.spcNode, rnode.dbNode, rnode.relNode,
1462	forknum, blkno);
1463	}
1464	}
1465	}
1466
1467	/*
1468	* Subroutine of XLogInsertRecord. Copies a WAL record to an already-reserved
1469	* area in the WAL.
1470	*/
1471	static void
1472	CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata,
1473	XLogRecPtr StartPos, XLogRecPtr EndPos)
1474	{
1475	char *currpos;
1476	int freespace;
1477	int written;
1478	XLogRecPtr CurrPos;
1479	XLogPageHeader pagehdr;
1480
1481	/*
1482	* Get a pointer to the right place in the right WAL buffer to start
1483	* inserting to.
1484	*/
1485	CurrPos = StartPos;
1486	currpos = GetXLogBuffer(CurrPos);
1487	freespace = INSERT_FREESPACE(CurrPos);
1488
1489	/*
1490	* there should be enough space for at least the first field (xl_tot_len)
1491	* on this page.
1492	*/
1493	Assert(freespace >= sizeof(uint32));
1494
1495	/ Copy record data /
1496	written = `0`;
1497	while (rdata != NULL)
1498	{
1499	char *rdata_data = rdata->data;
1500	int rdata_len = rdata->len;
1501
1502	while (rdata_len > freespace)
1503	{
1504	/*
1505	* Write what fits on this page, and continue on the next page.
1506	*/
1507	Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD \|\| freespace == `0`);
1508	memcpy(currpos, rdata_data, freespace);
1509	rdata_data += freespace;
1510	rdata_len -= freespace;
1511	written += freespace;
1512	CurrPos += freespace;
1513
1514	/*
1515	* Get pointer to beginning of next page, and set the xlp_rem_len
1516	* in the page header. Set XLP_FIRST_IS_CONTRECORD.
1517	*
1518	* It's safe to set the contrecord flag and xlp_rem_len without a
1519	* lock on the page. All the other flags were already set when the
1520	* page was initialized, in AdvanceXLInsertBuffer, and we're the
1521	* only backend that needs to set the contrecord flag.
1522	*/
1523	currpos = GetXLogBuffer(CurrPos);
1524	pagehdr = (XLogPageHeader) currpos;
1525	pagehdr->xlp_rem_len = write_len - written;
1526	pagehdr->xlp_info \|= XLP_FIRST_IS_CONTRECORD;
1527
1528	/ skip over the page header /
1529	if (XLogSegmentOffset(CurrPos, wal_segment_size) == `0`)
1530	{
1531	CurrPos += SizeOfXLogLongPHD;
1532	currpos += SizeOfXLogLongPHD;
1533	}
1534	else
1535	{
1536	CurrPos += SizeOfXLogShortPHD;
1537	currpos += SizeOfXLogShortPHD;
1538	}
1539	freespace = INSERT_FREESPACE(CurrPos);
1540	}
1541
1542	Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD \|\| rdata_len == `0`);
1543	memcpy(currpos, rdata_data, rdata_len);
1544	currpos += rdata_len;
1545	CurrPos += rdata_len;
1546	freespace -= rdata_len;
1547	written += rdata_len;
1548
1549	rdata = rdata->next;
1550	}
1551	Assert(written == write_len);
1552
1553	/*
1554	* If this was an xlog-switch, it's not enough to write the switch record,
1555	* we also have to consume all the remaining space in the WAL segment. We
1556	* have already reserved that space, but we need to actually fill it.
1557	*/
1558	if (isLogSwitch && XLogSegmentOffset(CurrPos, wal_segment_size) != `0`)
1559	{
1560	/ An xlog-switch record doesn't contain any data besides the header /
1561	Assert(write_len == SizeOfXLogRecord);
1562
1563	/ Assert that we did reserve the right amount of space /
1564	Assert(XLogSegmentOffset(EndPos, wal_segment_size) == `0`);
1565
1566	/ Use up all the remaining space on the current page /
1567	CurrPos += freespace;
1568
1569	/*
1570	* Cause all remaining pages in the segment to be flushed, leaving the
1571	* XLog position where it should be, at the start of the next segment.
1572	* We do this one page at a time, to make sure we don't deadlock
1573	* against ourselves if wal_buffers < wal_segment_size.
1574	*/
1575	while (CurrPos < EndPos)
1576	{
1577	/*
1578	* The minimal action to flush the page would be to call
1579	* WALInsertLockUpdateInsertingAt(CurrPos) followed by
1580	* AdvanceXLInsertBuffer(...). The page would be left initialized
1581	* mostly to zeros, except for the page header (always the short
1582	* variant, as this is never a segment's first page).
1583	*
1584	* The large vistas of zeros are good for compressibility, but the
1585	* headers interrupting them every XLOG_BLCKSZ (with values that
1586	* differ from page to page) are not. The effect varies with
1587	* compression tool, but bzip2 for instance compresses about an
1588	* order of magnitude worse if those headers are left in place.
1589	*
1590	* Rather than complicating AdvanceXLInsertBuffer itself (which is
1591	* called in heavily-loaded circumstances as well as this lightly-
1592	* loaded one) with variant behavior, we just use GetXLogBuffer
1593	* (which itself calls the two methods we need) to get the pointer
1594	* and zero most of the page. Then we just zero the page header.
1595	*/
1596	currpos = GetXLogBuffer(CurrPos);
1597	MemSet(currpos, `0`, SizeOfXLogShortPHD);
1598
1599	CurrPos += XLOG_BLCKSZ;
1600	}
1601	}
1602	else
1603	{
1604	/ Align the end position, so that the next record starts aligned /
1605	CurrPos = MAXALIGN64(CurrPos);
1606	}
1607
1608	if (CurrPos != EndPos)
1609	elog(PANIC, "space reserved for WAL record does not match what was written");
1610	}
1611
1612	/*
1613	* Acquire a WAL insertion lock, for inserting to WAL.
1614	*/
1615	static void
1616	WALInsertLockAcquire(void)
1617	{
1618	bool immed;
1619
1620	/*
1621	* It doesn't matter which of the WAL insertion locks we acquire, so try
1622	* the one we used last time. If the system isn't particularly busy, it's
1623	* a good bet that it's still available, and it's good to have some
1624	* affinity to a particular lock so that you don't unnecessarily bounce
1625	* cache lines between processes when there's no contention.
1626	*
1627	* If this is the first time through in this backend, pick a lock
1628	* (semi-)randomly. This allows the locks to be used evenly if you have a
1629	* lot of very short connections.
1630	*/
1631	static int lockToTry = -`1`;
1632
1633	if (lockToTry == -`1`)
1634	lockToTry = MyProc->pgprocno % NUM_XLOGINSERT_LOCKS;
1635	MyLockNo = lockToTry;
1636
1637	/*
1638	* The insertingAt value is initially set to 0, as we don't know our
1639	* insert location yet.
1640	*/
1641	immed = LWLockAcquire(&WALInsertLocks[MyLockNo].l.lock, LW_EXCLUSIVE);
1642	if (!immed)
1643	{
1644	/*
1645	* If we couldn't get the lock immediately, try another lock next
1646	* time. On a system with more insertion locks than concurrent
1647	* inserters, this causes all the inserters to eventually migrate to a
1648	* lock that no-one else is using. On a system with more inserters
1649	* than locks, it still helps to distribute the inserters evenly
1650	* across the locks.
1651	*/
1652	lockToTry = (lockToTry + `1`) % NUM_XLOGINSERT_LOCKS;
1653	}
1654	}
1655
1656	/*
1657	* Acquire all WAL insertion locks, to prevent other backends from inserting
1658	* to WAL.
1659	*/
1660	static void
1661	WALInsertLockAcquireExclusive(void)
1662	{
1663	int i;
1664
1665	/*
1666	* When holding all the locks, all but the last lock's insertingAt
1667	* indicator is set to 0xFFFFFFFFFFFFFFFF, which is higher than any real
1668	* XLogRecPtr value, to make sure that no-one blocks waiting on those.
1669	*/
1670	for (i = `0`; i < NUM_XLOGINSERT_LOCKS - `1`; i++)
1671	{
1672	LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
1673	LWLockUpdateVar(&WALInsertLocks[i].l.lock,
1674	&WALInsertLocks[i].l.insertingAt,
1675	PG_UINT64_MAX);
1676	}
1677	/ Variable value reset to 0 at release /
1678	LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
1679
1680	holdingAllLocks = true;
1681	}
1682
1683	/*
1684	* Release our insertion lock (or locks, if we're holding them all).
1685	*
1686	* NB: Reset all variables to 0, so they cause LWLockWaitForVar to block the
1687	* next time the lock is acquired.
1688	*/
1689	static void
1690	WALInsertLockRelease(void)
1691	{
1692	if (holdingAllLocks)
1693	{
1694	int i;
1695
1696	for (i = `0`; i < NUM_XLOGINSERT_LOCKS; i++)
1697	LWLockReleaseClearVar(&WALInsertLocks[i].l.lock,
1698	&WALInsertLocks[i].l.insertingAt,
1699	`0`);
1700
1701	holdingAllLocks = false;
1702	}
1703	else
1704	{
1705	LWLockReleaseClearVar(&WALInsertLocks[MyLockNo].l.lock,
1706	&WALInsertLocks[MyLockNo].l.insertingAt,
1707	`0`);
1708	}
1709	}
1710
1711	/*
1712	* Update our insertingAt value, to let others know that we've finished
1713	* inserting up to that point.
1714	*/
1715	static void
1716	WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt)
1717	{
1718	if (holdingAllLocks)
1719	{
1720	/*
1721	* We use the last lock to mark our actual position, see comments in
1722	* WALInsertLockAcquireExclusive.
1723	*/
1724	LWLockUpdateVar(&WALInsertLocks[NUM_XLOGINSERT_LOCKS - `1`].l.lock,
1725	&WALInsertLocks[NUM_XLOGINSERT_LOCKS - `1`].l.insertingAt,
1726	insertingAt);
1727	}
1728	else
1729	LWLockUpdateVar(&WALInsertLocks[MyLockNo].l.lock,
1730	&WALInsertLocks[MyLockNo].l.insertingAt,
1731	insertingAt);
1732	}
1733
1734	/*
1735	* Wait for any WAL insertions < upto to finish.
1736	*
1737	* Returns the location of the oldest insertion that is still in-progress.
1738	* Any WAL prior to that point has been fully copied into WAL buffers, and
1739	* can be flushed out to disk. Because this waits for any insertions older
1740	* than 'upto' to finish, the return value is always >= 'upto'.
1741	*
1742	* Note: When you are about to write out WAL, you must call this function
1743	* before acquiring WALWriteLock, to avoid deadlocks. This function might
1744	* need to wait for an insertion to finish (or at least advance to next
1745	* uninitialized page), and the inserter might need to evict an old WAL buffer
1746	* to make room for a new one, which in turn requires WALWriteLock.
1747	*/
1748	static XLogRecPtr
1749	WaitXLogInsertionsToFinish(XLogRecPtr upto)
1750	{
1751	uint64 bytepos;
1752	XLogRecPtr reservedUpto;
1753	XLogRecPtr finishedUpto;
1754	XLogCtlInsert *Insert = &XLogCtl->Insert;
1755	int i;
1756
1757	if (MyProc == NULL)
1758	elog(PANIC, "cannot wait without a PGPROC structure");
1759
1760	/ Read the current insert position /
1761	SpinLockAcquire(&Insert->insertpos_lck);
1762	bytepos = Insert->CurrBytePos;
1763	SpinLockRelease(&Insert->insertpos_lck);
1764	reservedUpto = XLogBytePosToEndRecPtr(bytepos);
1765
1766	/*
1767	* No-one should request to flush a piece of WAL that hasn't even been
1768	* reserved yet. However, it can happen if there is a block with a bogus
1769	* LSN on disk, for example. XLogFlush checks for that situation and
1770	* complains, but only after the flush. Here we just assume that to mean
1771	* that all WAL that has been reserved needs to be finished. In this
1772	* corner-case, the return value can be smaller than 'upto' argument.
1773	*/
1774	if (upto > reservedUpto)
1775	{
1776	elog(LOG, "request to flush past end of generated WAL; request %X/%X, currpos %X/%X",
1777	(uint32) (upto >> `32`), (uint32) upto,
1778	(uint32) (reservedUpto >> `32`), (uint32) reservedUpto);
1779	upto = reservedUpto;
1780	}
1781
1782	/*
1783	* Loop through all the locks, sleeping on any in-progress insert older
1784	* than 'upto'.
1785	*
1786	* finishedUpto is our return value, indicating the point upto which all
1787	* the WAL insertions have been finished. Initialize it to the head of
1788	* reserved WAL, and as we iterate through the insertion locks, back it
1789	* out for any insertion that's still in progress.
1790	*/
1791	finishedUpto = reservedUpto;
1792	for (i = `0`; i < NUM_XLOGINSERT_LOCKS; i++)
1793	{
1794	XLogRecPtr insertingat = InvalidXLogRecPtr;
1795
1796	do
1797	{
1798	/*
1799	* See if this insertion is in progress. LWLockWait will wait for
1800	* the lock to be released, or for the 'value' to be set by a
1801	* LWLockUpdateVar call. When a lock is initially acquired, its
1802	* value is 0 (InvalidXLogRecPtr), which means that we don't know
1803	* where it's inserting yet. We will have to wait for it. If
1804	* it's a small insertion, the record will most likely fit on the
1805	* same page and the inserter will release the lock without ever
1806	* calling LWLockUpdateVar. But if it has to sleep, it will
1807	* advertise the insertion point with LWLockUpdateVar before
1808	* sleeping.
1809	*/
1810	if (LWLockWaitForVar(&WALInsertLocks[i].l.lock,
1811	&WALInsertLocks[i].l.insertingAt,
1812	insertingat, &insertingat))
1813	{
1814	/ the lock was free, so no insertion in progress /
1815	insertingat = InvalidXLogRecPtr;
1816	break;
1817	}
1818
1819	/*
1820	* This insertion is still in progress. Have to wait, unless the
1821	* inserter has proceeded past 'upto'.
1822	*/
1823	} while (insertingat < upto);
1824
1825	if (insertingat != InvalidXLogRecPtr && insertingat < finishedUpto)
1826	finishedUpto = insertingat;
1827	}
1828	return finishedUpto;
1829	}
1830
1831	/*
1832	* Get a pointer to the right location in the WAL buffer containing the
1833	* given XLogRecPtr.
1834	*
1835	* If the page is not initialized yet, it is initialized. That might require
1836	* evicting an old dirty buffer from the buffer cache, which means I/O.
1837	*
1838	* The caller must ensure that the page containing the requested location
1839	* isn't evicted yet, and won't be evicted. The way to ensure that is to
1840	* hold onto a WAL insertion lock with the insertingAt position set to
1841	* something <= ptr. GetXLogBuffer() will update insertingAt if it needs
1842	* to evict an old page from the buffer. (This means that once you call
1843	* GetXLogBuffer() with a given 'ptr', you must not access anything before
1844	* that point anymore, and must not call GetXLogBuffer() with an older 'ptr'
1845	* later, because older buffers might be recycled already)
1846	*/
1847	static char *
1848	GetXLogBuffer(XLogRecPtr ptr)
1849	{
1850	int idx;
1851	XLogRecPtr endptr;
1852	static uint64 cachedPage = `0`;
1853	static char *cachedPos = NULL;
1854	XLogRecPtr expectedEndPtr;
1855
1856	/*
1857	* Fast path for the common case that we need to access again the same
1858	* page as last time.
1859	*/
1860	if (ptr / XLOG_BLCKSZ == cachedPage)
1861	{
1862	Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
1863	Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
1864	return cachedPos + ptr % XLOG_BLCKSZ;
1865	}
1866
1867	/*
1868	* The XLog buffer cache is organized so that a page is always loaded to a
1869	* particular buffer. That way we can easily calculate the buffer a given
1870	* page must be loaded into, from the XLogRecPtr alone.
1871	*/
1872	idx = XLogRecPtrToBufIdx(ptr);
1873
1874	/*
1875	* See what page is loaded in the buffer at the moment. It could be the
1876	* page we're looking for, or something older. It can't be anything newer
1877	* - that would imply the page we're looking for has already been written
1878	* out to disk and evicted, and the caller is responsible for making sure
1879	* that doesn't happen.
1880	*
1881	* However, we don't hold a lock while we read the value. If someone has
1882	* just initialized the page, it's possible that we get a "torn read" of
1883	* the XLogRecPtr if 64-bit fetches are not atomic on this platform. In
1884	* that case we will see a bogus value. That's ok, we'll grab the mapping
1885	* lock (in AdvanceXLInsertBuffer) and retry if we see anything else than
1886	* the page we're looking for. But it means that when we do this unlocked
1887	* read, we might see a value that appears to be ahead of the page we're
1888	* looking for. Don't PANIC on that, until we've verified the value while
1889	* holding the lock.
1890	*/
1891	expectedEndPtr = ptr;
1892	expectedEndPtr += XLOG_BLCKSZ - ptr % XLOG_BLCKSZ;
1893
1894	endptr = XLogCtl->xlblocks[idx];
1895	if (expectedEndPtr != endptr)
1896	{
1897	XLogRecPtr initializedUpto;
1898
1899	/*
1900	* Before calling AdvanceXLInsertBuffer(), which can block, let others
1901	* know how far we're finished with inserting the record.
1902	*
1903	* NB: If 'ptr' points to just after the page header, advertise a
1904	* position at the beginning of the page rather than 'ptr' itself. If
1905	* there are no other insertions running, someone might try to flush
1906	* up to our advertised location. If we advertised a position after
1907	* the page header, someone might try to flush the page header, even
1908	* though page might actually not be initialized yet. As the first
1909	* inserter on the page, we are effectively responsible for making
1910	* sure that it's initialized, before we let insertingAt to move past
1911	* the page header.
1912	*/
1913	if (ptr % XLOG_BLCKSZ == SizeOfXLogShortPHD &&
1914	XLogSegmentOffset(ptr, wal_segment_size) > XLOG_BLCKSZ)
1915	initializedUpto = ptr - SizeOfXLogShortPHD;
1916	else if (ptr % XLOG_BLCKSZ == SizeOfXLogLongPHD &&
1917	XLogSegmentOffset(ptr, wal_segment_size) < XLOG_BLCKSZ)
1918	initializedUpto = ptr - SizeOfXLogLongPHD;
1919	else
1920	initializedUpto = ptr;
1921
1922	WALInsertLockUpdateInsertingAt(initializedUpto);
1923
1924	AdvanceXLInsertBuffer(ptr, false);
1925	endptr = XLogCtl->xlblocks[idx];
1926
1927	if (expectedEndPtr != endptr)
1928	elog(PANIC, "could not find WAL buffer for %X/%X",
1929	(uint32) (ptr >> `32`), (uint32) ptr);
1930	}
1931	else
1932	{
1933	/*
1934	* Make sure the initialization of the page is visible to us, and
1935	* won't arrive later to overwrite the WAL data we write on the page.
1936	*/
1937	pg_memory_barrier();
1938	}
1939
1940	/*
1941	* Found the buffer holding this page. Return a pointer to the right
1942	* offset within the page.
1943	*/
1944	cachedPage = ptr / XLOG_BLCKSZ;
1945	cachedPos = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ;
1946
1947	Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
1948	Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
1949
1950	return cachedPos + ptr % XLOG_BLCKSZ;
1951	}
1952
1953	/*
1954	* Converts a "usable byte position" to XLogRecPtr. A usable byte position
1955	* is the position starting from the beginning of WAL, excluding all WAL
1956	* page headers.
1957	*/
1958	static XLogRecPtr
1959	XLogBytePosToRecPtr(uint64 bytepos)
1960	{
1961	uint64 fullsegs;
1962	uint64 fullpages;
1963	uint64 bytesleft;
1964	uint32 seg_offset;
1965	XLogRecPtr result;
1966
1967	fullsegs = bytepos / UsableBytesInSegment;
1968	bytesleft = bytepos % UsableBytesInSegment;
1969
1970	if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
1971	{
1972	/ fits on first page of segment /
1973	seg_offset = bytesleft + SizeOfXLogLongPHD;
1974	}
1975	else
1976	{
1977	/ account for the first page on segment with long header /
1978	seg_offset = XLOG_BLCKSZ;
1979	bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
1980
1981	fullpages = bytesleft / UsableBytesInPage;
1982	bytesleft = bytesleft % UsableBytesInPage;
1983
1984	seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
1985	}
1986
1987	XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result);
1988
1989	return result;
1990	}
1991
1992	/*
1993	* Like XLogBytePosToRecPtr, but if the position is at a page boundary,
1994	* returns a pointer to the beginning of the page (ie. before page header),
1995	* not to where the first xlog record on that page would go to. This is used
1996	* when converting a pointer to the end of a record.
1997	*/
1998	static XLogRecPtr
1999	XLogBytePosToEndRecPtr(uint64 bytepos)
2000	{
2001	uint64 fullsegs;
2002	uint64 fullpages;
2003	uint64 bytesleft;
2004	uint32 seg_offset;
2005	XLogRecPtr result;
2006
2007	fullsegs = bytepos / UsableBytesInSegment;
2008	bytesleft = bytepos % UsableBytesInSegment;
2009
2010	if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
2011	{
2012	/ fits on first page of segment /
2013	if (bytesleft == `0`)
2014	seg_offset = `0`;
2015	else
2016	seg_offset = bytesleft + SizeOfXLogLongPHD;
2017	}
2018	else
2019	{
2020	/ account for the first page on segment with long header /
2021	seg_offset = XLOG_BLCKSZ;
2022	bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
2023
2024	fullpages = bytesleft / UsableBytesInPage;
2025	bytesleft = bytesleft % UsableBytesInPage;
2026
2027	if (bytesleft == `0`)
2028	seg_offset += fullpages * XLOG_BLCKSZ + bytesleft;
2029	else
2030	seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
2031	}
2032
2033	XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result);
2034
2035	return result;
2036	}
2037
2038	/*
2039	* Convert an XLogRecPtr to a "usable byte position".
2040	*/
2041	static uint64
2042	XLogRecPtrToBytePos(XLogRecPtr ptr)
2043	{
2044	uint64 fullsegs;
2045	uint32 fullpages;
2046	uint32 offset;
2047	uint64 result;
2048
2049	XLByteToSeg(ptr, fullsegs, wal_segment_size);
2050
2051	fullpages = (XLogSegmentOffset(ptr, wal_segment_size)) / XLOG_BLCKSZ;
2052	offset = ptr % XLOG_BLCKSZ;
2053
2054	if (fullpages == `0`)
2055	{
2056	result = fullsegs * UsableBytesInSegment;
2057	if (offset > `0`)
2058	{
2059	Assert(offset >= SizeOfXLogLongPHD);
2060	result += offset - SizeOfXLogLongPHD;
2061	}
2062	}
2063	else
2064	{
2065	result = fullsegs * UsableBytesInSegment +
2066	(XLOG_BLCKSZ - SizeOfXLogLongPHD) + / account for first page /
2067	(fullpages - `1`) * UsableBytesInPage; / full pages /
2068	if (offset > `0`)
2069	{
2070	Assert(offset >= SizeOfXLogShortPHD);
2071	result += offset - SizeOfXLogShortPHD;
2072	}
2073	}
2074
2075	return result;
2076	}
2077
2078	/*
2079	* Initialize XLOG buffers, writing out old buffers if they still contain
2080	* unwritten data, upto the page containing 'upto'. Or if 'opportunistic' is
2081	* true, initialize as many pages as we can without having to write out
2082	* unwritten data. Any new pages are initialized to zeros, with pages headers
2083	* initialized properly.
2084	*/
2085	static void
2086	AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic)
2087	{
2088	XLogCtlInsert *Insert = &XLogCtl->Insert;
2089	int nextidx;
2090	XLogRecPtr OldPageRqstPtr;
2091	XLogwrtRqst WriteRqst;
2092	XLogRecPtr NewPageEndPtr = InvalidXLogRecPtr;
2093	XLogRecPtr NewPageBeginPtr;
2094	XLogPageHeader NewPage;
2095	int npages = `0`;
2096
2097	LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
2098
2099	/*
2100	* Now that we have the lock, check if someone initialized the page
2101	* already.
2102	*/
2103	while (upto >= XLogCtl->InitializedUpTo \|\| opportunistic)
2104	{
2105	nextidx = XLogRecPtrToBufIdx(XLogCtl->InitializedUpTo);
2106
2107	/*
2108	* Get ending-offset of the buffer page we need to replace (this may
2109	* be zero if the buffer hasn't been used yet). Fall through if it's
2110	* already written out.
2111	*/
2112	OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
2113	if (LogwrtResult.Write < OldPageRqstPtr)
2114	{
2115	/*
2116	* Nope, got work to do. If we just want to pre-initialize as much
2117	* as we can without flushing, give up now.
2118	*/
2119	if (opportunistic)
2120	break;
2121
2122	/ Before waiting, get info_lck and update LogwrtResult /
2123	SpinLockAcquire(&XLogCtl->info_lck);
2124	if (XLogCtl->LogwrtRqst.Write < OldPageRqstPtr)
2125	XLogCtl->LogwrtRqst.Write = OldPageRqstPtr;
2126	LogwrtResult = XLogCtl->LogwrtResult;
2127	SpinLockRelease(&XLogCtl->info_lck);
2128
2129	/*
2130	* Now that we have an up-to-date LogwrtResult value, see if we
2131	* still need to write it or if someone else already did.
2132	*/
2133	if (LogwrtResult.Write < OldPageRqstPtr)
2134	{
2135	/*
2136	* Must acquire write lock. Release WALBufMappingLock first,
2137	* to make sure that all insertions that we need to wait for
2138	* can finish (up to this same position). Otherwise we risk
2139	* deadlock.
2140	*/
2141	LWLockRelease(WALBufMappingLock);
2142
2143	WaitXLogInsertionsToFinish(OldPageRqstPtr);
2144
2145	LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
2146
2147	LogwrtResult = XLogCtl->LogwrtResult;
2148	if (LogwrtResult.Write >= OldPageRqstPtr)
2149	{
2150	/ OK, someone wrote it already /
2151	LWLockRelease(WALWriteLock);
2152	}
2153	else
2154	{
2155	/ Have to write it ourselves /
2156	TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
2157	WriteRqst.Write = OldPageRqstPtr;
2158	WriteRqst.Flush = `0`;
2159	XLogWrite(WriteRqst, false);
2160	LWLockRelease(WALWriteLock);
2161	TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
2162	}
2163	/ Re-acquire WALBufMappingLock and retry /
2164	LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
2165	continue;
2166	}
2167	}
2168
2169	/*
2170	* Now the next buffer slot is free and we can set it up to be the
2171	* next output page.
2172	*/
2173	NewPageBeginPtr = XLogCtl->InitializedUpTo;
2174	NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;
2175
2176	Assert(XLogRecPtrToBufIdx(NewPageBeginPtr) == nextidx);
2177
2178	NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
2179
2180	/*
2181	* Be sure to re-zero the buffer so that bytes beyond what we've
2182	* written will look like zeroes and not valid XLOG records...
2183	*/
2184	MemSet((char *) NewPage, `0`, XLOG_BLCKSZ);
2185
2186	/*
2187	* Fill the new page's header
2188	*/
2189	NewPage->xlp_magic = XLOG_PAGE_MAGIC;
2190
2191	/ NewPage->xlp_info = 0; / / done by memset /
2192	NewPage->xlp_tli = ThisTimeLineID;
2193	NewPage->xlp_pageaddr = NewPageBeginPtr;
2194
2195	/ NewPage->xlp_rem_len = 0; / / done by memset /
2196
2197	/*
2198	* If online backup is not in progress, mark the header to indicate
2199	* that WAL records beginning in this page have removable backup
2200	* blocks. This allows the WAL archiver to know whether it is safe to
2201	* compress archived WAL data by transforming full-block records into
2202	* the non-full-block format. It is sufficient to record this at the
2203	* page level because we force a page switch (in fact a segment
2204	* switch) when starting a backup, so the flag will be off before any
2205	* records can be written during the backup. At the end of a backup,
2206	* the last page will be marked as all unsafe when perhaps only part
2207	* is unsafe, but at worst the archiver would miss the opportunity to
2208	* compress a few records.
2209	*/
2210	if (!Insert->forcePageWrites)
2211	NewPage->xlp_info \|= XLP_BKP_REMOVABLE;
2212
2213	/*
2214	* If first page of an XLOG segment file, make it a long header.
2215	*/
2216	if ((XLogSegmentOffset(NewPage->xlp_pageaddr, wal_segment_size)) == `0`)
2217	{
2218	XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
2219
2220	NewLongPage->xlp_sysid = ControlFile->system_identifier;
2221	NewLongPage->xlp_seg_size = wal_segment_size;
2222	NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
2223	NewPage->xlp_info \|= XLP_LONG_HEADER;
2224	}
2225
2226	/*
2227	* Make sure the initialization of the page becomes visible to others
2228	* before the xlblocks update. GetXLogBuffer() reads xlblocks without
2229	* holding a lock.
2230	*/
2231	pg_write_barrier();
2232
2233	((volatile* XLogRecPtr *) &XLogCtl->xlblocks[nextidx]) = NewPageEndPtr;
2234
2235	XLogCtl->InitializedUpTo = NewPageEndPtr;
2236
2237	npages++;
2238	}
2239	LWLockRelease(WALBufMappingLock);
2240
2241	#ifdef WAL_DEBUG
2242	if (XLOG_DEBUG && npages > `0`)
2243	{
2244	elog(DEBUG1, "initialized %d pages, up to %X/%X",
2245	npages, (uint32) (NewPageEndPtr >> `32`), (uint32) NewPageEndPtr);
2246	}
2247	#endif
2248	}
2249
2250	/*
2251	* Calculate CheckPointSegments based on max_wal_size_mb and
2252	* checkpoint_completion_target.
2253	*/
2254	static void
2255	CalculateCheckpointSegments(void)
2256	{
2257	double target;
2258
2259	/-------*
2260	* Calculate the distance at which to trigger a checkpoint, to avoid
2261	* exceeding max_wal_size_mb. This is based on two assumptions:
2262	*
2263	* a) we keep WAL for only one checkpoint cycle (prior to PG11 we kept
2264	* WAL for two checkpoint cycles to allow us to recover from the
2265	* secondary checkpoint if the first checkpoint failed, though we
2266	* only did this on the master anyway, not on standby. Keeping just
2267	* one checkpoint simplifies processing and reduces disk space in
2268	* many smaller databases.)
2269	* b) during checkpoint, we consume checkpoint_completion_target *
2270	* number of segments consumed between checkpoints.
2271	*-------
2272	*/
2273	target = (double) ConvertToXSegs(max_wal_size_mb, wal_segment_size) /
2274	(`1.0` + CheckPointCompletionTarget);
2275
2276	/ round down /
2277	CheckPointSegments = (int) target;
2278
2279	if (CheckPointSegments < `1`)
2280	CheckPointSegments = `1`;
2281	}
2282
2283	void
2284	assign_max_wal_size(int newval, void *extra)
2285	{
2286	max_wal_size_mb = newval;
2287	CalculateCheckpointSegments();
2288	}
2289
2290	void
2291	assign_checkpoint_completion_target(double newval, void *extra)
2292	{
2293	CheckPointCompletionTarget = newval;
2294	CalculateCheckpointSegments();
2295	}
2296
2297	/*
2298	* At a checkpoint, how many WAL segments to recycle as preallocated future
2299	* XLOG segments? Returns the highest segment that should be preallocated.
2300	*/
2301	static XLogSegNo
2302	XLOGfileslop(XLogRecPtr RedoRecPtr)
2303	{
2304	XLogSegNo minSegNo;
2305	XLogSegNo maxSegNo;
2306	double distance;
2307	XLogSegNo recycleSegNo;
2308
2309	/*
2310	* Calculate the segment numbers that min_wal_size_mb and max_wal_size_mb
2311	* correspond to. Always recycle enough segments to meet the minimum, and
2312	* remove enough segments to stay below the maximum.
2313	*/
2314	minSegNo = RedoRecPtr / wal_segment_size +
2315	ConvertToXSegs(min_wal_size_mb, wal_segment_size) - `1`;
2316	maxSegNo = RedoRecPtr / wal_segment_size +
2317	ConvertToXSegs(max_wal_size_mb, wal_segment_size) - `1`;
2318
2319	/*
2320	* Between those limits, recycle enough segments to get us through to the
2321	* estimated end of next checkpoint.
2322	*
2323	* To estimate where the next checkpoint will finish, assume that the
2324	* system runs steadily consuming CheckPointDistanceEstimate bytes between
2325	* every checkpoint.
2326	*/
2327	distance = (`1.0` + CheckPointCompletionTarget) * CheckPointDistanceEstimate;
2328	/ add 10% for good measure. /
2329	distance *= `1.10`;
2330
2331	recycleSegNo = (XLogSegNo) ceil(((double) RedoRecPtr + distance) /
2332	wal_segment_size);
2333
2334	if (recycleSegNo < minSegNo)
2335	recycleSegNo = minSegNo;
2336	if (recycleSegNo > maxSegNo)
2337	recycleSegNo = maxSegNo;
2338
2339	return recycleSegNo;
2340	}
2341
2342	/*
2343	* Check whether we've consumed enough xlog space that a checkpoint is needed.
2344	*
2345	* new_segno indicates a log file that has just been filled up (or read
2346	* during recovery). We measure the distance from RedoRecPtr to new_segno
2347	* and see if that exceeds CheckPointSegments.
2348	*
2349	* Note: it is caller's responsibility that RedoRecPtr is up-to-date.
2350	*/
2351	static bool
2352	XLogCheckpointNeeded(XLogSegNo new_segno)
2353	{
2354	XLogSegNo old_segno;
2355
2356	XLByteToSeg(RedoRecPtr, old_segno, wal_segment_size);
2357
2358	if (new_segno >= old_segno + (uint64) (CheckPointSegments - `1`))
2359	return true;
2360	return false;
2361	}
2362
2363	/*
2364	* Write and/or fsync the log at least as far as WriteRqst indicates.
2365	*
2366	* If flexible == true, we don't have to write as far as WriteRqst, but
2367	* may stop at any convenient boundary (such as a cache or logfile boundary).
2368	* This option allows us to avoid uselessly issuing multiple writes when a
2369	* single one would do.
2370	*
2371	* Must be called with WALWriteLock held. WaitXLogInsertionsToFinish(WriteRqst)
2372	* must be called before grabbing the lock, to make sure the data is ready to
2373	* write.
2374	*/
2375	static void
2376	XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
2377	{
2378	bool ispartialpage;
2379	bool last_iteration;
2380	bool finishing_seg;
2381	bool use_existent;
2382	int curridx;
2383	int npages;
2384	int startidx;
2385	uint32 startoffset;
2386
2387	/ We should always be inside a critical section here /
2388	Assert(CritSectionCount > `0`);
2389
2390	/*
2391	* Update local LogwrtResult (caller probably did this already, but...)
2392	*/
2393	LogwrtResult = XLogCtl->LogwrtResult;
2394
2395	/*
2396	* Since successive pages in the xlog cache are consecutively allocated,
2397	* we can usually gather multiple pages together and issue just one
2398	* write() call. npages is the number of pages we have determined can be
2399	* written together; startidx is the cache block index of the first one,
2400	* and startoffset is the file offset at which it should go. The latter
2401	* two variables are only valid when npages > 0, but we must initialize
2402	* all of them to keep the compiler quiet.
2403	*/
2404	npages = `0`;
2405	startidx = `0`;
2406	startoffset = `0`;
2407
2408	/*
2409	* Within the loop, curridx is the cache block index of the page to
2410	* consider writing. Begin at the buffer containing the next unwritten
2411	* page, or last partially written page.
2412	*/
2413	curridx = XLogRecPtrToBufIdx(LogwrtResult.Write);
2414
2415	while (LogwrtResult.Write < WriteRqst.Write)
2416	{
2417	/*
2418	* Make sure we're not ahead of the insert process. This could happen
2419	* if we're passed a bogus WriteRqst.Write that is past the end of the
2420	* last page that's been initialized by AdvanceXLInsertBuffer.
2421	*/
2422	XLogRecPtr EndPtr = XLogCtl->xlblocks[curridx];
2423
2424	if (LogwrtResult.Write >= EndPtr)
2425	elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
2426	(uint32) (LogwrtResult.Write >> `32`),
2427	(uint32) LogwrtResult.Write,
2428	(uint32) (EndPtr >> `32`), (uint32) EndPtr);
2429
2430	/ Advance LogwrtResult.Write to end of current buffer page /
2431	LogwrtResult.Write = EndPtr;
2432	ispartialpage = WriteRqst.Write < LogwrtResult.Write;
2433
2434	if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
2435	wal_segment_size))
2436	{
2437	/*
2438	* Switch to new logfile segment. We cannot have any pending
2439	* pages here (since we dump what we have at segment end).
2440	*/
2441	Assert(npages == `0`);
2442	if (openLogFile >= `0`)
2443	XLogFileClose();
2444	XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
2445	wal_segment_size);
2446
2447	/ create/use new log file /
2448	use_existent = true;
2449	openLogFile = XLogFileInit(openLogSegNo, &use_existent, true);
2450	}
2451
2452	/ Make sure we have the current logfile open /
2453	if (openLogFile < `0`)
2454	{
2455	XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
2456	wal_segment_size);
2457	openLogFile = XLogFileOpen(openLogSegNo);
2458	}
2459
2460	/ Add current page to the set of pending pages-to-dump /
2461	if (npages == `0`)
2462	{
2463	/ first of group /
2464	startidx = curridx;
2465	startoffset = XLogSegmentOffset(LogwrtResult.Write - XLOG_BLCKSZ,
2466	wal_segment_size);
2467	}
2468	npages++;
2469
2470	/*
2471	* Dump the set if this will be the last loop iteration, or if we are
2472	* at the last page of the cache area (since the next page won't be
2473	* contiguous in memory), or if we are at the end of the logfile
2474	* segment.
2475	*/
2476	last_iteration = WriteRqst.Write <= LogwrtResult.Write;
2477
2478	finishing_seg = !ispartialpage &&
2479	(startoffset + npages * XLOG_BLCKSZ) >= wal_segment_size;
2480
2481	if (last_iteration \|\|
2482	curridx == XLogCtl->XLogCacheBlck \|\|
2483	finishing_seg)
2484	{
2485	char *from;
2486	Size nbytes;
2487	Size nleft;
2488	int written;
2489
2490	/ OK to write the page(s) /
2491	from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
2492	nbytes = npages * (Size) XLOG_BLCKSZ;
2493	nleft = nbytes;
2494	do
2495	{
2496	errno = `0`;
2497	pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE);
2498	written = pg_pwrite(openLogFile, from, nleft, startoffset);
2499	pgstat_report_wait_end();
2500	if (written <= `0`)
2501	{
2502	if (errno == EINTR)
2503	continue;
2504	ereport(PANIC,
2505	(errcode_for_file_access(),
2506	errmsg("could not write to log file %s "
2507	"at offset %u, length %zu: %m",
2508	XLogFileNameP(ThisTimeLineID, openLogSegNo),
2509	startoffset, nleft)));
2510	}
2511	nleft -= written;
2512	from += written;
2513	startoffset += written;
2514	} while (nleft > `0`);
2515
2516	npages = `0`;
2517
2518	/*
2519	* If we just wrote the whole last page of a logfile segment,
2520	* fsync the segment immediately. This avoids having to go back
2521	* and re-open prior segments when an fsync request comes along
2522	* later. Doing it here ensures that one and only one backend will
2523	* perform this fsync.
2524	*
2525	* This is also the right place to notify the Archiver that the
2526	* segment is ready to copy to archival storage, and to update the
2527	* timer for archive_timeout, and to signal for a checkpoint if
2528	* too many logfile segments have been used since the last
2529	* checkpoint.
2530	*/
2531	if (finishing_seg)
2532	{
2533	issue_xlog_fsync(openLogFile, openLogSegNo);
2534
2535	/ signal that we need to wakeup walsenders later /
2536	WalSndWakeupRequest();
2537
2538	LogwrtResult.Flush = LogwrtResult.Write; / end of page /
2539
2540	if (XLogArchivingActive())
2541	XLogArchiveNotifySeg(openLogSegNo);
2542
2543	XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
2544	XLogCtl->lastSegSwitchLSN = LogwrtResult.Flush;
2545
2546	/*
2547	* Request a checkpoint if we've consumed too much xlog since
2548	* the last one. For speed, we first check using the local
2549	* copy of RedoRecPtr, which might be out of date; if it looks
2550	* like a checkpoint is needed, forcibly update RedoRecPtr and
2551	* recheck.
2552	*/
2553	if (IsUnderPostmaster && XLogCheckpointNeeded(openLogSegNo))
2554	{
2555	(void) GetRedoRecPtr();
2556	if (XLogCheckpointNeeded(openLogSegNo))
2557	RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
2558	}
2559	}
2560	}
2561
2562	if (ispartialpage)
2563	{
2564	/ Only asked to write a partial page /
2565	LogwrtResult.Write = WriteRqst.Write;
2566	break;
2567	}
2568	curridx = NextBufIdx(curridx);
2569
2570	/ If flexible, break out of loop as soon as we wrote something /
2571	if (flexible && npages == `0`)
2572	break;
2573	}
2574
2575	Assert(npages == `0`);
2576
2577	/*
2578	* If asked to flush, do so
2579	*/
2580	if (LogwrtResult.Flush < WriteRqst.Flush &&
2581	LogwrtResult.Flush < LogwrtResult.Write)
2582
2583	{
2584	/*
2585	* Could get here without iterating above loop, in which case we might
2586	* have no open file or the wrong one. However, we do not need to
2587	* fsync more than one file.
2588	*/
2589	if (sync_method != SYNC_METHOD_OPEN &&
2590	sync_method != SYNC_METHOD_OPEN_DSYNC)
2591	{
2592	if (openLogFile >= `0` &&
2593	!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
2594	wal_segment_size))
2595	XLogFileClose();
2596	if (openLogFile < `0`)
2597	{
2598	XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
2599	wal_segment_size);
2600	openLogFile = XLogFileOpen(openLogSegNo);
2601	}
2602
2603	issue_xlog_fsync(openLogFile, openLogSegNo);
2604	}
2605
2606	/ signal that we need to wakeup walsenders later /
2607	WalSndWakeupRequest();
2608
2609	LogwrtResult.Flush = LogwrtResult.Write;
2610	}
2611
2612	/*
2613	* Update shared-memory status
2614	*
2615	* We make sure that the shared 'request' values do not fall behind the
2616	* 'result' values. This is not absolutely essential, but it saves some
2617	* code in a couple of places.
2618	*/
2619	{
2620	SpinLockAcquire(&XLogCtl->info_lck);
2621	XLogCtl->LogwrtResult = LogwrtResult;
2622	if (XLogCtl->LogwrtRqst.Write < LogwrtResult.Write)
2623	XLogCtl->LogwrtRqst.Write = LogwrtResult.Write;
2624	if (XLogCtl->LogwrtRqst.Flush < LogwrtResult.Flush)
2625	XLogCtl->LogwrtRqst.Flush = LogwrtResult.Flush;
2626	SpinLockRelease(&XLogCtl->info_lck);
2627	}
2628	}
2629
2630	/*
2631	* Record the LSN for an asynchronous transaction commit/abort
2632	* and nudge the WALWriter if there is work for it to do.
2633	* (This should not be called for synchronous commits.)
2634	*/
2635	void
2636	XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
2637	{
2638	XLogRecPtr WriteRqstPtr = asyncXactLSN;
2639	bool sleeping;
2640
2641	SpinLockAcquire(&XLogCtl->info_lck);
2642	LogwrtResult = XLogCtl->LogwrtResult;
2643	sleeping = XLogCtl->WalWriterSleeping;
2644	if (XLogCtl->asyncXactLSN < asyncXactLSN)
2645	XLogCtl->asyncXactLSN = asyncXactLSN;
2646	SpinLockRelease(&XLogCtl->info_lck);
2647
2648	/*
2649	* If the WALWriter is sleeping, we should kick it to make it come out of
2650	* low-power mode. Otherwise, determine whether there's a full page of
2651	* WAL available to write.
2652	*/
2653	if (!sleeping)
2654	{
2655	/ back off to last completed page boundary /
2656	WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ;
2657
2658	/ if we have already flushed that far, we're done /
2659	if (WriteRqstPtr <= LogwrtResult.Flush)
2660	return;
2661	}
2662
2663	/*
2664	* Nudge the WALWriter: it has a full page of WAL to write, or we want it
2665	* to come out of low-power mode so that this async commit will reach disk
2666	* within the expected amount of time.
2667	*/
2668	if (ProcGlobal->walwriterLatch)
2669	SetLatch(ProcGlobal->walwriterLatch);
2670	}
2671
2672	/*
2673	* Record the LSN up to which we can remove WAL because it's not required by
2674	* any replication slot.
2675	*/
2676	void
2677	XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn)
2678	{
2679	SpinLockAcquire(&XLogCtl->info_lck);
2680	XLogCtl->replicationSlotMinLSN = lsn;
2681	SpinLockRelease(&XLogCtl->info_lck);
2682	}
2683
2684
2685	/*
2686	* Return the oldest LSN we must retain to satisfy the needs of some
2687	* replication slot.
2688	*/
2689	static XLogRecPtr
2690	XLogGetReplicationSlotMinimumLSN(void)
2691	{
2692	XLogRecPtr retval;
2693
2694	SpinLockAcquire(&XLogCtl->info_lck);
2695	retval = XLogCtl->replicationSlotMinLSN;
2696	SpinLockRelease(&XLogCtl->info_lck);
2697
2698	return retval;
2699	}
2700
2701	/*
2702	* Advance minRecoveryPoint in control file.
2703	*
2704	* If we crash during recovery, we must reach this point again before the
2705	* database is consistent.
2706	*
2707	* If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
2708	* is only updated if it's not already greater than or equal to 'lsn'.
2709	*/
2710	static void
2711	UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
2712	{
2713	/ Quick check using our local copy of the variable /
2714	if (!updateMinRecoveryPoint \|\| (!force && lsn <= minRecoveryPoint))
2715	return;
2716
2717	/*
2718	* An invalid minRecoveryPoint means that we need to recover all the WAL,
2719	* i.e., we're doing crash recovery. We never modify the control file's
2720	* value in that case, so we can short-circuit future checks here too. The
2721	* local values of minRecoveryPoint and minRecoveryPointTLI should not be
2722	* updated until crash recovery finishes. We only do this for the startup
2723	* process as it should not update its own reference of minRecoveryPoint
2724	* until it has finished crash recovery to make sure that all WAL
2725	* available is replayed in this case. This also saves from extra locks
2726	* taken on the control file from the startup process.
2727	*/
2728	if (XLogRecPtrIsInvalid(minRecoveryPoint) && InRecovery)
2729	{
2730	updateMinRecoveryPoint = false;
2731	return;
2732	}
2733
2734	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
2735
2736	/ update local copy /
2737	minRecoveryPoint = ControlFile->minRecoveryPoint;
2738	minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
2739
2740	if (XLogRecPtrIsInvalid(minRecoveryPoint))
2741	updateMinRecoveryPoint = false;
2742	else if (force \|\| minRecoveryPoint < lsn)
2743	{
2744	XLogRecPtr newMinRecoveryPoint;
2745	TimeLineID newMinRecoveryPointTLI;
2746
2747	/*
2748	* To avoid having to update the control file too often, we update it
2749	* all the way to the last record being replayed, even though 'lsn'
2750	* would suffice for correctness. This also allows the 'force' case
2751	* to not need a valid 'lsn' value.
2752	*
2753	* Another important reason for doing it this way is that the passed
2754	* 'lsn' value could be bogus, i.e., past the end of available WAL, if
2755	* the caller got it from a corrupted heap page. Accepting such a
2756	* value as the min recovery point would prevent us from coming up at
2757	* all. Instead, we just log a warning and continue with recovery.
2758	* (See also the comments about corrupt LSNs in XLogFlush.)
2759	*/
2760	SpinLockAcquire(&XLogCtl->info_lck);
2761	newMinRecoveryPoint = XLogCtl->replayEndRecPtr;
2762	newMinRecoveryPointTLI = XLogCtl->replayEndTLI;
2763	SpinLockRelease(&XLogCtl->info_lck);
2764
2765	if (!force && newMinRecoveryPoint < lsn)
2766	elog(WARNING,
2767	"xlog min recovery request %X/%X is past current point %X/%X",
2768	(uint32) (lsn >> `32`), (uint32) lsn,
2769	(uint32) (newMinRecoveryPoint >> `32`),
2770	(uint32) newMinRecoveryPoint);
2771
2772	/ update control file /
2773	if (ControlFile->minRecoveryPoint < newMinRecoveryPoint)
2774	{
2775	ControlFile->minRecoveryPoint = newMinRecoveryPoint;
2776	ControlFile->minRecoveryPointTLI = newMinRecoveryPointTLI;
2777	UpdateControlFile();
2778	minRecoveryPoint = newMinRecoveryPoint;
2779	minRecoveryPointTLI = newMinRecoveryPointTLI;
2780
2781	ereport(DEBUG2,
2782	(errmsg("updated min recovery point to %X/%X on timeline %u",
2783	(uint32) (minRecoveryPoint >> `32`),
2784	(uint32) minRecoveryPoint,
2785	newMinRecoveryPointTLI)));
2786	}
2787	}
2788	LWLockRelease(ControlFileLock);
2789	}
2790
2791	/*
2792	* Ensure that all XLOG data through the given position is flushed to disk.
2793	*
2794	* NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
2795	* already held, and we try to avoid acquiring it if possible.
2796	*/
2797	void
2798	XLogFlush(XLogRecPtr record)
2799	{
2800	XLogRecPtr WriteRqstPtr;
2801	XLogwrtRqst WriteRqst;
2802
2803	/*
2804	* During REDO, we are reading not writing WAL. Therefore, instead of
2805	* trying to flush the WAL, we should update minRecoveryPoint instead. We
2806	* test XLogInsertAllowed(), not InRecovery, because we need checkpointer
2807	* to act this way too, and because when it tries to write the
2808	* end-of-recovery checkpoint, it should indeed flush.
2809	*/
2810	if (!XLogInsertAllowed())
2811	{
2812	UpdateMinRecoveryPoint(record, false);
2813	return;
2814	}
2815
2816	/ Quick exit if already known flushed /
2817	if (record <= LogwrtResult.Flush)
2818	return;
2819
2820	#ifdef WAL_DEBUG
2821	if (XLOG_DEBUG)
2822	elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
2823	(uint32) (record >> `32`), (uint32) record,
2824	(uint32) (LogwrtResult.Write >> `32`), (uint32) LogwrtResult.Write,
2825	(uint32) (LogwrtResult.Flush >> `32`), (uint32) LogwrtResult.Flush);
2826	#endif
2827
2828	START_CRIT_SECTION();
2829
2830	/*
2831	* Since fsync is usually a horribly expensive operation, we try to
2832	* piggyback as much data as we can on each fsync: if we see any more data
2833	* entered into the xlog buffer, we'll write and fsync that too, so that
2834	* the final value of LogwrtResult.Flush is as large as possible. This
2835	* gives us some chance of avoiding another fsync immediately after.
2836	*/
2837
2838	/ initialize to given target; may increase below /
2839	WriteRqstPtr = record;
2840
2841	/*
2842	* Now wait until we get the write lock, or someone else does the flush
2843	* for us.
2844	*/
2845	for (;;)
2846	{
2847	XLogRecPtr insertpos;
2848
2849	/ read LogwrtResult and update local state /
2850	SpinLockAcquire(&XLogCtl->info_lck);
2851	if (WriteRqstPtr < XLogCtl->LogwrtRqst.Write)
2852	WriteRqstPtr = XLogCtl->LogwrtRqst.Write;
2853	LogwrtResult = XLogCtl->LogwrtResult;
2854	SpinLockRelease(&XLogCtl->info_lck);
2855
2856	/ done already? /
2857	if (record <= LogwrtResult.Flush)
2858	break;
2859
2860	/*
2861	* Before actually performing the write, wait for all in-flight
2862	* insertions to the pages we're about to write to finish.
2863	*/
2864	insertpos = WaitXLogInsertionsToFinish(WriteRqstPtr);
2865
2866	/*
2867	* Try to get the write lock. If we can't get it immediately, wait
2868	* until it's released, and recheck if we still need to do the flush
2869	* or if the backend that held the lock did it for us already. This
2870	* helps to maintain a good rate of group committing when the system
2871	* is bottlenecked by the speed of fsyncing.
2872	*/
2873	if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE))
2874	{
2875	/*
2876	* The lock is now free, but we didn't acquire it yet. Before we
2877	* do, loop back to check if someone else flushed the record for
2878	* us already.
2879	*/
2880	continue;
2881	}
2882
2883	/ Got the lock; recheck whether request is satisfied /
2884	LogwrtResult = XLogCtl->LogwrtResult;
2885	if (record <= LogwrtResult.Flush)
2886	{
2887	LWLockRelease(WALWriteLock);
2888	break;
2889	}
2890
2891	/*
2892	* Sleep before flush! By adding a delay here, we may give further
2893	* backends the opportunity to join the backlog of group commit
2894	* followers; this can significantly improve transaction throughput,
2895	* at the risk of increasing transaction latency.
2896	*
2897	* We do not sleep if enableFsync is not turned on, nor if there are
2898	* fewer than CommitSiblings other backends with active transactions.
2899	*/
2900	if (CommitDelay > `0` && enableFsync &&
2901	MinimumActiveBackends(CommitSiblings))
2902	{
2903	pg_usleep(CommitDelay);
2904
2905	/*
2906	* Re-check how far we can now flush the WAL. It's generally not
2907	* safe to call WaitXLogInsertionsToFinish while holding
2908	* WALWriteLock, because an in-progress insertion might need to
2909	* also grab WALWriteLock to make progress. But we know that all
2910	* the insertions up to insertpos have already finished, because
2911	* that's what the earlier WaitXLogInsertionsToFinish() returned.
2912	* We're only calling it again to allow insertpos to be moved
2913	* further forward, not to actually wait for anyone.
2914	*/
2915	insertpos = WaitXLogInsertionsToFinish(insertpos);
2916	}
2917
2918	/ try to write/flush later additions to XLOG as well /
2919	WriteRqst.Write = insertpos;
2920	WriteRqst.Flush = insertpos;
2921
2922	XLogWrite(WriteRqst, false);
2923
2924	LWLockRelease(WALWriteLock);
2925	/ done /
2926	break;
2927	}
2928
2929	END_CRIT_SECTION();
2930
2931	/ wake up walsenders now that we've released heavily contended locks /
2932	WalSndWakeupProcessRequests();
2933
2934	/*
2935	* If we still haven't flushed to the request point then we have a
2936	* problem; most likely, the requested flush point is past end of XLOG.
2937	* This has been seen to occur when a disk page has a corrupted LSN.
2938	*
2939	* Formerly we treated this as a PANIC condition, but that hurts the
2940	* system's robustness rather than helping it: we do not want to take down
2941	* the whole system due to corruption on one data page. In particular, if
2942	* the bad page is encountered again during recovery then we would be
2943	* unable to restart the database at all! (This scenario actually
2944	* happened in the field several times with 7.1 releases.) As of 8.4, bad
2945	* LSNs encountered during recovery are UpdateMinRecoveryPoint's problem;
2946	* the only time we can reach here during recovery is while flushing the
2947	* end-of-recovery checkpoint record, and we don't expect that to have a
2948	* bad LSN.
2949	*
2950	* Note that for calls from xact.c, the ERROR will be promoted to PANIC
2951	* since xact.c calls this routine inside a critical section. However,
2952	* calls from bufmgr.c are not within critical sections and so we will not
2953	* force a restart for a bad LSN on a data page.
2954	*/
2955	if (LogwrtResult.Flush < record)
2956	elog(ERROR,
2957	"xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
2958	(uint32) (record >> `32`), (uint32) record,
2959	(uint32) (LogwrtResult.Flush >> `32`), (uint32) LogwrtResult.Flush);
2960	}
2961
2962	/*
2963	* Write & flush xlog, but without specifying exactly where to.
2964	*
2965	* We normally write only completed blocks; but if there is nothing to do on
2966	* that basis, we check for unwritten async commits in the current incomplete
2967	* block, and write through the latest one of those. Thus, if async commits
2968	* are not being used, we will write complete blocks only.
2969	*
2970	* If, based on the above, there's anything to write we do so immediately. But
2971	* to avoid calling fsync, fdatasync et. al. at a rate that'd impact
2972	* concurrent IO, we only flush WAL every wal_writer_delay ms, or if there's
2973	* more than wal_writer_flush_after unflushed blocks.
2974	*
2975	* We can guarantee that async commits reach disk after at most three
2976	* wal_writer_delay cycles. (When flushing complete blocks, we allow XLogWrite
2977	* to write "flexibly", meaning it can stop at the end of the buffer ring;
2978	* this makes a difference only with very high load or long wal_writer_delay,
2979	* but imposes one extra cycle for the worst case for async commits.)
2980	*
2981	* This routine is invoked periodically by the background walwriter process.
2982	*
2983	* Returns true if there was any work to do, even if we skipped flushing due
2984	* to wal_writer_delay/wal_writer_flush_after.
2985	*/
2986	bool
2987	XLogBackgroundFlush(void)
2988	{
2989	XLogwrtRqst WriteRqst;
2990	bool flexible = true;
2991	static TimestampTz lastflush;
2992	TimestampTz now;
2993	int flushbytes;
2994
2995	/ XLOG doesn't need flushing during recovery /
2996	if (RecoveryInProgress())
2997	return false;
2998
2999	/ read LogwrtResult and update local state /
3000	SpinLockAcquire(&XLogCtl->info_lck);
3001	LogwrtResult = XLogCtl->LogwrtResult;
3002	WriteRqst = XLogCtl->LogwrtRqst;
3003	SpinLockRelease(&XLogCtl->info_lck);
3004
3005	/ back off to last completed page boundary /
3006	WriteRqst.Write -= WriteRqst.Write % XLOG_BLCKSZ;
3007
3008	/ if we have already flushed that far, consider async commit records /
3009	if (WriteRqst.Write <= LogwrtResult.Flush)
3010	{
3011	SpinLockAcquire(&XLogCtl->info_lck);
3012	WriteRqst.Write = XLogCtl->asyncXactLSN;
3013	SpinLockRelease(&XLogCtl->info_lck);
3014	flexible = false; / ensure it all gets written /
3015	}
3016
3017	/*
3018	* If already known flushed, we're done. Just need to check if we are
3019	* holding an open file handle to a logfile that's no longer in use,
3020	* preventing the file from being deleted.
3021	*/
3022	if (WriteRqst.Write <= LogwrtResult.Flush)
3023	{
3024	if (openLogFile >= `0`)
3025	{
3026	if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
3027	wal_segment_size))
3028	{
3029	XLogFileClose();
3030	}
3031	}
3032	return false;
3033	}
3034
3035	/*
3036	* Determine how far to flush WAL, based on the wal_writer_delay and
3037	* wal_writer_flush_after GUCs.
3038	*/
3039	now = GetCurrentTimestamp();
3040	flushbytes =
3041	WriteRqst.Write / XLOG_BLCKSZ - LogwrtResult.Flush / XLOG_BLCKSZ;
3042
3043	if (WalWriterFlushAfter == `0` \|\| lastflush == `0`)
3044	{
3045	/ first call, or block based limits disabled /
3046	WriteRqst.Flush = WriteRqst.Write;
3047	lastflush = now;
3048	}
3049	else if (TimestampDifferenceExceeds(lastflush, now, WalWriterDelay))
3050	{
3051	/*
3052	* Flush the writes at least every WalWriteDelay ms. This is important
3053	* to bound the amount of time it takes for an asynchronous commit to
3054	* hit disk.
3055	*/
3056	WriteRqst.Flush = WriteRqst.Write;
3057	lastflush = now;
3058	}
3059	else if (flushbytes >= WalWriterFlushAfter)
3060	{
3061	/ exceeded wal_writer_flush_after blocks, flush /
3062	WriteRqst.Flush = WriteRqst.Write;
3063	lastflush = now;
3064	}
3065	else
3066	{
3067	/ no flushing, this time round /
3068	WriteRqst.Flush = `0`;
3069	}
3070
3071	#ifdef WAL_DEBUG
3072	if (XLOG_DEBUG)
3073	elog(LOG, "xlog bg flush request write %X/%X; flush: %X/%X, current is write %X/%X; flush %X/%X",
3074	(uint32) (WriteRqst.Write >> `32`), (uint32) WriteRqst.Write,
3075	(uint32) (WriteRqst.Flush >> `32`), (uint32) WriteRqst.Flush,
3076	(uint32) (LogwrtResult.Write >> `32`), (uint32) LogwrtResult.Write,
3077	(uint32) (LogwrtResult.Flush >> `32`), (uint32) LogwrtResult.Flush);
3078	#endif
3079
3080	START_CRIT_SECTION();
3081
3082	/ now wait for any in-progress insertions to finish and get write lock /
3083	WaitXLogInsertionsToFinish(WriteRqst.Write);
3084	LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
3085	LogwrtResult = XLogCtl->LogwrtResult;
3086	if (WriteRqst.Write > LogwrtResult.Write \|\|
3087	WriteRqst.Flush > LogwrtResult.Flush)
3088	{
3089	XLogWrite(WriteRqst, flexible);
3090	}
3091	LWLockRelease(WALWriteLock);
3092
3093	END_CRIT_SECTION();
3094
3095	/ wake up walsenders now that we've released heavily contended locks /
3096	WalSndWakeupProcessRequests();
3097
3098	/*
3099	* Great, done. To take some work off the critical path, try to initialize
3100	* as many of the no-longer-needed WAL buffers for future use as we can.
3101	*/
3102	AdvanceXLInsertBuffer(InvalidXLogRecPtr, true);
3103
3104	/*
3105	* If we determined that we need to write data, but somebody else
3106	* wrote/flushed already, it should be considered as being active, to
3107	* avoid hibernating too early.
3108	*/
3109	return true;
3110	}
3111
3112	/*
3113	* Test whether XLOG data has been flushed up to (at least) the given position.
3114	*
3115	* Returns true if a flush is still needed. (It may be that someone else
3116	* is already in process of flushing that far, however.)
3117	*/
3118	bool
3119	XLogNeedsFlush(XLogRecPtr record)
3120	{
3121	/*
3122	* During recovery, we don't flush WAL but update minRecoveryPoint
3123	* instead. So "needs flush" is taken to mean whether minRecoveryPoint
3124	* would need to be updated.
3125	*/
3126	if (RecoveryInProgress())
3127	{
3128	/*
3129	* An invalid minRecoveryPoint means that we need to recover all the
3130	* WAL, i.e., we're doing crash recovery. We never modify the control
3131	* file's value in that case, so we can short-circuit future checks
3132	* here too. This triggers a quick exit path for the startup process,
3133	* which cannot update its local copy of minRecoveryPoint as long as
3134	* it has not replayed all WAL available when doing crash recovery.
3135	*/
3136	if (XLogRecPtrIsInvalid(minRecoveryPoint) && InRecovery)
3137	updateMinRecoveryPoint = false;
3138
3139	/ Quick exit if already known to be updated or cannot be updated /
3140	if (record <= minRecoveryPoint \|\| !updateMinRecoveryPoint)
3141	return false;
3142
3143	/*
3144	* Update local copy of minRecoveryPoint. But if the lock is busy,
3145	* just return a conservative guess.
3146	*/
3147	if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED))
3148	return true;
3149	minRecoveryPoint = ControlFile->minRecoveryPoint;
3150	minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
3151	LWLockRelease(ControlFileLock);
3152
3153	/*
3154	* Check minRecoveryPoint for any other process than the startup
3155	* process doing crash recovery, which should not update the control
3156	* file value if crash recovery is still running.
3157	*/
3158	if (XLogRecPtrIsInvalid(minRecoveryPoint))
3159	updateMinRecoveryPoint = false;
3160
3161	/ check again /
3162	if (record <= minRecoveryPoint \|\| !updateMinRecoveryPoint)
3163	return false;
3164	else
3165	return true;
3166	}
3167
3168	/ Quick exit if already known flushed /
3169	if (record <= LogwrtResult.Flush)
3170	return false;
3171
3172	/ read LogwrtResult and update local state /
3173	SpinLockAcquire(&XLogCtl->info_lck);
3174	LogwrtResult = XLogCtl->LogwrtResult;
3175	SpinLockRelease(&XLogCtl->info_lck);
3176
3177	/ check again /
3178	if (record <= LogwrtResult.Flush)
3179	return false;
3180
3181	return true;
3182	}
3183
3184	/*
3185	* Create a new XLOG file segment, or open a pre-existing one.
3186	*
3187	* log, seg: identify segment to be created/opened.
3188	*
3189	* *use_existent: if true, OK to use a pre-existing file (else, any
3190	* pre-existing file will be deleted). On return, true if a pre-existing
3191	* file was used.
3192	*
3193	* use_lock: if true, acquire ControlFileLock while moving file into
3194	* place. This should be true except during bootstrap log creation. The
3195	* caller must not hold the lock at call.
3196	*
3197	* Returns FD of opened file.
3198	*
3199	* Note: errors here are ERROR not PANIC because we might or might not be
3200	* inside a critical section (eg, during checkpoint there is no reason to
3201	* take down the system on failure). They will promote to PANIC if we are
3202	* in a critical section.
3203	*/
3204	int
3205	XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
3206	{
3207	char path[MAXPGPATH];
3208	char tmppath[MAXPGPATH];
3209	PGAlignedXLogBlock zbuffer;
3210	XLogSegNo installed_segno;
3211	XLogSegNo max_segno;
3212	int fd;
3213	int nbytes;
3214	int save_errno;
3215
3216	XLogFilePath(path, ThisTimeLineID, logsegno, wal_segment_size);
3217
3218	/*
3219	* Try to use existent file (checkpoint maker may have created it already)
3220	*/
3221	if (*use_existent)
3222	{
3223	fd = BasicOpenFile(path, O_RDWR \| PG_BINARY \| get_sync_bit(sync_method));
3224	if (fd < `0`)
3225	{
3226	if (errno != ENOENT)
3227	ereport(ERROR,
3228	(errcode_for_file_access(),
3229	errmsg("could not open file \"%s\": %m", path)));
3230	}
3231	else
3232	return fd;
3233	}
3234
3235	/*
3236	* Initialize an empty (all zeroes) segment. NOTE: it is possible that
3237	* another process is doing the same thing. If so, we will end up
3238	* pre-creating an extra log segment. That seems OK, and better than
3239	* holding the lock throughout this lengthy process.
3240	*/
3241	elog(DEBUG2, "creating and filling new WAL file");
3242
3243	snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
3244
3245	unlink(tmppath);
3246
3247	/ do not use get_sync_bit() here --- want to fsync only at end of fill /
3248	fd = BasicOpenFile(tmppath, O_RDWR \| O_CREAT \| O_EXCL \| PG_BINARY);
3249	if (fd < `0`)
3250	ereport(ERROR,
3251	(errcode_for_file_access(),
3252	errmsg("could not create file \"%s\": %m", tmppath)));
3253
3254	memset(zbuffer.data, `0`, XLOG_BLCKSZ);
3255
3256	pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE);
3257	save_errno = `0`;
3258	if (wal_init_zero)
3259	{
3260	/*
3261	* Zero-fill the file. With this setting, we do this the hard way to
3262	* ensure that all the file space has really been allocated. On
3263	* platforms that allow "holes" in files, just seeking to the end
3264	* doesn't allocate intermediate space. This way, we know that we
3265	* have all the space and (after the fsync below) that all the
3266	* indirect blocks are down on disk. Therefore, fdatasync(2) or
3267	* O_DSYNC will be sufficient to sync future writes to the log file.
3268	*/
3269	for (nbytes = `0`; nbytes < wal_segment_size; nbytes += XLOG_BLCKSZ)
3270	{
3271	errno = `0`;
3272	if (write(fd, zbuffer.data, XLOG_BLCKSZ) != XLOG_BLCKSZ)
3273	{
3274	/ if write didn't set errno, assume no disk space /
3275	save_errno = errno ? errno : ENOSPC;
3276	break;
3277	}
3278	}
3279	}
3280	else
3281	{
3282	/*
3283	* Otherwise, seeking to the end and writing a solitary byte is
3284	* enough.
3285	*/
3286	errno = `0`;
3287	if (pg_pwrite(fd, zbuffer.data, `1`, wal_segment_size - `1`) != `1`)
3288	{
3289	/ if write didn't set errno, assume no disk space /
3290	save_errno = errno ? errno : ENOSPC;
3291	}
3292	}
3293	pgstat_report_wait_end();
3294
3295	if (save_errno)
3296	{
3297	/*
3298	* If we fail to make the file, delete it to release disk space
3299	*/
3300	unlink(tmppath);
3301
3302	close(fd);
3303
3304	errno = save_errno;
3305
3306	ereport(ERROR,
3307	(errcode_for_file_access(),
3308	errmsg("could not write to file \"%s\": %m", tmppath)));
3309	}
3310
3311	pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_SYNC);
3312	if (pg_fsync(fd) != `0`)
3313	{
3314	int save_errno = errno;
3315
3316	close(fd);
3317	errno = save_errno;
3318	ereport(ERROR,
3319	(errcode_for_file_access(),
3320	errmsg("could not fsync file \"%s\": %m", tmppath)));
3321	}
3322	pgstat_report_wait_end();
3323
3324	if (close(fd))
3325	ereport(ERROR,
3326	(errcode_for_file_access(),
3327	errmsg("could not close file \"%s\": %m", tmppath)));
3328
3329	/*
3330	* Now move the segment into place with its final name.
3331	*
3332	* If caller didn't want to use a pre-existing file, get rid of any
3333	* pre-existing file. Otherwise, cope with possibility that someone else
3334	* has created the file while we were filling ours: if so, use ours to
3335	* pre-create a future log segment.
3336	*/
3337	installed_segno = logsegno;
3338
3339	/*
3340	* XXX: What should we use as max_segno? We used to use XLOGfileslop when
3341	* that was a constant, but that was always a bit dubious: normally, at a
3342	* checkpoint, XLOGfileslop was the offset from the checkpoint record, but
3343	* here, it was the offset from the insert location. We can't do the
3344	* normal XLOGfileslop calculation here because we don't have access to
3345	* the prior checkpoint's redo location. So somewhat arbitrarily, just use
3346	* CheckPointSegments.
3347	*/
3348	max_segno = logsegno + CheckPointSegments;
3349	if (!InstallXLogFileSegment(&installed_segno, tmppath,
3350	*use_existent, max_segno,
3351	use_lock))
3352	{
3353	/*
3354	* No need for any more future segments, or InstallXLogFileSegment()
3355	* failed to rename the file into place. If the rename failed, opening
3356	* the file below will fail.
3357	*/
3358	unlink(tmppath);
3359	}
3360
3361	/ Set flag to tell caller there was no existent file /
3362	*use_existent = false;
3363
3364	/ Now open original target segment (might not be file I just made) /
3365	fd = BasicOpenFile(path, O_RDWR \| PG_BINARY \| get_sync_bit(sync_method));
3366	if (fd < `0`)
3367	ereport(ERROR,
3368	(errcode_for_file_access(),
3369	errmsg("could not open file \"%s\": %m", path)));
3370
3371	elog(DEBUG2, "done creating and filling new WAL file");
3372
3373	return fd;
3374	}
3375
3376	/*
3377	* Create a new XLOG file segment by copying a pre-existing one.
3378	*
3379	* destsegno: identify segment to be created.
3380	*
3381	* srcTLI, srcsegno: identify segment to be copied (could be from
3382	* a different timeline)
3383	*
3384	* upto: how much of the source file to copy (the rest is filled with
3385	* zeros)
3386	*
3387	* Currently this is only used during recovery, and so there are no locking
3388	* considerations. But we should be just as tense as XLogFileInit to avoid
3389	* emplacing a bogus file.
3390	*/
3391	static void
3392	XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno,
3393	int upto)
3394	{
3395	char path[MAXPGPATH];
3396	char tmppath[MAXPGPATH];
3397	PGAlignedXLogBlock buffer;
3398	int srcfd;
3399	int fd;
3400	int nbytes;
3401
3402	/*
3403	* Open the source file
3404	*/
3405	XLogFilePath(path, srcTLI, srcsegno, wal_segment_size);
3406	srcfd = OpenTransientFile(path, O_RDONLY \| PG_BINARY);
3407	if (srcfd < `0`)
3408	ereport(ERROR,
3409	(errcode_for_file_access(),
3410	errmsg("could not open file \"%s\": %m", path)));
3411
3412	/*
3413	* Copy into a temp file name.
3414	*/
3415	snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
3416
3417	unlink(tmppath);
3418
3419	/ do not use get_sync_bit() here --- want to fsync only at end of fill /
3420	fd = OpenTransientFile(tmppath, O_RDWR \| O_CREAT \| O_EXCL \| PG_BINARY);
3421	if (fd < `0`)
3422	ereport(ERROR,
3423	(errcode_for_file_access(),
3424	errmsg("could not create file \"%s\": %m", tmppath)));
3425
3426	/*
3427	* Do the data copying.
3428	*/
3429	for (nbytes = `0`; nbytes < wal_segment_size; nbytes += sizeof(buffer))
3430	{
3431	int nread;
3432
3433	nread = upto - nbytes;
3434
3435	/*
3436	* The part that is not read from the source file is filled with
3437	* zeros.
3438	*/
3439	if (nread < sizeof(buffer))
3440	memset(buffer.data, `0`, sizeof(buffer));
3441
3442	if (nread > `0`)
3443	{
3444	int r;
3445
3446	if (nread > sizeof(buffer))
3447	nread = sizeof(buffer);
3448	pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_READ);
3449	r = read(srcfd, buffer.data, nread);
3450	if (r != nread)
3451	{
3452	if (r < `0`)
3453	ereport(ERROR,
3454	(errcode_for_file_access(),
3455	errmsg("could not read file \"%s\": %m",
3456	path)));
3457	else
3458	ereport(ERROR,
3459	(errcode(ERRCODE_DATA_CORRUPTED),
3460	errmsg("could not read file \"%s\": read %d of %zu",
3461	path, r, (Size) nread)));
3462	}
3463	pgstat_report_wait_end();
3464	}
3465	errno = `0`;
3466	pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_WRITE);
3467	if ((int) write(fd, buffer.data, sizeof(buffer)) != (int) sizeof(buffer))
3468	{
3469	int save_errno = errno;
3470
3471	/*
3472	* If we fail to make the file, delete it to release disk space
3473	*/
3474	unlink(tmppath);
3475	/ if write didn't set errno, assume problem is no disk space /
3476	errno = save_errno ? save_errno : ENOSPC;
3477
3478	ereport(ERROR,
3479	(errcode_for_file_access(),
3480	errmsg("could not write to file \"%s\": %m", tmppath)));
3481	}
3482	pgstat_report_wait_end();
3483	}
3484
3485	pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_SYNC);
3486	if (pg_fsync(fd) != `0`)
3487	ereport(data_sync_elevel(ERROR),
3488	(errcode_for_file_access(),
3489	errmsg("could not fsync file \"%s\": %m", tmppath)));
3490	pgstat_report_wait_end();
3491
3492	if (CloseTransientFile(fd))
3493	ereport(ERROR,
3494	(errcode_for_file_access(),
3495	errmsg("could not close file \"%s\": %m", tmppath)));
3496
3497	if (CloseTransientFile(srcfd))
3498	ereport(ERROR,
3499	(errcode_for_file_access(),
3500	errmsg("could not close file \"%s\": %m", path)));
3501
3502	/*
3503	* Now move the segment into place with its final name.
3504	*/
3505	if (!InstallXLogFileSegment(&destsegno, tmppath, false, `0`, false))
3506	elog(ERROR, "InstallXLogFileSegment should not have failed");
3507	}
3508
3509	/*
3510	* Install a new XLOG segment file as a current or future log segment.
3511	*
3512	* This is used both to install a newly-created segment (which has a temp
3513	* filename while it's being created) and to recycle an old segment.
3514	*
3515	* *segno: identify segment to install as (or first possible target).
3516	* When find_free is true, this is modified on return to indicate the
3517	* actual installation location or last segment searched.
3518	*
3519	* tmppath: initial name of file to install. It will be renamed into place.
3520	*
3521	* find_free: if true, install the new segment at the first empty segno
3522	* number at or after the passed numbers. If false, install the new segment
3523	* exactly where specified, deleting any existing segment file there.
3524	*
3525	* max_segno: maximum segment number to install the new file as. Fail if no
3526	* free slot is found between *segno and max_segno. (Ignored when find_free
3527	* is false.)
3528	*
3529	* use_lock: if true, acquire ControlFileLock while moving file into
3530	* place. This should be true except during bootstrap log creation. The
3531	* caller must not hold the lock at call.
3532	*
3533	* Returns true if the file was installed successfully. false indicates that
3534	* max_segno limit was exceeded, or an error occurred while renaming the
3535	* file into place.
3536	*/
3537	static bool
3538	InstallXLogFileSegment(XLogSegNo segno, char* *tmppath,
3539	bool find_free, XLogSegNo max_segno,
3540	bool use_lock)
3541	{
3542	char path[MAXPGPATH];
3543	struct stat stat_buf;
3544
3545	XLogFilePath(path, ThisTimeLineID, *segno, wal_segment_size);
3546
3547	/*
3548	* We want to be sure that only one process does this at a time.
3549	*/
3550	if (use_lock)
3551	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
3552
3553	if (!find_free)
3554	{
3555	/ Force installation: get rid of any pre-existing segment file /
3556	durable_unlink(path, DEBUG1);
3557	}
3558	else
3559	{
3560	/ Find a free slot to put it in /
3561	while (stat(path, &stat_buf) == `0`)
3562	{
3563	if ((*segno) >= max_segno)
3564	{
3565	/ Failed to find a free slot within specified range /
3566	if (use_lock)
3567	LWLockRelease(ControlFileLock);
3568	return false;
3569	}
3570	(*segno)++;
3571	XLogFilePath(path, ThisTimeLineID, *segno, wal_segment_size);
3572	}
3573	}
3574
3575	/*
3576	* Perform the rename using link if available, paranoidly trying to avoid
3577	* overwriting an existing file (there shouldn't be one).
3578	*/
3579	if (durable_link_or_rename(tmppath, path, LOG) != `0`)
3580	{
3581	if (use_lock)
3582	LWLockRelease(ControlFileLock);
3583	/ durable_link_or_rename already emitted log message /
3584	return false;
3585	}
3586
3587	if (use_lock)
3588	LWLockRelease(ControlFileLock);
3589
3590	return true;
3591	}
3592
3593	/*
3594	* Open a pre-existing logfile segment for writing.
3595	*/
3596	int
3597	XLogFileOpen(XLogSegNo segno)
3598	{
3599	char path[MAXPGPATH];
3600	int fd;
3601
3602	XLogFilePath(path, ThisTimeLineID, segno, wal_segment_size);
3603
3604	fd = BasicOpenFile(path, O_RDWR \| PG_BINARY \| get_sync_bit(sync_method));
3605	if (fd < `0`)
3606	ereport(PANIC,
3607	(errcode_for_file_access(),
3608	errmsg("could not open file \"%s\": %m", path)));
3609
3610	return fd;
3611	}
3612
3613	/*
3614	* Open a logfile segment for reading (during recovery).
3615	*
3616	* If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
3617	* Otherwise, it's assumed to be already available in pg_wal.
3618	*/
3619	static int
3620	XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
3621	int source, bool notfoundOk)
3622	{
3623	char xlogfname[MAXFNAMELEN];
3624	char activitymsg[MAXFNAMELEN + `16`];
3625	char path[MAXPGPATH];
3626	int fd;
3627
3628	XLogFileName(xlogfname, tli, segno, wal_segment_size);
3629
3630	switch (source)
3631	{
3632	case XLOG_FROM_ARCHIVE:
3633	/ Report recovery progress in PS display /
3634	snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
3635	xlogfname);
3636	set_ps_display(activitymsg, false);
3637
3638	restoredFromArchive = RestoreArchivedFile(path, xlogfname,
3639	"RECOVERYXLOG",
3640	wal_segment_size,
3641	InRedo);
3642	if (!restoredFromArchive)
3643	return -`1`;
3644	break;
3645
3646	case XLOG_FROM_PG_WAL:
3647	case XLOG_FROM_STREAM:
3648	XLogFilePath(path, tli, segno, wal_segment_size);
3649	restoredFromArchive = false;
3650	break;
3651
3652	default:
3653	elog(ERROR, "invalid XLogFileRead source %d", source);
3654	}
3655
3656	/*
3657	* If the segment was fetched from archival storage, replace the existing
3658	* xlog segment (if any) with the archival version.
3659	*/
3660	if (source == XLOG_FROM_ARCHIVE)
3661	{
3662	KeepFileRestoredFromArchive(path, xlogfname);
3663
3664	/*
3665	* Set path to point at the new file in pg_wal.
3666	*/
3667	snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
3668	}
3669
3670	fd = BasicOpenFile(path, O_RDONLY \| PG_BINARY);
3671	if (fd >= `0`)
3672	{
3673	/ Success! /
3674	curFileTLI = tli;
3675
3676	/ Report recovery progress in PS display /
3677	snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
3678	xlogfname);
3679	set_ps_display(activitymsg, false);
3680
3681	/ Track source of data in assorted state variables /
3682	readSource = source;
3683	XLogReceiptSource = source;
3684	/ In FROM_STREAM case, caller tracks receipt time, not me /
3685	if (source != XLOG_FROM_STREAM)
3686	XLogReceiptTime = GetCurrentTimestamp();
3687
3688	return fd;
3689	}
3690	if (errno != ENOENT \|\| !notfoundOk) / unexpected failure? /
3691	ereport(PANIC,
3692	(errcode_for_file_access(),
3693	errmsg("could not open file \"%s\": %m", path)));
3694	return -`1`;
3695	}
3696
3697	/*
3698	* Open a logfile segment for reading (during recovery).
3699	*
3700	* This version searches for the segment with any TLI listed in expectedTLEs.
3701	*/
3702	static int
3703	XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source)
3704	{
3705	char path[MAXPGPATH];
3706	ListCell *cell;
3707	int fd;
3708	List *tles;
3709
3710	/*
3711	* Loop looking for a suitable timeline ID: we might need to read any of
3712	* the timelines listed in expectedTLEs.
3713	*
3714	* We expect curFileTLI on entry to be the TLI of the preceding file in
3715	* sequence, or 0 if there was no predecessor. We do not allow curFileTLI
3716	* to go backwards; this prevents us from picking up the wrong file when a
3717	* parent timeline extends to higher segment numbers than the child we
3718	* want to read.
3719	*
3720	* If we haven't read the timeline history file yet, read it now, so that
3721	* we know which TLIs to scan. We don't save the list in expectedTLEs,
3722	* however, unless we actually find a valid segment. That way if there is
3723	* neither a timeline history file nor a WAL segment in the archive, and
3724	* streaming replication is set up, we'll read the timeline history file
3725	* streamed from the master when we start streaming, instead of recovering
3726	* with a dummy history generated here.
3727	*/
3728	if (expectedTLEs)
3729	tles = expectedTLEs;
3730	else
3731	tles = readTimeLineHistory(recoveryTargetTLI);
3732
3733	foreach(cell, tles)
3734	{
3735	TimeLineID tli = ((TimeLineHistoryEntry *) lfirst(cell))->tli;
3736
3737	if (tli < curFileTLI)
3738	break; / don't bother looking at too-old TLIs /
3739
3740	if (source == XLOG_FROM_ANY \|\| source == XLOG_FROM_ARCHIVE)
3741	{
3742	fd = XLogFileRead(segno, emode, tli,
3743	XLOG_FROM_ARCHIVE, true);
3744	if (fd != -`1`)
3745	{
3746	elog(DEBUG1, "got WAL segment from archive");
3747	if (!expectedTLEs)
3748	expectedTLEs = tles;
3749	return fd;
3750	}
3751	}
3752
3753	if (source == XLOG_FROM_ANY \|\| source == XLOG_FROM_PG_WAL)
3754	{
3755	fd = XLogFileRead(segno, emode, tli,
3756	XLOG_FROM_PG_WAL, true);
3757	if (fd != -`1`)
3758	{
3759	if (!expectedTLEs)
3760	expectedTLEs = tles;
3761	return fd;
3762	}
3763	}
3764	}
3765
3766	/ Couldn't find it. For simplicity, complain about front timeline /
3767	XLogFilePath(path, recoveryTargetTLI, segno, wal_segment_size);
3768	errno = ENOENT;
3769	ereport(emode,
3770	(errcode_for_file_access(),
3771	errmsg("could not open file \"%s\": %m", path)));
3772	return -`1`;
3773	}
3774
3775	/*
3776	* Close the current logfile segment for writing.
3777	*/
3778	static void
3779	XLogFileClose(void)
3780	{
3781	Assert(openLogFile >= `0`);
3782
3783	/*
3784	* WAL segment files will not be re-read in normal operation, so we advise
3785	* the OS to release any cached pages. But do not do so if WAL archiving
3786	* or streaming is active, because archiver and walsender process could
3787	* use the cache to read the WAL segment.
3788	*/
3789	#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
3790	if (!XLogIsNeeded())
3791	(void) posix_fadvise(openLogFile, `0`, `0`, POSIX_FADV_DONTNEED);
3792	#endif
3793
3794	if (close(openLogFile))
3795	ereport(PANIC,
3796	(errcode_for_file_access(),
3797	errmsg("could not close file \"%s\": %m",
3798	XLogFileNameP(ThisTimeLineID, openLogSegNo))));
3799	openLogFile = -`1`;
3800	}
3801
3802	/*
3803	* Preallocate log files beyond the specified log endpoint.
3804	*
3805	* XXX this is currently extremely conservative, since it forces only one
3806	* future log segment to exist, and even that only if we are 75% done with
3807	* the current one. This is only appropriate for very low-WAL-volume systems.
3808	* High-volume systems will be OK once they've built up a sufficient set of
3809	* recycled log segments, but the startup transient is likely to include
3810	* a lot of segment creations by foreground processes, which is not so good.
3811	*/
3812	static void
3813	PreallocXlogFiles(XLogRecPtr endptr)
3814	{
3815	XLogSegNo _logSegNo;
3816	int lf;
3817	bool use_existent;
3818	uint64 offset;
3819
3820	XLByteToPrevSeg(endptr, _logSegNo, wal_segment_size);
3821	offset = XLogSegmentOffset(endptr - `1`, wal_segment_size);
3822	if (offset >= (uint32) (`0.75` * wal_segment_size))
3823	{
3824	_logSegNo++;
3825	use_existent = true;
3826	lf = XLogFileInit(_logSegNo, &use_existent, true);
3827	close(lf);
3828	if (!use_existent)
3829	CheckpointStats.ckpt_segs_added++;
3830	}
3831	}
3832
3833	/*
3834	* Throws an error if the given log segment has already been removed or
3835	* recycled. The caller should only pass a segment that it knows to have
3836	* existed while the server has been running, as this function always
3837	* succeeds if no WAL segments have been removed since startup.
3838	* 'tli' is only used in the error message.
3839	*
3840	* Note: this function guarantees to keep errno unchanged on return.
3841	* This supports callers that use this to possibly deliver a better
3842	* error message about a missing file, while still being able to throw
3843	* a normal file-access error afterwards, if this does return.
3844	*/
3845	void
3846	CheckXLogRemoved(XLogSegNo segno, TimeLineID tli)
3847	{
3848	int save_errno = errno;
3849	XLogSegNo lastRemovedSegNo;
3850
3851	SpinLockAcquire(&XLogCtl->info_lck);
3852	lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
3853	SpinLockRelease(&XLogCtl->info_lck);
3854
3855	if (segno <= lastRemovedSegNo)
3856	{
3857	char filename[MAXFNAMELEN];
3858
3859	XLogFileName(filename, tli, segno, wal_segment_size);
3860	errno = save_errno;
3861	ereport(ERROR,
3862	(errcode_for_file_access(),
3863	errmsg("requested WAL segment %s has already been removed",
3864	filename)));
3865	}
3866	errno = save_errno;
3867	}
3868
3869	/*
3870	* Return the last WAL segment removed, or 0 if no segment has been removed
3871	* since startup.
3872	*
3873	* NB: the result can be out of date arbitrarily fast, the caller has to deal
3874	* with that.
3875	*/
3876	XLogSegNo
3877	XLogGetLastRemovedSegno(void)
3878	{
3879	XLogSegNo lastRemovedSegNo;
3880
3881	SpinLockAcquire(&XLogCtl->info_lck);
3882	lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
3883	SpinLockRelease(&XLogCtl->info_lck);
3884
3885	return lastRemovedSegNo;
3886	}
3887
3888	/*
3889	* Update the last removed segno pointer in shared memory, to reflect
3890	* that the given XLOG file has been removed.
3891	*/
3892	static void
3893	UpdateLastRemovedPtr(char *filename)
3894	{
3895	uint32 tli;
3896	XLogSegNo segno;
3897
3898	XLogFromFileName(filename, &tli, &segno, wal_segment_size);
3899
3900	SpinLockAcquire(&XLogCtl->info_lck);
3901	if (segno > XLogCtl->lastRemovedSegNo)
3902	XLogCtl->lastRemovedSegNo = segno;
3903	SpinLockRelease(&XLogCtl->info_lck);
3904	}
3905
3906	/*
3907	* Remove all temporary log files in pg_wal
3908	*
3909	* This is called at the beginning of recovery after a previous crash,
3910	* at a point where no other processes write fresh WAL data.
3911	*/
3912	static void
3913	RemoveTempXlogFiles(void)
3914	{
3915	DIR *xldir;
3916	struct dirent *xlde;
3917
3918	elog(DEBUG2, "removing all temporary WAL segments");
3919
3920	xldir = AllocateDir(XLOGDIR);
3921	while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
3922	{
3923	char path[MAXPGPATH];
3924
3925	if (strncmp(xlde->d_name, "xlogtemp.", `9`) != `0`)
3926	continue;
3927
3928	snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
3929	unlink(path);
3930	elog(DEBUG2, "removed temporary WAL segment \"%s\"", path);
3931	}
3932	FreeDir(xldir);
3933	}
3934
3935	/*
3936	* Recycle or remove all log files older or equal to passed segno.
3937	*
3938	* endptr is current (or recent) end of xlog, and RedoRecPtr is the
3939	* redo pointer of the last checkpoint. These are used to determine
3940	* whether we want to recycle rather than delete no-longer-wanted log files.
3941	*/
3942	static void
3943	RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr RedoRecPtr, XLogRecPtr endptr)
3944	{
3945	DIR *xldir;
3946	struct dirent *xlde;
3947	char lastoff[MAXFNAMELEN];
3948
3949	/*
3950	* Construct a filename of the last segment to be kept. The timeline ID
3951	* doesn't matter, we ignore that in the comparison. (During recovery,
3952	* ThisTimeLineID isn't set, so we can't use that.)
3953	*/
3954	XLogFileName(lastoff, `0`, segno, wal_segment_size);
3955
3956	elog(DEBUG2, "attempting to remove WAL segments older than log file %s",
3957	lastoff);
3958
3959	xldir = AllocateDir(XLOGDIR);
3960
3961	while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
3962	{
3963	/ Ignore files that are not XLOG segments /
3964	if (!IsXLogFileName(xlde->d_name) &&
3965	!IsPartialXLogFileName(xlde->d_name))
3966	continue;
3967
3968	/*
3969	* We ignore the timeline part of the XLOG segment identifiers in
3970	* deciding whether a segment is still needed. This ensures that we
3971	* won't prematurely remove a segment from a parent timeline. We could
3972	* probably be a little more proactive about removing segments of
3973	* non-parent timelines, but that would be a whole lot more
3974	* complicated.
3975	*
3976	* We use the alphanumeric sorting property of the filenames to decide
3977	* which ones are earlier than the lastoff segment.
3978	*/
3979	if (strcmp(xlde->d_name + `8`, lastoff + `8`) <= `0`)
3980	{
3981	if (XLogArchiveCheckDone(xlde->d_name))
3982	{
3983	/ Update the last removed location in shared memory first /
3984	UpdateLastRemovedPtr(xlde->d_name);
3985
3986	RemoveXlogFile(xlde->d_name, RedoRecPtr, endptr);
3987	}
3988	}
3989	}
3990
3991	FreeDir(xldir);
3992	}
3993
3994	/*
3995	* Remove WAL files that are not part of the given timeline's history.
3996	*
3997	* This is called during recovery, whenever we switch to follow a new
3998	* timeline, and at the end of recovery when we create a new timeline. We
3999	* wouldn't otherwise care about extra WAL files lying in pg_wal, but they
4000	* might be leftover pre-allocated or recycled WAL segments on the old timeline
4001	* that we haven't used yet, and contain garbage. If we just leave them in
4002	* pg_wal, they will eventually be archived, and we can't let that happen.
4003	* Files that belong to our timeline history are valid, because we have
4004	* successfully replayed them, but from others we can't be sure.
4005	*
4006	* 'switchpoint' is the current point in WAL where we switch to new timeline,
4007	* and 'newTLI' is the new timeline we switch to.
4008	*/
4009	static void
4010	RemoveNonParentXlogFiles(XLogRecPtr switchpoint, TimeLineID newTLI)
4011	{
4012	DIR *xldir;
4013	struct dirent *xlde;
4014	char switchseg[MAXFNAMELEN];
4015	XLogSegNo endLogSegNo;
4016
4017	XLByteToPrevSeg(switchpoint, endLogSegNo, wal_segment_size);
4018
4019	/*
4020	* Construct a filename of the last segment to be kept.
4021	*/
4022	XLogFileName(switchseg, newTLI, endLogSegNo, wal_segment_size);
4023
4024	elog(DEBUG2, "attempting to remove WAL segments newer than log file %s",
4025	switchseg);
4026
4027	xldir = AllocateDir(XLOGDIR);
4028
4029	while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
4030	{
4031	/ Ignore files that are not XLOG segments /
4032	if (!IsXLogFileName(xlde->d_name))
4033	continue;
4034
4035	/*
4036	* Remove files that are on a timeline older than the new one we're
4037	* switching to, but with a segment number >= the first segment on the
4038	* new timeline.
4039	*/
4040	if (strncmp(xlde->d_name, switchseg, `8`) < `0` &&
4041	strcmp(xlde->d_name + `8`, switchseg + `8`) > `0`)
4042	{
4043	/*
4044	* If the file has already been marked as .ready, however, don't
4045	* remove it yet. It should be OK to remove it - files that are
4046	* not part of our timeline history are not required for recovery
4047	* - but seems safer to let them be archived and removed later.
4048	*/
4049	if (!XLogArchiveIsReady(xlde->d_name))
4050	RemoveXlogFile(xlde->d_name, InvalidXLogRecPtr, switchpoint);
4051	}
4052	}
4053
4054	FreeDir(xldir);
4055	}
4056
4057	/*
4058	* Recycle or remove a log file that's no longer needed.
4059	*
4060	* endptr is current (or recent) end of xlog, and RedoRecPtr is the
4061	* redo pointer of the last checkpoint. These are used to determine
4062	* whether we want to recycle rather than delete no-longer-wanted log files.
4063	* If RedoRecPtr is not known, pass invalid, and the function will recycle,
4064	* somewhat arbitrarily, 10 future segments.
4065	*/
4066	static void
4067	RemoveXlogFile(const char *segname, XLogRecPtr RedoRecPtr, XLogRecPtr endptr)
4068	{
4069	char path[MAXPGPATH];
4070	#ifdef WIN32
4071	char newpath[MAXPGPATH];
4072	#endif
4073	struct stat statbuf;
4074	XLogSegNo endlogSegNo;
4075	XLogSegNo recycleSegNo;
4076
4077	if (wal_recycle)
4078	{
4079	/*
4080	* Initialize info about where to try to recycle to.
4081	*/
4082	XLByteToSeg(endptr, endlogSegNo, wal_segment_size);
4083	if (RedoRecPtr == InvalidXLogRecPtr)
4084	recycleSegNo = endlogSegNo + `10`;
4085	else
4086	recycleSegNo = XLOGfileslop(RedoRecPtr);
4087	}
4088	else
4089	recycleSegNo = `0`; / keep compiler quiet /
4090
4091	snprintf(path, MAXPGPATH, XLOGDIR "/%s", segname);
4092
4093	/*
4094	* Before deleting the file, see if it can be recycled as a future log
4095	* segment. Only recycle normal files, pg_standby for example can create
4096	* symbolic links pointing to a separate archive directory.
4097	*/
4098	if (wal_recycle &&
4099	endlogSegNo <= recycleSegNo &&
4100	lstat(path, &statbuf) == `0` && S_ISREG(statbuf.st_mode) &&
4101	InstallXLogFileSegment(&endlogSegNo, path,
4102	true, recycleSegNo, true))
4103	{
4104	ereport(DEBUG2,
4105	(errmsg("recycled write-ahead log file \"%s\"",
4106	segname)));
4107	CheckpointStats.ckpt_segs_recycled++;
4108	/ Needn't recheck that slot on future iterations /
4109	endlogSegNo++;
4110	}
4111	else
4112	{
4113	/ No need for any more future segments... /
4114	int rc;
4115
4116	ereport(DEBUG2,
4117	(errmsg("removing write-ahead log file \"%s\"",
4118	segname)));
4119
4120	#ifdef WIN32
4121
4122	/*
4123	* On Windows, if another process (e.g another backend) holds the file
4124	* open in FILE_SHARE_DELETE mode, unlink will succeed, but the file
4125	* will still show up in directory listing until the last handle is
4126	* closed. To avoid confusing the lingering deleted file for a live
4127	* WAL file that needs to be archived, rename it before deleting it.
4128	*
4129	* If another process holds the file open without FILE_SHARE_DELETE
4130	* flag, rename will fail. We'll try again at the next checkpoint.
4131	*/
4132	snprintf(newpath, MAXPGPATH, "%s.deleted", path);
4133	if (rename(path, newpath) != `0`)
4134	{
4135	ereport(LOG,
4136	(errcode_for_file_access(),
4137	errmsg("could not rename file \"%s\": %m",
4138	path)));
4139	return;
4140	}
4141	rc = durable_unlink(newpath, LOG);
4142	#else
4143	rc = durable_unlink(path, LOG);
4144	#endif
4145	if (rc != `0`)
4146	{
4147	/ Message already logged by durable_unlink() /
4148	return;
4149	}
4150	CheckpointStats.ckpt_segs_removed++;
4151	}
4152
4153	XLogArchiveCleanup(segname);
4154	}
4155
4156	/*
4157	* Verify whether pg_wal and pg_wal/archive_status exist.
4158	* If the latter does not exist, recreate it.
4159	*
4160	* It is not the goal of this function to verify the contents of these
4161	* directories, but to help in cases where someone has performed a cluster
4162	* copy for PITR purposes but omitted pg_wal from the copy.
4163	*
4164	* We could also recreate pg_wal if it doesn't exist, but a deliberate
4165	* policy decision was made not to. It is fairly common for pg_wal to be
4166	* a symlink, and if that was the DBA's intent then automatically making a
4167	* plain directory would result in degraded performance with no notice.
4168	*/
4169	static void
4170	ValidateXLOGDirectoryStructure(void)
4171	{
4172	char path[MAXPGPATH];
4173	struct stat stat_buf;
4174
4175	/ Check for pg_wal; if it doesn't exist, error out /
4176	if (stat(XLOGDIR, &stat_buf) != `0` \|\|
4177	!S_ISDIR(stat_buf.st_mode))
4178	ereport(FATAL,
4179	(errmsg("required WAL directory \"%s\" does not exist",
4180	XLOGDIR)));
4181
4182	/ Check for archive_status /
4183	snprintf(path, MAXPGPATH, XLOGDIR "/archive_status");
4184	if (stat(path, &stat_buf) == `0`)
4185	{
4186	/ Check for weird cases where it exists but isn't a directory /
4187	if (!S_ISDIR(stat_buf.st_mode))
4188	ereport(FATAL,
4189	(errmsg("required WAL directory \"%s\" does not exist",
4190	path)));
4191	}
4192	else
4193	{
4194	ereport(LOG,
4195	(errmsg("creating missing WAL directory \"%s\"", path)));
4196	if (MakePGDirectory(path) < `0`)
4197	ereport(FATAL,
4198	(errmsg("could not create missing directory \"%s\": %m",
4199	path)));
4200	}
4201	}
4202
4203	/*
4204	* Remove previous backup history files. This also retries creation of
4205	* .ready files for any backup history files for which XLogArchiveNotify
4206	* failed earlier.
4207	*/
4208	static void
4209	CleanupBackupHistory(void)
4210	{
4211	DIR *xldir;
4212	struct dirent *xlde;
4213	char path[MAXPGPATH + sizeof(XLOGDIR)];
4214
4215	xldir = AllocateDir(XLOGDIR);
4216
4217	while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
4218	{
4219	if (IsBackupHistoryFileName(xlde->d_name))
4220	{
4221	if (XLogArchiveCheckDone(xlde->d_name))
4222	{
4223	elog(DEBUG2, "removing WAL backup history file \"%s\"",
4224	xlde->d_name);
4225	snprintf(path, sizeof(path), XLOGDIR "/%s", xlde->d_name);
4226	unlink(path);
4227	XLogArchiveCleanup(xlde->d_name);
4228	}
4229	}
4230	}
4231
4232	FreeDir(xldir);
4233	}
4234
4235	/*
4236	* Attempt to read an XLOG record.
4237	*
4238	* If RecPtr is valid, try to read a record at that position. Otherwise
4239	* try to read a record just after the last one previously read.
4240	*
4241	* If no valid record is available, returns NULL, or fails if emode is PANIC.
4242	* (emode must be either PANIC, LOG). In standby mode, retries until a valid
4243	* record is available.
4244	*/
4245	static XLogRecord *
4246	ReadRecord(XLogReaderState xlogreader, XLogRecPtr RecPtr, int* emode,
4247	bool fetching_ckpt)
4248	{
4249	XLogRecord *record;
4250	XLogPageReadPrivate private = (XLogPageReadPrivate ) xlogreader->private_data;
4251
4252	/ Pass through parameters to XLogPageRead /
4253	private->fetching_ckpt = fetching_ckpt;
4254	private->emode = emode;
4255	private->randAccess = (RecPtr != InvalidXLogRecPtr);
4256
4257	/ This is the first attempt to read this page. /
4258	lastSourceFailed = false;
4259
4260	for (;;)
4261	{
4262	char *errormsg;
4263
4264	record = XLogReadRecord(xlogreader, RecPtr, &errormsg);
4265	ReadRecPtr = xlogreader->ReadRecPtr;
4266	EndRecPtr = xlogreader->EndRecPtr;
4267	if (record == NULL)
4268	{
4269	if (readFile >= `0`)
4270	{
4271	close(readFile);
4272	readFile = -`1`;
4273	}
4274
4275	/*
4276	* We only end up here without a message when XLogPageRead()
4277	* failed - in that case we already logged something. In
4278	* StandbyMode that only happens if we have been triggered, so we
4279	* shouldn't loop anymore in that case.
4280	*/
4281	if (errormsg)
4282	ereport(emode_for_corrupt_record(emode,
4283	RecPtr ? RecPtr : EndRecPtr),
4284	(errmsg_internal("%s", errormsg) / already translated / ));
4285	}
4286
4287	/*
4288	* Check page TLI is one of the expected values.
4289	*/
4290	else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs))
4291	{
4292	char fname[MAXFNAMELEN];
4293	XLogSegNo segno;
4294	int32 offset;
4295
4296	XLByteToSeg(xlogreader->latestPagePtr, segno, wal_segment_size);
4297	offset = XLogSegmentOffset(xlogreader->latestPagePtr,
4298	wal_segment_size);
4299	XLogFileName(fname, xlogreader->readPageTLI, segno,
4300	wal_segment_size);
4301	ereport(emode_for_corrupt_record(emode,
4302	RecPtr ? RecPtr : EndRecPtr),
4303	(errmsg("unexpected timeline ID %u in log segment %s, offset %u",
4304	xlogreader->latestPageTLI,
4305	fname,
4306	offset)));
4307	record = NULL;
4308	}
4309
4310	if (record)
4311	{
4312	/ Great, got a record /
4313	return record;
4314	}
4315	else
4316	{
4317	/ No valid record available from this source /
4318	lastSourceFailed = true;
4319
4320	/*
4321	* If archive recovery was requested, but we were still doing
4322	* crash recovery, switch to archive recovery and retry using the
4323	* offline archive. We have now replayed all the valid WAL in
4324	* pg_wal, so we are presumably now consistent.
4325	*
4326	* We require that there's at least some valid WAL present in
4327	* pg_wal, however (!fetching_ckpt). We could recover using the
4328	* WAL from the archive, even if pg_wal is completely empty, but
4329	* we'd have no idea how far we'd have to replay to reach
4330	* consistency. So err on the safe side and give up.
4331	*/
4332	if (!InArchiveRecovery && ArchiveRecoveryRequested &&
4333	!fetching_ckpt)
4334	{
4335	ereport(DEBUG1,
4336	(errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
4337	InArchiveRecovery = true;
4338	if (StandbyModeRequested)
4339	StandbyMode = true;
4340
4341	/ initialize minRecoveryPoint to this record /
4342	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
4343	ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
4344	if (ControlFile->minRecoveryPoint < EndRecPtr)
4345	{
4346	ControlFile->minRecoveryPoint = EndRecPtr;
4347	ControlFile->minRecoveryPointTLI = ThisTimeLineID;
4348	}
4349	/ update local copy /
4350	minRecoveryPoint = ControlFile->minRecoveryPoint;
4351	minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
4352
4353	/*
4354	* The startup process can update its local copy of
4355	* minRecoveryPoint from this point.
4356	*/
4357	updateMinRecoveryPoint = true;
4358
4359	UpdateControlFile();
4360	LWLockRelease(ControlFileLock);
4361
4362	CheckRecoveryConsistency();
4363
4364	/*
4365	* Before we retry, reset lastSourceFailed and currentSource
4366	* so that we will check the archive next.
4367	*/
4368	lastSourceFailed = false;
4369	currentSource = `0`;
4370
4371	continue;
4372	}
4373
4374	/ In standby mode, loop back to retry. Otherwise, give up. /
4375	if (StandbyMode && !CheckForStandbyTrigger())
4376	continue;
4377	else
4378	return NULL;
4379	}
4380	}
4381	}
4382
4383	/*
4384	* Scan for new timelines that might have appeared in the archive since we
4385	* started recovery.
4386	*
4387	* If there are any, the function changes recovery target TLI to the latest
4388	* one and returns 'true'.
4389	*/
4390	static bool
4391	rescanLatestTimeLine(void)
4392	{
4393	List *newExpectedTLEs;
4394	bool found;
4395	ListCell *cell;
4396	TimeLineID newtarget;
4397	TimeLineID oldtarget = recoveryTargetTLI;
4398	TimeLineHistoryEntry *currentTle = NULL;
4399
4400	newtarget = findNewestTimeLine(recoveryTargetTLI);
4401	if (newtarget == recoveryTargetTLI)
4402	{
4403	/ No new timelines found /
4404	return false;
4405	}
4406
4407	/*
4408	* Determine the list of expected TLIs for the new TLI
4409	*/
4410
4411	newExpectedTLEs = readTimeLineHistory(newtarget);
4412
4413	/*
4414	* If the current timeline is not part of the history of the new timeline,
4415	* we cannot proceed to it.
4416	*/
4417	found = false;
4418	foreach(cell, newExpectedTLEs)
4419	{
4420	currentTle = (TimeLineHistoryEntry *) lfirst(cell);
4421
4422	if (currentTle->tli == recoveryTargetTLI)
4423	{
4424	found = true;
4425	break;
4426	}
4427	}
4428	if (!found)
4429	{
4430	ereport(LOG,
4431	(errmsg("new timeline %u is not a child of database system timeline %u",
4432	newtarget,
4433	ThisTimeLineID)));
4434	return false;
4435	}
4436
4437	/*
4438	* The current timeline was found in the history file, but check that the
4439	* next timeline was forked off from it after the current recovery
4440	* location.
4441	*/
4442	if (currentTle->end < EndRecPtr)
4443	{
4444	ereport(LOG,
4445	(errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
4446	newtarget,
4447	ThisTimeLineID,
4448	(uint32) (EndRecPtr >> `32`), (uint32) EndRecPtr)));
4449	return false;
4450	}
4451
4452	/ The new timeline history seems valid. Switch target /
4453	recoveryTargetTLI = newtarget;
4454	list_free_deep(expectedTLEs);
4455	expectedTLEs = newExpectedTLEs;
4456
4457	/*
4458	* As in StartupXLOG(), try to ensure we have all the history files
4459	* between the old target and new target in pg_wal.
4460	*/
4461	restoreTimeLineHistoryFiles(oldtarget + `1`, newtarget);
4462
4463	ereport(LOG,
4464	(errmsg("new target timeline is %u",
4465	recoveryTargetTLI)));
4466
4467	return true;
4468	}
4469
4470	/*
4471	* I/O routines for pg_control
4472	*
4473	* *ControlFile is a buffer in shared memory that holds an image of the
4474	* contents of pg_control. WriteControlFile() initializes pg_control
4475	* given a preloaded buffer, ReadControlFile() loads the buffer from
4476	* the pg_control file (during postmaster or standalone-backend startup),
4477	* and UpdateControlFile() rewrites pg_control after we modify xlog state.
4478	*
4479	* For simplicity, WriteControlFile() initializes the fields of pg_control
4480	* that are related to checking backend/database compatibility, and
4481	* ReadControlFile() verifies they are correct. We could split out the
4482	* I/O and compatibility-check functions, but there seems no need currently.
4483	*/
4484	static void
4485	WriteControlFile(void)
4486	{
4487	int fd;
4488	char buffer[PG_CONTROL_FILE_SIZE]; / need not be aligned /
4489
4490	/*
4491	* Ensure that the size of the pg_control data structure is sane. See the
4492	* comments for these symbols in pg_control.h.
4493	*/
4494	StaticAssertStmt(sizeof(ControlFileData) <= PG_CONTROL_MAX_SAFE_SIZE,
4495	"pg_control is too large for atomic disk writes");
4496	StaticAssertStmt(sizeof(ControlFileData) <= PG_CONTROL_FILE_SIZE,
4497	"sizeof(ControlFileData) exceeds PG_CONTROL_FILE_SIZE");
4498
4499	/*
4500	* Initialize version and compatibility-check fields
4501	*/
4502	ControlFile->pg_control_version = PG_CONTROL_VERSION;
4503	ControlFile->catalog_version_no = CATALOG_VERSION_NO;
4504
4505	ControlFile->maxAlign = MAXIMUM_ALIGNOF;
4506	ControlFile->floatFormat = FLOATFORMAT_VALUE;
4507
4508	ControlFile->blcksz = BLCKSZ;
4509	ControlFile->relseg_size = RELSEG_SIZE;
4510	ControlFile->xlog_blcksz = XLOG_BLCKSZ;
4511	ControlFile->xlog_seg_size = wal_segment_size;
4512
4513	ControlFile->nameDataLen = NAMEDATALEN;
4514	ControlFile->indexMaxKeys = INDEX_MAX_KEYS;
4515
4516	ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;
4517	ControlFile->loblksize = LOBLKSIZE;
4518
4519	ControlFile->float4ByVal = FLOAT4PASSBYVAL;
4520	ControlFile->float8ByVal = FLOAT8PASSBYVAL;
4521
4522	/ Contents are protected with a CRC /
4523	INIT_CRC32C(ControlFile->crc);
4524	COMP_CRC32C(ControlFile->crc,
4525	(char *) ControlFile,
4526	offsetof(ControlFileData, crc));
4527	FIN_CRC32C(ControlFile->crc);
4528
4529	/*
4530	* We write out PG_CONTROL_FILE_SIZE bytes into pg_control, zero-padding
4531	* the excess over sizeof(ControlFileData). This reduces the odds of
4532	* premature-EOF errors when reading pg_control. We'll still fail when we
4533	* check the contents of the file, but hopefully with a more specific
4534	* error than "couldn't read pg_control".
4535	*/
4536	memset(buffer, `0`, PG_CONTROL_FILE_SIZE);
4537	memcpy(buffer, ControlFile, sizeof(ControlFileData));
4538
4539	fd = BasicOpenFile(XLOG_CONTROL_FILE,
4540	O_RDWR \| O_CREAT \| O_EXCL \| PG_BINARY);
4541	if (fd < `0`)
4542	ereport(PANIC,
4543	(errcode_for_file_access(),
4544	errmsg("could not create file \"%s\": %m",
4545	XLOG_CONTROL_FILE)));
4546
4547	errno = `0`;
4548	pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_WRITE);
4549	if (write(fd, buffer, PG_CONTROL_FILE_SIZE) != PG_CONTROL_FILE_SIZE)
4550	{
4551	/ if write didn't set errno, assume problem is no disk space /
4552	if (errno == `0`)
4553	errno = ENOSPC;
4554	ereport(PANIC,
4555	(errcode_for_file_access(),
4556	errmsg("could not write to file \"%s\": %m",
4557	XLOG_CONTROL_FILE)));
4558	}
4559	pgstat_report_wait_end();
4560
4561	pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_SYNC);
4562	if (pg_fsync(fd) != `0`)
4563	ereport(PANIC,
4564	(errcode_for_file_access(),
4565	errmsg("could not fsync file \"%s\": %m",
4566	XLOG_CONTROL_FILE)));
4567	pgstat_report_wait_end();
4568
4569	if (close(fd))
4570	ereport(PANIC,
4571	(errcode_for_file_access(),
4572	errmsg("could not close file \"%s\": %m",
4573	XLOG_CONTROL_FILE)));
4574	}
4575
4576	static void
4577	ReadControlFile(void)
4578	{
4579	pg_crc32c crc;
4580	int fd;
4581	static char wal_segsz_str[`20`];
4582	int r;
4583
4584	/*
4585	* Read data...
4586	*/
4587	fd = BasicOpenFile(XLOG_CONTROL_FILE,
4588	O_RDWR \| PG_BINARY);
4589	if (fd < `0`)
4590	ereport(PANIC,
4591	(errcode_for_file_access(),
4592	errmsg("could not open file \"%s\": %m",
4593	XLOG_CONTROL_FILE)));
4594
4595	pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_READ);
4596	r = read(fd, ControlFile, sizeof(ControlFileData));
4597	if (r != sizeof(ControlFileData))
4598	{
4599	if (r < `0`)
4600	ereport(PANIC,
4601	(errcode_for_file_access(),
4602	errmsg("could not read file \"%s\": %m",
4603	XLOG_CONTROL_FILE)));
4604	else
4605	ereport(PANIC,
4606	(errcode(ERRCODE_DATA_CORRUPTED),
4607	errmsg("could not read file \"%s\": read %d of %zu",
4608	XLOG_CONTROL_FILE, r, sizeof(ControlFileData))));
4609	}
4610	pgstat_report_wait_end();
4611
4612	close(fd);
4613
4614	/*
4615	* Check for expected pg_control format version. If this is wrong, the
4616	* CRC check will likely fail because we'll be checking the wrong number
4617	* of bytes. Complaining about wrong version will probably be more
4618	* enlightening than complaining about wrong CRC.
4619	*/
4620
4621	if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % `65536` == `0` && ControlFile->pg_control_version / `65536` != `0`)
4622	ereport(FATAL,
4623	(errmsg("database files are incompatible with server"),
4624	errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
4625	" but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
4626	ControlFile->pg_control_version, ControlFile->pg_control_version,
4627	PG_CONTROL_VERSION, PG_CONTROL_VERSION),
4628	errhint("This could be a problem of mismatched byte ordering. It looks like you need to initdb.")));
4629
4630	if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
4631	ereport(FATAL,
4632	(errmsg("database files are incompatible with server"),
4633	errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
4634	" but the server was compiled with PG_CONTROL_VERSION %d.",
4635	ControlFile->pg_control_version, PG_CONTROL_VERSION),
4636	errhint("It looks like you need to initdb.")));
4637
4638	/ Now check the CRC. /
4639	INIT_CRC32C(crc);
4640	COMP_CRC32C(crc,
4641	(char *) ControlFile,
4642	offsetof(ControlFileData, crc));
4643	FIN_CRC32C(crc);
4644
4645	if (!EQ_CRC32C(crc, ControlFile->crc))
4646	ereport(FATAL,
4647	(errmsg("incorrect checksum in control file")));
4648
4649	/*
4650	* Do compatibility checking immediately. If the database isn't
4651	* compatible with the backend executable, we want to abort before we can
4652	* possibly do any damage.
4653	*/
4654	if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
4655	ereport(FATAL,
4656	(errmsg("database files are incompatible with server"),
4657	errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d,"
4658	" but the server was compiled with CATALOG_VERSION_NO %d.",
4659	ControlFile->catalog_version_no, CATALOG_VERSION_NO),
4660	errhint("It looks like you need to initdb.")));
4661	if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
4662	ereport(FATAL,
4663	(errmsg("database files are incompatible with server"),
4664	errdetail("The database cluster was initialized with MAXALIGN %d,"
4665	" but the server was compiled with MAXALIGN %d.",
4666	ControlFile->maxAlign, MAXIMUM_ALIGNOF),
4667	errhint("It looks like you need to initdb.")));
4668	if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
4669	ereport(FATAL,
4670	(errmsg("database files are incompatible with server"),
4671	errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
4672	errhint("It looks like you need to initdb.")));
4673	if (ControlFile->blcksz != BLCKSZ)
4674	ereport(FATAL,
4675	(errmsg("database files are incompatible with server"),
4676	errdetail("The database cluster was initialized with BLCKSZ %d,"
4677	" but the server was compiled with BLCKSZ %d.",
4678	ControlFile->blcksz, BLCKSZ),
4679	errhint("It looks like you need to recompile or initdb.")));
4680	if (ControlFile->relseg_size != RELSEG_SIZE)
4681	ereport(FATAL,
4682	(errmsg("database files are incompatible with server"),
4683	errdetail("The database cluster was initialized with RELSEG_SIZE %d,"
4684	" but the server was compiled with RELSEG_SIZE %d.",
4685	ControlFile->relseg_size, RELSEG_SIZE),
4686	errhint("It looks like you need to recompile or initdb.")));
4687	if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
4688	ereport(FATAL,
4689	(errmsg("database files are incompatible with server"),
4690	errdetail("The database cluster was initialized with XLOG_BLCKSZ %d,"
4691	" but the server was compiled with XLOG_BLCKSZ %d.",
4692	ControlFile->xlog_blcksz, XLOG_BLCKSZ),
4693	errhint("It looks like you need to recompile or initdb.")));
4694	if (ControlFile->nameDataLen != NAMEDATALEN)
4695	ereport(FATAL,
4696	(errmsg("database files are incompatible with server"),
4697	errdetail("The database cluster was initialized with NAMEDATALEN %d,"
4698	" but the server was compiled with NAMEDATALEN %d.",
4699	ControlFile->nameDataLen, NAMEDATALEN),
4700	errhint("It looks like you need to recompile or initdb.")));
4701	if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
4702	ereport(FATAL,
4703	(errmsg("database files are incompatible with server"),
4704	errdetail("The database cluster was initialized with INDEX_MAX_KEYS %d,"
4705	" but the server was compiled with INDEX_MAX_KEYS %d.",
4706	ControlFile->indexMaxKeys, INDEX_MAX_KEYS),
4707	errhint("It looks like you need to recompile or initdb.")));
4708	if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
4709	ereport(FATAL,
4710	(errmsg("database files are incompatible with server"),
4711	errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d,"
4712	" but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
4713	ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE),
4714	errhint("It looks like you need to recompile or initdb.")));
4715	if (ControlFile->loblksize != LOBLKSIZE)
4716	ereport(FATAL,
4717	(errmsg("database files are incompatible with server"),
4718	errdetail("The database cluster was initialized with LOBLKSIZE %d,"
4719	" but the server was compiled with LOBLKSIZE %d.",
4720	ControlFile->loblksize, (int) LOBLKSIZE),
4721	errhint("It looks like you need to recompile or initdb.")));
4722
4723	#ifdef USE_FLOAT4_BYVAL
4724	if (ControlFile->float4ByVal != true)
4725	ereport(FATAL,
4726	(errmsg("database files are incompatible with server"),
4727	errdetail("The database cluster was initialized without USE_FLOAT4_BYVAL"
4728	" but the server was compiled with USE_FLOAT4_BYVAL."),
4729	errhint("It looks like you need to recompile or initdb.")));
4730	#else
4731	if (ControlFile->float4ByVal != false)
4732	ereport(FATAL,
4733	(errmsg("database files are incompatible with server"),
4734	errdetail("The database cluster was initialized with USE_FLOAT4_BYVAL"
4735	" but the server was compiled without USE_FLOAT4_BYVAL."),
4736	errhint("It looks like you need to recompile or initdb.")));
4737	#endif
4738
4739	#ifdef USE_FLOAT8_BYVAL
4740	if (ControlFile->float8ByVal != true)
4741	ereport(FATAL,
4742	(errmsg("database files are incompatible with server"),
4743	errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
4744	" but the server was compiled with USE_FLOAT8_BYVAL."),
4745	errhint("It looks like you need to recompile or initdb.")));
4746	#else
4747	if (ControlFile->float8ByVal != false)
4748	ereport(FATAL,
4749	(errmsg("database files are incompatible with server"),
4750	errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
4751	" but the server was compiled without USE_FLOAT8_BYVAL."),
4752	errhint("It looks like you need to recompile or initdb.")));
4753	#endif
4754
4755	wal_segment_size = ControlFile->xlog_seg_size;
4756
4757	if (!IsValidWalSegSize(wal_segment_size))
4758	ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4759	errmsg_plural("WAL segment size must be a power of two between 1 MB and 1 GB, but the control file specifies %d byte",
4760	"WAL segment size must be a power of two between 1 MB and 1 GB, but the control file specifies %d bytes",
4761	wal_segment_size,
4762	wal_segment_size)));
4763
4764	snprintf(wal_segsz_str, sizeof(wal_segsz_str), "%d", wal_segment_size);
4765	SetConfigOption("wal_segment_size", wal_segsz_str, PGC_INTERNAL,
4766	PGC_S_OVERRIDE);
4767
4768	/ check and update variables dependent on wal_segment_size /
4769	if (ConvertToXSegs(min_wal_size_mb, wal_segment_size) < `2`)
4770	ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4771	errmsg("\"min_wal_size\" must be at least twice \"wal_segment_size\"")));
4772
4773	if (ConvertToXSegs(max_wal_size_mb, wal_segment_size) < `2`)
4774	ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4775	errmsg("\"max_wal_size\" must be at least twice \"wal_segment_size\"")));
4776
4777	UsableBytesInSegment =
4778	(wal_segment_size / XLOG_BLCKSZ * UsableBytesInPage) -
4779	(SizeOfXLogLongPHD - SizeOfXLogShortPHD);
4780
4781	CalculateCheckpointSegments();
4782
4783	/ Make the initdb settings visible as GUC variables, too /
4784	SetConfigOption("data_checksums", DataChecksumsEnabled() ? "yes" : "no",
4785	PGC_INTERNAL, PGC_S_OVERRIDE);
4786	}
4787
4788	/*
4789	* Utility wrapper to update the control file. Note that the control
4790	* file gets flushed.
4791	*/
4792	void
4793	UpdateControlFile(void)
4794	{
4795	update_controlfile(DataDir, ControlFile, true);
4796	}
4797
4798	/*
4799	* Returns the unique system identifier from control file.
4800	*/
4801	uint64
4802	GetSystemIdentifier(void)
4803	{
4804	Assert(ControlFile != NULL);
4805	return ControlFile->system_identifier;
4806	}
4807
4808	/*
4809	* Returns the random nonce from control file.
4810	*/
4811	char *
4812	GetMockAuthenticationNonce(void)
4813	{
4814	Assert(ControlFile != NULL);
4815	return ControlFile->mock_authentication_nonce;
4816	}
4817
4818	/*
4819	* Are checksums enabled for data pages?
4820	*/
4821	bool
4822	DataChecksumsEnabled(void)
4823	{
4824	Assert(ControlFile != NULL);
4825	return (ControlFile->data_checksum_version > `0`);
4826	}
4827
4828	/*
4829	* Returns a fake LSN for unlogged relations.
4830	*
4831	* Each call generates an LSN that is greater than any previous value
4832	* returned. The current counter value is saved and restored across clean
4833	* shutdowns, but like unlogged relations, does not survive a crash. This can
4834	* be used in lieu of real LSN values returned by XLogInsert, if you need an
4835	* LSN-like increasing sequence of numbers without writing any WAL.
4836	*/
4837	XLogRecPtr
4838	GetFakeLSNForUnloggedRel(void)
4839	{
4840	XLogRecPtr nextUnloggedLSN;
4841
4842	/ increment the unloggedLSN counter, need SpinLock /
4843	SpinLockAcquire(&XLogCtl->ulsn_lck);
4844	nextUnloggedLSN = XLogCtl->unloggedLSN++;
4845	SpinLockRelease(&XLogCtl->ulsn_lck);
4846
4847	return nextUnloggedLSN;
4848	}
4849
4850	/*
4851	* Auto-tune the number of XLOG buffers.
4852	*
4853	* The preferred setting for wal_buffers is about 3% of shared_buffers, with
4854	* a maximum of one XLOG segment (there is little reason to think that more
4855	* is helpful, at least so long as we force an fsync when switching log files)
4856	* and a minimum of 8 blocks (which was the default value prior to PostgreSQL
4857	* 9.1, when auto-tuning was added).
4858	*
4859	* This should not be called until NBuffers has received its final value.
4860	*/
4861	static int
4862	XLOGChooseNumBuffers(void)
4863	{
4864	int xbuffers;
4865
4866	xbuffers = NBuffers / `32`;
4867	if (xbuffers > (wal_segment_size / XLOG_BLCKSZ))
4868	xbuffers = (wal_segment_size / XLOG_BLCKSZ);
4869	if (xbuffers < `8`)
4870	xbuffers = `8`;
4871	return xbuffers;
4872	}
4873
4874	/*
4875	* GUC check_hook for wal_buffers
4876	*/
4877	bool
4878	check_wal_buffers(int newval, void* **extra, GucSource source)
4879	{
4880	/*
4881	* -1 indicates a request for auto-tune.
4882	*/
4883	if (*newval == -`1`)
4884	{
4885	/*
4886	* If we haven't yet changed the boot_val default of -1, just let it
4887	* be. We'll fix it when XLOGShmemSize is called.
4888	*/
4889	if (XLOGbuffers == -`1`)
4890	return true;
4891
4892	/ Otherwise, substitute the auto-tune value /
4893	*newval = XLOGChooseNumBuffers();
4894	}
4895
4896	/*
4897	* We clamp manually-set values to at least 4 blocks. Prior to PostgreSQL
4898	* 9.1, a minimum of 4 was enforced by guc.c, but since that is no longer
4899	* the case, we just silently treat such values as a request for the
4900	* minimum. (We could throw an error instead, but that doesn't seem very
4901	* helpful.)
4902	*/
4903	if (*newval < `4`)
4904	*newval = `4`;
4905
4906	return true;
4907	}
4908
4909	/*
4910	* Read the control file, set respective GUCs.
4911	*
4912	* This is to be called during startup, including a crash recovery cycle,
4913	* unless in bootstrap mode, where no control file yet exists. As there's no
4914	* usable shared memory yet (its sizing can depend on the contents of the
4915	* control file!), first store the contents in local memory. XLOGShmemInit()
4916	* will then copy it to shared memory later.
4917	*
4918	* reset just controls whether previous contents are to be expected (in the
4919	* reset case, there's a dangling pointer into old shared memory), or not.
4920	*/
4921	void
4922	LocalProcessControlFile(bool reset)
4923	{
4924	Assert(reset \|\| ControlFile == NULL);
4925	ControlFile = palloc(sizeof(ControlFileData));
4926	ReadControlFile();
4927	}
4928
4929	/*
4930	* Initialization of shared memory for XLOG
4931	*/
4932	Size
4933	XLOGShmemSize(void)
4934	{
4935	Size size;
4936
4937	/*
4938	* If the value of wal_buffers is -1, use the preferred auto-tune value.
4939	* This isn't an amazingly clean place to do this, but we must wait till
4940	* NBuffers has received its final value, and must do it before using the
4941	* value of XLOGbuffers to do anything important.
4942	*/
4943	if (XLOGbuffers == -`1`)
4944	{
4945	char buf[`32`];
4946
4947	snprintf(buf, sizeof(buf), "%d", XLOGChooseNumBuffers());
4948	SetConfigOption("wal_buffers", buf, PGC_POSTMASTER, PGC_S_OVERRIDE);
4949	}
4950	Assert(XLOGbuffers > `0`);
4951
4952	/ XLogCtl /
4953	size = sizeof(XLogCtlData);
4954
4955	/ WAL insertion locks, plus alignment /
4956	size = add_size(size, mul_size(sizeof(WALInsertLockPadded), NUM_XLOGINSERT_LOCKS + `1`));
4957	/ xlblocks array /
4958	size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
4959	/ extra alignment padding for XLOG I/O buffers /
4960	size = add_size(size, XLOG_BLCKSZ);
4961	/ and the buffers themselves /
4962	size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
4963
4964	/*
4965	* Note: we don't count ControlFileData, it comes out of the "slop factor"
4966	* added by CreateSharedMemoryAndSemaphores. This lets us use this
4967	* routine again below to compute the actual allocation size.
4968	*/
4969
4970	return size;
4971	}
4972
4973	void
4974	XLOGShmemInit(void)
4975	{
4976	bool foundCFile,
4977	foundXLog;
4978	char *allocptr;
4979	int i;
4980	ControlFileData *localControlFile;
4981
4982	#ifdef WAL_DEBUG
4983
4984	/*
4985	* Create a memory context for WAL debugging that's exempt from the normal
4986	* "no pallocs in critical section" rule. Yes, that can lead to a PANIC if
4987	* an allocation fails, but wal_debug is not for production use anyway.
4988	*/
4989	if (walDebugCxt == NULL)
4990	{
4991	walDebugCxt = AllocSetContextCreate(TopMemoryContext,
4992	"WAL Debug",
4993	ALLOCSET_DEFAULT_SIZES);
4994	MemoryContextAllowInCriticalSection(walDebugCxt, true);
4995	}
4996	#endif
4997
4998
4999	XLogCtl = (XLogCtlData *)
5000	ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);
5001
5002	localControlFile = ControlFile;
5003	ControlFile = (ControlFileData *)
5004	ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
5005
5006	if (foundCFile \|\| foundXLog)
5007	{
5008	/ both should be present or neither /
5009	Assert(foundCFile && foundXLog);
5010
5011	/ Initialize local copy of WALInsertLocks and register the tranche /
5012	WALInsertLocks = XLogCtl->Insert.WALInsertLocks;
5013	LWLockRegisterTranche(LWTRANCHE_WAL_INSERT,
5014	"wal_insert");
5015
5016	if (localControlFile)
5017	pfree(localControlFile);
5018	return;
5019	}
5020	memset(XLogCtl, `0`, sizeof(XLogCtlData));
5021
5022	/*
5023	* Already have read control file locally, unless in bootstrap mode. Move
5024	* contents into shared memory.
5025	*/
5026	if (localControlFile)
5027	{
5028	memcpy(ControlFile, localControlFile, sizeof(ControlFileData));
5029	pfree(localControlFile);
5030	}
5031
5032	/*
5033	* Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
5034	* multiple of the alignment for same, so no extra alignment padding is
5035	* needed here.
5036	*/
5037	allocptr = ((char ) XLogCtl) + sizeof*(XLogCtlData);
5038	XLogCtl->xlblocks = (XLogRecPtr *) allocptr;
5039	memset(XLogCtl->xlblocks, `0`, sizeof(XLogRecPtr) * XLOGbuffers);
5040	allocptr += sizeof(XLogRecPtr) * XLOGbuffers;
5041
5042
5043	/ WAL insertion locks. Ensure they're aligned to the full padded size /
5044	allocptr += sizeof(WALInsertLockPadded) -
5045	((uintptr_t) allocptr) % sizeof(WALInsertLockPadded);
5046	WALInsertLocks = XLogCtl->Insert.WALInsertLocks =
5047	(WALInsertLockPadded *) allocptr;
5048	allocptr += sizeof(WALInsertLockPadded) * NUM_XLOGINSERT_LOCKS;
5049
5050	LWLockRegisterTranche(LWTRANCHE_WAL_INSERT, "wal_insert");
5051	for (i = `0`; i < NUM_XLOGINSERT_LOCKS; i++)
5052	{
5053	LWLockInitialize(&WALInsertLocks[i].l.lock, LWTRANCHE_WAL_INSERT);
5054	WALInsertLocks[i].l.insertingAt = InvalidXLogRecPtr;
5055	WALInsertLocks[i].l.lastImportantAt = InvalidXLogRecPtr;
5056	}
5057
5058	/*
5059	* Align the start of the page buffers to a full xlog block size boundary.
5060	* This simplifies some calculations in XLOG insertion. It is also
5061	* required for O_DIRECT.
5062	*/
5063	allocptr = (char *) TYPEALIGN(XLOG_BLCKSZ, allocptr);
5064	XLogCtl->pages = allocptr;
5065	memset(XLogCtl->pages, `0`, (Size) XLOG_BLCKSZ * XLOGbuffers);
5066
5067	/*
5068	* Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
5069	* in additional info.)
5070	*/
5071	XLogCtl->XLogCacheBlck = XLOGbuffers - `1`;
5072	XLogCtl->SharedRecoveryInProgress = true;
5073	XLogCtl->SharedHotStandbyActive = false;
5074	XLogCtl->WalWriterSleeping = false;
5075
5076	SpinLockInit(&XLogCtl->Insert.insertpos_lck);
5077	SpinLockInit(&XLogCtl->info_lck);
5078	SpinLockInit(&XLogCtl->ulsn_lck);
5079	InitSharedLatch(&XLogCtl->recoveryWakeupLatch);
5080	}
5081
5082	/*
5083	* This func must be called ONCE on system install. It creates pg_control
5084	* and the initial XLOG segment.
5085	*/
5086	void
5087	BootStrapXLOG(void)
5088	{
5089	CheckPoint checkPoint;
5090	char *buffer;
5091	XLogPageHeader page;
5092	XLogLongPageHeader longpage;
5093	XLogRecord *record;
5094	char *recptr;
5095	bool use_existent;
5096	uint64 sysidentifier;
5097	char mock_auth_nonce[MOCK_AUTH_NONCE_LEN];
5098	struct timeval tv;
5099	pg_crc32c crc;
5100
5101	/*
5102	* Select a hopefully-unique system identifier code for this installation.
5103	* We use the result of gettimeofday(), including the fractional seconds
5104	* field, as being about as unique as we can easily get. (Think not to
5105	* use random(), since it hasn't been seeded and there's no portable way
5106	* to seed it other than the system clock value...) The upper half of the
5107	* uint64 value is just the tv_sec part, while the lower half contains the
5108	* tv_usec part (which must fit in 20 bits), plus 12 bits from our current
5109	* PID for a little extra uniqueness. A person knowing this encoding can
5110	* determine the initialization time of the installation, which could
5111	* perhaps be useful sometimes.
5112	*/
5113	gettimeofday(&tv, NULL);
5114	sysidentifier = ((uint64) tv.tv_sec) << `32`;
5115	sysidentifier \|= ((uint64) tv.tv_usec) << `12`;
5116	sysidentifier \|= getpid() & `0xFFF`;
5117
5118	/*
5119	* Generate a random nonce. This is used for authentication requests that
5120	* will fail because the user does not exist. The nonce is used to create
5121	* a genuine-looking password challenge for the non-existent user, in lieu
5122	* of an actual stored password.
5123	*/
5124	if (!pg_strong_random(mock_auth_nonce, MOCK_AUTH_NONCE_LEN))
5125	ereport(PANIC,
5126	(errcode(ERRCODE_INTERNAL_ERROR),
5127	errmsg("could not generate secret authorization token")));
5128
5129	/ First timeline ID is always 1 /
5130	ThisTimeLineID = `1`;
5131
5132	/ page buffer must be aligned suitably for O_DIRECT /
5133	buffer = (char *) palloc(XLOG_BLCKSZ + XLOG_BLCKSZ);
5134	page = (XLogPageHeader) TYPEALIGN(XLOG_BLCKSZ, buffer);
5135	memset(page, `0`, XLOG_BLCKSZ);
5136
5137	/*
5138	* Set up information for the initial checkpoint record
5139	*
5140	* The initial checkpoint record is written to the beginning of the WAL
5141	* segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not
5142	* used, so that we can use 0/0 to mean "before any valid WAL segment".
5143	*/
5144	checkPoint.redo = wal_segment_size + SizeOfXLogLongPHD;
5145	checkPoint.ThisTimeLineID = ThisTimeLineID;
5146	checkPoint.PrevTimeLineID = ThisTimeLineID;
5147	checkPoint.fullPageWrites = fullPageWrites;
5148	checkPoint.nextFullXid =
5149	FullTransactionIdFromEpochAndXid(`0`, FirstNormalTransactionId);
5150	checkPoint.nextOid = FirstBootstrapObjectId;
5151	checkPoint.nextMulti = FirstMultiXactId;
5152	checkPoint.nextMultiOffset = `0`;
5153	checkPoint.oldestXid = FirstNormalTransactionId;
5154	checkPoint.oldestXidDB = TemplateDbOid;
5155	checkPoint.oldestMulti = FirstMultiXactId;
5156	checkPoint.oldestMultiDB = TemplateDbOid;
5157	checkPoint.oldestCommitTsXid = InvalidTransactionId;
5158	checkPoint.newestCommitTsXid = InvalidTransactionId;
5159	checkPoint.time = (pg_time_t) time(NULL);
5160	checkPoint.oldestActiveXid = InvalidTransactionId;
5161
5162	ShmemVariableCache->nextFullXid = checkPoint.nextFullXid;
5163	ShmemVariableCache->nextOid = checkPoint.nextOid;
5164	ShmemVariableCache->oidCount = `0`;
5165	MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
5166	AdvanceOldestClogXid(checkPoint.oldestXid);
5167	SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
5168	SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
5169	SetCommitTsLimit(InvalidTransactionId, InvalidTransactionId);
5170
5171	/ Set up the XLOG page header /
5172	page->xlp_magic = XLOG_PAGE_MAGIC;
5173	page->xlp_info = XLP_LONG_HEADER;
5174	page->xlp_tli = ThisTimeLineID;
5175	page->xlp_pageaddr = wal_segment_size;
5176	longpage = (XLogLongPageHeader) page;
5177	longpage->xlp_sysid = sysidentifier;
5178	longpage->xlp_seg_size = wal_segment_size;
5179	longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
5180
5181	/ Insert the initial checkpoint record /
5182	recptr = ((char *) page + SizeOfXLogLongPHD);
5183	record = (XLogRecord *) recptr;
5184	record->xl_prev = `0`;
5185	record->xl_xid = InvalidTransactionId;
5186	record->xl_tot_len = SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(checkPoint);
5187	record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
5188	record->xl_rmid = RM_XLOG_ID;
5189	recptr += SizeOfXLogRecord;
5190	/ fill the XLogRecordDataHeaderShort struct /
5191	(recptr++) = (char*) XLR_BLOCK_ID_DATA_SHORT;
5192	(recptr++) = sizeof*(checkPoint);
5193	memcpy(recptr, &checkPoint, sizeof(checkPoint));
5194	recptr += sizeof(checkPoint);
5195	Assert(recptr - (char *) record == record->xl_tot_len);
5196
5197	INIT_CRC32C(crc);
5198	COMP_CRC32C(crc, ((char *) record) + SizeOfXLogRecord, record->xl_tot_len - SizeOfXLogRecord);
5199	COMP_CRC32C(crc, (char *) record, offsetof(XLogRecord, xl_crc));
5200	FIN_CRC32C(crc);
5201	record->xl_crc = crc;
5202
5203	/ Create first XLOG segment file /
5204	use_existent = false;
5205	openLogFile = XLogFileInit(`1`, &use_existent, false);
5206
5207	/ Write the first page with the initial record /
5208	errno = `0`;
5209	pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_WRITE);
5210	if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
5211	{
5212	/ if write didn't set errno, assume problem is no disk space /
5213	if (errno == `0`)
5214	errno = ENOSPC;
5215	ereport(PANIC,
5216	(errcode_for_file_access(),
5217	errmsg("could not write bootstrap write-ahead log file: %m")));
5218	}
5219	pgstat_report_wait_end();
5220
5221	pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_SYNC);
5222	if (pg_fsync(openLogFile) != `0`)
5223	ereport(PANIC,
5224	(errcode_for_file_access(),
5225	errmsg("could not fsync bootstrap write-ahead log file: %m")));
5226	pgstat_report_wait_end();
5227
5228	if (close(openLogFile))
5229	ereport(PANIC,
5230	(errcode_for_file_access(),
5231	errmsg("could not close bootstrap write-ahead log file: %m")));
5232
5233	openLogFile = -`1`;
5234
5235	/ Now create pg_control /
5236
5237	memset(ControlFile, `0`, sizeof(ControlFileData));
5238	/ Initialize pg_control status fields /
5239	ControlFile->system_identifier = sysidentifier;
5240	memcpy(ControlFile->mock_authentication_nonce, mock_auth_nonce, MOCK_AUTH_NONCE_LEN);
5241	ControlFile->state = DB_SHUTDOWNED;
5242	ControlFile->time = checkPoint.time;
5243	ControlFile->checkPoint = checkPoint.redo;
5244	ControlFile->checkPointCopy = checkPoint;
5245	ControlFile->unloggedLSN = FirstNormalUnloggedLSN;
5246
5247	/ Set important parameter values for use when replaying WAL /
5248	ControlFile->MaxConnections = MaxConnections;
5249	ControlFile->max_worker_processes = max_worker_processes;
5250	ControlFile->max_wal_senders = max_wal_senders;
5251	ControlFile->max_prepared_xacts = max_prepared_xacts;
5252	ControlFile->max_locks_per_xact = max_locks_per_xact;
5253	ControlFile->wal_level = wal_level;
5254	ControlFile->wal_log_hints = wal_log_hints;
5255	ControlFile->track_commit_timestamp = track_commit_timestamp;
5256	ControlFile->data_checksum_version = bootstrap_data_checksum_version;
5257
5258	/ some additional ControlFile fields are set in WriteControlFile() /
5259
5260	WriteControlFile();
5261
5262	/ Bootstrap the commit log, too /
5263	BootStrapCLOG();
5264	BootStrapCommitTs();
5265	BootStrapSUBTRANS();
5266	BootStrapMultiXact();
5267
5268	pfree(buffer);
5269
5270	/*
5271	* Force control file to be read - in contrast to normal processing we'd
5272	* otherwise never run the checks and GUC related initializations therein.
5273	*/
5274	ReadControlFile();
5275	}
5276
5277	static char *
5278	str_time(pg_time_t tnow)
5279	{
5280	static char buf[`128`];
5281
5282	pg_strftime(buf, sizeof(buf),
5283	"%Y-%m-%d %H:%M:%S %Z",
5284	pg_localtime(&tnow, log_timezone));
5285
5286	return buf;
5287	}
5288
5289	/*
5290	* See if there are any recovery signal files and if so, set state for
5291	* recovery.
5292	*
5293	* See if there is a recovery command file (recovery.conf), and if so
5294	* throw an ERROR since as of PG12 we no longer recognize that.
5295	*/
5296	static void
5297	readRecoverySignalFile(void)
5298	{
5299	struct stat stat_buf;
5300
5301	if (IsBootstrapProcessingMode())
5302	return;
5303
5304	/*
5305	* Check for old recovery API file: recovery.conf
5306	*/
5307	if (stat(RECOVERY_COMMAND_FILE, &stat_buf) == `0`)
5308	ereport(FATAL,
5309	(errcode_for_file_access(),
5310	errmsg("using recovery command file \"%s\" is not supported",
5311	RECOVERY_COMMAND_FILE)));
5312
5313	/*
5314	* Remove unused .done file, if present. Ignore if absent.
5315	*/
5316	unlink(RECOVERY_COMMAND_DONE);
5317
5318	/*
5319	* Check for recovery signal files and if found, fsync them since they
5320	* represent server state information. We don't sweat too much about the
5321	* possibility of fsync failure, however.
5322	*
5323	* If present, standby signal file takes precedence. If neither is present
5324	* then we won't enter archive recovery.
5325	*/
5326	if (stat(STANDBY_SIGNAL_FILE, &stat_buf) == `0`)
5327	{
5328	int fd;
5329
5330	fd = BasicOpenFilePerm(STANDBY_SIGNAL_FILE, O_RDWR \| PG_BINARY \| get_sync_bit(sync_method),
5331	S_IRUSR \| S_IWUSR);
5332	if (fd >= `0`)
5333	{
5334	(void) pg_fsync(fd);
5335	close(fd);
5336	}
5337	standby_signal_file_found = true;
5338	}
5339	else if (stat(RECOVERY_SIGNAL_FILE, &stat_buf) == `0`)
5340	{
5341	int fd;
5342
5343	fd = BasicOpenFilePerm(RECOVERY_SIGNAL_FILE, O_RDWR \| PG_BINARY \| get_sync_bit(sync_method),
5344	S_IRUSR \| S_IWUSR);
5345	if (fd >= `0`)
5346	{
5347	(void) pg_fsync(fd);
5348	close(fd);
5349	}
5350	recovery_signal_file_found = true;
5351	}
5352
5353	StandbyModeRequested = false;
5354	ArchiveRecoveryRequested = false;
5355	if (standby_signal_file_found)
5356	{
5357	StandbyModeRequested = true;
5358	ArchiveRecoveryRequested = true;
5359	}
5360	else if (recovery_signal_file_found)
5361	{
5362	StandbyModeRequested = false;
5363	ArchiveRecoveryRequested = true;
5364	}
5365	else
5366	return;
5367
5368	/*
5369	* We don't support standby mode in standalone backends; that requires
5370	* other processes such as the WAL receiver to be alive.
5371	*/
5372	if (StandbyModeRequested && !IsUnderPostmaster)
5373	ereport(FATAL,
5374	(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
5375	errmsg("standby mode is not supported by single-user servers")));
5376	}
5377
5378	static void
5379	validateRecoveryParameters(void)
5380	{
5381	if (!ArchiveRecoveryRequested)
5382	return;
5383
5384	/*
5385	* Check for compulsory parameters
5386	*/
5387	if (StandbyModeRequested)
5388	{
5389	if ((PrimaryConnInfo == NULL \|\| strcmp(PrimaryConnInfo, "") == `0`) &&
5390	(recoveryRestoreCommand == NULL \|\| strcmp(recoveryRestoreCommand, "") == `0`))
5391	ereport(WARNING,
5392	(errmsg("specified neither primary_conninfo nor restore_command"),
5393	errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there.")));
5394	}
5395	else
5396	{
5397	if (recoveryRestoreCommand == NULL \|\|
5398	strcmp(recoveryRestoreCommand, "") == `0`)
5399	ereport(FATAL,
5400	(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5401	errmsg("must specify restore_command when standby mode is not enabled")));
5402	}
5403
5404	/*
5405	* Override any inconsistent requests. Note that this is a change of
5406	* behaviour in 9.5; prior to this we simply ignored a request to pause if
5407	* hot_standby = off, which was surprising behaviour.
5408	*/
5409	if (recoveryTargetAction == RECOVERY_TARGET_ACTION_PAUSE &&
5410	!EnableHotStandby)
5411	recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN;
5412
5413	/*
5414	* Final parsing of recovery_target_time string; see also
5415	* check_recovery_target_time().
5416	*/
5417	if (recoveryTarget == RECOVERY_TARGET_TIME)
5418	{
5419	recoveryTargetTime = DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
5420	CStringGetDatum(recovery_target_time_string),
5421	ObjectIdGetDatum(InvalidOid),
5422	Int32GetDatum(-`1`)));
5423	}
5424
5425	/*
5426	* If user specified recovery_target_timeline, validate it or compute the
5427	* "latest" value. We can't do this until after we've gotten the restore
5428	* command and set InArchiveRecovery, because we need to fetch timeline
5429	* history files from the archive.
5430	*/
5431	if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_NUMERIC)
5432	{
5433	TimeLineID rtli = recoveryTargetTLIRequested;
5434
5435	/ Timeline 1 does not have a history file, all else should /
5436	if (rtli != `1` && !existsTimeLineHistory(rtli))
5437	ereport(FATAL,
5438	(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5439	errmsg("recovery target timeline %u does not exist",
5440	rtli)));
5441	recoveryTargetTLI = rtli;
5442	}
5443	else if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST)
5444	{
5445	/ We start the "latest" search from pg_control's timeline /
5446	recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
5447	}
5448	else
5449	{
5450	/*
5451	* else we just use the recoveryTargetTLI as already read from
5452	* ControlFile
5453	*/
5454	Assert(recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_CONTROLFILE);
5455	}
5456	}
5457
5458	/*
5459	* Exit archive-recovery state
5460	*/
5461	static void
5462	exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog)
5463	{
5464	char recoveryPath[MAXPGPATH];
5465	char xlogfname[MAXFNAMELEN];
5466	XLogSegNo endLogSegNo;
5467	XLogSegNo startLogSegNo;
5468
5469	/ we always switch to a new timeline after archive recovery /
5470	Assert(endTLI != ThisTimeLineID);
5471
5472	/*
5473	* We are no longer in archive recovery state.
5474	*/
5475	InArchiveRecovery = false;
5476
5477	/*
5478	* Update min recovery point one last time.
5479	*/
5480	UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
5481
5482	/*
5483	* If the ending log segment is still open, close it (to avoid problems on
5484	* Windows with trying to rename or delete an open file).
5485	*/
5486	if (readFile >= `0`)
5487	{
5488	close(readFile);
5489	readFile = -`1`;
5490	}
5491
5492	/*
5493	* Calculate the last segment on the old timeline, and the first segment
5494	* on the new timeline. If the switch happens in the middle of a segment,
5495	* they are the same, but if the switch happens exactly at a segment
5496	* boundary, startLogSegNo will be endLogSegNo + 1.
5497	*/
5498	XLByteToPrevSeg(endOfLog, endLogSegNo, wal_segment_size);
5499	XLByteToSeg(endOfLog, startLogSegNo, wal_segment_size);
5500
5501	/*
5502	* Initialize the starting WAL segment for the new timeline. If the switch
5503	* happens in the middle of a segment, copy data from the last WAL segment
5504	* of the old timeline up to the switch point, to the starting WAL segment
5505	* on the new timeline.
5506	*/
5507	if (endLogSegNo == startLogSegNo)
5508	{
5509	/*
5510	* Make a copy of the file on the new timeline.
5511	*
5512	* Writing WAL isn't allowed yet, so there are no locking
5513	* considerations. But we should be just as tense as XLogFileInit to
5514	* avoid emplacing a bogus file.
5515	*/
5516	XLogFileCopy(endLogSegNo, endTLI, endLogSegNo,
5517	XLogSegmentOffset(endOfLog, wal_segment_size));
5518	}
5519	else
5520	{
5521	/*
5522	* The switch happened at a segment boundary, so just create the next
5523	* segment on the new timeline.
5524	*/
5525	bool use_existent = true;
5526	int fd;
5527
5528	fd = XLogFileInit(startLogSegNo, &use_existent, true);
5529
5530	if (close(fd))
5531	ereport(ERROR,
5532	(errcode_for_file_access(),
5533	errmsg("could not close file \"%s\": %m",
5534	XLogFileNameP(ThisTimeLineID, startLogSegNo))));
5535	}
5536
5537	/*
5538	* Let's just make real sure there are not .ready or .done flags posted
5539	* for the new segment.
5540	*/
5541	XLogFileName(xlogfname, ThisTimeLineID, startLogSegNo, wal_segment_size);
5542	XLogArchiveCleanup(xlogfname);
5543
5544	/*
5545	* Since there might be a partial WAL segment named RECOVERYXLOG, get rid
5546	* of it.
5547	*/
5548	snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
5549	unlink(recoveryPath); / ignore any error /
5550
5551	/ Get rid of any remaining recovered timeline-history file, too /
5552	snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
5553	unlink(recoveryPath); / ignore any error /
5554
5555	/*
5556	* Remove the signal files out of the way, so that we don't accidentally
5557	* re-enter archive recovery mode in a subsequent crash.
5558	*/
5559	if (standby_signal_file_found)
5560	durable_unlink(STANDBY_SIGNAL_FILE, FATAL);
5561
5562	if (recovery_signal_file_found)
5563	durable_unlink(RECOVERY_SIGNAL_FILE, FATAL);
5564
5565	ereport(LOG,
5566	(errmsg("archive recovery complete")));
5567	}
5568
5569	/*
5570	* Extract timestamp from WAL record.
5571	*
5572	* If the record contains a timestamp, returns true, and saves the timestamp
5573	* in *recordXtime. If the record type has no timestamp, returns false.
5574	* Currently, only transaction commit/abort records and restore points contain
5575	* timestamps.
5576	*/
5577	static bool
5578	getRecordTimestamp(XLogReaderState record, TimestampTz recordXtime)
5579	{
5580	uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
5581	uint8 xact_info = info & XLOG_XACT_OPMASK;
5582	uint8 rmid = XLogRecGetRmid(record);
5583
5584	if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
5585	{
5586	recordXtime = ((xl_restore_point ) XLogRecGetData(record))->rp_time;
5587	return true;
5588	}
5589	if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT \|\|
5590	xact_info == XLOG_XACT_COMMIT_PREPARED))
5591	{
5592	recordXtime = ((xl_xact_commit ) XLogRecGetData(record))->xact_time;
5593	return true;
5594	}
5595	if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT \|\|
5596	xact_info == XLOG_XACT_ABORT_PREPARED))
5597	{
5598	recordXtime = ((xl_xact_abort ) XLogRecGetData(record))->xact_time;
5599	return true;
5600	}
5601	return false;
5602	}
5603
5604	/*
5605	* For point-in-time recovery, this function decides whether we want to
5606	* stop applying the XLOG before the current record.
5607	*
5608	* Returns true if we are stopping, false otherwise. If stopping, some
5609	* information is saved in recoveryStopXid et al for use in annotating the
5610	* new timeline's history file.
5611	*/
5612	static bool
5613	recoveryStopsBefore(XLogReaderState *record)
5614	{
5615	bool stopsHere = false;
5616	uint8 xact_info;
5617	bool isCommit;
5618	TimestampTz recordXtime = `0`;
5619	TransactionId recordXid;
5620
5621	/*
5622	* Ignore recovery target settings when not in archive recovery (meaning
5623	* we are in crash recovery).
5624	*/
5625	if (!ArchiveRecoveryRequested)
5626	return false;
5627
5628	/ Check if we should stop as soon as reaching consistency /
5629	if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
5630	{
5631	ereport(LOG,
5632	(errmsg("recovery stopping after reaching consistency")));
5633
5634	recoveryStopAfter = false;
5635	recoveryStopXid = InvalidTransactionId;
5636	recoveryStopLSN = InvalidXLogRecPtr;
5637	recoveryStopTime = `0`;
5638	recoveryStopName[`0`] = `'\0'`;
5639	return true;
5640	}
5641
5642	/ Check if target LSN has been reached /
5643	if (recoveryTarget == RECOVERY_TARGET_LSN &&
5644	!recoveryTargetInclusive &&
5645	record->ReadRecPtr >= recoveryTargetLSN)
5646	{
5647	recoveryStopAfter = false;
5648	recoveryStopXid = InvalidTransactionId;
5649	recoveryStopLSN = record->ReadRecPtr;
5650	recoveryStopTime = `0`;
5651	recoveryStopName[`0`] = `'\0'`;
5652	ereport(LOG,
5653	(errmsg("recovery stopping before WAL location (LSN) \"%X/%X\"",
5654	(uint32) (recoveryStopLSN >> `32`),
5655	(uint32) recoveryStopLSN)));
5656	return true;
5657	}
5658
5659	/ Otherwise we only consider stopping before COMMIT or ABORT records. /
5660	if (XLogRecGetRmid(record) != RM_XACT_ID)
5661	return false;
5662
5663	xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
5664
5665	if (xact_info == XLOG_XACT_COMMIT)
5666	{
5667	isCommit = true;
5668	recordXid = XLogRecGetXid(record);
5669	}
5670	else if (xact_info == XLOG_XACT_COMMIT_PREPARED)
5671	{
5672	xl_xact_commit xlrec = (xl_xact_commit ) XLogRecGetData(record);
5673	xl_xact_parsed_commit parsed;
5674
5675	isCommit = true;
5676	ParseCommitRecord(XLogRecGetInfo(record),
5677	xlrec,
5678	&parsed);
5679	recordXid = parsed.twophase_xid;
5680	}
5681	else if (xact_info == XLOG_XACT_ABORT)
5682	{
5683	isCommit = false;
5684	recordXid = XLogRecGetXid(record);
5685	}
5686	else if (xact_info == XLOG_XACT_ABORT_PREPARED)
5687	{
5688	xl_xact_abort xlrec = (xl_xact_abort ) XLogRecGetData(record);
5689	xl_xact_parsed_abort parsed;
5690
5691	isCommit = true;
5692	ParseAbortRecord(XLogRecGetInfo(record),
5693	xlrec,
5694	&parsed);
5695	recordXid = parsed.twophase_xid;
5696	}
5697	else
5698	return false;
5699
5700	if (recoveryTarget == RECOVERY_TARGET_XID && !recoveryTargetInclusive)
5701	{
5702	/*
5703	* There can be only one transaction end record with this exact
5704	* transactionid
5705	*
5706	* when testing for an xid, we MUST test for equality only, since
5707	* transactions are numbered in the order they start, not the order
5708	* they complete. A higher numbered xid will complete before you about
5709	* 50% of the time...
5710	*/
5711	stopsHere = (recordXid == recoveryTargetXid);
5712	}
5713
5714	if (recoveryTarget == RECOVERY_TARGET_TIME &&
5715	getRecordTimestamp(record, &recordXtime))
5716	{
5717	/*
5718	* There can be many transactions that share the same commit time, so
5719	* we stop after the last one, if we are inclusive, or stop at the
5720	* first one if we are exclusive
5721	*/
5722	if (recoveryTargetInclusive)
5723	stopsHere = (recordXtime > recoveryTargetTime);
5724	else
5725	stopsHere = (recordXtime >= recoveryTargetTime);
5726	}
5727
5728	if (stopsHere)
5729	{
5730	recoveryStopAfter = false;
5731	recoveryStopXid = recordXid;
5732	recoveryStopTime = recordXtime;
5733	recoveryStopLSN = InvalidXLogRecPtr;
5734	recoveryStopName[`0`] = `'\0'`;
5735
5736	if (isCommit)
5737	{
5738	ereport(LOG,
5739	(errmsg("recovery stopping before commit of transaction %u, time %s",
5740	recoveryStopXid,
5741	timestamptz_to_str(recoveryStopTime))));
5742	}
5743	else
5744	{
5745	ereport(LOG,
5746	(errmsg("recovery stopping before abort of transaction %u, time %s",
5747	recoveryStopXid,
5748	timestamptz_to_str(recoveryStopTime))));
5749	}
5750	}
5751
5752	return stopsHere;
5753	}
5754
5755	/*
5756	* Same as recoveryStopsBefore, but called after applying the record.
5757	*
5758	* We also track the timestamp of the latest applied COMMIT/ABORT
5759	* record in XLogCtl->recoveryLastXTime.
5760	*/
5761	static bool
5762	recoveryStopsAfter(XLogReaderState *record)
5763	{
5764	uint8 info;
5765	uint8 xact_info;
5766	uint8 rmid;
5767	TimestampTz recordXtime;
5768
5769	/*
5770	* Ignore recovery target settings when not in archive recovery (meaning
5771	* we are in crash recovery).
5772	*/
5773	if (!ArchiveRecoveryRequested)
5774	return false;
5775
5776	info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
5777	rmid = XLogRecGetRmid(record);
5778
5779	/*
5780	* There can be many restore points that share the same name; we stop at
5781	* the first one.
5782	*/
5783	if (recoveryTarget == RECOVERY_TARGET_NAME &&
5784	rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
5785	{
5786	xl_restore_point *recordRestorePointData;
5787
5788	recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
5789
5790	if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == `0`)
5791	{
5792	recoveryStopAfter = true;
5793	recoveryStopXid = InvalidTransactionId;
5794	recoveryStopLSN = InvalidXLogRecPtr;
5795	(void) getRecordTimestamp(record, &recoveryStopTime);
5796	strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
5797
5798	ereport(LOG,
5799	(errmsg("recovery stopping at restore point \"%s\", time %s",
5800	recoveryStopName,
5801	timestamptz_to_str(recoveryStopTime))));
5802	return true;
5803	}
5804	}
5805
5806	/ Check if the target LSN has been reached /
5807	if (recoveryTarget == RECOVERY_TARGET_LSN &&
5808	recoveryTargetInclusive &&
5809	record->ReadRecPtr >= recoveryTargetLSN)
5810	{
5811	recoveryStopAfter = true;
5812	recoveryStopXid = InvalidTransactionId;
5813	recoveryStopLSN = record->ReadRecPtr;
5814	recoveryStopTime = `0`;
5815	recoveryStopName[`0`] = `'\0'`;
5816	ereport(LOG,
5817	(errmsg("recovery stopping after WAL location (LSN) \"%X/%X\"",
5818	(uint32) (recoveryStopLSN >> `32`),
5819	(uint32) recoveryStopLSN)));
5820	return true;
5821	}
5822
5823	if (rmid != RM_XACT_ID)
5824	return false;
5825
5826	xact_info = info & XLOG_XACT_OPMASK;
5827
5828	if (xact_info == XLOG_XACT_COMMIT \|\|
5829	xact_info == XLOG_XACT_COMMIT_PREPARED \|\|
5830	xact_info == XLOG_XACT_ABORT \|\|
5831	xact_info == XLOG_XACT_ABORT_PREPARED)
5832	{
5833	TransactionId recordXid;
5834
5835	/ Update the last applied transaction timestamp /
5836	if (getRecordTimestamp(record, &recordXtime))
5837	SetLatestXTime(recordXtime);
5838
5839	/ Extract the XID of the committed/aborted transaction /
5840	if (xact_info == XLOG_XACT_COMMIT_PREPARED)
5841	{
5842	xl_xact_commit xlrec = (xl_xact_commit ) XLogRecGetData(record);
5843	xl_xact_parsed_commit parsed;
5844
5845	ParseCommitRecord(XLogRecGetInfo(record),
5846	xlrec,
5847	&parsed);
5848	recordXid = parsed.twophase_xid;
5849	}
5850	else if (xact_info == XLOG_XACT_ABORT_PREPARED)
5851	{
5852	xl_xact_abort xlrec = (xl_xact_abort ) XLogRecGetData(record);
5853	xl_xact_parsed_abort parsed;
5854
5855	ParseAbortRecord(XLogRecGetInfo(record),
5856	xlrec,
5857	&parsed);
5858	recordXid = parsed.twophase_xid;
5859	}
5860	else
5861	recordXid = XLogRecGetXid(record);
5862
5863	/*
5864	* There can be only one transaction end record with this exact
5865	* transactionid
5866	*
5867	* when testing for an xid, we MUST test for equality only, since
5868	* transactions are numbered in the order they start, not the order
5869	* they complete. A higher numbered xid will complete before you about
5870	* 50% of the time...
5871	*/
5872	if (recoveryTarget == RECOVERY_TARGET_XID && recoveryTargetInclusive &&
5873	recordXid == recoveryTargetXid)
5874	{
5875	recoveryStopAfter = true;
5876	recoveryStopXid = recordXid;
5877	recoveryStopTime = recordXtime;
5878	recoveryStopLSN = InvalidXLogRecPtr;
5879	recoveryStopName[`0`] = `'\0'`;
5880
5881	if (xact_info == XLOG_XACT_COMMIT \|\|
5882	xact_info == XLOG_XACT_COMMIT_PREPARED)
5883	{
5884	ereport(LOG,
5885	(errmsg("recovery stopping after commit of transaction %u, time %s",
5886	recoveryStopXid,
5887	timestamptz_to_str(recoveryStopTime))));
5888	}
5889	else if (xact_info == XLOG_XACT_ABORT \|\|
5890	xact_info == XLOG_XACT_ABORT_PREPARED)
5891	{
5892	ereport(LOG,
5893	(errmsg("recovery stopping after abort of transaction %u, time %s",
5894	recoveryStopXid,
5895	timestamptz_to_str(recoveryStopTime))));
5896	}
5897	return true;
5898	}
5899	}
5900
5901	/ Check if we should stop as soon as reaching consistency /
5902	if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
5903	{
5904	ereport(LOG,
5905	(errmsg("recovery stopping after reaching consistency")));
5906
5907	recoveryStopAfter = true;
5908	recoveryStopXid = InvalidTransactionId;
5909	recoveryStopTime = `0`;
5910	recoveryStopLSN = InvalidXLogRecPtr;
5911	recoveryStopName[`0`] = `'\0'`;
5912	return true;
5913	}
5914
5915	return false;
5916	}
5917
5918	/*
5919	* Wait until shared recoveryPause flag is cleared.
5920	*
5921	* XXX Could also be done with shared latch, avoiding the pg_usleep loop.
5922	* Probably not worth the trouble though. This state shouldn't be one that
5923	* anyone cares about server power consumption in.
5924	*/
5925	static void
5926	recoveryPausesHere(void)
5927	{
5928	/ Don't pause unless users can connect! /
5929	if (!LocalHotStandbyActive)
5930	return;
5931
5932	ereport(LOG,
5933	(errmsg("recovery has paused"),
5934	errhint("Execute pg_wal_replay_resume() to continue.")));
5935
5936	while (RecoveryIsPaused())
5937	{
5938	pg_usleep(`1000000L`); / 1000 ms /
5939	HandleStartupProcInterrupts();
5940	}
5941	}
5942
5943	bool
5944	RecoveryIsPaused(void)
5945	{
5946	bool recoveryPause;
5947
5948	SpinLockAcquire(&XLogCtl->info_lck);
5949	recoveryPause = XLogCtl->recoveryPause;
5950	SpinLockRelease(&XLogCtl->info_lck);
5951
5952	return recoveryPause;
5953	}
5954
5955	void
5956	SetRecoveryPause(bool recoveryPause)
5957	{
5958	SpinLockAcquire(&XLogCtl->info_lck);
5959	XLogCtl->recoveryPause = recoveryPause;
5960	SpinLockRelease(&XLogCtl->info_lck);
5961	}
5962
5963	/*
5964	* When recovery_min_apply_delay is set, we wait long enough to make sure
5965	* certain record types are applied at least that interval behind the master.
5966	*
5967	* Returns true if we waited.
5968	*
5969	* Note that the delay is calculated between the WAL record log time and
5970	* the current time on standby. We would prefer to keep track of when this
5971	* standby received each WAL record, which would allow a more consistent
5972	* approach and one not affected by time synchronisation issues, but that
5973	* is significantly more effort and complexity for little actual gain in
5974	* usability.
5975	*/
5976	static bool
5977	recoveryApplyDelay(XLogReaderState *record)
5978	{
5979	uint8 xact_info;
5980	TimestampTz xtime;
5981	long secs;
5982	int microsecs;
5983
5984	/ nothing to do if no delay configured /
5985	if (recovery_min_apply_delay <= `0`)
5986	return false;
5987
5988	/ no delay is applied on a database not yet consistent /
5989	if (!reachedConsistency)
5990	return false;
5991
5992	/*
5993	* Is it a COMMIT record?
5994	*
5995	* We deliberately choose not to delay aborts since they have no effect on
5996	* MVCC. We already allow replay of records that don't have a timestamp,
5997	* so there is already opportunity for issues caused by early conflicts on
5998	* standbys.
5999	*/
6000	if (XLogRecGetRmid(record) != RM_XACT_ID)
6001	return false;
6002
6003	xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
6004
6005	if (xact_info != XLOG_XACT_COMMIT &&
6006	xact_info != XLOG_XACT_COMMIT_PREPARED)
6007	return false;
6008
6009	if (!getRecordTimestamp(record, &xtime))
6010	return false;
6011
6012	recoveryDelayUntilTime =
6013	TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);
6014
6015	/*
6016	* Exit without arming the latch if it's already past time to apply this
6017	* record
6018	*/
6019	TimestampDifference(GetCurrentTimestamp(), recoveryDelayUntilTime,
6020	&secs, &microsecs);
6021	if (secs <= `0` && microsecs <= `0`)
6022	return false;
6023
6024	while (true)
6025	{
6026	ResetLatch(&XLogCtl->recoveryWakeupLatch);
6027
6028	/ might change the trigger file's location /
6029	HandleStartupProcInterrupts();
6030
6031	if (CheckForStandbyTrigger())
6032	break;
6033
6034	/*
6035	* Wait for difference between GetCurrentTimestamp() and
6036	* recoveryDelayUntilTime
6037	*/
6038	TimestampDifference(GetCurrentTimestamp(), recoveryDelayUntilTime,
6039	&secs, &microsecs);
6040
6041	/ NB: We're ignoring waits below min_apply_delay's resolution. /
6042	if (secs <= `0` && microsecs / `1000` <= `0`)
6043	break;
6044
6045	elog(DEBUG2, "recovery apply delay %ld seconds, %d milliseconds",
6046	secs, microsecs / `1000`);
6047
6048	(void) WaitLatch(&XLogCtl->recoveryWakeupLatch,
6049	WL_LATCH_SET \| WL_TIMEOUT \| WL_EXIT_ON_PM_DEATH,
6050	secs * `1000L` + microsecs / `1000`,
6051	WAIT_EVENT_RECOVERY_APPLY_DELAY);
6052	}
6053	return true;
6054	}
6055
6056	/*
6057	* Save timestamp of latest processed commit/abort record.
6058	*
6059	* We keep this in XLogCtl, not a simple static variable, so that it can be
6060	* seen by processes other than the startup process. Note in particular
6061	* that CreateRestartPoint is executed in the checkpointer.
6062	*/
6063	static void
6064	SetLatestXTime(TimestampTz xtime)
6065	{
6066	SpinLockAcquire(&XLogCtl->info_lck);
6067	XLogCtl->recoveryLastXTime = xtime;
6068	SpinLockRelease(&XLogCtl->info_lck);
6069	}
6070
6071	/*
6072	* Fetch timestamp of latest processed commit/abort record.
6073	*/
6074	TimestampTz
6075	GetLatestXTime(void)
6076	{
6077	TimestampTz xtime;
6078
6079	SpinLockAcquire(&XLogCtl->info_lck);
6080	xtime = XLogCtl->recoveryLastXTime;
6081	SpinLockRelease(&XLogCtl->info_lck);
6082
6083	return xtime;
6084	}
6085
6086	/*
6087	* Save timestamp of the next chunk of WAL records to apply.
6088	*
6089	* We keep this in XLogCtl, not a simple static variable, so that it can be
6090	* seen by all backends.
6091	*/
6092	static void
6093	SetCurrentChunkStartTime(TimestampTz xtime)
6094	{
6095	SpinLockAcquire(&XLogCtl->info_lck);
6096	XLogCtl->currentChunkStartTime = xtime;
6097	SpinLockRelease(&XLogCtl->info_lck);
6098	}
6099
6100	/*
6101	* Fetch timestamp of latest processed commit/abort record.
6102	* Startup process maintains an accurate local copy in XLogReceiptTime
6103	*/
6104	TimestampTz
6105	GetCurrentChunkReplayStartTime(void)
6106	{
6107	TimestampTz xtime;
6108
6109	SpinLockAcquire(&XLogCtl->info_lck);
6110	xtime = XLogCtl->currentChunkStartTime;
6111	SpinLockRelease(&XLogCtl->info_lck);
6112
6113	return xtime;
6114	}
6115
6116	/*
6117	* Returns time of receipt of current chunk of XLOG data, as well as
6118	* whether it was received from streaming replication or from archives.
6119	*/
6120	void
6121	GetXLogReceiptTime(TimestampTz rtime, bool fromStream)
6122	{
6123	/*
6124	* This must be executed in the startup process, since we don't export the
6125	* relevant state to shared memory.
6126	*/
6127	Assert(InRecovery);
6128
6129	*rtime = XLogReceiptTime;
6130	*fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
6131	}
6132
/*
 * Raise an ERROR if an integer setting on this server (currValue) is lower
 * than the value it must be at least equal to (minValue).
 *
 * param_name is the parameter's name; being a parameter name, it does not
 * require translation.  Note that currValue and minValue are each evaluated
 * more than once.
 */
#define RecoveryRequiresIntParameter(param_name, currValue, minValue) \
do { \
	if ((currValue) < (minValue)) \
		ereport(ERROR, \
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
				 errmsg("hot standby is not possible because " \
						"%s = %d is a lower setting than on the master server " \
						"(its value was %d)", \
						(param_name), \
						(currValue), \
						(minValue)))); \
} while (0)
6149
6150	/*
6151	* Check to see if required parameters are set high enough on this server
6152	* for various aspects of recovery operation.
6153	*
6154	* Note that all the parameters which this function tests need to be
6155	* listed in Administrator's Overview section in high-availability.sgml.
6156	* If you change them, don't forget to update the list.
6157	*/
6158	static void
6159	CheckRequiredParameterValues(void)
6160	{
6161	/*
6162	* For archive recovery, the WAL must be generated with at least 'replica'
6163	* wal_level.
6164	*/
6165	if (ArchiveRecoveryRequested && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
6166	{
6167	ereport(WARNING,
6168	(errmsg("WAL was generated with wal_level=minimal, data may be missing"),
6169	errhint("This happens if you temporarily set wal_level=minimal without taking a new base backup.")));
6170	}
6171
6172	/*
6173	* For Hot Standby, the WAL must be generated with 'replica' mode, and we
6174	* must have at least as many backend slots as the primary.
6175	*/
6176	if (ArchiveRecoveryRequested && EnableHotStandby)
6177	{
6178	if (ControlFile->wal_level < WAL_LEVEL_REPLICA)
6179	ereport(ERROR,
6180	(errmsg("hot standby is not possible because wal_level was not set to \"replica\" or higher on the master server"),
6181	errhint("Either set wal_level to \"replica\" on the master, or turn off hot_standby here.")));
6182
6183	/ We ignore autovacuum_max_workers when we make this test. /
6184	RecoveryRequiresIntParameter("max_connections",
6185	MaxConnections,
6186	ControlFile->MaxConnections);
6187	RecoveryRequiresIntParameter("max_worker_processes",
6188	max_worker_processes,
6189	ControlFile->max_worker_processes);
6190	RecoveryRequiresIntParameter("max_wal_senders",
6191	max_wal_senders,
6192	ControlFile->max_wal_senders);
6193	RecoveryRequiresIntParameter("max_prepared_transactions",
6194	max_prepared_xacts,
6195	ControlFile->max_prepared_xacts);
6196	RecoveryRequiresIntParameter("max_locks_per_transaction",
6197	max_locks_per_xact,
6198	ControlFile->max_locks_per_xact);
6199	}
6200	}
6201
6202	/*
6203	* This must be called ONCE during postmaster or standalone-backend startup
6204	*/
6205	void
6206	StartupXLOG(void)
6207	{
6208	XLogCtlInsert *Insert;
6209	CheckPoint checkPoint;
6210	bool wasShutdown;
6211	bool reachedStopPoint = false;
6212	bool haveBackupLabel = false;
6213	bool haveTblspcMap = false;
6214	XLogRecPtr RecPtr,
6215	checkPointLoc,
6216	EndOfLog;
6217	TimeLineID EndOfLogTLI;
6218	TimeLineID PrevTimeLineID;
6219	XLogRecord *record;
6220	TransactionId oldestActiveXID;
6221	bool backupEndRequired = false;
6222	bool backupFromStandby = false;
6223	DBState dbstate_at_startup;
6224	XLogReaderState *xlogreader;
6225	XLogPageReadPrivate private;
6226	bool fast_promoted = false;
6227	struct stat st;
6228
6229	/*
6230	* We should have an aux process resource owner to use, and we should not
6231	* be in a transaction that's installed some other resowner.
6232	*/
6233	Assert(AuxProcessResourceOwner != NULL);
6234	Assert(CurrentResourceOwner == NULL \|\|
6235	CurrentResourceOwner == AuxProcessResourceOwner);
6236	CurrentResourceOwner = AuxProcessResourceOwner;
6237
6238	/*
6239	* Verify XLOG status looks valid.
6240	*/
6241	if (ControlFile->state < DB_SHUTDOWNED \|\|
6242	ControlFile->state > DB_IN_PRODUCTION \|\|
6243	!XRecOffIsValid(ControlFile->checkPoint))
6244	ereport(FATAL,
6245	(errmsg("control file contains invalid data")));
6246
6247	if (ControlFile->state == DB_SHUTDOWNED)
6248	{
6249	/ This is the expected case, so don't be chatty in standalone mode /
6250	ereport(IsPostmasterEnvironment ? LOG : NOTICE,
6251	(errmsg("database system was shut down at %s",
6252	str_time(ControlFile->time))));
6253	}
6254	else if (ControlFile->state == DB_SHUTDOWNED_IN_RECOVERY)
6255	ereport(LOG,
6256	(errmsg("database system was shut down in recovery at %s",
6257	str_time(ControlFile->time))));
6258	else if (ControlFile->state == DB_SHUTDOWNING)
6259	ereport(LOG,
6260	(errmsg("database system shutdown was interrupted; last known up at %s",
6261	str_time(ControlFile->time))));
6262	else if (ControlFile->state == DB_IN_CRASH_RECOVERY)
6263	ereport(LOG,
6264	(errmsg("database system was interrupted while in recovery at %s",
6265	str_time(ControlFile->time)),
6266	errhint("This probably means that some data is corrupted and"
6267	" you will have to use the last backup for recovery.")));
6268	else if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
6269	ereport(LOG,
6270	(errmsg("database system was interrupted while in recovery at log time %s",
6271	str_time(ControlFile->checkPointCopy.time)),
6272	errhint("If this has occurred more than once some data might be corrupted"
6273	" and you might need to choose an earlier recovery target.")));
6274	else if (ControlFile->state == DB_IN_PRODUCTION)
6275	ereport(LOG,
6276	(errmsg("database system was interrupted; last known up at %s",
6277	str_time(ControlFile->time))));
6278
6279	/ This is just to allow attaching to startup process with a debugger /
6280	#ifdef XLOG_REPLAY_DELAY
6281	if (ControlFile->state != DB_SHUTDOWNED)
6282	pg_usleep(`60000000L`);
6283	#endif
6284
6285	/*
6286	* Verify that pg_wal and pg_wal/archive_status exist. In cases where
6287	* someone has performed a copy for PITR, these directories may have been
6288	* excluded and need to be re-created.
6289	*/
6290	ValidateXLOGDirectoryStructure();
6291
6292	/----------*
6293	* If we previously crashed, perform a couple of actions:
6294	* - The pg_wal directory may still include some temporary WAL segments
6295	* used when creating a new segment, so perform some clean up to not
6296	* bloat this path. This is done first as there is no point to sync this
6297	* temporary data.
6298	* - There might be data which we had written, intending to fsync it,
6299	* but which we had not actually fsync'd yet. Therefore, a power failure
6300	* in the near future might cause earlier unflushed writes to be lost,
6301	* even though more recent data written to disk from here on would be
6302	* persisted. To avoid that, fsync the entire data directory.
6303	*---------
6304	*/
6305	if (ControlFile->state != DB_SHUTDOWNED &&
6306	ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
6307	{
6308	RemoveTempXlogFiles();
6309	SyncDataDirectory();
6310	}
6311
6312	/*
6313	* Initialize on the assumption we want to recover to the latest timeline
6314	* that's active according to pg_control.
6315	*/
6316	if (ControlFile->minRecoveryPointTLI >
6317	ControlFile->checkPointCopy.ThisTimeLineID)
6318	recoveryTargetTLI = ControlFile->minRecoveryPointTLI;
6319	else
6320	recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
6321
6322	/*
6323	* Check for signal files, and if so set up state for offline recovery
6324	*/
6325	readRecoverySignalFile();
6326	validateRecoveryParameters();
6327
6328	if (ArchiveRecoveryRequested)
6329	{
6330	if (StandbyModeRequested)
6331	ereport(LOG,
6332	(errmsg("entering standby mode")));
6333	else if (recoveryTarget == RECOVERY_TARGET_XID)
6334	ereport(LOG,
6335	(errmsg("starting point-in-time recovery to XID %u",
6336	recoveryTargetXid)));
6337	else if (recoveryTarget == RECOVERY_TARGET_TIME)
6338	ereport(LOG,
6339	(errmsg("starting point-in-time recovery to %s",
6340	timestamptz_to_str(recoveryTargetTime))));
6341	else if (recoveryTarget == RECOVERY_TARGET_NAME)
6342	ereport(LOG,
6343	(errmsg("starting point-in-time recovery to \"%s\"",
6344	recoveryTargetName)));
6345	else if (recoveryTarget == RECOVERY_TARGET_LSN)
6346	ereport(LOG,
6347	(errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%X\"",
6348	(uint32) (recoveryTargetLSN >> `32`),
6349	(uint32) recoveryTargetLSN)));
6350	else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
6351	ereport(LOG,
6352	(errmsg("starting point-in-time recovery to earliest consistent point")));
6353	else
6354	ereport(LOG,
6355	(errmsg("starting archive recovery")));
6356	}
6357
6358	/*
6359	* Take ownership of the wakeup latch if we're going to sleep during
6360	* recovery.
6361	*/
6362	if (StandbyModeRequested)
6363	OwnLatch(&XLogCtl->recoveryWakeupLatch);
6364
6365	/ Set up XLOG reader facility /
6366	MemSet(&private, `0`, sizeof(XLogPageReadPrivate));
6367	xlogreader = XLogReaderAllocate(wal_segment_size, &XLogPageRead, &private);
6368	if (!xlogreader)
6369	ereport(ERROR,
6370	(errcode(ERRCODE_OUT_OF_MEMORY),
6371	errmsg("out of memory"),
6372	errdetail("Failed while allocating a WAL reading processor.")));
6373	xlogreader->system_identifier = ControlFile->system_identifier;
6374
6375	/*
6376	* Allocate two page buffers dedicated to WAL consistency checks. We do
6377	* it this way, rather than just making static arrays, for two reasons:
6378	* (1) no need to waste the storage in most instantiations of the backend;
6379	* (2) a static char array isn't guaranteed to have any particular
6380	* alignment, whereas palloc() will provide MAXALIGN'd storage.
6381	*/
6382	replay_image_masked = (char *) palloc(BLCKSZ);
6383	master_image_masked = (char *) palloc(BLCKSZ);
6384
6385	if (read_backup_label(&checkPointLoc, &backupEndRequired,
6386	&backupFromStandby))
6387	{
6388	List *tablespaces = NIL;
6389
6390	/*
6391	* Archive recovery was requested, and thanks to the backup label
6392	* file, we know how far we need to replay to reach consistency. Enter
6393	* archive recovery directly.
6394	*/
6395	InArchiveRecovery = true;
6396	if (StandbyModeRequested)
6397	StandbyMode = true;
6398
6399	/*
6400	* When a backup_label file is present, we want to roll forward from
6401	* the checkpoint it identifies, rather than using pg_control.
6402	*/
6403	record = ReadCheckpointRecord(xlogreader, checkPointLoc, `0`, true);
6404	if (record != NULL)
6405	{
6406	memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
6407	wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
6408	ereport(DEBUG1,
6409	(errmsg("checkpoint record is at %X/%X",
6410	(uint32) (checkPointLoc >> `32`), (uint32) checkPointLoc)));
6411	InRecovery = true; / force recovery even if SHUTDOWNED /
6412
6413	/*
6414	* Make sure that REDO location exists. This may not be the case
6415	* if there was a crash during an online backup, which left a
6416	* backup_label around that references a WAL segment that's
6417	* already been archived.
6418	*/
6419	if (checkPoint.redo < checkPointLoc)
6420	{
6421	if (!ReadRecord(xlogreader, checkPoint.redo, LOG, false))
6422	ereport(FATAL,
6423	(errmsg("could not find redo location referenced by checkpoint record"),
6424	errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n"
6425	"If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
6426	"Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
6427	DataDir, DataDir, DataDir)));
6428	}
6429	}
6430	else
6431	{
6432	ereport(FATAL,
6433	(errmsg("could not locate required checkpoint record"),
6434	errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n"
6435	"If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
6436	"Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
6437	DataDir, DataDir, DataDir)));
6438	wasShutdown = false; / keep compiler quiet /
6439	}
6440
6441	/ read the tablespace_map file if present and create symlinks. /
6442	if (read_tablespace_map(&tablespaces))
6443	{
6444	ListCell *lc;
6445
6446	foreach(lc, tablespaces)
6447	{
6448	tablespaceinfo *ti = lfirst(lc);
6449	char *linkloc;
6450
6451	linkloc = psprintf("pg_tblspc/%s", ti->oid);
6452
6453	/*
6454	* Remove the existing symlink if any and Create the symlink
6455	* under PGDATA.
6456	*/
6457	remove_tablespace_symlink(linkloc);
6458
6459	if (symlink(ti->path, linkloc) < `0`)
6460	ereport(ERROR,
6461	(errcode_for_file_access(),
6462	errmsg("could not create symbolic link \"%s\": %m",
6463	linkloc)));
6464
6465	pfree(ti->oid);
6466	pfree(ti->path);
6467	pfree(ti);
6468	}
6469
6470	/ set flag to delete it later /
6471	haveTblspcMap = true;
6472	}
6473
6474	/ set flag to delete it later /
6475	haveBackupLabel = true;
6476	}
6477	else
6478	{
6479	/*
6480	* If a tablespace_map file is present without a backup_label file, it
6481	* is of no use. There is no harm in retaining it, but it
6482	* is better to get rid of the map file so that we don't have any
6483	* redundant file in the data directory and to avoid any sort of
6484	* confusion. It seems prudent though to just rename the file out of
6485	* the way rather than delete it completely. We also ignore any error
6486	* that occurs in the rename operation, since a map file present
6487	* without a backup_label file is harmless.
6488	*/
6489	if (stat(TABLESPACE_MAP, &st) == `0`)
6490	{
6491	unlink(TABLESPACE_MAP_OLD);
6492	if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == `0`)
6493	ereport(LOG,
6494	(errmsg("ignoring file \"%s\" because no file \"%s\" exists",
6495	TABLESPACE_MAP, BACKUP_LABEL_FILE),
6496	errdetail("File \"%s\" was renamed to \"%s\".",
6497	TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
6498	else
6499	ereport(LOG,
6500	(errmsg("ignoring file \"%s\" because no file \"%s\" exists",
6501	TABLESPACE_MAP, BACKUP_LABEL_FILE),
6502	errdetail("Could not rename file \"%s\" to \"%s\": %m.",
6503	TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
6504	}
6505
6506	/*
6507	* It's possible that archive recovery was requested, but we don't
6508	* know how far we need to replay the WAL before we reach consistency.
6509	* This can happen for example if a base backup is taken from a
6510	* running server using an atomic filesystem snapshot, without calling
6511	* pg_start/stop_backup. Or if you just kill a running master server
6512	* and put it into archive recovery by creating a recovery signal
6513	* file.
6514	*
6515	* Our strategy in that case is to perform crash recovery first,
6516	* replaying all the WAL present in pg_wal, and only enter archive
6517	* recovery after that.
6518	*
6519	* But usually we already know how far we need to replay the WAL (up
6520	* to minRecoveryPoint, up to backupEndPoint, or until we see an
6521	* end-of-backup record), and we can enter archive recovery directly.
6522	*/
6523	if (ArchiveRecoveryRequested &&
6524	(ControlFile->minRecoveryPoint != InvalidXLogRecPtr \|\|
6525	ControlFile->backupEndRequired \|\|
6526	ControlFile->backupEndPoint != InvalidXLogRecPtr \|\|
6527	ControlFile->state == DB_SHUTDOWNED))
6528	{
6529	InArchiveRecovery = true;
6530	if (StandbyModeRequested)
6531	StandbyMode = true;
6532	}
6533
6534	/* Get the last valid checkpoint record. */
6535	checkPointLoc = ControlFile->checkPoint;
6536	RedoStartLSN = ControlFile->checkPointCopy.redo;
6537	record = ReadCheckpointRecord(xlogreader, checkPointLoc, `1`, true);
6538	if (record != NULL)
6539	{
6540	ereport(DEBUG1,
6541	(errmsg("checkpoint record is at %X/%X",
6542	(uint32) (checkPointLoc >> `32`), (uint32) checkPointLoc)));
6543	}
6544	else
6545	{
6546	/*
6547	* We used to attempt to go back to a secondary checkpoint record
6548	* here, but only when not in standby mode. We now just fail if we
6549	* can't read the last checkpoint because this allows us to
6550	* simplify processing around checkpoints.
6551	*/
6552	ereport(PANIC,
6553	(errmsg("could not locate a valid checkpoint record")));
6554	}
6555	memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
6556	wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
6557	}
6558
6559	/*
6560	* Clear out any old relcache cache files. This is necessary if we do
6561	* any WAL replay, since that would probably result in the cache files
6562	* being out of sync with database reality. In theory we could leave them
6563	* in place if the database had been cleanly shut down, but it seems
6564	* safest to just remove them always and let them be rebuilt during the
6565	* first backend startup. These files need to be removed from all
6566	* directories including pg_tblspc; however, the symlinks are created only
6567	* after reading the tablespace_map file in case of archive recovery from
6568	* backup, so we need to clear old relcache files here, after creating
6569	* the symlinks.
6570	*/
6571	RelationCacheInitFileRemove();
6572
6573	/*
6574	* If the location of the checkpoint record is not on the expected
6575	* timeline in the history of the requested timeline, we cannot proceed:
6576	* the backup is not part of the history of the requested timeline.
6577	*/
6578	Assert(expectedTLEs); /* was initialized by reading checkpoint
6579	* record */
6580	if (tliOfPointInHistory(checkPointLoc, expectedTLEs) !=
6581	checkPoint.ThisTimeLineID)
6582	{
6583	XLogRecPtr switchpoint;
6584
6585	/*
6586	* tliSwitchPoint will throw an error if the checkpoint's timeline is
6587	* not in expectedTLEs at all.
6588	*/
6589	switchpoint = tliSwitchPoint(ControlFile->checkPointCopy.ThisTimeLineID, expectedTLEs, NULL);
6590	ereport(FATAL,
6591	(errmsg("requested timeline %u is not a child of this server's history",
6592	recoveryTargetTLI),
6593	errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.",
6594	(uint32) (ControlFile->checkPoint >> `32`),
6595	(uint32) ControlFile->checkPoint,
6596	ControlFile->checkPointCopy.ThisTimeLineID,
6597	(uint32) (switchpoint >> `32`),
6598	(uint32) switchpoint)));
6599	}
6600
6601	/*
6602	* The min recovery point should be part of the requested timeline's
6603	* history, too.
6604	*/
6605	if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) &&
6606	tliOfPointInHistory(ControlFile->minRecoveryPoint - `1`, expectedTLEs) !=
6607	ControlFile->minRecoveryPointTLI)
6608	ereport(FATAL,
6609	(errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
6610	recoveryTargetTLI,
6611	(uint32) (ControlFile->minRecoveryPoint >> `32`),
6612	(uint32) ControlFile->minRecoveryPoint,
6613	ControlFile->minRecoveryPointTLI)));
6614
6615	LastRec = RecPtr = checkPointLoc;
6616
6617	ereport(DEBUG1,
6618	(errmsg_internal("redo record is at %X/%X; shutdown %s",
6619	(uint32) (checkPoint.redo >> `32`), (uint32) checkPoint.redo,
6620	wasShutdown ? "true" : "false")));
6621	ereport(DEBUG1,
6622	(errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u",
6623	U64FromFullTransactionId(checkPoint.nextFullXid),
6624	checkPoint.nextOid)));
6625	ereport(DEBUG1,
6626	(errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u",
6627	checkPoint.nextMulti, checkPoint.nextMultiOffset)));
6628	ereport(DEBUG1,
6629	(errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
6630	checkPoint.oldestXid, checkPoint.oldestXidDB)));
6631	ereport(DEBUG1,
6632	(errmsg_internal("oldest MultiXactId: %u, in database %u",
6633	checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
6634	ereport(DEBUG1,
6635	(errmsg_internal("commit timestamp Xid oldest/newest: %u/%u",
6636	checkPoint.oldestCommitTsXid,
6637	checkPoint.newestCommitTsXid)));
6638	if (!TransactionIdIsNormal(XidFromFullTransactionId(checkPoint.nextFullXid)))
6639	ereport(PANIC,
6640	(errmsg("invalid next transaction ID")));
6641
6642	/* initialize shared memory variables from the checkpoint record */
6643	ShmemVariableCache->nextFullXid = checkPoint.nextFullXid;
6644	ShmemVariableCache->nextOid = checkPoint.nextOid;
6645	ShmemVariableCache->oidCount = `0`;
6646	MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
6647	AdvanceOldestClogXid(checkPoint.oldestXid);
6648	SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
6649	SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
6650	SetCommitTsLimit(checkPoint.oldestCommitTsXid,
6651	checkPoint.newestCommitTsXid);
6652	XLogCtl->ckptFullXid = checkPoint.nextFullXid;
6653
6654	/*
6655	* Initialize replication slots, before there's a chance to remove
6656	* required resources.
6657	*/
6658	StartupReplicationSlots();
6659
6660	/*
6661	* Startup logical state, needs to be setup now so we have proper data
6662	* during crash recovery.
6663	*/
6664	StartupReorderBuffer();
6665
6666	/*
6667	* Startup MultiXact. We need to do this early to be able to replay
6668	* truncations.
6669	*/
6670	StartupMultiXact();
6671
6672	/*
6673	* Ditto for commit timestamps. Activate the facility if the setting is
6674	* enabled in the control file, as there should be no tracking of commit
6675	* timestamps done when the setting was disabled. This facility can be
6676	* started or stopped when replaying a XLOG_PARAMETER_CHANGE record.
6677	*/
6678	if (ControlFile->track_commit_timestamp)
6679	StartupCommitTs();
6680
6681	/*
6682	* Recover knowledge about replay progress of known replication partners.
6683	*/
6684	StartupReplicationOrigin();
6685
6686	/*
6687	* Initialize unlogged LSN. On a clean shutdown, it's restored from the
6688	* control file. On recovery, all unlogged relations are blown away, so
6689	* the unlogged LSN counter can be reset too.
6690	*/
6691	if (ControlFile->state == DB_SHUTDOWNED)
6692	XLogCtl->unloggedLSN = ControlFile->unloggedLSN;
6693	else
6694	XLogCtl->unloggedLSN = `1`;
6695
6696	/*
6697	* We must replay WAL entries using the same TimeLineID they were created
6698	* under, so temporarily adopt the TLI indicated by the checkpoint (see
6699	* also xlog_redo()).
6700	*/
6701	ThisTimeLineID = checkPoint.ThisTimeLineID;
6702
6703	/*
6704	* Copy any missing timeline history files between 'now' and the recovery
6705	* target timeline from archive to pg_wal. While we don't need those files
6706	* ourselves - the history file of the recovery target timeline covers all
6707	* the previous timelines in the history too - a cascading standby server
6708	* might be interested in them. Or, if you archive the WAL from this
6709	* server to a different archive than the master, it'd be good for all the
6710	* history files to get archived there after failover, so that you can use
6711	* one of the old timelines as a PITR target. Timeline history files are
6712	* small, so it's better to copy them unnecessarily than not copy them and
6713	* regret later.
6714	*/
6715	restoreTimeLineHistoryFiles(ThisTimeLineID, recoveryTargetTLI);
6716
6717	/*
6718	* Before running in recovery, scan pg_twophase and fill in its status to
6719	* be able to work on entries generated by redo. Doing a scan before
6720	* taking any recovery action has the merit of discarding any 2PC files that
6721	* are newer than the first record to replay, avoiding any conflicts at
6722	* replay. This also avoids any subsequent scans when doing recovery
6723	* of the on-disk two-phase data.
6724	*/
6725	restoreTwoPhaseData();
6726
6727	lastFullPageWrites = checkPoint.fullPageWrites;
6728
6729	RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
6730	doPageWrites = lastFullPageWrites;
6731
6732	if (RecPtr < checkPoint.redo)
6733	ereport(PANIC,
6734	(errmsg("invalid redo in checkpoint record")));
6735
6736	/*
6737	* Check whether we need to force recovery from WAL. If it appears to
6738	* have been a clean shutdown and we did not have a recovery signal file,
6739	* then assume no recovery needed.
6740	*/
6741	if (checkPoint.redo < RecPtr)
6742	{
6743	if (wasShutdown)
6744	ereport(PANIC,
6745	(errmsg("invalid redo record in shutdown checkpoint")));
6746	InRecovery = true;
6747	}
6748	else if (ControlFile->state != DB_SHUTDOWNED)
6749	InRecovery = true;
6750	else if (ArchiveRecoveryRequested)
6751	{
6752	/* force recovery due to presence of recovery signal file */
6753	InRecovery = true;
6754	}
6755
6756	/* REDO */
6757	if (InRecovery)
6758	{
6759	int rmid;
6760
6761	/*
6762	* Update pg_control to show that we are recovering and to show the
6763	* selected checkpoint as the place we are starting from. We also mark
6764	* pg_control with any minimum recovery stop point obtained from a
6765	* backup history file.
6766	*/
6767	dbstate_at_startup = ControlFile->state;
6768	if (InArchiveRecovery)
6769	ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
6770	else
6771	{
6772	ereport(LOG,
6773	(errmsg("database system was not properly shut down; "
6774	"automatic recovery in progress")));
6775	if (recoveryTargetTLI > ControlFile->checkPointCopy.ThisTimeLineID)
6776	ereport(LOG,
6777	(errmsg("crash recovery starts in timeline %u "
6778	"and has target timeline %u",
6779	ControlFile->checkPointCopy.ThisTimeLineID,
6780	recoveryTargetTLI)));
6781	ControlFile->state = DB_IN_CRASH_RECOVERY;
6782	}
6783	ControlFile->checkPoint = checkPointLoc;
6784	ControlFile->checkPointCopy = checkPoint;
6785	if (InArchiveRecovery)
6786	{
6787	/* initialize minRecoveryPoint if not set yet */
6788	if (ControlFile->minRecoveryPoint < checkPoint.redo)
6789	{
6790	ControlFile->minRecoveryPoint = checkPoint.redo;
6791	ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID;
6792	}
6793	}
6794
6795	/*
6796	* Set backupStartPoint if we're starting recovery from a base backup.
6797	*
6798	* Also set backupEndPoint and use minRecoveryPoint as the backup end
6799	* location if we're starting recovery from a base backup which was
6800	* taken from a standby. In this case, the database system status in
6801	* pg_control must indicate that the database was already in recovery.
6802	* Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
6803	* DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
6804	* before reaching this point; e.g. because restore_command or
6805	* primary_conninfo were faulty.
6806	*
6807	* Any other state indicates that the backup somehow became corrupted
6808	* and we can't sensibly continue with recovery.
6809	*/
6810	if (haveBackupLabel)
6811	{
6812	ControlFile->backupStartPoint = checkPoint.redo;
6813	ControlFile->backupEndRequired = backupEndRequired;
6814
6815	if (backupFromStandby)
6816	{
6817	if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&
6818	dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)
6819	ereport(FATAL,
6820	(errmsg("backup_label contains data inconsistent with control file"),
6821	errhint("This means that the backup is corrupted and you will "
6822	"have to use another backup for recovery.")));
6823	ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
6824	}
6825	}
6826	ControlFile->time = (pg_time_t) time(NULL);
6827	/* No need to hold ControlFileLock yet, we aren't up far enough */
6828	UpdateControlFile();
6829
6830	/*
6831	* Initialize our local copy of minRecoveryPoint. When doing crash
6832	* recovery we want to replay up to the end of WAL. Particularly, in
6833	* the case of a promoted standby minRecoveryPoint value in the
6834	* control file is only updated after the first checkpoint. However,
6835	* if the instance crashes before the first post-recovery checkpoint
6836	* is completed then recovery will use a stale location causing the
6837	* startup process to think that there are still invalid page
6838	* references when checking for data consistency.
6839	*/
6840	if (InArchiveRecovery)
6841	{
6842	minRecoveryPoint = ControlFile->minRecoveryPoint;
6843	minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
6844	}
6845	else
6846	{
6847	minRecoveryPoint = InvalidXLogRecPtr;
6848	minRecoveryPointTLI = `0`;
6849	}
6850
6851	/*
6852	* Reset pgstat data, because it may be invalid after recovery.
6853	*/
6854	pgstat_reset_all();
6855
6856	/*
6857	* If there was a backup label file, it's done its job and the info
6858	* has now been propagated into pg_control. We must get rid of the
6859	* label file so that if we crash during recovery, we'll pick up at
6860	* the latest recovery restartpoint instead of going all the way back
6861	* to the backup start point. It seems prudent though to just rename
6862	* the file out of the way rather than delete it completely.
6863	*/
6864	if (haveBackupLabel)
6865	{
6866	unlink(BACKUP_LABEL_OLD);
6867	durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, FATAL);
6868	}
6869
6870	/*
6871	* If there was a tablespace_map file, it's done its job and the
6872	* symlinks have been created. We must get rid of the map file so
6873	* that if we crash during recovery, we don't create symlinks again.
6874	* It seems prudent though to just rename the file out of the way
6875	* rather than delete it completely.
6876	*/
6877	if (haveTblspcMap)
6878	{
6879	unlink(TABLESPACE_MAP_OLD);
6880	durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, FATAL);
6881	}
6882
6883	/* Check that the GUCs used to generate the WAL allow recovery */
6884	CheckRequiredParameterValues();
6885
6886	/*
6887	* We're in recovery, so unlogged relations may be trashed and must be
6888	* reset. This should be done BEFORE allowing Hot Standby
6889	* connections, so that read-only backends don't try to read whatever
6890	* garbage is left over from before.
6891	*/
6892	ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);
6893
6894	/*
6895	* Likewise, delete any saved transaction snapshot files that got left
6896	* behind by crashed backends.
6897	*/
6898	DeleteAllExportedSnapshotFiles();
6899
6900	/*
6901	* Initialize for Hot Standby, if enabled. We won't let backends in
6902	* yet, not until we've reached the min recovery point specified in
6903	* control file and we've established a recovery snapshot from a
6904	* running-xacts WAL record.
6905	*/
6906	if (ArchiveRecoveryRequested && EnableHotStandby)
6907	{
6908	TransactionId *xids;
6909	int nxids;
6910
6911	ereport(DEBUG1,
6912	(errmsg("initializing for hot standby")));
6913
6914	InitRecoveryTransactionEnvironment();
6915
6916	if (wasShutdown)
6917	oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
6918	else
6919	oldestActiveXID = checkPoint.oldestActiveXid;
6920	Assert(TransactionIdIsValid(oldestActiveXID));
6921
6922	/* Tell procarray about the range of xids it has to deal with */
6923	ProcArrayInitRecovery(XidFromFullTransactionId(ShmemVariableCache->nextFullXid));
6924
6925	/*
6926	* Startup commit log and subtrans only. MultiXact and commit
6927	* timestamp have already been started up and other SLRUs are not
6928	* maintained during recovery and need not be started yet.
6929	*/
6930	StartupCLOG();
6931	StartupSUBTRANS(oldestActiveXID);
6932
6933	/*
6934	* If we're beginning at a shutdown checkpoint, we know that
6935	* nothing was running on the master at this point. So fake-up an
6936	* empty running-xacts record and use that here and now. Recover
6937	* additional standby state for prepared transactions.
6938	*/
6939	if (wasShutdown)
6940	{
6941	RunningTransactionsData running;
6942	TransactionId latestCompletedXid;
6943
6944	/*
6945	* Construct a RunningTransactions snapshot representing a
6946	* shut down server, with only prepared transactions still
6947	* alive. We're never overflowed at this point because all
6948	* subxids are listed with their parent prepared transactions.
6949	*/
6950	running.xcnt = nxids;
6951	running.subxcnt = `0`;
6952	running.subxid_overflow = false;
6953	running.nextXid = XidFromFullTransactionId(checkPoint.nextFullXid);
6954	running.oldestRunningXid = oldestActiveXID;
6955	latestCompletedXid = XidFromFullTransactionId(checkPoint.nextFullXid);
6956	TransactionIdRetreat(latestCompletedXid);
6957	Assert(TransactionIdIsNormal(latestCompletedXid));
6958	running.latestCompletedXid = latestCompletedXid;
6959	running.xids = xids;
6960
6961	ProcArrayApplyRecoveryInfo(&running);
6962
6963	StandbyRecoverPreparedTransactions();
6964	}
6965	}
6966
6967	/* Initialize resource managers */
6968	for (rmid = `0`; rmid <= RM_MAX_ID; rmid++)
6969	{
6970	if (RmgrTable[rmid].rm_startup != NULL)
6971	RmgrTable[rmid].rm_startup();
6972	}
6973
6974	/*
6975	* Initialize shared variables for tracking progress of WAL replay, as
6976	* if we had just replayed the record before the REDO location (or the
6977	* checkpoint record itself, if it's a shutdown checkpoint).
6978	*/
6979	SpinLockAcquire(&XLogCtl->info_lck);
6980	if (checkPoint.redo < RecPtr)
6981	XLogCtl->replayEndRecPtr = checkPoint.redo;
6982	else
6983	XLogCtl->replayEndRecPtr = EndRecPtr;
6984	XLogCtl->replayEndTLI = ThisTimeLineID;
6985	XLogCtl->lastReplayedEndRecPtr = XLogCtl->replayEndRecPtr;
6986	XLogCtl->lastReplayedTLI = XLogCtl->replayEndTLI;
6987	XLogCtl->recoveryLastXTime = `0`;
6988	XLogCtl->currentChunkStartTime = `0`;
6989	XLogCtl->recoveryPause = false;
6990	SpinLockRelease(&XLogCtl->info_lck);
6991
6992	/* Also ensure XLogReceiptTime has a sane value */
6993	XLogReceiptTime = GetCurrentTimestamp();
6994
6995	/*
6996	* Let postmaster know we've started redo now, so that it can launch
6997	* checkpointer to perform restartpoints. We don't bother during
6998	* crash recovery as restartpoints can only be performed during
6999	* archive recovery. And we'd like to keep crash recovery simple, to
7000	* avoid introducing bugs that could affect you when recovering after
7001	* crash.
7002	*
7003	* After this point, we can no longer assume that we're the only
7004	* process in addition to postmaster! Also, fsync requests are
7005	* subsequently to be handled by the checkpointer, not locally.
7006	*/
7007	if (ArchiveRecoveryRequested && IsUnderPostmaster)
7008	{
7009	PublishStartupProcessInformation();
7010	EnableSyncRequestForwarding();
7011	SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
7012	bgwriterLaunched = true;
7013	}
7014
7015	/*
7016	* Allow read-only connections immediately if we're consistent
7017	* already.
7018	*/
7019	CheckRecoveryConsistency();
7020
7021	/*
7022	* Find the first record that logically follows the checkpoint --- it
7023	* might physically precede it, though.
7024	*/
7025	if (checkPoint.redo < RecPtr)
7026	{
7027	/* back up to find the record */
7028	record = ReadRecord(xlogreader, checkPoint.redo, PANIC, false);
7029	}
7030	else
7031	{
7032	/* just have to read next record after CheckPoint */
7033	record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
7034	}
7035
7036	if (record != NULL)
7037	{
7038	ErrorContextCallback errcallback;
7039	TimestampTz xtime;
7040
7041	InRedo = true;
7042
7043	ereport(LOG,
7044	(errmsg("redo starts at %X/%X",
7045	(uint32) (ReadRecPtr >> `32`), (uint32) ReadRecPtr)));
7046
7047	/*
7048	* main redo apply loop
7049	*/
7050	do
7051	{
7052	bool switchedTLI = false;
7053
7054	#ifdef WAL_DEBUG
7055	if (XLOG_DEBUG \|\|
7056	(rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) \|\|
7057	(rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3))
7058	{
7059	StringInfoData buf;
7060
7061	initStringInfo(&buf);
7062	appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
7063	(uint32) (ReadRecPtr >> `32`), (uint32) ReadRecPtr,
7064	(uint32) (EndRecPtr >> `32`), (uint32) EndRecPtr);
7065	xlog_outrec(&buf, xlogreader);
7066	appendStringInfoString(&buf, " - ");
7067	xlog_outdesc(&buf, xlogreader);
7068	elog(LOG, "%s", buf.data);
7069	pfree(buf.data);
7070	}
7071	#endif
7072
7073	/* Handle interrupt signals of startup process */
7074	HandleStartupProcInterrupts();
7075
7076	/*
7077	* Pause WAL replay, if requested by a hot-standby session via
7078	* SetRecoveryPause().
7079	*
7080	* Note that we intentionally don't take the info_lck spinlock
7081	* here. We might therefore read a slightly stale value of
7082	* the recoveryPause flag, but it can't be very stale (no
7083	* worse than the last spinlock we did acquire). Since a
7084	* pause request is a pretty asynchronous thing anyway,
7085	* possibly responding to it one WAL record later than we
7086	* otherwise would is a minor issue, so it doesn't seem worth
7087	* adding another spinlock cycle to prevent that.
7088	*/
7089	if (((volatile XLogCtlData *) XLogCtl)->recoveryPause)
7090	recoveryPausesHere();
7091
7092	/*
7093	* Have we reached our recovery target?
7094	*/
7095	if (recoveryStopsBefore(xlogreader))
7096	{
7097	reachedStopPoint = true; /* see below */
7098	break;
7099	}
7100
7101	/*
7102	* If we've been asked to lag the master, wait on latch until
7103	* enough time has passed.
7104	*/
7105	if (recoveryApplyDelay(xlogreader))
7106	{
7107	/*
7108	* We test for paused recovery again here. If user sets
7109	* delayed apply, it may be because they expect to pause
7110	* recovery in case of problems, so we must test again
7111	* here otherwise pausing during the delay-wait wouldn't
7112	* work.
7113	*/
7114	if (((volatile XLogCtlData *) XLogCtl)->recoveryPause)
7115	recoveryPausesHere();
7116	}
7117
7118	/* Setup error traceback support for ereport() */
7119	errcallback.callback = rm_redo_error_callback;
7120	errcallback.arg = (void *) xlogreader;
7121	errcallback.previous = error_context_stack;
7122	error_context_stack = &errcallback;
7123
7124	/*
7125	* ShmemVariableCache->nextFullXid must be beyond record's
7126	* xid.
7127	*/
7128	AdvanceNextFullTransactionIdPastXid(record->xl_xid);
7129
7130	/*
7131	* Before replaying this record, check if this record causes
7132	* the current timeline to change. The record is already
7133	* considered to be part of the new timeline, so we update
7134	* ThisTimeLineID before replaying it. That's important so
7135	* that replayEndTLI, which is recorded as the minimum
7136	* recovery point's TLI if recovery stops after this record,
7137	* is set correctly.
7138	*/
7139	if (record->xl_rmid == RM_XLOG_ID)
7140	{
7141	TimeLineID newTLI = ThisTimeLineID;
7142	TimeLineID prevTLI = ThisTimeLineID;
7143	uint8 info = record->xl_info & ~XLR_INFO_MASK;
7144
7145	if (info == XLOG_CHECKPOINT_SHUTDOWN)
7146	{
7147	CheckPoint checkPoint;
7148
7149	memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
7150	newTLI = checkPoint.ThisTimeLineID;
7151	prevTLI = checkPoint.PrevTimeLineID;
7152	}
7153	else if (info == XLOG_END_OF_RECOVERY)
7154	{
7155	xl_end_of_recovery xlrec;
7156
7157	memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
7158	newTLI = xlrec.ThisTimeLineID;
7159	prevTLI = xlrec.PrevTimeLineID;
7160	}
7161
7162	if (newTLI != ThisTimeLineID)
7163	{
7164	/* Check that it's OK to switch to this TLI */
7165	checkTimeLineSwitch(EndRecPtr, newTLI, prevTLI);
7166
7167	/* Following WAL records should be run with new TLI */
7168	ThisTimeLineID = newTLI;
7169	switchedTLI = true;
7170	}
7171	}
7172
7173	/*
7174	* Update shared replayEndRecPtr before replaying this record,
7175	* so that XLogFlush will update minRecoveryPoint correctly.
7176	*/
7177	SpinLockAcquire(&XLogCtl->info_lck);
7178	XLogCtl->replayEndRecPtr = EndRecPtr;
7179	XLogCtl->replayEndTLI = ThisTimeLineID;
7180	SpinLockRelease(&XLogCtl->info_lck);
7181
7182	/*
7183	* If we are attempting to enter Hot Standby mode, process
7184	* XIDs we see
7185	*/
7186	if (standbyState >= STANDBY_INITIALIZED &&
7187	TransactionIdIsValid(record->xl_xid))
7188	RecordKnownAssignedTransactionIds(record->xl_xid);
7189
7190	/* Now apply the WAL record itself */
7191	RmgrTable[record->xl_rmid].rm_redo(xlogreader);
7192
7193	/*
7194	* After redo, check whether the backup pages associated with
7195	* the WAL record are consistent with the existing pages. This
7196	* check is done only if consistency check is enabled for this
7197	* record.
7198	*/
7199	if ((record->xl_info & XLR_CHECK_CONSISTENCY) != `0`)
7200	checkXLogConsistency(xlogreader);
7201
7202	/* Pop the error context stack */
7203	error_context_stack = errcallback.previous;
7204
7205	/*
7206	* Update lastReplayedEndRecPtr after this record has been
7207	* successfully replayed.
7208	*/
7209	SpinLockAcquire(&XLogCtl->info_lck);
7210	XLogCtl->lastReplayedEndRecPtr = EndRecPtr;
7211	XLogCtl->lastReplayedTLI = ThisTimeLineID;
7212	SpinLockRelease(&XLogCtl->info_lck);
7213
7214	/*
7215	* If rm_redo called XLogRequestWalReceiverReply, then we wake
7216	* up the receiver so that it notices the updated
7217	* lastReplayedEndRecPtr and sends a reply to the master.
7218	*/
7219	if (doRequestWalReceiverReply)
7220	{
7221	doRequestWalReceiverReply = false;
7222	WalRcvForceReply();
7223	}
7224
7225	/* Remember this record as the last-applied one */
7226	LastRec = ReadRecPtr;
7227
7228	/* Allow read-only connections if we're consistent now */
7229	CheckRecoveryConsistency();
7230
7231	/* Is this a timeline switch? */
7232	if (switchedTLI)
7233	{
7234	/*
7235	* Before we continue on the new timeline, clean up any
7236	* (possibly bogus) future WAL segments on the old
7237	* timeline.
7238	*/
7239	RemoveNonParentXlogFiles(EndRecPtr, ThisTimeLineID);
7240
7241	/*
7242	* Wake up any walsenders to notice that we are on a new
7243	* timeline.
7244	*/
7245	if (switchedTLI && AllowCascadeReplication())
7246	WalSndWakeup();
7247	}
7248
7249	/* Exit loop if we reached inclusive recovery target */
7250	if (recoveryStopsAfter(xlogreader))
7251	{
7252	reachedStopPoint = true;
7253	break;
7254	}
7255
7256	/* Else, try to fetch the next WAL record */
7257	record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
7258	} while (record != NULL);
7259
7260	/*
7261	* end of main redo apply loop
7262	*/
7263
7264	if (reachedStopPoint)
7265	{
7266	if (!reachedConsistency)
7267	ereport(FATAL,
7268	(errmsg("requested recovery stop point is before consistent recovery point")));
7269
7270	/*
7271	* This is the last point where we can restart recovery with a
7272	* new recovery target, if we shutdown and begin again. After
7273	* this, Resource Managers may choose to do permanent
7274	* corrective actions at end of recovery.
7275	*/
7276	switch (recoveryTargetAction)
7277	{
7278	case RECOVERY_TARGET_ACTION_SHUTDOWN:
7279
7280	/*
7281	* exit with special return code to request shutdown
7282	* of postmaster. Log messages issued from
7283	* postmaster.
7284	*/
7285	proc_exit(`3`);
7286
7287	case RECOVERY_TARGET_ACTION_PAUSE:
7288	SetRecoveryPause(true);
7289	recoveryPausesHere();
7290
7291	/* drop into promote */
7292
7293	case RECOVERY_TARGET_ACTION_PROMOTE:
7294	break;
7295	}
7296	}
7297
7298	/* Allow resource managers to do any required cleanup. */
7299	for (rmid = `0`; rmid <= RM_MAX_ID; rmid++)
7300	{
7301	if (RmgrTable[rmid].rm_cleanup != NULL)
7302	RmgrTable[rmid].rm_cleanup();
7303	}
7304
7305	ereport(LOG,
7306	(errmsg("redo done at %X/%X",
7307	(uint32) (ReadRecPtr >> `32`), (uint32) ReadRecPtr)));
7308	xtime = GetLatestXTime();
7309	if (xtime)
7310	ereport(LOG,
7311	(errmsg("last completed transaction was at log time %s",
7312	timestamptz_to_str(xtime))));
7313
7314	InRedo = false;
7315	}
7316	else
7317	{
7318	/* there are no WAL records following the checkpoint */
7319	ereport(LOG,
7320	(errmsg("redo is not required")));
7321	}
7322	}
7323
7324	/*
7325	* Kill WAL receiver, if it's still running, before we continue to write
7326	* the startup checkpoint record. It will trump over the checkpoint and
7327	* subsequent records if it's still alive when we start writing WAL.
7328	*/
7329	ShutdownWalRcv();
7330
7331	/*
7332	* Reset unlogged relations to the contents of their INIT fork. This is
7333	* done AFTER recovery is complete so as to include any unlogged relations
7334	* created during recovery, but BEFORE recovery is marked as having
7335	* completed successfully. Otherwise we'd not retry if any of the post
7336	* end-of-recovery steps fail.
7337	*/
7338	if (InRecovery)
7339	ResetUnloggedRelations(UNLOGGED_RELATION_INIT);
7340
7341	/*
7342	* We don't need the latch anymore. It's not strictly necessary to disown
7343	* it, but let's do it for the sake of tidiness.
7344	*/
7345	if (StandbyModeRequested)
7346	DisownLatch(&XLogCtl->recoveryWakeupLatch);
7347
7348	/*
7349	* We are now done reading the xlog from stream. Turn off streaming
7350	* recovery to force fetching the files (which would be required at end of
7351	* recovery, e.g., timeline history file) from archive or pg_wal.
7352	*/
7353	StandbyMode = false;
7354
7355	/*
7356	* Re-fetch the last valid or last applied record, so we can identify the
7357	* exact endpoint of what we consider the valid portion of WAL.
7358	*/
7359	record = ReadRecord(xlogreader, LastRec, PANIC, false);
7360	EndOfLog = EndRecPtr;
7361
7362	/*
7363	* EndOfLogTLI is the TLI in the filename of the XLOG segment containing
7364	* the end-of-log. It could be different from the timeline that EndOfLog
7365	* nominally belongs to, if there was a timeline switch in that segment,
7366	* and we were reading the old WAL from a segment belonging to a higher
7367	* timeline.
7368	*/
7369	EndOfLogTLI = xlogreader->readPageTLI;
7370
7371	/*
7372	* Complain if we did not roll forward far enough to render the backup
7373	* dump consistent. Note: it is indeed okay to look at the local variable
7374	* minRecoveryPoint here, even though ControlFile->minRecoveryPoint might
7375	* be further ahead --- ControlFile->minRecoveryPoint cannot have been
7376	* advanced beyond the WAL we processed.
7377	*/
7378	if (InRecovery &&
7379	(EndOfLog < minRecoveryPoint \|\|
7380	!XLogRecPtrIsInvalid(ControlFile->backupStartPoint)))
7381	{
7382	/*
7383	* Ran off end of WAL before reaching end-of-backup WAL record, or
7384	* minRecoveryPoint. That's usually a bad sign, indicating that you
7385	* tried to recover from an online backup but never called
7386	* pg_stop_backup(), or you didn't archive all the WAL up to that
7387	* point. However, this also happens in crash recovery, if the system
7388	* crashes while an online backup is in progress. We must not treat
7389	* that as an error, or the database will refuse to start up.
7390	*/
7391	if (ArchiveRecoveryRequested \|\| ControlFile->backupEndRequired)
7392	{
7393	if (ControlFile->backupEndRequired)
7394	ereport(FATAL,
7395	(errmsg("WAL ends before end of online backup"),
7396	errhint("All WAL generated while online backup was taken must be available at recovery.")));
7397	else if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
7398	ereport(FATAL,
7399	(errmsg("WAL ends before end of online backup"),
7400	errhint("Online backup started with pg_start_backup() must be ended with pg_stop_backup(), and all WAL up to that point must be available at recovery.")));
7401	else
7402	ereport(FATAL,
7403	(errmsg("WAL ends before consistent recovery point")));
7404	}
7405	}
7406
7407	/*
7408	* Pre-scan prepared transactions to find out the range of XIDs present.
7409	* This information is not quite needed yet, but it is positioned here so
7410	* as potential problems are detected before any on-disk change is done.
7411	*/
7412	oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
7413
7414	/*
7415	* Consider whether we need to assign a new timeline ID.
7416	*
7417	* If we are doing an archive recovery, we always assign a new ID. This
7418	* handles a couple of issues. If we stopped short of the end of WAL
7419	* during recovery, then we are clearly generating a new timeline and must
7420	* assign it a unique new ID. Even if we ran to the end, modifying the
7421	* current last segment is problematic because it may result in trying to
7422	* overwrite an already-archived copy of that segment, and we encourage
7423	* DBAs to make their archive_commands reject that. We can dodge the
7424	* problem by making the new active segment have a new timeline ID.
7425	*
7426	* In a normal crash recovery, we can just extend the timeline we were in.
7427	*/
7428	PrevTimeLineID = ThisTimeLineID;
7429	if (ArchiveRecoveryRequested)
7430	{
7431	char reason[`200`];
7432
7433	Assert(InArchiveRecovery);
7434
7435	ThisTimeLineID = findNewestTimeLine(recoveryTargetTLI) + `1`;
7436	ereport(LOG,
7437	(errmsg("selected new timeline ID: %u", ThisTimeLineID)));
7438
7439	/*
7440	* Create a comment for the history file to explain why and where
7441	* timeline changed.
7442	*/
7443	if (recoveryTarget == RECOVERY_TARGET_XID)
7444	snprintf(reason, sizeof(reason),
7445	"%s transaction %u",
7446	recoveryStopAfter ? "after" : "before",
7447	recoveryStopXid);
7448	else if (recoveryTarget == RECOVERY_TARGET_TIME)
7449	snprintf(reason, sizeof(reason),
7450	"%s %s\n",
7451	recoveryStopAfter ? "after" : "before",
7452	timestamptz_to_str(recoveryStopTime));
7453	else if (recoveryTarget == RECOVERY_TARGET_LSN)
7454	snprintf(reason, sizeof(reason),
7455	"%s LSN %X/%X\n",
7456	recoveryStopAfter ? "after" : "before",
7457	(uint32) (recoveryStopLSN >> `32`),
7458	(uint32) recoveryStopLSN);
7459	else if (recoveryTarget == RECOVERY_TARGET_NAME)
7460	snprintf(reason, sizeof(reason),
7461	"at restore point \"%s\"",
7462	recoveryStopName);
7463	else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
7464	snprintf(reason, sizeof(reason), "reached consistency");
7465	else
7466	snprintf(reason, sizeof(reason), "no recovery target specified");
7467
7468	/*
7469	* We are now done reading the old WAL. Turn off archive fetching if
7470	* it was active, and make a writable copy of the last WAL segment.
7471	* (Note that we also have a copy of the last block of the old WAL in
7472	* readBuf; we will use that below.)
7473	*/
7474	exitArchiveRecovery(EndOfLogTLI, EndOfLog);
7475
7476	/*
7477	* Write the timeline history file, and have it archived. After this
7478	* point (or rather, as soon as the file is archived), the timeline
7479	* will appear as "taken" in the WAL archive and to any standby
7480	* servers. If we crash before actually switching to the new
7481	* timeline, standby servers will nevertheless think that we switched
7482	* to the new timeline, and will try to connect to the new timeline.
7483	* To minimize the window for that, try to do as little as possible
7484	* between here and writing the end-of-recovery record.
7485	*/
7486	writeTimeLineHistory(ThisTimeLineID, recoveryTargetTLI,
7487	EndRecPtr, reason);
7488	}
7489
7490	/ Save the selected TimeLineID in shared memory, too /
7491	XLogCtl->ThisTimeLineID = ThisTimeLineID;
7492	XLogCtl->PrevTimeLineID = PrevTimeLineID;
7493
7494	/*
7495	* Prepare to write WAL starting at EndOfLog location, and init xlog
7496	* buffer cache using the block containing the last record from the
7497	* previous incarnation.
7498	*/
7499	Insert = &XLogCtl->Insert;
7500	Insert->PrevBytePos = XLogRecPtrToBytePos(LastRec);
7501	Insert->CurrBytePos = XLogRecPtrToBytePos(EndOfLog);
7502
7503	/*
7504	* Tricky point here: readBuf contains the last block that the LastRec
7505	* record spans, not the one it starts in. The last block is indeed the
7506	* one we want to use.
7507	*/
7508	if (EndOfLog % XLOG_BLCKSZ != `0`)
7509	{
7510	char *page;
7511	int len;
7512	int firstIdx;
7513	XLogRecPtr pageBeginPtr;
7514
7515	pageBeginPtr = EndOfLog - (EndOfLog % XLOG_BLCKSZ);
7516	Assert(readOff == XLogSegmentOffset(pageBeginPtr, wal_segment_size));
7517
7518	firstIdx = XLogRecPtrToBufIdx(EndOfLog);
7519
7520	/ Copy the valid part of the last block, and zero the rest /
7521	page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ];
7522	len = EndOfLog % XLOG_BLCKSZ;
7523	memcpy(page, xlogreader->readBuf, len);
7524	memset(page + len, `0`, XLOG_BLCKSZ - len);
7525
7526	XLogCtl->xlblocks[firstIdx] = pageBeginPtr + XLOG_BLCKSZ;
7527	XLogCtl->InitializedUpTo = pageBeginPtr + XLOG_BLCKSZ;
7528	}
7529	else
7530	{
7531	/*
7532	* There is no partial block to copy. Just set InitializedUpTo, and
7533	* let the first attempt to insert a log record to initialize the next
7534	* buffer.
7535	*/
7536	XLogCtl->InitializedUpTo = EndOfLog;
7537	}
7538
7539	LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
7540
7541	XLogCtl->LogwrtResult = LogwrtResult;
7542
7543	XLogCtl->LogwrtRqst.Write = EndOfLog;
7544	XLogCtl->LogwrtRqst.Flush = EndOfLog;
7545
7546	/*
7547	* Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
7548	* record before resource manager writes cleanup WAL records or checkpoint
7549	* record is written.
7550	*/
7551	Insert->fullPageWrites = lastFullPageWrites;
7552	LocalSetXLogInsertAllowed();
7553	UpdateFullPageWrites();
7554	LocalXLogInsertAllowed = -`1`;
7555
7556	if (InRecovery)
7557	{
7558	/*
7559	* Perform a checkpoint to update all our recovery activity to disk.
7560	*
7561	* Note that we write a shutdown checkpoint rather than an on-line
7562	* one. This is not particularly critical, but since we may be
7563	* assigning a new TLI, using a shutdown checkpoint allows us to have
7564	* the rule that TLI only changes in shutdown checkpoints, which
7565	* allows some extra error checking in xlog_redo.
7566	*
7567	* In fast promotion, only create a lightweight end-of-recovery record
7568	* instead of a full checkpoint. A checkpoint is requested later,
7569	* after we're fully out of recovery mode and already accepting
7570	* queries.
7571	*/
7572	if (bgwriterLaunched)
7573	{
7574	if (fast_promote)
7575	{
7576	checkPointLoc = ControlFile->checkPoint;
7577
7578	/*
7579	* Confirm the last checkpoint is available for us to recover
7580	* from if we fail.
7581	*/
7582	record = ReadCheckpointRecord(xlogreader, checkPointLoc, `1`, false);
7583	if (record != NULL)
7584	{
7585	fast_promoted = true;
7586
7587	/*
7588	* Insert a special WAL record to mark the end of
7589	* recovery, since we aren't doing a checkpoint. That
7590	* means that the checkpointer process may likely be in
7591	* the middle of a time-smoothed restartpoint and could
7592	* continue to be for minutes after this. That sounds
7593	* strange, but the effect is roughly the same and it
7594	* would be stranger to try to come out of the
7595	* restartpoint and then checkpoint. We request a
7596	* checkpoint later anyway, just for safety.
7597	*/
7598	CreateEndOfRecoveryRecord();
7599	}
7600	}
7601
7602	if (!fast_promoted)
7603	RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY \|
7604	CHECKPOINT_IMMEDIATE \|
7605	CHECKPOINT_WAIT);
7606	}
7607	else
7608	CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY \| CHECKPOINT_IMMEDIATE);
7609
7610	/*
7611	* And finally, execute the recovery_end_command, if any.
7612	*/
7613	if (recoveryEndCommand && strcmp(recoveryEndCommand, "") != `0`)
7614	ExecuteRecoveryCommand(recoveryEndCommand,
7615	"recovery_end_command",
7616	true);
7617	}
7618
7619	if (ArchiveRecoveryRequested)
7620	{
7621	/*
7622	* We switched to a new timeline. Clean up segments on the old
7623	* timeline.
7624	*
7625	* If there are any higher-numbered segments on the old timeline,
7626	* remove them. They might contain valid WAL, but they might also be
7627	* pre-allocated files containing garbage. In any case, they are not
7628	* part of the new timeline's history so we don't need them.
7629	*/
7630	RemoveNonParentXlogFiles(EndOfLog, ThisTimeLineID);
7631
7632	/*
7633	* If the switch happened in the middle of a segment, what to do with
7634	* the last, partial segment on the old timeline? If we don't archive
7635	* it, and the server that created the WAL never archives it either
7636	* (e.g. because it was hit by a meteor), it will never make it to the
7637	* archive. That's OK from our point of view, because the new segment
7638	* that we created with the new TLI contains all the WAL from the old
7639	* timeline up to the switch point. But if you later try to do PITR to
7640	* the "missing" WAL on the old timeline, recovery won't find it in
7641	* the archive. It's physically present in the new file with new TLI,
7642	* but recovery won't look there when it's recovering to the older
7643	* timeline. On the other hand, if we archive the partial segment, and
7644	* the original server on that timeline is still running and archives
7645	* the completed version of the same segment later, it will fail. (We
7646	* used to do that in 9.4 and below, and it caused such problems).
7647	*
7648	* As a compromise, we rename the last segment with the .partial
7649	* suffix, and archive it. Archive recovery will never try to read
7650	* .partial segments, so they will normally go unused. But in the odd
7651	* PITR case, the administrator can copy them manually to the pg_wal
7652	* directory (removing the suffix). They can be useful in debugging,
7653	* too.
7654	*
7655	* If a .done or .ready file already exists for the old timeline,
7656	* however, we had already determined that the segment is complete, so
7657	* we can let it be archived normally. (In particular, if it was
7658	* restored from the archive to begin with, it's expected to have a
7659	* .done file).
7660	*/
7661	if (XLogSegmentOffset(EndOfLog, wal_segment_size) != `0` &&
7662	XLogArchivingActive())
7663	{
7664	char origfname[MAXFNAMELEN];
7665	XLogSegNo endLogSegNo;
7666
7667	XLByteToPrevSeg(EndOfLog, endLogSegNo, wal_segment_size);
7668	XLogFileName(origfname, EndOfLogTLI, endLogSegNo, wal_segment_size);
7669
7670	if (!XLogArchiveIsReadyOrDone(origfname))
7671	{
7672	char origpath[MAXPGPATH];
7673	char partialfname[MAXFNAMELEN];
7674	char partialpath[MAXPGPATH];
7675
7676	XLogFilePath(origpath, EndOfLogTLI, endLogSegNo, wal_segment_size);
7677	snprintf(partialfname, MAXFNAMELEN, "%s.partial", origfname);
7678	snprintf(partialpath, MAXPGPATH, "%s.partial", origpath);
7679
7680	/*
7681	* Make sure there's no .done or .ready file for the .partial
7682	* file.
7683	*/
7684	XLogArchiveCleanup(partialfname);
7685
7686	durable_rename(origpath, partialpath, ERROR);
7687	XLogArchiveNotify(partialfname);
7688	}
7689	}
7690	}
7691
7692	/*
7693	* Preallocate additional log files, if wanted.
7694	*/
7695	PreallocXlogFiles(EndOfLog);
7696
7697	/*
7698	* Okay, we're officially UP.
7699	*/
7700	InRecovery = false;
7701
7702	/ start the archive_timeout timer and LSN running /
7703	XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
7704	XLogCtl->lastSegSwitchLSN = EndOfLog;
7705
7706	/ also initialize latestCompletedXid, to nextXid - 1 /
7707	LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
7708	ShmemVariableCache->latestCompletedXid = XidFromFullTransactionId(ShmemVariableCache->nextFullXid);
7709	TransactionIdRetreat(ShmemVariableCache->latestCompletedXid);
7710	LWLockRelease(ProcArrayLock);
7711
7712	/*
7713	* Start up the commit log and subtrans, if not already done for hot
7714	* standby. (commit timestamps are started below, if necessary.)
7715	*/
7716	if (standbyState == STANDBY_DISABLED)
7717	{
7718	StartupCLOG();
7719	StartupSUBTRANS(oldestActiveXID);
7720	}
7721
7722	/*
7723	* Perform end of recovery actions for any SLRUs that need it.
7724	*/
7725	TrimCLOG();
7726	TrimMultiXact();
7727
7728	/ Reload shared-memory state for prepared transactions /
7729	RecoverPreparedTransactions();
7730
7731	/*
7732	* Shutdown the recovery environment. This must occur after
7733	* RecoverPreparedTransactions(), see notes for lock_twophase_recover()
7734	*/
7735	if (standbyState != STANDBY_DISABLED)
7736	ShutdownRecoveryTransactionEnvironment();
7737
7738	/ Shut down xlogreader /
7739	if (readFile >= `0`)
7740	{
7741	close(readFile);
7742	readFile = -`1`;
7743	}
7744	XLogReaderFree(xlogreader);
7745
7746	/*
7747	* If any of the critical GUCs have changed, log them before we allow
7748	* backends to write WAL.
7749	*/
7750	LocalSetXLogInsertAllowed();
7751	XLogReportParameters();
7752
7753	/*
7754	* Local WAL inserts enabled, so it's time to finish initialization of
7755	* commit timestamp.
7756	*/
7757	CompleteCommitTsInitialization();
7758
7759	/*
7760	* All done with end-of-recovery actions.
7761	*
7762	* Now allow backends to write WAL and update the control file status in
7763	* consequence. The boolean flag allowing backends to write WAL is
7764	* updated while holding ControlFileLock to prevent other backends to look
7765	* at an inconsistent state of the control file in shared memory. There
7766	* is still a small window during which backends can write WAL and the
7767	* control file is still referring to a system not in DB_IN_PRODUCTION
7768	* state while looking at the on-disk control file.
7769	*
7770	* Also, although the boolean flag to allow WAL is probably atomic in
7771	* itself, we use the info_lck here to ensure that there are no race
7772	* conditions concerning visibility of other recent updates to shared
7773	* memory.
7774	*/
7775	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7776	ControlFile->state = DB_IN_PRODUCTION;
7777	ControlFile->time = (pg_time_t) time(NULL);
7778
7779	SpinLockAcquire(&XLogCtl->info_lck);
7780	XLogCtl->SharedRecoveryInProgress = false;
7781	SpinLockRelease(&XLogCtl->info_lck);
7782
7783	UpdateControlFile();
7784	LWLockRelease(ControlFileLock);
7785
7786	/*
7787	* If there were cascading standby servers connected to us, nudge any wal
7788	* sender processes to notice that we've been promoted.
7789	*/
7790	WalSndWakeup();
7791
7792	/*
7793	* If this was a fast promotion, request an (online) checkpoint now. This
7794	* isn't required for consistency, but the last restartpoint might be far
7795	* back, and in case of a crash, recovering from it might take a longer
7796	* than is appropriate now that we're not in standby mode anymore.
7797	*/
7798	if (fast_promoted)
7799	RequestCheckpoint(CHECKPOINT_FORCE);
7800	}
7801
7802	/*
7803	* Checks if recovery has reached a consistent state. When consistency is
7804	* reached and we have a valid starting standby snapshot, tell postmaster
7805	* that it can start accepting read-only connections.
7806	*/
7807	static void
7808	CheckRecoveryConsistency(void)
7809	{
7810	XLogRecPtr lastReplayedEndRecPtr;
7811
7812	/*
7813	* During crash recovery, we don't reach a consistent state until we've
7814	* replayed all the WAL.
7815	*/
7816	if (XLogRecPtrIsInvalid(minRecoveryPoint))
7817	return;
7818
7819	Assert(InArchiveRecovery);
7820
7821	/*
7822	* assume that we are called in the startup process, and hence don't need
7823	* a lock to read lastReplayedEndRecPtr
7824	*/
7825	lastReplayedEndRecPtr = XLogCtl->lastReplayedEndRecPtr;
7826
7827	/*
7828	* Have we reached the point where our base backup was completed?
7829	*/
7830	if (!XLogRecPtrIsInvalid(ControlFile->backupEndPoint) &&
7831	ControlFile->backupEndPoint <= lastReplayedEndRecPtr)
7832	{
7833	/*
7834	* We have reached the end of base backup, as indicated by pg_control.
7835	* The data on disk is now consistent. Reset backupStartPoint and
7836	* backupEndPoint, and update minRecoveryPoint to make sure we don't
7837	* allow starting up at an earlier point even if recovery is stopped
7838	* and restarted soon after this.
7839	*/
7840	elog(DEBUG1, "end of backup reached");
7841
7842	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7843
7844	if (ControlFile->minRecoveryPoint < lastReplayedEndRecPtr)
7845	ControlFile->minRecoveryPoint = lastReplayedEndRecPtr;
7846
7847	ControlFile->backupStartPoint = InvalidXLogRecPtr;
7848	ControlFile->backupEndPoint = InvalidXLogRecPtr;
7849	ControlFile->backupEndRequired = false;
7850	UpdateControlFile();
7851
7852	LWLockRelease(ControlFileLock);
7853	}
7854
7855	/*
7856	* Have we passed our safe starting point? Note that minRecoveryPoint is
7857	* known to be incorrectly set if ControlFile->backupEndRequired, until
7858	* the XLOG_BACKUP_END arrives to advise us of the correct
7859	* minRecoveryPoint. All we know prior to that is that we're not
7860	* consistent yet.
7861	*/
7862	if (!reachedConsistency && !ControlFile->backupEndRequired &&
7863	minRecoveryPoint <= lastReplayedEndRecPtr &&
7864	XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
7865	{
7866	/*
7867	* Check to see if the XLOG sequence contained any unresolved
7868	* references to uninitialized pages.
7869	*/
7870	XLogCheckInvalidPages();
7871
7872	reachedConsistency = true;
7873	ereport(LOG,
7874	(errmsg("consistent recovery state reached at %X/%X",
7875	(uint32) (lastReplayedEndRecPtr >> `32`),
7876	(uint32) lastReplayedEndRecPtr)));
7877	}
7878
7879	/*
7880	* Have we got a valid starting snapshot that will allow queries to be
7881	* run? If so, we can tell postmaster that the database is consistent now,
7882	* enabling connections.
7883	*/
7884	if (standbyState == STANDBY_SNAPSHOT_READY &&
7885	!LocalHotStandbyActive &&
7886	reachedConsistency &&
7887	IsUnderPostmaster)
7888	{
7889	SpinLockAcquire(&XLogCtl->info_lck);
7890	XLogCtl->SharedHotStandbyActive = true;
7891	SpinLockRelease(&XLogCtl->info_lck);
7892
7893	LocalHotStandbyActive = true;
7894
7895	SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
7896	}
7897	}
7898
7899	/*
7900	* Is the system still in recovery?
7901	*
7902	* Unlike testing InRecovery, this works in any process that's connected to
7903	* shared memory.
7904	*
7905	* As a side-effect, we initialize the local TimeLineID and RedoRecPtr
7906	* variables the first time we see that recovery is finished.
7907	*/
7908	bool
7909	RecoveryInProgress(void)
7910	{
7911	/*
7912	* We check shared state each time only until we leave recovery mode. We
7913	* can't re-enter recovery, so there's no need to keep checking after the
7914	* shared variable has once been seen false.
7915	*/
7916	if (!LocalRecoveryInProgress)
7917	return false;
7918	else
7919	{
7920	/*
7921	* use volatile pointer to make sure we make a fresh read of the
7922	* shared variable.
7923	*/
7924	volatile XLogCtlData *xlogctl = XLogCtl;
7925
7926	LocalRecoveryInProgress = xlogctl->SharedRecoveryInProgress;
7927
7928	/*
7929	* Initialize TimeLineID and RedoRecPtr when we discover that recovery
7930	* is finished. InitPostgres() relies upon this behaviour to ensure
7931	* that InitXLOGAccess() is called at backend startup. (If you change
7932	* this, see also LocalSetXLogInsertAllowed.)
7933	*/
7934	if (!LocalRecoveryInProgress)
7935	{
7936	/*
7937	* If we just exited recovery, make sure we read TimeLineID and
7938	* RedoRecPtr after SharedRecoveryInProgress (for machines with
7939	* weak memory ordering).
7940	*/
7941	pg_memory_barrier();
7942	InitXLOGAccess();
7943	}
7944
7945	/*
7946	* Note: We don't need a memory barrier when we're still in recovery.
7947	* We might exit recovery immediately after return, so the caller
7948	* can't rely on 'true' meaning that we're still in recovery anyway.
7949	*/
7950
7951	return LocalRecoveryInProgress;
7952	}
7953	}
7954
7955	/*
7956	* Is HotStandby active yet? This is only important in special backends
7957	* since normal backends won't ever be able to connect until this returns
7958	* true. Postmaster knows this by way of signal, not via shared memory.
7959	*
7960	* Unlike testing standbyState, this works in any process that's connected to
7961	* shared memory. (And note that standbyState alone doesn't tell the truth
7962	* anyway.)
7963	*/
7964	bool
7965	HotStandbyActive(void)
7966	{
7967	/*
7968	* We check shared state each time only until Hot Standby is active. We
7969	* can't de-activate Hot Standby, so there's no need to keep checking
7970	* after the shared variable has once been seen true.
7971	*/
7972	if (LocalHotStandbyActive)
7973	return true;
7974	else
7975	{
7976	/ spinlock is essential on machines with weak memory ordering! /
7977	SpinLockAcquire(&XLogCtl->info_lck);
7978	LocalHotStandbyActive = XLogCtl->SharedHotStandbyActive;
7979	SpinLockRelease(&XLogCtl->info_lck);
7980
7981	return LocalHotStandbyActive;
7982	}
7983	}
7984
7985	/*
7986	* Like HotStandbyActive(), but to be used only in WAL replay code,
7987	* where we don't need to ask any other process what the state is.
7988	*/
7989	bool
7990	HotStandbyActiveInReplay(void)
7991	{
7992	Assert(AmStartupProcess() \|\| !IsPostmasterEnvironment);
7993	return LocalHotStandbyActive;
7994	}
7995
7996	/*
7997	* Is this process allowed to insert new WAL records?
7998	*
7999	* Ordinarily this is essentially equivalent to !RecoveryInProgress().
8000	* But we also have provisions for forcing the result "true" or "false"
8001	* within specific processes regardless of the global state.
8002	*/
8003	bool
8004	XLogInsertAllowed(void)
8005	{
8006	/*
8007	* If value is "unconditionally true" or "unconditionally false", just
8008	* return it. This provides the normal fast path once recovery is known
8009	* done.
8010	*/
8011	if (LocalXLogInsertAllowed >= `0`)
8012	return (bool) LocalXLogInsertAllowed;
8013
8014	/*
8015	* Else, must check to see if we're still in recovery.
8016	*/
8017	if (RecoveryInProgress())
8018	return false;
8019
8020	/*
8021	* On exit from recovery, reset to "unconditionally true", since there is
8022	* no need to keep checking.
8023	*/
8024	LocalXLogInsertAllowed = `1`;
8025	return true;
8026	}
8027
8028	/*
8029	* Make XLogInsertAllowed() return true in the current process only.
8030	*
8031	* Note: it is allowed to switch LocalXLogInsertAllowed back to -1 later,
8032	* and even call LocalSetXLogInsertAllowed() again after that.
8033	*/
8034	static void
8035	LocalSetXLogInsertAllowed(void)
8036	{
8037	Assert(LocalXLogInsertAllowed == -`1`);
8038	LocalXLogInsertAllowed = `1`;
8039
8040	/ Initialize as RecoveryInProgress() would do when switching state /
8041	InitXLOGAccess();
8042	}
8043
8044	/*
8045	* Subroutine to try to fetch and validate a prior checkpoint record.
8046	*
8047	* whichChkpt identifies the checkpoint (merely for reporting purposes).
8048	* 1 for "primary", 0 for "other" (backup_label)
8049	*/
8050	static XLogRecord *
8051	ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
8052	int whichChkpt, bool report)
8053	{
8054	XLogRecord *record;
8055	uint8 info;
8056
8057	if (!XRecOffIsValid(RecPtr))
8058	{
8059	if (!report)
8060	return NULL;
8061
8062	switch (whichChkpt)
8063	{
8064	case `1`:
8065	ereport(LOG,
8066	(errmsg("invalid primary checkpoint link in control file")));
8067	break;
8068	default:
8069	ereport(LOG,
8070	(errmsg("invalid checkpoint link in backup_label file")));
8071	break;
8072	}
8073	return NULL;
8074	}
8075
8076	record = ReadRecord(xlogreader, RecPtr, LOG, true);
8077
8078	if (record == NULL)
8079	{
8080	if (!report)
8081	return NULL;
8082
8083	switch (whichChkpt)
8084	{
8085	case `1`:
8086	ereport(LOG,
8087	(errmsg("invalid primary checkpoint record")));
8088	break;
8089	default:
8090	ereport(LOG,
8091	(errmsg("invalid checkpoint record")));
8092	break;
8093	}
8094	return NULL;
8095	}
8096	if (record->xl_rmid != RM_XLOG_ID)
8097	{
8098	switch (whichChkpt)
8099	{
8100	case `1`:
8101	ereport(LOG,
8102	(errmsg("invalid resource manager ID in primary checkpoint record")));
8103	break;
8104	default:
8105	ereport(LOG,
8106	(errmsg("invalid resource manager ID in checkpoint record")));
8107	break;
8108	}
8109	return NULL;
8110	}
8111	info = record->xl_info & ~XLR_INFO_MASK;
8112	if (info != XLOG_CHECKPOINT_SHUTDOWN &&
8113	info != XLOG_CHECKPOINT_ONLINE)
8114	{
8115	switch (whichChkpt)
8116	{
8117	case `1`:
8118	ereport(LOG,
8119	(errmsg("invalid xl_info in primary checkpoint record")));
8120	break;
8121	default:
8122	ereport(LOG,
8123	(errmsg("invalid xl_info in checkpoint record")));
8124	break;
8125	}
8126	return NULL;
8127	}
8128	if (record->xl_tot_len != SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint))
8129	{
8130	switch (whichChkpt)
8131	{
8132	case `1`:
8133	ereport(LOG,
8134	(errmsg("invalid length of primary checkpoint record")));
8135	break;
8136	default:
8137	ereport(LOG,
8138	(errmsg("invalid length of checkpoint record")));
8139	break;
8140	}
8141	return NULL;
8142	}
8143	return record;
8144	}
8145
8146	/*
8147	* This must be called in a backend process before creating WAL records
8148	* (except in a standalone backend, which does StartupXLOG instead). We need
8149	* to initialize the local copies of ThisTimeLineID and RedoRecPtr.
8150	*
8151	* Note: before Postgres 8.0, we went to some effort to keep the postmaster
8152	* process's copies of ThisTimeLineID and RedoRecPtr valid too. This was
8153	* unnecessary however, since the postmaster itself never touches XLOG anyway.
8154	*/
8155	void
8156	InitXLOGAccess(void)
8157	{
8158	XLogCtlInsert *Insert = &XLogCtl->Insert;
8159
8160	/ ThisTimeLineID doesn't change so we need no lock to copy it /
8161	ThisTimeLineID = XLogCtl->ThisTimeLineID;
8162	Assert(ThisTimeLineID != `0` \|\| IsBootstrapProcessingMode());
8163
8164	/ set wal_segment_size /
8165	wal_segment_size = ControlFile->xlog_seg_size;
8166
8167	/ Use GetRedoRecPtr to copy the RedoRecPtr safely /
8168	(void) GetRedoRecPtr();
8169	/ Also update our copy of doPageWrites. /
8170	doPageWrites = (Insert->fullPageWrites \|\| Insert->forcePageWrites);
8171
8172	/ Also initialize the working areas for constructing WAL records /
8173	InitXLogInsert();
8174	}
8175
8176	/*
8177	* Return the current Redo pointer from shared memory.
8178	*
8179	* As a side-effect, the local RedoRecPtr copy is updated.
8180	*/
8181	XLogRecPtr
8182	GetRedoRecPtr(void)
8183	{
8184	XLogRecPtr ptr;
8185
8186	/*
8187	* The possibly not up-to-date copy in XlogCtl is enough. Even if we
8188	* grabbed a WAL insertion lock to read the master copy, someone might
8189	* update it just after we've released the lock.
8190	*/
8191	SpinLockAcquire(&XLogCtl->info_lck);
8192	ptr = XLogCtl->RedoRecPtr;
8193	SpinLockRelease(&XLogCtl->info_lck);
8194
8195	if (RedoRecPtr < ptr)
8196	RedoRecPtr = ptr;
8197
8198	return RedoRecPtr;
8199	}
8200
8201	/*
8202	* Return information needed to decide whether a modified block needs a
8203	* full-page image to be included in the WAL record.
8204	*
8205	* The returned values are cached copies from backend-private memory, and
8206	* possibly out-of-date. XLogInsertRecord will re-check them against
8207	* up-to-date values, while holding the WAL insert lock.
8208	*/
8209	void
8210	GetFullPageWriteInfo(XLogRecPtr RedoRecPtr_p, bool doPageWrites_p)
8211	{
8212	*RedoRecPtr_p = RedoRecPtr;
8213	*doPageWrites_p = doPageWrites;
8214	}
8215
8216	/*
8217	* GetInsertRecPtr -- Returns the current insert position.
8218	*
8219	* NOTE: The value actually returned is the position of the last full
8220	* xlog page. It lags behind the real insert position by at most 1 page.
8221	* For that, we don't need to scan through WAL insertion locks, and an
8222	* approximation is enough for the current usage of this function.
8223	*/
8224	XLogRecPtr
8225	GetInsertRecPtr(void)
8226	{
8227	XLogRecPtr recptr;
8228
8229	SpinLockAcquire(&XLogCtl->info_lck);
8230	recptr = XLogCtl->LogwrtRqst.Write;
8231	SpinLockRelease(&XLogCtl->info_lck);
8232
8233	return recptr;
8234	}
8235
8236	/*
8237	* GetFlushRecPtr -- Returns the current flush position, ie, the last WAL
8238	* position known to be fsync'd to disk.
8239	*/
8240	XLogRecPtr
8241	GetFlushRecPtr(void)
8242	{
8243	SpinLockAcquire(&XLogCtl->info_lck);
8244	LogwrtResult = XLogCtl->LogwrtResult;
8245	SpinLockRelease(&XLogCtl->info_lck);
8246
8247	return LogwrtResult.Flush;
8248	}
8249
8250	/*
8251	* GetLastImportantRecPtr -- Returns the LSN of the last important record
8252	* inserted. All records not explicitly marked as unimportant are considered
8253	* important.
8254	*
8255	* The LSN is determined by computing the maximum of
8256	* WALInsertLocks[i].lastImportantAt.
8257	*/
8258	XLogRecPtr
8259	GetLastImportantRecPtr(void)
8260	{
8261	XLogRecPtr res = InvalidXLogRecPtr;
8262	int i;
8263
8264	for (i = `0`; i < NUM_XLOGINSERT_LOCKS; i++)
8265	{
8266	XLogRecPtr last_important;
8267
8268	/*
8269	* Need to take a lock to prevent torn reads of the LSN, which are
8270	* possible on some of the supported platforms. WAL insert locks only
8271	* support exclusive mode, so we have to use that.
8272	*/
8273	LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
8274	last_important = WALInsertLocks[i].l.lastImportantAt;
8275	LWLockRelease(&WALInsertLocks[i].l.lock);
8276
8277	if (res < last_important)
8278	res = last_important;
8279	}
8280
8281	return res;
8282	}
8283
8284	/*
8285	* Get the time and LSN of the last xlog segment switch
8286	*/
8287	pg_time_t
8288	GetLastSegSwitchData(XLogRecPtr *lastSwitchLSN)
8289	{
8290	pg_time_t result;
8291
8292	/ Need WALWriteLock, but shared lock is sufficient /
8293	LWLockAcquire(WALWriteLock, LW_SHARED);
8294	result = XLogCtl->lastSegSwitchTime;
8295	*lastSwitchLSN = XLogCtl->lastSegSwitchLSN;
8296	LWLockRelease(WALWriteLock);
8297
8298	return result;
8299	}
8300
8301	/*
8302	* This must be called ONCE during postmaster or standalone-backend shutdown
8303	*/
8304	void
8305	ShutdownXLOG(int code, Datum arg)
8306	{
8307	/*
8308	* We should have an aux process resource owner to use, and we should not
8309	* be in a transaction that's installed some other resowner.
8310	*/
8311	Assert(AuxProcessResourceOwner != NULL);
8312	Assert(CurrentResourceOwner == NULL \|\|
8313	CurrentResourceOwner == AuxProcessResourceOwner);
8314	CurrentResourceOwner = AuxProcessResourceOwner;
8315
8316	/ Don't be chatty in standalone mode /
8317	ereport(IsPostmasterEnvironment ? LOG : NOTICE,
8318	(errmsg("shutting down")));
8319
8320	/*
8321	* Signal walsenders to move to stopping state.
8322	*/
8323	WalSndInitStopping();
8324
8325	/*
8326	* Wait for WAL senders to be in stopping state. This prevents commands
8327	* from writing new WAL.
8328	*/
8329	WalSndWaitStopping();
8330
8331	if (RecoveryInProgress())
8332	CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN \| CHECKPOINT_IMMEDIATE);
8333	else
8334	{
8335	/*
8336	* If archiving is enabled, rotate the last XLOG file so that all the
8337	* remaining records are archived (postmaster wakes up the archiver
8338	* process one more time at the end of shutdown). The checkpoint
8339	* record will go to the next XLOG file and won't be archived (yet).
8340	*/
8341	if (XLogArchivingActive() && XLogArchiveCommandSet())
8342	RequestXLogSwitch(false);
8343
8344	CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN \| CHECKPOINT_IMMEDIATE);
8345	}
8346	ShutdownCLOG();
8347	ShutdownCommitTs();
8348	ShutdownSUBTRANS();
8349	ShutdownMultiXact();
8350	}
8351
8352	/*
8353	* Log start of a checkpoint.
8354	*/
8355	static void
8356	LogCheckpointStart(int flags, bool restartpoint)
8357	{
8358	elog(LOG, "%s starting:%s%s%s%s%s%s%s%s",
8359	restartpoint ? "restartpoint" : "checkpoint",
8360	(flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
8361	(flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
8362	(flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
8363	(flags & CHECKPOINT_FORCE) ? " force" : "",
8364	(flags & CHECKPOINT_WAIT) ? " wait" : "",
8365	(flags & CHECKPOINT_CAUSE_XLOG) ? " wal" : "",
8366	(flags & CHECKPOINT_CAUSE_TIME) ? " time" : "",
8367	(flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" : "");
8368	}
8369
8370	/*
8371	* Log end of a checkpoint.
8372	*/
8373	static void
8374	LogCheckpointEnd(bool restartpoint)
8375	{
8376	long write_secs,
8377	sync_secs,
8378	total_secs,
8379	longest_secs,
8380	average_secs;
8381	int write_usecs,
8382	sync_usecs,
8383	total_usecs,
8384	longest_usecs,
8385	average_usecs;
8386	uint64 average_sync_time;
8387
8388	CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
8389
8390	TimestampDifference(CheckpointStats.ckpt_write_t,
8391	CheckpointStats.ckpt_sync_t,
8392	&write_secs, &write_usecs);
8393
8394	TimestampDifference(CheckpointStats.ckpt_sync_t,
8395	CheckpointStats.ckpt_sync_end_t,
8396	&sync_secs, &sync_usecs);
8397
8398	/ Accumulate checkpoint timing summary data, in milliseconds. /
8399	BgWriterStats.m_checkpoint_write_time +=
8400	write_secs * `1000` + write_usecs / `1000`;
8401	BgWriterStats.m_checkpoint_sync_time +=
8402	sync_secs * `1000` + sync_usecs / `1000`;
8403
8404	/*
8405	* All of the published timing statistics are accounted for. Only
8406	* continue if a log message is to be written.
8407	*/
8408	if (!log_checkpoints)
8409	return;
8410
8411	TimestampDifference(CheckpointStats.ckpt_start_t,
8412	CheckpointStats.ckpt_end_t,
8413	&total_secs, &total_usecs);
8414
8415	/*
8416	* Timing values returned from CheckpointStats are in microseconds.
8417	* Convert to the second plus microsecond form that TimestampDifference
8418	* returns for homogeneous printing.
8419	*/
8420	longest_secs = (long) (CheckpointStats.ckpt_longest_sync / `1000000`);
8421	longest_usecs = CheckpointStats.ckpt_longest_sync -
8422	(uint64) longest_secs * `1000000`;
8423
8424	average_sync_time = `0`;
8425	if (CheckpointStats.ckpt_sync_rels > `0`)
8426	average_sync_time = CheckpointStats.ckpt_agg_sync_time /
8427	CheckpointStats.ckpt_sync_rels;
8428	average_secs = (long) (average_sync_time / `1000000`);
8429	average_usecs = average_sync_time - (uint64) average_secs * `1000000`;
8430
8431	elog(LOG, "%s complete: wrote %d buffers (%.1f%%); "
8432	"%d WAL file(s) added, %d removed, %d recycled; "
8433	"write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
8434	"sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; "
8435	"distance=%d kB, estimate=%d kB",
8436	restartpoint ? "restartpoint" : "checkpoint",
8437	CheckpointStats.ckpt_bufs_written,
8438	(double) CheckpointStats.ckpt_bufs_written * `100` / NBuffers,
8439	CheckpointStats.ckpt_segs_added,
8440	CheckpointStats.ckpt_segs_removed,
8441	CheckpointStats.ckpt_segs_recycled,
8442	write_secs, write_usecs / `1000`,
8443	sync_secs, sync_usecs / `1000`,
8444	total_secs, total_usecs / `1000`,
8445	CheckpointStats.ckpt_sync_rels,
8446	longest_secs, longest_usecs / `1000`,
8447	average_secs, average_usecs / `1000`,
8448	(int) (PrevCheckPointDistance / `1024.0`),
8449	(int) (CheckPointDistanceEstimate / `1024.0`));
8450	}
8451
8452	/*
8453	* Update the estimate of distance between checkpoints.
8454	*
8455	* The estimate is used to calculate the number of WAL segments to keep
8456	* preallocated, see XLOGFileSlop().
8457	*/
8458	static void
8459	UpdateCheckPointDistanceEstimate(uint64 nbytes)
8460	{
8461	/*
8462	* To estimate the number of segments consumed between checkpoints, keep a
8463	* moving average of the amount of WAL generated in previous checkpoint
8464	* cycles. However, if the load is bursty, with quiet periods and busy
8465	* periods, we want to cater for the peak load. So instead of a plain
8466	* moving average, let the average decline slowly if the previous cycle
8467	* used less WAL than estimated, but bump it up immediately if it used
8468	* more.
8469	*
8470	* When checkpoints are triggered by max_wal_size, this should converge to
8471	* CheckpointSegments * wal_segment_size,
8472	*
8473	* Note: This doesn't pay any attention to what caused the checkpoint.
8474	* Checkpoints triggered manually with CHECKPOINT command, or by e.g.
8475	* starting a base backup, are counted the same as those created
8476	* automatically. The slow-decline will largely mask them out, if they are
8477	* not frequent. If they are frequent, it seems reasonable to count them
8478	* in as any others; if you issue a manual checkpoint every 5 minutes and
8479	* never let a timed checkpoint happen, it makes sense to base the
8480	* preallocation on that 5 minute interval rather than whatever
8481	* checkpoint_timeout is set to.
8482	*/
8483	PrevCheckPointDistance = nbytes;
8484	if (CheckPointDistanceEstimate < nbytes)
8485	CheckPointDistanceEstimate = nbytes;
8486	else
8487	CheckPointDistanceEstimate =
8488	(`0.90` * CheckPointDistanceEstimate + `0.10` * (double) nbytes);
8489	}
8490
8491	/*
8492	* Perform a checkpoint --- either during shutdown, or on-the-fly
8493	*
8494	* flags is a bitwise OR of the following:
8495	* CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
8496	* CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
8497	* CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
8498	* ignoring checkpoint_completion_target parameter.
8499	* CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
8500	* since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
8501	* CHECKPOINT_END_OF_RECOVERY).
8502	* CHECKPOINT_FLUSH_ALL: also flush buffers of unlogged tables.
8503	*
8504	* Note: flags contains other bits, of interest here only for logging purposes.
8505	* In particular note that this routine is synchronous and does not pay
8506	* attention to CHECKPOINT_WAIT.
8507	*
8508	* If !shutdown then we are writing an online checkpoint. This is a very special
8509	* kind of operation and WAL record because the checkpoint action occurs over
8510	* a period of time yet logically occurs at just a single LSN. The logical
8511	* position of the WAL record (redo ptr) is the same or earlier than the
8512	* physical position. When we replay WAL we locate the checkpoint via its
8513	* physical position then read the redo ptr and actually start replay at the
8514	* earlier logical position. Note that we don't write anything to WAL at
8515	* the logical position, so that location could be any other kind of WAL record.
8516	* All of this mechanism allows us to continue working while we checkpoint.
8517	* As a result, timing of actions is critical here and be careful to note that
8518	* this function will likely take minutes to execute on a busy system.
8519	*/
8520	void
8521	CreateCheckPoint(int flags)
8522	{
8523	bool shutdown;
8524	CheckPoint checkPoint;
8525	XLogRecPtr recptr;
8526	XLogSegNo _logSegNo;
8527	XLogCtlInsert *Insert = &XLogCtl->Insert;
8528	uint32 freespace;
8529	XLogRecPtr PriorRedoPtr;
8530	XLogRecPtr curInsert;
8531	XLogRecPtr last_important_lsn;
8532	VirtualTransactionId *vxids;
8533	int nvxids;
8534
8535	/*
8536	* An end-of-recovery checkpoint is really a shutdown checkpoint, just
8537	* issued at a different time.
8538	*/
8539	if (flags & (CHECKPOINT_IS_SHUTDOWN \| CHECKPOINT_END_OF_RECOVERY))
8540	shutdown = true;
8541	else
8542	shutdown = false;
8543
8544	/ sanity check /
8545	if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == `0`)
8546	elog(ERROR, "can't create a checkpoint during recovery");
8547
8548	/*
8549	* Initialize InitXLogInsert working areas before entering the critical
8550	* section. Normally, this is done by the first call to
8551	* RecoveryInProgress() or LocalSetXLogInsertAllowed(), but when creating
8552	* an end-of-recovery checkpoint, the LocalSetXLogInsertAllowed call is
8553	* done below in a critical section, and InitXLogInsert cannot be called
8554	* in a critical section.
8555	*/
8556	InitXLogInsert();
8557
8558	/*
8559	* Acquire CheckpointLock to ensure only one checkpoint happens at a time.
8560	* (This is just pro forma, since in the present system structure there is
8561	* only one process that is allowed to issue checkpoints at any given
8562	* time.)
8563	*/
8564	LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
8565
8566	/*
8567	* Prepare to accumulate statistics.
8568	*
8569	* Note: because it is possible for log_checkpoints to change while a
8570	* checkpoint proceeds, we always accumulate stats, even if
8571	* log_checkpoints is currently off.
8572	*/
8573	MemSet(&CheckpointStats, `0`, sizeof(CheckpointStats));
8574	CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
8575
8576	/*
8577	* Use a critical section to force system panic if we have trouble.
8578	*/
8579	START_CRIT_SECTION();
8580
8581	if (shutdown)
8582	{
8583	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8584	ControlFile->state = DB_SHUTDOWNING;
8585	ControlFile->time = (pg_time_t) time(NULL);
8586	UpdateControlFile();
8587	LWLockRelease(ControlFileLock);
8588	}
8589
8590	/*
8591	* Let smgr prepare for checkpoint; this has to happen before we determine
8592	* the REDO pointer. Note that smgr must not do anything that'd have to
8593	* be undone if we decide no checkpoint is needed.
8594	*/
8595	SyncPreCheckpoint();
8596
8597	/ Begin filling in the checkpoint WAL record /
8598	MemSet(&checkPoint, `0`, sizeof(checkPoint));
8599	checkPoint.time = (pg_time_t) time(NULL);
8600
8601	/*
8602	* For Hot Standby, derive the oldestActiveXid before we fix the redo
8603	* pointer. This allows us to begin accumulating changes to assemble our
8604	* starting snapshot of locks and transactions.
8605	*/
8606	if (!shutdown && XLogStandbyInfoActive())
8607	checkPoint.oldestActiveXid = GetOldestActiveTransactionId();
8608	else
8609	checkPoint.oldestActiveXid = InvalidTransactionId;
8610
8611	/*
8612	* Get location of last important record before acquiring insert locks (as
8613	* GetLastImportantRecPtr() also locks WAL locks).
8614	*/
8615	last_important_lsn = GetLastImportantRecPtr();
8616
8617	/*
8618	* We must block concurrent insertions while examining insert state to
8619	* determine the checkpoint REDO pointer.
8620	*/
8621	WALInsertLockAcquireExclusive();
8622	curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
8623
8624	/*
8625	* If this isn't a shutdown or forced checkpoint, and if there has been no
8626	* WAL activity requiring a checkpoint, skip it. The idea here is to
8627	* avoid inserting duplicate checkpoints when the system is idle.
8628	*/
8629	if ((flags & (CHECKPOINT_IS_SHUTDOWN \| CHECKPOINT_END_OF_RECOVERY \|
8630	CHECKPOINT_FORCE)) == `0`)
8631	{
8632	if (last_important_lsn == ControlFile->checkPoint)
8633	{
8634	WALInsertLockRelease();
8635	LWLockRelease(CheckpointLock);
8636	END_CRIT_SECTION();
8637	ereport(DEBUG1,
8638	(errmsg("checkpoint skipped because system is idle")));
8639	return;
8640	}
8641	}
8642
8643	/*
8644	* An end-of-recovery checkpoint is created before anyone is allowed to
8645	* write WAL. To allow us to write the checkpoint record, temporarily
8646	* enable XLogInsertAllowed. (This also ensures ThisTimeLineID is
8647	* initialized, which we need here and in AdvanceXLInsertBuffer.)
8648	*/
8649	if (flags & CHECKPOINT_END_OF_RECOVERY)
8650	LocalSetXLogInsertAllowed();
8651
8652	checkPoint.ThisTimeLineID = ThisTimeLineID;
8653	if (flags & CHECKPOINT_END_OF_RECOVERY)
8654	checkPoint.PrevTimeLineID = XLogCtl->PrevTimeLineID;
8655	else
8656	checkPoint.PrevTimeLineID = ThisTimeLineID;
8657
8658	checkPoint.fullPageWrites = Insert->fullPageWrites;
8659
8660	/*
8661	* Compute new REDO record ptr = location of next XLOG record.
8662	*
8663	* NB: this is NOT necessarily where the checkpoint record itself will be,
8664	* since other backends may insert more XLOG records while we're off doing
8665	* the buffer flush work. Those XLOG records are logically after the
8666	* checkpoint, even though physically before it. Got that?
8667	*/
8668	freespace = INSERT_FREESPACE(curInsert);
8669	if (freespace == `0`)
8670	{
8671	if (XLogSegmentOffset(curInsert, wal_segment_size) == `0`)
8672	curInsert += SizeOfXLogLongPHD;
8673	else
8674	curInsert += SizeOfXLogShortPHD;
8675	}
8676	checkPoint.redo = curInsert;
8677
8678	/*
8679	* Here we update the shared RedoRecPtr for future XLogInsert calls; this
8680	* must be done while holding all the insertion locks.
8681	*
8682	* Note: if we fail to complete the checkpoint, RedoRecPtr will be left
8683	* pointing past where it really needs to point. This is okay; the only
8684	* consequence is that XLogInsert might back up whole buffers that it
8685	* didn't really need to. We can't postpone advancing RedoRecPtr because
8686	* XLogInserts that happen while we are dumping buffers must assume that
8687	* their buffer changes are not included in the checkpoint.
8688	*/
8689	RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
8690
8691	/*
8692	* Now we can release the WAL insertion locks, allowing other xacts to
8693	* proceed while we are flushing disk buffers.
8694	*/
8695	WALInsertLockRelease();
8696
8697	/ Update the info_lck-protected copy of RedoRecPtr as well /
8698	SpinLockAcquire(&XLogCtl->info_lck);
8699	XLogCtl->RedoRecPtr = checkPoint.redo;
8700	SpinLockRelease(&XLogCtl->info_lck);
8701
8702	/*
8703	* If enabled, log checkpoint start. We postpone this until now so as not
8704	* to log anything if we decided to skip the checkpoint.
8705	*/
8706	if (log_checkpoints)
8707	LogCheckpointStart(flags, false);
8708
8709	TRACE_POSTGRESQL_CHECKPOINT_START(flags);
8710
8711	/*
8712	* Get the other info we need for the checkpoint record.
8713	*
8714	* We don't need to save oldestClogXid in the checkpoint, it only matters
8715	* for the short period in which clog is being truncated, and if we crash
8716	* during that we'll redo the clog truncation and fix up oldestClogXid
8717	* there.
8718	*/
8719	LWLockAcquire(XidGenLock, LW_SHARED);
8720	checkPoint.nextFullXid = ShmemVariableCache->nextFullXid;
8721	checkPoint.oldestXid = ShmemVariableCache->oldestXid;
8722	checkPoint.oldestXidDB = ShmemVariableCache->oldestXidDB;
8723	LWLockRelease(XidGenLock);
8724
8725	LWLockAcquire(CommitTsLock, LW_SHARED);
8726	checkPoint.oldestCommitTsXid = ShmemVariableCache->oldestCommitTsXid;
8727	checkPoint.newestCommitTsXid = ShmemVariableCache->newestCommitTsXid;
8728	LWLockRelease(CommitTsLock);
8729
8730	LWLockAcquire(OidGenLock, LW_SHARED);
8731	checkPoint.nextOid = ShmemVariableCache->nextOid;
8732	if (!shutdown)
8733	checkPoint.nextOid += ShmemVariableCache->oidCount;
8734	LWLockRelease(OidGenLock);
8735
8736	MultiXactGetCheckptMulti(shutdown,
8737	&checkPoint.nextMulti,
8738	&checkPoint.nextMultiOffset,
8739	&checkPoint.oldestMulti,
8740	&checkPoint.oldestMultiDB);
8741
8742	/*
8743	* Having constructed the checkpoint record, ensure all shmem disk buffers
8744	* and commit-log buffers are flushed to disk.
8745	*
8746	* This I/O could fail for various reasons. If so, we will fail to
8747	* complete the checkpoint, but there is no reason to force a system
8748	* panic. Accordingly, exit critical section while doing it.
8749	*/
8750	END_CRIT_SECTION();
8751
8752	/*
8753	* In some cases there are groups of actions that must all occur on one
8754	* side or the other of a checkpoint record. Before flushing the
8755	* checkpoint record we must explicitly wait for any backend currently
8756	* performing those groups of actions.
8757	*
8758	* One example is end of transaction, so we must wait for any transactions
8759	* that are currently in commit critical sections. If an xact inserted
8760	* its commit record into XLOG just before the REDO point, then a crash
8761	* restart from the REDO point would not replay that record, which means
8762	* that our flushing had better include the xact's update of pg_xact. So
8763	* we wait till he's out of his commit critical section before proceeding.
8764	* See notes in RecordTransactionCommit().
8765	*
8766	* Because we've already released the insertion locks, this test is a bit
8767	* fuzzy: it is possible that we will wait for xacts we didn't really need
8768	* to wait for. But the delay should be short and it seems better to make
8769	* checkpoint take a bit longer than to hold off insertions longer than
8770	* necessary. (In fact, the whole reason we have this issue is that xact.c
8771	* does commit record XLOG insertion and clog update as two separate steps
8772	* protected by different locks, but again that seems best on grounds of
8773	* minimizing lock contention.)
8774	*
8775	* A transaction that has not yet set delayChkpt when we look cannot be at
8776	* risk, since he's not inserted his commit record yet; and one that's
8777	* already cleared it is not at risk either, since he's done fixing clog
8778	* and we will correctly flush the update below. So we cannot miss any
8779	* xacts we need to wait for.
8780	*/
8781	vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
8782	if (nvxids > `0`)
8783	{
8784	do
8785	{
8786	pg_usleep(`10000L`); / wait for 10 msec /
8787	} while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
8788	}
8789	pfree(vxids);
8790
8791	CheckPointGuts(checkPoint.redo, flags);
8792
8793	/*
8794	* Take a snapshot of running transactions and write this to WAL. This
8795	* allows us to reconstruct the state of running transactions during
8796	* archive recovery, if required. Skip, if this info disabled.
8797	*
8798	* If we are shutting down, or Startup process is completing crash
8799	* recovery we don't need to write running xact data.
8800	*/
8801	if (!shutdown && XLogStandbyInfoActive())
8802	LogStandbySnapshot();
8803
8804	START_CRIT_SECTION();
8805
8806	/*
8807	* Now insert the checkpoint record into XLOG.
8808	*/
8809	XLogBeginInsert();
8810	XLogRegisterData((char ) (&checkPoint), sizeof*(checkPoint));
8811	recptr = XLogInsert(RM_XLOG_ID,
8812	shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
8813	XLOG_CHECKPOINT_ONLINE);
8814
8815	XLogFlush(recptr);
8816
8817	/*
8818	* We mustn't write any new WAL after a shutdown checkpoint, or it will be
8819	* overwritten at next startup. No-one should even try, this just allows
8820	* sanity-checking. In the case of an end-of-recovery checkpoint, we want
8821	* to just temporarily disable writing until the system has exited
8822	* recovery.
8823	*/
8824	if (shutdown)
8825	{
8826	if (flags & CHECKPOINT_END_OF_RECOVERY)
8827	LocalXLogInsertAllowed = -`1`; / return to "check" state /
8828	else
8829	LocalXLogInsertAllowed = `0`; / never again write WAL /
8830	}
8831
8832	/*
8833	* We now have ProcLastRecPtr = start of actual checkpoint record, recptr
8834	* = end of actual checkpoint record.
8835	*/
8836	if (shutdown && checkPoint.redo != ProcLastRecPtr)
8837	ereport(PANIC,
8838	(errmsg("concurrent write-ahead log activity while database system is shutting down")));
8839
8840	/*
8841	* Remember the prior checkpoint's redo ptr for
8842	* UpdateCheckPointDistanceEstimate()
8843	*/
8844	PriorRedoPtr = ControlFile->checkPointCopy.redo;
8845
8846	/*
8847	* Update the control file.
8848	*/
8849	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8850	if (shutdown)
8851	ControlFile->state = DB_SHUTDOWNED;
8852	ControlFile->checkPoint = ProcLastRecPtr;
8853	ControlFile->checkPointCopy = checkPoint;
8854	ControlFile->time = (pg_time_t) time(NULL);
8855	/ crash recovery should always recover to the end of WAL /
8856	ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
8857	ControlFile->minRecoveryPointTLI = `0`;
8858
8859	/*
8860	* Persist unloggedLSN value. It's reset on crash recovery, so this goes
8861	* unused on non-shutdown checkpoints, but seems useful to store it always
8862	* for debugging purposes.
8863	*/
8864	SpinLockAcquire(&XLogCtl->ulsn_lck);
8865	ControlFile->unloggedLSN = XLogCtl->unloggedLSN;
8866	SpinLockRelease(&XLogCtl->ulsn_lck);
8867
8868	UpdateControlFile();
8869	LWLockRelease(ControlFileLock);
8870
8871	/ Update shared-memory copy of checkpoint XID/epoch /
8872	SpinLockAcquire(&XLogCtl->info_lck);
8873	XLogCtl->ckptFullXid = checkPoint.nextFullXid;
8874	SpinLockRelease(&XLogCtl->info_lck);
8875
8876	/*
8877	* We are now done with critical updates; no need for system panic if we
8878	* have trouble while fooling with old log segments.
8879	*/
8880	END_CRIT_SECTION();
8881
8882	/*
8883	* Let smgr do post-checkpoint cleanup (eg, deleting old files).
8884	*/
8885	SyncPostCheckpoint();
8886
8887	/*
8888	* Update the average distance between checkpoints if the prior checkpoint
8889	* exists.
8890	*/
8891	if (PriorRedoPtr != InvalidXLogRecPtr)
8892	UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
8893
8894	/*
8895	* Delete old log files, those no longer needed for last checkpoint to
8896	* prevent the disk holding the xlog from growing full.
8897	*/
8898	XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
8899	KeepLogSeg(recptr, &_logSegNo);
8900	_logSegNo--;
8901	RemoveOldXlogFiles(_logSegNo, RedoRecPtr, recptr);
8902
8903	/*
8904	* Make more log segments if needed. (Do this after recycling old log
8905	* segments, since that may supply some of the needed files.)
8906	*/
8907	if (!shutdown)
8908	PreallocXlogFiles(recptr);
8909
8910	/*
8911	* Truncate pg_subtrans if possible. We can throw away all data before
8912	* the oldest XMIN of any running transaction. No future transaction will
8913	* attempt to reference any pg_subtrans entry older than that (see Asserts
8914	* in subtrans.c). During recovery, though, we mustn't do this because
8915	* StartupSUBTRANS hasn't been called yet.
8916	*/
8917	if (!RecoveryInProgress())
8918	TruncateSUBTRANS(GetOldestXmin(NULL, PROCARRAY_FLAGS_DEFAULT));
8919
8920	/ Real work is done, but log and update stats before releasing lock. /
8921	LogCheckpointEnd(false);
8922
8923	TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
8924	NBuffers,
8925	CheckpointStats.ckpt_segs_added,
8926	CheckpointStats.ckpt_segs_removed,
8927	CheckpointStats.ckpt_segs_recycled);
8928
8929	LWLockRelease(CheckpointLock);
8930	}
8931
8932	/*
8933	* Mark the end of recovery in WAL though without running a full checkpoint.
8934	* We can expect that a restartpoint is likely to be in progress as we
8935	* do this, though we are unwilling to wait for it to complete. So be
8936	* careful to avoid taking the CheckpointLock anywhere here.
8937	*
8938	* CreateRestartPoint() allows for the case where recovery may end before
8939	* the restartpoint completes so there is no concern of concurrent behaviour.
8940	*/
8941	static void
8942	CreateEndOfRecoveryRecord(void)
8943	{
8944	xl_end_of_recovery xlrec;
8945	XLogRecPtr recptr;
8946
8947	/ sanity check /
8948	if (!RecoveryInProgress())
8949	elog(ERROR, "can only be used to end recovery");
8950
8951	xlrec.end_time = GetCurrentTimestamp();
8952
8953	WALInsertLockAcquireExclusive();
8954	xlrec.ThisTimeLineID = ThisTimeLineID;
8955	xlrec.PrevTimeLineID = XLogCtl->PrevTimeLineID;
8956	WALInsertLockRelease();
8957
8958	LocalSetXLogInsertAllowed();
8959
8960	START_CRIT_SECTION();
8961
8962	XLogBeginInsert();
8963	XLogRegisterData((char ) &xlrec, sizeof*(xl_end_of_recovery));
8964	recptr = XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY);
8965
8966	XLogFlush(recptr);
8967
8968	/*
8969	* Update the control file so that crash recovery can follow the timeline
8970	* changes to this point.
8971	*/
8972	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8973	ControlFile->time = (pg_time_t) time(NULL);
8974	ControlFile->minRecoveryPoint = recptr;
8975	ControlFile->minRecoveryPointTLI = ThisTimeLineID;
8976	UpdateControlFile();
8977	LWLockRelease(ControlFileLock);
8978
8979	END_CRIT_SECTION();
8980
8981	LocalXLogInsertAllowed = -`1`; / return to "check" state /
8982	}
8983
8984	/*
8985	* Flush all data in shared memory to disk, and fsync
8986	*
8987	* This is the common code shared between regular checkpoints and
8988	* recovery restartpoints.
8989	*/
8990	static void
8991	CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
8992	{
8993	CheckPointCLOG();
8994	CheckPointCommitTs();
8995	CheckPointSUBTRANS();
8996	CheckPointMultiXact();
8997	CheckPointPredicate();
8998	CheckPointRelationMap();
8999	CheckPointReplicationSlots();
9000	CheckPointSnapBuild();
9001	CheckPointLogicalRewriteHeap();
9002	CheckPointBuffers(flags); / performs all required fsyncs /
9003	CheckPointReplicationOrigin();
9004	/ We deliberately delay 2PC checkpointing as long as possible /
9005	CheckPointTwoPhase(checkPointRedo);
9006	}
9007
9008	/*
9009	* Save a checkpoint for recovery restart if appropriate
9010	*
9011	* This function is called each time a checkpoint record is read from XLOG.
9012	* It must determine whether the checkpoint represents a safe restartpoint or
9013	* not. If so, the checkpoint record is stashed in shared memory so that
9014	* CreateRestartPoint can consult it. (Note that the latter function is
9015	* executed by the checkpointer, while this one will be executed by the
9016	* startup process.)
9017	*/
9018	static void
9019	RecoveryRestartPoint(const CheckPoint *checkPoint)
9020	{
9021	/*
9022	* Also refrain from creating a restartpoint if we have seen any
9023	* references to non-existent pages. Restarting recovery from the
9024	* restartpoint would not see the references, so we would lose the
9025	* cross-check that the pages belonged to a relation that was dropped
9026	* later.
9027	*/
9028	if (XLogHaveInvalidPages())
9029	{
9030	elog(trace_recovery(DEBUG2),
9031	"could not record restart point at %X/%X because there "
9032	"are unresolved references to invalid pages",
9033	(uint32) (checkPoint->redo >> `32`),
9034	(uint32) checkPoint->redo);
9035	return;
9036	}
9037
9038	/*
9039	* Copy the checkpoint record to shared memory, so that checkpointer can
9040	* work out the next time it wants to perform a restartpoint.
9041	*/
9042	SpinLockAcquire(&XLogCtl->info_lck);
9043	XLogCtl->lastCheckPointRecPtr = ReadRecPtr;
9044	XLogCtl->lastCheckPointEndPtr = EndRecPtr;
9045	XLogCtl->lastCheckPoint = *checkPoint;
9046	SpinLockRelease(&XLogCtl->info_lck);
9047	}
9048
9049	/*
9050	* Establish a restartpoint if possible.
9051	*
9052	* This is similar to CreateCheckPoint, but is used during WAL recovery
9053	* to establish a point from which recovery can roll forward without
9054	* replaying the entire recovery log.
9055	*
9056	* Returns true if a new restartpoint was established. We can only establish
9057	* a restartpoint if we have replayed a safe checkpoint record since last
9058	* restartpoint.
9059	*/
9060	bool
9061	CreateRestartPoint(int flags)
9062	{
9063	XLogRecPtr lastCheckPointRecPtr;
9064	XLogRecPtr lastCheckPointEndPtr;
9065	CheckPoint lastCheckPoint;
9066	XLogRecPtr PriorRedoPtr;
9067	XLogRecPtr receivePtr;
9068	XLogRecPtr replayPtr;
9069	TimeLineID replayTLI;
9070	XLogRecPtr endptr;
9071	XLogSegNo _logSegNo;
9072	TimestampTz xtime;
9073
9074	/*
9075	* Acquire CheckpointLock to ensure only one restartpoint or checkpoint
9076	* happens at a time.
9077	*/
9078	LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
9079
9080	/ Get a local copy of the last safe checkpoint record. /
9081	SpinLockAcquire(&XLogCtl->info_lck);
9082	lastCheckPointRecPtr = XLogCtl->lastCheckPointRecPtr;
9083	lastCheckPointEndPtr = XLogCtl->lastCheckPointEndPtr;
9084	lastCheckPoint = XLogCtl->lastCheckPoint;
9085	SpinLockRelease(&XLogCtl->info_lck);
9086
9087	/*
9088	* Check that we're still in recovery mode. It's ok if we exit recovery
9089	* mode after this check, the restart point is valid anyway.
9090	*/
9091	if (!RecoveryInProgress())
9092	{
9093	ereport(DEBUG2,
9094	(errmsg("skipping restartpoint, recovery has already ended")));
9095	LWLockRelease(CheckpointLock);
9096	return false;
9097	}
9098
9099	/*
9100	* If the last checkpoint record we've replayed is already our last
9101	* restartpoint, we can't perform a new restart point. We still update
9102	* minRecoveryPoint in that case, so that if this is a shutdown restart
9103	* point, we won't start up earlier than before. That's not strictly
9104	* necessary, but when hot standby is enabled, it would be rather weird if
9105	* the database opened up for read-only connections at a point-in-time
9106	* before the last shutdown. Such time travel is still possible in case of
9107	* immediate shutdown, though.
9108	*
9109	* We don't explicitly advance minRecoveryPoint when we do create a
9110	* restartpoint. It's assumed that flushing the buffers will do that as a
9111	* side-effect.
9112	*/
9113	if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) \|\|
9114	lastCheckPoint.redo <= ControlFile->checkPointCopy.redo)
9115	{
9116	ereport(DEBUG2,
9117	(errmsg("skipping restartpoint, already performed at %X/%X",
9118	(uint32) (lastCheckPoint.redo >> `32`),
9119	(uint32) lastCheckPoint.redo)));
9120
9121	UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
9122	if (flags & CHECKPOINT_IS_SHUTDOWN)
9123	{
9124	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9125	ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
9126	ControlFile->time = (pg_time_t) time(NULL);
9127	UpdateControlFile();
9128	LWLockRelease(ControlFileLock);
9129	}
9130	LWLockRelease(CheckpointLock);
9131	return false;
9132	}
9133
9134	/*
9135	* Update the shared RedoRecPtr so that the startup process can calculate
9136	* the number of segments replayed since last restartpoint, and request a
9137	* restartpoint if it exceeds CheckPointSegments.
9138	*
9139	* Like in CreateCheckPoint(), hold off insertions to update it, although
9140	* during recovery this is just pro forma, because no WAL insertions are
9141	* happening.
9142	*/
9143	WALInsertLockAcquireExclusive();
9144	RedoRecPtr = XLogCtl->Insert.RedoRecPtr = lastCheckPoint.redo;
9145	WALInsertLockRelease();
9146
9147	/ Also update the info_lck-protected copy /
9148	SpinLockAcquire(&XLogCtl->info_lck);
9149	XLogCtl->RedoRecPtr = lastCheckPoint.redo;
9150	SpinLockRelease(&XLogCtl->info_lck);
9151
9152	/*
9153	* Prepare to accumulate statistics.
9154	*
9155	* Note: because it is possible for log_checkpoints to change while a
9156	* checkpoint proceeds, we always accumulate stats, even if
9157	* log_checkpoints is currently off.
9158	*/
9159	MemSet(&CheckpointStats, `0`, sizeof(CheckpointStats));
9160	CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
9161
9162	if (log_checkpoints)
9163	LogCheckpointStart(flags, true);
9164
9165	CheckPointGuts(lastCheckPoint.redo, flags);
9166
9167	/*
9168	* Remember the prior checkpoint's redo ptr for
9169	* UpdateCheckPointDistanceEstimate()
9170	*/
9171	PriorRedoPtr = ControlFile->checkPointCopy.redo;
9172
9173	/*
9174	* Update pg_control, using current time. Check that it still shows
9175	* IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing;
9176	* this is a quick hack to make sure nothing really bad happens if somehow
9177	* we get here after the end-of-recovery checkpoint.
9178	*/
9179	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9180	if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
9181	ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
9182	{
9183	ControlFile->checkPoint = lastCheckPointRecPtr;
9184	ControlFile->checkPointCopy = lastCheckPoint;
9185	ControlFile->time = (pg_time_t) time(NULL);
9186
9187	/*
9188	* Ensure minRecoveryPoint is past the checkpoint record. Normally,
9189	* this will have happened already while writing out dirty buffers,
9190	* but not necessarily - e.g. because no buffers were dirtied. We do
9191	* this because a non-exclusive base backup uses minRecoveryPoint to
9192	* determine which WAL files must be included in the backup, and the
9193	* file (or files) containing the checkpoint record must be included,
9194	* at a minimum. Note that for an ordinary restart of recovery there's
9195	* no value in having the minimum recovery point any earlier than this
9196	* anyway, because redo will begin just after the checkpoint record.
9197	*/
9198	if (ControlFile->minRecoveryPoint < lastCheckPointEndPtr)
9199	{
9200	ControlFile->minRecoveryPoint = lastCheckPointEndPtr;
9201	ControlFile->minRecoveryPointTLI = lastCheckPoint.ThisTimeLineID;
9202
9203	/ update local copy /
9204	minRecoveryPoint = ControlFile->minRecoveryPoint;
9205	minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
9206	}
9207	if (flags & CHECKPOINT_IS_SHUTDOWN)
9208	ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
9209	UpdateControlFile();
9210	}
9211	LWLockRelease(ControlFileLock);
9212
9213	/*
9214	* Update the average distance between checkpoints/restartpoints if the
9215	* prior checkpoint exists.
9216	*/
9217	if (PriorRedoPtr != InvalidXLogRecPtr)
9218	UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
9219
9220	/*
9221	* Delete old log files, those no longer needed for last restartpoint to
9222	* prevent the disk holding the xlog from growing full.
9223	*/
9224	XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
9225
9226	/*
9227	* Retreat _logSegNo using the current end of xlog replayed or received,
9228	* whichever is later.
9229	*/
9230	receivePtr = GetWalRcvWriteRecPtr(NULL, NULL);
9231	replayPtr = GetXLogReplayRecPtr(&replayTLI);
9232	endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr;
9233	KeepLogSeg(endptr, &_logSegNo);
9234	_logSegNo--;
9235
9236	/*
9237	* Try to recycle segments on a useful timeline. If we've been promoted
9238	* since the beginning of this restartpoint, use the new timeline chosen
9239	* at end of recovery (RecoveryInProgress() sets ThisTimeLineID in that
9240	* case). If we're still in recovery, use the timeline we're currently
9241	* replaying.
9242	*
9243	* There is no guarantee that the WAL segments will be useful on the
9244	* current timeline; if recovery proceeds to a new timeline right after
9245	* this, the pre-allocated WAL segments on this timeline will not be used,
9246	* and will go wasted until recycled on the next restartpoint. We'll live
9247	* with that.
9248	*/
9249	if (RecoveryInProgress())
9250	ThisTimeLineID = replayTLI;
9251
9252	RemoveOldXlogFiles(_logSegNo, RedoRecPtr, endptr);
9253
9254	/*
9255	* Make more log segments if needed. (Do this after recycling old log
9256	* segments, since that may supply some of the needed files.)
9257	*/
9258	PreallocXlogFiles(endptr);
9259
9260	/*
9261	* ThisTimeLineID is normally not set when we're still in recovery.
9262	* However, recycling/preallocating segments above needed ThisTimeLineID
9263	* to determine which timeline to install the segments on. Reset it now,
9264	* to restore the normal state of affairs for debugging purposes.
9265	*/
9266	if (RecoveryInProgress())
9267	ThisTimeLineID = `0`;
9268
9269	/*
9270	* Truncate pg_subtrans if possible. We can throw away all data before
9271	* the oldest XMIN of any running transaction. No future transaction will
9272	* attempt to reference any pg_subtrans entry older than that (see Asserts
9273	* in subtrans.c). When hot standby is disabled, though, we mustn't do
9274	* this because StartupSUBTRANS hasn't been called yet.
9275	*/
9276	if (EnableHotStandby)
9277	TruncateSUBTRANS(GetOldestXmin(NULL, PROCARRAY_FLAGS_DEFAULT));
9278
9279	/ Real work is done, but log and update before releasing lock. /
9280	LogCheckpointEnd(true);
9281
9282	xtime = GetLatestXTime();
9283	ereport((log_checkpoints ? LOG : DEBUG2),
9284	(errmsg("recovery restart point at %X/%X",
9285	(uint32) (lastCheckPoint.redo >> `32`), (uint32) lastCheckPoint.redo),
9286	xtime ? errdetail("Last completed transaction was at log time %s.",
9287	timestamptz_to_str(xtime)) : `0`));
9288
9289	LWLockRelease(CheckpointLock);
9290
9291	/*
9292	* Finally, execute archive_cleanup_command, if any.
9293	*/
9294	if (archiveCleanupCommand && strcmp(archiveCleanupCommand, "") != `0`)
9295	ExecuteRecoveryCommand(archiveCleanupCommand,
9296	"archive_cleanup_command",
9297	false);
9298
9299	return true;
9300	}
9301
9302	/*
9303	* Retreat *logSegNo to the last segment that we need to retain because of
9304	* either wal_keep_segments or replication slots.
9305	*
9306	* This is calculated by subtracting wal_keep_segments from the given xlog
9307	* location, recptr and by making sure that that result is below the
9308	* requirement of replication slots.
9309	*/
9310	static void
9311	KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo)
9312	{
9313	XLogSegNo segno;
9314	XLogRecPtr keep;
9315
9316	XLByteToSeg(recptr, segno, wal_segment_size);
9317	keep = XLogGetReplicationSlotMinimumLSN();
9318
9319	/ compute limit for wal_keep_segments first /
9320	if (wal_keep_segments > `0`)
9321	{
9322	/ avoid underflow, don't go below 1 /
9323	if (segno <= wal_keep_segments)
9324	segno = `1`;
9325	else
9326	segno = segno - wal_keep_segments;
9327	}
9328
9329	/ then check whether slots limit removal further /
9330	if (max_replication_slots > `0` && keep != InvalidXLogRecPtr)
9331	{
9332	XLogSegNo slotSegNo;
9333
9334	XLByteToSeg(keep, slotSegNo, wal_segment_size);
9335
9336	if (slotSegNo <= `0`)
9337	segno = `1`;
9338	else if (slotSegNo < segno)
9339	segno = slotSegNo;
9340	}
9341
9342	/ don't delete WAL segments newer than the calculated segment /
9343	if (segno < *logSegNo)
9344	*logSegNo = segno;
9345	}
9346
9347	/*
9348	* Write a NEXTOID log record
9349	*/
9350	void
9351	XLogPutNextOid(Oid nextOid)
9352	{
9353	XLogBeginInsert();
9354	XLogRegisterData((char ) (&nextOid), sizeof*(Oid));
9355	(void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID);
9356
9357	/*
9358	* We need not flush the NEXTOID record immediately, because any of the
9359	* just-allocated OIDs could only reach disk as part of a tuple insert or
9360	* update that would have its own XLOG record that must follow the NEXTOID
9361	* record. Therefore, the standard buffer LSN interlock applied to those
9362	* records will ensure no such OID reaches disk before the NEXTOID record
9363	* does.
9364	*
9365	* Note, however, that the above statement only covers state "within" the
9366	* database. When we use a generated OID as a file or directory name, we
9367	* are in a sense violating the basic WAL rule, because that filesystem
9368	* change may reach disk before the NEXTOID WAL record does. The impact
9369	* of this is that if a database crash occurs immediately afterward, we
9370	* might after restart re-generate the same OID and find that it conflicts
9371	* with the leftover file or directory. But since for safety's sake we
9372	* always loop until finding a nonconflicting filename, this poses no real
9373	* problem in practice. See pgsql-hackers discussion 27-Sep-2006.
9374	*/
9375	}
9376
9377	/*
9378	* Write an XLOG SWITCH record.
9379	*
9380	* Here we just blindly issue an XLogInsert request for the record.
9381	* All the magic happens inside XLogInsert.
9382	*
9383	* The return value is either the end+1 address of the switch record,
9384	* or the end+1 address of the prior segment if we did not need to
9385	* write a switch record because we are already at segment start.
9386	*/
9387	XLogRecPtr
9388	RequestXLogSwitch(bool mark_unimportant)
9389	{
9390	XLogRecPtr RecPtr;
9391
9392	/ XLOG SWITCH has no data /
9393	XLogBeginInsert();
9394
9395	if (mark_unimportant)
9396	XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
9397	RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH);
9398
9399	return RecPtr;
9400	}
9401
9402	/*
9403	* Write a RESTORE POINT record
9404	*/
9405	XLogRecPtr
9406	XLogRestorePoint(const char *rpName)
9407	{
9408	XLogRecPtr RecPtr;
9409	xl_restore_point xlrec;
9410
9411	xlrec.rp_time = GetCurrentTimestamp();
9412	strlcpy(xlrec.rp_name, rpName, MAXFNAMELEN);
9413
9414	XLogBeginInsert();
9415	XLogRegisterData((char ) &xlrec, sizeof*(xl_restore_point));
9416
9417	RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT);
9418
9419	ereport(LOG,
9420	(errmsg("restore point \"%s\" created at %X/%X",
9421	rpName, (uint32) (RecPtr >> `32`), (uint32) RecPtr)));
9422
9423	return RecPtr;
9424	}
9425
9426	/*
9427	* Check if any of the GUC parameters that are critical for hot standby
9428	* have changed, and update the value in pg_control file if necessary.
9429	*/
9430	static void
9431	XLogReportParameters(void)
9432	{
9433	if (wal_level != ControlFile->wal_level \|\|
9434	wal_log_hints != ControlFile->wal_log_hints \|\|
9435	MaxConnections != ControlFile->MaxConnections \|\|
9436	max_worker_processes != ControlFile->max_worker_processes \|\|
9437	max_wal_senders != ControlFile->max_wal_senders \|\|
9438	max_prepared_xacts != ControlFile->max_prepared_xacts \|\|
9439	max_locks_per_xact != ControlFile->max_locks_per_xact \|\|
9440	track_commit_timestamp != ControlFile->track_commit_timestamp)
9441	{
9442	/*
9443	* The change in number of backend slots doesn't need to be WAL-logged
9444	* if archiving is not enabled, as you can't start archive recovery
9445	* with wal_level=minimal anyway. We don't really care about the
9446	* values in pg_control either if wal_level=minimal, but seems better
9447	* to keep them up-to-date to avoid confusion.
9448	*/
9449	if (wal_level != ControlFile->wal_level \|\| XLogIsNeeded())
9450	{
9451	xl_parameter_change xlrec;
9452	XLogRecPtr recptr;
9453
9454	xlrec.MaxConnections = MaxConnections;
9455	xlrec.max_worker_processes = max_worker_processes;
9456	xlrec.max_wal_senders = max_wal_senders;
9457	xlrec.max_prepared_xacts = max_prepared_xacts;
9458	xlrec.max_locks_per_xact = max_locks_per_xact;
9459	xlrec.wal_level = wal_level;
9460	xlrec.wal_log_hints = wal_log_hints;
9461	xlrec.track_commit_timestamp = track_commit_timestamp;
9462
9463	XLogBeginInsert();
9464	XLogRegisterData((char ) &xlrec, sizeof*(xlrec));
9465
9466	recptr = XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE);
9467	XLogFlush(recptr);
9468	}
9469
9470	ControlFile->MaxConnections = MaxConnections;
9471	ControlFile->max_worker_processes = max_worker_processes;
9472	ControlFile->max_wal_senders = max_wal_senders;
9473	ControlFile->max_prepared_xacts = max_prepared_xacts;
9474	ControlFile->max_locks_per_xact = max_locks_per_xact;
9475	ControlFile->wal_level = wal_level;
9476	ControlFile->wal_log_hints = wal_log_hints;
9477	ControlFile->track_commit_timestamp = track_commit_timestamp;
9478	UpdateControlFile();
9479	}
9480	}
9481
9482	/*
9483	* Update full_page_writes in shared memory, and write an
9484	* XLOG_FPW_CHANGE record if necessary.
9485	*
9486	* Note: this function assumes there is no other process running
9487	* concurrently that could update it.
9488	*/
9489	void
9490	UpdateFullPageWrites(void)
9491	{
9492	XLogCtlInsert *Insert = &XLogCtl->Insert;
9493	bool recoveryInProgress;
9494
9495	/*
9496	* Do nothing if full_page_writes has not been changed.
9497	*
9498	* It's safe to check the shared full_page_writes without the lock,
9499	* because we assume that there is no concurrently running process which
9500	* can update it.
9501	*/
9502	if (fullPageWrites == Insert->fullPageWrites)
9503	return;
9504
9505	/*
9506	* Perform this outside critical section so that the WAL insert
9507	* initialization done by RecoveryInProgress() doesn't trigger an
9508	* assertion failure.
9509	*/
9510	recoveryInProgress = RecoveryInProgress();
9511
9512	START_CRIT_SECTION();
9513
9514	/*
9515	* It's always safe to take full page images, even when not strictly
9516	* required, but not the other round. So if we're setting full_page_writes
9517	* to true, first set it true and then write the WAL record. If we're
9518	* setting it to false, first write the WAL record and then set the global
9519	* flag.
9520	*/
9521	if (fullPageWrites)
9522	{
9523	WALInsertLockAcquireExclusive();
9524	Insert->fullPageWrites = true;
9525	WALInsertLockRelease();
9526	}
9527
9528	/*
9529	* Write an XLOG_FPW_CHANGE record. This allows us to keep track of
9530	* full_page_writes during archive recovery, if required.
9531	*/
9532	if (XLogStandbyInfoActive() && !recoveryInProgress)
9533	{
9534	XLogBeginInsert();
9535	XLogRegisterData((char ) (&fullPageWrites), sizeof*(bool));
9536
9537	XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE);
9538	}
9539
9540	if (!fullPageWrites)
9541	{
9542	WALInsertLockAcquireExclusive();
9543	Insert->fullPageWrites = false;
9544	WALInsertLockRelease();
9545	}
9546	END_CRIT_SECTION();
9547	}
9548
9549	/*
9550	* Check that it's OK to switch to new timeline during recovery.
9551	*
9552	* 'lsn' is the address of the shutdown checkpoint record we're about to
9553	* replay. (Currently, timeline can only change at a shutdown checkpoint).
9554	*/
9555	static void
9556	checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI)
9557	{
9558	/ Check that the record agrees on what the current (old) timeline is /
9559	if (prevTLI != ThisTimeLineID)
9560	ereport(PANIC,
9561	(errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
9562	prevTLI, ThisTimeLineID)));
9563
9564	/*
9565	* The new timeline better be in the list of timelines we expect to see,
9566	* according to the timeline history. It should also not decrease.
9567	*/
9568	if (newTLI < ThisTimeLineID \|\| !tliInHistory(newTLI, expectedTLEs))
9569	ereport(PANIC,
9570	(errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
9571	newTLI, ThisTimeLineID)));
9572
9573	/*
9574	* If we have not yet reached min recovery point, and we're about to
9575	* switch to a timeline greater than the timeline of the min recovery
9576	* point: trouble. After switching to the new timeline, we could not
9577	* possibly visit the min recovery point on the correct timeline anymore.
9578	* This can happen if there is a newer timeline in the archive that
9579	* branched before the timeline the min recovery point is on, and you
9580	* attempt to do PITR to the new timeline.
9581	*/
9582	if (!XLogRecPtrIsInvalid(minRecoveryPoint) &&
9583	lsn < minRecoveryPoint &&
9584	newTLI > minRecoveryPointTLI)
9585	ereport(PANIC,
9586	(errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
9587	newTLI,
9588	(uint32) (minRecoveryPoint >> `32`),
9589	(uint32) minRecoveryPoint,
9590	minRecoveryPointTLI)));
9591
9592	/ Looks good /
9593	}
9594
9595	/*
9596	* XLOG resource manager's routines
9597	*
9598	* Definitions of info values are in include/catalog/pg_control.h, though
9599	* not all record types are related to control file updates.
9600	*/
9601	void
9602	xlog_redo(XLogReaderState *record)
9603	{
9604	uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
9605	XLogRecPtr lsn = record->EndRecPtr;
9606
9607	/ in XLOG rmgr, backup blocks are only used by XLOG_FPI records /
9608	Assert(info == XLOG_FPI \|\| info == XLOG_FPI_FOR_HINT \|\|
9609	!XLogRecHasAnyBlockRefs(record));
9610
9611	if (info == XLOG_NEXTOID)
9612	{
9613	Oid nextOid;
9614
9615	/*
9616	* We used to try to take the maximum of ShmemVariableCache->nextOid
9617	* and the recorded nextOid, but that fails if the OID counter wraps
9618	* around. Since no OID allocation should be happening during replay
9619	* anyway, better to just believe the record exactly. We still take
9620	* OidGenLock while setting the variable, just in case.
9621	*/
9622	memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
9623	LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
9624	ShmemVariableCache->nextOid = nextOid;
9625	ShmemVariableCache->oidCount = `0`;
9626	LWLockRelease(OidGenLock);
9627	}
9628	else if (info == XLOG_CHECKPOINT_SHUTDOWN)
9629	{
9630	CheckPoint checkPoint;
9631
9632	memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
9633	/ In a SHUTDOWN checkpoint, believe the counters exactly /
9634	LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
9635	ShmemVariableCache->nextFullXid = checkPoint.nextFullXid;
9636	LWLockRelease(XidGenLock);
9637	LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
9638	ShmemVariableCache->nextOid = checkPoint.nextOid;
9639	ShmemVariableCache->oidCount = `0`;
9640	LWLockRelease(OidGenLock);
9641	MultiXactSetNextMXact(checkPoint.nextMulti,
9642	checkPoint.nextMultiOffset);
9643
9644	MultiXactAdvanceOldest(checkPoint.oldestMulti,
9645	checkPoint.oldestMultiDB);
9646
9647	/*
9648	* No need to set oldestClogXid here as well; it'll be set when we
9649	* redo an xl_clog_truncate if it changed since initialization.
9650	*/
9651	SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
9652
9653	/*
9654	* If we see a shutdown checkpoint while waiting for an end-of-backup
9655	* record, the backup was canceled and the end-of-backup record will
9656	* never arrive.
9657	*/
9658	if (ArchiveRecoveryRequested &&
9659	!XLogRecPtrIsInvalid(ControlFile->backupStartPoint) &&
9660	XLogRecPtrIsInvalid(ControlFile->backupEndPoint))
9661	ereport(PANIC,
9662	(errmsg("online backup was canceled, recovery cannot continue")));
9663
9664	/*
9665	* If we see a shutdown checkpoint, we know that nothing was running
9666	* on the master at this point. So fake-up an empty running-xacts
9667	* record and use that here and now. Recover additional standby state
9668	* for prepared transactions.
9669	*/
9670	if (standbyState >= STANDBY_INITIALIZED)
9671	{
9672	TransactionId *xids;
9673	int nxids;
9674	TransactionId oldestActiveXID;
9675	TransactionId latestCompletedXid;
9676	RunningTransactionsData running;
9677
9678	oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
9679
9680	/*
9681	* Construct a RunningTransactions snapshot representing a shut
9682	* down server, with only prepared transactions still alive. We're
9683	* never overflowed at this point because all subxids are listed
9684	* with their parent prepared transactions.
9685	*/
9686	running.xcnt = nxids;
9687	running.subxcnt = `0`;
9688	running.subxid_overflow = false;
9689	running.nextXid = XidFromFullTransactionId(checkPoint.nextFullXid);
9690	running.oldestRunningXid = oldestActiveXID;
9691	latestCompletedXid = XidFromFullTransactionId(checkPoint.nextFullXid);
9692	TransactionIdRetreat(latestCompletedXid);
9693	Assert(TransactionIdIsNormal(latestCompletedXid));
9694	running.latestCompletedXid = latestCompletedXid;
9695	running.xids = xids;
9696
9697	ProcArrayApplyRecoveryInfo(&running);
9698
9699	StandbyRecoverPreparedTransactions();
9700	}
9701
9702	/ ControlFile->checkPointCopy always tracks the latest ckpt XID /
9703	ControlFile->checkPointCopy.nextFullXid = checkPoint.nextFullXid;
9704
9705	/ Update shared-memory copy of checkpoint XID/epoch /
9706	SpinLockAcquire(&XLogCtl->info_lck);
9707	XLogCtl->ckptFullXid = checkPoint.nextFullXid;
9708	SpinLockRelease(&XLogCtl->info_lck);
9709
9710	/*
9711	* We should've already switched to the new TLI before replaying this
9712	* record.
9713	*/
9714	if (checkPoint.ThisTimeLineID != ThisTimeLineID)
9715	ereport(PANIC,
9716	(errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
9717	checkPoint.ThisTimeLineID, ThisTimeLineID)));
9718
9719	RecoveryRestartPoint(&checkPoint);
9720	}
9721	else if (info == XLOG_CHECKPOINT_ONLINE)
9722	{
9723	CheckPoint checkPoint;
9724
9725	memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
9726	/ In an ONLINE checkpoint, treat the XID counter as a minimum /
9727	LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
9728	if (FullTransactionIdPrecedes(ShmemVariableCache->nextFullXid,
9729	checkPoint.nextFullXid))
9730	ShmemVariableCache->nextFullXid = checkPoint.nextFullXid;
9731	LWLockRelease(XidGenLock);
9732
9733	/*
9734	* We ignore the nextOid counter in an ONLINE checkpoint, preferring
9735	* to track OID assignment through XLOG_NEXTOID records. The nextOid
9736	* counter is from the start of the checkpoint and might well be stale
9737	* compared to later XLOG_NEXTOID records. We could try to take the
9738	* maximum of the nextOid counter and our latest value, but since
9739	* there's no particular guarantee about the speed with which the OID
9740	* counter wraps around, that's a risky thing to do. In any case,
9741	* users of the nextOid counter are required to avoid assignment of
9742	* duplicates, so that a somewhat out-of-date value should be safe.
9743	*/
9744
9745	/ Handle multixact /
9746	MultiXactAdvanceNextMXact(checkPoint.nextMulti,
9747	checkPoint.nextMultiOffset);
9748
9749	/*
9750	* NB: This may perform multixact truncation when replaying WAL
9751	* generated by an older primary.
9752	*/
9753	MultiXactAdvanceOldest(checkPoint.oldestMulti,
9754	checkPoint.oldestMultiDB);
9755	if (TransactionIdPrecedes(ShmemVariableCache->oldestXid,
9756	checkPoint.oldestXid))
9757	SetTransactionIdLimit(checkPoint.oldestXid,
9758	checkPoint.oldestXidDB);
9759	/ ControlFile->checkPointCopy always tracks the latest ckpt XID /
9760	ControlFile->checkPointCopy.nextFullXid = checkPoint.nextFullXid;
9761
9762	/ Update shared-memory copy of checkpoint XID/epoch /
9763	SpinLockAcquire(&XLogCtl->info_lck);
9764	XLogCtl->ckptFullXid = checkPoint.nextFullXid;
9765	SpinLockRelease(&XLogCtl->info_lck);
9766
9767	/ TLI should not change in an on-line checkpoint /
9768	if (checkPoint.ThisTimeLineID != ThisTimeLineID)
9769	ereport(PANIC,
9770	(errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
9771	checkPoint.ThisTimeLineID, ThisTimeLineID)));
9772
9773	RecoveryRestartPoint(&checkPoint);
9774	}
9775	else if (info == XLOG_END_OF_RECOVERY)
9776	{
9777	xl_end_of_recovery xlrec;
9778
9779	memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
9780
9781	/*
9782	* For Hot Standby, we could treat this like a Shutdown Checkpoint,
9783	* but this case is rarer and harder to test, so the benefit doesn't
9784	* outweigh the potential extra cost of maintenance.
9785	*/
9786
9787	/*
9788	* We should've already switched to the new TLI before replaying this
9789	* record.
9790	*/
9791	if (xlrec.ThisTimeLineID != ThisTimeLineID)
9792	ereport(PANIC,
9793	(errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
9794	xlrec.ThisTimeLineID, ThisTimeLineID)));
9795	}
9796	else if (info == XLOG_NOOP)
9797	{
9798	/ nothing to do here /
9799	}
9800	else if (info == XLOG_SWITCH)
9801	{
9802	/ nothing to do here /
9803	}
9804	else if (info == XLOG_RESTORE_POINT)
9805	{
9806	/ nothing to do here /
9807	}
9808	else if (info == XLOG_FPI \|\| info == XLOG_FPI_FOR_HINT)
9809	{
9810	/*
9811	* Full-page image (FPI) records contain nothing else but a backup
9812	* block (or multiple backup blocks). Every block reference must
9813	* include a full-page image - otherwise there would be no point in
9814	* this record.
9815	*
9816	* No recovery conflicts are generated by these generic records - if a
9817	* resource manager needs to generate conflicts, it has to define a
9818	* separate WAL record type and redo routine.
9819	*
9820	* XLOG_FPI_FOR_HINT records are generated when a page needs to be
9821	* WAL- logged because of a hint bit update. They are only generated
9822	* when checksums are enabled. There is no difference in handling
9823	* XLOG_FPI and XLOG_FPI_FOR_HINT records, they use a different info
9824	* code just to distinguish them for statistics purposes.
9825	*/
9826	for (uint8 block_id = `0`; block_id <= record->max_block_id; block_id++)
9827	{
9828	Buffer buffer;
9829
9830	if (XLogReadBufferForRedo(record, block_id, &buffer) != BLK_RESTORED)
9831	elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block");
9832	UnlockReleaseBuffer(buffer);
9833	}
9834	}
9835	else if (info == XLOG_BACKUP_END)
9836	{
9837	XLogRecPtr startpoint;
9838
9839	memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
9840
9841	if (ControlFile->backupStartPoint == startpoint)
9842	{
9843	/*
9844	* We have reached the end of base backup, the point where
9845	* pg_stop_backup() was done. The data on disk is now consistent.
9846	* Reset backupStartPoint, and update minRecoveryPoint to make
9847	* sure we don't allow starting up at an earlier point even if
9848	* recovery is stopped and restarted soon after this.
9849	*/
9850	elog(DEBUG1, "end of backup reached");
9851
9852	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9853
9854	if (ControlFile->minRecoveryPoint < lsn)
9855	{
9856	ControlFile->minRecoveryPoint = lsn;
9857	ControlFile->minRecoveryPointTLI = ThisTimeLineID;
9858	}
9859	ControlFile->backupStartPoint = InvalidXLogRecPtr;
9860	ControlFile->backupEndRequired = false;
9861	UpdateControlFile();
9862
9863	LWLockRelease(ControlFileLock);
9864	}
9865	}
9866	else if (info == XLOG_PARAMETER_CHANGE)
9867	{
9868	xl_parameter_change xlrec;
9869
9870	/ Update our copy of the parameters in pg_control /
9871	memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_parameter_change));
9872
9873	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9874	ControlFile->MaxConnections = xlrec.MaxConnections;
9875	ControlFile->max_worker_processes = xlrec.max_worker_processes;
9876	ControlFile->max_wal_senders = xlrec.max_wal_senders;
9877	ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts;
9878	ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact;
9879	ControlFile->wal_level = xlrec.wal_level;
9880	ControlFile->wal_log_hints = xlrec.wal_log_hints;
9881
9882	/*
9883	* Update minRecoveryPoint to ensure that if recovery is aborted, we
9884	* recover back up to this point before allowing hot standby again.
9885	* This is important if the max_* settings are decreased, to ensure
9886	* you don't run queries against the WAL preceding the change. The
9887	* local copies cannot be updated as long as crash recovery is
9888	* happening and we expect all the WAL to be replayed.
9889	*/
9890	if (InArchiveRecovery)
9891	{
9892	minRecoveryPoint = ControlFile->minRecoveryPoint;
9893	minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
9894	}
9895	if (minRecoveryPoint != InvalidXLogRecPtr && minRecoveryPoint < lsn)
9896	{
9897	ControlFile->minRecoveryPoint = lsn;
9898	ControlFile->minRecoveryPointTLI = ThisTimeLineID;
9899	}
9900
9901	CommitTsParameterChange(xlrec.track_commit_timestamp,
9902	ControlFile->track_commit_timestamp);
9903	ControlFile->track_commit_timestamp = xlrec.track_commit_timestamp;
9904
9905	UpdateControlFile();
9906	LWLockRelease(ControlFileLock);
9907
9908	/ Check to see if any parameter change gives a problem on recovery /
9909	CheckRequiredParameterValues();
9910	}
9911	else if (info == XLOG_FPW_CHANGE)
9912	{
9913	bool fpw;
9914
9915	memcpy(&fpw, XLogRecGetData(record), sizeof(bool));
9916
9917	/*
9918	* Update the LSN of the last replayed XLOG_FPW_CHANGE record so that
9919	* do_pg_start_backup() and do_pg_stop_backup() can check whether
9920	* full_page_writes has been disabled during online backup.
9921	*/
9922	if (!fpw)
9923	{
9924	SpinLockAcquire(&XLogCtl->info_lck);
9925	if (XLogCtl->lastFpwDisableRecPtr < ReadRecPtr)
9926	XLogCtl->lastFpwDisableRecPtr = ReadRecPtr;
9927	SpinLockRelease(&XLogCtl->info_lck);
9928	}
9929
9930	/ Keep track of full_page_writes /
9931	lastFullPageWrites = fpw;
9932	}
9933	}
9934
#ifdef WAL_DEBUG

/*
 * Append a debugging description of a WAL record's header to buf: the
 * previous-record pointer, the xid, the data length, and one entry for each
 * block reference present in the record.
 */
static void
xlog_outrec(StringInfo buf, XLogReaderState *record)
{
	int			block_id;

	appendStringInfo(buf, "prev %X/%X; xid %u",
					 (uint32) (XLogRecGetPrev(record) >> 32),
					 (uint32) XLogRecGetPrev(record),
					 XLogRecGetXid(record));

	appendStringInfo(buf, "; len %u",
					 XLogRecGetDataLen(record));

	/* decode block references */
	for (block_id = 0; block_id <= record->max_block_id; block_id++)
	{
		RelFileNode rnode;
		ForkNumber	forknum;
		BlockNumber blk;

		/* skip block IDs that this record does not reference */
		if (!XLogRecHasBlockRef(record, block_id))
			continue;

		XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blk);
		/* the fork number is only printed when it is not the main fork */
		if (forknum != MAIN_FORKNUM)
			appendStringInfo(buf, "; blkref #%u: rel %u/%u/%u, fork %u, blk %u",
							 block_id,
							 rnode.spcNode, rnode.dbNode, rnode.relNode,
							 forknum,
							 blk);
		else
			appendStringInfo(buf, "; blkref #%u: rel %u/%u/%u, blk %u",
							 block_id,
							 rnode.spcNode, rnode.dbNode, rnode.relNode,
							 blk);
		if (XLogRecHasBlockImage(record, block_id))
			appendStringInfoString(buf, " FPW");
	}
}
#endif							/* WAL_DEBUG */
9977
9978	/*
9979	* Returns a string describing an XLogRecord, consisting of its identity
9980	* optionally followed by a colon, a space, and a further description.
9981	*/
9982	static void
9983	xlog_outdesc(StringInfo buf, XLogReaderState *record)
9984	{
9985	RmgrId rmid = XLogRecGetRmid(record);
9986	uint8 info = XLogRecGetInfo(record);
9987	const char *id;
9988
9989	appendStringInfoString(buf, RmgrTable[rmid].rm_name);
9990	appendStringInfoChar(buf, `'/'`);
9991
9992	id = RmgrTable[rmid].rm_identify(info);
9993	if (id == NULL)
9994	appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK);
9995	else
9996	appendStringInfo(buf, "%s: ", id);
9997
9998	RmgrTable[rmid].rm_desc(buf, record);
9999	}
10000
10001
10002	/*
10003	* Return the (possible) sync flag used for opening a file, depending on the
10004	* value of the GUC wal_sync_method.
10005	*/
10006	static int
10007	get_sync_bit(int method)
10008	{
10009	int o_direct_flag = `0`;
10010
10011	/ If fsync is disabled, never open in sync mode /
10012	if (!enableFsync)
10013	return `0`;
10014
10015	/*
10016	* Optimize writes by bypassing kernel cache with O_DIRECT when using
10017	* O_SYNC/O_FSYNC and O_DSYNC. But only if archiving and streaming are
10018	* disabled, otherwise the archive command or walsender process will read
10019	* the WAL soon after writing it, which is guaranteed to cause a physical
10020	* read if we bypassed the kernel cache. We also skip the
10021	* posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same
10022	* reason.
10023	*
10024	* Never use O_DIRECT in walreceiver process for similar reasons; the WAL
10025	* written by walreceiver is normally read by the startup process soon
10026	* after its written. Also, walreceiver performs unaligned writes, which
10027	* don't work with O_DIRECT, so it is required for correctness too.
10028	*/
10029	if (!XLogIsNeeded() && !AmWalReceiverProcess())
10030	o_direct_flag = PG_O_DIRECT;
10031
10032	switch (method)
10033	{
10034	/*
10035	* enum values for all sync options are defined even if they are
10036	* not supported on the current platform. But if not, they are
10037	* not included in the enum option array, and therefore will never
10038	* be seen here.
10039	*/
10040	case SYNC_METHOD_FSYNC:
10041	case SYNC_METHOD_FSYNC_WRITETHROUGH:
10042	case SYNC_METHOD_FDATASYNC:
10043	return `0`;
10044	#ifdef OPEN_SYNC_FLAG
10045	case SYNC_METHOD_OPEN:
10046	return OPEN_SYNC_FLAG \| o_direct_flag;
10047	#endif
10048	#ifdef OPEN_DATASYNC_FLAG
10049	case SYNC_METHOD_OPEN_DSYNC:
10050	return OPEN_DATASYNC_FLAG \| o_direct_flag;
10051	#endif
10052	default:
10053	/ can't happen (unless we are out of sync with option array) /
10054	elog(ERROR, "unrecognized wal_sync_method: %d", method);
10055	return `0`; / silence warning /
10056	}
10057	}
10058
10059	/*
10060	* GUC support
10061	*/
10062	void
10063	assign_xlog_sync_method(int new_sync_method, void *extra)
10064	{
10065	if (sync_method != new_sync_method)
10066	{
10067	/*
10068	* To ensure that no blocks escape unsynced, force an fsync on the
10069	* currently open log segment (if any). Also, if the open flag is
10070	* changing, close the log file so it will be reopened (with new flag
10071	* bit) at next use.
10072	*/
10073	if (openLogFile >= `0`)
10074	{
10075	pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN);
10076	if (pg_fsync(openLogFile) != `0`)
10077	ereport(PANIC,
10078	(errcode_for_file_access(),
10079	errmsg("could not fsync file \"%s\": %m",
10080	XLogFileNameP(ThisTimeLineID, openLogSegNo))));
10081	pgstat_report_wait_end();
10082	if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method))
10083	XLogFileClose();
10084	}
10085	}
10086	}
10087
10088
10089	/*
10090	* Issue appropriate kind of fsync (if any) for an XLOG output file.
10091	*
10092	* 'fd' is a file descriptor for the XLOG file to be fsync'd.
10093	* 'segno' is for error reporting purposes.
10094	*/
10095	void
10096	issue_xlog_fsync(int fd, XLogSegNo segno)
10097	{
10098	pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC);
10099	switch (sync_method)
10100	{
10101	case SYNC_METHOD_FSYNC:
10102	if (pg_fsync_no_writethrough(fd) != `0`)
10103	ereport(PANIC,
10104	(errcode_for_file_access(),
10105	errmsg("could not fsync file \"%s\": %m",
10106	XLogFileNameP(ThisTimeLineID, segno))));
10107	break;
10108	#ifdef HAVE_FSYNC_WRITETHROUGH
10109	case SYNC_METHOD_FSYNC_WRITETHROUGH:
10110	if (pg_fsync_writethrough(fd) != `0`)
10111	ereport(PANIC,
10112	(errcode_for_file_access(),
10113	errmsg("could not fsync write-through file \"%s\": %m",
10114	XLogFileNameP(ThisTimeLineID, segno))));
10115	break;
10116	#endif
10117	#ifdef HAVE_FDATASYNC
10118	case SYNC_METHOD_FDATASYNC:
10119	if (pg_fdatasync(fd) != `0`)
10120	ereport(PANIC,
10121	(errcode_for_file_access(),
10122	errmsg("could not fdatasync file \"%s\": %m",
10123	XLogFileNameP(ThisTimeLineID, segno))));
10124	break;
10125	#endif
10126	case SYNC_METHOD_OPEN:
10127	case SYNC_METHOD_OPEN_DSYNC:
10128	/ write synced it already /
10129	break;
10130	default:
10131	elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
10132	break;
10133	}
10134	pgstat_report_wait_end();
10135	}
10136
10137	/*
10138	* Return the filename of given log segment, as a palloc'd string.
10139	*/
10140	char *
10141	XLogFileNameP(TimeLineID tli, XLogSegNo segno)
10142	{
10143	char *result = palloc(MAXFNAMELEN);
10144
10145	XLogFileName(result, tli, segno, wal_segment_size);
10146	return result;
10147	}
10148
10149	/*
10150	* do_pg_start_backup
10151	*
10152	* Utility function called at the start of an online backup. It creates the
10153	* necessary starting checkpoint and constructs the backup label file.
10154	*
10155	* There are two kind of backups: exclusive and non-exclusive. An exclusive
10156	* backup is started with pg_start_backup(), and there can be only one active
10157	* at a time. The backup and tablespace map files of an exclusive backup are
10158	* written to $PGDATA/backup_label and $PGDATA/tablespace_map, and they are
10159	* removed by pg_stop_backup().
10160	*
10161	* A non-exclusive backup is used for the streaming base backups (see
10162	* src/backend/replication/basebackup.c). The difference to exclusive backups
10163	* is that the backup label and tablespace map files are not written to disk.
10164	* Instead, their would-be contents are returned in labelfile and tblspcmapfile,
10165	* and the caller is responsible for including them in the backup archive as
10166	* 'backup_label' and 'tablespace_map'. There can be many non-exclusive backups
10167	* active at the same time, and they don't conflict with an exclusive backup
10168	* either.
10169	*
10170	* tblspcmapfile is required mainly for tar format in windows as native windows
10171	* utilities are not able to create symlinks while extracting files from tar.
10172	* However for consistency, the same is used for all platforms.
10173	*
10174	* needtblspcmapfile is true for the cases (exclusive backup and for
10175	* non-exclusive backup only when tar format is used for taking backup)
10176	* when backup needs to generate tablespace_map file, it is used to
10177	* embed escape character before newline character in tablespace path.
10178	*
10179	* Returns the minimum WAL location that must be present to restore from this
10180	* backup, and the corresponding timeline ID in *starttli_p.
10181	*
10182	* Every successfully started non-exclusive backup must be stopped by calling
10183	* do_pg_stop_backup() or do_pg_abort_backup().
10184	*
10185	* It is the responsibility of the caller of this function to verify the
10186	* permissions of the calling user!
10187	*/
10188	XLogRecPtr
10189	do_pg_start_backup(const char backupidstr, bool fast, TimeLineID starttli_p,
10190	StringInfo labelfile, List **tablespaces,
10191	StringInfo tblspcmapfile, bool infotbssize,
10192	bool needtblspcmapfile)
10193	{
10194	bool exclusive = (labelfile == NULL);
10195	bool backup_started_in_recovery = false;
10196	XLogRecPtr checkpointloc;
10197	XLogRecPtr startpoint;
10198	TimeLineID starttli;
10199	pg_time_t stamp_time;
10200	char strfbuf[`128`];
10201	char xlogfilename[MAXFNAMELEN];
10202	XLogSegNo _logSegNo;
10203	struct stat stat_buf;
10204	FILE *fp;
10205
10206	backup_started_in_recovery = RecoveryInProgress();
10207
10208	/*
10209	* Currently only non-exclusive backup can be taken during recovery.
10210	*/
10211	if (backup_started_in_recovery && exclusive)
10212	ereport(ERROR,
10213	(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10214	errmsg("recovery is in progress"),
10215	errhint("WAL control functions cannot be executed during recovery.")));
10216
10217	/*
10218	* During recovery, we don't need to check WAL level. Because, if WAL
10219	* level is not sufficient, it's impossible to get here during recovery.
10220	*/
10221	if (!backup_started_in_recovery && !XLogIsNeeded())
10222	ereport(ERROR,
10223	(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10224	errmsg("WAL level not sufficient for making an online backup"),
10225	errhint("wal_level must be set to \"replica\" or \"logical\" at server start.")));
10226
10227	if (strlen(backupidstr) > MAXPGPATH)
10228	ereport(ERROR,
10229	(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
10230	errmsg("backup label too long (max %d bytes)",
10231	MAXPGPATH)));
10232
10233	/*
10234	* Mark backup active in shared memory. We must do full-page WAL writes
10235	* during an on-line backup even if not doing so at other times, because
10236	* it's quite possible for the backup dump to obtain a "torn" (partially
10237	* written) copy of a database page if it reads the page concurrently with
10238	* our write to the same page. This can be fixed as long as the first
10239	* write to the page in the WAL sequence is a full-page write. Hence, we
10240	* turn on forcePageWrites and then force a CHECKPOINT, to ensure there
10241	* are no dirty pages in shared memory that might get dumped while the
10242	* backup is in progress without having a corresponding WAL record. (Once
10243	* the backup is complete, we need not force full-page writes anymore,
10244	* since we expect that any pages not modified during the backup interval
10245	* must have been correctly captured by the backup.)
10246	*
10247	* Note that forcePageWrites has no effect during an online backup from
10248	* the standby.
10249	*
10250	* We must hold all the insertion locks to change the value of
10251	* forcePageWrites, to ensure adequate interlocking against
10252	* XLogInsertRecord().
10253	*/
10254	WALInsertLockAcquireExclusive();
10255	if (exclusive)
10256	{
10257	/*
10258	* At first, mark that we're now starting an exclusive backup, to
10259	* ensure that there are no other sessions currently running
10260	* pg_start_backup() or pg_stop_backup().
10261	*/
10262	if (XLogCtl->Insert.exclusiveBackupState != EXCLUSIVE_BACKUP_NONE)
10263	{
10264	WALInsertLockRelease();
10265	ereport(ERROR,
10266	(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10267	errmsg("a backup is already in progress"),
10268	errhint("Run pg_stop_backup() and try again.")));
10269	}
10270	XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_STARTING;
10271	}
10272	else
10273	XLogCtl->Insert.nonExclusiveBackups++;
10274	XLogCtl->Insert.forcePageWrites = true;
10275	WALInsertLockRelease();
10276
10277	/ Ensure we release forcePageWrites if fail below /
10278	PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
10279	{
10280	bool gotUniqueStartpoint = false;
10281	DIR *tblspcdir;
10282	struct dirent *de;
10283	tablespaceinfo *ti;
10284	int datadirpathlen;
10285
10286	/*
10287	* Force an XLOG file switch before the checkpoint, to ensure that the
10288	* WAL segment the checkpoint is written to doesn't contain pages with
10289	* old timeline IDs. That would otherwise happen if you called
10290	* pg_start_backup() right after restoring from a PITR archive: the
10291	* first WAL segment containing the startup checkpoint has pages in
10292	* the beginning with the old timeline ID. That can cause trouble at
10293	* recovery: we won't have a history file covering the old timeline if
10294	* pg_wal directory was not included in the base backup and the WAL
10295	* archive was cleared too before starting the backup.
10296	*
10297	* This also ensures that we have emitted a WAL page header that has
10298	* XLP_BKP_REMOVABLE off before we emit the checkpoint record.
10299	* Therefore, if a WAL archiver (such as pglesslog) is trying to
10300	* compress out removable backup blocks, it won't remove any that
10301	* occur after this point.
10302	*
10303	* During recovery, we skip forcing XLOG file switch, which means that
10304	* the backup taken during recovery is not available for the special
10305	* recovery case described above.
10306	*/
10307	if (!backup_started_in_recovery)
10308	RequestXLogSwitch(false);
10309
10310	do
10311	{
10312	bool checkpointfpw;
10313
10314	/*
10315	* Force a CHECKPOINT. Aside from being necessary to prevent torn
10316	* page problems, this guarantees that two successive backup runs
10317	* will have different checkpoint positions and hence different
10318	* history file names, even if nothing happened in between.
10319	*
10320	* During recovery, establish a restartpoint if possible. We use
10321	* the last restartpoint as the backup starting checkpoint. This
10322	* means that two successive backup runs can have same checkpoint
10323	* positions.
10324	*
10325	* Since the fact that we are executing do_pg_start_backup()
10326	* during recovery means that checkpointer is running, we can use
10327	* RequestCheckpoint() to establish a restartpoint.
10328	*
10329	* We use CHECKPOINT_IMMEDIATE only if requested by user (via
10330	* passing fast = true). Otherwise this can take awhile.
10331	*/
10332	RequestCheckpoint(CHECKPOINT_FORCE \| CHECKPOINT_WAIT \|
10333	(fast ? CHECKPOINT_IMMEDIATE : `0`));
10334
10335	/*
10336	* Now we need to fetch the checkpoint record location, and also
10337	* its REDO pointer. The oldest point in WAL that would be needed
10338	* to restore starting from the checkpoint is precisely the REDO
10339	* pointer.
10340	*/
10341	LWLockAcquire(ControlFileLock, LW_SHARED);
10342	checkpointloc = ControlFile->checkPoint;
10343	startpoint = ControlFile->checkPointCopy.redo;
10344	starttli = ControlFile->checkPointCopy.ThisTimeLineID;
10345	checkpointfpw = ControlFile->checkPointCopy.fullPageWrites;
10346	LWLockRelease(ControlFileLock);
10347
10348	if (backup_started_in_recovery)
10349	{
10350	XLogRecPtr recptr;
10351
10352	/*
10353	* Check to see if all WAL replayed during online backup
10354	* (i.e., since last restartpoint used as backup starting
10355	* checkpoint) contain full-page writes.
10356	*/
10357	SpinLockAcquire(&XLogCtl->info_lck);
10358	recptr = XLogCtl->lastFpwDisableRecPtr;
10359	SpinLockRelease(&XLogCtl->info_lck);
10360
10361	if (!checkpointfpw \|\| startpoint <= recptr)
10362	ereport(ERROR,
10363	(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10364	errmsg("WAL generated with full_page_writes=off was replayed "
10365	"since last restartpoint"),
10366	errhint("This means that the backup being taken on the standby "
10367	"is corrupt and should not be used. "
10368	"Enable full_page_writes and run CHECKPOINT on the master, "
10369	"and then try an online backup again.")));
10370
10371	/*
10372	* During recovery, since we don't use the end-of-backup WAL
10373	* record and don't write the backup history file, the
10374	* starting WAL location doesn't need to be unique. This means
10375	* that two base backups started at the same time might use
10376	* the same checkpoint as starting locations.
10377	*/
10378	gotUniqueStartpoint = true;
10379	}
10380
10381	/*
10382	* If two base backups are started at the same time (in WAL sender
10383	* processes), we need to make sure that they use different
10384	* checkpoints as starting locations, because we use the starting
10385	* WAL location as a unique identifier for the base backup in the
10386	* end-of-backup WAL record and when we write the backup history
10387	* file. Perhaps it would be better generate a separate unique ID
10388	* for each backup instead of forcing another checkpoint, but
10389	* taking a checkpoint right after another is not that expensive
10390	* either because only few buffers have been dirtied yet.
10391	*/
10392	WALInsertLockAcquireExclusive();
10393	if (XLogCtl->Insert.lastBackupStart < startpoint)
10394	{
10395	XLogCtl->Insert.lastBackupStart = startpoint;
10396	gotUniqueStartpoint = true;
10397	}
10398	WALInsertLockRelease();
10399	} while (!gotUniqueStartpoint);
10400
10401	XLByteToSeg(startpoint, _logSegNo, wal_segment_size);
10402	XLogFileName(xlogfilename, starttli, _logSegNo, wal_segment_size);
10403
10404	/*
10405	* Construct tablespace_map file
10406	*/
10407	if (exclusive)
10408	tblspcmapfile = makeStringInfo();
10409
10410	datadirpathlen = strlen(DataDir);
10411
10412	/ Collect information about all tablespaces /
10413	tblspcdir = AllocateDir("pg_tblspc");
10414	while ((de = ReadDir(tblspcdir, "pg_tblspc")) != NULL)
10415	{
10416	char fullpath[MAXPGPATH + `10`];
10417	char linkpath[MAXPGPATH];
10418	char *relpath = NULL;
10419	int rllen;
10420	StringInfoData buflinkpath;
10421	char *s = linkpath;
10422
10423	/ Skip special stuff /
10424	if (strcmp(de->d_name, ".") == `0` \|\| strcmp(de->d_name, "..") == `0`)
10425	continue;
10426
10427	snprintf(fullpath, sizeof(fullpath), "pg_tblspc/%s", de->d_name);
10428
10429	#if defined(HAVE_READLINK) \|\| defined(WIN32)
10430	rllen = readlink(fullpath, linkpath, sizeof(linkpath));
10431	if (rllen < `0`)
10432	{
10433	ereport(WARNING,
10434	(errmsg("could not read symbolic link \"%s\": %m",
10435	fullpath)));
10436	continue;
10437	}
10438	else if (rllen >= sizeof(linkpath))
10439	{
10440	ereport(WARNING,
10441	(errmsg("symbolic link \"%s\" target is too long",
10442	fullpath)));
10443	continue;
10444	}
10445	linkpath[rllen] = `'\0'`;
10446
10447	/*
10448	* Add the escape character '\\' before newline in a string to
10449	* ensure that we can distinguish between the newline in the
10450	* tablespace path and end of line while reading tablespace_map
10451	* file during archive recovery.
10452	*/
10453	initStringInfo(&buflinkpath);
10454
10455	while (*s)
10456	{
10457	if ((s == `'\n'` \|\| s == `'\r'`) && needtblspcmapfile)
10458	appendStringInfoChar(&buflinkpath, `'\\'`);
10459	appendStringInfoChar(&buflinkpath, *s++);
10460	}
10461
10462	/*
10463	* Relpath holds the relative path of the tablespace directory
10464	* when it's located within PGDATA, or NULL if it's located
10465	* elsewhere.
10466	*/
10467	if (rllen > datadirpathlen &&
10468	strncmp(linkpath, DataDir, datadirpathlen) == `0` &&
10469	IS_DIR_SEP(linkpath[datadirpathlen]))
10470	relpath = linkpath + datadirpathlen + `1`;
10471
10472	ti = palloc(sizeof(tablespaceinfo));
10473	ti->oid = pstrdup(de->d_name);
10474	ti->path = pstrdup(buflinkpath.data);
10475	ti->rpath = relpath ? pstrdup(relpath) : NULL;
10476	ti->size = infotbssize ? sendTablespace(fullpath, true) : -`1`;
10477
10478	if (tablespaces)
10479	tablespaces = lappend(tablespaces, ti);
10480
10481	appendStringInfo(tblspcmapfile, "%s %s\n", ti->oid, ti->path);
10482
10483	pfree(buflinkpath.data);
10484	#else
10485
10486	/*
10487	* If the platform does not have symbolic links, it should not be
10488	* possible to have tablespaces - clearly somebody else created
10489	* them. Warn about it and ignore.
10490	*/
10491	ereport(WARNING,
10492	(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
10493	errmsg("tablespaces are not supported on this platform")));
10494	#endif
10495	}
10496	FreeDir(tblspcdir);
10497
10498	/*
10499	* Construct backup label file
10500	*/
10501	if (exclusive)
10502	labelfile = makeStringInfo();
10503
10504	/ Use the log timezone here, not the session timezone /
10505	stamp_time = (pg_time_t) time(NULL);
10506	pg_strftime(strfbuf, sizeof(strfbuf),
10507	"%Y-%m-%d %H:%M:%S %Z",
10508	pg_localtime(&stamp_time, log_timezone));
10509	appendStringInfo(labelfile, "START WAL LOCATION: %X/%X (file %s)\n",
10510	(uint32) (startpoint >> `32`), (uint32) startpoint, xlogfilename);
10511	appendStringInfo(labelfile, "CHECKPOINT LOCATION: %X/%X\n",
10512	(uint32) (checkpointloc >> `32`), (uint32) checkpointloc);
10513	appendStringInfo(labelfile, "BACKUP METHOD: %s\n",
10514	exclusive ? "pg_start_backup" : "streamed");
10515	appendStringInfo(labelfile, "BACKUP FROM: %s\n",
10516	backup_started_in_recovery ? "standby" : "master");
10517	appendStringInfo(labelfile, "START TIME: %s\n", strfbuf);
10518	appendStringInfo(labelfile, "LABEL: %s\n", backupidstr);
10519	appendStringInfo(labelfile, "START TIMELINE: %u\n", starttli);
10520
10521	/*
10522	* Okay, write the file, or return its contents to caller.
10523	*/
10524	if (exclusive)
10525	{
10526	/*
10527	* Check for existing backup label --- implies a backup is already
10528	* running. (XXX given that we checked exclusiveBackupState
10529	* above, maybe it would be OK to just unlink any such label
10530	* file?)
10531	*/
10532	if (stat(BACKUP_LABEL_FILE, &stat_buf) != `0`)
10533	{
10534	if (errno != ENOENT)
10535	ereport(ERROR,
10536	(errcode_for_file_access(),
10537	errmsg("could not stat file \"%s\": %m",
10538	BACKUP_LABEL_FILE)));
10539	}
10540	else
10541	ereport(ERROR,
10542	(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10543	errmsg("a backup is already in progress"),
10544	errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
10545	BACKUP_LABEL_FILE)));
10546
10547	fp = AllocateFile(BACKUP_LABEL_FILE, "w");
10548
10549	if (!fp)
10550	ereport(ERROR,
10551	(errcode_for_file_access(),
10552	errmsg("could not create file \"%s\": %m",
10553	BACKUP_LABEL_FILE)));
10554	if (fwrite(labelfile->data, labelfile->len, `1`, fp) != `1` \|\|
10555	fflush(fp) != `0` \|\|
10556	pg_fsync(fileno(fp)) != `0` \|\|
10557	ferror(fp) \|\|
10558	FreeFile(fp))
10559	ereport(ERROR,
10560	(errcode_for_file_access(),
10561	errmsg("could not write file \"%s\": %m",
10562	BACKUP_LABEL_FILE)));
10563	/ Allocated locally for exclusive backups, so free separately /
10564	pfree(labelfile->data);
10565	pfree(labelfile);
10566
10567	/ Write backup tablespace_map file. /
10568	if (tblspcmapfile->len > `0`)
10569	{
10570	if (stat(TABLESPACE_MAP, &stat_buf) != `0`)
10571	{
10572	if (errno != ENOENT)
10573	ereport(ERROR,
10574	(errcode_for_file_access(),
10575	errmsg("could not stat file \"%s\": %m",
10576	TABLESPACE_MAP)));
10577	}
10578	else
10579	ereport(ERROR,
10580	(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10581	errmsg("a backup is already in progress"),
10582	errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
10583	TABLESPACE_MAP)));
10584
10585	fp = AllocateFile(TABLESPACE_MAP, "w");
10586
10587	if (!fp)
10588	ereport(ERROR,
10589	(errcode_for_file_access(),
10590	errmsg("could not create file \"%s\": %m",
10591	TABLESPACE_MAP)));
10592	if (fwrite(tblspcmapfile->data, tblspcmapfile->len, `1`, fp) != `1` \|\|
10593	fflush(fp) != `0` \|\|
10594	pg_fsync(fileno(fp)) != `0` \|\|
10595	ferror(fp) \|\|
10596	FreeFile(fp))
10597	ereport(ERROR,
10598	(errcode_for_file_access(),
10599	errmsg("could not write file \"%s\": %m",
10600	TABLESPACE_MAP)));
10601	}
10602
10603	/ Allocated locally for exclusive backups, so free separately /
10604	pfree(tblspcmapfile->data);
10605	pfree(tblspcmapfile);
10606	}
10607	}
10608	PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
10609
10610	/*
10611	* Mark that start phase has correctly finished for an exclusive backup.
10612	* Session-level locks are updated as well to reflect that state.
10613	*
10614	* Note that CHECK_FOR_INTERRUPTS() must not occur while updating backup
10615	* counters and session-level lock. Otherwise they can be updated
10616	* inconsistently, and which might cause do_pg_abort_backup() to fail.
10617	*/
10618	if (exclusive)
10619	{
10620	WALInsertLockAcquireExclusive();
10621	XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_IN_PROGRESS;
10622
10623	/ Set session-level lock /
10624	sessionBackupState = SESSION_BACKUP_EXCLUSIVE;
10625	WALInsertLockRelease();
10626	}
10627	else
10628	sessionBackupState = SESSION_BACKUP_NON_EXCLUSIVE;
10629
10630	/*
10631	* We're done. As a convenience, return the starting WAL location.
10632	*/
10633	if (starttli_p)
10634	*starttli_p = starttli;
10635	return startpoint;
10636	}
10637
10638	/ Error cleanup callback for pg_start_backup /
10639	static void
10640	pg_start_backup_callback(int code, Datum arg)
10641	{
10642	bool exclusive = DatumGetBool(arg);
10643
10644	/ Update backup counters and forcePageWrites on failure /
10645	WALInsertLockAcquireExclusive();
10646	if (exclusive)
10647	{
10648	Assert(XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_STARTING);
10649	XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_NONE;
10650	}
10651	else
10652	{
10653	Assert(XLogCtl->Insert.nonExclusiveBackups > `0`);
10654	XLogCtl->Insert.nonExclusiveBackups--;
10655	}
10656
10657	if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
10658	XLogCtl->Insert.nonExclusiveBackups == `0`)
10659	{
10660	XLogCtl->Insert.forcePageWrites = false;
10661	}
10662	WALInsertLockRelease();
10663	}
10664
10665	/*
10666	* Error cleanup callback for pg_stop_backup
10667	*/
10668	static void
10669	pg_stop_backup_callback(int code, Datum arg)
10670	{
10671	bool exclusive = DatumGetBool(arg);
10672
10673	/ Update backup status on failure /
10674	WALInsertLockAcquireExclusive();
10675	if (exclusive)
10676	{
10677	Assert(XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_STOPPING);
10678	XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_IN_PROGRESS;
10679	}
10680	WALInsertLockRelease();
10681	}
10682
10683	/*
10684	* Utility routine to fetch the session-level status of a backup running.
10685	*/
10686	SessionBackupState
10687	get_backup_status(void)
10688	{
10689	return sessionBackupState;
10690	}
10691
10692	/*
10693	* do_pg_stop_backup
10694	*
10695	* Utility function called at the end of an online backup. It cleans up the
10696	* backup state and can optionally wait for WAL segments to be archived.
10697	*
10698	* If labelfile is NULL, this stops an exclusive backup. Otherwise this stops
10699	* the non-exclusive backup specified by 'labelfile'.
10700	*
10701	* Returns the last WAL location that must be present to restore from this
10702	* backup, and the corresponding timeline ID in *stoptli_p.
10703	*
10704	* It is the responsibility of the caller of this function to verify the
10705	* permissions of the calling user!
10706	*/
10707	XLogRecPtr
10708	do_pg_stop_backup(char labelfile, bool waitforarchive, TimeLineID stoptli_p)
10709	{
10710	bool exclusive = (labelfile == NULL);
10711	bool backup_started_in_recovery = false;
10712	XLogRecPtr startpoint;
10713	XLogRecPtr stoppoint;
10714	TimeLineID stoptli;
10715	pg_time_t stamp_time;
10716	char strfbuf[`128`];
10717	char histfilepath[MAXPGPATH];
10718	char startxlogfilename[MAXFNAMELEN];
10719	char stopxlogfilename[MAXFNAMELEN];
10720	char lastxlogfilename[MAXFNAMELEN];
10721	char histfilename[MAXFNAMELEN];
10722	char backupfrom[`20`];
10723	XLogSegNo _logSegNo;
10724	FILE *lfp;
10725	FILE *fp;
10726	char ch;
10727	int seconds_before_warning;
10728	int waits = `0`;
10729	bool reported_waiting = false;
10730	char *remaining;
10731	char *ptr;
10732	uint32 hi,
10733	lo;
10734
10735	backup_started_in_recovery = RecoveryInProgress();
10736
10737	/*
10738	* Currently only non-exclusive backup can be taken during recovery.
10739	*/
10740	if (backup_started_in_recovery && exclusive)
10741	ereport(ERROR,
10742	(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10743	errmsg("recovery is in progress"),
10744	errhint("WAL control functions cannot be executed during recovery.")));
10745
10746	/*
10747	* During recovery, we don't need to check WAL level. Because, if WAL
10748	* level is not sufficient, it's impossible to get here during recovery.
10749	*/
10750	if (!backup_started_in_recovery && !XLogIsNeeded())
10751	ereport(ERROR,
10752	(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10753	errmsg("WAL level not sufficient for making an online backup"),
10754	errhint("wal_level must be set to \"replica\" or \"logical\" at server start.")));
10755
10756	if (exclusive)
10757	{
10758	/*
10759	* At first, mark that we're now stopping an exclusive backup, to
10760	* ensure that there are no other sessions currently running
10761	* pg_start_backup() or pg_stop_backup().
10762	*/
10763	WALInsertLockAcquireExclusive();
10764	if (XLogCtl->Insert.exclusiveBackupState != EXCLUSIVE_BACKUP_IN_PROGRESS)
10765	{
10766	WALInsertLockRelease();
10767	ereport(ERROR,
10768	(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10769	errmsg("exclusive backup not in progress")));
10770	}
10771	XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_STOPPING;
10772	WALInsertLockRelease();
10773
10774	/*
10775	* Remove backup_label. In case of failure, the state for an exclusive
10776	* backup is switched back to in-progress.
10777	*/
10778	PG_ENSURE_ERROR_CLEANUP(pg_stop_backup_callback, (Datum) BoolGetDatum(exclusive));
10779	{
10780	/*
10781	* Read the existing label file into memory.
10782	*/
10783	struct stat statbuf;
10784	int r;
10785
10786	if (stat(BACKUP_LABEL_FILE, &statbuf))
10787	{
10788	/ should not happen per the upper checks /
10789	if (errno != ENOENT)
10790	ereport(ERROR,
10791	(errcode_for_file_access(),
10792	errmsg("could not stat file \"%s\": %m",
10793	BACKUP_LABEL_FILE)));
10794	ereport(ERROR,
10795	(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10796	errmsg("a backup is not in progress")));
10797	}
10798
10799	lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
10800	if (!lfp)
10801	{
10802	ereport(ERROR,
10803	(errcode_for_file_access(),
10804	errmsg("could not read file \"%s\": %m",
10805	BACKUP_LABEL_FILE)));
10806	}
10807	labelfile = palloc(statbuf.st_size + `1`);
10808	r = fread(labelfile, statbuf.st_size, `1`, lfp);
10809	labelfile[statbuf.st_size] = `'\0'`;
10810
10811	/*
10812	* Close and remove the backup label file
10813	*/
10814	if (r != `1` \|\| ferror(lfp) \|\| FreeFile(lfp))
10815	ereport(ERROR,
10816	(errcode_for_file_access(),
10817	errmsg("could not read file \"%s\": %m",
10818	BACKUP_LABEL_FILE)));
10819	durable_unlink(BACKUP_LABEL_FILE, ERROR);
10820
10821	/*
10822	* Remove tablespace_map file if present, it is created only if
10823	* there are tablespaces.
10824	*/
10825	durable_unlink(TABLESPACE_MAP, DEBUG1);
10826	}
10827	PG_END_ENSURE_ERROR_CLEANUP(pg_stop_backup_callback, (Datum) BoolGetDatum(exclusive));
10828	}
10829
10830	/*
10831	* OK to update backup counters, forcePageWrites and session-level lock.
10832	*
10833	* Note that CHECK_FOR_INTERRUPTS() must not occur while updating them.
10834	* Otherwise they can be updated inconsistently, and which might cause
10835	* do_pg_abort_backup() to fail.
10836	*/
10837	WALInsertLockAcquireExclusive();
10838	if (exclusive)
10839	{
10840	XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_NONE;
10841	}
10842	else
10843	{
10844	/*
10845	* The user-visible pg_start/stop_backup() functions that operate on
10846	* exclusive backups can be called at any time, but for non-exclusive
10847	* backups, it is expected that each do_pg_start_backup() call is
10848	* matched by exactly one do_pg_stop_backup() call.
10849	*/
10850	Assert(XLogCtl->Insert.nonExclusiveBackups > `0`);
10851	XLogCtl->Insert.nonExclusiveBackups--;
10852	}
10853
10854	if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
10855	XLogCtl->Insert.nonExclusiveBackups == `0`)
10856	{
10857	XLogCtl->Insert.forcePageWrites = false;
10858	}
10859
10860	/*
10861	* Clean up session-level lock.
10862	*
10863	* You might think that WALInsertLockRelease() can be called before
10864	* cleaning up session-level lock because session-level lock doesn't need
10865	* to be protected with WAL insertion lock. But since
10866	* CHECK_FOR_INTERRUPTS() can occur in it, session-level lock must be
10867	* cleaned up before it.
10868	*/
10869	sessionBackupState = SESSION_BACKUP_NONE;
10870
10871	WALInsertLockRelease();
10872
10873	/*
10874	* Read and parse the START WAL LOCATION line (this code is pretty crude,
10875	* but we are not expecting any variability in the file format).
10876	*/
10877	if (sscanf(labelfile, "START WAL LOCATION: %X/%X (file %24s)%c",
10878	&hi, &lo, startxlogfilename,
10879	&ch) != `4` \|\| ch != `'\n'`)
10880	ereport(ERROR,
10881	(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10882	errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
10883	startpoint = ((uint64) hi) << `32` \| lo;
10884	remaining = strchr(labelfile, `'\n'`) + `1`; / %n is not portable enough /
10885
10886	/*
10887	* Parse the BACKUP FROM line. If we are taking an online backup from the
10888	* standby, we confirm that the standby has not been promoted during the
10889	* backup.
10890	*/
10891	ptr = strstr(remaining, "BACKUP FROM:");
10892	if (!ptr \|\| sscanf(ptr, "BACKUP FROM: %19s\n", backupfrom) != `1`)
10893	ereport(ERROR,
10894	(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10895	errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
10896	if (strcmp(backupfrom, "standby") == `0` && !backup_started_in_recovery)
10897	ereport(ERROR,
10898	(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10899	errmsg("the standby was promoted during online backup"),
10900	errhint("This means that the backup being taken is corrupt "
10901	"and should not be used. "
10902	"Try taking another online backup.")));
10903
10904	/*
10905	* During recovery, we don't write an end-of-backup record. We assume that
10906	* pg_control was backed up last and its minimum recovery point can be
10907	* available as the backup end location. Since we don't have an
10908	* end-of-backup record, we use the pg_control value to check whether
10909	* we've reached the end of backup when starting recovery from this
10910	* backup. We have no way of checking if pg_control wasn't backed up last
10911	* however.
10912	*
10913	* We don't force a switch to new WAL file but it is still possible to
10914	* wait for all the required files to be archived if waitforarchive is
10915	* true. This is okay if we use the backup to start a standby and fetch
10916	* the missing WAL using streaming replication. But in the case of an
10917	* archive recovery, a user should set waitforarchive to true and wait for
10918	* them to be archived to ensure that all the required files are
10919	* available.
10920	*
10921	* We return the current minimum recovery point as the backup end
10922	* location. Note that it can be greater than the exact backup end
10923	* location if the minimum recovery point is updated after the backup of
10924	* pg_control. This is harmless for current uses.
10925	*
10926	* XXX currently a backup history file is for informational and debug
10927	* purposes only. It's not essential for an online backup. Furthermore,
10928	* even if it's created, it will not be archived during recovery because
10929	* an archiver is not invoked. So it doesn't seem worthwhile to write a
10930	* backup history file during recovery.
10931	*/
10932	if (backup_started_in_recovery)
10933	{
10934	XLogRecPtr recptr;
10935
10936	/*
10937	* Check to see if all WAL replayed during online backup contain
10938	* full-page writes.
10939	*/
10940	SpinLockAcquire(&XLogCtl->info_lck);
10941	recptr = XLogCtl->lastFpwDisableRecPtr;
10942	SpinLockRelease(&XLogCtl->info_lck);
10943
10944	if (startpoint <= recptr)
10945	ereport(ERROR,
10946	(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10947	errmsg("WAL generated with full_page_writes=off was replayed "
10948	"during online backup"),
10949	errhint("This means that the backup being taken on the standby "
10950	"is corrupt and should not be used. "
10951	"Enable full_page_writes and run CHECKPOINT on the master, "
10952	"and then try an online backup again.")));
10953
10954
10955	LWLockAcquire(ControlFileLock, LW_SHARED);
10956	stoppoint = ControlFile->minRecoveryPoint;
10957	stoptli = ControlFile->minRecoveryPointTLI;
10958	LWLockRelease(ControlFileLock);
10959	}
10960	else
10961	{
10962	/*
10963	* Write the backup-end xlog record
10964	*/
10965	XLogBeginInsert();
10966	XLogRegisterData((char ) (&startpoint), sizeof*(startpoint));
10967	stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END);
10968	stoptli = ThisTimeLineID;
10969
10970	/*
10971	* Force a switch to a new xlog segment file, so that the backup is
10972	* valid as soon as archiver moves out the current segment file.
10973	*/
10974	RequestXLogSwitch(false);
10975
10976	XLByteToPrevSeg(stoppoint, _logSegNo, wal_segment_size);
10977	XLogFileName(stopxlogfilename, stoptli, _logSegNo, wal_segment_size);
10978
10979	/ Use the log timezone here, not the session timezone /
10980	stamp_time = (pg_time_t) time(NULL);
10981	pg_strftime(strfbuf, sizeof(strfbuf),
10982	"%Y-%m-%d %H:%M:%S %Z",
10983	pg_localtime(&stamp_time, log_timezone));
10984
10985	/*
10986	* Write the backup history file
10987	*/
10988	XLByteToSeg(startpoint, _logSegNo, wal_segment_size);
10989	BackupHistoryFilePath(histfilepath, stoptli, _logSegNo,
10990	startpoint, wal_segment_size);
10991	fp = AllocateFile(histfilepath, "w");
10992	if (!fp)
10993	ereport(ERROR,
10994	(errcode_for_file_access(),
10995	errmsg("could not create file \"%s\": %m",
10996	histfilepath)));
10997	fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
10998	(uint32) (startpoint >> `32`), (uint32) startpoint, startxlogfilename);
10999	fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n",
11000	(uint32) (stoppoint >> `32`), (uint32) stoppoint, stopxlogfilename);
11001
11002	/*
11003	* Transfer remaining lines including label and start timeline to
11004	* history file.
11005	*/
11006	fprintf(fp, "%s", remaining);
11007	fprintf(fp, "STOP TIME: %s\n", strfbuf);
11008	fprintf(fp, "STOP TIMELINE: %u\n", stoptli);
11009	if (fflush(fp) \|\| ferror(fp) \|\| FreeFile(fp))
11010	ereport(ERROR,
11011	(errcode_for_file_access(),
11012	errmsg("could not write file \"%s\": %m",
11013	histfilepath)));
11014
11015	/*
11016	* Clean out any no-longer-needed history files. As a side effect,
11017	* this will post a .ready file for the newly created history file,
11018	* notifying the archiver that history file may be archived
11019	* immediately.
11020	*/
11021	CleanupBackupHistory();
11022	}
11023
11024	/*
11025	* If archiving is enabled, wait for all the required WAL files to be
11026	* archived before returning. If archiving isn't enabled, the required WAL
11027	* needs to be transported via streaming replication (hopefully with
11028	* wal_keep_segments set high enough), or some more exotic mechanism like
11029	* polling and copying files from pg_wal with script. We have no knowledge
11030	* of those mechanisms, so it's up to the user to ensure that he gets all
11031	* the required WAL.
11032	*
11033	* We wait until both the last WAL file filled during backup and the
11034	* history file have been archived, and assume that the alphabetic sorting
11035	* property of the WAL files ensures any earlier WAL files are safely
11036	* archived as well.
11037	*
11038	* We wait forever, since archive_command is supposed to work and we
11039	* assume the admin wanted his backup to work completely. If you don't
11040	* wish to wait, then either waitforarchive should be passed in as false,
11041	* or you can set statement_timeout. Also, some notices are issued to
11042	* clue in anyone who might be doing this interactively.
11043	*/
11044
11045	if (waitforarchive &&
11046	((!backup_started_in_recovery && XLogArchivingActive()) \|\|
11047	(backup_started_in_recovery && XLogArchivingAlways())))
11048	{
11049	XLByteToPrevSeg(stoppoint, _logSegNo, wal_segment_size);
11050	XLogFileName(lastxlogfilename, stoptli, _logSegNo, wal_segment_size);
11051
11052	XLByteToSeg(startpoint, _logSegNo, wal_segment_size);
11053	BackupHistoryFileName(histfilename, stoptli, _logSegNo,
11054	startpoint, wal_segment_size);
11055
11056	seconds_before_warning = `60`;
11057	waits = `0`;
11058
11059	while (XLogArchiveIsBusy(lastxlogfilename) \|\|
11060	XLogArchiveIsBusy(histfilename))
11061	{
11062	CHECK_FOR_INTERRUPTS();
11063
11064	if (!reported_waiting && waits > `5`)
11065	{
11066	ereport(NOTICE,
11067	(errmsg("base backup done, waiting for required WAL segments to be archived")));
11068	reported_waiting = true;
11069	}
11070
11071	pg_usleep(`1000000L`);
11072
11073	if (++waits >= seconds_before_warning)
11074	{
11075	seconds_before_warning = `2`; /* This wraps in >10 years... /
11076	ereport(WARNING,
11077	(errmsg("still waiting for all required WAL segments to be archived (%d seconds elapsed)",
11078	waits),
11079	errhint("Check that your archive_command is executing properly. "
11080	"You can safely cancel this backup, "
11081	"but the database backup will not be usable without all the WAL segments.")));
11082	}
11083	}
11084
11085	ereport(NOTICE,
11086	(errmsg("all required WAL segments have been archived")));
11087	}
11088	else if (waitforarchive)
11089	ereport(NOTICE,
11090	(errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup")));
11091
11092	/*
11093	* We're done. As a convenience, return the ending WAL location.
11094	*/
11095	if (stoptli_p)
11096	*stoptli_p = stoptli;
11097	return stoppoint;
11098	}
11099
11100
11101	/*
11102	* do_pg_abort_backup: abort a running backup
11103	*
11104	* This does just the most basic steps of do_pg_stop_backup(), by taking the
11105	* system out of backup mode, thus making it a lot more safe to call from
11106	* an error handler.
11107	*
11108	* NB: This is only for aborting a non-exclusive backup that doesn't write
11109	* backup_label. A backup started with pg_start_backup() needs to be finished
11110	* with pg_stop_backup().
11111	*/
11112	void
11113	do_pg_abort_backup(void)
11114	{
11115	/*
11116	* Quick exit if session is not keeping around a non-exclusive backup
11117	* already started.
11118	*/
11119	if (sessionBackupState == SESSION_BACKUP_NONE)
11120	return;
11121
11122	WALInsertLockAcquireExclusive();
11123	Assert(XLogCtl->Insert.nonExclusiveBackups > `0`);
11124	Assert(sessionBackupState == SESSION_BACKUP_NON_EXCLUSIVE);
11125	XLogCtl->Insert.nonExclusiveBackups--;
11126
11127	if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
11128	XLogCtl->Insert.nonExclusiveBackups == `0`)
11129	{
11130	XLogCtl->Insert.forcePageWrites = false;
11131	}
11132	WALInsertLockRelease();
11133	}
11134
11135	/*
11136	* Get latest redo apply position.
11137	*
11138	* Exported to allow WALReceiver to read the pointer directly.
11139	*/
11140	XLogRecPtr
11141	GetXLogReplayRecPtr(TimeLineID *replayTLI)
11142	{
11143	XLogRecPtr recptr;
11144	TimeLineID tli;
11145
11146	SpinLockAcquire(&XLogCtl->info_lck);
11147	recptr = XLogCtl->lastReplayedEndRecPtr;
11148	tli = XLogCtl->lastReplayedTLI;
11149	SpinLockRelease(&XLogCtl->info_lck);
11150
11151	if (replayTLI)
11152	*replayTLI = tli;
11153	return recptr;
11154	}
11155
11156	/*
11157	* Get latest WAL insert pointer
11158	*/
11159	XLogRecPtr
11160	GetXLogInsertRecPtr(void)
11161	{
11162	XLogCtlInsert *Insert = &XLogCtl->Insert;
11163	uint64 current_bytepos;
11164
11165	SpinLockAcquire(&Insert->insertpos_lck);
11166	current_bytepos = Insert->CurrBytePos;
11167	SpinLockRelease(&Insert->insertpos_lck);
11168
11169	return XLogBytePosToRecPtr(current_bytepos);
11170	}
11171
11172	/*
11173	* Get latest WAL write pointer
11174	*/
11175	XLogRecPtr
11176	GetXLogWriteRecPtr(void)
11177	{
11178	SpinLockAcquire(&XLogCtl->info_lck);
11179	LogwrtResult = XLogCtl->LogwrtResult;
11180	SpinLockRelease(&XLogCtl->info_lck);
11181
11182	return LogwrtResult.Write;
11183	}
11184
11185	/*
11186	* Returns the redo pointer of the last checkpoint or restartpoint. This is
11187	* the oldest point in WAL that we still need, if we have to restart recovery.
11188	*/
11189	void
11190	GetOldestRestartPoint(XLogRecPtr oldrecptr, TimeLineID oldtli)
11191	{
11192	LWLockAcquire(ControlFileLock, LW_SHARED);
11193	*oldrecptr = ControlFile->checkPointCopy.redo;
11194	*oldtli = ControlFile->checkPointCopy.ThisTimeLineID;
11195	LWLockRelease(ControlFileLock);
11196	}
11197
11198	/*
11199	* read_backup_label: check to see if a backup_label file is present
11200	*
11201	* If we see a backup_label during recovery, we assume that we are recovering
11202	* from a backup dump file, and we therefore roll forward from the checkpoint
11203	* identified by the label file, NOT what pg_control says. This avoids the
11204	* problem that pg_control might have been archived one or more checkpoints
11205	* later than the start of the dump, and so if we rely on it as the start
11206	* point, we will fail to restore a consistent database state.
11207	*
11208	* Returns true if a backup_label was found (and fills the checkpoint
11209	* location and its REDO location into *checkPointLoc and RedoStartLSN,
11210	* respectively); returns false if not. If this backup_label came from a
11211	* streamed backup, *backupEndRequired is set to true. If this backup_label
11212	* was created during recovery, *backupFromStandby is set to true.
11213	*/
11214	static bool
11215	read_backup_label(XLogRecPtr checkPointLoc, bool backupEndRequired,
11216	bool *backupFromStandby)
11217	{
11218	char startxlogfilename[MAXFNAMELEN];
11219	TimeLineID tli_from_walseg,
11220	tli_from_file;
11221	FILE *lfp;
11222	char ch;
11223	char backuptype[`20`];
11224	char backupfrom[`20`];
11225	char backuplabel[MAXPGPATH];
11226	char backuptime[`128`];
11227	uint32 hi,
11228	lo;
11229
11230	*backupEndRequired = false;
11231	*backupFromStandby = false;
11232
11233	/*
11234	* See if label file is present
11235	*/
11236	lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
11237	if (!lfp)
11238	{
11239	if (errno != ENOENT)
11240	ereport(FATAL,
11241	(errcode_for_file_access(),
11242	errmsg("could not read file \"%s\": %m",
11243	BACKUP_LABEL_FILE)));
11244	return false; / it's not there, all is fine /
11245	}
11246
11247	/*
11248	* Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
11249	* is pretty crude, but we are not expecting any variability in the file
11250	* format).
11251	*/
11252	if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
11253	&hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != `5` \|\| ch != `'\n'`)
11254	ereport(FATAL,
11255	(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11256	errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
11257	RedoStartLSN = ((uint64) hi) << `32` \| lo;
11258	if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
11259	&hi, &lo, &ch) != `3` \|\| ch != `'\n'`)
11260	ereport(FATAL,
11261	(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11262	errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
11263	*checkPointLoc = ((uint64) hi) << `32` \| lo;
11264
11265	/*
11266	* BACKUP METHOD and BACKUP FROM lines are new in 9.2. We can't restore
11267	* from an older backup anyway, but since the information on it is not
11268	* strictly required, don't error out if it's missing for some reason.
11269	*/
11270	if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == `1`)
11271	{
11272	if (strcmp(backuptype, "streamed") == `0`)
11273	*backupEndRequired = true;
11274	}
11275
11276	if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == `1`)
11277	{
11278	if (strcmp(backupfrom, "standby") == `0`)
11279	*backupFromStandby = true;
11280	}
11281
11282	/*
11283	* Parse START TIME and LABEL. Those are not mandatory fields for recovery
11284	* but checking for their presence is useful for debugging and the next
11285	* sanity checks. Cope also with the fact that the result buffers have a
11286	* pre-allocated size, hence if the backup_label file has been generated
11287	* with strings longer than the maximum assumed here an incorrect parsing
11288	* happens. That's fine as only minor consistency checks are done
11289	* afterwards.
11290	*/
11291	if (fscanf(lfp, "START TIME: %127[^\n]\n", backuptime) == `1`)
11292	ereport(DEBUG1,
11293	(errmsg("backup time %s in file \"%s\"",
11294	backuptime, BACKUP_LABEL_FILE)));
11295
11296	if (fscanf(lfp, "LABEL: %1023[^\n]\n", backuplabel) == `1`)
11297	ereport(DEBUG1,
11298	(errmsg("backup label %s in file \"%s\"",
11299	backuplabel, BACKUP_LABEL_FILE)));
11300
11301	/*
11302	* START TIMELINE is new as of 11. Its parsing is not mandatory, still use
11303	* it as a sanity check if present.
11304	*/
11305	if (fscanf(lfp, "START TIMELINE: %u\n", &tli_from_file) == `1`)
11306	{
11307	if (tli_from_walseg != tli_from_file)
11308	ereport(FATAL,
11309	(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11310	errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE),
11311	errdetail("Timeline ID parsed is %u, but expected %u.",
11312	tli_from_file, tli_from_walseg)));
11313
11314	ereport(DEBUG1,
11315	(errmsg("backup timeline %u in file \"%s\"",
11316	tli_from_file, BACKUP_LABEL_FILE)));
11317	}
11318
11319	if (ferror(lfp) \|\| FreeFile(lfp))
11320	ereport(FATAL,
11321	(errcode_for_file_access(),
11322	errmsg("could not read file \"%s\": %m",
11323	BACKUP_LABEL_FILE)));
11324
11325	return true;
11326	}
11327
11328	/*
11329	* read_tablespace_map: check to see if a tablespace_map file is present
11330	*
11331	* If we see a tablespace_map file during recovery, we assume that we are
11332	* recovering from a backup dump file, and we therefore need to create symlinks
11333	* as per the information present in tablespace_map file.
11334	*
11335	* Returns true if a tablespace_map file was found (and fills the link
11336	* information for all the tablespace links present in file); returns false
11337	* if not.
11338	*/
11339	static bool
11340	read_tablespace_map(List **tablespaces)
11341	{
11342	tablespaceinfo *ti;
11343	FILE *lfp;
11344	char tbsoid[MAXPGPATH];
11345	char *tbslinkpath;
11346	char str[MAXPGPATH];
11347	int ch,
11348	prev_ch = -`1`,
11349	i = `0`,
11350	n;
11351
11352	/*
11353	* See if tablespace_map file is present
11354	*/
11355	lfp = AllocateFile(TABLESPACE_MAP, "r");
11356	if (!lfp)
11357	{
11358	if (errno != ENOENT)
11359	ereport(FATAL,
11360	(errcode_for_file_access(),
11361	errmsg("could not read file \"%s\": %m",
11362	TABLESPACE_MAP)));
11363	return false; / it's not there, all is fine /
11364	}
11365
11366	/*
11367	* Read and parse the link name and path lines from tablespace_map file
11368	* (this code is pretty crude, but we are not expecting any variability in
11369	* the file format). While taking backup we embed escape character '\\'
11370	* before newline in tablespace path, so that during reading of
11371	* tablespace_map file, we could distinguish newline in tablespace path
11372	* and end of line. Now while reading tablespace_map file, remove the
11373	* escape character that has been added in tablespace path during backup.
11374	*/
11375	while ((ch = fgetc(lfp)) != EOF)
11376	{
11377	if ((ch == `'\n'` \|\| ch == `'\r'`) && prev_ch != `'\\'`)
11378	{
11379	str[i] = `'\0'`;
11380	if (sscanf(str, "%s %n", tbsoid, &n) != `1`)
11381	ereport(FATAL,
11382	(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11383	errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
11384	tbslinkpath = str + n;
11385	i = `0`;
11386
11387	ti = palloc(sizeof(tablespaceinfo));
11388	ti->oid = pstrdup(tbsoid);
11389	ti->path = pstrdup(tbslinkpath);
11390
11391	tablespaces = lappend(tablespaces, ti);
11392	continue;
11393	}
11394	else if ((ch == `'\n'` \|\| ch == `'\r'`) && prev_ch == `'\\'`)
11395	str[i - `1`] = ch;
11396	else
11397	str[i++] = ch;
11398	prev_ch = ch;
11399	}
11400
11401	if (ferror(lfp) \|\| FreeFile(lfp))
11402	ereport(FATAL,
11403	(errcode_for_file_access(),
11404	errmsg("could not read file \"%s\": %m",
11405	TABLESPACE_MAP)));
11406
11407	return true;
11408	}
11409
11410	/*
11411	* Error context callback for errors occurring during rm_redo().
11412	*/
11413	static void
11414	rm_redo_error_callback(void *arg)
11415	{
11416	XLogReaderState record = (XLogReaderState ) arg;
11417	StringInfoData buf;
11418
11419	initStringInfo(&buf);
11420	xlog_outdesc(&buf, record);
11421
11422	/ translator: %s is a WAL record description /
11423	errcontext("WAL redo at %X/%X for %s",
11424	(uint32) (record->ReadRecPtr >> `32`),
11425	(uint32) record->ReadRecPtr,
11426	buf.data);
11427
11428	pfree(buf.data);
11429	}
11430
11431	/*
11432	* BackupInProgress: check if online backup mode is active
11433	*
11434	* This is done by checking for existence of the "backup_label" file.
11435	*/
11436	bool
11437	BackupInProgress(void)
11438	{
11439	struct stat stat_buf;
11440
11441	return (stat(BACKUP_LABEL_FILE, &stat_buf) == `0`);
11442	}
11443
11444	/*
11445	* CancelBackup: rename the "backup_label" and "tablespace_map"
11446	* files to cancel backup mode
11447	*
11448	* If the "backup_label" file exists, it will be renamed to "backup_label.old".
11449	* Similarly, if the "tablespace_map" file exists, it will be renamed to
11450	* "tablespace_map.old".
11451	*
11452	* Note that this will render an online backup in progress
11453	* useless. To correctly finish an online backup, pg_stop_backup must be
11454	* called.
11455	*/
11456	void
11457	CancelBackup(void)
11458	{
11459	struct stat stat_buf;
11460
11461	/ if the backup_label file is not there, return /
11462	if (stat(BACKUP_LABEL_FILE, &stat_buf) < `0`)
11463	return;
11464
11465	/ remove leftover file from previously canceled backup if it exists /
11466	unlink(BACKUP_LABEL_OLD);
11467
11468	if (durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, DEBUG1) != `0`)
11469	{
11470	ereport(WARNING,
11471	(errcode_for_file_access(),
11472	errmsg("online backup mode was not canceled"),
11473	errdetail("File \"%s\" could not be renamed to \"%s\": %m.",
11474	BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
11475	return;
11476	}
11477
11478	/ if the tablespace_map file is not there, return /
11479	if (stat(TABLESPACE_MAP, &stat_buf) < `0`)
11480	{
11481	ereport(LOG,
11482	(errmsg("online backup mode canceled"),
11483	errdetail("File \"%s\" was renamed to \"%s\".",
11484	BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
11485	return;
11486	}
11487
11488	/ remove leftover file from previously canceled backup if it exists /
11489	unlink(TABLESPACE_MAP_OLD);
11490
11491	if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == `0`)
11492	{
11493	ereport(LOG,
11494	(errmsg("online backup mode canceled"),
11495	errdetail("Files \"%s\" and \"%s\" were renamed to "
11496	"\"%s\" and \"%s\", respectively.",
11497	BACKUP_LABEL_FILE, TABLESPACE_MAP,
11498	BACKUP_LABEL_OLD, TABLESPACE_MAP_OLD)));
11499	}
11500	else
11501	{
11502	ereport(WARNING,
11503	(errcode_for_file_access(),
11504	errmsg("online backup mode canceled"),
11505	errdetail("File \"%s\" was renamed to \"%s\", but "
11506	"file \"%s\" could not be renamed to \"%s\": %m.",
11507	BACKUP_LABEL_FILE, BACKUP_LABEL_OLD,
11508	TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
11509	}
11510	}
11511
11512	/*
11513	* Read the XLOG page containing RecPtr into readBuf (if not read already).
11514	* Returns number of bytes read, if the page is read successfully, or -1
11515	* in case of errors. When errors occur, they are ereport'ed, but only
11516	* if they have not been previously reported.
11517	*
11518	* This is responsible for restoring files from archive as needed, as well
11519	* as for waiting for the requested WAL record to arrive in standby mode.
11520	*
11521	* 'emode' specifies the log level used for reporting "file not found" or
11522	* "end of WAL" situations in archive recovery, or in standby mode when a
11523	* trigger file is found. If set to WARNING or below, XLogPageRead() returns
11524	* false in those situations, on higher log levels the ereport() won't
11525	* return.
11526	*
11527	* In standby mode, if after a successful return of XLogPageRead() the
11528	* caller finds the record it's interested in to be broken, it should
11529	* ereport the error with the level determined by
11530	* emode_for_corrupt_record(), and then set lastSourceFailed
11531	* and call XLogPageRead() again with the same arguments. This lets
11532	* XLogPageRead() to try fetching the record from another source, or to
11533	* sleep and retry.
11534	*/
11535	static int
11536	XLogPageRead(XLogReaderState xlogreader, XLogRecPtr targetPagePtr, int* reqLen,
11537	XLogRecPtr targetRecPtr, char readBuf, TimeLineID readTLI)
11538	{
11539	XLogPageReadPrivate *private =
11540	(XLogPageReadPrivate *) xlogreader->private_data;
11541	int emode = private->emode;
11542	uint32 targetPageOff;
11543	XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY;
11544	int r;
11545
11546	XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size);
11547	targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size);
11548
11549	/*
11550	* See if we need to switch to a new segment because the requested record
11551	* is not in the currently open one.
11552	*/
11553	if (readFile >= `0` &&
11554	!XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size))
11555	{
11556	/*
11557	* Request a restartpoint if we've replayed too much xlog since the
11558	* last one.
11559	*/
11560	if (bgwriterLaunched)
11561	{
11562	if (XLogCheckpointNeeded(readSegNo))
11563	{
11564	(void) GetRedoRecPtr();
11565	if (XLogCheckpointNeeded(readSegNo))
11566	RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
11567	}
11568	}
11569
11570	close(readFile);
11571	readFile = -`1`;
11572	readSource = `0`;
11573	}
11574
11575	XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size);
11576
11577	retry:
11578	/ See if we need to retrieve more data /
11579	if (readFile < `0` \|\|
11580	(readSource == XLOG_FROM_STREAM &&
11581	receivedUpto < targetPagePtr + reqLen))
11582	{
11583	if (!WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
11584	private->randAccess,
11585	private->fetching_ckpt,
11586	targetRecPtr))
11587	{
11588	if (readFile >= `0`)
11589	close(readFile);
11590	readFile = -`1`;
11591	readLen = `0`;
11592	readSource = `0`;
11593
11594	return -`1`;
11595	}
11596	}
11597
11598	/*
11599	* At this point, we have the right segment open and if we're streaming we
11600	* know the requested record is in it.
11601	*/
11602	Assert(readFile != -`1`);
11603
11604	/*
11605	* If the current segment is being streamed from master, calculate how
11606	* much of the current page we have received already. We know the
11607	* requested record has been received, but this is for the benefit of
11608	* future calls, to allow quick exit at the top of this function.
11609	*/
11610	if (readSource == XLOG_FROM_STREAM)
11611	{
11612	if (((targetPagePtr) / XLOG_BLCKSZ) != (receivedUpto / XLOG_BLCKSZ))
11613	readLen = XLOG_BLCKSZ;
11614	else
11615	readLen = XLogSegmentOffset(receivedUpto, wal_segment_size) -
11616	targetPageOff;
11617	}
11618	else
11619	readLen = XLOG_BLCKSZ;
11620
11621	/ Read the requested page /
11622	readOff = targetPageOff;
11623
11624	pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
11625	r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff);
11626	if (r != XLOG_BLCKSZ)
11627	{
11628	char fname[MAXFNAMELEN];
11629	int save_errno = errno;
11630
11631	pgstat_report_wait_end();
11632	XLogFileName(fname, curFileTLI, readSegNo, wal_segment_size);
11633	if (r < `0`)
11634	{
11635	errno = save_errno;
11636	ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
11637	(errcode_for_file_access(),
11638	errmsg("could not read from log segment %s, offset %u: %m",
11639	fname, readOff)));
11640	}
11641	else
11642	ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
11643	(errcode(ERRCODE_DATA_CORRUPTED),
11644	errmsg("could not read from log segment %s, offset %u: read %d of %zu",
11645	fname, readOff, r, (Size) XLOG_BLCKSZ)));
11646	goto next_record_is_invalid;
11647	}
11648	pgstat_report_wait_end();
11649
11650	Assert(targetSegNo == readSegNo);
11651	Assert(targetPageOff == readOff);
11652	Assert(reqLen <= readLen);
11653
11654	*readTLI = curFileTLI;
11655
11656	/*
11657	* Check the page header immediately, so that we can retry immediately if
11658	* it's not valid. This may seem unnecessary, because XLogReadRecord()
11659	* validates the page header anyway, and would propagate the failure up to
11660	* ReadRecord(), which would retry. However, there's a corner case with
11661	* continuation records, if a record is split across two pages such that
11662	* we would need to read the two pages from different sources. For
11663	* example, imagine a scenario where a streaming replica is started up,
11664	* and replay reaches a record that's split across two WAL segments. The
11665	* first page is only available locally, in pg_wal, because it's already
11666	* been recycled in the master. The second page, however, is not present
11667	* in pg_wal, and we should stream it from the master. There is a recycled
11668	* WAL segment present in pg_wal, with garbage contents, however. We would
11669	* read the first page from the local WAL segment, but when reading the
11670	* second page, we would read the bogus, recycled, WAL segment. If we
11671	* didn't catch that case here, we would never recover, because
11672	* ReadRecord() would retry reading the whole record from the beginning.
11673	*
11674	* Of course, this only catches errors in the page header, which is what
11675	* happens in the case of a recycled WAL segment. Other kinds of errors or
11676	* corruption still has the same problem. But this at least fixes the
11677	* common case, which can happen as part of normal operation.
11678	*
11679	* Validating the page header is cheap enough that doing it twice
11680	* shouldn't be a big deal from a performance point of view.
11681	*/
11682	if (!XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf))
11683	{
11684	/ reset any error XLogReaderValidatePageHeader() might have set /
11685	xlogreader->errormsg_buf[`0`] = `'\0'`;
11686	goto next_record_is_invalid;
11687	}
11688
11689	return readLen;
11690
11691	next_record_is_invalid:
11692	lastSourceFailed = true;
11693
11694	if (readFile >= `0`)
11695	close(readFile);
11696	readFile = -`1`;
11697	readLen = `0`;
11698	readSource = `0`;
11699
11700	/ In standby-mode, keep trying /
11701	if (StandbyMode)
11702	goto retry;
11703	else
11704	return -`1`;
11705	}
11706
11707	/*
11708	* Open the WAL segment containing WAL location 'RecPtr'.
11709	*
11710	* The segment can be fetched via restore_command, or via walreceiver having
11711	* streamed the record, or it can already be present in pg_wal. Checking
11712	* pg_wal is mainly for crash recovery, but it will be polled in standby mode
11713	* too, in case someone copies a new segment directly to pg_wal. That is not
11714	* documented or recommended, though.
11715	*
11716	* If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
11717	* prepare to read WAL starting from RedoStartLSN after this.
11718	*
11719	* 'RecPtr' might not point to the beginning of the record we're interested
11720	* in, it might also point to the page or segment header. In that case,
11721	* 'tliRecPtr' is the position of the WAL record we're interested in. It is
11722	* used to decide which timeline to stream the requested WAL from.
11723	*
11724	* If the record is not immediately available, the function returns false
11725	* if we're not in standby mode. In standby mode, waits for it to become
11726	* available.
11727	*
11728	* When the requested record becomes available, the function opens the file
11729	* containing it (if not open already), and returns true. When end of standby
11730	* mode is triggered by the user, and there is no more WAL available, returns
11731	* false.
11732	*/
11733	static bool
11734	WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
11735	bool fetching_ckpt, XLogRecPtr tliRecPtr)
11736	{
11737	static TimestampTz last_fail_time = `0`;
11738	TimestampTz now;
11739	bool streaming_reply_sent = false;
11740
11741	/-------*
11742	* Standby mode is implemented by a state machine:
11743	*
11744	* 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just
11745	* pg_wal (XLOG_FROM_PG_WAL)
11746	* 2. Check trigger file
11747	* 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
11748	* 4. Rescan timelines
11749	* 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
11750	*
11751	* Failure to read from the current source advances the state machine to
11752	* the next state.
11753	*
11754	* 'currentSource' indicates the current state. There are no currentSource
11755	* values for "check trigger", "rescan timelines", and "sleep" states,
11756	* those actions are taken when reading from the previous source fails, as
11757	* part of advancing to the next state.
11758	*-------
11759	*/
11760	if (!InArchiveRecovery)
11761	currentSource = XLOG_FROM_PG_WAL;
11762	else if (currentSource == `0`)
11763	currentSource = XLOG_FROM_ARCHIVE;
11764
11765	for (;;)
11766	{
11767	int oldSource = currentSource;
11768
11769	/*
11770	* First check if we failed to read from the current source, and
11771	* advance the state machine if so. The failure to read might've
11772	* happened outside this function, e.g when a CRC check fails on a
11773	* record, or within this loop.
11774	*/
11775	if (lastSourceFailed)
11776	{
11777	switch (currentSource)
11778	{
11779	case XLOG_FROM_ARCHIVE:
11780	case XLOG_FROM_PG_WAL:
11781
11782	/*
11783	* Check to see if the trigger file exists. Note that we
11784	* do this only after failure, so when you create the
11785	* trigger file, we still finish replaying as much as we
11786	* can from archive and pg_wal before failover.
11787	*/
11788	if (StandbyMode && CheckForStandbyTrigger())
11789	{
11790	ShutdownWalRcv();
11791	return false;
11792	}
11793
11794	/*
11795	* Not in standby mode, and we've now tried the archive
11796	* and pg_wal.
11797	*/
11798	if (!StandbyMode)
11799	return false;
11800
11801	/*
11802	* If primary_conninfo is set, launch walreceiver to try
11803	* to stream the missing WAL.
11804	*
11805	* If fetching_ckpt is true, RecPtr points to the initial
11806	* checkpoint location. In that case, we use RedoStartLSN
11807	* as the streaming start position instead of RecPtr, so
11808	* that when we later jump backwards to start redo at
11809	* RedoStartLSN, we will have the logs streamed already.
11810	*/
11811	if (PrimaryConnInfo && strcmp(PrimaryConnInfo, "") != `0`)
11812	{
11813	XLogRecPtr ptr;
11814	TimeLineID tli;
11815
11816	if (fetching_ckpt)
11817	{
11818	ptr = RedoStartLSN;
11819	tli = ControlFile->checkPointCopy.ThisTimeLineID;
11820	}
11821	else
11822	{
11823	ptr = RecPtr;
11824
11825	/*
11826	* Use the record begin position to determine the
11827	* TLI, rather than the position we're reading.
11828	*/
11829	tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
11830
11831	if (curFileTLI > `0` && tli < curFileTLI)
11832	elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
11833	(uint32) (tliRecPtr >> `32`),
11834	(uint32) tliRecPtr,
11835	tli, curFileTLI);
11836	}
11837	curFileTLI = tli;
11838	RequestXLogStreaming(tli, ptr, PrimaryConnInfo,
11839	PrimarySlotName);
11840	receivedUpto = `0`;
11841	}
11842
11843	/*
11844	* Move to XLOG_FROM_STREAM state in either case. We'll
11845	* get immediate failure if we didn't launch walreceiver,
11846	* and move on to the next state.
11847	*/
11848	currentSource = XLOG_FROM_STREAM;
11849	break;
11850
11851	case XLOG_FROM_STREAM:
11852
11853	/*
11854	* Failure while streaming. Most likely, we got here
11855	* because streaming replication was terminated, or
11856	* promotion was triggered. But we also get here if we
11857	* find an invalid record in the WAL streamed from master,
11858	* in which case something is seriously wrong. There's
11859	* little chance that the problem will just go away, but
11860	* PANIC is not good for availability either, especially
11861	* in hot standby mode. So, we treat that the same as
11862	* disconnection, and retry from archive/pg_wal again. The
11863	* WAL in the archive should be identical to what was
11864	* streamed, so it's unlikely that it helps, but one can
11865	* hope...
11866	*/
11867
11868	/*
11869	* Before we leave XLOG_FROM_STREAM state, make sure that
11870	* walreceiver is not active, so that it won't overwrite
11871	* WAL that we restore from archive.
11872	*/
11873	if (WalRcvStreaming())
11874	ShutdownWalRcv();
11875
11876	/*
11877	* Before we sleep, re-scan for possible new timelines if
11878	* we were requested to recover to the latest timeline.
11879	*/
11880	if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST)
11881	{
11882	if (rescanLatestTimeLine())
11883	{
11884	currentSource = XLOG_FROM_ARCHIVE;
11885	break;
11886	}
11887	}
11888
11889	/*
11890	* XLOG_FROM_STREAM is the last state in our state
11891	* machine, so we've exhausted all the options for
11892	* obtaining the requested WAL. We're going to loop back
11893	* and retry from the archive, but if it hasn't been long
11894	* since last attempt, sleep wal_retrieve_retry_interval
11895	* milliseconds to avoid busy-waiting.
11896	*/
11897	now = GetCurrentTimestamp();
11898	if (!TimestampDifferenceExceeds(last_fail_time, now,
11899	wal_retrieve_retry_interval))
11900	{
11901	long secs,
11902	wait_time;
11903	int usecs;
11904
11905	TimestampDifference(last_fail_time, now, &secs, &usecs);
11906	wait_time = wal_retrieve_retry_interval -
11907	(secs * `1000` + usecs / `1000`);
11908
11909	(void) WaitLatch(&XLogCtl->recoveryWakeupLatch,
11910	WL_LATCH_SET \| WL_TIMEOUT \|
11911	WL_EXIT_ON_PM_DEATH,
11912	wait_time,
11913	WAIT_EVENT_RECOVERY_WAL_STREAM);
11914	ResetLatch(&XLogCtl->recoveryWakeupLatch);
11915	now = GetCurrentTimestamp();
11916	}
11917	last_fail_time = now;
11918	currentSource = XLOG_FROM_ARCHIVE;
11919	break;
11920
11921	default:
11922	elog(ERROR, "unexpected WAL source %d", currentSource);
11923	}
11924	}
11925	else if (currentSource == XLOG_FROM_PG_WAL)
11926	{
11927	/*
11928	* We just successfully read a file in pg_wal. We prefer files in
11929	* the archive over ones in pg_wal, so try the next file again
11930	* from the archive first.
11931	*/
11932	if (InArchiveRecovery)
11933	currentSource = XLOG_FROM_ARCHIVE;
11934	}
11935
11936	if (currentSource != oldSource)
11937	elog(DEBUG2, "switched WAL source from %s to %s after %s",
11938	xlogSourceNames[oldSource], xlogSourceNames[currentSource],
11939	lastSourceFailed ? "failure" : "success");
11940
11941	/*
11942	* We've now handled possible failure. Try to read from the chosen
11943	* source.
11944	*/
11945	lastSourceFailed = false;
11946
11947	switch (currentSource)
11948	{
11949	case XLOG_FROM_ARCHIVE:
11950	case XLOG_FROM_PG_WAL:
11951	/* Close any old file we might have open. */
11952	if (readFile >= `0`)
11953	{
11954	close(readFile);
11955	readFile = -`1`;
11956	}
11957	/* Reset curFileTLI if random fetch. */
11958	if (randAccess)
11959	curFileTLI = `0`;
11960
11961	/*
11962	* Try to restore the file from archive, or read an existing
11963	* file from pg_wal.
11964	*/
11965	readFile = XLogFileReadAnyTLI(readSegNo, DEBUG2,
11966	currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY :
11967	currentSource);
11968	if (readFile >= `0`)
11969	return true; / success! /
11970
11971	/*
11972	* Nope, not found in archive or pg_wal.
11973	*/
11974	lastSourceFailed = true;
11975	break;
11976
11977	case XLOG_FROM_STREAM:
11978	{
11979	bool havedata;
11980
11981	/*
11982	* Check if WAL receiver is still active.
11983	*/
11984	if (!WalRcvStreaming())
11985	{
11986	lastSourceFailed = true;
11987	break;
11988	}
11989
11990	/*
11991	* Walreceiver is active, so see if new data has arrived.
11992	*
11993	* We only advance XLogReceiptTime when we obtain fresh
11994	* WAL from walreceiver and observe that we had already
11995	* processed everything before the most recent "chunk"
11996	* that it flushed to disk. In steady state where we are
11997	* keeping up with the incoming data, XLogReceiptTime will
11998	* be updated on each cycle. When we are behind,
11999	* XLogReceiptTime will not advance, so the grace time
12000	* allotted to conflicting queries will decrease.
12001	*/
12002	if (RecPtr < receivedUpto)
12003	havedata = true;
12004	else
12005	{
12006	XLogRecPtr latestChunkStart;
12007
12008	receivedUpto = GetWalRcvWriteRecPtr(&latestChunkStart, &receiveTLI);
12009	if (RecPtr < receivedUpto && receiveTLI == curFileTLI)
12010	{
12011	havedata = true;
12012	if (latestChunkStart <= RecPtr)
12013	{
12014	XLogReceiptTime = GetCurrentTimestamp();
12015	SetCurrentChunkStartTime(XLogReceiptTime);
12016	}
12017	}
12018	else
12019	havedata = false;
12020	}
12021	if (havedata)
12022	{
12023	/*
12024	* Great, streamed far enough. Open the file if it's
12025	* not open already. Also read the timeline history
12026	* file if we haven't initialized timeline history
12027	* yet; it should be streamed over and present in
12028	* pg_wal by now. Use XLOG_FROM_STREAM so that source
12029	* info is set correctly and XLogReceiptTime isn't
12030	* changed.
12031	*/
12032	if (readFile < `0`)
12033	{
12034	if (!expectedTLEs)
12035	expectedTLEs = readTimeLineHistory(receiveTLI);
12036	readFile = XLogFileRead(readSegNo, PANIC,
12037	receiveTLI,
12038	XLOG_FROM_STREAM, false);
12039	Assert(readFile >= `0`);
12040	}
12041	else
12042	{
12043	/* just make sure source info is correct... */
12044	readSource = XLOG_FROM_STREAM;
12045	XLogReceiptSource = XLOG_FROM_STREAM;
12046	return true;
12047	}
12048	break;
12049	}
12050
12051	/*
12052	* Data not here yet. Check for trigger, then wait for
12053	* walreceiver to wake us up when new WAL arrives.
12054	*/
12055	if (CheckForStandbyTrigger())
12056	{
12057	/*
12058	* Note that we don't "return false" immediately here.
12059	* After being triggered, we still want to replay all
12060	* the WAL that was already streamed. It's in pg_wal
12061	* now, so we just treat this as a failure, and the
12062	* state machine will move on to replay the streamed
12063	* WAL from pg_wal, and then recheck the trigger and
12064	* exit replay.
12065	*/
12066	lastSourceFailed = true;
12067	break;
12068	}
12069
12070	/*
12071	* Since we have replayed everything we have received so
12072	* far and are about to start waiting for more WAL, let's
12073	* tell the upstream server our replay location now so
12074	* that pg_stat_replication doesn't show stale
12075	* information.
12076	*/
12077	if (!streaming_reply_sent)
12078	{
12079	WalRcvForceReply();
12080	streaming_reply_sent = true;
12081	}
12082
12083	/*
12084	* Wait for more WAL to arrive. Time out after 5 seconds
12085	* to react to a trigger file promptly and to check if the
12086	* WAL receiver is still active.
12087	*/
12088	(void) WaitLatch(&XLogCtl->recoveryWakeupLatch,
12089	WL_LATCH_SET \| WL_TIMEOUT \|
12090	WL_EXIT_ON_PM_DEATH,
12091	`5000L`, WAIT_EVENT_RECOVERY_WAL_ALL);
12092	ResetLatch(&XLogCtl->recoveryWakeupLatch);
12093	break;
12094	}
12095
12096	default:
12097	elog(ERROR, "unexpected WAL source %d", currentSource);
12098	}
12099
12100	/*
12101	* This possibly-long loop needs to handle interrupts of startup
12102	* process.
12103	*/
12104	HandleStartupProcInterrupts();
12105	}
12106
12107	return false; / not reached /
12108	}
12109
12110	/*
12111	* Determine what log level should be used to report a corrupt WAL record
12112	* in the current WAL page, previously read by XLogPageRead().
12113	*
12114	* 'emode' is the error mode that would be used to report a file-not-found
12115	* or legitimate end-of-WAL situation. Generally, we use it as-is, but if
12116	* we're retrying the exact same record that we've tried previously, only
12117	* complain the first time to keep the noise down. However, we only do when
12118	* reading from pg_wal, because we don't expect any invalid records in archive
12119	* or in records streamed from master. Files in the archive should be complete,
12120	* and we should never hit the end of WAL because we stop and wait for more WAL
12121	* to arrive before replaying it.
12122	*
12123	* NOTE: This function remembers the RecPtr value it was last called with,
12124	* to suppress repeated messages about the same record. Only call this when
12125	* you are about to ereport(), or you might cause a later message to be
12126	* erroneously suppressed.
12127	*/
12128	static int
12129	emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
12130	{
12131	static XLogRecPtr lastComplaint = `0`;
12132
12133	if (readSource == XLOG_FROM_PG_WAL && emode == LOG)
12134	{
12135	if (RecPtr == lastComplaint)
12136	emode = DEBUG1;
12137	else
12138	lastComplaint = RecPtr;
12139	}
12140	return emode;
12141	}
12142
12143	/*
12144	* Check to see whether the user-specified trigger file exists and whether a
12145	* promote request has arrived. If either condition holds, return true.
12146	*/
12147	static bool
12148	CheckForStandbyTrigger(void)
12149	{
12150	struct stat stat_buf;
12151	static bool triggered = false;
12152
12153	if (triggered)
12154	return true;
12155
12156	if (IsPromoteTriggered())
12157	{
12158	/*
12159	* In 9.1 and 9.2 the postmaster unlinked the promote file inside the
12160	* signal handler. It now leaves the file in place and lets the
12161	* Startup process do the unlink. This allows Startup to know whether
12162	* it should create a full checkpoint before starting up (fallback
12163	* mode). Fast promotion takes precedence.
12164	*/
12165	if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == `0`)
12166	{
12167	unlink(PROMOTE_SIGNAL_FILE);
12168	unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
12169	fast_promote = true;
12170	}
12171	else if (stat(FALLBACK_PROMOTE_SIGNAL_FILE, &stat_buf) == `0`)
12172	{
12173	unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
12174	fast_promote = false;
12175	}
12176
12177	ereport(LOG, (errmsg("received promote request")));
12178
12179	ResetPromoteTriggered();
12180	triggered = true;
12181	return true;
12182	}
12183
12184	if (PromoteTriggerFile == NULL \|\| strcmp(PromoteTriggerFile, "") == `0`)
12185	return false;
12186
12187	if (stat(PromoteTriggerFile, &stat_buf) == `0`)
12188	{
12189	ereport(LOG,
12190	(errmsg("promote trigger file found: %s", PromoteTriggerFile)));
12191	unlink(PromoteTriggerFile);
12192	triggered = true;
12193	fast_promote = true;
12194	return true;
12195	}
12196	else if (errno != ENOENT)
12197	ereport(ERROR,
12198	(errcode_for_file_access(),
12199	errmsg("could not stat promote trigger file \"%s\": %m",
12200	PromoteTriggerFile)));
12201
12202	return false;
12203	}
12204
12205	/*
12206	* Remove the files signaling a standby promotion request.
12207	*/
12208	void
12209	RemovePromoteSignalFiles(void)
12210	{
12211	unlink(PROMOTE_SIGNAL_FILE);
12212	unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
12213	}
12214
12215	/*
12216	* Check to see if a promote request has arrived. Should be
12217	* called by postmaster after receiving SIGUSR1.
12218	*/
12219	bool
12220	CheckPromoteSignal(void)
12221	{
12222	struct stat stat_buf;
12223
12224	if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == `0` \|\|
12225	stat(FALLBACK_PROMOTE_SIGNAL_FILE, &stat_buf) == `0`)
12226	return true;
12227
12228	return false;
12229	}
12230
12231	/*
12232	* Wake up startup process to replay newly arrived WAL, or to notice that
12233	* failover has been requested.
12234	*/
12235	void
12236	WakeupRecovery(void)
12237	{
12238	SetLatch(&XLogCtl->recoveryWakeupLatch);
12239	}
12240
12241	/*
12242	* Update the WalWriterSleeping flag.
12243	*/
12244	void
12245	SetWalWriterSleeping(bool sleeping)
12246	{
12247	SpinLockAcquire(&XLogCtl->info_lck);
12248	XLogCtl->WalWriterSleeping = sleeping;
12249	SpinLockRelease(&XLogCtl->info_lck);
12250	}
12251
12252	/*
12253	* Schedule a walreceiver wakeup in the main recovery loop.
12254	*/
12255	void
12256	XLogRequestWalReceiverReply(void)
12257	{
12258	doRequestWalReceiverReply = true;
12259	}
12260

Browse the source code of PostgreSQL/src/backend/access/transam/xlog.c