multixact.c source code [PostgreSQL/src/backend/access/transam/multixact.c]

1	/*-------------------------------------------------------------------------
2	*
3	* multixact.c
4	* PostgreSQL multi-transaction-log manager
5	*
6	* The pg_multixact manager is a pg_xact-like manager that stores an array of
7	* MultiXactMember for each MultiXactId. It is a fundamental part of the
8	* shared-row-lock implementation. Each MultiXactMember is comprised of a
9	* TransactionId and a set of flag bits. The name is a bit historical:
10	* originally, a MultiXactId consisted of more than one TransactionId (except
11	* in rare corner cases), hence "multi". Nowadays, however, it's perfectly
12	* legitimate to have MultiXactIds that only include a single Xid.
13	*
14	* The meaning of the flag bits is opaque to this module, but they are mostly
15	* used in heapam.c to identify lock modes that each of the member transactions
16	* is holding on any given tuple. This module just contains support to store
17	* and retrieve the arrays.
18	*
19	* We use two SLRU areas, one for storing the offsets at which the data
20	* starts for each MultiXactId in the other one. This trick allows us to
21	* store variable length arrays of TransactionIds. (We could alternatively
22	* use one area containing counts and TransactionIds, with valid MultiXactId
23	* values pointing at slots containing counts; but that way seems less robust
24	* since it would get completely confused if someone inquired about a bogus
25	* MultiXactId that pointed to an intermediate slot containing an XID.)
26	*
27	* XLOG interactions: this module generates a record whenever a new OFFSETs or
28	* MEMBERs page is initialized to zeroes, as well as an
29	* XLOG_MULTIXACT_CREATE_ID record whenever a new MultiXactId is defined.
30	* This module ignores the WAL rule "write xlog before data," because it
31	* suffices that actions recording a MultiXactId in a heap xmax do follow that
32	* rule. The only way for the MXID to be referenced from any data page is for
33	* heap_lock_tuple() or heap_update() to have put it there, and each generates
34	* an XLOG record that must follow ours. The normal LSN interlock between the
35	* data page and that XLOG record will ensure that our XLOG record reaches
36	* disk first. If the SLRU members/offsets data reaches disk sooner than the
37	* XLOG records, we do not care; after recovery, no xmax will refer to it. On
38	* the flip side, to ensure that all referenced entries _do_ reach disk, this
39	* module's XLOG records completely rebuild the data entered since the last
40	* checkpoint. We flush and sync all dirty OFFSETs and MEMBERs pages to disk
41	* before each checkpoint is considered complete.
42	*
43	* Like clog.c, and unlike subtrans.c, we have to preserve state across
44	* crashes and ensure that MXID and offset numbering increases monotonically
45	* across a crash. We do this in the same way as it's done for transaction
46	* IDs: the WAL record is guaranteed to contain evidence of every MXID we
47	* could need to worry about, and we just make sure that at the end of
48	* replay, the next-MXID and next-offset counters are at least as large as
49	* anything we saw during replay.
50	*
51	* We are able to remove segments no longer necessary by carefully tracking
52	* each table's used values: during vacuum, any multixact older than a certain
53	* value is removed; the cutoff value is stored in pg_class. The minimum value
54	* across all tables in each database is stored in pg_database, and the global
55	* minimum across all databases is part of pg_control and is kept in shared
56	* memory. Whenever that minimum is advanced, the SLRUs are truncated.
57	*
58	* When new multixactid values are to be created, care is taken that the
59	* counter does not fall within the wraparound horizon considering the global
60	* minimum value.
61	*
62	* Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
63	* Portions Copyright (c) 1994, Regents of the University of California
64	*
65	* src/backend/access/transam/multixact.c
66	*
67	*-------------------------------------------------------------------------
68	*/
69	#include "postgres.h"
70
71	#include "access/multixact.h"
72	#include "access/slru.h"
73	#include "access/transam.h"
74	#include "access/twophase.h"
75	#include "access/twophase_rmgr.h"
76	#include "access/xact.h"
77	#include "access/xlog.h"
78	#include "access/xloginsert.h"
79	#include "catalog/pg_type.h"
80	#include "commands/dbcommands.h"
81	#include "funcapi.h"
82	#include "lib/ilist.h"
83	#include "miscadmin.h"
84	#include "pg_trace.h"
85	#include "postmaster/autovacuum.h"
86	#include "storage/lmgr.h"
87	#include "storage/pmsignal.h"
88	#include "storage/proc.h"
89	#include "storage/procarray.h"
90	#include "utils/builtins.h"
91	#include "utils/memutils.h"
92	#include "utils/snapmgr.h"
93
94
/*
 * Defines for MultiXactOffset page sizes.  A page is the same BLCKSZ as is
 * used everywhere else in Postgres.
 *
 * Note: because MultiXactOffsets are 32 bits and wrap around at 0xFFFFFFFF,
 * MultiXact page numbering also wraps around at
 * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE, and segment numbering at
 * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE/SLRU_PAGES_PER_SEGMENT.  We need
 * take no explicit notice of that fact in this module, except when comparing
 * segment and page numbers in TruncateMultiXact (see
 * MultiXactOffsetPagePrecedes).
 */

/* We need four bytes per offset */
#define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset))

/* Offsets page number that holds the entry for the given MultiXactId */
#define MultiXactIdToOffsetPage(xid) \
    ((xid) / (MultiXactOffset) MULTIXACT_OFFSETS_PER_PAGE)
/* Index of the given MultiXactId's entry within its offsets page */
#define MultiXactIdToOffsetEntry(xid) \
    ((xid) % (MultiXactOffset) MULTIXACT_OFFSETS_PER_PAGE)
/* Offsets segment number that holds the page for the given MultiXactId */
#define MultiXactIdToOffsetSegment(xid) (MultiXactIdToOffsetPage(xid) / SLRU_PAGES_PER_SEGMENT)
116
/*
 * The situation for members is a bit more complex: we store one byte of
 * additional flag bits for each TransactionId.  To do this without getting
 * into alignment issues, we store four bytes of flags, and then the
 * corresponding 4 Xids.  Each such 5-word (20-byte) set we call a "group", and
 * are stored as a whole in pages.  Thus, with 8kB BLCKSZ, we keep 409 groups
 * per page.  This wastes 12 bytes per page, but that's OK -- simplicity (and
 * performance) trumps space efficiency here.
 *
 * Note that the "offset" macros work with byte offset, not array indexes, so
 * arithmetic must be done using "char *" pointers.
 */
/* We need eight bits per xact, so one xact fits in a byte */
#define MXACT_MEMBER_BITS_PER_XACT 8
#define MXACT_MEMBER_FLAGS_PER_BYTE 1
#define MXACT_MEMBER_XACT_BITMASK ((1 << MXACT_MEMBER_BITS_PER_XACT) - 1)

/* how many full bytes of flags are there in a group? */
#define MULTIXACT_FLAGBYTES_PER_GROUP 4
#define MULTIXACT_MEMBERS_PER_MEMBERGROUP \
    (MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE)
/* size in bytes of a complete group */
#define MULTIXACT_MEMBERGROUP_SIZE \
    (sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP)
#define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE)
#define MULTIXACT_MEMBERS_PER_PAGE \
    (MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP)

/*
 * Because the number of items per page is not a divisor of the last item
 * number (member 0xFFFFFFFF), the last segment does not use the maximum number
 * of pages, and moreover the last used page therein does not use the same
 * number of items as previous pages.  (Another way to say it is that the
 * 0xFFFFFFFF member is somewhere in the middle of the last page, so the page
 * has some empty space after that item.)
 *
 * This constant is the number of members in the last page of the last segment.
 */
#define MAX_MEMBERS_IN_LAST_MEMBERS_PAGE \
    ((uint32) ((0xFFFFFFFF % MULTIXACT_MEMBERS_PER_PAGE) + 1))

/* page in which a member is to be found */
#define MXOffsetToMemberPage(xid) ((xid) / (TransactionId) MULTIXACT_MEMBERS_PER_PAGE)
/* segment in which a member is to be found */
#define MXOffsetToMemberSegment(xid) (MXOffsetToMemberPage(xid) / SLRU_PAGES_PER_SEGMENT)

/* Location (byte offset within page) of flag word for a given member */
#define MXOffsetToFlagsOffset(xid) \
    ((((xid) / (TransactionId) MULTIXACT_MEMBERS_PER_MEMBERGROUP) % \
      (TransactionId) MULTIXACT_MEMBERGROUPS_PER_PAGE) * \
     (TransactionId) MULTIXACT_MEMBERGROUP_SIZE)
/* Bit position, within the group's flag word, of a given member's flags */
#define MXOffsetToFlagsBitShift(xid) \
    (((xid) % (TransactionId) MULTIXACT_MEMBERS_PER_MEMBERGROUP) * \
     MXACT_MEMBER_BITS_PER_XACT)

/* Location (byte offset within page) of TransactionId of given member */
#define MXOffsetToMemberOffset(xid) \
    (MXOffsetToFlagsOffset(xid) + MULTIXACT_FLAGBYTES_PER_GROUP + \
     ((xid) % MULTIXACT_MEMBERS_PER_MEMBERGROUP) * sizeof(TransactionId))
175
/*
 * Multixact members wraparound thresholds: half and three quarters of the
 * member offset space, respectively.
 */
#define MULTIXACT_MEMBER_SAFE_THRESHOLD (MaxMultiXactOffset / 2)
#define MULTIXACT_MEMBER_DANGER_THRESHOLD \
    (MaxMultiXactOffset - MaxMultiXactOffset / 4)

/*
 * MultiXactId immediately before the given one; stepping back from
 * FirstMultiXactId yields MaxMultiXactId.
 */
#define PreviousMultiXactId(xid) \
    ((xid) == FirstMultiXactId ? MaxMultiXactId : (xid) - 1)
183
184	/*
185	* Links to shared-memory data structures for MultiXact control
186	*/
187	static SlruCtlData MultiXactOffsetCtlData;
188	static SlruCtlData MultiXactMemberCtlData;
189
190	#define MultiXactOffsetCtl (&MultiXactOffsetCtlData)
191	#define MultiXactMemberCtl (&MultiXactMemberCtlData)
192
193	/*
194	* MultiXact state shared across all backends. All this state is protected
195	* by MultiXactGenLock. (We also use MultiXactOffsetControlLock and
196	* MultiXactMemberControlLock to guard accesses to the two sets of SLRU
197	* buffers. For concurrency's sake, we avoid holding more than one of these
198	* locks at a time.)
199	*/
200	typedef struct MultiXactStateData
201	{
202	/ next-to-be-assigned MultiXactId /
203	MultiXactId nextMXact;
204
205	/ next-to-be-assigned offset /
206	MultiXactOffset nextOffset;
207
208	/ Have we completed multixact startup? /
209	bool finishedStartup;
210
211	/*
212	* Oldest multixact that is still potentially referenced by a relation.
213	* Anything older than this should not be consulted. These values are
214	* updated by vacuum.
215	*/
216	MultiXactId oldestMultiXactId;
217	Oid oldestMultiXactDB;
218
219	/*
220	* Oldest multixact offset that is potentially referenced by a multixact
221	* referenced by a relation. We don't always know this value, so there's
222	* a flag here to indicate whether or not we currently do.
223	*/
224	MultiXactOffset oldestOffset;
225	bool oldestOffsetKnown;
226
227	/ support for anti-wraparound measures /
228	MultiXactId multiVacLimit;
229	MultiXactId multiWarnLimit;
230	MultiXactId multiStopLimit;
231	MultiXactId multiWrapLimit;
232
233	/ support for members anti-wraparound measures /
234	MultiXactOffset offsetStopLimit; / known if oldestOffsetKnown /
235
236	/*
237	* Per-backend data starts here. We have two arrays stored in the area
238	* immediately following the MultiXactStateData struct. Each is indexed by
239	* BackendId.
240	*
241	* In both arrays, there's a slot for all normal backends (1..MaxBackends)
242	* followed by a slot for max_prepared_xacts prepared transactions. Valid
243	* BackendIds start from 1; element zero of each array is never used.
244	*
245	* OldestMemberMXactId[k] is the oldest MultiXactId each backend's current
246	* transaction(s) could possibly be a member of, or InvalidMultiXactId
247	* when the backend has no live transaction that could possibly be a
248	* member of a MultiXact. Each backend sets its entry to the current
249	* nextMXact counter just before first acquiring a shared lock in a given
250	* transaction, and clears it at transaction end. (This works because only
251	* during or after acquiring a shared lock could an XID possibly become a
252	* member of a MultiXact, and that MultiXact would have to be created
253	* during or after the lock acquisition.)
254	*
255	* OldestVisibleMXactId[k] is the oldest MultiXactId each backend's
256	* current transaction(s) think is potentially live, or InvalidMultiXactId
257	* when not in a transaction or not in a transaction that's paid any
258	* attention to MultiXacts yet. This is computed when first needed in a
259	* given transaction, and cleared at transaction end. We can compute it
260	* as the minimum of the valid OldestMemberMXactId[] entries at the time
261	* we compute it (using nextMXact if none are valid). Each backend is
262	* required not to attempt to access any SLRU data for MultiXactIds older
263	* than its own OldestVisibleMXactId[] setting; this is necessary because
264	* the checkpointer could truncate away such data at any instant.
265	*
266	* The oldest valid value among all of the OldestMemberMXactId[] and
267	* OldestVisibleMXactId[] entries is considered by vacuum as the earliest
268	* possible value still having any live member transaction. Subtracting
269	* vacuum_multixact_freeze_min_age from that value we obtain the freezing
270	* point for multixacts for that table. Any value older than that is
271	* removed from tuple headers (or "frozen"; see FreezeMultiXactId. Note
272	* that multis that have member xids that are older than the cutoff point
273	* for xids must also be frozen, even if the multis themselves are newer
274	* than the multixid cutoff point). Whenever a full table vacuum happens,
275	* the freezing point so computed is used as the new pg_class.relminmxid
276	* value. The minimum of all those values in a database is stored as
277	* pg_database.datminmxid. In turn, the minimum of all of those values is
278	* stored in pg_control and used as truncation point for pg_multixact. At
279	* checkpoint or restartpoint, unneeded segments are removed.
280	*/
281	MultiXactId perBackendXactIds[FLEXIBLE_ARRAY_MEMBER];
282	} MultiXactStateData;
283
284	/*
285	* Last element of OldestMemberMXactID and OldestVisibleMXactId arrays.
286	* Valid elements are (1..MaxOldestSlot); element 0 is never used.
287	*/
288	#define MaxOldestSlot (MaxBackends + max_prepared_xacts)
289
290	/ Pointers to the state data in shared memory /
291	static MultiXactStateData *MultiXactState;
292	static MultiXactId *OldestMemberMXactId;
293	static MultiXactId *OldestVisibleMXactId;
294
295
296	/*
297	* Definitions for the backend-local MultiXactId cache.
298	*
299	* We use this cache to store known MultiXacts, so we don't need to go to
300	* SLRU areas every time.
301	*
302	* The cache lasts for the duration of a single transaction, the rationale
303	* for this being that most entries will contain our own TransactionId and
304	* so they will be uninteresting by the time our next transaction starts.
305	* (XXX not clear that this is correct --- other members of the MultiXact
306	* could hang around longer than we did. However, it's not clear what a
307	* better policy for flushing old cache entries would be.) FIXME actually
308	* this is plain wrong now that multixact's may contain update Xids.
309	*
310	* We allocate the cache entries in a memory context that is deleted at
311	* transaction end, so we don't need to do retail freeing of entries.
312	*/
313	typedef struct mXactCacheEnt
314	{
315	MultiXactId multi;
316	int nmembers;
317	dlist_node node;
318	MultiXactMember members[FLEXIBLE_ARRAY_MEMBER];
319	} mXactCacheEnt;
320
321	#define MAX_CACHE_ENTRIES 256
322	static dlist_head MXactCache = DLIST_STATIC_INIT(MXactCache);
323	static int MXactCacheMembers = `0`;
324	static MemoryContext MXactContext = NULL;
325
/*
 * Debug logging helpers: each debug_elogN forwards its N arguments to elog()
 * when MULTIXACT_DEBUG is defined, and expands to nothing otherwise (so the
 * arguments are not evaluated in non-debug builds).
 */
#ifndef MULTIXACT_DEBUG
#define debug_elog2(level, fmt)
#define debug_elog3(level, fmt, a1)
#define debug_elog4(level, fmt, a1, a2)
#define debug_elog5(level, fmt, a1, a2, a3)
#define debug_elog6(level, fmt, a1, a2, a3, a4)
#else
#define debug_elog2(level, fmt) elog(level, fmt)
#define debug_elog3(level, fmt, a1) elog(level, fmt, a1)
#define debug_elog4(level, fmt, a1, a2) elog(level, fmt, a1, a2)
#define debug_elog5(level, fmt, a1, a2, a3) elog(level, fmt, a1, a2, a3)
#define debug_elog6(level, fmt, a1, a2, a3, a4) elog(level, fmt, a1, a2, a3, a4)
#endif
339
340	/ internal MultiXactId management /
341	static void MultiXactIdSetOldestVisible(void);
342	static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
343	int nmembers, MultiXactMember *members);
344	static MultiXactId GetNewMultiXactId(int nmembers, MultiXactOffset *offset);
345
346	/ MultiXact cache management /
347	static int mxactMemberComparator(const void arg1, const* void *arg2);
348	static MultiXactId mXactCacheGetBySet(int nmembers, MultiXactMember *members);
349	static int mXactCacheGetById(MultiXactId multi, MultiXactMember **members);
350	static void mXactCachePut(MultiXactId multi, int nmembers,
351	MultiXactMember *members);
352
353	static char *mxstatus_to_string(MultiXactStatus status);
354
355	/ management of SLRU infrastructure /
356	static int ZeroMultiXactOffsetPage(int pageno, bool writeXlog);
357	static int ZeroMultiXactMemberPage(int pageno, bool writeXlog);
358	static bool MultiXactOffsetPagePrecedes(int page1, int page2);
359	static bool MultiXactMemberPagePrecedes(int page1, int page2);
360	static bool MultiXactOffsetPrecedes(MultiXactOffset offset1,
361	MultiXactOffset offset2);
362	static void ExtendMultiXactOffset(MultiXactId multi);
363	static void ExtendMultiXactMember(MultiXactOffset offset, int nmembers);
364	static bool MultiXactOffsetWouldWrap(MultiXactOffset boundary,
365	MultiXactOffset start, uint32 distance);
366	static bool SetOffsetVacuumLimit(bool is_startup);
367	static bool find_multixact_start(MultiXactId multi, MultiXactOffset *result);
368	static void WriteMZeroPageXlogRec(int pageno, uint8 info);
369	static void WriteMTruncateXlogRec(Oid oldestMultiDB,
370	MultiXactId startOff, MultiXactId endOff,
371	MultiXactOffset startMemb, MultiXactOffset endMemb);
372
373
374	/*
375	* MultiXactIdCreate
376	* Construct a MultiXactId representing two TransactionIds.
377	*
378	* The two XIDs must be different, or be requesting different statuses.
379	*
380	* NB - we don't worry about our local MultiXactId cache here, because that
381	* is handled by the lower-level routines.
382	*/
383	MultiXactId
384	MultiXactIdCreate(TransactionId xid1, MultiXactStatus status1,
385	TransactionId xid2, MultiXactStatus status2)
386	{
387	MultiXactId newMulti;
388	MultiXactMember members[`2`];
389
390	AssertArg(TransactionIdIsValid(xid1));
391	AssertArg(TransactionIdIsValid(xid2));
392
393	Assert(!TransactionIdEquals(xid1, xid2) \|\| (status1 != status2));
394
395	/ MultiXactIdSetOldestMember() must have been called already. /
396	Assert(MultiXactIdIsValid(OldestMemberMXactId[MyBackendId]));
397
398	/*
399	* Note: unlike MultiXactIdExpand, we don't bother to check that both XIDs
400	* are still running. In typical usage, xid2 will be our own XID and the
401	* caller just did a check on xid1, so it'd be wasted effort.
402	*/
403
404	members[`0`].xid = xid1;
405	members[`0`].status = status1;
406	members[`1`].xid = xid2;
407	members[`1`].status = status2;
408
409	newMulti = MultiXactIdCreateFromMembers(`2`, members);
410
411	debug_elog3(DEBUG2, "Create: %s",
412	mxid_to_string(newMulti, `2`, members));
413
414	return newMulti;
415	}
416
417	/*
418	* MultiXactIdExpand
419	* Add a TransactionId to a pre-existing MultiXactId.
420	*
421	* If the TransactionId is already a member of the passed MultiXactId with the
422	* same status, just return it as-is.
423	*
424	* Note that we do NOT actually modify the membership of a pre-existing
425	* MultiXactId; instead we create a new one. This is necessary to avoid
426	* a race condition against code trying to wait for one MultiXactId to finish;
427	* see notes in heapam.c.
428	*
429	* NB - we don't worry about our local MultiXactId cache here, because that
430	* is handled by the lower-level routines.
431	*
432	* Note: It is critical that MultiXactIds that come from an old cluster (i.e.
433	* one upgraded by pg_upgrade from a cluster older than this feature) are not
434	* passed in.
435	*/
436	MultiXactId
437	MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status)
438	{
439	MultiXactId newMulti;
440	MultiXactMember *members;
441	MultiXactMember *newMembers;
442	int nmembers;
443	int i;
444	int j;
445
446	AssertArg(MultiXactIdIsValid(multi));
447	AssertArg(TransactionIdIsValid(xid));
448
449	/ MultiXactIdSetOldestMember() must have been called already. /
450	Assert(MultiXactIdIsValid(OldestMemberMXactId[MyBackendId]));
451
452	debug_elog5(DEBUG2, "Expand: received multi %u, xid %u status %s",
453	multi, xid, mxstatus_to_string(status));
454
455	/*
456	* Note: we don't allow for old multis here. The reason is that the only
457	* caller of this function does a check that the multixact is no longer
458	* running.
459	*/
460	nmembers = GetMultiXactIdMembers(multi, &members, false, false);
461
462	if (nmembers < `0`)
463	{
464	MultiXactMember member;
465
466	/*
467	* The MultiXactId is obsolete. This can only happen if all the
468	* MultiXactId members stop running between the caller checking and
469	* passing it to us. It would be better to return that fact to the
470	* caller, but it would complicate the API and it's unlikely to happen
471	* too often, so just deal with it by creating a singleton MultiXact.
472	*/
473	member.xid = xid;
474	member.status = status;
475	newMulti = MultiXactIdCreateFromMembers(`1`, &member);
476
477	debug_elog4(DEBUG2, "Expand: %u has no members, create singleton %u",
478	multi, newMulti);
479	return newMulti;
480	}
481
482	/*
483	* If the TransactionId is already a member of the MultiXactId with the
484	* same status, just return the existing MultiXactId.
485	*/
486	for (i = `0`; i < nmembers; i++)
487	{
488	if (TransactionIdEquals(members[i].xid, xid) &&
489	(members[i].status == status))
490	{
491	debug_elog4(DEBUG2, "Expand: %u is already a member of %u",
492	xid, multi);
493	pfree(members);
494	return multi;
495	}
496	}
497
498	/*
499	* Determine which of the members of the MultiXactId are still of
500	* interest. This is any running transaction, and also any transaction
501	* that grabbed something stronger than just a lock and was committed. (An
502	* update that aborted is of no interest here; and having more than one
503	* update Xid in a multixact would cause errors elsewhere.)
504	*
505	* Removing dead members is not just an optimization: freezing of tuples
506	* whose Xmax are multis depends on this behavior.
507	*
508	* Note we have the same race condition here as above: j could be 0 at the
509	* end of the loop.
510	*/
511	newMembers = (MultiXactMember *)
512	palloc(sizeof(MultiXactMember) * (nmembers + `1`));
513
514	for (i = `0`, j = `0`; i < nmembers; i++)
515	{
516	if (TransactionIdIsInProgress(members[i].xid) \|\|
517	(ISUPDATE_from_mxstatus(members[i].status) &&
518	TransactionIdDidCommit(members[i].xid)))
519	{
520	newMembers[j].xid = members[i].xid;
521	newMembers[j++].status = members[i].status;
522	}
523	}
524
525	newMembers[j].xid = xid;
526	newMembers[j++].status = status;
527	newMulti = MultiXactIdCreateFromMembers(j, newMembers);
528
529	pfree(members);
530	pfree(newMembers);
531
532	debug_elog3(DEBUG2, "Expand: returning new multi %u", newMulti);
533
534	return newMulti;
535	}
536
537	/*
538	* MultiXactIdIsRunning
539	* Returns whether a MultiXactId is "running".
540	*
541	* We return true if at least one member of the given MultiXactId is still
542	* running. Note that a "false" result is certain not to change,
543	* because it is not legal to add members to an existing MultiXactId.
544	*
545	* Caller is expected to have verified that the multixact does not come from
546	* a pg_upgraded share-locked tuple.
547	*/
548	bool
549	MultiXactIdIsRunning(MultiXactId multi, bool isLockOnly)
550	{
551	MultiXactMember *members;
552	int nmembers;
553	int i;
554
555	debug_elog3(DEBUG2, "IsRunning %u?", multi);
556
557	/*
558	* "false" here means we assume our callers have checked that the given
559	* multi cannot possibly come from a pg_upgraded database.
560	*/
561	nmembers = GetMultiXactIdMembers(multi, &members, false, isLockOnly);
562
563	if (nmembers <= `0`)
564	{
565	debug_elog2(DEBUG2, "IsRunning: no members");
566	return false;
567	}
568
569	/*
570	* Checking for myself is cheap compared to looking in shared memory;
571	* return true if any live subtransaction of the current top-level
572	* transaction is a member.
573	*
574	* This is not needed for correctness, it's just a fast path.
575	*/
576	for (i = `0`; i < nmembers; i++)
577	{
578	if (TransactionIdIsCurrentTransactionId(members[i].xid))
579	{
580	debug_elog3(DEBUG2, "IsRunning: I (%d) am running!", i);
581	pfree(members);
582	return true;
583	}
584	}
585
586	/*
587	* This could be made faster by having another entry point in procarray.c,
588	* walking the PGPROC array only once for all the members. But in most
589	* cases nmembers should be small enough that it doesn't much matter.
590	*/
591	for (i = `0`; i < nmembers; i++)
592	{
593	if (TransactionIdIsInProgress(members[i].xid))
594	{
595	debug_elog4(DEBUG2, "IsRunning: member %d (%u) is running",
596	i, members[i].xid);
597	pfree(members);
598	return true;
599	}
600	}
601
602	pfree(members);
603
604	debug_elog3(DEBUG2, "IsRunning: %u is not running", multi);
605
606	return false;
607	}
608
609	/*
610	* MultiXactIdSetOldestMember
611	* Save the oldest MultiXactId this transaction could be a member of.
612	*
613	* We set the OldestMemberMXactId for a given transaction the first time it's
614	* going to do some operation that might require a MultiXactId (tuple lock,
615	* update or delete). We need to do this even if we end up using a
616	* TransactionId instead of a MultiXactId, because there is a chance that
617	* another transaction would add our XID to a MultiXactId.
618	*
619	* The value to set is the next-to-be-assigned MultiXactId, so this is meant to
620	* be called just before doing any such possibly-MultiXactId-able operation.
621	*/
622	void
623	MultiXactIdSetOldestMember(void)
624	{
625	if (!MultiXactIdIsValid(OldestMemberMXactId[MyBackendId]))
626	{
627	MultiXactId nextMXact;
628
629	/*
630	* You might think we don't need to acquire a lock here, since
631	* fetching and storing of TransactionIds is probably atomic, but in
632	* fact we do: suppose we pick up nextMXact and then lose the CPU for
633	* a long time. Someone else could advance nextMXact, and then
634	* another someone else could compute an OldestVisibleMXactId that
635	* would be after the value we are going to store when we get control
636	* back. Which would be wrong.
637	*
638	* Note that a shared lock is sufficient, because it's enough to stop
639	* someone from advancing nextMXact; and nobody else could be trying
640	* to write to our OldestMember entry, only reading (and we assume
641	* storing it is atomic.)
642	*/
643	LWLockAcquire(MultiXactGenLock, LW_SHARED);
644
645	/*
646	* We have to beware of the possibility that nextMXact is in the
647	* wrapped-around state. We don't fix the counter itself here, but we
648	* must be sure to store a valid value in our array entry.
649	*/
650	nextMXact = MultiXactState->nextMXact;
651	if (nextMXact < FirstMultiXactId)
652	nextMXact = FirstMultiXactId;
653
654	OldestMemberMXactId[MyBackendId] = nextMXact;
655
656	LWLockRelease(MultiXactGenLock);
657
658	debug_elog4(DEBUG2, "MultiXact: setting OldestMember[%d] = %u",
659	MyBackendId, nextMXact);
660	}
661	}
662
663	/*
664	* MultiXactIdSetOldestVisible
665	* Save the oldest MultiXactId this transaction considers possibly live.
666	*
667	* We set the OldestVisibleMXactId for a given transaction the first time
668	* it's going to inspect any MultiXactId. Once we have set this, we are
669	* guaranteed that the checkpointer won't truncate off SLRU data for
670	* MultiXactIds at or after our OldestVisibleMXactId.
671	*
672	* The value to set is the oldest of nextMXact and all the valid per-backend
673	* OldestMemberMXactId[] entries. Because of the locking we do, we can be
674	* certain that no subsequent call to MultiXactIdSetOldestMember can set
675	* an OldestMemberMXactId[] entry older than what we compute here. Therefore
676	* there is no live transaction, now or later, that can be a member of any
677	* MultiXactId older than the OldestVisibleMXactId we compute here.
678	*/
679	static void
680	MultiXactIdSetOldestVisible(void)
681	{
682	if (!MultiXactIdIsValid(OldestVisibleMXactId[MyBackendId]))
683	{
684	MultiXactId oldestMXact;
685	int i;
686
687	LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
688
689	/*
690	* We have to beware of the possibility that nextMXact is in the
691	* wrapped-around state. We don't fix the counter itself here, but we
692	* must be sure to store a valid value in our array entry.
693	*/
694	oldestMXact = MultiXactState->nextMXact;
695	if (oldestMXact < FirstMultiXactId)
696	oldestMXact = FirstMultiXactId;
697
698	for (i = `1`; i <= MaxOldestSlot; i++)
699	{
700	MultiXactId thisoldest = OldestMemberMXactId[i];
701
702	if (MultiXactIdIsValid(thisoldest) &&
703	MultiXactIdPrecedes(thisoldest, oldestMXact))
704	oldestMXact = thisoldest;
705	}
706
707	OldestVisibleMXactId[MyBackendId] = oldestMXact;
708
709	LWLockRelease(MultiXactGenLock);
710
711	debug_elog4(DEBUG2, "MultiXact: setting OldestVisible[%d] = %u",
712	MyBackendId, oldestMXact);
713	}
714	}
715
716	/*
717	* ReadNextMultiXactId
718	* Return the next MultiXactId to be assigned, but don't allocate it
719	*/
720	MultiXactId
721	ReadNextMultiXactId(void)
722	{
723	MultiXactId mxid;
724
725	/ XXX we could presumably do this without a lock. /
726	LWLockAcquire(MultiXactGenLock, LW_SHARED);
727	mxid = MultiXactState->nextMXact;
728	LWLockRelease(MultiXactGenLock);
729
730	if (mxid < FirstMultiXactId)
731	mxid = FirstMultiXactId;
732
733	return mxid;
734	}
735
736	/*
737	* MultiXactIdCreateFromMembers
738	* Make a new MultiXactId from the specified set of members
739	*
740	* Make XLOG, SLRU and cache entries for a new MultiXactId, recording the
741	* given TransactionIds as members. Returns the newly created MultiXactId.
742	*
743	* NB: the passed members[] array will be sorted in-place.
744	*/
745	MultiXactId
746	MultiXactIdCreateFromMembers(int nmembers, MultiXactMember *members)
747	{
748	MultiXactId multi;
749	MultiXactOffset offset;
750	xl_multixact_create xlrec;
751
752	debug_elog3(DEBUG2, "Create: %s",
753	mxid_to_string(InvalidMultiXactId, nmembers, members));
754
755	/*
756	* See if the same set of members already exists in our cache; if so, just
757	* re-use that MultiXactId. (Note: it might seem that looking in our
758	* cache is insufficient, and we ought to search disk to see if a
759	* duplicate definition already exists. But since we only ever create
760	* MultiXacts containing our own XID, in most cases any such MultiXacts
761	* were in fact created by us, and so will be in our cache. There are
762	* corner cases where someone else added us to a MultiXact without our
763	* knowledge, but it's not worth checking for.)
764	*/
765	multi = mXactCacheGetBySet(nmembers, members);
766	if (MultiXactIdIsValid(multi))
767	{
768	debug_elog2(DEBUG2, "Create: in cache!");
769	return multi;
770	}
771
772	/ Verify that there is a single update Xid among the given members. /
773	{
774	int i;
775	bool has_update = false;
776
777	for (i = `0`; i < nmembers; i++)
778	{
779	if (ISUPDATE_from_mxstatus(members[i].status))
780	{
781	if (has_update)
782	elog(ERROR, "new multixact has more than one updating member");
783	has_update = true;
784	}
785	}
786	}
787
788	/*
789	* Assign the MXID and offsets range to use, and make sure there is space
790	* in the OFFSETs and MEMBERs files. NB: this routine does
791	* START_CRIT_SECTION().
792	*
793	* Note: unlike MultiXactIdCreate and MultiXactIdExpand, we do not check
794	* that we've called MultiXactIdSetOldestMember here. This is because
795	* this routine is used in some places to create new MultiXactIds of which
796	* the current backend is not a member, notably during freezing of multis
797	* in vacuum. During vacuum, in particular, it would be unacceptable to
798	* keep OldestMulti set, in case it runs for long.
799	*/
800	multi = GetNewMultiXactId(nmembers, &offset);
801
802	/ Make an XLOG entry describing the new MXID. /
803	xlrec.mid = multi;
804	xlrec.moff = offset;
805	xlrec.nmembers = nmembers;
806
807	/*
808	* XXX Note: there's a lot of padding space in MultiXactMember. We could
809	* find a more compact representation of this Xlog record -- perhaps all
810	* the status flags in one XLogRecData, then all the xids in another one?
811	* Not clear that it's worth the trouble though.
812	*/
813	XLogBeginInsert();
814	XLogRegisterData((char *) (&xlrec), SizeOfMultiXactCreate);
815	XLogRegisterData((char ) members, nmembers sizeof(MultiXactMember));
816
817	(void) XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_CREATE_ID);
818
819	/ Now enter the information into the OFFSETs and MEMBERs logs /
820	RecordNewMultiXact(multi, offset, nmembers, members);
821
822	/ Done with critical section /
823	END_CRIT_SECTION();
824
825	/ Store the new MultiXactId in the local cache, too /
826	mXactCachePut(multi, nmembers, members);
827
828	debug_elog2(DEBUG2, "Create: all done");
829
830	return multi;
831	}
832
833	/*
834	* RecordNewMultiXact
835	* Write info about a new multixact into the offsets and members files
836	*
837	* This is broken out of MultiXactIdCreateFromMembers so that xlog replay can
838	* use it.
839	*/
840	static void
841	RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
842	int nmembers, MultiXactMember *members)
843	{
844	int pageno;
845	int prev_pageno;
846	int entryno;
847	int slotno;
848	MultiXactOffset *offptr;
849	int i;
850
851	LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE);
852
853	pageno = MultiXactIdToOffsetPage(multi);
854	entryno = MultiXactIdToOffsetEntry(multi);
855
856	/*
857	* Note: we pass the MultiXactId to SimpleLruReadPage as the "transaction"
858	* to complain about if there's any I/O error. This is kinda bogus, but
859	* since the errors will always give the full pathname, it should be clear
860	* enough that a MultiXactId is really involved. Perhaps someday we'll
861	* take the trouble to generalize the slru.c error reporting code.
862	*/
863	slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
864	offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
865	offptr += entryno;
866
867	*offptr = offset;
868
869	MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
870
871	/ Exchange our lock /
872	LWLockRelease(MultiXactOffsetControlLock);
873
874	LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE);
875
876	prev_pageno = -`1`;
877
878	for (i = `0`; i < nmembers; i++, offset++)
879	{
880	TransactionId *memberptr;
881	uint32 *flagsptr;
882	uint32 flagsval;
883	int bshift;
884	int flagsoff;
885	int memberoff;
886
887	Assert(members[i].status <= MultiXactStatusUpdate);
888
889	pageno = MXOffsetToMemberPage(offset);
890	memberoff = MXOffsetToMemberOffset(offset);
891	flagsoff = MXOffsetToFlagsOffset(offset);
892	bshift = MXOffsetToFlagsBitShift(offset);
893
894	if (pageno != prev_pageno)
895	{
896	slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, multi);
897	prev_pageno = pageno;
898	}
899
900	memberptr = (TransactionId *)
901	(MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
902
903	*memberptr = members[i].xid;
904
905	flagsptr = (uint32 *)
906	(MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
907
908	flagsval = *flagsptr;
909	flagsval &= ~(((`1` << MXACT_MEMBER_BITS_PER_XACT) - `1`) << bshift);
910	flagsval \|= (members[i].status << bshift);
911	*flagsptr = flagsval;
912
913	MultiXactMemberCtl->shared->page_dirty[slotno] = true;
914	}
915
916	LWLockRelease(MultiXactMemberControlLock);
917	}
918
919	/*
920	* GetNewMultiXactId
921	* Get the next MultiXactId.
922	*
923	* Also, reserve the needed amount of space in the "members" area. The
924	* starting offset of the reserved space is returned in *offset.
925	*
926	* This may generate XLOG records for expansion of the offsets and/or members
927	* files. Unfortunately, we have to do that while holding MultiXactGenLock
928	* to avoid race conditions --- the XLOG record for zeroing a page must appear
929	* before any backend can possibly try to store data in that page!
930	*
931	* We start a critical section before advancing the shared counters. The
932	* caller must end the critical section after writing SLRU data.
933	*/
934	static MultiXactId
935	GetNewMultiXactId(int nmembers, MultiXactOffset *offset)
936	{
937	MultiXactId result;
938	MultiXactOffset nextOffset;
939
940	debug_elog3(DEBUG2, "GetNew: for %d xids", nmembers);
941
942	/ safety check, we should never get this far in a HS standby /
943	if (RecoveryInProgress())
944	elog(ERROR, "cannot assign MultiXactIds during recovery");
945
946	LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
947
948	/ Handle wraparound of the nextMXact counter /
949	if (MultiXactState->nextMXact < FirstMultiXactId)
950	MultiXactState->nextMXact = FirstMultiXactId;
951
952	/ Assign the MXID /
953	result = MultiXactState->nextMXact;
954
955	/----------*
956	* Check to see if it's safe to assign another MultiXactId. This protects
957	* against catastrophic data loss due to multixact wraparound. The basic
958	* rules are:
959	*
960	* If we're past multiVacLimit or the safe threshold for member storage
961	* space, or we don't know what the safe threshold for member storage is,
962	* start trying to force autovacuum cycles.
963	* If we're past multiWarnLimit, start issuing warnings.
964	* If we're past multiStopLimit, refuse to create new MultiXactIds.
965	*
966	* Note these are pretty much the same protections in GetNewTransactionId.
967	*----------
968	*/
969	if (!MultiXactIdPrecedes(result, MultiXactState->multiVacLimit))
970	{
971	/*
972	* For safety's sake, we release MultiXactGenLock while sending
973	* signals, warnings, etc. This is not so much because we care about
974	* preserving concurrency in this situation, as to avoid any
975	* possibility of deadlock while doing get_database_name(). First,
976	* copy all the shared values we'll need in this path.
977	*/
978	MultiXactId multiWarnLimit = MultiXactState->multiWarnLimit;
979	MultiXactId multiStopLimit = MultiXactState->multiStopLimit;
980	MultiXactId multiWrapLimit = MultiXactState->multiWrapLimit;
981	Oid oldest_datoid = MultiXactState->oldestMultiXactDB;
982
983	LWLockRelease(MultiXactGenLock);
984
985	if (IsUnderPostmaster &&
986	!MultiXactIdPrecedes(result, multiStopLimit))
987	{
988	char *oldest_datname = get_database_name(oldest_datoid);
989
990	/*
991	* Immediately kick autovacuum into action as we're already in
992	* ERROR territory.
993	*/
994	SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
995
996	/ complain even if that DB has disappeared /
997	if (oldest_datname)
998	ereport(ERROR,
999	(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1000	errmsg("database is not accepting commands that generate new MultiXactIds to avoid wraparound data loss in database \"%s\"",
1001	oldest_datname),
1002	errhint("Execute a database-wide VACUUM in that database.\n"
1003	"You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
1004	else
1005	ereport(ERROR,
1006	(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1007	errmsg("database is not accepting commands that generate new MultiXactIds to avoid wraparound data loss in database with OID %u",
1008	oldest_datoid),
1009	errhint("Execute a database-wide VACUUM in that database.\n"
1010	"You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
1011	}
1012
1013	/*
1014	* To avoid swamping the postmaster with signals, we issue the autovac
1015	* request only once per 64K multis generated. This still gives
1016	* plenty of chances before we get into real trouble.
1017	*/
1018	if (IsUnderPostmaster && (result % `65536`) == `0`)
1019	SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
1020
1021	if (!MultiXactIdPrecedes(result, multiWarnLimit))
1022	{
1023	char *oldest_datname = get_database_name(oldest_datoid);
1024
1025	/ complain even if that DB has disappeared /
1026	if (oldest_datname)
1027	ereport(WARNING,
1028	(errmsg_plural("database \"%s\" must be vacuumed before %u more MultiXactId is used",
1029	"database \"%s\" must be vacuumed before %u more MultiXactIds are used",
1030	multiWrapLimit - result,
1031	oldest_datname,
1032	multiWrapLimit - result),
1033	errhint("Execute a database-wide VACUUM in that database.\n"
1034	"You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
1035	else
1036	ereport(WARNING,
1037	(errmsg_plural("database with OID %u must be vacuumed before %u more MultiXactId is used",
1038	"database with OID %u must be vacuumed before %u more MultiXactIds are used",
1039	multiWrapLimit - result,
1040	oldest_datoid,
1041	multiWrapLimit - result),
1042	errhint("Execute a database-wide VACUUM in that database.\n"
1043	"You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
1044	}
1045
1046	/ Re-acquire lock and start over /
1047	LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
1048	result = MultiXactState->nextMXact;
1049	if (result < FirstMultiXactId)
1050	result = FirstMultiXactId;
1051	}
1052
1053	/ Make sure there is room for the MXID in the file. /
1054	ExtendMultiXactOffset(result);
1055
1056	/*
1057	* Reserve the members space, similarly to above. Also, be careful not to
1058	* return zero as the starting offset for any multixact. See
1059	* GetMultiXactIdMembers() for motivation.
1060	*/
1061	nextOffset = MultiXactState->nextOffset;
1062	if (nextOffset == `0`)
1063	{
1064	*offset = `1`;
1065	nmembers++; / allocate member slot 0 too /
1066	}
1067	else
1068	*offset = nextOffset;
1069
1070	/----------*
1071	* Protect against overrun of the members space as well, with the
1072	* following rules:
1073	*
1074	* If we're past offsetStopLimit, refuse to generate more multis.
1075	* If we're close to offsetStopLimit, emit a warning.
1076	*
1077	* Arbitrarily, we start emitting warnings when we're 20 segments or less
1078	* from offsetStopLimit.
1079	*
1080	* Note we haven't updated the shared state yet, so if we fail at this
1081	* point, the multixact ID we grabbed can still be used by the next guy.
1082	*
1083	* Note that there is no point in forcing autovacuum runs here: the
1084	* multixact freeze settings would have to be reduced for that to have any
1085	* effect.
1086	*----------
1087	*/
1088	#define OFFSET_WARN_SEGMENTS 20
1089	if (MultiXactState->oldestOffsetKnown &&
1090	MultiXactOffsetWouldWrap(MultiXactState->offsetStopLimit, nextOffset,
1091	nmembers))
1092	{
1093	/ see comment in the corresponding offsets wraparound case /
1094	SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
1095
1096	ereport(ERROR,
1097	(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1098	errmsg("multixact \"members\" limit exceeded"),
1099	errdetail_plural("This command would create a multixact with %u members, but the remaining space is only enough for %u member.",
1100	"This command would create a multixact with %u members, but the remaining space is only enough for %u members.",
1101	MultiXactState->offsetStopLimit - nextOffset - `1`,
1102	nmembers,
1103	MultiXactState->offsetStopLimit - nextOffset - `1`),
1104	errhint("Execute a database-wide VACUUM in database with OID %u with reduced vacuum_multixact_freeze_min_age and vacuum_multixact_freeze_table_age settings.",
1105	MultiXactState->oldestMultiXactDB)));
1106	}
1107
1108	/*
1109	* Check whether we should kick autovacuum into action, to prevent members
1110	* wraparound. NB we use a much larger window to trigger autovacuum than
1111	* just the warning limit. The warning is just a measure of last resort -
1112	* this is in line with GetNewTransactionId's behaviour.
1113	*/
1114	if (!MultiXactState->oldestOffsetKnown \|\|
1115	(MultiXactState->nextOffset - MultiXactState->oldestOffset
1116	> MULTIXACT_MEMBER_SAFE_THRESHOLD))
1117	{
1118	/*
1119	* To avoid swamping the postmaster with signals, we issue the autovac
1120	* request only when crossing a segment boundary. With default
1121	* compilation settings that's roughly after 50k members. This still
1122	* gives plenty of chances before we get into real trouble.
1123	*/
1124	if ((MXOffsetToMemberPage(nextOffset) / SLRU_PAGES_PER_SEGMENT) !=
1125	(MXOffsetToMemberPage(nextOffset + nmembers) / SLRU_PAGES_PER_SEGMENT))
1126	SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
1127	}
1128
1129	if (MultiXactState->oldestOffsetKnown &&
1130	MultiXactOffsetWouldWrap(MultiXactState->offsetStopLimit,
1131	nextOffset,
1132	nmembers + MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT * OFFSET_WARN_SEGMENTS))
1133	ereport(WARNING,
1134	(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1135	errmsg_plural("database with OID %u must be vacuumed before %d more multixact member is used",
1136	"database with OID %u must be vacuumed before %d more multixact members are used",
1137	MultiXactState->offsetStopLimit - nextOffset + nmembers,
1138	MultiXactState->oldestMultiXactDB,
1139	MultiXactState->offsetStopLimit - nextOffset + nmembers),
1140	errhint("Execute a database-wide VACUUM in that database with reduced vacuum_multixact_freeze_min_age and vacuum_multixact_freeze_table_age settings.")));
1141
1142	ExtendMultiXactMember(nextOffset, nmembers);
1143
1144	/*
1145	* Critical section from here until caller has written the data into the
1146	* just-reserved SLRU space; we don't want to error out with a partly
1147	* written MultiXact structure. (In particular, failing to write our
1148	* start offset after advancing nextMXact would effectively corrupt the
1149	* previous MultiXact.)
1150	*/
1151	START_CRIT_SECTION();
1152
1153	/*
1154	* Advance counters. As in GetNewTransactionId(), this must not happen
1155	* until after file extension has succeeded!
1156	*
1157	* We don't care about MultiXactId wraparound here; it will be handled by
1158	* the next iteration. But note that nextMXact may be InvalidMultiXactId
1159	* or the first value on a segment-beginning page after this routine
1160	* exits, so anyone else looking at the variable must be prepared to deal
1161	* with either case. Similarly, nextOffset may be zero, but we won't use
1162	* that as the actual start offset of the next multixact.
1163	*/
1164	(MultiXactState->nextMXact)++;
1165
1166	MultiXactState->nextOffset += nmembers;
1167
1168	LWLockRelease(MultiXactGenLock);
1169
1170	debug_elog4(DEBUG2, "GetNew: returning %u offset %u", result, *offset);
1171	return result;
1172	}
1173
1174	/*
1175	* GetMultiXactIdMembers
1176	* Return the set of MultiXactMembers that make up a MultiXactId
1177	*
1178	* Return value is the number of members found, or -1 if there are none,
1179	* and *members is set to a newly palloc'ed array of members. It's the
1180	* caller's responsibility to free it when done with it.
1181	*
1182	* from_pgupgrade must be passed as true if and only if only the multixact
1183	* corresponds to a value from a tuple that was locked in a 9.2-or-older
1184	* installation and later pg_upgrade'd (that is, the infomask is
1185	* HEAP_LOCKED_UPGRADED). In this case, we know for certain that no members
1186	* can still be running, so we return -1 just like for an empty multixact
1187	* without any further checking. It would be wrong to try to resolve such a
1188	* multixact: either the multixact is within the current valid multixact
1189	* range, in which case the returned result would be bogus, or outside that
1190	* range, in which case an error would be raised.
1191	*
1192	* In all other cases, the passed multixact must be within the known valid
1193	* range, that is, greater to or equal than oldestMultiXactId, and less than
1194	* nextMXact. Otherwise, an error is raised.
1195	*
1196	* onlyLock must be set to true if caller is certain that the given multi
1197	* is used only to lock tuples; can be false without loss of correctness,
1198	* but passing a true means we can return quickly without checking for
1199	* old updates.
1200	*/
1201	int
1202	GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
1203	bool from_pgupgrade, bool onlyLock)
1204	{
1205	int pageno;
1206	int prev_pageno;
1207	int entryno;
1208	int slotno;
1209	MultiXactOffset *offptr;
1210	MultiXactOffset offset;
1211	int length;
1212	int truelength;
1213	int i;
1214	MultiXactId oldestMXact;
1215	MultiXactId nextMXact;
1216	MultiXactId tmpMXact;
1217	MultiXactOffset nextOffset;
1218	MultiXactMember *ptr;
1219
1220	debug_elog3(DEBUG2, "GetMembers: asked for %u", multi);
1221
1222	if (!MultiXactIdIsValid(multi) \|\| from_pgupgrade)
1223	return -`1`;
1224
1225	/ See if the MultiXactId is in the local cache /
1226	length = mXactCacheGetById(multi, members);
1227	if (length >= `0`)
1228	{
1229	debug_elog3(DEBUG2, "GetMembers: found %s in the cache",
1230	mxid_to_string(multi, length, *members));
1231	return length;
1232	}
1233
1234	/ Set our OldestVisibleMXactId[] entry if we didn't already /
1235	MultiXactIdSetOldestVisible();
1236
1237	/*
1238	* If we know the multi is used only for locking and not for updates, then
1239	* we can skip checking if the value is older than our oldest visible
1240	* multi. It cannot possibly still be running.
1241	*/
1242	if (onlyLock &&
1243	MultiXactIdPrecedes(multi, OldestVisibleMXactId[MyBackendId]))
1244	{
1245	debug_elog2(DEBUG2, "GetMembers: a locker-only multi is too old");
1246	*members = NULL;
1247	return -`1`;
1248	}
1249
1250	/*
1251	* We check known limits on MultiXact before resorting to the SLRU area.
1252	*
1253	* An ID older than MultiXactState->oldestMultiXactId cannot possibly be
1254	* useful; it has already been removed, or will be removed shortly, by
1255	* truncation. If one is passed, an error is raised.
1256	*
1257	* Also, an ID >= nextMXact shouldn't ever be seen here; if it is seen, it
1258	* implies undetected ID wraparound has occurred. This raises a hard
1259	* error.
1260	*
1261	* Shared lock is enough here since we aren't modifying any global state.
1262	* Acquire it just long enough to grab the current counter values. We may
1263	* need both nextMXact and nextOffset; see below.
1264	*/
1265	LWLockAcquire(MultiXactGenLock, LW_SHARED);
1266
1267	oldestMXact = MultiXactState->oldestMultiXactId;
1268	nextMXact = MultiXactState->nextMXact;
1269	nextOffset = MultiXactState->nextOffset;
1270
1271	LWLockRelease(MultiXactGenLock);
1272
1273	if (MultiXactIdPrecedes(multi, oldestMXact))
1274	{
1275	ereport(ERROR,
1276	(errcode(ERRCODE_INTERNAL_ERROR),
1277	errmsg("MultiXactId %u does no longer exist -- apparent wraparound",
1278	multi)));
1279	return -`1`;
1280	}
1281
1282	if (!MultiXactIdPrecedes(multi, nextMXact))
1283	ereport(ERROR,
1284	(errcode(ERRCODE_INTERNAL_ERROR),
1285	errmsg("MultiXactId %u has not been created yet -- apparent wraparound",
1286	multi)));
1287
1288	/*
1289	* Find out the offset at which we need to start reading MultiXactMembers
1290	* and the number of members in the multixact. We determine the latter as
1291	* the difference between this multixact's starting offset and the next
1292	* one's. However, there are some corner cases to worry about:
1293	*
1294	* 1. This multixact may be the latest one created, in which case there is
1295	* no next one to look at. In this case the nextOffset value we just
1296	* saved is the correct endpoint.
1297	*
1298	* 2. The next multixact may still be in process of being filled in: that
1299	* is, another process may have done GetNewMultiXactId but not yet written
1300	* the offset entry for that ID. In that scenario, it is guaranteed that
1301	* the offset entry for that multixact exists (because GetNewMultiXactId
1302	* won't release MultiXactGenLock until it does) but contains zero
1303	* (because we are careful to pre-zero offset pages). Because
1304	* GetNewMultiXactId will never return zero as the starting offset for a
1305	* multixact, when we read zero as the next multixact's offset, we know we
1306	* have this case. We sleep for a bit and try again.
1307	*
1308	* 3. Because GetNewMultiXactId increments offset zero to offset one to
1309	* handle case #2, there is an ambiguity near the point of offset
1310	* wraparound. If we see next multixact's offset is one, is that our
1311	* multixact's actual endpoint, or did it end at zero with a subsequent
1312	* increment? We handle this using the knowledge that if the zero'th
1313	* member slot wasn't filled, it'll contain zero, and zero isn't a valid
1314	* transaction ID so it can't be a multixact member. Therefore, if we
1315	* read a zero from the members array, just ignore it.
1316	*
1317	* This is all pretty messy, but the mess occurs only in infrequent corner
1318	* cases, so it seems better than holding the MultiXactGenLock for a long
1319	* time on every multixact creation.
1320	*/
1321	retry:
1322	LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE);
1323
1324	pageno = MultiXactIdToOffsetPage(multi);
1325	entryno = MultiXactIdToOffsetEntry(multi);
1326
1327	slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
1328	offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
1329	offptr += entryno;
1330	offset = *offptr;
1331
1332	Assert(offset != `0`);
1333
1334	/*
1335	* Use the same increment rule as GetNewMultiXactId(), that is, don't
1336	* handle wraparound explicitly until needed.
1337	*/
1338	tmpMXact = multi + `1`;
1339
1340	if (nextMXact == tmpMXact)
1341	{
1342	/ Corner case 1: there is no next multixact /
1343	length = nextOffset - offset;
1344	}
1345	else
1346	{
1347	MultiXactOffset nextMXOffset;
1348
1349	/ handle wraparound if needed /
1350	if (tmpMXact < FirstMultiXactId)
1351	tmpMXact = FirstMultiXactId;
1352
1353	prev_pageno = pageno;
1354
1355	pageno = MultiXactIdToOffsetPage(tmpMXact);
1356	entryno = MultiXactIdToOffsetEntry(tmpMXact);
1357
1358	if (pageno != prev_pageno)
1359	slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, tmpMXact);
1360
1361	offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
1362	offptr += entryno;
1363	nextMXOffset = *offptr;
1364
1365	if (nextMXOffset == `0`)
1366	{
1367	/ Corner case 2: next multixact is still being filled in /
1368	LWLockRelease(MultiXactOffsetControlLock);
1369	CHECK_FOR_INTERRUPTS();
1370	pg_usleep(`1000L`);
1371	goto retry;
1372	}
1373
1374	length = nextMXOffset - offset;
1375	}
1376
1377	LWLockRelease(MultiXactOffsetControlLock);
1378
1379	ptr = (MultiXactMember ) palloc(length sizeof(MultiXactMember));
1380	*members = ptr;
1381
1382	/ Now get the members themselves. /
1383	LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE);
1384
1385	truelength = `0`;
1386	prev_pageno = -`1`;
1387	for (i = `0`; i < length; i++, offset++)
1388	{
1389	TransactionId *xactptr;
1390	uint32 *flagsptr;
1391	int flagsoff;
1392	int bshift;
1393	int memberoff;
1394
1395	pageno = MXOffsetToMemberPage(offset);
1396	memberoff = MXOffsetToMemberOffset(offset);
1397
1398	if (pageno != prev_pageno)
1399	{
1400	slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, multi);
1401	prev_pageno = pageno;
1402	}
1403
1404	xactptr = (TransactionId *)
1405	(MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
1406
1407	if (!TransactionIdIsValid(*xactptr))
1408	{
1409	/ Corner case 3: we must be looking at unused slot zero /
1410	Assert(offset == `0`);
1411	continue;
1412	}
1413
1414	flagsoff = MXOffsetToFlagsOffset(offset);
1415	bshift = MXOffsetToFlagsBitShift(offset);
1416	flagsptr = (uint32 *) (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
1417
1418	ptr[truelength].xid = *xactptr;
1419	ptr[truelength].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK;
1420	truelength++;
1421	}
1422
1423	LWLockRelease(MultiXactMemberControlLock);
1424
1425	/*
1426	* Copy the result into the local cache.
1427	*/
1428	mXactCachePut(multi, truelength, ptr);
1429
1430	debug_elog3(DEBUG2, "GetMembers: no cache for %s",
1431	mxid_to_string(multi, truelength, ptr));
1432	return truelength;
1433	}
1434
1435	/*
1436	* mxactMemberComparator
1437	* qsort comparison function for MultiXactMember
1438	*
1439	* We can't use wraparound comparison for XIDs because that does not respect
1440	* the triangle inequality! Any old sort order will do.
1441	*/
1442	static int
1443	mxactMemberComparator(const void arg1, const* void *arg2)
1444	{
1445	MultiXactMember member1 = (const* MultiXactMember *) arg1;
1446	MultiXactMember member2 = (const* MultiXactMember *) arg2;
1447
1448	if (member1.xid > member2.xid)
1449	return `1`;
1450	if (member1.xid < member2.xid)
1451	return -`1`;
1452	if (member1.status > member2.status)
1453	return `1`;
1454	if (member1.status < member2.status)
1455	return -`1`;
1456	return `0`;
1457	}
1458
1459	/*
1460	* mXactCacheGetBySet
1461	* returns a MultiXactId from the cache based on the set of
1462	* TransactionIds that compose it, or InvalidMultiXactId if
1463	* none matches.
1464	*
1465	* This is helpful, for example, if two transactions want to lock a huge
1466	* table. By using the cache, the second will use the same MultiXactId
1467	* for the majority of tuples, thus keeping MultiXactId usage low (saving
1468	* both I/O and wraparound issues).
1469	*
1470	* NB: the passed members array will be sorted in-place.
1471	*/
1472	static MultiXactId
1473	mXactCacheGetBySet(int nmembers, MultiXactMember *members)
1474	{
1475	dlist_iter iter;
1476
1477	debug_elog3(DEBUG2, "CacheGet: looking for %s",
1478	mxid_to_string(InvalidMultiXactId, nmembers, members));
1479
1480	/ sort the array so comparison is easy /
1481	qsort(members, nmembers, sizeof(MultiXactMember), mxactMemberComparator);
1482
1483	dlist_foreach(iter, &MXactCache)
1484	{
1485	mXactCacheEnt *entry = dlist_container(mXactCacheEnt, node, iter.cur);
1486
1487	if (entry->nmembers != nmembers)
1488	continue;
1489
1490	/*
1491	* We assume the cache entries are sorted, and that the unused bits in
1492	* "status" are zeroed.
1493	*/
1494	if (memcmp(members, entry->members, nmembers * sizeof(MultiXactMember)) == `0`)
1495	{
1496	debug_elog3(DEBUG2, "CacheGet: found %u", entry->multi);
1497	dlist_move_head(&MXactCache, iter.cur);
1498	return entry->multi;
1499	}
1500	}
1501
1502	debug_elog2(DEBUG2, "CacheGet: not found :-(");
1503	return InvalidMultiXactId;
1504	}
1505
1506	/*
1507	* mXactCacheGetById
1508	* returns the composing MultiXactMember set from the cache for a
1509	* given MultiXactId, if present.
1510	*
1511	* If successful, *xids is set to the address of a palloc'd copy of the
1512	* MultiXactMember set. Return value is number of members, or -1 on failure.
1513	*/
1514	static int
1515	mXactCacheGetById(MultiXactId multi, MultiXactMember **members)
1516	{
1517	dlist_iter iter;
1518
1519	debug_elog3(DEBUG2, "CacheGet: looking for %u", multi);
1520
1521	dlist_foreach(iter, &MXactCache)
1522	{
1523	mXactCacheEnt *entry = dlist_container(mXactCacheEnt, node, iter.cur);
1524
1525	if (entry->multi == multi)
1526	{
1527	MultiXactMember *ptr;
1528	Size size;
1529
1530	size = sizeof(MultiXactMember) * entry->nmembers;
1531	ptr = (MultiXactMember *) palloc(size);
1532	*members = ptr;
1533
1534	memcpy(ptr, entry->members, size);
1535
1536	debug_elog3(DEBUG2, "CacheGet: found %s",
1537	mxid_to_string(multi,
1538	entry->nmembers,
1539	entry->members));
1540
1541	/*
1542	* Note we modify the list while not using a modifiable iterator.
1543	* This is acceptable only because we exit the iteration
1544	* immediately afterwards.
1545	*/
1546	dlist_move_head(&MXactCache, iter.cur);
1547
1548	return entry->nmembers;
1549	}
1550	}
1551
1552	debug_elog2(DEBUG2, "CacheGet: not found");
1553	return -`1`;
1554	}
1555
1556	/*
1557	* mXactCachePut
1558	* Add a new MultiXactId and its composing set into the local cache.
1559	*/
1560	static void
1561	mXactCachePut(MultiXactId multi, int nmembers, MultiXactMember *members)
1562	{
1563	mXactCacheEnt *entry;
1564
1565	debug_elog3(DEBUG2, "CachePut: storing %s",
1566	mxid_to_string(multi, nmembers, members));
1567
1568	if (MXactContext == NULL)
1569	{
1570	/ The cache only lives as long as the current transaction /
1571	debug_elog2(DEBUG2, "CachePut: initializing memory context");
1572	MXactContext = AllocSetContextCreate(TopTransactionContext,
1573	"MultiXact cache context",
1574	ALLOCSET_SMALL_SIZES);
1575	}
1576
1577	entry = (mXactCacheEnt *)
1578	MemoryContextAlloc(MXactContext,
1579	offsetof(mXactCacheEnt, members) +
1580	nmembers * sizeof(MultiXactMember));
1581
1582	entry->multi = multi;
1583	entry->nmembers = nmembers;
1584	memcpy(entry->members, members, nmembers * sizeof(MultiXactMember));
1585
1586	/ mXactCacheGetBySet assumes the entries are sorted, so sort them /
1587	qsort(entry->members, nmembers, sizeof(MultiXactMember), mxactMemberComparator);
1588
1589	dlist_push_head(&MXactCache, &entry->node);
1590	if (MXactCacheMembers++ >= MAX_CACHE_ENTRIES)
1591	{
1592	dlist_node *node;
1593	mXactCacheEnt *entry;
1594
1595	node = dlist_tail_node(&MXactCache);
1596	dlist_delete(node);
1597	MXactCacheMembers--;
1598
1599	entry = dlist_container(mXactCacheEnt, node, node);
1600	debug_elog3(DEBUG2, "CachePut: pruning cached multi %u",
1601	entry->multi);
1602
1603	pfree(entry);
1604	}
1605	}
1606
1607	static char *
1608	mxstatus_to_string(MultiXactStatus status)
1609	{
1610	switch (status)
1611	{
1612	case MultiXactStatusForKeyShare:
1613	return "keysh";
1614	case MultiXactStatusForShare:
1615	return "sh";
1616	case MultiXactStatusForNoKeyUpdate:
1617	return "fornokeyupd";
1618	case MultiXactStatusForUpdate:
1619	return "forupd";
1620	case MultiXactStatusNoKeyUpdate:
1621	return "nokeyupd";
1622	case MultiXactStatusUpdate:
1623	return "upd";
1624	default:
1625	elog(ERROR, "unrecognized multixact status %d", status);
1626	return "";
1627	}
1628	}
1629
1630	char *
1631	mxid_to_string(MultiXactId multi, int nmembers, MultiXactMember *members)
1632	{
1633	static char *str = NULL;
1634	StringInfoData buf;
1635	int i;
1636
1637	if (str != NULL)
1638	pfree(str);
1639
1640	initStringInfo(&buf);
1641
1642	appendStringInfo(&buf, "%u %d[%u (%s)", multi, nmembers, members[`0`].xid,
1643	mxstatus_to_string(members[`0`].status));
1644
1645	for (i = `1`; i < nmembers; i++)
1646	appendStringInfo(&buf, ", %u (%s)", members[i].xid,
1647	mxstatus_to_string(members[i].status));
1648
1649	appendStringInfoChar(&buf, `']'`);
1650	str = MemoryContextStrdup(TopMemoryContext, buf.data);
1651	pfree(buf.data);
1652	return str;
1653	}
1654
1655	/*
1656	* AtEOXact_MultiXact
1657	* Handle transaction end for MultiXact
1658	*
1659	* This is called at top transaction commit or abort (we don't care which).
1660	*/
1661	void
1662	AtEOXact_MultiXact(void)
1663	{
1664	/*
1665	* Reset our OldestMemberMXactId and OldestVisibleMXactId values, both of
1666	* which should only be valid while within a transaction.
1667	*
1668	* We assume that storing a MultiXactId is atomic and so we need not take
1669	* MultiXactGenLock to do this.
1670	*/
1671	OldestMemberMXactId[MyBackendId] = InvalidMultiXactId;
1672	OldestVisibleMXactId[MyBackendId] = InvalidMultiXactId;
1673
1674	/*
1675	* Discard the local MultiXactId cache. Since MXactContext was created as
1676	* a child of TopTransactionContext, we needn't delete it explicitly.
1677	*/
1678	MXactContext = NULL;
1679	dlist_init(&MXactCache);
1680	MXactCacheMembers = `0`;
1681	}
1682
1683	/*
1684	* AtPrepare_MultiXact
1685	* Save multixact state at 2PC transaction prepare
1686	*
1687	* In this phase, we only store our OldestMemberMXactId value in the two-phase
1688	* state file.
1689	*/
1690	void
1691	AtPrepare_MultiXact(void)
1692	{
1693	MultiXactId myOldestMember = OldestMemberMXactId[MyBackendId];
1694
1695	if (MultiXactIdIsValid(myOldestMember))
1696	RegisterTwoPhaseRecord(TWOPHASE_RM_MULTIXACT_ID, `0`,
1697	&myOldestMember, sizeof(MultiXactId));
1698	}
1699
1700	/*
1701	* PostPrepare_MultiXact
1702	* Clean up after successful PREPARE TRANSACTION
1703	*/
1704	void
1705	PostPrepare_MultiXact(TransactionId xid)
1706	{
1707	MultiXactId myOldestMember;
1708
1709	/*
1710	* Transfer our OldestMemberMXactId value to the slot reserved for the
1711	* prepared transaction.
1712	*/
1713	myOldestMember = OldestMemberMXactId[MyBackendId];
1714	if (MultiXactIdIsValid(myOldestMember))
1715	{
1716	BackendId dummyBackendId = TwoPhaseGetDummyBackendId(xid, false);
1717
1718	/*
1719	* Even though storing MultiXactId is atomic, acquire lock to make
1720	* sure others see both changes, not just the reset of the slot of the
1721	* current backend. Using a volatile pointer might suffice, but this
1722	* isn't a hot spot.
1723	*/
1724	LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
1725
1726	OldestMemberMXactId[dummyBackendId] = myOldestMember;
1727	OldestMemberMXactId[MyBackendId] = InvalidMultiXactId;
1728
1729	LWLockRelease(MultiXactGenLock);
1730	}
1731
1732	/*
1733	* We don't need to transfer OldestVisibleMXactId value, because the
1734	* transaction is not going to be looking at any more multixacts once it's
1735	* prepared.
1736	*
1737	* We assume that storing a MultiXactId is atomic and so we need not take
1738	* MultiXactGenLock to do this.
1739	*/
1740	OldestVisibleMXactId[MyBackendId] = InvalidMultiXactId;
1741
1742	/*
1743	* Discard the local MultiXactId cache like in AtEOX_MultiXact
1744	*/
1745	MXactContext = NULL;
1746	dlist_init(&MXactCache);
1747	MXactCacheMembers = `0`;
1748	}
1749
1750	/*
1751	* multixact_twophase_recover
1752	* Recover the state of a prepared transaction at startup
1753	*/
1754	void
1755	multixact_twophase_recover(TransactionId xid, uint16 info,
1756	void *recdata, uint32 len)
1757	{
1758	BackendId dummyBackendId = TwoPhaseGetDummyBackendId(xid, false);
1759	MultiXactId oldestMember;
1760
1761	/*
1762	* Get the oldest member XID from the state file record, and set it in the
1763	* OldestMemberMXactId slot reserved for this prepared transaction.
1764	*/
1765	Assert(len == sizeof(MultiXactId));
1766	oldestMember = ((MultiXactId ) recdata);
1767
1768	OldestMemberMXactId[dummyBackendId] = oldestMember;
1769	}
1770
1771	/*
1772	* multixact_twophase_postcommit
1773	* Similar to AtEOX_MultiXact but for COMMIT PREPARED
1774	*/
1775	void
1776	multixact_twophase_postcommit(TransactionId xid, uint16 info,
1777	void *recdata, uint32 len)
1778	{
1779	BackendId dummyBackendId = TwoPhaseGetDummyBackendId(xid, true);
1780
1781	Assert(len == sizeof(MultiXactId));
1782
1783	OldestMemberMXactId[dummyBackendId] = InvalidMultiXactId;
1784	}
1785
1786	/*
1787	* multixact_twophase_postabort
1788	* This is actually just the same as the COMMIT case.
1789	*/
1790	void
1791	multixact_twophase_postabort(TransactionId xid, uint16 info,
1792	void *recdata, uint32 len)
1793	{
1794	multixact_twophase_postcommit(xid, info, recdata, len);
1795	}
1796
1797	/*
1798	* Initialization of shared memory for MultiXact. We use two SLRU areas,
1799	* thus double memory. Also, reserve space for the shared MultiXactState
1800	* struct and the per-backend MultiXactId arrays (two of those, too).
1801	*/
1802	Size
1803	MultiXactShmemSize(void)
1804	{
1805	Size size;
1806
1807	/ We need 2MaxOldestSlot + 1 perBackendXactIds[] entries /*
1808	#define SHARED_MULTIXACT_STATE_SIZE \
1809	add_size(offsetof(MultiXactStateData, perBackendXactIds) + sizeof(MultiXactId), \
1810	mul_size(sizeof(MultiXactId) * 2, MaxOldestSlot))
1811
1812	size = SHARED_MULTIXACT_STATE_SIZE;
1813	size = add_size(size, SimpleLruShmemSize(NUM_MXACTOFFSET_BUFFERS, `0`));
1814	size = add_size(size, SimpleLruShmemSize(NUM_MXACTMEMBER_BUFFERS, `0`));
1815
1816	return size;
1817	}
1818
1819	void
1820	MultiXactShmemInit(void)
1821	{
1822	bool found;
1823
1824	debug_elog2(DEBUG2, "Shared Memory Init for MultiXact");
1825
1826	MultiXactOffsetCtl->PagePrecedes = MultiXactOffsetPagePrecedes;
1827	MultiXactMemberCtl->PagePrecedes = MultiXactMemberPagePrecedes;
1828
1829	SimpleLruInit(MultiXactOffsetCtl,
1830	"multixact_offset", NUM_MXACTOFFSET_BUFFERS, `0`,
1831	MultiXactOffsetControlLock, "pg_multixact/offsets",
1832	LWTRANCHE_MXACTOFFSET_BUFFERS);
1833	SimpleLruInit(MultiXactMemberCtl,
1834	"multixact_member", NUM_MXACTMEMBER_BUFFERS, `0`,
1835	MultiXactMemberControlLock, "pg_multixact/members",
1836	LWTRANCHE_MXACTMEMBER_BUFFERS);
1837
1838	/ Initialize our shared state struct /
1839	MultiXactState = ShmemInitStruct("Shared MultiXact State",
1840	SHARED_MULTIXACT_STATE_SIZE,
1841	&found);
1842	if (!IsUnderPostmaster)
1843	{
1844	Assert(!found);
1845
1846	/ Make sure we zero out the per-backend state /
1847	MemSet(MultiXactState, `0`, SHARED_MULTIXACT_STATE_SIZE);
1848	}
1849	else
1850	Assert(found);
1851
1852	/*
1853	* Set up array pointers. Note that perBackendXactIds[0] is wasted space
1854	* since we only use indexes 1..MaxOldestSlot in each array.
1855	*/
1856	OldestMemberMXactId = MultiXactState->perBackendXactIds;
1857	OldestVisibleMXactId = OldestMemberMXactId + MaxOldestSlot;
1858	}
1859
1860	/*
1861	* This func must be called ONCE on system install. It creates the initial
1862	* MultiXact segments. (The MultiXacts directories are assumed to have been
1863	* created by initdb, and MultiXactShmemInit must have been called already.)
1864	*/
1865	void
1866	BootStrapMultiXact(void)
1867	{
1868	int slotno;
1869
1870	LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE);
1871
1872	/ Create and zero the first page of the offsets log /
1873	slotno = ZeroMultiXactOffsetPage(`0`, false);
1874
1875	/ Make sure it's written out /
1876	SimpleLruWritePage(MultiXactOffsetCtl, slotno);
1877	Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]);
1878
1879	LWLockRelease(MultiXactOffsetControlLock);
1880
1881	LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE);
1882
1883	/ Create and zero the first page of the members log /
1884	slotno = ZeroMultiXactMemberPage(`0`, false);
1885
1886	/ Make sure it's written out /
1887	SimpleLruWritePage(MultiXactMemberCtl, slotno);
1888	Assert(!MultiXactMemberCtl->shared->page_dirty[slotno]);
1889
1890	LWLockRelease(MultiXactMemberControlLock);
1891	}
1892
1893	/*
1894	* Initialize (or reinitialize) a page of MultiXactOffset to zeroes.
1895	* If writeXlog is true, also emit an XLOG record saying we did this.
1896	*
1897	* The page is not actually written, just set up in shared memory.
1898	* The slot number of the new page is returned.
1899	*
1900	* Control lock must be held at entry, and will be held at exit.
1901	*/
1902	static int
1903	ZeroMultiXactOffsetPage(int pageno, bool writeXlog)
1904	{
1905	int slotno;
1906
1907	slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno);
1908
1909	if (writeXlog)
1910	WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_OFF_PAGE);
1911
1912	return slotno;
1913	}
1914
1915	/*
1916	* Ditto, for MultiXactMember
1917	*/
1918	static int
1919	ZeroMultiXactMemberPage(int pageno, bool writeXlog)
1920	{
1921	int slotno;
1922
1923	slotno = SimpleLruZeroPage(MultiXactMemberCtl, pageno);
1924
1925	if (writeXlog)
1926	WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_MEM_PAGE);
1927
1928	return slotno;
1929	}
1930
1931	/*
1932	* MaybeExtendOffsetSlru
1933	* Extend the offsets SLRU area, if necessary
1934	*
1935	* After a binary upgrade from <= 9.2, the pg_multixact/offsets SLRU area might
1936	* contain files that are shorter than necessary; this would occur if the old
1937	* installation had used multixacts beyond the first page (files cannot be
1938	* copied, because the on-disk representation is different). pg_upgrade would
1939	* update pg_control to set the next offset value to be at that position, so
1940	* that tuples marked as locked by such MultiXacts would be seen as visible
1941	* without having to consult multixact. However, trying to create and use a
1942	* new MultiXactId would result in an error because the page on which the new
1943	* value would reside does not exist. This routine is in charge of creating
1944	* such pages.
1945	*/
1946	static void
1947	MaybeExtendOffsetSlru(void)
1948	{
1949	int pageno;
1950
1951	pageno = MultiXactIdToOffsetPage(MultiXactState->nextMXact);
1952
1953	LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE);
1954
1955	if (!SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl, pageno))
1956	{
1957	int slotno;
1958
1959	/*
1960	* Fortunately for us, SimpleLruWritePage is already prepared to deal
1961	* with creating a new segment file even if the page we're writing is
1962	* not the first in it, so this is enough.
1963	*/
1964	slotno = ZeroMultiXactOffsetPage(pageno, false);
1965	SimpleLruWritePage(MultiXactOffsetCtl, slotno);
1966	}
1967
1968	LWLockRelease(MultiXactOffsetControlLock);
1969	}
1970
1971	/*
1972	* This must be called ONCE during postmaster or standalone-backend startup.
1973	*
1974	* StartupXLOG has already established nextMXact/nextOffset by calling
1975	* MultiXactSetNextMXact and/or MultiXactAdvanceNextMXact, and the oldestMulti
1976	* info from pg_control and/or MultiXactAdvanceOldest, but we haven't yet
1977	* replayed WAL.
1978	*/
1979	void
1980	StartupMultiXact(void)
1981	{
1982	MultiXactId multi = MultiXactState->nextMXact;
1983	MultiXactOffset offset = MultiXactState->nextOffset;
1984	int pageno;
1985
1986	/*
1987	* Initialize offset's idea of the latest page number.
1988	*/
1989	pageno = MultiXactIdToOffsetPage(multi);
1990	MultiXactOffsetCtl->shared->latest_page_number = pageno;
1991
1992	/*
1993	* Initialize member's idea of the latest page number.
1994	*/
1995	pageno = MXOffsetToMemberPage(offset);
1996	MultiXactMemberCtl->shared->latest_page_number = pageno;
1997	}
1998
1999	/*
2000	* This must be called ONCE at the end of startup/recovery.
2001	*/
2002	void
2003	TrimMultiXact(void)
2004	{
2005	MultiXactId nextMXact;
2006	MultiXactOffset offset;
2007	MultiXactId oldestMXact;
2008	Oid oldestMXactDB;
2009	int pageno;
2010	int entryno;
2011	int flagsoff;
2012
2013	LWLockAcquire(MultiXactGenLock, LW_SHARED);
2014	nextMXact = MultiXactState->nextMXact;
2015	offset = MultiXactState->nextOffset;
2016	oldestMXact = MultiXactState->oldestMultiXactId;
2017	oldestMXactDB = MultiXactState->oldestMultiXactDB;
2018	LWLockRelease(MultiXactGenLock);
2019
2020	/ Clean up offsets state /
2021	LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE);
2022
2023	/*
2024	* (Re-)Initialize our idea of the latest page number for offsets.
2025	*/
2026	pageno = MultiXactIdToOffsetPage(nextMXact);
2027	MultiXactOffsetCtl->shared->latest_page_number = pageno;
2028
2029	/*
2030	* Zero out the remainder of the current offsets page. See notes in
2031	* TrimCLOG() for background. Unlike CLOG, some WAL record covers every
2032	* pg_multixact SLRU mutation. Since, also unlike CLOG, we ignore the WAL
2033	* rule "write xlog before data," nextMXact successors may carry obsolete,
2034	* nonzero offset values. Zero those so case 2 of GetMultiXactIdMembers()
2035	* operates normally.
2036	*/
2037	entryno = MultiXactIdToOffsetEntry(nextMXact);
2038	if (entryno != `0`)
2039	{
2040	int slotno;
2041	MultiXactOffset *offptr;
2042
2043	slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact);
2044	offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
2045	offptr += entryno;
2046
2047	MemSet(offptr, `0`, BLCKSZ - (entryno * sizeof(MultiXactOffset)));
2048
2049	MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
2050	}
2051
2052	LWLockRelease(MultiXactOffsetControlLock);
2053
2054	/ And the same for members /
2055	LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE);
2056
2057	/*
2058	* (Re-)Initialize our idea of the latest page number for members.
2059	*/
2060	pageno = MXOffsetToMemberPage(offset);
2061	MultiXactMemberCtl->shared->latest_page_number = pageno;
2062
2063	/*
2064	* Zero out the remainder of the current members page. See notes in
2065	* TrimCLOG() for motivation.
2066	*/
2067	flagsoff = MXOffsetToFlagsOffset(offset);
2068	if (flagsoff != `0`)
2069	{
2070	int slotno;
2071	TransactionId *xidptr;
2072	int memberoff;
2073
2074	memberoff = MXOffsetToMemberOffset(offset);
2075	slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, offset);
2076	xidptr = (TransactionId *)
2077	(MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
2078
2079	MemSet(xidptr, `0`, BLCKSZ - memberoff);
2080
2081	/*
2082	* Note: we don't need to zero out the flag bits in the remaining
2083	* members of the current group, because they are always reset before
2084	* writing.
2085	*/
2086
2087	MultiXactMemberCtl->shared->page_dirty[slotno] = true;
2088	}
2089
2090	LWLockRelease(MultiXactMemberControlLock);
2091
2092	/ signal that we're officially up /
2093	LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2094	MultiXactState->finishedStartup = true;
2095	LWLockRelease(MultiXactGenLock);
2096
2097	/ Now compute how far away the next members wraparound is. /
2098	SetMultiXactIdLimit(oldestMXact, oldestMXactDB, true);
2099	}
2100
2101	/*
2102	* This must be called ONCE during postmaster or standalone-backend shutdown
2103	*/
2104	void
2105	ShutdownMultiXact(void)
2106	{
2107	/ Flush dirty MultiXact pages to disk /
2108	TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_START(false);
2109	SimpleLruFlush(MultiXactOffsetCtl, false);
2110	SimpleLruFlush(MultiXactMemberCtl, false);
2111	TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(false);
2112	}
2113
2114	/*
2115	* Get the MultiXact data to save in a checkpoint record
2116	*/
2117	void
2118	MultiXactGetCheckptMulti(bool is_shutdown,
2119	MultiXactId *nextMulti,
2120	MultiXactOffset *nextMultiOffset,
2121	MultiXactId *oldestMulti,
2122	Oid *oldestMultiDB)
2123	{
2124	LWLockAcquire(MultiXactGenLock, LW_SHARED);
2125	*nextMulti = MultiXactState->nextMXact;
2126	*nextMultiOffset = MultiXactState->nextOffset;
2127	*oldestMulti = MultiXactState->oldestMultiXactId;
2128	*oldestMultiDB = MultiXactState->oldestMultiXactDB;
2129	LWLockRelease(MultiXactGenLock);
2130
2131	debug_elog6(DEBUG2,
2132	"MultiXact: checkpoint is nextMulti %u, nextOffset %u, oldestMulti %u in DB %u",
2133	nextMulti, nextMultiOffset, oldestMulti, oldestMultiDB);
2134	}
2135
2136	/*
2137	* Perform a checkpoint --- either during shutdown, or on-the-fly
2138	*/
2139	void
2140	CheckPointMultiXact(void)
2141	{
2142	TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_START(true);
2143
2144	/ Flush dirty MultiXact pages to disk /
2145	SimpleLruFlush(MultiXactOffsetCtl, true);
2146	SimpleLruFlush(MultiXactMemberCtl, true);
2147
2148	TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true);
2149	}
2150
2151	/*
2152	* Set the next-to-be-assigned MultiXactId and offset
2153	*
2154	* This is used when we can determine the correct next ID/offset exactly
2155	* from a checkpoint record. Although this is only called during bootstrap
2156	* and XLog replay, we take the lock in case any hot-standby backends are
2157	* examining the values.
2158	*/
2159	void
2160	MultiXactSetNextMXact(MultiXactId nextMulti,
2161	MultiXactOffset nextMultiOffset)
2162	{
2163	debug_elog4(DEBUG2, "MultiXact: setting next multi to %u offset %u",
2164	nextMulti, nextMultiOffset);
2165	LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2166	MultiXactState->nextMXact = nextMulti;
2167	MultiXactState->nextOffset = nextMultiOffset;
2168	LWLockRelease(MultiXactGenLock);
2169
2170	/*
2171	* During a binary upgrade, make sure that the offsets SLRU is large
2172	* enough to contain the next value that would be created.
2173	*
2174	* We need to do this pretty early during the first startup in binary
2175	* upgrade mode: before StartupMultiXact() in fact, because this routine
2176	* is called even before that by StartupXLOG(). And we can't do it
2177	* earlier than at this point, because during that first call of this
2178	* routine we determine the MultiXactState->nextMXact value that
2179	* MaybeExtendOffsetSlru needs.
2180	*/
2181	if (IsBinaryUpgrade)
2182	MaybeExtendOffsetSlru();
2183	}
2184
2185	/*
2186	* Determine the last safe MultiXactId to allocate given the currently oldest
2187	* datminmxid (ie, the oldest MultiXactId that might exist in any database
2188	* of our cluster), and the OID of the (or a) database with that value.
2189	*
2190	* is_startup is true when we are just starting the cluster, false when we
2191	* are updating state in a running cluster. This only affects log messages.
2192	*/
2193	void
2194	SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid,
2195	bool is_startup)
2196	{
2197	MultiXactId multiVacLimit;
2198	MultiXactId multiWarnLimit;
2199	MultiXactId multiStopLimit;
2200	MultiXactId multiWrapLimit;
2201	MultiXactId curMulti;
2202	bool needs_offset_vacuum;
2203
2204	Assert(MultiXactIdIsValid(oldest_datminmxid));
2205
2206	/*
2207	* We pretend that a wrap will happen halfway through the multixact ID
2208	* space, but that's not really true, because multixacts wrap differently
2209	* from transaction IDs. Note that, separately from any concern about
2210	* multixact IDs wrapping, we must ensure that multixact members do not
2211	* wrap. Limits for that are set in SetOffsetVacuumLimit, not here.
2212	*/
2213	multiWrapLimit = oldest_datminmxid + (MaxMultiXactId >> `1`);
2214	if (multiWrapLimit < FirstMultiXactId)
2215	multiWrapLimit += FirstMultiXactId;
2216
2217	/*
2218	* We'll refuse to continue assigning MultiXactIds once we get within 100
2219	* multi of data loss.
2220	*
2221	* Note: This differs from the magic number used in
2222	* SetTransactionIdLimit() since vacuum itself will never generate new
2223	* multis. XXX actually it does, if it needs to freeze old multis.
2224	*/
2225	multiStopLimit = multiWrapLimit - `100`;
2226	if (multiStopLimit < FirstMultiXactId)
2227	multiStopLimit -= FirstMultiXactId;
2228
2229	/*
2230	* We'll start complaining loudly when we get within 10M multis of the
2231	* stop point. This is kind of arbitrary, but if you let your gas gauge
2232	* get down to 1% of full, would you be looking for the next gas station?
2233	* We need to be fairly liberal about this number because there are lots
2234	* of scenarios where most transactions are done by automatic clients that
2235	* won't pay attention to warnings. (No, we're not gonna make this
2236	* configurable. If you know enough to configure it, you know enough to
2237	* not get in this kind of trouble in the first place.)
2238	*/
2239	multiWarnLimit = multiStopLimit - `10000000`;
2240	if (multiWarnLimit < FirstMultiXactId)
2241	multiWarnLimit -= FirstMultiXactId;
2242
2243	/*
2244	* We'll start trying to force autovacuums when oldest_datminmxid gets to
2245	* be more than autovacuum_multixact_freeze_max_age mxids old.
2246	*
2247	* Note: autovacuum_multixact_freeze_max_age is a PGC_POSTMASTER parameter
2248	* so that we don't have to worry about dealing with on-the-fly changes in
2249	* its value. See SetTransactionIdLimit.
2250	*/
2251	multiVacLimit = oldest_datminmxid + autovacuum_multixact_freeze_max_age;
2252	if (multiVacLimit < FirstMultiXactId)
2253	multiVacLimit += FirstMultiXactId;
2254
2255	/ Grab lock for just long enough to set the new limit values /
2256	LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2257	MultiXactState->oldestMultiXactId = oldest_datminmxid;
2258	MultiXactState->oldestMultiXactDB = oldest_datoid;
2259	MultiXactState->multiVacLimit = multiVacLimit;
2260	MultiXactState->multiWarnLimit = multiWarnLimit;
2261	MultiXactState->multiStopLimit = multiStopLimit;
2262	MultiXactState->multiWrapLimit = multiWrapLimit;
2263	curMulti = MultiXactState->nextMXact;
2264	LWLockRelease(MultiXactGenLock);
2265
2266	/ Log the info /
2267	ereport(DEBUG1,
2268	(errmsg("MultiXactId wrap limit is %u, limited by database with OID %u",
2269	multiWrapLimit, oldest_datoid)));
2270
2271	/*
2272	* Computing the actual limits is only possible once the data directory is
2273	* in a consistent state. There's no need to compute the limits while
2274	* still replaying WAL - no decisions about new multis are made even
2275	* though multixact creations might be replayed. So we'll only do further
2276	* checks after TrimMultiXact() has been called.
2277	*/
2278	if (!MultiXactState->finishedStartup)
2279	return;
2280
2281	Assert(!InRecovery);
2282
2283	/ Set limits for offset vacuum. /
2284	needs_offset_vacuum = SetOffsetVacuumLimit(is_startup);
2285
2286	/*
2287	* If past the autovacuum force point, immediately signal an autovac
2288	* request. The reason for this is that autovac only processes one
2289	* database per invocation. Once it's finished cleaning up the oldest
2290	* database, it'll call here, and we'll signal the postmaster to start
2291	* another iteration immediately if there are still any old databases.
2292	*/
2293	if ((MultiXactIdPrecedes(multiVacLimit, curMulti) \|\|
2294	needs_offset_vacuum) && IsUnderPostmaster)
2295	SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
2296
2297	/ Give an immediate warning if past the wrap warn point /
2298	if (MultiXactIdPrecedes(multiWarnLimit, curMulti))
2299	{
2300	char *oldest_datname;
2301
2302	/*
2303	* We can be called when not inside a transaction, for example during
2304	* StartupXLOG(). In such a case we cannot do database access, so we
2305	* must just report the oldest DB's OID.
2306	*
2307	* Note: it's also possible that get_database_name fails and returns
2308	* NULL, for example because the database just got dropped. We'll
2309	* still warn, even though the warning might now be unnecessary.
2310	*/
2311	if (IsTransactionState())
2312	oldest_datname = get_database_name(oldest_datoid);
2313	else
2314	oldest_datname = NULL;
2315
2316	if (oldest_datname)
2317	ereport(WARNING,
2318	(errmsg_plural("database \"%s\" must be vacuumed before %u more MultiXactId is used",
2319	"database \"%s\" must be vacuumed before %u more MultiXactIds are used",
2320	multiWrapLimit - curMulti,
2321	oldest_datname,
2322	multiWrapLimit - curMulti),
2323	errhint("To avoid a database shutdown, execute a database-wide VACUUM in that database.\n"
2324	"You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
2325	else
2326	ereport(WARNING,
2327	(errmsg_plural("database with OID %u must be vacuumed before %u more MultiXactId is used",
2328	"database with OID %u must be vacuumed before %u more MultiXactIds are used",
2329	multiWrapLimit - curMulti,
2330	oldest_datoid,
2331	multiWrapLimit - curMulti),
2332	errhint("To avoid a database shutdown, execute a database-wide VACUUM in that database.\n"
2333	"You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
2334	}
2335	}
2336
2337	/*
2338	* Ensure the next-to-be-assigned MultiXactId is at least minMulti,
2339	* and similarly nextOffset is at least minMultiOffset.
2340	*
2341	* This is used when we can determine minimum safe values from an XLog
2342	* record (either an on-line checkpoint or an mxact creation log entry).
2343	* Although this is only called during XLog replay, we take the lock in case
2344	* any hot-standby backends are examining the values.
2345	*/
2346	void
2347	MultiXactAdvanceNextMXact(MultiXactId minMulti,
2348	MultiXactOffset minMultiOffset)
2349	{
2350	LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2351	if (MultiXactIdPrecedes(MultiXactState->nextMXact, minMulti))
2352	{
2353	debug_elog3(DEBUG2, "MultiXact: setting next multi to %u", minMulti);
2354	MultiXactState->nextMXact = minMulti;
2355	}
2356	if (MultiXactOffsetPrecedes(MultiXactState->nextOffset, minMultiOffset))
2357	{
2358	debug_elog3(DEBUG2, "MultiXact: setting next offset to %u",
2359	minMultiOffset);
2360	MultiXactState->nextOffset = minMultiOffset;
2361	}
2362	LWLockRelease(MultiXactGenLock);
2363	}
2364
2365	/*
2366	* Update our oldestMultiXactId value, but only if it's more recent than what
2367	* we had.
2368	*
2369	* This may only be called during WAL replay.
2370	*/
2371	void
2372	MultiXactAdvanceOldest(MultiXactId oldestMulti, Oid oldestMultiDB)
2373	{
2374	Assert(InRecovery);
2375
2376	if (MultiXactIdPrecedes(MultiXactState->oldestMultiXactId, oldestMulti))
2377	SetMultiXactIdLimit(oldestMulti, oldestMultiDB, false);
2378	}
2379
2380	/*
2381	* Make sure that MultiXactOffset has room for a newly-allocated MultiXactId.
2382	*
2383	* NB: this is called while holding MultiXactGenLock. We want it to be very
2384	* fast most of the time; even when it's not so fast, no actual I/O need
2385	* happen unless we're forced to write out a dirty log or xlog page to make
2386	* room in shared memory.
2387	*/
2388	static void
2389	ExtendMultiXactOffset(MultiXactId multi)
2390	{
2391	int pageno;
2392
2393	/*
2394	* No work except at first MultiXactId of a page. But beware: just after
2395	* wraparound, the first MultiXactId of page zero is FirstMultiXactId.
2396	*/
2397	if (MultiXactIdToOffsetEntry(multi) != `0` &&
2398	multi != FirstMultiXactId)
2399	return;
2400
2401	pageno = MultiXactIdToOffsetPage(multi);
2402
2403	LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE);
2404
2405	/ Zero the page and make an XLOG entry about it /
2406	ZeroMultiXactOffsetPage(pageno, true);
2407
2408	LWLockRelease(MultiXactOffsetControlLock);
2409	}
2410
2411	/*
2412	* Make sure that MultiXactMember has room for the members of a newly-
2413	* allocated MultiXactId.
2414	*
2415	* Like the above routine, this is called while holding MultiXactGenLock;
2416	* same comments apply.
2417	*/
2418	static void
2419	ExtendMultiXactMember(MultiXactOffset offset, int nmembers)
2420	{
2421	/*
2422	* It's possible that the members span more than one page of the members
2423	* file, so we loop to ensure we consider each page. The coding is not
2424	* optimal if the members span several pages, but that seems unusual
2425	* enough to not worry much about.
2426	*/
2427	while (nmembers > `0`)
2428	{
2429	int flagsoff;
2430	int flagsbit;
2431	uint32 difference;
2432
2433	/*
2434	* Only zero when at first entry of a page.
2435	*/
2436	flagsoff = MXOffsetToFlagsOffset(offset);
2437	flagsbit = MXOffsetToFlagsBitShift(offset);
2438	if (flagsoff == `0` && flagsbit == `0`)
2439	{
2440	int pageno;
2441
2442	pageno = MXOffsetToMemberPage(offset);
2443
2444	LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE);
2445
2446	/ Zero the page and make an XLOG entry about it /
2447	ZeroMultiXactMemberPage(pageno, true);
2448
2449	LWLockRelease(MultiXactMemberControlLock);
2450	}
2451
2452	/*
2453	* Compute the number of items till end of current page. Careful: if
2454	* addition of unsigned ints wraps around, we're at the last page of
2455	* the last segment; since that page holds a different number of items
2456	* than other pages, we need to do it differently.
2457	*/
2458	if (offset + MAX_MEMBERS_IN_LAST_MEMBERS_PAGE < offset)
2459	{
2460	/*
2461	* This is the last page of the last segment; we can compute the
2462	* number of items left to allocate in it without modulo
2463	* arithmetic.
2464	*/
2465	difference = MaxMultiXactOffset - offset + `1`;
2466	}
2467	else
2468	difference = MULTIXACT_MEMBERS_PER_PAGE - offset % MULTIXACT_MEMBERS_PER_PAGE;
2469
2470	/*
2471	* Advance to next page, taking care to properly handle the wraparound
2472	* case. OK if nmembers goes negative.
2473	*/
2474	nmembers -= difference;
2475	offset += difference;
2476	}
2477	}
2478
2479	/*
2480	* GetOldestMultiXactId
2481	*
2482	* Return the oldest MultiXactId that's still possibly still seen as live by
2483	* any running transaction. Older ones might still exist on disk, but they no
2484	* longer have any running member transaction.
2485	*
2486	* It's not safe to truncate MultiXact SLRU segments on the value returned by
2487	* this function; however, it can be used by a full-table vacuum to set the
2488	* point at which it will be possible to truncate SLRU for that table.
2489	*/
2490	MultiXactId
2491	GetOldestMultiXactId(void)
2492	{
2493	MultiXactId oldestMXact;
2494	MultiXactId nextMXact;
2495	int i;
2496
2497	/*
2498	* This is the oldest valid value among all the OldestMemberMXactId[] and
2499	* OldestVisibleMXactId[] entries, or nextMXact if none are valid.
2500	*/
2501	LWLockAcquire(MultiXactGenLock, LW_SHARED);
2502
2503	/*
2504	* We have to beware of the possibility that nextMXact is in the
2505	* wrapped-around state. We don't fix the counter itself here, but we
2506	* must be sure to use a valid value in our calculation.
2507	*/
2508	nextMXact = MultiXactState->nextMXact;
2509	if (nextMXact < FirstMultiXactId)
2510	nextMXact = FirstMultiXactId;
2511
2512	oldestMXact = nextMXact;
2513	for (i = `1`; i <= MaxOldestSlot; i++)
2514	{
2515	MultiXactId thisoldest;
2516
2517	thisoldest = OldestMemberMXactId[i];
2518	if (MultiXactIdIsValid(thisoldest) &&
2519	MultiXactIdPrecedes(thisoldest, oldestMXact))
2520	oldestMXact = thisoldest;
2521	thisoldest = OldestVisibleMXactId[i];
2522	if (MultiXactIdIsValid(thisoldest) &&
2523	MultiXactIdPrecedes(thisoldest, oldestMXact))
2524	oldestMXact = thisoldest;
2525	}
2526
2527	LWLockRelease(MultiXactGenLock);
2528
2529	return oldestMXact;
2530	}
2531
2532	/*
2533	* Determine how aggressively we need to vacuum in order to prevent member
2534	* wraparound.
2535	*
2536	* To do so determine what's the oldest member offset and install the limit
2537	* info in MultiXactState, where it can be used to prevent overrun of old data
2538	* in the members SLRU area.
2539	*
2540	* The return value is true if emergency autovacuum is required and false
2541	* otherwise.
2542	*/
2543	static bool
2544	SetOffsetVacuumLimit(bool is_startup)
2545	{
2546	MultiXactId oldestMultiXactId;
2547	MultiXactId nextMXact;
2548	MultiXactOffset oldestOffset = `0`; / placate compiler /
2549	MultiXactOffset prevOldestOffset;
2550	MultiXactOffset nextOffset;
2551	bool oldestOffsetKnown = false;
2552	bool prevOldestOffsetKnown;
2553	MultiXactOffset offsetStopLimit = `0`;
2554	MultiXactOffset prevOffsetStopLimit;
2555
2556	/*
2557	* NB: Have to prevent concurrent truncation, we might otherwise try to
2558	* lookup an oldestMulti that's concurrently getting truncated away.
2559	*/
2560	LWLockAcquire(MultiXactTruncationLock, LW_SHARED);
2561
2562	/ Read relevant fields from shared memory. /
2563	LWLockAcquire(MultiXactGenLock, LW_SHARED);
2564	oldestMultiXactId = MultiXactState->oldestMultiXactId;
2565	nextMXact = MultiXactState->nextMXact;
2566	nextOffset = MultiXactState->nextOffset;
2567	prevOldestOffsetKnown = MultiXactState->oldestOffsetKnown;
2568	prevOldestOffset = MultiXactState->oldestOffset;
2569	prevOffsetStopLimit = MultiXactState->offsetStopLimit;
2570	Assert(MultiXactState->finishedStartup);
2571	LWLockRelease(MultiXactGenLock);
2572
2573	/*
2574	* Determine the offset of the oldest multixact. Normally, we can read
2575	* the offset from the multixact itself, but there's an important special
2576	* case: if there are no multixacts in existence at all, oldestMXact
2577	* obviously can't point to one. It will instead point to the multixact
2578	* ID that will be assigned the next time one is needed.
2579	*/
2580	if (oldestMultiXactId == nextMXact)
2581	{
2582	/*
2583	* When the next multixact gets created, it will be stored at the next
2584	* offset.
2585	*/
2586	oldestOffset = nextOffset;
2587	oldestOffsetKnown = true;
2588	}
2589	else
2590	{
2591	/*
2592	* Figure out where the oldest existing multixact's offsets are
2593	* stored. Due to bugs in early release of PostgreSQL 9.3.X and 9.4.X,
2594	* the supposedly-earliest multixact might not really exist. We are
2595	* careful not to fail in that case.
2596	*/
2597	oldestOffsetKnown =
2598	find_multixact_start(oldestMultiXactId, &oldestOffset);
2599
2600	if (oldestOffsetKnown)
2601	ereport(DEBUG1,
2602	(errmsg("oldest MultiXactId member is at offset %u",
2603	oldestOffset)));
2604	else
2605	ereport(LOG,
2606	(errmsg("MultiXact member wraparound protections are disabled because oldest checkpointed MultiXact %u does not exist on disk",
2607	oldestMultiXactId)));
2608	}
2609
2610	LWLockRelease(MultiXactTruncationLock);
2611
2612	/*
2613	* If we can, compute limits (and install them MultiXactState) to prevent
2614	* overrun of old data in the members SLRU area. We can only do so if the
2615	* oldest offset is known though.
2616	*/
2617	if (oldestOffsetKnown)
2618	{
2619	/ move back to start of the corresponding segment /
2620	offsetStopLimit = oldestOffset - (oldestOffset %
2621	(MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT));
2622
2623	/ always leave one segment before the wraparound point /
2624	offsetStopLimit -= (MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT);
2625
2626	if (!prevOldestOffsetKnown && !is_startup)
2627	ereport(LOG,
2628	(errmsg("MultiXact member wraparound protections are now enabled")));
2629
2630	ereport(DEBUG1,
2631	(errmsg("MultiXact member stop limit is now %u based on MultiXact %u",
2632	offsetStopLimit, oldestMultiXactId)));
2633	}
2634	else if (prevOldestOffsetKnown)
2635	{
2636	/*
2637	* If we failed to get the oldest offset this time, but we have a
2638	* value from a previous pass through this function, use the old
2639	* values rather than automatically forcing an emergency autovacuum
2640	* cycle again.
2641	*/
2642	oldestOffset = prevOldestOffset;
2643	oldestOffsetKnown = true;
2644	offsetStopLimit = prevOffsetStopLimit;
2645	}
2646
2647	/ Install the computed values /
2648	LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2649	MultiXactState->oldestOffset = oldestOffset;
2650	MultiXactState->oldestOffsetKnown = oldestOffsetKnown;
2651	MultiXactState->offsetStopLimit = offsetStopLimit;
2652	LWLockRelease(MultiXactGenLock);
2653
2654	/*
2655	* Do we need an emergency autovacuum? If we're not sure, assume yes.
2656	*/
2657	return !oldestOffsetKnown \|\|
2658	(nextOffset - oldestOffset > MULTIXACT_MEMBER_SAFE_THRESHOLD);
2659	}
2660
2661	/*
2662	* Return whether adding "distance" to "start" would move past "boundary".
2663	*
2664	* We use this to determine whether the addition is "wrapping around" the
2665	* boundary point, hence the name. The reason we don't want to use the regular
2666	* 2^31-modulo arithmetic here is that we want to be able to use the whole of
2667	* the 2^32-1 space here, allowing for more multixacts that would fit
2668	* otherwise.
2669	*/
2670	static bool
2671	MultiXactOffsetWouldWrap(MultiXactOffset boundary, MultiXactOffset start,
2672	uint32 distance)
2673	{
2674	MultiXactOffset finish;
2675
2676	/*
2677	* Note that offset number 0 is not used (see GetMultiXactIdMembers), so
2678	* if the addition wraps around the UINT_MAX boundary, skip that value.
2679	*/
2680	finish = start + distance;
2681	if (finish < start)
2682	finish++;
2683
2684	/-----------------------------------------------------------------------*
2685	* When the boundary is numerically greater than the starting point, any
2686	* value numerically between the two is not wrapped:
2687	*
2688	* <----S----B---->
2689	* [---) = F wrapped past B (and UINT_MAX)
2690	* [---) = F not wrapped
2691	* [----] = F wrapped past B
2692	*
2693	* When the boundary is numerically less than the starting point (i.e. the
2694	* UINT_MAX wraparound occurs somewhere in between) then all values in
2695	* between are wrapped:
2696	*
2697	* <----B----S---->
2698	* [---) = F not wrapped past B (but wrapped past UINT_MAX)
2699	* [---) = F wrapped past B (and UINT_MAX)
2700	* [----] = F not wrapped
2701	*-----------------------------------------------------------------------
2702	*/
2703	if (start < boundary)
2704	return finish >= boundary \|\| finish < start;
2705	else
2706	return finish >= boundary && finish < start;
2707	}
2708
2709	/*
2710	* Find the starting offset of the given MultiXactId.
2711	*
2712	* Returns false if the file containing the multi does not exist on disk.
2713	* Otherwise, returns true and sets *result to the starting member offset.
2714	*
2715	* This function does not prevent concurrent truncation, so if that's
2716	* required, the caller has to protect against that.
2717	*/
2718	static bool
2719	find_multixact_start(MultiXactId multi, MultiXactOffset *result)
2720	{
2721	MultiXactOffset offset;
2722	int pageno;
2723	int entryno;
2724	int slotno;
2725	MultiXactOffset *offptr;
2726
2727	Assert(MultiXactState->finishedStartup);
2728
2729	pageno = MultiXactIdToOffsetPage(multi);
2730	entryno = MultiXactIdToOffsetEntry(multi);
2731
2732	/*
2733	* Flush out dirty data, so PhysicalPageExists can work correctly.
2734	* SimpleLruFlush() is a pretty big hammer for that. Alternatively we
2735	* could add an in-memory version of page exists, but find_multixact_start
2736	* is called infrequently, and it doesn't seem bad to flush buffers to
2737	* disk before truncation.
2738	*/
2739	SimpleLruFlush(MultiXactOffsetCtl, true);
2740	SimpleLruFlush(MultiXactMemberCtl, true);
2741
2742	if (!SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl, pageno))
2743	return false;
2744
2745	/ lock is acquired by SimpleLruReadPage_ReadOnly /
2746	slotno = SimpleLruReadPage_ReadOnly(MultiXactOffsetCtl, pageno, multi);
2747	offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
2748	offptr += entryno;
2749	offset = *offptr;
2750	LWLockRelease(MultiXactOffsetControlLock);
2751
2752	*result = offset;
2753	return true;
2754	}
2755
2756	/*
2757	* Determine how many multixacts, and how many multixact members, currently
2758	* exist. Return false if unable to determine.
2759	*/
2760	static bool
2761	ReadMultiXactCounts(uint32 multixacts, MultiXactOffset members)
2762	{
2763	MultiXactOffset nextOffset;
2764	MultiXactOffset oldestOffset;
2765	MultiXactId oldestMultiXactId;
2766	MultiXactId nextMultiXactId;
2767	bool oldestOffsetKnown;
2768
2769	LWLockAcquire(MultiXactGenLock, LW_SHARED);
2770	nextOffset = MultiXactState->nextOffset;
2771	oldestMultiXactId = MultiXactState->oldestMultiXactId;
2772	nextMultiXactId = MultiXactState->nextMXact;
2773	oldestOffset = MultiXactState->oldestOffset;
2774	oldestOffsetKnown = MultiXactState->oldestOffsetKnown;
2775	LWLockRelease(MultiXactGenLock);
2776
2777	if (!oldestOffsetKnown)
2778	return false;
2779
2780	*members = nextOffset - oldestOffset;
2781	*multixacts = nextMultiXactId - oldestMultiXactId;
2782	return true;
2783	}
2784
2785	/*
2786	* Multixact members can be removed once the multixacts that refer to them
2787	* are older than every datminxmid. autovacuum_multixact_freeze_max_age and
2788	* vacuum_multixact_freeze_table_age work together to make sure we never have
2789	* too many multixacts; we hope that, at least under normal circumstances,
2790	* this will also be sufficient to keep us from using too many offsets.
2791	* However, if the average multixact has many members, we might exhaust the
2792	* members space while still using few enough members that these limits fail
2793	* to trigger full table scans for relminmxid advancement. At that point,
2794	* we'd have no choice but to start failing multixact-creating operations
2795	* with an error.
2796	*
2797	* To prevent that, if more than a threshold portion of the members space is
2798	* used, we effectively reduce autovacuum_multixact_freeze_max_age and
2799	* to a value just less than the number of multixacts in use. We hope that
2800	* this will quickly trigger autovacuuming on the table or tables with the
2801	* oldest relminmxid, thus allowing datminmxid values to advance and removing
2802	* some members.
2803	*
2804	* As the fraction of the member space currently in use grows, we become
2805	* more aggressive in clamping this value. That not only causes autovacuum
2806	* to ramp up, but also makes any manual vacuums the user issues more
2807	* aggressive. This happens because vacuum_set_xid_limits() clamps the
2808	* freeze table and the minimum freeze age based on the effective
2809	* autovacuum_multixact_freeze_max_age this function returns. In the worst
2810	* case, we'll claim the freeze_max_age to zero, and every vacuum of any
2811	* table will try to freeze every multixact.
2812	*
2813	* It's possible that these thresholds should be user-tunable, but for now
2814	* we keep it simple.
2815	*/
2816	int
2817	MultiXactMemberFreezeThreshold(void)
2818	{
2819	MultiXactOffset members;
2820	uint32 multixacts;
2821	uint32 victim_multixacts;
2822	double fraction;
2823
2824	/ If we can't determine member space utilization, assume the worst. /
2825	if (!ReadMultiXactCounts(&multixacts, &members))
2826	return `0`;
2827
2828	/ If member space utilization is low, no special action is required. /
2829	if (members <= MULTIXACT_MEMBER_SAFE_THRESHOLD)
2830	return autovacuum_multixact_freeze_max_age;
2831
2832	/*
2833	* Compute a target for relminmxid advancement. The number of multixacts
2834	* we try to eliminate from the system is based on how far we are past
2835	* MULTIXACT_MEMBER_SAFE_THRESHOLD.
2836	*/
2837	fraction = (double) (members - MULTIXACT_MEMBER_SAFE_THRESHOLD) /
2838	(MULTIXACT_MEMBER_DANGER_THRESHOLD - MULTIXACT_MEMBER_SAFE_THRESHOLD);
2839	victim_multixacts = multixacts * fraction;
2840
2841	/ fraction could be > 1.0, but lowest possible freeze age is zero /
2842	if (victim_multixacts > multixacts)
2843	return `0`;
2844	return multixacts - victim_multixacts;
2845	}
2846
/*
 * Working state for the directory scan that looks for the earliest offsets
 * page still present on disk.
 */
typedef struct mxtruncinfo {
	/* lowest page number seen so far; -1 means "none seen yet" */
	int			earliestExistingPage;
} mxtruncinfo;
2851
2852	/*
2853	* SlruScanDirectory callback
2854	* This callback determines the earliest existing page number.
2855	*/
2856	static bool
2857	SlruScanDirCbFindEarliest(SlruCtl ctl, char filename, int* segpage, void *data)
2858	{
2859	mxtruncinfo trunc = (mxtruncinfo ) data;
2860
2861	if (trunc->earliestExistingPage == -`1` \|\|
2862	ctl->PagePrecedes(segpage, trunc->earliestExistingPage))
2863	{
2864	trunc->earliestExistingPage = segpage;
2865	}
2866
2867	return false; / keep going /
2868	}
2869
2870
2871	/*
2872	* Delete members segments [oldest, newOldest)
2873	*
2874	* The members SLRU can, in contrast to the offsets one, be filled to almost
2875	* the full range at once. This means SimpleLruTruncate() can't trivially be
2876	* used - instead the to-be-deleted range is computed using the offsets
2877	* SLRU. C.f. TruncateMultiXact().
2878	*/
2879	static void
2880	PerformMembersTruncation(MultiXactOffset oldestOffset, MultiXactOffset newOldestOffset)
2881	{
2882	const int maxsegment = MXOffsetToMemberSegment(MaxMultiXactOffset);
2883	int startsegment = MXOffsetToMemberSegment(oldestOffset);
2884	int endsegment = MXOffsetToMemberSegment(newOldestOffset);
2885	int segment = startsegment;
2886
2887	/*
2888	* Delete all the segments but the last one. The last segment can still
2889	* contain, possibly partially, valid data.
2890	*/
2891	while (segment != endsegment)
2892	{
2893	elog(DEBUG2, "truncating multixact members segment %x", segment);
2894	SlruDeleteSegment(MultiXactMemberCtl, segment);
2895
2896	/ move to next segment, handling wraparound correctly /
2897	if (segment == maxsegment)
2898	segment = `0`;
2899	else
2900	segment += `1`;
2901	}
2902	}
2903
2904	/*
2905	* Delete offsets segments [oldest, newOldest)
2906	*/
2907	static void
2908	PerformOffsetsTruncation(MultiXactId oldestMulti, MultiXactId newOldestMulti)
2909	{
2910	/*
2911	* We step back one multixact to avoid passing a cutoff page that hasn't
2912	* been created yet in the rare case that oldestMulti would be the first
2913	* item on a page and oldestMulti == nextMulti. In that case, if we
2914	* didn't subtract one, we'd trigger SimpleLruTruncate's wraparound
2915	* detection.
2916	*/
2917	SimpleLruTruncate(MultiXactOffsetCtl,
2918	MultiXactIdToOffsetPage(PreviousMultiXactId(newOldestMulti)));
2919	}
2920
2921	/*
2922	* Remove all MultiXactOffset and MultiXactMember segments before the oldest
2923	* ones still of interest.
2924	*
2925	* This is only called on a primary as part of vacuum (via
2926	* vac_truncate_clog()). During recovery truncation is done by replaying
2927	* truncation WAL records logged here.
2928	*
2929	* newOldestMulti is the oldest currently required multixact, newOldestMultiDB
2930	* is one of the databases preventing newOldestMulti from increasing.
2931	*/
2932	void
2933	TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB)
2934	{
2935	MultiXactId oldestMulti;
2936	MultiXactId nextMulti;
2937	MultiXactOffset newOldestOffset;
2938	MultiXactOffset oldestOffset;
2939	MultiXactOffset nextOffset;
2940	mxtruncinfo trunc;
2941	MultiXactId earliest;
2942
2943	Assert(!RecoveryInProgress());
2944	Assert(MultiXactState->finishedStartup);
2945
2946	/*
2947	* We can only allow one truncation to happen at once. Otherwise parts of
2948	* members might vanish while we're doing lookups or similar. There's no
2949	* need to have an interlock with creating new multis or such, since those
2950	* are constrained by the limits (which only grow, never shrink).
2951	*/
2952	LWLockAcquire(MultiXactTruncationLock, LW_EXCLUSIVE);
2953
2954	LWLockAcquire(MultiXactGenLock, LW_SHARED);
2955	nextMulti = MultiXactState->nextMXact;
2956	nextOffset = MultiXactState->nextOffset;
2957	oldestMulti = MultiXactState->oldestMultiXactId;
2958	LWLockRelease(MultiXactGenLock);
2959	Assert(MultiXactIdIsValid(oldestMulti));
2960
2961	/*
2962	* Make sure to only attempt truncation if there's values to truncate
2963	* away. In normal processing values shouldn't go backwards, but there's
2964	* some corner cases (due to bugs) where that's possible.
2965	*/
2966	if (MultiXactIdPrecedesOrEquals(newOldestMulti, oldestMulti))
2967	{
2968	LWLockRelease(MultiXactTruncationLock);
2969	return;
2970	}
2971
2972	/*
2973	* Note we can't just plow ahead with the truncation; it's possible that
2974	* there are no segments to truncate, which is a problem because we are
2975	* going to attempt to read the offsets page to determine where to
2976	* truncate the members SLRU. So we first scan the directory to determine
2977	* the earliest offsets page number that we can read without error.
2978	*
2979	* NB: It's also possible that the page that oldestMulti is on has already
2980	* been truncated away, and we crashed before updating oldestMulti.
2981	*/
2982	trunc.earliestExistingPage = -`1`;
2983	SlruScanDirectory(MultiXactOffsetCtl, SlruScanDirCbFindEarliest, &trunc);
2984	earliest = trunc.earliestExistingPage * MULTIXACT_OFFSETS_PER_PAGE;
2985	if (earliest < FirstMultiXactId)
2986	earliest = FirstMultiXactId;
2987
2988	/ If there's nothing to remove, we can bail out early. /
2989	if (MultiXactIdPrecedes(oldestMulti, earliest))
2990	{
2991	LWLockRelease(MultiXactTruncationLock);
2992	return;
2993	}
2994
2995	/*
2996	* First, compute the safe truncation point for MultiXactMember. This is
2997	* the starting offset of the oldest multixact.
2998	*
2999	* Hopefully, find_multixact_start will always work here, because we've
3000	* already checked that it doesn't precede the earliest MultiXact on disk.
3001	* But if it fails, don't truncate anything, and log a message.
3002	*/
3003	if (oldestMulti == nextMulti)
3004	{
3005	/ there are NO MultiXacts /
3006	oldestOffset = nextOffset;
3007	}
3008	else if (!find_multixact_start(oldestMulti, &oldestOffset))
3009	{
3010	ereport(LOG,
3011	(errmsg("oldest MultiXact %u not found, earliest MultiXact %u, skipping truncation",
3012	oldestMulti, earliest)));
3013	LWLockRelease(MultiXactTruncationLock);
3014	return;
3015	}
3016
3017	/*
3018	* Secondly compute up to where to truncate. Lookup the corresponding
3019	* member offset for newOldestMulti for that.
3020	*/
3021	if (newOldestMulti == nextMulti)
3022	{
3023	/ there are NO MultiXacts /
3024	newOldestOffset = nextOffset;
3025	}
3026	else if (!find_multixact_start(newOldestMulti, &newOldestOffset))
3027	{
3028	ereport(LOG,
3029	(errmsg("cannot truncate up to MultiXact %u because it does not exist on disk, skipping truncation",
3030	newOldestMulti)));
3031	LWLockRelease(MultiXactTruncationLock);
3032	return;
3033	}
3034
3035	elog(DEBUG1, "performing multixact truncation: "
3036	"offsets [%u, %u), offsets segments [%x, %x), "
3037	"members [%u, %u), members segments [%x, %x)",
3038	oldestMulti, newOldestMulti,
3039	MultiXactIdToOffsetSegment(oldestMulti),
3040	MultiXactIdToOffsetSegment(newOldestMulti),
3041	oldestOffset, newOldestOffset,
3042	MXOffsetToMemberSegment(oldestOffset),
3043	MXOffsetToMemberSegment(newOldestOffset));
3044
3045	/*
3046	* Do truncation, and the WAL logging of the truncation, in a critical
3047	* section. That way offsets/members cannot get out of sync anymore, i.e.
3048	* once consistent the newOldestMulti will always exist in members, even
3049	* if we crashed in the wrong moment.
3050	*/
3051	START_CRIT_SECTION();
3052
3053	/*
3054	* Prevent checkpoints from being scheduled concurrently. This is critical
3055	* because otherwise a truncation record might not be replayed after a
3056	* crash/basebackup, even though the state of the data directory would
3057	* require it.
3058	*/
3059	Assert(!MyPgXact->delayChkpt);
3060	MyPgXact->delayChkpt = true;
3061
3062	/ WAL log truncation /
3063	WriteMTruncateXlogRec(newOldestMultiDB,
3064	oldestMulti, newOldestMulti,
3065	oldestOffset, newOldestOffset);
3066
3067	/*
3068	* Update in-memory limits before performing the truncation, while inside
3069	* the critical section: Have to do it before truncation, to prevent
3070	* concurrent lookups of those values. Has to be inside the critical
3071	* section as otherwise a future call to this function would error out,
3072	* while looking up the oldest member in offsets, if our caller crashes
3073	* before updating the limits.
3074	*/
3075	LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
3076	MultiXactState->oldestMultiXactId = newOldestMulti;
3077	MultiXactState->oldestMultiXactDB = newOldestMultiDB;
3078	LWLockRelease(MultiXactGenLock);
3079
3080	/ First truncate members /
3081	PerformMembersTruncation(oldestOffset, newOldestOffset);
3082
3083	/ Then offsets /
3084	PerformOffsetsTruncation(oldestMulti, newOldestMulti);
3085
3086	MyPgXact->delayChkpt = false;
3087
3088	END_CRIT_SECTION();
3089	LWLockRelease(MultiXactTruncationLock);
3090	}
3091
3092	/*
3093	* Decide which of two MultiXactOffset page numbers is "older" for truncation
3094	* purposes.
3095	*
3096	* We need to use comparison of MultiXactId here in order to do the right
3097	* thing with wraparound. However, if we are asked about page number zero, we
3098	* don't want to hand InvalidMultiXactId to MultiXactIdPrecedes: it'll get
3099	* weird. So, offset both multis by FirstMultiXactId to avoid that.
3100	* (Actually, the current implementation doesn't do anything weird with
3101	* InvalidMultiXactId, but there's no harm in leaving this code like this.)
3102	*/
3103	static bool
3104	MultiXactOffsetPagePrecedes(int page1, int page2)
3105	{
3106	MultiXactId multi1;
3107	MultiXactId multi2;
3108
3109	multi1 = ((MultiXactId) page1) * MULTIXACT_OFFSETS_PER_PAGE;
3110	multi1 += FirstMultiXactId;
3111	multi2 = ((MultiXactId) page2) * MULTIXACT_OFFSETS_PER_PAGE;
3112	multi2 += FirstMultiXactId;
3113
3114	return MultiXactIdPrecedes(multi1, multi2);
3115	}
3116
3117	/*
3118	* Decide which of two MultiXactMember page numbers is "older" for truncation
3119	* purposes. There is no "invalid offset number" so use the numbers verbatim.
3120	*/
3121	static bool
3122	MultiXactMemberPagePrecedes(int page1, int page2)
3123	{
3124	MultiXactOffset offset1;
3125	MultiXactOffset offset2;
3126
3127	offset1 = ((MultiXactOffset) page1) * MULTIXACT_MEMBERS_PER_PAGE;
3128	offset2 = ((MultiXactOffset) page2) * MULTIXACT_MEMBERS_PER_PAGE;
3129
3130	return MultiXactOffsetPrecedes(offset1, offset2);
3131	}
3132
3133	/*
3134	* Decide which of two MultiXactIds is earlier.
3135	*
3136	* XXX do we need to do something special for InvalidMultiXactId?
3137	* (Doesn't look like it.)
3138	*/
3139	bool
3140	MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2)
3141	{
3142	int32 diff = (int32) (multi1 - multi2);
3143
3144	return (diff < `0`);
3145	}
3146
3147	/*
3148	* MultiXactIdPrecedesOrEquals -- is multi1 logically <= multi2?
3149	*
3150	* XXX do we need to do something special for InvalidMultiXactId?
3151	* (Doesn't look like it.)
3152	*/
3153	bool
3154	MultiXactIdPrecedesOrEquals(MultiXactId multi1, MultiXactId multi2)
3155	{
3156	int32 diff = (int32) (multi1 - multi2);
3157
3158	return (diff <= `0`);
3159	}
3160
3161
3162	/*
3163	* Decide which of two offsets is earlier.
3164	*/
3165	static bool
3166	MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2)
3167	{
3168	int32 diff = (int32) (offset1 - offset2);
3169
3170	return (diff < `0`);
3171	}
3172
3173	/*
3174	* Write an xlog record reflecting the zeroing of either a MEMBERs or
3175	* OFFSETs page (info shows which)
3176	*/
3177	static void
3178	WriteMZeroPageXlogRec(int pageno, uint8 info)
3179	{
3180	XLogBeginInsert();
3181	XLogRegisterData((char ) (&pageno), sizeof(int*));
3182	(void) XLogInsert(RM_MULTIXACT_ID, info);
3183	}
3184
3185	/*
3186	* Write a TRUNCATE xlog record
3187	*
3188	* We must flush the xlog record to disk before returning --- see notes in
3189	* TruncateCLOG().
3190	*/
3191	static void
3192	WriteMTruncateXlogRec(Oid oldestMultiDB,
3193	MultiXactId startTruncOff, MultiXactId endTruncOff,
3194	MultiXactOffset startTruncMemb, MultiXactOffset endTruncMemb)
3195	{
3196	XLogRecPtr recptr;
3197	xl_multixact_truncate xlrec;
3198
3199	xlrec.oldestMultiDB = oldestMultiDB;
3200
3201	xlrec.startTruncOff = startTruncOff;
3202	xlrec.endTruncOff = endTruncOff;
3203
3204	xlrec.startTruncMemb = startTruncMemb;
3205	xlrec.endTruncMemb = endTruncMemb;
3206
3207	XLogBeginInsert();
3208	XLogRegisterData((char *) (&xlrec), SizeOfMultiXactTruncate);
3209	recptr = XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_TRUNCATE_ID);
3210	XLogFlush(recptr);
3211	}
3212
3213	/*
3214	* MULTIXACT resource manager's routines
3215	*/
3216	void
3217	multixact_redo(XLogReaderState *record)
3218	{
3219	uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
3220
3221	/ Backup blocks are not used in multixact records /
3222	Assert(!XLogRecHasAnyBlockRefs(record));
3223
3224	if (info == XLOG_MULTIXACT_ZERO_OFF_PAGE)
3225	{
3226	int pageno;
3227	int slotno;
3228
3229	memcpy(&pageno, XLogRecGetData(record), sizeof(int));
3230
3231	LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE);
3232
3233	slotno = ZeroMultiXactOffsetPage(pageno, false);
3234	SimpleLruWritePage(MultiXactOffsetCtl, slotno);
3235	Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]);
3236
3237	LWLockRelease(MultiXactOffsetControlLock);
3238	}
3239	else if (info == XLOG_MULTIXACT_ZERO_MEM_PAGE)
3240	{
3241	int pageno;
3242	int slotno;
3243
3244	memcpy(&pageno, XLogRecGetData(record), sizeof(int));
3245
3246	LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE);
3247
3248	slotno = ZeroMultiXactMemberPage(pageno, false);
3249	SimpleLruWritePage(MultiXactMemberCtl, slotno);
3250	Assert(!MultiXactMemberCtl->shared->page_dirty[slotno]);
3251
3252	LWLockRelease(MultiXactMemberControlLock);
3253	}
3254	else if (info == XLOG_MULTIXACT_CREATE_ID)
3255	{
3256	xl_multixact_create *xlrec =
3257	(xl_multixact_create *) XLogRecGetData(record);
3258	TransactionId max_xid;
3259	int i;
3260
3261	/ Store the data back into the SLRU files /
3262	RecordNewMultiXact(xlrec->mid, xlrec->moff, xlrec->nmembers,
3263	xlrec->members);
3264
3265	/ Make sure nextMXact/nextOffset are beyond what this record has /
3266	MultiXactAdvanceNextMXact(xlrec->mid + `1`,
3267	xlrec->moff + xlrec->nmembers);
3268
3269	/*
3270	* Make sure nextFullXid is beyond any XID mentioned in the record.
3271	* This should be unnecessary, since any XID found here ought to have
3272	* other evidence in the XLOG, but let's be safe.
3273	*/
3274	max_xid = XLogRecGetXid(record);
3275	for (i = `0`; i < xlrec->nmembers; i++)
3276	{
3277	if (TransactionIdPrecedes(max_xid, xlrec->members[i].xid))
3278	max_xid = xlrec->members[i].xid;
3279	}
3280
3281	AdvanceNextFullTransactionIdPastXid(max_xid);
3282	}
3283	else if (info == XLOG_MULTIXACT_TRUNCATE_ID)
3284	{
3285	xl_multixact_truncate xlrec;
3286	int pageno;
3287
3288	memcpy(&xlrec, XLogRecGetData(record),
3289	SizeOfMultiXactTruncate);
3290
3291	elog(DEBUG1, "replaying multixact truncation: "
3292	"offsets [%u, %u), offsets segments [%x, %x), "
3293	"members [%u, %u), members segments [%x, %x)",
3294	xlrec.startTruncOff, xlrec.endTruncOff,
3295	MultiXactIdToOffsetSegment(xlrec.startTruncOff),
3296	MultiXactIdToOffsetSegment(xlrec.endTruncOff),
3297	xlrec.startTruncMemb, xlrec.endTruncMemb,
3298	MXOffsetToMemberSegment(xlrec.startTruncMemb),
3299	MXOffsetToMemberSegment(xlrec.endTruncMemb));
3300
3301	/ should not be required, but more than cheap enough /
3302	LWLockAcquire(MultiXactTruncationLock, LW_EXCLUSIVE);
3303
3304	/*
3305	* Advance the horizon values, so they're current at the end of
3306	* recovery.
3307	*/
3308	SetMultiXactIdLimit(xlrec.endTruncOff, xlrec.oldestMultiDB, false);
3309
3310	PerformMembersTruncation(xlrec.startTruncMemb, xlrec.endTruncMemb);
3311
3312	/*
3313	* During XLOG replay, latest_page_number isn't necessarily set up
3314	* yet; insert a suitable value to bypass the sanity test in
3315	* SimpleLruTruncate.
3316	*/
3317	pageno = MultiXactIdToOffsetPage(xlrec.endTruncOff);
3318	MultiXactOffsetCtl->shared->latest_page_number = pageno;
3319	PerformOffsetsTruncation(xlrec.startTruncOff, xlrec.endTruncOff);
3320
3321	LWLockRelease(MultiXactTruncationLock);
3322	}
3323	else
3324	elog(PANIC, "multixact_redo: unknown op code %u", info);
3325	}
3326
3327	Datum
3328	pg_get_multixact_members(PG_FUNCTION_ARGS)
3329	{
3330	typedef struct
3331	{
3332	MultiXactMember *members;
3333	int nmembers;
3334	int iter;
3335	} mxact;
3336	MultiXactId mxid = PG_GETARG_UINT32(`0`);
3337	mxact *multi;
3338	FuncCallContext *funccxt;
3339
3340	if (mxid < FirstMultiXactId)
3341	ereport(ERROR,
3342	(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
3343	errmsg("invalid MultiXactId: %u", mxid)));
3344
3345	if (SRF_IS_FIRSTCALL())
3346	{
3347	MemoryContext oldcxt;
3348	TupleDesc tupdesc;
3349
3350	funccxt = SRF_FIRSTCALL_INIT();
3351	oldcxt = MemoryContextSwitchTo(funccxt->multi_call_memory_ctx);
3352
3353	multi = palloc(sizeof(mxact));
3354	/ no need to allow for old values here /
3355	multi->nmembers = GetMultiXactIdMembers(mxid, &multi->members, false,
3356	false);
3357	multi->iter = `0`;
3358
3359	tupdesc = CreateTemplateTupleDesc(`2`);
3360	TupleDescInitEntry(tupdesc, (AttrNumber) `1`, "xid",
3361	XIDOID, -`1`, `0`);
3362	TupleDescInitEntry(tupdesc, (AttrNumber) `2`, "mode",
3363	TEXTOID, -`1`, `0`);
3364
3365	funccxt->attinmeta = TupleDescGetAttInMetadata(tupdesc);
3366	funccxt->user_fctx = multi;
3367
3368	MemoryContextSwitchTo(oldcxt);
3369	}
3370
3371	funccxt = SRF_PERCALL_SETUP();
3372	multi = (mxact *) funccxt->user_fctx;
3373
3374	while (multi->iter < multi->nmembers)
3375	{
3376	HeapTuple tuple;
3377	char *values[`2`];
3378
3379	values[`0`] = psprintf("%u", multi->members[multi->iter].xid);
3380	values[`1`] = mxstatus_to_string(multi->members[multi->iter].status);
3381
3382	tuple = BuildTupleFromCStrings(funccxt->attinmeta, values);
3383
3384	multi->iter++;
3385	pfree(values[`0`]);
3386	SRF_RETURN_NEXT(funccxt, HeapTupleGetDatum(tuple));
3387	}
3388
3389	if (multi->nmembers > `0`)
3390	pfree(multi->members);
3391	pfree(multi);
3392
3393	SRF_RETURN_DONE(funccxt);
3394	}
3395

Browse the source code of PostgreSQL/src/backend/access/transam/multixact.c