1 | /*------------------------------------------------------------------------- |
2 | * |
3 | * multixact.c |
4 | * PostgreSQL multi-transaction-log manager |
5 | * |
6 | * The pg_multixact manager is a pg_xact-like manager that stores an array of |
7 | * MultiXactMember for each MultiXactId. It is a fundamental part of the |
8 | * shared-row-lock implementation. Each MultiXactMember is comprised of a |
9 | * TransactionId and a set of flag bits. The name is a bit historical: |
10 | * originally, a MultiXactId consisted of more than one TransactionId (except |
11 | * in rare corner cases), hence "multi". Nowadays, however, it's perfectly |
12 | * legitimate to have MultiXactIds that only include a single Xid. |
13 | * |
14 | * The meaning of the flag bits is opaque to this module, but they are mostly |
15 | * used in heapam.c to identify lock modes that each of the member transactions |
16 | * is holding on any given tuple. This module just contains support to store |
17 | * and retrieve the arrays. |
18 | * |
19 | * We use two SLRU areas, one for storing the offsets at which the data |
20 | * starts for each MultiXactId in the other one. This trick allows us to |
21 | * store variable length arrays of TransactionIds. (We could alternatively |
22 | * use one area containing counts and TransactionIds, with valid MultiXactId |
23 | * values pointing at slots containing counts; but that way seems less robust |
24 | * since it would get completely confused if someone inquired about a bogus |
25 | * MultiXactId that pointed to an intermediate slot containing an XID.) |
26 | * |
27 | * XLOG interactions: this module generates a record whenever a new OFFSETs or |
28 | * MEMBERs page is initialized to zeroes, as well as an |
29 | * XLOG_MULTIXACT_CREATE_ID record whenever a new MultiXactId is defined. |
30 | * This module ignores the WAL rule "write xlog before data," because it |
31 | * suffices that actions recording a MultiXactId in a heap xmax do follow that |
32 | * rule. The only way for the MXID to be referenced from any data page is for |
33 | * heap_lock_tuple() or heap_update() to have put it there, and each generates |
34 | * an XLOG record that must follow ours. The normal LSN interlock between the |
35 | * data page and that XLOG record will ensure that our XLOG record reaches |
36 | * disk first. If the SLRU members/offsets data reaches disk sooner than the |
37 | * XLOG records, we do not care; after recovery, no xmax will refer to it. On |
38 | * the flip side, to ensure that all referenced entries _do_ reach disk, this |
39 | * module's XLOG records completely rebuild the data entered since the last |
40 | * checkpoint. We flush and sync all dirty OFFSETs and MEMBERs pages to disk |
41 | * before each checkpoint is considered complete. |
42 | * |
43 | * Like clog.c, and unlike subtrans.c, we have to preserve state across |
44 | * crashes and ensure that MXID and offset numbering increases monotonically |
45 | * across a crash. We do this in the same way as it's done for transaction |
46 | * IDs: the WAL record is guaranteed to contain evidence of every MXID we |
47 | * could need to worry about, and we just make sure that at the end of |
48 | * replay, the next-MXID and next-offset counters are at least as large as |
49 | * anything we saw during replay. |
50 | * |
51 | * We are able to remove segments no longer necessary by carefully tracking |
52 | * each table's used values: during vacuum, any multixact older than a certain |
53 | * value is removed; the cutoff value is stored in pg_class. The minimum value |
54 | * across all tables in each database is stored in pg_database, and the global |
55 | * minimum across all databases is part of pg_control and is kept in shared |
56 | * memory. Whenever that minimum is advanced, the SLRUs are truncated. |
57 | * |
58 | * When new multixactid values are to be created, care is taken that the |
59 | * counter does not fall within the wraparound horizon considering the global |
60 | * minimum value. |
61 | * |
62 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
63 | * Portions Copyright (c) 1994, Regents of the University of California |
64 | * |
65 | * src/backend/access/transam/multixact.c |
66 | * |
67 | *------------------------------------------------------------------------- |
68 | */ |
69 | #include "postgres.h" |
70 | |
71 | #include "access/multixact.h" |
72 | #include "access/slru.h" |
73 | #include "access/transam.h" |
74 | #include "access/twophase.h" |
75 | #include "access/twophase_rmgr.h" |
76 | #include "access/xact.h" |
77 | #include "access/xlog.h" |
78 | #include "access/xloginsert.h" |
79 | #include "catalog/pg_type.h" |
80 | #include "commands/dbcommands.h" |
81 | #include "funcapi.h" |
82 | #include "lib/ilist.h" |
83 | #include "miscadmin.h" |
84 | #include "pg_trace.h" |
85 | #include "postmaster/autovacuum.h" |
86 | #include "storage/lmgr.h" |
87 | #include "storage/pmsignal.h" |
88 | #include "storage/proc.h" |
89 | #include "storage/procarray.h" |
90 | #include "utils/builtins.h" |
91 | #include "utils/memutils.h" |
92 | #include "utils/snapmgr.h" |
93 | |
94 | |
95 | /* |
96 | * Defines for MultiXactOffset page sizes. A page is the same BLCKSZ as is |
97 | * used everywhere else in Postgres. |
98 | * |
99 | * Note: because MultiXactOffsets are 32 bits and wrap around at 0xFFFFFFFF, |
100 | * MultiXact page numbering also wraps around at |
101 | * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE, and segment numbering at |
102 | * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. We need |
103 | * take no explicit notice of that fact in this module, except when comparing |
104 | * segment and page numbers in TruncateMultiXact (see |
105 | * MultiXactOffsetPagePrecedes). |
106 | */ |
107 | |
/* Each entry in an offsets page occupies sizeof(MultiXactOffset) bytes (four) */
#define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset))

/* Offsets page that holds the entry for the given MultiXactId */
#define MultiXactIdToOffsetPage(multi) \
	((multi) / (MultiXactOffset) MULTIXACT_OFFSETS_PER_PAGE)
/* Index of that entry within its offsets page */
#define MultiXactIdToOffsetEntry(multi) \
	((multi) % (MultiXactOffset) MULTIXACT_OFFSETS_PER_PAGE)
/* Offsets segment that holds that page */
#define MultiXactIdToOffsetSegment(multi) \
	(MultiXactIdToOffsetPage(multi) / SLRU_PAGES_PER_SEGMENT)
116 | |
117 | /* |
118 | * The situation for members is a bit more complex: we store one byte of |
119 | * additional flag bits for each TransactionId. To do this without getting |
120 | * into alignment issues, we store four bytes of flags, and then the |
 * corresponding 4 Xids.  We call each such 5-word (20-byte) set a "group";
 * groups are stored as a whole in pages.  Thus, with 8kB BLCKSZ, we keep 409 groups
123 | * per page. This wastes 12 bytes per page, but that's OK -- simplicity (and |
124 | * performance) trumps space efficiency here. |
125 | * |
126 | * Note that the "offset" macros work with byte offset, not array indexes, so |
127 | * arithmetic must be done using "char *" pointers. |
128 | */ |
/* We need eight flag bits per xact, so the flags of one xact fill one byte */
#define MXACT_MEMBER_BITS_PER_XACT 8
#define MXACT_MEMBER_FLAGS_PER_BYTE 1
/* mask covering the low MXACT_MEMBER_BITS_PER_XACT bits, i.e. one member's flags */
#define MXACT_MEMBER_XACT_BITMASK ((1 << MXACT_MEMBER_BITS_PER_XACT) - 1)

/* how many full bytes of flags are there in a group? */
#define MULTIXACT_FLAGBYTES_PER_GROUP 4
/* number of members per group: one per flag slot in the group's flag bytes */
#define MULTIXACT_MEMBERS_PER_MEMBERGROUP \
	(MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE)
/* size in bytes of a complete group: the member Xids plus the flag bytes */
#define MULTIXACT_MEMBERGROUP_SIZE \
	(sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP)
/* whole groups that fit in one page; any remainder of the page is unused */
#define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE)
/* total members stored in one page */
#define MULTIXACT_MEMBERS_PER_PAGE \
	(MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP)
144 | |
/*
 * Because the number of items per page is not a divisor of the last item
 * number (member 0xFFFFFFFF), the last segment does not use the maximum number
 * of pages, and moreover the last used page therein does not use the same
 * number of items as previous pages.  (Another way to say it is that the
 * 0xFFFFFFFF member is somewhere in the middle of the last page, so the page
 * has some empty space after that item.)
 *
 * This constant is the number of members in the last page of the last segment:
 * the zero-based position of member 0xFFFFFFFF within its page, plus one.
 */
#define MAX_MEMBERS_IN_LAST_MEMBERS_PAGE \
	((uint32) ((0xFFFFFFFF % MULTIXACT_MEMBERS_PER_PAGE) + 1))
157 | |
/* Members page in which the member at the given offset is stored */
#define MXOffsetToMemberPage(offset) \
	((offset) / (TransactionId) MULTIXACT_MEMBERS_PER_PAGE)
/* Members segment that contains that page */
#define MXOffsetToMemberSegment(offset) \
	(MXOffsetToMemberPage(offset) / SLRU_PAGES_PER_SEGMENT)

/*
 * Byte position, within its page, of the flag word belonging to the group
 * that holds the given member: (group index within the page) * (group size).
 */
#define MXOffsetToFlagsOffset(offset) \
	((((offset) / (TransactionId) MULTIXACT_MEMBERS_PER_MEMBERGROUP) % \
	  (TransactionId) MULTIXACT_MEMBERGROUPS_PER_PAGE) * \
	 (TransactionId) MULTIXACT_MEMBERGROUP_SIZE)
/* Bit shift of the given member's flag bits inside that flag word */
#define MXOffsetToFlagsBitShift(offset) \
	(((offset) % (TransactionId) MULTIXACT_MEMBERS_PER_MEMBERGROUP) * \
	 MXACT_MEMBER_BITS_PER_XACT)

/*
 * Byte position, within its page, of the given member's TransactionId: start
 * of the group, past the group's flag bytes, then indexed by the member's
 * position in the group.
 */
#define MXOffsetToMemberOffset(offset) \
	(MXOffsetToFlagsOffset(offset) + MULTIXACT_FLAGBYTES_PER_GROUP + \
	 ((offset) % MULTIXACT_MEMBERS_PER_MEMBERGROUP) * sizeof(TransactionId))
175 | |
/*
 * Multixact members wraparound thresholds, expressed as fractions of
 * MaxMultiXactOffset: "safe" is one half, "danger" is three quarters.
 */
#define MULTIXACT_MEMBER_SAFE_THRESHOLD (MaxMultiXactOffset / 2)
#define MULTIXACT_MEMBER_DANGER_THRESHOLD \
	(MaxMultiXactOffset - MaxMultiXactOffset / 4)
180 | |
/*
 * MultiXactId immediately before the given one; stepping back from
 * FirstMultiXactId wraps around to MaxMultiXactId.
 */
#define PreviousMultiXactId(multi) \
	((multi) != FirstMultiXactId ? (multi) - 1 : MaxMultiXactId)
183 | |
/*
 * Links to shared-memory data structures for MultiXact control: one SLRU
 * control structure for the offsets area and one for the members area.
 */
static SlruCtlData MultiXactOffsetCtlData;
static SlruCtlData MultiXactMemberCtlData;

/* Pointer forms of the two control structures */
#define MultiXactOffsetCtl (&MultiXactOffsetCtlData)
#define MultiXactMemberCtl (&MultiXactMemberCtlData)
192 | |
/*
 * MultiXact state shared across all backends.  All this state is protected
 * by MultiXactGenLock.  (We also use MultiXactOffsetControlLock and
 * MultiXactMemberControlLock to guard accesses to the two sets of SLRU
 * buffers.  For concurrency's sake, we avoid holding more than one of these
 * locks at a time.)
 */
typedef struct MultiXactStateData
{
	/*
	 * next-to-be-assigned MultiXactId.  The counter can be in a
	 * wrapped-around state (below FirstMultiXactId); readers in this file
	 * substitute FirstMultiXactId in that case rather than fixing the counter.
	 */
	MultiXactId nextMXact;

	/* next-to-be-assigned offset */
	MultiXactOffset nextOffset;

	/* Have we completed multixact startup? */
	bool		finishedStartup;

	/*
	 * Oldest multixact that is still potentially referenced by a relation.
	 * Anything older than this should not be consulted.  These values are
	 * updated by vacuum.
	 */
	MultiXactId oldestMultiXactId;
	Oid			oldestMultiXactDB;

	/*
	 * Oldest multixact offset that is potentially referenced by a multixact
	 * referenced by a relation.  We don't always know this value, so there's
	 * a flag here to indicate whether or not we currently do.
	 */
	MultiXactOffset oldestOffset;
	bool		oldestOffsetKnown;

	/* support for anti-wraparound measures */
	MultiXactId multiVacLimit;
	MultiXactId multiWarnLimit;
	MultiXactId multiStopLimit;
	MultiXactId multiWrapLimit;

	/* support for members anti-wraparound measures */
	MultiXactOffset offsetStopLimit;	/* meaningful only if oldestOffsetKnown */

	/*
	 * Per-backend data starts here.  We have two arrays stored in the area
	 * immediately following the MultiXactStateData struct.  Each is indexed by
	 * BackendId.
	 *
	 * In both arrays, there's a slot for all normal backends (1..MaxBackends)
	 * followed by a slot for max_prepared_xacts prepared transactions.  Valid
	 * BackendIds start from 1; element zero of each array is never used.
	 *
	 * OldestMemberMXactId[k] is the oldest MultiXactId each backend's current
	 * transaction(s) could possibly be a member of, or InvalidMultiXactId
	 * when the backend has no live transaction that could possibly be a
	 * member of a MultiXact.  Each backend sets its entry to the current
	 * nextMXact counter just before first acquiring a shared lock in a given
	 * transaction, and clears it at transaction end.  (This works because only
	 * during or after acquiring a shared lock could an XID possibly become a
	 * member of a MultiXact, and that MultiXact would have to be created
	 * during or after the lock acquisition.)
	 *
	 * OldestVisibleMXactId[k] is the oldest MultiXactId each backend's
	 * current transaction(s) think is potentially live, or InvalidMultiXactId
	 * when not in a transaction or not in a transaction that's paid any
	 * attention to MultiXacts yet.  This is computed when first needed in a
	 * given transaction, and cleared at transaction end.  We can compute it
	 * as the minimum of the valid OldestMemberMXactId[] entries at the time
	 * we compute it (using nextMXact if none are valid).  Each backend is
	 * required not to attempt to access any SLRU data for MultiXactIds older
	 * than its own OldestVisibleMXactId[] setting; this is necessary because
	 * the checkpointer could truncate away such data at any instant.
	 *
	 * The oldest valid value among all of the OldestMemberMXactId[] and
	 * OldestVisibleMXactId[] entries is considered by vacuum as the earliest
	 * possible value still having any live member transaction.  Subtracting
	 * vacuum_multixact_freeze_min_age from that value we obtain the freezing
	 * point for multixacts for that table.  Any value older than that is
	 * removed from tuple headers (or "frozen"; see FreezeMultiXactId.  Note
	 * that multis that have member xids that are older than the cutoff point
	 * for xids must also be frozen, even if the multis themselves are newer
	 * than the multixid cutoff point).  Whenever a full table vacuum happens,
	 * the freezing point so computed is used as the new pg_class.relminmxid
	 * value.  The minimum of all those values in a database is stored as
	 * pg_database.datminmxid.  In turn, the minimum of all of those values is
	 * stored in pg_control and used as truncation point for pg_multixact.  At
	 * checkpoint or restartpoint, unneeded segments are removed.
	 */
	MultiXactId perBackendXactIds[FLEXIBLE_ARRAY_MEMBER];
} MultiXactStateData;
283 | |
/*
 * Last element of OldestMemberMXactId and OldestVisibleMXactId arrays.
 * Valid elements are (1..MaxOldestSlot); element 0 is never used.  The count
 * covers one slot per backend plus one per prepared transaction.
 */
#define MaxOldestSlot (MaxBackends + max_prepared_xacts)

/* Pointers to the state data in shared memory */
static MultiXactStateData *MultiXactState;
static MultiXactId *OldestMemberMXactId;
static MultiXactId *OldestVisibleMXactId;
294 | |
295 | |
/*
 * Definitions for the backend-local MultiXactId cache.
 *
 * We use this cache to store known MultiXacts, so we don't need to go to
 * SLRU areas every time.
 *
 * The cache lasts for the duration of a single transaction, the rationale
 * for this being that most entries will contain our own TransactionId and
 * so they will be uninteresting by the time our next transaction starts.
 * (XXX not clear that this is correct --- other members of the MultiXact
 * could hang around longer than we did.  However, it's not clear what a
 * better policy for flushing old cache entries would be.)  FIXME actually
 * this is plain wrong now that multixacts may contain update Xids.
 *
 * We allocate the cache entries in a memory context that is deleted at
 * transaction end, so we don't need to do retail freeing of entries.
 */
typedef struct mXactCacheEnt
{
	MultiXactId multi;			/* presumably the MultiXactId this entry
								 * describes -- confirm against users */
	int			nmembers;		/* presumably the number of elements in
								 * members[] -- confirm against users */
	dlist_node	node;			/* list link; presumably chains the entry
								 * into MXactCache -- confirm */
	MultiXactMember members[FLEXIBLE_ARRAY_MEMBER]; /* variable-length tail */
} mXactCacheEnt;
320 | |
/* NOTE(review): looks like the cap on cached entries; the code enforcing it is not visible here */
#define MAX_CACHE_ENTRIES 256
/* Head of the backend-local cache list, set up at compile time via DLIST_STATIC_INIT */
static dlist_head MXactCache = DLIST_STATIC_INIT(MXactCache);
/* NOTE(review): presumably the number of entries currently in MXactCache -- confirm */
static int	MXactCacheMembers = 0;
/* NOTE(review): presumably the memory context the cache entries live in; NULL until created -- confirm */
static MemoryContext MXactContext = NULL;
325 | |
/*
 * Debug logging helpers, one per argument count.  They forward to elog()
 * only when MULTIXACT_DEBUG is defined; otherwise they expand to nothing, so
 * their arguments are never evaluated.
 */
#ifndef MULTIXACT_DEBUG
#define debug_elog2(lvl,fmt)
#define debug_elog3(lvl,fmt,a1)
#define debug_elog4(lvl,fmt,a1,a2)
#define debug_elog5(lvl,fmt,a1,a2,a3)
#define debug_elog6(lvl,fmt,a1,a2,a3,a4)
#else
#define debug_elog2(lvl,fmt) elog(lvl,fmt)
#define debug_elog3(lvl,fmt,a1) elog(lvl,fmt,a1)
#define debug_elog4(lvl,fmt,a1,a2) elog(lvl,fmt,a1,a2)
#define debug_elog5(lvl,fmt,a1,a2,a3) elog(lvl,fmt,a1,a2,a3)
#define debug_elog6(lvl,fmt,a1,a2,a3,a4) elog(lvl,fmt,a1,a2,a3,a4)
#endif
339 | |
/*
 * Forward declarations of file-local (static) routines.
 */

/* internal MultiXactId management */
static void MultiXactIdSetOldestVisible(void);
static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
							   int nmembers, MultiXactMember *members);
static MultiXactId GetNewMultiXactId(int nmembers, MultiXactOffset *offset);

/* MultiXact cache management */
static int	mxactMemberComparator(const void *arg1, const void *arg2);
static MultiXactId mXactCacheGetBySet(int nmembers, MultiXactMember *members);
static int	mXactCacheGetById(MultiXactId multi, MultiXactMember **members);
static void mXactCachePut(MultiXactId multi, int nmembers,
						  MultiXactMember *members);

/* renders a status for the debug messages emitted in this file */
static char *mxstatus_to_string(MultiXactStatus status);

/* management of SLRU infrastructure */
static int	ZeroMultiXactOffsetPage(int pageno, bool writeXlog);
static int	ZeroMultiXactMemberPage(int pageno, bool writeXlog);
static bool MultiXactOffsetPagePrecedes(int page1, int page2);
static bool MultiXactMemberPagePrecedes(int page1, int page2);
static bool MultiXactOffsetPrecedes(MultiXactOffset offset1,
									MultiXactOffset offset2);
static void ExtendMultiXactOffset(MultiXactId multi);
static void ExtendMultiXactMember(MultiXactOffset offset, int nmembers);
static bool MultiXactOffsetWouldWrap(MultiXactOffset boundary,
									 MultiXactOffset start, uint32 distance);
static bool SetOffsetVacuumLimit(bool is_startup);
static bool find_multixact_start(MultiXactId multi, MultiXactOffset *result);
static void WriteMZeroPageXlogRec(int pageno, uint8 info);
/*
 * NOTE(review): startOff/endOff are typed MultiXactId although named "Off";
 * presumably they bound a range in the offsets SLRU -- confirm at the
 * definition.
 */
static void WriteMTruncateXlogRec(Oid oldestMultiDB,
								  MultiXactId startOff, MultiXactId endOff,
								  MultiXactOffset startMemb, MultiXactOffset endMemb);
372 | |
373 | |
374 | /* |
375 | * MultiXactIdCreate |
376 | * Construct a MultiXactId representing two TransactionIds. |
377 | * |
378 | * The two XIDs must be different, or be requesting different statuses. |
379 | * |
380 | * NB - we don't worry about our local MultiXactId cache here, because that |
381 | * is handled by the lower-level routines. |
382 | */ |
383 | MultiXactId |
384 | MultiXactIdCreate(TransactionId xid1, MultiXactStatus status1, |
385 | TransactionId xid2, MultiXactStatus status2) |
386 | { |
387 | MultiXactId newMulti; |
388 | MultiXactMember members[2]; |
389 | |
390 | AssertArg(TransactionIdIsValid(xid1)); |
391 | AssertArg(TransactionIdIsValid(xid2)); |
392 | |
393 | Assert(!TransactionIdEquals(xid1, xid2) || (status1 != status2)); |
394 | |
395 | /* MultiXactIdSetOldestMember() must have been called already. */ |
396 | Assert(MultiXactIdIsValid(OldestMemberMXactId[MyBackendId])); |
397 | |
398 | /* |
399 | * Note: unlike MultiXactIdExpand, we don't bother to check that both XIDs |
400 | * are still running. In typical usage, xid2 will be our own XID and the |
401 | * caller just did a check on xid1, so it'd be wasted effort. |
402 | */ |
403 | |
404 | members[0].xid = xid1; |
405 | members[0].status = status1; |
406 | members[1].xid = xid2; |
407 | members[1].status = status2; |
408 | |
409 | newMulti = MultiXactIdCreateFromMembers(2, members); |
410 | |
411 | debug_elog3(DEBUG2, "Create: %s" , |
412 | mxid_to_string(newMulti, 2, members)); |
413 | |
414 | return newMulti; |
415 | } |
416 | |
417 | /* |
418 | * MultiXactIdExpand |
419 | * Add a TransactionId to a pre-existing MultiXactId. |
420 | * |
421 | * If the TransactionId is already a member of the passed MultiXactId with the |
422 | * same status, just return it as-is. |
423 | * |
424 | * Note that we do NOT actually modify the membership of a pre-existing |
425 | * MultiXactId; instead we create a new one. This is necessary to avoid |
426 | * a race condition against code trying to wait for one MultiXactId to finish; |
427 | * see notes in heapam.c. |
428 | * |
429 | * NB - we don't worry about our local MultiXactId cache here, because that |
430 | * is handled by the lower-level routines. |
431 | * |
432 | * Note: It is critical that MultiXactIds that come from an old cluster (i.e. |
433 | * one upgraded by pg_upgrade from a cluster older than this feature) are not |
434 | * passed in. |
435 | */ |
436 | MultiXactId |
437 | MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status) |
438 | { |
439 | MultiXactId newMulti; |
440 | MultiXactMember *members; |
441 | MultiXactMember *newMembers; |
442 | int nmembers; |
443 | int i; |
444 | int j; |
445 | |
446 | AssertArg(MultiXactIdIsValid(multi)); |
447 | AssertArg(TransactionIdIsValid(xid)); |
448 | |
449 | /* MultiXactIdSetOldestMember() must have been called already. */ |
450 | Assert(MultiXactIdIsValid(OldestMemberMXactId[MyBackendId])); |
451 | |
452 | debug_elog5(DEBUG2, "Expand: received multi %u, xid %u status %s" , |
453 | multi, xid, mxstatus_to_string(status)); |
454 | |
455 | /* |
456 | * Note: we don't allow for old multis here. The reason is that the only |
457 | * caller of this function does a check that the multixact is no longer |
458 | * running. |
459 | */ |
460 | nmembers = GetMultiXactIdMembers(multi, &members, false, false); |
461 | |
462 | if (nmembers < 0) |
463 | { |
464 | MultiXactMember member; |
465 | |
466 | /* |
467 | * The MultiXactId is obsolete. This can only happen if all the |
468 | * MultiXactId members stop running between the caller checking and |
469 | * passing it to us. It would be better to return that fact to the |
470 | * caller, but it would complicate the API and it's unlikely to happen |
471 | * too often, so just deal with it by creating a singleton MultiXact. |
472 | */ |
473 | member.xid = xid; |
474 | member.status = status; |
475 | newMulti = MultiXactIdCreateFromMembers(1, &member); |
476 | |
477 | debug_elog4(DEBUG2, "Expand: %u has no members, create singleton %u" , |
478 | multi, newMulti); |
479 | return newMulti; |
480 | } |
481 | |
482 | /* |
483 | * If the TransactionId is already a member of the MultiXactId with the |
484 | * same status, just return the existing MultiXactId. |
485 | */ |
486 | for (i = 0; i < nmembers; i++) |
487 | { |
488 | if (TransactionIdEquals(members[i].xid, xid) && |
489 | (members[i].status == status)) |
490 | { |
491 | debug_elog4(DEBUG2, "Expand: %u is already a member of %u" , |
492 | xid, multi); |
493 | pfree(members); |
494 | return multi; |
495 | } |
496 | } |
497 | |
498 | /* |
499 | * Determine which of the members of the MultiXactId are still of |
500 | * interest. This is any running transaction, and also any transaction |
501 | * that grabbed something stronger than just a lock and was committed. (An |
502 | * update that aborted is of no interest here; and having more than one |
503 | * update Xid in a multixact would cause errors elsewhere.) |
504 | * |
505 | * Removing dead members is not just an optimization: freezing of tuples |
506 | * whose Xmax are multis depends on this behavior. |
507 | * |
508 | * Note we have the same race condition here as above: j could be 0 at the |
509 | * end of the loop. |
510 | */ |
511 | newMembers = (MultiXactMember *) |
512 | palloc(sizeof(MultiXactMember) * (nmembers + 1)); |
513 | |
514 | for (i = 0, j = 0; i < nmembers; i++) |
515 | { |
516 | if (TransactionIdIsInProgress(members[i].xid) || |
517 | (ISUPDATE_from_mxstatus(members[i].status) && |
518 | TransactionIdDidCommit(members[i].xid))) |
519 | { |
520 | newMembers[j].xid = members[i].xid; |
521 | newMembers[j++].status = members[i].status; |
522 | } |
523 | } |
524 | |
525 | newMembers[j].xid = xid; |
526 | newMembers[j++].status = status; |
527 | newMulti = MultiXactIdCreateFromMembers(j, newMembers); |
528 | |
529 | pfree(members); |
530 | pfree(newMembers); |
531 | |
532 | debug_elog3(DEBUG2, "Expand: returning new multi %u" , newMulti); |
533 | |
534 | return newMulti; |
535 | } |
536 | |
537 | /* |
538 | * MultiXactIdIsRunning |
539 | * Returns whether a MultiXactId is "running". |
540 | * |
541 | * We return true if at least one member of the given MultiXactId is still |
542 | * running. Note that a "false" result is certain not to change, |
543 | * because it is not legal to add members to an existing MultiXactId. |
544 | * |
545 | * Caller is expected to have verified that the multixact does not come from |
546 | * a pg_upgraded share-locked tuple. |
547 | */ |
548 | bool |
549 | MultiXactIdIsRunning(MultiXactId multi, bool isLockOnly) |
550 | { |
551 | MultiXactMember *members; |
552 | int nmembers; |
553 | int i; |
554 | |
555 | debug_elog3(DEBUG2, "IsRunning %u?" , multi); |
556 | |
557 | /* |
558 | * "false" here means we assume our callers have checked that the given |
559 | * multi cannot possibly come from a pg_upgraded database. |
560 | */ |
561 | nmembers = GetMultiXactIdMembers(multi, &members, false, isLockOnly); |
562 | |
563 | if (nmembers <= 0) |
564 | { |
565 | debug_elog2(DEBUG2, "IsRunning: no members" ); |
566 | return false; |
567 | } |
568 | |
569 | /* |
570 | * Checking for myself is cheap compared to looking in shared memory; |
571 | * return true if any live subtransaction of the current top-level |
572 | * transaction is a member. |
573 | * |
574 | * This is not needed for correctness, it's just a fast path. |
575 | */ |
576 | for (i = 0; i < nmembers; i++) |
577 | { |
578 | if (TransactionIdIsCurrentTransactionId(members[i].xid)) |
579 | { |
580 | debug_elog3(DEBUG2, "IsRunning: I (%d) am running!" , i); |
581 | pfree(members); |
582 | return true; |
583 | } |
584 | } |
585 | |
586 | /* |
587 | * This could be made faster by having another entry point in procarray.c, |
588 | * walking the PGPROC array only once for all the members. But in most |
589 | * cases nmembers should be small enough that it doesn't much matter. |
590 | */ |
591 | for (i = 0; i < nmembers; i++) |
592 | { |
593 | if (TransactionIdIsInProgress(members[i].xid)) |
594 | { |
595 | debug_elog4(DEBUG2, "IsRunning: member %d (%u) is running" , |
596 | i, members[i].xid); |
597 | pfree(members); |
598 | return true; |
599 | } |
600 | } |
601 | |
602 | pfree(members); |
603 | |
604 | debug_elog3(DEBUG2, "IsRunning: %u is not running" , multi); |
605 | |
606 | return false; |
607 | } |
608 | |
609 | /* |
610 | * MultiXactIdSetOldestMember |
611 | * Save the oldest MultiXactId this transaction could be a member of. |
612 | * |
613 | * We set the OldestMemberMXactId for a given transaction the first time it's |
614 | * going to do some operation that might require a MultiXactId (tuple lock, |
615 | * update or delete). We need to do this even if we end up using a |
616 | * TransactionId instead of a MultiXactId, because there is a chance that |
617 | * another transaction would add our XID to a MultiXactId. |
618 | * |
619 | * The value to set is the next-to-be-assigned MultiXactId, so this is meant to |
620 | * be called just before doing any such possibly-MultiXactId-able operation. |
621 | */ |
622 | void |
623 | MultiXactIdSetOldestMember(void) |
624 | { |
625 | if (!MultiXactIdIsValid(OldestMemberMXactId[MyBackendId])) |
626 | { |
627 | MultiXactId nextMXact; |
628 | |
629 | /* |
630 | * You might think we don't need to acquire a lock here, since |
631 | * fetching and storing of TransactionIds is probably atomic, but in |
632 | * fact we do: suppose we pick up nextMXact and then lose the CPU for |
633 | * a long time. Someone else could advance nextMXact, and then |
634 | * another someone else could compute an OldestVisibleMXactId that |
635 | * would be after the value we are going to store when we get control |
636 | * back. Which would be wrong. |
637 | * |
638 | * Note that a shared lock is sufficient, because it's enough to stop |
639 | * someone from advancing nextMXact; and nobody else could be trying |
640 | * to write to our OldestMember entry, only reading (and we assume |
641 | * storing it is atomic.) |
642 | */ |
643 | LWLockAcquire(MultiXactGenLock, LW_SHARED); |
644 | |
645 | /* |
646 | * We have to beware of the possibility that nextMXact is in the |
647 | * wrapped-around state. We don't fix the counter itself here, but we |
648 | * must be sure to store a valid value in our array entry. |
649 | */ |
650 | nextMXact = MultiXactState->nextMXact; |
651 | if (nextMXact < FirstMultiXactId) |
652 | nextMXact = FirstMultiXactId; |
653 | |
654 | OldestMemberMXactId[MyBackendId] = nextMXact; |
655 | |
656 | LWLockRelease(MultiXactGenLock); |
657 | |
658 | debug_elog4(DEBUG2, "MultiXact: setting OldestMember[%d] = %u" , |
659 | MyBackendId, nextMXact); |
660 | } |
661 | } |
662 | |
663 | /* |
664 | * MultiXactIdSetOldestVisible |
665 | * Save the oldest MultiXactId this transaction considers possibly live. |
666 | * |
667 | * We set the OldestVisibleMXactId for a given transaction the first time |
668 | * it's going to inspect any MultiXactId. Once we have set this, we are |
669 | * guaranteed that the checkpointer won't truncate off SLRU data for |
670 | * MultiXactIds at or after our OldestVisibleMXactId. |
671 | * |
672 | * The value to set is the oldest of nextMXact and all the valid per-backend |
673 | * OldestMemberMXactId[] entries. Because of the locking we do, we can be |
674 | * certain that no subsequent call to MultiXactIdSetOldestMember can set |
675 | * an OldestMemberMXactId[] entry older than what we compute here. Therefore |
676 | * there is no live transaction, now or later, that can be a member of any |
677 | * MultiXactId older than the OldestVisibleMXactId we compute here. |
678 | */ |
679 | static void |
680 | MultiXactIdSetOldestVisible(void) |
681 | { |
682 | if (!MultiXactIdIsValid(OldestVisibleMXactId[MyBackendId])) |
683 | { |
684 | MultiXactId oldestMXact; |
685 | int i; |
686 | |
687 | LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); |
688 | |
689 | /* |
690 | * We have to beware of the possibility that nextMXact is in the |
691 | * wrapped-around state. We don't fix the counter itself here, but we |
692 | * must be sure to store a valid value in our array entry. |
693 | */ |
694 | oldestMXact = MultiXactState->nextMXact; |
695 | if (oldestMXact < FirstMultiXactId) |
696 | oldestMXact = FirstMultiXactId; |
697 | |
698 | for (i = 1; i <= MaxOldestSlot; i++) |
699 | { |
700 | MultiXactId thisoldest = OldestMemberMXactId[i]; |
701 | |
702 | if (MultiXactIdIsValid(thisoldest) && |
703 | MultiXactIdPrecedes(thisoldest, oldestMXact)) |
704 | oldestMXact = thisoldest; |
705 | } |
706 | |
707 | OldestVisibleMXactId[MyBackendId] = oldestMXact; |
708 | |
709 | LWLockRelease(MultiXactGenLock); |
710 | |
711 | debug_elog4(DEBUG2, "MultiXact: setting OldestVisible[%d] = %u" , |
712 | MyBackendId, oldestMXact); |
713 | } |
714 | } |
715 | |
716 | /* |
717 | * ReadNextMultiXactId |
718 | * Return the next MultiXactId to be assigned, but don't allocate it |
719 | */ |
720 | MultiXactId |
721 | ReadNextMultiXactId(void) |
722 | { |
723 | MultiXactId mxid; |
724 | |
725 | /* XXX we could presumably do this without a lock. */ |
726 | LWLockAcquire(MultiXactGenLock, LW_SHARED); |
727 | mxid = MultiXactState->nextMXact; |
728 | LWLockRelease(MultiXactGenLock); |
729 | |
730 | if (mxid < FirstMultiXactId) |
731 | mxid = FirstMultiXactId; |
732 | |
733 | return mxid; |
734 | } |
735 | |
736 | /* |
737 | * MultiXactIdCreateFromMembers |
738 | * Make a new MultiXactId from the specified set of members |
739 | * |
740 | * Make XLOG, SLRU and cache entries for a new MultiXactId, recording the |
741 | * given TransactionIds as members. Returns the newly created MultiXactId. |
742 | * |
743 | * NB: the passed members[] array will be sorted in-place. |
744 | */ |
745 | MultiXactId |
746 | MultiXactIdCreateFromMembers(int nmembers, MultiXactMember *members) |
747 | { |
748 | MultiXactId multi; |
749 | MultiXactOffset offset; |
750 | xl_multixact_create xlrec; |
751 | |
752 | debug_elog3(DEBUG2, "Create: %s" , |
753 | mxid_to_string(InvalidMultiXactId, nmembers, members)); |
754 | |
755 | /* |
756 | * See if the same set of members already exists in our cache; if so, just |
757 | * re-use that MultiXactId. (Note: it might seem that looking in our |
758 | * cache is insufficient, and we ought to search disk to see if a |
759 | * duplicate definition already exists. But since we only ever create |
760 | * MultiXacts containing our own XID, in most cases any such MultiXacts |
761 | * were in fact created by us, and so will be in our cache. There are |
762 | * corner cases where someone else added us to a MultiXact without our |
763 | * knowledge, but it's not worth checking for.) |
764 | */ |
765 | multi = mXactCacheGetBySet(nmembers, members); |
766 | if (MultiXactIdIsValid(multi)) |
767 | { |
768 | debug_elog2(DEBUG2, "Create: in cache!" ); |
769 | return multi; |
770 | } |
771 | |
772 | /* Verify that there is a single update Xid among the given members. */ |
773 | { |
774 | int i; |
775 | bool has_update = false; |
776 | |
777 | for (i = 0; i < nmembers; i++) |
778 | { |
779 | if (ISUPDATE_from_mxstatus(members[i].status)) |
780 | { |
781 | if (has_update) |
782 | elog(ERROR, "new multixact has more than one updating member" ); |
783 | has_update = true; |
784 | } |
785 | } |
786 | } |
787 | |
788 | /* |
789 | * Assign the MXID and offsets range to use, and make sure there is space |
790 | * in the OFFSETs and MEMBERs files. NB: this routine does |
791 | * START_CRIT_SECTION(). |
792 | * |
793 | * Note: unlike MultiXactIdCreate and MultiXactIdExpand, we do not check |
794 | * that we've called MultiXactIdSetOldestMember here. This is because |
795 | * this routine is used in some places to create new MultiXactIds of which |
796 | * the current backend is not a member, notably during freezing of multis |
797 | * in vacuum. During vacuum, in particular, it would be unacceptable to |
798 | * keep OldestMulti set, in case it runs for long. |
799 | */ |
800 | multi = GetNewMultiXactId(nmembers, &offset); |
801 | |
802 | /* Make an XLOG entry describing the new MXID. */ |
803 | xlrec.mid = multi; |
804 | xlrec.moff = offset; |
805 | xlrec.nmembers = nmembers; |
806 | |
807 | /* |
808 | * XXX Note: there's a lot of padding space in MultiXactMember. We could |
809 | * find a more compact representation of this Xlog record -- perhaps all |
810 | * the status flags in one XLogRecData, then all the xids in another one? |
811 | * Not clear that it's worth the trouble though. |
812 | */ |
813 | XLogBeginInsert(); |
814 | XLogRegisterData((char *) (&xlrec), SizeOfMultiXactCreate); |
815 | XLogRegisterData((char *) members, nmembers * sizeof(MultiXactMember)); |
816 | |
817 | (void) XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_CREATE_ID); |
818 | |
819 | /* Now enter the information into the OFFSETs and MEMBERs logs */ |
820 | RecordNewMultiXact(multi, offset, nmembers, members); |
821 | |
822 | /* Done with critical section */ |
823 | END_CRIT_SECTION(); |
824 | |
825 | /* Store the new MultiXactId in the local cache, too */ |
826 | mXactCachePut(multi, nmembers, members); |
827 | |
828 | debug_elog2(DEBUG2, "Create: all done" ); |
829 | |
830 | return multi; |
831 | } |
832 | |
833 | /* |
834 | * RecordNewMultiXact |
835 | * Write info about a new multixact into the offsets and members files |
836 | * |
837 | * This is broken out of MultiXactIdCreateFromMembers so that xlog replay can |
838 | * use it. |
839 | */ |
840 | static void |
841 | RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, |
842 | int nmembers, MultiXactMember *members) |
843 | { |
844 | int pageno; |
845 | int prev_pageno; |
846 | int entryno; |
847 | int slotno; |
848 | MultiXactOffset *offptr; |
849 | int i; |
850 | |
851 | LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE); |
852 | |
853 | pageno = MultiXactIdToOffsetPage(multi); |
854 | entryno = MultiXactIdToOffsetEntry(multi); |
855 | |
856 | /* |
857 | * Note: we pass the MultiXactId to SimpleLruReadPage as the "transaction" |
858 | * to complain about if there's any I/O error. This is kinda bogus, but |
859 | * since the errors will always give the full pathname, it should be clear |
860 | * enough that a MultiXactId is really involved. Perhaps someday we'll |
861 | * take the trouble to generalize the slru.c error reporting code. |
862 | */ |
863 | slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi); |
864 | offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; |
865 | offptr += entryno; |
866 | |
867 | *offptr = offset; |
868 | |
869 | MultiXactOffsetCtl->shared->page_dirty[slotno] = true; |
870 | |
871 | /* Exchange our lock */ |
872 | LWLockRelease(MultiXactOffsetControlLock); |
873 | |
874 | LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE); |
875 | |
876 | prev_pageno = -1; |
877 | |
878 | for (i = 0; i < nmembers; i++, offset++) |
879 | { |
880 | TransactionId *memberptr; |
881 | uint32 *flagsptr; |
882 | uint32 flagsval; |
883 | int bshift; |
884 | int flagsoff; |
885 | int memberoff; |
886 | |
887 | Assert(members[i].status <= MultiXactStatusUpdate); |
888 | |
889 | pageno = MXOffsetToMemberPage(offset); |
890 | memberoff = MXOffsetToMemberOffset(offset); |
891 | flagsoff = MXOffsetToFlagsOffset(offset); |
892 | bshift = MXOffsetToFlagsBitShift(offset); |
893 | |
894 | if (pageno != prev_pageno) |
895 | { |
896 | slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, multi); |
897 | prev_pageno = pageno; |
898 | } |
899 | |
900 | memberptr = (TransactionId *) |
901 | (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff); |
902 | |
903 | *memberptr = members[i].xid; |
904 | |
905 | flagsptr = (uint32 *) |
906 | (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff); |
907 | |
908 | flagsval = *flagsptr; |
909 | flagsval &= ~(((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift); |
910 | flagsval |= (members[i].status << bshift); |
911 | *flagsptr = flagsval; |
912 | |
913 | MultiXactMemberCtl->shared->page_dirty[slotno] = true; |
914 | } |
915 | |
916 | LWLockRelease(MultiXactMemberControlLock); |
917 | } |
918 | |
/*
 * GetNewMultiXactId
 *		Get the next MultiXactId.
 *
 * Also, reserve the needed amount of space in the "members" area.  The
 * starting offset of the reserved space is returned in *offset.
 *
 * Parameters:
 *	nmembers - number of member slots to reserve for the new multixact.
 *	offset   - output; receives the starting members offset.  It is never
 *			   set to zero: when the shared nextOffset is zero, 1 is stored
 *			   instead and one extra slot (slot 0) is reserved.
 *
 * Returns the MultiXactId assigned to the caller.
 *
 * This may generate XLOG records for expansion of the offsets and/or members
 * files.  Unfortunately, we have to do that while holding MultiXactGenLock
 * to avoid race conditions --- the XLOG record for zeroing a page must appear
 * before any backend can possibly try to store data in that page!
 *
 * We start a critical section before advancing the shared counters.  The
 * caller must end the critical section after writing SLRU data.
 *
 * MultiXactGenLock is acquired and released inside this function; it is not
 * held on normal return.  All ERROR exits happen before the critical section
 * is entered and before the shared counters are advanced.
 */
static MultiXactId
GetNewMultiXactId(int nmembers, MultiXactOffset *offset)
{
	MultiXactId result;
	MultiXactOffset nextOffset;

	debug_elog3(DEBUG2, "GetNew: for %d xids" , nmembers);

	/* safety check, we should never get this far in a HS standby */
	if (RecoveryInProgress())
		elog(ERROR, "cannot assign MultiXactIds during recovery" );

	LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);

	/* Handle wraparound of the nextMXact counter */
	if (MultiXactState->nextMXact < FirstMultiXactId)
		MultiXactState->nextMXact = FirstMultiXactId;

	/* Assign the MXID */
	result = MultiXactState->nextMXact;

	/*----------
	 * Check to see if it's safe to assign another MultiXactId.  This protects
	 * against catastrophic data loss due to multixact wraparound.  The basic
	 * rules are:
	 *
	 * If we're past multiVacLimit or the safe threshold for member storage
	 * space, or we don't know what the safe threshold for member storage is,
	 * start trying to force autovacuum cycles.
	 * If we're past multiWarnLimit, start issuing warnings.
	 * If we're past multiStopLimit, refuse to create new MultiXactIds.
	 *
	 * Note these are pretty much the same protections in GetNewTransactionId.
	 *----------
	 */
	if (!MultiXactIdPrecedes(result, MultiXactState->multiVacLimit))
	{
		/*
		 * For safety's sake, we release MultiXactGenLock while sending
		 * signals, warnings, etc.  This is not so much because we care about
		 * preserving concurrency in this situation, as to avoid any
		 * possibility of deadlock while doing get_database_name(). First,
		 * copy all the shared values we'll need in this path.
		 */
		MultiXactId multiWarnLimit = MultiXactState->multiWarnLimit;
		MultiXactId multiStopLimit = MultiXactState->multiStopLimit;
		MultiXactId multiWrapLimit = MultiXactState->multiWrapLimit;
		Oid oldest_datoid = MultiXactState->oldestMultiXactDB;

		LWLockRelease(MultiXactGenLock);

		/*
		 * Hard stop: past multiStopLimit no new MultiXactId is handed out.
		 * This check is only applied when running under the postmaster.
		 */
		if (IsUnderPostmaster &&
			!MultiXactIdPrecedes(result, multiStopLimit))
		{
			char *oldest_datname = get_database_name(oldest_datoid);

			/*
			 * Immediately kick autovacuum into action as we're already in
			 * ERROR territory.
			 */
			SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);

			/* complain even if that DB has disappeared */
			if (oldest_datname)
				ereport(ERROR,
						(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
						 errmsg("database is not accepting commands that generate new MultiXactIds to avoid wraparound data loss in database \"%s\"" ,
								oldest_datname),
						 errhint("Execute a database-wide VACUUM in that database.\n"
								 "You might also need to commit or roll back old prepared transactions, or drop stale replication slots." )));
			else
				ereport(ERROR,
						(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
						 errmsg("database is not accepting commands that generate new MultiXactIds to avoid wraparound data loss in database with OID %u" ,
								oldest_datoid),
						 errhint("Execute a database-wide VACUUM in that database.\n"
								 "You might also need to commit or roll back old prepared transactions, or drop stale replication slots." )));
		}

		/*
		 * To avoid swamping the postmaster with signals, we issue the autovac
		 * request only once per 64K multis generated.  This still gives
		 * plenty of chances before we get into real trouble.
		 */
		if (IsUnderPostmaster && (result % 65536) == 0)
			SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);

		/* Soft limit: keep going, but warn how many MultiXactIds remain. */
		if (!MultiXactIdPrecedes(result, multiWarnLimit))
		{
			char *oldest_datname = get_database_name(oldest_datoid);

			/* complain even if that DB has disappeared */
			if (oldest_datname)
				ereport(WARNING,
						(errmsg_plural("database \"%s\" must be vacuumed before %u more MultiXactId is used" ,
									   "database \"%s\" must be vacuumed before %u more MultiXactIds are used" ,
									   multiWrapLimit - result,
									   oldest_datname,
									   multiWrapLimit - result),
						 errhint("Execute a database-wide VACUUM in that database.\n"
								 "You might also need to commit or roll back old prepared transactions, or drop stale replication slots." )));
			else
				ereport(WARNING,
						(errmsg_plural("database with OID %u must be vacuumed before %u more MultiXactId is used" ,
									   "database with OID %u must be vacuumed before %u more MultiXactIds are used" ,
									   multiWrapLimit - result,
									   oldest_datoid,
									   multiWrapLimit - result),
						 errhint("Execute a database-wide VACUUM in that database.\n"
								 "You might also need to commit or roll back old prepared transactions, or drop stale replication slots." )));
		}

		/*
		 * Re-acquire lock and start over.  The lock was released above, so
		 * the counter is re-read (and clamped again) rather than trusting the
		 * copy taken before the release.
		 */
		LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
		result = MultiXactState->nextMXact;
		if (result < FirstMultiXactId)
			result = FirstMultiXactId;
	}

	/* Make sure there is room for the MXID in the file.  */
	ExtendMultiXactOffset(result);

	/*
	 * Reserve the members space, similarly to above.  Also, be careful not to
	 * return zero as the starting offset for any multixact. See
	 * GetMultiXactIdMembers() for motivation.
	 *
	 * Note that nmembers is bumped by one in the zero case, so every use of
	 * nmembers below (limit checks, messages, extension, counter advance)
	 * sees the adjusted count.
	 */
	nextOffset = MultiXactState->nextOffset;
	if (nextOffset == 0)
	{
		*offset = 1;
		nmembers++; /* allocate member slot 0 too */
	}
	else
		*offset = nextOffset;

	/*----------
	 * Protect against overrun of the members space as well, with the
	 * following rules:
	 *
	 * If we're past offsetStopLimit, refuse to generate more multis.
	 * If we're close to offsetStopLimit, emit a warning.
	 *
	 * Arbitrarily, we start emitting warnings when we're 20 segments or less
	 * from offsetStopLimit.
	 *
	 * Note we haven't updated the shared state yet, so if we fail at this
	 * point, the multixact ID we grabbed can still be used by the next guy.
	 *
	 * Note that there is no point in forcing autovacuum runs here: the
	 * multixact freeze settings would have to be reduced for that to have any
	 * effect.
	 *----------
	 */
#define OFFSET_WARN_SEGMENTS 20
	if (MultiXactState->oldestOffsetKnown &&
		MultiXactOffsetWouldWrap(MultiXactState->offsetStopLimit, nextOffset,
								 nmembers))
	{
		/*
		 * see comment in the corresponding offsets wraparound case
		 *
		 * NOTE(review): unlike the signal calls in the MultiXactId limit path
		 * above, this one is not guarded by IsUnderPostmaster, and
		 * MultiXactGenLock is still held when the ERROR below is raised --
		 * presumably error cleanup releases it; confirm.
		 */
		SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);

		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("multixact \"members\" limit exceeded" ),
				 errdetail_plural("This command would create a multixact with %u members, but the remaining space is only enough for %u member." ,
								  "This command would create a multixact with %u members, but the remaining space is only enough for %u members." ,
								  MultiXactState->offsetStopLimit - nextOffset - 1,
								  nmembers,
								  MultiXactState->offsetStopLimit - nextOffset - 1),
				 errhint("Execute a database-wide VACUUM in database with OID %u with reduced vacuum_multixact_freeze_min_age and vacuum_multixact_freeze_table_age settings." ,
						 MultiXactState->oldestMultiXactDB)));
	}

	/*
	 * Check whether we should kick autovacuum into action, to prevent members
	 * wraparound. NB we use a much larger window to trigger autovacuum than
	 * just the warning limit. The warning is just a measure of last resort -
	 * this is in line with GetNewTransactionId's behaviour.
	 */
	if (!MultiXactState->oldestOffsetKnown ||
		(MultiXactState->nextOffset - MultiXactState->oldestOffset
		 > MULTIXACT_MEMBER_SAFE_THRESHOLD))
	{
		/*
		 * To avoid swamping the postmaster with signals, we issue the autovac
		 * request only when crossing a segment boundary. With default
		 * compilation settings that's roughly after 50k members.  This still
		 * gives plenty of chances before we get into real trouble.
		 */
		if ((MXOffsetToMemberPage(nextOffset) / SLRU_PAGES_PER_SEGMENT) !=
			(MXOffsetToMemberPage(nextOffset + nmembers) / SLRU_PAGES_PER_SEGMENT))
			SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
	}

	/*
	 * Warn when the reservation, padded by OFFSET_WARN_SEGMENTS segments'
	 * worth of members, would reach offsetStopLimit.
	 *
	 * NOTE(review): the count reported is
	 * "offsetStopLimit - nextOffset + nmembers" and is printed with %d; it
	 * looks as though the members being reserved are added to, rather than
	 * subtracted from, the remaining space, and the operands appear to be
	 * offset-typed values -- confirm the intended expression and format.
	 */
	if (MultiXactState->oldestOffsetKnown &&
		MultiXactOffsetWouldWrap(MultiXactState->offsetStopLimit,
								 nextOffset,
								 nmembers + MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT * OFFSET_WARN_SEGMENTS))
		ereport(WARNING,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg_plural("database with OID %u must be vacuumed before %d more multixact member is used" ,
							   "database with OID %u must be vacuumed before %d more multixact members are used" ,
							   MultiXactState->offsetStopLimit - nextOffset + nmembers,
							   MultiXactState->oldestMultiXactDB,
							   MultiXactState->offsetStopLimit - nextOffset + nmembers),
				 errhint("Execute a database-wide VACUUM in that database with reduced vacuum_multixact_freeze_min_age and vacuum_multixact_freeze_table_age settings." )));

	/* Make sure the members file has room for the slots being reserved. */
	ExtendMultiXactMember(nextOffset, nmembers);

	/*
	 * Critical section from here until caller has written the data into the
	 * just-reserved SLRU space; we don't want to error out with a partly
	 * written MultiXact structure.  (In particular, failing to write our
	 * start offset after advancing nextMXact would effectively corrupt the
	 * previous MultiXact.)
	 */
	START_CRIT_SECTION();

	/*
	 * Advance counters.  As in GetNewTransactionId(), this must not happen
	 * until after file extension has succeeded!
	 *
	 * We don't care about MultiXactId wraparound here; it will be handled by
	 * the next iteration.  But note that nextMXact may be InvalidMultiXactId
	 * or the first value on a segment-beginning page after this routine
	 * exits, so anyone else looking at the variable must be prepared to deal
	 * with either case.  Similarly, nextOffset may be zero, but we won't use
	 * that as the actual start offset of the next multixact.
	 */
	(MultiXactState->nextMXact)++;

	MultiXactState->nextOffset += nmembers;

	/* The critical section stays open; only the lock is dropped here. */
	LWLockRelease(MultiXactGenLock);

	debug_elog4(DEBUG2, "GetNew: returning %u offset %u" , result, *offset);
	return result;
}
1173 | |
1174 | /* |
1175 | * GetMultiXactIdMembers |
1176 | * Return the set of MultiXactMembers that make up a MultiXactId |
1177 | * |
1178 | * Return value is the number of members found, or -1 if there are none, |
1179 | * and *members is set to a newly palloc'ed array of members. It's the |
1180 | * caller's responsibility to free it when done with it. |
1181 | * |
1182 | * from_pgupgrade must be passed as true if and only if only the multixact |
1183 | * corresponds to a value from a tuple that was locked in a 9.2-or-older |
1184 | * installation and later pg_upgrade'd (that is, the infomask is |
1185 | * HEAP_LOCKED_UPGRADED). In this case, we know for certain that no members |
1186 | * can still be running, so we return -1 just like for an empty multixact |
1187 | * without any further checking. It would be wrong to try to resolve such a |
1188 | * multixact: either the multixact is within the current valid multixact |
1189 | * range, in which case the returned result would be bogus, or outside that |
1190 | * range, in which case an error would be raised. |
1191 | * |
1192 | * In all other cases, the passed multixact must be within the known valid |
1193 | * range, that is, greater to or equal than oldestMultiXactId, and less than |
1194 | * nextMXact. Otherwise, an error is raised. |
1195 | * |
1196 | * onlyLock must be set to true if caller is certain that the given multi |
1197 | * is used only to lock tuples; can be false without loss of correctness, |
1198 | * but passing a true means we can return quickly without checking for |
1199 | * old updates. |
1200 | */ |
1201 | int |
1202 | GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, |
1203 | bool from_pgupgrade, bool onlyLock) |
1204 | { |
1205 | int pageno; |
1206 | int prev_pageno; |
1207 | int entryno; |
1208 | int slotno; |
1209 | MultiXactOffset *offptr; |
1210 | MultiXactOffset offset; |
1211 | int length; |
1212 | int truelength; |
1213 | int i; |
1214 | MultiXactId oldestMXact; |
1215 | MultiXactId nextMXact; |
1216 | MultiXactId tmpMXact; |
1217 | MultiXactOffset nextOffset; |
1218 | MultiXactMember *ptr; |
1219 | |
1220 | debug_elog3(DEBUG2, "GetMembers: asked for %u" , multi); |
1221 | |
1222 | if (!MultiXactIdIsValid(multi) || from_pgupgrade) |
1223 | return -1; |
1224 | |
1225 | /* See if the MultiXactId is in the local cache */ |
1226 | length = mXactCacheGetById(multi, members); |
1227 | if (length >= 0) |
1228 | { |
1229 | debug_elog3(DEBUG2, "GetMembers: found %s in the cache" , |
1230 | mxid_to_string(multi, length, *members)); |
1231 | return length; |
1232 | } |
1233 | |
1234 | /* Set our OldestVisibleMXactId[] entry if we didn't already */ |
1235 | MultiXactIdSetOldestVisible(); |
1236 | |
1237 | /* |
1238 | * If we know the multi is used only for locking and not for updates, then |
1239 | * we can skip checking if the value is older than our oldest visible |
1240 | * multi. It cannot possibly still be running. |
1241 | */ |
1242 | if (onlyLock && |
1243 | MultiXactIdPrecedes(multi, OldestVisibleMXactId[MyBackendId])) |
1244 | { |
1245 | debug_elog2(DEBUG2, "GetMembers: a locker-only multi is too old" ); |
1246 | *members = NULL; |
1247 | return -1; |
1248 | } |
1249 | |
1250 | /* |
1251 | * We check known limits on MultiXact before resorting to the SLRU area. |
1252 | * |
1253 | * An ID older than MultiXactState->oldestMultiXactId cannot possibly be |
1254 | * useful; it has already been removed, or will be removed shortly, by |
1255 | * truncation. If one is passed, an error is raised. |
1256 | * |
1257 | * Also, an ID >= nextMXact shouldn't ever be seen here; if it is seen, it |
1258 | * implies undetected ID wraparound has occurred. This raises a hard |
1259 | * error. |
1260 | * |
1261 | * Shared lock is enough here since we aren't modifying any global state. |
1262 | * Acquire it just long enough to grab the current counter values. We may |
1263 | * need both nextMXact and nextOffset; see below. |
1264 | */ |
1265 | LWLockAcquire(MultiXactGenLock, LW_SHARED); |
1266 | |
1267 | oldestMXact = MultiXactState->oldestMultiXactId; |
1268 | nextMXact = MultiXactState->nextMXact; |
1269 | nextOffset = MultiXactState->nextOffset; |
1270 | |
1271 | LWLockRelease(MultiXactGenLock); |
1272 | |
1273 | if (MultiXactIdPrecedes(multi, oldestMXact)) |
1274 | { |
1275 | ereport(ERROR, |
1276 | (errcode(ERRCODE_INTERNAL_ERROR), |
1277 | errmsg("MultiXactId %u does no longer exist -- apparent wraparound" , |
1278 | multi))); |
1279 | return -1; |
1280 | } |
1281 | |
1282 | if (!MultiXactIdPrecedes(multi, nextMXact)) |
1283 | ereport(ERROR, |
1284 | (errcode(ERRCODE_INTERNAL_ERROR), |
1285 | errmsg("MultiXactId %u has not been created yet -- apparent wraparound" , |
1286 | multi))); |
1287 | |
1288 | /* |
1289 | * Find out the offset at which we need to start reading MultiXactMembers |
1290 | * and the number of members in the multixact. We determine the latter as |
1291 | * the difference between this multixact's starting offset and the next |
1292 | * one's. However, there are some corner cases to worry about: |
1293 | * |
1294 | * 1. This multixact may be the latest one created, in which case there is |
1295 | * no next one to look at. In this case the nextOffset value we just |
1296 | * saved is the correct endpoint. |
1297 | * |
1298 | * 2. The next multixact may still be in process of being filled in: that |
1299 | * is, another process may have done GetNewMultiXactId but not yet written |
1300 | * the offset entry for that ID. In that scenario, it is guaranteed that |
1301 | * the offset entry for that multixact exists (because GetNewMultiXactId |
1302 | * won't release MultiXactGenLock until it does) but contains zero |
1303 | * (because we are careful to pre-zero offset pages). Because |
1304 | * GetNewMultiXactId will never return zero as the starting offset for a |
1305 | * multixact, when we read zero as the next multixact's offset, we know we |
1306 | * have this case. We sleep for a bit and try again. |
1307 | * |
1308 | * 3. Because GetNewMultiXactId increments offset zero to offset one to |
1309 | * handle case #2, there is an ambiguity near the point of offset |
1310 | * wraparound. If we see next multixact's offset is one, is that our |
1311 | * multixact's actual endpoint, or did it end at zero with a subsequent |
1312 | * increment? We handle this using the knowledge that if the zero'th |
1313 | * member slot wasn't filled, it'll contain zero, and zero isn't a valid |
1314 | * transaction ID so it can't be a multixact member. Therefore, if we |
1315 | * read a zero from the members array, just ignore it. |
1316 | * |
1317 | * This is all pretty messy, but the mess occurs only in infrequent corner |
1318 | * cases, so it seems better than holding the MultiXactGenLock for a long |
1319 | * time on every multixact creation. |
1320 | */ |
1321 | retry: |
1322 | LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE); |
1323 | |
1324 | pageno = MultiXactIdToOffsetPage(multi); |
1325 | entryno = MultiXactIdToOffsetEntry(multi); |
1326 | |
1327 | slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi); |
1328 | offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; |
1329 | offptr += entryno; |
1330 | offset = *offptr; |
1331 | |
1332 | Assert(offset != 0); |
1333 | |
1334 | /* |
1335 | * Use the same increment rule as GetNewMultiXactId(), that is, don't |
1336 | * handle wraparound explicitly until needed. |
1337 | */ |
1338 | tmpMXact = multi + 1; |
1339 | |
1340 | if (nextMXact == tmpMXact) |
1341 | { |
1342 | /* Corner case 1: there is no next multixact */ |
1343 | length = nextOffset - offset; |
1344 | } |
1345 | else |
1346 | { |
1347 | MultiXactOffset nextMXOffset; |
1348 | |
1349 | /* handle wraparound if needed */ |
1350 | if (tmpMXact < FirstMultiXactId) |
1351 | tmpMXact = FirstMultiXactId; |
1352 | |
1353 | prev_pageno = pageno; |
1354 | |
1355 | pageno = MultiXactIdToOffsetPage(tmpMXact); |
1356 | entryno = MultiXactIdToOffsetEntry(tmpMXact); |
1357 | |
1358 | if (pageno != prev_pageno) |
1359 | slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, tmpMXact); |
1360 | |
1361 | offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; |
1362 | offptr += entryno; |
1363 | nextMXOffset = *offptr; |
1364 | |
1365 | if (nextMXOffset == 0) |
1366 | { |
1367 | /* Corner case 2: next multixact is still being filled in */ |
1368 | LWLockRelease(MultiXactOffsetControlLock); |
1369 | CHECK_FOR_INTERRUPTS(); |
1370 | pg_usleep(1000L); |
1371 | goto retry; |
1372 | } |
1373 | |
1374 | length = nextMXOffset - offset; |
1375 | } |
1376 | |
1377 | LWLockRelease(MultiXactOffsetControlLock); |
1378 | |
1379 | ptr = (MultiXactMember *) palloc(length * sizeof(MultiXactMember)); |
1380 | *members = ptr; |
1381 | |
1382 | /* Now get the members themselves. */ |
1383 | LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE); |
1384 | |
1385 | truelength = 0; |
1386 | prev_pageno = -1; |
1387 | for (i = 0; i < length; i++, offset++) |
1388 | { |
1389 | TransactionId *xactptr; |
1390 | uint32 *flagsptr; |
1391 | int flagsoff; |
1392 | int bshift; |
1393 | int memberoff; |
1394 | |
1395 | pageno = MXOffsetToMemberPage(offset); |
1396 | memberoff = MXOffsetToMemberOffset(offset); |
1397 | |
1398 | if (pageno != prev_pageno) |
1399 | { |
1400 | slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, multi); |
1401 | prev_pageno = pageno; |
1402 | } |
1403 | |
1404 | xactptr = (TransactionId *) |
1405 | (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff); |
1406 | |
1407 | if (!TransactionIdIsValid(*xactptr)) |
1408 | { |
1409 | /* Corner case 3: we must be looking at unused slot zero */ |
1410 | Assert(offset == 0); |
1411 | continue; |
1412 | } |
1413 | |
1414 | flagsoff = MXOffsetToFlagsOffset(offset); |
1415 | bshift = MXOffsetToFlagsBitShift(offset); |
1416 | flagsptr = (uint32 *) (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff); |
1417 | |
1418 | ptr[truelength].xid = *xactptr; |
1419 | ptr[truelength].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK; |
1420 | truelength++; |
1421 | } |
1422 | |
1423 | LWLockRelease(MultiXactMemberControlLock); |
1424 | |
1425 | /* |
1426 | * Copy the result into the local cache. |
1427 | */ |
1428 | mXactCachePut(multi, truelength, ptr); |
1429 | |
1430 | debug_elog3(DEBUG2, "GetMembers: no cache for %s" , |
1431 | mxid_to_string(multi, truelength, ptr)); |
1432 | return truelength; |
1433 | } |
1434 | |
1435 | /* |
1436 | * mxactMemberComparator |
1437 | * qsort comparison function for MultiXactMember |
1438 | * |
1439 | * We can't use wraparound comparison for XIDs because that does not respect |
1440 | * the triangle inequality! Any old sort order will do. |
1441 | */ |
1442 | static int |
1443 | mxactMemberComparator(const void *arg1, const void *arg2) |
1444 | { |
1445 | MultiXactMember member1 = *(const MultiXactMember *) arg1; |
1446 | MultiXactMember member2 = *(const MultiXactMember *) arg2; |
1447 | |
1448 | if (member1.xid > member2.xid) |
1449 | return 1; |
1450 | if (member1.xid < member2.xid) |
1451 | return -1; |
1452 | if (member1.status > member2.status) |
1453 | return 1; |
1454 | if (member1.status < member2.status) |
1455 | return -1; |
1456 | return 0; |
1457 | } |
1458 | |
1459 | /* |
1460 | * mXactCacheGetBySet |
1461 | * returns a MultiXactId from the cache based on the set of |
1462 | * TransactionIds that compose it, or InvalidMultiXactId if |
1463 | * none matches. |
1464 | * |
1465 | * This is helpful, for example, if two transactions want to lock a huge |
1466 | * table. By using the cache, the second will use the same MultiXactId |
1467 | * for the majority of tuples, thus keeping MultiXactId usage low (saving |
1468 | * both I/O and wraparound issues). |
1469 | * |
1470 | * NB: the passed members array will be sorted in-place. |
1471 | */ |
1472 | static MultiXactId |
1473 | mXactCacheGetBySet(int nmembers, MultiXactMember *members) |
1474 | { |
1475 | dlist_iter iter; |
1476 | |
1477 | debug_elog3(DEBUG2, "CacheGet: looking for %s" , |
1478 | mxid_to_string(InvalidMultiXactId, nmembers, members)); |
1479 | |
1480 | /* sort the array so comparison is easy */ |
1481 | qsort(members, nmembers, sizeof(MultiXactMember), mxactMemberComparator); |
1482 | |
1483 | dlist_foreach(iter, &MXactCache) |
1484 | { |
1485 | mXactCacheEnt *entry = dlist_container(mXactCacheEnt, node, iter.cur); |
1486 | |
1487 | if (entry->nmembers != nmembers) |
1488 | continue; |
1489 | |
1490 | /* |
1491 | * We assume the cache entries are sorted, and that the unused bits in |
1492 | * "status" are zeroed. |
1493 | */ |
1494 | if (memcmp(members, entry->members, nmembers * sizeof(MultiXactMember)) == 0) |
1495 | { |
1496 | debug_elog3(DEBUG2, "CacheGet: found %u" , entry->multi); |
1497 | dlist_move_head(&MXactCache, iter.cur); |
1498 | return entry->multi; |
1499 | } |
1500 | } |
1501 | |
1502 | debug_elog2(DEBUG2, "CacheGet: not found :-(" ); |
1503 | return InvalidMultiXactId; |
1504 | } |
1505 | |
1506 | /* |
1507 | * mXactCacheGetById |
1508 | * returns the composing MultiXactMember set from the cache for a |
1509 | * given MultiXactId, if present. |
1510 | * |
1511 | * If successful, *xids is set to the address of a palloc'd copy of the |
1512 | * MultiXactMember set. Return value is number of members, or -1 on failure. |
1513 | */ |
1514 | static int |
1515 | mXactCacheGetById(MultiXactId multi, MultiXactMember **members) |
1516 | { |
1517 | dlist_iter iter; |
1518 | |
1519 | debug_elog3(DEBUG2, "CacheGet: looking for %u" , multi); |
1520 | |
1521 | dlist_foreach(iter, &MXactCache) |
1522 | { |
1523 | mXactCacheEnt *entry = dlist_container(mXactCacheEnt, node, iter.cur); |
1524 | |
1525 | if (entry->multi == multi) |
1526 | { |
1527 | MultiXactMember *ptr; |
1528 | Size size; |
1529 | |
1530 | size = sizeof(MultiXactMember) * entry->nmembers; |
1531 | ptr = (MultiXactMember *) palloc(size); |
1532 | *members = ptr; |
1533 | |
1534 | memcpy(ptr, entry->members, size); |
1535 | |
1536 | debug_elog3(DEBUG2, "CacheGet: found %s" , |
1537 | mxid_to_string(multi, |
1538 | entry->nmembers, |
1539 | entry->members)); |
1540 | |
1541 | /* |
1542 | * Note we modify the list while not using a modifiable iterator. |
1543 | * This is acceptable only because we exit the iteration |
1544 | * immediately afterwards. |
1545 | */ |
1546 | dlist_move_head(&MXactCache, iter.cur); |
1547 | |
1548 | return entry->nmembers; |
1549 | } |
1550 | } |
1551 | |
1552 | debug_elog2(DEBUG2, "CacheGet: not found" ); |
1553 | return -1; |
1554 | } |
1555 | |
1556 | /* |
1557 | * mXactCachePut |
1558 | * Add a new MultiXactId and its composing set into the local cache. |
1559 | */ |
1560 | static void |
1561 | mXactCachePut(MultiXactId multi, int nmembers, MultiXactMember *members) |
1562 | { |
1563 | mXactCacheEnt *entry; |
1564 | |
1565 | debug_elog3(DEBUG2, "CachePut: storing %s" , |
1566 | mxid_to_string(multi, nmembers, members)); |
1567 | |
1568 | if (MXactContext == NULL) |
1569 | { |
1570 | /* The cache only lives as long as the current transaction */ |
1571 | debug_elog2(DEBUG2, "CachePut: initializing memory context" ); |
1572 | MXactContext = AllocSetContextCreate(TopTransactionContext, |
1573 | "MultiXact cache context" , |
1574 | ALLOCSET_SMALL_SIZES); |
1575 | } |
1576 | |
1577 | entry = (mXactCacheEnt *) |
1578 | MemoryContextAlloc(MXactContext, |
1579 | offsetof(mXactCacheEnt, members) + |
1580 | nmembers * sizeof(MultiXactMember)); |
1581 | |
1582 | entry->multi = multi; |
1583 | entry->nmembers = nmembers; |
1584 | memcpy(entry->members, members, nmembers * sizeof(MultiXactMember)); |
1585 | |
1586 | /* mXactCacheGetBySet assumes the entries are sorted, so sort them */ |
1587 | qsort(entry->members, nmembers, sizeof(MultiXactMember), mxactMemberComparator); |
1588 | |
1589 | dlist_push_head(&MXactCache, &entry->node); |
1590 | if (MXactCacheMembers++ >= MAX_CACHE_ENTRIES) |
1591 | { |
1592 | dlist_node *node; |
1593 | mXactCacheEnt *entry; |
1594 | |
1595 | node = dlist_tail_node(&MXactCache); |
1596 | dlist_delete(node); |
1597 | MXactCacheMembers--; |
1598 | |
1599 | entry = dlist_container(mXactCacheEnt, node, node); |
1600 | debug_elog3(DEBUG2, "CachePut: pruning cached multi %u" , |
1601 | entry->multi); |
1602 | |
1603 | pfree(entry); |
1604 | } |
1605 | } |
1606 | |
1607 | static char * |
1608 | mxstatus_to_string(MultiXactStatus status) |
1609 | { |
1610 | switch (status) |
1611 | { |
1612 | case MultiXactStatusForKeyShare: |
1613 | return "keysh" ; |
1614 | case MultiXactStatusForShare: |
1615 | return "sh" ; |
1616 | case MultiXactStatusForNoKeyUpdate: |
1617 | return "fornokeyupd" ; |
1618 | case MultiXactStatusForUpdate: |
1619 | return "forupd" ; |
1620 | case MultiXactStatusNoKeyUpdate: |
1621 | return "nokeyupd" ; |
1622 | case MultiXactStatusUpdate: |
1623 | return "upd" ; |
1624 | default: |
1625 | elog(ERROR, "unrecognized multixact status %d" , status); |
1626 | return "" ; |
1627 | } |
1628 | } |
1629 | |
1630 | char * |
1631 | mxid_to_string(MultiXactId multi, int nmembers, MultiXactMember *members) |
1632 | { |
1633 | static char *str = NULL; |
1634 | StringInfoData buf; |
1635 | int i; |
1636 | |
1637 | if (str != NULL) |
1638 | pfree(str); |
1639 | |
1640 | initStringInfo(&buf); |
1641 | |
1642 | appendStringInfo(&buf, "%u %d[%u (%s)" , multi, nmembers, members[0].xid, |
1643 | mxstatus_to_string(members[0].status)); |
1644 | |
1645 | for (i = 1; i < nmembers; i++) |
1646 | appendStringInfo(&buf, ", %u (%s)" , members[i].xid, |
1647 | mxstatus_to_string(members[i].status)); |
1648 | |
1649 | appendStringInfoChar(&buf, ']'); |
1650 | str = MemoryContextStrdup(TopMemoryContext, buf.data); |
1651 | pfree(buf.data); |
1652 | return str; |
1653 | } |
1654 | |
1655 | /* |
1656 | * AtEOXact_MultiXact |
1657 | * Handle transaction end for MultiXact |
1658 | * |
1659 | * This is called at top transaction commit or abort (we don't care which). |
1660 | */ |
1661 | void |
1662 | AtEOXact_MultiXact(void) |
1663 | { |
1664 | /* |
1665 | * Reset our OldestMemberMXactId and OldestVisibleMXactId values, both of |
1666 | * which should only be valid while within a transaction. |
1667 | * |
1668 | * We assume that storing a MultiXactId is atomic and so we need not take |
1669 | * MultiXactGenLock to do this. |
1670 | */ |
1671 | OldestMemberMXactId[MyBackendId] = InvalidMultiXactId; |
1672 | OldestVisibleMXactId[MyBackendId] = InvalidMultiXactId; |
1673 | |
1674 | /* |
1675 | * Discard the local MultiXactId cache. Since MXactContext was created as |
1676 | * a child of TopTransactionContext, we needn't delete it explicitly. |
1677 | */ |
1678 | MXactContext = NULL; |
1679 | dlist_init(&MXactCache); |
1680 | MXactCacheMembers = 0; |
1681 | } |
1682 | |
1683 | /* |
1684 | * AtPrepare_MultiXact |
1685 | * Save multixact state at 2PC transaction prepare |
1686 | * |
1687 | * In this phase, we only store our OldestMemberMXactId value in the two-phase |
1688 | * state file. |
1689 | */ |
1690 | void |
1691 | AtPrepare_MultiXact(void) |
1692 | { |
1693 | MultiXactId myOldestMember = OldestMemberMXactId[MyBackendId]; |
1694 | |
1695 | if (MultiXactIdIsValid(myOldestMember)) |
1696 | RegisterTwoPhaseRecord(TWOPHASE_RM_MULTIXACT_ID, 0, |
1697 | &myOldestMember, sizeof(MultiXactId)); |
1698 | } |
1699 | |
1700 | /* |
1701 | * PostPrepare_MultiXact |
1702 | * Clean up after successful PREPARE TRANSACTION |
1703 | */ |
1704 | void |
1705 | PostPrepare_MultiXact(TransactionId xid) |
1706 | { |
1707 | MultiXactId myOldestMember; |
1708 | |
1709 | /* |
1710 | * Transfer our OldestMemberMXactId value to the slot reserved for the |
1711 | * prepared transaction. |
1712 | */ |
1713 | myOldestMember = OldestMemberMXactId[MyBackendId]; |
1714 | if (MultiXactIdIsValid(myOldestMember)) |
1715 | { |
1716 | BackendId dummyBackendId = TwoPhaseGetDummyBackendId(xid, false); |
1717 | |
1718 | /* |
1719 | * Even though storing MultiXactId is atomic, acquire lock to make |
1720 | * sure others see both changes, not just the reset of the slot of the |
1721 | * current backend. Using a volatile pointer might suffice, but this |
1722 | * isn't a hot spot. |
1723 | */ |
1724 | LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); |
1725 | |
1726 | OldestMemberMXactId[dummyBackendId] = myOldestMember; |
1727 | OldestMemberMXactId[MyBackendId] = InvalidMultiXactId; |
1728 | |
1729 | LWLockRelease(MultiXactGenLock); |
1730 | } |
1731 | |
1732 | /* |
1733 | * We don't need to transfer OldestVisibleMXactId value, because the |
1734 | * transaction is not going to be looking at any more multixacts once it's |
1735 | * prepared. |
1736 | * |
1737 | * We assume that storing a MultiXactId is atomic and so we need not take |
1738 | * MultiXactGenLock to do this. |
1739 | */ |
1740 | OldestVisibleMXactId[MyBackendId] = InvalidMultiXactId; |
1741 | |
1742 | /* |
1743 | * Discard the local MultiXactId cache like in AtEOX_MultiXact |
1744 | */ |
1745 | MXactContext = NULL; |
1746 | dlist_init(&MXactCache); |
1747 | MXactCacheMembers = 0; |
1748 | } |
1749 | |
1750 | /* |
1751 | * multixact_twophase_recover |
1752 | * Recover the state of a prepared transaction at startup |
1753 | */ |
1754 | void |
1755 | multixact_twophase_recover(TransactionId xid, uint16 info, |
1756 | void *recdata, uint32 len) |
1757 | { |
1758 | BackendId dummyBackendId = TwoPhaseGetDummyBackendId(xid, false); |
1759 | MultiXactId oldestMember; |
1760 | |
1761 | /* |
1762 | * Get the oldest member XID from the state file record, and set it in the |
1763 | * OldestMemberMXactId slot reserved for this prepared transaction. |
1764 | */ |
1765 | Assert(len == sizeof(MultiXactId)); |
1766 | oldestMember = *((MultiXactId *) recdata); |
1767 | |
1768 | OldestMemberMXactId[dummyBackendId] = oldestMember; |
1769 | } |
1770 | |
1771 | /* |
1772 | * multixact_twophase_postcommit |
1773 | * Similar to AtEOX_MultiXact but for COMMIT PREPARED |
1774 | */ |
1775 | void |
1776 | multixact_twophase_postcommit(TransactionId xid, uint16 info, |
1777 | void *recdata, uint32 len) |
1778 | { |
1779 | BackendId dummyBackendId = TwoPhaseGetDummyBackendId(xid, true); |
1780 | |
1781 | Assert(len == sizeof(MultiXactId)); |
1782 | |
1783 | OldestMemberMXactId[dummyBackendId] = InvalidMultiXactId; |
1784 | } |
1785 | |
/*
 * multixact_twophase_postabort
 *		Clean up MultiXact state for ROLLBACK PREPARED.
 *
 * The work is identical to the COMMIT PREPARED case, so this simply forwards
 * all of its arguments to multixact_twophase_postcommit.
 */
void
multixact_twophase_postabort(TransactionId xid, uint16 info,
							 void *recdata, uint32 len)
{
	/* Same slot reset as for commit */
	multixact_twophase_postcommit(xid, info, recdata, len);
}
1796 | |
1797 | /* |
1798 | * Initialization of shared memory for MultiXact. We use two SLRU areas, |
1799 | * thus double memory. Also, reserve space for the shared MultiXactState |
1800 | * struct and the per-backend MultiXactId arrays (two of those, too). |
1801 | */ |
1802 | Size |
1803 | MultiXactShmemSize(void) |
1804 | { |
1805 | Size size; |
1806 | |
1807 | /* We need 2*MaxOldestSlot + 1 perBackendXactIds[] entries */ |
1808 | #define SHARED_MULTIXACT_STATE_SIZE \ |
1809 | add_size(offsetof(MultiXactStateData, perBackendXactIds) + sizeof(MultiXactId), \ |
1810 | mul_size(sizeof(MultiXactId) * 2, MaxOldestSlot)) |
1811 | |
1812 | size = SHARED_MULTIXACT_STATE_SIZE; |
1813 | size = add_size(size, SimpleLruShmemSize(NUM_MXACTOFFSET_BUFFERS, 0)); |
1814 | size = add_size(size, SimpleLruShmemSize(NUM_MXACTMEMBER_BUFFERS, 0)); |
1815 | |
1816 | return size; |
1817 | } |
1818 | |
1819 | void |
1820 | MultiXactShmemInit(void) |
1821 | { |
1822 | bool found; |
1823 | |
1824 | debug_elog2(DEBUG2, "Shared Memory Init for MultiXact" ); |
1825 | |
1826 | MultiXactOffsetCtl->PagePrecedes = MultiXactOffsetPagePrecedes; |
1827 | MultiXactMemberCtl->PagePrecedes = MultiXactMemberPagePrecedes; |
1828 | |
1829 | SimpleLruInit(MultiXactOffsetCtl, |
1830 | "multixact_offset" , NUM_MXACTOFFSET_BUFFERS, 0, |
1831 | MultiXactOffsetControlLock, "pg_multixact/offsets" , |
1832 | LWTRANCHE_MXACTOFFSET_BUFFERS); |
1833 | SimpleLruInit(MultiXactMemberCtl, |
1834 | "multixact_member" , NUM_MXACTMEMBER_BUFFERS, 0, |
1835 | MultiXactMemberControlLock, "pg_multixact/members" , |
1836 | LWTRANCHE_MXACTMEMBER_BUFFERS); |
1837 | |
1838 | /* Initialize our shared state struct */ |
1839 | MultiXactState = ShmemInitStruct("Shared MultiXact State" , |
1840 | SHARED_MULTIXACT_STATE_SIZE, |
1841 | &found); |
1842 | if (!IsUnderPostmaster) |
1843 | { |
1844 | Assert(!found); |
1845 | |
1846 | /* Make sure we zero out the per-backend state */ |
1847 | MemSet(MultiXactState, 0, SHARED_MULTIXACT_STATE_SIZE); |
1848 | } |
1849 | else |
1850 | Assert(found); |
1851 | |
1852 | /* |
1853 | * Set up array pointers. Note that perBackendXactIds[0] is wasted space |
1854 | * since we only use indexes 1..MaxOldestSlot in each array. |
1855 | */ |
1856 | OldestMemberMXactId = MultiXactState->perBackendXactIds; |
1857 | OldestVisibleMXactId = OldestMemberMXactId + MaxOldestSlot; |
1858 | } |
1859 | |
1860 | /* |
1861 | * This func must be called ONCE on system install. It creates the initial |
1862 | * MultiXact segments. (The MultiXacts directories are assumed to have been |
1863 | * created by initdb, and MultiXactShmemInit must have been called already.) |
1864 | */ |
1865 | void |
1866 | BootStrapMultiXact(void) |
1867 | { |
1868 | int slotno; |
1869 | |
1870 | LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE); |
1871 | |
1872 | /* Create and zero the first page of the offsets log */ |
1873 | slotno = ZeroMultiXactOffsetPage(0, false); |
1874 | |
1875 | /* Make sure it's written out */ |
1876 | SimpleLruWritePage(MultiXactOffsetCtl, slotno); |
1877 | Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]); |
1878 | |
1879 | LWLockRelease(MultiXactOffsetControlLock); |
1880 | |
1881 | LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE); |
1882 | |
1883 | /* Create and zero the first page of the members log */ |
1884 | slotno = ZeroMultiXactMemberPage(0, false); |
1885 | |
1886 | /* Make sure it's written out */ |
1887 | SimpleLruWritePage(MultiXactMemberCtl, slotno); |
1888 | Assert(!MultiXactMemberCtl->shared->page_dirty[slotno]); |
1889 | |
1890 | LWLockRelease(MultiXactMemberControlLock); |
1891 | } |
1892 | |
1893 | /* |
1894 | * Initialize (or reinitialize) a page of MultiXactOffset to zeroes. |
1895 | * If writeXlog is true, also emit an XLOG record saying we did this. |
1896 | * |
1897 | * The page is not actually written, just set up in shared memory. |
1898 | * The slot number of the new page is returned. |
1899 | * |
1900 | * Control lock must be held at entry, and will be held at exit. |
1901 | */ |
1902 | static int |
1903 | ZeroMultiXactOffsetPage(int pageno, bool writeXlog) |
1904 | { |
1905 | int slotno; |
1906 | |
1907 | slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno); |
1908 | |
1909 | if (writeXlog) |
1910 | WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_OFF_PAGE); |
1911 | |
1912 | return slotno; |
1913 | } |
1914 | |
1915 | /* |
1916 | * Ditto, for MultiXactMember |
1917 | */ |
1918 | static int |
1919 | ZeroMultiXactMemberPage(int pageno, bool writeXlog) |
1920 | { |
1921 | int slotno; |
1922 | |
1923 | slotno = SimpleLruZeroPage(MultiXactMemberCtl, pageno); |
1924 | |
1925 | if (writeXlog) |
1926 | WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_MEM_PAGE); |
1927 | |
1928 | return slotno; |
1929 | } |
1930 | |
1931 | /* |
1932 | * MaybeExtendOffsetSlru |
1933 | * Extend the offsets SLRU area, if necessary |
1934 | * |
1935 | * After a binary upgrade from <= 9.2, the pg_multixact/offsets SLRU area might |
1936 | * contain files that are shorter than necessary; this would occur if the old |
1937 | * installation had used multixacts beyond the first page (files cannot be |
1938 | * copied, because the on-disk representation is different). pg_upgrade would |
1939 | * update pg_control to set the next offset value to be at that position, so |
1940 | * that tuples marked as locked by such MultiXacts would be seen as visible |
1941 | * without having to consult multixact. However, trying to create and use a |
1942 | * new MultiXactId would result in an error because the page on which the new |
1943 | * value would reside does not exist. This routine is in charge of creating |
1944 | * such pages. |
1945 | */ |
1946 | static void |
1947 | MaybeExtendOffsetSlru(void) |
1948 | { |
1949 | int pageno; |
1950 | |
1951 | pageno = MultiXactIdToOffsetPage(MultiXactState->nextMXact); |
1952 | |
1953 | LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE); |
1954 | |
1955 | if (!SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl, pageno)) |
1956 | { |
1957 | int slotno; |
1958 | |
1959 | /* |
1960 | * Fortunately for us, SimpleLruWritePage is already prepared to deal |
1961 | * with creating a new segment file even if the page we're writing is |
1962 | * not the first in it, so this is enough. |
1963 | */ |
1964 | slotno = ZeroMultiXactOffsetPage(pageno, false); |
1965 | SimpleLruWritePage(MultiXactOffsetCtl, slotno); |
1966 | } |
1967 | |
1968 | LWLockRelease(MultiXactOffsetControlLock); |
1969 | } |
1970 | |
1971 | /* |
1972 | * This must be called ONCE during postmaster or standalone-backend startup. |
1973 | * |
1974 | * StartupXLOG has already established nextMXact/nextOffset by calling |
1975 | * MultiXactSetNextMXact and/or MultiXactAdvanceNextMXact, and the oldestMulti |
1976 | * info from pg_control and/or MultiXactAdvanceOldest, but we haven't yet |
1977 | * replayed WAL. |
1978 | */ |
1979 | void |
1980 | StartupMultiXact(void) |
1981 | { |
1982 | MultiXactId multi = MultiXactState->nextMXact; |
1983 | MultiXactOffset offset = MultiXactState->nextOffset; |
1984 | int pageno; |
1985 | |
1986 | /* |
1987 | * Initialize offset's idea of the latest page number. |
1988 | */ |
1989 | pageno = MultiXactIdToOffsetPage(multi); |
1990 | MultiXactOffsetCtl->shared->latest_page_number = pageno; |
1991 | |
1992 | /* |
1993 | * Initialize member's idea of the latest page number. |
1994 | */ |
1995 | pageno = MXOffsetToMemberPage(offset); |
1996 | MultiXactMemberCtl->shared->latest_page_number = pageno; |
1997 | } |
1998 | |
1999 | /* |
2000 | * This must be called ONCE at the end of startup/recovery. |
2001 | */ |
2002 | void |
2003 | TrimMultiXact(void) |
2004 | { |
2005 | MultiXactId nextMXact; |
2006 | MultiXactOffset offset; |
2007 | MultiXactId oldestMXact; |
2008 | Oid oldestMXactDB; |
2009 | int pageno; |
2010 | int entryno; |
2011 | int flagsoff; |
2012 | |
2013 | LWLockAcquire(MultiXactGenLock, LW_SHARED); |
2014 | nextMXact = MultiXactState->nextMXact; |
2015 | offset = MultiXactState->nextOffset; |
2016 | oldestMXact = MultiXactState->oldestMultiXactId; |
2017 | oldestMXactDB = MultiXactState->oldestMultiXactDB; |
2018 | LWLockRelease(MultiXactGenLock); |
2019 | |
2020 | /* Clean up offsets state */ |
2021 | LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE); |
2022 | |
2023 | /* |
2024 | * (Re-)Initialize our idea of the latest page number for offsets. |
2025 | */ |
2026 | pageno = MultiXactIdToOffsetPage(nextMXact); |
2027 | MultiXactOffsetCtl->shared->latest_page_number = pageno; |
2028 | |
2029 | /* |
2030 | * Zero out the remainder of the current offsets page. See notes in |
2031 | * TrimCLOG() for background. Unlike CLOG, some WAL record covers every |
2032 | * pg_multixact SLRU mutation. Since, also unlike CLOG, we ignore the WAL |
2033 | * rule "write xlog before data," nextMXact successors may carry obsolete, |
2034 | * nonzero offset values. Zero those so case 2 of GetMultiXactIdMembers() |
2035 | * operates normally. |
2036 | */ |
2037 | entryno = MultiXactIdToOffsetEntry(nextMXact); |
2038 | if (entryno != 0) |
2039 | { |
2040 | int slotno; |
2041 | MultiXactOffset *offptr; |
2042 | |
2043 | slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact); |
2044 | offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; |
2045 | offptr += entryno; |
2046 | |
2047 | MemSet(offptr, 0, BLCKSZ - (entryno * sizeof(MultiXactOffset))); |
2048 | |
2049 | MultiXactOffsetCtl->shared->page_dirty[slotno] = true; |
2050 | } |
2051 | |
2052 | LWLockRelease(MultiXactOffsetControlLock); |
2053 | |
2054 | /* And the same for members */ |
2055 | LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE); |
2056 | |
2057 | /* |
2058 | * (Re-)Initialize our idea of the latest page number for members. |
2059 | */ |
2060 | pageno = MXOffsetToMemberPage(offset); |
2061 | MultiXactMemberCtl->shared->latest_page_number = pageno; |
2062 | |
2063 | /* |
2064 | * Zero out the remainder of the current members page. See notes in |
2065 | * TrimCLOG() for motivation. |
2066 | */ |
2067 | flagsoff = MXOffsetToFlagsOffset(offset); |
2068 | if (flagsoff != 0) |
2069 | { |
2070 | int slotno; |
2071 | TransactionId *xidptr; |
2072 | int memberoff; |
2073 | |
2074 | memberoff = MXOffsetToMemberOffset(offset); |
2075 | slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, offset); |
2076 | xidptr = (TransactionId *) |
2077 | (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff); |
2078 | |
2079 | MemSet(xidptr, 0, BLCKSZ - memberoff); |
2080 | |
2081 | /* |
2082 | * Note: we don't need to zero out the flag bits in the remaining |
2083 | * members of the current group, because they are always reset before |
2084 | * writing. |
2085 | */ |
2086 | |
2087 | MultiXactMemberCtl->shared->page_dirty[slotno] = true; |
2088 | } |
2089 | |
2090 | LWLockRelease(MultiXactMemberControlLock); |
2091 | |
2092 | /* signal that we're officially up */ |
2093 | LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); |
2094 | MultiXactState->finishedStartup = true; |
2095 | LWLockRelease(MultiXactGenLock); |
2096 | |
2097 | /* Now compute how far away the next members wraparound is. */ |
2098 | SetMultiXactIdLimit(oldestMXact, oldestMXactDB, true); |
2099 | } |
2100 | |
/*
 * ShutdownMultiXact
 *		Flush both MultiXact SLRU areas at shutdown.
 *
 * This must be called ONCE during postmaster or standalone-backend shutdown.
 */
void
ShutdownMultiXact(void)
{
	/*
	 * Flush dirty MultiXact pages to disk: offsets first, then members,
	 * bracketed by the checkpoint start/done trace macros.
	 */
	TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_START(false);
	SimpleLruFlush(MultiXactOffsetCtl, false);
	SimpleLruFlush(MultiXactMemberCtl, false);
	TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(false);
}
2113 | |
2114 | /* |
2115 | * Get the MultiXact data to save in a checkpoint record |
2116 | */ |
2117 | void |
2118 | MultiXactGetCheckptMulti(bool is_shutdown, |
2119 | MultiXactId *nextMulti, |
2120 | MultiXactOffset *nextMultiOffset, |
2121 | MultiXactId *oldestMulti, |
2122 | Oid *oldestMultiDB) |
2123 | { |
2124 | LWLockAcquire(MultiXactGenLock, LW_SHARED); |
2125 | *nextMulti = MultiXactState->nextMXact; |
2126 | *nextMultiOffset = MultiXactState->nextOffset; |
2127 | *oldestMulti = MultiXactState->oldestMultiXactId; |
2128 | *oldestMultiDB = MultiXactState->oldestMultiXactDB; |
2129 | LWLockRelease(MultiXactGenLock); |
2130 | |
2131 | debug_elog6(DEBUG2, |
2132 | "MultiXact: checkpoint is nextMulti %u, nextOffset %u, oldestMulti %u in DB %u" , |
2133 | *nextMulti, *nextMultiOffset, *oldestMulti, *oldestMultiDB); |
2134 | } |
2135 | |
/*
 * CheckPointMultiXact
 *		Perform a checkpoint --- either during shutdown, or on-the-fly.
 *
 * Flushes both MultiXact SLRU areas (offsets, then members).
 */
void
CheckPointMultiXact(void)
{
	/* Mark the start of the MultiXact checkpoint for tracing */
	TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_START(true);

	/* Flush dirty MultiXact pages to disk */
	SimpleLruFlush(MultiXactOffsetCtl, true);
	SimpleLruFlush(MultiXactMemberCtl, true);

	/* Mark the end of the MultiXact checkpoint for tracing */
	TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true);
}
2150 | |
2151 | /* |
2152 | * Set the next-to-be-assigned MultiXactId and offset |
2153 | * |
2154 | * This is used when we can determine the correct next ID/offset exactly |
2155 | * from a checkpoint record. Although this is only called during bootstrap |
2156 | * and XLog replay, we take the lock in case any hot-standby backends are |
2157 | * examining the values. |
2158 | */ |
2159 | void |
2160 | MultiXactSetNextMXact(MultiXactId nextMulti, |
2161 | MultiXactOffset nextMultiOffset) |
2162 | { |
2163 | debug_elog4(DEBUG2, "MultiXact: setting next multi to %u offset %u" , |
2164 | nextMulti, nextMultiOffset); |
2165 | LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); |
2166 | MultiXactState->nextMXact = nextMulti; |
2167 | MultiXactState->nextOffset = nextMultiOffset; |
2168 | LWLockRelease(MultiXactGenLock); |
2169 | |
2170 | /* |
2171 | * During a binary upgrade, make sure that the offsets SLRU is large |
2172 | * enough to contain the next value that would be created. |
2173 | * |
2174 | * We need to do this pretty early during the first startup in binary |
2175 | * upgrade mode: before StartupMultiXact() in fact, because this routine |
2176 | * is called even before that by StartupXLOG(). And we can't do it |
2177 | * earlier than at this point, because during that first call of this |
2178 | * routine we determine the MultiXactState->nextMXact value that |
2179 | * MaybeExtendOffsetSlru needs. |
2180 | */ |
2181 | if (IsBinaryUpgrade) |
2182 | MaybeExtendOffsetSlru(); |
2183 | } |
2184 | |
2185 | /* |
2186 | * Determine the last safe MultiXactId to allocate given the currently oldest |
2187 | * datminmxid (ie, the oldest MultiXactId that might exist in any database |
2188 | * of our cluster), and the OID of the (or a) database with that value. |
2189 | * |
2190 | * is_startup is true when we are just starting the cluster, false when we |
2191 | * are updating state in a running cluster. This only affects log messages. |
2192 | */ |
2193 | void |
2194 | SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid, |
2195 | bool is_startup) |
2196 | { |
2197 | MultiXactId multiVacLimit; |
2198 | MultiXactId multiWarnLimit; |
2199 | MultiXactId multiStopLimit; |
2200 | MultiXactId multiWrapLimit; |
2201 | MultiXactId curMulti; |
2202 | bool needs_offset_vacuum; |
2203 | |
2204 | Assert(MultiXactIdIsValid(oldest_datminmxid)); |
2205 | |
2206 | /* |
2207 | * We pretend that a wrap will happen halfway through the multixact ID |
2208 | * space, but that's not really true, because multixacts wrap differently |
2209 | * from transaction IDs. Note that, separately from any concern about |
2210 | * multixact IDs wrapping, we must ensure that multixact members do not |
2211 | * wrap. Limits for that are set in SetOffsetVacuumLimit, not here. |
2212 | */ |
2213 | multiWrapLimit = oldest_datminmxid + (MaxMultiXactId >> 1); |
2214 | if (multiWrapLimit < FirstMultiXactId) |
2215 | multiWrapLimit += FirstMultiXactId; |
2216 | |
2217 | /* |
2218 | * We'll refuse to continue assigning MultiXactIds once we get within 100 |
2219 | * multi of data loss. |
2220 | * |
2221 | * Note: This differs from the magic number used in |
2222 | * SetTransactionIdLimit() since vacuum itself will never generate new |
2223 | * multis. XXX actually it does, if it needs to freeze old multis. |
2224 | */ |
2225 | multiStopLimit = multiWrapLimit - 100; |
2226 | if (multiStopLimit < FirstMultiXactId) |
2227 | multiStopLimit -= FirstMultiXactId; |
2228 | |
2229 | /* |
2230 | * We'll start complaining loudly when we get within 10M multis of the |
2231 | * stop point. This is kind of arbitrary, but if you let your gas gauge |
2232 | * get down to 1% of full, would you be looking for the next gas station? |
2233 | * We need to be fairly liberal about this number because there are lots |
2234 | * of scenarios where most transactions are done by automatic clients that |
2235 | * won't pay attention to warnings. (No, we're not gonna make this |
2236 | * configurable. If you know enough to configure it, you know enough to |
2237 | * not get in this kind of trouble in the first place.) |
2238 | */ |
2239 | multiWarnLimit = multiStopLimit - 10000000; |
2240 | if (multiWarnLimit < FirstMultiXactId) |
2241 | multiWarnLimit -= FirstMultiXactId; |
2242 | |
2243 | /* |
2244 | * We'll start trying to force autovacuums when oldest_datminmxid gets to |
2245 | * be more than autovacuum_multixact_freeze_max_age mxids old. |
2246 | * |
2247 | * Note: autovacuum_multixact_freeze_max_age is a PGC_POSTMASTER parameter |
2248 | * so that we don't have to worry about dealing with on-the-fly changes in |
2249 | * its value. See SetTransactionIdLimit. |
2250 | */ |
2251 | multiVacLimit = oldest_datminmxid + autovacuum_multixact_freeze_max_age; |
2252 | if (multiVacLimit < FirstMultiXactId) |
2253 | multiVacLimit += FirstMultiXactId; |
2254 | |
2255 | /* Grab lock for just long enough to set the new limit values */ |
2256 | LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); |
2257 | MultiXactState->oldestMultiXactId = oldest_datminmxid; |
2258 | MultiXactState->oldestMultiXactDB = oldest_datoid; |
2259 | MultiXactState->multiVacLimit = multiVacLimit; |
2260 | MultiXactState->multiWarnLimit = multiWarnLimit; |
2261 | MultiXactState->multiStopLimit = multiStopLimit; |
2262 | MultiXactState->multiWrapLimit = multiWrapLimit; |
2263 | curMulti = MultiXactState->nextMXact; |
2264 | LWLockRelease(MultiXactGenLock); |
2265 | |
2266 | /* Log the info */ |
2267 | ereport(DEBUG1, |
2268 | (errmsg("MultiXactId wrap limit is %u, limited by database with OID %u" , |
2269 | multiWrapLimit, oldest_datoid))); |
2270 | |
2271 | /* |
2272 | * Computing the actual limits is only possible once the data directory is |
2273 | * in a consistent state. There's no need to compute the limits while |
2274 | * still replaying WAL - no decisions about new multis are made even |
2275 | * though multixact creations might be replayed. So we'll only do further |
2276 | * checks after TrimMultiXact() has been called. |
2277 | */ |
2278 | if (!MultiXactState->finishedStartup) |
2279 | return; |
2280 | |
2281 | Assert(!InRecovery); |
2282 | |
2283 | /* Set limits for offset vacuum. */ |
2284 | needs_offset_vacuum = SetOffsetVacuumLimit(is_startup); |
2285 | |
2286 | /* |
2287 | * If past the autovacuum force point, immediately signal an autovac |
2288 | * request. The reason for this is that autovac only processes one |
2289 | * database per invocation. Once it's finished cleaning up the oldest |
2290 | * database, it'll call here, and we'll signal the postmaster to start |
2291 | * another iteration immediately if there are still any old databases. |
2292 | */ |
2293 | if ((MultiXactIdPrecedes(multiVacLimit, curMulti) || |
2294 | needs_offset_vacuum) && IsUnderPostmaster) |
2295 | SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); |
2296 | |
2297 | /* Give an immediate warning if past the wrap warn point */ |
2298 | if (MultiXactIdPrecedes(multiWarnLimit, curMulti)) |
2299 | { |
2300 | char *oldest_datname; |
2301 | |
2302 | /* |
2303 | * We can be called when not inside a transaction, for example during |
2304 | * StartupXLOG(). In such a case we cannot do database access, so we |
2305 | * must just report the oldest DB's OID. |
2306 | * |
2307 | * Note: it's also possible that get_database_name fails and returns |
2308 | * NULL, for example because the database just got dropped. We'll |
2309 | * still warn, even though the warning might now be unnecessary. |
2310 | */ |
2311 | if (IsTransactionState()) |
2312 | oldest_datname = get_database_name(oldest_datoid); |
2313 | else |
2314 | oldest_datname = NULL; |
2315 | |
2316 | if (oldest_datname) |
2317 | ereport(WARNING, |
2318 | (errmsg_plural("database \"%s\" must be vacuumed before %u more MultiXactId is used" , |
2319 | "database \"%s\" must be vacuumed before %u more MultiXactIds are used" , |
2320 | multiWrapLimit - curMulti, |
2321 | oldest_datname, |
2322 | multiWrapLimit - curMulti), |
2323 | errhint("To avoid a database shutdown, execute a database-wide VACUUM in that database.\n" |
2324 | "You might also need to commit or roll back old prepared transactions, or drop stale replication slots." ))); |
2325 | else |
2326 | ereport(WARNING, |
2327 | (errmsg_plural("database with OID %u must be vacuumed before %u more MultiXactId is used" , |
2328 | "database with OID %u must be vacuumed before %u more MultiXactIds are used" , |
2329 | multiWrapLimit - curMulti, |
2330 | oldest_datoid, |
2331 | multiWrapLimit - curMulti), |
2332 | errhint("To avoid a database shutdown, execute a database-wide VACUUM in that database.\n" |
2333 | "You might also need to commit or roll back old prepared transactions, or drop stale replication slots." ))); |
2334 | } |
2335 | } |
2336 | |
2337 | /* |
2338 | * Ensure the next-to-be-assigned MultiXactId is at least minMulti, |
2339 | * and similarly nextOffset is at least minMultiOffset. |
2340 | * |
2341 | * This is used when we can determine minimum safe values from an XLog |
2342 | * record (either an on-line checkpoint or an mxact creation log entry). |
2343 | * Although this is only called during XLog replay, we take the lock in case |
2344 | * any hot-standby backends are examining the values. |
2345 | */ |
2346 | void |
2347 | MultiXactAdvanceNextMXact(MultiXactId minMulti, |
2348 | MultiXactOffset minMultiOffset) |
2349 | { |
2350 | LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); |
2351 | if (MultiXactIdPrecedes(MultiXactState->nextMXact, minMulti)) |
2352 | { |
2353 | debug_elog3(DEBUG2, "MultiXact: setting next multi to %u" , minMulti); |
2354 | MultiXactState->nextMXact = minMulti; |
2355 | } |
2356 | if (MultiXactOffsetPrecedes(MultiXactState->nextOffset, minMultiOffset)) |
2357 | { |
2358 | debug_elog3(DEBUG2, "MultiXact: setting next offset to %u" , |
2359 | minMultiOffset); |
2360 | MultiXactState->nextOffset = minMultiOffset; |
2361 | } |
2362 | LWLockRelease(MultiXactGenLock); |
2363 | } |
2364 | |
2365 | /* |
2366 | * Update our oldestMultiXactId value, but only if it's more recent than what |
2367 | * we had. |
2368 | * |
2369 | * This may only be called during WAL replay. |
2370 | */ |
2371 | void |
2372 | MultiXactAdvanceOldest(MultiXactId oldestMulti, Oid oldestMultiDB) |
2373 | { |
2374 | Assert(InRecovery); |
2375 | |
2376 | if (MultiXactIdPrecedes(MultiXactState->oldestMultiXactId, oldestMulti)) |
2377 | SetMultiXactIdLimit(oldestMulti, oldestMultiDB, false); |
2378 | } |
2379 | |
2380 | /* |
2381 | * Make sure that MultiXactOffset has room for a newly-allocated MultiXactId. |
2382 | * |
2383 | * NB: this is called while holding MultiXactGenLock. We want it to be very |
2384 | * fast most of the time; even when it's not so fast, no actual I/O need |
2385 | * happen unless we're forced to write out a dirty log or xlog page to make |
2386 | * room in shared memory. |
2387 | */ |
2388 | static void |
2389 | ExtendMultiXactOffset(MultiXactId multi) |
2390 | { |
2391 | int pageno; |
2392 | |
2393 | /* |
2394 | * No work except at first MultiXactId of a page. But beware: just after |
2395 | * wraparound, the first MultiXactId of page zero is FirstMultiXactId. |
2396 | */ |
2397 | if (MultiXactIdToOffsetEntry(multi) != 0 && |
2398 | multi != FirstMultiXactId) |
2399 | return; |
2400 | |
2401 | pageno = MultiXactIdToOffsetPage(multi); |
2402 | |
2403 | LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE); |
2404 | |
2405 | /* Zero the page and make an XLOG entry about it */ |
2406 | ZeroMultiXactOffsetPage(pageno, true); |
2407 | |
2408 | LWLockRelease(MultiXactOffsetControlLock); |
2409 | } |
2410 | |
2411 | /* |
2412 | * Make sure that MultiXactMember has room for the members of a newly- |
2413 | * allocated MultiXactId. |
2414 | * |
2415 | * Like the above routine, this is called while holding MultiXactGenLock; |
2416 | * same comments apply. |
2417 | */ |
2418 | static void |
2419 | ExtendMultiXactMember(MultiXactOffset offset, int nmembers) |
2420 | { |
2421 | /* |
2422 | * It's possible that the members span more than one page of the members |
2423 | * file, so we loop to ensure we consider each page. The coding is not |
2424 | * optimal if the members span several pages, but that seems unusual |
2425 | * enough to not worry much about. |
2426 | */ |
2427 | while (nmembers > 0) |
2428 | { |
2429 | int flagsoff; |
2430 | int flagsbit; |
2431 | uint32 difference; |
2432 | |
2433 | /* |
2434 | * Only zero when at first entry of a page. |
2435 | */ |
2436 | flagsoff = MXOffsetToFlagsOffset(offset); |
2437 | flagsbit = MXOffsetToFlagsBitShift(offset); |
2438 | if (flagsoff == 0 && flagsbit == 0) |
2439 | { |
2440 | int pageno; |
2441 | |
2442 | pageno = MXOffsetToMemberPage(offset); |
2443 | |
2444 | LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE); |
2445 | |
2446 | /* Zero the page and make an XLOG entry about it */ |
2447 | ZeroMultiXactMemberPage(pageno, true); |
2448 | |
2449 | LWLockRelease(MultiXactMemberControlLock); |
2450 | } |
2451 | |
2452 | /* |
2453 | * Compute the number of items till end of current page. Careful: if |
2454 | * addition of unsigned ints wraps around, we're at the last page of |
2455 | * the last segment; since that page holds a different number of items |
2456 | * than other pages, we need to do it differently. |
2457 | */ |
2458 | if (offset + MAX_MEMBERS_IN_LAST_MEMBERS_PAGE < offset) |
2459 | { |
2460 | /* |
2461 | * This is the last page of the last segment; we can compute the |
2462 | * number of items left to allocate in it without modulo |
2463 | * arithmetic. |
2464 | */ |
2465 | difference = MaxMultiXactOffset - offset + 1; |
2466 | } |
2467 | else |
2468 | difference = MULTIXACT_MEMBERS_PER_PAGE - offset % MULTIXACT_MEMBERS_PER_PAGE; |
2469 | |
2470 | /* |
2471 | * Advance to next page, taking care to properly handle the wraparound |
2472 | * case. OK if nmembers goes negative. |
2473 | */ |
2474 | nmembers -= difference; |
2475 | offset += difference; |
2476 | } |
2477 | } |
2478 | |
2479 | /* |
2480 | * GetOldestMultiXactId |
2481 | * |
2482 | * Return the oldest MultiXactId that's still possibly still seen as live by |
2483 | * any running transaction. Older ones might still exist on disk, but they no |
2484 | * longer have any running member transaction. |
2485 | * |
2486 | * It's not safe to truncate MultiXact SLRU segments on the value returned by |
2487 | * this function; however, it can be used by a full-table vacuum to set the |
2488 | * point at which it will be possible to truncate SLRU for that table. |
2489 | */ |
2490 | MultiXactId |
2491 | GetOldestMultiXactId(void) |
2492 | { |
2493 | MultiXactId oldestMXact; |
2494 | MultiXactId nextMXact; |
2495 | int i; |
2496 | |
2497 | /* |
2498 | * This is the oldest valid value among all the OldestMemberMXactId[] and |
2499 | * OldestVisibleMXactId[] entries, or nextMXact if none are valid. |
2500 | */ |
2501 | LWLockAcquire(MultiXactGenLock, LW_SHARED); |
2502 | |
2503 | /* |
2504 | * We have to beware of the possibility that nextMXact is in the |
2505 | * wrapped-around state. We don't fix the counter itself here, but we |
2506 | * must be sure to use a valid value in our calculation. |
2507 | */ |
2508 | nextMXact = MultiXactState->nextMXact; |
2509 | if (nextMXact < FirstMultiXactId) |
2510 | nextMXact = FirstMultiXactId; |
2511 | |
2512 | oldestMXact = nextMXact; |
2513 | for (i = 1; i <= MaxOldestSlot; i++) |
2514 | { |
2515 | MultiXactId thisoldest; |
2516 | |
2517 | thisoldest = OldestMemberMXactId[i]; |
2518 | if (MultiXactIdIsValid(thisoldest) && |
2519 | MultiXactIdPrecedes(thisoldest, oldestMXact)) |
2520 | oldestMXact = thisoldest; |
2521 | thisoldest = OldestVisibleMXactId[i]; |
2522 | if (MultiXactIdIsValid(thisoldest) && |
2523 | MultiXactIdPrecedes(thisoldest, oldestMXact)) |
2524 | oldestMXact = thisoldest; |
2525 | } |
2526 | |
2527 | LWLockRelease(MultiXactGenLock); |
2528 | |
2529 | return oldestMXact; |
2530 | } |
2531 | |
2532 | /* |
2533 | * Determine how aggressively we need to vacuum in order to prevent member |
2534 | * wraparound. |
2535 | * |
2536 | * To do so determine what's the oldest member offset and install the limit |
2537 | * info in MultiXactState, where it can be used to prevent overrun of old data |
2538 | * in the members SLRU area. |
2539 | * |
2540 | * The return value is true if emergency autovacuum is required and false |
2541 | * otherwise. |
2542 | */ |
2543 | static bool |
2544 | SetOffsetVacuumLimit(bool is_startup) |
2545 | { |
2546 | MultiXactId oldestMultiXactId; |
2547 | MultiXactId nextMXact; |
2548 | MultiXactOffset oldestOffset = 0; /* placate compiler */ |
2549 | MultiXactOffset prevOldestOffset; |
2550 | MultiXactOffset nextOffset; |
2551 | bool oldestOffsetKnown = false; |
2552 | bool prevOldestOffsetKnown; |
2553 | MultiXactOffset offsetStopLimit = 0; |
2554 | MultiXactOffset prevOffsetStopLimit; |
2555 | |
2556 | /* |
2557 | * NB: Have to prevent concurrent truncation, we might otherwise try to |
2558 | * lookup an oldestMulti that's concurrently getting truncated away. |
2559 | */ |
2560 | LWLockAcquire(MultiXactTruncationLock, LW_SHARED); |
2561 | |
2562 | /* Read relevant fields from shared memory. */ |
2563 | LWLockAcquire(MultiXactGenLock, LW_SHARED); |
2564 | oldestMultiXactId = MultiXactState->oldestMultiXactId; |
2565 | nextMXact = MultiXactState->nextMXact; |
2566 | nextOffset = MultiXactState->nextOffset; |
2567 | prevOldestOffsetKnown = MultiXactState->oldestOffsetKnown; |
2568 | prevOldestOffset = MultiXactState->oldestOffset; |
2569 | prevOffsetStopLimit = MultiXactState->offsetStopLimit; |
2570 | Assert(MultiXactState->finishedStartup); |
2571 | LWLockRelease(MultiXactGenLock); |
2572 | |
2573 | /* |
2574 | * Determine the offset of the oldest multixact. Normally, we can read |
2575 | * the offset from the multixact itself, but there's an important special |
2576 | * case: if there are no multixacts in existence at all, oldestMXact |
2577 | * obviously can't point to one. It will instead point to the multixact |
2578 | * ID that will be assigned the next time one is needed. |
2579 | */ |
2580 | if (oldestMultiXactId == nextMXact) |
2581 | { |
2582 | /* |
2583 | * When the next multixact gets created, it will be stored at the next |
2584 | * offset. |
2585 | */ |
2586 | oldestOffset = nextOffset; |
2587 | oldestOffsetKnown = true; |
2588 | } |
2589 | else |
2590 | { |
2591 | /* |
2592 | * Figure out where the oldest existing multixact's offsets are |
2593 | * stored. Due to bugs in early release of PostgreSQL 9.3.X and 9.4.X, |
2594 | * the supposedly-earliest multixact might not really exist. We are |
2595 | * careful not to fail in that case. |
2596 | */ |
2597 | oldestOffsetKnown = |
2598 | find_multixact_start(oldestMultiXactId, &oldestOffset); |
2599 | |
2600 | if (oldestOffsetKnown) |
2601 | ereport(DEBUG1, |
2602 | (errmsg("oldest MultiXactId member is at offset %u" , |
2603 | oldestOffset))); |
2604 | else |
2605 | ereport(LOG, |
2606 | (errmsg("MultiXact member wraparound protections are disabled because oldest checkpointed MultiXact %u does not exist on disk" , |
2607 | oldestMultiXactId))); |
2608 | } |
2609 | |
2610 | LWLockRelease(MultiXactTruncationLock); |
2611 | |
2612 | /* |
2613 | * If we can, compute limits (and install them MultiXactState) to prevent |
2614 | * overrun of old data in the members SLRU area. We can only do so if the |
2615 | * oldest offset is known though. |
2616 | */ |
2617 | if (oldestOffsetKnown) |
2618 | { |
2619 | /* move back to start of the corresponding segment */ |
2620 | offsetStopLimit = oldestOffset - (oldestOffset % |
2621 | (MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT)); |
2622 | |
2623 | /* always leave one segment before the wraparound point */ |
2624 | offsetStopLimit -= (MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT); |
2625 | |
2626 | if (!prevOldestOffsetKnown && !is_startup) |
2627 | ereport(LOG, |
2628 | (errmsg("MultiXact member wraparound protections are now enabled" ))); |
2629 | |
2630 | ereport(DEBUG1, |
2631 | (errmsg("MultiXact member stop limit is now %u based on MultiXact %u" , |
2632 | offsetStopLimit, oldestMultiXactId))); |
2633 | } |
2634 | else if (prevOldestOffsetKnown) |
2635 | { |
2636 | /* |
2637 | * If we failed to get the oldest offset this time, but we have a |
2638 | * value from a previous pass through this function, use the old |
2639 | * values rather than automatically forcing an emergency autovacuum |
2640 | * cycle again. |
2641 | */ |
2642 | oldestOffset = prevOldestOffset; |
2643 | oldestOffsetKnown = true; |
2644 | offsetStopLimit = prevOffsetStopLimit; |
2645 | } |
2646 | |
2647 | /* Install the computed values */ |
2648 | LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); |
2649 | MultiXactState->oldestOffset = oldestOffset; |
2650 | MultiXactState->oldestOffsetKnown = oldestOffsetKnown; |
2651 | MultiXactState->offsetStopLimit = offsetStopLimit; |
2652 | LWLockRelease(MultiXactGenLock); |
2653 | |
2654 | /* |
2655 | * Do we need an emergency autovacuum? If we're not sure, assume yes. |
2656 | */ |
2657 | return !oldestOffsetKnown || |
2658 | (nextOffset - oldestOffset > MULTIXACT_MEMBER_SAFE_THRESHOLD); |
2659 | } |
2660 | |
2661 | /* |
2662 | * Return whether adding "distance" to "start" would move past "boundary". |
2663 | * |
2664 | * We use this to determine whether the addition is "wrapping around" the |
2665 | * boundary point, hence the name. The reason we don't want to use the regular |
2666 | * 2^31-modulo arithmetic here is that we want to be able to use the whole of |
2667 | * the 2^32-1 space here, allowing for more multixacts that would fit |
2668 | * otherwise. |
2669 | */ |
2670 | static bool |
2671 | MultiXactOffsetWouldWrap(MultiXactOffset boundary, MultiXactOffset start, |
2672 | uint32 distance) |
2673 | { |
2674 | MultiXactOffset finish; |
2675 | |
2676 | /* |
2677 | * Note that offset number 0 is not used (see GetMultiXactIdMembers), so |
2678 | * if the addition wraps around the UINT_MAX boundary, skip that value. |
2679 | */ |
2680 | finish = start + distance; |
2681 | if (finish < start) |
2682 | finish++; |
2683 | |
2684 | /*----------------------------------------------------------------------- |
2685 | * When the boundary is numerically greater than the starting point, any |
2686 | * value numerically between the two is not wrapped: |
2687 | * |
2688 | * <----S----B----> |
2689 | * [---) = F wrapped past B (and UINT_MAX) |
2690 | * [---) = F not wrapped |
2691 | * [----] = F wrapped past B |
2692 | * |
2693 | * When the boundary is numerically less than the starting point (i.e. the |
2694 | * UINT_MAX wraparound occurs somewhere in between) then all values in |
2695 | * between are wrapped: |
2696 | * |
2697 | * <----B----S----> |
2698 | * [---) = F not wrapped past B (but wrapped past UINT_MAX) |
2699 | * [---) = F wrapped past B (and UINT_MAX) |
2700 | * [----] = F not wrapped |
2701 | *----------------------------------------------------------------------- |
2702 | */ |
2703 | if (start < boundary) |
2704 | return finish >= boundary || finish < start; |
2705 | else |
2706 | return finish >= boundary && finish < start; |
2707 | } |
2708 | |
2709 | /* |
2710 | * Find the starting offset of the given MultiXactId. |
2711 | * |
2712 | * Returns false if the file containing the multi does not exist on disk. |
2713 | * Otherwise, returns true and sets *result to the starting member offset. |
2714 | * |
2715 | * This function does not prevent concurrent truncation, so if that's |
2716 | * required, the caller has to protect against that. |
2717 | */ |
2718 | static bool |
2719 | find_multixact_start(MultiXactId multi, MultiXactOffset *result) |
2720 | { |
2721 | MultiXactOffset offset; |
2722 | int pageno; |
2723 | int entryno; |
2724 | int slotno; |
2725 | MultiXactOffset *offptr; |
2726 | |
2727 | Assert(MultiXactState->finishedStartup); |
2728 | |
2729 | pageno = MultiXactIdToOffsetPage(multi); |
2730 | entryno = MultiXactIdToOffsetEntry(multi); |
2731 | |
2732 | /* |
2733 | * Flush out dirty data, so PhysicalPageExists can work correctly. |
2734 | * SimpleLruFlush() is a pretty big hammer for that. Alternatively we |
2735 | * could add an in-memory version of page exists, but find_multixact_start |
2736 | * is called infrequently, and it doesn't seem bad to flush buffers to |
2737 | * disk before truncation. |
2738 | */ |
2739 | SimpleLruFlush(MultiXactOffsetCtl, true); |
2740 | SimpleLruFlush(MultiXactMemberCtl, true); |
2741 | |
2742 | if (!SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl, pageno)) |
2743 | return false; |
2744 | |
2745 | /* lock is acquired by SimpleLruReadPage_ReadOnly */ |
2746 | slotno = SimpleLruReadPage_ReadOnly(MultiXactOffsetCtl, pageno, multi); |
2747 | offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; |
2748 | offptr += entryno; |
2749 | offset = *offptr; |
2750 | LWLockRelease(MultiXactOffsetControlLock); |
2751 | |
2752 | *result = offset; |
2753 | return true; |
2754 | } |
2755 | |
2756 | /* |
2757 | * Determine how many multixacts, and how many multixact members, currently |
2758 | * exist. Return false if unable to determine. |
2759 | */ |
2760 | static bool |
2761 | ReadMultiXactCounts(uint32 *multixacts, MultiXactOffset *members) |
2762 | { |
2763 | MultiXactOffset nextOffset; |
2764 | MultiXactOffset oldestOffset; |
2765 | MultiXactId oldestMultiXactId; |
2766 | MultiXactId nextMultiXactId; |
2767 | bool oldestOffsetKnown; |
2768 | |
2769 | LWLockAcquire(MultiXactGenLock, LW_SHARED); |
2770 | nextOffset = MultiXactState->nextOffset; |
2771 | oldestMultiXactId = MultiXactState->oldestMultiXactId; |
2772 | nextMultiXactId = MultiXactState->nextMXact; |
2773 | oldestOffset = MultiXactState->oldestOffset; |
2774 | oldestOffsetKnown = MultiXactState->oldestOffsetKnown; |
2775 | LWLockRelease(MultiXactGenLock); |
2776 | |
2777 | if (!oldestOffsetKnown) |
2778 | return false; |
2779 | |
2780 | *members = nextOffset - oldestOffset; |
2781 | *multixacts = nextMultiXactId - oldestMultiXactId; |
2782 | return true; |
2783 | } |
2784 | |
2785 | /* |
2786 | * Multixact members can be removed once the multixacts that refer to them |
2787 | * are older than every datminxmid. autovacuum_multixact_freeze_max_age and |
2788 | * vacuum_multixact_freeze_table_age work together to make sure we never have |
2789 | * too many multixacts; we hope that, at least under normal circumstances, |
2790 | * this will also be sufficient to keep us from using too many offsets. |
2791 | * However, if the average multixact has many members, we might exhaust the |
2792 | * members space while still using few enough members that these limits fail |
2793 | * to trigger full table scans for relminmxid advancement. At that point, |
2794 | * we'd have no choice but to start failing multixact-creating operations |
2795 | * with an error. |
2796 | * |
2797 | * To prevent that, if more than a threshold portion of the members space is |
2798 | * used, we effectively reduce autovacuum_multixact_freeze_max_age and |
2799 | * to a value just less than the number of multixacts in use. We hope that |
2800 | * this will quickly trigger autovacuuming on the table or tables with the |
2801 | * oldest relminmxid, thus allowing datminmxid values to advance and removing |
2802 | * some members. |
2803 | * |
2804 | * As the fraction of the member space currently in use grows, we become |
2805 | * more aggressive in clamping this value. That not only causes autovacuum |
2806 | * to ramp up, but also makes any manual vacuums the user issues more |
2807 | * aggressive. This happens because vacuum_set_xid_limits() clamps the |
2808 | * freeze table and the minimum freeze age based on the effective |
2809 | * autovacuum_multixact_freeze_max_age this function returns. In the worst |
2810 | * case, we'll claim the freeze_max_age to zero, and every vacuum of any |
2811 | * table will try to freeze every multixact. |
2812 | * |
2813 | * It's possible that these thresholds should be user-tunable, but for now |
2814 | * we keep it simple. |
2815 | */ |
2816 | int |
2817 | MultiXactMemberFreezeThreshold(void) |
2818 | { |
2819 | MultiXactOffset members; |
2820 | uint32 multixacts; |
2821 | uint32 victim_multixacts; |
2822 | double fraction; |
2823 | |
2824 | /* If we can't determine member space utilization, assume the worst. */ |
2825 | if (!ReadMultiXactCounts(&multixacts, &members)) |
2826 | return 0; |
2827 | |
2828 | /* If member space utilization is low, no special action is required. */ |
2829 | if (members <= MULTIXACT_MEMBER_SAFE_THRESHOLD) |
2830 | return autovacuum_multixact_freeze_max_age; |
2831 | |
2832 | /* |
2833 | * Compute a target for relminmxid advancement. The number of multixacts |
2834 | * we try to eliminate from the system is based on how far we are past |
2835 | * MULTIXACT_MEMBER_SAFE_THRESHOLD. |
2836 | */ |
2837 | fraction = (double) (members - MULTIXACT_MEMBER_SAFE_THRESHOLD) / |
2838 | (MULTIXACT_MEMBER_DANGER_THRESHOLD - MULTIXACT_MEMBER_SAFE_THRESHOLD); |
2839 | victim_multixacts = multixacts * fraction; |
2840 | |
2841 | /* fraction could be > 1.0, but lowest possible freeze age is zero */ |
2842 | if (victim_multixacts > multixacts) |
2843 | return 0; |
2844 | return multixacts - victim_multixacts; |
2845 | } |
2846 | |
/*
 * Working state passed through SlruScanDirectory() to
 * SlruScanDirCbFindEarliest().
 */
typedef struct mxtruncinfo
{
	int			earliestExistingPage;	/* earliest page recorded so far by the
										 * scan callback; -1 means none has
										 * been recorded yet */
} mxtruncinfo;
2851 | |
2852 | /* |
2853 | * SlruScanDirectory callback |
2854 | * This callback determines the earliest existing page number. |
2855 | */ |
2856 | static bool |
2857 | SlruScanDirCbFindEarliest(SlruCtl ctl, char *filename, int segpage, void *data) |
2858 | { |
2859 | mxtruncinfo *trunc = (mxtruncinfo *) data; |
2860 | |
2861 | if (trunc->earliestExistingPage == -1 || |
2862 | ctl->PagePrecedes(segpage, trunc->earliestExistingPage)) |
2863 | { |
2864 | trunc->earliestExistingPage = segpage; |
2865 | } |
2866 | |
2867 | return false; /* keep going */ |
2868 | } |
2869 | |
2870 | |
2871 | /* |
2872 | * Delete members segments [oldest, newOldest) |
2873 | * |
2874 | * The members SLRU can, in contrast to the offsets one, be filled to almost |
2875 | * the full range at once. This means SimpleLruTruncate() can't trivially be |
2876 | * used - instead the to-be-deleted range is computed using the offsets |
2877 | * SLRU. C.f. TruncateMultiXact(). |
2878 | */ |
2879 | static void |
2880 | PerformMembersTruncation(MultiXactOffset oldestOffset, MultiXactOffset newOldestOffset) |
2881 | { |
2882 | const int maxsegment = MXOffsetToMemberSegment(MaxMultiXactOffset); |
2883 | int startsegment = MXOffsetToMemberSegment(oldestOffset); |
2884 | int endsegment = MXOffsetToMemberSegment(newOldestOffset); |
2885 | int segment = startsegment; |
2886 | |
2887 | /* |
2888 | * Delete all the segments but the last one. The last segment can still |
2889 | * contain, possibly partially, valid data. |
2890 | */ |
2891 | while (segment != endsegment) |
2892 | { |
2893 | elog(DEBUG2, "truncating multixact members segment %x" , segment); |
2894 | SlruDeleteSegment(MultiXactMemberCtl, segment); |
2895 | |
2896 | /* move to next segment, handling wraparound correctly */ |
2897 | if (segment == maxsegment) |
2898 | segment = 0; |
2899 | else |
2900 | segment += 1; |
2901 | } |
2902 | } |
2903 | |
2904 | /* |
2905 | * Delete offsets segments [oldest, newOldest) |
2906 | */ |
2907 | static void |
2908 | PerformOffsetsTruncation(MultiXactId oldestMulti, MultiXactId newOldestMulti) |
2909 | { |
2910 | /* |
2911 | * We step back one multixact to avoid passing a cutoff page that hasn't |
2912 | * been created yet in the rare case that oldestMulti would be the first |
2913 | * item on a page and oldestMulti == nextMulti. In that case, if we |
2914 | * didn't subtract one, we'd trigger SimpleLruTruncate's wraparound |
2915 | * detection. |
2916 | */ |
2917 | SimpleLruTruncate(MultiXactOffsetCtl, |
2918 | MultiXactIdToOffsetPage(PreviousMultiXactId(newOldestMulti))); |
2919 | } |
2920 | |
/*
 * TruncateMultiXact
 *		Remove all MultiXactOffset and MultiXactMember segments before the
 *		oldest ones still of interest.
 *
 * This is only called on a primary as part of vacuum (via
 * vac_truncate_clog()).  During recovery truncation is done by replaying
 * truncation WAL records logged here.
 *
 * newOldestMulti is the oldest currently required multixact, newOldestMultiDB
 * is one of the databases preventing newOldestMulti from increasing.
 *
 * Outline:
 *	1. Take MultiXactTruncationLock exclusively and snapshot nextMXact,
 *	   nextOffset and oldestMultiXactId.
 *	2. Return without doing anything if newOldestMulti does not advance past
 *	   the stored oldest value, if the stored oldest value precedes the
 *	   earliest offsets page found on disk, or if either member offset lookup
 *	   fails (the last two lookup failures are logged).
 *	3. Otherwise, inside a critical section with delayChkpt set: write the
 *	   truncation WAL record, install the new oldest multixact and database
 *	   in shared memory, then truncate members followed by offsets.
 *
 * MultiXactTruncationLock is released on every exit path.
 */
void
TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB)
{
	MultiXactId oldestMulti;
	MultiXactId nextMulti;
	MultiXactOffset newOldestOffset;
	MultiXactOffset oldestOffset;
	MultiXactOffset nextOffset;
	mxtruncinfo trunc;
	MultiXactId earliest;

	Assert(!RecoveryInProgress());
	Assert(MultiXactState->finishedStartup);

	/*
	 * We can only allow one truncation to happen at once.  Otherwise parts of
	 * members might vanish while we're doing lookups or similar.  There's no
	 * need to have an interlock with creating new multis or such, since those
	 * are constrained by the limits (which only grow, never shrink).
	 */
	LWLockAcquire(MultiXactTruncationLock, LW_EXCLUSIVE);

	/* Snapshot the shared counters under MultiXactGenLock. */
	LWLockAcquire(MultiXactGenLock, LW_SHARED);
	nextMulti = MultiXactState->nextMXact;
	nextOffset = MultiXactState->nextOffset;
	oldestMulti = MultiXactState->oldestMultiXactId;
	LWLockRelease(MultiXactGenLock);
	Assert(MultiXactIdIsValid(oldestMulti));

	/*
	 * Make sure to only attempt truncation if there are values to truncate
	 * away.  In normal processing values shouldn't go backwards, but there
	 * are some corner cases (due to bugs) where that's possible.
	 */
	if (MultiXactIdPrecedesOrEquals(newOldestMulti, oldestMulti))
	{
		LWLockRelease(MultiXactTruncationLock);
		return;
	}

	/*
	 * Note we can't just plow ahead with the truncation; it's possible that
	 * there are no segments to truncate, which is a problem because we are
	 * going to attempt to read the offsets page to determine where to
	 * truncate the members SLRU.  So we first scan the directory to determine
	 * the earliest offsets page number that we can read without error.
	 *
	 * NB: It's also possible that the page that oldestMulti is on has already
	 * been truncated away, and we crashed before updating oldestMulti.
	 *
	 * NOTE(review): if the scan callback never fires, earliestExistingPage
	 * stays at -1 and the multiplication below is done on that sentinel;
	 * confirm the resulting value is handled as intended by the checks that
	 * follow.
	 */
	trunc.earliestExistingPage = -1;
	SlruScanDirectory(MultiXactOffsetCtl, SlruScanDirCbFindEarliest, &trunc);
	earliest = trunc.earliestExistingPage * MULTIXACT_OFFSETS_PER_PAGE;
	if (earliest < FirstMultiXactId)
		earliest = FirstMultiXactId;

	/* If there's nothing to remove, we can bail out early. */
	if (MultiXactIdPrecedes(oldestMulti, earliest))
	{
		LWLockRelease(MultiXactTruncationLock);
		return;
	}

	/*
	 * First, compute the safe truncation point for MultiXactMember.  This is
	 * the starting offset of the oldest multixact.
	 *
	 * Hopefully, find_multixact_start will always work here, because we've
	 * already checked that it doesn't precede the earliest MultiXact on disk.
	 * But if it fails, don't truncate anything, and log a message.
	 */
	if (oldestMulti == nextMulti)
	{
		/* there are NO MultiXacts */
		oldestOffset = nextOffset;
	}
	else if (!find_multixact_start(oldestMulti, &oldestOffset))
	{
		ereport(LOG,
				(errmsg("oldest MultiXact %u not found, earliest MultiXact %u, skipping truncation",
						oldestMulti, earliest)));
		LWLockRelease(MultiXactTruncationLock);
		return;
	}

	/*
	 * Secondly compute up to where to truncate.  Look up the corresponding
	 * member offset for newOldestMulti for that.
	 */
	if (newOldestMulti == nextMulti)
	{
		/* there are NO MultiXacts */
		newOldestOffset = nextOffset;
	}
	else if (!find_multixact_start(newOldestMulti, &newOldestOffset))
	{
		ereport(LOG,
				(errmsg("cannot truncate up to MultiXact %u because it does not exist on disk, skipping truncation",
						newOldestMulti)));
		LWLockRelease(MultiXactTruncationLock);
		return;
	}

	/* Report both half-open ranges, as IDs/offsets and as segment numbers. */
	elog(DEBUG1, "performing multixact truncation: "
		 "offsets [%u, %u), offsets segments [%x, %x), "
		 "members [%u, %u), members segments [%x, %x)",
		 oldestMulti, newOldestMulti,
		 MultiXactIdToOffsetSegment(oldestMulti),
		 MultiXactIdToOffsetSegment(newOldestMulti),
		 oldestOffset, newOldestOffset,
		 MXOffsetToMemberSegment(oldestOffset),
		 MXOffsetToMemberSegment(newOldestOffset));

	/*
	 * Do truncation, and the WAL logging of the truncation, in a critical
	 * section.  That way offsets/members cannot get out of sync anymore, i.e.
	 * once consistent the newOldestMulti will always exist in members, even
	 * if we crashed in the wrong moment.
	 */
	START_CRIT_SECTION();

	/*
	 * Prevent checkpoints from being scheduled concurrently.  This is
	 * critical because otherwise a truncation record might not be replayed
	 * after a crash/basebackup, even though the state of the data directory
	 * would require it.
	 */
	Assert(!MyPgXact->delayChkpt);
	MyPgXact->delayChkpt = true;

	/* WAL log truncation */
	WriteMTruncateXlogRec(newOldestMultiDB,
						  oldestMulti, newOldestMulti,
						  oldestOffset, newOldestOffset);

	/*
	 * Update in-memory limits before performing the truncation, while inside
	 * the critical section: Have to do it before truncation, to prevent
	 * concurrent lookups of those values.  Has to be inside the critical
	 * section as otherwise a future call to this function would error out,
	 * while looking up the oldest member in offsets, if our caller crashes
	 * before updating the limits.
	 */
	LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
	MultiXactState->oldestMultiXactId = newOldestMulti;
	MultiXactState->oldestMultiXactDB = newOldestMultiDB;
	LWLockRelease(MultiXactGenLock);

	/* First truncate members */
	PerformMembersTruncation(oldestOffset, newOldestOffset);

	/* Then offsets */
	PerformOffsetsTruncation(oldestMulti, newOldestMulti);

	/* Truncation is complete; checkpoints need not be held off any longer. */
	MyPgXact->delayChkpt = false;

	END_CRIT_SECTION();
	LWLockRelease(MultiXactTruncationLock);
}
3091 | |
3092 | /* |
3093 | * Decide which of two MultiXactOffset page numbers is "older" for truncation |
3094 | * purposes. |
3095 | * |
3096 | * We need to use comparison of MultiXactId here in order to do the right |
3097 | * thing with wraparound. However, if we are asked about page number zero, we |
3098 | * don't want to hand InvalidMultiXactId to MultiXactIdPrecedes: it'll get |
3099 | * weird. So, offset both multis by FirstMultiXactId to avoid that. |
3100 | * (Actually, the current implementation doesn't do anything weird with |
3101 | * InvalidMultiXactId, but there's no harm in leaving this code like this.) |
3102 | */ |
3103 | static bool |
3104 | MultiXactOffsetPagePrecedes(int page1, int page2) |
3105 | { |
3106 | MultiXactId multi1; |
3107 | MultiXactId multi2; |
3108 | |
3109 | multi1 = ((MultiXactId) page1) * MULTIXACT_OFFSETS_PER_PAGE; |
3110 | multi1 += FirstMultiXactId; |
3111 | multi2 = ((MultiXactId) page2) * MULTIXACT_OFFSETS_PER_PAGE; |
3112 | multi2 += FirstMultiXactId; |
3113 | |
3114 | return MultiXactIdPrecedes(multi1, multi2); |
3115 | } |
3116 | |
3117 | /* |
3118 | * Decide which of two MultiXactMember page numbers is "older" for truncation |
3119 | * purposes. There is no "invalid offset number" so use the numbers verbatim. |
3120 | */ |
3121 | static bool |
3122 | MultiXactMemberPagePrecedes(int page1, int page2) |
3123 | { |
3124 | MultiXactOffset offset1; |
3125 | MultiXactOffset offset2; |
3126 | |
3127 | offset1 = ((MultiXactOffset) page1) * MULTIXACT_MEMBERS_PER_PAGE; |
3128 | offset2 = ((MultiXactOffset) page2) * MULTIXACT_MEMBERS_PER_PAGE; |
3129 | |
3130 | return MultiXactOffsetPrecedes(offset1, offset2); |
3131 | } |
3132 | |
3133 | /* |
3134 | * Decide which of two MultiXactIds is earlier. |
3135 | * |
3136 | * XXX do we need to do something special for InvalidMultiXactId? |
3137 | * (Doesn't look like it.) |
3138 | */ |
3139 | bool |
3140 | MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2) |
3141 | { |
3142 | int32 diff = (int32) (multi1 - multi2); |
3143 | |
3144 | return (diff < 0); |
3145 | } |
3146 | |
3147 | /* |
3148 | * MultiXactIdPrecedesOrEquals -- is multi1 logically <= multi2? |
3149 | * |
3150 | * XXX do we need to do something special for InvalidMultiXactId? |
3151 | * (Doesn't look like it.) |
3152 | */ |
3153 | bool |
3154 | MultiXactIdPrecedesOrEquals(MultiXactId multi1, MultiXactId multi2) |
3155 | { |
3156 | int32 diff = (int32) (multi1 - multi2); |
3157 | |
3158 | return (diff <= 0); |
3159 | } |
3160 | |
3161 | |
3162 | /* |
3163 | * Decide which of two offsets is earlier. |
3164 | */ |
3165 | static bool |
3166 | MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2) |
3167 | { |
3168 | int32 diff = (int32) (offset1 - offset2); |
3169 | |
3170 | return (diff < 0); |
3171 | } |
3172 | |
3173 | /* |
3174 | * Write an xlog record reflecting the zeroing of either a MEMBERs or |
3175 | * OFFSETs page (info shows which) |
3176 | */ |
3177 | static void |
3178 | WriteMZeroPageXlogRec(int pageno, uint8 info) |
3179 | { |
3180 | XLogBeginInsert(); |
3181 | XLogRegisterData((char *) (&pageno), sizeof(int)); |
3182 | (void) XLogInsert(RM_MULTIXACT_ID, info); |
3183 | } |
3184 | |
3185 | /* |
3186 | * Write a TRUNCATE xlog record |
3187 | * |
3188 | * We must flush the xlog record to disk before returning --- see notes in |
3189 | * TruncateCLOG(). |
3190 | */ |
3191 | static void |
3192 | WriteMTruncateXlogRec(Oid oldestMultiDB, |
3193 | MultiXactId startTruncOff, MultiXactId endTruncOff, |
3194 | MultiXactOffset startTruncMemb, MultiXactOffset endTruncMemb) |
3195 | { |
3196 | XLogRecPtr recptr; |
3197 | xl_multixact_truncate xlrec; |
3198 | |
3199 | xlrec.oldestMultiDB = oldestMultiDB; |
3200 | |
3201 | xlrec.startTruncOff = startTruncOff; |
3202 | xlrec.endTruncOff = endTruncOff; |
3203 | |
3204 | xlrec.startTruncMemb = startTruncMemb; |
3205 | xlrec.endTruncMemb = endTruncMemb; |
3206 | |
3207 | XLogBeginInsert(); |
3208 | XLogRegisterData((char *) (&xlrec), SizeOfMultiXactTruncate); |
3209 | recptr = XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_TRUNCATE_ID); |
3210 | XLogFlush(recptr); |
3211 | } |
3212 | |
3213 | /* |
3214 | * MULTIXACT resource manager's routines |
3215 | */ |
3216 | void |
3217 | multixact_redo(XLogReaderState *record) |
3218 | { |
3219 | uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; |
3220 | |
3221 | /* Backup blocks are not used in multixact records */ |
3222 | Assert(!XLogRecHasAnyBlockRefs(record)); |
3223 | |
3224 | if (info == XLOG_MULTIXACT_ZERO_OFF_PAGE) |
3225 | { |
3226 | int pageno; |
3227 | int slotno; |
3228 | |
3229 | memcpy(&pageno, XLogRecGetData(record), sizeof(int)); |
3230 | |
3231 | LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE); |
3232 | |
3233 | slotno = ZeroMultiXactOffsetPage(pageno, false); |
3234 | SimpleLruWritePage(MultiXactOffsetCtl, slotno); |
3235 | Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]); |
3236 | |
3237 | LWLockRelease(MultiXactOffsetControlLock); |
3238 | } |
3239 | else if (info == XLOG_MULTIXACT_ZERO_MEM_PAGE) |
3240 | { |
3241 | int pageno; |
3242 | int slotno; |
3243 | |
3244 | memcpy(&pageno, XLogRecGetData(record), sizeof(int)); |
3245 | |
3246 | LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE); |
3247 | |
3248 | slotno = ZeroMultiXactMemberPage(pageno, false); |
3249 | SimpleLruWritePage(MultiXactMemberCtl, slotno); |
3250 | Assert(!MultiXactMemberCtl->shared->page_dirty[slotno]); |
3251 | |
3252 | LWLockRelease(MultiXactMemberControlLock); |
3253 | } |
3254 | else if (info == XLOG_MULTIXACT_CREATE_ID) |
3255 | { |
3256 | xl_multixact_create *xlrec = |
3257 | (xl_multixact_create *) XLogRecGetData(record); |
3258 | TransactionId max_xid; |
3259 | int i; |
3260 | |
3261 | /* Store the data back into the SLRU files */ |
3262 | RecordNewMultiXact(xlrec->mid, xlrec->moff, xlrec->nmembers, |
3263 | xlrec->members); |
3264 | |
3265 | /* Make sure nextMXact/nextOffset are beyond what this record has */ |
3266 | MultiXactAdvanceNextMXact(xlrec->mid + 1, |
3267 | xlrec->moff + xlrec->nmembers); |
3268 | |
3269 | /* |
3270 | * Make sure nextFullXid is beyond any XID mentioned in the record. |
3271 | * This should be unnecessary, since any XID found here ought to have |
3272 | * other evidence in the XLOG, but let's be safe. |
3273 | */ |
3274 | max_xid = XLogRecGetXid(record); |
3275 | for (i = 0; i < xlrec->nmembers; i++) |
3276 | { |
3277 | if (TransactionIdPrecedes(max_xid, xlrec->members[i].xid)) |
3278 | max_xid = xlrec->members[i].xid; |
3279 | } |
3280 | |
3281 | AdvanceNextFullTransactionIdPastXid(max_xid); |
3282 | } |
3283 | else if (info == XLOG_MULTIXACT_TRUNCATE_ID) |
3284 | { |
3285 | xl_multixact_truncate xlrec; |
3286 | int pageno; |
3287 | |
3288 | memcpy(&xlrec, XLogRecGetData(record), |
3289 | SizeOfMultiXactTruncate); |
3290 | |
3291 | elog(DEBUG1, "replaying multixact truncation: " |
3292 | "offsets [%u, %u), offsets segments [%x, %x), " |
3293 | "members [%u, %u), members segments [%x, %x)" , |
3294 | xlrec.startTruncOff, xlrec.endTruncOff, |
3295 | MultiXactIdToOffsetSegment(xlrec.startTruncOff), |
3296 | MultiXactIdToOffsetSegment(xlrec.endTruncOff), |
3297 | xlrec.startTruncMemb, xlrec.endTruncMemb, |
3298 | MXOffsetToMemberSegment(xlrec.startTruncMemb), |
3299 | MXOffsetToMemberSegment(xlrec.endTruncMemb)); |
3300 | |
3301 | /* should not be required, but more than cheap enough */ |
3302 | LWLockAcquire(MultiXactTruncationLock, LW_EXCLUSIVE); |
3303 | |
3304 | /* |
3305 | * Advance the horizon values, so they're current at the end of |
3306 | * recovery. |
3307 | */ |
3308 | SetMultiXactIdLimit(xlrec.endTruncOff, xlrec.oldestMultiDB, false); |
3309 | |
3310 | PerformMembersTruncation(xlrec.startTruncMemb, xlrec.endTruncMemb); |
3311 | |
3312 | /* |
3313 | * During XLOG replay, latest_page_number isn't necessarily set up |
3314 | * yet; insert a suitable value to bypass the sanity test in |
3315 | * SimpleLruTruncate. |
3316 | */ |
3317 | pageno = MultiXactIdToOffsetPage(xlrec.endTruncOff); |
3318 | MultiXactOffsetCtl->shared->latest_page_number = pageno; |
3319 | PerformOffsetsTruncation(xlrec.startTruncOff, xlrec.endTruncOff); |
3320 | |
3321 | LWLockRelease(MultiXactTruncationLock); |
3322 | } |
3323 | else |
3324 | elog(PANIC, "multixact_redo: unknown op code %u" , info); |
3325 | } |
3326 | |
3327 | Datum |
3328 | pg_get_multixact_members(PG_FUNCTION_ARGS) |
3329 | { |
3330 | typedef struct |
3331 | { |
3332 | MultiXactMember *members; |
3333 | int nmembers; |
3334 | int iter; |
3335 | } mxact; |
3336 | MultiXactId mxid = PG_GETARG_UINT32(0); |
3337 | mxact *multi; |
3338 | FuncCallContext *funccxt; |
3339 | |
3340 | if (mxid < FirstMultiXactId) |
3341 | ereport(ERROR, |
3342 | (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
3343 | errmsg("invalid MultiXactId: %u" , mxid))); |
3344 | |
3345 | if (SRF_IS_FIRSTCALL()) |
3346 | { |
3347 | MemoryContext oldcxt; |
3348 | TupleDesc tupdesc; |
3349 | |
3350 | funccxt = SRF_FIRSTCALL_INIT(); |
3351 | oldcxt = MemoryContextSwitchTo(funccxt->multi_call_memory_ctx); |
3352 | |
3353 | multi = palloc(sizeof(mxact)); |
3354 | /* no need to allow for old values here */ |
3355 | multi->nmembers = GetMultiXactIdMembers(mxid, &multi->members, false, |
3356 | false); |
3357 | multi->iter = 0; |
3358 | |
3359 | tupdesc = CreateTemplateTupleDesc(2); |
3360 | TupleDescInitEntry(tupdesc, (AttrNumber) 1, "xid" , |
3361 | XIDOID, -1, 0); |
3362 | TupleDescInitEntry(tupdesc, (AttrNumber) 2, "mode" , |
3363 | TEXTOID, -1, 0); |
3364 | |
3365 | funccxt->attinmeta = TupleDescGetAttInMetadata(tupdesc); |
3366 | funccxt->user_fctx = multi; |
3367 | |
3368 | MemoryContextSwitchTo(oldcxt); |
3369 | } |
3370 | |
3371 | funccxt = SRF_PERCALL_SETUP(); |
3372 | multi = (mxact *) funccxt->user_fctx; |
3373 | |
3374 | while (multi->iter < multi->nmembers) |
3375 | { |
3376 | HeapTuple tuple; |
3377 | char *values[2]; |
3378 | |
3379 | values[0] = psprintf("%u" , multi->members[multi->iter].xid); |
3380 | values[1] = mxstatus_to_string(multi->members[multi->iter].status); |
3381 | |
3382 | tuple = BuildTupleFromCStrings(funccxt->attinmeta, values); |
3383 | |
3384 | multi->iter++; |
3385 | pfree(values[0]); |
3386 | SRF_RETURN_NEXT(funccxt, HeapTupleGetDatum(tuple)); |
3387 | } |
3388 | |
3389 | if (multi->nmembers > 0) |
3390 | pfree(multi->members); |
3391 | pfree(multi); |
3392 | |
3393 | SRF_RETURN_DONE(funccxt); |
3394 | } |
3395 | |