| 1 | /*------------------------------------------------------------------------- |
| 2 | * |
| 3 | * multixact.c |
| 4 | * PostgreSQL multi-transaction-log manager |
| 5 | * |
| 6 | * The pg_multixact manager is a pg_xact-like manager that stores an array of |
| 7 | * MultiXactMember for each MultiXactId. It is a fundamental part of the |
| 8 | * shared-row-lock implementation. Each MultiXactMember is comprised of a |
| 9 | * TransactionId and a set of flag bits. The name is a bit historical: |
| 10 | * originally, a MultiXactId consisted of more than one TransactionId (except |
| 11 | * in rare corner cases), hence "multi". Nowadays, however, it's perfectly |
| 12 | * legitimate to have MultiXactIds that only include a single Xid. |
| 13 | * |
| 14 | * The meaning of the flag bits is opaque to this module, but they are mostly |
| 15 | * used in heapam.c to identify lock modes that each of the member transactions |
| 16 | * is holding on any given tuple. This module just contains support to store |
| 17 | * and retrieve the arrays. |
| 18 | * |
| 19 | * We use two SLRU areas, one for storing the offsets at which the data |
| 20 | * starts for each MultiXactId in the other one. This trick allows us to |
| 21 | * store variable length arrays of TransactionIds. (We could alternatively |
| 22 | * use one area containing counts and TransactionIds, with valid MultiXactId |
| 23 | * values pointing at slots containing counts; but that way seems less robust |
| 24 | * since it would get completely confused if someone inquired about a bogus |
| 25 | * MultiXactId that pointed to an intermediate slot containing an XID.) |
| 26 | * |
| 27 | * XLOG interactions: this module generates a record whenever a new OFFSETs or |
| 28 | * MEMBERs page is initialized to zeroes, as well as an |
| 29 | * XLOG_MULTIXACT_CREATE_ID record whenever a new MultiXactId is defined. |
| 30 | * This module ignores the WAL rule "write xlog before data," because it |
| 31 | * suffices that actions recording a MultiXactId in a heap xmax do follow that |
| 32 | * rule. The only way for the MXID to be referenced from any data page is for |
| 33 | * heap_lock_tuple() or heap_update() to have put it there, and each generates |
| 34 | * an XLOG record that must follow ours. The normal LSN interlock between the |
| 35 | * data page and that XLOG record will ensure that our XLOG record reaches |
| 36 | * disk first. If the SLRU members/offsets data reaches disk sooner than the |
| 37 | * XLOG records, we do not care; after recovery, no xmax will refer to it. On |
| 38 | * the flip side, to ensure that all referenced entries _do_ reach disk, this |
| 39 | * module's XLOG records completely rebuild the data entered since the last |
| 40 | * checkpoint. We flush and sync all dirty OFFSETs and MEMBERs pages to disk |
| 41 | * before each checkpoint is considered complete. |
| 42 | * |
| 43 | * Like clog.c, and unlike subtrans.c, we have to preserve state across |
| 44 | * crashes and ensure that MXID and offset numbering increases monotonically |
| 45 | * across a crash. We do this in the same way as it's done for transaction |
| 46 | * IDs: the WAL record is guaranteed to contain evidence of every MXID we |
| 47 | * could need to worry about, and we just make sure that at the end of |
| 48 | * replay, the next-MXID and next-offset counters are at least as large as |
| 49 | * anything we saw during replay. |
| 50 | * |
| 51 | * We are able to remove segments no longer necessary by carefully tracking |
| 52 | * each table's used values: during vacuum, any multixact older than a certain |
| 53 | * value is removed; the cutoff value is stored in pg_class. The minimum value |
| 54 | * across all tables in each database is stored in pg_database, and the global |
| 55 | * minimum across all databases is part of pg_control and is kept in shared |
| 56 | * memory. Whenever that minimum is advanced, the SLRUs are truncated. |
| 57 | * |
| 58 | * When new multixactid values are to be created, care is taken that the |
| 59 | * counter does not fall within the wraparound horizon considering the global |
| 60 | * minimum value. |
| 61 | * |
| 62 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
| 63 | * Portions Copyright (c) 1994, Regents of the University of California |
| 64 | * |
| 65 | * src/backend/access/transam/multixact.c |
| 66 | * |
| 67 | *------------------------------------------------------------------------- |
| 68 | */ |
| 69 | #include "postgres.h" |
| 70 | |
| 71 | #include "access/multixact.h" |
| 72 | #include "access/slru.h" |
| 73 | #include "access/transam.h" |
| 74 | #include "access/twophase.h" |
| 75 | #include "access/twophase_rmgr.h" |
| 76 | #include "access/xact.h" |
| 77 | #include "access/xlog.h" |
| 78 | #include "access/xloginsert.h" |
| 79 | #include "catalog/pg_type.h" |
| 80 | #include "commands/dbcommands.h" |
| 81 | #include "funcapi.h" |
| 82 | #include "lib/ilist.h" |
| 83 | #include "miscadmin.h" |
| 84 | #include "pg_trace.h" |
| 85 | #include "postmaster/autovacuum.h" |
| 86 | #include "storage/lmgr.h" |
| 87 | #include "storage/pmsignal.h" |
| 88 | #include "storage/proc.h" |
| 89 | #include "storage/procarray.h" |
| 90 | #include "utils/builtins.h" |
| 91 | #include "utils/memutils.h" |
| 92 | #include "utils/snapmgr.h" |
| 93 | |
| 94 | |
| 95 | /* |
| 96 | * Defines for MultiXactOffset page sizes. A page is the same BLCKSZ as is |
| 97 | * used everywhere else in Postgres. |
| 98 | * |
| 99 | * Note: because MultiXactOffsets are 32 bits and wrap around at 0xFFFFFFFF, |
| 100 | * MultiXact page numbering also wraps around at |
| 101 | * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE, and segment numbering at |
| 102 | * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. We need |
| 103 | * take no explicit notice of that fact in this module, except when comparing |
| 104 | * segment and page numbers in TruncateMultiXact (see |
| 105 | * MultiXactOffsetPagePrecedes). |
| 106 | */ |
| 107 | |
| 108 | /* We need four bytes per offset */ |
#define MULTIXACT_OFFSETS_PER_PAGE	(BLCKSZ / sizeof(MultiXactOffset))

/* Number of the offsets page that holds the entry for MultiXactId "multi" */
#define MultiXactIdToOffsetPage(multi) \
	((multi) / (MultiXactOffset) MULTIXACT_OFFSETS_PER_PAGE)

/* Index of the entry for "multi" within its offsets page */
#define MultiXactIdToOffsetEntry(multi) \
	((multi) % (MultiXactOffset) MULTIXACT_OFFSETS_PER_PAGE)

/* Number of the segment containing the offsets page for "multi" */
#define MultiXactIdToOffsetSegment(multi) \
	(MultiXactIdToOffsetPage(multi) / SLRU_PAGES_PER_SEGMENT)
| 116 | |
/*
 * The situation for members is a bit more complex: we store one byte of
 * additional flag bits for each TransactionId.  To do this without getting
 * into alignment issues, we store four bytes of flags, and then the
 * corresponding 4 Xids.  We call each such 5-word (20-byte) set a "group";
 * groups are stored as a whole in pages.  Thus, with 8kB BLCKSZ, we keep 409
 * groups per page.  This wastes 12 bytes per page, but that's OK --
 * simplicity (and performance) trumps space efficiency here.
 *
 * Note that the "offset" macros work with byte offsets, not array indexes,
 * so arithmetic must be done using "char *" pointers.
 */
/* Each member gets eight flag bits, i.e. exactly one flag byte */
#define MXACT_MEMBER_BITS_PER_XACT		8
#define MXACT_MEMBER_FLAGS_PER_BYTE		1
/* Mask selecting one member's flag bits once shifted down to bit 0 */
#define MXACT_MEMBER_XACT_BITMASK \
	((1 << MXACT_MEMBER_BITS_PER_XACT) - 1)

/* Number of whole flag bytes at the start of every group */
#define MULTIXACT_FLAGBYTES_PER_GROUP	4
/* Number of members (XIDs) whose flags fit in those flag bytes */
#define MULTIXACT_MEMBERS_PER_MEMBERGROUP \
	(MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE)

/* Bytes occupied by one complete group: the XIDs plus the flag bytes */
#define MULTIXACT_MEMBERGROUP_SIZE \
	(sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP)

/* Whole groups, and hence members, that fit in one page */
#define MULTIXACT_MEMBERGROUPS_PER_PAGE \
	(BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE)
#define MULTIXACT_MEMBERS_PER_PAGE \
	(MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP)
| 144 | |
| 145 | /* |
| 146 | * Because the number of items per page is not a divisor of the last item |
| 147 | * number (member 0xFFFFFFFF), the last segment does not use the maximum number |
| 148 | * of pages, and moreover the last used page therein does not use the same |
| 149 | * number of items as previous pages. (Another way to say it is that the |
| 150 | * 0xFFFFFFFF member is somewhere in the middle of the last page, so the page |
| 151 | * has some empty space after that item.) |
| 152 | * |
| 153 | * This constant is the number of members in the last page of the last segment. |
| 154 | */ |
#define MAX_MEMBERS_IN_LAST_MEMBERS_PAGE \
	((uint32) ((0xFFFFFFFF % MULTIXACT_MEMBERS_PER_PAGE) + 1))

/* Members page on which the member at "offset" lives */
#define MXOffsetToMemberPage(offset) \
	((offset) / (TransactionId) MULTIXACT_MEMBERS_PER_PAGE)

/* Segment containing that members page */
#define MXOffsetToMemberSegment(offset) \
	(MXOffsetToMemberPage(offset) / SLRU_PAGES_PER_SEGMENT)

/*
 * Byte position, within its page, of the flag word of the group that holds
 * the member at "offset": group index within the page times the group size.
 */
#define MXOffsetToFlagsOffset(offset) \
	((((offset) / (TransactionId) MULTIXACT_MEMBERS_PER_MEMBERGROUP) % \
	  (TransactionId) MULTIXACT_MEMBERGROUPS_PER_PAGE) * \
	 (TransactionId) MULTIXACT_MEMBERGROUP_SIZE)

/* Bit shift, within that flag word, of the flag bits for this member */
#define MXOffsetToFlagsBitShift(offset) \
	(((offset) % (TransactionId) MULTIXACT_MEMBERS_PER_MEMBERGROUP) * \
	 MXACT_MEMBER_BITS_PER_XACT)

/*
 * Byte position, within its page, of the member's TransactionId: skip the
 * group's flag bytes, then index into the group's XID array.
 */
#define MXOffsetToMemberOffset(offset) \
	(MXOffsetToFlagsOffset(offset) + MULTIXACT_FLAGBYTES_PER_GROUP + \
	 ((offset) % MULTIXACT_MEMBERS_PER_MEMBERGROUP) * sizeof(TransactionId))
| 175 | |
/*
 * Wraparound thresholds for the members space: half of the offset range
 * counts as "safe", and three quarters of it as "danger".
 */
#define MULTIXACT_MEMBER_SAFE_THRESHOLD \
	(MaxMultiXactOffset / 2)
#define MULTIXACT_MEMBER_DANGER_THRESHOLD \
	(MaxMultiXactOffset - MaxMultiXactOffset / 4)

/*
 * MultiXactId immediately before "multi".  Stepping back from
 * FirstMultiXactId wraps around to MaxMultiXactId.
 */
#define PreviousMultiXactId(multi) \
	((multi) == FirstMultiXactId ? MaxMultiXactId : (multi) - 1)
| 183 | |
| 184 | /* |
| 185 | * Links to shared-memory data structures for MultiXact control |
| 186 | */ |
| 187 | static SlruCtlData MultiXactOffsetCtlData; |
| 188 | static SlruCtlData MultiXactMemberCtlData; |
| 189 | |
| 190 | #define MultiXactOffsetCtl (&MultiXactOffsetCtlData) |
| 191 | #define MultiXactMemberCtl (&MultiXactMemberCtlData) |
| 192 | |
| 193 | /* |
| 194 | * MultiXact state shared across all backends. All this state is protected |
| 195 | * by MultiXactGenLock. (We also use MultiXactOffsetControlLock and |
| 196 | * MultiXactMemberControlLock to guard accesses to the two sets of SLRU |
| 197 | * buffers. For concurrency's sake, we avoid holding more than one of these |
| 198 | * locks at a time.) |
| 199 | */ |
| 200 | typedef struct MultiXactStateData |
| 201 | { |
| 202 | /* next-to-be-assigned MultiXactId */ |
| 203 | MultiXactId nextMXact; |
| 204 | |
| 205 | /* next-to-be-assigned offset */ |
| 206 | MultiXactOffset nextOffset; |
| 207 | |
| 208 | /* Have we completed multixact startup? */ |
| 209 | bool finishedStartup; |
| 210 | |
| 211 | /* |
| 212 | * Oldest multixact that is still potentially referenced by a relation. |
| 213 | * Anything older than this should not be consulted. These values are |
| 214 | * updated by vacuum. |
| 215 | */ |
| 216 | MultiXactId oldestMultiXactId; |
| 217 | Oid oldestMultiXactDB; |
| 218 | |
| 219 | /* |
| 220 | * Oldest multixact offset that is potentially referenced by a multixact |
| 221 | * referenced by a relation. We don't always know this value, so there's |
| 222 | * a flag here to indicate whether or not we currently do. |
| 223 | */ |
| 224 | MultiXactOffset oldestOffset; |
| 225 | bool oldestOffsetKnown; |
| 226 | |
| 227 | /* support for anti-wraparound measures */ |
| 228 | MultiXactId multiVacLimit; |
| 229 | MultiXactId multiWarnLimit; |
| 230 | MultiXactId multiStopLimit; |
| 231 | MultiXactId multiWrapLimit; |
| 232 | |
| 233 | /* support for members anti-wraparound measures */ |
| 234 | MultiXactOffset offsetStopLimit; /* known if oldestOffsetKnown */ |
| 235 | |
| 236 | /* |
| 237 | * Per-backend data starts here. We have two arrays stored in the area |
| 238 | * immediately following the MultiXactStateData struct. Each is indexed by |
| 239 | * BackendId. |
| 240 | * |
| 241 | * In both arrays, there's a slot for all normal backends (1..MaxBackends) |
| 242 | * followed by a slot for max_prepared_xacts prepared transactions. Valid |
| 243 | * BackendIds start from 1; element zero of each array is never used. |
| 244 | * |
| 245 | * OldestMemberMXactId[k] is the oldest MultiXactId each backend's current |
| 246 | * transaction(s) could possibly be a member of, or InvalidMultiXactId |
| 247 | * when the backend has no live transaction that could possibly be a |
| 248 | * member of a MultiXact. Each backend sets its entry to the current |
| 249 | * nextMXact counter just before first acquiring a shared lock in a given |
| 250 | * transaction, and clears it at transaction end. (This works because only |
| 251 | * during or after acquiring a shared lock could an XID possibly become a |
| 252 | * member of a MultiXact, and that MultiXact would have to be created |
| 253 | * during or after the lock acquisition.) |
| 254 | * |
| 255 | * OldestVisibleMXactId[k] is the oldest MultiXactId each backend's |
| 256 | * current transaction(s) think is potentially live, or InvalidMultiXactId |
| 257 | * when not in a transaction or not in a transaction that's paid any |
| 258 | * attention to MultiXacts yet. This is computed when first needed in a |
| 259 | * given transaction, and cleared at transaction end. We can compute it |
| 260 | * as the minimum of the valid OldestMemberMXactId[] entries at the time |
| 261 | * we compute it (using nextMXact if none are valid). Each backend is |
| 262 | * required not to attempt to access any SLRU data for MultiXactIds older |
| 263 | * than its own OldestVisibleMXactId[] setting; this is necessary because |
| 264 | * the checkpointer could truncate away such data at any instant. |
| 265 | * |
| 266 | * The oldest valid value among all of the OldestMemberMXactId[] and |
| 267 | * OldestVisibleMXactId[] entries is considered by vacuum as the earliest |
| 268 | * possible value still having any live member transaction. Subtracting |
| 269 | * vacuum_multixact_freeze_min_age from that value we obtain the freezing |
| 270 | * point for multixacts for that table. Any value older than that is |
| 271 | * removed from tuple headers (or "frozen"; see FreezeMultiXactId. Note |
| 272 | * that multis that have member xids that are older than the cutoff point |
| 273 | * for xids must also be frozen, even if the multis themselves are newer |
| 274 | * than the multixid cutoff point). Whenever a full table vacuum happens, |
| 275 | * the freezing point so computed is used as the new pg_class.relminmxid |
| 276 | * value. The minimum of all those values in a database is stored as |
| 277 | * pg_database.datminmxid. In turn, the minimum of all of those values is |
| 278 | * stored in pg_control and used as truncation point for pg_multixact. At |
| 279 | * checkpoint or restartpoint, unneeded segments are removed. |
| 280 | */ |
| 281 | MultiXactId perBackendXactIds[FLEXIBLE_ARRAY_MEMBER]; |
| 282 | } MultiXactStateData; |
| 283 | |
/*
 * Last element of OldestMemberMXactId and OldestVisibleMXactId arrays.
 * Valid elements are (1..MaxOldestSlot); element 0 is never used.
 */
| 288 | #define MaxOldestSlot (MaxBackends + max_prepared_xacts) |
| 289 | |
| 290 | /* Pointers to the state data in shared memory */ |
| 291 | static MultiXactStateData *MultiXactState; |
| 292 | static MultiXactId *OldestMemberMXactId; |
| 293 | static MultiXactId *OldestVisibleMXactId; |
| 294 | |
| 295 | |
/*
 * Definitions for the backend-local MultiXactId cache.
 *
 * We use this cache to store known MultiXacts, so we don't need to go to
 * SLRU areas every time.
 *
 * The cache lasts for the duration of a single transaction, the rationale
 * for this being that most entries will contain our own TransactionId and
 * so they will be uninteresting by the time our next transaction starts.
 * (XXX not clear that this is correct --- other members of the MultiXact
 * could hang around longer than we did.  However, it's not clear what a
 * better policy for flushing old cache entries would be.)  FIXME: actually,
 * this is plain wrong now that multixacts may contain update Xids.
 *
 * We allocate the cache entries in a memory context that is deleted at
 * transaction end, so we don't need to do retail freeing of entries.
 */
| 313 | typedef struct mXactCacheEnt |
| 314 | { |
| 315 | MultiXactId multi; |
| 316 | int nmembers; |
| 317 | dlist_node node; |
| 318 | MultiXactMember members[FLEXIBLE_ARRAY_MEMBER]; |
| 319 | } mXactCacheEnt; |
| 320 | |
| 321 | #define MAX_CACHE_ENTRIES 256 |
| 322 | static dlist_head MXactCache = DLIST_STATIC_INIT(MXactCache); |
| 323 | static int MXactCacheMembers = 0; |
| 324 | static MemoryContext MXactContext = NULL; |
| 325 | |
/*
 * Debug logging helpers.  The digit in each name is the total number of
 * arguments.  Without MULTIXACT_DEBUG they expand to nothing, so their
 * arguments are never evaluated; with it they forward verbatim to elog().
 */
#ifndef MULTIXACT_DEBUG
#define debug_elog2(level, fmt)
#define debug_elog3(level, fmt, p1)
#define debug_elog4(level, fmt, p1, p2)
#define debug_elog5(level, fmt, p1, p2, p3)
#define debug_elog6(level, fmt, p1, p2, p3, p4)
#else
#define debug_elog2(level, fmt) elog(level, fmt)
#define debug_elog3(level, fmt, p1) elog(level, fmt, p1)
#define debug_elog4(level, fmt, p1, p2) elog(level, fmt, p1, p2)
#define debug_elog5(level, fmt, p1, p2, p3) elog(level, fmt, p1, p2, p3)
#define debug_elog6(level, fmt, p1, p2, p3, p4) elog(level, fmt, p1, p2, p3, p4)
#endif
| 339 | |
| 340 | /* internal MultiXactId management */ |
| 341 | static void MultiXactIdSetOldestVisible(void); |
| 342 | static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, |
| 343 | int nmembers, MultiXactMember *members); |
| 344 | static MultiXactId GetNewMultiXactId(int nmembers, MultiXactOffset *offset); |
| 345 | |
| 346 | /* MultiXact cache management */ |
| 347 | static int mxactMemberComparator(const void *arg1, const void *arg2); |
| 348 | static MultiXactId mXactCacheGetBySet(int nmembers, MultiXactMember *members); |
| 349 | static int mXactCacheGetById(MultiXactId multi, MultiXactMember **members); |
| 350 | static void mXactCachePut(MultiXactId multi, int nmembers, |
| 351 | MultiXactMember *members); |
| 352 | |
| 353 | static char *mxstatus_to_string(MultiXactStatus status); |
| 354 | |
| 355 | /* management of SLRU infrastructure */ |
| 356 | static int ZeroMultiXactOffsetPage(int pageno, bool writeXlog); |
| 357 | static int ZeroMultiXactMemberPage(int pageno, bool writeXlog); |
| 358 | static bool MultiXactOffsetPagePrecedes(int page1, int page2); |
| 359 | static bool MultiXactMemberPagePrecedes(int page1, int page2); |
| 360 | static bool MultiXactOffsetPrecedes(MultiXactOffset offset1, |
| 361 | MultiXactOffset offset2); |
| 362 | static void ExtendMultiXactOffset(MultiXactId multi); |
| 363 | static void ExtendMultiXactMember(MultiXactOffset offset, int nmembers); |
| 364 | static bool MultiXactOffsetWouldWrap(MultiXactOffset boundary, |
| 365 | MultiXactOffset start, uint32 distance); |
| 366 | static bool SetOffsetVacuumLimit(bool is_startup); |
| 367 | static bool find_multixact_start(MultiXactId multi, MultiXactOffset *result); |
| 368 | static void WriteMZeroPageXlogRec(int pageno, uint8 info); |
| 369 | static void WriteMTruncateXlogRec(Oid oldestMultiDB, |
| 370 | MultiXactId startOff, MultiXactId endOff, |
| 371 | MultiXactOffset startMemb, MultiXactOffset endMemb); |
| 372 | |
| 373 | |
| 374 | /* |
| 375 | * MultiXactIdCreate |
| 376 | * Construct a MultiXactId representing two TransactionIds. |
| 377 | * |
| 378 | * The two XIDs must be different, or be requesting different statuses. |
| 379 | * |
| 380 | * NB - we don't worry about our local MultiXactId cache here, because that |
| 381 | * is handled by the lower-level routines. |
| 382 | */ |
| 383 | MultiXactId |
| 384 | MultiXactIdCreate(TransactionId xid1, MultiXactStatus status1, |
| 385 | TransactionId xid2, MultiXactStatus status2) |
| 386 | { |
| 387 | MultiXactId newMulti; |
| 388 | MultiXactMember members[2]; |
| 389 | |
| 390 | AssertArg(TransactionIdIsValid(xid1)); |
| 391 | AssertArg(TransactionIdIsValid(xid2)); |
| 392 | |
| 393 | Assert(!TransactionIdEquals(xid1, xid2) || (status1 != status2)); |
| 394 | |
| 395 | /* MultiXactIdSetOldestMember() must have been called already. */ |
| 396 | Assert(MultiXactIdIsValid(OldestMemberMXactId[MyBackendId])); |
| 397 | |
| 398 | /* |
| 399 | * Note: unlike MultiXactIdExpand, we don't bother to check that both XIDs |
| 400 | * are still running. In typical usage, xid2 will be our own XID and the |
| 401 | * caller just did a check on xid1, so it'd be wasted effort. |
| 402 | */ |
| 403 | |
| 404 | members[0].xid = xid1; |
| 405 | members[0].status = status1; |
| 406 | members[1].xid = xid2; |
| 407 | members[1].status = status2; |
| 408 | |
| 409 | newMulti = MultiXactIdCreateFromMembers(2, members); |
| 410 | |
| 411 | debug_elog3(DEBUG2, "Create: %s" , |
| 412 | mxid_to_string(newMulti, 2, members)); |
| 413 | |
| 414 | return newMulti; |
| 415 | } |
| 416 | |
| 417 | /* |
| 418 | * MultiXactIdExpand |
| 419 | * Add a TransactionId to a pre-existing MultiXactId. |
| 420 | * |
| 421 | * If the TransactionId is already a member of the passed MultiXactId with the |
| 422 | * same status, just return it as-is. |
| 423 | * |
| 424 | * Note that we do NOT actually modify the membership of a pre-existing |
| 425 | * MultiXactId; instead we create a new one. This is necessary to avoid |
| 426 | * a race condition against code trying to wait for one MultiXactId to finish; |
| 427 | * see notes in heapam.c. |
| 428 | * |
| 429 | * NB - we don't worry about our local MultiXactId cache here, because that |
| 430 | * is handled by the lower-level routines. |
| 431 | * |
| 432 | * Note: It is critical that MultiXactIds that come from an old cluster (i.e. |
| 433 | * one upgraded by pg_upgrade from a cluster older than this feature) are not |
| 434 | * passed in. |
| 435 | */ |
| 436 | MultiXactId |
| 437 | MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status) |
| 438 | { |
| 439 | MultiXactId newMulti; |
| 440 | MultiXactMember *members; |
| 441 | MultiXactMember *newMembers; |
| 442 | int nmembers; |
| 443 | int i; |
| 444 | int j; |
| 445 | |
| 446 | AssertArg(MultiXactIdIsValid(multi)); |
| 447 | AssertArg(TransactionIdIsValid(xid)); |
| 448 | |
| 449 | /* MultiXactIdSetOldestMember() must have been called already. */ |
| 450 | Assert(MultiXactIdIsValid(OldestMemberMXactId[MyBackendId])); |
| 451 | |
| 452 | debug_elog5(DEBUG2, "Expand: received multi %u, xid %u status %s" , |
| 453 | multi, xid, mxstatus_to_string(status)); |
| 454 | |
| 455 | /* |
| 456 | * Note: we don't allow for old multis here. The reason is that the only |
| 457 | * caller of this function does a check that the multixact is no longer |
| 458 | * running. |
| 459 | */ |
| 460 | nmembers = GetMultiXactIdMembers(multi, &members, false, false); |
| 461 | |
| 462 | if (nmembers < 0) |
| 463 | { |
| 464 | MultiXactMember member; |
| 465 | |
| 466 | /* |
| 467 | * The MultiXactId is obsolete. This can only happen if all the |
| 468 | * MultiXactId members stop running between the caller checking and |
| 469 | * passing it to us. It would be better to return that fact to the |
| 470 | * caller, but it would complicate the API and it's unlikely to happen |
| 471 | * too often, so just deal with it by creating a singleton MultiXact. |
| 472 | */ |
| 473 | member.xid = xid; |
| 474 | member.status = status; |
| 475 | newMulti = MultiXactIdCreateFromMembers(1, &member); |
| 476 | |
| 477 | debug_elog4(DEBUG2, "Expand: %u has no members, create singleton %u" , |
| 478 | multi, newMulti); |
| 479 | return newMulti; |
| 480 | } |
| 481 | |
| 482 | /* |
| 483 | * If the TransactionId is already a member of the MultiXactId with the |
| 484 | * same status, just return the existing MultiXactId. |
| 485 | */ |
| 486 | for (i = 0; i < nmembers; i++) |
| 487 | { |
| 488 | if (TransactionIdEquals(members[i].xid, xid) && |
| 489 | (members[i].status == status)) |
| 490 | { |
| 491 | debug_elog4(DEBUG2, "Expand: %u is already a member of %u" , |
| 492 | xid, multi); |
| 493 | pfree(members); |
| 494 | return multi; |
| 495 | } |
| 496 | } |
| 497 | |
| 498 | /* |
| 499 | * Determine which of the members of the MultiXactId are still of |
| 500 | * interest. This is any running transaction, and also any transaction |
| 501 | * that grabbed something stronger than just a lock and was committed. (An |
| 502 | * update that aborted is of no interest here; and having more than one |
| 503 | * update Xid in a multixact would cause errors elsewhere.) |
| 504 | * |
| 505 | * Removing dead members is not just an optimization: freezing of tuples |
| 506 | * whose Xmax are multis depends on this behavior. |
| 507 | * |
| 508 | * Note we have the same race condition here as above: j could be 0 at the |
| 509 | * end of the loop. |
| 510 | */ |
| 511 | newMembers = (MultiXactMember *) |
| 512 | palloc(sizeof(MultiXactMember) * (nmembers + 1)); |
| 513 | |
| 514 | for (i = 0, j = 0; i < nmembers; i++) |
| 515 | { |
| 516 | if (TransactionIdIsInProgress(members[i].xid) || |
| 517 | (ISUPDATE_from_mxstatus(members[i].status) && |
| 518 | TransactionIdDidCommit(members[i].xid))) |
| 519 | { |
| 520 | newMembers[j].xid = members[i].xid; |
| 521 | newMembers[j++].status = members[i].status; |
| 522 | } |
| 523 | } |
| 524 | |
| 525 | newMembers[j].xid = xid; |
| 526 | newMembers[j++].status = status; |
| 527 | newMulti = MultiXactIdCreateFromMembers(j, newMembers); |
| 528 | |
| 529 | pfree(members); |
| 530 | pfree(newMembers); |
| 531 | |
| 532 | debug_elog3(DEBUG2, "Expand: returning new multi %u" , newMulti); |
| 533 | |
| 534 | return newMulti; |
| 535 | } |
| 536 | |
| 537 | /* |
| 538 | * MultiXactIdIsRunning |
| 539 | * Returns whether a MultiXactId is "running". |
| 540 | * |
| 541 | * We return true if at least one member of the given MultiXactId is still |
| 542 | * running. Note that a "false" result is certain not to change, |
| 543 | * because it is not legal to add members to an existing MultiXactId. |
| 544 | * |
| 545 | * Caller is expected to have verified that the multixact does not come from |
| 546 | * a pg_upgraded share-locked tuple. |
| 547 | */ |
| 548 | bool |
| 549 | MultiXactIdIsRunning(MultiXactId multi, bool isLockOnly) |
| 550 | { |
| 551 | MultiXactMember *members; |
| 552 | int nmembers; |
| 553 | int i; |
| 554 | |
| 555 | debug_elog3(DEBUG2, "IsRunning %u?" , multi); |
| 556 | |
| 557 | /* |
| 558 | * "false" here means we assume our callers have checked that the given |
| 559 | * multi cannot possibly come from a pg_upgraded database. |
| 560 | */ |
| 561 | nmembers = GetMultiXactIdMembers(multi, &members, false, isLockOnly); |
| 562 | |
| 563 | if (nmembers <= 0) |
| 564 | { |
| 565 | debug_elog2(DEBUG2, "IsRunning: no members" ); |
| 566 | return false; |
| 567 | } |
| 568 | |
| 569 | /* |
| 570 | * Checking for myself is cheap compared to looking in shared memory; |
| 571 | * return true if any live subtransaction of the current top-level |
| 572 | * transaction is a member. |
| 573 | * |
| 574 | * This is not needed for correctness, it's just a fast path. |
| 575 | */ |
| 576 | for (i = 0; i < nmembers; i++) |
| 577 | { |
| 578 | if (TransactionIdIsCurrentTransactionId(members[i].xid)) |
| 579 | { |
| 580 | debug_elog3(DEBUG2, "IsRunning: I (%d) am running!" , i); |
| 581 | pfree(members); |
| 582 | return true; |
| 583 | } |
| 584 | } |
| 585 | |
| 586 | /* |
| 587 | * This could be made faster by having another entry point in procarray.c, |
| 588 | * walking the PGPROC array only once for all the members. But in most |
| 589 | * cases nmembers should be small enough that it doesn't much matter. |
| 590 | */ |
| 591 | for (i = 0; i < nmembers; i++) |
| 592 | { |
| 593 | if (TransactionIdIsInProgress(members[i].xid)) |
| 594 | { |
| 595 | debug_elog4(DEBUG2, "IsRunning: member %d (%u) is running" , |
| 596 | i, members[i].xid); |
| 597 | pfree(members); |
| 598 | return true; |
| 599 | } |
| 600 | } |
| 601 | |
| 602 | pfree(members); |
| 603 | |
| 604 | debug_elog3(DEBUG2, "IsRunning: %u is not running" , multi); |
| 605 | |
| 606 | return false; |
| 607 | } |
| 608 | |
| 609 | /* |
| 610 | * MultiXactIdSetOldestMember |
| 611 | * Save the oldest MultiXactId this transaction could be a member of. |
| 612 | * |
| 613 | * We set the OldestMemberMXactId for a given transaction the first time it's |
| 614 | * going to do some operation that might require a MultiXactId (tuple lock, |
| 615 | * update or delete). We need to do this even if we end up using a |
| 616 | * TransactionId instead of a MultiXactId, because there is a chance that |
| 617 | * another transaction would add our XID to a MultiXactId. |
| 618 | * |
| 619 | * The value to set is the next-to-be-assigned MultiXactId, so this is meant to |
| 620 | * be called just before doing any such possibly-MultiXactId-able operation. |
| 621 | */ |
| 622 | void |
| 623 | MultiXactIdSetOldestMember(void) |
| 624 | { |
| 625 | if (!MultiXactIdIsValid(OldestMemberMXactId[MyBackendId])) |
| 626 | { |
| 627 | MultiXactId nextMXact; |
| 628 | |
| 629 | /* |
| 630 | * You might think we don't need to acquire a lock here, since |
| 631 | * fetching and storing of TransactionIds is probably atomic, but in |
| 632 | * fact we do: suppose we pick up nextMXact and then lose the CPU for |
| 633 | * a long time. Someone else could advance nextMXact, and then |
| 634 | * another someone else could compute an OldestVisibleMXactId that |
| 635 | * would be after the value we are going to store when we get control |
| 636 | * back. Which would be wrong. |
| 637 | * |
| 638 | * Note that a shared lock is sufficient, because it's enough to stop |
| 639 | * someone from advancing nextMXact; and nobody else could be trying |
| 640 | * to write to our OldestMember entry, only reading (and we assume |
| 641 | * storing it is atomic.) |
| 642 | */ |
| 643 | LWLockAcquire(MultiXactGenLock, LW_SHARED); |
| 644 | |
| 645 | /* |
| 646 | * We have to beware of the possibility that nextMXact is in the |
| 647 | * wrapped-around state. We don't fix the counter itself here, but we |
| 648 | * must be sure to store a valid value in our array entry. |
| 649 | */ |
| 650 | nextMXact = MultiXactState->nextMXact; |
| 651 | if (nextMXact < FirstMultiXactId) |
| 652 | nextMXact = FirstMultiXactId; |
| 653 | |
| 654 | OldestMemberMXactId[MyBackendId] = nextMXact; |
| 655 | |
| 656 | LWLockRelease(MultiXactGenLock); |
| 657 | |
| 658 | debug_elog4(DEBUG2, "MultiXact: setting OldestMember[%d] = %u" , |
| 659 | MyBackendId, nextMXact); |
| 660 | } |
| 661 | } |
| 662 | |
| 663 | /* |
| 664 | * MultiXactIdSetOldestVisible |
| 665 | * Save the oldest MultiXactId this transaction considers possibly live. |
| 666 | * |
| 667 | * We set the OldestVisibleMXactId for a given transaction the first time |
| 668 | * it's going to inspect any MultiXactId. Once we have set this, we are |
| 669 | * guaranteed that the checkpointer won't truncate off SLRU data for |
| 670 | * MultiXactIds at or after our OldestVisibleMXactId. |
| 671 | * |
| 672 | * The value to set is the oldest of nextMXact and all the valid per-backend |
| 673 | * OldestMemberMXactId[] entries. Because of the locking we do, we can be |
| 674 | * certain that no subsequent call to MultiXactIdSetOldestMember can set |
| 675 | * an OldestMemberMXactId[] entry older than what we compute here. Therefore |
| 676 | * there is no live transaction, now or later, that can be a member of any |
| 677 | * MultiXactId older than the OldestVisibleMXactId we compute here. |
| 678 | */ |
| 679 | static void |
| 680 | MultiXactIdSetOldestVisible(void) |
| 681 | { |
| 682 | if (!MultiXactIdIsValid(OldestVisibleMXactId[MyBackendId])) |
| 683 | { |
| 684 | MultiXactId oldestMXact; |
| 685 | int i; |
| 686 | |
| 687 | LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); |
| 688 | |
| 689 | /* |
| 690 | * We have to beware of the possibility that nextMXact is in the |
| 691 | * wrapped-around state. We don't fix the counter itself here, but we |
| 692 | * must be sure to store a valid value in our array entry. |
| 693 | */ |
| 694 | oldestMXact = MultiXactState->nextMXact; |
| 695 | if (oldestMXact < FirstMultiXactId) |
| 696 | oldestMXact = FirstMultiXactId; |
| 697 | |
| 698 | for (i = 1; i <= MaxOldestSlot; i++) |
| 699 | { |
| 700 | MultiXactId thisoldest = OldestMemberMXactId[i]; |
| 701 | |
| 702 | if (MultiXactIdIsValid(thisoldest) && |
| 703 | MultiXactIdPrecedes(thisoldest, oldestMXact)) |
| 704 | oldestMXact = thisoldest; |
| 705 | } |
| 706 | |
| 707 | OldestVisibleMXactId[MyBackendId] = oldestMXact; |
| 708 | |
| 709 | LWLockRelease(MultiXactGenLock); |
| 710 | |
| 711 | debug_elog4(DEBUG2, "MultiXact: setting OldestVisible[%d] = %u" , |
| 712 | MyBackendId, oldestMXact); |
| 713 | } |
| 714 | } |
| 715 | |
| 716 | /* |
| 717 | * ReadNextMultiXactId |
| 718 | * Return the next MultiXactId to be assigned, but don't allocate it |
| 719 | */ |
| 720 | MultiXactId |
| 721 | ReadNextMultiXactId(void) |
| 722 | { |
| 723 | MultiXactId mxid; |
| 724 | |
| 725 | /* XXX we could presumably do this without a lock. */ |
| 726 | LWLockAcquire(MultiXactGenLock, LW_SHARED); |
| 727 | mxid = MultiXactState->nextMXact; |
| 728 | LWLockRelease(MultiXactGenLock); |
| 729 | |
| 730 | if (mxid < FirstMultiXactId) |
| 731 | mxid = FirstMultiXactId; |
| 732 | |
| 733 | return mxid; |
| 734 | } |
| 735 | |
| 736 | /* |
| 737 | * MultiXactIdCreateFromMembers |
| 738 | * Make a new MultiXactId from the specified set of members |
| 739 | * |
| 740 | * Make XLOG, SLRU and cache entries for a new MultiXactId, recording the |
| 741 | * given TransactionIds as members. Returns the newly created MultiXactId. |
| 742 | * |
| 743 | * NB: the passed members[] array will be sorted in-place. |
| 744 | */ |
| 745 | MultiXactId |
| 746 | MultiXactIdCreateFromMembers(int nmembers, MultiXactMember *members) |
| 747 | { |
| 748 | MultiXactId multi; |
| 749 | MultiXactOffset offset; |
| 750 | xl_multixact_create xlrec; |
| 751 | |
| 752 | debug_elog3(DEBUG2, "Create: %s" , |
| 753 | mxid_to_string(InvalidMultiXactId, nmembers, members)); |
| 754 | |
| 755 | /* |
| 756 | * See if the same set of members already exists in our cache; if so, just |
| 757 | * re-use that MultiXactId. (Note: it might seem that looking in our |
| 758 | * cache is insufficient, and we ought to search disk to see if a |
| 759 | * duplicate definition already exists. But since we only ever create |
| 760 | * MultiXacts containing our own XID, in most cases any such MultiXacts |
| 761 | * were in fact created by us, and so will be in our cache. There are |
| 762 | * corner cases where someone else added us to a MultiXact without our |
| 763 | * knowledge, but it's not worth checking for.) |
| 764 | */ |
| 765 | multi = mXactCacheGetBySet(nmembers, members); |
| 766 | if (MultiXactIdIsValid(multi)) |
| 767 | { |
| 768 | debug_elog2(DEBUG2, "Create: in cache!" ); |
| 769 | return multi; |
| 770 | } |
| 771 | |
| 772 | /* Verify that there is a single update Xid among the given members. */ |
| 773 | { |
| 774 | int i; |
| 775 | bool has_update = false; |
| 776 | |
| 777 | for (i = 0; i < nmembers; i++) |
| 778 | { |
| 779 | if (ISUPDATE_from_mxstatus(members[i].status)) |
| 780 | { |
| 781 | if (has_update) |
| 782 | elog(ERROR, "new multixact has more than one updating member" ); |
| 783 | has_update = true; |
| 784 | } |
| 785 | } |
| 786 | } |
| 787 | |
| 788 | /* |
| 789 | * Assign the MXID and offsets range to use, and make sure there is space |
| 790 | * in the OFFSETs and MEMBERs files. NB: this routine does |
| 791 | * START_CRIT_SECTION(). |
| 792 | * |
| 793 | * Note: unlike MultiXactIdCreate and MultiXactIdExpand, we do not check |
| 794 | * that we've called MultiXactIdSetOldestMember here. This is because |
| 795 | * this routine is used in some places to create new MultiXactIds of which |
| 796 | * the current backend is not a member, notably during freezing of multis |
| 797 | * in vacuum. During vacuum, in particular, it would be unacceptable to |
| 798 | * keep OldestMulti set, in case it runs for long. |
| 799 | */ |
| 800 | multi = GetNewMultiXactId(nmembers, &offset); |
| 801 | |
| 802 | /* Make an XLOG entry describing the new MXID. */ |
| 803 | xlrec.mid = multi; |
| 804 | xlrec.moff = offset; |
| 805 | xlrec.nmembers = nmembers; |
| 806 | |
| 807 | /* |
| 808 | * XXX Note: there's a lot of padding space in MultiXactMember. We could |
| 809 | * find a more compact representation of this Xlog record -- perhaps all |
| 810 | * the status flags in one XLogRecData, then all the xids in another one? |
| 811 | * Not clear that it's worth the trouble though. |
| 812 | */ |
| 813 | XLogBeginInsert(); |
| 814 | XLogRegisterData((char *) (&xlrec), SizeOfMultiXactCreate); |
| 815 | XLogRegisterData((char *) members, nmembers * sizeof(MultiXactMember)); |
| 816 | |
| 817 | (void) XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_CREATE_ID); |
| 818 | |
| 819 | /* Now enter the information into the OFFSETs and MEMBERs logs */ |
| 820 | RecordNewMultiXact(multi, offset, nmembers, members); |
| 821 | |
| 822 | /* Done with critical section */ |
| 823 | END_CRIT_SECTION(); |
| 824 | |
| 825 | /* Store the new MultiXactId in the local cache, too */ |
| 826 | mXactCachePut(multi, nmembers, members); |
| 827 | |
| 828 | debug_elog2(DEBUG2, "Create: all done" ); |
| 829 | |
| 830 | return multi; |
| 831 | } |
| 832 | |
| 833 | /* |
| 834 | * RecordNewMultiXact |
| 835 | * Write info about a new multixact into the offsets and members files |
| 836 | * |
| 837 | * This is broken out of MultiXactIdCreateFromMembers so that xlog replay can |
| 838 | * use it. |
| 839 | */ |
| 840 | static void |
| 841 | RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, |
| 842 | int nmembers, MultiXactMember *members) |
| 843 | { |
| 844 | int pageno; |
| 845 | int prev_pageno; |
| 846 | int entryno; |
| 847 | int slotno; |
| 848 | MultiXactOffset *offptr; |
| 849 | int i; |
| 850 | |
| 851 | LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE); |
| 852 | |
| 853 | pageno = MultiXactIdToOffsetPage(multi); |
| 854 | entryno = MultiXactIdToOffsetEntry(multi); |
| 855 | |
| 856 | /* |
| 857 | * Note: we pass the MultiXactId to SimpleLruReadPage as the "transaction" |
| 858 | * to complain about if there's any I/O error. This is kinda bogus, but |
| 859 | * since the errors will always give the full pathname, it should be clear |
| 860 | * enough that a MultiXactId is really involved. Perhaps someday we'll |
| 861 | * take the trouble to generalize the slru.c error reporting code. |
| 862 | */ |
| 863 | slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi); |
| 864 | offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; |
| 865 | offptr += entryno; |
| 866 | |
| 867 | *offptr = offset; |
| 868 | |
| 869 | MultiXactOffsetCtl->shared->page_dirty[slotno] = true; |
| 870 | |
| 871 | /* Exchange our lock */ |
| 872 | LWLockRelease(MultiXactOffsetControlLock); |
| 873 | |
| 874 | LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE); |
| 875 | |
| 876 | prev_pageno = -1; |
| 877 | |
| 878 | for (i = 0; i < nmembers; i++, offset++) |
| 879 | { |
| 880 | TransactionId *memberptr; |
| 881 | uint32 *flagsptr; |
| 882 | uint32 flagsval; |
| 883 | int bshift; |
| 884 | int flagsoff; |
| 885 | int memberoff; |
| 886 | |
| 887 | Assert(members[i].status <= MultiXactStatusUpdate); |
| 888 | |
| 889 | pageno = MXOffsetToMemberPage(offset); |
| 890 | memberoff = MXOffsetToMemberOffset(offset); |
| 891 | flagsoff = MXOffsetToFlagsOffset(offset); |
| 892 | bshift = MXOffsetToFlagsBitShift(offset); |
| 893 | |
| 894 | if (pageno != prev_pageno) |
| 895 | { |
| 896 | slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, multi); |
| 897 | prev_pageno = pageno; |
| 898 | } |
| 899 | |
| 900 | memberptr = (TransactionId *) |
| 901 | (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff); |
| 902 | |
| 903 | *memberptr = members[i].xid; |
| 904 | |
| 905 | flagsptr = (uint32 *) |
| 906 | (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff); |
| 907 | |
| 908 | flagsval = *flagsptr; |
| 909 | flagsval &= ~(((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift); |
| 910 | flagsval |= (members[i].status << bshift); |
| 911 | *flagsptr = flagsval; |
| 912 | |
| 913 | MultiXactMemberCtl->shared->page_dirty[slotno] = true; |
| 914 | } |
| 915 | |
| 916 | LWLockRelease(MultiXactMemberControlLock); |
| 917 | } |
| 918 | |
| 919 | /* |
| 920 | * GetNewMultiXactId |
| 921 | * Get the next MultiXactId. |
| 922 | * |
| 923 | * Also, reserve the needed amount of space in the "members" area. The |
| 924 | * starting offset of the reserved space is returned in *offset. |
| 925 | * |
| 926 | * This may generate XLOG records for expansion of the offsets and/or members |
| 927 | * files. Unfortunately, we have to do that while holding MultiXactGenLock |
| 928 | * to avoid race conditions --- the XLOG record for zeroing a page must appear |
| 929 | * before any backend can possibly try to store data in that page! |
| 930 | * |
| 931 | * We start a critical section before advancing the shared counters. The |
| 932 | * caller must end the critical section after writing SLRU data. |
| 933 | */ |
| 934 | static MultiXactId |
| 935 | GetNewMultiXactId(int nmembers, MultiXactOffset *offset) |
| 936 | { |
| 937 | MultiXactId result; |
| 938 | MultiXactOffset nextOffset; |
| 939 | |
| 940 | debug_elog3(DEBUG2, "GetNew: for %d xids" , nmembers); |
| 941 | |
| 942 | /* safety check, we should never get this far in a HS standby */ |
| 943 | if (RecoveryInProgress()) |
| 944 | elog(ERROR, "cannot assign MultiXactIds during recovery" ); |
| 945 | |
| 946 | LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); |
| 947 | |
| 948 | /* Handle wraparound of the nextMXact counter */ |
| 949 | if (MultiXactState->nextMXact < FirstMultiXactId) |
| 950 | MultiXactState->nextMXact = FirstMultiXactId; |
| 951 | |
| 952 | /* Assign the MXID */ |
| 953 | result = MultiXactState->nextMXact; |
| 954 | |
| 955 | /*---------- |
| 956 | * Check to see if it's safe to assign another MultiXactId. This protects |
| 957 | * against catastrophic data loss due to multixact wraparound. The basic |
| 958 | * rules are: |
| 959 | * |
| 960 | * If we're past multiVacLimit or the safe threshold for member storage |
| 961 | * space, or we don't know what the safe threshold for member storage is, |
| 962 | * start trying to force autovacuum cycles. |
| 963 | * If we're past multiWarnLimit, start issuing warnings. |
| 964 | * If we're past multiStopLimit, refuse to create new MultiXactIds. |
| 965 | * |
| 966 | * Note these are pretty much the same protections in GetNewTransactionId. |
| 967 | *---------- |
| 968 | */ |
| 969 | if (!MultiXactIdPrecedes(result, MultiXactState->multiVacLimit)) |
| 970 | { |
| 971 | /* |
| 972 | * For safety's sake, we release MultiXactGenLock while sending |
| 973 | * signals, warnings, etc. This is not so much because we care about |
| 974 | * preserving concurrency in this situation, as to avoid any |
| 975 | * possibility of deadlock while doing get_database_name(). First, |
| 976 | * copy all the shared values we'll need in this path. |
| 977 | */ |
| 978 | MultiXactId multiWarnLimit = MultiXactState->multiWarnLimit; |
| 979 | MultiXactId multiStopLimit = MultiXactState->multiStopLimit; |
| 980 | MultiXactId multiWrapLimit = MultiXactState->multiWrapLimit; |
| 981 | Oid oldest_datoid = MultiXactState->oldestMultiXactDB; |
| 982 | |
| 983 | LWLockRelease(MultiXactGenLock); |
| 984 | |
| 985 | if (IsUnderPostmaster && |
| 986 | !MultiXactIdPrecedes(result, multiStopLimit)) |
| 987 | { |
| 988 | char *oldest_datname = get_database_name(oldest_datoid); |
| 989 | |
| 990 | /* |
| 991 | * Immediately kick autovacuum into action as we're already in |
| 992 | * ERROR territory. |
| 993 | */ |
| 994 | SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); |
| 995 | |
| 996 | /* complain even if that DB has disappeared */ |
| 997 | if (oldest_datname) |
| 998 | ereport(ERROR, |
| 999 | (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), |
| 1000 | errmsg("database is not accepting commands that generate new MultiXactIds to avoid wraparound data loss in database \"%s\"" , |
| 1001 | oldest_datname), |
| 1002 | errhint("Execute a database-wide VACUUM in that database.\n" |
| 1003 | "You might also need to commit or roll back old prepared transactions, or drop stale replication slots." ))); |
| 1004 | else |
| 1005 | ereport(ERROR, |
| 1006 | (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), |
| 1007 | errmsg("database is not accepting commands that generate new MultiXactIds to avoid wraparound data loss in database with OID %u" , |
| 1008 | oldest_datoid), |
| 1009 | errhint("Execute a database-wide VACUUM in that database.\n" |
| 1010 | "You might also need to commit or roll back old prepared transactions, or drop stale replication slots." ))); |
| 1011 | } |
| 1012 | |
| 1013 | /* |
| 1014 | * To avoid swamping the postmaster with signals, we issue the autovac |
| 1015 | * request only once per 64K multis generated. This still gives |
| 1016 | * plenty of chances before we get into real trouble. |
| 1017 | */ |
| 1018 | if (IsUnderPostmaster && (result % 65536) == 0) |
| 1019 | SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); |
| 1020 | |
| 1021 | if (!MultiXactIdPrecedes(result, multiWarnLimit)) |
| 1022 | { |
| 1023 | char *oldest_datname = get_database_name(oldest_datoid); |
| 1024 | |
| 1025 | /* complain even if that DB has disappeared */ |
| 1026 | if (oldest_datname) |
| 1027 | ereport(WARNING, |
| 1028 | (errmsg_plural("database \"%s\" must be vacuumed before %u more MultiXactId is used" , |
| 1029 | "database \"%s\" must be vacuumed before %u more MultiXactIds are used" , |
| 1030 | multiWrapLimit - result, |
| 1031 | oldest_datname, |
| 1032 | multiWrapLimit - result), |
| 1033 | errhint("Execute a database-wide VACUUM in that database.\n" |
| 1034 | "You might also need to commit or roll back old prepared transactions, or drop stale replication slots." ))); |
| 1035 | else |
| 1036 | ereport(WARNING, |
| 1037 | (errmsg_plural("database with OID %u must be vacuumed before %u more MultiXactId is used" , |
| 1038 | "database with OID %u must be vacuumed before %u more MultiXactIds are used" , |
| 1039 | multiWrapLimit - result, |
| 1040 | oldest_datoid, |
| 1041 | multiWrapLimit - result), |
| 1042 | errhint("Execute a database-wide VACUUM in that database.\n" |
| 1043 | "You might also need to commit or roll back old prepared transactions, or drop stale replication slots." ))); |
| 1044 | } |
| 1045 | |
| 1046 | /* Re-acquire lock and start over */ |
| 1047 | LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); |
| 1048 | result = MultiXactState->nextMXact; |
| 1049 | if (result < FirstMultiXactId) |
| 1050 | result = FirstMultiXactId; |
| 1051 | } |
| 1052 | |
| 1053 | /* Make sure there is room for the MXID in the file. */ |
| 1054 | ExtendMultiXactOffset(result); |
| 1055 | |
| 1056 | /* |
| 1057 | * Reserve the members space, similarly to above. Also, be careful not to |
| 1058 | * return zero as the starting offset for any multixact. See |
| 1059 | * GetMultiXactIdMembers() for motivation. |
| 1060 | */ |
| 1061 | nextOffset = MultiXactState->nextOffset; |
| 1062 | if (nextOffset == 0) |
| 1063 | { |
| 1064 | *offset = 1; |
| 1065 | nmembers++; /* allocate member slot 0 too */ |
| 1066 | } |
| 1067 | else |
| 1068 | *offset = nextOffset; |
| 1069 | |
| 1070 | /*---------- |
| 1071 | * Protect against overrun of the members space as well, with the |
| 1072 | * following rules: |
| 1073 | * |
| 1074 | * If we're past offsetStopLimit, refuse to generate more multis. |
| 1075 | * If we're close to offsetStopLimit, emit a warning. |
| 1076 | * |
| 1077 | * Arbitrarily, we start emitting warnings when we're 20 segments or less |
| 1078 | * from offsetStopLimit. |
| 1079 | * |
| 1080 | * Note we haven't updated the shared state yet, so if we fail at this |
| 1081 | * point, the multixact ID we grabbed can still be used by the next guy. |
| 1082 | * |
| 1083 | * Note that there is no point in forcing autovacuum runs here: the |
| 1084 | * multixact freeze settings would have to be reduced for that to have any |
| 1085 | * effect. |
| 1086 | *---------- |
| 1087 | */ |
| 1088 | #define OFFSET_WARN_SEGMENTS 20 |
| 1089 | if (MultiXactState->oldestOffsetKnown && |
| 1090 | MultiXactOffsetWouldWrap(MultiXactState->offsetStopLimit, nextOffset, |
| 1091 | nmembers)) |
| 1092 | { |
| 1093 | /* see comment in the corresponding offsets wraparound case */ |
| 1094 | SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); |
| 1095 | |
| 1096 | ereport(ERROR, |
| 1097 | (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), |
| 1098 | errmsg("multixact \"members\" limit exceeded" ), |
| 1099 | errdetail_plural("This command would create a multixact with %u members, but the remaining space is only enough for %u member." , |
| 1100 | "This command would create a multixact with %u members, but the remaining space is only enough for %u members." , |
| 1101 | MultiXactState->offsetStopLimit - nextOffset - 1, |
| 1102 | nmembers, |
| 1103 | MultiXactState->offsetStopLimit - nextOffset - 1), |
| 1104 | errhint("Execute a database-wide VACUUM in database with OID %u with reduced vacuum_multixact_freeze_min_age and vacuum_multixact_freeze_table_age settings." , |
| 1105 | MultiXactState->oldestMultiXactDB))); |
| 1106 | } |
| 1107 | |
| 1108 | /* |
| 1109 | * Check whether we should kick autovacuum into action, to prevent members |
| 1110 | * wraparound. NB we use a much larger window to trigger autovacuum than |
| 1111 | * just the warning limit. The warning is just a measure of last resort - |
| 1112 | * this is in line with GetNewTransactionId's behaviour. |
| 1113 | */ |
| 1114 | if (!MultiXactState->oldestOffsetKnown || |
| 1115 | (MultiXactState->nextOffset - MultiXactState->oldestOffset |
| 1116 | > MULTIXACT_MEMBER_SAFE_THRESHOLD)) |
| 1117 | { |
| 1118 | /* |
| 1119 | * To avoid swamping the postmaster with signals, we issue the autovac |
| 1120 | * request only when crossing a segment boundary. With default |
| 1121 | * compilation settings that's roughly after 50k members. This still |
| 1122 | * gives plenty of chances before we get into real trouble. |
| 1123 | */ |
| 1124 | if ((MXOffsetToMemberPage(nextOffset) / SLRU_PAGES_PER_SEGMENT) != |
| 1125 | (MXOffsetToMemberPage(nextOffset + nmembers) / SLRU_PAGES_PER_SEGMENT)) |
| 1126 | SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); |
| 1127 | } |
| 1128 | |
| 1129 | if (MultiXactState->oldestOffsetKnown && |
| 1130 | MultiXactOffsetWouldWrap(MultiXactState->offsetStopLimit, |
| 1131 | nextOffset, |
| 1132 | nmembers + MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT * OFFSET_WARN_SEGMENTS)) |
| 1133 | ereport(WARNING, |
| 1134 | (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), |
| 1135 | errmsg_plural("database with OID %u must be vacuumed before %d more multixact member is used" , |
| 1136 | "database with OID %u must be vacuumed before %d more multixact members are used" , |
| 1137 | MultiXactState->offsetStopLimit - nextOffset + nmembers, |
| 1138 | MultiXactState->oldestMultiXactDB, |
| 1139 | MultiXactState->offsetStopLimit - nextOffset + nmembers), |
| 1140 | errhint("Execute a database-wide VACUUM in that database with reduced vacuum_multixact_freeze_min_age and vacuum_multixact_freeze_table_age settings." ))); |
| 1141 | |
| 1142 | ExtendMultiXactMember(nextOffset, nmembers); |
| 1143 | |
| 1144 | /* |
| 1145 | * Critical section from here until caller has written the data into the |
| 1146 | * just-reserved SLRU space; we don't want to error out with a partly |
| 1147 | * written MultiXact structure. (In particular, failing to write our |
| 1148 | * start offset after advancing nextMXact would effectively corrupt the |
| 1149 | * previous MultiXact.) |
| 1150 | */ |
| 1151 | START_CRIT_SECTION(); |
| 1152 | |
| 1153 | /* |
| 1154 | * Advance counters. As in GetNewTransactionId(), this must not happen |
| 1155 | * until after file extension has succeeded! |
| 1156 | * |
| 1157 | * We don't care about MultiXactId wraparound here; it will be handled by |
| 1158 | * the next iteration. But note that nextMXact may be InvalidMultiXactId |
| 1159 | * or the first value on a segment-beginning page after this routine |
| 1160 | * exits, so anyone else looking at the variable must be prepared to deal |
| 1161 | * with either case. Similarly, nextOffset may be zero, but we won't use |
| 1162 | * that as the actual start offset of the next multixact. |
| 1163 | */ |
| 1164 | (MultiXactState->nextMXact)++; |
| 1165 | |
| 1166 | MultiXactState->nextOffset += nmembers; |
| 1167 | |
| 1168 | LWLockRelease(MultiXactGenLock); |
| 1169 | |
| 1170 | debug_elog4(DEBUG2, "GetNew: returning %u offset %u" , result, *offset); |
| 1171 | return result; |
| 1172 | } |
| 1173 | |
| 1174 | /* |
| 1175 | * GetMultiXactIdMembers |
| 1176 | * Return the set of MultiXactMembers that make up a MultiXactId |
| 1177 | * |
| 1178 | * Return value is the number of members found, or -1 if there are none, |
| 1179 | * and *members is set to a newly palloc'ed array of members. It's the |
| 1180 | * caller's responsibility to free it when done with it. |
| 1181 | * |
| 1182 | * from_pgupgrade must be passed as true if and only if only the multixact |
| 1183 | * corresponds to a value from a tuple that was locked in a 9.2-or-older |
| 1184 | * installation and later pg_upgrade'd (that is, the infomask is |
| 1185 | * HEAP_LOCKED_UPGRADED). In this case, we know for certain that no members |
| 1186 | * can still be running, so we return -1 just like for an empty multixact |
| 1187 | * without any further checking. It would be wrong to try to resolve such a |
| 1188 | * multixact: either the multixact is within the current valid multixact |
| 1189 | * range, in which case the returned result would be bogus, or outside that |
| 1190 | * range, in which case an error would be raised. |
| 1191 | * |
| 1192 | * In all other cases, the passed multixact must be within the known valid |
| 1193 | * range, that is, greater to or equal than oldestMultiXactId, and less than |
| 1194 | * nextMXact. Otherwise, an error is raised. |
| 1195 | * |
| 1196 | * onlyLock must be set to true if caller is certain that the given multi |
| 1197 | * is used only to lock tuples; can be false without loss of correctness, |
| 1198 | * but passing a true means we can return quickly without checking for |
| 1199 | * old updates. |
| 1200 | */ |
| 1201 | int |
| 1202 | GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, |
| 1203 | bool from_pgupgrade, bool onlyLock) |
| 1204 | { |
| 1205 | int pageno; |
| 1206 | int prev_pageno; |
| 1207 | int entryno; |
| 1208 | int slotno; |
| 1209 | MultiXactOffset *offptr; |
| 1210 | MultiXactOffset offset; |
| 1211 | int length; |
| 1212 | int truelength; |
| 1213 | int i; |
| 1214 | MultiXactId oldestMXact; |
| 1215 | MultiXactId nextMXact; |
| 1216 | MultiXactId tmpMXact; |
| 1217 | MultiXactOffset nextOffset; |
| 1218 | MultiXactMember *ptr; |
| 1219 | |
| 1220 | debug_elog3(DEBUG2, "GetMembers: asked for %u" , multi); |
| 1221 | |
| 1222 | if (!MultiXactIdIsValid(multi) || from_pgupgrade) |
| 1223 | return -1; |
| 1224 | |
| 1225 | /* See if the MultiXactId is in the local cache */ |
| 1226 | length = mXactCacheGetById(multi, members); |
| 1227 | if (length >= 0) |
| 1228 | { |
| 1229 | debug_elog3(DEBUG2, "GetMembers: found %s in the cache" , |
| 1230 | mxid_to_string(multi, length, *members)); |
| 1231 | return length; |
| 1232 | } |
| 1233 | |
| 1234 | /* Set our OldestVisibleMXactId[] entry if we didn't already */ |
| 1235 | MultiXactIdSetOldestVisible(); |
| 1236 | |
| 1237 | /* |
| 1238 | * If we know the multi is used only for locking and not for updates, then |
| 1239 | * we can skip checking if the value is older than our oldest visible |
| 1240 | * multi. It cannot possibly still be running. |
| 1241 | */ |
| 1242 | if (onlyLock && |
| 1243 | MultiXactIdPrecedes(multi, OldestVisibleMXactId[MyBackendId])) |
| 1244 | { |
| 1245 | debug_elog2(DEBUG2, "GetMembers: a locker-only multi is too old" ); |
| 1246 | *members = NULL; |
| 1247 | return -1; |
| 1248 | } |
| 1249 | |
| 1250 | /* |
| 1251 | * We check known limits on MultiXact before resorting to the SLRU area. |
| 1252 | * |
| 1253 | * An ID older than MultiXactState->oldestMultiXactId cannot possibly be |
| 1254 | * useful; it has already been removed, or will be removed shortly, by |
| 1255 | * truncation. If one is passed, an error is raised. |
| 1256 | * |
| 1257 | * Also, an ID >= nextMXact shouldn't ever be seen here; if it is seen, it |
| 1258 | * implies undetected ID wraparound has occurred. This raises a hard |
| 1259 | * error. |
| 1260 | * |
| 1261 | * Shared lock is enough here since we aren't modifying any global state. |
| 1262 | * Acquire it just long enough to grab the current counter values. We may |
| 1263 | * need both nextMXact and nextOffset; see below. |
| 1264 | */ |
| 1265 | LWLockAcquire(MultiXactGenLock, LW_SHARED); |
| 1266 | |
| 1267 | oldestMXact = MultiXactState->oldestMultiXactId; |
| 1268 | nextMXact = MultiXactState->nextMXact; |
| 1269 | nextOffset = MultiXactState->nextOffset; |
| 1270 | |
| 1271 | LWLockRelease(MultiXactGenLock); |
| 1272 | |
| 1273 | if (MultiXactIdPrecedes(multi, oldestMXact)) |
| 1274 | { |
| 1275 | ereport(ERROR, |
| 1276 | (errcode(ERRCODE_INTERNAL_ERROR), |
| 1277 | errmsg("MultiXactId %u does no longer exist -- apparent wraparound" , |
| 1278 | multi))); |
| 1279 | return -1; |
| 1280 | } |
| 1281 | |
| 1282 | if (!MultiXactIdPrecedes(multi, nextMXact)) |
| 1283 | ereport(ERROR, |
| 1284 | (errcode(ERRCODE_INTERNAL_ERROR), |
| 1285 | errmsg("MultiXactId %u has not been created yet -- apparent wraparound" , |
| 1286 | multi))); |
| 1287 | |
| 1288 | /* |
| 1289 | * Find out the offset at which we need to start reading MultiXactMembers |
| 1290 | * and the number of members in the multixact. We determine the latter as |
| 1291 | * the difference between this multixact's starting offset and the next |
| 1292 | * one's. However, there are some corner cases to worry about: |
| 1293 | * |
| 1294 | * 1. This multixact may be the latest one created, in which case there is |
| 1295 | * no next one to look at. In this case the nextOffset value we just |
| 1296 | * saved is the correct endpoint. |
| 1297 | * |
| 1298 | * 2. The next multixact may still be in process of being filled in: that |
| 1299 | * is, another process may have done GetNewMultiXactId but not yet written |
| 1300 | * the offset entry for that ID. In that scenario, it is guaranteed that |
| 1301 | * the offset entry for that multixact exists (because GetNewMultiXactId |
| 1302 | * won't release MultiXactGenLock until it does) but contains zero |
| 1303 | * (because we are careful to pre-zero offset pages). Because |
| 1304 | * GetNewMultiXactId will never return zero as the starting offset for a |
| 1305 | * multixact, when we read zero as the next multixact's offset, we know we |
| 1306 | * have this case. We sleep for a bit and try again. |
| 1307 | * |
| 1308 | * 3. Because GetNewMultiXactId increments offset zero to offset one to |
| 1309 | * handle case #2, there is an ambiguity near the point of offset |
| 1310 | * wraparound. If we see next multixact's offset is one, is that our |
| 1311 | * multixact's actual endpoint, or did it end at zero with a subsequent |
| 1312 | * increment? We handle this using the knowledge that if the zero'th |
| 1313 | * member slot wasn't filled, it'll contain zero, and zero isn't a valid |
| 1314 | * transaction ID so it can't be a multixact member. Therefore, if we |
| 1315 | * read a zero from the members array, just ignore it. |
| 1316 | * |
| 1317 | * This is all pretty messy, but the mess occurs only in infrequent corner |
| 1318 | * cases, so it seems better than holding the MultiXactGenLock for a long |
| 1319 | * time on every multixact creation. |
| 1320 | */ |
| 1321 | retry: |
| 1322 | LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE); |
| 1323 | |
| 1324 | pageno = MultiXactIdToOffsetPage(multi); |
| 1325 | entryno = MultiXactIdToOffsetEntry(multi); |
| 1326 | |
| 1327 | slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi); |
| 1328 | offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; |
| 1329 | offptr += entryno; |
| 1330 | offset = *offptr; |
| 1331 | |
| 1332 | Assert(offset != 0); |
| 1333 | |
| 1334 | /* |
| 1335 | * Use the same increment rule as GetNewMultiXactId(), that is, don't |
| 1336 | * handle wraparound explicitly until needed. |
| 1337 | */ |
| 1338 | tmpMXact = multi + 1; |
| 1339 | |
| 1340 | if (nextMXact == tmpMXact) |
| 1341 | { |
| 1342 | /* Corner case 1: there is no next multixact */ |
| 1343 | length = nextOffset - offset; |
| 1344 | } |
| 1345 | else |
| 1346 | { |
| 1347 | MultiXactOffset nextMXOffset; |
| 1348 | |
| 1349 | /* handle wraparound if needed */ |
| 1350 | if (tmpMXact < FirstMultiXactId) |
| 1351 | tmpMXact = FirstMultiXactId; |
| 1352 | |
| 1353 | prev_pageno = pageno; |
| 1354 | |
| 1355 | pageno = MultiXactIdToOffsetPage(tmpMXact); |
| 1356 | entryno = MultiXactIdToOffsetEntry(tmpMXact); |
| 1357 | |
| 1358 | if (pageno != prev_pageno) |
| 1359 | slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, tmpMXact); |
| 1360 | |
| 1361 | offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; |
| 1362 | offptr += entryno; |
| 1363 | nextMXOffset = *offptr; |
| 1364 | |
| 1365 | if (nextMXOffset == 0) |
| 1366 | { |
| 1367 | /* Corner case 2: next multixact is still being filled in */ |
| 1368 | LWLockRelease(MultiXactOffsetControlLock); |
| 1369 | CHECK_FOR_INTERRUPTS(); |
| 1370 | pg_usleep(1000L); |
| 1371 | goto retry; |
| 1372 | } |
| 1373 | |
| 1374 | length = nextMXOffset - offset; |
| 1375 | } |
| 1376 | |
| 1377 | LWLockRelease(MultiXactOffsetControlLock); |
| 1378 | |
| 1379 | ptr = (MultiXactMember *) palloc(length * sizeof(MultiXactMember)); |
| 1380 | *members = ptr; |
| 1381 | |
| 1382 | /* Now get the members themselves. */ |
| 1383 | LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE); |
| 1384 | |
| 1385 | truelength = 0; |
| 1386 | prev_pageno = -1; |
| 1387 | for (i = 0; i < length; i++, offset++) |
| 1388 | { |
| 1389 | TransactionId *xactptr; |
| 1390 | uint32 *flagsptr; |
| 1391 | int flagsoff; |
| 1392 | int bshift; |
| 1393 | int memberoff; |
| 1394 | |
| 1395 | pageno = MXOffsetToMemberPage(offset); |
| 1396 | memberoff = MXOffsetToMemberOffset(offset); |
| 1397 | |
| 1398 | if (pageno != prev_pageno) |
| 1399 | { |
| 1400 | slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, multi); |
| 1401 | prev_pageno = pageno; |
| 1402 | } |
| 1403 | |
| 1404 | xactptr = (TransactionId *) |
| 1405 | (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff); |
| 1406 | |
| 1407 | if (!TransactionIdIsValid(*xactptr)) |
| 1408 | { |
| 1409 | /* Corner case 3: we must be looking at unused slot zero */ |
| 1410 | Assert(offset == 0); |
| 1411 | continue; |
| 1412 | } |
| 1413 | |
| 1414 | flagsoff = MXOffsetToFlagsOffset(offset); |
| 1415 | bshift = MXOffsetToFlagsBitShift(offset); |
| 1416 | flagsptr = (uint32 *) (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff); |
| 1417 | |
| 1418 | ptr[truelength].xid = *xactptr; |
| 1419 | ptr[truelength].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK; |
| 1420 | truelength++; |
| 1421 | } |
| 1422 | |
| 1423 | LWLockRelease(MultiXactMemberControlLock); |
| 1424 | |
| 1425 | /* |
| 1426 | * Copy the result into the local cache. |
| 1427 | */ |
| 1428 | mXactCachePut(multi, truelength, ptr); |
| 1429 | |
| 1430 | debug_elog3(DEBUG2, "GetMembers: no cache for %s" , |
| 1431 | mxid_to_string(multi, truelength, ptr)); |
| 1432 | return truelength; |
| 1433 | } |
| 1434 | |
| 1435 | /* |
| 1436 | * mxactMemberComparator |
| 1437 | * qsort comparison function for MultiXactMember |
| 1438 | * |
| 1439 | * We can't use wraparound comparison for XIDs because that does not respect |
| 1440 | * the triangle inequality! Any old sort order will do. |
| 1441 | */ |
| 1442 | static int |
| 1443 | mxactMemberComparator(const void *arg1, const void *arg2) |
| 1444 | { |
| 1445 | MultiXactMember member1 = *(const MultiXactMember *) arg1; |
| 1446 | MultiXactMember member2 = *(const MultiXactMember *) arg2; |
| 1447 | |
| 1448 | if (member1.xid > member2.xid) |
| 1449 | return 1; |
| 1450 | if (member1.xid < member2.xid) |
| 1451 | return -1; |
| 1452 | if (member1.status > member2.status) |
| 1453 | return 1; |
| 1454 | if (member1.status < member2.status) |
| 1455 | return -1; |
| 1456 | return 0; |
| 1457 | } |
| 1458 | |
| 1459 | /* |
| 1460 | * mXactCacheGetBySet |
| 1461 | * returns a MultiXactId from the cache based on the set of |
| 1462 | * TransactionIds that compose it, or InvalidMultiXactId if |
| 1463 | * none matches. |
| 1464 | * |
| 1465 | * This is helpful, for example, if two transactions want to lock a huge |
| 1466 | * table. By using the cache, the second will use the same MultiXactId |
| 1467 | * for the majority of tuples, thus keeping MultiXactId usage low (saving |
| 1468 | * both I/O and wraparound issues). |
| 1469 | * |
| 1470 | * NB: the passed members array will be sorted in-place. |
| 1471 | */ |
| 1472 | static MultiXactId |
| 1473 | mXactCacheGetBySet(int nmembers, MultiXactMember *members) |
| 1474 | { |
| 1475 | dlist_iter iter; |
| 1476 | |
| 1477 | debug_elog3(DEBUG2, "CacheGet: looking for %s" , |
| 1478 | mxid_to_string(InvalidMultiXactId, nmembers, members)); |
| 1479 | |
| 1480 | /* sort the array so comparison is easy */ |
| 1481 | qsort(members, nmembers, sizeof(MultiXactMember), mxactMemberComparator); |
| 1482 | |
| 1483 | dlist_foreach(iter, &MXactCache) |
| 1484 | { |
| 1485 | mXactCacheEnt *entry = dlist_container(mXactCacheEnt, node, iter.cur); |
| 1486 | |
| 1487 | if (entry->nmembers != nmembers) |
| 1488 | continue; |
| 1489 | |
| 1490 | /* |
| 1491 | * We assume the cache entries are sorted, and that the unused bits in |
| 1492 | * "status" are zeroed. |
| 1493 | */ |
| 1494 | if (memcmp(members, entry->members, nmembers * sizeof(MultiXactMember)) == 0) |
| 1495 | { |
| 1496 | debug_elog3(DEBUG2, "CacheGet: found %u" , entry->multi); |
| 1497 | dlist_move_head(&MXactCache, iter.cur); |
| 1498 | return entry->multi; |
| 1499 | } |
| 1500 | } |
| 1501 | |
| 1502 | debug_elog2(DEBUG2, "CacheGet: not found :-(" ); |
| 1503 | return InvalidMultiXactId; |
| 1504 | } |
| 1505 | |
| 1506 | /* |
| 1507 | * mXactCacheGetById |
| 1508 | * returns the composing MultiXactMember set from the cache for a |
| 1509 | * given MultiXactId, if present. |
| 1510 | * |
| 1511 | * If successful, *xids is set to the address of a palloc'd copy of the |
| 1512 | * MultiXactMember set. Return value is number of members, or -1 on failure. |
| 1513 | */ |
| 1514 | static int |
| 1515 | mXactCacheGetById(MultiXactId multi, MultiXactMember **members) |
| 1516 | { |
| 1517 | dlist_iter iter; |
| 1518 | |
| 1519 | debug_elog3(DEBUG2, "CacheGet: looking for %u" , multi); |
| 1520 | |
| 1521 | dlist_foreach(iter, &MXactCache) |
| 1522 | { |
| 1523 | mXactCacheEnt *entry = dlist_container(mXactCacheEnt, node, iter.cur); |
| 1524 | |
| 1525 | if (entry->multi == multi) |
| 1526 | { |
| 1527 | MultiXactMember *ptr; |
| 1528 | Size size; |
| 1529 | |
| 1530 | size = sizeof(MultiXactMember) * entry->nmembers; |
| 1531 | ptr = (MultiXactMember *) palloc(size); |
| 1532 | *members = ptr; |
| 1533 | |
| 1534 | memcpy(ptr, entry->members, size); |
| 1535 | |
| 1536 | debug_elog3(DEBUG2, "CacheGet: found %s" , |
| 1537 | mxid_to_string(multi, |
| 1538 | entry->nmembers, |
| 1539 | entry->members)); |
| 1540 | |
| 1541 | /* |
| 1542 | * Note we modify the list while not using a modifiable iterator. |
| 1543 | * This is acceptable only because we exit the iteration |
| 1544 | * immediately afterwards. |
| 1545 | */ |
| 1546 | dlist_move_head(&MXactCache, iter.cur); |
| 1547 | |
| 1548 | return entry->nmembers; |
| 1549 | } |
| 1550 | } |
| 1551 | |
| 1552 | debug_elog2(DEBUG2, "CacheGet: not found" ); |
| 1553 | return -1; |
| 1554 | } |
| 1555 | |
| 1556 | /* |
| 1557 | * mXactCachePut |
| 1558 | * Add a new MultiXactId and its composing set into the local cache. |
| 1559 | */ |
| 1560 | static void |
| 1561 | mXactCachePut(MultiXactId multi, int nmembers, MultiXactMember *members) |
| 1562 | { |
| 1563 | mXactCacheEnt *entry; |
| 1564 | |
| 1565 | debug_elog3(DEBUG2, "CachePut: storing %s" , |
| 1566 | mxid_to_string(multi, nmembers, members)); |
| 1567 | |
| 1568 | if (MXactContext == NULL) |
| 1569 | { |
| 1570 | /* The cache only lives as long as the current transaction */ |
| 1571 | debug_elog2(DEBUG2, "CachePut: initializing memory context" ); |
| 1572 | MXactContext = AllocSetContextCreate(TopTransactionContext, |
| 1573 | "MultiXact cache context" , |
| 1574 | ALLOCSET_SMALL_SIZES); |
| 1575 | } |
| 1576 | |
| 1577 | entry = (mXactCacheEnt *) |
| 1578 | MemoryContextAlloc(MXactContext, |
| 1579 | offsetof(mXactCacheEnt, members) + |
| 1580 | nmembers * sizeof(MultiXactMember)); |
| 1581 | |
| 1582 | entry->multi = multi; |
| 1583 | entry->nmembers = nmembers; |
| 1584 | memcpy(entry->members, members, nmembers * sizeof(MultiXactMember)); |
| 1585 | |
| 1586 | /* mXactCacheGetBySet assumes the entries are sorted, so sort them */ |
| 1587 | qsort(entry->members, nmembers, sizeof(MultiXactMember), mxactMemberComparator); |
| 1588 | |
| 1589 | dlist_push_head(&MXactCache, &entry->node); |
| 1590 | if (MXactCacheMembers++ >= MAX_CACHE_ENTRIES) |
| 1591 | { |
| 1592 | dlist_node *node; |
| 1593 | mXactCacheEnt *entry; |
| 1594 | |
| 1595 | node = dlist_tail_node(&MXactCache); |
| 1596 | dlist_delete(node); |
| 1597 | MXactCacheMembers--; |
| 1598 | |
| 1599 | entry = dlist_container(mXactCacheEnt, node, node); |
| 1600 | debug_elog3(DEBUG2, "CachePut: pruning cached multi %u" , |
| 1601 | entry->multi); |
| 1602 | |
| 1603 | pfree(entry); |
| 1604 | } |
| 1605 | } |
| 1606 | |
| 1607 | static char * |
| 1608 | mxstatus_to_string(MultiXactStatus status) |
| 1609 | { |
| 1610 | switch (status) |
| 1611 | { |
| 1612 | case MultiXactStatusForKeyShare: |
| 1613 | return "keysh" ; |
| 1614 | case MultiXactStatusForShare: |
| 1615 | return "sh" ; |
| 1616 | case MultiXactStatusForNoKeyUpdate: |
| 1617 | return "fornokeyupd" ; |
| 1618 | case MultiXactStatusForUpdate: |
| 1619 | return "forupd" ; |
| 1620 | case MultiXactStatusNoKeyUpdate: |
| 1621 | return "nokeyupd" ; |
| 1622 | case MultiXactStatusUpdate: |
| 1623 | return "upd" ; |
| 1624 | default: |
| 1625 | elog(ERROR, "unrecognized multixact status %d" , status); |
| 1626 | return "" ; |
| 1627 | } |
| 1628 | } |
| 1629 | |
| 1630 | char * |
| 1631 | mxid_to_string(MultiXactId multi, int nmembers, MultiXactMember *members) |
| 1632 | { |
| 1633 | static char *str = NULL; |
| 1634 | StringInfoData buf; |
| 1635 | int i; |
| 1636 | |
| 1637 | if (str != NULL) |
| 1638 | pfree(str); |
| 1639 | |
| 1640 | initStringInfo(&buf); |
| 1641 | |
| 1642 | appendStringInfo(&buf, "%u %d[%u (%s)" , multi, nmembers, members[0].xid, |
| 1643 | mxstatus_to_string(members[0].status)); |
| 1644 | |
| 1645 | for (i = 1; i < nmembers; i++) |
| 1646 | appendStringInfo(&buf, ", %u (%s)" , members[i].xid, |
| 1647 | mxstatus_to_string(members[i].status)); |
| 1648 | |
| 1649 | appendStringInfoChar(&buf, ']'); |
| 1650 | str = MemoryContextStrdup(TopMemoryContext, buf.data); |
| 1651 | pfree(buf.data); |
| 1652 | return str; |
| 1653 | } |
| 1654 | |
| 1655 | /* |
| 1656 | * AtEOXact_MultiXact |
| 1657 | * Handle transaction end for MultiXact |
| 1658 | * |
| 1659 | * This is called at top transaction commit or abort (we don't care which). |
| 1660 | */ |
| 1661 | void |
| 1662 | AtEOXact_MultiXact(void) |
| 1663 | { |
| 1664 | /* |
| 1665 | * Reset our OldestMemberMXactId and OldestVisibleMXactId values, both of |
| 1666 | * which should only be valid while within a transaction. |
| 1667 | * |
| 1668 | * We assume that storing a MultiXactId is atomic and so we need not take |
| 1669 | * MultiXactGenLock to do this. |
| 1670 | */ |
| 1671 | OldestMemberMXactId[MyBackendId] = InvalidMultiXactId; |
| 1672 | OldestVisibleMXactId[MyBackendId] = InvalidMultiXactId; |
| 1673 | |
| 1674 | /* |
| 1675 | * Discard the local MultiXactId cache. Since MXactContext was created as |
| 1676 | * a child of TopTransactionContext, we needn't delete it explicitly. |
| 1677 | */ |
| 1678 | MXactContext = NULL; |
| 1679 | dlist_init(&MXactCache); |
| 1680 | MXactCacheMembers = 0; |
| 1681 | } |
| 1682 | |
| 1683 | /* |
| 1684 | * AtPrepare_MultiXact |
| 1685 | * Save multixact state at 2PC transaction prepare |
| 1686 | * |
| 1687 | * In this phase, we only store our OldestMemberMXactId value in the two-phase |
| 1688 | * state file. |
| 1689 | */ |
| 1690 | void |
| 1691 | AtPrepare_MultiXact(void) |
| 1692 | { |
| 1693 | MultiXactId myOldestMember = OldestMemberMXactId[MyBackendId]; |
| 1694 | |
| 1695 | if (MultiXactIdIsValid(myOldestMember)) |
| 1696 | RegisterTwoPhaseRecord(TWOPHASE_RM_MULTIXACT_ID, 0, |
| 1697 | &myOldestMember, sizeof(MultiXactId)); |
| 1698 | } |
| 1699 | |
| 1700 | /* |
| 1701 | * PostPrepare_MultiXact |
| 1702 | * Clean up after successful PREPARE TRANSACTION |
| 1703 | */ |
| 1704 | void |
| 1705 | PostPrepare_MultiXact(TransactionId xid) |
| 1706 | { |
| 1707 | MultiXactId myOldestMember; |
| 1708 | |
| 1709 | /* |
| 1710 | * Transfer our OldestMemberMXactId value to the slot reserved for the |
| 1711 | * prepared transaction. |
| 1712 | */ |
| 1713 | myOldestMember = OldestMemberMXactId[MyBackendId]; |
| 1714 | if (MultiXactIdIsValid(myOldestMember)) |
| 1715 | { |
| 1716 | BackendId dummyBackendId = TwoPhaseGetDummyBackendId(xid, false); |
| 1717 | |
| 1718 | /* |
| 1719 | * Even though storing MultiXactId is atomic, acquire lock to make |
| 1720 | * sure others see both changes, not just the reset of the slot of the |
| 1721 | * current backend. Using a volatile pointer might suffice, but this |
| 1722 | * isn't a hot spot. |
| 1723 | */ |
| 1724 | LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); |
| 1725 | |
| 1726 | OldestMemberMXactId[dummyBackendId] = myOldestMember; |
| 1727 | OldestMemberMXactId[MyBackendId] = InvalidMultiXactId; |
| 1728 | |
| 1729 | LWLockRelease(MultiXactGenLock); |
| 1730 | } |
| 1731 | |
| 1732 | /* |
| 1733 | * We don't need to transfer OldestVisibleMXactId value, because the |
| 1734 | * transaction is not going to be looking at any more multixacts once it's |
| 1735 | * prepared. |
| 1736 | * |
| 1737 | * We assume that storing a MultiXactId is atomic and so we need not take |
| 1738 | * MultiXactGenLock to do this. |
| 1739 | */ |
| 1740 | OldestVisibleMXactId[MyBackendId] = InvalidMultiXactId; |
| 1741 | |
| 1742 | /* |
| 1743 | * Discard the local MultiXactId cache like in AtEOX_MultiXact |
| 1744 | */ |
| 1745 | MXactContext = NULL; |
| 1746 | dlist_init(&MXactCache); |
| 1747 | MXactCacheMembers = 0; |
| 1748 | } |
| 1749 | |
| 1750 | /* |
| 1751 | * multixact_twophase_recover |
| 1752 | * Recover the state of a prepared transaction at startup |
| 1753 | */ |
| 1754 | void |
| 1755 | multixact_twophase_recover(TransactionId xid, uint16 info, |
| 1756 | void *recdata, uint32 len) |
| 1757 | { |
| 1758 | BackendId dummyBackendId = TwoPhaseGetDummyBackendId(xid, false); |
| 1759 | MultiXactId oldestMember; |
| 1760 | |
| 1761 | /* |
| 1762 | * Get the oldest member XID from the state file record, and set it in the |
| 1763 | * OldestMemberMXactId slot reserved for this prepared transaction. |
| 1764 | */ |
| 1765 | Assert(len == sizeof(MultiXactId)); |
| 1766 | oldestMember = *((MultiXactId *) recdata); |
| 1767 | |
| 1768 | OldestMemberMXactId[dummyBackendId] = oldestMember; |
| 1769 | } |
| 1770 | |
| 1771 | /* |
| 1772 | * multixact_twophase_postcommit |
| 1773 | * Similar to AtEOX_MultiXact but for COMMIT PREPARED |
| 1774 | */ |
| 1775 | void |
| 1776 | multixact_twophase_postcommit(TransactionId xid, uint16 info, |
| 1777 | void *recdata, uint32 len) |
| 1778 | { |
| 1779 | BackendId dummyBackendId = TwoPhaseGetDummyBackendId(xid, true); |
| 1780 | |
| 1781 | Assert(len == sizeof(MultiXactId)); |
| 1782 | |
| 1783 | OldestMemberMXactId[dummyBackendId] = InvalidMultiXactId; |
| 1784 | } |
| 1785 | |
| 1786 | /* |
| 1787 | * multixact_twophase_postabort |
| 1788 | * This is actually just the same as the COMMIT case. |
| 1789 | */ |
| 1790 | void |
| 1791 | multixact_twophase_postabort(TransactionId xid, uint16 info, |
| 1792 | void *recdata, uint32 len) |
| 1793 | { |
| 1794 | multixact_twophase_postcommit(xid, info, recdata, len); |
| 1795 | } |
| 1796 | |
| 1797 | /* |
| 1798 | * Initialization of shared memory for MultiXact. We use two SLRU areas, |
| 1799 | * thus double memory. Also, reserve space for the shared MultiXactState |
| 1800 | * struct and the per-backend MultiXactId arrays (two of those, too). |
| 1801 | */ |
| 1802 | Size |
| 1803 | MultiXactShmemSize(void) |
| 1804 | { |
| 1805 | Size size; |
| 1806 | |
| 1807 | /* We need 2*MaxOldestSlot + 1 perBackendXactIds[] entries */ |
| 1808 | #define SHARED_MULTIXACT_STATE_SIZE \ |
| 1809 | add_size(offsetof(MultiXactStateData, perBackendXactIds) + sizeof(MultiXactId), \ |
| 1810 | mul_size(sizeof(MultiXactId) * 2, MaxOldestSlot)) |
| 1811 | |
| 1812 | size = SHARED_MULTIXACT_STATE_SIZE; |
| 1813 | size = add_size(size, SimpleLruShmemSize(NUM_MXACTOFFSET_BUFFERS, 0)); |
| 1814 | size = add_size(size, SimpleLruShmemSize(NUM_MXACTMEMBER_BUFFERS, 0)); |
| 1815 | |
| 1816 | return size; |
| 1817 | } |
| 1818 | |
| 1819 | void |
| 1820 | MultiXactShmemInit(void) |
| 1821 | { |
| 1822 | bool found; |
| 1823 | |
| 1824 | debug_elog2(DEBUG2, "Shared Memory Init for MultiXact" ); |
| 1825 | |
| 1826 | MultiXactOffsetCtl->PagePrecedes = MultiXactOffsetPagePrecedes; |
| 1827 | MultiXactMemberCtl->PagePrecedes = MultiXactMemberPagePrecedes; |
| 1828 | |
| 1829 | SimpleLruInit(MultiXactOffsetCtl, |
| 1830 | "multixact_offset" , NUM_MXACTOFFSET_BUFFERS, 0, |
| 1831 | MultiXactOffsetControlLock, "pg_multixact/offsets" , |
| 1832 | LWTRANCHE_MXACTOFFSET_BUFFERS); |
| 1833 | SimpleLruInit(MultiXactMemberCtl, |
| 1834 | "multixact_member" , NUM_MXACTMEMBER_BUFFERS, 0, |
| 1835 | MultiXactMemberControlLock, "pg_multixact/members" , |
| 1836 | LWTRANCHE_MXACTMEMBER_BUFFERS); |
| 1837 | |
| 1838 | /* Initialize our shared state struct */ |
| 1839 | MultiXactState = ShmemInitStruct("Shared MultiXact State" , |
| 1840 | SHARED_MULTIXACT_STATE_SIZE, |
| 1841 | &found); |
| 1842 | if (!IsUnderPostmaster) |
| 1843 | { |
| 1844 | Assert(!found); |
| 1845 | |
| 1846 | /* Make sure we zero out the per-backend state */ |
| 1847 | MemSet(MultiXactState, 0, SHARED_MULTIXACT_STATE_SIZE); |
| 1848 | } |
| 1849 | else |
| 1850 | Assert(found); |
| 1851 | |
| 1852 | /* |
| 1853 | * Set up array pointers. Note that perBackendXactIds[0] is wasted space |
| 1854 | * since we only use indexes 1..MaxOldestSlot in each array. |
| 1855 | */ |
| 1856 | OldestMemberMXactId = MultiXactState->perBackendXactIds; |
| 1857 | OldestVisibleMXactId = OldestMemberMXactId + MaxOldestSlot; |
| 1858 | } |
| 1859 | |
| 1860 | /* |
| 1861 | * This func must be called ONCE on system install. It creates the initial |
| 1862 | * MultiXact segments. (The MultiXacts directories are assumed to have been |
| 1863 | * created by initdb, and MultiXactShmemInit must have been called already.) |
| 1864 | */ |
| 1865 | void |
| 1866 | BootStrapMultiXact(void) |
| 1867 | { |
| 1868 | int slotno; |
| 1869 | |
| 1870 | LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE); |
| 1871 | |
| 1872 | /* Create and zero the first page of the offsets log */ |
| 1873 | slotno = ZeroMultiXactOffsetPage(0, false); |
| 1874 | |
| 1875 | /* Make sure it's written out */ |
| 1876 | SimpleLruWritePage(MultiXactOffsetCtl, slotno); |
| 1877 | Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]); |
| 1878 | |
| 1879 | LWLockRelease(MultiXactOffsetControlLock); |
| 1880 | |
| 1881 | LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE); |
| 1882 | |
| 1883 | /* Create and zero the first page of the members log */ |
| 1884 | slotno = ZeroMultiXactMemberPage(0, false); |
| 1885 | |
| 1886 | /* Make sure it's written out */ |
| 1887 | SimpleLruWritePage(MultiXactMemberCtl, slotno); |
| 1888 | Assert(!MultiXactMemberCtl->shared->page_dirty[slotno]); |
| 1889 | |
| 1890 | LWLockRelease(MultiXactMemberControlLock); |
| 1891 | } |
| 1892 | |
| 1893 | /* |
| 1894 | * Initialize (or reinitialize) a page of MultiXactOffset to zeroes. |
| 1895 | * If writeXlog is true, also emit an XLOG record saying we did this. |
| 1896 | * |
| 1897 | * The page is not actually written, just set up in shared memory. |
| 1898 | * The slot number of the new page is returned. |
| 1899 | * |
| 1900 | * Control lock must be held at entry, and will be held at exit. |
| 1901 | */ |
| 1902 | static int |
| 1903 | ZeroMultiXactOffsetPage(int pageno, bool writeXlog) |
| 1904 | { |
| 1905 | int slotno; |
| 1906 | |
| 1907 | slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno); |
| 1908 | |
| 1909 | if (writeXlog) |
| 1910 | WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_OFF_PAGE); |
| 1911 | |
| 1912 | return slotno; |
| 1913 | } |
| 1914 | |
| 1915 | /* |
| 1916 | * Ditto, for MultiXactMember |
| 1917 | */ |
| 1918 | static int |
| 1919 | ZeroMultiXactMemberPage(int pageno, bool writeXlog) |
| 1920 | { |
| 1921 | int slotno; |
| 1922 | |
| 1923 | slotno = SimpleLruZeroPage(MultiXactMemberCtl, pageno); |
| 1924 | |
| 1925 | if (writeXlog) |
| 1926 | WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_MEM_PAGE); |
| 1927 | |
| 1928 | return slotno; |
| 1929 | } |
| 1930 | |
| 1931 | /* |
| 1932 | * MaybeExtendOffsetSlru |
| 1933 | * Extend the offsets SLRU area, if necessary |
| 1934 | * |
| 1935 | * After a binary upgrade from <= 9.2, the pg_multixact/offsets SLRU area might |
| 1936 | * contain files that are shorter than necessary; this would occur if the old |
| 1937 | * installation had used multixacts beyond the first page (files cannot be |
| 1938 | * copied, because the on-disk representation is different). pg_upgrade would |
| 1939 | * update pg_control to set the next offset value to be at that position, so |
| 1940 | * that tuples marked as locked by such MultiXacts would be seen as visible |
| 1941 | * without having to consult multixact. However, trying to create and use a |
| 1942 | * new MultiXactId would result in an error because the page on which the new |
| 1943 | * value would reside does not exist. This routine is in charge of creating |
| 1944 | * such pages. |
| 1945 | */ |
| 1946 | static void |
| 1947 | MaybeExtendOffsetSlru(void) |
| 1948 | { |
| 1949 | int pageno; |
| 1950 | |
| 1951 | pageno = MultiXactIdToOffsetPage(MultiXactState->nextMXact); |
| 1952 | |
| 1953 | LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE); |
| 1954 | |
| 1955 | if (!SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl, pageno)) |
| 1956 | { |
| 1957 | int slotno; |
| 1958 | |
| 1959 | /* |
| 1960 | * Fortunately for us, SimpleLruWritePage is already prepared to deal |
| 1961 | * with creating a new segment file even if the page we're writing is |
| 1962 | * not the first in it, so this is enough. |
| 1963 | */ |
| 1964 | slotno = ZeroMultiXactOffsetPage(pageno, false); |
| 1965 | SimpleLruWritePage(MultiXactOffsetCtl, slotno); |
| 1966 | } |
| 1967 | |
| 1968 | LWLockRelease(MultiXactOffsetControlLock); |
| 1969 | } |
| 1970 | |
| 1971 | /* |
| 1972 | * This must be called ONCE during postmaster or standalone-backend startup. |
| 1973 | * |
| 1974 | * StartupXLOG has already established nextMXact/nextOffset by calling |
| 1975 | * MultiXactSetNextMXact and/or MultiXactAdvanceNextMXact, and the oldestMulti |
| 1976 | * info from pg_control and/or MultiXactAdvanceOldest, but we haven't yet |
| 1977 | * replayed WAL. |
| 1978 | */ |
| 1979 | void |
| 1980 | StartupMultiXact(void) |
| 1981 | { |
| 1982 | MultiXactId multi = MultiXactState->nextMXact; |
| 1983 | MultiXactOffset offset = MultiXactState->nextOffset; |
| 1984 | int pageno; |
| 1985 | |
| 1986 | /* |
| 1987 | * Initialize offset's idea of the latest page number. |
| 1988 | */ |
| 1989 | pageno = MultiXactIdToOffsetPage(multi); |
| 1990 | MultiXactOffsetCtl->shared->latest_page_number = pageno; |
| 1991 | |
| 1992 | /* |
| 1993 | * Initialize member's idea of the latest page number. |
| 1994 | */ |
| 1995 | pageno = MXOffsetToMemberPage(offset); |
| 1996 | MultiXactMemberCtl->shared->latest_page_number = pageno; |
| 1997 | } |
| 1998 | |
/*
 * TrimMultiXact
 *		Finish MultiXact startup once recovery is over.
 *
 * This must be called ONCE at the end of startup/recovery.
 *
 * The routine works in four steps:
 *	1. take a snapshot of nextMXact, nextOffset and the oldest-multi
 *	   information under MultiXactGenLock;
 *	2. reset the offsets SLRU's latest page number and zero the unused tail
 *	   of the page that contains nextMXact;
 *	3. do the same for the members SLRU and the page containing nextOffset;
 *	4. set MultiXactState->finishedStartup and recompute the multixact
 *	   limits through SetMultiXactIdLimit.
 */
void
TrimMultiXact(void)
{
	MultiXactId nextMXact;
	MultiXactOffset offset;
	MultiXactId oldestMXact;
	Oid			oldestMXactDB;
	int			pageno;
	int			entryno;
	int			flagsoff;

	/* Read a consistent set of counters; a shared lock suffices for that. */
	LWLockAcquire(MultiXactGenLock, LW_SHARED);
	nextMXact = MultiXactState->nextMXact;
	offset = MultiXactState->nextOffset;
	oldestMXact = MultiXactState->oldestMultiXactId;
	oldestMXactDB = MultiXactState->oldestMultiXactDB;
	LWLockRelease(MultiXactGenLock);

	/* Clean up offsets state */
	LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE);

	/*
	 * (Re-)Initialize our idea of the latest page number for offsets.
	 */
	pageno = MultiXactIdToOffsetPage(nextMXact);
	MultiXactOffsetCtl->shared->latest_page_number = pageno;

	/*
	 * Zero out the remainder of the current offsets page.  See notes in
	 * TrimCLOG() for background.  Unlike CLOG, some WAL record covers every
	 * pg_multixact SLRU mutation.  Since, also unlike CLOG, we ignore the WAL
	 * rule "write xlog before data," nextMXact successors may carry obsolete,
	 * nonzero offset values.  Zero those so case 2 of GetMultiXactIdMembers()
	 * operates normally.
	 *
	 * When nextMXact is the first entry of its page (entryno == 0), the page
	 * is neither read nor modified here.
	 */
	entryno = MultiXactIdToOffsetEntry(nextMXact);
	if (entryno != 0)
	{
		int			slotno;
		MultiXactOffset *offptr;

		slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact);
		offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
		offptr += entryno;

		/* Clear from nextMXact's own entry through the end of the page. */
		MemSet(offptr, 0, BLCKSZ - (entryno * sizeof(MultiXactOffset)));

		/* Mark the buffer dirty so the zeroed tail gets written back. */
		MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
	}

	LWLockRelease(MultiXactOffsetControlLock);

	/* And the same for members */
	LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE);

	/*
	 * (Re-)Initialize our idea of the latest page number for members.
	 */
	pageno = MXOffsetToMemberPage(offset);
	MultiXactMemberCtl->shared->latest_page_number = pageno;

	/*
	 * Zero out the remainder of the current members page.  See notes in
	 * TrimCLOG() for motivation.
	 *
	 * The trim is skipped when the flags offset computed for nextOffset is
	 * zero.
	 */
	flagsoff = MXOffsetToFlagsOffset(offset);
	if (flagsoff != 0)
	{
		int			slotno;
		TransactionId *xidptr;
		int			memberoff;

		memberoff = MXOffsetToMemberOffset(offset);
		slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, offset);
		xidptr = (TransactionId *)
			(MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);

		/* Clear from nextOffset's member slot through the end of the page. */
		MemSet(xidptr, 0, BLCKSZ - memberoff);

		/*
		 * Note: we don't need to zero out the flag bits in the remaining
		 * members of the current group, because they are always reset before
		 * writing.
		 */

		MultiXactMemberCtl->shared->page_dirty[slotno] = true;
	}

	LWLockRelease(MultiXactMemberControlLock);

	/*
	 * signal that we're officially up
	 *
	 * SetMultiXactIdLimit returns early while finishedStartup is still
	 * false, so the flag has to be set before the call below.
	 */
	LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
	MultiXactState->finishedStartup = true;
	LWLockRelease(MultiXactGenLock);

	/* Now compute how far away the next members wraparound is. */
	SetMultiXactIdLimit(oldestMXact, oldestMXactDB, true);
}
| 2100 | |
| 2101 | /* |
| 2102 | * This must be called ONCE during postmaster or standalone-backend shutdown |
| 2103 | */ |
| 2104 | void |
| 2105 | ShutdownMultiXact(void) |
| 2106 | { |
| 2107 | /* Flush dirty MultiXact pages to disk */ |
| 2108 | TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_START(false); |
| 2109 | SimpleLruFlush(MultiXactOffsetCtl, false); |
| 2110 | SimpleLruFlush(MultiXactMemberCtl, false); |
| 2111 | TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(false); |
| 2112 | } |
| 2113 | |
| 2114 | /* |
| 2115 | * Get the MultiXact data to save in a checkpoint record |
| 2116 | */ |
| 2117 | void |
| 2118 | MultiXactGetCheckptMulti(bool is_shutdown, |
| 2119 | MultiXactId *nextMulti, |
| 2120 | MultiXactOffset *nextMultiOffset, |
| 2121 | MultiXactId *oldestMulti, |
| 2122 | Oid *oldestMultiDB) |
| 2123 | { |
| 2124 | LWLockAcquire(MultiXactGenLock, LW_SHARED); |
| 2125 | *nextMulti = MultiXactState->nextMXact; |
| 2126 | *nextMultiOffset = MultiXactState->nextOffset; |
| 2127 | *oldestMulti = MultiXactState->oldestMultiXactId; |
| 2128 | *oldestMultiDB = MultiXactState->oldestMultiXactDB; |
| 2129 | LWLockRelease(MultiXactGenLock); |
| 2130 | |
| 2131 | debug_elog6(DEBUG2, |
| 2132 | "MultiXact: checkpoint is nextMulti %u, nextOffset %u, oldestMulti %u in DB %u" , |
| 2133 | *nextMulti, *nextMultiOffset, *oldestMulti, *oldestMultiDB); |
| 2134 | } |
| 2135 | |
| 2136 | /* |
| 2137 | * Perform a checkpoint --- either during shutdown, or on-the-fly |
| 2138 | */ |
| 2139 | void |
| 2140 | CheckPointMultiXact(void) |
| 2141 | { |
| 2142 | TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_START(true); |
| 2143 | |
| 2144 | /* Flush dirty MultiXact pages to disk */ |
| 2145 | SimpleLruFlush(MultiXactOffsetCtl, true); |
| 2146 | SimpleLruFlush(MultiXactMemberCtl, true); |
| 2147 | |
| 2148 | TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true); |
| 2149 | } |
| 2150 | |
| 2151 | /* |
| 2152 | * Set the next-to-be-assigned MultiXactId and offset |
| 2153 | * |
| 2154 | * This is used when we can determine the correct next ID/offset exactly |
| 2155 | * from a checkpoint record. Although this is only called during bootstrap |
| 2156 | * and XLog replay, we take the lock in case any hot-standby backends are |
| 2157 | * examining the values. |
| 2158 | */ |
| 2159 | void |
| 2160 | MultiXactSetNextMXact(MultiXactId nextMulti, |
| 2161 | MultiXactOffset nextMultiOffset) |
| 2162 | { |
| 2163 | debug_elog4(DEBUG2, "MultiXact: setting next multi to %u offset %u" , |
| 2164 | nextMulti, nextMultiOffset); |
| 2165 | LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); |
| 2166 | MultiXactState->nextMXact = nextMulti; |
| 2167 | MultiXactState->nextOffset = nextMultiOffset; |
| 2168 | LWLockRelease(MultiXactGenLock); |
| 2169 | |
| 2170 | /* |
| 2171 | * During a binary upgrade, make sure that the offsets SLRU is large |
| 2172 | * enough to contain the next value that would be created. |
| 2173 | * |
| 2174 | * We need to do this pretty early during the first startup in binary |
| 2175 | * upgrade mode: before StartupMultiXact() in fact, because this routine |
| 2176 | * is called even before that by StartupXLOG(). And we can't do it |
| 2177 | * earlier than at this point, because during that first call of this |
| 2178 | * routine we determine the MultiXactState->nextMXact value that |
| 2179 | * MaybeExtendOffsetSlru needs. |
| 2180 | */ |
| 2181 | if (IsBinaryUpgrade) |
| 2182 | MaybeExtendOffsetSlru(); |
| 2183 | } |
| 2184 | |
| 2185 | /* |
| 2186 | * Determine the last safe MultiXactId to allocate given the currently oldest |
| 2187 | * datminmxid (ie, the oldest MultiXactId that might exist in any database |
| 2188 | * of our cluster), and the OID of the (or a) database with that value. |
| 2189 | * |
| 2190 | * is_startup is true when we are just starting the cluster, false when we |
| 2191 | * are updating state in a running cluster. This only affects log messages. |
| 2192 | */ |
| 2193 | void |
| 2194 | SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid, |
| 2195 | bool is_startup) |
| 2196 | { |
| 2197 | MultiXactId multiVacLimit; |
| 2198 | MultiXactId multiWarnLimit; |
| 2199 | MultiXactId multiStopLimit; |
| 2200 | MultiXactId multiWrapLimit; |
| 2201 | MultiXactId curMulti; |
| 2202 | bool needs_offset_vacuum; |
| 2203 | |
| 2204 | Assert(MultiXactIdIsValid(oldest_datminmxid)); |
| 2205 | |
| 2206 | /* |
| 2207 | * We pretend that a wrap will happen halfway through the multixact ID |
| 2208 | * space, but that's not really true, because multixacts wrap differently |
| 2209 | * from transaction IDs. Note that, separately from any concern about |
| 2210 | * multixact IDs wrapping, we must ensure that multixact members do not |
| 2211 | * wrap. Limits for that are set in SetOffsetVacuumLimit, not here. |
| 2212 | */ |
| 2213 | multiWrapLimit = oldest_datminmxid + (MaxMultiXactId >> 1); |
| 2214 | if (multiWrapLimit < FirstMultiXactId) |
| 2215 | multiWrapLimit += FirstMultiXactId; |
| 2216 | |
| 2217 | /* |
| 2218 | * We'll refuse to continue assigning MultiXactIds once we get within 100 |
| 2219 | * multi of data loss. |
| 2220 | * |
| 2221 | * Note: This differs from the magic number used in |
| 2222 | * SetTransactionIdLimit() since vacuum itself will never generate new |
| 2223 | * multis. XXX actually it does, if it needs to freeze old multis. |
| 2224 | */ |
| 2225 | multiStopLimit = multiWrapLimit - 100; |
| 2226 | if (multiStopLimit < FirstMultiXactId) |
| 2227 | multiStopLimit -= FirstMultiXactId; |
| 2228 | |
| 2229 | /* |
| 2230 | * We'll start complaining loudly when we get within 10M multis of the |
| 2231 | * stop point. This is kind of arbitrary, but if you let your gas gauge |
| 2232 | * get down to 1% of full, would you be looking for the next gas station? |
| 2233 | * We need to be fairly liberal about this number because there are lots |
| 2234 | * of scenarios where most transactions are done by automatic clients that |
| 2235 | * won't pay attention to warnings. (No, we're not gonna make this |
| 2236 | * configurable. If you know enough to configure it, you know enough to |
| 2237 | * not get in this kind of trouble in the first place.) |
| 2238 | */ |
| 2239 | multiWarnLimit = multiStopLimit - 10000000; |
| 2240 | if (multiWarnLimit < FirstMultiXactId) |
| 2241 | multiWarnLimit -= FirstMultiXactId; |
| 2242 | |
| 2243 | /* |
| 2244 | * We'll start trying to force autovacuums when oldest_datminmxid gets to |
| 2245 | * be more than autovacuum_multixact_freeze_max_age mxids old. |
| 2246 | * |
| 2247 | * Note: autovacuum_multixact_freeze_max_age is a PGC_POSTMASTER parameter |
| 2248 | * so that we don't have to worry about dealing with on-the-fly changes in |
| 2249 | * its value. See SetTransactionIdLimit. |
| 2250 | */ |
| 2251 | multiVacLimit = oldest_datminmxid + autovacuum_multixact_freeze_max_age; |
| 2252 | if (multiVacLimit < FirstMultiXactId) |
| 2253 | multiVacLimit += FirstMultiXactId; |
| 2254 | |
| 2255 | /* Grab lock for just long enough to set the new limit values */ |
| 2256 | LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); |
| 2257 | MultiXactState->oldestMultiXactId = oldest_datminmxid; |
| 2258 | MultiXactState->oldestMultiXactDB = oldest_datoid; |
| 2259 | MultiXactState->multiVacLimit = multiVacLimit; |
| 2260 | MultiXactState->multiWarnLimit = multiWarnLimit; |
| 2261 | MultiXactState->multiStopLimit = multiStopLimit; |
| 2262 | MultiXactState->multiWrapLimit = multiWrapLimit; |
| 2263 | curMulti = MultiXactState->nextMXact; |
| 2264 | LWLockRelease(MultiXactGenLock); |
| 2265 | |
| 2266 | /* Log the info */ |
| 2267 | ereport(DEBUG1, |
| 2268 | (errmsg("MultiXactId wrap limit is %u, limited by database with OID %u" , |
| 2269 | multiWrapLimit, oldest_datoid))); |
| 2270 | |
| 2271 | /* |
| 2272 | * Computing the actual limits is only possible once the data directory is |
| 2273 | * in a consistent state. There's no need to compute the limits while |
| 2274 | * still replaying WAL - no decisions about new multis are made even |
| 2275 | * though multixact creations might be replayed. So we'll only do further |
| 2276 | * checks after TrimMultiXact() has been called. |
| 2277 | */ |
| 2278 | if (!MultiXactState->finishedStartup) |
| 2279 | return; |
| 2280 | |
| 2281 | Assert(!InRecovery); |
| 2282 | |
| 2283 | /* Set limits for offset vacuum. */ |
| 2284 | needs_offset_vacuum = SetOffsetVacuumLimit(is_startup); |
| 2285 | |
| 2286 | /* |
| 2287 | * If past the autovacuum force point, immediately signal an autovac |
| 2288 | * request. The reason for this is that autovac only processes one |
| 2289 | * database per invocation. Once it's finished cleaning up the oldest |
| 2290 | * database, it'll call here, and we'll signal the postmaster to start |
| 2291 | * another iteration immediately if there are still any old databases. |
| 2292 | */ |
| 2293 | if ((MultiXactIdPrecedes(multiVacLimit, curMulti) || |
| 2294 | needs_offset_vacuum) && IsUnderPostmaster) |
| 2295 | SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); |
| 2296 | |
| 2297 | /* Give an immediate warning if past the wrap warn point */ |
| 2298 | if (MultiXactIdPrecedes(multiWarnLimit, curMulti)) |
| 2299 | { |
| 2300 | char *oldest_datname; |
| 2301 | |
| 2302 | /* |
| 2303 | * We can be called when not inside a transaction, for example during |
| 2304 | * StartupXLOG(). In such a case we cannot do database access, so we |
| 2305 | * must just report the oldest DB's OID. |
| 2306 | * |
| 2307 | * Note: it's also possible that get_database_name fails and returns |
| 2308 | * NULL, for example because the database just got dropped. We'll |
| 2309 | * still warn, even though the warning might now be unnecessary. |
| 2310 | */ |
| 2311 | if (IsTransactionState()) |
| 2312 | oldest_datname = get_database_name(oldest_datoid); |
| 2313 | else |
| 2314 | oldest_datname = NULL; |
| 2315 | |
| 2316 | if (oldest_datname) |
| 2317 | ereport(WARNING, |
| 2318 | (errmsg_plural("database \"%s\" must be vacuumed before %u more MultiXactId is used" , |
| 2319 | "database \"%s\" must be vacuumed before %u more MultiXactIds are used" , |
| 2320 | multiWrapLimit - curMulti, |
| 2321 | oldest_datname, |
| 2322 | multiWrapLimit - curMulti), |
| 2323 | errhint("To avoid a database shutdown, execute a database-wide VACUUM in that database.\n" |
| 2324 | "You might also need to commit or roll back old prepared transactions, or drop stale replication slots." ))); |
| 2325 | else |
| 2326 | ereport(WARNING, |
| 2327 | (errmsg_plural("database with OID %u must be vacuumed before %u more MultiXactId is used" , |
| 2328 | "database with OID %u must be vacuumed before %u more MultiXactIds are used" , |
| 2329 | multiWrapLimit - curMulti, |
| 2330 | oldest_datoid, |
| 2331 | multiWrapLimit - curMulti), |
| 2332 | errhint("To avoid a database shutdown, execute a database-wide VACUUM in that database.\n" |
| 2333 | "You might also need to commit or roll back old prepared transactions, or drop stale replication slots." ))); |
| 2334 | } |
| 2335 | } |
| 2336 | |
| 2337 | /* |
| 2338 | * Ensure the next-to-be-assigned MultiXactId is at least minMulti, |
| 2339 | * and similarly nextOffset is at least minMultiOffset. |
| 2340 | * |
| 2341 | * This is used when we can determine minimum safe values from an XLog |
| 2342 | * record (either an on-line checkpoint or an mxact creation log entry). |
| 2343 | * Although this is only called during XLog replay, we take the lock in case |
| 2344 | * any hot-standby backends are examining the values. |
| 2345 | */ |
| 2346 | void |
| 2347 | MultiXactAdvanceNextMXact(MultiXactId minMulti, |
| 2348 | MultiXactOffset minMultiOffset) |
| 2349 | { |
| 2350 | LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); |
| 2351 | if (MultiXactIdPrecedes(MultiXactState->nextMXact, minMulti)) |
| 2352 | { |
| 2353 | debug_elog3(DEBUG2, "MultiXact: setting next multi to %u" , minMulti); |
| 2354 | MultiXactState->nextMXact = minMulti; |
| 2355 | } |
| 2356 | if (MultiXactOffsetPrecedes(MultiXactState->nextOffset, minMultiOffset)) |
| 2357 | { |
| 2358 | debug_elog3(DEBUG2, "MultiXact: setting next offset to %u" , |
| 2359 | minMultiOffset); |
| 2360 | MultiXactState->nextOffset = minMultiOffset; |
| 2361 | } |
| 2362 | LWLockRelease(MultiXactGenLock); |
| 2363 | } |
| 2364 | |
| 2365 | /* |
| 2366 | * Update our oldestMultiXactId value, but only if it's more recent than what |
| 2367 | * we had. |
| 2368 | * |
| 2369 | * This may only be called during WAL replay. |
| 2370 | */ |
| 2371 | void |
| 2372 | MultiXactAdvanceOldest(MultiXactId oldestMulti, Oid oldestMultiDB) |
| 2373 | { |
| 2374 | Assert(InRecovery); |
| 2375 | |
| 2376 | if (MultiXactIdPrecedes(MultiXactState->oldestMultiXactId, oldestMulti)) |
| 2377 | SetMultiXactIdLimit(oldestMulti, oldestMultiDB, false); |
| 2378 | } |
| 2379 | |
| 2380 | /* |
| 2381 | * Make sure that MultiXactOffset has room for a newly-allocated MultiXactId. |
| 2382 | * |
| 2383 | * NB: this is called while holding MultiXactGenLock. We want it to be very |
| 2384 | * fast most of the time; even when it's not so fast, no actual I/O need |
| 2385 | * happen unless we're forced to write out a dirty log or xlog page to make |
| 2386 | * room in shared memory. |
| 2387 | */ |
| 2388 | static void |
| 2389 | ExtendMultiXactOffset(MultiXactId multi) |
| 2390 | { |
| 2391 | int pageno; |
| 2392 | |
| 2393 | /* |
| 2394 | * No work except at first MultiXactId of a page. But beware: just after |
| 2395 | * wraparound, the first MultiXactId of page zero is FirstMultiXactId. |
| 2396 | */ |
| 2397 | if (MultiXactIdToOffsetEntry(multi) != 0 && |
| 2398 | multi != FirstMultiXactId) |
| 2399 | return; |
| 2400 | |
| 2401 | pageno = MultiXactIdToOffsetPage(multi); |
| 2402 | |
| 2403 | LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE); |
| 2404 | |
| 2405 | /* Zero the page and make an XLOG entry about it */ |
| 2406 | ZeroMultiXactOffsetPage(pageno, true); |
| 2407 | |
| 2408 | LWLockRelease(MultiXactOffsetControlLock); |
| 2409 | } |
| 2410 | |
| 2411 | /* |
| 2412 | * Make sure that MultiXactMember has room for the members of a newly- |
| 2413 | * allocated MultiXactId. |
| 2414 | * |
| 2415 | * Like the above routine, this is called while holding MultiXactGenLock; |
| 2416 | * same comments apply. |
| 2417 | */ |
| 2418 | static void |
| 2419 | ExtendMultiXactMember(MultiXactOffset offset, int nmembers) |
| 2420 | { |
| 2421 | /* |
| 2422 | * It's possible that the members span more than one page of the members |
| 2423 | * file, so we loop to ensure we consider each page. The coding is not |
| 2424 | * optimal if the members span several pages, but that seems unusual |
| 2425 | * enough to not worry much about. |
| 2426 | */ |
| 2427 | while (nmembers > 0) |
| 2428 | { |
| 2429 | int flagsoff; |
| 2430 | int flagsbit; |
| 2431 | uint32 difference; |
| 2432 | |
| 2433 | /* |
| 2434 | * Only zero when at first entry of a page. |
| 2435 | */ |
| 2436 | flagsoff = MXOffsetToFlagsOffset(offset); |
| 2437 | flagsbit = MXOffsetToFlagsBitShift(offset); |
| 2438 | if (flagsoff == 0 && flagsbit == 0) |
| 2439 | { |
| 2440 | int pageno; |
| 2441 | |
| 2442 | pageno = MXOffsetToMemberPage(offset); |
| 2443 | |
| 2444 | LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE); |
| 2445 | |
| 2446 | /* Zero the page and make an XLOG entry about it */ |
| 2447 | ZeroMultiXactMemberPage(pageno, true); |
| 2448 | |
| 2449 | LWLockRelease(MultiXactMemberControlLock); |
| 2450 | } |
| 2451 | |
| 2452 | /* |
| 2453 | * Compute the number of items till end of current page. Careful: if |
| 2454 | * addition of unsigned ints wraps around, we're at the last page of |
| 2455 | * the last segment; since that page holds a different number of items |
| 2456 | * than other pages, we need to do it differently. |
| 2457 | */ |
| 2458 | if (offset + MAX_MEMBERS_IN_LAST_MEMBERS_PAGE < offset) |
| 2459 | { |
| 2460 | /* |
| 2461 | * This is the last page of the last segment; we can compute the |
| 2462 | * number of items left to allocate in it without modulo |
| 2463 | * arithmetic. |
| 2464 | */ |
| 2465 | difference = MaxMultiXactOffset - offset + 1; |
| 2466 | } |
| 2467 | else |
| 2468 | difference = MULTIXACT_MEMBERS_PER_PAGE - offset % MULTIXACT_MEMBERS_PER_PAGE; |
| 2469 | |
| 2470 | /* |
| 2471 | * Advance to next page, taking care to properly handle the wraparound |
| 2472 | * case. OK if nmembers goes negative. |
| 2473 | */ |
| 2474 | nmembers -= difference; |
| 2475 | offset += difference; |
| 2476 | } |
| 2477 | } |
| 2478 | |
| 2479 | /* |
| 2480 | * GetOldestMultiXactId |
| 2481 | * |
| 2482 | * Return the oldest MultiXactId that's still possibly still seen as live by |
| 2483 | * any running transaction. Older ones might still exist on disk, but they no |
| 2484 | * longer have any running member transaction. |
| 2485 | * |
| 2486 | * It's not safe to truncate MultiXact SLRU segments on the value returned by |
| 2487 | * this function; however, it can be used by a full-table vacuum to set the |
| 2488 | * point at which it will be possible to truncate SLRU for that table. |
| 2489 | */ |
| 2490 | MultiXactId |
| 2491 | GetOldestMultiXactId(void) |
| 2492 | { |
| 2493 | MultiXactId oldestMXact; |
| 2494 | MultiXactId nextMXact; |
| 2495 | int i; |
| 2496 | |
| 2497 | /* |
| 2498 | * This is the oldest valid value among all the OldestMemberMXactId[] and |
| 2499 | * OldestVisibleMXactId[] entries, or nextMXact if none are valid. |
| 2500 | */ |
| 2501 | LWLockAcquire(MultiXactGenLock, LW_SHARED); |
| 2502 | |
| 2503 | /* |
| 2504 | * We have to beware of the possibility that nextMXact is in the |
| 2505 | * wrapped-around state. We don't fix the counter itself here, but we |
| 2506 | * must be sure to use a valid value in our calculation. |
| 2507 | */ |
| 2508 | nextMXact = MultiXactState->nextMXact; |
| 2509 | if (nextMXact < FirstMultiXactId) |
| 2510 | nextMXact = FirstMultiXactId; |
| 2511 | |
| 2512 | oldestMXact = nextMXact; |
| 2513 | for (i = 1; i <= MaxOldestSlot; i++) |
| 2514 | { |
| 2515 | MultiXactId thisoldest; |
| 2516 | |
| 2517 | thisoldest = OldestMemberMXactId[i]; |
| 2518 | if (MultiXactIdIsValid(thisoldest) && |
| 2519 | MultiXactIdPrecedes(thisoldest, oldestMXact)) |
| 2520 | oldestMXact = thisoldest; |
| 2521 | thisoldest = OldestVisibleMXactId[i]; |
| 2522 | if (MultiXactIdIsValid(thisoldest) && |
| 2523 | MultiXactIdPrecedes(thisoldest, oldestMXact)) |
| 2524 | oldestMXact = thisoldest; |
| 2525 | } |
| 2526 | |
| 2527 | LWLockRelease(MultiXactGenLock); |
| 2528 | |
| 2529 | return oldestMXact; |
| 2530 | } |
| 2531 | |
| 2532 | /* |
| 2533 | * Determine how aggressively we need to vacuum in order to prevent member |
| 2534 | * wraparound. |
| 2535 | * |
| 2536 | * To do so determine what's the oldest member offset and install the limit |
| 2537 | * info in MultiXactState, where it can be used to prevent overrun of old data |
| 2538 | * in the members SLRU area. |
| 2539 | * |
| 2540 | * The return value is true if emergency autovacuum is required and false |
| 2541 | * otherwise. |
| 2542 | */ |
| 2543 | static bool |
| 2544 | SetOffsetVacuumLimit(bool is_startup) |
| 2545 | { |
| 2546 | MultiXactId oldestMultiXactId; |
| 2547 | MultiXactId nextMXact; |
| 2548 | MultiXactOffset oldestOffset = 0; /* placate compiler */ |
| 2549 | MultiXactOffset prevOldestOffset; |
| 2550 | MultiXactOffset nextOffset; |
| 2551 | bool oldestOffsetKnown = false; |
| 2552 | bool prevOldestOffsetKnown; |
| 2553 | MultiXactOffset offsetStopLimit = 0; |
| 2554 | MultiXactOffset prevOffsetStopLimit; |
| 2555 | |
| 2556 | /* |
| 2557 | * NB: Have to prevent concurrent truncation, we might otherwise try to |
| 2558 | * lookup an oldestMulti that's concurrently getting truncated away. |
| 2559 | */ |
| 2560 | LWLockAcquire(MultiXactTruncationLock, LW_SHARED); |
| 2561 | |
| 2562 | /* Read relevant fields from shared memory. */ |
| 2563 | LWLockAcquire(MultiXactGenLock, LW_SHARED); |
| 2564 | oldestMultiXactId = MultiXactState->oldestMultiXactId; |
| 2565 | nextMXact = MultiXactState->nextMXact; |
| 2566 | nextOffset = MultiXactState->nextOffset; |
| 2567 | prevOldestOffsetKnown = MultiXactState->oldestOffsetKnown; |
| 2568 | prevOldestOffset = MultiXactState->oldestOffset; |
| 2569 | prevOffsetStopLimit = MultiXactState->offsetStopLimit; |
| 2570 | Assert(MultiXactState->finishedStartup); |
| 2571 | LWLockRelease(MultiXactGenLock); |
| 2572 | |
| 2573 | /* |
| 2574 | * Determine the offset of the oldest multixact. Normally, we can read |
| 2575 | * the offset from the multixact itself, but there's an important special |
| 2576 | * case: if there are no multixacts in existence at all, oldestMXact |
| 2577 | * obviously can't point to one. It will instead point to the multixact |
| 2578 | * ID that will be assigned the next time one is needed. |
| 2579 | */ |
| 2580 | if (oldestMultiXactId == nextMXact) |
| 2581 | { |
| 2582 | /* |
| 2583 | * When the next multixact gets created, it will be stored at the next |
| 2584 | * offset. |
| 2585 | */ |
| 2586 | oldestOffset = nextOffset; |
| 2587 | oldestOffsetKnown = true; |
| 2588 | } |
| 2589 | else |
| 2590 | { |
| 2591 | /* |
| 2592 | * Figure out where the oldest existing multixact's offsets are |
| 2593 | * stored. Due to bugs in early release of PostgreSQL 9.3.X and 9.4.X, |
| 2594 | * the supposedly-earliest multixact might not really exist. We are |
| 2595 | * careful not to fail in that case. |
| 2596 | */ |
| 2597 | oldestOffsetKnown = |
| 2598 | find_multixact_start(oldestMultiXactId, &oldestOffset); |
| 2599 | |
| 2600 | if (oldestOffsetKnown) |
| 2601 | ereport(DEBUG1, |
| 2602 | (errmsg("oldest MultiXactId member is at offset %u" , |
| 2603 | oldestOffset))); |
| 2604 | else |
| 2605 | ereport(LOG, |
| 2606 | (errmsg("MultiXact member wraparound protections are disabled because oldest checkpointed MultiXact %u does not exist on disk" , |
| 2607 | oldestMultiXactId))); |
| 2608 | } |
| 2609 | |
| 2610 | LWLockRelease(MultiXactTruncationLock); |
| 2611 | |
| 2612 | /* |
| 2613 | * If we can, compute limits (and install them MultiXactState) to prevent |
| 2614 | * overrun of old data in the members SLRU area. We can only do so if the |
| 2615 | * oldest offset is known though. |
| 2616 | */ |
| 2617 | if (oldestOffsetKnown) |
| 2618 | { |
| 2619 | /* move back to start of the corresponding segment */ |
| 2620 | offsetStopLimit = oldestOffset - (oldestOffset % |
| 2621 | (MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT)); |
| 2622 | |
| 2623 | /* always leave one segment before the wraparound point */ |
| 2624 | offsetStopLimit -= (MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT); |
| 2625 | |
| 2626 | if (!prevOldestOffsetKnown && !is_startup) |
| 2627 | ereport(LOG, |
| 2628 | (errmsg("MultiXact member wraparound protections are now enabled" ))); |
| 2629 | |
| 2630 | ereport(DEBUG1, |
| 2631 | (errmsg("MultiXact member stop limit is now %u based on MultiXact %u" , |
| 2632 | offsetStopLimit, oldestMultiXactId))); |
| 2633 | } |
| 2634 | else if (prevOldestOffsetKnown) |
| 2635 | { |
| 2636 | /* |
| 2637 | * If we failed to get the oldest offset this time, but we have a |
| 2638 | * value from a previous pass through this function, use the old |
| 2639 | * values rather than automatically forcing an emergency autovacuum |
| 2640 | * cycle again. |
| 2641 | */ |
| 2642 | oldestOffset = prevOldestOffset; |
| 2643 | oldestOffsetKnown = true; |
| 2644 | offsetStopLimit = prevOffsetStopLimit; |
| 2645 | } |
| 2646 | |
| 2647 | /* Install the computed values */ |
| 2648 | LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); |
| 2649 | MultiXactState->oldestOffset = oldestOffset; |
| 2650 | MultiXactState->oldestOffsetKnown = oldestOffsetKnown; |
| 2651 | MultiXactState->offsetStopLimit = offsetStopLimit; |
| 2652 | LWLockRelease(MultiXactGenLock); |
| 2653 | |
| 2654 | /* |
| 2655 | * Do we need an emergency autovacuum? If we're not sure, assume yes. |
| 2656 | */ |
| 2657 | return !oldestOffsetKnown || |
| 2658 | (nextOffset - oldestOffset > MULTIXACT_MEMBER_SAFE_THRESHOLD); |
| 2659 | } |
| 2660 | |
| 2661 | /* |
| 2662 | * Return whether adding "distance" to "start" would move past "boundary". |
| 2663 | * |
| 2664 | * We use this to determine whether the addition is "wrapping around" the |
| 2665 | * boundary point, hence the name. The reason we don't want to use the regular |
| 2666 | * 2^31-modulo arithmetic here is that we want to be able to use the whole of |
| 2667 | * the 2^32-1 space here, allowing for more multixacts that would fit |
| 2668 | * otherwise. |
| 2669 | */ |
| 2670 | static bool |
| 2671 | MultiXactOffsetWouldWrap(MultiXactOffset boundary, MultiXactOffset start, |
| 2672 | uint32 distance) |
| 2673 | { |
| 2674 | MultiXactOffset finish; |
| 2675 | |
| 2676 | /* |
| 2677 | * Note that offset number 0 is not used (see GetMultiXactIdMembers), so |
| 2678 | * if the addition wraps around the UINT_MAX boundary, skip that value. |
| 2679 | */ |
| 2680 | finish = start + distance; |
| 2681 | if (finish < start) |
| 2682 | finish++; |
| 2683 | |
| 2684 | /*----------------------------------------------------------------------- |
| 2685 | * When the boundary is numerically greater than the starting point, any |
| 2686 | * value numerically between the two is not wrapped: |
| 2687 | * |
| 2688 | * <----S----B----> |
| 2689 | * [---) = F wrapped past B (and UINT_MAX) |
| 2690 | * [---) = F not wrapped |
| 2691 | * [----] = F wrapped past B |
| 2692 | * |
| 2693 | * When the boundary is numerically less than the starting point (i.e. the |
| 2694 | * UINT_MAX wraparound occurs somewhere in between) then all values in |
| 2695 | * between are wrapped: |
| 2696 | * |
| 2697 | * <----B----S----> |
| 2698 | * [---) = F not wrapped past B (but wrapped past UINT_MAX) |
| 2699 | * [---) = F wrapped past B (and UINT_MAX) |
| 2700 | * [----] = F not wrapped |
| 2701 | *----------------------------------------------------------------------- |
| 2702 | */ |
| 2703 | if (start < boundary) |
| 2704 | return finish >= boundary || finish < start; |
| 2705 | else |
| 2706 | return finish >= boundary && finish < start; |
| 2707 | } |
| 2708 | |
| 2709 | /* |
| 2710 | * Find the starting offset of the given MultiXactId. |
| 2711 | * |
| 2712 | * Returns false if the file containing the multi does not exist on disk. |
| 2713 | * Otherwise, returns true and sets *result to the starting member offset. |
| 2714 | * |
| 2715 | * This function does not prevent concurrent truncation, so if that's |
| 2716 | * required, the caller has to protect against that. |
| 2717 | */ |
| 2718 | static bool |
| 2719 | find_multixact_start(MultiXactId multi, MultiXactOffset *result) |
| 2720 | { |
| 2721 | MultiXactOffset offset; |
| 2722 | int pageno; |
| 2723 | int entryno; |
| 2724 | int slotno; |
| 2725 | MultiXactOffset *offptr; |
| 2726 | |
| 2727 | Assert(MultiXactState->finishedStartup); |
| 2728 | |
| 2729 | pageno = MultiXactIdToOffsetPage(multi); |
| 2730 | entryno = MultiXactIdToOffsetEntry(multi); |
| 2731 | |
| 2732 | /* |
| 2733 | * Flush out dirty data, so PhysicalPageExists can work correctly. |
| 2734 | * SimpleLruFlush() is a pretty big hammer for that. Alternatively we |
| 2735 | * could add an in-memory version of page exists, but find_multixact_start |
| 2736 | * is called infrequently, and it doesn't seem bad to flush buffers to |
| 2737 | * disk before truncation. |
| 2738 | */ |
| 2739 | SimpleLruFlush(MultiXactOffsetCtl, true); |
| 2740 | SimpleLruFlush(MultiXactMemberCtl, true); |
| 2741 | |
| 2742 | if (!SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl, pageno)) |
| 2743 | return false; |
| 2744 | |
| 2745 | /* lock is acquired by SimpleLruReadPage_ReadOnly */ |
| 2746 | slotno = SimpleLruReadPage_ReadOnly(MultiXactOffsetCtl, pageno, multi); |
| 2747 | offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; |
| 2748 | offptr += entryno; |
| 2749 | offset = *offptr; |
| 2750 | LWLockRelease(MultiXactOffsetControlLock); |
| 2751 | |
| 2752 | *result = offset; |
| 2753 | return true; |
| 2754 | } |
| 2755 | |
| 2756 | /* |
| 2757 | * Determine how many multixacts, and how many multixact members, currently |
| 2758 | * exist. Return false if unable to determine. |
| 2759 | */ |
| 2760 | static bool |
| 2761 | ReadMultiXactCounts(uint32 *multixacts, MultiXactOffset *members) |
| 2762 | { |
| 2763 | MultiXactOffset nextOffset; |
| 2764 | MultiXactOffset oldestOffset; |
| 2765 | MultiXactId oldestMultiXactId; |
| 2766 | MultiXactId nextMultiXactId; |
| 2767 | bool oldestOffsetKnown; |
| 2768 | |
| 2769 | LWLockAcquire(MultiXactGenLock, LW_SHARED); |
| 2770 | nextOffset = MultiXactState->nextOffset; |
| 2771 | oldestMultiXactId = MultiXactState->oldestMultiXactId; |
| 2772 | nextMultiXactId = MultiXactState->nextMXact; |
| 2773 | oldestOffset = MultiXactState->oldestOffset; |
| 2774 | oldestOffsetKnown = MultiXactState->oldestOffsetKnown; |
| 2775 | LWLockRelease(MultiXactGenLock); |
| 2776 | |
| 2777 | if (!oldestOffsetKnown) |
| 2778 | return false; |
| 2779 | |
| 2780 | *members = nextOffset - oldestOffset; |
| 2781 | *multixacts = nextMultiXactId - oldestMultiXactId; |
| 2782 | return true; |
| 2783 | } |
| 2784 | |
| 2785 | /* |
| 2786 | * Multixact members can be removed once the multixacts that refer to them |
| 2787 | * are older than every datminxmid. autovacuum_multixact_freeze_max_age and |
| 2788 | * vacuum_multixact_freeze_table_age work together to make sure we never have |
| 2789 | * too many multixacts; we hope that, at least under normal circumstances, |
| 2790 | * this will also be sufficient to keep us from using too many offsets. |
| 2791 | * However, if the average multixact has many members, we might exhaust the |
| 2792 | * members space while still using few enough members that these limits fail |
| 2793 | * to trigger full table scans for relminmxid advancement. At that point, |
| 2794 | * we'd have no choice but to start failing multixact-creating operations |
| 2795 | * with an error. |
| 2796 | * |
| 2797 | * To prevent that, if more than a threshold portion of the members space is |
| 2798 | * used, we effectively reduce autovacuum_multixact_freeze_max_age and |
| 2799 | * to a value just less than the number of multixacts in use. We hope that |
| 2800 | * this will quickly trigger autovacuuming on the table or tables with the |
| 2801 | * oldest relminmxid, thus allowing datminmxid values to advance and removing |
| 2802 | * some members. |
| 2803 | * |
| 2804 | * As the fraction of the member space currently in use grows, we become |
| 2805 | * more aggressive in clamping this value. That not only causes autovacuum |
| 2806 | * to ramp up, but also makes any manual vacuums the user issues more |
| 2807 | * aggressive. This happens because vacuum_set_xid_limits() clamps the |
| 2808 | * freeze table and the minimum freeze age based on the effective |
| 2809 | * autovacuum_multixact_freeze_max_age this function returns. In the worst |
| 2810 | * case, we'll claim the freeze_max_age to zero, and every vacuum of any |
| 2811 | * table will try to freeze every multixact. |
| 2812 | * |
| 2813 | * It's possible that these thresholds should be user-tunable, but for now |
| 2814 | * we keep it simple. |
| 2815 | */ |
| 2816 | int |
| 2817 | MultiXactMemberFreezeThreshold(void) |
| 2818 | { |
| 2819 | MultiXactOffset members; |
| 2820 | uint32 multixacts; |
| 2821 | uint32 victim_multixacts; |
| 2822 | double fraction; |
| 2823 | |
| 2824 | /* If we can't determine member space utilization, assume the worst. */ |
| 2825 | if (!ReadMultiXactCounts(&multixacts, &members)) |
| 2826 | return 0; |
| 2827 | |
| 2828 | /* If member space utilization is low, no special action is required. */ |
| 2829 | if (members <= MULTIXACT_MEMBER_SAFE_THRESHOLD) |
| 2830 | return autovacuum_multixact_freeze_max_age; |
| 2831 | |
| 2832 | /* |
| 2833 | * Compute a target for relminmxid advancement. The number of multixacts |
| 2834 | * we try to eliminate from the system is based on how far we are past |
| 2835 | * MULTIXACT_MEMBER_SAFE_THRESHOLD. |
| 2836 | */ |
| 2837 | fraction = (double) (members - MULTIXACT_MEMBER_SAFE_THRESHOLD) / |
| 2838 | (MULTIXACT_MEMBER_DANGER_THRESHOLD - MULTIXACT_MEMBER_SAFE_THRESHOLD); |
| 2839 | victim_multixacts = multixacts * fraction; |
| 2840 | |
| 2841 | /* fraction could be > 1.0, but lowest possible freeze age is zero */ |
| 2842 | if (victim_multixacts > multixacts) |
| 2843 | return 0; |
| 2844 | return multixacts - victim_multixacts; |
| 2845 | } |
| 2846 | |
/*
 * Working state passed through the "data" pointer of
 * SlruScanDirCbFindEarliest.
 */
typedef struct mxtruncinfo
{
	int			earliestExistingPage;	/* earliest page found so far; -1
										 * means none seen yet */
} mxtruncinfo;
| 2851 | |
| 2852 | /* |
| 2853 | * SlruScanDirectory callback |
| 2854 | * This callback determines the earliest existing page number. |
| 2855 | */ |
| 2856 | static bool |
| 2857 | SlruScanDirCbFindEarliest(SlruCtl ctl, char *filename, int segpage, void *data) |
| 2858 | { |
| 2859 | mxtruncinfo *trunc = (mxtruncinfo *) data; |
| 2860 | |
| 2861 | if (trunc->earliestExistingPage == -1 || |
| 2862 | ctl->PagePrecedes(segpage, trunc->earliestExistingPage)) |
| 2863 | { |
| 2864 | trunc->earliestExistingPage = segpage; |
| 2865 | } |
| 2866 | |
| 2867 | return false; /* keep going */ |
| 2868 | } |
| 2869 | |
| 2870 | |
| 2871 | /* |
| 2872 | * Delete members segments [oldest, newOldest) |
| 2873 | * |
| 2874 | * The members SLRU can, in contrast to the offsets one, be filled to almost |
| 2875 | * the full range at once. This means SimpleLruTruncate() can't trivially be |
| 2876 | * used - instead the to-be-deleted range is computed using the offsets |
| 2877 | * SLRU. C.f. TruncateMultiXact(). |
| 2878 | */ |
| 2879 | static void |
| 2880 | PerformMembersTruncation(MultiXactOffset oldestOffset, MultiXactOffset newOldestOffset) |
| 2881 | { |
| 2882 | const int maxsegment = MXOffsetToMemberSegment(MaxMultiXactOffset); |
| 2883 | int startsegment = MXOffsetToMemberSegment(oldestOffset); |
| 2884 | int endsegment = MXOffsetToMemberSegment(newOldestOffset); |
| 2885 | int segment = startsegment; |
| 2886 | |
| 2887 | /* |
| 2888 | * Delete all the segments but the last one. The last segment can still |
| 2889 | * contain, possibly partially, valid data. |
| 2890 | */ |
| 2891 | while (segment != endsegment) |
| 2892 | { |
| 2893 | elog(DEBUG2, "truncating multixact members segment %x" , segment); |
| 2894 | SlruDeleteSegment(MultiXactMemberCtl, segment); |
| 2895 | |
| 2896 | /* move to next segment, handling wraparound correctly */ |
| 2897 | if (segment == maxsegment) |
| 2898 | segment = 0; |
| 2899 | else |
| 2900 | segment += 1; |
| 2901 | } |
| 2902 | } |
| 2903 | |
| 2904 | /* |
| 2905 | * Delete offsets segments [oldest, newOldest) |
| 2906 | */ |
| 2907 | static void |
| 2908 | PerformOffsetsTruncation(MultiXactId oldestMulti, MultiXactId newOldestMulti) |
| 2909 | { |
| 2910 | /* |
| 2911 | * We step back one multixact to avoid passing a cutoff page that hasn't |
| 2912 | * been created yet in the rare case that oldestMulti would be the first |
| 2913 | * item on a page and oldestMulti == nextMulti. In that case, if we |
| 2914 | * didn't subtract one, we'd trigger SimpleLruTruncate's wraparound |
| 2915 | * detection. |
| 2916 | */ |
| 2917 | SimpleLruTruncate(MultiXactOffsetCtl, |
| 2918 | MultiXactIdToOffsetPage(PreviousMultiXactId(newOldestMulti))); |
| 2919 | } |
| 2920 | |
| 2921 | /* |
| 2922 | * Remove all MultiXactOffset and MultiXactMember segments before the oldest |
| 2923 | * ones still of interest. |
| 2924 | * |
| 2925 | * This is only called on a primary as part of vacuum (via |
| 2926 | * vac_truncate_clog()). During recovery truncation is done by replaying |
| 2927 | * truncation WAL records logged here. |
| 2928 | * |
| 2929 | * newOldestMulti is the oldest currently required multixact, newOldestMultiDB |
| 2930 | * is one of the databases preventing newOldestMulti from increasing. |
| 2931 | */ |
| 2932 | void |
| 2933 | TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB) |
| 2934 | { |
| 2935 | MultiXactId oldestMulti; |
| 2936 | MultiXactId nextMulti; |
| 2937 | MultiXactOffset newOldestOffset; |
| 2938 | MultiXactOffset oldestOffset; |
| 2939 | MultiXactOffset nextOffset; |
| 2940 | mxtruncinfo trunc; |
| 2941 | MultiXactId earliest; |
| 2942 | |
| 2943 | Assert(!RecoveryInProgress()); |
| 2944 | Assert(MultiXactState->finishedStartup); |
| 2945 | |
| 2946 | /* |
| 2947 | * We can only allow one truncation to happen at once. Otherwise parts of |
| 2948 | * members might vanish while we're doing lookups or similar. There's no |
| 2949 | * need to have an interlock with creating new multis or such, since those |
| 2950 | * are constrained by the limits (which only grow, never shrink). |
| 2951 | */ |
| 2952 | LWLockAcquire(MultiXactTruncationLock, LW_EXCLUSIVE); |
| 2953 | |
| 2954 | LWLockAcquire(MultiXactGenLock, LW_SHARED); |
| 2955 | nextMulti = MultiXactState->nextMXact; |
| 2956 | nextOffset = MultiXactState->nextOffset; |
| 2957 | oldestMulti = MultiXactState->oldestMultiXactId; |
| 2958 | LWLockRelease(MultiXactGenLock); |
| 2959 | Assert(MultiXactIdIsValid(oldestMulti)); |
| 2960 | |
| 2961 | /* |
| 2962 | * Make sure to only attempt truncation if there's values to truncate |
| 2963 | * away. In normal processing values shouldn't go backwards, but there's |
| 2964 | * some corner cases (due to bugs) where that's possible. |
| 2965 | */ |
| 2966 | if (MultiXactIdPrecedesOrEquals(newOldestMulti, oldestMulti)) |
| 2967 | { |
| 2968 | LWLockRelease(MultiXactTruncationLock); |
| 2969 | return; |
| 2970 | } |
| 2971 | |
| 2972 | /* |
| 2973 | * Note we can't just plow ahead with the truncation; it's possible that |
| 2974 | * there are no segments to truncate, which is a problem because we are |
| 2975 | * going to attempt to read the offsets page to determine where to |
| 2976 | * truncate the members SLRU. So we first scan the directory to determine |
| 2977 | * the earliest offsets page number that we can read without error. |
| 2978 | * |
| 2979 | * NB: It's also possible that the page that oldestMulti is on has already |
| 2980 | * been truncated away, and we crashed before updating oldestMulti. |
| 2981 | */ |
| 2982 | trunc.earliestExistingPage = -1; |
| 2983 | SlruScanDirectory(MultiXactOffsetCtl, SlruScanDirCbFindEarliest, &trunc); |
| 2984 | earliest = trunc.earliestExistingPage * MULTIXACT_OFFSETS_PER_PAGE; |
| 2985 | if (earliest < FirstMultiXactId) |
| 2986 | earliest = FirstMultiXactId; |
| 2987 | |
| 2988 | /* If there's nothing to remove, we can bail out early. */ |
| 2989 | if (MultiXactIdPrecedes(oldestMulti, earliest)) |
| 2990 | { |
| 2991 | LWLockRelease(MultiXactTruncationLock); |
| 2992 | return; |
| 2993 | } |
| 2994 | |
| 2995 | /* |
| 2996 | * First, compute the safe truncation point for MultiXactMember. This is |
| 2997 | * the starting offset of the oldest multixact. |
| 2998 | * |
| 2999 | * Hopefully, find_multixact_start will always work here, because we've |
| 3000 | * already checked that it doesn't precede the earliest MultiXact on disk. |
| 3001 | * But if it fails, don't truncate anything, and log a message. |
| 3002 | */ |
| 3003 | if (oldestMulti == nextMulti) |
| 3004 | { |
| 3005 | /* there are NO MultiXacts */ |
| 3006 | oldestOffset = nextOffset; |
| 3007 | } |
| 3008 | else if (!find_multixact_start(oldestMulti, &oldestOffset)) |
| 3009 | { |
| 3010 | ereport(LOG, |
| 3011 | (errmsg("oldest MultiXact %u not found, earliest MultiXact %u, skipping truncation" , |
| 3012 | oldestMulti, earliest))); |
| 3013 | LWLockRelease(MultiXactTruncationLock); |
| 3014 | return; |
| 3015 | } |
| 3016 | |
| 3017 | /* |
| 3018 | * Secondly compute up to where to truncate. Lookup the corresponding |
| 3019 | * member offset for newOldestMulti for that. |
| 3020 | */ |
| 3021 | if (newOldestMulti == nextMulti) |
| 3022 | { |
| 3023 | /* there are NO MultiXacts */ |
| 3024 | newOldestOffset = nextOffset; |
| 3025 | } |
| 3026 | else if (!find_multixact_start(newOldestMulti, &newOldestOffset)) |
| 3027 | { |
| 3028 | ereport(LOG, |
| 3029 | (errmsg("cannot truncate up to MultiXact %u because it does not exist on disk, skipping truncation" , |
| 3030 | newOldestMulti))); |
| 3031 | LWLockRelease(MultiXactTruncationLock); |
| 3032 | return; |
| 3033 | } |
| 3034 | |
| 3035 | elog(DEBUG1, "performing multixact truncation: " |
| 3036 | "offsets [%u, %u), offsets segments [%x, %x), " |
| 3037 | "members [%u, %u), members segments [%x, %x)" , |
| 3038 | oldestMulti, newOldestMulti, |
| 3039 | MultiXactIdToOffsetSegment(oldestMulti), |
| 3040 | MultiXactIdToOffsetSegment(newOldestMulti), |
| 3041 | oldestOffset, newOldestOffset, |
| 3042 | MXOffsetToMemberSegment(oldestOffset), |
| 3043 | MXOffsetToMemberSegment(newOldestOffset)); |
| 3044 | |
| 3045 | /* |
| 3046 | * Do truncation, and the WAL logging of the truncation, in a critical |
| 3047 | * section. That way offsets/members cannot get out of sync anymore, i.e. |
| 3048 | * once consistent the newOldestMulti will always exist in members, even |
| 3049 | * if we crashed in the wrong moment. |
| 3050 | */ |
| 3051 | START_CRIT_SECTION(); |
| 3052 | |
| 3053 | /* |
| 3054 | * Prevent checkpoints from being scheduled concurrently. This is critical |
| 3055 | * because otherwise a truncation record might not be replayed after a |
| 3056 | * crash/basebackup, even though the state of the data directory would |
| 3057 | * require it. |
| 3058 | */ |
| 3059 | Assert(!MyPgXact->delayChkpt); |
| 3060 | MyPgXact->delayChkpt = true; |
| 3061 | |
| 3062 | /* WAL log truncation */ |
| 3063 | WriteMTruncateXlogRec(newOldestMultiDB, |
| 3064 | oldestMulti, newOldestMulti, |
| 3065 | oldestOffset, newOldestOffset); |
| 3066 | |
| 3067 | /* |
| 3068 | * Update in-memory limits before performing the truncation, while inside |
| 3069 | * the critical section: Have to do it before truncation, to prevent |
| 3070 | * concurrent lookups of those values. Has to be inside the critical |
| 3071 | * section as otherwise a future call to this function would error out, |
| 3072 | * while looking up the oldest member in offsets, if our caller crashes |
| 3073 | * before updating the limits. |
| 3074 | */ |
| 3075 | LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); |
| 3076 | MultiXactState->oldestMultiXactId = newOldestMulti; |
| 3077 | MultiXactState->oldestMultiXactDB = newOldestMultiDB; |
| 3078 | LWLockRelease(MultiXactGenLock); |
| 3079 | |
| 3080 | /* First truncate members */ |
| 3081 | PerformMembersTruncation(oldestOffset, newOldestOffset); |
| 3082 | |
| 3083 | /* Then offsets */ |
| 3084 | PerformOffsetsTruncation(oldestMulti, newOldestMulti); |
| 3085 | |
| 3086 | MyPgXact->delayChkpt = false; |
| 3087 | |
| 3088 | END_CRIT_SECTION(); |
| 3089 | LWLockRelease(MultiXactTruncationLock); |
| 3090 | } |
| 3091 | |
| 3092 | /* |
| 3093 | * Decide which of two MultiXactOffset page numbers is "older" for truncation |
| 3094 | * purposes. |
| 3095 | * |
| 3096 | * We need to use comparison of MultiXactId here in order to do the right |
| 3097 | * thing with wraparound. However, if we are asked about page number zero, we |
| 3098 | * don't want to hand InvalidMultiXactId to MultiXactIdPrecedes: it'll get |
| 3099 | * weird. So, offset both multis by FirstMultiXactId to avoid that. |
| 3100 | * (Actually, the current implementation doesn't do anything weird with |
| 3101 | * InvalidMultiXactId, but there's no harm in leaving this code like this.) |
| 3102 | */ |
| 3103 | static bool |
| 3104 | MultiXactOffsetPagePrecedes(int page1, int page2) |
| 3105 | { |
| 3106 | MultiXactId multi1; |
| 3107 | MultiXactId multi2; |
| 3108 | |
| 3109 | multi1 = ((MultiXactId) page1) * MULTIXACT_OFFSETS_PER_PAGE; |
| 3110 | multi1 += FirstMultiXactId; |
| 3111 | multi2 = ((MultiXactId) page2) * MULTIXACT_OFFSETS_PER_PAGE; |
| 3112 | multi2 += FirstMultiXactId; |
| 3113 | |
| 3114 | return MultiXactIdPrecedes(multi1, multi2); |
| 3115 | } |
| 3116 | |
| 3117 | /* |
| 3118 | * Decide which of two MultiXactMember page numbers is "older" for truncation |
| 3119 | * purposes. There is no "invalid offset number" so use the numbers verbatim. |
| 3120 | */ |
| 3121 | static bool |
| 3122 | MultiXactMemberPagePrecedes(int page1, int page2) |
| 3123 | { |
| 3124 | MultiXactOffset offset1; |
| 3125 | MultiXactOffset offset2; |
| 3126 | |
| 3127 | offset1 = ((MultiXactOffset) page1) * MULTIXACT_MEMBERS_PER_PAGE; |
| 3128 | offset2 = ((MultiXactOffset) page2) * MULTIXACT_MEMBERS_PER_PAGE; |
| 3129 | |
| 3130 | return MultiXactOffsetPrecedes(offset1, offset2); |
| 3131 | } |
| 3132 | |
| 3133 | /* |
| 3134 | * Decide which of two MultiXactIds is earlier. |
| 3135 | * |
| 3136 | * XXX do we need to do something special for InvalidMultiXactId? |
| 3137 | * (Doesn't look like it.) |
| 3138 | */ |
| 3139 | bool |
| 3140 | MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2) |
| 3141 | { |
| 3142 | int32 diff = (int32) (multi1 - multi2); |
| 3143 | |
| 3144 | return (diff < 0); |
| 3145 | } |
| 3146 | |
| 3147 | /* |
| 3148 | * MultiXactIdPrecedesOrEquals -- is multi1 logically <= multi2? |
| 3149 | * |
| 3150 | * XXX do we need to do something special for InvalidMultiXactId? |
| 3151 | * (Doesn't look like it.) |
| 3152 | */ |
| 3153 | bool |
| 3154 | MultiXactIdPrecedesOrEquals(MultiXactId multi1, MultiXactId multi2) |
| 3155 | { |
| 3156 | int32 diff = (int32) (multi1 - multi2); |
| 3157 | |
| 3158 | return (diff <= 0); |
| 3159 | } |
| 3160 | |
| 3161 | |
| 3162 | /* |
| 3163 | * Decide which of two offsets is earlier. |
| 3164 | */ |
| 3165 | static bool |
| 3166 | MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2) |
| 3167 | { |
| 3168 | int32 diff = (int32) (offset1 - offset2); |
| 3169 | |
| 3170 | return (diff < 0); |
| 3171 | } |
| 3172 | |
| 3173 | /* |
| 3174 | * Write an xlog record reflecting the zeroing of either a MEMBERs or |
| 3175 | * OFFSETs page (info shows which) |
| 3176 | */ |
| 3177 | static void |
| 3178 | WriteMZeroPageXlogRec(int pageno, uint8 info) |
| 3179 | { |
| 3180 | XLogBeginInsert(); |
| 3181 | XLogRegisterData((char *) (&pageno), sizeof(int)); |
| 3182 | (void) XLogInsert(RM_MULTIXACT_ID, info); |
| 3183 | } |
| 3184 | |
| 3185 | /* |
| 3186 | * Write a TRUNCATE xlog record |
| 3187 | * |
| 3188 | * We must flush the xlog record to disk before returning --- see notes in |
| 3189 | * TruncateCLOG(). |
| 3190 | */ |
| 3191 | static void |
| 3192 | WriteMTruncateXlogRec(Oid oldestMultiDB, |
| 3193 | MultiXactId startTruncOff, MultiXactId endTruncOff, |
| 3194 | MultiXactOffset startTruncMemb, MultiXactOffset endTruncMemb) |
| 3195 | { |
| 3196 | XLogRecPtr recptr; |
| 3197 | xl_multixact_truncate xlrec; |
| 3198 | |
| 3199 | xlrec.oldestMultiDB = oldestMultiDB; |
| 3200 | |
| 3201 | xlrec.startTruncOff = startTruncOff; |
| 3202 | xlrec.endTruncOff = endTruncOff; |
| 3203 | |
| 3204 | xlrec.startTruncMemb = startTruncMemb; |
| 3205 | xlrec.endTruncMemb = endTruncMemb; |
| 3206 | |
| 3207 | XLogBeginInsert(); |
| 3208 | XLogRegisterData((char *) (&xlrec), SizeOfMultiXactTruncate); |
| 3209 | recptr = XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_TRUNCATE_ID); |
| 3210 | XLogFlush(recptr); |
| 3211 | } |
| 3212 | |
| 3213 | /* |
| 3214 | * MULTIXACT resource manager's routines |
| 3215 | */ |
| 3216 | void |
| 3217 | multixact_redo(XLogReaderState *record) |
| 3218 | { |
| 3219 | uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; |
| 3220 | |
| 3221 | /* Backup blocks are not used in multixact records */ |
| 3222 | Assert(!XLogRecHasAnyBlockRefs(record)); |
| 3223 | |
| 3224 | if (info == XLOG_MULTIXACT_ZERO_OFF_PAGE) |
| 3225 | { |
| 3226 | int pageno; |
| 3227 | int slotno; |
| 3228 | |
| 3229 | memcpy(&pageno, XLogRecGetData(record), sizeof(int)); |
| 3230 | |
| 3231 | LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE); |
| 3232 | |
| 3233 | slotno = ZeroMultiXactOffsetPage(pageno, false); |
| 3234 | SimpleLruWritePage(MultiXactOffsetCtl, slotno); |
| 3235 | Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]); |
| 3236 | |
| 3237 | LWLockRelease(MultiXactOffsetControlLock); |
| 3238 | } |
| 3239 | else if (info == XLOG_MULTIXACT_ZERO_MEM_PAGE) |
| 3240 | { |
| 3241 | int pageno; |
| 3242 | int slotno; |
| 3243 | |
| 3244 | memcpy(&pageno, XLogRecGetData(record), sizeof(int)); |
| 3245 | |
| 3246 | LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE); |
| 3247 | |
| 3248 | slotno = ZeroMultiXactMemberPage(pageno, false); |
| 3249 | SimpleLruWritePage(MultiXactMemberCtl, slotno); |
| 3250 | Assert(!MultiXactMemberCtl->shared->page_dirty[slotno]); |
| 3251 | |
| 3252 | LWLockRelease(MultiXactMemberControlLock); |
| 3253 | } |
| 3254 | else if (info == XLOG_MULTIXACT_CREATE_ID) |
| 3255 | { |
| 3256 | xl_multixact_create *xlrec = |
| 3257 | (xl_multixact_create *) XLogRecGetData(record); |
| 3258 | TransactionId max_xid; |
| 3259 | int i; |
| 3260 | |
| 3261 | /* Store the data back into the SLRU files */ |
| 3262 | RecordNewMultiXact(xlrec->mid, xlrec->moff, xlrec->nmembers, |
| 3263 | xlrec->members); |
| 3264 | |
| 3265 | /* Make sure nextMXact/nextOffset are beyond what this record has */ |
| 3266 | MultiXactAdvanceNextMXact(xlrec->mid + 1, |
| 3267 | xlrec->moff + xlrec->nmembers); |
| 3268 | |
| 3269 | /* |
| 3270 | * Make sure nextFullXid is beyond any XID mentioned in the record. |
| 3271 | * This should be unnecessary, since any XID found here ought to have |
| 3272 | * other evidence in the XLOG, but let's be safe. |
| 3273 | */ |
| 3274 | max_xid = XLogRecGetXid(record); |
| 3275 | for (i = 0; i < xlrec->nmembers; i++) |
| 3276 | { |
| 3277 | if (TransactionIdPrecedes(max_xid, xlrec->members[i].xid)) |
| 3278 | max_xid = xlrec->members[i].xid; |
| 3279 | } |
| 3280 | |
| 3281 | AdvanceNextFullTransactionIdPastXid(max_xid); |
| 3282 | } |
| 3283 | else if (info == XLOG_MULTIXACT_TRUNCATE_ID) |
| 3284 | { |
| 3285 | xl_multixact_truncate xlrec; |
| 3286 | int pageno; |
| 3287 | |
| 3288 | memcpy(&xlrec, XLogRecGetData(record), |
| 3289 | SizeOfMultiXactTruncate); |
| 3290 | |
| 3291 | elog(DEBUG1, "replaying multixact truncation: " |
| 3292 | "offsets [%u, %u), offsets segments [%x, %x), " |
| 3293 | "members [%u, %u), members segments [%x, %x)" , |
| 3294 | xlrec.startTruncOff, xlrec.endTruncOff, |
| 3295 | MultiXactIdToOffsetSegment(xlrec.startTruncOff), |
| 3296 | MultiXactIdToOffsetSegment(xlrec.endTruncOff), |
| 3297 | xlrec.startTruncMemb, xlrec.endTruncMemb, |
| 3298 | MXOffsetToMemberSegment(xlrec.startTruncMemb), |
| 3299 | MXOffsetToMemberSegment(xlrec.endTruncMemb)); |
| 3300 | |
| 3301 | /* should not be required, but more than cheap enough */ |
| 3302 | LWLockAcquire(MultiXactTruncationLock, LW_EXCLUSIVE); |
| 3303 | |
| 3304 | /* |
| 3305 | * Advance the horizon values, so they're current at the end of |
| 3306 | * recovery. |
| 3307 | */ |
| 3308 | SetMultiXactIdLimit(xlrec.endTruncOff, xlrec.oldestMultiDB, false); |
| 3309 | |
| 3310 | PerformMembersTruncation(xlrec.startTruncMemb, xlrec.endTruncMemb); |
| 3311 | |
| 3312 | /* |
| 3313 | * During XLOG replay, latest_page_number isn't necessarily set up |
| 3314 | * yet; insert a suitable value to bypass the sanity test in |
| 3315 | * SimpleLruTruncate. |
| 3316 | */ |
| 3317 | pageno = MultiXactIdToOffsetPage(xlrec.endTruncOff); |
| 3318 | MultiXactOffsetCtl->shared->latest_page_number = pageno; |
| 3319 | PerformOffsetsTruncation(xlrec.startTruncOff, xlrec.endTruncOff); |
| 3320 | |
| 3321 | LWLockRelease(MultiXactTruncationLock); |
| 3322 | } |
| 3323 | else |
| 3324 | elog(PANIC, "multixact_redo: unknown op code %u" , info); |
| 3325 | } |
| 3326 | |
| 3327 | Datum |
| 3328 | pg_get_multixact_members(PG_FUNCTION_ARGS) |
| 3329 | { |
| 3330 | typedef struct |
| 3331 | { |
| 3332 | MultiXactMember *members; |
| 3333 | int nmembers; |
| 3334 | int iter; |
| 3335 | } mxact; |
| 3336 | MultiXactId mxid = PG_GETARG_UINT32(0); |
| 3337 | mxact *multi; |
| 3338 | FuncCallContext *funccxt; |
| 3339 | |
| 3340 | if (mxid < FirstMultiXactId) |
| 3341 | ereport(ERROR, |
| 3342 | (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
| 3343 | errmsg("invalid MultiXactId: %u" , mxid))); |
| 3344 | |
| 3345 | if (SRF_IS_FIRSTCALL()) |
| 3346 | { |
| 3347 | MemoryContext oldcxt; |
| 3348 | TupleDesc tupdesc; |
| 3349 | |
| 3350 | funccxt = SRF_FIRSTCALL_INIT(); |
| 3351 | oldcxt = MemoryContextSwitchTo(funccxt->multi_call_memory_ctx); |
| 3352 | |
| 3353 | multi = palloc(sizeof(mxact)); |
| 3354 | /* no need to allow for old values here */ |
| 3355 | multi->nmembers = GetMultiXactIdMembers(mxid, &multi->members, false, |
| 3356 | false); |
| 3357 | multi->iter = 0; |
| 3358 | |
| 3359 | tupdesc = CreateTemplateTupleDesc(2); |
| 3360 | TupleDescInitEntry(tupdesc, (AttrNumber) 1, "xid" , |
| 3361 | XIDOID, -1, 0); |
| 3362 | TupleDescInitEntry(tupdesc, (AttrNumber) 2, "mode" , |
| 3363 | TEXTOID, -1, 0); |
| 3364 | |
| 3365 | funccxt->attinmeta = TupleDescGetAttInMetadata(tupdesc); |
| 3366 | funccxt->user_fctx = multi; |
| 3367 | |
| 3368 | MemoryContextSwitchTo(oldcxt); |
| 3369 | } |
| 3370 | |
| 3371 | funccxt = SRF_PERCALL_SETUP(); |
| 3372 | multi = (mxact *) funccxt->user_fctx; |
| 3373 | |
| 3374 | while (multi->iter < multi->nmembers) |
| 3375 | { |
| 3376 | HeapTuple tuple; |
| 3377 | char *values[2]; |
| 3378 | |
| 3379 | values[0] = psprintf("%u" , multi->members[multi->iter].xid); |
| 3380 | values[1] = mxstatus_to_string(multi->members[multi->iter].status); |
| 3381 | |
| 3382 | tuple = BuildTupleFromCStrings(funccxt->attinmeta, values); |
| 3383 | |
| 3384 | multi->iter++; |
| 3385 | pfree(values[0]); |
| 3386 | SRF_RETURN_NEXT(funccxt, HeapTupleGetDatum(tuple)); |
| 3387 | } |
| 3388 | |
| 3389 | if (multi->nmembers > 0) |
| 3390 | pfree(multi->members); |
| 3391 | pfree(multi); |
| 3392 | |
| 3393 | SRF_RETURN_DONE(funccxt); |
| 3394 | } |
| 3395 | |