| 1 | /*------------------------------------------------------------------------- |
| 2 | * |
| 3 | * snapmgr.c |
| 4 | * PostgreSQL snapshot manager |
| 5 | * |
| 6 | * We keep track of snapshots in two ways: those "registered" by resowner.c, |
| 7 | * and the "active snapshot" stack. All snapshots in either of them live in |
| 8 | * persistent memory. When a snapshot is no longer in any of these lists |
| 9 | * (tracked by separate refcounts on each snapshot), its memory can be freed. |
| 10 | * |
| 11 | * The FirstXactSnapshot, if any, is treated a bit specially: we increment its |
| 12 | * regd_count and list it in RegisteredSnapshots, but this reference is not |
| 13 | * tracked by a resource owner. We used to use the TopTransactionResourceOwner |
| 14 | * to track this snapshot reference, but that introduces logical circularity |
| 15 | * and thus makes it impossible to clean up in a sane fashion. It's better to |
| 16 | * handle this reference as an internally-tracked registration, so that this |
| 17 | * module is entirely lower-level than ResourceOwners. |
| 18 | * |
| 19 | * Likewise, any snapshots that have been exported by pg_export_snapshot |
| 20 | * have regd_count = 1 and are listed in RegisteredSnapshots, but are not |
| 21 | * tracked by any resource owner. |
| 22 | * |
| 23 | * Likewise, the CatalogSnapshot is listed in RegisteredSnapshots when it |
| 24 | * is valid, but is not tracked by any resource owner. |
| 25 | * |
| 26 |  * The same is true for historic snapshots used during logical decoding:
| 27 |  * their lifetime is managed separately (as they live longer than one xact.c
| 28 |  * transaction).
| 29 | * |
| 30 | * These arrangements let us reset MyPgXact->xmin when there are no snapshots |
| 31 | * referenced by this transaction, and advance it when the one with oldest |
| 32 |  * Xmin is no longer referenced. For simplicity, however, only registered
| 33 |  * snapshots, not active snapshots, participate in tracking which one is
| 34 |  * oldest; we don't try to change MyPgXact->xmin except when the
| 35 |  * active-snapshot stack is empty.
| 36 | * |
| 37 | * |
| 38 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
| 39 | * Portions Copyright (c) 1994, Regents of the University of California |
| 40 | * |
| 41 | * IDENTIFICATION |
| 42 | * src/backend/utils/time/snapmgr.c |
| 43 | * |
| 44 | *------------------------------------------------------------------------- |
| 45 | */ |
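/*
 * Illustrative usage sketch of the "registered" mechanism described above
 * (the function names are the real ones declared in snapmgr.h; the
 * surrounding code is hypothetical):
 *
 *		Snapshot	snap;
 *
 *		snap = RegisterSnapshot(GetTransactionSnapshot());
 *		... do work that must keep seeing snap's view of the database ...
 *		UnregisterSnapshot(snap);
 *
 * Registration pins the snapshot (and hence our advertised xmin) until the
 * matching unregister, with the reference tracked by the current resource
 * owner.  The active-snapshot stack, manipulated with PushActiveSnapshot
 * and PopActiveSnapshot further down in this file, is the second,
 * independent refcount.
 */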
| 46 | #include "postgres.h" |
| 47 | |
| 48 | #include <sys/stat.h> |
| 49 | #include <unistd.h> |
| 50 | |
| 51 | #include "access/subtrans.h" |
| 52 | #include "access/transam.h" |
| 53 | #include "access/xact.h" |
| 54 | #include "access/xlog.h" |
| 55 | #include "catalog/catalog.h" |
| 56 | #include "lib/pairingheap.h" |
| 57 | #include "miscadmin.h" |
| 58 | #include "storage/predicate.h" |
| 59 | #include "storage/proc.h" |
| 60 | #include "storage/procarray.h" |
| 61 | #include "storage/sinval.h" |
| 62 | #include "storage/sinvaladt.h" |
| 63 | #include "storage/spin.h" |
| 64 | #include "utils/builtins.h" |
| 65 | #include "utils/memutils.h" |
| 66 | #include "utils/rel.h" |
| 67 | #include "utils/resowner_private.h" |
| 68 | #include "utils/snapmgr.h" |
| 69 | #include "utils/syscache.h" |
| 70 | |
| 71 | |
| 72 | /* |
| 73 | * GUC parameters |
| 74 | */ |
| 75 | int old_snapshot_threshold; /* number of minutes, -1 disables */ |
| 76 | |
| 77 | /* |
| 78 | * Structure for dealing with old_snapshot_threshold implementation. |
| 79 | */ |
| 80 | typedef struct OldSnapshotControlData |
| 81 | { |
| 82 | /* |
| 83 | * Variables for old snapshot handling are shared among processes and are |
| 84 | * only allowed to move forward. |
| 85 | */ |
| 86 | slock_t mutex_current; /* protect current_timestamp */ |
| 87 | TimestampTz current_timestamp; /* latest snapshot timestamp */ |
| 88 | slock_t mutex_latest_xmin; /* protect latest_xmin and next_map_update */ |
| 89 | TransactionId latest_xmin; /* latest snapshot xmin */ |
| 90 | TimestampTz next_map_update; /* latest snapshot valid up to */ |
| 91 | slock_t mutex_threshold; /* protect threshold fields */ |
| 92 | TimestampTz threshold_timestamp; /* earlier snapshot is old */ |
| 93 | TransactionId threshold_xid; /* earlier xid may be gone */ |
| 94 | |
| 95 | /* |
| 96 | * Keep one xid per minute for old snapshot error handling. |
| 97 | * |
| 98 | * Use a circular buffer with a head offset, a count of entries currently |
| 99 | * used, and a timestamp corresponding to the xid at the head offset. A |
| 100 | * count_used value of zero means that there are no times stored; a |
| 101 | * count_used value of OLD_SNAPSHOT_TIME_MAP_ENTRIES means that the buffer |
| 102 | * is full and the head must be advanced to add new entries. Use |
| 103 | * timestamps aligned to minute boundaries, since that seems less |
| 104 | * surprising than aligning based on the first usage timestamp. The |
| 105 | * latest bucket is effectively stored within latest_xmin. The circular |
| 106 | * buffer is updated when we get a new xmin value that doesn't fall into |
| 107 | * the same interval. |
| 108 | * |
| 109 | * It is OK if the xid for a given time slot is from earlier than |
| 110 | * calculated by adding the number of minutes corresponding to the |
| 111 | * (possibly wrapped) distance from the head offset to the time of the |
| 112 | * head entry, since that just results in the vacuuming of old tuples |
| 113 | * being slightly less aggressive. It would not be OK for it to be off in |
| 114 | * the other direction, since it might result in vacuuming tuples that are |
| 115 | * still expected to be there. |
| 116 | * |
| 117 | * Use of an SLRU was considered but not chosen because it is more |
| 118 | * heavyweight than is needed for this, and would probably not be any less |
| 119 | * code to implement. |
| 120 | * |
| 121 | * Persistence is not needed. |
| 122 | */ |
| 123 | int head_offset; /* subscript of oldest tracked time */ |
| 124 | TimestampTz head_timestamp; /* time corresponding to head xid */ |
| 125 | int count_used; /* how many slots are in use */ |
| 126 | TransactionId xid_by_minute[FLEXIBLE_ARRAY_MEMBER]; |
| 127 | } OldSnapshotControlData; |
| 128 | |
| 129 | static volatile OldSnapshotControlData *oldSnapshotControl; |
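/*
 * Illustrative sketch (not the actual maintenance code) of how a
 * minute-aligned timestamp maps onto the circular buffer above, assuming
 * "ts" is already aligned and falls within the currently tracked window:
 *
 *		int		bucket;
 *
 *		bucket = (oldSnapshotControl->head_offset +
 *				  (int) ((ts - oldSnapshotControl->head_timestamp) /
 *						 USECS_PER_MINUTE)) % OLD_SNAPSHOT_TIME_MAP_ENTRIES;
 *		xid = oldSnapshotControl->xid_by_minute[bucket];
 */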
| 130 | |
| 131 | |
| 132 | /* |
| 133 | * CurrentSnapshot points to the only snapshot taken in transaction-snapshot |
| 134 | * mode, and to the latest one taken in a read-committed transaction. |
| 135 | * SecondarySnapshot is a snapshot that's always up-to-date as of the current |
| 136 | * instant, even in transaction-snapshot mode. It should only be used for |
| 137 |  * special-purpose code (say, RI checking). CatalogSnapshot points to an
| 138 | * MVCC snapshot intended to be used for catalog scans; we must invalidate it |
| 139 | * whenever a system catalog change occurs. |
| 140 | * |
| 141 | * These SnapshotData structs are static to simplify memory allocation |
| 142 | * (see the hack in GetSnapshotData to avoid repeated malloc/free). |
| 143 | */ |
| 144 | static SnapshotData CurrentSnapshotData = {SNAPSHOT_MVCC}; |
| 145 | static SnapshotData SecondarySnapshotData = {SNAPSHOT_MVCC}; |
| 146 | SnapshotData CatalogSnapshotData = {SNAPSHOT_MVCC}; |
| 147 | SnapshotData SnapshotSelfData = {SNAPSHOT_SELF}; |
| 148 | SnapshotData SnapshotAnyData = {SNAPSHOT_ANY}; |
| 149 | |
| 150 | /* Pointers to valid snapshots */ |
| 151 | static Snapshot CurrentSnapshot = NULL; |
| 152 | static Snapshot SecondarySnapshot = NULL; |
| 153 | static Snapshot CatalogSnapshot = NULL; |
| 154 | static Snapshot HistoricSnapshot = NULL; |
| 155 | |
| 156 | /* |
| 157 | * These are updated by GetSnapshotData. We initialize them this way |
| 158 | * for the convenience of TransactionIdIsInProgress: even in bootstrap |
| 159 | * mode, we don't want it to say that BootstrapTransactionId is in progress. |
| 160 | * |
| 161 | * RecentGlobalXmin and RecentGlobalDataXmin are initialized to |
| 162 | * InvalidTransactionId, to ensure that no one tries to use a stale |
| 163 | * value. Readers should ensure that it has been set to something else |
| 164 | * before using it. |
| 165 | */ |
| 166 | TransactionId TransactionXmin = FirstNormalTransactionId; |
| 167 | TransactionId RecentXmin = FirstNormalTransactionId; |
| 168 | TransactionId RecentGlobalXmin = InvalidTransactionId; |
| 169 | TransactionId RecentGlobalDataXmin = InvalidTransactionId; |
| 170 | |
| 171 | /* (table, ctid) => (cmin, cmax) mapping during timetravel */ |
| 172 | static HTAB *tuplecid_data = NULL; |
| 173 | |
| 174 | /* |
| 175 | * Elements of the active snapshot stack. |
| 176 | * |
| 177 | * Each element here accounts for exactly one active_count on SnapshotData. |
| 178 | * |
| 179 | * NB: the code assumes that elements in this list are in non-increasing |
| 180 | * order of as_level; also, the list must be NULL-terminated. |
| 181 | */ |
| 182 | typedef struct ActiveSnapshotElt |
| 183 | { |
| 184 | Snapshot as_snap; |
| 185 | int as_level; |
| 186 | struct ActiveSnapshotElt *as_next; |
| 187 | } ActiveSnapshotElt; |
| 188 | |
| 189 | /* Top of the stack of active snapshots */ |
| 190 | static ActiveSnapshotElt *ActiveSnapshot = NULL; |
| 191 | |
| 192 | /* Bottom of the stack of active snapshots */ |
| 193 | static ActiveSnapshotElt *OldestActiveSnapshot = NULL; |
| 194 | |
| 195 | /* |
| 196 | * Currently registered Snapshots. Ordered in a heap by xmin, so that we can |
| 197 | * quickly find the one with lowest xmin, to advance our MyPgXact->xmin. |
| 198 | */ |
| 199 | static int xmin_cmp(const pairingheap_node *a, const pairingheap_node *b, |
| 200 | void *arg); |
| 201 | |
| 202 | static pairingheap RegisteredSnapshots = {&xmin_cmp, NULL, NULL}; |
| 203 | |
| 204 | /* first GetTransactionSnapshot call in a transaction? */ |
| 205 | bool FirstSnapshotSet = false; |
| 206 | |
| 207 | /* |
| 208 | * Remember the serializable transaction snapshot, if any. We cannot trust |
| 209 | * FirstSnapshotSet in combination with IsolationUsesXactSnapshot(), because |
| 210 | * GUC may be reset before us, changing the value of IsolationUsesXactSnapshot. |
| 211 | */ |
| 212 | static Snapshot FirstXactSnapshot = NULL; |
| 213 | |
| 214 | /* Define pathname of exported-snapshot files */ |
| 215 | #define SNAPSHOT_EXPORT_DIR "pg_snapshots" |
| 216 | |
| 217 | /* Structure holding info about exported snapshot. */ |
| 218 | typedef struct ExportedSnapshot |
| 219 | { |
| 220 | char *snapfile; |
| 221 | Snapshot snapshot; |
| 222 | } ExportedSnapshot; |
| 223 | |
| 224 | /* Current xact's exported snapshots (a list of ExportedSnapshot structs) */ |
| 225 | static List *exportedSnapshots = NIL; |
| 226 | |
| 227 | /* Prototypes for local functions */ |
| 228 | static TimestampTz AlignTimestampToMinuteBoundary(TimestampTz ts); |
| 229 | static Snapshot CopySnapshot(Snapshot snapshot); |
| 230 | static void FreeSnapshot(Snapshot snapshot); |
| 231 | static void SnapshotResetXmin(void); |
| 232 | |
| 233 | /* |
| 234 | * Snapshot fields to be serialized. |
| 235 | * |
| 236 | * Only these fields need to be sent to the cooperating backend; the |
| 237 | * remaining ones can (and must) be set by the receiver upon restore. |
| 238 | */ |
| 239 | typedef struct SerializedSnapshotData |
| 240 | { |
| 241 | TransactionId xmin; |
| 242 | TransactionId xmax; |
| 243 | uint32 xcnt; |
| 244 | int32 subxcnt; |
| 245 | bool suboverflowed; |
| 246 | bool takenDuringRecovery; |
| 247 | CommandId curcid; |
| 248 | TimestampTz whenTaken; |
| 249 | XLogRecPtr lsn; |
| 250 | } SerializedSnapshotData; |
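/*
 * Rough layout sketch for the serialized form: SerializeSnapshot (later in
 * this file) writes the fixed-size struct above followed immediately by the
 * XID arrays, so the space needed is approximately
 *
 *		sizeof(SerializedSnapshotData)
 *			+ xcnt * sizeof(TransactionId)
 *			+ subxcnt * sizeof(TransactionId)
 *
 * See EstimateSnapshotSpace for the authoritative computation.
 */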
| 251 | |
| 252 | Size |
| 253 | SnapMgrShmemSize(void) |
| 254 | { |
| 255 | Size size; |
| 256 | |
| 257 | size = offsetof(OldSnapshotControlData, xid_by_minute); |
| 258 | if (old_snapshot_threshold > 0) |
| 259 | size = add_size(size, mul_size(sizeof(TransactionId), |
| 260 | OLD_SNAPSHOT_TIME_MAP_ENTRIES)); |
| 261 | |
| 262 | return size; |
| 263 | } |
| 264 | |
| 265 | /* |
| 266 | * Initialize for managing old snapshot detection. |
| 267 | */ |
| 268 | void |
| 269 | SnapMgrInit(void) |
| 270 | { |
| 271 | bool found; |
| 272 | |
| 273 | /* |
| 274 | * Create or attach to the OldSnapshotControlData structure. |
| 275 | */ |
| 276 | oldSnapshotControl = (volatile OldSnapshotControlData *) |
| 277 | ShmemInitStruct("OldSnapshotControlData" , |
| 278 | SnapMgrShmemSize(), &found); |
| 279 | |
| 280 | if (!found) |
| 281 | { |
| 282 | SpinLockInit(&oldSnapshotControl->mutex_current); |
| 283 | oldSnapshotControl->current_timestamp = 0; |
| 284 | SpinLockInit(&oldSnapshotControl->mutex_latest_xmin); |
| 285 | oldSnapshotControl->latest_xmin = InvalidTransactionId; |
| 286 | oldSnapshotControl->next_map_update = 0; |
| 287 | SpinLockInit(&oldSnapshotControl->mutex_threshold); |
| 288 | oldSnapshotControl->threshold_timestamp = 0; |
| 289 | oldSnapshotControl->threshold_xid = InvalidTransactionId; |
| 290 | oldSnapshotControl->head_offset = 0; |
| 291 | oldSnapshotControl->head_timestamp = 0; |
| 292 | oldSnapshotControl->count_used = 0; |
| 293 | } |
| 294 | } |
| 295 | |
| 296 | /* |
| 297 | * GetTransactionSnapshot |
| 298 | * Get the appropriate snapshot for a new query in a transaction. |
| 299 | * |
| 300 | * Note that the return value may point at static storage that will be modified |
| 301 | * by future calls and by CommandCounterIncrement(). Callers should call |
| 302 |  * RegisterSnapshot or PushActiveSnapshot on the returned snapshot if it is
| 303 |  * to be used for very long.
| 304 | */ |
| 305 | Snapshot |
| 306 | GetTransactionSnapshot(void) |
| 307 | { |
| 308 | /* |
| 309 | * Return historic snapshot if doing logical decoding. We'll never need a |
| 310 | * non-historic transaction snapshot in this (sub-)transaction, so there's |
| 311 | * no need to be careful to set one up for later calls to |
| 312 | * GetTransactionSnapshot(). |
| 313 | */ |
| 314 | if (HistoricSnapshotActive()) |
| 315 | { |
| 316 | Assert(!FirstSnapshotSet); |
| 317 | return HistoricSnapshot; |
| 318 | } |
| 319 | |
| 320 | /* First call in transaction? */ |
| 321 | if (!FirstSnapshotSet) |
| 322 | { |
| 323 | /* |
| 324 | * Don't allow catalog snapshot to be older than xact snapshot. Must |
| 325 | * do this first to allow the empty-heap Assert to succeed. |
| 326 | */ |
| 327 | InvalidateCatalogSnapshot(); |
| 328 | |
| 329 | Assert(pairingheap_is_empty(&RegisteredSnapshots)); |
| 330 | Assert(FirstXactSnapshot == NULL); |
| 331 | |
| 332 | if (IsInParallelMode()) |
| 333 | elog(ERROR, |
| 334 | "cannot take query snapshot during a parallel operation" ); |
| 335 | |
| 336 | /* |
| 337 | * In transaction-snapshot mode, the first snapshot must live until |
| 338 | * end of xact regardless of what the caller does with it, so we must |
| 339 | * make a copy of it rather than returning CurrentSnapshotData |
| 340 | * directly. Furthermore, if we're running in serializable mode, |
| 341 | * predicate.c needs to wrap the snapshot fetch in its own processing. |
| 342 | */ |
| 343 | if (IsolationUsesXactSnapshot()) |
| 344 | { |
| 345 | /* First, create the snapshot in CurrentSnapshotData */ |
| 346 | if (IsolationIsSerializable()) |
| 347 | CurrentSnapshot = GetSerializableTransactionSnapshot(&CurrentSnapshotData); |
| 348 | else |
| 349 | CurrentSnapshot = GetSnapshotData(&CurrentSnapshotData); |
| 350 | /* Make a saved copy */ |
| 351 | CurrentSnapshot = CopySnapshot(CurrentSnapshot); |
| 352 | FirstXactSnapshot = CurrentSnapshot; |
| 353 | /* Mark it as "registered" in FirstXactSnapshot */ |
| 354 | FirstXactSnapshot->regd_count++; |
| 355 | pairingheap_add(&RegisteredSnapshots, &FirstXactSnapshot->ph_node); |
| 356 | } |
| 357 | else |
| 358 | CurrentSnapshot = GetSnapshotData(&CurrentSnapshotData); |
| 359 | |
| 360 | FirstSnapshotSet = true; |
| 361 | return CurrentSnapshot; |
| 362 | } |
| 363 | |
| 364 | if (IsolationUsesXactSnapshot()) |
| 365 | return CurrentSnapshot; |
| 366 | |
| 367 | /* Don't allow catalog snapshot to be older than xact snapshot. */ |
| 368 | InvalidateCatalogSnapshot(); |
| 369 | |
| 370 | CurrentSnapshot = GetSnapshotData(&CurrentSnapshotData); |
| 371 | |
| 372 | return CurrentSnapshot; |
| 373 | } |
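/*
 * Typical caller pattern, as a simplified sketch of what the main query
 * loop does (error handling omitted):
 *
 *		PushActiveSnapshot(GetTransactionSnapshot());
 *		... plan and execute the query ...
 *		PopActiveSnapshot();
 *
 * The push makes a copy when handed one of the static snapshots, so later
 * CommandCounterIncrement() or GetTransactionSnapshot() calls don't change
 * what the already-running query sees.
 */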
| 374 | |
| 375 | /* |
| 376 | * GetLatestSnapshot |
| 377 | * Get a snapshot that is up-to-date as of the current instant, |
| 378 | * even if we are executing in transaction-snapshot mode. |
| 379 | */ |
| 380 | Snapshot |
| 381 | GetLatestSnapshot(void) |
| 382 | { |
| 383 | /* |
| 384 | * We might be able to relax this, but nothing that could otherwise work |
| 385 | * needs it. |
| 386 | */ |
| 387 | if (IsInParallelMode()) |
| 388 | elog(ERROR, |
| 389 | "cannot update SecondarySnapshot during a parallel operation" ); |
| 390 | |
| 391 | /* |
| 392 | * So far there are no cases requiring support for GetLatestSnapshot() |
| 393 | * during logical decoding, but it wouldn't be hard to add if required. |
| 394 | */ |
| 395 | Assert(!HistoricSnapshotActive()); |
| 396 | |
| 397 | /* If first call in transaction, go ahead and set the xact snapshot */ |
| 398 | if (!FirstSnapshotSet) |
| 399 | return GetTransactionSnapshot(); |
| 400 | |
| 401 | SecondarySnapshot = GetSnapshotData(&SecondarySnapshotData); |
| 402 | |
| 403 | return SecondarySnapshot; |
| 404 | } |
| 405 | |
| 406 | /* |
| 407 | * GetOldestSnapshot |
| 408 | * |
| 409 | * Get the transaction's oldest known snapshot, as judged by the LSN. |
| 410 | * Will return NULL if there are no active or registered snapshots. |
| 411 | */ |
| 412 | Snapshot |
| 413 | GetOldestSnapshot(void) |
| 414 | { |
| 415 | Snapshot OldestRegisteredSnapshot = NULL; |
| 416 | XLogRecPtr RegisteredLSN = InvalidXLogRecPtr; |
| 417 | |
| 418 | if (!pairingheap_is_empty(&RegisteredSnapshots)) |
| 419 | { |
| 420 | OldestRegisteredSnapshot = pairingheap_container(SnapshotData, ph_node, |
| 421 | pairingheap_first(&RegisteredSnapshots)); |
| 422 | RegisteredLSN = OldestRegisteredSnapshot->lsn; |
| 423 | } |
| 424 | |
| 425 | if (OldestActiveSnapshot != NULL) |
| 426 | { |
| 427 | XLogRecPtr ActiveLSN = OldestActiveSnapshot->as_snap->lsn; |
| 428 | |
| 429 | if (XLogRecPtrIsInvalid(RegisteredLSN) || RegisteredLSN > ActiveLSN) |
| 430 | return OldestActiveSnapshot->as_snap; |
| 431 | } |
| 432 | |
| 433 | return OldestRegisteredSnapshot; |
| 434 | } |
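/*
 * Usage sketch (hypothetical caller): code that merely needs "a snapshot at
 * least as old as anything this transaction can still see" can do
 *
 *		Snapshot	oldest = GetOldestSnapshot();
 *
 *		if (oldest == NULL)
 *			elog(ERROR, "no known snapshots");
 *		... use oldest->lsn / oldest->whenTaken as a conservative bound ...
 *
 * remembering that "oldest" here is judged by LSN, not by xmin.
 */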
| 435 | |
| 436 | /* |
| 437 | * GetCatalogSnapshot |
| 438 | * Get a snapshot that is sufficiently up-to-date for scan of the |
| 439 | * system catalog with the specified OID. |
| 440 | */ |
| 441 | Snapshot |
| 442 | GetCatalogSnapshot(Oid relid) |
| 443 | { |
| 444 | /* |
| 445 | * Return historic snapshot while we're doing logical decoding, so we can |
| 446 | * see the appropriate state of the catalog. |
| 447 | * |
| 448 | * This is the primary reason for needing to reset the system caches after |
| 449 | * finishing decoding. |
| 450 | */ |
| 451 | if (HistoricSnapshotActive()) |
| 452 | return HistoricSnapshot; |
| 453 | |
| 454 | return GetNonHistoricCatalogSnapshot(relid); |
| 455 | } |
| 456 | |
| 457 | /* |
| 458 | * GetNonHistoricCatalogSnapshot |
| 459 | * Get a snapshot that is sufficiently up-to-date for scan of the system |
| 460 | * catalog with the specified OID, even while historic snapshots are set |
| 461 | * up. |
| 462 | */ |
| 463 | Snapshot |
| 464 | GetNonHistoricCatalogSnapshot(Oid relid) |
| 465 | { |
| 466 | /* |
| 467 | * If the caller is trying to scan a relation that has no syscache, no |
| 468 | * catcache invalidations will be sent when it is updated. For a few key |
| 469 | * relations, snapshot invalidations are sent instead. If we're trying to |
| 470 | * scan a relation for which neither catcache nor snapshot invalidations |
| 471 | * are sent, we must refresh the snapshot every time. |
| 472 | */ |
| 473 | if (CatalogSnapshot && |
| 474 | !RelationInvalidatesSnapshotsOnly(relid) && |
| 475 | !RelationHasSysCache(relid)) |
| 476 | InvalidateCatalogSnapshot(); |
| 477 | |
| 478 | if (CatalogSnapshot == NULL) |
| 479 | { |
| 480 | /* Get new snapshot. */ |
| 481 | CatalogSnapshot = GetSnapshotData(&CatalogSnapshotData); |
| 482 | |
| 483 | /* |
| 484 | * Make sure the catalog snapshot will be accounted for in decisions |
| 485 | * about advancing PGXACT->xmin. We could apply RegisterSnapshot, but |
| 486 | * that would result in making a physical copy, which is overkill; and |
| 487 | * it would also create a dependency on some resource owner, which we |
| 488 | * do not want for reasons explained at the head of this file. Instead |
| 489 | * just shove the CatalogSnapshot into the pairing heap manually. This |
| 490 | * has to be reversed in InvalidateCatalogSnapshot, of course. |
| 491 | * |
| 492 | * NB: it had better be impossible for this to throw error, since the |
| 493 | * CatalogSnapshot pointer is already valid. |
| 494 | */ |
| 495 | pairingheap_add(&RegisteredSnapshots, &CatalogSnapshot->ph_node); |
| 496 | } |
| 497 | |
| 498 | return CatalogSnapshot; |
| 499 | } |
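/*
 * Illustrative use, a condensed sketch of the registration pattern the
 * catalog-scan code follows (details differ in the real callers):
 *
 *		Snapshot	snapshot;
 *
 *		snapshot = RegisterSnapshot(GetCatalogSnapshot(relid));
 *		... run the catalog scan under snapshot ...
 *		UnregisterSnapshot(snapshot);
 *
 * Registering takes a private copy, so the scan's view stays valid and its
 * xmin stays honored even if the CatalogSnapshot pointer itself gets
 * invalidated mid-scan.
 */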
| 500 | |
| 501 | /* |
| 502 | * InvalidateCatalogSnapshot |
| 503 | * Mark the current catalog snapshot, if any, as invalid |
| 504 | * |
| 505 | * We could change this API to allow the caller to provide more fine-grained |
| 506 | * invalidation details, so that a change to relation A wouldn't prevent us |
| 507 | * from using our cached snapshot to scan relation B, but so far there's no |
| 508 | * evidence that the CPU cycles we spent tracking such fine details would be |
| 509 | * well-spent. |
| 510 | */ |
| 511 | void |
| 512 | InvalidateCatalogSnapshot(void) |
| 513 | { |
| 514 | if (CatalogSnapshot) |
| 515 | { |
| 516 | pairingheap_remove(&RegisteredSnapshots, &CatalogSnapshot->ph_node); |
| 517 | CatalogSnapshot = NULL; |
| 518 | SnapshotResetXmin(); |
| 519 | } |
| 520 | } |
| 521 | |
| 522 | /* |
| 523 | * InvalidateCatalogSnapshotConditionally |
| 524 | * Drop catalog snapshot if it's the only one we have |
| 525 | * |
| 526 | * This is called when we are about to wait for client input, so we don't |
| 527 | * want to continue holding the catalog snapshot if it might mean that the |
| 528 | * global xmin horizon can't advance. However, if there are other snapshots |
| 529 | * still active or registered, the catalog snapshot isn't likely to be the |
| 530 | * oldest one, so we might as well keep it. |
| 531 | */ |
| 532 | void |
| 533 | InvalidateCatalogSnapshotConditionally(void) |
| 534 | { |
| 535 | if (CatalogSnapshot && |
| 536 | ActiveSnapshot == NULL && |
| 537 | pairingheap_is_singular(&RegisteredSnapshots)) |
| 538 | InvalidateCatalogSnapshot(); |
| 539 | } |
| 540 | |
| 541 | /* |
| 542 | * SnapshotSetCommandId |
| 543 | * Propagate CommandCounterIncrement into the static snapshots, if set |
| 544 | */ |
| 545 | void |
| 546 | SnapshotSetCommandId(CommandId curcid) |
| 547 | { |
| 548 | if (!FirstSnapshotSet) |
| 549 | return; |
| 550 | |
| 551 | if (CurrentSnapshot) |
| 552 | CurrentSnapshot->curcid = curcid; |
| 553 | if (SecondarySnapshot) |
| 554 | SecondarySnapshot->curcid = curcid; |
| 555 | /* Should we do the same with CatalogSnapshot? */ |
| 556 | } |
| 557 | |
| 558 | /* |
| 559 | * SetTransactionSnapshot |
| 560 | * Set the transaction's snapshot from an imported MVCC snapshot. |
| 561 | * |
| 562 | * Note that this is very closely tied to GetTransactionSnapshot --- it |
| 563 | * must take care of all the same considerations as the first-snapshot case |
| 564 | * in GetTransactionSnapshot. |
| 565 | */ |
| 566 | static void |
| 567 | SetTransactionSnapshot(Snapshot sourcesnap, VirtualTransactionId *sourcevxid, |
| 568 | int sourcepid, PGPROC *sourceproc) |
| 569 | { |
| 570 | /* Caller should have checked this already */ |
| 571 | Assert(!FirstSnapshotSet); |
| 572 | |
| 573 | /* Better do this to ensure following Assert succeeds. */ |
| 574 | InvalidateCatalogSnapshot(); |
| 575 | |
| 576 | Assert(pairingheap_is_empty(&RegisteredSnapshots)); |
| 577 | Assert(FirstXactSnapshot == NULL); |
| 578 | Assert(!HistoricSnapshotActive()); |
| 579 | |
| 580 | /* |
| 581 | * Even though we are not going to use the snapshot it computes, we must |
| 582 | * call GetSnapshotData, for two reasons: (1) to be sure that |
| 583 | * CurrentSnapshotData's XID arrays have been allocated, and (2) to update |
| 584 | * RecentXmin and RecentGlobalXmin. (We could alternatively include those |
| 585 | * two variables in exported snapshot files, but it seems better to have |
| 586 | * snapshot importers compute reasonably up-to-date values for them.) |
| 587 | */ |
| 588 | CurrentSnapshot = GetSnapshotData(&CurrentSnapshotData); |
| 589 | |
| 590 | /* |
| 591 | * Now copy appropriate fields from the source snapshot. |
| 592 | */ |
| 593 | CurrentSnapshot->xmin = sourcesnap->xmin; |
| 594 | CurrentSnapshot->xmax = sourcesnap->xmax; |
| 595 | CurrentSnapshot->xcnt = sourcesnap->xcnt; |
| 596 | Assert(sourcesnap->xcnt <= GetMaxSnapshotXidCount()); |
| 597 | memcpy(CurrentSnapshot->xip, sourcesnap->xip, |
| 598 | sourcesnap->xcnt * sizeof(TransactionId)); |
| 599 | CurrentSnapshot->subxcnt = sourcesnap->subxcnt; |
| 600 | Assert(sourcesnap->subxcnt <= GetMaxSnapshotSubxidCount()); |
| 601 | memcpy(CurrentSnapshot->subxip, sourcesnap->subxip, |
| 602 | sourcesnap->subxcnt * sizeof(TransactionId)); |
| 603 | CurrentSnapshot->suboverflowed = sourcesnap->suboverflowed; |
| 604 | CurrentSnapshot->takenDuringRecovery = sourcesnap->takenDuringRecovery; |
| 605 | /* NB: curcid should NOT be copied, it's a local matter */ |
| 606 | |
| 607 | /* |
| 608 | * Now we have to fix what GetSnapshotData did with MyPgXact->xmin and |
| 609 | * TransactionXmin. There is a race condition: to make sure we are not |
| 610 | * causing the global xmin to go backwards, we have to test that the |
| 611 | * source transaction is still running, and that has to be done |
| 612 | * atomically. So let procarray.c do it. |
| 613 | * |
| 614 | * Note: in serializable mode, predicate.c will do this a second time. It |
| 615 | * doesn't seem worth contorting the logic here to avoid two calls, |
| 616 | * especially since it's not clear that predicate.c *must* do this. |
| 617 | */ |
| 618 | if (sourceproc != NULL) |
| 619 | { |
| 620 | if (!ProcArrayInstallRestoredXmin(CurrentSnapshot->xmin, sourceproc)) |
| 621 | ereport(ERROR, |
| 622 | (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), |
| 623 | errmsg("could not import the requested snapshot" ), |
| 624 | errdetail("The source transaction is not running anymore." ))); |
| 625 | } |
| 626 | else if (!ProcArrayInstallImportedXmin(CurrentSnapshot->xmin, sourcevxid)) |
| 627 | ereport(ERROR, |
| 628 | (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), |
| 629 | errmsg("could not import the requested snapshot" ), |
| 630 | errdetail("The source process with PID %d is not running anymore." , |
| 631 | sourcepid))); |
| 632 | |
| 633 | /* |
| 634 | * In transaction-snapshot mode, the first snapshot must live until end of |
| 635 | * xact, so we must make a copy of it. Furthermore, if we're running in |
| 636 | * serializable mode, predicate.c needs to do its own processing. |
| 637 | */ |
| 638 | if (IsolationUsesXactSnapshot()) |
| 639 | { |
| 640 | if (IsolationIsSerializable()) |
| 641 | SetSerializableTransactionSnapshot(CurrentSnapshot, sourcevxid, |
| 642 | sourcepid); |
| 643 | /* Make a saved copy */ |
| 644 | CurrentSnapshot = CopySnapshot(CurrentSnapshot); |
| 645 | FirstXactSnapshot = CurrentSnapshot; |
| 646 | /* Mark it as "registered" in FirstXactSnapshot */ |
| 647 | FirstXactSnapshot->regd_count++; |
| 648 | pairingheap_add(&RegisteredSnapshots, &FirstXactSnapshot->ph_node); |
| 649 | } |
| 650 | |
| 651 | FirstSnapshotSet = true; |
| 652 | } |
| 653 | |
| 654 | /* |
| 655 | * CopySnapshot |
| 656 | * Copy the given snapshot. |
| 657 | * |
| 658 | * The copy is palloc'd in TopTransactionContext and has initial refcounts set |
| 659 | * to 0. The returned snapshot has the copied flag set. |
| 660 | */ |
| 661 | static Snapshot |
| 662 | CopySnapshot(Snapshot snapshot) |
| 663 | { |
| 664 | Snapshot newsnap; |
| 665 | Size subxipoff; |
| 666 | Size size; |
| 667 | |
| 668 | Assert(snapshot != InvalidSnapshot); |
| 669 | |
| 670 | /* We allocate any XID arrays needed in the same palloc block. */ |
| 671 | size = subxipoff = sizeof(SnapshotData) + |
| 672 | snapshot->xcnt * sizeof(TransactionId); |
| 673 | if (snapshot->subxcnt > 0) |
| 674 | size += snapshot->subxcnt * sizeof(TransactionId); |
| 675 | |
| 676 | newsnap = (Snapshot) MemoryContextAlloc(TopTransactionContext, size); |
| 677 | memcpy(newsnap, snapshot, sizeof(SnapshotData)); |
| 678 | |
| 679 | newsnap->regd_count = 0; |
| 680 | newsnap->active_count = 0; |
| 681 | newsnap->copied = true; |
| 682 | |
| 683 | /* setup XID array */ |
| 684 | if (snapshot->xcnt > 0) |
| 685 | { |
| 686 | newsnap->xip = (TransactionId *) (newsnap + 1); |
| 687 | memcpy(newsnap->xip, snapshot->xip, |
| 688 | snapshot->xcnt * sizeof(TransactionId)); |
| 689 | } |
| 690 | else |
| 691 | newsnap->xip = NULL; |
| 692 | |
| 693 | /* |
| 694 | * Setup subXID array. Don't bother to copy it if it had overflowed, |
| 695 | * though, because it's not used anywhere in that case. Except if it's a |
| 696 | * snapshot taken during recovery; all the top-level XIDs are in subxip as |
| 697 | * well in that case, so we mustn't lose them. |
| 698 | */ |
| 699 | if (snapshot->subxcnt > 0 && |
| 700 | (!snapshot->suboverflowed || snapshot->takenDuringRecovery)) |
| 701 | { |
| 702 | newsnap->subxip = (TransactionId *) ((char *) newsnap + subxipoff); |
| 703 | memcpy(newsnap->subxip, snapshot->subxip, |
| 704 | snapshot->subxcnt * sizeof(TransactionId)); |
| 705 | } |
| 706 | else |
| 707 | newsnap->subxip = NULL; |
| 708 | |
| 709 | return newsnap; |
| 710 | } |
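/*
 * Layout of the single palloc chunk built above:
 *
 *		+------------------------+  <- newsnap
 *		| SnapshotData           |
 *		+------------------------+  <- newsnap->xip (if xcnt > 0)
 *		| xcnt TransactionIds    |
 *		+------------------------+  <- newsnap->subxip (if copied)
 *		| subxcnt TransactionIds |
 *		+------------------------+
 */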
| 711 | |
| 712 | /* |
| 713 | * FreeSnapshot |
| 714 | * Free the memory associated with a snapshot. |
| 715 | */ |
| 716 | static void |
| 717 | FreeSnapshot(Snapshot snapshot) |
| 718 | { |
| 719 | Assert(snapshot->regd_count == 0); |
| 720 | Assert(snapshot->active_count == 0); |
| 721 | Assert(snapshot->copied); |
| 722 | |
| 723 | pfree(snapshot); |
| 724 | } |
| 725 | |
| 726 | /* |
| 727 | * PushActiveSnapshot |
| 728 | * Set the given snapshot as the current active snapshot |
| 729 | * |
| 730 | * If the passed snapshot is a statically-allocated one, or it is possibly |
| 731 | * subject to a future command counter update, create a new long-lived copy |
| 732 | * with active refcount=1. Otherwise, only increment the refcount. |
| 733 | */ |
| 734 | void |
| 735 | PushActiveSnapshot(Snapshot snap) |
| 736 | { |
| 737 | ActiveSnapshotElt *newactive; |
| 738 | |
| 739 | Assert(snap != InvalidSnapshot); |
| 740 | |
| 741 | newactive = MemoryContextAlloc(TopTransactionContext, sizeof(ActiveSnapshotElt)); |
| 742 | |
| 743 | /* |
| 744 | * Checking SecondarySnapshot is probably useless here, but it seems |
| 745 | * better to be sure. |
| 746 | */ |
| 747 | if (snap == CurrentSnapshot || snap == SecondarySnapshot || !snap->copied) |
| 748 | newactive->as_snap = CopySnapshot(snap); |
| 749 | else |
| 750 | newactive->as_snap = snap; |
| 751 | |
| 752 | newactive->as_next = ActiveSnapshot; |
| 753 | newactive->as_level = GetCurrentTransactionNestLevel(); |
| 754 | |
| 755 | newactive->as_snap->active_count++; |
| 756 | |
| 757 | ActiveSnapshot = newactive; |
| 758 | if (OldestActiveSnapshot == NULL) |
| 759 | OldestActiveSnapshot = ActiveSnapshot; |
| 760 | } |
| 761 | |
| 762 | /* |
| 763 | * PushCopiedSnapshot |
| 764 | * As above, except forcibly copy the presented snapshot. |
| 765 | * |
| 766 | * This should be used when the ActiveSnapshot has to be modifiable, for |
| 767 | * example if the caller intends to call UpdateActiveSnapshotCommandId. |
| 768 | * The new snapshot will be released when popped from the stack. |
| 769 | */ |
| 770 | void |
| 771 | PushCopiedSnapshot(Snapshot snapshot) |
| 772 | { |
| 773 | PushActiveSnapshot(CopySnapshot(snapshot)); |
| 774 | } |
| 775 | |
| 776 | /* |
| 777 | * UpdateActiveSnapshotCommandId |
| 778 | * |
| 779 | * Update the current CID of the active snapshot. This can only be applied |
| 780 | * to a snapshot that is not referenced elsewhere. |
| 781 | */ |
| 782 | void |
| 783 | UpdateActiveSnapshotCommandId(void) |
| 784 | { |
| 785 | CommandId save_curcid, |
| 786 | curcid; |
| 787 | |
| 788 | Assert(ActiveSnapshot != NULL); |
| 789 | Assert(ActiveSnapshot->as_snap->active_count == 1); |
| 790 | Assert(ActiveSnapshot->as_snap->regd_count == 0); |
| 791 | |
| 792 | /* |
| 793 | * Don't allow modification of the active snapshot during parallel |
| 794 |  * operation. We share the snapshot with worker backends at the beginning
| 795 | * of parallel operation, so any change to the snapshot can lead to |
| 796 | * inconsistencies. We have other defenses against |
| 797 | * CommandCounterIncrement, but there are a few places that call this |
| 798 | * directly, so we put an additional guard here. |
| 799 | */ |
| 800 | save_curcid = ActiveSnapshot->as_snap->curcid; |
| 801 | curcid = GetCurrentCommandId(false); |
| 802 | if (IsInParallelMode() && save_curcid != curcid) |
| 803 | elog(ERROR, "cannot modify commandid in active snapshot during a parallel operation" ); |
| 804 | ActiveSnapshot->as_snap->curcid = curcid; |
| 805 | } |
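/*
 * Typical pairing, per the PushCopiedSnapshot comment above (a sketch, not
 * a quote of any particular caller):
 *
 *		PushCopiedSnapshot(GetActiveSnapshot());
 *		UpdateActiveSnapshotCommandId();
 *		... execute something that must see the work of earlier commands ...
 *		PopActiveSnapshot();
 *
 * The forced copy is what guarantees the asserted active_count == 1 and
 * regd_count == 0 preconditions hold for the snapshot being modified.
 */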
| 806 | |
| 807 | /* |
| 808 | * PopActiveSnapshot |
| 809 | * |
| 810 | * Remove the topmost snapshot from the active snapshot stack, decrementing the |
| 811 | * reference count, and free it if this was the last reference. |
| 812 | */ |
| 813 | void |
| 814 | PopActiveSnapshot(void) |
| 815 | { |
| 816 | ActiveSnapshotElt *newstack; |
| 817 | |
| 818 | newstack = ActiveSnapshot->as_next; |
| 819 | |
| 820 | Assert(ActiveSnapshot->as_snap->active_count > 0); |
| 821 | |
| 822 | ActiveSnapshot->as_snap->active_count--; |
| 823 | |
| 824 | if (ActiveSnapshot->as_snap->active_count == 0 && |
| 825 | ActiveSnapshot->as_snap->regd_count == 0) |
| 826 | FreeSnapshot(ActiveSnapshot->as_snap); |
| 827 | |
| 828 | pfree(ActiveSnapshot); |
| 829 | ActiveSnapshot = newstack; |
| 830 | if (ActiveSnapshot == NULL) |
| 831 | OldestActiveSnapshot = NULL; |
| 832 | |
| 833 | SnapshotResetXmin(); |
| 834 | } |
| 835 | |
| 836 | /* |
| 837 | * GetActiveSnapshot |
| 838 | * Return the topmost snapshot in the Active stack. |
| 839 | */ |
| 840 | Snapshot |
| 841 | GetActiveSnapshot(void) |
| 842 | { |
| 843 | Assert(ActiveSnapshot != NULL); |
| 844 | |
| 845 | return ActiveSnapshot->as_snap; |
| 846 | } |
| 847 | |
| 848 | /* |
| 849 | * ActiveSnapshotSet |
| 850 | * Return whether there is at least one snapshot in the Active stack |
| 851 | */ |
| 852 | bool |
| 853 | ActiveSnapshotSet(void) |
| 854 | { |
| 855 | return ActiveSnapshot != NULL; |
| 856 | } |
| 857 | |
| 858 | /* |
| 859 | * RegisterSnapshot |
| 860 | * Register a snapshot as being in use by the current resource owner |
| 861 | * |
| 862 | * If InvalidSnapshot is passed, it is not registered. |
| 863 | */ |
| 864 | Snapshot |
| 865 | RegisterSnapshot(Snapshot snapshot) |
| 866 | { |
| 867 | if (snapshot == InvalidSnapshot) |
| 868 | return InvalidSnapshot; |
| 869 | |
| 870 | return RegisterSnapshotOnOwner(snapshot, CurrentResourceOwner); |
| 871 | } |
| 872 | |
| 873 | /* |
| 874 | * RegisterSnapshotOnOwner |
| 875 | * As above, but use the specified resource owner |
| 876 | */ |
| 877 | Snapshot |
| 878 | RegisterSnapshotOnOwner(Snapshot snapshot, ResourceOwner owner) |
| 879 | { |
| 880 | Snapshot snap; |
| 881 | |
| 882 | if (snapshot == InvalidSnapshot) |
| 883 | return InvalidSnapshot; |
| 884 | |
| 885 | /* Static snapshot? Create a persistent copy */ |
| 886 | snap = snapshot->copied ? snapshot : CopySnapshot(snapshot); |
| 887 | |
| 888 | /* and tell resowner.c about it */ |
| 889 | ResourceOwnerEnlargeSnapshots(owner); |
| 890 | snap->regd_count++; |
| 891 | ResourceOwnerRememberSnapshot(owner, snap); |
| 892 | |
| 893 | if (snap->regd_count == 1) |
| 894 | pairingheap_add(&RegisteredSnapshots, &snap->ph_node); |
| 895 | |
| 896 | return snap; |
| 897 | } |
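/*
 * Usage sketch: registering on an explicitly-chosen owner looks the same as
 * RegisterSnapshot, except the caller supplies the ResourceOwner that will
 * be responsible for the eventual release ("portalOwner" here is a
 * hypothetical owner managed by the caller):
 *
 *		snap = RegisterSnapshotOnOwner(snapshot, portalOwner);
 *		...
 *		UnregisterSnapshotFromOwner(snap, portalOwner);
 *
 * Note that the *returned* snapshot must be passed to the unregister call,
 * since registration may have made a copy.
 */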
| 898 | |
| 899 | /* |
| 900 | * UnregisterSnapshot |
| 901 | * |
| 902 | * Decrement the reference count of a snapshot, remove the corresponding |
| 903 | * reference from CurrentResourceOwner, and free the snapshot if no more |
| 904 | * references remain. |
| 905 | */ |
| 906 | void |
| 907 | UnregisterSnapshot(Snapshot snapshot) |
| 908 | { |
| 909 | if (snapshot == NULL) |
| 910 | return; |
| 911 | |
| 912 | UnregisterSnapshotFromOwner(snapshot, CurrentResourceOwner); |
| 913 | } |
| 914 | |
| 915 | /* |
| 916 | * UnregisterSnapshotFromOwner |
| 917 | * As above, but use the specified resource owner |
| 918 | */ |
| 919 | void |
| 920 | UnregisterSnapshotFromOwner(Snapshot snapshot, ResourceOwner owner) |
| 921 | { |
| 922 | if (snapshot == NULL) |
| 923 | return; |
| 924 | |
| 925 | Assert(snapshot->regd_count > 0); |
| 926 | Assert(!pairingheap_is_empty(&RegisteredSnapshots)); |
| 927 | |
| 928 | ResourceOwnerForgetSnapshot(owner, snapshot); |
| 929 | |
| 930 | snapshot->regd_count--; |
| 931 | if (snapshot->regd_count == 0) |
| 932 | pairingheap_remove(&RegisteredSnapshots, &snapshot->ph_node); |
| 933 | |
| 934 | if (snapshot->regd_count == 0 && snapshot->active_count == 0) |
| 935 | { |
| 936 | FreeSnapshot(snapshot); |
| 937 | SnapshotResetXmin(); |
| 938 | } |
| 939 | } |
| 940 | |
| 941 | /* |
| 942 | * Comparison function for RegisteredSnapshots heap. Snapshots are ordered |
| 943 | * by xmin, so that the snapshot with smallest xmin is at the top. |
| 944 | */ |
| 945 | static int |
| 946 | xmin_cmp(const pairingheap_node *a, const pairingheap_node *b, void *arg) |
| 947 | { |
| 948 | const SnapshotData *asnap = pairingheap_const_container(SnapshotData, ph_node, a); |
| 949 | const SnapshotData *bsnap = pairingheap_const_container(SnapshotData, ph_node, b); |
| 950 | |
| 951 | if (TransactionIdPrecedes(asnap->xmin, bsnap->xmin)) |
| 952 | return 1; |
| 953 | else if (TransactionIdFollows(asnap->xmin, bsnap->xmin)) |
| 954 | return -1; |
| 955 | else |
| 956 | return 0; |
| 957 | } |
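/*
 * Note the sign convention: returning a positive value when asnap->xmin
 * precedes bsnap->xmin is what puts the snapshot with the smallest xmin at
 * the top of RegisteredSnapshots (see the comment on that variable above),
 * which is what SnapshotResetXmin and GetOldestSnapshot depend on.
 */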
| 958 | |
| 959 | /* |
| 960 | * Get current RecentGlobalXmin value, as a FullTransactionId. |
| 961 | */ |
| 962 | FullTransactionId |
| 963 | GetFullRecentGlobalXmin(void) |
| 964 | { |
| 965 | FullTransactionId nextxid_full; |
| 966 | uint32 nextxid_epoch; |
| 967 | TransactionId nextxid_xid; |
| 968 | uint32 epoch; |
| 969 | |
| 970 | Assert(TransactionIdIsNormal(RecentGlobalXmin)); |
| 971 | |
| 972 | /* |
| 973 | * Compute the epoch from the next XID's epoch. This relies on the fact |
| 974 | * that RecentGlobalXmin must be within the 2 billion XID horizon from the |
| 975 | * next XID. |
| 976 | */ |
| 977 | nextxid_full = ReadNextFullTransactionId(); |
| 978 | nextxid_epoch = EpochFromFullTransactionId(nextxid_full); |
| 979 | nextxid_xid = XidFromFullTransactionId(nextxid_full); |
| 980 | |
| 981 | if (RecentGlobalXmin > nextxid_xid) |
| 982 | epoch = nextxid_epoch - 1; |
| 983 | else |
| 984 | epoch = nextxid_epoch; |
| 985 | |
| 986 | return FullTransactionIdFromEpochAndXid(epoch, RecentGlobalXmin); |
| 987 | } |
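/*
 * Worked example of the epoch adjustment above (numbers purely
 * illustrative): if ReadNextFullTransactionId() reports epoch 5, xid 100,
 * and RecentGlobalXmin is 4000000000, then RecentGlobalXmin is numerically
 * greater than the next xid and so must have been assigned in the previous
 * epoch; we therefore return the FullTransactionId for (epoch 4,
 * xid 4000000000).
 */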
| 988 | |
| 989 | /* |
| 990 | * SnapshotResetXmin |
| 991 | * |
| 992 | * If there are no more snapshots, we can reset our PGXACT->xmin to InvalidXid. |
| 993 | * Note we can do this without locking because we assume that storing an Xid |
| 994 | * is atomic. |
| 995 | * |
| 996 | * Even if there are some remaining snapshots, we may be able to advance our |
| 997 | * PGXACT->xmin to some degree. This typically happens when a portal is |
| 998 | * dropped. For efficiency, we only consider recomputing PGXACT->xmin when |
| 999 | * the active snapshot stack is empty; this allows us not to need to track |
| 1000 | * which active snapshot is oldest. |
| 1001 | * |
| 1002 | * Note: it's tempting to use GetOldestSnapshot() here so that we can include |
| 1003 | * active snapshots in the calculation. However, that compares by LSN not |
| 1004 | * xmin so it's not entirely clear that it's the same thing. Also, we'd be |
| 1005 | * critically dependent on the assumption that the bottommost active snapshot |
| 1006 | * stack entry has the oldest xmin. (Current uses of GetOldestSnapshot() are |
| 1007 | * not actually critical, but this would be.) |
| 1008 | */ |
| 1009 | static void |
| 1010 | SnapshotResetXmin(void) |
| 1011 | { |
| 1012 | Snapshot minSnapshot; |
| 1013 | |
| 1014 | if (ActiveSnapshot != NULL) |
| 1015 | return; |
| 1016 | |
| 1017 | if (pairingheap_is_empty(&RegisteredSnapshots)) |
| 1018 | { |
| 1019 | MyPgXact->xmin = InvalidTransactionId; |
| 1020 | return; |
| 1021 | } |
| 1022 | |
| 1023 | minSnapshot = pairingheap_container(SnapshotData, ph_node, |
| 1024 | pairingheap_first(&RegisteredSnapshots)); |
| 1025 | |
| 1026 | if (TransactionIdPrecedes(MyPgXact->xmin, minSnapshot->xmin)) |
| 1027 | MyPgXact->xmin = minSnapshot->xmin; |
| 1028 | } |
| 1029 | |
| 1030 | /* |
| 1031 | * AtSubCommit_Snapshot |
| 1032 | */ |
| 1033 | void |
| 1034 | AtSubCommit_Snapshot(int level) |
| 1035 | { |
| 1036 | ActiveSnapshotElt *active; |
| 1037 | |
| 1038 | /* |
| 1039 | * Relabel the active snapshots set in this subtransaction as though they |
| 1040 | * are owned by the parent subxact. |
| 1041 | */ |
| 1042 | for (active = ActiveSnapshot; active != NULL; active = active->as_next) |
| 1043 | { |
| 1044 | if (active->as_level < level) |
| 1045 | break; |
| 1046 | active->as_level = level - 1; |
| 1047 | } |
| 1048 | } |
| 1049 | |
| 1050 | /* |
| 1051 | * AtSubAbort_Snapshot |
| 1052 | * Clean up snapshots after a subtransaction abort |
| 1053 | */ |
| 1054 | void |
| 1055 | AtSubAbort_Snapshot(int level) |
| 1056 | { |
| 1057 | /* Forget the active snapshots set by this subtransaction */ |
| 1058 | while (ActiveSnapshot && ActiveSnapshot->as_level >= level) |
| 1059 | { |
| 1060 | ActiveSnapshotElt *next; |
| 1061 | |
| 1062 | next = ActiveSnapshot->as_next; |
| 1063 | |
| 1064 | /* |
| 1065 | * Decrement the snapshot's active count. If it's still registered or |
| 1066 | * marked as active by an outer subtransaction, we can't free it yet. |
| 1067 | */ |
| 1068 | Assert(ActiveSnapshot->as_snap->active_count >= 1); |
| 1069 | ActiveSnapshot->as_snap->active_count -= 1; |
| 1070 | |
| 1071 | if (ActiveSnapshot->as_snap->active_count == 0 && |
| 1072 | ActiveSnapshot->as_snap->regd_count == 0) |
| 1073 | FreeSnapshot(ActiveSnapshot->as_snap); |
| 1074 | |
| 1075 | /* and free the stack element */ |
| 1076 | pfree(ActiveSnapshot); |
| 1077 | |
| 1078 | ActiveSnapshot = next; |
| 1079 | if (ActiveSnapshot == NULL) |
| 1080 | OldestActiveSnapshot = NULL; |
| 1081 | } |
| 1082 | |
| 1083 | SnapshotResetXmin(); |
| 1084 | } |
| 1085 | |
| 1086 | /* |
| 1087 | * AtEOXact_Snapshot |
| 1088 | * Snapshot manager's cleanup function for end of transaction |
| 1089 | */ |
| 1090 | void |
| 1091 | AtEOXact_Snapshot(bool isCommit, bool resetXmin) |
| 1092 | { |
| 1093 | /* |
| 1094 | * In transaction-snapshot mode we must release our privately-managed |
| 1095 | * reference to the transaction snapshot. We must remove it from |
| 1096 | * RegisteredSnapshots to keep the check below happy. But we don't bother |
| 1097 | * to do FreeSnapshot, for two reasons: the memory will go away with |
| 1098 | * TopTransactionContext anyway, and if someone has left the snapshot |
| 1099 | * stacked as active, we don't want the code below to be chasing through a |
| 1100 | * dangling pointer. |
| 1101 | */ |
| 1102 | if (FirstXactSnapshot != NULL) |
| 1103 | { |
| 1104 | Assert(FirstXactSnapshot->regd_count > 0); |
| 1105 | Assert(!pairingheap_is_empty(&RegisteredSnapshots)); |
| 1106 | pairingheap_remove(&RegisteredSnapshots, &FirstXactSnapshot->ph_node); |
| 1107 | } |
| 1108 | FirstXactSnapshot = NULL; |
| 1109 | |
| 1110 | /* |
| 1111 | * If we exported any snapshots, clean them up. |
| 1112 | */ |
| 1113 | if (exportedSnapshots != NIL) |
| 1114 | { |
| 1115 | ListCell *lc; |
| 1116 | |
| 1117 | /* |
| 1118 | * Get rid of the files. Unlink failure is only a WARNING because (1) |
| 1119 | * it's too late to abort the transaction, and (2) leaving a leaked |
| 1120 | * file around has little real consequence anyway. |
| 1121 | * |
| 1122 | * We also need to remove the snapshots from RegisteredSnapshots to |
| 1123 | * prevent a warning below. |
| 1124 | * |
| 1125 | * As with the FirstXactSnapshot, we don't need to free resources of |
| 1126 |  * the snapshot itself as it will go away with the memory context.
| 1127 | */ |
| 1128 | foreach(lc, exportedSnapshots) |
| 1129 | { |
| 1130 | ExportedSnapshot *esnap = (ExportedSnapshot *) lfirst(lc); |
| 1131 | |
| 1132 | if (unlink(esnap->snapfile)) |
| 1133 | elog(WARNING, "could not unlink file \"%s\": %m" , |
| 1134 | esnap->snapfile); |
| 1135 | |
| 1136 | pairingheap_remove(&RegisteredSnapshots, |
| 1137 | &esnap->snapshot->ph_node); |
| 1138 | } |
| 1139 | |
| 1140 | exportedSnapshots = NIL; |
| 1141 | } |
| 1142 | |
| 1143 | /* Drop catalog snapshot if any */ |
| 1144 | InvalidateCatalogSnapshot(); |
| 1145 | |
| 1146 | /* On commit, complain about leftover snapshots */ |
| 1147 | if (isCommit) |
| 1148 | { |
| 1149 | ActiveSnapshotElt *active; |
| 1150 | |
| 1151 | if (!pairingheap_is_empty(&RegisteredSnapshots)) |
| 1152 | elog(WARNING, "registered snapshots seem to remain after cleanup" ); |
| 1153 | |
| 1154 | /* complain about unpopped active snapshots */ |
| 1155 | for (active = ActiveSnapshot; active != NULL; active = active->as_next) |
| 1156 | elog(WARNING, "snapshot %p still active" , active); |
| 1157 | } |
| 1158 | |
| 1159 | /* |
| 1160 | * And reset our state. We don't need to free the memory explicitly -- |
| 1161 | * it'll go away with TopTransactionContext. |
| 1162 | */ |
| 1163 | ActiveSnapshot = NULL; |
| 1164 | OldestActiveSnapshot = NULL; |
| 1165 | pairingheap_reset(&RegisteredSnapshots); |
| 1166 | |
| 1167 | CurrentSnapshot = NULL; |
| 1168 | SecondarySnapshot = NULL; |
| 1169 | |
| 1170 | FirstSnapshotSet = false; |
| 1171 | |
| 1172 | /* |
| 1173 | * During normal commit processing, we call ProcArrayEndTransaction() to |
| 1174 | * reset the PgXact->xmin. That call happens prior to the call to |
| 1175 | * AtEOXact_Snapshot(), so we need not touch xmin here at all. |
| 1176 | */ |
| 1177 | if (resetXmin) |
| 1178 | SnapshotResetXmin(); |
| 1179 | |
| 1180 | Assert(resetXmin || MyPgXact->xmin == 0); |
| 1181 | } |
| 1182 | |
| 1183 | |
| 1184 | /* |
| 1185 | * ExportSnapshot |
| 1186 | * Export the snapshot to a file so that other backends can import it. |
| 1187 | * Returns the token (the file name) that can be used to import this |
| 1188 | * snapshot. |
| 1189 | */ |
| 1190 | char * |
| 1191 | ExportSnapshot(Snapshot snapshot) |
| 1192 | { |
| 1193 | TransactionId topXid; |
| 1194 | TransactionId *children; |
| 1195 | ExportedSnapshot *esnap; |
| 1196 | int nchildren; |
| 1197 | int addTopXid; |
| 1198 | StringInfoData buf; |
| 1199 | FILE *f; |
| 1200 | int i; |
| 1201 | MemoryContext oldcxt; |
| 1202 | char path[MAXPGPATH]; |
| 1203 | char pathtmp[MAXPGPATH]; |
| 1204 | |
| 1205 | /* |
| 1206 | * It's tempting to call RequireTransactionBlock here, since it's not very |
| 1207 | * useful to export a snapshot that will disappear immediately afterwards. |
| 1208 | * However, we haven't got enough information to do that, since we don't |
| 1209 | * know if we're at top level or not. For example, we could be inside a |
| 1210 | * plpgsql function that is going to fire off other transactions via |
| 1211 | * dblink. Rather than disallow perfectly legitimate usages, don't make a |
| 1212 | * check. |
| 1213 | * |
| 1214 | * Also note that we don't make any restriction on the transaction's |
| 1215 | * isolation level; however, importers must check the level if they are |
| 1216 | * serializable. |
| 1217 | */ |
| 1218 | |
| 1219 | /* |
| 1220 | * Get our transaction ID if there is one, to include in the snapshot. |
| 1221 | */ |
| 1222 | topXid = GetTopTransactionIdIfAny(); |
| 1223 | |
| 1224 | /* |
| 1225 | * We cannot export a snapshot from a subtransaction because there's no |
| 1226 | * easy way for importers to verify that the same subtransaction is still |
| 1227 | * running. |
| 1228 | */ |
| 1229 | if (IsSubTransaction()) |
| 1230 | ereport(ERROR, |
| 1231 | (errcode(ERRCODE_ACTIVE_SQL_TRANSACTION), |
| 1232 | errmsg("cannot export a snapshot from a subtransaction" ))); |
| 1233 | |
| 1234 | /* |
| 1235 | * We do however allow previous committed subtransactions to exist. |
| 1236 | * Importers of the snapshot must see them as still running, so get their |
| 1237 | * XIDs to add them to the snapshot. |
| 1238 | */ |
| 1239 | nchildren = xactGetCommittedChildren(&children); |
| 1240 | |
| 1241 | /* |
| 1242 | * Generate file path for the snapshot. We start numbering of snapshots |
| 1243 | * inside the transaction from 1. |
| 1244 | */ |
| 1245 | snprintf(path, sizeof(path), SNAPSHOT_EXPORT_DIR "/%08X-%08X-%d" , |
| 1246 | MyProc->backendId, MyProc->lxid, list_length(exportedSnapshots) + 1); |
| 1247 | |
| 1248 | /* |
| 1249 | * Copy the snapshot into TopTransactionContext, add it to the |
| 1250 | * exportedSnapshots list, and mark it pseudo-registered. We do this to |
| 1251 | * ensure that the snapshot's xmin is honored for the rest of the |
| 1252 | * transaction. |
| 1253 | */ |
| 1254 | snapshot = CopySnapshot(snapshot); |
| 1255 | |
| 1256 | oldcxt = MemoryContextSwitchTo(TopTransactionContext); |
| 1257 | esnap = (ExportedSnapshot *) palloc(sizeof(ExportedSnapshot)); |
| 1258 | esnap->snapfile = pstrdup(path); |
| 1259 | esnap->snapshot = snapshot; |
| 1260 | exportedSnapshots = lappend(exportedSnapshots, esnap); |
| 1261 | MemoryContextSwitchTo(oldcxt); |
| 1262 | |
| 1263 | snapshot->regd_count++; |
| 1264 | pairingheap_add(&RegisteredSnapshots, &snapshot->ph_node); |
| 1265 | |
| 1266 | /* |
| 1267 | * Fill buf with a text serialization of the snapshot, plus identification |
| 1268 | * data about this transaction. The format expected by ImportSnapshot is |
| 1269 | * pretty rigid: each line must be fieldname:value. |
| 1270 | */ |
| 1271 | initStringInfo(&buf); |
| 1272 | |
| 1273 | appendStringInfo(&buf, "vxid:%d/%u\n" , MyProc->backendId, MyProc->lxid); |
| 1274 | appendStringInfo(&buf, "pid:%d\n" , MyProcPid); |
| 1275 | appendStringInfo(&buf, "dbid:%u\n" , MyDatabaseId); |
| 1276 | appendStringInfo(&buf, "iso:%d\n" , XactIsoLevel); |
| 1277 | appendStringInfo(&buf, "ro:%d\n" , XactReadOnly); |
| 1278 | |
| 1279 | appendStringInfo(&buf, "xmin:%u\n" , snapshot->xmin); |
| 1280 | appendStringInfo(&buf, "xmax:%u\n" , snapshot->xmax); |
| 1281 | |
| 1282 | /* |
| 1283 | * We must include our own top transaction ID in the top-xid data, since |
| 1284 | * by definition we will still be running when the importing transaction |
| 1285 | * adopts the snapshot, but GetSnapshotData never includes our own XID in |
| 1286 | * the snapshot. (There must, therefore, be enough room to add it.) |
| 1287 | * |
| 1288 | * However, it could be that our topXid is after the xmax, in which case |
| 1289 | * we shouldn't include it because xip[] members are expected to be before |
| 1290 | * xmax. (We need not make the same check for subxip[] members, see |
| 1291 | * snapshot.h.) |
| 1292 | */ |
| 1293 | addTopXid = (TransactionIdIsValid(topXid) && |
| 1294 | TransactionIdPrecedes(topXid, snapshot->xmax)) ? 1 : 0; |
| 1295 | appendStringInfo(&buf, "xcnt:%d\n" , snapshot->xcnt + addTopXid); |
| 1296 | for (i = 0; i < snapshot->xcnt; i++) |
| 1297 | appendStringInfo(&buf, "xip:%u\n" , snapshot->xip[i]); |
| 1298 | if (addTopXid) |
| 1299 | appendStringInfo(&buf, "xip:%u\n" , topXid); |
| 1300 | |
| 1301 | /* |
| 1302 | * Similarly, we add our subcommitted child XIDs to the subxid data. Here, |
| 1303 | * we have to cope with possible overflow. |
| 1304 | */ |
| 1305 | if (snapshot->suboverflowed || |
| 1306 | snapshot->subxcnt + nchildren > GetMaxSnapshotSubxidCount()) |
| 1307 | appendStringInfoString(&buf, "sof:1\n" ); |
| 1308 | else |
| 1309 | { |
| 1310 | appendStringInfoString(&buf, "sof:0\n" ); |
| 1311 | appendStringInfo(&buf, "sxcnt:%d\n" , snapshot->subxcnt + nchildren); |
| 1312 | for (i = 0; i < snapshot->subxcnt; i++) |
| 1313 | appendStringInfo(&buf, "sxp:%u\n" , snapshot->subxip[i]); |
| 1314 | for (i = 0; i < nchildren; i++) |
| 1315 | appendStringInfo(&buf, "sxp:%u\n" , children[i]); |
| 1316 | } |
| 1317 | appendStringInfo(&buf, "rec:%u\n" , snapshot->takenDuringRecovery); |
| 1318 | |
| 1319 | /* |
| 1320 | * Now write the text representation into a file. We first write to a |
| 1321 | * ".tmp" filename, and rename to final filename if no error. This |
| 1322 | * ensures that no other backend can read an incomplete file |
| 1323 | * (ImportSnapshot won't allow it because of its valid-characters check). |
| 1324 | */ |
| 1325 | snprintf(pathtmp, sizeof(pathtmp), "%s.tmp" , path); |
| 1326 | if (!(f = AllocateFile(pathtmp, PG_BINARY_W))) |
| 1327 | ereport(ERROR, |
| 1328 | (errcode_for_file_access(), |
| 1329 | errmsg("could not create file \"%s\": %m" , pathtmp))); |
| 1330 | |
| 1331 | if (fwrite(buf.data, buf.len, 1, f) != 1) |
| 1332 | ereport(ERROR, |
| 1333 | (errcode_for_file_access(), |
| 1334 | errmsg("could not write to file \"%s\": %m" , pathtmp))); |
| 1335 | |
| 1336 | /* no fsync() since file need not survive a system crash */ |
| 1337 | |
| 1338 | if (FreeFile(f)) |
| 1339 | ereport(ERROR, |
| 1340 | (errcode_for_file_access(), |
| 1341 | errmsg("could not write to file \"%s\": %m" , pathtmp))); |
| 1342 | |
| 1343 | /* |
| 1344 | * Now that we have written everything into a .tmp file, rename the file |
| 1345 | * to remove the .tmp suffix. |
| 1346 | */ |
| 1347 | if (rename(pathtmp, path) < 0) |
| 1348 | ereport(ERROR, |
| 1349 | (errcode_for_file_access(), |
| 1350 | errmsg("could not rename file \"%s\" to \"%s\": %m" , |
| 1351 | pathtmp, path))); |
| 1352 | |
| 1353 | /* |
| 1354 | * The basename of the file is what we return from pg_export_snapshot(). |
| 1355 | * It's already in path in a textual format and we know that the path |
| 1356 | * starts with SNAPSHOT_EXPORT_DIR. Skip over the prefix and the slash |
| 1357 | * and pstrdup it so as not to return the address of a local variable. |
| 1358 | */ |
| 1359 | return pstrdup(path + strlen(SNAPSHOT_EXPORT_DIR) + 1); |
| 1360 | } |
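/*
 * End-to-end usage sketch at the SQL level (the identifier shown is merely
 * an example of the BackendId-LocalXID-counter format constructed above):
 *
 *		-- session 1
 *		BEGIN TRANSACTION ISOLATION LEVEL REPEATABLE READ;
 *		SELECT pg_export_snapshot();	-- returns e.g. '00000003-0000001B-1'
 *
 *		-- session 2
 *		BEGIN TRANSACTION ISOLATION LEVEL REPEATABLE READ;
 *		SET TRANSACTION SNAPSHOT '00000003-0000001B-1';
 *
 * The import must happen while session 1's exporting transaction is still
 * open; session 2 then sees exactly the database state that snapshot saw.
 */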
| 1361 | |
| 1362 | /* |
| 1363 | * pg_export_snapshot |
| 1364 | * SQL-callable wrapper for ExportSnapshot. |
| 1365 | */ |
| 1366 | Datum |
| 1367 | pg_export_snapshot(PG_FUNCTION_ARGS) |
| 1368 | { |
| 1369 | char *snapshotName; |
| 1370 | |
| 1371 | snapshotName = ExportSnapshot(GetActiveSnapshot()); |
| 1372 | PG_RETURN_TEXT_P(cstring_to_text(snapshotName)); |
| 1373 | } |
| 1374 | |
| 1375 | |
| 1376 | /* |
| 1377 | * Parsing subroutines for ImportSnapshot: parse a line with the given |
| 1378 | * prefix followed by a value, and advance *s to the next line. The |
| 1379 | * filename is provided for use in error messages. |
| 1380 | */ |
| 1381 | static int |
| 1382 | parseIntFromText(const char *prefix, char **s, const char *filename) |
| 1383 | { |
| 1384 | char *ptr = *s; |
| 1385 | int prefixlen = strlen(prefix); |
| 1386 | int val; |
| 1387 | |
| 1388 | if (strncmp(ptr, prefix, prefixlen) != 0) |
| 1389 | ereport(ERROR, |
| 1390 | (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), |
| 1391 | errmsg("invalid snapshot data in file \"%s\"" , filename))); |
| 1392 | ptr += prefixlen; |
| 1393 | if (sscanf(ptr, "%d" , &val) != 1) |
| 1394 | ereport(ERROR, |
| 1395 | (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), |
| 1396 | errmsg("invalid snapshot data in file \"%s\"" , filename))); |
| 1397 | ptr = strchr(ptr, '\n'); |
| 1398 | if (!ptr) |
| 1399 | ereport(ERROR, |
| 1400 | (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), |
| 1401 | errmsg("invalid snapshot data in file \"%s\"" , filename))); |
| 1402 | *s = ptr + 1; |
| 1403 | return val; |
| 1404 | } |
| 1405 | |
| 1406 | static TransactionId |
| 1407 | parseXidFromText(const char *prefix, char **s, const char *filename) |
| 1408 | { |
| 1409 | char *ptr = *s; |
| 1410 | int prefixlen = strlen(prefix); |
| 1411 | TransactionId val; |
| 1412 | |
| 1413 | if (strncmp(ptr, prefix, prefixlen) != 0) |
| 1414 | ereport(ERROR, |
| 1415 | (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), |
| 1416 | errmsg("invalid snapshot data in file \"%s\"" , filename))); |
| 1417 | ptr += prefixlen; |
| 1418 | if (sscanf(ptr, "%u" , &val) != 1) |
| 1419 | ereport(ERROR, |
| 1420 | (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), |
| 1421 | errmsg("invalid snapshot data in file \"%s\"" , filename))); |
| 1422 | ptr = strchr(ptr, '\n'); |
| 1423 | if (!ptr) |
| 1424 | ereport(ERROR, |
| 1425 | (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), |
| 1426 | errmsg("invalid snapshot data in file \"%s\"" , filename))); |
| 1427 | *s = ptr + 1; |
| 1428 | return val; |
| 1429 | } |
| 1430 | |
| 1431 | static void |
| 1432 | parseVxidFromText(const char *prefix, char **s, const char *filename, |
| 1433 | VirtualTransactionId *vxid) |
| 1434 | { |
| 1435 | char *ptr = *s; |
| 1436 | int prefixlen = strlen(prefix); |
| 1437 | |
| 1438 | if (strncmp(ptr, prefix, prefixlen) != 0) |
| 1439 | ereport(ERROR, |
| 1440 | (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), |
| 1441 | errmsg("invalid snapshot data in file \"%s\"", filename))); |
| 1442 | ptr += prefixlen; |
| 1443 | if (sscanf(ptr, "%d/%u", &vxid->backendId, &vxid->localTransactionId) != 2) |
| 1444 | ereport(ERROR, |
| 1445 | (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), |
| 1446 | errmsg("invalid snapshot data in file \"%s\"", filename))); |
| 1447 | ptr = strchr(ptr, '\n'); |
| 1448 | if (!ptr) |
| 1449 | ereport(ERROR, |
| 1450 | (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), |
| 1451 | errmsg("invalid snapshot data in file \"%s\"", filename))); |
| 1452 | *s = ptr + 1; |
| 1453 | } |
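| | /* |
| | * For reference, ImportSnapshot below applies these subroutines in a fixed |
| | * order, so an export file is a sequence of "prefix:value" lines such as |
| | * the following (all values illustrative only): |
| | * |
| | *   vxid:3/27 |
| | *   pid:12345 |
| | *   dbid:16384 |
| | *   iso:3 |
| | *   ro:0 |
| | *   xmin:570 |
| | *   xmax:575 |
| | *   xcnt:1 |
| | *   xip:572 |
| | *   sof:0 |
| | *   sxcnt:0 |
| | *   rec:0 |
| | */ |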
| 1454 | |
| 1455 | /* |
| 1456 | * ImportSnapshot |
| 1457 | * Import a previously exported snapshot. The argument should be a |
| 1458 | * filename in SNAPSHOT_EXPORT_DIR. Load the snapshot from that file. |
| 1459 | * This is called by "SET TRANSACTION SNAPSHOT 'foo'". |
| 1460 | */ |
| 1461 | void |
| 1462 | ImportSnapshot(const char *idstr) |
| 1463 | { |
| 1464 | char path[MAXPGPATH]; |
| 1465 | FILE *f; |
| 1466 | struct stat stat_buf; |
| 1467 | char *filebuf; |
| 1468 | int xcnt; |
| 1469 | int i; |
| 1470 | VirtualTransactionId src_vxid; |
| 1471 | int src_pid; |
| 1472 | Oid src_dbid; |
| 1473 | int src_isolevel; |
| 1474 | bool src_readonly; |
| 1475 | SnapshotData snapshot; |
| 1476 | |
| 1477 | /* |
| 1478 | * Must be at top level of a fresh transaction. Note in particular that |
| 1479 | * we check we haven't acquired an XID --- if we have, it's conceivable |
| 1480 | * that the snapshot would show it as not running, making for very screwy |
| 1481 | * behavior. |
| 1482 | */ |
| 1483 | if (FirstSnapshotSet || |
| 1484 | GetTopTransactionIdIfAny() != InvalidTransactionId || |
| 1485 | IsSubTransaction()) |
| 1486 | ereport(ERROR, |
| 1487 | (errcode(ERRCODE_ACTIVE_SQL_TRANSACTION), |
| 1488 | errmsg("SET TRANSACTION SNAPSHOT must be called before any query" ))); |
| 1489 | |
| 1490 | /* |
| 1491 | * If we are in read committed mode, the next query would execute with a |
| 1492 | * new snapshot anyway, making this function call quite useless. |
| 1493 | */ |
| 1494 | if (!IsolationUsesXactSnapshot()) |
| 1495 | ereport(ERROR, |
| 1496 | (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), |
| 1497 | errmsg("a snapshot-importing transaction must have isolation level SERIALIZABLE or REPEATABLE READ" ))); |
| 1498 | |
| 1499 | /* |
| 1500 | * Verify the identifier: only 0-9, A-F and hyphens are allowed. We do |
| 1501 | * this mainly to prevent reading arbitrary files. |
| 1502 | */ |
| 1503 | if (strspn(idstr, "0123456789ABCDEF-") != strlen(idstr)) |
| 1504 | ereport(ERROR, |
| 1505 | (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
| 1506 | errmsg("invalid snapshot identifier: \"%s\"", idstr))); |
| 1507 | |
| 1508 | /* OK, read the file */ |
| 1509 | snprintf(path, MAXPGPATH, SNAPSHOT_EXPORT_DIR "/%s", idstr); |
| 1510 | |
| 1511 | f = AllocateFile(path, PG_BINARY_R); |
| 1512 | if (!f) |
| 1513 | ereport(ERROR, |
| 1514 | (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
| 1515 | errmsg("invalid snapshot identifier: \"%s\"" , idstr))); |
| 1516 | |
| 1517 | /* get the size of the file so that we know how much memory we need */ |
| 1518 | if (fstat(fileno(f), &stat_buf)) |
| 1519 | elog(ERROR, "could not stat file \"%s\": %m" , path); |
| 1520 | |
| 1521 | /* and read the file into a palloc'd string */ |
| 1522 | filebuf = (char *) palloc(stat_buf.st_size + 1); |
| 1523 | if (fread(filebuf, stat_buf.st_size, 1, f) != 1) |
| 1524 | elog(ERROR, "could not read file \"%s\": %m" , path); |
| 1525 | |
| 1526 | filebuf[stat_buf.st_size] = '\0'; |
| 1527 | |
| 1528 | FreeFile(f); |
| 1529 | |
| 1530 | /* |
| 1531 | * Construct a snapshot struct by parsing the file content. |
| 1532 | */ |
| 1533 | memset(&snapshot, 0, sizeof(snapshot)); |
| 1534 | |
| 1535 | parseVxidFromText("vxid:", &filebuf, path, &src_vxid); |
| 1536 | src_pid = parseIntFromText("pid:", &filebuf, path); |
| 1537 | /* we abuse parseXidFromText a bit here ... */ |
| 1538 | src_dbid = parseXidFromText("dbid:", &filebuf, path); |
| 1539 | src_isolevel = parseIntFromText("iso:", &filebuf, path); |
| 1540 | src_readonly = parseIntFromText("ro:", &filebuf, path); |
| 1541 | |
| 1542 | snapshot.snapshot_type = SNAPSHOT_MVCC; |
| 1543 | |
| 1544 | snapshot.xmin = parseXidFromText("xmin:", &filebuf, path); |
| 1545 | snapshot.xmax = parseXidFromText("xmax:", &filebuf, path); |
| 1546 | |
| 1547 | snapshot.xcnt = xcnt = parseIntFromText("xcnt:", &filebuf, path); |
| 1548 | |
| 1549 | /* sanity-check the xid count before palloc */ |
| 1550 | if (xcnt < 0 || xcnt > GetMaxSnapshotXidCount()) |
| 1551 | ereport(ERROR, |
| 1552 | (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), |
| 1553 | errmsg("invalid snapshot data in file \"%s\"" , path))); |
| 1554 | |
| 1555 | snapshot.xip = (TransactionId *) palloc(xcnt * sizeof(TransactionId)); |
| 1556 | for (i = 0; i < xcnt; i++) |
| 1557 | snapshot.xip[i] = parseXidFromText("xip:" , &filebuf, path); |
| 1558 | |
| 1559 | snapshot.suboverflowed = parseIntFromText("sof:" , &filebuf, path); |
| 1560 | |
| 1561 | if (!snapshot.suboverflowed) |
| 1562 | { |
| 1563 | snapshot.subxcnt = xcnt = parseIntFromText("sxcnt:" , &filebuf, path); |
| 1564 | |
| 1565 | /* sanity-check the xid count before palloc */ |
| 1566 | if (xcnt < 0 || xcnt > GetMaxSnapshotSubxidCount()) |
| 1567 | ereport(ERROR, |
| 1568 | (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), |
| 1569 | errmsg("invalid snapshot data in file \"%s\"" , path))); |
| 1570 | |
| 1571 | snapshot.subxip = (TransactionId *) palloc(xcnt * sizeof(TransactionId)); |
| 1572 | for (i = 0; i < xcnt; i++) |
| 1573 | snapshot.subxip[i] = parseXidFromText("sxp:" , &filebuf, path); |
| 1574 | } |
| 1575 | else |
| 1576 | { |
| 1577 | snapshot.subxcnt = 0; |
| 1578 | snapshot.subxip = NULL; |
| 1579 | } |
| 1580 | |
| 1581 | snapshot.takenDuringRecovery = parseIntFromText("rec:", &filebuf, path); |
| 1582 | |
| 1583 | /* |
| 1584 | * Do some additional sanity checking, just to protect ourselves. We |
| 1585 | * don't trouble to check the array elements, just the most critical |
| 1586 | * fields. |
| 1587 | */ |
| 1588 | if (!VirtualTransactionIdIsValid(src_vxid) || |
| 1589 | !OidIsValid(src_dbid) || |
| 1590 | !TransactionIdIsNormal(snapshot.xmin) || |
| 1591 | !TransactionIdIsNormal(snapshot.xmax)) |
| 1592 | ereport(ERROR, |
| 1593 | (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), |
| 1594 | errmsg("invalid snapshot data in file \"%s\"" , path))); |
| 1595 | |
| 1596 | /* |
| 1597 | * If we're serializable, the source transaction must be too, otherwise |
| 1598 | * predicate.c has problems (SxactGlobalXmin could go backwards). Also, a |
| 1599 | * non-read-only transaction can't adopt a snapshot from a read-only |
| 1600 | * transaction, as predicate.c handles the cases very differently. |
| 1601 | */ |
| 1602 | if (IsolationIsSerializable()) |
| 1603 | { |
| 1604 | if (src_isolevel != XACT_SERIALIZABLE) |
| 1605 | ereport(ERROR, |
| 1606 | (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), |
| 1607 | errmsg("a serializable transaction cannot import a snapshot from a non-serializable transaction" ))); |
| 1608 | if (src_readonly && !XactReadOnly) |
| 1609 | ereport(ERROR, |
| 1610 | (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), |
| 1611 | errmsg("a non-read-only serializable transaction cannot import a snapshot from a read-only transaction" ))); |
| 1612 | } |
| 1613 | |
| 1614 | /* |
| 1615 | * We cannot import a snapshot that was taken in a different database, |
| 1616 | * because vacuum calculates OldestXmin on a per-database basis; so the |
| 1617 | * source transaction's xmin doesn't protect us from data loss. This |
| 1618 | * restriction could be removed if the source transaction were to mark its |
| 1619 | * xmin as being globally applicable. But that would require some |
| 1620 | * additional syntax, since that has to be known when the snapshot is |
| 1621 | * initially taken. (See pgsql-hackers discussion of 2011-10-21.) |
| 1622 | */ |
| 1623 | if (src_dbid != MyDatabaseId) |
| 1624 | ereport(ERROR, |
| 1625 | (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), |
| 1626 | errmsg("cannot import a snapshot from a different database" ))); |
| 1627 | |
| 1628 | /* OK, install the snapshot */ |
| 1629 | SetTransactionSnapshot(&snapshot, &src_vxid, src_pid, NULL); |
| 1630 | } |
| 1631 | |
| 1632 | /* |
| 1633 | * XactHasExportedSnapshots |
| 1634 | * Test whether current transaction has exported any snapshots. |
| 1635 | */ |
| 1636 | bool |
| 1637 | XactHasExportedSnapshots(void) |
| 1638 | { |
| 1639 | return (exportedSnapshots != NIL); |
| 1640 | } |
| 1641 | |
| 1642 | /* |
| 1643 | * DeleteAllExportedSnapshotFiles |
| 1644 | * Clean up any files that have been left behind by a crashed backend |
| 1645 | * that had exported snapshots before it died. |
| 1646 | * |
| 1647 | * This should be called during database startup or crash recovery. |
| 1648 | */ |
| 1649 | void |
| 1650 | DeleteAllExportedSnapshotFiles(void) |
| 1651 | { |
| 1652 | char buf[MAXPGPATH + sizeof(SNAPSHOT_EXPORT_DIR)]; |
| 1653 | DIR *s_dir; |
| 1654 | struct dirent *s_de; |
| 1655 | |
| 1656 | /* |
| 1657 | * Problems in reading the directory, or unlinking files, are reported at |
| 1658 | * LOG level. Since we're running in the startup process, ERROR level |
| 1659 | * would prevent database start, and it's not important enough for that. |
| 1660 | */ |
| 1661 | s_dir = AllocateDir(SNAPSHOT_EXPORT_DIR); |
| 1662 | |
| 1663 | while ((s_de = ReadDirExtended(s_dir, SNAPSHOT_EXPORT_DIR, LOG)) != NULL) |
| 1664 | { |
| 1665 | if (strcmp(s_de->d_name, ".") == 0 || |
| 1666 | strcmp(s_de->d_name, "..") == 0) |
| 1667 | continue; |
| 1668 | |
| 1669 | snprintf(buf, sizeof(buf), SNAPSHOT_EXPORT_DIR "/%s", s_de->d_name); |
| 1670 | |
| 1671 | if (unlink(buf) != 0) |
| 1672 | ereport(LOG, |
| 1673 | (errcode_for_file_access(), |
| 1674 | errmsg("could not remove file \"%s\": %m" , buf))); |
| 1675 | } |
| 1676 | |
| 1677 | FreeDir(s_dir); |
| 1678 | } |
| 1679 | |
| 1680 | /* |
| 1681 | * ThereAreNoPriorRegisteredSnapshots |
| 1682 | * Is the registered snapshot count less than or equal to one? |
| 1683 | * |
| 1684 | * Don't use this to settle important decisions. While zero registrations and |
| 1685 | * no ActiveSnapshot would confirm a certain idleness, the system makes no |
| 1686 | * guarantees about the significance of one registered snapshot. |
| 1687 | */ |
| 1688 | bool |
| 1689 | ThereAreNoPriorRegisteredSnapshots(void) |
| 1690 | { |
| 1691 | if (pairingheap_is_empty(&RegisteredSnapshots) || |
| 1692 | pairingheap_is_singular(&RegisteredSnapshots)) |
| 1693 | return true; |
| 1694 | |
| 1695 | return false; |
| 1696 | } |
| 1697 | |
| 1698 | |
| 1699 | /* |
| 1700 | * Return a timestamp that is exactly on a minute boundary. |
| 1701 | * |
| 1702 | * If the argument is already aligned, return that value, otherwise move to |
| 1703 | * the next minute boundary following the given time. |
| 1704 | */ |
| 1705 | static TimestampTz |
| 1706 | AlignTimestampToMinuteBoundary(TimestampTz ts) |
| 1707 | { |
| 1708 | TimestampTz retval = ts + (USECS_PER_MINUTE - 1); |
| 1709 | |
| 1710 | return retval - (retval % USECS_PER_MINUTE); |
| 1711 | } |
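| | /* |
| | * Worked example: with USECS_PER_MINUTE = 60000000, an already-aligned |
| | * input of 120000000 becomes 120000000 + 59999999 = 179999999, and |
| | * 179999999 - (179999999 % 60000000) = 120000000, i.e. the input itself; |
| | * an input of 120000001 yields 180000000, the next minute boundary. |
| | */ |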
| 1712 | |
| 1713 | /* |
| 1714 | * Get current timestamp for snapshots |
| 1715 | * |
| 1716 | * This is basically GetCurrentTimestamp(), but with a guarantee that |
| 1717 | * the result never moves backward. |
| 1718 | */ |
| 1719 | TimestampTz |
| 1720 | GetSnapshotCurrentTimestamp(void) |
| 1721 | { |
| 1722 | TimestampTz now = GetCurrentTimestamp(); |
| 1723 | |
| 1724 | /* |
| 1725 | * Don't let time move backward; if it hasn't advanced, use the old value. |
| 1726 | */ |
| 1727 | SpinLockAcquire(&oldSnapshotControl->mutex_current); |
| 1728 | if (now <= oldSnapshotControl->current_timestamp) |
| 1729 | now = oldSnapshotControl->current_timestamp; |
| 1730 | else |
| 1731 | oldSnapshotControl->current_timestamp = now; |
| 1732 | SpinLockRelease(&oldSnapshotControl->mutex_current); |
| 1733 | |
| 1734 | return now; |
| 1735 | } |
| 1736 | |
| 1737 | /* |
| 1738 | * Get timestamp through which vacuum may have processed based on last stored |
| 1739 | * value for threshold_timestamp. |
| 1740 | * |
| 1741 | * XXX: So far, we never trust that a 64-bit value can be read atomically; if |
| 1742 | * that ever changes, we could get rid of the spinlock here. |
| 1743 | */ |
| 1744 | TimestampTz |
| 1745 | GetOldSnapshotThresholdTimestamp(void) |
| 1746 | { |
| 1747 | TimestampTz threshold_timestamp; |
| 1748 | |
| 1749 | SpinLockAcquire(&oldSnapshotControl->mutex_threshold); |
| 1750 | threshold_timestamp = oldSnapshotControl->threshold_timestamp; |
| 1751 | SpinLockRelease(&oldSnapshotControl->mutex_threshold); |
| 1752 | |
| 1753 | return threshold_timestamp; |
| 1754 | } |
| 1755 | |
| 1756 | static void |
| 1757 | SetOldSnapshotThresholdTimestamp(TimestampTz ts, TransactionId xlimit) |
| 1758 | { |
| 1759 | SpinLockAcquire(&oldSnapshotControl->mutex_threshold); |
| 1760 | oldSnapshotControl->threshold_timestamp = ts; |
| 1761 | oldSnapshotControl->threshold_xid = xlimit; |
| 1762 | SpinLockRelease(&oldSnapshotControl->mutex_threshold); |
| 1763 | } |
| 1764 | |
| 1765 | /* |
| 1766 | * TransactionIdLimitedForOldSnapshots |
| 1767 | * |
| 1768 | * Apply old snapshot limit, if any. This is intended to be called for page |
| 1769 | * pruning and table vacuuming, to allow old_snapshot_threshold to override |
| 1770 | * the normal global xmin value. Actual testing for snapshot too old will be |
| 1771 | * based on whether a snapshot timestamp is prior to the threshold timestamp |
| 1772 | * set in this function. |
| 1773 | */ |
| 1774 | TransactionId |
| 1775 | TransactionIdLimitedForOldSnapshots(TransactionId recentXmin, |
| 1776 | Relation relation) |
| 1777 | { |
| 1778 | if (TransactionIdIsNormal(recentXmin) |
| 1779 | && old_snapshot_threshold >= 0 |
| 1780 | && RelationAllowsEarlyPruning(relation)) |
| 1781 | { |
| 1782 | TimestampTz ts = GetSnapshotCurrentTimestamp(); |
| 1783 | TransactionId xlimit = recentXmin; |
| 1784 | TransactionId latest_xmin; |
| 1785 | TimestampTz update_ts; |
| 1786 | bool same_ts_as_threshold = false; |
| 1787 | |
| 1788 | SpinLockAcquire(&oldSnapshotControl->mutex_latest_xmin); |
| 1789 | latest_xmin = oldSnapshotControl->latest_xmin; |
| 1790 | update_ts = oldSnapshotControl->next_map_update; |
| 1791 | SpinLockRelease(&oldSnapshotControl->mutex_latest_xmin); |
| 1792 | |
| 1793 | /* |
| 1794 | * Zero threshold always overrides to latest xmin, if valid. Without |
| 1795 | * some heuristic it will find its own snapshot too old on, for |
| 1796 | * example, a simple UPDATE -- which would make it useless for most |
| 1797 | * testing, but there is no principled way to ensure that it doesn't |
| 1798 | * fail in this way. Use a five-second delay to try to get useful |
| 1799 | * testing behavior, but this may need adjustment. |
| 1800 | */ |
| 1801 | if (old_snapshot_threshold == 0) |
| 1802 | { |
| 1803 | if (TransactionIdPrecedes(latest_xmin, MyPgXact->xmin) |
| 1804 | && TransactionIdFollows(latest_xmin, xlimit)) |
| 1805 | xlimit = latest_xmin; |
| 1806 | |
| 1807 | ts -= 5 * USECS_PER_SEC; |
| 1808 | SetOldSnapshotThresholdTimestamp(ts, xlimit); |
| 1809 | |
| 1810 | return xlimit; |
| 1811 | } |
| 1812 | |
| 1813 | ts = AlignTimestampToMinuteBoundary(ts) |
| 1814 | - (old_snapshot_threshold * USECS_PER_MINUTE); |
| 1815 | |
| 1816 | /* Check for fast exit without LW locking. */ |
| 1817 | SpinLockAcquire(&oldSnapshotControl->mutex_threshold); |
| 1818 | if (ts == oldSnapshotControl->threshold_timestamp) |
| 1819 | { |
| 1820 | xlimit = oldSnapshotControl->threshold_xid; |
| 1821 | same_ts_as_threshold = true; |
| 1822 | } |
| 1823 | SpinLockRelease(&oldSnapshotControl->mutex_threshold); |
| 1824 | |
| 1825 | if (!same_ts_as_threshold) |
| 1826 | { |
| 1827 | if (ts == update_ts) |
| 1828 | { |
| 1829 | xlimit = latest_xmin; |
| 1830 | if (NormalTransactionIdFollows(xlimit, recentXmin)) |
| 1831 | SetOldSnapshotThresholdTimestamp(ts, xlimit); |
| 1832 | } |
| 1833 | else |
| 1834 | { |
| 1835 | LWLockAcquire(OldSnapshotTimeMapLock, LW_SHARED); |
| 1836 | |
| 1837 | if (oldSnapshotControl->count_used > 0 |
| 1838 | && ts >= oldSnapshotControl->head_timestamp) |
| 1839 | { |
| 1840 | int offset; |
| 1841 | |
| 1842 | offset = ((ts - oldSnapshotControl->head_timestamp) |
| 1843 | / USECS_PER_MINUTE); |
| 1844 | if (offset > oldSnapshotControl->count_used - 1) |
| 1845 | offset = oldSnapshotControl->count_used - 1; |
| 1846 | offset = (oldSnapshotControl->head_offset + offset) |
| 1847 | % OLD_SNAPSHOT_TIME_MAP_ENTRIES; |
| 1848 | xlimit = oldSnapshotControl->xid_by_minute[offset]; |
| 1849 | |
| 1850 | if (NormalTransactionIdFollows(xlimit, recentXmin)) |
| 1851 | SetOldSnapshotThresholdTimestamp(ts, xlimit); |
| 1852 | } |
| 1853 | |
| 1854 | LWLockRelease(OldSnapshotTimeMapLock); |
| 1855 | } |
| 1856 | } |
| 1857 | |
| 1858 | /* |
| 1859 | * Failsafe protection against vacuuming work of active transaction. |
| 1860 | * |
| 1861 | * This is not an assertion because we avoid the spinlock for |
| 1862 | * performance, leaving open the possibility that xlimit could advance |
| 1863 | * and be more current; but it seems prudent to apply this limit. It |
| 1864 | * might make pruning a tiny bit less aggressive than it could be, but |
| 1865 | * protects against data loss bugs. |
| 1866 | */ |
| 1867 | if (TransactionIdIsNormal(latest_xmin) |
| 1868 | && TransactionIdPrecedes(latest_xmin, xlimit)) |
| 1869 | xlimit = latest_xmin; |
| 1870 | |
| 1871 | if (NormalTransactionIdFollows(xlimit, recentXmin)) |
| 1872 | return xlimit; |
| 1873 | } |
| 1874 | |
| 1875 | return recentXmin; |
| 1876 | } |
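| | /* |
| | * Illustrative caller sketch (not compiled): pruning/vacuum code is |
| | * expected to pass its usual xmin horizon through this function so that |
| | * old_snapshot_threshold can move it forward for eligible relations, |
| | * roughly along these lines (variable names are examples only): |
| | * |
| | *   TransactionId prune_xmin; |
| | * |
| | *   prune_xmin = TransactionIdLimitedForOldSnapshots(RecentGlobalDataXmin, |
| | *                                                    relation); |
| | *   // dead tuples older than prune_xmin may now be removed |
| | */ |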
| 1877 | |
| 1878 | /* |
| 1879 | * Take care of the circular buffer that maps time to xid. |
| 1880 | */ |
| 1881 | void |
| 1882 | MaintainOldSnapshotTimeMapping(TimestampTz whenTaken, TransactionId xmin) |
| 1883 | { |
| 1884 | TimestampTz ts; |
| 1885 | TransactionId latest_xmin; |
| 1886 | TimestampTz update_ts; |
| 1887 | bool map_update_required = false; |
| 1888 | |
| 1889 | /* Never call this function when old snapshot checking is disabled. */ |
| 1890 | Assert(old_snapshot_threshold >= 0); |
| 1891 | |
| 1892 | ts = AlignTimestampToMinuteBoundary(whenTaken); |
| 1893 | |
| 1894 | /* |
| 1895 | * Keep track of the latest xmin seen by any process. Update mapping with |
| 1896 | * a new value when we have crossed a bucket boundary. |
| 1897 | */ |
| 1898 | SpinLockAcquire(&oldSnapshotControl->mutex_latest_xmin); |
| 1899 | latest_xmin = oldSnapshotControl->latest_xmin; |
| 1900 | update_ts = oldSnapshotControl->next_map_update; |
| 1901 | if (ts > update_ts) |
| 1902 | { |
| 1903 | oldSnapshotControl->next_map_update = ts; |
| 1904 | map_update_required = true; |
| 1905 | } |
| 1906 | if (TransactionIdFollows(xmin, latest_xmin)) |
| 1907 | oldSnapshotControl->latest_xmin = xmin; |
| 1908 | SpinLockRelease(&oldSnapshotControl->mutex_latest_xmin); |
| 1909 | |
| 1910 | /* We only needed to update the most recent xmin value. */ |
| 1911 | if (!map_update_required) |
| 1912 | return; |
| 1913 | |
| 1914 | /* No further tracking needed for 0 (used for testing). */ |
| 1915 | if (old_snapshot_threshold == 0) |
| 1916 | return; |
| 1917 | |
| 1918 | /* |
| 1919 | * We don't want to do something stupid with unusual values, but we don't |
| 1920 | * want to litter the log with warnings or break otherwise normal |
| 1921 | * processing for this feature; so if something seems unreasonable, just |
| 1922 | * log at DEBUG level and return without doing anything. |
| 1923 | */ |
| 1924 | if (whenTaken < 0) |
| 1925 | { |
| 1926 | elog(DEBUG1, |
| 1927 | "MaintainOldSnapshotTimeMapping called with negative whenTaken = %ld" , |
| 1928 | (long) whenTaken); |
| 1929 | return; |
| 1930 | } |
| 1931 | if (!TransactionIdIsNormal(xmin)) |
| 1932 | { |
| 1933 | elog(DEBUG1, |
| 1934 | "MaintainOldSnapshotTimeMapping called with xmin = %lu" , |
| 1935 | (unsigned long) xmin); |
| 1936 | return; |
| 1937 | } |
| 1938 | |
| 1939 | LWLockAcquire(OldSnapshotTimeMapLock, LW_EXCLUSIVE); |
| 1940 | |
| 1941 | Assert(oldSnapshotControl->head_offset >= 0); |
| 1942 | Assert(oldSnapshotControl->head_offset < OLD_SNAPSHOT_TIME_MAP_ENTRIES); |
| 1943 | Assert((oldSnapshotControl->head_timestamp % USECS_PER_MINUTE) == 0); |
| 1944 | Assert(oldSnapshotControl->count_used >= 0); |
| 1945 | Assert(oldSnapshotControl->count_used <= OLD_SNAPSHOT_TIME_MAP_ENTRIES); |
| 1946 | |
| 1947 | if (oldSnapshotControl->count_used == 0) |
| 1948 | { |
| 1949 | /* set up first entry for empty mapping */ |
| 1950 | oldSnapshotControl->head_offset = 0; |
| 1951 | oldSnapshotControl->head_timestamp = ts; |
| 1952 | oldSnapshotControl->count_used = 1; |
| 1953 | oldSnapshotControl->xid_by_minute[0] = xmin; |
| 1954 | } |
| 1955 | else if (ts < oldSnapshotControl->head_timestamp) |
| 1956 | { |
| 1957 | /* old ts; log it at DEBUG */ |
| 1958 | LWLockRelease(OldSnapshotTimeMapLock); |
| 1959 | elog(DEBUG1, |
| 1960 | "MaintainOldSnapshotTimeMapping called with old whenTaken = %ld" , |
| 1961 | (long) whenTaken); |
| 1962 | return; |
| 1963 | } |
| 1964 | else if (ts <= (oldSnapshotControl->head_timestamp + |
| 1965 | ((oldSnapshotControl->count_used - 1) |
| 1966 | * USECS_PER_MINUTE))) |
| 1967 | { |
| 1968 | /* existing mapping; advance xid if possible */ |
| 1969 | int bucket = (oldSnapshotControl->head_offset |
| 1970 | + ((ts - oldSnapshotControl->head_timestamp) |
| 1971 | / USECS_PER_MINUTE)) |
| 1972 | % OLD_SNAPSHOT_TIME_MAP_ENTRIES; |
| 1973 | |
| 1974 | if (TransactionIdPrecedes(oldSnapshotControl->xid_by_minute[bucket], xmin)) |
| 1975 | oldSnapshotControl->xid_by_minute[bucket] = xmin; |
| 1976 | } |
| 1977 | else |
| 1978 | { |
| 1979 | /* We need a new bucket, but it might not be the very next one. */ |
| 1980 | int advance = ((ts - oldSnapshotControl->head_timestamp) |
| 1981 | / USECS_PER_MINUTE); |
| 1982 | |
| 1983 | oldSnapshotControl->head_timestamp = ts; |
| 1984 | |
| 1985 | if (advance >= OLD_SNAPSHOT_TIME_MAP_ENTRIES) |
| 1986 | { |
| 1987 | /* Advance is so far that all old data is junk; start over. */ |
| 1988 | oldSnapshotControl->head_offset = 0; |
| 1989 | oldSnapshotControl->count_used = 1; |
| 1990 | oldSnapshotControl->xid_by_minute[0] = xmin; |
| 1991 | } |
| 1992 | else |
| 1993 | { |
| 1994 | /* Store the new value in one or more buckets. */ |
| 1995 | int i; |
| 1996 | |
| 1997 | for (i = 0; i < advance; i++) |
| 1998 | { |
| 1999 | if (oldSnapshotControl->count_used == OLD_SNAPSHOT_TIME_MAP_ENTRIES) |
| 2000 | { |
| 2001 | /* Map full and new value replaces old head. */ |
| 2002 | int old_head = oldSnapshotControl->head_offset; |
| 2003 | |
| 2004 | if (old_head == (OLD_SNAPSHOT_TIME_MAP_ENTRIES - 1)) |
| 2005 | oldSnapshotControl->head_offset = 0; |
| 2006 | else |
| 2007 | oldSnapshotControl->head_offset = old_head + 1; |
| 2008 | oldSnapshotControl->xid_by_minute[old_head] = xmin; |
| 2009 | } |
| 2010 | else |
| 2011 | { |
| 2012 | /* Extend map to unused entry. */ |
| 2013 | int new_tail = (oldSnapshotControl->head_offset |
| 2014 | + oldSnapshotControl->count_used) |
| 2015 | % OLD_SNAPSHOT_TIME_MAP_ENTRIES; |
| 2016 | |
| 2017 | oldSnapshotControl->count_used++; |
| 2018 | oldSnapshotControl->xid_by_minute[new_tail] = xmin; |
| 2019 | } |
| 2020 | } |
| 2021 | } |
| 2022 | } |
| 2023 | |
| 2024 | LWLockRelease(OldSnapshotTimeMapLock); |
| 2025 | } |
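| | /* |
| | * Worked example of the mapping maintained above (values illustrative): |
| | * with head_offset = 2, head_timestamp = 10:00, count_used = 3 and, say, |
| | * OLD_SNAPSHOT_TIME_MAP_ENTRIES = 4, minute i of the map lives in slot |
| | * (head_offset + i) % OLD_SNAPSHOT_TIME_MAP_ENTRIES: |
| | * |
| | *   10:00 -> xid_by_minute[2] |
| | *   10:01 -> xid_by_minute[3] |
| | *   10:02 -> xid_by_minute[0] |
| | * |
| | * Each slot records the highest xmin reported for snapshots taken in that |
| | * minute; a later timestamp either updates an existing slot or extends or |
| | * wraps the map as coded above. |
| | */ |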
| 2026 | |
| 2027 | |
| 2028 | /* |
| 2029 | * Set up a snapshot that replaces the normal catalog snapshots, allowing |
| 2030 | * catalog access to behave just as it did at a certain point in the past. |
| 2031 | * |
| 2032 | * Needed for logical decoding. |
| 2033 | */ |
| 2034 | void |
| 2035 | SetupHistoricSnapshot(Snapshot historic_snapshot, HTAB *tuplecids) |
| 2036 | { |
| 2037 | Assert(historic_snapshot != NULL); |
| 2038 | |
| 2039 | /* set up the timetravel snapshot */ |
| 2040 | HistoricSnapshot = historic_snapshot; |
| 2041 | |
| 2042 | /* set up the (cmin, cmax) lookup hash */ |
| 2043 | tuplecid_data = tuplecids; |
| 2044 | } |
| 2045 | |
| 2046 | |
| 2047 | /* |
| 2048 | * Make catalog snapshots behave normally again. |
| 2049 | */ |
| 2050 | void |
| 2051 | TeardownHistoricSnapshot(bool is_error) |
| 2052 | { |
| 2053 | HistoricSnapshot = NULL; |
| 2054 | tuplecid_data = NULL; |
| 2055 | } |
| 2056 | |
| 2057 | bool |
| 2058 | HistoricSnapshotActive(void) |
| 2059 | { |
| 2060 | return HistoricSnapshot != NULL; |
| 2061 | } |
| 2062 | |
| 2063 | HTAB * |
| 2064 | HistoricSnapshotGetTupleCids(void) |
| 2065 | { |
| 2066 | Assert(HistoricSnapshotActive()); |
| 2067 | return tuplecid_data; |
| 2068 | } |
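| | /* |
| | * Illustrative sketch (not compiled) of how logical decoding is expected |
| | * to bracket catalog access with the functions above; the snapshot and |
| | * hash-table variables are placeholders: |
| | * |
| | *   SetupHistoricSnapshot(snapshot_now, tuplecid_hash); |
| | *   ...perform catalog lookups; they see the historic catalog state... |
| | *   TeardownHistoricSnapshot(false);    // pass true on error-path cleanup |
| | */ |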
| 2069 | |
| 2070 | /* |
| 2071 | * EstimateSnapshotSpace |
| 2072 | * Returns the size needed to store the given snapshot. |
| 2073 | * |
| 2074 | * We are exporting only required fields from the Snapshot, stored in |
| 2075 | * SerializedSnapshotData. |
| 2076 | */ |
| 2077 | Size |
| 2078 | EstimateSnapshotSpace(Snapshot snap) |
| 2079 | { |
| 2080 | Size size; |
| 2081 | |
| 2082 | Assert(snap != InvalidSnapshot); |
| 2083 | Assert(snap->snapshot_type == SNAPSHOT_MVCC); |
| 2084 | |
| 2085 | /* We allocate any XID arrays needed in the same palloc block. */ |
| 2086 | size = add_size(sizeof(SerializedSnapshotData), |
| 2087 | mul_size(snap->xcnt, sizeof(TransactionId))); |
| 2088 | if (snap->subxcnt > 0 && |
| 2089 | (!snap->suboverflowed || snap->takenDuringRecovery)) |
| 2090 | size = add_size(size, |
| 2091 | mul_size(snap->subxcnt, sizeof(TransactionId))); |
| 2092 | |
| 2093 | return size; |
| 2094 | } |
| 2095 | |
| 2096 | /* |
| 2097 | * SerializeSnapshot |
| 2098 | * Dumps the serialized snapshot (extracted from given snapshot) onto the |
| 2099 | * memory location at start_address. |
| 2100 | */ |
| 2101 | void |
| 2102 | SerializeSnapshot(Snapshot snapshot, char *start_address) |
| 2103 | { |
| 2104 | SerializedSnapshotData serialized_snapshot; |
| 2105 | |
| 2106 | Assert(snapshot->subxcnt >= 0); |
| 2107 | |
| 2108 | /* Copy all required fields */ |
| 2109 | serialized_snapshot.xmin = snapshot->xmin; |
| 2110 | serialized_snapshot.xmax = snapshot->xmax; |
| 2111 | serialized_snapshot.xcnt = snapshot->xcnt; |
| 2112 | serialized_snapshot.subxcnt = snapshot->subxcnt; |
| 2113 | serialized_snapshot.suboverflowed = snapshot->suboverflowed; |
| 2114 | serialized_snapshot.takenDuringRecovery = snapshot->takenDuringRecovery; |
| 2115 | serialized_snapshot.curcid = snapshot->curcid; |
| 2116 | serialized_snapshot.whenTaken = snapshot->whenTaken; |
| 2117 | serialized_snapshot.lsn = snapshot->lsn; |
| 2118 | |
| 2119 | /* |
| 2120 | * Ignore the SubXID array if it has overflowed, unless the snapshot was |
| 2121 | * taken during recovery - in that case, top-level XIDs are in subxip as |
| 2122 | * well, and we mustn't lose them. |
| 2123 | */ |
| 2124 | if (serialized_snapshot.suboverflowed && !snapshot->takenDuringRecovery) |
| 2125 | serialized_snapshot.subxcnt = 0; |
| 2126 | |
| 2127 | /* Copy struct to possibly-unaligned buffer */ |
| 2128 | memcpy(start_address, |
| 2129 | &serialized_snapshot, sizeof(SerializedSnapshotData)); |
| 2130 | |
| 2131 | /* Copy XID array */ |
| 2132 | if (snapshot->xcnt > 0) |
| 2133 | memcpy((TransactionId *) (start_address + |
| 2134 | sizeof(SerializedSnapshotData)), |
| 2135 | snapshot->xip, snapshot->xcnt * sizeof(TransactionId)); |
| 2136 | |
| 2137 | /* |
| 2138 | * Copy SubXID array. Don't bother to copy it if it had overflowed, |
| 2139 | * though, because it's not used anywhere in that case. Except if it's a |
| 2140 | * snapshot taken during recovery; all the top-level XIDs are in subxip as |
| 2141 | * well in that case, so we mustn't lose them. |
| 2142 | */ |
| 2143 | if (serialized_snapshot.subxcnt > 0) |
| 2144 | { |
| 2145 | Size subxipoff = sizeof(SerializedSnapshotData) + |
| 2146 | snapshot->xcnt * sizeof(TransactionId); |
| 2147 | |
| 2148 | memcpy((TransactionId *) (start_address + subxipoff), |
| 2149 | snapshot->subxip, snapshot->subxcnt * sizeof(TransactionId)); |
| 2150 | } |
| 2151 | } |
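| | /* |
| | * Resulting buffer layout, for reference (RestoreSnapshot below assumes |
| | * exactly this arrangement): |
| | * |
| | *   [ SerializedSnapshotData ][ xip: xcnt XIDs ][ subxip: subxcnt XIDs ] |
| | * |
| | * where the subxip part is omitted when subxcnt was reset to zero above. |
| | */ |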
| 2152 | |
| 2153 | /* |
| 2154 | * RestoreSnapshot |
| 2155 | * Restore a serialized snapshot from the specified address. |
| 2156 | * |
| 2157 | * The copy is palloc'd in TopTransactionContext and has initial refcounts set |
| 2158 | * to 0. The returned snapshot has the copied flag set. |
| 2159 | */ |
| 2160 | Snapshot |
| 2161 | RestoreSnapshot(char *start_address) |
| 2162 | { |
| 2163 | SerializedSnapshotData serialized_snapshot; |
| 2164 | Size size; |
| 2165 | Snapshot snapshot; |
| 2166 | TransactionId *serialized_xids; |
| 2167 | |
| 2168 | memcpy(&serialized_snapshot, start_address, |
| 2169 | sizeof(SerializedSnapshotData)); |
| 2170 | serialized_xids = (TransactionId *) |
| 2171 | (start_address + sizeof(SerializedSnapshotData)); |
| 2172 | |
| 2173 | /* We allocate any XID arrays needed in the same palloc block. */ |
| 2174 | size = sizeof(SnapshotData) |
| 2175 | + serialized_snapshot.xcnt * sizeof(TransactionId) |
| 2176 | + serialized_snapshot.subxcnt * sizeof(TransactionId); |
| 2177 | |
| 2178 | /* Copy all required fields */ |
| 2179 | snapshot = (Snapshot) MemoryContextAlloc(TopTransactionContext, size); |
| 2180 | snapshot->snapshot_type = SNAPSHOT_MVCC; |
| 2181 | snapshot->xmin = serialized_snapshot.xmin; |
| 2182 | snapshot->xmax = serialized_snapshot.xmax; |
| 2183 | snapshot->xip = NULL; |
| 2184 | snapshot->xcnt = serialized_snapshot.xcnt; |
| 2185 | snapshot->subxip = NULL; |
| 2186 | snapshot->subxcnt = serialized_snapshot.subxcnt; |
| 2187 | snapshot->suboverflowed = serialized_snapshot.suboverflowed; |
| 2188 | snapshot->takenDuringRecovery = serialized_snapshot.takenDuringRecovery; |
| 2189 | snapshot->curcid = serialized_snapshot.curcid; |
| 2190 | snapshot->whenTaken = serialized_snapshot.whenTaken; |
| 2191 | snapshot->lsn = serialized_snapshot.lsn; |
| 2192 | |
| 2193 | /* Copy XIDs, if present. */ |
| 2194 | if (serialized_snapshot.xcnt > 0) |
| 2195 | { |
| 2196 | snapshot->xip = (TransactionId *) (snapshot + 1); |
| 2197 | memcpy(snapshot->xip, serialized_xids, |
| 2198 | serialized_snapshot.xcnt * sizeof(TransactionId)); |
| 2199 | } |
| 2200 | |
| 2201 | /* Copy SubXIDs, if present. */ |
| 2202 | if (serialized_snapshot.subxcnt > 0) |
| 2203 | { |
| 2204 | snapshot->subxip = ((TransactionId *) (snapshot + 1)) + |
| 2205 | serialized_snapshot.xcnt; |
| 2206 | memcpy(snapshot->subxip, serialized_xids + serialized_snapshot.xcnt, |
| 2207 | serialized_snapshot.subxcnt * sizeof(TransactionId)); |
| 2208 | } |
| 2209 | |
| 2210 | /* Set the copied flag so that the caller will set refcounts correctly. */ |
| 2211 | snapshot->regd_count = 0; |
| 2212 | snapshot->active_count = 0; |
| 2213 | snapshot->copied = true; |
| 2214 | |
| 2215 | return snapshot; |
| 2216 | } |
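| | /* |
| | * Illustrative round trip (not compiled), similar to how parallel query |
| | * hands a snapshot to worker processes; "snap" and "buf" are placeholder |
| | * names: |
| | * |
| | *   Size   size = EstimateSnapshotSpace(snap); |
| | *   char  *buf = palloc(size); |
| | * |
| | *   SerializeSnapshot(snap, buf); |
| | *   ...copy buf to the other process, e.g. via shared memory... |
| | *   Snapshot copy = RestoreSnapshot(buf);  // refcounts 0, copied = true |
| | */ |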
| 2217 | |
| 2218 | /* |
| 2219 | * Install a restored snapshot as the transaction snapshot. |
| 2220 | * |
| 2221 | * The second argument is of type void * so that snapmgr.h need not include |
| 2222 | * the declaration for PGPROC. |
| 2223 | */ |
| 2224 | void |
| 2225 | RestoreTransactionSnapshot(Snapshot snapshot, void *master_pgproc) |
| 2226 | { |
| 2227 | SetTransactionSnapshot(snapshot, NULL, InvalidPid, master_pgproc); |
| 2228 | } |
| 2229 | |
| 2230 | /* |
| 2231 | * XidInMVCCSnapshot |
| 2232 | * Is the given XID still-in-progress according to the snapshot? |
| 2233 | * |
| 2234 | * Note: GetSnapshotData never stores either top xid or subxids of our own |
| 2235 | * backend into a snapshot, so these xids will not be reported as "running" |
| 2236 | * by this function. This is OK for current uses, because we always check |
| 2237 | * TransactionIdIsCurrentTransactionId first, except when it's known the |
| 2238 | * XID could not be ours anyway. |
| 2239 | */ |
| 2240 | bool |
| 2241 | XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot) |
| 2242 | { |
| 2243 | uint32 i; |
| 2244 | |
| 2245 | /* |
| 2246 | * Make a quick range check to eliminate most XIDs without looking at the |
| 2247 | * xip arrays. Note that this is OK even if we convert a subxact XID to |
| 2248 | * its parent below, because a subxact with XID < xmin has surely also got |
| 2249 | * a parent with XID < xmin, while one with XID >= xmax must belong to a |
| 2250 | * parent that was not yet committed at the time of this snapshot. |
| 2251 | */ |
| 2252 | |
| 2253 | /* Any xid < xmin is not in-progress */ |
| 2254 | if (TransactionIdPrecedes(xid, snapshot->xmin)) |
| 2255 | return false; |
| 2256 | /* Any xid >= xmax is in-progress */ |
| 2257 | if (TransactionIdFollowsOrEquals(xid, snapshot->xmax)) |
| 2258 | return true; |
| 2259 | |
| 2260 | /* |
| 2261 | * Snapshot information is stored slightly differently in snapshots taken |
| 2262 | * during recovery. |
| 2263 | */ |
| 2264 | if (!snapshot->takenDuringRecovery) |
| 2265 | { |
| 2266 | /* |
| 2267 | * If the snapshot contains full subxact data, the fastest way to |
| 2268 | * check things is just to compare the given XID against both subxact |
| 2269 | * XIDs and top-level XIDs. If the snapshot overflowed, we have to |
| 2270 | * use pg_subtrans to convert a subxact XID to its parent XID, but |
| 2271 | * then we need only look at top-level XIDs not subxacts. |
| 2272 | */ |
| 2273 | if (!snapshot->suboverflowed) |
| 2274 | { |
| 2275 | /* we have full data, so search subxip */ |
| 2276 | int32 j; |
| 2277 | |
| 2278 | for (j = 0; j < snapshot->subxcnt; j++) |
| 2279 | { |
| 2280 | if (TransactionIdEquals(xid, snapshot->subxip[j])) |
| 2281 | return true; |
| 2282 | } |
| 2283 | |
| 2284 | /* not there, fall through to search xip[] */ |
| 2285 | } |
| 2286 | else |
| 2287 | { |
| 2288 | /* |
| 2289 | * Snapshot overflowed, so convert xid to top-level. This is safe |
| 2290 | * because we eliminated too-old XIDs above. |
| 2291 | */ |
| 2292 | xid = SubTransGetTopmostTransaction(xid); |
| 2293 | |
| 2294 | /* |
| 2295 | * If xid was indeed a subxact, we might now have an xid < xmin, |
| 2296 | * so recheck to avoid an array scan. No point in rechecking |
| 2297 | * xmax. |
| 2298 | */ |
| 2299 | if (TransactionIdPrecedes(xid, snapshot->xmin)) |
| 2300 | return false; |
| 2301 | } |
| 2302 | |
| 2303 | for (i = 0; i < snapshot->xcnt; i++) |
| 2304 | { |
| 2305 | if (TransactionIdEquals(xid, snapshot->xip[i])) |
| 2306 | return true; |
| 2307 | } |
| 2308 | } |
| 2309 | else |
| 2310 | { |
| 2311 | int32 j; |
| 2312 | |
| 2313 | /* |
| 2314 | * In recovery we store all xids in the subxact array because it is by |
| 2315 | * far the bigger array, and we mostly don't know which xids are |
| 2316 | * top-level and which are subxacts. The xip array is empty. |
| 2317 | * |
| 2318 | * We start by searching subtrans, if we overflowed. |
| 2319 | */ |
| 2320 | if (snapshot->suboverflowed) |
| 2321 | { |
| 2322 | /* |
| 2323 | * Snapshot overflowed, so convert xid to top-level. This is safe |
| 2324 | * because we eliminated too-old XIDs above. |
| 2325 | */ |
| 2326 | xid = SubTransGetTopmostTransaction(xid); |
| 2327 | |
| 2328 | /* |
| 2329 | * If xid was indeed a subxact, we might now have an xid < xmin, |
| 2330 | * so recheck to avoid an array scan. No point in rechecking |
| 2331 | * xmax. |
| 2332 | */ |
| 2333 | if (TransactionIdPrecedes(xid, snapshot->xmin)) |
| 2334 | return false; |
| 2335 | } |
| 2336 | |
| 2337 | /* |
| 2338 | * We now have either a top-level xid higher than xmin or an |
| 2339 | * indeterminate xid. We don't know whether it's top level or subxact |
| 2340 | * but it doesn't matter. If it's present, the xid is still running. |
| 2341 | */ |
| 2342 | for (j = 0; j < snapshot->subxcnt; j++) |
| 2343 | { |
| 2344 | if (TransactionIdEquals(xid, snapshot->subxip[j])) |
| 2345 | return true; |
| 2346 | } |
| 2347 | } |
| 2348 | |
| 2349 | return false; |
| 2350 | } |
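| | /* |
| | * Worked example (illustrative values): for a non-recovery snapshot with |
| | * xmin = 100, xmax = 150, xip = {105, 120} and no subxact overflow, the |
| | * function above reports: |
| | * |
| | *   xid =  90 -> false  (precedes xmin, so it completed before the snapshot) |
| | *   xid = 160 -> true   (>= xmax, so treated as still in progress) |
| | *   xid = 105 -> true   (listed in xip[]) |
| | *   xid = 110 -> false  (in range but not listed, so not in progress) |
| | */ |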
| 2351 | |