/*-------------------------------------------------------------------------
 *
 * nbtxlog.c
 *	  WAL replay logic for btrees.
 *
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/nbtree/nbtxlog.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/bufmask.h"
#include "access/nbtree.h"
#include "access/nbtxlog.h"
#include "access/transam.h"
#include "access/xlog.h"
#include "access/xlogutils.h"
#include "storage/procarray.h"
#include "miscadmin.h"

/*
 * _bt_restore_page -- re-enter all the index tuples on a page
 *
 * The page is freshly init'd, and *from (length len) is a copy of what
 * had been its upper part (pd_upper to pd_special).  We assume that the
 * tuples had been added to the page in item-number order, and therefore
 * the one with highest item number appears first (lowest on the page).
 */
static void
_bt_restore_page(Page page, char *from, int len)
{
	IndexTupleData itupdata;
	Size		itemsz;
	char	   *end = from + len;
	Item		items[MaxIndexTuplesPerPage];
	uint16		itemsizes[MaxIndexTuplesPerPage];
	int			i;
	int			nitems;

	/*
	 * To get the items back in the original order, we add them to the page
	 * in reverse.  To figure out where one tuple ends and another begins, we
	 * have to scan them in forward order first.
	 */
	i = 0;
	while (from < end)
	{
		/*
		 * As we step through the items, 'from' won't always be properly
		 * aligned, so we need to use memcpy().  Further, we use Item (which
		 * is just a char*) here for our items array for the same reason;
		 * wouldn't want the compiler or anyone thinking that an item is
		 * aligned when it isn't.
		 */
		memcpy(&itupdata, from, sizeof(IndexTupleData));
		itemsz = IndexTupleSize(&itupdata);
		itemsz = MAXALIGN(itemsz);

		items[i] = (Item) from;
		itemsizes[i] = itemsz;
		i++;

		from += itemsz;
	}
	nitems = i;

	for (i = nitems - 1; i >= 0; i--)
	{
		if (PageAddItem(page, items[i], itemsizes[i], nitems - i,
						false, false) == InvalidOffsetNumber)
			elog(PANIC, "_bt_restore_page: cannot add item to page");
		from += itemsz;
	}
}
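
/*
 * For illustration only: a minimal sketch of how the logging side could
 * produce the "from"/"len" image that _bt_restore_page() consumes.  The real
 * registration happens in the split/newroot WAL-insert paths in nbtinsert.c;
 * the helper name below is hypothetical and the block is compiled out.
 */
#ifdef NOT_USED
static void
_bt_register_restorable_image(uint8 block_id, Page page)
{
	PageHeader	phdr = (PageHeader) page;

	/*
	 * The restorable image is simply the used upper part of the page,
	 * pd_upper..pd_special: the index tuples laid out back to back, highest
	 * item number first, exactly as _bt_restore_page() expects.
	 */
	XLogRegisterBufData(block_id,
						(char *) page + phdr->pd_upper,
						phdr->pd_special - phdr->pd_upper);
}
#endif							/* NOT_USED */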

static void
_bt_restore_meta(XLogReaderState *record, uint8 block_id)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	Buffer		metabuf;
	Page		metapg;
	BTMetaPageData *md;
	BTPageOpaque pageop;
	xl_btree_metadata *xlrec;
	char	   *ptr;
	Size		len;

	metabuf = XLogInitBufferForRedo(record, block_id);
	ptr = XLogRecGetBlockData(record, block_id, &len);

	Assert(len == sizeof(xl_btree_metadata));
	Assert(BufferGetBlockNumber(metabuf) == BTREE_METAPAGE);
	xlrec = (xl_btree_metadata *) ptr;
	metapg = BufferGetPage(metabuf);

	_bt_pageinit(metapg, BufferGetPageSize(metabuf));

	md = BTPageGetMeta(metapg);
	md->btm_magic = BTREE_MAGIC;
	md->btm_version = xlrec->version;
	md->btm_root = xlrec->root;
	md->btm_level = xlrec->level;
	md->btm_fastroot = xlrec->fastroot;
	md->btm_fastlevel = xlrec->fastlevel;
	/* Cannot log BTREE_MIN_VERSION index metapage without upgrade */
	Assert(md->btm_version >= BTREE_NOVAC_VERSION);
	md->btm_oldest_btpo_xact = xlrec->oldest_btpo_xact;
	md->btm_last_cleanup_num_heap_tuples = xlrec->last_cleanup_num_heap_tuples;

	pageop = (BTPageOpaque) PageGetSpecialPointer(metapg);
	pageop->btpo_flags = BTP_META;

	/*
	 * Set pd_lower just past the end of the metadata.  This is essential,
	 * because without doing so, metadata will be lost if xlog.c compresses
	 * the page.
	 */
	((PageHeader) metapg)->pd_lower =
		((char *) md + sizeof(BTMetaPageData)) - (char *) metapg;

	PageSetLSN(metapg, lsn);
	MarkBufferDirty(metabuf);
	UnlockReleaseBuffer(metabuf);
}
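
/*
 * For illustration only: given the usual definition of BTPageGetMeta() (the
 * metadata begins at PageGetContents(), i.e. at
 * MAXALIGN(SizeOfPageHeaderData)), the pd_lower set above works out to the
 * constant shown below.  Hypothetical check, compiled out.
 */
#ifdef NOT_USED
static void
_bt_check_restored_meta_pd_lower(Page metapg)
{
	Assert(((PageHeader) metapg)->pd_lower ==
		   MAXALIGN(SizeOfPageHeaderData) + sizeof(BTMetaPageData));
}
#endif							/* NOT_USED */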

/*
 * _bt_clear_incomplete_split -- clear INCOMPLETE_SPLIT flag on a page
 *
 * This is a common subroutine of the redo functions of all the WAL record
 * types that can insert a downlink: insert, split, and newroot.
 */
static void
_bt_clear_incomplete_split(XLogReaderState *record, uint8 block_id)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	Buffer		buf;

	if (XLogReadBufferForRedo(record, block_id, &buf) == BLK_NEEDS_REDO)
	{
		Page		page = (Page) BufferGetPage(buf);
		BTPageOpaque pageop = (BTPageOpaque) PageGetSpecialPointer(page);

		Assert(P_INCOMPLETE_SPLIT(pageop));
		pageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;

		PageSetLSN(page, lsn);
		MarkBufferDirty(buf);
	}
	if (BufferIsValid(buf))
		UnlockReleaseBuffer(buf);
}

static void
btree_xlog_insert(bool isleaf, bool ismeta, XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_btree_insert *xlrec = (xl_btree_insert *) XLogRecGetData(record);
	Buffer		buffer;
	Page		page;

	/*
	 * Insertion to an internal page finishes an incomplete split at the
	 * child level.  Clear the incomplete-split flag in the child.  Note:
	 * during normal operation, the child and parent pages are locked at the
	 * same time, so that clearing the flag and inserting the downlink appear
	 * atomic to other backends.  We don't bother with that during replay,
	 * because readers don't care about the incomplete-split flag and there
	 * cannot be updates happening.
	 */
	if (!isleaf)
		_bt_clear_incomplete_split(record, 1);
	if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
	{
		Size		datalen;
		char	   *datapos = XLogRecGetBlockData(record, 0, &datalen);

		page = BufferGetPage(buffer);

		if (PageAddItem(page, (Item) datapos, datalen, xlrec->offnum,
						false, false) == InvalidOffsetNumber)
			elog(PANIC, "btree_xlog_insert: failed to add item");

		PageSetLSN(page, lsn);
		MarkBufferDirty(buffer);
	}
	if (BufferIsValid(buffer))
		UnlockReleaseBuffer(buffer);

	/*
	 * Note: in normal operation, we'd update the metapage while still
	 * holding lock on the page we inserted into.  But during replay it's not
	 * necessary to hold that lock, since no other index updates can be
	 * happening concurrently, and readers will cope fine with following an
	 * obsolete link from the metapage.
	 */
	if (ismeta)
		_bt_restore_meta(record, 2);
}
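
/*
 * For illustration only: the block 0 payload applied above is assumed to be
 * just the new index tuple, registered on the primary roughly as sketched
 * below.  The real call sits in the insert WAL-logging path in nbtinsert.c;
 * the helper name is hypothetical and the block is compiled out.
 */
#ifdef NOT_USED
static void
_bt_register_insert_payload(IndexTuple itup)
{
	/* the whole tuple becomes the registered block data for block 0 */
	XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup));
}
#endif							/* NOT_USED */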

static void
btree_xlog_split(bool onleft, XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_btree_split *xlrec = (xl_btree_split *) XLogRecGetData(record);
	bool		isleaf = (xlrec->level == 0);
	Buffer		lbuf;
	Buffer		rbuf;
	Page		rpage;
	BTPageOpaque ropaque;
	char	   *datapos;
	Size		datalen;
	BlockNumber leftsib;
	BlockNumber rightsib;
	BlockNumber rnext;

	XLogRecGetBlockTag(record, 0, NULL, NULL, &leftsib);
	XLogRecGetBlockTag(record, 1, NULL, NULL, &rightsib);
	if (!XLogRecGetBlockTag(record, 2, NULL, NULL, &rnext))
		rnext = P_NONE;

	/*
	 * Clear the incomplete split flag on the left sibling of the child page
	 * this is a downlink for.  (Like in btree_xlog_insert, this can be done
	 * before locking the other pages)
	 */
	if (!isleaf)
		_bt_clear_incomplete_split(record, 3);

	/* Reconstruct right (new) sibling page from scratch */
	rbuf = XLogInitBufferForRedo(record, 1);
	datapos = XLogRecGetBlockData(record, 1, &datalen);
	rpage = (Page) BufferGetPage(rbuf);

	_bt_pageinit(rpage, BufferGetPageSize(rbuf));
	ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage);

	ropaque->btpo_prev = leftsib;
	ropaque->btpo_next = rnext;
	ropaque->btpo.level = xlrec->level;
	ropaque->btpo_flags = isleaf ? BTP_LEAF : 0;
	ropaque->btpo_cycleid = 0;

	_bt_restore_page(rpage, datapos, datalen);

	PageSetLSN(rpage, lsn);
	MarkBufferDirty(rbuf);

	/* Now reconstruct left (original) sibling page */
	if (XLogReadBufferForRedo(record, 0, &lbuf) == BLK_NEEDS_REDO)
	{
		/*
		 * To retain the same physical order of the tuples that they had, we
		 * initialize a temporary empty page for the left page and add all
		 * the items to that in item number order.  This mirrors how
		 * _bt_split() works.  Retaining the same physical order makes WAL
		 * consistency checking possible.  See also _bt_restore_page(), which
		 * does the same for the right page.
		 */
		Page		lpage = (Page) BufferGetPage(lbuf);
		BTPageOpaque lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);
		OffsetNumber off;
		IndexTuple	newitem = NULL,
					left_hikey = NULL;
		Size		newitemsz = 0,
					left_hikeysz = 0;
		Page		newlpage;
		OffsetNumber leftoff;

		datapos = XLogRecGetBlockData(record, 0, &datalen);

		if (onleft)
		{
			newitem = (IndexTuple) datapos;
			newitemsz = MAXALIGN(IndexTupleSize(newitem));
			datapos += newitemsz;
			datalen -= newitemsz;
		}

		/* Extract left hikey and its size (assuming 16-bit alignment) */
		left_hikey = (IndexTuple) datapos;
		left_hikeysz = MAXALIGN(IndexTupleSize(left_hikey));
		datapos += left_hikeysz;
		datalen -= left_hikeysz;

		Assert(datalen == 0);

		newlpage = PageGetTempPageCopySpecial(lpage);

		/* Set high key */
		leftoff = P_HIKEY;
		if (PageAddItem(newlpage, (Item) left_hikey, left_hikeysz,
						P_HIKEY, false, false) == InvalidOffsetNumber)
			elog(PANIC, "failed to add high key to left page after split");
		leftoff = OffsetNumberNext(leftoff);

		for (off = P_FIRSTDATAKEY(lopaque); off < xlrec->firstright; off++)
		{
			ItemId		itemid;
			Size		itemsz;
			IndexTuple	item;

			/* add the new item if it was inserted on left page */
			if (onleft && off == xlrec->newitemoff)
			{
				if (PageAddItem(newlpage, (Item) newitem, newitemsz, leftoff,
								false, false) == InvalidOffsetNumber)
					elog(ERROR, "failed to add new item to left page after split");
				leftoff = OffsetNumberNext(leftoff);
			}

			itemid = PageGetItemId(lpage, off);
			itemsz = ItemIdGetLength(itemid);
			item = (IndexTuple) PageGetItem(lpage, itemid);
			if (PageAddItem(newlpage, (Item) item, itemsz, leftoff,
							false, false) == InvalidOffsetNumber)
				elog(ERROR, "failed to add old item to left page after split");
			leftoff = OffsetNumberNext(leftoff);
		}

		/* cope with possibility that newitem goes at the end */
		if (onleft && off == xlrec->newitemoff)
		{
			if (PageAddItem(newlpage, (Item) newitem, newitemsz, leftoff,
							false, false) == InvalidOffsetNumber)
				elog(ERROR, "failed to add new item to left page after split");
			leftoff = OffsetNumberNext(leftoff);
		}

		PageRestoreTempPage(newlpage, lpage);

		/* Fix opaque fields */
		lopaque->btpo_flags = BTP_INCOMPLETE_SPLIT;
		if (isleaf)
			lopaque->btpo_flags |= BTP_LEAF;
		lopaque->btpo_next = rightsib;
		lopaque->btpo_cycleid = 0;

		PageSetLSN(lpage, lsn);
		MarkBufferDirty(lbuf);
	}

	/* We no longer need the buffers */
	if (BufferIsValid(lbuf))
		UnlockReleaseBuffer(lbuf);
	UnlockReleaseBuffer(rbuf);

	/*
	 * Fix left-link of the page to the right of the new right sibling.
	 *
	 * Note: in normal operation, we do this while still holding lock on the
	 * two split pages.  However, that's not necessary for correctness in WAL
	 * replay, because no other index update can be in progress, and readers
	 * will cope properly when following an obsolete left-link.
	 */
	if (rnext != P_NONE)
	{
		Buffer		buffer;

		if (XLogReadBufferForRedo(record, 2, &buffer) == BLK_NEEDS_REDO)
		{
			Page		page = (Page) BufferGetPage(buffer);
			BTPageOpaque pageop = (BTPageOpaque) PageGetSpecialPointer(page);

			pageop->btpo_prev = rightsib;

			PageSetLSN(page, lsn);
			MarkBufferDirty(buffer);
		}
		if (BufferIsValid(buffer))
			UnlockReleaseBuffer(buffer);
	}
}
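
/*
 * For illustration only: btree_xlog_split() above assumes the block 0 (left
 * page) payload is laid out as
 *
 *		[new item, only when it went to the left half]  [left page's high key]
 *
 * with each piece registered at its MAXALIGN'd size.  A hypothetical
 * logging-side counterpart might look like this (compiled out):
 */
#ifdef NOT_USED
static void
_bt_register_split_left_payload(bool newitemonleft,
								IndexTuple newitem,
								IndexTuple lefthikey)
{
	if (newitemonleft)
		XLogRegisterBufData(0, (char *) newitem,
							MAXALIGN(IndexTupleSize(newitem)));
	XLogRegisterBufData(0, (char *) lefthikey,
						MAXALIGN(IndexTupleSize(lefthikey)));
}
#endif							/* NOT_USED */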

static void
btree_xlog_vacuum(XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	Buffer		buffer;
	Page		page;
	BTPageOpaque opaque;
#ifdef UNUSED
	xl_btree_vacuum *xlrec = (xl_btree_vacuum *) XLogRecGetData(record);

	/*
	 * This section of code is thought to be no longer needed, after analysis
	 * of the calling paths.  It is retained to allow the code to be
	 * reinstated if a flaw is revealed in that thinking.
	 *
	 * If we are running non-MVCC scans using this index we need to do some
	 * additional work to ensure correctness, which is known as a "pin scan"
	 * described in more detail in next paragraphs.  We used to do the extra
	 * work in all cases, whereas we now avoid that work in most cases.  If
	 * lastBlockVacuumed is set to InvalidBlockNumber then we skip the
	 * additional work required for the pin scan.
	 *
	 * Avoiding this extra work is important since it requires us to touch
	 * every page in the index, so is an O(N) operation.  Worse, it is an
	 * operation performed in the foreground during redo, so it delays
	 * replication directly.
	 *
	 * If queries might be active then we need to ensure every leaf page is
	 * unpinned between the lastBlockVacuumed and the current block, if there
	 * are any.  This prevents replay of the VACUUM from reaching the stage
	 * of removing heap tuples while there could still be indexscans "in
	 * flight" to those particular tuples for those scans which could be
	 * confused by finding new tuples at the old TID locations (see
	 * nbtree/README).
	 *
	 * It might be worth checking if there are actually any backends running;
	 * if not, we could just skip this.
	 *
	 * Since VACUUM can visit leaf pages out-of-order, it might issue records
	 * with lastBlockVacuumed >= block; that's not an error, it just means
	 * nothing to do now.
	 *
	 * Note: since we touch all pages in the range, we will lock non-leaf
	 * pages, and also any empty (all-zero) pages that may be in the index.
	 * It doesn't seem worth the complexity to avoid that.  But it's
	 * important that HotStandbyActiveInReplay() will not return true if the
	 * database isn't yet consistent; so we need not fear reading
	 * still-corrupt blocks here during crash recovery.
	 */
	if (HotStandbyActiveInReplay() && BlockNumberIsValid(xlrec->lastBlockVacuumed))
	{
		RelFileNode thisrnode;
		BlockNumber thisblkno;
		BlockNumber blkno;

		XLogRecGetBlockTag(record, 0, &thisrnode, NULL, &thisblkno);

		for (blkno = xlrec->lastBlockVacuumed + 1; blkno < thisblkno; blkno++)
		{
			/*
			 * We use RBM_NORMAL_NO_LOG mode because it's not an error
			 * condition to see all-zero pages.  The original btvacuumpage
			 * scan would have skipped over all-zero pages, noting them in
			 * FSM but not bothering to initialize them just yet; so we
			 * mustn't throw an error here.  (We could skip acquiring the
			 * cleanup lock if PageIsNew, but it's probably not worth the
			 * cycles to test.)
			 *
			 * XXX we don't actually need to read the block, we just need to
			 * confirm it is unpinned.  If we had a special call into the
			 * buffer manager we could optimise this so that if the block is
			 * not in shared_buffers we confirm it as unpinned.  Optimizing
			 * this is now moot, since in most cases we avoid the scan.
			 */
			buffer = XLogReadBufferExtended(thisrnode, MAIN_FORKNUM, blkno,
											RBM_NORMAL_NO_LOG);
			if (BufferIsValid(buffer))
			{
				LockBufferForCleanup(buffer);
				UnlockReleaseBuffer(buffer);
			}
		}
	}
#endif

	/*
	 * Like in btvacuumpage(), we need to take a cleanup lock on every leaf
	 * page.  See nbtree/README for details.
	 */
	if (XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &buffer)
		== BLK_NEEDS_REDO)
	{
		char	   *ptr;
		Size		len;

		ptr = XLogRecGetBlockData(record, 0, &len);

		page = (Page) BufferGetPage(buffer);

		if (len > 0)
		{
			OffsetNumber *unused;
			OffsetNumber *unend;

			unused = (OffsetNumber *) ptr;
			unend = (OffsetNumber *) ((char *) ptr + len);

			if ((unend - unused) > 0)
				PageIndexMultiDelete(page, unused, unend - unused);
		}

		/*
		 * Mark the page as not containing any LP_DEAD items --- see comments
		 * in _bt_delitems_vacuum().
		 */
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		opaque->btpo_flags &= ~BTP_HAS_GARBAGE;

		PageSetLSN(page, lsn);
		MarkBufferDirty(buffer);
	}
	if (BufferIsValid(buffer))
		UnlockReleaseBuffer(buffer);
}
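
/*
 * For illustration only: the block data decoded above is assumed to be a
 * bare array of OffsetNumbers naming the line pointers to remove, which is
 * what _bt_delitems_vacuum() would register on the primary.  Hypothetical
 * sketch, compiled out.
 */
#ifdef NOT_USED
static void
_bt_register_vacuum_payload(OffsetNumber *itemnos, int nitems)
{
	if (nitems > 0)
		XLogRegisterBufData(0, (char *) itemnos,
							nitems * sizeof(OffsetNumber));
}
#endif							/* NOT_USED */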

static void
btree_xlog_delete(XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_btree_delete *xlrec = (xl_btree_delete *) XLogRecGetData(record);
	Buffer		buffer;
	Page		page;
	BTPageOpaque opaque;

	/*
	 * If we have any conflict processing to do, it must happen before we
	 * update the page.
	 *
	 * Btree delete records can conflict with standby queries.  You might
	 * think that vacuum records would conflict as well, but we've handled
	 * that already.  XLOG_HEAP2_CLEANUP_INFO records provide the highest xid
	 * cleaned by the vacuum of the heap and so we can resolve any conflicts
	 * just once when that arrives.  After that we know that no conflicts
	 * exist from individual btree vacuum records on that index.
	 */
	if (InHotStandby)
	{
		RelFileNode rnode;

		XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL);

		ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, rnode);
	}

	/*
	 * We don't need to take a cleanup lock to apply these changes.  See
	 * nbtree/README for details.
	 */
	if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
	{
		page = (Page) BufferGetPage(buffer);

		if (XLogRecGetDataLen(record) > SizeOfBtreeDelete)
		{
			OffsetNumber *unused;

			unused = (OffsetNumber *) ((char *) xlrec + SizeOfBtreeDelete);

			PageIndexMultiDelete(page, unused, xlrec->nitems);
		}

		/*
		 * Mark the page as not containing any LP_DEAD items --- see comments
		 * in _bt_delitems_delete().
		 */
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		opaque->btpo_flags &= ~BTP_HAS_GARBAGE;

		PageSetLSN(page, lsn);
		MarkBufferDirty(buffer);
	}
	if (BufferIsValid(buffer))
		UnlockReleaseBuffer(buffer);
}
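
/*
 * For illustration only: unlike the vacuum case, the offsets removed above
 * are assumed to travel in the record's main data area, immediately after
 * the fixed-size xl_btree_delete struct, which is why the redo code reads
 * them at (char *) xlrec + SizeOfBtreeDelete.  A hypothetical logging-side
 * sketch (compiled out):
 */
#ifdef NOT_USED
static void
_bt_register_delete_payload(xl_btree_delete *xlrec_delete,
							OffsetNumber *itemnos, int nitems)
{
	XLogRegisterData((char *) xlrec_delete, SizeOfBtreeDelete);
	XLogRegisterData((char *) itemnos, nitems * sizeof(OffsetNumber));
}
#endif							/* NOT_USED */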

static void
btree_xlog_mark_page_halfdead(uint8 info, XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_btree_mark_page_halfdead *xlrec = (xl_btree_mark_page_halfdead *) XLogRecGetData(record);
	Buffer		buffer;
	Page		page;
	BTPageOpaque pageop;
	IndexTupleData trunctuple;

	/*
	 * In normal operation, we would lock all the pages this WAL record
	 * touches before changing any of them.  In WAL replay, it should be okay
	 * to lock just one page at a time, since no concurrent index updates can
	 * be happening, and readers should not care whether they arrive at the
	 * target page or not (since it's surely empty).
	 */

	/* parent page */
	if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO)
	{
		OffsetNumber poffset;
		ItemId		itemid;
		IndexTuple	itup;
		OffsetNumber nextoffset;
		BlockNumber rightsib;

		page = (Page) BufferGetPage(buffer);
		pageop = (BTPageOpaque) PageGetSpecialPointer(page);

		poffset = xlrec->poffset;

		nextoffset = OffsetNumberNext(poffset);
		itemid = PageGetItemId(page, nextoffset);
		itup = (IndexTuple) PageGetItem(page, itemid);
		rightsib = BTreeInnerTupleGetDownLink(itup);

		itemid = PageGetItemId(page, poffset);
		itup = (IndexTuple) PageGetItem(page, itemid);
		BTreeInnerTupleSetDownLink(itup, rightsib);
		nextoffset = OffsetNumberNext(poffset);
		PageIndexTupleDelete(page, nextoffset);

		PageSetLSN(page, lsn);
		MarkBufferDirty(buffer);
	}
	if (BufferIsValid(buffer))
		UnlockReleaseBuffer(buffer);

	/* Rewrite the leaf page as a halfdead page */
	buffer = XLogInitBufferForRedo(record, 0);
	page = (Page) BufferGetPage(buffer);

	_bt_pageinit(page, BufferGetPageSize(buffer));
	pageop = (BTPageOpaque) PageGetSpecialPointer(page);

	pageop->btpo_prev = xlrec->leftblk;
	pageop->btpo_next = xlrec->rightblk;
	pageop->btpo.level = 0;
	pageop->btpo_flags = BTP_HALF_DEAD | BTP_LEAF;
	pageop->btpo_cycleid = 0;

	/*
	 * Construct a dummy hikey item that points to the next parent to be
	 * deleted (if any).
	 */
	MemSet(&trunctuple, 0, sizeof(IndexTupleData));
	trunctuple.t_info = sizeof(IndexTupleData);
	BTreeTupleSetTopParent(&trunctuple, xlrec->topparent);

	if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY,
					false, false) == InvalidOffsetNumber)
		elog(ERROR, "could not add dummy high key to half-dead page");

	PageSetLSN(page, lsn);
	MarkBufferDirty(buffer);
	UnlockReleaseBuffer(buffer);
}

static void
btree_xlog_unlink_page(uint8 info, XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_btree_unlink_page *xlrec = (xl_btree_unlink_page *) XLogRecGetData(record);
	BlockNumber leftsib;
	BlockNumber rightsib;
	Buffer		buffer;
	Page		page;
	BTPageOpaque pageop;

	leftsib = xlrec->leftsib;
	rightsib = xlrec->rightsib;

	/*
	 * In normal operation, we would lock all the pages this WAL record
	 * touches before changing any of them.  In WAL replay, it should be okay
	 * to lock just one page at a time, since no concurrent index updates can
	 * be happening, and readers should not care whether they arrive at the
	 * target page or not (since it's surely empty).
	 */

	/* Fix left-link of right sibling */
	if (XLogReadBufferForRedo(record, 2, &buffer) == BLK_NEEDS_REDO)
	{
		page = (Page) BufferGetPage(buffer);
		pageop = (BTPageOpaque) PageGetSpecialPointer(page);
		pageop->btpo_prev = leftsib;

		PageSetLSN(page, lsn);
		MarkBufferDirty(buffer);
	}
	if (BufferIsValid(buffer))
		UnlockReleaseBuffer(buffer);

	/* Fix right-link of left sibling, if any */
	if (leftsib != P_NONE)
	{
		if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO)
		{
			page = (Page) BufferGetPage(buffer);
			pageop = (BTPageOpaque) PageGetSpecialPointer(page);
			pageop->btpo_next = rightsib;

			PageSetLSN(page, lsn);
			MarkBufferDirty(buffer);
		}
		if (BufferIsValid(buffer))
			UnlockReleaseBuffer(buffer);
	}

	/* Rewrite target page as empty deleted page */
	buffer = XLogInitBufferForRedo(record, 0);
	page = (Page) BufferGetPage(buffer);

	_bt_pageinit(page, BufferGetPageSize(buffer));
	pageop = (BTPageOpaque) PageGetSpecialPointer(page);

	pageop->btpo_prev = leftsib;
	pageop->btpo_next = rightsib;
	pageop->btpo.xact = xlrec->btpo_xact;
	pageop->btpo_flags = BTP_DELETED;
	pageop->btpo_cycleid = 0;

	PageSetLSN(page, lsn);
	MarkBufferDirty(buffer);
	UnlockReleaseBuffer(buffer);

	/*
	 * If we deleted a parent of the targeted leaf page, instead of the leaf
	 * itself, update the leaf to point to the next remaining child in the
	 * branch.
	 */
	if (XLogRecHasBlockRef(record, 3))
	{
		/*
		 * There is no real data on the page, so we just re-create it from
		 * scratch using the information from the WAL record.
		 */
		IndexTupleData trunctuple;

		buffer = XLogInitBufferForRedo(record, 3);
		page = (Page) BufferGetPage(buffer);

		_bt_pageinit(page, BufferGetPageSize(buffer));
		pageop = (BTPageOpaque) PageGetSpecialPointer(page);

		pageop->btpo_flags = BTP_HALF_DEAD | BTP_LEAF;
		pageop->btpo_prev = xlrec->leafleftsib;
		pageop->btpo_next = xlrec->leafrightsib;
		pageop->btpo.level = 0;
		pageop->btpo_cycleid = 0;

		/* Add a dummy hikey item */
		MemSet(&trunctuple, 0, sizeof(IndexTupleData));
		trunctuple.t_info = sizeof(IndexTupleData);
		BTreeTupleSetTopParent(&trunctuple, xlrec->topparent);

		if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY,
						false, false) == InvalidOffsetNumber)
			elog(ERROR, "could not add dummy high key to half-dead page");

		PageSetLSN(page, lsn);
		MarkBufferDirty(buffer);
		UnlockReleaseBuffer(buffer);
	}

	/* Update metapage if needed */
	if (info == XLOG_BTREE_UNLINK_PAGE_META)
		_bt_restore_meta(record, 4);
}

static void
btree_xlog_newroot(XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_btree_newroot *xlrec = (xl_btree_newroot *) XLogRecGetData(record);
	Buffer		buffer;
	Page		page;
	BTPageOpaque pageop;
	char	   *ptr;
	Size		len;

	buffer = XLogInitBufferForRedo(record, 0);
	page = (Page) BufferGetPage(buffer);

	_bt_pageinit(page, BufferGetPageSize(buffer));
	pageop = (BTPageOpaque) PageGetSpecialPointer(page);

	pageop->btpo_flags = BTP_ROOT;
	pageop->btpo_prev = pageop->btpo_next = P_NONE;
	pageop->btpo.level = xlrec->level;
	if (xlrec->level == 0)
		pageop->btpo_flags |= BTP_LEAF;
	pageop->btpo_cycleid = 0;

	if (xlrec->level > 0)
	{
		ptr = XLogRecGetBlockData(record, 0, &len);
		_bt_restore_page(page, ptr, len);

		/* Clear the incomplete-split flag in left child */
		_bt_clear_incomplete_split(record, 1);
	}

	PageSetLSN(page, lsn);
	MarkBufferDirty(buffer);
	UnlockReleaseBuffer(buffer);

	_bt_restore_meta(record, 2);
}

static void
btree_xlog_reuse_page(XLogReaderState *record)
{
	xl_btree_reuse_page *xlrec = (xl_btree_reuse_page *) XLogRecGetData(record);

	/*
	 * Btree reuse_page records exist to provide a conflict point when we
	 * reuse pages in the index via the FSM.  That's all they do though.
	 *
	 * latestRemovedXid was the page's btpo.xact.  The btpo.xact <
	 * RecentGlobalXmin test in _bt_page_recyclable() conceptually mirrors
	 * the pgxact->xmin > limitXmin test in GetConflictingVirtualXIDs().
	 * Consequently, one XID value achieves the same exclusion effect on
	 * master and standby.
	 */
	if (InHotStandby)
	{
		ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid,
											xlrec->node);
	}
}

void
btree_redo(XLogReaderState *record)
{
	uint8		info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;

	switch (info)
	{
		case XLOG_BTREE_INSERT_LEAF:
			btree_xlog_insert(true, false, record);
			break;
		case XLOG_BTREE_INSERT_UPPER:
			btree_xlog_insert(false, false, record);
			break;
		case XLOG_BTREE_INSERT_META:
			btree_xlog_insert(false, true, record);
			break;
		case XLOG_BTREE_SPLIT_L:
			btree_xlog_split(true, record);
			break;
		case XLOG_BTREE_SPLIT_R:
			btree_xlog_split(false, record);
			break;
		case XLOG_BTREE_VACUUM:
			btree_xlog_vacuum(record);
			break;
		case XLOG_BTREE_DELETE:
			btree_xlog_delete(record);
			break;
		case XLOG_BTREE_MARK_PAGE_HALFDEAD:
			btree_xlog_mark_page_halfdead(info, record);
			break;
		case XLOG_BTREE_UNLINK_PAGE:
		case XLOG_BTREE_UNLINK_PAGE_META:
			btree_xlog_unlink_page(info, record);
			break;
		case XLOG_BTREE_NEWROOT:
			btree_xlog_newroot(record);
			break;
		case XLOG_BTREE_REUSE_PAGE:
			btree_xlog_reuse_page(record);
			break;
		case XLOG_BTREE_META_CLEANUP:
			_bt_restore_meta(record, 0);
			break;
		default:
			elog(PANIC, "btree_redo: unknown op code %u", info);
	}
}
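
/*
 * btree_redo() and btree_mask() are not called directly; recovery reaches
 * them through the resource-manager table.  The corresponding entry in
 * src/include/access/rmgrlist.h looks approximately like:
 *
 *		PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify,
 *				NULL, NULL, btree_mask)
 */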

/*
 * Mask a btree page before performing consistency checks on it.
 */
void
btree_mask(char *pagedata, BlockNumber blkno)
{
	Page		page = (Page) pagedata;
	BTPageOpaque maskopaq;

	mask_page_lsn_and_checksum(page);

	mask_page_hint_bits(page);
	mask_unused_space(page);

	maskopaq = (BTPageOpaque) PageGetSpecialPointer(page);

	if (P_ISDELETED(maskopaq))
	{
		/*
		 * Mask page content on a DELETED page since it will be
		 * re-initialized during replay.  See btree_xlog_unlink_page() for
		 * details.
		 */
		mask_page_content(page);
	}
	else if (P_ISLEAF(maskopaq))
	{
		/*
		 * In btree leaf pages, it is possible to modify the LP_FLAGS without
		 * emitting any WAL record.  Hence, mask the line pointer flags.  See
		 * _bt_killitems(), _bt_check_unique() for details.
		 */
		mask_lp_flags(page);
	}

	/*
	 * BTP_HAS_GARBAGE is just an un-logged hint bit.  So, mask it.  See
	 * _bt_killitems(), _bt_check_unique() for details.
	 */
	maskopaq->btpo_flags &= ~BTP_HAS_GARBAGE;

	/*
	 * During replay of a btree page split, we don't set the BTP_SPLIT_END
	 * flag of the right sibling and initialize the cycle_id to 0 for the
	 * same page.  See btree_xlog_split() for details.
	 */
	maskopaq->btpo_flags &= ~BTP_SPLIT_END;
	maskopaq->btpo_cycleid = 0;
}