| 1 | /*------------------------------------------------------------------------- |
| 2 | * |
| 3 | * ginbtree.c |
| 4 | * page utilities routines for the postgres inverted index access method. |
| 5 | * |
| 6 | * |
| 7 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
| 8 | * Portions Copyright (c) 1994, Regents of the University of California |
| 9 | * |
| 10 | * IDENTIFICATION |
| 11 | * src/backend/access/gin/ginbtree.c |
| 12 | *------------------------------------------------------------------------- |
| 13 | */ |
| 14 | |
| 15 | #include "postgres.h" |
| 16 | |
| 17 | #include "access/gin_private.h" |
| 18 | #include "access/ginxlog.h" |
| 19 | #include "access/xloginsert.h" |
| 20 | #include "storage/predicate.h" |
| 21 | #include "miscadmin.h" |
| 22 | #include "utils/memutils.h" |
| 23 | #include "utils/rel.h" |
| 24 | |
| 25 | static void ginFindParents(GinBtree btree, GinBtreeStack *stack); |
| 26 | static bool ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, |
| 27 | void *insertdata, BlockNumber updateblkno, |
| 28 | Buffer childbuf, GinStatsData *buildStats); |
| 29 | static void ginFinishSplit(GinBtree btree, GinBtreeStack *stack, |
| 30 | bool freestack, GinStatsData *buildStats); |
| 31 | |
| 32 | /* |
| 33 | * Lock buffer by needed method for search. |
| 34 | */ |
| 35 | int |
| 36 | ginTraverseLock(Buffer buffer, bool searchMode) |
| 37 | { |
| 38 | Page page; |
| 39 | int access = GIN_SHARE; |
| 40 | |
| 41 | LockBuffer(buffer, GIN_SHARE); |
| 42 | page = BufferGetPage(buffer); |
| 43 | if (GinPageIsLeaf(page)) |
| 44 | { |
| 45 | if (searchMode == false) |
| 46 | { |
| 47 | /* we should relock our page */ |
| 48 | LockBuffer(buffer, GIN_UNLOCK); |
| 49 | LockBuffer(buffer, GIN_EXCLUSIVE); |
| 50 | |
| 51 | /* But root can become non-leaf during relock */ |
| 52 | if (!GinPageIsLeaf(page)) |
| 53 | { |
| 54 | /* restore old lock type (very rare) */ |
| 55 | LockBuffer(buffer, GIN_UNLOCK); |
| 56 | LockBuffer(buffer, GIN_SHARE); |
| 57 | } |
| 58 | else |
| 59 | access = GIN_EXCLUSIVE; |
| 60 | } |
| 61 | } |
| 62 | |
| 63 | return access; |
| 64 | } |
| 65 | |
| 66 | /* |
| 67 | * Descend the tree to the leaf page that contains or would contain the key |
| 68 | * we're searching for. The key should already be filled in 'btree', in |
| 69 | * tree-type specific manner. If btree->fullScan is true, descends to the |
| 70 | * leftmost leaf page. |
| 71 | * |
| 72 | * If 'searchmode' is false, on return stack->buffer is exclusively locked, |
| 73 | * and the stack represents the full path to the root. Otherwise stack->buffer |
| 74 | * is share-locked, and stack->parent is NULL. |
| 75 | * |
| 76 | * If 'rootConflictCheck' is true, tree root is checked for serialization |
| 77 | * conflict. |
| 78 | */ |
| 79 | GinBtreeStack * |
| 80 | ginFindLeafPage(GinBtree btree, bool searchMode, |
| 81 | bool rootConflictCheck, Snapshot snapshot) |
| 82 | { |
| 83 | GinBtreeStack *stack; |
| 84 | |
| 85 | stack = (GinBtreeStack *) palloc(sizeof(GinBtreeStack)); |
| 86 | stack->blkno = btree->rootBlkno; |
| 87 | stack->buffer = ReadBuffer(btree->index, btree->rootBlkno); |
| 88 | stack->parent = NULL; |
| 89 | stack->predictNumber = 1; |
| 90 | |
| 91 | if (rootConflictCheck) |
| 92 | CheckForSerializableConflictIn(btree->index, NULL, stack->buffer); |
| 93 | |
| 94 | for (;;) |
| 95 | { |
| 96 | Page page; |
| 97 | BlockNumber child; |
| 98 | int access; |
| 99 | |
| 100 | stack->off = InvalidOffsetNumber; |
| 101 | |
| 102 | page = BufferGetPage(stack->buffer); |
| 103 | TestForOldSnapshot(snapshot, btree->index, page); |
| 104 | |
| 105 | access = ginTraverseLock(stack->buffer, searchMode); |
| 106 | |
| 107 | /* |
| 108 | * If we're going to modify the tree, finish any incomplete splits we |
| 109 | * encounter on the way. |
| 110 | */ |
| 111 | if (!searchMode && GinPageIsIncompleteSplit(page)) |
| 112 | ginFinishSplit(btree, stack, false, NULL); |
| 113 | |
| 114 | /* |
| 115 | * ok, page is correctly locked, we should check to move right .., |
| 116 | * root never has a right link, so small optimization |
| 117 | */ |
| 118 | while (btree->fullScan == false && stack->blkno != btree->rootBlkno && |
| 119 | btree->isMoveRight(btree, page)) |
| 120 | { |
| 121 | BlockNumber rightlink = GinPageGetOpaque(page)->rightlink; |
| 122 | |
| 123 | if (rightlink == InvalidBlockNumber) |
| 124 | /* rightmost page */ |
| 125 | break; |
| 126 | |
| 127 | stack->buffer = ginStepRight(stack->buffer, btree->index, access); |
| 128 | stack->blkno = rightlink; |
| 129 | page = BufferGetPage(stack->buffer); |
| 130 | TestForOldSnapshot(snapshot, btree->index, page); |
| 131 | |
| 132 | if (!searchMode && GinPageIsIncompleteSplit(page)) |
| 133 | ginFinishSplit(btree, stack, false, NULL); |
| 134 | } |
| 135 | |
| 136 | if (GinPageIsLeaf(page)) /* we found, return locked page */ |
| 137 | return stack; |
| 138 | |
| 139 | /* now we have correct buffer, try to find child */ |
| 140 | child = btree->findChildPage(btree, stack); |
| 141 | |
| 142 | LockBuffer(stack->buffer, GIN_UNLOCK); |
| 143 | Assert(child != InvalidBlockNumber); |
| 144 | Assert(stack->blkno != child); |
| 145 | |
| 146 | if (searchMode) |
| 147 | { |
| 148 | /* in search mode we may forget path to leaf */ |
| 149 | stack->blkno = child; |
| 150 | stack->buffer = ReleaseAndReadBuffer(stack->buffer, btree->index, stack->blkno); |
| 151 | } |
| 152 | else |
| 153 | { |
| 154 | GinBtreeStack *ptr = (GinBtreeStack *) palloc(sizeof(GinBtreeStack)); |
| 155 | |
| 156 | ptr->parent = stack; |
| 157 | stack = ptr; |
| 158 | stack->blkno = child; |
| 159 | stack->buffer = ReadBuffer(btree->index, stack->blkno); |
| 160 | stack->predictNumber = 1; |
| 161 | } |
| 162 | } |
| 163 | } |
| 164 | |
| 165 | /* |
| 166 | * Step right from current page. |
| 167 | * |
| 168 | * The next page is locked first, before releasing the current page. This is |
| 169 | * crucial to protect from concurrent page deletion (see comment in |
| 170 | * ginDeletePage). |
| 171 | */ |
| 172 | Buffer |
| 173 | ginStepRight(Buffer buffer, Relation index, int lockmode) |
| 174 | { |
| 175 | Buffer nextbuffer; |
| 176 | Page page = BufferGetPage(buffer); |
| 177 | bool isLeaf = GinPageIsLeaf(page); |
| 178 | bool isData = GinPageIsData(page); |
| 179 | BlockNumber blkno = GinPageGetOpaque(page)->rightlink; |
| 180 | |
| 181 | nextbuffer = ReadBuffer(index, blkno); |
| 182 | LockBuffer(nextbuffer, lockmode); |
| 183 | UnlockReleaseBuffer(buffer); |
| 184 | |
| 185 | /* Sanity check that the page we stepped to is of similar kind. */ |
| 186 | page = BufferGetPage(nextbuffer); |
| 187 | if (isLeaf != GinPageIsLeaf(page) || isData != GinPageIsData(page)) |
| 188 | elog(ERROR, "right sibling of GIN page is of different type" ); |
| 189 | |
| 190 | /* |
| 191 | * Given the proper lock sequence above, we should never land on a deleted |
| 192 | * page. |
| 193 | */ |
| 194 | if (GinPageIsDeleted(page)) |
| 195 | elog(ERROR, "right sibling of GIN page was deleted" ); |
| 196 | |
| 197 | return nextbuffer; |
| 198 | } |
| 199 | |
| 200 | void |
| 201 | freeGinBtreeStack(GinBtreeStack *stack) |
| 202 | { |
| 203 | while (stack) |
| 204 | { |
| 205 | GinBtreeStack *tmp = stack->parent; |
| 206 | |
| 207 | if (stack->buffer != InvalidBuffer) |
| 208 | ReleaseBuffer(stack->buffer); |
| 209 | |
| 210 | pfree(stack); |
| 211 | stack = tmp; |
| 212 | } |
| 213 | } |
| 214 | |
| 215 | /* |
| 216 | * Try to find parent for current stack position. Returns correct parent and |
| 217 | * child's offset in stack->parent. The root page is never released, to |
| 218 | * prevent conflict with vacuum process. |
| 219 | */ |
| 220 | static void |
| 221 | ginFindParents(GinBtree btree, GinBtreeStack *stack) |
| 222 | { |
| 223 | Page page; |
| 224 | Buffer buffer; |
| 225 | BlockNumber blkno, |
| 226 | leftmostBlkno; |
| 227 | OffsetNumber offset; |
| 228 | GinBtreeStack *root; |
| 229 | GinBtreeStack *ptr; |
| 230 | |
| 231 | /* |
| 232 | * Unwind the stack all the way up to the root, leaving only the root |
| 233 | * item. |
| 234 | * |
| 235 | * Be careful not to release the pin on the root page! The pin on root |
| 236 | * page is required to lock out concurrent vacuums on the tree. |
| 237 | */ |
| 238 | root = stack->parent; |
| 239 | while (root->parent) |
| 240 | { |
| 241 | ReleaseBuffer(root->buffer); |
| 242 | root = root->parent; |
| 243 | } |
| 244 | |
| 245 | Assert(root->blkno == btree->rootBlkno); |
| 246 | Assert(BufferGetBlockNumber(root->buffer) == btree->rootBlkno); |
| 247 | root->off = InvalidOffsetNumber; |
| 248 | |
| 249 | blkno = root->blkno; |
| 250 | buffer = root->buffer; |
| 251 | offset = InvalidOffsetNumber; |
| 252 | |
| 253 | ptr = (GinBtreeStack *) palloc(sizeof(GinBtreeStack)); |
| 254 | |
| 255 | for (;;) |
| 256 | { |
| 257 | LockBuffer(buffer, GIN_EXCLUSIVE); |
| 258 | page = BufferGetPage(buffer); |
| 259 | if (GinPageIsLeaf(page)) |
| 260 | elog(ERROR, "Lost path" ); |
| 261 | |
| 262 | if (GinPageIsIncompleteSplit(page)) |
| 263 | { |
| 264 | Assert(blkno != btree->rootBlkno); |
| 265 | ptr->blkno = blkno; |
| 266 | ptr->buffer = buffer; |
| 267 | |
| 268 | /* |
| 269 | * parent may be wrong, but if so, the ginFinishSplit call will |
| 270 | * recurse to call ginFindParents again to fix it. |
| 271 | */ |
| 272 | ptr->parent = root; |
| 273 | ptr->off = InvalidOffsetNumber; |
| 274 | |
| 275 | ginFinishSplit(btree, ptr, false, NULL); |
| 276 | } |
| 277 | |
| 278 | leftmostBlkno = btree->getLeftMostChild(btree, page); |
| 279 | |
| 280 | while ((offset = btree->findChildPtr(btree, page, stack->blkno, InvalidOffsetNumber)) == InvalidOffsetNumber) |
| 281 | { |
| 282 | blkno = GinPageGetOpaque(page)->rightlink; |
| 283 | if (blkno == InvalidBlockNumber) |
| 284 | { |
| 285 | UnlockReleaseBuffer(buffer); |
| 286 | break; |
| 287 | } |
| 288 | buffer = ginStepRight(buffer, btree->index, GIN_EXCLUSIVE); |
| 289 | page = BufferGetPage(buffer); |
| 290 | |
| 291 | /* finish any incomplete splits, as above */ |
| 292 | if (GinPageIsIncompleteSplit(page)) |
| 293 | { |
| 294 | Assert(blkno != btree->rootBlkno); |
| 295 | ptr->blkno = blkno; |
| 296 | ptr->buffer = buffer; |
| 297 | ptr->parent = root; |
| 298 | ptr->off = InvalidOffsetNumber; |
| 299 | |
| 300 | ginFinishSplit(btree, ptr, false, NULL); |
| 301 | } |
| 302 | } |
| 303 | |
| 304 | if (blkno != InvalidBlockNumber) |
| 305 | { |
| 306 | ptr->blkno = blkno; |
| 307 | ptr->buffer = buffer; |
| 308 | ptr->parent = root; /* it may be wrong, but in next call we will |
| 309 | * correct */ |
| 310 | ptr->off = offset; |
| 311 | stack->parent = ptr; |
| 312 | return; |
| 313 | } |
| 314 | |
| 315 | /* Descend down to next level */ |
| 316 | blkno = leftmostBlkno; |
| 317 | buffer = ReadBuffer(btree->index, blkno); |
| 318 | } |
| 319 | } |
| 320 | |
| 321 | /* |
| 322 | * Insert a new item to a page. |
| 323 | * |
| 324 | * Returns true if the insertion was finished. On false, the page was split and |
| 325 | * the parent needs to be updated. (A root split returns true as it doesn't |
| 326 | * need any further action by the caller to complete.) |
| 327 | * |
| 328 | * When inserting a downlink to an internal page, 'childbuf' contains the |
| 329 | * child page that was split. Its GIN_INCOMPLETE_SPLIT flag will be cleared |
| 330 | * atomically with the insert. Also, the existing item at offset stack->off |
| 331 | * in the target page is updated to point to updateblkno. |
| 332 | * |
| 333 | * stack->buffer is locked on entry, and is kept locked. |
| 334 | * Likewise for childbuf, if given. |
| 335 | */ |
| 336 | static bool |
| 337 | ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, |
| 338 | void *insertdata, BlockNumber updateblkno, |
| 339 | Buffer childbuf, GinStatsData *buildStats) |
| 340 | { |
| 341 | Page page = BufferGetPage(stack->buffer); |
| 342 | bool result; |
| 343 | GinPlaceToPageRC rc; |
| 344 | uint16 xlflags = 0; |
| 345 | Page childpage = NULL; |
| 346 | Page newlpage = NULL, |
| 347 | newrpage = NULL; |
| 348 | void *ptp_workspace = NULL; |
| 349 | MemoryContext tmpCxt; |
| 350 | MemoryContext oldCxt; |
| 351 | |
| 352 | /* |
| 353 | * We do all the work of this function and its subfunctions in a temporary |
| 354 | * memory context. This avoids leakages and simplifies APIs, since some |
| 355 | * subfunctions allocate storage that has to survive until we've finished |
| 356 | * the WAL insertion. |
| 357 | */ |
| 358 | tmpCxt = AllocSetContextCreate(CurrentMemoryContext, |
| 359 | "ginPlaceToPage temporary context" , |
| 360 | ALLOCSET_DEFAULT_SIZES); |
| 361 | oldCxt = MemoryContextSwitchTo(tmpCxt); |
| 362 | |
| 363 | if (GinPageIsData(page)) |
| 364 | xlflags |= GIN_INSERT_ISDATA; |
| 365 | if (GinPageIsLeaf(page)) |
| 366 | { |
| 367 | xlflags |= GIN_INSERT_ISLEAF; |
| 368 | Assert(!BufferIsValid(childbuf)); |
| 369 | Assert(updateblkno == InvalidBlockNumber); |
| 370 | } |
| 371 | else |
| 372 | { |
| 373 | Assert(BufferIsValid(childbuf)); |
| 374 | Assert(updateblkno != InvalidBlockNumber); |
| 375 | childpage = BufferGetPage(childbuf); |
| 376 | } |
| 377 | |
| 378 | /* |
| 379 | * See if the incoming tuple will fit on the page. beginPlaceToPage will |
| 380 | * decide if the page needs to be split, and will compute the split |
| 381 | * contents if so. See comments for beginPlaceToPage and execPlaceToPage |
| 382 | * functions for more details of the API here. |
| 383 | */ |
| 384 | rc = btree->beginPlaceToPage(btree, stack->buffer, stack, |
| 385 | insertdata, updateblkno, |
| 386 | &ptp_workspace, |
| 387 | &newlpage, &newrpage); |
| 388 | |
| 389 | if (rc == GPTP_NO_WORK) |
| 390 | { |
| 391 | /* Nothing to do */ |
| 392 | result = true; |
| 393 | } |
| 394 | else if (rc == GPTP_INSERT) |
| 395 | { |
| 396 | /* It will fit, perform the insertion */ |
| 397 | START_CRIT_SECTION(); |
| 398 | |
| 399 | if (RelationNeedsWAL(btree->index) && !btree->isBuild) |
| 400 | { |
| 401 | XLogBeginInsert(); |
| 402 | XLogRegisterBuffer(0, stack->buffer, REGBUF_STANDARD); |
| 403 | if (BufferIsValid(childbuf)) |
| 404 | XLogRegisterBuffer(1, childbuf, REGBUF_STANDARD); |
| 405 | } |
| 406 | |
| 407 | /* Perform the page update, and register any extra WAL data */ |
| 408 | btree->execPlaceToPage(btree, stack->buffer, stack, |
| 409 | insertdata, updateblkno, ptp_workspace); |
| 410 | |
| 411 | MarkBufferDirty(stack->buffer); |
| 412 | |
| 413 | /* An insert to an internal page finishes the split of the child. */ |
| 414 | if (BufferIsValid(childbuf)) |
| 415 | { |
| 416 | GinPageGetOpaque(childpage)->flags &= ~GIN_INCOMPLETE_SPLIT; |
| 417 | MarkBufferDirty(childbuf); |
| 418 | } |
| 419 | |
| 420 | if (RelationNeedsWAL(btree->index) && !btree->isBuild) |
| 421 | { |
| 422 | XLogRecPtr recptr; |
| 423 | ginxlogInsert xlrec; |
| 424 | BlockIdData childblknos[2]; |
| 425 | |
| 426 | xlrec.flags = xlflags; |
| 427 | |
| 428 | XLogRegisterData((char *) &xlrec, sizeof(ginxlogInsert)); |
| 429 | |
| 430 | /* |
| 431 | * Log information about child if this was an insertion of a |
| 432 | * downlink. |
| 433 | */ |
| 434 | if (BufferIsValid(childbuf)) |
| 435 | { |
| 436 | BlockIdSet(&childblknos[0], BufferGetBlockNumber(childbuf)); |
| 437 | BlockIdSet(&childblknos[1], GinPageGetOpaque(childpage)->rightlink); |
| 438 | XLogRegisterData((char *) childblknos, |
| 439 | sizeof(BlockIdData) * 2); |
| 440 | } |
| 441 | |
| 442 | recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_INSERT); |
| 443 | PageSetLSN(page, recptr); |
| 444 | if (BufferIsValid(childbuf)) |
| 445 | PageSetLSN(childpage, recptr); |
| 446 | } |
| 447 | |
| 448 | END_CRIT_SECTION(); |
| 449 | |
| 450 | /* Insertion is complete. */ |
| 451 | result = true; |
| 452 | } |
| 453 | else if (rc == GPTP_SPLIT) |
| 454 | { |
| 455 | /* |
| 456 | * Didn't fit, need to split. The split has been computed in newlpage |
| 457 | * and newrpage, which are pointers to palloc'd pages, not associated |
| 458 | * with buffers. stack->buffer is not touched yet. |
| 459 | */ |
| 460 | Buffer rbuffer; |
| 461 | BlockNumber savedRightLink; |
| 462 | ginxlogSplit data; |
| 463 | Buffer lbuffer = InvalidBuffer; |
| 464 | Page newrootpg = NULL; |
| 465 | |
| 466 | /* Get a new index page to become the right page */ |
| 467 | rbuffer = GinNewBuffer(btree->index); |
| 468 | |
| 469 | /* During index build, count the new page */ |
| 470 | if (buildStats) |
| 471 | { |
| 472 | if (btree->isData) |
| 473 | buildStats->nDataPages++; |
| 474 | else |
| 475 | buildStats->nEntryPages++; |
| 476 | } |
| 477 | |
| 478 | savedRightLink = GinPageGetOpaque(page)->rightlink; |
| 479 | |
| 480 | /* Begin setting up WAL record */ |
| 481 | data.node = btree->index->rd_node; |
| 482 | data.flags = xlflags; |
| 483 | if (BufferIsValid(childbuf)) |
| 484 | { |
| 485 | data.leftChildBlkno = BufferGetBlockNumber(childbuf); |
| 486 | data.rightChildBlkno = GinPageGetOpaque(childpage)->rightlink; |
| 487 | } |
| 488 | else |
| 489 | data.leftChildBlkno = data.rightChildBlkno = InvalidBlockNumber; |
| 490 | |
| 491 | if (stack->parent == NULL) |
| 492 | { |
| 493 | /* |
| 494 | * splitting the root, so we need to allocate new left page and |
| 495 | * place pointers to left and right page on root page. |
| 496 | */ |
| 497 | lbuffer = GinNewBuffer(btree->index); |
| 498 | |
| 499 | /* During index build, count the new left page */ |
| 500 | if (buildStats) |
| 501 | { |
| 502 | if (btree->isData) |
| 503 | buildStats->nDataPages++; |
| 504 | else |
| 505 | buildStats->nEntryPages++; |
| 506 | } |
| 507 | |
| 508 | data.rrlink = InvalidBlockNumber; |
| 509 | data.flags |= GIN_SPLIT_ROOT; |
| 510 | |
| 511 | GinPageGetOpaque(newrpage)->rightlink = InvalidBlockNumber; |
| 512 | GinPageGetOpaque(newlpage)->rightlink = BufferGetBlockNumber(rbuffer); |
| 513 | |
| 514 | /* |
| 515 | * Construct a new root page containing downlinks to the new left |
| 516 | * and right pages. (Do this in a temporary copy rather than |
| 517 | * overwriting the original page directly, since we're not in the |
| 518 | * critical section yet.) |
| 519 | */ |
| 520 | newrootpg = PageGetTempPage(newrpage); |
| 521 | GinInitPage(newrootpg, GinPageGetOpaque(newlpage)->flags & ~(GIN_LEAF | GIN_COMPRESSED), BLCKSZ); |
| 522 | |
| 523 | btree->fillRoot(btree, newrootpg, |
| 524 | BufferGetBlockNumber(lbuffer), newlpage, |
| 525 | BufferGetBlockNumber(rbuffer), newrpage); |
| 526 | |
| 527 | if (GinPageIsLeaf(BufferGetPage(stack->buffer))) |
| 528 | { |
| 529 | |
| 530 | PredicateLockPageSplit(btree->index, |
| 531 | BufferGetBlockNumber(stack->buffer), |
| 532 | BufferGetBlockNumber(lbuffer)); |
| 533 | |
| 534 | PredicateLockPageSplit(btree->index, |
| 535 | BufferGetBlockNumber(stack->buffer), |
| 536 | BufferGetBlockNumber(rbuffer)); |
| 537 | } |
| 538 | |
| 539 | } |
| 540 | else |
| 541 | { |
| 542 | /* splitting a non-root page */ |
| 543 | data.rrlink = savedRightLink; |
| 544 | |
| 545 | GinPageGetOpaque(newrpage)->rightlink = savedRightLink; |
| 546 | GinPageGetOpaque(newlpage)->flags |= GIN_INCOMPLETE_SPLIT; |
| 547 | GinPageGetOpaque(newlpage)->rightlink = BufferGetBlockNumber(rbuffer); |
| 548 | |
| 549 | if (GinPageIsLeaf(BufferGetPage(stack->buffer))) |
| 550 | { |
| 551 | |
| 552 | PredicateLockPageSplit(btree->index, |
| 553 | BufferGetBlockNumber(stack->buffer), |
| 554 | BufferGetBlockNumber(rbuffer)); |
| 555 | } |
| 556 | } |
| 557 | |
| 558 | /* |
| 559 | * OK, we have the new contents of the left page in a temporary copy |
| 560 | * now (newlpage), and likewise for the new contents of the |
| 561 | * newly-allocated right block. The original page is still unchanged. |
| 562 | * |
| 563 | * If this is a root split, we also have a temporary page containing |
| 564 | * the new contents of the root. |
| 565 | */ |
| 566 | |
| 567 | START_CRIT_SECTION(); |
| 568 | |
| 569 | MarkBufferDirty(rbuffer); |
| 570 | MarkBufferDirty(stack->buffer); |
| 571 | |
| 572 | /* |
| 573 | * Restore the temporary copies over the real buffers. |
| 574 | */ |
| 575 | if (stack->parent == NULL) |
| 576 | { |
| 577 | /* Splitting the root, three pages to update */ |
| 578 | MarkBufferDirty(lbuffer); |
| 579 | memcpy(page, newrootpg, BLCKSZ); |
| 580 | memcpy(BufferGetPage(lbuffer), newlpage, BLCKSZ); |
| 581 | memcpy(BufferGetPage(rbuffer), newrpage, BLCKSZ); |
| 582 | } |
| 583 | else |
| 584 | { |
| 585 | /* Normal split, only two pages to update */ |
| 586 | memcpy(page, newlpage, BLCKSZ); |
| 587 | memcpy(BufferGetPage(rbuffer), newrpage, BLCKSZ); |
| 588 | } |
| 589 | |
| 590 | /* We also clear childbuf's INCOMPLETE_SPLIT flag, if passed */ |
| 591 | if (BufferIsValid(childbuf)) |
| 592 | { |
| 593 | GinPageGetOpaque(childpage)->flags &= ~GIN_INCOMPLETE_SPLIT; |
| 594 | MarkBufferDirty(childbuf); |
| 595 | } |
| 596 | |
| 597 | /* write WAL record */ |
| 598 | if (RelationNeedsWAL(btree->index) && !btree->isBuild) |
| 599 | { |
| 600 | XLogRecPtr recptr; |
| 601 | |
| 602 | XLogBeginInsert(); |
| 603 | |
| 604 | /* |
| 605 | * We just take full page images of all the split pages. Splits |
| 606 | * are uncommon enough that it's not worth complicating the code |
| 607 | * to be more efficient. |
| 608 | */ |
| 609 | if (stack->parent == NULL) |
| 610 | { |
| 611 | XLogRegisterBuffer(0, lbuffer, REGBUF_FORCE_IMAGE | REGBUF_STANDARD); |
| 612 | XLogRegisterBuffer(1, rbuffer, REGBUF_FORCE_IMAGE | REGBUF_STANDARD); |
| 613 | XLogRegisterBuffer(2, stack->buffer, REGBUF_FORCE_IMAGE | REGBUF_STANDARD); |
| 614 | } |
| 615 | else |
| 616 | { |
| 617 | XLogRegisterBuffer(0, stack->buffer, REGBUF_FORCE_IMAGE | REGBUF_STANDARD); |
| 618 | XLogRegisterBuffer(1, rbuffer, REGBUF_FORCE_IMAGE | REGBUF_STANDARD); |
| 619 | } |
| 620 | if (BufferIsValid(childbuf)) |
| 621 | XLogRegisterBuffer(3, childbuf, REGBUF_STANDARD); |
| 622 | |
| 623 | XLogRegisterData((char *) &data, sizeof(ginxlogSplit)); |
| 624 | |
| 625 | recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_SPLIT); |
| 626 | |
| 627 | PageSetLSN(page, recptr); |
| 628 | PageSetLSN(BufferGetPage(rbuffer), recptr); |
| 629 | if (stack->parent == NULL) |
| 630 | PageSetLSN(BufferGetPage(lbuffer), recptr); |
| 631 | if (BufferIsValid(childbuf)) |
| 632 | PageSetLSN(childpage, recptr); |
| 633 | } |
| 634 | END_CRIT_SECTION(); |
| 635 | |
| 636 | /* |
| 637 | * We can release the locks/pins on the new pages now, but keep |
| 638 | * stack->buffer locked. childbuf doesn't get unlocked either. |
| 639 | */ |
| 640 | UnlockReleaseBuffer(rbuffer); |
| 641 | if (stack->parent == NULL) |
| 642 | UnlockReleaseBuffer(lbuffer); |
| 643 | |
| 644 | /* |
| 645 | * If we split the root, we're done. Otherwise the split is not |
| 646 | * complete until the downlink for the new page has been inserted to |
| 647 | * the parent. |
| 648 | */ |
| 649 | result = (stack->parent == NULL); |
| 650 | } |
| 651 | else |
| 652 | { |
| 653 | elog(ERROR, "invalid return code from GIN placeToPage method: %d" , rc); |
| 654 | result = false; /* keep compiler quiet */ |
| 655 | } |
| 656 | |
| 657 | /* Clean up temp context */ |
| 658 | MemoryContextSwitchTo(oldCxt); |
| 659 | MemoryContextDelete(tmpCxt); |
| 660 | |
| 661 | return result; |
| 662 | } |
| 663 | |
| 664 | /* |
| 665 | * Finish a split by inserting the downlink for the new page to parent. |
| 666 | * |
| 667 | * On entry, stack->buffer is exclusively locked. |
| 668 | * |
| 669 | * If freestack is true, all the buffers are released and unlocked as we |
| 670 | * crawl up the tree, and 'stack' is freed. Otherwise stack->buffer is kept |
| 671 | * locked, and stack is unmodified, except for possibly moving right to find |
| 672 | * the correct parent of page. |
| 673 | */ |
| 674 | static void |
| 675 | ginFinishSplit(GinBtree btree, GinBtreeStack *stack, bool freestack, |
| 676 | GinStatsData *buildStats) |
| 677 | { |
| 678 | Page page; |
| 679 | bool done; |
| 680 | bool first = true; |
| 681 | |
| 682 | /* |
| 683 | * freestack == false when we encounter an incompletely split page during |
| 684 | * a scan, while freestack == true is used in the normal scenario that a |
| 685 | * split is finished right after the initial insert. |
| 686 | */ |
| 687 | if (!freestack) |
| 688 | elog(DEBUG1, "finishing incomplete split of block %u in gin index \"%s\"" , |
| 689 | stack->blkno, RelationGetRelationName(btree->index)); |
| 690 | |
| 691 | /* this loop crawls up the stack until the insertion is complete */ |
| 692 | do |
| 693 | { |
| 694 | GinBtreeStack *parent = stack->parent; |
| 695 | void *insertdata; |
| 696 | BlockNumber updateblkno; |
| 697 | |
| 698 | /* search parent to lock */ |
| 699 | LockBuffer(parent->buffer, GIN_EXCLUSIVE); |
| 700 | |
| 701 | /* |
| 702 | * If the parent page was incompletely split, finish that split first, |
| 703 | * then continue with the current one. |
| 704 | * |
| 705 | * Note: we have to finish *all* incomplete splits we encounter, even |
| 706 | * if we have to move right. Otherwise we might choose as the target a |
| 707 | * page that has no downlink in the parent, and splitting it further |
| 708 | * would fail. |
| 709 | */ |
| 710 | if (GinPageIsIncompleteSplit(BufferGetPage(parent->buffer))) |
| 711 | ginFinishSplit(btree, parent, false, buildStats); |
| 712 | |
| 713 | /* move right if it's needed */ |
| 714 | page = BufferGetPage(parent->buffer); |
| 715 | while ((parent->off = btree->findChildPtr(btree, page, stack->blkno, parent->off)) == InvalidOffsetNumber) |
| 716 | { |
| 717 | if (GinPageRightMost(page)) |
| 718 | { |
| 719 | /* |
| 720 | * rightmost page, but we don't find parent, we should use |
| 721 | * plain search... |
| 722 | */ |
| 723 | LockBuffer(parent->buffer, GIN_UNLOCK); |
| 724 | ginFindParents(btree, stack); |
| 725 | parent = stack->parent; |
| 726 | Assert(parent != NULL); |
| 727 | break; |
| 728 | } |
| 729 | |
| 730 | parent->buffer = ginStepRight(parent->buffer, btree->index, GIN_EXCLUSIVE); |
| 731 | parent->blkno = BufferGetBlockNumber(parent->buffer); |
| 732 | page = BufferGetPage(parent->buffer); |
| 733 | |
| 734 | if (GinPageIsIncompleteSplit(BufferGetPage(parent->buffer))) |
| 735 | ginFinishSplit(btree, parent, false, buildStats); |
| 736 | } |
| 737 | |
| 738 | /* insert the downlink */ |
| 739 | insertdata = btree->prepareDownlink(btree, stack->buffer); |
| 740 | updateblkno = GinPageGetOpaque(BufferGetPage(stack->buffer))->rightlink; |
| 741 | done = ginPlaceToPage(btree, parent, |
| 742 | insertdata, updateblkno, |
| 743 | stack->buffer, buildStats); |
| 744 | pfree(insertdata); |
| 745 | |
| 746 | /* |
| 747 | * If the caller requested to free the stack, unlock and release the |
| 748 | * child buffer now. Otherwise keep it pinned and locked, but if we |
| 749 | * have to recurse up the tree, we can unlock the upper pages, only |
| 750 | * keeping the page at the bottom of the stack locked. |
| 751 | */ |
| 752 | if (!first || freestack) |
| 753 | LockBuffer(stack->buffer, GIN_UNLOCK); |
| 754 | if (freestack) |
| 755 | { |
| 756 | ReleaseBuffer(stack->buffer); |
| 757 | pfree(stack); |
| 758 | } |
| 759 | stack = parent; |
| 760 | |
| 761 | first = false; |
| 762 | } while (!done); |
| 763 | |
| 764 | /* unlock the parent */ |
| 765 | LockBuffer(stack->buffer, GIN_UNLOCK); |
| 766 | |
| 767 | if (freestack) |
| 768 | freeGinBtreeStack(stack); |
| 769 | } |
| 770 | |
| 771 | /* |
| 772 | * Insert a value to tree described by stack. |
| 773 | * |
| 774 | * The value to be inserted is given in 'insertdata'. Its format depends |
| 775 | * on whether this is an entry or data tree, ginInsertValue just passes it |
| 776 | * through to the tree-specific callback function. |
| 777 | * |
| 778 | * During an index build, buildStats is non-null and the counters it contains |
| 779 | * are incremented as needed. |
| 780 | * |
| 781 | * NB: the passed-in stack is freed, as though by freeGinBtreeStack. |
| 782 | */ |
| 783 | void |
| 784 | ginInsertValue(GinBtree btree, GinBtreeStack *stack, void *insertdata, |
| 785 | GinStatsData *buildStats) |
| 786 | { |
| 787 | bool done; |
| 788 | |
| 789 | /* If the leaf page was incompletely split, finish the split first */ |
| 790 | if (GinPageIsIncompleteSplit(BufferGetPage(stack->buffer))) |
| 791 | ginFinishSplit(btree, stack, false, buildStats); |
| 792 | |
| 793 | done = ginPlaceToPage(btree, stack, |
| 794 | insertdata, InvalidBlockNumber, |
| 795 | InvalidBuffer, buildStats); |
| 796 | if (done) |
| 797 | { |
| 798 | LockBuffer(stack->buffer, GIN_UNLOCK); |
| 799 | freeGinBtreeStack(stack); |
| 800 | } |
| 801 | else |
| 802 | ginFinishSplit(btree, stack, true, buildStats); |
| 803 | } |
| 804 | |