| 1 | /*------------------------------------------------------------------------- |
| 2 | * |
| 3 | * gistbuildbuffers.c |
| 4 | * node buffer management functions for GiST buffering build algorithm. |
| 5 | * |
| 6 | * |
| 7 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
| 8 | * Portions Copyright (c) 1994, Regents of the University of California |
| 9 | * |
| 10 | * IDENTIFICATION |
| 11 | * src/backend/access/gist/gistbuildbuffers.c |
| 12 | * |
| 13 | *------------------------------------------------------------------------- |
| 14 | */ |
| 15 | #include "postgres.h" |
| 16 | |
| 17 | #include "access/genam.h" |
| 18 | #include "access/gist_private.h" |
| 19 | #include "catalog/index.h" |
| 20 | #include "miscadmin.h" |
| 21 | #include "storage/buffile.h" |
| 22 | #include "storage/bufmgr.h" |
| 23 | #include "utils/memutils.h" |
| 24 | #include "utils/rel.h" |
| 25 | |
| 26 | static GISTNodeBufferPage *gistAllocateNewPageBuffer(GISTBuildBuffers *gfbb); |
| 27 | static void gistAddLoadedBuffer(GISTBuildBuffers *gfbb, |
| 28 | GISTNodeBuffer *nodeBuffer); |
| 29 | static void gistLoadNodeBuffer(GISTBuildBuffers *gfbb, |
| 30 | GISTNodeBuffer *nodeBuffer); |
| 31 | static void gistUnloadNodeBuffer(GISTBuildBuffers *gfbb, |
| 32 | GISTNodeBuffer *nodeBuffer); |
| 33 | static void gistPlaceItupToPage(GISTNodeBufferPage *pageBuffer, |
| 34 | IndexTuple item); |
| 35 | static void gistGetItupFromPage(GISTNodeBufferPage *pageBuffer, |
| 36 | IndexTuple *item); |
| 37 | static long gistBuffersGetFreeBlock(GISTBuildBuffers *gfbb); |
| 38 | static void gistBuffersReleaseBlock(GISTBuildBuffers *gfbb, long blocknum); |
| 39 | |
| 40 | static void ReadTempFileBlock(BufFile *file, long blknum, void *ptr); |
| 41 | static void WriteTempFileBlock(BufFile *file, long blknum, void *ptr); |
| 42 | |
| 43 | |
| 44 | /* |
| 45 | * Initialize GiST build buffers. |
| 46 | */ |
| 47 | GISTBuildBuffers * |
| 48 | gistInitBuildBuffers(int pagesPerBuffer, int levelStep, int maxLevel) |
| 49 | { |
| 50 | GISTBuildBuffers *gfbb; |
| 51 | HASHCTL hashCtl; |
| 52 | |
| 53 | gfbb = palloc(sizeof(GISTBuildBuffers)); |
| 54 | gfbb->pagesPerBuffer = pagesPerBuffer; |
| 55 | gfbb->levelStep = levelStep; |
| 56 | |
| 57 | /* |
| 58 | * Create a temporary file to hold buffer pages that are swapped out of |
| 59 | * memory. |
| 60 | */ |
| 61 | gfbb->pfile = BufFileCreateTemp(false); |
| 62 | gfbb->nFileBlocks = 0; |
| 63 | |
| 64 | /* Initialize free page management. */ |
| 65 | gfbb->nFreeBlocks = 0; |
| 66 | gfbb->freeBlocksLen = 32; |
| 67 | gfbb->freeBlocks = (long *) palloc(gfbb->freeBlocksLen * sizeof(long)); |
| 68 | |
| 69 | /* |
| 70 | * Current memory context will be used for all in-memory data structures |
| 71 | * of buffers which are persistent during buffering build. |
| 72 | */ |
| 73 | gfbb->context = CurrentMemoryContext; |
| 74 | |
| 75 | /* |
| 76 | * nodeBuffersTab hash is association between index blocks and it's |
| 77 | * buffers. |
| 78 | */ |
| 79 | memset(&hashCtl, 0, sizeof(hashCtl)); |
| 80 | hashCtl.keysize = sizeof(BlockNumber); |
| 81 | hashCtl.entrysize = sizeof(GISTNodeBuffer); |
| 82 | hashCtl.hcxt = CurrentMemoryContext; |
| 83 | gfbb->nodeBuffersTab = hash_create("gistbuildbuffers" , |
| 84 | 1024, |
| 85 | &hashCtl, |
| 86 | HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); |
| 87 | |
| 88 | gfbb->bufferEmptyingQueue = NIL; |
| 89 | |
| 90 | /* |
| 91 | * Per-level node buffers lists for final buffers emptying process. Node |
| 92 | * buffers are inserted here when they are created. |
| 93 | */ |
| 94 | gfbb->buffersOnLevelsLen = 1; |
| 95 | gfbb->buffersOnLevels = (List **) palloc(sizeof(List *) * |
| 96 | gfbb->buffersOnLevelsLen); |
| 97 | gfbb->buffersOnLevels[0] = NIL; |
| 98 | |
| 99 | /* |
| 100 | * Block numbers of node buffers which last pages are currently loaded |
| 101 | * into main memory. |
| 102 | */ |
| 103 | gfbb->loadedBuffersLen = 32; |
| 104 | gfbb->loadedBuffers = (GISTNodeBuffer **) palloc(gfbb->loadedBuffersLen * |
| 105 | sizeof(GISTNodeBuffer *)); |
| 106 | gfbb->loadedBuffersCount = 0; |
| 107 | |
| 108 | gfbb->rootlevel = maxLevel; |
| 109 | |
| 110 | return gfbb; |
| 111 | } |
| 112 | |
| 113 | /* |
| 114 | * Returns a node buffer for given block. The buffer is created if it |
| 115 | * doesn't exist yet. |
| 116 | */ |
| 117 | GISTNodeBuffer * |
| 118 | gistGetNodeBuffer(GISTBuildBuffers *gfbb, GISTSTATE *giststate, |
| 119 | BlockNumber nodeBlocknum, int level) |
| 120 | { |
| 121 | GISTNodeBuffer *nodeBuffer; |
| 122 | bool found; |
| 123 | |
| 124 | /* Find node buffer in hash table */ |
| 125 | nodeBuffer = (GISTNodeBuffer *) hash_search(gfbb->nodeBuffersTab, |
| 126 | (const void *) &nodeBlocknum, |
| 127 | HASH_ENTER, |
| 128 | &found); |
| 129 | if (!found) |
| 130 | { |
| 131 | /* |
| 132 | * Node buffer wasn't found. Initialize the new buffer as empty. |
| 133 | */ |
| 134 | MemoryContext oldcxt = MemoryContextSwitchTo(gfbb->context); |
| 135 | |
| 136 | /* nodeBuffer->nodeBlocknum is the hash key and was filled in already */ |
| 137 | nodeBuffer->blocksCount = 0; |
| 138 | nodeBuffer->pageBlocknum = InvalidBlockNumber; |
| 139 | nodeBuffer->pageBuffer = NULL; |
| 140 | nodeBuffer->queuedForEmptying = false; |
| 141 | nodeBuffer->isTemp = false; |
| 142 | nodeBuffer->level = level; |
| 143 | |
| 144 | /* |
| 145 | * Add this buffer to the list of buffers on this level. Enlarge |
| 146 | * buffersOnLevels array if needed. |
| 147 | */ |
| 148 | if (level >= gfbb->buffersOnLevelsLen) |
| 149 | { |
| 150 | int i; |
| 151 | |
| 152 | gfbb->buffersOnLevels = |
| 153 | (List **) repalloc(gfbb->buffersOnLevels, |
| 154 | (level + 1) * sizeof(List *)); |
| 155 | |
| 156 | /* initialize the enlarged portion */ |
| 157 | for (i = gfbb->buffersOnLevelsLen; i <= level; i++) |
| 158 | gfbb->buffersOnLevels[i] = NIL; |
| 159 | gfbb->buffersOnLevelsLen = level + 1; |
| 160 | } |
| 161 | |
| 162 | /* |
| 163 | * Prepend the new buffer to the list of buffers on this level. It's |
| 164 | * not arbitrary that the new buffer is put to the beginning of the |
| 165 | * list: in the final emptying phase we loop through all buffers at |
| 166 | * each level, and flush them. If a page is split during the emptying, |
| 167 | * it's more efficient to flush the new splitted pages first, before |
| 168 | * moving on to pre-existing pages on the level. The buffers just |
| 169 | * created during the page split are likely still in cache, so |
| 170 | * flushing them immediately is more efficient than putting them to |
| 171 | * the end of the queue. |
| 172 | */ |
| 173 | gfbb->buffersOnLevels[level] = lcons(nodeBuffer, |
| 174 | gfbb->buffersOnLevels[level]); |
| 175 | |
| 176 | MemoryContextSwitchTo(oldcxt); |
| 177 | } |
| 178 | |
| 179 | return nodeBuffer; |
| 180 | } |
| 181 | |
| 182 | /* |
| 183 | * Allocate memory for a buffer page. |
| 184 | */ |
| 185 | static GISTNodeBufferPage * |
| 186 | gistAllocateNewPageBuffer(GISTBuildBuffers *gfbb) |
| 187 | { |
| 188 | GISTNodeBufferPage *pageBuffer; |
| 189 | |
| 190 | pageBuffer = (GISTNodeBufferPage *) MemoryContextAllocZero(gfbb->context, |
| 191 | BLCKSZ); |
| 192 | pageBuffer->prev = InvalidBlockNumber; |
| 193 | |
| 194 | /* Set page free space */ |
| 195 | PAGE_FREE_SPACE(pageBuffer) = BLCKSZ - BUFFER_PAGE_DATA_OFFSET; |
| 196 | return pageBuffer; |
| 197 | } |
| 198 | |
| 199 | /* |
| 200 | * Add specified buffer into loadedBuffers array. |
| 201 | */ |
| 202 | static void |
| 203 | gistAddLoadedBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer) |
| 204 | { |
| 205 | /* Never add a temporary buffer to the array */ |
| 206 | if (nodeBuffer->isTemp) |
| 207 | return; |
| 208 | |
| 209 | /* Enlarge the array if needed */ |
| 210 | if (gfbb->loadedBuffersCount >= gfbb->loadedBuffersLen) |
| 211 | { |
| 212 | gfbb->loadedBuffersLen *= 2; |
| 213 | gfbb->loadedBuffers = (GISTNodeBuffer **) |
| 214 | repalloc(gfbb->loadedBuffers, |
| 215 | gfbb->loadedBuffersLen * sizeof(GISTNodeBuffer *)); |
| 216 | } |
| 217 | |
| 218 | gfbb->loadedBuffers[gfbb->loadedBuffersCount] = nodeBuffer; |
| 219 | gfbb->loadedBuffersCount++; |
| 220 | } |
| 221 | |
| 222 | /* |
| 223 | * Load last page of node buffer into main memory. |
| 224 | */ |
| 225 | static void |
| 226 | gistLoadNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer) |
| 227 | { |
| 228 | /* Check if we really should load something */ |
| 229 | if (!nodeBuffer->pageBuffer && nodeBuffer->blocksCount > 0) |
| 230 | { |
| 231 | /* Allocate memory for page */ |
| 232 | nodeBuffer->pageBuffer = gistAllocateNewPageBuffer(gfbb); |
| 233 | |
| 234 | /* Read block from temporary file */ |
| 235 | ReadTempFileBlock(gfbb->pfile, nodeBuffer->pageBlocknum, |
| 236 | nodeBuffer->pageBuffer); |
| 237 | |
| 238 | /* Mark file block as free */ |
| 239 | gistBuffersReleaseBlock(gfbb, nodeBuffer->pageBlocknum); |
| 240 | |
| 241 | /* Mark node buffer as loaded */ |
| 242 | gistAddLoadedBuffer(gfbb, nodeBuffer); |
| 243 | nodeBuffer->pageBlocknum = InvalidBlockNumber; |
| 244 | } |
| 245 | } |
| 246 | |
| 247 | /* |
| 248 | * Write last page of node buffer to the disk. |
| 249 | */ |
| 250 | static void |
| 251 | gistUnloadNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer) |
| 252 | { |
| 253 | /* Check if we have something to write */ |
| 254 | if (nodeBuffer->pageBuffer) |
| 255 | { |
| 256 | BlockNumber blkno; |
| 257 | |
| 258 | /* Get free file block */ |
| 259 | blkno = gistBuffersGetFreeBlock(gfbb); |
| 260 | |
| 261 | /* Write block to the temporary file */ |
| 262 | WriteTempFileBlock(gfbb->pfile, blkno, nodeBuffer->pageBuffer); |
| 263 | |
| 264 | /* Free memory of that page */ |
| 265 | pfree(nodeBuffer->pageBuffer); |
| 266 | nodeBuffer->pageBuffer = NULL; |
| 267 | |
| 268 | /* Save block number */ |
| 269 | nodeBuffer->pageBlocknum = blkno; |
| 270 | } |
| 271 | } |
| 272 | |
| 273 | /* |
| 274 | * Write last pages of all node buffers to the disk. |
| 275 | */ |
| 276 | void |
| 277 | gistUnloadNodeBuffers(GISTBuildBuffers *gfbb) |
| 278 | { |
| 279 | int i; |
| 280 | |
| 281 | /* Unload all the buffers that have a page loaded in memory. */ |
| 282 | for (i = 0; i < gfbb->loadedBuffersCount; i++) |
| 283 | gistUnloadNodeBuffer(gfbb, gfbb->loadedBuffers[i]); |
| 284 | |
| 285 | /* Now there are no node buffers with loaded last page */ |
| 286 | gfbb->loadedBuffersCount = 0; |
| 287 | } |
| 288 | |
| 289 | /* |
| 290 | * Add index tuple to buffer page. |
| 291 | */ |
| 292 | static void |
| 293 | gistPlaceItupToPage(GISTNodeBufferPage *pageBuffer, IndexTuple itup) |
| 294 | { |
| 295 | Size itupsz = IndexTupleSize(itup); |
| 296 | char *ptr; |
| 297 | |
| 298 | /* There should be enough of space. */ |
| 299 | Assert(PAGE_FREE_SPACE(pageBuffer) >= MAXALIGN(itupsz)); |
| 300 | |
| 301 | /* Reduce free space value of page to reserve a spot for the tuple. */ |
| 302 | PAGE_FREE_SPACE(pageBuffer) -= MAXALIGN(itupsz); |
| 303 | |
| 304 | /* Get pointer to the spot we reserved (ie. end of free space). */ |
| 305 | ptr = (char *) pageBuffer + BUFFER_PAGE_DATA_OFFSET |
| 306 | + PAGE_FREE_SPACE(pageBuffer); |
| 307 | |
| 308 | /* Copy the index tuple there. */ |
| 309 | memcpy(ptr, itup, itupsz); |
| 310 | } |
| 311 | |
| 312 | /* |
| 313 | * Get last item from buffer page and remove it from page. |
| 314 | */ |
| 315 | static void |
| 316 | gistGetItupFromPage(GISTNodeBufferPage *pageBuffer, IndexTuple *itup) |
| 317 | { |
| 318 | IndexTuple ptr; |
| 319 | Size itupsz; |
| 320 | |
| 321 | Assert(!PAGE_IS_EMPTY(pageBuffer)); /* Page shouldn't be empty */ |
| 322 | |
| 323 | /* Get pointer to last index tuple */ |
| 324 | ptr = (IndexTuple) ((char *) pageBuffer |
| 325 | + BUFFER_PAGE_DATA_OFFSET |
| 326 | + PAGE_FREE_SPACE(pageBuffer)); |
| 327 | itupsz = IndexTupleSize(ptr); |
| 328 | |
| 329 | /* Make a copy of the tuple */ |
| 330 | *itup = (IndexTuple) palloc(itupsz); |
| 331 | memcpy(*itup, ptr, itupsz); |
| 332 | |
| 333 | /* Mark the space used by the tuple as free */ |
| 334 | PAGE_FREE_SPACE(pageBuffer) += MAXALIGN(itupsz); |
| 335 | } |
| 336 | |
| 337 | /* |
| 338 | * Push an index tuple to node buffer. |
| 339 | */ |
| 340 | void |
| 341 | gistPushItupToNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer, |
| 342 | IndexTuple itup) |
| 343 | { |
| 344 | /* |
| 345 | * Most part of memory operations will be in buffering build persistent |
| 346 | * context. So, let's switch to it. |
| 347 | */ |
| 348 | MemoryContext oldcxt = MemoryContextSwitchTo(gfbb->context); |
| 349 | |
| 350 | /* |
| 351 | * If the buffer is currently empty, create the first page. |
| 352 | */ |
| 353 | if (nodeBuffer->blocksCount == 0) |
| 354 | { |
| 355 | nodeBuffer->pageBuffer = gistAllocateNewPageBuffer(gfbb); |
| 356 | nodeBuffer->blocksCount = 1; |
| 357 | gistAddLoadedBuffer(gfbb, nodeBuffer); |
| 358 | } |
| 359 | |
| 360 | /* Load last page of node buffer if it wasn't in memory already */ |
| 361 | if (!nodeBuffer->pageBuffer) |
| 362 | gistLoadNodeBuffer(gfbb, nodeBuffer); |
| 363 | |
| 364 | /* |
| 365 | * Check if there is enough space on the last page for the tuple. |
| 366 | */ |
| 367 | if (PAGE_NO_SPACE(nodeBuffer->pageBuffer, itup)) |
| 368 | { |
| 369 | /* |
| 370 | * Nope. Swap previous block to disk and allocate a new one. |
| 371 | */ |
| 372 | BlockNumber blkno; |
| 373 | |
| 374 | /* Write filled page to the disk */ |
| 375 | blkno = gistBuffersGetFreeBlock(gfbb); |
| 376 | WriteTempFileBlock(gfbb->pfile, blkno, nodeBuffer->pageBuffer); |
| 377 | |
| 378 | /* |
| 379 | * Reset the in-memory page as empty, and link the previous block to |
| 380 | * the new page by storing its block number in the prev-link. |
| 381 | */ |
| 382 | PAGE_FREE_SPACE(nodeBuffer->pageBuffer) = |
| 383 | BLCKSZ - MAXALIGN(offsetof(GISTNodeBufferPage, tupledata)); |
| 384 | nodeBuffer->pageBuffer->prev = blkno; |
| 385 | |
| 386 | /* We've just added one more page */ |
| 387 | nodeBuffer->blocksCount++; |
| 388 | } |
| 389 | |
| 390 | gistPlaceItupToPage(nodeBuffer->pageBuffer, itup); |
| 391 | |
| 392 | /* |
| 393 | * If the buffer just overflowed, add it to the emptying queue. |
| 394 | */ |
| 395 | if (BUFFER_HALF_FILLED(nodeBuffer, gfbb) && !nodeBuffer->queuedForEmptying) |
| 396 | { |
| 397 | gfbb->bufferEmptyingQueue = lcons(nodeBuffer, |
| 398 | gfbb->bufferEmptyingQueue); |
| 399 | nodeBuffer->queuedForEmptying = true; |
| 400 | } |
| 401 | |
| 402 | /* Restore memory context */ |
| 403 | MemoryContextSwitchTo(oldcxt); |
| 404 | } |
| 405 | |
| 406 | /* |
| 407 | * Removes one index tuple from node buffer. Returns true if success and false |
| 408 | * if node buffer is empty. |
| 409 | */ |
| 410 | bool |
| 411 | gistPopItupFromNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer, |
| 412 | IndexTuple *itup) |
| 413 | { |
| 414 | /* |
| 415 | * If node buffer is empty then return false. |
| 416 | */ |
| 417 | if (nodeBuffer->blocksCount <= 0) |
| 418 | return false; |
| 419 | |
| 420 | /* Load last page of node buffer if needed */ |
| 421 | if (!nodeBuffer->pageBuffer) |
| 422 | gistLoadNodeBuffer(gfbb, nodeBuffer); |
| 423 | |
| 424 | /* |
| 425 | * Get index tuple from last non-empty page. |
| 426 | */ |
| 427 | gistGetItupFromPage(nodeBuffer->pageBuffer, itup); |
| 428 | |
| 429 | /* |
| 430 | * If we just removed the last tuple from the page, fetch previous page on |
| 431 | * this node buffer (if any). |
| 432 | */ |
| 433 | if (PAGE_IS_EMPTY(nodeBuffer->pageBuffer)) |
| 434 | { |
| 435 | BlockNumber prevblkno; |
| 436 | |
| 437 | /* |
| 438 | * blocksCount includes the page in pageBuffer, so decrease it now. |
| 439 | */ |
| 440 | nodeBuffer->blocksCount--; |
| 441 | |
| 442 | /* |
| 443 | * If there's more pages, fetch previous one. |
| 444 | */ |
| 445 | prevblkno = nodeBuffer->pageBuffer->prev; |
| 446 | if (prevblkno != InvalidBlockNumber) |
| 447 | { |
| 448 | /* There is a previous page. Fetch it. */ |
| 449 | Assert(nodeBuffer->blocksCount > 0); |
| 450 | ReadTempFileBlock(gfbb->pfile, prevblkno, nodeBuffer->pageBuffer); |
| 451 | |
| 452 | /* |
| 453 | * Now that we've read the block in memory, we can release its |
| 454 | * on-disk block for reuse. |
| 455 | */ |
| 456 | gistBuffersReleaseBlock(gfbb, prevblkno); |
| 457 | } |
| 458 | else |
| 459 | { |
| 460 | /* No more pages. Free memory. */ |
| 461 | Assert(nodeBuffer->blocksCount == 0); |
| 462 | pfree(nodeBuffer->pageBuffer); |
| 463 | nodeBuffer->pageBuffer = NULL; |
| 464 | } |
| 465 | } |
| 466 | return true; |
| 467 | } |
| 468 | |
| 469 | /* |
| 470 | * Select a currently unused block for writing to. |
| 471 | */ |
| 472 | static long |
| 473 | gistBuffersGetFreeBlock(GISTBuildBuffers *gfbb) |
| 474 | { |
| 475 | /* |
| 476 | * If there are multiple free blocks, we select the one appearing last in |
| 477 | * freeBlocks[]. If there are none, assign the next block at the end of |
| 478 | * the file (causing the file to be extended). |
| 479 | */ |
| 480 | if (gfbb->nFreeBlocks > 0) |
| 481 | return gfbb->freeBlocks[--gfbb->nFreeBlocks]; |
| 482 | else |
| 483 | return gfbb->nFileBlocks++; |
| 484 | } |
| 485 | |
| 486 | /* |
| 487 | * Return a block# to the freelist. |
| 488 | */ |
| 489 | static void |
| 490 | gistBuffersReleaseBlock(GISTBuildBuffers *gfbb, long blocknum) |
| 491 | { |
| 492 | int ndx; |
| 493 | |
| 494 | /* Enlarge freeBlocks array if full. */ |
| 495 | if (gfbb->nFreeBlocks >= gfbb->freeBlocksLen) |
| 496 | { |
| 497 | gfbb->freeBlocksLen *= 2; |
| 498 | gfbb->freeBlocks = (long *) repalloc(gfbb->freeBlocks, |
| 499 | gfbb->freeBlocksLen * |
| 500 | sizeof(long)); |
| 501 | } |
| 502 | |
| 503 | /* Add blocknum to array */ |
| 504 | ndx = gfbb->nFreeBlocks++; |
| 505 | gfbb->freeBlocks[ndx] = blocknum; |
| 506 | } |
| 507 | |
| 508 | /* |
| 509 | * Free buffering build data structure. |
| 510 | */ |
| 511 | void |
| 512 | gistFreeBuildBuffers(GISTBuildBuffers *gfbb) |
| 513 | { |
| 514 | /* Close buffers file. */ |
| 515 | BufFileClose(gfbb->pfile); |
| 516 | |
| 517 | /* All other things will be freed on memory context release */ |
| 518 | } |
| 519 | |
| 520 | /* |
| 521 | * Data structure representing information about node buffer for index tuples |
| 522 | * relocation from splitted node buffer. |
| 523 | */ |
| 524 | typedef struct |
| 525 | { |
| 526 | GISTENTRY entry[INDEX_MAX_KEYS]; |
| 527 | bool isnull[INDEX_MAX_KEYS]; |
| 528 | GISTPageSplitInfo *splitinfo; |
| 529 | GISTNodeBuffer *nodeBuffer; |
| 530 | } RelocationBufferInfo; |
| 531 | |
| 532 | /* |
| 533 | * At page split, distribute tuples from the buffer of the split page to |
| 534 | * new buffers for the created page halves. This also adjusts the downlinks |
| 535 | * in 'splitinfo' to include the tuples in the buffers. |
| 536 | */ |
| 537 | void |
| 538 | gistRelocateBuildBuffersOnSplit(GISTBuildBuffers *gfbb, GISTSTATE *giststate, |
| 539 | Relation r, int level, |
| 540 | Buffer buffer, List *splitinfo) |
| 541 | { |
| 542 | RelocationBufferInfo *relocationBuffersInfos; |
| 543 | bool found; |
| 544 | GISTNodeBuffer *nodeBuffer; |
| 545 | BlockNumber blocknum; |
| 546 | IndexTuple itup; |
| 547 | int splitPagesCount = 0, |
| 548 | i; |
| 549 | GISTENTRY entry[INDEX_MAX_KEYS]; |
| 550 | bool isnull[INDEX_MAX_KEYS]; |
| 551 | GISTNodeBuffer oldBuf; |
| 552 | ListCell *lc; |
| 553 | |
| 554 | /* If the splitted page doesn't have buffers, we have nothing to do. */ |
| 555 | if (!LEVEL_HAS_BUFFERS(level, gfbb)) |
| 556 | return; |
| 557 | |
| 558 | /* |
| 559 | * Get the node buffer of the splitted page. |
| 560 | */ |
| 561 | blocknum = BufferGetBlockNumber(buffer); |
| 562 | nodeBuffer = hash_search(gfbb->nodeBuffersTab, &blocknum, |
| 563 | HASH_FIND, &found); |
| 564 | if (!found) |
| 565 | { |
| 566 | /* The page has no buffer, so we have nothing to do. */ |
| 567 | return; |
| 568 | } |
| 569 | |
| 570 | /* |
| 571 | * Make a copy of the old buffer, as we're going reuse it as the buffer |
| 572 | * for the new left page, which is on the same block as the old page. |
| 573 | * That's not true for the root page, but that's fine because we never |
| 574 | * have a buffer on the root page anyway. The original algorithm as |
| 575 | * described by Arge et al did, but it's of no use, as you might as well |
| 576 | * read the tuples straight from the heap instead of the root buffer. |
| 577 | */ |
| 578 | Assert(blocknum != GIST_ROOT_BLKNO); |
| 579 | memcpy(&oldBuf, nodeBuffer, sizeof(GISTNodeBuffer)); |
| 580 | oldBuf.isTemp = true; |
| 581 | |
| 582 | /* Reset the old buffer, used for the new left page from now on */ |
| 583 | nodeBuffer->blocksCount = 0; |
| 584 | nodeBuffer->pageBuffer = NULL; |
| 585 | nodeBuffer->pageBlocknum = InvalidBlockNumber; |
| 586 | |
| 587 | /* |
| 588 | * Allocate memory for information about relocation buffers. |
| 589 | */ |
| 590 | splitPagesCount = list_length(splitinfo); |
| 591 | relocationBuffersInfos = |
| 592 | (RelocationBufferInfo *) palloc(sizeof(RelocationBufferInfo) * |
| 593 | splitPagesCount); |
| 594 | |
| 595 | /* |
| 596 | * Fill relocation buffers information for node buffers of pages produced |
| 597 | * by split. |
| 598 | */ |
| 599 | i = 0; |
| 600 | foreach(lc, splitinfo) |
| 601 | { |
| 602 | GISTPageSplitInfo *si = (GISTPageSplitInfo *) lfirst(lc); |
| 603 | GISTNodeBuffer *newNodeBuffer; |
| 604 | |
| 605 | /* Decompress parent index tuple of node buffer page. */ |
| 606 | gistDeCompressAtt(giststate, r, |
| 607 | si->downlink, NULL, (OffsetNumber) 0, |
| 608 | relocationBuffersInfos[i].entry, |
| 609 | relocationBuffersInfos[i].isnull); |
| 610 | |
| 611 | /* |
| 612 | * Create a node buffer for the page. The leftmost half is on the same |
| 613 | * block as the old page before split, so for the leftmost half this |
| 614 | * will return the original buffer. The tuples on the original buffer |
| 615 | * were relinked to the temporary buffer, so the original one is now |
| 616 | * empty. |
| 617 | */ |
| 618 | newNodeBuffer = gistGetNodeBuffer(gfbb, giststate, BufferGetBlockNumber(si->buf), level); |
| 619 | |
| 620 | relocationBuffersInfos[i].nodeBuffer = newNodeBuffer; |
| 621 | relocationBuffersInfos[i].splitinfo = si; |
| 622 | |
| 623 | i++; |
| 624 | } |
| 625 | |
| 626 | /* |
| 627 | * Loop through all index tuples in the buffer of the page being split, |
| 628 | * moving them to buffers for the new pages. We try to move each tuple to |
| 629 | * the page that will result in the lowest penalty for the leading column |
| 630 | * or, in the case of a tie, the lowest penalty for the earliest column |
| 631 | * that is not tied. |
| 632 | * |
| 633 | * The page searching logic is very similar to gistchoose(). |
| 634 | */ |
| 635 | while (gistPopItupFromNodeBuffer(gfbb, &oldBuf, &itup)) |
| 636 | { |
| 637 | float best_penalty[INDEX_MAX_KEYS]; |
| 638 | int i, |
| 639 | which; |
| 640 | IndexTuple newtup; |
| 641 | RelocationBufferInfo *targetBufferInfo; |
| 642 | |
| 643 | gistDeCompressAtt(giststate, r, |
| 644 | itup, NULL, (OffsetNumber) 0, entry, isnull); |
| 645 | |
| 646 | /* default to using first page (shouldn't matter) */ |
| 647 | which = 0; |
| 648 | |
| 649 | /* |
| 650 | * best_penalty[j] is the best penalty we have seen so far for column |
| 651 | * j, or -1 when we haven't yet examined column j. Array entries to |
| 652 | * the right of the first -1 are undefined. |
| 653 | */ |
| 654 | best_penalty[0] = -1; |
| 655 | |
| 656 | /* |
| 657 | * Loop over possible target pages, looking for one to move this tuple |
| 658 | * to. |
| 659 | */ |
| 660 | for (i = 0; i < splitPagesCount; i++) |
| 661 | { |
| 662 | RelocationBufferInfo *splitPageInfo = &relocationBuffersInfos[i]; |
| 663 | bool zero_penalty; |
| 664 | int j; |
| 665 | |
| 666 | zero_penalty = true; |
| 667 | |
| 668 | /* Loop over index attributes. */ |
| 669 | for (j = 0; j < r->rd_att->natts; j++) |
| 670 | { |
| 671 | float usize; |
| 672 | |
| 673 | /* Compute penalty for this column. */ |
| 674 | usize = gistpenalty(giststate, j, |
| 675 | &splitPageInfo->entry[j], |
| 676 | splitPageInfo->isnull[j], |
| 677 | &entry[j], isnull[j]); |
| 678 | if (usize > 0) |
| 679 | zero_penalty = false; |
| 680 | |
| 681 | if (best_penalty[j] < 0 || usize < best_penalty[j]) |
| 682 | { |
| 683 | /* |
| 684 | * New best penalty for column. Tentatively select this |
| 685 | * page as the target, and record the best penalty. Then |
| 686 | * reset the next column's penalty to "unknown" (and |
| 687 | * indirectly, the same for all the ones to its right). |
| 688 | * This will force us to adopt this page's penalty values |
| 689 | * as the best for all the remaining columns during |
| 690 | * subsequent loop iterations. |
| 691 | */ |
| 692 | which = i; |
| 693 | best_penalty[j] = usize; |
| 694 | |
| 695 | if (j < r->rd_att->natts - 1) |
| 696 | best_penalty[j + 1] = -1; |
| 697 | } |
| 698 | else if (best_penalty[j] == usize) |
| 699 | { |
| 700 | /* |
| 701 | * The current page is exactly as good for this column as |
| 702 | * the best page seen so far. The next iteration of this |
| 703 | * loop will compare the next column. |
| 704 | */ |
| 705 | } |
| 706 | else |
| 707 | { |
| 708 | /* |
| 709 | * The current page is worse for this column than the best |
| 710 | * page seen so far. Skip the remaining columns and move |
| 711 | * on to the next page, if any. |
| 712 | */ |
| 713 | zero_penalty = false; /* so outer loop won't exit */ |
| 714 | break; |
| 715 | } |
| 716 | } |
| 717 | |
| 718 | /* |
| 719 | * If we find a page with zero penalty for all columns, there's no |
| 720 | * need to examine remaining pages; just break out of the loop and |
| 721 | * return it. |
| 722 | */ |
| 723 | if (zero_penalty) |
| 724 | break; |
| 725 | } |
| 726 | |
| 727 | /* OK, "which" is the page index to push the tuple to */ |
| 728 | targetBufferInfo = &relocationBuffersInfos[which]; |
| 729 | |
| 730 | /* Push item to selected node buffer */ |
| 731 | gistPushItupToNodeBuffer(gfbb, targetBufferInfo->nodeBuffer, itup); |
| 732 | |
| 733 | /* Adjust the downlink for this page, if needed. */ |
| 734 | newtup = gistgetadjusted(r, targetBufferInfo->splitinfo->downlink, |
| 735 | itup, giststate); |
| 736 | if (newtup) |
| 737 | { |
| 738 | gistDeCompressAtt(giststate, r, |
| 739 | newtup, NULL, (OffsetNumber) 0, |
| 740 | targetBufferInfo->entry, |
| 741 | targetBufferInfo->isnull); |
| 742 | |
| 743 | targetBufferInfo->splitinfo->downlink = newtup; |
| 744 | } |
| 745 | } |
| 746 | |
| 747 | pfree(relocationBuffersInfos); |
| 748 | } |
| 749 | |
| 750 | |
| 751 | /* |
| 752 | * Wrappers around BufFile operations. The main difference is that these |
| 753 | * wrappers report errors with ereport(), so that the callers don't need |
| 754 | * to check the return code. |
| 755 | */ |
| 756 | |
| 757 | static void |
| 758 | ReadTempFileBlock(BufFile *file, long blknum, void *ptr) |
| 759 | { |
| 760 | if (BufFileSeekBlock(file, blknum) != 0) |
| 761 | elog(ERROR, "could not seek temporary file: %m" ); |
| 762 | if (BufFileRead(file, ptr, BLCKSZ) != BLCKSZ) |
| 763 | elog(ERROR, "could not read temporary file: %m" ); |
| 764 | } |
| 765 | |
| 766 | static void |
| 767 | WriteTempFileBlock(BufFile *file, long blknum, void *ptr) |
| 768 | { |
| 769 | if (BufFileSeekBlock(file, blknum) != 0) |
| 770 | elog(ERROR, "could not seek temporary file: %m" ); |
| 771 | if (BufFileWrite(file, ptr, BLCKSZ) != BLCKSZ) |
| 772 | { |
| 773 | /* |
| 774 | * the other errors in Read/WriteTempFileBlock shouldn't happen, but |
| 775 | * an error at write can easily happen if you run out of disk space. |
| 776 | */ |
| 777 | ereport(ERROR, |
| 778 | (errcode_for_file_access(), |
| 779 | errmsg("could not write block %ld of temporary file: %m" , |
| 780 | blknum))); |
| 781 | } |
| 782 | } |
| 783 | |