1 | /*------------------------------------------------------------------------- |
2 | * |
3 | * gistbuildbuffers.c |
4 | * node buffer management functions for GiST buffering build algorithm. |
5 | * |
6 | * |
7 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
8 | * Portions Copyright (c) 1994, Regents of the University of California |
9 | * |
10 | * IDENTIFICATION |
11 | * src/backend/access/gist/gistbuildbuffers.c |
12 | * |
13 | *------------------------------------------------------------------------- |
14 | */ |
15 | #include "postgres.h" |
16 | |
17 | #include "access/genam.h" |
18 | #include "access/gist_private.h" |
19 | #include "catalog/index.h" |
20 | #include "miscadmin.h" |
21 | #include "storage/buffile.h" |
22 | #include "storage/bufmgr.h" |
23 | #include "utils/memutils.h" |
24 | #include "utils/rel.h" |
25 | |
26 | static GISTNodeBufferPage *gistAllocateNewPageBuffer(GISTBuildBuffers *gfbb); |
27 | static void gistAddLoadedBuffer(GISTBuildBuffers *gfbb, |
28 | GISTNodeBuffer *nodeBuffer); |
29 | static void gistLoadNodeBuffer(GISTBuildBuffers *gfbb, |
30 | GISTNodeBuffer *nodeBuffer); |
31 | static void gistUnloadNodeBuffer(GISTBuildBuffers *gfbb, |
32 | GISTNodeBuffer *nodeBuffer); |
33 | static void gistPlaceItupToPage(GISTNodeBufferPage *pageBuffer, |
34 | IndexTuple item); |
35 | static void gistGetItupFromPage(GISTNodeBufferPage *pageBuffer, |
36 | IndexTuple *item); |
37 | static long gistBuffersGetFreeBlock(GISTBuildBuffers *gfbb); |
38 | static void gistBuffersReleaseBlock(GISTBuildBuffers *gfbb, long blocknum); |
39 | |
40 | static void ReadTempFileBlock(BufFile *file, long blknum, void *ptr); |
41 | static void WriteTempFileBlock(BufFile *file, long blknum, void *ptr); |
42 | |
43 | |
44 | /* |
45 | * Initialize GiST build buffers. |
46 | */ |
47 | GISTBuildBuffers * |
48 | gistInitBuildBuffers(int pagesPerBuffer, int levelStep, int maxLevel) |
49 | { |
50 | GISTBuildBuffers *gfbb; |
51 | HASHCTL hashCtl; |
52 | |
53 | gfbb = palloc(sizeof(GISTBuildBuffers)); |
54 | gfbb->pagesPerBuffer = pagesPerBuffer; |
55 | gfbb->levelStep = levelStep; |
56 | |
57 | /* |
58 | * Create a temporary file to hold buffer pages that are swapped out of |
59 | * memory. |
60 | */ |
61 | gfbb->pfile = BufFileCreateTemp(false); |
62 | gfbb->nFileBlocks = 0; |
63 | |
64 | /* Initialize free page management. */ |
65 | gfbb->nFreeBlocks = 0; |
66 | gfbb->freeBlocksLen = 32; |
67 | gfbb->freeBlocks = (long *) palloc(gfbb->freeBlocksLen * sizeof(long)); |
68 | |
69 | /* |
70 | * Current memory context will be used for all in-memory data structures |
71 | * of buffers which are persistent during buffering build. |
72 | */ |
73 | gfbb->context = CurrentMemoryContext; |
74 | |
75 | /* |
76 | * nodeBuffersTab hash is association between index blocks and it's |
77 | * buffers. |
78 | */ |
79 | memset(&hashCtl, 0, sizeof(hashCtl)); |
80 | hashCtl.keysize = sizeof(BlockNumber); |
81 | hashCtl.entrysize = sizeof(GISTNodeBuffer); |
82 | hashCtl.hcxt = CurrentMemoryContext; |
83 | gfbb->nodeBuffersTab = hash_create("gistbuildbuffers" , |
84 | 1024, |
85 | &hashCtl, |
86 | HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); |
87 | |
88 | gfbb->bufferEmptyingQueue = NIL; |
89 | |
90 | /* |
91 | * Per-level node buffers lists for final buffers emptying process. Node |
92 | * buffers are inserted here when they are created. |
93 | */ |
94 | gfbb->buffersOnLevelsLen = 1; |
95 | gfbb->buffersOnLevels = (List **) palloc(sizeof(List *) * |
96 | gfbb->buffersOnLevelsLen); |
97 | gfbb->buffersOnLevels[0] = NIL; |
98 | |
99 | /* |
100 | * Block numbers of node buffers which last pages are currently loaded |
101 | * into main memory. |
102 | */ |
103 | gfbb->loadedBuffersLen = 32; |
104 | gfbb->loadedBuffers = (GISTNodeBuffer **) palloc(gfbb->loadedBuffersLen * |
105 | sizeof(GISTNodeBuffer *)); |
106 | gfbb->loadedBuffersCount = 0; |
107 | |
108 | gfbb->rootlevel = maxLevel; |
109 | |
110 | return gfbb; |
111 | } |
112 | |
113 | /* |
114 | * Returns a node buffer for given block. The buffer is created if it |
115 | * doesn't exist yet. |
116 | */ |
117 | GISTNodeBuffer * |
118 | gistGetNodeBuffer(GISTBuildBuffers *gfbb, GISTSTATE *giststate, |
119 | BlockNumber nodeBlocknum, int level) |
120 | { |
121 | GISTNodeBuffer *nodeBuffer; |
122 | bool found; |
123 | |
124 | /* Find node buffer in hash table */ |
125 | nodeBuffer = (GISTNodeBuffer *) hash_search(gfbb->nodeBuffersTab, |
126 | (const void *) &nodeBlocknum, |
127 | HASH_ENTER, |
128 | &found); |
129 | if (!found) |
130 | { |
131 | /* |
132 | * Node buffer wasn't found. Initialize the new buffer as empty. |
133 | */ |
134 | MemoryContext oldcxt = MemoryContextSwitchTo(gfbb->context); |
135 | |
136 | /* nodeBuffer->nodeBlocknum is the hash key and was filled in already */ |
137 | nodeBuffer->blocksCount = 0; |
138 | nodeBuffer->pageBlocknum = InvalidBlockNumber; |
139 | nodeBuffer->pageBuffer = NULL; |
140 | nodeBuffer->queuedForEmptying = false; |
141 | nodeBuffer->isTemp = false; |
142 | nodeBuffer->level = level; |
143 | |
144 | /* |
145 | * Add this buffer to the list of buffers on this level. Enlarge |
146 | * buffersOnLevels array if needed. |
147 | */ |
148 | if (level >= gfbb->buffersOnLevelsLen) |
149 | { |
150 | int i; |
151 | |
152 | gfbb->buffersOnLevels = |
153 | (List **) repalloc(gfbb->buffersOnLevels, |
154 | (level + 1) * sizeof(List *)); |
155 | |
156 | /* initialize the enlarged portion */ |
157 | for (i = gfbb->buffersOnLevelsLen; i <= level; i++) |
158 | gfbb->buffersOnLevels[i] = NIL; |
159 | gfbb->buffersOnLevelsLen = level + 1; |
160 | } |
161 | |
162 | /* |
163 | * Prepend the new buffer to the list of buffers on this level. It's |
164 | * not arbitrary that the new buffer is put to the beginning of the |
165 | * list: in the final emptying phase we loop through all buffers at |
166 | * each level, and flush them. If a page is split during the emptying, |
167 | * it's more efficient to flush the new splitted pages first, before |
168 | * moving on to pre-existing pages on the level. The buffers just |
169 | * created during the page split are likely still in cache, so |
170 | * flushing them immediately is more efficient than putting them to |
171 | * the end of the queue. |
172 | */ |
173 | gfbb->buffersOnLevels[level] = lcons(nodeBuffer, |
174 | gfbb->buffersOnLevels[level]); |
175 | |
176 | MemoryContextSwitchTo(oldcxt); |
177 | } |
178 | |
179 | return nodeBuffer; |
180 | } |
181 | |
182 | /* |
183 | * Allocate memory for a buffer page. |
184 | */ |
185 | static GISTNodeBufferPage * |
186 | gistAllocateNewPageBuffer(GISTBuildBuffers *gfbb) |
187 | { |
188 | GISTNodeBufferPage *pageBuffer; |
189 | |
190 | pageBuffer = (GISTNodeBufferPage *) MemoryContextAllocZero(gfbb->context, |
191 | BLCKSZ); |
192 | pageBuffer->prev = InvalidBlockNumber; |
193 | |
194 | /* Set page free space */ |
195 | PAGE_FREE_SPACE(pageBuffer) = BLCKSZ - BUFFER_PAGE_DATA_OFFSET; |
196 | return pageBuffer; |
197 | } |
198 | |
199 | /* |
200 | * Add specified buffer into loadedBuffers array. |
201 | */ |
202 | static void |
203 | gistAddLoadedBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer) |
204 | { |
205 | /* Never add a temporary buffer to the array */ |
206 | if (nodeBuffer->isTemp) |
207 | return; |
208 | |
209 | /* Enlarge the array if needed */ |
210 | if (gfbb->loadedBuffersCount >= gfbb->loadedBuffersLen) |
211 | { |
212 | gfbb->loadedBuffersLen *= 2; |
213 | gfbb->loadedBuffers = (GISTNodeBuffer **) |
214 | repalloc(gfbb->loadedBuffers, |
215 | gfbb->loadedBuffersLen * sizeof(GISTNodeBuffer *)); |
216 | } |
217 | |
218 | gfbb->loadedBuffers[gfbb->loadedBuffersCount] = nodeBuffer; |
219 | gfbb->loadedBuffersCount++; |
220 | } |
221 | |
222 | /* |
223 | * Load last page of node buffer into main memory. |
224 | */ |
225 | static void |
226 | gistLoadNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer) |
227 | { |
228 | /* Check if we really should load something */ |
229 | if (!nodeBuffer->pageBuffer && nodeBuffer->blocksCount > 0) |
230 | { |
231 | /* Allocate memory for page */ |
232 | nodeBuffer->pageBuffer = gistAllocateNewPageBuffer(gfbb); |
233 | |
234 | /* Read block from temporary file */ |
235 | ReadTempFileBlock(gfbb->pfile, nodeBuffer->pageBlocknum, |
236 | nodeBuffer->pageBuffer); |
237 | |
238 | /* Mark file block as free */ |
239 | gistBuffersReleaseBlock(gfbb, nodeBuffer->pageBlocknum); |
240 | |
241 | /* Mark node buffer as loaded */ |
242 | gistAddLoadedBuffer(gfbb, nodeBuffer); |
243 | nodeBuffer->pageBlocknum = InvalidBlockNumber; |
244 | } |
245 | } |
246 | |
247 | /* |
248 | * Write last page of node buffer to the disk. |
249 | */ |
250 | static void |
251 | gistUnloadNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer) |
252 | { |
253 | /* Check if we have something to write */ |
254 | if (nodeBuffer->pageBuffer) |
255 | { |
256 | BlockNumber blkno; |
257 | |
258 | /* Get free file block */ |
259 | blkno = gistBuffersGetFreeBlock(gfbb); |
260 | |
261 | /* Write block to the temporary file */ |
262 | WriteTempFileBlock(gfbb->pfile, blkno, nodeBuffer->pageBuffer); |
263 | |
264 | /* Free memory of that page */ |
265 | pfree(nodeBuffer->pageBuffer); |
266 | nodeBuffer->pageBuffer = NULL; |
267 | |
268 | /* Save block number */ |
269 | nodeBuffer->pageBlocknum = blkno; |
270 | } |
271 | } |
272 | |
273 | /* |
274 | * Write last pages of all node buffers to the disk. |
275 | */ |
276 | void |
277 | gistUnloadNodeBuffers(GISTBuildBuffers *gfbb) |
278 | { |
279 | int i; |
280 | |
281 | /* Unload all the buffers that have a page loaded in memory. */ |
282 | for (i = 0; i < gfbb->loadedBuffersCount; i++) |
283 | gistUnloadNodeBuffer(gfbb, gfbb->loadedBuffers[i]); |
284 | |
285 | /* Now there are no node buffers with loaded last page */ |
286 | gfbb->loadedBuffersCount = 0; |
287 | } |
288 | |
289 | /* |
290 | * Add index tuple to buffer page. |
291 | */ |
292 | static void |
293 | gistPlaceItupToPage(GISTNodeBufferPage *pageBuffer, IndexTuple itup) |
294 | { |
295 | Size itupsz = IndexTupleSize(itup); |
296 | char *ptr; |
297 | |
298 | /* There should be enough of space. */ |
299 | Assert(PAGE_FREE_SPACE(pageBuffer) >= MAXALIGN(itupsz)); |
300 | |
301 | /* Reduce free space value of page to reserve a spot for the tuple. */ |
302 | PAGE_FREE_SPACE(pageBuffer) -= MAXALIGN(itupsz); |
303 | |
304 | /* Get pointer to the spot we reserved (ie. end of free space). */ |
305 | ptr = (char *) pageBuffer + BUFFER_PAGE_DATA_OFFSET |
306 | + PAGE_FREE_SPACE(pageBuffer); |
307 | |
308 | /* Copy the index tuple there. */ |
309 | memcpy(ptr, itup, itupsz); |
310 | } |
311 | |
312 | /* |
313 | * Get last item from buffer page and remove it from page. |
314 | */ |
315 | static void |
316 | gistGetItupFromPage(GISTNodeBufferPage *pageBuffer, IndexTuple *itup) |
317 | { |
318 | IndexTuple ptr; |
319 | Size itupsz; |
320 | |
321 | Assert(!PAGE_IS_EMPTY(pageBuffer)); /* Page shouldn't be empty */ |
322 | |
323 | /* Get pointer to last index tuple */ |
324 | ptr = (IndexTuple) ((char *) pageBuffer |
325 | + BUFFER_PAGE_DATA_OFFSET |
326 | + PAGE_FREE_SPACE(pageBuffer)); |
327 | itupsz = IndexTupleSize(ptr); |
328 | |
329 | /* Make a copy of the tuple */ |
330 | *itup = (IndexTuple) palloc(itupsz); |
331 | memcpy(*itup, ptr, itupsz); |
332 | |
333 | /* Mark the space used by the tuple as free */ |
334 | PAGE_FREE_SPACE(pageBuffer) += MAXALIGN(itupsz); |
335 | } |
336 | |
337 | /* |
338 | * Push an index tuple to node buffer. |
339 | */ |
340 | void |
341 | gistPushItupToNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer, |
342 | IndexTuple itup) |
343 | { |
344 | /* |
345 | * Most part of memory operations will be in buffering build persistent |
346 | * context. So, let's switch to it. |
347 | */ |
348 | MemoryContext oldcxt = MemoryContextSwitchTo(gfbb->context); |
349 | |
350 | /* |
351 | * If the buffer is currently empty, create the first page. |
352 | */ |
353 | if (nodeBuffer->blocksCount == 0) |
354 | { |
355 | nodeBuffer->pageBuffer = gistAllocateNewPageBuffer(gfbb); |
356 | nodeBuffer->blocksCount = 1; |
357 | gistAddLoadedBuffer(gfbb, nodeBuffer); |
358 | } |
359 | |
360 | /* Load last page of node buffer if it wasn't in memory already */ |
361 | if (!nodeBuffer->pageBuffer) |
362 | gistLoadNodeBuffer(gfbb, nodeBuffer); |
363 | |
364 | /* |
365 | * Check if there is enough space on the last page for the tuple. |
366 | */ |
367 | if (PAGE_NO_SPACE(nodeBuffer->pageBuffer, itup)) |
368 | { |
369 | /* |
370 | * Nope. Swap previous block to disk and allocate a new one. |
371 | */ |
372 | BlockNumber blkno; |
373 | |
374 | /* Write filled page to the disk */ |
375 | blkno = gistBuffersGetFreeBlock(gfbb); |
376 | WriteTempFileBlock(gfbb->pfile, blkno, nodeBuffer->pageBuffer); |
377 | |
378 | /* |
379 | * Reset the in-memory page as empty, and link the previous block to |
380 | * the new page by storing its block number in the prev-link. |
381 | */ |
382 | PAGE_FREE_SPACE(nodeBuffer->pageBuffer) = |
383 | BLCKSZ - MAXALIGN(offsetof(GISTNodeBufferPage, tupledata)); |
384 | nodeBuffer->pageBuffer->prev = blkno; |
385 | |
386 | /* We've just added one more page */ |
387 | nodeBuffer->blocksCount++; |
388 | } |
389 | |
390 | gistPlaceItupToPage(nodeBuffer->pageBuffer, itup); |
391 | |
392 | /* |
393 | * If the buffer just overflowed, add it to the emptying queue. |
394 | */ |
395 | if (BUFFER_HALF_FILLED(nodeBuffer, gfbb) && !nodeBuffer->queuedForEmptying) |
396 | { |
397 | gfbb->bufferEmptyingQueue = lcons(nodeBuffer, |
398 | gfbb->bufferEmptyingQueue); |
399 | nodeBuffer->queuedForEmptying = true; |
400 | } |
401 | |
402 | /* Restore memory context */ |
403 | MemoryContextSwitchTo(oldcxt); |
404 | } |
405 | |
406 | /* |
407 | * Removes one index tuple from node buffer. Returns true if success and false |
408 | * if node buffer is empty. |
409 | */ |
410 | bool |
411 | gistPopItupFromNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer, |
412 | IndexTuple *itup) |
413 | { |
414 | /* |
415 | * If node buffer is empty then return false. |
416 | */ |
417 | if (nodeBuffer->blocksCount <= 0) |
418 | return false; |
419 | |
420 | /* Load last page of node buffer if needed */ |
421 | if (!nodeBuffer->pageBuffer) |
422 | gistLoadNodeBuffer(gfbb, nodeBuffer); |
423 | |
424 | /* |
425 | * Get index tuple from last non-empty page. |
426 | */ |
427 | gistGetItupFromPage(nodeBuffer->pageBuffer, itup); |
428 | |
429 | /* |
430 | * If we just removed the last tuple from the page, fetch previous page on |
431 | * this node buffer (if any). |
432 | */ |
433 | if (PAGE_IS_EMPTY(nodeBuffer->pageBuffer)) |
434 | { |
435 | BlockNumber prevblkno; |
436 | |
437 | /* |
438 | * blocksCount includes the page in pageBuffer, so decrease it now. |
439 | */ |
440 | nodeBuffer->blocksCount--; |
441 | |
442 | /* |
443 | * If there's more pages, fetch previous one. |
444 | */ |
445 | prevblkno = nodeBuffer->pageBuffer->prev; |
446 | if (prevblkno != InvalidBlockNumber) |
447 | { |
448 | /* There is a previous page. Fetch it. */ |
449 | Assert(nodeBuffer->blocksCount > 0); |
450 | ReadTempFileBlock(gfbb->pfile, prevblkno, nodeBuffer->pageBuffer); |
451 | |
452 | /* |
453 | * Now that we've read the block in memory, we can release its |
454 | * on-disk block for reuse. |
455 | */ |
456 | gistBuffersReleaseBlock(gfbb, prevblkno); |
457 | } |
458 | else |
459 | { |
460 | /* No more pages. Free memory. */ |
461 | Assert(nodeBuffer->blocksCount == 0); |
462 | pfree(nodeBuffer->pageBuffer); |
463 | nodeBuffer->pageBuffer = NULL; |
464 | } |
465 | } |
466 | return true; |
467 | } |
468 | |
469 | /* |
470 | * Select a currently unused block for writing to. |
471 | */ |
472 | static long |
473 | gistBuffersGetFreeBlock(GISTBuildBuffers *gfbb) |
474 | { |
475 | /* |
476 | * If there are multiple free blocks, we select the one appearing last in |
477 | * freeBlocks[]. If there are none, assign the next block at the end of |
478 | * the file (causing the file to be extended). |
479 | */ |
480 | if (gfbb->nFreeBlocks > 0) |
481 | return gfbb->freeBlocks[--gfbb->nFreeBlocks]; |
482 | else |
483 | return gfbb->nFileBlocks++; |
484 | } |
485 | |
486 | /* |
487 | * Return a block# to the freelist. |
488 | */ |
489 | static void |
490 | gistBuffersReleaseBlock(GISTBuildBuffers *gfbb, long blocknum) |
491 | { |
492 | int ndx; |
493 | |
494 | /* Enlarge freeBlocks array if full. */ |
495 | if (gfbb->nFreeBlocks >= gfbb->freeBlocksLen) |
496 | { |
497 | gfbb->freeBlocksLen *= 2; |
498 | gfbb->freeBlocks = (long *) repalloc(gfbb->freeBlocks, |
499 | gfbb->freeBlocksLen * |
500 | sizeof(long)); |
501 | } |
502 | |
503 | /* Add blocknum to array */ |
504 | ndx = gfbb->nFreeBlocks++; |
505 | gfbb->freeBlocks[ndx] = blocknum; |
506 | } |
507 | |
508 | /* |
509 | * Free buffering build data structure. |
510 | */ |
511 | void |
512 | gistFreeBuildBuffers(GISTBuildBuffers *gfbb) |
513 | { |
514 | /* Close buffers file. */ |
515 | BufFileClose(gfbb->pfile); |
516 | |
517 | /* All other things will be freed on memory context release */ |
518 | } |
519 | |
520 | /* |
521 | * Data structure representing information about node buffer for index tuples |
522 | * relocation from splitted node buffer. |
523 | */ |
524 | typedef struct |
525 | { |
526 | GISTENTRY entry[INDEX_MAX_KEYS]; |
527 | bool isnull[INDEX_MAX_KEYS]; |
528 | GISTPageSplitInfo *splitinfo; |
529 | GISTNodeBuffer *nodeBuffer; |
530 | } RelocationBufferInfo; |
531 | |
532 | /* |
533 | * At page split, distribute tuples from the buffer of the split page to |
534 | * new buffers for the created page halves. This also adjusts the downlinks |
535 | * in 'splitinfo' to include the tuples in the buffers. |
536 | */ |
537 | void |
538 | gistRelocateBuildBuffersOnSplit(GISTBuildBuffers *gfbb, GISTSTATE *giststate, |
539 | Relation r, int level, |
540 | Buffer buffer, List *splitinfo) |
541 | { |
542 | RelocationBufferInfo *relocationBuffersInfos; |
543 | bool found; |
544 | GISTNodeBuffer *nodeBuffer; |
545 | BlockNumber blocknum; |
546 | IndexTuple itup; |
547 | int splitPagesCount = 0, |
548 | i; |
549 | GISTENTRY entry[INDEX_MAX_KEYS]; |
550 | bool isnull[INDEX_MAX_KEYS]; |
551 | GISTNodeBuffer oldBuf; |
552 | ListCell *lc; |
553 | |
554 | /* If the splitted page doesn't have buffers, we have nothing to do. */ |
555 | if (!LEVEL_HAS_BUFFERS(level, gfbb)) |
556 | return; |
557 | |
558 | /* |
559 | * Get the node buffer of the splitted page. |
560 | */ |
561 | blocknum = BufferGetBlockNumber(buffer); |
562 | nodeBuffer = hash_search(gfbb->nodeBuffersTab, &blocknum, |
563 | HASH_FIND, &found); |
564 | if (!found) |
565 | { |
566 | /* The page has no buffer, so we have nothing to do. */ |
567 | return; |
568 | } |
569 | |
570 | /* |
571 | * Make a copy of the old buffer, as we're going reuse it as the buffer |
572 | * for the new left page, which is on the same block as the old page. |
573 | * That's not true for the root page, but that's fine because we never |
574 | * have a buffer on the root page anyway. The original algorithm as |
575 | * described by Arge et al did, but it's of no use, as you might as well |
576 | * read the tuples straight from the heap instead of the root buffer. |
577 | */ |
578 | Assert(blocknum != GIST_ROOT_BLKNO); |
579 | memcpy(&oldBuf, nodeBuffer, sizeof(GISTNodeBuffer)); |
580 | oldBuf.isTemp = true; |
581 | |
582 | /* Reset the old buffer, used for the new left page from now on */ |
583 | nodeBuffer->blocksCount = 0; |
584 | nodeBuffer->pageBuffer = NULL; |
585 | nodeBuffer->pageBlocknum = InvalidBlockNumber; |
586 | |
587 | /* |
588 | * Allocate memory for information about relocation buffers. |
589 | */ |
590 | splitPagesCount = list_length(splitinfo); |
591 | relocationBuffersInfos = |
592 | (RelocationBufferInfo *) palloc(sizeof(RelocationBufferInfo) * |
593 | splitPagesCount); |
594 | |
595 | /* |
596 | * Fill relocation buffers information for node buffers of pages produced |
597 | * by split. |
598 | */ |
599 | i = 0; |
600 | foreach(lc, splitinfo) |
601 | { |
602 | GISTPageSplitInfo *si = (GISTPageSplitInfo *) lfirst(lc); |
603 | GISTNodeBuffer *newNodeBuffer; |
604 | |
605 | /* Decompress parent index tuple of node buffer page. */ |
606 | gistDeCompressAtt(giststate, r, |
607 | si->downlink, NULL, (OffsetNumber) 0, |
608 | relocationBuffersInfos[i].entry, |
609 | relocationBuffersInfos[i].isnull); |
610 | |
611 | /* |
612 | * Create a node buffer for the page. The leftmost half is on the same |
613 | * block as the old page before split, so for the leftmost half this |
614 | * will return the original buffer. The tuples on the original buffer |
615 | * were relinked to the temporary buffer, so the original one is now |
616 | * empty. |
617 | */ |
618 | newNodeBuffer = gistGetNodeBuffer(gfbb, giststate, BufferGetBlockNumber(si->buf), level); |
619 | |
620 | relocationBuffersInfos[i].nodeBuffer = newNodeBuffer; |
621 | relocationBuffersInfos[i].splitinfo = si; |
622 | |
623 | i++; |
624 | } |
625 | |
626 | /* |
627 | * Loop through all index tuples in the buffer of the page being split, |
628 | * moving them to buffers for the new pages. We try to move each tuple to |
629 | * the page that will result in the lowest penalty for the leading column |
630 | * or, in the case of a tie, the lowest penalty for the earliest column |
631 | * that is not tied. |
632 | * |
633 | * The page searching logic is very similar to gistchoose(). |
634 | */ |
635 | while (gistPopItupFromNodeBuffer(gfbb, &oldBuf, &itup)) |
636 | { |
637 | float best_penalty[INDEX_MAX_KEYS]; |
638 | int i, |
639 | which; |
640 | IndexTuple newtup; |
641 | RelocationBufferInfo *targetBufferInfo; |
642 | |
643 | gistDeCompressAtt(giststate, r, |
644 | itup, NULL, (OffsetNumber) 0, entry, isnull); |
645 | |
646 | /* default to using first page (shouldn't matter) */ |
647 | which = 0; |
648 | |
649 | /* |
650 | * best_penalty[j] is the best penalty we have seen so far for column |
651 | * j, or -1 when we haven't yet examined column j. Array entries to |
652 | * the right of the first -1 are undefined. |
653 | */ |
654 | best_penalty[0] = -1; |
655 | |
656 | /* |
657 | * Loop over possible target pages, looking for one to move this tuple |
658 | * to. |
659 | */ |
660 | for (i = 0; i < splitPagesCount; i++) |
661 | { |
662 | RelocationBufferInfo *splitPageInfo = &relocationBuffersInfos[i]; |
663 | bool zero_penalty; |
664 | int j; |
665 | |
666 | zero_penalty = true; |
667 | |
668 | /* Loop over index attributes. */ |
669 | for (j = 0; j < r->rd_att->natts; j++) |
670 | { |
671 | float usize; |
672 | |
673 | /* Compute penalty for this column. */ |
674 | usize = gistpenalty(giststate, j, |
675 | &splitPageInfo->entry[j], |
676 | splitPageInfo->isnull[j], |
677 | &entry[j], isnull[j]); |
678 | if (usize > 0) |
679 | zero_penalty = false; |
680 | |
681 | if (best_penalty[j] < 0 || usize < best_penalty[j]) |
682 | { |
683 | /* |
684 | * New best penalty for column. Tentatively select this |
685 | * page as the target, and record the best penalty. Then |
686 | * reset the next column's penalty to "unknown" (and |
687 | * indirectly, the same for all the ones to its right). |
688 | * This will force us to adopt this page's penalty values |
689 | * as the best for all the remaining columns during |
690 | * subsequent loop iterations. |
691 | */ |
692 | which = i; |
693 | best_penalty[j] = usize; |
694 | |
695 | if (j < r->rd_att->natts - 1) |
696 | best_penalty[j + 1] = -1; |
697 | } |
698 | else if (best_penalty[j] == usize) |
699 | { |
700 | /* |
701 | * The current page is exactly as good for this column as |
702 | * the best page seen so far. The next iteration of this |
703 | * loop will compare the next column. |
704 | */ |
705 | } |
706 | else |
707 | { |
708 | /* |
709 | * The current page is worse for this column than the best |
710 | * page seen so far. Skip the remaining columns and move |
711 | * on to the next page, if any. |
712 | */ |
713 | zero_penalty = false; /* so outer loop won't exit */ |
714 | break; |
715 | } |
716 | } |
717 | |
718 | /* |
719 | * If we find a page with zero penalty for all columns, there's no |
720 | * need to examine remaining pages; just break out of the loop and |
721 | * return it. |
722 | */ |
723 | if (zero_penalty) |
724 | break; |
725 | } |
726 | |
727 | /* OK, "which" is the page index to push the tuple to */ |
728 | targetBufferInfo = &relocationBuffersInfos[which]; |
729 | |
730 | /* Push item to selected node buffer */ |
731 | gistPushItupToNodeBuffer(gfbb, targetBufferInfo->nodeBuffer, itup); |
732 | |
733 | /* Adjust the downlink for this page, if needed. */ |
734 | newtup = gistgetadjusted(r, targetBufferInfo->splitinfo->downlink, |
735 | itup, giststate); |
736 | if (newtup) |
737 | { |
738 | gistDeCompressAtt(giststate, r, |
739 | newtup, NULL, (OffsetNumber) 0, |
740 | targetBufferInfo->entry, |
741 | targetBufferInfo->isnull); |
742 | |
743 | targetBufferInfo->splitinfo->downlink = newtup; |
744 | } |
745 | } |
746 | |
747 | pfree(relocationBuffersInfos); |
748 | } |
749 | |
750 | |
751 | /* |
752 | * Wrappers around BufFile operations. The main difference is that these |
753 | * wrappers report errors with ereport(), so that the callers don't need |
754 | * to check the return code. |
755 | */ |
756 | |
757 | static void |
758 | ReadTempFileBlock(BufFile *file, long blknum, void *ptr) |
759 | { |
760 | if (BufFileSeekBlock(file, blknum) != 0) |
761 | elog(ERROR, "could not seek temporary file: %m" ); |
762 | if (BufFileRead(file, ptr, BLCKSZ) != BLCKSZ) |
763 | elog(ERROR, "could not read temporary file: %m" ); |
764 | } |
765 | |
766 | static void |
767 | WriteTempFileBlock(BufFile *file, long blknum, void *ptr) |
768 | { |
769 | if (BufFileSeekBlock(file, blknum) != 0) |
770 | elog(ERROR, "could not seek temporary file: %m" ); |
771 | if (BufFileWrite(file, ptr, BLCKSZ) != BLCKSZ) |
772 | { |
773 | /* |
774 | * the other errors in Read/WriteTempFileBlock shouldn't happen, but |
775 | * an error at write can easily happen if you run out of disk space. |
776 | */ |
777 | ereport(ERROR, |
778 | (errcode_for_file_access(), |
779 | errmsg("could not write block %ld of temporary file: %m" , |
780 | blknum))); |
781 | } |
782 | } |
783 | |