/*-------------------------------------------------------------------------
 *
 * gistbuild.c
 *    implementation of the build algorithm for GiST indexes.
 *
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *    src/backend/access/gist/gistbuild.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <math.h>

#include "access/genam.h"
#include "access/gist_private.h"
#include "access/gistxlog.h"
#include "access/tableam.h"
#include "access/xloginsert.h"
#include "catalog/index.h"
#include "miscadmin.h"
#include "optimizer/optimizer.h"
#include "storage/bufmgr.h"
#include "storage/smgr.h"
#include "utils/memutils.h"
#include "utils/rel.h"

/* Number of index tuples between checks of whether to switch to buffering build mode */
#define BUFFERING_MODE_SWITCH_CHECK_STEP 256

/*
 * Number of tuples to process in the slow way before switching to buffering
 * mode, when buffering is explicitly turned on. Also, the number of tuples
 * to process between readjusting the buffer size parameter, while in
 * buffering mode.
 */
#define BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET 4096

typedef enum
{
    GIST_BUFFERING_DISABLED,    /* in regular build mode and aren't going to
                                 * switch */
    GIST_BUFFERING_AUTO,        /* in regular build mode, but will switch to
                                 * buffering build mode if the index grows too
                                 * big */
    GIST_BUFFERING_STATS,       /* gathering statistics of index tuple size
                                 * before switching to the buffering build
                                 * mode */
    GIST_BUFFERING_ACTIVE       /* in buffering build mode */
} GistBufferingMode;
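
/*
 * State transitions, in brief (ACTIVE and DISABLED are terminal):
 *
 *   AUTO  -> ACTIVE        index outgrew effective_cache_size and
 *                          gistInitBuffering() succeeded
 *   STATS -> ACTIVE        enough tuples seen to estimate tuple sizes and
 *                          gistInitBuffering() succeeded
 *   AUTO/STATS -> DISABLED gistInitBuffering() found too little memory for
 *                          a buffering build
 */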

/* Working state for gistbuild and its callback */
typedef struct
{
    Relation    indexrel;
    Relation    heaprel;
    GISTSTATE  *giststate;

    int64       indtuples;      /* number of tuples indexed */
    int64       indtuplesSize;  /* total size of all indexed tuples */

    Size        freespace;      /* amount of free space to leave on pages */

    /*
     * Extra data structures used during a buffering build. 'gfbb' contains
     * information related to managing the build buffers. 'parentMap' is a
     * lookup table of the parent of each internal page.
     */
    GISTBuildBuffers *gfbb;
    HTAB       *parentMap;

    GistBufferingMode bufferingMode;
} GISTBuildState;

/* prototypes for private functions */
static void gistInitBuffering(GISTBuildState *buildstate);
static int  calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep);
static void gistBuildCallback(Relation index,
                              HeapTuple htup,
                              Datum *values,
                              bool *isnull,
                              bool tupleIsAlive,
                              void *state);
static void gistBufferingBuildInsert(GISTBuildState *buildstate,
                                     IndexTuple itup);
static bool gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
                            BlockNumber startblkno, int startlevel);
static BlockNumber gistbufferinginserttuples(GISTBuildState *buildstate,
                                             Buffer buffer, int level,
                                             IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
                                             BlockNumber parentblk, OffsetNumber downlinkoffnum);
static Buffer gistBufferingFindCorrectParent(GISTBuildState *buildstate,
                                             BlockNumber childblkno, int level,
                                             BlockNumber *parentblk,
                                             OffsetNumber *downlinkoffnum);
static void gistProcessEmptyingQueue(GISTBuildState *buildstate);
static void gistEmptyAllBuffers(GISTBuildState *buildstate);
static int  gistGetMaxLevel(Relation index);

static void gistInitParentMap(GISTBuildState *buildstate);
static void gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child,
                               BlockNumber parent);
static void gistMemorizeAllDownlinks(GISTBuildState *buildstate, Buffer parent);
static BlockNumber gistGetParent(GISTBuildState *buildstate, BlockNumber child);

/*
 * Main entry point to GiST index build. Initially calls insert over and over,
 * but switches to the more efficient buffering build algorithm after a
 * certain number of tuples (unless buffering mode is disabled).
 */
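/*
 * Rough call graph (simplified):
 *
 *   gistbuild()
 *     -> table_index_build_scan()
 *          -> gistBuildCallback() for each heap tuple:
 *               regular mode:   gistdoinsert()
 *               buffering mode: gistBufferingBuildInsert()
 *     -> gistEmptyAllBuffers() if buffering mode was activated
 *     -> log_newpage_range() if the index needs WAL-logging
 */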
IndexBuildResult *
gistbuild(Relation heap, Relation index, IndexInfo *indexInfo)
{
    IndexBuildResult *result;
    double      reltuples;
    GISTBuildState buildstate;
    Buffer      buffer;
    Page        page;
    MemoryContext oldcxt = CurrentMemoryContext;
    int         fillfactor;

    buildstate.indexrel = index;
    buildstate.heaprel = heap;
    if (index->rd_options)
    {
        /* Get buffering mode from the options string */
        GiSTOptions *options = (GiSTOptions *) index->rd_options;
        char       *bufferingMode = (char *) options + options->bufferingModeOffset;

        if (strcmp(bufferingMode, "on") == 0)
            buildstate.bufferingMode = GIST_BUFFERING_STATS;
        else if (strcmp(bufferingMode, "off") == 0)
            buildstate.bufferingMode = GIST_BUFFERING_DISABLED;
        else
            buildstate.bufferingMode = GIST_BUFFERING_AUTO;

        fillfactor = options->fillfactor;
    }
    else
    {
        /*
         * By default, switch to buffering mode when the index grows too large
         * to fit in cache.
         */
        buildstate.bufferingMode = GIST_BUFFERING_AUTO;
        fillfactor = GIST_DEFAULT_FILLFACTOR;
    }
    /* Calculate target amount of free space to leave on pages */
    buildstate.freespace = BLCKSZ * (100 - fillfactor) / 100;

    /*
     * We expect to be called exactly once for any index relation. If that's
     * not the case, big trouble's what we have.
     */
    if (RelationGetNumberOfBlocks(index) != 0)
        elog(ERROR, "index \"%s\" already contains data",
             RelationGetRelationName(index));

    /* no locking is needed */
    buildstate.giststate = initGISTstate(index);

    /*
     * Create a temporary memory context that is reset once for each tuple
     * processed. (Note: we don't bother to make this a child of the
     * giststate's scanCxt, so we have to delete it separately at the end.)
     */
    buildstate.giststate->tempCxt = createTempGistContext();

    /* initialize the root page */
    buffer = gistNewBuffer(index);
    Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO);
    page = BufferGetPage(buffer);

    START_CRIT_SECTION();

    GISTInitBuffer(buffer, F_LEAF);

    MarkBufferDirty(buffer);
    PageSetLSN(page, GistBuildLSN);

    UnlockReleaseBuffer(buffer);

    END_CRIT_SECTION();

    /* build the index */
    buildstate.indtuples = 0;
    buildstate.indtuplesSize = 0;

    /*
     * Do the heap scan.
     */
    reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
                                       gistBuildCallback,
                                       (void *) &buildstate, NULL);

    /*
     * If buffering was used, flush out all the tuples that are still in the
     * buffers.
     */
    if (buildstate.bufferingMode == GIST_BUFFERING_ACTIVE)
    {
        elog(DEBUG1, "all tuples processed, emptying buffers");
        gistEmptyAllBuffers(&buildstate);
        gistFreeBuildBuffers(buildstate.gfbb);
    }

    /* okay, all heap tuples are indexed */
    MemoryContextSwitchTo(oldcxt);
    MemoryContextDelete(buildstate.giststate->tempCxt);

    freeGISTstate(buildstate.giststate);

    /*
     * We didn't write WAL records as we built the index, so if WAL-logging is
     * required, write all pages to the WAL now.
     */
    if (RelationNeedsWAL(index))
    {
        log_newpage_range(index, MAIN_FORKNUM,
                          0, RelationGetNumberOfBlocks(index),
                          true);
    }

    /*
     * Return statistics
     */
    result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));

    result->heap_tuples = reltuples;
    result->index_tuples = (double) buildstate.indtuples;

    return result;
}

/*
 * Validator for "buffering" reloption on GiST indexes. Allows "on", "off"
 * and "auto" values.
 */
void
gistValidateBufferingOption(const char *value)
{
    if (value == NULL ||
        (strcmp(value, "on") != 0 &&
         strcmp(value, "off") != 0 &&
         strcmp(value, "auto") != 0))
    {
        ereport(ERROR,
                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                 errmsg("invalid value for \"buffering\" option"),
                 errdetail("Valid values are \"on\", \"off\", and \"auto\".")));
    }
}

/*
 * Attempt to switch to buffering mode.
 *
 * If there is not enough memory for buffering build, sets bufferingMode
 * to GIST_BUFFERING_DISABLED, so that we don't bother to try the switch
 * anymore. Otherwise initializes the build buffers, and sets bufferingMode to
 * GIST_BUFFERING_ACTIVE.
 */
static void
gistInitBuffering(GISTBuildState *buildstate)
{
    Relation    index = buildstate->indexrel;
    int         pagesPerBuffer;
    Size        pageFreeSpace;
    Size        itupAvgSize,
                itupMinSize;
    double      avgIndexTuplesPerPage,
                maxIndexTuplesPerPage;
    int         i;
    int         levelStep;

    /* Calculate the space on an index page available for index tuples */
    pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)
        - sizeof(ItemIdData)
        - buildstate->freespace;

    /*
     * Calculate the average size of the index tuples inserted so far, using
     * the gathered statistics.
     */
    itupAvgSize = (double) buildstate->indtuplesSize /
        (double) buildstate->indtuples;

    /*
     * Calculate the minimal possible size of an index tuple from the index
     * metadata. The minimal possible size of a varlena is VARHDRSZ.
     *
     * XXX: that's not actually true, as a short varlena can be just 2 bytes.
     * And we should take padding into account here.
     */
    itupMinSize = (Size) MAXALIGN(sizeof(IndexTupleData));
    for (i = 0; i < index->rd_att->natts; i++)
    {
        if (TupleDescAttr(index->rd_att, i)->attlen < 0)
            itupMinSize += VARHDRSZ;
        else
            itupMinSize += TupleDescAttr(index->rd_att, i)->attlen;
    }

    /* Calculate the average and maximal number of index tuples that fit on a page */
    avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize;
    maxIndexTuplesPerPage = pageFreeSpace / itupMinSize;

    /*
     * We need to calculate two parameters for the buffering algorithm:
     * levelStep and pagesPerBuffer.
     *
     * levelStep determines the size of subtree that we operate on, while
     * emptying a buffer. A higher value is better, as you need fewer buffer
     * emptying steps to build the index. However, if you set it too high, the
     * subtree doesn't fit in cache anymore, and you quickly lose the benefit
     * of the buffers.
     *
     * In Arge et al's paper, levelStep is chosen as logB(M/4B), where B is
     * the number of tuples on a page (ie. fanout), and M is the amount of
     * internal memory available. Curiously, they don't explain *why* that
     * setting is optimal. We calculate it by taking the highest levelStep so
     * that a subtree still fits in cache. For a small B, our way of
     * calculating levelStep is very close to Arge et al's formula. For a
     * large B, our formula gives a value that is 2x higher.
     *
     * The average size (in pages) of a subtree of depth n can be calculated
     * as a geometric series:
     *
     * B^0 + B^1 + B^2 + ... + B^n = (1 - B^(n + 1)) / (1 - B)
     *
     * where B is the average number of index tuples on a page. The subtree is
     * cached in the shared buffer cache and the OS cache, so we choose
     * levelStep so that the subtree size is comfortably smaller than
     * effective_cache_size, with a safety factor of 4.
     *
     * The estimate on the average number of index tuples on a page is based
     * on the average tuple sizes observed before switching to buffered build,
     * so the real subtree size can be somewhat larger. Also, it would be
     * selfish to gobble the whole cache for our index build. The safety
     * factor of 4 should account for those effects.
     *
     * The other limiting factor for setting levelStep is that while
     * processing a subtree, we need to hold one page for each buffer at the
     * next lower buffered level. The max. number of buffers needed for that
     * is maxIndexTuplesPerPage^levelStep. This is very conservative, but
     * hopefully maintenance_work_mem is set high enough that you're
     * constrained by effective_cache_size rather than maintenance_work_mem.
     *
     * XXX: the buffer hash table consumes a fair amount of memory too per
     * buffer, but that is not currently taken into account. That scales on
     * the total number of buffers used, ie. the index size and on levelStep.
     * Note that a higher levelStep *reduces* the amount of memory needed for
     * the hash table.
     */
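
    /*
     * Worked example (numbers are hypothetical, for illustration only): with
     * BLCKSZ = 8192 and an average index tuple size of ~100 bytes, B =
     * avgIndexTuplesPerPage is roughly 80. For levelStep = 2, the average
     * subtree size is (80^3 - 1) / 79, about 6500 pages (~50 MB), so the
     * first check below passes only if effective_cache_size is at least
     * 4 * 6481 pages (~200 MB). The second check requires one page of
     * maintenance_work_mem for each of up to maxIndexTuplesPerPage^2
     * lowest-level subtree pages.
     */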
    levelStep = 1;
    for (;;)
    {
        double      subtreesize;
        double      maxlowestlevelpages;

        /* size of an average subtree at this levelStep (in pages). */
        subtreesize =
            (1 - pow(avgIndexTuplesPerPage, (double) (levelStep + 1))) /
            (1 - avgIndexTuplesPerPage);

        /* max number of pages at the lowest level of a subtree */
        maxlowestlevelpages = pow(maxIndexTuplesPerPage, (double) levelStep);

        /* subtree must fit in cache (with safety factor of 4) */
        if (subtreesize > effective_cache_size / 4)
            break;

        /* each node in the lowest level of a subtree has one page in memory */
        if (maxlowestlevelpages > ((double) maintenance_work_mem * 1024) / BLCKSZ)
            break;

        /* Good, we can handle this levelStep. See if we can go one higher. */
        levelStep++;
    }

    /*
     * The loop exited at the first unacceptable value of levelStep, so back
     * off by one to get the last acceptable value.
     */
    levelStep--;

    /*
     * If there's not enough cache or maintenance_work_mem, fall back to plain
     * inserts.
     */
    if (levelStep <= 0)
    {
        elog(DEBUG1, "failed to switch to buffered GiST build");
        buildstate->bufferingMode = GIST_BUFFERING_DISABLED;
        return;
    }

    /*
     * The second parameter to set is pagesPerBuffer, which determines the
     * size of each buffer. We adjust pagesPerBuffer also during the build,
     * which is why this calculation is in a separate function.
     */
    pagesPerBuffer = calculatePagesPerBuffer(buildstate, levelStep);

    /* Initialize GISTBuildBuffers with these parameters */
    buildstate->gfbb = gistInitBuildBuffers(pagesPerBuffer, levelStep,
                                            gistGetMaxLevel(index));

    gistInitParentMap(buildstate);

    buildstate->bufferingMode = GIST_BUFFERING_ACTIVE;

    elog(DEBUG1, "switched to buffered GiST build; level step = %d, pagesPerBuffer = %d",
         levelStep, pagesPerBuffer);
}

/*
 * Calculate the pagesPerBuffer parameter for the buffering algorithm.
 *
 * Buffer size is chosen so that, assuming that tuples are distributed
 * randomly, emptying half a buffer fills on average one page in every buffer
 * at the next lower level.
 */
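
/*
 * Derivation sketch: a buffer has about B^levelStep descendant buffers at
 * the next buffered level down, where B = avgIndexTuplesPerPage, because
 * each level of descent multiplies the number of pages by B. Emptying half a
 * buffer moves pagesPerBuffer / 2 pages' worth of tuples; spread evenly over
 * the descendants, that fills one page in each when
 * pagesPerBuffer / 2 = B^levelStep, giving the "2 * pow(...)" below.
 */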
static int
calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep)
{
    double      pagesPerBuffer;
    double      avgIndexTuplesPerPage;
    double      itupAvgSize;
    Size        pageFreeSpace;

    /* Calculate the space on an index page available for index tuples */
    pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)
        - sizeof(ItemIdData)
        - buildstate->freespace;

    /*
     * Calculate the average size of the index tuples inserted so far, using
     * the gathered statistics.
     */
    itupAvgSize = (double) buildstate->indtuplesSize /
        (double) buildstate->indtuples;

    avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize;

    /*
     * Recalculate the required buffer size.
     */
    pagesPerBuffer = 2 * pow(avgIndexTuplesPerPage, levelStep);

    return (int) rint(pagesPerBuffer);
}

/*
 * Per-tuple callback for table_index_build_scan.
 */
static void
gistBuildCallback(Relation index,
                  HeapTuple htup,
                  Datum *values,
                  bool *isnull,
                  bool tupleIsAlive,
                  void *state)
{
    GISTBuildState *buildstate = (GISTBuildState *) state;
    IndexTuple  itup;
    MemoryContext oldCtx;

    oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);

    /* form an index tuple and point it at the heap tuple */
    itup = gistFormTuple(buildstate->giststate, index, values, isnull, true);
    itup->t_tid = htup->t_self;

    if (buildstate->bufferingMode == GIST_BUFFERING_ACTIVE)
    {
        /* We have buffers, so use them. */
        gistBufferingBuildInsert(buildstate, itup);
    }
    else
    {
        /*
         * There are no buffers (yet). Since we already have the index
         * relation locked, we call gistdoinsert directly.
         */
        gistdoinsert(index, itup, buildstate->freespace,
                     buildstate->giststate, buildstate->heaprel, true);
    }

    /* Update tuple count and total size. */
    buildstate->indtuples += 1;
    buildstate->indtuplesSize += IndexTupleSize(itup);

    MemoryContextSwitchTo(oldCtx);
    MemoryContextReset(buildstate->giststate->tempCxt);

    if (buildstate->bufferingMode == GIST_BUFFERING_ACTIVE &&
        buildstate->indtuples % BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET == 0)
    {
        /* Adjust the target buffer size now */
        buildstate->gfbb->pagesPerBuffer =
            calculatePagesPerBuffer(buildstate, buildstate->gfbb->levelStep);
    }

    /*
     * In 'auto' mode, check if the index has grown too large to fit in cache,
     * and switch to buffering mode if it has.
     *
     * To avoid excessive calls to smgrnblocks(), only check this every
     * BUFFERING_MODE_SWITCH_CHECK_STEP index tuples.
     */
    if ((buildstate->bufferingMode == GIST_BUFFERING_AUTO &&
         buildstate->indtuples % BUFFERING_MODE_SWITCH_CHECK_STEP == 0 &&
         effective_cache_size < smgrnblocks(index->rd_smgr, MAIN_FORKNUM)) ||
        (buildstate->bufferingMode == GIST_BUFFERING_STATS &&
         buildstate->indtuples >= BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET))
    {
        /*
         * The index doesn't fit in the effective cache anymore. Try to switch
         * to buffering build mode.
         */
        gistInitBuffering(buildstate);
    }
}

/*
 * Insert function for buffering index build.
 */
static void
gistBufferingBuildInsert(GISTBuildState *buildstate, IndexTuple itup)
{
    /* Insert the tuple to buffers. */
    gistProcessItup(buildstate, itup, 0, buildstate->gfbb->rootlevel);

    /* If we filled up (half of a) buffer, process buffer emptying. */
    gistProcessEmptyingQueue(buildstate);
}

/*
 * Process an index tuple. Runs the tuple down the tree until we reach a leaf
 * page or node buffer, and inserts the tuple there. Returns true if we have
 * to stop the buffer emptying process (because one of the child buffers
 * can't take index tuples anymore).
 */
static bool
gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
                BlockNumber startblkno, int startlevel)
{
    GISTSTATE  *giststate = buildstate->giststate;
    GISTBuildBuffers *gfbb = buildstate->gfbb;
    Relation    indexrel = buildstate->indexrel;
    BlockNumber childblkno;
    Buffer      buffer;
    bool        result = false;
    BlockNumber blkno;
    int         level;
    OffsetNumber downlinkoffnum = InvalidOffsetNumber;
    BlockNumber parentblkno = InvalidBlockNumber;

    CHECK_FOR_INTERRUPTS();

    /*
     * Loop until we reach a leaf page (level == 0) or a level with buffers
     * (not including the level we start at, because we would otherwise make
     * no progress).
     */
    blkno = startblkno;
    level = startlevel;
    for (;;)
    {
        ItemId      iid;
        IndexTuple  idxtuple,
                    newtup;
        Page        page;
        OffsetNumber childoffnum;

        /* Have we reached a level with buffers? */
        if (LEVEL_HAS_BUFFERS(level, gfbb) && level != startlevel)
            break;

        /* Have we reached a leaf page? */
        if (level == 0)
            break;

        /*
         * Nope. Descend down to the next level then. Choose a child to
         * descend down to.
         */

        buffer = ReadBuffer(indexrel, blkno);
        LockBuffer(buffer, GIST_EXCLUSIVE);

        page = (Page) BufferGetPage(buffer);
        childoffnum = gistchoose(indexrel, page, itup, giststate);
        iid = PageGetItemId(page, childoffnum);
        idxtuple = (IndexTuple) PageGetItem(page, iid);
        childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));

        if (level > 1)
            gistMemorizeParent(buildstate, childblkno, blkno);

        /*
         * Check that the key representing the target child node is consistent
         * with the key we're inserting. Update it if it's not.
         */
        newtup = gistgetadjusted(indexrel, idxtuple, itup, giststate);
        if (newtup)
        {
            blkno = gistbufferinginserttuples(buildstate,
                                              buffer,
                                              level,
                                              &newtup,
                                              1,
                                              childoffnum,
                                              InvalidBlockNumber,
                                              InvalidOffsetNumber);
            /* gistbufferinginserttuples() released the buffer */
        }
        else
            UnlockReleaseBuffer(buffer);

        /* Descend to the child */
        parentblkno = blkno;
        blkno = childblkno;
        downlinkoffnum = childoffnum;
        Assert(level > 0);
        level--;
    }

    if (LEVEL_HAS_BUFFERS(level, gfbb))
    {
        /*
         * We've reached a level with buffers. Place the index tuple in the
         * buffer, and add the buffer to the emptying queue if it overflows.
         */
        GISTNodeBuffer *childNodeBuffer;

        /* Find the buffer or create a new one */
        childNodeBuffer = gistGetNodeBuffer(gfbb, giststate, blkno, level);

        /* Add the index tuple to it */
        gistPushItupToNodeBuffer(gfbb, childNodeBuffer, itup);

        if (BUFFER_OVERFLOWED(childNodeBuffer, gfbb))
            result = true;
    }
    else
    {
        /*
         * We've reached a leaf page. Place the tuple here.
         */
        Assert(level == 0);
        buffer = ReadBuffer(indexrel, blkno);
        LockBuffer(buffer, GIST_EXCLUSIVE);
        gistbufferinginserttuples(buildstate, buffer, level,
                                  &itup, 1, InvalidOffsetNumber,
                                  parentblkno, downlinkoffnum);
        /* gistbufferinginserttuples() released the buffer */
    }

    return result;
}

/*
 * Insert tuples to a given page.
 *
 * This is analogous to gistinserttuples() in the regular insertion code.
 *
 * Returns the block number of the page where the (first) new or updated
 * tuple was inserted. Usually that's the original page, but it might be a
 * sibling page if the original page was split.
 *
 * Caller should hold a lock on 'buffer' on entry. This function will unlock
 * and unpin it.
 */
static BlockNumber
gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer, int level,
                          IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
                          BlockNumber parentblk, OffsetNumber downlinkoffnum)
{
    GISTBuildBuffers *gfbb = buildstate->gfbb;
    List       *splitinfo;
    bool        is_split;
    BlockNumber placed_to_blk = InvalidBlockNumber;

    is_split = gistplacetopage(buildstate->indexrel,
                               buildstate->freespace,
                               buildstate->giststate,
                               buffer,
                               itup, ntup, oldoffnum, &placed_to_blk,
                               InvalidBuffer,
                               &splitinfo,
                               false,
                               buildstate->heaprel, true);

    /*
     * If this is a root split, update the root path item kept in memory. This
     * ensures that all path stacks are always complete, including all parent
     * nodes up to the root. That simplifies the algorithm for re-finding the
     * correct parent.
     */
    if (is_split && BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO)
    {
        Page        page = BufferGetPage(buffer);
        OffsetNumber off;
        OffsetNumber maxoff;

        Assert(level == gfbb->rootlevel);
        gfbb->rootlevel++;

        elog(DEBUG2, "splitting GiST root page, now %d levels deep", gfbb->rootlevel);

        /*
         * All the downlinks on the old root page are now on one of the child
         * pages. Visit all the new child pages to memorize the parents of the
         * grandchildren.
         */
        if (gfbb->rootlevel > 1)
        {
            maxoff = PageGetMaxOffsetNumber(page);
            for (off = FirstOffsetNumber; off <= maxoff; off++)
            {
                ItemId      iid = PageGetItemId(page, off);
                IndexTuple  idxtuple = (IndexTuple) PageGetItem(page, iid);
                BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
                Buffer      childbuf = ReadBuffer(buildstate->indexrel, childblkno);

                LockBuffer(childbuf, GIST_SHARE);
                gistMemorizeAllDownlinks(buildstate, childbuf);
                UnlockReleaseBuffer(childbuf);

                /*
                 * Also remember that the parent of the new child page is the
                 * root block.
                 */
                gistMemorizeParent(buildstate, childblkno, GIST_ROOT_BLKNO);
            }
        }
    }

    if (splitinfo)
    {
        /*
         * Insert the downlinks to the parent. This is analogous to
         * gistfinishsplit() in the regular insertion code, but the locking is
         * simpler, and we have to maintain the buffers on internal nodes and
         * the parent map.
         */
        IndexTuple *downlinks;
        int         ndownlinks,
                    i;
        Buffer      parentBuffer;
        ListCell   *lc;

        /* Parent may have changed since we memorized this path. */
        parentBuffer =
            gistBufferingFindCorrectParent(buildstate,
                                           BufferGetBlockNumber(buffer),
                                           level,
                                           &parentblk,
                                           &downlinkoffnum);

        /*
         * If there's a buffer associated with this page, that needs to be
         * split too. gistRelocateBuildBuffersOnSplit() will also adjust the
         * downlinks in 'splitinfo', to make sure they're consistent not only
         * with the tuples already on the pages, but also the tuples in the
         * buffers that will eventually be inserted to them.
         */
        gistRelocateBuildBuffersOnSplit(gfbb,
                                        buildstate->giststate,
                                        buildstate->indexrel,
                                        level,
                                        buffer, splitinfo);

        /* Create an array of all the downlink tuples */
        ndownlinks = list_length(splitinfo);
        downlinks = (IndexTuple *) palloc(sizeof(IndexTuple) * ndownlinks);
        i = 0;
        foreach(lc, splitinfo)
        {
            GISTPageSplitInfo *si = lfirst(lc);

            /*
             * Remember the parent of each new child page in our parent map.
             * This assumes that the downlinks fit on the parent page. If the
             * parent page is split, too, when we recurse up to insert the
             * downlinks, the recursive gistbufferinginserttuples() call will
             * update the map again.
             */
            if (level > 0)
                gistMemorizeParent(buildstate,
                                   BufferGetBlockNumber(si->buf),
                                   BufferGetBlockNumber(parentBuffer));

            /*
             * Also update the parent map for all the downlinks that got moved
             * to a different page. (Actually this also loops through the
             * downlinks that stayed on the original page, but it does no
             * harm.)
             */
            if (level > 1)
                gistMemorizeAllDownlinks(buildstate, si->buf);

            /*
             * Since there's no concurrent access, we can release the lower
             * level buffers immediately. This includes the original page.
             */
            UnlockReleaseBuffer(si->buf);
            downlinks[i++] = si->downlink;
        }

        /* Insert them into the parent. */
        gistbufferinginserttuples(buildstate, parentBuffer, level + 1,
                                  downlinks, ndownlinks, downlinkoffnum,
                                  InvalidBlockNumber, InvalidOffsetNumber);

        list_free_deep(splitinfo);  /* we don't need this anymore */
    }
    else
        UnlockReleaseBuffer(buffer);

    return placed_to_blk;
}

/*
 * Find the downlink pointing to a child page.
 *
 * 'childblkno' indicates the child page to find the parent for. 'level' is
 * the level of the child. On entry, *parentblkno and *downlinkoffnum can
 * point to a location where the downlink used to be - we will check that
 * location first, and save some cycles if it hasn't moved. The function
 * returns a buffer containing the downlink, exclusively-locked, and
 * *parentblkno and *downlinkoffnum are set to the real location of the
 * downlink.
 *
 * If the child page is a leaf (level == 0), the caller must supply a correct
 * parentblkno. Otherwise we use the parent map hash table to find the parent
 * block.
 *
 * This function serves the same purpose as gistFindCorrectParent() during
 * normal index inserts, but this is simpler because we don't need to deal
 * with concurrent inserts.
 */
static Buffer
gistBufferingFindCorrectParent(GISTBuildState *buildstate,
                               BlockNumber childblkno, int level,
                               BlockNumber *parentblkno,
                               OffsetNumber *downlinkoffnum)
{
    BlockNumber parent;
    Buffer      buffer;
    Page        page;
    OffsetNumber maxoff;
    OffsetNumber off;

    if (level > 0)
        parent = gistGetParent(buildstate, childblkno);
    else
    {
        /*
         * For a leaf page, the caller must supply a correct parent block
         * number.
         */
        if (*parentblkno == InvalidBlockNumber)
            elog(ERROR, "no parent buffer provided for child %u", childblkno);
        parent = *parentblkno;
    }

    buffer = ReadBuffer(buildstate->indexrel, parent);
    page = BufferGetPage(buffer);
    LockBuffer(buffer, GIST_EXCLUSIVE);
    gistcheckpage(buildstate->indexrel, buffer);
    maxoff = PageGetMaxOffsetNumber(page);

    /* Check if the downlink is still where it used to be */
    if (parent == *parentblkno && *parentblkno != InvalidBlockNumber &&
        *downlinkoffnum != InvalidOffsetNumber && *downlinkoffnum <= maxoff)
    {
        ItemId      iid = PageGetItemId(page, *downlinkoffnum);
        IndexTuple  idxtuple = (IndexTuple) PageGetItem(page, iid);

        if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno)
        {
            /* Still there */
            return buffer;
        }
    }

    /*
     * The downlink was not at the offset where it used to be. Scan the page
     * to find it. During normal gist insertions, it might've moved to another
     * page, to the right, but during a buffering build, we keep track of the
     * parent of each page in the lookup table so we should always know what
     * page it's on.
     */
    for (off = FirstOffsetNumber; off <= maxoff; off = OffsetNumberNext(off))
    {
        ItemId      iid = PageGetItemId(page, off);
        IndexTuple  idxtuple = (IndexTuple) PageGetItem(page, iid);

        if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno)
        {
            /* yes, found it */
            *downlinkoffnum = off;
            return buffer;
        }
    }

    elog(ERROR, "failed to re-find parent for block %u", childblkno);
    return InvalidBuffer;       /* keep compiler quiet */
}

/*
 * Process the buffer emptying stack. Emptying one buffer can cause emptying
 * of other buffers. This function iterates until the cascading emptying
 * process is finished, i.e. until the buffer emptying stack is empty.
 */
static void
gistProcessEmptyingQueue(GISTBuildState *buildstate)
{
    GISTBuildBuffers *gfbb = buildstate->gfbb;

    /* Iterate while we have elements in the buffer emptying stack. */
    while (gfbb->bufferEmptyingQueue != NIL)
    {
        GISTNodeBuffer *emptyingNodeBuffer;

        /* Get a node buffer from the emptying stack. */
        emptyingNodeBuffer = (GISTNodeBuffer *) linitial(gfbb->bufferEmptyingQueue);
        gfbb->bufferEmptyingQueue = list_delete_first(gfbb->bufferEmptyingQueue);
        emptyingNodeBuffer->queuedForEmptying = false;

        /*
         * We are going to load the last pages of the buffers we will empty
         * into, so unload any previously loaded buffers first.
         */
        gistUnloadNodeBuffers(gfbb);

        /*
         * Pop tuples from the buffer and run them down to the buffers at the
         * lower level, or to leaf pages. We continue until one of the lower
         * level buffers fills up, or this buffer runs empty.
         *
         * In Arge et al's paper, the buffer emptying is stopped after
         * processing 1/2 node buffer worth of tuples, to avoid overfilling
         * any of the lower level buffers. However, it's more efficient to
         * keep going until one of the lower level buffers actually fills up,
         * so that's what we do. This doesn't need to be exact; if a buffer
         * overfills by a few tuples, there's no harm done.
         */
        while (true)
        {
            IndexTuple  itup;

            /* Get the next index tuple from the buffer */
            if (!gistPopItupFromNodeBuffer(gfbb, emptyingNodeBuffer, &itup))
                break;

            /*
             * Run it down to the underlying node buffer or leaf page.
             *
             * Note: it's possible that the buffer we're emptying splits as a
             * result of this call. If that happens, our emptyingNodeBuffer
             * points to the left half of the split. After the split, it's
             * very likely that the new left buffer is no longer over the
             * half-full threshold, but we might as well keep flushing tuples
             * from it until we fill a lower-level buffer.
             */
            if (gistProcessItup(buildstate, itup, emptyingNodeBuffer->nodeBlocknum, emptyingNodeBuffer->level))
            {
                /*
                 * A lower level buffer filled up. Stop emptying this buffer,
                 * to avoid overflowing the lower level buffer.
                 */
                break;
            }

            /* Free all the memory allocated during index tuple processing */
            MemoryContextReset(buildstate->giststate->tempCxt);
        }
    }
}

/*
 * Empty all node buffers, from top to bottom. This is done at the end of the
 * index build to flush all remaining tuples to the index.
 *
 * Note: this destroys the buffersOnLevels lists, so no more tuples should be
 * inserted into the buffers after this call.
 */
static void
gistEmptyAllBuffers(GISTBuildState *buildstate)
{
    GISTBuildBuffers *gfbb = buildstate->gfbb;
    MemoryContext oldCtx;
    int         i;

    oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);

    /*
     * Iterate through the levels from top to bottom.
     */
    for (i = gfbb->buffersOnLevelsLen - 1; i >= 0; i--)
    {
        /*
         * Empty all buffers on this level. Note that new buffers can pop up
         * in the list during the processing, as a result of page splits, so a
         * simple walk through the list won't work. We remove buffers from the
         * list when we see them empty; a buffer can't become non-empty once
         * it's been fully emptied.
         */
        while (gfbb->buffersOnLevels[i] != NIL)
        {
            GISTNodeBuffer *nodeBuffer;

            nodeBuffer = (GISTNodeBuffer *) linitial(gfbb->buffersOnLevels[i]);

            if (nodeBuffer->blocksCount != 0)
            {
                /*
                 * Add this buffer to the emptying queue, and proceed to empty
                 * the queue.
                 */
                if (!nodeBuffer->queuedForEmptying)
                {
                    MemoryContextSwitchTo(gfbb->context);
                    nodeBuffer->queuedForEmptying = true;
                    gfbb->bufferEmptyingQueue =
                        lcons(nodeBuffer, gfbb->bufferEmptyingQueue);
                    MemoryContextSwitchTo(buildstate->giststate->tempCxt);
                }
                gistProcessEmptyingQueue(buildstate);
            }
            else
                gfbb->buffersOnLevels[i] =
                    list_delete_first(gfbb->buffersOnLevels[i]);
        }
        elog(DEBUG2, "emptied all buffers at level %d", i);
    }
    MemoryContextSwitchTo(oldCtx);
}

/*
 * Get the depth of the GiST index.
 */
static int
gistGetMaxLevel(Relation index)
{
    int         maxLevel;
    BlockNumber blkno;

    /*
     * Traverse down the tree, starting from the root, until we hit the leaf
     * level.
     */
    maxLevel = 0;
    blkno = GIST_ROOT_BLKNO;
    while (true)
    {
        Buffer      buffer;
        Page        page;
        IndexTuple  itup;

        buffer = ReadBuffer(index, blkno);

        /*
         * There's no concurrent access during index build, so locking is just
         * pro forma.
         */
        LockBuffer(buffer, GIST_SHARE);
        page = (Page) BufferGetPage(buffer);

        if (GistPageIsLeaf(page))
        {
            /* We hit the bottom, so we're done. */
            UnlockReleaseBuffer(buffer);
            break;
        }

        /*
         * Pick the first downlink on the page, and follow it. It doesn't
         * matter which downlink we choose, the tree has the same depth
         * everywhere, so we just pick the first one.
         */
        itup = (IndexTuple) PageGetItem(page,
                                        PageGetItemId(page, FirstOffsetNumber));
        blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
        UnlockReleaseBuffer(buffer);

        /*
         * We're descending the tree, which means there is yet one more level
         * below this one.
         */
        maxLevel++;
    }
    return maxLevel;
}


/*
 * Routines for managing the parent map.
 *
 * Whenever a page is split, we need to insert the downlinks into the parent.
 * We need to somehow find the parent page to do that. In normal insertions,
 * we keep a stack of nodes visited when we descend the tree. However, in
 * buffering build, we can start descending the tree from any internal node,
 * when we empty a buffer by cascading tuples to its children. So we don't
 * have a full stack up to the root available at that time.
 *
 * So instead, we maintain a hash table to track the parent of every internal
 * page. We don't need to track the parents of leaf nodes, however. Whenever
 * we insert to a leaf, we've just descended down from its parent, so we know
 * its immediate parent already. This helps a lot to limit the memory used
 * by this hash table.
 *
 * Whenever an internal node is split, the parent map needs to be updated:
 * the parent of the new child page needs to be recorded, and the entries
 * for all pages whose downlinks are moved to a new page by the split need
 * to be updated as well.
 *
 * We also update the parent map whenever we descend the tree. That might seem
 * unnecessary, because we maintain the map whenever a downlink is moved or
 * created, but it is needed because we switch to buffering mode after
 * creating a tree with regular index inserts. Any pages created before
 * switching to buffering mode will not be present in the parent map initially,
 * but will be added there the first time we visit them.
 */
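
/*
 * In short: descents in gistProcessItup() and page splits in
 * gistbufferinginserttuples() record parents with gistMemorizeParent();
 * after a root or internal-page split, gistMemorizeAllDownlinks() refreshes
 * the entries for every downlink on a page; and
 * gistBufferingFindCorrectParent() consults gistGetParent() when a split
 * must propagate a downlink upwards.
 */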

typedef struct
{
    BlockNumber childblkno;     /* hash key */
    BlockNumber parentblkno;
} ParentMapEntry;

static void
gistInitParentMap(GISTBuildState *buildstate)
{
    HASHCTL     hashCtl;

    hashCtl.keysize = sizeof(BlockNumber);
    hashCtl.entrysize = sizeof(ParentMapEntry);
    hashCtl.hcxt = CurrentMemoryContext;
    buildstate->parentMap = hash_create("gistbuild parent map",
                                        1024,
                                        &hashCtl,
                                        HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
}

static void
gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child, BlockNumber parent)
{
    ParentMapEntry *entry;
    bool        found;

    entry = (ParentMapEntry *) hash_search(buildstate->parentMap,
                                           (const void *) &child,
                                           HASH_ENTER,
                                           &found);
    entry->parentblkno = parent;
}

/*
 * Scan all downlinks on a page, and memorize their parent.
 */
static void
gistMemorizeAllDownlinks(GISTBuildState *buildstate, Buffer parentbuf)
{
    OffsetNumber maxoff;
    OffsetNumber off;
    BlockNumber parentblkno = BufferGetBlockNumber(parentbuf);
    Page        page = BufferGetPage(parentbuf);

    Assert(!GistPageIsLeaf(page));

    maxoff = PageGetMaxOffsetNumber(page);
    for (off = FirstOffsetNumber; off <= maxoff; off++)
    {
        ItemId      iid = PageGetItemId(page, off);
        IndexTuple  idxtuple = (IndexTuple) PageGetItem(page, iid);
        BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));

        gistMemorizeParent(buildstate, childblkno, parentblkno);
    }
}

static BlockNumber
gistGetParent(GISTBuildState *buildstate, BlockNumber child)
{
    ParentMapEntry *entry;
    bool        found;

    /* Find the parent of the child block in the hash table */
    entry = (ParentMapEntry *) hash_search(buildstate->parentMap,
                                           (const void *) &child,
                                           HASH_FIND,
                                           &found);
    if (!found)
        elog(ERROR, "could not find parent of block %u in lookup table", child);

    return entry->parentblkno;
}