/*-------------------------------------------------------------------------
 *
 * gistbuild.c
 *    implementation of the build algorithm for GiST indexes.
 *
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *    src/backend/access/gist/gistbuild.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <math.h>

#include "access/genam.h"
#include "access/gist_private.h"
#include "access/gistxlog.h"
#include "access/tableam.h"
#include "access/xloginsert.h"
#include "catalog/index.h"
#include "miscadmin.h"
#include "optimizer/optimizer.h"
#include "storage/bufmgr.h"
#include "storage/smgr.h"
#include "utils/memutils.h"
#include "utils/rel.h"

/* Interval, in tuples, between checks of whether to switch to buffering build mode */
#define BUFFERING_MODE_SWITCH_CHECK_STEP 256

/*
 * Number of tuples to process in the slow way before switching to buffering
 * mode, when buffering is explicitly turned on. Also, the number of tuples
 * to process between readjusting the buffer size parameter, while in
 * buffering mode.
 */
#define BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET 4096

typedef enum
{
    GIST_BUFFERING_DISABLED,    /* in regular build mode and aren't going to
                                 * switch */
    GIST_BUFFERING_AUTO,        /* in regular build mode, but will switch to
                                 * buffering build mode if the index grows too
                                 * big */
    GIST_BUFFERING_STATS,       /* gathering statistics of index tuple size
                                 * before switching to the buffering build
                                 * mode */
    GIST_BUFFERING_ACTIVE       /* in buffering build mode */
} GistBufferingMode;
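
/*
 * Valid transitions between the modes above: AUTO and STATS can each switch
 * to ACTIVE (in gistInitBuffering()), or to DISABLED if there isn't enough
 * memory for a buffering build. DISABLED and ACTIVE are final.
 */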

/* Working state for gistbuild and its callback */
typedef struct
{
    Relation    indexrel;
    Relation    heaprel;
    GISTSTATE  *giststate;

    int64       indtuples;      /* number of tuples indexed */
    int64       indtuplesSize;  /* total size of all indexed tuples */

    Size        freespace;      /* amount of free space to leave on pages */

    /*
     * Extra data structures used during a buffering build. 'gfbb' contains
     * information related to managing the build buffers. 'parentMap' is a
     * lookup table of the parent of each internal page.
     */
    GISTBuildBuffers *gfbb;
    HTAB       *parentMap;

    GistBufferingMode bufferingMode;
} GISTBuildState;

/* prototypes for private functions */
static void gistInitBuffering(GISTBuildState *buildstate);
static int  calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep);
static void gistBuildCallback(Relation index,
                              HeapTuple htup,
                              Datum *values,
                              bool *isnull,
                              bool tupleIsAlive,
                              void *state);
static void gistBufferingBuildInsert(GISTBuildState *buildstate,
                                     IndexTuple itup);
static bool gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
                            BlockNumber startblkno, int startlevel);
static BlockNumber gistbufferinginserttuples(GISTBuildState *buildstate,
                                             Buffer buffer, int level,
                                             IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
                                             BlockNumber parentblk, OffsetNumber downlinkoffnum);
static Buffer gistBufferingFindCorrectParent(GISTBuildState *buildstate,
                                             BlockNumber childblkno, int level,
                                             BlockNumber *parentblk,
                                             OffsetNumber *downlinkoffnum);
static void gistProcessEmptyingQueue(GISTBuildState *buildstate);
static void gistEmptyAllBuffers(GISTBuildState *buildstate);
static int  gistGetMaxLevel(Relation index);

static void gistInitParentMap(GISTBuildState *buildstate);
static void gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child,
                               BlockNumber parent);
static void gistMemorizeAllDownlinks(GISTBuildState *buildstate, Buffer parent);
static BlockNumber gistGetParent(GISTBuildState *buildstate, BlockNumber child);

/*
 * Main entry point to GiST index build. Initially calls insert over and over,
 * but switches to more efficient buffering build algorithm after a certain
 * number of tuples (unless buffering mode is disabled).
 */
IndexBuildResult *
gistbuild(Relation heap, Relation index, IndexInfo *indexInfo)
{
    IndexBuildResult *result;
    double      reltuples;
    GISTBuildState buildstate;
    Buffer      buffer;
    Page        page;
    MemoryContext oldcxt = CurrentMemoryContext;
    int         fillfactor;

    buildstate.indexrel = index;
    buildstate.heaprel = heap;
    if (index->rd_options)
    {
        /* Get buffering mode from the options string */
        GiSTOptions *options = (GiSTOptions *) index->rd_options;
        char       *bufferingMode = (char *) options + options->bufferingModeOffset;

        if (strcmp(bufferingMode, "on") == 0)
            buildstate.bufferingMode = GIST_BUFFERING_STATS;
        else if (strcmp(bufferingMode, "off") == 0)
            buildstate.bufferingMode = GIST_BUFFERING_DISABLED;
        else
            buildstate.bufferingMode = GIST_BUFFERING_AUTO;

        fillfactor = options->fillfactor;
    }
    else
    {
        /*
         * By default, switch to buffering mode when the index grows too large
         * to fit in cache.
         */
        buildstate.bufferingMode = GIST_BUFFERING_AUTO;
        fillfactor = GIST_DEFAULT_FILLFACTOR;
    }
    /* Calculate target amount of free space to leave on pages */
    buildstate.freespace = BLCKSZ * (100 - fillfactor) / 100;
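
    /*
     * For example, with the default fillfactor of 90 and the default 8 kB
     * page size, this leaves 8192 * 10 / 100 = 819 bytes unused on each
     * page.
     */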

    /*
     * We expect to be called exactly once for any index relation. If that's
     * not the case, big trouble's what we have.
     */
    if (RelationGetNumberOfBlocks(index) != 0)
        elog(ERROR, "index \"%s\" already contains data",
             RelationGetRelationName(index));

    /* no locking is needed */
    buildstate.giststate = initGISTstate(index);

    /*
     * Create a temporary memory context that is reset once for each tuple
     * processed. (Note: we don't bother to make this a child of the
     * giststate's scanCxt, so we have to delete it separately at the end.)
     */
    buildstate.giststate->tempCxt = createTempGistContext();

    /* initialize the root page */
    buffer = gistNewBuffer(index);
    Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO);
    page = BufferGetPage(buffer);

    START_CRIT_SECTION();

    GISTInitBuffer(buffer, F_LEAF);

    MarkBufferDirty(buffer);
    PageSetLSN(page, GistBuildLSN);

    UnlockReleaseBuffer(buffer);

    END_CRIT_SECTION();

    /* build the index */
    buildstate.indtuples = 0;
    buildstate.indtuplesSize = 0;

    /*
     * Do the heap scan.
     */
    reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
                                       gistBuildCallback,
                                       (void *) &buildstate, NULL);

    /*
     * If buffering was used, flush out all the tuples that are still in the
     * buffers.
     */
    if (buildstate.bufferingMode == GIST_BUFFERING_ACTIVE)
    {
        elog(DEBUG1, "all tuples processed, emptying buffers");
        gistEmptyAllBuffers(&buildstate);
        gistFreeBuildBuffers(buildstate.gfbb);
    }

    /* okay, all heap tuples are indexed */
    MemoryContextSwitchTo(oldcxt);
    MemoryContextDelete(buildstate.giststate->tempCxt);

    freeGISTstate(buildstate.giststate);

    /*
     * We didn't write WAL records as we built the index, so if WAL-logging is
     * required, write all pages to the WAL now.
     */
    if (RelationNeedsWAL(index))
    {
        log_newpage_range(index, MAIN_FORKNUM,
                          0, RelationGetNumberOfBlocks(index),
                          true);
    }

    /*
     * Return statistics
     */
    result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));

    result->heap_tuples = reltuples;
    result->index_tuples = (double) buildstate.indtuples;

    return result;
}

/*
 * Validator for "buffering" reloption on GiST indexes. Allows "on", "off"
 * and "auto" values.
 */
void
gistValidateBufferingOption(const char *value)
{
    if (value == NULL ||
        (strcmp(value, "on") != 0 &&
         strcmp(value, "off") != 0 &&
         strcmp(value, "auto") != 0))
    {
        ereport(ERROR,
                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                 errmsg("invalid value for \"buffering\" option"),
                 errdetail("Valid values are \"on\", \"off\", and \"auto\".")));
    }
}

/*
 * Attempt to switch to buffering mode.
 *
 * If there is not enough memory for buffering build, sets bufferingMode
 * to GIST_BUFFERING_DISABLED, so that we don't bother to try the switch
 * anymore. Otherwise initializes the build buffers, and sets bufferingMode to
 * GIST_BUFFERING_ACTIVE.
 */
static void
gistInitBuffering(GISTBuildState *buildstate)
{
    Relation    index = buildstate->indexrel;
    int         pagesPerBuffer;
    Size        pageFreeSpace;
    Size        itupAvgSize,
                itupMinSize;
    double      avgIndexTuplesPerPage,
                maxIndexTuplesPerPage;
    int         i;
    int         levelStep;

    /* Calculate the space on an index page that is available for index tuples */
    pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)
        - sizeof(ItemIdData)
        - buildstate->freespace;

    /*
     * Calculate the average size of the index tuples inserted so far, using
     * the gathered statistics.
     */
    itupAvgSize = (double) buildstate->indtuplesSize /
        (double) buildstate->indtuples;

    /*
     * Calculate the minimal possible size of an index tuple from the index
     * metadata. The minimal possible size of a varlena is VARHDRSZ.
     *
     * XXX: that's not actually true, as a short varlena can be just 2 bytes.
     * And we should take padding into account here.
     */
    itupMinSize = (Size) MAXALIGN(sizeof(IndexTupleData));
    for (i = 0; i < index->rd_att->natts; i++)
    {
        if (TupleDescAttr(index->rd_att, i)->attlen < 0)
            itupMinSize += VARHDRSZ;
        else
            itupMinSize += TupleDescAttr(index->rd_att, i)->attlen;
    }

    /* Calculate the average and maximal number of index tuples that fit on a page */
    avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize;
    maxIndexTuplesPerPage = pageFreeSpace / itupMinSize;

    /*
     * We need to calculate two parameters for the buffering algorithm:
     * levelStep and pagesPerBuffer.
     *
     * levelStep determines the size of subtree that we operate on, while
     * emptying a buffer. A higher value is better, as you need fewer buffer
     * emptying steps to build the index. However, if you set it too high, the
     * subtree doesn't fit in cache anymore, and you quickly lose the benefit
     * of the buffers.
     *
     * In Arge et al's paper, levelStep is chosen as logB(M/4B), where B is
     * the number of tuples on page (ie. fanout), and M is the amount of
     * internal memory available. Curiously, they don't explain *why* that
     * setting is optimal. We calculate it by taking the highest levelStep so
     * that a subtree still fits in cache. For a small B, our way of
     * calculating levelStep is very close to Arge et al's formula. For a
     * large B, our formula gives a value that is 2x higher.
     *
     * The average size (in pages) of a subtree of depth n can be calculated
     * as a geometric series:
     *
     * B^0 + B^1 + B^2 + ... + B^n = (1 - B^(n + 1)) / (1 - B)
     *
     * where B is the average number of index tuples on page. The subtree is
     * cached in the shared buffer cache and the OS cache, so we choose
     * levelStep so that the subtree size is comfortably smaller than
     * effective_cache_size, with a safety factor of 4.
     *
     * The estimate on the average number of index tuples on page is based on
     * average tuple sizes observed before switching to buffered build, so the
     * real subtree size can be somewhat larger. Also, it would be selfish to
     * gobble the whole cache for our index build. The safety factor of 4
     * should account for those effects.
     *
     * The other limiting factor for setting levelStep is that while
     * processing a subtree, we need to hold one page for each buffer at the
     * next lower buffered level. The max. number of buffers needed for that
     * is maxIndexTuplesPerPage^levelStep. This is very conservative, but
     * hopefully maintenance_work_mem is set high enough that you're
     * constrained by effective_cache_size rather than maintenance_work_mem.
     *
     * XXX: the buffer hash table consumes a fair amount of memory too per
     * buffer, but that is not currently taken into account. That scales with
     * the total number of buffers used, ie. the index size, and with
     * levelStep. Note that a higher levelStep *reduces* the amount of memory
     * needed for the hash table.
     */
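
    /*
     * A worked example of the cache constraint, assuming an average of 80
     * index tuples per page (B = 80) and an effective_cache_size of 4 GB of
     * 8 kB pages, i.e. 524288 pages: a levelStep of 1 covers subtrees of
     * (1 - 80^2) / (1 - 80) = 81 pages, levelStep 2 covers 6481 pages, and
     * levelStep 3 covers about 518000 pages, which exceeds the 131072-page
     * budget (a quarter of the cache). So, unless the maintenance_work_mem
     * check binds first, the loop below settles on 2.
     */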
    levelStep = 1;
    for (;;)
    {
        double      subtreesize;
        double      maxlowestlevelpages;

        /* size of an average subtree at this levelStep (in pages) */
        subtreesize =
            (1 - pow(avgIndexTuplesPerPage, (double) (levelStep + 1))) /
            (1 - avgIndexTuplesPerPage);

        /* max number of pages at the lowest level of a subtree */
        maxlowestlevelpages = pow(maxIndexTuplesPerPage, (double) levelStep);

        /* subtree must fit in cache (with safety factor of 4) */
        if (subtreesize > effective_cache_size / 4)
            break;

        /* each node in the lowest level of a subtree has one page in memory */
        if (maxlowestlevelpages > ((double) maintenance_work_mem * 1024) / BLCKSZ)
            break;

        /* Good, we can handle this levelStep. See if we can go one higher. */
        levelStep++;
    }

    /*
     * The loop above exited when levelStep had just grown past the last
     * acceptable value, so back it off by one.
     */
    levelStep--;

    /*
     * If there's not enough cache or maintenance_work_mem, fall back to plain
     * inserts.
     */
    if (levelStep <= 0)
    {
        elog(DEBUG1, "failed to switch to buffered GiST build");
        buildstate->bufferingMode = GIST_BUFFERING_DISABLED;
        return;
    }

    /*
     * The second parameter to set is pagesPerBuffer, which determines the
     * size of each buffer. We adjust pagesPerBuffer also during the build,
     * which is why this calculation is in a separate function.
     */
    pagesPerBuffer = calculatePagesPerBuffer(buildstate, levelStep);

    /* Initialize GISTBuildBuffers with these parameters */
    buildstate->gfbb = gistInitBuildBuffers(pagesPerBuffer, levelStep,
                                            gistGetMaxLevel(index));

    gistInitParentMap(buildstate);

    buildstate->bufferingMode = GIST_BUFFERING_ACTIVE;

    elog(DEBUG1, "switched to buffered GiST build; level step = %d, pagesPerBuffer = %d",
         levelStep, pagesPerBuffer);
}

/*
 * Calculate pagesPerBuffer parameter for the buffering algorithm.
 *
 * Buffer size is chosen so that assuming that tuples are distributed
 * randomly, emptying half a buffer fills on average one page in every buffer
 * at the next lower level.
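 *
 * To see why (a sketch, with B standing for the average number of index
 * tuples per page): half a buffer holds (pagesPerBuffer / 2) * B tuples,
 * and there are B^levelStep buffers at the next lower buffered level, each
 * of which needs B tuples to fill one page. Equating the two,
 * (pagesPerBuffer / 2) * B = B^(levelStep + 1), gives
 * pagesPerBuffer = 2 * B^levelStep, which is the formula used below.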
 */
static int
calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep)
{
    double      pagesPerBuffer;
    double      avgIndexTuplesPerPage;
    double      itupAvgSize;
    Size        pageFreeSpace;

    /* Calculate the space on an index page that is available for index tuples */
    pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)
        - sizeof(ItemIdData)
        - buildstate->freespace;

    /*
     * Calculate the average size of the index tuples inserted so far, using
     * the gathered statistics.
     */
    itupAvgSize = (double) buildstate->indtuplesSize /
        (double) buildstate->indtuples;

    avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize;

    /*
     * Recalculate the required size of the buffers.
     */
    pagesPerBuffer = 2 * pow(avgIndexTuplesPerPage, levelStep);

    return (int) rint(pagesPerBuffer);
}

/*
 * Per-tuple callback for table_index_build_scan.
 */
static void
gistBuildCallback(Relation index,
                  HeapTuple htup,
                  Datum *values,
                  bool *isnull,
                  bool tupleIsAlive,
                  void *state)
{
    GISTBuildState *buildstate = (GISTBuildState *) state;
    IndexTuple  itup;
    MemoryContext oldCtx;

    oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);

    /* form an index tuple and point it at the heap tuple */
    itup = gistFormTuple(buildstate->giststate, index, values, isnull, true);
    itup->t_tid = htup->t_self;

    if (buildstate->bufferingMode == GIST_BUFFERING_ACTIVE)
    {
        /* We have buffers, so use them. */
        gistBufferingBuildInsert(buildstate, itup);
    }
    else
    {
        /*
         * There are no buffers (yet). Since we already have the index
         * relation locked, we call gistdoinsert directly.
         */
        gistdoinsert(index, itup, buildstate->freespace,
                     buildstate->giststate, buildstate->heaprel, true);
    }

    /* Update tuple count and total size. */
    buildstate->indtuples += 1;
    buildstate->indtuplesSize += IndexTupleSize(itup);

    MemoryContextSwitchTo(oldCtx);
    MemoryContextReset(buildstate->giststate->tempCxt);

    if (buildstate->bufferingMode == GIST_BUFFERING_ACTIVE &&
        buildstate->indtuples % BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET == 0)
    {
        /* Adjust the target buffer size now */
        buildstate->gfbb->pagesPerBuffer =
            calculatePagesPerBuffer(buildstate, buildstate->gfbb->levelStep);
    }

    /*
     * In 'auto' mode, check if the index has grown too large to fit in cache,
     * and switch to buffering mode if it has.
     *
     * To avoid excessive calls to smgrnblocks(), only check this every
     * BUFFERING_MODE_SWITCH_CHECK_STEP index tuples.
     */
    if ((buildstate->bufferingMode == GIST_BUFFERING_AUTO &&
         buildstate->indtuples % BUFFERING_MODE_SWITCH_CHECK_STEP == 0 &&
         effective_cache_size < smgrnblocks(index->rd_smgr, MAIN_FORKNUM)) ||
        (buildstate->bufferingMode == GIST_BUFFERING_STATS &&
         buildstate->indtuples >= BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET))
    {
        /*
         * Index doesn't fit in effective cache anymore. Try to switch to
         * buffering build mode.
         */
        gistInitBuffering(buildstate);
    }
}

/*
 * Insert function for buffering index build.
 */
static void
gistBufferingBuildInsert(GISTBuildState *buildstate, IndexTuple itup)
{
    /* Insert the tuple into the buffers. */
    gistProcessItup(buildstate, itup, 0, buildstate->gfbb->rootlevel);

    /* If we filled up (half of a) buffer, process buffer emptying. */
    gistProcessEmptyingQueue(buildstate);
}

/*
 * Process an index tuple. Runs the tuple down the tree until we reach a leaf
 * page or node buffer, and inserts the tuple there. Returns true if we have
 * to stop the buffer emptying process (because one of the child buffers
 * can't take index tuples anymore).
 */
static bool
gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
                BlockNumber startblkno, int startlevel)
{
    GISTSTATE  *giststate = buildstate->giststate;
    GISTBuildBuffers *gfbb = buildstate->gfbb;
    Relation    indexrel = buildstate->indexrel;
    BlockNumber childblkno;
    Buffer      buffer;
    bool        result = false;
    BlockNumber blkno;
    int         level;
    OffsetNumber downlinkoffnum = InvalidOffsetNumber;
    BlockNumber parentblkno = InvalidBlockNumber;

    CHECK_FOR_INTERRUPTS();

    /*
     * Loop until we reach a leaf page (level == 0) or a level with buffers
     * (not including the level we start at, because we would otherwise make
     * no progress).
     */
    blkno = startblkno;
    level = startlevel;
    for (;;)
    {
        ItemId      iid;
        IndexTuple  idxtuple,
                    newtup;
        Page        page;
        OffsetNumber childoffnum;

        /* Have we reached a level with buffers? */
        if (LEVEL_HAS_BUFFERS(level, gfbb) && level != startlevel)
            break;

        /* Have we reached a leaf page? */
        if (level == 0)
            break;

        /*
         * Nope. Descend to the next level then; choose a child to descend
         * down to.
         */

        buffer = ReadBuffer(indexrel, blkno);
        LockBuffer(buffer, GIST_EXCLUSIVE);

        page = (Page) BufferGetPage(buffer);
        childoffnum = gistchoose(indexrel, page, itup, giststate);
        iid = PageGetItemId(page, childoffnum);
        idxtuple = (IndexTuple) PageGetItem(page, iid);
        childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));

        if (level > 1)
            gistMemorizeParent(buildstate, childblkno, blkno);

        /*
         * Check that the key representing the target child node is consistent
         * with the key we're inserting. Update it if it's not.
         */
        newtup = gistgetadjusted(indexrel, idxtuple, itup, giststate);
        if (newtup)
        {
            blkno = gistbufferinginserttuples(buildstate,
                                              buffer,
                                              level,
                                              &newtup,
                                              1,
                                              childoffnum,
                                              InvalidBlockNumber,
                                              InvalidOffsetNumber);
            /* gistbufferinginserttuples() released the buffer */
        }
        else
            UnlockReleaseBuffer(buffer);

        /* Descend to the child */
        parentblkno = blkno;
        blkno = childblkno;
        downlinkoffnum = childoffnum;
        Assert(level > 0);
        level--;
    }

    if (LEVEL_HAS_BUFFERS(level, gfbb))
    {
        /*
         * We've reached a level with buffers. Place the index tuple in the
         * buffer, and add the buffer to the emptying queue if it overflows.
         */
        GISTNodeBuffer *childNodeBuffer;

        /* Find the buffer or create a new one */
        childNodeBuffer = gistGetNodeBuffer(gfbb, giststate, blkno, level);

        /* Add index tuple to it */
        gistPushItupToNodeBuffer(gfbb, childNodeBuffer, itup);

        if (BUFFER_OVERFLOWED(childNodeBuffer, gfbb))
            result = true;
    }
    else
    {
        /*
         * We've reached a leaf page. Place the tuple here.
         */
        Assert(level == 0);
        buffer = ReadBuffer(indexrel, blkno);
        LockBuffer(buffer, GIST_EXCLUSIVE);
        gistbufferinginserttuples(buildstate, buffer, level,
                                  &itup, 1, InvalidOffsetNumber,
                                  parentblkno, downlinkoffnum);
        /* gistbufferinginserttuples() released the buffer */
    }

    return result;
}

/*
 * Insert tuples to a given page.
 *
 * This is analogous to gistinserttuples() in the regular insertion code.
 *
 * Returns the block number of the page where the (first) new or updated tuple
 * was inserted. Usually that's the original page, but might be a sibling page
 * if the original page was split.
 *
 * Caller should hold a lock on 'buffer' on entry. This function will unlock
 * and unpin it.
 */
static BlockNumber
gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer, int level,
                          IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
                          BlockNumber parentblk, OffsetNumber downlinkoffnum)
{
    GISTBuildBuffers *gfbb = buildstate->gfbb;
    List       *splitinfo;
    bool        is_split;
    BlockNumber placed_to_blk = InvalidBlockNumber;

    is_split = gistplacetopage(buildstate->indexrel,
                               buildstate->freespace,
                               buildstate->giststate,
                               buffer,
                               itup, ntup, oldoffnum, &placed_to_blk,
                               InvalidBuffer,
                               &splitinfo,
                               false,
                               buildstate->heaprel, true);

    /*
     * If this is a root split, update the root path item kept in memory. This
     * ensures that all path stacks are always complete, including all parent
     * nodes up to the root. That simplifies the algorithm for re-finding the
     * correct parent.
     */
    if (is_split && BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO)
    {
        Page        page = BufferGetPage(buffer);
        OffsetNumber off;
        OffsetNumber maxoff;

        Assert(level == gfbb->rootlevel);
        gfbb->rootlevel++;

        elog(DEBUG2, "splitting GiST root page, now %d levels deep", gfbb->rootlevel);

        /*
         * All the downlinks on the old root page are now on one of the child
         * pages. Visit all the new child pages to memorize the parents of the
         * grandchildren.
         */
        if (gfbb->rootlevel > 1)
        {
            maxoff = PageGetMaxOffsetNumber(page);
            for (off = FirstOffsetNumber; off <= maxoff; off++)
            {
                ItemId      iid = PageGetItemId(page, off);
                IndexTuple  idxtuple = (IndexTuple) PageGetItem(page, iid);
                BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
                Buffer      childbuf = ReadBuffer(buildstate->indexrel, childblkno);

                LockBuffer(childbuf, GIST_SHARE);
                gistMemorizeAllDownlinks(buildstate, childbuf);
                UnlockReleaseBuffer(childbuf);

                /*
                 * Also remember that the parent of the new child page is the
                 * root block.
                 */
                gistMemorizeParent(buildstate, childblkno, GIST_ROOT_BLKNO);
            }
        }
    }

    if (splitinfo)
    {
        /*
         * Insert the downlinks to the parent. This is analogous to
         * gistfinishsplit() in the regular insertion code, but the locking is
         * simpler, and we have to maintain the buffers on internal nodes and
         * the parent map.
         */
        IndexTuple *downlinks;
        int         ndownlinks,
                    i;
        Buffer      parentBuffer;
        ListCell   *lc;

        /* Parent may have changed since we memorized this path. */
        parentBuffer =
            gistBufferingFindCorrectParent(buildstate,
                                           BufferGetBlockNumber(buffer),
                                           level,
                                           &parentblk,
                                           &downlinkoffnum);

        /*
         * If there's a buffer associated with this page, that needs to be
         * split too. gistRelocateBuildBuffersOnSplit() will also adjust the
         * downlinks in 'splitinfo', to make sure they're consistent not only
         * with the tuples already on the pages, but also the tuples in the
         * buffers that will eventually be inserted to them.
         */
        gistRelocateBuildBuffersOnSplit(gfbb,
                                        buildstate->giststate,
                                        buildstate->indexrel,
                                        level,
                                        buffer, splitinfo);

        /* Create an array of all the downlink tuples */
        ndownlinks = list_length(splitinfo);
        downlinks = (IndexTuple *) palloc(sizeof(IndexTuple) * ndownlinks);
        i = 0;
        foreach(lc, splitinfo)
        {
            GISTPageSplitInfo *splitinfo = lfirst(lc);

            /*
             * Remember the parent of each new child page in our parent map.
             * This assumes that the downlinks fit on the parent page. If the
             * parent page is split, too, when we recurse up to insert the
             * downlinks, the recursive gistbufferinginserttuples() call will
             * update the map again.
             */
            if (level > 0)
                gistMemorizeParent(buildstate,
                                   BufferGetBlockNumber(splitinfo->buf),
                                   BufferGetBlockNumber(parentBuffer));

            /*
             * Also update the parent map for all the downlinks that got moved
             * to a different page. (Actually this also loops through the
             * downlinks that stayed on the original page, but it does no
             * harm.)
             */
            if (level > 1)
                gistMemorizeAllDownlinks(buildstate, splitinfo->buf);

            /*
             * Since there's no concurrent access, we can release the lower
             * level buffers immediately. This includes the original page.
             */
            UnlockReleaseBuffer(splitinfo->buf);
            downlinks[i++] = splitinfo->downlink;
        }

        /* Insert them into parent. */
        gistbufferinginserttuples(buildstate, parentBuffer, level + 1,
                                  downlinks, ndownlinks, downlinkoffnum,
                                  InvalidBlockNumber, InvalidOffsetNumber);

        list_free_deep(splitinfo);  /* we don't need this anymore */
    }
    else
        UnlockReleaseBuffer(buffer);

    return placed_to_blk;
}

/*
 * Find the downlink pointing to a child page.
 *
 * 'childblkno' indicates the child page to find the parent for. 'level' is
 * the level of the child. On entry, *parentblkno and *downlinkoffnum can
 * point to a location where the downlink used to be - we will check that
 * location first, and save some cycles if it hasn't moved. The function
 * returns a buffer containing the downlink, exclusively-locked, and
 * *parentblkno and *downlinkoffnum are set to the real location of the
 * downlink.
 *
 * If the child page is a leaf (level == 0), the caller must supply a correct
 * parentblkno. Otherwise we use the parent map hash table to find the parent
 * block.
 *
 * This function serves the same purpose as gistFindCorrectParent() during
 * normal index inserts, but this is simpler because we don't need to deal
 * with concurrent inserts.
 */
static Buffer
gistBufferingFindCorrectParent(GISTBuildState *buildstate,
                               BlockNumber childblkno, int level,
                               BlockNumber *parentblkno,
                               OffsetNumber *downlinkoffnum)
{
    BlockNumber parent;
    Buffer      buffer;
    Page        page;
    OffsetNumber maxoff;
    OffsetNumber off;

    if (level > 0)
        parent = gistGetParent(buildstate, childblkno);
    else
    {
        /*
         * For a leaf page, the caller must supply a correct parent block
         * number.
         */
        if (*parentblkno == InvalidBlockNumber)
            elog(ERROR, "no parent buffer provided for child %u", childblkno);
        parent = *parentblkno;
    }

    buffer = ReadBuffer(buildstate->indexrel, parent);
    page = BufferGetPage(buffer);
    LockBuffer(buffer, GIST_EXCLUSIVE);
    gistcheckpage(buildstate->indexrel, buffer);
    maxoff = PageGetMaxOffsetNumber(page);

    /* Check if the downlink is still where it used to be */
    if (parent == *parentblkno && *parentblkno != InvalidBlockNumber &&
        *downlinkoffnum != InvalidOffsetNumber && *downlinkoffnum <= maxoff)
    {
        ItemId      iid = PageGetItemId(page, *downlinkoffnum);
        IndexTuple  idxtuple = (IndexTuple) PageGetItem(page, iid);

        if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno)
        {
            /* Still there */
            return buffer;
        }
    }

    /*
     * The downlink was not at the offset where it used to be. Scan the page
     * to find it. During normal gist insertions, it might've moved to another
     * page, to the right, but during a buffering build, we keep track of the
     * parent of each page in the lookup table so we should always know what
     * page it's on.
     */
    for (off = FirstOffsetNumber; off <= maxoff; off = OffsetNumberNext(off))
    {
        ItemId      iid = PageGetItemId(page, off);
        IndexTuple  idxtuple = (IndexTuple) PageGetItem(page, iid);

        if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno)
        {
            /* found it */
            *downlinkoffnum = off;
            return buffer;
        }
    }

    elog(ERROR, "failed to re-find parent for block %u", childblkno);
    return InvalidBuffer;       /* keep compiler quiet */
}

/*
 * Process the buffer emptying queue. Emptying one buffer can cause other
 * buffers to require emptying. This function iterates until the cascading
 * emptying process is finished, i.e. until the buffer emptying queue is
 * empty.
 */
static void
gistProcessEmptyingQueue(GISTBuildState *buildstate)
{
    GISTBuildBuffers *gfbb = buildstate->gfbb;

    /* Iterate while we have elements in the buffer emptying queue. */
    while (gfbb->bufferEmptyingQueue != NIL)
    {
        GISTNodeBuffer *emptyingNodeBuffer;

        /* Get a node buffer from the emptying queue. */
        emptyingNodeBuffer = (GISTNodeBuffer *) linitial(gfbb->bufferEmptyingQueue);
        gfbb->bufferEmptyingQueue = list_delete_first(gfbb->bufferEmptyingQueue);
        emptyingNodeBuffer->queuedForEmptying = false;

        /*
         * We are going to load the last pages of the buffers that the
         * emptying will write to, so unload any previously loaded buffers
         * first.
         */
        gistUnloadNodeBuffers(gfbb);

        /*
         * Pop tuples from the buffer and run them down to the buffers at
         * lower level, or leaf pages. We continue until one of the lower
         * level buffers fills up, or this buffer runs empty.
         *
         * In Arge et al's paper, the buffer emptying is stopped after
         * processing 1/2 node buffer worth of tuples, to avoid overfilling
         * any of the lower level buffers. However, it's more efficient to
         * keep going until one of the lower level buffers actually fills up,
         * so that's what we do. This doesn't need to be exact, if a buffer
         * overfills by a few tuples, there's no harm done.
         */
        while (true)
        {
            IndexTuple  itup;

            /* Get next index tuple from the buffer */
            if (!gistPopItupFromNodeBuffer(gfbb, emptyingNodeBuffer, &itup))
                break;

            /*
             * Run it down to the underlying node buffer or leaf page.
             *
             * Note: it's possible that the buffer we're emptying splits as a
             * result of this call. If that happens, our emptyingNodeBuffer
             * points to the left half of the split. After split, it's very
             * likely that the new left buffer is no longer over the half-full
             * threshold, but we might as well keep flushing tuples from it
             * until we fill a lower-level buffer.
             */
            if (gistProcessItup(buildstate, itup, emptyingNodeBuffer->nodeBlocknum, emptyingNodeBuffer->level))
            {
                /*
                 * A lower level buffer filled up. Stop emptying this buffer,
                 * to avoid overflowing the lower level buffer.
                 */
                break;
            }

            /* Free all the memory allocated during index tuple processing */
            MemoryContextReset(buildstate->giststate->tempCxt);
        }
    }
}

/*
 * Empty all node buffers, from top to bottom. This is done at the end of
 * index build to flush all remaining tuples to the index.
 *
 * Note: This destroys the buffersOnLevels lists, so no more tuples should be
 * inserted into the buffers after this call.
 */
static void
gistEmptyAllBuffers(GISTBuildState *buildstate)
{
    GISTBuildBuffers *gfbb = buildstate->gfbb;
    MemoryContext oldCtx;
    int         i;

    oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);

    /*
     * Iterate through the levels from top to bottom.
     */
    for (i = gfbb->buffersOnLevelsLen - 1; i >= 0; i--)
    {
        /*
         * Empty all buffers on this level. Note that new buffers can pop up
         * in the list during the processing, as a result of page splits, so a
         * simple walk through the list won't work. We remove buffers from the
         * list when we see them empty; a buffer can't become non-empty once
         * it's been fully emptied.
         */
        while (gfbb->buffersOnLevels[i] != NIL)
        {
            GISTNodeBuffer *nodeBuffer;

            nodeBuffer = (GISTNodeBuffer *) linitial(gfbb->buffersOnLevels[i]);

            if (nodeBuffer->blocksCount != 0)
            {
                /*
                 * Add this buffer to the emptying queue, and proceed to empty
                 * the queue.
                 */
                if (!nodeBuffer->queuedForEmptying)
                {
                    MemoryContextSwitchTo(gfbb->context);
                    nodeBuffer->queuedForEmptying = true;
                    gfbb->bufferEmptyingQueue =
                        lcons(nodeBuffer, gfbb->bufferEmptyingQueue);
                    MemoryContextSwitchTo(buildstate->giststate->tempCxt);
                }
                gistProcessEmptyingQueue(buildstate);
            }
            else
                gfbb->buffersOnLevels[i] =
                    list_delete_first(gfbb->buffersOnLevels[i]);
        }
        elog(DEBUG2, "emptied all buffers at level %d", i);
    }
    MemoryContextSwitchTo(oldCtx);
}

/*
 * Get the depth of the GiST index.
 */
static int
gistGetMaxLevel(Relation index)
{
    int         maxLevel;
    BlockNumber blkno;

    /*
     * Traverse down the tree, starting from the root, until we hit the leaf
     * level.
     */
    maxLevel = 0;
    blkno = GIST_ROOT_BLKNO;
    while (true)
    {
        Buffer      buffer;
        Page        page;
        IndexTuple  itup;

        buffer = ReadBuffer(index, blkno);

        /*
         * There's no concurrent access during index build, so locking is just
         * pro forma.
         */
        LockBuffer(buffer, GIST_SHARE);
        page = (Page) BufferGetPage(buffer);

        if (GistPageIsLeaf(page))
        {
            /* We hit the bottom, so we're done. */
            UnlockReleaseBuffer(buffer);
            break;
        }

        /*
         * Pick the first downlink on the page, and follow it. It doesn't
         * matter which downlink we choose; the tree has the same depth
         * everywhere, so we just pick the first one.
         */
        itup = (IndexTuple) PageGetItem(page,
                                        PageGetItemId(page, FirstOffsetNumber));
        blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
        UnlockReleaseBuffer(buffer);

        /*
         * We descended one level, which means the tree has at least one more
         * level below the one we just left.
         */
        maxLevel++;
    }
    return maxLevel;
}


/*
 * Routines for managing the parent map.
 *
 * Whenever a page is split, we need to insert the downlinks into the parent.
 * We need to somehow find the parent page to do that. In normal insertions,
 * we keep a stack of nodes visited when we descend the tree. However, in
 * buffering build, we can start descending the tree from any internal node,
 * when we empty a buffer by cascading tuples to its children. So we don't
 * have a full stack up to the root available at that time.
 *
 * So instead, we maintain a hash table to track the parent of every internal
 * page. We don't need to track the parents of leaf nodes, however. Whenever
 * we insert to a leaf, we've just descended down from its parent, so we know
 * its immediate parent already. This helps a lot to limit the memory used
 * by this hash table.
 *
 * Whenever an internal node is split, the parent map needs to be updated:
 * the parent of the new child page needs to be recorded, and the entries
 * for all pages whose downlinks are moved to the new page at the split need
 * to be updated as well.
 *
 * We also update the parent map whenever we descend the tree. That might seem
 * unnecessary, because we maintain the map whenever a downlink is moved or
 * created, but it is needed because we switch to buffering mode after
 * creating a tree with regular index inserts. Any pages created before
 * switching to buffering mode will not be present in the parent map initially,
 * but will be added there the first time we visit them.
 */
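
/*
 * The map is cheap to maintain: there is one entry per internal page, and
 * each entry stores just a pair of block numbers (see ParentMapEntry below).
 * Internal pages are typically only a small fraction of the index.
 */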

typedef struct
{
    BlockNumber childblkno;     /* hash key */
    BlockNumber parentblkno;
} ParentMapEntry;

static void
gistInitParentMap(GISTBuildState *buildstate)
{
    HASHCTL     hashCtl;

    hashCtl.keysize = sizeof(BlockNumber);
    hashCtl.entrysize = sizeof(ParentMapEntry);
    hashCtl.hcxt = CurrentMemoryContext;
    buildstate->parentMap = hash_create("gistbuild parent map",
                                        1024,
                                        &hashCtl,
                                        HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
}

static void
gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child, BlockNumber parent)
{
    ParentMapEntry *entry;
    bool        found;

    entry = (ParentMapEntry *) hash_search(buildstate->parentMap,
                                           (const void *) &child,
                                           HASH_ENTER,
                                           &found);
    entry->parentblkno = parent;
}

/*
 * Scan all downlinks on a page, and memorize their parent.
 */
static void
gistMemorizeAllDownlinks(GISTBuildState *buildstate, Buffer parentbuf)
{
    OffsetNumber maxoff;
    OffsetNumber off;
    BlockNumber parentblkno = BufferGetBlockNumber(parentbuf);
    Page        page = BufferGetPage(parentbuf);

    Assert(!GistPageIsLeaf(page));

    maxoff = PageGetMaxOffsetNumber(page);
    for (off = FirstOffsetNumber; off <= maxoff; off++)
    {
        ItemId      iid = PageGetItemId(page, off);
        IndexTuple  idxtuple = (IndexTuple) PageGetItem(page, iid);
        BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));

        gistMemorizeParent(buildstate, childblkno, parentblkno);
    }
}

static BlockNumber
gistGetParent(GISTBuildState *buildstate, BlockNumber child)
{
    ParentMapEntry *entry;
    bool        found;

    /* Look up the parent block in the hash table */
    entry = (ParentMapEntry *) hash_search(buildstate->parentMap,
                                           (const void *) &child,
                                           HASH_FIND,
                                           &found);
    if (!found)
        elog(ERROR, "could not find parent of block %u in lookup table", child);

    return entry->parentblkno;
}