| 1 | /*------------------------------------------------------------------------- |
| 2 | * |
| 3 | * nbtsort.c |
| 4 | * Build a btree from sorted input by loading leaf pages sequentially. |
| 5 | * |
| 6 | * NOTES |
| 7 | * |
| 8 | * We use tuplesort.c to sort the given index tuples into order. |
| 9 | * Then we scan the index tuples in order and build the btree pages |
| 10 | * for each level. We load source tuples into leaf-level pages. |
| 11 | * Whenever we fill a page at one level, we add a link to it to its |
| 12 | * parent level (starting a new parent level if necessary). When |
| 13 | * done, we write out each final page on each level, adding it to |
| 14 | * its parent level. When we have only one page on a level, it must be |
| 15 | * the root -- it can be attached to the btree metapage and we are done. |
| 16 | * |
| 17 | * It is not wise to pack the pages entirely full, since then *any* |
| 18 | * insertion would cause a split (and not only of the leaf page; the need |
| 19 | * for a split would cascade right up the tree). The steady-state load |
| 20 | * factor for btrees is usually estimated at 70%. We choose to pack leaf |
| 21 | * pages to the user-controllable fill factor (default 90%) while upper pages |
| 22 | * are always packed to 70%. This gives us reasonable density (there aren't |
| 23 | * many upper pages if the keys are reasonable-size) without risking a lot of |
| 24 | * cascading splits during early insertions. |
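| | * |
| | * For example, with the default 8192-byte BLCKSZ, _bt_pagestate() treats an |
| | * upper page as "full" once its free space falls below |
| | * BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100 = 2457 bytes, while a leaf |
| | * page built with the default fillfactor of 90 is considered full once free |
| | * space falls below 819 bytes. |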
| 25 | * |
| 26 | * Formerly the index pages being built were kept in shared buffers, but |
| 27 | * that is of no value (since other backends have no interest in them yet) |
| 28 | * and it created locking problems for CHECKPOINT, because the upper-level |
| 29 | * pages were held exclusive-locked for long periods. Now we just build |
| 30 | * the pages in local memory and smgrwrite or smgrextend them as we finish |
| 31 | * them. They will need to be re-read into shared buffers on first use after |
| 32 | * the build finishes. |
| 33 | * |
| 34 | * Since the index will never be used unless it is completely built, |
| 35 | * from a crash-recovery point of view there is no need to WAL-log the |
| 36 | * steps of the build. After completing the index build, we can just sync |
| 37 | * the whole file to disk using smgrimmedsync() before exiting this module. |
| 38 | * This can be seen to be sufficient for crash recovery by considering that |
| 39 | * it's effectively equivalent to what would happen if a CHECKPOINT occurred |
| 40 | * just after the index build. However, it is clearly not sufficient if the |
| 41 | * DBA is using the WAL log for PITR or replication purposes, since another |
| 42 | * machine would not be able to reconstruct the index from WAL. Therefore, |
| 43 | * we log the completed index pages to WAL if and only if WAL archiving is |
| 44 | * active. |
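| | * |
| | * (In code terms, _bt_leafbuild() sets btws_use_wal to XLogIsNeeded() && |
| | * RelationNeedsWAL(index), and _bt_load() finishes with an smgrimmedsync() |
| | * call whenever the index is WAL-logged.) |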
| 45 | * |
| 46 | * This code isn't concerned about the FSM at all. The caller is responsible |
| 47 | * for initializing that. |
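| | * |
| | * Control flow, in brief: btbuild() creates the spool(s) via |
| | * _bt_spools_heapscan() (optionally launching parallel workers), then |
| | * _bt_leafbuild() performs the sort(s) and hands off to _bt_load(), which |
| | * feeds each sorted tuple to _bt_buildadd() and finally calls |
| | * _bt_uppershutdown() to finish the upper levels and write the metapage. |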
| 48 | * |
| 49 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
| 50 | * Portions Copyright (c) 1994, Regents of the University of California |
| 51 | * |
| 52 | * IDENTIFICATION |
| 53 | * src/backend/access/nbtree/nbtsort.c |
| 54 | * |
| 55 | *------------------------------------------------------------------------- |
| 56 | */ |
| 57 | |
| 58 | #include "postgres.h" |
| 59 | |
| 60 | #include "access/nbtree.h" |
| 61 | #include "access/parallel.h" |
| 62 | #include "access/relscan.h" |
| 63 | #include "access/table.h" |
| 64 | #include "access/tableam.h" |
| 65 | #include "access/xact.h" |
| 66 | #include "access/xlog.h" |
| 67 | #include "access/xloginsert.h" |
| 68 | #include "catalog/index.h" |
| 69 | #include "commands/progress.h" |
| 70 | #include "miscadmin.h" |
| 71 | #include "pgstat.h" |
| 72 | #include "storage/smgr.h" |
| 73 | #include "tcop/tcopprot.h" /* pgrminclude ignore */ |
| 74 | #include "utils/rel.h" |
| 75 | #include "utils/sortsupport.h" |
| 76 | #include "utils/tuplesort.h" |
| 77 | |
| 78 | |
| 79 | /* Magic numbers for parallel state sharing */ |
| 80 | #define PARALLEL_KEY_BTREE_SHARED UINT64CONST(0xA000000000000001) |
| 81 | #define PARALLEL_KEY_TUPLESORT UINT64CONST(0xA000000000000002) |
| 82 | #define PARALLEL_KEY_TUPLESORT_SPOOL2 UINT64CONST(0xA000000000000003) |
| 83 | #define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xA000000000000004) |
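| | |
| | /* |
| | * These key values are arbitrary; they need only be distinct from each other |
| | * and from the shm_toc keys that parallel.c reserves for its own entries. |
| | */ |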
| 84 | |
| 85 | /* |
| 86 | * DISABLE_LEADER_PARTICIPATION disables the leader's participation in |
| 87 | * parallel index builds. This may be useful as a debugging aid. |
| 88 | #undef DISABLE_LEADER_PARTICIPATION |
| 89 | */ |
| 90 | |
| 91 | /* |
| 92 | * Status record for spooling/sorting phase. (Note we may have two of |
| 93 | * these due to the special requirements for uniqueness-checking with |
| 94 | * dead tuples.) |
| 95 | */ |
| 96 | typedef struct BTSpool |
| 97 | { |
| 98 | Tuplesortstate *sortstate; /* state data for tuplesort.c */ |
| 99 | Relation heap; |
| 100 | Relation index; |
| 101 | bool isunique; |
| 102 | } BTSpool; |
| 103 | |
| 104 | /* |
| 105 | * Status for index builds performed in parallel. This is allocated in a |
| 106 | * dynamic shared memory segment. Note that there is a separate tuplesort TOC |
| 107 | * entry, private to tuplesort.c but allocated by this module on its behalf. |
| 108 | */ |
| 109 | typedef struct BTShared |
| 110 | { |
| 111 | /* |
| 112 | * These fields are not modified during the sort. They primarily exist |
| 113 | * for the benefit of worker processes that need to create BTSpool state |
| 114 | * corresponding to that used by the leader. |
| 115 | */ |
| 116 | Oid heaprelid; |
| 117 | Oid indexrelid; |
| 118 | bool isunique; |
| 119 | bool isconcurrent; |
| 120 | int scantuplesortstates; |
| 121 | |
| 122 | /* |
| 123 | * workersdonecv is used to monitor the progress of workers. All parallel |
| 124 | * participants must indicate that they are done before leader can use |
| 125 | * mutable state that workers maintain during scan (and before leader can |
| 126 | * proceed to tuplesort_performsort()). |
| 127 | */ |
| 128 | ConditionVariable workersdonecv; |
| 129 | |
| 130 | /* |
| 131 | * mutex protects all of the mutable fields that follow. |
| 132 | * |
| 133 | * These fields contain status information of interest to B-Tree index |
| 134 | * builds that must work just the same when an index is built in parallel. |
| 135 | */ |
| 136 | slock_t mutex; |
| 137 | |
| 138 | /* |
| 139 | * Mutable state that is maintained by workers, and reported back to |
| 140 | * leader at end of parallel scan. |
| 141 | * |
| 142 | * nparticipantsdone is number of worker processes finished. |
| 143 | * |
| 144 | * reltuples is the total number of input heap tuples. |
| 145 | * |
| 146 | * havedead indicates if RECENTLY_DEAD tuples were encountered during |
| 147 | * build. |
| 148 | * |
| 149 | * indtuples is the total number of tuples that made it into the index. |
| 150 | * |
| 151 | * brokenhotchain indicates if any worker detected a broken HOT chain |
| 152 | * during build. |
| 153 | */ |
| 154 | int nparticipantsdone; |
| 155 | double reltuples; |
| 156 | bool havedead; |
| 157 | double indtuples; |
| 158 | bool brokenhotchain; |
| 159 | |
| 160 | /* |
| 161 | * ParallelTableScanDescData data follows. Can't directly embed here, as |
| 162 | * implementations of the parallel table scan desc interface might need |
| 163 | * stronger alignment. |
| 164 | */ |
| 165 | } BTShared; |
| 166 | |
| 167 | /* |
| 168 | * Return pointer to a BTShared's parallel table scan. |
| 169 | * |
| 170 | * cf. shm_toc_allocate as to why BUFFERALIGN is used, rather than just |
| 171 | * MAXALIGN. |
| 172 | */ |
| 173 | #define ParallelTableScanFromBTShared(shared) \ |
| 174 | (ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(BTShared))) |
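| | |
| | /* |
| | * That is, the parallel table scan descriptor lives in the same DSM chunk as |
| | * the BTShared struct itself, immediately after it; |
| | * _bt_parallel_estimate_shared() accounts for both when sizing the chunk. |
| | */ |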
| 175 | |
| 176 | /* |
| 177 | * Status for leader in parallel index build. |
| 178 | */ |
| 179 | typedef struct BTLeader |
| 180 | { |
| 181 | /* parallel context itself */ |
| 182 | ParallelContext *pcxt; |
| 183 | |
| 184 | /* |
| 185 | * nparticipanttuplesorts is the exact number of worker processes |
| 186 | * successfully launched, plus one leader process if it participates as a |
| 187 | * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader |
| 188 | * participating as a worker). |
| 189 | */ |
| 190 | int nparticipanttuplesorts; |
| 191 | |
| 192 | /* |
| 193 | * Leader process convenience pointers to shared state (leader avoids TOC |
| 194 | * lookups). |
| 195 | * |
| 196 | * btshared is the shared state for entire build. sharedsort is the |
| 197 | * shared, tuplesort-managed state passed to each process tuplesort. |
| 198 | * sharedsort2 is the corresponding btspool2 shared state, used only when |
| 199 | * building unique indexes. snapshot is the snapshot used by the scan iff |
| 200 | * an MVCC snapshot is required. |
| 201 | */ |
| 202 | BTShared *btshared; |
| 203 | Sharedsort *sharedsort; |
| 204 | Sharedsort *sharedsort2; |
| 205 | Snapshot snapshot; |
| 206 | } BTLeader; |
| 207 | |
| 208 | /* |
| 209 | * Working state for btbuild and its callback. |
| 210 | * |
| 211 | * When parallel CREATE INDEX is used, there is a BTBuildState for each |
| 212 | * participant. |
| 213 | */ |
| 214 | typedef struct BTBuildState |
| 215 | { |
| 216 | bool isunique; |
| 217 | bool havedead; |
| 218 | Relation heap; |
| 219 | BTSpool *spool; |
| 220 | |
| 221 | /* |
| 222 | * spool2 is needed only when the index is a unique index. Dead tuples are |
| 223 | * put into spool2 instead of spool in order to avoid the uniqueness check. |
| 224 | */ |
| 225 | BTSpool *spool2; |
| 226 | double indtuples; |
| 227 | |
| 228 | /* |
| 229 | * btleader is only present when a parallel index build is performed, and |
| 230 | * only in the leader process. (Actually, only the leader has a |
| 231 | * BTBuildState. Workers have their own spool and spool2, though.) |
| 232 | */ |
| 233 | BTLeader *btleader; |
| 234 | } BTBuildState; |
| 235 | |
| 236 | /* |
| 237 | * Status record for a btree page being built. We have one of these |
| 238 | * for each active tree level. |
| 239 | * |
| 240 | * The reason we need to store a copy of the minimum key is that we'll |
| 241 | * need to propagate it to the parent node when this page is linked |
| 242 | * into its parent. However, if the page is not a leaf page, the first |
| 243 | * entry on the page doesn't need to contain a key, so we will not have |
| 244 | * stored the key itself on the page. (You might think we could skip |
| 245 | * copying the minimum key on leaf pages, but actually we must have a |
| 246 | * writable copy anyway because we'll poke the page's address into it |
| 247 | * before passing it up to the parent...) |
| 248 | */ |
| 249 | typedef struct BTPageState |
| 250 | { |
| 251 | Page btps_page; /* workspace for page building */ |
| 252 | BlockNumber btps_blkno; /* block # to write this page at */ |
| 253 | IndexTuple btps_minkey; /* copy of minimum key (first item) on page */ |
| 254 | OffsetNumber btps_lastoff; /* last item offset loaded */ |
| 255 | uint32 btps_level; /* tree level (0 = leaf) */ |
| 256 | Size btps_full; /* "full" if less than this much free space */ |
| 257 | struct BTPageState *btps_next; /* link to parent level, if any */ |
| 258 | } BTPageState; |
| 259 | |
| 260 | /* |
| 261 | * Overall status record for index writing phase. |
| 262 | */ |
| 263 | typedef struct BTWriteState |
| 264 | { |
| 265 | Relation heap; |
| 266 | Relation index; |
| 267 | BTScanInsert inskey; /* generic insertion scankey */ |
| 268 | bool btws_use_wal; /* dump pages to WAL? */ |
| 269 | BlockNumber btws_pages_alloced; /* # pages allocated */ |
| 270 | BlockNumber btws_pages_written; /* # pages written out */ |
| 271 | Page btws_zeropage; /* workspace for filling zeroes */ |
| 272 | } BTWriteState; |
| 273 | |
| 274 | |
| 275 | static double _bt_spools_heapscan(Relation heap, Relation index, |
| 276 | BTBuildState *buildstate, IndexInfo *indexInfo); |
| 277 | static void _bt_spooldestroy(BTSpool *btspool); |
| 278 | static void _bt_spool(BTSpool *btspool, ItemPointer self, |
| 279 | Datum *values, bool *isnull); |
| 280 | static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2); |
| 281 | static void _bt_build_callback(Relation index, HeapTuple htup, Datum *values, |
| 282 | bool *isnull, bool tupleIsAlive, void *state); |
| 283 | static Page _bt_blnewpage(uint32 level); |
| 284 | static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level); |
| 285 | static void _bt_slideleft(Page page); |
| 286 | static void _bt_sortaddtup(Page page, Size itemsize, |
| 287 | IndexTuple itup, OffsetNumber itup_off); |
| 288 | static void _bt_buildadd(BTWriteState *wstate, BTPageState *state, |
| 289 | IndexTuple itup); |
| 290 | static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state); |
| 291 | static void _bt_load(BTWriteState *wstate, |
| 292 | BTSpool *btspool, BTSpool *btspool2); |
| 293 | static void _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, |
| 294 | int request); |
| 295 | static void _bt_end_parallel(BTLeader *btleader); |
| 296 | static Size _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot); |
| 297 | static double _bt_parallel_heapscan(BTBuildState *buildstate, |
| 298 | bool *brokenhotchain); |
| 299 | static void _bt_leader_participate_as_worker(BTBuildState *buildstate); |
| 300 | static void _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, |
| 301 | BTShared *btshared, Sharedsort *sharedsort, |
| 302 | Sharedsort *sharedsort2, int sortmem, |
| 303 | bool progress); |
| 304 | |
| 305 | |
| 306 | /* |
| 307 | * btbuild() -- build a new btree index. |
| 308 | */ |
| 309 | IndexBuildResult * |
| 310 | btbuild(Relation heap, Relation index, IndexInfo *indexInfo) |
| 311 | { |
| 312 | IndexBuildResult *result; |
| 313 | BTBuildState buildstate; |
| 314 | double reltuples; |
| 315 | |
| 316 | #ifdef BTREE_BUILD_STATS |
| 317 | if (log_btree_build_stats) |
| 318 | ResetUsage(); |
| 319 | #endif /* BTREE_BUILD_STATS */ |
| 320 | |
| 321 | buildstate.isunique = indexInfo->ii_Unique; |
| 322 | buildstate.havedead = false; |
| 323 | buildstate.heap = heap; |
| 324 | buildstate.spool = NULL; |
| 325 | buildstate.spool2 = NULL; |
| 326 | buildstate.indtuples = 0; |
| 327 | buildstate.btleader = NULL; |
| 328 | |
| 329 | /* |
| 330 | * We expect to be called exactly once for any index relation. If that's |
| 331 | * not the case, big trouble's what we have. |
| 332 | */ |
| 333 | if (RelationGetNumberOfBlocks(index) != 0) |
| 334 | elog(ERROR, "index \"%s\" already contains data", |
| 335 | RelationGetRelationName(index)); |
| 336 | |
| 337 | reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo); |
| 338 | |
| 339 | /* |
| 340 | * Finish the build by (1) completing the sort of the spool file, (2) |
| 341 | * inserting the sorted tuples into btree pages and (3) building the upper |
| 342 | * levels. Finally, it may also be necessary to end use of parallelism. |
| 343 | */ |
| 344 | _bt_leafbuild(buildstate.spool, buildstate.spool2); |
| 345 | _bt_spooldestroy(buildstate.spool); |
| 346 | if (buildstate.spool2) |
| 347 | _bt_spooldestroy(buildstate.spool2); |
| 348 | if (buildstate.btleader) |
| 349 | _bt_end_parallel(buildstate.btleader); |
| 350 | |
| 351 | result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); |
| 352 | |
| 353 | result->heap_tuples = reltuples; |
| 354 | result->index_tuples = buildstate.indtuples; |
| 355 | |
| 356 | #ifdef BTREE_BUILD_STATS |
| 357 | if (log_btree_build_stats) |
| 358 | { |
| 359 | ShowUsage("BTREE BUILD STATS"); |
| 360 | ResetUsage(); |
| 361 | } |
| 362 | #endif /* BTREE_BUILD_STATS */ |
| 363 | |
| 364 | return result; |
| 365 | } |
| 366 | |
| 367 | /* |
| 368 | * Create and initialize one or two spool structures, and save them in caller's |
| 369 | * buildstate argument. May also fill in fields within indexInfo used by index |
| 370 | * builds. |
| 371 | * |
| 372 | * Scans the heap, possibly in parallel, filling spools with IndexTuples. This |
| 373 | * routine encapsulates all aspects of managing parallelism. Caller need only |
| 374 | * call _bt_end_parallel() in the parallel case after it is done with spool/spool2. |
| 375 | * |
| 376 | * Returns the total number of heap tuples scanned. |
| 377 | */ |
| 378 | static double |
| 379 | _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, |
| 380 | IndexInfo *indexInfo) |
| 381 | { |
| 382 | BTSpool *btspool = (BTSpool *) palloc0(sizeof(BTSpool)); |
| 383 | SortCoordinate coordinate = NULL; |
| 384 | double reltuples = 0; |
| 385 | |
| 386 | /* |
| 387 | * We size the sort area as maintenance_work_mem rather than work_mem to |
| 388 | * speed index creation. This should be OK since a single backend can't |
| 389 | * run multiple index creations in parallel (see also: notes on |
| 390 | * parallelism and maintenance_work_mem below). |
| 391 | */ |
| 392 | btspool->heap = heap; |
| 393 | btspool->index = index; |
| 394 | btspool->isunique = indexInfo->ii_Unique; |
| 395 | |
| 396 | /* Save as primary spool */ |
| 397 | buildstate->spool = btspool; |
| 398 | |
| 399 | /* Report table scan phase started */ |
| 400 | pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, |
| 401 | PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN); |
| 402 | |
| 403 | /* Attempt to launch parallel worker scan when required */ |
| 404 | if (indexInfo->ii_ParallelWorkers > 0) |
| 405 | _bt_begin_parallel(buildstate, indexInfo->ii_Concurrent, |
| 406 | indexInfo->ii_ParallelWorkers); |
| 407 | |
| 408 | /* |
| 409 | * If parallel build requested and at least one worker process was |
| 410 | * successfully launched, set up coordination state |
| 411 | */ |
| 412 | if (buildstate->btleader) |
| 413 | { |
| 414 | coordinate = (SortCoordinate) palloc0(sizeof(SortCoordinateData)); |
| 415 | coordinate->isWorker = false; |
| 416 | coordinate->nParticipants = |
| 417 | buildstate->btleader->nparticipanttuplesorts; |
| 418 | coordinate->sharedsort = buildstate->btleader->sharedsort; |
| 419 | } |
| 420 | |
| 421 | /* |
| 422 | * Begin serial/leader tuplesort. |
| 423 | * |
| 424 | * In cases where parallelism is involved, the leader receives the same |
| 425 | * share of maintenance_work_mem as a serial sort (it is generally treated |
| 426 | * in the same way as a serial sort once we return). Parallel worker |
| 427 | * Tuplesortstates will have received only a fraction of |
| 428 | * maintenance_work_mem, though. |
| 429 | * |
| 430 | * We rely on the lifetime of the Leader Tuplesortstate almost not |
| 431 | * overlapping with any worker Tuplesortstate's lifetime. There may be |
| 432 | * some small overlap, but that's okay because we rely on leader |
| 433 | * Tuplesortstate only allocating a small, fixed amount of memory here. |
| 434 | * When its tuplesort_performsort() is called (by our caller), and |
| 435 | * significant amounts of memory are likely to be used, all workers must |
| 436 | * have already freed almost all memory held by their Tuplesortstates |
| 437 | * (they are about to go away completely, too). The overall effect is |
| 438 | * that maintenance_work_mem always represents an absolute high watermark |
| 439 | * on the amount of memory used by a CREATE INDEX operation, regardless of |
| 440 | * the use of parallelism or any other factor. |
| 441 | */ |
| 442 | buildstate->spool->sortstate = |
| 443 | tuplesort_begin_index_btree(heap, index, buildstate->isunique, |
| 444 | maintenance_work_mem, coordinate, |
| 445 | false); |
| 446 | |
| 447 | /* |
| 448 | * If building a unique index, put dead tuples in a second spool to keep |
| 449 | * them out of the uniqueness check. We expect that the second spool (for |
| 450 | * dead tuples) won't get very full, so we give it only work_mem. |
| 451 | */ |
| 452 | if (indexInfo->ii_Unique) |
| 453 | { |
| 454 | BTSpool *btspool2 = (BTSpool *) palloc0(sizeof(BTSpool)); |
| 455 | SortCoordinate coordinate2 = NULL; |
| 456 | |
| 457 | /* Initialize secondary spool */ |
| 458 | btspool2->heap = heap; |
| 459 | btspool2->index = index; |
| 460 | btspool2->isunique = false; |
| 461 | /* Save as secondary spool */ |
| 462 | buildstate->spool2 = btspool2; |
| 463 | |
| 464 | if (buildstate->btleader) |
| 465 | { |
| 466 | /* |
| 467 | * Set up non-private state that is passed to |
| 468 | * tuplesort_begin_index_btree() about the basic high-level |
| 469 | * coordination of a parallel sort. |
| 470 | */ |
| 471 | coordinate2 = (SortCoordinate) palloc0(sizeof(SortCoordinateData)); |
| 472 | coordinate2->isWorker = false; |
| 473 | coordinate2->nParticipants = |
| 474 | buildstate->btleader->nparticipanttuplesorts; |
| 475 | coordinate2->sharedsort = buildstate->btleader->sharedsort2; |
| 476 | } |
| 477 | |
| 478 | /* |
| 479 | * We expect that the second one (for dead tuples) won't get very |
| 480 | * full, so we give it only work_mem |
| 481 | */ |
| 482 | buildstate->spool2->sortstate = |
| 483 | tuplesort_begin_index_btree(heap, index, false, work_mem, |
| 484 | coordinate2, false); |
| 485 | } |
| 486 | |
| 487 | /* Fill spool using either serial or parallel heap scan */ |
| 488 | if (!buildstate->btleader) |
| 489 | reltuples = table_index_build_scan(heap, index, indexInfo, true, true, |
| 490 | _bt_build_callback, (void *) buildstate, |
| 491 | NULL); |
| 492 | else |
| 493 | reltuples = _bt_parallel_heapscan(buildstate, |
| 494 | &indexInfo->ii_BrokenHotChain); |
| 495 | |
| 496 | /* |
| 497 | * Set the progress target for the next phase. Reset the block number |
| 498 | * values set by table_index_build_scan. |
| 499 | */ |
| 500 | { |
| 501 | const int index[] = { |
| 502 | PROGRESS_CREATEIDX_TUPLES_TOTAL, |
| 503 | PROGRESS_SCAN_BLOCKS_TOTAL, |
| 504 | PROGRESS_SCAN_BLOCKS_DONE |
| 505 | }; |
| 506 | const int64 val[] = { |
| 507 | buildstate->indtuples, |
| 508 | 0, 0 |
| 509 | }; |
| 510 | |
| 511 | pgstat_progress_update_multi_param(3, index, val); |
| 512 | } |
| 513 | |
| 514 | /* okay, all heap tuples are spooled */ |
| 515 | if (buildstate->spool2 && !buildstate->havedead) |
| 516 | { |
| 517 | /* spool2 turns out to be unnecessary */ |
| 518 | _bt_spooldestroy(buildstate->spool2); |
| 519 | buildstate->spool2 = NULL; |
| 520 | } |
| 521 | |
| 522 | return reltuples; |
| 523 | } |
| 524 | |
| 525 | /* |
| 526 | * clean up a spool structure and its substructures. |
| 527 | */ |
| 528 | static void |
| 529 | _bt_spooldestroy(BTSpool *btspool) |
| 530 | { |
| 531 | tuplesort_end(btspool->sortstate); |
| 532 | pfree(btspool); |
| 533 | } |
| 534 | |
| 535 | /* |
| 536 | * spool an index entry into the sort file. |
| 537 | */ |
| 538 | static void |
| 539 | _bt_spool(BTSpool *btspool, ItemPointer self, Datum *values, bool *isnull) |
| 540 | { |
| 541 | tuplesort_putindextuplevalues(btspool->sortstate, btspool->index, |
| 542 | self, values, isnull); |
| 543 | } |
| 544 | |
| 545 | /* |
| 546 | * given a spool loaded by successive calls to _bt_spool, |
| 547 | * create an entire btree. |
| 548 | */ |
| 549 | static void |
| 550 | _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2) |
| 551 | { |
| 552 | BTWriteState wstate; |
| 553 | |
| 554 | #ifdef BTREE_BUILD_STATS |
| 555 | if (log_btree_build_stats) |
| 556 | { |
| 557 | ShowUsage("BTREE BUILD (Spool) STATISTICS"); |
| 558 | ResetUsage(); |
| 559 | } |
| 560 | #endif /* BTREE_BUILD_STATS */ |
| 561 | |
| 562 | pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, |
| 563 | PROGRESS_BTREE_PHASE_PERFORMSORT_1); |
| 564 | tuplesort_performsort(btspool->sortstate); |
| 565 | if (btspool2) |
| 566 | { |
| 567 | pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, |
| 568 | PROGRESS_BTREE_PHASE_PERFORMSORT_2); |
| 569 | tuplesort_performsort(btspool2->sortstate); |
| 570 | } |
| 571 | |
| 572 | wstate.heap = btspool->heap; |
| 573 | wstate.index = btspool->index; |
| 574 | wstate.inskey = _bt_mkscankey(wstate.index, NULL); |
| 575 | |
| 576 | /* |
| 577 | * We need to log index creation in WAL iff WAL archiving/streaming is |
| 578 | * enabled UNLESS the index isn't WAL-logged anyway. |
| 579 | */ |
| 580 | wstate.btws_use_wal = XLogIsNeeded() && RelationNeedsWAL(wstate.index); |
| 581 | |
| 582 | /* reserve the metapage */ |
| 583 | wstate.btws_pages_alloced = BTREE_METAPAGE + 1; |
| 584 | wstate.btws_pages_written = 0; |
| 585 | wstate.btws_zeropage = NULL; /* until needed */ |
| 586 | |
| 587 | pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, |
| 588 | PROGRESS_BTREE_PHASE_LEAF_LOAD); |
| 589 | _bt_load(&wstate, btspool, btspool2); |
| 590 | } |
| 591 | |
| 592 | /* |
| 593 | * Per-tuple callback for table_index_build_scan |
| 594 | */ |
| 595 | static void |
| 596 | _bt_build_callback(Relation index, |
| 597 | HeapTuple htup, |
| 598 | Datum *values, |
| 599 | bool *isnull, |
| 600 | bool tupleIsAlive, |
| 601 | void *state) |
| 602 | { |
| 603 | BTBuildState *buildstate = (BTBuildState *) state; |
| 604 | |
| 605 | /* |
| 606 | * insert the index tuple into the appropriate spool file for subsequent |
| 607 | * processing |
| 608 | */ |
| 609 | if (tupleIsAlive || buildstate->spool2 == NULL) |
| 610 | _bt_spool(buildstate->spool, &htup->t_self, values, isnull); |
| 611 | else |
| 612 | { |
| 613 | /* dead tuples are put into spool2 */ |
| 614 | buildstate->havedead = true; |
| 615 | _bt_spool(buildstate->spool2, &htup->t_self, values, isnull); |
| 616 | } |
| 617 | |
| 618 | buildstate->indtuples += 1; |
| 619 | } |
| 620 | |
| 621 | /* |
| 622 | * allocate workspace for a new, clean btree page, not linked to any siblings. |
| 623 | */ |
| 624 | static Page |
| 625 | _bt_blnewpage(uint32 level) |
| 626 | { |
| 627 | Page page; |
| 628 | BTPageOpaque opaque; |
| 629 | |
| 630 | page = (Page) palloc(BLCKSZ); |
| 631 | |
| 632 | /* Zero the page and set up standard page header info */ |
| 633 | _bt_pageinit(page, BLCKSZ); |
| 634 | |
| 635 | /* Initialize BT opaque state */ |
| 636 | opaque = (BTPageOpaque) PageGetSpecialPointer(page); |
| 637 | opaque->btpo_prev = opaque->btpo_next = P_NONE; |
| 638 | opaque->btpo.level = level; |
| 639 | opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF; |
| 640 | opaque->btpo_cycleid = 0; |
| 641 | |
| 642 | /* Make the P_HIKEY line pointer appear allocated */ |
| 643 | ((PageHeader) page)->pd_lower += sizeof(ItemIdData); |
| 644 | |
| 645 | return page; |
| 646 | } |
| 647 | |
| 648 | /* |
| 649 | * emit a completed btree page, and release the working storage. |
| 650 | */ |
| 651 | static void |
| 652 | _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno) |
| 653 | { |
| 654 | /* Ensure rd_smgr is open (could have been closed by relcache flush!) */ |
| 655 | RelationOpenSmgr(wstate->index); |
| 656 | |
| 657 | /* XLOG stuff */ |
| 658 | if (wstate->btws_use_wal) |
| 659 | { |
| 660 | /* We use the heap NEWPAGE record type for this */ |
| 661 | log_newpage(&wstate->index->rd_node, MAIN_FORKNUM, blkno, page, true); |
| 662 | } |
| 663 | |
| 664 | /* |
| 665 | * If we have to write pages nonsequentially, fill in the space with |
| 666 | * zeroes until we come back and overwrite. This is not logically |
| 667 | * necessary on standard Unix filesystems (unwritten space will read as |
| 668 | * zeroes anyway), but it should help to avoid fragmentation. The dummy |
| 669 | * pages aren't WAL-logged though. |
| 670 | */ |
| 671 | while (blkno > wstate->btws_pages_written) |
| 672 | { |
| 673 | if (!wstate->btws_zeropage) |
| 674 | wstate->btws_zeropage = (Page) palloc0(BLCKSZ); |
| 675 | /* don't set checksum for all-zero page */ |
| 676 | smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM, |
| 677 | wstate->btws_pages_written++, |
| 678 | (char *) wstate->btws_zeropage, |
| 679 | true); |
| 680 | } |
| 681 | |
| 682 | PageSetChecksumInplace(page, blkno); |
| 683 | |
| 684 | /* |
| 685 | * Now write the page. There's no need for smgr to schedule an fsync for |
| 686 | * this write; we'll do it ourselves before ending the build. |
| 687 | */ |
| 688 | if (blkno == wstate->btws_pages_written) |
| 689 | { |
| 690 | /* extending the file... */ |
| 691 | smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM, blkno, |
| 692 | (char *) page, true); |
| 693 | wstate->btws_pages_written++; |
| 694 | } |
| 695 | else |
| 696 | { |
| 697 | /* overwriting a block we zero-filled before */ |
| 698 | smgrwrite(wstate->index->rd_smgr, MAIN_FORKNUM, blkno, |
| 699 | (char *) page, true); |
| 700 | } |
| 701 | |
| 702 | pfree(page); |
| 703 | } |
| 704 | |
| 705 | /* |
| 706 | * allocate and initialize a new BTPageState. the returned structure |
| 707 | * is suitable for immediate use by _bt_buildadd. |
| 708 | */ |
| 709 | static BTPageState * |
| 710 | _bt_pagestate(BTWriteState *wstate, uint32 level) |
| 711 | { |
| 712 | BTPageState *state = (BTPageState *) palloc0(sizeof(BTPageState)); |
| 713 | |
| 714 | /* create initial page for level */ |
| 715 | state->btps_page = _bt_blnewpage(level); |
| 716 | |
| 717 | /* and assign it a page position */ |
| 718 | state->btps_blkno = wstate->btws_pages_alloced++; |
| 719 | |
| 720 | state->btps_minkey = NULL; |
| 721 | /* initialize lastoff so first item goes into P_FIRSTKEY */ |
| 722 | state->btps_lastoff = P_HIKEY; |
| 723 | state->btps_level = level; |
| 724 | /* set "full" threshold based on level. See notes at head of file. */ |
| 725 | if (level > 0) |
| 726 | state->btps_full = (BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100); |
| 727 | else |
| 728 | state->btps_full = RelationGetTargetPageFreeSpace(wstate->index, |
| 729 | BTREE_DEFAULT_FILLFACTOR); |
| 730 | /* no parent level, yet */ |
| 731 | state->btps_next = NULL; |
| 732 | |
| 733 | return state; |
| 734 | } |
| 735 | |
| 736 | /* |
| 737 | * slide an array of ItemIds back one slot (from P_FIRSTKEY to |
| 738 | * P_HIKEY, overwriting P_HIKEY). we need to do this when we discover |
| 739 | * that we have built an ItemId array in what has turned out to be a |
| 740 | * P_RIGHTMOST page. |
| 741 | */ |
| 742 | static void |
| 743 | _bt_slideleft(Page page) |
| 744 | { |
| 745 | OffsetNumber off; |
| 746 | OffsetNumber maxoff; |
| 747 | ItemId previi; |
| 748 | ItemId thisii; |
| 749 | |
| 750 | if (!PageIsEmpty(page)) |
| 751 | { |
| 752 | maxoff = PageGetMaxOffsetNumber(page); |
| 753 | previi = PageGetItemId(page, P_HIKEY); |
| 754 | for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off)) |
| 755 | { |
| 756 | thisii = PageGetItemId(page, off); |
| 757 | *previi = *thisii; |
| 758 | previi = thisii; |
| 759 | } |
| 760 | ((PageHeader) page)->pd_lower -= sizeof(ItemIdData); |
| 761 | } |
| 762 | } |
| 763 | |
| 764 | /* |
| 765 | * Add an item to a page being built. |
| 766 | * |
| 767 | * The main difference between this routine and a bare PageAddItem call |
| 768 | * is that this code knows that the leftmost data item on a non-leaf |
| 769 | * btree page doesn't need to have a key. Therefore, it strips such |
| 770 | * items down to just the item header. |
| 771 | * |
| 772 | * This is almost like nbtinsert.c's _bt_pgaddtup(), but we can't use |
| 773 | * that because it assumes that P_RIGHTMOST() will return the correct |
| 774 | * answer for the page. Here, we don't know yet if the page will be |
| 775 | * rightmost. Offset P_FIRSTKEY is always the first data key. |
| 776 | */ |
| 777 | static void |
| 778 | _bt_sortaddtup(Page page, |
| 779 | Size itemsize, |
| 780 | IndexTuple itup, |
| 781 | OffsetNumber itup_off) |
| 782 | { |
| 783 | BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); |
| 784 | IndexTupleData trunctuple; |
| 785 | |
| 786 | if (!P_ISLEAF(opaque) && itup_off == P_FIRSTKEY) |
| 787 | { |
| 788 | trunctuple = *itup; |
| 789 | trunctuple.t_info = sizeof(IndexTupleData); |
| 790 | /* Deliberately zero INDEX_ALT_TID_MASK bits */ |
| 791 | BTreeTupleSetNAtts(&trunctuple, 0); |
| 792 | itup = &trunctuple; |
| 793 | itemsize = sizeof(IndexTupleData); |
| 794 | } |
| 795 | |
| 796 | if (PageAddItem(page, (Item) itup, itemsize, itup_off, |
| 797 | false, false) == InvalidOffsetNumber) |
| 798 | elog(ERROR, "failed to add item to the index page"); |
| 799 | } |
| 800 | |
| 801 | /*---------- |
| 802 | * Add an item to a disk page from the sort output. |
| 803 | * |
| 804 | * We must be careful to observe the page layout conventions of nbtsearch.c: |
| 805 | * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY. |
| 806 | * - on non-leaf pages, the key portion of the first item need not be |
| 807 | * stored, we should store only the link. |
| 808 | * |
| 809 | * A leaf page being built looks like: |
| 810 | * |
| 811 | * +----------------+---------------------------------+ |
| 812 | * | PageHeaderData | linp0 linp1 linp2 ... | |
| 813 | * +-----------+----+---------------------------------+ |
| 814 | * | ... linpN | | |
| 815 | * +-----------+--------------------------------------+ |
| 816 | * | ^ last | |
| 817 | * | | |
| 818 | * +-------------+------------------------------------+ |
| 819 | * | | itemN ... | |
| 820 | * +-------------+------------------+-----------------+ |
| 821 | * | ... item3 item2 item1 | "special space" | |
| 822 | * +--------------------------------+-----------------+ |
| 823 | * |
| 824 | * Contrast this with the diagram in bufpage.h; note the mismatch |
| 825 | * between linps and items. This is because we reserve linp0 as a |
| 826 | * placeholder for the pointer to the "high key" item; when we have |
| 827 | * filled up the page, we will set linp0 to point to itemN and clear |
| 828 | * linpN. On the other hand, if we find this is the last (rightmost) |
| 829 | * page, we leave the items alone and slide the linp array over. If |
| 830 | * the high key is to be truncated, offset 1 is deleted, and we insert |
| 831 | * the truncated high key at offset 1. |
| 832 | * |
| 833 | * 'last' pointer indicates the last offset added to the page. |
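| | * |
| | * (Concretely, P_HIKEY is offset 1 -- the linp0 slot in the diagram -- and |
| | * P_FIRSTKEY is offset 2; see nbtree.h.) |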
| 834 | *---------- |
| 835 | */ |
| 836 | static void |
| 837 | _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) |
| 838 | { |
| 839 | Page npage; |
| 840 | BlockNumber nblkno; |
| 841 | OffsetNumber last_off; |
| 842 | Size pgspc; |
| 843 | Size itupsz; |
| 844 | bool isleaf; |
| 845 | |
| 846 | /* |
| 847 | * This is a handy place to check for cancel interrupts during the btree |
| 848 | * load phase of index creation. |
| 849 | */ |
| 850 | CHECK_FOR_INTERRUPTS(); |
| 851 | |
| 852 | npage = state->btps_page; |
| 853 | nblkno = state->btps_blkno; |
| 854 | last_off = state->btps_lastoff; |
| 855 | |
| 856 | pgspc = PageGetFreeSpace(npage); |
| 857 | itupsz = IndexTupleSize(itup); |
| 858 | itupsz = MAXALIGN(itupsz); |
| 859 | /* Leaf case has slightly different rules due to suffix truncation */ |
| 860 | isleaf = (state->btps_level == 0); |
| 861 | |
| 862 | /* |
| 863 | * Check whether the new item can fit on a btree page on current level at |
| 864 | * all. |
| 865 | * |
| 866 | * Every newly built index will treat heap TID as part of the keyspace, |
| 867 | * which imposes the requirement that new high keys must occasionally have |
| 868 | * a heap TID appended within _bt_truncate(). That may leave a new pivot |
| 869 | * tuple one or two MAXALIGN() quantums larger than the original first |
| 870 | * right tuple it's derived from. v4 deals with the problem by decreasing |
| 871 | * the limit on the size of tuples inserted on the leaf level by the same |
| 872 | * small amount. Enforce the new v4+ limit on the leaf level, and the old |
| 873 | * limit on internal levels, since pivot tuples may need to make use of |
| 874 | * the reserved space. This should never fail on internal pages. |
| 875 | */ |
| 876 | if (unlikely(itupsz > BTMaxItemSize(npage))) |
| 877 | _bt_check_third_page(wstate->index, wstate->heap, isleaf, npage, |
| 878 | itup); |
| 879 | |
| 880 | /* |
| 881 | * Check to see if current page will fit new item, with space left over to |
| 882 | * append a heap TID during suffix truncation when page is a leaf page. |
| 883 | * |
| 884 | * It is guaranteed that we can fit at least 2 non-pivot tuples plus a |
| 885 | * high key with heap TID when finishing off a leaf page, since we rely on |
| 886 | * _bt_check_third_page() rejecting oversized non-pivot tuples. On |
| 887 | * internal pages we can always fit 3 pivot tuples with larger internal |
| 888 | * page tuple limit (includes page high key). |
| 889 | * |
| 890 | * Most of the time, a page is only "full" in the sense that the soft |
| 891 | * fillfactor-wise limit has been exceeded. However, we must always leave |
| 892 | * at least two items plus a high key on each page before starting a new |
| 893 | * page. Disregard fillfactor and insert on "full" current page if we |
| 894 | * don't have the minimum number of items yet. (Note that we deliberately |
| 895 | * assume that suffix truncation neither enlarges nor shrinks new high key |
| 896 | * when applying soft limit.) |
| 897 | */ |
| 898 | if (pgspc < itupsz + (isleaf ? MAXALIGN(sizeof(ItemPointerData)) : 0) || |
| 899 | (pgspc < state->btps_full && last_off > P_FIRSTKEY)) |
| 900 | { |
| 901 | /* |
| 902 | * Finish off the page and write it out. |
| 903 | */ |
| 904 | Page opage = npage; |
| 905 | BlockNumber oblkno = nblkno; |
| 906 | ItemId ii; |
| 907 | ItemId hii; |
| 908 | IndexTuple oitup; |
| 909 | |
| 910 | /* Create new page of same level */ |
| 911 | npage = _bt_blnewpage(state->btps_level); |
| 912 | |
| 913 | /* and assign it a page position */ |
| 914 | nblkno = wstate->btws_pages_alloced++; |
| 915 | |
| 916 | /* |
| 917 | * We copy the last item on the page into the new page, and then |
| 918 | * rearrange the old page so that the 'last item' becomes its high key |
| 919 | * rather than a true data item. There had better be at least two |
| 920 | * items on the page already, else the page would be empty of useful |
| 921 | * data. |
| 922 | */ |
| 923 | Assert(last_off > P_FIRSTKEY); |
| 924 | ii = PageGetItemId(opage, last_off); |
| 925 | oitup = (IndexTuple) PageGetItem(opage, ii); |
| 926 | _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY); |
| 927 | |
| 928 | /* |
| 929 | * Move 'last' into the high key position on opage. _bt_blnewpage() |
| 930 | * allocated empty space for a line pointer when opage was first |
| 931 | * created, so this is a matter of rearranging already-allocated space |
| 932 | * on page, and initializing high key line pointer. (Actually, leaf |
| 933 | * pages must also swap oitup with a truncated version of oitup, which |
| 934 | * is sometimes larger than oitup, though never by more than the space |
| 935 | * needed to append a heap TID.) |
| 936 | */ |
| 937 | hii = PageGetItemId(opage, P_HIKEY); |
| 938 | *hii = *ii; |
| 939 | ItemIdSetUnused(ii); /* redundant */ |
| 940 | ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData); |
| 941 | |
| 942 | if (isleaf) |
| 943 | { |
| 944 | IndexTuple lastleft; |
| 945 | IndexTuple truncated; |
| 946 | Size truncsz; |
| 947 | |
| 948 | /* |
| 949 | * Truncate away any unneeded attributes from high key on leaf |
| 950 | * level. This is only done at the leaf level because downlinks |
| 951 | * in internal pages are either negative infinity items, or get |
| 952 | * their contents from copying from one level down. See also: |
| 953 | * _bt_split(). |
| 954 | * |
| 955 | * We don't try to bias our choice of split point to make it more |
| 956 | * likely that _bt_truncate() can truncate away more attributes, |
| 957 | * whereas the split point passed to _bt_split() is chosen much |
| 958 | * more delicately. Suffix truncation is mostly useful because it |
| 959 | * improves space utilization for workloads with random |
| 960 | * insertions. It doesn't seem worthwhile to add logic for |
| 961 | * choosing a split point here for a benefit that is bound to be |
| 962 | * much smaller. |
| 963 | * |
| 964 | * Since the truncated tuple is often smaller than the original |
| 965 | * tuple, it cannot just be copied in place (besides, we want to |
| 966 | * actually save space on the leaf page). We delete the original |
| 967 | * high key, and add our own truncated high key at the same |
| 968 | * offset. |
| 969 | * |
| 970 | * Note that the page layout won't be changed very much. oitup is |
| 971 | * already located at the physical beginning of tuple space, so we |
| 972 | * only shift the line pointer array back and forth, and overwrite |
| 973 | * the tuple space previously occupied by oitup. This is fairly |
| 974 | * cheap. |
| 975 | */ |
| 976 | ii = PageGetItemId(opage, OffsetNumberPrev(last_off)); |
| 977 | lastleft = (IndexTuple) PageGetItem(opage, ii); |
| 978 | |
| 979 | truncated = _bt_truncate(wstate->index, lastleft, oitup, |
| 980 | wstate->inskey); |
| 981 | truncsz = IndexTupleSize(truncated); |
| 982 | PageIndexTupleDelete(opage, P_HIKEY); |
| 983 | _bt_sortaddtup(opage, truncsz, truncated, P_HIKEY); |
| 984 | pfree(truncated); |
| 985 | |
| 986 | /* oitup should continue to point to the page's high key */ |
| 987 | hii = PageGetItemId(opage, P_HIKEY); |
| 988 | oitup = (IndexTuple) PageGetItem(opage, hii); |
| 989 | } |
| 990 | |
| 991 | /* |
| 992 | * Link the old page into its parent, using its minimum key. If we |
| 993 | * don't have a parent, we have to create one; this adds a new btree |
| 994 | * level. |
| 995 | */ |
| 996 | if (state->btps_next == NULL) |
| 997 | state->btps_next = _bt_pagestate(wstate, state->btps_level + 1); |
| 998 | |
| 999 | Assert((BTreeTupleGetNAtts(state->btps_minkey, wstate->index) <= |
| 1000 | IndexRelationGetNumberOfKeyAttributes(wstate->index) && |
| 1001 | BTreeTupleGetNAtts(state->btps_minkey, wstate->index) > 0) || |
| 1002 | P_LEFTMOST((BTPageOpaque) PageGetSpecialPointer(opage))); |
| 1003 | Assert(BTreeTupleGetNAtts(state->btps_minkey, wstate->index) == 0 || |
| 1004 | !P_LEFTMOST((BTPageOpaque) PageGetSpecialPointer(opage))); |
| 1005 | BTreeInnerTupleSetDownLink(state->btps_minkey, oblkno); |
| 1006 | _bt_buildadd(wstate, state->btps_next, state->btps_minkey); |
| 1007 | pfree(state->btps_minkey); |
| 1008 | |
| 1009 | /* |
| 1010 | * Save a copy of the high key from the old page. It is also used as |
| 1011 | * the minimum key for the new page. |
| 1012 | */ |
| 1013 | state->btps_minkey = CopyIndexTuple(oitup); |
| 1014 | |
| 1015 | /* |
| 1016 | * Set the sibling links for both pages. |
| 1017 | */ |
| 1018 | { |
| 1019 | BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage); |
| 1020 | BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(npage); |
| 1021 | |
| 1022 | oopaque->btpo_next = nblkno; |
| 1023 | nopaque->btpo_prev = oblkno; |
| 1024 | nopaque->btpo_next = P_NONE; /* redundant */ |
| 1025 | } |
| 1026 | |
| 1027 | /* |
| 1028 | * Write out the old page. We never need to touch it again, so we can |
| 1029 | * free the opage workspace too. |
| 1030 | */ |
| 1031 | _bt_blwritepage(wstate, opage, oblkno); |
| 1032 | |
| 1033 | /* |
| 1034 | * Reset last_off to point to new page |
| 1035 | */ |
| 1036 | last_off = P_FIRSTKEY; |
| 1037 | } |
| 1038 | |
| 1039 | /* |
| 1040 | * By here, either original page is still the current page, or a new page |
| 1041 | * was created that became the current page. Either way, the current page |
| 1042 | * definitely has space for new item. |
| 1043 | * |
| 1044 | * If the new item is the first for its page, stash a copy for later. Note |
| 1045 | * this will only happen for the first item on a level; on later pages, |
| 1046 | * the first item for a page is copied from the prior page in the code |
| 1047 | * above. The minimum key for an entire level is nothing more than a |
| 1048 | * minus infinity (downlink only) pivot tuple placeholder. |
| 1049 | */ |
| 1050 | if (last_off == P_HIKEY) |
| 1051 | { |
| 1052 | Assert(state->btps_minkey == NULL); |
| 1053 | state->btps_minkey = CopyIndexTuple(itup); |
| 1054 | /* _bt_sortaddtup() will perform full truncation later */ |
| 1055 | BTreeTupleSetNAtts(state->btps_minkey, 0); |
| 1056 | } |
| 1057 | |
| 1058 | /* |
| 1059 | * Add the new item into the current page. |
| 1060 | */ |
| 1061 | last_off = OffsetNumberNext(last_off); |
| 1062 | _bt_sortaddtup(npage, itupsz, itup, last_off); |
| 1063 | |
| 1064 | state->btps_page = npage; |
| 1065 | state->btps_blkno = nblkno; |
| 1066 | state->btps_lastoff = last_off; |
| 1067 | } |
| 1068 | |
| 1069 | /* |
| 1070 | * Finish writing out the completed btree. |
| 1071 | */ |
| 1072 | static void |
| 1073 | _bt_uppershutdown(BTWriteState *wstate, BTPageState *state) |
| 1074 | { |
| 1075 | BTPageState *s; |
| 1076 | BlockNumber rootblkno = P_NONE; |
| 1077 | uint32 rootlevel = 0; |
| 1078 | Page metapage; |
| 1079 | |
| 1080 | /* |
| 1081 | * Each iteration of this loop completes one more level of the tree. |
| 1082 | */ |
| 1083 | for (s = state; s != NULL; s = s->btps_next) |
| 1084 | { |
| 1085 | BlockNumber blkno; |
| 1086 | BTPageOpaque opaque; |
| 1087 | |
| 1088 | blkno = s->btps_blkno; |
| 1089 | opaque = (BTPageOpaque) PageGetSpecialPointer(s->btps_page); |
| 1090 | |
| 1091 | /* |
| 1092 | * We have to link the last page on this level to somewhere. |
| 1093 | * |
| 1094 | * If we're at the top, it's the root, so attach it to the metapage. |
| 1095 | * Otherwise, add an entry for it to its parent using its minimum key. |
| 1096 | * This may cause the last page of the parent level to split, but |
| 1097 | * that's not a problem -- we haven't gotten to it yet. |
| 1098 | */ |
| 1099 | if (s->btps_next == NULL) |
| 1100 | { |
| 1101 | opaque->btpo_flags |= BTP_ROOT; |
| 1102 | rootblkno = blkno; |
| 1103 | rootlevel = s->btps_level; |
| 1104 | } |
| 1105 | else |
| 1106 | { |
| 1107 | Assert((BTreeTupleGetNAtts(s->btps_minkey, wstate->index) <= |
| 1108 | IndexRelationGetNumberOfKeyAttributes(wstate->index) && |
| 1109 | BTreeTupleGetNAtts(s->btps_minkey, wstate->index) > 0) || |
| 1110 | P_LEFTMOST(opaque)); |
| 1111 | Assert(BTreeTupleGetNAtts(s->btps_minkey, wstate->index) == 0 || |
| 1112 | !P_LEFTMOST(opaque)); |
| 1113 | BTreeInnerTupleSetDownLink(s->btps_minkey, blkno); |
| 1114 | _bt_buildadd(wstate, s->btps_next, s->btps_minkey); |
| 1115 | pfree(s->btps_minkey); |
| 1116 | s->btps_minkey = NULL; |
| 1117 | } |
| 1118 | |
| 1119 | /* |
| 1120 | * This is the rightmost page, so the ItemId array needs to be slid |
| 1121 | * back one slot. Then we can dump out the page. |
| 1122 | */ |
| 1123 | _bt_slideleft(s->btps_page); |
| 1124 | _bt_blwritepage(wstate, s->btps_page, s->btps_blkno); |
| 1125 | s->btps_page = NULL; /* writepage freed the workspace */ |
| 1126 | } |
| 1127 | |
| 1128 | /* |
| 1129 | * As the last step in the process, construct the metapage and make it |
| 1130 | * point to the new root (unless we had no data at all, in which case it's |
| 1131 | * set to point to "P_NONE"). This changes the index to the "valid" state |
| 1132 | * by filling in a valid magic number in the metapage. |
| 1133 | */ |
| 1134 | metapage = (Page) palloc(BLCKSZ); |
| 1135 | _bt_initmetapage(metapage, rootblkno, rootlevel); |
| 1136 | _bt_blwritepage(wstate, metapage, BTREE_METAPAGE); |
| 1137 | } |
| 1138 | |
| 1139 | /* |
| 1140 | * Read tuples in correct sort order from tuplesort, and load them into |
| 1141 | * btree leaves. |
| 1142 | */ |
| 1143 | static void |
| 1144 | _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) |
| 1145 | { |
| 1146 | BTPageState *state = NULL; |
| 1147 | bool merge = (btspool2 != NULL); |
| 1148 | IndexTuple itup, |
| 1149 | itup2 = NULL; |
| 1150 | bool load1; |
| 1151 | TupleDesc tupdes = RelationGetDescr(wstate->index); |
| 1152 | int i, |
| 1153 | keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index); |
| 1154 | SortSupport sortKeys; |
| 1155 | int64 tuples_done = 0; |
| 1156 | |
| 1157 | if (merge) |
| 1158 | { |
| 1159 | /* |
| 1160 | * Another BTSpool for dead tuples exists. Now we have to merge |
| 1161 | * btspool and btspool2. |
| 1162 | */ |
| 1163 | |
| 1164 | /* the preparation of merge */ |
| 1165 | itup = tuplesort_getindextuple(btspool->sortstate, true); |
| 1166 | itup2 = tuplesort_getindextuple(btspool2->sortstate, true); |
| 1167 | |
| 1168 | /* Prepare SortSupport data for each column */ |
| 1169 | sortKeys = (SortSupport) palloc0(keysz * sizeof(SortSupportData)); |
| 1170 | |
| 1171 | for (i = 0; i < keysz; i++) |
| 1172 | { |
| 1173 | SortSupport sortKey = sortKeys + i; |
| 1174 | ScanKey scanKey = wstate->inskey->scankeys + i; |
| 1175 | int16 strategy; |
| 1176 | |
| 1177 | sortKey->ssup_cxt = CurrentMemoryContext; |
| 1178 | sortKey->ssup_collation = scanKey->sk_collation; |
| 1179 | sortKey->ssup_nulls_first = |
| 1180 | (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0; |
| 1181 | sortKey->ssup_attno = scanKey->sk_attno; |
| 1182 | /* Abbreviation is not supported here */ |
| 1183 | sortKey->abbreviate = false; |
| 1184 | |
| 1185 | AssertState(sortKey->ssup_attno != 0); |
| 1186 | |
| 1187 | strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ? |
| 1188 | BTGreaterStrategyNumber : BTLessStrategyNumber; |
| 1189 | |
| 1190 | PrepareSortSupportFromIndexRel(wstate->index, strategy, sortKey); |
| 1191 | } |
| 1192 | |
| 1193 | for (;;) |
| 1194 | { |
| 1195 | load1 = true; /* load BTSpool next ? */ |
| 1196 | if (itup2 == NULL) |
| 1197 | { |
| 1198 | if (itup == NULL) |
| 1199 | break; |
| 1200 | } |
| 1201 | else if (itup != NULL) |
| 1202 | { |
| 1203 | int32 compare = 0; |
| 1204 | |
| 1205 | for (i = 1; i <= keysz; i++) |
| 1206 | { |
| 1207 | SortSupport entry; |
| 1208 | Datum attrDatum1, |
| 1209 | attrDatum2; |
| 1210 | bool isNull1, |
| 1211 | isNull2; |
| 1212 | |
| 1213 | entry = sortKeys + i - 1; |
| 1214 | attrDatum1 = index_getattr(itup, i, tupdes, &isNull1); |
| 1215 | attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2); |
| 1216 | |
| 1217 | compare = ApplySortComparator(attrDatum1, isNull1, |
| 1218 | attrDatum2, isNull2, |
| 1219 | entry); |
| 1220 | if (compare > 0) |
| 1221 | { |
| 1222 | load1 = false; |
| 1223 | break; |
| 1224 | } |
| 1225 | else if (compare < 0) |
| 1226 | break; |
| 1227 | } |
| 1228 | |
| 1229 | /* |
| 1230 | * If key values are equal, we sort on ItemPointer. This is |
| 1231 | * required for btree indexes, since heap TID is treated as an |
| 1232 | * implicit last key attribute in order to ensure that all |
| 1233 | * keys in the index are physically unique. |
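| | * |
| | * (Each heap tuple is spooled exactly once, so the same TID can never |
| | * appear in both spools; that is why the Assert below can insist that |
| | * compare != 0.) |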
| 1234 | */ |
| 1235 | if (compare == 0) |
| 1236 | { |
| 1237 | compare = ItemPointerCompare(&itup->t_tid, &itup2->t_tid); |
| 1238 | Assert(compare != 0); |
| 1239 | if (compare > 0) |
| 1240 | load1 = false; |
| 1241 | } |
| 1242 | } |
| 1243 | else |
| 1244 | load1 = false; |
| 1245 | |
| 1246 | /* When we see first tuple, create first index page */ |
| 1247 | if (state == NULL) |
| 1248 | state = _bt_pagestate(wstate, 0); |
| 1249 | |
| 1250 | if (load1) |
| 1251 | { |
| 1252 | _bt_buildadd(wstate, state, itup); |
| 1253 | itup = tuplesort_getindextuple(btspool->sortstate, true); |
| 1254 | } |
| 1255 | else |
| 1256 | { |
| 1257 | _bt_buildadd(wstate, state, itup2); |
| 1258 | itup2 = tuplesort_getindextuple(btspool2->sortstate, true); |
| 1259 | } |
| 1260 | |
| 1261 | /* Report progress */ |
| 1262 | pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE, |
| 1263 | ++tuples_done); |
| 1264 | } |
| 1265 | pfree(sortKeys); |
| 1266 | } |
| 1267 | else |
| 1268 | { |
| 1269 | /* merge is unnecessary */ |
| 1270 | while ((itup = tuplesort_getindextuple(btspool->sortstate, |
| 1271 | true)) != NULL) |
| 1272 | { |
| 1273 | /* When we see first tuple, create first index page */ |
| 1274 | if (state == NULL) |
| 1275 | state = _bt_pagestate(wstate, 0); |
| 1276 | |
| 1277 | _bt_buildadd(wstate, state, itup); |
| 1278 | |
| 1279 | /* Report progress */ |
| 1280 | pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE, |
| 1281 | ++tuples_done); |
| 1282 | } |
| 1283 | } |
| 1284 | |
| 1285 | /* Close down final pages and write the metapage */ |
| 1286 | _bt_uppershutdown(wstate, state); |
| 1287 | |
| 1288 | /* |
| 1289 | * If the index is WAL-logged, we must fsync it down to disk before it's |
| 1290 | * safe to commit the transaction. (For a non-WAL-logged index we don't |
| 1291 | * care since the index will be uninteresting after a crash anyway.) |
| 1292 | * |
| 1293 | * It's obvious that we must do this when not WAL-logging the build. It's |
| 1294 | * less obvious that we have to do it even if we did WAL-log the index |
| 1295 | * pages. The reason is that since we're building outside shared buffers, |
| 1296 | * a CHECKPOINT occurring during the build has no way to flush the |
| 1297 | * previously written data to disk (indeed it won't know the index even |
| 1298 | * exists). A crash later on would replay WAL from the checkpoint, |
| 1299 | * therefore it wouldn't replay our earlier WAL entries. If we do not |
| 1300 | * fsync those pages here, they might still not be on disk when the crash |
| 1301 | * occurs. |
| 1302 | */ |
| 1303 | if (RelationNeedsWAL(wstate->index)) |
| 1304 | { |
| 1305 | RelationOpenSmgr(wstate->index); |
| 1306 | smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM); |
| 1307 | } |
| 1308 | } |
| 1309 | |
| 1310 | /* |
| 1311 | * Create parallel context, and launch workers for leader. |
| 1312 | * |
| 1313 | * buildstate argument should be initialized (with the exception of the |
| 1314 | * tuplesort state in spools, which may later be created based on shared |
| 1315 | * state initially set up here). |
| 1316 | * |
| 1317 | * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY. |
| 1318 | * |
| 1319 | * request is the target number of parallel worker processes to launch. |
| 1320 | * |
| 1321 | * Sets buildstate's BTLeader, which caller must use to shut down parallel |
| 1322 | * mode by passing it to _bt_end_parallel() at the very end of its index |
| 1323 | * build. If not even a single worker process can be launched, this is |
| 1324 | * never set, and caller should proceed with a serial index build. |
| 1325 | */ |
| 1326 | static void |
| 1327 | _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) |
| 1328 | { |
| 1329 | ParallelContext *pcxt; |
| 1330 | int scantuplesortstates; |
| 1331 | Snapshot snapshot; |
| 1332 | Size estbtshared; |
| 1333 | Size estsort; |
| 1334 | BTShared *btshared; |
| 1335 | Sharedsort *sharedsort; |
| 1336 | Sharedsort *sharedsort2; |
| 1337 | BTSpool *btspool = buildstate->spool; |
| 1338 | BTLeader *btleader = (BTLeader *) palloc0(sizeof(BTLeader)); |
| 1339 | bool leaderparticipates = true; |
| 1340 | char *sharedquery; |
| 1341 | int querylen; |
| 1342 | |
| 1343 | #ifdef DISABLE_LEADER_PARTICIPATION |
| 1344 | leaderparticipates = false; |
| 1345 | #endif |
| 1346 | |
| 1347 | /* |
| 1348 | * Enter parallel mode, and create context for parallel build of btree |
| 1349 | * index |
| 1350 | */ |
| 1351 | EnterParallelMode(); |
| 1352 | Assert(request > 0); |
| 1353 | pcxt = CreateParallelContext("postgres", "_bt_parallel_build_main", |
| 1354 | request); |
| 1355 | scantuplesortstates = leaderparticipates ? request + 1 : request; |
| 1356 | |
| 1357 | /* |
| 1358 | * Prepare for scan of the base relation. In a normal index build, we use |
| 1359 | * SnapshotAny because we must retrieve all tuples and do our own time |
| 1360 | * qual checks (because we have to index RECENTLY_DEAD tuples). In a |
| 1361 | * concurrent build, we take a regular MVCC snapshot and index whatever's |
| 1362 | * live according to that. |
| 1363 | */ |
| 1364 | if (!isconcurrent) |
| 1365 | snapshot = SnapshotAny; |
| 1366 | else |
| 1367 | snapshot = RegisterSnapshot(GetTransactionSnapshot()); |
| 1368 | |
| 1369 | /* |
| 1370 | * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and |
| 1371 | * PARALLEL_KEY_TUPLESORT tuplesort workspace |
| 1372 | */ |
| 1373 | estbtshared = _bt_parallel_estimate_shared(btspool->heap, snapshot); |
| 1374 | shm_toc_estimate_chunk(&pcxt->estimator, estbtshared); |
| 1375 | estsort = tuplesort_estimate_shared(scantuplesortstates); |
| 1376 | shm_toc_estimate_chunk(&pcxt->estimator, estsort); |
| 1377 | |
| 1378 | /* |
| 1379 | * Unique case requires a second spool, and so we may have to account for |
| 1380 | * another shared workspace for that -- PARALLEL_KEY_TUPLESORT_SPOOL2 |
| 1381 | */ |
| 1382 | if (!btspool->isunique) |
| 1383 | shm_toc_estimate_keys(&pcxt->estimator, 2); |
| 1384 | else |
| 1385 | { |
| 1386 | shm_toc_estimate_chunk(&pcxt->estimator, estsort); |
| 1387 | shm_toc_estimate_keys(&pcxt->estimator, 3); |
| 1388 | } |
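| | |
| | /* |
| | * (The 2 vs. 3 keys estimated above correspond to PARALLEL_KEY_BTREE_SHARED, |
| | * PARALLEL_KEY_TUPLESORT, and -- in the unique case -- |
| | * PARALLEL_KEY_TUPLESORT_SPOOL2.) |
| | */ |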
| 1389 | |
| 1390 | /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */ |
| 1391 | querylen = strlen(debug_query_string); |
| 1392 | shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1); |
| 1393 | shm_toc_estimate_keys(&pcxt->estimator, 1); |
| 1394 | |
| 1395 | /* Everyone's had a chance to ask for space, so now create the DSM */ |
| 1396 | InitializeParallelDSM(pcxt); |
| 1397 | |
| 1398 | /* Store shared build state, for which we reserved space */ |
| 1399 | btshared = (BTShared *) shm_toc_allocate(pcxt->toc, estbtshared); |
| 1400 | /* Initialize immutable state */ |
| 1401 | btshared->heaprelid = RelationGetRelid(btspool->heap); |
| 1402 | btshared->indexrelid = RelationGetRelid(btspool->index); |
| 1403 | btshared->isunique = btspool->isunique; |
| 1404 | btshared->isconcurrent = isconcurrent; |
| 1405 | btshared->scantuplesortstates = scantuplesortstates; |
| 1406 | ConditionVariableInit(&btshared->workersdonecv); |
| 1407 | SpinLockInit(&btshared->mutex); |
| 1408 | /* Initialize mutable state */ |
| 1409 | btshared->nparticipantsdone = 0; |
| 1410 | btshared->reltuples = 0.0; |
| 1411 | btshared->havedead = false; |
| 1412 | btshared->indtuples = 0.0; |
| 1413 | btshared->brokenhotchain = false; |
| 1414 | table_parallelscan_initialize(btspool->heap, |
| 1415 | ParallelTableScanFromBTShared(btshared), |
| 1416 | snapshot); |
| 1417 | |
| 1418 | /* |
| 1419 | * Store shared tuplesort-private state, for which we reserved space. |
| 1420 | * Then, initialize opaque state using tuplesort routine. |
| 1421 | */ |
| 1422 | sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort); |
| 1423 | tuplesort_initialize_shared(sharedsort, scantuplesortstates, |
| 1424 | pcxt->seg); |
| 1425 | |
| 1426 | shm_toc_insert(pcxt->toc, PARALLEL_KEY_BTREE_SHARED, btshared); |
| 1427 | shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort); |
| 1428 | |
| 1429 | /* Unique case requires a second spool, and associated shared state */ |
| 1430 | if (!btspool->isunique) |
| 1431 | sharedsort2 = NULL; |
| 1432 | else |
| 1433 | { |
| 1434 | /* |
| 1435 | * Store additional shared tuplesort-private state, for which we |
| 1436 | * reserved space. Then, initialize opaque state using tuplesort |
| 1437 | * routine. |
| 1438 | */ |
| 1439 | sharedsort2 = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort); |
| 1440 | tuplesort_initialize_shared(sharedsort2, scantuplesortstates, |
| 1441 | pcxt->seg); |
| 1442 | |
| 1443 | shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT_SPOOL2, sharedsort2); |
| 1444 | } |
| 1445 | |
| 1446 | /* Store query string for workers */ |
| 1447 | sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1); |
| 1448 | memcpy(sharedquery, debug_query_string, querylen + 1); |
| 1449 | shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, sharedquery); |
| 1450 | |
| 1451 | /* Launch workers, saving status for leader/caller */ |
| 1452 | LaunchParallelWorkers(pcxt); |
| 1453 | btleader->pcxt = pcxt; |
| 1454 | btleader->nparticipanttuplesorts = pcxt->nworkers_launched; |
| 1455 | if (leaderparticipates) |
| 1456 | btleader->nparticipanttuplesorts++; |
| 1457 | btleader->btshared = btshared; |
| 1458 | btleader->sharedsort = sharedsort; |
| 1459 | btleader->sharedsort2 = sharedsort2; |
| 1460 | btleader->snapshot = snapshot; |
| 1461 | |
| 1462 | /* If no workers were successfully launched, back out (do serial build) */ |
| 1463 | if (pcxt->nworkers_launched == 0) |
| 1464 | { |
| 1465 | _bt_end_parallel(btleader); |
| 1466 | return; |
| 1467 | } |
| 1468 | |
| 1469 | /* Save leader state now that it's clear build will be parallel */ |
| 1470 | buildstate->btleader = btleader; |
| 1471 | |
| 1472 | /* Join heap scan ourselves */ |
| 1473 | if (leaderparticipates) |
| 1474 | _bt_leader_participate_as_worker(buildstate); |
| 1475 | |
| 1476 | /* |
| 1477 | * Caller needs to wait for all launched workers when we return. Make |
| 1478 | * sure that the failure-to-start case will not hang forever. |
| 1479 | */ |
| 1480 | WaitForParallelWorkersToAttach(pcxt); |
| 1481 | } |
| 1482 | |
| 1483 | /* |
| 1484 | * Shut down workers, destroy parallel context, and end parallel mode. |
| 1485 | */ |
| 1486 | static void |
| 1487 | _bt_end_parallel(BTLeader *btleader) |
| 1488 | { |
| 1489 | /* Shutdown worker processes */ |
| 1490 | WaitForParallelWorkersToFinish(btleader->pcxt); |
| 1491 | /* Free last reference to MVCC snapshot, if one was used */ |
| 1492 | if (IsMVCCSnapshot(btleader->snapshot)) |
| 1493 | UnregisterSnapshot(btleader->snapshot); |
| 1494 | DestroyParallelContext(btleader->pcxt); |
| 1495 | ExitParallelMode(); |
| 1496 | } |
| 1497 | |
| 1498 | /* |
| 1499 | * Returns size of shared memory required to store state for a parallel |
| 1500 | * btree index build based on the snapshot its parallel scan will use. |
| 1501 | */ |
| 1502 | static Size |
| 1503 | _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot) |
| 1504 | { |
| 1505 | /* cf. shm_toc_allocate for why BUFFERALIGN is used */
| 1506 | return add_size(BUFFERALIGN(sizeof(BTShared)), |
| 1507 | table_parallelscan_estimate(heap, snapshot)); |
| 1508 | } |
| 1509 | |
| 1510 | /* |
| 1511 | * Within leader, wait for end of heap scan. |
| 1512 | * |
| 1513 | * When called, parallel heap scan started by _bt_begin_parallel() will |
| 1514 | * already be underway within worker processes (when leader participates |
| 1515 | * as a worker, we should end up here just as workers are finishing). |
| 1516 | * |
| 1517 | * Fills in fields needed for ambuild statistics, and lets caller set |
| 1518 | * field indicating that some worker encountered a broken HOT chain. |
| 1519 | * |
| 1520 | * Returns the total number of heap tuples scanned. |
| 1521 | */ |
| 1522 | static double |
| 1523 | _bt_parallel_heapscan(BTBuildState *buildstate, bool *brokenhotchain) |
| 1524 | { |
| 1525 | BTShared *btshared = buildstate->btleader->btshared; |
| 1526 | int nparticipanttuplesorts; |
| 1527 | double reltuples; |
| 1528 | |
| 1529 | nparticipanttuplesorts = buildstate->btleader->nparticipanttuplesorts; |
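| | /*
| | * Wait loop: check the shared done-count under the spinlock; if not all
| | * participants are finished, sleep on workersdonecv and re-check. The
| | * worker side of this protocol (incrementing nparticipantsdone and
| | * signaling the condition variable) is in _bt_parallel_scan_and_sort().
| | */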
| 1530 | for (;;) |
| 1531 | { |
| 1532 | SpinLockAcquire(&btshared->mutex); |
| 1533 | if (btshared->nparticipantsdone == nparticipanttuplesorts) |
| 1534 | { |
| 1535 | buildstate->havedead = btshared->havedead; |
| 1536 | buildstate->indtuples = btshared->indtuples; |
| 1537 | *brokenhotchain = btshared->brokenhotchain; |
| 1538 | reltuples = btshared->reltuples; |
| 1539 | SpinLockRelease(&btshared->mutex); |
| 1540 | break; |
| 1541 | } |
| 1542 | SpinLockRelease(&btshared->mutex); |
| 1543 | |
| 1544 | ConditionVariableSleep(&btshared->workersdonecv, |
| 1545 | WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN); |
| 1546 | } |
| 1547 | |
| 1548 | ConditionVariableCancelSleep(); |
| 1549 | |
| 1550 | return reltuples; |
| 1551 | } |
| 1552 | |
| 1553 | /* |
| 1554 | * Within leader, participate as a parallel worker. |
| 1555 | */ |
| 1556 | static void |
| 1557 | _bt_leader_participate_as_worker(BTBuildState *buildstate) |
| 1558 | { |
| 1559 | BTLeader *btleader = buildstate->btleader; |
| 1560 | BTSpool *leaderworker; |
| 1561 | BTSpool *leaderworker2; |
| 1562 | int sortmem; |
| 1563 | |
| 1564 | /* Allocate memory and initialize private spool */ |
| 1565 | leaderworker = (BTSpool *) palloc0(sizeof(BTSpool)); |
| 1566 | leaderworker->heap = buildstate->spool->heap; |
| 1567 | leaderworker->index = buildstate->spool->index; |
| 1568 | leaderworker->isunique = buildstate->spool->isunique; |
| 1569 | |
| 1570 | /* Initialize second spool, if required */ |
| 1571 | if (!btleader->btshared->isunique) |
| 1572 | leaderworker2 = NULL; |
| 1573 | else |
| 1574 | { |
| 1575 | /* Allocate memory for worker's own private secondary spool */ |
| 1576 | leaderworker2 = (BTSpool *) palloc0(sizeof(BTSpool)); |
| 1577 | |
| 1578 | /* Initialize worker's own secondary spool */ |
| 1579 | leaderworker2->heap = leaderworker->heap; |
| 1580 | leaderworker2->index = leaderworker->index; |
| 1581 | leaderworker2->isunique = false; |
| 1582 | } |
| 1583 | |
| 1584 | /* |
| 1585 | * Might as well use a reliable figure when doling out maintenance_work_mem
| 1586 | * (when the requested number of workers was not launched, this will be
| 1587 | * somewhat higher than it is for the other workers).
| 1588 | */ |
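| | /*
| | * Worked example (illustrative numbers, not taken from this build): with
| | * maintenance_work_mem at its default of 64MB (65536kB; the GUC is measured
| | * in kilobytes) and three participant tuplesorts, each participant is given
| | * 65536 / 3 = 21845kB, i.e. roughly 21MB.
| | */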
| 1589 | sortmem = maintenance_work_mem / btleader->nparticipanttuplesorts; |
| 1590 | |
| 1591 | /* Perform work common to all participants */ |
| 1592 | _bt_parallel_scan_and_sort(leaderworker, leaderworker2, btleader->btshared, |
| 1593 | btleader->sharedsort, btleader->sharedsort2, |
| 1594 | sortmem, true); |
| 1595 | |
| 1596 | #ifdef BTREE_BUILD_STATS |
| 1597 | if (log_btree_build_stats) |
| 1598 | { |
| 1599 | ShowUsage("BTREE BUILD (Leader Partial Spool) STATISTICS" ); |
| 1600 | ResetUsage(); |
| 1601 | } |
| 1602 | #endif /* BTREE_BUILD_STATS */ |
| 1603 | } |
| 1604 | |
| 1605 | /* |
| 1606 | * Perform work within a launched parallel process. |
| 1607 | */ |
| 1608 | void |
| 1609 | _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc) |
| 1610 | { |
| 1611 | char *sharedquery; |
| 1612 | BTSpool *btspool; |
| 1613 | BTSpool *btspool2; |
| 1614 | BTShared *btshared; |
| 1615 | Sharedsort *sharedsort; |
| 1616 | Sharedsort *sharedsort2; |
| 1617 | Relation heapRel; |
| 1618 | Relation indexRel; |
| 1619 | LOCKMODE heapLockmode; |
| 1620 | LOCKMODE indexLockmode; |
| 1621 | int sortmem; |
| 1622 | |
| 1623 | #ifdef BTREE_BUILD_STATS |
| 1624 | if (log_btree_build_stats) |
| 1625 | ResetUsage(); |
| 1626 | #endif /* BTREE_BUILD_STATS */ |
| 1627 | |
| 1628 | /* Set debug_query_string for individual workers first */ |
| 1629 | sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, false); |
| 1630 | debug_query_string = sharedquery; |
| 1631 | |
| 1632 | /* Report the query string from leader */ |
| 1633 | pgstat_report_activity(STATE_RUNNING, debug_query_string); |
| 1634 | |
| 1635 | /* Look up nbtree shared state */ |
| 1636 | btshared = shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false); |
| 1637 | |
| 1638 | /* Open relations using lock modes known to be obtained by index.c */ |
| 1639 | if (!btshared->isconcurrent) |
| 1640 | { |
| 1641 | heapLockmode = ShareLock; |
| 1642 | indexLockmode = AccessExclusiveLock; |
| 1643 | } |
| 1644 | else |
| 1645 | { |
| 1646 | heapLockmode = ShareUpdateExclusiveLock; |
| 1647 | indexLockmode = RowExclusiveLock; |
| 1648 | } |
| 1649 | |
| 1650 | /* Open relations within worker */ |
| 1651 | heapRel = table_open(btshared->heaprelid, heapLockmode); |
| 1652 | indexRel = index_open(btshared->indexrelid, indexLockmode); |
| 1653 | |
| 1654 | /* Initialize worker's own spool */ |
| 1655 | btspool = (BTSpool *) palloc0(sizeof(BTSpool)); |
| 1656 | btspool->heap = heapRel; |
| 1657 | btspool->index = indexRel; |
| 1658 | btspool->isunique = btshared->isunique; |
| 1659 | |
| 1660 | /* Look up shared state private to tuplesort.c */ |
| 1661 | sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false); |
| 1662 | tuplesort_attach_shared(sharedsort, seg); |
| 1663 | if (!btshared->isunique) |
| 1664 | { |
| 1665 | btspool2 = NULL; |
| 1666 | sharedsort2 = NULL; |
| 1667 | } |
| 1668 | else |
| 1669 | { |
| 1670 | /* Allocate memory for worker's own private secondary spool */ |
| 1671 | btspool2 = (BTSpool *) palloc0(sizeof(BTSpool)); |
| 1672 | |
| 1673 | /* Initialize worker's own secondary spool */ |
| 1674 | btspool2->heap = btspool->heap; |
| 1675 | btspool2->index = btspool->index; |
| 1676 | btspool2->isunique = false; |
| 1677 | /* Look up shared state private to tuplesort.c */ |
| 1678 | sharedsort2 = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT_SPOOL2, false); |
| 1679 | tuplesort_attach_shared(sharedsort2, seg); |
| 1680 | } |
| 1681 | |
| 1682 | /* Perform sorting of spool, and possibly a spool2 */ |
| 1683 | sortmem = maintenance_work_mem / btshared->scantuplesortstates; |
| 1684 | _bt_parallel_scan_and_sort(btspool, btspool2, btshared, sharedsort, |
| 1685 | sharedsort2, sortmem, false); |
| 1686 | |
| 1687 | #ifdef BTREE_BUILD_STATS |
| 1688 | if (log_btree_build_stats) |
| 1689 | { |
| 1690 | ShowUsage("BTREE BUILD (Worker Partial Spool) STATISTICS" ); |
| 1691 | ResetUsage(); |
| 1692 | } |
| 1693 | #endif /* BTREE_BUILD_STATS */ |
| 1694 | |
| 1695 | index_close(indexRel, indexLockmode); |
| 1696 | table_close(heapRel, heapLockmode); |
| 1697 | } |
| 1698 | |
| 1699 | /* |
| 1700 | * Perform a worker's portion of a parallel sort. |
| 1701 | * |
| 1702 | * This generates a tuplesort for the passed btspool, and a second tuplesort
| 1703 | * state if a second btspool is needed (i.e., for unique index builds). All
| 1704 | * other spool fields should already be set when this is called. |
| 1705 | * |
| 1706 | * sortmem is the amount of working memory to use within each worker, |
| 1707 | * expressed in KBs. |
| 1708 | * |
| 1709 | * When this returns, workers are done, and need only release resources. |
| 1710 | */ |
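| | /*
| | * The "progress" argument is forwarded to table_index_build_scan() below to
| | * control progress reporting; in this file the leader participant passes
| | * true and launched workers pass false.
| | */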
| 1711 | static void |
| 1712 | _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, |
| 1713 | BTShared *btshared, Sharedsort *sharedsort, |
| 1714 | Sharedsort *sharedsort2, int sortmem, bool progress) |
| 1715 | { |
| 1716 | SortCoordinate coordinate; |
| 1717 | BTBuildState buildstate; |
| 1718 | TableScanDesc scan; |
| 1719 | double reltuples; |
| 1720 | IndexInfo *indexInfo; |
| 1721 | |
| 1722 | /* Initialize local tuplesort coordination state */ |
| 1723 | coordinate = palloc0(sizeof(SortCoordinateData)); |
| 1724 | coordinate->isWorker = true; |
| 1725 | coordinate->nParticipants = -1; |
| 1726 | coordinate->sharedsort = sharedsort; |
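| | /*
| | * isWorker = true with nParticipants = -1 marks this tuplesort as a worker
| | * (producer) participant; only the leader's final merge, elsewhere in this
| | * file, sets isWorker to false and supplies the actual participant count.
| | */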
| 1727 | |
| 1728 | /* Begin "partial" tuplesort */ |
| 1729 | btspool->sortstate = tuplesort_begin_index_btree(btspool->heap, |
| 1730 | btspool->index, |
| 1731 | btspool->isunique, |
| 1732 | sortmem, coordinate, |
| 1733 | false); |
| 1734 | |
| 1735 | /* |
| 1736 | * Just as in the serial case, there may be a second spool. If so, a
| 1737 | * second, dedicated spool2 partial tuplesort is required.
| 1738 | */ |
| 1739 | if (btspool2) |
| 1740 | { |
| 1741 | SortCoordinate coordinate2; |
| 1742 | |
| 1743 | /* |
| 1744 | * We expect that the second one (for dead tuples) won't get very |
| 1745 | * full, so we give it only work_mem (unless sortmem is less for |
| 1746 | * worker). Worker processes are generally permitted to allocate |
| 1747 | * work_mem independently. |
| 1748 | */ |
| 1749 | coordinate2 = palloc0(sizeof(SortCoordinateData)); |
| 1750 | coordinate2->isWorker = true; |
| 1751 | coordinate2->nParticipants = -1; |
| 1752 | coordinate2->sharedsort = sharedsort2; |
| 1753 | btspool2->sortstate = |
| 1754 | tuplesort_begin_index_btree(btspool->heap, btspool->index, false, |
| 1755 | Min(sortmem, work_mem), coordinate2, |
| 1756 | false); |
| 1757 | } |
| 1758 | |
| 1759 | /* Fill in buildstate for _bt_build_callback() */ |
| 1760 | buildstate.isunique = btshared->isunique; |
| 1761 | buildstate.havedead = false; |
| 1762 | buildstate.heap = btspool->heap; |
| 1763 | buildstate.spool = btspool; |
| 1764 | buildstate.spool2 = btspool2; |
| 1765 | buildstate.indtuples = 0; |
| 1766 | buildstate.btleader = NULL; |
| 1767 | |
| 1768 | /* Join parallel scan */ |
| 1769 | indexInfo = BuildIndexInfo(btspool->index); |
| 1770 | indexInfo->ii_Concurrent = btshared->isconcurrent; |
| 1771 | scan = table_beginscan_parallel(btspool->heap, |
| 1772 | ParallelTableScanFromBTShared(btshared)); |
| 1773 | reltuples = table_index_build_scan(btspool->heap, btspool->index, indexInfo, |
| 1774 | true, progress, _bt_build_callback, |
| 1775 | (void *) &buildstate, scan); |
| 1776 | |
| 1777 | /* |
| 1778 | * Execute this worker's part of the sort. |
| 1779 | * |
| 1780 | * Unlike the leader and serial cases, we cannot skip calling
| 1781 | * tuplesort_performsort() for spool2 even if it ends up containing no
| 1782 | * dead tuples (tuplesort disallows skipping it for workers).
| 1783 | */ |
| 1784 | tuplesort_performsort(btspool->sortstate); |
| 1785 | if (btspool2) |
| 1786 | tuplesort_performsort(btspool2->sortstate); |
| 1787 | |
| 1788 | /* |
| 1789 | * Done. Record ambuild statistics, and whether we encountered a broken |
| 1790 | * HOT chain. |
| 1791 | */ |
| 1792 | SpinLockAcquire(&btshared->mutex); |
| 1793 | btshared->nparticipantsdone++; |
| 1794 | btshared->reltuples += reltuples; |
| 1795 | if (buildstate.havedead) |
| 1796 | btshared->havedead = true; |
| 1797 | btshared->indtuples += buildstate.indtuples; |
| 1798 | if (indexInfo->ii_BrokenHotChain) |
| 1799 | btshared->brokenhotchain = true; |
| 1800 | SpinLockRelease(&btshared->mutex); |
| 1801 | |
| 1802 | /* Notify leader */ |
| 1803 | ConditionVariableSignal(&btshared->workersdonecv); |
| 1804 | |
| 1805 | /* We can end tuplesorts immediately */ |
| 1806 | tuplesort_end(btspool->sortstate); |
| 1807 | if (btspool2) |
| 1808 | tuplesort_end(btspool2->sortstate); |
| 1809 | } |
| 1810 | |