1/*-------------------------------------------------------------------------
2 *
3 * nbtsort.c
4 * Build a btree from sorted input by loading leaf pages sequentially.
5 *
6 * NOTES
7 *
8 * We use tuplesort.c to sort the given index tuples into order.
9 * Then we scan the index tuples in order and build the btree pages
10 * for each level. We load source tuples into leaf-level pages.
11 * Whenever we fill a page at one level, we add a link to it to its
12 * parent level (starting a new parent level if necessary). When
13 * done, we write out each final page on each level, adding it to
14 * its parent level. When we have only one page on a level, it must be
15 * the root -- it can be attached to the btree metapage and we are done.
16 *
17 * It is not wise to pack the pages entirely full, since then *any*
18 * insertion would cause a split (and not only of the leaf page; the need
19 * for a split would cascade right up the tree). The steady-state load
20 * factor for btrees is usually estimated at 70%. We choose to pack leaf
21 * pages to the user-controllable fill factor (default 90%) while upper pages
22 * are always packed to 70%. This gives us reasonable density (there aren't
23 * many upper pages if the keys are reasonable-size) without risking a lot of
24 * cascading splits during early insertions.
25 *
26 * Formerly the index pages being built were kept in shared buffers, but
27 * that is of no value (since other backends have no interest in them yet)
28 * and it created locking problems for CHECKPOINT, because the upper-level
29 * pages were held exclusive-locked for long periods. Now we just build
30 * the pages in local memory and smgrwrite or smgrextend them as we finish
31 * them. They will need to be re-read into shared buffers on first use after
32 * the build finishes.
33 *
34 * Since the index will never be used unless it is completely built,
35 * from a crash-recovery point of view there is no need to WAL-log the
36 * steps of the build. After completing the index build, we can just sync
37 * the whole file to disk using smgrimmedsync() before exiting this module.
38 * This can be seen to be sufficient for crash recovery by considering that
39 * it's effectively equivalent to what would happen if a CHECKPOINT occurred
40 * just after the index build. However, it is clearly not sufficient if the
41 * DBA is using the WAL log for PITR or replication purposes, since another
42 * machine would not be able to reconstruct the index from WAL. Therefore,
43 * we log the completed index pages to WAL if and only if WAL archiving is
44 * active.
45 *
46 * This code isn't concerned about the FSM at all. The caller is responsible
47 * for initializing that.
48 *
49 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
50 * Portions Copyright (c) 1994, Regents of the University of California
51 *
52 * IDENTIFICATION
53 * src/backend/access/nbtree/nbtsort.c
54 *
55 *-------------------------------------------------------------------------
56 */
57
58#include "postgres.h"
59
60#include "access/nbtree.h"
61#include "access/parallel.h"
62#include "access/relscan.h"
63#include "access/table.h"
64#include "access/tableam.h"
65#include "access/xact.h"
66#include "access/xlog.h"
67#include "access/xloginsert.h"
68#include "catalog/index.h"
69#include "commands/progress.h"
70#include "miscadmin.h"
71#include "pgstat.h"
72#include "storage/smgr.h"
73#include "tcop/tcopprot.h" /* pgrminclude ignore */
74#include "utils/rel.h"
75#include "utils/sortsupport.h"
76#include "utils/tuplesort.h"
77
78
79/* Magic numbers for parallel state sharing */
80#define PARALLEL_KEY_BTREE_SHARED UINT64CONST(0xA000000000000001)
81#define PARALLEL_KEY_TUPLESORT UINT64CONST(0xA000000000000002)
82#define PARALLEL_KEY_TUPLESORT_SPOOL2 UINT64CONST(0xA000000000000003)
83#define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xA000000000000004)
84
85/*
86 * DISABLE_LEADER_PARTICIPATION disables the leader's participation in
87 * parallel index builds. This may be useful as a debugging aid.
88#undef DISABLE_LEADER_PARTICIPATION
89 */
90
/*
 * Status record for spooling/sorting phase.  (Note we may have two of
 * these due to the special requirements for uniqueness-checking with
 * dead tuples.)
 */
typedef struct BTSpool
{
	Tuplesortstate *sortstate;	/* state data for tuplesort.c */
	Relation	heap;			/* heap relation being indexed */
	Relation	index;			/* index relation being built */
	bool		isunique;		/* is the index unique? (passed to tuplesort) */
} BTSpool;
103
/*
 * Status for index builds performed in parallel.  This is allocated in a
 * dynamic shared memory segment.  Note that there is a separate tuplesort TOC
 * entry, private to tuplesort.c but allocated by this module on its behalf.
 */
typedef struct BTShared
{
	/*
	 * These fields are not modified during the sort.  They primarily exist
	 * for the benefit of worker processes that need to create BTSpool state
	 * corresponding to that used by the leader.
	 */
	Oid			heaprelid;		/* OID of heap relation being indexed */
	Oid			indexrelid;		/* OID of index being built */
	bool		isunique;		/* building a unique index? */
	bool		isconcurrent;	/* concurrent index build? */
	int			scantuplesortstates;	/* number of participant tuplesorts */

	/*
	 * workersdonecv is used to monitor the progress of workers.  All parallel
	 * participants must indicate that they are done before leader can use
	 * mutable state that workers maintain during scan (and before leader can
	 * proceed to tuplesort_performsort()).
	 */
	ConditionVariable workersdonecv;

	/*
	 * mutex protects the mutable status fields below (nparticipantsdone
	 * through brokenhotchain).  (There is no "heapdesc" field; the parallel
	 * table scan descriptor follows the struct instead -- see below.)
	 *
	 * These fields contain status information of interest to B-Tree index
	 * builds that must work just the same when an index is built in parallel.
	 */
	slock_t		mutex;

	/*
	 * Mutable state that is maintained by workers, and reported back to
	 * leader at end of parallel scan.
	 *
	 * nparticipantsdone is number of worker processes finished.
	 *
	 * reltuples is the total number of input heap tuples.
	 *
	 * havedead indicates if RECENTLY_DEAD tuples were encountered during
	 * build.
	 *
	 * indtuples is the total number of tuples that made it into the index.
	 *
	 * brokenhotchain indicates if any worker detected a broken HOT chain
	 * during build.
	 */
	int			nparticipantsdone;
	double		reltuples;
	bool		havedead;
	double		indtuples;
	bool		brokenhotchain;

	/*
	 * ParallelTableScanDescData data follows.  Can't directly embed here, as
	 * implementations of the parallel table scan desc interface might need
	 * stronger alignment.
	 */
} BTShared;
166
167/*
168 * Return pointer to a BTShared's parallel table scan.
169 *
170 * c.f. shm_toc_allocate as to why BUFFERALIGN is used, rather than just
171 * MAXALIGN.
172 */
173#define ParallelTableScanFromBTShared(shared) \
174 (ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(BTShared)))
175
/*
 * Status for leader in parallel index build.
 */
typedef struct BTLeader
{
	/* parallel context itself */
	ParallelContext *pcxt;

	/*
	 * nparticipanttuplesorts is the exact number of worker processes
	 * successfully launched, plus one leader process if it participates as a
	 * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader
	 * participating as a worker).
	 */
	int			nparticipanttuplesorts;

	/*
	 * Leader process convenience pointers to shared state (leader avoids TOC
	 * lookups).
	 *
	 * btshared is the shared state for entire build.  sharedsort is the
	 * shared, tuplesort-managed state passed to each process tuplesort.
	 * sharedsort2 is the corresponding btspool2 shared state, used only when
	 * building unique indexes.  snapshot is the snapshot used by the scan iff
	 * an MVCC snapshot is required.
	 */
	BTShared   *btshared;		/* shared build status/coordination */
	Sharedsort *sharedsort;		/* shared tuplesort state, primary spool */
	Sharedsort *sharedsort2;	/* shared tuplesort state, dead-tuple spool */
	Snapshot	snapshot;		/* scan snapshot, iff an MVCC one is needed */
} BTLeader;
207
/*
 * Working state for btbuild and its callback.
 *
 * When parallel CREATE INDEX is used, there is a BTBuildState for each
 * participant.
 */
typedef struct BTBuildState
{
	bool		isunique;		/* building a unique index? */
	bool		havedead;		/* true once any dead tuple was spooled */
	Relation	heap;			/* heap relation being indexed */
	BTSpool    *spool;			/* primary spool (live tuples) */

	/*
	 * spool2 is needed only when the index is a unique index. Dead tuples are
	 * put into spool2 instead of spool in order to avoid uniqueness check.
	 */
	BTSpool    *spool2;
	double		indtuples;		/* # tuples spooled so far */

	/*
	 * btleader is only present when a parallel index build is performed, and
	 * only in the leader process. (Actually, only the leader has a
	 * BTBuildState.  Workers have their own spool and spool2, though.)
	 */
	BTLeader   *btleader;
} BTBuildState;
235
/*
 * Status record for a btree page being built.  We have one of these
 * for each active tree level.
 *
 * The reason we need to store a copy of the minimum key is that we'll
 * need to propagate it to the parent node when this page is linked
 * into its parent.  However, if the page is not a leaf page, the first
 * entry on the page doesn't need to contain a key, so we will not have
 * stored the key itself on the page.  (You might think we could skip
 * copying the minimum key on leaf pages, but actually we must have a
 * writable copy anyway because we'll poke the page's address into it
 * before passing it up to the parent...)
 */
typedef struct BTPageState
{
	Page		btps_page;		/* workspace for page building */
	BlockNumber btps_blkno;		/* block # to write this page at */
	IndexTuple	btps_minkey;	/* copy of minimum key (first item) on page */
	OffsetNumber btps_lastoff;	/* last item offset loaded */
	uint32		btps_level;		/* tree level (0 = leaf) */
	Size		btps_full;		/* page is "full" once free space drops
								 * below this */
	struct BTPageState *btps_next;	/* link to parent level, if any */
} BTPageState;
259
/*
 * Overall status record for index writing phase.  Shared by _bt_load and
 * the page-construction helpers below.
 */
typedef struct BTWriteState
{
	Relation	heap;			/* source heap relation */
	Relation	index;			/* index relation being written out */
	BTScanInsert inskey;		/* generic insertion scankey */
	bool		btws_use_wal;	/* dump pages to WAL? */
	BlockNumber btws_pages_alloced; /* # pages allocated (block 0 is reserved
									 * for the metapage) */
	BlockNumber btws_pages_written; /* # pages written out */
	Page		btws_zeropage;	/* workspace for filling zeroes */
} BTWriteState;
273
274
275static double _bt_spools_heapscan(Relation heap, Relation index,
276 BTBuildState *buildstate, IndexInfo *indexInfo);
277static void _bt_spooldestroy(BTSpool *btspool);
278static void _bt_spool(BTSpool *btspool, ItemPointer self,
279 Datum *values, bool *isnull);
280static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2);
281static void _bt_build_callback(Relation index, HeapTuple htup, Datum *values,
282 bool *isnull, bool tupleIsAlive, void *state);
283static Page _bt_blnewpage(uint32 level);
284static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level);
285static void _bt_slideleft(Page page);
286static void _bt_sortaddtup(Page page, Size itemsize,
287 IndexTuple itup, OffsetNumber itup_off);
288static void _bt_buildadd(BTWriteState *wstate, BTPageState *state,
289 IndexTuple itup);
290static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state);
291static void _bt_load(BTWriteState *wstate,
292 BTSpool *btspool, BTSpool *btspool2);
293static void _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent,
294 int request);
295static void _bt_end_parallel(BTLeader *btleader);
296static Size _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot);
297static double _bt_parallel_heapscan(BTBuildState *buildstate,
298 bool *brokenhotchain);
299static void _bt_leader_participate_as_worker(BTBuildState *buildstate);
300static void _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
301 BTShared *btshared, Sharedsort *sharedsort,
302 Sharedsort *sharedsort2, int sortmem,
303 bool progress);
304
305
306/*
307 * btbuild() -- build a new btree index.
308 */
309IndexBuildResult *
310btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
311{
312 IndexBuildResult *result;
313 BTBuildState buildstate;
314 double reltuples;
315
316#ifdef BTREE_BUILD_STATS
317 if (log_btree_build_stats)
318 ResetUsage();
319#endif /* BTREE_BUILD_STATS */
320
321 buildstate.isunique = indexInfo->ii_Unique;
322 buildstate.havedead = false;
323 buildstate.heap = heap;
324 buildstate.spool = NULL;
325 buildstate.spool2 = NULL;
326 buildstate.indtuples = 0;
327 buildstate.btleader = NULL;
328
329 /*
330 * We expect to be called exactly once for any index relation. If that's
331 * not the case, big trouble's what we have.
332 */
333 if (RelationGetNumberOfBlocks(index) != 0)
334 elog(ERROR, "index \"%s\" already contains data",
335 RelationGetRelationName(index));
336
337 reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo);
338
339 /*
340 * Finish the build by (1) completing the sort of the spool file, (2)
341 * inserting the sorted tuples into btree pages and (3) building the upper
342 * levels. Finally, it may also be necessary to end use of parallelism.
343 */
344 _bt_leafbuild(buildstate.spool, buildstate.spool2);
345 _bt_spooldestroy(buildstate.spool);
346 if (buildstate.spool2)
347 _bt_spooldestroy(buildstate.spool2);
348 if (buildstate.btleader)
349 _bt_end_parallel(buildstate.btleader);
350
351 result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
352
353 result->heap_tuples = reltuples;
354 result->index_tuples = buildstate.indtuples;
355
356#ifdef BTREE_BUILD_STATS
357 if (log_btree_build_stats)
358 {
359 ShowUsage("BTREE BUILD STATS");
360 ResetUsage();
361 }
362#endif /* BTREE_BUILD_STATS */
363
364 return result;
365}
366
367/*
368 * Create and initialize one or two spool structures, and save them in caller's
369 * buildstate argument. May also fill-in fields within indexInfo used by index
370 * builds.
371 *
372 * Scans the heap, possibly in parallel, filling spools with IndexTuples. This
373 * routine encapsulates all aspects of managing parallelism. Caller need only
374 * call _bt_end_parallel() in parallel case after it is done with spool/spool2.
375 *
376 * Returns the total number of heap tuples scanned.
377 */
378static double
379_bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate,
380 IndexInfo *indexInfo)
381{
382 BTSpool *btspool = (BTSpool *) palloc0(sizeof(BTSpool));
383 SortCoordinate coordinate = NULL;
384 double reltuples = 0;
385
386 /*
387 * We size the sort area as maintenance_work_mem rather than work_mem to
388 * speed index creation. This should be OK since a single backend can't
389 * run multiple index creations in parallel (see also: notes on
390 * parallelism and maintenance_work_mem below).
391 */
392 btspool->heap = heap;
393 btspool->index = index;
394 btspool->isunique = indexInfo->ii_Unique;
395
396 /* Save as primary spool */
397 buildstate->spool = btspool;
398
399 /* Report table scan phase started */
400 pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
401 PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN);
402
403 /* Attempt to launch parallel worker scan when required */
404 if (indexInfo->ii_ParallelWorkers > 0)
405 _bt_begin_parallel(buildstate, indexInfo->ii_Concurrent,
406 indexInfo->ii_ParallelWorkers);
407
408 /*
409 * If parallel build requested and at least one worker process was
410 * successfully launched, set up coordination state
411 */
412 if (buildstate->btleader)
413 {
414 coordinate = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
415 coordinate->isWorker = false;
416 coordinate->nParticipants =
417 buildstate->btleader->nparticipanttuplesorts;
418 coordinate->sharedsort = buildstate->btleader->sharedsort;
419 }
420
421 /*
422 * Begin serial/leader tuplesort.
423 *
424 * In cases where parallelism is involved, the leader receives the same
425 * share of maintenance_work_mem as a serial sort (it is generally treated
426 * in the same way as a serial sort once we return). Parallel worker
427 * Tuplesortstates will have received only a fraction of
428 * maintenance_work_mem, though.
429 *
430 * We rely on the lifetime of the Leader Tuplesortstate almost not
431 * overlapping with any worker Tuplesortstate's lifetime. There may be
432 * some small overlap, but that's okay because we rely on leader
433 * Tuplesortstate only allocating a small, fixed amount of memory here.
434 * When its tuplesort_performsort() is called (by our caller), and
435 * significant amounts of memory are likely to be used, all workers must
436 * have already freed almost all memory held by their Tuplesortstates
437 * (they are about to go away completely, too). The overall effect is
438 * that maintenance_work_mem always represents an absolute high watermark
439 * on the amount of memory used by a CREATE INDEX operation, regardless of
440 * the use of parallelism or any other factor.
441 */
442 buildstate->spool->sortstate =
443 tuplesort_begin_index_btree(heap, index, buildstate->isunique,
444 maintenance_work_mem, coordinate,
445 false);
446
447 /*
448 * If building a unique index, put dead tuples in a second spool to keep
449 * them out of the uniqueness check. We expect that the second spool (for
450 * dead tuples) won't get very full, so we give it only work_mem.
451 */
452 if (indexInfo->ii_Unique)
453 {
454 BTSpool *btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));
455 SortCoordinate coordinate2 = NULL;
456
457 /* Initialize secondary spool */
458 btspool2->heap = heap;
459 btspool2->index = index;
460 btspool2->isunique = false;
461 /* Save as secondary spool */
462 buildstate->spool2 = btspool2;
463
464 if (buildstate->btleader)
465 {
466 /*
467 * Set up non-private state that is passed to
468 * tuplesort_begin_index_btree() about the basic high level
469 * coordination of a parallel sort.
470 */
471 coordinate2 = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
472 coordinate2->isWorker = false;
473 coordinate2->nParticipants =
474 buildstate->btleader->nparticipanttuplesorts;
475 coordinate2->sharedsort = buildstate->btleader->sharedsort2;
476 }
477
478 /*
479 * We expect that the second one (for dead tuples) won't get very
480 * full, so we give it only work_mem
481 */
482 buildstate->spool2->sortstate =
483 tuplesort_begin_index_btree(heap, index, false, work_mem,
484 coordinate2, false);
485 }
486
487 /* Fill spool using either serial or parallel heap scan */
488 if (!buildstate->btleader)
489 reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
490 _bt_build_callback, (void *) buildstate,
491 NULL);
492 else
493 reltuples = _bt_parallel_heapscan(buildstate,
494 &indexInfo->ii_BrokenHotChain);
495
496 /*
497 * Set the progress target for the next phase. Reset the block number
498 * values set by table_index_build_scan
499 */
500 {
501 const int index[] = {
502 PROGRESS_CREATEIDX_TUPLES_TOTAL,
503 PROGRESS_SCAN_BLOCKS_TOTAL,
504 PROGRESS_SCAN_BLOCKS_DONE
505 };
506 const int64 val[] = {
507 buildstate->indtuples,
508 0, 0
509 };
510
511 pgstat_progress_update_multi_param(3, index, val);
512 }
513
514 /* okay, all heap tuples are spooled */
515 if (buildstate->spool2 && !buildstate->havedead)
516 {
517 /* spool2 turns out to be unnecessary */
518 _bt_spooldestroy(buildstate->spool2);
519 buildstate->spool2 = NULL;
520 }
521
522 return reltuples;
523}
524
525/*
526 * clean up a spool structure and its substructures.
527 */
528static void
529_bt_spooldestroy(BTSpool *btspool)
530{
531 tuplesort_end(btspool->sortstate);
532 pfree(btspool);
533}
534
535/*
536 * spool an index entry into the sort file.
537 */
538static void
539_bt_spool(BTSpool *btspool, ItemPointer self, Datum *values, bool *isnull)
540{
541 tuplesort_putindextuplevalues(btspool->sortstate, btspool->index,
542 self, values, isnull);
543}
544
545/*
546 * given a spool loaded by successive calls to _bt_spool,
547 * create an entire btree.
548 */
549static void
550_bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
551{
552 BTWriteState wstate;
553
554#ifdef BTREE_BUILD_STATS
555 if (log_btree_build_stats)
556 {
557 ShowUsage("BTREE BUILD (Spool) STATISTICS");
558 ResetUsage();
559 }
560#endif /* BTREE_BUILD_STATS */
561
562 pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
563 PROGRESS_BTREE_PHASE_PERFORMSORT_1);
564 tuplesort_performsort(btspool->sortstate);
565 if (btspool2)
566 {
567 pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
568 PROGRESS_BTREE_PHASE_PERFORMSORT_2);
569 tuplesort_performsort(btspool2->sortstate);
570 }
571
572 wstate.heap = btspool->heap;
573 wstate.index = btspool->index;
574 wstate.inskey = _bt_mkscankey(wstate.index, NULL);
575
576 /*
577 * We need to log index creation in WAL iff WAL archiving/streaming is
578 * enabled UNLESS the index isn't WAL-logged anyway.
579 */
580 wstate.btws_use_wal = XLogIsNeeded() && RelationNeedsWAL(wstate.index);
581
582 /* reserve the metapage */
583 wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
584 wstate.btws_pages_written = 0;
585 wstate.btws_zeropage = NULL; /* until needed */
586
587 pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
588 PROGRESS_BTREE_PHASE_LEAF_LOAD);
589 _bt_load(&wstate, btspool, btspool2);
590}
591
592/*
593 * Per-tuple callback for table_index_build_scan
594 */
595static void
596_bt_build_callback(Relation index,
597 HeapTuple htup,
598 Datum *values,
599 bool *isnull,
600 bool tupleIsAlive,
601 void *state)
602{
603 BTBuildState *buildstate = (BTBuildState *) state;
604
605 /*
606 * insert the index tuple into the appropriate spool file for subsequent
607 * processing
608 */
609 if (tupleIsAlive || buildstate->spool2 == NULL)
610 _bt_spool(buildstate->spool, &htup->t_self, values, isnull);
611 else
612 {
613 /* dead tuples are put into spool2 */
614 buildstate->havedead = true;
615 _bt_spool(buildstate->spool2, &htup->t_self, values, isnull);
616 }
617
618 buildstate->indtuples += 1;
619}
620
621/*
622 * allocate workspace for a new, clean btree page, not linked to any siblings.
623 */
624static Page
625_bt_blnewpage(uint32 level)
626{
627 Page page;
628 BTPageOpaque opaque;
629
630 page = (Page) palloc(BLCKSZ);
631
632 /* Zero the page and set up standard page header info */
633 _bt_pageinit(page, BLCKSZ);
634
635 /* Initialize BT opaque state */
636 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
637 opaque->btpo_prev = opaque->btpo_next = P_NONE;
638 opaque->btpo.level = level;
639 opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF;
640 opaque->btpo_cycleid = 0;
641
642 /* Make the P_HIKEY line pointer appear allocated */
643 ((PageHeader) page)->pd_lower += sizeof(ItemIdData);
644
645 return page;
646}
647
648/*
649 * emit a completed btree page, and release the working storage.
650 */
651static void
652_bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
653{
654 /* Ensure rd_smgr is open (could have been closed by relcache flush!) */
655 RelationOpenSmgr(wstate->index);
656
657 /* XLOG stuff */
658 if (wstate->btws_use_wal)
659 {
660 /* We use the heap NEWPAGE record type for this */
661 log_newpage(&wstate->index->rd_node, MAIN_FORKNUM, blkno, page, true);
662 }
663
664 /*
665 * If we have to write pages nonsequentially, fill in the space with
666 * zeroes until we come back and overwrite. This is not logically
667 * necessary on standard Unix filesystems (unwritten space will read as
668 * zeroes anyway), but it should help to avoid fragmentation. The dummy
669 * pages aren't WAL-logged though.
670 */
671 while (blkno > wstate->btws_pages_written)
672 {
673 if (!wstate->btws_zeropage)
674 wstate->btws_zeropage = (Page) palloc0(BLCKSZ);
675 /* don't set checksum for all-zero page */
676 smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM,
677 wstate->btws_pages_written++,
678 (char *) wstate->btws_zeropage,
679 true);
680 }
681
682 PageSetChecksumInplace(page, blkno);
683
684 /*
685 * Now write the page. There's no need for smgr to schedule an fsync for
686 * this write; we'll do it ourselves before ending the build.
687 */
688 if (blkno == wstate->btws_pages_written)
689 {
690 /* extending the file... */
691 smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM, blkno,
692 (char *) page, true);
693 wstate->btws_pages_written++;
694 }
695 else
696 {
697 /* overwriting a block we zero-filled before */
698 smgrwrite(wstate->index->rd_smgr, MAIN_FORKNUM, blkno,
699 (char *) page, true);
700 }
701
702 pfree(page);
703}
704
705/*
706 * allocate and initialize a new BTPageState. the returned structure
707 * is suitable for immediate use by _bt_buildadd.
708 */
709static BTPageState *
710_bt_pagestate(BTWriteState *wstate, uint32 level)
711{
712 BTPageState *state = (BTPageState *) palloc0(sizeof(BTPageState));
713
714 /* create initial page for level */
715 state->btps_page = _bt_blnewpage(level);
716
717 /* and assign it a page position */
718 state->btps_blkno = wstate->btws_pages_alloced++;
719
720 state->btps_minkey = NULL;
721 /* initialize lastoff so first item goes into P_FIRSTKEY */
722 state->btps_lastoff = P_HIKEY;
723 state->btps_level = level;
724 /* set "full" threshold based on level. See notes at head of file. */
725 if (level > 0)
726 state->btps_full = (BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100);
727 else
728 state->btps_full = RelationGetTargetPageFreeSpace(wstate->index,
729 BTREE_DEFAULT_FILLFACTOR);
730 /* no parent level, yet */
731 state->btps_next = NULL;
732
733 return state;
734}
735
736/*
737 * slide an array of ItemIds back one slot (from P_FIRSTKEY to
738 * P_HIKEY, overwriting P_HIKEY). we need to do this when we discover
739 * that we have built an ItemId array in what has turned out to be a
740 * P_RIGHTMOST page.
741 */
742static void
743_bt_slideleft(Page page)
744{
745 OffsetNumber off;
746 OffsetNumber maxoff;
747 ItemId previi;
748 ItemId thisii;
749
750 if (!PageIsEmpty(page))
751 {
752 maxoff = PageGetMaxOffsetNumber(page);
753 previi = PageGetItemId(page, P_HIKEY);
754 for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off))
755 {
756 thisii = PageGetItemId(page, off);
757 *previi = *thisii;
758 previi = thisii;
759 }
760 ((PageHeader) page)->pd_lower -= sizeof(ItemIdData);
761 }
762}
763
764/*
765 * Add an item to a page being built.
766 *
767 * The main difference between this routine and a bare PageAddItem call
768 * is that this code knows that the leftmost data item on a non-leaf
769 * btree page doesn't need to have a key. Therefore, it strips such
770 * items down to just the item header.
771 *
772 * This is almost like nbtinsert.c's _bt_pgaddtup(), but we can't use
773 * that because it assumes that P_RIGHTMOST() will return the correct
774 * answer for the page. Here, we don't know yet if the page will be
775 * rightmost. Offset P_FIRSTKEY is always the first data key.
776 */
777static void
778_bt_sortaddtup(Page page,
779 Size itemsize,
780 IndexTuple itup,
781 OffsetNumber itup_off)
782{
783 BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
784 IndexTupleData trunctuple;
785
786 if (!P_ISLEAF(opaque) && itup_off == P_FIRSTKEY)
787 {
788 trunctuple = *itup;
789 trunctuple.t_info = sizeof(IndexTupleData);
790 /* Deliberately zero INDEX_ALT_TID_MASK bits */
791 BTreeTupleSetNAtts(&trunctuple, 0);
792 itup = &trunctuple;
793 itemsize = sizeof(IndexTupleData);
794 }
795
796 if (PageAddItem(page, (Item) itup, itemsize, itup_off,
797 false, false) == InvalidOffsetNumber)
798 elog(ERROR, "failed to add item to the index page");
799}
800
801/*----------
802 * Add an item to a disk page from the sort output.
803 *
804 * We must be careful to observe the page layout conventions of nbtsearch.c:
805 * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY.
806 * - on non-leaf pages, the key portion of the first item need not be
807 * stored, we should store only the link.
808 *
809 * A leaf page being built looks like:
810 *
811 * +----------------+---------------------------------+
812 * | PageHeaderData | linp0 linp1 linp2 ... |
813 * +-----------+----+---------------------------------+
814 * | ... linpN | |
815 * +-----------+--------------------------------------+
816 * | ^ last |
817 * | |
818 * +-------------+------------------------------------+
819 * | | itemN ... |
820 * +-------------+------------------+-----------------+
821 * | ... item3 item2 item1 | "special space" |
822 * +--------------------------------+-----------------+
823 *
824 * Contrast this with the diagram in bufpage.h; note the mismatch
825 * between linps and items. This is because we reserve linp0 as a
826 * placeholder for the pointer to the "high key" item; when we have
827 * filled up the page, we will set linp0 to point to itemN and clear
828 * linpN. On the other hand, if we find this is the last (rightmost)
829 * page, we leave the items alone and slide the linp array over. If
830 * the high key is to be truncated, offset 1 is deleted, and we insert
831 * the truncated high key at offset 1.
832 *
833 * 'last' pointer indicates the last offset added to the page.
834 *----------
835 */
836static void
837_bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
838{
839 Page npage;
840 BlockNumber nblkno;
841 OffsetNumber last_off;
842 Size pgspc;
843 Size itupsz;
844 bool isleaf;
845
846 /*
847 * This is a handy place to check for cancel interrupts during the btree
848 * load phase of index creation.
849 */
850 CHECK_FOR_INTERRUPTS();
851
852 npage = state->btps_page;
853 nblkno = state->btps_blkno;
854 last_off = state->btps_lastoff;
855
856 pgspc = PageGetFreeSpace(npage);
857 itupsz = IndexTupleSize(itup);
858 itupsz = MAXALIGN(itupsz);
859 /* Leaf case has slightly different rules due to suffix truncation */
860 isleaf = (state->btps_level == 0);
861
862 /*
863 * Check whether the new item can fit on a btree page on current level at
864 * all.
865 *
866 * Every newly built index will treat heap TID as part of the keyspace,
867 * which imposes the requirement that new high keys must occasionally have
868 * a heap TID appended within _bt_truncate(). That may leave a new pivot
869 * tuple one or two MAXALIGN() quantums larger than the original first
870 * right tuple it's derived from. v4 deals with the problem by decreasing
871 * the limit on the size of tuples inserted on the leaf level by the same
872 * small amount. Enforce the new v4+ limit on the leaf level, and the old
873 * limit on internal levels, since pivot tuples may need to make use of
874 * the reserved space. This should never fail on internal pages.
875 */
876 if (unlikely(itupsz > BTMaxItemSize(npage)))
877 _bt_check_third_page(wstate->index, wstate->heap, isleaf, npage,
878 itup);
879
880 /*
881 * Check to see if current page will fit new item, with space left over to
882 * append a heap TID during suffix truncation when page is a leaf page.
883 *
884 * It is guaranteed that we can fit at least 2 non-pivot tuples plus a
885 * high key with heap TID when finishing off a leaf page, since we rely on
886 * _bt_check_third_page() rejecting oversized non-pivot tuples. On
887 * internal pages we can always fit 3 pivot tuples with larger internal
888 * page tuple limit (includes page high key).
889 *
890 * Most of the time, a page is only "full" in the sense that the soft
891 * fillfactor-wise limit has been exceeded. However, we must always leave
892 * at least two items plus a high key on each page before starting a new
893 * page. Disregard fillfactor and insert on "full" current page if we
894 * don't have the minimum number of items yet. (Note that we deliberately
895 * assume that suffix truncation neither enlarges nor shrinks new high key
896 * when applying soft limit.)
897 */
898 if (pgspc < itupsz + (isleaf ? MAXALIGN(sizeof(ItemPointerData)) : 0) ||
899 (pgspc < state->btps_full && last_off > P_FIRSTKEY))
900 {
901 /*
902 * Finish off the page and write it out.
903 */
904 Page opage = npage;
905 BlockNumber oblkno = nblkno;
906 ItemId ii;
907 ItemId hii;
908 IndexTuple oitup;
909
910 /* Create new page of same level */
911 npage = _bt_blnewpage(state->btps_level);
912
913 /* and assign it a page position */
914 nblkno = wstate->btws_pages_alloced++;
915
916 /*
917 * We copy the last item on the page into the new page, and then
918 * rearrange the old page so that the 'last item' becomes its high key
919 * rather than a true data item. There had better be at least two
920 * items on the page already, else the page would be empty of useful
921 * data.
922 */
923 Assert(last_off > P_FIRSTKEY);
924 ii = PageGetItemId(opage, last_off);
925 oitup = (IndexTuple) PageGetItem(opage, ii);
926 _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY);
927
928 /*
929 * Move 'last' into the high key position on opage. _bt_blnewpage()
930 * allocated empty space for a line pointer when opage was first
931 * created, so this is a matter of rearranging already-allocated space
932 * on page, and initializing high key line pointer. (Actually, leaf
933 * pages must also swap oitup with a truncated version of oitup, which
934 * is sometimes larger than oitup, though never by more than the space
935 * needed to append a heap TID.)
936 */
937 hii = PageGetItemId(opage, P_HIKEY);
938 *hii = *ii;
939 ItemIdSetUnused(ii); /* redundant */
940 ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);
941
942 if (isleaf)
943 {
944 IndexTuple lastleft;
945 IndexTuple truncated;
946 Size truncsz;
947
948 /*
949 * Truncate away any unneeded attributes from high key on leaf
950 * level. This is only done at the leaf level because downlinks
951 * in internal pages are either negative infinity items, or get
952 * their contents from copying from one level down. See also:
953 * _bt_split().
954 *
955 * We don't try to bias our choice of split point to make it more
956 * likely that _bt_truncate() can truncate away more attributes,
957 * whereas the split point passed to _bt_split() is chosen much
958 * more delicately. Suffix truncation is mostly useful because it
959 * improves space utilization for workloads with random
960 * insertions. It doesn't seem worthwhile to add logic for
961 * choosing a split point here for a benefit that is bound to be
962 * much smaller.
963 *
964 * Since the truncated tuple is often smaller than the original
965 * tuple, it cannot just be copied in place (besides, we want to
966 * actually save space on the leaf page). We delete the original
967 * high key, and add our own truncated high key at the same
968 * offset.
969 *
970 * Note that the page layout won't be changed very much. oitup is
971 * already located at the physical beginning of tuple space, so we
972 * only shift the line pointer array back and forth, and overwrite
973 * the tuple space previously occupied by oitup. This is fairly
974 * cheap.
975 */
976 ii = PageGetItemId(opage, OffsetNumberPrev(last_off));
977 lastleft = (IndexTuple) PageGetItem(opage, ii);
978
979 truncated = _bt_truncate(wstate->index, lastleft, oitup,
980 wstate->inskey);
981 truncsz = IndexTupleSize(truncated);
982 PageIndexTupleDelete(opage, P_HIKEY);
983 _bt_sortaddtup(opage, truncsz, truncated, P_HIKEY);
984 pfree(truncated);
985
986 /* oitup should continue to point to the page's high key */
987 hii = PageGetItemId(opage, P_HIKEY);
988 oitup = (IndexTuple) PageGetItem(opage, hii);
989 }
990
991 /*
992 * Link the old page into its parent, using its minimum key. If we
993 * don't have a parent, we have to create one; this adds a new btree
994 * level.
995 */
996 if (state->btps_next == NULL)
997 state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);
998
999 Assert((BTreeTupleGetNAtts(state->btps_minkey, wstate->index) <=
1000 IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
1001 BTreeTupleGetNAtts(state->btps_minkey, wstate->index) > 0) ||
1002 P_LEFTMOST((BTPageOpaque) PageGetSpecialPointer(opage)));
1003 Assert(BTreeTupleGetNAtts(state->btps_minkey, wstate->index) == 0 ||
1004 !P_LEFTMOST((BTPageOpaque) PageGetSpecialPointer(opage)));
1005 BTreeInnerTupleSetDownLink(state->btps_minkey, oblkno);
1006 _bt_buildadd(wstate, state->btps_next, state->btps_minkey);
1007 pfree(state->btps_minkey);
1008
1009 /*
1010 * Save a copy of the high key from the old page. It is also used as
1011 * the minimum key for the new page.
1012 */
1013 state->btps_minkey = CopyIndexTuple(oitup);
1014
1015 /*
1016 * Set the sibling links for both pages.
1017 */
1018 {
1019 BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage);
1020 BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(npage);
1021
1022 oopaque->btpo_next = nblkno;
1023 nopaque->btpo_prev = oblkno;
1024 nopaque->btpo_next = P_NONE; /* redundant */
1025 }
1026
1027 /*
1028 * Write out the old page. We never need to touch it again, so we can
1029 * free the opage workspace too.
1030 */
1031 _bt_blwritepage(wstate, opage, oblkno);
1032
1033 /*
1034 * Reset last_off to point to new page
1035 */
1036 last_off = P_FIRSTKEY;
1037 }
1038
1039 /*
1040 * By here, either original page is still the current page, or a new page
1041 * was created that became the current page. Either way, the current page
1042 * definitely has space for new item.
1043 *
1044 * If the new item is the first for its page, stash a copy for later. Note
1045 * this will only happen for the first item on a level; on later pages,
1046 * the first item for a page is copied from the prior page in the code
1047 * above. The minimum key for an entire level is nothing more than a
1048 * minus infinity (downlink only) pivot tuple placeholder.
1049 */
1050 if (last_off == P_HIKEY)
1051 {
1052 Assert(state->btps_minkey == NULL);
1053 state->btps_minkey = CopyIndexTuple(itup);
1054 /* _bt_sortaddtup() will perform full truncation later */
1055 BTreeTupleSetNAtts(state->btps_minkey, 0);
1056 }
1057
1058 /*
1059 * Add the new item into the current page.
1060 */
1061 last_off = OffsetNumberNext(last_off);
1062 _bt_sortaddtup(npage, itupsz, itup, last_off);
1063
1064 state->btps_page = npage;
1065 state->btps_blkno = nblkno;
1066 state->btps_lastoff = last_off;
1067}
1068
/*
 * Finish writing out the completed btree.
 *
 * Walks the linked list of per-level states (leaf level first), completing
 * the final, still-pending page of each level: every such page is the
 * rightmost page of its level, so it is linked into its parent (possibly
 * splitting the parent's last page -- harmless, since we haven't reached
 * that level yet), slid left, and written out.  The single page of the
 * topmost level becomes the root.  Finally the metapage is written, which
 * is what makes the index valid.
 *
 * "state" may be NULL when no tuples were loaded at all; in that case we
 * just write a metapage whose root pointer is P_NONE.
 */
static void
_bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
{
	BTPageState *s;
	BlockNumber rootblkno = P_NONE;
	uint32		rootlevel = 0;
	Page		metapage;

	/*
	 * Each iteration of this loop completes one more level of the tree.
	 */
	for (s = state; s != NULL; s = s->btps_next)
	{
		BlockNumber blkno;
		BTPageOpaque opaque;

		blkno = s->btps_blkno;
		opaque = (BTPageOpaque) PageGetSpecialPointer(s->btps_page);

		/*
		 * We have to link the last page on this level to somewhere.
		 *
		 * If we're at the top, it's the root, so attach it to the metapage.
		 * Otherwise, add an entry for it to its parent using its minimum key.
		 * This may cause the last page of the parent level to split, but
		 * that's not a problem -- we haven't gotten to it yet.
		 */
		if (s->btps_next == NULL)
		{
			opaque->btpo_flags |= BTP_ROOT;
			rootblkno = blkno;
			rootlevel = s->btps_level;
		}
		else
		{
			/*
			 * Sanity checks: a leftmost page's minimum key must be truncated
			 * to zero attributes (minus-infinity placeholder); any other
			 * page's minimum key must retain between 1 and the number of key
			 * attributes.
			 */
			Assert((BTreeTupleGetNAtts(s->btps_minkey, wstate->index) <=
					IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
					BTreeTupleGetNAtts(s->btps_minkey, wstate->index) > 0) ||
				   P_LEFTMOST(opaque));
			Assert(BTreeTupleGetNAtts(s->btps_minkey, wstate->index) == 0 ||
				   !P_LEFTMOST(opaque));
			BTreeInnerTupleSetDownLink(s->btps_minkey, blkno);
			_bt_buildadd(wstate, s->btps_next, s->btps_minkey);
			pfree(s->btps_minkey);
			s->btps_minkey = NULL;
		}

		/*
		 * This is the rightmost page, so the ItemId array needs to be slid
		 * back one slot.  Then we can dump out the page.
		 */
		_bt_slideleft(s->btps_page);
		_bt_blwritepage(wstate, s->btps_page, s->btps_blkno);
		s->btps_page = NULL;	/* writepage freed the workspace */
	}

	/*
	 * As the last step in the process, construct the metapage and make it
	 * point to the new root (unless we had no data at all, in which case it's
	 * set to point to "P_NONE").  This changes the index to the "valid" state
	 * by filling in a valid magic number in the metapage.
	 */
	metapage = (Page) palloc(BLCKSZ);
	_bt_initmetapage(metapage, rootblkno, rootlevel);
	_bt_blwritepage(wstate, metapage, BTREE_METAPAGE);
}
1138
/*
 * Read tuples in correct sort order from tuplesort, and load them into
 * btree leaves.
 *
 * btspool holds the (sorted) index tuples proper.  btspool2, when non-NULL,
 * is a second spool of dead tuples that must be merged with the first spool
 * in key order as we load pages.  When the index is WAL-logged, the whole
 * index is fsync'd at the end, since the pages were written outside shared
 * buffers (see comment below).
 */
static void
_bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
{
	BTPageState *state = NULL;
	bool		merge = (btspool2 != NULL);
	IndexTuple	itup,
				itup2 = NULL;
	bool		load1;
	TupleDesc	tupdes = RelationGetDescr(wstate->index);
	int			i,
				keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index);
	SortSupport sortKeys;
	int64		tuples_done = 0;

	if (merge)
	{
		/*
		 * Another BTSpool for dead tuples exists. Now we have to merge
		 * btspool and btspool2.
		 */

		/* the preparation of merge */
		itup = tuplesort_getindextuple(btspool->sortstate, true);
		itup2 = tuplesort_getindextuple(btspool2->sortstate, true);

		/* Prepare SortSupport data for each column */
		sortKeys = (SortSupport) palloc0(keysz * sizeof(SortSupportData));

		for (i = 0; i < keysz; i++)
		{
			SortSupport sortKey = sortKeys + i;
			ScanKey		scanKey = wstate->inskey->scankeys + i;
			int16		strategy;

			sortKey->ssup_cxt = CurrentMemoryContext;
			sortKey->ssup_collation = scanKey->sk_collation;
			sortKey->ssup_nulls_first =
				(scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
			sortKey->ssup_attno = scanKey->sk_attno;
			/* Abbreviation is not supported here */
			sortKey->abbreviate = false;

			AssertState(sortKey->ssup_attno != 0);

			/* DESC columns compare with the greater-than strategy */
			strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ?
				BTGreaterStrategyNumber : BTLessStrategyNumber;

			PrepareSortSupportFromIndexRel(wstate->index, strategy, sortKey);
		}

		for (;;)
		{
			load1 = true;		/* load BTSpool next ? */
			if (itup2 == NULL)
			{
				/* spool2 exhausted; done once spool1 is empty too */
				if (itup == NULL)
					break;
			}
			else if (itup != NULL)
			{
				/* Both spools have tuples; compare column by column */
				int32		compare = 0;

				for (i = 1; i <= keysz; i++)
				{
					SortSupport entry;
					Datum		attrDatum1,
								attrDatum2;
					bool		isNull1,
								isNull2;

					entry = sortKeys + i - 1;
					attrDatum1 = index_getattr(itup, i, tupdes, &isNull1);
					attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2);

					compare = ApplySortComparator(attrDatum1, isNull1,
												  attrDatum2, isNull2,
												  entry);
					if (compare > 0)
					{
						load1 = false;
						break;
					}
					else if (compare < 0)
						break;
				}

				/*
				 * If key values are equal, we sort on ItemPointer. This is
				 * required for btree indexes, since heap TID is treated as an
				 * implicit last key attribute in order to ensure that all
				 * keys in the index are physically unique.
				 */
				if (compare == 0)
				{
					compare = ItemPointerCompare(&itup->t_tid, &itup2->t_tid);
					Assert(compare != 0);
					if (compare > 0)
						load1 = false;
				}
			}
			else
				load1 = false;	/* spool1 exhausted; drain spool2 */

			/* When we see first tuple, create first index page */
			if (state == NULL)
				state = _bt_pagestate(wstate, 0);

			if (load1)
			{
				_bt_buildadd(wstate, state, itup);
				itup = tuplesort_getindextuple(btspool->sortstate, true);
			}
			else
			{
				_bt_buildadd(wstate, state, itup2);
				itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
			}

			/* Report progress */
			pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
										 ++tuples_done);
		}
		pfree(sortKeys);
	}
	else
	{
		/* merge is unnecessary */
		while ((itup = tuplesort_getindextuple(btspool->sortstate,
											   true)) != NULL)
		{
			/* When we see first tuple, create first index page */
			if (state == NULL)
				state = _bt_pagestate(wstate, 0);

			_bt_buildadd(wstate, state, itup);

			/* Report progress */
			pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
										 ++tuples_done);
		}
	}

	/* Close down final pages and write the metapage */
	_bt_uppershutdown(wstate, state);

	/*
	 * If the index is WAL-logged, we must fsync it down to disk before it's
	 * safe to commit the transaction.  (For a non-WAL-logged index we don't
	 * care since the index will be uninteresting after a crash anyway.)
	 *
	 * It's obvious that we must do this when not WAL-logging the build. It's
	 * less obvious that we have to do it even if we did WAL-log the index
	 * pages.  The reason is that since we're building outside shared buffers,
	 * a CHECKPOINT occurring during the build has no way to flush the
	 * previously written data to disk (indeed it won't know the index even
	 * exists).  A crash later on would replay WAL from the checkpoint,
	 * therefore it wouldn't replay our earlier WAL entries. If we do not
	 * fsync those pages here, they might still not be on disk when the crash
	 * occurs.
	 */
	if (RelationNeedsWAL(wstate->index))
	{
		RelationOpenSmgr(wstate->index);
		smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM);
	}
}
1309
1310/*
1311 * Create parallel context, and launch workers for leader.
1312 *
1313 * buildstate argument should be initialized (with the exception of the
1314 * tuplesort state in spools, which may later be created based on shared
1315 * state initially set up here).
1316 *
1317 * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY.
1318 *
1319 * request is the target number of parallel worker processes to launch.
1320 *
1321 * Sets buildstate's BTLeader, which caller must use to shut down parallel
1322 * mode by passing it to _bt_end_parallel() at the very end of its index
1323 * build. If not even a single worker process can be launched, this is
1324 * never set, and caller should proceed with a serial index build.
1325 */
1326static void
1327_bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request)
1328{
1329 ParallelContext *pcxt;
1330 int scantuplesortstates;
1331 Snapshot snapshot;
1332 Size estbtshared;
1333 Size estsort;
1334 BTShared *btshared;
1335 Sharedsort *sharedsort;
1336 Sharedsort *sharedsort2;
1337 BTSpool *btspool = buildstate->spool;
1338 BTLeader *btleader = (BTLeader *) palloc0(sizeof(BTLeader));
1339 bool leaderparticipates = true;
1340 char *sharedquery;
1341 int querylen;
1342
1343#ifdef DISABLE_LEADER_PARTICIPATION
1344 leaderparticipates = false;
1345#endif
1346
1347 /*
1348 * Enter parallel mode, and create context for parallel build of btree
1349 * index
1350 */
1351 EnterParallelMode();
1352 Assert(request > 0);
1353 pcxt = CreateParallelContext("postgres", "_bt_parallel_build_main",
1354 request);
1355 scantuplesortstates = leaderparticipates ? request + 1 : request;
1356
1357 /*
1358 * Prepare for scan of the base relation. In a normal index build, we use
1359 * SnapshotAny because we must retrieve all tuples and do our own time
1360 * qual checks (because we have to index RECENTLY_DEAD tuples). In a
1361 * concurrent build, we take a regular MVCC snapshot and index whatever's
1362 * live according to that.
1363 */
1364 if (!isconcurrent)
1365 snapshot = SnapshotAny;
1366 else
1367 snapshot = RegisterSnapshot(GetTransactionSnapshot());
1368
1369 /*
1370 * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and
1371 * PARALLEL_KEY_TUPLESORT tuplesort workspace
1372 */
1373 estbtshared = _bt_parallel_estimate_shared(btspool->heap, snapshot);
1374 shm_toc_estimate_chunk(&pcxt->estimator, estbtshared);
1375 estsort = tuplesort_estimate_shared(scantuplesortstates);
1376 shm_toc_estimate_chunk(&pcxt->estimator, estsort);
1377
1378 /*
1379 * Unique case requires a second spool, and so we may have to account for
1380 * another shared workspace for that -- PARALLEL_KEY_TUPLESORT_SPOOL2
1381 */
1382 if (!btspool->isunique)
1383 shm_toc_estimate_keys(&pcxt->estimator, 2);
1384 else
1385 {
1386 shm_toc_estimate_chunk(&pcxt->estimator, estsort);
1387 shm_toc_estimate_keys(&pcxt->estimator, 3);
1388 }
1389
1390 /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
1391 querylen = strlen(debug_query_string);
1392 shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
1393 shm_toc_estimate_keys(&pcxt->estimator, 1);
1394
1395 /* Everyone's had a chance to ask for space, so now create the DSM */
1396 InitializeParallelDSM(pcxt);
1397
1398 /* Store shared build state, for which we reserved space */
1399 btshared = (BTShared *) shm_toc_allocate(pcxt->toc, estbtshared);
1400 /* Initialize immutable state */
1401 btshared->heaprelid = RelationGetRelid(btspool->heap);
1402 btshared->indexrelid = RelationGetRelid(btspool->index);
1403 btshared->isunique = btspool->isunique;
1404 btshared->isconcurrent = isconcurrent;
1405 btshared->scantuplesortstates = scantuplesortstates;
1406 ConditionVariableInit(&btshared->workersdonecv);
1407 SpinLockInit(&btshared->mutex);
1408 /* Initialize mutable state */
1409 btshared->nparticipantsdone = 0;
1410 btshared->reltuples = 0.0;
1411 btshared->havedead = false;
1412 btshared->indtuples = 0.0;
1413 btshared->brokenhotchain = false;
1414 table_parallelscan_initialize(btspool->heap,
1415 ParallelTableScanFromBTShared(btshared),
1416 snapshot);
1417
1418 /*
1419 * Store shared tuplesort-private state, for which we reserved space.
1420 * Then, initialize opaque state using tuplesort routine.
1421 */
1422 sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
1423 tuplesort_initialize_shared(sharedsort, scantuplesortstates,
1424 pcxt->seg);
1425
1426 shm_toc_insert(pcxt->toc, PARALLEL_KEY_BTREE_SHARED, btshared);
1427 shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);
1428
1429 /* Unique case requires a second spool, and associated shared state */
1430 if (!btspool->isunique)
1431 sharedsort2 = NULL;
1432 else
1433 {
1434 /*
1435 * Store additional shared tuplesort-private state, for which we
1436 * reserved space. Then, initialize opaque state using tuplesort
1437 * routine.
1438 */
1439 sharedsort2 = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
1440 tuplesort_initialize_shared(sharedsort2, scantuplesortstates,
1441 pcxt->seg);
1442
1443 shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT_SPOOL2, sharedsort2);
1444 }
1445
1446 /* Store query string for workers */
1447 sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
1448 memcpy(sharedquery, debug_query_string, querylen + 1);
1449 shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, sharedquery);
1450
1451 /* Launch workers, saving status for leader/caller */
1452 LaunchParallelWorkers(pcxt);
1453 btleader->pcxt = pcxt;
1454 btleader->nparticipanttuplesorts = pcxt->nworkers_launched;
1455 if (leaderparticipates)
1456 btleader->nparticipanttuplesorts++;
1457 btleader->btshared = btshared;
1458 btleader->sharedsort = sharedsort;
1459 btleader->sharedsort2 = sharedsort2;
1460 btleader->snapshot = snapshot;
1461
1462 /* If no workers were successfully launched, back out (do serial build) */
1463 if (pcxt->nworkers_launched == 0)
1464 {
1465 _bt_end_parallel(btleader);
1466 return;
1467 }
1468
1469 /* Save leader state now that it's clear build will be parallel */
1470 buildstate->btleader = btleader;
1471
1472 /* Join heap scan ourselves */
1473 if (leaderparticipates)
1474 _bt_leader_participate_as_worker(buildstate);
1475
1476 /*
1477 * Caller needs to wait for all launched workers when we return. Make
1478 * sure that the failure-to-start case will not hang forever.
1479 */
1480 WaitForParallelWorkersToAttach(pcxt);
1481}
1482
1483/*
1484 * Shut down workers, destroy parallel context, and end parallel mode.
1485 */
1486static void
1487_bt_end_parallel(BTLeader *btleader)
1488{
1489 /* Shutdown worker processes */
1490 WaitForParallelWorkersToFinish(btleader->pcxt);
1491 /* Free last reference to MVCC snapshot, if one was used */
1492 if (IsMVCCSnapshot(btleader->snapshot))
1493 UnregisterSnapshot(btleader->snapshot);
1494 DestroyParallelContext(btleader->pcxt);
1495 ExitParallelMode();
1496}
1497
1498/*
1499 * Returns size of shared memory required to store state for a parallel
1500 * btree index build based on the snapshot its parallel scan will use.
1501 */
1502static Size
1503_bt_parallel_estimate_shared(Relation heap, Snapshot snapshot)
1504{
1505 /* c.f. shm_toc_allocate as to why BUFFERALIGN is used */
1506 return add_size(BUFFERALIGN(sizeof(BTShared)),
1507 table_parallelscan_estimate(heap, snapshot));
1508}
1509
1510/*
1511 * Within leader, wait for end of heap scan.
1512 *
1513 * When called, parallel heap scan started by _bt_begin_parallel() will
1514 * already be underway within worker processes (when leader participates
1515 * as a worker, we should end up here just as workers are finishing).
1516 *
1517 * Fills in fields needed for ambuild statistics, and lets caller set
1518 * field indicating that some worker encountered a broken HOT chain.
1519 *
1520 * Returns the total number of heap tuples scanned.
1521 */
1522static double
1523_bt_parallel_heapscan(BTBuildState *buildstate, bool *brokenhotchain)
1524{
1525 BTShared *btshared = buildstate->btleader->btshared;
1526 int nparticipanttuplesorts;
1527 double reltuples;
1528
1529 nparticipanttuplesorts = buildstate->btleader->nparticipanttuplesorts;
1530 for (;;)
1531 {
1532 SpinLockAcquire(&btshared->mutex);
1533 if (btshared->nparticipantsdone == nparticipanttuplesorts)
1534 {
1535 buildstate->havedead = btshared->havedead;
1536 buildstate->indtuples = btshared->indtuples;
1537 *brokenhotchain = btshared->brokenhotchain;
1538 reltuples = btshared->reltuples;
1539 SpinLockRelease(&btshared->mutex);
1540 break;
1541 }
1542 SpinLockRelease(&btshared->mutex);
1543
1544 ConditionVariableSleep(&btshared->workersdonecv,
1545 WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN);
1546 }
1547
1548 ConditionVariableCancelSleep();
1549
1550 return reltuples;
1551}
1552
1553/*
1554 * Within leader, participate as a parallel worker.
1555 */
1556static void
1557_bt_leader_participate_as_worker(BTBuildState *buildstate)
1558{
1559 BTLeader *btleader = buildstate->btleader;
1560 BTSpool *leaderworker;
1561 BTSpool *leaderworker2;
1562 int sortmem;
1563
1564 /* Allocate memory and initialize private spool */
1565 leaderworker = (BTSpool *) palloc0(sizeof(BTSpool));
1566 leaderworker->heap = buildstate->spool->heap;
1567 leaderworker->index = buildstate->spool->index;
1568 leaderworker->isunique = buildstate->spool->isunique;
1569
1570 /* Initialize second spool, if required */
1571 if (!btleader->btshared->isunique)
1572 leaderworker2 = NULL;
1573 else
1574 {
1575 /* Allocate memory for worker's own private secondary spool */
1576 leaderworker2 = (BTSpool *) palloc0(sizeof(BTSpool));
1577
1578 /* Initialize worker's own secondary spool */
1579 leaderworker2->heap = leaderworker->heap;
1580 leaderworker2->index = leaderworker->index;
1581 leaderworker2->isunique = false;
1582 }
1583
1584 /*
1585 * Might as well use reliable figure when doling out maintenance_work_mem
1586 * (when requested number of workers were not launched, this will be
1587 * somewhat higher than it is for other workers).
1588 */
1589 sortmem = maintenance_work_mem / btleader->nparticipanttuplesorts;
1590
1591 /* Perform work common to all participants */
1592 _bt_parallel_scan_and_sort(leaderworker, leaderworker2, btleader->btshared,
1593 btleader->sharedsort, btleader->sharedsort2,
1594 sortmem, true);
1595
1596#ifdef BTREE_BUILD_STATS
1597 if (log_btree_build_stats)
1598 {
1599 ShowUsage("BTREE BUILD (Leader Partial Spool) STATISTICS");
1600 ResetUsage();
1601 }
1602#endif /* BTREE_BUILD_STATS */
1603}
1604
/*
 * Perform work within a launched parallel process.
 *
 * This is the entry point named in _bt_begin_parallel()'s
 * CreateParallelContext() call.  Each worker attaches to the state that the
 * leader stored in the shm toc, opens the heap and index, performs its
 * share of the parallel scan and sort, and exits.
 */
void
_bt_parallel_build_main(dsm_segment *seg, shm_toc *toc)
{
	char	   *sharedquery;
	BTSpool    *btspool;
	BTSpool    *btspool2;
	BTShared   *btshared;
	Sharedsort *sharedsort;
	Sharedsort *sharedsort2;
	Relation	heapRel;
	Relation	indexRel;
	LOCKMODE	heapLockmode;
	LOCKMODE	indexLockmode;
	int			sortmem;

#ifdef BTREE_BUILD_STATS
	if (log_btree_build_stats)
		ResetUsage();
#endif							/* BTREE_BUILD_STATS */

	/* Set debug_query_string for individual workers first */
	sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, false);
	debug_query_string = sharedquery;

	/* Report the query string from leader */
	pgstat_report_activity(STATE_RUNNING, debug_query_string);

	/* Look up nbtree shared state */
	btshared = shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false);

	/* Open relations using lock modes known to be obtained by index.c */
	if (!btshared->isconcurrent)
	{
		heapLockmode = ShareLock;
		indexLockmode = AccessExclusiveLock;
	}
	else
	{
		/* CREATE INDEX CONCURRENTLY case uses weaker lock modes */
		heapLockmode = ShareUpdateExclusiveLock;
		indexLockmode = RowExclusiveLock;
	}

	/* Open relations within worker */
	heapRel = table_open(btshared->heaprelid, heapLockmode);
	indexRel = index_open(btshared->indexrelid, indexLockmode);

	/* Initialize worker's own spool */
	btspool = (BTSpool *) palloc0(sizeof(BTSpool));
	btspool->heap = heapRel;
	btspool->index = indexRel;
	btspool->isunique = btshared->isunique;

	/* Look up shared state private to tuplesort.c */
	sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
	tuplesort_attach_shared(sharedsort, seg);
	if (!btshared->isunique)
	{
		btspool2 = NULL;
		sharedsort2 = NULL;
	}
	else
	{
		/* Allocate memory for worker's own private secondary spool */
		btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));

		/* Initialize worker's own secondary spool */
		btspool2->heap = btspool->heap;
		btspool2->index = btspool->index;
		btspool2->isunique = false;
		/* Look up shared state private to tuplesort.c */
		sharedsort2 = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT_SPOOL2, false);
		tuplesort_attach_shared(sharedsort2, seg);
	}

	/*
	 * Perform sorting of spool, and possibly a spool2.  Each participant
	 * gets an even share of maintenance_work_mem.
	 */
	sortmem = maintenance_work_mem / btshared->scantuplesortstates;
	_bt_parallel_scan_and_sort(btspool, btspool2, btshared, sharedsort,
							   sharedsort2, sortmem, false);

#ifdef BTREE_BUILD_STATS
	if (log_btree_build_stats)
	{
		ShowUsage("BTREE BUILD (Worker Partial Spool) STATISTICS");
		ResetUsage();
	}
#endif							/* BTREE_BUILD_STATS */

	index_close(indexRel, indexLockmode);
	table_close(heapRel, heapLockmode);
}
1698
/*
 * Perform a worker's portion of a parallel sort.
 *
 * This generates a tuplesort for passed btspool, and a second tuplesort
 * state if a second btspool is need (i.e. for unique index builds).  All
 * other spool fields should already be set when this is called.
 *
 * sortmem is the amount of working memory to use within each worker,
 * expressed in KBs.
 *
 * progress indicates whether to update the command progress view (true for
 * the leader's own participation; workers pass false).
 *
 * When this returns, workers are done, and need only release resources.
 */
static void
_bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
						   BTShared *btshared, Sharedsort *sharedsort,
						   Sharedsort *sharedsort2, int sortmem, bool progress)
{
	SortCoordinate coordinate;
	BTBuildState buildstate;
	TableScanDesc scan;
	double		reltuples;
	IndexInfo  *indexInfo;

	/* Initialize local tuplesort coordination state (worker side) */
	coordinate = palloc0(sizeof(SortCoordinateData));
	coordinate->isWorker = true;
	coordinate->nParticipants = -1;
	coordinate->sharedsort = sharedsort;

	/* Begin "partial" tuplesort */
	btspool->sortstate = tuplesort_begin_index_btree(btspool->heap,
													 btspool->index,
													 btspool->isunique,
													 sortmem, coordinate,
													 false);

	/*
	 * Just as with serial case, there may be a second spool.  If so, a
	 * second, dedicated spool2 partial tuplesort is required.
	 */
	if (btspool2)
	{
		SortCoordinate coordinate2;

		/*
		 * We expect that the second one (for dead tuples) won't get very
		 * full, so we give it only work_mem (unless sortmem is less for
		 * worker).  Worker processes are generally permitted to allocate
		 * work_mem independently.
		 */
		coordinate2 = palloc0(sizeof(SortCoordinateData));
		coordinate2->isWorker = true;
		coordinate2->nParticipants = -1;
		coordinate2->sharedsort = sharedsort2;
		btspool2->sortstate =
			tuplesort_begin_index_btree(btspool->heap, btspool->index, false,
										Min(sortmem, work_mem), coordinate2,
										false);
	}

	/* Fill in buildstate for _bt_build_callback() */
	buildstate.isunique = btshared->isunique;
	buildstate.havedead = false;
	buildstate.heap = btspool->heap;
	buildstate.spool = btspool;
	buildstate.spool2 = btspool2;
	buildstate.indtuples = 0;
	buildstate.btleader = NULL;

	/* Join parallel scan */
	indexInfo = BuildIndexInfo(btspool->index);
	indexInfo->ii_Concurrent = btshared->isconcurrent;
	scan = table_beginscan_parallel(btspool->heap,
									ParallelTableScanFromBTShared(btshared));
	reltuples = table_index_build_scan(btspool->heap, btspool->index, indexInfo,
									   true, progress, _bt_build_callback,
									   (void *) &buildstate, scan);

	/*
	 * Execute this worker's part of the sort.
	 *
	 * Unlike leader and serial cases, we cannot avoid calling
	 * tuplesort_performsort() for spool2 if it ends up containing no dead
	 * tuples (this is disallowed for workers by tuplesort).
	 */
	tuplesort_performsort(btspool->sortstate);
	if (btspool2)
		tuplesort_performsort(btspool2->sortstate);

	/*
	 * Done.  Record ambuild statistics, and whether we encountered a broken
	 * HOT chain.  The mutex protects the shared accumulators, which other
	 * participants update concurrently.
	 */
	SpinLockAcquire(&btshared->mutex);
	btshared->nparticipantsdone++;
	btshared->reltuples += reltuples;
	if (buildstate.havedead)
		btshared->havedead = true;
	btshared->indtuples += buildstate.indtuples;
	if (indexInfo->ii_BrokenHotChain)
		btshared->brokenhotchain = true;
	SpinLockRelease(&btshared->mutex);

	/* Notify leader */
	ConditionVariableSignal(&btshared->workersdonecv);

	/* We can end tuplesorts immediately */
	tuplesort_end(btspool->sortstate);
	if (btspool2)
		tuplesort_end(btspool2->sortstate);
}
1810