1/*-------------------------------------------------------------------------
2 *
3 * nbtsort.c
4 * Build a btree from sorted input by loading leaf pages sequentially.
5 *
6 * NOTES
7 *
8 * We use tuplesort.c to sort the given index tuples into order.
9 * Then we scan the index tuples in order and build the btree pages
10 * for each level. We load source tuples into leaf-level pages.
11 * Whenever we fill a page at one level, we add a link to it to its
12 * parent level (starting a new parent level if necessary). When
13 * done, we write out each final page on each level, adding it to
14 * its parent level. When we have only one page on a level, it must be
15 * the root -- it can be attached to the btree metapage and we are done.
16 *
17 * It is not wise to pack the pages entirely full, since then *any*
18 * insertion would cause a split (and not only of the leaf page; the need
19 * for a split would cascade right up the tree). The steady-state load
20 * factor for btrees is usually estimated at 70%. We choose to pack leaf
21 * pages to the user-controllable fill factor (default 90%) while upper pages
22 * are always packed to 70%. This gives us reasonable density (there aren't
23 * many upper pages if the keys are reasonable-size) without risking a lot of
24 * cascading splits during early insertions.
25 *
26 * Formerly the index pages being built were kept in shared buffers, but
27 * that is of no value (since other backends have no interest in them yet)
28 * and it created locking problems for CHECKPOINT, because the upper-level
29 * pages were held exclusive-locked for long periods. Now we just build
30 * the pages in local memory and smgrwrite or smgrextend them as we finish
31 * them. They will need to be re-read into shared buffers on first use after
32 * the build finishes.
33 *
34 * Since the index will never be used unless it is completely built,
35 * from a crash-recovery point of view there is no need to WAL-log the
36 * steps of the build. After completing the index build, we can just sync
37 * the whole file to disk using smgrimmedsync() before exiting this module.
38 * This can be seen to be sufficient for crash recovery by considering that
39 * it's effectively equivalent to what would happen if a CHECKPOINT occurred
40 * just after the index build. However, it is clearly not sufficient if the
41 * DBA is using the WAL log for PITR or replication purposes, since another
42 * machine would not be able to reconstruct the index from WAL. Therefore,
43 * we log the completed index pages to WAL if and only if WAL archiving is
44 * active.
45 *
46 * This code isn't concerned about the FSM at all. The caller is responsible
47 * for initializing that.
48 *
49 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
50 * Portions Copyright (c) 1994, Regents of the University of California
51 *
52 * IDENTIFICATION
53 * src/backend/access/nbtree/nbtsort.c
54 *
55 *-------------------------------------------------------------------------
56 */
57
58#include "postgres.h"
59
60#include "access/nbtree.h"
61#include "access/parallel.h"
62#include "access/relscan.h"
63#include "access/table.h"
64#include "access/tableam.h"
65#include "access/xact.h"
66#include "access/xlog.h"
67#include "access/xloginsert.h"
68#include "catalog/index.h"
69#include "commands/progress.h"
70#include "miscadmin.h"
71#include "pgstat.h"
72#include "storage/smgr.h"
73#include "tcop/tcopprot.h" /* pgrminclude ignore */
74#include "utils/rel.h"
75#include "utils/sortsupport.h"
76#include "utils/tuplesort.h"
77
78
79/* Magic numbers for parallel state sharing */
80#define PARALLEL_KEY_BTREE_SHARED UINT64CONST(0xA000000000000001)
81#define PARALLEL_KEY_TUPLESORT UINT64CONST(0xA000000000000002)
82#define PARALLEL_KEY_TUPLESORT_SPOOL2 UINT64CONST(0xA000000000000003)
83#define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xA000000000000004)
84
85/*
86 * DISABLE_LEADER_PARTICIPATION disables the leader's participation in
87 * parallel index builds. This may be useful as a debugging aid.
88#undef DISABLE_LEADER_PARTICIPATION
89 */
90
/*
 * Status record for spooling/sorting phase.  (Note we may have two of
 * these due to the special requirements for uniqueness-checking with
 * dead tuples.)
 */
typedef struct BTSpool
{
	Tuplesortstate *sortstate;	/* state data for tuplesort.c */
	Relation	heap;			/* heap relation being indexed */
	Relation	index;			/* index relation being built */
	bool		isunique;		/* is the index unique? (passed to tuplesort) */
} BTSpool;
103
/*
 * Status for index builds performed in parallel.  This is allocated in a
 * dynamic shared memory segment.  Note that there is a separate tuplesort TOC
 * entry, private to tuplesort.c but allocated by this module on its behalf.
 */
typedef struct BTShared
{
	/*
	 * These fields are not modified during the sort.  They primarily exist
	 * for the benefit of worker processes that need to create BTSpool state
	 * corresponding to that used by the leader.
	 */
	Oid			heaprelid;		/* OID of heap relation being indexed */
	Oid			indexrelid;		/* OID of index being built */
	bool		isunique;		/* building a unique index? */
	bool		isconcurrent;	/* concurrent index build? */
	int			scantuplesortstates;	/* number of participant tuplesorts */

	/*
	 * workersdonecv is used to monitor the progress of workers.  All parallel
	 * participants must indicate that they are done before leader can use
	 * mutable state that workers maintain during scan (and before leader can
	 * proceed to tuplesort_performsort()).
	 */
	ConditionVariable workersdonecv;

	/*
	 * mutex protects the mutable status fields below (nparticipantsdone
	 * through brokenhotchain).  (There is no "heapdesc" field; the parallel
	 * table scan descriptor follows the struct instead -- see below.)
	 *
	 * These fields contain status information of interest to B-Tree index
	 * builds that must work just the same when an index is built in parallel.
	 */
	slock_t		mutex;

	/*
	 * Mutable state that is maintained by workers, and reported back to
	 * leader at end of parallel scan.
	 *
	 * nparticipantsdone is number of worker processes finished.
	 *
	 * reltuples is the total number of input heap tuples.
	 *
	 * havedead indicates if RECENTLY_DEAD tuples were encountered during
	 * build.
	 *
	 * indtuples is the total number of tuples that made it into the index.
	 *
	 * brokenhotchain indicates if any worker detected a broken HOT chain
	 * during build.
	 */
	int			nparticipantsdone;
	double		reltuples;
	bool		havedead;
	double		indtuples;
	bool		brokenhotchain;

	/*
	 * ParallelTableScanDescData data follows.  Can't directly embed here, as
	 * implementations of the parallel table scan desc interface might need
	 * stronger alignment.
	 */
} BTShared;
166
167/*
168 * Return pointer to a BTShared's parallel table scan.
169 *
170 * c.f. shm_toc_allocate as to why BUFFERALIGN is used, rather than just
171 * MAXALIGN.
172 */
173#define ParallelTableScanFromBTShared(shared) \
174 (ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(BTShared)))
175
/*
 * Status for leader in parallel index build.
 */
typedef struct BTLeader
{
	/* parallel context itself */
	ParallelContext *pcxt;

	/*
	 * nparticipanttuplesorts is the exact number of worker processes
	 * successfully launched, plus one leader process if it participates as a
	 * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader
	 * participating as a worker).
	 */
	int			nparticipanttuplesorts;

	/*
	 * Leader process convenience pointers to shared state (leader avoids TOC
	 * lookups).
	 *
	 * btshared is the shared state for entire build.  sharedsort is the
	 * shared, tuplesort-managed state passed to each process tuplesort.
	 * sharedsort2 is the corresponding btspool2 shared state, used only when
	 * building unique indexes.  snapshot is the snapshot used by the scan iff
	 * an MVCC snapshot is required.
	 */
	BTShared   *btshared;		/* shared build status/coordination */
	Sharedsort *sharedsort;		/* shared tuplesort state, primary spool */
	Sharedsort *sharedsort2;	/* shared tuplesort state, dead-tuple spool */
	Snapshot	snapshot;		/* scan snapshot, iff an MVCC one is needed */
} BTLeader;
207
/*
 * Working state for btbuild and its callback.
 *
 * When parallel CREATE INDEX is used, there is a BTBuildState for each
 * participant.
 */
typedef struct BTBuildState
{
	bool		isunique;		/* building a unique index? */
	bool		havedead;		/* true once any dead tuple was spooled */
	Relation	heap;			/* heap relation being indexed */
	BTSpool    *spool;			/* primary spool (live tuples) */

	/*
	 * spool2 is needed only when the index is a unique index. Dead tuples are
	 * put into spool2 instead of spool in order to avoid uniqueness check.
	 */
	BTSpool    *spool2;
	double		indtuples;		/* # tuples spooled so far */

	/*
	 * btleader is only present when a parallel index build is performed, and
	 * only in the leader process. (Actually, only the leader has a
	 * BTBuildState.  Workers have their own spool and spool2, though.)
	 */
	BTLeader   *btleader;
} BTBuildState;
235
/*
 * Status record for a btree page being built.  We have one of these
 * for each active tree level.
 *
 * The reason we need to store a copy of the minimum key is that we'll
 * need to propagate it to the parent node when this page is linked
 * into its parent.  However, if the page is not a leaf page, the first
 * entry on the page doesn't need to contain a key, so we will not have
 * stored the key itself on the page.  (You might think we could skip
 * copying the minimum key on leaf pages, but actually we must have a
 * writable copy anyway because we'll poke the page's address into it
 * before passing it up to the parent...)
 */
typedef struct BTPageState
{
	Page		btps_page;		/* workspace for page building */
	BlockNumber btps_blkno;		/* block # to write this page at */
	IndexTuple	btps_minkey;	/* copy of minimum key (first item) on page */
	OffsetNumber btps_lastoff;	/* last item offset loaded */
	uint32		btps_level;		/* tree level (0 = leaf) */
	Size		btps_full;		/* page is "full" once free space drops
								 * below this */
	struct BTPageState *btps_next;	/* link to parent level, if any */
} BTPageState;
259
/*
 * Overall status record for index writing phase.  Shared by _bt_load and
 * the page-construction helpers below.
 */
typedef struct BTWriteState
{
	Relation	heap;			/* source heap relation */
	Relation	index;			/* index relation being written out */
	BTScanInsert inskey;		/* generic insertion scankey */
	bool		btws_use_wal;	/* dump pages to WAL? */
	BlockNumber btws_pages_alloced; /* # pages allocated (block 0 is reserved
									 * for the metapage) */
	BlockNumber btws_pages_written; /* # pages written out */
	Page		btws_zeropage;	/* workspace for filling zeroes */
} BTWriteState;
273
274
275static double _bt_spools_heapscan(Relation heap, Relation index,
276 BTBuildState *buildstate, IndexInfo *indexInfo);
277static void _bt_spooldestroy(BTSpool *btspool);
278static void _bt_spool(BTSpool *btspool, ItemPointer self,
279 Datum *values, bool *isnull);
280static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2);
281static void _bt_build_callback(Relation index, HeapTuple htup, Datum *values,
282 bool *isnull, bool tupleIsAlive, void *state);
283static Page _bt_blnewpage(uint32 level);
284static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level);
285static void _bt_slideleft(Page page);
286static void _bt_sortaddtup(Page page, Size itemsize,
287 IndexTuple itup, OffsetNumber itup_off);
288static void _bt_buildadd(BTWriteState *wstate, BTPageState *state,
289 IndexTuple itup);
290static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state);
291static void _bt_load(BTWriteState *wstate,
292 BTSpool *btspool, BTSpool *btspool2);
293static void _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent,
294 int request);
295static void _bt_end_parallel(BTLeader *btleader);
296static Size _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot);
297static double _bt_parallel_heapscan(BTBuildState *buildstate,
298 bool *brokenhotchain);
299static void _bt_leader_participate_as_worker(BTBuildState *buildstate);
300static void _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
301 BTShared *btshared, Sharedsort *sharedsort,
302 Sharedsort *sharedsort2, int sortmem,
303 bool progress);
304
305
306/*
307 * btbuild() -- build a new btree index.
308 */
309IndexBuildResult *
310btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
311{
312 IndexBuildResult *result;
313 BTBuildState buildstate;
314 double reltuples;
315
316#ifdef BTREE_BUILD_STATS
317 if (log_btree_build_stats)
318 ResetUsage();
319#endif /* BTREE_BUILD_STATS */
320
321 buildstate.isunique = indexInfo->ii_Unique;
322 buildstate.havedead = false;
323 buildstate.heap = heap;
324 buildstate.spool = NULL;
325 buildstate.spool2 = NULL;
326 buildstate.indtuples = 0;
327 buildstate.btleader = NULL;
328
329 /*
330 * We expect to be called exactly once for any index relation. If that's
331 * not the case, big trouble's what we have.
332 */
333 if (RelationGetNumberOfBlocks(index) != 0)
334 elog(ERROR, "index \"%s\" already contains data",
335 RelationGetRelationName(index));
336
337 reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo);
338
339 /*
340 * Finish the build by (1) completing the sort of the spool file, (2)
341 * inserting the sorted tuples into btree pages and (3) building the upper
342 * levels. Finally, it may also be necessary to end use of parallelism.
343 */
344 _bt_leafbuild(buildstate.spool, buildstate.spool2);
345 _bt_spooldestroy(buildstate.spool);
346 if (buildstate.spool2)
347 _bt_spooldestroy(buildstate.spool2);
348 if (buildstate.btleader)
349 _bt_end_parallel(buildstate.btleader);
350
351 result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
352
353 result->heap_tuples = reltuples;
354 result->index_tuples = buildstate.indtuples;
355
356#ifdef BTREE_BUILD_STATS
357 if (log_btree_build_stats)
358 {
359 ShowUsage("BTREE BUILD STATS");
360 ResetUsage();
361 }
362#endif /* BTREE_BUILD_STATS */
363
364 return result;
365}
366
367/*
368 * Create and initialize one or two spool structures, and save them in caller's
369 * buildstate argument. May also fill-in fields within indexInfo used by index
370 * builds.
371 *
372 * Scans the heap, possibly in parallel, filling spools with IndexTuples. This
373 * routine encapsulates all aspects of managing parallelism. Caller need only
374 * call _bt_end_parallel() in parallel case after it is done with spool/spool2.
375 *
376 * Returns the total number of heap tuples scanned.
377 */
378static double
379_bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate,
380 IndexInfo *indexInfo)
381{
382 BTSpool *btspool = (BTSpool *) palloc0(sizeof(BTSpool));
383 SortCoordinate coordinate = NULL;
384 double reltuples = 0;
385
386 /*
387 * We size the sort area as maintenance_work_mem rather than work_mem to
388 * speed index creation. This should be OK since a single backend can't
389 * run multiple index creations in parallel (see also: notes on
390 * parallelism and maintenance_work_mem below).
391 */
392 btspool->heap = heap;
393 btspool->index = index;
394 btspool->isunique = indexInfo->ii_Unique;
395
396 /* Save as primary spool */
397 buildstate->spool = btspool;
398
399 /* Report table scan phase started */
400 pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
401 PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN);
402
403 /* Attempt to launch parallel worker scan when required */
404 if (indexInfo->ii_ParallelWorkers > 0)
405 _bt_begin_parallel(buildstate, indexInfo->ii_Concurrent,
406 indexInfo->ii_ParallelWorkers);
407
408 /*
409 * If parallel build requested and at least one worker process was
410 * successfully launched, set up coordination state
411 */
412 if (buildstate->btleader)
413 {
414 coordinate = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
415 coordinate->isWorker = false;
416 coordinate->nParticipants =
417 buildstate->btleader->nparticipanttuplesorts;
418 coordinate->sharedsort = buildstate->btleader->sharedsort;
419 }
420
421 /*
422 * Begin serial/leader tuplesort.
423 *
424 * In cases where parallelism is involved, the leader receives the same
425 * share of maintenance_work_mem as a serial sort (it is generally treated
426 * in the same way as a serial sort once we return). Parallel worker
427 * Tuplesortstates will have received only a fraction of
428 * maintenance_work_mem, though.
429 *
430 * We rely on the lifetime of the Leader Tuplesortstate almost not
431 * overlapping with any worker Tuplesortstate's lifetime. There may be
432 * some small overlap, but that's okay because we rely on leader
433 * Tuplesortstate only allocating a small, fixed amount of memory here.
434 * When its tuplesort_performsort() is called (by our caller), and
435 * significant amounts of memory are likely to be used, all workers must
436 * have already freed almost all memory held by their Tuplesortstates
437 * (they are about to go away completely, too). The overall effect is
438 * that maintenance_work_mem always represents an absolute high watermark
439 * on the amount of memory used by a CREATE INDEX operation, regardless of
440 * the use of parallelism or any other factor.
441 */
442 buildstate->spool->sortstate =
443 tuplesort_begin_index_btree(heap, index, buildstate->isunique,
444 maintenance_work_mem, coordinate,
445 false);
446
447 /*
448 * If building a unique index, put dead tuples in a second spool to keep
449 * them out of the uniqueness check. We expect that the second spool (for
450 * dead tuples) won't get very full, so we give it only work_mem.
451 */
452 if (indexInfo->ii_Unique)
453 {
454 BTSpool *btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));
455 SortCoordinate coordinate2 = NULL;
456
457 /* Initialize secondary spool */
458 btspool2->heap = heap;
459 btspool2->index = index;
460 btspool2->isunique = false;
461 /* Save as secondary spool */
462 buildstate->spool2 = btspool2;
463
464 if (buildstate->btleader)
465 {
466 /*
467 * Set up non-private state that is passed to
468 * tuplesort_begin_index_btree() about the basic high level
469 * coordination of a parallel sort.
470 */
471 coordinate2 = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
472 coordinate2->isWorker = false;
473 coordinate2->nParticipants =
474 buildstate->btleader->nparticipanttuplesorts;
475 coordinate2->sharedsort = buildstate->btleader->sharedsort2;
476 }
477
478 /*
479 * We expect that the second one (for dead tuples) won't get very
480 * full, so we give it only work_mem
481 */
482 buildstate->spool2->sortstate =
483 tuplesort_begin_index_btree(heap, index, false, work_mem,
484 coordinate2, false);
485 }
486
487 /* Fill spool using either serial or parallel heap scan */
488 if (!buildstate->btleader)
489 reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
490 _bt_build_callback, (void *) buildstate,
491 NULL);
492 else
493 reltuples = _bt_parallel_heapscan(buildstate,
494 &indexInfo->ii_BrokenHotChain);
495
496 /*
497 * Set the progress target for the next phase. Reset the block number
498 * values set by table_index_build_scan
499 */
500 {
501 const int index[] = {
502 PROGRESS_CREATEIDX_TUPLES_TOTAL,
503 PROGRESS_SCAN_BLOCKS_TOTAL,
504 PROGRESS_SCAN_BLOCKS_DONE
505 };
506 const int64 val[] = {
507 buildstate->indtuples,
508 0, 0
509 };
510
511 pgstat_progress_update_multi_param(3, index, val);
512 }
513
514 /* okay, all heap tuples are spooled */
515 if (buildstate->spool2 && !buildstate->havedead)
516 {
517 /* spool2 turns out to be unnecessary */
518 _bt_spooldestroy(buildstate->spool2);
519 buildstate->spool2 = NULL;
520 }
521
522 return reltuples;
523}
524
525/*
526 * clean up a spool structure and its substructures.
527 */
528static void
529_bt_spooldestroy(BTSpool *btspool)
530{
531 tuplesort_end(btspool->sortstate);
532 pfree(btspool);
533}
534
535/*
536 * spool an index entry into the sort file.
537 */
538static void
539_bt_spool(BTSpool *btspool, ItemPointer self, Datum *values, bool *isnull)
540{
541 tuplesort_putindextuplevalues(btspool->sortstate, btspool->index,
542 self, values, isnull);
543}
544
545/*
546 * given a spool loaded by successive calls to _bt_spool,
547 * create an entire btree.
548 */
549static void
550_bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
551{
552 BTWriteState wstate;
553
554#ifdef BTREE_BUILD_STATS
555 if (log_btree_build_stats)
556 {
557 ShowUsage("BTREE BUILD (Spool) STATISTICS");
558 ResetUsage();
559 }
560#endif /* BTREE_BUILD_STATS */
561
562 pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
563 PROGRESS_BTREE_PHASE_PERFORMSORT_1);
564 tuplesort_performsort(btspool->sortstate);
565 if (btspool2)
566 {
567 pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
568 PROGRESS_BTREE_PHASE_PERFORMSORT_2);
569 tuplesort_performsort(btspool2->sortstate);
570 }
571
572 wstate.heap = btspool->heap;
573 wstate.index = btspool->index;
574 wstate.inskey = _bt_mkscankey(wstate.index, NULL);
575
576 /*
577 * We need to log index creation in WAL iff WAL archiving/streaming is
578 * enabled UNLESS the index isn't WAL-logged anyway.
579 */
580 wstate.btws_use_wal = XLogIsNeeded() && RelationNeedsWAL(wstate.index);
581
582 /* reserve the metapage */
583 wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
584 wstate.btws_pages_written = 0;
585 wstate.btws_zeropage = NULL; /* until needed */
586
587 pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
588 PROGRESS_BTREE_PHASE_LEAF_LOAD);
589 _bt_load(&wstate, btspool, btspool2);
590}
591
592/*
593 * Per-tuple callback for table_index_build_scan
594 */
595static void
596_bt_build_callback(Relation index,
597 HeapTuple htup,
598 Datum *values,
599 bool *isnull,
600 bool tupleIsAlive,
601 void *state)
602{
603 BTBuildState *buildstate = (BTBuildState *) state;
604
605 /*
606 * insert the index tuple into the appropriate spool file for subsequent
607 * processing
608 */
609 if (tupleIsAlive || buildstate->spool2 == NULL)
610 _bt_spool(buildstate->spool, &htup->t_self, values, isnull);
611 else
612 {
613 /* dead tuples are put into spool2 */
614 buildstate->havedead = true;
615 _bt_spool(buildstate->spool2, &htup->t_self, values, isnull);
616 }
617
618 buildstate->indtuples += 1;
619}
620
621/*
622 * allocate workspace for a new, clean btree page, not linked to any siblings.
623 */
624static Page
625_bt_blnewpage(uint32 level)
626{
627 Page page;
628 BTPageOpaque opaque;
629
630 page = (Page) palloc(BLCKSZ);
631
632 /* Zero the page and set up standard page header info */
633 _bt_pageinit(page, BLCKSZ);
634
635 /* Initialize BT opaque state */
636 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
637 opaque->btpo_prev = opaque->btpo_next = P_NONE;
638 opaque->btpo.level = level;
639 opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF;
640 opaque->btpo_cycleid = 0;
641
642 /* Make the P_HIKEY line pointer appear allocated */
643 ((PageHeader) page)->pd_lower += sizeof(ItemIdData);
644
645 return page;
646}
647
648/*
649 * emit a completed btree page, and release the working storage.
650 */
651static void
652_bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
653{
654 /* Ensure rd_smgr is open (could have been closed by relcache flush!) */
655 RelationOpenSmgr(wstate->index);
656
657 /* XLOG stuff */
658 if (wstate->btws_use_wal)
659 {
660 /* We use the heap NEWPAGE record type for this */
661 log_newpage(&wstate->index->rd_node, MAIN_FORKNUM, blkno, page, true);
662 }
663
664 /*
665 * If we have to write pages nonsequentially, fill in the space with
666 * zeroes until we come back and overwrite. This is not logically
667 * necessary on standard Unix filesystems (unwritten space will read as
668 * zeroes anyway), but it should help to avoid fragmentation. The dummy
669 * pages aren't WAL-logged though.
670 */
671 while (blkno > wstate->btws_pages_written)
672 {
673 if (!wstate->btws_zeropage)
674 wstate->btws_zeropage = (Page) palloc0(BLCKSZ);
675 /* don't set checksum for all-zero page */
676 smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM,
677 wstate->btws_pages_written++,
678 (char *) wstate->btws_zeropage,
679 true);
680 }
681
682 PageSetChecksumInplace(page, blkno);
683
684 /*
685 * Now write the page. There's no need for smgr to schedule an fsync for
686 * this write; we'll do it ourselves before ending the build.
687 */
688 if (blkno == wstate->btws_pages_written)
689 {
690 /* extending the file... */
691 smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM, blkno,
692 (char *) page, true);
693 wstate->btws_pages_written++;
694 }
695 else
696 {
697 /* overwriting a block we zero-filled before */
698 smgrwrite(wstate->index->rd_smgr, MAIN_FORKNUM, blkno,
699 (char *) page, true);
700 }
701
702 pfree(page);
703}
704
705/*
706 * allocate and initialize a new BTPageState. the returned structure
707 * is suitable for immediate use by _bt_buildadd.
708 */
709static BTPageState *
710_bt_pagestate(BTWriteState *wstate, uint32 level)
711{
712 BTPageState *state = (BTPageState *) palloc0(sizeof(BTPageState));
713
714 /* create initial page for level */
715 state->btps_page = _bt_blnewpage(level);
716
717 /* and assign it a page position */
718 state->btps_blkno = wstate->btws_pages_alloced++;
719
720 state->btps_minkey = NULL;
721 /* initialize lastoff so first item goes into P_FIRSTKEY */
722 state->btps_lastoff = P_HIKEY;
723 state->btps_level = level;
724 /* set "full" threshold based on level. See notes at head of file. */
725 if (level > 0)
726 state->btps_full = (BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100);
727 else
728 state->btps_full = RelationGetTargetPageFreeSpace(wstate->index,
729 BTREE_DEFAULT_FILLFACTOR);
730 /* no parent level, yet */
731 state->btps_next = NULL;
732
733 return state;
734}
735
736/*
737 * slide an array of ItemIds back one slot (from P_FIRSTKEY to
738 * P_HIKEY, overwriting P_HIKEY). we need to do this when we discover
739 * that we have built an ItemId array in what has turned out to be a
740 * P_RIGHTMOST page.
741 */
742static void
743_bt_slideleft(Page page)
744{
745 OffsetNumber off;
746 OffsetNumber maxoff;
747 ItemId previi;
748 ItemId thisii;
749
750 if (!PageIsEmpty(page))
751 {
752 maxoff = PageGetMaxOffsetNumber(page);
753 previi = PageGetItemId(page, P_HIKEY);
754 for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off))
755 {
756 thisii = PageGetItemId(page, off);
757 *previi = *thisii;
758 previi = thisii;
759 }
760 ((PageHeader) page)->pd_lower -= sizeof(ItemIdData);
761 }
762}
763
764/*
765 * Add an item to a page being built.
766 *
767 * The main difference between this routine and a bare PageAddItem call
768 * is that this code knows that the leftmost data item on a non-leaf
769 * btree page doesn't need to have a key. Therefore, it strips such
770 * items down to just the item header.
771 *
772 * This is almost like nbtinsert.c's _bt_pgaddtup(), but we can't use
773 * that because it assumes that P_RIGHTMOST() will return the correct
774 * answer for the page. Here, we don't know yet if the page will be
775 * rightmost. Offset P_FIRSTKEY is always the first data key.
776 */
777static void
778_bt_sortaddtup(Page page,
779 Size itemsize,
780 IndexTuple itup,
781 OffsetNumber itup_off)
782{
783 BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
784 IndexTupleData trunctuple;
785
786 if (!P_ISLEAF(opaque) && itup_off == P_FIRSTKEY)
787 {
788 trunctuple = *itup;
789 trunctuple.t_info = sizeof(IndexTupleData);
790 /* Deliberately zero INDEX_ALT_TID_MASK bits */
791 BTreeTupleSetNAtts(&trunctuple, 0);
792 itup = &trunctuple;
793 itemsize = sizeof(IndexTupleData);
794 }
795
796 if (PageAddItem(page, (Item) itup, itemsize, itup_off,
797 false, false) == InvalidOffsetNumber)
798 elog(ERROR, "failed to add item to the index page");
799}
800
801/*----------
802 * Add an item to a disk page from the sort output.
803 *
804 * We must be careful to observe the page layout conventions of nbtsearch.c:
805 * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY.
806 * - on non-leaf pages, the key portion of the first item need not be
807 * stored, we should store only the link.
808 *
809 * A leaf page being built looks like:
810 *
811 * +----------------+---------------------------------+
812 * | PageHeaderData | linp0 linp1 linp2 ... |
813 * +-----------+----+---------------------------------+
814 * | ... linpN | |
815 * +-----------+--------------------------------------+
816 * | ^ last |
817 * | |
818 * +-------------+------------------------------------+
819 * | | itemN ... |
820 * +-------------+------------------+-----------------+
821 * | ... item3 item2 item1 | "special space" |
822 * +--------------------------------+-----------------+
823 *
824 * Contrast this with the diagram in bufpage.h; note the mismatch
825 * between linps and items. This is because we reserve linp0 as a
826 * placeholder for the pointer to the "high key" item; when we have
827 * filled up the page, we will set linp0 to point to itemN and clear
828 * linpN. On the other hand, if we find this is the last (rightmost)
829 * page, we leave the items alone and slide the linp array over. If
830 * the high key is to be truncated, offset 1 is deleted, and we insert
831 * the truncated high key at offset 1.
832 *
833 * 'last' pointer indicates the last offset added to the page.
834 *----------
835 */
836static void
837_bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
838{
839 Page npage;
840 BlockNumber nblkno;
841 OffsetNumber last_off;
842 Size pgspc;
843 Size itupsz;
844 bool isleaf;
845
846 /*
847 * This is a handy place to check for cancel interrupts during the btree
848 * load phase of index creation.
849 */
850 CHECK_FOR_INTERRUPTS();
851
852 npage = state->btps_page;
853 nblkno = state->btps_blkno;
854 last_off = state->btps_lastoff;
855
856 pgspc = PageGetFreeSpace(npage);
857 itupsz = IndexTupleSize(itup);
858 itupsz = MAXALIGN(itupsz);
859 /* Leaf case has slightly different rules due to suffix truncation */
860 isleaf = (state->btps_level == 0);
861
862 /*
863 * Check whether the new item can fit on a btree page on current level at
864 * all.
865 *
866 * Every newly built index will treat heap TID as part of the keyspace,
867 * which imposes the requirement that new high keys must occasionally have
868 * a heap TID appended within _bt_truncate(). That may leave a new pivot
869 * tuple one or two MAXALIGN() quantums larger than the original first
870 * right tuple it's derived from. v4 deals with the problem by decreasing
871 * the limit on the size of tuples inserted on the leaf level by the same
872 * small amount. Enforce the new v4+ limit on the leaf level, and the old
873 * limit on internal levels, since pivot tuples may need to make use of
874 * the reserved space. This should never fail on internal pages.
875 */
876 if (unlikely(itupsz > BTMaxItemSize(npage)))
877 _bt_check_third_page(wstate->index, wstate->heap, isleaf, npage,
878 itup);
879
880 /*
881 * Check to see if current page will fit new item, with space left over to
882 * append a heap TID during suffix truncation when page is a leaf page.
883 *
884 * It is guaranteed that we can fit at least 2 non-pivot tuples plus a
885 * high key with heap TID when finishing off a leaf page, since we rely on
886 * _bt_check_third_page() rejecting oversized non-pivot tuples. On
887 * internal pages we can always fit 3 pivot tuples with larger internal
888 * page tuple limit (includes page high key).
889 *
890 * Most of the time, a page is only "full" in the sense that the soft
891 * fillfactor-wise limit has been exceeded. However, we must always leave
892 * at least two items plus a high key on each page before starting a new
893 * page. Disregard fillfactor and insert on "full" current page if we
894 * don't have the minimum number of items yet. (Note that we deliberately
895 * assume that suffix truncation neither enlarges nor shrinks new high key
896 * when applying soft limit.)
897 */
898 if (pgspc < itupsz + (isleaf ? MAXALIGN(sizeof(ItemPointerData)) : 0) ||
899 (pgspc < state->btps_full && last_off > P_FIRSTKEY))
900 {
901 /*
902 * Finish off the page and write it out.
903 */
904 Page opage = npage;
905 BlockNumber oblkno = nblkno;
906 ItemId ii;
907 ItemId hii;
908 IndexTuple oitup;
909
910 /* Create new page of same level */
911 npage = _bt_blnewpage(state->btps_level);
912
913 /* and assign it a page position */
914 nblkno = wstate->btws_pages_alloced++;
915
916 /*
917 * We copy the last item on the page into the new page, and then
918 * rearrange the old page so that the 'last item' becomes its high key
919 * rather than a true data item. There had better be at least two
920 * items on the page already, else the page would be empty of useful
921 * data.
922 */
923 Assert(last_off > P_FIRSTKEY);
924 ii = PageGetItemId(opage, last_off);
925 oitup = (IndexTuple) PageGetItem(opage, ii);
926 _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY);
927
928 /*
929 * Move 'last' into the high key position on opage. _bt_blnewpage()
930 * allocated empty space for a line pointer when opage was first
931 * created, so this is a matter of rearranging already-allocated space
932 * on page, and initializing high key line pointer. (Actually, leaf
933 * pages must also swap oitup with a truncated version of oitup, which
934 * is sometimes larger than oitup, though never by more than the space
935 * needed to append a heap TID.)
936 */
937 hii = PageGetItemId(opage, P_HIKEY);
938 *hii = *ii;
939 ItemIdSetUnused(ii); /* redundant */
940 ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);
941
942 if (isleaf)
943 {
944 IndexTuple lastleft;
945 IndexTuple truncated;
946 Size truncsz;
947
948 /*
949 * Truncate away any unneeded attributes from high key on leaf
950 * level. This is only done at the leaf level because downlinks
951 * in internal pages are either negative infinity items, or get
952 * their contents from copying from one level down. See also:
953 * _bt_split().
954 *
955 * We don't try to bias our choice of split point to make it more
956 * likely that _bt_truncate() can truncate away more attributes,
957 * whereas the split point passed to _bt_split() is chosen much
958 * more delicately. Suffix truncation is mostly useful because it
959 * improves space utilization for workloads with random
960 * insertions. It doesn't seem worthwhile to add logic for
961 * choosing a split point here for a benefit that is bound to be
962 * much smaller.
963 *
964 * Since the truncated tuple is often smaller than the original
965 * tuple, it cannot just be copied in place (besides, we want to
966 * actually save space on the leaf page). We delete the original
967 * high key, and add our own truncated high key at the same
968 * offset.
969 *
970 * Note that the page layout won't be changed very much. oitup is
971 * already located at the physical beginning of tuple space, so we
972 * only shift the line pointer array back and forth, and overwrite
973 * the tuple space previously occupied by oitup. This is fairly
974 * cheap.
975 */
976 ii = PageGetItemId(opage, OffsetNumberPrev(last_off));
977 lastleft = (IndexTuple) PageGetItem(opage, ii);
978
979 truncated = _bt_truncate(wstate->index, lastleft, oitup,
980 wstate->inskey);
981 truncsz = IndexTupleSize(truncated);
982 PageIndexTupleDelete(opage, P_HIKEY);
983 _bt_sortaddtup(opage, truncsz, truncated, P_HIKEY);
984 pfree(truncated);
985
986 /* oitup should continue to point to the page's high key */
987 hii = PageGetItemId(opage, P_HIKEY);
988 oitup = (IndexTuple) PageGetItem(opage, hii);
989 }
990
991 /*
992 * Link the old page into its parent, using its minimum key. If we
993 * don't have a parent, we have to create one; this adds a new btree
994 * level.
995 */
996 if (state->btps_next == NULL)
997 state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);
998
999 Assert((BTreeTupleGetNAtts(state->btps_minkey, wstate->index) <=
1000 IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
1001 BTreeTupleGetNAtts(state->btps_minkey, wstate->index) > 0) ||
1002 P_LEFTMOST((BTPageOpaque) PageGetSpecialPointer(opage)));
1003 Assert(BTreeTupleGetNAtts(state->btps_minkey, wstate->index) == 0 ||
1004 !P_LEFTMOST((BTPageOpaque) PageGetSpecialPointer(opage)));
1005 BTreeInnerTupleSetDownLink(state->btps_minkey, oblkno);
1006 _bt_buildadd(wstate, state->btps_next, state->btps_minkey);
1007 pfree(state->btps_minkey);
1008
1009 /*
1010 * Save a copy of the high key from the old page. It is also used as
1011 * the minimum key for the new page.
1012 */
1013 state->btps_minkey = CopyIndexTuple(oitup);
1014
1015 /*
1016 * Set the sibling links for both pages.
1017 */
1018 {
1019 BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage);
1020 BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(npage);
1021
1022 oopaque->btpo_next = nblkno;
1023 nopaque->btpo_prev = oblkno;
1024 nopaque->btpo_next = P_NONE; /* redundant */
1025 }
1026
1027 /*
1028 * Write out the old page. We never need to touch it again, so we can
1029 * free the opage workspace too.
1030 */
1031 _bt_blwritepage(wstate, opage, oblkno);
1032
1033 /*
1034 * Reset last_off to point to new page
1035 */
1036 last_off = P_FIRSTKEY;
1037 }
1038
1039 /*
1040 * By here, either original page is still the current page, or a new page
1041 * was created that became the current page. Either way, the current page
1042 * definitely has space for new item.
1043 *
1044 * If the new item is the first for its page, stash a copy for later. Note
1045 * this will only happen for the first item on a level; on later pages,
1046 * the first item for a page is copied from the prior page in the code
1047 * above. The minimum key for an entire level is nothing more than a
1048 * minus infinity (downlink only) pivot tuple placeholder.
1049 */
1050 if (last_off == P_HIKEY)
1051 {
1052 Assert(state->btps_minkey == NULL);
1053 state->btps_minkey = CopyIndexTuple(itup);
1054 /* _bt_sortaddtup() will perform full truncation later */
1055 BTreeTupleSetNAtts(state->btps_minkey, 0);
1056 }
1057
1058 /*
1059 * Add the new item into the current page.
1060 */
1061 last_off = OffsetNumberNext(last_off);
1062 _bt_sortaddtup(npage, itupsz, itup, last_off);
1063
1064 state->btps_page = npage;
1065 state->btps_blkno = nblkno;
1066 state->btps_lastoff = last_off;
1067}
1068
/*
 * Finish writing out the completed btree.
 *
 * Walks the linked list of per-level states (leaf level first), completing
 * the final, still-pending page of each level: every such page is the
 * rightmost page of its level, so it is linked into its parent (possibly
 * splitting the parent's last page -- harmless, since we haven't reached
 * that level yet), slid left, and written out.  The single page of the
 * topmost level becomes the root.  Finally the metapage is written, which
 * is what makes the index valid.
 *
 * "state" may be NULL when no tuples were loaded at all; in that case we
 * just write a metapage whose root pointer is P_NONE.
 */
static void
_bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
{
	BTPageState *s;
	BlockNumber rootblkno = P_NONE;
	uint32		rootlevel = 0;
	Page		metapage;

	/*
	 * Each iteration of this loop completes one more level of the tree.
	 */
	for (s = state; s != NULL; s = s->btps_next)
	{
		BlockNumber blkno;
		BTPageOpaque opaque;

		blkno = s->btps_blkno;
		opaque = (BTPageOpaque) PageGetSpecialPointer(s->btps_page);

		/*
		 * We have to link the last page on this level to somewhere.
		 *
		 * If we're at the top, it's the root, so attach it to the metapage.
		 * Otherwise, add an entry for it to its parent using its minimum key.
		 * This may cause the last page of the parent level to split, but
		 * that's not a problem -- we haven't gotten to it yet.
		 */
		if (s->btps_next == NULL)
		{
			opaque->btpo_flags |= BTP_ROOT;
			rootblkno = blkno;
			rootlevel = s->btps_level;
		}
		else
		{
			/*
			 * Sanity checks: a leftmost page's minimum key must be truncated
			 * to zero attributes (minus-infinity placeholder); any other
			 * page's minimum key must retain between 1 and the number of key
			 * attributes.
			 */
			Assert((BTreeTupleGetNAtts(s->btps_minkey, wstate->index) <=
					IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
					BTreeTupleGetNAtts(s->btps_minkey, wstate->index) > 0) ||
				   P_LEFTMOST(opaque));
			Assert(BTreeTupleGetNAtts(s->btps_minkey, wstate->index) == 0 ||
				   !P_LEFTMOST(opaque));
			BTreeInnerTupleSetDownLink(s->btps_minkey, blkno);
			_bt_buildadd(wstate, s->btps_next, s->btps_minkey);
			pfree(s->btps_minkey);
			s->btps_minkey = NULL;
		}

		/*
		 * This is the rightmost page, so the ItemId array needs to be slid
		 * back one slot.  Then we can dump out the page.
		 */
		_bt_slideleft(s->btps_page);
		_bt_blwritepage(wstate, s->btps_page, s->btps_blkno);
		s->btps_page = NULL;	/* writepage freed the workspace */
	}

	/*
	 * As the last step in the process, construct the metapage and make it
	 * point to the new root (unless we had no data at all, in which case it's
	 * set to point to "P_NONE").  This changes the index to the "valid" state
	 * by filling in a valid magic number in the metapage.
	 */
	metapage = (Page) palloc(BLCKSZ);
	_bt_initmetapage(metapage, rootblkno, rootlevel);
	_bt_blwritepage(wstate, metapage, BTREE_METAPAGE);
}
1138
/*
 * Read tuples in correct sort order from tuplesort, and load them into
 * btree leaves.
 *
 * btspool holds the (sorted) index tuples proper.  btspool2, when non-NULL,
 * is a second spool of dead tuples that must be merged with the first spool
 * in key order as we load pages.  When the index is WAL-logged, the whole
 * index is fsync'd at the end, since the pages were written outside shared
 * buffers (see comment below).
 */
static void
_bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
{
	BTPageState *state = NULL;
	bool		merge = (btspool2 != NULL);
	IndexTuple	itup,
				itup2 = NULL;
	bool		load1;
	TupleDesc	tupdes = RelationGetDescr(wstate->index);
	int			i,
				keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index);
	SortSupport sortKeys;
	int64		tuples_done = 0;

	if (merge)
	{
		/*
		 * Another BTSpool for dead tuples exists. Now we have to merge
		 * btspool and btspool2.
		 */

		/* the preparation of merge */
		itup = tuplesort_getindextuple(btspool->sortstate, true);
		itup2 = tuplesort_getindextuple(btspool2->sortstate, true);

		/* Prepare SortSupport data for each column */
		sortKeys = (SortSupport) palloc0(keysz * sizeof(SortSupportData));

		for (i = 0; i < keysz; i++)
		{
			SortSupport sortKey = sortKeys + i;
			ScanKey		scanKey = wstate->inskey->scankeys + i;
			int16		strategy;

			sortKey->ssup_cxt = CurrentMemoryContext;
			sortKey->ssup_collation = scanKey->sk_collation;
			sortKey->ssup_nulls_first =
				(scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
			sortKey->ssup_attno = scanKey->sk_attno;
			/* Abbreviation is not supported here */
			sortKey->abbreviate = false;

			AssertState(sortKey->ssup_attno != 0);

			/* DESC columns compare with the greater-than strategy */
			strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ?
				BTGreaterStrategyNumber : BTLessStrategyNumber;

			PrepareSortSupportFromIndexRel(wstate->index, strategy, sortKey);
		}

		for (;;)
		{
			load1 = true;		/* load BTSpool next ? */
			if (itup2 == NULL)
			{
				/* spool2 exhausted; done once spool1 is empty too */
				if (itup == NULL)
					break;
			}
			else if (itup != NULL)
			{
				/* Both spools have tuples; compare column by column */
				int32		compare = 0;

				for (i = 1; i <= keysz; i++)
				{
					SortSupport entry;
					Datum		attrDatum1,
								attrDatum2;
					bool		isNull1,
								isNull2;

					entry = sortKeys + i - 1;
					attrDatum1 = index_getattr(itup, i, tupdes, &isNull1);
					attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2);

					compare = ApplySortComparator(attrDatum1, isNull1,
												  attrDatum2, isNull2,
												  entry);
					if (compare > 0)
					{
						load1 = false;
						break;
					}
					else if (compare < 0)
						break;
				}

				/*
				 * If key values are equal, we sort on ItemPointer. This is
				 * required for btree indexes, since heap TID is treated as an
				 * implicit last key attribute in order to ensure that all
				 * keys in the index are physically unique.
				 */
				if (compare == 0)
				{
					compare = ItemPointerCompare(&itup->t_tid, &itup2->t_tid);
					Assert(compare != 0);
					if (compare > 0)
						load1 = false;
				}
			}
			else
				load1 = false;	/* spool1 exhausted; drain spool2 */

			/* When we see first tuple, create first index page */
			if (state == NULL)
				state = _bt_pagestate(wstate, 0);

			if (load1)
			{
				_bt_buildadd(wstate, state, itup);
				itup = tuplesort_getindextuple(btspool->sortstate, true);
			}
			else
			{
				_bt_buildadd(wstate, state, itup2);
				itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
			}

			/* Report progress */
			pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
										 ++tuples_done);
		}
		pfree(sortKeys);
	}
	else
	{
		/* merge is unnecessary */
		while ((itup = tuplesort_getindextuple(btspool->sortstate,
											   true)) != NULL)
		{
			/* When we see first tuple, create first index page */
			if (state == NULL)
				state = _bt_pagestate(wstate, 0);

			_bt_buildadd(wstate, state, itup);

			/* Report progress */
			pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
										 ++tuples_done);
		}
	}

	/* Close down final pages and write the metapage */
	_bt_uppershutdown(wstate, state);

	/*
	 * If the index is WAL-logged, we must fsync it down to disk before it's
	 * safe to commit the transaction.  (For a non-WAL-logged index we don't
	 * care since the index will be uninteresting after a crash anyway.)
	 *
	 * It's obvious that we must do this when not WAL-logging the build. It's
	 * less obvious that we have to do it even if we did WAL-log the index
	 * pages.  The reason is that since we're building outside shared buffers,
	 * a CHECKPOINT occurring during the build has no way to flush the
	 * previously written data to disk (indeed it won't know the index even
	 * exists).  A crash later on would replay WAL from the checkpoint,
	 * therefore it wouldn't replay our earlier WAL entries. If we do not
	 * fsync those pages here, they might still not be on disk when the crash
	 * occurs.
	 */
	if (RelationNeedsWAL(wstate->index))
	{
		RelationOpenSmgr(wstate->index);
		smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM);
	}
}
1309
1310/*
1311 * Create parallel context, and launch workers for leader.
1312 *
1313 * buildstate argument should be initialized (with the exception of the
1314 * tuplesort state in spools, which may later be created based on shared
1315 * state initially set up here).
1316 *
1317 * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY.
1318 *
1319 * request is the target number of parallel worker processes to launch.
1320 *
1321 * Sets buildstate's BTLeader, which caller must use to shut down parallel
1322 * mode by passing it to _bt_end_parallel() at the very end of its index
1323 * build. If not even a single worker process can be launched, this is
1324 * never set, and caller should proceed with a serial index build.
1325 */
1326static void
1327_bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request)
1328{
1329 ParallelContext *pcxt;
1330 int scantuplesortstates;
1331 Snapshot snapshot;
1332 Size estbtshared;
1333 Size estsort;
1334 BTShared *btshared;
1335 Sharedsort *sharedsort;
1336 Sharedsort *sharedsort2;
1337 BTSpool *btspool = buildstate->spool;
1338 BTLeader *btleader = (BTLeader *) palloc0(sizeof(BTLeader));
1339 bool leaderparticipates = true;
1340 char *sharedquery;
1341 int querylen;
1342
1343#ifdef DISABLE_LEADER_PARTICIPATION
1344 leaderparticipates = false;
1345#endif
1346
1347 /*
1348 * Enter parallel mode, and create context for parallel build of btree
1349 * index
1350 */
1351 EnterParallelMode();
1352 Assert(request > 0);
1353 pcxt = CreateParallelContext("postgres", "_bt_parallel_build_main",
1354 request);
1355 scantuplesortstates = leaderparticipates ? request + 1 : request;
1356
1357 /*
1358 * Prepare for scan of the base relation. In a normal index build, we use
1359 * SnapshotAny because we must retrieve all tuples and do our own time
1360 * qual checks (because we have to index RECENTLY_DEAD tuples). In a
1361 * concurrent build, we take a regular MVCC snapshot and index whatever's
1362 * live according to that.
1363 */
1364 if (!isconcurrent)
1365 snapshot = SnapshotAny;
1366 else
1367 snapshot = RegisterSnapshot(GetTransactionSnapshot());
1368
1369 /*
1370 * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and
1371 * PARALLEL_KEY_TUPLESORT tuplesort workspace
1372 */
1373 estbtshared = _bt_parallel_estimate_shared(btspool->heap, snapshot);
1374 shm_toc_estimate_chunk(&pcxt->estimator, estbtshared);
1375 estsort = tuplesort_estimate_shared(scantuplesortstates);
1376 shm_toc_estimate_chunk(&pcxt->estimator, estsort);
1377
1378 /*
1379 * Unique case requires a second spool, and so we may have to account for
1380 * another shared workspace for that -- PARALLEL_KEY_TUPLESORT_SPOOL2
1381 */
1382 if (!btspool->isunique)
1383 shm_toc_estimate_keys(&pcxt->estimator, 2);
1384 else
1385 {
1386 shm_toc_estimate_chunk(&pcxt->estimator, estsort);
1387 shm_toc_estimate_keys(&pcxt->estimator, 3);
1388 }
1389
1390 /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
1391 querylen = strlen(debug_query_string);
1392 shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
1393 shm_toc_estimate_keys(&pcxt->estimator, 1);
1394
1395 /* Everyone's had a chance to ask for space, so now create the DSM */
1396 InitializeParallelDSM(pcxt);
1397
1398 /* Store shared build state, for which we reserved space */
1399 btshared = (BTShared *) shm_toc_allocate(pcxt->toc, estbtshared);
1400 /* Initialize immutable state */
1401 btshared->heaprelid = RelationGetRelid(btspool->heap);
1402 btshared->indexrelid = RelationGetRelid(btspool->index);
1403 btshared->isunique = btspool->isunique;
1404 btshared->isconcurrent = isconcurrent;
1405 btshared->scantuplesortstates = scantuplesortstates;
1406 ConditionVariableInit(&btshared->workersdonecv);
1407 SpinLockInit(&btshared->mutex);
1408 /* Initialize mutable state */
1409 btshared->nparticipantsdone = 0;
1410 btshared->reltuples = 0.0;
1411 btshared->havedead = false;
1412 btshared->indtuples = 0.0;
1413 btshared->brokenhotchain = false;
1414 table_parallelscan_initialize(btspool->heap,
1415 ParallelTableScanFromBTShared(btshared),
1416 snapshot);
1417
1418 /*
1419 * Store shared tuplesort-private state, for which we reserved space.
1420 * Then, initialize opaque state using tuplesort routine.
1421 */
1422 sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
1423 tuplesort_initialize_shared(sharedsort, scantuplesortstates,
1424 pcxt->seg);
1425
1426 shm_toc_insert(pcxt->toc, PARALLEL_KEY_BTREE_SHARED, btshared);
1427 shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);
1428
1429 /* Unique case requires a second spool, and associated shared state */
1430 if (!btspool->isunique)
1431 sharedsort2 = NULL;
1432 else
1433 {
1434 /*
1435 * Store additional shared tuplesort-private state, for which we
1436 * reserved space. Then, initialize opaque state using tuplesort
1437 * routine.
1438 */
1439 sharedsort2 = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
1440 tuplesort_initialize_shared(sharedsort2, scantuplesortstates,
1441 pcxt->seg);
1442
1443 shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT_SPOOL2, sharedsort2);
1444 }
1445
1446 /* Store query string for workers */
1447 sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
1448 memcpy(sharedquery, debug_query_string, querylen + 1);
1449 shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, sharedquery);
1450
1451 /* Launch workers, saving status for leader/caller */
1452 LaunchParallelWorkers(pcxt);
1453 btleader->pcxt = pcxt;
1454 btleader->nparticipanttuplesorts = pcxt->nworkers_launched;
1455 if (leaderparticipates)
1456 btleader->nparticipanttuplesorts++;
1457 btleader->btshared = btshared;
1458 btleader->sharedsort = sharedsort;
1459 btleader->sharedsort2 = sharedsort2;
1460 btleader->snapshot = snapshot;
1461
1462 /* If no workers were successfully launched, back out (do serial build) */
1463 if (pcxt->nworkers_launched == 0)
1464 {
1465 _bt_end_parallel(btleader);
1466 return;
1467 }
1468
1469 /* Save leader state now that it's clear build will be parallel */
1470 buildstate->btleader = btleader;
1471
1472 /* Join heap scan ourselves */
1473 if (leaderparticipates)
1474 _bt_leader_participate_as_worker(buildstate);
1475
1476 /*
1477 * Caller needs to wait for all launched workers when we return. Make
1478 * sure that the failure-to-start case will not hang forever.
1479 */
1480 WaitForParallelWorkersToAttach(pcxt);
1481}
1482
1483/*
1484 * Shut down workers, destroy parallel context, and end parallel mode.
1485 */
1486static void
1487_bt_end_parallel(BTLeader *btleader)
1488{
1489 /* Shutdown worker processes */
1490 WaitForParallelWorkersToFinish(btleader->pcxt);
1491 /* Free last reference to MVCC snapshot, if one was used */
1492 if (IsMVCCSnapshot(btleader->snapshot))
1493 UnregisterSnapshot(btleader->snapshot);
1494 DestroyParallelContext(btleader->pcxt);
1495 ExitParallelMode();
1496}
1497
1498/*
1499 * Returns size of shared memory required to store state for a parallel
1500 * btree index build based on the snapshot its parallel scan will use.
1501 */
1502static Size
1503_bt_parallel_estimate_shared(Relation heap, Snapshot snapshot)
1504{
1505 /* c.f. shm_toc_allocate as to why BUFFERALIGN is used */
1506 return add_size(BUFFERALIGN(sizeof(BTShared)),
1507 table_parallelscan_estimate(heap, snapshot));
1508}
1509
1510/*
1511 * Within leader, wait for end of heap scan.
1512 *
1513 * When called, parallel heap scan started by _bt_begin_parallel() will
1514 * already be underway within worker processes (when leader participates
1515 * as a worker, we should end up here just as workers are finishing).
1516 *
1517 * Fills in fields needed for ambuild statistics, and lets caller set
1518 * field indicating that some worker encountered a broken HOT chain.
1519 *
1520 * Returns the total number of heap tuples scanned.
1521 */
1522static double
1523_bt_parallel_heapscan(BTBuildState *buildstate, bool *brokenhotchain)
1524{
1525 BTShared *btshared = buildstate->btleader->btshared;
1526 int nparticipanttuplesorts;
1527 double reltuples;
1528
1529 nparticipanttuplesorts = buildstate->btleader->nparticipanttuplesorts;
1530 for (;;)
1531 {
1532 SpinLockAcquire(&btshared->mutex);
1533 if (btshared->nparticipantsdone == nparticipanttuplesorts)
1534 {
1535 buildstate->havedead = btshared->havedead;
1536 buildstate->indtuples = btshared->indtuples;
1537 *brokenhotchain = btshared->brokenhotchain;
1538 reltuples = btshared->reltuples;
1539 SpinLockRelease(&btshared->mutex);
1540 break;
1541 }
1542 SpinLockRelease(&btshared->mutex);
1543
1544 ConditionVariableSleep(&btshared->workersdonecv,
1545 WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN);
1546 }
1547
1548 ConditionVariableCancelSleep();
1549
1550 return reltuples;
1551}
1552
1553/*
1554 * Within leader, participate as a parallel worker.
1555 */
1556static void
1557_bt_leader_participate_as_worker(BTBuildState *buildstate)
1558{
1559 BTLeader *btleader = buildstate->btleader;
1560 BTSpool *leaderworker;
1561 BTSpool *leaderworker2;
1562 int sortmem;
1563
1564 /* Allocate memory and initialize private spool */
1565 leaderworker = (BTSpool *) palloc0(sizeof(BTSpool));
1566 leaderworker->heap = buildstate->spool->heap;
1567 leaderworker->index = buildstate->spool->index;
1568 leaderworker->isunique = buildstate->spool->isunique;
1569
1570 /* Initialize second spool, if required */
1571 if (!btleader->btshared->isunique)
1572 leaderworker2 = NULL;
1573 else
1574 {
1575 /* Allocate memory for worker's own private secondary spool */
1576 leaderworker2 = (BTSpool *) palloc0(sizeof(BTSpool));
1577
1578 /* Initialize worker's own secondary spool */
1579 leaderworker2->heap = leaderworker->heap;
1580 leaderworker2->index = leaderworker->index;
1581 leaderworker2->isunique = false;
1582 }
1583
1584 /*
1585 * Might as well use reliable figure when doling out maintenance_work_mem
1586 * (when requested number of workers were not launched, this will be
1587 * somewhat higher than it is for other workers).
1588 */
1589 sortmem = maintenance_work_mem / btleader->nparticipanttuplesorts;
1590
1591 /* Perform work common to all participants */
1592 _bt_parallel_scan_and_sort(leaderworker, leaderworker2, btleader->btshared,
1593 btleader->sharedsort, btleader->sharedsort2,
1594 sortmem, true);
1595
1596#ifdef BTREE_BUILD_STATS
1597 if (log_btree_build_stats)
1598 {
1599 ShowUsage("BTREE BUILD (Leader Partial Spool) STATISTICS");
1600 ResetUsage();
1601 }
1602#endif /* BTREE_BUILD_STATS */
1603}
1604
/*
 * Perform work within a launched parallel process.
 *
 * This is the entry point named in _bt_begin_parallel()'s
 * CreateParallelContext() call.  Each worker attaches to the state that the
 * leader stored in the shm toc, opens the heap and index, performs its
 * share of the parallel scan and sort, and exits.
 */
void
_bt_parallel_build_main(dsm_segment *seg, shm_toc *toc)
{
	char	   *sharedquery;
	BTSpool    *btspool;
	BTSpool    *btspool2;
	BTShared   *btshared;
	Sharedsort *sharedsort;
	Sharedsort *sharedsort2;
	Relation	heapRel;
	Relation	indexRel;
	LOCKMODE	heapLockmode;
	LOCKMODE	indexLockmode;
	int			sortmem;

#ifdef BTREE_BUILD_STATS
	if (log_btree_build_stats)
		ResetUsage();
#endif							/* BTREE_BUILD_STATS */

	/* Set debug_query_string for individual workers first */
	sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, false);
	debug_query_string = sharedquery;

	/* Report the query string from leader */
	pgstat_report_activity(STATE_RUNNING, debug_query_string);

	/* Look up nbtree shared state */
	btshared = shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false);

	/* Open relations using lock modes known to be obtained by index.c */
	if (!btshared->isconcurrent)
	{
		heapLockmode = ShareLock;
		indexLockmode = AccessExclusiveLock;
	}
	else
	{
		/* CREATE INDEX CONCURRENTLY case uses weaker lock modes */
		heapLockmode = ShareUpdateExclusiveLock;
		indexLockmode = RowExclusiveLock;
	}

	/* Open relations within worker */
	heapRel = table_open(btshared->heaprelid, heapLockmode);
	indexRel = index_open(btshared->indexrelid, indexLockmode);

	/* Initialize worker's own spool */
	btspool = (BTSpool *) palloc0(sizeof(BTSpool));
	btspool->heap = heapRel;
	btspool->index = indexRel;
	btspool->isunique = btshared->isunique;

	/* Look up shared state private to tuplesort.c */
	sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
	tuplesort_attach_shared(sharedsort, seg);
	if (!btshared->isunique)
	{
		btspool2 = NULL;
		sharedsort2 = NULL;
	}
	else
	{
		/* Allocate memory for worker's own private secondary spool */
		btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));

		/* Initialize worker's own secondary spool */
		btspool2->heap = btspool->heap;
		btspool2->index = btspool->index;
		btspool2->isunique = false;
		/* Look up shared state private to tuplesort.c */
		sharedsort2 = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT_SPOOL2, false);
		tuplesort_attach_shared(sharedsort2, seg);
	}

	/*
	 * Perform sorting of spool, and possibly a spool2.  Each participant
	 * gets an even share of maintenance_work_mem.
	 */
	sortmem = maintenance_work_mem / btshared->scantuplesortstates;
	_bt_parallel_scan_and_sort(btspool, btspool2, btshared, sharedsort,
							   sharedsort2, sortmem, false);

#ifdef BTREE_BUILD_STATS
	if (log_btree_build_stats)
	{
		ShowUsage("BTREE BUILD (Worker Partial Spool) STATISTICS");
		ResetUsage();
	}
#endif							/* BTREE_BUILD_STATS */

	index_close(indexRel, indexLockmode);
	table_close(heapRel, heapLockmode);
}
1698
/*
 * Perform a worker's portion of a parallel sort.
 *
 * This generates a tuplesort for passed btspool, and a second tuplesort
 * state if a second btspool is need (i.e. for unique index builds).  All
 * other spool fields should already be set when this is called.
 *
 * sortmem is the amount of working memory to use within each worker,
 * expressed in KBs.
 *
 * progress indicates whether to update the command progress view (true for
 * the leader's own participation; workers pass false).
 *
 * When this returns, workers are done, and need only release resources.
 */
static void
_bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
						   BTShared *btshared, Sharedsort *sharedsort,
						   Sharedsort *sharedsort2, int sortmem, bool progress)
{
	SortCoordinate coordinate;
	BTBuildState buildstate;
	TableScanDesc scan;
	double		reltuples;
	IndexInfo  *indexInfo;

	/* Initialize local tuplesort coordination state (worker side) */
	coordinate = palloc0(sizeof(SortCoordinateData));
	coordinate->isWorker = true;
	coordinate->nParticipants = -1;
	coordinate->sharedsort = sharedsort;

	/* Begin "partial" tuplesort */
	btspool->sortstate = tuplesort_begin_index_btree(btspool->heap,
													 btspool->index,
													 btspool->isunique,
													 sortmem, coordinate,
													 false);

	/*
	 * Just as with serial case, there may be a second spool.  If so, a
	 * second, dedicated spool2 partial tuplesort is required.
	 */
	if (btspool2)
	{
		SortCoordinate coordinate2;

		/*
		 * We expect that the second one (for dead tuples) won't get very
		 * full, so we give it only work_mem (unless sortmem is less for
		 * worker).  Worker processes are generally permitted to allocate
		 * work_mem independently.
		 */
		coordinate2 = palloc0(sizeof(SortCoordinateData));
		coordinate2->isWorker = true;
		coordinate2->nParticipants = -1;
		coordinate2->sharedsort = sharedsort2;
		btspool2->sortstate =
			tuplesort_begin_index_btree(btspool->heap, btspool->index, false,
										Min(sortmem, work_mem), coordinate2,
										false);
	}

	/* Fill in buildstate for _bt_build_callback() */
	buildstate.isunique = btshared->isunique;
	buildstate.havedead = false;
	buildstate.heap = btspool->heap;
	buildstate.spool = btspool;
	buildstate.spool2 = btspool2;
	buildstate.indtuples = 0;
	buildstate.btleader = NULL;

	/* Join parallel scan */
	indexInfo = BuildIndexInfo(btspool->index);
	indexInfo->ii_Concurrent = btshared->isconcurrent;
	scan = table_beginscan_parallel(btspool->heap,
									ParallelTableScanFromBTShared(btshared));
	reltuples = table_index_build_scan(btspool->heap, btspool->index, indexInfo,
									   true, progress, _bt_build_callback,
									   (void *) &buildstate, scan);

	/*
	 * Execute this worker's part of the sort.
	 *
	 * Unlike leader and serial cases, we cannot avoid calling
	 * tuplesort_performsort() for spool2 if it ends up containing no dead
	 * tuples (this is disallowed for workers by tuplesort).
	 */
	tuplesort_performsort(btspool->sortstate);
	if (btspool2)
		tuplesort_performsort(btspool2->sortstate);

	/*
	 * Done.  Record ambuild statistics, and whether we encountered a broken
	 * HOT chain.  The mutex protects the shared accumulators, which other
	 * participants update concurrently.
	 */
	SpinLockAcquire(&btshared->mutex);
	btshared->nparticipantsdone++;
	btshared->reltuples += reltuples;
	if (buildstate.havedead)
		btshared->havedead = true;
	btshared->indtuples += buildstate.indtuples;
	if (indexInfo->ii_BrokenHotChain)
		btshared->brokenhotchain = true;
	SpinLockRelease(&btshared->mutex);

	/* Notify leader */
	ConditionVariableSignal(&btshared->workersdonecv);

	/* We can end tuplesorts immediately */
	tuplesort_end(btspool->sortstate);
	if (btspool2)
		tuplesort_end(btspool2->sortstate);
}
1810