1 | /*------------------------------------------------------------------------- |
2 | * |
3 | * nbtsort.c |
4 | * Build a btree from sorted input by loading leaf pages sequentially. |
5 | * |
6 | * NOTES |
7 | * |
8 | * We use tuplesort.c to sort the given index tuples into order. |
9 | * Then we scan the index tuples in order and build the btree pages |
10 | * for each level. We load source tuples into leaf-level pages. |
11 | * Whenever we fill a page at one level, we add a link to it to its |
12 | * parent level (starting a new parent level if necessary). When |
13 | * done, we write out each final page on each level, adding it to |
14 | * its parent level. When we have only one page on a level, it must be |
15 | * the root -- it can be attached to the btree metapage and we are done. |
16 | * |
17 | * It is not wise to pack the pages entirely full, since then *any* |
18 | * insertion would cause a split (and not only of the leaf page; the need |
19 | * for a split would cascade right up the tree). The steady-state load |
20 | * factor for btrees is usually estimated at 70%. We choose to pack leaf |
21 | * pages to the user-controllable fill factor (default 90%) while upper pages |
22 | * are always packed to 70%. This gives us reasonable density (there aren't |
23 | * many upper pages if the keys are reasonable-size) without risking a lot of |
24 | * cascading splits during early insertions. |
25 | * |
26 | * Formerly the index pages being built were kept in shared buffers, but |
27 | * that is of no value (since other backends have no interest in them yet) |
28 | * and it created locking problems for CHECKPOINT, because the upper-level |
29 | * pages were held exclusive-locked for long periods. Now we just build |
30 | * the pages in local memory and smgrwrite or smgrextend them as we finish |
31 | * them. They will need to be re-read into shared buffers on first use after |
32 | * the build finishes. |
33 | * |
34 | * Since the index will never be used unless it is completely built, |
35 | * from a crash-recovery point of view there is no need to WAL-log the |
36 | * steps of the build. After completing the index build, we can just sync |
37 | * the whole file to disk using smgrimmedsync() before exiting this module. |
38 | * This can be seen to be sufficient for crash recovery by considering that |
39 | * it's effectively equivalent to what would happen if a CHECKPOINT occurred |
40 | * just after the index build. However, it is clearly not sufficient if the |
41 | * DBA is using the WAL log for PITR or replication purposes, since another |
42 | * machine would not be able to reconstruct the index from WAL. Therefore, |
43 | * we log the completed index pages to WAL if and only if WAL archiving is |
44 | * active. |
45 | * |
46 | * This code isn't concerned about the FSM at all. The caller is responsible |
47 | * for initializing that. |
48 | * |
49 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
50 | * Portions Copyright (c) 1994, Regents of the University of California |
51 | * |
52 | * IDENTIFICATION |
53 | * src/backend/access/nbtree/nbtsort.c |
54 | * |
55 | *------------------------------------------------------------------------- |
56 | */ |
57 | |
58 | #include "postgres.h" |
59 | |
60 | #include "access/nbtree.h" |
61 | #include "access/parallel.h" |
62 | #include "access/relscan.h" |
63 | #include "access/table.h" |
64 | #include "access/tableam.h" |
65 | #include "access/xact.h" |
66 | #include "access/xlog.h" |
67 | #include "access/xloginsert.h" |
68 | #include "catalog/index.h" |
69 | #include "commands/progress.h" |
70 | #include "miscadmin.h" |
71 | #include "pgstat.h" |
72 | #include "storage/smgr.h" |
73 | #include "tcop/tcopprot.h" /* pgrminclude ignore */ |
74 | #include "utils/rel.h" |
75 | #include "utils/sortsupport.h" |
76 | #include "utils/tuplesort.h" |
77 | |
78 | |
79 | /* Magic numbers for parallel state sharing */ |
80 | #define PARALLEL_KEY_BTREE_SHARED UINT64CONST(0xA000000000000001) |
81 | #define PARALLEL_KEY_TUPLESORT UINT64CONST(0xA000000000000002) |
82 | #define PARALLEL_KEY_TUPLESORT_SPOOL2 UINT64CONST(0xA000000000000003) |
83 | #define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xA000000000000004) |
84 | |
85 | /* |
86 | * DISABLE_LEADER_PARTICIPATION disables the leader's participation in |
87 | * parallel index builds. This may be useful as a debugging aid. |
88 | #undef DISABLE_LEADER_PARTICIPATION |
89 | */ |
90 | |
91 | /* |
92 | * Status record for spooling/sorting phase. (Note we may have two of |
93 | * these due to the special requirements for uniqueness-checking with |
94 | * dead tuples.) |
95 | */ |
96 | typedef struct BTSpool |
97 | { |
98 | Tuplesortstate *sortstate; /* state data for tuplesort.c */ |
99 | Relation heap; |
100 | Relation index; |
101 | bool isunique; |
102 | } BTSpool; |
103 | |
104 | /* |
105 | * Status for index builds performed in parallel. This is allocated in a |
106 | * dynamic shared memory segment. Note that there is a separate tuplesort TOC |
107 | * entry, private to tuplesort.c but allocated by this module on its behalf. |
108 | */ |
109 | typedef struct BTShared |
110 | { |
111 | /* |
112 | * These fields are not modified during the sort. They primarily exist |
113 | * for the benefit of worker processes that need to create BTSpool state |
114 | * corresponding to that used by the leader. |
115 | */ |
116 | Oid heaprelid; |
117 | Oid indexrelid; |
118 | bool isunique; |
119 | bool isconcurrent; |
120 | int scantuplesortstates; |
121 | |
122 | /* |
123 | * workersdonecv is used to monitor the progress of workers. All parallel |
124 | * participants must indicate that they are done before leader can use |
125 | * mutable state that workers maintain during scan (and before leader can |
126 | * proceed to tuplesort_performsort()). |
127 | */ |
128 | ConditionVariable workersdonecv; |
129 | |
130 | /* |
	 * mutex protects all the mutable fields that follow.
132 | * |
133 | * These fields contain status information of interest to B-Tree index |
134 | * builds that must work just the same when an index is built in parallel. |
135 | */ |
136 | slock_t mutex; |
137 | |
138 | /* |
139 | * Mutable state that is maintained by workers, and reported back to |
140 | * leader at end of parallel scan. |
141 | * |
142 | * nparticipantsdone is number of worker processes finished. |
143 | * |
144 | * reltuples is the total number of input heap tuples. |
145 | * |
146 | * havedead indicates if RECENTLY_DEAD tuples were encountered during |
147 | * build. |
148 | * |
149 | * indtuples is the total number of tuples that made it into the index. |
150 | * |
151 | * brokenhotchain indicates if any worker detected a broken HOT chain |
152 | * during build. |
153 | */ |
154 | int nparticipantsdone; |
155 | double reltuples; |
156 | bool havedead; |
157 | double indtuples; |
158 | bool brokenhotchain; |
159 | |
160 | /* |
161 | * ParallelTableScanDescData data follows. Can't directly embed here, as |
162 | * implementations of the parallel table scan desc interface might need |
163 | * stronger alignment. |
164 | */ |
165 | } BTShared; |
166 | |
167 | /* |
168 | * Return pointer to a BTShared's parallel table scan. |
169 | * |
170 | * c.f. shm_toc_allocate as to why BUFFERALIGN is used, rather than just |
171 | * MAXALIGN. |
172 | */ |
173 | #define ParallelTableScanFromBTShared(shared) \ |
174 | (ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(BTShared))) |
175 | |
176 | /* |
177 | * Status for leader in parallel index build. |
178 | */ |
179 | typedef struct BTLeader |
180 | { |
181 | /* parallel context itself */ |
182 | ParallelContext *pcxt; |
183 | |
184 | /* |
185 | * nparticipanttuplesorts is the exact number of worker processes |
186 | * successfully launched, plus one leader process if it participates as a |
187 | * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader |
188 | * participating as a worker). |
189 | */ |
190 | int nparticipanttuplesorts; |
191 | |
192 | /* |
193 | * Leader process convenience pointers to shared state (leader avoids TOC |
194 | * lookups). |
195 | * |
196 | * btshared is the shared state for entire build. sharedsort is the |
197 | * shared, tuplesort-managed state passed to each process tuplesort. |
198 | * sharedsort2 is the corresponding btspool2 shared state, used only when |
199 | * building unique indexes. snapshot is the snapshot used by the scan iff |
200 | * an MVCC snapshot is required. |
201 | */ |
202 | BTShared *btshared; |
203 | Sharedsort *sharedsort; |
204 | Sharedsort *sharedsort2; |
205 | Snapshot snapshot; |
206 | } BTLeader; |
207 | |
208 | /* |
209 | * Working state for btbuild and its callback. |
210 | * |
211 | * When parallel CREATE INDEX is used, there is a BTBuildState for each |
212 | * participant. |
213 | */ |
214 | typedef struct BTBuildState |
215 | { |
216 | bool isunique; |
217 | bool havedead; |
218 | Relation heap; |
219 | BTSpool *spool; |
220 | |
221 | /* |
222 | * spool2 is needed only when the index is a unique index. Dead tuples are |
	 * put into spool2 instead of spool in order to avoid the uniqueness check.
224 | */ |
225 | BTSpool *spool2; |
226 | double indtuples; |
227 | |
228 | /* |
229 | * btleader is only present when a parallel index build is performed, and |
230 | * only in the leader process. (Actually, only the leader has a |
231 | * BTBuildState. Workers have their own spool and spool2, though.) |
232 | */ |
233 | BTLeader *btleader; |
234 | } BTBuildState; |
235 | |
236 | /* |
237 | * Status record for a btree page being built. We have one of these |
238 | * for each active tree level. |
239 | * |
240 | * The reason we need to store a copy of the minimum key is that we'll |
241 | * need to propagate it to the parent node when this page is linked |
242 | * into its parent. However, if the page is not a leaf page, the first |
243 | * entry on the page doesn't need to contain a key, so we will not have |
244 | * stored the key itself on the page. (You might think we could skip |
245 | * copying the minimum key on leaf pages, but actually we must have a |
246 | * writable copy anyway because we'll poke the page's address into it |
247 | * before passing it up to the parent...) |
248 | */ |
249 | typedef struct BTPageState |
250 | { |
251 | Page btps_page; /* workspace for page building */ |
252 | BlockNumber btps_blkno; /* block # to write this page at */ |
253 | IndexTuple btps_minkey; /* copy of minimum key (first item) on page */ |
254 | OffsetNumber btps_lastoff; /* last item offset loaded */ |
255 | uint32 btps_level; /* tree level (0 = leaf) */ |
256 | Size btps_full; /* "full" if less than this much free space */ |
257 | struct BTPageState *btps_next; /* link to parent level, if any */ |
258 | } BTPageState; |
259 | |
260 | /* |
261 | * Overall status record for index writing phase. |
262 | */ |
263 | typedef struct BTWriteState |
264 | { |
265 | Relation heap; |
266 | Relation index; |
267 | BTScanInsert inskey; /* generic insertion scankey */ |
268 | bool btws_use_wal; /* dump pages to WAL? */ |
269 | BlockNumber btws_pages_alloced; /* # pages allocated */ |
270 | BlockNumber btws_pages_written; /* # pages written out */ |
271 | Page btws_zeropage; /* workspace for filling zeroes */ |
272 | } BTWriteState; |
273 | |
274 | |
275 | static double _bt_spools_heapscan(Relation heap, Relation index, |
276 | BTBuildState *buildstate, IndexInfo *indexInfo); |
277 | static void _bt_spooldestroy(BTSpool *btspool); |
278 | static void _bt_spool(BTSpool *btspool, ItemPointer self, |
279 | Datum *values, bool *isnull); |
280 | static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2); |
281 | static void _bt_build_callback(Relation index, HeapTuple htup, Datum *values, |
282 | bool *isnull, bool tupleIsAlive, void *state); |
283 | static Page _bt_blnewpage(uint32 level); |
284 | static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level); |
285 | static void _bt_slideleft(Page page); |
286 | static void _bt_sortaddtup(Page page, Size itemsize, |
287 | IndexTuple itup, OffsetNumber itup_off); |
288 | static void _bt_buildadd(BTWriteState *wstate, BTPageState *state, |
289 | IndexTuple itup); |
290 | static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state); |
291 | static void _bt_load(BTWriteState *wstate, |
292 | BTSpool *btspool, BTSpool *btspool2); |
293 | static void _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, |
294 | int request); |
295 | static void _bt_end_parallel(BTLeader *btleader); |
296 | static Size _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot); |
297 | static double _bt_parallel_heapscan(BTBuildState *buildstate, |
298 | bool *brokenhotchain); |
299 | static void _bt_leader_participate_as_worker(BTBuildState *buildstate); |
300 | static void _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, |
301 | BTShared *btshared, Sharedsort *sharedsort, |
302 | Sharedsort *sharedsort2, int sortmem, |
303 | bool progress); |
304 | |
305 | |
306 | /* |
307 | * btbuild() -- build a new btree index. |
308 | */ |
309 | IndexBuildResult * |
310 | btbuild(Relation heap, Relation index, IndexInfo *indexInfo) |
311 | { |
312 | IndexBuildResult *result; |
313 | BTBuildState buildstate; |
314 | double reltuples; |
315 | |
316 | #ifdef BTREE_BUILD_STATS |
317 | if (log_btree_build_stats) |
318 | ResetUsage(); |
319 | #endif /* BTREE_BUILD_STATS */ |
320 | |
321 | buildstate.isunique = indexInfo->ii_Unique; |
322 | buildstate.havedead = false; |
323 | buildstate.heap = heap; |
324 | buildstate.spool = NULL; |
325 | buildstate.spool2 = NULL; |
326 | buildstate.indtuples = 0; |
327 | buildstate.btleader = NULL; |
328 | |
329 | /* |
330 | * We expect to be called exactly once for any index relation. If that's |
331 | * not the case, big trouble's what we have. |
332 | */ |
333 | if (RelationGetNumberOfBlocks(index) != 0) |
		elog(ERROR, "index \"%s\" already contains data",
335 | RelationGetRelationName(index)); |
336 | |
337 | reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo); |
338 | |
339 | /* |
340 | * Finish the build by (1) completing the sort of the spool file, (2) |
341 | * inserting the sorted tuples into btree pages and (3) building the upper |
342 | * levels. Finally, it may also be necessary to end use of parallelism. |
343 | */ |
344 | _bt_leafbuild(buildstate.spool, buildstate.spool2); |
345 | _bt_spooldestroy(buildstate.spool); |
346 | if (buildstate.spool2) |
347 | _bt_spooldestroy(buildstate.spool2); |
348 | if (buildstate.btleader) |
349 | _bt_end_parallel(buildstate.btleader); |
350 | |
351 | result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); |
352 | |
353 | result->heap_tuples = reltuples; |
354 | result->index_tuples = buildstate.indtuples; |
355 | |
356 | #ifdef BTREE_BUILD_STATS |
357 | if (log_btree_build_stats) |
358 | { |
		ShowUsage("BTREE BUILD STATS");
360 | ResetUsage(); |
361 | } |
362 | #endif /* BTREE_BUILD_STATS */ |
363 | |
364 | return result; |
365 | } |
366 | |
367 | /* |
368 | * Create and initialize one or two spool structures, and save them in caller's |
 * buildstate argument.  May also fill in fields within indexInfo used by index
370 | * builds. |
371 | * |
372 | * Scans the heap, possibly in parallel, filling spools with IndexTuples. This |
373 | * routine encapsulates all aspects of managing parallelism. Caller need only |
374 | * call _bt_end_parallel() in parallel case after it is done with spool/spool2. |
375 | * |
376 | * Returns the total number of heap tuples scanned. |
377 | */ |
378 | static double |
379 | _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, |
380 | IndexInfo *indexInfo) |
381 | { |
382 | BTSpool *btspool = (BTSpool *) palloc0(sizeof(BTSpool)); |
383 | SortCoordinate coordinate = NULL; |
384 | double reltuples = 0; |
385 | |
386 | /* |
387 | * We size the sort area as maintenance_work_mem rather than work_mem to |
388 | * speed index creation. This should be OK since a single backend can't |
389 | * run multiple index creations in parallel (see also: notes on |
390 | * parallelism and maintenance_work_mem below). |
391 | */ |
392 | btspool->heap = heap; |
393 | btspool->index = index; |
394 | btspool->isunique = indexInfo->ii_Unique; |
395 | |
396 | /* Save as primary spool */ |
397 | buildstate->spool = btspool; |
398 | |
399 | /* Report table scan phase started */ |
400 | pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, |
401 | PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN); |
402 | |
403 | /* Attempt to launch parallel worker scan when required */ |
404 | if (indexInfo->ii_ParallelWorkers > 0) |
405 | _bt_begin_parallel(buildstate, indexInfo->ii_Concurrent, |
406 | indexInfo->ii_ParallelWorkers); |
407 | |
408 | /* |
409 | * If parallel build requested and at least one worker process was |
410 | * successfully launched, set up coordination state |
411 | */ |
412 | if (buildstate->btleader) |
413 | { |
414 | coordinate = (SortCoordinate) palloc0(sizeof(SortCoordinateData)); |
415 | coordinate->isWorker = false; |
416 | coordinate->nParticipants = |
417 | buildstate->btleader->nparticipanttuplesorts; |
418 | coordinate->sharedsort = buildstate->btleader->sharedsort; |
419 | } |
420 | |
421 | /* |
422 | * Begin serial/leader tuplesort. |
423 | * |
424 | * In cases where parallelism is involved, the leader receives the same |
425 | * share of maintenance_work_mem as a serial sort (it is generally treated |
426 | * in the same way as a serial sort once we return). Parallel worker |
427 | * Tuplesortstates will have received only a fraction of |
428 | * maintenance_work_mem, though. |
429 | * |
430 | * We rely on the lifetime of the Leader Tuplesortstate almost not |
431 | * overlapping with any worker Tuplesortstate's lifetime. There may be |
432 | * some small overlap, but that's okay because we rely on leader |
433 | * Tuplesortstate only allocating a small, fixed amount of memory here. |
434 | * When its tuplesort_performsort() is called (by our caller), and |
435 | * significant amounts of memory are likely to be used, all workers must |
436 | * have already freed almost all memory held by their Tuplesortstates |
437 | * (they are about to go away completely, too). The overall effect is |
438 | * that maintenance_work_mem always represents an absolute high watermark |
439 | * on the amount of memory used by a CREATE INDEX operation, regardless of |
440 | * the use of parallelism or any other factor. |
441 | */ |
442 | buildstate->spool->sortstate = |
443 | tuplesort_begin_index_btree(heap, index, buildstate->isunique, |
444 | maintenance_work_mem, coordinate, |
445 | false); |
446 | |
447 | /* |
448 | * If building a unique index, put dead tuples in a second spool to keep |
449 | * them out of the uniqueness check. We expect that the second spool (for |
450 | * dead tuples) won't get very full, so we give it only work_mem. |
451 | */ |
452 | if (indexInfo->ii_Unique) |
453 | { |
454 | BTSpool *btspool2 = (BTSpool *) palloc0(sizeof(BTSpool)); |
455 | SortCoordinate coordinate2 = NULL; |
456 | |
457 | /* Initialize secondary spool */ |
458 | btspool2->heap = heap; |
459 | btspool2->index = index; |
460 | btspool2->isunique = false; |
461 | /* Save as secondary spool */ |
462 | buildstate->spool2 = btspool2; |
463 | |
464 | if (buildstate->btleader) |
465 | { |
466 | /* |
467 | * Set up non-private state that is passed to |
468 | * tuplesort_begin_index_btree() about the basic high level |
469 | * coordination of a parallel sort. |
470 | */ |
471 | coordinate2 = (SortCoordinate) palloc0(sizeof(SortCoordinateData)); |
472 | coordinate2->isWorker = false; |
473 | coordinate2->nParticipants = |
474 | buildstate->btleader->nparticipanttuplesorts; |
475 | coordinate2->sharedsort = buildstate->btleader->sharedsort2; |
476 | } |
477 | |
478 | /* |
479 | * We expect that the second one (for dead tuples) won't get very |
480 | * full, so we give it only work_mem |
481 | */ |
482 | buildstate->spool2->sortstate = |
483 | tuplesort_begin_index_btree(heap, index, false, work_mem, |
484 | coordinate2, false); |
485 | } |
486 | |
487 | /* Fill spool using either serial or parallel heap scan */ |
488 | if (!buildstate->btleader) |
489 | reltuples = table_index_build_scan(heap, index, indexInfo, true, true, |
490 | _bt_build_callback, (void *) buildstate, |
491 | NULL); |
492 | else |
493 | reltuples = _bt_parallel_heapscan(buildstate, |
494 | &indexInfo->ii_BrokenHotChain); |
495 | |
496 | /* |
497 | * Set the progress target for the next phase. Reset the block number |
498 | * values set by table_index_build_scan |
499 | */ |
500 | { |
501 | const int index[] = { |
502 | PROGRESS_CREATEIDX_TUPLES_TOTAL, |
503 | PROGRESS_SCAN_BLOCKS_TOTAL, |
504 | PROGRESS_SCAN_BLOCKS_DONE |
505 | }; |
506 | const int64 val[] = { |
507 | buildstate->indtuples, |
508 | 0, 0 |
509 | }; |
510 | |
511 | pgstat_progress_update_multi_param(3, index, val); |
512 | } |
513 | |
514 | /* okay, all heap tuples are spooled */ |
515 | if (buildstate->spool2 && !buildstate->havedead) |
516 | { |
517 | /* spool2 turns out to be unnecessary */ |
518 | _bt_spooldestroy(buildstate->spool2); |
519 | buildstate->spool2 = NULL; |
520 | } |
521 | |
522 | return reltuples; |
523 | } |
524 | |
525 | /* |
526 | * clean up a spool structure and its substructures. |
527 | */ |
528 | static void |
529 | _bt_spooldestroy(BTSpool *btspool) |
530 | { |
531 | tuplesort_end(btspool->sortstate); |
532 | pfree(btspool); |
533 | } |
534 | |
535 | /* |
536 | * spool an index entry into the sort file. |
537 | */ |
538 | static void |
539 | _bt_spool(BTSpool *btspool, ItemPointer self, Datum *values, bool *isnull) |
540 | { |
541 | tuplesort_putindextuplevalues(btspool->sortstate, btspool->index, |
542 | self, values, isnull); |
543 | } |
544 | |
545 | /* |
546 | * given a spool loaded by successive calls to _bt_spool, |
547 | * create an entire btree. |
548 | */ |
549 | static void |
550 | _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2) |
551 | { |
552 | BTWriteState wstate; |
553 | |
554 | #ifdef BTREE_BUILD_STATS |
555 | if (log_btree_build_stats) |
556 | { |
		ShowUsage("BTREE BUILD (Spool) STATISTICS");
558 | ResetUsage(); |
559 | } |
560 | #endif /* BTREE_BUILD_STATS */ |
561 | |
562 | pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, |
563 | PROGRESS_BTREE_PHASE_PERFORMSORT_1); |
564 | tuplesort_performsort(btspool->sortstate); |
565 | if (btspool2) |
566 | { |
567 | pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, |
568 | PROGRESS_BTREE_PHASE_PERFORMSORT_2); |
569 | tuplesort_performsort(btspool2->sortstate); |
570 | } |
571 | |
572 | wstate.heap = btspool->heap; |
573 | wstate.index = btspool->index; |
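	/*
	 * Build a generic insertion scankey for the index.  _bt_load() uses it
	 * to set up SortSupport for merging the two spools, and _bt_buildadd()
	 * passes it to _bt_truncate() when forming truncated high keys.
	 */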
574 | wstate.inskey = _bt_mkscankey(wstate.index, NULL); |
575 | |
576 | /* |
577 | * We need to log index creation in WAL iff WAL archiving/streaming is |
578 | * enabled UNLESS the index isn't WAL-logged anyway. |
579 | */ |
580 | wstate.btws_use_wal = XLogIsNeeded() && RelationNeedsWAL(wstate.index); |
581 | |
582 | /* reserve the metapage */ |
583 | wstate.btws_pages_alloced = BTREE_METAPAGE + 1; |
584 | wstate.btws_pages_written = 0; |
585 | wstate.btws_zeropage = NULL; /* until needed */ |
586 | |
587 | pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, |
588 | PROGRESS_BTREE_PHASE_LEAF_LOAD); |
589 | _bt_load(&wstate, btspool, btspool2); |
590 | } |
591 | |
592 | /* |
593 | * Per-tuple callback for table_index_build_scan |
594 | */ |
595 | static void |
596 | _bt_build_callback(Relation index, |
597 | HeapTuple htup, |
598 | Datum *values, |
599 | bool *isnull, |
600 | bool tupleIsAlive, |
601 | void *state) |
602 | { |
603 | BTBuildState *buildstate = (BTBuildState *) state; |
604 | |
605 | /* |
606 | * insert the index tuple into the appropriate spool file for subsequent |
607 | * processing |
608 | */ |
609 | if (tupleIsAlive || buildstate->spool2 == NULL) |
610 | _bt_spool(buildstate->spool, &htup->t_self, values, isnull); |
611 | else |
612 | { |
613 | /* dead tuples are put into spool2 */ |
614 | buildstate->havedead = true; |
615 | _bt_spool(buildstate->spool2, &htup->t_self, values, isnull); |
616 | } |
617 | |
618 | buildstate->indtuples += 1; |
619 | } |
620 | |
621 | /* |
622 | * allocate workspace for a new, clean btree page, not linked to any siblings. |
623 | */ |
624 | static Page |
625 | _bt_blnewpage(uint32 level) |
626 | { |
627 | Page page; |
628 | BTPageOpaque opaque; |
629 | |
630 | page = (Page) palloc(BLCKSZ); |
631 | |
632 | /* Zero the page and set up standard page header info */ |
633 | _bt_pageinit(page, BLCKSZ); |
634 | |
635 | /* Initialize BT opaque state */ |
636 | opaque = (BTPageOpaque) PageGetSpecialPointer(page); |
637 | opaque->btpo_prev = opaque->btpo_next = P_NONE; |
638 | opaque->btpo.level = level; |
639 | opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF; |
640 | opaque->btpo_cycleid = 0; |
641 | |
642 | /* Make the P_HIKEY line pointer appear allocated */ |
643 | ((PageHeader) page)->pd_lower += sizeof(ItemIdData); |
644 | |
645 | return page; |
646 | } |
647 | |
648 | /* |
649 | * emit a completed btree page, and release the working storage. |
650 | */ |
651 | static void |
652 | _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno) |
653 | { |
654 | /* Ensure rd_smgr is open (could have been closed by relcache flush!) */ |
655 | RelationOpenSmgr(wstate->index); |
656 | |
657 | /* XLOG stuff */ |
658 | if (wstate->btws_use_wal) |
659 | { |
660 | /* We use the heap NEWPAGE record type for this */ |
661 | log_newpage(&wstate->index->rd_node, MAIN_FORKNUM, blkno, page, true); |
662 | } |
663 | |
664 | /* |
665 | * If we have to write pages nonsequentially, fill in the space with |
666 | * zeroes until we come back and overwrite. This is not logically |
667 | * necessary on standard Unix filesystems (unwritten space will read as |
668 | * zeroes anyway), but it should help to avoid fragmentation. The dummy |
669 | * pages aren't WAL-logged though. |
670 | */ |
671 | while (blkno > wstate->btws_pages_written) |
672 | { |
673 | if (!wstate->btws_zeropage) |
674 | wstate->btws_zeropage = (Page) palloc0(BLCKSZ); |
675 | /* don't set checksum for all-zero page */ |
676 | smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM, |
677 | wstate->btws_pages_written++, |
678 | (char *) wstate->btws_zeropage, |
679 | true); |
680 | } |
681 | |
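	/* Checksum the finished page (a no-op unless data checksums are enabled) */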
682 | PageSetChecksumInplace(page, blkno); |
683 | |
684 | /* |
685 | * Now write the page. There's no need for smgr to schedule an fsync for |
686 | * this write; we'll do it ourselves before ending the build. |
687 | */ |
688 | if (blkno == wstate->btws_pages_written) |
689 | { |
690 | /* extending the file... */ |
691 | smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM, blkno, |
692 | (char *) page, true); |
693 | wstate->btws_pages_written++; |
694 | } |
695 | else |
696 | { |
697 | /* overwriting a block we zero-filled before */ |
698 | smgrwrite(wstate->index->rd_smgr, MAIN_FORKNUM, blkno, |
699 | (char *) page, true); |
700 | } |
701 | |
702 | pfree(page); |
703 | } |
704 | |
705 | /* |
706 | * allocate and initialize a new BTPageState. the returned structure |
707 | * is suitable for immediate use by _bt_buildadd. |
708 | */ |
709 | static BTPageState * |
710 | _bt_pagestate(BTWriteState *wstate, uint32 level) |
711 | { |
712 | BTPageState *state = (BTPageState *) palloc0(sizeof(BTPageState)); |
713 | |
714 | /* create initial page for level */ |
715 | state->btps_page = _bt_blnewpage(level); |
716 | |
717 | /* and assign it a page position */ |
718 | state->btps_blkno = wstate->btws_pages_alloced++; |
719 | |
720 | state->btps_minkey = NULL; |
721 | /* initialize lastoff so first item goes into P_FIRSTKEY */ |
722 | state->btps_lastoff = P_HIKEY; |
723 | state->btps_level = level; |
724 | /* set "full" threshold based on level. See notes at head of file. */ |
725 | if (level > 0) |
726 | state->btps_full = (BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100); |
727 | else |
728 | state->btps_full = RelationGetTargetPageFreeSpace(wstate->index, |
729 | BTREE_DEFAULT_FILLFACTOR); |
730 | /* no parent level, yet */ |
731 | state->btps_next = NULL; |
732 | |
733 | return state; |
734 | } |
735 | |
736 | /* |
737 | * slide an array of ItemIds back one slot (from P_FIRSTKEY to |
738 | * P_HIKEY, overwriting P_HIKEY). we need to do this when we discover |
739 | * that we have built an ItemId array in what has turned out to be a |
740 | * P_RIGHTMOST page. |
741 | */ |
742 | static void |
743 | _bt_slideleft(Page page) |
744 | { |
745 | OffsetNumber off; |
746 | OffsetNumber maxoff; |
747 | ItemId previi; |
748 | ItemId thisii; |
749 | |
750 | if (!PageIsEmpty(page)) |
751 | { |
752 | maxoff = PageGetMaxOffsetNumber(page); |
753 | previi = PageGetItemId(page, P_HIKEY); |
754 | for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off)) |
755 | { |
756 | thisii = PageGetItemId(page, off); |
757 | *previi = *thisii; |
758 | previi = thisii; |
759 | } |
760 | ((PageHeader) page)->pd_lower -= sizeof(ItemIdData); |
761 | } |
762 | } |
763 | |
764 | /* |
765 | * Add an item to a page being built. |
766 | * |
767 | * The main difference between this routine and a bare PageAddItem call |
768 | * is that this code knows that the leftmost data item on a non-leaf |
769 | * btree page doesn't need to have a key. Therefore, it strips such |
770 | * items down to just the item header. |
771 | * |
772 | * This is almost like nbtinsert.c's _bt_pgaddtup(), but we can't use |
773 | * that because it assumes that P_RIGHTMOST() will return the correct |
774 | * answer for the page. Here, we don't know yet if the page will be |
775 | * rightmost. Offset P_FIRSTKEY is always the first data key. |
776 | */ |
777 | static void |
778 | _bt_sortaddtup(Page page, |
779 | Size itemsize, |
780 | IndexTuple itup, |
781 | OffsetNumber itup_off) |
782 | { |
783 | BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); |
784 | IndexTupleData trunctuple; |
785 | |
786 | if (!P_ISLEAF(opaque) && itup_off == P_FIRSTKEY) |
787 | { |
788 | trunctuple = *itup; |
789 | trunctuple.t_info = sizeof(IndexTupleData); |
790 | /* Deliberately zero INDEX_ALT_TID_MASK bits */ |
791 | BTreeTupleSetNAtts(&trunctuple, 0); |
792 | itup = &trunctuple; |
793 | itemsize = sizeof(IndexTupleData); |
794 | } |
795 | |
796 | if (PageAddItem(page, (Item) itup, itemsize, itup_off, |
797 | false, false) == InvalidOffsetNumber) |
		elog(ERROR, "failed to add item to the index page");
799 | } |
800 | |
801 | /*---------- |
802 | * Add an item to a disk page from the sort output. |
803 | * |
804 | * We must be careful to observe the page layout conventions of nbtsearch.c: |
805 | * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY. |
806 | * - on non-leaf pages, the key portion of the first item need not be |
807 | * stored, we should store only the link. |
808 | * |
809 | * A leaf page being built looks like: |
810 | * |
811 | * +----------------+---------------------------------+ |
812 | * | PageHeaderData | linp0 linp1 linp2 ... | |
813 | * +-----------+----+---------------------------------+ |
814 | * | ... linpN | | |
815 | * +-----------+--------------------------------------+ |
816 | * | ^ last | |
817 | * | | |
818 | * +-------------+------------------------------------+ |
819 | * | | itemN ... | |
820 | * +-------------+------------------+-----------------+ |
821 | * | ... item3 item2 item1 | "special space" | |
822 | * +--------------------------------+-----------------+ |
823 | * |
824 | * Contrast this with the diagram in bufpage.h; note the mismatch |
825 | * between linps and items. This is because we reserve linp0 as a |
826 | * placeholder for the pointer to the "high key" item; when we have |
827 | * filled up the page, we will set linp0 to point to itemN and clear |
828 | * linpN. On the other hand, if we find this is the last (rightmost) |
829 | * page, we leave the items alone and slide the linp array over. If |
830 | * the high key is to be truncated, offset 1 is deleted, and we insert |
831 | * the truncated high key at offset 1. |
832 | * |
833 | * 'last' pointer indicates the last offset added to the page. |
834 | *---------- |
835 | */ |
836 | static void |
837 | _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) |
838 | { |
839 | Page npage; |
840 | BlockNumber nblkno; |
841 | OffsetNumber last_off; |
842 | Size pgspc; |
843 | Size itupsz; |
844 | bool isleaf; |
845 | |
846 | /* |
847 | * This is a handy place to check for cancel interrupts during the btree |
848 | * load phase of index creation. |
849 | */ |
850 | CHECK_FOR_INTERRUPTS(); |
851 | |
852 | npage = state->btps_page; |
853 | nblkno = state->btps_blkno; |
854 | last_off = state->btps_lastoff; |
855 | |
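	/* Compute current page's free space and the space the new tuple needs */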
856 | pgspc = PageGetFreeSpace(npage); |
857 | itupsz = IndexTupleSize(itup); |
858 | itupsz = MAXALIGN(itupsz); |
859 | /* Leaf case has slightly different rules due to suffix truncation */ |
860 | isleaf = (state->btps_level == 0); |
861 | |
862 | /* |
863 | * Check whether the new item can fit on a btree page on current level at |
864 | * all. |
865 | * |
866 | * Every newly built index will treat heap TID as part of the keyspace, |
867 | * which imposes the requirement that new high keys must occasionally have |
868 | * a heap TID appended within _bt_truncate(). That may leave a new pivot |
869 | * tuple one or two MAXALIGN() quantums larger than the original first |
870 | * right tuple it's derived from. v4 deals with the problem by decreasing |
871 | * the limit on the size of tuples inserted on the leaf level by the same |
872 | * small amount. Enforce the new v4+ limit on the leaf level, and the old |
873 | * limit on internal levels, since pivot tuples may need to make use of |
874 | * the reserved space. This should never fail on internal pages. |
875 | */ |
876 | if (unlikely(itupsz > BTMaxItemSize(npage))) |
877 | _bt_check_third_page(wstate->index, wstate->heap, isleaf, npage, |
878 | itup); |
879 | |
880 | /* |
881 | * Check to see if current page will fit new item, with space left over to |
882 | * append a heap TID during suffix truncation when page is a leaf page. |
883 | * |
884 | * It is guaranteed that we can fit at least 2 non-pivot tuples plus a |
885 | * high key with heap TID when finishing off a leaf page, since we rely on |
886 | * _bt_check_third_page() rejecting oversized non-pivot tuples. On |
887 | * internal pages we can always fit 3 pivot tuples with larger internal |
888 | * page tuple limit (includes page high key). |
889 | * |
890 | * Most of the time, a page is only "full" in the sense that the soft |
891 | * fillfactor-wise limit has been exceeded. However, we must always leave |
892 | * at least two items plus a high key on each page before starting a new |
893 | * page. Disregard fillfactor and insert on "full" current page if we |
894 | * don't have the minimum number of items yet. (Note that we deliberately |
895 | * assume that suffix truncation neither enlarges nor shrinks new high key |
896 | * when applying soft limit.) |
897 | */ |
898 | if (pgspc < itupsz + (isleaf ? MAXALIGN(sizeof(ItemPointerData)) : 0) || |
899 | (pgspc < state->btps_full && last_off > P_FIRSTKEY)) |
900 | { |
901 | /* |
902 | * Finish off the page and write it out. |
903 | */ |
904 | Page opage = npage; |
905 | BlockNumber oblkno = nblkno; |
906 | ItemId ii; |
907 | ItemId hii; |
908 | IndexTuple oitup; |
909 | |
910 | /* Create new page of same level */ |
911 | npage = _bt_blnewpage(state->btps_level); |
912 | |
913 | /* and assign it a page position */ |
914 | nblkno = wstate->btws_pages_alloced++; |
915 | |
916 | /* |
917 | * We copy the last item on the page into the new page, and then |
918 | * rearrange the old page so that the 'last item' becomes its high key |
919 | * rather than a true data item. There had better be at least two |
920 | * items on the page already, else the page would be empty of useful |
921 | * data. |
922 | */ |
923 | Assert(last_off > P_FIRSTKEY); |
924 | ii = PageGetItemId(opage, last_off); |
925 | oitup = (IndexTuple) PageGetItem(opage, ii); |
926 | _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY); |
927 | |
928 | /* |
929 | * Move 'last' into the high key position on opage. _bt_blnewpage() |
930 | * allocated empty space for a line pointer when opage was first |
931 | * created, so this is a matter of rearranging already-allocated space |
932 | * on page, and initializing high key line pointer. (Actually, leaf |
933 | * pages must also swap oitup with a truncated version of oitup, which |
934 | * is sometimes larger than oitup, though never by more than the space |
935 | * needed to append a heap TID.) |
936 | */ |
937 | hii = PageGetItemId(opage, P_HIKEY); |
938 | *hii = *ii; |
939 | ItemIdSetUnused(ii); /* redundant */ |
940 | ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData); |
941 | |
942 | if (isleaf) |
943 | { |
944 | IndexTuple lastleft; |
945 | IndexTuple truncated; |
946 | Size truncsz; |
947 | |
948 | /* |
949 | * Truncate away any unneeded attributes from high key on leaf |
950 | * level. This is only done at the leaf level because downlinks |
951 | * in internal pages are either negative infinity items, or get |
952 | * their contents from copying from one level down. See also: |
953 | * _bt_split(). |
954 | * |
955 | * We don't try to bias our choice of split point to make it more |
956 | * likely that _bt_truncate() can truncate away more attributes, |
957 | * whereas the split point passed to _bt_split() is chosen much |
958 | * more delicately. Suffix truncation is mostly useful because it |
959 | * improves space utilization for workloads with random |
960 | * insertions. It doesn't seem worthwhile to add logic for |
961 | * choosing a split point here for a benefit that is bound to be |
962 | * much smaller. |
963 | * |
964 | * Since the truncated tuple is often smaller than the original |
965 | * tuple, it cannot just be copied in place (besides, we want to |
966 | * actually save space on the leaf page). We delete the original |
967 | * high key, and add our own truncated high key at the same |
968 | * offset. |
969 | * |
970 | * Note that the page layout won't be changed very much. oitup is |
971 | * already located at the physical beginning of tuple space, so we |
972 | * only shift the line pointer array back and forth, and overwrite |
973 | * the tuple space previously occupied by oitup. This is fairly |
974 | * cheap. |
975 | */ |
976 | ii = PageGetItemId(opage, OffsetNumberPrev(last_off)); |
977 | lastleft = (IndexTuple) PageGetItem(opage, ii); |
978 | |
979 | truncated = _bt_truncate(wstate->index, lastleft, oitup, |
980 | wstate->inskey); |
981 | truncsz = IndexTupleSize(truncated); |
982 | PageIndexTupleDelete(opage, P_HIKEY); |
983 | _bt_sortaddtup(opage, truncsz, truncated, P_HIKEY); |
984 | pfree(truncated); |
985 | |
986 | /* oitup should continue to point to the page's high key */ |
987 | hii = PageGetItemId(opage, P_HIKEY); |
988 | oitup = (IndexTuple) PageGetItem(opage, hii); |
989 | } |
990 | |
991 | /* |
992 | * Link the old page into its parent, using its minimum key. If we |
993 | * don't have a parent, we have to create one; this adds a new btree |
994 | * level. |
995 | */ |
996 | if (state->btps_next == NULL) |
997 | state->btps_next = _bt_pagestate(wstate, state->btps_level + 1); |
998 | |
999 | Assert((BTreeTupleGetNAtts(state->btps_minkey, wstate->index) <= |
1000 | IndexRelationGetNumberOfKeyAttributes(wstate->index) && |
1001 | BTreeTupleGetNAtts(state->btps_minkey, wstate->index) > 0) || |
1002 | P_LEFTMOST((BTPageOpaque) PageGetSpecialPointer(opage))); |
1003 | Assert(BTreeTupleGetNAtts(state->btps_minkey, wstate->index) == 0 || |
1004 | !P_LEFTMOST((BTPageOpaque) PageGetSpecialPointer(opage))); |
1005 | BTreeInnerTupleSetDownLink(state->btps_minkey, oblkno); |
1006 | _bt_buildadd(wstate, state->btps_next, state->btps_minkey); |
1007 | pfree(state->btps_minkey); |
1008 | |
1009 | /* |
1010 | * Save a copy of the high key from the old page. It is also used as |
1011 | * the minimum key for the new page. |
1012 | */ |
1013 | state->btps_minkey = CopyIndexTuple(oitup); |
1014 | |
1015 | /* |
1016 | * Set the sibling links for both pages. |
1017 | */ |
1018 | { |
1019 | BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage); |
1020 | BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(npage); |
1021 | |
1022 | oopaque->btpo_next = nblkno; |
1023 | nopaque->btpo_prev = oblkno; |
1024 | nopaque->btpo_next = P_NONE; /* redundant */ |
1025 | } |
1026 | |
1027 | /* |
1028 | * Write out the old page. We never need to touch it again, so we can |
1029 | * free the opage workspace too. |
1030 | */ |
1031 | _bt_blwritepage(wstate, opage, oblkno); |
1032 | |
1033 | /* |
1034 | * Reset last_off to point to new page |
1035 | */ |
1036 | last_off = P_FIRSTKEY; |
1037 | } |
1038 | |
1039 | /* |
1040 | * By here, either original page is still the current page, or a new page |
1041 | * was created that became the current page. Either way, the current page |
1042 | * definitely has space for new item. |
1043 | * |
1044 | * If the new item is the first for its page, stash a copy for later. Note |
1045 | * this will only happen for the first item on a level; on later pages, |
1046 | * the first item for a page is copied from the prior page in the code |
1047 | * above. The minimum key for an entire level is nothing more than a |
1048 | * minus infinity (downlink only) pivot tuple placeholder. |
1049 | */ |
1050 | if (last_off == P_HIKEY) |
1051 | { |
1052 | Assert(state->btps_minkey == NULL); |
1053 | state->btps_minkey = CopyIndexTuple(itup); |
1054 | /* _bt_sortaddtup() will perform full truncation later */ |
1055 | BTreeTupleSetNAtts(state->btps_minkey, 0); |
1056 | } |
1057 | |
1058 | /* |
1059 | * Add the new item into the current page. |
1060 | */ |
1061 | last_off = OffsetNumberNext(last_off); |
1062 | _bt_sortaddtup(npage, itupsz, itup, last_off); |
1063 | |
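	/* Remember which page and offset to continue from on the next call */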
1064 | state->btps_page = npage; |
1065 | state->btps_blkno = nblkno; |
1066 | state->btps_lastoff = last_off; |
1067 | } |
1068 | |
1069 | /* |
1070 | * Finish writing out the completed btree. |
1071 | */ |
1072 | static void |
1073 | _bt_uppershutdown(BTWriteState *wstate, BTPageState *state) |
1074 | { |
1075 | BTPageState *s; |
1076 | BlockNumber rootblkno = P_NONE; |
1077 | uint32 rootlevel = 0; |
1078 | Page metapage; |
1079 | |
1080 | /* |
1081 | * Each iteration of this loop completes one more level of the tree. |
1082 | */ |
1083 | for (s = state; s != NULL; s = s->btps_next) |
1084 | { |
1085 | BlockNumber blkno; |
1086 | BTPageOpaque opaque; |
1087 | |
1088 | blkno = s->btps_blkno; |
1089 | opaque = (BTPageOpaque) PageGetSpecialPointer(s->btps_page); |
1090 | |
1091 | /* |
1092 | * We have to link the last page on this level to somewhere. |
1093 | * |
1094 | * If we're at the top, it's the root, so attach it to the metapage. |
1095 | * Otherwise, add an entry for it to its parent using its minimum key. |
1096 | * This may cause the last page of the parent level to split, but |
1097 | * that's not a problem -- we haven't gotten to it yet. |
1098 | */ |
1099 | if (s->btps_next == NULL) |
1100 | { |
1101 | opaque->btpo_flags |= BTP_ROOT; |
1102 | rootblkno = blkno; |
1103 | rootlevel = s->btps_level; |
1104 | } |
1105 | else |
1106 | { |
1107 | Assert((BTreeTupleGetNAtts(s->btps_minkey, wstate->index) <= |
1108 | IndexRelationGetNumberOfKeyAttributes(wstate->index) && |
1109 | BTreeTupleGetNAtts(s->btps_minkey, wstate->index) > 0) || |
1110 | P_LEFTMOST(opaque)); |
1111 | Assert(BTreeTupleGetNAtts(s->btps_minkey, wstate->index) == 0 || |
1112 | !P_LEFTMOST(opaque)); |
1113 | BTreeInnerTupleSetDownLink(s->btps_minkey, blkno); |
1114 | _bt_buildadd(wstate, s->btps_next, s->btps_minkey); |
1115 | pfree(s->btps_minkey); |
1116 | s->btps_minkey = NULL; |
1117 | } |
1118 | |
1119 | /* |
1120 | * This is the rightmost page, so the ItemId array needs to be slid |
1121 | * back one slot. Then we can dump out the page. |
1122 | */ |
1123 | _bt_slideleft(s->btps_page); |
1124 | _bt_blwritepage(wstate, s->btps_page, s->btps_blkno); |
1125 | s->btps_page = NULL; /* writepage freed the workspace */ |
1126 | } |
1127 | |
1128 | /* |
1129 | * As the last step in the process, construct the metapage and make it |
1130 | * point to the new root (unless we had no data at all, in which case it's |
1131 | * set to point to "P_NONE"). This changes the index to the "valid" state |
1132 | * by filling in a valid magic number in the metapage. |
1133 | */ |
1134 | metapage = (Page) palloc(BLCKSZ); |
1135 | _bt_initmetapage(metapage, rootblkno, rootlevel); |
1136 | _bt_blwritepage(wstate, metapage, BTREE_METAPAGE); |
1137 | } |
1138 | |
1139 | /* |
1140 | * Read tuples in correct sort order from tuplesort, and load them into |
1141 | * btree leaves. |
1142 | */ |
1143 | static void |
1144 | _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) |
1145 | { |
1146 | BTPageState *state = NULL; |
1147 | bool merge = (btspool2 != NULL); |
1148 | IndexTuple itup, |
1149 | itup2 = NULL; |
1150 | bool load1; |
1151 | TupleDesc tupdes = RelationGetDescr(wstate->index); |
1152 | int i, |
1153 | keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index); |
1154 | SortSupport sortKeys; |
1155 | int64 tuples_done = 0; |
1156 | |
1157 | if (merge) |
1158 | { |
1159 | /* |
1160 | * Another BTSpool for dead tuples exists. Now we have to merge |
1161 | * btspool and btspool2. |
1162 | */ |
1163 | |
1164 | /* the preparation of merge */ |
1165 | itup = tuplesort_getindextuple(btspool->sortstate, true); |
1166 | itup2 = tuplesort_getindextuple(btspool2->sortstate, true); |
1167 | |
1168 | /* Prepare SortSupport data for each column */ |
1169 | sortKeys = (SortSupport) palloc0(keysz * sizeof(SortSupportData)); |
1170 | |
1171 | for (i = 0; i < keysz; i++) |
1172 | { |
1173 | SortSupport sortKey = sortKeys + i; |
1174 | ScanKey scanKey = wstate->inskey->scankeys + i; |
1175 | int16 strategy; |
1176 | |
1177 | sortKey->ssup_cxt = CurrentMemoryContext; |
1178 | sortKey->ssup_collation = scanKey->sk_collation; |
1179 | sortKey->ssup_nulls_first = |
1180 | (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0; |
1181 | sortKey->ssup_attno = scanKey->sk_attno; |
1182 | /* Abbreviation is not supported here */ |
1183 | sortKey->abbreviate = false; |
1184 | |
1185 | AssertState(sortKey->ssup_attno != 0); |
1186 | |
1187 | strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ? |
1188 | BTGreaterStrategyNumber : BTLessStrategyNumber; |
1189 | |
1190 | PrepareSortSupportFromIndexRel(wstate->index, strategy, sortKey); |
1191 | } |
1192 | |
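		/*
		 * Merge the two sorted streams: on each iteration, take whichever
		 * spool's current tuple sorts lower (comparing key columns, then
		 * heap TID) and add it to the tree.
		 */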
1193 | for (;;) |
1194 | { |
1195 | load1 = true; /* load BTSpool next ? */ |
1196 | if (itup2 == NULL) |
1197 | { |
1198 | if (itup == NULL) |
1199 | break; |
1200 | } |
1201 | else if (itup != NULL) |
1202 | { |
1203 | int32 compare = 0; |
1204 | |
1205 | for (i = 1; i <= keysz; i++) |
1206 | { |
1207 | SortSupport entry; |
1208 | Datum attrDatum1, |
1209 | attrDatum2; |
1210 | bool isNull1, |
1211 | isNull2; |
1212 | |
1213 | entry = sortKeys + i - 1; |
1214 | attrDatum1 = index_getattr(itup, i, tupdes, &isNull1); |
1215 | attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2); |
1216 | |
1217 | compare = ApplySortComparator(attrDatum1, isNull1, |
1218 | attrDatum2, isNull2, |
1219 | entry); |
1220 | if (compare > 0) |
1221 | { |
1222 | load1 = false; |
1223 | break; |
1224 | } |
1225 | else if (compare < 0) |
1226 | break; |
1227 | } |
1228 | |
1229 | /* |
1230 | * If key values are equal, we sort on ItemPointer. This is |
1231 | * required for btree indexes, since heap TID is treated as an |
1232 | * implicit last key attribute in order to ensure that all |
1233 | * keys in the index are physically unique. |
1234 | */ |
1235 | if (compare == 0) |
1236 | { |
1237 | compare = ItemPointerCompare(&itup->t_tid, &itup2->t_tid); |
1238 | Assert(compare != 0); |
1239 | if (compare > 0) |
1240 | load1 = false; |
1241 | } |
1242 | } |
1243 | else |
1244 | load1 = false; |
1245 | |
1246 | /* When we see first tuple, create first index page */ |
1247 | if (state == NULL) |
1248 | state = _bt_pagestate(wstate, 0); |
1249 | |
1250 | if (load1) |
1251 | { |
1252 | _bt_buildadd(wstate, state, itup); |
1253 | itup = tuplesort_getindextuple(btspool->sortstate, true); |
1254 | } |
1255 | else |
1256 | { |
1257 | _bt_buildadd(wstate, state, itup2); |
1258 | itup2 = tuplesort_getindextuple(btspool2->sortstate, true); |
1259 | } |
1260 | |
1261 | /* Report progress */ |
1262 | pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE, |
1263 | ++tuples_done); |
1264 | } |
1265 | pfree(sortKeys); |
1266 | } |
1267 | else |
1268 | { |
1269 | /* merge is unnecessary */ |
1270 | while ((itup = tuplesort_getindextuple(btspool->sortstate, |
1271 | true)) != NULL) |
1272 | { |
1273 | /* When we see first tuple, create first index page */ |
1274 | if (state == NULL) |
1275 | state = _bt_pagestate(wstate, 0); |
1276 | |
1277 | _bt_buildadd(wstate, state, itup); |
1278 | |
1279 | /* Report progress */ |
1280 | pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE, |
1281 | ++tuples_done); |
1282 | } |
1283 | } |
1284 | |
1285 | /* Close down final pages and write the metapage */ |
1286 | _bt_uppershutdown(wstate, state); |
1287 | |
1288 | /* |
1289 | * If the index is WAL-logged, we must fsync it down to disk before it's |
1290 | * safe to commit the transaction. (For a non-WAL-logged index we don't |
1291 | * care since the index will be uninteresting after a crash anyway.) |
1292 | * |
1293 | * It's obvious that we must do this when not WAL-logging the build. It's |
1294 | * less obvious that we have to do it even if we did WAL-log the index |
1295 | * pages. The reason is that since we're building outside shared buffers, |
1296 | * a CHECKPOINT occurring during the build has no way to flush the |
1297 | * previously written data to disk (indeed it won't know the index even |
1298 | * exists). A crash later on would replay WAL from the checkpoint, |
1299 | * therefore it wouldn't replay our earlier WAL entries. If we do not |
1300 | * fsync those pages here, they might still not be on disk when the crash |
1301 | * occurs. |
1302 | */ |
1303 | if (RelationNeedsWAL(wstate->index)) |
1304 | { |
1305 | RelationOpenSmgr(wstate->index); |
1306 | smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM); |
1307 | } |
1308 | } |
1309 | |
1310 | /* |
1311 | * Create parallel context, and launch workers for leader. |
1312 | * |
1313 | * buildstate argument should be initialized (with the exception of the |
1314 | * tuplesort state in spools, which may later be created based on shared |
1315 | * state initially set up here). |
1316 | * |
1317 | * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY. |
1318 | * |
1319 | * request is the target number of parallel worker processes to launch. |
1320 | * |
1321 | * Sets buildstate's BTLeader, which caller must use to shut down parallel |
1322 | * mode by passing it to _bt_end_parallel() at the very end of its index |
1323 | * build. If not even a single worker process can be launched, this is |
1324 | * never set, and caller should proceed with a serial index build. |
1325 | */ |
1326 | static void |
1327 | _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) |
1328 | { |
1329 | ParallelContext *pcxt; |
1330 | int scantuplesortstates; |
1331 | Snapshot snapshot; |
1332 | Size estbtshared; |
1333 | Size estsort; |
1334 | BTShared *btshared; |
1335 | Sharedsort *sharedsort; |
1336 | Sharedsort *sharedsort2; |
1337 | BTSpool *btspool = buildstate->spool; |
1338 | BTLeader *btleader = (BTLeader *) palloc0(sizeof(BTLeader)); |
1339 | bool leaderparticipates = true; |
1340 | char *sharedquery; |
1341 | int querylen; |
1342 | |
1343 | #ifdef DISABLE_LEADER_PARTICIPATION |
1344 | leaderparticipates = false; |
1345 | #endif |
1346 | |
1347 | /* |
1348 | * Enter parallel mode, and create context for parallel build of btree |
1349 | * index |
1350 | */ |
1351 | EnterParallelMode(); |
1352 | Assert(request > 0); |
	pcxt = CreateParallelContext("postgres", "_bt_parallel_build_main",
1354 | request); |
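	/* The leader contributes one tuplesort state if it participates */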
1355 | scantuplesortstates = leaderparticipates ? request + 1 : request; |
1356 | |
1357 | /* |
1358 | * Prepare for scan of the base relation. In a normal index build, we use |
1359 | * SnapshotAny because we must retrieve all tuples and do our own time |
1360 | * qual checks (because we have to index RECENTLY_DEAD tuples). In a |
1361 | * concurrent build, we take a regular MVCC snapshot and index whatever's |
1362 | * live according to that. |
1363 | */ |
1364 | if (!isconcurrent) |
1365 | snapshot = SnapshotAny; |
1366 | else |
1367 | snapshot = RegisterSnapshot(GetTransactionSnapshot()); |
1368 | |
1369 | /* |
1370 | * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and |
1371 | * PARALLEL_KEY_TUPLESORT tuplesort workspace |
1372 | */ |
1373 | estbtshared = _bt_parallel_estimate_shared(btspool->heap, snapshot); |
1374 | shm_toc_estimate_chunk(&pcxt->estimator, estbtshared); |
1375 | estsort = tuplesort_estimate_shared(scantuplesortstates); |
1376 | shm_toc_estimate_chunk(&pcxt->estimator, estsort); |
1377 | |
1378 | /* |
1379 | * Unique case requires a second spool, and so we may have to account for |
1380 | * another shared workspace for that -- PARALLEL_KEY_TUPLESORT_SPOOL2 |
1381 | */ |
1382 | if (!btspool->isunique) |
1383 | shm_toc_estimate_keys(&pcxt->estimator, 2); |
1384 | else |
1385 | { |
1386 | shm_toc_estimate_chunk(&pcxt->estimator, estsort); |
1387 | shm_toc_estimate_keys(&pcxt->estimator, 3); |
1388 | } |
1389 | |
1390 | /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */ |
1391 | querylen = strlen(debug_query_string); |
1392 | shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1); |
1393 | shm_toc_estimate_keys(&pcxt->estimator, 1); |
1394 | |
1395 | /* Everyone's had a chance to ask for space, so now create the DSM */ |
1396 | InitializeParallelDSM(pcxt); |
1397 | |
1398 | /* Store shared build state, for which we reserved space */ |
1399 | btshared = (BTShared *) shm_toc_allocate(pcxt->toc, estbtshared); |
1400 | /* Initialize immutable state */ |
1401 | btshared->heaprelid = RelationGetRelid(btspool->heap); |
1402 | btshared->indexrelid = RelationGetRelid(btspool->index); |
1403 | btshared->isunique = btspool->isunique; |
1404 | btshared->isconcurrent = isconcurrent; |
1405 | btshared->scantuplesortstates = scantuplesortstates; |
1406 | ConditionVariableInit(&btshared->workersdonecv); |
1407 | SpinLockInit(&btshared->mutex); |
1408 | /* Initialize mutable state */ |
1409 | btshared->nparticipantsdone = 0; |
1410 | btshared->reltuples = 0.0; |
1411 | btshared->havedead = false; |
1412 | btshared->indtuples = 0.0; |
1413 | btshared->brokenhotchain = false; |
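	/* Set up the shared parallel heap scan that workers will join */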
1414 | table_parallelscan_initialize(btspool->heap, |
1415 | ParallelTableScanFromBTShared(btshared), |
1416 | snapshot); |
1417 | |
1418 | /* |
1419 | * Store shared tuplesort-private state, for which we reserved space. |
1420 | * Then, initialize opaque state using tuplesort routine. |
1421 | */ |
1422 | sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort); |
1423 | tuplesort_initialize_shared(sharedsort, scantuplesortstates, |
1424 | pcxt->seg); |
1425 | |
1426 | shm_toc_insert(pcxt->toc, PARALLEL_KEY_BTREE_SHARED, btshared); |
1427 | shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort); |
1428 | |
1429 | /* Unique case requires a second spool, and associated shared state */ |
1430 | if (!btspool->isunique) |
1431 | sharedsort2 = NULL; |
1432 | else |
1433 | { |
1434 | /* |
1435 | * Store additional shared tuplesort-private state, for which we |
1436 | * reserved space. Then, initialize opaque state using tuplesort |
1437 | * routine. |
1438 | */ |
1439 | sharedsort2 = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort); |
1440 | tuplesort_initialize_shared(sharedsort2, scantuplesortstates, |
1441 | pcxt->seg); |
1442 | |
1443 | shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT_SPOOL2, sharedsort2); |
1444 | } |
1445 | |
1446 | /* Store query string for workers */ |
1447 | sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1); |
1448 | memcpy(sharedquery, debug_query_string, querylen + 1); |
1449 | shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, sharedquery); |
1450 | |
1451 | /* Launch workers, saving status for leader/caller */ |
1452 | LaunchParallelWorkers(pcxt); |
1453 | btleader->pcxt = pcxt; |
1454 | btleader->nparticipanttuplesorts = pcxt->nworkers_launched; |
1455 | if (leaderparticipates) |
1456 | btleader->nparticipanttuplesorts++; |
1457 | btleader->btshared = btshared; |
1458 | btleader->sharedsort = sharedsort; |
1459 | btleader->sharedsort2 = sharedsort2; |
1460 | btleader->snapshot = snapshot; |
1461 | |
1462 | /* If no workers were successfully launched, back out (do serial build) */ |
1463 | if (pcxt->nworkers_launched == 0) |
1464 | { |
1465 | _bt_end_parallel(btleader); |
1466 | return; |
1467 | } |
1468 | |
1469 | /* Save leader state now that it's clear build will be parallel */ |
1470 | buildstate->btleader = btleader; |
1471 | |
1472 | /* Join heap scan ourselves */ |
1473 | if (leaderparticipates) |
1474 | _bt_leader_participate_as_worker(buildstate); |
1475 | |
1476 | /* |
1477 | * Caller needs to wait for all launched workers when we return. Make |
1478 | * sure that the failure-to-start case will not hang forever. |
1479 | */ |
1480 | WaitForParallelWorkersToAttach(pcxt); |
1481 | } |
1482 | |
1483 | /* |
1484 | * Shut down workers, destroy parallel context, and end parallel mode. |
1485 | */ |
1486 | static void |
1487 | _bt_end_parallel(BTLeader *btleader) |
1488 | { |
1489 | /* Shutdown worker processes */ |
1490 | WaitForParallelWorkersToFinish(btleader->pcxt); |
1491 | /* Free last reference to MVCC snapshot, if one was used */ |
1492 | if (IsMVCCSnapshot(btleader->snapshot)) |
1493 | UnregisterSnapshot(btleader->snapshot); |
1494 | DestroyParallelContext(btleader->pcxt); |
1495 | ExitParallelMode(); |
1496 | } |
1497 | |
1498 | /* |
1499 | * Returns size of shared memory required to store state for a parallel |
1500 | * btree index build based on the snapshot its parallel scan will use. |
1501 | */ |
1502 | static Size |
1503 | _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot) |
1504 | { |
	/* cf. shm_toc_allocate as to why BUFFERALIGN is used */
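	/*
	 * The total is the buffer-aligned, fixed-size BTShared struct plus
	 * whatever the table AM needs for its shared parallel scan descriptor;
	 * the latter can vary with the snapshot, since an MVCC snapshot is
	 * serialized into the descriptor.
	 */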
1506 | return add_size(BUFFERALIGN(sizeof(BTShared)), |
1507 | table_parallelscan_estimate(heap, snapshot)); |
1508 | } |
1509 | |
1510 | /* |
1511 | * Within leader, wait for end of heap scan. |
1512 | * |
 * When called, the parallel heap scan started by _bt_begin_parallel() will
 * already be underway within worker processes (when the leader participates
 * as a worker, we should end up here just as the workers are finishing).
1516 | * |
 * Fills in the fields needed for ambuild statistics, and lets the caller set
 * the field indicating that some worker encountered a broken HOT chain.
1519 | * |
1520 | * Returns the total number of heap tuples scanned. |
1521 | */ |
1522 | static double |
1523 | _bt_parallel_heapscan(BTBuildState *buildstate, bool *brokenhotchain) |
1524 | { |
1525 | BTShared *btshared = buildstate->btleader->btshared; |
1526 | int nparticipanttuplesorts; |
1527 | double reltuples; |
1528 | |
1529 | nparticipanttuplesorts = buildstate->btleader->nparticipanttuplesorts; |
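
	/*
	 * Wait for every participant (each launched worker, plus the leader if
	 * it participated) to report that its scan and sort are finished.  This
	 * follows the usual pattern: check the shared counter under the
	 * spinlock, and sleep on the condition variable until it is signaled.
	 */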
1530 | for (;;) |
1531 | { |
1532 | SpinLockAcquire(&btshared->mutex); |
1533 | if (btshared->nparticipantsdone == nparticipanttuplesorts) |
1534 | { |
1535 | buildstate->havedead = btshared->havedead; |
1536 | buildstate->indtuples = btshared->indtuples; |
1537 | *brokenhotchain = btshared->brokenhotchain; |
1538 | reltuples = btshared->reltuples; |
1539 | SpinLockRelease(&btshared->mutex); |
1540 | break; |
1541 | } |
1542 | SpinLockRelease(&btshared->mutex); |
1543 | |
1544 | ConditionVariableSleep(&btshared->workersdonecv, |
1545 | WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN); |
1546 | } |
1547 | |
1548 | ConditionVariableCancelSleep(); |
1549 | |
1550 | return reltuples; |
1551 | } |
1552 | |
1553 | /* |
1554 | * Within leader, participate as a parallel worker. |
1555 | */ |
1556 | static void |
1557 | _bt_leader_participate_as_worker(BTBuildState *buildstate) |
1558 | { |
1559 | BTLeader *btleader = buildstate->btleader; |
1560 | BTSpool *leaderworker; |
1561 | BTSpool *leaderworker2; |
1562 | int sortmem; |
1563 | |
1564 | /* Allocate memory and initialize private spool */ |
1565 | leaderworker = (BTSpool *) palloc0(sizeof(BTSpool)); |
1566 | leaderworker->heap = buildstate->spool->heap; |
1567 | leaderworker->index = buildstate->spool->index; |
1568 | leaderworker->isunique = buildstate->spool->isunique; |
1569 | |
1570 | /* Initialize second spool, if required */ |
1571 | if (!btleader->btshared->isunique) |
1572 | leaderworker2 = NULL; |
1573 | else |
1574 | { |
1575 | /* Allocate memory for worker's own private secondary spool */ |
1576 | leaderworker2 = (BTSpool *) palloc0(sizeof(BTSpool)); |
1577 | |
1578 | /* Initialize worker's own secondary spool */ |
1579 | leaderworker2->heap = leaderworker->heap; |
1580 | leaderworker2->index = leaderworker->index; |
1581 | leaderworker2->isunique = false; |
1582 | } |
1583 | |
1584 | /* |
1585 | * Might as well use reliable figure when doling out maintenance_work_mem |
1586 | * (when requested number of workers were not launched, this will be |
1587 | * somewhat higher than it is for other workers). |
1588 | */ |
1589 | sortmem = maintenance_work_mem / btleader->nparticipanttuplesorts; |
1590 | |
1591 | /* Perform work common to all participants */ |
1592 | _bt_parallel_scan_and_sort(leaderworker, leaderworker2, btleader->btshared, |
1593 | btleader->sharedsort, btleader->sharedsort2, |
1594 | sortmem, true); |
1595 | |
1596 | #ifdef BTREE_BUILD_STATS |
1597 | if (log_btree_build_stats) |
1598 | { |
1599 | ShowUsage("BTREE BUILD (Leader Partial Spool) STATISTICS" ); |
1600 | ResetUsage(); |
1601 | } |
1602 | #endif /* BTREE_BUILD_STATS */ |
1603 | } |
1604 | |
1605 | /* |
1606 | * Perform work within a launched parallel process. |
1607 | */ |
1608 | void |
1609 | _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc) |
1610 | { |
1611 | char *sharedquery; |
1612 | BTSpool *btspool; |
1613 | BTSpool *btspool2; |
1614 | BTShared *btshared; |
1615 | Sharedsort *sharedsort; |
1616 | Sharedsort *sharedsort2; |
1617 | Relation heapRel; |
1618 | Relation indexRel; |
1619 | LOCKMODE heapLockmode; |
1620 | LOCKMODE indexLockmode; |
1621 | int sortmem; |
1622 | |
1623 | #ifdef BTREE_BUILD_STATS |
1624 | if (log_btree_build_stats) |
1625 | ResetUsage(); |
1626 | #endif /* BTREE_BUILD_STATS */ |
1627 | |
1628 | /* Set debug_query_string for individual workers first */ |
1629 | sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, false); |
1630 | debug_query_string = sharedquery; |
1631 | |
1632 | /* Report the query string from leader */ |
1633 | pgstat_report_activity(STATE_RUNNING, debug_query_string); |
1634 | |
1635 | /* Look up nbtree shared state */ |
1636 | btshared = shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false); |
1637 | |
1638 | /* Open relations using lock modes known to be obtained by index.c */ |
1639 | if (!btshared->isconcurrent) |
1640 | { |
1641 | heapLockmode = ShareLock; |
1642 | indexLockmode = AccessExclusiveLock; |
1643 | } |
1644 | else |
1645 | { |
1646 | heapLockmode = ShareUpdateExclusiveLock; |
1647 | indexLockmode = RowExclusiveLock; |
1648 | } |
1649 | |
1650 | /* Open relations within worker */ |
1651 | heapRel = table_open(btshared->heaprelid, heapLockmode); |
1652 | indexRel = index_open(btshared->indexrelid, indexLockmode); |
1653 | |
1654 | /* Initialize worker's own spool */ |
1655 | btspool = (BTSpool *) palloc0(sizeof(BTSpool)); |
1656 | btspool->heap = heapRel; |
1657 | btspool->index = indexRel; |
1658 | btspool->isunique = btshared->isunique; |
1659 | |
1660 | /* Look up shared state private to tuplesort.c */ |
1661 | sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false); |
1662 | tuplesort_attach_shared(sharedsort, seg); |
1663 | if (!btshared->isunique) |
1664 | { |
1665 | btspool2 = NULL; |
1666 | sharedsort2 = NULL; |
1667 | } |
1668 | else |
1669 | { |
1670 | /* Allocate memory for worker's own private secondary spool */ |
1671 | btspool2 = (BTSpool *) palloc0(sizeof(BTSpool)); |
1672 | |
1673 | /* Initialize worker's own secondary spool */ |
1674 | btspool2->heap = btspool->heap; |
1675 | btspool2->index = btspool->index; |
1676 | btspool2->isunique = false; |
1677 | /* Look up shared state private to tuplesort.c */ |
1678 | sharedsort2 = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT_SPOOL2, false); |
1679 | tuplesort_attach_shared(sharedsort2, seg); |
1680 | } |
1681 | |
1682 | /* Perform sorting of spool, and possibly a spool2 */ |
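	/*
	 * Note that scantuplesortstates was fixed when the shared state was
	 * created, so it reflects the planned number of participants; if fewer
	 * workers were actually launched, each worker's share here is somewhat
	 * smaller than the leader's (see _bt_leader_participate_as_worker()).
	 */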
1683 | sortmem = maintenance_work_mem / btshared->scantuplesortstates; |
1684 | _bt_parallel_scan_and_sort(btspool, btspool2, btshared, sharedsort, |
1685 | sharedsort2, sortmem, false); |
1686 | |
1687 | #ifdef BTREE_BUILD_STATS |
1688 | if (log_btree_build_stats) |
1689 | { |
1690 | ShowUsage("BTREE BUILD (Worker Partial Spool) STATISTICS" ); |
1691 | ResetUsage(); |
1692 | } |
1693 | #endif /* BTREE_BUILD_STATS */ |
1694 | |
1695 | index_close(indexRel, indexLockmode); |
1696 | table_close(heapRel, heapLockmode); |
1697 | } |
1698 | |
1699 | /* |
1700 | * Perform a worker's portion of a parallel sort. |
1701 | * |
 * This generates a tuplesort for the passed btspool, and a second tuplesort
 * state if a second btspool is needed (i.e., for unique index builds).  All
 * other spool fields should already be set when this is called.
1705 | * |
1706 | * sortmem is the amount of working memory to use within each worker, |
1707 | * expressed in KBs. |
1708 | * |
1709 | * When this returns, workers are done, and need only release resources. |
1710 | */ |
1711 | static void |
1712 | _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, |
1713 | BTShared *btshared, Sharedsort *sharedsort, |
1714 | Sharedsort *sharedsort2, int sortmem, bool progress) |
1715 | { |
1716 | SortCoordinate coordinate; |
1717 | BTBuildState buildstate; |
1718 | TableScanDesc scan; |
1719 | double reltuples; |
1720 | IndexInfo *indexInfo; |
1721 | |
1722 | /* Initialize local tuplesort coordination state */ |
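	/*
	 * isWorker identifies this as a worker-side sort; nParticipants is only
	 * meaningful for the leader's coordinate state, so -1 serves as a
	 * placeholder here.
	 */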
1723 | coordinate = palloc0(sizeof(SortCoordinateData)); |
1724 | coordinate->isWorker = true; |
1725 | coordinate->nParticipants = -1; |
1726 | coordinate->sharedsort = sharedsort; |
1727 | |
1728 | /* Begin "partial" tuplesort */ |
1729 | btspool->sortstate = tuplesort_begin_index_btree(btspool->heap, |
1730 | btspool->index, |
1731 | btspool->isunique, |
1732 | sortmem, coordinate, |
1733 | false); |
1734 | |
1735 | /* |
	 * Just as in the serial case, there may be a second spool.  If so, a
	 * second, dedicated spool2 partial tuplesort is required.
1738 | */ |
1739 | if (btspool2) |
1740 | { |
1741 | SortCoordinate coordinate2; |
1742 | |
1743 | /* |
1744 | * We expect that the second one (for dead tuples) won't get very |
		 * full, so we give it only work_mem (unless sortmem is less for the
		 * worker).  Worker processes are generally permitted to allocate
1747 | * work_mem independently. |
1748 | */ |
1749 | coordinate2 = palloc0(sizeof(SortCoordinateData)); |
1750 | coordinate2->isWorker = true; |
1751 | coordinate2->nParticipants = -1; |
1752 | coordinate2->sharedsort = sharedsort2; |
1753 | btspool2->sortstate = |
1754 | tuplesort_begin_index_btree(btspool->heap, btspool->index, false, |
1755 | Min(sortmem, work_mem), coordinate2, |
1756 | false); |
1757 | } |
1758 | |
1759 | /* Fill in buildstate for _bt_build_callback() */ |
1760 | buildstate.isunique = btshared->isunique; |
1761 | buildstate.havedead = false; |
1762 | buildstate.heap = btspool->heap; |
1763 | buildstate.spool = btspool; |
1764 | buildstate.spool2 = btspool2; |
1765 | buildstate.indtuples = 0; |
1766 | buildstate.btleader = NULL; |
1767 | |
1768 | /* Join parallel scan */ |
1769 | indexInfo = BuildIndexInfo(btspool->index); |
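	/*
	 * BuildIndexInfo() can't tell whether this is a CONCURRENTLY build, so
	 * copy that flag from the shared state.
	 */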
1770 | indexInfo->ii_Concurrent = btshared->isconcurrent; |
1771 | scan = table_beginscan_parallel(btspool->heap, |
1772 | ParallelTableScanFromBTShared(btshared)); |
1773 | reltuples = table_index_build_scan(btspool->heap, btspool->index, indexInfo, |
1774 | true, progress, _bt_build_callback, |
1775 | (void *) &buildstate, scan); |
1776 | |
1777 | /* |
1778 | * Execute this worker's part of the sort. |
1779 | * |
	 * Unlike the leader and serial cases, we cannot avoid calling
	 * tuplesort_performsort() for spool2, even if it ends up containing no
	 * dead tuples (tuplesort does not allow workers to skip that step).
1783 | */ |
1784 | tuplesort_performsort(btspool->sortstate); |
1785 | if (btspool2) |
1786 | tuplesort_performsort(btspool2->sortstate); |
1787 | |
1788 | /* |
1789 | * Done. Record ambuild statistics, and whether we encountered a broken |
1790 | * HOT chain. |
1791 | */ |
1792 | SpinLockAcquire(&btshared->mutex); |
1793 | btshared->nparticipantsdone++; |
1794 | btshared->reltuples += reltuples; |
1795 | if (buildstate.havedead) |
1796 | btshared->havedead = true; |
1797 | btshared->indtuples += buildstate.indtuples; |
1798 | if (indexInfo->ii_BrokenHotChain) |
1799 | btshared->brokenhotchain = true; |
1800 | SpinLockRelease(&btshared->mutex); |
1801 | |
1802 | /* Notify leader */ |
1803 | ConditionVariableSignal(&btshared->workersdonecv); |
1804 | |
1805 | /* We can end tuplesorts immediately */ |
1806 | tuplesort_end(btspool->sortstate); |
1807 | if (btspool2) |
1808 | tuplesort_end(btspool2->sortstate); |
1809 | } |
1810 | |