/*-------------------------------------------------------------------------
 *
 * gistbuild.c
 *    implementation of the build algorithm for GiST indexes.
 *
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *    src/backend/access/gist/gistbuild.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <math.h>

#include "access/genam.h"
#include "access/gist_private.h"
#include "access/gistxlog.h"
#include "access/tableam.h"
#include "access/xloginsert.h"
#include "catalog/index.h"
#include "miscadmin.h"
#include "optimizer/optimizer.h"
#include "storage/bufmgr.h"
#include "storage/smgr.h"
#include "utils/memutils.h"
#include "utils/rel.h"

/* Interval, in tuples, between checks of whether to switch to buffering build mode */
#define BUFFERING_MODE_SWITCH_CHECK_STEP 256

/*
 * Number of tuples to process in the slow way before switching to buffering
 * mode, when buffering is explicitly turned on. Also, the number of tuples
 * to process between readjusting the buffer size parameter, while in
 * buffering mode.
 */
#define BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET 4096

typedef enum
{
    GIST_BUFFERING_DISABLED,    /* in regular build mode and aren't going to
                                 * switch */
    GIST_BUFFERING_AUTO,        /* in regular build mode, but will switch to
                                 * buffering build mode if the index grows too
                                 * big */
    GIST_BUFFERING_STATS,       /* gathering statistics of index tuple size
                                 * before switching to the buffering build
                                 * mode */
    GIST_BUFFERING_ACTIVE       /* in buffering build mode */
} GistBufferingMode;
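
/*
 * Valid transitions between the modes above: AUTO and STATS can each switch
 * to ACTIVE (in gistInitBuffering()), or to DISABLED if there isn't enough
 * memory for a buffering build. DISABLED and ACTIVE are final.
 */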

/* Working state for gistbuild and its callback */
typedef struct
{
    Relation    indexrel;
    Relation    heaprel;
    GISTSTATE  *giststate;

    int64       indtuples;      /* number of tuples indexed */
    int64       indtuplesSize;  /* total size of all indexed tuples */

    Size        freespace;      /* amount of free space to leave on pages */

    /*
     * Extra data structures used during a buffering build. 'gfbb' contains
     * information related to managing the build buffers. 'parentMap' is a
     * lookup table of the parent of each internal page.
     */
    GISTBuildBuffers *gfbb;
    HTAB       *parentMap;

    GistBufferingMode bufferingMode;
} GISTBuildState;

/* prototypes for private functions */
static void gistInitBuffering(GISTBuildState *buildstate);
static int  calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep);
static void gistBuildCallback(Relation index,
                              HeapTuple htup,
                              Datum *values,
                              bool *isnull,
                              bool tupleIsAlive,
                              void *state);
static void gistBufferingBuildInsert(GISTBuildState *buildstate,
                                     IndexTuple itup);
static bool gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
                            BlockNumber startblkno, int startlevel);
static BlockNumber gistbufferinginserttuples(GISTBuildState *buildstate,
                                             Buffer buffer, int level,
                                             IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
                                             BlockNumber parentblk, OffsetNumber downlinkoffnum);
static Buffer gistBufferingFindCorrectParent(GISTBuildState *buildstate,
                                             BlockNumber childblkno, int level,
                                             BlockNumber *parentblk,
                                             OffsetNumber *downlinkoffnum);
static void gistProcessEmptyingQueue(GISTBuildState *buildstate);
static void gistEmptyAllBuffers(GISTBuildState *buildstate);
static int  gistGetMaxLevel(Relation index);

static void gistInitParentMap(GISTBuildState *buildstate);
static void gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child,
                               BlockNumber parent);
static void gistMemorizeAllDownlinks(GISTBuildState *buildstate, Buffer parent);
static BlockNumber gistGetParent(GISTBuildState *buildstate, BlockNumber child);

/*
 * Main entry point to GiST index build. Initially calls insert over and over,
 * but switches to more efficient buffering build algorithm after a certain
 * number of tuples (unless buffering mode is disabled).
 */
IndexBuildResult *
gistbuild(Relation heap, Relation index, IndexInfo *indexInfo)
{
    IndexBuildResult *result;
    double      reltuples;
    GISTBuildState buildstate;
    Buffer      buffer;
    Page        page;
    MemoryContext oldcxt = CurrentMemoryContext;
    int         fillfactor;

    buildstate.indexrel = index;
    buildstate.heaprel = heap;
    if (index->rd_options)
    {
        /* Get buffering mode from the options string */
        GiSTOptions *options = (GiSTOptions *) index->rd_options;
        char       *bufferingMode = (char *) options + options->bufferingModeOffset;

        if (strcmp(bufferingMode, "on") == 0)
            buildstate.bufferingMode = GIST_BUFFERING_STATS;
        else if (strcmp(bufferingMode, "off") == 0)
            buildstate.bufferingMode = GIST_BUFFERING_DISABLED;
        else
            buildstate.bufferingMode = GIST_BUFFERING_AUTO;

        fillfactor = options->fillfactor;
    }
    else
    {
        /*
         * By default, switch to buffering mode when the index grows too large
         * to fit in cache.
         */
        buildstate.bufferingMode = GIST_BUFFERING_AUTO;
        fillfactor = GIST_DEFAULT_FILLFACTOR;
    }
    /* Calculate target amount of free space to leave on pages */
    buildstate.freespace = BLCKSZ * (100 - fillfactor) / 100;
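
    /*
     * For example, with the default fillfactor of 90 and the default 8 kB
     * page size, this leaves 8192 * 10 / 100 = 819 bytes unused on each
     * page.
     */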

    /*
     * We expect to be called exactly once for any index relation. If that's
     * not the case, big trouble's what we have.
     */
    if (RelationGetNumberOfBlocks(index) != 0)
        elog(ERROR, "index \"%s\" already contains data",
             RelationGetRelationName(index));

    /* no locking is needed */
    buildstate.giststate = initGISTstate(index);

    /*
     * Create a temporary memory context that is reset once for each tuple
     * processed. (Note: we don't bother to make this a child of the
     * giststate's scanCxt, so we have to delete it separately at the end.)
     */
    buildstate.giststate->tempCxt = createTempGistContext();

    /* initialize the root page */
    buffer = gistNewBuffer(index);
    Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO);
    page = BufferGetPage(buffer);

    START_CRIT_SECTION();

    GISTInitBuffer(buffer, F_LEAF);

    MarkBufferDirty(buffer);
    PageSetLSN(page, GistBuildLSN);

    UnlockReleaseBuffer(buffer);

    END_CRIT_SECTION();

    /* build the index */
    buildstate.indtuples = 0;
    buildstate.indtuplesSize = 0;

    /*
     * Do the heap scan.
     */
    reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
                                       gistBuildCallback,
                                       (void *) &buildstate, NULL);

    /*
     * If buffering was used, flush out all the tuples that are still in the
     * buffers.
     */
    if (buildstate.bufferingMode == GIST_BUFFERING_ACTIVE)
    {
        elog(DEBUG1, "all tuples processed, emptying buffers");
        gistEmptyAllBuffers(&buildstate);
        gistFreeBuildBuffers(buildstate.gfbb);
    }

    /* okay, all heap tuples are indexed */
    MemoryContextSwitchTo(oldcxt);
    MemoryContextDelete(buildstate.giststate->tempCxt);

    freeGISTstate(buildstate.giststate);

    /*
     * We didn't write WAL records as we built the index, so if WAL-logging is
     * required, write all pages to the WAL now.
     */
    if (RelationNeedsWAL(index))
    {
        log_newpage_range(index, MAIN_FORKNUM,
                          0, RelationGetNumberOfBlocks(index),
                          true);
    }

    /*
     * Return statistics
     */
    result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));

    result->heap_tuples = reltuples;
    result->index_tuples = (double) buildstate.indtuples;

    return result;
}

/*
 * Validator for "buffering" reloption on GiST indexes. Allows "on", "off"
 * and "auto" values.
 */
void
gistValidateBufferingOption(const char *value)
{
    if (value == NULL ||
        (strcmp(value, "on") != 0 &&
         strcmp(value, "off") != 0 &&
         strcmp(value, "auto") != 0))
    {
        ereport(ERROR,
                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                 errmsg("invalid value for \"buffering\" option"),
                 errdetail("Valid values are \"on\", \"off\", and \"auto\".")));
    }
}

/*
 * Attempt to switch to buffering mode.
 *
 * If there is not enough memory for buffering build, sets bufferingMode
 * to GIST_BUFFERING_DISABLED, so that we don't bother to try the switch
 * anymore. Otherwise initializes the build buffers, and sets bufferingMode to
 * GIST_BUFFERING_ACTIVE.
 */
static void
gistInitBuffering(GISTBuildState *buildstate)
{
    Relation    index = buildstate->indexrel;
    int         pagesPerBuffer;
    Size        pageFreeSpace;
    Size        itupAvgSize,
                itupMinSize;
    double      avgIndexTuplesPerPage,
                maxIndexTuplesPerPage;
    int         i;
    int         levelStep;

    /* Calculate the space on an index page that is available for index tuples */
    pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)
        - sizeof(ItemIdData)
        - buildstate->freespace;

    /*
     * Calculate the average size of the index tuples inserted so far, using
     * the gathered statistics.
     */
    itupAvgSize = (double) buildstate->indtuplesSize /
        (double) buildstate->indtuples;

    /*
     * Calculate the minimal possible size of an index tuple from the index
     * metadata. The minimal possible size of a varlena is VARHDRSZ.
     *
     * XXX: that's not actually true, as a short varlena can be just 2 bytes.
     * And we should take padding into account here.
     */
    itupMinSize = (Size) MAXALIGN(sizeof(IndexTupleData));
    for (i = 0; i < index->rd_att->natts; i++)
    {
        if (TupleDescAttr(index->rd_att, i)->attlen < 0)
            itupMinSize += VARHDRSZ;
        else
            itupMinSize += TupleDescAttr(index->rd_att, i)->attlen;
    }

    /* Calculate the average and maximal number of index tuples that fit on a page */
    avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize;
    maxIndexTuplesPerPage = pageFreeSpace / itupMinSize;

    /*
     * We need to calculate two parameters for the buffering algorithm:
     * levelStep and pagesPerBuffer.
     *
     * levelStep determines the size of subtree that we operate on, while
     * emptying a buffer. A higher value is better, as you need fewer buffer
     * emptying steps to build the index. However, if you set it too high, the
     * subtree doesn't fit in cache anymore, and you quickly lose the benefit
     * of the buffers.
     *
     * In Arge et al's paper, levelStep is chosen as logB(M/4B), where B is
     * the number of tuples on page (ie. fanout), and M is the amount of
     * internal memory available. Curiously, they don't explain *why* that
     * setting is optimal. We calculate it by taking the highest levelStep so
     * that a subtree still fits in cache. For a small B, our way of
     * calculating levelStep is very close to Arge et al's formula. For a
     * large B, our formula gives a value that is 2x higher.
     *
     * The average size (in pages) of a subtree of depth n can be calculated
     * as a geometric series:
     *
     * B^0 + B^1 + B^2 + ... + B^n = (1 - B^(n + 1)) / (1 - B)
     *
     * where B is the average number of index tuples on page. The subtree is
     * cached in the shared buffer cache and the OS cache, so we choose
     * levelStep so that the subtree size is comfortably smaller than
     * effective_cache_size, with a safety factor of 4.
     *
     * The estimate on the average number of index tuples on page is based on
     * average tuple sizes observed before switching to buffered build, so the
     * real subtree size can be somewhat larger. Also, it would be selfish to
     * gobble the whole cache for our index build. The safety factor of 4
     * should account for those effects.
     *
     * The other limiting factor for setting levelStep is that while
     * processing a subtree, we need to hold one page for each buffer at the
     * next lower buffered level. The max. number of buffers needed for that
     * is maxIndexTuplesPerPage^levelStep. This is very conservative, but
     * hopefully maintenance_work_mem is set high enough that you're
     * constrained by effective_cache_size rather than maintenance_work_mem.
     *
     * XXX: the buffer hash table consumes a fair amount of memory too per
     * buffer, but that is not currently taken into account. That scales with
     * the total number of buffers used, ie. the index size, and with
     * levelStep. Note that a higher levelStep *reduces* the amount of memory
     * needed for the hash table.
     */
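
    /*
     * A worked example of the cache constraint, assuming an average of 80
     * index tuples per page (B = 80) and an effective_cache_size of 4 GB of
     * 8 kB pages, i.e. 524288 pages: a levelStep of 1 covers subtrees of
     * (1 - 80^2) / (1 - 80) = 81 pages, levelStep 2 covers 6481 pages, and
     * levelStep 3 covers about 518000 pages, which exceeds the 131072-page
     * budget (a quarter of the cache). So, unless the maintenance_work_mem
     * check binds first, the loop below settles on 2.
     */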
    levelStep = 1;
    for (;;)
    {
        double      subtreesize;
        double      maxlowestlevelpages;

        /* size of an average subtree at this levelStep (in pages) */
        subtreesize =
            (1 - pow(avgIndexTuplesPerPage, (double) (levelStep + 1))) /
            (1 - avgIndexTuplesPerPage);

        /* max number of pages at the lowest level of a subtree */
        maxlowestlevelpages = pow(maxIndexTuplesPerPage, (double) levelStep);

        /* subtree must fit in cache (with safety factor of 4) */
        if (subtreesize > effective_cache_size / 4)
            break;

        /* each node in the lowest level of a subtree has one page in memory */
        if (maxlowestlevelpages > ((double) maintenance_work_mem * 1024) / BLCKSZ)
            break;

        /* Good, we can handle this levelStep. See if we can go one higher. */
        levelStep++;
    }

    /*
     * The loop above exited when levelStep had just grown past the last
     * acceptable value, so back it off by one.
     */
    levelStep--;

    /*
     * If there's not enough cache or maintenance_work_mem, fall back to plain
     * inserts.
     */
    if (levelStep <= 0)
    {
        elog(DEBUG1, "failed to switch to buffered GiST build");
        buildstate->bufferingMode = GIST_BUFFERING_DISABLED;
        return;
    }

    /*
     * The second parameter to set is pagesPerBuffer, which determines the
     * size of each buffer. We adjust pagesPerBuffer also during the build,
     * which is why this calculation is in a separate function.
     */
    pagesPerBuffer = calculatePagesPerBuffer(buildstate, levelStep);

    /* Initialize GISTBuildBuffers with these parameters */
    buildstate->gfbb = gistInitBuildBuffers(pagesPerBuffer, levelStep,
                                            gistGetMaxLevel(index));

    gistInitParentMap(buildstate);

    buildstate->bufferingMode = GIST_BUFFERING_ACTIVE;

    elog(DEBUG1, "switched to buffered GiST build; level step = %d, pagesPerBuffer = %d",
         levelStep, pagesPerBuffer);
}

/*
 * Calculate pagesPerBuffer parameter for the buffering algorithm.
 *
 * Buffer size is chosen so that assuming that tuples are distributed
 * randomly, emptying half a buffer fills on average one page in every buffer
 * at the next lower level.
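 *
 * To see why (a sketch, with B standing for the average number of index
 * tuples per page): half a buffer holds (pagesPerBuffer / 2) * B tuples,
 * and there are B^levelStep buffers at the next lower buffered level, each
 * of which needs B tuples to fill one page. Equating the two,
 * (pagesPerBuffer / 2) * B = B^(levelStep + 1), gives
 * pagesPerBuffer = 2 * B^levelStep, which is the formula used below.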
 */
static int
calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep)
{
    double      pagesPerBuffer;
    double      avgIndexTuplesPerPage;
    double      itupAvgSize;
    Size        pageFreeSpace;

    /* Calculate the space on an index page that is available for index tuples */
    pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)
        - sizeof(ItemIdData)
        - buildstate->freespace;

    /*
     * Calculate the average size of the index tuples inserted so far, using
     * the gathered statistics.
     */
    itupAvgSize = (double) buildstate->indtuplesSize /
        (double) buildstate->indtuples;

    avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize;

    /*
     * Recalculate the required size of the buffers.
     */
    pagesPerBuffer = 2 * pow(avgIndexTuplesPerPage, levelStep);

    return (int) rint(pagesPerBuffer);
}

/*
 * Per-tuple callback for table_index_build_scan.
 */
static void
gistBuildCallback(Relation index,
                  HeapTuple htup,
                  Datum *values,
                  bool *isnull,
                  bool tupleIsAlive,
                  void *state)
{
    GISTBuildState *buildstate = (GISTBuildState *) state;
    IndexTuple  itup;
    MemoryContext oldCtx;

    oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);

    /* form an index tuple and point it at the heap tuple */
    itup = gistFormTuple(buildstate->giststate, index, values, isnull, true);
    itup->t_tid = htup->t_self;

    if (buildstate->bufferingMode == GIST_BUFFERING_ACTIVE)
    {
        /* We have buffers, so use them. */
        gistBufferingBuildInsert(buildstate, itup);
    }
    else
    {
        /*
         * There are no buffers (yet). Since we already have the index
         * relation locked, we call gistdoinsert directly.
         */
        gistdoinsert(index, itup, buildstate->freespace,
                     buildstate->giststate, buildstate->heaprel, true);
    }

    /* Update tuple count and total size. */
    buildstate->indtuples += 1;
    buildstate->indtuplesSize += IndexTupleSize(itup);

    MemoryContextSwitchTo(oldCtx);
    MemoryContextReset(buildstate->giststate->tempCxt);

    if (buildstate->bufferingMode == GIST_BUFFERING_ACTIVE &&
        buildstate->indtuples % BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET == 0)
    {
        /* Adjust the target buffer size now */
        buildstate->gfbb->pagesPerBuffer =
            calculatePagesPerBuffer(buildstate, buildstate->gfbb->levelStep);
    }

    /*
     * In 'auto' mode, check if the index has grown too large to fit in cache,
     * and switch to buffering mode if it has.
     *
     * To avoid excessive calls to smgrnblocks(), only check this every
     * BUFFERING_MODE_SWITCH_CHECK_STEP index tuples.
     */
    if ((buildstate->bufferingMode == GIST_BUFFERING_AUTO &&
         buildstate->indtuples % BUFFERING_MODE_SWITCH_CHECK_STEP == 0 &&
         effective_cache_size < smgrnblocks(index->rd_smgr, MAIN_FORKNUM)) ||
        (buildstate->bufferingMode == GIST_BUFFERING_STATS &&
         buildstate->indtuples >= BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET))
    {
        /*
         * Index doesn't fit in effective cache anymore. Try to switch to
         * buffering build mode.
         */
        gistInitBuffering(buildstate);
    }
}

/*
 * Insert function for buffering index build.
 */
static void
gistBufferingBuildInsert(GISTBuildState *buildstate, IndexTuple itup)
{
    /* Insert the tuple into the buffers. */
    gistProcessItup(buildstate, itup, 0, buildstate->gfbb->rootlevel);

    /* If we filled up (half of a) buffer, process buffer emptying. */
    gistProcessEmptyingQueue(buildstate);
}

/*
 * Process an index tuple. Runs the tuple down the tree until we reach a leaf
 * page or node buffer, and inserts the tuple there. Returns true if we have
 * to stop the buffer emptying process (because one of the child buffers
 * can't take index tuples anymore).
 */
static bool
gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
                BlockNumber startblkno, int startlevel)
{
    GISTSTATE  *giststate = buildstate->giststate;
    GISTBuildBuffers *gfbb = buildstate->gfbb;
    Relation    indexrel = buildstate->indexrel;
    BlockNumber childblkno;
    Buffer      buffer;
    bool        result = false;
    BlockNumber blkno;
    int         level;
    OffsetNumber downlinkoffnum = InvalidOffsetNumber;
    BlockNumber parentblkno = InvalidBlockNumber;

    CHECK_FOR_INTERRUPTS();

    /*
     * Loop until we reach a leaf page (level == 0) or a level with buffers
     * (not including the level we start at, because we would otherwise make
     * no progress).
     */
    blkno = startblkno;
    level = startlevel;
    for (;;)
    {
        ItemId      iid;
        IndexTuple  idxtuple,
                    newtup;
        Page        page;
        OffsetNumber childoffnum;

        /* Have we reached a level with buffers? */
        if (LEVEL_HAS_BUFFERS(level, gfbb) && level != startlevel)
            break;

        /* Have we reached a leaf page? */
        if (level == 0)
            break;

        /*
         * Nope. Descend to the next level then; choose a child to descend
         * down to.
         */

        buffer = ReadBuffer(indexrel, blkno);
        LockBuffer(buffer, GIST_EXCLUSIVE);

        page = (Page) BufferGetPage(buffer);
        childoffnum = gistchoose(indexrel, page, itup, giststate);
        iid = PageGetItemId(page, childoffnum);
        idxtuple = (IndexTuple) PageGetItem(page, iid);
        childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));

        if (level > 1)
            gistMemorizeParent(buildstate, childblkno, blkno);

        /*
         * Check that the key representing the target child node is consistent
         * with the key we're inserting. Update it if it's not.
         */
        newtup = gistgetadjusted(indexrel, idxtuple, itup, giststate);
        if (newtup)
        {
            blkno = gistbufferinginserttuples(buildstate,
                                              buffer,
                                              level,
                                              &newtup,
                                              1,
                                              childoffnum,
                                              InvalidBlockNumber,
                                              InvalidOffsetNumber);
            /* gistbufferinginserttuples() released the buffer */
        }
        else
            UnlockReleaseBuffer(buffer);

        /* Descend to the child */
        parentblkno = blkno;
        blkno = childblkno;
        downlinkoffnum = childoffnum;
        Assert(level > 0);
        level--;
    }

    if (LEVEL_HAS_BUFFERS(level, gfbb))
    {
        /*
         * We've reached a level with buffers. Place the index tuple in the
         * buffer, and add the buffer to the emptying queue if it overflows.
         */
        GISTNodeBuffer *childNodeBuffer;

        /* Find the buffer or create a new one */
        childNodeBuffer = gistGetNodeBuffer(gfbb, giststate, blkno, level);

        /* Add index tuple to it */
        gistPushItupToNodeBuffer(gfbb, childNodeBuffer, itup);

        if (BUFFER_OVERFLOWED(childNodeBuffer, gfbb))
            result = true;
    }
    else
    {
        /*
         * We've reached a leaf page. Place the tuple here.
         */
        Assert(level == 0);
        buffer = ReadBuffer(indexrel, blkno);
        LockBuffer(buffer, GIST_EXCLUSIVE);
        gistbufferinginserttuples(buildstate, buffer, level,
                                  &itup, 1, InvalidOffsetNumber,
                                  parentblkno, downlinkoffnum);
        /* gistbufferinginserttuples() released the buffer */
    }

    return result;
}

/*
 * Insert tuples to a given page.
 *
 * This is analogous to gistinserttuples() in the regular insertion code.
 *
 * Returns the block number of the page where the (first) new or updated tuple
 * was inserted. Usually that's the original page, but might be a sibling page
 * if the original page was split.
 *
 * Caller should hold a lock on 'buffer' on entry. This function will unlock
 * and unpin it.
 */
static BlockNumber
gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer, int level,
                          IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
                          BlockNumber parentblk, OffsetNumber downlinkoffnum)
{
    GISTBuildBuffers *gfbb = buildstate->gfbb;
    List       *splitinfo;
    bool        is_split;
    BlockNumber placed_to_blk = InvalidBlockNumber;

    is_split = gistplacetopage(buildstate->indexrel,
                               buildstate->freespace,
                               buildstate->giststate,
                               buffer,
                               itup, ntup, oldoffnum, &placed_to_blk,
                               InvalidBuffer,
                               &splitinfo,
                               false,
                               buildstate->heaprel, true);

    /*
     * If this is a root split, update the root path item kept in memory. This
     * ensures that all path stacks are always complete, including all parent
     * nodes up to the root. That simplifies the algorithm for re-finding the
     * correct parent.
     */
    if (is_split && BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO)
    {
        Page        page = BufferGetPage(buffer);
        OffsetNumber off;
        OffsetNumber maxoff;

        Assert(level == gfbb->rootlevel);
        gfbb->rootlevel++;

        elog(DEBUG2, "splitting GiST root page, now %d levels deep", gfbb->rootlevel);

        /*
         * All the downlinks on the old root page are now on one of the child
         * pages. Visit all the new child pages to memorize the parents of the
         * grandchildren.
         */
        if (gfbb->rootlevel > 1)
        {
            maxoff = PageGetMaxOffsetNumber(page);
            for (off = FirstOffsetNumber; off <= maxoff; off++)
            {
                ItemId      iid = PageGetItemId(page, off);
                IndexTuple  idxtuple = (IndexTuple) PageGetItem(page, iid);
                BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
                Buffer      childbuf = ReadBuffer(buildstate->indexrel, childblkno);

                LockBuffer(childbuf, GIST_SHARE);
                gistMemorizeAllDownlinks(buildstate, childbuf);
                UnlockReleaseBuffer(childbuf);

                /*
                 * Also remember that the parent of the new child page is the
                 * root block.
                 */
                gistMemorizeParent(buildstate, childblkno, GIST_ROOT_BLKNO);
            }
        }
    }

    if (splitinfo)
    {
        /*
         * Insert the downlinks to the parent. This is analogous to
         * gistfinishsplit() in the regular insertion code, but the locking is
         * simpler, and we have to maintain the buffers on internal nodes and
         * the parent map.
         */
        IndexTuple *downlinks;
        int         ndownlinks,
                    i;
        Buffer      parentBuffer;
        ListCell   *lc;

        /* Parent may have changed since we memorized this path. */
        parentBuffer =
            gistBufferingFindCorrectParent(buildstate,
                                           BufferGetBlockNumber(buffer),
                                           level,
                                           &parentblk,
                                           &downlinkoffnum);

        /*
         * If there's a buffer associated with this page, that needs to be
         * split too. gistRelocateBuildBuffersOnSplit() will also adjust the
         * downlinks in 'splitinfo', to make sure they're consistent not only
         * with the tuples already on the pages, but also the tuples in the
         * buffers that will eventually be inserted to them.
         */
        gistRelocateBuildBuffersOnSplit(gfbb,
                                        buildstate->giststate,
                                        buildstate->indexrel,
                                        level,
                                        buffer, splitinfo);

        /* Create an array of all the downlink tuples */
        ndownlinks = list_length(splitinfo);
        downlinks = (IndexTuple *) palloc(sizeof(IndexTuple) * ndownlinks);
        i = 0;
        foreach(lc, splitinfo)
        {
            GISTPageSplitInfo *splitinfo = lfirst(lc);

            /*
             * Remember the parent of each new child page in our parent map.
             * This assumes that the downlinks fit on the parent page. If the
             * parent page is split, too, when we recurse up to insert the
             * downlinks, the recursive gistbufferinginserttuples() call will
             * update the map again.
             */
            if (level > 0)
                gistMemorizeParent(buildstate,
                                   BufferGetBlockNumber(splitinfo->buf),
                                   BufferGetBlockNumber(parentBuffer));

            /*
             * Also update the parent map for all the downlinks that got moved
             * to a different page. (Actually this also loops through the
             * downlinks that stayed on the original page, but it does no
             * harm.)
             */
            if (level > 1)
                gistMemorizeAllDownlinks(buildstate, splitinfo->buf);

            /*
             * Since there's no concurrent access, we can release the lower
             * level buffers immediately. This includes the original page.
             */
            UnlockReleaseBuffer(splitinfo->buf);
            downlinks[i++] = splitinfo->downlink;
        }

        /* Insert them into parent. */
        gistbufferinginserttuples(buildstate, parentBuffer, level + 1,
                                  downlinks, ndownlinks, downlinkoffnum,
                                  InvalidBlockNumber, InvalidOffsetNumber);

        list_free_deep(splitinfo);  /* we don't need this anymore */
    }
    else
        UnlockReleaseBuffer(buffer);

    return placed_to_blk;
}

/*
 * Find the downlink pointing to a child page.
 *
 * 'childblkno' indicates the child page to find the parent for. 'level' is
 * the level of the child. On entry, *parentblkno and *downlinkoffnum can
 * point to a location where the downlink used to be - we will check that
 * location first, and save some cycles if it hasn't moved. The function
 * returns a buffer containing the downlink, exclusively-locked, and
 * *parentblkno and *downlinkoffnum are set to the real location of the
 * downlink.
 *
 * If the child page is a leaf (level == 0), the caller must supply a correct
 * parentblkno. Otherwise we use the parent map hash table to find the parent
 * block.
 *
 * This function serves the same purpose as gistFindCorrectParent() during
 * normal index inserts, but this is simpler because we don't need to deal
 * with concurrent inserts.
 */
static Buffer
gistBufferingFindCorrectParent(GISTBuildState *buildstate,
                               BlockNumber childblkno, int level,
                               BlockNumber *parentblkno,
                               OffsetNumber *downlinkoffnum)
{
    BlockNumber parent;
    Buffer      buffer;
    Page        page;
    OffsetNumber maxoff;
    OffsetNumber off;

    if (level > 0)
        parent = gistGetParent(buildstate, childblkno);
    else
    {
        /*
         * For a leaf page, the caller must supply a correct parent block
         * number.
         */
        if (*parentblkno == InvalidBlockNumber)
            elog(ERROR, "no parent buffer provided for child %u", childblkno);
        parent = *parentblkno;
    }

    buffer = ReadBuffer(buildstate->indexrel, parent);
    page = BufferGetPage(buffer);
    LockBuffer(buffer, GIST_EXCLUSIVE);
    gistcheckpage(buildstate->indexrel, buffer);
    maxoff = PageGetMaxOffsetNumber(page);

    /* Check if the downlink is still where it used to be */
    if (parent == *parentblkno && *parentblkno != InvalidBlockNumber &&
        *downlinkoffnum != InvalidOffsetNumber && *downlinkoffnum <= maxoff)
    {
        ItemId      iid = PageGetItemId(page, *downlinkoffnum);
        IndexTuple  idxtuple = (IndexTuple) PageGetItem(page, iid);

        if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno)
        {
            /* Still there */
            return buffer;
        }
    }

    /*
     * The downlink was not at the offset where it used to be. Scan the page
     * to find it. During normal gist insertions, it might've moved to another
     * page, to the right, but during a buffering build, we keep track of the
     * parent of each page in the lookup table so we should always know what
     * page it's on.
     */
    for (off = FirstOffsetNumber; off <= maxoff; off = OffsetNumberNext(off))
    {
        ItemId      iid = PageGetItemId(page, off);
        IndexTuple  idxtuple = (IndexTuple) PageGetItem(page, iid);

        if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno)
        {
            /* found it */
            *downlinkoffnum = off;
            return buffer;
        }
    }

    elog(ERROR, "failed to re-find parent for block %u", childblkno);
    return InvalidBuffer;       /* keep compiler quiet */
}

/*
 * Process the buffer emptying queue. Emptying one buffer can cause other
 * buffers to require emptying. This function iterates until the cascading
 * emptying process is finished, i.e. until the buffer emptying queue is
 * empty.
 */
static void
gistProcessEmptyingQueue(GISTBuildState *buildstate)
{
    GISTBuildBuffers *gfbb = buildstate->gfbb;

    /* Iterate while we have elements in the buffer emptying queue. */
    while (gfbb->bufferEmptyingQueue != NIL)
    {
        GISTNodeBuffer *emptyingNodeBuffer;

        /* Get a node buffer from the emptying queue. */
        emptyingNodeBuffer = (GISTNodeBuffer *) linitial(gfbb->bufferEmptyingQueue);
        gfbb->bufferEmptyingQueue = list_delete_first(gfbb->bufferEmptyingQueue);
        emptyingNodeBuffer->queuedForEmptying = false;

        /*
         * We are going to load the last pages of the buffers that the
         * emptying will write to, so unload any previously loaded buffers
         * first.
         */
        gistUnloadNodeBuffers(gfbb);

        /*
         * Pop tuples from the buffer and run them down to the buffers at
         * lower level, or leaf pages. We continue until one of the lower
         * level buffers fills up, or this buffer runs empty.
         *
         * In Arge et al's paper, the buffer emptying is stopped after
         * processing 1/2 node buffer worth of tuples, to avoid overfilling
         * any of the lower level buffers. However, it's more efficient to
         * keep going until one of the lower level buffers actually fills up,
         * so that's what we do. This doesn't need to be exact, if a buffer
         * overfills by a few tuples, there's no harm done.
         */
        while (true)
        {
            IndexTuple  itup;

            /* Get next index tuple from the buffer */
            if (!gistPopItupFromNodeBuffer(gfbb, emptyingNodeBuffer, &itup))
                break;

            /*
             * Run it down to the underlying node buffer or leaf page.
             *
             * Note: it's possible that the buffer we're emptying splits as a
             * result of this call. If that happens, our emptyingNodeBuffer
             * points to the left half of the split. After split, it's very
             * likely that the new left buffer is no longer over the half-full
             * threshold, but we might as well keep flushing tuples from it
             * until we fill a lower-level buffer.
             */
            if (gistProcessItup(buildstate, itup, emptyingNodeBuffer->nodeBlocknum, emptyingNodeBuffer->level))
            {
                /*
                 * A lower level buffer filled up. Stop emptying this buffer,
                 * to avoid overflowing the lower level buffer.
                 */
                break;
            }

            /* Free all the memory allocated during index tuple processing */
            MemoryContextReset(buildstate->giststate->tempCxt);
        }
    }
}

/*
 * Empty all node buffers, from top to bottom. This is done at the end of
 * index build to flush all remaining tuples to the index.
 *
 * Note: This destroys the buffersOnLevels lists, so no more tuples should be
 * inserted into the buffers after this call.
 */
static void
gistEmptyAllBuffers(GISTBuildState *buildstate)
{
    GISTBuildBuffers *gfbb = buildstate->gfbb;
    MemoryContext oldCtx;
    int         i;

    oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);

    /*
     * Iterate through the levels from top to bottom.
     */
    for (i = gfbb->buffersOnLevelsLen - 1; i >= 0; i--)
    {
        /*
         * Empty all buffers on this level. Note that new buffers can pop up
         * in the list during the processing, as a result of page splits, so a
         * simple walk through the list won't work. We remove buffers from the
         * list when we see them empty; a buffer can't become non-empty once
         * it's been fully emptied.
         */
        while (gfbb->buffersOnLevels[i] != NIL)
        {
            GISTNodeBuffer *nodeBuffer;

            nodeBuffer = (GISTNodeBuffer *) linitial(gfbb->buffersOnLevels[i]);

            if (nodeBuffer->blocksCount != 0)
            {
                /*
                 * Add this buffer to the emptying queue, and proceed to empty
                 * the queue.
                 */
                if (!nodeBuffer->queuedForEmptying)
                {
                    MemoryContextSwitchTo(gfbb->context);
                    nodeBuffer->queuedForEmptying = true;
                    gfbb->bufferEmptyingQueue =
                        lcons(nodeBuffer, gfbb->bufferEmptyingQueue);
                    MemoryContextSwitchTo(buildstate->giststate->tempCxt);
                }
                gistProcessEmptyingQueue(buildstate);
            }
            else
                gfbb->buffersOnLevels[i] =
                    list_delete_first(gfbb->buffersOnLevels[i]);
        }
        elog(DEBUG2, "emptied all buffers at level %d", i);
    }
    MemoryContextSwitchTo(oldCtx);
}

/*
 * Get the depth of the GiST index.
 */
static int
gistGetMaxLevel(Relation index)
{
    int         maxLevel;
    BlockNumber blkno;

    /*
     * Traverse down the tree, starting from the root, until we hit the leaf
     * level.
     */
    maxLevel = 0;
    blkno = GIST_ROOT_BLKNO;
    while (true)
    {
        Buffer      buffer;
        Page        page;
        IndexTuple  itup;

        buffer = ReadBuffer(index, blkno);

        /*
         * There's no concurrent access during index build, so locking is just
         * pro forma.
         */
        LockBuffer(buffer, GIST_SHARE);
        page = (Page) BufferGetPage(buffer);

        if (GistPageIsLeaf(page))
        {
            /* We hit the bottom, so we're done. */
            UnlockReleaseBuffer(buffer);
            break;
        }

        /*
         * Pick the first downlink on the page, and follow it. It doesn't
         * matter which downlink we choose; the tree has the same depth
         * everywhere, so we just pick the first one.
         */
        itup = (IndexTuple) PageGetItem(page,
                                        PageGetItemId(page, FirstOffsetNumber));
        blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
        UnlockReleaseBuffer(buffer);

        /*
         * We descended one level, which means the tree has at least one more
         * level below the one we just left.
         */
        maxLevel++;
    }
    return maxLevel;
}


/*
 * Routines for managing the parent map.
 *
 * Whenever a page is split, we need to insert the downlinks into the parent.
 * We need to somehow find the parent page to do that. In normal insertions,
 * we keep a stack of nodes visited when we descend the tree. However, in
 * buffering build, we can start descending the tree from any internal node,
 * when we empty a buffer by cascading tuples to its children. So we don't
 * have a full stack up to the root available at that time.
 *
 * So instead, we maintain a hash table to track the parent of every internal
 * page. We don't need to track the parents of leaf nodes, however. Whenever
 * we insert to a leaf, we've just descended down from its parent, so we know
 * its immediate parent already. This helps a lot to limit the memory used
 * by this hash table.
 *
 * Whenever an internal node is split, the parent map needs to be updated:
 * the parent of the new child page needs to be recorded, and the entries
 * for all pages whose downlinks are moved to the new page at the split need
 * to be updated as well.
 *
 * We also update the parent map whenever we descend the tree. That might seem
 * unnecessary, because we maintain the map whenever a downlink is moved or
 * created, but it is needed because we switch to buffering mode after
 * creating a tree with regular index inserts. Any pages created before
 * switching to buffering mode will not be present in the parent map initially,
 * but will be added there the first time we visit them.
 */
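
/*
 * The map is cheap to maintain: there is one entry per internal page, and
 * each entry stores just a pair of block numbers (see ParentMapEntry below).
 * Internal pages are typically only a small fraction of the index.
 */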

typedef struct
{
    BlockNumber childblkno;     /* hash key */
    BlockNumber parentblkno;
} ParentMapEntry;

static void
gistInitParentMap(GISTBuildState *buildstate)
{
    HASHCTL     hashCtl;

    hashCtl.keysize = sizeof(BlockNumber);
    hashCtl.entrysize = sizeof(ParentMapEntry);
    hashCtl.hcxt = CurrentMemoryContext;
    buildstate->parentMap = hash_create("gistbuild parent map",
                                        1024,
                                        &hashCtl,
                                        HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
}

static void
gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child, BlockNumber parent)
{
    ParentMapEntry *entry;
    bool        found;

    entry = (ParentMapEntry *) hash_search(buildstate->parentMap,
                                           (const void *) &child,
                                           HASH_ENTER,
                                           &found);
    entry->parentblkno = parent;
}

/*
 * Scan all downlinks on a page, and memorize their parent.
 */
static void
gistMemorizeAllDownlinks(GISTBuildState *buildstate, Buffer parentbuf)
{
    OffsetNumber maxoff;
    OffsetNumber off;
    BlockNumber parentblkno = BufferGetBlockNumber(parentbuf);
    Page        page = BufferGetPage(parentbuf);

    Assert(!GistPageIsLeaf(page));

    maxoff = PageGetMaxOffsetNumber(page);
    for (off = FirstOffsetNumber; off <= maxoff; off++)
    {
        ItemId      iid = PageGetItemId(page, off);
        IndexTuple  idxtuple = (IndexTuple) PageGetItem(page, iid);
        BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));

        gistMemorizeParent(buildstate, childblkno, parentblkno);
    }
}

static BlockNumber
gistGetParent(GISTBuildState *buildstate, BlockNumber child)
{
    ParentMapEntry *entry;
    bool        found;

    /* Look up the parent block in the hash table */
    entry = (ParentMapEntry *) hash_search(buildstate->parentMap,
                                           (const void *) &child,
                                           HASH_FIND,
                                           &found);
    if (!found)
        elog(ERROR, "could not find parent of block %u in lookup table", child);

    return entry->parentblkno;
}