nbtsplitloc.c source code [PostgreSQL/src/backend/access/nbtree/nbtsplitloc.c]

1	/-------------------------------------------------------------------------*
2	*
3	* nbtsplitloc.c
4	* Choose split point code for Postgres btree implementation.
5	*
6	* Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7	* Portions Copyright (c) 1994, Regents of the University of California
8	*
9	*
10	* IDENTIFICATION
11	* src/backend/access/nbtree/nbtsplitloc.c
12	*
13	*-------------------------------------------------------------------------
14	*/
15	#include "postgres.h"
16
17	#include "access/nbtree.h"
18	#include "storage/lmgr.h"
19
20	/ limits on split interval (default strategy only) /
21	#define MAX_LEAF_INTERVAL 9
22	#define MAX_INTERNAL_INTERVAL 18
23
24	typedef enum
25	{
26	/ strategy for searching through materialized list of split points /
27	SPLIT_DEFAULT, / give some weight to truncation /
28	SPLIT_MANY_DUPLICATES, / find minimally distinguishing point /
29	SPLIT_SINGLE_VALUE / leave left page almost full /
30	} FindSplitStrat;
31
32	typedef struct
33	{
34	/ details of free space left by split /
35	int16 curdelta; / current leftfree/rightfree delta /
36	int16 leftfree; / space left on left page post-split /
37	int16 rightfree; / space left on right page post-split /
38
39	/ split point identifying fields (returned by _bt_findsplitloc) /
40	OffsetNumber firstoldonright; / first item on new right page /
41	bool newitemonleft; / new item goes on left, or right? /
42
43	} SplitPoint;
44
45	typedef struct
46	{
47	/ context data for _bt_recsplitloc /
48	Relation rel; / index relation /
49	Page page; / page undergoing split /
50	IndexTuple newitem; / new item (cause of page split) /
51	Size newitemsz; / size of newitem (includes line pointer) /
52	bool is_leaf; / T if splitting a leaf page /
53	bool is_rightmost; / T if splitting rightmost page on level /
54	OffsetNumber newitemoff; / where the new item is to be inserted /
55	int leftspace; / space available for items on left page /
56	int rightspace; / space available for items on right page /
57	int olddataitemstotal; / space taken by old items /
58	Size minfirstrightsz; / smallest firstoldonright tuple size /
59
60	/ candidate split point data /
61	int maxsplits; / maximum number of splits /
62	int nsplits; / current number of splits /
63	SplitPoint splits; /* all candidate split points for page /
64	int interval; / current range of acceptable split points /
65	} FindSplitData;
66
67	static void _bt_recsplitloc(FindSplitData *state,
68	OffsetNumber firstoldonright, bool newitemonleft,
69	int olddataitemstoleft, Size firstoldonrightsz);
70	static void _bt_deltasortsplits(FindSplitData state, double* fillfactormult,
71	bool usemult);
72	static int _bt_splitcmp(const void arg1, const* void *arg2);
73	static bool _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff,
74	int leaffillfactor, bool *usemult);
75	static bool _bt_adjacenthtid(ItemPointer lowhtid, ItemPointer highhtid);
76	static OffsetNumber _bt_bestsplitloc(FindSplitData state, int* perfectpenalty,
77	bool *newitemonleft, FindSplitStrat strategy);
78	static int _bt_strategy(FindSplitData state, SplitPoint leftpage,
79	SplitPoint rightpage, FindSplitStrat strategy);
80	static void _bt_interval_edges(FindSplitData *state,
81	SplitPoint leftinterval, SplitPoint rightinterval);
82	static inline int _bt_split_penalty(FindSplitData state, SplitPoint split);
83	static inline IndexTuple _bt_split_lastleft(FindSplitData *state,
84	SplitPoint *split);
85	static inline IndexTuple _bt_split_firstright(FindSplitData *state,
86	SplitPoint *split);
87
88
89	/*
90	* _bt_findsplitloc() -- find an appropriate place to split a page.
91	*
92	* The main goal here is to equalize the free space that will be on each
93	* split page, after accounting for the inserted tuple. (If we fail to
94	* account for it, we might find ourselves with too little room on the page
95	* that it needs to go into!)
96	*
97	* If the page is the rightmost page on its level, we instead try to arrange
98	* to leave the left split page fillfactor% full. In this way, when we are
99	* inserting successively increasing keys (consider sequences, timestamps,
100	* etc) we will end up with a tree whose pages are about fillfactor% full,
101	* instead of the 50% full result that we'd get without this special case.
102	* This is the same as nbtsort.c produces for a newly-created tree. Note
103	* that leaf and nonleaf pages use different fillfactors. Note also that
104	* there are a number of further special cases where fillfactor is not
105	* applied in the standard way.
106	*
107	* We are passed the intended insert position of the new tuple, expressed as
108	* the offsetnumber of the tuple it must go in front of (this could be
109	* maxoff+1 if the tuple is to go at the end). The new tuple itself is also
110	* passed, since it's needed to give some weight to how effective suffix
111	* truncation will be. The implementation picks the split point that
112	* maximizes the effectiveness of suffix truncation from a small list of
113	* alternative candidate split points that leave each side of the split with
114	* about the same share of free space. Suffix truncation is secondary to
115	* equalizing free space, except in cases with large numbers of duplicates.
116	* Note that it is always assumed that caller goes on to perform truncation,
117	* even with pg_upgrade'd indexes where that isn't actually the case
118	* (!heapkeyspace indexes). See nbtree/README for more information about
119	* suffix truncation.
120	*
121	* We return the index of the first existing tuple that should go on the
122	* righthand page, plus a boolean indicating whether the new tuple goes on
123	* the left or right page. The bool is necessary to disambiguate the case
124	* where firstright == newitemoff.
125	*/
126	OffsetNumber
127	_bt_findsplitloc(Relation rel,
128	Page page,
129	OffsetNumber newitemoff,
130	Size newitemsz,
131	IndexTuple newitem,
132	bool *newitemonleft)
133	{
134	BTPageOpaque opaque;
135	int leftspace,
136	rightspace,
137	olddataitemstotal,
138	olddataitemstoleft,
139	perfectpenalty,
140	leaffillfactor;
141	FindSplitData state;
142	FindSplitStrat strategy;
143	ItemId itemid;
144	OffsetNumber offnum,
145	maxoff,
146	foundfirstright;
147	double fillfactormult;
148	bool usemult;
149	SplitPoint leftpage,
150	rightpage;
151
152	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
153	maxoff = PageGetMaxOffsetNumber(page);
154
155	/ Total free space available on a btree page, after fixed overhead /
156	leftspace = rightspace =
157	PageGetPageSize(page) - SizeOfPageHeaderData -
158	MAXALIGN(sizeof(BTPageOpaqueData));
159
160	/ The right page will have the same high key as the old page /
161	if (!P_RIGHTMOST(opaque))
162	{
163	itemid = PageGetItemId(page, P_HIKEY);
164	rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) +
165	sizeof(ItemIdData));
166	}
167
168	/ Count up total space in data items before actually scanning 'em /
169	olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(page);
170	leaffillfactor = RelationGetFillFactor(rel, BTREE_DEFAULT_FILLFACTOR);
171
172	/ Passed-in newitemsz is MAXALIGNED but does not include line pointer /
173	newitemsz += sizeof(ItemIdData);
174	state.rel = rel;
175	state.page = page;
176	state.newitem = newitem;
177	state.newitemsz = newitemsz;
178	state.is_leaf = P_ISLEAF(opaque);
179	state.is_rightmost = P_RIGHTMOST(opaque);
180	state.leftspace = leftspace;
181	state.rightspace = rightspace;
182	state.olddataitemstotal = olddataitemstotal;
183	state.minfirstrightsz = SIZE_MAX;
184	state.newitemoff = newitemoff;
185
186	/*
187	* maxsplits should never exceed maxoff because there will be at most as
188	* many candidate split points as there are points _between_ tuples, once
189	* you imagine that the new item is already on the original page (the
190	* final number of splits may be slightly lower because not all points
191	* between tuples will be legal).
192	*/
193	state.maxsplits = maxoff;
194	state.splits = palloc(sizeof(SplitPoint) * state.maxsplits);
195	state.nsplits = `0`;
196
197	/*
198	* Scan through the data items and calculate space usage for a split at
199	* each possible position
200	*/
201	olddataitemstoleft = `0`;
202
203	for (offnum = P_FIRSTDATAKEY(opaque);
204	offnum <= maxoff;
205	offnum = OffsetNumberNext(offnum))
206	{
207	Size itemsz;
208
209	itemid = PageGetItemId(page, offnum);
210	itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
211
212	/*
213	* When item offset number is not newitemoff, neither side of the
214	* split can be newitem. Record a split after the previous data item
215	* from original page, but before the current data item from original
216	* page. (_bt_recsplitloc() will reject the split when there are no
217	* previous items, which we rely on.)
218	*/
219	if (offnum < newitemoff)
220	_bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz);
221	else if (offnum > newitemoff)
222	_bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz);
223	else
224	{
225	/*
226	* Record a split after all "offnum < newitemoff" original page
227	* data items, but before newitem
228	*/
229	_bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz);
230
231	/*
232	* Record a split after newitem, but before data item from
233	* original page at offset newitemoff/current offset
234	*/
235	_bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz);
236	}
237
238	olddataitemstoleft += itemsz;
239	}
240
241	/*
242	* Record a split after all original page data items, but before newitem.
243	* (Though only when it's possible that newitem will end up alone on new
244	* right page.)
245	*/
246	Assert(olddataitemstoleft == olddataitemstotal);
247	if (newitemoff > maxoff)
248	_bt_recsplitloc(&state, newitemoff, false, olddataitemstotal, `0`);
249
250	/*
251	* I believe it is not possible to fail to find a feasible split, but just
252	* in case ...
253	*/
254	if (state.nsplits == `0`)
255	elog(ERROR, "could not find a feasible split point for index \"%s\"",
256	RelationGetRelationName(rel));
257
258	/*
259	* Start search for a split point among list of legal split points. Give
260	* primary consideration to equalizing available free space in each half
261	* of the split initially (start with default strategy), while applying
262	* rightmost and split-after-new-item optimizations where appropriate.
263	* Either of the two other fallback strategies may be required for cases
264	* with a large number of duplicates around the original/space-optimal
265	* split point.
266	*
267	* Default strategy gives some weight to suffix truncation in deciding a
268	* split point on leaf pages. It attempts to select a split point where a
269	* distinguishing attribute appears earlier in the new high key for the
270	* left side of the split, in order to maximize the number of trailing
271	* attributes that can be truncated away. Only candidate split points
272	* that imply an acceptable balance of free space on each side are
273	* considered.
274	*/
275	if (!state.is_leaf)
276	{
277	/ fillfactormult only used on rightmost page /
278	usemult = state.is_rightmost;
279	fillfactormult = BTREE_NONLEAF_FILLFACTOR / `100.0`;
280	}
281	else if (state.is_rightmost)
282	{
283	/ Rightmost leaf page -- fillfactormult always used /
284	usemult = true;
285	fillfactormult = leaffillfactor / `100.0`;
286	}
287	else if (_bt_afternewitemoff(&state, maxoff, leaffillfactor, &usemult))
288	{
289	/*
290	* New item inserted at rightmost point among a localized grouping on
291	* a leaf page -- apply "split after new item" optimization, either by
292	* applying leaf fillfactor multiplier, or by choosing the exact split
293	* point that leaves the new item as last on the left. (usemult is set
294	* for us.)
295	*/
296	if (usemult)
297	{
298	/ fillfactormult should be set based on leaf fillfactor /
299	fillfactormult = leaffillfactor / `100.0`;
300	}
301	else
302	{
303	/ find precise split point after newitemoff /
304	for (int i = `0`; i < state.nsplits; i++)
305	{
306	SplitPoint *split = state.splits + i;
307
308	if (split->newitemonleft &&
309	newitemoff == split->firstoldonright)
310	{
311	pfree(state.splits);
312	*newitemonleft = true;
313	return newitemoff;
314	}
315	}
316
317	/*
318	* Cannot legally split after newitemoff; proceed with split
319	* without using fillfactor multiplier. This is defensive, and
320	* should never be needed in practice.
321	*/
322	fillfactormult = `0.50`;
323	}
324	}
325	else
326	{
327	/ Other leaf page. 50:50 page split. /
328	usemult = false;
329	/ fillfactormult not used, but be tidy /
330	fillfactormult = `0.50`;
331	}
332
333	/*
334	* Set an initial limit on the split interval/number of candidate split
335	* points as appropriate. The "Prefix B-Trees" paper refers to this as
336	* sigma l for leaf splits and sigma b for internal ("branch") splits.
337	* It's hard to provide a theoretical justification for the initial size
338	* of the split interval, though it's clear that a small split interval
339	* makes suffix truncation much more effective without noticeably
340	* affecting space utilization over time.
341	*/
342	state.interval = Min(Max(`1`, state.nsplits * `0.05`),
343	state.is_leaf ? MAX_LEAF_INTERVAL :
344	MAX_INTERNAL_INTERVAL);
345
346	/*
347	* Save leftmost and rightmost splits for page before original ordinal
348	* sort order is lost by delta/fillfactormult sort
349	*/
350	leftpage = state.splits[`0`];
351	rightpage = state.splits[state.nsplits - `1`];
352
353	/ Give split points a fillfactormult-wise delta, and sort on deltas /
354	_bt_deltasortsplits(&state, fillfactormult, usemult);
355
356	/*
357	* Determine if default strategy/split interval will produce a
358	* sufficiently distinguishing split, or if we should change strategies.
359	* Alternative strategies change the range of split points that are
360	* considered acceptable (split interval), and possibly change
361	* fillfactormult, in order to deal with pages with a large number of
362	* duplicates gracefully.
363	*
364	* Pass low and high splits for the entire page (actually, they're for an
365	* imaginary version of the page that includes newitem). These are used
366	* when the initial split interval encloses split points that are full of
367	* duplicates, and we need to consider if it's even possible to avoid
368	* appending a heap TID.
369	*/
370	perfectpenalty = _bt_strategy(&state, &leftpage, &rightpage, &strategy);
371
372	if (strategy == SPLIT_DEFAULT)
373	{
374	/*
375	* Default strategy worked out (always works out with internal page).
376	* Original split interval still stands.
377	*/
378	}
379
380	/*
381	* Many duplicates strategy is used when a heap TID would otherwise be
382	* appended, but the page isn't completely full of logical duplicates.
383	*
384	* The split interval is widened to include all legal candidate split
385	* points. There might be a few as two distinct values in the whole-page
386	* split interval, though it's also possible that most of the values on
387	* the page are unique. The final split point will either be to the
388	* immediate left or to the immediate right of the group of duplicate
389	* tuples that enclose the first/delta-optimal split point (perfect
390	* penalty was set so that the lowest delta split point that avoids
391	* appending a heap TID will be chosen). Maximizing the number of
392	* attributes that can be truncated away is not a goal of the many
393	* duplicates strategy.
394	*
395	* Single value strategy is used when it is impossible to avoid appending
396	* a heap TID. It arranges to leave the left page very full. This
397	* maximizes space utilization in cases where tuples with the same
398	* attribute values span many pages. Newly inserted duplicates will tend
399	* to have higher heap TID values, so we'll end up splitting to the right
400	* consistently. (Single value strategy is harmless though not
401	* particularly useful with !heapkeyspace indexes.)
402	*/
403	else if (strategy == SPLIT_MANY_DUPLICATES)
404	{
405	Assert(state.is_leaf);
406	/ Shouldn't try to truncate away extra user attributes /
407	Assert(perfectpenalty ==
408	IndexRelationGetNumberOfKeyAttributes(state.rel));
409	/ No need to resort splits -- no change in fillfactormult/deltas /
410	state.interval = state.nsplits;
411	}
412	else if (strategy == SPLIT_SINGLE_VALUE)
413	{
414	Assert(state.is_leaf);
415	/ Split near the end of the page /
416	usemult = true;
417	fillfactormult = BTREE_SINGLEVAL_FILLFACTOR / `100.0`;
418	/ Resort split points with new delta /
419	_bt_deltasortsplits(&state, fillfactormult, usemult);
420	/ Appending a heap TID is unavoidable, so interval of 1 is fine /
421	state.interval = `1`;
422	}
423
424	/*
425	* Search among acceptable split points (using final split interval) for
426	* the entry that has the lowest penalty, and is therefore expected to
427	* maximize fan-out. Sets *newitemonleft for us.
428	*/
429	foundfirstright = _bt_bestsplitloc(&state, perfectpenalty, newitemonleft,
430	strategy);
431	pfree(state.splits);
432
433	return foundfirstright;
434	}
435
436	/*
437	* Subroutine to record a particular point between two tuples (possibly the
438	* new item) on page (ie, combination of firstright and newitemonleft
439	* settings) in *state for later analysis. This is also a convenient point
440	* to check if the split is legal (if it isn't, it won't be recorded).
441	*
442	* firstoldonright is the offset of the first item on the original page that
443	* goes to the right page, and firstoldonrightsz is the size of that tuple.
444	* firstoldonright can be > max offset, which means that all the old items go
445	* to the left page and only the new item goes to the right page. In that
446	* case, firstoldonrightsz is not used.
447	*
448	* olddataitemstoleft is the total size of all old items to the left of the
449	* split point that is recorded here when legal. Should not include
450	* newitemsz, since that is handled here.
451	*/
452	static void
453	_bt_recsplitloc(FindSplitData *state,
454	OffsetNumber firstoldonright,
455	bool newitemonleft,
456	int olddataitemstoleft,
457	Size firstoldonrightsz)
458	{
459	int16 leftfree,
460	rightfree;
461	Size firstrightitemsz;
462	bool newitemisfirstonright;
463
464	/ Is the new item going to be the first item on the right page? /
465	newitemisfirstonright = (firstoldonright == state->newitemoff
466	&& !newitemonleft);
467
468	if (newitemisfirstonright)
469	firstrightitemsz = state->newitemsz;
470	else
471	firstrightitemsz = firstoldonrightsz;
472
473	/ Account for all the old tuples /
474	leftfree = state->leftspace - olddataitemstoleft;
475	rightfree = state->rightspace -
476	(state->olddataitemstotal - olddataitemstoleft);
477
478	/*
479	* The first item on the right page becomes the high key of the left page;
480	* therefore it counts against left space as well as right space (we
481	* cannot assume that suffix truncation will make it any smaller). When
482	* index has included attributes, then those attributes of left page high
483	* key will be truncated leaving that page with slightly more free space.
484	* However, that shouldn't affect our ability to find valid split
485	* location, since we err in the direction of being pessimistic about free
486	* space on the left half. Besides, even when suffix truncation of
487	* non-TID attributes occurs, the new high key often won't even be a
488	* single MAXALIGN() quantum smaller than the firstright tuple it's based
489	* on.
490	*
491	* If we are on the leaf level, assume that suffix truncation cannot avoid
492	* adding a heap TID to the left half's new high key when splitting at the
493	* leaf level. In practice the new high key will often be smaller and
494	* will rarely be larger, but conservatively assume the worst case.
495	*/
496	if (state->is_leaf)
497	leftfree -= (int16) (firstrightitemsz +
498	MAXALIGN(sizeof(ItemPointerData)));
499	else
500	leftfree -= (int16) firstrightitemsz;
501
502	/ account for the new item /
503	if (newitemonleft)
504	leftfree -= (int16) state->newitemsz;
505	else
506	rightfree -= (int16) state->newitemsz;
507
508	/*
509	* If we are not on the leaf level, we will be able to discard the key
510	* data from the first item that winds up on the right page.
511	*/
512	if (!state->is_leaf)
513	rightfree += (int16) firstrightitemsz -
514	(int16) (MAXALIGN(sizeof(IndexTupleData)) + sizeof(ItemIdData));
515
516	/ Record split if legal /
517	if (leftfree >= `0` && rightfree >= `0`)
518	{
519	Assert(state->nsplits < state->maxsplits);
520
521	/ Determine smallest firstright item size on page /
522	state->minfirstrightsz = Min(state->minfirstrightsz, firstrightitemsz);
523
524	state->splits[state->nsplits].curdelta = `0`;
525	state->splits[state->nsplits].leftfree = leftfree;
526	state->splits[state->nsplits].rightfree = rightfree;
527	state->splits[state->nsplits].firstoldonright = firstoldonright;
528	state->splits[state->nsplits].newitemonleft = newitemonleft;
529	state->nsplits++;
530	}
531	}
532
533	/*
534	* Subroutine to assign space deltas to materialized array of candidate split
535	* points based on current fillfactor, and to sort array using that fillfactor
536	*/
537	static void
538	_bt_deltasortsplits(FindSplitData state, double* fillfactormult,
539	bool usemult)
540	{
541	for (int i = `0`; i < state->nsplits; i++)
542	{
543	SplitPoint *split = state->splits + i;
544	int16 delta;
545
546	if (usemult)
547	delta = fillfactormult * split->leftfree -
548	(`1.0` - fillfactormult) * split->rightfree;
549	else
550	delta = split->leftfree - split->rightfree;
551
552	if (delta < `0`)
553	delta = -delta;
554
555	/ Save delta /
556	split->curdelta = delta;
557	}
558
559	qsort(state->splits, state->nsplits, sizeof(SplitPoint), _bt_splitcmp);
560	}
561
562	/*
563	* qsort-style comparator used by _bt_deltasortsplits()
564	*/
565	static int
566	_bt_splitcmp(const void arg1, const* void *arg2)
567	{
568	SplitPoint split1 = (SplitPoint ) arg1;
569	SplitPoint split2 = (SplitPoint ) arg2;
570
571	if (split1->curdelta > split2->curdelta)
572	return `1`;
573	if (split1->curdelta < split2->curdelta)
574	return -`1`;
575
576	return `0`;
577	}
578
579	/*
580	* Subroutine to determine whether or not a non-rightmost leaf page should be
581	* split immediately after the would-be original page offset for the
582	* new/incoming tuple (or should have leaf fillfactor applied when new item is
583	* to the right on original page). This is appropriate when there is a
584	* pattern of localized monotonically increasing insertions into a composite
585	* index, where leading attribute values form local groupings, and we
586	* anticipate further insertions of the same/current grouping (new item's
587	* grouping) in the near future. This can be thought of as a variation on
588	* applying leaf fillfactor during rightmost leaf page splits, since cases
589	* that benefit will converge on packing leaf pages leaffillfactor% full over
590	* time.
591	*
592	* We may leave extra free space remaining on the rightmost page of a "most
593	* significant column" grouping of tuples if that grouping never ends up
594	* having future insertions that use the free space. That effect is
595	* self-limiting; a future grouping that becomes the "nearest on the right"
596	* grouping of the affected grouping usually puts the extra free space to good
597	* use.
598	*
599	* Caller uses optimization when routine returns true, though the exact action
600	* taken by caller varies. Caller uses original leaf page fillfactor in
601	* standard way rather than using the new item offset directly when *usemult
602	* was also set to true here. Otherwise, caller applies optimization by
603	* locating the legal split point that makes the new tuple the very last tuple
604	* on the left side of the split.
605	*/
606	static bool
607	_bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff,
608	int leaffillfactor, bool *usemult)
609	{
610	int16 nkeyatts;
611	ItemId itemid;
612	IndexTuple tup;
613	int keepnatts;
614
615	Assert(state->is_leaf && !state->is_rightmost);
616
617	nkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel);
618
619	/ Single key indexes not considered here /
620	if (nkeyatts == `1`)
621	return false;
622
623	/ Ascending insertion pattern never inferred when new item is first /
624	if (state->newitemoff == P_FIRSTKEY)
625	return false;
626
627	/*
628	* Only apply optimization on pages with equisized tuples, since ordinal
629	* keys are likely to be fixed-width. Testing if the new tuple is
630	* variable width directly might also work, but that fails to apply the
631	* optimization to indexes with a numeric_ops attribute.
632	*
633	* Conclude that page has equisized tuples when the new item is the same
634	* width as the smallest item observed during pass over page, and other
635	* non-pivot tuples must be the same width as well. (Note that the
636	* possibly-truncated existing high key isn't counted in
637	* olddataitemstotal, and must be subtracted from maxoff.)
638	*/
639	if (state->newitemsz != state->minfirstrightsz)
640	return false;
641	if (state->newitemsz * (maxoff - `1`) != state->olddataitemstotal)
642	return false;
643
644	/*
645	* Avoid applying optimization when tuples are wider than a tuple
646	* consisting of two non-NULL int8/int64 attributes (or four non-NULL
647	* int4/int32 attributes)
648	*/
649	if (state->newitemsz >
650	MAXALIGN(sizeof(IndexTupleData) + sizeof(int64) * `2`) +
651	sizeof(ItemIdData))
652	return false;
653
654	/*
655	* At least the first attribute's value must be equal to the corresponding
656	* value in previous tuple to apply optimization. New item cannot be a
657	* duplicate, either.
658	*
659	* Handle case where new item is to the right of all items on the existing
660	* page. This is suggestive of monotonically increasing insertions in
661	* itself, so the "heap TID adjacency" test is not applied here.
662	*/
663	if (state->newitemoff > maxoff)
664	{
665	itemid = PageGetItemId(state->page, maxoff);
666	tup = (IndexTuple) PageGetItem(state->page, itemid);
667	keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem);
668
669	if (keepnatts > `1` && keepnatts <= nkeyatts)
670	{
671	*usemult = true;
672	return true;
673	}
674
675	return false;
676	}
677
678	/*
679	* "Low cardinality leading column, high cardinality suffix column"
680	* indexes with a random insertion pattern (e.g., an index with a boolean
681	* column, such as an index on '(book_is_in_print, book_isbn)') present us
682	* with a risk of consistently misapplying the optimization. We're
683	* willing to accept very occasional misapplication of the optimization,
684	* provided the cases where we get it wrong are rare and self-limiting.
685	*
686	* Heap TID adjacency strongly suggests that the item just to the left was
687	* inserted very recently, which limits overapplication of the
688	* optimization. Besides, all inappropriate cases triggered here will
689	* still split in the middle of the page on average.
690	*/
691	itemid = PageGetItemId(state->page, OffsetNumberPrev(state->newitemoff));
692	tup = (IndexTuple) PageGetItem(state->page, itemid);
693	/ Do cheaper test first /
694	if (!_bt_adjacenthtid(&tup->t_tid, &state->newitem->t_tid))
695	return false;
696	/ Check same conditions as rightmost item case, too /
697	keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem);
698
699	if (keepnatts > `1` && keepnatts <= nkeyatts)
700	{
701	double interp = (double) state->newitemoff / ((double) maxoff + `1`);
702	double leaffillfactormult = (double) leaffillfactor / `100.0`;
703
704	/*
705	* Don't allow caller to split after a new item when it will result in
706	* a split point to the right of the point that a leaf fillfactor
707	* split would use -- have caller apply leaf fillfactor instead
708	*/
709	*usemult = interp > leaffillfactormult;
710
711	return true;
712	}
713
714	return false;
715	}
716
717	/*
718	* Subroutine for determining if two heap TIDS are "adjacent".
719	*
720	* Adjacent means that the high TID is very likely to have been inserted into
721	* heap relation immediately after the low TID, probably during the current
722	* transaction.
723	*/
724	static bool
725	_bt_adjacenthtid(ItemPointer lowhtid, ItemPointer highhtid)
726	{
727	BlockNumber lowblk,
728	highblk;
729
730	lowblk = ItemPointerGetBlockNumber(lowhtid);
731	highblk = ItemPointerGetBlockNumber(highhtid);
732
733	/ Make optimistic assumption of adjacency when heap blocks match /
734	if (lowblk == highblk)
735	return true;
736
737	/ When heap block one up, second offset should be FirstOffsetNumber /
738	if (lowblk + `1` == highblk &&
739	ItemPointerGetOffsetNumber(highhtid) == FirstOffsetNumber)
740	return true;
741
742	return false;
743	}
744
745	/*
746	* Subroutine to find the "best" split point among candidate split points.
747	* The best split point is the split point with the lowest penalty among split
748	* points that fall within current/final split interval. Penalty is an
749	* abstract score, with a definition that varies depending on whether we're
750	* splitting a leaf page or an internal page. See _bt_split_penalty() for
751	* details.
752	*
753	* "perfectpenalty" is assumed to be the lowest possible penalty among
754	* candidate split points. This allows us to return early without wasting
755	* cycles on calculating the first differing attribute for all candidate
756	* splits when that clearly cannot improve our choice (or when we only want a
757	* minimally distinguishing split point, and don't want to make the split any
758	* more unbalanced than is necessary).
759	*
760	* We return the index of the first existing tuple that should go on the right
761	* page, plus a boolean indicating if new item is on left of split point.
762	*/
763	static OffsetNumber
764	_bt_bestsplitloc(FindSplitData state, int* perfectpenalty,
765	bool *newitemonleft, FindSplitStrat strategy)
766	{
767	int bestpenalty,
768	lowsplit;
769	int highsplit = Min(state->interval, state->nsplits);
770	SplitPoint *final;
771
772	bestpenalty = INT_MAX;
773	lowsplit = `0`;
774	for (int i = lowsplit; i < highsplit; i++)
775	{
776	int penalty;
777
778	penalty = _bt_split_penalty(state, state->splits + i);
779
780	if (penalty <= perfectpenalty)
781	{
782	bestpenalty = penalty;
783	lowsplit = i;
784	break;
785	}
786
787	if (penalty < bestpenalty)
788	{
789	bestpenalty = penalty;
790	lowsplit = i;
791	}
792	}
793
794	final = &state->splits[lowsplit];
795
796	/*
797	* There is a risk that the "many duplicates" strategy will repeatedly do
798	* the wrong thing when there are monotonically decreasing insertions to
799	* the right of a large group of duplicates. Repeated splits could leave
800	* a succession of right half pages with free space that can never be
801	* used. This must be avoided.
802	*
803	* Consider the example of the leftmost page in a single integer attribute
804	* NULLS FIRST index which is almost filled with NULLs. Monotonically
805	* decreasing integer insertions might cause the same leftmost page to
806	* split repeatedly at the same point. Each split derives its new high
807	* key from the lowest current value to the immediate right of the large
808	* group of NULLs, which will always be higher than all future integer
809	* insertions, directing all future integer insertions to the same
810	* leftmost page.
811	*/
812	if (strategy == SPLIT_MANY_DUPLICATES && !state->is_rightmost &&
813	!final->newitemonleft && final->firstoldonright >= state->newitemoff &&
814	final->firstoldonright < state->newitemoff + MAX_LEAF_INTERVAL)
815	{
816	/*
817	* Avoid the problem by peforming a 50:50 split when the new item is
818	* just to the right of the would-be "many duplicates" split point.
819	*/
820	final = &state->splits[`0`];
821	}
822
823	*newitemonleft = final->newitemonleft;
824	return final->firstoldonright;
825	}
826
827	/*
828	* Subroutine to decide whether split should use default strategy/initial
829	* split interval, or whether it should finish splitting the page using
830	* alternative strategies (this is only possible with leaf pages).
831	*
832	* Caller uses alternative strategy (or sticks with default strategy) based
833	* on how *strategy is set here. Return value is "perfect penalty", which is
834	* passed to _bt_bestsplitloc() as a final constraint on how far caller is
835	* willing to go to avoid appending a heap TID when using the many duplicates
836	* strategy (it also saves _bt_bestsplitloc() useless cycles).
837	*/
838	static int
839	_bt_strategy(FindSplitData state, SplitPoint leftpage,
840	SplitPoint rightpage, FindSplitStrat strategy)
841	{
842	IndexTuple leftmost,
843	rightmost;
844	SplitPoint *leftinterval,
845	*rightinterval;
846	int perfectpenalty;
847	int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel);
848
849	/ Assume that alternative strategy won't be used for now /
850	*strategy = SPLIT_DEFAULT;
851
852	/*
853	* Use smallest observed first right item size for entire page as perfect
854	* penalty on internal pages. This can save cycles in the common case
855	* where most or all splits (not just splits within interval) have first
856	* right tuples that are the same size.
857	*/
858	if (!state->is_leaf)
859	return state->minfirstrightsz;
860
861	/*
862	* Use leftmost and rightmost tuples from leftmost and rightmost splits in
863	* current split interval
864	*/
865	_bt_interval_edges(state, &leftinterval, &rightinterval);
866	leftmost = _bt_split_lastleft(state, leftinterval);
867	rightmost = _bt_split_firstright(state, rightinterval);
868
869	/*
870	* If initial split interval can produce a split point that will at least
871	* avoid appending a heap TID in new high key, we're done. Finish split
872	* with default strategy and initial split interval.
873	*/
874	perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost);
875	if (perfectpenalty <= indnkeyatts)
876	return perfectpenalty;
877
878	/*
879	* Work out how caller should finish split when even their "perfect"
880	* penalty for initial/default split interval indicates that the interval
881	* does not contain even a single split that avoids appending a heap TID.
882	*
883	* Use the leftmost split's lastleft tuple and the rightmost split's
884	* firstright tuple to assess every possible split.
885	*/
886	leftmost = _bt_split_lastleft(state, leftpage);
887	rightmost = _bt_split_firstright(state, rightpage);
888
889	/*
890	* If page (including new item) has many duplicates but is not entirely
891	* full of duplicates, a many duplicates strategy split will be performed.
892	* If page is entirely full of duplicates, a single value strategy split
893	* will be performed.
894	*/
895	perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost);
896	if (perfectpenalty <= indnkeyatts)
897	{
898	*strategy = SPLIT_MANY_DUPLICATES;
899
900	/*
901	* Many duplicates strategy should split at either side the group of
902	* duplicates that enclose the delta-optimal split point. Return
903	* indnkeyatts rather than the true perfect penalty to make that
904	* happen. (If perfectpenalty was returned here then low cardinality
905	* composite indexes could have continual unbalanced splits.)
906	*
907	* Note that caller won't go through with a many duplicates split in
908	* rare cases where it looks like there are ever-decreasing insertions
909	* to the immediate right of the split point. This must happen just
910	* before a final decision is made, within _bt_bestsplitloc().
911	*/
912	return indnkeyatts;
913	}
914
915	/*
916	* Single value strategy is only appropriate with ever-increasing heap
917	* TIDs; otherwise, original default strategy split should proceed to
918	* avoid pathological performance. Use page high key to infer if this is
919	* the rightmost page among pages that store the same duplicate value.
920	* This should not prevent insertions of heap TIDs that are slightly out
921	* of order from using single value strategy, since that's expected with
922	* concurrent inserters of the same duplicate value.
923	*/
924	else if (state->is_rightmost)
925	*strategy = SPLIT_SINGLE_VALUE;
926	else
927	{
928	ItemId itemid;
929	IndexTuple hikey;
930
931	itemid = PageGetItemId(state->page, P_HIKEY);
932	hikey = (IndexTuple) PageGetItem(state->page, itemid);
933	perfectpenalty = _bt_keep_natts_fast(state->rel, hikey,
934	state->newitem);
935	if (perfectpenalty <= indnkeyatts)
936	*strategy = SPLIT_SINGLE_VALUE;
937	else
938	{
939	/*
940	* Have caller finish split using default strategy, since page
941	* does not appear to be the rightmost page for duplicates of the
942	* value the page is filled with
943	*/
944	}
945	}
946
947	return perfectpenalty;
948	}
949
950	/*
951	* Subroutine to locate leftmost and rightmost splits for current/default
952	* split interval. Note that it will be the same split iff there is only one
953	* split in interval.
954	*/
955	static void
956	_bt_interval_edges(FindSplitData state, SplitPoint *leftinterval,
957	SplitPoint **rightinterval)
958	{
959	int highsplit = Min(state->interval, state->nsplits);
960	SplitPoint *deltaoptimal;
961
962	deltaoptimal = state->splits;
963	*leftinterval = NULL;
964	*rightinterval = NULL;
965
966	/*
967	* Delta is an absolute distance to optimal split point, so both the
968	* leftmost and rightmost split point will usually be at the end of the
969	* array
970	*/
971	for (int i = highsplit - `1`; i >= `0`; i--)
972	{
973	SplitPoint *distant = state->splits + i;
974
975	if (distant->firstoldonright < deltaoptimal->firstoldonright)
976	{
977	if (*leftinterval == NULL)
978	*leftinterval = distant;
979	}
980	else if (distant->firstoldonright > deltaoptimal->firstoldonright)
981	{
982	if (*rightinterval == NULL)
983	*rightinterval = distant;
984	}
985	else if (!distant->newitemonleft && deltaoptimal->newitemonleft)
986	{
987	/*
988	* "incoming tuple will become first on right page" (distant) is
989	* to the left of "incoming tuple will become last on left page"
990	* (delta-optimal)
991	*/
992	Assert(distant->firstoldonright == state->newitemoff);
993	if (*leftinterval == NULL)
994	*leftinterval = distant;
995	}
996	else if (distant->newitemonleft && !deltaoptimal->newitemonleft)
997	{
998	/*
999	* "incoming tuple will become last on left page" (distant) is to
1000	* the right of "incoming tuple will become first on right page"
1001	* (delta-optimal)
1002	*/
1003	Assert(distant->firstoldonright == state->newitemoff);
1004	if (*rightinterval == NULL)
1005	*rightinterval = distant;
1006	}
1007	else
1008	{
1009	/ There was only one or two splits in initial split interval /
1010	Assert(distant == deltaoptimal);
1011	if (*leftinterval == NULL)
1012	*leftinterval = distant;
1013	if (*rightinterval == NULL)
1014	*rightinterval = distant;
1015	}
1016
1017	if (leftinterval && rightinterval)
1018	return;
1019	}
1020
1021	Assert(false);
1022	}
1023
1024	/*
1025	* Subroutine to find penalty for caller's candidate split point.
1026	*
1027	* On leaf pages, penalty is the attribute number that distinguishes each side
1028	* of a split. It's the last attribute that needs to be included in new high
1029	* key for left page. It can be greater than the number of key attributes in
1030	* cases where a heap TID will need to be appended during truncation.
1031	*
1032	* On internal pages, penalty is simply the size of the first item on the
1033	* right half of the split (including line pointer overhead). This tuple will
1034	* become the new high key for the left page.
1035	*/
1036	static inline int
1037	_bt_split_penalty(FindSplitData state, SplitPoint split)
1038	{
1039	IndexTuple lastleftuple;
1040	IndexTuple firstrighttuple;
1041
1042	if (!state->is_leaf)
1043	{
1044	ItemId itemid;
1045
1046	if (!split->newitemonleft &&
1047	split->firstoldonright == state->newitemoff)
1048	return state->newitemsz;
1049
1050	itemid = PageGetItemId(state->page, split->firstoldonright);
1051
1052	return MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
1053	}
1054
1055	lastleftuple = _bt_split_lastleft(state, split);
1056	firstrighttuple = _bt_split_firstright(state, split);
1057
1058	Assert(lastleftuple != firstrighttuple);
1059	return _bt_keep_natts_fast(state->rel, lastleftuple, firstrighttuple);
1060	}
1061
1062	/*
1063	* Subroutine to get a lastleft IndexTuple for a spit point from page
1064	*/
1065	static inline IndexTuple
1066	_bt_split_lastleft(FindSplitData state, SplitPoint split)
1067	{
1068	ItemId itemid;
1069
1070	if (split->newitemonleft && split->firstoldonright == state->newitemoff)
1071	return state->newitem;
1072
1073	itemid = PageGetItemId(state->page,
1074	OffsetNumberPrev(split->firstoldonright));
1075	return (IndexTuple) PageGetItem(state->page, itemid);
1076	}
1077
1078	/*
1079	* Subroutine to get a firstright IndexTuple for a spit point from page
1080	*/
1081	static inline IndexTuple
1082	_bt_split_firstright(FindSplitData state, SplitPoint split)
1083	{
1084	ItemId itemid;
1085
1086	if (!split->newitemonleft && split->firstoldonright == state->newitemoff)
1087	return state->newitem;
1088
1089	itemid = PageGetItemId(state->page, split->firstoldonright);
1090	return (IndexTuple) PageGetItem(state->page, itemid);
1091	}
1092

Browse the source code of PostgreSQL/src/backend/access/nbtree/nbtsplitloc.c