/*-------------------------------------------------------------------------
 *
 * nbtpage.c
 *	  BTree-specific page management code for the Postgres btree access
 *	  method.
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/access/nbtree/nbtpage.c
 *
 * NOTES
 *	  Postgres btree pages look like ordinary relation pages.  The opaque
 *	  data at high addresses includes pointers to left and right siblings
 *	  and flag data describing page state.  The first page in a btree, page
 *	  zero, is special -- it stores meta-information describing the tree.
 *	  Pages one and higher store the actual tree data.
 *
 *-------------------------------------------------------------------------
 */
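
/*
 * To make the NOTES above concrete: a btree page can be pictured roughly as
 * follows (an illustrative sketch of the standard page layout, not a
 * byte-accurate diagram):
 *
 *	+----------------+--------------------+--------+-----------+---------+
 *	| PageHeaderData | line pointers ---> | (free) | <-- items | special |
 *	+----------------+--------------------+--------+-----------+---------+
 *
 * The special space at the end of every page holds a BTPageOpaqueData struct
 * with the sibling links and flag bits mentioned above; on the metapage
 * (block zero), a BTMetaPageData struct sits just after the page header
 * instead.
 */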
#include "postgres.h"

#include "access/nbtree.h"
#include "access/nbtxlog.h"
#include "access/transam.h"
#include "access/xlog.h"
#include "access/xloginsert.h"
#include "miscadmin.h"
#include "storage/indexfsm.h"
#include "storage/lmgr.h"
#include "storage/predicate.h"
#include "utils/snapmgr.h"

static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf);
static bool _bt_mark_page_halfdead(Relation rel, Buffer buf, BTStack stack);
static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf,
                                     bool *rightsib_empty);
static bool _bt_lock_branch_parent(Relation rel, BlockNumber child,
                                   BTStack stack, Buffer *topparent, OffsetNumber *topoff,
                                   BlockNumber *target, BlockNumber *rightsib);
static void _bt_log_reuse_page(Relation rel, BlockNumber blkno,
                               TransactionId latestRemovedXid);

/*
 *	_bt_initmetapage() -- Fill a page buffer with a correct metapage image
 */
void
_bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level)
{
    BTMetaPageData *metad;
    BTPageOpaque metaopaque;

    _bt_pageinit(page, BLCKSZ);

    metad = BTPageGetMeta(page);
    metad->btm_magic = BTREE_MAGIC;
    metad->btm_version = BTREE_VERSION;
    metad->btm_root = rootbknum;
    metad->btm_level = level;
    metad->btm_fastroot = rootbknum;
    metad->btm_fastlevel = level;
    metad->btm_oldest_btpo_xact = InvalidTransactionId;
    metad->btm_last_cleanup_num_heap_tuples = -1.0;

    metaopaque = (BTPageOpaque) PageGetSpecialPointer(page);
    metaopaque->btpo_flags = BTP_META;

    /*
     * Set pd_lower just past the end of the metadata.  This is essential,
     * because without doing so, metadata will be lost if xlog.c compresses
     * the page.
     */
    ((PageHeader) page)->pd_lower =
        ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
}
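
/*
 * In effect, index build code uses this along the following lines (an
 * illustrative sketch only):
 *
 *		Page		metapage = (Page) palloc(BLCKSZ);
 *
 *		_bt_initmetapage(metapage, P_NONE, 0);
 *		... write the image to block 0, WAL-logging it if required ...
 *
 * where passing P_NONE for rootbknum records that no root page exists yet.
 */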

/*
 *	_bt_upgrademetapage() -- Upgrade a meta-page from an old format to version
 *		3, the last version that can be updated without broadly affecting
 *		on-disk compatibility.  (A REINDEX is required to upgrade to v4.)
 *
 *		This routine does purely in-memory image upgrade.  Caller is
 *		responsible for locking, WAL-logging etc.
 */
void
_bt_upgrademetapage(Page page)
{
    BTMetaPageData *metad;
    BTPageOpaque metaopaque PG_USED_FOR_ASSERTS_ONLY;

    metad = BTPageGetMeta(page);
    metaopaque = (BTPageOpaque) PageGetSpecialPointer(page);
    /* It must really be a meta page of an upgradable version */
    Assert(metaopaque->btpo_flags & BTP_META);
    Assert(metad->btm_version < BTREE_NOVAC_VERSION);
    Assert(metad->btm_version >= BTREE_MIN_VERSION);

    /* Set version number and fill extra fields added into version 3 */
    metad->btm_version = BTREE_NOVAC_VERSION;
    metad->btm_oldest_btpo_xact = InvalidTransactionId;
    metad->btm_last_cleanup_num_heap_tuples = -1.0;

    /* Adjust pd_lower (see _bt_initmetapage() for details) */
    ((PageHeader) page)->pd_lower =
        ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
}

/*
 * Get metadata from a share-locked buffer containing the metapage, while
 * performing standard sanity checks.
 *
 * Callers that cache the data returned here locally should note that an
 * on-the-fly upgrade using _bt_upgrademetapage() can change the version field
 * and the BTREE_NOVAC_VERSION-specific fields without invalidating the local
 * cache.
 */
static BTMetaPageData *
_bt_getmeta(Relation rel, Buffer metabuf)
{
    Page        metapg;
    BTPageOpaque metaopaque;
    BTMetaPageData *metad;

    metapg = BufferGetPage(metabuf);
    metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
    metad = BTPageGetMeta(metapg);

    /* sanity-check the metapage */
    if (!P_ISMETA(metaopaque) ||
        metad->btm_magic != BTREE_MAGIC)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("index \"%s\" is not a btree",
                        RelationGetRelationName(rel))));

    if (metad->btm_version < BTREE_MIN_VERSION ||
        metad->btm_version > BTREE_VERSION)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("version mismatch in index \"%s\": file version %d, "
                        "current version %d, minimal supported version %d",
                        RelationGetRelationName(rel),
                        metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));

    return metad;
}

/*
 *	_bt_update_meta_cleanup_info() -- Update cleanup-related information in
 *			the metapage.
 *
 *		This routine checks whether the provided cleanup-related information
 *		matches what is stored in the metapage.  On mismatch, the metapage is
 *		overwritten.
 */
void
_bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact,
                             float8 numHeapTuples)
{
    Buffer      metabuf;
    Page        metapg;
    BTMetaPageData *metad;
    bool        needsRewrite = false;
    XLogRecPtr  recptr;

    /* read the metapage and check if it needs rewrite */
    metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
    metapg = BufferGetPage(metabuf);
    metad = BTPageGetMeta(metapg);

    /* outdated version of metapage always needs rewrite */
    if (metad->btm_version < BTREE_NOVAC_VERSION)
        needsRewrite = true;
    else if (metad->btm_oldest_btpo_xact != oldestBtpoXact ||
             metad->btm_last_cleanup_num_heap_tuples != numHeapTuples)
        needsRewrite = true;

    if (!needsRewrite)
    {
        _bt_relbuf(rel, metabuf);
        return;
    }

    /* trade in our read lock for a write lock */
    LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
    LockBuffer(metabuf, BT_WRITE);

    START_CRIT_SECTION();
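
    /*
     * From here through END_CRIT_SECTION() we follow the usual recipe for a
     * WAL-logged page update: modify the page, mark the buffer dirty, emit
     * the WAL record, then stamp the page with the record's LSN.
     */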

    /* upgrade meta-page if needed */
    if (metad->btm_version < BTREE_NOVAC_VERSION)
        _bt_upgrademetapage(metapg);

    /* update cleanup-related information */
    metad->btm_oldest_btpo_xact = oldestBtpoXact;
    metad->btm_last_cleanup_num_heap_tuples = numHeapTuples;
    MarkBufferDirty(metabuf);

    /* write wal record if needed */
    if (RelationNeedsWAL(rel))
    {
        xl_btree_metadata md;

        XLogBeginInsert();
        XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);

        Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
        md.version = metad->btm_version;
        md.root = metad->btm_root;
        md.level = metad->btm_level;
        md.fastroot = metad->btm_fastroot;
        md.fastlevel = metad->btm_fastlevel;
        md.oldest_btpo_xact = oldestBtpoXact;
        md.last_cleanup_num_heap_tuples = numHeapTuples;

        XLogRegisterBufData(0, (char *) &md, sizeof(xl_btree_metadata));

        recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_META_CLEANUP);

        PageSetLSN(metapg, recptr);
    }

    END_CRIT_SECTION();
    _bt_relbuf(rel, metabuf);
}

/*
 *	_bt_getroot() -- Get the root page of the btree.
 *
 *		Since the root page can move around the btree file, we have to read
 *		its location from the metadata page, and then read the root page
 *		itself.  If no root page exists yet, we have to create one.  The
 *		standard class of race conditions exists here; I think I covered
 *		them all in the intricate dance of lock requests below.
 *
 *		The access type parameter (BT_READ or BT_WRITE) controls whether
 *		a new root page will be created or not.  If access = BT_READ,
 *		and no root page exists, we just return InvalidBuffer.  For
 *		BT_WRITE, we try to create the root page if it doesn't exist.
 *		NOTE that the returned root page will have only a read lock set
 *		on it even if access = BT_WRITE!
 *
 *		The returned page is not necessarily the true root --- it could be
 *		a "fast root" (a page that is alone in its level due to deletions).
 *		Also, if the root page is split while we are "in flight" to it,
 *		what we will return is the old root, which is now just the leftmost
 *		page on a probably-not-very-wide level.  For most purposes this is
 *		as good as or better than the true root, so we do not bother to
 *		insist on finding the true root.  We do, however, guarantee to
 *		return a live (not deleted or half-dead) page.
 *
 *		On successful return, the root page is pinned and read-locked.
 *		The metadata page is not locked or pinned on exit.
 */
Buffer
_bt_getroot(Relation rel, int access)
{
    Buffer      metabuf;
    Buffer      rootbuf;
    Page        rootpage;
    BTPageOpaque rootopaque;
    BlockNumber rootblkno;
    uint32      rootlevel;
    BTMetaPageData *metad;

    /*
     * Try to use previously-cached metapage data to find the root.  This
     * normally saves one buffer access per index search, which is a very
     * helpful savings in bufmgr traffic and hence contention.
     */
    if (rel->rd_amcache != NULL)
    {
        metad = (BTMetaPageData *) rel->rd_amcache;
        /* We shouldn't have cached it if any of these fail */
        Assert(metad->btm_magic == BTREE_MAGIC);
        Assert(metad->btm_version >= BTREE_MIN_VERSION);
        Assert(metad->btm_version <= BTREE_VERSION);
        Assert(metad->btm_root != P_NONE);

        rootblkno = metad->btm_fastroot;
        Assert(rootblkno != P_NONE);
        rootlevel = metad->btm_fastlevel;

        rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
        rootpage = BufferGetPage(rootbuf);
        rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);

        /*
         * Since the cache might be stale, we check the page more carefully
         * here than normal.  We *must* check that it's not deleted.  If it's
         * not alone on its level, then we reject too --- this may be overly
         * paranoid but better safe than sorry.  Note we don't check P_ISROOT,
         * because that's not set in a "fast root".
         */
        if (!P_IGNORE(rootopaque) &&
            rootopaque->btpo.level == rootlevel &&
            P_LEFTMOST(rootopaque) &&
            P_RIGHTMOST(rootopaque))
        {
            /* OK, accept cached page as the root */
            return rootbuf;
        }
        _bt_relbuf(rel, rootbuf);
        /* Cache is stale, throw it away */
        if (rel->rd_amcache)
            pfree(rel->rd_amcache);
        rel->rd_amcache = NULL;
    }

    metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
    metad = _bt_getmeta(rel, metabuf);

    /* if no root page initialized yet, do it */
    if (metad->btm_root == P_NONE)
    {
        Page        metapg;

        /* If access = BT_READ, caller doesn't want us to create root yet */
        if (access == BT_READ)
        {
            _bt_relbuf(rel, metabuf);
            return InvalidBuffer;
        }

        /* trade in our read lock for a write lock */
        LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
        LockBuffer(metabuf, BT_WRITE);

        /*
         * Race condition: if someone else initialized the metadata between
         * the time we released the read lock and acquired the write lock, we
         * must avoid doing it again.
         */
        if (metad->btm_root != P_NONE)
        {
            /*
             * Metadata initialized by someone else.  In order to guarantee
             * no deadlocks, we have to release the metadata page and start
             * all over again.  (Is that really true?  But it's hardly worth
             * trying to optimize this case.)
             */
            _bt_relbuf(rel, metabuf);
            return _bt_getroot(rel, access);
        }

        /*
         * Get, initialize, write, and leave a lock of the appropriate type on
         * the new root page.  Since this is the first page in the tree, it's
         * a leaf as well as the root.
         */
        rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
        rootblkno = BufferGetBlockNumber(rootbuf);
        rootpage = BufferGetPage(rootbuf);
        rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
        rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
        rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
        rootopaque->btpo.level = 0;
        rootopaque->btpo_cycleid = 0;
        /* Get raw page pointer for metapage */
        metapg = BufferGetPage(metabuf);

        /* NO ELOG(ERROR) till meta is updated */
        START_CRIT_SECTION();

        /* upgrade metapage if needed */
        if (metad->btm_version < BTREE_NOVAC_VERSION)
            _bt_upgrademetapage(metapg);

        metad->btm_root = rootblkno;
        metad->btm_level = 0;
        metad->btm_fastroot = rootblkno;
        metad->btm_fastlevel = 0;
        metad->btm_oldest_btpo_xact = InvalidTransactionId;
        metad->btm_last_cleanup_num_heap_tuples = -1.0;

        MarkBufferDirty(rootbuf);
        MarkBufferDirty(metabuf);

        /* XLOG stuff */
        if (RelationNeedsWAL(rel))
        {
            xl_btree_newroot xlrec;
            XLogRecPtr  recptr;
            xl_btree_metadata md;

            XLogBeginInsert();
            XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
            XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);

            Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
            md.version = metad->btm_version;
            md.root = rootblkno;
            md.level = 0;
            md.fastroot = rootblkno;
            md.fastlevel = 0;
            md.oldest_btpo_xact = InvalidTransactionId;
            md.last_cleanup_num_heap_tuples = -1.0;

            XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));

            xlrec.rootblk = rootblkno;
            xlrec.level = 0;

            XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot);

            recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);

            PageSetLSN(rootpage, recptr);
            PageSetLSN(metapg, recptr);
        }

        END_CRIT_SECTION();

        /*
         * swap root write lock for read lock.  There is no danger of anyone
         * else accessing the new root page while it's unlocked, since no one
         * else knows where it is yet.
         */
        LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
        LockBuffer(rootbuf, BT_READ);

        /* okay, metadata is correct, release lock on it without caching */
        _bt_relbuf(rel, metabuf);
    }
    else
    {
        rootblkno = metad->btm_fastroot;
        Assert(rootblkno != P_NONE);
        rootlevel = metad->btm_fastlevel;

        /*
         * Cache the metapage data for next time
         */
        rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
                                             sizeof(BTMetaPageData));
        memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));

        /*
         * We are done with the metapage; arrange to release it via first
         * _bt_relandgetbuf call
         */
        rootbuf = metabuf;

        for (;;)
        {
            rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
            rootpage = BufferGetPage(rootbuf);
            rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);

            if (!P_IGNORE(rootopaque))
                break;

            /* it's dead, Jim.  step right one page */
            if (P_RIGHTMOST(rootopaque))
                elog(ERROR, "no live root page found in index \"%s\"",
                     RelationGetRelationName(rel));
            rootblkno = rootopaque->btpo_next;
        }

        /* Note: can't check btpo.level on deleted pages */
        if (rootopaque->btpo.level != rootlevel)
            elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
                 rootblkno, RelationGetRelationName(rel),
                 rootopaque->btpo.level, rootlevel);
    }

    /*
     * By here, we have a pin and read lock on the root page, and no lock set
     * on the metadata page.  Return the root page's buffer.
     */
    return rootbuf;
}
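
/*
 * Callers use this roughly as follows (illustrative only):
 *
 *		rootbuf = _bt_getroot(rel, BT_READ);
 *		if (!BufferIsValid(rootbuf))
 *			... index is completely empty, nothing to descend into ...
 *		... descend from the (fast) root page ...
 *		_bt_relbuf(rel, rootbuf);
 */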

/*
 *	_bt_gettrueroot() -- Get the true root page of the btree.
 *
 *		This is the same as the BT_READ case of _bt_getroot(), except
 *		we follow the true-root link not the fast-root link.
 *
 * By the time we acquire lock on the root page, it might have been split and
 * not be the true root anymore.  This is okay for the present uses of this
 * routine; we only really need to be able to move up at least one tree level
 * from whatever non-root page we were at.  If we ever do need to lock the
 * one true root page, we could loop here, re-reading the metapage on each
 * failure.  (Note that it wouldn't do to hold the lock on the metapage while
 * moving to the root --- that'd deadlock against any concurrent root split.)
 */
Buffer
_bt_gettrueroot(Relation rel)
{
    Buffer      metabuf;
    Page        metapg;
    BTPageOpaque metaopaque;
    Buffer      rootbuf;
    Page        rootpage;
    BTPageOpaque rootopaque;
    BlockNumber rootblkno;
    uint32      rootlevel;
    BTMetaPageData *metad;

    /*
     * We don't try to use cached metapage data here, since (a) this path is
     * not performance-critical, and (b) if we are here it suggests our cache
     * is out-of-date anyway.  In light of point (b), it's probably safest to
     * actively flush any cached metapage info.
     */
    if (rel->rd_amcache)
        pfree(rel->rd_amcache);
    rel->rd_amcache = NULL;

    metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
    metapg = BufferGetPage(metabuf);
    metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
    metad = BTPageGetMeta(metapg);

    if (!P_ISMETA(metaopaque) ||
        metad->btm_magic != BTREE_MAGIC)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("index \"%s\" is not a btree",
                        RelationGetRelationName(rel))));

    if (metad->btm_version < BTREE_MIN_VERSION ||
        metad->btm_version > BTREE_VERSION)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("version mismatch in index \"%s\": file version %d, "
                        "current version %d, minimal supported version %d",
                        RelationGetRelationName(rel),
                        metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));

    /* if no root page initialized yet, fail */
    if (metad->btm_root == P_NONE)
    {
        _bt_relbuf(rel, metabuf);
        return InvalidBuffer;
    }

    rootblkno = metad->btm_root;
    rootlevel = metad->btm_level;

    /*
     * We are done with the metapage; arrange to release it via first
     * _bt_relandgetbuf call
     */
    rootbuf = metabuf;

    for (;;)
    {
        rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
        rootpage = BufferGetPage(rootbuf);
        rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);

        if (!P_IGNORE(rootopaque))
            break;

        /* it's dead, Jim.  step right one page */
        if (P_RIGHTMOST(rootopaque))
            elog(ERROR, "no live root page found in index \"%s\"",
                 RelationGetRelationName(rel));
        rootblkno = rootopaque->btpo_next;
    }

    /* Note: can't check btpo.level on deleted pages */
    if (rootopaque->btpo.level != rootlevel)
        elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
             rootblkno, RelationGetRelationName(rel),
             rootopaque->btpo.level, rootlevel);

    return rootbuf;
}

/*
 *	_bt_getrootheight() -- Get the height of the btree search tree.
 *
 *		We return the level (counting from zero) of the current fast root.
 *		This represents the number of tree levels we'd have to descend through
 *		to start any btree index search.
 *
 *		This is used by the planner for cost-estimation purposes.  Since it's
 *		only an estimate, slightly-stale data is fine, hence we don't worry
 *		about updating previously cached data.
 */
int
_bt_getrootheight(Relation rel)
{
    BTMetaPageData *metad;

    if (rel->rd_amcache == NULL)
    {
        Buffer      metabuf;

        metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
        metad = _bt_getmeta(rel, metabuf);

        /*
         * If there's no root page yet, _bt_getroot() doesn't expect a cache
         * to be made, so just stop here and report the index height is zero.
         * (XXX perhaps _bt_getroot() should be changed to allow this case.)
         */
        if (metad->btm_root == P_NONE)
        {
            _bt_relbuf(rel, metabuf);
            return 0;
        }

        /*
         * Cache the metapage data for next time
         */
        rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
                                             sizeof(BTMetaPageData));
        memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
        _bt_relbuf(rel, metabuf);
    }

    /* Get cached page */
    metad = (BTMetaPageData *) rel->rd_amcache;
    /* We shouldn't have cached it if any of these fail */
    Assert(metad->btm_magic == BTREE_MAGIC);
    Assert(metad->btm_version >= BTREE_MIN_VERSION);
    Assert(metad->btm_version <= BTREE_VERSION);
    Assert(metad->btm_fastroot != P_NONE);

    return metad->btm_fastlevel;
}

/*
 *	_bt_heapkeyspace() -- is heap TID being treated as a key?
 *
 *		This is used to determine the rules that must be used to descend a
 *		btree.  Version 4 indexes treat heap TID as a tiebreaker attribute.
 *		pg_upgrade'd version 3 indexes need extra steps to preserve reasonable
 *		performance when inserting a new BTScanInsert-wise duplicate tuple
 *		among many leaf pages already full of such duplicates.
 */
bool
_bt_heapkeyspace(Relation rel)
{
    BTMetaPageData *metad;

    if (rel->rd_amcache == NULL)
    {
        Buffer      metabuf;

        metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
        metad = _bt_getmeta(rel, metabuf);

        /*
         * If there's no root page yet, _bt_getroot() doesn't expect a cache
         * to be made, so just stop here.  (XXX perhaps _bt_getroot() should
         * be changed to allow this case.)
         */
        if (metad->btm_root == P_NONE)
        {
            uint32      btm_version = metad->btm_version;

            _bt_relbuf(rel, metabuf);
            return btm_version > BTREE_NOVAC_VERSION;
        }

        /*
         * Cache the metapage data for next time
         *
         * An on-the-fly version upgrade performed by _bt_upgrademetapage()
         * can change the nbtree version for an index without invalidating any
         * local cache.  This is okay because it can only happen when moving
         * from version 2 to version 3, both of which are !heapkeyspace
         * versions.
         */
        rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
                                             sizeof(BTMetaPageData));
        memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
        _bt_relbuf(rel, metabuf);
    }

    /* Get cached page */
    metad = (BTMetaPageData *) rel->rd_amcache;
    /* We shouldn't have cached it if any of these fail */
    Assert(metad->btm_magic == BTREE_MAGIC);
    Assert(metad->btm_version >= BTREE_MIN_VERSION);
    Assert(metad->btm_version <= BTREE_VERSION);
    Assert(metad->btm_fastroot != P_NONE);

    return metad->btm_version > BTREE_NOVAC_VERSION;
}
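
/*
 * Note for callers: the result of this test is what insertion scan key
 * construction records in BTScanInsert's heapkeyspace field, which in turn
 * drives the descent rules used throughout the tree.
 */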

/*
 *	_bt_checkpage() -- Verify that a freshly-read page looks sane.
 */
void
_bt_checkpage(Relation rel, Buffer buf)
{
    Page        page = BufferGetPage(buf);

    /*
     * ReadBuffer verifies that every newly-read page passes
     * PageHeaderIsValid, which means it either contains a reasonably sane
     * page header or is all-zero.  We have to defend against the all-zero
     * case, however.
     */
    if (PageIsNew(page))
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("index \"%s\" contains unexpected zero page at block %u",
                        RelationGetRelationName(rel),
                        BufferGetBlockNumber(buf)),
                 errhint("Please REINDEX it.")));

    /*
     * Additionally check that the special area looks sane.
     */
    if (PageGetSpecialSize(page) != MAXALIGN(sizeof(BTPageOpaqueData)))
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("index \"%s\" contains corrupted page at block %u",
                        RelationGetRelationName(rel),
                        BufferGetBlockNumber(buf)),
                 errhint("Please REINDEX it.")));
}

/*
 * Log the reuse of a page from the FSM.
 */
static void
_bt_log_reuse_page(Relation rel, BlockNumber blkno, TransactionId latestRemovedXid)
{
    xl_btree_reuse_page xlrec_reuse;

    /*
     * Note that we don't register the buffer with the record, because this
     * operation doesn't modify the page.  This record only exists to provide
     * a conflict point for Hot Standby.
     */

    /* XLOG stuff */
    xlrec_reuse.node = rel->rd_node;
    xlrec_reuse.block = blkno;
    xlrec_reuse.latestRemovedXid = latestRemovedXid;

    XLogBeginInsert();
    XLogRegisterData((char *) &xlrec_reuse, SizeOfBtreeReusePage);

    XLogInsert(RM_BTREE_ID, XLOG_BTREE_REUSE_PAGE);
}
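
/*
 * On a standby, replay of this record cancels queries whose snapshots are
 * older than latestRemovedXid (see the reuse-page redo code in nbtxlog.c);
 * that conflict point is the entire reason for emitting it.
 */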

/*
 *	_bt_getbuf() -- Get a buffer by block number for read or write.
 *
 *		blkno == P_NEW means to get an unallocated index page.  The page
 *		will be initialized before returning it.
 *
 *		When this routine returns, the appropriate lock is set on the
 *		requested buffer and its reference count has been incremented
 *		(ie, the buffer is "locked and pinned").  Also, we apply
 *		_bt_checkpage to sanity-check the page (except in P_NEW case).
 */
Buffer
_bt_getbuf(Relation rel, BlockNumber blkno, int access)
{
    Buffer      buf;

    if (blkno != P_NEW)
    {
        /* Read an existing block of the relation */
        buf = ReadBuffer(rel, blkno);
        LockBuffer(buf, access);
        _bt_checkpage(rel, buf);
    }
    else
    {
        bool        needLock;
        Page        page;

        Assert(access == BT_WRITE);

        /*
         * First see if the FSM knows of any free pages.
         *
         * We can't trust the FSM's report unreservedly; we have to check that
         * the page is still free.  (For example, an already-free page could
         * have been re-used between the time the last VACUUM scanned it and
         * the time the VACUUM made its FSM updates.)
         *
         * In fact, it's worse than that: we can't even assume that it's safe
         * to take a lock on the reported page.  If somebody else has a lock
         * on it, or even worse our own caller does, we could deadlock.  (The
         * own-caller scenario is actually not improbable.  Consider an index
         * on a serial or timestamp column.  Nearly all splits will be at the
         * rightmost page, so it's entirely likely that _bt_split will call us
         * while holding a lock on the page most recently acquired from FSM.
         * A VACUUM running concurrently with the previous split could well
         * have placed that page back in FSM.)
         *
         * To get around that, we ask for only a conditional lock on the
         * reported page.  If we fail, then someone else is using the page,
         * and we may reasonably assume it's not free.  (If we happen to be
         * wrong, the worst consequence is the page will be lost to use till
         * the next VACUUM, which is no big problem.)
         */
        for (;;)
        {
            blkno = GetFreeIndexPage(rel);
            if (blkno == InvalidBlockNumber)
                break;
            buf = ReadBuffer(rel, blkno);
            if (ConditionalLockBuffer(buf))
            {
                page = BufferGetPage(buf);
                if (_bt_page_recyclable(page))
                {
                    /*
                     * If we are generating WAL for Hot Standby then create a
                     * WAL record that will allow us to conflict with queries
                     * running on standby, in case they have snapshots older
                     * than btpo.xact.  This can only apply if the page does
                     * have a valid btpo.xact value, ie not if it's new.  (We
                     * must check that because an all-zero page has no special
                     * space.)
                     */
                    if (XLogStandbyInfoActive() && RelationNeedsWAL(rel) &&
                        !PageIsNew(page))
                    {
                        BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);

                        _bt_log_reuse_page(rel, blkno, opaque->btpo.xact);
                    }

                    /* Okay to use page.  Re-initialize and return it */
                    _bt_pageinit(page, BufferGetPageSize(buf));
                    return buf;
                }
                elog(DEBUG2, "FSM returned nonrecyclable page");
                _bt_relbuf(rel, buf);
            }
            else
            {
                elog(DEBUG2, "FSM returned nonlockable page");
                /* couldn't get lock, so just drop pin */
                ReleaseBuffer(buf);
            }
        }

        /*
         * Extend the relation by one page.
         *
         * We have to use a lock to ensure no one else is extending the rel at
         * the same time, else we will both try to initialize the same new
         * page.  We can skip locking for new or temp relations, however,
         * since no one else could be accessing them.
         */
        needLock = !RELATION_IS_LOCAL(rel);

        if (needLock)
            LockRelationForExtension(rel, ExclusiveLock);

        buf = ReadBuffer(rel, P_NEW);

        /* Acquire buffer lock on new page */
        LockBuffer(buf, BT_WRITE);

        /*
         * Release the file-extension lock; it's now OK for someone else to
         * extend the relation some more.  Note that we cannot release this
         * lock before we have buffer lock on the new page, or we risk a race
         * condition against btvacuumscan --- see comments therein.
         */
        if (needLock)
            UnlockRelationForExtension(rel, ExclusiveLock);

        /* Initialize the new page before returning it */
        page = BufferGetPage(buf);
        Assert(PageIsNew(page));
        _bt_pageinit(page, BufferGetPageSize(buf));
    }

    /* ref count and lock type are correct */
    return buf;
}
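
/*
 * Schematically, the common read-path call pattern looks like this
 * (illustrative only):
 *
 *		buf = _bt_getbuf(rel, blkno, BT_READ);
 *		page = BufferGetPage(buf);
 *		... examine the page ...
 *		_bt_relbuf(rel, buf);
 */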

/*
 *	_bt_relandgetbuf() -- release a locked buffer and get another one.
 *
 * This is equivalent to _bt_relbuf followed by _bt_getbuf, with the
 * exception that blkno may not be P_NEW.  Also, if obuf is InvalidBuffer
 * then it reduces to just _bt_getbuf; allowing this case simplifies some
 * callers.
 *
 * The original motivation for using this was to avoid two entries to the
 * bufmgr when one would do.  However, now it's mainly just a notational
 * convenience.  The only case where it saves work over _bt_relbuf/_bt_getbuf
 * is when the target page is the same one already in the buffer.
 */
Buffer
_bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
{
    Buffer      buf;

    Assert(blkno != P_NEW);
    if (BufferIsValid(obuf))
        LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
    buf = ReleaseAndReadBuffer(obuf, rel, blkno);
    LockBuffer(buf, access);
    _bt_checkpage(rel, buf);
    return buf;
}

/*
 *	_bt_relbuf() -- release a locked buffer.
 *
 * Lock and pin (refcount) are both dropped.
 */
void
_bt_relbuf(Relation rel, Buffer buf)
{
    UnlockReleaseBuffer(buf);
}

/*
 *	_bt_pageinit() -- Initialize a new page.
 *
 * On return, the page header is initialized; data space is empty;
 * special space is zeroed out.
 */
void
_bt_pageinit(Page page, Size size)
{
    PageInit(page, size, sizeof(BTPageOpaqueData));
}

/*
 *	_bt_page_recyclable() -- Is an existing page recyclable?
 *
 * This exists to make sure _bt_getbuf and btvacuumscan have the same
 * policy about whether a page is safe to re-use.  But note that _bt_getbuf
 * knows enough to distinguish the PageIsNew condition from the other one.
 * At some point it might be appropriate to redesign this to have a three-way
 * result value.
 */
bool
_bt_page_recyclable(Page page)
{
    BTPageOpaque opaque;

    /*
     * It's possible to find an all-zeroes page in an index --- for example, a
     * backend might successfully extend the relation one page and then crash
     * before it is able to make a WAL entry for adding the page.  If we find
     * a zeroed page then reclaim it.
     */
    if (PageIsNew(page))
        return true;

    /*
     * Otherwise, recycle if deleted and too old to have any processes
     * interested in it.
     */
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    if (P_ISDELETED(opaque) &&
        TransactionIdPrecedes(opaque->btpo.xact, RecentGlobalXmin))
        return true;
    return false;
}
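
/*
 * Note: RecentGlobalXmin is a conservative cutoff.  Once a deleted page's
 * btpo.xact precedes it, no transaction that could still be in flight can be
 * following a stale link to the page, so re-using it cannot break a
 * concurrent index scan.  See the nbtree README for the full argument.
 */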

/*
 * Delete item(s) from a btree page during VACUUM.
 *
 * This must only be used for deleting leaf items.  Deleting an item on a
 * non-leaf page has to be done as part of an atomic action that includes
 * deleting the page it points to.
 *
 * This routine assumes that the caller has pinned and locked the buffer.
 * Also, the given itemnos *must* appear in increasing order in the array.
 *
 * We record VACUUMs and b-tree deletes differently in WAL.  InHotStandby
 * we need to be able to pin all of the blocks in the btree in physical
 * order when replaying the effects of a VACUUM, just as we do for the
 * original VACUUM itself.  lastBlockVacuumed allows us to tell whether an
 * intermediate range of blocks has had no changes at all by VACUUM,
 * and so must be scanned anyway during replay.  We always write a WAL record
 * for the last block in the index, whether or not it contained any items
 * to be removed.  This allows us to scan right up to end of index to
 * ensure correct locking.
 */
void
_bt_delitems_vacuum(Relation rel, Buffer buf,
                    OffsetNumber *itemnos, int nitems,
                    BlockNumber lastBlockVacuumed)
{
    Page        page = BufferGetPage(buf);
    BTPageOpaque opaque;

    /* No ereport(ERROR) until changes are logged */
    START_CRIT_SECTION();

    /* Fix the page */
    if (nitems > 0)
        PageIndexMultiDelete(page, itemnos, nitems);

    /*
     * We can clear the vacuum cycle ID since this page has certainly been
     * processed by the current vacuum scan.
     */
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    opaque->btpo_cycleid = 0;

    /*
     * Mark the page as not containing any LP_DEAD items.  This is not
     * certainly true (there might be some that have recently been marked, but
     * weren't included in our target-item list), but it will almost always be
     * true and it doesn't seem worth an additional page scan to check it.
     * Remember that BTP_HAS_GARBAGE is only a hint anyway.
     */
    opaque->btpo_flags &= ~BTP_HAS_GARBAGE;

    MarkBufferDirty(buf);

    /* XLOG stuff */
    if (RelationNeedsWAL(rel))
    {
        XLogRecPtr  recptr;
        xl_btree_vacuum xlrec_vacuum;

        xlrec_vacuum.lastBlockVacuumed = lastBlockVacuumed;

        XLogBeginInsert();
        XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
        XLogRegisterData((char *) &xlrec_vacuum, SizeOfBtreeVacuum);

        /*
         * The target-offsets array is not in the buffer, but pretend that it
         * is.  When XLogInsert stores the whole buffer, the offsets array
         * need not be stored too.
         */
        if (nitems > 0)
            XLogRegisterBufData(0, (char *) itemnos, nitems * sizeof(OffsetNumber));

        recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM);

        PageSetLSN(page, recptr);
    }

    END_CRIT_SECTION();
}

/*
 * Delete item(s) from a btree page during single-page cleanup.
 *
 * As above, must only be used on leaf pages.
 *
 * This routine assumes that the caller has pinned and locked the buffer.
 * Also, the given itemnos *must* appear in increasing order in the array.
 *
 * This is nearly the same as _bt_delitems_vacuum as far as what it does to
 * the page, but the WAL logging considerations are quite different.  See
 * comments for _bt_delitems_vacuum.
 */
void
_bt_delitems_delete(Relation rel, Buffer buf,
                    OffsetNumber *itemnos, int nitems,
                    Relation heapRel)
{
    Page        page = BufferGetPage(buf);
    BTPageOpaque opaque;
    TransactionId latestRemovedXid = InvalidTransactionId;

    /* Shouldn't be called unless there's something to do */
    Assert(nitems > 0);

    if (XLogStandbyInfoActive() && RelationNeedsWAL(rel))
        latestRemovedXid =
            index_compute_xid_horizon_for_tuples(rel, heapRel, buf,
                                                 itemnos, nitems);

    /* No ereport(ERROR) until changes are logged */
    START_CRIT_SECTION();

    /* Fix the page */
    PageIndexMultiDelete(page, itemnos, nitems);

    /*
     * Unlike _bt_delitems_vacuum, we *must not* clear the vacuum cycle ID,
     * because this is not called by VACUUM.
     */

    /*
     * Mark the page as not containing any LP_DEAD items.  This is not
     * certainly true (there might be some that have recently been marked, but
     * weren't included in our target-item list), but it will almost always be
     * true and it doesn't seem worth an additional page scan to check it.
     * Remember that BTP_HAS_GARBAGE is only a hint anyway.
     */
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    opaque->btpo_flags &= ~BTP_HAS_GARBAGE;

    MarkBufferDirty(buf);

    /* XLOG stuff */
    if (RelationNeedsWAL(rel))
    {
        XLogRecPtr  recptr;
        xl_btree_delete xlrec_delete;

        xlrec_delete.latestRemovedXid = latestRemovedXid;
        xlrec_delete.nitems = nitems;

        XLogBeginInsert();
        XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
        XLogRegisterData((char *) &xlrec_delete, SizeOfBtreeDelete);

        /*
         * We need the target-offsets array whether or not we store the whole
         * buffer, to allow us to find the latestRemovedXid on a standby
         * server.
         */
        XLogRegisterData((char *) itemnos, nitems * sizeof(OffsetNumber));

        recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE);

        PageSetLSN(page, recptr);
    }

    END_CRIT_SECTION();
}

/*
 * Returns true if the given block has the half-dead flag set.
 */
static bool
_bt_is_page_halfdead(Relation rel, BlockNumber blk)
{
    Buffer      buf;
    Page        page;
    BTPageOpaque opaque;
    bool        result;

    buf = _bt_getbuf(rel, blk, BT_READ);
    page = BufferGetPage(buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);

    result = P_ISHALFDEAD(opaque);
    _bt_relbuf(rel, buf);

    return result;
}

/*
 * Subroutine to find the parent of the branch we're deleting.  This climbs
 * up the tree until it finds a page with more than one child, i.e. a page
 * that will not be totally emptied by the deletion.  The chain of pages below
 * it, with one downlink each, will form the branch that we need to delete.
 *
 * If we cannot remove the downlink from the parent, because it's the
 * rightmost entry, returns false.  On success, *topparent and *topoff are set
 * to the buffer holding the parent, and the offset of the downlink in it.
 * *topparent is write-locked, the caller is responsible for releasing it when
 * done.  *target is set to the topmost page in the branch to-be-deleted, i.e.
 * the page whose downlink *topparent / *topoff point to, and *rightsib to its
 * right sibling.
 *
 * "child" is the leaf page we wish to delete, and "stack" is a search stack
 * leading to it (it actually leads to the leftmost leaf page with a high key
 * matching that of the page to be deleted in !heapkeyspace indexes).  Note
 * that we will update the stack entry(s) to reflect current downlink
 * positions --- this is essentially the same as the corresponding step of
 * splitting, and is not expected to affect caller.  The caller should
 * initialize *target and *rightsib to the leaf page and its right sibling.
 *
 * Note: it's OK to release page locks on any internal pages between the leaf
 * and *topparent, because a safe deletion can't become unsafe due to
 * concurrent activity.  An internal page can only acquire an entry if the
 * child is split, but that cannot happen as long as we hold a lock on the
 * leaf.
 */
static bool
_bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack,
                       Buffer *topparent, OffsetNumber *topoff,
                       BlockNumber *target, BlockNumber *rightsib)
{
    BlockNumber parent;
    OffsetNumber poffset,
                maxoff;
    Buffer      pbuf;
    Page        page;
    BTPageOpaque opaque;
    BlockNumber leftsib;

    /*
     * Locate the downlink of "child" in the parent, updating the stack entry
     * if needed.  This is how !heapkeyspace indexes deal with having
     * non-unique high keys in leaf level pages.  Even heapkeyspace indexes
     * can have a stale stack due to insertions into the parent.
     */
    stack->bts_btentry = child;
    pbuf = _bt_getstackbuf(rel, stack);
    if (pbuf == InvalidBuffer)
        elog(ERROR, "failed to re-find parent key in index \"%s\" for deletion target page %u",
             RelationGetRelationName(rel), child);
    parent = stack->bts_blkno;
    poffset = stack->bts_offset;

    page = BufferGetPage(pbuf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    maxoff = PageGetMaxOffsetNumber(page);

    /*
     * If the target is the rightmost child of its parent, then we can't
     * delete, unless it's also the only child.
     */
    if (poffset >= maxoff)
    {
        /* It's the rightmost child... */
        if (poffset == P_FIRSTDATAKEY(opaque))
        {
            /*
             * It's the only child, so deletion is safe only if the parent
             * would itself be removable.  We have to check the parent itself,
             * and then recurse to test the conditions at the parent's parent.
             */
            if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) ||
                P_INCOMPLETE_SPLIT(opaque))
            {
                _bt_relbuf(rel, pbuf);
                return false;
            }

            *target = parent;
            *rightsib = opaque->btpo_next;
            leftsib = opaque->btpo_prev;

            _bt_relbuf(rel, pbuf);

            /*
             * Like in _bt_pagedel, check that the left sibling is not marked
             * with INCOMPLETE_SPLIT flag.  That would mean that there is no
             * downlink to the page to be deleted, and the page deletion
             * algorithm isn't prepared to handle that.
             */
            if (leftsib != P_NONE)
            {
                Buffer      lbuf;
                Page        lpage;
                BTPageOpaque lopaque;

                lbuf = _bt_getbuf(rel, leftsib, BT_READ);
                lpage = BufferGetPage(lbuf);
                lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);

                /*
                 * If the left sibling was concurrently split, so that its
                 * next-pointer doesn't point to the current page anymore, the
                 * split that created the current page must be completed.  (We
                 * don't allow splitting an incompletely-split page again
                 * until the previous split has been completed.)
                 */
                if (lopaque->btpo_next == parent &&
                    P_INCOMPLETE_SPLIT(lopaque))
                {
                    _bt_relbuf(rel, lbuf);
                    return false;
                }
                _bt_relbuf(rel, lbuf);
            }

            return _bt_lock_branch_parent(rel, parent, stack->bts_parent,
                                          topparent, topoff, target, rightsib);
        }
        else
        {
            /* Unsafe to delete */
            _bt_relbuf(rel, pbuf);
            return false;
        }
    }
    else
    {
        /* Not rightmost child, so safe to delete */
        *topparent = pbuf;
        *topoff = poffset;
        return true;
    }
}
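
/*
 * A rough picture of the "branch" identified above (illustrative sketch):
 * every page below *topparent has exactly one child, so removing the one
 * downlink at *topoff empties the whole chain.
 *
 *		*topparent
 *		    |      \
 *		 *target    (other children survive)
 *		    |
 *		   ...
 *		    |
 *		  child (leaf)
 */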

/*
 * _bt_pagedel() -- Delete a page from the b-tree, if legal to do so.
 *
 * This action unlinks the page from the b-tree structure, removing all
 * pointers leading to it --- but not touching its own left and right links.
 * The page cannot be physically reclaimed right away, since other processes
 * may currently be trying to follow links leading to the page; they have to
 * be allowed to use its right-link to recover.  See nbtree/README.
 *
 * On entry, the target buffer must be pinned and locked (either read or write
 * lock is OK).  This lock and pin will be dropped before exiting.
 *
 * Returns the number of pages successfully deleted (zero if page cannot
 * be deleted now; could be more than one if parent or sibling pages were
 * deleted too).
 *
 * NOTE: this leaks memory.  Rather than trying to clean up everything
 * carefully, it's better to run it in a temp context that can be reset
 * frequently.
 */
int
_bt_pagedel(Relation rel, Buffer buf)
{
    int         ndeleted = 0;
    BlockNumber rightsib;
    bool        rightsib_empty;
    Page        page;
    BTPageOpaque opaque;

    /*
     * "stack" is a search stack leading (approximately) to the target page.
     * It is initially NULL, but when iterating, we keep it to avoid
     * duplicated search effort.
     *
     * Also, when "stack" is not NULL, we have already checked that the
     * current page is not the right half of an incomplete split, i.e. the
     * left sibling does not have its INCOMPLETE_SPLIT flag set.
     */
    BTStack     stack = NULL;

    for (;;)
    {
        page = BufferGetPage(buf);
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);

        /*
         * Internal pages are never deleted directly, only as part of deleting
         * the whole branch all the way down to leaf level.
         */
        if (!P_ISLEAF(opaque))
        {
            /*
             * Pre-9.4 page deletion only marked internal pages as half-dead,
             * but now we only use that flag on leaf pages.  The old algorithm
             * was never supposed to leave half-dead pages in the tree; it was
             * just a transient state, but it was nevertheless possible in
             * error scenarios.  We don't know how to deal with them here.
             * They are harmless as far as searches are concerned, but inserts
             * into the deleted keyspace could add out-of-order downlinks in
             * the upper levels.  Log a notice; hopefully the admin will
             * notice and reindex.
             */
            if (P_ISHALFDEAD(opaque))
                ereport(LOG,
                        (errcode(ERRCODE_INDEX_CORRUPTED),
                         errmsg("index \"%s\" contains a half-dead internal page",
                                RelationGetRelationName(rel)),
                         errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));
            _bt_relbuf(rel, buf);
            return ndeleted;
        }

        /*
         * We can never delete rightmost pages nor root pages.  While at it,
         * check that page is not already deleted and is empty.
         *
         * To keep the algorithm simple, we also never delete an incompletely
         * split page (they should be rare enough that this doesn't make any
         * meaningful difference to disk usage):
         *
         * The INCOMPLETE_SPLIT flag on the page tells us if the page is the
         * left half of an incomplete split, but ensuring that it's not the
         * right half is more complicated.  For that, we have to check that
         * the left sibling doesn't have its INCOMPLETE_SPLIT flag set.  On
         * the first iteration, we temporarily release the lock on the
         * current page, and check the left sibling and also construct a
         * search stack to the current page.  On subsequent iterations, we
         * know we stepped right from a page that passed these tests, so it's
         * OK.
         */
        if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) ||
            P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
            P_INCOMPLETE_SPLIT(opaque))
        {
            /* Should never fail to delete a half-dead page */
            Assert(!P_ISHALFDEAD(opaque));

            _bt_relbuf(rel, buf);
            return ndeleted;
        }

        /*
         * First, remove downlink pointing to the page (or a parent of the
         * page, if we are going to delete a taller branch), and mark the page
         * as half-dead.
         */
        if (!P_ISHALFDEAD(opaque))
        {
            /*
             * We need an approximate pointer to the page's parent page.  We
             * use a variant of the standard search mechanism to search for
             * the page's high key; this will give us a link to either the
             * current parent or someplace to its left (if there are multiple
             * equal high keys, which is possible with !heapkeyspace indexes).
             *
             * Also check if this is the right-half of an incomplete split
             * (see comment above).
             */
            if (!stack)
            {
                BTScanInsert itup_key;
                ItemId      itemid;
                IndexTuple  targetkey;
                Buffer      lbuf;
                BlockNumber leftsib;

                itemid = PageGetItemId(page, P_HIKEY);
                targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid));

                leftsib = opaque->btpo_prev;

                /*
                 * To avoid deadlocks, we'd better drop the leaf page lock
                 * before going further.
                 */
                LockBuffer(buf, BUFFER_LOCK_UNLOCK);

                /*
                 * Fetch the left sibling, to check that it's not marked with
                 * INCOMPLETE_SPLIT flag.  That would mean that the page
                 * to-be-deleted doesn't have a downlink, and the page
                 * deletion algorithm isn't prepared to handle that.
                 */
                if (!P_LEFTMOST(opaque))
                {
                    BTPageOpaque lopaque;
                    Page        lpage;

                    lbuf = _bt_getbuf(rel, leftsib, BT_READ);
                    lpage = BufferGetPage(lbuf);
                    lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);

                    /*
                     * If the left sibling is split again by another backend,
                     * after we released the lock, we know that the first
                     * split must have finished, because we don't allow an
                     * incompletely-split page to be split again.  So we don't
                     * need to walk right here.
                     */
                    if (lopaque->btpo_next == BufferGetBlockNumber(buf) &&
                        P_INCOMPLETE_SPLIT(lopaque))
                    {
                        ReleaseBuffer(buf);
                        _bt_relbuf(rel, lbuf);
                        return ndeleted;
                    }
                    _bt_relbuf(rel, lbuf);
                }

                /* we need an insertion scan key for the search, so build one */
                itup_key = _bt_mkscankey(rel, targetkey);
                /* find the leftmost leaf page with matching pivot/high key */
                itup_key->pivotsearch = true;
                stack = _bt_search(rel, itup_key, &lbuf, BT_READ, NULL);
                /* don't need a lock or second pin on the page */
                _bt_relbuf(rel, lbuf);

                /*
                 * Re-lock the leaf page, and start over, to re-check that the
                 * page can still be deleted.
                 */
                LockBuffer(buf, BT_WRITE);
                continue;
            }

            if (!_bt_mark_page_halfdead(rel, buf, stack))
            {
                _bt_relbuf(rel, buf);
                return ndeleted;
            }
        }

        /*
         * Then unlink it from its siblings.  Each call to
         * _bt_unlink_halfdead_page unlinks the topmost page from the branch,
         * making it shallower.  Iterate until the leaf page is gone.
         */
        rightsib_empty = false;
        while (P_ISHALFDEAD(opaque))
        {
            /* will check for interrupts, once lock is released */
            if (!_bt_unlink_halfdead_page(rel, buf, &rightsib_empty))
            {
                /* _bt_unlink_halfdead_page already released buffer */
                return ndeleted;
            }
            ndeleted++;
        }

        rightsib = opaque->btpo_next;

        _bt_relbuf(rel, buf);

        /*
         * Check here, as calling loops will have locks held, preventing
         * interrupts from being processed.
         */
        CHECK_FOR_INTERRUPTS();

        /*
         * The page has now been deleted.  If its right sibling is completely
         * empty, it's possible that the reason we haven't deleted it earlier
         * is that it was the rightmost child of the parent.  Now that we
         * removed the downlink for this page, the right sibling might now be
         * the only child of the parent, and could be removed.  It would be
         * picked up by the next vacuum anyway, but might as well try to
         * remove it now, so loop back to process the right sibling.
         */
        if (!rightsib_empty)
            break;

        buf = _bt_getbuf(rel, rightsib, BT_WRITE);
    }

    return ndeleted;
}
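
/*
 * In practice VACUUM is the caller; btvacuumpage() does, in essence
 * (illustrative only):
 *
 *		ndel = _bt_pagedel(rel, buf);
 *		stats->pages_deleted += ndel;
 *
 * after switching into a dedicated memory context that is reset for each
 * call, which is what makes the memory leakage noted above tolerable.
 */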
1519
1520/*
1521 * First stage of page deletion. Remove the downlink to the top of the
1522 * branch being deleted, and mark the leaf page as half-dead.
1523 */
1524static bool
1525_bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
1526{
1527 BlockNumber leafblkno;
1528 BlockNumber leafrightsib;
1529 BlockNumber target;
1530 BlockNumber rightsib;
1531 ItemId itemid;
1532 Page page;
1533 BTPageOpaque opaque;
1534 Buffer topparent;
1535 OffsetNumber topoff;
1536 OffsetNumber nextoffset;
1537 IndexTuple itup;
1538 IndexTupleData trunctuple;
1539
1540 page = BufferGetPage(leafbuf);
1541 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1542
1543 Assert(!P_RIGHTMOST(opaque) && !P_ISROOT(opaque) && !P_ISDELETED(opaque) &&
1544 !P_ISHALFDEAD(opaque) && P_ISLEAF(opaque) &&
1545 P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
1546
1547 /*
1548 * Save info about the leaf page.
1549 */
1550 leafblkno = BufferGetBlockNumber(leafbuf);
1551 leafrightsib = opaque->btpo_next;
1552
1553 /*
1554 * Before attempting to lock the parent page, check that the right sibling
1555 * is not in half-dead state. A half-dead right sibling would have no
1556 * downlink in the parent, which would be highly confusing later when we
1557 * delete the downlink that follows the current page's downlink. (I
1558 * believe the deletion would work correctly, but it would fail the
1559 * cross-check we make that the following downlink points to the right
1560 * sibling of the delete page.)
1561 */
1562 if (_bt_is_page_halfdead(rel, leafrightsib))
1563 {
1564 elog(DEBUG1, "could not delete page %u because its right sibling %u is half-dead",
1565 leafblkno, leafrightsib);
1566 return false;
1567 }
1568
1569 /*
1570 * We cannot delete a page that is the rightmost child of its immediate
1571 * parent, unless it is the only child --- in which case the parent has to
1572 * be deleted too, and the same condition applies recursively to it. We
1573 * have to check this condition all the way up before trying to delete,
1574 * and lock the final parent of the to-be-deleted subtree.
1575 *
1576 * However, we won't need to repeat the above _bt_is_page_halfdead() check
1577 * for parent/ancestor pages because of the rightmost restriction. The
1578 * leaf check will apply to a right "cousin" leaf page rather than a
1579 * simple right sibling leaf page in cases where we actually go on to
1580 * perform internal page deletion. The right cousin leaf page is
1581 * representative of the left edge of the subtree to the right of the
1582 * to-be-deleted subtree as a whole. (Besides, internal pages are never
1583 * marked half-dead, so it isn't even possible to directly assess if an
1584 * internal page is part of some other to-be-deleted subtree.)
1585 */
1586 rightsib = leafrightsib;
1587 target = leafblkno;
1588 if (!_bt_lock_branch_parent(rel, leafblkno, stack,
1589 &topparent, &topoff, &target, &rightsib))
1590 return false;
1591
1592 /*
1593 * Check that the parent-page index items we're about to delete/overwrite
1594 * contain what we expect. This can fail if the index has become corrupt
1595 * for some reason. We want to throw any error before entering the
1596 * critical section --- otherwise it'd be a PANIC.
1597 *
1598 * The test on the target item is just an Assert because
1599 * _bt_lock_branch_parent should have guaranteed it has the expected
1600 * contents. The test on the next-child downlink is known to sometimes
1601 * fail in the field, though.
1602 */
1603 page = BufferGetPage(topparent);
1604 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1605
1606#ifdef USE_ASSERT_CHECKING
1607 itemid = PageGetItemId(page, topoff);
1608 itup = (IndexTuple) PageGetItem(page, itemid);
1609 Assert(BTreeInnerTupleGetDownLink(itup) == target);
1610#endif
1611
1612 nextoffset = OffsetNumberNext(topoff);
1613 itemid = PageGetItemId(page, nextoffset);
1614 itup = (IndexTuple) PageGetItem(page, itemid);
1615 if (BTreeInnerTupleGetDownLink(itup) != rightsib)
1616 elog(ERROR, "right sibling %u of block %u is not next child %u of block %u in index \"%s\"",
1617 rightsib, target, BTreeInnerTupleGetDownLink(itup),
1618 BufferGetBlockNumber(topparent), RelationGetRelationName(rel));
1619
1620 /*
1621 * Any insert which would have gone on the leaf block will now go to its
1622 * right sibling.
1623 */
1624 PredicateLockPageCombine(rel, leafblkno, leafrightsib);
1625
1626 /* No ereport(ERROR) until changes are logged */
1627 START_CRIT_SECTION();
1628
1629 /*
1630 * Update parent. The normal case is a tad tricky because we want to
1631 * delete the target's downlink and the *following* key. Easiest way is
1632 * to copy the right sibling's downlink over the target downlink, and then
1633 * delete the following item.
1634 */
1635 page = BufferGetPage(topparent);
1636 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1637
1638 itemid = PageGetItemId(page, topoff);
1639 itup = (IndexTuple) PageGetItem(page, itemid);
1640 BTreeInnerTupleSetDownLink(itup, rightsib);
1641
1642 nextoffset = OffsetNumberNext(topoff);
1643 PageIndexTupleDelete(page, nextoffset);
1644
1645 /*
1646 * Mark the leaf page as half-dead, and stamp it with a pointer to the
1647 * highest internal page in the branch we're deleting. We use the tid of
1648 * the high key to store it.
1649 */
1650 page = BufferGetPage(leafbuf);
1651 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1652 opaque->btpo_flags |= BTP_HALF_DEAD;
1653
	PageIndexTupleDelete(page, P_HIKEY);
	Assert(PageGetMaxOffsetNumber(page) == 0);
	MemSet(&trunctuple, 0, sizeof(IndexTupleData));
	trunctuple.t_info = sizeof(IndexTupleData);
	if (target != leafblkno)
		BTreeTupleSetTopParent(&trunctuple, target);
	else
		BTreeTupleSetTopParent(&trunctuple, InvalidBlockNumber);

	if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY,
					false, false) == InvalidOffsetNumber)
		elog(ERROR, "could not add dummy high key to half-dead page");

	/* Must mark buffers dirty before XLogInsert */
	MarkBufferDirty(topparent);
	MarkBufferDirty(leafbuf);

	/* XLOG stuff */
	if (RelationNeedsWAL(rel))
	{
		xl_btree_mark_page_halfdead xlrec;
		XLogRecPtr	recptr;

		xlrec.poffset = topoff;
		xlrec.leafblk = leafblkno;
		if (target != leafblkno)
			xlrec.topparent = target;
		else
			xlrec.topparent = InvalidBlockNumber;

		XLogBeginInsert();
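		/*
		 * The leaf page is registered with REGBUF_WILL_INIT because replay
		 * reconstructs it from the WAL record rather than applying a delta,
		 * so no full-page image is needed for it.
		 */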
		XLogRegisterBuffer(0, leafbuf, REGBUF_WILL_INIT);
		XLogRegisterBuffer(1, topparent, REGBUF_STANDARD);

		page = BufferGetPage(leafbuf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		xlrec.leftblk = opaque->btpo_prev;
		xlrec.rightblk = opaque->btpo_next;

		XLogRegisterData((char *) &xlrec, SizeOfBtreeMarkPageHalfDead);

		recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_MARK_PAGE_HALFDEAD);

		page = BufferGetPage(topparent);
		PageSetLSN(page, recptr);
		page = BufferGetPage(leafbuf);
		PageSetLSN(page, recptr);
	}

	END_CRIT_SECTION();

	_bt_relbuf(rel, topparent);
	return true;
}

/*
 * Unlink a page in a branch of half-dead pages from its siblings.
 *
 * If the leaf page still has a downlink pointing to it, unlinks the highest
 * parent in the to-be-deleted branch instead of the leaf page. To get rid
 * of the whole branch, including the leaf page itself, iterate until the
 * leaf page is deleted.
 *
 * Returns 'false' if the page could not be unlinked (shouldn't happen).
 * If the (new) right sibling of the page is empty, *rightsib_empty is set
 * to true.
 *
 * Must hold pin and lock on leafbuf at entry (read or write doesn't matter).
 * On success exit, we'll be holding pin and write lock. On failure exit,
 * we'll release both pin and lock before returning (we define it that way
 * to avoid having to reacquire a lock we already released).
 */
static bool
_bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
{
	BlockNumber leafblkno = BufferGetBlockNumber(leafbuf);
	BlockNumber leafleftsib;
	BlockNumber leafrightsib;
	BlockNumber target;
	BlockNumber leftsib;
	BlockNumber rightsib;
	Buffer		lbuf = InvalidBuffer;
	Buffer		buf;
	Buffer		rbuf;
	Buffer		metabuf = InvalidBuffer;
	Page		metapg = NULL;
	BTMetaPageData *metad = NULL;
	ItemId		itemid;
	Page		page;
	BTPageOpaque opaque;
	bool		rightsib_is_rightmost;
	int			targetlevel;
	IndexTuple	leafhikey;
	BlockNumber nextchild;

	page = BufferGetPage(leafbuf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);

	Assert(P_ISLEAF(opaque) && P_ISHALFDEAD(opaque));

	/*
	 * Remember some information about the leaf page.
	 */
	itemid = PageGetItemId(page, P_HIKEY);
	leafhikey = (IndexTuple) PageGetItem(page, itemid);
	leafleftsib = opaque->btpo_prev;
	leafrightsib = opaque->btpo_next;

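	/*
	 * Release the leaf page's lock (the pin is kept), both to allow an
	 * interrupt check and because the locks taken below must be acquired in
	 * the standard order described there.
	 */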
	LockBuffer(leafbuf, BUFFER_LOCK_UNLOCK);

	/*
	 * Check here, as calling loops will have locks held, preventing
	 * interrupts from being processed.
	 */
	CHECK_FOR_INTERRUPTS();

	/*
	 * If the leaf page still has a parent pointing to it (or a chain of
	 * parents), we don't unlink the leaf page yet, but the topmost remaining
	 * parent in the branch. Set 'target' and 'buf' to reference the page
	 * actually being unlinked.
	 */
	target = BTreeTupleGetTopParent(leafhikey);

	if (target != InvalidBlockNumber)
	{
		Assert(target != leafblkno);

		/* fetch the block number of the topmost parent's left sibling */
		buf = _bt_getbuf(rel, target, BT_READ);
		page = BufferGetPage(buf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		leftsib = opaque->btpo_prev;
		targetlevel = opaque->btpo.level;

		/*
		 * To avoid deadlocks, we'd better drop the target page lock before
		 * going further.
		 */
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
	}
	else
	{
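		/* No top-parent link: the leaf page itself is the page to unlink */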
		target = leafblkno;

		buf = leafbuf;
		leftsib = leafleftsib;
		targetlevel = 0;
	}

	/*
	 * We have to lock the pages we need to modify in the standard order:
	 * moving right, then up. Else we will deadlock against other writers.
	 *
	 * So, first lock the leaf page, if it's not the target. Then find and
	 * write-lock the current left sibling of the target page. The sibling
	 * that was current a moment ago could have split, so we may have to move
	 * right. This search could fail if either the sibling or the target page
	 * was deleted by someone else meanwhile; if so, give up. (Right now,
	 * that should never happen, since page deletion is only done in VACUUM
	 * and there shouldn't be multiple VACUUMs concurrently on the same
	 * table.)
	 */
	if (target != leafblkno)
		LockBuffer(leafbuf, BT_WRITE);
	if (leftsib != P_NONE)
	{
		lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
		page = BufferGetPage(lbuf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		while (P_ISDELETED(opaque) || opaque->btpo_next != target)
		{
			/* step right one page */
			leftsib = opaque->btpo_next;
			_bt_relbuf(rel, lbuf);

			/*
			 * It'd be good to check for interrupts here, but it's not easy to
			 * do so because a lock is always held. This block isn't
			 * frequently reached, so hopefully the consequences of not
			 * checking interrupts aren't too bad.
			 */

			if (leftsib == P_NONE)
			{
				elog(LOG, "no left sibling (concurrent deletion?) of block %u in \"%s\"",
					 target,
					 RelationGetRelationName(rel));
				if (target != leafblkno)
				{
					/* we have only a pin on target, but pin+lock on leafbuf */
					ReleaseBuffer(buf);
					_bt_relbuf(rel, leafbuf);
				}
				else
				{
					/* we have only a pin on leafbuf */
					ReleaseBuffer(leafbuf);
				}
				return false;
			}
			lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
			page = BufferGetPage(lbuf);
			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		}
	}
	else
		lbuf = InvalidBuffer;

	/*
	 * Next write-lock the target page itself. It should be okay to take just
	 * a write lock not a superexclusive lock, since no scans would stop on an
	 * empty page.
	 */
	LockBuffer(buf, BT_WRITE);
	page = BufferGetPage(buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);

	/*
	 * Check page is still empty etc, else abandon deletion. This is just for
	 * paranoia's sake; a half-dead page cannot resurrect because there can be
	 * only one vacuum process running at a time.
	 */
	if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque))
	{
		elog(ERROR, "half-dead page changed status unexpectedly in block %u of index \"%s\"",
			 target, RelationGetRelationName(rel));
	}
	if (opaque->btpo_prev != leftsib)
		elog(ERROR, "left link changed unexpectedly in block %u of index \"%s\"",
			 target, RelationGetRelationName(rel));

	if (target == leafblkno)
	{
		if (P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
			!P_ISLEAF(opaque) || !P_ISHALFDEAD(opaque))
			elog(ERROR, "half-dead page changed status unexpectedly in block %u of index \"%s\"",
				 target, RelationGetRelationName(rel));
		nextchild = InvalidBlockNumber;
	}
	else
	{
		if (P_FIRSTDATAKEY(opaque) != PageGetMaxOffsetNumber(page) ||
			P_ISLEAF(opaque))
			elog(ERROR, "half-dead page changed status unexpectedly in block %u of index \"%s\"",
				 target, RelationGetRelationName(rel));

		/* remember the next non-leaf child down in the branch. */
		itemid = PageGetItemId(page, P_FIRSTDATAKEY(opaque));
		nextchild = BTreeInnerTupleGetDownLink((IndexTuple) PageGetItem(page, itemid));
		if (nextchild == leafblkno)
			nextchild = InvalidBlockNumber;
	}

	/*
	 * And next write-lock the (current) right sibling.
	 */
	rightsib = opaque->btpo_next;
	rbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
	page = BufferGetPage(rbuf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	if (opaque->btpo_prev != target)
		elog(ERROR, "right sibling's left-link doesn't match: "
			 "block %u links to %u instead of expected %u in index \"%s\"",
			 rightsib, opaque->btpo_prev, target,
			 RelationGetRelationName(rel));
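	/*
	 * Record whether the right sibling is now rightmost on its level and
	 * whether it is empty, so that the caller can decide whether to attempt
	 * deleting the right sibling as well.
	 */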
	rightsib_is_rightmost = P_RIGHTMOST(opaque);
	*rightsib_empty = (P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));

	/*
	 * If we are deleting the next-to-last page on the target's level, then
	 * the rightsib is a candidate to become the new fast root. (In theory, it
	 * might be possible to push the fast root even further down, but the odds
	 * of doing so are slim, and the locking considerations daunting.)
	 *
	 * We don't support handling this in the case where the parent is becoming
	 * half-dead, even though it theoretically could occur.
	 *
	 * We can safely acquire a lock on the metapage here --- see comments for
	 * _bt_newroot().
	 */
	if (leftsib == P_NONE && rightsib_is_rightmost)
	{
		page = BufferGetPage(rbuf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		if (P_RIGHTMOST(opaque))
		{
			/* rightsib will be the only one left on the level */
			metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
			metapg = BufferGetPage(metabuf);
			metad = BTPageGetMeta(metapg);

			/*
			 * The expected case here is btm_fastlevel == targetlevel+1; if
			 * the fastlevel is <= targetlevel, something is wrong, and we
			 * choose to overwrite it to fix it.
			 */
			if (metad->btm_fastlevel > targetlevel + 1)
			{
				/* no update wanted */
				_bt_relbuf(rel, metabuf);
				metabuf = InvalidBuffer;
			}
		}
	}

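	/*
	 * From here on, a valid metabuf means that the fast root will be moved
	 * down to rightsib inside the critical section below.
	 */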
	/*
	 * Here we begin doing the deletion.
	 */

	/* No ereport(ERROR) until changes are logged */
	START_CRIT_SECTION();

	/*
	 * Update siblings' side-links. Note the target page's side-links will
	 * continue to point to the siblings. Asserts here are just rechecking
	 * things we already verified above.
	 */
	if (BufferIsValid(lbuf))
	{
		page = BufferGetPage(lbuf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		Assert(opaque->btpo_next == target);
		opaque->btpo_next = rightsib;
	}
	page = BufferGetPage(rbuf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	Assert(opaque->btpo_prev == target);
	opaque->btpo_prev = leftsib;

	/*
	 * If we deleted a parent of the targeted leaf page, instead of the leaf
	 * itself, update the leaf to point to the next remaining child in the
	 * branch.
	 */
	if (target != leafblkno)
		BTreeTupleSetTopParent(leafhikey, nextchild);

	/*
	 * Mark the page itself deleted. It can be recycled when all current
	 * transactions are gone. Storing GetTopTransactionId() would work, but
	 * we're in VACUUM and would not otherwise have an XID. Having already
	 * updated links to the target, ReadNewTransactionId() suffices as an
	 * upper bound. Any scan having retained a now-stale link is advertising
	 * in its PGXACT an xmin less than or equal to the value we read here. It
	 * will continue to do so, holding back RecentGlobalXmin, for the duration
	 * of that scan.
	 */
	page = BufferGetPage(buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	opaque->btpo_flags &= ~BTP_HALF_DEAD;
	opaque->btpo_flags |= BTP_DELETED;
	opaque->btpo.xact = ReadNewTransactionId();

	/* And update the metapage, if needed */
	if (BufferIsValid(metabuf))
	{
		/* upgrade metapage if needed */
		if (metad->btm_version < BTREE_NOVAC_VERSION)
			_bt_upgrademetapage(metapg);
		metad->btm_fastroot = rightsib;
		metad->btm_fastlevel = targetlevel;
		MarkBufferDirty(metabuf);
	}

	/* Must mark buffers dirty before XLogInsert */
	MarkBufferDirty(rbuf);
	MarkBufferDirty(buf);
	if (BufferIsValid(lbuf))
		MarkBufferDirty(lbuf);
	if (target != leafblkno)
		MarkBufferDirty(leafbuf);

	/* XLOG stuff */
	if (RelationNeedsWAL(rel))
	{
		xl_btree_unlink_page xlrec;
		xl_btree_metadata xlmeta;
		uint8		xlinfo;
		XLogRecPtr	recptr;

		XLogBeginInsert();

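		/*
		 * The target page (and the leaf page, when it isn't the target) is
		 * registered with REGBUF_WILL_INIT because replay re-initializes it
		 * from the WAL record, so no full-page image is needed for it.
		 */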
		XLogRegisterBuffer(0, buf, REGBUF_WILL_INIT);
		if (BufferIsValid(lbuf))
			XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
		XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD);
		if (target != leafblkno)
			XLogRegisterBuffer(3, leafbuf, REGBUF_WILL_INIT);

		/* information on the unlinked block */
		xlrec.leftsib = leftsib;
		xlrec.rightsib = rightsib;
		xlrec.btpo_xact = opaque->btpo.xact;

		/* information needed to recreate the leaf block (if not the target) */
		xlrec.leafleftsib = leafleftsib;
		xlrec.leafrightsib = leafrightsib;
		xlrec.topparent = nextchild;

		XLogRegisterData((char *) &xlrec, SizeOfBtreeUnlinkPage);

		if (BufferIsValid(metabuf))
		{
			XLogRegisterBuffer(4, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);

			Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
			xlmeta.version = metad->btm_version;
			xlmeta.root = metad->btm_root;
			xlmeta.level = metad->btm_level;
			xlmeta.fastroot = metad->btm_fastroot;
			xlmeta.fastlevel = metad->btm_fastlevel;
			xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
			xlmeta.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;

			XLogRegisterBufData(4, (char *) &xlmeta, sizeof(xl_btree_metadata));
			xlinfo = XLOG_BTREE_UNLINK_PAGE_META;
		}
		else
			xlinfo = XLOG_BTREE_UNLINK_PAGE;

		recptr = XLogInsert(RM_BTREE_ID, xlinfo);

		if (BufferIsValid(metabuf))
		{
			PageSetLSN(metapg, recptr);
		}
		page = BufferGetPage(rbuf);
		PageSetLSN(page, recptr);
		page = BufferGetPage(buf);
		PageSetLSN(page, recptr);
		if (BufferIsValid(lbuf))
		{
			page = BufferGetPage(lbuf);
			PageSetLSN(page, recptr);
		}
		if (target != leafblkno)
		{
			page = BufferGetPage(leafbuf);
			PageSetLSN(page, recptr);
		}
	}

	END_CRIT_SECTION();

	/* release metapage */
	if (BufferIsValid(metabuf))
		_bt_relbuf(rel, metabuf);

	/* release siblings */
	if (BufferIsValid(lbuf))
		_bt_relbuf(rel, lbuf);
	_bt_relbuf(rel, rbuf);

	/*
	 * Release the target, if it was not the leaf block. The leaf is always
	 * kept locked.
	 */
	if (target != leafblkno)
		_bt_relbuf(rel, buf);

	return true;
}
