/*-------------------------------------------------------------------------
 *
 * nbtxlog.c
 *	  WAL replay logic for btrees.
 *
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/nbtree/nbtxlog.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/bufmask.h"
#include "access/nbtree.h"
#include "access/nbtxlog.h"
#include "access/transam.h"
#include "access/xlog.h"
#include "access/xlogutils.h"
#include "storage/procarray.h"
#include "miscadmin.h"

/*
 * _bt_restore_page -- re-enter all the index tuples on a page
 *
 * The page is freshly init'd, and *from (length len) is a copy of what
 * had been its upper part (pd_upper to pd_special).  We assume that the
 * tuples had been added to the page in item-number order, and therefore
 * the one with highest item number appears first (lowest on the page).
 */
static void
_bt_restore_page(Page page, char *from, int len)
{
	IndexTupleData itupdata;
	Size		itemsz;
	char	   *end = from + len;
	Item		items[MaxIndexTuplesPerPage];
	uint16		itemsizes[MaxIndexTuplesPerPage];
	int			i;
	int			nitems;

	/*
	 * To get the items back in the original order, we add them to the page
	 * in reverse.  To figure out where one tuple ends and another begins, we
	 * have to scan them in forward order first.
	 */
	i = 0;
	while (from < end)
	{
		/*
		 * As we step through the items, 'from' won't always be properly
		 * aligned, so we need to use memcpy().  Further, we use Item (which
		 * is just a char*) here for our items array for the same reason;
		 * wouldn't want the compiler or anyone thinking that an item is
		 * aligned when it isn't.
		 */
		memcpy(&itupdata, from, sizeof(IndexTupleData));
		itemsz = IndexTupleSize(&itupdata);
		itemsz = MAXALIGN(itemsz);

		items[i] = (Item) from;
		itemsizes[i] = itemsz;
		i++;

		from += itemsz;
	}
	nitems = i;

	for (i = nitems - 1; i >= 0; i--)
	{
		if (PageAddItem(page, items[i], itemsizes[i], nitems - i,
						false, false) == InvalidOffsetNumber)
			elog(PANIC, "_bt_restore_page: cannot add item to page");
		from += itemsz;
	}
}
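
/*
 * For illustration only: a minimal sketch of how the logging side could
 * produce the "from"/"len" image that _bt_restore_page() consumes.  The real
 * registration happens in the split/newroot WAL-insert paths in nbtinsert.c;
 * the helper name below is hypothetical and the block is compiled out.
 */
#ifdef NOT_USED
static void
_bt_register_restorable_image(uint8 block_id, Page page)
{
	PageHeader	phdr = (PageHeader) page;

	/*
	 * The restorable image is simply the used upper part of the page,
	 * pd_upper..pd_special: the index tuples laid out back to back, highest
	 * item number first, exactly as _bt_restore_page() expects.
	 */
	XLogRegisterBufData(block_id,
						(char *) page + phdr->pd_upper,
						phdr->pd_special - phdr->pd_upper);
}
#endif							/* NOT_USED */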

static void
_bt_restore_meta(XLogReaderState *record, uint8 block_id)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	Buffer		metabuf;
	Page		metapg;
	BTMetaPageData *md;
	BTPageOpaque pageop;
	xl_btree_metadata *xlrec;
	char	   *ptr;
	Size		len;

	metabuf = XLogInitBufferForRedo(record, block_id);
	ptr = XLogRecGetBlockData(record, block_id, &len);

	Assert(len == sizeof(xl_btree_metadata));
	Assert(BufferGetBlockNumber(metabuf) == BTREE_METAPAGE);
	xlrec = (xl_btree_metadata *) ptr;
	metapg = BufferGetPage(metabuf);

	_bt_pageinit(metapg, BufferGetPageSize(metabuf));

	md = BTPageGetMeta(metapg);
	md->btm_magic = BTREE_MAGIC;
	md->btm_version = xlrec->version;
	md->btm_root = xlrec->root;
	md->btm_level = xlrec->level;
	md->btm_fastroot = xlrec->fastroot;
	md->btm_fastlevel = xlrec->fastlevel;
	/* Cannot log BTREE_MIN_VERSION index metapage without upgrade */
	Assert(md->btm_version >= BTREE_NOVAC_VERSION);
	md->btm_oldest_btpo_xact = xlrec->oldest_btpo_xact;
	md->btm_last_cleanup_num_heap_tuples = xlrec->last_cleanup_num_heap_tuples;

	pageop = (BTPageOpaque) PageGetSpecialPointer(metapg);
	pageop->btpo_flags = BTP_META;

	/*
	 * Set pd_lower just past the end of the metadata.  This is essential,
	 * because without doing so, metadata will be lost if xlog.c compresses
	 * the page.
	 */
	((PageHeader) metapg)->pd_lower =
		((char *) md + sizeof(BTMetaPageData)) - (char *) metapg;

	PageSetLSN(metapg, lsn);
	MarkBufferDirty(metabuf);
	UnlockReleaseBuffer(metabuf);
}
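
/*
 * For illustration only: given the usual definition of BTPageGetMeta() (the
 * metadata begins at PageGetContents(), i.e. at
 * MAXALIGN(SizeOfPageHeaderData)), the pd_lower set above works out to the
 * constant shown below.  Hypothetical check, compiled out.
 */
#ifdef NOT_USED
static void
_bt_check_restored_meta_pd_lower(Page metapg)
{
	Assert(((PageHeader) metapg)->pd_lower ==
		   MAXALIGN(SizeOfPageHeaderData) + sizeof(BTMetaPageData));
}
#endif							/* NOT_USED */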

/*
 * _bt_clear_incomplete_split -- clear INCOMPLETE_SPLIT flag on a page
 *
 * This is a common subroutine of the redo functions of all the WAL record
 * types that can insert a downlink: insert, split, and newroot.
 */
static void
_bt_clear_incomplete_split(XLogReaderState *record, uint8 block_id)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	Buffer		buf;

	if (XLogReadBufferForRedo(record, block_id, &buf) == BLK_NEEDS_REDO)
	{
		Page		page = (Page) BufferGetPage(buf);
		BTPageOpaque pageop = (BTPageOpaque) PageGetSpecialPointer(page);

		Assert(P_INCOMPLETE_SPLIT(pageop));
		pageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;

		PageSetLSN(page, lsn);
		MarkBufferDirty(buf);
	}
	if (BufferIsValid(buf))
		UnlockReleaseBuffer(buf);
}

static void
btree_xlog_insert(bool isleaf, bool ismeta, XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_btree_insert *xlrec = (xl_btree_insert *) XLogRecGetData(record);
	Buffer		buffer;
	Page		page;

	/*
	 * Insertion to an internal page finishes an incomplete split at the
	 * child level.  Clear the incomplete-split flag in the child.  Note:
	 * during normal operation, the child and parent pages are locked at the
	 * same time, so that clearing the flag and inserting the downlink appear
	 * atomic to other backends.  We don't bother with that during replay,
	 * because readers don't care about the incomplete-split flag and there
	 * cannot be updates happening.
	 */
	if (!isleaf)
		_bt_clear_incomplete_split(record, 1);
	if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
	{
		Size		datalen;
		char	   *datapos = XLogRecGetBlockData(record, 0, &datalen);

		page = BufferGetPage(buffer);

		if (PageAddItem(page, (Item) datapos, datalen, xlrec->offnum,
						false, false) == InvalidOffsetNumber)
			elog(PANIC, "btree_xlog_insert: failed to add item");

		PageSetLSN(page, lsn);
		MarkBufferDirty(buffer);
	}
	if (BufferIsValid(buffer))
		UnlockReleaseBuffer(buffer);

	/*
	 * Note: in normal operation, we'd update the metapage while still
	 * holding lock on the page we inserted into.  But during replay it's not
	 * necessary to hold that lock, since no other index updates can be
	 * happening concurrently, and readers will cope fine with following an
	 * obsolete link from the metapage.
	 */
	if (ismeta)
		_bt_restore_meta(record, 2);
}
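
/*
 * For illustration only: the block 0 payload applied above is assumed to be
 * just the new index tuple, registered on the primary roughly as sketched
 * below.  The real call sits in the insert WAL-logging path in nbtinsert.c;
 * the helper name is hypothetical and the block is compiled out.
 */
#ifdef NOT_USED
static void
_bt_register_insert_payload(IndexTuple itup)
{
	/* the whole tuple becomes the registered block data for block 0 */
	XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup));
}
#endif							/* NOT_USED */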

static void
btree_xlog_split(bool onleft, XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_btree_split *xlrec = (xl_btree_split *) XLogRecGetData(record);
	bool		isleaf = (xlrec->level == 0);
	Buffer		lbuf;
	Buffer		rbuf;
	Page		rpage;
	BTPageOpaque ropaque;
	char	   *datapos;
	Size		datalen;
	BlockNumber leftsib;
	BlockNumber rightsib;
	BlockNumber rnext;

	XLogRecGetBlockTag(record, 0, NULL, NULL, &leftsib);
	XLogRecGetBlockTag(record, 1, NULL, NULL, &rightsib);
	if (!XLogRecGetBlockTag(record, 2, NULL, NULL, &rnext))
		rnext = P_NONE;

	/*
	 * Clear the incomplete split flag on the left sibling of the child page
	 * this is a downlink for.  (Like in btree_xlog_insert, this can be done
	 * before locking the other pages)
	 */
	if (!isleaf)
		_bt_clear_incomplete_split(record, 3);

	/* Reconstruct right (new) sibling page from scratch */
	rbuf = XLogInitBufferForRedo(record, 1);
	datapos = XLogRecGetBlockData(record, 1, &datalen);
	rpage = (Page) BufferGetPage(rbuf);

	_bt_pageinit(rpage, BufferGetPageSize(rbuf));
	ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage);

	ropaque->btpo_prev = leftsib;
	ropaque->btpo_next = rnext;
	ropaque->btpo.level = xlrec->level;
	ropaque->btpo_flags = isleaf ? BTP_LEAF : 0;
	ropaque->btpo_cycleid = 0;

	_bt_restore_page(rpage, datapos, datalen);

	PageSetLSN(rpage, lsn);
	MarkBufferDirty(rbuf);

	/* Now reconstruct left (original) sibling page */
	if (XLogReadBufferForRedo(record, 0, &lbuf) == BLK_NEEDS_REDO)
	{
		/*
		 * To retain the same physical order of the tuples that they had, we
		 * initialize a temporary empty page for the left page and add all
		 * the items to that in item number order.  This mirrors how
		 * _bt_split() works.  Retaining the same physical order makes WAL
		 * consistency checking possible.  See also _bt_restore_page(), which
		 * does the same for the right page.
		 */
		Page		lpage = (Page) BufferGetPage(lbuf);
		BTPageOpaque lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);
		OffsetNumber off;
		IndexTuple	newitem = NULL,
					left_hikey = NULL;
		Size		newitemsz = 0,
					left_hikeysz = 0;
		Page		newlpage;
		OffsetNumber leftoff;

		datapos = XLogRecGetBlockData(record, 0, &datalen);

		if (onleft)
		{
			newitem = (IndexTuple) datapos;
			newitemsz = MAXALIGN(IndexTupleSize(newitem));
			datapos += newitemsz;
			datalen -= newitemsz;
		}

		/* Extract left hikey and its size (assuming 16-bit alignment) */
		left_hikey = (IndexTuple) datapos;
		left_hikeysz = MAXALIGN(IndexTupleSize(left_hikey));
		datapos += left_hikeysz;
		datalen -= left_hikeysz;

		Assert(datalen == 0);

		newlpage = PageGetTempPageCopySpecial(lpage);

		/* Set high key */
		leftoff = P_HIKEY;
		if (PageAddItem(newlpage, (Item) left_hikey, left_hikeysz,
						P_HIKEY, false, false) == InvalidOffsetNumber)
			elog(PANIC, "failed to add high key to left page after split");
		leftoff = OffsetNumberNext(leftoff);

		for (off = P_FIRSTDATAKEY(lopaque); off < xlrec->firstright; off++)
		{
			ItemId		itemid;
			Size		itemsz;
			IndexTuple	item;

			/* add the new item if it was inserted on left page */
			if (onleft && off == xlrec->newitemoff)
			{
				if (PageAddItem(newlpage, (Item) newitem, newitemsz, leftoff,
								false, false) == InvalidOffsetNumber)
					elog(ERROR, "failed to add new item to left page after split");
				leftoff = OffsetNumberNext(leftoff);
			}

			itemid = PageGetItemId(lpage, off);
			itemsz = ItemIdGetLength(itemid);
			item = (IndexTuple) PageGetItem(lpage, itemid);
			if (PageAddItem(newlpage, (Item) item, itemsz, leftoff,
							false, false) == InvalidOffsetNumber)
				elog(ERROR, "failed to add old item to left page after split");
			leftoff = OffsetNumberNext(leftoff);
		}

		/* cope with possibility that newitem goes at the end */
		if (onleft && off == xlrec->newitemoff)
		{
			if (PageAddItem(newlpage, (Item) newitem, newitemsz, leftoff,
							false, false) == InvalidOffsetNumber)
				elog(ERROR, "failed to add new item to left page after split");
			leftoff = OffsetNumberNext(leftoff);
		}

		PageRestoreTempPage(newlpage, lpage);

		/* Fix opaque fields */
		lopaque->btpo_flags = BTP_INCOMPLETE_SPLIT;
		if (isleaf)
			lopaque->btpo_flags |= BTP_LEAF;
		lopaque->btpo_next = rightsib;
		lopaque->btpo_cycleid = 0;

		PageSetLSN(lpage, lsn);
		MarkBufferDirty(lbuf);
	}

	/* We no longer need the buffers */
	if (BufferIsValid(lbuf))
		UnlockReleaseBuffer(lbuf);
	UnlockReleaseBuffer(rbuf);

	/*
	 * Fix left-link of the page to the right of the new right sibling.
	 *
	 * Note: in normal operation, we do this while still holding lock on the
	 * two split pages.  However, that's not necessary for correctness in WAL
	 * replay, because no other index update can be in progress, and readers
	 * will cope properly when following an obsolete left-link.
	 */
	if (rnext != P_NONE)
	{
		Buffer		buffer;

		if (XLogReadBufferForRedo(record, 2, &buffer) == BLK_NEEDS_REDO)
		{
			Page		page = (Page) BufferGetPage(buffer);
			BTPageOpaque pageop = (BTPageOpaque) PageGetSpecialPointer(page);

			pageop->btpo_prev = rightsib;

			PageSetLSN(page, lsn);
			MarkBufferDirty(buffer);
		}
		if (BufferIsValid(buffer))
			UnlockReleaseBuffer(buffer);
	}
}
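
/*
 * For illustration only: btree_xlog_split() above assumes the block 0 (left
 * page) payload is laid out as
 *
 *		[new item, only when it went to the left half]  [left page's high key]
 *
 * with each piece registered at its MAXALIGN'd size.  A hypothetical
 * logging-side counterpart might look like this (compiled out):
 */
#ifdef NOT_USED
static void
_bt_register_split_left_payload(bool newitemonleft,
								IndexTuple newitem,
								IndexTuple lefthikey)
{
	if (newitemonleft)
		XLogRegisterBufData(0, (char *) newitem,
							MAXALIGN(IndexTupleSize(newitem)));
	XLogRegisterBufData(0, (char *) lefthikey,
						MAXALIGN(IndexTupleSize(lefthikey)));
}
#endif							/* NOT_USED */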

static void
btree_xlog_vacuum(XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	Buffer		buffer;
	Page		page;
	BTPageOpaque opaque;
#ifdef UNUSED
	xl_btree_vacuum *xlrec = (xl_btree_vacuum *) XLogRecGetData(record);

	/*
	 * This section of code is thought to be no longer needed, after analysis
	 * of the calling paths.  It is retained to allow the code to be
	 * reinstated if a flaw is revealed in that thinking.
	 *
	 * If we are running non-MVCC scans using this index we need to do some
	 * additional work to ensure correctness, which is known as a "pin scan"
	 * described in more detail in next paragraphs.  We used to do the extra
	 * work in all cases, whereas we now avoid that work in most cases.  If
	 * lastBlockVacuumed is set to InvalidBlockNumber then we skip the
	 * additional work required for the pin scan.
	 *
	 * Avoiding this extra work is important since it requires us to touch
	 * every page in the index, so is an O(N) operation.  Worse, it is an
	 * operation performed in the foreground during redo, so it delays
	 * replication directly.
	 *
	 * If queries might be active then we need to ensure every leaf page is
	 * unpinned between the lastBlockVacuumed and the current block, if there
	 * are any.  This prevents replay of the VACUUM from reaching the stage
	 * of removing heap tuples while there could still be indexscans "in
	 * flight" to those particular tuples for those scans which could be
	 * confused by finding new tuples at the old TID locations (see
	 * nbtree/README).
	 *
	 * It might be worth checking if there are actually any backends running;
	 * if not, we could just skip this.
	 *
	 * Since VACUUM can visit leaf pages out-of-order, it might issue records
	 * with lastBlockVacuumed >= block; that's not an error, it just means
	 * nothing to do now.
	 *
	 * Note: since we touch all pages in the range, we will lock non-leaf
	 * pages, and also any empty (all-zero) pages that may be in the index.
	 * It doesn't seem worth the complexity to avoid that.  But it's
	 * important that HotStandbyActiveInReplay() will not return true if the
	 * database isn't yet consistent; so we need not fear reading
	 * still-corrupt blocks here during crash recovery.
	 */
	if (HotStandbyActiveInReplay() && BlockNumberIsValid(xlrec->lastBlockVacuumed))
	{
		RelFileNode thisrnode;
		BlockNumber thisblkno;
		BlockNumber blkno;

		XLogRecGetBlockTag(record, 0, &thisrnode, NULL, &thisblkno);

		for (blkno = xlrec->lastBlockVacuumed + 1; blkno < thisblkno; blkno++)
		{
			/*
			 * We use RBM_NORMAL_NO_LOG mode because it's not an error
			 * condition to see all-zero pages.  The original btvacuumpage
			 * scan would have skipped over all-zero pages, noting them in
			 * FSM but not bothering to initialize them just yet; so we
			 * mustn't throw an error here.  (We could skip acquiring the
			 * cleanup lock if PageIsNew, but it's probably not worth the
			 * cycles to test.)
			 *
			 * XXX we don't actually need to read the block, we just need to
			 * confirm it is unpinned.  If we had a special call into the
			 * buffer manager we could optimise this so that if the block is
			 * not in shared_buffers we confirm it as unpinned.  Optimizing
			 * this is now moot, since in most cases we avoid the scan.
			 */
			buffer = XLogReadBufferExtended(thisrnode, MAIN_FORKNUM, blkno,
											RBM_NORMAL_NO_LOG);
			if (BufferIsValid(buffer))
			{
				LockBufferForCleanup(buffer);
				UnlockReleaseBuffer(buffer);
			}
		}
	}
#endif

	/*
	 * Like in btvacuumpage(), we need to take a cleanup lock on every leaf
	 * page.  See nbtree/README for details.
	 */
	if (XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &buffer)
		== BLK_NEEDS_REDO)
	{
		char	   *ptr;
		Size		len;

		ptr = XLogRecGetBlockData(record, 0, &len);

		page = (Page) BufferGetPage(buffer);

		if (len > 0)
		{
			OffsetNumber *unused;
			OffsetNumber *unend;

			unused = (OffsetNumber *) ptr;
			unend = (OffsetNumber *) ((char *) ptr + len);

			if ((unend - unused) > 0)
				PageIndexMultiDelete(page, unused, unend - unused);
		}

		/*
		 * Mark the page as not containing any LP_DEAD items --- see comments
		 * in _bt_delitems_vacuum().
		 */
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		opaque->btpo_flags &= ~BTP_HAS_GARBAGE;

		PageSetLSN(page, lsn);
		MarkBufferDirty(buffer);
	}
	if (BufferIsValid(buffer))
		UnlockReleaseBuffer(buffer);
}
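
/*
 * For illustration only: the block data decoded above is assumed to be a
 * bare array of OffsetNumbers naming the line pointers to remove, which is
 * what _bt_delitems_vacuum() would register on the primary.  Hypothetical
 * sketch, compiled out.
 */
#ifdef NOT_USED
static void
_bt_register_vacuum_payload(OffsetNumber *itemnos, int nitems)
{
	if (nitems > 0)
		XLogRegisterBufData(0, (char *) itemnos,
							nitems * sizeof(OffsetNumber));
}
#endif							/* NOT_USED */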

static void
btree_xlog_delete(XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_btree_delete *xlrec = (xl_btree_delete *) XLogRecGetData(record);
	Buffer		buffer;
	Page		page;
	BTPageOpaque opaque;

	/*
	 * If we have any conflict processing to do, it must happen before we
	 * update the page.
	 *
	 * Btree delete records can conflict with standby queries.  You might
	 * think that vacuum records would conflict as well, but we've handled
	 * that already.  XLOG_HEAP2_CLEANUP_INFO records provide the highest xid
	 * cleaned by the vacuum of the heap and so we can resolve any conflicts
	 * just once when that arrives.  After that we know that no conflicts
	 * exist from individual btree vacuum records on that index.
	 */
	if (InHotStandby)
	{
		RelFileNode rnode;

		XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL);

		ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, rnode);
	}

	/*
	 * We don't need to take a cleanup lock to apply these changes.  See
	 * nbtree/README for details.
	 */
	if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
	{
		page = (Page) BufferGetPage(buffer);

		if (XLogRecGetDataLen(record) > SizeOfBtreeDelete)
		{
			OffsetNumber *unused;

			unused = (OffsetNumber *) ((char *) xlrec + SizeOfBtreeDelete);

			PageIndexMultiDelete(page, unused, xlrec->nitems);
		}

		/*
		 * Mark the page as not containing any LP_DEAD items --- see comments
		 * in _bt_delitems_delete().
		 */
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		opaque->btpo_flags &= ~BTP_HAS_GARBAGE;

		PageSetLSN(page, lsn);
		MarkBufferDirty(buffer);
	}
	if (BufferIsValid(buffer))
		UnlockReleaseBuffer(buffer);
}
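
/*
 * For illustration only: unlike the vacuum case, the offsets removed above
 * are assumed to travel in the record's main data area, immediately after
 * the fixed-size xl_btree_delete struct, which is why the redo code reads
 * them at (char *) xlrec + SizeOfBtreeDelete.  A hypothetical logging-side
 * sketch (compiled out):
 */
#ifdef NOT_USED
static void
_bt_register_delete_payload(xl_btree_delete *xlrec_delete,
							OffsetNumber *itemnos, int nitems)
{
	XLogRegisterData((char *) xlrec_delete, SizeOfBtreeDelete);
	XLogRegisterData((char *) itemnos, nitems * sizeof(OffsetNumber));
}
#endif							/* NOT_USED */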

static void
btree_xlog_mark_page_halfdead(uint8 info, XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_btree_mark_page_halfdead *xlrec = (xl_btree_mark_page_halfdead *) XLogRecGetData(record);
	Buffer		buffer;
	Page		page;
	BTPageOpaque pageop;
	IndexTupleData trunctuple;

	/*
	 * In normal operation, we would lock all the pages this WAL record
	 * touches before changing any of them.  In WAL replay, it should be okay
	 * to lock just one page at a time, since no concurrent index updates can
	 * be happening, and readers should not care whether they arrive at the
	 * target page or not (since it's surely empty).
	 */

	/* parent page */
	if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO)
	{
		OffsetNumber poffset;
		ItemId		itemid;
		IndexTuple	itup;
		OffsetNumber nextoffset;
		BlockNumber rightsib;

		page = (Page) BufferGetPage(buffer);
		pageop = (BTPageOpaque) PageGetSpecialPointer(page);

		poffset = xlrec->poffset;

		nextoffset = OffsetNumberNext(poffset);
		itemid = PageGetItemId(page, nextoffset);
		itup = (IndexTuple) PageGetItem(page, itemid);
		rightsib = BTreeInnerTupleGetDownLink(itup);

		itemid = PageGetItemId(page, poffset);
		itup = (IndexTuple) PageGetItem(page, itemid);
		BTreeInnerTupleSetDownLink(itup, rightsib);
		nextoffset = OffsetNumberNext(poffset);
		PageIndexTupleDelete(page, nextoffset);

		PageSetLSN(page, lsn);
		MarkBufferDirty(buffer);
	}
	if (BufferIsValid(buffer))
		UnlockReleaseBuffer(buffer);

	/* Rewrite the leaf page as a halfdead page */
	buffer = XLogInitBufferForRedo(record, 0);
	page = (Page) BufferGetPage(buffer);

	_bt_pageinit(page, BufferGetPageSize(buffer));
	pageop = (BTPageOpaque) PageGetSpecialPointer(page);

	pageop->btpo_prev = xlrec->leftblk;
	pageop->btpo_next = xlrec->rightblk;
	pageop->btpo.level = 0;
	pageop->btpo_flags = BTP_HALF_DEAD | BTP_LEAF;
	pageop->btpo_cycleid = 0;

	/*
	 * Construct a dummy hikey item that points to the next parent to be
	 * deleted (if any).
	 */
	MemSet(&trunctuple, 0, sizeof(IndexTupleData));
	trunctuple.t_info = sizeof(IndexTupleData);
	BTreeTupleSetTopParent(&trunctuple, xlrec->topparent);

	if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY,
					false, false) == InvalidOffsetNumber)
		elog(ERROR, "could not add dummy high key to half-dead page");

	PageSetLSN(page, lsn);
	MarkBufferDirty(buffer);
	UnlockReleaseBuffer(buffer);
}

static void
btree_xlog_unlink_page(uint8 info, XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_btree_unlink_page *xlrec = (xl_btree_unlink_page *) XLogRecGetData(record);
	BlockNumber leftsib;
	BlockNumber rightsib;
	Buffer		buffer;
	Page		page;
	BTPageOpaque pageop;

	leftsib = xlrec->leftsib;
	rightsib = xlrec->rightsib;

	/*
	 * In normal operation, we would lock all the pages this WAL record
	 * touches before changing any of them.  In WAL replay, it should be okay
	 * to lock just one page at a time, since no concurrent index updates can
	 * be happening, and readers should not care whether they arrive at the
	 * target page or not (since it's surely empty).
	 */

	/* Fix left-link of right sibling */
	if (XLogReadBufferForRedo(record, 2, &buffer) == BLK_NEEDS_REDO)
	{
		page = (Page) BufferGetPage(buffer);
		pageop = (BTPageOpaque) PageGetSpecialPointer(page);
		pageop->btpo_prev = leftsib;

		PageSetLSN(page, lsn);
		MarkBufferDirty(buffer);
	}
	if (BufferIsValid(buffer))
		UnlockReleaseBuffer(buffer);

	/* Fix right-link of left sibling, if any */
	if (leftsib != P_NONE)
	{
		if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO)
		{
			page = (Page) BufferGetPage(buffer);
			pageop = (BTPageOpaque) PageGetSpecialPointer(page);
			pageop->btpo_next = rightsib;

			PageSetLSN(page, lsn);
			MarkBufferDirty(buffer);
		}
		if (BufferIsValid(buffer))
			UnlockReleaseBuffer(buffer);
	}

	/* Rewrite target page as empty deleted page */
	buffer = XLogInitBufferForRedo(record, 0);
	page = (Page) BufferGetPage(buffer);

	_bt_pageinit(page, BufferGetPageSize(buffer));
	pageop = (BTPageOpaque) PageGetSpecialPointer(page);

	pageop->btpo_prev = leftsib;
	pageop->btpo_next = rightsib;
	pageop->btpo.xact = xlrec->btpo_xact;
	pageop->btpo_flags = BTP_DELETED;
	pageop->btpo_cycleid = 0;

	PageSetLSN(page, lsn);
	MarkBufferDirty(buffer);
	UnlockReleaseBuffer(buffer);

	/*
	 * If we deleted a parent of the targeted leaf page, instead of the leaf
	 * itself, update the leaf to point to the next remaining child in the
	 * branch.
	 */
	if (XLogRecHasBlockRef(record, 3))
	{
		/*
		 * There is no real data on the page, so we just re-create it from
		 * scratch using the information from the WAL record.
		 */
		IndexTupleData trunctuple;

		buffer = XLogInitBufferForRedo(record, 3);
		page = (Page) BufferGetPage(buffer);

		_bt_pageinit(page, BufferGetPageSize(buffer));
		pageop = (BTPageOpaque) PageGetSpecialPointer(page);

		pageop->btpo_flags = BTP_HALF_DEAD | BTP_LEAF;
		pageop->btpo_prev = xlrec->leafleftsib;
		pageop->btpo_next = xlrec->leafrightsib;
		pageop->btpo.level = 0;
		pageop->btpo_cycleid = 0;

		/* Add a dummy hikey item */
		MemSet(&trunctuple, 0, sizeof(IndexTupleData));
		trunctuple.t_info = sizeof(IndexTupleData);
		BTreeTupleSetTopParent(&trunctuple, xlrec->topparent);

		if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY,
						false, false) == InvalidOffsetNumber)
			elog(ERROR, "could not add dummy high key to half-dead page");

		PageSetLSN(page, lsn);
		MarkBufferDirty(buffer);
		UnlockReleaseBuffer(buffer);
	}

	/* Update metapage if needed */
	if (info == XLOG_BTREE_UNLINK_PAGE_META)
		_bt_restore_meta(record, 4);
}

static void
btree_xlog_newroot(XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_btree_newroot *xlrec = (xl_btree_newroot *) XLogRecGetData(record);
	Buffer		buffer;
	Page		page;
	BTPageOpaque pageop;
	char	   *ptr;
	Size		len;

	buffer = XLogInitBufferForRedo(record, 0);
	page = (Page) BufferGetPage(buffer);

	_bt_pageinit(page, BufferGetPageSize(buffer));
	pageop = (BTPageOpaque) PageGetSpecialPointer(page);

	pageop->btpo_flags = BTP_ROOT;
	pageop->btpo_prev = pageop->btpo_next = P_NONE;
	pageop->btpo.level = xlrec->level;
	if (xlrec->level == 0)
		pageop->btpo_flags |= BTP_LEAF;
	pageop->btpo_cycleid = 0;

	if (xlrec->level > 0)
	{
		ptr = XLogRecGetBlockData(record, 0, &len);
		_bt_restore_page(page, ptr, len);

		/* Clear the incomplete-split flag in left child */
		_bt_clear_incomplete_split(record, 1);
	}

	PageSetLSN(page, lsn);
	MarkBufferDirty(buffer);
	UnlockReleaseBuffer(buffer);

	_bt_restore_meta(record, 2);
}

static void
btree_xlog_reuse_page(XLogReaderState *record)
{
	xl_btree_reuse_page *xlrec = (xl_btree_reuse_page *) XLogRecGetData(record);

	/*
	 * Btree reuse_page records exist to provide a conflict point when we
	 * reuse pages in the index via the FSM.  That's all they do though.
	 *
	 * latestRemovedXid was the page's btpo.xact.  The btpo.xact <
	 * RecentGlobalXmin test in _bt_page_recyclable() conceptually mirrors
	 * the pgxact->xmin > limitXmin test in GetConflictingVirtualXIDs().
	 * Consequently, one XID value achieves the same exclusion effect on
	 * master and standby.
	 */
	if (InHotStandby)
	{
		ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid,
											xlrec->node);
	}
}

void
btree_redo(XLogReaderState *record)
{
	uint8		info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;

	switch (info)
	{
		case XLOG_BTREE_INSERT_LEAF:
			btree_xlog_insert(true, false, record);
			break;
		case XLOG_BTREE_INSERT_UPPER:
			btree_xlog_insert(false, false, record);
			break;
		case XLOG_BTREE_INSERT_META:
			btree_xlog_insert(false, true, record);
			break;
		case XLOG_BTREE_SPLIT_L:
			btree_xlog_split(true, record);
			break;
		case XLOG_BTREE_SPLIT_R:
			btree_xlog_split(false, record);
			break;
		case XLOG_BTREE_VACUUM:
			btree_xlog_vacuum(record);
			break;
		case XLOG_BTREE_DELETE:
			btree_xlog_delete(record);
			break;
		case XLOG_BTREE_MARK_PAGE_HALFDEAD:
			btree_xlog_mark_page_halfdead(info, record);
			break;
		case XLOG_BTREE_UNLINK_PAGE:
		case XLOG_BTREE_UNLINK_PAGE_META:
			btree_xlog_unlink_page(info, record);
			break;
		case XLOG_BTREE_NEWROOT:
			btree_xlog_newroot(record);
			break;
		case XLOG_BTREE_REUSE_PAGE:
			btree_xlog_reuse_page(record);
			break;
		case XLOG_BTREE_META_CLEANUP:
			_bt_restore_meta(record, 0);
			break;
		default:
			elog(PANIC, "btree_redo: unknown op code %u", info);
	}
}
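
/*
 * btree_redo() and btree_mask() are not called directly; recovery reaches
 * them through the resource-manager table.  The corresponding entry in
 * src/include/access/rmgrlist.h looks approximately like:
 *
 *		PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify,
 *				NULL, NULL, btree_mask)
 */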

/*
 * Mask a btree page before performing consistency checks on it.
 */
void
btree_mask(char *pagedata, BlockNumber blkno)
{
	Page		page = (Page) pagedata;
	BTPageOpaque maskopaq;

	mask_page_lsn_and_checksum(page);

	mask_page_hint_bits(page);
	mask_unused_space(page);

	maskopaq = (BTPageOpaque) PageGetSpecialPointer(page);

	if (P_ISDELETED(maskopaq))
	{
		/*
		 * Mask page content on a DELETED page since it will be
		 * re-initialized during replay.  See btree_xlog_unlink_page() for
		 * details.
		 */
		mask_page_content(page);
	}
	else if (P_ISLEAF(maskopaq))
	{
		/*
		 * In btree leaf pages, it is possible to modify the LP_FLAGS without
		 * emitting any WAL record.  Hence, mask the line pointer flags.  See
		 * _bt_killitems(), _bt_check_unique() for details.
		 */
		mask_lp_flags(page);
	}

	/*
	 * BTP_HAS_GARBAGE is just an un-logged hint bit.  So, mask it.  See
	 * _bt_killitems(), _bt_check_unique() for details.
	 */
	maskopaq->btpo_flags &= ~BTP_HAS_GARBAGE;

	/*
	 * During replay of a btree page split, we don't set the BTP_SPLIT_END
	 * flag of the right sibling and initialize the cycle_id to 0 for the
	 * same page.  See btree_xlog_split() for details.
	 */
	maskopaq->btpo_flags &= ~BTP_SPLIT_END;
	maskopaq->btpo_cycleid = 0;
}