/*-------------------------------------------------------------------------
 *
 * nbtxlog.h
 *	  header file for postgres btree xlog routines
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/access/nbtxlog.h
 *
 *-------------------------------------------------------------------------
 */
13#ifndef NBTXLOG_H
14#define NBTXLOG_H
15
16#include "access/xlogreader.h"
17#include "lib/stringinfo.h"
18#include "storage/off.h"
19
/*
 * XLOG records for btree operations
 *
 * XLOG allows us to store some information in the high 4 bits of a log
 * record's xl_info field.  Accordingly, every record type code below is a
 * multiple of 0x10 (its low 4 bits are zero).
 */
#define XLOG_BTREE_INSERT_LEAF	0x00	/* add index tuple without split */
#define XLOG_BTREE_INSERT_UPPER 0x10	/* same, on a non-leaf page */
#define XLOG_BTREE_INSERT_META	0x20	/* same, plus update metapage */
#define XLOG_BTREE_SPLIT_L		0x30	/* add index tuple with split */
#define XLOG_BTREE_SPLIT_R		0x40	/* as above, new item on right */
/* 0x50 and 0x60 are unused */
#define XLOG_BTREE_DELETE		0x70	/* delete leaf index tuples for a page */
#define XLOG_BTREE_UNLINK_PAGE	0x80	/* delete a half-dead page */
#define XLOG_BTREE_UNLINK_PAGE_META 0x90	/* same, and update metapage */
#define XLOG_BTREE_NEWROOT		0xA0	/* new root page */
#define XLOG_BTREE_MARK_PAGE_HALFDEAD 0xB0	/* mark a leaf as half-dead */
#define XLOG_BTREE_VACUUM		0xC0	/* delete entries on a page during
										 * vacuum */
#define XLOG_BTREE_REUSE_PAGE	0xD0	/* old page is about to be reused from
										 * FSM */
#define XLOG_BTREE_META_CLEANUP 0xE0	/* update cleanup-related data in the
										 * metapage */

44/*
45 * All that we need to regenerate the meta-data page
46 */
47typedef struct xl_btree_metadata
48{
49 uint32 version;
50 BlockNumber root;
51 uint32 level;
52 BlockNumber fastroot;
53 uint32 fastlevel;
54 TransactionId oldest_btpo_xact;
55 float8 last_cleanup_num_heap_tuples;
56} xl_btree_metadata;
57
58/*
59 * This is what we need to know about simple (without split) insert.
60 *
61 * This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META.
62 * Note that INSERT_META implies it's not a leaf page.
63 *
64 * Backup Blk 0: original page (data contains the inserted tuple)
65 * Backup Blk 1: child's left sibling, if INSERT_UPPER or INSERT_META
66 * Backup Blk 2: xl_btree_metadata, if INSERT_META
67 */
68typedef struct xl_btree_insert
69{
70 OffsetNumber offnum;
71} xl_btree_insert;
72
73#define SizeOfBtreeInsert (offsetof(xl_btree_insert, offnum) + sizeof(OffsetNumber))
74
75/*
76 * On insert with split, we save all the items going into the right sibling
77 * so that we can restore it completely from the log record. This way takes
78 * less xlog space than the normal approach, because if we did it standardly,
79 * XLogInsert would almost always think the right page is new and store its
80 * whole page image. The left page, however, is handled in the normal
81 * incremental-update fashion.
82 *
83 * Note: XLOG_BTREE_SPLIT_L and XLOG_BTREE_SPLIT_R share this data record.
84 * There are two variants to indicate whether the inserted tuple went into the
85 * left or right split page (and thus, whether the new item is stored or not).
86 * We always log the left page high key because suffix truncation can generate
87 * a new leaf high key using user-defined code. This is also necessary on
88 * internal pages, since the first right item that the left page's high key
89 * was based on will have been truncated to zero attributes in the right page
90 * (the original is unavailable from the right page).
91 *
92 * Backup Blk 0: original page / new left page
93 *
94 * The left page's data portion contains the new item, if it's the _L variant.
95 * An IndexTuple representing the high key of the left page must follow with
96 * either variant.
97 *
98 * Backup Blk 1: new right page
99 *
100 * The right page's data portion contains the right page's tuples in the form
101 * used by _bt_restore_page. This includes the new item, if it's the _R
102 * variant. The right page's tuples also include the right page's high key
103 * with either variant (moved from the left/original page during the split),
104 * unless the split happened to be of the rightmost page on its level, where
105 * there is no high key for new right page.
106 *
107 * Backup Blk 2: next block (orig page's rightlink), if any
108 * Backup Blk 3: child's left sibling, if non-leaf split
109 */
110typedef struct xl_btree_split
111{
112 uint32 level; /* tree level of page being split */
113 OffsetNumber firstright; /* first item moved to right page */
114 OffsetNumber newitemoff; /* new item's offset (useful for _L variant) */
115} xl_btree_split;
116
117#define SizeOfBtreeSplit (offsetof(xl_btree_split, newitemoff) + sizeof(OffsetNumber))
118
119/*
120 * This is what we need to know about delete of individual leaf index tuples.
121 * The WAL record can represent deletion of any number of index tuples on a
122 * single index page when *not* executed by VACUUM.
123 *
124 * Backup Blk 0: index page
125 */
126typedef struct xl_btree_delete
127{
128 TransactionId latestRemovedXid;
129 int nitems;
130
131 /* TARGET OFFSET NUMBERS FOLLOW AT THE END */
132} xl_btree_delete;
133
134#define SizeOfBtreeDelete (offsetof(xl_btree_delete, nitems) + sizeof(int))
135
136/*
137 * This is what we need to know about page reuse within btree.
138 */
139typedef struct xl_btree_reuse_page
140{
141 RelFileNode node;
142 BlockNumber block;
143 TransactionId latestRemovedXid;
144} xl_btree_reuse_page;
145
146#define SizeOfBtreeReusePage (sizeof(xl_btree_reuse_page))
147
148/*
149 * This is what we need to know about vacuum of individual leaf index tuples.
150 * The WAL record can represent deletion of any number of index tuples on a
151 * single index page when executed by VACUUM.
152 *
153 * For MVCC scans, lastBlockVacuumed will be set to InvalidBlockNumber.
154 * For a non-MVCC index scans there is an additional correctness requirement
155 * for applying these changes during recovery, which is that we must do one
156 * of these two things for every block in the index:
157 * * lock the block for cleanup and apply any required changes
158 * * EnsureBlockUnpinned()
159 * The purpose of this is to ensure that no index scans started before we
160 * finish scanning the index are still running by the time we begin to remove
161 * heap tuples.
162 *
163 * Any changes to any one block are registered on just one WAL record. All
164 * blocks that we need to run EnsureBlockUnpinned() are listed as a block range
165 * starting from the last block vacuumed through until this one. Individual
166 * block numbers aren't given.
167 *
168 * Note that the *last* WAL record in any vacuum of an index is allowed to
169 * have a zero length array of offsets. Earlier records must have at least one.
170 */
171typedef struct xl_btree_vacuum
172{
173 BlockNumber lastBlockVacuumed;
174
175 /* TARGET OFFSET NUMBERS FOLLOW */
176} xl_btree_vacuum;
177
178#define SizeOfBtreeVacuum (offsetof(xl_btree_vacuum, lastBlockVacuumed) + sizeof(BlockNumber))
179
180/*
181 * This is what we need to know about marking an empty branch for deletion.
182 * The target identifies the tuple removed from the parent page (note that we
183 * remove this tuple's downlink and the *following* tuple's key). Note that
184 * the leaf page is empty, so we don't need to store its content --- it is
185 * just reinitialized during recovery using the rest of the fields.
186 *
187 * Backup Blk 0: leaf block
188 * Backup Blk 1: top parent
189 */
190typedef struct xl_btree_mark_page_halfdead
191{
192 OffsetNumber poffset; /* deleted tuple id in parent page */
193
194 /* information needed to recreate the leaf page: */
195 BlockNumber leafblk; /* leaf block ultimately being deleted */
196 BlockNumber leftblk; /* leaf block's left sibling, if any */
197 BlockNumber rightblk; /* leaf block's right sibling */
198 BlockNumber topparent; /* topmost internal page in the branch */
199} xl_btree_mark_page_halfdead;
200
201#define SizeOfBtreeMarkPageHalfDead (offsetof(xl_btree_mark_page_halfdead, topparent) + sizeof(BlockNumber))
202
203/*
204 * This is what we need to know about deletion of a btree page. Note we do
205 * not store any content for the deleted page --- it is just rewritten as empty
206 * during recovery, apart from resetting the btpo.xact.
207 *
208 * Backup Blk 0: target block being deleted
209 * Backup Blk 1: target block's left sibling, if any
210 * Backup Blk 2: target block's right sibling
211 * Backup Blk 3: leaf block (if different from target)
212 * Backup Blk 4: metapage (if rightsib becomes new fast root)
213 */
214typedef struct xl_btree_unlink_page
215{
216 BlockNumber leftsib; /* target block's left sibling, if any */
217 BlockNumber rightsib; /* target block's right sibling */
218
219 /*
220 * Information needed to recreate the leaf page, when target is an
221 * internal page.
222 */
223 BlockNumber leafleftsib;
224 BlockNumber leafrightsib;
225 BlockNumber topparent; /* next child down in the branch */
226
227 TransactionId btpo_xact; /* value of btpo.xact for use in recovery */
228 /* xl_btree_metadata FOLLOWS IF XLOG_BTREE_UNLINK_PAGE_META */
229} xl_btree_unlink_page;
230
231#define SizeOfBtreeUnlinkPage (offsetof(xl_btree_unlink_page, btpo_xact) + sizeof(TransactionId))
232
233/*
234 * New root log record. There are zero tuples if this is to establish an
235 * empty root, or two if it is the result of splitting an old root.
236 *
237 * Note that although this implies rewriting the metadata page, we don't need
238 * an xl_btree_metadata record --- the rootblk and level are sufficient.
239 *
240 * Backup Blk 0: new root page (2 tuples as payload, if splitting old root)
241 * Backup Blk 1: left child (if splitting an old root)
242 * Backup Blk 2: metapage
243 */
244typedef struct xl_btree_newroot
245{
246 BlockNumber rootblk; /* location of new root (redundant with blk 0) */
247 uint32 level; /* its tree level */
248} xl_btree_newroot;
249
250#define SizeOfBtreeNewroot (offsetof(xl_btree_newroot, level) + sizeof(uint32))
251
252
253/*
254 * prototypes for functions in nbtxlog.c
255 */
256extern void btree_redo(XLogReaderState *record);
257extern void btree_desc(StringInfo buf, XLogReaderState *record);
258extern const char *btree_identify(uint8 info);
259extern void btree_mask(char *pagedata, BlockNumber blkno);
260
261#endif /* NBXLOG_H */
262