1 | /*------------------------------------------------------------------------- |
2 | * |
3 | * nbtxlog.h |
4 | * header file for postgres btree xlog routines |
5 | * |
6 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
7 | * Portions Copyright (c) 1994, Regents of the University of California |
8 | * |
9 | * src/include/access/nbtxlog.h |
10 | * |
11 | *------------------------------------------------------------------------- |
12 | */ |
13 | #ifndef NBTXLOG_H |
14 | #define NBTXLOG_H |
15 | |
16 | #include "access/xlogreader.h" |
17 | #include "lib/stringinfo.h" |
18 | #include "storage/off.h" |
19 | |
20 | /* |
21 | * XLOG records for btree operations |
22 | * |
23 | * XLOG allows us to store some information in the high 4 bits of a log |
24 | * record's xl_info field; the record type codes below use only those bits. |
25 | */ |
26 | #define XLOG_BTREE_INSERT_LEAF 0x00 /* add index tuple without split */ |
27 | #define XLOG_BTREE_INSERT_UPPER 0x10 /* same, on a non-leaf page */ |
28 | #define XLOG_BTREE_INSERT_META 0x20 /* same, plus update metapage */ |
29 | #define XLOG_BTREE_SPLIT_L 0x30 /* add index tuple with split */ |
30 | #define XLOG_BTREE_SPLIT_R 0x40 /* as above, new item on right */ |
31 | /* 0x50 and 0x60 are unused */ |
32 | #define XLOG_BTREE_DELETE 0x70 /* delete leaf index tuples for a page */ |
33 | #define XLOG_BTREE_UNLINK_PAGE 0x80 /* delete a half-dead page */ |
34 | #define XLOG_BTREE_UNLINK_PAGE_META 0x90 /* same, and update metapage */ |
35 | #define XLOG_BTREE_NEWROOT 0xA0 /* new root page */ |
36 | #define XLOG_BTREE_MARK_PAGE_HALFDEAD 0xB0 /* mark a leaf as half-dead */ |
37 | #define XLOG_BTREE_VACUUM 0xC0 /* delete entries on a page during |
38 | * vacuum */ |
39 | #define XLOG_BTREE_REUSE_PAGE 0xD0 /* old page is about to be reused from |
40 | * FSM */ |
41 | #define XLOG_BTREE_META_CLEANUP 0xE0 /* update cleanup-related data in the |
42 | * metapage */ |
43 | |
44 | /* |
45 | * All that we need to regenerate the meta-data page. Per the record descriptions in this file, it accompanies XLOG_BTREE_INSERT_META and XLOG_BTREE_UNLINK_PAGE_META records. NOTE(review): the fields presumably mirror the on-disk metapage struct; confirm against nbtree.h. |
46 | */ |
47 | typedef struct xl_btree_metadata |
48 | { |
49 | uint32 version; /* presumably the btree version number -- TODO confirm */ |
50 | BlockNumber root; /* presumably block number of the current root page */ |
51 | uint32 level; /* presumably tree level of the root page */ |
52 | BlockNumber fastroot; /* presumably block number of the current fast root */ |
53 | uint32 fastlevel; /* presumably tree level of the fast root page */ |
54 | TransactionId oldest_btpo_xact; /* presumably cleanup-related data -- TODO confirm exact meaning */ |
55 | float8 last_cleanup_num_heap_tuples; /* presumably cleanup-related data -- TODO confirm exact meaning */ |
56 | } xl_btree_metadata; |
57 | |
58 | /* |
59 | * This is what we need to know about a simple (without split) insert. |
60 | * |
61 | * This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META. |
62 | * Note that INSERT_META implies it's not a leaf page. |
63 | * |
64 | * Backup Blk 0: original page (data contains the inserted tuple) |
65 | * Backup Blk 1: child's left sibling, if INSERT_UPPER or INSERT_META |
66 | * Backup Blk 2: xl_btree_metadata, if INSERT_META |
67 | */ |
68 | typedef struct xl_btree_insert |
69 | { |
70 | OffsetNumber offnum; /* presumably the offset on the original page at which the tuple is inserted -- TODO confirm */ |
71 | } xl_btree_insert; |
72 | |
73 | #define SizeOfBtreeInsert (offsetof(xl_btree_insert, offnum) + sizeof(OffsetNumber)) /* size up to the end of the last field, so trailing struct padding is not counted */ |
74 | |
75 | /* |
76 | * On insert with split, we save all the items going into the right sibling |
77 | * so that we can restore it completely from the log record. This way takes |
78 | * less xlog space than the normal approach, because if we logged it the standard way, |
79 | * XLogInsert would almost always think the right page is new and store its |
80 | * whole page image. The left page, however, is handled in the normal |
81 | * incremental-update fashion. |
82 | * |
83 | * Note: XLOG_BTREE_SPLIT_L and XLOG_BTREE_SPLIT_R share this data record. |
84 | * There are two variants to indicate whether the inserted tuple went into the |
85 | * left or right split page (and thus, whether the new item is stored or not). |
86 | * We always log the left page high key because suffix truncation can generate |
87 | * a new leaf high key using user-defined code. This is also necessary on |
88 | * internal pages, since the first right item that the left page's high key |
89 | * was based on will have been truncated to zero attributes in the right page |
90 | * (the original is unavailable from the right page). |
91 | * |
92 | * Backup Blk 0: original page / new left page |
93 | * |
94 | * The left page's data portion contains the new item, if it's the _L variant. |
95 | * An IndexTuple representing the high key of the left page must follow with |
96 | * either variant. |
97 | * |
98 | * Backup Blk 1: new right page |
99 | * |
100 | * The right page's data portion contains the right page's tuples in the form |
101 | * used by _bt_restore_page. This includes the new item, if it's the _R |
102 | * variant. The right page's tuples also include the right page's high key |
103 | * with either variant (moved from the left/original page during the split), |
104 | * unless the split happened to be of the rightmost page on its level, where |
105 | * there is no high key for the new right page. |
106 | * |
107 | * Backup Blk 2: next block (orig page's rightlink), if any |
108 | * Backup Blk 3: child's left sibling, if non-leaf split |
109 | */ |
110 | typedef struct xl_btree_split |
111 | { |
112 | uint32 level; /* tree level of page being split */ |
113 | OffsetNumber firstright; /* first item moved to right page */ |
114 | OffsetNumber newitemoff; /* new item's offset (useful for _L variant) */ |
115 | } xl_btree_split; |
116 | |
117 | #define SizeOfBtreeSplit (offsetof(xl_btree_split, newitemoff) + sizeof(OffsetNumber)) /* size up to the end of the last field, so trailing struct padding is not counted */ |
118 | |
119 | /* |
120 | * This is what we need to know about delete of individual leaf index tuples. |
121 | * The WAL record can represent deletion of any number of index tuples on a |
122 | * single index page when *not* executed by VACUUM. |
123 | * |
124 | * Backup Blk 0: index page |
125 | */ |
126 | typedef struct xl_btree_delete |
127 | { |
128 | TransactionId latestRemovedXid; /* NOTE(review): presumably consulted during recovery to resolve conflicts with standby queries -- confirm in nbtxlog.c */ |
129 | int nitems; /* presumably the number of target offset numbers that follow -- TODO confirm */ |
130 | |
131 | /* TARGET OFFSET NUMBERS FOLLOW AT THE END */ |
132 | } xl_btree_delete; |
133 | |
134 | #define SizeOfBtreeDelete (offsetof(xl_btree_delete, nitems) + sizeof(int)) /* fixed-size part only: ends right after nitems, so the trailing offset numbers are not counted */ |
135 | |
136 | /* |
137 | * This is what we need to know about page reuse within btree (record type XLOG_BTREE_REUSE_PAGE: an old page is about to be reused from the FSM). |
138 | */ |
139 | typedef struct xl_btree_reuse_page |
140 | { |
141 | RelFileNode node; /* presumably identifies the index relation's storage -- TODO confirm */ |
142 | BlockNumber block; /* presumably the block number of the page being reused -- TODO confirm */ |
143 | TransactionId latestRemovedXid; /* NOTE(review): presumably consulted during recovery to resolve conflicts with standby queries -- confirm in nbtxlog.c */ |
144 | } xl_btree_reuse_page; |
145 | |
146 | #define SizeOfBtreeReusePage (sizeof(xl_btree_reuse_page)) /* whole struct, including any trailing padding; unlike the offsetof-based SizeOf macros in this file */ |
147 | |
148 | /* |
149 | * This is what we need to know about vacuum of individual leaf index tuples. |
150 | * The WAL record can represent deletion of any number of index tuples on a |
151 | * single index page when executed by VACUUM. |
152 | * |
153 | * For MVCC scans, lastBlockVacuumed will be set to InvalidBlockNumber. |
154 | * For non-MVCC index scans there is an additional correctness requirement |
155 | * for applying these changes during recovery, which is that we must do one |
156 | * of these two things for every block in the index: |
157 | * * lock the block for cleanup and apply any required changes |
158 | * * EnsureBlockUnpinned() |
159 | * The purpose of this is to ensure that no index scans started before we |
160 | * finish scanning the index are still running by the time we begin to remove |
161 | * heap tuples. |
162 | * |
163 | * Any changes to any one block are registered on just one WAL record. All |
164 | * blocks that we need to run EnsureBlockUnpinned() on are listed as a block range |
165 | * starting from the last block vacuumed through until this one. Individual |
166 | * block numbers aren't given. |
167 | * |
168 | * Note that the *last* WAL record in any vacuum of an index is allowed to |
169 | * have a zero length array of offsets. Earlier records must have at least one. |
170 | */ |
171 | typedef struct xl_btree_vacuum |
172 | { |
173 | BlockNumber lastBlockVacuumed; /* start of the block range described above, or InvalidBlockNumber for MVCC scans */ |
174 | |
175 | /* TARGET OFFSET NUMBERS FOLLOW */ |
176 | } xl_btree_vacuum; |
177 | |
178 | #define SizeOfBtreeVacuum (offsetof(xl_btree_vacuum, lastBlockVacuumed) + sizeof(BlockNumber)) /* fixed-size part only: the trailing offset numbers are not counted */ |
179 | |
180 | /* |
181 | * This is what we need to know about marking an empty branch for deletion. |
182 | * The target identifies the tuple removed from the parent page (note that we |
183 | * remove this tuple's downlink and the *following* tuple's key). Note that |
184 | * the leaf page is empty, so we don't need to store its content --- it is |
185 | * just reinitialized during recovery using the rest of the fields. |
186 | * |
187 | * Backup Blk 0: leaf block |
188 | * Backup Blk 1: top parent |
189 | */ |
190 | typedef struct xl_btree_mark_page_halfdead |
191 | { |
192 | OffsetNumber poffset; /* deleted tuple id in parent page */ |
193 | |
194 | /* information needed to recreate the leaf page: */ |
195 | BlockNumber leafblk; /* leaf block ultimately being deleted */ |
196 | BlockNumber leftblk; /* leaf block's left sibling, if any */ |
197 | BlockNumber rightblk; /* leaf block's right sibling */ |
198 | BlockNumber topparent; /* topmost internal page in the branch */ |
199 | } xl_btree_mark_page_halfdead; |
200 | |
201 | #define SizeOfBtreeMarkPageHalfDead (offsetof(xl_btree_mark_page_halfdead, topparent) + sizeof(BlockNumber)) /* size up to the end of the last field, so trailing struct padding is not counted */ |
202 | |
203 | /* |
204 | * This is what we need to know about deletion of a btree page. Note we do |
205 | * not store any content for the deleted page --- it is just rewritten as empty |
206 | * during recovery, apart from resetting the btpo.xact. |
207 | * |
208 | * Backup Blk 0: target block being deleted |
209 | * Backup Blk 1: target block's left sibling, if any |
210 | * Backup Blk 2: target block's right sibling |
211 | * Backup Blk 3: leaf block (if different from target) |
212 | * Backup Blk 4: metapage (if rightsib becomes new fast root) |
213 | */ |
214 | typedef struct xl_btree_unlink_page |
215 | { |
216 | BlockNumber leftsib; /* target block's left sibling, if any */ |
217 | BlockNumber rightsib; /* target block's right sibling */ |
218 | |
219 | /* |
220 | * Information needed to recreate the leaf page, when target is an |
221 | * internal page. |
222 | */ |
223 | BlockNumber leafleftsib; /* presumably the leaf page's left sibling -- TODO confirm */ |
224 | BlockNumber leafrightsib; /* presumably the leaf page's right sibling -- TODO confirm */ |
225 | BlockNumber topparent; /* next child down in the branch */ |
226 | |
227 | TransactionId btpo_xact; /* value of btpo.xact for use in recovery */ |
228 | /* xl_btree_metadata FOLLOWS IF XLOG_BTREE_UNLINK_PAGE_META */ |
229 | } xl_btree_unlink_page; |
230 | |
231 | #define SizeOfBtreeUnlinkPage (offsetof(xl_btree_unlink_page, btpo_xact) + sizeof(TransactionId)) /* fixed-size part only: ends right after btpo_xact, so any following xl_btree_metadata is not counted */ |
232 | |
233 | /* |
234 | * New root log record. There are zero tuples if this is to establish an |
235 | * empty root, or two if it is the result of splitting an old root. |
236 | * |
237 | * Note that although this implies rewriting the metadata page, we don't need |
238 | * an xl_btree_metadata record --- the rootblk and level are sufficient. |
239 | * |
240 | * Backup Blk 0: new root page (2 tuples as payload, if splitting old root) |
241 | * Backup Blk 1: left child (if splitting an old root) |
242 | * Backup Blk 2: metapage |
243 | */ |
244 | typedef struct xl_btree_newroot |
245 | { |
246 | BlockNumber rootblk; /* location of new root (redundant with blk 0) */ |
247 | uint32 level; /* its tree level */ |
248 | } xl_btree_newroot; |
249 | |
250 | #define SizeOfBtreeNewroot (offsetof(xl_btree_newroot, level) + sizeof(uint32)) /* size up to the end of the last field, so trailing struct padding is not counted */ |
251 | |
252 | |
253 | /* |
254 | * prototypes for functions in nbtxlog.c. NOTE(review): btree_desc and btree_identify may be defined in a separate rmgr description source file rather than nbtxlog.c -- confirm and adjust this comment if so. |
255 | */ |
256 | extern void btree_redo(XLogReaderState *record); /* presumably replays one btree WAL record during recovery -- TODO confirm */ |
257 | extern void btree_desc(StringInfo buf, XLogReaderState *record); /* presumably appends a description of the record to buf -- TODO confirm */ |
258 | extern const char *btree_identify(uint8 info); /* presumably maps an xl_info record type code to its name -- TODO confirm */ |
259 | extern void btree_mask(char *pagedata, BlockNumber blkno); /* presumably masks parts of a page image before consistency comparison -- TODO confirm */ |
260 | |
261 | #endif /* NBTXLOG_H */ |
262 | |