| 1 | /*------------------------------------------------------------------------- | 
|---|
| 2 | * | 
|---|
| 3 | * nbtxlog.h | 
|---|
| 4 | *	  header file for postgres btree xlog routines | 
|---|
| 5 | * | 
|---|
| 6 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group | 
|---|
| 7 | * Portions Copyright (c) 1994, Regents of the University of California | 
|---|
| 8 | * | 
|---|
| 9 | * src/include/access/nbtxlog.h | 
|---|
| 10 | * | 
|---|
| 11 | *------------------------------------------------------------------------- | 
|---|
| 12 | */ | 
|---|
| 13 | #ifndef NBTXLOG_H | 
|---|
| 14 | #define NBTXLOG_H | 
|---|
| 15 |  | 
|---|
| 16 | #include "access/xlogreader.h" | 
|---|
| 17 | #include "lib/stringinfo.h" | 
|---|
| 18 | #include "storage/off.h" | 
|---|
| 19 |  | 
|---|
| 20 | /* | 
|---|
| 21 | * XLOG records for btree operations | 
|---|
| 22 | * | 
|---|
| 23 | * XLOG allows us to store some information in the high 4 bits of the log | 
|---|
| 24 | * record's xl_info field, so every opcode below is a multiple of 0x10. | 
|---|
| 25 | */ | 
|---|
| 26 | #define XLOG_BTREE_INSERT_LEAF	0x00	/* add index tuple without split */ | 
|---|
| 27 | #define XLOG_BTREE_INSERT_UPPER 0x10	/* same, on a non-leaf page */ | 
|---|
| 28 | #define XLOG_BTREE_INSERT_META	0x20	/* same, plus update metapage */ | 
|---|
| 29 | #define XLOG_BTREE_SPLIT_L		0x30	/* add index tuple with split */ | 
|---|
| 30 | #define XLOG_BTREE_SPLIT_R		0x40	/* as above, new item on right */ | 
|---|
| 31 | /* 0x50 and 0x60 are unused */ | 
|---|
| 32 | #define XLOG_BTREE_DELETE		0x70	/* delete leaf index tuples for a page */ | 
|---|
| 33 | #define XLOG_BTREE_UNLINK_PAGE	0x80	/* delete a half-dead page */ | 
|---|
| 34 | #define XLOG_BTREE_UNLINK_PAGE_META 0x90	/* same, and update metapage */ | 
|---|
| 35 | #define XLOG_BTREE_NEWROOT		0xA0	/* new root page */ | 
|---|
| 36 | #define XLOG_BTREE_MARK_PAGE_HALFDEAD 0xB0	/* mark a leaf as half-dead */ | 
|---|
| 37 | #define XLOG_BTREE_VACUUM		0xC0	/* delete entries on a page during | 
|---|
| 38 | * vacuum */ | 
|---|
| 39 | #define XLOG_BTREE_REUSE_PAGE	0xD0	/* old page is about to be reused from | 
|---|
| 40 | * FSM */ | 
|---|
| 41 | #define XLOG_BTREE_META_CLEANUP	0xE0	/* update cleanup-related data in the | 
|---|
| 42 | * metapage */ | 
|---|
| 43 |  | 
|---|
| 44 | /* | 
|---|
| 45 | * All that we need to regenerate the meta-data page (logged with INSERT_META and UNLINK_PAGE_META records). | 
|---|
| 46 | */ | 
|---|
| 47 | typedef struct xl_btree_metadata | 
|---|
| 48 | { | 
|---|
| 49 | uint32		version; | 
|---|
| 50 | BlockNumber root; | 
|---|
| 51 | uint32		level; | 
|---|
| 52 | BlockNumber fastroot; | 
|---|
| 53 | uint32		fastlevel; | 
|---|
| 54 | TransactionId oldest_btpo_xact; | 
|---|
| 55 | float8		last_cleanup_num_heap_tuples; | 
|---|
| 56 | } xl_btree_metadata; | 
|---|
| 57 |  | 
|---|
| 58 | /* | 
|---|
| 59 | * This is what we need to know about simple (without split) insert. | 
|---|
| 60 | * | 
|---|
| 61 | * This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META. | 
|---|
| 62 | * Note that INSERT_META implies it's not a leaf page. | 
|---|
| 63 | * | 
|---|
| 64 | * Backup Blk 0: original page (data contains the inserted tuple) | 
|---|
| 65 | * Backup Blk 1: child's left sibling, if INSERT_UPPER or INSERT_META | 
|---|
| 66 | * Backup Blk 2: xl_btree_metadata, if INSERT_META | 
|---|
| 67 | */ | 
|---|
| 68 | typedef struct xl_btree_insert | 
|---|
| 69 | { | 
|---|
| 70 | OffsetNumber offnum;		/* presumably the page offset of the inserted tuple -- TODO confirm in nbtxlog.c */ | 
|---|
| 71 | } xl_btree_insert; | 
|---|
| 72 |  | 
|---|
| 73 | #define SizeOfBtreeInsert	(offsetof(xl_btree_insert, offnum) + sizeof(OffsetNumber))	/* measured to the end of the last field, so trailing struct padding is not counted */ | 
|---|
| 74 |  | 
|---|
| 75 | /* | 
|---|
| 76 | * On insert with split, we save all the items going into the right sibling | 
|---|
| 77 | * so that we can restore it completely from the log record.  This way takes | 
|---|
| 78 | * less xlog space than the normal approach, because if we did it the standard way, | 
|---|
| 79 | * XLogInsert would almost always think the right page is new and store its | 
|---|
| 80 | * whole page image.  The left page, however, is handled in the normal | 
|---|
| 81 | * incremental-update fashion. | 
|---|
| 82 | * | 
|---|
| 83 | * Note: XLOG_BTREE_SPLIT_L and XLOG_BTREE_SPLIT_R share this data record. | 
|---|
| 84 | * There are two variants to indicate whether the inserted tuple went into the | 
|---|
| 85 | * left or right split page (and thus, whether the new item is stored or not). | 
|---|
| 86 | * We always log the left page high key because suffix truncation can generate | 
|---|
| 87 | * a new leaf high key using user-defined code.  This is also necessary on | 
|---|
| 88 | * internal pages, since the first right item that the left page's high key | 
|---|
| 89 | * was based on will have been truncated to zero attributes in the right page | 
|---|
| 90 | * (the original is unavailable from the right page). | 
|---|
| 91 | * | 
|---|
| 92 | * Backup Blk 0: original page / new left page | 
|---|
| 93 | * | 
|---|
| 94 | * The left page's data portion contains the new item, if it's the _L variant. | 
|---|
| 95 | * An IndexTuple representing the high key of the left page must follow with | 
|---|
| 96 | * either variant. | 
|---|
| 97 | * | 
|---|
| 98 | * Backup Blk 1: new right page | 
|---|
| 99 | * | 
|---|
| 100 | * The right page's data portion contains the right page's tuples in the form | 
|---|
| 101 | * used by _bt_restore_page.  This includes the new item, if it's the _R | 
|---|
| 102 | * variant.  The right page's tuples also include the right page's high key | 
|---|
| 103 | * with either variant (moved from the left/original page during the split), | 
|---|
| 104 | * unless the split happened to be of the rightmost page on its level, where | 
|---|
| 105 | * there is no high key for the new right page. | 
|---|
| 106 | * | 
|---|
| 107 | * Backup Blk 2: next block (orig page's rightlink), if any | 
|---|
| 108 | * Backup Blk 3: child's left sibling, if non-leaf split | 
|---|
| 109 | */ | 
|---|
| 110 | typedef struct xl_btree_split | 
|---|
| 111 | { | 
|---|
| 112 | uint32		level;			/* tree level of page being split */ | 
|---|
| 113 | OffsetNumber firstright;	/* first item moved to right page */ | 
|---|
| 114 | OffsetNumber newitemoff;	/* new item's offset (useful for _L variant) */ | 
|---|
| 115 | } xl_btree_split; | 
|---|
| 116 |  | 
|---|
| 117 | #define SizeOfBtreeSplit	(offsetof(xl_btree_split, newitemoff) + sizeof(OffsetNumber))	/* measured to the end of the last field, so trailing struct padding is not counted */ | 
|---|
| 118 |  | 
|---|
| 119 | /* | 
|---|
| 120 | * This is what we need to know about delete of individual leaf index tuples. | 
|---|
| 121 | * The WAL record can represent deletion of any number of index tuples on a | 
|---|
| 122 | * single index page when *not* executed by VACUUM. | 
|---|
| 123 | * | 
|---|
| 124 | * Backup Blk 0: index page | 
|---|
| 125 | */ | 
|---|
| 126 | typedef struct xl_btree_delete | 
|---|
| 127 | { | 
|---|
| 128 | TransactionId latestRemovedXid; | 
|---|
| 129 | int			nitems;		/* presumably the count of target offset numbers that follow -- TODO confirm in nbtxlog.c */ | 
|---|
| 130 |  | 
|---|
| 131 | /* TARGET OFFSET NUMBERS FOLLOW AT THE END */ | 
|---|
| 132 | } xl_btree_delete; | 
|---|
| 133 |  | 
|---|
| 134 | #define SizeOfBtreeDelete	(offsetof(xl_btree_delete, nitems) + sizeof(int))	/* size of the fixed header only; the target offset numbers are not included */ | 
|---|
| 135 |  | 
|---|
| 136 | /* | 
|---|
| 137 | * This is what we need to know about page reuse within btree. | 
|---|
| 138 | */ | 
|---|
| 139 | typedef struct xl_btree_reuse_page | 
|---|
| 140 | { | 
|---|
| 141 | RelFileNode node; | 
|---|
| 142 | BlockNumber block; | 
|---|
| 143 | TransactionId latestRemovedXid; | 
|---|
| 144 | } xl_btree_reuse_page; | 
|---|
| 145 |  | 
|---|
| 146 | #define SizeOfBtreeReusePage	(sizeof(xl_btree_reuse_page))	/* unlike the other SizeOf macros in this file, this is the plain sizeof the struct, including any trailing padding */ | 
|---|
| 147 |  | 
|---|
| 148 | /* | 
|---|
| 149 | * This is what we need to know about vacuum of individual leaf index tuples. | 
|---|
| 150 | * The WAL record can represent deletion of any number of index tuples on a | 
|---|
| 151 | * single index page when executed by VACUUM. | 
|---|
| 152 | * | 
|---|
| 153 | * For MVCC scans, lastBlockVacuumed will be set to InvalidBlockNumber. | 
|---|
| 154 | * For non-MVCC index scans there is an additional correctness requirement | 
|---|
| 155 | * for applying these changes during recovery, which is that we must do one | 
|---|
| 156 | * of these two things for every block in the index: | 
|---|
| 157 | *		* lock the block for cleanup and apply any required changes | 
|---|
| 158 | *		* EnsureBlockUnpinned() | 
|---|
| 159 | * The purpose of this is to ensure that no index scans started before we | 
|---|
| 160 | * finish scanning the index are still running by the time we begin to remove | 
|---|
| 161 | * heap tuples. | 
|---|
| 162 | * | 
|---|
| 163 | * Any changes to any one block are registered on just one WAL record. All | 
|---|
| 164 | * blocks that we need to run EnsureBlockUnpinned() are listed as a block range | 
|---|
| 165 | * starting from the last block vacuumed through until this one. Individual | 
|---|
| 166 | * block numbers aren't given. | 
|---|
| 167 | * | 
|---|
| 168 | * Note that the *last* WAL record in any vacuum of an index is allowed to | 
|---|
| 169 | * have a zero length array of offsets. Earlier records must have at least one. | 
|---|
| 170 | */ | 
|---|
| 171 | typedef struct xl_btree_vacuum | 
|---|
| 172 | { | 
|---|
| 173 | BlockNumber lastBlockVacuumed; | 
|---|
| 174 |  | 
|---|
| 175 | /* TARGET OFFSET NUMBERS FOLLOW */ | 
|---|
| 176 | } xl_btree_vacuum; | 
|---|
| 177 |  | 
|---|
| 178 | #define SizeOfBtreeVacuum	(offsetof(xl_btree_vacuum, lastBlockVacuumed) + sizeof(BlockNumber))	/* size of the fixed header only; the target offset numbers are not included */ | 
|---|
| 179 |  | 
|---|
| 180 | /* | 
|---|
| 181 | * This is what we need to know about marking an empty branch for deletion. | 
|---|
| 182 | * The target identifies the tuple removed from the parent page (note that we | 
|---|
| 183 | * remove this tuple's downlink and the *following* tuple's key).  Note that | 
|---|
| 184 | * the leaf page is empty, so we don't need to store its content --- it is | 
|---|
| 185 | * just reinitialized during recovery using the rest of the fields. | 
|---|
| 186 | * | 
|---|
| 187 | * Backup Blk 0: leaf block | 
|---|
| 188 | * Backup Blk 1: top parent | 
|---|
| 189 | */ | 
|---|
| 190 | typedef struct xl_btree_mark_page_halfdead | 
|---|
| 191 | { | 
|---|
| 192 | OffsetNumber poffset;		/* deleted tuple id in parent page */ | 
|---|
| 193 |  | 
|---|
| 194 | /* information needed to recreate the leaf page: */ | 
|---|
| 195 | BlockNumber leafblk;		/* leaf block ultimately being deleted */ | 
|---|
| 196 | BlockNumber leftblk;		/* leaf block's left sibling, if any */ | 
|---|
| 197 | BlockNumber rightblk;		/* leaf block's right sibling */ | 
|---|
| 198 | BlockNumber topparent;		/* topmost internal page in the branch */ | 
|---|
| 199 | } xl_btree_mark_page_halfdead; | 
|---|
| 200 |  | 
|---|
| 201 | #define SizeOfBtreeMarkPageHalfDead (offsetof(xl_btree_mark_page_halfdead, topparent) + sizeof(BlockNumber))	/* measured to the end of the last field, so trailing struct padding is not counted */ | 
|---|
| 202 |  | 
|---|
| 203 | /* | 
|---|
| 204 | * This is what we need to know about deletion of a btree page.  Note we do | 
|---|
| 205 | * not store any content for the deleted page --- it is just rewritten as empty | 
|---|
| 206 | * during recovery, apart from resetting the btpo.xact. | 
|---|
| 207 | * | 
|---|
| 208 | * Backup Blk 0: target block being deleted | 
|---|
| 209 | * Backup Blk 1: target block's left sibling, if any | 
|---|
| 210 | * Backup Blk 2: target block's right sibling | 
|---|
| 211 | * Backup Blk 3: leaf block (if different from target) | 
|---|
| 212 | * Backup Blk 4: metapage (if rightsib becomes new fast root) | 
|---|
| 213 | */ | 
|---|
| 214 | typedef struct xl_btree_unlink_page | 
|---|
| 215 | { | 
|---|
| 216 | BlockNumber leftsib;		/* target block's left sibling, if any */ | 
|---|
| 217 | BlockNumber rightsib;		/* target block's right sibling */ | 
|---|
| 218 |  | 
|---|
| 219 | /* | 
|---|
| 220 | * Information needed to recreate the leaf page, when target is an | 
|---|
| 221 | * internal page. | 
|---|
| 222 | */ | 
|---|
| 223 | BlockNumber leafleftsib; | 
|---|
| 224 | BlockNumber leafrightsib; | 
|---|
| 225 | BlockNumber topparent;		/* next child down in the branch */ | 
|---|
| 226 |  | 
|---|
| 227 | TransactionId btpo_xact;	/* value of btpo.xact for use in recovery */ | 
|---|
| 228 | /* xl_btree_metadata FOLLOWS IF XLOG_BTREE_UNLINK_PAGE_META */ | 
|---|
| 229 | } xl_btree_unlink_page; | 
|---|
| 230 |  | 
|---|
| 231 | #define SizeOfBtreeUnlinkPage	(offsetof(xl_btree_unlink_page, btpo_xact) + sizeof(TransactionId))	/* measured to the end of btpo_xact; does not include any xl_btree_metadata that follows */ | 
|---|
| 232 |  | 
|---|
| 233 | /* | 
|---|
| 234 | * New root log record.  There are zero tuples if this is to establish an | 
|---|
| 235 | * empty root, or two if it is the result of splitting an old root. | 
|---|
| 236 | * | 
|---|
| 237 | * Note that although this implies rewriting the metadata page, we don't need | 
|---|
| 238 | * an xl_btree_metadata record --- the rootblk and level are sufficient. | 
|---|
| 239 | * | 
|---|
| 240 | * Backup Blk 0: new root page (2 tuples as payload, if splitting old root) | 
|---|
| 241 | * Backup Blk 1: left child (if splitting an old root) | 
|---|
| 242 | * Backup Blk 2: metapage | 
|---|
| 243 | */ | 
|---|
| 244 | typedef struct xl_btree_newroot | 
|---|
| 245 | { | 
|---|
| 246 | BlockNumber rootblk;		/* location of new root (redundant with blk 0) */ | 
|---|
| 247 | uint32		level;			/* its tree level */ | 
|---|
| 248 | } xl_btree_newroot; | 
|---|
| 249 |  | 
|---|
| 250 | #define SizeOfBtreeNewroot	(offsetof(xl_btree_newroot, level) + sizeof(uint32))	/* measured to the end of the last field, so trailing struct padding is not counted */ | 
|---|
| 251 |  | 
|---|
| 252 |  | 
|---|
| 253 | /* | 
|---|
| 254 | * prototypes for functions in nbtxlog.c (presumably the btree resource-manager callbacks -- confirm against access/rmgrlist.h) | 
|---|
| 255 | */ | 
|---|
| 256 | extern void btree_redo(XLogReaderState *record); | 
|---|
| 257 | extern void btree_desc(StringInfo buf, XLogReaderState *record); | 
|---|
| 258 | extern const char *btree_identify(uint8 info); | 
|---|
| 259 | extern void btree_mask(char *pagedata, BlockNumber blkno); | 
|---|
| 260 |  | 
|---|
| 261 | #endif							/* NBTXLOG_H */ | 
|---|
| 262 |  | 
|---|