/*-------------------------------------------------------------------------
 *
 * nbtree.h
 *	  header file for postgres btree access method implementation.
 *
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/access/nbtree.h
 *
 *-------------------------------------------------------------------------
 */
#ifndef NBTREE_H
#define NBTREE_H

#include "access/amapi.h"
#include "access/itup.h"
#include "access/sdir.h"
#include "access/xlogreader.h"
#include "catalog/pg_index.h"
#include "lib/stringinfo.h"
#include "storage/bufmgr.h"
#include "storage/shm_toc.h"

/* There's room for a 16-bit vacuum cycle ID in BTPageOpaqueData */
typedef uint16 BTCycleId;

/*
 * BTPageOpaqueData -- At the end of every page, we store a pointer
 * to both siblings in the tree.  This is used to do forward/backward
 * index scans.  The next-page link is also critical for recovery when
 * a search has navigated to the wrong page due to concurrent page splits
 * or deletions; see src/backend/access/nbtree/README for more info.
 *
 * In addition, we store the page's btree level (counting upwards from
 * zero at a leaf page) as well as some flag bits indicating the page type
 * and status.  If the page is deleted, we replace the level with the
 * next-transaction-ID value indicating when it is safe to reclaim the page.
 *
 * We also store a "vacuum cycle ID".  When a page is split while VACUUM is
 * processing the index, a nonzero value associated with the VACUUM run is
 * stored into both halves of the split page.  (If VACUUM is not running,
 * both pages receive zero cycleids.)  This allows VACUUM to detect whether
 * a page was split since it started, with a small probability of false match
 * if the page was last split some exact multiple of MAX_BT_CYCLE_ID VACUUMs
 * ago.  Also, during a split, the BTP_SPLIT_END flag is cleared in the left
 * (original) page, and set in the right page, but only if the next page
 * to its right has a different cycleid.
 *
 * NOTE: the BTP_LEAF flag bit is redundant since level==0 could be tested
 * instead.
 */

typedef struct BTPageOpaqueData
{
	BlockNumber btpo_prev;		/* left sibling, or P_NONE if leftmost */
	BlockNumber btpo_next;		/* right sibling, or P_NONE if rightmost */
	union
	{
		uint32		level;		/* tree level --- zero for leaf pages */
		TransactionId xact;		/* next transaction ID, if deleted */
	}			btpo;
	uint16		btpo_flags;		/* flag bits, see below */
	BTCycleId	btpo_cycleid;	/* vacuum cycle ID of latest split */
} BTPageOpaqueData;

typedef BTPageOpaqueData *BTPageOpaque;

/* Bits defined in btpo_flags */
#define BTP_LEAF		(1 << 0)	/* leaf page, i.e. not internal page */
#define BTP_ROOT		(1 << 1)	/* root page (has no parent) */
#define BTP_DELETED		(1 << 2)	/* page has been deleted from tree */
#define BTP_META		(1 << 3)	/* meta-page */
#define BTP_HALF_DEAD	(1 << 4)	/* empty, but still in tree */
#define BTP_SPLIT_END	(1 << 5)	/* rightmost page of split group */
#define BTP_HAS_GARBAGE (1 << 6)	/* page has LP_DEAD tuples */
#define BTP_INCOMPLETE_SPLIT (1 << 7)	/* right sibling's downlink is missing */
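
/*
 * Illustrative sketch (not part of the on-disk format): given a buffer
 * "buf" that is pinned, locked, and known to contain a btree page, the
 * opaque data lives in the page's special space and is reached with the
 * standard bufmgr/bufpage accessors:
 *
 *		Page		page = BufferGetPage(buf);
 *		BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
 *
 *		if ((opaque->btpo_flags & BTP_LEAF) != 0)
 *			Assert(opaque->btpo.level == 0);
 */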

/*
 * The max allowed value of a cycle ID is a bit less than 64K.  This is
 * for convenience of pg_filedump and similar utilities: we want to use
 * the last 2 bytes of special space as an index type indicator, and
 * restricting cycle ID lets btree use that space for vacuum cycle IDs
 * while still allowing index type to be identified.
 */
#define MAX_BT_CYCLE_ID		0xFF7F


/*
 * The Meta page is always the first page in the btree index.
 * Its primary purpose is to point to the location of the btree root page.
 * We also point to the "fast" root, which is the current effective root;
 * see README for discussion.
 */

typedef struct BTMetaPageData
{
	uint32		btm_magic;		/* should contain BTREE_MAGIC */
	uint32		btm_version;	/* nbtree version (always <= BTREE_VERSION) */
	BlockNumber btm_root;		/* current root location */
	uint32		btm_level;		/* tree level of the root page */
	BlockNumber btm_fastroot;	/* current "fast" root location */
	uint32		btm_fastlevel;	/* tree level of the "fast" root page */
	/* remaining fields only valid when btm_version >= BTREE_NOVAC_VERSION */
	TransactionId btm_oldest_btpo_xact; /* oldest btpo_xact among all deleted
										 * pages */
	float8		btm_last_cleanup_num_heap_tuples;	/* number of heap tuples
													 * during last cleanup */
} BTMetaPageData;

#define BTPageGetMeta(p) \
	((BTMetaPageData *) PageGetContents(p))

/*
 * The current Btree version is 4.  That's what you'll get when you create
 * a new index.
 *
 * Btree version 3 was used in PostgreSQL v11.  It is mostly the same as
 * version 4, but heap TIDs were not part of the keyspace.  Index tuples
 * with duplicate keys could be stored in any order.  We continue to
 * support reading and writing Btree versions 2 and 3, so that they don't
 * need to be immediately re-indexed at pg_upgrade.  In order to get the
 * new heapkeyspace semantics, however, a REINDEX is needed.
 *
 * Btree version 2 is mostly the same as version 3.  There are two new
 * fields in the metapage that were introduced in version 3.  A version 2
 * metapage will be automatically upgraded to version 3 on the first
 * insert to it.  INCLUDE indexes cannot use version 2.
 */
#define BTREE_METAPAGE	0		/* first page is meta */
#define BTREE_MAGIC		0x053162	/* magic number in metapage */
#define BTREE_VERSION	4		/* current version number */
#define BTREE_MIN_VERSION	2	/* minimal supported version number */
#define BTREE_NOVAC_VERSION	3	/* minimal version with all meta fields */
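
/*
 * A minimal sketch of reading and sanity-checking the metapage, assuming
 * "rel" is an open btree index relation (error handling omitted; the
 * buffer routines used here are declared later in this file):
 *
 *		Buffer		metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
 *		Page		metapg = BufferGetPage(metabuf);
 *		BTMetaPageData *metad = BTPageGetMeta(metapg);
 *
 *		Assert(metad->btm_magic == BTREE_MAGIC);
 *		Assert(metad->btm_version >= BTREE_MIN_VERSION &&
 *			   metad->btm_version <= BTREE_VERSION);
 *		_bt_relbuf(rel, metabuf);
 */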

/*
 * Maximum size of a btree index entry, including its tuple header.
 *
 * We actually need to be able to fit three items on every page,
 * so restrict any one item to 1/3 the per-page available space.
 *
 * There are rare cases where _bt_truncate() will need to enlarge
 * a heap index tuple to make space for a tiebreaker heap TID
 * attribute, which we account for here.
 */
#define BTMaxItemSize(page) \
	MAXALIGN_DOWN((PageGetPageSize(page) - \
				   MAXALIGN(SizeOfPageHeaderData + \
							3*sizeof(ItemIdData) + \
							3*sizeof(ItemPointerData)) - \
				   MAXALIGN(sizeof(BTPageOpaqueData))) / 3)
#define BTMaxItemSizeNoHeapTid(page) \
	MAXALIGN_DOWN((PageGetPageSize(page) - \
				   MAXALIGN(SizeOfPageHeaderData + 3*sizeof(ItemIdData)) - \
				   MAXALIGN(sizeof(BTPageOpaqueData))) / 3)
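
/*
 * A minimal sketch of how these limits are consumed (the real check,
 * with a user-facing error message, lives in _bt_check_third_page):
 *
 *		Size		itemsz = MAXALIGN(IndexTupleSize(itup));
 *
 *		if (itemsz > BTMaxItemSize(page))
 *			elog(ERROR, "index tuple too large");
 */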

/*
 * The leaf-page fillfactor defaults to 90% but is user-adjustable.
 * For pages above the leaf level, we use a fixed 70% fillfactor.
 * The fillfactor is applied during index build and when splitting
 * a rightmost page; when splitting non-rightmost pages we try to
 * divide the data equally.  When splitting a page that's entirely
 * filled with a single value (duplicates), the effective leaf-page
 * fillfactor is 96%, regardless of whether the page is a rightmost
 * page.
 */
#define BTREE_MIN_FILLFACTOR		10
#define BTREE_DEFAULT_FILLFACTOR	90
#define BTREE_NONLEAF_FILLFACTOR	70
#define BTREE_SINGLEVAL_FILLFACTOR	96

/*
 * In general, the btree code tries to localize its knowledge about
 * page layout to a couple of routines.  However, we need a special
 * value to indicate "no page number" in those places where we expect
 * page numbers.  We can use zero for this because we never need to
 * make a pointer to the metadata page.
 */

#define P_NONE			0

/*
 * Macros to test whether a page is leftmost or rightmost on its tree level,
 * as well as other state info kept in the opaque data.
 */
#define P_LEFTMOST(opaque)		((opaque)->btpo_prev == P_NONE)
#define P_RIGHTMOST(opaque)		((opaque)->btpo_next == P_NONE)
#define P_ISLEAF(opaque)		(((opaque)->btpo_flags & BTP_LEAF) != 0)
#define P_ISROOT(opaque)		(((opaque)->btpo_flags & BTP_ROOT) != 0)
#define P_ISDELETED(opaque)		(((opaque)->btpo_flags & BTP_DELETED) != 0)
#define P_ISMETA(opaque)		(((opaque)->btpo_flags & BTP_META) != 0)
#define P_ISHALFDEAD(opaque)	(((opaque)->btpo_flags & BTP_HALF_DEAD) != 0)
#define P_IGNORE(opaque)		(((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD)) != 0)
#define P_HAS_GARBAGE(opaque)	(((opaque)->btpo_flags & BTP_HAS_GARBAGE) != 0)
#define P_INCOMPLETE_SPLIT(opaque)	(((opaque)->btpo_flags & BTP_INCOMPLETE_SPLIT) != 0)
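
/*
 * Illustrative sketch of a right-walking loop that skips ignorable
 * (half-dead or deleted) pages, assuming "rel" and a valid starting
 * "blkno" (locking subtleties omitted):
 *
 *		Buffer		buf = _bt_getbuf(rel, blkno, BT_READ);
 *		Page		page = BufferGetPage(buf);
 *		BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
 *
 *		while (P_IGNORE(opaque) && !P_RIGHTMOST(opaque))
 *		{
 *			blkno = opaque->btpo_next;
 *			buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
 *			page = BufferGetPage(buf);
 *			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
 *		}
 */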

/*
 * Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost
 * page.  The high key is not a tuple that is used to visit the heap.  It is
 * a pivot tuple (see "Notes on B-Tree tuple format" below for definition).
 * The high key on a page is required to be greater than or equal to any
 * other key that appears on the page.  If we find ourselves trying to
 * insert a key that is strictly > high key, we know we need to move right
 * (this should only happen if the page was split since we examined the
 * parent page).
 *
 * Our insertion algorithm guarantees that we can use the initial least key
 * on our right sibling as the high key.  Once a page is created, its high
 * key changes only if the page is split.
 *
 * On a non-rightmost page, the high key lives in item 1 and data items
 * start in item 2.  Rightmost pages have no high key, so we store data
 * items beginning in item 1.
 */

#define P_HIKEY				((OffsetNumber) 1)
#define P_FIRSTKEY			((OffsetNumber) 2)
#define P_FIRSTDATAKEY(opaque)	(P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)
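
/*
 * Illustrative sketch: iterating over just the data items on a page,
 * skipping the high key when the page has one:
 *
 *		OffsetNumber offnum;
 *		OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
 *
 *		for (offnum = P_FIRSTDATAKEY(opaque);
 *			 offnum <= maxoff;
 *			 offnum = OffsetNumberNext(offnum))
 *		{
 *			ItemId		itemid = PageGetItemId(page, offnum);
 *			IndexTuple	itup = (IndexTuple) PageGetItem(page, itemid);
 *
 *			... process itup ...
 *		}
 */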

/*
 * Notes on B-Tree tuple format, and key and non-key attributes:
 *
 * INCLUDE B-Tree indexes have non-key attributes.  These are extra
 * attributes that may be returned by index-only scans, but do not influence
 * the order of items in the index (formally, non-key attributes are not
 * considered to be part of the key space).  Non-key attributes are only
 * present in leaf index tuples whose item pointers actually point to heap
 * tuples (non-pivot tuples).  _bt_check_natts() enforces the rules
 * described here.
 *
 * Non-pivot tuple format:
 *
 *  t_tid | t_info | key values | INCLUDE columns, if any
 *
 * t_tid points to the heap TID, which is a tiebreaker key column as of
 * BTREE_VERSION 4.  Currently, the INDEX_ALT_TID_MASK status bit is never
 * set for non-pivot tuples.
 *
 * All other types of index tuples ("pivot" tuples) only have key columns,
 * since pivot tuples only exist to represent how the key space is
 * separated.  In general, any B-Tree index that has more than one level
 * (i.e. any index that does not just consist of a metapage and a single
 * leaf root page) must have some number of pivot tuples, since pivot
 * tuples are used for traversing the tree.  Suffix truncation can omit
 * trailing key columns when a new pivot is formed, which makes minus
 * infinity their logical value.  Since BTREE_VERSION 4 indexes treat heap
 * TID as a trailing key column that ensures that all index tuples are
 * physically unique, it is necessary to represent heap TID as a trailing
 * key column in pivot tuples, though very often this can be truncated
 * away, just like any other key column.  (Actually, the heap TID is
 * omitted rather than truncated, since its representation is different from
 * the non-pivot representation.)
 *
 * Pivot tuple format:
 *
 *  t_tid | t_info | key values | [heap TID]
 *
 * We store the number of columns present inside pivot tuples by abusing
 * their t_tid offset field, since pivot tuples never need to store a real
 * offset (downlinks only need to store a block number in t_tid).  The
 * offset field only stores the number of columns/attributes when the
 * INDEX_ALT_TID_MASK bit is set, which doesn't count the trailing heap
 * TID column sometimes stored in pivot tuples -- that's represented by
 * the presence of BT_HEAP_TID_ATTR.  The INDEX_ALT_TID_MASK bit in t_info
 * is always set on BTREE_VERSION 4.  BT_HEAP_TID_ATTR can only be set on
 * BTREE_VERSION 4.
 *
 * In version 3 indexes, the INDEX_ALT_TID_MASK flag might not be set in
 * pivot tuples.  In that case, the number of key columns is implicitly
 * the same as the number of key columns in the index.  It is not usually
 * set on version 2 indexes, which predate the introduction of INCLUDE
 * indexes.  (Only explicitly truncated pivot tuples explicitly represent
 * the number of key columns on versions 2 and 3, whereas all pivot tuples
 * are formed using truncation on version 4.  A version 2 index will have
 * it set for an internal page negative infinity item iff internal page
 * split occurred after upgrade to Postgres 11+.)
 *
 * The 12 least significant offset bits from t_tid are used to represent
 * the number of columns in INDEX_ALT_TID_MASK tuples, leaving 4 status
 * bits (BT_RESERVED_OFFSET_MASK bits), 3 of which are reserved for
 * future use.  BT_N_KEYS_OFFSET_MASK should be large enough to store any
 * number of columns/attributes <= INDEX_MAX_KEYS.
 *
 * Note well: The macros that deal with the number of attributes in tuples
 * assume that a tuple with INDEX_ALT_TID_MASK set must be a pivot tuple,
 * and that a tuple without INDEX_ALT_TID_MASK set must be a non-pivot
 * tuple (or must have the same number of attributes as the index has
 * generally in the case of !heapkeyspace indexes).  They will need to be
 * updated if non-pivot tuples ever get taught to use INDEX_ALT_TID_MASK
 * for something else.
 */
#define INDEX_ALT_TID_MASK	INDEX_AM_RESERVED_BIT

/* Item pointer offset bits */
#define BT_RESERVED_OFFSET_MASK	0xF000
#define BT_N_KEYS_OFFSET_MASK	0x0FFF
#define BT_HEAP_TID_ATTR		0x1000

/* Get/set downlink block number */
#define BTreeInnerTupleGetDownLink(itup) \
	ItemPointerGetBlockNumberNoCheck(&((itup)->t_tid))
#define BTreeInnerTupleSetDownLink(itup, blkno) \
	ItemPointerSetBlockNumber(&((itup)->t_tid), (blkno))

/*
 * Get/set leaf page highkey's link.  During the second phase of deletion, the
 * target leaf page's high key may point to an ancestor page (at all other
 * times, the leaf level high key's link is not used).  See the nbtree README
 * for full details.
 */
#define BTreeTupleGetTopParent(itup) \
	ItemPointerGetBlockNumberNoCheck(&((itup)->t_tid))
#define BTreeTupleSetTopParent(itup, blkno) \
	do { \
		ItemPointerSetBlockNumber(&((itup)->t_tid), (blkno)); \
		BTreeTupleSetNAtts((itup), 0); \
	} while(0)

/*
 * Get/set number of attributes within B-tree index tuple.
 *
 * Note that this does not include an implicit tiebreaker heap TID
 * attribute, if any.  Note also that the number of key attributes must be
 * explicitly represented in all heapkeyspace pivot tuples.
 */
#define BTreeTupleGetNAtts(itup, rel) \
	( \
		(itup)->t_info & INDEX_ALT_TID_MASK ? \
		( \
			ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_N_KEYS_OFFSET_MASK \
		) \
		: \
		IndexRelationGetNumberOfAttributes(rel) \
	)
#define BTreeTupleSetNAtts(itup, n) \
	do { \
		(itup)->t_info |= INDEX_ALT_TID_MASK; \
		ItemPointerSetOffsetNumber(&(itup)->t_tid, (n) & BT_N_KEYS_OFFSET_MASK); \
	} while(0)
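
/*
 * For example (illustrative only): a pivot tuple formed for an index with
 * two key attributes, whose second attribute was removed by suffix
 * truncation, reports one attribute; a non-pivot tuple, which never has
 * INDEX_ALT_TID_MASK set, falls back to the relation's attribute count:
 *
 *		BTreeTupleSetNAtts(pivot, 1);
 *		Assert(BTreeTupleGetNAtts(pivot, rel) == 1);
 */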

/*
 * Get tiebreaker heap TID attribute, if any.  Macro works with both pivot
 * and non-pivot tuples, despite differences in how heap TID is represented.
 */
#define BTreeTupleGetHeapTID(itup) \
	( \
	  (itup)->t_info & INDEX_ALT_TID_MASK && \
	  (ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_HEAP_TID_ATTR) != 0 ? \
	  ( \
		(ItemPointer) (((char *) (itup) + IndexTupleSize(itup)) - \
					   sizeof(ItemPointerData)) \
	  ) \
	  : (itup)->t_info & INDEX_ALT_TID_MASK ? NULL : (ItemPointer) &((itup)->t_tid) \
	)
/*
 * Set the heap TID attribute for a tuple that uses the INDEX_ALT_TID_MASK
 * representation (currently limited to pivot tuples)
 */
#define BTreeTupleSetAltHeapTID(itup) \
	do { \
		Assert((itup)->t_info & INDEX_ALT_TID_MASK); \
		ItemPointerSetOffsetNumber(&(itup)->t_tid, \
								   ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) | BT_HEAP_TID_ATTR); \
	} while(0)
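
/*
 * Illustrative sketch: BTreeTupleGetHeapTID hides the representational
 * difference between pivot and non-pivot tuples, so callers can treat the
 * result uniformly; NULL means a pivot tuple's heap TID was truncated
 * away:
 *
 *		ItemPointer htid = BTreeTupleGetHeapTID(itup);
 *
 *		if (htid != NULL)
 *			... compare htid against key->scantid ...
 */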

/*
 * Operator strategy numbers for B-tree have been moved to access/stratnum.h,
 * because many places need to use them in ScanKeyInit() calls.
 *
 * The strategy numbers are chosen so that we can commute them by
 * subtraction, thus:
 */
#define BTCommuteStrategyNumber(strat)	(BTMaxStrategyNumber + 1 - (strat))
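/*
 * For example, BTCommuteStrategyNumber(BTLessStrategyNumber) yields
 * BTGreaterStrategyNumber (5 + 1 - 1 = 5), while BTEqualStrategyNumber
 * commutes to itself (5 + 1 - 3 = 3).
 */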

/*
 * When a new operator class is declared, we require that the user
 * supply us with an amproc procedure (BTORDER_PROC) for determining
 * whether, for two keys a and b, a < b, a = b, or a > b.  This routine
 * must return < 0, 0, > 0, respectively, in these three cases.
 *
 * To facilitate accelerated sorting, an operator class may choose to
 * offer a second procedure (BTSORTSUPPORT_PROC).  For full details, see
 * src/include/utils/sortsupport.h.
 *
 * To support window frames defined by "RANGE offset PRECEDING/FOLLOWING",
 * an operator class may choose to offer a third amproc procedure
 * (BTINRANGE_PROC), independently of whether it offers sortsupport.
 * For full details, see doc/src/sgml/btree.sgml.
 */

#define BTORDER_PROC		1
#define BTSORTSUPPORT_PROC	2
#define BTINRANGE_PROC		3
#define BTNProcs			3
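
/*
 * A minimal sketch of the BTORDER_PROC contract for a hypothetical type
 * with an int32 representation (a real example is btint4cmp in
 * src/backend/access/nbtree/nbtcompare.c):
 *
 *		Datum
 *		my_btcmp(PG_FUNCTION_ARGS)
 *		{
 *			int32		a = PG_GETARG_INT32(0);
 *			int32		b = PG_GETARG_INT32(1);
 *
 *			PG_RETURN_INT32((a < b) ? -1 : ((a > b) ? 1 : 0));
 *		}
 */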

/*
 * We need to be able to tell the difference between read and write
 * requests for pages, in order to do locking correctly.
 */

#define BT_READ			BUFFER_LOCK_SHARE
#define BT_WRITE		BUFFER_LOCK_EXCLUSIVE

/*
 * BTStackData -- As we descend a tree, we push the (location, downlink)
 * pairs from internal pages onto a private stack.  If we split a
 * leaf, we use this stack to walk back up the tree and insert data
 * into parent pages (and possibly to split them, too).  Lehman and
 * Yao's update algorithm guarantees that under no circumstances can
 * our private stack give us an irredeemably bad picture up the tree.
 * Again, see the paper for details.
 */

typedef struct BTStackData
{
	BlockNumber bts_blkno;
	OffsetNumber bts_offset;
	BlockNumber bts_btentry;
	struct BTStackData *bts_parent;
} BTStackData;

typedef BTStackData *BTStack;

/*
 * BTScanInsertData is the btree-private state needed to find an initial
 * position for an indexscan, or to insert new tuples -- an "insertion
 * scankey" (not to be confused with a search scankey).  It's used to descend
 * a B-Tree using _bt_search.
 *
 * heapkeyspace indicates if we expect all keys in the index to be physically
 * unique because heap TID is used as a tiebreaker attribute, and if index may
 * have truncated key attributes in pivot tuples.  This is actually a property
 * of the index relation itself (not an indexscan).  heapkeyspace indexes are
 * indexes whose version is >= version 4.  It's convenient to keep this close
 * by, rather than accessing the metapage repeatedly.
 *
 * anynullkeys indicates if any of the keys had NULL value when scankey was
 * built from index tuple (note that already-truncated tuple key attributes
 * set NULL as a placeholder key value, which also affects value of
 * anynullkeys).  This is a convenience for unique index non-pivot tuple
 * insertion, which usually temporarily unsets scantid, but should leave
 * scantid set when (and only when) anynullkeys is true.  Value generally
 * matches non-pivot tuple's HasNulls bit, but may not when inserting into
 * an INCLUDE index (tuple header value is affected by the NULL-ness of
 * both key and non-key attributes).
 *
 * When nextkey is false (the usual case), _bt_search and _bt_binsrch will
 * locate the first item >= scankey.  When nextkey is true, they will locate
 * the first item > scan key.
 *
 * pivotsearch is set to true by callers that want to re-find a leaf page
 * using a scankey built from a leaf page's high key.  Most callers set this
 * to false.
 *
 * scantid is the heap TID that is used as a final tiebreaker attribute.  It
 * is set to NULL when index scan doesn't need to find a position for a
 * specific physical tuple.  Must be set when inserting new tuples into
 * heapkeyspace indexes, since every tuple in the tree unambiguously belongs
 * in one exact position (it's never set with !heapkeyspace indexes, though).
 * Despite the representational difference, nbtree search code considers
 * scantid to be just another insertion scankey attribute.
 *
 * scankeys is an array of scan key entries for attributes that are compared
 * before scantid (user-visible attributes).  keysz is the size of the array.
 * During insertion, there must be a scan key for every attribute, but when
 * starting a regular index scan some can be omitted.  The array is used as a
 * flexible array member, though it's sized in a way that makes it possible to
 * use stack allocations.  See nbtree/README for full details.
 */
typedef struct BTScanInsertData
{
	bool		heapkeyspace;
	bool		anynullkeys;
	bool		nextkey;
	bool		pivotsearch;
	ItemPointer scantid;		/* tiebreaker for scankeys */
	int			keysz;			/* Size of scankeys array */
	ScanKeyData scankeys[INDEX_MAX_KEYS];	/* Must appear last */
} BTScanInsertData;

typedef BTScanInsertData *BTScanInsert;
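
/*
 * Illustrative sketch of descending to a leaf page with an insertion
 * scankey, assuming "rel", an index tuple "itup", and a registered
 * "snapshot" (error handling omitted; all routines used are declared
 * later in this file):
 *
 *		BTScanInsert key = _bt_mkscankey(rel, itup);
 *		Buffer		buf;
 *		BTStack		stack = _bt_search(rel, key, &buf, BT_READ, snapshot);
 *
 *		... examine the read-locked leaf page in buf ...
 *
 *		_bt_relbuf(rel, buf);
 *		_bt_freestack(stack);
 *		pfree(key);
 */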

/*
 * BTInsertStateData is a working area used during insertion.
 *
 * This is filled in after descending the tree to the first leaf page the new
 * tuple might belong on.  Tracks the current position while performing a
 * uniqueness check, before we have determined exactly which page to
 * insert into.
 *
 * (This should be private to nbtinsert.c, but it's also used by
 * _bt_binsrch_insert)
 */
typedef struct BTInsertStateData
{
	IndexTuple	itup;			/* Item we're inserting */
	Size		itemsz;			/* Size of itup -- should be MAXALIGN()'d */
	BTScanInsert itup_key;		/* Insertion scankey */

	/* Buffer containing leaf page we're likely to insert itup on */
	Buffer		buf;

	/*
	 * Cache of bounds within the current buffer.  Only used for insertions
	 * where _bt_check_unique is called.  See _bt_binsrch_insert and
	 * _bt_findinsertloc for details.
	 */
	bool		bounds_valid;
	OffsetNumber low;
	OffsetNumber stricthigh;
} BTInsertStateData;

typedef BTInsertStateData *BTInsertState;

/*
 * BTScanOpaqueData is the btree-private state needed for an indexscan.
 * This consists of preprocessed scan keys (see _bt_preprocess_keys() for
 * details of the preprocessing), information about the current location
 * of the scan, and information about the marked location, if any.  (We use
 * BTScanPosData to represent the data needed for each of current and marked
 * locations.)  In addition we can remember some known-killed index entries
 * that must be marked before we can move off the current page.
 *
 * Index scans work a page at a time: we pin and read-lock the page, identify
 * all the matching items on the page and save them in BTScanPosData, then
 * release the read-lock while returning the items to the caller for
 * processing.  This approach minimizes lock/unlock traffic.  Note that we
 * keep the pin on the index page until the caller is done with all the items
 * (this is needed for VACUUM synchronization, see nbtree/README).  When we
 * are ready to step to the next page, if the caller has told us any of the
 * items were killed, we re-lock the page to mark them killed, then unlock.
 * Finally we drop the pin and step to the next page in the appropriate
 * direction.
 *
 * If we are doing an index-only scan, we save the entire IndexTuple for each
 * matched item, otherwise only its heap TID and offset.  The IndexTuples go
 * into a separate workspace array; each BTScanPosItem stores its tuple's
 * offset within that array.
 */

typedef struct BTScanPosItem	/* what we remember about each match */
{
	ItemPointerData heapTid;	/* TID of referenced heap item */
	OffsetNumber indexOffset;	/* index item's location within page */
	LocationIndex tupleOffset;	/* IndexTuple's offset in workspace, if any */
} BTScanPosItem;

typedef struct BTScanPosData
{
	Buffer		buf;			/* if valid, the buffer is pinned */

	XLogRecPtr	lsn;			/* pos in the WAL stream when page was read */
	BlockNumber currPage;		/* page referenced by items array */
	BlockNumber nextPage;		/* page's right link when we scanned it */

	/*
	 * moreLeft and moreRight track whether we think there may be matching
	 * index entries to the left and right of the current page, respectively.
	 * We can clear the appropriate one of these flags when _bt_checkkeys()
	 * returns continuescan = false.
	 */
	bool		moreLeft;
	bool		moreRight;

	/*
	 * If we are doing an index-only scan, nextTupleOffset is the first free
	 * location in the associated tuple storage workspace.
	 */
	int			nextTupleOffset;

	/*
	 * The items array is always ordered in index order (ie, increasing
	 * indexoffset).  When scanning backwards it is convenient to fill the
	 * array back-to-front, so we start at the last slot and fill downwards.
	 * Hence we need both a first-valid-entry and a last-valid-entry counter.
	 * itemIndex is a cursor showing which entry was last returned to caller.
	 */
	int			firstItem;		/* first valid index in items[] */
	int			lastItem;		/* last valid index in items[] */
	int			itemIndex;		/* current index in items[] */

	BTScanPosItem items[MaxIndexTuplesPerPage]; /* MUST BE LAST */
} BTScanPosData;

typedef BTScanPosData *BTScanPos;

#define BTScanPosIsPinned(scanpos) \
( \
	AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
				!BufferIsValid((scanpos).buf)), \
	BufferIsValid((scanpos).buf) \
)
#define BTScanPosUnpin(scanpos) \
	do { \
		ReleaseBuffer((scanpos).buf); \
		(scanpos).buf = InvalidBuffer; \
	} while (0)
#define BTScanPosUnpinIfPinned(scanpos) \
	do { \
		if (BTScanPosIsPinned(scanpos)) \
			BTScanPosUnpin(scanpos); \
	} while (0)

#define BTScanPosIsValid(scanpos) \
( \
	AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
				!BufferIsValid((scanpos).buf)), \
	BlockNumberIsValid((scanpos).currPage) \
)
#define BTScanPosInvalidate(scanpos) \
	do { \
		(scanpos).currPage = InvalidBlockNumber; \
		(scanpos).nextPage = InvalidBlockNumber; \
		(scanpos).buf = InvalidBuffer; \
		(scanpos).lsn = InvalidXLogRecPtr; \
		(scanpos).nextTupleOffset = 0; \
	} while (0)
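
/*
 * Illustrative sketch: a typical reset path first drops any held pin,
 * then marks the position invalid ("so" is the scan's BTScanOpaque,
 * defined below):
 *
 *		BTScanPosUnpinIfPinned(so->currPos);
 *		BTScanPosInvalidate(so->currPos);
 */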

/* We need one of these for each equality-type SK_SEARCHARRAY scan key */
typedef struct BTArrayKeyInfo
{
	int			scan_key;		/* index of associated key in arrayKeyData */
	int			cur_elem;		/* index of current element in elem_values */
	int			mark_elem;		/* index of marked element in elem_values */
	int			num_elems;		/* number of elems in current array value */
	Datum	   *elem_values;	/* array of num_elems Datums */
} BTArrayKeyInfo;

typedef struct BTScanOpaqueData
{
	/* these fields are set by _bt_preprocess_keys(): */
	bool		qual_ok;		/* false if qual can never be satisfied */
	int			numberOfKeys;	/* number of preprocessed scan keys */
	ScanKey		keyData;		/* array of preprocessed scan keys */

	/* workspace for SK_SEARCHARRAY support */
	ScanKey		arrayKeyData;	/* modified copy of scan->keyData */
	int			numArrayKeys;	/* number of equality-type array keys (-1 if
								 * there are any unsatisfiable array keys) */
	int			arrayKeyCount;	/* count indicating number of array scan keys
								 * processed */
	BTArrayKeyInfo *arrayKeys;	/* info about each equality-type array key */
	MemoryContext arrayContext; /* scan-lifespan context for array data */

	/* info about killed items if any (killedItems is NULL if never used) */
	int		   *killedItems;	/* currPos.items indexes of killed items */
	int			numKilled;		/* number of currently stored items */

	/*
	 * If we are doing an index-only scan, these are the tuple storage
	 * workspaces for the currPos and markPos respectively.  Each is of size
	 * BLCKSZ, so it can hold as much as a full page's worth of tuples.
	 */
	char	   *currTuples;		/* tuple storage for currPos */
	char	   *markTuples;		/* tuple storage for markPos */

	/*
	 * If the marked position is on the same page as current position, we
	 * don't use markPos, but just keep the marked itemIndex in markItemIndex
	 * (all the rest of currPos is valid for the mark position).  Hence, to
	 * determine if there is a mark, first look at markItemIndex, then at
	 * markPos.
	 */
	int			markItemIndex;	/* itemIndex, or -1 if not valid */

	/* keep these last in struct for efficiency */
	BTScanPosData currPos;		/* current position data */
	BTScanPosData markPos;		/* marked position, if any */
} BTScanOpaqueData;

typedef BTScanOpaqueData *BTScanOpaque;

/*
 * We use some private sk_flags bits in preprocessed scan keys.  We're allowed
 * to use bits 16-31 (see skey.h).  The uppermost bits are copied from the
 * index's indoption[] array entry for the index attribute.
 */
#define SK_BT_REQFWD	0x00010000	/* required to continue forward scan */
#define SK_BT_REQBKWD	0x00020000	/* required to continue backward scan */
#define SK_BT_INDOPTION_SHIFT  24	/* must clear the above bits */
#define SK_BT_DESC			(INDOPTION_DESC << SK_BT_INDOPTION_SHIFT)
#define SK_BT_NULLS_FIRST	(INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT)
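
/*
 * Illustrative sketch: comparison code consults these bits when
 * interpreting support-function results; for a DESC column the result of
 * the underlying BTORDER_PROC comparison is inverted (the real code uses
 * the INVERT_COMPARE_RESULT() macro from c.h):
 *
 *		if (scankey->sk_flags & SK_BT_DESC)
 *			INVERT_COMPARE_RESULT(result);
 */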

/*
 * Constant definition for progress reporting.  Phase numbers must match
 * btbuildphasename.
 */
/* PROGRESS_CREATEIDX_SUBPHASE_INITIALIZE is 1 (see progress.h) */
#define PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN		2
#define PROGRESS_BTREE_PHASE_PERFORMSORT_1				3
#define PROGRESS_BTREE_PHASE_PERFORMSORT_2				4
#define PROGRESS_BTREE_PHASE_LEAF_LOAD					5

/*
 * external entry points for btree, in nbtree.c
 */
extern void btbuildempty(Relation index);
extern bool btinsert(Relation rel, Datum *values, bool *isnull,
					 ItemPointer ht_ctid, Relation heapRel,
					 IndexUniqueCheck checkUnique,
					 struct IndexInfo *indexInfo);
extern IndexScanDesc btbeginscan(Relation rel, int nkeys, int norderbys);
extern Size btestimateparallelscan(void);
extern void btinitparallelscan(void *target);
extern bool btgettuple(IndexScanDesc scan, ScanDirection dir);
extern int64 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm);
extern void btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
					 ScanKey orderbys, int norderbys);
extern void btparallelrescan(IndexScanDesc scan);
extern void btendscan(IndexScanDesc scan);
extern void btmarkpos(IndexScanDesc scan);
extern void btrestrpos(IndexScanDesc scan);
extern IndexBulkDeleteResult *btbulkdelete(IndexVacuumInfo *info,
										   IndexBulkDeleteResult *stats,
										   IndexBulkDeleteCallback callback,
										   void *callback_state);
extern IndexBulkDeleteResult *btvacuumcleanup(IndexVacuumInfo *info,
											  IndexBulkDeleteResult *stats);
extern bool btcanreturn(Relation index, int attno);

/*
 * prototypes for internal functions in nbtree.c
 */
extern bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno);
extern void _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page);
extern void _bt_parallel_done(IndexScanDesc scan);
extern void _bt_parallel_advance_array_keys(IndexScanDesc scan);

/*
 * prototypes for functions in nbtinsert.c
 */
extern bool _bt_doinsert(Relation rel, IndexTuple itup,
						 IndexUniqueCheck checkUnique, Relation heapRel);
extern Buffer _bt_getstackbuf(Relation rel, BTStack stack);
extern void _bt_finish_split(Relation rel, Buffer bbuf, BTStack stack);

/*
 * prototypes for functions in nbtsplitloc.c
 */
extern OffsetNumber _bt_findsplitloc(Relation rel, Page page,
									 OffsetNumber newitemoff, Size newitemsz,
									 IndexTuple newitem, bool *newitemonleft);

/*
 * prototypes for functions in nbtpage.c
 */
extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level);
extern void _bt_update_meta_cleanup_info(Relation rel,
										 TransactionId oldestBtpoXact,
										 float8 numHeapTuples);
extern void _bt_upgrademetapage(Page page);
extern Buffer _bt_getroot(Relation rel, int access);
extern Buffer _bt_gettrueroot(Relation rel);
extern int	_bt_getrootheight(Relation rel);
extern bool _bt_heapkeyspace(Relation rel);
extern void _bt_checkpage(Relation rel, Buffer buf);
extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access);
extern Buffer _bt_relandgetbuf(Relation rel, Buffer obuf,
							   BlockNumber blkno, int access);
extern void _bt_relbuf(Relation rel, Buffer buf);
extern void _bt_pageinit(Page page, Size size);
extern bool _bt_page_recyclable(Page page);
extern void _bt_delitems_delete(Relation rel, Buffer buf,
								OffsetNumber *itemnos, int nitems,
								Relation heapRel);
extern void _bt_delitems_vacuum(Relation rel, Buffer buf,
								OffsetNumber *itemnos, int nitems,
								BlockNumber lastBlockVacuumed);
extern int	_bt_pagedel(Relation rel, Buffer buf);

/*
 * prototypes for functions in nbtsearch.c
 */
extern BTStack _bt_search(Relation rel, BTScanInsert key, Buffer *bufP,
						  int access, Snapshot snapshot);
extern Buffer _bt_moveright(Relation rel, BTScanInsert key, Buffer buf,
							bool forupdate, BTStack stack, int access,
							Snapshot snapshot);
extern OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate);
extern int32 _bt_compare(Relation rel, BTScanInsert key, Page page,
						 OffsetNumber offnum);
extern bool _bt_first(IndexScanDesc scan, ScanDirection dir);
extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
							   Snapshot snapshot);

/*
 * prototypes for functions in nbtutils.c
 */
extern BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup);
extern void _bt_freestack(BTStack stack);
extern void _bt_preprocess_array_keys(IndexScanDesc scan);
extern void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir);
extern bool _bt_advance_array_keys(IndexScanDesc scan, ScanDirection dir);
extern void _bt_mark_array_keys(IndexScanDesc scan);
extern void _bt_restore_array_keys(IndexScanDesc scan);
extern void _bt_preprocess_keys(IndexScanDesc scan);
extern bool _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple,
						  int tupnatts, ScanDirection dir, bool *continuescan);
extern void _bt_killitems(IndexScanDesc scan);
extern BTCycleId _bt_vacuum_cycleid(Relation rel);
extern BTCycleId _bt_start_vacuum(Relation rel);
extern void _bt_end_vacuum(Relation rel);
extern void _bt_end_vacuum_callback(int code, Datum arg);
extern Size BTreeShmemSize(void);
extern void BTreeShmemInit(void);
extern bytea *btoptions(Datum reloptions, bool validate);
extern bool btproperty(Oid index_oid, int attno,
					   IndexAMProperty prop, const char *propname,
					   bool *res, bool *isnull);
extern char *btbuildphasename(int64 phasenum);
extern IndexTuple _bt_truncate(Relation rel, IndexTuple lastleft,
							   IndexTuple firstright, BTScanInsert itup_key);
extern int	_bt_keep_natts_fast(Relation rel, IndexTuple lastleft,
								IndexTuple firstright);
extern bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page,
							OffsetNumber offnum);
extern void _bt_check_third_page(Relation rel, Relation heap,
								 bool needheaptidspace, Page page,
								 IndexTuple newtup);

/*
 * prototypes for functions in nbtvalidate.c
 */
extern bool btvalidate(Oid opclassoid);

/*
 * prototypes for functions in nbtsort.c
 */
extern IndexBuildResult *btbuild(Relation heap, Relation index,
								 struct IndexInfo *indexInfo);
extern void _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc);

#endif							/* NBTREE_H */