| 1 | /*- |
| 2 | * Copyright (c) 1991, 1993, 1994 |
| 3 | * The Regents of the University of California. All rights reserved. |
| 4 | * |
| 5 | * This code is derived from software contributed to Berkeley by |
| 6 | * Mike Olson. |
| 7 | * |
| 8 | * Redistribution and use in source and binary forms, with or without |
| 9 | * modification, are permitted provided that the following conditions |
| 10 | * are met: |
| 11 | * 1. Redistributions of source code must retain the above copyright |
| 12 | * notice, this list of conditions and the following disclaimer. |
| 13 | * 2. Redistributions in binary form must reproduce the above copyright |
| 14 | * notice, this list of conditions and the following disclaimer in the |
| 15 | * documentation and/or other materials provided with the distribution. |
| 16 | * 3. All advertising materials mentioning features or use of this software |
| 17 | * must display the following acknowledgement: |
| 18 | * This product includes software developed by the University of |
| 19 | * California, Berkeley and its contributors. |
| 20 | * 4. Neither the name of the University nor the names of its contributors |
| 21 | * may be used to endorse or promote products derived from this software |
| 22 | * without specific prior written permission. |
| 23 | * |
| 24 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
| 25 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 26 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| 27 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
| 28 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
| 29 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
| 30 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
| 31 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
| 32 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
| 33 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
| 34 | * SUCH DAMAGE. |
| 35 | * |
| 36 | * @(#)btree.h 8.11 (Berkeley) 8/17/94 |
| 37 | */ |
| 38 | |
| 39 | /* Macros to set/clear/test flags. */ |
| 40 | #define F_SET(p, f) (p)->flags |= (f) |
| 41 | #define F_CLR(p, f) (p)->flags &= ~(f) |
| 42 | #define F_ISSET(p, f) ((p)->flags & (f)) |
| 43 | |
| 44 | #include <mpool.h> |
| 45 | |
| 46 | #define DEFMINKEYPAGE (2) /* Minimum keys per page */ |
| 47 | #ifndef MINCACHE |
| 48 | #define MINCACHE (5) /* Minimum cached pages */ |
| 49 | #endif |
| 50 | #define MINPSIZE (512) /* Minimum page size */ |
| 51 | #ifndef DEFPSIZE |
| 52 | #define DEFPSIZE (4096) /* Default page size */ |
| 53 | #endif |
| 54 | |
| 55 | /* |
| 56 | * Page 0 of a btree file contains a copy of the meta-data. This page is also |
| 57 | * used as an out-of-band page, i.e. page pointers that point to nowhere point |
| 58 | * to page 0. Page 1 is the root of the btree. |
| 59 | */ |
| 60 | #define P_INVALID 0 /* Invalid tree page number. */ |
| 61 | #define P_META 0 /* Tree metadata page number. */ |
| 62 | #define P_ROOT 1 /* Tree root page number. */ |
| 63 | |
| 64 | /* |
| 65 | * There are five page layouts in the btree: btree internal pages (BINTERNAL), |
| 66 | * btree leaf pages (BLEAF), recno internal pages (RINTERNAL), recno leaf pages |
| 67 | * (RLEAF) and overflow pages. All five page types have a page header (PAGE). |
| 68 | * This implementation requires that values within structures NOT be padded. |
| 69 | * (ANSI C permits random padding.) If your compiler pads randomly you'll have |
| 70 | * to do some work to get this package to run. |
| 71 | */ |
| 72 | typedef struct _page { |
| 73 | pgno_t pgno; /* this page's page number */ |
| 74 | pgno_t prevpg; /* left sibling */ |
| 75 | pgno_t nextpg; /* right sibling */ |
| 76 | |
| 77 | #define P_BINTERNAL 0x01 /* btree internal page */ |
| 78 | #define P_BLEAF 0x02 /* leaf page */ |
| 79 | #define P_OVERFLOW 0x04 /* overflow page */ |
| 80 | #define P_RINTERNAL 0x08 /* recno internal page */ |
| 81 | #define P_RLEAF 0x10 /* leaf page */ |
| 82 | #define P_TYPE 0x1f /* type mask */ |
| 83 | #define P_PRESERVE 0x20 /* never delete this chain of pages */ |
| 84 | u_int32_t flags; |
| 85 | |
| 86 | indx_t lower; /* lower bound of free space on page */ |
| 87 | indx_t upper; /* upper bound of free space on page */ |
| 88 | indx_t linp[1]; /* indx_t-aligned VAR. LENGTH DATA */ |
| 89 | } PAGE; |
| 90 | |
| 91 | /* First and next index. */ |
| 92 | #define BTDATAOFF \ |
| 93 | (sizeof(pgno_t) + sizeof(pgno_t) + sizeof(pgno_t) + \ |
| 94 | sizeof(u_int32_t) + sizeof(indx_t) + sizeof(indx_t)) |
| 95 | #define NEXTINDEX(p) (((p)->lower - BTDATAOFF) / sizeof(indx_t)) |
| 96 | |
| 97 | /* |
| 98 | * For pages other than overflow pages, there is an array of offsets into the |
| 99 | * rest of the page immediately following the page header. Each offset is to |
| 100 | * an item which is unique to the type of page. The h_lower offset is just |
| 101 | * past the last filled-in index. The h_upper offset is the first item on the |
| 102 | * page. Offsets are from the beginning of the page. |
| 103 | * |
| 104 | * If an item is too big to store on a single page, a flag is set and the item |
| 105 | * is a { page, size } pair such that the page is the first page of an overflow |
| 106 | * chain with size bytes of item. Overflow pages are simply bytes without any |
| 107 | * external structure. |
| 108 | * |
| 109 | * The page number and size fields in the items are pgno_t-aligned so they can |
| 110 | * be manipulated without copying. (This presumes that 32 bit items can be |
| 111 | * manipulated on this system.) |
| 112 | */ |
| 113 | #define LALIGN(n) (((n) + sizeof(pgno_t) - 1) & ~(sizeof(pgno_t) - 1)) |
| 114 | #define NOVFLSIZE (sizeof(pgno_t) + sizeof(u_int32_t)) |
| 115 | |
| 116 | /* |
| 117 | * For the btree internal pages, the item is a key. BINTERNALs are {key, pgno} |
| 118 | * pairs, such that the key compares less than or equal to all of the records |
| 119 | * on that page. For a tree without duplicate keys, an internal page with two |
| 120 | * consecutive keys, a and b, will have all records greater than or equal to a |
| 121 | * and less than b stored on the page associated with a. Duplicate keys are |
| 122 | * somewhat special and can cause duplicate internal and leaf page records and |
| 123 | * some minor modifications of the above rule. |
| 124 | */ |
| 125 | typedef struct _binternal { |
| 126 | u_int32_t ksize; /* key size */ |
| 127 | pgno_t pgno; /* page number stored on */ |
| 128 | #define P_BIGDATA 0x01 /* overflow data */ |
| 129 | #define P_BIGKEY 0x02 /* overflow key */ |
| 130 | u_char flags; |
| 131 | char bytes[1]; /* data */ |
| 132 | } BINTERNAL; |
| 133 | |
| 134 | /* Get the page's BINTERNAL structure at index indx. */ |
| 135 | #define GETBINTERNAL(pg, indx) \ |
| 136 | ((BINTERNAL *)((char *)(pg) + (pg)->linp[indx])) |
| 137 | |
| 138 | /* Get the number of bytes in the entry. */ |
| 139 | #define NBINTERNAL(len) \ |
| 140 | LALIGN(sizeof(u_int32_t) + sizeof(pgno_t) + sizeof(u_char) + (len)) |
| 141 | |
| 142 | /* Copy a BINTERNAL entry to the page. */ |
| 143 | #define WR_BINTERNAL(p, size, pgno, flags) { \ |
| 144 | *(u_int32_t *)p = size; \ |
| 145 | p += sizeof(u_int32_t); \ |
| 146 | *(pgno_t *)p = pgno; \ |
| 147 | p += sizeof(pgno_t); \ |
| 148 | *(u_char *)p = flags; \ |
| 149 | p += sizeof(u_char); \ |
| 150 | } |
| 151 | |
| 152 | /* |
| 153 | * For the recno internal pages, the item is a page number with the number of |
| 154 | * keys found on that page and below. |
| 155 | */ |
| 156 | typedef struct _rinternal { |
| 157 | recno_t nrecs; /* number of records */ |
| 158 | pgno_t pgno; /* page number stored below */ |
| 159 | } RINTERNAL; |
| 160 | |
| 161 | /* Get the page's RINTERNAL structure at index indx. */ |
| 162 | #define GETRINTERNAL(pg, indx) \ |
| 163 | ((RINTERNAL *)((char *)(pg) + (pg)->linp[indx])) |
| 164 | |
| 165 | /* Get the number of bytes in the entry. */ |
| 166 | #define NRINTERNAL \ |
| 167 | LALIGN(sizeof(recno_t) + sizeof(pgno_t)) |
| 168 | |
| 169 | /* Copy a RINTERAL entry to the page. */ |
| 170 | #define WR_RINTERNAL(p, nrecs, pgno) { \ |
| 171 | *(recno_t *)p = nrecs; \ |
| 172 | p += sizeof(recno_t); \ |
| 173 | *(pgno_t *)p = pgno; \ |
| 174 | } |
| 175 | |
| 176 | /* For the btree leaf pages, the item is a key and data pair. */ |
| 177 | typedef struct _bleaf { |
| 178 | u_int32_t ksize; /* size of key */ |
| 179 | u_int32_t dsize; /* size of data */ |
| 180 | u_char flags; /* P_BIGDATA, P_BIGKEY */ |
| 181 | char bytes[1]; /* data */ |
| 182 | } BLEAF; |
| 183 | |
| 184 | /* Get the page's BLEAF structure at index indx. */ |
| 185 | #define GETBLEAF(pg, indx) \ |
| 186 | ((BLEAF *)((char *)(pg) + (pg)->linp[indx])) |
| 187 | |
| 188 | /* Get the number of bytes in the entry. */ |
| 189 | #define NBLEAF(p) NBLEAFDBT((p)->ksize, (p)->dsize) |
| 190 | |
| 191 | /* Get the number of bytes in the user's key/data pair. */ |
| 192 | #define NBLEAFDBT(ksize, dsize) \ |
| 193 | LALIGN(sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_char) + \ |
| 194 | (ksize) + (dsize)) |
| 195 | |
| 196 | /* Copy a BLEAF entry to the page. */ |
| 197 | #define WR_BLEAF(p, key, data, flags) { \ |
| 198 | *(u_int32_t *)p = key->size; \ |
| 199 | p += sizeof(u_int32_t); \ |
| 200 | *(u_int32_t *)p = data->size; \ |
| 201 | p += sizeof(u_int32_t); \ |
| 202 | *(u_char *)p = flags; \ |
| 203 | p += sizeof(u_char); \ |
| 204 | memmove(p, key->data, key->size); \ |
| 205 | p += key->size; \ |
| 206 | memmove(p, data->data, data->size); \ |
| 207 | } |
| 208 | |
| 209 | /* For the recno leaf pages, the item is a data entry. */ |
| 210 | typedef struct _rleaf { |
| 211 | u_int32_t dsize; /* size of data */ |
| 212 | u_char flags; /* P_BIGDATA */ |
| 213 | char bytes[1]; |
| 214 | } RLEAF; |
| 215 | |
| 216 | /* Get the page's RLEAF structure at index indx. */ |
| 217 | #define GETRLEAF(pg, indx) \ |
| 218 | ((RLEAF *)((char *)(pg) + (pg)->linp[indx])) |
| 219 | |
| 220 | /* Get the number of bytes in the entry. */ |
| 221 | #define NRLEAF(p) NRLEAFDBT((p)->dsize) |
| 222 | |
| 223 | /* Get the number of bytes from the user's data. */ |
| 224 | #define NRLEAFDBT(dsize) \ |
| 225 | LALIGN(sizeof(u_int32_t) + sizeof(u_char) + (dsize)) |
| 226 | |
| 227 | /* Copy a RLEAF entry to the page. */ |
| 228 | #define WR_RLEAF(p, data, flags) { \ |
| 229 | *(u_int32_t *)p = data->size; \ |
| 230 | p += sizeof(u_int32_t); \ |
| 231 | *(u_char *)p = flags; \ |
| 232 | p += sizeof(u_char); \ |
| 233 | memmove(p, data->data, data->size); \ |
| 234 | } |
| 235 | |
| 236 | /* |
| 237 | * A record in the tree is either a pointer to a page and an index in the page |
| 238 | * or a page number and an index. These structures are used as a cursor, stack |
| 239 | * entry and search returns as well as to pass records to other routines. |
| 240 | * |
| 241 | * One comment about searches. Internal page searches must find the largest |
| 242 | * record less than key in the tree so that descents work. Leaf page searches |
| 243 | * must find the smallest record greater than key so that the returned index |
| 244 | * is the record's correct position for insertion. |
| 245 | */ |
| 246 | typedef struct _epgno { |
| 247 | pgno_t pgno; /* the page number */ |
| 248 | indx_t index; /* the index on the page */ |
| 249 | } EPGNO; |
| 250 | |
| 251 | typedef struct _epg { |
| 252 | PAGE *page; /* the (pinned) page */ |
| 253 | indx_t index; /* the index on the page */ |
| 254 | } EPG; |
| 255 | |
| 256 | /* |
| 257 | * About cursors. The cursor (and the page that contained the key/data pair |
| 258 | * that it referenced) can be deleted, which makes things a bit tricky. If |
| 259 | * there are no duplicates of the cursor key in the tree (i.e. B_NODUPS is set |
| 260 | * or there simply aren't any duplicates of the key) we copy the key that it |
| 261 | * referenced when it's deleted, and reacquire a new cursor key if the cursor |
| 262 | * is used again. If there are duplicates keys, we move to the next/previous |
| 263 | * key, and set a flag so that we know what happened. NOTE: if duplicate (to |
| 264 | * the cursor) keys are added to the tree during this process, it is undefined |
| 265 | * if they will be returned or not in a cursor scan. |
| 266 | * |
| 267 | * The flags determine the possible states of the cursor: |
| 268 | * |
| 269 | * CURS_INIT The cursor references *something*. |
| 270 | * CURS_ACQUIRE The cursor was deleted, and a key has been saved so that |
| 271 | * we can reacquire the right position in the tree. |
| 272 | * CURS_AFTER, CURS_BEFORE |
| 273 | * The cursor was deleted, and now references a key/data pair |
| 274 | * that has not yet been returned, either before or after the |
| 275 | * deleted key/data pair. |
| 276 | * XXX |
| 277 | * This structure is broken out so that we can eventually offer multiple |
| 278 | * cursors as part of the DB interface. |
| 279 | */ |
| 280 | typedef struct _cursor { |
| 281 | EPGNO pg; /* B: Saved tree reference. */ |
| 282 | DBT key; /* B: Saved key, or key.data == NULL. */ |
| 283 | recno_t rcursor; /* R: recno cursor (1-based) */ |
| 284 | |
| 285 | #define CURS_ACQUIRE 0x01 /* B: Cursor needs to be reacquired. */ |
| 286 | #define CURS_AFTER 0x02 /* B: Unreturned cursor after key. */ |
| 287 | #define CURS_BEFORE 0x04 /* B: Unreturned cursor before key. */ |
| 288 | #define CURS_INIT 0x08 /* RB: Cursor initialized. */ |
| 289 | u_int8_t flags; |
| 290 | } CURSOR; |
| 291 | |
| 292 | /* |
| 293 | * The metadata of the tree. The nrecs field is used only by the RECNO code. |
| 294 | * This is because the btree doesn't really need it and it requires that every |
| 295 | * put or delete call modify the metadata. |
| 296 | */ |
| 297 | typedef struct _btmeta { |
| 298 | u_int32_t magic; /* magic number */ |
| 299 | u_int32_t version; /* version */ |
| 300 | u_int32_t psize; /* page size */ |
| 301 | u_int32_t free; /* page number of first free page */ |
| 302 | u_int32_t nrecs; /* R: number of records */ |
| 303 | |
| 304 | #define SAVEMETA (B_NODUPS | R_RECNO) |
| 305 | u_int32_t flags; /* bt_flags & SAVEMETA */ |
| 306 | } BTMETA; |
| 307 | |
| 308 | /* The in-memory btree/recno data structure. */ |
| 309 | typedef struct _btree { |
| 310 | MPOOL *bt_mp; /* memory pool cookie */ |
| 311 | |
| 312 | DB *bt_dbp; /* pointer to enclosing DB */ |
| 313 | |
| 314 | EPG bt_cur; /* current (pinned) page */ |
| 315 | PAGE *bt_pinned; /* page pinned across calls */ |
| 316 | |
| 317 | CURSOR bt_cursor; /* cursor */ |
| 318 | |
| 319 | #define BT_PUSH(t, p, i) { \ |
| 320 | t->bt_sp->pgno = p; \ |
| 321 | t->bt_sp->index = i; \ |
| 322 | ++t->bt_sp; \ |
| 323 | } |
| 324 | #define BT_POP(t) (t->bt_sp == t->bt_stack ? NULL : --t->bt_sp) |
| 325 | #define BT_CLR(t) (t->bt_sp = t->bt_stack) |
| 326 | EPGNO bt_stack[50]; /* stack of parent pages */ |
| 327 | EPGNO *bt_sp; /* current stack pointer */ |
| 328 | |
| 329 | DBT bt_rkey; /* returned key */ |
| 330 | DBT bt_rdata; /* returned data */ |
| 331 | |
| 332 | virt_fd_t bt_fd; /* tree virtual file descriptor */ |
| 333 | |
| 334 | pgno_t bt_free; /* next free page */ |
| 335 | u_int32_t bt_psize; /* page size */ |
| 336 | indx_t bt_ovflsize; /* cut-off for key/data overflow */ |
| 337 | int bt_lorder; /* byte order */ |
| 338 | /* sorted order */ |
| 339 | enum { NOT, BACK, FORWARD } bt_order; |
| 340 | EPGNO bt_last; /* last insert */ |
| 341 | |
| 342 | /* B: key comparison function */ |
| 343 | int (*bt_cmp) __P((const DBT *, const DBT *)); |
| 344 | /* B: prefix comparison function */ |
| 345 | size_t (*bt_pfx) __P((const DBT *, const DBT *)); |
| 346 | /* R: recno input function */ |
| 347 | int (*bt_irec) __P((struct _btree *, recno_t)); |
| 348 | |
| 349 | FILE *bt_rfp; /* R: record FILE pointer */ |
| 350 | int bt_rfd; /* R: record file descriptor */ |
| 351 | |
| 352 | caddr_t bt_cmap; /* R: current point in mapped space */ |
| 353 | caddr_t bt_smap; /* R: start of mapped space */ |
| 354 | caddr_t bt_emap; /* R: end of mapped space */ |
| 355 | size_t bt_msize; /* R: size of mapped region. */ |
| 356 | |
| 357 | recno_t bt_nrecs; /* R: number of records */ |
| 358 | size_t bt_reclen; /* R: fixed record length */ |
| 359 | u_char bt_bval; /* R: delimiting byte/pad character */ |
| 360 | |
| 361 | /* |
| 362 | * NB: |
| 363 | * B_NODUPS and R_RECNO are stored on disk, and may not be changed. |
| 364 | */ |
| 365 | #define B_INMEM 0x00001 /* in-memory tree */ |
| 366 | #define B_METADIRTY 0x00002 /* need to write metadata */ |
| 367 | #define B_MODIFIED 0x00004 /* tree modified */ |
| 368 | #define B_NEEDSWAP 0x00008 /* if byte order requires swapping */ |
| 369 | #define B_RDONLY 0x00010 /* read-only tree */ |
| 370 | |
| 371 | #define B_NODUPS 0x00020 /* no duplicate keys permitted */ |
| 372 | #define R_RECNO 0x00080 /* record oriented tree */ |
| 373 | |
| 374 | #define R_CLOSEFP 0x00040 /* opened a file pointer */ |
| 375 | #define R_EOF 0x00100 /* end of input file reached. */ |
| 376 | #define R_FIXLEN 0x00200 /* fixed length records */ |
| 377 | #define R_MEMMAPPED 0x00400 /* memory mapped file. */ |
| 378 | #define R_INMEM 0x00800 /* in-memory file */ |
| 379 | #define R_MODIFIED 0x01000 /* modified file */ |
| 380 | #define R_RDONLY 0x02000 /* read-only file */ |
| 381 | |
| 382 | #define B_DB_LOCK 0x04000 /* DB_LOCK specified. */ |
| 383 | #define B_DB_SHMEM 0x08000 /* DB_SHMEM specified. */ |
| 384 | #define B_DB_TXN 0x10000 /* DB_TXN specified. */ |
| 385 | u_int32_t flags; |
| 386 | } BTREE; |
| 387 | |
| 388 | #include "extern.h" |
| 389 | |