| 1 | /*------------------------------------------------------------------------- |
| 2 | * |
| 3 | * tidbitmap.c |
| 4 | * PostgreSQL tuple-id (TID) bitmap package |
| 5 | * |
| 6 | * This module provides bitmap data structures that are spiritually |
| 7 | * similar to Bitmapsets, but are specially adapted to store sets of |
| 8 | * tuple identifiers (TIDs), or ItemPointers. In particular, the division |
| 9 | * of an ItemPointer into BlockNumber and OffsetNumber is catered for. |
| 10 | * Also, since we wish to be able to store very large tuple sets in |
| 11 | * memory with this data structure, we support "lossy" storage, in which |
| 12 | * we no longer remember individual tuple offsets on a page but only the |
| 13 | * fact that a particular page needs to be visited. |
| 14 | * |
| 15 | * The "lossy" storage uses one bit per disk page, so at the standard 8K |
| 16 | * BLCKSZ, we can represent all pages in 64Gb of disk space in about 1Mb |
| 17 | * of memory. People pushing around tables of that size should have a |
| 18 | * couple of Mb to spare, so we don't worry about providing a second level |
| 19 | * of lossiness. In theory we could fall back to page ranges at some |
| 20 | * point, but for now that seems useless complexity. |
| 21 | * |
| 22 | * We also support the notion of candidate matches, or rechecking. This |
| 23 | * means we know that a search need visit only some tuples on a page, |
| 24 | * but we are not certain that all of those tuples are real matches. |
| 25 | * So the eventual heap scan must recheck the quals for these tuples only, |
| 26 | * rather than rechecking the quals for all tuples on the page as in the |
| 27 | * lossy-bitmap case. Rechecking can be specified when TIDs are inserted |
| 28 | * into a bitmap, and it can also happen internally when we AND a lossy |
| 29 | * and a non-lossy page. |
| 30 | * |
| 31 | * |
| 32 | * Copyright (c) 2003-2019, PostgreSQL Global Development Group |
| 33 | * |
| 34 | * IDENTIFICATION |
| 35 | * src/backend/nodes/tidbitmap.c |
| 36 | * |
| 37 | *------------------------------------------------------------------------- |
| 38 | */ |
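
/*
 * A minimal sketch of typical backend-local usage, as in a bitmap heap
 * scan.  Error handling and the actual heap access are omitted, and
 * "maxbytes", "tids" and "ntids" stand for caller-supplied values (here,
 * TIDs produced by an index scan):
 *
 *		TIDBitmap  *tbm = tbm_create(maxbytes, NULL);
 *		TBMIterator *it;
 *		TBMIterateResult *res;
 *
 *		tbm_add_tuples(tbm, tids, ntids, false);
 *		it = tbm_begin_iterate(tbm);
 *		while ((res = tbm_iterate(it)) != NULL)
 *		{
 *			if (res->ntuples < 0)
 *				.. visit all tuples on page res->blockno ..
 *			else
 *				.. visit res->offsets[0 .. res->ntuples - 1] there,
 *				   rechecking quals if res->recheck is set ..
 *		}
 *		tbm_end_iterate(it);
 *		tbm_free(tbm);
 */
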
| 39 | #include "postgres.h" |
| 40 | |
| 41 | #include <limits.h> |
| 42 | |
| 43 | #include "access/htup_details.h" |
| 44 | #include "nodes/bitmapset.h" |
| 45 | #include "nodes/tidbitmap.h" |
| 46 | #include "storage/lwlock.h" |
| 47 | #include "utils/dsa.h" |
| 48 | #include "utils/hashutils.h" |
| 49 | |
| 50 | /* |
 * The maximum number of tuples per page is not large (a few hundred with
 * 8K pages, or on the order of a thousand with 32K pages).  So there's
 * not much point in making
| 53 | * the per-page bitmaps variable size. We just legislate that the size |
| 54 | * is this: |
| 55 | */ |
| 56 | #define MAX_TUPLES_PER_PAGE MaxHeapTuplesPerPage |
| 57 | |
| 58 | /* |
| 59 | * When we have to switch over to lossy storage, we use a data structure |
| 60 | * with one bit per page, where all pages having the same number DIV |
| 61 | * PAGES_PER_CHUNK are aggregated into one chunk. When a chunk is present |
| 62 | * and has the bit set for a given page, there must not be a per-page entry |
| 63 | * for that page in the page table. |
| 64 | * |
| 65 | * We actually store both exact pages and lossy chunks in the same hash |
| 66 | * table, using identical data structures. (This is because the memory |
 * management for hashtables doesn't easily or efficiently allow space to
 * be transferred from one hashtable to another.)  Therefore it's best
| 69 | * if PAGES_PER_CHUNK is the same as MAX_TUPLES_PER_PAGE, or at least not |
| 70 | * too different. But we also want PAGES_PER_CHUNK to be a power of 2 to |
| 71 | * avoid expensive integer remainder operations. So, define it like this: |
| 72 | */ |
| 73 | #define PAGES_PER_CHUNK (BLCKSZ / 32) |
| 74 | |
| 75 | /* We use BITS_PER_BITMAPWORD and typedef bitmapword from nodes/bitmapset.h */ |
| 76 | |
| 77 | #define WORDNUM(x) ((x) / BITS_PER_BITMAPWORD) |
| 78 | #define BITNUM(x) ((x) % BITS_PER_BITMAPWORD) |
| 79 | |
| 80 | /* number of active words for an exact page: */ |
| 81 | #define WORDS_PER_PAGE ((MAX_TUPLES_PER_PAGE - 1) / BITS_PER_BITMAPWORD + 1) |
| 82 | /* number of active words for a lossy chunk: */ |
| 83 | #define WORDS_PER_CHUNK ((PAGES_PER_CHUNK - 1) / BITS_PER_BITMAPWORD + 1) |
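
/*
 * For example (an illustrative calculation only, assuming the default 8K
 * BLCKSZ and 64-bit bitmapwords): PAGES_PER_CHUNK is 8192/32 = 256, so
 * WORDS_PER_CHUNK is (256 - 1)/64 + 1 = 4; and with MaxHeapTuplesPerPage
 * being a few hundred (291 on common 64-bit builds), WORDS_PER_PAGE is
 * (291 - 1)/64 + 1 = 5.
 */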
| 84 | |
| 85 | /* |
| 86 | * The hashtable entries are represented by this data structure. For |
| 87 | * an exact page, blockno is the page number and bit k of the bitmap |
| 88 | * represents tuple offset k+1. For a lossy chunk, blockno is the first |
| 89 | * page in the chunk (this must be a multiple of PAGES_PER_CHUNK) and |
| 90 | * bit k represents page blockno+k. Note that it is not possible to |
| 91 | * have exact storage for the first page of a chunk if we are using |
| 92 | * lossy storage for any page in the chunk's range, since the same |
| 93 | * hashtable entry has to serve both purposes. |
| 94 | * |
| 95 | * recheck is used only on exact pages --- it indicates that although |
| 96 | * only the stated tuples need be checked, the full index qual condition |
| 97 | * must be checked for each (ie, these are candidate matches). |
| 98 | */ |
| 99 | typedef struct PagetableEntry |
| 100 | { |
| 101 | BlockNumber blockno; /* page number (hashtable key) */ |
| 102 | char status; /* hash entry status */ |
| 103 | bool ischunk; /* T = lossy storage, F = exact */ |
| 104 | bool recheck; /* should the tuples be rechecked? */ |
| 105 | bitmapword words[Max(WORDS_PER_PAGE, WORDS_PER_CHUNK)]; |
| 106 | } PagetableEntry; |
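
/*
 * For instance (illustrative numbers only): on an exact page, tuple offset
 * 10 is recorded as bit 9, i.e. in words[WORDNUM(9)] at bit BITNUM(9);
 * while in a lossy chunk with blockno 512 (a multiple of PAGES_PER_CHUNK =
 * 256), a set bit 3 means that all of page 515 must be visited.
 */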
| 107 | |
| 108 | /* |
| 109 | * Holds array of pagetable entries. |
| 110 | */ |
| 111 | typedef struct PTEntryArray |
| 112 | { |
	pg_atomic_uint32 refcount;	/* number of iterators attached */
| 114 | PagetableEntry ptentry[FLEXIBLE_ARRAY_MEMBER]; |
| 115 | } PTEntryArray; |
| 116 | |
| 117 | /* |
| 118 | * We want to avoid the overhead of creating the hashtable, which is |
| 119 | * comparatively large, when not necessary. Particularly when we are using a |
| 120 | * bitmap scan on the inside of a nestloop join: a bitmap may well live only |
| 121 | * long enough to accumulate one entry in such cases. We therefore avoid |
| 122 | * creating an actual hashtable until we need two pagetable entries. When |
| 123 | * just one pagetable entry is needed, we store it in a fixed field of |
| 124 | * TIDBitMap. (NOTE: we don't get rid of the hashtable if the bitmap later |
| 125 | * shrinks down to zero or one page again. So, status can be TBM_HASH even |
| 126 | * when nentries is zero or one.) |
| 127 | */ |
| 128 | typedef enum |
| 129 | { |
| 130 | TBM_EMPTY, /* no hashtable, nentries == 0 */ |
| 131 | TBM_ONE_PAGE, /* entry1 contains the single entry */ |
| 132 | TBM_HASH /* pagetable is valid, entry1 is not */ |
| 133 | } TBMStatus; |
| 134 | |
| 135 | /* |
| 136 | * Current iterating state of the TBM. |
| 137 | */ |
| 138 | typedef enum |
| 139 | { |
| 140 | TBM_NOT_ITERATING, /* not yet converted to page and chunk array */ |
| 141 | TBM_ITERATING_PRIVATE, /* converted to local page and chunk array */ |
| 142 | TBM_ITERATING_SHARED /* converted to shared page and chunk array */ |
| 143 | } TBMIteratingState; |
| 144 | |
| 145 | /* |
| 146 | * Here is the representation for a whole TIDBitMap: |
| 147 | */ |
| 148 | struct TIDBitmap |
| 149 | { |
| 150 | NodeTag type; /* to make it a valid Node */ |
| 151 | MemoryContext mcxt; /* memory context containing me */ |
| 152 | TBMStatus status; /* see codes above */ |
| 153 | struct pagetable_hash *pagetable; /* hash table of PagetableEntry's */ |
| 154 | int nentries; /* number of entries in pagetable */ |
| 155 | int maxentries; /* limit on same to meet maxbytes */ |
| 156 | int npages; /* number of exact entries in pagetable */ |
| 157 | int nchunks; /* number of lossy entries in pagetable */ |
| 158 | TBMIteratingState iterating; /* tbm_begin_iterate called? */ |
| 159 | uint32 lossify_start; /* offset to start lossifying hashtable at */ |
| 160 | PagetableEntry entry1; /* used when status == TBM_ONE_PAGE */ |
| 161 | /* these are valid when iterating is true: */ |
| 162 | PagetableEntry **spages; /* sorted exact-page list, or NULL */ |
| 163 | PagetableEntry **schunks; /* sorted lossy-chunk list, or NULL */ |
| 164 | dsa_pointer dsapagetable; /* dsa_pointer to the element array */ |
| 165 | dsa_pointer dsapagetableold; /* dsa_pointer to the old element array */ |
| 166 | dsa_pointer ptpages; /* dsa_pointer to the page array */ |
| 167 | dsa_pointer ptchunks; /* dsa_pointer to the chunk array */ |
| 168 | dsa_area *dsa; /* reference to per-query dsa area */ |
| 169 | }; |
| 170 | |
| 171 | /* |
| 172 | * When iterating over a bitmap in sorted order, a TBMIterator is used to |
| 173 | * track our progress. There can be several iterators scanning the same |
| 174 | * bitmap concurrently. Note that the bitmap becomes read-only as soon as |
| 175 | * any iterator is created. |
| 176 | */ |
| 177 | struct TBMIterator |
| 178 | { |
| 179 | TIDBitmap *tbm; /* TIDBitmap we're iterating over */ |
| 180 | int spageptr; /* next spages index */ |
| 181 | int schunkptr; /* next schunks index */ |
| 182 | int schunkbit; /* next bit to check in current schunk */ |
| 183 | TBMIterateResult output; /* MUST BE LAST (because variable-size) */ |
| 184 | }; |
| 185 | |
| 186 | /* |
| 187 | * Holds the shared members of the iterator so that multiple processes |
| 188 | * can jointly iterate. |
| 189 | */ |
| 190 | typedef struct TBMSharedIteratorState |
| 191 | { |
| 192 | int nentries; /* number of entries in pagetable */ |
| 193 | int maxentries; /* limit on same to meet maxbytes */ |
| 194 | int npages; /* number of exact entries in pagetable */ |
| 195 | int nchunks; /* number of lossy entries in pagetable */ |
	dsa_pointer pagetable;		/* dsa pointer to head of pagetable data */
| 197 | dsa_pointer spages; /* dsa pointer to page array */ |
| 198 | dsa_pointer schunks; /* dsa pointer to chunk array */ |
| 199 | LWLock lock; /* lock to protect below members */ |
| 200 | int spageptr; /* next spages index */ |
| 201 | int schunkptr; /* next schunks index */ |
| 202 | int schunkbit; /* next bit to check in current schunk */ |
| 203 | } TBMSharedIteratorState; |
| 204 | |
/*
 * Iteration arrays holding indexes into the pagetable element array.  We
 * store indexes rather than pointers because each backend may map the DSA
 * area at a different address.
 */
| 208 | typedef struct PTIterationArray |
| 209 | { |
	pg_atomic_uint32 refcount;	/* number of iterators attached */
| 211 | int index[FLEXIBLE_ARRAY_MEMBER]; /* index array */ |
| 212 | } PTIterationArray; |
| 213 | |
| 214 | /* |
 * Same as TBMIterator, but it is used for joint iteration and therefore
 * also holds a reference to the shared state.
| 217 | */ |
| 218 | struct TBMSharedIterator |
| 219 | { |
| 220 | TBMSharedIteratorState *state; /* shared state */ |
| 221 | PTEntryArray *ptbase; /* pagetable element array */ |
| 222 | PTIterationArray *ptpages; /* sorted exact page index list */ |
| 223 | PTIterationArray *ptchunks; /* sorted lossy page index list */ |
| 224 | TBMIterateResult output; /* MUST BE LAST (because variable-size) */ |
| 225 | }; |
| 226 | |
| 227 | /* Local function prototypes */ |
| 228 | static void tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage); |
| 229 | static bool tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage, |
| 230 | const TIDBitmap *b); |
| 231 | static const PagetableEntry *tbm_find_pageentry(const TIDBitmap *tbm, |
| 232 | BlockNumber pageno); |
| 233 | static PagetableEntry *tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno); |
| 234 | static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno); |
| 235 | static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno); |
| 236 | static void tbm_lossify(TIDBitmap *tbm); |
| 237 | static int tbm_comparator(const void *left, const void *right); |
| 238 | static int tbm_shared_comparator(const void *left, const void *right, |
| 239 | void *arg); |
| 240 | |
| 241 | /* define hashtable mapping block numbers to PagetableEntry's */ |
| 242 | #define SH_USE_NONDEFAULT_ALLOCATOR |
| 243 | #define SH_PREFIX pagetable |
| 244 | #define SH_ELEMENT_TYPE PagetableEntry |
| 245 | #define SH_KEY_TYPE BlockNumber |
| 246 | #define SH_KEY blockno |
| 247 | #define SH_HASH_KEY(tb, key) murmurhash32(key) |
#define SH_EQUAL(tb, a, b) ((a) == (b))
| 249 | #define SH_SCOPE static inline |
| 250 | #define SH_DEFINE |
| 251 | #define SH_DECLARE |
| 252 | #include "lib/simplehash.h" |
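
/*
 * The simplehash.h template instantiated above supplies the
 * pagetable_create, pagetable_destroy, pagetable_insert, pagetable_delete,
 * pagetable_lookup, pagetable_start_iterate(_at) and pagetable_iterate
 * functions used below.  Because of SH_USE_NONDEFAULT_ALLOCATOR, element
 * allocation is routed through pagetable_allocate and pagetable_free,
 * defined near the end of this file.
 */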
| 253 | |
| 254 | |
| 255 | /* |
| 256 | * tbm_create - create an initially-empty bitmap |
| 257 | * |
| 258 | * The bitmap will live in the memory context that is CurrentMemoryContext |
| 259 | * at the time of this call. It will be limited to (approximately) maxbytes |
| 260 | * total memory consumption. If the DSA passed to this function is not NULL |
| 261 | * then the memory for storing elements of the underlying page table will |
| 262 | * be allocated from the DSA. |
| 263 | */ |
| 264 | TIDBitmap * |
| 265 | tbm_create(long maxbytes, dsa_area *dsa) |
| 266 | { |
| 267 | TIDBitmap *tbm; |
| 268 | |
| 269 | /* Create the TIDBitmap struct and zero all its fields */ |
| 270 | tbm = makeNode(TIDBitmap); |
| 271 | |
| 272 | tbm->mcxt = CurrentMemoryContext; |
| 273 | tbm->status = TBM_EMPTY; |
| 274 | |
| 275 | tbm->maxentries = (int) tbm_calculate_entries(maxbytes); |
| 276 | tbm->lossify_start = 0; |
| 277 | tbm->dsa = dsa; |
| 278 | tbm->dsapagetable = InvalidDsaPointer; |
| 279 | tbm->dsapagetableold = InvalidDsaPointer; |
| 280 | tbm->ptpages = InvalidDsaPointer; |
| 281 | tbm->ptchunks = InvalidDsaPointer; |
| 282 | |
| 283 | return tbm; |
| 284 | } |
| 285 | |
| 286 | /* |
| 287 | * Actually create the hashtable. Since this is a moderately expensive |
| 288 | * proposition, we don't do it until we have to. |
| 289 | */ |
| 290 | static void |
| 291 | tbm_create_pagetable(TIDBitmap *tbm) |
| 292 | { |
| 293 | Assert(tbm->status != TBM_HASH); |
| 294 | Assert(tbm->pagetable == NULL); |
| 295 | |
| 296 | tbm->pagetable = pagetable_create(tbm->mcxt, 128, tbm); |
| 297 | |
| 298 | /* If entry1 is valid, push it into the hashtable */ |
| 299 | if (tbm->status == TBM_ONE_PAGE) |
| 300 | { |
| 301 | PagetableEntry *page; |
| 302 | bool found; |
| 303 | char oldstatus; |
| 304 | |
| 305 | page = pagetable_insert(tbm->pagetable, |
| 306 | tbm->entry1.blockno, |
| 307 | &found); |
| 308 | Assert(!found); |
| 309 | oldstatus = page->status; |
| 310 | memcpy(page, &tbm->entry1, sizeof(PagetableEntry)); |
| 311 | page->status = oldstatus; |
| 312 | } |
| 313 | |
| 314 | tbm->status = TBM_HASH; |
| 315 | } |
| 316 | |
| 317 | /* |
| 318 | * tbm_free - free a TIDBitmap |
| 319 | */ |
| 320 | void |
| 321 | tbm_free(TIDBitmap *tbm) |
| 322 | { |
| 323 | if (tbm->pagetable) |
| 324 | pagetable_destroy(tbm->pagetable); |
| 325 | if (tbm->spages) |
| 326 | pfree(tbm->spages); |
| 327 | if (tbm->schunks) |
| 328 | pfree(tbm->schunks); |
| 329 | pfree(tbm); |
| 330 | } |
| 331 | |
| 332 | /* |
| 333 | * tbm_free_shared_area - free shared state |
| 334 | * |
 * Free the shared iterator state.  Also free the shared pagetable and
 * iterator-array memory if they are no longer referred to by any shared
 * iterator, i.e. once their refcount drops to 0.
| 338 | */ |
| 339 | void |
| 340 | tbm_free_shared_area(dsa_area *dsa, dsa_pointer dp) |
| 341 | { |
| 342 | TBMSharedIteratorState *istate = dsa_get_address(dsa, dp); |
| 343 | PTEntryArray *ptbase; |
| 344 | PTIterationArray *ptpages; |
| 345 | PTIterationArray *ptchunks; |
| 346 | |
| 347 | if (DsaPointerIsValid(istate->pagetable)) |
| 348 | { |
| 349 | ptbase = dsa_get_address(dsa, istate->pagetable); |
| 350 | if (pg_atomic_sub_fetch_u32(&ptbase->refcount, 1) == 0) |
| 351 | dsa_free(dsa, istate->pagetable); |
| 352 | } |
| 353 | if (DsaPointerIsValid(istate->spages)) |
| 354 | { |
| 355 | ptpages = dsa_get_address(dsa, istate->spages); |
| 356 | if (pg_atomic_sub_fetch_u32(&ptpages->refcount, 1) == 0) |
| 357 | dsa_free(dsa, istate->spages); |
| 358 | } |
| 359 | if (DsaPointerIsValid(istate->schunks)) |
| 360 | { |
| 361 | ptchunks = dsa_get_address(dsa, istate->schunks); |
| 362 | if (pg_atomic_sub_fetch_u32(&ptchunks->refcount, 1) == 0) |
| 363 | dsa_free(dsa, istate->schunks); |
| 364 | } |
| 365 | |
| 366 | dsa_free(dsa, dp); |
| 367 | } |
| 368 | |
| 369 | /* |
| 370 | * tbm_add_tuples - add some tuple IDs to a TIDBitmap |
| 371 | * |
| 372 | * If recheck is true, then the recheck flag will be set in the |
| 373 | * TBMIterateResult when any of these tuples are reported out. |
| 374 | */ |
| 375 | void |
| 376 | tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids, |
| 377 | bool recheck) |
| 378 | { |
| 379 | BlockNumber currblk = InvalidBlockNumber; |
| 380 | PagetableEntry *page = NULL; /* only valid when currblk is valid */ |
| 381 | int i; |
| 382 | |
| 383 | Assert(tbm->iterating == TBM_NOT_ITERATING); |
| 384 | for (i = 0; i < ntids; i++) |
| 385 | { |
| 386 | BlockNumber blk = ItemPointerGetBlockNumber(tids + i); |
| 387 | OffsetNumber off = ItemPointerGetOffsetNumber(tids + i); |
| 388 | int wordnum, |
| 389 | bitnum; |
| 390 | |
| 391 | /* safety check to ensure we don't overrun bit array bounds */ |
| 392 | if (off < 1 || off > MAX_TUPLES_PER_PAGE) |
			elog(ERROR, "tuple offset out of range: %u", off);
| 394 | |
| 395 | /* |
| 396 | * Look up target page unless we already did. This saves cycles when |
| 397 | * the input includes consecutive tuples on the same page, which is |
| 398 | * common enough to justify an extra test here. |
| 399 | */ |
| 400 | if (blk != currblk) |
| 401 | { |
| 402 | if (tbm_page_is_lossy(tbm, blk)) |
| 403 | page = NULL; /* remember page is lossy */ |
| 404 | else |
| 405 | page = tbm_get_pageentry(tbm, blk); |
| 406 | currblk = blk; |
| 407 | } |
| 408 | |
| 409 | if (page == NULL) |
| 410 | continue; /* whole page is already marked */ |
| 411 | |
| 412 | if (page->ischunk) |
| 413 | { |
| 414 | /* The page is a lossy chunk header, set bit for itself */ |
| 415 | wordnum = bitnum = 0; |
| 416 | } |
| 417 | else |
| 418 | { |
| 419 | /* Page is exact, so set bit for individual tuple */ |
| 420 | wordnum = WORDNUM(off - 1); |
| 421 | bitnum = BITNUM(off - 1); |
| 422 | } |
| 423 | page->words[wordnum] |= ((bitmapword) 1 << bitnum); |
| 424 | page->recheck |= recheck; |
| 425 | |
| 426 | if (tbm->nentries > tbm->maxentries) |
| 427 | { |
| 428 | tbm_lossify(tbm); |
| 429 | /* Page could have been converted to lossy, so force new lookup */ |
| 430 | currblk = InvalidBlockNumber; |
| 431 | } |
| 432 | } |
| 433 | } |
| 434 | |
| 435 | /* |
| 436 | * tbm_add_page - add a whole page to a TIDBitmap |
| 437 | * |
| 438 | * This causes the whole page to be reported (with the recheck flag) |
| 439 | * when the TIDBitmap is scanned. |
| 440 | */ |
| 441 | void |
| 442 | tbm_add_page(TIDBitmap *tbm, BlockNumber pageno) |
| 443 | { |
| 444 | /* Enter the page in the bitmap, or mark it lossy if already present */ |
| 445 | tbm_mark_page_lossy(tbm, pageno); |
| 446 | /* If we went over the memory limit, lossify some more pages */ |
| 447 | if (tbm->nentries > tbm->maxentries) |
| 448 | tbm_lossify(tbm); |
| 449 | } |
| 450 | |
| 451 | /* |
| 452 | * tbm_union - set union |
| 453 | * |
| 454 | * a is modified in-place, b is not changed |
| 455 | */ |
| 456 | void |
| 457 | tbm_union(TIDBitmap *a, const TIDBitmap *b) |
| 458 | { |
	Assert(a->iterating == TBM_NOT_ITERATING);
| 460 | /* Nothing to do if b is empty */ |
| 461 | if (b->nentries == 0) |
| 462 | return; |
| 463 | /* Scan through chunks and pages in b, merge into a */ |
| 464 | if (b->status == TBM_ONE_PAGE) |
| 465 | tbm_union_page(a, &b->entry1); |
| 466 | else |
| 467 | { |
| 468 | pagetable_iterator i; |
| 469 | PagetableEntry *bpage; |
| 470 | |
| 471 | Assert(b->status == TBM_HASH); |
| 472 | pagetable_start_iterate(b->pagetable, &i); |
| 473 | while ((bpage = pagetable_iterate(b->pagetable, &i)) != NULL) |
| 474 | tbm_union_page(a, bpage); |
| 475 | } |
| 476 | } |
| 477 | |
| 478 | /* Process one page of b during a union op */ |
| 479 | static void |
| 480 | tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage) |
| 481 | { |
| 482 | PagetableEntry *apage; |
| 483 | int wordnum; |
| 484 | |
| 485 | if (bpage->ischunk) |
| 486 | { |
| 487 | /* Scan b's chunk, mark each indicated page lossy in a */ |
| 488 | for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++) |
| 489 | { |
| 490 | bitmapword w = bpage->words[wordnum]; |
| 491 | |
| 492 | if (w != 0) |
| 493 | { |
| 494 | BlockNumber pg; |
| 495 | |
| 496 | pg = bpage->blockno + (wordnum * BITS_PER_BITMAPWORD); |
| 497 | while (w != 0) |
| 498 | { |
| 499 | if (w & 1) |
| 500 | tbm_mark_page_lossy(a, pg); |
| 501 | pg++; |
| 502 | w >>= 1; |
| 503 | } |
| 504 | } |
| 505 | } |
| 506 | } |
| 507 | else if (tbm_page_is_lossy(a, bpage->blockno)) |
| 508 | { |
| 509 | /* page is already lossy in a, nothing to do */ |
| 510 | return; |
| 511 | } |
| 512 | else |
| 513 | { |
| 514 | apage = tbm_get_pageentry(a, bpage->blockno); |
| 515 | if (apage->ischunk) |
| 516 | { |
| 517 | /* The page is a lossy chunk header, set bit for itself */ |
| 518 | apage->words[0] |= ((bitmapword) 1 << 0); |
| 519 | } |
| 520 | else |
| 521 | { |
| 522 | /* Both pages are exact, merge at the bit level */ |
| 523 | for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++) |
| 524 | apage->words[wordnum] |= bpage->words[wordnum]; |
| 525 | apage->recheck |= bpage->recheck; |
| 526 | } |
| 527 | } |
| 528 | |
| 529 | if (a->nentries > a->maxentries) |
| 530 | tbm_lossify(a); |
| 531 | } |
| 532 | |
| 533 | /* |
| 534 | * tbm_intersect - set intersection |
| 535 | * |
| 536 | * a is modified in-place, b is not changed |
| 537 | */ |
| 538 | void |
| 539 | tbm_intersect(TIDBitmap *a, const TIDBitmap *b) |
| 540 | { |
	Assert(a->iterating == TBM_NOT_ITERATING);
| 542 | /* Nothing to do if a is empty */ |
| 543 | if (a->nentries == 0) |
| 544 | return; |
| 545 | /* Scan through chunks and pages in a, try to match to b */ |
| 546 | if (a->status == TBM_ONE_PAGE) |
| 547 | { |
| 548 | if (tbm_intersect_page(a, &a->entry1, b)) |
| 549 | { |
| 550 | /* Page is now empty, remove it from a */ |
| 551 | Assert(!a->entry1.ischunk); |
| 552 | a->npages--; |
| 553 | a->nentries--; |
| 554 | Assert(a->nentries == 0); |
| 555 | a->status = TBM_EMPTY; |
| 556 | } |
| 557 | } |
| 558 | else |
| 559 | { |
| 560 | pagetable_iterator i; |
| 561 | PagetableEntry *apage; |
| 562 | |
| 563 | Assert(a->status == TBM_HASH); |
| 564 | pagetable_start_iterate(a->pagetable, &i); |
| 565 | while ((apage = pagetable_iterate(a->pagetable, &i)) != NULL) |
| 566 | { |
| 567 | if (tbm_intersect_page(a, apage, b)) |
| 568 | { |
| 569 | /* Page or chunk is now empty, remove it from a */ |
| 570 | if (apage->ischunk) |
| 571 | a->nchunks--; |
| 572 | else |
| 573 | a->npages--; |
| 574 | a->nentries--; |
| 575 | if (!pagetable_delete(a->pagetable, apage->blockno)) |
				elog(ERROR, "hash table corrupted");
| 577 | } |
| 578 | } |
| 579 | } |
| 580 | } |
| 581 | |
| 582 | /* |
| 583 | * Process one page of a during an intersection op |
| 584 | * |
| 585 | * Returns true if apage is now empty and should be deleted from a |
| 586 | */ |
| 587 | static bool |
| 588 | tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage, const TIDBitmap *b) |
| 589 | { |
| 590 | const PagetableEntry *bpage; |
| 591 | int wordnum; |
| 592 | |
| 593 | if (apage->ischunk) |
| 594 | { |
| 595 | /* Scan each bit in chunk, try to clear */ |
| 596 | bool candelete = true; |
| 597 | |
| 598 | for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++) |
| 599 | { |
| 600 | bitmapword w = apage->words[wordnum]; |
| 601 | |
| 602 | if (w != 0) |
| 603 | { |
| 604 | bitmapword neww = w; |
| 605 | BlockNumber pg; |
| 606 | int bitnum; |
| 607 | |
| 608 | pg = apage->blockno + (wordnum * BITS_PER_BITMAPWORD); |
| 609 | bitnum = 0; |
| 610 | while (w != 0) |
| 611 | { |
| 612 | if (w & 1) |
| 613 | { |
| 614 | if (!tbm_page_is_lossy(b, pg) && |
| 615 | tbm_find_pageentry(b, pg) == NULL) |
| 616 | { |
| 617 | /* Page is not in b at all, lose lossy bit */ |
| 618 | neww &= ~((bitmapword) 1 << bitnum); |
| 619 | } |
| 620 | } |
| 621 | pg++; |
| 622 | bitnum++; |
| 623 | w >>= 1; |
| 624 | } |
| 625 | apage->words[wordnum] = neww; |
| 626 | if (neww != 0) |
| 627 | candelete = false; |
| 628 | } |
| 629 | } |
| 630 | return candelete; |
| 631 | } |
| 632 | else if (tbm_page_is_lossy(b, apage->blockno)) |
| 633 | { |
| 634 | /* |
| 635 | * Some of the tuples in 'a' might not satisfy the quals for 'b', but |
| 636 | * because the page 'b' is lossy, we don't know which ones. Therefore |
| 637 | * we mark 'a' as requiring rechecks, to indicate that at most those |
| 638 | * tuples set in 'a' are matches. |
| 639 | */ |
| 640 | apage->recheck = true; |
| 641 | return false; |
| 642 | } |
| 643 | else |
| 644 | { |
| 645 | bool candelete = true; |
| 646 | |
| 647 | bpage = tbm_find_pageentry(b, apage->blockno); |
| 648 | if (bpage != NULL) |
| 649 | { |
| 650 | /* Both pages are exact, merge at the bit level */ |
| 651 | Assert(!bpage->ischunk); |
| 652 | for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++) |
| 653 | { |
| 654 | apage->words[wordnum] &= bpage->words[wordnum]; |
| 655 | if (apage->words[wordnum] != 0) |
| 656 | candelete = false; |
| 657 | } |
| 658 | apage->recheck |= bpage->recheck; |
| 659 | } |
| 660 | /* If there is no matching b page, we can just delete the a page */ |
| 661 | return candelete; |
| 662 | } |
| 663 | } |
| 664 | |
| 665 | /* |
| 666 | * tbm_is_empty - is a TIDBitmap completely empty? |
| 667 | */ |
| 668 | bool |
| 669 | tbm_is_empty(const TIDBitmap *tbm) |
| 670 | { |
| 671 | return (tbm->nentries == 0); |
| 672 | } |
| 673 | |
| 674 | /* |
| 675 | * tbm_begin_iterate - prepare to iterate through a TIDBitmap |
| 676 | * |
| 677 | * The TBMIterator struct is created in the caller's memory context. |
| 678 | * For a clean shutdown of the iteration, call tbm_end_iterate; but it's |
 * okay to just allow the memory context to be released, too.  It is the
 * caller's responsibility not to touch the TBMIterator anymore once the
 * TIDBitmap is freed.
| 682 | * |
| 683 | * NB: after this is called, it is no longer allowed to modify the contents |
| 684 | * of the bitmap. However, you can call this multiple times to scan the |
| 685 | * contents repeatedly, including parallel scans. |
| 686 | */ |
| 687 | TBMIterator * |
| 688 | tbm_begin_iterate(TIDBitmap *tbm) |
| 689 | { |
| 690 | TBMIterator *iterator; |
| 691 | |
| 692 | Assert(tbm->iterating != TBM_ITERATING_SHARED); |
| 693 | |
| 694 | /* |
| 695 | * Create the TBMIterator struct, with enough trailing space to serve the |
| 696 | * needs of the TBMIterateResult sub-struct. |
| 697 | */ |
| 698 | iterator = (TBMIterator *) palloc(sizeof(TBMIterator) + |
| 699 | MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber)); |
| 700 | iterator->tbm = tbm; |
| 701 | |
| 702 | /* |
| 703 | * Initialize iteration pointers. |
| 704 | */ |
| 705 | iterator->spageptr = 0; |
| 706 | iterator->schunkptr = 0; |
| 707 | iterator->schunkbit = 0; |
| 708 | |
| 709 | /* |
| 710 | * If we have a hashtable, create and fill the sorted page lists, unless |
| 711 | * we already did that for a previous iterator. Note that the lists are |
| 712 | * attached to the bitmap not the iterator, so they can be used by more |
| 713 | * than one iterator. |
| 714 | */ |
| 715 | if (tbm->status == TBM_HASH && tbm->iterating == TBM_NOT_ITERATING) |
| 716 | { |
| 717 | pagetable_iterator i; |
| 718 | PagetableEntry *page; |
| 719 | int npages; |
| 720 | int nchunks; |
| 721 | |
| 722 | if (!tbm->spages && tbm->npages > 0) |
| 723 | tbm->spages = (PagetableEntry **) |
| 724 | MemoryContextAlloc(tbm->mcxt, |
| 725 | tbm->npages * sizeof(PagetableEntry *)); |
| 726 | if (!tbm->schunks && tbm->nchunks > 0) |
| 727 | tbm->schunks = (PagetableEntry **) |
| 728 | MemoryContextAlloc(tbm->mcxt, |
| 729 | tbm->nchunks * sizeof(PagetableEntry *)); |
| 730 | |
| 731 | npages = nchunks = 0; |
| 732 | pagetable_start_iterate(tbm->pagetable, &i); |
| 733 | while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL) |
| 734 | { |
| 735 | if (page->ischunk) |
| 736 | tbm->schunks[nchunks++] = page; |
| 737 | else |
| 738 | tbm->spages[npages++] = page; |
| 739 | } |
| 740 | Assert(npages == tbm->npages); |
| 741 | Assert(nchunks == tbm->nchunks); |
| 742 | if (npages > 1) |
| 743 | qsort(tbm->spages, npages, sizeof(PagetableEntry *), |
| 744 | tbm_comparator); |
| 745 | if (nchunks > 1) |
| 746 | qsort(tbm->schunks, nchunks, sizeof(PagetableEntry *), |
| 747 | tbm_comparator); |
| 748 | } |
| 749 | |
| 750 | tbm->iterating = TBM_ITERATING_PRIVATE; |
| 751 | |
| 752 | return iterator; |
| 753 | } |
| 754 | |
| 755 | /* |
| 756 | * tbm_prepare_shared_iterate - prepare shared iteration state for a TIDBitmap. |
| 757 | * |
| 758 | * The necessary shared state will be allocated from the DSA passed to |
| 759 | * tbm_create, so that multiple processes can attach to it and iterate jointly. |
| 760 | * |
 * This will convert the pagetable hash into arrays of indexes into the
 * pagetable element array: one for exact pages and one for lossy chunks.
| 763 | */ |
| 764 | dsa_pointer |
| 765 | tbm_prepare_shared_iterate(TIDBitmap *tbm) |
| 766 | { |
| 767 | dsa_pointer dp; |
| 768 | TBMSharedIteratorState *istate; |
| 769 | PTEntryArray *ptbase = NULL; |
| 770 | PTIterationArray *ptpages = NULL; |
| 771 | PTIterationArray *ptchunks = NULL; |
| 772 | |
| 773 | Assert(tbm->dsa != NULL); |
| 774 | Assert(tbm->iterating != TBM_ITERATING_PRIVATE); |
| 775 | |
| 776 | /* |
	 * Allocate TBMSharedIteratorState from the DSA to hold the shared
	 * members and the lock; it will be used by multiple workers during
	 * shared iteration.
| 779 | */ |
| 780 | dp = dsa_allocate0(tbm->dsa, sizeof(TBMSharedIteratorState)); |
| 781 | istate = dsa_get_address(tbm->dsa, dp); |
| 782 | |
| 783 | /* |
| 784 | * If we're not already iterating, create and fill the sorted page lists. |
| 785 | * (If we are, the sorted page lists are already stored in the TIDBitmap, |
| 786 | * and we can just reuse them.) |
| 787 | */ |
| 788 | if (tbm->iterating == TBM_NOT_ITERATING) |
| 789 | { |
| 790 | pagetable_iterator i; |
| 791 | PagetableEntry *page; |
| 792 | int idx; |
| 793 | int npages; |
| 794 | int nchunks; |
| 795 | |
| 796 | /* |
| 797 | * Allocate the page and chunk array memory from the DSA to share |
| 798 | * across multiple processes. |
| 799 | */ |
| 800 | if (tbm->npages) |
| 801 | { |
| 802 | tbm->ptpages = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) + |
| 803 | tbm->npages * sizeof(int)); |
| 804 | ptpages = dsa_get_address(tbm->dsa, tbm->ptpages); |
| 805 | pg_atomic_init_u32(&ptpages->refcount, 0); |
| 806 | } |
| 807 | if (tbm->nchunks) |
| 808 | { |
| 809 | tbm->ptchunks = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) + |
| 810 | tbm->nchunks * sizeof(int)); |
| 811 | ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks); |
| 812 | pg_atomic_init_u32(&ptchunks->refcount, 0); |
| 813 | } |
| 814 | |
| 815 | /* |
| 816 | * If TBM status is TBM_HASH then iterate over the pagetable and |
| 817 | * convert it to page and chunk arrays. But if it's in the |
| 818 | * TBM_ONE_PAGE mode then directly allocate the space for one entry |
| 819 | * from the DSA. |
| 820 | */ |
| 821 | npages = nchunks = 0; |
| 822 | if (tbm->status == TBM_HASH) |
| 823 | { |
| 824 | ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable); |
| 825 | |
| 826 | pagetable_start_iterate(tbm->pagetable, &i); |
| 827 | while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL) |
| 828 | { |
| 829 | idx = page - ptbase->ptentry; |
| 830 | if (page->ischunk) |
| 831 | ptchunks->index[nchunks++] = idx; |
| 832 | else |
| 833 | ptpages->index[npages++] = idx; |
| 834 | } |
| 835 | |
| 836 | Assert(npages == tbm->npages); |
| 837 | Assert(nchunks == tbm->nchunks); |
| 838 | } |
| 839 | else if (tbm->status == TBM_ONE_PAGE) |
| 840 | { |
| 841 | /* |
| 842 | * In one page mode allocate the space for one pagetable entry, |
| 843 | * initialize it, and directly store its index (i.e. 0) in the |
| 844 | * page array. |
| 845 | */ |
| 846 | tbm->dsapagetable = dsa_allocate(tbm->dsa, sizeof(PTEntryArray) + |
| 847 | sizeof(PagetableEntry)); |
| 848 | ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable); |
| 849 | memcpy(ptbase->ptentry, &tbm->entry1, sizeof(PagetableEntry)); |
| 850 | ptpages->index[0] = 0; |
| 851 | } |
| 852 | |
| 853 | if (ptbase != NULL) |
| 854 | pg_atomic_init_u32(&ptbase->refcount, 0); |
| 855 | if (npages > 1) |
| 856 | qsort_arg((void *) (ptpages->index), npages, sizeof(int), |
| 857 | tbm_shared_comparator, (void *) ptbase->ptentry); |
| 858 | if (nchunks > 1) |
| 859 | qsort_arg((void *) (ptchunks->index), nchunks, sizeof(int), |
| 860 | tbm_shared_comparator, (void *) ptbase->ptentry); |
| 861 | } |
| 862 | |
| 863 | /* |
| 864 | * Store the TBM members in the shared state so that we can share them |
| 865 | * across multiple processes. |
| 866 | */ |
| 867 | istate->nentries = tbm->nentries; |
| 868 | istate->maxentries = tbm->maxentries; |
| 869 | istate->npages = tbm->npages; |
| 870 | istate->nchunks = tbm->nchunks; |
| 871 | istate->pagetable = tbm->dsapagetable; |
| 872 | istate->spages = tbm->ptpages; |
| 873 | istate->schunks = tbm->ptchunks; |
| 874 | |
| 875 | ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable); |
| 876 | ptpages = dsa_get_address(tbm->dsa, tbm->ptpages); |
| 877 | ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks); |
| 878 | |
| 879 | /* |
	 * For every shared iterator that will refer to the pagetable and
	 * iterator arrays, increase the refcount by 1, so that when freeing a
	 * shared iterator we don't free the pagetable and iterator arrays until
	 * their refcount drops to 0.
| 883 | */ |
| 884 | if (ptbase != NULL) |
| 885 | pg_atomic_add_fetch_u32(&ptbase->refcount, 1); |
| 886 | if (ptpages != NULL) |
| 887 | pg_atomic_add_fetch_u32(&ptpages->refcount, 1); |
| 888 | if (ptchunks != NULL) |
| 889 | pg_atomic_add_fetch_u32(&ptchunks->refcount, 1); |
| 890 | |
| 891 | /* Initialize the iterator lock */ |
| 892 | LWLockInitialize(&istate->lock, LWTRANCHE_TBM); |
| 893 | |
| 894 | /* Initialize the shared iterator state */ |
| 895 | istate->schunkbit = 0; |
| 896 | istate->schunkptr = 0; |
| 897 | istate->spageptr = 0; |
| 898 | |
| 899 | tbm->iterating = TBM_ITERATING_SHARED; |
| 900 | |
| 901 | return dp; |
| 902 | } |
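
/*
 * A minimal sketch of joint iteration (parallel-context setup and error
 * handling omitted): the leader obtains
 *
 *		dsa_pointer dp = tbm_prepare_shared_iterate(tbm);
 *
 * and passes dp to the workers, e.g. through shared memory; then each
 * participant, leader included, does
 *
 *		TBMSharedIterator *it = tbm_attach_shared_iterate(dsa, dp);
 *		TBMIterateResult *res;
 *
 *		while ((res = tbm_shared_iterate(it)) != NULL)
 *			.. process res exactly as for tbm_iterate ..
 *		tbm_end_shared_iterate(it);
 *
 * where dsa is the same per-query DSA area that was passed to tbm_create.
 */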
| 903 | |
| 904 | /* |
| 905 | * tbm_extract_page_tuple - extract the tuple offsets from a page |
| 906 | * |
| 907 | * The extracted offsets are stored into TBMIterateResult. |
| 908 | */ |
| 909 | static inline int |
tbm_extract_page_tuple(PagetableEntry *page, TBMIterateResult *output)
| 911 | { |
| 912 | int wordnum; |
| 913 | int ntuples = 0; |
| 914 | |
| 915 | for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++) |
| 916 | { |
| 917 | bitmapword w = page->words[wordnum]; |
| 918 | |
| 919 | if (w != 0) |
| 920 | { |
| 921 | int off = wordnum * BITS_PER_BITMAPWORD + 1; |
| 922 | |
| 923 | while (w != 0) |
| 924 | { |
| 925 | if (w & 1) |
| 926 | output->offsets[ntuples++] = (OffsetNumber) off; |
| 927 | off++; |
| 928 | w >>= 1; |
| 929 | } |
| 930 | } |
| 931 | } |
| 932 | |
| 933 | return ntuples; |
| 934 | } |
| 935 | |
| 936 | /* |
 * tbm_advance_schunkbit - advance *schunkbitp to the next set bit at or
 * beyond its current value, or to PAGES_PER_CHUNK if no set bit remains
| 938 | */ |
| 939 | static inline void |
| 940 | tbm_advance_schunkbit(PagetableEntry *chunk, int *schunkbitp) |
| 941 | { |
| 942 | int schunkbit = *schunkbitp; |
| 943 | |
| 944 | while (schunkbit < PAGES_PER_CHUNK) |
| 945 | { |
| 946 | int wordnum = WORDNUM(schunkbit); |
| 947 | int bitnum = BITNUM(schunkbit); |
| 948 | |
| 949 | if ((chunk->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0) |
| 950 | break; |
| 951 | schunkbit++; |
| 952 | } |
| 953 | |
| 954 | *schunkbitp = schunkbit; |
| 955 | } |
| 956 | |
| 957 | /* |
| 958 | * tbm_iterate - scan through next page of a TIDBitmap |
| 959 | * |
| 960 | * Returns a TBMIterateResult representing one page, or NULL if there are |
| 961 | * no more pages to scan. Pages are guaranteed to be delivered in numerical |
| 962 | * order. If result->ntuples < 0, then the bitmap is "lossy" and failed to |
| 963 | * remember the exact tuples to look at on this page --- the caller must |
| 964 | * examine all tuples on the page and check if they meet the intended |
| 965 | * condition. If result->recheck is true, only the indicated tuples need |
| 966 | * be examined, but the condition must be rechecked anyway. (For ease of |
| 967 | * testing, recheck is always set true when ntuples < 0.) |
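 *
 * For example (illustrative only): with exact entries for blocks 3 and 900
 * plus one lossy chunk at block 512 having bits 0 and 5 set, successive
 * calls report block 3 (exact), 512 and 517 (lossy), then 900 (exact), and
 * finally NULL.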
| 968 | */ |
| 969 | TBMIterateResult * |
| 970 | tbm_iterate(TBMIterator *iterator) |
| 971 | { |
| 972 | TIDBitmap *tbm = iterator->tbm; |
| 973 | TBMIterateResult *output = &(iterator->output); |
| 974 | |
| 975 | Assert(tbm->iterating == TBM_ITERATING_PRIVATE); |
| 976 | |
| 977 | /* |
| 978 | * If lossy chunk pages remain, make sure we've advanced schunkptr/ |
| 979 | * schunkbit to the next set bit. |
| 980 | */ |
| 981 | while (iterator->schunkptr < tbm->nchunks) |
| 982 | { |
| 983 | PagetableEntry *chunk = tbm->schunks[iterator->schunkptr]; |
| 984 | int schunkbit = iterator->schunkbit; |
| 985 | |
| 986 | tbm_advance_schunkbit(chunk, &schunkbit); |
| 987 | if (schunkbit < PAGES_PER_CHUNK) |
| 988 | { |
| 989 | iterator->schunkbit = schunkbit; |
| 990 | break; |
| 991 | } |
| 992 | /* advance to next chunk */ |
| 993 | iterator->schunkptr++; |
| 994 | iterator->schunkbit = 0; |
| 995 | } |
| 996 | |
| 997 | /* |
| 998 | * If both chunk and per-page data remain, must output the numerically |
| 999 | * earlier page. |
| 1000 | */ |
| 1001 | if (iterator->schunkptr < tbm->nchunks) |
| 1002 | { |
| 1003 | PagetableEntry *chunk = tbm->schunks[iterator->schunkptr]; |
| 1004 | BlockNumber chunk_blockno; |
| 1005 | |
| 1006 | chunk_blockno = chunk->blockno + iterator->schunkbit; |
| 1007 | if (iterator->spageptr >= tbm->npages || |
| 1008 | chunk_blockno < tbm->spages[iterator->spageptr]->blockno) |
| 1009 | { |
| 1010 | /* Return a lossy page indicator from the chunk */ |
| 1011 | output->blockno = chunk_blockno; |
| 1012 | output->ntuples = -1; |
| 1013 | output->recheck = true; |
| 1014 | iterator->schunkbit++; |
| 1015 | return output; |
| 1016 | } |
| 1017 | } |
| 1018 | |
| 1019 | if (iterator->spageptr < tbm->npages) |
| 1020 | { |
| 1021 | PagetableEntry *page; |
| 1022 | int ntuples; |
| 1023 | |
| 1024 | /* In ONE_PAGE state, we don't allocate an spages[] array */ |
| 1025 | if (tbm->status == TBM_ONE_PAGE) |
| 1026 | page = &tbm->entry1; |
| 1027 | else |
| 1028 | page = tbm->spages[iterator->spageptr]; |
| 1029 | |
| 1030 | /* scan bitmap to extract individual offset numbers */ |
| 1031 | ntuples = tbm_extract_page_tuple(page, output); |
| 1032 | output->blockno = page->blockno; |
| 1033 | output->ntuples = ntuples; |
| 1034 | output->recheck = page->recheck; |
| 1035 | iterator->spageptr++; |
| 1036 | return output; |
| 1037 | } |
| 1038 | |
| 1039 | /* Nothing more in the bitmap */ |
| 1040 | return NULL; |
| 1041 | } |
| 1042 | |
| 1043 | /* |
| 1044 | * tbm_shared_iterate - scan through next page of a TIDBitmap |
| 1045 | * |
| 1046 | * As above, but this will iterate using an iterator which is shared |
 * across multiple processes.  We need to acquire the iterator LWLock
 * before accessing the shared members.
| 1049 | */ |
| 1050 | TBMIterateResult * |
| 1051 | tbm_shared_iterate(TBMSharedIterator *iterator) |
| 1052 | { |
| 1053 | TBMIterateResult *output = &iterator->output; |
| 1054 | TBMSharedIteratorState *istate = iterator->state; |
| 1055 | PagetableEntry *ptbase = NULL; |
| 1056 | int *idxpages = NULL; |
| 1057 | int *idxchunks = NULL; |
| 1058 | |
| 1059 | if (iterator->ptbase != NULL) |
| 1060 | ptbase = iterator->ptbase->ptentry; |
| 1061 | if (iterator->ptpages != NULL) |
| 1062 | idxpages = iterator->ptpages->index; |
| 1063 | if (iterator->ptchunks != NULL) |
| 1064 | idxchunks = iterator->ptchunks->index; |
| 1065 | |
| 1066 | /* Acquire the LWLock before accessing the shared members */ |
| 1067 | LWLockAcquire(&istate->lock, LW_EXCLUSIVE); |
| 1068 | |
| 1069 | /* |
| 1070 | * If lossy chunk pages remain, make sure we've advanced schunkptr/ |
| 1071 | * schunkbit to the next set bit. |
| 1072 | */ |
| 1073 | while (istate->schunkptr < istate->nchunks) |
| 1074 | { |
| 1075 | PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]]; |
| 1076 | int schunkbit = istate->schunkbit; |
| 1077 | |
| 1078 | tbm_advance_schunkbit(chunk, &schunkbit); |
| 1079 | if (schunkbit < PAGES_PER_CHUNK) |
| 1080 | { |
| 1081 | istate->schunkbit = schunkbit; |
| 1082 | break; |
| 1083 | } |
| 1084 | /* advance to next chunk */ |
| 1085 | istate->schunkptr++; |
| 1086 | istate->schunkbit = 0; |
| 1087 | } |
| 1088 | |
| 1089 | /* |
| 1090 | * If both chunk and per-page data remain, must output the numerically |
| 1091 | * earlier page. |
| 1092 | */ |
| 1093 | if (istate->schunkptr < istate->nchunks) |
| 1094 | { |
| 1095 | PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]]; |
| 1096 | BlockNumber chunk_blockno; |
| 1097 | |
| 1098 | chunk_blockno = chunk->blockno + istate->schunkbit; |
| 1099 | |
| 1100 | if (istate->spageptr >= istate->npages || |
| 1101 | chunk_blockno < ptbase[idxpages[istate->spageptr]].blockno) |
| 1102 | { |
| 1103 | /* Return a lossy page indicator from the chunk */ |
| 1104 | output->blockno = chunk_blockno; |
| 1105 | output->ntuples = -1; |
| 1106 | output->recheck = true; |
| 1107 | istate->schunkbit++; |
| 1108 | |
| 1109 | LWLockRelease(&istate->lock); |
| 1110 | return output; |
| 1111 | } |
| 1112 | } |
| 1113 | |
| 1114 | if (istate->spageptr < istate->npages) |
| 1115 | { |
| 1116 | PagetableEntry *page = &ptbase[idxpages[istate->spageptr]]; |
| 1117 | int ntuples; |
| 1118 | |
| 1119 | /* scan bitmap to extract individual offset numbers */ |
| 1120 | ntuples = tbm_extract_page_tuple(page, output); |
| 1121 | output->blockno = page->blockno; |
| 1122 | output->ntuples = ntuples; |
| 1123 | output->recheck = page->recheck; |
| 1124 | istate->spageptr++; |
| 1125 | |
| 1126 | LWLockRelease(&istate->lock); |
| 1127 | |
| 1128 | return output; |
| 1129 | } |
| 1130 | |
| 1131 | LWLockRelease(&istate->lock); |
| 1132 | |
| 1133 | /* Nothing more in the bitmap */ |
| 1134 | return NULL; |
| 1135 | } |
| 1136 | |
| 1137 | /* |
| 1138 | * tbm_end_iterate - finish an iteration over a TIDBitmap |
| 1139 | * |
| 1140 | * Currently this is just a pfree, but it might do more someday. (For |
| 1141 | * instance, it could be useful to count open iterators and allow the |
| 1142 | * bitmap to return to read/write status when there are no more iterators.) |
| 1143 | */ |
| 1144 | void |
| 1145 | tbm_end_iterate(TBMIterator *iterator) |
| 1146 | { |
| 1147 | pfree(iterator); |
| 1148 | } |
| 1149 | |
| 1150 | /* |
| 1151 | * tbm_end_shared_iterate - finish a shared iteration over a TIDBitmap |
| 1152 | * |
| 1153 | * This doesn't free any of the shared state associated with the iterator, |
| 1154 | * just our backend-private state. |
| 1155 | */ |
| 1156 | void |
| 1157 | tbm_end_shared_iterate(TBMSharedIterator *iterator) |
| 1158 | { |
| 1159 | pfree(iterator); |
| 1160 | } |
| 1161 | |
| 1162 | /* |
| 1163 | * tbm_find_pageentry - find a PagetableEntry for the pageno |
| 1164 | * |
| 1165 | * Returns NULL if there is no non-lossy entry for the pageno. |
| 1166 | */ |
| 1167 | static const PagetableEntry * |
| 1168 | tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno) |
| 1169 | { |
| 1170 | const PagetableEntry *page; |
| 1171 | |
| 1172 | if (tbm->nentries == 0) /* in case pagetable doesn't exist */ |
| 1173 | return NULL; |
| 1174 | |
| 1175 | if (tbm->status == TBM_ONE_PAGE) |
| 1176 | { |
| 1177 | page = &tbm->entry1; |
| 1178 | if (page->blockno != pageno) |
| 1179 | return NULL; |
| 1180 | Assert(!page->ischunk); |
| 1181 | return page; |
| 1182 | } |
| 1183 | |
| 1184 | page = pagetable_lookup(tbm->pagetable, pageno); |
| 1185 | if (page == NULL) |
| 1186 | return NULL; |
| 1187 | if (page->ischunk) |
| 1188 | return NULL; /* don't want a lossy chunk header */ |
| 1189 | return page; |
| 1190 | } |
| 1191 | |
| 1192 | /* |
| 1193 | * tbm_get_pageentry - find or create a PagetableEntry for the pageno |
| 1194 | * |
| 1195 | * If new, the entry is marked as an exact (non-chunk) entry. |
| 1196 | * |
| 1197 | * This may cause the table to exceed the desired memory size. It is |
| 1198 | * up to the caller to call tbm_lossify() at the next safe point if so. |
| 1199 | */ |
| 1200 | static PagetableEntry * |
| 1201 | tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno) |
| 1202 | { |
| 1203 | PagetableEntry *page; |
| 1204 | bool found; |
| 1205 | |
| 1206 | if (tbm->status == TBM_EMPTY) |
| 1207 | { |
| 1208 | /* Use the fixed slot */ |
| 1209 | page = &tbm->entry1; |
| 1210 | found = false; |
| 1211 | tbm->status = TBM_ONE_PAGE; |
| 1212 | } |
| 1213 | else |
| 1214 | { |
| 1215 | if (tbm->status == TBM_ONE_PAGE) |
| 1216 | { |
| 1217 | page = &tbm->entry1; |
| 1218 | if (page->blockno == pageno) |
| 1219 | return page; |
| 1220 | /* Time to switch from one page to a hashtable */ |
| 1221 | tbm_create_pagetable(tbm); |
| 1222 | } |
| 1223 | |
| 1224 | /* Look up or create an entry */ |
| 1225 | page = pagetable_insert(tbm->pagetable, pageno, &found); |
| 1226 | } |
| 1227 | |
| 1228 | /* Initialize it if not present before */ |
| 1229 | if (!found) |
| 1230 | { |
| 1231 | char oldstatus = page->status; |
| 1232 | |
| 1233 | MemSet(page, 0, sizeof(PagetableEntry)); |
| 1234 | page->status = oldstatus; |
| 1235 | page->blockno = pageno; |
| 1236 | /* must count it too */ |
| 1237 | tbm->nentries++; |
| 1238 | tbm->npages++; |
| 1239 | } |
| 1240 | |
| 1241 | return page; |
| 1242 | } |
| 1243 | |
| 1244 | /* |
| 1245 | * tbm_page_is_lossy - is the page marked as lossily stored? |
| 1246 | */ |
| 1247 | static bool |
| 1248 | tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno) |
| 1249 | { |
| 1250 | PagetableEntry *page; |
| 1251 | BlockNumber chunk_pageno; |
| 1252 | int bitno; |
| 1253 | |
| 1254 | /* we can skip the lookup if there are no lossy chunks */ |
| 1255 | if (tbm->nchunks == 0) |
| 1256 | return false; |
| 1257 | Assert(tbm->status == TBM_HASH); |
| 1258 | |
| 1259 | bitno = pageno % PAGES_PER_CHUNK; |
| 1260 | chunk_pageno = pageno - bitno; |
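	/*
	 * For example (illustrative numbers only, with PAGES_PER_CHUNK = 256):
	 * pageno 1000 gives bitno = 1000 % 256 = 232 and chunk_pageno = 768, so
	 * this page is lossy iff the entry for block 768 is a chunk with bit
	 * 232 set.
	 */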
| 1261 | |
| 1262 | page = pagetable_lookup(tbm->pagetable, chunk_pageno); |
| 1263 | |
| 1264 | if (page != NULL && page->ischunk) |
| 1265 | { |
| 1266 | int wordnum = WORDNUM(bitno); |
| 1267 | int bitnum = BITNUM(bitno); |
| 1268 | |
| 1269 | if ((page->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0) |
| 1270 | return true; |
| 1271 | } |
| 1272 | return false; |
| 1273 | } |
| 1274 | |
| 1275 | /* |
| 1276 | * tbm_mark_page_lossy - mark the page number as lossily stored |
| 1277 | * |
| 1278 | * This may cause the table to exceed the desired memory size. It is |
| 1279 | * up to the caller to call tbm_lossify() at the next safe point if so. |
| 1280 | */ |
| 1281 | static void |
| 1282 | tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno) |
| 1283 | { |
| 1284 | PagetableEntry *page; |
| 1285 | bool found; |
| 1286 | BlockNumber chunk_pageno; |
| 1287 | int bitno; |
| 1288 | int wordnum; |
| 1289 | int bitnum; |
| 1290 | |
| 1291 | /* We force the bitmap into hashtable mode whenever it's lossy */ |
| 1292 | if (tbm->status != TBM_HASH) |
| 1293 | tbm_create_pagetable(tbm); |
| 1294 | |
| 1295 | bitno = pageno % PAGES_PER_CHUNK; |
| 1296 | chunk_pageno = pageno - bitno; |
| 1297 | |
| 1298 | /* |
| 1299 | * Remove any extant non-lossy entry for the page. If the page is its own |
| 1300 | * chunk header, however, we skip this and handle the case below. |
| 1301 | */ |
| 1302 | if (bitno != 0) |
| 1303 | { |
| 1304 | if (pagetable_delete(tbm->pagetable, pageno)) |
| 1305 | { |
| 1306 | /* It was present, so adjust counts */ |
| 1307 | tbm->nentries--; |
| 1308 | tbm->npages--; /* assume it must have been non-lossy */ |
| 1309 | } |
| 1310 | } |
| 1311 | |
| 1312 | /* Look up or create entry for chunk-header page */ |
| 1313 | page = pagetable_insert(tbm->pagetable, chunk_pageno, &found); |
| 1314 | |
| 1315 | /* Initialize it if not present before */ |
| 1316 | if (!found) |
| 1317 | { |
| 1318 | char oldstatus = page->status; |
| 1319 | |
| 1320 | MemSet(page, 0, sizeof(PagetableEntry)); |
| 1321 | page->status = oldstatus; |
| 1322 | page->blockno = chunk_pageno; |
| 1323 | page->ischunk = true; |
| 1324 | /* must count it too */ |
| 1325 | tbm->nentries++; |
| 1326 | tbm->nchunks++; |
| 1327 | } |
| 1328 | else if (!page->ischunk) |
| 1329 | { |
| 1330 | char oldstatus = page->status; |
| 1331 | |
| 1332 | /* chunk header page was formerly non-lossy, make it lossy */ |
| 1333 | MemSet(page, 0, sizeof(PagetableEntry)); |
| 1334 | page->status = oldstatus; |
| 1335 | page->blockno = chunk_pageno; |
| 1336 | page->ischunk = true; |
| 1337 | /* we assume it had some tuple bit(s) set, so mark it lossy */ |
| 1338 | page->words[0] = ((bitmapword) 1 << 0); |
| 1339 | /* adjust counts */ |
| 1340 | tbm->nchunks++; |
| 1341 | tbm->npages--; |
| 1342 | } |
| 1343 | |
| 1344 | /* Now set the original target page's bit */ |
| 1345 | wordnum = WORDNUM(bitno); |
| 1346 | bitnum = BITNUM(bitno); |
| 1347 | page->words[wordnum] |= ((bitmapword) 1 << bitnum); |
| 1348 | } |
| 1349 | |
| 1350 | /* |
| 1351 | * tbm_lossify - lose some information to get back under the memory limit |
| 1352 | */ |
| 1353 | static void |
| 1354 | tbm_lossify(TIDBitmap *tbm) |
| 1355 | { |
| 1356 | pagetable_iterator i; |
| 1357 | PagetableEntry *page; |
| 1358 | |
| 1359 | /* |
| 1360 | * XXX Really stupid implementation: this just lossifies pages in |
| 1361 | * essentially random order. We should be paying some attention to the |
| 1362 | * number of bits set in each page, instead. |
| 1363 | * |
| 1364 | * Since we are called as soon as nentries exceeds maxentries, we should |
| 1365 | * push nentries down to significantly less than maxentries, or else we'll |
| 1366 | * just end up doing this again very soon. We shoot for maxentries/2. |
| 1367 | */ |
| 1368 | Assert(tbm->iterating == TBM_NOT_ITERATING); |
| 1369 | Assert(tbm->status == TBM_HASH); |
| 1370 | |
| 1371 | pagetable_start_iterate_at(tbm->pagetable, &i, tbm->lossify_start); |
| 1372 | while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL) |
| 1373 | { |
| 1374 | if (page->ischunk) |
| 1375 | continue; /* already a chunk header */ |
| 1376 | |
| 1377 | /* |
| 1378 | * If the page would become a chunk header, we won't save anything by |
| 1379 | * converting it to lossy, so skip it. |
| 1380 | */ |
| 1381 | if ((page->blockno % PAGES_PER_CHUNK) == 0) |
| 1382 | continue; |
| 1383 | |
| 1384 | /* This does the dirty work ... */ |
| 1385 | tbm_mark_page_lossy(tbm, page->blockno); |
| 1386 | |
| 1387 | if (tbm->nentries <= tbm->maxentries / 2) |
| 1388 | { |
| 1389 | /* |
| 1390 | * We have made enough room. Remember where to start lossifying |
| 1391 | * next round, so we evenly iterate over the hashtable. |
| 1392 | */ |
| 1393 | tbm->lossify_start = i.cur; |
| 1394 | break; |
| 1395 | } |
| 1396 | |
| 1397 | /* |
| 1398 | * Note: tbm_mark_page_lossy may have inserted a lossy chunk into the |
| 1399 | * hashtable and may have deleted the non-lossy chunk. We can |
		 * continue the same hash table scan, since failure to visit one
		 * element, or visiting the newly inserted element, isn't fatal.
| 1402 | */ |
| 1403 | } |
| 1404 | |
| 1405 | /* |
| 1406 | * With a big bitmap and small work_mem, it's possible that we cannot get |
| 1407 | * under maxentries. Again, if that happens, we'd end up uselessly |
| 1408 | * calling tbm_lossify over and over. To prevent this from becoming a |
| 1409 | * performance sink, force maxentries up to at least double the current |
| 1410 | * number of entries. (In essence, we're admitting inability to fit |
| 1411 | * within work_mem when we do this.) Note that this test will not fire if |
| 1412 | * we broke out of the loop early; and if we didn't, the current number of |
| 1413 | * entries is simply not reducible any further. |
| 1414 | */ |
| 1415 | if (tbm->nentries > tbm->maxentries / 2) |
| 1416 | tbm->maxentries = Min(tbm->nentries, (INT_MAX - 1) / 2) * 2; |
| 1417 | } |
| 1418 | |
| 1419 | /* |
| 1420 | * qsort comparator to handle PagetableEntry pointers. |
| 1421 | */ |
| 1422 | static int |
| 1423 | tbm_comparator(const void *left, const void *right) |
| 1424 | { |
| 1425 | BlockNumber l = (*((PagetableEntry *const *) left))->blockno; |
| 1426 | BlockNumber r = (*((PagetableEntry *const *) right))->blockno; |
| 1427 | |
| 1428 | if (l < r) |
| 1429 | return -1; |
| 1430 | else if (l > r) |
| 1431 | return 1; |
| 1432 | return 0; |
| 1433 | } |
| 1434 | |
| 1435 | /* |
 * As above, but this receives indexes into the PagetableEntry array, so it
 * must fetch the actual PagetableEntry using each index before comparing
 * the blocknos.
| 1439 | */ |
| 1440 | static int |
| 1441 | tbm_shared_comparator(const void *left, const void *right, void *arg) |
| 1442 | { |
| 1443 | PagetableEntry *base = (PagetableEntry *) arg; |
| 1444 | PagetableEntry *lpage = &base[*(int *) left]; |
| 1445 | PagetableEntry *rpage = &base[*(int *) right]; |
| 1446 | |
| 1447 | if (lpage->blockno < rpage->blockno) |
| 1448 | return -1; |
| 1449 | else if (lpage->blockno > rpage->blockno) |
| 1450 | return 1; |
| 1451 | return 0; |
| 1452 | } |
| 1453 | |
| 1454 | /* |
| 1455 | * tbm_attach_shared_iterate |
| 1456 | * |
| 1457 | * Allocate a backend-private iterator and attach the shared iterator state |
 * to it so that multiple processes can iterate jointly.
 *
 * We also convert the DSA pointers to local pointers and store them in
 * our private iterator.
| 1462 | */ |
| 1463 | TBMSharedIterator * |
| 1464 | tbm_attach_shared_iterate(dsa_area *dsa, dsa_pointer dp) |
| 1465 | { |
| 1466 | TBMSharedIterator *iterator; |
| 1467 | TBMSharedIteratorState *istate; |
| 1468 | |
| 1469 | /* |
| 1470 | * Create the TBMSharedIterator struct, with enough trailing space to |
| 1471 | * serve the needs of the TBMIterateResult sub-struct. |
| 1472 | */ |
| 1473 | iterator = (TBMSharedIterator *) palloc0(sizeof(TBMSharedIterator) + |
| 1474 | MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber)); |
| 1475 | |
| 1476 | istate = (TBMSharedIteratorState *) dsa_get_address(dsa, dp); |
| 1477 | |
| 1478 | iterator->state = istate; |
| 1479 | |
| 1480 | iterator->ptbase = dsa_get_address(dsa, istate->pagetable); |
| 1481 | |
| 1482 | if (istate->npages) |
| 1483 | iterator->ptpages = dsa_get_address(dsa, istate->spages); |
| 1484 | if (istate->nchunks) |
| 1485 | iterator->ptchunks = dsa_get_address(dsa, istate->schunks); |
| 1486 | |
| 1487 | return iterator; |
| 1488 | } |
| 1489 | |
| 1490 | /* |
| 1491 | * pagetable_allocate |
| 1492 | * |
 * Callback function for allocating memory for hashtable elements, using
 * the DSA area if available.
| 1495 | */ |
| 1496 | static inline void * |
| 1497 | pagetable_allocate(pagetable_hash *pagetable, Size size) |
| 1498 | { |
| 1499 | TIDBitmap *tbm = (TIDBitmap *) pagetable->private_data; |
| 1500 | PTEntryArray *ptbase; |
| 1501 | |
| 1502 | if (tbm->dsa == NULL) |
| 1503 | return MemoryContextAllocExtended(pagetable->ctx, size, |
| 1504 | MCXT_ALLOC_HUGE | MCXT_ALLOC_ZERO); |
| 1505 | |
| 1506 | /* |
| 1507 | * Save the dsapagetable reference in dsapagetableold before allocating |
| 1508 | * new memory so that pagetable_free can free the old entry. |
| 1509 | */ |
| 1510 | tbm->dsapagetableold = tbm->dsapagetable; |
| 1511 | tbm->dsapagetable = dsa_allocate_extended(tbm->dsa, |
| 1512 | sizeof(PTEntryArray) + size, |
| 1513 | DSA_ALLOC_HUGE | DSA_ALLOC_ZERO); |
| 1514 | ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable); |
| 1515 | |
| 1516 | return ptbase->ptentry; |
| 1517 | } |
| 1518 | |
| 1519 | /* |
| 1520 | * pagetable_free |
| 1521 | * |
| 1522 | * Callback function for freeing hash table elements. |
| 1523 | */ |
| 1524 | static inline void |
| 1525 | pagetable_free(pagetable_hash *pagetable, void *pointer) |
| 1526 | { |
| 1527 | TIDBitmap *tbm = (TIDBitmap *) pagetable->private_data; |
| 1528 | |
| 1529 | /* pfree the input pointer if DSA is not available */ |
| 1530 | if (tbm->dsa == NULL) |
| 1531 | pfree(pointer); |
| 1532 | else if (DsaPointerIsValid(tbm->dsapagetableold)) |
| 1533 | { |
| 1534 | dsa_free(tbm->dsa, tbm->dsapagetableold); |
| 1535 | tbm->dsapagetableold = InvalidDsaPointer; |
| 1536 | } |
| 1537 | } |
| 1538 | |
| 1539 | /* |
| 1540 | * tbm_calculate_entries |
| 1541 | * |
| 1542 | * Estimate number of hashtable entries we can have within maxbytes. |
| 1543 | */ |
| 1544 | long |
| 1545 | tbm_calculate_entries(double maxbytes) |
| 1546 | { |
| 1547 | long nbuckets; |
| 1548 | |
| 1549 | /* |
| 1550 | * Estimate number of hashtable entries we can have within maxbytes. This |
| 1551 | * estimates the hash cost as sizeof(PagetableEntry), which is good enough |
| 1552 | * for our purpose. Also count an extra Pointer per entry for the arrays |
| 1553 | * created during iteration readout. |
| 1554 | */ |
| 1555 | nbuckets = maxbytes / |
| 1556 | (sizeof(PagetableEntry) + sizeof(Pointer) + sizeof(Pointer)); |
| 1557 | nbuckets = Min(nbuckets, INT_MAX - 1); /* safety limit */ |
| 1558 | nbuckets = Max(nbuckets, 16); /* sanity limit */ |
| 1559 | |
| 1560 | return nbuckets; |
| 1561 | } |
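
/*
 * An illustrative figure only (the entry size varies with BLCKSZ,
 * MaxHeapTuplesPerPage, and platform alignment): with 64-bit bitmapwords
 * and the 8K default, sizeof(PagetableEntry) is about 48 bytes, so
 * maxbytes = 4MB permits roughly 4194304 / (48 + 8 + 8) = 65536 entries.
 */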
| 1562 | |