1/*-------------------------------------------------------------------------
2 *
3 * tidbitmap.c
4 * PostgreSQL tuple-id (TID) bitmap package
5 *
6 * This module provides bitmap data structures that are spiritually
7 * similar to Bitmapsets, but are specially adapted to store sets of
8 * tuple identifiers (TIDs), or ItemPointers. In particular, the division
9 * of an ItemPointer into BlockNumber and OffsetNumber is catered for.
10 * Also, since we wish to be able to store very large tuple sets in
11 * memory with this data structure, we support "lossy" storage, in which
12 * we no longer remember individual tuple offsets on a page but only the
13 * fact that a particular page needs to be visited.
14 *
15 * The "lossy" storage uses one bit per disk page, so at the standard 8K
16 * BLCKSZ, we can represent all pages in 64Gb of disk space in about 1Mb
17 * of memory. People pushing around tables of that size should have a
18 * couple of Mb to spare, so we don't worry about providing a second level
19 * of lossiness. In theory we could fall back to page ranges at some
20 * point, but for now that seems useless complexity.
21 *
22 * We also support the notion of candidate matches, or rechecking. This
23 * means we know that a search need visit only some tuples on a page,
24 * but we are not certain that all of those tuples are real matches.
25 * So the eventual heap scan must recheck the quals for these tuples only,
26 * rather than rechecking the quals for all tuples on the page as in the
27 * lossy-bitmap case. Rechecking can be specified when TIDs are inserted
28 * into a bitmap, and it can also happen internally when we AND a lossy
29 * and a non-lossy page.
30 *
31 *
32 * Copyright (c) 2003-2019, PostgreSQL Global Development Group
33 *
34 * IDENTIFICATION
35 * src/backend/nodes/tidbitmap.c
36 *
37 *-------------------------------------------------------------------------
38 */
39#include "postgres.h"
40
41#include <limits.h>
42
43#include "access/htup_details.h"
44#include "nodes/bitmapset.h"
45#include "nodes/tidbitmap.h"
46#include "storage/lwlock.h"
47#include "utils/dsa.h"
48#include "utils/hashutils.h"
49
50/*
51 * The maximum number of tuples per page is not large (typically 256 with
52 * 8K pages, or 1024 with 32K pages). So there's not much point in making
53 * the per-page bitmaps variable size. We just legislate that the size
54 * is this:
55 */
56#define MAX_TUPLES_PER_PAGE MaxHeapTuplesPerPage
57
58/*
59 * When we have to switch over to lossy storage, we use a data structure
60 * with one bit per page, where all pages having the same number DIV
61 * PAGES_PER_CHUNK are aggregated into one chunk. When a chunk is present
62 * and has the bit set for a given page, there must not be a per-page entry
63 * for that page in the page table.
64 *
65 * We actually store both exact pages and lossy chunks in the same hash
66 * table, using identical data structures. (This is because the memory
67 * management for hashtables doesn't easily/efficiently allow space to be
68 * transferred easily from one hashtable to another.) Therefore it's best
69 * if PAGES_PER_CHUNK is the same as MAX_TUPLES_PER_PAGE, or at least not
70 * too different. But we also want PAGES_PER_CHUNK to be a power of 2 to
71 * avoid expensive integer remainder operations. So, define it like this:
72 */
73#define PAGES_PER_CHUNK (BLCKSZ / 32)
74
75/* We use BITS_PER_BITMAPWORD and typedef bitmapword from nodes/bitmapset.h */
76
77#define WORDNUM(x) ((x) / BITS_PER_BITMAPWORD)
78#define BITNUM(x) ((x) % BITS_PER_BITMAPWORD)
79
80/* number of active words for an exact page: */
81#define WORDS_PER_PAGE ((MAX_TUPLES_PER_PAGE - 1) / BITS_PER_BITMAPWORD + 1)
82/* number of active words for a lossy chunk: */
83#define WORDS_PER_CHUNK ((PAGES_PER_CHUNK - 1) / BITS_PER_BITMAPWORD + 1)
84
85/*
86 * The hashtable entries are represented by this data structure. For
87 * an exact page, blockno is the page number and bit k of the bitmap
88 * represents tuple offset k+1. For a lossy chunk, blockno is the first
89 * page in the chunk (this must be a multiple of PAGES_PER_CHUNK) and
90 * bit k represents page blockno+k. Note that it is not possible to
91 * have exact storage for the first page of a chunk if we are using
92 * lossy storage for any page in the chunk's range, since the same
93 * hashtable entry has to serve both purposes.
94 *
95 * recheck is used only on exact pages --- it indicates that although
96 * only the stated tuples need be checked, the full index qual condition
97 * must be checked for each (ie, these are candidate matches).
98 */
99typedef struct PagetableEntry
100{
101 BlockNumber blockno; /* page number (hashtable key) */
102 char status; /* hash entry status */
103 bool ischunk; /* T = lossy storage, F = exact */
104 bool recheck; /* should the tuples be rechecked? */
105 bitmapword words[Max(WORDS_PER_PAGE, WORDS_PER_CHUNK)];
106} PagetableEntry;
107
108/*
109 * Holds array of pagetable entries.
110 */
111typedef struct PTEntryArray
112{
113 pg_atomic_uint32 refcount; /* no. of iterator attached */
114 PagetableEntry ptentry[FLEXIBLE_ARRAY_MEMBER];
115} PTEntryArray;
116
117/*
118 * We want to avoid the overhead of creating the hashtable, which is
119 * comparatively large, when not necessary. Particularly when we are using a
120 * bitmap scan on the inside of a nestloop join: a bitmap may well live only
121 * long enough to accumulate one entry in such cases. We therefore avoid
122 * creating an actual hashtable until we need two pagetable entries. When
123 * just one pagetable entry is needed, we store it in a fixed field of
124 * TIDBitMap. (NOTE: we don't get rid of the hashtable if the bitmap later
125 * shrinks down to zero or one page again. So, status can be TBM_HASH even
126 * when nentries is zero or one.)
127 */
128typedef enum
129{
130 TBM_EMPTY, /* no hashtable, nentries == 0 */
131 TBM_ONE_PAGE, /* entry1 contains the single entry */
132 TBM_HASH /* pagetable is valid, entry1 is not */
133} TBMStatus;
134
135/*
136 * Current iterating state of the TBM.
137 */
138typedef enum
139{
140 TBM_NOT_ITERATING, /* not yet converted to page and chunk array */
141 TBM_ITERATING_PRIVATE, /* converted to local page and chunk array */
142 TBM_ITERATING_SHARED /* converted to shared page and chunk array */
143} TBMIteratingState;
144
145/*
146 * Here is the representation for a whole TIDBitMap:
147 */
148struct TIDBitmap
149{
150 NodeTag type; /* to make it a valid Node */
151 MemoryContext mcxt; /* memory context containing me */
152 TBMStatus status; /* see codes above */
153 struct pagetable_hash *pagetable; /* hash table of PagetableEntry's */
154 int nentries; /* number of entries in pagetable */
155 int maxentries; /* limit on same to meet maxbytes */
156 int npages; /* number of exact entries in pagetable */
157 int nchunks; /* number of lossy entries in pagetable */
158 TBMIteratingState iterating; /* tbm_begin_iterate called? */
159 uint32 lossify_start; /* offset to start lossifying hashtable at */
160 PagetableEntry entry1; /* used when status == TBM_ONE_PAGE */
161 /* these are valid when iterating is true: */
162 PagetableEntry **spages; /* sorted exact-page list, or NULL */
163 PagetableEntry **schunks; /* sorted lossy-chunk list, or NULL */
164 dsa_pointer dsapagetable; /* dsa_pointer to the element array */
165 dsa_pointer dsapagetableold; /* dsa_pointer to the old element array */
166 dsa_pointer ptpages; /* dsa_pointer to the page array */
167 dsa_pointer ptchunks; /* dsa_pointer to the chunk array */
168 dsa_area *dsa; /* reference to per-query dsa area */
169};
170
171/*
172 * When iterating over a bitmap in sorted order, a TBMIterator is used to
173 * track our progress. There can be several iterators scanning the same
174 * bitmap concurrently. Note that the bitmap becomes read-only as soon as
175 * any iterator is created.
176 */
177struct TBMIterator
178{
179 TIDBitmap *tbm; /* TIDBitmap we're iterating over */
180 int spageptr; /* next spages index */
181 int schunkptr; /* next schunks index */
182 int schunkbit; /* next bit to check in current schunk */
183 TBMIterateResult output; /* MUST BE LAST (because variable-size) */
184};
185
186/*
187 * Holds the shared members of the iterator so that multiple processes
188 * can jointly iterate.
189 */
190typedef struct TBMSharedIteratorState
191{
192 int nentries; /* number of entries in pagetable */
193 int maxentries; /* limit on same to meet maxbytes */
194 int npages; /* number of exact entries in pagetable */
195 int nchunks; /* number of lossy entries in pagetable */
196 dsa_pointer pagetable; /* dsa pointers to head of pagetable data */
197 dsa_pointer spages; /* dsa pointer to page array */
198 dsa_pointer schunks; /* dsa pointer to chunk array */
199 LWLock lock; /* lock to protect below members */
200 int spageptr; /* next spages index */
201 int schunkptr; /* next schunks index */
202 int schunkbit; /* next bit to check in current schunk */
203} TBMSharedIteratorState;
204
205/*
206 * pagetable iteration array.
207 */
208typedef struct PTIterationArray
209{
210 pg_atomic_uint32 refcount; /* no. of iterator attached */
211 int index[FLEXIBLE_ARRAY_MEMBER]; /* index array */
212} PTIterationArray;
213
214/*
215 * same as TBMIterator, but it is used for joint iteration, therefore this
216 * also holds a reference to the shared state.
217 */
218struct TBMSharedIterator
219{
220 TBMSharedIteratorState *state; /* shared state */
221 PTEntryArray *ptbase; /* pagetable element array */
222 PTIterationArray *ptpages; /* sorted exact page index list */
223 PTIterationArray *ptchunks; /* sorted lossy page index list */
224 TBMIterateResult output; /* MUST BE LAST (because variable-size) */
225};
226
/* Local function prototypes (definitions appear below) */
static void tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage);
static bool tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage,
							   const TIDBitmap *b);
static const PagetableEntry *tbm_find_pageentry(const TIDBitmap *tbm,
												BlockNumber pageno);
static PagetableEntry *tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno);
static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno);
static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno);
static void tbm_lossify(TIDBitmap *tbm);
static int	tbm_comparator(const void *left, const void *right);
static int	tbm_shared_comparator(const void *left, const void *right,
								  void *arg);
240
241/* define hashtable mapping block numbers to PagetableEntry's */
242#define SH_USE_NONDEFAULT_ALLOCATOR
243#define SH_PREFIX pagetable
244#define SH_ELEMENT_TYPE PagetableEntry
245#define SH_KEY_TYPE BlockNumber
246#define SH_KEY blockno
247#define SH_HASH_KEY(tb, key) murmurhash32(key)
248#define SH_EQUAL(tb, a, b) a == b
249#define SH_SCOPE static inline
250#define SH_DEFINE
251#define SH_DECLARE
252#include "lib/simplehash.h"
253
254
255/*
256 * tbm_create - create an initially-empty bitmap
257 *
258 * The bitmap will live in the memory context that is CurrentMemoryContext
259 * at the time of this call. It will be limited to (approximately) maxbytes
260 * total memory consumption. If the DSA passed to this function is not NULL
261 * then the memory for storing elements of the underlying page table will
262 * be allocated from the DSA.
263 */
264TIDBitmap *
265tbm_create(long maxbytes, dsa_area *dsa)
266{
267 TIDBitmap *tbm;
268
269 /* Create the TIDBitmap struct and zero all its fields */
270 tbm = makeNode(TIDBitmap);
271
272 tbm->mcxt = CurrentMemoryContext;
273 tbm->status = TBM_EMPTY;
274
275 tbm->maxentries = (int) tbm_calculate_entries(maxbytes);
276 tbm->lossify_start = 0;
277 tbm->dsa = dsa;
278 tbm->dsapagetable = InvalidDsaPointer;
279 tbm->dsapagetableold = InvalidDsaPointer;
280 tbm->ptpages = InvalidDsaPointer;
281 tbm->ptchunks = InvalidDsaPointer;
282
283 return tbm;
284}
285
286/*
287 * Actually create the hashtable. Since this is a moderately expensive
288 * proposition, we don't do it until we have to.
289 */
290static void
291tbm_create_pagetable(TIDBitmap *tbm)
292{
293 Assert(tbm->status != TBM_HASH);
294 Assert(tbm->pagetable == NULL);
295
296 tbm->pagetable = pagetable_create(tbm->mcxt, 128, tbm);
297
298 /* If entry1 is valid, push it into the hashtable */
299 if (tbm->status == TBM_ONE_PAGE)
300 {
301 PagetableEntry *page;
302 bool found;
303 char oldstatus;
304
305 page = pagetable_insert(tbm->pagetable,
306 tbm->entry1.blockno,
307 &found);
308 Assert(!found);
309 oldstatus = page->status;
310 memcpy(page, &tbm->entry1, sizeof(PagetableEntry));
311 page->status = oldstatus;
312 }
313
314 tbm->status = TBM_HASH;
315}
316
317/*
318 * tbm_free - free a TIDBitmap
319 */
320void
321tbm_free(TIDBitmap *tbm)
322{
323 if (tbm->pagetable)
324 pagetable_destroy(tbm->pagetable);
325 if (tbm->spages)
326 pfree(tbm->spages);
327 if (tbm->schunks)
328 pfree(tbm->schunks);
329 pfree(tbm);
330}
331
332/*
333 * tbm_free_shared_area - free shared state
334 *
335 * Free shared iterator state, Also free shared pagetable and iterator arrays
336 * memory if they are not referred by any of the shared iterator i.e recount
337 * is becomes 0.
338 */
339void
340tbm_free_shared_area(dsa_area *dsa, dsa_pointer dp)
341{
342 TBMSharedIteratorState *istate = dsa_get_address(dsa, dp);
343 PTEntryArray *ptbase;
344 PTIterationArray *ptpages;
345 PTIterationArray *ptchunks;
346
347 if (DsaPointerIsValid(istate->pagetable))
348 {
349 ptbase = dsa_get_address(dsa, istate->pagetable);
350 if (pg_atomic_sub_fetch_u32(&ptbase->refcount, 1) == 0)
351 dsa_free(dsa, istate->pagetable);
352 }
353 if (DsaPointerIsValid(istate->spages))
354 {
355 ptpages = dsa_get_address(dsa, istate->spages);
356 if (pg_atomic_sub_fetch_u32(&ptpages->refcount, 1) == 0)
357 dsa_free(dsa, istate->spages);
358 }
359 if (DsaPointerIsValid(istate->schunks))
360 {
361 ptchunks = dsa_get_address(dsa, istate->schunks);
362 if (pg_atomic_sub_fetch_u32(&ptchunks->refcount, 1) == 0)
363 dsa_free(dsa, istate->schunks);
364 }
365
366 dsa_free(dsa, dp);
367}
368
369/*
370 * tbm_add_tuples - add some tuple IDs to a TIDBitmap
371 *
372 * If recheck is true, then the recheck flag will be set in the
373 * TBMIterateResult when any of these tuples are reported out.
374 */
375void
376tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids,
377 bool recheck)
378{
379 BlockNumber currblk = InvalidBlockNumber;
380 PagetableEntry *page = NULL; /* only valid when currblk is valid */
381 int i;
382
383 Assert(tbm->iterating == TBM_NOT_ITERATING);
384 for (i = 0; i < ntids; i++)
385 {
386 BlockNumber blk = ItemPointerGetBlockNumber(tids + i);
387 OffsetNumber off = ItemPointerGetOffsetNumber(tids + i);
388 int wordnum,
389 bitnum;
390
391 /* safety check to ensure we don't overrun bit array bounds */
392 if (off < 1 || off > MAX_TUPLES_PER_PAGE)
393 elog(ERROR, "tuple offset out of range: %u", off);
394
395 /*
396 * Look up target page unless we already did. This saves cycles when
397 * the input includes consecutive tuples on the same page, which is
398 * common enough to justify an extra test here.
399 */
400 if (blk != currblk)
401 {
402 if (tbm_page_is_lossy(tbm, blk))
403 page = NULL; /* remember page is lossy */
404 else
405 page = tbm_get_pageentry(tbm, blk);
406 currblk = blk;
407 }
408
409 if (page == NULL)
410 continue; /* whole page is already marked */
411
412 if (page->ischunk)
413 {
414 /* The page is a lossy chunk header, set bit for itself */
415 wordnum = bitnum = 0;
416 }
417 else
418 {
419 /* Page is exact, so set bit for individual tuple */
420 wordnum = WORDNUM(off - 1);
421 bitnum = BITNUM(off - 1);
422 }
423 page->words[wordnum] |= ((bitmapword) 1 << bitnum);
424 page->recheck |= recheck;
425
426 if (tbm->nentries > tbm->maxentries)
427 {
428 tbm_lossify(tbm);
429 /* Page could have been converted to lossy, so force new lookup */
430 currblk = InvalidBlockNumber;
431 }
432 }
433}
434
435/*
436 * tbm_add_page - add a whole page to a TIDBitmap
437 *
438 * This causes the whole page to be reported (with the recheck flag)
439 * when the TIDBitmap is scanned.
440 */
441void
442tbm_add_page(TIDBitmap *tbm, BlockNumber pageno)
443{
444 /* Enter the page in the bitmap, or mark it lossy if already present */
445 tbm_mark_page_lossy(tbm, pageno);
446 /* If we went over the memory limit, lossify some more pages */
447 if (tbm->nentries > tbm->maxentries)
448 tbm_lossify(tbm);
449}
450
451/*
452 * tbm_union - set union
453 *
454 * a is modified in-place, b is not changed
455 */
456void
457tbm_union(TIDBitmap *a, const TIDBitmap *b)
458{
459 Assert(!a->iterating);
460 /* Nothing to do if b is empty */
461 if (b->nentries == 0)
462 return;
463 /* Scan through chunks and pages in b, merge into a */
464 if (b->status == TBM_ONE_PAGE)
465 tbm_union_page(a, &b->entry1);
466 else
467 {
468 pagetable_iterator i;
469 PagetableEntry *bpage;
470
471 Assert(b->status == TBM_HASH);
472 pagetable_start_iterate(b->pagetable, &i);
473 while ((bpage = pagetable_iterate(b->pagetable, &i)) != NULL)
474 tbm_union_page(a, bpage);
475 }
476}
477
/* Process one page of b during a union op */
static void
tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage)
{
	PagetableEntry *apage;
	int			wordnum;

	if (bpage->ischunk)
	{
		/* Scan b's chunk, mark each indicated page lossy in a */
		for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
		{
			bitmapword	w = bpage->words[wordnum];

			if (w != 0)
			{
				BlockNumber pg;

				/* first page number covered by this word */
				pg = bpage->blockno + (wordnum * BITS_PER_BITMAPWORD);
				/* shift bits out one at a time, low to high */
				while (w != 0)
				{
					if (w & 1)
						tbm_mark_page_lossy(a, pg);
					pg++;
					w >>= 1;
				}
			}
		}
	}
	else if (tbm_page_is_lossy(a, bpage->blockno))
	{
		/* page is already lossy in a, nothing to do */
		return;
	}
	else
	{
		apage = tbm_get_pageentry(a, bpage->blockno);
		if (apage->ischunk)
		{
			/* The page is a lossy chunk header, set bit for itself */
			apage->words[0] |= ((bitmapword) 1 << 0);
		}
		else
		{
			/* Both pages are exact, merge at the bit level */
			for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
				apage->words[wordnum] |= bpage->words[wordnum];
			apage->recheck |= bpage->recheck;
		}
	}

	/* the union may have created new entries; stay within memory budget */
	if (a->nentries > a->maxentries)
		tbm_lossify(a);
}
532
533/*
534 * tbm_intersect - set intersection
535 *
536 * a is modified in-place, b is not changed
537 */
538void
539tbm_intersect(TIDBitmap *a, const TIDBitmap *b)
540{
541 Assert(!a->iterating);
542 /* Nothing to do if a is empty */
543 if (a->nentries == 0)
544 return;
545 /* Scan through chunks and pages in a, try to match to b */
546 if (a->status == TBM_ONE_PAGE)
547 {
548 if (tbm_intersect_page(a, &a->entry1, b))
549 {
550 /* Page is now empty, remove it from a */
551 Assert(!a->entry1.ischunk);
552 a->npages--;
553 a->nentries--;
554 Assert(a->nentries == 0);
555 a->status = TBM_EMPTY;
556 }
557 }
558 else
559 {
560 pagetable_iterator i;
561 PagetableEntry *apage;
562
563 Assert(a->status == TBM_HASH);
564 pagetable_start_iterate(a->pagetable, &i);
565 while ((apage = pagetable_iterate(a->pagetable, &i)) != NULL)
566 {
567 if (tbm_intersect_page(a, apage, b))
568 {
569 /* Page or chunk is now empty, remove it from a */
570 if (apage->ischunk)
571 a->nchunks--;
572 else
573 a->npages--;
574 a->nentries--;
575 if (!pagetable_delete(a->pagetable, apage->blockno))
576 elog(ERROR, "hash table corrupted");
577 }
578 }
579 }
580}
581
582/*
583 * Process one page of a during an intersection op
584 *
585 * Returns true if apage is now empty and should be deleted from a
586 */
587static bool
588tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage, const TIDBitmap *b)
589{
590 const PagetableEntry *bpage;
591 int wordnum;
592
593 if (apage->ischunk)
594 {
595 /* Scan each bit in chunk, try to clear */
596 bool candelete = true;
597
598 for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
599 {
600 bitmapword w = apage->words[wordnum];
601
602 if (w != 0)
603 {
604 bitmapword neww = w;
605 BlockNumber pg;
606 int bitnum;
607
608 pg = apage->blockno + (wordnum * BITS_PER_BITMAPWORD);
609 bitnum = 0;
610 while (w != 0)
611 {
612 if (w & 1)
613 {
614 if (!tbm_page_is_lossy(b, pg) &&
615 tbm_find_pageentry(b, pg) == NULL)
616 {
617 /* Page is not in b at all, lose lossy bit */
618 neww &= ~((bitmapword) 1 << bitnum);
619 }
620 }
621 pg++;
622 bitnum++;
623 w >>= 1;
624 }
625 apage->words[wordnum] = neww;
626 if (neww != 0)
627 candelete = false;
628 }
629 }
630 return candelete;
631 }
632 else if (tbm_page_is_lossy(b, apage->blockno))
633 {
634 /*
635 * Some of the tuples in 'a' might not satisfy the quals for 'b', but
636 * because the page 'b' is lossy, we don't know which ones. Therefore
637 * we mark 'a' as requiring rechecks, to indicate that at most those
638 * tuples set in 'a' are matches.
639 */
640 apage->recheck = true;
641 return false;
642 }
643 else
644 {
645 bool candelete = true;
646
647 bpage = tbm_find_pageentry(b, apage->blockno);
648 if (bpage != NULL)
649 {
650 /* Both pages are exact, merge at the bit level */
651 Assert(!bpage->ischunk);
652 for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
653 {
654 apage->words[wordnum] &= bpage->words[wordnum];
655 if (apage->words[wordnum] != 0)
656 candelete = false;
657 }
658 apage->recheck |= bpage->recheck;
659 }
660 /* If there is no matching b page, we can just delete the a page */
661 return candelete;
662 }
663}
664
665/*
666 * tbm_is_empty - is a TIDBitmap completely empty?
667 */
668bool
669tbm_is_empty(const TIDBitmap *tbm)
670{
671 return (tbm->nentries == 0);
672}
673
674/*
675 * tbm_begin_iterate - prepare to iterate through a TIDBitmap
676 *
677 * The TBMIterator struct is created in the caller's memory context.
678 * For a clean shutdown of the iteration, call tbm_end_iterate; but it's
679 * okay to just allow the memory context to be released, too. It is caller's
680 * responsibility not to touch the TBMIterator anymore once the TIDBitmap
681 * is freed.
682 *
683 * NB: after this is called, it is no longer allowed to modify the contents
684 * of the bitmap. However, you can call this multiple times to scan the
685 * contents repeatedly, including parallel scans.
686 */
687TBMIterator *
688tbm_begin_iterate(TIDBitmap *tbm)
689{
690 TBMIterator *iterator;
691
692 Assert(tbm->iterating != TBM_ITERATING_SHARED);
693
694 /*
695 * Create the TBMIterator struct, with enough trailing space to serve the
696 * needs of the TBMIterateResult sub-struct.
697 */
698 iterator = (TBMIterator *) palloc(sizeof(TBMIterator) +
699 MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
700 iterator->tbm = tbm;
701
702 /*
703 * Initialize iteration pointers.
704 */
705 iterator->spageptr = 0;
706 iterator->schunkptr = 0;
707 iterator->schunkbit = 0;
708
709 /*
710 * If we have a hashtable, create and fill the sorted page lists, unless
711 * we already did that for a previous iterator. Note that the lists are
712 * attached to the bitmap not the iterator, so they can be used by more
713 * than one iterator.
714 */
715 if (tbm->status == TBM_HASH && tbm->iterating == TBM_NOT_ITERATING)
716 {
717 pagetable_iterator i;
718 PagetableEntry *page;
719 int npages;
720 int nchunks;
721
722 if (!tbm->spages && tbm->npages > 0)
723 tbm->spages = (PagetableEntry **)
724 MemoryContextAlloc(tbm->mcxt,
725 tbm->npages * sizeof(PagetableEntry *));
726 if (!tbm->schunks && tbm->nchunks > 0)
727 tbm->schunks = (PagetableEntry **)
728 MemoryContextAlloc(tbm->mcxt,
729 tbm->nchunks * sizeof(PagetableEntry *));
730
731 npages = nchunks = 0;
732 pagetable_start_iterate(tbm->pagetable, &i);
733 while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
734 {
735 if (page->ischunk)
736 tbm->schunks[nchunks++] = page;
737 else
738 tbm->spages[npages++] = page;
739 }
740 Assert(npages == tbm->npages);
741 Assert(nchunks == tbm->nchunks);
742 if (npages > 1)
743 qsort(tbm->spages, npages, sizeof(PagetableEntry *),
744 tbm_comparator);
745 if (nchunks > 1)
746 qsort(tbm->schunks, nchunks, sizeof(PagetableEntry *),
747 tbm_comparator);
748 }
749
750 tbm->iterating = TBM_ITERATING_PRIVATE;
751
752 return iterator;
753}
754
755/*
756 * tbm_prepare_shared_iterate - prepare shared iteration state for a TIDBitmap.
757 *
758 * The necessary shared state will be allocated from the DSA passed to
759 * tbm_create, so that multiple processes can attach to it and iterate jointly.
760 *
761 * This will convert the pagetable hash into page and chunk array of the index
762 * into pagetable array.
763 */
764dsa_pointer
765tbm_prepare_shared_iterate(TIDBitmap *tbm)
766{
767 dsa_pointer dp;
768 TBMSharedIteratorState *istate;
769 PTEntryArray *ptbase = NULL;
770 PTIterationArray *ptpages = NULL;
771 PTIterationArray *ptchunks = NULL;
772
773 Assert(tbm->dsa != NULL);
774 Assert(tbm->iterating != TBM_ITERATING_PRIVATE);
775
776 /*
777 * Allocate TBMSharedIteratorState from DSA to hold the shared members and
778 * lock, this will also be used by multiple worker for shared iterate.
779 */
780 dp = dsa_allocate0(tbm->dsa, sizeof(TBMSharedIteratorState));
781 istate = dsa_get_address(tbm->dsa, dp);
782
783 /*
784 * If we're not already iterating, create and fill the sorted page lists.
785 * (If we are, the sorted page lists are already stored in the TIDBitmap,
786 * and we can just reuse them.)
787 */
788 if (tbm->iterating == TBM_NOT_ITERATING)
789 {
790 pagetable_iterator i;
791 PagetableEntry *page;
792 int idx;
793 int npages;
794 int nchunks;
795
796 /*
797 * Allocate the page and chunk array memory from the DSA to share
798 * across multiple processes.
799 */
800 if (tbm->npages)
801 {
802 tbm->ptpages = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
803 tbm->npages * sizeof(int));
804 ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
805 pg_atomic_init_u32(&ptpages->refcount, 0);
806 }
807 if (tbm->nchunks)
808 {
809 tbm->ptchunks = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
810 tbm->nchunks * sizeof(int));
811 ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);
812 pg_atomic_init_u32(&ptchunks->refcount, 0);
813 }
814
815 /*
816 * If TBM status is TBM_HASH then iterate over the pagetable and
817 * convert it to page and chunk arrays. But if it's in the
818 * TBM_ONE_PAGE mode then directly allocate the space for one entry
819 * from the DSA.
820 */
821 npages = nchunks = 0;
822 if (tbm->status == TBM_HASH)
823 {
824 ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
825
826 pagetable_start_iterate(tbm->pagetable, &i);
827 while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
828 {
829 idx = page - ptbase->ptentry;
830 if (page->ischunk)
831 ptchunks->index[nchunks++] = idx;
832 else
833 ptpages->index[npages++] = idx;
834 }
835
836 Assert(npages == tbm->npages);
837 Assert(nchunks == tbm->nchunks);
838 }
839 else if (tbm->status == TBM_ONE_PAGE)
840 {
841 /*
842 * In one page mode allocate the space for one pagetable entry,
843 * initialize it, and directly store its index (i.e. 0) in the
844 * page array.
845 */
846 tbm->dsapagetable = dsa_allocate(tbm->dsa, sizeof(PTEntryArray) +
847 sizeof(PagetableEntry));
848 ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
849 memcpy(ptbase->ptentry, &tbm->entry1, sizeof(PagetableEntry));
850 ptpages->index[0] = 0;
851 }
852
853 if (ptbase != NULL)
854 pg_atomic_init_u32(&ptbase->refcount, 0);
855 if (npages > 1)
856 qsort_arg((void *) (ptpages->index), npages, sizeof(int),
857 tbm_shared_comparator, (void *) ptbase->ptentry);
858 if (nchunks > 1)
859 qsort_arg((void *) (ptchunks->index), nchunks, sizeof(int),
860 tbm_shared_comparator, (void *) ptbase->ptentry);
861 }
862
863 /*
864 * Store the TBM members in the shared state so that we can share them
865 * across multiple processes.
866 */
867 istate->nentries = tbm->nentries;
868 istate->maxentries = tbm->maxentries;
869 istate->npages = tbm->npages;
870 istate->nchunks = tbm->nchunks;
871 istate->pagetable = tbm->dsapagetable;
872 istate->spages = tbm->ptpages;
873 istate->schunks = tbm->ptchunks;
874
875 ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
876 ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
877 ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);
878
879 /*
880 * For every shared iterator, referring to pagetable and iterator array,
881 * increase the refcount by 1 so that while freeing the shared iterator we
882 * don't free pagetable and iterator array until its refcount becomes 0.
883 */
884 if (ptbase != NULL)
885 pg_atomic_add_fetch_u32(&ptbase->refcount, 1);
886 if (ptpages != NULL)
887 pg_atomic_add_fetch_u32(&ptpages->refcount, 1);
888 if (ptchunks != NULL)
889 pg_atomic_add_fetch_u32(&ptchunks->refcount, 1);
890
891 /* Initialize the iterator lock */
892 LWLockInitialize(&istate->lock, LWTRANCHE_TBM);
893
894 /* Initialize the shared iterator state */
895 istate->schunkbit = 0;
896 istate->schunkptr = 0;
897 istate->spageptr = 0;
898
899 tbm->iterating = TBM_ITERATING_SHARED;
900
901 return dp;
902}
903
904/*
905 * tbm_extract_page_tuple - extract the tuple offsets from a page
906 *
907 * The extracted offsets are stored into TBMIterateResult.
908 */
909static inline int
910tbm_extract_page_tuple(PagetableEntry *page, TBMIterateResult *output)
911{
912 int wordnum;
913 int ntuples = 0;
914
915 for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
916 {
917 bitmapword w = page->words[wordnum];
918
919 if (w != 0)
920 {
921 int off = wordnum * BITS_PER_BITMAPWORD + 1;
922
923 while (w != 0)
924 {
925 if (w & 1)
926 output->offsets[ntuples++] = (OffsetNumber) off;
927 off++;
928 w >>= 1;
929 }
930 }
931 }
932
933 return ntuples;
934}
935
936/*
937 * tbm_advance_schunkbit - Advance the schunkbit
938 */
939static inline void
940tbm_advance_schunkbit(PagetableEntry *chunk, int *schunkbitp)
941{
942 int schunkbit = *schunkbitp;
943
944 while (schunkbit < PAGES_PER_CHUNK)
945 {
946 int wordnum = WORDNUM(schunkbit);
947 int bitnum = BITNUM(schunkbit);
948
949 if ((chunk->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
950 break;
951 schunkbit++;
952 }
953
954 *schunkbitp = schunkbit;
955}
956
957/*
958 * tbm_iterate - scan through next page of a TIDBitmap
959 *
960 * Returns a TBMIterateResult representing one page, or NULL if there are
961 * no more pages to scan. Pages are guaranteed to be delivered in numerical
962 * order. If result->ntuples < 0, then the bitmap is "lossy" and failed to
963 * remember the exact tuples to look at on this page --- the caller must
964 * examine all tuples on the page and check if they meet the intended
965 * condition. If result->recheck is true, only the indicated tuples need
966 * be examined, but the condition must be rechecked anyway. (For ease of
967 * testing, recheck is always set true when ntuples < 0.)
968 */
969TBMIterateResult *
970tbm_iterate(TBMIterator *iterator)
971{
972 TIDBitmap *tbm = iterator->tbm;
973 TBMIterateResult *output = &(iterator->output);
974
975 Assert(tbm->iterating == TBM_ITERATING_PRIVATE);
976
977 /*
978 * If lossy chunk pages remain, make sure we've advanced schunkptr/
979 * schunkbit to the next set bit.
980 */
981 while (iterator->schunkptr < tbm->nchunks)
982 {
983 PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
984 int schunkbit = iterator->schunkbit;
985
986 tbm_advance_schunkbit(chunk, &schunkbit);
987 if (schunkbit < PAGES_PER_CHUNK)
988 {
989 iterator->schunkbit = schunkbit;
990 break;
991 }
992 /* advance to next chunk */
993 iterator->schunkptr++;
994 iterator->schunkbit = 0;
995 }
996
997 /*
998 * If both chunk and per-page data remain, must output the numerically
999 * earlier page.
1000 */
1001 if (iterator->schunkptr < tbm->nchunks)
1002 {
1003 PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
1004 BlockNumber chunk_blockno;
1005
1006 chunk_blockno = chunk->blockno + iterator->schunkbit;
1007 if (iterator->spageptr >= tbm->npages ||
1008 chunk_blockno < tbm->spages[iterator->spageptr]->blockno)
1009 {
1010 /* Return a lossy page indicator from the chunk */
1011 output->blockno = chunk_blockno;
1012 output->ntuples = -1;
1013 output->recheck = true;
1014 iterator->schunkbit++;
1015 return output;
1016 }
1017 }
1018
1019 if (iterator->spageptr < tbm->npages)
1020 {
1021 PagetableEntry *page;
1022 int ntuples;
1023
1024 /* In ONE_PAGE state, we don't allocate an spages[] array */
1025 if (tbm->status == TBM_ONE_PAGE)
1026 page = &tbm->entry1;
1027 else
1028 page = tbm->spages[iterator->spageptr];
1029
1030 /* scan bitmap to extract individual offset numbers */
1031 ntuples = tbm_extract_page_tuple(page, output);
1032 output->blockno = page->blockno;
1033 output->ntuples = ntuples;
1034 output->recheck = page->recheck;
1035 iterator->spageptr++;
1036 return output;
1037 }
1038
1039 /* Nothing more in the bitmap */
1040 return NULL;
1041}
1042
1043/*
1044 * tbm_shared_iterate - scan through next page of a TIDBitmap
1045 *
1046 * As above, but this will iterate using an iterator which is shared
1047 * across multiple processes. We need to acquire the iterator LWLock,
1048 * before accessing the shared members.
1049 */
1050TBMIterateResult *
1051tbm_shared_iterate(TBMSharedIterator *iterator)
1052{
1053 TBMIterateResult *output = &iterator->output;
1054 TBMSharedIteratorState *istate = iterator->state;
1055 PagetableEntry *ptbase = NULL;
1056 int *idxpages = NULL;
1057 int *idxchunks = NULL;
1058
1059 if (iterator->ptbase != NULL)
1060 ptbase = iterator->ptbase->ptentry;
1061 if (iterator->ptpages != NULL)
1062 idxpages = iterator->ptpages->index;
1063 if (iterator->ptchunks != NULL)
1064 idxchunks = iterator->ptchunks->index;
1065
1066 /* Acquire the LWLock before accessing the shared members */
1067 LWLockAcquire(&istate->lock, LW_EXCLUSIVE);
1068
1069 /*
1070 * If lossy chunk pages remain, make sure we've advanced schunkptr/
1071 * schunkbit to the next set bit.
1072 */
1073 while (istate->schunkptr < istate->nchunks)
1074 {
1075 PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]];
1076 int schunkbit = istate->schunkbit;
1077
1078 tbm_advance_schunkbit(chunk, &schunkbit);
1079 if (schunkbit < PAGES_PER_CHUNK)
1080 {
1081 istate->schunkbit = schunkbit;
1082 break;
1083 }
1084 /* advance to next chunk */
1085 istate->schunkptr++;
1086 istate->schunkbit = 0;
1087 }
1088
1089 /*
1090 * If both chunk and per-page data remain, must output the numerically
1091 * earlier page.
1092 */
1093 if (istate->schunkptr < istate->nchunks)
1094 {
1095 PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]];
1096 BlockNumber chunk_blockno;
1097
1098 chunk_blockno = chunk->blockno + istate->schunkbit;
1099
1100 if (istate->spageptr >= istate->npages ||
1101 chunk_blockno < ptbase[idxpages[istate->spageptr]].blockno)
1102 {
1103 /* Return a lossy page indicator from the chunk */
1104 output->blockno = chunk_blockno;
1105 output->ntuples = -1;
1106 output->recheck = true;
1107 istate->schunkbit++;
1108
1109 LWLockRelease(&istate->lock);
1110 return output;
1111 }
1112 }
1113
1114 if (istate->spageptr < istate->npages)
1115 {
1116 PagetableEntry *page = &ptbase[idxpages[istate->spageptr]];
1117 int ntuples;
1118
1119 /* scan bitmap to extract individual offset numbers */
1120 ntuples = tbm_extract_page_tuple(page, output);
1121 output->blockno = page->blockno;
1122 output->ntuples = ntuples;
1123 output->recheck = page->recheck;
1124 istate->spageptr++;
1125
1126 LWLockRelease(&istate->lock);
1127
1128 return output;
1129 }
1130
1131 LWLockRelease(&istate->lock);
1132
1133 /* Nothing more in the bitmap */
1134 return NULL;
1135}
1136
1137/*
1138 * tbm_end_iterate - finish an iteration over a TIDBitmap
1139 *
1140 * Currently this is just a pfree, but it might do more someday. (For
1141 * instance, it could be useful to count open iterators and allow the
1142 * bitmap to return to read/write status when there are no more iterators.)
1143 */
1144void
1145tbm_end_iterate(TBMIterator *iterator)
1146{
1147 pfree(iterator);
1148}
1149
1150/*
1151 * tbm_end_shared_iterate - finish a shared iteration over a TIDBitmap
1152 *
1153 * This doesn't free any of the shared state associated with the iterator,
1154 * just our backend-private state.
1155 */
1156void
1157tbm_end_shared_iterate(TBMSharedIterator *iterator)
1158{
1159 pfree(iterator);
1160}
1161
1162/*
1163 * tbm_find_pageentry - find a PagetableEntry for the pageno
1164 *
1165 * Returns NULL if there is no non-lossy entry for the pageno.
1166 */
1167static const PagetableEntry *
1168tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno)
1169{
1170 const PagetableEntry *page;
1171
1172 if (tbm->nentries == 0) /* in case pagetable doesn't exist */
1173 return NULL;
1174
1175 if (tbm->status == TBM_ONE_PAGE)
1176 {
1177 page = &tbm->entry1;
1178 if (page->blockno != pageno)
1179 return NULL;
1180 Assert(!page->ischunk);
1181 return page;
1182 }
1183
1184 page = pagetable_lookup(tbm->pagetable, pageno);
1185 if (page == NULL)
1186 return NULL;
1187 if (page->ischunk)
1188 return NULL; /* don't want a lossy chunk header */
1189 return page;
1190}
1191
1192/*
1193 * tbm_get_pageentry - find or create a PagetableEntry for the pageno
1194 *
1195 * If new, the entry is marked as an exact (non-chunk) entry.
1196 *
1197 * This may cause the table to exceed the desired memory size. It is
1198 * up to the caller to call tbm_lossify() at the next safe point if so.
1199 */
1200static PagetableEntry *
1201tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno)
1202{
1203 PagetableEntry *page;
1204 bool found;
1205
1206 if (tbm->status == TBM_EMPTY)
1207 {
1208 /* Use the fixed slot */
1209 page = &tbm->entry1;
1210 found = false;
1211 tbm->status = TBM_ONE_PAGE;
1212 }
1213 else
1214 {
1215 if (tbm->status == TBM_ONE_PAGE)
1216 {
1217 page = &tbm->entry1;
1218 if (page->blockno == pageno)
1219 return page;
1220 /* Time to switch from one page to a hashtable */
1221 tbm_create_pagetable(tbm);
1222 }
1223
1224 /* Look up or create an entry */
1225 page = pagetable_insert(tbm->pagetable, pageno, &found);
1226 }
1227
1228 /* Initialize it if not present before */
1229 if (!found)
1230 {
1231 char oldstatus = page->status;
1232
1233 MemSet(page, 0, sizeof(PagetableEntry));
1234 page->status = oldstatus;
1235 page->blockno = pageno;
1236 /* must count it too */
1237 tbm->nentries++;
1238 tbm->npages++;
1239 }
1240
1241 return page;
1242}
1243
1244/*
1245 * tbm_page_is_lossy - is the page marked as lossily stored?
1246 */
1247static bool
1248tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno)
1249{
1250 PagetableEntry *page;
1251 BlockNumber chunk_pageno;
1252 int bitno;
1253
1254 /* we can skip the lookup if there are no lossy chunks */
1255 if (tbm->nchunks == 0)
1256 return false;
1257 Assert(tbm->status == TBM_HASH);
1258
1259 bitno = pageno % PAGES_PER_CHUNK;
1260 chunk_pageno = pageno - bitno;
1261
1262 page = pagetable_lookup(tbm->pagetable, chunk_pageno);
1263
1264 if (page != NULL && page->ischunk)
1265 {
1266 int wordnum = WORDNUM(bitno);
1267 int bitnum = BITNUM(bitno);
1268
1269 if ((page->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
1270 return true;
1271 }
1272 return false;
1273}
1274
1275/*
1276 * tbm_mark_page_lossy - mark the page number as lossily stored
1277 *
1278 * This may cause the table to exceed the desired memory size. It is
1279 * up to the caller to call tbm_lossify() at the next safe point if so.
1280 */
1281static void
1282tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
1283{
1284 PagetableEntry *page;
1285 bool found;
1286 BlockNumber chunk_pageno;
1287 int bitno;
1288 int wordnum;
1289 int bitnum;
1290
1291 /* We force the bitmap into hashtable mode whenever it's lossy */
1292 if (tbm->status != TBM_HASH)
1293 tbm_create_pagetable(tbm);
1294
1295 bitno = pageno % PAGES_PER_CHUNK;
1296 chunk_pageno = pageno - bitno;
1297
1298 /*
1299 * Remove any extant non-lossy entry for the page. If the page is its own
1300 * chunk header, however, we skip this and handle the case below.
1301 */
1302 if (bitno != 0)
1303 {
1304 if (pagetable_delete(tbm->pagetable, pageno))
1305 {
1306 /* It was present, so adjust counts */
1307 tbm->nentries--;
1308 tbm->npages--; /* assume it must have been non-lossy */
1309 }
1310 }
1311
1312 /* Look up or create entry for chunk-header page */
1313 page = pagetable_insert(tbm->pagetable, chunk_pageno, &found);
1314
1315 /* Initialize it if not present before */
1316 if (!found)
1317 {
1318 char oldstatus = page->status;
1319
1320 MemSet(page, 0, sizeof(PagetableEntry));
1321 page->status = oldstatus;
1322 page->blockno = chunk_pageno;
1323 page->ischunk = true;
1324 /* must count it too */
1325 tbm->nentries++;
1326 tbm->nchunks++;
1327 }
1328 else if (!page->ischunk)
1329 {
1330 char oldstatus = page->status;
1331
1332 /* chunk header page was formerly non-lossy, make it lossy */
1333 MemSet(page, 0, sizeof(PagetableEntry));
1334 page->status = oldstatus;
1335 page->blockno = chunk_pageno;
1336 page->ischunk = true;
1337 /* we assume it had some tuple bit(s) set, so mark it lossy */
1338 page->words[0] = ((bitmapword) 1 << 0);
1339 /* adjust counts */
1340 tbm->nchunks++;
1341 tbm->npages--;
1342 }
1343
1344 /* Now set the original target page's bit */
1345 wordnum = WORDNUM(bitno);
1346 bitnum = BITNUM(bitno);
1347 page->words[wordnum] |= ((bitmapword) 1 << bitnum);
1348}
1349
1350/*
1351 * tbm_lossify - lose some information to get back under the memory limit
1352 */
1353static void
1354tbm_lossify(TIDBitmap *tbm)
1355{
1356 pagetable_iterator i;
1357 PagetableEntry *page;
1358
1359 /*
1360 * XXX Really stupid implementation: this just lossifies pages in
1361 * essentially random order. We should be paying some attention to the
1362 * number of bits set in each page, instead.
1363 *
1364 * Since we are called as soon as nentries exceeds maxentries, we should
1365 * push nentries down to significantly less than maxentries, or else we'll
1366 * just end up doing this again very soon. We shoot for maxentries/2.
1367 */
1368 Assert(tbm->iterating == TBM_NOT_ITERATING);
1369 Assert(tbm->status == TBM_HASH);
1370
1371 pagetable_start_iterate_at(tbm->pagetable, &i, tbm->lossify_start);
1372 while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
1373 {
1374 if (page->ischunk)
1375 continue; /* already a chunk header */
1376
1377 /*
1378 * If the page would become a chunk header, we won't save anything by
1379 * converting it to lossy, so skip it.
1380 */
1381 if ((page->blockno % PAGES_PER_CHUNK) == 0)
1382 continue;
1383
1384 /* This does the dirty work ... */
1385 tbm_mark_page_lossy(tbm, page->blockno);
1386
1387 if (tbm->nentries <= tbm->maxentries / 2)
1388 {
1389 /*
1390 * We have made enough room. Remember where to start lossifying
1391 * next round, so we evenly iterate over the hashtable.
1392 */
1393 tbm->lossify_start = i.cur;
1394 break;
1395 }
1396
1397 /*
1398 * Note: tbm_mark_page_lossy may have inserted a lossy chunk into the
1399 * hashtable and may have deleted the non-lossy chunk. We can
1400 * continue the same hash table scan, since failure to visit one
1401 * element or visiting the newly inserted element, isn't fatal.
1402 */
1403 }
1404
1405 /*
1406 * With a big bitmap and small work_mem, it's possible that we cannot get
1407 * under maxentries. Again, if that happens, we'd end up uselessly
1408 * calling tbm_lossify over and over. To prevent this from becoming a
1409 * performance sink, force maxentries up to at least double the current
1410 * number of entries. (In essence, we're admitting inability to fit
1411 * within work_mem when we do this.) Note that this test will not fire if
1412 * we broke out of the loop early; and if we didn't, the current number of
1413 * entries is simply not reducible any further.
1414 */
1415 if (tbm->nentries > tbm->maxentries / 2)
1416 tbm->maxentries = Min(tbm->nentries, (INT_MAX - 1) / 2) * 2;
1417}
1418
1419/*
1420 * qsort comparator to handle PagetableEntry pointers.
1421 */
1422static int
1423tbm_comparator(const void *left, const void *right)
1424{
1425 BlockNumber l = (*((PagetableEntry *const *) left))->blockno;
1426 BlockNumber r = (*((PagetableEntry *const *) right))->blockno;
1427
1428 if (l < r)
1429 return -1;
1430 else if (l > r)
1431 return 1;
1432 return 0;
1433}
1434
1435/*
1436 * As above, but this will get index into PagetableEntry array. Therefore,
1437 * it needs to get actual PagetableEntry using the index before comparing the
1438 * blockno.
1439 */
1440static int
1441tbm_shared_comparator(const void *left, const void *right, void *arg)
1442{
1443 PagetableEntry *base = (PagetableEntry *) arg;
1444 PagetableEntry *lpage = &base[*(int *) left];
1445 PagetableEntry *rpage = &base[*(int *) right];
1446
1447 if (lpage->blockno < rpage->blockno)
1448 return -1;
1449 else if (lpage->blockno > rpage->blockno)
1450 return 1;
1451 return 0;
1452}
1453
1454/*
1455 * tbm_attach_shared_iterate
1456 *
1457 * Allocate a backend-private iterator and attach the shared iterator state
1458 * to it so that multiple processed can iterate jointly.
1459 *
1460 * We also converts the DSA pointers to local pointers and store them into
1461 * our private iterator.
1462 */
1463TBMSharedIterator *
1464tbm_attach_shared_iterate(dsa_area *dsa, dsa_pointer dp)
1465{
1466 TBMSharedIterator *iterator;
1467 TBMSharedIteratorState *istate;
1468
1469 /*
1470 * Create the TBMSharedIterator struct, with enough trailing space to
1471 * serve the needs of the TBMIterateResult sub-struct.
1472 */
1473 iterator = (TBMSharedIterator *) palloc0(sizeof(TBMSharedIterator) +
1474 MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
1475
1476 istate = (TBMSharedIteratorState *) dsa_get_address(dsa, dp);
1477
1478 iterator->state = istate;
1479
1480 iterator->ptbase = dsa_get_address(dsa, istate->pagetable);
1481
1482 if (istate->npages)
1483 iterator->ptpages = dsa_get_address(dsa, istate->spages);
1484 if (istate->nchunks)
1485 iterator->ptchunks = dsa_get_address(dsa, istate->schunks);
1486
1487 return iterator;
1488}
1489
1490/*
1491 * pagetable_allocate
1492 *
1493 * Callback function for allocating the memory for hashtable elements.
1494 * Allocate memory for hashtable elements, using DSA if available.
1495 */
1496static inline void *
1497pagetable_allocate(pagetable_hash *pagetable, Size size)
1498{
1499 TIDBitmap *tbm = (TIDBitmap *) pagetable->private_data;
1500 PTEntryArray *ptbase;
1501
1502 if (tbm->dsa == NULL)
1503 return MemoryContextAllocExtended(pagetable->ctx, size,
1504 MCXT_ALLOC_HUGE | MCXT_ALLOC_ZERO);
1505
1506 /*
1507 * Save the dsapagetable reference in dsapagetableold before allocating
1508 * new memory so that pagetable_free can free the old entry.
1509 */
1510 tbm->dsapagetableold = tbm->dsapagetable;
1511 tbm->dsapagetable = dsa_allocate_extended(tbm->dsa,
1512 sizeof(PTEntryArray) + size,
1513 DSA_ALLOC_HUGE | DSA_ALLOC_ZERO);
1514 ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
1515
1516 return ptbase->ptentry;
1517}
1518
1519/*
1520 * pagetable_free
1521 *
1522 * Callback function for freeing hash table elements.
1523 */
1524static inline void
1525pagetable_free(pagetable_hash *pagetable, void *pointer)
1526{
1527 TIDBitmap *tbm = (TIDBitmap *) pagetable->private_data;
1528
1529 /* pfree the input pointer if DSA is not available */
1530 if (tbm->dsa == NULL)
1531 pfree(pointer);
1532 else if (DsaPointerIsValid(tbm->dsapagetableold))
1533 {
1534 dsa_free(tbm->dsa, tbm->dsapagetableold);
1535 tbm->dsapagetableold = InvalidDsaPointer;
1536 }
1537}
1538
1539/*
1540 * tbm_calculate_entries
1541 *
1542 * Estimate number of hashtable entries we can have within maxbytes.
1543 */
1544long
1545tbm_calculate_entries(double maxbytes)
1546{
1547 long nbuckets;
1548
1549 /*
1550 * Estimate number of hashtable entries we can have within maxbytes. This
1551 * estimates the hash cost as sizeof(PagetableEntry), which is good enough
1552 * for our purpose. Also count an extra Pointer per entry for the arrays
1553 * created during iteration readout.
1554 */
1555 nbuckets = maxbytes /
1556 (sizeof(PagetableEntry) + sizeof(Pointer) + sizeof(Pointer));
1557 nbuckets = Min(nbuckets, INT_MAX - 1); /* safety limit */
1558 nbuckets = Max(nbuckets, 16); /* sanity limit */
1559
1560 return nbuckets;
1561}
1562