/*-------------------------------------------------------------------------
 *
 * tidbitmap.c
 *	  PostgreSQL tuple-id (TID) bitmap package
 *
 * This module provides bitmap data structures that are spiritually
 * similar to Bitmapsets, but are specially adapted to store sets of
 * tuple identifiers (TIDs), or ItemPointers. In particular, the division
 * of an ItemPointer into BlockNumber and OffsetNumber is catered for.
 * Also, since we wish to be able to store very large tuple sets in
 * memory with this data structure, we support "lossy" storage, in which
 * we no longer remember individual tuple offsets on a page but only the
 * fact that a particular page needs to be visited.
 *
 * The "lossy" storage uses one bit per disk page, so at the standard 8K
 * BLCKSZ, we can represent all pages in 64GB of disk space in about 1MB
 * of memory. People pushing around tables of that size should have a
 * couple of MB to spare, so we don't worry about providing a second level
 * of lossiness. In theory we could fall back to page ranges at some
 * point, but for now that seems useless complexity.
 *
 * We also support the notion of candidate matches, or rechecking. This
 * means we know that a search need visit only some tuples on a page,
 * but we are not certain that all of those tuples are real matches.
 * So the eventual heap scan must recheck the quals for these tuples only,
 * rather than rechecking the quals for all tuples on the page as in the
 * lossy-bitmap case. Rechecking can be specified when TIDs are inserted
 * into a bitmap, and it can also happen internally when we AND a lossy
 * and a non-lossy page.
 *
 *
 * Copyright (c) 2003-2019, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  src/backend/nodes/tidbitmap.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <limits.h>

#include "access/htup_details.h"
#include "nodes/bitmapset.h"
#include "nodes/tidbitmap.h"
#include "storage/lwlock.h"
#include "utils/dsa.h"
#include "utils/hashutils.h"

/*
 * The maximum number of tuples per page is not large (typically 256 with
 * 8K pages, or 1024 with 32K pages). So there's not much point in making
 * the per-page bitmaps variable size. We just legislate that the size
 * is this:
 */
#define MAX_TUPLES_PER_PAGE  MaxHeapTuplesPerPage

/*
 * When we have to switch over to lossy storage, we use a data structure
 * with one bit per page, where all pages having the same number DIV
 * PAGES_PER_CHUNK are aggregated into one chunk. When a chunk is present
 * and has the bit set for a given page, there must not be a per-page entry
 * for that page in the page table.
 *
 * We actually store both exact pages and lossy chunks in the same hash
 * table, using identical data structures. (This is because the memory
 * management for hashtables doesn't allow space to be transferred
 * easily/efficiently from one hashtable to another.) Therefore it's best
 * if PAGES_PER_CHUNK is the same as MAX_TUPLES_PER_PAGE, or at least not
 * too different. But we also want PAGES_PER_CHUNK to be a power of 2 to
 * avoid expensive integer remainder operations. So, define it like this:
 */
#define PAGES_PER_CHUNK  (BLCKSZ / 32)

/* We use BITS_PER_BITMAPWORD and typedef bitmapword from nodes/bitmapset.h */

#define WORDNUM(x)	((x) / BITS_PER_BITMAPWORD)
#define BITNUM(x)	((x) % BITS_PER_BITMAPWORD)

/* number of active words for an exact page: */
#define WORDS_PER_PAGE	((MAX_TUPLES_PER_PAGE - 1) / BITS_PER_BITMAPWORD + 1)
/* number of active words for a lossy chunk: */
#define WORDS_PER_CHUNK  ((PAGES_PER_CHUNK - 1) / BITS_PER_BITMAPWORD + 1)
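
/*
 * A worked example of the macros above (assuming 64-bit bitmapwords and
 * the standard 8K BLCKSZ): PAGES_PER_CHUNK is 8192/32 = 256, so
 * WORDS_PER_CHUNK is 4; MaxHeapTuplesPerPage is 291, so WORDS_PER_PAGE
 * is 5. A tuple offset of 100 on an exact page is stored as bit 99,
 * i.e. WORDNUM(99) == 1 and BITNUM(99) == 35 within words[].
 */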

/*
 * The hashtable entries are represented by this data structure. For
 * an exact page, blockno is the page number and bit k of the bitmap
 * represents tuple offset k+1. For a lossy chunk, blockno is the first
 * page in the chunk (this must be a multiple of PAGES_PER_CHUNK) and
 * bit k represents page blockno+k. Note that it is not possible to
 * have exact storage for the first page of a chunk if we are using
 * lossy storage for any page in the chunk's range, since the same
 * hashtable entry has to serve both purposes.
 *
 * recheck is used only on exact pages --- it indicates that although
 * only the stated tuples need be checked, the full index qual condition
 * must be checked for each (ie, these are candidate matches).
 */
typedef struct PagetableEntry
{
	BlockNumber blockno;		/* page number (hashtable key) */
	char		status;			/* hash entry status */
	bool		ischunk;		/* T = lossy storage, F = exact */
	bool		recheck;		/* should the tuples be rechecked? */
	bitmapword	words[Max(WORDS_PER_PAGE, WORDS_PER_CHUNK)];
} PagetableEntry;
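
/*
 * To illustrate the two interpretations: an exact entry with blockno = 42
 * and bit 6 of words[0] set represents TID (42,7), since bit k stands for
 * offset k+1; a chunk entry with blockno = 512 (a multiple of
 * PAGES_PER_CHUNK for 8K BLCKSZ) and bit 3 set says that page 515 is
 * stored lossily.
 */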

/*
 * Holds array of pagetable entries.
 */
typedef struct PTEntryArray
{
	pg_atomic_uint32 refcount;	/* number of iterators attached */
	PagetableEntry ptentry[FLEXIBLE_ARRAY_MEMBER];
} PTEntryArray;

/*
 * We want to avoid the overhead of creating the hashtable, which is
 * comparatively large, when not necessary. Particularly when we are using a
 * bitmap scan on the inside of a nestloop join: a bitmap may well live only
 * long enough to accumulate one entry in such cases. We therefore avoid
 * creating an actual hashtable until we need two pagetable entries. When
 * just one pagetable entry is needed, we store it in a fixed field of
 * TIDBitMap. (NOTE: we don't get rid of the hashtable if the bitmap later
 * shrinks down to zero or one page again. So, status can be TBM_HASH even
 * when nentries is zero or one.)
 */
typedef enum
{
	TBM_EMPTY,					/* no hashtable, nentries == 0 */
	TBM_ONE_PAGE,				/* entry1 contains the single entry */
	TBM_HASH					/* pagetable is valid, entry1 is not */
} TBMStatus;
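
/*
 * The transitions between these states are one-way: TBM_EMPTY ->
 * TBM_ONE_PAGE when the first page entry is created (tbm_get_pageentry),
 * and TBM_ONE_PAGE -> TBM_HASH as soon as a second entry is needed or the
 * bitmap is lossified (tbm_create_pagetable). Per the NOTE above, we never
 * move back to a cheaper state.
 */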

/*
 * Current iterating state of the TBM.
 */
typedef enum
{
	TBM_NOT_ITERATING,			/* not yet converted to page and chunk array */
	TBM_ITERATING_PRIVATE,		/* converted to local page and chunk array */
	TBM_ITERATING_SHARED		/* converted to shared page and chunk array */
} TBMIteratingState;

/*
 * Here is the representation for a whole TIDBitMap:
 */
struct TIDBitmap
{
	NodeTag		type;			/* to make it a valid Node */
	MemoryContext mcxt;			/* memory context containing me */
	TBMStatus	status;			/* see codes above */
	struct pagetable_hash *pagetable;	/* hash table of PagetableEntry's */
	int			nentries;		/* number of entries in pagetable */
	int			maxentries;		/* limit on same to meet maxbytes */
	int			npages;			/* number of exact entries in pagetable */
	int			nchunks;		/* number of lossy entries in pagetable */
	TBMIteratingState iterating;	/* tbm_begin_iterate called? */
	uint32		lossify_start;	/* offset to start lossifying hashtable at */
	PagetableEntry entry1;		/* used when status == TBM_ONE_PAGE */
	/* these are valid when iterating is true: */
	PagetableEntry **spages;	/* sorted exact-page list, or NULL */
	PagetableEntry **schunks;	/* sorted lossy-chunk list, or NULL */
	dsa_pointer dsapagetable;	/* dsa_pointer to the element array */
	dsa_pointer dsapagetableold;	/* dsa_pointer to the old element array */
	dsa_pointer ptpages;		/* dsa_pointer to the page array */
	dsa_pointer ptchunks;		/* dsa_pointer to the chunk array */
	dsa_area   *dsa;			/* reference to per-query dsa area */
};

/*
 * When iterating over a bitmap in sorted order, a TBMIterator is used to
 * track our progress. There can be several iterators scanning the same
 * bitmap concurrently. Note that the bitmap becomes read-only as soon as
 * any iterator is created.
 */
struct TBMIterator
{
	TIDBitmap  *tbm;			/* TIDBitmap we're iterating over */
	int			spageptr;		/* next spages index */
	int			schunkptr;		/* next schunks index */
	int			schunkbit;		/* next bit to check in current schunk */
	TBMIterateResult output;	/* MUST BE LAST (because variable-size) */
};

/*
 * Holds the shared members of the iterator so that multiple processes
 * can jointly iterate.
 */
typedef struct TBMSharedIteratorState
{
	int			nentries;		/* number of entries in pagetable */
	int			maxentries;		/* limit on same to meet maxbytes */
	int			npages;			/* number of exact entries in pagetable */
	int			nchunks;		/* number of lossy entries in pagetable */
	dsa_pointer pagetable;		/* dsa pointer to head of pagetable data */
	dsa_pointer spages;			/* dsa pointer to page array */
	dsa_pointer schunks;		/* dsa pointer to chunk array */
	LWLock		lock;			/* lock to protect below members */
	int			spageptr;		/* next spages index */
	int			schunkptr;		/* next schunks index */
	int			schunkbit;		/* next bit to check in current schunk */
} TBMSharedIteratorState;

/*
 * pagetable iteration array.
 */
typedef struct PTIterationArray
{
	pg_atomic_uint32 refcount;	/* number of iterators attached */
	int			index[FLEXIBLE_ARRAY_MEMBER];	/* index array */
} PTIterationArray;

/*
 * Same as TBMIterator, but used for joint iteration; therefore it also
 * holds a reference to the shared state.
 */
struct TBMSharedIterator
{
	TBMSharedIteratorState *state;	/* shared state */
	PTEntryArray *ptbase;		/* pagetable element array */
	PTIterationArray *ptpages;	/* sorted exact page index list */
	PTIterationArray *ptchunks; /* sorted lossy page index list */
	TBMIterateResult output;	/* MUST BE LAST (because variable-size) */
};

/* Local function prototypes */
static void tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage);
static bool tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage,
							   const TIDBitmap *b);
static const PagetableEntry *tbm_find_pageentry(const TIDBitmap *tbm,
												BlockNumber pageno);
static PagetableEntry *tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno);
static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno);
static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno);
static void tbm_lossify(TIDBitmap *tbm);
static int	tbm_comparator(const void *left, const void *right);
static int	tbm_shared_comparator(const void *left, const void *right,
								  void *arg);

/* define hashtable mapping block numbers to PagetableEntry's */
#define SH_USE_NONDEFAULT_ALLOCATOR
#define SH_PREFIX pagetable
#define SH_ELEMENT_TYPE PagetableEntry
#define SH_KEY_TYPE BlockNumber
#define SH_KEY blockno
#define SH_HASH_KEY(tb, key) murmurhash32(key)
#define SH_EQUAL(tb, a, b) a == b
#define SH_SCOPE static inline
#define SH_DEFINE
#define SH_DECLARE
#include "lib/simplehash.h"
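
/*
 * The inclusion above generates the pagetable_hash and pagetable_iterator
 * types and the pagetable_create, pagetable_destroy, pagetable_insert,
 * pagetable_lookup, pagetable_delete, pagetable_start_iterate(_at) and
 * pagetable_iterate functions used below. Since SH_USE_NONDEFAULT_ALLOCATOR
 * is defined, the generated code obtains element-array memory through our
 * pagetable_allocate and pagetable_free callbacks (defined near the bottom
 * of this file), which is what lets the array live in DSA memory.
 */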


/*
 * tbm_create - create an initially-empty bitmap
 *
 * The bitmap will live in the memory context that is CurrentMemoryContext
 * at the time of this call. It will be limited to (approximately) maxbytes
 * total memory consumption. If the DSA passed to this function is not NULL
 * then the memory for storing elements of the underlying page table will
 * be allocated from the DSA.
 */
TIDBitmap *
tbm_create(long maxbytes, dsa_area *dsa)
{
	TIDBitmap  *tbm;

	/* Create the TIDBitmap struct and zero all its fields */
	tbm = makeNode(TIDBitmap);

	tbm->mcxt = CurrentMemoryContext;
	tbm->status = TBM_EMPTY;

	tbm->maxentries = (int) tbm_calculate_entries(maxbytes);
	tbm->lossify_start = 0;
	tbm->dsa = dsa;
	tbm->dsapagetable = InvalidDsaPointer;
	tbm->dsapagetableold = InvalidDsaPointer;
	tbm->ptpages = InvalidDsaPointer;
	tbm->ptchunks = InvalidDsaPointer;

	return tbm;
}
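
/*
 * For orientation, a sketch of typical backend-private usage (variable
 * names here are hypothetical; executor nodes size maxbytes from work_mem):
 *
 *		TIDBitmap  *tbm = tbm_create(work_mem * 1024L, NULL);
 *		TBMIterator *it;
 *		TBMIterateResult *res;
 *
 *		tbm_add_tuples(tbm, tids, ntids, false);
 *		... more tbm_add_tuples calls ...
 *		it = tbm_begin_iterate(tbm);
 *		while ((res = tbm_iterate(it)) != NULL)
 *			... visit heap page res->blockno ...
 *		tbm_end_iterate(it);
 *		tbm_free(tbm);
 */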

/*
 * Actually create the hashtable. Since this is a moderately expensive
 * proposition, we don't do it until we have to.
 */
static void
tbm_create_pagetable(TIDBitmap *tbm)
{
	Assert(tbm->status != TBM_HASH);
	Assert(tbm->pagetable == NULL);

	tbm->pagetable = pagetable_create(tbm->mcxt, 128, tbm);

	/* If entry1 is valid, push it into the hashtable */
	if (tbm->status == TBM_ONE_PAGE)
	{
		PagetableEntry *page;
		bool		found;
		char		oldstatus;

		page = pagetable_insert(tbm->pagetable,
								tbm->entry1.blockno,
								&found);
		Assert(!found);
		oldstatus = page->status;
		memcpy(page, &tbm->entry1, sizeof(PagetableEntry));
		page->status = oldstatus;
	}

	tbm->status = TBM_HASH;
}

/*
 * tbm_free - free a TIDBitmap
 */
void
tbm_free(TIDBitmap *tbm)
{
	if (tbm->pagetable)
		pagetable_destroy(tbm->pagetable);
	if (tbm->spages)
		pfree(tbm->spages);
	if (tbm->schunks)
		pfree(tbm->schunks);
	pfree(tbm);
}

/*
 * tbm_free_shared_area - free shared state
 *
 * Free the shared iterator state, and also free the shared pagetable and
 * iterator arrays if they are no longer referenced by any shared iterator,
 * i.e. once their refcount drops to zero.
 */
void
tbm_free_shared_area(dsa_area *dsa, dsa_pointer dp)
{
	TBMSharedIteratorState *istate = dsa_get_address(dsa, dp);
	PTEntryArray *ptbase;
	PTIterationArray *ptpages;
	PTIterationArray *ptchunks;

	if (DsaPointerIsValid(istate->pagetable))
	{
		ptbase = dsa_get_address(dsa, istate->pagetable);
		if (pg_atomic_sub_fetch_u32(&ptbase->refcount, 1) == 0)
			dsa_free(dsa, istate->pagetable);
	}
	if (DsaPointerIsValid(istate->spages))
	{
		ptpages = dsa_get_address(dsa, istate->spages);
		if (pg_atomic_sub_fetch_u32(&ptpages->refcount, 1) == 0)
			dsa_free(dsa, istate->spages);
	}
	if (DsaPointerIsValid(istate->schunks))
	{
		ptchunks = dsa_get_address(dsa, istate->schunks);
		if (pg_atomic_sub_fetch_u32(&ptchunks->refcount, 1) == 0)
			dsa_free(dsa, istate->schunks);
	}

	dsa_free(dsa, dp);
}

/*
 * tbm_add_tuples - add some tuple IDs to a TIDBitmap
 *
 * If recheck is true, then the recheck flag will be set in the
 * TBMIterateResult when any of these tuples are reported out.
 */
void
tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids,
			   bool recheck)
{
	BlockNumber currblk = InvalidBlockNumber;
	PagetableEntry *page = NULL;	/* only valid when currblk is valid */
	int			i;

	Assert(tbm->iterating == TBM_NOT_ITERATING);
	for (i = 0; i < ntids; i++)
	{
		BlockNumber blk = ItemPointerGetBlockNumber(tids + i);
		OffsetNumber off = ItemPointerGetOffsetNumber(tids + i);
		int			wordnum,
					bitnum;

		/* safety check to ensure we don't overrun bit array bounds */
		if (off < 1 || off > MAX_TUPLES_PER_PAGE)
			elog(ERROR, "tuple offset out of range: %u", off);

		/*
		 * Look up target page unless we already did. This saves cycles when
		 * the input includes consecutive tuples on the same page, which is
		 * common enough to justify an extra test here.
		 */
		if (blk != currblk)
		{
			if (tbm_page_is_lossy(tbm, blk))
				page = NULL;	/* remember page is lossy */
			else
				page = tbm_get_pageentry(tbm, blk);
			currblk = blk;
		}

		if (page == NULL)
			continue;			/* whole page is already marked */

		if (page->ischunk)
		{
			/* The page is a lossy chunk header, set bit for itself */
			wordnum = bitnum = 0;
		}
		else
		{
			/* Page is exact, so set bit for individual tuple */
			wordnum = WORDNUM(off - 1);
			bitnum = BITNUM(off - 1);
		}
		page->words[wordnum] |= ((bitmapword) 1 << bitnum);
		page->recheck |= recheck;

		if (tbm->nentries > tbm->maxentries)
		{
			tbm_lossify(tbm);
			/* Page could have been converted to lossy, so force new lookup */
			currblk = InvalidBlockNumber;
		}
	}
}
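
/*
 * Illustrative call (hypothetical variables): since the currblk test above
 * caches the last page looked up, callers benefit from feeding TIDs grouped
 * by block number, as index scans naturally produce them:
 *
 *		ItemPointerData tid;
 *
 *		ItemPointerSet(&tid, blkno, offnum);
 *		tbm_add_tuples(tbm, &tid, 1, false);
 */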

/*
 * tbm_add_page - add a whole page to a TIDBitmap
 *
 * This causes the whole page to be reported (with the recheck flag)
 * when the TIDBitmap is scanned.
 */
void
tbm_add_page(TIDBitmap *tbm, BlockNumber pageno)
{
	/* Enter the page in the bitmap, or mark it lossy if already present */
	tbm_mark_page_lossy(tbm, pageno);
	/* If we went over the memory limit, lossify some more pages */
	if (tbm->nentries > tbm->maxentries)
		tbm_lossify(tbm);
}

/*
 * tbm_union - set union
 *
 * a is modified in-place, b is not changed
 */
void
tbm_union(TIDBitmap *a, const TIDBitmap *b)
{
	Assert(a->iterating == TBM_NOT_ITERATING);
	/* Nothing to do if b is empty */
	if (b->nentries == 0)
		return;
	/* Scan through chunks and pages in b, merge into a */
	if (b->status == TBM_ONE_PAGE)
		tbm_union_page(a, &b->entry1);
	else
	{
		pagetable_iterator i;
		PagetableEntry *bpage;

		Assert(b->status == TBM_HASH);
		pagetable_start_iterate(b->pagetable, &i);
		while ((bpage = pagetable_iterate(b->pagetable, &i)) != NULL)
			tbm_union_page(a, bpage);
	}
}

/* Process one page of b during a union op */
static void
tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage)
{
	PagetableEntry *apage;
	int			wordnum;

	if (bpage->ischunk)
	{
		/* Scan b's chunk, mark each indicated page lossy in a */
		for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
		{
			bitmapword	w = bpage->words[wordnum];

			if (w != 0)
			{
				BlockNumber pg;

				pg = bpage->blockno + (wordnum * BITS_PER_BITMAPWORD);
				while (w != 0)
				{
					if (w & 1)
						tbm_mark_page_lossy(a, pg);
					pg++;
					w >>= 1;
				}
			}
		}
	}
	else if (tbm_page_is_lossy(a, bpage->blockno))
	{
		/* page is already lossy in a, nothing to do */
		return;
	}
	else
	{
		apage = tbm_get_pageentry(a, bpage->blockno);
		if (apage->ischunk)
		{
			/* The page is a lossy chunk header, set bit for itself */
			apage->words[0] |= ((bitmapword) 1 << 0);
		}
		else
		{
			/* Both pages are exact, merge at the bit level */
			for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
				apage->words[wordnum] |= bpage->words[wordnum];
			apage->recheck |= bpage->recheck;
		}
	}

	if (a->nentries > a->maxentries)
		tbm_lossify(a);
}

/*
 * tbm_intersect - set intersection
 *
 * a is modified in-place, b is not changed
 */
void
tbm_intersect(TIDBitmap *a, const TIDBitmap *b)
{
	Assert(a->iterating == TBM_NOT_ITERATING);
	/* Nothing to do if a is empty */
	if (a->nentries == 0)
		return;
	/* Scan through chunks and pages in a, try to match to b */
	if (a->status == TBM_ONE_PAGE)
	{
		if (tbm_intersect_page(a, &a->entry1, b))
		{
			/* Page is now empty, remove it from a */
			Assert(!a->entry1.ischunk);
			a->npages--;
			a->nentries--;
			Assert(a->nentries == 0);
			a->status = TBM_EMPTY;
		}
	}
	else
	{
		pagetable_iterator i;
		PagetableEntry *apage;

		Assert(a->status == TBM_HASH);
		pagetable_start_iterate(a->pagetable, &i);
		while ((apage = pagetable_iterate(a->pagetable, &i)) != NULL)
		{
			if (tbm_intersect_page(a, apage, b))
			{
				/* Page or chunk is now empty, remove it from a */
				if (apage->ischunk)
					a->nchunks--;
				else
					a->npages--;
				a->nentries--;
				if (!pagetable_delete(a->pagetable, apage->blockno))
					elog(ERROR, "hash table corrupted");
			}
		}
	}
}

/*
 * Process one page of a during an intersection op
 *
 * Returns true if apage is now empty and should be deleted from a
 */
static bool
tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage, const TIDBitmap *b)
{
	const PagetableEntry *bpage;
	int			wordnum;

	if (apage->ischunk)
	{
		/* Scan each bit in chunk, try to clear */
		bool		candelete = true;

		for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
		{
			bitmapword	w = apage->words[wordnum];

			if (w != 0)
			{
				bitmapword	neww = w;
				BlockNumber pg;
				int			bitnum;

				pg = apage->blockno + (wordnum * BITS_PER_BITMAPWORD);
				bitnum = 0;
				while (w != 0)
				{
					if (w & 1)
					{
						if (!tbm_page_is_lossy(b, pg) &&
							tbm_find_pageentry(b, pg) == NULL)
						{
							/* Page is not in b at all, lose lossy bit */
							neww &= ~((bitmapword) 1 << bitnum);
						}
					}
					pg++;
					bitnum++;
					w >>= 1;
				}
				apage->words[wordnum] = neww;
				if (neww != 0)
					candelete = false;
			}
		}
		return candelete;
	}
	else if (tbm_page_is_lossy(b, apage->blockno))
	{
		/*
		 * Some of the tuples in 'a' might not satisfy the quals for 'b', but
		 * because the page 'b' is lossy, we don't know which ones. Therefore
		 * we mark 'a' as requiring rechecks, to indicate that at most those
		 * tuples set in 'a' are matches.
		 */
		apage->recheck = true;
		return false;
	}
	else
	{
		bool		candelete = true;

		bpage = tbm_find_pageentry(b, apage->blockno);
		if (bpage != NULL)
		{
			/* Both pages are exact, merge at the bit level */
			Assert(!bpage->ischunk);
			for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
			{
				apage->words[wordnum] &= bpage->words[wordnum];
				if (apage->words[wordnum] != 0)
					candelete = false;
			}
			apage->recheck |= bpage->recheck;
		}
		/* If there is no matching b page, we can just delete the a page */
		return candelete;
	}
}
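
/*
 * An example of the recheck propagation above: if an exact page in 'a' has
 * bits set for offsets 2 and 5 while the same page is lossy in 'b', the
 * intersection keeps exactly those two bits but sets recheck, since the
 * lossy side only tells us "some tuples on this page match" and the
 * executor must re-verify offsets 2 and 5 against the quals.
 */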

/*
 * tbm_is_empty - is a TIDBitmap completely empty?
 */
bool
tbm_is_empty(const TIDBitmap *tbm)
{
	return (tbm->nentries == 0);
}

/*
 * tbm_begin_iterate - prepare to iterate through a TIDBitmap
 *
 * The TBMIterator struct is created in the caller's memory context.
 * For a clean shutdown of the iteration, call tbm_end_iterate; but it's
 * okay to just allow the memory context to be released, too. It is caller's
 * responsibility not to touch the TBMIterator anymore once the TIDBitmap
 * is freed.
 *
 * NB: after this is called, it is no longer allowed to modify the contents
 * of the bitmap. However, you can call this multiple times to scan the
 * contents repeatedly, including parallel scans.
 */
TBMIterator *
tbm_begin_iterate(TIDBitmap *tbm)
{
	TBMIterator *iterator;

	Assert(tbm->iterating != TBM_ITERATING_SHARED);

	/*
	 * Create the TBMIterator struct, with enough trailing space to serve the
	 * needs of the TBMIterateResult sub-struct.
	 */
	iterator = (TBMIterator *) palloc(sizeof(TBMIterator) +
									  MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
	iterator->tbm = tbm;

	/*
	 * Initialize iteration pointers.
	 */
	iterator->spageptr = 0;
	iterator->schunkptr = 0;
	iterator->schunkbit = 0;

	/*
	 * If we have a hashtable, create and fill the sorted page lists, unless
	 * we already did that for a previous iterator. Note that the lists are
	 * attached to the bitmap not the iterator, so they can be used by more
	 * than one iterator.
	 */
	if (tbm->status == TBM_HASH && tbm->iterating == TBM_NOT_ITERATING)
	{
		pagetable_iterator i;
		PagetableEntry *page;
		int			npages;
		int			nchunks;

		if (!tbm->spages && tbm->npages > 0)
			tbm->spages = (PagetableEntry **)
				MemoryContextAlloc(tbm->mcxt,
								   tbm->npages * sizeof(PagetableEntry *));
		if (!tbm->schunks && tbm->nchunks > 0)
			tbm->schunks = (PagetableEntry **)
				MemoryContextAlloc(tbm->mcxt,
								   tbm->nchunks * sizeof(PagetableEntry *));

		npages = nchunks = 0;
		pagetable_start_iterate(tbm->pagetable, &i);
		while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
		{
			if (page->ischunk)
				tbm->schunks[nchunks++] = page;
			else
				tbm->spages[npages++] = page;
		}
		Assert(npages == tbm->npages);
		Assert(nchunks == tbm->nchunks);
		if (npages > 1)
			qsort(tbm->spages, npages, sizeof(PagetableEntry *),
				  tbm_comparator);
		if (nchunks > 1)
			qsort(tbm->schunks, nchunks, sizeof(PagetableEntry *),
				  tbm_comparator);
	}

	tbm->iterating = TBM_ITERATING_PRIVATE;

	return iterator;
}

/*
 * tbm_prepare_shared_iterate - prepare shared iteration state for a TIDBitmap.
 *
 * The necessary shared state will be allocated from the DSA passed to
 * tbm_create, so that multiple processes can attach to it and iterate jointly.
 *
 * This converts the pagetable hash into arrays of indexes into the shared
 * pagetable element array, one for exact pages and one for lossy chunks.
 */
dsa_pointer
tbm_prepare_shared_iterate(TIDBitmap *tbm)
{
	dsa_pointer dp;
	TBMSharedIteratorState *istate;
	PTEntryArray *ptbase = NULL;
	PTIterationArray *ptpages = NULL;
	PTIterationArray *ptchunks = NULL;

	Assert(tbm->dsa != NULL);
	Assert(tbm->iterating != TBM_ITERATING_PRIVATE);

	/*
	 * Allocate TBMSharedIteratorState from the DSA to hold the shared
	 * members and the lock; this will also be used by multiple workers
	 * during shared iteration.
	 */
	dp = dsa_allocate0(tbm->dsa, sizeof(TBMSharedIteratorState));
	istate = dsa_get_address(tbm->dsa, dp);

	/*
	 * If we're not already iterating, create and fill the sorted page lists.
	 * (If we are, the sorted page lists are already stored in the TIDBitmap,
	 * and we can just reuse them.)
	 */
	if (tbm->iterating == TBM_NOT_ITERATING)
	{
		pagetable_iterator i;
		PagetableEntry *page;
		int			idx;
		int			npages;
		int			nchunks;

		/*
		 * Allocate the page and chunk array memory from the DSA to share
		 * across multiple processes.
		 */
		if (tbm->npages)
		{
			tbm->ptpages = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
										tbm->npages * sizeof(int));
			ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
			pg_atomic_init_u32(&ptpages->refcount, 0);
		}
		if (tbm->nchunks)
		{
			tbm->ptchunks = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
										 tbm->nchunks * sizeof(int));
			ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);
			pg_atomic_init_u32(&ptchunks->refcount, 0);
		}

		/*
		 * If TBM status is TBM_HASH then iterate over the pagetable and
		 * convert it to page and chunk arrays. But if it's in the
		 * TBM_ONE_PAGE mode then directly allocate the space for one entry
		 * from the DSA.
		 */
		npages = nchunks = 0;
		if (tbm->status == TBM_HASH)
		{
			ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);

			pagetable_start_iterate(tbm->pagetable, &i);
			while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
			{
				idx = page - ptbase->ptentry;
				if (page->ischunk)
					ptchunks->index[nchunks++] = idx;
				else
					ptpages->index[npages++] = idx;
			}

			Assert(npages == tbm->npages);
			Assert(nchunks == tbm->nchunks);
		}
		else if (tbm->status == TBM_ONE_PAGE)
		{
			/*
			 * In one page mode allocate the space for one pagetable entry,
			 * initialize it, and directly store its index (i.e. 0) in the
			 * page array.
			 */
			tbm->dsapagetable = dsa_allocate(tbm->dsa, sizeof(PTEntryArray) +
											 sizeof(PagetableEntry));
			ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
			memcpy(ptbase->ptentry, &tbm->entry1, sizeof(PagetableEntry));
			ptpages->index[0] = 0;
		}

		if (ptbase != NULL)
			pg_atomic_init_u32(&ptbase->refcount, 0);
		if (npages > 1)
			qsort_arg((void *) (ptpages->index), npages, sizeof(int),
					  tbm_shared_comparator, (void *) ptbase->ptentry);
		if (nchunks > 1)
			qsort_arg((void *) (ptchunks->index), nchunks, sizeof(int),
					  tbm_shared_comparator, (void *) ptbase->ptentry);
	}

	/*
	 * Store the TBM members in the shared state so that we can share them
	 * across multiple processes.
	 */
	istate->nentries = tbm->nentries;
	istate->maxentries = tbm->maxentries;
	istate->npages = tbm->npages;
	istate->nchunks = tbm->nchunks;
	istate->pagetable = tbm->dsapagetable;
	istate->spages = tbm->ptpages;
	istate->schunks = tbm->ptchunks;

	ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
	ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
	ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);

	/*
	 * On behalf of this shared iterator, bump the refcount on the pagetable
	 * and the iterator arrays by 1, so that tbm_free_shared_area doesn't
	 * free them until every shared iterator referring to them has been
	 * freed and their refcounts drop to zero.
	 */
	if (ptbase != NULL)
		pg_atomic_add_fetch_u32(&ptbase->refcount, 1);
	if (ptpages != NULL)
		pg_atomic_add_fetch_u32(&ptpages->refcount, 1);
	if (ptchunks != NULL)
		pg_atomic_add_fetch_u32(&ptchunks->refcount, 1);

	/* Initialize the iterator lock */
	LWLockInitialize(&istate->lock, LWTRANCHE_TBM);

	/* Initialize the shared iterator state */
	istate->schunkbit = 0;
	istate->schunkptr = 0;
	istate->spageptr = 0;

	tbm->iterating = TBM_ITERATING_SHARED;

	return dp;
}
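
/*
 * Sketch of joint iteration built on the function above (hypothetical
 * variable names): the leader runs
 *
 *		dsa_pointer dp = tbm_prepare_shared_iterate(tbm);
 *
 * and passes dp to the workers through shared memory; every participant
 * (leader included) then does
 *
 *		TBMSharedIterator *it = tbm_attach_shared_iterate(dsa, dp);
 *
 *		while ((res = tbm_shared_iterate(it)) != NULL)
 *			... process res ...
 *		tbm_end_shared_iterate(it);
 *
 * and tbm_free_shared_area(dsa, dp) reclaims the shared state once all
 * participants are done with it.
 */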

/*
 * tbm_extract_page_tuple - extract the tuple offsets from a page
 *
 * The extracted offsets are stored into TBMIterateResult.
 */
static inline int
tbm_extract_page_tuple(PagetableEntry *page, TBMIterateResult *output)
{
	int			wordnum;
	int			ntuples = 0;

	for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
	{
		bitmapword	w = page->words[wordnum];

		if (w != 0)
		{
			int			off = wordnum * BITS_PER_BITMAPWORD + 1;

			while (w != 0)
			{
				if (w & 1)
					output->offsets[ntuples++] = (OffsetNumber) off;
				off++;
				w >>= 1;
			}
		}
	}

	return ntuples;
}

/*
 * tbm_advance_schunkbit - Advance the schunkbit
 */
static inline void
tbm_advance_schunkbit(PagetableEntry *chunk, int *schunkbitp)
{
	int			schunkbit = *schunkbitp;

	while (schunkbit < PAGES_PER_CHUNK)
	{
		int			wordnum = WORDNUM(schunkbit);
		int			bitnum = BITNUM(schunkbit);

		if ((chunk->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
			break;
		schunkbit++;
	}

	*schunkbitp = schunkbit;
}

/*
 * tbm_iterate - scan through next page of a TIDBitmap
 *
 * Returns a TBMIterateResult representing one page, or NULL if there are
 * no more pages to scan. Pages are guaranteed to be delivered in numerical
 * order. If result->ntuples < 0, then the bitmap is "lossy" and failed to
 * remember the exact tuples to look at on this page --- the caller must
 * examine all tuples on the page and check if they meet the intended
 * condition. If result->recheck is true, only the indicated tuples need
 * be examined, but the condition must be rechecked anyway. (For ease of
 * testing, recheck is always set true when ntuples < 0.)
 */
TBMIterateResult *
tbm_iterate(TBMIterator *iterator)
{
	TIDBitmap  *tbm = iterator->tbm;
	TBMIterateResult *output = &(iterator->output);

	Assert(tbm->iterating == TBM_ITERATING_PRIVATE);

	/*
	 * If lossy chunk pages remain, make sure we've advanced schunkptr/
	 * schunkbit to the next set bit.
	 */
	while (iterator->schunkptr < tbm->nchunks)
	{
		PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
		int			schunkbit = iterator->schunkbit;

		tbm_advance_schunkbit(chunk, &schunkbit);
		if (schunkbit < PAGES_PER_CHUNK)
		{
			iterator->schunkbit = schunkbit;
			break;
		}
		/* advance to next chunk */
		iterator->schunkptr++;
		iterator->schunkbit = 0;
	}

	/*
	 * If both chunk and per-page data remain, must output the numerically
	 * earlier page.
	 */
	if (iterator->schunkptr < tbm->nchunks)
	{
		PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
		BlockNumber chunk_blockno;

		chunk_blockno = chunk->blockno + iterator->schunkbit;
		if (iterator->spageptr >= tbm->npages ||
			chunk_blockno < tbm->spages[iterator->spageptr]->blockno)
		{
			/* Return a lossy page indicator from the chunk */
			output->blockno = chunk_blockno;
			output->ntuples = -1;
			output->recheck = true;
			iterator->schunkbit++;
			return output;
		}
	}

	if (iterator->spageptr < tbm->npages)
	{
		PagetableEntry *page;
		int			ntuples;

		/* In ONE_PAGE state, we don't allocate an spages[] array */
		if (tbm->status == TBM_ONE_PAGE)
			page = &tbm->entry1;
		else
			page = tbm->spages[iterator->spageptr];

		/* scan bitmap to extract individual offset numbers */
		ntuples = tbm_extract_page_tuple(page, output);
		output->blockno = page->blockno;
		output->ntuples = ntuples;
		output->recheck = page->recheck;
		iterator->spageptr++;
		return output;
	}

	/* Nothing more in the bitmap */
	return NULL;
}
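
/*
 * An illustrative consumer loop for the conventions above (hypothetical
 * names):
 *
 *		while ((res = tbm_iterate(iterator)) != NULL)
 *		{
 *			if (res->ntuples >= 0)
 *				... visit only res->offsets[0 .. res->ntuples - 1] on page
 *				... res->blockno, rechecking quals if res->recheck
 *			else
 *				... lossy: visit every tuple on page res->blockno and
 *				... recheck the quals for each one
 *		}
 */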

/*
 * tbm_shared_iterate - scan through next page of a TIDBitmap
 *
 * As above, but this will iterate using an iterator which is shared
 * across multiple processes. We need to acquire the iterator LWLock
 * before accessing the shared members.
 */
TBMIterateResult *
tbm_shared_iterate(TBMSharedIterator *iterator)
{
	TBMIterateResult *output = &iterator->output;
	TBMSharedIteratorState *istate = iterator->state;
	PagetableEntry *ptbase = NULL;
	int		   *idxpages = NULL;
	int		   *idxchunks = NULL;

	if (iterator->ptbase != NULL)
		ptbase = iterator->ptbase->ptentry;
	if (iterator->ptpages != NULL)
		idxpages = iterator->ptpages->index;
	if (iterator->ptchunks != NULL)
		idxchunks = iterator->ptchunks->index;

	/* Acquire the LWLock before accessing the shared members */
	LWLockAcquire(&istate->lock, LW_EXCLUSIVE);

	/*
	 * If lossy chunk pages remain, make sure we've advanced schunkptr/
	 * schunkbit to the next set bit.
	 */
	while (istate->schunkptr < istate->nchunks)
	{
		PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]];
		int			schunkbit = istate->schunkbit;

		tbm_advance_schunkbit(chunk, &schunkbit);
		if (schunkbit < PAGES_PER_CHUNK)
		{
			istate->schunkbit = schunkbit;
			break;
		}
		/* advance to next chunk */
		istate->schunkptr++;
		istate->schunkbit = 0;
	}

	/*
	 * If both chunk and per-page data remain, must output the numerically
	 * earlier page.
	 */
	if (istate->schunkptr < istate->nchunks)
	{
		PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]];
		BlockNumber chunk_blockno;

		chunk_blockno = chunk->blockno + istate->schunkbit;

		if (istate->spageptr >= istate->npages ||
			chunk_blockno < ptbase[idxpages[istate->spageptr]].blockno)
		{
			/* Return a lossy page indicator from the chunk */
			output->blockno = chunk_blockno;
			output->ntuples = -1;
			output->recheck = true;
			istate->schunkbit++;

			LWLockRelease(&istate->lock);
			return output;
		}
	}

	if (istate->spageptr < istate->npages)
	{
		PagetableEntry *page = &ptbase[idxpages[istate->spageptr]];
		int			ntuples;

		/* scan bitmap to extract individual offset numbers */
		ntuples = tbm_extract_page_tuple(page, output);
		output->blockno = page->blockno;
		output->ntuples = ntuples;
		output->recheck = page->recheck;
		istate->spageptr++;

		LWLockRelease(&istate->lock);

		return output;
	}

	LWLockRelease(&istate->lock);

	/* Nothing more in the bitmap */
	return NULL;
}

/*
 * tbm_end_iterate - finish an iteration over a TIDBitmap
 *
 * Currently this is just a pfree, but it might do more someday. (For
 * instance, it could be useful to count open iterators and allow the
 * bitmap to return to read/write status when there are no more iterators.)
 */
void
tbm_end_iterate(TBMIterator *iterator)
{
	pfree(iterator);
}

/*
 * tbm_end_shared_iterate - finish a shared iteration over a TIDBitmap
 *
 * This doesn't free any of the shared state associated with the iterator,
 * just our backend-private state.
 */
void
tbm_end_shared_iterate(TBMSharedIterator *iterator)
{
	pfree(iterator);
}

/*
 * tbm_find_pageentry - find a PagetableEntry for the pageno
 *
 * Returns NULL if there is no non-lossy entry for the pageno.
 */
static const PagetableEntry *
tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno)
{
	const PagetableEntry *page;

	if (tbm->nentries == 0)		/* in case pagetable doesn't exist */
		return NULL;

	if (tbm->status == TBM_ONE_PAGE)
	{
		page = &tbm->entry1;
		if (page->blockno != pageno)
			return NULL;
		Assert(!page->ischunk);
		return page;
	}

	page = pagetable_lookup(tbm->pagetable, pageno);
	if (page == NULL)
		return NULL;
	if (page->ischunk)
		return NULL;			/* don't want a lossy chunk header */
	return page;
}

/*
 * tbm_get_pageentry - find or create a PagetableEntry for the pageno
 *
 * If new, the entry is marked as an exact (non-chunk) entry.
 *
 * This may cause the table to exceed the desired memory size. It is
 * up to the caller to call tbm_lossify() at the next safe point if so.
 */
static PagetableEntry *
tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno)
{
	PagetableEntry *page;
	bool		found;

	if (tbm->status == TBM_EMPTY)
	{
		/* Use the fixed slot */
		page = &tbm->entry1;
		found = false;
		tbm->status = TBM_ONE_PAGE;
	}
	else
	{
		if (tbm->status == TBM_ONE_PAGE)
		{
			page = &tbm->entry1;
			if (page->blockno == pageno)
				return page;
			/* Time to switch from one page to a hashtable */
			tbm_create_pagetable(tbm);
		}

		/* Look up or create an entry */
		page = pagetable_insert(tbm->pagetable, pageno, &found);
	}

	/* Initialize it if not present before */
	if (!found)
	{
		char		oldstatus = page->status;

		MemSet(page, 0, sizeof(PagetableEntry));
		page->status = oldstatus;
		page->blockno = pageno;
		/* must count it too */
		tbm->nentries++;
		tbm->npages++;
	}

	return page;
}

/*
 * tbm_page_is_lossy - is the page marked as lossily stored?
 */
static bool
tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno)
{
	PagetableEntry *page;
	BlockNumber chunk_pageno;
	int			bitno;

	/* we can skip the lookup if there are no lossy chunks */
	if (tbm->nchunks == 0)
		return false;
	Assert(tbm->status == TBM_HASH);

	bitno = pageno % PAGES_PER_CHUNK;
	chunk_pageno = pageno - bitno;

	page = pagetable_lookup(tbm->pagetable, chunk_pageno);

	if (page != NULL && page->ischunk)
	{
		int			wordnum = WORDNUM(bitno);
		int			bitnum = BITNUM(bitno);

		if ((page->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
			return true;
	}
	return false;
}

/*
 * tbm_mark_page_lossy - mark the page number as lossily stored
 *
 * This may cause the table to exceed the desired memory size. It is
 * up to the caller to call tbm_lossify() at the next safe point if so.
 */
static void
tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
{
	PagetableEntry *page;
	bool		found;
	BlockNumber chunk_pageno;
	int			bitno;
	int			wordnum;
	int			bitnum;

	/* We force the bitmap into hashtable mode whenever it's lossy */
	if (tbm->status != TBM_HASH)
		tbm_create_pagetable(tbm);

	bitno = pageno % PAGES_PER_CHUNK;
	chunk_pageno = pageno - bitno;

	/*
	 * Remove any extant non-lossy entry for the page. If the page is its own
	 * chunk header, however, we skip this and handle the case below.
	 */
	if (bitno != 0)
	{
		if (pagetable_delete(tbm->pagetable, pageno))
		{
			/* It was present, so adjust counts */
			tbm->nentries--;
			tbm->npages--;		/* assume it must have been non-lossy */
		}
	}

	/* Look up or create entry for chunk-header page */
	page = pagetable_insert(tbm->pagetable, chunk_pageno, &found);

	/* Initialize it if not present before */
	if (!found)
	{
		char		oldstatus = page->status;

		MemSet(page, 0, sizeof(PagetableEntry));
		page->status = oldstatus;
		page->blockno = chunk_pageno;
		page->ischunk = true;
		/* must count it too */
		tbm->nentries++;
		tbm->nchunks++;
	}
	else if (!page->ischunk)
	{
		char		oldstatus = page->status;

		/* chunk header page was formerly non-lossy, make it lossy */
		MemSet(page, 0, sizeof(PagetableEntry));
		page->status = oldstatus;
		page->blockno = chunk_pageno;
		page->ischunk = true;
		/* we assume it had some tuple bit(s) set, so mark it lossy */
		page->words[0] = ((bitmapword) 1 << 0);
		/* adjust counts */
		tbm->nchunks++;
		tbm->npages--;
	}

	/* Now set the original target page's bit */
	wordnum = WORDNUM(bitno);
	bitnum = BITNUM(bitno);
	page->words[wordnum] |= ((bitmapword) 1 << bitnum);
}

/*
 * tbm_lossify - lose some information to get back under the memory limit
 */
static void
tbm_lossify(TIDBitmap *tbm)
{
	pagetable_iterator i;
	PagetableEntry *page;

	/*
	 * XXX Really stupid implementation: this just lossifies pages in
	 * essentially random order. We should be paying some attention to the
	 * number of bits set in each page, instead.
	 *
	 * Since we are called as soon as nentries exceeds maxentries, we should
	 * push nentries down to significantly less than maxentries, or else we'll
	 * just end up doing this again very soon. We shoot for maxentries/2.
	 */
	Assert(tbm->iterating == TBM_NOT_ITERATING);
	Assert(tbm->status == TBM_HASH);

	pagetable_start_iterate_at(tbm->pagetable, &i, tbm->lossify_start);
	while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
	{
		if (page->ischunk)
			continue;			/* already a chunk header */

		/*
		 * If the page would become a chunk header, we won't save anything by
		 * converting it to lossy, so skip it.
		 */
		if ((page->blockno % PAGES_PER_CHUNK) == 0)
			continue;

		/* This does the dirty work ... */
		tbm_mark_page_lossy(tbm, page->blockno);

		if (tbm->nentries <= tbm->maxentries / 2)
		{
			/*
			 * We have made enough room. Remember where to start lossifying
			 * next round, so we evenly iterate over the hashtable.
			 */
			tbm->lossify_start = i.cur;
			break;
		}

		/*
		 * Note: tbm_mark_page_lossy may have inserted a lossy chunk into the
		 * hashtable and may have deleted the non-lossy entry. We can
		 * continue the same hash table scan, since failure to visit one
		 * element or visiting the newly inserted element isn't fatal.
		 */
	}

	/*
	 * With a big bitmap and small work_mem, it's possible that we cannot get
	 * under maxentries. Again, if that happens, we'd end up uselessly
	 * calling tbm_lossify over and over. To prevent this from becoming a
	 * performance sink, force maxentries up to at least double the current
	 * number of entries. (In essence, we're admitting inability to fit
	 * within work_mem when we do this.) Note that this test will not fire if
	 * we broke out of the loop early; and if we didn't, the current number of
	 * entries is simply not reducible any further.
	 */
	if (tbm->nentries > tbm->maxentries / 2)
		tbm->maxentries = Min(tbm->nentries, (INT_MAX - 1) / 2) * 2;
}

/*
 * qsort comparator to handle PagetableEntry pointers.
 */
static int
tbm_comparator(const void *left, const void *right)
{
	BlockNumber l = (*((PagetableEntry *const *) left))->blockno;
	BlockNumber r = (*((PagetableEntry *const *) right))->blockno;

	if (l < r)
		return -1;
	else if (l > r)
		return 1;
	return 0;
}

/*
 * As above, but this receives indexes into the PagetableEntry array, so it
 * must fetch the actual PagetableEntry through the index before comparing
 * blocknos.
 */
static int
tbm_shared_comparator(const void *left, const void *right, void *arg)
{
	PagetableEntry *base = (PagetableEntry *) arg;
	PagetableEntry *lpage = &base[*(int *) left];
	PagetableEntry *rpage = &base[*(int *) right];

	if (lpage->blockno < rpage->blockno)
		return -1;
	else if (lpage->blockno > rpage->blockno)
		return 1;
	return 0;
}

/*
 * tbm_attach_shared_iterate
 *
 * Allocate a backend-private iterator and attach the shared iterator state
 * to it so that multiple processes can iterate jointly.
 *
 * This also converts the DSA pointers to local pointers and stores them in
 * our private iterator.
 */
TBMSharedIterator *
tbm_attach_shared_iterate(dsa_area *dsa, dsa_pointer dp)
{
	TBMSharedIterator *iterator;
	TBMSharedIteratorState *istate;

	/*
	 * Create the TBMSharedIterator struct, with enough trailing space to
	 * serve the needs of the TBMIterateResult sub-struct.
	 */
	iterator = (TBMSharedIterator *) palloc0(sizeof(TBMSharedIterator) +
											 MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));

	istate = (TBMSharedIteratorState *) dsa_get_address(dsa, dp);

	iterator->state = istate;

	iterator->ptbase = dsa_get_address(dsa, istate->pagetable);

	if (istate->npages)
		iterator->ptpages = dsa_get_address(dsa, istate->spages);
	if (istate->nchunks)
		iterator->ptchunks = dsa_get_address(dsa, istate->schunks);

	return iterator;
}

/*
 * pagetable_allocate
 *
 * Callback function for allocating memory for hashtable elements; uses the
 * DSA if available, and the bitmap's memory context otherwise.
 */
static inline void *
pagetable_allocate(pagetable_hash *pagetable, Size size)
{
	TIDBitmap  *tbm = (TIDBitmap *) pagetable->private_data;
	PTEntryArray *ptbase;

	if (tbm->dsa == NULL)
		return MemoryContextAllocExtended(pagetable->ctx, size,
										  MCXT_ALLOC_HUGE | MCXT_ALLOC_ZERO);

	/*
	 * Save the dsapagetable reference in dsapagetableold before allocating
	 * new memory so that pagetable_free can free the old entry.
	 */
	tbm->dsapagetableold = tbm->dsapagetable;
	tbm->dsapagetable = dsa_allocate_extended(tbm->dsa,
											  sizeof(PTEntryArray) + size,
											  DSA_ALLOC_HUGE | DSA_ALLOC_ZERO);
	ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);

	return ptbase->ptentry;
}

/*
 * pagetable_free
 *
 * Callback function for freeing hash table elements.
 */
static inline void
pagetable_free(pagetable_hash *pagetable, void *pointer)
{
	TIDBitmap  *tbm = (TIDBitmap *) pagetable->private_data;

	/* pfree the input pointer if DSA is not available */
	if (tbm->dsa == NULL)
		pfree(pointer);
	else if (DsaPointerIsValid(tbm->dsapagetableold))
	{
		dsa_free(tbm->dsa, tbm->dsapagetableold);
		tbm->dsapagetableold = InvalidDsaPointer;
	}
}

/*
 * tbm_calculate_entries
 *
 * Estimate number of hashtable entries we can have within maxbytes.
 */
long
tbm_calculate_entries(double maxbytes)
{
	long		nbuckets;

	/*
	 * Estimate number of hashtable entries we can have within maxbytes. This
	 * estimates the hash cost as sizeof(PagetableEntry), which is good enough
	 * for our purpose. Also count two extra Pointers per entry for the
	 * spages[] and schunks[] arrays created during iteration readout.
	 */
	nbuckets = maxbytes /
		(sizeof(PagetableEntry) + sizeof(Pointer) + sizeof(Pointer));
	nbuckets = Min(nbuckets, INT_MAX - 1);	/* safety limit */
	nbuckets = Max(nbuckets, 16);	/* sanity limit */

	return nbuckets;
}
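
/*
 * A worked example of the estimate above, assuming 8K heap pages and 64-bit
 * bitmapwords, where sizeof(PagetableEntry) comes to 48 bytes and pointers
 * are 8 bytes: maxbytes = 1MB yields 1048576 / (48 + 8 + 8) = 16384 entries.
 */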