| 1 | /*------------------------------------------------------------------------- |
| 2 | * |
| 3 | * tidbitmap.c |
| 4 | * PostgreSQL tuple-id (TID) bitmap package |
| 5 | * |
| 6 | * This module provides bitmap data structures that are spiritually |
| 7 | * similar to Bitmapsets, but are specially adapted to store sets of |
| 8 | * tuple identifiers (TIDs), or ItemPointers. In particular, the division |
| 9 | * of an ItemPointer into BlockNumber and OffsetNumber is catered for. |
| 10 | * Also, since we wish to be able to store very large tuple sets in |
| 11 | * memory with this data structure, we support "lossy" storage, in which |
| 12 | * we no longer remember individual tuple offsets on a page but only the |
| 13 | * fact that a particular page needs to be visited. |
| 14 | * |
| 15 | * The "lossy" storage uses one bit per disk page, so at the standard 8K |
| 16 | * BLCKSZ, we can represent all pages in 64Gb of disk space in about 1Mb |
| 17 | * of memory. People pushing around tables of that size should have a |
| 18 | * couple of Mb to spare, so we don't worry about providing a second level |
| 19 | * of lossiness. In theory we could fall back to page ranges at some |
| 20 | * point, but for now that seems useless complexity. |
| 21 | * |
| 22 | * We also support the notion of candidate matches, or rechecking. This |
| 23 | * means we know that a search need visit only some tuples on a page, |
| 24 | * but we are not certain that all of those tuples are real matches. |
| 25 | * So the eventual heap scan must recheck the quals for these tuples only, |
| 26 | * rather than rechecking the quals for all tuples on the page as in the |
| 27 | * lossy-bitmap case. Rechecking can be specified when TIDs are inserted |
| 28 | * into a bitmap, and it can also happen internally when we AND a lossy |
| 29 | * and a non-lossy page. |
| 30 | * |
| 31 | * |
| 32 | * Copyright (c) 2003-2019, PostgreSQL Global Development Group |
| 33 | * |
| 34 | * IDENTIFICATION |
| 35 | * src/backend/nodes/tidbitmap.c |
| 36 | * |
| 37 | *------------------------------------------------------------------------- |
| 38 | */ |
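
/*
 * A minimal sketch of typical backend-local usage, as in a bitmap heap
 * scan.  Error handling and the actual heap access are omitted, and
 * "maxbytes", "tids" and "ntids" stand for caller-supplied values (here,
 * TIDs produced by an index scan):
 *
 *		TIDBitmap  *tbm = tbm_create(maxbytes, NULL);
 *		TBMIterator *it;
 *		TBMIterateResult *res;
 *
 *		tbm_add_tuples(tbm, tids, ntids, false);
 *		it = tbm_begin_iterate(tbm);
 *		while ((res = tbm_iterate(it)) != NULL)
 *		{
 *			if (res->ntuples < 0)
 *				.. visit all tuples on page res->blockno ..
 *			else
 *				.. visit res->offsets[0 .. res->ntuples - 1] there,
 *				   rechecking quals if res->recheck is set ..
 *		}
 *		tbm_end_iterate(it);
 *		tbm_free(tbm);
 */
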
| 39 | #include "postgres.h" |
| 40 | |
| 41 | #include <limits.h> |
| 42 | |
| 43 | #include "access/htup_details.h" |
| 44 | #include "nodes/bitmapset.h" |
| 45 | #include "nodes/tidbitmap.h" |
| 46 | #include "storage/lwlock.h" |
| 47 | #include "utils/dsa.h" |
| 48 | #include "utils/hashutils.h" |
| 49 | |
| 50 | /* |
 * The maximum number of tuples per page is not large (a few hundred with
 * 8K pages, or on the order of a thousand with 32K pages).  So there's
 * not much point in making
| 53 | * the per-page bitmaps variable size. We just legislate that the size |
| 54 | * is this: |
| 55 | */ |
| 56 | #define MAX_TUPLES_PER_PAGE MaxHeapTuplesPerPage |
| 57 | |
| 58 | /* |
| 59 | * When we have to switch over to lossy storage, we use a data structure |
| 60 | * with one bit per page, where all pages having the same number DIV |
| 61 | * PAGES_PER_CHUNK are aggregated into one chunk. When a chunk is present |
| 62 | * and has the bit set for a given page, there must not be a per-page entry |
| 63 | * for that page in the page table. |
| 64 | * |
| 65 | * We actually store both exact pages and lossy chunks in the same hash |
| 66 | * table, using identical data structures. (This is because the memory |
 * management for hashtables doesn't easily or efficiently allow space to
 * be transferred from one hashtable to another.)  Therefore it's best
| 69 | * if PAGES_PER_CHUNK is the same as MAX_TUPLES_PER_PAGE, or at least not |
| 70 | * too different. But we also want PAGES_PER_CHUNK to be a power of 2 to |
| 71 | * avoid expensive integer remainder operations. So, define it like this: |
| 72 | */ |
| 73 | #define PAGES_PER_CHUNK (BLCKSZ / 32) |
| 74 | |
| 75 | /* We use BITS_PER_BITMAPWORD and typedef bitmapword from nodes/bitmapset.h */ |
| 76 | |
| 77 | #define WORDNUM(x) ((x) / BITS_PER_BITMAPWORD) |
| 78 | #define BITNUM(x) ((x) % BITS_PER_BITMAPWORD) |
| 79 | |
| 80 | /* number of active words for an exact page: */ |
| 81 | #define WORDS_PER_PAGE ((MAX_TUPLES_PER_PAGE - 1) / BITS_PER_BITMAPWORD + 1) |
| 82 | /* number of active words for a lossy chunk: */ |
| 83 | #define WORDS_PER_CHUNK ((PAGES_PER_CHUNK - 1) / BITS_PER_BITMAPWORD + 1) |
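
/*
 * For example (an illustrative calculation only, assuming the default 8K
 * BLCKSZ and 64-bit bitmapwords): PAGES_PER_CHUNK is 8192/32 = 256, so
 * WORDS_PER_CHUNK is (256 - 1)/64 + 1 = 4; and with MaxHeapTuplesPerPage
 * being a few hundred (291 on common 64-bit builds), WORDS_PER_PAGE is
 * (291 - 1)/64 + 1 = 5.
 */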
| 84 | |
| 85 | /* |
| 86 | * The hashtable entries are represented by this data structure. For |
| 87 | * an exact page, blockno is the page number and bit k of the bitmap |
| 88 | * represents tuple offset k+1. For a lossy chunk, blockno is the first |
| 89 | * page in the chunk (this must be a multiple of PAGES_PER_CHUNK) and |
| 90 | * bit k represents page blockno+k. Note that it is not possible to |
| 91 | * have exact storage for the first page of a chunk if we are using |
| 92 | * lossy storage for any page in the chunk's range, since the same |
| 93 | * hashtable entry has to serve both purposes. |
| 94 | * |
| 95 | * recheck is used only on exact pages --- it indicates that although |
| 96 | * only the stated tuples need be checked, the full index qual condition |
| 97 | * must be checked for each (ie, these are candidate matches). |
| 98 | */ |
| 99 | typedef struct PagetableEntry |
| 100 | { |
| 101 | BlockNumber blockno; /* page number (hashtable key) */ |
| 102 | char status; /* hash entry status */ |
| 103 | bool ischunk; /* T = lossy storage, F = exact */ |
| 104 | bool recheck; /* should the tuples be rechecked? */ |
| 105 | bitmapword words[Max(WORDS_PER_PAGE, WORDS_PER_CHUNK)]; |
| 106 | } PagetableEntry; |
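
/*
 * For instance (illustrative numbers only): on an exact page, tuple offset
 * 10 is recorded as bit 9, i.e. in words[WORDNUM(9)] at bit BITNUM(9);
 * while in a lossy chunk with blockno 512 (a multiple of PAGES_PER_CHUNK =
 * 256), a set bit 3 means that all of page 515 must be visited.
 */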
| 107 | |
| 108 | /* |
| 109 | * Holds array of pagetable entries. |
| 110 | */ |
| 111 | typedef struct PTEntryArray |
| 112 | { |
	pg_atomic_uint32 refcount;	/* number of iterators attached */
| 114 | PagetableEntry ptentry[FLEXIBLE_ARRAY_MEMBER]; |
| 115 | } PTEntryArray; |
| 116 | |
| 117 | /* |
| 118 | * We want to avoid the overhead of creating the hashtable, which is |
| 119 | * comparatively large, when not necessary. Particularly when we are using a |
| 120 | * bitmap scan on the inside of a nestloop join: a bitmap may well live only |
| 121 | * long enough to accumulate one entry in such cases. We therefore avoid |
| 122 | * creating an actual hashtable until we need two pagetable entries. When |
| 123 | * just one pagetable entry is needed, we store it in a fixed field of |
| 124 | * TIDBitMap. (NOTE: we don't get rid of the hashtable if the bitmap later |
| 125 | * shrinks down to zero or one page again. So, status can be TBM_HASH even |
| 126 | * when nentries is zero or one.) |
| 127 | */ |
| 128 | typedef enum |
| 129 | { |
| 130 | TBM_EMPTY, /* no hashtable, nentries == 0 */ |
| 131 | TBM_ONE_PAGE, /* entry1 contains the single entry */ |
| 132 | TBM_HASH /* pagetable is valid, entry1 is not */ |
| 133 | } TBMStatus; |
| 134 | |
| 135 | /* |
| 136 | * Current iterating state of the TBM. |
| 137 | */ |
| 138 | typedef enum |
| 139 | { |
| 140 | TBM_NOT_ITERATING, /* not yet converted to page and chunk array */ |
| 141 | TBM_ITERATING_PRIVATE, /* converted to local page and chunk array */ |
| 142 | TBM_ITERATING_SHARED /* converted to shared page and chunk array */ |
| 143 | } TBMIteratingState; |
| 144 | |
| 145 | /* |
| 146 | * Here is the representation for a whole TIDBitMap: |
| 147 | */ |
| 148 | struct TIDBitmap |
| 149 | { |
| 150 | NodeTag type; /* to make it a valid Node */ |
| 151 | MemoryContext mcxt; /* memory context containing me */ |
| 152 | TBMStatus status; /* see codes above */ |
| 153 | struct pagetable_hash *pagetable; /* hash table of PagetableEntry's */ |
| 154 | int nentries; /* number of entries in pagetable */ |
| 155 | int maxentries; /* limit on same to meet maxbytes */ |
| 156 | int npages; /* number of exact entries in pagetable */ |
| 157 | int nchunks; /* number of lossy entries in pagetable */ |
| 158 | TBMIteratingState iterating; /* tbm_begin_iterate called? */ |
| 159 | uint32 lossify_start; /* offset to start lossifying hashtable at */ |
| 160 | PagetableEntry entry1; /* used when status == TBM_ONE_PAGE */ |
| 161 | /* these are valid when iterating is true: */ |
| 162 | PagetableEntry **spages; /* sorted exact-page list, or NULL */ |
| 163 | PagetableEntry **schunks; /* sorted lossy-chunk list, or NULL */ |
| 164 | dsa_pointer dsapagetable; /* dsa_pointer to the element array */ |
| 165 | dsa_pointer dsapagetableold; /* dsa_pointer to the old element array */ |
| 166 | dsa_pointer ptpages; /* dsa_pointer to the page array */ |
| 167 | dsa_pointer ptchunks; /* dsa_pointer to the chunk array */ |
| 168 | dsa_area *dsa; /* reference to per-query dsa area */ |
| 169 | }; |
| 170 | |
| 171 | /* |
| 172 | * When iterating over a bitmap in sorted order, a TBMIterator is used to |
| 173 | * track our progress. There can be several iterators scanning the same |
| 174 | * bitmap concurrently. Note that the bitmap becomes read-only as soon as |
| 175 | * any iterator is created. |
| 176 | */ |
| 177 | struct TBMIterator |
| 178 | { |
| 179 | TIDBitmap *tbm; /* TIDBitmap we're iterating over */ |
| 180 | int spageptr; /* next spages index */ |
| 181 | int schunkptr; /* next schunks index */ |
| 182 | int schunkbit; /* next bit to check in current schunk */ |
| 183 | TBMIterateResult output; /* MUST BE LAST (because variable-size) */ |
| 184 | }; |
| 185 | |
| 186 | /* |
| 187 | * Holds the shared members of the iterator so that multiple processes |
| 188 | * can jointly iterate. |
| 189 | */ |
| 190 | typedef struct TBMSharedIteratorState |
| 191 | { |
| 192 | int nentries; /* number of entries in pagetable */ |
| 193 | int maxentries; /* limit on same to meet maxbytes */ |
| 194 | int npages; /* number of exact entries in pagetable */ |
| 195 | int nchunks; /* number of lossy entries in pagetable */ |
	dsa_pointer pagetable;		/* dsa pointer to head of pagetable data */
| 197 | dsa_pointer spages; /* dsa pointer to page array */ |
| 198 | dsa_pointer schunks; /* dsa pointer to chunk array */ |
| 199 | LWLock lock; /* lock to protect below members */ |
| 200 | int spageptr; /* next spages index */ |
| 201 | int schunkptr; /* next schunks index */ |
| 202 | int schunkbit; /* next bit to check in current schunk */ |
| 203 | } TBMSharedIteratorState; |
| 204 | |
/*
 * Iteration arrays holding indexes into the pagetable element array.  We
 * store indexes rather than pointers because each backend may map the DSA
 * area at a different address.
 */
| 208 | typedef struct PTIterationArray |
| 209 | { |
	pg_atomic_uint32 refcount;	/* number of iterators attached */
| 211 | int index[FLEXIBLE_ARRAY_MEMBER]; /* index array */ |
| 212 | } PTIterationArray; |
| 213 | |
| 214 | /* |
 * Same as TBMIterator, but it is used for joint iteration and therefore
 * also holds a reference to the shared state.
| 217 | */ |
| 218 | struct TBMSharedIterator |
| 219 | { |
| 220 | TBMSharedIteratorState *state; /* shared state */ |
| 221 | PTEntryArray *ptbase; /* pagetable element array */ |
| 222 | PTIterationArray *ptpages; /* sorted exact page index list */ |
| 223 | PTIterationArray *ptchunks; /* sorted lossy page index list */ |
| 224 | TBMIterateResult output; /* MUST BE LAST (because variable-size) */ |
| 225 | }; |
| 226 | |
| 227 | /* Local function prototypes */ |
| 228 | static void tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage); |
| 229 | static bool tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage, |
| 230 | const TIDBitmap *b); |
| 231 | static const PagetableEntry *tbm_find_pageentry(const TIDBitmap *tbm, |
| 232 | BlockNumber pageno); |
| 233 | static PagetableEntry *tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno); |
| 234 | static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno); |
| 235 | static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno); |
| 236 | static void tbm_lossify(TIDBitmap *tbm); |
| 237 | static int tbm_comparator(const void *left, const void *right); |
| 238 | static int tbm_shared_comparator(const void *left, const void *right, |
| 239 | void *arg); |
| 240 | |
| 241 | /* define hashtable mapping block numbers to PagetableEntry's */ |
| 242 | #define SH_USE_NONDEFAULT_ALLOCATOR |
| 243 | #define SH_PREFIX pagetable |
| 244 | #define SH_ELEMENT_TYPE PagetableEntry |
| 245 | #define SH_KEY_TYPE BlockNumber |
| 246 | #define SH_KEY blockno |
| 247 | #define SH_HASH_KEY(tb, key) murmurhash32(key) |
#define SH_EQUAL(tb, a, b) ((a) == (b))
| 249 | #define SH_SCOPE static inline |
| 250 | #define SH_DEFINE |
| 251 | #define SH_DECLARE |
| 252 | #include "lib/simplehash.h" |
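
/*
 * The simplehash.h template instantiated above supplies the
 * pagetable_create, pagetable_destroy, pagetable_insert, pagetable_delete,
 * pagetable_lookup, pagetable_start_iterate(_at) and pagetable_iterate
 * functions used below.  Because of SH_USE_NONDEFAULT_ALLOCATOR, element
 * allocation is routed through pagetable_allocate and pagetable_free,
 * defined near the end of this file.
 */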
| 253 | |
| 254 | |
| 255 | /* |
| 256 | * tbm_create - create an initially-empty bitmap |
| 257 | * |
| 258 | * The bitmap will live in the memory context that is CurrentMemoryContext |
| 259 | * at the time of this call. It will be limited to (approximately) maxbytes |
| 260 | * total memory consumption. If the DSA passed to this function is not NULL |
| 261 | * then the memory for storing elements of the underlying page table will |
| 262 | * be allocated from the DSA. |
| 263 | */ |
| 264 | TIDBitmap * |
| 265 | tbm_create(long maxbytes, dsa_area *dsa) |
| 266 | { |
| 267 | TIDBitmap *tbm; |
| 268 | |
| 269 | /* Create the TIDBitmap struct and zero all its fields */ |
| 270 | tbm = makeNode(TIDBitmap); |
| 271 | |
| 272 | tbm->mcxt = CurrentMemoryContext; |
| 273 | tbm->status = TBM_EMPTY; |
| 274 | |
| 275 | tbm->maxentries = (int) tbm_calculate_entries(maxbytes); |
| 276 | tbm->lossify_start = 0; |
| 277 | tbm->dsa = dsa; |
| 278 | tbm->dsapagetable = InvalidDsaPointer; |
| 279 | tbm->dsapagetableold = InvalidDsaPointer; |
| 280 | tbm->ptpages = InvalidDsaPointer; |
| 281 | tbm->ptchunks = InvalidDsaPointer; |
| 282 | |
| 283 | return tbm; |
| 284 | } |
| 285 | |
| 286 | /* |
| 287 | * Actually create the hashtable. Since this is a moderately expensive |
| 288 | * proposition, we don't do it until we have to. |
| 289 | */ |
| 290 | static void |
| 291 | tbm_create_pagetable(TIDBitmap *tbm) |
| 292 | { |
| 293 | Assert(tbm->status != TBM_HASH); |
| 294 | Assert(tbm->pagetable == NULL); |
| 295 | |
| 296 | tbm->pagetable = pagetable_create(tbm->mcxt, 128, tbm); |
| 297 | |
| 298 | /* If entry1 is valid, push it into the hashtable */ |
| 299 | if (tbm->status == TBM_ONE_PAGE) |
| 300 | { |
| 301 | PagetableEntry *page; |
| 302 | bool found; |
| 303 | char oldstatus; |
| 304 | |
| 305 | page = pagetable_insert(tbm->pagetable, |
| 306 | tbm->entry1.blockno, |
| 307 | &found); |
| 308 | Assert(!found); |
| 309 | oldstatus = page->status; |
| 310 | memcpy(page, &tbm->entry1, sizeof(PagetableEntry)); |
| 311 | page->status = oldstatus; |
| 312 | } |
| 313 | |
| 314 | tbm->status = TBM_HASH; |
| 315 | } |
| 316 | |
| 317 | /* |
| 318 | * tbm_free - free a TIDBitmap |
| 319 | */ |
| 320 | void |
| 321 | tbm_free(TIDBitmap *tbm) |
| 322 | { |
| 323 | if (tbm->pagetable) |
| 324 | pagetable_destroy(tbm->pagetable); |
| 325 | if (tbm->spages) |
| 326 | pfree(tbm->spages); |
| 327 | if (tbm->schunks) |
| 328 | pfree(tbm->schunks); |
| 329 | pfree(tbm); |
| 330 | } |
| 331 | |
| 332 | /* |
| 333 | * tbm_free_shared_area - free shared state |
| 334 | * |
 * Free the shared iterator state.  Also free the shared pagetable and
 * iterator-array memory if they are no longer referred to by any shared
 * iterator, i.e. once their refcount drops to 0.
| 338 | */ |
| 339 | void |
| 340 | tbm_free_shared_area(dsa_area *dsa, dsa_pointer dp) |
| 341 | { |
| 342 | TBMSharedIteratorState *istate = dsa_get_address(dsa, dp); |
| 343 | PTEntryArray *ptbase; |
| 344 | PTIterationArray *ptpages; |
| 345 | PTIterationArray *ptchunks; |
| 346 | |
| 347 | if (DsaPointerIsValid(istate->pagetable)) |
| 348 | { |
| 349 | ptbase = dsa_get_address(dsa, istate->pagetable); |
| 350 | if (pg_atomic_sub_fetch_u32(&ptbase->refcount, 1) == 0) |
| 351 | dsa_free(dsa, istate->pagetable); |
| 352 | } |
| 353 | if (DsaPointerIsValid(istate->spages)) |
| 354 | { |
| 355 | ptpages = dsa_get_address(dsa, istate->spages); |
| 356 | if (pg_atomic_sub_fetch_u32(&ptpages->refcount, 1) == 0) |
| 357 | dsa_free(dsa, istate->spages); |
| 358 | } |
| 359 | if (DsaPointerIsValid(istate->schunks)) |
| 360 | { |
| 361 | ptchunks = dsa_get_address(dsa, istate->schunks); |
| 362 | if (pg_atomic_sub_fetch_u32(&ptchunks->refcount, 1) == 0) |
| 363 | dsa_free(dsa, istate->schunks); |
| 364 | } |
| 365 | |
| 366 | dsa_free(dsa, dp); |
| 367 | } |
| 368 | |
| 369 | /* |
| 370 | * tbm_add_tuples - add some tuple IDs to a TIDBitmap |
| 371 | * |
| 372 | * If recheck is true, then the recheck flag will be set in the |
| 373 | * TBMIterateResult when any of these tuples are reported out. |
| 374 | */ |
| 375 | void |
| 376 | tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids, |
| 377 | bool recheck) |
| 378 | { |
| 379 | BlockNumber currblk = InvalidBlockNumber; |
| 380 | PagetableEntry *page = NULL; /* only valid when currblk is valid */ |
| 381 | int i; |
| 382 | |
| 383 | Assert(tbm->iterating == TBM_NOT_ITERATING); |
| 384 | for (i = 0; i < ntids; i++) |
| 385 | { |
| 386 | BlockNumber blk = ItemPointerGetBlockNumber(tids + i); |
| 387 | OffsetNumber off = ItemPointerGetOffsetNumber(tids + i); |
| 388 | int wordnum, |
| 389 | bitnum; |
| 390 | |
| 391 | /* safety check to ensure we don't overrun bit array bounds */ |
| 392 | if (off < 1 || off > MAX_TUPLES_PER_PAGE) |
			elog(ERROR, "tuple offset out of range: %u", off);
| 394 | |
| 395 | /* |
| 396 | * Look up target page unless we already did. This saves cycles when |
| 397 | * the input includes consecutive tuples on the same page, which is |
| 398 | * common enough to justify an extra test here. |
| 399 | */ |
| 400 | if (blk != currblk) |
| 401 | { |
| 402 | if (tbm_page_is_lossy(tbm, blk)) |
| 403 | page = NULL; /* remember page is lossy */ |
| 404 | else |
| 405 | page = tbm_get_pageentry(tbm, blk); |
| 406 | currblk = blk; |
| 407 | } |
| 408 | |
| 409 | if (page == NULL) |
| 410 | continue; /* whole page is already marked */ |
| 411 | |
| 412 | if (page->ischunk) |
| 413 | { |
| 414 | /* The page is a lossy chunk header, set bit for itself */ |
| 415 | wordnum = bitnum = 0; |
| 416 | } |
| 417 | else |
| 418 | { |
| 419 | /* Page is exact, so set bit for individual tuple */ |
| 420 | wordnum = WORDNUM(off - 1); |
| 421 | bitnum = BITNUM(off - 1); |
| 422 | } |
| 423 | page->words[wordnum] |= ((bitmapword) 1 << bitnum); |
| 424 | page->recheck |= recheck; |
| 425 | |
| 426 | if (tbm->nentries > tbm->maxentries) |
| 427 | { |
| 428 | tbm_lossify(tbm); |
| 429 | /* Page could have been converted to lossy, so force new lookup */ |
| 430 | currblk = InvalidBlockNumber; |
| 431 | } |
| 432 | } |
| 433 | } |
| 434 | |
| 435 | /* |
| 436 | * tbm_add_page - add a whole page to a TIDBitmap |
| 437 | * |
| 438 | * This causes the whole page to be reported (with the recheck flag) |
| 439 | * when the TIDBitmap is scanned. |
| 440 | */ |
| 441 | void |
| 442 | tbm_add_page(TIDBitmap *tbm, BlockNumber pageno) |
| 443 | { |
| 444 | /* Enter the page in the bitmap, or mark it lossy if already present */ |
| 445 | tbm_mark_page_lossy(tbm, pageno); |
| 446 | /* If we went over the memory limit, lossify some more pages */ |
| 447 | if (tbm->nentries > tbm->maxentries) |
| 448 | tbm_lossify(tbm); |
| 449 | } |
| 450 | |
| 451 | /* |
| 452 | * tbm_union - set union |
| 453 | * |
| 454 | * a is modified in-place, b is not changed |
| 455 | */ |
| 456 | void |
| 457 | tbm_union(TIDBitmap *a, const TIDBitmap *b) |
| 458 | { |
	Assert(a->iterating == TBM_NOT_ITERATING);
| 460 | /* Nothing to do if b is empty */ |
| 461 | if (b->nentries == 0) |
| 462 | return; |
| 463 | /* Scan through chunks and pages in b, merge into a */ |
| 464 | if (b->status == TBM_ONE_PAGE) |
| 465 | tbm_union_page(a, &b->entry1); |
| 466 | else |
| 467 | { |
| 468 | pagetable_iterator i; |
| 469 | PagetableEntry *bpage; |
| 470 | |
| 471 | Assert(b->status == TBM_HASH); |
| 472 | pagetable_start_iterate(b->pagetable, &i); |
| 473 | while ((bpage = pagetable_iterate(b->pagetable, &i)) != NULL) |
| 474 | tbm_union_page(a, bpage); |
| 475 | } |
| 476 | } |
| 477 | |
| 478 | /* Process one page of b during a union op */ |
| 479 | static void |
| 480 | tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage) |
| 481 | { |
| 482 | PagetableEntry *apage; |
| 483 | int wordnum; |
| 484 | |
| 485 | if (bpage->ischunk) |
| 486 | { |
| 487 | /* Scan b's chunk, mark each indicated page lossy in a */ |
| 488 | for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++) |
| 489 | { |
| 490 | bitmapword w = bpage->words[wordnum]; |
| 491 | |
| 492 | if (w != 0) |
| 493 | { |
| 494 | BlockNumber pg; |
| 495 | |
| 496 | pg = bpage->blockno + (wordnum * BITS_PER_BITMAPWORD); |
| 497 | while (w != 0) |
| 498 | { |
| 499 | if (w & 1) |
| 500 | tbm_mark_page_lossy(a, pg); |
| 501 | pg++; |
| 502 | w >>= 1; |
| 503 | } |
| 504 | } |
| 505 | } |
| 506 | } |
| 507 | else if (tbm_page_is_lossy(a, bpage->blockno)) |
| 508 | { |
| 509 | /* page is already lossy in a, nothing to do */ |
| 510 | return; |
| 511 | } |
| 512 | else |
| 513 | { |
| 514 | apage = tbm_get_pageentry(a, bpage->blockno); |
| 515 | if (apage->ischunk) |
| 516 | { |
| 517 | /* The page is a lossy chunk header, set bit for itself */ |
| 518 | apage->words[0] |= ((bitmapword) 1 << 0); |
| 519 | } |
| 520 | else |
| 521 | { |
| 522 | /* Both pages are exact, merge at the bit level */ |
| 523 | for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++) |
| 524 | apage->words[wordnum] |= bpage->words[wordnum]; |
| 525 | apage->recheck |= bpage->recheck; |
| 526 | } |
| 527 | } |
| 528 | |
| 529 | if (a->nentries > a->maxentries) |
| 530 | tbm_lossify(a); |
| 531 | } |
| 532 | |
| 533 | /* |
| 534 | * tbm_intersect - set intersection |
| 535 | * |
| 536 | * a is modified in-place, b is not changed |
| 537 | */ |
| 538 | void |
| 539 | tbm_intersect(TIDBitmap *a, const TIDBitmap *b) |
| 540 | { |
	Assert(a->iterating == TBM_NOT_ITERATING);
| 542 | /* Nothing to do if a is empty */ |
| 543 | if (a->nentries == 0) |
| 544 | return; |
| 545 | /* Scan through chunks and pages in a, try to match to b */ |
| 546 | if (a->status == TBM_ONE_PAGE) |
| 547 | { |
| 548 | if (tbm_intersect_page(a, &a->entry1, b)) |
| 549 | { |
| 550 | /* Page is now empty, remove it from a */ |
| 551 | Assert(!a->entry1.ischunk); |
| 552 | a->npages--; |
| 553 | a->nentries--; |
| 554 | Assert(a->nentries == 0); |
| 555 | a->status = TBM_EMPTY; |
| 556 | } |
| 557 | } |
| 558 | else |
| 559 | { |
| 560 | pagetable_iterator i; |
| 561 | PagetableEntry *apage; |
| 562 | |
| 563 | Assert(a->status == TBM_HASH); |
| 564 | pagetable_start_iterate(a->pagetable, &i); |
| 565 | while ((apage = pagetable_iterate(a->pagetable, &i)) != NULL) |
| 566 | { |
| 567 | if (tbm_intersect_page(a, apage, b)) |
| 568 | { |
| 569 | /* Page or chunk is now empty, remove it from a */ |
| 570 | if (apage->ischunk) |
| 571 | a->nchunks--; |
| 572 | else |
| 573 | a->npages--; |
| 574 | a->nentries--; |
| 575 | if (!pagetable_delete(a->pagetable, apage->blockno)) |
				elog(ERROR, "hash table corrupted");
| 577 | } |
| 578 | } |
| 579 | } |
| 580 | } |
| 581 | |
| 582 | /* |
| 583 | * Process one page of a during an intersection op |
| 584 | * |
| 585 | * Returns true if apage is now empty and should be deleted from a |
| 586 | */ |
| 587 | static bool |
| 588 | tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage, const TIDBitmap *b) |
| 589 | { |
| 590 | const PagetableEntry *bpage; |
| 591 | int wordnum; |
| 592 | |
| 593 | if (apage->ischunk) |
| 594 | { |
| 595 | /* Scan each bit in chunk, try to clear */ |
| 596 | bool candelete = true; |
| 597 | |
| 598 | for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++) |
| 599 | { |
| 600 | bitmapword w = apage->words[wordnum]; |
| 601 | |
| 602 | if (w != 0) |
| 603 | { |
| 604 | bitmapword neww = w; |
| 605 | BlockNumber pg; |
| 606 | int bitnum; |
| 607 | |
| 608 | pg = apage->blockno + (wordnum * BITS_PER_BITMAPWORD); |
| 609 | bitnum = 0; |
| 610 | while (w != 0) |
| 611 | { |
| 612 | if (w & 1) |
| 613 | { |
| 614 | if (!tbm_page_is_lossy(b, pg) && |
| 615 | tbm_find_pageentry(b, pg) == NULL) |
| 616 | { |
| 617 | /* Page is not in b at all, lose lossy bit */ |
| 618 | neww &= ~((bitmapword) 1 << bitnum); |
| 619 | } |
| 620 | } |
| 621 | pg++; |
| 622 | bitnum++; |
| 623 | w >>= 1; |
| 624 | } |
| 625 | apage->words[wordnum] = neww; |
| 626 | if (neww != 0) |
| 627 | candelete = false; |
| 628 | } |
| 629 | } |
| 630 | return candelete; |
| 631 | } |
| 632 | else if (tbm_page_is_lossy(b, apage->blockno)) |
| 633 | { |
| 634 | /* |
| 635 | * Some of the tuples in 'a' might not satisfy the quals for 'b', but |
| 636 | * because the page 'b' is lossy, we don't know which ones. Therefore |
| 637 | * we mark 'a' as requiring rechecks, to indicate that at most those |
| 638 | * tuples set in 'a' are matches. |
| 639 | */ |
| 640 | apage->recheck = true; |
| 641 | return false; |
| 642 | } |
| 643 | else |
| 644 | { |
| 645 | bool candelete = true; |
| 646 | |
| 647 | bpage = tbm_find_pageentry(b, apage->blockno); |
| 648 | if (bpage != NULL) |
| 649 | { |
| 650 | /* Both pages are exact, merge at the bit level */ |
| 651 | Assert(!bpage->ischunk); |
| 652 | for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++) |
| 653 | { |
| 654 | apage->words[wordnum] &= bpage->words[wordnum]; |
| 655 | if (apage->words[wordnum] != 0) |
| 656 | candelete = false; |
| 657 | } |
| 658 | apage->recheck |= bpage->recheck; |
| 659 | } |
| 660 | /* If there is no matching b page, we can just delete the a page */ |
| 661 | return candelete; |
| 662 | } |
| 663 | } |
| 664 | |
| 665 | /* |
| 666 | * tbm_is_empty - is a TIDBitmap completely empty? |
| 667 | */ |
| 668 | bool |
| 669 | tbm_is_empty(const TIDBitmap *tbm) |
| 670 | { |
| 671 | return (tbm->nentries == 0); |
| 672 | } |
| 673 | |
| 674 | /* |
| 675 | * tbm_begin_iterate - prepare to iterate through a TIDBitmap |
| 676 | * |
| 677 | * The TBMIterator struct is created in the caller's memory context. |
| 678 | * For a clean shutdown of the iteration, call tbm_end_iterate; but it's |
 * okay to just allow the memory context to be released, too.  It is the
 * caller's responsibility not to touch the TBMIterator anymore once the
 * TIDBitmap is freed.
| 682 | * |
| 683 | * NB: after this is called, it is no longer allowed to modify the contents |
| 684 | * of the bitmap. However, you can call this multiple times to scan the |
| 685 | * contents repeatedly, including parallel scans. |
| 686 | */ |
| 687 | TBMIterator * |
| 688 | tbm_begin_iterate(TIDBitmap *tbm) |
| 689 | { |
| 690 | TBMIterator *iterator; |
| 691 | |
| 692 | Assert(tbm->iterating != TBM_ITERATING_SHARED); |
| 693 | |
| 694 | /* |
| 695 | * Create the TBMIterator struct, with enough trailing space to serve the |
| 696 | * needs of the TBMIterateResult sub-struct. |
| 697 | */ |
| 698 | iterator = (TBMIterator *) palloc(sizeof(TBMIterator) + |
| 699 | MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber)); |
| 700 | iterator->tbm = tbm; |
| 701 | |
| 702 | /* |
| 703 | * Initialize iteration pointers. |
| 704 | */ |
| 705 | iterator->spageptr = 0; |
| 706 | iterator->schunkptr = 0; |
| 707 | iterator->schunkbit = 0; |
| 708 | |
| 709 | /* |
| 710 | * If we have a hashtable, create and fill the sorted page lists, unless |
| 711 | * we already did that for a previous iterator. Note that the lists are |
| 712 | * attached to the bitmap not the iterator, so they can be used by more |
| 713 | * than one iterator. |
| 714 | */ |
| 715 | if (tbm->status == TBM_HASH && tbm->iterating == TBM_NOT_ITERATING) |
| 716 | { |
| 717 | pagetable_iterator i; |
| 718 | PagetableEntry *page; |
| 719 | int npages; |
| 720 | int nchunks; |
| 721 | |
| 722 | if (!tbm->spages && tbm->npages > 0) |
| 723 | tbm->spages = (PagetableEntry **) |
| 724 | MemoryContextAlloc(tbm->mcxt, |
| 725 | tbm->npages * sizeof(PagetableEntry *)); |
| 726 | if (!tbm->schunks && tbm->nchunks > 0) |
| 727 | tbm->schunks = (PagetableEntry **) |
| 728 | MemoryContextAlloc(tbm->mcxt, |
| 729 | tbm->nchunks * sizeof(PagetableEntry *)); |
| 730 | |
| 731 | npages = nchunks = 0; |
| 732 | pagetable_start_iterate(tbm->pagetable, &i); |
| 733 | while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL) |
| 734 | { |
| 735 | if (page->ischunk) |
| 736 | tbm->schunks[nchunks++] = page; |
| 737 | else |
| 738 | tbm->spages[npages++] = page; |
| 739 | } |
| 740 | Assert(npages == tbm->npages); |
| 741 | Assert(nchunks == tbm->nchunks); |
| 742 | if (npages > 1) |
| 743 | qsort(tbm->spages, npages, sizeof(PagetableEntry *), |
| 744 | tbm_comparator); |
| 745 | if (nchunks > 1) |
| 746 | qsort(tbm->schunks, nchunks, sizeof(PagetableEntry *), |
| 747 | tbm_comparator); |
| 748 | } |
| 749 | |
| 750 | tbm->iterating = TBM_ITERATING_PRIVATE; |
| 751 | |
| 752 | return iterator; |
| 753 | } |
| 754 | |
| 755 | /* |
| 756 | * tbm_prepare_shared_iterate - prepare shared iteration state for a TIDBitmap. |
| 757 | * |
| 758 | * The necessary shared state will be allocated from the DSA passed to |
| 759 | * tbm_create, so that multiple processes can attach to it and iterate jointly. |
| 760 | * |
 * This will convert the pagetable hash into arrays of indexes into the
 * pagetable element array: one for exact pages and one for lossy chunks.
| 763 | */ |
| 764 | dsa_pointer |
| 765 | tbm_prepare_shared_iterate(TIDBitmap *tbm) |
| 766 | { |
| 767 | dsa_pointer dp; |
| 768 | TBMSharedIteratorState *istate; |
| 769 | PTEntryArray *ptbase = NULL; |
| 770 | PTIterationArray *ptpages = NULL; |
| 771 | PTIterationArray *ptchunks = NULL; |
| 772 | |
| 773 | Assert(tbm->dsa != NULL); |
| 774 | Assert(tbm->iterating != TBM_ITERATING_PRIVATE); |
| 775 | |
| 776 | /* |
	 * Allocate TBMSharedIteratorState from the DSA to hold the shared
	 * members and the lock; it will be used by multiple workers during
	 * shared iteration.
| 779 | */ |
| 780 | dp = dsa_allocate0(tbm->dsa, sizeof(TBMSharedIteratorState)); |
| 781 | istate = dsa_get_address(tbm->dsa, dp); |
| 782 | |
| 783 | /* |
| 784 | * If we're not already iterating, create and fill the sorted page lists. |
| 785 | * (If we are, the sorted page lists are already stored in the TIDBitmap, |
| 786 | * and we can just reuse them.) |
| 787 | */ |
| 788 | if (tbm->iterating == TBM_NOT_ITERATING) |
| 789 | { |
| 790 | pagetable_iterator i; |
| 791 | PagetableEntry *page; |
| 792 | int idx; |
| 793 | int npages; |
| 794 | int nchunks; |
| 795 | |
| 796 | /* |
| 797 | * Allocate the page and chunk array memory from the DSA to share |
| 798 | * across multiple processes. |
| 799 | */ |
| 800 | if (tbm->npages) |
| 801 | { |
| 802 | tbm->ptpages = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) + |
| 803 | tbm->npages * sizeof(int)); |
| 804 | ptpages = dsa_get_address(tbm->dsa, tbm->ptpages); |
| 805 | pg_atomic_init_u32(&ptpages->refcount, 0); |
| 806 | } |
| 807 | if (tbm->nchunks) |
| 808 | { |
| 809 | tbm->ptchunks = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) + |
| 810 | tbm->nchunks * sizeof(int)); |
| 811 | ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks); |
| 812 | pg_atomic_init_u32(&ptchunks->refcount, 0); |
| 813 | } |
| 814 | |
| 815 | /* |
| 816 | * If TBM status is TBM_HASH then iterate over the pagetable and |
| 817 | * convert it to page and chunk arrays. But if it's in the |
| 818 | * TBM_ONE_PAGE mode then directly allocate the space for one entry |
| 819 | * from the DSA. |
| 820 | */ |
| 821 | npages = nchunks = 0; |
| 822 | if (tbm->status == TBM_HASH) |
| 823 | { |
| 824 | ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable); |
| 825 | |
| 826 | pagetable_start_iterate(tbm->pagetable, &i); |
| 827 | while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL) |
| 828 | { |
| 829 | idx = page - ptbase->ptentry; |
| 830 | if (page->ischunk) |
| 831 | ptchunks->index[nchunks++] = idx; |
| 832 | else |
| 833 | ptpages->index[npages++] = idx; |
| 834 | } |
| 835 | |
| 836 | Assert(npages == tbm->npages); |
| 837 | Assert(nchunks == tbm->nchunks); |
| 838 | } |
| 839 | else if (tbm->status == TBM_ONE_PAGE) |
| 840 | { |
| 841 | /* |
| 842 | * In one page mode allocate the space for one pagetable entry, |
| 843 | * initialize it, and directly store its index (i.e. 0) in the |
| 844 | * page array. |
| 845 | */ |
| 846 | tbm->dsapagetable = dsa_allocate(tbm->dsa, sizeof(PTEntryArray) + |
| 847 | sizeof(PagetableEntry)); |
| 848 | ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable); |
| 849 | memcpy(ptbase->ptentry, &tbm->entry1, sizeof(PagetableEntry)); |
| 850 | ptpages->index[0] = 0; |
| 851 | } |
| 852 | |
| 853 | if (ptbase != NULL) |
| 854 | pg_atomic_init_u32(&ptbase->refcount, 0); |
| 855 | if (npages > 1) |
| 856 | qsort_arg((void *) (ptpages->index), npages, sizeof(int), |
| 857 | tbm_shared_comparator, (void *) ptbase->ptentry); |
| 858 | if (nchunks > 1) |
| 859 | qsort_arg((void *) (ptchunks->index), nchunks, sizeof(int), |
| 860 | tbm_shared_comparator, (void *) ptbase->ptentry); |
| 861 | } |
| 862 | |
| 863 | /* |
| 864 | * Store the TBM members in the shared state so that we can share them |
| 865 | * across multiple processes. |
| 866 | */ |
| 867 | istate->nentries = tbm->nentries; |
| 868 | istate->maxentries = tbm->maxentries; |
| 869 | istate->npages = tbm->npages; |
| 870 | istate->nchunks = tbm->nchunks; |
| 871 | istate->pagetable = tbm->dsapagetable; |
| 872 | istate->spages = tbm->ptpages; |
| 873 | istate->schunks = tbm->ptchunks; |
| 874 | |
| 875 | ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable); |
| 876 | ptpages = dsa_get_address(tbm->dsa, tbm->ptpages); |
| 877 | ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks); |
| 878 | |
| 879 | /* |
	 * For every shared iterator that will refer to the pagetable and
	 * iterator arrays, increase the refcount by 1, so that when freeing a
	 * shared iterator we don't free the pagetable and iterator arrays until
	 * their refcount drops to 0.
| 883 | */ |
| 884 | if (ptbase != NULL) |
| 885 | pg_atomic_add_fetch_u32(&ptbase->refcount, 1); |
| 886 | if (ptpages != NULL) |
| 887 | pg_atomic_add_fetch_u32(&ptpages->refcount, 1); |
| 888 | if (ptchunks != NULL) |
| 889 | pg_atomic_add_fetch_u32(&ptchunks->refcount, 1); |
| 890 | |
| 891 | /* Initialize the iterator lock */ |
| 892 | LWLockInitialize(&istate->lock, LWTRANCHE_TBM); |
| 893 | |
| 894 | /* Initialize the shared iterator state */ |
| 895 | istate->schunkbit = 0; |
| 896 | istate->schunkptr = 0; |
| 897 | istate->spageptr = 0; |
| 898 | |
| 899 | tbm->iterating = TBM_ITERATING_SHARED; |
| 900 | |
| 901 | return dp; |
| 902 | } |
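
/*
 * A minimal sketch of joint iteration (parallel-context setup and error
 * handling omitted): the leader obtains
 *
 *		dsa_pointer dp = tbm_prepare_shared_iterate(tbm);
 *
 * and passes dp to the workers, e.g. through shared memory; then each
 * participant, leader included, does
 *
 *		TBMSharedIterator *it = tbm_attach_shared_iterate(dsa, dp);
 *		TBMIterateResult *res;
 *
 *		while ((res = tbm_shared_iterate(it)) != NULL)
 *			.. process res exactly as for tbm_iterate ..
 *		tbm_end_shared_iterate(it);
 *
 * where dsa is the same per-query DSA area that was passed to tbm_create.
 */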
| 903 | |
| 904 | /* |
| 905 | * tbm_extract_page_tuple - extract the tuple offsets from a page |
| 906 | * |
| 907 | * The extracted offsets are stored into TBMIterateResult. |
| 908 | */ |
| 909 | static inline int |
tbm_extract_page_tuple(PagetableEntry *page, TBMIterateResult *output)
| 911 | { |
| 912 | int wordnum; |
| 913 | int ntuples = 0; |
| 914 | |
| 915 | for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++) |
| 916 | { |
| 917 | bitmapword w = page->words[wordnum]; |
| 918 | |
| 919 | if (w != 0) |
| 920 | { |
| 921 | int off = wordnum * BITS_PER_BITMAPWORD + 1; |
| 922 | |
| 923 | while (w != 0) |
| 924 | { |
| 925 | if (w & 1) |
| 926 | output->offsets[ntuples++] = (OffsetNumber) off; |
| 927 | off++; |
| 928 | w >>= 1; |
| 929 | } |
| 930 | } |
| 931 | } |
| 932 | |
| 933 | return ntuples; |
| 934 | } |
| 935 | |
| 936 | /* |
 * tbm_advance_schunkbit - advance *schunkbitp to the next set bit at or
 * beyond its current value, or to PAGES_PER_CHUNK if no set bit remains
| 938 | */ |
| 939 | static inline void |
| 940 | tbm_advance_schunkbit(PagetableEntry *chunk, int *schunkbitp) |
| 941 | { |
| 942 | int schunkbit = *schunkbitp; |
| 943 | |
| 944 | while (schunkbit < PAGES_PER_CHUNK) |
| 945 | { |
| 946 | int wordnum = WORDNUM(schunkbit); |
| 947 | int bitnum = BITNUM(schunkbit); |
| 948 | |
| 949 | if ((chunk->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0) |
| 950 | break; |
| 951 | schunkbit++; |
| 952 | } |
| 953 | |
| 954 | *schunkbitp = schunkbit; |
| 955 | } |
| 956 | |
| 957 | /* |
| 958 | * tbm_iterate - scan through next page of a TIDBitmap |
| 959 | * |
| 960 | * Returns a TBMIterateResult representing one page, or NULL if there are |
| 961 | * no more pages to scan. Pages are guaranteed to be delivered in numerical |
| 962 | * order. If result->ntuples < 0, then the bitmap is "lossy" and failed to |
| 963 | * remember the exact tuples to look at on this page --- the caller must |
| 964 | * examine all tuples on the page and check if they meet the intended |
| 965 | * condition. If result->recheck is true, only the indicated tuples need |
| 966 | * be examined, but the condition must be rechecked anyway. (For ease of |
| 967 | * testing, recheck is always set true when ntuples < 0.) |
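 *
 * For example (illustrative only): with exact entries for blocks 3 and 900
 * plus one lossy chunk at block 512 having bits 0 and 5 set, successive
 * calls report block 3 (exact), 512 and 517 (lossy), then 900 (exact), and
 * finally NULL.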
| 968 | */ |
| 969 | TBMIterateResult * |
| 970 | tbm_iterate(TBMIterator *iterator) |
| 971 | { |
| 972 | TIDBitmap *tbm = iterator->tbm; |
| 973 | TBMIterateResult *output = &(iterator->output); |
| 974 | |
| 975 | Assert(tbm->iterating == TBM_ITERATING_PRIVATE); |
| 976 | |
| 977 | /* |
| 978 | * If lossy chunk pages remain, make sure we've advanced schunkptr/ |
| 979 | * schunkbit to the next set bit. |
| 980 | */ |
| 981 | while (iterator->schunkptr < tbm->nchunks) |
| 982 | { |
| 983 | PagetableEntry *chunk = tbm->schunks[iterator->schunkptr]; |
| 984 | int schunkbit = iterator->schunkbit; |
| 985 | |
| 986 | tbm_advance_schunkbit(chunk, &schunkbit); |
| 987 | if (schunkbit < PAGES_PER_CHUNK) |
| 988 | { |
| 989 | iterator->schunkbit = schunkbit; |
| 990 | break; |
| 991 | } |
| 992 | /* advance to next chunk */ |
| 993 | iterator->schunkptr++; |
| 994 | iterator->schunkbit = 0; |
| 995 | } |
| 996 | |
| 997 | /* |
| 998 | * If both chunk and per-page data remain, must output the numerically |
| 999 | * earlier page. |
| 1000 | */ |
| 1001 | if (iterator->schunkptr < tbm->nchunks) |
| 1002 | { |
| 1003 | PagetableEntry *chunk = tbm->schunks[iterator->schunkptr]; |
| 1004 | BlockNumber chunk_blockno; |
| 1005 | |
| 1006 | chunk_blockno = chunk->blockno + iterator->schunkbit; |
| 1007 | if (iterator->spageptr >= tbm->npages || |
| 1008 | chunk_blockno < tbm->spages[iterator->spageptr]->blockno) |
| 1009 | { |
| 1010 | /* Return a lossy page indicator from the chunk */ |
| 1011 | output->blockno = chunk_blockno; |
| 1012 | output->ntuples = -1; |
| 1013 | output->recheck = true; |
| 1014 | iterator->schunkbit++; |
| 1015 | return output; |
| 1016 | } |
| 1017 | } |
| 1018 | |
| 1019 | if (iterator->spageptr < tbm->npages) |
| 1020 | { |
| 1021 | PagetableEntry *page; |
| 1022 | int ntuples; |
| 1023 | |
| 1024 | /* In ONE_PAGE state, we don't allocate an spages[] array */ |
| 1025 | if (tbm->status == TBM_ONE_PAGE) |
| 1026 | page = &tbm->entry1; |
| 1027 | else |
| 1028 | page = tbm->spages[iterator->spageptr]; |
| 1029 | |
| 1030 | /* scan bitmap to extract individual offset numbers */ |
| 1031 | ntuples = tbm_extract_page_tuple(page, output); |
| 1032 | output->blockno = page->blockno; |
| 1033 | output->ntuples = ntuples; |
| 1034 | output->recheck = page->recheck; |
| 1035 | iterator->spageptr++; |
| 1036 | return output; |
| 1037 | } |
| 1038 | |
| 1039 | /* Nothing more in the bitmap */ |
| 1040 | return NULL; |
| 1041 | } |
| 1042 | |
| 1043 | /* |
| 1044 | * tbm_shared_iterate - scan through next page of a TIDBitmap |
| 1045 | * |
| 1046 | * As above, but this will iterate using an iterator which is shared |
 * across multiple processes.  We need to acquire the iterator LWLock
 * before accessing the shared members.
| 1049 | */ |
| 1050 | TBMIterateResult * |
| 1051 | tbm_shared_iterate(TBMSharedIterator *iterator) |
| 1052 | { |
| 1053 | TBMIterateResult *output = &iterator->output; |
| 1054 | TBMSharedIteratorState *istate = iterator->state; |
| 1055 | PagetableEntry *ptbase = NULL; |
| 1056 | int *idxpages = NULL; |
| 1057 | int *idxchunks = NULL; |
| 1058 | |
| 1059 | if (iterator->ptbase != NULL) |
| 1060 | ptbase = iterator->ptbase->ptentry; |
| 1061 | if (iterator->ptpages != NULL) |
| 1062 | idxpages = iterator->ptpages->index; |
| 1063 | if (iterator->ptchunks != NULL) |
| 1064 | idxchunks = iterator->ptchunks->index; |
| 1065 | |
| 1066 | /* Acquire the LWLock before accessing the shared members */ |
| 1067 | LWLockAcquire(&istate->lock, LW_EXCLUSIVE); |
| 1068 | |
| 1069 | /* |
| 1070 | * If lossy chunk pages remain, make sure we've advanced schunkptr/ |
| 1071 | * schunkbit to the next set bit. |
| 1072 | */ |
| 1073 | while (istate->schunkptr < istate->nchunks) |
| 1074 | { |
| 1075 | PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]]; |
| 1076 | int schunkbit = istate->schunkbit; |
| 1077 | |
| 1078 | tbm_advance_schunkbit(chunk, &schunkbit); |
| 1079 | if (schunkbit < PAGES_PER_CHUNK) |
| 1080 | { |
| 1081 | istate->schunkbit = schunkbit; |
| 1082 | break; |
| 1083 | } |
| 1084 | /* advance to next chunk */ |
| 1085 | istate->schunkptr++; |
| 1086 | istate->schunkbit = 0; |
| 1087 | } |
| 1088 | |
| 1089 | /* |
| 1090 | * If both chunk and per-page data remain, must output the numerically |
| 1091 | * earlier page. |
| 1092 | */ |
| 1093 | if (istate->schunkptr < istate->nchunks) |
| 1094 | { |
| 1095 | PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]]; |
| 1096 | BlockNumber chunk_blockno; |
| 1097 | |
| 1098 | chunk_blockno = chunk->blockno + istate->schunkbit; |
| 1099 | |
| 1100 | if (istate->spageptr >= istate->npages || |
| 1101 | chunk_blockno < ptbase[idxpages[istate->spageptr]].blockno) |
| 1102 | { |
| 1103 | /* Return a lossy page indicator from the chunk */ |
| 1104 | output->blockno = chunk_blockno; |
| 1105 | output->ntuples = -1; |
| 1106 | output->recheck = true; |
| 1107 | istate->schunkbit++; |
| 1108 | |
| 1109 | LWLockRelease(&istate->lock); |
| 1110 | return output; |
| 1111 | } |
| 1112 | } |
| 1113 | |
| 1114 | if (istate->spageptr < istate->npages) |
| 1115 | { |
| 1116 | PagetableEntry *page = &ptbase[idxpages[istate->spageptr]]; |
| 1117 | int ntuples; |
| 1118 | |
| 1119 | /* scan bitmap to extract individual offset numbers */ |
| 1120 | ntuples = tbm_extract_page_tuple(page, output); |
| 1121 | output->blockno = page->blockno; |
| 1122 | output->ntuples = ntuples; |
| 1123 | output->recheck = page->recheck; |
| 1124 | istate->spageptr++; |
| 1125 | |
| 1126 | LWLockRelease(&istate->lock); |
| 1127 | |
| 1128 | return output; |
| 1129 | } |
| 1130 | |
| 1131 | LWLockRelease(&istate->lock); |
| 1132 | |
| 1133 | /* Nothing more in the bitmap */ |
| 1134 | return NULL; |
| 1135 | } |
| 1136 | |
| 1137 | /* |
| 1138 | * tbm_end_iterate - finish an iteration over a TIDBitmap |
| 1139 | * |
| 1140 | * Currently this is just a pfree, but it might do more someday. (For |
| 1141 | * instance, it could be useful to count open iterators and allow the |
| 1142 | * bitmap to return to read/write status when there are no more iterators.) |
| 1143 | */ |
| 1144 | void |
| 1145 | tbm_end_iterate(TBMIterator *iterator) |
| 1146 | { |
| 1147 | pfree(iterator); |
| 1148 | } |
| 1149 | |
| 1150 | /* |
| 1151 | * tbm_end_shared_iterate - finish a shared iteration over a TIDBitmap |
| 1152 | * |
| 1153 | * This doesn't free any of the shared state associated with the iterator, |
| 1154 | * just our backend-private state. |
| 1155 | */ |
| 1156 | void |
| 1157 | tbm_end_shared_iterate(TBMSharedIterator *iterator) |
| 1158 | { |
| 1159 | pfree(iterator); |
| 1160 | } |
| 1161 | |
| 1162 | /* |
| 1163 | * tbm_find_pageentry - find a PagetableEntry for the pageno |
| 1164 | * |
| 1165 | * Returns NULL if there is no non-lossy entry for the pageno. |
| 1166 | */ |
| 1167 | static const PagetableEntry * |
| 1168 | tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno) |
| 1169 | { |
| 1170 | const PagetableEntry *page; |
| 1171 | |
| 1172 | if (tbm->nentries == 0) /* in case pagetable doesn't exist */ |
| 1173 | return NULL; |
| 1174 | |
| 1175 | if (tbm->status == TBM_ONE_PAGE) |
| 1176 | { |
| 1177 | page = &tbm->entry1; |
| 1178 | if (page->blockno != pageno) |
| 1179 | return NULL; |
| 1180 | Assert(!page->ischunk); |
| 1181 | return page; |
| 1182 | } |
| 1183 | |
| 1184 | page = pagetable_lookup(tbm->pagetable, pageno); |
| 1185 | if (page == NULL) |
| 1186 | return NULL; |
| 1187 | if (page->ischunk) |
| 1188 | return NULL; /* don't want a lossy chunk header */ |
| 1189 | return page; |
| 1190 | } |
| 1191 | |
| 1192 | /* |
| 1193 | * tbm_get_pageentry - find or create a PagetableEntry for the pageno |
| 1194 | * |
| 1195 | * If new, the entry is marked as an exact (non-chunk) entry. |
| 1196 | * |
| 1197 | * This may cause the table to exceed the desired memory size. It is |
| 1198 | * up to the caller to call tbm_lossify() at the next safe point if so. |
| 1199 | */ |
| 1200 | static PagetableEntry * |
| 1201 | tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno) |
| 1202 | { |
| 1203 | PagetableEntry *page; |
| 1204 | bool found; |
| 1205 | |
| 1206 | if (tbm->status == TBM_EMPTY) |
| 1207 | { |
| 1208 | /* Use the fixed slot */ |
| 1209 | page = &tbm->entry1; |
| 1210 | found = false; |
| 1211 | tbm->status = TBM_ONE_PAGE; |
| 1212 | } |
| 1213 | else |
| 1214 | { |
| 1215 | if (tbm->status == TBM_ONE_PAGE) |
| 1216 | { |
| 1217 | page = &tbm->entry1; |
| 1218 | if (page->blockno == pageno) |
| 1219 | return page; |
| 1220 | /* Time to switch from one page to a hashtable */ |
| 1221 | tbm_create_pagetable(tbm); |
| 1222 | } |
| 1223 | |
| 1224 | /* Look up or create an entry */ |
| 1225 | page = pagetable_insert(tbm->pagetable, pageno, &found); |
| 1226 | } |
| 1227 | |
| 1228 | /* Initialize it if not present before */ |
| 1229 | if (!found) |
| 1230 | { |
| 1231 | char oldstatus = page->status; |
| 1232 | |
| 1233 | MemSet(page, 0, sizeof(PagetableEntry)); |
| 1234 | page->status = oldstatus; |
| 1235 | page->blockno = pageno; |
| 1236 | /* must count it too */ |
| 1237 | tbm->nentries++; |
| 1238 | tbm->npages++; |
| 1239 | } |
| 1240 | |
| 1241 | return page; |
| 1242 | } |
| 1243 | |
| 1244 | /* |
| 1245 | * tbm_page_is_lossy - is the page marked as lossily stored? |
| 1246 | */ |
| 1247 | static bool |
| 1248 | tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno) |
| 1249 | { |
| 1250 | PagetableEntry *page; |
| 1251 | BlockNumber chunk_pageno; |
| 1252 | int bitno; |
| 1253 | |
| 1254 | /* we can skip the lookup if there are no lossy chunks */ |
| 1255 | if (tbm->nchunks == 0) |
| 1256 | return false; |
| 1257 | Assert(tbm->status == TBM_HASH); |
| 1258 | |
| 1259 | bitno = pageno % PAGES_PER_CHUNK; |
| 1260 | chunk_pageno = pageno - bitno; |
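	/*
	 * For example (illustrative numbers only, with PAGES_PER_CHUNK = 256):
	 * pageno 1000 gives bitno = 1000 % 256 = 232 and chunk_pageno = 768, so
	 * this page is lossy iff the entry for block 768 is a chunk with bit
	 * 232 set.
	 */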
| 1261 | |
| 1262 | page = pagetable_lookup(tbm->pagetable, chunk_pageno); |
| 1263 | |
| 1264 | if (page != NULL && page->ischunk) |
| 1265 | { |
| 1266 | int wordnum = WORDNUM(bitno); |
| 1267 | int bitnum = BITNUM(bitno); |
| 1268 | |
| 1269 | if ((page->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0) |
| 1270 | return true; |
| 1271 | } |
| 1272 | return false; |
| 1273 | } |
| 1274 | |
| 1275 | /* |
| 1276 | * tbm_mark_page_lossy - mark the page number as lossily stored |
| 1277 | * |
| 1278 | * This may cause the table to exceed the desired memory size. It is |
| 1279 | * up to the caller to call tbm_lossify() at the next safe point if so. |
| 1280 | */ |
| 1281 | static void |
| 1282 | tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno) |
| 1283 | { |
| 1284 | PagetableEntry *page; |
| 1285 | bool found; |
| 1286 | BlockNumber chunk_pageno; |
| 1287 | int bitno; |
| 1288 | int wordnum; |
| 1289 | int bitnum; |
| 1290 | |
| 1291 | /* We force the bitmap into hashtable mode whenever it's lossy */ |
| 1292 | if (tbm->status != TBM_HASH) |
| 1293 | tbm_create_pagetable(tbm); |
| 1294 | |
| 1295 | bitno = pageno % PAGES_PER_CHUNK; |
| 1296 | chunk_pageno = pageno - bitno; |
| 1297 | |
| 1298 | /* |
| 1299 | * Remove any extant non-lossy entry for the page. If the page is its own |
| 1300 | * chunk header, however, we skip this and handle the case below. |
| 1301 | */ |
| 1302 | if (bitno != 0) |
| 1303 | { |
| 1304 | if (pagetable_delete(tbm->pagetable, pageno)) |
| 1305 | { |
| 1306 | /* It was present, so adjust counts */ |
| 1307 | tbm->nentries--; |
| 1308 | tbm->npages--; /* assume it must have been non-lossy */ |
| 1309 | } |
| 1310 | } |
| 1311 | |
| 1312 | /* Look up or create entry for chunk-header page */ |
| 1313 | page = pagetable_insert(tbm->pagetable, chunk_pageno, &found); |
| 1314 | |
| 1315 | /* Initialize it if not present before */ |
| 1316 | if (!found) |
| 1317 | { |
| 1318 | char oldstatus = page->status; |
| 1319 | |
| 1320 | MemSet(page, 0, sizeof(PagetableEntry)); |
| 1321 | page->status = oldstatus; |
| 1322 | page->blockno = chunk_pageno; |
| 1323 | page->ischunk = true; |
| 1324 | /* must count it too */ |
| 1325 | tbm->nentries++; |
| 1326 | tbm->nchunks++; |
| 1327 | } |
| 1328 | else if (!page->ischunk) |
| 1329 | { |
| 1330 | char oldstatus = page->status; |
| 1331 | |
| 1332 | /* chunk header page was formerly non-lossy, make it lossy */ |
| 1333 | MemSet(page, 0, sizeof(PagetableEntry)); |
| 1334 | page->status = oldstatus; |
| 1335 | page->blockno = chunk_pageno; |
| 1336 | page->ischunk = true; |
| 1337 | /* we assume it had some tuple bit(s) set, so mark it lossy */ |
| 1338 | page->words[0] = ((bitmapword) 1 << 0); |
| 1339 | /* adjust counts */ |
| 1340 | tbm->nchunks++; |
| 1341 | tbm->npages--; |
| 1342 | } |
| 1343 | |
| 1344 | /* Now set the original target page's bit */ |
| 1345 | wordnum = WORDNUM(bitno); |
| 1346 | bitnum = BITNUM(bitno); |
| 1347 | page->words[wordnum] |= ((bitmapword) 1 << bitnum); |
| 1348 | } |
| 1349 | |
| 1350 | /* |
| 1351 | * tbm_lossify - lose some information to get back under the memory limit |
| 1352 | */ |
| 1353 | static void |
| 1354 | tbm_lossify(TIDBitmap *tbm) |
| 1355 | { |
| 1356 | pagetable_iterator i; |
| 1357 | PagetableEntry *page; |
| 1358 | |
| 1359 | /* |
| 1360 | * XXX Really stupid implementation: this just lossifies pages in |
| 1361 | * essentially random order. We should be paying some attention to the |
| 1362 | * number of bits set in each page, instead. |
| 1363 | * |
| 1364 | * Since we are called as soon as nentries exceeds maxentries, we should |
| 1365 | * push nentries down to significantly less than maxentries, or else we'll |
| 1366 | * just end up doing this again very soon. We shoot for maxentries/2. |
| 1367 | */ |
| 1368 | Assert(tbm->iterating == TBM_NOT_ITERATING); |
| 1369 | Assert(tbm->status == TBM_HASH); |
| 1370 | |
| 1371 | pagetable_start_iterate_at(tbm->pagetable, &i, tbm->lossify_start); |
| 1372 | while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL) |
| 1373 | { |
| 1374 | if (page->ischunk) |
| 1375 | continue; /* already a chunk header */ |
| 1376 | |
| 1377 | /* |
| 1378 | * If the page would become a chunk header, we won't save anything by |
| 1379 | * converting it to lossy, so skip it. |
| 1380 | */ |
| 1381 | if ((page->blockno % PAGES_PER_CHUNK) == 0) |
| 1382 | continue; |
| 1383 | |
| 1384 | /* This does the dirty work ... */ |
| 1385 | tbm_mark_page_lossy(tbm, page->blockno); |
| 1386 | |
| 1387 | if (tbm->nentries <= tbm->maxentries / 2) |
| 1388 | { |
| 1389 | /* |
| 1390 | * We have made enough room. Remember where to start lossifying |
| 1391 | * next round, so we evenly iterate over the hashtable. |
| 1392 | */ |
| 1393 | tbm->lossify_start = i.cur; |
| 1394 | break; |
| 1395 | } |
| 1396 | |
| 1397 | /* |
| 1398 | * Note: tbm_mark_page_lossy may have inserted a lossy chunk into the |
| 1399 | * hashtable and may have deleted the non-lossy chunk. We can |
		 * continue the same hash table scan, since failure to visit one
		 * element, or visiting the newly inserted element, isn't fatal.
| 1402 | */ |
| 1403 | } |
| 1404 | |
| 1405 | /* |
| 1406 | * With a big bitmap and small work_mem, it's possible that we cannot get |
| 1407 | * under maxentries. Again, if that happens, we'd end up uselessly |
| 1408 | * calling tbm_lossify over and over. To prevent this from becoming a |
| 1409 | * performance sink, force maxentries up to at least double the current |
| 1410 | * number of entries. (In essence, we're admitting inability to fit |
| 1411 | * within work_mem when we do this.) Note that this test will not fire if |
| 1412 | * we broke out of the loop early; and if we didn't, the current number of |
| 1413 | * entries is simply not reducible any further. |
| 1414 | */ |
| 1415 | if (tbm->nentries > tbm->maxentries / 2) |
| 1416 | tbm->maxentries = Min(tbm->nentries, (INT_MAX - 1) / 2) * 2; |
| 1417 | } |
| 1418 | |
| 1419 | /* |
| 1420 | * qsort comparator to handle PagetableEntry pointers. |
| 1421 | */ |
| 1422 | static int |
| 1423 | tbm_comparator(const void *left, const void *right) |
| 1424 | { |
| 1425 | BlockNumber l = (*((PagetableEntry *const *) left))->blockno; |
| 1426 | BlockNumber r = (*((PagetableEntry *const *) right))->blockno; |
| 1427 | |
| 1428 | if (l < r) |
| 1429 | return -1; |
| 1430 | else if (l > r) |
| 1431 | return 1; |
| 1432 | return 0; |
| 1433 | } |
| 1434 | |
| 1435 | /* |
 * As above, but this receives indexes into the PagetableEntry array, so it
 * must fetch the actual PagetableEntry using each index before comparing
 * the blocknos.
| 1439 | */ |
| 1440 | static int |
| 1441 | tbm_shared_comparator(const void *left, const void *right, void *arg) |
| 1442 | { |
| 1443 | PagetableEntry *base = (PagetableEntry *) arg; |
| 1444 | PagetableEntry *lpage = &base[*(int *) left]; |
| 1445 | PagetableEntry *rpage = &base[*(int *) right]; |
| 1446 | |
| 1447 | if (lpage->blockno < rpage->blockno) |
| 1448 | return -1; |
| 1449 | else if (lpage->blockno > rpage->blockno) |
| 1450 | return 1; |
| 1451 | return 0; |
| 1452 | } |
| 1453 | |
| 1454 | /* |
| 1455 | * tbm_attach_shared_iterate |
| 1456 | * |
| 1457 | * Allocate a backend-private iterator and attach the shared iterator state |
 * to it so that multiple processes can iterate jointly.
 *
 * We also convert the DSA pointers to local pointers and store them in
 * our private iterator.
| 1462 | */ |
| 1463 | TBMSharedIterator * |
| 1464 | tbm_attach_shared_iterate(dsa_area *dsa, dsa_pointer dp) |
| 1465 | { |
| 1466 | TBMSharedIterator *iterator; |
| 1467 | TBMSharedIteratorState *istate; |
| 1468 | |
| 1469 | /* |
| 1470 | * Create the TBMSharedIterator struct, with enough trailing space to |
| 1471 | * serve the needs of the TBMIterateResult sub-struct. |
| 1472 | */ |
| 1473 | iterator = (TBMSharedIterator *) palloc0(sizeof(TBMSharedIterator) + |
| 1474 | MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber)); |
| 1475 | |
| 1476 | istate = (TBMSharedIteratorState *) dsa_get_address(dsa, dp); |
| 1477 | |
| 1478 | iterator->state = istate; |
| 1479 | |
| 1480 | iterator->ptbase = dsa_get_address(dsa, istate->pagetable); |
| 1481 | |
| 1482 | if (istate->npages) |
| 1483 | iterator->ptpages = dsa_get_address(dsa, istate->spages); |
| 1484 | if (istate->nchunks) |
| 1485 | iterator->ptchunks = dsa_get_address(dsa, istate->schunks); |
| 1486 | |
| 1487 | return iterator; |
| 1488 | } |
| 1489 | |
| 1490 | /* |
| 1491 | * pagetable_allocate |
| 1492 | * |
 * Callback function for allocating memory for hashtable elements, using
 * the DSA area if available.
| 1495 | */ |
| 1496 | static inline void * |
| 1497 | pagetable_allocate(pagetable_hash *pagetable, Size size) |
| 1498 | { |
| 1499 | TIDBitmap *tbm = (TIDBitmap *) pagetable->private_data; |
| 1500 | PTEntryArray *ptbase; |
| 1501 | |
| 1502 | if (tbm->dsa == NULL) |
| 1503 | return MemoryContextAllocExtended(pagetable->ctx, size, |
| 1504 | MCXT_ALLOC_HUGE | MCXT_ALLOC_ZERO); |
| 1505 | |
| 1506 | /* |
| 1507 | * Save the dsapagetable reference in dsapagetableold before allocating |
| 1508 | * new memory so that pagetable_free can free the old entry. |
| 1509 | */ |
| 1510 | tbm->dsapagetableold = tbm->dsapagetable; |
| 1511 | tbm->dsapagetable = dsa_allocate_extended(tbm->dsa, |
| 1512 | sizeof(PTEntryArray) + size, |
| 1513 | DSA_ALLOC_HUGE | DSA_ALLOC_ZERO); |
| 1514 | ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable); |
| 1515 | |
| 1516 | return ptbase->ptentry; |
| 1517 | } |
| 1518 | |
| 1519 | /* |
| 1520 | * pagetable_free |
| 1521 | * |
| 1522 | * Callback function for freeing hash table elements. |
| 1523 | */ |
| 1524 | static inline void |
| 1525 | pagetable_free(pagetable_hash *pagetable, void *pointer) |
| 1526 | { |
| 1527 | TIDBitmap *tbm = (TIDBitmap *) pagetable->private_data; |
| 1528 | |
| 1529 | /* pfree the input pointer if DSA is not available */ |
| 1530 | if (tbm->dsa == NULL) |
| 1531 | pfree(pointer); |
| 1532 | else if (DsaPointerIsValid(tbm->dsapagetableold)) |
| 1533 | { |
| 1534 | dsa_free(tbm->dsa, tbm->dsapagetableold); |
| 1535 | tbm->dsapagetableold = InvalidDsaPointer; |
| 1536 | } |
| 1537 | } |
| 1538 | |
| 1539 | /* |
| 1540 | * tbm_calculate_entries |
| 1541 | * |
| 1542 | * Estimate number of hashtable entries we can have within maxbytes. |
| 1543 | */ |
| 1544 | long |
| 1545 | tbm_calculate_entries(double maxbytes) |
| 1546 | { |
| 1547 | long nbuckets; |
| 1548 | |
| 1549 | /* |
| 1550 | * Estimate number of hashtable entries we can have within maxbytes. This |
| 1551 | * estimates the hash cost as sizeof(PagetableEntry), which is good enough |
| 1552 | * for our purpose. Also count an extra Pointer per entry for the arrays |
| 1553 | * created during iteration readout. |
| 1554 | */ |
| 1555 | nbuckets = maxbytes / |
| 1556 | (sizeof(PagetableEntry) + sizeof(Pointer) + sizeof(Pointer)); |
| 1557 | nbuckets = Min(nbuckets, INT_MAX - 1); /* safety limit */ |
| 1558 | nbuckets = Max(nbuckets, 16); /* sanity limit */ |
| 1559 | |
| 1560 | return nbuckets; |
| 1561 | } |
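
/*
 * An illustrative figure only (the entry size varies with BLCKSZ,
 * MaxHeapTuplesPerPage, and platform alignment): with 64-bit bitmapwords
 * and the 8K default, sizeof(PagetableEntry) is about 48 bytes, so
 * maxbytes = 4MB permits roughly 4194304 / (48 + 8 + 8) = 65536 entries.
 */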
| 1562 | |