/*-------------------------------------------------------------------------
 *
 * hashovfl.c
 *	  Overflow page management code for the Postgres hash access method
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/access/hash/hashovfl.c
 *
 * NOTES
 *	  Overflow pages look like ordinary relation pages.
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/hash.h"
#include "access/hash_xlog.h"
#include "miscadmin.h"
#include "utils/rel.h"


static uint32 _hash_firstfreebit(uint32 map);


/*
 * Convert overflow page bit number (its index in the free-page bitmaps)
 * to block number within the index.
 */
static BlockNumber
bitno_to_blkno(HashMetaPage metap, uint32 ovflbitnum)
{
	uint32		splitnum = metap->hashm_ovflpoint;
	uint32		i;

	/* Convert zero-based bitnumber to 1-based page number */
	ovflbitnum += 1;

	/* Determine the split number for this page (must be >= 1) */
	for (i = 1;
		 i < splitnum && ovflbitnum > metap->hashm_spares[i];
		 i++)
		/* loop */ ;
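	/*
	 * Now i is the smallest split point (capped at hashm_ovflpoint) whose
	 * hashm_spares[] count covers this 1-based overflow page number.
	 */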

	/*
	 * Convert to absolute page number by adding the number of bucket pages
	 * that exist before this split point.
	 */
	return (BlockNumber) (_hash_get_totalbuckets(i) + ovflbitnum);
}

/*
 * _hash_ovflblkno_to_bitno
 *
 * Convert overflow page block number to bit number for free-page bitmap.
 */
uint32
_hash_ovflblkno_to_bitno(HashMetaPage metap, BlockNumber ovflblkno)
{
	uint32		splitnum = metap->hashm_ovflpoint;
	uint32		i;
	uint32		bitnum;

	/* Determine the split number containing this page */
	for (i = 1; i <= splitnum; i++)
	{
		if (ovflblkno <= (BlockNumber) _hash_get_totalbuckets(i))
			break;				/* not a valid overflow block; fail below */
		bitnum = ovflblkno - _hash_get_totalbuckets(i);

		/*
		 * bitnum has to be greater than the number of overflow pages added
		 * before this split point.  Any overflow pages at this split point
		 * (i) start at block (_hash_get_totalbuckets(i) +
		 * metap->hashm_spares[i - 1] + 1).
		 */
		if (bitnum > metap->hashm_spares[i - 1] &&
			bitnum <= metap->hashm_spares[i])
			return bitnum - 1;	/* -1 to convert 1-based to 0-based */
	}

	ereport(ERROR,
			(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
			 errmsg("invalid overflow block number %u", ovflblkno)));
	return 0;					/* keep compiler quiet */
}

/*
 * _hash_addovflpage
 *
 * Add an overflow page to the bucket whose last page is pointed to by 'buf'.
 *
 * On entry, the caller must hold a pin but no lock on 'buf'.  Unless asked
 * to retain it, the pin is dropped before exiting (we assume the caller is
 * not interested in 'buf' anymore); the pin is retained only for the primary
 * bucket page.  The returned overflow page will be pinned and write-locked;
 * it is guaranteed to be empty.
 *
 * The caller must hold a pin, but no lock, on the metapage buffer.
 * That buffer is returned in the same state.
 *
 * NB: since this could be executed concurrently by multiple processes,
 * one should not assume that the returned overflow page will be the
 * immediate successor of the originally passed 'buf'.  Additional overflow
 * pages might have been added to the bucket chain in between.
 */
Buffer
_hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin)
{
	Buffer		ovflbuf;
	Page		page;
	Page		ovflpage;
	HashPageOpaque pageopaque;
	HashPageOpaque ovflopaque;
	HashMetaPage metap;
	Buffer		mapbuf = InvalidBuffer;
	Buffer		newmapbuf = InvalidBuffer;
	BlockNumber blkno;
	uint32		orig_firstfree;
	uint32		splitnum;
	uint32	   *freep = NULL;
	uint32		max_ovflpg;
	uint32		bit;
	uint32		bitmap_page_bit;
	uint32		first_page;
	uint32		last_bit;
	uint32		last_page;
	uint32		i,
				j;
	bool		page_found = false;

	/*
	 * Write-lock the tail page.  Here, we need to maintain a locking order:
	 * first acquire the lock on the tail page of the bucket, then on the
	 * meta page to find and lock the bitmap page; if one is found, the lock
	 * on the meta page is released, and finally we acquire the lock on the
	 * new overflow buffer.  We need this locking order to avoid deadlock
	 * with backends that are doing inserts.
	 *
	 * Note: We could have avoided locking many buffers here if we made two
	 * WAL records for acquiring an overflow page (one to allocate an
	 * overflow page and another to add it to the overflow bucket chain).
	 * However, doing so can leak an overflow page if the system crashes
	 * after allocation.  Needless to say, a single record is also better
	 * from a performance point of view.
	 */
	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

	/* probably redundant... */
	_hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);

	/* loop to find current tail page, in case someone else inserted too */
	for (;;)
	{
		BlockNumber nextblkno;

		page = BufferGetPage(buf);
		pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
		nextblkno = pageopaque->hasho_nextblkno;

		if (!BlockNumberIsValid(nextblkno))
			break;

		/* we assume we do not need to write the unmodified page */
		if (retain_pin)
		{
			/* pin will be retained only for the primary bucket page */
			Assert((pageopaque->hasho_flag & LH_PAGE_TYPE) == LH_BUCKET_PAGE);
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
		}
		else
			_hash_relbuf(rel, buf);

		retain_pin = false;

		buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
	}

	/* Get exclusive lock on the meta page */
	LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);

	_hash_checkpage(rel, metabuf, LH_META_PAGE);
	metap = HashPageGetMeta(BufferGetPage(metabuf));

	/* start search at hashm_firstfree */
	orig_firstfree = metap->hashm_firstfree;
	first_page = orig_firstfree >> BMPG_SHIFT(metap);
	bit = orig_firstfree & BMPG_MASK(metap);
	i = first_page;
	j = bit / BITS_PER_MAP;
	bit &= ~(BITS_PER_MAP - 1);
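
	/*
	 * At this point i is the bitmap page to search first, j is the word
	 * index within that page, and bit is the offset within that page of the
	 * first bit of that word.
	 */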

	/* outer loop iterates once per bitmap page */
	for (;;)
	{
		BlockNumber mapblkno;
		Page		mappage;
		uint32		last_inpage;

		/* want to end search with the last existing overflow page */
		splitnum = metap->hashm_ovflpoint;
		max_ovflpg = metap->hashm_spares[splitnum] - 1;
		last_page = max_ovflpg >> BMPG_SHIFT(metap);
		last_bit = max_ovflpg & BMPG_MASK(metap);

		if (i > last_page)
			break;

		Assert(i < metap->hashm_nmaps);
		mapblkno = metap->hashm_mapp[i];

		if (i == last_page)
			last_inpage = last_bit;
		else
			last_inpage = BMPGSZ_BIT(metap) - 1;

		/* Release exclusive lock on metapage while reading bitmap page */
		LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);

		mapbuf = _hash_getbuf(rel, mapblkno, HASH_WRITE, LH_BITMAP_PAGE);
		mappage = BufferGetPage(mapbuf);
		freep = HashPageGetBitmap(mappage);

		for (; bit <= last_inpage; j++, bit += BITS_PER_MAP)
		{
			if (freep[j] != ALL_SET)
			{
				page_found = true;

				/* Reacquire exclusive lock on the meta page */
				LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);

				/* convert bit to bit number within page */
				bit += _hash_firstfreebit(freep[j]);
				bitmap_page_bit = bit;

				/* convert bit to absolute bit number */
				bit += (i << BMPG_SHIFT(metap));
				/* Calculate address of the recycled overflow page */
				blkno = bitno_to_blkno(metap, bit);

				/* Fetch and init the recycled page */
				ovflbuf = _hash_getinitbuf(rel, blkno);

				goto found;
			}
		}

		/* No free space here, try to advance to next map page */
		_hash_relbuf(rel, mapbuf);
		mapbuf = InvalidBuffer;
		i++;
		j = 0;					/* scan from start of next map page */
		bit = 0;

		/* Reacquire exclusive lock on the meta page */
		LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
	}

	/*
	 * No free pages --- have to extend the relation to add an overflow page.
	 * First, check to see if we have to add a new bitmap page too.
	 */
	if (last_bit == (uint32) (BMPGSZ_BIT(metap) - 1))
	{
		/*
		 * We create the new bitmap page with all pages marked "in use".
		 * Actually two pages in the new bitmap's range will exist
		 * immediately: the bitmap page itself, and the following page which
		 * is the one we return to the caller.  Both of these are correctly
		 * marked "in use".  Subsequent pages do not exist yet, but it is
		 * convenient to pre-mark them as "in use" too.
		 */
		bit = metap->hashm_spares[splitnum];

		/* metapage already has a write lock */
		if (metap->hashm_nmaps >= HASH_MAX_BITMAPS)
			ereport(ERROR,
					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
					 errmsg("out of overflow pages in hash index \"%s\"",
							RelationGetRelationName(rel))));

		newmapbuf = _hash_getnewbuf(rel, bitno_to_blkno(metap, bit), MAIN_FORKNUM);
	}
	else
	{
		/*
		 * Nothing to do here; since the page will be past the last used page,
		 * we know its bitmap bit was preinitialized to "in use".
		 */
	}

	/* Calculate address of the new overflow page */
	bit = BufferIsValid(newmapbuf) ?
		metap->hashm_spares[splitnum] + 1 : metap->hashm_spares[splitnum];
	blkno = bitno_to_blkno(metap, bit);

	/*
	 * Fetch the page with _hash_getnewbuf to ensure smgr's idea of the
	 * relation length stays in sync with ours.  XXX It's annoying to do this
	 * with metapage write lock held; would be better to use a lock that
	 * doesn't block incoming searches.
	 *
	 * It is okay to hold two buffer locks here (one on tail page of bucket
	 * and other on new overflow page) since there cannot be anyone else
	 * contending for access to ovflbuf.
	 */
	ovflbuf = _hash_getnewbuf(rel, blkno, MAIN_FORKNUM);

found:

	/*
	 * Do the update.  No ereport(ERROR) until changes are logged.  We want
	 * to log the changes for the bitmap page and the overflow page together
	 * to avoid losing pages when a new page is added.
	 */
	START_CRIT_SECTION();

	if (page_found)
	{
		Assert(BufferIsValid(mapbuf));

		/* mark page "in use" in the bitmap */
		SETBIT(freep, bitmap_page_bit);
		MarkBufferDirty(mapbuf);
	}
	else
	{
		/* update the count to indicate a new overflow page is added */
		metap->hashm_spares[splitnum]++;

		if (BufferIsValid(newmapbuf))
		{
			_hash_initbitmapbuffer(newmapbuf, metap->hashm_bmsize, false);
			MarkBufferDirty(newmapbuf);

			/* add the new bitmap page to the metapage's list of bitmaps */
			metap->hashm_mapp[metap->hashm_nmaps] = BufferGetBlockNumber(newmapbuf);
			metap->hashm_nmaps++;
			metap->hashm_spares[splitnum]++;
		}

		MarkBufferDirty(metabuf);

		/*
		 * For a new overflow page, we don't need to explicitly set the bit
		 * in the bitmap page; by default it is already set to "in use".
		 */
	}

	/*
	 * Adjust hashm_firstfree to avoid redundant searches.  But don't risk
	 * changing it if someone moved it while we were searching bitmap pages.
	 */
	if (metap->hashm_firstfree == orig_firstfree)
	{
		metap->hashm_firstfree = bit + 1;
		MarkBufferDirty(metabuf);
	}

	/* initialize new overflow page */
	ovflpage = BufferGetPage(ovflbuf);
	ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
	ovflopaque->hasho_prevblkno = BufferGetBlockNumber(buf);
	ovflopaque->hasho_nextblkno = InvalidBlockNumber;
	ovflopaque->hasho_bucket = pageopaque->hasho_bucket;
	ovflopaque->hasho_flag = LH_OVERFLOW_PAGE;
	ovflopaque->hasho_page_id = HASHO_PAGE_ID;

	MarkBufferDirty(ovflbuf);

	/* logically chain overflow page to previous page */
	pageopaque->hasho_nextblkno = BufferGetBlockNumber(ovflbuf);

	MarkBufferDirty(buf);

	/* XLOG stuff */
	if (RelationNeedsWAL(rel))
	{
		XLogRecPtr	recptr;
		xl_hash_add_ovfl_page xlrec;

		xlrec.bmpage_found = page_found;
		xlrec.bmsize = metap->hashm_bmsize;

		XLogBeginInsert();
		XLogRegisterData((char *) &xlrec, SizeOfHashAddOvflPage);

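		/*
		 * Block references: 0 = new overflow page, 1 = the bucket chain's
		 * tail page, 2 = existing bitmap page (if one was reused), 3 = new
		 * bitmap page (if one was added), 4 = metapage.
		 */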
		XLogRegisterBuffer(0, ovflbuf, REGBUF_WILL_INIT);
		XLogRegisterBufData(0, (char *) &pageopaque->hasho_bucket, sizeof(Bucket));

		XLogRegisterBuffer(1, buf, REGBUF_STANDARD);

		if (BufferIsValid(mapbuf))
		{
			XLogRegisterBuffer(2, mapbuf, REGBUF_STANDARD);
			XLogRegisterBufData(2, (char *) &bitmap_page_bit, sizeof(uint32));
		}

		if (BufferIsValid(newmapbuf))
			XLogRegisterBuffer(3, newmapbuf, REGBUF_WILL_INIT);

		XLogRegisterBuffer(4, metabuf, REGBUF_STANDARD);
		XLogRegisterBufData(4, (char *) &metap->hashm_firstfree, sizeof(uint32));

		recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_ADD_OVFL_PAGE);

		PageSetLSN(BufferGetPage(ovflbuf), recptr);
		PageSetLSN(BufferGetPage(buf), recptr);

		if (BufferIsValid(mapbuf))
			PageSetLSN(BufferGetPage(mapbuf), recptr);

		if (BufferIsValid(newmapbuf))
			PageSetLSN(BufferGetPage(newmapbuf), recptr);

		PageSetLSN(BufferGetPage(metabuf), recptr);
	}

	END_CRIT_SECTION();

	if (retain_pin)
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
	else
		_hash_relbuf(rel, buf);

	if (BufferIsValid(mapbuf))
		_hash_relbuf(rel, mapbuf);

	LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);

	if (BufferIsValid(newmapbuf))
		_hash_relbuf(rel, newmapbuf);

	return ovflbuf;
}

/*
 * _hash_firstfreebit()
 *
 * Return the number of the first bit that is not set in the word 'map'.
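 *
 * For example, given map = 0x0000000B (binary ...1011), bits 0, 1 and 3 are
 * set but bit 2 is clear, so the result is 2.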
 */
static uint32
_hash_firstfreebit(uint32 map)
{
	uint32		i,
				mask;

	mask = 0x1;
	for (i = 0; i < BITS_PER_MAP; i++)
	{
		if (!(mask & map))
			return i;
		mask <<= 1;
	}

	elog(ERROR, "firstfreebit found no free bit");

	return 0;					/* keep compiler quiet */
}

/*
 * _hash_freeovflpage() -
 *
 * Remove this overflow page from its bucket's chain, and mark the page as
 * free.  On entry, ovflbuf is write-locked; it is released before exiting.
 *
 * Add the tuples (itups) to wbuf in this function.  We could do that in the
 * caller as well, but the advantage of doing it here is that we can easily
 * write the WAL for the XLOG_HASH_SQUEEZE_PAGE operation.  Adding the tuples
 * and removing the overflow page have to be done as a single atomic
 * operation; otherwise, during replay on a standby, users might find
 * duplicate records.
 *
 * Since this function is invoked in VACUUM, we provide an access strategy
 * parameter that controls fetches of the bucket pages.
 *
 * Returns the block number of the page that followed the given page
 * in the bucket, or InvalidBlockNumber if no following page.
 *
 * NB: caller must not hold a lock on the metapage, nor on the page that is
 * next to ovflbuf in the bucket chain.  We don't acquire a lock on the page
 * prior to ovflbuf in the chain if it is the same as wbuf, because the
 * caller already holds a lock on it.
 */
BlockNumber
_hash_freeovflpage(Relation rel, Buffer bucketbuf, Buffer ovflbuf,
				   Buffer wbuf, IndexTuple *itups, OffsetNumber *itup_offsets,
				   Size *tups_size, uint16 nitups,
				   BufferAccessStrategy bstrategy)
{
	HashMetaPage metap;
	Buffer		metabuf;
	Buffer		mapbuf;
	BlockNumber ovflblkno;
	BlockNumber prevblkno;
	BlockNumber blkno;
	BlockNumber nextblkno;
	BlockNumber writeblkno;
	HashPageOpaque ovflopaque;
	Page		ovflpage;
	Page		mappage;
	uint32	   *freep;
	uint32		ovflbitno;
	int32		bitmappage,
				bitmapbit;
	Bucket		bucket PG_USED_FOR_ASSERTS_ONLY;
	Buffer		prevbuf = InvalidBuffer;
	Buffer		nextbuf = InvalidBuffer;
	bool		update_metap = false;

	/* Get information from the doomed page */
	_hash_checkpage(rel, ovflbuf, LH_OVERFLOW_PAGE);
	ovflblkno = BufferGetBlockNumber(ovflbuf);
	ovflpage = BufferGetPage(ovflbuf);
	ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
	nextblkno = ovflopaque->hasho_nextblkno;
	prevblkno = ovflopaque->hasho_prevblkno;
	writeblkno = BufferGetBlockNumber(wbuf);
	bucket = ovflopaque->hasho_bucket;

	/*
	 * Fix up the bucket chain.  This is a doubly-linked list, so we must fix
	 * up the bucket chain members behind and ahead of the overflow page being
	 * deleted.  Concurrency issues are avoided by using lock chaining as
	 * described atop hashbucketcleanup.
	 */
	if (BlockNumberIsValid(prevblkno))
	{
		if (prevblkno == writeblkno)
			prevbuf = wbuf;
		else
			prevbuf = _hash_getbuf_with_strategy(rel,
												 prevblkno,
												 HASH_WRITE,
												 LH_BUCKET_PAGE | LH_OVERFLOW_PAGE,
												 bstrategy);
	}
	if (BlockNumberIsValid(nextblkno))
		nextbuf = _hash_getbuf_with_strategy(rel,
											 nextblkno,
											 HASH_WRITE,
											 LH_OVERFLOW_PAGE,
											 bstrategy);

	/* Note: bstrategy is intentionally not used for metapage and bitmap */

	/* Read the metapage so we can determine which bitmap page to use */
	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
	metap = HashPageGetMeta(BufferGetPage(metabuf));

	/* Identify which bit to clear */
	ovflbitno = _hash_ovflblkno_to_bitno(metap, ovflblkno);

	bitmappage = ovflbitno >> BMPG_SHIFT(metap);
	bitmapbit = ovflbitno & BMPG_MASK(metap);
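	/*
	 * bitmappage indexes metap->hashm_mapp[]; bitmapbit is the bit to clear
	 * within that bitmap page.
	 */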

	if (bitmappage >= metap->hashm_nmaps)
		elog(ERROR, "invalid overflow bit number %u", ovflbitno);
	blkno = metap->hashm_mapp[bitmappage];

	/* Release metapage lock while we access the bitmap page */
	LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);

	/* read the bitmap page to clear the bitmap bit */
	mapbuf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BITMAP_PAGE);
	mappage = BufferGetPage(mapbuf);
	freep = HashPageGetBitmap(mappage);
	Assert(ISSET(freep, bitmapbit));

	/* Get write-lock on metapage to update firstfree */
	LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);

	/* This operation needs to log multiple tuples; prepare WAL for that */
	if (RelationNeedsWAL(rel))
		XLogEnsureRecordSpace(HASH_XLOG_FREE_OVFL_BUFS, 4 + nitups);

	START_CRIT_SECTION();

	/*
	 * we have to insert tuples on the "write" page, being careful to preserve
	 * hashkey ordering.  (If we insert many tuples into the same "write" page
	 * it would be worth qsort'ing them).
	 */
	if (nitups > 0)
	{
		_hash_pgaddmultitup(rel, wbuf, itups, itup_offsets, nitups);
		MarkBufferDirty(wbuf);
	}

	/*
	 * Reinitialize the freed overflow page.  Just zeroing the page won't
	 * work, because WAL replay routines expect pages to be initialized. See
	 * explanation of RBM_NORMAL mode atop XLogReadBufferExtended.  We are
	 * careful to make the special space valid here so that tools like
	 * pageinspect won't get confused.
	 */
	_hash_pageinit(ovflpage, BufferGetPageSize(ovflbuf));

	ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);

	ovflopaque->hasho_prevblkno = InvalidBlockNumber;
	ovflopaque->hasho_nextblkno = InvalidBlockNumber;
	ovflopaque->hasho_bucket = -1;
	ovflopaque->hasho_flag = LH_UNUSED_PAGE;
	ovflopaque->hasho_page_id = HASHO_PAGE_ID;

	MarkBufferDirty(ovflbuf);

	if (BufferIsValid(prevbuf))
	{
		Page		prevpage = BufferGetPage(prevbuf);
		HashPageOpaque prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage);

		Assert(prevopaque->hasho_bucket == bucket);
		prevopaque->hasho_nextblkno = nextblkno;
		MarkBufferDirty(prevbuf);
	}
	if (BufferIsValid(nextbuf))
	{
		Page		nextpage = BufferGetPage(nextbuf);
		HashPageOpaque nextopaque = (HashPageOpaque) PageGetSpecialPointer(nextpage);

		Assert(nextopaque->hasho_bucket == bucket);
		nextopaque->hasho_prevblkno = prevblkno;
		MarkBufferDirty(nextbuf);
	}

	/* Clear the bitmap bit to indicate that this overflow page is free */
	CLRBIT(freep, bitmapbit);
	MarkBufferDirty(mapbuf);

	/* if this is now the first free page, update hashm_firstfree */
	if (ovflbitno < metap->hashm_firstfree)
	{
		metap->hashm_firstfree = ovflbitno;
		update_metap = true;
		MarkBufferDirty(metabuf);
	}

	/* XLOG stuff */
	if (RelationNeedsWAL(rel))
	{
		xl_hash_squeeze_page xlrec;
		XLogRecPtr	recptr;
		int			i;

		xlrec.prevblkno = prevblkno;
		xlrec.nextblkno = nextblkno;
		xlrec.ntups = nitups;
		xlrec.is_prim_bucket_same_wrt = (wbuf == bucketbuf);
		xlrec.is_prev_bucket_same_wrt = (wbuf == prevbuf);

		XLogBeginInsert();
		XLogRegisterData((char *) &xlrec, SizeOfHashSqueezePage);

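		/*
		 * Block references: 0 = primary bucket page (only if distinct from
		 * wbuf), 1 = wbuf, 2 = the freed overflow page, 3 = previous page in
		 * the chain (if any and distinct from wbuf), 4 = next page in the
		 * chain (if any), 5 = bitmap page, 6 = metapage (only if
		 * hashm_firstfree was updated).
		 */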
		/*
		 * bucket buffer needs to be registered to ensure that we can acquire
		 * a cleanup lock on it during replay.
		 */
		if (!xlrec.is_prim_bucket_same_wrt)
			XLogRegisterBuffer(0, bucketbuf, REGBUF_STANDARD | REGBUF_NO_IMAGE);

		XLogRegisterBuffer(1, wbuf, REGBUF_STANDARD);
		if (xlrec.ntups > 0)
		{
			XLogRegisterBufData(1, (char *) itup_offsets,
								nitups * sizeof(OffsetNumber));
			for (i = 0; i < nitups; i++)
				XLogRegisterBufData(1, (char *) itups[i], tups_size[i]);
		}

		XLogRegisterBuffer(2, ovflbuf, REGBUF_STANDARD);

		/*
		 * If prevpage and the write page (the block into which we are moving
		 * tuples from the overflow page) are the same, there is no need to
		 * register prevpage separately.  During replay, we can directly
		 * update the next-block link in the write page.
		 */
		if (BufferIsValid(prevbuf) && !xlrec.is_prev_bucket_same_wrt)
			XLogRegisterBuffer(3, prevbuf, REGBUF_STANDARD);

		if (BufferIsValid(nextbuf))
			XLogRegisterBuffer(4, nextbuf, REGBUF_STANDARD);

		XLogRegisterBuffer(5, mapbuf, REGBUF_STANDARD);
		XLogRegisterBufData(5, (char *) &bitmapbit, sizeof(uint32));

		if (update_metap)
		{
			XLogRegisterBuffer(6, metabuf, REGBUF_STANDARD);
			XLogRegisterBufData(6, (char *) &metap->hashm_firstfree, sizeof(uint32));
		}

		recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SQUEEZE_PAGE);

		PageSetLSN(BufferGetPage(wbuf), recptr);
		PageSetLSN(BufferGetPage(ovflbuf), recptr);

		if (BufferIsValid(prevbuf) && !xlrec.is_prev_bucket_same_wrt)
			PageSetLSN(BufferGetPage(prevbuf), recptr);
		if (BufferIsValid(nextbuf))
			PageSetLSN(BufferGetPage(nextbuf), recptr);

		PageSetLSN(BufferGetPage(mapbuf), recptr);

		if (update_metap)
			PageSetLSN(BufferGetPage(metabuf), recptr);
	}

	END_CRIT_SECTION();

	/* release previous bucket if it is not same as write bucket */
	if (BufferIsValid(prevbuf) && prevblkno != writeblkno)
		_hash_relbuf(rel, prevbuf);

	if (BufferIsValid(ovflbuf))
		_hash_relbuf(rel, ovflbuf);

	if (BufferIsValid(nextbuf))
		_hash_relbuf(rel, nextbuf);

	_hash_relbuf(rel, mapbuf);
	_hash_relbuf(rel, metabuf);

	return nextblkno;
}


/*
 * _hash_initbitmapbuffer()
 *
 * Initialize a new bitmap page.  All bits in the new bitmap page are set to
 * "1", indicating "in use".
 */
void
_hash_initbitmapbuffer(Buffer buf, uint16 bmsize, bool initpage)
{
	Page		pg;
	HashPageOpaque op;
	uint32	   *freep;

	pg = BufferGetPage(buf);

	/* initialize the page */
	if (initpage)
		_hash_pageinit(pg, BufferGetPageSize(buf));

	/* initialize the page's special space */
	op = (HashPageOpaque) PageGetSpecialPointer(pg);
	op->hasho_prevblkno = InvalidBlockNumber;
	op->hasho_nextblkno = InvalidBlockNumber;
	op->hasho_bucket = -1;
	op->hasho_flag = LH_BITMAP_PAGE;
	op->hasho_page_id = HASHO_PAGE_ID;

	/* set all of the bits to 1 */
	freep = HashPageGetBitmap(pg);
	MemSet(freep, 0xFF, bmsize);

	/*
	 * Set pd_lower just past the end of the bitmap page data.  We could even
	 * set pd_lower equal to pd_upper, but this is more precise and makes the
	 * page look compressible to xlog.c.
	 */
	((PageHeader) pg)->pd_lower = ((char *) freep + bmsize) - (char *) pg;
}


/*
 * _hash_squeezebucket(rel, bucket)
 *
 * Try to squeeze the tuples onto pages occurring earlier in the
 * bucket chain in an attempt to free overflow pages.  When we start
 * the "squeezing", the page from which we start taking tuples (the
 * "read" page) is the last bucket in the bucket chain and the page
 * onto which we start squeezing tuples (the "write" page) is the
 * first page in the bucket chain.  The read page works backward and
 * the write page works forward; the procedure terminates when the
 * read page and write page are the same page.
 *
 * At completion of this procedure, it is guaranteed that all pages in
 * the bucket are nonempty, unless the bucket is totally empty (in
 * which case all overflow pages will be freed).  The original implementation
 * required that to be true on entry as well, but it's a lot easier for
 * callers to leave empty overflow pages and let this routine clean them up.
 *
 * Caller must acquire cleanup lock on the primary page of the target
 * bucket to exclude any scans that are in progress, which could easily
 * be confused into returning the same tuple more than once or some tuples
 * not at all by the rearrangement we are performing here.  To prevent any
 * concurrent scan from crossing the squeeze scan, we use lock chaining
 * similar to hashbucketcleanup; refer to the comments atop hashbucketcleanup.
 *
 * We need to retain a pin on the primary bucket to ensure that no concurrent
 * split can start.
 *
 * Since this function is invoked in VACUUM, we provide an access strategy
 * parameter that controls fetches of the bucket pages.
 */
void
_hash_squeezebucket(Relation rel,
					Bucket bucket,
					BlockNumber bucket_blkno,
					Buffer bucket_buf,
					BufferAccessStrategy bstrategy)
{
	BlockNumber wblkno;
	BlockNumber rblkno;
	Buffer		wbuf;
	Buffer		rbuf;
	Page		wpage;
	Page		rpage;
	HashPageOpaque wopaque;
	HashPageOpaque ropaque;

	/*
	 * start squeezing into the primary bucket page.
	 */
	wblkno = bucket_blkno;
	wbuf = bucket_buf;
	wpage = BufferGetPage(wbuf);
	wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);

	/*
	 * if there aren't any overflow pages, there's nothing to squeeze.  The
	 * caller is responsible for releasing the pin on the primary bucket page.
	 */
	if (!BlockNumberIsValid(wopaque->hasho_nextblkno))
	{
		LockBuffer(wbuf, BUFFER_LOCK_UNLOCK);
		return;
	}

	/*
	 * Find the last page in the bucket chain by starting at the base bucket
	 * page and working forward.  Note: we assume that a hash bucket chain is
	 * usually smaller than the buffer ring being used by VACUUM, else using
	 * the access strategy here would be counterproductive.
	 */
	rbuf = InvalidBuffer;
	ropaque = wopaque;
	do
	{
		rblkno = ropaque->hasho_nextblkno;
		if (rbuf != InvalidBuffer)
			_hash_relbuf(rel, rbuf);
		rbuf = _hash_getbuf_with_strategy(rel,
										  rblkno,
										  HASH_WRITE,
										  LH_OVERFLOW_PAGE,
										  bstrategy);
		rpage = BufferGetPage(rbuf);
		ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
		Assert(ropaque->hasho_bucket == bucket);
	} while (BlockNumberIsValid(ropaque->hasho_nextblkno));

	/*
	 * squeeze the tuples.
	 */
	for (;;)
	{
		OffsetNumber roffnum;
		OffsetNumber maxroffnum;
		OffsetNumber deletable[MaxOffsetNumber];
		IndexTuple	itups[MaxIndexTuplesPerPage];
		Size		tups_size[MaxIndexTuplesPerPage];
		OffsetNumber itup_offsets[MaxIndexTuplesPerPage];
		uint16		ndeletable = 0;
		uint16		nitups = 0;
		Size		all_tups_size = 0;
		int			i;
		bool		retain_pin = false;
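
		/*
		 * itups[] and tups_size[] accumulate copies of the tuples to be
		 * moved from the current "read" page to the "write" page, and
		 * deletable[] records their offsets on the "read" page so that they
		 * can be deleted from it afterwards.
		 */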

readpage:
		/* Scan each tuple in "read" page */
		maxroffnum = PageGetMaxOffsetNumber(rpage);
		for (roffnum = FirstOffsetNumber;
			 roffnum <= maxroffnum;
			 roffnum = OffsetNumberNext(roffnum))
		{
			IndexTuple	itup;
			Size		itemsz;

			/* skip dead tuples */
			if (ItemIdIsDead(PageGetItemId(rpage, roffnum)))
				continue;

			itup = (IndexTuple) PageGetItem(rpage,
											PageGetItemId(rpage, roffnum));
			itemsz = IndexTupleSize(itup);
			itemsz = MAXALIGN(itemsz);

			/*
			 * Walk up the bucket chain, looking for a page big enough for
			 * this item and all other accumulated items.  Exit if we reach
			 * the read page.
			 */
			while (PageGetFreeSpaceForMultipleTuples(wpage, nitups + 1) < (all_tups_size + itemsz))
			{
				Buffer		next_wbuf = InvalidBuffer;
				bool		tups_moved = false;

				Assert(!PageIsEmpty(wpage));

				if (wblkno == bucket_blkno)
					retain_pin = true;

				wblkno = wopaque->hasho_nextblkno;
				Assert(BlockNumberIsValid(wblkno));

				/* don't need to move to next page if we reached the read page */
				if (wblkno != rblkno)
					next_wbuf = _hash_getbuf_with_strategy(rel,
														   wblkno,
														   HASH_WRITE,
														   LH_OVERFLOW_PAGE,
														   bstrategy);

				if (nitups > 0)
				{
					Assert(nitups == ndeletable);

					/*
					 * This operation needs to log multiple tuples; prepare
					 * WAL for that.
					 */
					if (RelationNeedsWAL(rel))
						XLogEnsureRecordSpace(0, 3 + nitups);

					START_CRIT_SECTION();

					/*
					 * we have to insert tuples on the "write" page, being
					 * careful to preserve hashkey ordering.  (If we insert
					 * many tuples into the same "write" page it would be
					 * worth qsort'ing them).
					 */
					_hash_pgaddmultitup(rel, wbuf, itups, itup_offsets, nitups);
					MarkBufferDirty(wbuf);

					/* Delete tuples we already moved off read page */
					PageIndexMultiDelete(rpage, deletable, ndeletable);
					MarkBufferDirty(rbuf);

					/* XLOG stuff */
					if (RelationNeedsWAL(rel))
					{
						XLogRecPtr	recptr;
						xl_hash_move_page_contents xlrec;

						xlrec.ntups = nitups;
						xlrec.is_prim_bucket_same_wrt = (wbuf == bucket_buf) ? true : false;

						XLogBeginInsert();
						XLogRegisterData((char *) &xlrec, SizeOfHashMovePageContents);

						/*
						 * bucket buffer needs to be registered to ensure that
						 * we can acquire a cleanup lock on it during replay.
						 */
						if (!xlrec.is_prim_bucket_same_wrt)
							XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD | REGBUF_NO_IMAGE);

						XLogRegisterBuffer(1, wbuf, REGBUF_STANDARD);
						XLogRegisterBufData(1, (char *) itup_offsets,
											nitups * sizeof(OffsetNumber));
						for (i = 0; i < nitups; i++)
							XLogRegisterBufData(1, (char *) itups[i], tups_size[i]);

						XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD);
						XLogRegisterBufData(2, (char *) deletable,
											ndeletable * sizeof(OffsetNumber));

						recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_MOVE_PAGE_CONTENTS);

						PageSetLSN(BufferGetPage(wbuf), recptr);
						PageSetLSN(BufferGetPage(rbuf), recptr);
					}

					END_CRIT_SECTION();

					tups_moved = true;
				}

				/*
				 * release the lock on previous page after acquiring the lock
				 * on next page
				 */
				if (retain_pin)
					LockBuffer(wbuf, BUFFER_LOCK_UNLOCK);
				else
					_hash_relbuf(rel, wbuf);

				/* nothing more to do if we reached the read page */
				if (rblkno == wblkno)
				{
					_hash_relbuf(rel, rbuf);
					return;
				}

				wbuf = next_wbuf;
				wpage = BufferGetPage(wbuf);
				wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);
				Assert(wopaque->hasho_bucket == bucket);
				retain_pin = false;

				/* be tidy */
				for (i = 0; i < nitups; i++)
					pfree(itups[i]);
				nitups = 0;
				all_tups_size = 0;
				ndeletable = 0;

				/*
				 * after moving the tuples, rpage would have been compacted,
				 * so we need to rescan it.
				 */
				if (tups_moved)
					goto readpage;
			}

			/* remember tuple for deletion from "read" page */
			deletable[ndeletable++] = roffnum;

			/*
			 * We need a copy of the index tuples, since they can be freed as
			 * part of freeing the overflow page; however, we still need them
			 * to write the WAL record in _hash_freeovflpage.
			 */
			itups[nitups] = CopyIndexTuple(itup);
			tups_size[nitups++] = itemsz;
			all_tups_size += itemsz;
		}

		/*
		 * If we reach here, there are no live tuples on the "read" page ---
		 * it was empty when we got to it, or we moved them all.  So we can
		 * just free the page without bothering with deleting tuples
		 * individually.  Then advance to the previous "read" page.
		 *
		 * Tricky point here: if our read and write pages are adjacent in the
		 * bucket chain, our write lock on wbuf will conflict with
		 * _hash_freeovflpage's attempt to update the sibling links of the
		 * removed page.  In that case, we don't need to lock it again.
		 */
		rblkno = ropaque->hasho_prevblkno;
		Assert(BlockNumberIsValid(rblkno));

		/* free this overflow page (releases rbuf) */
		_hash_freeovflpage(rel, bucket_buf, rbuf, wbuf, itups, itup_offsets,
						   tups_size, nitups, bstrategy);

		/* be tidy */
		for (i = 0; i < nitups; i++)
			pfree(itups[i]);

		/* are we freeing the page adjacent to wbuf? */
		if (rblkno == wblkno)
		{
			/* retain the pin on primary bucket page till end of bucket scan */
			if (wblkno == bucket_blkno)
				LockBuffer(wbuf, BUFFER_LOCK_UNLOCK);
			else
				_hash_relbuf(rel, wbuf);
			return;
		}

		rbuf = _hash_getbuf_with_strategy(rel,
										  rblkno,
										  HASH_WRITE,
										  LH_OVERFLOW_PAGE,
										  bstrategy);
		rpage = BufferGetPage(rbuf);
		ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
		Assert(ropaque->hasho_bucket == bucket);
	}

	/* NOTREACHED */
}