1/*-------------------------------------------------------------------------
2 *
3 * hashpage.c
4 * Hash table page management code for the Postgres hash access method
5 *
6 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 *
10 * IDENTIFICATION
11 * src/backend/access/hash/hashpage.c
12 *
13 * NOTES
 * Postgres hash pages look like ordinary relation pages.  The opaque data
 * at high addresses contains information about the page: whether it is an
 * overflow page or a true bucket page, its bucket number, and the block
 * numbers of the preceding and following pages in the same bucket.
19 *
20 * The first page in a hash relation, page zero, is special -- it stores
21 * information describing the hash table; it is referred to as the
22 * "meta page." Pages one and higher store the actual data.
23 *
24 * There are also bitmap pages, which are not manipulated here;
25 * see hashovfl.c.
26 *
27 *-------------------------------------------------------------------------
28 */
29#include "postgres.h"
30
31#include "access/hash.h"
32#include "access/hash_xlog.h"
33#include "miscadmin.h"
34#include "storage/lmgr.h"
35#include "storage/smgr.h"
36#include "storage/predicate.h"
37
38
39static bool _hash_alloc_buckets(Relation rel, BlockNumber firstblock,
40 uint32 nblocks);
41static void _hash_splitbucket(Relation rel, Buffer metabuf,
42 Bucket obucket, Bucket nbucket,
43 Buffer obuf,
44 Buffer nbuf,
45 HTAB *htab,
46 uint32 maxbucket,
47 uint32 highmask, uint32 lowmask);
48static void log_split_page(Relation rel, Buffer buf);
49
50
51/*
52 * _hash_getbuf() -- Get a buffer by block number for read or write.
53 *
54 * 'access' must be HASH_READ, HASH_WRITE, or HASH_NOLOCK.
55 * 'flags' is a bitwise OR of the allowed page types.
56 *
57 * This must be used only to fetch pages that are expected to be valid
58 * already. _hash_checkpage() is applied using the given flags.
59 *
60 * When this routine returns, the appropriate lock is set on the
61 * requested buffer and its reference count has been incremented
62 * (ie, the buffer is "locked and pinned").
63 *
64 * P_NEW is disallowed because this routine can only be used
65 * to access pages that are known to be before the filesystem EOF.
66 * Extending the index should be done with _hash_getnewbuf.
67 */
68Buffer
69_hash_getbuf(Relation rel, BlockNumber blkno, int access, int flags)
70{
71 Buffer buf;
72
73 if (blkno == P_NEW)
74 elog(ERROR, "hash AM does not use P_NEW");
75
76 buf = ReadBuffer(rel, blkno);
77
78 if (access != HASH_NOLOCK)
79 LockBuffer(buf, access);
80
81 /* ref count and lock type are correct */
82
83 _hash_checkpage(rel, buf, flags);
84
85 return buf;
86}
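
/*
 * A minimal usage sketch (illustrative only, not called anywhere): fetch a
 * page that is already known to be an overflow page, inspect it, and then
 * release lock and pin together with _hash_relbuf():
 *
 *		Buffer	buf = _hash_getbuf(rel, blkno, HASH_READ, LH_OVERFLOW_PAGE);
 *		Page	page = BufferGetPage(buf);
 *		... examine the page contents ...
 *		_hash_relbuf(rel, buf);
 */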
87
88/*
89 * _hash_getbuf_with_condlock_cleanup() -- Try to get a buffer for cleanup.
90 *
91 * We read the page and try to acquire a cleanup lock. If we get it,
92 * we return the buffer; otherwise, we return InvalidBuffer.
93 */
94Buffer
95_hash_getbuf_with_condlock_cleanup(Relation rel, BlockNumber blkno, int flags)
96{
97 Buffer buf;
98
99 if (blkno == P_NEW)
100 elog(ERROR, "hash AM does not use P_NEW");
101
102 buf = ReadBuffer(rel, blkno);
103
104 if (!ConditionalLockBufferForCleanup(buf))
105 {
106 ReleaseBuffer(buf);
107 return InvalidBuffer;
108 }
109
110 /* ref count and lock type are correct */
111
112 _hash_checkpage(rel, buf, flags);
113
114 return buf;
115}
116
117/*
118 * _hash_getinitbuf() -- Get and initialize a buffer by block number.
119 *
120 * This must be used only to fetch pages that are known to be before
121 * the index's filesystem EOF, but are to be filled from scratch.
122 * _hash_pageinit() is applied automatically. Otherwise it has
123 * effects similar to _hash_getbuf() with access = HASH_WRITE.
124 *
125 * When this routine returns, a write lock is set on the
126 * requested buffer and its reference count has been incremented
127 * (ie, the buffer is "locked and pinned").
128 *
129 * P_NEW is disallowed because this routine can only be used
130 * to access pages that are known to be before the filesystem EOF.
131 * Extending the index should be done with _hash_getnewbuf.
132 */
133Buffer
134_hash_getinitbuf(Relation rel, BlockNumber blkno)
135{
136 Buffer buf;
137
138 if (blkno == P_NEW)
139 elog(ERROR, "hash AM does not use P_NEW");
140
141 buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_ZERO_AND_LOCK,
142 NULL);
143
144 /* ref count and lock type are correct */
145
146 /* initialize the page */
147 _hash_pageinit(BufferGetPage(buf), BufferGetPageSize(buf));
148
149 return buf;
150}
151
152/*
 * _hash_initbuf() -- Initialize a hash index page for the given bucket number.
154 */
155void
156_hash_initbuf(Buffer buf, uint32 max_bucket, uint32 num_bucket, uint32 flag,
157 bool initpage)
158{
159 HashPageOpaque pageopaque;
160 Page page;
161
162 page = BufferGetPage(buf);
163
164 /* initialize the page */
165 if (initpage)
166 _hash_pageinit(page, BufferGetPageSize(buf));
167
168 pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
169
	/*
	 * Set hasho_prevblkno to the current hashm_maxbucket.  This value will
	 * be used to validate cached HashMetaPageData.  See
	 * _hash_getbucketbuf_from_hashkey().
	 */
175 pageopaque->hasho_prevblkno = max_bucket;
176 pageopaque->hasho_nextblkno = InvalidBlockNumber;
177 pageopaque->hasho_bucket = num_bucket;
178 pageopaque->hasho_flag = flag;
179 pageopaque->hasho_page_id = HASHO_PAGE_ID;
180}
181
182/*
183 * _hash_getnewbuf() -- Get a new page at the end of the index.
184 *
185 * This has the same API as _hash_getinitbuf, except that we are adding
186 * a page to the index, and hence expect the page to be past the
187 * logical EOF. (However, we have to support the case where it isn't,
188 * since a prior try might have crashed after extending the filesystem
189 * EOF but before updating the metapage to reflect the added page.)
190 *
191 * It is caller's responsibility to ensure that only one process can
192 * extend the index at a time. In practice, this function is called
193 * only while holding write lock on the metapage, because adding a page
194 * is always associated with an update of metapage data.
195 */
196Buffer
197_hash_getnewbuf(Relation rel, BlockNumber blkno, ForkNumber forkNum)
198{
199 BlockNumber nblocks = RelationGetNumberOfBlocksInFork(rel, forkNum);
200 Buffer buf;
201
202 if (blkno == P_NEW)
203 elog(ERROR, "hash AM does not use P_NEW");
204 if (blkno > nblocks)
205 elog(ERROR, "access to noncontiguous page in hash index \"%s\"",
206 RelationGetRelationName(rel));
207
208 /* smgr insists we use P_NEW to extend the relation */
209 if (blkno == nblocks)
210 {
211 buf = ReadBufferExtended(rel, forkNum, P_NEW, RBM_NORMAL, NULL);
212 if (BufferGetBlockNumber(buf) != blkno)
213 elog(ERROR, "unexpected hash relation size: %u, should be %u",
214 BufferGetBlockNumber(buf), blkno);
215 LockBuffer(buf, HASH_WRITE);
216 }
217 else
218 {
219 buf = ReadBufferExtended(rel, forkNum, blkno, RBM_ZERO_AND_LOCK,
220 NULL);
221 }
222
223 /* ref count and lock type are correct */
224
225 /* initialize the page */
226 _hash_pageinit(BufferGetPage(buf), BufferGetPageSize(buf));
227
228 return buf;
229}
230
231/*
232 * _hash_getbuf_with_strategy() -- Get a buffer with nondefault strategy.
233 *
234 * This is identical to _hash_getbuf() but also allows a buffer access
235 * strategy to be specified. We use this for VACUUM operations.
236 */
237Buffer
238_hash_getbuf_with_strategy(Relation rel, BlockNumber blkno,
239 int access, int flags,
240 BufferAccessStrategy bstrategy)
241{
242 Buffer buf;
243
244 if (blkno == P_NEW)
245 elog(ERROR, "hash AM does not use P_NEW");
246
247 buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy);
248
249 if (access != HASH_NOLOCK)
250 LockBuffer(buf, access);
251
252 /* ref count and lock type are correct */
253
254 _hash_checkpage(rel, buf, flags);
255
256 return buf;
257}
258
259/*
260 * _hash_relbuf() -- release a locked buffer.
261 *
262 * Lock and pin (refcount) are both dropped.
263 */
264void
265_hash_relbuf(Relation rel, Buffer buf)
266{
267 UnlockReleaseBuffer(buf);
268}
269
270/*
271 * _hash_dropbuf() -- release an unlocked buffer.
272 *
273 * This is used to unpin a buffer on which we hold no lock.
274 */
275void
276_hash_dropbuf(Relation rel, Buffer buf)
277{
278 ReleaseBuffer(buf);
279}
280
281/*
282 * _hash_dropscanbuf() -- release buffers used in scan.
283 *
284 * This routine unpins the buffers used during scan on which we
285 * hold no lock.
286 */
287void
288_hash_dropscanbuf(Relation rel, HashScanOpaque so)
289{
290 /* release pin we hold on primary bucket page */
291 if (BufferIsValid(so->hashso_bucket_buf) &&
292 so->hashso_bucket_buf != so->currPos.buf)
293 _hash_dropbuf(rel, so->hashso_bucket_buf);
294 so->hashso_bucket_buf = InvalidBuffer;
295
296 /* release pin we hold on primary bucket page of bucket being split */
297 if (BufferIsValid(so->hashso_split_bucket_buf) &&
298 so->hashso_split_bucket_buf != so->currPos.buf)
299 _hash_dropbuf(rel, so->hashso_split_bucket_buf);
300 so->hashso_split_bucket_buf = InvalidBuffer;
301
302 /* release any pin we still hold */
303 if (BufferIsValid(so->currPos.buf))
304 _hash_dropbuf(rel, so->currPos.buf);
305 so->currPos.buf = InvalidBuffer;
306
307 /* reset split scan */
308 so->hashso_buc_populated = false;
309 so->hashso_buc_split = false;
310}
311
312
313/*
314 * _hash_init() -- Initialize the metadata page of a hash index,
315 * the initial buckets, and the initial bitmap page.
316 *
317 * The initial number of buckets is dependent on num_tuples, an estimate
318 * of the number of tuples to be loaded into the index initially. The
319 * chosen number of buckets is returned.
320 *
321 * We are fairly cavalier about locking here, since we know that no one else
322 * could be accessing this index. In particular the rule about not holding
323 * multiple buffer locks is ignored.
324 */
325uint32
326_hash_init(Relation rel, double num_tuples, ForkNumber forkNum)
327{
328 Buffer metabuf;
329 Buffer buf;
330 Buffer bitmapbuf;
331 Page pg;
332 HashMetaPage metap;
333 RegProcedure procid;
334 int32 data_width;
335 int32 item_width;
336 int32 ffactor;
337 uint32 num_buckets;
338 uint32 i;
339 bool use_wal;
340
341 /* safety check */
342 if (RelationGetNumberOfBlocksInFork(rel, forkNum) != 0)
343 elog(ERROR, "cannot initialize non-empty hash index \"%s\"",
344 RelationGetRelationName(rel));
345
346 /*
347 * WAL log creation of pages if the relation is persistent, or this is the
348 * init fork. Init forks for unlogged relations always need to be WAL
349 * logged.
350 */
351 use_wal = RelationNeedsWAL(rel) || forkNum == INIT_FORKNUM;
352
353 /*
354 * Determine the target fill factor (in tuples per bucket) for this index.
355 * The idea is to make the fill factor correspond to pages about as full
356 * as the user-settable fillfactor parameter says. We can compute it
357 * exactly since the index datatype (i.e. uint32 hash key) is fixed-width.
358 */
359 data_width = sizeof(uint32);
360 item_width = MAXALIGN(sizeof(IndexTupleData)) + MAXALIGN(data_width) +
361 sizeof(ItemIdData); /* include the line pointer */
362 ffactor = RelationGetTargetPageUsage(rel, HASH_DEFAULT_FILLFACTOR) / item_width;
363 /* keep to a sane range */
364 if (ffactor < 10)
365 ffactor = 10;
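
	/*
	 * Illustrative arithmetic (assuming a 64-bit build with 8 kB pages and
	 * the default fillfactor of 75): item_width is MAXALIGN(8) + MAXALIGN(4)
	 * + 4 = 20 bytes, the target page usage is about 6144 bytes, and so
	 * ffactor comes out to roughly 300 tuples per bucket.
	 */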
366
367 procid = index_getprocid(rel, 1, HASHSTANDARD_PROC);
368
369 /*
370 * We initialize the metapage, the first N bucket pages, and the first
371 * bitmap page in sequence, using _hash_getnewbuf to cause smgrextend()
372 * calls to occur. This ensures that the smgr level has the right idea of
373 * the physical index length.
374 *
375 * Critical section not required, because on error the creation of the
376 * whole relation will be rolled back.
377 */
378 metabuf = _hash_getnewbuf(rel, HASH_METAPAGE, forkNum);
379 _hash_init_metabuffer(metabuf, num_tuples, procid, ffactor, false);
380 MarkBufferDirty(metabuf);
381
382 pg = BufferGetPage(metabuf);
383 metap = HashPageGetMeta(pg);
384
385 /* XLOG stuff */
386 if (use_wal)
387 {
388 xl_hash_init_meta_page xlrec;
389 XLogRecPtr recptr;
390
391 xlrec.num_tuples = num_tuples;
392 xlrec.procid = metap->hashm_procid;
393 xlrec.ffactor = metap->hashm_ffactor;
394
395 XLogBeginInsert();
396 XLogRegisterData((char *) &xlrec, SizeOfHashInitMetaPage);
397 XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
398
399 recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INIT_META_PAGE);
400
401 PageSetLSN(BufferGetPage(metabuf), recptr);
402 }
403
404 num_buckets = metap->hashm_maxbucket + 1;
405
406 /*
407 * Release buffer lock on the metapage while we initialize buckets.
408 * Otherwise, we'll be in interrupt holdoff and the CHECK_FOR_INTERRUPTS
409 * won't accomplish anything. It's a bad idea to hold buffer locks for
410 * long intervals in any case, since that can block the bgwriter.
411 */
412 LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
413
414 /*
415 * Initialize and WAL Log the first N buckets
416 */
417 for (i = 0; i < num_buckets; i++)
418 {
419 BlockNumber blkno;
420
421 /* Allow interrupts, in case N is huge */
422 CHECK_FOR_INTERRUPTS();
423
424 blkno = BUCKET_TO_BLKNO(metap, i);
425 buf = _hash_getnewbuf(rel, blkno, forkNum);
426 _hash_initbuf(buf, metap->hashm_maxbucket, i, LH_BUCKET_PAGE, false);
427 MarkBufferDirty(buf);
428
429 if (use_wal)
430 log_newpage(&rel->rd_node,
431 forkNum,
432 blkno,
433 BufferGetPage(buf),
434 true);
435 _hash_relbuf(rel, buf);
436 }
437
438 /* Now reacquire buffer lock on metapage */
439 LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
440
441 /*
442 * Initialize bitmap page
443 */
444 bitmapbuf = _hash_getnewbuf(rel, num_buckets + 1, forkNum);
445 _hash_initbitmapbuffer(bitmapbuf, metap->hashm_bmsize, false);
446 MarkBufferDirty(bitmapbuf);
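
	/*
	 * At this point the physical layout of a freshly built index is, for
	 * example with num_buckets = 4: block 0 = metapage, blocks 1..4 =
	 * primary pages of buckets 0..3, block 5 = first bitmap page.
	 */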
447
448 /* add the new bitmap page to the metapage's list of bitmaps */
449 /* metapage already has a write lock */
450 if (metap->hashm_nmaps >= HASH_MAX_BITMAPS)
451 ereport(ERROR,
452 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
453 errmsg("out of overflow pages in hash index \"%s\"",
454 RelationGetRelationName(rel))));
455
456 metap->hashm_mapp[metap->hashm_nmaps] = num_buckets + 1;
457
458 metap->hashm_nmaps++;
459 MarkBufferDirty(metabuf);
460
461 /* XLOG stuff */
462 if (use_wal)
463 {
464 xl_hash_init_bitmap_page xlrec;
465 XLogRecPtr recptr;
466
467 xlrec.bmsize = metap->hashm_bmsize;
468
469 XLogBeginInsert();
470 XLogRegisterData((char *) &xlrec, SizeOfHashInitBitmapPage);
471 XLogRegisterBuffer(0, bitmapbuf, REGBUF_WILL_INIT);
472
473 /*
474 * This is safe only because nobody else can be modifying the index at
475 * this stage; it's only visible to the transaction that is creating
476 * it.
477 */
478 XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD);
479
480 recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INIT_BITMAP_PAGE);
481
482 PageSetLSN(BufferGetPage(bitmapbuf), recptr);
483 PageSetLSN(BufferGetPage(metabuf), recptr);
484 }
485
486 /* all done */
487 _hash_relbuf(rel, bitmapbuf);
488 _hash_relbuf(rel, metabuf);
489
490 return num_buckets;
491}
492
493/*
494 * _hash_init_metabuffer() -- Initialize the metadata page of a hash index.
495 */
496void
497_hash_init_metabuffer(Buffer buf, double num_tuples, RegProcedure procid,
498 uint16 ffactor, bool initpage)
499{
500 HashMetaPage metap;
501 HashPageOpaque pageopaque;
502 Page page;
503 double dnumbuckets;
504 uint32 num_buckets;
505 uint32 spare_index;
506 uint32 i;
507
	/*
	 * Choose the number of initial bucket pages to match the fill factor
	 * given the estimated number of tuples.  We round the result up to the
	 * total number of buckets that would exist at the enclosing splitpoint,
	 * since that many bucket pages must be allocated before the
	 * corresponding hashm_spares element can be used.  However, always
	 * force at least 2 bucket pages.  The upper limit is determined by
	 * considerations explained in _hash_expandtable().
	 */
516 dnumbuckets = num_tuples / ffactor;
517 if (dnumbuckets <= 2.0)
518 num_buckets = 2;
519 else if (dnumbuckets >= (double) 0x40000000)
520 num_buckets = 0x40000000;
521 else
522 num_buckets = _hash_get_totalbuckets(_hash_spareindex(dnumbuckets));
523
524 spare_index = _hash_spareindex(num_buckets);
525 Assert(spare_index < HASH_MAX_SPLITPOINTS);
526
527 page = BufferGetPage(buf);
528 if (initpage)
529 _hash_pageinit(page, BufferGetPageSize(buf));
530
531 pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
532 pageopaque->hasho_prevblkno = InvalidBlockNumber;
533 pageopaque->hasho_nextblkno = InvalidBlockNumber;
534 pageopaque->hasho_bucket = -1;
535 pageopaque->hasho_flag = LH_META_PAGE;
536 pageopaque->hasho_page_id = HASHO_PAGE_ID;
537
538 metap = HashPageGetMeta(page);
539
540 metap->hashm_magic = HASH_MAGIC;
541 metap->hashm_version = HASH_VERSION;
542 metap->hashm_ntuples = 0;
543 metap->hashm_nmaps = 0;
544 metap->hashm_ffactor = ffactor;
545 metap->hashm_bsize = HashGetMaxBitmapSize(page);
546 /* find largest bitmap array size that will fit in page size */
547 for (i = _hash_log2(metap->hashm_bsize); i > 0; --i)
548 {
549 if ((1 << i) <= metap->hashm_bsize)
550 break;
551 }
552 Assert(i > 0);
553 metap->hashm_bmsize = 1 << i;
554 metap->hashm_bmshift = i + BYTE_TO_BIT;
555 Assert((1 << BMPG_SHIFT(metap)) == (BMPG_MASK(metap) + 1));
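
	/*
	 * For illustration (assuming 8 kB pages): hashm_bsize is a bit under
	 * BLCKSZ after page-header and special-space overhead, so the largest
	 * power of two that fits is 4096; hashm_bmsize is then 4096 bytes,
	 * hashm_bmshift is 12 + BYTE_TO_BIT = 15, and each bitmap page tracks
	 * 2^15 = 32768 overflow pages.
	 */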
556
557 /*
558 * Label the index with its primary hash support function's OID. This is
559 * pretty useless for normal operation (in fact, hashm_procid is not used
560 * anywhere), but it might be handy for forensic purposes so we keep it.
561 */
562 metap->hashm_procid = procid;
563
564 /*
565 * We initialize the index with N buckets, 0 .. N-1, occupying physical
566 * blocks 1 to N. The first freespace bitmap page is in block N+1.
567 */
568 metap->hashm_maxbucket = num_buckets - 1;
569
	/*
	 * Set highmask to the next ((2 ^ x) - 1) value that is large enough to
	 * cover num_buckets; lowmask is the preceding such mask.
	 */
574 metap->hashm_highmask = (1 << (_hash_log2(num_buckets + 1))) - 1;
575 metap->hashm_lowmask = (metap->hashm_highmask >> 1);
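
	/*
	 * Worked example (illustrative): with num_tuples / ffactor around 300,
	 * the bucket count is rounded up to the enclosing splitpoint, giving
	 * num_buckets = 512; then hashm_maxbucket = 511, hashm_highmask = 1023
	 * and hashm_lowmask = 511.
	 */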
576
577 MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares));
578 MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp));
579
580 /* Set up mapping for one spare page after the initial splitpoints */
581 metap->hashm_spares[spare_index] = 1;
582 metap->hashm_ovflpoint = spare_index;
583 metap->hashm_firstfree = 0;
584
585 /*
586 * Set pd_lower just past the end of the metadata. This is essential,
587 * because without doing so, metadata will be lost if xlog.c compresses
588 * the page.
589 */
590 ((PageHeader) page)->pd_lower =
591 ((char *) metap + sizeof(HashMetaPageData)) - (char *) page;
592}
593
594/*
595 * _hash_pageinit() -- Initialize a new hash index page.
596 */
597void
598_hash_pageinit(Page page, Size size)
599{
600 PageInit(page, size, sizeof(HashPageOpaqueData));
601}
602
603/*
604 * Attempt to expand the hash table by creating one new bucket.
605 *
 * This will silently do nothing if we don't get a cleanup lock on the old
 * or new bucket.
 *
 * Complete any pending split and remove any tuples left over in the old
 * bucket from the previous split.
611 *
612 * The caller must hold a pin, but no lock, on the metapage buffer.
613 * The buffer is returned in the same state.
614 */
615void
616_hash_expandtable(Relation rel, Buffer metabuf)
617{
618 HashMetaPage metap;
619 Bucket old_bucket;
620 Bucket new_bucket;
621 uint32 spare_ndx;
622 BlockNumber start_oblkno;
623 BlockNumber start_nblkno;
624 Buffer buf_nblkno;
625 Buffer buf_oblkno;
626 Page opage;
627 Page npage;
628 HashPageOpaque oopaque;
629 HashPageOpaque nopaque;
630 uint32 maxbucket;
631 uint32 highmask;
632 uint32 lowmask;
633 bool metap_update_masks = false;
634 bool metap_update_splitpoint = false;
635
636restart_expand:
637
638 /*
639 * Write-lock the meta page. It used to be necessary to acquire a
640 * heavyweight lock to begin a split, but that is no longer required.
641 */
642 LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
643
644 _hash_checkpage(rel, metabuf, LH_META_PAGE);
645 metap = HashPageGetMeta(BufferGetPage(metabuf));
646
647 /*
648 * Check to see if split is still needed; someone else might have already
649 * done one while we waited for the lock.
650 *
651 * Make sure this stays in sync with _hash_doinsert()
652 */
653 if (metap->hashm_ntuples <=
654 (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1))
655 goto fail;
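
	/*
	 * For example (illustrative numbers): with hashm_ffactor = 300 and
	 * hashm_maxbucket = 511, a split is attempted only once hashm_ntuples
	 * exceeds 300 * 512 = 153600.
	 */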
656
657 /*
658 * Can't split anymore if maxbucket has reached its maximum possible
659 * value.
660 *
661 * Ideally we'd allow bucket numbers up to UINT_MAX-1 (no higher because
662 * the calculation maxbucket+1 mustn't overflow). Currently we restrict
663 * to half that because of overflow looping in _hash_log2() and
664 * insufficient space in hashm_spares[]. It's moot anyway because an
665 * index with 2^32 buckets would certainly overflow BlockNumber and hence
666 * _hash_alloc_buckets() would fail, but if we supported buckets smaller
667 * than a disk block then this would be an independent constraint.
668 *
669 * If you change this, see also the maximum initial number of buckets in
670 * _hash_init().
671 */
672 if (metap->hashm_maxbucket >= (uint32) 0x7FFFFFFE)
673 goto fail;
674
675 /*
676 * Determine which bucket is to be split, and attempt to take cleanup lock
677 * on the old bucket. If we can't get the lock, give up.
678 *
679 * The cleanup lock protects us not only against other backends, but
680 * against our own backend as well.
681 *
682 * The cleanup lock is mainly to protect the split from concurrent
683 * inserts. See src/backend/access/hash/README, Lock Definitions for
	 * further details.  Due to this locking restriction, if there is any
	 * pending scan, the split will give up, which is not good but harmless.
686 */
687 new_bucket = metap->hashm_maxbucket + 1;
688
689 old_bucket = (new_bucket & metap->hashm_lowmask);
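
	/*
	 * Illustrative example: if hashm_maxbucket is 511 (and hashm_lowmask is
	 * 511), then new_bucket is 512 and old_bucket is 512 & 511 = 0, i.e. we
	 * split bucket 0 and move into bucket 512 those of its tuples whose hash
	 * values map there under the enlarged masks.
	 */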
690
691 start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket);
692
693 buf_oblkno = _hash_getbuf_with_condlock_cleanup(rel, start_oblkno, LH_BUCKET_PAGE);
694 if (!buf_oblkno)
695 goto fail;
696
697 opage = BufferGetPage(buf_oblkno);
698 oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
699
	/*
	 * We want to finish any incomplete split of the old bucket before
	 * proceeding: there is no apparent benefit to deferring it, and allowing
	 * splits of multiple buckets to remain unfinished (in case the new split
	 * also fails) would complicate the code.  We need not consider the new
	 * bucket here, because a re-split of the new bucket cannot start while
	 * there is still a pending split from the old bucket.
	 */
708 if (H_BUCKET_BEING_SPLIT(oopaque))
709 {
		/*
		 * Copy bucket mapping info now; refer to the comment below where we
		 * copy this information before calling _hash_splitbucket to see why
		 * this is okay.
		 */
715 maxbucket = metap->hashm_maxbucket;
716 highmask = metap->hashm_highmask;
717 lowmask = metap->hashm_lowmask;
718
719 /*
720 * Release the lock on metapage and old_bucket, before completing the
721 * split.
722 */
723 LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
724 LockBuffer(buf_oblkno, BUFFER_LOCK_UNLOCK);
725
726 _hash_finish_split(rel, metabuf, buf_oblkno, old_bucket, maxbucket,
727 highmask, lowmask);
728
729 /* release the pin on old buffer and retry for expand. */
730 _hash_dropbuf(rel, buf_oblkno);
731
732 goto restart_expand;
733 }
734
	/*
	 * Clean up the tuples remaining from the previous split.  This operation
	 * requires a cleanup lock, and we already have one on the old bucket, so
	 * let's do it.  We also don't want to allow further splits from this
	 * bucket until the garbage of the previous split is cleaned.  This has
	 * two advantages: first, it helps avoid bloat due to garbage, and
	 * second, during cleanup of a bucket we can always be sure that the
	 * garbage tuples belong to the most recently split bucket.  If, instead,
	 * we allowed cleanup after the meta page had been updated to reflect a
	 * new split but before the actual split, the cleanup operation could not
	 * decide whether a tuple had already been moved to the newly created
	 * bucket, and might end up deleting such tuples.
	 */
748 if (H_NEEDS_SPLIT_CLEANUP(oopaque))
749 {
750 /*
751 * Copy bucket mapping info now; refer to the comment in code below
752 * where we copy this information before calling _hash_splitbucket to
753 * see why this is okay.
754 */
755 maxbucket = metap->hashm_maxbucket;
756 highmask = metap->hashm_highmask;
757 lowmask = metap->hashm_lowmask;
758
759 /* Release the metapage lock. */
760 LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
761
762 hashbucketcleanup(rel, old_bucket, buf_oblkno, start_oblkno, NULL,
763 maxbucket, highmask, lowmask, NULL, NULL, true,
764 NULL, NULL);
765
766 _hash_dropbuf(rel, buf_oblkno);
767
768 goto restart_expand;
769 }
770
771 /*
772 * There shouldn't be any active scan on new bucket.
773 *
774 * Note: it is safe to compute the new bucket's blkno here, even though we
775 * may still need to update the BUCKET_TO_BLKNO mapping. This is because
776 * the current value of hashm_spares[hashm_ovflpoint] correctly shows
777 * where we are going to put a new splitpoint's worth of buckets.
778 */
779 start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket);
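
	/*
	 * As a rough illustration of the mapping: bucket 0 always lives in block
	 * 1, and a higher-numbered bucket B lives at block B + 1 plus the count
	 * of overflow/bitmap pages recorded in hashm_spares for the splitpoints
	 * preceding B, which is what BUCKET_TO_BLKNO computes from the metapage.
	 */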
780
781 /*
782 * If the split point is increasing we need to allocate a new batch of
783 * bucket pages.
784 */
785 spare_ndx = _hash_spareindex(new_bucket + 1);
786 if (spare_ndx > metap->hashm_ovflpoint)
787 {
788 uint32 buckets_to_add;
789
790 Assert(spare_ndx == metap->hashm_ovflpoint + 1);
791
		/*
		 * We treat allocation of buckets as a separate WAL-logged action.
		 * Even if we fail after this operation, we won't leak bucket pages;
		 * rather, the next split will consume this space.  In any case, even
		 * without failure we don't use all the space in one split operation.
		 */
798 buckets_to_add = _hash_get_totalbuckets(spare_ndx) - new_bucket;
799 if (!_hash_alloc_buckets(rel, start_nblkno, buckets_to_add))
800 {
801 /* can't split due to BlockNumber overflow */
802 _hash_relbuf(rel, buf_oblkno);
803 goto fail;
804 }
805 }
806
807 /*
808 * Physically allocate the new bucket's primary page. We want to do this
809 * before changing the metapage's mapping info, in case we can't get the
810 * disk space. Ideally, we don't need to check for cleanup lock on new
811 * bucket as no other backend could find this bucket unless meta page is
812 * updated. However, it is good to be consistent with old bucket locking.
813 */
814 buf_nblkno = _hash_getnewbuf(rel, start_nblkno, MAIN_FORKNUM);
815 if (!IsBufferCleanupOK(buf_nblkno))
816 {
817 _hash_relbuf(rel, buf_oblkno);
818 _hash_relbuf(rel, buf_nblkno);
819 goto fail;
820 }
821
822 /*
823 * Since we are scribbling on the pages in the shared buffers, establish a
824 * critical section. Any failure in this next code leaves us with a big
825 * problem: the metapage is effectively corrupt but could get written back
826 * to disk.
827 */
828 START_CRIT_SECTION();
829
830 /*
831 * Okay to proceed with split. Update the metapage bucket mapping info.
832 */
833 metap->hashm_maxbucket = new_bucket;
834
835 if (new_bucket > metap->hashm_highmask)
836 {
837 /* Starting a new doubling */
838 metap->hashm_lowmask = metap->hashm_highmask;
839 metap->hashm_highmask = new_bucket | metap->hashm_lowmask;
840 metap_update_masks = true;
841 }
842
843 /*
844 * If the split point is increasing we need to adjust the hashm_spares[]
845 * array and hashm_ovflpoint so that future overflow pages will be created
846 * beyond this new batch of bucket pages.
847 */
848 if (spare_ndx > metap->hashm_ovflpoint)
849 {
850 metap->hashm_spares[spare_ndx] = metap->hashm_spares[metap->hashm_ovflpoint];
851 metap->hashm_ovflpoint = spare_ndx;
852 metap_update_splitpoint = true;
853 }
854
855 MarkBufferDirty(metabuf);
856
857 /*
858 * Copy bucket mapping info now; this saves re-accessing the meta page
859 * inside _hash_splitbucket's inner loop. Note that once we drop the
860 * split lock, other splits could begin, so these values might be out of
861 * date before _hash_splitbucket finishes. That's okay, since all it
862 * needs is to tell which of these two buckets to map hashkeys into.
863 */
864 maxbucket = metap->hashm_maxbucket;
865 highmask = metap->hashm_highmask;
866 lowmask = metap->hashm_lowmask;
867
868 opage = BufferGetPage(buf_oblkno);
869 oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
870
871 /*
872 * Mark the old bucket to indicate that split is in progress. (At
873 * operation end, we will clear the split-in-progress flag.) Also, for a
874 * primary bucket page, hasho_prevblkno stores the number of buckets that
875 * existed as of the last split, so we must update that value here.
876 */
877 oopaque->hasho_flag |= LH_BUCKET_BEING_SPLIT;
878 oopaque->hasho_prevblkno = maxbucket;
879
880 MarkBufferDirty(buf_oblkno);
881
882 npage = BufferGetPage(buf_nblkno);
883
884 /*
885 * initialize the new bucket's primary page and mark it to indicate that
886 * split is in progress.
887 */
888 nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
889 nopaque->hasho_prevblkno = maxbucket;
890 nopaque->hasho_nextblkno = InvalidBlockNumber;
891 nopaque->hasho_bucket = new_bucket;
892 nopaque->hasho_flag = LH_BUCKET_PAGE | LH_BUCKET_BEING_POPULATED;
893 nopaque->hasho_page_id = HASHO_PAGE_ID;
894
895 MarkBufferDirty(buf_nblkno);
896
897 /* XLOG stuff */
898 if (RelationNeedsWAL(rel))
899 {
900 xl_hash_split_allocate_page xlrec;
901 XLogRecPtr recptr;
902
903 xlrec.new_bucket = maxbucket;
904 xlrec.old_bucket_flag = oopaque->hasho_flag;
905 xlrec.new_bucket_flag = nopaque->hasho_flag;
906 xlrec.flags = 0;
907
908 XLogBeginInsert();
909
910 XLogRegisterBuffer(0, buf_oblkno, REGBUF_STANDARD);
911 XLogRegisterBuffer(1, buf_nblkno, REGBUF_WILL_INIT);
912 XLogRegisterBuffer(2, metabuf, REGBUF_STANDARD);
913
914 if (metap_update_masks)
915 {
916 xlrec.flags |= XLH_SPLIT_META_UPDATE_MASKS;
917 XLogRegisterBufData(2, (char *) &metap->hashm_lowmask, sizeof(uint32));
918 XLogRegisterBufData(2, (char *) &metap->hashm_highmask, sizeof(uint32));
919 }
920
921 if (metap_update_splitpoint)
922 {
923 xlrec.flags |= XLH_SPLIT_META_UPDATE_SPLITPOINT;
924 XLogRegisterBufData(2, (char *) &metap->hashm_ovflpoint,
925 sizeof(uint32));
926 XLogRegisterBufData(2,
927 (char *) &metap->hashm_spares[metap->hashm_ovflpoint],
928 sizeof(uint32));
929 }
930
931 XLogRegisterData((char *) &xlrec, SizeOfHashSplitAllocPage);
932
933 recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_ALLOCATE_PAGE);
934
935 PageSetLSN(BufferGetPage(buf_oblkno), recptr);
936 PageSetLSN(BufferGetPage(buf_nblkno), recptr);
937 PageSetLSN(BufferGetPage(metabuf), recptr);
938 }
939
940 END_CRIT_SECTION();
941
942 /* drop lock, but keep pin */
943 LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
944
945 /* Relocate records to the new bucket */
946 _hash_splitbucket(rel, metabuf,
947 old_bucket, new_bucket,
948 buf_oblkno, buf_nblkno, NULL,
949 maxbucket, highmask, lowmask);
950
951 /* all done, now release the pins on primary buckets. */
952 _hash_dropbuf(rel, buf_oblkno);
953 _hash_dropbuf(rel, buf_nblkno);
954
955 return;
956
957 /* Here if decide not to split or fail to acquire old bucket lock */
958fail:
959
960 /* We didn't write the metapage, so just drop lock */
961 LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
962}
963
964
965/*
966 * _hash_alloc_buckets -- allocate a new splitpoint's worth of bucket pages
967 *
968 * This does not need to initialize the new bucket pages; we'll do that as
969 * each one is used by _hash_expandtable(). But we have to extend the logical
970 * EOF to the end of the splitpoint; this keeps smgr's idea of the EOF in
971 * sync with ours, so that we don't get complaints from smgr.
972 *
973 * We do this by writing a page of zeroes at the end of the splitpoint range.
974 * We expect that the filesystem will ensure that the intervening pages read
975 * as zeroes too. On many filesystems this "hole" will not be allocated
976 * immediately, which means that the index file may end up more fragmented
977 * than if we forced it all to be allocated now; but since we don't scan
978 * hash indexes sequentially anyway, that probably doesn't matter.
979 *
980 * XXX It's annoying that this code is executed with the metapage lock held.
981 * We need to interlock against _hash_addovflpage() adding a new overflow page
982 * concurrently, but it'd likely be better to use LockRelationForExtension
983 * for the purpose. OTOH, adding a splitpoint is a very infrequent operation,
984 * so it may not be worth worrying about.
985 *
986 * Returns true if successful, or false if allocation failed due to
987 * BlockNumber overflow.
988 */
989static bool
990_hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks)
991{
992 BlockNumber lastblock;
993 PGAlignedBlock zerobuf;
994 Page page;
995 HashPageOpaque ovflopaque;
996
997 lastblock = firstblock + nblocks - 1;
998
999 /*
1000 * Check for overflow in block number calculation; if so, we cannot extend
1001 * the index anymore.
1002 */
1003 if (lastblock < firstblock || lastblock == InvalidBlockNumber)
1004 return false;
1005
1006 page = (Page) zerobuf.data;
1007
1008 /*
1009 * Initialize the page. Just zeroing the page won't work; see
1010 * _hash_freeovflpage for similar usage. We take care to make the special
1011 * space valid for the benefit of tools such as pageinspect.
1012 */
1013 _hash_pageinit(page, BLCKSZ);
1014
1015 ovflopaque = (HashPageOpaque) PageGetSpecialPointer(page);
1016
1017 ovflopaque->hasho_prevblkno = InvalidBlockNumber;
1018 ovflopaque->hasho_nextblkno = InvalidBlockNumber;
1019 ovflopaque->hasho_bucket = -1;
1020 ovflopaque->hasho_flag = LH_UNUSED_PAGE;
1021 ovflopaque->hasho_page_id = HASHO_PAGE_ID;
1022
1023 if (RelationNeedsWAL(rel))
1024 log_newpage(&rel->rd_node,
1025 MAIN_FORKNUM,
1026 lastblock,
1027 zerobuf.data,
1028 true);
1029
1030 RelationOpenSmgr(rel);
1031 PageSetChecksumInplace(page, lastblock);
1032 smgrextend(rel->rd_smgr, MAIN_FORKNUM, lastblock, zerobuf.data, false);
1033
1034 return true;
1035}
1036
1037
1038/*
1039 * _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket'
1040 *
 * This routine is used to partition the tuples between the old and new
 * buckets, and also to finish an incomplete split operation.  To finish a
 * previously interrupted split, the caller must fill htab with the TIDs
 * already moved; tuples that exist in htab are then skipped.  A NULL htab
 * means all tuples belonging to the new bucket are moved.
1047 *
1048 * We are splitting a bucket that consists of a base bucket page and zero
1049 * or more overflow (bucket chain) pages. We must relocate tuples that
1050 * belong in the new bucket.
1051 *
1052 * The caller must hold cleanup locks on both buckets to ensure that
1053 * no one else is trying to access them (see README).
1054 *
1055 * The caller must hold a pin, but no lock, on the metapage buffer.
1056 * The buffer is returned in the same state. (The metapage is only
1057 * touched if it becomes necessary to add or remove overflow pages.)
1058 *
 * The split needs to retain pins on the primary bucket pages of both the old
 * and new buckets until the end of the operation, to prevent vacuum from
 * starting while a split is in progress.
1062 *
1063 * In addition, the caller must have created the new bucket's base page,
1064 * which is passed in buffer nbuf, pinned and write-locked. The lock will be
1065 * released here and pin must be released by the caller. (The API is set up
1066 * this way because we must do _hash_getnewbuf() before releasing the metapage
1067 * write lock. So instead of passing the new bucket's start block number, we
1068 * pass an actual buffer.)
1069 */
1070static void
1071_hash_splitbucket(Relation rel,
1072 Buffer metabuf,
1073 Bucket obucket,
1074 Bucket nbucket,
1075 Buffer obuf,
1076 Buffer nbuf,
1077 HTAB *htab,
1078 uint32 maxbucket,
1079 uint32 highmask,
1080 uint32 lowmask)
1081{
1082 Buffer bucket_obuf;
1083 Buffer bucket_nbuf;
1084 Page opage;
1085 Page npage;
1086 HashPageOpaque oopaque;
1087 HashPageOpaque nopaque;
1088 OffsetNumber itup_offsets[MaxIndexTuplesPerPage];
1089 IndexTuple itups[MaxIndexTuplesPerPage];
1090 Size all_tups_size = 0;
1091 int i;
1092 uint16 nitups = 0;
1093
1094 bucket_obuf = obuf;
1095 opage = BufferGetPage(obuf);
1096 oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
1097
1098 bucket_nbuf = nbuf;
1099 npage = BufferGetPage(nbuf);
1100 nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
1101
1102 /* Copy the predicate locks from old bucket to new bucket. */
1103 PredicateLockPageSplit(rel,
1104 BufferGetBlockNumber(bucket_obuf),
1105 BufferGetBlockNumber(bucket_nbuf));
1106
1107 /*
1108 * Partition the tuples in the old bucket between the old bucket and the
1109 * new bucket, advancing along the old bucket's overflow bucket chain and
1110 * adding overflow pages to the new bucket as needed. Outer loop iterates
1111 * once per page in old bucket.
1112 */
1113 for (;;)
1114 {
1115 BlockNumber oblkno;
1116 OffsetNumber ooffnum;
1117 OffsetNumber omaxoffnum;
1118
1119 /* Scan each tuple in old page */
1120 omaxoffnum = PageGetMaxOffsetNumber(opage);
1121 for (ooffnum = FirstOffsetNumber;
1122 ooffnum <= omaxoffnum;
1123 ooffnum = OffsetNumberNext(ooffnum))
1124 {
1125 IndexTuple itup;
1126 Size itemsz;
1127 Bucket bucket;
1128 bool found = false;
1129
1130 /* skip dead tuples */
1131 if (ItemIdIsDead(PageGetItemId(opage, ooffnum)))
1132 continue;
1133
			/*
			 * Before inserting a tuple, probe the hash table containing the
			 * TIDs of tuples belonging to the new bucket.  If we find a
			 * match, skip that tuple; otherwise fetch the item's hash key
			 * (conveniently stored in the item) and determine which bucket
			 * it now belongs in.
			 */
1141 itup = (IndexTuple) PageGetItem(opage,
1142 PageGetItemId(opage, ooffnum));
1143
1144 if (htab)
1145 (void) hash_search(htab, &itup->t_tid, HASH_FIND, &found);
1146
1147 if (found)
1148 continue;
1149
1150 bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
1151 maxbucket, highmask, lowmask);
1152
1153 if (bucket == nbucket)
1154 {
1155 IndexTuple new_itup;
1156
1157 /*
1158 * make a copy of index tuple as we have to scribble on it.
1159 */
1160 new_itup = CopyIndexTuple(itup);
1161
1162 /*
1163 * mark the index tuple as moved by split, such tuples are
1164 * skipped by scan if there is split in progress for a bucket.
1165 */
1166 new_itup->t_info |= INDEX_MOVED_BY_SPLIT_MASK;
1167
1168 /*
1169 * insert the tuple into the new bucket. if it doesn't fit on
1170 * the current page in the new bucket, we must allocate a new
1171 * overflow page and place the tuple on that page instead.
1172 */
1173 itemsz = IndexTupleSize(new_itup);
1174 itemsz = MAXALIGN(itemsz);
1175
1176 if (PageGetFreeSpaceForMultipleTuples(npage, nitups + 1) < (all_tups_size + itemsz))
1177 {
1178 /*
1179 * Change the shared buffer state in critical section,
1180 * otherwise any error could make it unrecoverable.
1181 */
1182 START_CRIT_SECTION();
1183
1184 _hash_pgaddmultitup(rel, nbuf, itups, itup_offsets, nitups);
1185 MarkBufferDirty(nbuf);
1186 /* log the split operation before releasing the lock */
1187 log_split_page(rel, nbuf);
1188
1189 END_CRIT_SECTION();
1190
1191 /* drop lock, but keep pin */
1192 LockBuffer(nbuf, BUFFER_LOCK_UNLOCK);
1193
1194 /* be tidy */
1195 for (i = 0; i < nitups; i++)
1196 pfree(itups[i]);
1197 nitups = 0;
1198 all_tups_size = 0;
1199
1200 /* chain to a new overflow page */
1201 nbuf = _hash_addovflpage(rel, metabuf, nbuf, (nbuf == bucket_nbuf) ? true : false);
1202 npage = BufferGetPage(nbuf);
1203 nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
1204 }
1205
1206 itups[nitups++] = new_itup;
1207 all_tups_size += itemsz;
1208 }
1209 else
1210 {
1211 /*
1212 * the tuple stays on this page, so nothing to do.
1213 */
1214 Assert(bucket == obucket);
1215 }
1216 }
1217
1218 oblkno = oopaque->hasho_nextblkno;
1219
1220 /* retain the pin on the old primary bucket */
1221 if (obuf == bucket_obuf)
1222 LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
1223 else
1224 _hash_relbuf(rel, obuf);
1225
1226 /* Exit loop if no more overflow pages in old bucket */
1227 if (!BlockNumberIsValid(oblkno))
1228 {
1229 /*
1230 * Change the shared buffer state in critical section, otherwise
1231 * any error could make it unrecoverable.
1232 */
1233 START_CRIT_SECTION();
1234
1235 _hash_pgaddmultitup(rel, nbuf, itups, itup_offsets, nitups);
1236 MarkBufferDirty(nbuf);
1237 /* log the split operation before releasing the lock */
1238 log_split_page(rel, nbuf);
1239
1240 END_CRIT_SECTION();
1241
1242 if (nbuf == bucket_nbuf)
1243 LockBuffer(nbuf, BUFFER_LOCK_UNLOCK);
1244 else
1245 _hash_relbuf(rel, nbuf);
1246
1247 /* be tidy */
1248 for (i = 0; i < nitups; i++)
1249 pfree(itups[i]);
1250 break;
1251 }
1252
1253 /* Else, advance to next old page */
1254 obuf = _hash_getbuf(rel, oblkno, HASH_READ, LH_OVERFLOW_PAGE);
1255 opage = BufferGetPage(obuf);
1256 oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
1257 }
1258
1259 /*
1260 * We're at the end of the old bucket chain, so we're done partitioning
1261 * the tuples. Mark the old and new buckets to indicate split is
1262 * finished.
1263 *
1264 * To avoid deadlocks due to locking order of buckets, first lock the old
1265 * bucket and then the new bucket.
1266 */
1267 LockBuffer(bucket_obuf, BUFFER_LOCK_EXCLUSIVE);
1268 opage = BufferGetPage(bucket_obuf);
1269 oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
1270
1271 LockBuffer(bucket_nbuf, BUFFER_LOCK_EXCLUSIVE);
1272 npage = BufferGetPage(bucket_nbuf);
1273 nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
1274
1275 START_CRIT_SECTION();
1276
1277 oopaque->hasho_flag &= ~LH_BUCKET_BEING_SPLIT;
1278 nopaque->hasho_flag &= ~LH_BUCKET_BEING_POPULATED;
1279
1280 /*
1281 * After the split is finished, mark the old bucket to indicate that it
1282 * contains deletable tuples. We will clear split-cleanup flag after
1283 * deleting such tuples either at the end of split or at the next split
1284 * from old bucket or at the time of vacuum.
1285 */
1286 oopaque->hasho_flag |= LH_BUCKET_NEEDS_SPLIT_CLEANUP;
1287
	/*
	 * Now write the buffers.  We don't release the locks here, since they
	 * are still needed just below; releasing the pins is the caller's
	 * responsibility.
	 */
1292 MarkBufferDirty(bucket_obuf);
1293 MarkBufferDirty(bucket_nbuf);
1294
1295 if (RelationNeedsWAL(rel))
1296 {
1297 XLogRecPtr recptr;
1298 xl_hash_split_complete xlrec;
1299
1300 xlrec.old_bucket_flag = oopaque->hasho_flag;
1301 xlrec.new_bucket_flag = nopaque->hasho_flag;
1302
1303 XLogBeginInsert();
1304
1305 XLogRegisterData((char *) &xlrec, SizeOfHashSplitComplete);
1306
1307 XLogRegisterBuffer(0, bucket_obuf, REGBUF_STANDARD);
1308 XLogRegisterBuffer(1, bucket_nbuf, REGBUF_STANDARD);
1309
1310 recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_COMPLETE);
1311
1312 PageSetLSN(BufferGetPage(bucket_obuf), recptr);
1313 PageSetLSN(BufferGetPage(bucket_nbuf), recptr);
1314 }
1315
1316 END_CRIT_SECTION();
1317
1318 /*
1319 * If possible, clean up the old bucket. We might not be able to do this
1320 * if someone else has a pin on it, but if not then we can go ahead. This
1321 * isn't absolutely necessary, but it reduces bloat; if we don't do it
1322 * now, VACUUM will do it eventually, but maybe not until new overflow
1323 * pages have been allocated. Note that there's no need to clean up the
1324 * new bucket.
1325 */
1326 if (IsBufferCleanupOK(bucket_obuf))
1327 {
1328 LockBuffer(bucket_nbuf, BUFFER_LOCK_UNLOCK);
1329 hashbucketcleanup(rel, obucket, bucket_obuf,
1330 BufferGetBlockNumber(bucket_obuf), NULL,
1331 maxbucket, highmask, lowmask, NULL, NULL, true,
1332 NULL, NULL);
1333 }
1334 else
1335 {
1336 LockBuffer(bucket_nbuf, BUFFER_LOCK_UNLOCK);
1337 LockBuffer(bucket_obuf, BUFFER_LOCK_UNLOCK);
1338 }
1339}
1340
1341/*
1342 * _hash_finish_split() -- Finish the previously interrupted split operation
1343 *
 * To complete the split operation, we build a hash table of the TIDs already
 * present in the new bucket; the split then uses it to skip tuples that had
 * already been moved before the operation was interrupted.
1347 *
1348 * The caller must hold a pin, but no lock, on the metapage and old bucket's
1349 * primary page buffer. The buffers are returned in the same state. (The
1350 * metapage is only touched if it becomes necessary to add or remove overflow
1351 * pages.)
1352 */
1353void
1354_hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf, Bucket obucket,
1355 uint32 maxbucket, uint32 highmask, uint32 lowmask)
1356{
1357 HASHCTL hash_ctl;
1358 HTAB *tidhtab;
1359 Buffer bucket_nbuf = InvalidBuffer;
1360 Buffer nbuf;
1361 Page npage;
1362 BlockNumber nblkno;
1363 BlockNumber bucket_nblkno;
1364 HashPageOpaque npageopaque;
1365 Bucket nbucket;
1366 bool found;
1367
	/* Initialize the hash table used to track TIDs */
1369 memset(&hash_ctl, 0, sizeof(hash_ctl));
1370 hash_ctl.keysize = sizeof(ItemPointerData);
1371 hash_ctl.entrysize = sizeof(ItemPointerData);
1372 hash_ctl.hcxt = CurrentMemoryContext;
1373
1374 tidhtab =
1375 hash_create("bucket ctids",
1376 256, /* arbitrary initial size */
1377 &hash_ctl,
1378 HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
1379
1380 bucket_nblkno = nblkno = _hash_get_newblock_from_oldbucket(rel, obucket);
1381
1382 /*
1383 * Scan the new bucket and build hash table of TIDs
1384 */
1385 for (;;)
1386 {
1387 OffsetNumber noffnum;
1388 OffsetNumber nmaxoffnum;
1389
1390 nbuf = _hash_getbuf(rel, nblkno, HASH_READ,
1391 LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
1392
1393 /* remember the primary bucket buffer to acquire cleanup lock on it. */
1394 if (nblkno == bucket_nblkno)
1395 bucket_nbuf = nbuf;
1396
1397 npage = BufferGetPage(nbuf);
1398 npageopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
1399
1400 /* Scan each tuple in new page */
1401 nmaxoffnum = PageGetMaxOffsetNumber(npage);
1402 for (noffnum = FirstOffsetNumber;
1403 noffnum <= nmaxoffnum;
1404 noffnum = OffsetNumberNext(noffnum))
1405 {
1406 IndexTuple itup;
1407
1408 /* Fetch the item's TID and insert it in hash table. */
1409 itup = (IndexTuple) PageGetItem(npage,
1410 PageGetItemId(npage, noffnum));
1411
1412 (void) hash_search(tidhtab, &itup->t_tid, HASH_ENTER, &found);
1413
1414 Assert(!found);
1415 }
1416
1417 nblkno = npageopaque->hasho_nextblkno;
1418
		/*
		 * Release our read lock without modifying the buffer, taking care to
		 * retain the pin on the primary bucket page.
		 */
1423 if (nbuf == bucket_nbuf)
1424 LockBuffer(nbuf, BUFFER_LOCK_UNLOCK);
1425 else
1426 _hash_relbuf(rel, nbuf);
1427
1428 /* Exit loop if no more overflow pages in new bucket */
1429 if (!BlockNumberIsValid(nblkno))
1430 break;
1431 }
1432
	/*
	 * Conditionally get the cleanup lock on the old and new buckets to
	 * perform the split operation.  If we don't get the cleanup locks,
	 * silently give up; the next insertion into the old bucket will try
	 * again to complete the split.
	 */
1439 if (!ConditionalLockBufferForCleanup(obuf))
1440 {
1441 hash_destroy(tidhtab);
1442 return;
1443 }
1444 if (!ConditionalLockBufferForCleanup(bucket_nbuf))
1445 {
1446 LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
1447 hash_destroy(tidhtab);
1448 return;
1449 }
1450
1451 npage = BufferGetPage(bucket_nbuf);
1452 npageopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
1453 nbucket = npageopaque->hasho_bucket;
1454
1455 _hash_splitbucket(rel, metabuf, obucket,
1456 nbucket, obuf, bucket_nbuf, tidhtab,
1457 maxbucket, highmask, lowmask);
1458
1459 _hash_dropbuf(rel, bucket_nbuf);
1460 hash_destroy(tidhtab);
1461}
1462
1463/*
1464 * log_split_page() -- Log the split operation
1465 *
1466 * We log the split operation when the new page in new bucket gets full,
1467 * so we log the entire page.
1468 *
1469 * 'buf' must be locked by the caller which is also responsible for unlocking
1470 * it.
1471 */
1472static void
1473log_split_page(Relation rel, Buffer buf)
1474{
1475 if (RelationNeedsWAL(rel))
1476 {
1477 XLogRecPtr recptr;
1478
1479 XLogBeginInsert();
1480
1481 XLogRegisterBuffer(0, buf, REGBUF_FORCE_IMAGE | REGBUF_STANDARD);
1482
1483 recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_PAGE);
1484
1485 PageSetLSN(BufferGetPage(buf), recptr);
1486 }
1487}
1488
1489/*
1490 * _hash_getcachedmetap() -- Returns cached metapage data.
1491 *
1492 * If metabuf is not InvalidBuffer, caller must hold a pin, but no lock, on
1493 * the metapage. If not set, we'll set it before returning if we have to
1494 * refresh the cache, and return with a pin but no lock on it; caller is
1495 * responsible for releasing the pin.
1496 *
1497 * We refresh the cache if it's not initialized yet or force_refresh is true.
1498 */
1499HashMetaPage
1500_hash_getcachedmetap(Relation rel, Buffer *metabuf, bool force_refresh)
1501{
1502 Page page;
1503
1504 Assert(metabuf);
1505 if (force_refresh || rel->rd_amcache == NULL)
1506 {
1507 char *cache = NULL;
1508
1509 /*
1510 * It's important that we don't set rd_amcache to an invalid value.
1511 * Either MemoryContextAlloc or _hash_getbuf could fail, so don't
1512 * install a pointer to the newly-allocated storage in the actual
		 * relcache entry until both have succeeded.
1514 */
1515 if (rel->rd_amcache == NULL)
1516 cache = MemoryContextAlloc(rel->rd_indexcxt,
1517 sizeof(HashMetaPageData));
1518
1519 /* Read the metapage. */
1520 if (BufferIsValid(*metabuf))
1521 LockBuffer(*metabuf, BUFFER_LOCK_SHARE);
1522 else
1523 *metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ,
1524 LH_META_PAGE);
1525 page = BufferGetPage(*metabuf);
1526
1527 /* Populate the cache. */
1528 if (rel->rd_amcache == NULL)
1529 rel->rd_amcache = cache;
1530 memcpy(rel->rd_amcache, HashPageGetMeta(page),
1531 sizeof(HashMetaPageData));
1532
1533 /* Release metapage lock, but keep the pin. */
1534 LockBuffer(*metabuf, BUFFER_LOCK_UNLOCK);
1535 }
1536
1537 return (HashMetaPage) rel->rd_amcache;
1538}
1539
1540/*
1541 * _hash_getbucketbuf_from_hashkey() -- Get the bucket's buffer for the given
1542 * hashkey.
1543 *
 * Bucket pages do not move or get removed once they are allocated.  This
 * gives us an opportunity to use the previously saved metapage contents to
 * reach the target bucket buffer, instead of reading from the metapage every
 * time.  This saves one buffer access per lookup, a worthwhile reduction in
 * bufmgr traffic and contention.
1549 *
1550 * The access type parameter (HASH_READ or HASH_WRITE) indicates whether the
1551 * bucket buffer has to be locked for reading or writing.
1552 *
 * The output parameter cachedmetap is set to the metapage contents that were
 * used for the hashkey-to-bucket mapping.  Some callers need this information
 * to reach the old bucket in case of a bucket split; see _hash_doinsert().
1556 */
1557Buffer
1558_hash_getbucketbuf_from_hashkey(Relation rel, uint32 hashkey, int access,
1559 HashMetaPage *cachedmetap)
1560{
1561 HashMetaPage metap;
1562 Buffer buf;
1563 Buffer metabuf = InvalidBuffer;
1564 Page page;
1565 Bucket bucket;
1566 BlockNumber blkno;
1567 HashPageOpaque opaque;
1568
	/* We read from the target bucket buffer, so it must be locked. */
1570 Assert(access == HASH_READ || access == HASH_WRITE);
1571
1572 metap = _hash_getcachedmetap(rel, &metabuf, false);
1573 Assert(metap != NULL);
1574
1575 /*
1576 * Loop until we get a lock on the correct target bucket.
1577 */
1578 for (;;)
1579 {
1580 /*
1581 * Compute the target bucket number, and convert to block number.
1582 */
1583 bucket = _hash_hashkey2bucket(hashkey,
1584 metap->hashm_maxbucket,
1585 metap->hashm_highmask,
1586 metap->hashm_lowmask);
1587
1588 blkno = BUCKET_TO_BLKNO(metap, bucket);
1589
1590 /* Fetch the primary bucket page for the bucket */
1591 buf = _hash_getbuf(rel, blkno, access, LH_BUCKET_PAGE);
1592 page = BufferGetPage(buf);
1593 opaque = (HashPageOpaque) PageGetSpecialPointer(page);
1594 Assert(opaque->hasho_bucket == bucket);
1595 Assert(opaque->hasho_prevblkno != InvalidBlockNumber);
1596
1597 /*
1598 * If this bucket hasn't been split, we're done.
1599 */
1600 if (opaque->hasho_prevblkno <= metap->hashm_maxbucket)
1601 break;
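
		/*
		 * Illustrative case: if our cached hashm_maxbucket is 511 but this
		 * bucket page's hasho_prevblkno (the maxbucket value saved when the
		 * bucket was last created or split) is, say, 700, the table has been
		 * split since the cache was taken and the hashkey may now map to a
		 * different bucket, so we refresh the cache and recompute.
		 */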
1602
1603 /* Drop lock on this buffer, update cached metapage, and retry. */
1604 _hash_relbuf(rel, buf);
1605 metap = _hash_getcachedmetap(rel, &metabuf, true);
1606 Assert(metap != NULL);
1607 }
1608
1609 if (BufferIsValid(metabuf))
1610 _hash_dropbuf(rel, metabuf);
1611
1612 if (cachedmetap)
1613 *cachedmetap = metap;
1614
1615 return buf;
1616}
1617