/*-------------------------------------------------------------------------
 *
 * hashovfl.c
 *	  Overflow page management code for the Postgres hash access method
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/access/hash/hashovfl.c
 *
 * NOTES
 *	  Overflow pages look like ordinary relation pages.
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/hash.h"
#include "access/hash_xlog.h"
#include "miscadmin.h"
#include "utils/rel.h"


static uint32 _hash_firstfreebit(uint32 map);


/*
 * Convert overflow page bit number (its index in the free-page bitmaps)
 * to block number within the index.
 */
static BlockNumber
bitno_to_blkno(HashMetaPage metap, uint32 ovflbitnum)
{
    uint32      splitnum = metap->hashm_ovflpoint;
    uint32      i;

    /* Convert zero-based bitnumber to 1-based page number */
    ovflbitnum += 1;

    /* Determine the split number for this page (must be >= 1) */
    for (i = 1;
         i < splitnum && ovflbitnum > metap->hashm_spares[i];
         i++)
         /* loop */ ;

    /*
     * Convert to absolute page number by adding the number of bucket pages
     * that exist before this split point.
     */
    return (BlockNumber) (_hash_get_totalbuckets(i) + ovflbitnum);
}
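
/*
 * Illustrative worked example (not part of the original code), using
 * hypothetical metapage contents: suppose hashm_ovflpoint is 3 and
 * hashm_spares[] = {0, 0, 1, 3} (cumulative counts of overflow pages
 * allocated up to each split point, chosen arbitrarily here).  For
 * ovflbitnum = 1 (zero-based), the 1-based page number is 2; the loop
 * advances past i = 2 because 2 > hashm_spares[2] = 1, and stops at
 * i = splitnum = 3, so the function returns _hash_get_totalbuckets(3) + 2.
 * That is, overflow pages allocated at a given split point get block
 * numbers immediately after the bucket pages existing at that point.
 */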

/*
 * _hash_ovflblkno_to_bitno
 *
 * Convert overflow page block number to bit number for free-page bitmap.
 */
uint32
_hash_ovflblkno_to_bitno(HashMetaPage metap, BlockNumber ovflblkno)
{
    uint32      splitnum = metap->hashm_ovflpoint;
    uint32      i;
    uint32      bitnum;

    /* Determine the split number containing this page */
    for (i = 1; i <= splitnum; i++)
    {
        if (ovflblkno <= (BlockNumber) _hash_get_totalbuckets(i))
            break;              /* oops */
        bitnum = ovflblkno - _hash_get_totalbuckets(i);

        /*
         * bitnum has to be greater than the number of overflow pages added
         * in the previous split point.  The overflow pages at this splitnum
         * (i), if any, start from (_hash_get_totalbuckets(i) +
         * metap->hashm_spares[i - 1] + 1).
         */
        if (bitnum > metap->hashm_spares[i - 1] &&
            bitnum <= metap->hashm_spares[i])
            return bitnum - 1;  /* -1 to convert 1-based to 0-based */
    }

    ereport(ERROR,
            (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
             errmsg("invalid overflow block number %u", ovflblkno)));
    return 0;                   /* keep compiler quiet */
}
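
/*
 * Illustrative sketch (not part of the original code): the two conversion
 * routines are inverses of each other, so for any valid overflow page a
 * debugging check along the following lines would be expected to hold.  The
 * names used are this file's own; only the check itself is hypothetical.
 *
 *		uint32		bitno = _hash_ovflblkno_to_bitno(metap, ovflblkno);
 *
 *		Assert(bitno_to_blkno(metap, bitno) == ovflblkno);
 */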

/*
 * _hash_addovflpage
 *
 *	Add an overflow page to the bucket whose last page is pointed to by 'buf'.
 *
 *	On entry, the caller must hold a pin but no lock on 'buf'.  The pin is
 *	dropped before exiting (we assume the caller is not interested in 'buf'
 *	anymore) if not asked to retain.  The pin will be retained only for the
 *	primary bucket.  The returned overflow page will be pinned and
 *	write-locked; it is guaranteed to be empty.
 *
 *	The caller must hold a pin, but no lock, on the metapage buffer.
 *	That buffer is returned in the same state.
 *
 *	NB: since this could be executed concurrently by multiple processes,
 *	one should not assume that the returned overflow page will be the
 *	immediate successor of the originally passed 'buf'.  Additional overflow
 *	pages might have been added to the bucket chain in between.
 */
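/*
 * Illustrative sketch (not part of the original code): the expected calling
 * pattern, assuming the caller already holds pins (but no locks) on the
 * bucket's tail page 'buf' and on 'metabuf' as described above.  The call
 * itself and _hash_relbuf() are this module's real functions; the rest is
 * hypothetical glue:
 *
 *		Buffer		ovflbuf;
 *
 *		ovflbuf = _hash_addovflpage(rel, metabuf, buf, retain_pin);
 *		... place the new tuple on the returned empty, write-locked page ...
 *		_hash_relbuf(rel, ovflbuf);
 */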
Buffer
_hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin)
{
    Buffer      ovflbuf;
    Page        page;
    Page        ovflpage;
    HashPageOpaque pageopaque;
    HashPageOpaque ovflopaque;
    HashMetaPage metap;
    Buffer      mapbuf = InvalidBuffer;
    Buffer      newmapbuf = InvalidBuffer;
    BlockNumber blkno;
    uint32      orig_firstfree;
    uint32      splitnum;
    uint32     *freep = NULL;
    uint32      max_ovflpg;
    uint32      bit;
    uint32      bitmap_page_bit;
    uint32      first_page;
    uint32      last_bit;
    uint32      last_page;
    uint32      i,
                j;
    bool        page_found = false;

    /*
     * Write-lock the tail page.  Here, we need to maintain the locking
     * order: first acquire the lock on the tail page of the bucket, then on
     * the meta page to find and lock the bitmap page; if one is found, the
     * lock on the meta page is released before we finally acquire the lock
     * on the new overflow buffer.  We need this locking order to avoid
     * deadlock with backends that are doing inserts.
     *
     * Note: We could have avoided locking many buffers here if we made two
     * WAL records for acquiring an overflow page (one to allocate an
     * overflow page and another to add it to the overflow bucket chain).
     * However, doing so can leak an overflow page if the system crashes
     * after allocation.  Needless to say, a single record is also better
     * from a performance point of view.
     */
    LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

    /* probably redundant... */
    _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);

    /* loop to find current tail page, in case someone else inserted too */
    for (;;)
    {
        BlockNumber nextblkno;

        page = BufferGetPage(buf);
        pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
        nextblkno = pageopaque->hasho_nextblkno;

        if (!BlockNumberIsValid(nextblkno))
            break;

        /* we assume we do not need to write the unmodified page */
        if (retain_pin)
        {
            /* pin will be retained only for the primary bucket page */
            Assert((pageopaque->hasho_flag & LH_PAGE_TYPE) == LH_BUCKET_PAGE);
            LockBuffer(buf, BUFFER_LOCK_UNLOCK);
        }
        else
            _hash_relbuf(rel, buf);

        retain_pin = false;

        buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
    }

    /* Get exclusive lock on the meta page */
    LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);

    _hash_checkpage(rel, metabuf, LH_META_PAGE);
    metap = HashPageGetMeta(BufferGetPage(metabuf));

    /* start search at hashm_firstfree */
    orig_firstfree = metap->hashm_firstfree;
    first_page = orig_firstfree >> BMPG_SHIFT(metap);
    bit = orig_firstfree & BMPG_MASK(metap);
    i = first_page;
    j = bit / BITS_PER_MAP;
    bit &= ~(BITS_PER_MAP - 1);
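
    /*
     * Illustrative worked example (not part of the original code), assuming
     * the default 8kB block size, for which hashm_bmsize is 4096 bytes and
     * hence BMPG_SHIFT is 15, BMPG_MASK is 32767 and BITS_PER_MAP is 32:
     * if hashm_firstfree were 70000, then first_page = 70000 >> 15 = 2,
     * bit = 70000 & 32767 = 4464, j = 4464 / 32 = 139, and rounding bit
     * down to a word boundary gives bit = 4448; i.e. the search starts at
     * word 139 of the third bitmap page.
     */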

    /* outer loop iterates once per bitmap page */
    for (;;)
    {
        BlockNumber mapblkno;
        Page        mappage;
        uint32      last_inpage;

        /* want to end search with the last existing overflow page */
        splitnum = metap->hashm_ovflpoint;
        max_ovflpg = metap->hashm_spares[splitnum] - 1;
        last_page = max_ovflpg >> BMPG_SHIFT(metap);
        last_bit = max_ovflpg & BMPG_MASK(metap);

        if (i > last_page)
            break;

        Assert(i < metap->hashm_nmaps);
        mapblkno = metap->hashm_mapp[i];

        if (i == last_page)
            last_inpage = last_bit;
        else
            last_inpage = BMPGSZ_BIT(metap) - 1;

        /* Release exclusive lock on metapage while reading bitmap page */
        LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);

        mapbuf = _hash_getbuf(rel, mapblkno, HASH_WRITE, LH_BITMAP_PAGE);
        mappage = BufferGetPage(mapbuf);
        freep = HashPageGetBitmap(mappage);

        for (; bit <= last_inpage; j++, bit += BITS_PER_MAP)
        {
            if (freep[j] != ALL_SET)
            {
                page_found = true;

                /* Reacquire exclusive lock on the meta page */
                LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);

                /* convert bit to bit number within page */
                bit += _hash_firstfreebit(freep[j]);
                bitmap_page_bit = bit;

                /* convert bit to absolute bit number */
                bit += (i << BMPG_SHIFT(metap));
                /* Calculate address of the recycled overflow page */
                blkno = bitno_to_blkno(metap, bit);

                /* Fetch and init the recycled page */
                ovflbuf = _hash_getinitbuf(rel, blkno);

                goto found;
            }
        }

        /* No free space here, try to advance to next map page */
        _hash_relbuf(rel, mapbuf);
        mapbuf = InvalidBuffer;
        i++;
        j = 0;                  /* scan from start of next map page */
        bit = 0;

        /* Reacquire exclusive lock on the meta page */
        LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
    }

    /*
     * No free pages --- have to extend the relation to add an overflow page.
     * First, check to see if we have to add a new bitmap page too.
     */
    if (last_bit == (uint32) (BMPGSZ_BIT(metap) - 1))
    {
        /*
         * We create the new bitmap page with all pages marked "in use".
         * Actually two pages in the new bitmap's range will exist
         * immediately: the bitmap page itself, and the following page which
         * is the one we return to the caller.  Both of these are correctly
         * marked "in use".  Subsequent pages do not exist yet, but it is
         * convenient to pre-mark them as "in use" too.
         */
        bit = metap->hashm_spares[splitnum];

        /* metapage already has a write lock */
        if (metap->hashm_nmaps >= HASH_MAX_BITMAPS)
            ereport(ERROR,
                    (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                     errmsg("out of overflow pages in hash index \"%s\"",
                            RelationGetRelationName(rel))));

        newmapbuf = _hash_getnewbuf(rel, bitno_to_blkno(metap, bit), MAIN_FORKNUM);
    }
    else
    {
        /*
         * Nothing to do here; since the page will be past the last used page,
         * we know its bitmap bit was preinitialized to "in use".
         */
    }

    /* Calculate address of the new overflow page */
    bit = BufferIsValid(newmapbuf) ?
        metap->hashm_spares[splitnum] + 1 : metap->hashm_spares[splitnum];
    blkno = bitno_to_blkno(metap, bit);

    /*
     * Fetch the page with _hash_getnewbuf to ensure smgr's idea of the
     * relation length stays in sync with ours.  XXX It's annoying to do this
     * with metapage write lock held; would be better to use a lock that
     * doesn't block incoming searches.
     *
     * It is okay to hold two buffer locks here (one on tail page of bucket
     * and other on new overflow page) since there cannot be anyone else
     * contending for access to ovflbuf.
     */
    ovflbuf = _hash_getnewbuf(rel, blkno, MAIN_FORKNUM);

found:

    /*
     * Do the update.  No ereport(ERROR) until changes are logged.  We want to
     * log the changes for bitmap page and overflow page together to avoid
     * loss of pages in case the new page is added.
     */
    START_CRIT_SECTION();

    if (page_found)
    {
        Assert(BufferIsValid(mapbuf));

        /* mark page "in use" in the bitmap */
        SETBIT(freep, bitmap_page_bit);
        MarkBufferDirty(mapbuf);
    }
    else
    {
        /* update the count to indicate new overflow page is added */
        metap->hashm_spares[splitnum]++;

        if (BufferIsValid(newmapbuf))
        {
            _hash_initbitmapbuffer(newmapbuf, metap->hashm_bmsize, false);
            MarkBufferDirty(newmapbuf);

            /* add the new bitmap page to the metapage's list of bitmaps */
            metap->hashm_mapp[metap->hashm_nmaps] = BufferGetBlockNumber(newmapbuf);
            metap->hashm_nmaps++;
            metap->hashm_spares[splitnum]++;
        }

        MarkBufferDirty(metabuf);

        /*
         * For a new overflow page we don't need to explicitly set the bit in
         * the bitmap page, as by default it is already set to "in use".
         */
    }

    /*
     * Adjust hashm_firstfree to avoid redundant searches.  But don't risk
     * changing it if someone moved it while we were searching bitmap pages.
     */
    if (metap->hashm_firstfree == orig_firstfree)
    {
        metap->hashm_firstfree = bit + 1;
        MarkBufferDirty(metabuf);
    }

    /* initialize new overflow page */
    ovflpage = BufferGetPage(ovflbuf);
    ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
    ovflopaque->hasho_prevblkno = BufferGetBlockNumber(buf);
    ovflopaque->hasho_nextblkno = InvalidBlockNumber;
    ovflopaque->hasho_bucket = pageopaque->hasho_bucket;
    ovflopaque->hasho_flag = LH_OVERFLOW_PAGE;
    ovflopaque->hasho_page_id = HASHO_PAGE_ID;

    MarkBufferDirty(ovflbuf);

    /* logically chain overflow page to previous page */
    pageopaque->hasho_nextblkno = BufferGetBlockNumber(ovflbuf);

    MarkBufferDirty(buf);

    /* XLOG stuff */
    if (RelationNeedsWAL(rel))
    {
        XLogRecPtr  recptr;
        xl_hash_add_ovfl_page xlrec;

        xlrec.bmpage_found = page_found;
        xlrec.bmsize = metap->hashm_bmsize;

        XLogBeginInsert();
        XLogRegisterData((char *) &xlrec, SizeOfHashAddOvflPage);

        XLogRegisterBuffer(0, ovflbuf, REGBUF_WILL_INIT);
        XLogRegisterBufData(0, (char *) &pageopaque->hasho_bucket, sizeof(Bucket));

        XLogRegisterBuffer(1, buf, REGBUF_STANDARD);

        if (BufferIsValid(mapbuf))
        {
            XLogRegisterBuffer(2, mapbuf, REGBUF_STANDARD);
            XLogRegisterBufData(2, (char *) &bitmap_page_bit, sizeof(uint32));
        }

        if (BufferIsValid(newmapbuf))
            XLogRegisterBuffer(3, newmapbuf, REGBUF_WILL_INIT);

        XLogRegisterBuffer(4, metabuf, REGBUF_STANDARD);
        XLogRegisterBufData(4, (char *) &metap->hashm_firstfree, sizeof(uint32));

        recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_ADD_OVFL_PAGE);

        PageSetLSN(BufferGetPage(ovflbuf), recptr);
        PageSetLSN(BufferGetPage(buf), recptr);

        if (BufferIsValid(mapbuf))
            PageSetLSN(BufferGetPage(mapbuf), recptr);

        if (BufferIsValid(newmapbuf))
            PageSetLSN(BufferGetPage(newmapbuf), recptr);

        PageSetLSN(BufferGetPage(metabuf), recptr);
    }

    END_CRIT_SECTION();

    if (retain_pin)
        LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    else
        _hash_relbuf(rel, buf);

    if (BufferIsValid(mapbuf))
        _hash_relbuf(rel, mapbuf);

    LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);

    if (BufferIsValid(newmapbuf))
        _hash_relbuf(rel, newmapbuf);

    return ovflbuf;
}
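
/*
 * Illustrative sketch (not part of the original code): the post-conditions a
 * caller of _hash_addovflpage() can rely on could be spelled out with checks
 * along these lines (hypothetical debugging code, using only this file's own
 * macros and the returned buffer):
 *
 *		Page		p = BufferGetPage(ovflbuf);
 *		HashPageOpaque po = (HashPageOpaque) PageGetSpecialPointer(p);
 *
 *		Assert(PageIsEmpty(p));
 *		Assert((po->hasho_flag & LH_PAGE_TYPE) == LH_OVERFLOW_PAGE);
 *		Assert(BlockNumberIsValid(po->hasho_prevblkno));
 *		Assert(!BlockNumberIsValid(po->hasho_nextblkno));
 */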

/*
 * _hash_firstfreebit()
 *
 *	Return the number of the first bit that is not set in the word 'map'.
 */
static uint32
_hash_firstfreebit(uint32 map)
{
    uint32      i,
                mask;

    mask = 0x1;
    for (i = 0; i < BITS_PER_MAP; i++)
    {
        if (!(mask & map))
            return i;
        mask <<= 1;
    }

    elog(ERROR, "firstfreebit found no free bit");

    return 0;                   /* keep compiler quiet */
}
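
/*
 * Illustrative aside (not part of the original code): since 'map' is known
 * not to be ALL_SET when this is reached, the first zero bit could also be
 * found without a loop by counting the trailing one bits, e.g. with a
 * GCC/Clang-style intrinsic.  This assumes such a builtin is available and
 * is shown only as a sketch:
 *
 *		return (uint32) __builtin_ctz(~map);
 */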

/*
 * _hash_freeovflpage() -
 *
 *	Remove this overflow page from its bucket's chain, and mark the page as
 *	free.  On entry, ovflbuf is write-locked; it is released before exiting.
 *
 *	Add the tuples (itups) to wbuf in this function.  We could do that in the
 *	caller as well, but the advantage of doing it here is we can easily write
 *	the WAL for the XLOG_HASH_SQUEEZE_PAGE operation.  Addition of tuples and
 *	removal of the overflow page have to be done as an atomic operation,
 *	otherwise during replay on a standby users might find duplicate records.
 *
 *	Since this function is invoked in VACUUM, we provide an access strategy
 *	parameter that controls fetches of the bucket pages.
 *
 *	Returns the block number of the page that followed the given page
 *	in the bucket, or InvalidBlockNumber if no following page.
 *
 *	NB: caller must not hold a lock on the metapage, nor on the page that's
 *	next to ovflbuf in the bucket chain.  We don't acquire the lock on the
 *	page prior to ovflbuf in the chain if it is the same as wbuf, because the
 *	caller already has a lock on it.
 */
BlockNumber
_hash_freeovflpage(Relation rel, Buffer bucketbuf, Buffer ovflbuf,
                   Buffer wbuf, IndexTuple *itups, OffsetNumber *itup_offsets,
                   Size *tups_size, uint16 nitups,
                   BufferAccessStrategy bstrategy)
{
    HashMetaPage metap;
    Buffer      metabuf;
    Buffer      mapbuf;
    BlockNumber ovflblkno;
    BlockNumber prevblkno;
    BlockNumber blkno;
    BlockNumber nextblkno;
    BlockNumber writeblkno;
    HashPageOpaque ovflopaque;
    Page        ovflpage;
    Page        mappage;
    uint32     *freep;
    uint32      ovflbitno;
    int32       bitmappage,
                bitmapbit;
    Bucket      bucket PG_USED_FOR_ASSERTS_ONLY;
    Buffer      prevbuf = InvalidBuffer;
    Buffer      nextbuf = InvalidBuffer;
    bool        update_metap = false;

    /* Get information from the doomed page */
    _hash_checkpage(rel, ovflbuf, LH_OVERFLOW_PAGE);
    ovflblkno = BufferGetBlockNumber(ovflbuf);
    ovflpage = BufferGetPage(ovflbuf);
    ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
    nextblkno = ovflopaque->hasho_nextblkno;
    prevblkno = ovflopaque->hasho_prevblkno;
    writeblkno = BufferGetBlockNumber(wbuf);
    bucket = ovflopaque->hasho_bucket;

    /*
     * Fix up the bucket chain.  This is a doubly-linked list, so we must fix
     * up the bucket chain members behind and ahead of the overflow page being
     * deleted.  Concurrency issues are avoided by using lock chaining as
     * described atop hashbucketcleanup.
     */
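    /*
     * Illustrative diagram (not part of the original code) of the unlink
     * performed below, where P = prevblkno, O = the doomed overflow page,
     * and N = nextblkno:
     *
     *		before:  P <-> O <-> N
     *		after:   P <-> N		(O is reinitialized as unused and its
     *								 bitmap bit is cleared for reuse)
     */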
    if (BlockNumberIsValid(prevblkno))
    {
        if (prevblkno == writeblkno)
            prevbuf = wbuf;
        else
            prevbuf = _hash_getbuf_with_strategy(rel,
                                                 prevblkno,
                                                 HASH_WRITE,
                                                 LH_BUCKET_PAGE | LH_OVERFLOW_PAGE,
                                                 bstrategy);
    }
    if (BlockNumberIsValid(nextblkno))
        nextbuf = _hash_getbuf_with_strategy(rel,
                                             nextblkno,
                                             HASH_WRITE,
                                             LH_OVERFLOW_PAGE,
                                             bstrategy);

    /* Note: bstrategy is intentionally not used for metapage and bitmap */

    /* Read the metapage so we can determine which bitmap page to use */
    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
    metap = HashPageGetMeta(BufferGetPage(metabuf));

    /* Identify which bit to clear */
    ovflbitno = _hash_ovflblkno_to_bitno(metap, ovflblkno);

    bitmappage = ovflbitno >> BMPG_SHIFT(metap);
    bitmapbit = ovflbitno & BMPG_MASK(metap);

    if (bitmappage >= metap->hashm_nmaps)
        elog(ERROR, "invalid overflow bit number %u", ovflbitno);
    blkno = metap->hashm_mapp[bitmappage];

    /* Release metapage lock while we access the bitmap page */
    LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);

    /* read the bitmap page to clear the bitmap bit */
    mapbuf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BITMAP_PAGE);
    mappage = BufferGetPage(mapbuf);
    freep = HashPageGetBitmap(mappage);
    Assert(ISSET(freep, bitmapbit));

    /* Get write-lock on metapage to update firstfree */
    LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);

    /* This operation needs to log multiple tuples, prepare WAL for that */
    if (RelationNeedsWAL(rel))
        XLogEnsureRecordSpace(HASH_XLOG_FREE_OVFL_BUFS, 4 + nitups);

    START_CRIT_SECTION();

    /*
     * we have to insert tuples on the "write" page, being careful to preserve
     * hashkey ordering.  (If we insert many tuples into the same "write" page
     * it would be worth qsort'ing them).
     */
    if (nitups > 0)
    {
        _hash_pgaddmultitup(rel, wbuf, itups, itup_offsets, nitups);
        MarkBufferDirty(wbuf);
    }

    /*
     * Reinitialize the freed overflow page.  Just zeroing the page won't
     * work, because WAL replay routines expect pages to be initialized.  See
     * explanation of RBM_NORMAL mode atop XLogReadBufferExtended.  We are
     * careful to make the special space valid here so that tools like
     * pageinspect won't get confused.
     */
    _hash_pageinit(ovflpage, BufferGetPageSize(ovflbuf));

    ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);

    ovflopaque->hasho_prevblkno = InvalidBlockNumber;
    ovflopaque->hasho_nextblkno = InvalidBlockNumber;
    ovflopaque->hasho_bucket = -1;
    ovflopaque->hasho_flag = LH_UNUSED_PAGE;
    ovflopaque->hasho_page_id = HASHO_PAGE_ID;

    MarkBufferDirty(ovflbuf);

    if (BufferIsValid(prevbuf))
    {
        Page        prevpage = BufferGetPage(prevbuf);
        HashPageOpaque prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage);

        Assert(prevopaque->hasho_bucket == bucket);
        prevopaque->hasho_nextblkno = nextblkno;
        MarkBufferDirty(prevbuf);
    }
    if (BufferIsValid(nextbuf))
    {
        Page        nextpage = BufferGetPage(nextbuf);
        HashPageOpaque nextopaque = (HashPageOpaque) PageGetSpecialPointer(nextpage);

        Assert(nextopaque->hasho_bucket == bucket);
        nextopaque->hasho_prevblkno = prevblkno;
        MarkBufferDirty(nextbuf);
    }

    /* Clear the bitmap bit to indicate that this overflow page is free */
    CLRBIT(freep, bitmapbit);
    MarkBufferDirty(mapbuf);

    /* if this is now the first free page, update hashm_firstfree */
    if (ovflbitno < metap->hashm_firstfree)
    {
        metap->hashm_firstfree = ovflbitno;
        update_metap = true;
        MarkBufferDirty(metabuf);
    }

    /* XLOG stuff */
    if (RelationNeedsWAL(rel))
    {
        xl_hash_squeeze_page xlrec;
        XLogRecPtr  recptr;
        int         i;

        xlrec.prevblkno = prevblkno;
        xlrec.nextblkno = nextblkno;
        xlrec.ntups = nitups;
        xlrec.is_prim_bucket_same_wrt = (wbuf == bucketbuf);
        xlrec.is_prev_bucket_same_wrt = (wbuf == prevbuf);

        XLogBeginInsert();
        XLogRegisterData((char *) &xlrec, SizeOfHashSqueezePage);

        /*
         * bucket buffer needs to be registered to ensure that we can acquire
         * a cleanup lock on it during replay.
         */
        if (!xlrec.is_prim_bucket_same_wrt)
            XLogRegisterBuffer(0, bucketbuf, REGBUF_STANDARD | REGBUF_NO_IMAGE);

        XLogRegisterBuffer(1, wbuf, REGBUF_STANDARD);
        if (xlrec.ntups > 0)
        {
            XLogRegisterBufData(1, (char *) itup_offsets,
                                nitups * sizeof(OffsetNumber));
            for (i = 0; i < nitups; i++)
                XLogRegisterBufData(1, (char *) itups[i], tups_size[i]);
        }

        XLogRegisterBuffer(2, ovflbuf, REGBUF_STANDARD);

        /*
         * If prevpage and the writepage (block in which we are moving tuples
         * from overflow) are same, then no need to separately register
         * prevpage.  During replay, we can directly update the nextblock in
         * writepage.
         */
        if (BufferIsValid(prevbuf) && !xlrec.is_prev_bucket_same_wrt)
            XLogRegisterBuffer(3, prevbuf, REGBUF_STANDARD);

        if (BufferIsValid(nextbuf))
            XLogRegisterBuffer(4, nextbuf, REGBUF_STANDARD);

        XLogRegisterBuffer(5, mapbuf, REGBUF_STANDARD);
        XLogRegisterBufData(5, (char *) &bitmapbit, sizeof(uint32));

        if (update_metap)
        {
            XLogRegisterBuffer(6, metabuf, REGBUF_STANDARD);
            XLogRegisterBufData(6, (char *) &metap->hashm_firstfree, sizeof(uint32));
        }

        recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SQUEEZE_PAGE);

        PageSetLSN(BufferGetPage(wbuf), recptr);
        PageSetLSN(BufferGetPage(ovflbuf), recptr);

        if (BufferIsValid(prevbuf) && !xlrec.is_prev_bucket_same_wrt)
            PageSetLSN(BufferGetPage(prevbuf), recptr);
        if (BufferIsValid(nextbuf))
            PageSetLSN(BufferGetPage(nextbuf), recptr);

        PageSetLSN(BufferGetPage(mapbuf), recptr);

        if (update_metap)
            PageSetLSN(BufferGetPage(metabuf), recptr);
    }

    END_CRIT_SECTION();

    /* release previous bucket if it is not same as write bucket */
    if (BufferIsValid(prevbuf) && prevblkno != writeblkno)
        _hash_relbuf(rel, prevbuf);

    if (BufferIsValid(ovflbuf))
        _hash_relbuf(rel, ovflbuf);

    if (BufferIsValid(nextbuf))
        _hash_relbuf(rel, nextbuf);

    _hash_relbuf(rel, mapbuf);
    _hash_relbuf(rel, metabuf);

    return nextblkno;
}


/*
 *	_hash_initbitmapbuffer()
 *
 *	 Initialize a new bitmap page.  All bits in the new bitmap page are set to
 *	 "1", indicating "in use".
 */
void
_hash_initbitmapbuffer(Buffer buf, uint16 bmsize, bool initpage)
{
    Page        pg;
    HashPageOpaque op;
    uint32     *freep;

    pg = BufferGetPage(buf);

    /* initialize the page */
    if (initpage)
        _hash_pageinit(pg, BufferGetPageSize(buf));

    /* initialize the page's special space */
    op = (HashPageOpaque) PageGetSpecialPointer(pg);
    op->hasho_prevblkno = InvalidBlockNumber;
    op->hasho_nextblkno = InvalidBlockNumber;
    op->hasho_bucket = -1;
    op->hasho_flag = LH_BITMAP_PAGE;
    op->hasho_page_id = HASHO_PAGE_ID;

    /* set all of the bits to 1 */
    freep = HashPageGetBitmap(pg);
    MemSet(freep, 0xFF, bmsize);

    /*
     * Set pd_lower just past the end of the bitmap page data.  We could even
     * set pd_lower equal to pd_upper, but this is more precise and makes the
     * page look compressible to xlog.c.
     */
    ((PageHeader) pg)->pd_lower = ((char *) freep + bmsize) - (char *) pg;
}
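
/*
 * Illustrative worked example (not part of the original code), assuming the
 * default 8kB block size, for which hashm_bmsize is 4096: the bitmap array
 * starts at PageGetContents(pg), i.e. MAXALIGN(SizeOfPageHeaderData) = 24
 * bytes into the page, so the assignment above leaves pd_lower = 24 + 4096
 * = 4120, with the remainder of the page up to the special space unused.
 */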


/*
 *	_hash_squeezebucket(rel, bucket)
 *
 *	Try to squeeze the tuples onto pages occurring earlier in the
 *	bucket chain in an attempt to free overflow pages.  When we start
 *	the "squeezing", the page from which we start taking tuples (the
 *	"read" page) is the last page in the bucket chain and the page
 *	onto which we start squeezing tuples (the "write" page) is the
 *	first page in the bucket chain.  The read page works backward and
 *	the write page works forward; the procedure terminates when the
 *	read page and write page are the same page.
 *
 *	At completion of this procedure, it is guaranteed that all pages in
 *	the bucket are nonempty, unless the bucket is totally empty (in
 *	which case all overflow pages will be freed).  The original implementation
 *	required that to be true on entry as well, but it's a lot easier for
 *	callers to leave empty overflow pages and let this routine clean them up.
 *
 *	Caller must acquire cleanup lock on the primary page of the target
 *	bucket to exclude any scans that are in progress, which could easily
 *	be confused into returning the same tuple more than once or some tuples
 *	not at all by the rearrangement we are performing here.  To prevent
 *	any concurrent scan from crossing the squeeze scan we use lock chaining
 *	similar to hashbucketcleanup.  Refer to the comments atop hashbucketcleanup.
 *
 *	We need to retain a pin on the primary bucket to ensure that no concurrent
 *	split can start.
 *
 *	Since this function is invoked in VACUUM, we provide an access strategy
 *	parameter that controls fetches of the bucket pages.
 */
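/*
 * Illustrative diagram (not part of the original code): for a bucket chain
 *
 *		primary -> ovfl1 -> ovfl2 -> ovfl3
 *
 * the "write" position starts at the primary page and moves right, while the
 * "read" position starts at ovfl3 and moves left; tuples are copied from the
 * read page into whichever earlier page has room, emptied overflow pages are
 * freed via _hash_freeovflpage(), and the squeeze stops when the two
 * positions meet.
 */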
void
_hash_squeezebucket(Relation rel,
                    Bucket bucket,
                    BlockNumber bucket_blkno,
                    Buffer bucket_buf,
                    BufferAccessStrategy bstrategy)
{
    BlockNumber wblkno;
    BlockNumber rblkno;
    Buffer      wbuf;
    Buffer      rbuf;
    Page        wpage;
    Page        rpage;
    HashPageOpaque wopaque;
    HashPageOpaque ropaque;

    /*
     * start squeezing into the primary bucket page.
     */
    wblkno = bucket_blkno;
    wbuf = bucket_buf;
    wpage = BufferGetPage(wbuf);
    wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);

    /*
     * if there aren't any overflow pages, there's nothing to squeeze. caller
     * is responsible for releasing the pin on primary bucket page.
     */
    if (!BlockNumberIsValid(wopaque->hasho_nextblkno))
    {
        LockBuffer(wbuf, BUFFER_LOCK_UNLOCK);
        return;
    }

    /*
     * Find the last page in the bucket chain by starting at the base bucket
     * page and working forward.  Note: we assume that a hash bucket chain is
     * usually smaller than the buffer ring being used by VACUUM, else using
     * the access strategy here would be counterproductive.
     */
    rbuf = InvalidBuffer;
    ropaque = wopaque;
    do
    {
        rblkno = ropaque->hasho_nextblkno;
        if (rbuf != InvalidBuffer)
            _hash_relbuf(rel, rbuf);
        rbuf = _hash_getbuf_with_strategy(rel,
                                          rblkno,
                                          HASH_WRITE,
                                          LH_OVERFLOW_PAGE,
                                          bstrategy);
        rpage = BufferGetPage(rbuf);
        ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
        Assert(ropaque->hasho_bucket == bucket);
    } while (BlockNumberIsValid(ropaque->hasho_nextblkno));

    /*
     * squeeze the tuples.
     */
    for (;;)
    {
        OffsetNumber roffnum;
        OffsetNumber maxroffnum;
        OffsetNumber deletable[MaxOffsetNumber];
        IndexTuple  itups[MaxIndexTuplesPerPage];
        Size        tups_size[MaxIndexTuplesPerPage];
        OffsetNumber itup_offsets[MaxIndexTuplesPerPage];
        uint16      ndeletable = 0;
        uint16      nitups = 0;
        Size        all_tups_size = 0;
        int         i;
        bool        retain_pin = false;

readpage:
        /* Scan each tuple in "read" page */
        maxroffnum = PageGetMaxOffsetNumber(rpage);
        for (roffnum = FirstOffsetNumber;
             roffnum <= maxroffnum;
             roffnum = OffsetNumberNext(roffnum))
        {
            IndexTuple  itup;
            Size        itemsz;

            /* skip dead tuples */
            if (ItemIdIsDead(PageGetItemId(rpage, roffnum)))
                continue;

            itup = (IndexTuple) PageGetItem(rpage,
                                            PageGetItemId(rpage, roffnum));
            itemsz = IndexTupleSize(itup);
            itemsz = MAXALIGN(itemsz);

            /*
             * Walk up the bucket chain, looking for a page big enough for
             * this item and all other accumulated items.  Exit if we reach
             * the read page.
             */
            while (PageGetFreeSpaceForMultipleTuples(wpage, nitups + 1) < (all_tups_size + itemsz))
            {
                Buffer      next_wbuf = InvalidBuffer;
                bool        tups_moved = false;

                Assert(!PageIsEmpty(wpage));

                if (wblkno == bucket_blkno)
                    retain_pin = true;

                wblkno = wopaque->hasho_nextblkno;
                Assert(BlockNumberIsValid(wblkno));

                /* don't need to move to next page if we reached the read page */
                if (wblkno != rblkno)
                    next_wbuf = _hash_getbuf_with_strategy(rel,
                                                           wblkno,
                                                           HASH_WRITE,
                                                           LH_OVERFLOW_PAGE,
                                                           bstrategy);

                if (nitups > 0)
                {
                    Assert(nitups == ndeletable);

                    /*
                     * This operation needs to log multiple tuples, prepare
                     * WAL for that.
                     */
                    if (RelationNeedsWAL(rel))
                        XLogEnsureRecordSpace(0, 3 + nitups);

                    START_CRIT_SECTION();

                    /*
                     * we have to insert tuples on the "write" page, being
                     * careful to preserve hashkey ordering.  (If we insert
                     * many tuples into the same "write" page it would be
                     * worth qsort'ing them).
                     */
                    _hash_pgaddmultitup(rel, wbuf, itups, itup_offsets, nitups);
                    MarkBufferDirty(wbuf);

                    /* Delete tuples we already moved off read page */
                    PageIndexMultiDelete(rpage, deletable, ndeletable);
                    MarkBufferDirty(rbuf);

                    /* XLOG stuff */
                    if (RelationNeedsWAL(rel))
                    {
                        XLogRecPtr  recptr;
                        xl_hash_move_page_contents xlrec;

                        xlrec.ntups = nitups;
                        xlrec.is_prim_bucket_same_wrt = (wbuf == bucket_buf) ? true : false;

                        XLogBeginInsert();
                        XLogRegisterData((char *) &xlrec, SizeOfHashMovePageContents);

                        /*
                         * bucket buffer needs to be registered to ensure that
                         * we can acquire a cleanup lock on it during replay.
                         */
                        if (!xlrec.is_prim_bucket_same_wrt)
                            XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD | REGBUF_NO_IMAGE);

                        XLogRegisterBuffer(1, wbuf, REGBUF_STANDARD);
                        XLogRegisterBufData(1, (char *) itup_offsets,
                                            nitups * sizeof(OffsetNumber));
                        for (i = 0; i < nitups; i++)
                            XLogRegisterBufData(1, (char *) itups[i], tups_size[i]);

                        XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD);
                        XLogRegisterBufData(2, (char *) deletable,
                                            ndeletable * sizeof(OffsetNumber));

                        recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_MOVE_PAGE_CONTENTS);

                        PageSetLSN(BufferGetPage(wbuf), recptr);
                        PageSetLSN(BufferGetPage(rbuf), recptr);
                    }

                    END_CRIT_SECTION();

                    tups_moved = true;
                }

                /*
                 * release the lock on previous page after acquiring the lock
                 * on next page
                 */
                if (retain_pin)
                    LockBuffer(wbuf, BUFFER_LOCK_UNLOCK);
                else
                    _hash_relbuf(rel, wbuf);

                /* nothing more to do if we reached the read page */
                if (rblkno == wblkno)
                {
                    _hash_relbuf(rel, rbuf);
                    return;
                }

                wbuf = next_wbuf;
                wpage = BufferGetPage(wbuf);
                wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);
                Assert(wopaque->hasho_bucket == bucket);
                retain_pin = false;

                /* be tidy */
                for (i = 0; i < nitups; i++)
                    pfree(itups[i]);
                nitups = 0;
                all_tups_size = 0;
                ndeletable = 0;

                /*
                 * after moving the tuples, rpage would have been compacted,
                 * so we need to rescan it.
                 */
                if (tups_moved)
                    goto readpage;
            }

            /* remember tuple for deletion from "read" page */
            deletable[ndeletable++] = roffnum;

            /*
             * we need a copy of the index tuples, as they can be freed as
             * part of freeing the overflow page; however, we still need them
             * to write a WAL record in _hash_freeovflpage.
             */
            itups[nitups] = CopyIndexTuple(itup);
            tups_size[nitups++] = itemsz;
            all_tups_size += itemsz;
        }

        /*
         * If we reach here, there are no live tuples on the "read" page ---
         * it was empty when we got to it, or we moved them all.  So we can
         * just free the page without bothering with deleting tuples
         * individually.  Then advance to the previous "read" page.
         *
         * Tricky point here: if our read and write pages are adjacent in the
         * bucket chain, our write lock on wbuf will conflict with
         * _hash_freeovflpage's attempt to update the sibling links of the
         * removed page.  In that case, we don't need to lock it again.
         */
        rblkno = ropaque->hasho_prevblkno;
        Assert(BlockNumberIsValid(rblkno));

        /* free this overflow page (releases rbuf) */
        _hash_freeovflpage(rel, bucket_buf, rbuf, wbuf, itups, itup_offsets,
                           tups_size, nitups, bstrategy);

        /* be tidy */
        for (i = 0; i < nitups; i++)
            pfree(itups[i]);

        /* are we freeing the page adjacent to wbuf? */
        if (rblkno == wblkno)
        {
            /* retain the pin on primary bucket page till end of bucket scan */
            if (wblkno == bucket_blkno)
                LockBuffer(wbuf, BUFFER_LOCK_UNLOCK);
            else
                _hash_relbuf(rel, wbuf);
            return;
        }

        rbuf = _hash_getbuf_with_strategy(rel,
                                          rblkno,
                                          HASH_WRITE,
                                          LH_OVERFLOW_PAGE,
                                          bstrategy);
        rpage = BufferGetPage(rbuf);
        ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
        Assert(ropaque->hasho_bucket == bucket);
    }

    /* NOTREACHED */
}