/*-------------------------------------------------------------------------
 *
 * hash_xlog.c
 *	  WAL replay logic for hash index.
 *
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/hash/hash_xlog.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/bufmask.h"
#include "access/hash.h"
#include "access/hash_xlog.h"
#include "access/xlogutils.h"
#include "access/xlog.h"
#include "access/transam.h"
#include "storage/procarray.h"
#include "miscadmin.h"

/*
 * replay a hash index meta page
 */
static void
hash_xlog_init_meta_page(XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	Page		page;
	Buffer		metabuf;
	ForkNumber	forknum;

	xl_hash_init_meta_page *xlrec = (xl_hash_init_meta_page *) XLogRecGetData(record);

	/* create the index' metapage */
	metabuf = XLogInitBufferForRedo(record, 0);
	Assert(BufferIsValid(metabuf));
	_hash_init_metabuffer(metabuf, xlrec->num_tuples, xlrec->procid,
						  xlrec->ffactor, true);
	page = (Page) BufferGetPage(metabuf);
	PageSetLSN(page, lsn);
	MarkBufferDirty(metabuf);

	/*
	 * Force the on-disk state of init forks to always be in sync with the
	 * state in shared buffers. See XLogReadBufferForRedoExtended. We need
	 * special handling for init forks as create index operations don't log a
	 * full page image of the metapage.
	 */
	XLogRecGetBlockTag(record, 0, NULL, &forknum, NULL);
	if (forknum == INIT_FORKNUM)
		FlushOneBuffer(metabuf);

	/* all done */
	UnlockReleaseBuffer(metabuf);
}
61 | |
62 | /* |
63 | * replay a hash index bitmap page |
64 | */ |
65 | static void |
66 | hash_xlog_init_bitmap_page(XLogReaderState *record) |
67 | { |
68 | XLogRecPtr lsn = record->EndRecPtr; |
69 | Buffer bitmapbuf; |
70 | Buffer metabuf; |
71 | Page page; |
72 | HashMetaPage metap; |
73 | uint32 num_buckets; |
74 | ForkNumber forknum; |
75 | |
76 | xl_hash_init_bitmap_page *xlrec = (xl_hash_init_bitmap_page *) XLogRecGetData(record); |
77 | |
78 | /* |
79 | * Initialize bitmap page |
80 | */ |
81 | bitmapbuf = XLogInitBufferForRedo(record, 0); |
82 | _hash_initbitmapbuffer(bitmapbuf, xlrec->bmsize, true); |
83 | PageSetLSN(BufferGetPage(bitmapbuf), lsn); |
84 | MarkBufferDirty(bitmapbuf); |
85 | |
86 | /* |
87 | * Force the on-disk state of init forks to always be in sync with the |
88 | * state in shared buffers. See XLogReadBufferForRedoExtended. We need |
89 | * special handling for init forks as create index operations don't log a |
90 | * full page image of the metapage. |
91 | */ |
92 | XLogRecGetBlockTag(record, 0, NULL, &forknum, NULL); |
93 | if (forknum == INIT_FORKNUM) |
94 | FlushOneBuffer(bitmapbuf); |
95 | UnlockReleaseBuffer(bitmapbuf); |
96 | |
97 | /* add the new bitmap page to the metapage's list of bitmaps */ |
98 | if (XLogReadBufferForRedo(record, 1, &metabuf) == BLK_NEEDS_REDO) |
99 | { |
100 | /* |
101 | * Note: in normal operation, we'd update the metapage while still |
102 | * holding lock on the bitmap page. But during replay it's not |
103 | * necessary to hold that lock, since nobody can see it yet; the |
104 | * creating transaction hasn't yet committed. |
105 | */ |
106 | page = BufferGetPage(metabuf); |
107 | metap = HashPageGetMeta(page); |
108 | |
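		/*
		 * At initial build time the metapage is block 0 and the bucket pages
		 * occupy blocks 1 .. num_buckets, so the first bitmap page lands at
		 * block num_buckets + 1; record it in the metapage's bitmap array.
		 */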
		num_buckets = metap->hashm_maxbucket + 1;
		metap->hashm_mapp[metap->hashm_nmaps] = num_buckets + 1;
		metap->hashm_nmaps++;

		PageSetLSN(page, lsn);
		MarkBufferDirty(metabuf);

		XLogRecGetBlockTag(record, 1, NULL, &forknum, NULL);
		if (forknum == INIT_FORKNUM)
			FlushOneBuffer(metabuf);
	}
	if (BufferIsValid(metabuf))
		UnlockReleaseBuffer(metabuf);
}

/*
 * replay a hash index insert without split
 */
static void
hash_xlog_insert(XLogReaderState *record)
{
	HashMetaPage metap;
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_hash_insert *xlrec = (xl_hash_insert *) XLogRecGetData(record);
	Buffer		buffer;
	Page		page;

	if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
	{
		Size		datalen;
		char	   *datapos = XLogRecGetBlockData(record, 0, &datalen);

		page = BufferGetPage(buffer);

		if (PageAddItem(page, (Item) datapos, datalen, xlrec->offnum,
						false, false) == InvalidOffsetNumber)
			elog(PANIC, "hash_xlog_insert: failed to add item");

		PageSetLSN(page, lsn);
		MarkBufferDirty(buffer);
	}
	if (BufferIsValid(buffer))
		UnlockReleaseBuffer(buffer);

	if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO)
	{
		/*
		 * Note: in normal operation, we'd update the metapage while still
		 * holding lock on the page we inserted into. But during replay it's
		 * not necessary to hold that lock, since no other index updates can
		 * be happening concurrently.
		 */
		page = BufferGetPage(buffer);
		metap = HashPageGetMeta(page);
		metap->hashm_ntuples += 1;

		PageSetLSN(page, lsn);
		MarkBufferDirty(buffer);
	}
	if (BufferIsValid(buffer))
		UnlockReleaseBuffer(buffer);
}
171 | |
172 | /* |
173 | * replay addition of overflow page for hash index |
174 | */ |
175 | static void |
176 | hash_xlog_add_ovfl_page(XLogReaderState *record) |
177 | { |
178 | XLogRecPtr lsn = record->EndRecPtr; |
179 | xl_hash_add_ovfl_page *xlrec = (xl_hash_add_ovfl_page *) XLogRecGetData(record); |
180 | Buffer leftbuf; |
181 | Buffer ovflbuf; |
182 | Buffer metabuf; |
183 | BlockNumber leftblk; |
184 | BlockNumber rightblk; |
185 | BlockNumber newmapblk = InvalidBlockNumber; |
186 | Page ovflpage; |
187 | HashPageOpaque ovflopaque; |
188 | uint32 *num_bucket; |
189 | char *data; |
190 | Size datalen PG_USED_FOR_ASSERTS_ONLY; |
191 | bool new_bmpage = false; |
192 | |
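	/*
	 * Block references in this record: 0 is the new overflow page, 1 is the
	 * page it is linked after (possibly the primary bucket page), 2 is the
	 * existing bitmap page to update (if any), 3 is a newly added bitmap
	 * page (if any), and 4 is the metapage.
	 */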
	XLogRecGetBlockTag(record, 0, NULL, NULL, &rightblk);
	XLogRecGetBlockTag(record, 1, NULL, NULL, &leftblk);

	ovflbuf = XLogInitBufferForRedo(record, 0);
	Assert(BufferIsValid(ovflbuf));

	data = XLogRecGetBlockData(record, 0, &datalen);
	num_bucket = (uint32 *) data;
	Assert(datalen == sizeof(uint32));
	_hash_initbuf(ovflbuf, InvalidBlockNumber, *num_bucket, LH_OVERFLOW_PAGE,
				  true);
	/* update backlink */
	ovflpage = BufferGetPage(ovflbuf);
	ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
	ovflopaque->hasho_prevblkno = leftblk;

	PageSetLSN(ovflpage, lsn);
	MarkBufferDirty(ovflbuf);

	if (XLogReadBufferForRedo(record, 1, &leftbuf) == BLK_NEEDS_REDO)
	{
		Page		leftpage;
		HashPageOpaque leftopaque;

		leftpage = BufferGetPage(leftbuf);
		leftopaque = (HashPageOpaque) PageGetSpecialPointer(leftpage);
		leftopaque->hasho_nextblkno = rightblk;

		PageSetLSN(leftpage, lsn);
		MarkBufferDirty(leftbuf);
	}

	if (BufferIsValid(leftbuf))
		UnlockReleaseBuffer(leftbuf);
	UnlockReleaseBuffer(ovflbuf);

	/*
	 * Note: in normal operation, we'd update the bitmap and meta page while
	 * still holding lock on the overflow pages. But during replay it's not
	 * necessary to hold those locks, since no other index updates can be
	 * happening concurrently.
	 */
	if (XLogRecHasBlockRef(record, 2))
	{
		Buffer		mapbuffer;

		if (XLogReadBufferForRedo(record, 2, &mapbuffer) == BLK_NEEDS_REDO)
		{
			Page		mappage = (Page) BufferGetPage(mapbuffer);
			uint32	   *freep = NULL;
			char	   *data;
			uint32	   *bitmap_page_bit;

			freep = HashPageGetBitmap(mappage);

			data = XLogRecGetBlockData(record, 2, &datalen);
			bitmap_page_bit = (uint32 *) data;

			SETBIT(freep, *bitmap_page_bit);

			PageSetLSN(mappage, lsn);
			MarkBufferDirty(mapbuffer);
		}
		if (BufferIsValid(mapbuffer))
			UnlockReleaseBuffer(mapbuffer);
	}

	if (XLogRecHasBlockRef(record, 3))
	{
		Buffer		newmapbuf;

		newmapbuf = XLogInitBufferForRedo(record, 3);

		_hash_initbitmapbuffer(newmapbuf, xlrec->bmsize, true);

		new_bmpage = true;
		newmapblk = BufferGetBlockNumber(newmapbuf);

		MarkBufferDirty(newmapbuf);
		PageSetLSN(BufferGetPage(newmapbuf), lsn);

		UnlockReleaseBuffer(newmapbuf);
	}

	if (XLogReadBufferForRedo(record, 4, &metabuf) == BLK_NEEDS_REDO)
	{
		HashMetaPage metap;
		Page		page;
		uint32	   *firstfree_ovflpage;

		data = XLogRecGetBlockData(record, 4, &datalen);
		firstfree_ovflpage = (uint32 *) data;

		page = BufferGetPage(metabuf);
		metap = HashPageGetMeta(page);
		metap->hashm_firstfree = *firstfree_ovflpage;

		if (!xlrec->bmpage_found)
		{
			metap->hashm_spares[metap->hashm_ovflpoint]++;

			if (new_bmpage)
			{
				Assert(BlockNumberIsValid(newmapblk));

				metap->hashm_mapp[metap->hashm_nmaps] = newmapblk;
				metap->hashm_nmaps++;
				metap->hashm_spares[metap->hashm_ovflpoint]++;
			}
		}

		PageSetLSN(page, lsn);
		MarkBufferDirty(metabuf);
	}
	if (BufferIsValid(metabuf))
		UnlockReleaseBuffer(metabuf);
}

/*
 * replay allocation of page for split operation
 */
static void
hash_xlog_split_allocate_page(XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_hash_split_allocate_page *xlrec = (xl_hash_split_allocate_page *) XLogRecGetData(record);
	Buffer		oldbuf;
	Buffer		newbuf;
	Buffer		metabuf;
	Size		datalen PG_USED_FOR_ASSERTS_ONLY;
	char	   *data;
	XLogRedoAction action;

	/*
	 * To be consistent with normal operation, here we take cleanup locks on
	 * both the old and new buckets even though there can't be any concurrent
	 * inserts.
	 */

	/* replay the record for old bucket */
	action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &oldbuf);

	/*
	 * Note that we still update the page even if it was restored from a full
	 * page image, because the special space is not included in the image.
	 */
	if (action == BLK_NEEDS_REDO || action == BLK_RESTORED)
	{
		Page		oldpage;
		HashPageOpaque oldopaque;

		oldpage = BufferGetPage(oldbuf);
		oldopaque = (HashPageOpaque) PageGetSpecialPointer(oldpage);

		oldopaque->hasho_flag = xlrec->old_bucket_flag;
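
		/*
		 * For a primary bucket page, hasho_prevblkno is not a block number;
		 * it caches the hashm_maxbucket value as of the most recent split of
		 * this bucket, and xlrec->new_bucket is that value here. Scans use
		 * it to detect whether their cached metapage copy has become stale
		 * (see _hash_getbucketbuf_from_hashkey()).
		 */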
		oldopaque->hasho_prevblkno = xlrec->new_bucket;

		PageSetLSN(oldpage, lsn);
		MarkBufferDirty(oldbuf);
	}

	/* replay the record for new bucket */
	newbuf = XLogInitBufferForRedo(record, 1);
	_hash_initbuf(newbuf, xlrec->new_bucket, xlrec->new_bucket,
				  xlrec->new_bucket_flag, true);
	if (!IsBufferCleanupOK(newbuf))
		elog(PANIC, "hash_xlog_split_allocate_page: failed to acquire cleanup lock");
	MarkBufferDirty(newbuf);
	PageSetLSN(BufferGetPage(newbuf), lsn);

	/*
	 * We could release the lock on the old bucket earlier as well, but doing
	 * it here keeps the replay consistent with normal operation.
	 */
	if (BufferIsValid(oldbuf))
		UnlockReleaseBuffer(oldbuf);
	if (BufferIsValid(newbuf))
		UnlockReleaseBuffer(newbuf);

	/*
	 * Note: in normal operation, we'd update the meta page while still
	 * holding lock on the old and new bucket pages. But during replay it's
	 * not necessary to hold those locks, since no other bucket splits can be
	 * happening concurrently.
	 */

	/* replay the record for metapage changes */
	if (XLogReadBufferForRedo(record, 2, &metabuf) == BLK_NEEDS_REDO)
	{
		Page		page;
		HashMetaPage metap;

		page = BufferGetPage(metabuf);
		metap = HashPageGetMeta(page);
		metap->hashm_maxbucket = xlrec->new_bucket;

		data = XLogRecGetBlockData(record, 2, &datalen);

		if (xlrec->flags & XLH_SPLIT_META_UPDATE_MASKS)
		{
			uint32		lowmask;
			uint32	   *highmask;

			/* extract low and high masks. */
			memcpy(&lowmask, data, sizeof(uint32));
			highmask = (uint32 *) ((char *) data + sizeof(uint32));

			/* update metapage */
			metap->hashm_lowmask = lowmask;
			metap->hashm_highmask = *highmask;

			data += sizeof(uint32) * 2;
		}

		if (xlrec->flags & XLH_SPLIT_META_UPDATE_SPLITPOINT)
		{
			uint32		ovflpoint;
			uint32	   *ovflpages;

			/* extract information of overflow pages. */
			memcpy(&ovflpoint, data, sizeof(uint32));
			ovflpages = (uint32 *) ((char *) data + sizeof(uint32));

			/* update metapage */
			metap->hashm_spares[ovflpoint] = *ovflpages;
			metap->hashm_ovflpoint = ovflpoint;
		}

		MarkBufferDirty(metabuf);
		PageSetLSN(BufferGetPage(metabuf), lsn);
	}

	if (BufferIsValid(metabuf))
		UnlockReleaseBuffer(metabuf);
}

/*
 * replay of split operation
 */
static void
hash_xlog_split_page(XLogReaderState *record)
{
	Buffer		buf;

	if (XLogReadBufferForRedo(record, 0, &buf) != BLK_RESTORED)
		elog(ERROR, "Hash split record did not contain a full-page image");

	UnlockReleaseBuffer(buf);
}

/*
 * replay completion of split operation
 */
static void
hash_xlog_split_complete(XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_hash_split_complete *xlrec = (xl_hash_split_complete *) XLogRecGetData(record);
	Buffer		oldbuf;
	Buffer		newbuf;
	XLogRedoAction action;

	/* replay the record for old bucket */
	action = XLogReadBufferForRedo(record, 0, &oldbuf);

	/*
	 * Note that we still update the page even if it was restored from a full
	 * page image, because the bucket flag is not included in the image.
	 */
	if (action == BLK_NEEDS_REDO || action == BLK_RESTORED)
	{
		Page		oldpage;
		HashPageOpaque oldopaque;

		oldpage = BufferGetPage(oldbuf);
		oldopaque = (HashPageOpaque) PageGetSpecialPointer(oldpage);

		oldopaque->hasho_flag = xlrec->old_bucket_flag;

		PageSetLSN(oldpage, lsn);
		MarkBufferDirty(oldbuf);
	}
	if (BufferIsValid(oldbuf))
		UnlockReleaseBuffer(oldbuf);

	/* replay the record for new bucket */
	action = XLogReadBufferForRedo(record, 1, &newbuf);

	/*
	 * Note that we still update the page even if it was restored from a full
	 * page image, because the bucket flag is not included in the image.
	 */
	if (action == BLK_NEEDS_REDO || action == BLK_RESTORED)
	{
		Page		newpage;
		HashPageOpaque nopaque;

		newpage = BufferGetPage(newbuf);
		nopaque = (HashPageOpaque) PageGetSpecialPointer(newpage);

		nopaque->hasho_flag = xlrec->new_bucket_flag;

		PageSetLSN(newpage, lsn);
		MarkBufferDirty(newbuf);
	}
	if (BufferIsValid(newbuf))
		UnlockReleaseBuffer(newbuf);
}

/*
 * replay move of page contents for squeeze operation of hash index
 */
static void
hash_xlog_move_page_contents(XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_hash_move_page_contents *xldata = (xl_hash_move_page_contents *) XLogRecGetData(record);
	Buffer		bucketbuf = InvalidBuffer;
	Buffer		writebuf = InvalidBuffer;
	Buffer		deletebuf = InvalidBuffer;
	XLogRedoAction action;

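	/*
	 * Block references in this record: 0 is the primary bucket page (present
	 * only when it is not also the write page), 1 is the page receiving the
	 * moved tuples, and 2 is the overflow page they are deleted from.
	 */
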
	/*
	 * Ensure we have a cleanup lock on the primary bucket page before we
	 * start the actual replay operation. This guarantees that no scan can
	 * start, and none can already be in progress, while this operation is
	 * replayed. If scans were allowed during this operation, they could miss
	 * some records or see the same record multiple times.
	 */
	if (xldata->is_prim_bucket_same_wrt)
		action = XLogReadBufferForRedoExtended(record, 1, RBM_NORMAL, true, &writebuf);
	else
	{
		/*
		 * We don't care about the return value, since the only purpose of
		 * reading bucketbuf is to take a cleanup lock on the primary bucket
		 * page.
		 */
		(void) XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &bucketbuf);

		action = XLogReadBufferForRedo(record, 1, &writebuf);
	}

	/* replay the record for adding entries in overflow buffer */
	if (action == BLK_NEEDS_REDO)
	{
		Page		writepage;
		char	   *begin;
		char	   *data;
		Size		datalen;
		uint16		ninserted = 0;

		data = begin = XLogRecGetBlockData(record, 1, &datalen);

		writepage = (Page) BufferGetPage(writebuf);

		if (xldata->ntups > 0)
		{
			OffsetNumber *towrite = (OffsetNumber *) data;

			data += sizeof(OffsetNumber) * xldata->ntups;

			while (data - begin < datalen)
			{
				IndexTuple	itup = (IndexTuple) data;
				Size		itemsz;
				OffsetNumber l;

				itemsz = IndexTupleSize(itup);
				itemsz = MAXALIGN(itemsz);

				data += itemsz;

				l = PageAddItem(writepage, (Item) itup, itemsz, towrite[ninserted], false, false);
				if (l == InvalidOffsetNumber)
					elog(ERROR, "hash_xlog_move_page_contents: failed to add item to hash index page, size %d bytes",
						 (int) itemsz);

				ninserted++;
			}
		}

		/*
		 * The number of tuples inserted must match the count requested in
		 * the REDO record.
		 */
		Assert(ninserted == xldata->ntups);

		PageSetLSN(writepage, lsn);
		MarkBufferDirty(writebuf);
	}

	/* replay the record for deleting entries from overflow buffer */
	if (XLogReadBufferForRedo(record, 2, &deletebuf) == BLK_NEEDS_REDO)
	{
		Page		page;
		char	   *ptr;
		Size		len;

		ptr = XLogRecGetBlockData(record, 2, &len);

		page = (Page) BufferGetPage(deletebuf);

		if (len > 0)
		{
			OffsetNumber *unused;
			OffsetNumber *unend;

			unused = (OffsetNumber *) ptr;
			unend = (OffsetNumber *) ((char *) ptr + len);

			if ((unend - unused) > 0)
				PageIndexMultiDelete(page, unused, unend - unused);
		}

		PageSetLSN(page, lsn);
		MarkBufferDirty(deletebuf);
	}

	/*
	 * Replay is complete, so now we can release the buffers. We release the
	 * locks at the end of the replay operation to ensure that we hold the
	 * lock on the primary bucket page until the end of the operation. We
	 * could release the lock on the write buffer as soon as we are done with
	 * it, when it is not the same as the primary bucket page, but that
	 * doesn't seem worth complicating the code.
	 */
	if (BufferIsValid(deletebuf))
		UnlockReleaseBuffer(deletebuf);

	if (BufferIsValid(writebuf))
		UnlockReleaseBuffer(writebuf);

	if (BufferIsValid(bucketbuf))
		UnlockReleaseBuffer(bucketbuf);
}

/*
 * replay squeeze page operation of hash index
 */
static void
hash_xlog_squeeze_page(XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_hash_squeeze_page *xldata = (xl_hash_squeeze_page *) XLogRecGetData(record);
	Buffer		bucketbuf = InvalidBuffer;
	Buffer		writebuf;
	Buffer		ovflbuf;
	Buffer		prevbuf = InvalidBuffer;
	Buffer		mapbuf;
	XLogRedoAction action;

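	/*
	 * Block references in this record: 0 is the primary bucket page (present
	 * only when it is not also the write page), 1 is the write page, 2 is
	 * the freed overflow page, 3 is the page preceding the freed page in the
	 * bucket chain (if distinct from the write page), 4 is the page
	 * following the freed page (if any), 5 is the bitmap page, and 6 is the
	 * metapage (present when hashm_firstfree needs updating).
	 */
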
	/*
	 * Ensure we have a cleanup lock on the primary bucket page before we
	 * start the actual replay operation. This guarantees that no scan can
	 * start, and none can already be in progress, while this operation is
	 * replayed. If scans were allowed during this operation, they could miss
	 * some records or see the same record multiple times.
	 */
	if (xldata->is_prim_bucket_same_wrt)
		action = XLogReadBufferForRedoExtended(record, 1, RBM_NORMAL, true, &writebuf);
	else
	{
		/*
		 * We don't care about the return value, since the only purpose of
		 * reading bucketbuf is to take a cleanup lock on the primary bucket
		 * page.
		 */
		(void) XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &bucketbuf);

		action = XLogReadBufferForRedo(record, 1, &writebuf);
	}

	/* replay the record for adding entries in overflow buffer */
	if (action == BLK_NEEDS_REDO)
	{
		Page		writepage;
		char	   *begin;
		char	   *data;
		Size		datalen;
		uint16		ninserted = 0;

		data = begin = XLogRecGetBlockData(record, 1, &datalen);

		writepage = (Page) BufferGetPage(writebuf);

		if (xldata->ntups > 0)
		{
			OffsetNumber *towrite = (OffsetNumber *) data;

			data += sizeof(OffsetNumber) * xldata->ntups;

			while (data - begin < datalen)
			{
				IndexTuple	itup = (IndexTuple) data;
				Size		itemsz;
				OffsetNumber l;

				itemsz = IndexTupleSize(itup);
				itemsz = MAXALIGN(itemsz);

				data += itemsz;

				l = PageAddItem(writepage, (Item) itup, itemsz, towrite[ninserted], false, false);
				if (l == InvalidOffsetNumber)
					elog(ERROR, "hash_xlog_squeeze_page: failed to add item to hash index page, size %d bytes",
						 (int) itemsz);

				ninserted++;
			}
		}

		/*
		 * The number of tuples inserted must match the count requested in
		 * the REDO record.
		 */
		Assert(ninserted == xldata->ntups);

		/*
		 * If the page to which we are adding tuples is the page preceding
		 * the freed overflow page, update its nextblkno.
		 */
		if (xldata->is_prev_bucket_same_wrt)
		{
			HashPageOpaque writeopaque = (HashPageOpaque) PageGetSpecialPointer(writepage);

			writeopaque->hasho_nextblkno = xldata->nextblkno;
		}

		PageSetLSN(writepage, lsn);
		MarkBufferDirty(writebuf);
	}

	/* replay the record for initializing overflow buffer */
	if (XLogReadBufferForRedo(record, 2, &ovflbuf) == BLK_NEEDS_REDO)
	{
		Page		ovflpage;
		HashPageOpaque ovflopaque;

		ovflpage = BufferGetPage(ovflbuf);

		_hash_pageinit(ovflpage, BufferGetPageSize(ovflbuf));

		ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);

		ovflopaque->hasho_prevblkno = InvalidBlockNumber;
		ovflopaque->hasho_nextblkno = InvalidBlockNumber;
		ovflopaque->hasho_bucket = -1;
		ovflopaque->hasho_flag = LH_UNUSED_PAGE;
		ovflopaque->hasho_page_id = HASHO_PAGE_ID;

		PageSetLSN(ovflpage, lsn);
		MarkBufferDirty(ovflbuf);
	}
	if (BufferIsValid(ovflbuf))
		UnlockReleaseBuffer(ovflbuf);

	/* replay the record for page previous to the freed overflow page */
	if (!xldata->is_prev_bucket_same_wrt &&
		XLogReadBufferForRedo(record, 3, &prevbuf) == BLK_NEEDS_REDO)
	{
		Page		prevpage = BufferGetPage(prevbuf);
		HashPageOpaque prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage);

		prevopaque->hasho_nextblkno = xldata->nextblkno;

		PageSetLSN(prevpage, lsn);
		MarkBufferDirty(prevbuf);
	}
	if (BufferIsValid(prevbuf))
		UnlockReleaseBuffer(prevbuf);

	/* replay the record for page next to the freed overflow page */
	if (XLogRecHasBlockRef(record, 4))
	{
		Buffer		nextbuf;

		if (XLogReadBufferForRedo(record, 4, &nextbuf) == BLK_NEEDS_REDO)
		{
			Page		nextpage = BufferGetPage(nextbuf);
			HashPageOpaque nextopaque = (HashPageOpaque) PageGetSpecialPointer(nextpage);

			nextopaque->hasho_prevblkno = xldata->prevblkno;

			PageSetLSN(nextpage, lsn);
			MarkBufferDirty(nextbuf);
		}
		if (BufferIsValid(nextbuf))
			UnlockReleaseBuffer(nextbuf);
	}

	if (BufferIsValid(writebuf))
		UnlockReleaseBuffer(writebuf);

	if (BufferIsValid(bucketbuf))
		UnlockReleaseBuffer(bucketbuf);

	/*
	 * Note: in normal operation, we'd update the bitmap and meta page while
	 * still holding lock on the primary bucket page and overflow pages. But
	 * during replay it's not necessary to hold those locks, since no other
	 * index updates can be happening concurrently.
	 */
	/* replay the record for bitmap page */
	if (XLogReadBufferForRedo(record, 5, &mapbuf) == BLK_NEEDS_REDO)
	{
		Page		mappage = (Page) BufferGetPage(mapbuf);
		uint32	   *freep = NULL;
		char	   *data;
		uint32	   *bitmap_page_bit;
		Size		datalen;

		freep = HashPageGetBitmap(mappage);

		data = XLogRecGetBlockData(record, 5, &datalen);
		bitmap_page_bit = (uint32 *) data;

		CLRBIT(freep, *bitmap_page_bit);

		PageSetLSN(mappage, lsn);
		MarkBufferDirty(mapbuf);
	}
	if (BufferIsValid(mapbuf))
		UnlockReleaseBuffer(mapbuf);

	/* replay the record for meta page */
	if (XLogRecHasBlockRef(record, 6))
	{
		Buffer		metabuf;

		if (XLogReadBufferForRedo(record, 6, &metabuf) == BLK_NEEDS_REDO)
		{
			HashMetaPage metap;
			Page		page;
			char	   *data;
			uint32	   *firstfree_ovflpage;
			Size		datalen;

			data = XLogRecGetBlockData(record, 6, &datalen);
			firstfree_ovflpage = (uint32 *) data;

			page = BufferGetPage(metabuf);
			metap = HashPageGetMeta(page);
			metap->hashm_firstfree = *firstfree_ovflpage;

			PageSetLSN(page, lsn);
			MarkBufferDirty(metabuf);
		}
		if (BufferIsValid(metabuf))
			UnlockReleaseBuffer(metabuf);
	}
}

/*
 * replay delete operation of hash index
 */
static void
hash_xlog_delete(XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_hash_delete *xldata = (xl_hash_delete *) XLogRecGetData(record);
	Buffer		bucketbuf = InvalidBuffer;
	Buffer		deletebuf;
	Page		page;
	XLogRedoAction action;

	/*
	 * Ensure we have a cleanup lock on the primary bucket page before we
	 * start the actual replay operation. This guarantees that no scan can
	 * start, and none can already be in progress, while this operation is
	 * replayed. If scans were allowed during this operation, they could miss
	 * some records or see the same record multiple times.
	 */
	if (xldata->is_primary_bucket_page)
		action = XLogReadBufferForRedoExtended(record, 1, RBM_NORMAL, true, &deletebuf);
	else
	{
		/*
		 * We don't care about the return value, since the only purpose of
		 * reading bucketbuf is to take a cleanup lock on the primary bucket
		 * page.
		 */
		(void) XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &bucketbuf);

		action = XLogReadBufferForRedo(record, 1, &deletebuf);
	}

	/* replay the record for deleting entries in bucket page */
	if (action == BLK_NEEDS_REDO)
	{
		char	   *ptr;
		Size		len;

		ptr = XLogRecGetBlockData(record, 1, &len);

		page = (Page) BufferGetPage(deletebuf);

		if (len > 0)
		{
			OffsetNumber *unused;
			OffsetNumber *unend;

			unused = (OffsetNumber *) ptr;
			unend = (OffsetNumber *) ((char *) ptr + len);

			if ((unend - unused) > 0)
				PageIndexMultiDelete(page, unused, unend - unused);
		}

		/*
		 * Mark the page as not containing any LP_DEAD items only if the
		 * clear_dead_marking flag is set to true. See comments in
		 * hashbucketcleanup() for details.
		 */
		if (xldata->clear_dead_marking)
		{
			HashPageOpaque pageopaque;

			pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
			pageopaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;
		}

		PageSetLSN(page, lsn);
		MarkBufferDirty(deletebuf);
	}
	if (BufferIsValid(deletebuf))
		UnlockReleaseBuffer(deletebuf);

	if (BufferIsValid(bucketbuf))
		UnlockReleaseBuffer(bucketbuf);
}

/*
 * replay split cleanup flag operation for primary bucket page.
 */
static void
hash_xlog_split_cleanup(XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	Buffer		buffer;
	Page		page;

	if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
	{
		HashPageOpaque bucket_opaque;

		page = (Page) BufferGetPage(buffer);

		bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page);
		bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP;
		PageSetLSN(page, lsn);
		MarkBufferDirty(buffer);
	}
	if (BufferIsValid(buffer))
		UnlockReleaseBuffer(buffer);
}

/*
 * replay for update meta page
 */
static void
hash_xlog_update_meta_page(XLogReaderState *record)
{
	HashMetaPage metap;
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_hash_update_meta_page *xldata = (xl_hash_update_meta_page *) XLogRecGetData(record);
	Buffer		metabuf;
	Page		page;

	if (XLogReadBufferForRedo(record, 0, &metabuf) == BLK_NEEDS_REDO)
	{
		page = BufferGetPage(metabuf);
		metap = HashPageGetMeta(page);

		metap->hashm_ntuples = xldata->ntuples;

		PageSetLSN(page, lsn);
		MarkBufferDirty(metabuf);
	}
	if (BufferIsValid(metabuf))
		UnlockReleaseBuffer(metabuf);
}

/*
 * replay delete operation in hash index to remove
 * tuples marked as DEAD during index tuple insertion.
 */
static void
hash_xlog_vacuum_one_page(XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_hash_vacuum_one_page *xldata;
	Buffer		buffer;
	Buffer		metabuf;
	Page		page;
	XLogRedoAction action;
	HashPageOpaque pageopaque;

	xldata = (xl_hash_vacuum_one_page *) XLogRecGetData(record);

	/*
	 * If we have any conflict processing to do, it must happen before we
	 * update the page.
	 *
	 * Hash index records that are marked as LP_DEAD and being removed during
	 * hash index tuple insertion can conflict with standby queries. You might
	 * think that vacuum records would conflict as well, but we've handled
	 * that already. XLOG_HEAP2_CLEANUP_INFO records provide the highest xid
	 * cleaned by the vacuum of the heap and so we can resolve any conflicts
	 * just once when that arrives. After that we know that no conflicts
	 * exist from individual hash index vacuum records on that index.
	 */
	if (InHotStandby)
	{
		RelFileNode rnode;

		XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL);
		ResolveRecoveryConflictWithSnapshot(xldata->latestRemovedXid, rnode);
	}

	action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &buffer);

	if (action == BLK_NEEDS_REDO)
	{
		page = (Page) BufferGetPage(buffer);

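		/*
		 * The offsets of the tuples to remove follow the fixed-size
		 * xl_hash_vacuum_one_page struct in the main record data.
		 */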
		if (XLogRecGetDataLen(record) > SizeOfHashVacuumOnePage)
		{
			OffsetNumber *unused;

			unused = (OffsetNumber *) ((char *) xldata + SizeOfHashVacuumOnePage);

			PageIndexMultiDelete(page, unused, xldata->ntuples);
		}

		/*
		 * Mark the page as not containing any LP_DEAD items. See comments in
		 * _hash_vacuum_one_page() for details.
		 */
		pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
		pageopaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;

		PageSetLSN(page, lsn);
		MarkBufferDirty(buffer);
	}
	if (BufferIsValid(buffer))
		UnlockReleaseBuffer(buffer);

	if (XLogReadBufferForRedo(record, 1, &metabuf) == BLK_NEEDS_REDO)
	{
		Page		metapage;
		HashMetaPage metap;

		metapage = BufferGetPage(metabuf);
		metap = HashPageGetMeta(metapage);

		metap->hashm_ntuples -= xldata->ntuples;

		PageSetLSN(metapage, lsn);
		MarkBufferDirty(metabuf);
	}
	if (BufferIsValid(metabuf))
		UnlockReleaseBuffer(metabuf);
}

void
hash_redo(XLogReaderState *record)
{
	uint8		info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;

	switch (info)
	{
		case XLOG_HASH_INIT_META_PAGE:
			hash_xlog_init_meta_page(record);
			break;
		case XLOG_HASH_INIT_BITMAP_PAGE:
			hash_xlog_init_bitmap_page(record);
			break;
		case XLOG_HASH_INSERT:
			hash_xlog_insert(record);
			break;
		case XLOG_HASH_ADD_OVFL_PAGE:
			hash_xlog_add_ovfl_page(record);
			break;
		case XLOG_HASH_SPLIT_ALLOCATE_PAGE:
			hash_xlog_split_allocate_page(record);
			break;
		case XLOG_HASH_SPLIT_PAGE:
			hash_xlog_split_page(record);
			break;
		case XLOG_HASH_SPLIT_COMPLETE:
			hash_xlog_split_complete(record);
			break;
		case XLOG_HASH_MOVE_PAGE_CONTENTS:
			hash_xlog_move_page_contents(record);
			break;
		case XLOG_HASH_SQUEEZE_PAGE:
			hash_xlog_squeeze_page(record);
			break;
		case XLOG_HASH_DELETE:
			hash_xlog_delete(record);
			break;
		case XLOG_HASH_SPLIT_CLEANUP:
			hash_xlog_split_cleanup(record);
			break;
		case XLOG_HASH_UPDATE_META_PAGE:
			hash_xlog_update_meta_page(record);
			break;
		case XLOG_HASH_VACUUM_ONE_PAGE:
			hash_xlog_vacuum_one_page(record);
			break;
		default:
			elog(PANIC, "hash_redo: unknown op code %u", info);
	}
}

/*
 * Mask a hash page before performing consistency checks on it.
 */
void
hash_mask(char *pagedata, BlockNumber blkno)
{
	Page		page = (Page) pagedata;
	HashPageOpaque opaque;
	int			pagetype;

	mask_page_lsn_and_checksum(page);

	mask_page_hint_bits(page);
	mask_unused_space(page);

	opaque = (HashPageOpaque) PageGetSpecialPointer(page);

	pagetype = opaque->hasho_flag & LH_PAGE_TYPE;
	if (pagetype == LH_UNUSED_PAGE)
	{
		/*
		 * Mask everything on an UNUSED page.
		 */
		mask_page_content(page);
	}
	else if (pagetype == LH_BUCKET_PAGE ||
			 pagetype == LH_OVERFLOW_PAGE)
	{
		/*
		 * In hash bucket and overflow pages, it is possible to modify the
		 * LP_FLAGS without emitting any WAL record. Hence, mask the line
		 * pointer flags. See hashgettuple(), _hash_kill_items() for details.
		 */
		mask_lp_flags(page);
	}

	/*
	 * It is possible that the hint bit LH_PAGE_HAS_DEAD_TUPLES may remain
	 * unlogged. So, mask it. See _hash_kill_items() for details.
	 */
	opaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;
}