1 | /* |
2 | * brin_pageops.c |
3 | * Page-handling routines for BRIN indexes |
4 | * |
5 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
6 | * Portions Copyright (c) 1994, Regents of the University of California |
7 | * |
8 | * IDENTIFICATION |
9 | * src/backend/access/brin/brin_pageops.c |
10 | */ |
11 | #include "postgres.h" |
12 | |
13 | #include "access/brin_pageops.h" |
14 | #include "access/brin_page.h" |
15 | #include "access/brin_revmap.h" |
16 | #include "access/brin_xlog.h" |
17 | #include "access/xloginsert.h" |
18 | #include "miscadmin.h" |
19 | #include "storage/bufmgr.h" |
20 | #include "storage/freespace.h" |
21 | #include "storage/lmgr.h" |
22 | #include "storage/smgr.h" |
23 | #include "utils/rel.h" |
24 | |
25 | |
/*
 * Maximum size of an entry in a BRIN_PAGETYPE_REGULAR page.  We can tolerate
 * a single item per page, unlike other index AMs.
 *
 * That is: the whole block, minus the page header plus one line pointer,
 * minus the BRIN special space, rounded down to a MAXALIGN boundary.
 */
#define BrinMaxItemSize \
	MAXALIGN_DOWN(BLCKSZ - \
				  (MAXALIGN(SizeOfPageHeaderData + \
							sizeof(ItemIdData)) + \
				   MAXALIGN(sizeof(BrinSpecialSpace))))
35 | |
36 | static Buffer brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz, |
37 | bool *extended); |
38 | static Size br_page_get_freespace(Page page); |
39 | static void brin_initialize_empty_new_buffer(Relation idxrel, Buffer buffer); |
40 | |
41 | |
/*
 * Update tuple origtup (size origsz), located in offset oldoff of buffer
 * oldbuf, to newtup (size newsz) as summary tuple for the page range starting
 * at heapBlk.  oldbuf must not be locked on entry, and is not locked at exit.
 *
 * If samepage is true, attempt to put the new tuple in the same page, but if
 * there's no room, use some other one.
 *
 * If the update is successful, return true; the revmap is updated to point to
 * the new tuple.  If the update is not done for whatever reason, return false.
 * Caller may retry the update if this happens.
 */
bool
brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
			  BrinRevmap *revmap, BlockNumber heapBlk,
			  Buffer oldbuf, OffsetNumber oldoff,
			  const BrinTuple *origtup, Size origsz,
			  const BrinTuple *newtup, Size newsz,
			  bool samepage)
{
	Page		oldpage;
	ItemId		oldlp;
	BrinTuple  *oldtup;
	Size		oldsz;
	Buffer		newbuf;
	BlockNumber newblk = InvalidBlockNumber;
	bool		extended;

	/* Callers are expected to pass MAXALIGN'ed tuple sizes. */
	Assert(newsz == MAXALIGN(newsz));

	/* If the item is oversized, don't bother. */
	if (newsz > BrinMaxItemSize)
	{
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
						newsz, BrinMaxItemSize, RelationGetRelationName(idxrel))));
		return false;			/* keep compiler quiet */
	}

	/* make sure the revmap is long enough to contain the entry we need */
	brinRevmapExtend(revmap, heapBlk);

	if (!samepage)
	{
		/* need a page on which to put the item */
		newbuf = brin_getinsertbuffer(idxrel, oldbuf, newsz, &extended);
		if (!BufferIsValid(newbuf))
		{
			/* oldbuf was found repurposed as a revmap page; caller retries */
			Assert(!extended);
			return false;
		}

		/*
		 * Note: it's possible (though unlikely) that the returned newbuf is
		 * the same as oldbuf, if brin_getinsertbuffer determined that the old
		 * buffer does in fact have enough space.
		 */
		if (newbuf == oldbuf)
		{
			Assert(!extended);
			newbuf = InvalidBuffer;
		}
		else
			newblk = BufferGetBlockNumber(newbuf);
	}
	else
	{
		/* Same-page update: we only need the old page locked. */
		LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
		newbuf = InvalidBuffer;
		extended = false;
	}
	oldpage = BufferGetPage(oldbuf);
	oldlp = PageGetItemId(oldpage, oldoff);

	/*
	 * Check that the old tuple wasn't updated concurrently: it might have
	 * moved someplace else entirely, and for that matter the whole page
	 * might've become a revmap page.  Note that in the first two cases
	 * checked here, the "oldlp" we just calculated is garbage; but
	 * PageGetItemId() is simple enough that it was safe to do that
	 * calculation anyway.
	 */
	if (!BRIN_IS_REGULAR_PAGE(oldpage) ||
		oldoff > PageGetMaxOffsetNumber(oldpage) ||
		!ItemIdIsNormal(oldlp))
	{
		LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);

		/*
		 * If this happens, and the new buffer was obtained by extending the
		 * relation, then we need to ensure we don't leave it uninitialized or
		 * forget about it.
		 */
		if (BufferIsValid(newbuf))
		{
			if (extended)
				brin_initialize_empty_new_buffer(idxrel, newbuf);
			UnlockReleaseBuffer(newbuf);
			if (extended)
				FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
		}
		return false;
	}

	oldsz = ItemIdGetLength(oldlp);
	oldtup = (BrinTuple *) PageGetItem(oldpage, oldlp);

	/*
	 * ... or it might have been updated in place to different contents.
	 */
	if (!brin_tuples_equal(oldtup, oldsz, origtup, origsz))
	{
		LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
		if (BufferIsValid(newbuf))
		{
			/* As above, initialize and record new page if we got one */
			if (extended)
				brin_initialize_empty_new_buffer(idxrel, newbuf);
			UnlockReleaseBuffer(newbuf);
			if (extended)
				FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
		}
		return false;
	}

	/*
	 * Great, the old tuple is intact.  We can proceed with the update.
	 *
	 * If there's enough room in the old page for the new tuple, replace it.
	 *
	 * Note that there might now be enough space on the page even though the
	 * caller told us there isn't, if a concurrent update moved another tuple
	 * elsewhere or replaced a tuple with a smaller one.
	 */
	if (((BrinPageFlags(oldpage) & BRIN_EVACUATE_PAGE) == 0) &&
		brin_can_do_samepage_update(oldbuf, origsz, newsz))
	{
		/* Page modification and its WAL logging must be atomic. */
		START_CRIT_SECTION();
		if (!PageIndexTupleOverwrite(oldpage, oldoff, (Item) unconstify(BrinTuple *, newtup), newsz))
			elog(ERROR, "failed to replace BRIN tuple");
		MarkBufferDirty(oldbuf);

		/* XLOG stuff */
		if (RelationNeedsWAL(idxrel))
		{
			xl_brin_samepage_update xlrec;
			XLogRecPtr	recptr;
			uint8		info = XLOG_BRIN_SAMEPAGE_UPDATE;

			xlrec.offnum = oldoff;

			XLogBeginInsert();
			XLogRegisterData((char *) &xlrec, SizeOfBrinSamepageUpdate);

			/* the replacement tuple travels as registered buffer data */
			XLogRegisterBuffer(0, oldbuf, REGBUF_STANDARD);
			XLogRegisterBufData(0, (char *) unconstify(BrinTuple *, newtup), newsz);

			recptr = XLogInsert(RM_BRIN_ID, info);

			PageSetLSN(oldpage, recptr);
		}

		END_CRIT_SECTION();

		LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);

		/*
		 * The same-page update succeeded, but brin_getinsertbuffer may have
		 * handed us a (possibly newly-extended) page we no longer need.
		 */
		if (BufferIsValid(newbuf))
		{
			/* As above, initialize and record new page if we got one */
			if (extended)
				brin_initialize_empty_new_buffer(idxrel, newbuf);
			UnlockReleaseBuffer(newbuf);
			if (extended)
				FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
		}

		return true;
	}
	else if (newbuf == InvalidBuffer)
	{
		/*
		 * Not enough space, but caller said that there was.  Tell them to
		 * start over.
		 */
		LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
		return false;
	}
	else
	{
		/*
		 * Not enough free space on the oldpage.  Put the new tuple on the new
		 * page, and update the revmap.
		 */
		Page		newpage = BufferGetPage(newbuf);
		Buffer		revmapbuf;
		ItemPointerData newtid;
		OffsetNumber newoff;
		Size		freespace = 0;

		/* lock the revmap page before modifying anything */
		revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk);

		START_CRIT_SECTION();

		/*
		 * We need to initialize the page if it's newly obtained.  Note we
		 * will WAL-log the initialization as part of the update, so we don't
		 * need to do that here.
		 */
		if (extended)
			brin_page_init(newpage, BRIN_PAGETYPE_REGULAR);

		/* delete the old entry, add the new one elsewhere */
		PageIndexTupleDeleteNoCompact(oldpage, oldoff);
		newoff = PageAddItem(newpage, (Item) unconstify(BrinTuple *, newtup), newsz,
							 InvalidOffsetNumber, false, false);
		if (newoff == InvalidOffsetNumber)
			elog(ERROR, "failed to add BRIN tuple to new page");
		MarkBufferDirty(oldbuf);
		MarkBufferDirty(newbuf);

		/* needed to update FSM below */
		if (extended)
			freespace = br_page_get_freespace(newpage);

		/* point the revmap entry at the tuple's new location */
		ItemPointerSet(&newtid, newblk, newoff);
		brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, newtid);
		MarkBufferDirty(revmapbuf);

		/* XLOG stuff */
		if (RelationNeedsWAL(idxrel))
		{
			xl_brin_update xlrec;
			XLogRecPtr	recptr;
			uint8		info;

			info = XLOG_BRIN_UPDATE | (extended ? XLOG_BRIN_INIT_PAGE : 0);

			xlrec.insert.offnum = newoff;
			xlrec.insert.heapBlk = heapBlk;
			xlrec.insert.pagesPerRange = pagesPerRange;
			xlrec.oldOffnum = oldoff;

			XLogBeginInsert();

			/* new page */
			XLogRegisterData((char *) &xlrec, SizeOfBrinUpdate);

			XLogRegisterBuffer(0, newbuf, REGBUF_STANDARD | (extended ? REGBUF_WILL_INIT : 0));
			XLogRegisterBufData(0, (char *) unconstify(BrinTuple *, newtup), newsz);

			/* revmap page */
			XLogRegisterBuffer(1, revmapbuf, 0);

			/* old page */
			XLogRegisterBuffer(2, oldbuf, REGBUF_STANDARD);

			recptr = XLogInsert(RM_BRIN_ID, info);

			/* all three pages carry the same LSN for this record */
			PageSetLSN(oldpage, recptr);
			PageSetLSN(newpage, recptr);
			PageSetLSN(BufferGetPage(revmapbuf), recptr);
		}

		END_CRIT_SECTION();

		LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK);
		LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
		UnlockReleaseBuffer(newbuf);

		if (extended)
		{
			/* record the newly-extended page's free space in the FSM */
			RecordPageWithFreeSpace(idxrel, newblk, freespace);
			FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
		}

		return true;
	}
}
320 | |
321 | /* |
322 | * Return whether brin_doupdate can do a samepage update. |
323 | */ |
324 | bool |
325 | brin_can_do_samepage_update(Buffer buffer, Size origsz, Size newsz) |
326 | { |
327 | return |
328 | ((newsz <= origsz) || |
329 | PageGetExactFreeSpace(BufferGetPage(buffer)) >= (newsz - origsz)); |
330 | } |
331 | |
/*
 * Insert an index tuple into the index relation.  The revmap is updated to
 * mark the range containing the given page as pointing to the inserted entry.
 * A WAL record is written.
 *
 * The buffer, if valid, is first checked for free space to insert the new
 * entry; if there isn't enough, a new buffer is obtained and pinned.  No
 * buffer lock must be held on entry, no buffer lock is held on exit.
 *
 * Return value is the offset number where the tuple was inserted.
 */
OffsetNumber
brin_doinsert(Relation idxrel, BlockNumber pagesPerRange,
			  BrinRevmap *revmap, Buffer *buffer, BlockNumber heapBlk,
			  BrinTuple *tup, Size itemsz)
{
	Page		page;
	BlockNumber blk;
	OffsetNumber off;
	Size		freespace = 0;
	Buffer		revmapbuf;
	ItemPointerData tid;
	bool		extended;

	/* Callers are expected to pass a MAXALIGN'ed tuple size. */
	Assert(itemsz == MAXALIGN(itemsz));

	/* If the item is oversized, don't even bother. */
	if (itemsz > BrinMaxItemSize)
	{
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
						itemsz, BrinMaxItemSize, RelationGetRelationName(idxrel))));
		return InvalidOffsetNumber; /* keep compiler quiet */
	}

	/* Make sure the revmap is long enough to contain the entry we need */
	brinRevmapExtend(revmap, heapBlk);

	/*
	 * Acquire lock on buffer supplied by caller, if any.  If it doesn't have
	 * enough space, unpin it to obtain a new one below.
	 */
	if (BufferIsValid(*buffer))
	{
		/*
		 * It's possible that another backend (or ourselves!) extended the
		 * revmap over the page we held a pin on, so we cannot assume that
		 * it's still a regular page.
		 */
		LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
		if (br_page_get_freespace(BufferGetPage(*buffer)) < itemsz)
		{
			UnlockReleaseBuffer(*buffer);
			*buffer = InvalidBuffer;
		}
	}

	/*
	 * If we still don't have a usable buffer, have brin_getinsertbuffer
	 * obtain one for us.
	 */
	if (!BufferIsValid(*buffer))
	{
		/*
		 * Loop, because brin_getinsertbuffer can return InvalidBuffer
		 * transiently (when a concurrent revmap extension is detected).
		 */
		do
			*buffer = brin_getinsertbuffer(idxrel, InvalidBuffer, itemsz, &extended);
		while (!BufferIsValid(*buffer));
	}
	else
		extended = false;

	/* Now obtain lock on revmap buffer */
	revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk);

	page = BufferGetPage(*buffer);
	blk = BufferGetBlockNumber(*buffer);

	/* Execute the actual insertion */
	START_CRIT_SECTION();
	if (extended)
		brin_page_init(page, BRIN_PAGETYPE_REGULAR);
	off = PageAddItem(page, (Item) tup, itemsz, InvalidOffsetNumber,
					  false, false);
	if (off == InvalidOffsetNumber)
		elog(ERROR, "failed to add BRIN tuple to new page");
	MarkBufferDirty(*buffer);

	/* needed to update FSM below */
	if (extended)
		freespace = br_page_get_freespace(page);

	/* point the revmap entry for this range at the new tuple */
	ItemPointerSet(&tid, blk, off);
	brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, tid);
	MarkBufferDirty(revmapbuf);

	/* XLOG stuff */
	if (RelationNeedsWAL(idxrel))
	{
		xl_brin_insert xlrec;
		XLogRecPtr	recptr;
		uint8		info;

		info = XLOG_BRIN_INSERT | (extended ? XLOG_BRIN_INIT_PAGE : 0);
		xlrec.heapBlk = heapBlk;
		xlrec.pagesPerRange = pagesPerRange;
		xlrec.offnum = off;

		XLogBeginInsert();
		XLogRegisterData((char *) &xlrec, SizeOfBrinInsert);

		/* the tuple itself travels as registered buffer data */
		XLogRegisterBuffer(0, *buffer, REGBUF_STANDARD | (extended ? REGBUF_WILL_INIT : 0));
		XLogRegisterBufData(0, (char *) tup, itemsz);

		XLogRegisterBuffer(1, revmapbuf, 0);

		recptr = XLogInsert(RM_BRIN_ID, info);

		PageSetLSN(page, recptr);
		PageSetLSN(BufferGetPage(revmapbuf), recptr);
	}

	END_CRIT_SECTION();

	/* Tuple is firmly on buffer; we can release our locks */
	LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
	LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK);

	BRIN_elog((DEBUG2, "inserted tuple (%u,%u) for range starting at %u",
			   blk, off, heapBlk));

	if (extended)
	{
		/* record the newly-extended page's free space in the FSM */
		RecordPageWithFreeSpace(idxrel, blk, freespace);
		FreeSpaceMapVacuumRange(idxrel, blk, blk + 1);
	}

	return off;
}
470 | |
/*
 * Initialize a page with the given type.
 *
 * Caller is responsible for marking it dirty, as appropriate.
 */
void
brin_page_init(Page page, uint16 type)
{
	/* standard page layout, reserving room for the BRIN special area */
	PageInit(page, BLCKSZ, sizeof(BrinSpecialSpace));

	/* stamp the special space with the requested page type */
	BrinPageType(page) = type;
}
483 | |
484 | /* |
485 | * Initialize a new BRIN index's metapage. |
486 | */ |
487 | void |
488 | brin_metapage_init(Page page, BlockNumber pagesPerRange, uint16 version) |
489 | { |
490 | BrinMetaPageData *metadata; |
491 | |
492 | brin_page_init(page, BRIN_PAGETYPE_META); |
493 | |
494 | metadata = (BrinMetaPageData *) PageGetContents(page); |
495 | |
496 | metadata->brinMagic = BRIN_META_MAGIC; |
497 | metadata->brinVersion = version; |
498 | metadata->pagesPerRange = pagesPerRange; |
499 | |
500 | /* |
501 | * Note we cheat here a little. 0 is not a valid revmap block number |
502 | * (because it's the metapage buffer), but doing this enables the first |
503 | * revmap page to be created when the index is. |
504 | */ |
505 | metadata->lastRevmapPage = 0; |
506 | |
507 | /* |
508 | * Set pd_lower just past the end of the metadata. This is essential, |
509 | * because without doing so, metadata will be lost if xlog.c compresses |
510 | * the page. |
511 | */ |
512 | ((PageHeader) page)->pd_lower = |
513 | ((char *) metadata + sizeof(BrinMetaPageData)) - (char *) page; |
514 | } |
515 | |
516 | /* |
517 | * Initiate page evacuation protocol. |
518 | * |
519 | * The page must be locked in exclusive mode by the caller. |
520 | * |
521 | * If the page is not yet initialized or empty, return false without doing |
522 | * anything; it can be used for revmap without any further changes. If it |
523 | * contains tuples, mark it for evacuation and return true. |
524 | */ |
525 | bool |
526 | brin_start_evacuating_page(Relation idxRel, Buffer buf) |
527 | { |
528 | OffsetNumber off; |
529 | OffsetNumber maxoff; |
530 | Page page; |
531 | |
532 | page = BufferGetPage(buf); |
533 | |
534 | if (PageIsNew(page)) |
535 | return false; |
536 | |
537 | maxoff = PageGetMaxOffsetNumber(page); |
538 | for (off = FirstOffsetNumber; off <= maxoff; off++) |
539 | { |
540 | ItemId lp; |
541 | |
542 | lp = PageGetItemId(page, off); |
543 | if (ItemIdIsUsed(lp)) |
544 | { |
545 | /* prevent other backends from adding more stuff to this page */ |
546 | BrinPageFlags(page) |= BRIN_EVACUATE_PAGE; |
547 | MarkBufferDirtyHint(buf, true); |
548 | |
549 | return true; |
550 | } |
551 | } |
552 | return false; |
553 | } |
554 | |
/*
 * Move all tuples out of a page.
 *
 * The caller must hold lock on the page.  The lock and pin are released.
 */
void
brin_evacuate_page(Relation idxRel, BlockNumber pagesPerRange,
				   BrinRevmap *revmap, Buffer buf)
{
	OffsetNumber off;
	OffsetNumber maxoff;
	Page		page;
	BrinTuple  *btup = NULL;
	Size		btupsz = 0;

	page = BufferGetPage(buf);

	/* Page must already have been marked by brin_start_evacuating_page. */
	Assert(BrinPageFlags(page) & BRIN_EVACUATE_PAGE);

	maxoff = PageGetMaxOffsetNumber(page);
	for (off = FirstOffsetNumber; off <= maxoff; off++)
	{
		BrinTuple  *tup;
		Size		sz;
		ItemId		lp;

		CHECK_FOR_INTERRUPTS();

		lp = PageGetItemId(page, off);
		if (ItemIdIsUsed(lp))
		{
			/*
			 * Copy the tuple aside before dropping the lock, since
			 * brin_doupdate must be called without the page locked.  btup
			 * and btupsz let brin_copy_tuple reuse its allocation across
			 * iterations.
			 */
			sz = ItemIdGetLength(lp);
			tup = (BrinTuple *) PageGetItem(page, lp);
			tup = brin_copy_tuple(tup, sz, btup, &btupsz);

			LockBuffer(buf, BUFFER_LOCK_UNLOCK);

			/*
			 * "Update" the tuple to an identical copy, forcing it onto some
			 * other page (samepage = false).  If a concurrent change defeats
			 * us, back up one offset so the loop retries this slot.
			 */
			if (!brin_doupdate(idxRel, pagesPerRange, revmap, tup->bt_blkno,
							   buf, off, tup, sz, tup, sz, false))
				off--;			/* retry */

			LockBuffer(buf, BUFFER_LOCK_SHARE);

			/* It's possible that someone extended the revmap over this page */
			if (!BRIN_IS_REGULAR_PAGE(page))
				break;
		}
	}

	UnlockReleaseBuffer(buf);
}
606 | |
/*
 * Given a BRIN index page, initialize it if necessary, and record its
 * current free space in the FSM.
 *
 * The main use for this is when, during vacuuming, an uninitialized page is
 * found, which could be the result of relation extension followed by a crash
 * before the page can be used.
 *
 * Here, we don't bother to update upper FSM pages, instead expecting that our
 * caller (brin_vacuum_scan) will fix them at the end of the scan.  Elsewhere
 * in this file, it's generally a good idea to propagate additions of free
 * space into the upper FSM pages immediately.
 */
void
brin_page_cleanup(Relation idxrel, Buffer buf)
{
	Page		page = BufferGetPage(buf);

	/*
	 * If a page was left uninitialized, initialize it now; also record it in
	 * FSM.
	 *
	 * Somebody else might be extending the relation concurrently.  To avoid
	 * re-initializing the page before they can grab the buffer lock, we
	 * acquire the extension lock momentarily.  Since they hold the extension
	 * lock from before getting the page and after its been initialized, we're
	 * sure to see their initialization.
	 */
	if (PageIsNew(page))
	{
		/* wait out any in-progress extension, then immediately release */
		LockRelationForExtension(idxrel, ShareLock);
		UnlockRelationForExtension(idxrel, ShareLock);

		/* recheck under buffer lock: the extender may have initialized it */
		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
		if (PageIsNew(page))
		{
			/* still new: init it ourselves; this also records it in FSM */
			brin_initialize_empty_new_buffer(idxrel, buf);
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
			return;
		}
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
	}

	/* Nothing to be done for non-regular index pages */
	if (BRIN_IS_META_PAGE(BufferGetPage(buf)) ||
		BRIN_IS_REVMAP_PAGE(BufferGetPage(buf)))
		return;

	/* Measure free space and record it */
	RecordPageWithFreeSpace(idxrel, BufferGetBlockNumber(buf),
							br_page_get_freespace(page));
}
659 | |
/*
 * Return a pinned and exclusively locked buffer which can be used to insert an
 * index item of size itemsz (caller must ensure not to request sizes
 * impossible to fulfill).  If oldbuf is a valid buffer, it is also locked (in
 * an order determined to avoid deadlocks).
 *
 * If we find that the old page is no longer a regular index page (because
 * of a revmap extension), the old buffer is unlocked and we return
 * InvalidBuffer.
 *
 * If there's no existing page with enough free space to accommodate the new
 * item, the relation is extended.  If this happens, *extended is set to true,
 * and it is the caller's responsibility to initialize the page (and WAL-log
 * that fact) prior to use.  The caller should also update the FSM with the
 * page's remaining free space after the insertion.
 *
 * Note that the caller is not expected to update FSM unless *extended is set
 * true.  This policy means that we'll update FSM when a page is created, and
 * when it's found to have too little space for a desired tuple insertion,
 * but not every single time we add a tuple to the page.
 *
 * Note that in some corner cases it is possible for this routine to extend
 * the relation and then not return the new page.  It is this routine's
 * responsibility to WAL-log the page initialization and to record the page in
 * FSM if that happens, since the caller certainly can't do it.
 */
static Buffer
brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
					 bool *extended)
{
	BlockNumber oldblk;
	BlockNumber newblk;
	Page		page;
	Size		freespace;

	/* callers must have checked */
	Assert(itemsz <= BrinMaxItemSize);

	if (BufferIsValid(oldbuf))
		oldblk = BufferGetBlockNumber(oldbuf);
	else
		oldblk = InvalidBlockNumber;

	/* Choose initial target page, re-using existing target if known */
	newblk = RelationGetTargetBlock(irel);
	if (newblk == InvalidBlockNumber)
		newblk = GetPageWithFreeSpace(irel, itemsz);

	/*
	 * Loop until we find a page with sufficient free space.  By the time we
	 * return to caller out of this loop, both buffers are valid and locked;
	 * if we have to restart here, neither page is locked and newblk isn't
	 * pinned (if it's even valid).
	 */
	for (;;)
	{
		Buffer		buf;
		bool		extensionLockHeld = false;

		CHECK_FOR_INTERRUPTS();

		*extended = false;

		if (newblk == InvalidBlockNumber)
		{
			/*
			 * There's not enough free space in any existing index page,
			 * according to the FSM: extend the relation to obtain a shiny new
			 * page.
			 */
			if (!RELATION_IS_LOCAL(irel))
			{
				/* serialize relation extension against other backends */
				LockRelationForExtension(irel, ExclusiveLock);
				extensionLockHeld = true;
			}
			buf = ReadBuffer(irel, P_NEW);
			newblk = BufferGetBlockNumber(buf);
			*extended = true;

			BRIN_elog((DEBUG2, "brin_getinsertbuffer: extending to page %u",
					   BufferGetBlockNumber(buf)));
		}
		else if (newblk == oldblk)
		{
			/*
			 * There's an odd corner-case here where the FSM is out-of-date,
			 * and gave us the old page.
			 */
			buf = oldbuf;
		}
		else
		{
			buf = ReadBuffer(irel, newblk);
		}

		/*
		 * We lock the old buffer first, if it's earlier than the new one; but
		 * then we need to check that it hasn't been turned into a revmap page
		 * concurrently.  If we detect that that happened, give up and tell
		 * caller to start over.
		 */
		if (BufferIsValid(oldbuf) && oldblk < newblk)
		{
			LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
			if (!BRIN_IS_REGULAR_PAGE(BufferGetPage(oldbuf)))
			{
				LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);

				/*
				 * It is possible that the new page was obtained from
				 * extending the relation.  In that case, we must be sure to
				 * record it in the FSM before leaving, because otherwise the
				 * space would be lost forever.  However, we cannot let an
				 * uninitialized page get in the FSM, so we need to initialize
				 * it first.
				 */
				if (*extended)
					brin_initialize_empty_new_buffer(irel, buf);

				if (extensionLockHeld)
					UnlockRelationForExtension(irel, ExclusiveLock);

				ReleaseBuffer(buf);

				if (*extended)
				{
					FreeSpaceMapVacuumRange(irel, newblk, newblk + 1);
					/* shouldn't matter, but don't confuse caller */
					*extended = false;
				}

				return InvalidBuffer;
			}
		}

		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

		/* page is pinned and locked; safe to let other extenders proceed */
		if (extensionLockHeld)
			UnlockRelationForExtension(irel, ExclusiveLock);

		page = BufferGetPage(buf);

		/*
		 * We have a new buffer to insert into.  Check that the new page has
		 * enough free space, and return it if it does; otherwise start over.
		 * (br_page_get_freespace also checks that the FSM didn't hand us a
		 * page that has since been repurposed for the revmap.)
		 *
		 * A freshly-extended page is still all-zeroes, so compute its
		 * eventual free space rather than reading it off the page.
		 */
		freespace = *extended ?
			BrinMaxItemSize : br_page_get_freespace(page);
		if (freespace >= itemsz)
		{
			/* remember this page as the insertion target for next time */
			RelationSetTargetBlock(irel, newblk);

			/*
			 * Lock the old buffer if not locked already.  Note that in this
			 * case we know for sure it's a regular page: it's later than the
			 * new page we just got, which is not a revmap page, and revmap
			 * pages are always consecutive.
			 */
			if (BufferIsValid(oldbuf) && oldblk > newblk)
			{
				LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
				Assert(BRIN_IS_REGULAR_PAGE(BufferGetPage(oldbuf)));
			}

			return buf;
		}

		/* This page is no good. */

		/*
		 * If an entirely new page does not contain enough free space for the
		 * new item, then surely that item is oversized.  Complain loudly; but
		 * first make sure we initialize the page and record it as free, for
		 * next time.
		 */
		if (*extended)
		{
			brin_initialize_empty_new_buffer(irel, buf);
			/* since this should not happen, skip FreeSpaceMapVacuum */

			ereport(ERROR,
					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
					 errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
							itemsz, freespace, RelationGetRelationName(irel))));
			return InvalidBuffer;	/* keep compiler quiet */
		}

		/* drop whatever we locked above before retrying */
		if (newblk != oldblk)
			UnlockReleaseBuffer(buf);
		if (BufferIsValid(oldbuf) && oldblk <= newblk)
			LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);

		/*
		 * Update the FSM with the new, presumably smaller, freespace value
		 * for this page, then search for a new target page.
		 */
		newblk = RecordAndGetPageWithFreeSpace(irel, newblk, freespace, itemsz);
	}
}
861 | |
/*
 * Initialize a page as an empty regular BRIN page, WAL-log this, and record
 * the page in FSM.
 *
 * There are several corner situations in which we extend the relation to
 * obtain a new page and later find that we cannot use it immediately.  When
 * that happens, we don't want to leave the page go unrecorded in FSM, because
 * there is no mechanism to get the space back and the index would bloat.
 * Also, because we would not WAL-log the action that would initialize the
 * page, the page would go uninitialized in a standby (or after recovery).
 *
 * While we record the page in FSM here, caller is responsible for doing FSM
 * upper-page update if that seems appropriate.
 */
static void
brin_initialize_empty_new_buffer(Relation idxrel, Buffer buffer)
{
	Page		page;

	BRIN_elog((DEBUG2,
			   "brin_initialize_empty_new_buffer: initializing blank page %u",
			   BufferGetBlockNumber(buffer)));

	/* init, dirty, and WAL-log the page as one atomic action */
	START_CRIT_SECTION();
	page = BufferGetPage(buffer);
	brin_page_init(page, BRIN_PAGETYPE_REGULAR);
	MarkBufferDirty(buffer);
	log_newpage_buffer(buffer, true);
	END_CRIT_SECTION();

	/*
	 * We update the FSM for this page, but this is not WAL-logged.  This is
	 * acceptable because VACUUM will scan the index and update the FSM with
	 * pages whose FSM records were forgotten in a crash.
	 */
	RecordPageWithFreeSpace(idxrel, BufferGetBlockNumber(buffer),
							br_page_get_freespace(page));
}
900 | |
901 | |
902 | /* |
903 | * Return the amount of free space on a regular BRIN index page. |
904 | * |
905 | * If the page is not a regular page, or has been marked with the |
906 | * BRIN_EVACUATE_PAGE flag, returns 0. |
907 | */ |
908 | static Size |
909 | br_page_get_freespace(Page page) |
910 | { |
911 | if (!BRIN_IS_REGULAR_PAGE(page) || |
912 | (BrinPageFlags(page) & BRIN_EVACUATE_PAGE) != 0) |
913 | return 0; |
914 | else |
915 | return PageGetFreeSpace(page); |
916 | } |
917 | |