/*
 * brin_pageops.c
 *		Page-handling routines for BRIN indexes
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/brin/brin_pageops.c
 */
#include "postgres.h"

#include "access/brin_pageops.h"
#include "access/brin_page.h"
#include "access/brin_revmap.h"
#include "access/brin_xlog.h"
#include "access/xloginsert.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "storage/freespace.h"
#include "storage/lmgr.h"
#include "storage/smgr.h"
#include "utils/rel.h"


/*
 * Maximum size of an entry in a BRIN_PAGETYPE_REGULAR page.  We can tolerate
 * a single item per page, unlike other index AMs.
 */
#define BrinMaxItemSize \
	MAXALIGN_DOWN(BLCKSZ - \
				  (MAXALIGN(SizeOfPageHeaderData + \
							sizeof(ItemIdData)) + \
				   MAXALIGN(sizeof(BrinSpecialSpace))))
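
/*
 * For illustration only (these values come from a typical 64-bit build and
 * are an assumption, not a guarantee): with BLCKSZ = 8192, MAXALIGN = 8,
 * SizeOfPageHeaderData = 24, sizeof(ItemIdData) = 4 and
 * sizeof(BrinSpecialSpace) = 8, the macro works out to
 *
 *		MAXALIGN_DOWN(8192 - (MAXALIGN(24 + 4) + 8))
 *			= MAXALIGN_DOWN(8192 - (32 + 8))
 *			= 8152 bytes
 *
 * so a single index tuple may occupy nearly the whole page.
 */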

static Buffer brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
								   bool *extended);
static Size br_page_get_freespace(Page page);
static void brin_initialize_empty_new_buffer(Relation idxrel, Buffer buffer);


/*
 * Update tuple origtup (size origsz), located in offset oldoff of buffer
 * oldbuf, to newtup (size newsz) as summary tuple for the page range starting
 * at heapBlk.  oldbuf must not be locked on entry, and is not locked at exit.
 *
 * If samepage is true, attempt to put the new tuple in the same page, but if
 * there's no room, use some other one.
 *
 * If the update is successful, return true; the revmap is updated to point to
 * the new tuple.  If the update is not done for whatever reason, return false.
 * Caller may retry the update if this happens.
 */
bool
brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
			  BrinRevmap *revmap, BlockNumber heapBlk,
			  Buffer oldbuf, OffsetNumber oldoff,
			  const BrinTuple *origtup, Size origsz,
			  const BrinTuple *newtup, Size newsz,
			  bool samepage)
{
	Page		oldpage;
	ItemId		oldlp;
	BrinTuple  *oldtup;
	Size		oldsz;
	Buffer		newbuf;
	BlockNumber newblk = InvalidBlockNumber;
	bool		extended;

	Assert(newsz == MAXALIGN(newsz));

	/* If the item is oversized, don't bother. */
	if (newsz > BrinMaxItemSize)
	{
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
						newsz, BrinMaxItemSize, RelationGetRelationName(idxrel))));
		return false;			/* keep compiler quiet */
	}

	/* make sure the revmap is long enough to contain the entry we need */
	brinRevmapExtend(revmap, heapBlk);

	if (!samepage)
	{
		/* need a page on which to put the item */
		newbuf = brin_getinsertbuffer(idxrel, oldbuf, newsz, &extended);
		if (!BufferIsValid(newbuf))
		{
			Assert(!extended);
			return false;
		}

		/*
		 * Note: it's possible (though unlikely) that the returned newbuf is
		 * the same as oldbuf, if brin_getinsertbuffer determined that the
		 * old buffer does in fact have enough space.
		 */
		if (newbuf == oldbuf)
		{
			Assert(!extended);
			newbuf = InvalidBuffer;
		}
		else
			newblk = BufferGetBlockNumber(newbuf);
	}
	else
	{
		LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
		newbuf = InvalidBuffer;
		extended = false;
	}
	oldpage = BufferGetPage(oldbuf);
	oldlp = PageGetItemId(oldpage, oldoff);

	/*
	 * Check that the old tuple wasn't updated concurrently: it might have
	 * moved someplace else entirely, and for that matter the whole page
	 * might've become a revmap page.  Note that in the first two cases
	 * checked here, the "oldlp" we just calculated is garbage; but
	 * PageGetItemId() is simple enough that it was safe to do that
	 * calculation anyway.
	 */
	if (!BRIN_IS_REGULAR_PAGE(oldpage) ||
		oldoff > PageGetMaxOffsetNumber(oldpage) ||
		!ItemIdIsNormal(oldlp))
	{
		LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);

		/*
		 * If this happens, and the new buffer was obtained by extending the
		 * relation, then we need to ensure we don't leave it uninitialized
		 * or forget about it.
		 */
		if (BufferIsValid(newbuf))
		{
			if (extended)
				brin_initialize_empty_new_buffer(idxrel, newbuf);
			UnlockReleaseBuffer(newbuf);
			if (extended)
				FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
		}
		return false;
	}

	oldsz = ItemIdGetLength(oldlp);
	oldtup = (BrinTuple *) PageGetItem(oldpage, oldlp);

	/*
	 * ... or it might have been updated in place to different contents.
	 */
	if (!brin_tuples_equal(oldtup, oldsz, origtup, origsz))
	{
		LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
		if (BufferIsValid(newbuf))
		{
			/* As above, initialize and record new page if we got one */
			if (extended)
				brin_initialize_empty_new_buffer(idxrel, newbuf);
			UnlockReleaseBuffer(newbuf);
			if (extended)
				FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
		}
		return false;
	}

	/*
	 * Great, the old tuple is intact.  We can proceed with the update.
	 *
	 * If there's enough room in the old page for the new tuple, replace it.
	 *
	 * Note that there might now be enough space on the page even though the
	 * caller told us there isn't, if a concurrent update moved another tuple
	 * elsewhere or replaced a tuple with a smaller one.
	 */
	if (((BrinPageFlags(oldpage) & BRIN_EVACUATE_PAGE) == 0) &&
		brin_can_do_samepage_update(oldbuf, origsz, newsz))
	{
		START_CRIT_SECTION();
		if (!PageIndexTupleOverwrite(oldpage, oldoff,
									 (Item) unconstify(BrinTuple *, newtup),
									 newsz))
			elog(ERROR, "failed to replace BRIN tuple");
		MarkBufferDirty(oldbuf);

		/* XLOG stuff */
		if (RelationNeedsWAL(idxrel))
		{
			xl_brin_samepage_update xlrec;
			XLogRecPtr	recptr;
			uint8		info = XLOG_BRIN_SAMEPAGE_UPDATE;

			xlrec.offnum = oldoff;

			XLogBeginInsert();
			XLogRegisterData((char *) &xlrec, SizeOfBrinSamepageUpdate);

			XLogRegisterBuffer(0, oldbuf, REGBUF_STANDARD);
			XLogRegisterBufData(0, (char *) unconstify(BrinTuple *, newtup),
								newsz);

			recptr = XLogInsert(RM_BRIN_ID, info);

			PageSetLSN(oldpage, recptr);
		}

		END_CRIT_SECTION();

		LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);

		if (BufferIsValid(newbuf))
		{
			/* As above, initialize and record new page if we got one */
			if (extended)
				brin_initialize_empty_new_buffer(idxrel, newbuf);
			UnlockReleaseBuffer(newbuf);
			if (extended)
				FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
		}

		return true;
	}
	else if (newbuf == InvalidBuffer)
	{
		/*
		 * Not enough space, but caller said that there was.  Tell them to
		 * start over.
		 */
		LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
		return false;
	}
	else
	{
		/*
		 * Not enough free space on the oldpage.  Put the new tuple on the
		 * new page, and update the revmap.
		 */
		Page		newpage = BufferGetPage(newbuf);
		Buffer		revmapbuf;
		ItemPointerData newtid;
		OffsetNumber newoff;
		Size		freespace = 0;

		revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk);

		START_CRIT_SECTION();

		/*
		 * We need to initialize the page if it's newly obtained.  Note we
		 * will WAL-log the initialization as part of the update, so we don't
		 * need to do that here.
		 */
		if (extended)
			brin_page_init(newpage, BRIN_PAGETYPE_REGULAR);

		PageIndexTupleDeleteNoCompact(oldpage, oldoff);
		newoff = PageAddItem(newpage,
							 (Item) unconstify(BrinTuple *, newtup), newsz,
							 InvalidOffsetNumber, false, false);
		if (newoff == InvalidOffsetNumber)
			elog(ERROR, "failed to add BRIN tuple to new page");
		MarkBufferDirty(oldbuf);
		MarkBufferDirty(newbuf);

		/* needed to update FSM below */
		if (extended)
			freespace = br_page_get_freespace(newpage);

		ItemPointerSet(&newtid, newblk, newoff);
		brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, newtid);
		MarkBufferDirty(revmapbuf);

		/* XLOG stuff */
		if (RelationNeedsWAL(idxrel))
		{
			xl_brin_update xlrec;
			XLogRecPtr	recptr;
			uint8		info;

			info = XLOG_BRIN_UPDATE | (extended ? XLOG_BRIN_INIT_PAGE : 0);

			xlrec.insert.offnum = newoff;
			xlrec.insert.heapBlk = heapBlk;
			xlrec.insert.pagesPerRange = pagesPerRange;
			xlrec.oldOffnum = oldoff;

			XLogBeginInsert();

			/* new page */
			XLogRegisterData((char *) &xlrec, SizeOfBrinUpdate);

			XLogRegisterBuffer(0, newbuf,
							   REGBUF_STANDARD | (extended ? REGBUF_WILL_INIT : 0));
			XLogRegisterBufData(0, (char *) unconstify(BrinTuple *, newtup),
								newsz);

			/* revmap page */
			XLogRegisterBuffer(1, revmapbuf, 0);

			/* old page */
			XLogRegisterBuffer(2, oldbuf, REGBUF_STANDARD);

			recptr = XLogInsert(RM_BRIN_ID, info);

			PageSetLSN(oldpage, recptr);
			PageSetLSN(newpage, recptr);
			PageSetLSN(BufferGetPage(revmapbuf), recptr);
		}

		END_CRIT_SECTION();

		LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK);
		LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
		UnlockReleaseBuffer(newbuf);

		if (extended)
		{
			RecordPageWithFreeSpace(idxrel, newblk, freespace);
			FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
		}

		return true;
	}
}

/*
 * Return whether brin_doupdate can do a samepage update.
 */
bool
brin_can_do_samepage_update(Buffer buffer, Size origsz, Size newsz)
{
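	/*
	 * We use PageGetExactFreeSpace() rather than PageGetFreeSpace(), because
	 * a same-page update overwrites the tuple through its existing line
	 * pointer: no space needs to be reserved for a new ItemId.
	 */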
	return
		((newsz <= origsz) ||
		 PageGetExactFreeSpace(BufferGetPage(buffer)) >= (newsz - origsz));
}

/*
 * Insert an index tuple into the index relation.  The revmap is updated to
 * mark the range containing the given page as pointing to the inserted entry.
 * A WAL record is written.
 *
 * The buffer, if valid, is first checked for free space to insert the new
 * entry; if there isn't enough, a new buffer is obtained and pinned.  No
 * buffer lock must be held on entry, no buffer lock is held on exit.
 *
 * Return value is the offset number where the tuple was inserted.
 */
OffsetNumber
brin_doinsert(Relation idxrel, BlockNumber pagesPerRange,
			  BrinRevmap *revmap, Buffer *buffer, BlockNumber heapBlk,
			  BrinTuple *tup, Size itemsz)
{
	Page		page;
	BlockNumber blk;
	OffsetNumber off;
	Size		freespace = 0;
	Buffer		revmapbuf;
	ItemPointerData tid;
	bool		extended;

	Assert(itemsz == MAXALIGN(itemsz));

	/* If the item is oversized, don't even bother. */
	if (itemsz > BrinMaxItemSize)
	{
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
						itemsz, BrinMaxItemSize, RelationGetRelationName(idxrel))));
		return InvalidOffsetNumber; /* keep compiler quiet */
	}

	/* Make sure the revmap is long enough to contain the entry we need */
	brinRevmapExtend(revmap, heapBlk);

	/*
	 * Acquire lock on buffer supplied by caller, if any.  If it doesn't have
	 * enough space, unpin it to obtain a new one below.
	 */
	if (BufferIsValid(*buffer))
	{
		/*
		 * It's possible that another backend (or ourselves!) extended the
		 * revmap over the page we held a pin on, so we cannot assume that
		 * it's still a regular page.
		 */
		LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
		if (br_page_get_freespace(BufferGetPage(*buffer)) < itemsz)
		{
			UnlockReleaseBuffer(*buffer);
			*buffer = InvalidBuffer;
		}
	}

	/*
	 * If we still don't have a usable buffer, have brin_getinsertbuffer
	 * obtain one for us.  (When given no old buffer, brin_getinsertbuffer
	 * never returns InvalidBuffer, so the loop below is expected to succeed
	 * on its first iteration; it is written as a loop purely for safety.)
	 */
	if (!BufferIsValid(*buffer))
	{
		do
			*buffer = brin_getinsertbuffer(idxrel, InvalidBuffer, itemsz,
										   &extended);
		while (!BufferIsValid(*buffer));
	}
	else
		extended = false;

	/* Now obtain lock on revmap buffer */
	revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk);

	page = BufferGetPage(*buffer);
	blk = BufferGetBlockNumber(*buffer);

	/* Execute the actual insertion */
	START_CRIT_SECTION();
	if (extended)
		brin_page_init(page, BRIN_PAGETYPE_REGULAR);
	off = PageAddItem(page, (Item) tup, itemsz, InvalidOffsetNumber,
					  false, false);
	if (off == InvalidOffsetNumber)
		elog(ERROR, "failed to add BRIN tuple to new page");
	MarkBufferDirty(*buffer);

	/* needed to update FSM below */
	if (extended)
		freespace = br_page_get_freespace(page);

	ItemPointerSet(&tid, blk, off);
	brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, tid);
	MarkBufferDirty(revmapbuf);

	/* XLOG stuff */
	if (RelationNeedsWAL(idxrel))
	{
		xl_brin_insert xlrec;
		XLogRecPtr	recptr;
		uint8		info;

		info = XLOG_BRIN_INSERT | (extended ? XLOG_BRIN_INIT_PAGE : 0);
		xlrec.heapBlk = heapBlk;
		xlrec.pagesPerRange = pagesPerRange;
		xlrec.offnum = off;

		XLogBeginInsert();
		XLogRegisterData((char *) &xlrec, SizeOfBrinInsert);

		XLogRegisterBuffer(0, *buffer,
						   REGBUF_STANDARD | (extended ? REGBUF_WILL_INIT : 0));
		XLogRegisterBufData(0, (char *) tup, itemsz);

		XLogRegisterBuffer(1, revmapbuf, 0);

		recptr = XLogInsert(RM_BRIN_ID, info);

		PageSetLSN(page, recptr);
		PageSetLSN(BufferGetPage(revmapbuf), recptr);
	}

	END_CRIT_SECTION();

	/* Tuple is firmly on buffer; we can release our locks */
	LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
	LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK);

	BRIN_elog((DEBUG2, "inserted tuple (%u,%u) for range starting at %u",
			   blk, off, heapBlk));

	if (extended)
	{
		RecordPageWithFreeSpace(idxrel, blk, freespace);
		FreeSpaceMapVacuumRange(idxrel, blk, blk + 1);
	}

	return off;
}

/*
 * Initialize a page with the given type.
 *
 * Caller is responsible for marking it dirty, as appropriate.
 */
void
brin_page_init(Page page, uint16 type)
{
	PageInit(page, BLCKSZ, sizeof(BrinSpecialSpace));

	BrinPageType(page) = type;
}

/*
 * Initialize a new BRIN index's metapage.
 */
void
brin_metapage_init(Page page, BlockNumber pagesPerRange, uint16 version)
{
	BrinMetaPageData *metadata;

	brin_page_init(page, BRIN_PAGETYPE_META);

	metadata = (BrinMetaPageData *) PageGetContents(page);

	metadata->brinMagic = BRIN_META_MAGIC;
	metadata->brinVersion = version;
	metadata->pagesPerRange = pagesPerRange;

	/*
	 * Note we cheat here a little.  0 is not a valid revmap block number
	 * (because it's the metapage buffer), but doing this enables the first
	 * revmap page to be created when the index is.
	 */
	metadata->lastRevmapPage = 0;

	/*
	 * Set pd_lower just past the end of the metadata.  This is essential,
	 * because without doing so, metadata will be lost if xlog.c compresses
	 * the page.
	 */
	((PageHeader) page)->pd_lower =
		((char *) metadata + sizeof(BrinMetaPageData)) - (char *) page;
}
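
/*
 * Usage sketch (hypothetical; the real caller lives in brin.c and its exact
 * sequence may differ): an index build initializes block 0 roughly like
 *
 *		Buffer		meta = ReadBuffer(index, P_NEW);
 *
 *		LockBuffer(meta, BUFFER_LOCK_EXCLUSIVE);
 *		brin_metapage_init(BufferGetPage(meta), BrinGetPagesPerRange(index),
 *						   BRIN_CURRENT_VERSION);
 *		MarkBufferDirty(meta);
 *		(then WAL-log the metapage, unlock and release)
 */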

/*
 * Initiate page evacuation protocol.
 *
 * The page must be locked in exclusive mode by the caller.
 *
 * If the page is not yet initialized or empty, return false without doing
 * anything; it can be used for revmap without any further changes.  If it
 * contains tuples, mark it for evacuation and return true.
 */
bool
brin_start_evacuating_page(Relation idxRel, Buffer buf)
{
	OffsetNumber off;
	OffsetNumber maxoff;
	Page		page;

	page = BufferGetPage(buf);

	if (PageIsNew(page))
		return false;

	maxoff = PageGetMaxOffsetNumber(page);
	for (off = FirstOffsetNumber; off <= maxoff; off++)
	{
		ItemId		lp;

		lp = PageGetItemId(page, off);
		if (ItemIdIsUsed(lp))
		{
			/* prevent other backends from adding more stuff to this page */
			BrinPageFlags(page) |= BRIN_EVACUATE_PAGE;
			MarkBufferDirtyHint(buf, true);

			return true;
		}
	}
	return false;
}

/*
 * Move all tuples out of a page.
 *
 * The caller must hold lock on the page.  The lock and pin are released.
 */
void
brin_evacuate_page(Relation idxRel, BlockNumber pagesPerRange,
				   BrinRevmap *revmap, Buffer buf)
{
	OffsetNumber off;
	OffsetNumber maxoff;
	Page		page;
	BrinTuple  *btup = NULL;
	Size		btupsz = 0;

	page = BufferGetPage(buf);

	Assert(BrinPageFlags(page) & BRIN_EVACUATE_PAGE);

	maxoff = PageGetMaxOffsetNumber(page);
	for (off = FirstOffsetNumber; off <= maxoff; off++)
	{
		BrinTuple  *tup;
		Size		sz;
		ItemId		lp;

		CHECK_FOR_INTERRUPTS();

		lp = PageGetItemId(page, off);
		if (ItemIdIsUsed(lp))
		{
			sz = ItemIdGetLength(lp);
			tup = (BrinTuple *) PageGetItem(page, lp);
			tup = brin_copy_tuple(tup, sz, btup, &btupsz);

			LockBuffer(buf, BUFFER_LOCK_UNLOCK);

			if (!brin_doupdate(idxRel, pagesPerRange, revmap, tup->bt_blkno,
							   buf, off, tup, sz, tup, sz, false))
				off--;			/* retry */

			LockBuffer(buf, BUFFER_LOCK_SHARE);

			/* It's possible that someone extended the revmap over this page */
			if (!BRIN_IS_REGULAR_PAGE(page))
				break;
		}
	}

	UnlockReleaseBuffer(buf);
}
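
/*
 * Simplified sketch of the evacuation protocol as a caller (such as the
 * revmap extension code) might drive it; this is illustrative, not the
 * verbatim caller:
 *
 *		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
 *		if (brin_start_evacuating_page(idxRel, buf))
 *		{
 *			(still holding the lock; this releases both lock and pin)
 *			brin_evacuate_page(idxRel, pagesPerRange, revmap, buf);
 *		}
 *		else
 *		{
 *			(page is new or empty, so it can be reused as a revmap page)
 *			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
 *		}
 */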

/*
 * Given a BRIN index page, initialize it if necessary, and record its
 * current free space in the FSM.
 *
 * The main use for this is when, during vacuuming, an uninitialized page is
 * found, which could be the result of relation extension followed by a crash
 * before the page can be used.
 *
 * Here, we don't bother to update upper FSM pages, instead expecting that our
 * caller (brin_vacuum_scan) will fix them at the end of the scan.  Elsewhere
 * in this file, it's generally a good idea to propagate additions of free
 * space into the upper FSM pages immediately.
 */
void
brin_page_cleanup(Relation idxrel, Buffer buf)
{
	Page		page = BufferGetPage(buf);

	/*
	 * If a page was left uninitialized, initialize it now; also record it in
	 * FSM.
	 *
	 * Somebody else might be extending the relation concurrently.  To avoid
	 * re-initializing the page before they can grab the buffer lock, we
	 * acquire the extension lock momentarily.  Since they hold the extension
	 * lock from before getting the page until after it's been initialized,
	 * we're sure to see their initialization.
	 */
	if (PageIsNew(page))
	{
		LockRelationForExtension(idxrel, ShareLock);
		UnlockRelationForExtension(idxrel, ShareLock);

		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
		if (PageIsNew(page))
		{
			brin_initialize_empty_new_buffer(idxrel, buf);
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
			return;
		}
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
	}

	/* Nothing to be done for non-regular index pages */
	if (BRIN_IS_META_PAGE(BufferGetPage(buf)) ||
		BRIN_IS_REVMAP_PAGE(BufferGetPage(buf)))
		return;

	/* Measure free space and record it */
	RecordPageWithFreeSpace(idxrel, BufferGetBlockNumber(buf),
							br_page_get_freespace(page));
}

/*
 * Return a pinned and exclusively locked buffer which can be used to insert
 * an index item of size itemsz (caller must ensure not to request sizes
 * impossible to fulfill).  If oldbuf is a valid buffer, it is also locked
 * (in block-number order, to avoid deadlocks).
 *
 * If we find that the old page is no longer a regular index page (because
 * of a revmap extension), the old buffer is unlocked and we return
 * InvalidBuffer.
 *
 * If there's no existing page with enough free space to accommodate the new
 * item, the relation is extended.  If this happens, *extended is set to true,
 * and it is the caller's responsibility to initialize the page (and WAL-log
 * that fact) prior to use.  The caller should also update the FSM with the
 * page's remaining free space after the insertion.
 *
 * Note that the caller is not expected to update FSM unless *extended is set
 * true.  This policy means that we'll update FSM when a page is created, and
 * when it's found to have too little space for a desired tuple insertion,
 * but not every single time we add a tuple to the page.
 *
 * Note that in some corner cases it is possible for this routine to extend
 * the relation and then not return the new page.  It is this routine's
 * responsibility to WAL-log the page initialization and to record the page
 * in FSM if that happens, since the caller certainly can't do it.
 */
static Buffer
brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
					 bool *extended)
{
	BlockNumber oldblk;
	BlockNumber newblk;
	Page		page;
	Size		freespace;

	/* callers must have checked */
	Assert(itemsz <= BrinMaxItemSize);

	if (BufferIsValid(oldbuf))
		oldblk = BufferGetBlockNumber(oldbuf);
	else
		oldblk = InvalidBlockNumber;

	/* Choose initial target page, re-using existing target if known */
	newblk = RelationGetTargetBlock(irel);
	if (newblk == InvalidBlockNumber)
		newblk = GetPageWithFreeSpace(irel, itemsz);

	/*
	 * Loop until we find a page with sufficient free space.  By the time we
	 * return to caller out of this loop, both buffers are valid and locked;
	 * if we have to restart here, neither page is locked and newblk isn't
	 * pinned (if it's even valid).
	 */
	for (;;)
	{
		Buffer		buf;
		bool		extensionLockHeld = false;

		CHECK_FOR_INTERRUPTS();

		*extended = false;

		if (newblk == InvalidBlockNumber)
		{
			/*
			 * There's not enough free space in any existing index page,
			 * according to the FSM: extend the relation to obtain a shiny
			 * new page.
			 */
			if (!RELATION_IS_LOCAL(irel))
			{
				LockRelationForExtension(irel, ExclusiveLock);
				extensionLockHeld = true;
			}
			buf = ReadBuffer(irel, P_NEW);
			newblk = BufferGetBlockNumber(buf);
			*extended = true;

			BRIN_elog((DEBUG2, "brin_getinsertbuffer: extending to page %u",
					   BufferGetBlockNumber(buf)));
		}
		else if (newblk == oldblk)
		{
			/*
			 * There's an odd corner-case here where the FSM is out-of-date,
			 * and gave us the old page.
			 */
			buf = oldbuf;
		}
		else
		{
			buf = ReadBuffer(irel, newblk);
		}

		/*
		 * We lock the old buffer first, if it's earlier than the new one;
		 * but then we need to check that it hasn't been turned into a revmap
		 * page concurrently.  If we detect that that happened, give up and
		 * tell caller to start over.
		 */
		if (BufferIsValid(oldbuf) && oldblk < newblk)
		{
			LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
			if (!BRIN_IS_REGULAR_PAGE(BufferGetPage(oldbuf)))
			{
				LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);

				/*
				 * It is possible that the new page was obtained from
				 * extending the relation.  In that case, we must be sure to
				 * record it in the FSM before leaving, because otherwise the
				 * space would be lost forever.  However, we cannot let an
				 * uninitialized page get in the FSM, so we need to
				 * initialize it first.
				 */
				if (*extended)
					brin_initialize_empty_new_buffer(irel, buf);

				if (extensionLockHeld)
					UnlockRelationForExtension(irel, ExclusiveLock);

				ReleaseBuffer(buf);

				if (*extended)
				{
					FreeSpaceMapVacuumRange(irel, newblk, newblk + 1);
					/* shouldn't matter, but don't confuse caller */
					*extended = false;
				}

				return InvalidBuffer;
			}
		}

		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

		if (extensionLockHeld)
			UnlockRelationForExtension(irel, ExclusiveLock);

		page = BufferGetPage(buf);

		/*
		 * We have a new buffer to insert into.  Check that the new page has
		 * enough free space, and return it if it does; otherwise start over.
		 * (br_page_get_freespace also checks that the FSM didn't hand us a
		 * page that has since been repurposed for the revmap.)
		 */
		freespace = *extended ?
			BrinMaxItemSize : br_page_get_freespace(page);
		if (freespace >= itemsz)
		{
			RelationSetTargetBlock(irel, newblk);

			/*
			 * Lock the old buffer if not locked already.  Note that in this
			 * case we know for sure it's a regular page: it's later than the
			 * new page we just got, which is not a revmap page, and revmap
			 * pages are always consecutive.
			 */
			if (BufferIsValid(oldbuf) && oldblk > newblk)
			{
				LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
				Assert(BRIN_IS_REGULAR_PAGE(BufferGetPage(oldbuf)));
			}

			return buf;
		}

		/* This page is no good. */

		/*
		 * If an entirely new page does not contain enough free space for
		 * the new item, then surely that item is oversized.  Complain
		 * loudly; but first make sure we initialize the page and record it
		 * as free, for next time.
		 */
		if (*extended)
		{
			brin_initialize_empty_new_buffer(irel, buf);
			/* since this should not happen, skip FreeSpaceMapVacuum */

			ereport(ERROR,
					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
					 errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
							itemsz, freespace, RelationGetRelationName(irel))));
			return InvalidBuffer;	/* keep compiler quiet */
		}

		if (newblk != oldblk)
			UnlockReleaseBuffer(buf);
		if (BufferIsValid(oldbuf) && oldblk <= newblk)
			LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);

		/*
		 * Update the FSM with the new, presumably smaller, freespace value
		 * for this page, then search for a new target page.
		 */
		newblk = RecordAndGetPageWithFreeSpace(irel, newblk, freespace,
											   itemsz);
	}
}

/*
 * Initialize a page as an empty regular BRIN page, WAL-log this, and record
 * the page in FSM.
 *
 * There are several corner situations in which we extend the relation to
 * obtain a new page and later find that we cannot use it immediately.  When
 * that happens, we don't want to let the page go unrecorded in FSM, because
 * there is no mechanism to get the space back and the index would bloat.
 * Also, because we would not WAL-log the action that would initialize the
 * page, the page would go uninitialized in a standby (or after recovery).
 *
 * While we record the page in FSM here, caller is responsible for doing FSM
 * upper-page update if that seems appropriate.
 */
static void
brin_initialize_empty_new_buffer(Relation idxrel, Buffer buffer)
{
	Page		page;

	BRIN_elog((DEBUG2,
			   "brin_initialize_empty_new_buffer: initializing blank page %u",
			   BufferGetBlockNumber(buffer)));

	START_CRIT_SECTION();
	page = BufferGetPage(buffer);
	brin_page_init(page, BRIN_PAGETYPE_REGULAR);
	MarkBufferDirty(buffer);
	log_newpage_buffer(buffer, true);
	END_CRIT_SECTION();

	/*
	 * We update the FSM for this page, but this is not WAL-logged.  This is
	 * acceptable because VACUUM will scan the index and update the FSM with
	 * pages whose FSM records were forgotten in a crash.
	 */
	RecordPageWithFreeSpace(idxrel, BufferGetBlockNumber(buffer),
							br_page_get_freespace(page));
}


/*
 * Return the amount of free space on a regular BRIN index page.
 *
 * If the page is not a regular page, or has been marked with the
 * BRIN_EVACUATE_PAGE flag, returns 0.
 */
static Size
br_page_get_freespace(Page page)
{
	if (!BRIN_IS_REGULAR_PAGE(page) ||
		(BrinPageFlags(page) & BRIN_EVACUATE_PAGE) != 0)
		return 0;
	else
		return PageGetFreeSpace(page);
}
917