1/*-------------------------------------------------------------------------
2 *
3 * gindatapage.c
4 * routines for handling GIN posting tree pages.
5 *
6 *
7 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
9 *
10 * IDENTIFICATION
11 * src/backend/access/gin/gindatapage.c
12 *-------------------------------------------------------------------------
13 */
14
15#include "postgres.h"
16
17#include "access/gin_private.h"
18#include "access/ginxlog.h"
19#include "access/xloginsert.h"
20#include "lib/ilist.h"
21#include "miscadmin.h"
22#include "storage/predicate.h"
23#include "utils/rel.h"
24
25/*
26 * Min, Max and Target size of posting lists stored on leaf pages, in bytes.
27 *
28 * The code can deal with any size, but random access is more efficient when
29 * a number of smaller lists are stored, rather than one big list. If a
30 * posting list would become larger than Max size as a result of insertions,
31 * it is split into two. If a posting list would be smaller than minimum
32 * size, it is merged with the next posting list.
33 */
34#define GinPostingListSegmentMaxSize 384
35#define GinPostingListSegmentTargetSize 256
36#define GinPostingListSegmentMinSize 128
37
38/*
39 * At least this many items fit in a GinPostingListSegmentMaxSize-bytes
40 * long segment. This is used when estimating how much space is required
41 * for N items, at minimum.
42 */
43#define MinTuplesPerSegment ((GinPostingListSegmentMaxSize - 2) / 6)
44
45/*
46 * A working struct for manipulating a posting tree leaf page.
47 */
48typedef struct
49{
50 dlist_head segments; /* a list of leafSegmentInfos */
51
52 /*
53 * The following fields represent how the segments are split across pages,
54 * if a page split is required. Filled in by leafRepackItems.
55 */
56 dlist_node *lastleft; /* last segment on left page */
57 int lsize; /* total size on left page */
58 int rsize; /* total size on right page */
59
60 bool oldformat; /* page is in pre-9.4 format on disk */
61
62 /*
63 * If we need WAL data representing the reconstructed leaf page, it's
64 * stored here by computeLeafRecompressWALData.
65 */
66 char *walinfo; /* buffer start */
67 int walinfolen; /* and length */
68} disassembledLeaf;
69
70typedef struct
71{
72 dlist_node node; /* linked list pointers */
73
74 /*-------------
75 * 'action' indicates the status of this in-memory segment, compared to
76 * what's on disk. It is one of the GIN_SEGMENT_* action codes:
77 *
78 * UNMODIFIED no changes
79 * DELETE the segment is to be removed. 'seg' and 'items' are
80 * ignored
81 * INSERT this is a completely new segment
82 * REPLACE this replaces an existing segment with new content
83 * ADDITEMS like REPLACE, but no items have been removed, and we track
84 * in detail what items have been added to this segment, in
85 * 'modifieditems'
86 *-------------
87 */
88 char action;
89
90 ItemPointerData *modifieditems;
91 uint16 nmodifieditems;
92
93 /*
94 * The following fields represent the items in this segment. If 'items' is
95 * not NULL, it contains a palloc'd array of the itemsin this segment. If
96 * 'seg' is not NULL, it contains the items in an already-compressed
97 * format. It can point to an on-disk page (!modified), or a palloc'd
98 * segment in memory. If both are set, they must represent the same items.
99 */
100 GinPostingList *seg;
101 ItemPointer items;
102 int nitems; /* # of items in 'items', if items != NULL */
103} leafSegmentInfo;
104
105static ItemPointer dataLeafPageGetUncompressed(Page page, int *nitems);
106static void dataSplitPageInternal(GinBtree btree, Buffer origbuf,
107 GinBtreeStack *stack,
108 void *insertdata, BlockNumber updateblkno,
109 Page *newlpage, Page *newrpage);
110
111static disassembledLeaf *disassembleLeaf(Page page);
112static bool leafRepackItems(disassembledLeaf *leaf, ItemPointer remaining);
113static bool addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems,
114 int nNewItems);
115
116static void computeLeafRecompressWALData(disassembledLeaf *leaf);
117static void dataPlaceToPageLeafRecompress(Buffer buf, disassembledLeaf *leaf);
118static void dataPlaceToPageLeafSplit(disassembledLeaf *leaf,
119 ItemPointerData lbound, ItemPointerData rbound,
120 Page lpage, Page rpage);
121
122/*
123 * Read TIDs from leaf data page to single uncompressed array. The TIDs are
124 * returned in ascending order.
125 *
126 * advancePast is a hint, indicating that the caller is only interested in
127 * TIDs > advancePast. To return all items, use ItemPointerSetMin.
128 *
129 * Note: This function can still return items smaller than advancePast that
130 * are in the same posting list as the items of interest, so the caller must
131 * still check all the returned items. But passing it allows this function to
132 * skip whole posting lists.
133 */
134ItemPointer
135GinDataLeafPageGetItems(Page page, int *nitems, ItemPointerData advancePast)
136{
137 ItemPointer result;
138
139 if (GinPageIsCompressed(page))
140 {
141 GinPostingList *seg = GinDataLeafPageGetPostingList(page);
142 Size len = GinDataLeafPageGetPostingListSize(page);
143 Pointer endptr = ((Pointer) seg) + len;
144 GinPostingList *next;
145
146 /* Skip to the segment containing advancePast+1 */
147 if (ItemPointerIsValid(&advancePast))
148 {
149 next = GinNextPostingListSegment(seg);
150 while ((Pointer) next < endptr &&
151 ginCompareItemPointers(&next->first, &advancePast) <= 0)
152 {
153 seg = next;
154 next = GinNextPostingListSegment(seg);
155 }
156 len = endptr - (Pointer) seg;
157 }
158
159 if (len > 0)
160 result = ginPostingListDecodeAllSegments(seg, len, nitems);
161 else
162 {
163 result = NULL;
164 *nitems = 0;
165 }
166 }
167 else
168 {
169 ItemPointer tmp = dataLeafPageGetUncompressed(page, nitems);
170
171 result = palloc((*nitems) * sizeof(ItemPointerData));
172 memcpy(result, tmp, (*nitems) * sizeof(ItemPointerData));
173 }
174
175 return result;
176}
177
178/*
179 * Places all TIDs from leaf data page to bitmap.
180 */
181int
182GinDataLeafPageGetItemsToTbm(Page page, TIDBitmap *tbm)
183{
184 ItemPointer uncompressed;
185 int nitems;
186
187 if (GinPageIsCompressed(page))
188 {
189 GinPostingList *segment = GinDataLeafPageGetPostingList(page);
190 Size len = GinDataLeafPageGetPostingListSize(page);
191
192 nitems = ginPostingListDecodeAllSegmentsToTbm(segment, len, tbm);
193 }
194 else
195 {
196 uncompressed = dataLeafPageGetUncompressed(page, &nitems);
197
198 if (nitems > 0)
199 tbm_add_tuples(tbm, uncompressed, nitems, false);
200 }
201
202 return nitems;
203}
204
205/*
206 * Get pointer to the uncompressed array of items on a pre-9.4 format
207 * uncompressed leaf page. The number of items in the array is returned in
208 * *nitems.
209 */
210static ItemPointer
211dataLeafPageGetUncompressed(Page page, int *nitems)
212{
213 ItemPointer items;
214
215 Assert(!GinPageIsCompressed(page));
216
217 /*
218 * In the old pre-9.4 page format, the whole page content is used for
219 * uncompressed items, and the number of items is stored in 'maxoff'
220 */
221 items = (ItemPointer) GinDataPageGetData(page);
222 *nitems = GinPageGetOpaque(page)->maxoff;
223
224 return items;
225}
226
227/*
228 * Check if we should follow the right link to find the item we're searching
229 * for.
230 *
231 * Compares inserting item pointer with the right bound of the current page.
232 */
233static bool
234dataIsMoveRight(GinBtree btree, Page page)
235{
236 ItemPointer iptr = GinDataPageGetRightBound(page);
237
238 if (GinPageRightMost(page))
239 return false;
240
241 return (ginCompareItemPointers(&btree->itemptr, iptr) > 0) ? true : false;
242}
243
244/*
245 * Find correct PostingItem in non-leaf page. It is assumed that this is
246 * the correct page, and the searched value SHOULD be on the page.
247 */
248static BlockNumber
249dataLocateItem(GinBtree btree, GinBtreeStack *stack)
250{
251 OffsetNumber low,
252 high,
253 maxoff;
254 PostingItem *pitem = NULL;
255 int result;
256 Page page = BufferGetPage(stack->buffer);
257
258 Assert(!GinPageIsLeaf(page));
259 Assert(GinPageIsData(page));
260
261 if (btree->fullScan)
262 {
263 stack->off = FirstOffsetNumber;
264 stack->predictNumber *= GinPageGetOpaque(page)->maxoff;
265 return btree->getLeftMostChild(btree, page);
266 }
267
268 low = FirstOffsetNumber;
269 maxoff = high = GinPageGetOpaque(page)->maxoff;
270 Assert(high >= low);
271
272 high++;
273
274 while (high > low)
275 {
276 OffsetNumber mid = low + ((high - low) / 2);
277
278 pitem = GinDataPageGetPostingItem(page, mid);
279
280 if (mid == maxoff)
281 {
282 /*
283 * Right infinity, page already correctly chosen with a help of
284 * dataIsMoveRight
285 */
286 result = -1;
287 }
288 else
289 {
290 pitem = GinDataPageGetPostingItem(page, mid);
291 result = ginCompareItemPointers(&btree->itemptr, &(pitem->key));
292 }
293
294 if (result == 0)
295 {
296 stack->off = mid;
297 return PostingItemGetBlockNumber(pitem);
298 }
299 else if (result > 0)
300 low = mid + 1;
301 else
302 high = mid;
303 }
304
305 Assert(high >= FirstOffsetNumber && high <= maxoff);
306
307 stack->off = high;
308 pitem = GinDataPageGetPostingItem(page, high);
309 return PostingItemGetBlockNumber(pitem);
310}
311
312/*
313 * Find link to blkno on non-leaf page, returns offset of PostingItem
314 */
315static OffsetNumber
316dataFindChildPtr(GinBtree btree, Page page, BlockNumber blkno, OffsetNumber storedOff)
317{
318 OffsetNumber i,
319 maxoff = GinPageGetOpaque(page)->maxoff;
320 PostingItem *pitem;
321
322 Assert(!GinPageIsLeaf(page));
323 Assert(GinPageIsData(page));
324
325 /* if page isn't changed, we return storedOff */
326 if (storedOff >= FirstOffsetNumber && storedOff <= maxoff)
327 {
328 pitem = GinDataPageGetPostingItem(page, storedOff);
329 if (PostingItemGetBlockNumber(pitem) == blkno)
330 return storedOff;
331
332 /*
333 * we hope, that needed pointer goes to right. It's true if there
334 * wasn't a deletion
335 */
336 for (i = storedOff + 1; i <= maxoff; i++)
337 {
338 pitem = GinDataPageGetPostingItem(page, i);
339 if (PostingItemGetBlockNumber(pitem) == blkno)
340 return i;
341 }
342
343 maxoff = storedOff - 1;
344 }
345
346 /* last chance */
347 for (i = FirstOffsetNumber; i <= maxoff; i++)
348 {
349 pitem = GinDataPageGetPostingItem(page, i);
350 if (PostingItemGetBlockNumber(pitem) == blkno)
351 return i;
352 }
353
354 return InvalidOffsetNumber;
355}
356
357/*
358 * Return blkno of leftmost child
359 */
360static BlockNumber
361dataGetLeftMostPage(GinBtree btree, Page page)
362{
363 PostingItem *pitem;
364
365 Assert(!GinPageIsLeaf(page));
366 Assert(GinPageIsData(page));
367 Assert(GinPageGetOpaque(page)->maxoff >= FirstOffsetNumber);
368
369 pitem = GinDataPageGetPostingItem(page, FirstOffsetNumber);
370 return PostingItemGetBlockNumber(pitem);
371}
372
373/*
374 * Add PostingItem to a non-leaf page.
375 */
376void
377GinDataPageAddPostingItem(Page page, PostingItem *data, OffsetNumber offset)
378{
379 OffsetNumber maxoff = GinPageGetOpaque(page)->maxoff;
380 char *ptr;
381
382 Assert(PostingItemGetBlockNumber(data) != InvalidBlockNumber);
383 Assert(!GinPageIsLeaf(page));
384
385 if (offset == InvalidOffsetNumber)
386 {
387 ptr = (char *) GinDataPageGetPostingItem(page, maxoff + 1);
388 }
389 else
390 {
391 ptr = (char *) GinDataPageGetPostingItem(page, offset);
392 if (offset != maxoff + 1)
393 memmove(ptr + sizeof(PostingItem),
394 ptr,
395 (maxoff - offset + 1) * sizeof(PostingItem));
396 }
397 memcpy(ptr, data, sizeof(PostingItem));
398
399 maxoff++;
400 GinPageGetOpaque(page)->maxoff = maxoff;
401
402 /*
403 * Also set pd_lower to the end of the posting items, to follow the
404 * "standard" page layout, so that we can squeeze out the unused space
405 * from full-page images.
406 */
407 GinDataPageSetDataSize(page, maxoff * sizeof(PostingItem));
408}
409
410/*
411 * Delete posting item from non-leaf page
412 */
413void
414GinPageDeletePostingItem(Page page, OffsetNumber offset)
415{
416 OffsetNumber maxoff = GinPageGetOpaque(page)->maxoff;
417
418 Assert(!GinPageIsLeaf(page));
419 Assert(offset >= FirstOffsetNumber && offset <= maxoff);
420
421 if (offset != maxoff)
422 memmove(GinDataPageGetPostingItem(page, offset),
423 GinDataPageGetPostingItem(page, offset + 1),
424 sizeof(PostingItem) * (maxoff - offset));
425
426 maxoff--;
427 GinPageGetOpaque(page)->maxoff = maxoff;
428
429 GinDataPageSetDataSize(page, maxoff * sizeof(PostingItem));
430}
431
432/*
433 * Prepare to insert data on a leaf data page.
434 *
435 * If it will fit, return GPTP_INSERT after doing whatever setup is needed
436 * before we enter the insertion critical section. *ptp_workspace can be
437 * set to pass information along to the execPlaceToPage function.
438 *
439 * If it won't fit, perform a page split and return two temporary page
440 * images into *newlpage and *newrpage, with result GPTP_SPLIT.
441 *
442 * In neither case should the given page buffer be modified here.
443 */
444static GinPlaceToPageRC
445dataBeginPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack,
446 void *insertdata,
447 void **ptp_workspace,
448 Page *newlpage, Page *newrpage)
449{
450 GinBtreeDataLeafInsertData *items = insertdata;
451 ItemPointer newItems = &items->items[items->curitem];
452 int maxitems = items->nitem - items->curitem;
453 Page page = BufferGetPage(buf);
454 int i;
455 ItemPointerData rbound;
456 ItemPointerData lbound;
457 bool needsplit;
458 bool append;
459 int segsize;
460 Size freespace;
461 disassembledLeaf *leaf;
462 leafSegmentInfo *lastleftinfo;
463 ItemPointerData maxOldItem;
464 ItemPointerData remaining;
465
466 rbound = *GinDataPageGetRightBound(page);
467
468 /*
469 * Count how many of the new items belong to this page.
470 */
471 if (!GinPageRightMost(page))
472 {
473 for (i = 0; i < maxitems; i++)
474 {
475 if (ginCompareItemPointers(&newItems[i], &rbound) > 0)
476 {
477 /*
478 * This needs to go to some other location in the tree. (The
479 * caller should've chosen the insert location so that at
480 * least the first item goes here.)
481 */
482 Assert(i > 0);
483 break;
484 }
485 }
486 maxitems = i;
487 }
488
489 /* Disassemble the data on the page */
490 leaf = disassembleLeaf(page);
491
492 /*
493 * Are we appending to the end of the page? IOW, are all the new items
494 * larger than any of the existing items.
495 */
496 if (!dlist_is_empty(&leaf->segments))
497 {
498 lastleftinfo = dlist_container(leafSegmentInfo, node,
499 dlist_tail_node(&leaf->segments));
500 if (!lastleftinfo->items)
501 lastleftinfo->items = ginPostingListDecode(lastleftinfo->seg,
502 &lastleftinfo->nitems);
503 maxOldItem = lastleftinfo->items[lastleftinfo->nitems - 1];
504 if (ginCompareItemPointers(&newItems[0], &maxOldItem) >= 0)
505 append = true;
506 else
507 append = false;
508 }
509 else
510 {
511 ItemPointerSetMin(&maxOldItem);
512 append = true;
513 }
514
515 /*
516 * If we're appending to the end of the page, we will append as many items
517 * as we can fit (after splitting), and stop when the pages becomes full.
518 * Otherwise we have to limit the number of new items to insert, because
519 * once we start packing we can't just stop when we run out of space,
520 * because we must make sure that all the old items still fit.
521 */
522 if (GinPageIsCompressed(page))
523 freespace = GinDataLeafPageGetFreeSpace(page);
524 else
525 freespace = 0;
526 if (append)
527 {
528 /*
529 * Even when appending, trying to append more items than will fit is
530 * not completely free, because we will merge the new items and old
531 * items into an array below. In the best case, every new item fits in
532 * a single byte, and we can use all the free space on the old page as
533 * well as the new page. For simplicity, ignore segment overhead etc.
534 */
535 maxitems = Min(maxitems, freespace + GinDataPageMaxDataSize);
536 }
537 else
538 {
539 /*
540 * Calculate a conservative estimate of how many new items we can fit
541 * on the two pages after splitting.
542 *
543 * We can use any remaining free space on the old page to store full
544 * segments, as well as the new page. Each full-sized segment can hold
545 * at least MinTuplesPerSegment items
546 */
547 int nnewsegments;
548
549 nnewsegments = freespace / GinPostingListSegmentMaxSize;
550 nnewsegments += GinDataPageMaxDataSize / GinPostingListSegmentMaxSize;
551 maxitems = Min(maxitems, nnewsegments * MinTuplesPerSegment);
552 }
553
554 /* Add the new items to the segment list */
555 if (!addItemsToLeaf(leaf, newItems, maxitems))
556 {
557 /* all items were duplicates, we have nothing to do */
558 items->curitem += maxitems;
559
560 return GPTP_NO_WORK;
561 }
562
563 /*
564 * Pack the items back to compressed segments, ready for writing to disk.
565 */
566 needsplit = leafRepackItems(leaf, &remaining);
567
568 /*
569 * Did all the new items fit?
570 *
571 * If we're appending, it's OK if they didn't. But as a sanity check,
572 * verify that all the old items fit.
573 */
574 if (ItemPointerIsValid(&remaining))
575 {
576 if (!append || ItemPointerCompare(&maxOldItem, &remaining) >= 0)
577 elog(ERROR, "could not split GIN page; all old items didn't fit");
578
579 /* Count how many of the new items did fit. */
580 for (i = 0; i < maxitems; i++)
581 {
582 if (ginCompareItemPointers(&newItems[i], &remaining) >= 0)
583 break;
584 }
585 if (i == 0)
586 elog(ERROR, "could not split GIN page; no new items fit");
587 maxitems = i;
588 }
589
590 if (!needsplit)
591 {
592 /*
593 * Great, all the items fit on a single page. If needed, prepare data
594 * for a WAL record describing the changes we'll make.
595 */
596 if (RelationNeedsWAL(btree->index) && !btree->isBuild)
597 computeLeafRecompressWALData(leaf);
598
599 /*
600 * We're ready to enter the critical section, but
601 * dataExecPlaceToPageLeaf will need access to the "leaf" data.
602 */
603 *ptp_workspace = leaf;
604
605 if (append)
606 elog(DEBUG2, "appended %d new items to block %u; %d bytes (%d to go)",
607 maxitems, BufferGetBlockNumber(buf), (int) leaf->lsize,
608 items->nitem - items->curitem - maxitems);
609 else
610 elog(DEBUG2, "inserted %d new items to block %u; %d bytes (%d to go)",
611 maxitems, BufferGetBlockNumber(buf), (int) leaf->lsize,
612 items->nitem - items->curitem - maxitems);
613 }
614 else
615 {
616 /*
617 * Have to split.
618 *
619 * leafRepackItems already divided the segments between the left and
620 * the right page. It filled the left page as full as possible, and
621 * put the rest to the right page. When building a new index, that's
622 * good, because the table is scanned from beginning to end and there
623 * won't be any more insertions to the left page during the build.
624 * This packs the index as tight as possible. But otherwise, split
625 * 50/50, by moving segments from the left page to the right page
626 * until they're balanced.
627 *
628 * As a further heuristic, when appending items to the end of the
629 * page, try to make the left page 75% full, on the assumption that
630 * subsequent insertions will probably also go to the end. This packs
631 * the index somewhat tighter when appending to a table, which is very
632 * common.
633 */
634 if (!btree->isBuild)
635 {
636 while (dlist_has_prev(&leaf->segments, leaf->lastleft))
637 {
638 lastleftinfo = dlist_container(leafSegmentInfo, node, leaf->lastleft);
639
640 /* ignore deleted segments */
641 if (lastleftinfo->action != GIN_SEGMENT_DELETE)
642 {
643 segsize = SizeOfGinPostingList(lastleftinfo->seg);
644
645 /*
646 * Note that we check that the right page doesn't become
647 * more full than the left page even when appending. It's
648 * possible that we added enough items to make both pages
649 * more than 75% full.
650 */
651 if ((leaf->lsize - segsize) - (leaf->rsize + segsize) < 0)
652 break;
653 if (append)
654 {
655 if ((leaf->lsize - segsize) < (BLCKSZ * 3) / 4)
656 break;
657 }
658
659 leaf->lsize -= segsize;
660 leaf->rsize += segsize;
661 }
662 leaf->lastleft = dlist_prev_node(&leaf->segments, leaf->lastleft);
663 }
664 }
665 Assert(leaf->lsize <= GinDataPageMaxDataSize);
666 Assert(leaf->rsize <= GinDataPageMaxDataSize);
667
668 /*
669 * Fetch the max item in the left page's last segment; it becomes the
670 * right bound of the page.
671 */
672 lastleftinfo = dlist_container(leafSegmentInfo, node, leaf->lastleft);
673 if (!lastleftinfo->items)
674 lastleftinfo->items = ginPostingListDecode(lastleftinfo->seg,
675 &lastleftinfo->nitems);
676 lbound = lastleftinfo->items[lastleftinfo->nitems - 1];
677
678 /*
679 * Now allocate a couple of temporary page images, and fill them.
680 */
681 *newlpage = palloc(BLCKSZ);
682 *newrpage = palloc(BLCKSZ);
683
684 dataPlaceToPageLeafSplit(leaf, lbound, rbound,
685 *newlpage, *newrpage);
686
687 Assert(GinPageRightMost(page) ||
688 ginCompareItemPointers(GinDataPageGetRightBound(*newlpage),
689 GinDataPageGetRightBound(*newrpage)) < 0);
690
691 if (append)
692 elog(DEBUG2, "appended %d items to block %u; split %d/%d (%d to go)",
693 maxitems, BufferGetBlockNumber(buf), (int) leaf->lsize, (int) leaf->rsize,
694 items->nitem - items->curitem - maxitems);
695 else
696 elog(DEBUG2, "inserted %d items to block %u; split %d/%d (%d to go)",
697 maxitems, BufferGetBlockNumber(buf), (int) leaf->lsize, (int) leaf->rsize,
698 items->nitem - items->curitem - maxitems);
699 }
700
701 items->curitem += maxitems;
702
703 return needsplit ? GPTP_SPLIT : GPTP_INSERT;
704}
705
706/*
707 * Perform data insertion after beginPlaceToPage has decided it will fit.
708 *
709 * This is invoked within a critical section, and XLOG record creation (if
710 * needed) is already started. The target buffer is registered in slot 0.
711 */
712static void
713dataExecPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack,
714 void *insertdata, void *ptp_workspace)
715{
716 disassembledLeaf *leaf = (disassembledLeaf *) ptp_workspace;
717
718 /* Apply changes to page */
719 dataPlaceToPageLeafRecompress(buf, leaf);
720
721 /* If needed, register WAL data built by computeLeafRecompressWALData */
722 if (RelationNeedsWAL(btree->index) && !btree->isBuild)
723 {
724 XLogRegisterBufData(0, leaf->walinfo, leaf->walinfolen);
725 }
726}
727
728/*
729 * Vacuum a posting tree leaf page.
730 */
731void
732ginVacuumPostingTreeLeaf(Relation indexrel, Buffer buffer, GinVacuumState *gvs)
733{
734 Page page = BufferGetPage(buffer);
735 disassembledLeaf *leaf;
736 bool removedsomething = false;
737 dlist_iter iter;
738
739 leaf = disassembleLeaf(page);
740
741 /* Vacuum each segment. */
742 dlist_foreach(iter, &leaf->segments)
743 {
744 leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, iter.cur);
745 int oldsegsize;
746 ItemPointer cleaned;
747 int ncleaned;
748
749 if (!seginfo->items)
750 seginfo->items = ginPostingListDecode(seginfo->seg,
751 &seginfo->nitems);
752 if (seginfo->seg)
753 oldsegsize = SizeOfGinPostingList(seginfo->seg);
754 else
755 oldsegsize = GinDataPageMaxDataSize;
756
757 cleaned = ginVacuumItemPointers(gvs,
758 seginfo->items,
759 seginfo->nitems,
760 &ncleaned);
761 pfree(seginfo->items);
762 seginfo->items = NULL;
763 seginfo->nitems = 0;
764 if (cleaned)
765 {
766 if (ncleaned > 0)
767 {
768 int npacked;
769
770 seginfo->seg = ginCompressPostingList(cleaned,
771 ncleaned,
772 oldsegsize,
773 &npacked);
774 /* Removing an item never increases the size of the segment */
775 if (npacked != ncleaned)
776 elog(ERROR, "could not fit vacuumed posting list");
777 seginfo->action = GIN_SEGMENT_REPLACE;
778 }
779 else
780 {
781 seginfo->seg = NULL;
782 seginfo->items = NULL;
783 seginfo->action = GIN_SEGMENT_DELETE;
784 }
785 seginfo->nitems = ncleaned;
786
787 removedsomething = true;
788 }
789 }
790
791 /*
792 * If we removed any items, reconstruct the page from the pieces.
793 *
794 * We don't try to re-encode the segments here, even though some of them
795 * might be really small now that we've removed some items from them. It
796 * seems like a waste of effort, as there isn't really any benefit from
797 * larger segments per se; larger segments only help to pack more items in
798 * the same space. We might as well delay doing that until the next
799 * insertion, which will need to re-encode at least part of the page
800 * anyway.
801 *
802 * Also note if the page was in uncompressed, pre-9.4 format before, it is
803 * now represented as one huge segment that contains all the items. It
804 * might make sense to split that, to speed up random access, but we don't
805 * bother. You'll have to REINDEX anyway if you want the full gain of the
806 * new tighter index format.
807 */
808 if (removedsomething)
809 {
810 bool modified;
811
812 /*
813 * Make sure we have a palloc'd copy of all segments, after the first
814 * segment that is modified. (dataPlaceToPageLeafRecompress requires
815 * this).
816 */
817 modified = false;
818 dlist_foreach(iter, &leaf->segments)
819 {
820 leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node,
821 iter.cur);
822
823 if (seginfo->action != GIN_SEGMENT_UNMODIFIED)
824 modified = true;
825 if (modified && seginfo->action != GIN_SEGMENT_DELETE)
826 {
827 int segsize = SizeOfGinPostingList(seginfo->seg);
828 GinPostingList *tmp = (GinPostingList *) palloc(segsize);
829
830 memcpy(tmp, seginfo->seg, segsize);
831 seginfo->seg = tmp;
832 }
833 }
834
835 if (RelationNeedsWAL(indexrel))
836 computeLeafRecompressWALData(leaf);
837
838 /* Apply changes to page */
839 START_CRIT_SECTION();
840
841 dataPlaceToPageLeafRecompress(buffer, leaf);
842
843 MarkBufferDirty(buffer);
844
845 if (RelationNeedsWAL(indexrel))
846 {
847 XLogRecPtr recptr;
848
849 XLogBeginInsert();
850 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
851 XLogRegisterBufData(0, leaf->walinfo, leaf->walinfolen);
852 recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_VACUUM_DATA_LEAF_PAGE);
853 PageSetLSN(page, recptr);
854 }
855
856 END_CRIT_SECTION();
857 }
858}
859
860/*
861 * Construct a ginxlogRecompressDataLeaf record representing the changes
862 * in *leaf. (Because this requires a palloc, we have to do it before
863 * we enter the critical section that actually updates the page.)
864 */
865static void
866computeLeafRecompressWALData(disassembledLeaf *leaf)
867{
868 int nmodified = 0;
869 char *walbufbegin;
870 char *walbufend;
871 dlist_iter iter;
872 int segno;
873 ginxlogRecompressDataLeaf *recompress_xlog;
874
875 /* Count the modified segments */
876 dlist_foreach(iter, &leaf->segments)
877 {
878 leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node,
879 iter.cur);
880
881 if (seginfo->action != GIN_SEGMENT_UNMODIFIED)
882 nmodified++;
883 }
884
885 walbufbegin =
886 palloc(sizeof(ginxlogRecompressDataLeaf) +
887 BLCKSZ + /* max size needed to hold the segment data */
888 nmodified * 2 /* (segno + action) per action */
889 );
890 walbufend = walbufbegin;
891
892 recompress_xlog = (ginxlogRecompressDataLeaf *) walbufend;
893 walbufend += sizeof(ginxlogRecompressDataLeaf);
894
895 recompress_xlog->nactions = nmodified;
896
897 segno = 0;
898 dlist_foreach(iter, &leaf->segments)
899 {
900 leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node,
901 iter.cur);
902 int segsize = 0;
903 int datalen;
904 uint8 action = seginfo->action;
905
906 if (action == GIN_SEGMENT_UNMODIFIED)
907 {
908 segno++;
909 continue;
910 }
911
912 if (action != GIN_SEGMENT_DELETE)
913 segsize = SizeOfGinPostingList(seginfo->seg);
914
915 /*
916 * If storing the uncompressed list of added item pointers would take
917 * more space than storing the compressed segment as is, do that
918 * instead.
919 */
920 if (action == GIN_SEGMENT_ADDITEMS &&
921 seginfo->nmodifieditems * sizeof(ItemPointerData) > segsize)
922 {
923 action = GIN_SEGMENT_REPLACE;
924 }
925
926 *((uint8 *) (walbufend++)) = segno;
927 *(walbufend++) = action;
928
929 switch (action)
930 {
931 case GIN_SEGMENT_DELETE:
932 datalen = 0;
933 break;
934
935 case GIN_SEGMENT_ADDITEMS:
936 datalen = seginfo->nmodifieditems * sizeof(ItemPointerData);
937 memcpy(walbufend, &seginfo->nmodifieditems, sizeof(uint16));
938 memcpy(walbufend + sizeof(uint16), seginfo->modifieditems, datalen);
939 datalen += sizeof(uint16);
940 break;
941
942 case GIN_SEGMENT_INSERT:
943 case GIN_SEGMENT_REPLACE:
944 datalen = SHORTALIGN(segsize);
945 memcpy(walbufend, seginfo->seg, segsize);
946 break;
947
948 default:
949 elog(ERROR, "unexpected GIN leaf action %d", action);
950 }
951 walbufend += datalen;
952
953 if (action != GIN_SEGMENT_INSERT)
954 segno++;
955 }
956
957 /* Pass back the constructed info via *leaf */
958 leaf->walinfo = walbufbegin;
959 leaf->walinfolen = walbufend - walbufbegin;
960}
961
962/*
963 * Assemble a disassembled posting tree leaf page back to a buffer.
964 *
965 * This just updates the target buffer; WAL stuff is caller's responsibility.
966 *
967 * NOTE: The segment pointers must not point directly to the same buffer,
968 * except for segments that have not been modified and whose preceding
969 * segments have not been modified either.
970 */
971static void
972dataPlaceToPageLeafRecompress(Buffer buf, disassembledLeaf *leaf)
973{
974 Page page = BufferGetPage(buf);
975 char *ptr;
976 int newsize;
977 bool modified = false;
978 dlist_iter iter;
979 int segsize;
980
981 /*
982 * If the page was in pre-9.4 format before, convert the header, and force
983 * all segments to be copied to the page whether they were modified or
984 * not.
985 */
986 if (!GinPageIsCompressed(page))
987 {
988 Assert(leaf->oldformat);
989 GinPageSetCompressed(page);
990 GinPageGetOpaque(page)->maxoff = InvalidOffsetNumber;
991 modified = true;
992 }
993
994 ptr = (char *) GinDataLeafPageGetPostingList(page);
995 newsize = 0;
996 dlist_foreach(iter, &leaf->segments)
997 {
998 leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, iter.cur);
999
1000 if (seginfo->action != GIN_SEGMENT_UNMODIFIED)
1001 modified = true;
1002
1003 if (seginfo->action != GIN_SEGMENT_DELETE)
1004 {
1005 segsize = SizeOfGinPostingList(seginfo->seg);
1006
1007 if (modified)
1008 memcpy(ptr, seginfo->seg, segsize);
1009
1010 ptr += segsize;
1011 newsize += segsize;
1012 }
1013 }
1014
1015 Assert(newsize <= GinDataPageMaxDataSize);
1016 GinDataPageSetDataSize(page, newsize);
1017}
1018
1019/*
1020 * Like dataPlaceToPageLeafRecompress, but writes the disassembled leaf
1021 * segments to two pages instead of one.
1022 *
1023 * This is different from the non-split cases in that this does not modify
1024 * the original page directly, but writes to temporary in-memory copies of
1025 * the new left and right pages.
1026 */
1027static void
1028dataPlaceToPageLeafSplit(disassembledLeaf *leaf,
1029 ItemPointerData lbound, ItemPointerData rbound,
1030 Page lpage, Page rpage)
1031{
1032 char *ptr;
1033 int segsize;
1034 int lsize;
1035 int rsize;
1036 dlist_node *node;
1037 dlist_node *firstright;
1038 leafSegmentInfo *seginfo;
1039
1040 /* Initialize temporary pages to hold the new left and right pages */
1041 GinInitPage(lpage, GIN_DATA | GIN_LEAF | GIN_COMPRESSED, BLCKSZ);
1042 GinInitPage(rpage, GIN_DATA | GIN_LEAF | GIN_COMPRESSED, BLCKSZ);
1043
1044 /*
1045 * Copy the segments that go to the left page.
1046 *
1047 * XXX: We should skip copying the unmodified part of the left page, like
1048 * we do when recompressing.
1049 */
1050 lsize = 0;
1051 ptr = (char *) GinDataLeafPageGetPostingList(lpage);
1052 firstright = dlist_next_node(&leaf->segments, leaf->lastleft);
1053 for (node = dlist_head_node(&leaf->segments);
1054 node != firstright;
1055 node = dlist_next_node(&leaf->segments, node))
1056 {
1057 seginfo = dlist_container(leafSegmentInfo, node, node);
1058
1059 if (seginfo->action != GIN_SEGMENT_DELETE)
1060 {
1061 segsize = SizeOfGinPostingList(seginfo->seg);
1062 memcpy(ptr, seginfo->seg, segsize);
1063 ptr += segsize;
1064 lsize += segsize;
1065 }
1066 }
1067 Assert(lsize == leaf->lsize);
1068 GinDataPageSetDataSize(lpage, lsize);
1069 *GinDataPageGetRightBound(lpage) = lbound;
1070
1071 /* Copy the segments that go to the right page */
1072 ptr = (char *) GinDataLeafPageGetPostingList(rpage);
1073 rsize = 0;
1074 for (node = firstright;
1075 ;
1076 node = dlist_next_node(&leaf->segments, node))
1077 {
1078 seginfo = dlist_container(leafSegmentInfo, node, node);
1079
1080 if (seginfo->action != GIN_SEGMENT_DELETE)
1081 {
1082 segsize = SizeOfGinPostingList(seginfo->seg);
1083 memcpy(ptr, seginfo->seg, segsize);
1084 ptr += segsize;
1085 rsize += segsize;
1086 }
1087
1088 if (!dlist_has_next(&leaf->segments, node))
1089 break;
1090 }
1091 Assert(rsize == leaf->rsize);
1092 GinDataPageSetDataSize(rpage, rsize);
1093 *GinDataPageGetRightBound(rpage) = rbound;
1094}
1095
1096/*
1097 * Prepare to insert data on an internal data page.
1098 *
1099 * If it will fit, return GPTP_INSERT after doing whatever setup is needed
1100 * before we enter the insertion critical section. *ptp_workspace can be
1101 * set to pass information along to the execPlaceToPage function.
1102 *
1103 * If it won't fit, perform a page split and return two temporary page
1104 * images into *newlpage and *newrpage, with result GPTP_SPLIT.
1105 *
1106 * In neither case should the given page buffer be modified here.
1107 *
1108 * Note: on insertion to an internal node, in addition to inserting the given
1109 * item, the downlink of the existing item at stack->off will be updated to
1110 * point to updateblkno.
1111 */
1112static GinPlaceToPageRC
1113dataBeginPlaceToPageInternal(GinBtree btree, Buffer buf, GinBtreeStack *stack,
1114 void *insertdata, BlockNumber updateblkno,
1115 void **ptp_workspace,
1116 Page *newlpage, Page *newrpage)
1117{
1118 Page page = BufferGetPage(buf);
1119
1120 /* If it doesn't fit, deal with split case */
1121 if (GinNonLeafDataPageGetFreeSpace(page) < sizeof(PostingItem))
1122 {
1123 dataSplitPageInternal(btree, buf, stack, insertdata, updateblkno,
1124 newlpage, newrpage);
1125 return GPTP_SPLIT;
1126 }
1127
1128 /* Else, we're ready to proceed with insertion */
1129 return GPTP_INSERT;
1130}
1131
1132/*
1133 * Perform data insertion after beginPlaceToPage has decided it will fit.
1134 *
1135 * This is invoked within a critical section, and XLOG record creation (if
1136 * needed) is already started. The target buffer is registered in slot 0.
1137 */
1138static void
1139dataExecPlaceToPageInternal(GinBtree btree, Buffer buf, GinBtreeStack *stack,
1140 void *insertdata, BlockNumber updateblkno,
1141 void *ptp_workspace)
1142{
1143 Page page = BufferGetPage(buf);
1144 OffsetNumber off = stack->off;
1145 PostingItem *pitem;
1146
1147 /* Update existing downlink to point to next page (on internal page) */
1148 pitem = GinDataPageGetPostingItem(page, off);
1149 PostingItemSetBlockNumber(pitem, updateblkno);
1150
1151 /* Add new item */
1152 pitem = (PostingItem *) insertdata;
1153 GinDataPageAddPostingItem(page, pitem, off);
1154
1155 if (RelationNeedsWAL(btree->index) && !btree->isBuild)
1156 {
1157 /*
1158 * This must be static, because it has to survive until XLogInsert,
1159 * and we can't palloc here. Ugly, but the XLogInsert infrastructure
1160 * isn't reentrant anyway.
1161 */
1162 static ginxlogInsertDataInternal data;
1163
1164 data.offset = off;
1165 data.newitem = *pitem;
1166
1167 XLogRegisterBufData(0, (char *) &data,
1168 sizeof(ginxlogInsertDataInternal));
1169 }
1170}
1171
1172/*
1173 * Prepare to insert data on a posting-tree data page.
1174 *
1175 * If it will fit, return GPTP_INSERT after doing whatever setup is needed
1176 * before we enter the insertion critical section. *ptp_workspace can be
1177 * set to pass information along to the execPlaceToPage function.
1178 *
1179 * If it won't fit, perform a page split and return two temporary page
1180 * images into *newlpage and *newrpage, with result GPTP_SPLIT.
1181 *
1182 * In neither case should the given page buffer be modified here.
1183 *
1184 * Note: on insertion to an internal node, in addition to inserting the given
1185 * item, the downlink of the existing item at stack->off will be updated to
1186 * point to updateblkno.
1187 *
1188 * Calls relevant function for internal or leaf page because they are handled
1189 * very differently.
1190 */
1191static GinPlaceToPageRC
1192dataBeginPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack,
1193 void *insertdata, BlockNumber updateblkno,
1194 void **ptp_workspace,
1195 Page *newlpage, Page *newrpage)
1196{
1197 Page page = BufferGetPage(buf);
1198
1199 Assert(GinPageIsData(page));
1200
1201 if (GinPageIsLeaf(page))
1202 return dataBeginPlaceToPageLeaf(btree, buf, stack, insertdata,
1203 ptp_workspace,
1204 newlpage, newrpage);
1205 else
1206 return dataBeginPlaceToPageInternal(btree, buf, stack,
1207 insertdata, updateblkno,
1208 ptp_workspace,
1209 newlpage, newrpage);
1210}
1211
1212/*
1213 * Perform data insertion after beginPlaceToPage has decided it will fit.
1214 *
1215 * This is invoked within a critical section, and XLOG record creation (if
1216 * needed) is already started. The target buffer is registered in slot 0.
1217 *
1218 * Calls relevant function for internal or leaf page because they are handled
1219 * very differently.
1220 */
1221static void
1222dataExecPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack,
1223 void *insertdata, BlockNumber updateblkno,
1224 void *ptp_workspace)
1225{
1226 Page page = BufferGetPage(buf);
1227
1228 if (GinPageIsLeaf(page))
1229 dataExecPlaceToPageLeaf(btree, buf, stack, insertdata,
1230 ptp_workspace);
1231 else
1232 dataExecPlaceToPageInternal(btree, buf, stack, insertdata,
1233 updateblkno, ptp_workspace);
1234}
1235
1236/*
1237 * Split internal page and insert new data.
1238 *
1239 * Returns new temp pages to *newlpage and *newrpage.
1240 * The original buffer is left untouched.
1241 */
1242static void
1243dataSplitPageInternal(GinBtree btree, Buffer origbuf,
1244 GinBtreeStack *stack,
1245 void *insertdata, BlockNumber updateblkno,
1246 Page *newlpage, Page *newrpage)
1247{
1248 Page oldpage = BufferGetPage(origbuf);
1249 OffsetNumber off = stack->off;
1250 int nitems = GinPageGetOpaque(oldpage)->maxoff;
1251 int nleftitems;
1252 int nrightitems;
1253 Size pageSize = PageGetPageSize(oldpage);
1254 ItemPointerData oldbound = *GinDataPageGetRightBound(oldpage);
1255 ItemPointer bound;
1256 Page lpage;
1257 Page rpage;
1258 OffsetNumber separator;
1259 PostingItem allitems[(BLCKSZ / sizeof(PostingItem)) + 1];
1260
1261 lpage = PageGetTempPage(oldpage);
1262 rpage = PageGetTempPage(oldpage);
1263 GinInitPage(lpage, GinPageGetOpaque(oldpage)->flags, pageSize);
1264 GinInitPage(rpage, GinPageGetOpaque(oldpage)->flags, pageSize);
1265
1266 /*
1267 * First construct a new list of PostingItems, which includes all the old
1268 * items, and the new item.
1269 */
1270 memcpy(allitems, GinDataPageGetPostingItem(oldpage, FirstOffsetNumber),
1271 (off - 1) * sizeof(PostingItem));
1272
1273 allitems[off - 1] = *((PostingItem *) insertdata);
1274 memcpy(&allitems[off], GinDataPageGetPostingItem(oldpage, off),
1275 (nitems - (off - 1)) * sizeof(PostingItem));
1276 nitems++;
1277
1278 /* Update existing downlink to point to next page */
1279 PostingItemSetBlockNumber(&allitems[off], updateblkno);
1280
1281 /*
1282 * When creating a new index, fit as many tuples as possible on the left
1283 * page, on the assumption that the table is scanned from beginning to
1284 * end. This packs the index as tight as possible.
1285 */
1286 if (btree->isBuild && GinPageRightMost(oldpage))
1287 separator = GinNonLeafDataPageGetFreeSpace(rpage) / sizeof(PostingItem);
1288 else
1289 separator = nitems / 2;
1290 nleftitems = separator;
1291 nrightitems = nitems - separator;
1292
1293 memcpy(GinDataPageGetPostingItem(lpage, FirstOffsetNumber),
1294 allitems,
1295 nleftitems * sizeof(PostingItem));
1296 GinPageGetOpaque(lpage)->maxoff = nleftitems;
1297 memcpy(GinDataPageGetPostingItem(rpage, FirstOffsetNumber),
1298 &allitems[separator],
1299 nrightitems * sizeof(PostingItem));
1300 GinPageGetOpaque(rpage)->maxoff = nrightitems;
1301
1302 /*
1303 * Also set pd_lower for both pages, like GinDataPageAddPostingItem does.
1304 */
1305 GinDataPageSetDataSize(lpage, nleftitems * sizeof(PostingItem));
1306 GinDataPageSetDataSize(rpage, nrightitems * sizeof(PostingItem));
1307
1308 /* set up right bound for left page */
1309 bound = GinDataPageGetRightBound(lpage);
1310 *bound = GinDataPageGetPostingItem(lpage, nleftitems)->key;
1311
1312 /* set up right bound for right page */
1313 *GinDataPageGetRightBound(rpage) = oldbound;
1314
1315 /* return temp pages to caller */
1316 *newlpage = lpage;
1317 *newrpage = rpage;
1318}
1319
1320/*
1321 * Construct insertion payload for inserting the downlink for given buffer.
1322 */
1323static void *
1324dataPrepareDownlink(GinBtree btree, Buffer lbuf)
1325{
1326 PostingItem *pitem = palloc(sizeof(PostingItem));
1327 Page lpage = BufferGetPage(lbuf);
1328
1329 PostingItemSetBlockNumber(pitem, BufferGetBlockNumber(lbuf));
1330 pitem->key = *GinDataPageGetRightBound(lpage);
1331
1332 return pitem;
1333}
1334
1335/*
1336 * Fills new root by right bound values from child.
1337 * Also called from ginxlog, should not use btree
1338 */
1339void
1340ginDataFillRoot(GinBtree btree, Page root, BlockNumber lblkno, Page lpage, BlockNumber rblkno, Page rpage)
1341{
1342 PostingItem li,
1343 ri;
1344
1345 li.key = *GinDataPageGetRightBound(lpage);
1346 PostingItemSetBlockNumber(&li, lblkno);
1347 GinDataPageAddPostingItem(root, &li, InvalidOffsetNumber);
1348
1349 ri.key = *GinDataPageGetRightBound(rpage);
1350 PostingItemSetBlockNumber(&ri, rblkno);
1351 GinDataPageAddPostingItem(root, &ri, InvalidOffsetNumber);
1352}
1353
1354
1355/*** Functions to work with disassembled leaf pages ***/
1356
1357/*
1358 * Disassemble page into a disassembledLeaf struct.
1359 */
1360static disassembledLeaf *
1361disassembleLeaf(Page page)
1362{
1363 disassembledLeaf *leaf;
1364 GinPostingList *seg;
1365 Pointer segbegin;
1366 Pointer segend;
1367
1368 leaf = palloc0(sizeof(disassembledLeaf));
1369 dlist_init(&leaf->segments);
1370
1371 if (GinPageIsCompressed(page))
1372 {
1373 /*
1374 * Create a leafSegment entry for each segment.
1375 */
1376 seg = GinDataLeafPageGetPostingList(page);
1377 segbegin = (Pointer) seg;
1378 segend = segbegin + GinDataLeafPageGetPostingListSize(page);
1379 while ((Pointer) seg < segend)
1380 {
1381 leafSegmentInfo *seginfo = palloc(sizeof(leafSegmentInfo));
1382
1383 seginfo->action = GIN_SEGMENT_UNMODIFIED;
1384 seginfo->seg = seg;
1385 seginfo->items = NULL;
1386 seginfo->nitems = 0;
1387 dlist_push_tail(&leaf->segments, &seginfo->node);
1388
1389 seg = GinNextPostingListSegment(seg);
1390 }
1391 leaf->oldformat = false;
1392 }
1393 else
1394 {
1395 /*
1396 * A pre-9.4 format uncompressed page is represented by a single
1397 * segment, with an array of items. The corner case is uncompressed
1398 * page containing no items, which is represented as no segments.
1399 */
1400 ItemPointer uncompressed;
1401 int nuncompressed;
1402 leafSegmentInfo *seginfo;
1403
1404 uncompressed = dataLeafPageGetUncompressed(page, &nuncompressed);
1405
1406 if (nuncompressed > 0)
1407 {
1408 seginfo = palloc(sizeof(leafSegmentInfo));
1409
1410 seginfo->action = GIN_SEGMENT_REPLACE;
1411 seginfo->seg = NULL;
1412 seginfo->items = palloc(nuncompressed * sizeof(ItemPointerData));
1413 memcpy(seginfo->items, uncompressed, nuncompressed * sizeof(ItemPointerData));
1414 seginfo->nitems = nuncompressed;
1415
1416 dlist_push_tail(&leaf->segments, &seginfo->node);
1417 }
1418
1419 leaf->oldformat = true;
1420 }
1421
1422 return leaf;
1423}
1424
1425/*
1426 * Distribute newItems to the segments.
1427 *
1428 * Any segments that acquire new items are decoded, and the new items are
1429 * merged with the old items.
1430 *
1431 * Returns true if any new items were added. False means they were all
1432 * duplicates of existing items on the page.
1433 */
1434static bool
1435addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems, int nNewItems)
1436{
1437 dlist_iter iter;
1438 ItemPointer nextnew = newItems;
1439 int newleft = nNewItems;
1440 bool modified = false;
1441 leafSegmentInfo *newseg;
1442
1443 /*
1444 * If the page is completely empty, just construct one new segment to hold
1445 * all the new items.
1446 */
1447 if (dlist_is_empty(&leaf->segments))
1448 {
1449 newseg = palloc(sizeof(leafSegmentInfo));
1450 newseg->seg = NULL;
1451 newseg->items = newItems;
1452 newseg->nitems = nNewItems;
1453 newseg->action = GIN_SEGMENT_INSERT;
1454 dlist_push_tail(&leaf->segments, &newseg->node);
1455 return true;
1456 }
1457
1458 dlist_foreach(iter, &leaf->segments)
1459 {
1460 leafSegmentInfo *cur = (leafSegmentInfo *) dlist_container(leafSegmentInfo, node, iter.cur);
1461 int nthis;
1462 ItemPointer tmpitems;
1463 int ntmpitems;
1464
1465 /*
1466 * How many of the new items fall into this segment?
1467 */
1468 if (!dlist_has_next(&leaf->segments, iter.cur))
1469 nthis = newleft;
1470 else
1471 {
1472 leafSegmentInfo *next;
1473 ItemPointerData next_first;
1474
1475 next = (leafSegmentInfo *) dlist_container(leafSegmentInfo, node,
1476 dlist_next_node(&leaf->segments, iter.cur));
1477 if (next->items)
1478 next_first = next->items[0];
1479 else
1480 {
1481 Assert(next->seg != NULL);
1482 next_first = next->seg->first;
1483 }
1484
1485 nthis = 0;
1486 while (nthis < newleft && ginCompareItemPointers(&nextnew[nthis], &next_first) < 0)
1487 nthis++;
1488 }
1489 if (nthis == 0)
1490 continue;
1491
1492 /* Merge the new items with the existing items. */
1493 if (!cur->items)
1494 cur->items = ginPostingListDecode(cur->seg, &cur->nitems);
1495
1496 /*
1497 * Fast path for the important special case that we're appending to
1498 * the end of the page: don't let the last segment on the page grow
1499 * larger than the target, create a new segment before that happens.
1500 */
1501 if (!dlist_has_next(&leaf->segments, iter.cur) &&
1502 ginCompareItemPointers(&cur->items[cur->nitems - 1], &nextnew[0]) < 0 &&
1503 cur->seg != NULL &&
1504 SizeOfGinPostingList(cur->seg) >= GinPostingListSegmentTargetSize)
1505 {
1506 newseg = palloc(sizeof(leafSegmentInfo));
1507 newseg->seg = NULL;
1508 newseg->items = nextnew;
1509 newseg->nitems = nthis;
1510 newseg->action = GIN_SEGMENT_INSERT;
1511 dlist_push_tail(&leaf->segments, &newseg->node);
1512 modified = true;
1513 break;
1514 }
1515
1516 tmpitems = ginMergeItemPointers(cur->items, cur->nitems,
1517 nextnew, nthis,
1518 &ntmpitems);
1519 if (ntmpitems != cur->nitems)
1520 {
1521 /*
1522 * If there are no duplicates, track the added items so that we
1523 * can emit a compact ADDITEMS WAL record later on. (it doesn't
1524 * seem worth re-checking which items were duplicates, if there
1525 * were any)
1526 */
1527 if (ntmpitems == nthis + cur->nitems &&
1528 cur->action == GIN_SEGMENT_UNMODIFIED)
1529 {
1530 cur->action = GIN_SEGMENT_ADDITEMS;
1531 cur->modifieditems = nextnew;
1532 cur->nmodifieditems = nthis;
1533 }
1534 else
1535 cur->action = GIN_SEGMENT_REPLACE;
1536
1537 cur->items = tmpitems;
1538 cur->nitems = ntmpitems;
1539 cur->seg = NULL;
1540 modified = true;
1541 }
1542
1543 nextnew += nthis;
1544 newleft -= nthis;
1545 if (newleft == 0)
1546 break;
1547 }
1548
1549 return modified;
1550}
1551
1552/*
1553 * Recompresses all segments that have been modified.
1554 *
1555 * If not all the items fit on two pages (ie. after split), we store as
1556 * many items as fit, and set *remaining to the first item that didn't fit.
1557 * If all items fit, *remaining is set to invalid.
1558 *
1559 * Returns true if the page has to be split.
1560 */
1561static bool
1562leafRepackItems(disassembledLeaf *leaf, ItemPointer remaining)
1563{
1564 int pgused = 0;
1565 bool needsplit = false;
1566 dlist_iter iter;
1567 int segsize;
1568 leafSegmentInfo *nextseg;
1569 int npacked;
1570 bool modified;
1571 dlist_node *cur_node;
1572 dlist_node *next_node;
1573
1574 ItemPointerSetInvalid(remaining);
1575
1576 /*
1577 * cannot use dlist_foreach_modify here because we insert adjacent items
1578 * while iterating.
1579 */
1580 for (cur_node = dlist_head_node(&leaf->segments);
1581 cur_node != NULL;
1582 cur_node = next_node)
1583 {
1584 leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node,
1585 cur_node);
1586
1587 if (dlist_has_next(&leaf->segments, cur_node))
1588 next_node = dlist_next_node(&leaf->segments, cur_node);
1589 else
1590 next_node = NULL;
1591
1592 /* Compress the posting list, if necessary */
1593 if (seginfo->action != GIN_SEGMENT_DELETE)
1594 {
1595 if (seginfo->seg == NULL)
1596 {
1597 if (seginfo->nitems > GinPostingListSegmentMaxSize)
1598 npacked = 0; /* no chance that it would fit. */
1599 else
1600 {
1601 seginfo->seg = ginCompressPostingList(seginfo->items,
1602 seginfo->nitems,
1603 GinPostingListSegmentMaxSize,
1604 &npacked);
1605 }
1606 if (npacked != seginfo->nitems)
1607 {
1608 /*
1609 * Too large. Compress again to the target size, and
1610 * create a new segment to represent the remaining items.
1611 * The new segment is inserted after this one, so it will
1612 * be processed in the next iteration of this loop.
1613 */
1614 if (seginfo->seg)
1615 pfree(seginfo->seg);
1616 seginfo->seg = ginCompressPostingList(seginfo->items,
1617 seginfo->nitems,
1618 GinPostingListSegmentTargetSize,
1619 &npacked);
1620 if (seginfo->action != GIN_SEGMENT_INSERT)
1621 seginfo->action = GIN_SEGMENT_REPLACE;
1622
1623 nextseg = palloc(sizeof(leafSegmentInfo));
1624 nextseg->action = GIN_SEGMENT_INSERT;
1625 nextseg->seg = NULL;
1626 nextseg->items = &seginfo->items[npacked];
1627 nextseg->nitems = seginfo->nitems - npacked;
1628 next_node = &nextseg->node;
1629 dlist_insert_after(cur_node, next_node);
1630 }
1631 }
1632
1633 /*
1634 * If the segment is very small, merge it with the next segment.
1635 */
1636 if (SizeOfGinPostingList(seginfo->seg) < GinPostingListSegmentMinSize && next_node)
1637 {
1638 int nmerged;
1639
1640 nextseg = dlist_container(leafSegmentInfo, node, next_node);
1641
1642 if (seginfo->items == NULL)
1643 seginfo->items = ginPostingListDecode(seginfo->seg,
1644 &seginfo->nitems);
1645 if (nextseg->items == NULL)
1646 nextseg->items = ginPostingListDecode(nextseg->seg,
1647 &nextseg->nitems);
1648 nextseg->items =
1649 ginMergeItemPointers(seginfo->items, seginfo->nitems,
1650 nextseg->items, nextseg->nitems,
1651 &nmerged);
1652 Assert(nmerged == seginfo->nitems + nextseg->nitems);
1653 nextseg->nitems = nmerged;
1654 nextseg->seg = NULL;
1655
1656 nextseg->action = GIN_SEGMENT_REPLACE;
1657 nextseg->modifieditems = NULL;
1658 nextseg->nmodifieditems = 0;
1659
1660 if (seginfo->action == GIN_SEGMENT_INSERT)
1661 {
1662 dlist_delete(cur_node);
1663 continue;
1664 }
1665 else
1666 {
1667 seginfo->action = GIN_SEGMENT_DELETE;
1668 seginfo->seg = NULL;
1669 }
1670 }
1671
1672 seginfo->items = NULL;
1673 seginfo->nitems = 0;
1674 }
1675
1676 if (seginfo->action == GIN_SEGMENT_DELETE)
1677 continue;
1678
1679 /*
1680 * OK, we now have a compressed version of this segment ready for
1681 * copying to the page. Did we exceed the size that fits on one page?
1682 */
1683 segsize = SizeOfGinPostingList(seginfo->seg);
1684 if (pgused + segsize > GinDataPageMaxDataSize)
1685 {
1686 if (!needsplit)
1687 {
1688 /* switch to right page */
1689 Assert(pgused > 0);
1690 leaf->lastleft = dlist_prev_node(&leaf->segments, cur_node);
1691 needsplit = true;
1692 leaf->lsize = pgused;
1693 pgused = 0;
1694 }
1695 else
1696 {
1697 /*
1698 * Filled both pages. The last segment we constructed did not
1699 * fit.
1700 */
1701 *remaining = seginfo->seg->first;
1702
1703 /*
1704 * remove all segments that did not fit from the list.
1705 */
1706 while (dlist_has_next(&leaf->segments, cur_node))
1707 dlist_delete(dlist_next_node(&leaf->segments, cur_node));
1708 dlist_delete(cur_node);
1709 break;
1710 }
1711 }
1712
1713 pgused += segsize;
1714 }
1715
1716 if (!needsplit)
1717 {
1718 leaf->lsize = pgused;
1719 leaf->rsize = 0;
1720 }
1721 else
1722 leaf->rsize = pgused;
1723
1724 Assert(leaf->lsize <= GinDataPageMaxDataSize);
1725 Assert(leaf->rsize <= GinDataPageMaxDataSize);
1726
1727 /*
1728 * Make a palloc'd copy of every segment after the first modified one,
1729 * because as we start copying items to the original page, we might
1730 * overwrite an existing segment.
1731 */
1732 modified = false;
1733 dlist_foreach(iter, &leaf->segments)
1734 {
1735 leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node,
1736 iter.cur);
1737
1738 if (!modified && seginfo->action != GIN_SEGMENT_UNMODIFIED)
1739 {
1740 modified = true;
1741 }
1742 else if (modified && seginfo->action == GIN_SEGMENT_UNMODIFIED)
1743 {
1744 GinPostingList *tmp;
1745
1746 segsize = SizeOfGinPostingList(seginfo->seg);
1747 tmp = palloc(segsize);
1748 memcpy(tmp, seginfo->seg, segsize);
1749 seginfo->seg = tmp;
1750 }
1751 }
1752
1753 return needsplit;
1754}
1755
1756
1757/*** Functions that are exported to the rest of the GIN code ***/
1758
1759/*
1760 * Creates new posting tree containing the given TIDs. Returns the page
1761 * number of the root of the new posting tree.
1762 *
1763 * items[] must be in sorted order with no duplicates.
1764 */
1765BlockNumber
1766createPostingTree(Relation index, ItemPointerData *items, uint32 nitems,
1767 GinStatsData *buildStats, Buffer entrybuffer)
1768{
1769 BlockNumber blkno;
1770 Buffer buffer;
1771 Page tmppage;
1772 Page page;
1773 Pointer ptr;
1774 int nrootitems;
1775 int rootsize;
1776 bool is_build = (buildStats != NULL);
1777
1778 /* Construct the new root page in memory first. */
1779 tmppage = (Page) palloc(BLCKSZ);
1780 GinInitPage(tmppage, GIN_DATA | GIN_LEAF | GIN_COMPRESSED, BLCKSZ);
1781 GinPageGetOpaque(tmppage)->rightlink = InvalidBlockNumber;
1782
1783 /*
1784 * Write as many of the items to the root page as fit. In segments of max
1785 * GinPostingListSegmentMaxSize bytes each.
1786 */
1787 nrootitems = 0;
1788 rootsize = 0;
1789 ptr = (Pointer) GinDataLeafPageGetPostingList(tmppage);
1790 while (nrootitems < nitems)
1791 {
1792 GinPostingList *segment;
1793 int npacked;
1794 int segsize;
1795
1796 segment = ginCompressPostingList(&items[nrootitems],
1797 nitems - nrootitems,
1798 GinPostingListSegmentMaxSize,
1799 &npacked);
1800 segsize = SizeOfGinPostingList(segment);
1801 if (rootsize + segsize > GinDataPageMaxDataSize)
1802 break;
1803
1804 memcpy(ptr, segment, segsize);
1805 ptr += segsize;
1806 rootsize += segsize;
1807 nrootitems += npacked;
1808 pfree(segment);
1809 }
1810 GinDataPageSetDataSize(tmppage, rootsize);
1811
1812 /*
1813 * All set. Get a new physical page, and copy the in-memory page to it.
1814 */
1815 buffer = GinNewBuffer(index);
1816 page = BufferGetPage(buffer);
1817 blkno = BufferGetBlockNumber(buffer);
1818
1819 /*
1820 * Copy any predicate locks from the entry tree leaf (containing posting
1821 * list) to the posting tree.
1822 */
1823 PredicateLockPageSplit(index, BufferGetBlockNumber(entrybuffer), blkno);
1824
1825 START_CRIT_SECTION();
1826
1827 PageRestoreTempPage(tmppage, page);
1828 MarkBufferDirty(buffer);
1829
1830 if (RelationNeedsWAL(index) && !is_build)
1831 {
1832 XLogRecPtr recptr;
1833 ginxlogCreatePostingTree data;
1834
1835 data.size = rootsize;
1836
1837 XLogBeginInsert();
1838 XLogRegisterData((char *) &data, sizeof(ginxlogCreatePostingTree));
1839
1840 XLogRegisterData((char *) GinDataLeafPageGetPostingList(page),
1841 rootsize);
1842 XLogRegisterBuffer(0, buffer, REGBUF_WILL_INIT);
1843
1844 recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_CREATE_PTREE);
1845 PageSetLSN(page, recptr);
1846 }
1847
1848 UnlockReleaseBuffer(buffer);
1849
1850 END_CRIT_SECTION();
1851
1852 /* During index build, count the newly-added data page */
1853 if (buildStats)
1854 buildStats->nDataPages++;
1855
1856 elog(DEBUG2, "created GIN posting tree with %d items", nrootitems);
1857
1858 /*
1859 * Add any remaining TIDs to the newly-created posting tree.
1860 */
1861 if (nitems > nrootitems)
1862 {
1863 ginInsertItemPointers(index, blkno,
1864 items + nrootitems,
1865 nitems - nrootitems,
1866 buildStats);
1867 }
1868
1869 return blkno;
1870}
1871
1872static void
1873ginPrepareDataScan(GinBtree btree, Relation index, BlockNumber rootBlkno)
1874{
1875 memset(btree, 0, sizeof(GinBtreeData));
1876
1877 btree->index = index;
1878 btree->rootBlkno = rootBlkno;
1879
1880 btree->findChildPage = dataLocateItem;
1881 btree->getLeftMostChild = dataGetLeftMostPage;
1882 btree->isMoveRight = dataIsMoveRight;
1883 btree->findItem = NULL;
1884 btree->findChildPtr = dataFindChildPtr;
1885 btree->beginPlaceToPage = dataBeginPlaceToPage;
1886 btree->execPlaceToPage = dataExecPlaceToPage;
1887 btree->fillRoot = ginDataFillRoot;
1888 btree->prepareDownlink = dataPrepareDownlink;
1889
1890 btree->isData = true;
1891 btree->fullScan = false;
1892 btree->isBuild = false;
1893}
1894
1895/*
1896 * Inserts array of item pointers, may execute several tree scan (very rare)
1897 */
1898void
1899ginInsertItemPointers(Relation index, BlockNumber rootBlkno,
1900 ItemPointerData *items, uint32 nitem,
1901 GinStatsData *buildStats)
1902{
1903 GinBtreeData btree;
1904 GinBtreeDataLeafInsertData insertdata;
1905 GinBtreeStack *stack;
1906
1907 ginPrepareDataScan(&btree, index, rootBlkno);
1908 btree.isBuild = (buildStats != NULL);
1909 insertdata.items = items;
1910 insertdata.nitem = nitem;
1911 insertdata.curitem = 0;
1912
1913 while (insertdata.curitem < insertdata.nitem)
1914 {
1915 /* search for the leaf page where the first item should go to */
1916 btree.itemptr = insertdata.items[insertdata.curitem];
1917 stack = ginFindLeafPage(&btree, false, true, NULL);
1918
1919 ginInsertValue(&btree, stack, &insertdata, buildStats);
1920 }
1921}
1922
1923/*
1924 * Starts a new scan on a posting tree.
1925 */
1926GinBtreeStack *
1927ginScanBeginPostingTree(GinBtree btree, Relation index, BlockNumber rootBlkno,
1928 Snapshot snapshot)
1929{
1930 GinBtreeStack *stack;
1931
1932 ginPrepareDataScan(btree, index, rootBlkno);
1933
1934 btree->fullScan = true;
1935
1936 stack = ginFindLeafPage(btree, true, false, snapshot);
1937
1938 return stack;
1939}
1940