/*-------------------------------------------------------------------------
 *
 * freepage.c
 *    Management of free memory pages.
 *
 * The intention of this code is to provide infrastructure for memory
 * allocators written specifically for PostgreSQL.  At least in the case
 * of dynamic shared memory, we can't simply use malloc() or even
 * relatively thin wrappers like palloc() which sit on top of it, because
 * no allocator built into the operating system will deal with relative
 * pointers.  In the future, we may find other cases in which greater
 * control over our own memory management seems desirable.
 *
 * A FreePageManager keeps track of which 4kB pages of memory are currently
 * unused from the point of view of some higher-level memory allocator.
 * Unlike a user-facing allocator such as palloc(), a FreePageManager can
 * only allocate and free in units of whole pages, and freeing an
 * allocation can only be done given knowledge of its length in pages.
 *
 * Since a free page manager has only a fixed amount of dedicated memory,
 * and since there is no underlying allocator, it uses the free pages
 * it is given to manage to store its bookkeeping data.  It keeps multiple
 * freelists of runs of pages, sorted by the size of the run; the head of
 * each freelist is stored in the FreePageManager itself, and the first
 * page of each run contains a relative pointer to the next run.  See
 * FreePageManagerGetInternal for more details on how the freelists are
 * managed.
 *
 * To avoid memory fragmentation, it's important to consolidate adjacent
 * spans of pages whenever possible; otherwise, large allocation requests
 * might not be satisfied even when sufficient contiguous space is
 * available.  Therefore, in addition to the freelists, we maintain an
 * in-memory btree of free page ranges ordered by page number.  If a
 * range being freed precedes or follows a range that is already free,
 * the existing range is extended; if it exactly bridges the gap between
 * free ranges, then the two existing ranges are consolidated with the
 * newly-freed range to form one great big range of free pages.
 *
 * When there is only one range of free pages, the btree is trivial and
 * is stored within the FreePageManager proper; otherwise, pages are
 * allocated from the area under management as needed.  Even in cases
 * where memory fragmentation is very severe, only a tiny fraction of
 * the pages under management are consumed by this btree.
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *    src/backend/utils/mmgr/freepage.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "lib/stringinfo.h"
#include "miscadmin.h"

#include "utils/freepage.h"
#include "utils/relptr.h"

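/*
 * Illustrative usage sketch, not compiled into the build (the guard macro
 * below is hypothetical).  It assumes a page-aligned region whose first
 * page(s) hold the FreePageManager itself; real callers such as the dynamic
 * shared memory allocator lay out their segments differently.
 */
#ifdef FREEPAGE_USAGE_EXAMPLE
static void
freepage_usage_example(char *region, Size region_pages)
{
    FreePageManager *fpm = (FreePageManager *) region;
    Size mgmt_pages;
    Size first_page;

    /* Pages consumed by the manager's own bookkeeping structure. */
    mgmt_pages = (sizeof(FreePageManager) + FPM_PAGE_SIZE - 1) / FPM_PAGE_SIZE;

    /* All relative pointers will be interpreted with respect to 'region'. */
    FreePageManagerInitialize(fpm, region);

    /* Donate every page we aren't using ourselves to the manager. */
    FreePageManagerPut(fpm, mgmt_pages, region_pages - mgmt_pages);

    /* Allocate a 4-page run if possible, then give it back. */
    if (FreePageManagerGet(fpm, 4, &first_page))
        FreePageManagerPut(fpm, first_page, 4);
}
#endif
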
/* Magic numbers to identify various page types */
#define FREE_PAGE_SPAN_LEADER_MAGIC 0xea4020f0
#define FREE_PAGE_LEAF_MAGIC 0x98eae728
#define FREE_PAGE_INTERNAL_MAGIC 0x19aa32c9

/* Doubly linked list of spans of free pages; stored in first page of span. */
struct FreePageSpanLeader
{
    int magic;                  /* always FREE_PAGE_SPAN_LEADER_MAGIC */
    Size npages;                /* number of pages in span */
    RelptrFreePageSpanLeader prev;
    RelptrFreePageSpanLeader next;
};

/* Common header for btree leaf and internal pages. */
typedef struct FreePageBtreeHeader
{
    int magic;                  /* FREE_PAGE_LEAF_MAGIC or
                                 * FREE_PAGE_INTERNAL_MAGIC */
    Size nused;                 /* number of items used */
    RelptrFreePageBtree parent; /* uplink */
} FreePageBtreeHeader;

/* Internal key; points to next level of btree. */
typedef struct FreePageBtreeInternalKey
{
    Size first_page;            /* low bound for keys on child page */
    RelptrFreePageBtree child;  /* downlink */
} FreePageBtreeInternalKey;

/* Leaf key; no payload data. */
typedef struct FreePageBtreeLeafKey
{
    Size first_page;            /* first page in span */
    Size npages;                /* number of pages in span */
} FreePageBtreeLeafKey;

/* Work out how many keys will fit on a page. */
#define FPM_ITEMS_PER_INTERNAL_PAGE \
    ((FPM_PAGE_SIZE - sizeof(FreePageBtreeHeader)) / \
     sizeof(FreePageBtreeInternalKey))
#define FPM_ITEMS_PER_LEAF_PAGE \
    ((FPM_PAGE_SIZE - sizeof(FreePageBtreeHeader)) / \
     sizeof(FreePageBtreeLeafKey))
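
/*
 * Worked example (an assumption about the build, not a guarantee): on a
 * typical 64-bit platform with FPM_PAGE_SIZE = 4096, the header occupies
 * 24 bytes (4-byte magic plus padding, 8-byte nused, 8-byte parent pointer)
 * and each key, internal or leaf, occupies 16 bytes, so both macros work
 * out to (4096 - 24) / 16 = 254 items per page.
 */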

/* A btree page of either sort */
struct FreePageBtree
{
    FreePageBtreeHeader hdr;
    union
    {
        FreePageBtreeInternalKey internal_key[FPM_ITEMS_PER_INTERNAL_PAGE];
        FreePageBtreeLeafKey leaf_key[FPM_ITEMS_PER_LEAF_PAGE];
    } u;
};

/* Results of a btree search */
typedef struct FreePageBtreeSearchResult
{
    FreePageBtree *page;
    Size index;
    bool found;
    unsigned split_pages;
} FreePageBtreeSearchResult;

/* Helper functions */
static void FreePageBtreeAdjustAncestorKeys(FreePageManager *fpm,
                                            FreePageBtree *btp);
static Size FreePageBtreeCleanup(FreePageManager *fpm);
static void FreePageBtreeConsolidate(FreePageManager *fpm, FreePageBtree *btp);
static FreePageBtree *FreePageBtreeFindLeftSibling(char *base,
                                                   FreePageBtree *btp);
static FreePageBtree *FreePageBtreeFindRightSibling(char *base,
                                                    FreePageBtree *btp);
static Size FreePageBtreeFirstKey(FreePageBtree *btp);
static FreePageBtree *FreePageBtreeGetRecycled(FreePageManager *fpm);
static void FreePageBtreeInsertInternal(char *base, FreePageBtree *btp,
                                        Size index, Size first_page,
                                        FreePageBtree *child);
static void FreePageBtreeInsertLeaf(FreePageBtree *btp, Size index,
                                    Size first_page, Size npages);
static void FreePageBtreeRecycle(FreePageManager *fpm, Size pageno);
static void FreePageBtreeRemove(FreePageManager *fpm, FreePageBtree *btp,
                                Size index);
static void FreePageBtreeRemovePage(FreePageManager *fpm, FreePageBtree *btp);
static void FreePageBtreeSearch(FreePageManager *fpm, Size first_page,
                                FreePageBtreeSearchResult *result);
static Size FreePageBtreeSearchInternal(FreePageBtree *btp, Size first_page);
static Size FreePageBtreeSearchLeaf(FreePageBtree *btp, Size first_page);
static FreePageBtree *FreePageBtreeSplitPage(FreePageManager *fpm,
                                             FreePageBtree *btp);
static void FreePageBtreeUpdateParentPointers(char *base, FreePageBtree *btp);
static void FreePageManagerDumpBtree(FreePageManager *fpm, FreePageBtree *btp,
                                     FreePageBtree *parent, int level,
                                     StringInfo buf);
static void FreePageManagerDumpSpans(FreePageManager *fpm,
                                     FreePageSpanLeader *span,
                                     Size expected_pages, StringInfo buf);
static bool FreePageManagerGetInternal(FreePageManager *fpm, Size npages,
                                       Size *first_page);
static Size FreePageManagerPutInternal(FreePageManager *fpm, Size first_page,
                                       Size npages, bool soft);
static void FreePagePopSpanLeader(FreePageManager *fpm, Size pageno);
static void FreePagePushSpanLeader(FreePageManager *fpm, Size first_page,
                                   Size npages);
static Size FreePageManagerLargestContiguous(FreePageManager *fpm);
static void FreePageManagerUpdateLargest(FreePageManager *fpm);

#ifdef FPM_EXTRA_ASSERTS
static Size sum_free_pages(FreePageManager *fpm);
#endif

/*
 * Initialize a new, empty free page manager.
 *
 * 'fpm' should reference caller-provided memory large enough to contain a
 * FreePageManager.  We'll initialize it here.
 *
 * 'base' is the address to which all pointers are relative.  When managing
 * a dynamic shared memory segment, it should normally be the base of the
 * segment.  When managing backend-private memory, it can be either NULL or,
 * if managing a single contiguous extent of memory, the start of that extent.
 */
void
FreePageManagerInitialize(FreePageManager *fpm, char *base)
{
    Size f;

    relptr_store(base, fpm->self, fpm);
    relptr_store(base, fpm->btree_root, (FreePageBtree *) NULL);
    relptr_store(base, fpm->btree_recycle, (FreePageSpanLeader *) NULL);
    fpm->btree_depth = 0;
    fpm->btree_recycle_count = 0;
    fpm->singleton_first_page = 0;
    fpm->singleton_npages = 0;
    fpm->contiguous_pages = 0;
    fpm->contiguous_pages_dirty = true;
#ifdef FPM_EXTRA_ASSERTS
    fpm->free_pages = 0;
#endif

    for (f = 0; f < FPM_NUM_FREELISTS; f++)
        relptr_store(base, fpm->freelist[f], (FreePageSpanLeader *) NULL);
}

/*
 * Allocate a run of pages of the given length from the free page manager.
 * The return value indicates whether we were able to satisfy the request;
 * if true, the first page of the allocation is stored in *first_page.
 */
bool
FreePageManagerGet(FreePageManager *fpm, Size npages, Size *first_page)
{
    bool result;
    Size contiguous_pages;

    result = FreePageManagerGetInternal(fpm, npages, first_page);

    /*
     * It's a bit counterintuitive, but allocating pages can actually create
     * opportunities for cleanup that create larger ranges.  We might pull a
     * key out of the btree that enables the item at the head of the btree
     * recycle list to be inserted; and then if there are more items behind
     * it one of those might cause two currently-separated ranges to merge,
     * creating a single range of contiguous pages larger than any that
     * existed previously.  It might be worth trying to improve the cleanup
     * algorithm to avoid such corner cases, but for now we just notice the
     * condition and do the appropriate reporting.
     */
    contiguous_pages = FreePageBtreeCleanup(fpm);
    if (fpm->contiguous_pages < contiguous_pages)
        fpm->contiguous_pages = contiguous_pages;

    /*
     * FreePageManagerGetInternal may have set contiguous_pages_dirty.
     * Recompute contiguous_pages if so.
     */
    FreePageManagerUpdateLargest(fpm);

#ifdef FPM_EXTRA_ASSERTS
    if (result)
    {
        Assert(fpm->free_pages >= npages);
        fpm->free_pages -= npages;
    }
    Assert(fpm->free_pages == sum_free_pages(fpm));
    Assert(fpm->contiguous_pages == FreePageManagerLargestContiguous(fpm));
#endif
    return result;
}

#ifdef FPM_EXTRA_ASSERTS
static void
sum_free_pages_recurse(FreePageManager *fpm, FreePageBtree *btp, Size *sum)
{
    char *base = fpm_segment_base(fpm);

    Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC ||
           btp->hdr.magic == FREE_PAGE_LEAF_MAGIC);
    ++*sum;
    if (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC)
    {
        Size index;

        for (index = 0; index < btp->hdr.nused; ++index)
        {
            FreePageBtree *child;

            child = relptr_access(base, btp->u.internal_key[index].child);
            sum_free_pages_recurse(fpm, child, sum);
        }
    }
}

static Size
sum_free_pages(FreePageManager *fpm)
{
    FreePageSpanLeader *recycle;
    char *base = fpm_segment_base(fpm);
    Size sum = 0;
    int list;

    /* Count the spans by scanning the freelists. */
    for (list = 0; list < FPM_NUM_FREELISTS; ++list)
    {
        if (!relptr_is_null(fpm->freelist[list]))
        {
            FreePageSpanLeader *candidate =
                relptr_access(base, fpm->freelist[list]);

            do
            {
                sum += candidate->npages;
                candidate = relptr_access(base, candidate->next);
            } while (candidate != NULL);
        }
    }

    /* Count btree internal pages. */
    if (fpm->btree_depth > 0)
    {
        FreePageBtree *root = relptr_access(base, fpm->btree_root);

        sum_free_pages_recurse(fpm, root, &sum);
    }

    /* Count the recycle list. */
    for (recycle = relptr_access(base, fpm->btree_recycle);
         recycle != NULL;
         recycle = relptr_access(base, recycle->next))
    {
        Assert(recycle->npages == 1);
        ++sum;
    }

    return sum;
}
#endif

/*
 * Compute the size of the largest run of pages that the user could
 * successfully get.
 */
static Size
FreePageManagerLargestContiguous(FreePageManager *fpm)
{
    char *base;
    Size largest;

    base = fpm_segment_base(fpm);
    largest = 0;
    if (!relptr_is_null(fpm->freelist[FPM_NUM_FREELISTS - 1]))
    {
        FreePageSpanLeader *candidate;

        candidate = relptr_access(base, fpm->freelist[FPM_NUM_FREELISTS - 1]);
        do
        {
            if (candidate->npages > largest)
                largest = candidate->npages;
            candidate = relptr_access(base, candidate->next);
        } while (candidate != NULL);
    }
    else
    {
        Size f = FPM_NUM_FREELISTS - 1;

        do
        {
            --f;
            if (!relptr_is_null(fpm->freelist[f]))
            {
                largest = f + 1;
                break;
            }
        } while (f > 0);
    }

    return largest;
}

/*
 * Recompute the size of the largest run of pages that the user could
 * successfully get, if it has been marked dirty.
 */
static void
FreePageManagerUpdateLargest(FreePageManager *fpm)
{
    if (!fpm->contiguous_pages_dirty)
        return;

    fpm->contiguous_pages = FreePageManagerLargestContiguous(fpm);
    fpm->contiguous_pages_dirty = false;
}

/*
 * Transfer a run of pages to the free page manager.
 */
void
FreePageManagerPut(FreePageManager *fpm, Size first_page, Size npages)
{
    Size contiguous_pages;

    Assert(npages > 0);

    /* Record the new pages. */
    contiguous_pages =
        FreePageManagerPutInternal(fpm, first_page, npages, false);

    /*
     * If the new range we inserted into the page manager was contiguous with
     * an existing range, it may have opened up cleanup opportunities.
     */
    if (contiguous_pages > npages)
    {
        Size cleanup_contiguous_pages;

        cleanup_contiguous_pages = FreePageBtreeCleanup(fpm);
        if (cleanup_contiguous_pages > contiguous_pages)
            contiguous_pages = cleanup_contiguous_pages;
    }

    /* See if we now have a new largest chunk. */
    if (fpm->contiguous_pages < contiguous_pages)
        fpm->contiguous_pages = contiguous_pages;

    /*
     * The earlier call to FreePageManagerPutInternal may have set
     * contiguous_pages_dirty if it needed to allocate internal pages, so
     * recompute contiguous_pages if necessary.
     */
    FreePageManagerUpdateLargest(fpm);

#ifdef FPM_EXTRA_ASSERTS
    fpm->free_pages += npages;
    Assert(fpm->free_pages == sum_free_pages(fpm));
    Assert(fpm->contiguous_pages == FreePageManagerLargestContiguous(fpm));
#endif
}

/*
 * Produce a debugging dump of the state of a free page manager.
 */
char *
FreePageManagerDump(FreePageManager *fpm)
{
    char *base = fpm_segment_base(fpm);
    StringInfoData buf;
    FreePageSpanLeader *recycle;
    bool dumped_any_freelist = false;
    Size f;

    /* Initialize output buffer. */
    initStringInfo(&buf);

    /* Dump general stuff. */
    appendStringInfo(&buf, "metadata: self %zu max contiguous pages = %zu\n",
                     fpm->self.relptr_off, fpm->contiguous_pages);

    /* Dump btree. */
    if (fpm->btree_depth > 0)
    {
        FreePageBtree *root;

        appendStringInfo(&buf, "btree depth %u:\n", fpm->btree_depth);
        root = relptr_access(base, fpm->btree_root);
        FreePageManagerDumpBtree(fpm, root, NULL, 0, &buf);
    }
    else if (fpm->singleton_npages > 0)
    {
        appendStringInfo(&buf, "singleton: %zu(%zu)\n",
                         fpm->singleton_first_page, fpm->singleton_npages);
    }

    /* Dump btree recycle list. */
    recycle = relptr_access(base, fpm->btree_recycle);
    if (recycle != NULL)
    {
        appendStringInfoString(&buf, "btree recycle:");
        FreePageManagerDumpSpans(fpm, recycle, 1, &buf);
    }

    /* Dump free lists. */
    for (f = 0; f < FPM_NUM_FREELISTS; ++f)
    {
        FreePageSpanLeader *span;

        if (relptr_is_null(fpm->freelist[f]))
            continue;
        if (!dumped_any_freelist)
        {
            appendStringInfoString(&buf, "freelists:\n");
            dumped_any_freelist = true;
        }
        appendStringInfo(&buf, " %zu:", f + 1);
        span = relptr_access(base, fpm->freelist[f]);
        FreePageManagerDumpSpans(fpm, span, f + 1, &buf);
    }

    /* And return result to caller. */
    return buf.data;
}
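
/*
 * For illustration only (all values invented): a manager whose entire space
 * is one free span of 64 pages starting at page 10 might dump as
 *
 *   metadata: self 0 max contiguous pages = 64
 *   singleton: 10(64)
 *   freelists:
 *    64: 10
 */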

/*
 * The first_page value stored at index zero in any non-root page must match
 * the first_page value stored in its parent at the index which points to that
 * page.  So when the value stored at index zero in a btree page changes, we've
 * got to walk up the tree adjusting ancestor keys until we reach an ancestor
 * where that key isn't index zero.  This function should be called after
 * updating the first key on the target page; it will propagate the change
 * upward as far as needed.
 *
 * We assume here that the first key on the page has not changed enough to
 * require changes in the ordering of keys on its ancestor pages.  Thus,
 * if we search the parent page for the first key greater than or equal to
 * the first key on the current page, the downlink to this page will be either
 * the exact index returned by the search (if the first key decreased)
 * or one less (if the first key increased).
 */
static void
FreePageBtreeAdjustAncestorKeys(FreePageManager *fpm, FreePageBtree *btp)
{
    char *base = fpm_segment_base(fpm);
    Size first_page;
    FreePageBtree *parent;
    FreePageBtree *child;

    /* This might be either a leaf or an internal page. */
    Assert(btp->hdr.nused > 0);
    if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
    {
        Assert(btp->hdr.nused <= FPM_ITEMS_PER_LEAF_PAGE);
        first_page = btp->u.leaf_key[0].first_page;
    }
    else
    {
        Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
        Assert(btp->hdr.nused <= FPM_ITEMS_PER_INTERNAL_PAGE);
        first_page = btp->u.internal_key[0].first_page;
    }
    child = btp;

    /* Loop until we find an ancestor that does not require adjustment. */
    for (;;)
    {
        Size s;

        parent = relptr_access(base, child->hdr.parent);
        if (parent == NULL)
            break;
        s = FreePageBtreeSearchInternal(parent, first_page);

        /* Key is either at index s or index s-1; figure out which. */
        if (s >= parent->hdr.nused)
        {
            Assert(s == parent->hdr.nused);
            --s;
        }
        else
        {
            FreePageBtree *check;

            check = relptr_access(base, parent->u.internal_key[s].child);
            if (check != child)
            {
                Assert(s > 0);
                --s;
            }
        }

#ifdef USE_ASSERT_CHECKING
        /* Debugging double-check. */
        {
            FreePageBtree *check;

            check = relptr_access(base, parent->u.internal_key[s].child);
            Assert(s < parent->hdr.nused);
            Assert(child == check);
        }
#endif

        /* Update the parent key. */
        parent->u.internal_key[s].first_page = first_page;

        /*
         * If this is the first key in the parent, go up another level; else
         * done.
         */
        if (s > 0)
            break;
        child = parent;
    }
}
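
/*
 * A small worked example of the rule above (page numbers invented): if the
 * parent holds keys [100, 260, 400] and our downlink sits at index 1, then
 * after our first key drops from 260 to 250, searching the parent for the
 * first key >= 250 returns index 1, the exact downlink.  Had our first key
 * instead risen to 270, the search would return index 2, and the downlink
 * is at index 2 - 1 = 1.
 */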

/*
 * Attempt to reclaim space from the free-page btree.  The return value is
 * the largest range of contiguous pages created by the cleanup operation.
 */
static Size
FreePageBtreeCleanup(FreePageManager *fpm)
{
    char *base = fpm_segment_base(fpm);
    Size max_contiguous_pages = 0;

    /* Attempt to shrink the depth of the btree. */
    while (!relptr_is_null(fpm->btree_root))
    {
        FreePageBtree *root = relptr_access(base, fpm->btree_root);

        /* If the root contains only one key, reduce depth by one. */
        if (root->hdr.nused == 1)
        {
            /* Shrink depth of tree by one. */
            Assert(fpm->btree_depth > 0);
            --fpm->btree_depth;
            if (root->hdr.magic == FREE_PAGE_LEAF_MAGIC)
            {
                /* If root is a leaf, convert only entry to singleton range. */
                relptr_store(base, fpm->btree_root, (FreePageBtree *) NULL);
                fpm->singleton_first_page = root->u.leaf_key[0].first_page;
                fpm->singleton_npages = root->u.leaf_key[0].npages;
            }
            else
            {
                FreePageBtree *newroot;

                /* If root is an internal page, make only child the root. */
                Assert(root->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
                relptr_copy(fpm->btree_root, root->u.internal_key[0].child);
                newroot = relptr_access(base, fpm->btree_root);
                relptr_store(base, newroot->hdr.parent, (FreePageBtree *) NULL);
            }
            FreePageBtreeRecycle(fpm, fpm_pointer_to_page(base, root));
        }
        else if (root->hdr.nused == 2 &&
                 root->hdr.magic == FREE_PAGE_LEAF_MAGIC)
        {
            Size end_of_first;
            Size start_of_second;

            end_of_first = root->u.leaf_key[0].first_page +
                root->u.leaf_key[0].npages;
            start_of_second = root->u.leaf_key[1].first_page;

            if (end_of_first + 1 == start_of_second)
            {
                Size root_page = fpm_pointer_to_page(base, root);

                if (end_of_first == root_page)
                {
                    FreePagePopSpanLeader(fpm, root->u.leaf_key[0].first_page);
                    FreePagePopSpanLeader(fpm, root->u.leaf_key[1].first_page);
                    fpm->singleton_first_page = root->u.leaf_key[0].first_page;
                    fpm->singleton_npages = root->u.leaf_key[0].npages +
                        root->u.leaf_key[1].npages + 1;
                    fpm->btree_depth = 0;
                    relptr_store(base, fpm->btree_root,
                                 (FreePageBtree *) NULL);
                    FreePagePushSpanLeader(fpm, fpm->singleton_first_page,
                                           fpm->singleton_npages);
                    Assert(max_contiguous_pages == 0);
                    max_contiguous_pages = fpm->singleton_npages;
                }
            }

            /* Whether it worked or not, it's time to stop. */
            break;
        }
        else
        {
            /* Nothing more to do.  Stop. */
            break;
        }
    }

    /*
     * Attempt to free recycled btree pages.  We skip this if releasing the
     * recycled page would require a btree page split, because the page we're
     * trying to recycle would be consumed by the split, which would be
     * counterproductive.
     *
     * We also currently only ever attempt to recycle the first page on the
     * list; that could be made more aggressive, but it's not clear that the
     * complexity would be worthwhile.
     */
    while (fpm->btree_recycle_count > 0)
    {
        FreePageBtree *btp;
        Size first_page;
        Size contiguous_pages;

        btp = FreePageBtreeGetRecycled(fpm);
        first_page = fpm_pointer_to_page(base, btp);
        contiguous_pages = FreePageManagerPutInternal(fpm, first_page, 1, true);
        if (contiguous_pages == 0)
        {
            FreePageBtreeRecycle(fpm, first_page);
            break;
        }
        else
        {
            if (contiguous_pages > max_contiguous_pages)
                max_contiguous_pages = contiguous_pages;
        }
    }

    return max_contiguous_pages;
}

/*
 * Consider consolidating the given page with its left or right sibling,
 * if it's fairly empty.
 */
static void
FreePageBtreeConsolidate(FreePageManager *fpm, FreePageBtree *btp)
{
    char *base = fpm_segment_base(fpm);
    FreePageBtree *np;
    Size max;

    /*
     * We only try to consolidate pages that are less than a third full.  We
     * could be more aggressive about this, but that might risk performing
     * consolidation only to end up splitting again shortly thereafter.  Since
     * the btree should be very small compared to the space under management,
     * our goal isn't so much to ensure that it always occupies the absolutely
     * smallest possible number of pages as to reclaim pages before things get
     * too egregiously out of hand.
     */
    if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
        max = FPM_ITEMS_PER_LEAF_PAGE;
    else
    {
        Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
        max = FPM_ITEMS_PER_INTERNAL_PAGE;
    }
    if (btp->hdr.nused >= max / 3)
        return;

    /*
     * If we can fit our right sibling's keys onto this page, consolidate.
     */
    np = FreePageBtreeFindRightSibling(base, btp);
    if (np != NULL && btp->hdr.nused + np->hdr.nused <= max)
    {
        if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
        {
            memcpy(&btp->u.leaf_key[btp->hdr.nused], &np->u.leaf_key[0],
                   sizeof(FreePageBtreeLeafKey) * np->hdr.nused);
            btp->hdr.nused += np->hdr.nused;
        }
        else
        {
            memcpy(&btp->u.internal_key[btp->hdr.nused], &np->u.internal_key[0],
                   sizeof(FreePageBtreeInternalKey) * np->hdr.nused);
            btp->hdr.nused += np->hdr.nused;
            FreePageBtreeUpdateParentPointers(base, btp);
        }
        FreePageBtreeRemovePage(fpm, np);
        return;
    }

    /*
     * If we can fit our keys onto our left sibling's page, consolidate.  In
     * this case, we move our keys onto the other page rather than vice
     * versa, to avoid having to adjust ancestor keys.
     */
    np = FreePageBtreeFindLeftSibling(base, btp);
    if (np != NULL && btp->hdr.nused + np->hdr.nused <= max)
    {
        if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
        {
            memcpy(&np->u.leaf_key[np->hdr.nused], &btp->u.leaf_key[0],
                   sizeof(FreePageBtreeLeafKey) * btp->hdr.nused);
            np->hdr.nused += btp->hdr.nused;
        }
        else
        {
            memcpy(&np->u.internal_key[np->hdr.nused], &btp->u.internal_key[0],
                   sizeof(FreePageBtreeInternalKey) * btp->hdr.nused);
            np->hdr.nused += btp->hdr.nused;
            FreePageBtreeUpdateParentPointers(base, np);
        }
        FreePageBtreeRemovePage(fpm, btp);
        return;
    }
}

/*
 * Find the passed page's left sibling; that is, the page at the same level
 * of the tree whose keyspace immediately precedes ours.
 */
static FreePageBtree *
FreePageBtreeFindLeftSibling(char *base, FreePageBtree *btp)
{
    FreePageBtree *p = btp;
    int levels = 0;

    /* Move up until we can move left. */
    for (;;)
    {
        Size first_page;
        Size index;

        first_page = FreePageBtreeFirstKey(p);
        p = relptr_access(base, p->hdr.parent);

        if (p == NULL)
            return NULL;        /* we were passed the leftmost page */

        index = FreePageBtreeSearchInternal(p, first_page);
        if (index > 0)
        {
            Assert(p->u.internal_key[index].first_page == first_page);
            p = relptr_access(base, p->u.internal_key[index - 1].child);
            break;
        }
        Assert(index == 0);
        ++levels;
    }

    /* Descend right. */
    while (levels > 0)
    {
        Assert(p->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
        p = relptr_access(base, p->u.internal_key[p->hdr.nused - 1].child);
        --levels;
    }
    Assert(p->hdr.magic == btp->hdr.magic);

    return p;
}

/*
 * Find the passed page's right sibling; that is, the page at the same level
 * of the tree whose keyspace immediately follows ours.
 */
static FreePageBtree *
FreePageBtreeFindRightSibling(char *base, FreePageBtree *btp)
{
    FreePageBtree *p = btp;
    int levels = 0;

    /* Move up until we can move right. */
    for (;;)
    {
        Size first_page;
        Size index;

        first_page = FreePageBtreeFirstKey(p);
        p = relptr_access(base, p->hdr.parent);

        if (p == NULL)
            return NULL;        /* we were passed the rightmost page */

        index = FreePageBtreeSearchInternal(p, first_page);
        if (index < p->hdr.nused - 1)
        {
            Assert(p->u.internal_key[index].first_page == first_page);
            p = relptr_access(base, p->u.internal_key[index + 1].child);
            break;
        }
        Assert(index == p->hdr.nused - 1);
        ++levels;
    }

    /* Descend left. */
    while (levels > 0)
    {
        Assert(p->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
        p = relptr_access(base, p->u.internal_key[0].child);
        --levels;
    }
    Assert(p->hdr.magic == btp->hdr.magic);

    return p;
}

/*
 * Get the first key on a btree page.
 */
static Size
FreePageBtreeFirstKey(FreePageBtree *btp)
{
    Assert(btp->hdr.nused > 0);

    if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
        return btp->u.leaf_key[0].first_page;
    else
    {
        Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
        return btp->u.internal_key[0].first_page;
    }
}

/*
 * Get a page from the btree recycle list for use as a btree page.
 */
static FreePageBtree *
FreePageBtreeGetRecycled(FreePageManager *fpm)
{
    char *base = fpm_segment_base(fpm);
    FreePageSpanLeader *victim = relptr_access(base, fpm->btree_recycle);
    FreePageSpanLeader *newhead;

    Assert(victim != NULL);
    newhead = relptr_access(base, victim->next);
    if (newhead != NULL)
        relptr_copy(newhead->prev, victim->prev);
    relptr_store(base, fpm->btree_recycle, newhead);
    Assert(fpm_pointer_is_page_aligned(base, victim));
    fpm->btree_recycle_count--;
    return (FreePageBtree *) victim;
}

/*
 * Insert an item into an internal page.
 */
static void
FreePageBtreeInsertInternal(char *base, FreePageBtree *btp, Size index,
                            Size first_page, FreePageBtree *child)
{
    Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
    Assert(btp->hdr.nused <= FPM_ITEMS_PER_INTERNAL_PAGE);
    Assert(index <= btp->hdr.nused);
    memmove(&btp->u.internal_key[index + 1], &btp->u.internal_key[index],
            sizeof(FreePageBtreeInternalKey) * (btp->hdr.nused - index));
    btp->u.internal_key[index].first_page = first_page;
    relptr_store(base, btp->u.internal_key[index].child, child);
    ++btp->hdr.nused;
}

/*
 * Insert an item into a leaf page.
 */
static void
FreePageBtreeInsertLeaf(FreePageBtree *btp, Size index, Size first_page,
                        Size npages)
{
    Assert(btp->hdr.magic == FREE_PAGE_LEAF_MAGIC);
    Assert(btp->hdr.nused <= FPM_ITEMS_PER_LEAF_PAGE);
    Assert(index <= btp->hdr.nused);
    memmove(&btp->u.leaf_key[index + 1], &btp->u.leaf_key[index],
            sizeof(FreePageBtreeLeafKey) * (btp->hdr.nused - index));
    btp->u.leaf_key[index].first_page = first_page;
    btp->u.leaf_key[index].npages = npages;
    ++btp->hdr.nused;
}

/*
 * Put a page on the btree recycle list.
 */
static void
FreePageBtreeRecycle(FreePageManager *fpm, Size pageno)
{
    char *base = fpm_segment_base(fpm);
    FreePageSpanLeader *head = relptr_access(base, fpm->btree_recycle);
    FreePageSpanLeader *span;

    span = (FreePageSpanLeader *) fpm_page_to_pointer(base, pageno);
    span->magic = FREE_PAGE_SPAN_LEADER_MAGIC;
    span->npages = 1;
    relptr_store(base, span->next, head);
    relptr_store(base, span->prev, (FreePageSpanLeader *) NULL);
    if (head != NULL)
        relptr_store(base, head->prev, span);
    relptr_store(base, fpm->btree_recycle, span);
    fpm->btree_recycle_count++;
}

/*
 * Remove an item from the btree at the given position on the given page.
 */
static void
FreePageBtreeRemove(FreePageManager *fpm, FreePageBtree *btp, Size index)
{
    Assert(btp->hdr.magic == FREE_PAGE_LEAF_MAGIC);
    Assert(index < btp->hdr.nused);

    /* When last item is removed, extirpate entire page from btree. */
    if (btp->hdr.nused == 1)
    {
        FreePageBtreeRemovePage(fpm, btp);
        return;
    }

    /* Physically remove the key from the page. */
    --btp->hdr.nused;
    if (index < btp->hdr.nused)
        memmove(&btp->u.leaf_key[index], &btp->u.leaf_key[index + 1],
                sizeof(FreePageBtreeLeafKey) * (btp->hdr.nused - index));

    /* If we just removed the first key, adjust ancestor keys. */
    if (index == 0)
        FreePageBtreeAdjustAncestorKeys(fpm, btp);

    /* Consider whether to consolidate this page with a sibling. */
    FreePageBtreeConsolidate(fpm, btp);
}

/*
 * Remove a page from the btree.  Caller is responsible for having relocated
 * any keys from this page that are still wanted.  The page is placed on the
 * recycled list.
 */
static void
FreePageBtreeRemovePage(FreePageManager *fpm, FreePageBtree *btp)
{
    char *base = fpm_segment_base(fpm);
    FreePageBtree *parent;
    Size index;
    Size first_page;

    for (;;)
    {
        /* Find parent page. */
        parent = relptr_access(base, btp->hdr.parent);
        if (parent == NULL)
        {
            /* We are removing the root page. */
            relptr_store(base, fpm->btree_root, (FreePageBtree *) NULL);
            fpm->btree_depth = 0;
            Assert(fpm->singleton_first_page == 0);
            Assert(fpm->singleton_npages == 0);
            return;
        }

        /*
         * If the parent contains only one item, we need to remove it as well.
         */
        if (parent->hdr.nused > 1)
            break;
        FreePageBtreeRecycle(fpm, fpm_pointer_to_page(base, btp));
        btp = parent;
    }

    /* Find and remove the downlink. */
    first_page = FreePageBtreeFirstKey(btp);
    if (parent->hdr.magic == FREE_PAGE_LEAF_MAGIC)
    {
        index = FreePageBtreeSearchLeaf(parent, first_page);
        Assert(index < parent->hdr.nused);
        if (index < parent->hdr.nused - 1)
            memmove(&parent->u.leaf_key[index],
                    &parent->u.leaf_key[index + 1],
                    sizeof(FreePageBtreeLeafKey)
                    * (parent->hdr.nused - index - 1));
    }
    else
    {
        index = FreePageBtreeSearchInternal(parent, first_page);
        Assert(index < parent->hdr.nused);
        if (index < parent->hdr.nused - 1)
            memmove(&parent->u.internal_key[index],
                    &parent->u.internal_key[index + 1],
                    sizeof(FreePageBtreeInternalKey)
                    * (parent->hdr.nused - index - 1));
    }
    parent->hdr.nused--;
    Assert(parent->hdr.nused > 0);

    /* Recycle the page. */
    FreePageBtreeRecycle(fpm, fpm_pointer_to_page(base, btp));

    /* Adjust ancestor keys if needed. */
    if (index == 0)
        FreePageBtreeAdjustAncestorKeys(fpm, parent);

    /* Consider whether to consolidate the parent with a sibling. */
    FreePageBtreeConsolidate(fpm, parent);
}

/*
 * Search the btree for an entry for the given first page and initialize
 * *result with the results of the search.  result->page and result->index
 * indicate either the position of an exact match or the position at which
 * the new key should be inserted.  result->found is true for an exact match,
 * otherwise false.  result->split_pages will contain the number of additional
 * btree pages that will be needed when performing a split to insert a key.
 * Except as described above, the contents of fields in the result object are
 * undefined on return.
 */
static void
FreePageBtreeSearch(FreePageManager *fpm, Size first_page,
                    FreePageBtreeSearchResult *result)
{
    char *base = fpm_segment_base(fpm);
    FreePageBtree *btp = relptr_access(base, fpm->btree_root);
    Size index;

    result->split_pages = 1;

    /* If the btree is empty, there's nothing to find. */
    if (btp == NULL)
    {
        result->page = NULL;
        result->found = false;
        return;
    }

    /* Descend until we hit a leaf. */
    while (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC)
    {
        FreePageBtree *child;
        bool found_exact;

        index = FreePageBtreeSearchInternal(btp, first_page);
        found_exact = index < btp->hdr.nused &&
            btp->u.internal_key[index].first_page == first_page;

        /*
         * If we found an exact match we descend directly.  Otherwise, we
         * descend into the child to the left if possible so that we can find
         * the insertion point at that child's high end.
         */
        if (!found_exact && index > 0)
            --index;

        /* Track required split depth for leaf insert. */
        if (btp->hdr.nused >= FPM_ITEMS_PER_INTERNAL_PAGE)
        {
            Assert(btp->hdr.nused == FPM_ITEMS_PER_INTERNAL_PAGE);
            result->split_pages++;
        }
        else
            result->split_pages = 0;

        /* Descend to appropriate child page. */
        Assert(index < btp->hdr.nused);
        child = relptr_access(base, btp->u.internal_key[index].child);
        Assert(relptr_access(base, child->hdr.parent) == btp);
        btp = child;
    }

    /* Track required split depth for leaf insert. */
    if (btp->hdr.nused >= FPM_ITEMS_PER_LEAF_PAGE)
    {
        Assert(btp->hdr.nused == FPM_ITEMS_PER_LEAF_PAGE);
        result->split_pages++;
    }
    else
        result->split_pages = 0;

    /* Search leaf page. */
    index = FreePageBtreeSearchLeaf(btp, first_page);

    /* Assemble results. */
    result->page = btp;
    result->index = index;
    result->found = index < btp->hdr.nused &&
        first_page == btp->u.leaf_key[index].first_page;
}

/*
 * Search an internal page for the first key greater than or equal to a given
 * page number.  Returns the index of that key, or the number of keys on the
 * page if none.
 */
static Size
FreePageBtreeSearchInternal(FreePageBtree *btp, Size first_page)
{
    Size low = 0;
    Size high = btp->hdr.nused;

    Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
    Assert(high > 0 && high <= FPM_ITEMS_PER_INTERNAL_PAGE);

    while (low < high)
    {
        Size mid = (low + high) / 2;
        Size val = btp->u.internal_key[mid].first_page;

        if (first_page == val)
            return mid;
        else if (first_page < val)
            high = mid;
        else
            low = mid + 1;
    }

    return low;
}

/*
 * Search a leaf page for the first key greater than or equal to a given
 * page number.  Returns the index of that key, or the number of keys on the
 * page if none.
 */
static Size
FreePageBtreeSearchLeaf(FreePageBtree *btp, Size first_page)
{
    Size low = 0;
    Size high = btp->hdr.nused;

    Assert(btp->hdr.magic == FREE_PAGE_LEAF_MAGIC);
    Assert(high > 0 && high <= FPM_ITEMS_PER_LEAF_PAGE);

    while (low < high)
    {
        Size mid = (low + high) / 2;
        Size val = btp->u.leaf_key[mid].first_page;

        if (first_page == val)
            return mid;
        else if (first_page < val)
            high = mid;
        else
            low = mid + 1;
    }

    return low;
}
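
/*
 * For example, with keys whose first_page values are [3, 10, 17]: searching
 * for 10 returns index 1 (exact match), searching for 12 returns index 2
 * (the first key >= 12), and searching for 20 returns 3, the number of keys
 * on the page, signalling that no key qualifies.
 */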

/*
 * Allocate a new btree page and move half the keys from the provided page
 * to the new page.  Caller is responsible for making sure that there's a
 * page available from fpm->btree_recycle.  Returns a pointer to the new page,
 * to which caller must add a downlink.
 */
static FreePageBtree *
FreePageBtreeSplitPage(FreePageManager *fpm, FreePageBtree *btp)
{
    FreePageBtree *newsibling;

    newsibling = FreePageBtreeGetRecycled(fpm);
    newsibling->hdr.magic = btp->hdr.magic;
    newsibling->hdr.nused = btp->hdr.nused / 2;
    relptr_copy(newsibling->hdr.parent, btp->hdr.parent);
    btp->hdr.nused -= newsibling->hdr.nused;

    if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
        memcpy(&newsibling->u.leaf_key,
               &btp->u.leaf_key[btp->hdr.nused],
               sizeof(FreePageBtreeLeafKey) * newsibling->hdr.nused);
    else
    {
        Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
        memcpy(&newsibling->u.internal_key,
               &btp->u.internal_key[btp->hdr.nused],
               sizeof(FreePageBtreeInternalKey) * newsibling->hdr.nused);
        FreePageBtreeUpdateParentPointers(fpm_segment_base(fpm), newsibling);
    }

    return newsibling;
}

/*
 * When internal pages are split or merged, the parent pointers of their
 * children must be updated.
 */
static void
FreePageBtreeUpdateParentPointers(char *base, FreePageBtree *btp)
{
    Size i;

    Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
    for (i = 0; i < btp->hdr.nused; ++i)
    {
        FreePageBtree *child;

        child = relptr_access(base, btp->u.internal_key[i].child);
        relptr_store(base, child->hdr.parent, btp);
    }
}

/*
 * Debugging dump of btree data.
 */
static void
FreePageManagerDumpBtree(FreePageManager *fpm, FreePageBtree *btp,
                         FreePageBtree *parent, int level, StringInfo buf)
{
    char *base = fpm_segment_base(fpm);
    Size pageno = fpm_pointer_to_page(base, btp);
    Size index;
    FreePageBtree *check_parent;

    check_stack_depth();
    check_parent = relptr_access(base, btp->hdr.parent);
    appendStringInfo(buf, " %zu@%d %c", pageno, level,
                     btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC ? 'i' : 'l');
    if (parent != check_parent)
        appendStringInfo(buf, " [actual parent %zu, expected %zu]",
                         fpm_pointer_to_page(base, check_parent),
                         fpm_pointer_to_page(base, parent));
    appendStringInfoChar(buf, ':');
    for (index = 0; index < btp->hdr.nused; ++index)
    {
        if (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC)
            appendStringInfo(buf, " %zu->%zu",
                             btp->u.internal_key[index].first_page,
                             btp->u.internal_key[index].child.relptr_off / FPM_PAGE_SIZE);
        else
            appendStringInfo(buf, " %zu(%zu)",
                             btp->u.leaf_key[index].first_page,
                             btp->u.leaf_key[index].npages);
    }
    appendStringInfoChar(buf, '\n');

    if (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC)
    {
        for (index = 0; index < btp->hdr.nused; ++index)
        {
            FreePageBtree *child;

            child = relptr_access(base, btp->u.internal_key[index].child);
            FreePageManagerDumpBtree(fpm, child, btp, level + 1, buf);
        }
    }
}

/*
 * Debugging dump of free-span data.
 */
static void
FreePageManagerDumpSpans(FreePageManager *fpm, FreePageSpanLeader *span,
                         Size expected_pages, StringInfo buf)
{
    char *base = fpm_segment_base(fpm);

    while (span != NULL)
    {
        if (span->npages != expected_pages)
            appendStringInfo(buf, " %zu(%zu)", fpm_pointer_to_page(base, span),
                             span->npages);
        else
            appendStringInfo(buf, " %zu", fpm_pointer_to_page(base, span));
        span = relptr_access(base, span->next);
    }

    appendStringInfoChar(buf, '\n');
}

/*
 * This function allocates a run of pages of the given length from the free
 * page manager.
 */
static bool
FreePageManagerGetInternal(FreePageManager *fpm, Size npages, Size *first_page)
{
    char *base = fpm_segment_base(fpm);
    FreePageSpanLeader *victim = NULL;
    FreePageSpanLeader *prev;
    FreePageSpanLeader *next;
    FreePageBtreeSearchResult result;
    Size victim_page = 0;       /* placate compiler */
    Size f;

    /*
     * Search for a free span.
     *
     * Right now, we use a simple best-fit policy here, but it's possible for
     * this to result in memory fragmentation if we're repeatedly asked to
     * allocate chunks just a little smaller than what we have available.
     * Hopefully, this is unlikely, because we expect most requests to be
     * single pages or superblock-sized chunks -- but no policy can be optimal
     * under all circumstances unless it has knowledge of future allocation
     * patterns.
     */
    for (f = Min(npages, FPM_NUM_FREELISTS) - 1; f < FPM_NUM_FREELISTS; ++f)
    {
        /* Skip empty freelists. */
        if (relptr_is_null(fpm->freelist[f]))
            continue;

        /*
         * All of the freelists except the last one contain only items of a
         * single size, so we just take the first one.  But the final free
         * list contains everything too big for any of the other lists, so we
         * need to search the list.
         */
        if (f < FPM_NUM_FREELISTS - 1)
            victim = relptr_access(base, fpm->freelist[f]);
        else
        {
            FreePageSpanLeader *candidate;

            candidate = relptr_access(base, fpm->freelist[f]);
            do
            {
                if (candidate->npages >= npages && (victim == NULL ||
                                                    victim->npages > candidate->npages))
                {
                    victim = candidate;
                    if (victim->npages == npages)
                        break;
                }
                candidate = relptr_access(base, candidate->next);
            } while (candidate != NULL);
        }
        break;
    }

    /* If we didn't find an allocatable span, return failure. */
    if (victim == NULL)
        return false;

    /* Remove span from free list. */
    Assert(victim->magic == FREE_PAGE_SPAN_LEADER_MAGIC);
    prev = relptr_access(base, victim->prev);
    next = relptr_access(base, victim->next);
    if (prev != NULL)
        relptr_copy(prev->next, victim->next);
    else
        relptr_copy(fpm->freelist[f], victim->next);
    if (next != NULL)
        relptr_copy(next->prev, victim->prev);
    victim_page = fpm_pointer_to_page(base, victim);

    /* Decide whether we might be invalidating contiguous_pages. */
    if (f == FPM_NUM_FREELISTS - 1 &&
        victim->npages == fpm->contiguous_pages)
    {
        /*
         * The victim span came from the oversized freelist, and had the same
         * size as the longest span.  There may or may not be another one of
         * the same size, so contiguous_pages must be recomputed just to be
         * safe.
         */
        fpm->contiguous_pages_dirty = true;
    }
    else if (f + 1 == fpm->contiguous_pages &&
             relptr_is_null(fpm->freelist[f]))
    {
        /*
         * The victim span came from a fixed sized freelist, and it was the
         * list for spans of the same size as the current longest span, and
         * the list is now empty after removing the victim.  So
         * contiguous_pages must be recomputed without a doubt.
         */
        fpm->contiguous_pages_dirty = true;
    }

    /*
     * If we haven't initialized the btree yet, the victim must be the single
     * span stored within the FreePageManager itself.  Otherwise, we need to
     * update the btree.
     */
    if (relptr_is_null(fpm->btree_root))
    {
        Assert(victim_page == fpm->singleton_first_page);
        Assert(victim->npages == fpm->singleton_npages);
        Assert(victim->npages >= npages);
        fpm->singleton_first_page += npages;
        fpm->singleton_npages -= npages;
        if (fpm->singleton_npages > 0)
            FreePagePushSpanLeader(fpm, fpm->singleton_first_page,
                                   fpm->singleton_npages);
    }
    else
    {
        /*
         * If the span we found is exactly the right size, remove it from the
         * btree completely.  Otherwise, adjust the btree entry to reflect the
         * still-unallocated portion of the span, and put that portion on the
         * appropriate free list.
         */
        FreePageBtreeSearch(fpm, victim_page, &result);
        Assert(result.found);
        if (victim->npages == npages)
            FreePageBtreeRemove(fpm, result.page, result.index);
        else
        {
            FreePageBtreeLeafKey *key;

            /* Adjust btree to reflect remaining pages. */
            Assert(victim->npages > npages);
            key = &result.page->u.leaf_key[result.index];
            Assert(key->npages == victim->npages);
            key->first_page += npages;
            key->npages -= npages;
            if (result.index == 0)
                FreePageBtreeAdjustAncestorKeys(fpm, result.page);

            /* Put the unallocated pages back on the appropriate free list. */
            FreePagePushSpanLeader(fpm, victim_page + npages,
                                   victim->npages - npages);
        }
    }

    /* Return results to caller. */
    *first_page = fpm_pointer_to_page(base, victim);
    return true;
}

/*
 * Put a range of pages into the btree and freelists, consolidating it with
 * existing free spans just before and/or after it.  If 'soft' is true,
 * only perform the insertion if it can be done without allocating new btree
 * pages; if false, do it always.  Returns 0 if the soft flag caused the
 * insertion to be skipped, or otherwise the size of the contiguous span
 * created by the insertion.  This may be larger than npages if we're able
 * to consolidate with an adjacent range.
 */
static Size
FreePageManagerPutInternal(FreePageManager *fpm, Size first_page, Size npages,
                           bool soft)
{
    char *base = fpm_segment_base(fpm);
    FreePageBtreeSearchResult result;
    FreePageBtreeLeafKey *prevkey = NULL;
    FreePageBtreeLeafKey *nextkey = NULL;
    FreePageBtree *np;
    Size nindex;

    Assert(npages > 0);

    /* We can store a single free span without initializing the btree. */
    if (fpm->btree_depth == 0)
    {
        if (fpm->singleton_npages == 0)
        {
            /* Don't have a span yet; store this one. */
            fpm->singleton_first_page = first_page;
            fpm->singleton_npages = npages;
            FreePagePushSpanLeader(fpm, first_page, npages);
            return fpm->singleton_npages;
        }
        else if (fpm->singleton_first_page + fpm->singleton_npages ==
                 first_page)
        {
            /* New span immediately follows sole existing span. */
            fpm->singleton_npages += npages;
            FreePagePopSpanLeader(fpm, fpm->singleton_first_page);
            FreePagePushSpanLeader(fpm, fpm->singleton_first_page,
                                   fpm->singleton_npages);
            return fpm->singleton_npages;
        }
        else if (first_page + npages == fpm->singleton_first_page)
        {
            /* New span immediately precedes sole existing span. */
            FreePagePopSpanLeader(fpm, fpm->singleton_first_page);
            fpm->singleton_first_page = first_page;
            fpm->singleton_npages += npages;
            FreePagePushSpanLeader(fpm, fpm->singleton_first_page,
                                   fpm->singleton_npages);
            return fpm->singleton_npages;
        }
        else
        {
            /* Not contiguous; we need to initialize the btree. */
            Size root_page;
            FreePageBtree *root;

            if (!relptr_is_null(fpm->btree_recycle))
                root = FreePageBtreeGetRecycled(fpm);
            else if (soft)
                return 0;       /* Should not allocate if soft. */
            else if (FreePageManagerGetInternal(fpm, 1, &root_page))
                root = (FreePageBtree *) fpm_page_to_pointer(base, root_page);
            else
            {
                /* We'd better be able to get a page from the existing range. */
                elog(FATAL, "free page manager btree is corrupt");
            }

            /* Create the btree and move the preexisting range into it. */
            root->hdr.magic = FREE_PAGE_LEAF_MAGIC;
            root->hdr.nused = 1;
            relptr_store(base, root->hdr.parent, (FreePageBtree *) NULL);
            root->u.leaf_key[0].first_page = fpm->singleton_first_page;
            root->u.leaf_key[0].npages = fpm->singleton_npages;
            relptr_store(base, fpm->btree_root, root);
            fpm->singleton_first_page = 0;
            fpm->singleton_npages = 0;
            fpm->btree_depth = 1;

            /*
             * Corner case: it may be that the btree root took the very last
             * free page.  In that case, the sole btree entry covers a zero
             * page run, which is invalid.  Overwrite it with the entry we're
             * trying to insert and get out.
             */
            if (root->u.leaf_key[0].npages == 0)
            {
                root->u.leaf_key[0].first_page = first_page;
                root->u.leaf_key[0].npages = npages;
                FreePagePushSpanLeader(fpm, first_page, npages);
                return npages;
            }

            /* Fall through to insert the new key. */
        }
    }

    /* Search the btree. */
    FreePageBtreeSearch(fpm, first_page, &result);
    Assert(!result.found);
    if (result.index > 0)
        prevkey = &result.page->u.leaf_key[result.index - 1];
    if (result.index < result.page->hdr.nused)
    {
        np = result.page;
        nindex = result.index;
        nextkey = &result.page->u.leaf_key[result.index];
    }
    else
    {
        np = FreePageBtreeFindRightSibling(base, result.page);
        nindex = 0;
        if (np != NULL)
            nextkey = &np->u.leaf_key[0];
    }

    /* Consolidate with the previous entry if possible. */
    if (prevkey != NULL && prevkey->first_page + prevkey->npages >= first_page)
    {
        bool remove_next = false;
        Size result;

        Assert(prevkey->first_page + prevkey->npages == first_page);
        prevkey->npages = (first_page - prevkey->first_page) + npages;

        /* Check whether we can *also* consolidate with the following entry. */
        if (nextkey != NULL &&
            prevkey->first_page + prevkey->npages >= nextkey->first_page)
        {
            Assert(prevkey->first_page + prevkey->npages ==
                   nextkey->first_page);
            prevkey->npages = (nextkey->first_page - prevkey->first_page)
                + nextkey->npages;
            FreePagePopSpanLeader(fpm, nextkey->first_page);
            remove_next = true;
        }

        /* Put the span on the correct freelist and save size. */
        FreePagePopSpanLeader(fpm, prevkey->first_page);
        FreePagePushSpanLeader(fpm, prevkey->first_page, prevkey->npages);
        result = prevkey->npages;

        /*
         * If we consolidated with both the preceding and following entries,
         * we must remove the following entry.  We do this last, because
         * removing an element from the btree may invalidate pointers we hold
         * into the current data structure.
         *
         * NB: The btree is technically in an invalid state at this point
         * because we've already updated prevkey to cover the same key space
         * as nextkey.  FreePageBtreeRemove() shouldn't notice that, though.
         */
        if (remove_next)
            FreePageBtreeRemove(fpm, np, nindex);

        return result;
    }

    /* Consolidate with the next entry if possible. */
    if (nextkey != NULL && first_page + npages >= nextkey->first_page)
    {
        Size newpages;

        /* Compute new size for span. */
        Assert(first_page + npages == nextkey->first_page);
        newpages = (nextkey->first_page - first_page) + nextkey->npages;

        /* Put span on correct free list. */
        FreePagePopSpanLeader(fpm, nextkey->first_page);
        FreePagePushSpanLeader(fpm, first_page, newpages);

        /* Update key in place. */
        nextkey->first_page = first_page;
        nextkey->npages = newpages;

        /* If reducing first key on page, ancestors might need adjustment. */
        if (nindex == 0)
            FreePageBtreeAdjustAncestorKeys(fpm, np);

        return nextkey->npages;
    }

    /* Split leaf page and as many of its ancestors as necessary. */
    if (result.split_pages > 0)
    {
        /*
         * NB: We could consider various coping strategies here to avoid a
         * split; most obviously, if np != result.page, we could target that
         * page instead.  More complicated shuffling strategies could be
         * possible as well; basically, unless every single leaf page is 100%
         * full, we can jam this key in there if we try hard enough.  It's
         * unlikely that trying that hard is worthwhile, but it's possible we
         * might need to make more than no effort.  For now, we just do the
         * easy thing, which is nothing.
         */

        /* If this is a soft insert, it's time to give up. */
        if (soft)
            return 0;

        /* Check whether we need to allocate more btree pages to split. */
        if (result.split_pages > fpm->btree_recycle_count)
        {
            Size pages_needed;
            Size recycle_page;
            Size i;

            /*
             * Allocate the required number of pages and split each one in
             * turn.  This should never fail, because if we've got enough
             * spans of free pages kicking around that we need additional
             * storage space just to remember them all, then we should
             * certainly have enough to expand the btree, which should only
             * ever use a tiny number of pages compared to the number under
             * management.  If it does, something's badly screwed up.
             */
            pages_needed = result.split_pages - fpm->btree_recycle_count;
            for (i = 0; i < pages_needed; ++i)
            {
                if (!FreePageManagerGetInternal(fpm, 1, &recycle_page))
                    elog(FATAL, "free page manager btree is corrupt");
                FreePageBtreeRecycle(fpm, recycle_page);
            }

            /*
             * The act of allocating pages to recycle may have invalidated the
             * results of our previous btree search, so repeat it.  (We could
             * recheck whether any of our split-avoidance strategies that were
             * not viable before now are, but it hardly seems worthwhile, so
             * we don't bother.  Consolidation can't be possible now if it
             * wasn't previously.)
             */
            FreePageBtreeSearch(fpm, first_page, &result);

            /*
             * The act of allocating pages for use in constructing our btree
             * should never cause any page to become more full, so the new
             * split depth should be no greater than the old one, and perhaps
             * less if we fortuitously allocated a chunk that freed up a slot
             * on the page we need to update.
             */
            Assert(result.split_pages <= fpm->btree_recycle_count);
        }

        /* If we still need to perform a split, do it. */
        if (result.split_pages > 0)
        {
            FreePageBtree *split_target = result.page;
            FreePageBtree *child = NULL;
            Size key = first_page;

            for (;;)
            {
                FreePageBtree *newsibling;
                FreePageBtree *parent;

                /* Identify parent page, which must receive downlink. */
                parent = relptr_access(base, split_target->hdr.parent);

                /* Split the page - downlink not added yet. */
                newsibling = FreePageBtreeSplitPage(fpm, split_target);

                /*
                 * At this point in the loop, we're always carrying a pending
                 * insertion.  On the first pass, it's the actual key we're
                 * trying to insert; on subsequent passes, it's the downlink
                 * that needs to be added as a result of the split performed
                 * during the previous loop iteration.  Since we've just split
                 * the page, there's definitely room on one of the two
                 * resulting pages.
                 */
                if (child == NULL)
                {
                    Size index;
                    FreePageBtree *insert_into;

                    insert_into = key < newsibling->u.leaf_key[0].first_page ?
                        split_target : newsibling;
                    index = FreePageBtreeSearchLeaf(insert_into, key);
                    FreePageBtreeInsertLeaf(insert_into, index, key, npages);
                    if (index == 0 && insert_into == split_target)
                        FreePageBtreeAdjustAncestorKeys(fpm, split_target);
                }
                else
                {
                    Size index;
                    FreePageBtree *insert_into;

                    insert_into =
                        key < newsibling->u.internal_key[0].first_page ?
                        split_target : newsibling;
                    index = FreePageBtreeSearchInternal(insert_into, key);
                    FreePageBtreeInsertInternal(base, insert_into, index,
                                                key, child);
                    relptr_store(base, child->hdr.parent, insert_into);
                    if (index == 0 && insert_into == split_target)
                        FreePageBtreeAdjustAncestorKeys(fpm, split_target);
                }

                /* If the page we just split has no parent, split the root. */
                if (parent == NULL)
                {
                    FreePageBtree *newroot;

                    newroot = FreePageBtreeGetRecycled(fpm);
                    newroot->hdr.magic = FREE_PAGE_INTERNAL_MAGIC;
                    newroot->hdr.nused = 2;
                    relptr_store(base, newroot->hdr.parent,
                                 (FreePageBtree *) NULL);
                    newroot->u.internal_key[0].first_page =
                        FreePageBtreeFirstKey(split_target);
                    relptr_store(base, newroot->u.internal_key[0].child,
                                 split_target);
                    relptr_store(base, split_target->hdr.parent, newroot);
                    newroot->u.internal_key[1].first_page =
                        FreePageBtreeFirstKey(newsibling);
                    relptr_store(base, newroot->u.internal_key[1].child,
                                 newsibling);
                    relptr_store(base, newsibling->hdr.parent, newroot);
                    relptr_store(base, fpm->btree_root, newroot);
                    fpm->btree_depth++;

                    break;
                }

                /* If the parent page isn't full, insert the downlink. */
                key = newsibling->u.internal_key[0].first_page;
                if (parent->hdr.nused < FPM_ITEMS_PER_INTERNAL_PAGE)
                {
                    Size index;

                    index = FreePageBtreeSearchInternal(parent, key);
                    FreePageBtreeInsertInternal(base, parent, index,
                                                key, newsibling);
                    relptr_store(base, newsibling->hdr.parent, parent);
                    if (index == 0)
                        FreePageBtreeAdjustAncestorKeys(fpm, parent);
                    break;
                }

                /* The parent also needs to be split, so loop around. */
                child = newsibling;
                split_target = parent;
            }

            /*
             * The loop above did the insert, so just need to update the free
             * list, and we're done.
             */
            FreePagePushSpanLeader(fpm, first_page, npages);

            return npages;
        }
    }

    /* Physically add the key to the page. */
    Assert(result.page->hdr.nused < FPM_ITEMS_PER_LEAF_PAGE);
    FreePageBtreeInsertLeaf(result.page, result.index, first_page, npages);

    /* If new first key on page, ancestors might need adjustment. */
    if (result.index == 0)
        FreePageBtreeAdjustAncestorKeys(fpm, result.page);

    /* Put it on the free list. */
    FreePagePushSpanLeader(fpm, first_page, npages);

    return npages;
}

/*
 * Remove a FreePageSpanLeader from the linked-list that contains it, either
 * because we're changing the size of the span, or because we're allocating it.
 */
static void
FreePagePopSpanLeader(FreePageManager *fpm, Size pageno)
{
    char *base = fpm_segment_base(fpm);
    FreePageSpanLeader *span;
    FreePageSpanLeader *next;
    FreePageSpanLeader *prev;

    span = (FreePageSpanLeader *) fpm_page_to_pointer(base, pageno);

    next = relptr_access(base, span->next);
    prev = relptr_access(base, span->prev);
    if (next != NULL)
        relptr_copy(next->prev, span->prev);
    if (prev != NULL)
        relptr_copy(prev->next, span->next);
    else
    {
        Size f = Min(span->npages, FPM_NUM_FREELISTS) - 1;

        Assert(fpm->freelist[f].relptr_off == pageno * FPM_PAGE_SIZE);
        relptr_copy(fpm->freelist[f], span->next);
    }
}

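/*
 * To illustrate the freelist binning used above and below (assuming, for the
 * sake of the example, that FPM_NUM_FREELISTS is 129): a 1-page span lives on
 * freelist 0, a 128-page span on freelist 127, and every span of 129 pages or
 * more shares the final "oversized" freelist 128, which is why
 * FreePageManagerGetInternal must search that last list for a best fit.
 */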
/*
 * Initialize a new FreePageSpanLeader and put it on the appropriate free list.
 */
static void
FreePagePushSpanLeader(FreePageManager *fpm, Size first_page, Size npages)
{
    char *base = fpm_segment_base(fpm);
    Size f = Min(npages, FPM_NUM_FREELISTS) - 1;
    FreePageSpanLeader *head = relptr_access(base, fpm->freelist[f]);
    FreePageSpanLeader *span;

    span = (FreePageSpanLeader *) fpm_page_to_pointer(base, first_page);
    span->magic = FREE_PAGE_SPAN_LEADER_MAGIC;
    span->npages = npages;
    relptr_store(base, span->next, head);
    relptr_store(base, span->prev, (FreePageSpanLeader *) NULL);
    if (head != NULL)
        relptr_store(base, head->prev, span);
    relptr_store(base, fpm->freelist[f], span);
}