/*-------------------------------------------------------------------------
 *
 * freepage.c
 *	  Management of free memory pages.
 *
 * The intention of this code is to provide infrastructure for memory
 * allocators written specifically for PostgreSQL.  At least in the case
 * of dynamic shared memory, we can't simply use malloc() or even
 * relatively thin wrappers like palloc() which sit on top of it, because
 * no allocator built into the operating system will deal with relative
 * pointers.  In the future, we may find other cases in which greater
 * control over our own memory management seems desirable.
 *
 * A FreePageManager keeps track of which 4kB pages of memory are currently
 * unused from the point of view of some higher-level memory allocator.
 * Unlike a user-facing allocator such as palloc(), a FreePageManager can
 * only allocate and free in units of whole pages, and freeing an
 * allocation can only be done given knowledge of its length in pages.
 *
 * Since a free page manager has only a fixed amount of dedicated memory,
 * and since there is no underlying allocator, it uses the free pages
 * under its management to store its own bookkeeping data.  It keeps
 * multiple freelists of runs of pages, sorted by the size of the run; the
 * head of each freelist is stored in the FreePageManager itself, and the
 * first page of each run contains a relative pointer to the next run.  See
 * FreePageManagerGetInternal for more details on how the freelists are
 * managed.
 *
 * To avoid memory fragmentation, it's important to consolidate adjacent
 * spans of pages whenever possible; otherwise, large allocation requests
 * might not be satisfied even when sufficient contiguous space is
 * available.  Therefore, in addition to the freelists, we maintain an
 * in-memory btree of free page ranges ordered by page number.  If a
 * range being freed precedes or follows a range that is already free,
 * the existing range is extended; if it exactly bridges the gap between
 * free ranges, then the two existing ranges are consolidated with the
 * newly-freed range to form one great big range of free pages.
 *
 * When there is only one range of free pages, the btree is trivial and
 * is stored within the FreePageManager proper; otherwise, pages are
 * allocated from the area under management as needed.  Even in cases
 * where memory fragmentation is very severe, only a tiny fraction of
 * the pages under management are consumed by this btree.
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/utils/mmgr/freepage.c
 *
 *-------------------------------------------------------------------------
 */
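
/*
 * Illustrative sketch (not part of the original file): one plausible way a
 * caller might drive the public API over a private memory extent.  The
 * extent size is hypothetical and 'extent' is assumed to be
 * FPM_PAGE_SIZE-aligned; FreePageManagerInitialize, FreePageManagerPut, and
 * FreePageManagerGet are the real entry points declared in utils/freepage.h.
 */
#if 0
static void
fpm_usage_sketch(char *extent)	/* 16 pages, FPM_PAGE_SIZE-aligned */
{
	FreePageManager *fpm = (FreePageManager *) extent;
	Size		metadata_pages = fpm_size_to_pages(sizeof(FreePageManager));
	Size		first_page;

	/* Interpret all relative pointers against 'extent'. */
	FreePageManagerInitialize(fpm, extent);

	/* Donate everything beyond the manager's own metadata. */
	FreePageManagerPut(fpm, metadata_pages, 16 - metadata_pages);

	/* Allocate a 2-page run; on success, first_page identifies it. */
	if (FreePageManagerGet(fpm, 2, &first_page))
	{
		/* ... use the pages; freeing requires remembering the length ... */
		FreePageManagerPut(fpm, first_page, 2);
	}
}
#endif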

#include "postgres.h"
#include "lib/stringinfo.h"
#include "miscadmin.h"

#include "utils/freepage.h"
#include "utils/relptr.h"


/* Magic numbers to identify various page types */
#define FREE_PAGE_SPAN_LEADER_MAGIC		0xea4020f0
#define FREE_PAGE_LEAF_MAGIC			0x98eae728
#define FREE_PAGE_INTERNAL_MAGIC		0x19aa32c9

/* Doubly linked list of spans of free pages; stored in first page of span. */
struct FreePageSpanLeader
{
	int			magic;			/* always FREE_PAGE_SPAN_LEADER_MAGIC */
	Size		npages;			/* number of pages in span */
	RelptrFreePageSpanLeader prev;
	RelptrFreePageSpanLeader next;
};
/* Common header for btree leaf and internal pages. */
typedef struct FreePageBtreeHeader
{
	int			magic;			/* FREE_PAGE_LEAF_MAGIC or
								 * FREE_PAGE_INTERNAL_MAGIC */
	Size		nused;			/* number of items used */
	RelptrFreePageBtree parent; /* uplink */
} FreePageBtreeHeader;

/* Internal key; points to next level of btree. */
typedef struct FreePageBtreeInternalKey
{
	Size		first_page;		/* low bound for keys on child page */
	RelptrFreePageBtree child;	/* downlink */
} FreePageBtreeInternalKey;

/* Leaf key; no payload data. */
typedef struct FreePageBtreeLeafKey
{
	Size		first_page;		/* first page in span */
	Size		npages;			/* number of pages in span */
} FreePageBtreeLeafKey;

/* Work out how many keys will fit on a page. */
#define FPM_ITEMS_PER_INTERNAL_PAGE \
	((FPM_PAGE_SIZE - sizeof(FreePageBtreeHeader)) / \
	 sizeof(FreePageBtreeInternalKey))
#define FPM_ITEMS_PER_LEAF_PAGE \
	((FPM_PAGE_SIZE - sizeof(FreePageBtreeHeader)) / \
	 sizeof(FreePageBtreeLeafKey))
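
/*
 * Worked example (illustrative, not part of the original file, assuming a
 * 64-bit build): with FPM_PAGE_SIZE = 4096, a header of roughly 24 bytes
 * (int plus padding, Size, relptr), and 16-byte keys, each macro yields
 * floor((4096 - 24) / 16) = 254 items per page.  The exact figures depend
 * on sizeof(Size) and struct padding on the target platform.
 */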

/* A btree page of either sort */
struct FreePageBtree
{
	FreePageBtreeHeader hdr;
	union
	{
		FreePageBtreeInternalKey internal_key[FPM_ITEMS_PER_INTERNAL_PAGE];
		FreePageBtreeLeafKey leaf_key[FPM_ITEMS_PER_LEAF_PAGE];
	}			u;
};

/* Results of a btree search */
typedef struct FreePageBtreeSearchResult
{
	FreePageBtree *page;
	Size		index;
	bool		found;
	unsigned	split_pages;
} FreePageBtreeSearchResult;

/* Helper functions */
static void FreePageBtreeAdjustAncestorKeys(FreePageManager *fpm,
											FreePageBtree *btp);
static Size FreePageBtreeCleanup(FreePageManager *fpm);
static FreePageBtree *FreePageBtreeFindLeftSibling(char *base,
												   FreePageBtree *btp);
static FreePageBtree *FreePageBtreeFindRightSibling(char *base,
													FreePageBtree *btp);
static Size FreePageBtreeFirstKey(FreePageBtree *btp);
static FreePageBtree *FreePageBtreeGetRecycled(FreePageManager *fpm);
static void FreePageBtreeInsertInternal(char *base, FreePageBtree *btp,
										Size index, Size first_page, FreePageBtree *child);
static void FreePageBtreeInsertLeaf(FreePageBtree *btp, Size index,
									Size first_page, Size npages);
static void FreePageBtreeRecycle(FreePageManager *fpm, Size pageno);
static void FreePageBtreeRemove(FreePageManager *fpm, FreePageBtree *btp,
								Size index);
static void FreePageBtreeRemovePage(FreePageManager *fpm, FreePageBtree *btp);
static void FreePageBtreeSearch(FreePageManager *fpm, Size first_page,
								FreePageBtreeSearchResult *result);
static Size FreePageBtreeSearchInternal(FreePageBtree *btp, Size first_page);
static Size FreePageBtreeSearchLeaf(FreePageBtree *btp, Size first_page);
static FreePageBtree *FreePageBtreeSplitPage(FreePageManager *fpm,
											 FreePageBtree *btp);
static void FreePageBtreeUpdateParentPointers(char *base, FreePageBtree *btp);
static void FreePageManagerDumpBtree(FreePageManager *fpm, FreePageBtree *btp,
									 FreePageBtree *parent, int level, StringInfo buf);
static void FreePageManagerDumpSpans(FreePageManager *fpm,
									 FreePageSpanLeader *span, Size expected_pages,
									 StringInfo buf);
static bool FreePageManagerGetInternal(FreePageManager *fpm, Size npages,
									   Size *first_page);
static Size FreePageManagerPutInternal(FreePageManager *fpm, Size first_page,
									   Size npages, bool soft);
static void FreePagePopSpanLeader(FreePageManager *fpm, Size pageno);
static void FreePagePushSpanLeader(FreePageManager *fpm, Size first_page,
								   Size npages);
static Size FreePageManagerLargestContiguous(FreePageManager *fpm);
static void FreePageManagerUpdateLargest(FreePageManager *fpm);

#ifdef FPM_EXTRA_ASSERTS
static Size sum_free_pages(FreePageManager *fpm);
#endif

/*
 * Initialize a new, empty free page manager.
 *
 * 'fpm' should reference caller-provided memory large enough to contain a
 * FreePageManager.  We'll initialize it here.
 *
 * 'base' is the address to which all pointers are relative.  When managing
 * a dynamic shared memory segment, it should normally be the base of the
 * segment.  When managing backend-private memory, it can be either NULL or,
 * if managing a single contiguous extent of memory, the start of that extent.
 */
void
FreePageManagerInitialize(FreePageManager *fpm, char *base)
{
	Size		f;

	relptr_store(base, fpm->self, fpm);
	relptr_store(base, fpm->btree_root, (FreePageBtree *) NULL);
	relptr_store(base, fpm->btree_recycle, (FreePageSpanLeader *) NULL);
	fpm->btree_depth = 0;
	fpm->btree_recycle_count = 0;
	fpm->singleton_first_page = 0;
	fpm->singleton_npages = 0;
	fpm->contiguous_pages = 0;
	fpm->contiguous_pages_dirty = true;
#ifdef FPM_EXTRA_ASSERTS
	fpm->free_pages = 0;
#endif

	for (f = 0; f < FPM_NUM_FREELISTS; f++)
		relptr_store(base, fpm->freelist[f], (FreePageSpanLeader *) NULL);
}
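
/*
 * Illustrative sketch (not part of the original file): why relative
 * pointers matter here.  A relptr stores an offset from 'base' rather than
 * a raw address, so two processes that map the same segment at different
 * addresses can follow the same links.  The function and variable names
 * below are hypothetical; relptr_store and relptr_access are the real
 * macros from utils/relptr.h.
 */
#if 0
static void
relptr_sketch(char *base)		/* this process's mapping address */
{
	RelptrFreePageSpanLeader rp;
	FreePageSpanLeader *leader;

	leader = (FreePageSpanLeader *) (base + FPM_PAGE_SIZE);
	relptr_store(base, rp, leader); /* records an offset, not an address */
	Assert(relptr_access(base, rp) == leader);	/* holds under any mapping */
}
#endif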

/*
 * Allocate a run of pages of the given length from the free page manager.
 * The return value indicates whether we were able to satisfy the request;
 * if true, the first page of the allocation is stored in *first_page.
 */
bool
FreePageManagerGet(FreePageManager *fpm, Size npages, Size *first_page)
{
	bool		result;
	Size		contiguous_pages;

	result = FreePageManagerGetInternal(fpm, npages, first_page);

	/*
	 * It's a bit counterintuitive, but allocating pages can actually create
	 * opportunities for cleanup that create larger ranges.  We might pull a
	 * key out of the btree that enables the item at the head of the btree
	 * recycle list to be inserted; and then if there are more items behind
	 * it one of those might cause two currently-separated ranges to merge,
	 * creating a single range of contiguous pages larger than any that
	 * existed previously.  It might be worth trying to improve the cleanup
	 * algorithm to avoid such corner cases, but for now we just notice the
	 * condition and do the appropriate reporting.
	 */
	contiguous_pages = FreePageBtreeCleanup(fpm);
	if (fpm->contiguous_pages < contiguous_pages)
		fpm->contiguous_pages = contiguous_pages;

	/*
	 * FreePageManagerGetInternal may have set contiguous_pages_dirty.
	 * Recompute contiguous_pages if so.
	 */
	FreePageManagerUpdateLargest(fpm);

#ifdef FPM_EXTRA_ASSERTS
	if (result)
	{
		Assert(fpm->free_pages >= npages);
		fpm->free_pages -= npages;
	}
	Assert(fpm->free_pages == sum_free_pages(fpm));
	Assert(fpm->contiguous_pages == FreePageManagerLargestContiguous(fpm));
#endif
	return result;
}
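
/*
 * Illustrative sketch (not part of the original file): FreePageManagerGet
 * hands back a page number, not an address, so a caller typically converts
 * it with fpm_page_to_pointer (from utils/freepage.h) before touching the
 * memory.  The function name below is hypothetical.
 */
#if 0
static void
get_and_use_sketch(FreePageManager *fpm, char *base, Size npages)
{
	Size		first_page;

	if (FreePageManagerGet(fpm, npages, &first_page))
	{
		char	   *p = fpm_page_to_pointer(base, first_page);

		memset(p, 0, npages * FPM_PAGE_SIZE);	/* the run is now ours */
	}
}
#endif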

#ifdef FPM_EXTRA_ASSERTS
static void
sum_free_pages_recurse(FreePageManager *fpm, FreePageBtree *btp, Size *sum)
{
	char	   *base = fpm_segment_base(fpm);

	Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC ||
		   btp->hdr.magic == FREE_PAGE_LEAF_MAGIC);
	++*sum;
	if (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC)
	{
		Size		index;

		for (index = 0; index < btp->hdr.nused; ++index)
		{
			FreePageBtree *child;

			child = relptr_access(base, btp->u.internal_key[index].child);
			sum_free_pages_recurse(fpm, child, sum);
		}
	}
}

static Size
sum_free_pages(FreePageManager *fpm)
{
	FreePageSpanLeader *recycle;
	char	   *base = fpm_segment_base(fpm);
	Size		sum = 0;
	int			list;

	/* Count the spans by scanning the freelists. */
	for (list = 0; list < FPM_NUM_FREELISTS; ++list)
	{
		if (!relptr_is_null(fpm->freelist[list]))
		{
			FreePageSpanLeader *candidate =
			relptr_access(base, fpm->freelist[list]);

			do
			{
				sum += candidate->npages;
				candidate = relptr_access(base, candidate->next);
			} while (candidate != NULL);
		}
	}

	/* Count btree internal pages. */
	if (fpm->btree_depth > 0)
	{
		FreePageBtree *root = relptr_access(base, fpm->btree_root);

		sum_free_pages_recurse(fpm, root, &sum);
	}

	/* Count the recycle list. */
	for (recycle = relptr_access(base, fpm->btree_recycle);
		 recycle != NULL;
		 recycle = relptr_access(base, recycle->next))
	{
		Assert(recycle->npages == 1);
		++sum;
	}

	return sum;
}
#endif

/*
 * Compute the size of the largest run of pages that the user could
 * successfully get.
 */
static Size
FreePageManagerLargestContiguous(FreePageManager *fpm)
{
	char	   *base;
	Size		largest;

	base = fpm_segment_base(fpm);
	largest = 0;
	if (!relptr_is_null(fpm->freelist[FPM_NUM_FREELISTS - 1]))
	{
		FreePageSpanLeader *candidate;

		candidate = relptr_access(base, fpm->freelist[FPM_NUM_FREELISTS - 1]);
		do
		{
			if (candidate->npages > largest)
				largest = candidate->npages;
			candidate = relptr_access(base, candidate->next);
		} while (candidate != NULL);
	}
	else
	{
		Size		f = FPM_NUM_FREELISTS - 1;

		do
		{
			--f;
			if (!relptr_is_null(fpm->freelist[f]))
			{
				largest = f + 1;
				break;
			}
		} while (f > 0);
	}

	return largest;
}
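
/*
 * Illustrative note (not part of the original file): freelist[f] holds
 * spans of exactly f + 1 pages, except that the final list holds every
 * span too large for the others.  For example, a 3-page span lives on
 * freelist[2], while a span bigger than FPM_NUM_FREELISTS pages lives on
 * the last list.  That is why a non-empty list at index f above implies a
 * largest run of exactly f + 1 pages.
 */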

/*
 * Recompute the size of the largest run of pages that the user could
 * successfully get, if it has been marked dirty.
 */
static void
FreePageManagerUpdateLargest(FreePageManager *fpm)
{
	if (!fpm->contiguous_pages_dirty)
		return;

	fpm->contiguous_pages = FreePageManagerLargestContiguous(fpm);
	fpm->contiguous_pages_dirty = false;
}

/*
 * Transfer a run of pages to the free page manager.
 */
void
FreePageManagerPut(FreePageManager *fpm, Size first_page, Size npages)
{
	Size		contiguous_pages;

	Assert(npages > 0);

	/* Record the new pages. */
	contiguous_pages =
		FreePageManagerPutInternal(fpm, first_page, npages, false);

	/*
	 * If the new range we inserted into the page manager was contiguous with
	 * an existing range, it may have opened up cleanup opportunities.
	 */
	if (contiguous_pages > npages)
	{
		Size		cleanup_contiguous_pages;

		cleanup_contiguous_pages = FreePageBtreeCleanup(fpm);
		if (cleanup_contiguous_pages > contiguous_pages)
			contiguous_pages = cleanup_contiguous_pages;
	}

	/* See if we now have a new largest chunk. */
	if (fpm->contiguous_pages < contiguous_pages)
		fpm->contiguous_pages = contiguous_pages;

	/*
	 * The earlier call to FreePageManagerPutInternal may have set
	 * contiguous_pages_dirty if it needed to allocate internal pages, so
	 * recompute contiguous_pages if necessary.
	 */
	FreePageManagerUpdateLargest(fpm);

#ifdef FPM_EXTRA_ASSERTS
	fpm->free_pages += npages;
	Assert(fpm->free_pages == sum_free_pages(fpm));
	Assert(fpm->contiguous_pages == FreePageManagerLargestContiguous(fpm));
#endif
}
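
/*
 * Illustrative sketch (not part of the original file): since the manager
 * tracks only free space, callers must remember each allocation's length
 * themselves, as noted in the header comment.  The struct and function
 * below are hypothetical caller-side code.
 */
#if 0
typedef struct MyAllocation
{
	Size		first_page;
	Size		npages;			/* remembered so the run can be freed later */
} MyAllocation;

static void
my_free(FreePageManager *fpm, MyAllocation *a)
{
	/* Must return exactly the run we originally obtained. */
	FreePageManagerPut(fpm, a->first_page, a->npages);
}
#endif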

/*
 * Produce a debugging dump of the state of a free page manager.
 */
char *
FreePageManagerDump(FreePageManager *fpm)
{
	char	   *base = fpm_segment_base(fpm);
	StringInfoData buf;
	FreePageSpanLeader *recycle;
	bool		dumped_any_freelist = false;
	Size		f;

	/* Initialize output buffer. */
	initStringInfo(&buf);

	/* Dump general stuff. */
	appendStringInfo(&buf, "metadata: self %zu max contiguous pages = %zu\n",
					 fpm->self.relptr_off, fpm->contiguous_pages);

	/* Dump btree. */
	if (fpm->btree_depth > 0)
	{
		FreePageBtree *root;

		appendStringInfo(&buf, "btree depth %u:\n", fpm->btree_depth);
		root = relptr_access(base, fpm->btree_root);
		FreePageManagerDumpBtree(fpm, root, NULL, 0, &buf);
	}
	else if (fpm->singleton_npages > 0)
	{
		appendStringInfo(&buf, "singleton: %zu(%zu)\n",
						 fpm->singleton_first_page, fpm->singleton_npages);
	}

	/* Dump btree recycle list. */
	recycle = relptr_access(base, fpm->btree_recycle);
	if (recycle != NULL)
	{
		appendStringInfoString(&buf, "btree recycle:");
		FreePageManagerDumpSpans(fpm, recycle, 1, &buf);
	}

	/* Dump free lists. */
	for (f = 0; f < FPM_NUM_FREELISTS; ++f)
	{
		FreePageSpanLeader *span;

		if (relptr_is_null(fpm->freelist[f]))
			continue;
		if (!dumped_any_freelist)
		{
			appendStringInfoString(&buf, "freelists:\n");
			dumped_any_freelist = true;
		}
		appendStringInfo(&buf, " %zu:", f + 1);
		span = relptr_access(base, fpm->freelist[f]);
		FreePageManagerDumpSpans(fpm, span, f + 1, &buf);
	}

	/* And return result to caller. */
	return buf.data;
}
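
/*
 * Illustrative sketch (not part of the original file): the dump is returned
 * as a palloc'd string in the caller's memory context, so a typical use is
 * to log it and then release it.  The function name is hypothetical.
 */
#if 0
static void
dump_sketch(FreePageManager *fpm)
{
	char	   *dump = FreePageManagerDump(fpm);

	elog(DEBUG1, "free page manager state:\n%s", dump);
	pfree(dump);
}
#endif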

/*
 * The first_page value stored at index zero in any non-root page must match
 * the first_page value stored in its parent at the index which points to that
 * page.  So when the value stored at index zero in a btree page changes, we've
 * got to walk up the tree adjusting ancestor keys until we reach an ancestor
 * where that key isn't index zero.  This function should be called after
 * updating the first key on the target page; it will propagate the change
 * upward as far as needed.
 *
 * We assume here that the first key on the page has not changed enough to
 * require changes in the ordering of keys on its ancestor pages.  Thus,
 * if we search the parent page for the first key greater than or equal to
 * the first key on the current page, the downlink to this page will be either
 * the exact index returned by the search (if the first key decreased)
 * or one less (if the first key increased).
 */
static void
FreePageBtreeAdjustAncestorKeys(FreePageManager *fpm, FreePageBtree *btp)
{
	char	   *base = fpm_segment_base(fpm);
	Size		first_page;
	FreePageBtree *parent;
	FreePageBtree *child;

	/* This might be either a leaf or an internal page. */
	Assert(btp->hdr.nused > 0);
	if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
	{
		Assert(btp->hdr.nused <= FPM_ITEMS_PER_LEAF_PAGE);
		first_page = btp->u.leaf_key[0].first_page;
	}
	else
	{
		Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
		Assert(btp->hdr.nused <= FPM_ITEMS_PER_INTERNAL_PAGE);
		first_page = btp->u.internal_key[0].first_page;
	}
	child = btp;

	/* Loop until we find an ancestor that does not require adjustment. */
	for (;;)
	{
		Size		s;

		parent = relptr_access(base, child->hdr.parent);
		if (parent == NULL)
			break;
		s = FreePageBtreeSearchInternal(parent, first_page);

		/* Key is either at index s or index s-1; figure out which. */
		if (s >= parent->hdr.nused)
		{
			Assert(s == parent->hdr.nused);
			--s;
		}
		else
		{
			FreePageBtree *check;

			check = relptr_access(base, parent->u.internal_key[s].child);
			if (check != child)
			{
				Assert(s > 0);
				--s;
			}
		}

#ifdef USE_ASSERT_CHECKING
		/* Debugging double-check. */
		{
			FreePageBtree *check;

			check = relptr_access(base, parent->u.internal_key[s].child);
			Assert(s < parent->hdr.nused);
			Assert(child == check);
		}
#endif

		/* Update the parent key. */
		parent->u.internal_key[s].first_page = first_page;

		/*
		 * If this is the first key in the parent, go up another level; else
		 * done.
		 */
		if (s > 0)
			break;
		child = parent;
	}
}
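
/*
 * Worked example (illustrative, not part of the original file): suppose a
 * leaf B's keys change from {40, 50} to {35, 50}, and its parent's downlinks
 * read {10 -> A, 40 -> B}.  Searching the parent for the first key >= 35
 * finds index 1, whose child is B, so the parent entry becomes
 * {10 -> A, 35 -> B}.  Had B instead been at index 0 of its parent, the same
 * correction would continue one level further up.
 */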

/*
 * Attempt to reclaim space from the free-page btree.  The return value is
 * the largest range of contiguous pages created by the cleanup operation.
 */
static Size
FreePageBtreeCleanup(FreePageManager *fpm)
{
	char	   *base = fpm_segment_base(fpm);
	Size		max_contiguous_pages = 0;

	/* Attempt to shrink the depth of the btree. */
	while (!relptr_is_null(fpm->btree_root))
	{
		FreePageBtree *root = relptr_access(base, fpm->btree_root);

		/* If the root contains only one key, reduce depth by one. */
		if (root->hdr.nused == 1)
		{
			/* Shrink depth of tree by one. */
			Assert(fpm->btree_depth > 0);
			--fpm->btree_depth;
			if (root->hdr.magic == FREE_PAGE_LEAF_MAGIC)
			{
				/* If root is a leaf, convert only entry to singleton range. */
				relptr_store(base, fpm->btree_root, (FreePageBtree *) NULL);
				fpm->singleton_first_page = root->u.leaf_key[0].first_page;
				fpm->singleton_npages = root->u.leaf_key[0].npages;
			}
			else
			{
				FreePageBtree *newroot;

				/* If root is an internal page, make only child the root. */
				Assert(root->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
				relptr_copy(fpm->btree_root, root->u.internal_key[0].child);
				newroot = relptr_access(base, fpm->btree_root);
				relptr_store(base, newroot->hdr.parent, (FreePageBtree *) NULL);
			}
			FreePageBtreeRecycle(fpm, fpm_pointer_to_page(base, root));
		}
		else if (root->hdr.nused == 2 &&
				 root->hdr.magic == FREE_PAGE_LEAF_MAGIC)
		{
			Size		end_of_first;
			Size		start_of_second;

			end_of_first = root->u.leaf_key[0].first_page +
				root->u.leaf_key[0].npages;
			start_of_second = root->u.leaf_key[1].first_page;

			if (end_of_first + 1 == start_of_second)
			{
				Size		root_page = fpm_pointer_to_page(base, root);

				if (end_of_first == root_page)
				{
					FreePagePopSpanLeader(fpm, root->u.leaf_key[0].first_page);
					FreePagePopSpanLeader(fpm, root->u.leaf_key[1].first_page);
					fpm->singleton_first_page = root->u.leaf_key[0].first_page;
					fpm->singleton_npages = root->u.leaf_key[0].npages +
						root->u.leaf_key[1].npages + 1;
					fpm->btree_depth = 0;
					relptr_store(base, fpm->btree_root,
								 (FreePageBtree *) NULL);
					FreePagePushSpanLeader(fpm, fpm->singleton_first_page,
										   fpm->singleton_npages);
					Assert(max_contiguous_pages == 0);
					max_contiguous_pages = fpm->singleton_npages;
				}
			}

			/* Whether it worked or not, it's time to stop. */
			break;
		}
		else
		{
			/* Nothing more to do.  Stop. */
			break;
		}
	}

	/*
	 * Attempt to free recycled btree pages.  We skip this if releasing the
	 * recycled page would require a btree page split, because the page we're
	 * trying to recycle would be consumed by the split, which would be
	 * counterproductive.
	 *
	 * We also currently only ever attempt to recycle the first page on the
	 * list; that could be made more aggressive, but it's not clear that the
	 * complexity would be worthwhile.
	 */
	while (fpm->btree_recycle_count > 0)
	{
		FreePageBtree *btp;
		Size		first_page;
		Size		contiguous_pages;

		btp = FreePageBtreeGetRecycled(fpm);
		first_page = fpm_pointer_to_page(base, btp);
		contiguous_pages = FreePageManagerPutInternal(fpm, first_page, 1, true);
		if (contiguous_pages == 0)
		{
			FreePageBtreeRecycle(fpm, first_page);
			break;
		}
		else
		{
			if (contiguous_pages > max_contiguous_pages)
				max_contiguous_pages = contiguous_pages;
		}
	}

	return max_contiguous_pages;
}

/*
 * Consider consolidating the given page with its left or right sibling,
 * if it's fairly empty.
 */
static void
FreePageBtreeConsolidate(FreePageManager *fpm, FreePageBtree *btp)
{
	char	   *base = fpm_segment_base(fpm);
	FreePageBtree *np;
	Size		max;

	/*
	 * We only try to consolidate pages that are less than a third full.  We
	 * could be more aggressive about this, but that might risk performing
	 * consolidation only to end up splitting again shortly thereafter.  Since
	 * the btree should be very small compared to the space under management,
	 * our goal isn't so much to ensure that it always occupies the absolutely
	 * smallest possible number of pages as to reclaim pages before things get
	 * too egregiously out of hand.
	 */
	if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
		max = FPM_ITEMS_PER_LEAF_PAGE;
	else
	{
		Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
		max = FPM_ITEMS_PER_INTERNAL_PAGE;
	}
	if (btp->hdr.nused >= max / 3)
		return;

	/*
	 * If we can fit our right sibling's keys onto this page, consolidate.
	 */
	np = FreePageBtreeFindRightSibling(base, btp);
	if (np != NULL && btp->hdr.nused + np->hdr.nused <= max)
	{
		if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
		{
			memcpy(&btp->u.leaf_key[btp->hdr.nused], &np->u.leaf_key[0],
				   sizeof(FreePageBtreeLeafKey) * np->hdr.nused);
			btp->hdr.nused += np->hdr.nused;
		}
		else
		{
			memcpy(&btp->u.internal_key[btp->hdr.nused], &np->u.internal_key[0],
				   sizeof(FreePageBtreeInternalKey) * np->hdr.nused);
			btp->hdr.nused += np->hdr.nused;
			FreePageBtreeUpdateParentPointers(base, btp);
		}
		FreePageBtreeRemovePage(fpm, np);
		return;
	}

	/*
	 * If we can fit our keys onto our left sibling's page, consolidate.  In
	 * this case, we move our keys onto the other page rather than vice
	 * versa, to avoid having to adjust ancestor keys.
	 */
	np = FreePageBtreeFindLeftSibling(base, btp);
	if (np != NULL && btp->hdr.nused + np->hdr.nused <= max)
	{
		if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
		{
			memcpy(&np->u.leaf_key[np->hdr.nused], &btp->u.leaf_key[0],
				   sizeof(FreePageBtreeLeafKey) * btp->hdr.nused);
			np->hdr.nused += btp->hdr.nused;
		}
		else
		{
			memcpy(&np->u.internal_key[np->hdr.nused], &btp->u.internal_key[0],
				   sizeof(FreePageBtreeInternalKey) * btp->hdr.nused);
			np->hdr.nused += btp->hdr.nused;
			FreePageBtreeUpdateParentPointers(base, np);
		}
		FreePageBtreeRemovePage(fpm, btp);
		return;
	}
}

/*
 * Find the passed page's left sibling; that is, the page at the same level
 * of the tree whose keyspace immediately precedes ours.
 */
static FreePageBtree *
FreePageBtreeFindLeftSibling(char *base, FreePageBtree *btp)
{
	FreePageBtree *p = btp;
	int			levels = 0;

	/* Move up until we can move left. */
	for (;;)
	{
		Size		first_page;
		Size		index;

		first_page = FreePageBtreeFirstKey(p);
		p = relptr_access(base, p->hdr.parent);

		if (p == NULL)
			return NULL;		/* we were passed the leftmost page */

		index = FreePageBtreeSearchInternal(p, first_page);
		if (index > 0)
		{
			Assert(p->u.internal_key[index].first_page == first_page);
			p = relptr_access(base, p->u.internal_key[index - 1].child);
			break;
		}
		Assert(index == 0);
		++levels;
	}

	/* Descend right. */
	while (levels > 0)
	{
		Assert(p->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
		p = relptr_access(base, p->u.internal_key[p->hdr.nused - 1].child);
		--levels;
	}
	Assert(p->hdr.magic == btp->hdr.magic);

	return p;
}

/*
 * Find the passed page's right sibling; that is, the page at the same level
 * of the tree whose keyspace immediately follows ours.
 */
static FreePageBtree *
FreePageBtreeFindRightSibling(char *base, FreePageBtree *btp)
{
	FreePageBtree *p = btp;
	int			levels = 0;

	/* Move up until we can move right. */
	for (;;)
	{
		Size		first_page;
		Size		index;

		first_page = FreePageBtreeFirstKey(p);
		p = relptr_access(base, p->hdr.parent);

		if (p == NULL)
			return NULL;		/* we were passed the rightmost page */

		index = FreePageBtreeSearchInternal(p, first_page);
		if (index < p->hdr.nused - 1)
		{
			Assert(p->u.internal_key[index].first_page == first_page);
			p = relptr_access(base, p->u.internal_key[index + 1].child);
			break;
		}
		Assert(index == p->hdr.nused - 1);
		++levels;
	}

	/* Descend left. */
	while (levels > 0)
	{
		Assert(p->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
		p = relptr_access(base, p->u.internal_key[0].child);
		--levels;
	}
	Assert(p->hdr.magic == btp->hdr.magic);

	return p;
}

/*
 * Get the first key on a btree page.
 */
static Size
FreePageBtreeFirstKey(FreePageBtree *btp)
{
	Assert(btp->hdr.nused > 0);

	if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
		return btp->u.leaf_key[0].first_page;
	else
	{
		Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
		return btp->u.internal_key[0].first_page;
	}
}

/*
 * Get a page from the btree recycle list for use as a btree page.
 */
static FreePageBtree *
FreePageBtreeGetRecycled(FreePageManager *fpm)
{
	char	   *base = fpm_segment_base(fpm);
	FreePageSpanLeader *victim = relptr_access(base, fpm->btree_recycle);
	FreePageSpanLeader *newhead;

	Assert(victim != NULL);
	newhead = relptr_access(base, victim->next);
	if (newhead != NULL)
		relptr_copy(newhead->prev, victim->prev);
	relptr_store(base, fpm->btree_recycle, newhead);
	Assert(fpm_pointer_is_page_aligned(base, victim));
	fpm->btree_recycle_count--;
	return (FreePageBtree *) victim;
}

/*
 * Insert an item into an internal page.
 */
static void
FreePageBtreeInsertInternal(char *base, FreePageBtree *btp, Size index,
							Size first_page, FreePageBtree *child)
{
	Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
	Assert(btp->hdr.nused <= FPM_ITEMS_PER_INTERNAL_PAGE);
	Assert(index <= btp->hdr.nused);
	memmove(&btp->u.internal_key[index + 1], &btp->u.internal_key[index],
			sizeof(FreePageBtreeInternalKey) * (btp->hdr.nused - index));
	btp->u.internal_key[index].first_page = first_page;
	relptr_store(base, btp->u.internal_key[index].child, child);
	++btp->hdr.nused;
}

/*
 * Insert an item into a leaf page.
 */
static void
FreePageBtreeInsertLeaf(FreePageBtree *btp, Size index, Size first_page,
						Size npages)
{
	Assert(btp->hdr.magic == FREE_PAGE_LEAF_MAGIC);
	Assert(btp->hdr.nused <= FPM_ITEMS_PER_LEAF_PAGE);
	Assert(index <= btp->hdr.nused);
	memmove(&btp->u.leaf_key[index + 1], &btp->u.leaf_key[index],
			sizeof(FreePageBtreeLeafKey) * (btp->hdr.nused - index));
	btp->u.leaf_key[index].first_page = first_page;
	btp->u.leaf_key[index].npages = npages;
	++btp->hdr.nused;
}
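
/*
 * Worked example (illustrative, not part of the original file): inserting
 * first_page = 20 at index 1 into a leaf whose keys are {10, 30} first
 * memmoves the single key at index 1 up to index 2, giving {10, _, 30},
 * then writes the new key into the hole, giving {10, 20, 30} with nused = 3.
 */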

/*
 * Put a page on the btree recycle list.
 */
static void
FreePageBtreeRecycle(FreePageManager *fpm, Size pageno)
{
	char	   *base = fpm_segment_base(fpm);
	FreePageSpanLeader *head = relptr_access(base, fpm->btree_recycle);
	FreePageSpanLeader *span;

	span = (FreePageSpanLeader *) fpm_page_to_pointer(base, pageno);
	span->magic = FREE_PAGE_SPAN_LEADER_MAGIC;
	span->npages = 1;
	relptr_store(base, span->next, head);
	relptr_store(base, span->prev, (FreePageSpanLeader *) NULL);
	if (head != NULL)
		relptr_store(base, head->prev, span);
	relptr_store(base, fpm->btree_recycle, span);
	fpm->btree_recycle_count++;
}

/*
 * Remove an item from the btree at the given position on the given page.
 */
static void
FreePageBtreeRemove(FreePageManager *fpm, FreePageBtree *btp, Size index)
{
	Assert(btp->hdr.magic == FREE_PAGE_LEAF_MAGIC);
	Assert(index < btp->hdr.nused);

	/* When last item is removed, extirpate entire page from btree. */
	if (btp->hdr.nused == 1)
	{
		FreePageBtreeRemovePage(fpm, btp);
		return;
	}

	/* Physically remove the key from the page. */
	--btp->hdr.nused;
	if (index < btp->hdr.nused)
		memmove(&btp->u.leaf_key[index], &btp->u.leaf_key[index + 1],
				sizeof(FreePageBtreeLeafKey) * (btp->hdr.nused - index));

	/* If we just removed the first key, adjust ancestor keys. */
	if (index == 0)
		FreePageBtreeAdjustAncestorKeys(fpm, btp);

	/* Consider whether to consolidate this page with a sibling. */
	FreePageBtreeConsolidate(fpm, btp);
}

/*
 * Remove a page from the btree.  Caller is responsible for having relocated
 * any keys from this page that are still wanted.  The page is placed on the
 * recycled list.
 */
static void
FreePageBtreeRemovePage(FreePageManager *fpm, FreePageBtree *btp)
{
	char	   *base = fpm_segment_base(fpm);
	FreePageBtree *parent;
	Size		index;
	Size		first_page;

	for (;;)
	{
		/* Find parent page. */
		parent = relptr_access(base, btp->hdr.parent);
		if (parent == NULL)
		{
			/* We are removing the root page. */
			relptr_store(base, fpm->btree_root, (FreePageBtree *) NULL);
			fpm->btree_depth = 0;
			Assert(fpm->singleton_first_page == 0);
			Assert(fpm->singleton_npages == 0);
			return;
		}

		/*
		 * If the parent contains only one item, we need to remove it as well.
		 */
		if (parent->hdr.nused > 1)
			break;
		FreePageBtreeRecycle(fpm, fpm_pointer_to_page(base, btp));
		btp = parent;
	}

	/* Find and remove the downlink. */
	first_page = FreePageBtreeFirstKey(btp);
	if (parent->hdr.magic == FREE_PAGE_LEAF_MAGIC)
	{
		index = FreePageBtreeSearchLeaf(parent, first_page);
		Assert(index < parent->hdr.nused);
		if (index < parent->hdr.nused - 1)
			memmove(&parent->u.leaf_key[index],
					&parent->u.leaf_key[index + 1],
					sizeof(FreePageBtreeLeafKey)
					* (parent->hdr.nused - index - 1));
	}
	else
	{
		index = FreePageBtreeSearchInternal(parent, first_page);
		Assert(index < parent->hdr.nused);
		if (index < parent->hdr.nused - 1)
			memmove(&parent->u.internal_key[index],
					&parent->u.internal_key[index + 1],
					sizeof(FreePageBtreeInternalKey)
					* (parent->hdr.nused - index - 1));
	}
	parent->hdr.nused--;
	Assert(parent->hdr.nused > 0);

	/* Recycle the page. */
	FreePageBtreeRecycle(fpm, fpm_pointer_to_page(base, btp));

	/* Adjust ancestor keys if needed. */
	if (index == 0)
		FreePageBtreeAdjustAncestorKeys(fpm, parent);

	/* Consider whether to consolidate the parent with a sibling. */
	FreePageBtreeConsolidate(fpm, parent);
}

/*
 * Search the btree for an entry for the given first page and initialize
 * *result with the results of the search.  result->page and result->index
 * indicate either the position of an exact match or the position at which
 * the new key should be inserted.  result->found is true for an exact match,
 * otherwise false.  result->split_pages will contain the number of additional
 * btree pages that will be needed when performing a split to insert a key.
 * Except as described above, the contents of fields in the result object are
 * undefined on return.
 */
static void
FreePageBtreeSearch(FreePageManager *fpm, Size first_page,
					FreePageBtreeSearchResult *result)
{
	char	   *base = fpm_segment_base(fpm);
	FreePageBtree *btp = relptr_access(base, fpm->btree_root);
	Size		index;

	result->split_pages = 1;

	/* If the btree is empty, there's nothing to find. */
	if (btp == NULL)
	{
		result->page = NULL;
		result->found = false;
		return;
	}

	/* Descend until we hit a leaf. */
	while (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC)
	{
		FreePageBtree *child;
		bool		found_exact;

		index = FreePageBtreeSearchInternal(btp, first_page);
		found_exact = index < btp->hdr.nused &&
			btp->u.internal_key[index].first_page == first_page;

		/*
		 * If we found an exact match we descend directly.  Otherwise, we
		 * descend into the child to the left if possible so that we can find
		 * the insertion point at that child's high end.
		 */
		if (!found_exact && index > 0)
			--index;

		/* Track required split depth for leaf insert. */
		if (btp->hdr.nused >= FPM_ITEMS_PER_INTERNAL_PAGE)
		{
			Assert(btp->hdr.nused == FPM_ITEMS_PER_INTERNAL_PAGE);
			result->split_pages++;
		}
		else
			result->split_pages = 0;

		/* Descend to appropriate child page. */
		Assert(index < btp->hdr.nused);
		child = relptr_access(base, btp->u.internal_key[index].child);
		Assert(relptr_access(base, child->hdr.parent) == btp);
		btp = child;
	}

	/* Track required split depth for leaf insert. */
	if (btp->hdr.nused >= FPM_ITEMS_PER_LEAF_PAGE)
	{
		Assert(btp->hdr.nused == FPM_ITEMS_PER_LEAF_PAGE);
		result->split_pages++;
	}
	else
		result->split_pages = 0;

	/* Search leaf page. */
	index = FreePageBtreeSearchLeaf(btp, first_page);

	/* Assemble results. */
	result->page = btp;
	result->index = index;
	result->found = index < btp->hdr.nused &&
		first_page == btp->u.leaf_key[index].first_page;
}

/*
 * Search an internal page for the first key greater than or equal to a given
 * page number.  Returns the index of that key, or one greater than the number
 * of keys on the page if none.
 */
static Size
FreePageBtreeSearchInternal(FreePageBtree *btp, Size first_page)
{
	Size		low = 0;
	Size		high = btp->hdr.nused;

	Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
	Assert(high > 0 && high <= FPM_ITEMS_PER_INTERNAL_PAGE);

	while (low < high)
	{
		Size		mid = (low + high) / 2;
		Size		val = btp->u.internal_key[mid].first_page;

		if (first_page == val)
			return mid;
		else if (first_page < val)
			high = mid;
		else
			low = mid + 1;
	}

	return low;
}

/*
 * Search a leaf page for the first key greater than or equal to a given
 * page number.  Returns the index of that key, or one greater than the number
 * of keys on the page if none.
 */
static Size
FreePageBtreeSearchLeaf(FreePageBtree *btp, Size first_page)
{
	Size		low = 0;
	Size		high = btp->hdr.nused;

	Assert(btp->hdr.magic == FREE_PAGE_LEAF_MAGIC);
	Assert(high > 0 && high <= FPM_ITEMS_PER_LEAF_PAGE);

	while (low < high)
	{
		Size		mid = (low + high) / 2;
		Size		val = btp->u.leaf_key[mid].first_page;

		if (first_page == val)
			return mid;
		else if (first_page < val)
			high = mid;
		else
			low = mid + 1;
	}

	return low;
}
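
/*
 * Illustrative sketch (not part of the original file): both searches above
 * are the classic lower-bound binary search.  A self-contained version over
 * a plain array, with hypothetical names, behaves the same way: it returns
 * the index of the first element >= the target, or 'n' if there is none.
 */
#if 0
static Size
lower_bound_example(const Size *keys, Size n, Size target)
{
	Size		low = 0;
	Size		high = n;

	while (low < high)
	{
		Size		mid = (low + high) / 2;

		if (keys[mid] < target)
			low = mid + 1;		/* everything at or below mid is too small */
		else
			high = mid;			/* keys[mid] remains a candidate answer */
	}
	return low;					/* e.g. keys {10, 20, 30}, target 25 -> 2 */
}
#endif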

/*
 * Allocate a new btree page and move half the keys from the provided page
 * to the new page.  Caller is responsible for making sure that there's a
 * page available from fpm->btree_recycle.  Returns a pointer to the new page,
 * to which caller must add a downlink.
 */
static FreePageBtree *
FreePageBtreeSplitPage(FreePageManager *fpm, FreePageBtree *btp)
{
	FreePageBtree *newsibling;

	newsibling = FreePageBtreeGetRecycled(fpm);
	newsibling->hdr.magic = btp->hdr.magic;
	newsibling->hdr.nused = btp->hdr.nused / 2;
	relptr_copy(newsibling->hdr.parent, btp->hdr.parent);
	btp->hdr.nused -= newsibling->hdr.nused;

	if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
		memcpy(&newsibling->u.leaf_key,
			   &btp->u.leaf_key[btp->hdr.nused],
			   sizeof(FreePageBtreeLeafKey) * newsibling->hdr.nused);
	else
	{
		Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
		memcpy(&newsibling->u.internal_key,
			   &btp->u.internal_key[btp->hdr.nused],
			   sizeof(FreePageBtreeInternalKey) * newsibling->hdr.nused);
		FreePageBtreeUpdateParentPointers(fpm_segment_base(fpm), newsibling);
	}

	return newsibling;
}

/*
 * When internal pages are split or merged, the parent pointers of their
 * children must be updated.
 */
static void
FreePageBtreeUpdateParentPointers(char *base, FreePageBtree *btp)
{
	Size		i;

	Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
	for (i = 0; i < btp->hdr.nused; ++i)
	{
		FreePageBtree *child;

		child = relptr_access(base, btp->u.internal_key[i].child);
		relptr_store(base, child->hdr.parent, btp);
	}
}

/*
 * Debugging dump of btree data.
 */
static void
FreePageManagerDumpBtree(FreePageManager *fpm, FreePageBtree *btp,
						 FreePageBtree *parent, int level, StringInfo buf)
{
	char	   *base = fpm_segment_base(fpm);
	Size		pageno = fpm_pointer_to_page(base, btp);
	Size		index;
	FreePageBtree *check_parent;

	check_stack_depth();
	check_parent = relptr_access(base, btp->hdr.parent);
	appendStringInfo(buf, " %zu@%d %c", pageno, level,
					 btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC ? 'i' : 'l');
	if (parent != check_parent)
		appendStringInfo(buf, " [actual parent %zu, expected %zu]",
						 fpm_pointer_to_page(base, check_parent),
						 fpm_pointer_to_page(base, parent));
	appendStringInfoChar(buf, ':');
	for (index = 0; index < btp->hdr.nused; ++index)
	{
		if (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC)
			appendStringInfo(buf, " %zu->%zu",
							 btp->u.internal_key[index].first_page,
							 btp->u.internal_key[index].child.relptr_off / FPM_PAGE_SIZE);
		else
			appendStringInfo(buf, " %zu(%zu)",
							 btp->u.leaf_key[index].first_page,
							 btp->u.leaf_key[index].npages);
	}
	appendStringInfoChar(buf, '\n');

	if (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC)
	{
		for (index = 0; index < btp->hdr.nused; ++index)
		{
			FreePageBtree *child;

			child = relptr_access(base, btp->u.internal_key[index].child);
			FreePageManagerDumpBtree(fpm, child, btp, level + 1, buf);
		}
	}
}

/*
 * Debugging dump of free-span data.
 */
static void
FreePageManagerDumpSpans(FreePageManager *fpm, FreePageSpanLeader *span,
						 Size expected_pages, StringInfo buf)
{
	char	   *base = fpm_segment_base(fpm);

	while (span != NULL)
	{
		if (span->npages != expected_pages)
			appendStringInfo(buf, " %zu(%zu)", fpm_pointer_to_page(base, span),
							 span->npages);
		else
			appendStringInfo(buf, " %zu", fpm_pointer_to_page(base, span));
		span = relptr_access(base, span->next);
	}

	appendStringInfoChar(buf, '\n');
}

/*
 * This function allocates a run of pages of the given length from the free
 * page manager.
 */
static bool
FreePageManagerGetInternal(FreePageManager *fpm, Size npages, Size *first_page)
{
	char	   *base = fpm_segment_base(fpm);
	FreePageSpanLeader *victim = NULL;
	FreePageSpanLeader *prev;
	FreePageSpanLeader *next;
	FreePageBtreeSearchResult result;
	Size		victim_page = 0;	/* placate compiler */
	Size		f;

	/*
	 * Search for a free span.
	 *
	 * Right now, we use a simple best-fit policy here, but it's possible for
	 * this to result in memory fragmentation if we're repeatedly asked to
	 * allocate chunks just a little smaller than what we have available.
	 * Hopefully, this is unlikely, because we expect most requests to be
	 * single pages or superblock-sized chunks -- but no policy can be optimal
	 * under all circumstances unless it has knowledge of future allocation
	 * patterns.
	 */
	for (f = Min(npages, FPM_NUM_FREELISTS) - 1; f < FPM_NUM_FREELISTS; ++f)
	{
		/* Skip empty freelists. */
		if (relptr_is_null(fpm->freelist[f]))
			continue;

		/*
		 * All of the freelists except the last one contain only items of a
		 * single size, so we just take the first one.  But the final free
		 * list contains everything too big for any of the other lists, so we
		 * need to search the list.
		 */
		if (f < FPM_NUM_FREELISTS - 1)
			victim = relptr_access(base, fpm->freelist[f]);
		else
		{
			FreePageSpanLeader *candidate;

			candidate = relptr_access(base, fpm->freelist[f]);
			do
			{
				if (candidate->npages >= npages && (victim == NULL ||
													victim->npages > candidate->npages))
				{
					victim = candidate;
					if (victim->npages == npages)
						break;
				}
				candidate = relptr_access(base, candidate->next);
			} while (candidate != NULL);
		}
		break;
	}

	/* If we didn't find an allocatable span, return failure. */
	if (victim == NULL)
		return false;

	/* Remove span from free list. */
	Assert(victim->magic == FREE_PAGE_SPAN_LEADER_MAGIC);
	prev = relptr_access(base, victim->prev);
	next = relptr_access(base, victim->next);
	if (prev != NULL)
		relptr_copy(prev->next, victim->next);
	else
		relptr_copy(fpm->freelist[f], victim->next);
	if (next != NULL)
		relptr_copy(next->prev, victim->prev);
	victim_page = fpm_pointer_to_page(base, victim);

	/* Decide whether we might be invalidating contiguous_pages. */
	if (f == FPM_NUM_FREELISTS - 1 &&
		victim->npages == fpm->contiguous_pages)
	{
		/*
		 * The victim span came from the oversized freelist, and had the same
		 * size as the longest span.  There may or may not be another one of
		 * the same size, so contiguous_pages must be recomputed just to be
		 * safe.
		 */
		fpm->contiguous_pages_dirty = true;
	}
	else if (f + 1 == fpm->contiguous_pages &&
			 relptr_is_null(fpm->freelist[f]))
	{
		/*
		 * The victim span came from a fixed sized freelist, and it was the
		 * list for spans of the same size as the current longest span, and
		 * the list is now empty after removing the victim.  So
		 * contiguous_pages must be recomputed without a doubt.
		 */
		fpm->contiguous_pages_dirty = true;
	}

	/*
	 * If we haven't initialized the btree yet, the victim must be the single
	 * span stored within the FreePageManager itself.  Otherwise, we need to
	 * update the btree.
	 */
	if (relptr_is_null(fpm->btree_root))
	{
		Assert(victim_page == fpm->singleton_first_page);
		Assert(victim->npages == fpm->singleton_npages);
		Assert(victim->npages >= npages);
		fpm->singleton_first_page += npages;
		fpm->singleton_npages -= npages;
		if (fpm->singleton_npages > 0)
			FreePagePushSpanLeader(fpm, fpm->singleton_first_page,
								   fpm->singleton_npages);
	}
	else
	{
		/*
		 * If the span we found is exactly the right size, remove it from the
		 * btree completely.  Otherwise, adjust the btree entry to reflect the
		 * still-unallocated portion of the span, and put that portion on the
		 * appropriate free list.
		 */
		FreePageBtreeSearch(fpm, victim_page, &result);
		Assert(result.found);
		if (victim->npages == npages)
			FreePageBtreeRemove(fpm, result.page, result.index);
		else
		{
			FreePageBtreeLeafKey *key;

			/* Adjust btree to reflect remaining pages. */
			Assert(victim->npages > npages);
			key = &result.page->u.leaf_key[result.index];
			Assert(key->npages == victim->npages);
			key->first_page += npages;
			key->npages -= npages;
			if (result.index == 0)
				FreePageBtreeAdjustAncestorKeys(fpm, result.page);

			/* Put the unallocated pages back on the appropriate free list. */
			FreePagePushSpanLeader(fpm, victim_page + npages,
								   victim->npages - npages);
		}
	}

	/* Return results to caller. */
	*first_page = fpm_pointer_to_page(base, victim);
	return true;
}

/*
 * Put a range of pages into the btree and freelists, consolidating it with
 * existing free spans just before and/or after it.  If 'soft' is true,
 * only perform the insertion if it can be done without allocating new btree
 * pages; if false, do it always.  Returns 0 if the soft flag caused the
 * insertion to be skipped, or otherwise the size of the contiguous span
 * created by the insertion.  This may be larger than npages if we're able
 * to consolidate with an adjacent range.
 */
static Size
FreePageManagerPutInternal(FreePageManager *fpm, Size first_page, Size npages,
						   bool soft)
{
	char	   *base = fpm_segment_base(fpm);
	FreePageBtreeSearchResult result;
	FreePageBtreeLeafKey *prevkey = NULL;
	FreePageBtreeLeafKey *nextkey = NULL;
	FreePageBtree *np;
	Size		nindex;

	Assert(npages > 0);

	/* We can store a single free span without initializing the btree. */
	if (fpm->btree_depth == 0)
	{
		if (fpm->singleton_npages == 0)
		{
			/* Don't have a span yet; store this one. */
			fpm->singleton_first_page = first_page;
			fpm->singleton_npages = npages;
			FreePagePushSpanLeader(fpm, first_page, npages);
			return fpm->singleton_npages;
		}
		else if (fpm->singleton_first_page + fpm->singleton_npages ==
				 first_page)
		{
			/* New span immediately follows sole existing span. */
			fpm->singleton_npages += npages;
			FreePagePopSpanLeader(fpm, fpm->singleton_first_page);
			FreePagePushSpanLeader(fpm, fpm->singleton_first_page,
								   fpm->singleton_npages);
			return fpm->singleton_npages;
		}
		else if (first_page + npages == fpm->singleton_first_page)
		{
			/* New span immediately precedes sole existing span. */
			FreePagePopSpanLeader(fpm, fpm->singleton_first_page);
			fpm->singleton_first_page = first_page;
			fpm->singleton_npages += npages;
			FreePagePushSpanLeader(fpm, fpm->singleton_first_page,
								   fpm->singleton_npages);
			return fpm->singleton_npages;
		}
		else
		{
			/* Not contiguous; we need to initialize the btree. */
			Size		root_page;
			FreePageBtree *root;

			if (!relptr_is_null(fpm->btree_recycle))
				root = FreePageBtreeGetRecycled(fpm);
			else if (soft)
				return 0;		/* Should not allocate if soft. */
			else if (FreePageManagerGetInternal(fpm, 1, &root_page))
				root = (FreePageBtree *) fpm_page_to_pointer(base, root_page);
			else
			{
				/* We'd better be able to get a page from the existing range. */
				elog(FATAL, "free page manager btree is corrupt");
1535 | } |
1536 | |
1537 | /* Create the btree and move the preexisting range into it. */ |
1538 | root->hdr.magic = FREE_PAGE_LEAF_MAGIC; |
1539 | root->hdr.nused = 1; |
1540 | relptr_store(base, root->hdr.parent, (FreePageBtree *) NULL); |
1541 | root->u.leaf_key[0].first_page = fpm->singleton_first_page; |
1542 | root->u.leaf_key[0].npages = fpm->singleton_npages; |
1543 | relptr_store(base, fpm->btree_root, root); |
1544 | fpm->singleton_first_page = 0; |
1545 | fpm->singleton_npages = 0; |
1546 | fpm->btree_depth = 1; |
1547 | |
1548 | /* |
1549 | * Corner case: it may be that the btree root took the very last |
1550 | * free page. In that case, the sole btree entry covers a zero |
1551 | * page run, which is invalid. Overwrite it with the entry we're |
1552 | * trying to insert and get out. |
1553 | */ |
1554 | if (root->u.leaf_key[0].npages == 0) |
1555 | { |
1556 | root->u.leaf_key[0].first_page = first_page; |
1557 | root->u.leaf_key[0].npages = npages; |
1558 | FreePagePushSpanLeader(fpm, first_page, npages); |
1559 | return npages; |
1560 | } |
1561 | |
1562 | /* Fall through to insert the new key. */ |
1563 | } |
1564 | } |
1565 | |
1566 | /* Search the btree. */ |
1567 | FreePageBtreeSearch(fpm, first_page, &result); |
    Assert(!result.found);
    if (result.index > 0)
        prevkey = &result.page->u.leaf_key[result.index - 1];
    if (result.index < result.page->hdr.nused)
    {
        np = result.page;
        nindex = result.index;
        nextkey = &result.page->u.leaf_key[result.index];
    }
    else
    {
        np = FreePageBtreeFindRightSibling(base, result.page);
        nindex = 0;
        if (np != NULL)
            nextkey = &np->u.leaf_key[0];
    }

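    /*
     * Worked example: suppose pages 10-19 and 30-39 are free, so that
     * prevkey = {10, 10} and nextkey = {30, 10}, and we are freeing pages
     * 20-29. Extending prevkey absorbs the new span, giving {10, 20}; the
     * extended span now abuts nextkey, so the two are merged into {10, 30}
     * and the nextkey entry is removed from the btree.
     */
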
    /* Consolidate with the previous entry if possible. */
    if (prevkey != NULL && prevkey->first_page + prevkey->npages >= first_page)
    {
        bool        remove_next = false;
        Size        newpages;

        Assert(prevkey->first_page + prevkey->npages == first_page);
        prevkey->npages = (first_page - prevkey->first_page) + npages;

        /* Check whether we can *also* consolidate with the following entry. */
        if (nextkey != NULL &&
            prevkey->first_page + prevkey->npages >= nextkey->first_page)
        {
            Assert(prevkey->first_page + prevkey->npages ==
                   nextkey->first_page);
            prevkey->npages = (nextkey->first_page - prevkey->first_page)
                + nextkey->npages;
            FreePagePopSpanLeader(fpm, nextkey->first_page);
            remove_next = true;
        }

        /* Put the span on the correct freelist and save its size. */
        FreePagePopSpanLeader(fpm, prevkey->first_page);
        FreePagePushSpanLeader(fpm, prevkey->first_page, prevkey->npages);
        newpages = prevkey->npages;

        /*
         * If we consolidated with both the preceding and following entries,
         * we must remove the following entry. We do this last, because
         * removing an element from the btree may invalidate pointers we hold
         * into the current data structure.
         *
         * NB: The btree is technically in an invalid state at this point
         * because we've already updated prevkey to cover the same key space
         * as nextkey. FreePageBtreeRemove() shouldn't notice that, though.
         */
        if (remove_next)
            FreePageBtreeRemove(fpm, np, nindex);

        return newpages;
    }

    /* Consolidate with the next entry if possible. */
    if (nextkey != NULL && first_page + npages >= nextkey->first_page)
    {
        Size        newpages;

        /* Compute new size for span. */
        Assert(first_page + npages == nextkey->first_page);
        newpages = (nextkey->first_page - first_page) + nextkey->npages;

        /* Put span on correct free list. */
        FreePagePopSpanLeader(fpm, nextkey->first_page);
        FreePagePushSpanLeader(fpm, first_page, newpages);

        /* Update key in place. */
        nextkey->first_page = first_page;
        nextkey->npages = newpages;

        /* If reducing first key on page, ancestors might need adjustment. */
        if (nindex == 0)
            FreePageBtreeAdjustAncestorKeys(fpm, np);

        return nextkey->npages;
    }

    /* Split leaf page and as many of its ancestors as necessary. */
    if (result.split_pages > 0)
    {
        /*
         * NB: We could consider various coping strategies here to avoid a
         * split; most obviously, if np != result.page, we could target that
         * page instead. More complicated shuffling strategies could be
         * possible as well; basically, unless every single leaf page is 100%
         * full, we can jam this key in there if we try hard enough. It's
         * unlikely that trying that hard is worthwhile, but it's possible we
         * might need to make more than no effort. For now, we just do the
         * easy thing, which is nothing.
         */

        /* If this is a soft insert, it's time to give up. */
        if (soft)
            return 0;

        /* Check whether we need to allocate more btree pages to split. */
        if (result.split_pages > fpm->btree_recycle_count)
        {
            Size        pages_needed;
            Size        recycle_page;
            Size        i;

            /*
             * Allocate the required number of pages and split each one in
             * turn. This should never fail, because if we've got enough
             * spans of free pages kicking around that we need additional
             * storage space just to remember them all, then we should
             * certainly have enough to expand the btree, which should only
             * ever use a tiny number of pages compared to the number under
             * management. If it does fail, something's badly screwed up.
             */
            pages_needed = result.split_pages - fpm->btree_recycle_count;
            for (i = 0; i < pages_needed; ++i)
            {
                if (!FreePageManagerGetInternal(fpm, 1, &recycle_page))
                    elog(FATAL, "free page manager btree is corrupt");
                FreePageBtreeRecycle(fpm, recycle_page);
            }

            /*
             * The act of allocating pages to recycle may have invalidated
             * the results of our previous btree search, so repeat it. (We
             * could recheck whether any of our split-avoidance strategies
             * that were not viable before now are, but it hardly seems
             * worthwhile, so we don't bother. Consolidation can't be
             * possible now if it wasn't previously.)
             */
            FreePageBtreeSearch(fpm, first_page, &result);

            /*
             * The act of allocating pages for use in constructing our btree
             * should never cause any page to become more full, so the new
             * split depth should be no greater than the old one, and perhaps
             * less if we fortuitously allocated a chunk that freed up a slot
             * on the page we need to update.
             */
            Assert(result.split_pages <= fpm->btree_recycle_count);
        }

        /* If we still need to perform a split, do it. */
        if (result.split_pages > 0)
        {
            FreePageBtree *split_target = result.page;
            FreePageBtree *child = NULL;
            Size        key = first_page;

            for (;;)
            {
                FreePageBtree *newsibling;
                FreePageBtree *parent;

                /* Identify parent page, which must receive downlink. */
                parent = relptr_access(base, split_target->hdr.parent);

                /* Split the page - downlink not added yet. */
                newsibling = FreePageBtreeSplitPage(fpm, split_target);

                /*
                 * At this point in the loop, we're always carrying a pending
                 * insertion. On the first pass, it's the actual key we're
                 * trying to insert; on subsequent passes, it's the downlink
                 * that needs to be added as a result of the split performed
                 * during the previous loop iteration. Since we've just split
                 * the page, there's definitely room on one of the two
                 * resulting pages.
                 */
                if (child == NULL)
                {
                    Size        index;
                    FreePageBtree *insert_into;

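                    /* Insert into whichever half of the split now covers key. */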
                    insert_into = key < newsibling->u.leaf_key[0].first_page ?
                        split_target : newsibling;
                    index = FreePageBtreeSearchLeaf(insert_into, key);
                    FreePageBtreeInsertLeaf(insert_into, index, key, npages);
                    if (index == 0 && insert_into == split_target)
                        FreePageBtreeAdjustAncestorKeys(fpm, split_target);
                }
                else
                {
                    Size        index;
                    FreePageBtree *insert_into;

                    insert_into =
                        key < newsibling->u.internal_key[0].first_page ?
                        split_target : newsibling;
                    index = FreePageBtreeSearchInternal(insert_into, key);
                    FreePageBtreeInsertInternal(base, insert_into, index,
                                                key, child);
                    relptr_store(base, child->hdr.parent, insert_into);
                    if (index == 0 && insert_into == split_target)
                        FreePageBtreeAdjustAncestorKeys(fpm, split_target);
                }

                /* If the page we just split has no parent, split the root. */
                if (parent == NULL)
                {
                    FreePageBtree *newroot;

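                    /* Splitting the root adds a new level to the btree. */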
                    newroot = FreePageBtreeGetRecycled(fpm);
                    newroot->hdr.magic = FREE_PAGE_INTERNAL_MAGIC;
                    newroot->hdr.nused = 2;
                    relptr_store(base, newroot->hdr.parent,
                                 (FreePageBtree *) NULL);
                    newroot->u.internal_key[0].first_page =
                        FreePageBtreeFirstKey(split_target);
                    relptr_store(base, newroot->u.internal_key[0].child,
                                 split_target);
                    relptr_store(base, split_target->hdr.parent, newroot);
                    newroot->u.internal_key[1].first_page =
                        FreePageBtreeFirstKey(newsibling);
                    relptr_store(base, newroot->u.internal_key[1].child,
                                 newsibling);
                    relptr_store(base, newsibling->hdr.parent, newroot);
                    relptr_store(base, fpm->btree_root, newroot);
                    fpm->btree_depth++;

                    break;
                }

                /* If the parent page isn't full, insert the downlink. */
                key = newsibling->u.internal_key[0].first_page;
                if (parent->hdr.nused < FPM_ITEMS_PER_INTERNAL_PAGE)
                {
                    Size        index;

                    index = FreePageBtreeSearchInternal(parent, key);
                    FreePageBtreeInsertInternal(base, parent, index,
                                                key, newsibling);
                    relptr_store(base, newsibling->hdr.parent, parent);
                    if (index == 0)
                        FreePageBtreeAdjustAncestorKeys(fpm, parent);
                    break;
                }

                /* The parent also needs to be split, so loop around. */
                child = newsibling;
                split_target = parent;
            }

            /*
             * The loop above did the insert, so we just need to update the
             * free list, and we're done.
             */
            FreePagePushSpanLeader(fpm, first_page, npages);

            return npages;
        }
    }

    /* Physically add the key to the page. */
    Assert(result.page->hdr.nused < FPM_ITEMS_PER_LEAF_PAGE);
    FreePageBtreeInsertLeaf(result.page, result.index, first_page, npages);

    /* If new first key on page, ancestors might need adjustment. */
    if (result.index == 0)
        FreePageBtreeAdjustAncestorKeys(fpm, result.page);

    /* Put it on the free list. */
    FreePagePushSpanLeader(fpm, first_page, npages);

    return npages;
}

/*
 * Remove a FreePageSpanLeader from the linked list that contains it, either
 * because we're changing the size of the span, or because we're allocating it.
 */
static void
FreePagePopSpanLeader(FreePageManager *fpm, Size pageno)
{
    char       *base = fpm_segment_base(fpm);
    FreePageSpanLeader *span;
    FreePageSpanLeader *next;
    FreePageSpanLeader *prev;

    span = (FreePageSpanLeader *) fpm_page_to_pointer(base, pageno);

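    /* Unlink the span from its doubly linked freelist. */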
    next = relptr_access(base, span->next);
    prev = relptr_access(base, span->prev);
    if (next != NULL)
        relptr_copy(next->prev, span->prev);
    if (prev != NULL)
        relptr_copy(prev->next, span->next);
    else
    {
        Size        f = Min(span->npages, FPM_NUM_FREELISTS) - 1;

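        /* The span was the freelist head; make its successor the new head. */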
        Assert(fpm->freelist[f].relptr_off == pageno * FPM_PAGE_SIZE);
        relptr_copy(fpm->freelist[f], span->next);
    }
}

/*
 * Initialize a new FreePageSpanLeader and put it on the appropriate free list.
 */
static void
FreePagePushSpanLeader(FreePageManager *fpm, Size first_page, Size npages)
{
    char       *base = fpm_segment_base(fpm);
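    /*
     * Spans of npages < FPM_NUM_FREELISTS pages go on freelist npages - 1;
     * all longer spans share the final freelist. For example, a 3-page span
     * goes on freelist[2], while any span of at least FPM_NUM_FREELISTS
     * pages goes on the last list.
     */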
    Size        f = Min(npages, FPM_NUM_FREELISTS) - 1;
    FreePageSpanLeader *head = relptr_access(base, fpm->freelist[f]);
    FreePageSpanLeader *span;

    span = (FreePageSpanLeader *) fpm_page_to_pointer(base, first_page);
    span->magic = FREE_PAGE_SPAN_LEADER_MAGIC;
    span->npages = npages;
    relptr_store(base, span->next, head);
    relptr_store(base, span->prev, (FreePageSpanLeader *) NULL);
    if (head != NULL)
        relptr_store(base, head->prev, span);
    relptr_store(base, fpm->freelist[f], span);
}
