1/*-------------------------------------------------------------------------
2 *
3 * dsa.c
4 * Dynamic shared memory areas.
5 *
6 * This module provides dynamic shared memory areas which are built on top of
7 * DSM segments. While dsm.c allows segments of shared memory to be
8 * created and shared between backends, it isn't designed to deal with small
9 * objects. A DSA area is a shared memory heap usually backed by one or more
10 * DSM segments which can allocate memory using dsa_allocate() and dsa_free().
11 * Alternatively, it can be created in pre-existing shared memory, including a
12 * DSM segment, and then create extra DSM segments as required. Unlike the
13 * regular system heap, it deals in pseudo-pointers which must be converted to
14 * backend-local pointers before they are dereferenced. These pseudo-pointers
15 * can however be shared with other backends, and can be used to construct
16 * shared data structures.
17 *
18 * Each DSA area manages a set of DSM segments, adding new segments as
19 * required and detaching them when they are no longer needed. Each segment
20 * contains a number of 4KB pages, a free page manager for tracking
21 * consecutive runs of free pages, and a page map for tracking the source of
22 * objects allocated on each page. Allocation requests above 8KB are handled
23 * by choosing a segment and finding consecutive free pages in its free page
24 * manager. Allocation requests for smaller sizes are handled using pools of
25 * objects of a selection of sizes. Each pool consists of a number of 16 page
26 * (64KB) superblocks allocated in the same way as large objects. Allocation
27 * of large objects and new superblocks is serialized by a single LWLock, but
28 * allocation of small objects from pre-existing superblocks uses one LWLock
29 * per pool. Currently there is one pool, and therefore one lock, per size
30 * class. Per-core pools to increase concurrency and strategies for reducing
31 * the resulting fragmentation are areas for future research. Each superblock
32 * is managed with a 'span', which tracks the superblock's freelist. Free
33 * requests are handled by looking in the page map to find which span an
34 * address was allocated from, so that small objects can be returned to the
35 * appropriate free list, and large object pages can be returned directly to
36 * the free page map. When allocating, simple heuristics for selecting
37 * segments and superblocks try to encourage occupied memory to be
38 * concentrated, increasing the likelihood that whole superblocks can become
39 * empty and be returned to the free page manager, and whole segments can
40 * become empty and be returned to the operating system.
41 *
42 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
43 * Portions Copyright (c) 1994, Regents of the University of California
44 *
45 * IDENTIFICATION
46 * src/backend/utils/mmgr/dsa.c
47 *
48 *-------------------------------------------------------------------------
49 */
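
/*
 * Illustrative usage sketch (not part of this file's code; the tranche id,
 * type and field names below are invented for the example).  A hypothetical
 * caller of the public API in utils/dsa.h might do roughly:
 *
 *		dsa_area   *area = dsa_create(my_tranche_id);
 *		dsa_pointer dp = dsa_allocate(area, sizeof(my_node));
 *		my_node    *node = (my_node *) dsa_get_address(area, dp);
 *
 *		node->next_dp = some_other_dsa_pointer;
 *		...
 *		dsa_free(area, dp);
 *
 * Another backend can call dsa_attach(dsa_get_handle(area)) and convert the
 * same dsa_pointer values with dsa_get_address() in its own address space;
 * only dsa_pointers, never raw addresses, should be stored in shared data
 * structures.
 */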
50
51#include "postgres.h"
52
53#include "port/atomics.h"
54#include "storage/dsm.h"
55#include "storage/ipc.h"
56#include "storage/lwlock.h"
57#include "storage/shmem.h"
58#include "utils/dsa.h"
59#include "utils/freepage.h"
60#include "utils/memutils.h"
61
62/*
63 * The size of the initial DSM segment that backs a dsa_area created by
64 * dsa_create. After creating some number of segments of this size we'll
65 * double this size, and so on. Larger segments may be created if necessary
66 * to satisfy large requests.
67 */
68#define DSA_INITIAL_SEGMENT_SIZE ((size_t) (1 * 1024 * 1024))
69
70/*
71 * How many segments to create before we double the segment size. If this is
72 * low, then there is likely to be a lot of wasted space in the largest
73 * segment. If it is high, then we risk running out of segment slots (see
74 * dsm.c's limits on total number of segments), or limiting the total size
75 * an area can manage when using small pointers.
76 */
77#define DSA_NUM_SEGMENTS_AT_EACH_SIZE 4
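
/*
 * For illustration, with the defaults above an area created by dsa_create()
 * grows its backing segments roughly on this schedule (ignoring any rounding
 * up needed to satisfy an unusually large single request):
 *
 *		segment indexes 0-3		1MB each
 *		segment indexes 4-7		2MB each
 *		segment indexes 8-11	4MB each
 *		...						doubling after every DSA_NUM_SEGMENTS_AT_EACH_SIZE
 *								segments, never exceeding DSA_MAX_SEGMENT_SIZE
 */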
78
79/*
80 * The number of bits used to represent the offset part of a dsa_pointer.
81 * This controls the maximum size of a segment, the maximum possible
82 * allocation size and also the maximum number of segments per area.
83 */
84#if SIZEOF_DSA_POINTER == 4
85#define DSA_OFFSET_WIDTH 27 /* 32 segments of size up to 128MB */
86#else
87#define DSA_OFFSET_WIDTH 40 /* 1024 segments of size up to 1TB */
88#endif
89
90/*
91 * The maximum number of DSM segments that an area can own, determined by
92 * the number of bits remaining (but capped at 1024).
93 */
94#define DSA_MAX_SEGMENTS \
95 Min(1024, (1 << ((SIZEOF_DSA_POINTER * 8) - DSA_OFFSET_WIDTH)))
96
97/* The bitmask for extracting the offset from a dsa_pointer. */
98#define DSA_OFFSET_BITMASK (((dsa_pointer) 1 << DSA_OFFSET_WIDTH) - 1)
99
100/* The maximum size of a DSM segment. */
101#define DSA_MAX_SEGMENT_SIZE ((size_t) 1 << DSA_OFFSET_WIDTH)
102
103/* Number of pages (see FPM_PAGE_SIZE) per regular superblock. */
104#define DSA_PAGES_PER_SUPERBLOCK 16
105
106/*
107 * A magic number used as a sanity check for following DSM segments belonging
108 * to a DSA area (this number will be XORed with the area handle and
109 * the segment index).
110 */
111#define DSA_SEGMENT_HEADER_MAGIC 0x0ce26608
112
113/* Build a dsa_pointer given a segment number and offset. */
114#define DSA_MAKE_POINTER(segment_number, offset) \
115 (((dsa_pointer) (segment_number) << DSA_OFFSET_WIDTH) | (offset))
116
117/* Extract the segment number from a dsa_pointer. */
118#define DSA_EXTRACT_SEGMENT_NUMBER(dp) ((dp) >> DSA_OFFSET_WIDTH)
119
120/* Extract the offset from a dsa_pointer. */
121#define DSA_EXTRACT_OFFSET(dp) ((dp) & DSA_OFFSET_BITMASK)
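
/*
 * For example, on a system with 8-byte dsa_pointers (DSA_OFFSET_WIDTH == 40),
 * DSA_MAKE_POINTER(3, 0x1234) evaluates to ((dsa_pointer) 3 << 40) | 0x1234;
 * DSA_EXTRACT_SEGMENT_NUMBER() recovers 3 and DSA_EXTRACT_OFFSET() recovers
 * 0x1234 from that value.
 */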
122
123/* The type used for segment indexes (zero based). */
124typedef size_t dsa_segment_index;
125
126/* Sentinel value for dsa_segment_index indicating 'none' or 'end'. */
127#define DSA_SEGMENT_INDEX_NONE (~(dsa_segment_index)0)
128
129/*
130 * How many bins of segments do we have? The bins are used to categorize
131 * segments by their largest contiguous run of free pages.
132 */
133#define DSA_NUM_SEGMENT_BINS 16
134
135/*
136 * What is the lowest bin that holds segments that *might* have n contiguous
137 * free pages? There is no point in looking in segments in lower bins; they
138 * definitely can't service a request for n free pages.
139 */
140#define contiguous_pages_to_segment_bin(n) Min(fls(n), DSA_NUM_SEGMENT_BINS - 1)
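
/*
 * For example, fls(1) == 1 and fls(16) == 5, so a search for 1 contiguous
 * free page starts in bin 1 and a search for 16 contiguous free pages starts
 * in bin 5 (segments whose largest run is 16-31 pages).  Bin 0 holds segments
 * with no free pages at all, and runs of 2^14 pages or more all fall into the
 * last bin, DSA_NUM_SEGMENT_BINS - 1.
 */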
141
142/* Macros for access to locks. */
143#define DSA_AREA_LOCK(area) (&area->control->lock)
144#define DSA_SCLASS_LOCK(area, sclass) (&area->control->pools[sclass].lock)
145
146/*
147 * The header for an individual segment. This lives at the start of each DSM
148 * segment owned by a DSA area including the first segment (where it appears
149 * as part of the dsa_area_control struct).
150 */
151typedef struct
152{
153 /* Sanity check magic value. */
154 uint32 magic;
155 /* Total number of pages in this segment (excluding metadata area). */
156 size_t usable_pages;
157 /* Total size of this segment in bytes. */
158 size_t size;
159
160 /*
161 * Index of the segment that precedes this one in the same segment bin, or
162 * DSA_SEGMENT_INDEX_NONE if this is the first one.
163 */
164 dsa_segment_index prev;
165
166 /*
167 * Index of the segment that follows this one in the same segment bin, or
168 * DSA_SEGMENT_INDEX_NONE if this is the last one.
169 */
170 dsa_segment_index next;
171 /* The index of the bin that contains this segment. */
172 size_t bin;
173
174 /*
175 * A flag raised to indicate that this segment is being returned to the
176 * operating system and has been unpinned.
177 */
178 bool freed;
179} dsa_segment_header;
180
181/*
182 * Metadata for one superblock.
183 *
184 * For most blocks, span objects are stored out-of-line; that is, the span
185 * object is not stored within the block itself. But, as an exception, for a
186 * "span of spans", the span object is stored "inline". The allocation is
187 * always exactly one page, and the dsa_area_span object is located at
188 * the beginning of that page. The size class is DSA_SCLASS_BLOCK_OF_SPANS,
189 * and the remaining fields are used just as they would be in an ordinary
190 * block. We can't allocate spans out of ordinary superblocks because
191 * creating an ordinary superblock requires us to be able to allocate a span
192 * *first*. Doing it this way avoids that circularity.
193 */
194typedef struct
195{
196 dsa_pointer pool; /* Containing pool. */
197 dsa_pointer prevspan; /* Previous span. */
198 dsa_pointer nextspan; /* Next span. */
199 dsa_pointer start; /* Starting address. */
200 size_t npages; /* Length of span in pages. */
201 uint16 size_class; /* Size class. */
202 uint16 ninitialized; /* Maximum number of objects ever allocated. */
203 uint16 nallocatable; /* Number of objects currently allocatable. */
204 uint16 firstfree; /* First object on free list. */
205 uint16 nmax; /* Maximum number of objects ever possible. */
206 uint16 fclass; /* Current fullness class. */
207} dsa_area_span;
208
209/*
210 * Given a pointer to an object in a span, access the index of the next free
211 * object in the same span (ie in the span's freelist) as an L-value.
212 */
213#define NextFreeObjectIndex(object) (* (uint16 *) (object))
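
/*
 * For example, if objects 2 and 7 of a span are currently free and
 * span->firstfree == 2, then the first two bytes of object 2 hold 7 and the
 * first two bytes of object 7 hold DSA_SPAN_NOTHING_FREE (the end-of-list
 * marker); dsa_free() pushes onto the head of this list and alloc_object()
 * pops from it.
 */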
214
215/*
216 * Small allocations are handled by dividing a single block of memory into
217 * many small objects of equal size. The possible allocation sizes are
218 * defined by the following array. Larger size classes are spaced more widely
219 * than smaller size classes. We fudge the spacing for size classes >1kB to
220 * avoid space wastage: based on the knowledge that we plan to allocate 64kB
221 * blocks, we bump the maximum object size up to the largest multiple of
222 * 8 bytes that still lets us fit the same number of objects into one block.
223 *
224 * NB: Because of this fudging, if we were ever to use differently-sized blocks
225 * for small allocations, these size classes would need to be reworked to be
226 * optimal for the new size.
227 *
228 * NB: The optimal spacing for size classes, as well as the size of the blocks
229 * out of which small objects are allocated, is not a question that has one
230 * right answer. Some allocators (such as tcmalloc) use more closely-spaced
231 * size classes than we do here, while others (like aset.c) use more
232 * widely-spaced classes. Spacing the classes more closely avoids wasting
233 * memory within individual chunks, but also means a larger number of
234 * potentially-unfilled blocks.
235 */
236static const uint16 dsa_size_classes[] = {
237 sizeof(dsa_area_span), 0, /* special size classes */
238 8, 16, 24, 32, 40, 48, 56, 64, /* 8 classes separated by 8 bytes */
239 80, 96, 112, 128, /* 4 classes separated by 16 bytes */
240 160, 192, 224, 256, /* 4 classes separated by 32 bytes */
241 320, 384, 448, 512, /* 4 classes separated by 64 bytes */
242 640, 768, 896, 1024, /* 4 classes separated by 128 bytes */
243 1280, 1560, 1816, 2048, /* 4 classes separated by ~256 bytes */
244 2616, 3120, 3640, 4096, /* 4 classes separated by ~512 bytes */
245 5456, 6552, 7280, 8192 /* 4 classes separated by ~1024 bytes */
246};
247#define DSA_NUM_SIZE_CLASSES lengthof(dsa_size_classes)
248
249/* Special size classes. */
250#define DSA_SCLASS_BLOCK_OF_SPANS 0
251#define DSA_SCLASS_SPAN_LARGE 1
252
253/*
254 * The following lookup table is used to map the size of small objects
255 * (less than 1kB) onto the corresponding size class. To use this table,
256 * round the size of the object up to the next multiple of 8 bytes, and then
257 * index into this array.
258 */
259static const uint8 dsa_size_class_map[] = {
260 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 11, 11, 12, 12, 13, 13,
261 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 17, 17, 17, 17,
262 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19,
263 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21,
264 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
265 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
266 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
267 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25
268};
269#define DSA_SIZE_CLASS_MAP_QUANTUM 8
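
/*
 * For example, a request for 20 bytes rounds up to 24, giving map index
 * 24/8 - 1 = 2; dsa_size_class_map[2] == 4 and dsa_size_classes[4] == 24,
 * so the object comes from the 24-byte pool.  Likewise a request for 72
 * bytes maps (via index 8) to size class 10, i.e. 80-byte objects.
 */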
270
271/*
272 * Superblocks are binned by how full they are. Generally, each fullness
273 * class corresponds to one quartile, but the block being used for
274 * allocations is always at the head of the list for fullness class 1,
275 * regardless of how full it really is.
276 */
277#define DSA_FULLNESS_CLASSES 4
278
279/*
280 * A dsa_area_pool represents a set of objects of a given size class.
281 *
282 * Perhaps there should be multiple pools for the same size class for
283 * contention avoidance, but for now there is just one!
284 */
285typedef struct
286{
287 /* A lock protecting access to this pool. */
288 LWLock lock;
289 /* A set of linked lists of spans, arranged by fullness. */
290 dsa_pointer spans[DSA_FULLNESS_CLASSES];
291 /* Should we pad this out to a cacheline boundary? */
292} dsa_area_pool;
293
294/*
295 * The control block for an area. This lives in shared memory, at the start of
296 * the first DSM segment controlled by this area.
297 */
298typedef struct
299{
300 /* The segment header for the first segment. */
301 dsa_segment_header segment_header;
302 /* The handle for this area. */
303 dsa_handle handle;
304 /* The handles of the segments owned by this area. */
305 dsm_handle segment_handles[DSA_MAX_SEGMENTS];
306 /* Lists of segments, binned by maximum contiguous run of free pages. */
307 dsa_segment_index segment_bins[DSA_NUM_SEGMENT_BINS];
308 /* The object pools for each size class. */
309 dsa_area_pool pools[DSA_NUM_SIZE_CLASSES];
310 /* The total size of all active segments. */
311 size_t total_segment_size;
312 /* The maximum total size of backing storage we are allowed. */
313 size_t max_total_segment_size;
314 /* Highest used segment index in the history of this area. */
315 dsa_segment_index high_segment_index;
316 /* The reference count for this area. */
317 int refcnt;
318 /* A flag indicating that this area has been pinned. */
319 bool pinned;
320 /* The number of times that segments have been freed. */
321 size_t freed_segment_counter;
322 /* The LWLock tranche ID. */
323 int lwlock_tranche_id;
324 /* The general lock (protects everything except object pools). */
325 LWLock lock;
326} dsa_area_control;
327
328/* Given a pointer to a pool, find a dsa_pointer. */
329#define DsaAreaPoolToDsaPointer(area, p) \
330 DSA_MAKE_POINTER(0, (char *) p - (char *) area->control)
331
332/*
333 * A dsa_segment_map is stored within the backend-private memory of each
334 * individual backend. It holds the base address of the segment within that
335 * backend, plus the addresses of key objects within the segment. Those
336 * could instead be derived from the base address but it's handy to have them
337 * around.
338 */
339typedef struct
340{
341 dsm_segment *segment; /* DSM segment */
342 char *mapped_address; /* Address at which segment is mapped */
343 dsa_segment_header *header; /* Header (same as mapped_address) */
344 FreePageManager *fpm; /* Free page manager within segment. */
345 dsa_pointer *pagemap; /* Page map within segment. */
346} dsa_segment_map;
347
348/*
349 * Per-backend state for a storage area. Backends obtain one of these by
350 * creating an area or attaching to an existing one using a handle. Each
351 * process that needs to use an area uses its own object to track where the
352 * segments are mapped.
353 */
354struct dsa_area
355{
356 /* Pointer to the control object in shared memory. */
357 dsa_area_control *control;
358
359 /* Has the mapping been pinned? */
360 bool mapping_pinned;
361
362 /*
363 * This backend's array of segment maps, ordered by segment index
364 * corresponding to control->segment_handles. Some of the area's segments
365 * may not be mapped in this backend yet, and some slots may have been
366 * freed and need to be detached; these operations happen on demand.
367 */
368 dsa_segment_map segment_maps[DSA_MAX_SEGMENTS];
369
370 /* The highest segment index this backend has ever mapped. */
371 dsa_segment_index high_segment_index;
372
373 /* The last observed freed_segment_counter. */
374 size_t freed_segment_counter;
375};
376
377#define DSA_SPAN_NOTHING_FREE ((uint16) -1)
378#define DSA_SUPERBLOCK_SIZE (DSA_PAGES_PER_SUPERBLOCK * FPM_PAGE_SIZE)
379
380/* Given a pointer to a segment_map, obtain a segment index number. */
381#define get_segment_index(area, segment_map_ptr) \
382 (segment_map_ptr - &area->segment_maps[0])
383
384static void init_span(dsa_area *area, dsa_pointer span_pointer,
385 dsa_area_pool *pool, dsa_pointer start, size_t npages,
386 uint16 size_class);
387static bool transfer_first_span(dsa_area *area, dsa_area_pool *pool,
388 int fromclass, int toclass);
389static inline dsa_pointer alloc_object(dsa_area *area, int size_class);
390static bool ensure_active_superblock(dsa_area *area, dsa_area_pool *pool,
391 int size_class);
392static dsa_segment_map *get_segment_by_index(dsa_area *area,
393 dsa_segment_index index);
394static void destroy_superblock(dsa_area *area, dsa_pointer span_pointer);
395static void unlink_span(dsa_area *area, dsa_area_span *span);
396static void add_span_to_fullness_class(dsa_area *area, dsa_area_span *span,
397 dsa_pointer span_pointer, int fclass);
398static void unlink_segment(dsa_area *area, dsa_segment_map *segment_map);
399static dsa_segment_map *get_best_segment(dsa_area *area, size_t npages);
400static dsa_segment_map *make_new_segment(dsa_area *area, size_t requested_pages);
401static dsa_area *create_internal(void *place, size_t size,
402 int tranche_id,
403 dsm_handle control_handle,
404 dsm_segment *control_segment);
405static dsa_area *attach_internal(void *place, dsm_segment *segment,
406 dsa_handle handle);
407static void check_for_freed_segments(dsa_area *area);
408static void check_for_freed_segments_locked(dsa_area *area);
409
410/*
411 * Create a new shared area in a new DSM segment. Further DSM segments will
412 * be allocated as required to extend the available space.
413 *
414 * We can't allocate a LWLock tranche_id within this function, because tranche
415 * IDs are a scarce resource; there are only 64k available, using low numbers
416 * when possible matters, and we have no provision for recycling them. So,
417 * we require the caller to provide one.
418 */
419dsa_area *
420dsa_create(int tranche_id)
421{
422 dsm_segment *segment;
423 dsa_area *area;
424
425 /*
426 * Create the DSM segment that will hold the shared control object and the
427 * first segment of usable space.
428 */
429 segment = dsm_create(DSA_INITIAL_SEGMENT_SIZE, 0);
430
431 /*
432 * All segments backing this area are pinned, so that DSA can explicitly
433 * control their lifetime (otherwise a newly created segment belonging to
434 * this area might be freed when the only backend that happens to have it
435 * mapped in ends, corrupting the area).
436 */
437 dsm_pin_segment(segment);
438
439 /* Create a new DSA area with the control object in this segment. */
440 area = create_internal(dsm_segment_address(segment),
441 DSA_INITIAL_SEGMENT_SIZE,
442 tranche_id,
443 dsm_segment_handle(segment), segment);
444
445 /* Clean up when the control segment detaches. */
446 on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
447 PointerGetDatum(dsm_segment_address(segment)));
448
449 return area;
450}
451
452/*
453 * Create a new shared area in an existing shared memory space, which may be
454 * either DSM or Postmaster-initialized memory. DSM segments will be
455 * allocated as required to extend the available space, though that can be
456 * prevented with dsa_set_size_limit(area, size) using the same size provided
457 * to dsa_create_in_place.
458 *
459 * Areas created in-place must eventually be released by the backend that
460 * created them and all backends that attach to them. This can be done
461 * explicitly with dsa_release_in_place, or, in the special case that 'place'
462 * happens to be in a pre-existing DSM segment, by passing in a pointer to the
463 * segment so that a detach hook can be registered with the containing DSM
464 * segment.
465 *
466 * See dsa_create() for a note about the tranche arguments.
467 */
468dsa_area *
469dsa_create_in_place(void *place, size_t size,
470 int tranche_id, dsm_segment *segment)
471{
472 dsa_area *area;
473
474 area = create_internal(place, size, tranche_id,
475 DSM_HANDLE_INVALID, NULL);
476
477 /*
478 * Clean up when the control segment detaches, if a containing DSM segment
479 * was provided.
480 */
481 if (segment != NULL)
482 on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
483 PointerGetDatum(place));
484
485 return area;
486}
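
/*
 * Illustrative sketch (hypothetical caller; the shmem struct name, size
 * constant, tranche id and variables are invented for the example): creating
 * or attaching to an area inside traditional shared memory, and arranging
 * for it to be released at backend exit.
 *
 *		char   *place = ShmemInitStruct("my area", MY_AREA_SIZE, &found);
 *
 *		if (!found)
 *			area = dsa_create_in_place(place, MY_AREA_SIZE,
 *									   LWTRANCHE_MY_TRANCHE, NULL);
 *		else
 *			area = dsa_attach_in_place(place, NULL);
 *		on_shmem_exit(dsa_on_shmem_exit_release_in_place,
 *					  PointerGetDatum(place));
 *
 * Calling dsa_set_size_limit(area, MY_AREA_SIZE) afterwards would prevent
 * the area from ever creating extra DSM segments beyond the in-place space.
 */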
487
488/*
489 * Obtain a handle that can be passed to other processes so that they can
490 * attach to the given area. Cannot be called for areas created with
491 * dsa_create_in_place.
492 */
493dsa_handle
494dsa_get_handle(dsa_area *area)
495{
496 Assert(area->control->handle != DSM_HANDLE_INVALID);
497 return area->control->handle;
498}
499
500/*
501 * Attach to an area given a handle generated (possibly in another process) by
502 * dsa_get_handle. The area must have been created with dsa_create (not
503 * dsa_create_in_place).
504 */
505dsa_area *
506dsa_attach(dsa_handle handle)
507{
508 dsm_segment *segment;
509 dsa_area *area;
510
511 /*
512 * An area handle is really a DSM segment handle for the first segment, so
513 * we go ahead and attach to that.
514 */
515 segment = dsm_attach(handle);
516 if (segment == NULL)
517 ereport(ERROR,
518 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
519 errmsg("could not attach to dynamic shared area")));
520
521 area = attach_internal(dsm_segment_address(segment), segment, handle);
522
523 /* Clean up when the control segment detaches. */
524 on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
525 PointerGetDatum(dsm_segment_address(segment)));
526
527 return area;
528}
529
530/*
531 * Attach to an area that was created with dsa_create_in_place. The caller
532 * must somehow know the location in memory that was used when the area was
533 * created, though it may be mapped at a different virtual address in this
534 * process.
535 *
536 * See dsa_create_in_place for note about releasing in-place areas, and the
537 * optional 'segment' argument which can be provided to allow automatic
538 * release if the containing memory happens to be a DSM segment.
539 */
540dsa_area *
541dsa_attach_in_place(void *place, dsm_segment *segment)
542{
543 dsa_area *area;
544
545 area = attach_internal(place, NULL, DSM_HANDLE_INVALID);
546
547 /*
548 * Clean up when the control segment detaches, if a containing DSM segment
549 * was provided.
550 */
551 if (segment != NULL)
552 on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
553 PointerGetDatum(place));
554
555 return area;
556}
557
558/*
559 * Release a DSA area that was produced by dsa_create_in_place or
560 * dsa_attach_in_place. The 'segment' argument is ignored but provides an
561 * interface suitable for on_dsm_detach, for the convenience of users who want
562 * to create a DSA segment inside an existing DSM segment and have it
563 * automatically released when the containing DSM segment is detached.
564 * 'place' should be the address of the place where the area was created.
565 *
566 * This callback is automatically registered for the DSM segment containing
567 * the control object of in-place areas when a segment is provided to
568 * dsa_create_in_place or dsa_attach_in_place, and also for all areas created
569 * with dsa_create.
570 */
571void
572dsa_on_dsm_detach_release_in_place(dsm_segment *segment, Datum place)
573{
574 dsa_release_in_place(DatumGetPointer(place));
575}
576
577/*
578 * Release a DSA area that was produced by dsa_create_in_place or
579 * dsa_attach_in_place. The 'code' argument is ignored but provides an
580 * interface suitable for on_shmem_exit or before_shmem_exit, for the
581 * convenience of users who want to create a DSA segment inside shared memory
582 * other than a DSM segment and have it automatically release at backend exit.
583 * 'place' should be the address of the place where the area was created.
584 */
585void
586dsa_on_shmem_exit_release_in_place(int code, Datum place)
587{
588 dsa_release_in_place(DatumGetPointer(place));
589}
590
591/*
592 * Release a DSA area that was produced by dsa_create_in_place or
593 * dsa_attach_in_place. It is preferable to use one of the 'dsa_on_XXX'
594 * callbacks so that this is managed automatically, because failure to release
595 * an area created in-place leaks its segments permanently.
596 *
597 * This is also called automatically for areas produced by dsa_create or
598 * dsa_attach as an implementation detail.
599 */
600void
601dsa_release_in_place(void *place)
602{
603 dsa_area_control *control = (dsa_area_control *) place;
604 int i;
605
606 LWLockAcquire(&control->lock, LW_EXCLUSIVE);
607 Assert(control->segment_header.magic ==
608 (DSA_SEGMENT_HEADER_MAGIC ^ control->handle ^ 0));
609 Assert(control->refcnt > 0);
610 if (--control->refcnt == 0)
611 {
612 for (i = 0; i <= control->high_segment_index; ++i)
613 {
614 dsm_handle handle;
615
616 handle = control->segment_handles[i];
617 if (handle != DSM_HANDLE_INVALID)
618 dsm_unpin_segment(handle);
619 }
620 }
621 LWLockRelease(&control->lock);
622}
623
624/*
625 * Keep a DSA area attached until end of session or explicit detach.
626 *
627 * By default, areas are owned by the current resource owner, which means they
628 * are detached automatically when that scope ends.
629 */
630void
631dsa_pin_mapping(dsa_area *area)
632{
633 int i;
634
635 Assert(!area->mapping_pinned);
636 area->mapping_pinned = true;
637
638 for (i = 0; i <= area->high_segment_index; ++i)
639 if (area->segment_maps[i].segment != NULL)
640 dsm_pin_mapping(area->segment_maps[i].segment);
641}
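
/*
 * Illustrative sketch of a long-lived area (hypothetical; error handling and
 * the mechanism used to publish the handle are omitted).  The creating
 * backend might do:
 *
 *		area = dsa_create(my_tranche_id);
 *		dsa_pin(area);			-- survive even with no backends attached
 *		dsa_pin_mapping(area);	-- stay attached until this backend exits
 *		... store dsa_get_handle(area) where other backends can find it ...
 *
 * while other backends later do:
 *
 *		area = dsa_attach(stored_handle);
 *		dsa_pin_mapping(area);
 */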
642
643/*
644 * Allocate memory in this storage area. The return value is a dsa_pointer
645 * that can be passed to other processes, and converted to a local pointer
646 * with dsa_get_address. 'flags' is a bitmap which should be constructed
647 * from the following values:
648 *
649 * DSA_ALLOC_HUGE allows allocations >= 1GB. Otherwise, such allocations
650 * will result in an ERROR.
651 *
652 * DSA_ALLOC_NO_OOM causes this function to return InvalidDsaPointer when
653 * no memory is available or a size limit established by dsa_set_size_limit
654 * would be exceeded. Otherwise, such allocations will result in an ERROR.
655 *
656 * DSA_ALLOC_ZERO causes the allocated memory to be zeroed. Otherwise, the
657 * contents of newly-allocated memory are indeterminate.
658 *
659 * These flags correspond to similarly named flags used by
660 * MemoryContextAllocExtended(). See also the macros dsa_allocate and
661 * dsa_allocate0 which expand to a call to this function with commonly used
662 * flags.
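 *
 * For example (illustrative only), an allocation that prefers to degrade
 * gracefully rather than raise an error might be written as:
 *
 *		dsa_pointer dp = dsa_allocate_extended(area, size,
 *											   DSA_ALLOC_NO_OOM | DSA_ALLOC_ZERO);
 *
 *		if (!DsaPointerIsValid(dp))
 *			... fall back to a smaller request or report the failure ...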
663 */
664dsa_pointer
665dsa_allocate_extended(dsa_area *area, size_t size, int flags)
666{
667 uint16 size_class;
668 dsa_pointer start_pointer;
669 dsa_segment_map *segment_map;
670 dsa_pointer result;
671
672 Assert(size > 0);
673
674 /* Sanity check on huge individual allocation size. */
675 if (((flags & DSA_ALLOC_HUGE) != 0 && !AllocHugeSizeIsValid(size)) ||
676 ((flags & DSA_ALLOC_HUGE) == 0 && !AllocSizeIsValid(size)))
677 elog(ERROR, "invalid DSA memory alloc request size %zu", size);
678
679 /*
680 * If bigger than the largest size class, just grab a run of pages from
681 * the free page manager, instead of allocating an object from a pool.
682 * There will still be a span, but it's a special class of span that
683 * manages this whole allocation and simply gives all pages back to the
684 * free page manager when dsa_free is called.
685 */
686 if (size > dsa_size_classes[lengthof(dsa_size_classes) - 1])
687 {
688 size_t npages = fpm_size_to_pages(size);
689 size_t first_page;
690 dsa_pointer span_pointer;
691 dsa_area_pool *pool = &area->control->pools[DSA_SCLASS_SPAN_LARGE];
692
693 /* Obtain a span object. */
694 span_pointer = alloc_object(area, DSA_SCLASS_BLOCK_OF_SPANS);
695 if (!DsaPointerIsValid(span_pointer))
696 {
697 /* Raise error unless asked not to. */
698 if ((flags & DSA_ALLOC_NO_OOM) == 0)
699 ereport(ERROR,
700 (errcode(ERRCODE_OUT_OF_MEMORY),
701 errmsg("out of memory"),
702 errdetail("Failed on DSA request of size %zu.",
703 size)));
704 return InvalidDsaPointer;
705 }
706
707 LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
708
709 /* Find a segment from which to allocate. */
710 segment_map = get_best_segment(area, npages);
711 if (segment_map == NULL)
712 segment_map = make_new_segment(area, npages);
713 if (segment_map == NULL)
714 {
715 /* Can't make any more segments: game over. */
716 LWLockRelease(DSA_AREA_LOCK(area));
717 dsa_free(area, span_pointer);
718
719 /* Raise error unless asked not to. */
720 if ((flags & DSA_ALLOC_NO_OOM) == 0)
721 ereport(ERROR,
722 (errcode(ERRCODE_OUT_OF_MEMORY),
723 errmsg("out of memory"),
724 errdetail("Failed on DSA request of size %zu.",
725 size)));
726 return InvalidDsaPointer;
727 }
728
729 /*
730 * Ask the free page manager for a run of pages. This should always
731 * succeed, since both get_best_segment and make_new_segment should
732 * only return a non-NULL pointer if it actually contains enough
733 * contiguous freespace. If it does fail, something in our backend
734 * private state is out of whack, so use FATAL to kill the process.
735 */
736 if (!FreePageManagerGet(segment_map->fpm, npages, &first_page))
737 elog(FATAL,
738 "dsa_allocate could not find %zu free pages", npages);
739 LWLockRelease(DSA_AREA_LOCK(area));
740
741 start_pointer = DSA_MAKE_POINTER(get_segment_index(area, segment_map),
742 first_page * FPM_PAGE_SIZE);
743
744 /* Initialize span and pagemap. */
745 LWLockAcquire(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE),
746 LW_EXCLUSIVE);
747 init_span(area, span_pointer, pool, start_pointer, npages,
748 DSA_SCLASS_SPAN_LARGE);
749 segment_map->pagemap[first_page] = span_pointer;
750 LWLockRelease(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE));
751
752 /* Zero-initialize the memory if requested. */
753 if ((flags & DSA_ALLOC_ZERO) != 0)
754 memset(dsa_get_address(area, start_pointer), 0, size);
755
756 return start_pointer;
757 }
758
759 /* Map allocation to a size class. */
760 if (size < lengthof(dsa_size_class_map) * DSA_SIZE_CLASS_MAP_QUANTUM)
761 {
762 int mapidx;
763
764 /* For smaller sizes we have a lookup table... */
765 mapidx = ((size + DSA_SIZE_CLASS_MAP_QUANTUM - 1) /
766 DSA_SIZE_CLASS_MAP_QUANTUM) - 1;
767 size_class = dsa_size_class_map[mapidx];
768 }
769 else
770 {
771 uint16 min;
772 uint16 max;
773
774 /* ... and for the rest we search by binary chop. */
775 min = dsa_size_class_map[lengthof(dsa_size_class_map) - 1];
776 max = lengthof(dsa_size_classes) - 1;
777
778 while (min < max)
779 {
780 uint16 mid = (min + max) / 2;
781 uint16 class_size = dsa_size_classes[mid];
782
783 if (class_size < size)
784 min = mid + 1;
785 else
786 max = mid;
787 }
788
789 size_class = min;
790 }
791 Assert(size <= dsa_size_classes[size_class]);
792 Assert(size_class == 0 || size > dsa_size_classes[size_class - 1]);
793
794 /* Attempt to allocate an object from the appropriate pool. */
795 result = alloc_object(area, size_class);
796
797 /* Check for failure to allocate. */
798 if (!DsaPointerIsValid(result))
799 {
800 /* Raise error unless asked not to. */
801 if ((flags & DSA_ALLOC_NO_OOM) == 0)
802 ereport(ERROR,
803 (errcode(ERRCODE_OUT_OF_MEMORY),
804 errmsg("out of memory"),
805 errdetail("Failed on DSA request of size %zu.", size)));
806 return InvalidDsaPointer;
807 }
808
809 /* Zero-initialize the memory if requested. */
810 if ((flags & DSA_ALLOC_ZERO) != 0)
811 memset(dsa_get_address(area, result), 0, size);
812
813 return result;
814}
815
816/*
817 * Free memory obtained with dsa_allocate.
818 */
819void
820dsa_free(dsa_area *area, dsa_pointer dp)
821{
822 dsa_segment_map *segment_map;
823 int pageno;
824 dsa_pointer span_pointer;
825 dsa_area_span *span;
826 char *superblock;
827 char *object;
828 size_t size;
829 int size_class;
830
831 /* Make sure we don't have a stale segment in the slot 'dp' refers to. */
832 check_for_freed_segments(area);
833
834 /* Locate the object, span and pool. */
835 segment_map = get_segment_by_index(area, DSA_EXTRACT_SEGMENT_NUMBER(dp));
836 pageno = DSA_EXTRACT_OFFSET(dp) / FPM_PAGE_SIZE;
837 span_pointer = segment_map->pagemap[pageno];
838 span = dsa_get_address(area, span_pointer);
839 superblock = dsa_get_address(area, span->start);
840 object = dsa_get_address(area, dp);
841 size_class = span->size_class;
842 size = dsa_size_classes[size_class];
843
844 /*
845 * Special case for large objects that live in a special span: we return
846 * those pages directly to the free page manager and free the span.
847 */
848 if (span->size_class == DSA_SCLASS_SPAN_LARGE)
849 {
850
851#ifdef CLOBBER_FREED_MEMORY
852 memset(object, 0x7f, span->npages * FPM_PAGE_SIZE);
853#endif
854
855 /* Give pages back to free page manager. */
856 LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
857 FreePageManagerPut(segment_map->fpm,
858 DSA_EXTRACT_OFFSET(span->start) / FPM_PAGE_SIZE,
859 span->npages);
860 LWLockRelease(DSA_AREA_LOCK(area));
861 /* Unlink span. */
862 LWLockAcquire(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE),
863 LW_EXCLUSIVE);
864 unlink_span(area, span);
865 LWLockRelease(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE));
866 /* Free the span object so it can be reused. */
867 dsa_free(area, span_pointer);
868 return;
869 }
870
871#ifdef CLOBBER_FREED_MEMORY
872 memset(object, 0x7f, size);
873#endif
874
875 LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
876
877 /* Put the object on the span's freelist. */
878 Assert(object >= superblock);
879 Assert(object < superblock + DSA_SUPERBLOCK_SIZE);
880 Assert((object - superblock) % size == 0);
881 NextFreeObjectIndex(object) = span->firstfree;
882 span->firstfree = (object - superblock) / size;
883 ++span->nallocatable;
884
885 /*
886 * See if the span needs to be moved to a different fullness class, or be
887 * freed so its pages can be given back to the segment.
888 */
889 if (span->nallocatable == 1 && span->fclass == DSA_FULLNESS_CLASSES - 1)
890 {
891 /*
892 * The block was completely full and is located in the
893 * highest-numbered fullness class, which is never scanned for free
894 * chunks. We must move it to the next-lower fullness class.
895 */
896 unlink_span(area, span);
897 add_span_to_fullness_class(area, span, span_pointer,
898 DSA_FULLNESS_CLASSES - 2);
899
900 /*
901 * If this is the only span, and there is no active span, then we
902 * should probably move this span to fullness class 1. (Otherwise if
903 * you allocate exactly all the objects in the only span, it moves to
904 * class 3, then you free them all, it moves to 2, and then is given
905 * back, leaving no active span).
906 */
907 }
908 else if (span->nallocatable == span->nmax &&
909 (span->fclass != 1 || span->prevspan != InvalidDsaPointer))
910 {
911 /*
912 * This entire block is free, and it's not the active block for this
913 * size class. Return the memory to the free page manager. We don't
914 * do this for the active block to prevent hysteresis: if we
915 * repeatedly allocate and free the only chunk in the active block, it
916 * will be very inefficient if we deallocate and reallocate the block
917 * every time.
918 */
919 destroy_superblock(area, span_pointer);
920 }
921
922 LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
923}
924
925/*
926 * Obtain a backend-local address for a dsa_pointer. 'dp' must point to
927 * memory allocated by the given area (possibly in another process) that
928 * hasn't yet been freed. This may cause a segment to be mapped into the
929 * current process if required, and may cause freed segments to be unmapped.
930 */
931void *
932dsa_get_address(dsa_area *area, dsa_pointer dp)
933{
934 dsa_segment_index index;
935 size_t offset;
936
937 /* Convert InvalidDsaPointer to NULL. */
938 if (!DsaPointerIsValid(dp))
939 return NULL;
940
941 /* Process any requests to detach from freed segments. */
942 check_for_freed_segments(area);
943
944 /* Break the dsa_pointer into its components. */
945 index = DSA_EXTRACT_SEGMENT_NUMBER(dp);
946 offset = DSA_EXTRACT_OFFSET(dp);
947 Assert(index < DSA_MAX_SEGMENTS);
948
949 /* Check if we need to cause this segment to be mapped in. */
950 if (unlikely(area->segment_maps[index].mapped_address == NULL))
951 {
952 /* Call for effect (we don't need the result). */
953 get_segment_by_index(area, index);
954 }
955
956 return area->segment_maps[index].mapped_address + offset;
957}
958
959/*
960 * Pin this area, so that it will continue to exist even if all backends
961 * detach from it. In that case, the area can still be reattached to if a
962 * handle has been recorded somewhere.
963 */
964void
965dsa_pin(dsa_area *area)
966{
967 LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
968 if (area->control->pinned)
969 {
970 LWLockRelease(DSA_AREA_LOCK(area));
971 elog(ERROR, "dsa_area already pinned");
972 }
973 area->control->pinned = true;
974 ++area->control->refcnt;
975 LWLockRelease(DSA_AREA_LOCK(area));
976}
977
978/*
979 * Undo the effects of dsa_pin, so that the given area can be freed when no
980 * backends are attached to it. May be called only if dsa_pin has been
981 * called.
982 */
983void
984dsa_unpin(dsa_area *area)
985{
986 LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
987 Assert(area->control->refcnt > 1);
988 if (!area->control->pinned)
989 {
990 LWLockRelease(DSA_AREA_LOCK(area));
991 elog(ERROR, "dsa_area not pinned");
992 }
993 area->control->pinned = false;
994 --area->control->refcnt;
995 LWLockRelease(DSA_AREA_LOCK(area));
996}
997
998/*
999 * Set the total size limit for this area. This limit is checked whenever new
1000 * segments need to be allocated from the operating system. If the new size
1001 * limit is already exceeded, this has no immediate effect.
1002 *
1003 * Note that the total virtual memory usage may be temporarily larger than
1004 * this limit when segments have been freed, but not yet detached by all
1005 * backends that have attached to them.
1006 */
1007void
1008dsa_set_size_limit(dsa_area *area, size_t limit)
1009{
1010 LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1011 area->control->max_total_segment_size = limit;
1012 LWLockRelease(DSA_AREA_LOCK(area));
1013}
1014
1015/*
1016 * Aggressively free all spare memory in the hope of returning DSM segments to
1017 * the operating system.
1018 */
1019void
1020dsa_trim(dsa_area *area)
1021{
1022 int size_class;
1023
1024 /*
1025 * Trim in reverse pool order so we get to the spans-of-spans last, just
1026 * in case any become entirely free while processing all the other pools.
1027 */
1028 for (size_class = DSA_NUM_SIZE_CLASSES - 1; size_class >= 0; --size_class)
1029 {
1030 dsa_area_pool *pool = &area->control->pools[size_class];
1031 dsa_pointer span_pointer;
1032
1033 if (size_class == DSA_SCLASS_SPAN_LARGE)
1034 {
1035 /* Large object frees give back segments aggressively already. */
1036 continue;
1037 }
1038
1039 /*
1040 * Search fullness class 1 only. That is where we expect to find an
1041 * entirely empty superblock (entirely empty superblocks in other
1042 * fullness classes are returned to the free page map by dsa_free).
1043 */
1044 LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
1045 span_pointer = pool->spans[1];
1046 while (DsaPointerIsValid(span_pointer))
1047 {
1048 dsa_area_span *span = dsa_get_address(area, span_pointer);
1049 dsa_pointer next = span->nextspan;
1050
1051 if (span->nallocatable == span->nmax)
1052 destroy_superblock(area, span_pointer);
1053
1054 span_pointer = next;
1055 }
1056 LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
1057 }
1058}
1059
1060/*
1061 * Print out debugging information about the internal state of the shared
1062 * memory area.
1063 */
1064void
1065dsa_dump(dsa_area *area)
1066{
1067 size_t i,
1068 j;
1069
1070 /*
1071 * Note: This gives an inconsistent snapshot as it acquires and releases
1072 * individual locks as it goes...
1073 */
1074
1075 LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1076 check_for_freed_segments_locked(area);
1077 fprintf(stderr, "dsa_area handle %x:\n", area->control->handle);
1078 fprintf(stderr, " max_total_segment_size: %zu\n",
1079 area->control->max_total_segment_size);
1080 fprintf(stderr, " total_segment_size: %zu\n",
1081 area->control->total_segment_size);
1082 fprintf(stderr, " refcnt: %d\n", area->control->refcnt);
1083 fprintf(stderr, " pinned: %c\n", area->control->pinned ? 't' : 'f');
1084 fprintf(stderr, " segment bins:\n");
1085 for (i = 0; i < DSA_NUM_SEGMENT_BINS; ++i)
1086 {
1087 if (area->control->segment_bins[i] != DSA_SEGMENT_INDEX_NONE)
1088 {
1089 dsa_segment_index segment_index;
1090
1091 fprintf(stderr,
1092 " segment bin %zu (at least %d contiguous pages free):\n",
1093 i, 1 << (i - 1));
1094 segment_index = area->control->segment_bins[i];
1095 while (segment_index != DSA_SEGMENT_INDEX_NONE)
1096 {
1097 dsa_segment_map *segment_map;
1098
1099 segment_map =
1100 get_segment_by_index(area, segment_index);
1101
1102 fprintf(stderr,
1103 " segment index %zu, usable_pages = %zu, "
1104 "contiguous_pages = %zu, mapped at %p\n",
1105 segment_index,
1106 segment_map->header->usable_pages,
1107 fpm_largest(segment_map->fpm),
1108 segment_map->mapped_address);
1109 segment_index = segment_map->header->next;
1110 }
1111 }
1112 }
1113 LWLockRelease(DSA_AREA_LOCK(area));
1114
1115 fprintf(stderr, " pools:\n");
1116 for (i = 0; i < DSA_NUM_SIZE_CLASSES; ++i)
1117 {
1118 bool found = false;
1119
1120 LWLockAcquire(DSA_SCLASS_LOCK(area, i), LW_EXCLUSIVE);
1121 for (j = 0; j < DSA_FULLNESS_CLASSES; ++j)
1122 if (DsaPointerIsValid(area->control->pools[i].spans[j]))
1123 found = true;
1124 if (found)
1125 {
1126 if (i == DSA_SCLASS_BLOCK_OF_SPANS)
1127 fprintf(stderr, " pool for blocks of span objects:\n");
1128 else if (i == DSA_SCLASS_SPAN_LARGE)
1129 fprintf(stderr, " pool for large object spans:\n");
1130 else
1131 fprintf(stderr,
1132 " pool for size class %zu (object size %hu bytes):\n",
1133 i, dsa_size_classes[i]);
1134 for (j = 0; j < DSA_FULLNESS_CLASSES; ++j)
1135 {
1136 if (!DsaPointerIsValid(area->control->pools[i].spans[j]))
1137 fprintf(stderr, " fullness class %zu is empty\n", j);
1138 else
1139 {
1140 dsa_pointer span_pointer = area->control->pools[i].spans[j];
1141
1142 fprintf(stderr, " fullness class %zu:\n", j);
1143 while (DsaPointerIsValid(span_pointer))
1144 {
1145 dsa_area_span *span;
1146
1147 span = dsa_get_address(area, span_pointer);
1148 fprintf(stderr,
1149 " span descriptor at "
1150 DSA_POINTER_FORMAT ", superblock at "
1151 DSA_POINTER_FORMAT
1152 ", pages = %zu, objects free = %hu/%hu\n",
1153 span_pointer, span->start, span->npages,
1154 span->nallocatable, span->nmax);
1155 span_pointer = span->nextspan;
1156 }
1157 }
1158 }
1159 }
1160 LWLockRelease(DSA_SCLASS_LOCK(area, i));
1161 }
1162}
1163
1164/*
1165 * Return the smallest size that you can successfully provide to
1166 * dsa_create_in_place.
1167 */
1168size_t
1169dsa_minimum_size(void)
1170{
1171 size_t size;
1172 int pages = 0;
1173
1174 size = MAXALIGN(sizeof(dsa_area_control)) +
1175 MAXALIGN(sizeof(FreePageManager));
1176
1177 /* Figure out how many pages we need, including the page map... */
1178 while (((size + FPM_PAGE_SIZE - 1) / FPM_PAGE_SIZE) > pages)
1179 {
1180 ++pages;
1181 size += sizeof(dsa_pointer);
1182 }
1183
1184 return pages * FPM_PAGE_SIZE;
1185}
1186
1187/*
1188 * Workhorse function for dsa_create and dsa_create_in_place.
1189 */
1190static dsa_area *
1191create_internal(void *place, size_t size,
1192 int tranche_id,
1193 dsm_handle control_handle,
1194 dsm_segment *control_segment)
1195{
1196 dsa_area_control *control;
1197 dsa_area *area;
1198 dsa_segment_map *segment_map;
1199 size_t usable_pages;
1200 size_t total_pages;
1201 size_t metadata_bytes;
1202 int i;
1203
1204 /* Sanity check on the space we have to work in. */
1205 if (size < dsa_minimum_size())
1206 elog(ERROR, "dsa_area space must be at least %zu, but %zu provided",
1207 dsa_minimum_size(), size);
1208
1209 /* Now figure out how much space is usable */
1210 total_pages = size / FPM_PAGE_SIZE;
1211 metadata_bytes =
1212 MAXALIGN(sizeof(dsa_area_control)) +
1213 MAXALIGN(sizeof(FreePageManager)) +
1214 total_pages * sizeof(dsa_pointer);
1215 /* Add padding up to next page boundary. */
1216 if (metadata_bytes % FPM_PAGE_SIZE != 0)
1217 metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
1218 Assert(metadata_bytes <= size);
1219 usable_pages = (size - metadata_bytes) / FPM_PAGE_SIZE;
1220
1221 /*
1222 * Initialize the dsa_area_control object located at the start of the
1223 * space.
1224 */
1225 control = (dsa_area_control *) place;
1226 control->segment_header.magic =
1227 DSA_SEGMENT_HEADER_MAGIC ^ control_handle ^ 0;
1228 control->segment_header.next = DSA_SEGMENT_INDEX_NONE;
1229 control->segment_header.prev = DSA_SEGMENT_INDEX_NONE;
1230 control->segment_header.usable_pages = usable_pages;
1231 control->segment_header.freed = false;
1232 control->segment_header.size = DSA_INITIAL_SEGMENT_SIZE;
1233 control->handle = control_handle;
1234 control->max_total_segment_size = (size_t) -1;
1235 control->total_segment_size = size;
1236 memset(&control->segment_handles[0], 0,
1237 sizeof(dsm_handle) * DSA_MAX_SEGMENTS);
1238 control->segment_handles[0] = control_handle;
1239 for (i = 0; i < DSA_NUM_SEGMENT_BINS; ++i)
1240 control->segment_bins[i] = DSA_SEGMENT_INDEX_NONE;
1241 control->high_segment_index = 0;
1242 control->refcnt = 1;
1243 control->freed_segment_counter = 0;
1244 control->lwlock_tranche_id = tranche_id;
1245
1246 /*
1247 * Create the dsa_area object that this backend will use to access the
1248 * area. Other backends will need to obtain their own dsa_area object by
1249 * attaching.
1250 */
1251 area = palloc(sizeof(dsa_area));
1252 area->control = control;
1253 area->mapping_pinned = false;
1254 memset(area->segment_maps, 0, sizeof(dsa_segment_map) * DSA_MAX_SEGMENTS);
1255 area->high_segment_index = 0;
1256 area->freed_segment_counter = 0;
1257 LWLockInitialize(&control->lock, control->lwlock_tranche_id);
1258 for (i = 0; i < DSA_NUM_SIZE_CLASSES; ++i)
1259 LWLockInitialize(DSA_SCLASS_LOCK(area, i),
1260 control->lwlock_tranche_id);
1261
1262 /* Set up the segment map for this process's mapping. */
1263 segment_map = &area->segment_maps[0];
1264 segment_map->segment = control_segment;
1265 segment_map->mapped_address = place;
1266 segment_map->header = (dsa_segment_header *) place;
1267 segment_map->fpm = (FreePageManager *)
1268 (segment_map->mapped_address +
1269 MAXALIGN(sizeof(dsa_area_control)));
1270 segment_map->pagemap = (dsa_pointer *)
1271 (segment_map->mapped_address +
1272 MAXALIGN(sizeof(dsa_area_control)) +
1273 MAXALIGN(sizeof(FreePageManager)));
1274
1275 /* Set up the free page map. */
1276 FreePageManagerInitialize(segment_map->fpm, segment_map->mapped_address);
1277 /* There can be 0 usable pages if size is dsa_minimum_size(). */
1278
1279 if (usable_pages > 0)
1280 FreePageManagerPut(segment_map->fpm, metadata_bytes / FPM_PAGE_SIZE,
1281 usable_pages);
1282
1283 /* Put this segment into the appropriate bin. */
1284 control->segment_bins[contiguous_pages_to_segment_bin(usable_pages)] = 0;
1285 segment_map->header->bin = contiguous_pages_to_segment_bin(usable_pages);
1286
1287 return area;
1288}
1289
1290/*
1291 * Workhorse function for dsa_attach and dsa_attach_in_place.
1292 */
1293static dsa_area *
1294attach_internal(void *place, dsm_segment *segment, dsa_handle handle)
1295{
1296 dsa_area_control *control;
1297 dsa_area *area;
1298 dsa_segment_map *segment_map;
1299
1300 control = (dsa_area_control *) place;
1301 Assert(control->handle == handle);
1302 Assert(control->segment_handles[0] == handle);
1303 Assert(control->segment_header.magic ==
1304 (DSA_SEGMENT_HEADER_MAGIC ^ handle ^ 0));
1305
1306 /* Build the backend-local area object. */
1307 area = palloc(sizeof(dsa_area));
1308 area->control = control;
1309 area->mapping_pinned = false;
1310 memset(&area->segment_maps[0], 0,
1311 sizeof(dsa_segment_map) * DSA_MAX_SEGMENTS);
1312 area->high_segment_index = 0;
1313
1314 /* Set up the segment map for this process's mapping. */
1315 segment_map = &area->segment_maps[0];
1316 segment_map->segment = segment; /* NULL for in-place */
1317 segment_map->mapped_address = place;
1318 segment_map->header = (dsa_segment_header *) segment_map->mapped_address;
1319 segment_map->fpm = (FreePageManager *)
1320 (segment_map->mapped_address + MAXALIGN(sizeof(dsa_area_control)));
1321 segment_map->pagemap = (dsa_pointer *)
1322 (segment_map->mapped_address + MAXALIGN(sizeof(dsa_area_control)) +
1323 MAXALIGN(sizeof(FreePageManager)));
1324
1325 /* Bump the reference count. */
1326 LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1327 if (control->refcnt == 0)
1328 {
1329 /* We can't attach to a DSA area that has already been destroyed. */
1330 ereport(ERROR,
1331 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1332 errmsg("could not attach to dynamic shared area")));
1333 }
1334 ++control->refcnt;
1335 area->freed_segment_counter = area->control->freed_segment_counter;
1336 LWLockRelease(DSA_AREA_LOCK(area));
1337
1338 return area;
1339}
1340
1341/*
1342 * Add a new span to fullness class 1 of the indicated pool.
1343 */
1344static void
1345init_span(dsa_area *area,
1346 dsa_pointer span_pointer,
1347 dsa_area_pool *pool, dsa_pointer start, size_t npages,
1348 uint16 size_class)
1349{
1350 dsa_area_span *span = dsa_get_address(area, span_pointer);
1351 size_t obsize = dsa_size_classes[size_class];
1352
1353 /*
1354 * The per-pool lock must be held because we manipulate the span list for
1355 * this pool.
1356 */
1357 Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1358
1359 /* Push this span onto the front of the span list for fullness class 1. */
1360 if (DsaPointerIsValid(pool->spans[1]))
1361 {
1362 dsa_area_span *head = (dsa_area_span *)
1363 dsa_get_address(area, pool->spans[1]);
1364
1365 head->prevspan = span_pointer;
1366 }
1367 span->pool = DsaAreaPoolToDsaPointer(area, pool);
1368 span->nextspan = pool->spans[1];
1369 span->prevspan = InvalidDsaPointer;
1370 pool->spans[1] = span_pointer;
1371
1372 span->start = start;
1373 span->npages = npages;
1374 span->size_class = size_class;
1375 span->ninitialized = 0;
1376 if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
1377 {
1378 /*
1379 * A block-of-spans contains its own descriptor, so mark one object as
1380 * initialized and reduce the count of allocatable objects by one.
1381 * Doing this here has the side effect of also reducing nmax by one,
1382 * which is important to make sure we free this object at the correct
1383 * time.
1384 */
1385 span->ninitialized = 1;
1386 span->nallocatable = FPM_PAGE_SIZE / obsize - 1;
1387 }
1388 else if (size_class != DSA_SCLASS_SPAN_LARGE)
1389 span->nallocatable = DSA_SUPERBLOCK_SIZE / obsize;
1390 span->firstfree = DSA_SPAN_NOTHING_FREE;
1391 span->nmax = span->nallocatable;
1392 span->fclass = 1;
1393}
1394
1395/*
1396 * Transfer the first span in one fullness class to the head of another
1397 * fullness class.
1398 */
1399static bool
1400transfer_first_span(dsa_area *area,
1401 dsa_area_pool *pool, int fromclass, int toclass)
1402{
1403 dsa_pointer span_pointer;
1404 dsa_area_span *span;
1405 dsa_area_span *nextspan;
1406
1407 /* Can't do it if source list is empty. */
1408 span_pointer = pool->spans[fromclass];
1409 if (!DsaPointerIsValid(span_pointer))
1410 return false;
1411
1412 /* Remove span from head of source list. */
1413 span = dsa_get_address(area, span_pointer);
1414 pool->spans[fromclass] = span->nextspan;
1415 if (DsaPointerIsValid(span->nextspan))
1416 {
1417 nextspan = (dsa_area_span *)
1418 dsa_get_address(area, span->nextspan);
1419 nextspan->prevspan = InvalidDsaPointer;
1420 }
1421
1422 /* Add span to head of target list. */
1423 span->nextspan = pool->spans[toclass];
1424 pool->spans[toclass] = span_pointer;
1425 if (DsaPointerIsValid(span->nextspan))
1426 {
1427 nextspan = (dsa_area_span *)
1428 dsa_get_address(area, span->nextspan);
1429 nextspan->prevspan = span_pointer;
1430 }
1431 span->fclass = toclass;
1432
1433 return true;
1434}
1435
1436/*
1437 * Allocate one object of the requested size class from the given area.
1438 */
1439static inline dsa_pointer
1440alloc_object(dsa_area *area, int size_class)
1441{
1442 dsa_area_pool *pool = &area->control->pools[size_class];
1443 dsa_area_span *span;
1444 dsa_pointer block;
1445 dsa_pointer result;
1446 char *object;
1447 size_t size;
1448
1449 /*
1450 * Even though ensure_active_superblock can in turn call alloc_object if
1451 * it needs to allocate a new span, that's always from a different pool,
1452 * and the order of lock acquisition is always the same, so it's OK that
1453 * we hold this lock for the duration of this function.
1454 */
1455 Assert(!LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1456 LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
1457
1458 /*
1459 * If there's no active superblock, we must successfully obtain one or
1460 * fail the request.
1461 */
1462 if (!DsaPointerIsValid(pool->spans[1]) &&
1463 !ensure_active_superblock(area, pool, size_class))
1464 {
1465 result = InvalidDsaPointer;
1466 }
1467 else
1468 {
1469 /*
1470 * There should be a block in fullness class 1 at this point, and it
1471 * should never be completely full. Thus we can either pop an object
1472 * from the free list or, failing that, initialize a new object.
1473 */
1474 Assert(DsaPointerIsValid(pool->spans[1]));
1475 span = (dsa_area_span *)
1476 dsa_get_address(area, pool->spans[1]);
1477 Assert(span->nallocatable > 0);
1478 block = span->start;
1479 Assert(size_class < DSA_NUM_SIZE_CLASSES);
1480 size = dsa_size_classes[size_class];
1481 if (span->firstfree != DSA_SPAN_NOTHING_FREE)
1482 {
1483 result = block + span->firstfree * size;
1484 object = dsa_get_address(area, result);
1485 span->firstfree = NextFreeObjectIndex(object);
1486 }
1487 else
1488 {
1489 result = block + span->ninitialized * size;
1490 ++span->ninitialized;
1491 }
1492 --span->nallocatable;
1493
1494 /* If it's now full, move it to the highest-numbered fullness class. */
1495 if (span->nallocatable == 0)
1496 transfer_first_span(area, pool, 1, DSA_FULLNESS_CLASSES - 1);
1497 }
1498
1499 Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1500 LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
1501
1502 return result;
1503}
1504
1505/*
1506 * Ensure an active (i.e. fullness class 1) superblock, unless all existing
1507 * superblocks are completely full and no more can be allocated.
1508 *
1509 * Fullness classes K of 0..N are loosely intended to represent blocks whose
1510 * utilization percentage is at least K/N, but we only enforce this rigorously
1511 * for the highest-numbered fullness class, which always contains exactly
1512 * those blocks that are completely full. It's otherwise acceptable for a
1513 * block to be in a higher-numbered fullness class than the one to which it
1514 * logically belongs. In addition, the active block, which is always the
1515 * first block in fullness class 1, is permitted to have a higher allocation
1516 * percentage than would normally be allowable for that fullness class; we
1517 * don't move it until it's completely full, and then it goes to the
1518 * highest-numbered fullness class.
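 *
 * As a concrete illustration (assuming DSA_FULLNESS_CLASSES == 4), the
 * rebinning computation below places a superblock holding at most 8 objects
 * in class (8 - nallocatable) * 3 / 8: a completely empty block lands in
 * class 0, a block with 5 of its 8 objects allocated lands in class 1, and
 * only completely full blocks land in class 3.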
1519 *
1520 * It might seem odd that the active block is the head of fullness class 1
1521 * rather than fullness class 0, but experience with other allocators has
1522 * shown that it's usually better to allocate from a block that's moderately
1523 * full rather than one that's nearly empty. Insofar as is reasonably
1524 * possible, we want to avoid performing new allocations in a block that would
1525 * otherwise become empty soon.
1526 */
1527static bool
1528ensure_active_superblock(dsa_area *area, dsa_area_pool *pool,
1529 int size_class)
1530{
1531 dsa_pointer span_pointer;
1532 dsa_pointer start_pointer;
1533 size_t obsize = dsa_size_classes[size_class];
1534 size_t nmax;
1535 int fclass;
1536 size_t npages = 1;
1537 size_t first_page;
1538 size_t i;
1539 dsa_segment_map *segment_map;
1540
1541 Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1542
1543 /*
1544 * Compute the number of objects that will fit in a block of this size
1545 * class. Span-of-spans blocks are just a single page, and the first
1546 * object isn't available for use because it describes the block-of-spans
1547 * itself.
1548 */
1549 if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
1550 nmax = FPM_PAGE_SIZE / obsize - 1;
1551 else
1552 nmax = DSA_SUPERBLOCK_SIZE / obsize;
1553
1554 /*
1555 * If fullness class 1 is empty, try to find a span to put in it by
1556 * scanning higher-numbered fullness classes (excluding the last one,
1557 * whose blocks are certain to all be completely full).
1558 */
1559 for (fclass = 2; fclass < DSA_FULLNESS_CLASSES - 1; ++fclass)
1560 {
1561 span_pointer = pool->spans[fclass];
1562
1563 while (DsaPointerIsValid(span_pointer))
1564 {
1565 int tfclass;
1566 dsa_area_span *span;
1567 dsa_area_span *nextspan;
1568 dsa_area_span *prevspan;
1569 dsa_pointer next_span_pointer;
1570
1571 span = (dsa_area_span *)
1572 dsa_get_address(area, span_pointer);
1573 next_span_pointer = span->nextspan;
1574
1575 /* Figure out what fullness class should contain this span. */
1576 tfclass = (nmax - span->nallocatable)
1577 * (DSA_FULLNESS_CLASSES - 1) / nmax;
1578
1579 /* Look up next span. */
1580 if (DsaPointerIsValid(span->nextspan))
1581 nextspan = (dsa_area_span *)
1582 dsa_get_address(area, span->nextspan);
1583 else
1584 nextspan = NULL;
1585
1586 /*
1587 * If utilization has dropped enough that this now belongs in some
1588 * other fullness class, move it there.
1589 */
1590 if (tfclass < fclass)
1591 {
1592 /* Remove from the current fullness class list. */
1593 if (pool->spans[fclass] == span_pointer)
1594 {
1595 /* It was the head; remove it. */
1596 Assert(!DsaPointerIsValid(span->prevspan));
1597 pool->spans[fclass] = span->nextspan;
1598 if (nextspan != NULL)
1599 nextspan->prevspan = InvalidDsaPointer;
1600 }
1601 else
1602 {
1603 /* It was not the head. */
1604 Assert(DsaPointerIsValid(span->prevspan));
1605 prevspan = (dsa_area_span *)
1606 dsa_get_address(area, span->prevspan);
1607 prevspan->nextspan = span->nextspan;
1608 }
1609 if (nextspan != NULL)
1610 nextspan->prevspan = span->prevspan;
1611
1612 /* Push onto the head of the new fullness class list. */
1613 span->nextspan = pool->spans[tfclass];
1614 pool->spans[tfclass] = span_pointer;
1615 span->prevspan = InvalidDsaPointer;
1616 if (DsaPointerIsValid(span->nextspan))
1617 {
1618 nextspan = (dsa_area_span *)
1619 dsa_get_address(area, span->nextspan);
1620 nextspan->prevspan = span_pointer;
1621 }
1622 span->fclass = tfclass;
1623 }
1624
1625 /* Advance to next span on list. */
1626 span_pointer = next_span_pointer;
1627 }
1628
1629 /* Stop now if we found a suitable block. */
1630 if (DsaPointerIsValid(pool->spans[1]))
1631 return true;
1632 }
1633
1634 /*
1635 * If there are no blocks that properly belong in fullness class 1, pick
1636 * one from some other fullness class and move it there anyway, so that we
1637 * have an allocation target. Our last choice is to transfer a block
1638 * that's almost empty (and might become completely empty soon if left
1639 * alone), but even that is better than failing, which is what we must do
1640 * if there are no blocks at all with freespace.
1641 */
1642 Assert(!DsaPointerIsValid(pool->spans[1]));
1643 for (fclass = 2; fclass < DSA_FULLNESS_CLASSES - 1; ++fclass)
1644 if (transfer_first_span(area, pool, fclass, 1))
1645 return true;
1646 if (!DsaPointerIsValid(pool->spans[1]) &&
1647 transfer_first_span(area, pool, 0, 1))
1648 return true;
1649
1650 /*
1651 * We failed to find an existing span with free objects, so we need to
1652 * allocate a new superblock and construct a new span to manage it.
1653 *
1654 * First, get a dsa_area_span object to describe the new superblock
1655 * ... unless this allocation is for a dsa_area_span object, in which case
1656 * that's surely not going to work. We handle that case by storing the
1657 * span describing a block-of-spans inline.
1658 */
1659 if (size_class != DSA_SCLASS_BLOCK_OF_SPANS)
1660 {
1661 span_pointer = alloc_object(area, DSA_SCLASS_BLOCK_OF_SPANS);
1662 if (!DsaPointerIsValid(span_pointer))
1663 return false;
1664 npages = DSA_PAGES_PER_SUPERBLOCK;
1665 }
1666
1667 /* Find or create a segment and allocate the superblock. */
1668 LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1669 segment_map = get_best_segment(area, npages);
1670 if (segment_map == NULL)
1671 {
1672 segment_map = make_new_segment(area, npages);
1673 if (segment_map == NULL)
1674 {
1675 LWLockRelease(DSA_AREA_LOCK(area));
1676 return false;
1677 }
1678 }
1679
1680 /*
1681 * This shouldn't happen: get_best_segment() or make_new_segment()
1682 * promised that we can successfully allocate npages.
1683 */
1684 if (!FreePageManagerGet(segment_map->fpm, npages, &first_page))
1685 elog(FATAL,
1686 "dsa_allocate could not find %zu free pages for superblock",
1687 npages);
1688 LWLockRelease(DSA_AREA_LOCK(area));
1689
1690 /* Compute the start of the superblock. */
1691 start_pointer =
1692 DSA_MAKE_POINTER(get_segment_index(area, segment_map),
1693 first_page * FPM_PAGE_SIZE);
1694
1695 /*
1696 * If this is a block-of-spans, carve the descriptor right out of the
1697 * allocated space.
1698 */
1699 if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
1700 {
1701 /*
1702 * We have a pointer into the segment. We need to build a dsa_pointer
1703 * from the segment index and offset into the segment.
1704 */
1705 span_pointer = start_pointer;
1706 }
1707
1708 /* Initialize span and pagemap. */
1709 init_span(area, span_pointer, pool, start_pointer, npages, size_class);
1710 for (i = 0; i < npages; ++i)
1711 segment_map->pagemap[first_page + i] = span_pointer;
1712
1713 return true;
1714}
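
/*
 * A worked example of the re-binning arithmetic above (hypothetical numbers;
 * this assumes DSA_FULLNESS_CLASSES is 4, so classes represent thirds):
 * consider a size class whose superblocks hold nmax = 128 objects.  A span
 * with span->nallocatable == 40 has 88 objects in use, so
 *
 *     tfclass = (128 - 40) * (4 - 1) / 128 = 264 / 128 = 2
 *
 * and the span logically belongs in fullness class 2 (at least 2/3 full, but
 * not completely full).  Only a span with nallocatable == 0 lands in the
 * highest class, and class 1 is where the next allocation will come from.
 */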
1715
1716/*
1717 * Return the segment map corresponding to a given segment index, mapping the
1718 * segment in if necessary. For internal segment book-keeping, this is called
1719 * with the area lock held. It is also called by dsa_free and dsa_get_address
1720 * without any locking, relying on the fact that they have a known live
1721 * segment index and that they always call check_for_freed_segments to ensure
1722 * that any freed segment occupying the same slot is detached first.
1723 */
1724static dsa_segment_map *
1725get_segment_by_index(dsa_area *area, dsa_segment_index index)
1726{
1727 if (unlikely(area->segment_maps[index].mapped_address == NULL))
1728 {
1729 dsm_handle handle;
1730 dsm_segment *segment;
1731 dsa_segment_map *segment_map;
1732
1733 /*
1734 * If we are reached by dsa_free or dsa_get_address, there must be at
1735 * least one object allocated in the referenced segment. Otherwise,
1736 * their caller has a double-free or access-after-free bug, which we
1737 * have no hope of detecting. So we know it's safe to access this
1738 * array slot without holding a lock; it won't change underneath us.
1739 * Furthermore, we know that we can see the latest contents of the
1740 * slot, as explained in check_for_freed_segments, which those
1741 * functions call before arriving here.
1742 */
1743 handle = area->control->segment_handles[index];
1744
1745 /* It's an error to try to access an unused slot. */
1746 if (handle == DSM_HANDLE_INVALID)
1747 elog(ERROR,
1748 "dsa_area could not attach to a segment that has been freed");
1749
1750 segment = dsm_attach(handle);
1751 if (segment == NULL)
1752 elog(ERROR, "dsa_area could not attach to segment");
1753 if (area->mapping_pinned)
1754 dsm_pin_mapping(segment);
1755 segment_map = &area->segment_maps[index];
1756 segment_map->segment = segment;
1757 segment_map->mapped_address = dsm_segment_address(segment);
1758 segment_map->header =
1759 (dsa_segment_header *) segment_map->mapped_address;
1760 segment_map->fpm = (FreePageManager *)
1761 (segment_map->mapped_address +
1762 MAXALIGN(sizeof(dsa_segment_header)));
1763 segment_map->pagemap = (dsa_pointer *)
1764 (segment_map->mapped_address +
1765 MAXALIGN(sizeof(dsa_segment_header)) +
1766 MAXALIGN(sizeof(FreePageManager)));
1767
1768 /* Remember the highest index this backend has ever mapped. */
1769 if (area->high_segment_index < index)
1770 area->high_segment_index = index;
1771
1772 Assert(segment_map->header->magic ==
1773 (DSA_SEGMENT_HEADER_MAGIC ^ area->control->handle ^ index));
1774 }
1775
1776 /*
1777 * Callers of dsa_get_address() and dsa_free() don't hold the area lock,
1778 * but it's a bug in the calling code and undefined behavior if the
1779 * address is not live (ie if the segment might possibly have been freed,
1780 * they're trying to use a dangling pointer).
1781 *
1782 * For dsa.c code that holds the area lock to manipulate segment_bins
1783 * lists, it would be a bug if we ever reach a freed segment here. After
1784 * it's marked as freed, the only thing any backend should do with it is
1785 * unmap it, and it should always have done that in
1786 * check_for_freed_segments_locked() before arriving here to resolve an
1787 * index to a segment_map.
1788 *
1789 * Either way we can assert that we aren't returning a freed segment.
1790 */
1791 Assert(!area->segment_maps[index].header->freed);
1792
1793 return &area->segment_maps[index];
1794}
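
/*
 * For illustration, the pointer resolution that depends on this function
 * looks roughly like the following (a sketch of what dsa_get_address() does;
 * see that function earlier in this file for the real thing):
 *
 *     check_for_freed_segments(area);
 *     index  = DSA_EXTRACT_SEGMENT_NUMBER(dp);
 *     offset = DSA_EXTRACT_OFFSET(dp);
 *     if (area->segment_maps[index].mapped_address == NULL)
 *         get_segment_by_index(area, index);    -- map the DSM segment in
 *     return area->segment_maps[index].mapped_address + offset;
 *
 * The segment index and offset were packed into the dsa_pointer with
 * DSA_MAKE_POINTER() when the memory was carved out of a segment, as in
 * ensure_active_superblock() above.
 */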
1795
1796/*
1797 * Return a superblock to the free page manager. If the underlying segment
1798 * has become entirely free, then return it to the operating system.
1799 *
1800 * The appropriate pool lock must be held.
1801 */
1802static void
1803destroy_superblock(dsa_area *area, dsa_pointer span_pointer)
1804{
1805 dsa_area_span *span = dsa_get_address(area, span_pointer);
1806 int size_class = span->size_class;
1807 dsa_segment_map *segment_map;
1808
1810 /* Remove it from its fullness class list. */
1811 unlink_span(area, span);
1812
1813 /*
1814 * Note: Here we acquire the area lock while we already hold a per-pool
1815 * lock. We never hold the area lock and then take a pool lock, or we
1816 * could deadlock.
1817 */
1818 LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1819 check_for_freed_segments_locked(area);
1820 segment_map =
1821 get_segment_by_index(area, DSA_EXTRACT_SEGMENT_NUMBER(span->start));
1822 FreePageManagerPut(segment_map->fpm,
1823 DSA_EXTRACT_OFFSET(span->start) / FPM_PAGE_SIZE,
1824 span->npages);
1825 /* Check if the segment is now entirely free. */
1826 if (fpm_largest(segment_map->fpm) == segment_map->header->usable_pages)
1827 {
1828 dsa_segment_index index = get_segment_index(area, segment_map);
1829
1830 /* If it's not the segment with extra control data, free it. */
1831 if (index != 0)
1832 {
1833 /*
1834 * Give it back to the OS, and allow other backends to detect that
1835 * they need to detach.
1836 */
1837 unlink_segment(area, segment_map);
1838 segment_map->header->freed = true;
1839 Assert(area->control->total_segment_size >=
1840 segment_map->header->size);
1841 area->control->total_segment_size -=
1842 segment_map->header->size;
1843 dsm_unpin_segment(dsm_segment_handle(segment_map->segment));
1844 dsm_detach(segment_map->segment);
1845 area->control->segment_handles[index] = DSM_HANDLE_INVALID;
1846 ++area->control->freed_segment_counter;
1847 segment_map->segment = NULL;
1848 segment_map->header = NULL;
1849 segment_map->mapped_address = NULL;
1850 }
1851 }
1852 LWLockRelease(DSA_AREA_LOCK(area));
1853
1854 /*
1855 * Span-of-spans blocks store the span which describes them within the
1856 * block itself, so freeing the storage implicitly frees the descriptor
1857 * also. If this is a block of any other type, we must separately free the
1858 * span object. This recursive call to dsa_free will acquire the
1859 * span pool's lock. We can't deadlock because the acquisition order is
1860 * always some other pool and then the span pool.
1861 */
1862 if (size_class != DSA_SCLASS_BLOCK_OF_SPANS)
1863 dsa_free(area, span_pointer);
1864}
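
/*
 * Putting the locking comments above together, the informal ordering rule
 * for these paths is: a size class's pool lock may be held while taking the
 * span pool's lock (DSA_SCLASS_BLOCK_OF_SPANS), and any pool lock may be
 * held while taking DSA_AREA_LOCK, but the area lock is never held while
 * acquiring a pool lock, and the span pool's lock is never held while
 * acquiring another pool's lock.  Since every backend follows the same
 * order, no deadlock cycle can form.
 */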
1865
1866static void
1867unlink_span(dsa_area *area, dsa_area_span *span)
1868{
1869 if (DsaPointerIsValid(span->nextspan))
1870 {
1871 dsa_area_span *next = dsa_get_address(area, span->nextspan);
1872
1873 next->prevspan = span->prevspan;
1874 }
1875 if (DsaPointerIsValid(span->prevspan))
1876 {
1877 dsa_area_span *prev = dsa_get_address(area, span->prevspan);
1878
1879 prev->nextspan = span->nextspan;
1880 }
1881 else
1882 {
1883 dsa_area_pool *pool = dsa_get_address(area, span->pool);
1884
1885 pool->spans[span->fclass] = span->nextspan;
1886 }
1887}
1888
1889static void
1890add_span_to_fullness_class(dsa_area *area, dsa_area_span *span,
1891 dsa_pointer span_pointer,
1892 int fclass)
1893{
1894 dsa_area_pool *pool = dsa_get_address(area, span->pool);
1895
1896 if (DsaPointerIsValid(pool->spans[fclass]))
1897 {
1898 dsa_area_span *head = dsa_get_address(area,
1899 pool->spans[fclass]);
1900
1901 head->prevspan = span_pointer;
1902 }
1903 span->prevspan = InvalidDsaPointer;
1904 span->nextspan = pool->spans[fclass];
1905 pool->spans[fclass] = span_pointer;
1906 span->fclass = fclass;
1907}
1908
1909/*
1910 * Detach from an area that was either created or attached to by this process.
1911 */
1912void
1913dsa_detach(dsa_area *area)
1914{
1915 int i;
1916
1917 /* Detach from all segments. */
1918 for (i = 0; i <= area->high_segment_index; ++i)
1919 if (area->segment_maps[i].segment != NULL)
1920 dsm_detach(area->segment_maps[i].segment);
1921
1922 /*
1923 * Note that 'detaching' (= detaching from DSM segments) doesn't include
1924 * 'releasing' (= adjusting the reference count). It would be nice to
1925 * combine these operations, but client code might never get around to
1926 * calling dsa_detach because of an error path, and a detach hook on any
1927 * particular segment is too late to detach other segments in the area
1928 * without risking a 'leak' warning in the non-error path.
1929 */
1930
1931 /* Free the backend-local area object. */
1932 pfree(area);
1933}
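
/*
 * A minimal sketch of the public lifecycle these functions support
 * (my_tranche_id is a placeholder for a real LWLock tranche id; error
 * handling and the mechanism used to share values are omitted):
 *
 *     dsa_area   *area = dsa_create(my_tranche_id);
 *     dsa_handle  handle = dsa_get_handle(area);     -- share with others
 *     dsa_pointer dp = dsa_allocate(area, 100);
 *     char       *p = dsa_get_address(area, dp);     -- backend-local pointer
 *
 * and in another backend that received 'handle' and 'dp':
 *
 *     dsa_area   *area = dsa_attach(handle);
 *     char       *p = dsa_get_address(area, dp);     -- same bytes, new mapping
 *     ...
 *     dsa_free(area, dp);
 *     dsa_detach(area);
 *
 * dsa_detach() only unmaps the segments in the calling backend; the area's
 * lifetime is governed separately by its reference count (see
 * dsa_release_in_place()) and by dsa_pin()/dsa_unpin().
 */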
1934
1935/*
1936 * Unlink a segment from the bin that contains it.
1937 */
1938static void
1939unlink_segment(dsa_area *area, dsa_segment_map *segment_map)
1940{
1941 if (segment_map->header->prev != DSA_SEGMENT_INDEX_NONE)
1942 {
1943 dsa_segment_map *prev;
1944
1945 prev = get_segment_by_index(area, segment_map->header->prev);
1946 prev->header->next = segment_map->header->next;
1947 }
1948 else
1949 {
1950 Assert(area->control->segment_bins[segment_map->header->bin] ==
1951 get_segment_index(area, segment_map));
1952 area->control->segment_bins[segment_map->header->bin] =
1953 segment_map->header->next;
1954 }
1955 if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
1956 {
1957 dsa_segment_map *next;
1958
1959 next = get_segment_by_index(area, segment_map->header->next);
1960 next->header->prev = segment_map->header->prev;
1961 }
1962}
1963
1964/*
1965 * Find a segment that could satisfy a request for 'npages' of contiguous
1966 * memory, or return NULL if none can be found. This may involve attaching to
1967 * segments that weren't previously attached so that we can query their free
1968 * pages map.
1969 */
1970static dsa_segment_map *
1971get_best_segment(dsa_area *area, size_t npages)
1972{
1973 size_t bin;
1974
1975 Assert(LWLockHeldByMe(DSA_AREA_LOCK(area)));
1976 check_for_freed_segments_locked(area);
1977
1978 /*
1979 * Start searching from the first bin that *might* have enough contiguous
1980 * pages.
1981 */
1982 for (bin = contiguous_pages_to_segment_bin(npages);
1983 bin < DSA_NUM_SEGMENT_BINS;
1984 ++bin)
1985 {
1986 /*
1987 * The minimum contiguous size that any segment in this bin should
1988 * have. We'll re-bin if we see segments with fewer.
1989 */
1990 size_t threshold = (size_t) 1 << (bin - 1);
1991 dsa_segment_index segment_index;
1992
1993 /* Search this bin for a segment with enough contiguous space. */
1994 segment_index = area->control->segment_bins[bin];
1995 while (segment_index != DSA_SEGMENT_INDEX_NONE)
1996 {
1997 dsa_segment_map *segment_map;
1998 dsa_segment_index next_segment_index;
1999 size_t contiguous_pages;
2000
2001 segment_map = get_segment_by_index(area, segment_index);
2002 next_segment_index = segment_map->header->next;
2003 contiguous_pages = fpm_largest(segment_map->fpm);
2004
2005 /* Not enough for the request, still enough for this bin. */
2006 if (contiguous_pages >= threshold && contiguous_pages < npages)
2007 {
2008 segment_index = next_segment_index;
2009 continue;
2010 }
2011
2012 /* Re-bin it if it's no longer in the appropriate bin. */
2013 if (contiguous_pages < threshold)
2014 {
2015 size_t new_bin;
2016
2017 new_bin = contiguous_pages_to_segment_bin(contiguous_pages);
2018
2019 /* Remove it from its current bin. */
2020 unlink_segment(area, segment_map);
2021
2022 /* Push it onto the front of its new bin. */
2023 segment_map->header->prev = DSA_SEGMENT_INDEX_NONE;
2024 segment_map->header->next =
2025 area->control->segment_bins[new_bin];
2026 segment_map->header->bin = new_bin;
2027 area->control->segment_bins[new_bin] = segment_index;
2028 if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
2029 {
2030 dsa_segment_map *next;
2031
2032 next = get_segment_by_index(area,
2033 segment_map->header->next);
2034 Assert(next->header->bin == new_bin);
2035 next->header->prev = segment_index;
2036 }
2037
2038 /*
2039 * But fall through to see if it's enough to satisfy this
2040 * request anyway....
2041 */
2042 }
2043
2044 /* Check if we are done. */
2045 if (contiguous_pages >= npages)
2046 return segment_map;
2047
2048 /* Continue searching the same bin. */
2049 segment_index = next_segment_index;
2050 }
2051 }
2052
2053 /* Not found. */
2054 return NULL;
2055}
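
/*
 * A worked example of the binning above (hypothetical numbers): with this
 * power-of-two scheme, bin k promises segments whose largest contiguous free
 * run is at least 2^(k-1) pages.  A request for npages = 10 therefore starts
 * at the bin with threshold 8 (runs of 8-15 pages).  A segment found there
 * with a 12-page run satisfies the request; one with a 9-page run keeps its
 * bin but is skipped; one whose largest run has shrunk to 5 pages is moved
 * down to the bin with threshold 4 before we move on.  If that first bin is
 * exhausted, the loop continues with the higher bins, whose segments are
 * guaranteed to have enough contiguous pages.
 */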
2056
2057/*
2058 * Create a new segment that can handle at least requested_pages. Returns
2059 * NULL if the requested total size limit or maximum allowed number of
2060 * segments would be exceeded.
2061 */
2062static dsa_segment_map *
2063make_new_segment(dsa_area *area, size_t requested_pages)
2064{
2065 dsa_segment_index new_index;
2066 size_t metadata_bytes;
2067 size_t total_size;
2068 size_t total_pages;
2069 size_t usable_pages;
2070 dsa_segment_map *segment_map;
2071 dsm_segment *segment;
2072
2073 Assert(LWLockHeldByMe(DSA_AREA_LOCK(area)));
2074
2075 /* Find a segment slot that is not in use (linearly for now). */
2076 for (new_index = 1; new_index < DSA_MAX_SEGMENTS; ++new_index)
2077 {
2078 if (area->control->segment_handles[new_index] == DSM_HANDLE_INVALID)
2079 break;
2080 }
2081 if (new_index == DSA_MAX_SEGMENTS)
2082 return NULL;
2083
2084 /*
2085 * If the total size limit is already exceeded, then we exit early and
2086 * avoid arithmetic wraparound in the unsigned expressions below.
2087 */
2088 if (area->control->total_segment_size >=
2089 area->control->max_total_segment_size)
2090 return NULL;
2091
2092 /*
2093 * The size should be at least as big as requested, and at least big
2094 * enough to follow a geometric series that approximately doubles the
2095 * total storage each time we create a new segment. We use geometric
2096 * growth because the underlying DSM system isn't designed for large
2097 * numbers of segments (otherwise we might even consider just using one
2098 * DSM segment for each large allocation and for each superblock, and then
2099 * we wouldn't need to use FreePageManager).
2100 *
2101 * We decide on a total segment size first, so that we produce tidy
2102 * power-of-two sized segments. This is a good property to have if we
2103 * move to huge pages in the future. Then we work back to the number of
2104 * pages we can fit.
2105 */
2106 total_size = DSA_INITIAL_SEGMENT_SIZE *
2107 ((size_t) 1 << (new_index / DSA_NUM_SEGMENTS_AT_EACH_SIZE));
2108 total_size = Min(total_size, DSA_MAX_SEGMENT_SIZE);
2109 total_size = Min(total_size,
2110 area->control->max_total_segment_size -
2111 area->control->total_segment_size);
2112
2113 total_pages = total_size / FPM_PAGE_SIZE;
2114 metadata_bytes =
2115 MAXALIGN(sizeof(dsa_segment_header)) +
2116 MAXALIGN(sizeof(FreePageManager)) +
2117 sizeof(dsa_pointer) * total_pages;
2118
2119 /* Add padding up to next page boundary. */
2120 if (metadata_bytes % FPM_PAGE_SIZE != 0)
2121 metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
2122 if (total_size <= metadata_bytes)
2123 return NULL;
2124 usable_pages = (total_size - metadata_bytes) / FPM_PAGE_SIZE;
2125 Assert(metadata_bytes + usable_pages * FPM_PAGE_SIZE <= total_size);
2126
2127 /* See if that is enough... */
2128 if (requested_pages > usable_pages)
2129 {
2130 /*
2131 * We'll make an odd-sized segment, working forward from the requested
2132 * number of pages.
2133 */
2134 usable_pages = requested_pages;
2135 metadata_bytes =
2136 MAXALIGN(sizeof(dsa_segment_header)) +
2137 MAXALIGN(sizeof(FreePageManager)) +
2138 usable_pages * sizeof(dsa_pointer);
2139
2140 /* Add padding up to next page boundary. */
2141 if (metadata_bytes % FPM_PAGE_SIZE != 0)
2142 metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
2143 total_size = metadata_bytes + usable_pages * FPM_PAGE_SIZE;
2144
2145 /* Is that too large for dsa_pointer's addressing scheme? */
2146 if (total_size > DSA_MAX_SEGMENT_SIZE)
2147 return NULL;
2148
2149 /* Would that exceed the limit? */
2150 if (total_size > area->control->max_total_segment_size -
2151 area->control->total_segment_size)
2152 return NULL;
2153 }
2154
2155 /* Create the segment. */
2156 segment = dsm_create(total_size, 0);
2157 if (segment == NULL)
2158 return NULL;
2159 dsm_pin_segment(segment);
2160 if (area->mapping_pinned)
2161 dsm_pin_mapping(segment);
2162
2163 /* Store the handle in shared memory to be found by index. */
2164 area->control->segment_handles[new_index] =
2165 dsm_segment_handle(segment);
2166 /* Track the highest segment index in the history of the area. */
2167 if (area->control->high_segment_index < new_index)
2168 area->control->high_segment_index = new_index;
2169 /* Track the highest segment index this backend has ever mapped. */
2170 if (area->high_segment_index < new_index)
2171 area->high_segment_index = new_index;
2172 /* Track total size of all segments. */
2173 area->control->total_segment_size += total_size;
2174 Assert(area->control->total_segment_size <=
2175 area->control->max_total_segment_size);
2176
2177 /* Build a segment map for this segment in this backend. */
2178 segment_map = &area->segment_maps[new_index];
2179 segment_map->segment = segment;
2180 segment_map->mapped_address = dsm_segment_address(segment);
2181 segment_map->header = (dsa_segment_header *) segment_map->mapped_address;
2182 segment_map->fpm = (FreePageManager *)
2183 (segment_map->mapped_address +
2184 MAXALIGN(sizeof(dsa_segment_header)));
2185 segment_map->pagemap = (dsa_pointer *)
2186 (segment_map->mapped_address +
2187 MAXALIGN(sizeof(dsa_segment_header)) +
2188 MAXALIGN(sizeof(FreePageManager)));
2189
2190 /* Set up the free page map. */
2191 FreePageManagerInitialize(segment_map->fpm, segment_map->mapped_address);
2192 FreePageManagerPut(segment_map->fpm, metadata_bytes / FPM_PAGE_SIZE,
2193 usable_pages);
2194
2195 /* Set up the segment header and put it in the appropriate bin. */
2196 segment_map->header->magic =
2197 DSA_SEGMENT_HEADER_MAGIC ^ area->control->handle ^ new_index;
2198 segment_map->header->usable_pages = usable_pages;
2199 segment_map->header->size = total_size;
2200 segment_map->header->bin = contiguous_pages_to_segment_bin(usable_pages);
2201 segment_map->header->prev = DSA_SEGMENT_INDEX_NONE;
2202 segment_map->header->next =
2203 area->control->segment_bins[segment_map->header->bin];
2204 segment_map->header->freed = false;
2205 area->control->segment_bins[segment_map->header->bin] = new_index;
2206 if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
2207 {
2208 dsa_segment_map *next =
2209 get_segment_by_index(area, segment_map->header->next);
2210
2211 Assert(next->header->bin == segment_map->header->bin);
2212 next->header->prev = new_index;
2213 }
2214
2215 return segment_map;
2216}
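
/*
 * A worked example of the sizing policy above (hypothetical, using the 1MB
 * DSA_INITIAL_SEGMENT_SIZE; suppose DSA_NUM_SEGMENTS_AT_EACH_SIZE were 2):
 * segment slot 0 holds the initial segment, and new segments created here
 * would be sized
 *
 *     new_index 1      ->  1MB  (1MB << 0)
 *     new_index 2, 3   ->  2MB  (1MB << 1)
 *     new_index 4, 5   ->  4MB  (1MB << 2)
 *     ...
 *
 * clamped to DSA_MAX_SEGMENT_SIZE and to whatever headroom remains under
 * max_total_segment_size.  The front of each segment is reserved for the
 * dsa_segment_header, the FreePageManager and the page map, rounded up to a
 * page boundary (metadata_bytes); only the pages after that are handed to
 * the free page manager as usable_pages.
 */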
2217
2218/*
2219 * Check if any segments have been freed by destroy_superblock, so we can
2220 * detach from them in this backend. This function is called by
2221 * dsa_get_address and dsa_free to make sure that a dsa_pointer they have
2222 * received can be resolved to the correct segment.
2223 *
2224 * The danger we want to defend against is that there could be an old segment
2225 * mapped into a given slot in this backend, and the dsa_pointer they have
2226 * might refer to some new segment in the same slot. So those functions must
2227 * be sure to process all instructions to detach from a freed segment that had
2228 * been generated by the time this process received the dsa_pointer, before
2229 * they call get_segment_by_index.
2230 */
2231static void
2232check_for_freed_segments(dsa_area *area)
2233{
2234 size_t freed_segment_counter;
2235
2236 /*
2237 * Any other process that has freed a segment has incremented
2238 * freed_segment_counter while holding an LWLock, and that must precede any
2239 * backend creating a new segment in the same slot while holding an
2240 * LWLock, and that must precede the creation of any dsa_pointer pointing
2241 * into the new segment which might reach us here, and the caller must
2242 * have sent the dsa_pointer to this process using appropriate memory
2243 * synchronization (some kind of locking or atomic primitive or system
2244 * call). So all we need to do on the reading side is ask for the load of
2245 * freed_segment_counter to follow the caller's load of the dsa_pointer it
2246 * has, and we can be sure to detect any segments that had been freed as
2247 * of the time that the dsa_pointer reached this process.
2248 */
2249 pg_read_barrier();
2250 freed_segment_counter = area->control->freed_segment_counter;
2251 if (unlikely(area->freed_segment_counter != freed_segment_counter))
2252 {
2253 /* Check all currently mapped segments to find what's been freed. */
2254 LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
2255 check_for_freed_segments_locked(area);
2256 LWLockRelease(DSA_AREA_LOCK(area));
2257 }
2258}
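
/*
 * To make the argument above concrete (an illustrative interleaving, not a
 * new requirement): suppose backend A frees the last allocation in a segment
 * occupying slot 3, increments freed_segment_counter while holding the area
 * lock, and a new segment is later created that reuses slot 3.  A dsa_pointer
 * into the new segment is then passed to backend B through some properly
 * synchronized channel (a shared queue, a struct protected by an LWLock, ...).
 * Whatever synchronization published that pointer also orders A's counter
 * increment before B's barrier and load here, so B notices the change, takes
 * the area lock, unmaps its stale mapping of slot 3 in
 * check_for_freed_segments_locked(), and only then resolves the pointer,
 * attaching the new segment instead.
 */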
2259
2260/*
2261 * Workhorse for check_for_freed_segments(), and also used directly in paths
2262 * where the area lock is already held. This should be called after acquiring
2263 * the lock but before looking up any segment by index number, to make sure we
2264 * unmap any stale segments that might have previously had the same index as a
2265 * current segment.
2266 */
2267static void
2268check_for_freed_segments_locked(dsa_area *area)
2269{
2270 size_t freed_segment_counter;
2271 int i;
2272
2273 Assert(LWLockHeldByMe(DSA_AREA_LOCK(area)));
2274 freed_segment_counter = area->control->freed_segment_counter;
2275 if (unlikely(area->freed_segment_counter != freed_segment_counter))
2276 {
2277 for (i = 0; i <= area->high_segment_index; ++i)
2278 {
2279 if (area->segment_maps[i].header != NULL &&
2280 area->segment_maps[i].header->freed)
2281 {
2282 dsm_detach(area->segment_maps[i].segment);
2283 area->segment_maps[i].segment = NULL;
2284 area->segment_maps[i].header = NULL;
2285 area->segment_maps[i].mapped_address = NULL;
2286 }
2287 }
2288 area->freed_segment_counter = freed_segment_counter;
2289 }
2290}
2291