| 1 | /*------------------------------------------------------------------------- |
| 2 | * |
| 3 | * dsa.c |
| 4 | * Dynamic shared memory areas. |
| 5 | * |
| 6 | * This module provides dynamic shared memory areas which are built on top of |
* DSM segments. While dsm.c allows segments of shared memory to be
| 8 | * created and shared between backends, it isn't designed to deal with small |
| 9 | * objects. A DSA area is a shared memory heap usually backed by one or more |
| 10 | * DSM segments which can allocate memory using dsa_allocate() and dsa_free(). |
| 11 | * Alternatively, it can be created in pre-existing shared memory, including a |
| 12 | * DSM segment, and then create extra DSM segments as required. Unlike the |
| 13 | * regular system heap, it deals in pseudo-pointers which must be converted to |
| 14 | * backend-local pointers before they are dereferenced. These pseudo-pointers |
| 15 | * can however be shared with other backends, and can be used to construct |
| 16 | * shared data structures. |
| 17 | * |
| 18 | * Each DSA area manages a set of DSM segments, adding new segments as |
| 19 | * required and detaching them when they are no longer needed. Each segment |
| 20 | * contains a number of 4KB pages, a free page manager for tracking |
| 21 | * consecutive runs of free pages, and a page map for tracking the source of |
| 22 | * objects allocated on each page. Allocation requests above 8KB are handled |
| 23 | * by choosing a segment and finding consecutive free pages in its free page |
| 24 | * manager. Allocation requests for smaller sizes are handled using pools of |
| 25 | * objects of a selection of sizes. Each pool consists of a number of 16 page |
| 26 | * (64KB) superblocks allocated in the same way as large objects. Allocation |
| 27 | * of large objects and new superblocks is serialized by a single LWLock, but |
| 28 | * allocation of small objects from pre-existing superblocks uses one LWLock |
| 29 | * per pool. Currently there is one pool, and therefore one lock, per size |
| 30 | * class. Per-core pools to increase concurrency and strategies for reducing |
| 31 | * the resulting fragmentation are areas for future research. Each superblock |
| 32 | * is managed with a 'span', which tracks the superblock's freelist. Free |
| 33 | * requests are handled by looking in the page map to find which span an |
| 34 | * address was allocated from, so that small objects can be returned to the |
| 35 | * appropriate free list, and large object pages can be returned directly to |
| 36 | * the free page map. When allocating, simple heuristics for selecting |
| 37 | * segments and superblocks try to encourage occupied memory to be |
| 38 | * concentrated, increasing the likelihood that whole superblocks can become |
| 39 | * empty and be returned to the free page manager, and whole segments can |
| 40 | * become empty and be returned to the operating system. |
| 41 | * |
| 42 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
| 43 | * Portions Copyright (c) 1994, Regents of the University of California |
| 44 | * |
| 45 | * IDENTIFICATION |
| 46 | * src/backend/utils/mmgr/dsa.c |
| 47 | * |
| 48 | *------------------------------------------------------------------------- |
| 49 | */ |
| 50 | |
| 51 | #include "postgres.h" |
| 52 | |
| 53 | #include "port/atomics.h" |
| 54 | #include "storage/dsm.h" |
| 55 | #include "storage/ipc.h" |
| 56 | #include "storage/lwlock.h" |
| 57 | #include "storage/shmem.h" |
| 58 | #include "utils/dsa.h" |
| 59 | #include "utils/freepage.h" |
| 60 | #include "utils/memutils.h" |
| 61 | |
| 62 | /* |
| 63 | * The size of the initial DSM segment that backs a dsa_area created by |
| 64 | * dsa_create. After creating some number of segments of this size we'll |
| 65 | * double this size, and so on. Larger segments may be created if necessary |
| 66 | * to satisfy large requests. |
| 67 | */ |
| 68 | #define DSA_INITIAL_SEGMENT_SIZE ((size_t) (1 * 1024 * 1024)) |
| 69 | |
| 70 | /* |
| 71 | * How many segments to create before we double the segment size. If this is |
| 72 | * low, then there is likely to be a lot of wasted space in the largest |
| 73 | * segment. If it is high, then we risk running out of segment slots (see |
| 74 | * dsm.c's limits on total number of segments), or limiting the total size |
| 75 | * an area can manage when using small pointers. |
| 76 | */ |
| 77 | #define DSA_NUM_SEGMENTS_AT_EACH_SIZE 4 |
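
/*
 * Illustration (derived from the two values above, not a separate tunable):
 * with a 1MB initial size and four segments at each size, an area's backing
 * segments are created at sizes 1MB, 1MB, 1MB, 1MB, 2MB, 2MB, 2MB, 2MB,
 * 4MB, ... so capacity roughly doubles every DSA_NUM_SEGMENTS_AT_EACH_SIZE
 * segments while consuming only a modest number of segment slots.
 */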
| 78 | |
| 79 | /* |
| 80 | * The number of bits used to represent the offset part of a dsa_pointer. |
| 81 | * This controls the maximum size of a segment, the maximum possible |
| 82 | * allocation size and also the maximum number of segments per area. |
| 83 | */ |
| 84 | #if SIZEOF_DSA_POINTER == 4 |
| 85 | #define DSA_OFFSET_WIDTH 27 /* 32 segments of size up to 128MB */ |
| 86 | #else |
| 87 | #define DSA_OFFSET_WIDTH 40 /* 1024 segments of size up to 1TB */ |
| 88 | #endif |
| 89 | |
| 90 | /* |
| 91 | * The maximum number of DSM segments that an area can own, determined by |
| 92 | * the number of bits remaining (but capped at 1024). |
| 93 | */ |
| 94 | #define DSA_MAX_SEGMENTS \ |
| 95 | Min(1024, (1 << ((SIZEOF_DSA_POINTER * 8) - DSA_OFFSET_WIDTH))) |
| 96 | |
| 97 | /* The bitmask for extracting the offset from a dsa_pointer. */ |
| 98 | #define DSA_OFFSET_BITMASK (((dsa_pointer) 1 << DSA_OFFSET_WIDTH) - 1) |
| 99 | |
| 100 | /* The maximum size of a DSM segment. */ |
| 101 | #define DSA_MAX_SEGMENT_SIZE ((size_t) 1 << DSA_OFFSET_WIDTH) |
| 102 | |
| 103 | /* Number of pages (see FPM_PAGE_SIZE) per regular superblock. */ |
| 104 | #define DSA_PAGES_PER_SUPERBLOCK 16 |
| 105 | |
| 106 | /* |
| 107 | * A magic number used as a sanity check for following DSM segments belonging |
| 108 | * to a DSA area (this number will be XORed with the area handle and |
| 109 | * the segment index). |
| 110 | */ |
#define DSA_SEGMENT_HEADER_MAGIC 0x0ce26608
| 112 | |
| 113 | /* Build a dsa_pointer given a segment number and offset. */ |
| 114 | #define DSA_MAKE_POINTER(segment_number, offset) \ |
| 115 | (((dsa_pointer) (segment_number) << DSA_OFFSET_WIDTH) | (offset)) |
| 116 | |
| 117 | /* Extract the segment number from a dsa_pointer. */ |
#define DSA_EXTRACT_SEGMENT_NUMBER(dp) ((dp) >> DSA_OFFSET_WIDTH)
| 119 | |
| 120 | /* Extract the offset from a dsa_pointer. */ |
#define DSA_EXTRACT_OFFSET(dp) ((dp) & DSA_OFFSET_BITMASK)
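
/*
 * Worked example (illustrative only): on a 64-bit build, where
 * DSA_OFFSET_WIDTH is 40, a pointer to offset 0x1000 in segment 3
 * round-trips through these macros like this:
 *
 *    dsa_pointer dp = DSA_MAKE_POINTER(3, 0x1000);
 *    Assert(DSA_EXTRACT_SEGMENT_NUMBER(dp) == 3);
 *    Assert(DSA_EXTRACT_OFFSET(dp) == 0x1000);
 *
 * The segment number occupies the bits above DSA_OFFSET_WIDTH, and the
 * offset the bits below it.
 */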
| 122 | |
/* The type used for segment indexes (zero based). */
| 124 | typedef size_t dsa_segment_index; |
| 125 | |
| 126 | /* Sentinel value for dsa_segment_index indicating 'none' or 'end'. */ |
| 127 | #define DSA_SEGMENT_INDEX_NONE (~(dsa_segment_index)0) |
| 128 | |
| 129 | /* |
| 130 | * How many bins of segments do we have? The bins are used to categorize |
| 131 | * segments by their largest contiguous run of free pages. |
| 132 | */ |
| 133 | #define DSA_NUM_SEGMENT_BINS 16 |
| 134 | |
| 135 | /* |
| 136 | * What is the lowest bin that holds segments that *might* have n contiguous |
| 137 | * free pages? There is no point in looking in segments in lower bins; they |
| 138 | * definitely can't service a request for n free pages. |
| 139 | */ |
| 140 | #define contiguous_pages_to_segment_bin(n) Min(fls(n), DSA_NUM_SEGMENT_BINS - 1) |
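
/*
 * For example, a segment whose largest contiguous run is 3 free pages lives
 * in bin fls(3) == 2, so a request for 3 pages starts scanning at bin 2:
 * lower bins hold segments with runs of at most one page, which definitely
 * cannot help, while segments in bin 2 have runs of 2 or 3 pages and so
 * might.
 */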
| 141 | |
| 142 | /* Macros for access to locks. */ |
| 143 | #define DSA_AREA_LOCK(area) (&area->control->lock) |
| 144 | #define DSA_SCLASS_LOCK(area, sclass) (&area->control->pools[sclass].lock) |
| 145 | |
| 146 | /* |
| 147 | * The header for an individual segment. This lives at the start of each DSM |
| 148 | * segment owned by a DSA area including the first segment (where it appears |
| 149 | * as part of the dsa_area_control struct). |
| 150 | */ |
| 151 | typedef struct |
| 152 | { |
| 153 | /* Sanity check magic value. */ |
| 154 | uint32 magic; |
| 155 | /* Total number of pages in this segment (excluding metadata area). */ |
| 156 | size_t usable_pages; |
| 157 | /* Total size of this segment in bytes. */ |
| 158 | size_t size; |
| 159 | |
| 160 | /* |
| 161 | * Index of the segment that precedes this one in the same segment bin, or |
| 162 | * DSA_SEGMENT_INDEX_NONE if this is the first one. |
| 163 | */ |
| 164 | dsa_segment_index prev; |
| 165 | |
| 166 | /* |
| 167 | * Index of the segment that follows this one in the same segment bin, or |
| 168 | * DSA_SEGMENT_INDEX_NONE if this is the last one. |
| 169 | */ |
| 170 | dsa_segment_index next; |
| 171 | /* The index of the bin that contains this segment. */ |
| 172 | size_t bin; |
| 173 | |
| 174 | /* |
| 175 | * A flag raised to indicate that this segment is being returned to the |
| 176 | * operating system and has been unpinned. |
| 177 | */ |
| 178 | bool freed; |
} dsa_segment_header;
| 180 | |
| 181 | /* |
| 182 | * Metadata for one superblock. |
| 183 | * |
| 184 | * For most blocks, span objects are stored out-of-line; that is, the span |
| 185 | * object is not stored within the block itself. But, as an exception, for a |
| 186 | * "span of spans", the span object is stored "inline". The allocation is |
| 187 | * always exactly one page, and the dsa_area_span object is located at |
| 188 | * the beginning of that page. The size class is DSA_SCLASS_BLOCK_OF_SPANS, |
| 189 | * and the remaining fields are used just as they would be in an ordinary |
| 190 | * block. We can't allocate spans out of ordinary superblocks because |
| 191 | * creating an ordinary superblock requires us to be able to allocate a span |
| 192 | * *first*. Doing it this way avoids that circularity. |
| 193 | */ |
| 194 | typedef struct |
| 195 | { |
| 196 | dsa_pointer pool; /* Containing pool. */ |
| 197 | dsa_pointer prevspan; /* Previous span. */ |
| 198 | dsa_pointer nextspan; /* Next span. */ |
| 199 | dsa_pointer start; /* Starting address. */ |
| 200 | size_t npages; /* Length of span in pages. */ |
| 201 | uint16 size_class; /* Size class. */ |
| 202 | uint16 ninitialized; /* Maximum number of objects ever allocated. */ |
| 203 | uint16 nallocatable; /* Number of objects currently allocatable. */ |
| 204 | uint16 firstfree; /* First object on free list. */ |
| 205 | uint16 nmax; /* Maximum number of objects ever possible. */ |
| 206 | uint16 fclass; /* Current fullness class. */ |
| 207 | } dsa_area_span; |
| 208 | |
| 209 | /* |
| 210 | * Given a pointer to an object in a span, access the index of the next free |
| 211 | * object in the same span (ie in the span's freelist) as an L-value. |
| 212 | */ |
| 213 | #define NextFreeObjectIndex(object) (* (uint16 *) (object)) |
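
/*
 * In other words, the freelist is threaded through the free objects
 * themselves: the first two bytes of each free object hold the index of the
 * next free object in the same span. This is how dsa_free pushes an object
 * back onto its span's freelist:
 *
 *    NextFreeObjectIndex(object) = span->firstfree;
 *    span->firstfree = (object - superblock) / size;
 */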
| 214 | |
| 215 | /* |
| 216 | * Small allocations are handled by dividing a single block of memory into |
| 217 | * many small objects of equal size. The possible allocation sizes are |
| 218 | * defined by the following array. Larger size classes are spaced more widely |
| 219 | * than smaller size classes. We fudge the spacing for size classes >1kB to |
| 220 | * avoid space wastage: based on the knowledge that we plan to allocate 64kB |
| 221 | * blocks, we bump the maximum object size up to the largest multiple of |
| 222 | * 8 bytes that still lets us fit the same number of objects into one block. |
| 223 | * |
| 224 | * NB: Because of this fudging, if we were ever to use differently-sized blocks |
| 225 | * for small allocations, these size classes would need to be reworked to be |
| 226 | * optimal for the new size. |
| 227 | * |
| 228 | * NB: The optimal spacing for size classes, as well as the size of the blocks |
| 229 | * out of which small objects are allocated, is not a question that has one |
| 230 | * right answer. Some allocators (such as tcmalloc) use more closely-spaced |
| 231 | * size classes than we do here, while others (like aset.c) use more |
| 232 | * widely-spaced classes. Spacing the classes more closely avoids wasting |
| 233 | * memory within individual chunks, but also means a larger number of |
| 234 | * potentially-unfilled blocks. |
| 235 | */ |
| 236 | static const uint16 dsa_size_classes[] = { |
| 237 | sizeof(dsa_area_span), 0, /* special size classes */ |
| 238 | 8, 16, 24, 32, 40, 48, 56, 64, /* 8 classes separated by 8 bytes */ |
| 239 | 80, 96, 112, 128, /* 4 classes separated by 16 bytes */ |
| 240 | 160, 192, 224, 256, /* 4 classes separated by 32 bytes */ |
| 241 | 320, 384, 448, 512, /* 4 classes separated by 64 bytes */ |
| 242 | 640, 768, 896, 1024, /* 4 classes separated by 128 bytes */ |
| 243 | 1280, 1560, 1816, 2048, /* 4 classes separated by ~256 bytes */ |
| 244 | 2616, 3120, 3640, 4096, /* 4 classes separated by ~512 bytes */ |
| 245 | 5456, 6552, 7280, 8192 /* 4 classes separated by ~1024 bytes */ |
| 246 | }; |
| 247 | #define DSA_NUM_SIZE_CLASSES lengthof(dsa_size_classes) |
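
/*
 * Worked example of the >1kB fudging described above: with plain ~512 byte
 * spacing, the class after 2048 would be 2560, and floor(65536 / 2560) = 25
 * objects would fit in a 64kB block. The largest multiple of 8 that still
 * fits 25 objects is 2616 (25 * 2616 = 65400 <= 65536), so 2616 is used
 * instead: the block holds the same number of objects, but each may be up
 * to 56 bytes larger.
 */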
| 248 | |
| 249 | /* Special size classes. */ |
| 250 | #define DSA_SCLASS_BLOCK_OF_SPANS 0 |
| 251 | #define DSA_SCLASS_SPAN_LARGE 1 |
| 252 | |
| 253 | /* |
| 254 | * The following lookup table is used to map the size of small objects |
| 255 | * (less than 1kB) onto the corresponding size class. To use this table, |
| 256 | * round the size of the object up to the next multiple of 8 bytes, and then |
| 257 | * index into this array. |
| 258 | */ |
| 259 | static const uint8 dsa_size_class_map[] = { |
| 260 | 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 11, 11, 12, 12, 13, 13, |
| 261 | 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 17, 17, 17, 17, |
| 262 | 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, |
| 263 | 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, |
| 264 | 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, |
| 265 | 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, |
| 266 | 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, |
| 267 | 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25 |
| 268 | }; |
| 269 | #define DSA_SIZE_CLASS_MAP_QUANTUM 8 |
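
/*
 * For example, a request for 100 bytes rounds up to 104 = 13 * 8, so we
 * consult dsa_size_class_map[13 - 1], which is 12; dsa_size_classes[12] is
 * 112 bytes, the smallest class that fits the request (the next class down,
 * 96 bytes, is too small).
 */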
| 270 | |
| 271 | /* |
| 272 | * Superblocks are binned by how full they are. Generally, each fullness |
| 273 | * class corresponds to one quartile, but the block being used for |
| 274 | * allocations is always at the head of the list for fullness class 1, |
| 275 | * regardless of how full it really is. |
| 276 | */ |
| 277 | #define DSA_FULLNESS_CLASSES 4 |
| 278 | |
| 279 | /* |
| 280 | * A dsa_area_pool represents a set of objects of a given size class. |
| 281 | * |
| 282 | * Perhaps there should be multiple pools for the same size class for |
| 283 | * contention avoidance, but for now there is just one! |
| 284 | */ |
| 285 | typedef struct |
| 286 | { |
| 287 | /* A lock protecting access to this pool. */ |
| 288 | LWLock lock; |
| 289 | /* A set of linked lists of spans, arranged by fullness. */ |
| 290 | dsa_pointer spans[DSA_FULLNESS_CLASSES]; |
| 291 | /* Should we pad this out to a cacheline boundary? */ |
| 292 | } dsa_area_pool; |
| 293 | |
| 294 | /* |
| 295 | * The control block for an area. This lives in shared memory, at the start of |
| 296 | * the first DSM segment controlled by this area. |
| 297 | */ |
| 298 | typedef struct |
| 299 | { |
| 300 | /* The segment header for the first segment. */ |
dsa_segment_header segment_header;
| 302 | /* The handle for this area. */ |
| 303 | dsa_handle handle; |
| 304 | /* The handles of the segments owned by this area. */ |
| 305 | dsm_handle segment_handles[DSA_MAX_SEGMENTS]; |
| 306 | /* Lists of segments, binned by maximum contiguous run of free pages. */ |
| 307 | dsa_segment_index segment_bins[DSA_NUM_SEGMENT_BINS]; |
| 308 | /* The object pools for each size class. */ |
| 309 | dsa_area_pool pools[DSA_NUM_SIZE_CLASSES]; |
| 310 | /* The total size of all active segments. */ |
| 311 | size_t total_segment_size; |
| 312 | /* The maximum total size of backing storage we are allowed. */ |
| 313 | size_t max_total_segment_size; |
| 314 | /* Highest used segment index in the history of this area. */ |
| 315 | dsa_segment_index high_segment_index; |
| 316 | /* The reference count for this area. */ |
| 317 | int refcnt; |
| 318 | /* A flag indicating that this area has been pinned. */ |
| 319 | bool pinned; |
| 320 | /* The number of times that segments have been freed. */ |
| 321 | size_t freed_segment_counter; |
| 322 | /* The LWLock tranche ID. */ |
| 323 | int lwlock_tranche_id; |
| 324 | /* The general lock (protects everything except object pools). */ |
| 325 | LWLock lock; |
| 326 | } dsa_area_control; |
| 327 | |
| 328 | /* Given a pointer to a pool, find a dsa_pointer. */ |
| 329 | #define DsaAreaPoolToDsaPointer(area, p) \ |
| 330 | DSA_MAKE_POINTER(0, (char *) p - (char *) area->control) |
| 331 | |
| 332 | /* |
| 333 | * A dsa_segment_map is stored within the backend-private memory of each |
| 334 | * individual backend. It holds the base address of the segment within that |
| 335 | * backend, plus the addresses of key objects within the segment. Those |
| 336 | * could instead be derived from the base address but it's handy to have them |
| 337 | * around. |
| 338 | */ |
| 339 | typedef struct |
| 340 | { |
| 341 | dsm_segment *segment; /* DSM segment */ |
| 342 | char *mapped_address; /* Address at which segment is mapped */ |
dsa_segment_header *header; /* Header (same as mapped_address) */
| 344 | FreePageManager *fpm; /* Free page manager within segment. */ |
| 345 | dsa_pointer *pagemap; /* Page map within segment. */ |
| 346 | } dsa_segment_map; |
| 347 | |
| 348 | /* |
| 349 | * Per-backend state for a storage area. Backends obtain one of these by |
| 350 | * creating an area or attaching to an existing one using a handle. Each |
| 351 | * process that needs to use an area uses its own object to track where the |
| 352 | * segments are mapped. |
| 353 | */ |
| 354 | struct dsa_area |
| 355 | { |
| 356 | /* Pointer to the control object in shared memory. */ |
| 357 | dsa_area_control *control; |
| 358 | |
| 359 | /* Has the mapping been pinned? */ |
| 360 | bool mapping_pinned; |
| 361 | |
| 362 | /* |
| 363 | * This backend's array of segment maps, ordered by segment index |
| 364 | * corresponding to control->segment_handles. Some of the area's segments |
| 365 | * may not be mapped in this backend yet, and some slots may have been |
| 366 | * freed and need to be detached; these operations happen on demand. |
| 367 | */ |
| 368 | dsa_segment_map segment_maps[DSA_MAX_SEGMENTS]; |
| 369 | |
| 370 | /* The highest segment index this backend has ever mapped. */ |
| 371 | dsa_segment_index high_segment_index; |
| 372 | |
| 373 | /* The last observed freed_segment_counter. */ |
| 374 | size_t freed_segment_counter; |
| 375 | }; |
| 376 | |
| 377 | #define DSA_SPAN_NOTHING_FREE ((uint16) -1) |
| 378 | #define DSA_SUPERBLOCK_SIZE (DSA_PAGES_PER_SUPERBLOCK * FPM_PAGE_SIZE) |
| 379 | |
| 380 | /* Given a pointer to a segment_map, obtain a segment index number. */ |
| 381 | #define get_segment_index(area, segment_map_ptr) \ |
| 382 | (segment_map_ptr - &area->segment_maps[0]) |
| 383 | |
| 384 | static void init_span(dsa_area *area, dsa_pointer span_pointer, |
| 385 | dsa_area_pool *pool, dsa_pointer start, size_t npages, |
| 386 | uint16 size_class); |
| 387 | static bool transfer_first_span(dsa_area *area, dsa_area_pool *pool, |
| 388 | int fromclass, int toclass); |
| 389 | static inline dsa_pointer alloc_object(dsa_area *area, int size_class); |
| 390 | static bool ensure_active_superblock(dsa_area *area, dsa_area_pool *pool, |
| 391 | int size_class); |
| 392 | static dsa_segment_map *get_segment_by_index(dsa_area *area, |
| 393 | dsa_segment_index index); |
| 394 | static void destroy_superblock(dsa_area *area, dsa_pointer span_pointer); |
| 395 | static void unlink_span(dsa_area *area, dsa_area_span *span); |
| 396 | static void add_span_to_fullness_class(dsa_area *area, dsa_area_span *span, |
| 397 | dsa_pointer span_pointer, int fclass); |
| 398 | static void unlink_segment(dsa_area *area, dsa_segment_map *segment_map); |
| 399 | static dsa_segment_map *get_best_segment(dsa_area *area, size_t npages); |
| 400 | static dsa_segment_map *make_new_segment(dsa_area *area, size_t requested_pages); |
| 401 | static dsa_area *create_internal(void *place, size_t size, |
| 402 | int tranche_id, |
| 403 | dsm_handle control_handle, |
| 404 | dsm_segment *control_segment); |
| 405 | static dsa_area *attach_internal(void *place, dsm_segment *segment, |
| 406 | dsa_handle handle); |
| 407 | static void check_for_freed_segments(dsa_area *area); |
| 408 | static void check_for_freed_segments_locked(dsa_area *area); |
| 409 | |
| 410 | /* |
| 411 | * Create a new shared area in a new DSM segment. Further DSM segments will |
| 412 | * be allocated as required to extend the available space. |
| 413 | * |
| 414 | * We can't allocate a LWLock tranche_id within this function, because tranche |
| 415 | * IDs are a scarce resource; there are only 64k available, using low numbers |
| 416 | * when possible matters, and we have no provision for recycling them. So, |
| 417 | * we require the caller to provide one. |
| 418 | */ |
| 419 | dsa_area * |
| 420 | dsa_create(int tranche_id) |
| 421 | { |
| 422 | dsm_segment *segment; |
| 423 | dsa_area *area; |
| 424 | |
| 425 | /* |
| 426 | * Create the DSM segment that will hold the shared control object and the |
| 427 | * first segment of usable space. |
| 428 | */ |
| 429 | segment = dsm_create(DSA_INITIAL_SEGMENT_SIZE, 0); |
| 430 | |
| 431 | /* |
| 432 | * All segments backing this area are pinned, so that DSA can explicitly |
| 433 | * control their lifetime (otherwise a newly created segment belonging to |
| 434 | * this area might be freed when the only backend that happens to have it |
| 435 | * mapped in ends, corrupting the area). |
| 436 | */ |
| 437 | dsm_pin_segment(segment); |
| 438 | |
| 439 | /* Create a new DSA area with the control object in this segment. */ |
| 440 | area = create_internal(dsm_segment_address(segment), |
| 441 | DSA_INITIAL_SEGMENT_SIZE, |
| 442 | tranche_id, |
| 443 | dsm_segment_handle(segment), segment); |
| 444 | |
| 445 | /* Clean up when the control segment detaches. */ |
| 446 | on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place, |
| 447 | PointerGetDatum(dsm_segment_address(segment))); |
| 448 | |
| 449 | return area; |
| 450 | } |
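
/*
 * A minimal usage sketch (how the tranche id is obtained and how the handle
 * reaches other processes are left to the caller):
 *
 *    dsa_area *area = dsa_create(tranche_id);
 *    dsa_handle handle = dsa_get_handle(area);
 *    ... publish 'handle' where other backends can find it ...
 *
 * and then, in another backend:
 *
 *    dsa_area *area = dsa_attach(handle);
 */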
| 451 | |
| 452 | /* |
| 453 | * Create a new shared area in an existing shared memory space, which may be |
| 454 | * either DSM or Postmaster-initialized memory. DSM segments will be |
| 455 | * allocated as required to extend the available space, though that can be |
| 456 | * prevented with dsa_set_size_limit(area, size) using the same size provided |
| 457 | * to dsa_create_in_place. |
| 458 | * |
| 459 | * Areas created in-place must eventually be released by the backend that |
| 460 | * created them and all backends that attach to them. This can be done |
| 461 | * explicitly with dsa_release_in_place, or, in the special case that 'place' |
| 462 | * happens to be in a pre-existing DSM segment, by passing in a pointer to the |
| 463 | * segment so that a detach hook can be registered with the containing DSM |
| 464 | * segment. |
| 465 | * |
| 466 | * See dsa_create() for a note about the tranche arguments. |
| 467 | */ |
| 468 | dsa_area * |
| 469 | dsa_create_in_place(void *place, size_t size, |
| 470 | int tranche_id, dsm_segment *segment) |
| 471 | { |
| 472 | dsa_area *area; |
| 473 | |
| 474 | area = create_internal(place, size, tranche_id, |
| 475 | DSM_HANDLE_INVALID, NULL); |
| 476 | |
| 477 | /* |
| 478 | * Clean up when the control segment detaches, if a containing DSM segment |
| 479 | * was provided. |
| 480 | */ |
| 481 | if (segment != NULL) |
| 482 | on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place, |
| 483 | PointerGetDatum(place)); |
| 484 | |
| 485 | return area; |
| 486 | } |
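
/*
 * A hypothetical sketch of creating an area inside traditional shared
 * memory, capped so that it never creates additional DSM segments (the
 * extra 64kB beyond dsa_minimum_size() is an arbitrary amount of working
 * space for the example):
 *
 *    size_t size = dsa_minimum_size() + 65536;
 *    void *place = ShmemAlloc(size);
 *    dsa_area *area = dsa_create_in_place(place, size, tranche_id, NULL);
 *    dsa_set_size_limit(area, size);
 */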
| 487 | |
| 488 | /* |
| 489 | * Obtain a handle that can be passed to other processes so that they can |
| 490 | * attach to the given area. Cannot be called for areas created with |
| 491 | * dsa_create_in_place. |
| 492 | */ |
| 493 | dsa_handle |
| 494 | dsa_get_handle(dsa_area *area) |
| 495 | { |
| 496 | Assert(area->control->handle != DSM_HANDLE_INVALID); |
| 497 | return area->control->handle; |
| 498 | } |
| 499 | |
| 500 | /* |
| 501 | * Attach to an area given a handle generated (possibly in another process) by |
| 502 | * dsa_get_handle. The area must have been created with dsa_create (not |
| 503 | * dsa_create_in_place). |
| 504 | */ |
| 505 | dsa_area * |
| 506 | dsa_attach(dsa_handle handle) |
| 507 | { |
| 508 | dsm_segment *segment; |
| 509 | dsa_area *area; |
| 510 | |
| 511 | /* |
| 512 | * An area handle is really a DSM segment handle for the first segment, so |
| 513 | * we go ahead and attach to that. |
| 514 | */ |
| 515 | segment = dsm_attach(handle); |
| 516 | if (segment == NULL) |
| 517 | ereport(ERROR, |
| 518 | (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), |
| 519 | errmsg("could not attach to dynamic shared area" ))); |
| 520 | |
| 521 | area = attach_internal(dsm_segment_address(segment), segment, handle); |
| 522 | |
| 523 | /* Clean up when the control segment detaches. */ |
| 524 | on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place, |
| 525 | PointerGetDatum(dsm_segment_address(segment))); |
| 526 | |
| 527 | return area; |
| 528 | } |
| 529 | |
| 530 | /* |
| 531 | * Attach to an area that was created with dsa_create_in_place. The caller |
| 532 | * must somehow know the location in memory that was used when the area was |
| 533 | * created, though it may be mapped at a different virtual address in this |
| 534 | * process. |
| 535 | * |
| 536 | * See dsa_create_in_place for note about releasing in-place areas, and the |
| 537 | * optional 'segment' argument which can be provided to allow automatic |
| 538 | * release if the containing memory happens to be a DSM segment. |
| 539 | */ |
| 540 | dsa_area * |
| 541 | dsa_attach_in_place(void *place, dsm_segment *segment) |
| 542 | { |
| 543 | dsa_area *area; |
| 544 | |
| 545 | area = attach_internal(place, NULL, DSM_HANDLE_INVALID); |
| 546 | |
| 547 | /* |
| 548 | * Clean up when the control segment detaches, if a containing DSM segment |
| 549 | * was provided. |
| 550 | */ |
| 551 | if (segment != NULL) |
| 552 | on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place, |
| 553 | PointerGetDatum(place)); |
| 554 | |
| 555 | return area; |
| 556 | } |
| 557 | |
| 558 | /* |
| 559 | * Release a DSA area that was produced by dsa_create_in_place or |
| 560 | * dsa_attach_in_place. The 'segment' argument is ignored but provides an |
| 561 | * interface suitable for on_dsm_detach, for the convenience of users who want |
| 562 | * to create a DSA segment inside an existing DSM segment and have it |
| 563 | * automatically released when the containing DSM segment is detached. |
| 564 | * 'place' should be the address of the place where the area was created. |
| 565 | * |
| 566 | * This callback is automatically registered for the DSM segment containing |
| 567 | * the control object of in-place areas when a segment is provided to |
| 568 | * dsa_create_in_place or dsa_attach_in_place, and also for all areas created |
| 569 | * with dsa_create. |
| 570 | */ |
| 571 | void |
| 572 | dsa_on_dsm_detach_release_in_place(dsm_segment *segment, Datum place) |
| 573 | { |
| 574 | dsa_release_in_place(DatumGetPointer(place)); |
| 575 | } |
| 576 | |
| 577 | /* |
| 578 | * Release a DSA area that was produced by dsa_create_in_place or |
| 579 | * dsa_attach_in_place. The 'code' argument is ignored but provides an |
| 580 | * interface suitable for on_shmem_exit or before_shmem_exit, for the |
| 581 | * convenience of users who want to create a DSA segment inside shared memory |
| 582 | * other than a DSM segment and have it automatically release at backend exit. |
| 583 | * 'place' should be the address of the place where the area was created. |
| 584 | */ |
| 585 | void |
| 586 | dsa_on_shmem_exit_release_in_place(int code, Datum place) |
| 587 | { |
| 588 | dsa_release_in_place(DatumGetPointer(place)); |
| 589 | } |
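
/*
 * For example, a hypothetical backend that created an area in non-DSM shared
 * memory could arrange for release at backend exit with:
 *
 *    on_shmem_exit(dsa_on_shmem_exit_release_in_place,
 *                  PointerGetDatum(place));
 */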
| 590 | |
| 591 | /* |
| 592 | * Release a DSA area that was produced by dsa_create_in_place or |
| 593 | * dsa_attach_in_place. It is preferable to use one of the 'dsa_on_XXX' |
| 594 | * callbacks so that this is managed automatically, because failure to release |
| 595 | * an area created in-place leaks its segments permanently. |
| 596 | * |
| 597 | * This is also called automatically for areas produced by dsa_create or |
| 598 | * dsa_attach as an implementation detail. |
| 599 | */ |
| 600 | void |
| 601 | dsa_release_in_place(void *place) |
| 602 | { |
| 603 | dsa_area_control *control = (dsa_area_control *) place; |
| 604 | int i; |
| 605 | |
| 606 | LWLockAcquire(&control->lock, LW_EXCLUSIVE); |
| 607 | Assert(control->segment_header.magic == |
| 608 | (DSA_SEGMENT_HEADER_MAGIC ^ control->handle ^ 0)); |
| 609 | Assert(control->refcnt > 0); |
| 610 | if (--control->refcnt == 0) |
| 611 | { |
| 612 | for (i = 0; i <= control->high_segment_index; ++i) |
| 613 | { |
| 614 | dsm_handle handle; |
| 615 | |
| 616 | handle = control->segment_handles[i]; |
| 617 | if (handle != DSM_HANDLE_INVALID) |
| 618 | dsm_unpin_segment(handle); |
| 619 | } |
| 620 | } |
| 621 | LWLockRelease(&control->lock); |
| 622 | } |
| 623 | |
| 624 | /* |
| 625 | * Keep a DSA area attached until end of session or explicit detach. |
| 626 | * |
| 627 | * By default, areas are owned by the current resource owner, which means they |
| 628 | * are detached automatically when that scope ends. |
| 629 | */ |
| 630 | void |
| 631 | dsa_pin_mapping(dsa_area *area) |
| 632 | { |
| 633 | int i; |
| 634 | |
| 635 | Assert(!area->mapping_pinned); |
| 636 | area->mapping_pinned = true; |
| 637 | |
| 638 | for (i = 0; i <= area->high_segment_index; ++i) |
| 639 | if (area->segment_maps[i].segment != NULL) |
| 640 | dsm_pin_mapping(area->segment_maps[i].segment); |
| 641 | } |
| 642 | |
| 643 | /* |
| 644 | * Allocate memory in this storage area. The return value is a dsa_pointer |
| 645 | * that can be passed to other processes, and converted to a local pointer |
| 646 | * with dsa_get_address. 'flags' is a bitmap which should be constructed |
| 647 | * from the following values: |
| 648 | * |
| 649 | * DSA_ALLOC_HUGE allows allocations >= 1GB. Otherwise, such allocations |
| 650 | * will result in an ERROR. |
| 651 | * |
| 652 | * DSA_ALLOC_NO_OOM causes this function to return InvalidDsaPointer when |
| 653 | * no memory is available or a size limit established by dsa_set_size_limit |
| 654 | * would be exceeded. Otherwise, such allocations will result in an ERROR. |
| 655 | * |
| 656 | * DSA_ALLOC_ZERO causes the allocated memory to be zeroed. Otherwise, the |
| 657 | * contents of newly-allocated memory are indeterminate. |
| 658 | * |
| 659 | * These flags correspond to similarly named flags used by |
| 660 | * MemoryContextAllocExtended(). See also the macros dsa_allocate and |
| 661 | * dsa_allocate0 which expand to a call to this function with commonly used |
| 662 | * flags. |
| 663 | */ |
| 664 | dsa_pointer |
| 665 | dsa_allocate_extended(dsa_area *area, size_t size, int flags) |
| 666 | { |
| 667 | uint16 size_class; |
| 668 | dsa_pointer start_pointer; |
| 669 | dsa_segment_map *segment_map; |
| 670 | dsa_pointer result; |
| 671 | |
| 672 | Assert(size > 0); |
| 673 | |
| 674 | /* Sanity check on huge individual allocation size. */ |
| 675 | if (((flags & DSA_ALLOC_HUGE) != 0 && !AllocHugeSizeIsValid(size)) || |
| 676 | ((flags & DSA_ALLOC_HUGE) == 0 && !AllocSizeIsValid(size))) |
| 677 | elog(ERROR, "invalid DSA memory alloc request size %zu" , size); |
| 678 | |
| 679 | /* |
| 680 | * If bigger than the largest size class, just grab a run of pages from |
| 681 | * the free page manager, instead of allocating an object from a pool. |
| 682 | * There will still be a span, but it's a special class of span that |
| 683 | * manages this whole allocation and simply gives all pages back to the |
| 684 | * free page manager when dsa_free is called. |
| 685 | */ |
| 686 | if (size > dsa_size_classes[lengthof(dsa_size_classes) - 1]) |
| 687 | { |
| 688 | size_t npages = fpm_size_to_pages(size); |
| 689 | size_t first_page; |
| 690 | dsa_pointer span_pointer; |
| 691 | dsa_area_pool *pool = &area->control->pools[DSA_SCLASS_SPAN_LARGE]; |
| 692 | |
| 693 | /* Obtain a span object. */ |
| 694 | span_pointer = alloc_object(area, DSA_SCLASS_BLOCK_OF_SPANS); |
| 695 | if (!DsaPointerIsValid(span_pointer)) |
| 696 | { |
| 697 | /* Raise error unless asked not to. */ |
| 698 | if ((flags & DSA_ALLOC_NO_OOM) == 0) |
| 699 | ereport(ERROR, |
| 700 | (errcode(ERRCODE_OUT_OF_MEMORY), |
| 701 | errmsg("out of memory" ), |
| 702 | errdetail("Failed on DSA request of size %zu." , |
| 703 | size))); |
| 704 | return InvalidDsaPointer; |
| 705 | } |
| 706 | |
| 707 | LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE); |
| 708 | |
| 709 | /* Find a segment from which to allocate. */ |
| 710 | segment_map = get_best_segment(area, npages); |
| 711 | if (segment_map == NULL) |
| 712 | segment_map = make_new_segment(area, npages); |
| 713 | if (segment_map == NULL) |
| 714 | { |
| 715 | /* Can't make any more segments: game over. */ |
| 716 | LWLockRelease(DSA_AREA_LOCK(area)); |
| 717 | dsa_free(area, span_pointer); |
| 718 | |
| 719 | /* Raise error unless asked not to. */ |
| 720 | if ((flags & DSA_ALLOC_NO_OOM) == 0) |
| 721 | ereport(ERROR, |
| 722 | (errcode(ERRCODE_OUT_OF_MEMORY), |
| 723 | errmsg("out of memory" ), |
| 724 | errdetail("Failed on DSA request of size %zu." , |
| 725 | size))); |
| 726 | return InvalidDsaPointer; |
| 727 | } |
| 728 | |
| 729 | /* |
| 730 | * Ask the free page manager for a run of pages. This should always |
| 731 | * succeed, since both get_best_segment and make_new_segment should |
| 732 | * only return a non-NULL pointer if it actually contains enough |
| 733 | * contiguous freespace. If it does fail, something in our backend |
| 734 | * private state is out of whack, so use FATAL to kill the process. |
| 735 | */ |
| 736 | if (!FreePageManagerGet(segment_map->fpm, npages, &first_page)) |
| 737 | elog(FATAL, |
| 738 | "dsa_allocate could not find %zu free pages" , npages); |
| 739 | LWLockRelease(DSA_AREA_LOCK(area)); |
| 740 | |
| 741 | start_pointer = DSA_MAKE_POINTER(get_segment_index(area, segment_map), |
| 742 | first_page * FPM_PAGE_SIZE); |
| 743 | |
| 744 | /* Initialize span and pagemap. */ |
| 745 | LWLockAcquire(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE), |
| 746 | LW_EXCLUSIVE); |
| 747 | init_span(area, span_pointer, pool, start_pointer, npages, |
| 748 | DSA_SCLASS_SPAN_LARGE); |
| 749 | segment_map->pagemap[first_page] = span_pointer; |
| 750 | LWLockRelease(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE)); |
| 751 | |
| 752 | /* Zero-initialize the memory if requested. */ |
| 753 | if ((flags & DSA_ALLOC_ZERO) != 0) |
| 754 | memset(dsa_get_address(area, start_pointer), 0, size); |
| 755 | |
| 756 | return start_pointer; |
| 757 | } |
| 758 | |
| 759 | /* Map allocation to a size class. */ |
| 760 | if (size < lengthof(dsa_size_class_map) * DSA_SIZE_CLASS_MAP_QUANTUM) |
| 761 | { |
| 762 | int mapidx; |
| 763 | |
| 764 | /* For smaller sizes we have a lookup table... */ |
| 765 | mapidx = ((size + DSA_SIZE_CLASS_MAP_QUANTUM - 1) / |
| 766 | DSA_SIZE_CLASS_MAP_QUANTUM) - 1; |
| 767 | size_class = dsa_size_class_map[mapidx]; |
| 768 | } |
| 769 | else |
| 770 | { |
| 771 | uint16 min; |
| 772 | uint16 max; |
| 773 | |
| 774 | /* ... and for the rest we search by binary chop. */ |
| 775 | min = dsa_size_class_map[lengthof(dsa_size_class_map) - 1]; |
| 776 | max = lengthof(dsa_size_classes) - 1; |
| 777 | |
| 778 | while (min < max) |
| 779 | { |
| 780 | uint16 mid = (min + max) / 2; |
| 781 | uint16 class_size = dsa_size_classes[mid]; |
| 782 | |
| 783 | if (class_size < size) |
| 784 | min = mid + 1; |
| 785 | else |
| 786 | max = mid; |
| 787 | } |
| 788 | |
| 789 | size_class = min; |
| 790 | } |
| 791 | Assert(size <= dsa_size_classes[size_class]); |
| 792 | Assert(size_class == 0 || size > dsa_size_classes[size_class - 1]); |
| 793 | |
| 794 | /* Attempt to allocate an object from the appropriate pool. */ |
| 795 | result = alloc_object(area, size_class); |
| 796 | |
| 797 | /* Check for failure to allocate. */ |
| 798 | if (!DsaPointerIsValid(result)) |
| 799 | { |
| 800 | /* Raise error unless asked not to. */ |
| 801 | if ((flags & DSA_ALLOC_NO_OOM) == 0) |
| 802 | ereport(ERROR, |
| 803 | (errcode(ERRCODE_OUT_OF_MEMORY), |
| 804 | errmsg("out of memory" ), |
| 805 | errdetail("Failed on DSA request of size %zu." , size))); |
| 806 | return InvalidDsaPointer; |
| 807 | } |
| 808 | |
| 809 | /* Zero-initialize the memory if requested. */ |
| 810 | if ((flags & DSA_ALLOC_ZERO) != 0) |
| 811 | memset(dsa_get_address(area, result), 0, size); |
| 812 | |
| 813 | return result; |
| 814 | } |
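
/*
 * A minimal allocate/free round trip (error handling omitted):
 *
 *    dsa_pointer dp = dsa_allocate_extended(area, 42, DSA_ALLOC_ZERO);
 *    char *p = dsa_get_address(area, dp);
 *
 * 'dp' may be published in shared memory for other backends, each of which
 * converts it to a local pointer with its own dsa_get_address; any of them
 * may later call dsa_free(area, dp).
 */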
| 815 | |
| 816 | /* |
| 817 | * Free memory obtained with dsa_allocate. |
| 818 | */ |
| 819 | void |
| 820 | dsa_free(dsa_area *area, dsa_pointer dp) |
| 821 | { |
| 822 | dsa_segment_map *segment_map; |
| 823 | int pageno; |
| 824 | dsa_pointer span_pointer; |
| 825 | dsa_area_span *span; |
| 826 | char *superblock; |
| 827 | char *object; |
| 828 | size_t size; |
| 829 | int size_class; |
| 830 | |
| 831 | /* Make sure we don't have a stale segment in the slot 'dp' refers to. */ |
| 832 | check_for_freed_segments(area); |
| 833 | |
| 834 | /* Locate the object, span and pool. */ |
| 835 | segment_map = get_segment_by_index(area, DSA_EXTRACT_SEGMENT_NUMBER(dp)); |
| 836 | pageno = DSA_EXTRACT_OFFSET(dp) / FPM_PAGE_SIZE; |
| 837 | span_pointer = segment_map->pagemap[pageno]; |
| 838 | span = dsa_get_address(area, span_pointer); |
| 839 | superblock = dsa_get_address(area, span->start); |
| 840 | object = dsa_get_address(area, dp); |
| 841 | size_class = span->size_class; |
| 842 | size = dsa_size_classes[size_class]; |
| 843 | |
| 844 | /* |
| 845 | * Special case for large objects that live in a special span: we return |
| 846 | * those pages directly to the free page manager and free the span. |
| 847 | */ |
| 848 | if (span->size_class == DSA_SCLASS_SPAN_LARGE) |
| 849 | { |
| 851 | #ifdef CLOBBER_FREED_MEMORY |
| 852 | memset(object, 0x7f, span->npages * FPM_PAGE_SIZE); |
| 853 | #endif |
| 854 | |
| 855 | /* Give pages back to free page manager. */ |
| 856 | LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE); |
| 857 | FreePageManagerPut(segment_map->fpm, |
| 858 | DSA_EXTRACT_OFFSET(span->start) / FPM_PAGE_SIZE, |
| 859 | span->npages); |
| 860 | LWLockRelease(DSA_AREA_LOCK(area)); |
| 861 | /* Unlink span. */ |
| 862 | LWLockAcquire(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE), |
| 863 | LW_EXCLUSIVE); |
| 864 | unlink_span(area, span); |
| 865 | LWLockRelease(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE)); |
| 866 | /* Free the span object so it can be reused. */ |
| 867 | dsa_free(area, span_pointer); |
| 868 | return; |
| 869 | } |
| 870 | |
| 871 | #ifdef CLOBBER_FREED_MEMORY |
| 872 | memset(object, 0x7f, size); |
| 873 | #endif |
| 874 | |
| 875 | LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE); |
| 876 | |
| 877 | /* Put the object on the span's freelist. */ |
| 878 | Assert(object >= superblock); |
| 879 | Assert(object < superblock + DSA_SUPERBLOCK_SIZE); |
| 880 | Assert((object - superblock) % size == 0); |
| 881 | NextFreeObjectIndex(object) = span->firstfree; |
| 882 | span->firstfree = (object - superblock) / size; |
| 883 | ++span->nallocatable; |
| 884 | |
| 885 | /* |
* See if the span needs to be moved to a different fullness class, or be
| 887 | * freed so its pages can be given back to the segment. |
| 888 | */ |
| 889 | if (span->nallocatable == 1 && span->fclass == DSA_FULLNESS_CLASSES - 1) |
| 890 | { |
| 891 | /* |
| 892 | * The block was completely full and is located in the |
| 893 | * highest-numbered fullness class, which is never scanned for free |
| 894 | * chunks. We must move it to the next-lower fullness class. |
| 895 | */ |
| 896 | unlink_span(area, span); |
| 897 | add_span_to_fullness_class(area, span, span_pointer, |
| 898 | DSA_FULLNESS_CLASSES - 2); |
| 899 | |
| 900 | /* |
| 901 | * If this is the only span, and there is no active span, then we |
| 902 | * should probably move this span to fullness class 1. (Otherwise if |
| 903 | * you allocate exactly all the objects in the only span, it moves to |
| 904 | * class 3, then you free them all, it moves to 2, and then is given |
| 905 | * back, leaving no active span). |
| 906 | */ |
| 907 | } |
| 908 | else if (span->nallocatable == span->nmax && |
| 909 | (span->fclass != 1 || span->prevspan != InvalidDsaPointer)) |
| 910 | { |
| 911 | /* |
| 912 | * This entire block is free, and it's not the active block for this |
| 913 | * size class. Return the memory to the free page manager. We don't |
| 914 | * do this for the active block to prevent hysteresis: if we |
| 915 | * repeatedly allocate and free the only chunk in the active block, it |
| 916 | * will be very inefficient if we deallocate and reallocate the block |
| 917 | * every time. |
| 918 | */ |
| 919 | destroy_superblock(area, span_pointer); |
| 920 | } |
| 921 | |
| 922 | LWLockRelease(DSA_SCLASS_LOCK(area, size_class)); |
| 923 | } |
| 924 | |
| 925 | /* |
| 926 | * Obtain a backend-local address for a dsa_pointer. 'dp' must point to |
| 927 | * memory allocated by the given area (possibly in another process) that |
| 928 | * hasn't yet been freed. This may cause a segment to be mapped into the |
| 929 | * current process if required, and may cause freed segments to be unmapped. |
| 930 | */ |
| 931 | void * |
| 932 | dsa_get_address(dsa_area *area, dsa_pointer dp) |
| 933 | { |
| 934 | dsa_segment_index index; |
| 935 | size_t offset; |
| 936 | |
| 937 | /* Convert InvalidDsaPointer to NULL. */ |
| 938 | if (!DsaPointerIsValid(dp)) |
| 939 | return NULL; |
| 940 | |
| 941 | /* Process any requests to detach from freed segments. */ |
| 942 | check_for_freed_segments(area); |
| 943 | |
| 944 | /* Break the dsa_pointer into its components. */ |
| 945 | index = DSA_EXTRACT_SEGMENT_NUMBER(dp); |
| 946 | offset = DSA_EXTRACT_OFFSET(dp); |
| 947 | Assert(index < DSA_MAX_SEGMENTS); |
| 948 | |
| 949 | /* Check if we need to cause this segment to be mapped in. */ |
| 950 | if (unlikely(area->segment_maps[index].mapped_address == NULL)) |
| 951 | { |
| 952 | /* Call for effect (we don't need the result). */ |
| 953 | get_segment_by_index(area, index); |
| 954 | } |
| 955 | |
| 956 | return area->segment_maps[index].mapped_address + offset; |
| 957 | } |
| 958 | |
| 959 | /* |
| 960 | * Pin this area, so that it will continue to exist even if all backends |
| 961 | * detach from it. In that case, the area can still be reattached to if a |
| 962 | * handle has been recorded somewhere. |
| 963 | */ |
| 964 | void |
| 965 | dsa_pin(dsa_area *area) |
| 966 | { |
| 967 | LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE); |
| 968 | if (area->control->pinned) |
| 969 | { |
| 970 | LWLockRelease(DSA_AREA_LOCK(area)); |
| 971 | elog(ERROR, "dsa_area already pinned" ); |
| 972 | } |
| 973 | area->control->pinned = true; |
| 974 | ++area->control->refcnt; |
| 975 | LWLockRelease(DSA_AREA_LOCK(area)); |
| 976 | } |
| 977 | |
| 978 | /* |
| 979 | * Undo the effects of dsa_pin, so that the given area can be freed when no |
| 980 | * backends are attached to it. May be called only if dsa_pin has been |
| 981 | * called. |
| 982 | */ |
| 983 | void |
| 984 | dsa_unpin(dsa_area *area) |
| 985 | { |
| 986 | LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE); |
| 987 | Assert(area->control->refcnt > 1); |
| 988 | if (!area->control->pinned) |
| 989 | { |
| 990 | LWLockRelease(DSA_AREA_LOCK(area)); |
| 991 | elog(ERROR, "dsa_area not pinned" ); |
| 992 | } |
| 993 | area->control->pinned = false; |
| 994 | --area->control->refcnt; |
| 995 | LWLockRelease(DSA_AREA_LOCK(area)); |
| 996 | } |
| 997 | |
| 998 | /* |
| 999 | * Set the total size limit for this area. This limit is checked whenever new |
| 1000 | * segments need to be allocated from the operating system. If the new size |
| 1001 | * limit is already exceeded, this has no immediate effect. |
| 1002 | * |
| 1003 | * Note that the total virtual memory usage may be temporarily larger than |
| 1004 | * this limit when segments have been freed, but not yet detached by all |
| 1005 | * backends that have attached to them. |
| 1006 | */ |
| 1007 | void |
| 1008 | dsa_set_size_limit(dsa_area *area, size_t limit) |
| 1009 | { |
| 1010 | LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE); |
| 1011 | area->control->max_total_segment_size = limit; |
| 1012 | LWLockRelease(DSA_AREA_LOCK(area)); |
| 1013 | } |
| 1014 | |
| 1015 | /* |
| 1016 | * Aggressively free all spare memory in the hope of returning DSM segments to |
| 1017 | * the operating system. |
| 1018 | */ |
| 1019 | void |
| 1020 | dsa_trim(dsa_area *area) |
| 1021 | { |
| 1022 | int size_class; |
| 1023 | |
| 1024 | /* |
| 1025 | * Trim in reverse pool order so we get to the spans-of-spans last, just |
| 1026 | * in case any become entirely free while processing all the other pools. |
| 1027 | */ |
| 1028 | for (size_class = DSA_NUM_SIZE_CLASSES - 1; size_class >= 0; --size_class) |
| 1029 | { |
| 1030 | dsa_area_pool *pool = &area->control->pools[size_class]; |
| 1031 | dsa_pointer span_pointer; |
| 1032 | |
| 1033 | if (size_class == DSA_SCLASS_SPAN_LARGE) |
| 1034 | { |
| 1035 | /* Large object frees give back segments aggressively already. */ |
| 1036 | continue; |
| 1037 | } |
| 1038 | |
| 1039 | /* |
| 1040 | * Search fullness class 1 only. That is where we expect to find an |
| 1041 | * entirely empty superblock (entirely empty superblocks in other |
| 1042 | * fullness classes are returned to the free page map by dsa_free). |
| 1043 | */ |
| 1044 | LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE); |
| 1045 | span_pointer = pool->spans[1]; |
| 1046 | while (DsaPointerIsValid(span_pointer)) |
| 1047 | { |
| 1048 | dsa_area_span *span = dsa_get_address(area, span_pointer); |
| 1049 | dsa_pointer next = span->nextspan; |
| 1050 | |
| 1051 | if (span->nallocatable == span->nmax) |
| 1052 | destroy_superblock(area, span_pointer); |
| 1053 | |
| 1054 | span_pointer = next; |
| 1055 | } |
| 1056 | LWLockRelease(DSA_SCLASS_LOCK(area, size_class)); |
| 1057 | } |
| 1058 | } |
| 1059 | |
| 1060 | /* |
| 1061 | * Print out debugging information about the internal state of the shared |
| 1062 | * memory area. |
| 1063 | */ |
| 1064 | void |
| 1065 | dsa_dump(dsa_area *area) |
| 1066 | { |
| 1067 | size_t i, |
| 1068 | j; |
| 1069 | |
| 1070 | /* |
| 1071 | * Note: This gives an inconsistent snapshot as it acquires and releases |
| 1072 | * individual locks as it goes... |
| 1073 | */ |
| 1074 | |
| 1075 | LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE); |
| 1076 | check_for_freed_segments_locked(area); |
| 1077 | fprintf(stderr, "dsa_area handle %x:\n" , area->control->handle); |
| 1078 | fprintf(stderr, " max_total_segment_size: %zu\n" , |
| 1079 | area->control->max_total_segment_size); |
| 1080 | fprintf(stderr, " total_segment_size: %zu\n" , |
| 1081 | area->control->total_segment_size); |
| 1082 | fprintf(stderr, " refcnt: %d\n" , area->control->refcnt); |
| 1083 | fprintf(stderr, " pinned: %c\n" , area->control->pinned ? 't' : 'f'); |
| 1084 | fprintf(stderr, " segment bins:\n" ); |
| 1085 | for (i = 0; i < DSA_NUM_SEGMENT_BINS; ++i) |
| 1086 | { |
| 1087 | if (area->control->segment_bins[i] != DSA_SEGMENT_INDEX_NONE) |
| 1088 | { |
| 1089 | dsa_segment_index segment_index; |
| 1090 | |
| 1091 | fprintf(stderr, |
| 1092 | " segment bin %zu (at least %d contiguous pages free):\n" , |
| 1093 | i, 1 << (i - 1)); |
| 1094 | segment_index = area->control->segment_bins[i]; |
| 1095 | while (segment_index != DSA_SEGMENT_INDEX_NONE) |
| 1096 | { |
| 1097 | dsa_segment_map *segment_map; |
| 1098 | |
| 1099 | segment_map = |
| 1100 | get_segment_by_index(area, segment_index); |
| 1101 | |
| 1102 | fprintf(stderr, |
| 1103 | " segment index %zu, usable_pages = %zu, " |
| 1104 | "contiguous_pages = %zu, mapped at %p\n" , |
| 1105 | segment_index, |
| 1106 | segment_map->header->usable_pages, |
| 1107 | fpm_largest(segment_map->fpm), |
| 1108 | segment_map->mapped_address); |
| 1109 | segment_index = segment_map->header->next; |
| 1110 | } |
| 1111 | } |
| 1112 | } |
| 1113 | LWLockRelease(DSA_AREA_LOCK(area)); |
| 1114 | |
| 1115 | fprintf(stderr, " pools:\n" ); |
| 1116 | for (i = 0; i < DSA_NUM_SIZE_CLASSES; ++i) |
| 1117 | { |
| 1118 | bool found = false; |
| 1119 | |
| 1120 | LWLockAcquire(DSA_SCLASS_LOCK(area, i), LW_EXCLUSIVE); |
| 1121 | for (j = 0; j < DSA_FULLNESS_CLASSES; ++j) |
| 1122 | if (DsaPointerIsValid(area->control->pools[i].spans[j])) |
| 1123 | found = true; |
| 1124 | if (found) |
| 1125 | { |
| 1126 | if (i == DSA_SCLASS_BLOCK_OF_SPANS) |
| 1127 | fprintf(stderr, " pool for blocks of span objects:\n" ); |
| 1128 | else if (i == DSA_SCLASS_SPAN_LARGE) |
| 1129 | fprintf(stderr, " pool for large object spans:\n" ); |
| 1130 | else |
| 1131 | fprintf(stderr, |
| 1132 | " pool for size class %zu (object size %hu bytes):\n" , |
| 1133 | i, dsa_size_classes[i]); |
| 1134 | for (j = 0; j < DSA_FULLNESS_CLASSES; ++j) |
| 1135 | { |
| 1136 | if (!DsaPointerIsValid(area->control->pools[i].spans[j])) |
| 1137 | fprintf(stderr, " fullness class %zu is empty\n" , j); |
| 1138 | else |
| 1139 | { |
| 1140 | dsa_pointer span_pointer = area->control->pools[i].spans[j]; |
| 1141 | |
| 1142 | fprintf(stderr, " fullness class %zu:\n" , j); |
| 1143 | while (DsaPointerIsValid(span_pointer)) |
| 1144 | { |
| 1145 | dsa_area_span *span; |
| 1146 | |
| 1147 | span = dsa_get_address(area, span_pointer); |
| 1148 | fprintf(stderr, |
| 1149 | " span descriptor at " |
| 1150 | DSA_POINTER_FORMAT ", superblock at " |
| 1151 | DSA_POINTER_FORMAT |
| 1152 | ", pages = %zu, objects free = %hu/%hu\n" , |
| 1153 | span_pointer, span->start, span->npages, |
| 1154 | span->nallocatable, span->nmax); |
| 1155 | span_pointer = span->nextspan; |
| 1156 | } |
| 1157 | } |
| 1158 | } |
| 1159 | } |
| 1160 | LWLockRelease(DSA_SCLASS_LOCK(area, i)); |
| 1161 | } |
| 1162 | } |
| 1163 | |
| 1164 | /* |
| 1165 | * Return the smallest size that you can successfully provide to |
| 1166 | * dsa_create_in_place. |
| 1167 | */ |
| 1168 | size_t |
| 1169 | dsa_minimum_size(void) |
| 1170 | { |
| 1171 | size_t size; |
| 1172 | int pages = 0; |
| 1173 | |
| 1174 | size = MAXALIGN(sizeof(dsa_area_control)) + |
| 1175 | MAXALIGN(sizeof(FreePageManager)); |
| 1176 | |
| 1177 | /* Figure out how many pages we need, including the page map... */ |
| 1178 | while (((size + FPM_PAGE_SIZE - 1) / FPM_PAGE_SIZE) > pages) |
| 1179 | { |
| 1180 | ++pages; |
| 1181 | size += sizeof(dsa_pointer); |
| 1182 | } |
| 1183 | |
| 1184 | return pages * FPM_PAGE_SIZE; |
| 1185 | } |
| 1186 | |
| 1187 | /* |
| 1188 | * Workhorse function for dsa_create and dsa_create_in_place. |
| 1189 | */ |
| 1190 | static dsa_area * |
| 1191 | create_internal(void *place, size_t size, |
| 1192 | int tranche_id, |
| 1193 | dsm_handle control_handle, |
| 1194 | dsm_segment *control_segment) |
| 1195 | { |
| 1196 | dsa_area_control *control; |
| 1197 | dsa_area *area; |
| 1198 | dsa_segment_map *segment_map; |
| 1199 | size_t usable_pages; |
| 1200 | size_t total_pages; |
| 1201 | size_t metadata_bytes; |
| 1202 | int i; |
| 1203 | |
| 1204 | /* Sanity check on the space we have to work in. */ |
| 1205 | if (size < dsa_minimum_size()) |
| 1206 | elog(ERROR, "dsa_area space must be at least %zu, but %zu provided" , |
| 1207 | dsa_minimum_size(), size); |
| 1208 | |
| 1209 | /* Now figure out how much space is usable */ |
| 1210 | total_pages = size / FPM_PAGE_SIZE; |
| 1211 | metadata_bytes = |
| 1212 | MAXALIGN(sizeof(dsa_area_control)) + |
| 1213 | MAXALIGN(sizeof(FreePageManager)) + |
| 1214 | total_pages * sizeof(dsa_pointer); |
| 1215 | /* Add padding up to next page boundary. */ |
| 1216 | if (metadata_bytes % FPM_PAGE_SIZE != 0) |
| 1217 | metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE); |
| 1218 | Assert(metadata_bytes <= size); |
| 1219 | usable_pages = (size - metadata_bytes) / FPM_PAGE_SIZE; |
| 1220 | |
| 1221 | /* |
| 1222 | * Initialize the dsa_area_control object located at the start of the |
| 1223 | * space. |
| 1224 | */ |
| 1225 | control = (dsa_area_control *) place; |
| 1226 | control->segment_header.magic = |
| 1227 | DSA_SEGMENT_HEADER_MAGIC ^ control_handle ^ 0; |
| 1228 | control->segment_header.next = DSA_SEGMENT_INDEX_NONE; |
| 1229 | control->segment_header.prev = DSA_SEGMENT_INDEX_NONE; |
| 1230 | control->segment_header.usable_pages = usable_pages; |
| 1231 | control->segment_header.freed = false; |
| 1232 | control->segment_header.size = DSA_INITIAL_SEGMENT_SIZE; |
| 1233 | control->handle = control_handle; |
| 1234 | control->max_total_segment_size = (size_t) -1; |
| 1235 | control->total_segment_size = size; |
| 1236 | memset(&control->segment_handles[0], 0, |
| 1237 | sizeof(dsm_handle) * DSA_MAX_SEGMENTS); |
| 1238 | control->segment_handles[0] = control_handle; |
| 1239 | for (i = 0; i < DSA_NUM_SEGMENT_BINS; ++i) |
| 1240 | control->segment_bins[i] = DSA_SEGMENT_INDEX_NONE; |
| 1241 | control->high_segment_index = 0; |
| 1242 | control->refcnt = 1; |
| 1243 | control->freed_segment_counter = 0; |
| 1244 | control->lwlock_tranche_id = tranche_id; |
| 1245 | |
| 1246 | /* |
| 1247 | * Create the dsa_area object that this backend will use to access the |
| 1248 | * area. Other backends will need to obtain their own dsa_area object by |
| 1249 | * attaching. |
| 1250 | */ |
| 1251 | area = palloc(sizeof(dsa_area)); |
| 1252 | area->control = control; |
| 1253 | area->mapping_pinned = false; |
| 1254 | memset(area->segment_maps, 0, sizeof(dsa_segment_map) * DSA_MAX_SEGMENTS); |
| 1255 | area->high_segment_index = 0; |
| 1256 | area->freed_segment_counter = 0; |
| 1257 | LWLockInitialize(&control->lock, control->lwlock_tranche_id); |
| 1258 | for (i = 0; i < DSA_NUM_SIZE_CLASSES; ++i) |
| 1259 | LWLockInitialize(DSA_SCLASS_LOCK(area, i), |
| 1260 | control->lwlock_tranche_id); |
| 1261 | |
| 1262 | /* Set up the segment map for this process's mapping. */ |
| 1263 | segment_map = &area->segment_maps[0]; |
| 1264 | segment_map->segment = control_segment; |
| 1265 | segment_map->mapped_address = place; |
| 1266 | segment_map->header = (dsa_segment_header *) place; |
| 1267 | segment_map->fpm = (FreePageManager *) |
| 1268 | (segment_map->mapped_address + |
| 1269 | MAXALIGN(sizeof(dsa_area_control))); |
| 1270 | segment_map->pagemap = (dsa_pointer *) |
| 1271 | (segment_map->mapped_address + |
| 1272 | MAXALIGN(sizeof(dsa_area_control)) + |
| 1273 | MAXALIGN(sizeof(FreePageManager))); |
| 1274 | |
| 1275 | /* Set up the free page map. */ |
| 1276 | FreePageManagerInitialize(segment_map->fpm, segment_map->mapped_address); |
/* There can be 0 usable pages if size is dsa_minimum_size(). */
if (usable_pages > 0)
| 1280 | FreePageManagerPut(segment_map->fpm, metadata_bytes / FPM_PAGE_SIZE, |
| 1281 | usable_pages); |
| 1282 | |
| 1283 | /* Put this segment into the appropriate bin. */ |
| 1284 | control->segment_bins[contiguous_pages_to_segment_bin(usable_pages)] = 0; |
| 1285 | segment_map->header->bin = contiguous_pages_to_segment_bin(usable_pages); |
| 1286 | |
| 1287 | return area; |
| 1288 | } |
| 1289 | |
| 1290 | /* |
| 1291 | * Workhorse function for dsa_attach and dsa_attach_in_place. |
| 1292 | */ |
| 1293 | static dsa_area * |
| 1294 | attach_internal(void *place, dsm_segment *segment, dsa_handle handle) |
| 1295 | { |
| 1296 | dsa_area_control *control; |
| 1297 | dsa_area *area; |
| 1298 | dsa_segment_map *segment_map; |
| 1299 | |
| 1300 | control = (dsa_area_control *) place; |
| 1301 | Assert(control->handle == handle); |
| 1302 | Assert(control->segment_handles[0] == handle); |
| 1303 | Assert(control->segment_header.magic == |
| 1304 | (DSA_SEGMENT_HEADER_MAGIC ^ handle ^ 0)); |
| 1305 | |
| 1306 | /* Build the backend-local area object. */ |
| 1307 | area = palloc(sizeof(dsa_area)); |
| 1308 | area->control = control; |
| 1309 | area->mapping_pinned = false; |
| 1310 | memset(&area->segment_maps[0], 0, |
| 1311 | sizeof(dsa_segment_map) * DSA_MAX_SEGMENTS); |
| 1312 | area->high_segment_index = 0; |
| 1313 | |
| 1314 | /* Set up the segment map for this process's mapping. */ |
| 1315 | segment_map = &area->segment_maps[0]; |
| 1316 | segment_map->segment = segment; /* NULL for in-place */ |
| 1317 | segment_map->mapped_address = place; |
| 1318 | segment_map->header = (dsa_segment_header *) segment_map->mapped_address; |
| 1319 | segment_map->fpm = (FreePageManager *) |
| 1320 | (segment_map->mapped_address + MAXALIGN(sizeof(dsa_area_control))); |
| 1321 | segment_map->pagemap = (dsa_pointer *) |
| 1322 | (segment_map->mapped_address + MAXALIGN(sizeof(dsa_area_control)) + |
| 1323 | MAXALIGN(sizeof(FreePageManager))); |
| 1324 | |
| 1325 | /* Bump the reference count. */ |
| 1326 | LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE); |
| 1327 | if (control->refcnt == 0) |
| 1328 | { |
| 1329 | /* We can't attach to a DSA area that has already been destroyed. */ |
| 1330 | ereport(ERROR, |
| 1331 | (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), |
| 1332 | errmsg("could not attach to dynamic shared area"))); |
| 1333 | } |
| 1334 | ++control->refcnt; |
| 1335 | area->freed_segment_counter = area->control->freed_segment_counter; |
| 1336 | LWLockRelease(DSA_AREA_LOCK(area)); |
| 1337 | |
| 1338 | return area; |
| 1339 | } |
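|  |  |
|  | /* |
|  |  * For illustration only (a sketch, not code from this module): a typical |
|  |  * attaching backend, given a handle published by the area's creator, |
|  |  * does something like |
|  |  * |
|  |  *     dsa_area   *area = dsa_attach(handle); |
|  |  *     dsa_pointer p = dsa_allocate(area, size); |
|  |  *     void       *mem = dsa_get_address(area, p); |
|  |  * |
|  |  * dsa_attach maps the control segment and then lands in attach_internal |
|  |  * above; 'handle' and 'size' are the caller's own. |
|  |  */ |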
| 1340 | |
| 1341 | /* |
| 1342 | * Add a new span to fullness class 1 of the indicated pool. |
| 1343 | */ |
| 1344 | static void |
| 1345 | init_span(dsa_area *area, |
| 1346 | dsa_pointer span_pointer, |
| 1347 | dsa_area_pool *pool, dsa_pointer start, size_t npages, |
| 1348 | uint16 size_class) |
| 1349 | { |
| 1350 | dsa_area_span *span = dsa_get_address(area, span_pointer); |
| 1351 | size_t obsize = dsa_size_classes[size_class]; |
| 1352 | |
| 1353 | /* |
| 1354 | * The per-pool lock must be held because we manipulate the span list for |
| 1355 | * this pool. |
| 1356 | */ |
| 1357 | Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class))); |
| 1358 | |
| 1359 | /* Push this span onto the front of the span list for fullness class 1. */ |
| 1360 | if (DsaPointerIsValid(pool->spans[1])) |
| 1361 | { |
| 1362 | dsa_area_span *head = (dsa_area_span *) |
| 1363 | dsa_get_address(area, pool->spans[1]); |
| 1364 | |
| 1365 | head->prevspan = span_pointer; |
| 1366 | } |
| 1367 | span->pool = DsaAreaPoolToDsaPointer(area, pool); |
| 1368 | span->nextspan = pool->spans[1]; |
| 1369 | span->prevspan = InvalidDsaPointer; |
| 1370 | pool->spans[1] = span_pointer; |
| 1371 | |
| 1372 | span->start = start; |
| 1373 | span->npages = npages; |
| 1374 | span->size_class = size_class; |
| 1375 | span->ninitialized = 0; |
| 1376 | if (size_class == DSA_SCLASS_BLOCK_OF_SPANS) |
| 1377 | { |
| 1378 | /* |
| 1379 | * A block-of-spans contains its own descriptor, so mark one object as |
| 1380 | * initialized and reduce the count of allocatable objects by one. |
| 1381 | * Doing this here has the side effect of also reducing nmax by one, |
| 1382 | * which is important to make sure we free this object at the correct |
| 1383 | * time. |
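|  |  * |
|  |  * For example (sizes purely illustrative): with 4KB FPM pages and a |
|  |  * 64-byte dsa_area_span, a one-page block-of-spans would yield |
|  |  * 4096 / 64 - 1 = 63 allocatable span objects. |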
| 1384 | */ |
| 1385 | span->ninitialized = 1; |
| 1386 | span->nallocatable = FPM_PAGE_SIZE / obsize - 1; |
| 1387 | } |
| 1388 | else if (size_class != DSA_SCLASS_SPAN_LARGE) |
| 1389 | span->nallocatable = DSA_SUPERBLOCK_SIZE / obsize; |
| 1390 | span->firstfree = DSA_SPAN_NOTHING_FREE; |
| 1391 | span->nmax = span->nallocatable; |
| 1392 | span->fclass = 1; |
| 1393 | } |
| 1394 | |
| 1395 | /* |
| 1396 | * Transfer the first span in one fullness class to the head of another |
| 1397 | * fullness class. |
| 1398 | */ |
| 1399 | static bool |
| 1400 | transfer_first_span(dsa_area *area, |
| 1401 | dsa_area_pool *pool, int fromclass, int toclass) |
| 1402 | { |
| 1403 | dsa_pointer span_pointer; |
| 1404 | dsa_area_span *span; |
| 1405 | dsa_area_span *nextspan; |
| 1406 | |
| 1407 | /* Can't do it if source list is empty. */ |
| 1408 | span_pointer = pool->spans[fromclass]; |
| 1409 | if (!DsaPointerIsValid(span_pointer)) |
| 1410 | return false; |
| 1411 | |
| 1412 | /* Remove span from head of source list. */ |
| 1413 | span = dsa_get_address(area, span_pointer); |
| 1414 | pool->spans[fromclass] = span->nextspan; |
| 1415 | if (DsaPointerIsValid(span->nextspan)) |
| 1416 | { |
| 1417 | nextspan = (dsa_area_span *) |
| 1418 | dsa_get_address(area, span->nextspan); |
| 1419 | nextspan->prevspan = InvalidDsaPointer; |
| 1420 | } |
| 1421 | |
| 1422 | /* Add span to head of target list. */ |
| 1423 | span->nextspan = pool->spans[toclass]; |
| 1424 | pool->spans[toclass] = span_pointer; |
| 1425 | if (DsaPointerIsValid(span->nextspan)) |
| 1426 | { |
| 1427 | nextspan = (dsa_area_span *) |
| 1428 | dsa_get_address(area, span->nextspan); |
| 1429 | nextspan->prevspan = span_pointer; |
| 1430 | } |
| 1431 | span->fclass = toclass; |
| 1432 | |
| 1433 | return true; |
| 1434 | } |
| 1435 | |
| 1436 | /* |
| 1437 | * Allocate one object of the requested size class from the given area. |
| 1438 | */ |
| 1439 | static inline dsa_pointer |
| 1440 | alloc_object(dsa_area *area, int size_class) |
| 1441 | { |
| 1442 | dsa_area_pool *pool = &area->control->pools[size_class]; |
| 1443 | dsa_area_span *span; |
| 1444 | dsa_pointer block; |
| 1445 | dsa_pointer result; |
| 1446 | char *object; |
| 1447 | size_t size; |
| 1448 | |
| 1449 | /* |
| 1450 | * Even though ensure_active_superblock can in turn call alloc_object if |
| 1451 | * it needs to allocate a new span, that's always from a different pool, |
| 1452 | * and the order of lock acquisition is always the same, so it's OK that |
| 1453 | * we hold this lock for the duration of this function. |
| 1454 | */ |
| 1455 | Assert(!LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class))); |
| 1456 | LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE); |
| 1457 | |
| 1458 | /* |
| 1459 | * If there's no active superblock, we must successfully obtain one or |
| 1460 | * fail the request. |
| 1461 | */ |
| 1462 | if (!DsaPointerIsValid(pool->spans[1]) && |
| 1463 | !ensure_active_superblock(area, pool, size_class)) |
| 1464 | { |
| 1465 | result = InvalidDsaPointer; |
| 1466 | } |
| 1467 | else |
| 1468 | { |
| 1469 | /* |
| 1470 | * There should be a block in fullness class 1 at this point, and it |
| 1471 | * should never be completely full. Thus we can either pop an object |
| 1472 | * from the free list or, failing that, initialize a new object. |
| 1473 | */ |
| 1474 | Assert(DsaPointerIsValid(pool->spans[1])); |
| 1475 | span = (dsa_area_span *) |
| 1476 | dsa_get_address(area, pool->spans[1]); |
| 1477 | Assert(span->nallocatable > 0); |
| 1478 | block = span->start; |
| 1479 | Assert(size_class < DSA_NUM_SIZE_CLASSES); |
| 1480 | size = dsa_size_classes[size_class]; |
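|  |  |
|  | /* |
|  |  * Free objects in a span form an intrusive singly-linked list: each |
|  |  * free object's first bytes hold the index of the next free object |
|  |  * (read back via NextFreeObjectIndex), with DSA_SPAN_NOTHING_FREE as |
|  |  * the terminator.  Prefer recycling a freed object; otherwise carve |
|  |  * the next never-initialized object off the end. |
|  |  */ |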
| 1481 | if (span->firstfree != DSA_SPAN_NOTHING_FREE) |
| 1482 | { |
| 1483 | result = block + span->firstfree * size; |
| 1484 | object = dsa_get_address(area, result); |
| 1485 | span->firstfree = NextFreeObjectIndex(object); |
| 1486 | } |
| 1487 | else |
| 1488 | { |
| 1489 | result = block + span->ninitialized * size; |
| 1490 | ++span->ninitialized; |
| 1491 | } |
| 1492 | --span->nallocatable; |
| 1493 | |
| 1494 | /* If it's now full, move it to the highest-numbered fullness class. */ |
| 1495 | if (span->nallocatable == 0) |
| 1496 | transfer_first_span(area, pool, 1, DSA_FULLNESS_CLASSES - 1); |
| 1497 | } |
| 1498 | |
| 1499 | Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class))); |
| 1500 | LWLockRelease(DSA_SCLASS_LOCK(area, size_class)); |
| 1501 | |
| 1502 | return result; |
| 1503 | } |
| 1504 | |
| 1505 | /* |
| 1506 | * Ensure an active (i.e. fullness class 1) superblock, unless all existing |
| 1507 | * superblocks are completely full and no more can be allocated. |
| 1508 | * |
| 1509 | * Fullness classes K of 0..N are loosely intended to represent blocks whose |
| 1510 | * utilization percentage is at least K/N, but we only enforce this rigorously |
| 1511 | * for the highest-numbered fullness class, which always contains exactly |
| 1512 | * those blocks that are completely full. It's otherwise acceptable for a |
| 1513 | * block to be in a higher-numbered fullness class than the one to which it |
| 1514 | * logically belongs. In addition, the active block, which is always the |
| 1515 | * first block in fullness class 1, is permitted to have a higher allocation |
| 1516 | * percentage than would normally be allowable for that fullness class; we |
| 1517 | * don't move it until it's completely full, and then it goes to the |
| 1518 | * highest-numbered fullness class. |
| 1519 | * |
| 1520 | * It might seem odd that the active block is the head of fullness class 1 |
| 1521 | * rather than fullness class 0, but experience with other allocators has |
| 1522 | * shown that it's usually better to allocate from a block that's moderately |
| 1523 | * full rather than one that's nearly empty. Insofar as is reasonably |
| 1524 | * possible, we want to avoid performing new allocations in a block that would |
| 1525 | * otherwise become empty soon. |
| 1526 | */ |
| 1527 | static bool |
| 1528 | ensure_active_superblock(dsa_area *area, dsa_area_pool *pool, |
| 1529 | int size_class) |
| 1530 | { |
| 1531 | dsa_pointer span_pointer; |
| 1532 | dsa_pointer start_pointer; |
| 1533 | size_t obsize = dsa_size_classes[size_class]; |
| 1534 | size_t nmax; |
| 1535 | int fclass; |
| 1536 | size_t npages = 1; |
| 1537 | size_t first_page; |
| 1538 | size_t i; |
| 1539 | dsa_segment_map *segment_map; |
| 1540 | |
| 1541 | Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class))); |
| 1542 | |
| 1543 | /* |
| 1544 | * Compute the number of objects that will fit in a block of this size |
| 1545 | * class. Span-of-spans blocks are just a single page, and the first |
| 1546 | * object isn't available for use because it describes the block-of-spans |
| 1547 | * itself. |
| 1548 | */ |
| 1549 | if (size_class == DSA_SCLASS_BLOCK_OF_SPANS) |
| 1550 | nmax = FPM_PAGE_SIZE / obsize - 1; |
| 1551 | else |
| 1552 | nmax = DSA_SUPERBLOCK_SIZE / obsize; |
| 1553 | |
| 1554 | /* |
| 1555 | * If fullness class 1 is empty, try to find a span to put in it by |
| 1556 | * scanning higher-numbered fullness classes (excluding the last one, |
| 1557 | * whose blocks are certain to all be completely full). |
| 1558 | */ |
| 1559 | for (fclass = 2; fclass < DSA_FULLNESS_CLASSES - 1; ++fclass) |
| 1560 | { |
| 1561 | span_pointer = pool->spans[fclass]; |
| 1562 | |
| 1563 | while (DsaPointerIsValid(span_pointer)) |
| 1564 | { |
| 1565 | int tfclass; |
| 1566 | dsa_area_span *span; |
| 1567 | dsa_area_span *nextspan; |
| 1568 | dsa_area_span *prevspan; |
| 1569 | dsa_pointer next_span_pointer; |
| 1570 | |
| 1571 | span = (dsa_area_span *) |
| 1572 | dsa_get_address(area, span_pointer); |
| 1573 | next_span_pointer = span->nextspan; |
| 1574 | |
| 1575 | /* Figure out what fullness class should contain this span. */ |
| 1576 | tfclass = (nmax - span->nallocatable) |
| 1577 | * (DSA_FULLNESS_CLASSES - 1) / nmax; |
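|  | /* |
|  |  * For example, with DSA_FULLNESS_CLASSES = 4 and nmax = 1024, a |
|  |  * span with 512 objects allocated maps to class 512 * 3 / 1024 = 1. |
|  |  */ |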
| 1578 | |
| 1579 | /* Look up next span. */ |
| 1580 | if (DsaPointerIsValid(span->nextspan)) |
| 1581 | nextspan = (dsa_area_span *) |
| 1582 | dsa_get_address(area, span->nextspan); |
| 1583 | else |
| 1584 | nextspan = NULL; |
| 1585 | |
| 1586 | /* |
| 1587 | * If utilization has dropped enough that this now belongs in some |
| 1588 | * other fullness class, move it there. |
| 1589 | */ |
| 1590 | if (tfclass < fclass) |
| 1591 | { |
| 1592 | /* Remove from the current fullness class list. */ |
| 1593 | if (pool->spans[fclass] == span_pointer) |
| 1594 | { |
| 1595 | /* It was the head; remove it. */ |
| 1596 | Assert(!DsaPointerIsValid(span->prevspan)); |
| 1597 | pool->spans[fclass] = span->nextspan; |
| 1598 | if (nextspan != NULL) |
| 1599 | nextspan->prevspan = InvalidDsaPointer; |
| 1600 | } |
| 1601 | else |
| 1602 | { |
| 1603 | /* It was not the head. */ |
| 1604 | Assert(DsaPointerIsValid(span->prevspan)); |
| 1605 | prevspan = (dsa_area_span *) |
| 1606 | dsa_get_address(area, span->prevspan); |
| 1607 | prevspan->nextspan = span->nextspan; |
| 1608 | } |
| 1609 | if (nextspan != NULL) |
| 1610 | nextspan->prevspan = span->prevspan; |
| 1611 | |
| 1612 | /* Push onto the head of the new fullness class list. */ |
| 1613 | span->nextspan = pool->spans[tfclass]; |
| 1614 | pool->spans[tfclass] = span_pointer; |
| 1615 | span->prevspan = InvalidDsaPointer; |
| 1616 | if (DsaPointerIsValid(span->nextspan)) |
| 1617 | { |
| 1618 | nextspan = (dsa_area_span *) |
| 1619 | dsa_get_address(area, span->nextspan); |
| 1620 | nextspan->prevspan = span_pointer; |
| 1621 | } |
| 1622 | span->fclass = tfclass; |
| 1623 | } |
| 1624 | |
| 1625 | /* Advance to next span on list. */ |
| 1626 | span_pointer = next_span_pointer; |
| 1627 | } |
| 1628 | |
| 1629 | /* Stop now if we found a suitable block. */ |
| 1630 | if (DsaPointerIsValid(pool->spans[1])) |
| 1631 | return true; |
| 1632 | } |
| 1633 | |
| 1634 | /* |
| 1635 | * If there are no blocks that properly belong in fullness class 1, pick |
| 1636 | * one from some other fullness class and move it there anyway, so that we |
| 1637 | * have an allocation target. Our last choice is to transfer a block |
| 1638 | * that's almost empty (and might become completely empty soon if left |
| 1639 | * alone), but even that is better than failing, which is what we must do |
| 1640 | * if there are no blocks at all with freespace. |
| 1641 | */ |
| 1642 | Assert(!DsaPointerIsValid(pool->spans[1])); |
| 1643 | for (fclass = 2; fclass < DSA_FULLNESS_CLASSES - 1; ++fclass) |
| 1644 | if (transfer_first_span(area, pool, fclass, 1)) |
| 1645 | return true; |
| 1646 | if (!DsaPointerIsValid(pool->spans[1]) && |
| 1647 | transfer_first_span(area, pool, 0, 1)) |
| 1648 | return true; |
| 1649 | |
| 1650 | /* |
| 1651 | * We failed to find an existing span with free objects, so we need to |
| 1652 | * allocate a new superblock and construct a new span to manage it. |
| 1653 | * |
| 1654 | * First, get a dsa_area_span object to describe the new superblock |
| 1655 | * ... unless this allocation is for a dsa_area_span object, in which case |
| 1656 | * that's surely not going to work. We handle that case by storing the |
| 1657 | * span describing a block-of-spans inline. |
| 1658 | */ |
| 1659 | if (size_class != DSA_SCLASS_BLOCK_OF_SPANS) |
| 1660 | { |
| 1661 | span_pointer = alloc_object(area, DSA_SCLASS_BLOCK_OF_SPANS); |
| 1662 | if (!DsaPointerIsValid(span_pointer)) |
| 1663 | return false; |
| 1664 | npages = DSA_PAGES_PER_SUPERBLOCK; |
| 1665 | } |
| 1666 | |
| 1667 | /* Find or create a segment and allocate the superblock. */ |
| 1668 | LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE); |
| 1669 | segment_map = get_best_segment(area, npages); |
| 1670 | if (segment_map == NULL) |
| 1671 | { |
| 1672 | segment_map = make_new_segment(area, npages); |
| 1673 | if (segment_map == NULL) |
| 1674 | { |
| 1675 | LWLockRelease(DSA_AREA_LOCK(area)); |
| 1676 | return false; |
| 1677 | } |
| 1678 | } |
| 1679 | |
| 1680 | /* |
| 1681 | * This shouldn't happen: get_best_segment() or make_new_segment() |
| 1682 | * promised that we can successfully allocate npages. |
| 1683 | */ |
| 1684 | if (!FreePageManagerGet(segment_map->fpm, npages, &first_page)) |
| 1685 | elog(FATAL, |
| 1686 | "dsa_allocate could not find %zu free pages for superblock" , |
| 1687 | npages); |
| 1688 | LWLockRelease(DSA_AREA_LOCK(area)); |
| 1689 | |
| 1690 | /* Compute the start of the superblock. */ |
| 1691 | start_pointer = |
| 1692 | DSA_MAKE_POINTER(get_segment_index(area, segment_map), |
| 1693 | first_page * FPM_PAGE_SIZE); |
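|  |  |
|  | /* |
|  |  * A dsa_pointer encodes the segment index in its high bits and the byte |
|  |  * offset within that segment in its low bits; DSA_EXTRACT_SEGMENT_NUMBER |
|  |  * and DSA_EXTRACT_OFFSET recover the two parts. |
|  |  */ |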
| 1694 | |
| 1695 | /* |
| 1696 | * If this is a block-of-spans, carve the descriptor right out of the |
| 1697 | * allocated space. |
| 1698 | */ |
| 1699 | if (size_class == DSA_SCLASS_BLOCK_OF_SPANS) |
| 1700 | { |
| 1701 | /* |
| 1702 | * We have a pointer into the segment. We need to build a dsa_pointer |
| 1703 | * from the segment index and offset into the segment. |
| 1704 | */ |
| 1705 | span_pointer = start_pointer; |
| 1706 | } |
| 1707 | |
| 1708 | /* Initialize span and pagemap. */ |
| 1709 | init_span(area, span_pointer, pool, start_pointer, npages, size_class); |
| 1710 | for (i = 0; i < npages; ++i) |
| 1711 | segment_map->pagemap[first_page + i] = span_pointer; |
| 1712 | |
| 1713 | return true; |
| 1714 | } |
| 1715 | |
| 1716 | /* |
| 1717 | * Return the segment map corresponding to a given segment index, mapping the |
| 1718 | * segment in if necessary. For internal segment book-keeping, this is called |
| 1719 | * with the area lock held. It is also called by dsa_free and dsa_get_address |
| 1720 | * without any locking, relying on the fact that they have a known live |
| 1721 | * segment index and always call check_for_freed_segments to ensure that any |
| 1722 | * freed segment occupying the same slot is detached first. |
| 1723 | */ |
| 1724 | static dsa_segment_map * |
| 1725 | get_segment_by_index(dsa_area *area, dsa_segment_index index) |
| 1726 | { |
| 1727 | if (unlikely(area->segment_maps[index].mapped_address == NULL)) |
| 1728 | { |
| 1729 | dsm_handle handle; |
| 1730 | dsm_segment *segment; |
| 1731 | dsa_segment_map *segment_map; |
| 1732 | |
| 1733 | /* |
| 1734 | * If we are reached by dsa_free or dsa_get_address, there must be at |
| 1735 | * least one object allocated in the referenced segment. Otherwise, |
| 1736 | * their caller has a double-free or access-after-free bug, which we |
| 1737 | * have no hope of detecting. So we know it's safe to access this |
| 1738 | * array slot without holding a lock; it won't change underneath us. |
| 1739 | * Furthermore, we know that we can see the latest contents of the |
| 1740 | * slot, as explained in check_for_freed_segments, which those |
| 1741 | * functions call before arriving here. |
| 1742 | */ |
| 1743 | handle = area->control->segment_handles[index]; |
| 1744 | |
| 1745 | /* It's an error to try to access an unused slot. */ |
| 1746 | if (handle == DSM_HANDLE_INVALID) |
| 1747 | elog(ERROR, |
| 1748 | "dsa_area could not attach to a segment that has been freed" ); |
| 1749 | |
| 1750 | segment = dsm_attach(handle); |
| 1751 | if (segment == NULL) |
| 1752 | elog(ERROR, "dsa_area could not attach to segment" ); |
| 1753 | if (area->mapping_pinned) |
| 1754 | dsm_pin_mapping(segment); |
| 1755 | segment_map = &area->segment_maps[index]; |
| 1756 | segment_map->segment = segment; |
| 1757 | segment_map->mapped_address = dsm_segment_address(segment); |
| 1758 | segment_map->header = |
| 1759 | (dsa_segment_header *) segment_map->mapped_address; |
| 1760 | segment_map->fpm = (FreePageManager *) |
| 1761 | (segment_map->mapped_address + |
| 1762 | MAXALIGN(sizeof(dsa_segment_header))); |
| 1763 | segment_map->pagemap = (dsa_pointer *) |
| 1764 | (segment_map->mapped_address + |
| 1765 | MAXALIGN(sizeof(dsa_segment_header)) + |
| 1766 | MAXALIGN(sizeof(FreePageManager))); |
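|  |  |
|  | /* |
|  |  * Note that segments other than segment 0 use the same layout as the |
|  |  * control segment, except their metadata begins with a bare |
|  |  * dsa_segment_header instead of the full dsa_area_control. |
|  |  */ |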
| 1767 | |
| 1768 | /* Remember the highest index this backend has ever mapped. */ |
| 1769 | if (area->high_segment_index < index) |
| 1770 | area->high_segment_index = index; |
| 1771 | |
| 1772 | Assert(segment_map->header->magic == |
| 1773 | (DSA_SEGMENT_HEADER_MAGIC ^ area->control->handle ^ index)); |
| 1774 | } |
| 1775 | |
| 1776 | /* |
| 1777 | * Callers of dsa_get_address() and dsa_free() don't hold the area lock, |
| 1778 | * but it's a bug in the calling code and undefined behavior if the |
| 1779 | * address is not live (ie if the segment might possibly have been freed, |
| 1780 | * they're trying to use a dangling pointer). |
| 1781 | * |
| 1782 | * For dsa.c code that holds the area lock to manipulate segment_bins |
| 1783 | * lists, it would be a bug if we ever reach a freed segment here. After |
| 1784 | * it's marked as freed, the only thing any backend should do with it is |
| 1785 | * unmap it, and it should always have done that in |
| 1786 | * check_for_freed_segments_locked() before arriving here to resolve an |
| 1787 | * index to a segment_map. |
| 1788 | * |
| 1789 | * Either way we can assert that we aren't returning a freed segment. |
| 1790 | */ |
| 1791 | Assert(!area->segment_maps[index].header->freed); |
| 1792 | |
| 1793 | return &area->segment_maps[index]; |
| 1794 | } |
| 1795 | |
| 1796 | /* |
| 1797 | * Return a superblock to the free page manager. If the underlying segment |
| 1798 | * has become entirely free, then return it to the operating system. |
| 1799 | * |
| 1800 | * The appropriate pool lock must be held. |
| 1801 | */ |
| 1802 | static void |
| 1803 | destroy_superblock(dsa_area *area, dsa_pointer span_pointer) |
| 1804 | { |
| 1805 | dsa_area_span *span = dsa_get_address(area, span_pointer); |
| 1806 | int size_class = span->size_class; |
| 1807 | dsa_segment_map *segment_map; |
| 1808 |  |
| 1810 | /* Remove it from its fullness class list. */ |
| 1811 | unlink_span(area, span); |
| 1812 | |
| 1813 | /* |
| 1814 | * Note: Here we acquire the area lock while we already hold a per-pool |
| 1815 | * lock. We never hold the area lock and then take a pool lock, or we |
| 1816 | * could deadlock. |
| 1817 | */ |
| 1818 | LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE); |
| 1819 | check_for_freed_segments_locked(area); |
| 1820 | segment_map = |
| 1821 | get_segment_by_index(area, DSA_EXTRACT_SEGMENT_NUMBER(span->start)); |
| 1822 | FreePageManagerPut(segment_map->fpm, |
| 1823 | DSA_EXTRACT_OFFSET(span->start) / FPM_PAGE_SIZE, |
| 1824 | span->npages); |
| 1825 | /* Check if the segment is now entirely free. */ |
| 1826 | if (fpm_largest(segment_map->fpm) == segment_map->header->usable_pages) |
| 1827 | { |
| 1828 | dsa_segment_index index = get_segment_index(area, segment_map); |
| 1829 | |
| 1830 | /* If it's not the segment with extra control data, free it. */ |
| 1831 | if (index != 0) |
| 1832 | { |
| 1833 | /* |
| 1834 | * Give it back to the OS, and allow other backends to detect that |
| 1835 | * they need to detach. |
| 1836 | */ |
| 1837 | unlink_segment(area, segment_map); |
| 1838 | segment_map->header->freed = true; |
| 1839 | Assert(area->control->total_segment_size >= |
| 1840 | segment_map->header->size); |
| 1841 | area->control->total_segment_size -= |
| 1842 | segment_map->header->size; |
| 1843 | dsm_unpin_segment(dsm_segment_handle(segment_map->segment)); |
| 1844 | dsm_detach(segment_map->segment); |
| 1845 | area->control->segment_handles[index] = DSM_HANDLE_INVALID; |
| 1846 | ++area->control->freed_segment_counter; |
| 1847 | segment_map->segment = NULL; |
| 1848 | segment_map->header = NULL; |
| 1849 | segment_map->mapped_address = NULL; |
| 1850 | } |
| 1851 | } |
| 1852 | LWLockRelease(DSA_AREA_LOCK(area)); |
| 1853 | |
| 1854 | /* |
| 1855 | * Span-of-spans blocks store the span which describes them within the |
| 1856 | * block itself, so freeing the storage implicitly frees the descriptor |
| 1857 | * also. If this is a block of any other type, we need to separately free |
| 1858 | * the span object also. This recursive call to dsa_free will acquire the |
| 1859 | * span pool's lock. We can't deadlock because the acquisition order is |
| 1860 | * always some other pool and then the span pool. |
| 1861 | */ |
| 1862 | if (size_class != DSA_SCLASS_BLOCK_OF_SPANS) |
| 1863 | dsa_free(area, span_pointer); |
| 1864 | } |
| 1865 | |
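|  | /* |
|  |  * Remove a span from the doubly-linked list of its fullness class, |
|  |  * updating the pool's list head if the span was at the front. |
|  |  */ |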
| 1866 | static void |
| 1867 | unlink_span(dsa_area *area, dsa_area_span *span) |
| 1868 | { |
| 1869 | if (DsaPointerIsValid(span->nextspan)) |
| 1870 | { |
| 1871 | dsa_area_span *next = dsa_get_address(area, span->nextspan); |
| 1872 | |
| 1873 | next->prevspan = span->prevspan; |
| 1874 | } |
| 1875 | if (DsaPointerIsValid(span->prevspan)) |
| 1876 | { |
| 1877 | dsa_area_span *prev = dsa_get_address(area, span->prevspan); |
| 1878 | |
| 1879 | prev->nextspan = span->nextspan; |
| 1880 | } |
| 1881 | else |
| 1882 | { |
| 1883 | dsa_area_pool *pool = dsa_get_address(area, span->pool); |
| 1884 | |
| 1885 | pool->spans[span->fclass] = span->nextspan; |
| 1886 | } |
| 1887 | } |
| 1888 | |
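|  | /* |
|  |  * Push a span onto the head of the given fullness class list, fixing up |
|  |  * the old head's back-pointer if the list was non-empty. |
|  |  */ |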
| 1889 | static void |
| 1890 | add_span_to_fullness_class(dsa_area *area, dsa_area_span *span, |
| 1891 | dsa_pointer span_pointer, |
| 1892 | int fclass) |
| 1893 | { |
| 1894 | dsa_area_pool *pool = dsa_get_address(area, span->pool); |
| 1895 | |
| 1896 | if (DsaPointerIsValid(pool->spans[fclass])) |
| 1897 | { |
| 1898 | dsa_area_span *head = dsa_get_address(area, |
| 1899 | pool->spans[fclass]); |
| 1900 | |
| 1901 | head->prevspan = span_pointer; |
| 1902 | } |
| 1903 | span->prevspan = InvalidDsaPointer; |
| 1904 | span->nextspan = pool->spans[fclass]; |
| 1905 | pool->spans[fclass] = span_pointer; |
| 1906 | span->fclass = fclass; |
| 1907 | } |
| 1908 | |
| 1909 | /* |
| 1910 | * Detach from an area that was either created or attached to by this process. |
| 1911 | */ |
| 1912 | void |
| 1913 | dsa_detach(dsa_area *area) |
| 1914 | { |
| 1915 | int i; |
| 1916 | |
| 1917 | /* Detach from all segments. */ |
| 1918 | for (i = 0; i <= area->high_segment_index; ++i) |
| 1919 | if (area->segment_maps[i].segment != NULL) |
| 1920 | dsm_detach(area->segment_maps[i].segment); |
| 1921 | |
| 1922 | /* |
| 1923 | * Note that 'detaching' (= detaching from DSM segments) doesn't include |
| 1924 | * 'releasing' (= adjusting the reference count). It would be nice to |
| 1925 | * combine these operations, but client code might never get around to |
| 1926 | * calling dsa_detach because of an error path, and a detach hook on any |
| 1927 | * particular segment is too late to detach other segments in the area |
| 1928 | * without risking a 'leak' warning in the non-error path. |
| 1929 | */ |
| 1930 | |
| 1931 | /* Free the backend-local area object. */ |
| 1932 | pfree(area); |
| 1933 | } |
| 1934 | |
| 1935 | /* |
| 1936 | * Unlink a segment from the bin that contains it. |
| 1937 | */ |
| 1938 | static void |
| 1939 | unlink_segment(dsa_area *area, dsa_segment_map *segment_map) |
| 1940 | { |
| 1941 | if (segment_map->header->prev != DSA_SEGMENT_INDEX_NONE) |
| 1942 | { |
| 1943 | dsa_segment_map *prev; |
| 1944 | |
| 1945 | prev = get_segment_by_index(area, segment_map->header->prev); |
| 1946 | prev->header->next = segment_map->header->next; |
| 1947 | } |
| 1948 | else |
| 1949 | { |
| 1950 | Assert(area->control->segment_bins[segment_map->header->bin] == |
| 1951 | get_segment_index(area, segment_map)); |
| 1952 | area->control->segment_bins[segment_map->header->bin] = |
| 1953 | segment_map->header->next; |
| 1954 | } |
| 1955 | if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE) |
| 1956 | { |
| 1957 | dsa_segment_map *next; |
| 1958 | |
| 1959 | next = get_segment_by_index(area, segment_map->header->next); |
| 1960 | next->header->prev = segment_map->header->prev; |
| 1961 | } |
| 1962 | } |
| 1963 | |
| 1964 | /* |
| 1965 | * Find a segment that could satisfy a request for 'npages' of contiguous |
| 1966 | * memory, or return NULL if none can be found. This may involve attaching to |
| 1967 | * segments that weren't previously attached so that we can query their free |
| 1968 | * page maps. |
| 1969 | */ |
| 1970 | static dsa_segment_map * |
| 1971 | get_best_segment(dsa_area *area, size_t npages) |
| 1972 | { |
| 1973 | size_t bin; |
| 1974 | |
| 1975 | Assert(LWLockHeldByMe(DSA_AREA_LOCK(area))); |
| 1976 | check_for_freed_segments_locked(area); |
| 1977 | |
| 1978 | /* |
| 1979 | * Start searching from the first bin that *might* have enough contiguous |
| 1980 | * pages. |
| 1981 | */ |
| 1982 | for (bin = contiguous_pages_to_segment_bin(npages); |
| 1983 | bin < DSA_NUM_SEGMENT_BINS; |
| 1984 | ++bin) |
| 1985 | { |
| 1986 | /* |
| 1987 | * The minimum contiguous size that any segment in this bin should |
| 1988 | * have. We'll re-bin if we see segments with fewer. |
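|  |  * For example, bin 5 holds segments whose largest contiguous run is |
|  |  * at least 2^4 = 16 pages. |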
| 1989 | */ |
| 1990 | size_t threshold = (size_t) 1 << (bin - 1); |
| 1991 | dsa_segment_index segment_index; |
| 1992 | |
| 1993 | /* Search this bin for a segment with enough contiguous space. */ |
| 1994 | segment_index = area->control->segment_bins[bin]; |
| 1995 | while (segment_index != DSA_SEGMENT_INDEX_NONE) |
| 1996 | { |
| 1997 | dsa_segment_map *segment_map; |
| 1998 | dsa_segment_index next_segment_index; |
| 1999 | size_t contiguous_pages; |
| 2000 | |
| 2001 | segment_map = get_segment_by_index(area, segment_index); |
| 2002 | next_segment_index = segment_map->header->next; |
| 2003 | contiguous_pages = fpm_largest(segment_map->fpm); |
| 2004 | |
| 2005 | /* Not enough for the request, still enough for this bin. */ |
| 2006 | if (contiguous_pages >= threshold && contiguous_pages < npages) |
| 2007 | { |
| 2008 | segment_index = next_segment_index; |
| 2009 | continue; |
| 2010 | } |
| 2011 | |
| 2012 | /* Re-bin it if it's no longer in the appropriate bin. */ |
| 2013 | if (contiguous_pages < threshold) |
| 2014 | { |
| 2015 | size_t new_bin; |
| 2016 | |
| 2017 | new_bin = contiguous_pages_to_segment_bin(contiguous_pages); |
| 2018 | |
| 2019 | /* Remove it from its current bin. */ |
| 2020 | unlink_segment(area, segment_map); |
| 2021 | |
| 2022 | /* Push it onto the front of its new bin. */ |
| 2023 | segment_map->header->prev = DSA_SEGMENT_INDEX_NONE; |
| 2024 | segment_map->header->next = |
| 2025 | area->control->segment_bins[new_bin]; |
| 2026 | segment_map->header->bin = new_bin; |
| 2027 | area->control->segment_bins[new_bin] = segment_index; |
| 2028 | if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE) |
| 2029 | { |
| 2030 | dsa_segment_map *next; |
| 2031 | |
| 2032 | next = get_segment_by_index(area, |
| 2033 | segment_map->header->next); |
| 2034 | Assert(next->header->bin == new_bin); |
| 2035 | next->header->prev = segment_index; |
| 2036 | } |
| 2037 | |
| 2038 | /* |
| 2039 | * But fall through to see if it's enough to satisfy this |
| 2040 | * request anyway.... |
| 2041 | */ |
| 2042 | } |
| 2043 | |
| 2044 | /* Check if we are done. */ |
| 2045 | if (contiguous_pages >= npages) |
| 2046 | return segment_map; |
| 2047 | |
| 2048 | /* Continue searching the same bin. */ |
| 2049 | segment_index = next_segment_index; |
| 2050 | } |
| 2051 | } |
| 2052 | |
| 2053 | /* Not found. */ |
| 2054 | return NULL; |
| 2055 | } |
| 2056 | |
| 2057 | /* |
| 2058 | * Create a new segment that can handle at least requested_pages. Returns |
| 2059 | * NULL if the requested total size limit or maximum allowed number of |
| 2060 | * segments would be exceeded. |
| 2061 | */ |
| 2062 | static dsa_segment_map * |
| 2063 | make_new_segment(dsa_area *area, size_t requested_pages) |
| 2064 | { |
| 2065 | dsa_segment_index new_index; |
| 2066 | size_t metadata_bytes; |
| 2067 | size_t total_size; |
| 2068 | size_t total_pages; |
| 2069 | size_t usable_pages; |
| 2070 | dsa_segment_map *segment_map; |
| 2071 | dsm_segment *segment; |
| 2072 | |
| 2073 | Assert(LWLockHeldByMe(DSA_AREA_LOCK(area))); |
| 2074 | |
| 2075 | /* Find a segment slot that is not in use (linearly for now). */ |
| 2076 | for (new_index = 1; new_index < DSA_MAX_SEGMENTS; ++new_index) |
| 2077 | { |
| 2078 | if (area->control->segment_handles[new_index] == DSM_HANDLE_INVALID) |
| 2079 | break; |
| 2080 | } |
| 2081 | if (new_index == DSA_MAX_SEGMENTS) |
| 2082 | return NULL; |
| 2083 | |
| 2084 | /* |
| 2085 | * If the total size limit is already exceeded, then we exit early and |
| 2086 | * avoid arithmetic wraparound in the unsigned expressions below. |
| 2087 | */ |
| 2088 | if (area->control->total_segment_size >= |
| 2089 | area->control->max_total_segment_size) |
| 2090 | return NULL; |
| 2091 | |
| 2092 | /* |
| 2093 | * The size should be at least as big as requested, and at least big |
| 2094 | * enough to follow a geometric series that approximately doubles the |
| 2095 | * total storage each time we create a new segment. We use geometric |
| 2096 | * growth because the underlying DSM system isn't designed for large |
| 2097 | * numbers of segments (otherwise we might even consider just using one |
| 2098 | * DSM segment for each large allocation and for each superblock, and then |
| 2099 | * we wouldn't need to use FreePageManager). |
| 2100 | * |
| 2101 | * We decide on a total segment size first, so that we produce tidy |
| 2102 | * power-of-two sized segments. This is a good property to have if we |
| 2103 | * move to huge pages in the future. Then we work back to the number of |
| 2104 | * pages we can fit. |
| 2105 | */ |
| 2106 | total_size = DSA_INITIAL_SEGMENT_SIZE * |
| 2107 | ((size_t) 1 << (new_index / DSA_NUM_SEGMENTS_AT_EACH_SIZE)); |
| 2108 | total_size = Min(total_size, DSA_MAX_SEGMENT_SIZE); |
| 2109 | total_size = Min(total_size, |
| 2110 | area->control->max_total_segment_size - |
| 2111 | area->control->total_segment_size); |
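|  |  |
|  | /* |
|  |  * For illustration, supposing DSA_INITIAL_SEGMENT_SIZE is 1MB and |
|  |  * DSA_NUM_SEGMENTS_AT_EACH_SIZE is 2: slot 1 gets a 1MB segment, |
|  |  * slots 2-3 get 2MB, slots 4-5 get 4MB, and so on, subject to the |
|  |  * caps applied above. |
|  |  */ |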
| 2112 | |
| 2113 | total_pages = total_size / FPM_PAGE_SIZE; |
| 2114 | metadata_bytes = |
| 2115 | MAXALIGN(sizeof(dsa_segment_header)) + |
| 2116 | MAXALIGN(sizeof(FreePageManager)) + |
| 2117 | sizeof(dsa_pointer) * total_pages; |
| 2118 | |
| 2119 | /* Add padding up to next page boundary. */ |
| 2120 | if (metadata_bytes % FPM_PAGE_SIZE != 0) |
| 2121 | metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE); |
| 2122 | if (total_size <= metadata_bytes) |
| 2123 | return NULL; |
| 2124 | usable_pages = (total_size - metadata_bytes) / FPM_PAGE_SIZE; |
| 2125 | Assert(metadata_bytes + usable_pages * FPM_PAGE_SIZE <= total_size); |
| 2126 | |
| 2127 | /* See if that is enough... */ |
| 2128 | if (requested_pages > usable_pages) |
| 2129 | { |
| 2130 | /* |
| 2131 | * We'll make an odd-sized segment, working forward from the requested |
| 2132 | * number of pages. |
| 2133 | */ |
| 2134 | usable_pages = requested_pages; |
| 2135 | metadata_bytes = |
| 2136 | MAXALIGN(sizeof(dsa_segment_header)) + |
| 2137 | MAXALIGN(sizeof(FreePageManager)) + |
| 2138 | usable_pages * sizeof(dsa_pointer); |
| 2139 | |
| 2140 | /* Add padding up to next page boundary. */ |
| 2141 | if (metadata_bytes % FPM_PAGE_SIZE != 0) |
| 2142 | metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE); |
| 2143 | total_size = metadata_bytes + usable_pages * FPM_PAGE_SIZE; |
| 2144 | |
| 2145 | /* Is that too large for dsa_pointer's addressing scheme? */ |
| 2146 | if (total_size > DSA_MAX_SEGMENT_SIZE) |
| 2147 | return NULL; |
| 2148 | |
| 2149 | /* Would that exceed the limit? */ |
| 2150 | if (total_size > area->control->max_total_segment_size - |
| 2151 | area->control->total_segment_size) |
| 2152 | return NULL; |
| 2153 | } |
| 2154 | |
| 2155 | /* Create the segment. */ |
| 2156 | segment = dsm_create(total_size, 0); |
| 2157 | if (segment == NULL) |
| 2158 | return NULL; |
| 2159 | dsm_pin_segment(segment); |
| 2160 | if (area->mapping_pinned) |
| 2161 | dsm_pin_mapping(segment); |
| 2162 | |
| 2163 | /* Store the handle in shared memory to be found by index. */ |
| 2164 | area->control->segment_handles[new_index] = |
| 2165 | dsm_segment_handle(segment); |
| 2166 | /* Track the highest segment index in the history of the area. */ |
| 2167 | if (area->control->high_segment_index < new_index) |
| 2168 | area->control->high_segment_index = new_index; |
| 2169 | /* Track the highest segment index this backend has ever mapped. */ |
| 2170 | if (area->high_segment_index < new_index) |
| 2171 | area->high_segment_index = new_index; |
| 2172 | /* Track total size of all segments. */ |
| 2173 | area->control->total_segment_size += total_size; |
| 2174 | Assert(area->control->total_segment_size <= |
| 2175 | area->control->max_total_segment_size); |
| 2176 | |
| 2177 | /* Build a segment map for this segment in this backend. */ |
| 2178 | segment_map = &area->segment_maps[new_index]; |
| 2179 | segment_map->segment = segment; |
| 2180 | segment_map->mapped_address = dsm_segment_address(segment); |
| 2181 | segment_map->header = (dsa_segment_header *) segment_map->mapped_address; |
| 2182 | segment_map->fpm = (FreePageManager *) |
| 2183 | (segment_map->mapped_address + |
| 2184 | MAXALIGN(sizeof(dsa_segment_header))); |
| 2185 | segment_map->pagemap = (dsa_pointer *) |
| 2186 | (segment_map->mapped_address + |
| 2187 | MAXALIGN(sizeof(dsa_segment_header)) + |
| 2188 | MAXALIGN(sizeof(FreePageManager))); |
| 2189 | |
| 2190 | /* Set up the free page map. */ |
| 2191 | FreePageManagerInitialize(segment_map->fpm, segment_map->mapped_address); |
| 2192 | FreePageManagerPut(segment_map->fpm, metadata_bytes / FPM_PAGE_SIZE, |
| 2193 | usable_pages); |
| 2194 | |
| 2195 | /* Set up the segment header and put it in the appropriate bin. */ |
| 2196 | segment_map->header->magic = |
| 2197 | DSA_SEGMENT_HEADER_MAGIC ^ area->control->handle ^ new_index; |
| 2198 | segment_map->header->usable_pages = usable_pages; |
| 2199 | segment_map->header->size = total_size; |
| 2200 | segment_map->header->bin = contiguous_pages_to_segment_bin(usable_pages); |
| 2201 | segment_map->header->prev = DSA_SEGMENT_INDEX_NONE; |
| 2202 | segment_map->header->next = |
| 2203 | area->control->segment_bins[segment_map->header->bin]; |
| 2204 | segment_map->header->freed = false; |
| 2205 | area->control->segment_bins[segment_map->header->bin] = new_index; |
| 2206 | if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE) |
| 2207 | { |
| 2208 | dsa_segment_map *next = |
| 2209 | get_segment_by_index(area, segment_map->header->next); |
| 2210 | |
| 2211 | Assert(next->header->bin == segment_map->header->bin); |
| 2212 | next->header->prev = new_index; |
| 2213 | } |
| 2214 | |
| 2215 | return segment_map; |
| 2216 | } |
| 2217 | |
| 2218 | /* |
| 2219 | * Check if any segments have been freed by destroy_superblock, so we can |
| 2220 | * detach from them in this backend. This function is called by |
| 2221 | * dsa_get_address and dsa_free to make sure that a dsa_pointer they have |
| 2222 | * received can be resolved to the correct segment. |
| 2223 | * |
| 2224 | * The danger we want to defend against is that there could be an old segment |
| 2225 | * mapped into a given slot in this backend, and the dsa_pointer they have |
| 2226 | * might refer to some new segment in the same slot. So those functions must |
| 2227 | * be sure to process all instructions to detach from a freed segment that had |
| 2228 | * been generated by the time this process received the dsa_pointer, before |
| 2229 | * they call get_segment_by_index. |
| 2230 | */ |
| 2231 | static void |
| 2232 | check_for_freed_segments(dsa_area *area) |
| 2233 | { |
| 2234 | size_t freed_segment_counter; |
| 2235 | |
| 2236 | /* |
| 2237 | * Any other process that has freed a segment has incremented |
| 2238 | * freed_segment_counter while holding an LWLock, and that must precede any |
| 2239 | * backend creating a new segment in the same slot while holding an |
| 2240 | * LWLock, and that must precede the creation of any dsa_pointer pointing |
| 2241 | * into the new segment which might reach us here, and the caller must |
| 2242 | * have sent the dsa_pointer to this process using appropriate memory |
| 2243 | * synchronization (some kind of locking or atomic primitive or system |
| 2244 | * call). So all we need to do on the reading side is ask for the load of |
| 2245 | * freed_segment_counter to follow the caller's load of the dsa_pointer it |
| 2246 | * has, and we can be sure to detect any segments that had been freed as |
| 2247 | * of the time that the dsa_pointer reached this process. |
| 2248 | */ |
| 2249 | pg_read_barrier(); |
| 2250 | freed_segment_counter = area->control->freed_segment_counter; |
| 2251 | if (unlikely(area->freed_segment_counter != freed_segment_counter)) |
| 2252 | { |
| 2253 | /* Check all currently mapped segments to find what's been freed. */ |
| 2254 | LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE); |
| 2255 | check_for_freed_segments_locked(area); |
| 2256 | LWLockRelease(DSA_AREA_LOCK(area)); |
| 2257 | } |
| 2258 | } |
| 2259 | |
| 2260 | /* |
| 2261 | * Workhorse for check_for_freed_segments(), and also used directly in paths |
| 2262 | * where the area lock is already held. This should be called after acquiring |
| 2263 | * the lock but before looking up any segment by index number, to make sure we |
| 2264 | * unmap any stale segments that might have previously had the same index as a |
| 2265 | * current segment. |
| 2266 | */ |
| 2267 | static void |
| 2268 | check_for_freed_segments_locked(dsa_area *area) |
| 2269 | { |
| 2270 | size_t freed_segment_counter; |
| 2271 | int i; |
| 2272 | |
| 2273 | Assert(LWLockHeldByMe(DSA_AREA_LOCK(area))); |
| 2274 | freed_segment_counter = area->control->freed_segment_counter; |
| 2275 | if (unlikely(area->freed_segment_counter != freed_segment_counter)) |
| 2276 | { |
| 2277 | for (i = 0; i <= area->high_segment_index; ++i) |
| 2278 | { |
| 2279 | if (area->segment_maps[i].header != NULL && |
| 2280 | area->segment_maps[i].header->freed) |
| 2281 | { |
| 2282 | dsm_detach(area->segment_maps[i].segment); |
| 2283 | area->segment_maps[i].segment = NULL; |
| 2284 | area->segment_maps[i].header = NULL; |
| 2285 | area->segment_maps[i].mapped_address = NULL; |
| 2286 | } |
| 2287 | } |
| 2288 | area->freed_segment_counter = freed_segment_counter; |
| 2289 | } |
| 2290 | } |
| 2291 | |