| 1 | /*------------------------------------------------------------------------- |
| 2 | * |
| 3 | * shm_toc.c |
| 4 | * shared memory segment table of contents |
| 5 | * |
| 6 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
| 7 | * Portions Copyright (c) 1994, Regents of the University of California |
| 8 | * |
| 9 | * src/backend/storage/ipc/shm_toc.c |
| 10 | * |
| 11 | *------------------------------------------------------------------------- |
| 12 | */ |
| 13 | |
| 14 | #include "postgres.h" |
| 15 | |
| 16 | #include "port/atomics.h" |
| 17 | #include "storage/shm_toc.h" |
| 18 | #include "storage/spin.h" |
| 19 | |
/*
 * One table-of-contents entry: maps a caller-chosen 64-bit key to the
 * location of a chunk within the segment.  The location is stored as a
 * byte offset from the start of the TOC, not an absolute pointer, because
 * the segment may be mapped at different addresses in different backends.
 */
typedef struct shm_toc_entry
{
	uint64		key;			/* Arbitrary identifier */
	Size		offset;			/* Offset, in bytes, from TOC start */
} shm_toc_entry;
| 25 | |
/*
 * Header placed at the start of the managed region.  The entry array grows
 * forward from here, while chunk allocations grow backward from the end of
 * the region (see shm_toc_allocate).  All mutable fields are protected by
 * toc_mutex, except that shm_toc_lookup reads toc_nentry and the entry
 * array unlocked, relying on the write barrier in shm_toc_insert.
 */
struct shm_toc
{
	uint64		toc_magic;		/* Magic number identifying this TOC */
	slock_t		toc_mutex;		/* Spinlock for mutual exclusion */
	Size		toc_total_bytes;	/* Bytes managed by this TOC */
	Size		toc_allocated_bytes;	/* Bytes allocated of those managed */
	uint32		toc_nentry;		/* Number of entries in TOC */
	shm_toc_entry toc_entry[FLEXIBLE_ARRAY_MEMBER];
};
| 35 | |
| 36 | /* |
| 37 | * Initialize a region of shared memory with a table of contents. |
| 38 | */ |
| 39 | shm_toc * |
| 40 | shm_toc_create(uint64 magic, void *address, Size nbytes) |
| 41 | { |
| 42 | shm_toc *toc = (shm_toc *) address; |
| 43 | |
| 44 | Assert(nbytes > offsetof(shm_toc, toc_entry)); |
| 45 | toc->toc_magic = magic; |
| 46 | SpinLockInit(&toc->toc_mutex); |
| 47 | |
| 48 | /* |
| 49 | * The alignment code in shm_toc_allocate() assumes that the starting |
| 50 | * value is buffer-aligned. |
| 51 | */ |
| 52 | toc->toc_total_bytes = BUFFERALIGN_DOWN(nbytes); |
| 53 | toc->toc_allocated_bytes = 0; |
| 54 | toc->toc_nentry = 0; |
| 55 | |
| 56 | return toc; |
| 57 | } |
| 58 | |
| 59 | /* |
| 60 | * Attach to an existing table of contents. If the magic number found at |
| 61 | * the target address doesn't match our expectations, return NULL. |
| 62 | */ |
| 63 | shm_toc * |
| 64 | shm_toc_attach(uint64 magic, void *address) |
| 65 | { |
| 66 | shm_toc *toc = (shm_toc *) address; |
| 67 | |
| 68 | if (toc->toc_magic != magic) |
| 69 | return NULL; |
| 70 | |
| 71 | Assert(toc->toc_total_bytes >= toc->toc_allocated_bytes); |
| 72 | Assert(toc->toc_total_bytes > offsetof(shm_toc, toc_entry)); |
| 73 | |
| 74 | return toc; |
| 75 | } |
| 76 | |
| 77 | /* |
| 78 | * Allocate shared memory from a segment managed by a table of contents. |
| 79 | * |
| 80 | * This is not a full-blown allocator; there's no way to free memory. It's |
| 81 | * just a way of dividing a single physical shared memory segment into logical |
| 82 | * chunks that may be used for different purposes. |
| 83 | * |
| 84 | * We allocate backwards from the end of the segment, so that the TOC entries |
| 85 | * can grow forward from the start of the segment. |
| 86 | */ |
/*
 * Allocate shared memory from a segment managed by a table of contents.
 *
 * This is not a full-blown allocator; there's no way to free memory.  It's
 * just a way of dividing a single physical shared memory segment into logical
 * chunks that may be used for different purposes.
 *
 * We allocate backwards from the end of the segment, so that the TOC entries
 * can grow forward from the start of the segment.
 *
 * Throws ERROR (out of shared memory) if the request cannot be satisfied.
 */
void *
shm_toc_allocate(shm_toc *toc, Size nbytes)
{
	volatile shm_toc *vtoc = toc;	/* volatile: fields mutate under the lock */
	Size		total_bytes;
	Size		allocated_bytes;
	Size		nentry;
	Size		toc_bytes;

	/*
	 * Make sure request is well-aligned. XXX: MAXALIGN is not enough,
	 * because atomic ops might need a wider alignment. We don't have a
	 * proper definition for the minimum to make atomic ops safe, but
	 * BUFFERALIGN ought to be enough.
	 */
	nbytes = BUFFERALIGN(nbytes);

	SpinLockAcquire(&toc->toc_mutex);

	/* Snapshot the bookkeeping fields while holding the spinlock. */
	total_bytes = vtoc->toc_total_bytes;
	allocated_bytes = vtoc->toc_allocated_bytes;
	nentry = vtoc->toc_nentry;
	/* Space consumed so far: header, entry array, and prior allocations. */
	toc_bytes = offsetof(shm_toc, toc_entry) + nentry * sizeof(shm_toc_entry)
		+ allocated_bytes;

	/* Check for memory exhaustion and overflow. */
	if (toc_bytes + nbytes > total_bytes || toc_bytes + nbytes < toc_bytes)
	{
		/* Must release the spinlock before ereport longjmps out. */
		SpinLockRelease(&toc->toc_mutex);
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of shared memory" )));
	}
	vtoc->toc_allocated_bytes += nbytes;

	SpinLockRelease(&toc->toc_mutex);

	/*
	 * The new chunk lies immediately below all previously-allocated chunks
	 * at the end of the segment; compute its address from the values
	 * snapshotted under the lock.
	 */
	return ((char *) toc) + (total_bytes - allocated_bytes - nbytes);
}
| 126 | |
| 127 | /* |
| 128 | * Return the number of bytes that can still be allocated. |
| 129 | */ |
| 130 | Size |
| 131 | shm_toc_freespace(shm_toc *toc) |
| 132 | { |
| 133 | volatile shm_toc *vtoc = toc; |
| 134 | Size total_bytes; |
| 135 | Size allocated_bytes; |
| 136 | Size nentry; |
| 137 | Size toc_bytes; |
| 138 | |
| 139 | SpinLockAcquire(&toc->toc_mutex); |
| 140 | total_bytes = vtoc->toc_total_bytes; |
| 141 | allocated_bytes = vtoc->toc_allocated_bytes; |
| 142 | nentry = vtoc->toc_nentry; |
| 143 | SpinLockRelease(&toc->toc_mutex); |
| 144 | |
| 145 | toc_bytes = offsetof(shm_toc, toc_entry) + nentry * sizeof(shm_toc_entry); |
| 146 | Assert(allocated_bytes + BUFFERALIGN(toc_bytes) <= total_bytes); |
| 147 | return total_bytes - (allocated_bytes + BUFFERALIGN(toc_bytes)); |
| 148 | } |
| 149 | |
| 150 | /* |
| 151 | * Insert a TOC entry. |
| 152 | * |
| 153 | * The idea here is that the process setting up the shared memory segment will |
| 154 | * register the addresses of data structures within the segment using this |
| 155 | * function. Each data structure will be identified using a 64-bit key, which |
| 156 | * is assumed to be a well-known or discoverable integer. Other processes |
| 157 | * accessing the shared memory segment can pass the same key to |
| 158 | * shm_toc_lookup() to discover the addresses of those data structures. |
| 159 | * |
| 160 | * Since the shared memory segment may be mapped at different addresses within |
| 161 | * different backends, we store relative rather than absolute pointers. |
| 162 | * |
| 163 | * This won't scale well to a large number of keys. Hopefully, that isn't |
| 164 | * necessary; if it proves to be, we might need to provide a more sophisticated |
| 165 | * data structure here. But the real idea here is just to give someone mapping |
| 166 | * a dynamic shared memory the ability to find the bare minimum number of |
| 167 | * pointers that they need to bootstrap. If you're storing a lot of stuff in |
| 168 | * the TOC, you're doing it wrong. |
| 169 | */ |
/*
 * Insert a TOC entry.
 *
 * The idea here is that the process setting up the shared memory segment will
 * register the addresses of data structures within the segment using this
 * function.  Each data structure will be identified using a 64-bit key, which
 * is assumed to be a well-known or discoverable integer.  Other processes
 * accessing the shared memory segment can pass the same key to
 * shm_toc_lookup() to discover the addresses of those data structures.
 *
 * Since the shared memory segment may be mapped at different addresses within
 * different backends, we store relative rather than absolute pointers.
 *
 * This won't scale well to a large number of keys.  Hopefully, that isn't
 * necessary; if it proves to be, we might need to provide a more sophisticated
 * data structure here.  But the real idea here is just to give someone mapping
 * a dynamic shared memory the ability to find the bare minimum number of
 * pointers that they need to bootstrap.  If you're storing a lot of stuff in
 * the TOC, you're doing it wrong.
 *
 * Throws ERROR if no room remains for another entry.
 */
void
shm_toc_insert(shm_toc *toc, uint64 key, void *address)
{
	volatile shm_toc *vtoc = toc;	/* fields mutate under the lock */
	Size		total_bytes;
	Size		allocated_bytes;
	Size		nentry;
	Size		toc_bytes;
	Size		offset;

	/* Relativize pointer. */
	Assert(address > (void *) toc);
	offset = ((char *) address) - (char *) toc;

	SpinLockAcquire(&toc->toc_mutex);

	/* Snapshot the bookkeeping fields while holding the spinlock. */
	total_bytes = vtoc->toc_total_bytes;
	allocated_bytes = vtoc->toc_allocated_bytes;
	nentry = vtoc->toc_nentry;
	toc_bytes = offsetof(shm_toc, toc_entry) + nentry * sizeof(shm_toc_entry)
		+ allocated_bytes;

	/* Check for memory exhaustion and overflow. */
	if (toc_bytes + sizeof(shm_toc_entry) > total_bytes ||
		toc_bytes + sizeof(shm_toc_entry) < toc_bytes ||
		nentry >= PG_UINT32_MAX)
	{
		/* Must release the spinlock before ereport longjmps out. */
		SpinLockRelease(&toc->toc_mutex);
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of shared memory" )));
	}

	Assert(offset < total_bytes);
	vtoc->toc_entry[nentry].key = key;
	vtoc->toc_entry[nentry].offset = offset;

	/*
	 * By placing a write barrier after filling in the entry and before
	 * updating the number of entries, we make it safe to read the TOC
	 * unlocked.
	 */
	pg_write_barrier();

	vtoc->toc_nentry++;

	SpinLockRelease(&toc->toc_mutex);
}
| 218 | |
| 219 | /* |
| 220 | * Look up a TOC entry. |
| 221 | * |
| 222 | * If the key is not found, returns NULL if noError is true, otherwise |
| 223 | * throws elog(ERROR). |
| 224 | * |
| 225 | * Unlike the other functions in this file, this operation acquires no lock; |
| 226 | * it uses only barriers. It probably wouldn't hurt concurrency very much even |
| 227 | * if it did get a lock, but since it's reasonably likely that a group of |
| 228 | * worker processes could each read a series of entries from the same TOC |
| 229 | * right around the same time, there seems to be some value in avoiding it. |
| 230 | */ |
/*
 * Look up a TOC entry.
 *
 * If the key is not found, returns NULL if noError is true, otherwise
 * throws elog(ERROR).
 *
 * Unlike the other functions in this file, this operation acquires no lock;
 * it uses only barriers.  It probably wouldn't hurt concurrency very much even
 * if it did get a lock, but since it's reasonably likely that a group of
 * worker processes could each read a series of entries from the same TOC
 * right around the same time, there seems to be some value in avoiding it.
 */
void *
shm_toc_lookup(shm_toc *toc, uint64 key, bool noError)
{
	uint32		nentry;
	uint32		i;

	/*
	 * Read the number of entries before we examine any entry.  We assume that
	 * reading a uint32 is atomic.  The read barrier pairs with the write
	 * barrier in shm_toc_insert(), guaranteeing that any entry counted by
	 * nentry is fully written before we read it below.
	 */
	nentry = toc->toc_nentry;
	pg_read_barrier();

	/* Now search for a matching entry (linear scan; keys are few). */
	for (i = 0; i < nentry; ++i)
	{
		if (toc->toc_entry[i].key == key)
			return ((char *) toc) + toc->toc_entry[i].offset;	/* de-relativize */
	}

	/* No matching entry was found. */
	if (!noError)
		elog(ERROR, "could not find key " UINT64_FORMAT " in shm TOC at %p" ,
			 key, toc);
	return NULL;
}
| 257 | |
| 258 | /* |
| 259 | * Estimate how much shared memory will be required to store a TOC and its |
| 260 | * dependent data structures. |
| 261 | */ |
| 262 | Size |
| 263 | shm_toc_estimate(shm_toc_estimator *e) |
| 264 | { |
| 265 | Size sz; |
| 266 | |
| 267 | sz = offsetof(shm_toc, toc_entry); |
| 268 | sz += add_size(sz, mul_size(e->number_of_keys, sizeof(shm_toc_entry))); |
| 269 | sz += add_size(sz, e->space_for_chunks); |
| 270 | |
| 271 | return BUFFERALIGN(sz); |
| 272 | } |
| 273 | |