1/*-------------------------------------------------------------------------
2 *
3 * shm_toc.c
4 * shared memory segment table of contents
5 *
6 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 * src/backend/storage/ipc/shm_toc.c
10 *
11 *-------------------------------------------------------------------------
12 */
13
14#include "postgres.h"
15
16#include "port/atomics.h"
17#include "storage/shm_toc.h"
18#include "storage/spin.h"
19
20typedef struct shm_toc_entry
21{
22 uint64 key; /* Arbitrary identifier */
23 Size offset; /* Offset, in bytes, from TOC start */
24} shm_toc_entry;
25
26struct shm_toc
27{
28 uint64 toc_magic; /* Magic number identifying this TOC */
29 slock_t toc_mutex; /* Spinlock for mutual exclusion */
30 Size toc_total_bytes; /* Bytes managed by this TOC */
31 Size toc_allocated_bytes; /* Bytes allocated of those managed */
32 uint32 toc_nentry; /* Number of entries in TOC */
33 shm_toc_entry toc_entry[FLEXIBLE_ARRAY_MEMBER];
34};
35
36/*
37 * Initialize a region of shared memory with a table of contents.
38 */
39shm_toc *
40shm_toc_create(uint64 magic, void *address, Size nbytes)
41{
42 shm_toc *toc = (shm_toc *) address;
43
44 Assert(nbytes > offsetof(shm_toc, toc_entry));
45 toc->toc_magic = magic;
46 SpinLockInit(&toc->toc_mutex);
47
48 /*
49 * The alignment code in shm_toc_allocate() assumes that the starting
50 * value is buffer-aligned.
51 */
52 toc->toc_total_bytes = BUFFERALIGN_DOWN(nbytes);
53 toc->toc_allocated_bytes = 0;
54 toc->toc_nentry = 0;
55
56 return toc;
57}
58
59/*
60 * Attach to an existing table of contents. If the magic number found at
61 * the target address doesn't match our expectations, return NULL.
62 */
63shm_toc *
64shm_toc_attach(uint64 magic, void *address)
65{
66 shm_toc *toc = (shm_toc *) address;
67
68 if (toc->toc_magic != magic)
69 return NULL;
70
71 Assert(toc->toc_total_bytes >= toc->toc_allocated_bytes);
72 Assert(toc->toc_total_bytes > offsetof(shm_toc, toc_entry));
73
74 return toc;
75}
76
77/*
78 * Allocate shared memory from a segment managed by a table of contents.
79 *
80 * This is not a full-blown allocator; there's no way to free memory. It's
81 * just a way of dividing a single physical shared memory segment into logical
82 * chunks that may be used for different purposes.
83 *
84 * We allocate backwards from the end of the segment, so that the TOC entries
85 * can grow forward from the start of the segment.
86 */
87void *
88shm_toc_allocate(shm_toc *toc, Size nbytes)
89{
90 volatile shm_toc *vtoc = toc;
91 Size total_bytes;
92 Size allocated_bytes;
93 Size nentry;
94 Size toc_bytes;
95
96 /*
97 * Make sure request is well-aligned. XXX: MAXALIGN is not enough,
98 * because atomic ops might need a wider alignment. We don't have a
99 * proper definition for the minimum to make atomic ops safe, but
100 * BUFFERALIGN ought to be enough.
101 */
102 nbytes = BUFFERALIGN(nbytes);
103
104 SpinLockAcquire(&toc->toc_mutex);
105
106 total_bytes = vtoc->toc_total_bytes;
107 allocated_bytes = vtoc->toc_allocated_bytes;
108 nentry = vtoc->toc_nentry;
109 toc_bytes = offsetof(shm_toc, toc_entry) + nentry * sizeof(shm_toc_entry)
110 + allocated_bytes;
111
112 /* Check for memory exhaustion and overflow. */
113 if (toc_bytes + nbytes > total_bytes || toc_bytes + nbytes < toc_bytes)
114 {
115 SpinLockRelease(&toc->toc_mutex);
116 ereport(ERROR,
117 (errcode(ERRCODE_OUT_OF_MEMORY),
118 errmsg("out of shared memory")));
119 }
120 vtoc->toc_allocated_bytes += nbytes;
121
122 SpinLockRelease(&toc->toc_mutex);
123
124 return ((char *) toc) + (total_bytes - allocated_bytes - nbytes);
125}
126
127/*
128 * Return the number of bytes that can still be allocated.
129 */
130Size
131shm_toc_freespace(shm_toc *toc)
132{
133 volatile shm_toc *vtoc = toc;
134 Size total_bytes;
135 Size allocated_bytes;
136 Size nentry;
137 Size toc_bytes;
138
139 SpinLockAcquire(&toc->toc_mutex);
140 total_bytes = vtoc->toc_total_bytes;
141 allocated_bytes = vtoc->toc_allocated_bytes;
142 nentry = vtoc->toc_nentry;
143 SpinLockRelease(&toc->toc_mutex);
144
145 toc_bytes = offsetof(shm_toc, toc_entry) + nentry * sizeof(shm_toc_entry);
146 Assert(allocated_bytes + BUFFERALIGN(toc_bytes) <= total_bytes);
147 return total_bytes - (allocated_bytes + BUFFERALIGN(toc_bytes));
148}
149
150/*
151 * Insert a TOC entry.
152 *
153 * The idea here is that the process setting up the shared memory segment will
154 * register the addresses of data structures within the segment using this
155 * function. Each data structure will be identified using a 64-bit key, which
156 * is assumed to be a well-known or discoverable integer. Other processes
157 * accessing the shared memory segment can pass the same key to
158 * shm_toc_lookup() to discover the addresses of those data structures.
159 *
160 * Since the shared memory segment may be mapped at different addresses within
161 * different backends, we store relative rather than absolute pointers.
162 *
163 * This won't scale well to a large number of keys. Hopefully, that isn't
164 * necessary; if it proves to be, we might need to provide a more sophisticated
165 * data structure here. But the real idea here is just to give someone mapping
166 * a dynamic shared memory the ability to find the bare minimum number of
167 * pointers that they need to bootstrap. If you're storing a lot of stuff in
168 * the TOC, you're doing it wrong.
169 */
170void
171shm_toc_insert(shm_toc *toc, uint64 key, void *address)
172{
173 volatile shm_toc *vtoc = toc;
174 Size total_bytes;
175 Size allocated_bytes;
176 Size nentry;
177 Size toc_bytes;
178 Size offset;
179
180 /* Relativize pointer. */
181 Assert(address > (void *) toc);
182 offset = ((char *) address) - (char *) toc;
183
184 SpinLockAcquire(&toc->toc_mutex);
185
186 total_bytes = vtoc->toc_total_bytes;
187 allocated_bytes = vtoc->toc_allocated_bytes;
188 nentry = vtoc->toc_nentry;
189 toc_bytes = offsetof(shm_toc, toc_entry) + nentry * sizeof(shm_toc_entry)
190 + allocated_bytes;
191
192 /* Check for memory exhaustion and overflow. */
193 if (toc_bytes + sizeof(shm_toc_entry) > total_bytes ||
194 toc_bytes + sizeof(shm_toc_entry) < toc_bytes ||
195 nentry >= PG_UINT32_MAX)
196 {
197 SpinLockRelease(&toc->toc_mutex);
198 ereport(ERROR,
199 (errcode(ERRCODE_OUT_OF_MEMORY),
200 errmsg("out of shared memory")));
201 }
202
203 Assert(offset < total_bytes);
204 vtoc->toc_entry[nentry].key = key;
205 vtoc->toc_entry[nentry].offset = offset;
206
207 /*
208 * By placing a write barrier after filling in the entry and before
209 * updating the number of entries, we make it safe to read the TOC
210 * unlocked.
211 */
212 pg_write_barrier();
213
214 vtoc->toc_nentry++;
215
216 SpinLockRelease(&toc->toc_mutex);
217}
218
219/*
220 * Look up a TOC entry.
221 *
222 * If the key is not found, returns NULL if noError is true, otherwise
223 * throws elog(ERROR).
224 *
225 * Unlike the other functions in this file, this operation acquires no lock;
226 * it uses only barriers. It probably wouldn't hurt concurrency very much even
227 * if it did get a lock, but since it's reasonably likely that a group of
228 * worker processes could each read a series of entries from the same TOC
229 * right around the same time, there seems to be some value in avoiding it.
230 */
231void *
232shm_toc_lookup(shm_toc *toc, uint64 key, bool noError)
233{
234 uint32 nentry;
235 uint32 i;
236
237 /*
238 * Read the number of entries before we examine any entry. We assume that
239 * reading a uint32 is atomic.
240 */
241 nentry = toc->toc_nentry;
242 pg_read_barrier();
243
244 /* Now search for a matching entry. */
245 for (i = 0; i < nentry; ++i)
246 {
247 if (toc->toc_entry[i].key == key)
248 return ((char *) toc) + toc->toc_entry[i].offset;
249 }
250
251 /* No matching entry was found. */
252 if (!noError)
253 elog(ERROR, "could not find key " UINT64_FORMAT " in shm TOC at %p",
254 key, toc);
255 return NULL;
256}
257
258/*
259 * Estimate how much shared memory will be required to store a TOC and its
260 * dependent data structures.
261 */
262Size
263shm_toc_estimate(shm_toc_estimator *e)
264{
265 Size sz;
266
267 sz = offsetof(shm_toc, toc_entry);
268 sz += add_size(sz, mul_size(e->number_of_keys, sizeof(shm_toc_entry)));
269 sz += add_size(sz, e->space_for_chunks);
270
271 return BUFFERALIGN(sz);
272}
273