1 | /*------------------------------------------------------------------------- |
2 | * |
3 | * shm_toc.c |
4 | * shared memory segment table of contents |
5 | * |
6 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
7 | * Portions Copyright (c) 1994, Regents of the University of California |
8 | * |
9 | * src/backend/storage/ipc/shm_toc.c |
10 | * |
11 | *------------------------------------------------------------------------- |
12 | */ |
13 | |
14 | #include "postgres.h" |
15 | |
16 | #include "port/atomics.h" |
17 | #include "storage/shm_toc.h" |
18 | #include "storage/spin.h" |
19 | |
/*
 * One entry in the table of contents: maps a caller-chosen 64-bit key to the
 * location of a chunk within the segment.  Offsets (not pointers) are stored
 * because the segment may be mapped at different addresses in each backend.
 */
typedef struct shm_toc_entry
{
	uint64		key;			/* Arbitrary identifier */
	Size		offset;			/* Offset, in bytes, from TOC start */
} shm_toc_entry;
25 | |
/*
 * Table-of-contents header, stored at the very start of the shared memory
 * region it manages.  The entry array grows forward from the header while
 * chunk allocations grow backward from the end of the region; the spinlock
 * protects all mutable bookkeeping fields.
 */
struct shm_toc
{
	uint64		toc_magic;		/* Magic number identifying this TOC */
	slock_t		toc_mutex;		/* Spinlock for mutual exclusion */
	Size		toc_total_bytes;	/* Bytes managed by this TOC */
	Size		toc_allocated_bytes;	/* Bytes allocated of those managed */
	uint32		toc_nentry;		/* Number of entries in TOC */
	shm_toc_entry toc_entry[FLEXIBLE_ARRAY_MEMBER];
};
35 | |
36 | /* |
37 | * Initialize a region of shared memory with a table of contents. |
38 | */ |
39 | shm_toc * |
40 | shm_toc_create(uint64 magic, void *address, Size nbytes) |
41 | { |
42 | shm_toc *toc = (shm_toc *) address; |
43 | |
44 | Assert(nbytes > offsetof(shm_toc, toc_entry)); |
45 | toc->toc_magic = magic; |
46 | SpinLockInit(&toc->toc_mutex); |
47 | |
48 | /* |
49 | * The alignment code in shm_toc_allocate() assumes that the starting |
50 | * value is buffer-aligned. |
51 | */ |
52 | toc->toc_total_bytes = BUFFERALIGN_DOWN(nbytes); |
53 | toc->toc_allocated_bytes = 0; |
54 | toc->toc_nentry = 0; |
55 | |
56 | return toc; |
57 | } |
58 | |
59 | /* |
60 | * Attach to an existing table of contents. If the magic number found at |
61 | * the target address doesn't match our expectations, return NULL. |
62 | */ |
63 | shm_toc * |
64 | shm_toc_attach(uint64 magic, void *address) |
65 | { |
66 | shm_toc *toc = (shm_toc *) address; |
67 | |
68 | if (toc->toc_magic != magic) |
69 | return NULL; |
70 | |
71 | Assert(toc->toc_total_bytes >= toc->toc_allocated_bytes); |
72 | Assert(toc->toc_total_bytes > offsetof(shm_toc, toc_entry)); |
73 | |
74 | return toc; |
75 | } |
76 | |
77 | /* |
78 | * Allocate shared memory from a segment managed by a table of contents. |
79 | * |
80 | * This is not a full-blown allocator; there's no way to free memory. It's |
81 | * just a way of dividing a single physical shared memory segment into logical |
82 | * chunks that may be used for different purposes. |
83 | * |
84 | * We allocate backwards from the end of the segment, so that the TOC entries |
85 | * can grow forward from the start of the segment. |
86 | */ |
/*
 * Allocate shared memory from a segment managed by a table of contents.
 *
 * This is not a full-blown allocator; there's no way to free memory.  It's
 * just a way of dividing a single physical shared memory segment into logical
 * chunks that may be used for different purposes.
 *
 * We allocate backwards from the end of the segment, so that the TOC entries
 * can grow forward from the start of the segment.
 *
 * Throws ERROR (out of shared memory) if the request cannot be satisfied.
 */
void *
shm_toc_allocate(shm_toc *toc, Size nbytes)
{
	/* volatile qualifier forces re-reads of fields protected by the lock */
	volatile shm_toc *vtoc = toc;
	Size		total_bytes;
	Size		allocated_bytes;
	Size		nentry;
	Size		toc_bytes;

	/*
	 * Make sure request is well-aligned.  XXX: MAXALIGN is not enough,
	 * because atomic ops might need a wider alignment.  We don't have a
	 * proper definition for the minimum to make atomic ops safe, but
	 * BUFFERALIGN ought to be enough.
	 */
	nbytes = BUFFERALIGN(nbytes);

	SpinLockAcquire(&toc->toc_mutex);

	/* Snapshot the bookkeeping fields while holding the lock. */
	total_bytes = vtoc->toc_total_bytes;
	allocated_bytes = vtoc->toc_allocated_bytes;
	nentry = vtoc->toc_nentry;
	/* Bytes consumed so far: header + entry array + chunk allocations. */
	toc_bytes = offsetof(shm_toc, toc_entry) + nentry * sizeof(shm_toc_entry)
		+ allocated_bytes;

	/* Check for memory exhaustion and overflow. */
	if (toc_bytes + nbytes > total_bytes || toc_bytes + nbytes < toc_bytes)
	{
		/* Must release the spinlock before ereport() longjmps out. */
		SpinLockRelease(&toc->toc_mutex);
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of shared memory")));
	}
	vtoc->toc_allocated_bytes += nbytes;

	SpinLockRelease(&toc->toc_mutex);

	/*
	 * The new chunk sits just below the previously allocated space at the
	 * tail of the segment; total_bytes/allocated_bytes were captured under
	 * the lock, so this address is stable even if others allocate now.
	 */
	return ((char *) toc) + (total_bytes - allocated_bytes - nbytes);
}
126 | |
127 | /* |
128 | * Return the number of bytes that can still be allocated. |
129 | */ |
130 | Size |
131 | shm_toc_freespace(shm_toc *toc) |
132 | { |
133 | volatile shm_toc *vtoc = toc; |
134 | Size total_bytes; |
135 | Size allocated_bytes; |
136 | Size nentry; |
137 | Size toc_bytes; |
138 | |
139 | SpinLockAcquire(&toc->toc_mutex); |
140 | total_bytes = vtoc->toc_total_bytes; |
141 | allocated_bytes = vtoc->toc_allocated_bytes; |
142 | nentry = vtoc->toc_nentry; |
143 | SpinLockRelease(&toc->toc_mutex); |
144 | |
145 | toc_bytes = offsetof(shm_toc, toc_entry) + nentry * sizeof(shm_toc_entry); |
146 | Assert(allocated_bytes + BUFFERALIGN(toc_bytes) <= total_bytes); |
147 | return total_bytes - (allocated_bytes + BUFFERALIGN(toc_bytes)); |
148 | } |
149 | |
150 | /* |
151 | * Insert a TOC entry. |
152 | * |
153 | * The idea here is that the process setting up the shared memory segment will |
154 | * register the addresses of data structures within the segment using this |
155 | * function. Each data structure will be identified using a 64-bit key, which |
156 | * is assumed to be a well-known or discoverable integer. Other processes |
157 | * accessing the shared memory segment can pass the same key to |
158 | * shm_toc_lookup() to discover the addresses of those data structures. |
159 | * |
160 | * Since the shared memory segment may be mapped at different addresses within |
161 | * different backends, we store relative rather than absolute pointers. |
162 | * |
163 | * This won't scale well to a large number of keys. Hopefully, that isn't |
164 | * necessary; if it proves to be, we might need to provide a more sophisticated |
165 | * data structure here. But the real idea here is just to give someone mapping |
166 | * a dynamic shared memory the ability to find the bare minimum number of |
167 | * pointers that they need to bootstrap. If you're storing a lot of stuff in |
168 | * the TOC, you're doing it wrong. |
169 | */ |
/*
 * Insert a TOC entry.
 *
 * The idea here is that the process setting up the shared memory segment will
 * register the addresses of data structures within the segment using this
 * function.  Each data structure will be identified using a 64-bit key, which
 * is assumed to be a well-known or discoverable integer.  Other processes
 * accessing the shared memory segment can pass the same key to
 * shm_toc_lookup() to discover the addresses of those data structures.
 *
 * Since the shared memory segment may be mapped at different addresses within
 * different backends, we store relative rather than absolute pointers.
 *
 * This won't scale well to a large number of keys.  Hopefully, that isn't
 * necessary; if it proves to be, we might need to provide a more sophisticated
 * data structure here.  But the real idea here is just to give someone mapping
 * a dynamic shared memory the ability to find the bare minimum number of
 * pointers that they need to bootstrap.  If you're storing a lot of stuff in
 * the TOC, you're doing it wrong.
 *
 * Throws ERROR (out of shared memory) if there is no room for another entry.
 */
void
shm_toc_insert(shm_toc *toc, uint64 key, void *address)
{
	/* volatile qualifier forces re-reads of fields protected by the lock */
	volatile shm_toc *vtoc = toc;
	Size		total_bytes;
	Size		allocated_bytes;
	Size		nentry;
	Size		toc_bytes;
	Size		offset;

	/* Relativize pointer: store the offset from the TOC base instead. */
	Assert(address > (void *) toc);
	offset = ((char *) address) - (char *) toc;

	SpinLockAcquire(&toc->toc_mutex);

	/* Snapshot the bookkeeping fields while holding the lock. */
	total_bytes = vtoc->toc_total_bytes;
	allocated_bytes = vtoc->toc_allocated_bytes;
	nentry = vtoc->toc_nentry;
	/* Bytes consumed so far: header + entry array + chunk allocations. */
	toc_bytes = offsetof(shm_toc, toc_entry) + nentry * sizeof(shm_toc_entry)
		+ allocated_bytes;

	/* Check for memory exhaustion and overflow. */
	if (toc_bytes + sizeof(shm_toc_entry) > total_bytes ||
		toc_bytes + sizeof(shm_toc_entry) < toc_bytes ||
		nentry >= PG_UINT32_MAX)
	{
		/* Must release the spinlock before ereport() longjmps out. */
		SpinLockRelease(&toc->toc_mutex);
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of shared memory")));
	}

	/* The registered address must lie inside the managed region. */
	Assert(offset < total_bytes);
	vtoc->toc_entry[nentry].key = key;
	vtoc->toc_entry[nentry].offset = offset;

	/*
	 * By placing a write barrier after filling in the entry and before
	 * updating the number of entries, we make it safe to read the TOC
	 * unlocked.
	 */
	pg_write_barrier();

	vtoc->toc_nentry++;

	SpinLockRelease(&toc->toc_mutex);
}
218 | |
219 | /* |
220 | * Look up a TOC entry. |
221 | * |
222 | * If the key is not found, returns NULL if noError is true, otherwise |
223 | * throws elog(ERROR). |
224 | * |
225 | * Unlike the other functions in this file, this operation acquires no lock; |
226 | * it uses only barriers. It probably wouldn't hurt concurrency very much even |
227 | * if it did get a lock, but since it's reasonably likely that a group of |
228 | * worker processes could each read a series of entries from the same TOC |
229 | * right around the same time, there seems to be some value in avoiding it. |
230 | */ |
231 | void * |
232 | shm_toc_lookup(shm_toc *toc, uint64 key, bool noError) |
233 | { |
234 | uint32 nentry; |
235 | uint32 i; |
236 | |
237 | /* |
238 | * Read the number of entries before we examine any entry. We assume that |
239 | * reading a uint32 is atomic. |
240 | */ |
241 | nentry = toc->toc_nentry; |
242 | pg_read_barrier(); |
243 | |
244 | /* Now search for a matching entry. */ |
245 | for (i = 0; i < nentry; ++i) |
246 | { |
247 | if (toc->toc_entry[i].key == key) |
248 | return ((char *) toc) + toc->toc_entry[i].offset; |
249 | } |
250 | |
251 | /* No matching entry was found. */ |
252 | if (!noError) |
253 | elog(ERROR, "could not find key " UINT64_FORMAT " in shm TOC at %p" , |
254 | key, toc); |
255 | return NULL; |
256 | } |
257 | |
258 | /* |
259 | * Estimate how much shared memory will be required to store a TOC and its |
260 | * dependent data structures. |
261 | */ |
262 | Size |
263 | shm_toc_estimate(shm_toc_estimator *e) |
264 | { |
265 | Size sz; |
266 | |
267 | sz = offsetof(shm_toc, toc_entry); |
268 | sz += add_size(sz, mul_size(e->number_of_keys, sizeof(shm_toc_entry))); |
269 | sz += add_size(sz, e->space_for_chunks); |
270 | |
271 | return BUFFERALIGN(sz); |
272 | } |
273 | |