1 | /*------------------------------------------------------------------------- |
2 | * |
3 | * partdesc.c |
4 | * Support routines for manipulating partition descriptors |
5 | * |
6 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
7 | * Portions Copyright (c) 1994, Regents of the University of California |
8 | * |
9 | * IDENTIFICATION |
10 | * src/backend/partitioning/partdesc.c |
11 | * |
12 | *------------------------------------------------------------------------- |
13 | */ |
14 | |
15 | #include "postgres.h" |
16 | |
17 | #include "access/genam.h" |
18 | #include "access/htup_details.h" |
19 | #include "access/table.h" |
20 | #include "catalog/indexing.h" |
21 | #include "catalog/partition.h" |
22 | #include "catalog/pg_inherits.h" |
23 | #include "partitioning/partbounds.h" |
24 | #include "partitioning/partdesc.h" |
25 | #include "storage/bufmgr.h" |
26 | #include "storage/sinval.h" |
27 | #include "utils/builtins.h" |
28 | #include "utils/inval.h" |
29 | #include "utils/fmgroids.h" |
30 | #include "utils/hsearch.h" |
31 | #include "utils/lsyscache.h" |
32 | #include "utils/memutils.h" |
33 | #include "utils/rel.h" |
34 | #include "utils/partcache.h" |
35 | #include "utils/syscache.h" |
36 | |
37 | typedef struct PartitionDirectoryData |
38 | { |
39 | MemoryContext pdir_mcxt; |
40 | HTAB *pdir_hash; |
41 | } PartitionDirectoryData; |
42 | |
43 | typedef struct PartitionDirectoryEntry |
44 | { |
45 | Oid reloid; |
46 | Relation rel; |
47 | PartitionDesc pd; |
48 | } PartitionDirectoryEntry; |
49 | |
50 | /* |
51 | * RelationBuildPartitionDesc |
52 | * Form rel's partition descriptor, and store in relcache entry |
53 | * |
54 | * Note: the descriptor won't be flushed from the cache by |
55 | * RelationClearRelation() unless it's changed because of |
56 | * addition or removal of a partition. Hence, code holding a lock |
57 | * that's sufficient to prevent that can assume that rd_partdesc |
58 | * won't change underneath it. |
59 | */ |
60 | void |
61 | RelationBuildPartitionDesc(Relation rel) |
62 | { |
63 | PartitionDesc partdesc; |
64 | PartitionBoundInfo boundinfo = NULL; |
65 | List *inhoids; |
66 | PartitionBoundSpec **boundspecs = NULL; |
67 | Oid *oids = NULL; |
68 | ListCell *cell; |
69 | int i, |
70 | nparts; |
71 | PartitionKey key = RelationGetPartitionKey(rel); |
72 | MemoryContext oldcxt; |
73 | int *mapping; |
74 | |
75 | /* |
76 | * Get partition oids from pg_inherits. This uses a single snapshot to |
77 | * fetch the list of children, so while more children may be getting added |
78 | * concurrently, whatever this function returns will be accurate as of |
79 | * some well-defined point in time. |
80 | */ |
81 | inhoids = find_inheritance_children(RelationGetRelid(rel), NoLock); |
82 | nparts = list_length(inhoids); |
83 | |
84 | /* Allocate arrays for OIDs and boundspecs. */ |
85 | if (nparts > 0) |
86 | { |
87 | oids = palloc(nparts * sizeof(Oid)); |
88 | boundspecs = palloc(nparts * sizeof(PartitionBoundSpec *)); |
89 | } |
90 | |
91 | /* Collect bound spec nodes for each partition. */ |
92 | i = 0; |
93 | foreach(cell, inhoids) |
94 | { |
95 | Oid inhrelid = lfirst_oid(cell); |
96 | HeapTuple tuple; |
97 | PartitionBoundSpec *boundspec = NULL; |
98 | |
99 | /* Try fetching the tuple from the catcache, for speed. */ |
100 | tuple = SearchSysCache1(RELOID, inhrelid); |
101 | if (HeapTupleIsValid(tuple)) |
102 | { |
103 | Datum datum; |
104 | bool isnull; |
105 | |
106 | datum = SysCacheGetAttr(RELOID, tuple, |
107 | Anum_pg_class_relpartbound, |
108 | &isnull); |
109 | if (!isnull) |
110 | boundspec = stringToNode(TextDatumGetCString(datum)); |
111 | ReleaseSysCache(tuple); |
112 | } |
113 | |
114 | /* |
115 | * The system cache may be out of date; if so, we may find no pg_class |
116 | * tuple or an old one where relpartbound is NULL. In that case, try |
117 | * the table directly. We can't just AcceptInvalidationMessages() and |
118 | * retry the system cache lookup because it's possible that a |
119 | * concurrent ATTACH PARTITION operation has removed itself to the |
120 | * ProcArray but yet added invalidation messages to the shared queue; |
121 | * InvalidateSystemCaches() would work, but seems excessive. |
122 | * |
123 | * Note that this algorithm assumes that PartitionBoundSpec we manage |
124 | * to fetch is the right one -- so this is only good enough for |
125 | * concurrent ATTACH PARTITION, not concurrent DETACH PARTITION or |
126 | * some hypothetical operation that changes the partition bounds. |
127 | */ |
128 | if (boundspec == NULL) |
129 | { |
130 | Relation pg_class; |
131 | SysScanDesc scan; |
132 | ScanKeyData key[1]; |
133 | Datum datum; |
134 | bool isnull; |
135 | |
136 | pg_class = table_open(RelationRelationId, AccessShareLock); |
137 | ScanKeyInit(&key[0], |
138 | Anum_pg_class_oid, |
139 | BTEqualStrategyNumber, F_OIDEQ, |
140 | ObjectIdGetDatum(inhrelid)); |
141 | scan = systable_beginscan(pg_class, ClassOidIndexId, true, |
142 | NULL, 1, key); |
143 | tuple = systable_getnext(scan); |
144 | datum = heap_getattr(tuple, Anum_pg_class_relpartbound, |
145 | RelationGetDescr(pg_class), &isnull); |
146 | if (!isnull) |
147 | boundspec = stringToNode(TextDatumGetCString(datum)); |
148 | systable_endscan(scan); |
149 | table_close(pg_class, AccessShareLock); |
150 | } |
151 | |
152 | /* Sanity checks. */ |
153 | if (!boundspec) |
154 | elog(ERROR, "missing relpartbound for relation %u" , inhrelid); |
155 | if (!IsA(boundspec, PartitionBoundSpec)) |
156 | elog(ERROR, "invalid relpartbound for relation %u" , inhrelid); |
157 | |
158 | /* |
159 | * If the PartitionBoundSpec says this is the default partition, its |
160 | * OID should match pg_partitioned_table.partdefid; if not, the |
161 | * catalog is corrupt. |
162 | */ |
163 | if (boundspec->is_default) |
164 | { |
165 | Oid partdefid; |
166 | |
167 | partdefid = get_default_partition_oid(RelationGetRelid(rel)); |
168 | if (partdefid != inhrelid) |
169 | elog(ERROR, "expected partdefid %u, but got %u" , |
170 | inhrelid, partdefid); |
171 | } |
172 | |
173 | /* Save results. */ |
174 | oids[i] = inhrelid; |
175 | boundspecs[i] = boundspec; |
176 | ++i; |
177 | } |
178 | |
179 | /* Assert we aren't about to leak any old data structure */ |
180 | Assert(rel->rd_pdcxt == NULL); |
181 | Assert(rel->rd_partdesc == NULL); |
182 | |
183 | /* |
184 | * Now build the actual relcache partition descriptor. Note that the |
185 | * order of operations here is fairly critical. If we fail partway |
186 | * through this code, we won't have leaked memory because the rd_pdcxt is |
187 | * attached to the relcache entry immediately, so it'll be freed whenever |
188 | * the entry is rebuilt or destroyed. However, we don't assign to |
189 | * rd_partdesc until the cached data structure is fully complete and |
190 | * valid, so that no other code might try to use it. |
191 | */ |
192 | rel->rd_pdcxt = AllocSetContextCreate(CacheMemoryContext, |
193 | "partition descriptor" , |
194 | ALLOCSET_SMALL_SIZES); |
195 | MemoryContextCopyAndSetIdentifier(rel->rd_pdcxt, |
196 | RelationGetRelationName(rel)); |
197 | |
198 | partdesc = (PartitionDescData *) |
199 | MemoryContextAllocZero(rel->rd_pdcxt, sizeof(PartitionDescData)); |
200 | partdesc->nparts = nparts; |
201 | /* If there are no partitions, the rest of the partdesc can stay zero */ |
202 | if (nparts > 0) |
203 | { |
204 | /* Create PartitionBoundInfo, using the caller's context. */ |
205 | boundinfo = partition_bounds_create(boundspecs, nparts, key, &mapping); |
206 | |
207 | /* Now copy all info into relcache's partdesc. */ |
208 | oldcxt = MemoryContextSwitchTo(rel->rd_pdcxt); |
209 | partdesc->boundinfo = partition_bounds_copy(boundinfo, key); |
210 | partdesc->oids = (Oid *) palloc(nparts * sizeof(Oid)); |
211 | partdesc->is_leaf = (bool *) palloc(nparts * sizeof(bool)); |
212 | MemoryContextSwitchTo(oldcxt); |
213 | |
214 | /* |
215 | * Assign OIDs from the original array into mapped indexes of the |
216 | * result array. The order of OIDs in the former is defined by the |
217 | * catalog scan that retrieved them, whereas that in the latter is |
218 | * defined by canonicalized representation of the partition bounds. |
219 | * |
220 | * Also record leaf-ness of each partition. For this we use |
221 | * get_rel_relkind() which may leak memory, so be sure to run it in |
222 | * the caller's context. |
223 | */ |
224 | for (i = 0; i < nparts; i++) |
225 | { |
226 | int index = mapping[i]; |
227 | |
228 | partdesc->oids[index] = oids[i]; |
229 | partdesc->is_leaf[index] = |
230 | (get_rel_relkind(oids[i]) != RELKIND_PARTITIONED_TABLE); |
231 | } |
232 | } |
233 | |
234 | rel->rd_partdesc = partdesc; |
235 | } |
236 | |
237 | /* |
238 | * CreatePartitionDirectory |
239 | * Create a new partition directory object. |
240 | */ |
241 | PartitionDirectory |
242 | CreatePartitionDirectory(MemoryContext mcxt) |
243 | { |
244 | MemoryContext oldcontext = MemoryContextSwitchTo(mcxt); |
245 | PartitionDirectory pdir; |
246 | HASHCTL ctl; |
247 | |
248 | MemSet(&ctl, 0, sizeof(HASHCTL)); |
249 | ctl.keysize = sizeof(Oid); |
250 | ctl.entrysize = sizeof(PartitionDirectoryEntry); |
251 | ctl.hcxt = mcxt; |
252 | |
253 | pdir = palloc(sizeof(PartitionDirectoryData)); |
254 | pdir->pdir_mcxt = mcxt; |
255 | pdir->pdir_hash = hash_create("partition directory" , 256, &ctl, |
256 | HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); |
257 | |
258 | MemoryContextSwitchTo(oldcontext); |
259 | return pdir; |
260 | } |
261 | |
262 | /* |
263 | * PartitionDirectoryLookup |
264 | * Look up the partition descriptor for a relation in the directory. |
265 | * |
266 | * The purpose of this function is to ensure that we get the same |
267 | * PartitionDesc for each relation every time we look it up. In the |
268 | * face of current DDL, different PartitionDescs may be constructed with |
269 | * different views of the catalog state, but any single particular OID |
270 | * will always get the same PartitionDesc for as long as the same |
271 | * PartitionDirectory is used. |
272 | */ |
273 | PartitionDesc |
274 | PartitionDirectoryLookup(PartitionDirectory pdir, Relation rel) |
275 | { |
276 | PartitionDirectoryEntry *pde; |
277 | Oid relid = RelationGetRelid(rel); |
278 | bool found; |
279 | |
280 | pde = hash_search(pdir->pdir_hash, &relid, HASH_ENTER, &found); |
281 | if (!found) |
282 | { |
283 | /* |
284 | * We must keep a reference count on the relation so that the |
285 | * PartitionDesc to which we are pointing can't get destroyed. |
286 | */ |
287 | RelationIncrementReferenceCount(rel); |
288 | pde->rel = rel; |
289 | pde->pd = RelationGetPartitionDesc(rel); |
290 | Assert(pde->pd != NULL); |
291 | } |
292 | return pde->pd; |
293 | } |
294 | |
295 | /* |
296 | * DestroyPartitionDirectory |
297 | * Destroy a partition directory. |
298 | * |
299 | * Release the reference counts we're holding. |
300 | */ |
301 | void |
302 | DestroyPartitionDirectory(PartitionDirectory pdir) |
303 | { |
304 | HASH_SEQ_STATUS status; |
305 | PartitionDirectoryEntry *pde; |
306 | |
307 | hash_seq_init(&status, pdir->pdir_hash); |
308 | while ((pde = hash_seq_search(&status)) != NULL) |
309 | RelationDecrementReferenceCount(pde->rel); |
310 | } |
311 | |
312 | /* |
313 | * equalPartitionDescs |
314 | * Compare two partition descriptors for logical equality |
315 | */ |
316 | bool |
317 | equalPartitionDescs(PartitionKey key, PartitionDesc partdesc1, |
318 | PartitionDesc partdesc2) |
319 | { |
320 | int i; |
321 | |
322 | if (partdesc1 != NULL) |
323 | { |
324 | if (partdesc2 == NULL) |
325 | return false; |
326 | if (partdesc1->nparts != partdesc2->nparts) |
327 | return false; |
328 | |
329 | Assert(key != NULL || partdesc1->nparts == 0); |
330 | |
331 | /* |
332 | * Same oids? If the partitioning structure did not change, that is, |
333 | * no partitions were added or removed to the relation, the oids array |
334 | * should still match element-by-element. |
335 | */ |
336 | for (i = 0; i < partdesc1->nparts; i++) |
337 | { |
338 | if (partdesc1->oids[i] != partdesc2->oids[i]) |
339 | return false; |
340 | } |
341 | |
342 | /* |
343 | * Now compare partition bound collections. The logic to iterate over |
344 | * the collections is private to partition.c. |
345 | */ |
346 | if (partdesc1->boundinfo != NULL) |
347 | { |
348 | if (partdesc2->boundinfo == NULL) |
349 | return false; |
350 | |
351 | if (!partition_bounds_equal(key->partnatts, key->parttyplen, |
352 | key->parttypbyval, |
353 | partdesc1->boundinfo, |
354 | partdesc2->boundinfo)) |
355 | return false; |
356 | } |
357 | else if (partdesc2->boundinfo != NULL) |
358 | return false; |
359 | } |
360 | else if (partdesc2 != NULL) |
361 | return false; |
362 | |
363 | return true; |
364 | } |
365 | |
366 | /* |
367 | * get_default_oid_from_partdesc |
368 | * |
369 | * Given a partition descriptor, return the OID of the default partition, if |
370 | * one exists; else, return InvalidOid. |
371 | */ |
372 | Oid |
373 | get_default_oid_from_partdesc(PartitionDesc partdesc) |
374 | { |
375 | if (partdesc && partdesc->boundinfo && |
376 | partition_bound_has_default(partdesc->boundinfo)) |
377 | return partdesc->oids[partdesc->boundinfo->default_index]; |
378 | |
379 | return InvalidOid; |
380 | } |
381 | |