1/*-------------------------------------------------------------------------
2 *
3 * partdesc.c
4 * Support routines for manipulating partition descriptors
5 *
6 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 * IDENTIFICATION
10 * src/backend/partitioning/partdesc.c
11 *
12 *-------------------------------------------------------------------------
13 */
14
15#include "postgres.h"
16
17#include "access/genam.h"
18#include "access/htup_details.h"
19#include "access/table.h"
20#include "catalog/indexing.h"
21#include "catalog/partition.h"
22#include "catalog/pg_inherits.h"
23#include "partitioning/partbounds.h"
24#include "partitioning/partdesc.h"
25#include "storage/bufmgr.h"
26#include "storage/sinval.h"
27#include "utils/builtins.h"
28#include "utils/inval.h"
29#include "utils/fmgroids.h"
30#include "utils/hsearch.h"
31#include "utils/lsyscache.h"
32#include "utils/memutils.h"
33#include "utils/rel.h"
34#include "utils/partcache.h"
35#include "utils/syscache.h"
36
37typedef struct PartitionDirectoryData
38{
39 MemoryContext pdir_mcxt;
40 HTAB *pdir_hash;
41} PartitionDirectoryData;
42
43typedef struct PartitionDirectoryEntry
44{
45 Oid reloid;
46 Relation rel;
47 PartitionDesc pd;
48} PartitionDirectoryEntry;
49
50/*
51 * RelationBuildPartitionDesc
52 * Form rel's partition descriptor, and store in relcache entry
53 *
54 * Note: the descriptor won't be flushed from the cache by
55 * RelationClearRelation() unless it's changed because of
56 * addition or removal of a partition. Hence, code holding a lock
57 * that's sufficient to prevent that can assume that rd_partdesc
58 * won't change underneath it.
59 */
60void
61RelationBuildPartitionDesc(Relation rel)
62{
63 PartitionDesc partdesc;
64 PartitionBoundInfo boundinfo = NULL;
65 List *inhoids;
66 PartitionBoundSpec **boundspecs = NULL;
67 Oid *oids = NULL;
68 ListCell *cell;
69 int i,
70 nparts;
71 PartitionKey key = RelationGetPartitionKey(rel);
72 MemoryContext oldcxt;
73 int *mapping;
74
75 /*
76 * Get partition oids from pg_inherits. This uses a single snapshot to
77 * fetch the list of children, so while more children may be getting added
78 * concurrently, whatever this function returns will be accurate as of
79 * some well-defined point in time.
80 */
81 inhoids = find_inheritance_children(RelationGetRelid(rel), NoLock);
82 nparts = list_length(inhoids);
83
84 /* Allocate arrays for OIDs and boundspecs. */
85 if (nparts > 0)
86 {
87 oids = palloc(nparts * sizeof(Oid));
88 boundspecs = palloc(nparts * sizeof(PartitionBoundSpec *));
89 }
90
91 /* Collect bound spec nodes for each partition. */
92 i = 0;
93 foreach(cell, inhoids)
94 {
95 Oid inhrelid = lfirst_oid(cell);
96 HeapTuple tuple;
97 PartitionBoundSpec *boundspec = NULL;
98
99 /* Try fetching the tuple from the catcache, for speed. */
100 tuple = SearchSysCache1(RELOID, inhrelid);
101 if (HeapTupleIsValid(tuple))
102 {
103 Datum datum;
104 bool isnull;
105
106 datum = SysCacheGetAttr(RELOID, tuple,
107 Anum_pg_class_relpartbound,
108 &isnull);
109 if (!isnull)
110 boundspec = stringToNode(TextDatumGetCString(datum));
111 ReleaseSysCache(tuple);
112 }
113
114 /*
115 * The system cache may be out of date; if so, we may find no pg_class
116 * tuple or an old one where relpartbound is NULL. In that case, try
117 * the table directly. We can't just AcceptInvalidationMessages() and
118 * retry the system cache lookup because it's possible that a
119 * concurrent ATTACH PARTITION operation has removed itself to the
120 * ProcArray but yet added invalidation messages to the shared queue;
121 * InvalidateSystemCaches() would work, but seems excessive.
122 *
123 * Note that this algorithm assumes that PartitionBoundSpec we manage
124 * to fetch is the right one -- so this is only good enough for
125 * concurrent ATTACH PARTITION, not concurrent DETACH PARTITION or
126 * some hypothetical operation that changes the partition bounds.
127 */
128 if (boundspec == NULL)
129 {
130 Relation pg_class;
131 SysScanDesc scan;
132 ScanKeyData key[1];
133 Datum datum;
134 bool isnull;
135
136 pg_class = table_open(RelationRelationId, AccessShareLock);
137 ScanKeyInit(&key[0],
138 Anum_pg_class_oid,
139 BTEqualStrategyNumber, F_OIDEQ,
140 ObjectIdGetDatum(inhrelid));
141 scan = systable_beginscan(pg_class, ClassOidIndexId, true,
142 NULL, 1, key);
143 tuple = systable_getnext(scan);
144 datum = heap_getattr(tuple, Anum_pg_class_relpartbound,
145 RelationGetDescr(pg_class), &isnull);
146 if (!isnull)
147 boundspec = stringToNode(TextDatumGetCString(datum));
148 systable_endscan(scan);
149 table_close(pg_class, AccessShareLock);
150 }
151
152 /* Sanity checks. */
153 if (!boundspec)
154 elog(ERROR, "missing relpartbound for relation %u", inhrelid);
155 if (!IsA(boundspec, PartitionBoundSpec))
156 elog(ERROR, "invalid relpartbound for relation %u", inhrelid);
157
158 /*
159 * If the PartitionBoundSpec says this is the default partition, its
160 * OID should match pg_partitioned_table.partdefid; if not, the
161 * catalog is corrupt.
162 */
163 if (boundspec->is_default)
164 {
165 Oid partdefid;
166
167 partdefid = get_default_partition_oid(RelationGetRelid(rel));
168 if (partdefid != inhrelid)
169 elog(ERROR, "expected partdefid %u, but got %u",
170 inhrelid, partdefid);
171 }
172
173 /* Save results. */
174 oids[i] = inhrelid;
175 boundspecs[i] = boundspec;
176 ++i;
177 }
178
179 /* Assert we aren't about to leak any old data structure */
180 Assert(rel->rd_pdcxt == NULL);
181 Assert(rel->rd_partdesc == NULL);
182
183 /*
184 * Now build the actual relcache partition descriptor. Note that the
185 * order of operations here is fairly critical. If we fail partway
186 * through this code, we won't have leaked memory because the rd_pdcxt is
187 * attached to the relcache entry immediately, so it'll be freed whenever
188 * the entry is rebuilt or destroyed. However, we don't assign to
189 * rd_partdesc until the cached data structure is fully complete and
190 * valid, so that no other code might try to use it.
191 */
192 rel->rd_pdcxt = AllocSetContextCreate(CacheMemoryContext,
193 "partition descriptor",
194 ALLOCSET_SMALL_SIZES);
195 MemoryContextCopyAndSetIdentifier(rel->rd_pdcxt,
196 RelationGetRelationName(rel));
197
198 partdesc = (PartitionDescData *)
199 MemoryContextAllocZero(rel->rd_pdcxt, sizeof(PartitionDescData));
200 partdesc->nparts = nparts;
201 /* If there are no partitions, the rest of the partdesc can stay zero */
202 if (nparts > 0)
203 {
204 /* Create PartitionBoundInfo, using the caller's context. */
205 boundinfo = partition_bounds_create(boundspecs, nparts, key, &mapping);
206
207 /* Now copy all info into relcache's partdesc. */
208 oldcxt = MemoryContextSwitchTo(rel->rd_pdcxt);
209 partdesc->boundinfo = partition_bounds_copy(boundinfo, key);
210 partdesc->oids = (Oid *) palloc(nparts * sizeof(Oid));
211 partdesc->is_leaf = (bool *) palloc(nparts * sizeof(bool));
212 MemoryContextSwitchTo(oldcxt);
213
214 /*
215 * Assign OIDs from the original array into mapped indexes of the
216 * result array. The order of OIDs in the former is defined by the
217 * catalog scan that retrieved them, whereas that in the latter is
218 * defined by canonicalized representation of the partition bounds.
219 *
220 * Also record leaf-ness of each partition. For this we use
221 * get_rel_relkind() which may leak memory, so be sure to run it in
222 * the caller's context.
223 */
224 for (i = 0; i < nparts; i++)
225 {
226 int index = mapping[i];
227
228 partdesc->oids[index] = oids[i];
229 partdesc->is_leaf[index] =
230 (get_rel_relkind(oids[i]) != RELKIND_PARTITIONED_TABLE);
231 }
232 }
233
234 rel->rd_partdesc = partdesc;
235}
236
237/*
238 * CreatePartitionDirectory
239 * Create a new partition directory object.
240 */
241PartitionDirectory
242CreatePartitionDirectory(MemoryContext mcxt)
243{
244 MemoryContext oldcontext = MemoryContextSwitchTo(mcxt);
245 PartitionDirectory pdir;
246 HASHCTL ctl;
247
248 MemSet(&ctl, 0, sizeof(HASHCTL));
249 ctl.keysize = sizeof(Oid);
250 ctl.entrysize = sizeof(PartitionDirectoryEntry);
251 ctl.hcxt = mcxt;
252
253 pdir = palloc(sizeof(PartitionDirectoryData));
254 pdir->pdir_mcxt = mcxt;
255 pdir->pdir_hash = hash_create("partition directory", 256, &ctl,
256 HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
257
258 MemoryContextSwitchTo(oldcontext);
259 return pdir;
260}
261
262/*
263 * PartitionDirectoryLookup
264 * Look up the partition descriptor for a relation in the directory.
265 *
266 * The purpose of this function is to ensure that we get the same
267 * PartitionDesc for each relation every time we look it up. In the
268 * face of current DDL, different PartitionDescs may be constructed with
269 * different views of the catalog state, but any single particular OID
270 * will always get the same PartitionDesc for as long as the same
271 * PartitionDirectory is used.
272 */
273PartitionDesc
274PartitionDirectoryLookup(PartitionDirectory pdir, Relation rel)
275{
276 PartitionDirectoryEntry *pde;
277 Oid relid = RelationGetRelid(rel);
278 bool found;
279
280 pde = hash_search(pdir->pdir_hash, &relid, HASH_ENTER, &found);
281 if (!found)
282 {
283 /*
284 * We must keep a reference count on the relation so that the
285 * PartitionDesc to which we are pointing can't get destroyed.
286 */
287 RelationIncrementReferenceCount(rel);
288 pde->rel = rel;
289 pde->pd = RelationGetPartitionDesc(rel);
290 Assert(pde->pd != NULL);
291 }
292 return pde->pd;
293}
294
295/*
296 * DestroyPartitionDirectory
297 * Destroy a partition directory.
298 *
299 * Release the reference counts we're holding.
300 */
301void
302DestroyPartitionDirectory(PartitionDirectory pdir)
303{
304 HASH_SEQ_STATUS status;
305 PartitionDirectoryEntry *pde;
306
307 hash_seq_init(&status, pdir->pdir_hash);
308 while ((pde = hash_seq_search(&status)) != NULL)
309 RelationDecrementReferenceCount(pde->rel);
310}
311
312/*
313 * equalPartitionDescs
314 * Compare two partition descriptors for logical equality
315 */
316bool
317equalPartitionDescs(PartitionKey key, PartitionDesc partdesc1,
318 PartitionDesc partdesc2)
319{
320 int i;
321
322 if (partdesc1 != NULL)
323 {
324 if (partdesc2 == NULL)
325 return false;
326 if (partdesc1->nparts != partdesc2->nparts)
327 return false;
328
329 Assert(key != NULL || partdesc1->nparts == 0);
330
331 /*
332 * Same oids? If the partitioning structure did not change, that is,
333 * no partitions were added or removed to the relation, the oids array
334 * should still match element-by-element.
335 */
336 for (i = 0; i < partdesc1->nparts; i++)
337 {
338 if (partdesc1->oids[i] != partdesc2->oids[i])
339 return false;
340 }
341
342 /*
343 * Now compare partition bound collections. The logic to iterate over
344 * the collections is private to partition.c.
345 */
346 if (partdesc1->boundinfo != NULL)
347 {
348 if (partdesc2->boundinfo == NULL)
349 return false;
350
351 if (!partition_bounds_equal(key->partnatts, key->parttyplen,
352 key->parttypbyval,
353 partdesc1->boundinfo,
354 partdesc2->boundinfo))
355 return false;
356 }
357 else if (partdesc2->boundinfo != NULL)
358 return false;
359 }
360 else if (partdesc2 != NULL)
361 return false;
362
363 return true;
364}
365
366/*
367 * get_default_oid_from_partdesc
368 *
369 * Given a partition descriptor, return the OID of the default partition, if
370 * one exists; else, return InvalidOid.
371 */
372Oid
373get_default_oid_from_partdesc(PartitionDesc partdesc)
374{
375 if (partdesc && partdesc->boundinfo &&
376 partition_bound_has_default(partdesc->boundinfo))
377 return partdesc->oids[partdesc->boundinfo->default_index];
378
379 return InvalidOid;
380}
381