1 | /*------------------------------------------------------------------------- |
2 | * |
3 | * gistutil.c |
4 | * utilities routines for the postgres GiST index access method. |
5 | * |
6 | * |
7 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
8 | * Portions Copyright (c) 1994, Regents of the University of California |
9 | * |
10 | * IDENTIFICATION |
11 | * src/backend/access/gist/gistutil.c |
12 | *------------------------------------------------------------------------- |
13 | */ |
14 | #include "postgres.h" |
15 | |
16 | #include <math.h> |
17 | |
18 | #include "access/gist_private.h" |
19 | #include "access/htup_details.h" |
20 | #include "access/reloptions.h" |
21 | #include "catalog/pg_opclass.h" |
22 | #include "storage/indexfsm.h" |
23 | #include "storage/lmgr.h" |
24 | #include "utils/float.h" |
25 | #include "utils/syscache.h" |
26 | #include "utils/snapmgr.h" |
27 | #include "utils/lsyscache.h" |
28 | |
29 | |
30 | /* |
31 | * Write itup vector to page, has no control of free space. |
32 | */ |
33 | void |
34 | gistfillbuffer(Page page, IndexTuple *itup, int len, OffsetNumber off) |
35 | { |
36 | OffsetNumber l = InvalidOffsetNumber; |
37 | int i; |
38 | |
39 | if (off == InvalidOffsetNumber) |
40 | off = (PageIsEmpty(page)) ? FirstOffsetNumber : |
41 | OffsetNumberNext(PageGetMaxOffsetNumber(page)); |
42 | |
43 | for (i = 0; i < len; i++) |
44 | { |
45 | Size sz = IndexTupleSize(itup[i]); |
46 | |
47 | l = PageAddItem(page, (Item) itup[i], sz, off, false, false); |
48 | if (l == InvalidOffsetNumber) |
49 | elog(ERROR, "failed to add item to GiST index page, item %d out of %d, size %d bytes" , |
50 | i, len, (int) sz); |
51 | off++; |
52 | } |
53 | } |
54 | |
55 | /* |
56 | * Check space for itup vector on page |
57 | */ |
58 | bool |
59 | gistnospace(Page page, IndexTuple *itvec, int len, OffsetNumber todelete, Size freespace) |
60 | { |
61 | unsigned int size = freespace, |
62 | deleted = 0; |
63 | int i; |
64 | |
65 | for (i = 0; i < len; i++) |
66 | size += IndexTupleSize(itvec[i]) + sizeof(ItemIdData); |
67 | |
68 | if (todelete != InvalidOffsetNumber) |
69 | { |
70 | IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, todelete)); |
71 | |
72 | deleted = IndexTupleSize(itup) + sizeof(ItemIdData); |
73 | } |
74 | |
75 | return (PageGetFreeSpace(page) + deleted < size); |
76 | } |
77 | |
78 | bool |
79 | gistfitpage(IndexTuple *itvec, int len) |
80 | { |
81 | int i; |
82 | Size size = 0; |
83 | |
84 | for (i = 0; i < len; i++) |
85 | size += IndexTupleSize(itvec[i]) + sizeof(ItemIdData); |
86 | |
87 | /* TODO: Consider fillfactor */ |
88 | return (size <= GiSTPageSize); |
89 | } |
90 | |
91 | /* |
92 | * Read buffer into itup vector |
93 | */ |
94 | IndexTuple * |
95 | (Page page, int *len /* out */ ) |
96 | { |
97 | OffsetNumber i, |
98 | maxoff; |
99 | IndexTuple *itvec; |
100 | |
101 | maxoff = PageGetMaxOffsetNumber(page); |
102 | *len = maxoff; |
103 | itvec = palloc(sizeof(IndexTuple) * maxoff); |
104 | for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) |
105 | itvec[i - FirstOffsetNumber] = (IndexTuple) PageGetItem(page, PageGetItemId(page, i)); |
106 | |
107 | return itvec; |
108 | } |
109 | |
110 | /* |
111 | * join two vectors into one |
112 | */ |
113 | IndexTuple * |
114 | gistjoinvector(IndexTuple *itvec, int *len, IndexTuple *additvec, int addlen) |
115 | { |
116 | itvec = (IndexTuple *) repalloc((void *) itvec, sizeof(IndexTuple) * ((*len) + addlen)); |
117 | memmove(&itvec[*len], additvec, sizeof(IndexTuple) * addlen); |
118 | *len += addlen; |
119 | return itvec; |
120 | } |
121 | |
122 | /* |
123 | * make plain IndexTupleVector |
124 | */ |
125 | |
126 | IndexTupleData * |
127 | gistfillitupvec(IndexTuple *vec, int veclen, int *memlen) |
128 | { |
129 | char *ptr, |
130 | *ret; |
131 | int i; |
132 | |
133 | *memlen = 0; |
134 | |
135 | for (i = 0; i < veclen; i++) |
136 | *memlen += IndexTupleSize(vec[i]); |
137 | |
138 | ptr = ret = palloc(*memlen); |
139 | |
140 | for (i = 0; i < veclen; i++) |
141 | { |
142 | memcpy(ptr, vec[i], IndexTupleSize(vec[i])); |
143 | ptr += IndexTupleSize(vec[i]); |
144 | } |
145 | |
146 | return (IndexTupleData *) ret; |
147 | } |
148 | |
149 | /* |
150 | * Make unions of keys in IndexTuple vector (one union datum per index column). |
151 | * Union Datums are returned into the attr/isnull arrays. |
152 | * Resulting Datums aren't compressed. |
153 | */ |
154 | void |
155 | gistMakeUnionItVec(GISTSTATE *giststate, IndexTuple *itvec, int len, |
156 | Datum *attr, bool *isnull) |
157 | { |
158 | int i; |
159 | GistEntryVector *evec; |
160 | int attrsize; |
161 | |
162 | evec = (GistEntryVector *) palloc((len + 2) * sizeof(GISTENTRY) + GEVHDRSZ); |
163 | |
164 | for (i = 0; i < giststate->nonLeafTupdesc->natts; i++) |
165 | { |
166 | int j; |
167 | |
168 | /* Collect non-null datums for this column */ |
169 | evec->n = 0; |
170 | for (j = 0; j < len; j++) |
171 | { |
172 | Datum datum; |
173 | bool IsNull; |
174 | |
175 | datum = index_getattr(itvec[j], i + 1, giststate->leafTupdesc, |
176 | &IsNull); |
177 | if (IsNull) |
178 | continue; |
179 | |
180 | gistdentryinit(giststate, i, |
181 | evec->vector + evec->n, |
182 | datum, |
183 | NULL, NULL, (OffsetNumber) 0, |
184 | false, IsNull); |
185 | evec->n++; |
186 | } |
187 | |
188 | /* If this column was all NULLs, the union is NULL */ |
189 | if (evec->n == 0) |
190 | { |
191 | attr[i] = (Datum) 0; |
192 | isnull[i] = true; |
193 | } |
194 | else |
195 | { |
196 | if (evec->n == 1) |
197 | { |
198 | /* unionFn may expect at least two inputs */ |
199 | evec->n = 2; |
200 | evec->vector[1] = evec->vector[0]; |
201 | } |
202 | |
203 | /* Make union and store in attr array */ |
204 | attr[i] = FunctionCall2Coll(&giststate->unionFn[i], |
205 | giststate->supportCollation[i], |
206 | PointerGetDatum(evec), |
207 | PointerGetDatum(&attrsize)); |
208 | |
209 | isnull[i] = false; |
210 | } |
211 | } |
212 | } |
213 | |
214 | /* |
215 | * Return an IndexTuple containing the result of applying the "union" |
216 | * method to the specified IndexTuple vector. |
217 | */ |
218 | IndexTuple |
219 | gistunion(Relation r, IndexTuple *itvec, int len, GISTSTATE *giststate) |
220 | { |
221 | Datum attr[INDEX_MAX_KEYS]; |
222 | bool isnull[INDEX_MAX_KEYS]; |
223 | |
224 | gistMakeUnionItVec(giststate, itvec, len, attr, isnull); |
225 | |
226 | return gistFormTuple(giststate, r, attr, isnull, false); |
227 | } |
228 | |
229 | /* |
230 | * makes union of two key |
231 | */ |
232 | void |
233 | gistMakeUnionKey(GISTSTATE *giststate, int attno, |
234 | GISTENTRY *entry1, bool isnull1, |
235 | GISTENTRY *entry2, bool isnull2, |
236 | Datum *dst, bool *dstisnull) |
237 | { |
238 | /* we need a GistEntryVector with room for exactly 2 elements */ |
239 | union |
240 | { |
241 | GistEntryVector gev; |
242 | char padding[2 * sizeof(GISTENTRY) + GEVHDRSZ]; |
243 | } storage; |
244 | GistEntryVector *evec = &storage.gev; |
245 | int dstsize; |
246 | |
247 | evec->n = 2; |
248 | |
249 | if (isnull1 && isnull2) |
250 | { |
251 | *dstisnull = true; |
252 | *dst = (Datum) 0; |
253 | } |
254 | else |
255 | { |
256 | if (isnull1 == false && isnull2 == false) |
257 | { |
258 | evec->vector[0] = *entry1; |
259 | evec->vector[1] = *entry2; |
260 | } |
261 | else if (isnull1 == false) |
262 | { |
263 | evec->vector[0] = *entry1; |
264 | evec->vector[1] = *entry1; |
265 | } |
266 | else |
267 | { |
268 | evec->vector[0] = *entry2; |
269 | evec->vector[1] = *entry2; |
270 | } |
271 | |
272 | *dstisnull = false; |
273 | *dst = FunctionCall2Coll(&giststate->unionFn[attno], |
274 | giststate->supportCollation[attno], |
275 | PointerGetDatum(evec), |
276 | PointerGetDatum(&dstsize)); |
277 | } |
278 | } |
279 | |
280 | bool |
281 | gistKeyIsEQ(GISTSTATE *giststate, int attno, Datum a, Datum b) |
282 | { |
283 | bool result; |
284 | |
285 | FunctionCall3Coll(&giststate->equalFn[attno], |
286 | giststate->supportCollation[attno], |
287 | a, b, |
288 | PointerGetDatum(&result)); |
289 | return result; |
290 | } |
291 | |
292 | /* |
293 | * Decompress all keys in tuple |
294 | */ |
295 | void |
296 | gistDeCompressAtt(GISTSTATE *giststate, Relation r, IndexTuple tuple, Page p, |
297 | OffsetNumber o, GISTENTRY *attdata, bool *isnull) |
298 | { |
299 | int i; |
300 | |
301 | for (i = 0; i < IndexRelationGetNumberOfKeyAttributes(r); i++) |
302 | { |
303 | Datum datum; |
304 | |
305 | datum = index_getattr(tuple, i + 1, giststate->leafTupdesc, &isnull[i]); |
306 | gistdentryinit(giststate, i, &attdata[i], |
307 | datum, r, p, o, |
308 | false, isnull[i]); |
309 | } |
310 | } |
311 | |
312 | /* |
313 | * Forms union of oldtup and addtup, if union == oldtup then return NULL |
314 | */ |
315 | IndexTuple |
316 | gistgetadjusted(Relation r, IndexTuple oldtup, IndexTuple addtup, GISTSTATE *giststate) |
317 | { |
318 | bool neednew = false; |
319 | GISTENTRY oldentries[INDEX_MAX_KEYS], |
320 | addentries[INDEX_MAX_KEYS]; |
321 | bool oldisnull[INDEX_MAX_KEYS], |
322 | addisnull[INDEX_MAX_KEYS]; |
323 | Datum attr[INDEX_MAX_KEYS]; |
324 | bool isnull[INDEX_MAX_KEYS]; |
325 | IndexTuple newtup = NULL; |
326 | int i; |
327 | |
328 | gistDeCompressAtt(giststate, r, oldtup, NULL, |
329 | (OffsetNumber) 0, oldentries, oldisnull); |
330 | |
331 | gistDeCompressAtt(giststate, r, addtup, NULL, |
332 | (OffsetNumber) 0, addentries, addisnull); |
333 | |
334 | for (i = 0; i < IndexRelationGetNumberOfKeyAttributes(r); i++) |
335 | { |
336 | gistMakeUnionKey(giststate, i, |
337 | oldentries + i, oldisnull[i], |
338 | addentries + i, addisnull[i], |
339 | attr + i, isnull + i); |
340 | |
341 | if (neednew) |
342 | /* we already need new key, so we can skip check */ |
343 | continue; |
344 | |
345 | if (isnull[i]) |
346 | /* union of key may be NULL if and only if both keys are NULL */ |
347 | continue; |
348 | |
349 | if (!addisnull[i]) |
350 | { |
351 | if (oldisnull[i] || |
352 | !gistKeyIsEQ(giststate, i, oldentries[i].key, attr[i])) |
353 | neednew = true; |
354 | } |
355 | } |
356 | |
357 | if (neednew) |
358 | { |
359 | /* need to update key */ |
360 | newtup = gistFormTuple(giststate, r, attr, isnull, false); |
361 | newtup->t_tid = oldtup->t_tid; |
362 | } |
363 | |
364 | return newtup; |
365 | } |
366 | |
367 | /* |
368 | * Search an upper index page for the entry with lowest penalty for insertion |
369 | * of the new index key contained in "it". |
370 | * |
371 | * Returns the index of the page entry to insert into. |
372 | */ |
373 | OffsetNumber |
374 | gistchoose(Relation r, Page p, IndexTuple it, /* it has compressed entry */ |
375 | GISTSTATE *giststate) |
376 | { |
377 | OffsetNumber result; |
378 | OffsetNumber maxoff; |
379 | OffsetNumber i; |
380 | float best_penalty[INDEX_MAX_KEYS]; |
381 | GISTENTRY entry, |
382 | identry[INDEX_MAX_KEYS]; |
383 | bool isnull[INDEX_MAX_KEYS]; |
384 | int keep_current_best; |
385 | |
386 | Assert(!GistPageIsLeaf(p)); |
387 | |
388 | gistDeCompressAtt(giststate, r, |
389 | it, NULL, (OffsetNumber) 0, |
390 | identry, isnull); |
391 | |
392 | /* we'll return FirstOffsetNumber if page is empty (shouldn't happen) */ |
393 | result = FirstOffsetNumber; |
394 | |
395 | /* |
396 | * The index may have multiple columns, and there's a penalty value for |
397 | * each column. The penalty associated with a column that appears earlier |
398 | * in the index definition is strictly more important than the penalty of |
399 | * a column that appears later in the index definition. |
400 | * |
401 | * best_penalty[j] is the best penalty we have seen so far for column j, |
402 | * or -1 when we haven't yet examined column j. Array entries to the |
403 | * right of the first -1 are undefined. |
404 | */ |
405 | best_penalty[0] = -1; |
406 | |
407 | /* |
408 | * If we find a tuple that's exactly as good as the currently best one, we |
409 | * could use either one. When inserting a lot of tuples with the same or |
410 | * similar keys, it's preferable to descend down the same path when |
411 | * possible, as that's more cache-friendly. On the other hand, if all |
412 | * inserts land on the same leaf page after a split, we're never going to |
413 | * insert anything to the other half of the split, and will end up using |
414 | * only 50% of the available space. Distributing the inserts evenly would |
415 | * lead to better space usage, but that hurts cache-locality during |
416 | * insertion. To get the best of both worlds, when we find a tuple that's |
417 | * exactly as good as the previous best, choose randomly whether to stick |
418 | * to the old best, or use the new one. Once we decide to stick to the |
419 | * old best, we keep sticking to it for any subsequent equally good tuples |
420 | * we might find. This favors tuples with low offsets, but still allows |
421 | * some inserts to go to other equally-good subtrees. |
422 | * |
423 | * keep_current_best is -1 if we haven't yet had to make a random choice |
424 | * whether to keep the current best tuple. If we have done so, and |
425 | * decided to keep it, keep_current_best is 1; if we've decided to |
426 | * replace, keep_current_best is 0. (This state will be reset to -1 as |
427 | * soon as we've made the replacement, but sometimes we make the choice in |
428 | * advance of actually finding a replacement best tuple.) |
429 | */ |
430 | keep_current_best = -1; |
431 | |
432 | /* |
433 | * Loop over tuples on page. |
434 | */ |
435 | maxoff = PageGetMaxOffsetNumber(p); |
436 | Assert(maxoff >= FirstOffsetNumber); |
437 | |
438 | for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) |
439 | { |
440 | IndexTuple itup = (IndexTuple) PageGetItem(p, PageGetItemId(p, i)); |
441 | bool zero_penalty; |
442 | int j; |
443 | |
444 | zero_penalty = true; |
445 | |
446 | /* Loop over index attributes. */ |
447 | for (j = 0; j < IndexRelationGetNumberOfKeyAttributes(r); j++) |
448 | { |
449 | Datum datum; |
450 | float usize; |
451 | bool IsNull; |
452 | |
453 | /* Compute penalty for this column. */ |
454 | datum = index_getattr(itup, j + 1, giststate->leafTupdesc, |
455 | &IsNull); |
456 | gistdentryinit(giststate, j, &entry, datum, r, p, i, |
457 | false, IsNull); |
458 | usize = gistpenalty(giststate, j, &entry, IsNull, |
459 | &identry[j], isnull[j]); |
460 | if (usize > 0) |
461 | zero_penalty = false; |
462 | |
463 | if (best_penalty[j] < 0 || usize < best_penalty[j]) |
464 | { |
465 | /* |
466 | * New best penalty for column. Tentatively select this tuple |
467 | * as the target, and record the best penalty. Then reset the |
468 | * next column's penalty to "unknown" (and indirectly, the |
469 | * same for all the ones to its right). This will force us to |
470 | * adopt this tuple's penalty values as the best for all the |
471 | * remaining columns during subsequent loop iterations. |
472 | */ |
473 | result = i; |
474 | best_penalty[j] = usize; |
475 | |
476 | if (j < IndexRelationGetNumberOfKeyAttributes(r) - 1) |
477 | best_penalty[j + 1] = -1; |
478 | |
479 | /* we have new best, so reset keep-it decision */ |
480 | keep_current_best = -1; |
481 | } |
482 | else if (best_penalty[j] == usize) |
483 | { |
484 | /* |
485 | * The current tuple is exactly as good for this column as the |
486 | * best tuple seen so far. The next iteration of this loop |
487 | * will compare the next column. |
488 | */ |
489 | } |
490 | else |
491 | { |
492 | /* |
493 | * The current tuple is worse for this column than the best |
494 | * tuple seen so far. Skip the remaining columns and move on |
495 | * to the next tuple, if any. |
496 | */ |
497 | zero_penalty = false; /* so outer loop won't exit */ |
498 | break; |
499 | } |
500 | } |
501 | |
502 | /* |
503 | * If we looped past the last column, and did not update "result", |
504 | * then this tuple is exactly as good as the prior best tuple. |
505 | */ |
506 | if (j == IndexRelationGetNumberOfKeyAttributes(r) && result != i) |
507 | { |
508 | if (keep_current_best == -1) |
509 | { |
510 | /* we didn't make the random choice yet for this old best */ |
511 | keep_current_best = (random() <= (MAX_RANDOM_VALUE / 2)) ? 1 : 0; |
512 | } |
513 | if (keep_current_best == 0) |
514 | { |
515 | /* we choose to use the new tuple */ |
516 | result = i; |
517 | /* choose again if there are even more exactly-as-good ones */ |
518 | keep_current_best = -1; |
519 | } |
520 | } |
521 | |
522 | /* |
523 | * If we find a tuple with zero penalty for all columns, and we've |
524 | * decided we don't want to search for another tuple with equal |
525 | * penalty, there's no need to examine remaining tuples; just break |
526 | * out of the loop and return it. |
527 | */ |
528 | if (zero_penalty) |
529 | { |
530 | if (keep_current_best == -1) |
531 | { |
532 | /* we didn't make the random choice yet for this old best */ |
533 | keep_current_best = (random() <= (MAX_RANDOM_VALUE / 2)) ? 1 : 0; |
534 | } |
535 | if (keep_current_best == 1) |
536 | break; |
537 | } |
538 | } |
539 | |
540 | return result; |
541 | } |
542 | |
543 | /* |
544 | * initialize a GiST entry with a decompressed version of key |
545 | */ |
546 | void |
547 | gistdentryinit(GISTSTATE *giststate, int nkey, GISTENTRY *e, |
548 | Datum k, Relation r, Page pg, OffsetNumber o, |
549 | bool l, bool isNull) |
550 | { |
551 | if (!isNull) |
552 | { |
553 | GISTENTRY *dep; |
554 | |
555 | gistentryinit(*e, k, r, pg, o, l); |
556 | |
557 | /* there may not be a decompress function in opclass */ |
558 | if (!OidIsValid(giststate->decompressFn[nkey].fn_oid)) |
559 | return; |
560 | |
561 | dep = (GISTENTRY *) |
562 | DatumGetPointer(FunctionCall1Coll(&giststate->decompressFn[nkey], |
563 | giststate->supportCollation[nkey], |
564 | PointerGetDatum(e))); |
565 | /* decompressFn may just return the given pointer */ |
566 | if (dep != e) |
567 | gistentryinit(*e, dep->key, dep->rel, dep->page, dep->offset, |
568 | dep->leafkey); |
569 | } |
570 | else |
571 | gistentryinit(*e, (Datum) 0, r, pg, o, l); |
572 | } |
573 | |
/*
 * Form an index tuple from per-column datums, compressing the key columns
 * via the opclass "compress" functions where provided.
 *
 * "isleaf" selects the leaf vs. non-leaf tuple descriptor and is also
 * passed to the compress functions as the leafkey flag.  For leaf tuples,
 * INCLUDE columns (beyond the key columns) are stored as-is.
 */
IndexTuple
gistFormTuple(GISTSTATE *giststate, Relation r,
			  Datum attdata[], bool isnull[], bool isleaf)
{
	Datum		compatt[INDEX_MAX_KEYS];
	int			i;
	IndexTuple	res;

	/*
	 * Call the compress method on each attribute.
	 */
	for (i = 0; i < IndexRelationGetNumberOfKeyAttributes(r); i++)
	{
		if (isnull[i])
			compatt[i] = (Datum) 0;
		else
		{
			GISTENTRY	centry;
			GISTENTRY  *cep;

			gistentryinit(centry, attdata[i], r, NULL, (OffsetNumber) 0,
						  isleaf);
			/* there may not be a compress function in opclass */
			if (OidIsValid(giststate->compressFn[i].fn_oid))
				cep = (GISTENTRY *)
					DatumGetPointer(FunctionCall1Coll(&giststate->compressFn[i],
													  giststate->supportCollation[i],
													  PointerGetDatum(&centry)));
			else
				cep = &centry;
			compatt[i] = cep->key;
		}
	}

	if (isleaf)
	{
		/*
		 * Emplace each included attribute if any.
		 */
		for (; i < r->rd_att->natts; i++)
		{
			if (isnull[i])
				compatt[i] = (Datum) 0;
			else
				compatt[i] = attdata[i];
		}
	}

	res = index_form_tuple(isleaf ? giststate->leafTupdesc :
						   giststate->nonLeafTupdesc,
						   compatt, isnull);

	/*
	 * The offset number on tuples on internal pages is unused. For historical
	 * reasons, it is set to 0xffff.
	 */
	ItemPointerSetOffsetNumber(&(res->t_tid), 0xffff);
	return res;
}
633 | |
634 | /* |
635 | * initialize a GiST entry with fetched value in key field |
636 | */ |
637 | static Datum |
638 | gistFetchAtt(GISTSTATE *giststate, int nkey, Datum k, Relation r) |
639 | { |
640 | GISTENTRY fentry; |
641 | GISTENTRY *fep; |
642 | |
643 | gistentryinit(fentry, k, r, NULL, (OffsetNumber) 0, false); |
644 | |
645 | fep = (GISTENTRY *) |
646 | DatumGetPointer(FunctionCall1Coll(&giststate->fetchFn[nkey], |
647 | giststate->supportCollation[nkey], |
648 | PointerGetDatum(&fentry))); |
649 | |
650 | /* fetchFn set 'key', return it to the caller */ |
651 | return fep->key; |
652 | } |
653 | |
654 | /* |
655 | * Fetch all keys in tuple. |
656 | * Returns a new HeapTuple containing the originally-indexed data. |
657 | */ |
658 | HeapTuple |
659 | gistFetchTuple(GISTSTATE *giststate, Relation r, IndexTuple tuple) |
660 | { |
661 | MemoryContext oldcxt = MemoryContextSwitchTo(giststate->tempCxt); |
662 | Datum fetchatt[INDEX_MAX_KEYS]; |
663 | bool isnull[INDEX_MAX_KEYS]; |
664 | int i; |
665 | |
666 | for (i = 0; i < IndexRelationGetNumberOfKeyAttributes(r); i++) |
667 | { |
668 | Datum datum; |
669 | |
670 | datum = index_getattr(tuple, i + 1, giststate->leafTupdesc, &isnull[i]); |
671 | |
672 | if (giststate->fetchFn[i].fn_oid != InvalidOid) |
673 | { |
674 | if (!isnull[i]) |
675 | fetchatt[i] = gistFetchAtt(giststate, i, datum, r); |
676 | else |
677 | fetchatt[i] = (Datum) 0; |
678 | } |
679 | else if (giststate->compressFn[i].fn_oid == InvalidOid) |
680 | { |
681 | /* |
682 | * If opclass does not provide compress method that could change |
683 | * original value, att is necessarily stored in original form. |
684 | */ |
685 | if (!isnull[i]) |
686 | fetchatt[i] = datum; |
687 | else |
688 | fetchatt[i] = (Datum) 0; |
689 | } |
690 | else |
691 | { |
692 | /* |
693 | * Index-only scans not supported for this column. Since the |
694 | * planner chose an index-only scan anyway, it is not interested |
695 | * in this column, and we can replace it with a NULL. |
696 | */ |
697 | isnull[i] = true; |
698 | fetchatt[i] = (Datum) 0; |
699 | } |
700 | } |
701 | |
702 | /* |
703 | * Get each included attribute. |
704 | */ |
705 | for (; i < r->rd_att->natts; i++) |
706 | { |
707 | fetchatt[i] = index_getattr(tuple, i + 1, giststate->leafTupdesc, |
708 | &isnull[i]); |
709 | } |
710 | MemoryContextSwitchTo(oldcxt); |
711 | |
712 | return heap_form_tuple(giststate->fetchTupdesc, fetchatt, isnull); |
713 | } |
714 | |
715 | float |
716 | gistpenalty(GISTSTATE *giststate, int attno, |
717 | GISTENTRY *orig, bool isNullOrig, |
718 | GISTENTRY *add, bool isNullAdd) |
719 | { |
720 | float penalty = 0.0; |
721 | |
722 | if (giststate->penaltyFn[attno].fn_strict == false || |
723 | (isNullOrig == false && isNullAdd == false)) |
724 | { |
725 | FunctionCall3Coll(&giststate->penaltyFn[attno], |
726 | giststate->supportCollation[attno], |
727 | PointerGetDatum(orig), |
728 | PointerGetDatum(add), |
729 | PointerGetDatum(&penalty)); |
730 | /* disallow negative or NaN penalty */ |
731 | if (isnan(penalty) || penalty < 0.0) |
732 | penalty = 0.0; |
733 | } |
734 | else if (isNullOrig && isNullAdd) |
735 | penalty = 0.0; |
736 | else |
737 | { |
738 | /* try to prevent mixing null and non-null values */ |
739 | penalty = get_float4_infinity(); |
740 | } |
741 | |
742 | return penalty; |
743 | } |
744 | |
745 | /* |
746 | * Initialize a new index page |
747 | */ |
748 | void |
749 | GISTInitBuffer(Buffer b, uint32 f) |
750 | { |
751 | GISTPageOpaque opaque; |
752 | Page page; |
753 | Size pageSize; |
754 | |
755 | pageSize = BufferGetPageSize(b); |
756 | page = BufferGetPage(b); |
757 | PageInit(page, pageSize, sizeof(GISTPageOpaqueData)); |
758 | |
759 | opaque = GistPageGetOpaque(page); |
760 | /* page was already zeroed by PageInit, so this is not needed: */ |
761 | /* memset(&(opaque->nsn), 0, sizeof(GistNSN)); */ |
762 | opaque->rightlink = InvalidBlockNumber; |
763 | opaque->flags = f; |
764 | opaque->gist_page_id = GIST_PAGE_ID; |
765 | } |
766 | |
767 | /* |
768 | * Verify that a freshly-read page looks sane. |
769 | */ |
770 | void |
771 | gistcheckpage(Relation rel, Buffer buf) |
772 | { |
773 | Page page = BufferGetPage(buf); |
774 | |
775 | /* |
776 | * ReadBuffer verifies that every newly-read page passes |
777 | * PageHeaderIsValid, which means it either contains a reasonably sane |
778 | * page header or is all-zero. We have to defend against the all-zero |
779 | * case, however. |
780 | */ |
781 | if (PageIsNew(page)) |
782 | ereport(ERROR, |
783 | (errcode(ERRCODE_INDEX_CORRUPTED), |
784 | errmsg("index \"%s\" contains unexpected zero page at block %u" , |
785 | RelationGetRelationName(rel), |
786 | BufferGetBlockNumber(buf)), |
787 | errhint("Please REINDEX it." ))); |
788 | |
789 | /* |
790 | * Additionally check that the special area looks sane. |
791 | */ |
792 | if (PageGetSpecialSize(page) != MAXALIGN(sizeof(GISTPageOpaqueData))) |
793 | ereport(ERROR, |
794 | (errcode(ERRCODE_INDEX_CORRUPTED), |
795 | errmsg("index \"%s\" contains corrupted page at block %u" , |
796 | RelationGetRelationName(rel), |
797 | BufferGetBlockNumber(buf)), |
798 | errhint("Please REINDEX it." ))); |
799 | } |
800 | |
801 | |
802 | /* |
803 | * Allocate a new page (either by recycling, or by extending the index file) |
804 | * |
805 | * The returned buffer is already pinned and exclusive-locked |
806 | * |
807 | * Caller is responsible for initializing the page by calling GISTInitBuffer |
808 | */ |
809 | Buffer |
810 | gistNewBuffer(Relation r) |
811 | { |
812 | Buffer buffer; |
813 | bool needLock; |
814 | |
815 | /* First, try to get a page from FSM */ |
816 | for (;;) |
817 | { |
818 | BlockNumber blkno = GetFreeIndexPage(r); |
819 | |
820 | if (blkno == InvalidBlockNumber) |
821 | break; /* nothing left in FSM */ |
822 | |
823 | buffer = ReadBuffer(r, blkno); |
824 | |
825 | /* |
826 | * We have to guard against the possibility that someone else already |
827 | * recycled this page; the buffer may be locked if so. |
828 | */ |
829 | if (ConditionalLockBuffer(buffer)) |
830 | { |
831 | Page page = BufferGetPage(buffer); |
832 | |
833 | /* |
834 | * If the page was never initialized, it's OK to use. |
835 | */ |
836 | if (PageIsNew(page)) |
837 | return buffer; |
838 | |
839 | gistcheckpage(r, buffer); |
840 | |
841 | /* |
842 | * Otherwise, recycle it if deleted, and too old to have any |
843 | * processes interested in it. |
844 | */ |
845 | if (gistPageRecyclable(page)) |
846 | { |
847 | /* |
848 | * If we are generating WAL for Hot Standby then create a WAL |
849 | * record that will allow us to conflict with queries running |
850 | * on standby, in case they have snapshots older than the |
851 | * page's deleteXid. |
852 | */ |
853 | if (XLogStandbyInfoActive() && RelationNeedsWAL(r)) |
854 | gistXLogPageReuse(r, blkno, GistPageGetDeleteXid(page)); |
855 | |
856 | return buffer; |
857 | } |
858 | |
859 | LockBuffer(buffer, GIST_UNLOCK); |
860 | } |
861 | |
862 | /* Can't use it, so release buffer and try again */ |
863 | ReleaseBuffer(buffer); |
864 | } |
865 | |
866 | /* Must extend the file */ |
867 | needLock = !RELATION_IS_LOCAL(r); |
868 | |
869 | if (needLock) |
870 | LockRelationForExtension(r, ExclusiveLock); |
871 | |
872 | buffer = ReadBuffer(r, P_NEW); |
873 | LockBuffer(buffer, GIST_EXCLUSIVE); |
874 | |
875 | if (needLock) |
876 | UnlockRelationForExtension(r, ExclusiveLock); |
877 | |
878 | return buffer; |
879 | } |
880 | |
881 | /* Can this page be recycled yet? */ |
882 | bool |
883 | (Page page) |
884 | { |
885 | if (PageIsNew(page)) |
886 | return true; |
887 | if (GistPageIsDeleted(page)) |
888 | { |
889 | /* |
890 | * The page was deleted, but when? If it was just deleted, a scan |
891 | * might have seen the downlink to it, and will read the page later. |
892 | * As long as that can happen, we must keep the deleted page around as |
893 | * a tombstone. |
894 | * |
895 | * Compare the deletion XID with RecentGlobalXmin. If deleteXid < |
896 | * RecentGlobalXmin, then no scan that's still in progress could have |
897 | * seen its downlink, and we can recycle it. |
898 | */ |
899 | FullTransactionId deletexid_full = GistPageGetDeleteXid(page); |
900 | FullTransactionId recentxmin_full = GetFullRecentGlobalXmin(); |
901 | |
902 | if (FullTransactionIdPrecedes(deletexid_full, recentxmin_full)) |
903 | return true; |
904 | } |
905 | return false; |
906 | } |
907 | |
/*
 * Parse reloptions for a GiST index, returning a filled GiSTOptions struct
 * (or NULL when no options are set).  "validate" makes unknown options an
 * error rather than being ignored.
 */
bytea *
gistoptions(Datum reloptions, bool validate)
{
	relopt_value *options;
	GiSTOptions *rdopts;
	int			numoptions;
	/* mapping from option names to GiSTOptions fields */
	static const relopt_parse_elt tab[] = {
		{"fillfactor" , RELOPT_TYPE_INT, offsetof(GiSTOptions, fillfactor)},
		{"buffering" , RELOPT_TYPE_STRING, offsetof(GiSTOptions, bufferingModeOffset)}
	};

	options = parseRelOptions(reloptions, validate, RELOPT_KIND_GIST,
							  &numoptions);

	/* if none set, we're done */
	if (numoptions == 0)
		return NULL;

	rdopts = allocateReloptStruct(sizeof(GiSTOptions), options, numoptions);

	fillRelOptions((void *) rdopts, sizeof(GiSTOptions), options, numoptions,
				   validate, tab, lengthof(tab));

	pfree(options);

	return (bytea *) rdopts;
}
935 | |
936 | /* |
937 | * gistproperty() -- Check boolean properties of indexes. |
938 | * |
939 | * This is optional for most AMs, but is required for GiST because the core |
940 | * property code doesn't support AMPROP_DISTANCE_ORDERABLE. We also handle |
941 | * AMPROP_RETURNABLE here to save opening the rel to call gistcanreturn. |
942 | */ |
943 | bool |
944 | gistproperty(Oid index_oid, int attno, |
945 | IndexAMProperty prop, const char *propname, |
946 | bool *res, bool *isnull) |
947 | { |
948 | Oid opclass, |
949 | opfamily, |
950 | opcintype; |
951 | int16 procno; |
952 | |
953 | /* Only answer column-level inquiries */ |
954 | if (attno == 0) |
955 | return false; |
956 | |
957 | /* |
958 | * Currently, GiST distance-ordered scans require that there be a distance |
959 | * function in the opclass with the default types (i.e. the one loaded |
960 | * into the relcache entry, see initGISTstate). So we assume that if such |
961 | * a function exists, then there's a reason for it (rather than grubbing |
962 | * through all the opfamily's operators to find an ordered one). |
963 | * |
964 | * Essentially the same code can test whether we support returning the |
965 | * column data, since that's true if the opclass provides a fetch proc. |
966 | */ |
967 | |
968 | switch (prop) |
969 | { |
970 | case AMPROP_DISTANCE_ORDERABLE: |
971 | procno = GIST_DISTANCE_PROC; |
972 | break; |
973 | case AMPROP_RETURNABLE: |
974 | procno = GIST_FETCH_PROC; |
975 | break; |
976 | default: |
977 | return false; |
978 | } |
979 | |
980 | /* First we need to know the column's opclass. */ |
981 | opclass = get_index_column_opclass(index_oid, attno); |
982 | if (!OidIsValid(opclass)) |
983 | { |
984 | *isnull = true; |
985 | return true; |
986 | } |
987 | |
988 | /* Now look up the opclass family and input datatype. */ |
989 | if (!get_opclass_opfamily_and_input_type(opclass, &opfamily, &opcintype)) |
990 | { |
991 | *isnull = true; |
992 | return true; |
993 | } |
994 | |
995 | /* And now we can check whether the function is provided. */ |
996 | |
997 | *res = SearchSysCacheExists4(AMPROCNUM, |
998 | ObjectIdGetDatum(opfamily), |
999 | ObjectIdGetDatum(opcintype), |
1000 | ObjectIdGetDatum(opcintype), |
1001 | Int16GetDatum(procno)); |
1002 | |
1003 | /* |
1004 | * Special case: even without a fetch function, AMPROP_RETURNABLE is true |
1005 | * if the opclass has no compress function. |
1006 | */ |
1007 | if (prop == AMPROP_RETURNABLE && !*res) |
1008 | { |
1009 | *res = !SearchSysCacheExists4(AMPROCNUM, |
1010 | ObjectIdGetDatum(opfamily), |
1011 | ObjectIdGetDatum(opcintype), |
1012 | ObjectIdGetDatum(opcintype), |
1013 | Int16GetDatum(GIST_COMPRESS_PROC)); |
1014 | } |
1015 | |
1016 | *isnull = false; |
1017 | |
1018 | return true; |
1019 | } |
1020 | |
1021 | /* |
1022 | * Temporary and unlogged GiST indexes are not WAL-logged, but we need LSNs |
1023 | * to detect concurrent page splits anyway. This function provides a fake |
1024 | * sequence of LSNs for that purpose. |
1025 | */ |
1026 | XLogRecPtr |
1027 | gistGetFakeLSN(Relation rel) |
1028 | { |
1029 | static XLogRecPtr counter = FirstNormalUnloggedLSN; |
1030 | |
1031 | if (rel->rd_rel->relpersistence == RELPERSISTENCE_TEMP) |
1032 | { |
1033 | /* |
1034 | * Temporary relations are only accessible in our session, so a simple |
1035 | * backend-local counter will do. |
1036 | */ |
1037 | return counter++; |
1038 | } |
1039 | else |
1040 | { |
1041 | /* |
1042 | * Unlogged relations are accessible from other backends, and survive |
1043 | * (clean) restarts. GetFakeLSNForUnloggedRel() handles that for us. |
1044 | */ |
1045 | Assert(rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED); |
1046 | return GetFakeLSNForUnloggedRel(); |
1047 | } |
1048 | } |
1049 | |