| 1 | /*------------------------------------------------------------------------- |
| 2 | * |
| 3 | * ginget.c |
| 4 | * fetch tuples from a GIN scan. |
| 5 | * |
| 6 | * |
| 7 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
| 8 | * Portions Copyright (c) 1994, Regents of the University of California |
| 9 | * |
| 10 | * IDENTIFICATION |
| 11 | * src/backend/access/gin/ginget.c |
| 12 | *------------------------------------------------------------------------- |
| 13 | */ |
| 14 | |
| 15 | #include "postgres.h" |
| 16 | |
| 17 | #include "access/gin_private.h" |
| 18 | #include "access/relscan.h" |
| 19 | #include "miscadmin.h" |
| 20 | #include "storage/predicate.h" |
| 21 | #include "utils/datum.h" |
| 22 | #include "utils/memutils.h" |
| 23 | #include "utils/rel.h" |
| 24 | |
| 25 | /* GUC parameter */ |
| 26 | int GinFuzzySearchLimit = 0; |
| 27 | |
/*
 * Cursor state used while scanning the pending-insert list.
 * NOTE(review): field semantics below are inferred from names; the code
 * that manipulates this struct is outside this chunk — confirm against the
 * pending-list scan routines.
 */
typedef struct pendingPosition
{
	Buffer		pendingBuffer;	/* currently pinned pending-list buffer */
	OffsetNumber firstOffset;	/* presumably first offset of current heap row */
	OffsetNumber lastOffset;	/* presumably last offset of current heap row */
	ItemPointerData item;		/* heap item pointer currently being examined */
	bool	   *hasMatchKey;	/* per-scan-key match flags (allocated by caller) */
} pendingPosition;
| 36 | |
| 37 | |
| 38 | /* |
| 39 | * Goes to the next page if current offset is outside of bounds |
| 40 | */ |
| 41 | static bool |
| 42 | moveRightIfItNeeded(GinBtreeData *btree, GinBtreeStack *stack, Snapshot snapshot) |
| 43 | { |
| 44 | Page page = BufferGetPage(stack->buffer); |
| 45 | |
| 46 | if (stack->off > PageGetMaxOffsetNumber(page)) |
| 47 | { |
| 48 | /* |
| 49 | * We scanned the whole page, so we should take right page |
| 50 | */ |
| 51 | if (GinPageRightMost(page)) |
| 52 | return false; /* no more pages */ |
| 53 | |
| 54 | stack->buffer = ginStepRight(stack->buffer, btree->index, GIN_SHARE); |
| 55 | stack->blkno = BufferGetBlockNumber(stack->buffer); |
| 56 | stack->off = FirstOffsetNumber; |
| 57 | PredicateLockPage(btree->index, stack->blkno, snapshot); |
| 58 | } |
| 59 | |
| 60 | return true; |
| 61 | } |
| 62 | |
| 63 | /* |
| 64 | * Scan all pages of a posting tree and save all its heap ItemPointers |
| 65 | * in scanEntry->matchBitmap |
| 66 | */ |
| 67 | static void |
| 68 | scanPostingTree(Relation index, GinScanEntry scanEntry, |
| 69 | BlockNumber rootPostingTree, Snapshot snapshot) |
| 70 | { |
| 71 | GinBtreeData btree; |
| 72 | GinBtreeStack *stack; |
| 73 | Buffer buffer; |
| 74 | Page page; |
| 75 | |
| 76 | /* Descend to the leftmost leaf page */ |
| 77 | stack = ginScanBeginPostingTree(&btree, index, rootPostingTree, snapshot); |
| 78 | buffer = stack->buffer; |
| 79 | |
| 80 | IncrBufferRefCount(buffer); /* prevent unpin in freeGinBtreeStack */ |
| 81 | |
| 82 | freeGinBtreeStack(stack); |
| 83 | |
| 84 | /* |
| 85 | * Loop iterates through all leaf pages of posting tree |
| 86 | */ |
| 87 | for (;;) |
| 88 | { |
| 89 | page = BufferGetPage(buffer); |
| 90 | if ((GinPageGetOpaque(page)->flags & GIN_DELETED) == 0) |
| 91 | { |
| 92 | int n = GinDataLeafPageGetItemsToTbm(page, scanEntry->matchBitmap); |
| 93 | |
| 94 | scanEntry->predictNumberResult += n; |
| 95 | } |
| 96 | |
| 97 | if (GinPageRightMost(page)) |
| 98 | break; /* no more pages */ |
| 99 | |
| 100 | buffer = ginStepRight(buffer, index, GIN_SHARE); |
| 101 | } |
| 102 | |
| 103 | UnlockReleaseBuffer(buffer); |
| 104 | } |
| 105 | |
| 106 | /* |
| 107 | * Collects TIDs into scanEntry->matchBitmap for all heap tuples that |
| 108 | * match the search entry. This supports three different match modes: |
| 109 | * |
| 110 | * 1. Partial-match support: scan from current point until the |
| 111 | * comparePartialFn says we're done. |
| 112 | * 2. SEARCH_MODE_ALL: scan from current point (which should be first |
| 113 | * key for the current attnum) until we hit null items or end of attnum |
| 114 | * 3. SEARCH_MODE_EVERYTHING: scan from current point (which should be first |
| 115 | * key for the current attnum) until we hit end of attnum |
| 116 | * |
| 117 | * Returns true if done, false if it's necessary to restart scan from scratch |
| 118 | */ |
| 119 | static bool |
| 120 | collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack, |
| 121 | GinScanEntry scanEntry, Snapshot snapshot) |
| 122 | { |
| 123 | OffsetNumber attnum; |
| 124 | Form_pg_attribute attr; |
| 125 | |
| 126 | /* Initialize empty bitmap result */ |
| 127 | scanEntry->matchBitmap = tbm_create(work_mem * 1024L, NULL); |
| 128 | |
| 129 | /* Null query cannot partial-match anything */ |
| 130 | if (scanEntry->isPartialMatch && |
| 131 | scanEntry->queryCategory != GIN_CAT_NORM_KEY) |
| 132 | return true; |
| 133 | |
| 134 | /* Locate tupdesc entry for key column (for attbyval/attlen data) */ |
| 135 | attnum = scanEntry->attnum; |
| 136 | attr = TupleDescAttr(btree->ginstate->origTupdesc, attnum - 1); |
| 137 | |
| 138 | /* |
| 139 | * Predicate lock entry leaf page, following pages will be locked by |
| 140 | * moveRightIfItNeeded() |
| 141 | */ |
| 142 | PredicateLockPage(btree->index, stack->buffer, snapshot); |
| 143 | |
| 144 | for (;;) |
| 145 | { |
| 146 | Page page; |
| 147 | IndexTuple itup; |
| 148 | Datum idatum; |
| 149 | GinNullCategory icategory; |
| 150 | |
| 151 | /* |
| 152 | * stack->off points to the interested entry, buffer is already locked |
| 153 | */ |
| 154 | if (moveRightIfItNeeded(btree, stack, snapshot) == false) |
| 155 | return true; |
| 156 | |
| 157 | page = BufferGetPage(stack->buffer); |
| 158 | TestForOldSnapshot(snapshot, btree->index, page); |
| 159 | itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stack->off)); |
| 160 | |
| 161 | /* |
| 162 | * If tuple stores another attribute then stop scan |
| 163 | */ |
| 164 | if (gintuple_get_attrnum(btree->ginstate, itup) != attnum) |
| 165 | return true; |
| 166 | |
| 167 | /* Safe to fetch attribute value */ |
| 168 | idatum = gintuple_get_key(btree->ginstate, itup, &icategory); |
| 169 | |
| 170 | /* |
| 171 | * Check for appropriate scan stop conditions |
| 172 | */ |
| 173 | if (scanEntry->isPartialMatch) |
| 174 | { |
| 175 | int32 cmp; |
| 176 | |
| 177 | /* |
| 178 | * In partial match, stop scan at any null (including |
| 179 | * placeholders); partial matches never match nulls |
| 180 | */ |
| 181 | if (icategory != GIN_CAT_NORM_KEY) |
| 182 | return true; |
| 183 | |
| 184 | /*---------- |
| 185 | * Check of partial match. |
| 186 | * case cmp == 0 => match |
| 187 | * case cmp > 0 => not match and finish scan |
| 188 | * case cmp < 0 => not match and continue scan |
| 189 | *---------- |
| 190 | */ |
| 191 | cmp = DatumGetInt32(FunctionCall4Coll(&btree->ginstate->comparePartialFn[attnum - 1], |
| 192 | btree->ginstate->supportCollation[attnum - 1], |
| 193 | scanEntry->queryKey, |
| 194 | idatum, |
| 195 | UInt16GetDatum(scanEntry->strategy), |
| 196 | PointerGetDatum(scanEntry->extra_data))); |
| 197 | |
| 198 | if (cmp > 0) |
| 199 | return true; |
| 200 | else if (cmp < 0) |
| 201 | { |
| 202 | stack->off++; |
| 203 | continue; |
| 204 | } |
| 205 | } |
| 206 | else if (scanEntry->searchMode == GIN_SEARCH_MODE_ALL) |
| 207 | { |
| 208 | /* |
| 209 | * In ALL mode, we are not interested in null items, so we can |
| 210 | * stop if we get to a null-item placeholder (which will be the |
| 211 | * last entry for a given attnum). We do want to include NULL_KEY |
| 212 | * and EMPTY_ITEM entries, though. |
| 213 | */ |
| 214 | if (icategory == GIN_CAT_NULL_ITEM) |
| 215 | return true; |
| 216 | } |
| 217 | |
| 218 | /* |
| 219 | * OK, we want to return the TIDs listed in this entry. |
| 220 | */ |
| 221 | if (GinIsPostingTree(itup)) |
| 222 | { |
| 223 | BlockNumber rootPostingTree = GinGetPostingTree(itup); |
| 224 | |
| 225 | /* |
| 226 | * We should unlock current page (but not unpin) during tree scan |
| 227 | * to prevent deadlock with vacuum processes. |
| 228 | * |
| 229 | * We save current entry value (idatum) to be able to re-find our |
| 230 | * tuple after re-locking |
| 231 | */ |
| 232 | if (icategory == GIN_CAT_NORM_KEY) |
| 233 | idatum = datumCopy(idatum, attr->attbyval, attr->attlen); |
| 234 | |
| 235 | LockBuffer(stack->buffer, GIN_UNLOCK); |
| 236 | |
| 237 | /* |
| 238 | * Acquire predicate lock on the posting tree. We already hold a |
| 239 | * lock on the entry page, but insertions to the posting tree |
| 240 | * don't check for conflicts on that level. |
| 241 | */ |
| 242 | PredicateLockPage(btree->index, rootPostingTree, snapshot); |
| 243 | |
| 244 | /* Collect all the TIDs in this entry's posting tree */ |
| 245 | scanPostingTree(btree->index, scanEntry, rootPostingTree, |
| 246 | snapshot); |
| 247 | |
| 248 | /* |
| 249 | * We lock again the entry page and while it was unlocked insert |
| 250 | * might have occurred, so we need to re-find our position. |
| 251 | */ |
| 252 | LockBuffer(stack->buffer, GIN_SHARE); |
| 253 | page = BufferGetPage(stack->buffer); |
| 254 | if (!GinPageIsLeaf(page)) |
| 255 | { |
| 256 | /* |
| 257 | * Root page becomes non-leaf while we unlock it. We will |
| 258 | * start again, this situation doesn't occur often - root can |
| 259 | * became a non-leaf only once per life of index. |
| 260 | */ |
| 261 | return false; |
| 262 | } |
| 263 | |
| 264 | /* Search forward to re-find idatum */ |
| 265 | for (;;) |
| 266 | { |
| 267 | Datum newDatum; |
| 268 | GinNullCategory newCategory; |
| 269 | |
| 270 | if (moveRightIfItNeeded(btree, stack, snapshot) == false) |
| 271 | elog(ERROR, "lost saved point in index" ); /* must not happen !!! */ |
| 272 | |
| 273 | page = BufferGetPage(stack->buffer); |
| 274 | itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stack->off)); |
| 275 | |
| 276 | if (gintuple_get_attrnum(btree->ginstate, itup) != attnum) |
| 277 | elog(ERROR, "lost saved point in index" ); /* must not happen !!! */ |
| 278 | newDatum = gintuple_get_key(btree->ginstate, itup, |
| 279 | &newCategory); |
| 280 | |
| 281 | if (ginCompareEntries(btree->ginstate, attnum, |
| 282 | newDatum, newCategory, |
| 283 | idatum, icategory) == 0) |
| 284 | break; /* Found! */ |
| 285 | |
| 286 | stack->off++; |
| 287 | } |
| 288 | |
| 289 | if (icategory == GIN_CAT_NORM_KEY && !attr->attbyval) |
| 290 | pfree(DatumGetPointer(idatum)); |
| 291 | } |
| 292 | else |
| 293 | { |
| 294 | ItemPointer ipd; |
| 295 | int nipd; |
| 296 | |
| 297 | ipd = ginReadTuple(btree->ginstate, scanEntry->attnum, itup, &nipd); |
| 298 | tbm_add_tuples(scanEntry->matchBitmap, ipd, nipd, false); |
| 299 | scanEntry->predictNumberResult += GinGetNPosting(itup); |
| 300 | pfree(ipd); |
| 301 | } |
| 302 | |
| 303 | /* |
| 304 | * Done with this entry, go to the next |
| 305 | */ |
| 306 | stack->off++; |
| 307 | } |
| 308 | } |
| 309 | |
| 310 | /* |
| 311 | * Start* functions setup beginning state of searches: finds correct buffer and pins it. |
| 312 | */ |
static void
startScanEntry(GinState *ginstate, GinScanEntry entry, Snapshot snapshot)
{
	GinBtreeData btreeEntry;
	GinBtreeStack *stackEntry;
	Page		page;
	bool		needUnlock;

restartScanEntry:
	/*
	 * Reset all per-entry output state.  We may come back here via the goto
	 * below if collectMatchBitmap() reports a concurrent tree restructure.
	 */
	entry->buffer = InvalidBuffer;
	ItemPointerSetMin(&entry->curItem);
	entry->offset = InvalidOffsetNumber;
	if (entry->list)
		pfree(entry->list);
	entry->list = NULL;
	entry->nlist = 0;
	entry->matchBitmap = NULL;
	entry->matchResult = NULL;
	entry->reduceResult = false;
	entry->predictNumberResult = 0;

	/*
	 * we should find entry, and begin scan of posting tree or just store
	 * posting list in memory
	 */
	ginPrepareEntryScan(&btreeEntry, entry->attnum,
						entry->queryKey, entry->queryCategory,
						ginstate);
	stackEntry = ginFindLeafPage(&btreeEntry, true, false, snapshot);
	page = BufferGetPage(stackEntry->buffer);

	/* ginFindLeafPage() will have already checked snapshot age. */
	needUnlock = true;

	/* Assume no matches until proven otherwise below */
	entry->isFinished = true;

	if (entry->isPartialMatch ||
		entry->queryCategory == GIN_CAT_EMPTY_QUERY)
	{
		/*
		 * btreeEntry.findItem locates the first item >= given search key.
		 * (For GIN_CAT_EMPTY_QUERY, it will find the leftmost index item
		 * because of the way the GIN_CAT_EMPTY_QUERY category code is
		 * assigned.) We scan forward from there and collect all TIDs needed
		 * for the entry type.
		 */
		btreeEntry.findItem(&btreeEntry, stackEntry);
		if (collectMatchBitmap(&btreeEntry, stackEntry, entry, snapshot)
			== false)
		{
			/*
			 * GIN tree was seriously restructured, so we will cleanup all
			 * found data and rescan. See comments near 'return false' in
			 * collectMatchBitmap()
			 */
			if (entry->matchBitmap)
			{
				if (entry->matchIterator)
					tbm_end_iterate(entry->matchIterator);
				entry->matchIterator = NULL;
				tbm_free(entry->matchBitmap);
				entry->matchBitmap = NULL;
			}
			LockBuffer(stackEntry->buffer, GIN_UNLOCK);
			freeGinBtreeStack(stackEntry);
			goto restartScanEntry;
		}

		/* Start iterating only if the bitmap actually holds matches */
		if (entry->matchBitmap && !tbm_is_empty(entry->matchBitmap))
		{
			entry->matchIterator = tbm_begin_iterate(entry->matchBitmap);
			entry->isFinished = false;
		}
	}
	else if (btreeEntry.findItem(&btreeEntry, stackEntry))
	{
		/* Exact-match case: the entry tuple for the search key was found */
		IndexTuple	itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stackEntry->off));

		if (GinIsPostingTree(itup))
		{
			BlockNumber rootPostingTree = GinGetPostingTree(itup);
			GinBtreeStack *stack;
			Page		page;
			ItemPointerData minItem;

			/*
			 * This is an equality scan, so lock the root of the posting tree.
			 * It represents a lock on the exact key value, and covers all the
			 * items in the posting tree.
			 */
			PredicateLockPage(ginstate->index, rootPostingTree, snapshot);

			/*
			 * We should unlock entry page before touching posting tree to
			 * prevent deadlocks with vacuum processes. Because entry is never
			 * deleted from page and posting tree is never reduced to the
			 * posting list, we can unlock page after getting BlockNumber of
			 * root of posting tree.
			 */
			LockBuffer(stackEntry->buffer, GIN_UNLOCK);
			needUnlock = false;

			stack = ginScanBeginPostingTree(&entry->btree, ginstate->index,
											rootPostingTree, snapshot);
			entry->buffer = stack->buffer;

			/*
			 * We keep buffer pinned because we need to prevent deletion of
			 * page during scan. See GIN's vacuum implementation. RefCount is
			 * increased to keep buffer pinned after freeGinBtreeStack() call.
			 */
			IncrBufferRefCount(entry->buffer);

			page = BufferGetPage(entry->buffer);

			/*
			 * Load the first page into memory.
			 */
			ItemPointerSetMin(&minItem);
			entry->list = GinDataLeafPageGetItems(page, &entry->nlist, minItem);

			/* rough estimate: pages-in-tree times items on the first page */
			entry->predictNumberResult = stack->predictNumber * entry->nlist;

			LockBuffer(entry->buffer, GIN_UNLOCK);
			freeGinBtreeStack(stack);
			entry->isFinished = false;
		}
		else
		{
			/*
			 * Lock the entry leaf page. This is more coarse-grained than
			 * necessary, because it will conflict with any insertions that
			 * land on the same leaf page, not only the exact key we searched
			 * for. But locking an individual tuple would require updating
			 * that lock whenever it moves because of insertions or vacuums,
			 * which seems too complicated.
			 */
			PredicateLockPage(ginstate->index,
							  BufferGetBlockNumber(stackEntry->buffer),
							  snapshot);
			if (GinGetNPosting(itup) > 0)
			{
				/* copy the inline posting list into memory */
				entry->list = ginReadTuple(ginstate, entry->attnum, itup,
										   &entry->nlist);
				entry->predictNumberResult = entry->nlist;

				entry->isFinished = false;
			}
		}
	}
	else
	{
		/*
		 * No entry found. Predicate lock the leaf page, to lock the place
		 * where the entry would've been, had there been one.
		 */
		PredicateLockPage(ginstate->index,
						  BufferGetBlockNumber(stackEntry->buffer), snapshot);
	}

	if (needUnlock)
		LockBuffer(stackEntry->buffer, GIN_UNLOCK);
	freeGinBtreeStack(stackEntry);
}
| 477 | |
| 478 | /* |
| 479 | * Comparison function for scan entry indexes. Sorts by predictNumberResult, |
| 480 | * least frequent items first. |
| 481 | */ |
| 482 | static int |
| 483 | entryIndexByFrequencyCmp(const void *a1, const void *a2, void *arg) |
| 484 | { |
| 485 | const GinScanKey key = (const GinScanKey) arg; |
| 486 | int i1 = *(const int *) a1; |
| 487 | int i2 = *(const int *) a2; |
| 488 | uint32 n1 = key->scanEntry[i1]->predictNumberResult; |
| 489 | uint32 n2 = key->scanEntry[i2]->predictNumberResult; |
| 490 | |
| 491 | if (n1 < n2) |
| 492 | return -1; |
| 493 | else if (n1 == n2) |
| 494 | return 0; |
| 495 | else |
| 496 | return 1; |
| 497 | } |
| 498 | |
/*
 * Finish initializing one scan key, once the per-entry frequency estimates
 * (predictNumberResult) are available.  Splits the key's entries into the
 * "required" and "additional" sets used during the scan.
 */
static void
startScanKey(GinState *ginstate, GinScanOpaque so, GinScanKey key)
{
	MemoryContext oldCtx = CurrentMemoryContext;
	int			i;
	int			j;
	int		   *entryIndexes;

	/* reset per-key scan state */
	ItemPointerSetMin(&key->curItem);
	key->curItemMatches = false;
	key->recheckCurItem = false;
	key->isFinished = false;

	/*
	 * Divide the entries into two distinct sets: required and additional.
	 * Additional entries are not enough for a match alone, without any items
	 * from the required set, but are needed by the consistent function to
	 * decide if an item matches. When scanning, we can skip over items from
	 * additional entries that have no corresponding matches in any of the
	 * required entries. That speeds up queries like "frequent & rare"
	 * considerably, if the frequent term can be put in the additional set.
	 *
	 * There can be many legal ways to divide them entries into these two
	 * sets. A conservative division is to just put everything in the required
	 * set, but the more you can put in the additional set, the more you can
	 * skip during the scan. To maximize skipping, we try to put as many
	 * frequent items as possible into additional, and less frequent ones into
	 * required. To do that, sort the entries by frequency
	 * (predictNumberResult), and put entries into the required set in that
	 * order, until the consistent function says that none of the remaining
	 * entries can form a match, without any items from the required set. The
	 * rest go to the additional set.
	 */
	if (key->nentries > 1)
	{
		/* scratch allocations go into the short-lived temp context */
		MemoryContextSwitchTo(so->tempCtx);

		entryIndexes = (int *) palloc(sizeof(int) * key->nentries);
		for (i = 0; i < key->nentries; i++)
			entryIndexes[i] = i;
		qsort_arg(entryIndexes, key->nentries, sizeof(int),
				  entryIndexByFrequencyCmp, key);

		for (i = 0; i < key->nentries - 1; i++)
		{
			/* Pass all entries <= i as FALSE, and the rest as MAYBE */
			for (j = 0; j <= i; j++)
				key->entryRes[entryIndexes[j]] = GIN_FALSE;
			for (j = i + 1; j < key->nentries; j++)
				key->entryRes[entryIndexes[j]] = GIN_MAYBE;

			if (key->triConsistentFn(key) == GIN_FALSE)
				break;
		}
		/* i is now the last required entry. */

		/* the result arrays must outlive tempCtx, so build them in keyCtx */
		MemoryContextSwitchTo(so->keyCtx);

		key->nrequired = i + 1;
		key->nadditional = key->nentries - key->nrequired;
		key->requiredEntries = palloc(key->nrequired * sizeof(GinScanEntry));
		key->additionalEntries = palloc(key->nadditional * sizeof(GinScanEntry));

		j = 0;
		for (i = 0; i < key->nrequired; i++)
			key->requiredEntries[i] = key->scanEntry[entryIndexes[j++]];
		for (i = 0; i < key->nadditional; i++)
			key->additionalEntries[i] = key->scanEntry[entryIndexes[j++]];

		/* clean up after consistentFn calls (also frees entryIndexes) */
		MemoryContextReset(so->tempCtx);
	}
	else
	{
		/* Single-entry key: that entry is trivially required */
		MemoryContextSwitchTo(so->keyCtx);

		key->nrequired = 1;
		key->nadditional = 0;
		key->requiredEntries = palloc(1 * sizeof(GinScanEntry));
		key->requiredEntries[0] = key->scanEntry[0];
	}
	MemoryContextSwitchTo(oldCtx);
}
| 582 | |
| 583 | static void |
| 584 | startScan(IndexScanDesc scan) |
| 585 | { |
| 586 | GinScanOpaque so = (GinScanOpaque) scan->opaque; |
| 587 | GinState *ginstate = &so->ginstate; |
| 588 | uint32 i; |
| 589 | |
| 590 | for (i = 0; i < so->totalentries; i++) |
| 591 | startScanEntry(ginstate, so->entries[i], scan->xs_snapshot); |
| 592 | |
| 593 | if (GinFuzzySearchLimit > 0) |
| 594 | { |
| 595 | /* |
| 596 | * If all of keys more than threshold we will try to reduce result, we |
| 597 | * hope (and only hope, for intersection operation of array our |
| 598 | * supposition isn't true), that total result will not more than |
| 599 | * minimal predictNumberResult. |
| 600 | */ |
| 601 | bool reduce = true; |
| 602 | |
| 603 | for (i = 0; i < so->totalentries; i++) |
| 604 | { |
| 605 | if (so->entries[i]->predictNumberResult <= so->totalentries * GinFuzzySearchLimit) |
| 606 | { |
| 607 | reduce = false; |
| 608 | break; |
| 609 | } |
| 610 | } |
| 611 | if (reduce) |
| 612 | { |
| 613 | for (i = 0; i < so->totalentries; i++) |
| 614 | { |
| 615 | so->entries[i]->predictNumberResult /= so->totalentries; |
| 616 | so->entries[i]->reduceResult = true; |
| 617 | } |
| 618 | } |
| 619 | } |
| 620 | |
| 621 | /* |
| 622 | * Now that we have the estimates for the entry frequencies, finish |
| 623 | * initializing the scan keys. |
| 624 | */ |
| 625 | for (i = 0; i < so->nkeys; i++) |
| 626 | startScanKey(ginstate, so, so->keys + i); |
| 627 | } |
| 628 | |
| 629 | /* |
| 630 | * Load the next batch of item pointers from a posting tree. |
| 631 | * |
| 632 | * Note that we copy the page into GinScanEntry->list array and unlock it, but |
| 633 | * keep it pinned to prevent interference with vacuum. |
| 634 | */ |
static void
entryLoadMoreItems(GinState *ginstate, GinScanEntry entry,
				   ItemPointerData advancePast, Snapshot snapshot)
{
	Page		page;
	int			i;
	bool		stepright;

	if (!BufferIsValid(entry->buffer))
	{
		/* posting tree was already exhausted; nothing more to load */
		entry->isFinished = true;
		return;
	}

	/*
	 * We have two strategies for finding the correct page: step right from
	 * the current page, or descend the tree again from the root. If
	 * advancePast equals the current item, the next matching item should be
	 * on the next page, so we step right. Otherwise, descend from root.
	 */
	if (ginCompareItemPointers(&entry->curItem, &advancePast) == 0)
	{
		stepright = true;
		LockBuffer(entry->buffer, GIN_SHARE);
	}
	else
	{
		GinBtreeStack *stack;

		ReleaseBuffer(entry->buffer);

		/*
		 * Set the search key, and find the correct leaf page.
		 */
		if (ItemPointerIsLossyPage(&advancePast))
		{
			/* whole page was matched: start at the next block */
			ItemPointerSet(&entry->btree.itemptr,
						   GinItemPointerGetBlockNumber(&advancePast) + 1,
						   FirstOffsetNumber);
		}
		else
		{
			/* seek to the next offset after advancePast */
			ItemPointerSet(&entry->btree.itemptr,
						   GinItemPointerGetBlockNumber(&advancePast),
						   OffsetNumberNext(GinItemPointerGetOffsetNumber(&advancePast)));
		}
		entry->btree.fullScan = false;
		stack = ginFindLeafPage(&entry->btree, true, false, snapshot);

		/* we don't need the stack, just the buffer. */
		entry->buffer = stack->buffer;
		IncrBufferRefCount(entry->buffer);
		freeGinBtreeStack(stack);
		stepright = false;
	}

	elog(DEBUG2, "entryLoadMoreItems, %u/%u, skip: %d" ,
		 GinItemPointerGetBlockNumber(&advancePast),
		 GinItemPointerGetOffsetNumber(&advancePast),
		 !stepright);

	page = BufferGetPage(entry->buffer);
	for (;;)
	{
		/* discard the previous batch before loading the next */
		entry->offset = InvalidOffsetNumber;
		if (entry->list)
		{
			pfree(entry->list);
			entry->list = NULL;
			entry->nlist = 0;
		}

		if (stepright)
		{
			/*
			 * We've processed all the entries on this page. If it was the
			 * last page in the tree, we're done.
			 */
			if (GinPageRightMost(page))
			{
				UnlockReleaseBuffer(entry->buffer);
				entry->buffer = InvalidBuffer;
				entry->isFinished = true;
				return;
			}

			/*
			 * Step to next page, following the right link. then find the
			 * first ItemPointer greater than advancePast.
			 */
			entry->buffer = ginStepRight(entry->buffer,
										 ginstate->index,
										 GIN_SHARE);
			page = BufferGetPage(entry->buffer);
		}
		stepright = true;		/* all later iterations step right */

		if (GinPageGetOpaque(page)->flags & GIN_DELETED)
			continue;			/* page was deleted by concurrent vacuum */

		/*
		 * The first item > advancePast might not be on this page, but
		 * somewhere to the right, if the page was split, or a non-match from
		 * another key in the query allowed us to skip some items from this
		 * entry. Keep following the right-links until we re-find the correct
		 * page.
		 */
		if (!GinPageRightMost(page) &&
			ginCompareItemPointers(&advancePast, GinDataPageGetRightBound(page)) >= 0)
		{
			/*
			 * the item we're looking is > the right bound of the page, so it
			 * can't be on this page.
			 */
			continue;
		}

		/* Copy the page's items, then locate the first one > advancePast. */
		entry->list = GinDataLeafPageGetItems(page, &entry->nlist, advancePast);

		for (i = 0; i < entry->nlist; i++)
		{
			if (ginCompareItemPointers(&advancePast, &entry->list[i]) < 0)
			{
				entry->offset = i;

				if (GinPageRightMost(page))
				{
					/* after processing the copied items, we're done. */
					UnlockReleaseBuffer(entry->buffer);
					entry->buffer = InvalidBuffer;
				}
				else
					LockBuffer(entry->buffer, GIN_UNLOCK);
				return;
			}
		}
	}
}
| 773 | |
/* Uniform pseudo-random value in [0, 1] used for fuzzy result reduction */
#define gin_rand() (((double) random()) / ((double) MAX_RANDOM_VALUE))
/*
 * Randomly decide to drop an item from entry 'e': the keep probability is
 * GinFuzzySearchLimit / predictNumberResult, so more frequent entries are
 * thinned more aggressively.
 */
#define dropItem(e) ( gin_rand() > ((double)GinFuzzySearchLimit)/((double)((e)->predictNumberResult)) )
| 776 | |
| 777 | /* |
| 778 | * Sets entry->curItem to next heap item pointer > advancePast, for one entry |
| 779 | * of one scan key, or sets entry->isFinished to true if there are no more. |
| 780 | * |
| 781 | * Item pointers are returned in ascending order. |
| 782 | * |
| 783 | * Note: this can return a "lossy page" item pointer, indicating that the |
| 784 | * entry potentially matches all items on that heap page. However, it is |
| 785 | * not allowed to return both a lossy page pointer and exact (regular) |
| 786 | * item pointers for the same page. (Doing so would break the key-combination |
| 787 | * logic in keyGetItem and scanGetItem; see comment in scanGetItem.) In the |
| 788 | * current implementation this is guaranteed by the behavior of tidbitmaps. |
| 789 | */ |
static void
entryGetItem(GinState *ginstate, GinScanEntry entry,
			 ItemPointerData advancePast, Snapshot snapshot)
{
	Assert(!entry->isFinished);

	/* caller must not have moved advancePast backwards past curItem */
	Assert(!ItemPointerIsValid(&entry->curItem) ||
		   ginCompareItemPointers(&entry->curItem, &advancePast) <= 0);

	if (entry->matchBitmap)
	{
		/* A bitmap result */
		BlockNumber advancePastBlk = GinItemPointerGetBlockNumber(&advancePast);
		OffsetNumber advancePastOff = GinItemPointerGetOffsetNumber(&advancePast);
		bool		gotitem = false;

		do
		{
			/*
			 * If we've exhausted all items on this block, move to next block
			 * in the bitmap.
			 */
			while (entry->matchResult == NULL ||
				   (entry->matchResult->ntuples >= 0 &&
					entry->offset >= entry->matchResult->ntuples) ||
				   entry->matchResult->blockno < advancePastBlk ||
				   (ItemPointerIsLossyPage(&advancePast) &&
					entry->matchResult->blockno == advancePastBlk))
			{
				entry->matchResult = tbm_iterate(entry->matchIterator);

				if (entry->matchResult == NULL)
				{
					/* bitmap exhausted; this entry stream is finished */
					ItemPointerSetInvalid(&entry->curItem);
					tbm_end_iterate(entry->matchIterator);
					entry->matchIterator = NULL;
					entry->isFinished = true;
					break;
				}

				/*
				 * Reset counter to the beginning of entry->matchResult. Note:
				 * entry->offset is still greater than matchResult->ntuples if
				 * matchResult is lossy. So, on next call we will get next
				 * result from TIDBitmap.
				 */
				entry->offset = 0;
			}
			if (entry->isFinished)
				break;

			/*
			 * We're now on the first page after advancePast which has any
			 * items on it. If it's a lossy result, return that.
			 */
			if (entry->matchResult->ntuples < 0)
			{
				ItemPointerSetLossyPage(&entry->curItem,
										entry->matchResult->blockno);

				/*
				 * We might as well fall out of the loop; we could not
				 * estimate number of results on this page to support correct
				 * reducing of result even if it's enabled.
				 */
				gotitem = true;
				break;
			}

			/*
			 * Not a lossy page. Skip over any offsets <= advancePast, and
			 * return that.
			 */
			if (entry->matchResult->blockno == advancePastBlk)
			{
				/*
				 * First, do a quick check against the last offset on the
				 * page. If that's > advancePast, so are all the other
				 * offsets.
				 */
				if (entry->matchResult->offsets[entry->matchResult->ntuples - 1] <= advancePastOff)
				{
					/* force a move to the next bitmap block next iteration */
					entry->offset = entry->matchResult->ntuples;
					continue;
				}

				/* Otherwise scan to find the first item > advancePast */
				while (entry->matchResult->offsets[entry->offset] <= advancePastOff)
					entry->offset++;
			}

			ItemPointerSet(&entry->curItem,
						   entry->matchResult->blockno,
						   entry->matchResult->offsets[entry->offset]);
			entry->offset++;
			gotitem = true;
		} while (!gotitem || (entry->reduceResult == true && dropItem(entry)));
	}
	else if (!BufferIsValid(entry->buffer))
	{
		/*
		 * A posting list from an entry tuple, or the last page of a posting
		 * tree.
		 */
		do
		{
			if (entry->offset >= entry->nlist)
			{
				/* in-memory list exhausted; entry stream is finished */
				ItemPointerSetInvalid(&entry->curItem);
				entry->isFinished = true;
				break;
			}

			entry->curItem = entry->list[entry->offset++];
		} while (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0);
		/* XXX: shouldn't we apply the fuzzy search limit here? */
	}
	else
	{
		/* A posting tree */
		do
		{
			/* If we've processed the current batch, load more items */
			while (entry->offset >= entry->nlist)
			{
				entryLoadMoreItems(ginstate, entry, advancePast, snapshot);

				if (entry->isFinished)
				{
					ItemPointerSetInvalid(&entry->curItem);
					return;
				}
			}

			entry->curItem = entry->list[entry->offset++];

		} while (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0 ||
				 (entry->reduceResult == true && dropItem(entry)));
	}
}
| 930 | |
| 931 | /* |
| 932 | * Identify the "current" item among the input entry streams for this scan key |
| 933 | * that is greater than advancePast, and test whether it passes the scan key |
| 934 | * qual condition. |
| 935 | * |
| 936 | * The current item is the smallest curItem among the inputs. key->curItem |
| 937 | * is set to that value. key->curItemMatches is set to indicate whether that |
| 938 | * TID passes the consistentFn test. If so, key->recheckCurItem is set true |
| 939 | * iff recheck is needed for this item pointer (including the case where the |
| 940 | * item pointer is a lossy page pointer). |
| 941 | * |
| 942 | * If all entry streams are exhausted, sets key->isFinished to true. |
| 943 | * |
| 944 | * Item pointers must be returned in ascending order. |
| 945 | * |
| 946 | * Note: this can return a "lossy page" item pointer, indicating that the |
| 947 | * key potentially matches all items on that heap page. However, it is |
| 948 | * not allowed to return both a lossy page pointer and exact (regular) |
| 949 | * item pointers for the same page. (Doing so would break the key-combination |
| 950 | * logic in scanGetItem.) |
| 951 | */ |
static void
keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key,
		   ItemPointerData advancePast, Snapshot snapshot)
{
	ItemPointerData minItem;
	ItemPointerData curPageLossy;
	uint32		i;
	bool		haveLossyEntry;
	GinScanEntry entry;
	GinTernaryValue res;
	MemoryContext oldCtx;
	bool		allFinished;

	Assert(!key->isFinished);

	/*
	 * We might have already tested this item; if so, no need to repeat work.
	 * (Note: the ">" case can happen, if advancePast is exact but we
	 * previously had to set curItem to a lossy-page pointer.)
	 */
	if (ginCompareItemPointers(&key->curItem, &advancePast) > 0)
		return;

	/*
	 * Find the minimum item > advancePast among the active entry streams.
	 *
	 * Note: a lossy-page entry is encoded by a ItemPointer with max value for
	 * offset (0xffff), so that it will sort after any exact entries for the
	 * same page.  So we'll prefer to return exact pointers not lossy
	 * pointers, which is good.
	 */
	ItemPointerSetMax(&minItem);
	allFinished = true;
	for (i = 0; i < key->nrequired; i++)
	{
		entry = key->requiredEntries[i];

		if (entry->isFinished)
			continue;

		/*
		 * Advance this stream if necessary.
		 *
		 * In particular, since entry->curItem was initialized with
		 * ItemPointerSetMin, this ensures we fetch the first item for each
		 * entry on the first call.
		 */
		if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
		{
			entryGetItem(ginstate, entry, advancePast, snapshot);
			if (entry->isFinished)
				continue;
		}

		/* At least one required stream is still live */
		allFinished = false;
		if (ginCompareItemPointers(&entry->curItem, &minItem) < 0)
			minItem = entry->curItem;
	}

	if (allFinished)
	{
		/* all entries are finished */
		key->isFinished = true;
		return;
	}

	/*
	 * Ok, we now know that there are no matches < minItem.
	 *
	 * If minItem is lossy, it means that there were no exact items on the
	 * page among requiredEntries, because lossy pointers sort after exact
	 * items.  However, there might be exact items for the same page among
	 * additionalEntries, so we mustn't advance past them.  Hence we only
	 * advance advancePast to just before the lossy item's block, not past
	 * any offset within it.
	 */
	if (ItemPointerIsLossyPage(&minItem))
	{
		if (GinItemPointerGetBlockNumber(&advancePast) <
			GinItemPointerGetBlockNumber(&minItem))
		{
			ItemPointerSet(&advancePast,
						   GinItemPointerGetBlockNumber(&minItem),
						   InvalidOffsetNumber);
		}
	}
	else
	{
		/* Exact item: advance to just before it, so streams at minItem stay */
		Assert(GinItemPointerGetOffsetNumber(&minItem) > 0);
		ItemPointerSet(&advancePast,
					   GinItemPointerGetBlockNumber(&minItem),
					   OffsetNumberPrev(GinItemPointerGetOffsetNumber(&minItem)));
	}

	/*
	 * We might not have loaded all the entry streams for this TID yet. We
	 * could call the consistent function, passing MAYBE for those entries, to
	 * see if it can decide if this TID matches based on the information we
	 * have. But if the consistent-function is expensive, and cannot in fact
	 * decide with partial information, that could be a big loss. So, load all
	 * the additional entries, before calling the consistent function.
	 */
	for (i = 0; i < key->nadditional; i++)
	{
		entry = key->additionalEntries[i];

		if (entry->isFinished)
			continue;

		if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
		{
			entryGetItem(ginstate, entry, advancePast, snapshot);
			if (entry->isFinished)
				continue;
		}

		/*
		 * Normally, none of the items in additionalEntries can have a curItem
		 * larger than minItem. But if minItem is a lossy page, then there
		 * might be exact items on the same page among additionalEntries.
		 */
		if (ginCompareItemPointers(&entry->curItem, &minItem) < 0)
		{
			Assert(ItemPointerIsLossyPage(&minItem));
			minItem = entry->curItem;
		}
	}

	/*
	 * Ok, we've advanced all the entries up to minItem now. Set key->curItem,
	 * and perform consistentFn test.
	 *
	 * Lossy-page entries pose a problem, since we don't know the correct
	 * entryRes state to pass to the consistentFn, and we also don't know what
	 * its combining logic will be (could be AND, OR, or even NOT). If the
	 * logic is OR then the consistentFn might succeed for all items in the
	 * lossy page even when none of the other entries match.
	 *
	 * Our strategy is to call the tri-state consistent function, with the
	 * lossy-page entries set to MAYBE, and all the other entries FALSE. If it
	 * returns FALSE, none of the lossy items alone are enough for a match, so
	 * we don't need to return a lossy-page pointer. Otherwise, return a
	 * lossy-page pointer to indicate that the whole heap page must be
	 * checked.  (On subsequent calls, we'll do nothing until minItem is past
	 * the page altogether, thus ensuring that we never return both regular
	 * and lossy pointers for the same page.)
	 *
	 * An exception is that it doesn't matter what we pass for lossy pointers
	 * in "hidden" entries, because the consistentFn's result can't depend on
	 * them. We could pass them as MAYBE as well, but if we're using the
	 * "shim" implementation of a tri-state consistent function (see
	 * ginlogic.c), it's better to pass as few MAYBEs as possible. So pass
	 * them as true.
	 *
	 * Note that only lossy-page entries pointing to the current item's page
	 * should trigger this processing; we might have future lossy pages in the
	 * entry array, but they aren't relevant yet.
	 */
	key->curItem = minItem;
	ItemPointerSetLossyPage(&curPageLossy,
							GinItemPointerGetBlockNumber(&key->curItem));
	haveLossyEntry = false;
	for (i = 0; i < key->nentries; i++)
	{
		entry = key->scanEntry[i];
		if (entry->isFinished == false &&
			ginCompareItemPointers(&entry->curItem, &curPageLossy) == 0)
		{
			/* "hidden" entries (i >= nuserentries) can safely be passed TRUE */
			if (i < key->nuserentries)
				key->entryRes[i] = GIN_MAYBE;
			else
				key->entryRes[i] = GIN_TRUE;
			haveLossyEntry = true;
		}
		else
			key->entryRes[i] = GIN_FALSE;
	}

	/* prepare for calling consistentFn in temp context */
	oldCtx = MemoryContextSwitchTo(tempCtx);

	if (haveLossyEntry)
	{
		/* Have lossy-page entries, so see if whole page matches */
		res = key->triConsistentFn(key);

		if (res == GIN_TRUE || res == GIN_MAYBE)
		{
			/* Yes, so clean up ... */
			MemoryContextSwitchTo(oldCtx);
			MemoryContextReset(tempCtx);

			/* and return lossy pointer for whole page */
			key->curItem = curPageLossy;
			key->curItemMatches = true;
			key->recheckCurItem = true;
			return;
		}
	}

	/*
	 * At this point we know that we don't need to return a lossy whole-page
	 * pointer, but we might have matches for individual exact item pointers,
	 * possibly in combination with a lossy pointer. Pass lossy pointers as
	 * MAYBE to the ternary consistent function, to let it decide if this
	 * tuple satisfies the overall key, even though we don't know if the lossy
	 * entries match.
	 *
	 * Prepare entryRes array to be passed to consistentFn.
	 */
	for (i = 0; i < key->nentries; i++)
	{
		entry = key->scanEntry[i];
		if (entry->isFinished)
			key->entryRes[i] = GIN_FALSE;
#if 0

		/*
		 * This case can't currently happen, because we loaded all the entries
		 * for this item earlier.
		 */
		else if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
			key->entryRes[i] = GIN_MAYBE;
#endif
		else if (ginCompareItemPointers(&entry->curItem, &curPageLossy) == 0)
			key->entryRes[i] = GIN_MAYBE;
		else if (ginCompareItemPointers(&entry->curItem, &minItem) == 0)
			key->entryRes[i] = GIN_TRUE;
		else
			key->entryRes[i] = GIN_FALSE;
	}

	res = key->triConsistentFn(key);

	switch (res)
	{
		case GIN_TRUE:
			key->curItemMatches = true;
			/* triConsistentFn set recheckCurItem */
			break;

		case GIN_FALSE:
			key->curItemMatches = false;
			break;

		case GIN_MAYBE:
			key->curItemMatches = true;
			key->recheckCurItem = true;
			break;

		default:

			/*
			 * the 'default' case shouldn't happen, but if the consistent
			 * function returns something bogus, this is the safe result
			 */
			key->curItemMatches = true;
			key->recheckCurItem = true;
			break;
	}

	/*
	 * We have a tuple, and we know if it matches or not. If it's a non-match,
	 * we could continue to find the next matching tuple, but let's break out
	 * and give scanGetItem a chance to advance the other keys. They might be
	 * able to skip past to a much higher TID, allowing us to save work.
	 */

	/* clean up after consistentFn calls */
	MemoryContextSwitchTo(oldCtx);
	MemoryContextReset(tempCtx);
}
| 1222 | |
| 1223 | /* |
| 1224 | * Get next heap item pointer (after advancePast) from scan. |
| 1225 | * Returns true if anything found. |
| 1226 | * On success, *item and *recheck are set. |
| 1227 | * |
| 1228 | * Note: this is very nearly the same logic as in keyGetItem(), except |
| 1229 | * that we know the keys are to be combined with AND logic, whereas in |
| 1230 | * keyGetItem() the combination logic is known only to the consistentFn. |
| 1231 | */ |
static bool
scanGetItem(IndexScanDesc scan, ItemPointerData advancePast,
			ItemPointerData *item, bool *recheck)
{
	GinScanOpaque so = (GinScanOpaque) scan->opaque;
	uint32		i;
	bool		match;

	/*----------
	 * Advance the scan keys in lock-step, until we find an item that matches
	 * all the keys. If any key reports isFinished, meaning its subset of the
	 * entries is exhausted, we can stop.  Otherwise, set *item to the next
	 * matching item.
	 *
	 * This logic works only if a keyGetItem stream can never contain both
	 * exact and lossy pointers for the same page.  Else we could have a
	 * case like
	 *
	 *		stream 1		stream 2
	 *		...             ...
	 *		42/6			42/7
	 *		50/1			42/0xffff
	 *		...             ...
	 *
	 * We would conclude that 42/6 is not a match and advance stream 1,
	 * thus never detecting the match to the lossy pointer in stream 2.
	 * (keyGetItem has a similar problem versus entryGetItem.)
	 *----------
	 */
	do
	{
		/* Restart the candidate search from scratch on each pass */
		ItemPointerSetMin(item);
		match = true;
		for (i = 0; i < so->nkeys && match; i++)
		{
			GinScanKey	key = so->keys + i;

			/* Fetch the next item for this key that is > advancePast. */
			keyGetItem(&so->ginstate, so->tempCtx, key, advancePast,
					   scan->xs_snapshot);

			/* AND semantics: one exhausted key ends the whole scan */
			if (key->isFinished)
				return false;

			/*
			 * If it's not a match, we can immediately conclude that nothing
			 * <= this item matches, without checking the rest of the keys.
			 */
			if (!key->curItemMatches)
			{
				advancePast = key->curItem;
				match = false;
				break;
			}

			/*
			 * It's a match. We can conclude that nothing < matches, so the
			 * other key streams can skip to this item.
			 *
			 * Beware of lossy pointers, though; from a lossy pointer, we can
			 * only conclude that nothing smaller than this *block* matches.
			 */
			if (ItemPointerIsLossyPage(&key->curItem))
			{
				if (GinItemPointerGetBlockNumber(&advancePast) <
					GinItemPointerGetBlockNumber(&key->curItem))
				{
					ItemPointerSet(&advancePast,
								   GinItemPointerGetBlockNumber(&key->curItem),
								   InvalidOffsetNumber);
				}
			}
			else
			{
				/* Exact pointer: skip other streams to just before it */
				Assert(GinItemPointerGetOffsetNumber(&key->curItem) > 0);
				ItemPointerSet(&advancePast,
							   GinItemPointerGetBlockNumber(&key->curItem),
							   OffsetNumberPrev(GinItemPointerGetOffsetNumber(&key->curItem)));
			}

			/*
			 * If this is the first key, remember this location as a potential
			 * match, and proceed to check the rest of the keys.
			 *
			 * Otherwise, check if this is the same item that we checked the
			 * previous keys for (or a lossy pointer for the same page).  If
			 * not, loop back to check the previous keys for this item (we
			 * will check this key again too, but keyGetItem returns quickly
			 * for that)
			 */
			if (i == 0)
			{
				*item = key->curItem;
			}
			else
			{
				if (ItemPointerIsLossyPage(&key->curItem) ||
					ItemPointerIsLossyPage(item))
				{
					/* Lossy involved: a same-block pointer counts as a match */
					Assert(GinItemPointerGetBlockNumber(&key->curItem) >= GinItemPointerGetBlockNumber(item));
					match = (GinItemPointerGetBlockNumber(&key->curItem) ==
							 GinItemPointerGetBlockNumber(item));
				}
				else
				{
					/* Both exact: must be the identical TID */
					Assert(ginCompareItemPointers(&key->curItem, item) >= 0);
					match = (ginCompareItemPointers(&key->curItem, item) == 0);
				}
			}
		}
	} while (!match);

	Assert(!ItemPointerIsMin(item));

	/*
	 * Now *item contains the first ItemPointer after previous result that
	 * satisfied all the keys for that exact TID, or a lossy reference to the
	 * same page.
	 *
	 * We must return recheck = true if any of the keys are marked recheck.
	 */
	*recheck = false;
	for (i = 0; i < so->nkeys; i++)
	{
		GinScanKey	key = so->keys + i;

		if (key->recheckCurItem)
		{
			*recheck = true;
			break;
		}
	}

	return true;
}
| 1367 | |
| 1368 | |
| 1369 | /* |
| 1370 | * Functions for scanning the pending list |
| 1371 | */ |
| 1372 | |
| 1373 | |
| 1374 | /* |
| 1375 | * Get ItemPointer of next heap row to be checked from pending list. |
| 1376 | * Returns false if there are no more. On pages with several heap rows |
| 1377 | * it returns each row separately, on page with part of heap row returns |
| 1378 | * per page data. pos->firstOffset and pos->lastOffset are set to identify |
| 1379 | * the range of pending-list tuples belonging to this heap row. |
| 1380 | * |
| 1381 | * The pendingBuffer is presumed pinned and share-locked on entry, and is |
| 1382 | * pinned and share-locked on success exit. On failure exit it's released. |
| 1383 | */ |
static bool
scanGetCandidate(IndexScanDesc scan, pendingPosition *pos)
{
	OffsetNumber maxoff;
	Page		page;
	IndexTuple	itup;

	ItemPointerSetInvalid(&pos->item);
	for (;;)
	{
		page = BufferGetPage(pos->pendingBuffer);
		TestForOldSnapshot(scan->xs_snapshot, scan->indexRelation, page);

		maxoff = PageGetMaxOffsetNumber(page);
		if (pos->firstOffset > maxoff)
		{
			/* Current page exhausted: move to the next pending-list page */
			BlockNumber blkno = GinPageGetOpaque(page)->rightlink;

			if (blkno == InvalidBlockNumber)
			{
				/* End of pending list; release buffer and report no more rows */
				UnlockReleaseBuffer(pos->pendingBuffer);
				pos->pendingBuffer = InvalidBuffer;

				return false;
			}
			else
			{
				/*
				 * Here we must prevent deletion of next page by insertcleanup
				 * process, which may be trying to obtain exclusive lock on
				 * current page.  So, we lock next page before releasing the
				 * current one
				 */
				Buffer		tmpbuf = ReadBuffer(scan->indexRelation, blkno);

				LockBuffer(tmpbuf, GIN_SHARE);
				UnlockReleaseBuffer(pos->pendingBuffer);

				pos->pendingBuffer = tmpbuf;
				pos->firstOffset = FirstOffsetNumber;
			}
		}
		else
		{
			/* Current heap TID is taken from the first unprocessed tuple */
			itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, pos->firstOffset));
			pos->item = itup->t_tid;
			if (GinPageHasFullRow(page))
			{
				/*
				 * find itempointer to the next row
				 */
				for (pos->lastOffset = pos->firstOffset + 1; pos->lastOffset <= maxoff; pos->lastOffset++)
				{
					itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, pos->lastOffset));
					if (!ItemPointerEquals(&pos->item, &itup->t_tid))
						break;
				}
			}
			else
			{
				/*
				 * All itempointers are the same on this page
				 */
				pos->lastOffset = maxoff + 1;
			}

			/*
			 * Now pos->firstOffset points to the first tuple of current heap
			 * row, pos->lastOffset points to the first tuple of next heap row
			 * (or to the end of page)
			 */
			break;
		}
	}

	return true;
}
| 1461 | |
| 1462 | /* |
| 1463 | * Scan pending-list page from current tuple (off) up till the first of: |
| 1464 | * - match is found (then returns true) |
| 1465 | * - no later match is possible |
| 1466 | * - tuple's attribute number is not equal to entry's attrnum |
| 1467 | * - reach end of page |
| 1468 | * |
| 1469 | * datum[]/category[]/datumExtracted[] arrays are used to cache the results |
| 1470 | * of gintuple_get_key() on the current page. |
| 1471 | */ |
| 1472 | static bool |
| 1473 | matchPartialInPendingList(GinState *ginstate, Page page, |
| 1474 | OffsetNumber off, OffsetNumber maxoff, |
| 1475 | GinScanEntry entry, |
| 1476 | Datum *datum, GinNullCategory *category, |
| 1477 | bool *) |
| 1478 | { |
| 1479 | IndexTuple itup; |
| 1480 | int32 cmp; |
| 1481 | |
| 1482 | /* Partial match to a null is not possible */ |
| 1483 | if (entry->queryCategory != GIN_CAT_NORM_KEY) |
| 1484 | return false; |
| 1485 | |
| 1486 | while (off < maxoff) |
| 1487 | { |
| 1488 | itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off)); |
| 1489 | |
| 1490 | if (gintuple_get_attrnum(ginstate, itup) != entry->attnum) |
| 1491 | return false; |
| 1492 | |
| 1493 | if (datumExtracted[off - 1] == false) |
| 1494 | { |
| 1495 | datum[off - 1] = gintuple_get_key(ginstate, itup, |
| 1496 | &category[off - 1]); |
| 1497 | datumExtracted[off - 1] = true; |
| 1498 | } |
| 1499 | |
| 1500 | /* Once we hit nulls, no further match is possible */ |
| 1501 | if (category[off - 1] != GIN_CAT_NORM_KEY) |
| 1502 | return false; |
| 1503 | |
| 1504 | /*---------- |
| 1505 | * Check partial match. |
| 1506 | * case cmp == 0 => match |
| 1507 | * case cmp > 0 => not match and end scan (no later match possible) |
| 1508 | * case cmp < 0 => not match and continue scan |
| 1509 | *---------- |
| 1510 | */ |
| 1511 | cmp = DatumGetInt32(FunctionCall4Coll(&ginstate->comparePartialFn[entry->attnum - 1], |
| 1512 | ginstate->supportCollation[entry->attnum - 1], |
| 1513 | entry->queryKey, |
| 1514 | datum[off - 1], |
| 1515 | UInt16GetDatum(entry->strategy), |
| 1516 | PointerGetDatum(entry->extra_data))); |
| 1517 | if (cmp == 0) |
| 1518 | return true; |
| 1519 | else if (cmp > 0) |
| 1520 | return false; |
| 1521 | |
| 1522 | off++; |
| 1523 | } |
| 1524 | |
| 1525 | return false; |
| 1526 | } |
| 1527 | |
| 1528 | /* |
| 1529 | * Set up the entryRes array for each key by looking at |
| 1530 | * every entry for current heap row in pending list. |
| 1531 | * |
| 1532 | * Returns true if each scan key has at least one entryRes match. |
| 1533 | * This corresponds to the situations where the normal index search will |
| 1534 | * try to apply the key's consistentFn. (A tuple not meeting that requirement |
| 1535 | * cannot be returned by the normal search since no entry stream will |
| 1536 | * source its TID.) |
| 1537 | * |
| 1538 | * The pendingBuffer is presumed pinned and share-locked on entry. |
| 1539 | */ |
| 1540 | static bool |
| 1541 | collectMatchesForHeapRow(IndexScanDesc scan, pendingPosition *pos) |
| 1542 | { |
| 1543 | GinScanOpaque so = (GinScanOpaque) scan->opaque; |
| 1544 | OffsetNumber attrnum; |
| 1545 | Page page; |
| 1546 | IndexTuple itup; |
| 1547 | int i, |
| 1548 | j; |
| 1549 | |
| 1550 | /* |
| 1551 | * Reset all entryRes and hasMatchKey flags |
| 1552 | */ |
| 1553 | for (i = 0; i < so->nkeys; i++) |
| 1554 | { |
| 1555 | GinScanKey key = so->keys + i; |
| 1556 | |
| 1557 | memset(key->entryRes, GIN_FALSE, key->nentries); |
| 1558 | } |
| 1559 | memset(pos->hasMatchKey, false, so->nkeys); |
| 1560 | |
| 1561 | /* |
| 1562 | * Outer loop iterates over multiple pending-list pages when a single heap |
| 1563 | * row has entries spanning those pages. |
| 1564 | */ |
| 1565 | for (;;) |
| 1566 | { |
| 1567 | Datum datum[BLCKSZ / sizeof(IndexTupleData)]; |
| 1568 | GinNullCategory category[BLCKSZ / sizeof(IndexTupleData)]; |
| 1569 | bool [BLCKSZ / sizeof(IndexTupleData)]; |
| 1570 | |
| 1571 | Assert(pos->lastOffset > pos->firstOffset); |
| 1572 | memset(datumExtracted + pos->firstOffset - 1, 0, |
| 1573 | sizeof(bool) * (pos->lastOffset - pos->firstOffset)); |
| 1574 | |
| 1575 | page = BufferGetPage(pos->pendingBuffer); |
| 1576 | TestForOldSnapshot(scan->xs_snapshot, scan->indexRelation, page); |
| 1577 | |
| 1578 | for (i = 0; i < so->nkeys; i++) |
| 1579 | { |
| 1580 | GinScanKey key = so->keys + i; |
| 1581 | |
| 1582 | for (j = 0; j < key->nentries; j++) |
| 1583 | { |
| 1584 | GinScanEntry entry = key->scanEntry[j]; |
| 1585 | OffsetNumber StopLow = pos->firstOffset, |
| 1586 | StopHigh = pos->lastOffset, |
| 1587 | StopMiddle; |
| 1588 | |
| 1589 | /* If already matched on earlier page, do no extra work */ |
| 1590 | if (key->entryRes[j]) |
| 1591 | continue; |
| 1592 | |
| 1593 | /* |
| 1594 | * Interesting tuples are from pos->firstOffset to |
| 1595 | * pos->lastOffset and they are ordered by (attnum, Datum) as |
| 1596 | * it's done in entry tree. So we can use binary search to |
| 1597 | * avoid linear scanning. |
| 1598 | */ |
| 1599 | while (StopLow < StopHigh) |
| 1600 | { |
| 1601 | int res; |
| 1602 | |
| 1603 | StopMiddle = StopLow + ((StopHigh - StopLow) >> 1); |
| 1604 | |
| 1605 | itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, StopMiddle)); |
| 1606 | |
| 1607 | attrnum = gintuple_get_attrnum(&so->ginstate, itup); |
| 1608 | |
| 1609 | if (key->attnum < attrnum) |
| 1610 | { |
| 1611 | StopHigh = StopMiddle; |
| 1612 | continue; |
| 1613 | } |
| 1614 | if (key->attnum > attrnum) |
| 1615 | { |
| 1616 | StopLow = StopMiddle + 1; |
| 1617 | continue; |
| 1618 | } |
| 1619 | |
| 1620 | if (datumExtracted[StopMiddle - 1] == false) |
| 1621 | { |
| 1622 | datum[StopMiddle - 1] = |
| 1623 | gintuple_get_key(&so->ginstate, itup, |
| 1624 | &category[StopMiddle - 1]); |
| 1625 | datumExtracted[StopMiddle - 1] = true; |
| 1626 | } |
| 1627 | |
| 1628 | if (entry->queryCategory == GIN_CAT_EMPTY_QUERY) |
| 1629 | { |
| 1630 | /* special behavior depending on searchMode */ |
| 1631 | if (entry->searchMode == GIN_SEARCH_MODE_ALL) |
| 1632 | { |
| 1633 | /* match anything except NULL_ITEM */ |
| 1634 | if (category[StopMiddle - 1] == GIN_CAT_NULL_ITEM) |
| 1635 | res = -1; |
| 1636 | else |
| 1637 | res = 0; |
| 1638 | } |
| 1639 | else |
| 1640 | { |
| 1641 | /* match everything */ |
| 1642 | res = 0; |
| 1643 | } |
| 1644 | } |
| 1645 | else |
| 1646 | { |
| 1647 | res = ginCompareEntries(&so->ginstate, |
| 1648 | entry->attnum, |
| 1649 | entry->queryKey, |
| 1650 | entry->queryCategory, |
| 1651 | datum[StopMiddle - 1], |
| 1652 | category[StopMiddle - 1]); |
| 1653 | } |
| 1654 | |
| 1655 | if (res == 0) |
| 1656 | { |
| 1657 | /* |
| 1658 | * Found exact match (there can be only one, except in |
| 1659 | * EMPTY_QUERY mode). |
| 1660 | * |
| 1661 | * If doing partial match, scan forward from here to |
| 1662 | * end of page to check for matches. |
| 1663 | * |
| 1664 | * See comment above about tuple's ordering. |
| 1665 | */ |
| 1666 | if (entry->isPartialMatch) |
| 1667 | key->entryRes[j] = |
| 1668 | matchPartialInPendingList(&so->ginstate, |
| 1669 | page, |
| 1670 | StopMiddle, |
| 1671 | pos->lastOffset, |
| 1672 | entry, |
| 1673 | datum, |
| 1674 | category, |
| 1675 | datumExtracted); |
| 1676 | else |
| 1677 | key->entryRes[j] = true; |
| 1678 | |
| 1679 | /* done with binary search */ |
| 1680 | break; |
| 1681 | } |
| 1682 | else if (res < 0) |
| 1683 | StopHigh = StopMiddle; |
| 1684 | else |
| 1685 | StopLow = StopMiddle + 1; |
| 1686 | } |
| 1687 | |
| 1688 | if (StopLow >= StopHigh && entry->isPartialMatch) |
| 1689 | { |
| 1690 | /* |
| 1691 | * No exact match on this page. If doing partial match, |
| 1692 | * scan from the first tuple greater than target value to |
| 1693 | * end of page. Note that since we don't remember whether |
| 1694 | * the comparePartialFn told us to stop early on a |
| 1695 | * previous page, we will uselessly apply comparePartialFn |
| 1696 | * to the first tuple on each subsequent page. |
| 1697 | */ |
| 1698 | key->entryRes[j] = |
| 1699 | matchPartialInPendingList(&so->ginstate, |
| 1700 | page, |
| 1701 | StopHigh, |
| 1702 | pos->lastOffset, |
| 1703 | entry, |
| 1704 | datum, |
| 1705 | category, |
| 1706 | datumExtracted); |
| 1707 | } |
| 1708 | |
| 1709 | pos->hasMatchKey[i] |= key->entryRes[j]; |
| 1710 | } |
| 1711 | } |
| 1712 | |
| 1713 | /* Advance firstOffset over the scanned tuples */ |
| 1714 | pos->firstOffset = pos->lastOffset; |
| 1715 | |
| 1716 | if (GinPageHasFullRow(page)) |
| 1717 | { |
| 1718 | /* |
| 1719 | * We have examined all pending entries for the current heap row. |
| 1720 | * Break out of loop over pages. |
| 1721 | */ |
| 1722 | break; |
| 1723 | } |
| 1724 | else |
| 1725 | { |
| 1726 | /* |
| 1727 | * Advance to next page of pending entries for the current heap |
| 1728 | * row. Complain if there isn't one. |
| 1729 | */ |
| 1730 | ItemPointerData item = pos->item; |
| 1731 | |
| 1732 | if (scanGetCandidate(scan, pos) == false || |
| 1733 | !ItemPointerEquals(&pos->item, &item)) |
| 1734 | elog(ERROR, "could not find additional pending pages for same heap tuple" ); |
| 1735 | } |
| 1736 | } |
| 1737 | |
| 1738 | /* |
| 1739 | * Now return "true" if all scan keys have at least one matching datum |
| 1740 | */ |
| 1741 | for (i = 0; i < so->nkeys; i++) |
| 1742 | { |
| 1743 | if (pos->hasMatchKey[i] == false) |
| 1744 | return false; |
| 1745 | } |
| 1746 | |
| 1747 | return true; |
| 1748 | } |
| 1749 | |
| 1750 | /* |
| 1751 | * Collect all matched rows from pending list into bitmap. |
| 1752 | */ |
/*
 * Collect all matched rows from pending list into bitmap.
 *
 * The fastupdate pending list is a linked list of pages hanging off the
 * metapage; entries there have not yet been merged into the main entry
 * tree, so they must be scanned separately.  Each candidate heap row's
 * pending entries are matched against the scan keys and, if all keys'
 * consistent functions accept the row, its TID is added to "tbm".
 * *ntids is incremented once per row added.
 */
static void
scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids)
{
	GinScanOpaque so = (GinScanOpaque) scan->opaque;
	MemoryContext oldCtx;
	bool		recheck,
				match;
	int			i;
	pendingPosition pos;
	Buffer		metabuffer = ReadBuffer(scan->indexRelation, GIN_METAPAGE_BLKNO);
	Page		page;
	BlockNumber blkno;

	*ntids = 0;

	/*
	 * Acquire predicate lock on the metapage, to conflict with any fastupdate
	 * insertions.
	 */
	PredicateLockPage(scan->indexRelation, GIN_METAPAGE_BLKNO, scan->xs_snapshot);

	LockBuffer(metabuffer, GIN_SHARE);
	page = BufferGetPage(metabuffer);
	TestForOldSnapshot(scan->xs_snapshot, scan->indexRelation, page);
	blkno = GinPageGetMeta(page)->head;

	/*
	 * fetch head of list before unlocking metapage. head page must be pinned
	 * to prevent deletion by vacuum process
	 */
	if (blkno == InvalidBlockNumber)
	{
		/* No pending list, so proceed with normal scan */
		UnlockReleaseBuffer(metabuffer);
		return;
	}

	/*
	 * Pin and lock the list head while we still hold the metapage lock, so
	 * that concurrent cleanup cannot recycle it out from under us; only then
	 * is it safe to let go of the metapage.
	 */
	pos.pendingBuffer = ReadBuffer(scan->indexRelation, blkno);
	LockBuffer(pos.pendingBuffer, GIN_SHARE);
	pos.firstOffset = FirstOffsetNumber;
	UnlockReleaseBuffer(metabuffer);
	/* one match flag per scan key, filled in by collectMatchesForHeapRow */
	pos.hasMatchKey = palloc(sizeof(bool) * so->nkeys);

	/*
	 * loop for each heap row. scanGetCandidate returns full row or row's
	 * tuples from first page.
	 */
	while (scanGetCandidate(scan, &pos))
	{
		/*
		 * Check entries in tuple and set up entryRes array.
		 *
		 * If pending tuples belonging to the current heap row are spread
		 * across several pages, collectMatchesForHeapRow will read all of
		 * those pages.
		 */
		if (!collectMatchesForHeapRow(scan, &pos))
			continue;			/* some key matched no entry; skip this row */

		/*
		 * Matching of entries of one row is finished, so check row using
		 * consistent functions.
		 */
		oldCtx = MemoryContextSwitchTo(so->tempCtx);
		recheck = false;
		match = true;

		for (i = 0; i < so->nkeys; i++)
		{
			GinScanKey	key = so->keys + i;

			if (!key->boolConsistentFn(key))
			{
				match = false;
				break;
			}
			/* any key demanding recheck forces recheck of the whole row */
			recheck |= key->recheckCurItem;
		}

		/* consistent-fn garbage lives in tempCtx; discard it per row */
		MemoryContextSwitchTo(oldCtx);
		MemoryContextReset(so->tempCtx);

		if (match)
		{
			tbm_add_tuples(tbm, &pos.item, 1, recheck);
			(*ntids)++;
		}
	}

	/*
	 * NOTE(review): pos.pendingBuffer appears to be released inside
	 * scanGetCandidate when the list is exhausted -- confirm against its
	 * definition before assuming no buffer leak here.
	 */
	pfree(pos.hasMatchKey);
}
| 1844 | |
| 1845 | |
/*
 * Did ginNewScanKey prove the query unsatisfiable?  (If so, the scan can
 * return no tuples at all.)
 *
 * Fixed to actually use the macro argument: the previous definition ignored
 * "s" and referenced the caller's local variable "scan" directly, which only
 * worked by accident of the call site's naming.
 */
#define GinIsVoidRes(s)		( ((GinScanOpaque) (s)->opaque)->isVoidRes )
| 1847 | |
| 1848 | int64 |
| 1849 | gingetbitmap(IndexScanDesc scan, TIDBitmap *tbm) |
| 1850 | { |
| 1851 | GinScanOpaque so = (GinScanOpaque) scan->opaque; |
| 1852 | int64 ntids; |
| 1853 | ItemPointerData iptr; |
| 1854 | bool recheck; |
| 1855 | |
| 1856 | /* |
| 1857 | * Set up the scan keys, and check for unsatisfiable query. |
| 1858 | */ |
| 1859 | ginFreeScanKeys(so); /* there should be no keys yet, but just to be |
| 1860 | * sure */ |
| 1861 | ginNewScanKey(scan); |
| 1862 | |
| 1863 | if (GinIsVoidRes(scan)) |
| 1864 | return 0; |
| 1865 | |
| 1866 | ntids = 0; |
| 1867 | |
| 1868 | /* |
| 1869 | * First, scan the pending list and collect any matching entries into the |
| 1870 | * bitmap. After we scan a pending item, some other backend could post it |
| 1871 | * into the main index, and so we might visit it a second time during the |
| 1872 | * main scan. This is okay because we'll just re-set the same bit in the |
| 1873 | * bitmap. (The possibility of duplicate visits is a major reason why GIN |
| 1874 | * can't support the amgettuple API, however.) Note that it would not do |
| 1875 | * to scan the main index before the pending list, since concurrent |
| 1876 | * cleanup could then make us miss entries entirely. |
| 1877 | */ |
| 1878 | scanPendingInsert(scan, tbm, &ntids); |
| 1879 | |
| 1880 | /* |
| 1881 | * Now scan the main index. |
| 1882 | */ |
| 1883 | startScan(scan); |
| 1884 | |
| 1885 | ItemPointerSetMin(&iptr); |
| 1886 | |
| 1887 | for (;;) |
| 1888 | { |
| 1889 | CHECK_FOR_INTERRUPTS(); |
| 1890 | |
| 1891 | if (!scanGetItem(scan, iptr, &iptr, &recheck)) |
| 1892 | break; |
| 1893 | |
| 1894 | if (ItemPointerIsLossyPage(&iptr)) |
| 1895 | tbm_add_page(tbm, ItemPointerGetBlockNumber(&iptr)); |
| 1896 | else |
| 1897 | tbm_add_tuples(tbm, &iptr, 1, recheck); |
| 1898 | ntids++; |
| 1899 | } |
| 1900 | |
| 1901 | return ntids; |
| 1902 | } |
| 1903 | |