/*-------------------------------------------------------------------------
 *
 * nodeBitmapHeapscan.c
 *    Routines to support bitmapped scans of relations
 *
 * NOTE: it is critical that this plan type only be used with MVCC-compliant
 * snapshots (ie, regular snapshots, not SnapshotAny or one of the other
 * special snapshots).  The reason is that since index and heap scans are
 * decoupled, there can be no assurance that the index tuple prompting a
 * visit to a particular heap TID still exists when the visit is made.
 * Therefore the tuple might not exist anymore either (which is OK because
 * heap_fetch will cope) --- but worse, the tuple slot could have been
 * re-used for a newer tuple.  With an MVCC snapshot the newer tuple is
 * certain to fail the time qual and so it will not be mistakenly returned,
 * but with anything else we might return a tuple that doesn't meet the
 * required index qual conditions.
 *
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *    src/backend/executor/nodeBitmapHeapscan.c
 *
 *-------------------------------------------------------------------------
 */
/*
 * INTERFACE ROUTINES
 *    ExecBitmapHeapScan          scans a relation using bitmap info
 *    ExecBitmapHeapNext          workhorse for above
 *    ExecInitBitmapHeapScan      creates and initializes state info.
 *    ExecReScanBitmapHeapScan    prepares to rescan the plan.
 *    ExecEndBitmapHeapScan       releases all storage.
 */
#include "postgres.h"

#include <math.h>

#include "access/relscan.h"
#include "access/tableam.h"
#include "access/transam.h"
#include "access/visibilitymap.h"
#include "executor/execdebug.h"
#include "executor/nodeBitmapHeapscan.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/bufmgr.h"
#include "storage/predicate.h"
#include "utils/memutils.h"
#include "utils/rel.h"
#include "utils/snapmgr.h"
#include "utils/spccache.h"


static TupleTableSlot *BitmapHeapNext(BitmapHeapScanState *node);
static inline void BitmapDoneInitializingSharedState(ParallelBitmapHeapState *pstate);
static inline void BitmapAdjustPrefetchIterator(BitmapHeapScanState *node,
                                                TBMIterateResult *tbmres);
static inline void BitmapAdjustPrefetchTarget(BitmapHeapScanState *node);
static inline void BitmapPrefetch(BitmapHeapScanState *node,
                                  TableScanDesc scan);
static bool BitmapShouldInitializeSharedState(ParallelBitmapHeapState *pstate);

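/*
 * Overview: the outer subplan (a BitmapIndexScan, possibly combined under
 * BitmapAnd/BitmapOr nodes) builds a TIDBitmap identifying the heap pages of
 * interest.  BitmapHeapNext then visits those pages in block-number order,
 * returning matching tuples and rechecking the original quals where the
 * bitmap is lossy.
 */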

/* ----------------------------------------------------------------
 *    BitmapHeapNext
 *
 *    Retrieve next tuple from the BitmapHeapScan node's currentRelation
 * ----------------------------------------------------------------
 */
static TupleTableSlot *
BitmapHeapNext(BitmapHeapScanState *node)
{
    ExprContext *econtext;
    TableScanDesc scan;
    TIDBitmap  *tbm;
    TBMIterator *tbmiterator = NULL;
    TBMSharedIterator *shared_tbmiterator = NULL;
    TBMIterateResult *tbmres;
    TupleTableSlot *slot;
    ParallelBitmapHeapState *pstate = node->pstate;
    dsa_area   *dsa = node->ss.ps.state->es_query_dsa;

    /*
     * extract necessary information from index scan node
     */
    econtext = node->ss.ps.ps_ExprContext;
    slot = node->ss.ss_ScanTupleSlot;
    scan = node->ss.ss_currentScanDesc;
    tbm = node->tbm;
    if (pstate == NULL)
        tbmiterator = node->tbmiterator;
    else
        shared_tbmiterator = node->shared_tbmiterator;
    tbmres = node->tbmres;

    /*
     * If we haven't yet performed the underlying index scan, do it, and begin
     * the iteration over the bitmap.
     *
     * For prefetching, we use *two* iterators, one for the pages we are
     * actually scanning and another that runs ahead of the first for
     * prefetching.  node->prefetch_pages tracks exactly how many pages ahead
     * the prefetch iterator is.  Also, node->prefetch_target tracks the
     * desired prefetch distance, which starts small and increases up to the
     * node->prefetch_maximum.  This is to avoid doing a lot of prefetching in
     * a scan that stops after a few tuples because of a LIMIT.
     */
    if (!node->initialized)
    {
        if (!pstate)
        {
            tbm = (TIDBitmap *) MultiExecProcNode(outerPlanState(node));

            if (!tbm || !IsA(tbm, TIDBitmap))
                elog(ERROR, "unrecognized result from subplan");

            node->tbm = tbm;
            node->tbmiterator = tbmiterator = tbm_begin_iterate(tbm);
            node->tbmres = tbmres = NULL;

#ifdef USE_PREFETCH
            if (node->prefetch_maximum > 0)
            {
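                /*
                 * prefetch_target starts at -1 so that the ramp-up logic in
                 * BitmapAdjustPrefetchTarget begins at zero; see the
                 * comments there for the growth schedule.
                 */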
                node->prefetch_iterator = tbm_begin_iterate(tbm);
                node->prefetch_pages = 0;
                node->prefetch_target = -1;
            }
#endif                          /* USE_PREFETCH */
        }
        else
        {
            /*
             * The leader will immediately come out of the function, but
             * others will be blocked until the leader populates the TBM and
             * wakes them up.
             */
            if (BitmapShouldInitializeSharedState(pstate))
            {
                tbm = (TIDBitmap *) MultiExecProcNode(outerPlanState(node));
                if (!tbm || !IsA(tbm, TIDBitmap))
                    elog(ERROR, "unrecognized result from subplan");

                node->tbm = tbm;

                /*
                 * Prepare to iterate over the TBM. This will return the
                 * dsa_pointer of the iterator state which will be used by
                 * multiple processes to iterate jointly.
                 */
                pstate->tbmiterator = tbm_prepare_shared_iterate(tbm);
#ifdef USE_PREFETCH
                if (node->prefetch_maximum > 0)
                {
                    pstate->prefetch_iterator =
                        tbm_prepare_shared_iterate(tbm);

                    /*
                     * We don't need the mutex here as we haven't yet woken
                     * up others.
                     */
                    pstate->prefetch_pages = 0;
                    pstate->prefetch_target = -1;
                }
#endif

                /* We have initialized the shared state so wake up others. */
                BitmapDoneInitializingSharedState(pstate);
            }

            /* Allocate a private iterator and attach the shared state to it */
            node->shared_tbmiterator = shared_tbmiterator =
                tbm_attach_shared_iterate(dsa, pstate->tbmiterator);
            node->tbmres = tbmres = NULL;

#ifdef USE_PREFETCH
            if (node->prefetch_maximum > 0)
            {
                node->shared_prefetch_iterator =
                    tbm_attach_shared_iterate(dsa, pstate->prefetch_iterator);
            }
#endif                          /* USE_PREFETCH */
        }
        node->initialized = true;
    }

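    /*
     * Main loop: when tbmres is NULL we advance to the next page of the
     * bitmap; otherwise we continue returning tuples from the current page.
     * Tuples from lossy bitmap pages always carry the recheck flag and so
     * are re-tested against bitmapqualorig below.
     */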
    for (;;)
    {
        bool        skip_fetch;

        CHECK_FOR_INTERRUPTS();

        /*
         * Get next page of results if needed
         */
        if (tbmres == NULL)
        {
            if (!pstate)
                node->tbmres = tbmres = tbm_iterate(tbmiterator);
            else
                node->tbmres = tbmres = tbm_shared_iterate(shared_tbmiterator);
            if (tbmres == NULL)
            {
                /* no more entries in the bitmap */
                break;
            }

            BitmapAdjustPrefetchIterator(node, tbmres);

            /*
             * We can skip fetching the heap page if we don't need any fields
             * from the heap, and the bitmap entries don't need rechecking,
             * and all tuples on the page are visible to our transaction.
             *
             * XXX: It's a layering violation that we do these checks above
             * tableam; they should probably be moved below it at some point.
             */
            skip_fetch = (node->can_skip_fetch &&
                          !tbmres->recheck &&
                          VM_ALL_VISIBLE(node->ss.ss_currentRelation,
                                         tbmres->blockno,
                                         &node->vmbuffer));

            if (skip_fetch)
            {
                /* can't be lossy in the skip_fetch case */
                Assert(tbmres->ntuples >= 0);

                /*
                 * The number of tuples on this page is put into
                 * node->return_empty_tuples.
                 */
                node->return_empty_tuples = tbmres->ntuples;
            }
            else if (!table_scan_bitmap_next_block(scan, tbmres))
            {
                /* AM doesn't think this block is valid, skip */
                continue;
            }

            if (tbmres->ntuples >= 0)
                node->exact_pages++;
            else
                node->lossy_pages++;

            /* Adjust the prefetch target */
            BitmapAdjustPrefetchTarget(node);
        }
        else
        {
            /*
             * Continuing in previously obtained page.
             */

#ifdef USE_PREFETCH

            /*
             * Try to prefetch at least a few pages even before we get to the
             * second page if we don't stop reading after the first tuple.
             */
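            /*
             * The unlocked read of pstate->prefetch_target below is only a
             * fast-path test to avoid the spinlock once the target has hit
             * its maximum; the value is re-checked under the mutex before it
             * is actually changed.
             */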
            if (!pstate)
            {
                if (node->prefetch_target < node->prefetch_maximum)
                    node->prefetch_target++;
            }
            else if (pstate->prefetch_target < node->prefetch_maximum)
            {
                /* take spinlock while updating shared state */
                SpinLockAcquire(&pstate->mutex);
                if (pstate->prefetch_target < node->prefetch_maximum)
                    pstate->prefetch_target++;
                SpinLockRelease(&pstate->mutex);
            }
#endif                          /* USE_PREFETCH */
        }

        /*
         * We issue prefetch requests *after* fetching the current page to try
         * to avoid having prefetching interfere with the main I/O. Also, this
         * should happen only when we have determined there is still something
         * to do on the current page, else we may uselessly prefetch the same
         * page we are just about to request for real.
         *
         * XXX: It's a layering violation that we do these checks above
         * tableam; they should probably be moved below it at some point.
         */
        BitmapPrefetch(node, scan);

        if (node->return_empty_tuples > 0)
        {
            /*
             * If we don't have to fetch the tuple, just return nulls.
             */
            ExecStoreAllNullTuple(slot);

            if (--node->return_empty_tuples == 0)
            {
                /* no more tuples to return in the next round */
                node->tbmres = tbmres = NULL;
            }
        }
        else
        {
            /*
             * Attempt to fetch tuple from AM.
             */
            if (!table_scan_bitmap_next_tuple(scan, tbmres, slot))
            {
                /* nothing more to look at on this page */
                node->tbmres = tbmres = NULL;
                continue;
            }

            /*
             * If we are using lossy info, we have to recheck the qual
             * conditions at every tuple.
             */
            if (tbmres->recheck)
            {
                econtext->ecxt_scantuple = slot;
                if (!ExecQualAndReset(node->bitmapqualorig, econtext))
                {
                    /* Fails recheck, so drop it and loop back for another */
                    InstrCountFiltered2(node, 1);
                    ExecClearTuple(slot);
                    continue;
                }
            }
        }

        /* OK to return this tuple */
        return slot;
    }

    /*
     * if we get here it means we are at the end of the scan.
     */
    return ExecClearTuple(slot);
}

/*
 * BitmapDoneInitializingSharedState - Shared state is initialized
 *
 * By this time the leader has already populated the TBM and initialized the
 * shared state, so wake up other processes.
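 *
 * Note that the condition variable is broadcast only after the spinlock has
 * been released: spinlocks are meant to protect only a few straight-line
 * instructions, so no condition-variable call is made while holding one.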
 */
static inline void
BitmapDoneInitializingSharedState(ParallelBitmapHeapState *pstate)
{
    SpinLockAcquire(&pstate->mutex);
    pstate->state = BM_FINISHED;
    SpinLockRelease(&pstate->mutex);
    ConditionVariableBroadcast(&pstate->cv);
}

/*
 * BitmapAdjustPrefetchIterator - Adjust the prefetch iterator
 */
static inline void
BitmapAdjustPrefetchIterator(BitmapHeapScanState *node,
                             TBMIterateResult *tbmres)
{
#ifdef USE_PREFETCH
    ParallelBitmapHeapState *pstate = node->pstate;

    if (pstate == NULL)
    {
        TBMIterator *prefetch_iterator = node->prefetch_iterator;

        if (node->prefetch_pages > 0)
        {
            /* The main iterator has closed the distance by one page */
            node->prefetch_pages--;
        }
        else if (prefetch_iterator)
        {
            /* Do not let the prefetch iterator get behind the main one */
            TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator);

            if (tbmpre == NULL || tbmpre->blockno != tbmres->blockno)
                elog(ERROR, "prefetch and main iterators are out of sync");
        }
        return;
    }

    if (node->prefetch_maximum > 0)
    {
        TBMSharedIterator *prefetch_iterator = node->shared_prefetch_iterator;

        SpinLockAcquire(&pstate->mutex);
        if (pstate->prefetch_pages > 0)
        {
            pstate->prefetch_pages--;
            SpinLockRelease(&pstate->mutex);
        }
        else
        {
            /* Release the mutex before iterating */
            SpinLockRelease(&pstate->mutex);

            /*
             * In shared mode, we cannot ensure that the current blockno of
             * the main iterator and that of the prefetch iterator are the
             * same.  It's possible that whatever blockno we are prefetching
             * will be processed by another process.  Therefore, we don't
             * validate the blockno here as we do in the non-parallel case.
             */
            if (prefetch_iterator)
                tbm_shared_iterate(prefetch_iterator);
        }
    }
#endif                          /* USE_PREFETCH */
}

/*
 * BitmapAdjustPrefetchTarget - Adjust the prefetch target
 *
 * Increase prefetch target if it's not yet at the max.  Note that
 * we will increase it to zero after fetching the very first
 * page/tuple, then to one after the second tuple is fetched, then
 * it doubles as later pages are fetched.
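 *
 * For example, with prefetch_maximum = 16, successive adjustments move the
 * target from its initial -1 through 0, 1, 2, 4, 8 to 16, where it stays.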
 */
static inline void
BitmapAdjustPrefetchTarget(BitmapHeapScanState *node)
{
#ifdef USE_PREFETCH
    ParallelBitmapHeapState *pstate = node->pstate;

    if (pstate == NULL)
    {
        if (node->prefetch_target >= node->prefetch_maximum)
             /* don't increase any further */ ;
        else if (node->prefetch_target >= node->prefetch_maximum / 2)
            node->prefetch_target = node->prefetch_maximum;
        else if (node->prefetch_target > 0)
            node->prefetch_target *= 2;
        else
            node->prefetch_target++;
        return;
    }

    /* Do an unlocked check first to save spinlock acquisitions. */
    if (pstate->prefetch_target < node->prefetch_maximum)
    {
        SpinLockAcquire(&pstate->mutex);
        if (pstate->prefetch_target >= node->prefetch_maximum)
             /* don't increase any further */ ;
        else if (pstate->prefetch_target >= node->prefetch_maximum / 2)
            pstate->prefetch_target = node->prefetch_maximum;
        else if (pstate->prefetch_target > 0)
            pstate->prefetch_target *= 2;
        else
            pstate->prefetch_target++;
        SpinLockRelease(&pstate->mutex);
    }
#endif                          /* USE_PREFETCH */
}

/*
 * BitmapPrefetch - Prefetch, if prefetch_pages are behind prefetch_target
 */
static inline void
BitmapPrefetch(BitmapHeapScanState *node, TableScanDesc scan)
{
#ifdef USE_PREFETCH
    ParallelBitmapHeapState *pstate = node->pstate;

    if (pstate == NULL)
    {
        TBMIterator *prefetch_iterator = node->prefetch_iterator;

        if (prefetch_iterator)
        {
            while (node->prefetch_pages < node->prefetch_target)
            {
                TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator);
                bool        skip_fetch;

                if (tbmpre == NULL)
                {
                    /* No more pages to prefetch */
                    tbm_end_iterate(prefetch_iterator);
                    node->prefetch_iterator = NULL;
                    break;
                }
                node->prefetch_pages++;

                /*
                 * If we expect not to have to actually read this heap page,
                 * skip this prefetch call, but continue to run the prefetch
                 * logic normally.  (Would it be better not to increment
                 * prefetch_pages?)
                 *
                 * This depends on the assumption that the index AM will
                 * report the same recheck flag for this future heap page as
                 * it did for the current heap page; which is not a certainty
                 * but is true in many cases.
                 */
                skip_fetch = (node->can_skip_fetch &&
                              (node->tbmres ? !node->tbmres->recheck : false) &&
                              VM_ALL_VISIBLE(node->ss.ss_currentRelation,
                                             tbmpre->blockno,
                                             &node->pvmbuffer));

                if (!skip_fetch)
                    PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno);
            }
        }

        return;
    }

    if (pstate->prefetch_pages < pstate->prefetch_target)
    {
        TBMSharedIterator *prefetch_iterator = node->shared_prefetch_iterator;

        if (prefetch_iterator)
        {
            while (1)
            {
                TBMIterateResult *tbmpre;
                bool        do_prefetch = false;
                bool        skip_fetch;

                /*
                 * Recheck under the mutex.  If some other process has already
                 * done enough prefetching then we don't need to do anything.
                 */
                SpinLockAcquire(&pstate->mutex);
                if (pstate->prefetch_pages < pstate->prefetch_target)
                {
                    pstate->prefetch_pages++;
                    do_prefetch = true;
                }
                SpinLockRelease(&pstate->mutex);

                if (!do_prefetch)
                    return;

                tbmpre = tbm_shared_iterate(prefetch_iterator);
                if (tbmpre == NULL)
                {
                    /* No more pages to prefetch */
                    tbm_end_shared_iterate(prefetch_iterator);
                    node->shared_prefetch_iterator = NULL;
                    break;
                }

                /* As above, skip prefetch if we expect not to need page */
                skip_fetch = (node->can_skip_fetch &&
                              (node->tbmres ? !node->tbmres->recheck : false) &&
                              VM_ALL_VISIBLE(node->ss.ss_currentRelation,
                                             tbmpre->blockno,
                                             &node->pvmbuffer));

                if (!skip_fetch)
                    PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno);
            }
        }
    }
#endif                          /* USE_PREFETCH */
}

/*
 * BitmapHeapRecheck -- access method routine to recheck a tuple in EvalPlanQual
 */
static bool
BitmapHeapRecheck(BitmapHeapScanState *node, TupleTableSlot *slot)
{
    ExprContext *econtext;

    /*
     * extract necessary information from index scan node
     */
    econtext = node->ss.ps.ps_ExprContext;

    /* Does the tuple meet the original qual conditions? */
    econtext->ecxt_scantuple = slot;
    return ExecQualAndReset(node->bitmapqualorig, econtext);
}

/* ----------------------------------------------------------------
 *    ExecBitmapHeapScan(node)
 * ----------------------------------------------------------------
 */
static TupleTableSlot *
ExecBitmapHeapScan(PlanState *pstate)
{
    BitmapHeapScanState *node = castNode(BitmapHeapScanState, pstate);

    return ExecScan(&node->ss,
                    (ExecScanAccessMtd) BitmapHeapNext,
                    (ExecScanRecheckMtd) BitmapHeapRecheck);
}

/* ----------------------------------------------------------------
 *    ExecReScanBitmapHeapScan(node)
 * ----------------------------------------------------------------
 */
void
ExecReScanBitmapHeapScan(BitmapHeapScanState *node)
{
    PlanState  *outerPlan = outerPlanState(node);

    /* rescan to release any page pin */
    table_rescan(node->ss.ss_currentScanDesc, NULL);

    /* release bitmaps and buffers if any */
    if (node->tbmiterator)
        tbm_end_iterate(node->tbmiterator);
    if (node->prefetch_iterator)
        tbm_end_iterate(node->prefetch_iterator);
    if (node->shared_tbmiterator)
        tbm_end_shared_iterate(node->shared_tbmiterator);
    if (node->shared_prefetch_iterator)
        tbm_end_shared_iterate(node->shared_prefetch_iterator);
    if (node->tbm)
        tbm_free(node->tbm);
    if (node->vmbuffer != InvalidBuffer)
        ReleaseBuffer(node->vmbuffer);
    if (node->pvmbuffer != InvalidBuffer)
        ReleaseBuffer(node->pvmbuffer);
    node->tbm = NULL;
    node->tbmiterator = NULL;
    node->tbmres = NULL;
    node->prefetch_iterator = NULL;
    node->initialized = false;
    node->shared_tbmiterator = NULL;
    node->shared_prefetch_iterator = NULL;
    node->vmbuffer = InvalidBuffer;
    node->pvmbuffer = InvalidBuffer;

    ExecScanReScan(&node->ss);

    /*
     * if chgParam of subnode is not null then plan will be re-scanned by
     * first ExecProcNode.
     */
    if (outerPlan->chgParam == NULL)
        ExecReScan(outerPlan);
}

/* ----------------------------------------------------------------
 *    ExecEndBitmapHeapScan
 * ----------------------------------------------------------------
 */
void
ExecEndBitmapHeapScan(BitmapHeapScanState *node)
{
    TableScanDesc scanDesc;

    /*
     * extract information from the node
     */
    scanDesc = node->ss.ss_currentScanDesc;

    /*
     * Free the exprcontext
     */
    ExecFreeExprContext(&node->ss.ps);

    /*
     * clear out tuple table slots
     */
    if (node->ss.ps.ps_ResultTupleSlot)
        ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
    ExecClearTuple(node->ss.ss_ScanTupleSlot);

    /*
     * close down subplans
     */
    ExecEndNode(outerPlanState(node));

    /*
     * release bitmaps and buffers if any
     */
    if (node->tbmiterator)
        tbm_end_iterate(node->tbmiterator);
    if (node->prefetch_iterator)
        tbm_end_iterate(node->prefetch_iterator);
    if (node->tbm)
        tbm_free(node->tbm);
    if (node->shared_tbmiterator)
        tbm_end_shared_iterate(node->shared_tbmiterator);
    if (node->shared_prefetch_iterator)
        tbm_end_shared_iterate(node->shared_prefetch_iterator);
    if (node->vmbuffer != InvalidBuffer)
        ReleaseBuffer(node->vmbuffer);
    if (node->pvmbuffer != InvalidBuffer)
        ReleaseBuffer(node->pvmbuffer);

    /*
     * close heap scan
     */
    table_endscan(scanDesc);
}

/* ----------------------------------------------------------------
 *    ExecInitBitmapHeapScan
 *
 *    Initializes the scan's state information.
 * ----------------------------------------------------------------
 */
BitmapHeapScanState *
ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
{
    BitmapHeapScanState *scanstate;
    Relation    currentRelation;
    int         io_concurrency;

    /* check for unsupported flags */
    Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));

    /*
     * Assert caller didn't ask for an unsafe snapshot --- see comments at
     * head of file.
     */
    Assert(IsMVCCSnapshot(estate->es_snapshot));

    /*
     * create state structure
     */
    scanstate = makeNode(BitmapHeapScanState);
    scanstate->ss.ps.plan = (Plan *) node;
    scanstate->ss.ps.state = estate;
    scanstate->ss.ps.ExecProcNode = ExecBitmapHeapScan;

    scanstate->tbm = NULL;
    scanstate->tbmiterator = NULL;
    scanstate->tbmres = NULL;
    scanstate->return_empty_tuples = 0;
    scanstate->vmbuffer = InvalidBuffer;
    scanstate->pvmbuffer = InvalidBuffer;
    scanstate->exact_pages = 0;
    scanstate->lossy_pages = 0;
    scanstate->prefetch_iterator = NULL;
    scanstate->prefetch_pages = 0;
    scanstate->prefetch_target = 0;
    /* may be updated below */
    scanstate->prefetch_maximum = target_prefetch_pages;
    scanstate->pscan_len = 0;
    scanstate->initialized = false;
    scanstate->shared_tbmiterator = NULL;
    scanstate->shared_prefetch_iterator = NULL;
    scanstate->pstate = NULL;

    /*
     * We can potentially skip fetching heap pages if we do not need any
     * columns of the table, either for checking non-indexable quals or for
     * returning data.  This test is a bit simplistic, as it checks the
     * stronger condition that there's no qual or return tlist at all.  But
     * in most cases it's probably not worth working harder than that.
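     *
     * For instance, a plan for "SELECT count(*) FROM tab WHERE ..." can have
     * neither a qual nor a targetlist at this node, allowing the heap fetch
     * to be skipped for all-visible pages.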
     */
    scanstate->can_skip_fetch = (node->scan.plan.qual == NIL &&
                                 node->scan.plan.targetlist == NIL);

    /*
     * Miscellaneous initialization
     *
     * create expression context for node
     */
    ExecAssignExprContext(estate, &scanstate->ss.ps);

    /*
     * open the scan relation
     */
    currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid, eflags);

    /*
     * initialize child nodes
     */
    outerPlanState(scanstate) = ExecInitNode(outerPlan(node), estate, eflags);

    /*
     * get the scan type from the relation descriptor.
     */
    ExecInitScanTupleSlot(estate, &scanstate->ss,
                          RelationGetDescr(currentRelation),
                          table_slot_callbacks(currentRelation));

    /*
     * Initialize result type and projection.
     */
    ExecInitResultTypeTL(&scanstate->ss.ps);
    ExecAssignScanProjectionInfo(&scanstate->ss);

    /*
     * initialize child expressions
     */
    scanstate->ss.ps.qual =
        ExecInitQual(node->scan.plan.qual, (PlanState *) scanstate);
    scanstate->bitmapqualorig =
        ExecInitQual(node->bitmapqualorig, (PlanState *) scanstate);

    /*
     * Determine the maximum for prefetch_target.  If the tablespace has a
     * specific IO concurrency set, use that to compute the corresponding
     * maximum value; otherwise, we already initialized to the value computed
     * by the GUC machinery.
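     *
     * ComputeIoConcurrency applies the same conversion from an
     * effective_io_concurrency value to a prefetch-page count that the GUC
     * machinery uses to derive target_prefetch_pages.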
     */
    io_concurrency =
        get_tablespace_io_concurrency(currentRelation->rd_rel->reltablespace);
    if (io_concurrency != effective_io_concurrency)
    {
        double      maximum;

        if (ComputeIoConcurrency(io_concurrency, &maximum))
            scanstate->prefetch_maximum = rint(maximum);
    }

    scanstate->ss.ss_currentRelation = currentRelation;

    scanstate->ss.ss_currentScanDesc = table_beginscan_bm(currentRelation,
                                                          estate->es_snapshot,
                                                          0,
                                                          NULL);

    /*
     * all done.
     */
    return scanstate;
}

/*----------------
 *    BitmapShouldInitializeSharedState
 *
 * The first process to come here and see the state set to BM_INITIAL
 * will become the leader for the parallel bitmap scan and will be
 * responsible for populating the TIDBitmap.  The other processes will
 * be blocked by the condition variable until the leader wakes them up.
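 *
 * The shared state field thus moves through BM_INITIAL -> BM_INPROGRESS
 * (claimed by the leader) -> BM_FINISHED (bitmap built, sleepers awakened).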
 * ---------------
 */
static bool
BitmapShouldInitializeSharedState(ParallelBitmapHeapState *pstate)
{
    SharedBitmapState state;

    while (1)
    {
        SpinLockAcquire(&pstate->mutex);
        state = pstate->state;
        if (pstate->state == BM_INITIAL)
            pstate->state = BM_INPROGRESS;
        SpinLockRelease(&pstate->mutex);

        /* Exit if bitmap is done, or if we're the leader. */
        if (state != BM_INPROGRESS)
            break;

        /* Wait for the leader to wake us up. */
        ConditionVariableSleep(&pstate->cv, WAIT_EVENT_PARALLEL_BITMAP_SCAN);
    }

    ConditionVariableCancelSleep();

    return (state == BM_INITIAL);
}

/* ----------------------------------------------------------------
 *    ExecBitmapHeapEstimate
 *
 *    Compute the amount of space we'll need in the parallel
 *    query DSM, and inform pcxt->estimator about our needs.
 * ----------------------------------------------------------------
 */
void
ExecBitmapHeapEstimate(BitmapHeapScanState *node,
                       ParallelContext *pcxt)
{
    EState     *estate = node->ss.ps.state;

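    /*
     * The shared state is a ParallelBitmapHeapState with the serialized
     * snapshot appended in its phs_snapshot_data flexible array member,
     * hence the offsetof() arithmetic below.
     */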
    node->pscan_len = add_size(offsetof(ParallelBitmapHeapState,
                                        phs_snapshot_data),
                               EstimateSnapshotSpace(estate->es_snapshot));

    shm_toc_estimate_chunk(&pcxt->estimator, node->pscan_len);
    shm_toc_estimate_keys(&pcxt->estimator, 1);
}

/* ----------------------------------------------------------------
 *    ExecBitmapHeapInitializeDSM
 *
 *    Set up a parallel bitmap heap scan descriptor.
 * ----------------------------------------------------------------
 */
void
ExecBitmapHeapInitializeDSM(BitmapHeapScanState *node,
                            ParallelContext *pcxt)
{
    ParallelBitmapHeapState *pstate;
    EState     *estate = node->ss.ps.state;
    dsa_area   *dsa = node->ss.ps.state->es_query_dsa;

    /* If there's no DSA, there are no workers; initialize nothing. */
    if (dsa == NULL)
        return;

    pstate = shm_toc_allocate(pcxt->toc, node->pscan_len);

    pstate->tbmiterator = 0;
    pstate->prefetch_iterator = 0;

    /* Initialize the mutex */
    SpinLockInit(&pstate->mutex);
    pstate->prefetch_pages = 0;
    pstate->prefetch_target = 0;
    pstate->state = BM_INITIAL;

    ConditionVariableInit(&pstate->cv);
    SerializeSnapshot(estate->es_snapshot, pstate->phs_snapshot_data);

    shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, pstate);
    node->pstate = pstate;
}

/* ----------------------------------------------------------------
 *    ExecBitmapHeapReInitializeDSM
 *
 *    Reset shared state before beginning a fresh scan.
 * ----------------------------------------------------------------
 */
void
ExecBitmapHeapReInitializeDSM(BitmapHeapScanState *node,
                              ParallelContext *pcxt)
{
    ParallelBitmapHeapState *pstate = node->pstate;
    dsa_area   *dsa = node->ss.ps.state->es_query_dsa;

    /* If there's no DSA, there are no workers; do nothing. */
    if (dsa == NULL)
        return;

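    /* Reset to BM_INITIAL so that the next scan elects a leader afresh. */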
    pstate->state = BM_INITIAL;

    if (DsaPointerIsValid(pstate->tbmiterator))
        tbm_free_shared_area(dsa, pstate->tbmiterator);

    if (DsaPointerIsValid(pstate->prefetch_iterator))
        tbm_free_shared_area(dsa, pstate->prefetch_iterator);

    pstate->tbmiterator = InvalidDsaPointer;
    pstate->prefetch_iterator = InvalidDsaPointer;
}

/* ----------------------------------------------------------------
 *    ExecBitmapHeapInitializeWorker
 *
 *    Copy relevant information from TOC into planstate.
 * ----------------------------------------------------------------
 */
void
ExecBitmapHeapInitializeWorker(BitmapHeapScanState *node,
                               ParallelWorkerContext *pwcxt)
{
    ParallelBitmapHeapState *pstate;
    Snapshot    snapshot;

    Assert(node->ss.ps.state->es_query_dsa != NULL);

    pstate = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false);
    node->pstate = pstate;

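    /* Use the leader's serialized snapshot so all workers see the same data. */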
    snapshot = RestoreSnapshot(pstate->phs_snapshot_data);
    table_scan_update_snapshot(node->ss.ss_currentScanDesc, snapshot);
}