| 1 | /*------------------------------------------------------------------------- |
| 2 | * |
| 3 | * nodeHashjoin.c |
| 4 | * Routines to handle hash join nodes |
| 5 | * |
| 6 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
| 7 | * Portions Copyright (c) 1994, Regents of the University of California |
| 8 | * |
| 9 | * |
| 10 | * IDENTIFICATION |
| 11 | * src/backend/executor/nodeHashjoin.c |
| 12 | * |
| 13 | * PARALLELISM |
| 14 | * |
| 15 | * Hash joins can participate in parallel query execution in several ways. A |
| 16 | * parallel-oblivious hash join is one where the node is unaware that it is |
| 17 | * part of a parallel plan. In this case, a copy of the inner plan is used to |
| 18 | * build a copy of the hash table in every backend, and the outer plan could |
| 19 | * either be built from a partial or complete path, so that the results of the |
| 20 | * hash join are correspondingly either partial or complete. A parallel-aware |
| 21 | * hash join is one that behaves differently, coordinating work between |
| 22 | * backends, and appears as Parallel Hash Join in EXPLAIN output. A Parallel |
| 23 | * Hash Join always appears with a Parallel Hash node. |
| 24 | * |
| 25 | * Parallel-aware hash joins use the same per-backend state machine to track |
| 26 | * progress through the hash join algorithm as parallel-oblivious hash joins. |
| 27 | * In a parallel-aware hash join, there is also a shared state machine that |
| 28 | * co-operating backends use to synchronize their local state machines and |
| 29 | * program counters. The shared state machine is managed with a Barrier IPC |
| 30 | * primitive. When all attached participants arrive at a barrier, the phase |
| 31 | * advances and all waiting participants are released. |
| 32 | * |
| 33 | * When a participant begins working on a parallel hash join, it must first |
| 34 | * figure out how much progress has already been made, because participants |
| 35 | * don't wait for each other to begin. For this reason there are switch |
| 36 | * statements at key points in the code where we have to synchronize our local |
| 37 | * state machine with the phase, and then jump to the correct part of the |
| 38 | * algorithm so that we can get started. |
| 39 | * |
| 40 | * One barrier called build_barrier is used to coordinate the hashing phases. |
| 41 | * The phase is represented by an integer which begins at zero and increments |
| 42 | * one by one, but in the code it is referred to by symbolic names as follows: |
| 43 | * |
| 44 | * PHJ_BUILD_ELECTING -- initial state |
| 45 | * PHJ_BUILD_ALLOCATING -- one sets up the batches and table 0 |
| 46 | * PHJ_BUILD_HASHING_INNER -- all hash the inner rel |
| 47 | * PHJ_BUILD_HASHING_OUTER -- (multi-batch only) all hash the outer |
| 48 | * PHJ_BUILD_DONE -- building done, probing can begin |
| 49 | * |
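| | * For example, a participant that attaches to build_barrier after work has |
| | * already begun must branch on whatever phase it observes, along these |
| | * lines (a simplified sketch only; the authoritative logic lives in the |
| | * parallel branch of HJ_BUILD_HASHTABLE below and in MultiExecParallelHash() |
| | * in nodeHash.c): |
| | * |
| | * switch (BarrierAttach(&pstate->build_barrier)) |
| | * { |
| | * case PHJ_BUILD_ELECTING: |
| | * case PHJ_BUILD_ALLOCATING: |
| | * case PHJ_BUILD_HASHING_INNER: |
| | * ... in time to help hash the inner relation ... |
| | * case PHJ_BUILD_HASHING_OUTER: |
| | * case PHJ_BUILD_DONE: |
| | * ... too late to help build; skip ahead to probing batches ... |
| | * } |
| | * |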
| 50 | * While in the phase PHJ_BUILD_HASHING_INNER a separate pair of barriers may |
| 51 | * be used repeatedly as required to coordinate expansions in the number of |
| 52 | * batches or buckets. Their phases are as follows: |
| 53 | * |
| 54 | * PHJ_GROW_BATCHES_ELECTING -- initial state |
| 55 | * PHJ_GROW_BATCHES_ALLOCATING -- one allocates new batches |
| 56 | * PHJ_GROW_BATCHES_REPARTITIONING -- all repartition |
| 57 | * PHJ_GROW_BATCHES_FINISHING -- one cleans up, detects skew |
| 58 | * |
| 59 | * PHJ_GROW_BUCKETS_ELECTING -- initial state |
| 60 | * PHJ_GROW_BUCKETS_ALLOCATING -- one allocates new buckets |
| 61 | * PHJ_GROW_BUCKETS_REINSERTING -- all insert tuples |
| 62 | * |
| 63 | * If the planner got the number of batches and buckets right, those won't be |
| 64 | * necessary, but on the other hand we might finish up needing to expand the |
| 65 | * buckets or batches multiple times while hashing the inner relation to stay |
| 66 | * within our memory budget and load factor target. For that reason it's a |
| 67 | * separate pair of barriers using circular phases. |
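| | * Because these two barriers cycle, the code maps the ever-increasing |
| | * barrier phase number back onto the symbolic names above using modular |
| | * helpers (named along the lines of PHJ_GROW_BATCHES_PHASE() and |
| | * PHJ_GROW_BUCKETS_PHASE(); see hashjoin.h for the authoritative |
| | * definitions). |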
| 68 | * |
| 69 | * The PHJ_BUILD_HASHING_OUTER phase is required only for multi-batch joins, |
| 70 | * because we need to divide the outer relation into batches up front in order |
| 71 | * to be able to process batches entirely independently. In contrast, the |
| 72 | * parallel-oblivious algorithm simply throws tuples 'forward' to 'later' |
| 73 | * batches whenever it encounters them while scanning and probing, which it |
| 74 | * can do because it processes batches in serial order. |
| 75 | * |
| 76 | * Once PHJ_BUILD_DONE is reached, backends then split up and process |
| 77 | * different batches, or gang up and work together on probing batches if there |
| 78 | * aren't enough to go around. For each batch there is a separate barrier |
| 79 | * with the following phases: |
| 80 | * |
| 81 | * PHJ_BATCH_ELECTING -- initial state |
| 82 | * PHJ_BATCH_ALLOCATING -- one allocates buckets |
| 83 | * PHJ_BATCH_LOADING -- all load the hash table from disk |
| 84 | * PHJ_BATCH_PROBING -- all probe |
| 85 | * PHJ_BATCH_DONE -- end |
| 86 | * |
| 87 | * Batch 0 is a special case, because it starts out in phase |
| 88 | * PHJ_BATCH_PROBING; populating batch 0's hash table is done during |
| 89 | * PHJ_BUILD_HASHING_INNER so we can skip loading. |
| 90 | * |
| 91 | * Initially we try to plan for a single-batch hash join using the combined |
| 92 | * work_mem of all participants to create a large shared hash table. If that |
| 93 | * turns out either at planning or execution time to be impossible then we |
| 94 | * fall back to regular work_mem sized hash tables. |
| 95 | * |
| 96 | * To avoid deadlocks, we never wait for any barrier unless it is known that |
| 97 | * all other backends attached to it are actively executing the node or have |
| 98 | * already arrived. Practically, that means that we never return a tuple |
| 99 | * while attached to a barrier, unless the barrier has reached its final |
| 100 | * state. In the slightly special case of the per-batch barrier, we return |
| 101 | * tuples while in PHJ_BATCH_PROBING phase, but that's OK because we use |
| 102 | * BarrierArriveAndDetach() to advance it to PHJ_BATCH_DONE without waiting. |
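| | * (The hazard being avoided: once a participant has returned a tuple, the |
| | * plan above it might not call it again promptly, or ever, for example if a |
| | * LIMIT has already been satisfied, so anyone still waiting for it at a |
| | * barrier could be stuck indefinitely.) |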
| 103 | * |
| 104 | *------------------------------------------------------------------------- |
| 105 | */ |
| 106 | |
| 107 | #include "postgres.h" |
| 108 | |
| 109 | #include "access/htup_details.h" |
| 110 | #include "access/parallel.h" |
| 111 | #include "executor/executor.h" |
| 112 | #include "executor/hashjoin.h" |
| 113 | #include "executor/nodeHash.h" |
| 114 | #include "executor/nodeHashjoin.h" |
| 115 | #include "miscadmin.h" |
| 116 | #include "pgstat.h" |
| 117 | #include "utils/memutils.h" |
| 118 | #include "utils/sharedtuplestore.h" |
| 119 | |
| 120 | |
| 121 | /* |
| 122 | * States of the ExecHashJoin state machine |
| 123 | */ |
| 124 | #define HJ_BUILD_HASHTABLE 1 |
| 125 | #define HJ_NEED_NEW_OUTER 2 |
| 126 | #define HJ_SCAN_BUCKET 3 |
| 127 | #define HJ_FILL_OUTER_TUPLE 4 |
| 128 | #define HJ_FILL_INNER_TUPLES 5 |
| 129 | #define HJ_NEED_NEW_BATCH 6 |
| 130 | |
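| | /* |
| | * Roughly, the states flow as follows: HJ_BUILD_HASHTABLE runs once and |
| | * then hands control to HJ_NEED_NEW_OUTER (or to HJ_NEED_NEW_BATCH in the |
| | * parallel-aware case); HJ_NEED_NEW_OUTER fetches an outer tuple and moves |
| | * to HJ_SCAN_BUCKET, or to HJ_FILL_INNER_TUPLES / HJ_NEED_NEW_BATCH once the |
| | * outer side of the current batch is exhausted; HJ_SCAN_BUCKET emits matches |
| | * and then goes to HJ_FILL_OUTER_TUPLE, which returns to HJ_NEED_NEW_OUTER; |
| | * HJ_NEED_NEW_BATCH either advances to the next batch or ends the join. See |
| | * the switch statement in ExecHashJoinImpl() for the details. |
| | */ |
| | |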
| 131 | /* Returns true if doing null-fill on outer relation */ |
| 132 | #define HJ_FILL_OUTER(hjstate) ((hjstate)->hj_NullInnerTupleSlot != NULL) |
| 133 | /* Returns true if doing null-fill on inner relation */ |
| 134 | #define HJ_FILL_INNER(hjstate) ((hjstate)->hj_NullOuterTupleSlot != NULL) |
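| | |
| | /* |
| | * The null-fill slots are created in ExecInitHashJoin(): JOIN_LEFT and |
| | * JOIN_ANTI get a null inner-tuple slot (so HJ_FILL_OUTER is true), |
| | * JOIN_RIGHT gets a null outer-tuple slot (so HJ_FILL_INNER is true), and |
| | * JOIN_FULL gets both. |
| | */ |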
| 135 | |
| 136 | static TupleTableSlot *ExecHashJoinOuterGetTuple(PlanState *outerNode, |
| 137 | HashJoinState *hjstate, |
| 138 | uint32 *hashvalue); |
| 139 | static TupleTableSlot *ExecParallelHashJoinOuterGetTuple(PlanState *outerNode, |
| 140 | HashJoinState *hjstate, |
| 141 | uint32 *hashvalue); |
| 142 | static TupleTableSlot *ExecHashJoinGetSavedTuple(HashJoinState *hjstate, |
| 143 | BufFile *file, |
| 144 | uint32 *hashvalue, |
| 145 | TupleTableSlot *tupleSlot); |
| 146 | static bool ExecHashJoinNewBatch(HashJoinState *hjstate); |
| 147 | static bool ExecParallelHashJoinNewBatch(HashJoinState *hjstate); |
| 148 | static void ExecParallelHashJoinPartitionOuter(HashJoinState *node); |
| 149 | |
| 150 | |
| 151 | /* ---------------------------------------------------------------- |
| 152 | * ExecHashJoinImpl |
| 153 | * |
| 154 | * This function implements the Hybrid Hashjoin algorithm. It is marked |
| 155 | * with an always-inline attribute so that ExecHashJoin() and |
| 156 | * ExecParallelHashJoin() can inline it. Compilers that respect the |
| 157 | * attribute should create versions specialized for parallel == true and |
| 158 | * parallel == false with unnecessary branches removed. |
| 159 | * |
| 160 | * Note: the relation we build hash table on is the "inner" |
| 161 | * the other one is "outer". |
| 162 | * ---------------------------------------------------------------- |
| 163 | */ |
| 164 | static pg_attribute_always_inline TupleTableSlot * |
| 165 | ExecHashJoinImpl(PlanState *pstate, bool parallel) |
| 166 | { |
| 167 | HashJoinState *node = castNode(HashJoinState, pstate); |
| 168 | PlanState *outerNode; |
| 169 | HashState *hashNode; |
| 170 | ExprState *joinqual; |
| 171 | ExprState *otherqual; |
| 172 | ExprContext *econtext; |
| 173 | HashJoinTable hashtable; |
| 174 | TupleTableSlot *outerTupleSlot; |
| 175 | uint32 hashvalue; |
| 176 | int batchno; |
| 177 | ParallelHashJoinState *parallel_state; |
| 178 | |
| 179 | /* |
| 180 | * get information from HashJoin node |
| 181 | */ |
| 182 | joinqual = node->js.joinqual; |
| 183 | otherqual = node->js.ps.qual; |
| 184 | hashNode = (HashState *) innerPlanState(node); |
| 185 | outerNode = outerPlanState(node); |
| 186 | hashtable = node->hj_HashTable; |
| 187 | econtext = node->js.ps.ps_ExprContext; |
| 188 | parallel_state = hashNode->parallel_state; |
| 189 | |
| 190 | /* |
| 191 | * Reset per-tuple memory context to free any expression evaluation |
| 192 | * storage allocated in the previous tuple cycle. |
| 193 | */ |
| 194 | ResetExprContext(econtext); |
| 195 | |
| 196 | /* |
| 197 | * run the hash join state machine |
| 198 | */ |
| 199 | for (;;) |
| 200 | { |
| 201 | /* |
| 202 | * It's possible to iterate this loop many times before returning a |
| 203 | * tuple, in some pathological cases such as needing to move much of |
| 204 | * the current batch to a later batch. So let's check for interrupts |
| 205 | * each time through. |
| 206 | */ |
| 207 | CHECK_FOR_INTERRUPTS(); |
| 208 | |
| 209 | switch (node->hj_JoinState) |
| 210 | { |
| 211 | case HJ_BUILD_HASHTABLE: |
| 212 | |
| 213 | /* |
| 214 | * First time through: build hash table for inner relation. |
| 215 | */ |
| 216 | Assert(hashtable == NULL); |
| 217 | |
| 218 | /* |
| 219 | * If the outer relation is completely empty, and it's not |
| 220 | * right/full join, we can quit without building the hash |
| 221 | * table. However, for an inner join it is only a win to |
| 222 | * check this when the outer relation's startup cost is less |
| 223 | * than the projected cost of building the hash table. |
| 224 | * Otherwise it's best to build the hash table first and see |
| 225 | * if the inner relation is empty. (When it's a left join, we |
| 226 | * should always make this check, since we aren't going to be |
| 227 | * able to skip the join on the strength of an empty inner |
| 228 | * relation anyway.) |
| 229 | * |
| 230 | * If we are rescanning the join, we make use of information |
| 231 | * gained on the previous scan: don't bother to try the |
| 232 | * prefetch if the previous scan found the outer relation |
| 233 | * nonempty. This is not 100% reliable since with new |
| 234 | * parameters the outer relation might yield different |
| 235 | * results, but it's a good heuristic. |
| 236 | * |
| 237 | * The only way to make the check is to try to fetch a tuple |
| 238 | * from the outer plan node. If we succeed, we have to stash |
| 239 | * it away for later consumption by ExecHashJoinOuterGetTuple. |
| 240 | */ |
| 241 | if (HJ_FILL_INNER(node)) |
| 242 | { |
| 243 | /* no chance to not build the hash table */ |
| 244 | node->hj_FirstOuterTupleSlot = NULL; |
| 245 | } |
| 246 | else if (parallel) |
| 247 | { |
| 248 | /* |
| 249 | * The empty-outer optimization is not implemented for |
| 250 | * shared hash tables, because no one participant can |
| 251 | * determine that there are no outer tuples, and it's not |
| 252 | * yet clear that it's worth the synchronization overhead |
| 253 | * of reaching consensus to figure that out. So we have |
| 254 | * to build the hash table. |
| 255 | */ |
| 256 | node->hj_FirstOuterTupleSlot = NULL; |
| 257 | } |
| 258 | else if (HJ_FILL_OUTER(node) || |
| 259 | (outerNode->plan->startup_cost < hashNode->ps.plan->total_cost && |
| 260 | !node->hj_OuterNotEmpty)) |
| 261 | { |
| 262 | node->hj_FirstOuterTupleSlot = ExecProcNode(outerNode); |
| 263 | if (TupIsNull(node->hj_FirstOuterTupleSlot)) |
| 264 | { |
| 265 | node->hj_OuterNotEmpty = false; |
| 266 | return NULL; |
| 267 | } |
| 268 | else |
| 269 | node->hj_OuterNotEmpty = true; |
| 270 | } |
| 271 | else |
| 272 | node->hj_FirstOuterTupleSlot = NULL; |
| 273 | |
| 274 | /* |
| 275 | * Create the hash table. If using Parallel Hash, then |
| 276 | * whoever gets here first will create the hash table and any |
| 277 | * later arrivals will merely attach to it. |
| 278 | */ |
| 279 | hashtable = ExecHashTableCreate(hashNode, |
| 280 | node->hj_HashOperators, |
| 281 | node->hj_Collations, |
| 282 | HJ_FILL_INNER(node)); |
| 283 | node->hj_HashTable = hashtable; |
| 284 | |
| 285 | /* |
| 286 | * Execute the Hash node, to build the hash table. If using |
| 287 | * Parallel Hash, then we'll try to help hashing unless we |
| 288 | * arrived too late. |
| 289 | */ |
| 290 | hashNode->hashtable = hashtable; |
| 291 | (void) MultiExecProcNode((PlanState *) hashNode); |
| 292 | |
| 293 | /* |
| 294 | * If the inner relation is completely empty, and we're not |
| 295 | * doing a left outer join, we can quit without scanning the |
| 296 | * outer relation. |
| 297 | */ |
| 298 | if (hashtable->totalTuples == 0 && !HJ_FILL_OUTER(node)) |
| 299 | return NULL; |
| 300 | |
| 301 | /* |
| 302 | * need to remember whether nbatch has increased since we |
| 303 | * began scanning the outer relation |
| 304 | */ |
| 305 | hashtable->nbatch_outstart = hashtable->nbatch; |
| 306 | |
| 307 | /* |
| 308 | * Reset OuterNotEmpty for scan. (It's OK if we fetched a |
| 309 | * tuple above, because ExecHashJoinOuterGetTuple will |
| 310 | * immediately set it again.) |
| 311 | */ |
| 312 | node->hj_OuterNotEmpty = false; |
| 313 | |
| 314 | if (parallel) |
| 315 | { |
| 316 | Barrier *build_barrier; |
| 317 | |
| 318 | build_barrier = ¶llel_state->build_barrier; |
| 319 | Assert(BarrierPhase(build_barrier) == PHJ_BUILD_HASHING_OUTER || |
| 320 | BarrierPhase(build_barrier) == PHJ_BUILD_DONE); |
| 321 | if (BarrierPhase(build_barrier) == PHJ_BUILD_HASHING_OUTER) |
| 322 | { |
| 323 | /* |
| 324 | * If multi-batch, we need to hash the outer relation |
| 325 | * up front. |
| 326 | */ |
| 327 | if (hashtable->nbatch > 1) |
| 328 | ExecParallelHashJoinPartitionOuter(node); |
| 329 | BarrierArriveAndWait(build_barrier, |
| 330 | WAIT_EVENT_HASH_BUILD_HASHING_OUTER); |
| 331 | } |
| 332 | Assert(BarrierPhase(build_barrier) == PHJ_BUILD_DONE); |
| 333 | |
| 334 | /* Each backend should now select a batch to work on. */ |
| 335 | hashtable->curbatch = -1; |
| 336 | node->hj_JoinState = HJ_NEED_NEW_BATCH; |
| 337 | |
| 338 | continue; |
| 339 | } |
| 340 | else |
| 341 | node->hj_JoinState = HJ_NEED_NEW_OUTER; |
| 342 | |
| 343 | /* FALL THRU */ |
| 344 | |
| 345 | case HJ_NEED_NEW_OUTER: |
| 346 | |
| 347 | /* |
| 348 | * We don't have an outer tuple, try to get the next one |
| 349 | */ |
| 350 | if (parallel) |
| 351 | outerTupleSlot = |
| 352 | ExecParallelHashJoinOuterGetTuple(outerNode, node, |
| 353 | &hashvalue); |
| 354 | else |
| 355 | outerTupleSlot = |
| 356 | ExecHashJoinOuterGetTuple(outerNode, node, &hashvalue); |
| 357 | |
| 358 | if (TupIsNull(outerTupleSlot)) |
| 359 | { |
| 360 | /* end of batch, or maybe whole join */ |
| 361 | if (HJ_FILL_INNER(node)) |
| 362 | { |
| 363 | /* set up to scan for unmatched inner tuples */ |
| 364 | ExecPrepHashTableForUnmatched(node); |
| 365 | node->hj_JoinState = HJ_FILL_INNER_TUPLES; |
| 366 | } |
| 367 | else |
| 368 | node->hj_JoinState = HJ_NEED_NEW_BATCH; |
| 369 | continue; |
| 370 | } |
| 371 | |
| 372 | econtext->ecxt_outertuple = outerTupleSlot; |
| 373 | node->hj_MatchedOuter = false; |
| 374 | |
| 375 | /* |
| 376 | * Find the corresponding bucket for this tuple in the main |
| 377 | * hash table or skew hash table. |
| 378 | */ |
| 379 | node->hj_CurHashValue = hashvalue; |
| 380 | ExecHashGetBucketAndBatch(hashtable, hashvalue, |
| 381 | &node->hj_CurBucketNo, &batchno); |
| 382 | node->hj_CurSkewBucketNo = ExecHashGetSkewBucket(hashtable, |
| 383 | hashvalue); |
| 384 | node->hj_CurTuple = NULL; |
| 385 | |
| 386 | /* |
| 387 | * The tuple might not belong to the current batch (where |
| 388 | * "current batch" includes the skew buckets if any). |
| 389 | */ |
| 390 | if (batchno != hashtable->curbatch && |
| 391 | node->hj_CurSkewBucketNo == INVALID_SKEW_BUCKET_NO) |
| 392 | { |
| 393 | bool shouldFree; |
| 394 | MinimalTuple mintuple = ExecFetchSlotMinimalTuple(outerTupleSlot, |
| 395 | &shouldFree); |
| 396 | |
| 397 | /* |
| 398 | * Need to postpone this outer tuple to a later batch. |
| 399 | * Save it in the corresponding outer-batch file. |
| 400 | */ |
| 401 | Assert(parallel_state == NULL); |
| 402 | Assert(batchno > hashtable->curbatch); |
| 403 | ExecHashJoinSaveTuple(mintuple, hashvalue, |
| 404 | &hashtable->outerBatchFile[batchno]); |
| 405 | |
| 406 | if (shouldFree) |
| 407 | heap_free_minimal_tuple(mintuple); |
| 408 | |
| 409 | /* Loop around, staying in HJ_NEED_NEW_OUTER state */ |
| 410 | continue; |
| 411 | } |
| 412 | |
| 413 | /* OK, let's scan the bucket for matches */ |
| 414 | node->hj_JoinState = HJ_SCAN_BUCKET; |
| 415 | |
| 416 | /* FALL THRU */ |
| 417 | |
| 418 | case HJ_SCAN_BUCKET: |
| 419 | |
| 420 | /* |
| 421 | * Scan the selected hash bucket for matches to current outer |
| 422 | */ |
| 423 | if (parallel) |
| 424 | { |
| 425 | if (!ExecParallelScanHashBucket(node, econtext)) |
| 426 | { |
| 427 | /* out of matches; check for possible outer-join fill */ |
| 428 | node->hj_JoinState = HJ_FILL_OUTER_TUPLE; |
| 429 | continue; |
| 430 | } |
| 431 | } |
| 432 | else |
| 433 | { |
| 434 | if (!ExecScanHashBucket(node, econtext)) |
| 435 | { |
| 436 | /* out of matches; check for possible outer-join fill */ |
| 437 | node->hj_JoinState = HJ_FILL_OUTER_TUPLE; |
| 438 | continue; |
| 439 | } |
| 440 | } |
| 441 | |
| 442 | /* |
| 443 | * We've got a match, but still need to test non-hashed quals. |
| 444 | * ExecScanHashBucket already set up all the state needed to |
| 445 | * call ExecQual. |
| 446 | * |
| 447 | * If we pass the qual, then save state for next call and have |
| 448 | * ExecProject form the projection, store it in the tuple |
| 449 | * table, and return the slot. |
| 450 | * |
| 451 | * Only the joinquals determine tuple match status, but all |
| 452 | * quals must pass to actually return the tuple. |
| 453 | */ |
| 454 | if (joinqual == NULL || ExecQual(joinqual, econtext)) |
| 455 | { |
| 456 | node->hj_MatchedOuter = true; |
| 457 | HeapTupleHeaderSetMatch(HJTUPLE_MINTUPLE(node->hj_CurTuple)); |
| 458 | |
| 459 | /* In an antijoin, we never return a matched tuple */ |
| 460 | if (node->js.jointype == JOIN_ANTI) |
| 461 | { |
| 462 | node->hj_JoinState = HJ_NEED_NEW_OUTER; |
| 463 | continue; |
| 464 | } |
| 465 | |
| 466 | /* |
| 467 | * If we only need to join to the first matching inner |
| 468 | * tuple, then consider returning this one, but after that |
| 469 | * continue with next outer tuple. |
| 470 | */ |
| 471 | if (node->js.single_match) |
| 472 | node->hj_JoinState = HJ_NEED_NEW_OUTER; |
| 473 | |
| 474 | if (otherqual == NULL || ExecQual(otherqual, econtext)) |
| 475 | return ExecProject(node->js.ps.ps_ProjInfo); |
| 476 | else |
| 477 | InstrCountFiltered2(node, 1); |
| 478 | } |
| 479 | else |
| 480 | InstrCountFiltered1(node, 1); |
| 481 | break; |
| 482 | |
| 483 | case HJ_FILL_OUTER_TUPLE: |
| 484 | |
| 485 | /* |
| 486 | * The current outer tuple has run out of matches, so check |
| 487 | * whether to emit a dummy outer-join tuple. Whether we emit |
| 488 | * one or not, the next state is NEED_NEW_OUTER. |
| 489 | */ |
| 490 | node->hj_JoinState = HJ_NEED_NEW_OUTER; |
| 491 | |
| 492 | if (!node->hj_MatchedOuter && |
| 493 | HJ_FILL_OUTER(node)) |
| 494 | { |
| 495 | /* |
| 496 | * Generate a fake join tuple with nulls for the inner |
| 497 | * tuple, and return it if it passes the non-join quals. |
| 498 | */ |
| 499 | econtext->ecxt_innertuple = node->hj_NullInnerTupleSlot; |
| 500 | |
| 501 | if (otherqual == NULL || ExecQual(otherqual, econtext)) |
| 502 | return ExecProject(node->js.ps.ps_ProjInfo); |
| 503 | else |
| 504 | InstrCountFiltered2(node, 1); |
| 505 | } |
| 506 | break; |
| 507 | |
| 508 | case HJ_FILL_INNER_TUPLES: |
| 509 | |
| 510 | /* |
| 511 | * We have finished a batch, but we are doing right/full join, |
| 512 | * so any unmatched inner tuples in the hashtable have to be |
| 513 | * emitted before we continue to the next batch. |
| 514 | */ |
| 515 | if (!ExecScanHashTableForUnmatched(node, econtext)) |
| 516 | { |
| 517 | /* no more unmatched tuples */ |
| 518 | node->hj_JoinState = HJ_NEED_NEW_BATCH; |
| 519 | continue; |
| 520 | } |
| 521 | |
| 522 | /* |
| 523 | * Generate a fake join tuple with nulls for the outer tuple, |
| 524 | * and return it if it passes the non-join quals. |
| 525 | */ |
| 526 | econtext->ecxt_outertuple = node->hj_NullOuterTupleSlot; |
| 527 | |
| 528 | if (otherqual == NULL || ExecQual(otherqual, econtext)) |
| 529 | return ExecProject(node->js.ps.ps_ProjInfo); |
| 530 | else |
| 531 | InstrCountFiltered2(node, 1); |
| 532 | break; |
| 533 | |
| 534 | case HJ_NEED_NEW_BATCH: |
| 535 | |
| 536 | /* |
| 537 | * Try to advance to next batch. Done if there are no more. |
| 538 | */ |
| 539 | if (parallel) |
| 540 | { |
| 541 | if (!ExecParallelHashJoinNewBatch(node)) |
| 542 | return NULL; /* end of parallel-aware join */ |
| 543 | } |
| 544 | else |
| 545 | { |
| 546 | if (!ExecHashJoinNewBatch(node)) |
| 547 | return NULL; /* end of parallel-oblivious join */ |
| 548 | } |
| 549 | node->hj_JoinState = HJ_NEED_NEW_OUTER; |
| 550 | break; |
| 551 | |
| 552 | default: |
| 553 | elog(ERROR, "unrecognized hashjoin state: %d", |
| 554 | (int) node->hj_JoinState); |
| 555 | } |
| 556 | } |
| 557 | } |
| 558 | |
| 559 | /* ---------------------------------------------------------------- |
| 560 | * ExecHashJoin |
| 561 | * |
| 562 | * Parallel-oblivious version. |
| 563 | * ---------------------------------------------------------------- |
| 564 | */ |
| 565 | static TupleTableSlot * /* return: a tuple or NULL */ |
| 566 | ExecHashJoin(PlanState *pstate) |
| 567 | { |
| 568 | /* |
| 569 | * On sufficiently smart compilers this should be inlined with the |
| 570 | * parallel-aware branches removed. |
| 571 | */ |
| 572 | return ExecHashJoinImpl(pstate, false); |
| 573 | } |
| 574 | |
| 575 | /* ---------------------------------------------------------------- |
| 576 | * ExecParallelHashJoin |
| 577 | * |
| 578 | * Parallel-aware version. |
| 579 | * ---------------------------------------------------------------- |
| 580 | */ |
| 581 | static TupleTableSlot * /* return: a tuple or NULL */ |
| 582 | ExecParallelHashJoin(PlanState *pstate) |
| 583 | { |
| 584 | /* |
| 585 | * On sufficiently smart compilers this should be inlined with the |
| 586 | * parallel-oblivious branches removed. |
| 587 | */ |
| 588 | return ExecHashJoinImpl(pstate, true); |
| 589 | } |
| 590 | |
| 591 | /* ---------------------------------------------------------------- |
| 592 | * ExecInitHashJoin |
| 593 | * |
| 594 | * Init routine for HashJoin node. |
| 595 | * ---------------------------------------------------------------- |
| 596 | */ |
| 597 | HashJoinState * |
| 598 | ExecInitHashJoin(HashJoin *node, EState *estate, int eflags) |
| 599 | { |
| 600 | HashJoinState *hjstate; |
| 601 | Plan *outerNode; |
| 602 | Hash *hashNode; |
| 603 | TupleDesc outerDesc, |
| 604 | innerDesc; |
| 605 | const TupleTableSlotOps *ops; |
| 606 | |
| 607 | /* check for unsupported flags */ |
| 608 | Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); |
| 609 | |
| 610 | /* |
| 611 | * create state structure |
| 612 | */ |
| 613 | hjstate = makeNode(HashJoinState); |
| 614 | hjstate->js.ps.plan = (Plan *) node; |
| 615 | hjstate->js.ps.state = estate; |
| 616 | |
| 617 | /* |
| 618 | * See ExecHashJoinInitializeDSM() and ExecHashJoinInitializeWorker() |
| 619 | * where this function may be replaced with a parallel version, if we |
| 620 | * managed to launch a parallel query. |
| 621 | */ |
| 622 | hjstate->js.ps.ExecProcNode = ExecHashJoin; |
| 623 | hjstate->js.jointype = node->join.jointype; |
| 624 | |
| 625 | /* |
| 626 | * Miscellaneous initialization |
| 627 | * |
| 628 | * create expression context for node |
| 629 | */ |
| 630 | ExecAssignExprContext(estate, &hjstate->js.ps); |
| 631 | |
| 632 | /* |
| 633 | * initialize child nodes |
| 634 | * |
| 635 | * Note: we could suppress the REWIND flag for the inner input, which |
| 636 | * would amount to betting that the hash will be a single batch. Not |
| 637 | * clear if this would be a win or not. |
| 638 | */ |
| 639 | outerNode = outerPlan(node); |
| 640 | hashNode = (Hash *) innerPlan(node); |
| 641 | |
| 642 | outerPlanState(hjstate) = ExecInitNode(outerNode, estate, eflags); |
| 643 | outerDesc = ExecGetResultType(outerPlanState(hjstate)); |
| 644 | innerPlanState(hjstate) = ExecInitNode((Plan *) hashNode, estate, eflags); |
| 645 | innerDesc = ExecGetResultType(innerPlanState(hjstate)); |
| 646 | |
| 647 | /* |
| 648 | * Initialize result slot, type and projection. |
| 649 | */ |
| 650 | ExecInitResultTupleSlotTL(&hjstate->js.ps, &TTSOpsVirtual); |
| 651 | ExecAssignProjectionInfo(&hjstate->js.ps, NULL); |
| 652 | |
| 653 | /* |
| 654 | * tuple table initialization |
| 655 | */ |
| 656 | ops = ExecGetResultSlotOps(outerPlanState(hjstate), NULL); |
| 657 | hjstate->hj_OuterTupleSlot = ExecInitExtraTupleSlot(estate, outerDesc, |
| 658 | ops); |
| 659 | |
| 660 | /* |
| 661 | * detect whether we need only consider the first matching inner tuple |
| 662 | */ |
| 663 | hjstate->js.single_match = (node->join.inner_unique || |
| 664 | node->join.jointype == JOIN_SEMI); |
| 665 | |
| 666 | /* set up null tuples for outer joins, if needed */ |
| 667 | switch (node->join.jointype) |
| 668 | { |
| 669 | case JOIN_INNER: |
| 670 | case JOIN_SEMI: |
| 671 | break; |
| 672 | case JOIN_LEFT: |
| 673 | case JOIN_ANTI: |
| 674 | hjstate->hj_NullInnerTupleSlot = |
| 675 | ExecInitNullTupleSlot(estate, innerDesc, &TTSOpsVirtual); |
| 676 | break; |
| 677 | case JOIN_RIGHT: |
| 678 | hjstate->hj_NullOuterTupleSlot = |
| 679 | ExecInitNullTupleSlot(estate, outerDesc, &TTSOpsVirtual); |
| 680 | break; |
| 681 | case JOIN_FULL: |
| 682 | hjstate->hj_NullOuterTupleSlot = |
| 683 | ExecInitNullTupleSlot(estate, outerDesc, &TTSOpsVirtual); |
| 684 | hjstate->hj_NullInnerTupleSlot = |
| 685 | ExecInitNullTupleSlot(estate, innerDesc, &TTSOpsVirtual); |
| 686 | break; |
| 687 | default: |
| 688 | elog(ERROR, "unrecognized join type: %d", |
| 689 | (int) node->join.jointype); |
| 690 | } |
| 691 | |
| 692 | /* |
| 693 | * now for some voodoo. our temporary tuple slot is actually the result |
| 694 | * tuple slot of the Hash node (which is our inner plan). we can do this |
| 695 | * because Hash nodes don't return tuples via ExecProcNode() -- instead |
| 696 | * the hash join node uses ExecScanHashBucket() to get at the contents of |
| 697 | * the hash table. -cim 6/9/91 |
| 698 | */ |
| 699 | { |
| 700 | HashState *hashstate = (HashState *) innerPlanState(hjstate); |
| 701 | TupleTableSlot *slot = hashstate->ps.ps_ResultTupleSlot; |
| 702 | |
| 703 | hjstate->hj_HashTupleSlot = slot; |
| 704 | } |
| 705 | |
| 706 | /* |
| 707 | * initialize child expressions |
| 708 | */ |
| 709 | hjstate->js.ps.qual = |
| 710 | ExecInitQual(node->join.plan.qual, (PlanState *) hjstate); |
| 711 | hjstate->js.joinqual = |
| 712 | ExecInitQual(node->join.joinqual, (PlanState *) hjstate); |
| 713 | hjstate->hashclauses = |
| 714 | ExecInitQual(node->hashclauses, (PlanState *) hjstate); |
| 715 | |
| 716 | /* |
| 717 | * initialize hash-specific info |
| 718 | */ |
| 719 | hjstate->hj_HashTable = NULL; |
| 720 | hjstate->hj_FirstOuterTupleSlot = NULL; |
| 721 | |
| 722 | hjstate->hj_CurHashValue = 0; |
| 723 | hjstate->hj_CurBucketNo = 0; |
| 724 | hjstate->hj_CurSkewBucketNo = INVALID_SKEW_BUCKET_NO; |
| 725 | hjstate->hj_CurTuple = NULL; |
| 726 | |
| 727 | hjstate->hj_OuterHashKeys = ExecInitExprList(node->hashkeys, |
| 728 | (PlanState *) hjstate); |
| 729 | hjstate->hj_HashOperators = node->hashoperators; |
| 730 | hjstate->hj_Collations = node->hashcollations; |
| 731 | |
| 732 | hjstate->hj_JoinState = HJ_BUILD_HASHTABLE; |
| 733 | hjstate->hj_MatchedOuter = false; |
| 734 | hjstate->hj_OuterNotEmpty = false; |
| 735 | |
| 736 | return hjstate; |
| 737 | } |
| 738 | |
| 739 | /* ---------------------------------------------------------------- |
| 740 | * ExecEndHashJoin |
| 741 | * |
| 742 | * clean up routine for HashJoin node |
| 743 | * ---------------------------------------------------------------- |
| 744 | */ |
| 745 | void |
| 746 | ExecEndHashJoin(HashJoinState *node) |
| 747 | { |
| 748 | /* |
| 749 | * Free hash table |
| 750 | */ |
| 751 | if (node->hj_HashTable) |
| 752 | { |
| 753 | ExecHashTableDestroy(node->hj_HashTable); |
| 754 | node->hj_HashTable = NULL; |
| 755 | } |
| 756 | |
| 757 | /* |
| 758 | * Free the exprcontext |
| 759 | */ |
| 760 | ExecFreeExprContext(&node->js.ps); |
| 761 | |
| 762 | /* |
| 763 | * clean out the tuple table |
| 764 | */ |
| 765 | ExecClearTuple(node->js.ps.ps_ResultTupleSlot); |
| 766 | ExecClearTuple(node->hj_OuterTupleSlot); |
| 767 | ExecClearTuple(node->hj_HashTupleSlot); |
| 768 | |
| 769 | /* |
| 770 | * clean up subtrees |
| 771 | */ |
| 772 | ExecEndNode(outerPlanState(node)); |
| 773 | ExecEndNode(innerPlanState(node)); |
| 774 | } |
| 775 | |
| 776 | /* |
| 777 | * ExecHashJoinOuterGetTuple |
| 778 | * |
| 779 | * get the next outer tuple for a parallel-oblivious hashjoin: either by |
| 780 | * executing the outer plan node in the first pass, or from the temp |
| 781 | * files for the hashjoin batches. |
| 782 | * |
| 783 | * Returns a null slot if no more outer tuples (within the current batch). |
| 784 | * |
| 785 | * On success, the tuple's hash value is stored at *hashvalue --- this is |
| 786 | * either originally computed, or re-read from the temp file. |
| 787 | */ |
| 788 | static TupleTableSlot * |
| 789 | ExecHashJoinOuterGetTuple(PlanState *outerNode, |
| 790 | HashJoinState *hjstate, |
| 791 | uint32 *hashvalue) |
| 792 | { |
| 793 | HashJoinTable hashtable = hjstate->hj_HashTable; |
| 794 | int curbatch = hashtable->curbatch; |
| 795 | TupleTableSlot *slot; |
| 796 | |
| 797 | if (curbatch == 0) /* if it is the first pass */ |
| 798 | { |
| 799 | /* |
| 800 | * Check to see if first outer tuple was already fetched by |
| 801 | * ExecHashJoin() and not used yet. |
| 802 | */ |
| 803 | slot = hjstate->hj_FirstOuterTupleSlot; |
| 804 | if (!TupIsNull(slot)) |
| 805 | hjstate->hj_FirstOuterTupleSlot = NULL; |
| 806 | else |
| 807 | slot = ExecProcNode(outerNode); |
| 808 | |
| 809 | while (!TupIsNull(slot)) |
| 810 | { |
| 811 | /* |
| 812 | * We have to compute the tuple's hash value. |
| 813 | */ |
| 814 | ExprContext *econtext = hjstate->js.ps.ps_ExprContext; |
| 815 | |
| 816 | econtext->ecxt_outertuple = slot; |
| 817 | if (ExecHashGetHashValue(hashtable, econtext, |
| 818 | hjstate->hj_OuterHashKeys, |
| 819 | true, /* outer tuple */ |
| 820 | HJ_FILL_OUTER(hjstate), |
| 821 | hashvalue)) |
| 822 | { |
| 823 | /* remember outer relation is not empty for possible rescan */ |
| 824 | hjstate->hj_OuterNotEmpty = true; |
| 825 | |
| 826 | return slot; |
| 827 | } |
| 828 | |
| 829 | /* |
| 830 | * That tuple couldn't match because of a NULL, so discard it and |
| 831 | * continue with the next one. |
| 832 | */ |
| 833 | slot = ExecProcNode(outerNode); |
| 834 | } |
| 835 | } |
| 836 | else if (curbatch < hashtable->nbatch) |
| 837 | { |
| 838 | BufFile *file = hashtable->outerBatchFile[curbatch]; |
| 839 | |
| 840 | /* |
| 841 | * In outer-join cases, we could get here even though the batch file |
| 842 | * is empty. |
| 843 | */ |
| 844 | if (file == NULL) |
| 845 | return NULL; |
| 846 | |
| 847 | slot = ExecHashJoinGetSavedTuple(hjstate, |
| 848 | file, |
| 849 | hashvalue, |
| 850 | hjstate->hj_OuterTupleSlot); |
| 851 | if (!TupIsNull(slot)) |
| 852 | return slot; |
| 853 | } |
| 854 | |
| 855 | /* End of this batch */ |
| 856 | return NULL; |
| 857 | } |
| 858 | |
| 859 | /* |
| 860 | * ExecHashJoinOuterGetTuple variant for the parallel case. |
| 861 | */ |
| 862 | static TupleTableSlot * |
| 863 | ExecParallelHashJoinOuterGetTuple(PlanState *outerNode, |
| 864 | HashJoinState *hjstate, |
| 865 | uint32 *hashvalue) |
| 866 | { |
| 867 | HashJoinTable hashtable = hjstate->hj_HashTable; |
| 868 | int curbatch = hashtable->curbatch; |
| 869 | TupleTableSlot *slot; |
| 870 | |
| 871 | /* |
| 872 | * In the Parallel Hash case we only run the outer plan directly for |
| 873 | * single-batch hash joins. Otherwise we have to go to batch files, even |
| 874 | * for batch 0. |
| 875 | */ |
| 876 | if (curbatch == 0 && hashtable->nbatch == 1) |
| 877 | { |
| 878 | slot = ExecProcNode(outerNode); |
| 879 | |
| 880 | while (!TupIsNull(slot)) |
| 881 | { |
| 882 | ExprContext *econtext = hjstate->js.ps.ps_ExprContext; |
| 883 | |
| 884 | econtext->ecxt_outertuple = slot; |
| 885 | if (ExecHashGetHashValue(hashtable, econtext, |
| 886 | hjstate->hj_OuterHashKeys, |
| 887 | true, /* outer tuple */ |
| 888 | HJ_FILL_OUTER(hjstate), |
| 889 | hashvalue)) |
| 890 | return slot; |
| 891 | |
| 892 | /* |
| 893 | * That tuple couldn't match because of a NULL, so discard it and |
| 894 | * continue with the next one. |
| 895 | */ |
| 896 | slot = ExecProcNode(outerNode); |
| 897 | } |
| 898 | } |
| 899 | else if (curbatch < hashtable->nbatch) |
| 900 | { |
| 901 | MinimalTuple tuple; |
| 902 | |
| 903 | tuple = sts_parallel_scan_next(hashtable->batches[curbatch].outer_tuples, |
| 904 | hashvalue); |
| 905 | if (tuple != NULL) |
| 906 | { |
| 907 | ExecForceStoreMinimalTuple(tuple, |
| 908 | hjstate->hj_OuterTupleSlot, |
| 909 | false); |
| 910 | slot = hjstate->hj_OuterTupleSlot; |
| 911 | return slot; |
| 912 | } |
| 913 | else |
| 914 | ExecClearTuple(hjstate->hj_OuterTupleSlot); |
| 915 | } |
| 916 | |
| 917 | /* End of this batch */ |
| 918 | return NULL; |
| 919 | } |
| 920 | |
| 921 | /* |
| 922 | * ExecHashJoinNewBatch |
| 923 | * switch to a new hashjoin batch |
| 924 | * |
| 925 | * Returns true if successful, false if there are no more batches. |
| 926 | */ |
| 927 | static bool |
| 928 | ExecHashJoinNewBatch(HashJoinState *hjstate) |
| 929 | { |
| 930 | HashJoinTable hashtable = hjstate->hj_HashTable; |
| 931 | int nbatch; |
| 932 | int curbatch; |
| 933 | BufFile *innerFile; |
| 934 | TupleTableSlot *slot; |
| 935 | uint32 hashvalue; |
| 936 | |
| 937 | nbatch = hashtable->nbatch; |
| 938 | curbatch = hashtable->curbatch; |
| 939 | |
| 940 | if (curbatch > 0) |
| 941 | { |
| 942 | /* |
| 943 | * We no longer need the previous outer batch file; close it right |
| 944 | * away to free disk space. |
| 945 | */ |
| 946 | if (hashtable->outerBatchFile[curbatch]) |
| 947 | BufFileClose(hashtable->outerBatchFile[curbatch]); |
| 948 | hashtable->outerBatchFile[curbatch] = NULL; |
| 949 | } |
| 950 | else /* we just finished the first batch */ |
| 951 | { |
| 952 | /* |
| 953 | * Reset some of the skew optimization state variables, since we no |
| 954 | * longer need to consider skew tuples after the first batch. The |
| 955 | * memory context reset we are about to do will release the skew |
| 956 | * hashtable itself. |
| 957 | */ |
| 958 | hashtable->skewEnabled = false; |
| 959 | hashtable->skewBucket = NULL; |
| 960 | hashtable->skewBucketNums = NULL; |
| 961 | hashtable->nSkewBuckets = 0; |
| 962 | hashtable->spaceUsedSkew = 0; |
| 963 | } |
| 964 | |
| 965 | /* |
| 966 | * We can always skip over any batches that are completely empty on both |
| 967 | * sides. We can sometimes skip over batches that are empty on only one |
| 968 | * side, but there are exceptions: |
| 969 | * |
| 970 | * 1. In a left/full outer join, we have to process outer batches even if |
| 971 | * the inner batch is empty. Similarly, in a right/full outer join, we |
| 972 | * have to process inner batches even if the outer batch is empty. |
| 973 | * |
| 974 | * 2. If we have increased nbatch since the initial estimate, we have to |
| 975 | * scan inner batches since they might contain tuples that need to be |
| 976 | * reassigned to later inner batches. |
| 977 | * |
| 978 | * 3. Similarly, if we have increased nbatch since starting the outer |
| 979 | * scan, we have to rescan outer batches in case they contain tuples that |
| 980 | * need to be reassigned. |
| 981 | */ |
| 982 | curbatch++; |
| 983 | while (curbatch < nbatch && |
| 984 | (hashtable->outerBatchFile[curbatch] == NULL || |
| 985 | hashtable->innerBatchFile[curbatch] == NULL)) |
| 986 | { |
| 987 | if (hashtable->outerBatchFile[curbatch] && |
| 988 | HJ_FILL_OUTER(hjstate)) |
| 989 | break; /* must process due to rule 1 */ |
| 990 | if (hashtable->innerBatchFile[curbatch] && |
| 991 | HJ_FILL_INNER(hjstate)) |
| 992 | break; /* must process due to rule 1 */ |
| 993 | if (hashtable->innerBatchFile[curbatch] && |
| 994 | nbatch != hashtable->nbatch_original) |
| 995 | break; /* must process due to rule 2 */ |
| 996 | if (hashtable->outerBatchFile[curbatch] && |
| 997 | nbatch != hashtable->nbatch_outstart) |
| 998 | break; /* must process due to rule 3 */ |
| 999 | /* We can ignore this batch. */ |
| 1000 | /* Release associated temp files right away. */ |
| 1001 | if (hashtable->innerBatchFile[curbatch]) |
| 1002 | BufFileClose(hashtable->innerBatchFile[curbatch]); |
| 1003 | hashtable->innerBatchFile[curbatch] = NULL; |
| 1004 | if (hashtable->outerBatchFile[curbatch]) |
| 1005 | BufFileClose(hashtable->outerBatchFile[curbatch]); |
| 1006 | hashtable->outerBatchFile[curbatch] = NULL; |
| 1007 | curbatch++; |
| 1008 | } |
| 1009 | |
| 1010 | if (curbatch >= nbatch) |
| 1011 | return false; /* no more batches */ |
| 1012 | |
| 1013 | hashtable->curbatch = curbatch; |
| 1014 | |
| 1015 | /* |
| 1016 | * Reload the hash table with the new inner batch (which could be empty) |
| 1017 | */ |
| 1018 | ExecHashTableReset(hashtable); |
| 1019 | |
| 1020 | innerFile = hashtable->innerBatchFile[curbatch]; |
| 1021 | |
| 1022 | if (innerFile != NULL) |
| 1023 | { |
| 1024 | if (BufFileSeek(innerFile, 0, 0L, SEEK_SET)) |
| 1025 | ereport(ERROR, |
| 1026 | (errcode_for_file_access(), |
| 1027 | errmsg("could not rewind hash-join temporary file: %m" ))); |
| 1028 | |
| 1029 | while ((slot = ExecHashJoinGetSavedTuple(hjstate, |
| 1030 | innerFile, |
| 1031 | &hashvalue, |
| 1032 | hjstate->hj_HashTupleSlot))) |
| 1033 | { |
| 1034 | /* |
| 1035 | * NOTE: some tuples may be sent to future batches. Also, it is |
| 1036 | * possible for hashtable->nbatch to be increased here! |
| 1037 | */ |
| 1038 | ExecHashTableInsert(hashtable, slot, hashvalue); |
| 1039 | } |
| 1040 | |
| 1041 | /* |
| 1042 | * after we build the hash table, the inner batch file is no longer |
| 1043 | * needed |
| 1044 | */ |
| 1045 | BufFileClose(innerFile); |
| 1046 | hashtable->innerBatchFile[curbatch] = NULL; |
| 1047 | } |
| 1048 | |
| 1049 | /* |
| 1050 | * Rewind outer batch file (if present), so that we can start reading it. |
| 1051 | */ |
| 1052 | if (hashtable->outerBatchFile[curbatch] != NULL) |
| 1053 | { |
| 1054 | if (BufFileSeek(hashtable->outerBatchFile[curbatch], 0, 0L, SEEK_SET)) |
| 1055 | ereport(ERROR, |
| 1056 | (errcode_for_file_access(), |
| 1057 | errmsg("could not rewind hash-join temporary file: %m" ))); |
| 1058 | } |
| 1059 | |
| 1060 | return true; |
| 1061 | } |
| 1062 | |
| 1063 | /* |
| 1064 | * Choose a batch to work on, and attach to it. Returns true if successful, |
| 1065 | * false if there are no more batches. |
| 1066 | */ |
| 1067 | static bool |
| 1068 | ExecParallelHashJoinNewBatch(HashJoinState *hjstate) |
| 1069 | { |
| 1070 | HashJoinTable hashtable = hjstate->hj_HashTable; |
| 1071 | int start_batchno; |
| 1072 | int batchno; |
| 1073 | |
| 1074 | /* |
| 1075 | * If we started up so late that the batch tracking array has been freed |
| 1076 | * already by ExecHashTableDetach(), then we are finished. See also |
| 1077 | * ExecParallelHashEnsureBatchAccessors(). |
| 1078 | */ |
| 1079 | if (hashtable->batches == NULL) |
| 1080 | return false; |
| 1081 | |
| 1082 | /* |
| 1083 | * If we were already attached to a batch, remember not to bother checking |
| 1084 | * it again, and detach from it (possibly freeing the hash table if we are |
| 1085 | * last to detach). |
| 1086 | */ |
| 1087 | if (hashtable->curbatch >= 0) |
| 1088 | { |
| 1089 | hashtable->batches[hashtable->curbatch].done = true; |
| 1090 | ExecHashTableDetachBatch(hashtable); |
| 1091 | } |
| 1092 | |
| 1093 | /* |
| 1094 | * Search for a batch that isn't done. We use an atomic counter to start |
| 1095 | * our search at a different batch in every participant when there are |
| 1096 | * more batches than participants. |
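| | * For example, with 8 batches and 3 participants, the counter hands out |
| | * starting points 0, 1 and 2, and each participant then walks the batch |
| | * array circularly from its own starting point. |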
| 1097 | */ |
| 1098 | batchno = start_batchno = |
| 1099 | pg_atomic_fetch_add_u32(&hashtable->parallel_state->distributor, 1) % |
| 1100 | hashtable->nbatch; |
| 1101 | do |
| 1102 | { |
| 1103 | uint32 hashvalue; |
| 1104 | MinimalTuple tuple; |
| 1105 | TupleTableSlot *slot; |
| 1106 | |
| 1107 | if (!hashtable->batches[batchno].done) |
| 1108 | { |
| 1109 | SharedTuplestoreAccessor *inner_tuples; |
| 1110 | Barrier *batch_barrier = |
| 1111 | &hashtable->batches[batchno].shared->batch_barrier; |
| 1112 | |
| 1113 | switch (BarrierAttach(batch_barrier)) |
| 1114 | { |
| 1115 | case PHJ_BATCH_ELECTING: |
| 1116 | |
| 1117 | /* One backend allocates the hash table. */ |
| 1118 | if (BarrierArriveAndWait(batch_barrier, |
| 1119 | WAIT_EVENT_HASH_BATCH_ELECTING)) |
| 1120 | ExecParallelHashTableAlloc(hashtable, batchno); |
| 1121 | /* Fall through. */ |
| 1122 | |
| 1123 | case PHJ_BATCH_ALLOCATING: |
| 1124 | /* Wait for allocation to complete. */ |
| 1125 | BarrierArriveAndWait(batch_barrier, |
| 1126 | WAIT_EVENT_HASH_BATCH_ALLOCATING); |
| 1127 | /* Fall through. */ |
| 1128 | |
| 1129 | case PHJ_BATCH_LOADING: |
| 1130 | /* Start (or join in) loading tuples. */ |
| 1131 | ExecParallelHashTableSetCurrentBatch(hashtable, batchno); |
| 1132 | inner_tuples = hashtable->batches[batchno].inner_tuples; |
| 1133 | sts_begin_parallel_scan(inner_tuples); |
| 1134 | while ((tuple = sts_parallel_scan_next(inner_tuples, |
| 1135 | &hashvalue))) |
| 1136 | { |
| 1137 | ExecForceStoreMinimalTuple(tuple, |
| 1138 | hjstate->hj_HashTupleSlot, |
| 1139 | false); |
| 1140 | slot = hjstate->hj_HashTupleSlot; |
| 1141 | ExecParallelHashTableInsertCurrentBatch(hashtable, slot, |
| 1142 | hashvalue); |
| 1143 | } |
| 1144 | sts_end_parallel_scan(inner_tuples); |
| 1145 | BarrierArriveAndWait(batch_barrier, |
| 1146 | WAIT_EVENT_HASH_BATCH_LOADING); |
| 1147 | /* Fall through. */ |
| 1148 | |
| 1149 | case PHJ_BATCH_PROBING: |
| 1150 | |
| 1151 | /* |
| 1152 | * This batch is ready to probe. Return control to |
| 1153 | * caller. We stay attached to batch_barrier so that the |
| 1154 | * hash table stays alive until everyone's finished |
| 1155 | * probing it, but no participant is allowed to wait at |
| 1156 | * this barrier again (or else a deadlock could occur). |
| 1157 | * All attached participants must eventually call |
| 1158 | * BarrierArriveAndDetach() so that the final phase |
| 1159 | * PHJ_BATCH_DONE can be reached. |
| 1160 | */ |
| 1161 | ExecParallelHashTableSetCurrentBatch(hashtable, batchno); |
| 1162 | sts_begin_parallel_scan(hashtable->batches[batchno].outer_tuples); |
| 1163 | return true; |
| 1164 | |
| 1165 | case PHJ_BATCH_DONE: |
| 1166 | |
| 1167 | /* |
| 1168 | * Already done. Detach and go around again (if any |
| 1169 | * remain). |
| 1170 | */ |
| 1171 | BarrierDetach(batch_barrier); |
| 1172 | hashtable->batches[batchno].done = true; |
| 1173 | hashtable->curbatch = -1; |
| 1174 | break; |
| 1175 | |
| 1176 | default: |
| 1177 | elog(ERROR, "unexpected batch phase %d", |
| 1178 | BarrierPhase(batch_barrier)); |
| 1179 | } |
| 1180 | } |
| 1181 | batchno = (batchno + 1) % hashtable->nbatch; |
| 1182 | } while (batchno != start_batchno); |
| 1183 | |
| 1184 | return false; |
| 1185 | } |
| 1186 | |
| 1187 | /* |
| 1188 | * ExecHashJoinSaveTuple |
| 1189 | * save a tuple to a batch file. |
| 1190 | * |
| 1191 | * The data recorded in the file for each tuple is its hash value, |
| 1192 | * then the tuple in MinimalTuple format. |
| 1193 | * |
| 1194 | * Note: it is important always to call this in the regular executor |
| 1195 | * context, not in a shorter-lived context; else the temp file buffers |
| 1196 | * will get messed up. |
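| | * |
| | * Concretely, each entry in the file is the uint32 hash value immediately |
| | * followed by the MinimalTuple (t_len bytes, which begin with the tuple's |
| | * own uint32 t_len word); that layout is what allows |
| | * ExecHashJoinGetSavedTuple() to fetch the hash value and the tuple length |
| | * with a single read of two uint32s. |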
| 1197 | */ |
| 1198 | void |
| 1199 | ExecHashJoinSaveTuple(MinimalTuple tuple, uint32 hashvalue, |
| 1200 | BufFile **fileptr) |
| 1201 | { |
| 1202 | BufFile *file = *fileptr; |
| 1203 | size_t written; |
| 1204 | |
| 1205 | if (file == NULL) |
| 1206 | { |
| 1207 | /* First write to this batch file, so open it. */ |
| 1208 | file = BufFileCreateTemp(false); |
| 1209 | *fileptr = file; |
| 1210 | } |
| 1211 | |
| 1212 | written = BufFileWrite(file, (void *) &hashvalue, sizeof(uint32)); |
| 1213 | if (written != sizeof(uint32)) |
| 1214 | ereport(ERROR, |
| 1215 | (errcode_for_file_access(), |
| 1216 | errmsg("could not write to hash-join temporary file: %m" ))); |
| 1217 | |
| 1218 | written = BufFileWrite(file, (void *) tuple, tuple->t_len); |
| 1219 | if (written != tuple->t_len) |
| 1220 | ereport(ERROR, |
| 1221 | (errcode_for_file_access(), |
| 1222 | errmsg("could not write to hash-join temporary file: %m" ))); |
| 1223 | } |
| 1224 | |
| 1225 | /* |
| 1226 | * ExecHashJoinGetSavedTuple |
| 1227 | * read the next tuple from a batch file. Return NULL if no more. |
| 1228 | * |
| 1229 | * On success, *hashvalue is set to the tuple's hash value, and the tuple |
| 1230 | * itself is stored in the given slot. |
| 1231 | */ |
| 1232 | static TupleTableSlot * |
| 1233 | ExecHashJoinGetSavedTuple(HashJoinState *hjstate, |
| 1234 | BufFile *file, |
| 1235 | uint32 *hashvalue, |
| 1236 | TupleTableSlot *tupleSlot) |
| 1237 | { |
| 1238 | uint32 header[2]; |
| 1239 | size_t nread; |
| 1240 | MinimalTuple tuple; |
| 1241 | |
| 1242 | /* |
| 1243 | * We check for interrupts here because this is typically taken as an |
| 1244 | * alternative code path to an ExecProcNode() call, which would include |
| 1245 | * such a check. |
| 1246 | */ |
| 1247 | CHECK_FOR_INTERRUPTS(); |
| 1248 | |
| 1249 | /* |
| 1250 | * Since both the hash value and the MinimalTuple length word are uint32, |
| 1251 | * we can read them both in one BufFileRead() call without any type |
| 1252 | * cheating. |
| 1253 | */ |
| 1254 | nread = BufFileRead(file, (void *) header, sizeof(header)); |
| 1255 | if (nread == 0) /* end of file */ |
| 1256 | { |
| 1257 | ExecClearTuple(tupleSlot); |
| 1258 | return NULL; |
| 1259 | } |
| 1260 | if (nread != sizeof(header)) |
| 1261 | ereport(ERROR, |
| 1262 | (errcode_for_file_access(), |
| 1263 | errmsg("could not read from hash-join temporary file: %m" ))); |
| 1264 | *hashvalue = header[0]; |
| 1265 | tuple = (MinimalTuple) palloc(header[1]); |
| 1266 | tuple->t_len = header[1]; |
| 1267 | nread = BufFileRead(file, |
| 1268 | (void *) ((char *) tuple + sizeof(uint32)), |
| 1269 | header[1] - sizeof(uint32)); |
| 1270 | if (nread != header[1] - sizeof(uint32)) |
| 1271 | ereport(ERROR, |
| 1272 | (errcode_for_file_access(), |
| 1273 | errmsg("could not read from hash-join temporary file: %m" ))); |
| 1274 | ExecForceStoreMinimalTuple(tuple, tupleSlot, true); |
| 1275 | return tupleSlot; |
| 1276 | } |
| 1277 | |
| 1278 | |
| 1279 | void |
| 1280 | ExecReScanHashJoin(HashJoinState *node) |
| 1281 | { |
| 1282 | /* |
| 1283 | * In a multi-batch join, we currently have to do rescans the hard way, |
| 1284 | * primarily because batch temp files may have already been released. But |
| 1285 | * if it's a single-batch join, and there is no parameter change for the |
| 1286 | * inner subnode, then we can just re-use the existing hash table without |
| 1287 | * rebuilding it. |
| 1288 | */ |
| 1289 | if (node->hj_HashTable != NULL) |
| 1290 | { |
| 1291 | if (node->hj_HashTable->nbatch == 1 && |
| 1292 | node->js.ps.righttree->chgParam == NULL) |
| 1293 | { |
| 1294 | /* |
| 1295 | * Okay to reuse the hash table; needn't rescan inner, either. |
| 1296 | * |
| 1297 | * However, if it's a right/full join, we'd better reset the |
| 1298 | * inner-tuple match flags contained in the table. |
| 1299 | */ |
| 1300 | if (HJ_FILL_INNER(node)) |
| 1301 | ExecHashTableResetMatchFlags(node->hj_HashTable); |
| 1302 | |
| 1303 | /* |
| 1304 | * Also, we need to reset our state about the emptiness of the |
| 1305 | * outer relation, so that the new scan of the outer will update |
| 1306 | * it correctly if it turns out to be empty this time. (There's no |
| 1307 | * harm in clearing it now because ExecHashJoin won't need the |
| 1308 | * info. In the other cases, where the hash table doesn't exist |
| 1309 | * or we are destroying it, we leave this state alone because |
| 1310 | * ExecHashJoin will need it the first time through.) |
| 1311 | */ |
| 1312 | node->hj_OuterNotEmpty = false; |
| 1313 | |
| 1314 | /* ExecHashJoin can skip the BUILD_HASHTABLE step */ |
| 1315 | node->hj_JoinState = HJ_NEED_NEW_OUTER; |
| 1316 | } |
| 1317 | else |
| 1318 | { |
| 1319 | /* must destroy and rebuild hash table */ |
| 1320 | ExecHashTableDestroy(node->hj_HashTable); |
| 1321 | node->hj_HashTable = NULL; |
| 1322 | node->hj_JoinState = HJ_BUILD_HASHTABLE; |
| 1323 | |
| 1324 | /* |
| 1325 | * if chgParam of subnode is not null then plan will be re-scanned |
| 1326 | * by first ExecProcNode. |
| 1327 | */ |
| 1328 | if (node->js.ps.righttree->chgParam == NULL) |
| 1329 | ExecReScan(node->js.ps.righttree); |
| 1330 | } |
| 1331 | } |
| 1332 | |
| 1333 | /* Always reset intra-tuple state */ |
| 1334 | node->hj_CurHashValue = 0; |
| 1335 | node->hj_CurBucketNo = 0; |
| 1336 | node->hj_CurSkewBucketNo = INVALID_SKEW_BUCKET_NO; |
| 1337 | node->hj_CurTuple = NULL; |
| 1338 | |
| 1339 | node->hj_MatchedOuter = false; |
| 1340 | node->hj_FirstOuterTupleSlot = NULL; |
| 1341 | |
| 1342 | /* |
| 1343 | * if chgParam of subnode is not null then plan will be re-scanned by |
| 1344 | * first ExecProcNode. |
| 1345 | */ |
| 1346 | if (node->js.ps.lefttree->chgParam == NULL) |
| 1347 | ExecReScan(node->js.ps.lefttree); |
| 1348 | } |
| 1349 | |
| 1350 | void |
| 1351 | ExecShutdownHashJoin(HashJoinState *node) |
| 1352 | { |
| 1353 | if (node->hj_HashTable) |
| 1354 | { |
| 1355 | /* |
| 1356 | * Detach from shared state before DSM memory goes away. This makes |
| 1357 | * sure that we don't have any pointers into DSM memory by the time |
| 1358 | * ExecEndHashJoin runs. |
| 1359 | */ |
| 1360 | ExecHashTableDetachBatch(node->hj_HashTable); |
| 1361 | ExecHashTableDetach(node->hj_HashTable); |
| 1362 | } |
| 1363 | } |
| 1364 | |
| 1365 | static void |
| 1366 | ExecParallelHashJoinPartitionOuter(HashJoinState *hjstate) |
| 1367 | { |
| 1368 | PlanState *outerState = outerPlanState(hjstate); |
| 1369 | ExprContext *econtext = hjstate->js.ps.ps_ExprContext; |
| 1370 | HashJoinTable hashtable = hjstate->hj_HashTable; |
| 1371 | TupleTableSlot *slot; |
| 1372 | uint32 hashvalue; |
| 1373 | int i; |
| 1374 | |
| 1375 | Assert(hjstate->hj_FirstOuterTupleSlot == NULL); |
| 1376 | |
| 1377 | /* Execute outer plan, writing all tuples to shared tuplestores. */ |
| 1378 | for (;;) |
| 1379 | { |
| 1380 | slot = ExecProcNode(outerState); |
| 1381 | if (TupIsNull(slot)) |
| 1382 | break; |
| 1383 | econtext->ecxt_outertuple = slot; |
| 1384 | if (ExecHashGetHashValue(hashtable, econtext, |
| 1385 | hjstate->hj_OuterHashKeys, |
| 1386 | true, /* outer tuple */ |
| 1387 | HJ_FILL_OUTER(hjstate), |
| 1388 | &hashvalue)) |
| 1389 | { |
| 1390 | int batchno; |
| 1391 | int bucketno; |
| 1392 | bool shouldFree; |
| 1393 | MinimalTuple mintup = ExecFetchSlotMinimalTuple(slot, &shouldFree); |
| 1394 | |
| 1395 | ExecHashGetBucketAndBatch(hashtable, hashvalue, &bucketno, |
| 1396 | &batchno); |
| 1397 | sts_puttuple(hashtable->batches[batchno].outer_tuples, |
| 1398 | &hashvalue, mintup); |
| 1399 | |
| 1400 | if (shouldFree) |
| 1401 | heap_free_minimal_tuple(mintup); |
| 1402 | } |
| 1403 | CHECK_FOR_INTERRUPTS(); |
| 1404 | } |
| 1405 | |
| 1406 | /* Make sure all outer partitions are readable by any backend. */ |
| 1407 | for (i = 0; i < hashtable->nbatch; ++i) |
| 1408 | sts_end_write(hashtable->batches[i].outer_tuples); |
| 1409 | } |
| 1410 | |
| 1411 | void |
| 1412 | ExecHashJoinEstimate(HashJoinState *state, ParallelContext *pcxt) |
| 1413 | { |
| 1414 | shm_toc_estimate_chunk(&pcxt->estimator, sizeof(ParallelHashJoinState)); |
| 1415 | shm_toc_estimate_keys(&pcxt->estimator, 1); |
| 1416 | } |
| 1417 | |
| 1418 | void |
| 1419 | ExecHashJoinInitializeDSM(HashJoinState *state, ParallelContext *pcxt) |
| 1420 | { |
| 1421 | int plan_node_id = state->js.ps.plan->plan_node_id; |
| 1422 | HashState *hashNode; |
| 1423 | ParallelHashJoinState *pstate; |
| 1424 | |
| 1425 | /* |
| 1426 | * Disable shared hash table mode if we failed to create a real DSM |
| 1427 | * segment, because that means that we don't have a DSA area to work with. |
| 1428 | */ |
| 1429 | if (pcxt->seg == NULL) |
| 1430 | return; |
| 1431 | |
| 1432 | ExecSetExecProcNode(&state->js.ps, ExecParallelHashJoin); |
| 1433 | |
| 1434 | /* |
| 1435 | * Set up the state needed to coordinate access to the shared hash |
| 1436 | * table(s), using the plan node ID as the toc key. |
| 1437 | */ |
| 1438 | pstate = shm_toc_allocate(pcxt->toc, sizeof(ParallelHashJoinState)); |
| 1439 | shm_toc_insert(pcxt->toc, plan_node_id, pstate); |
| 1440 | |
| 1441 | /* |
| 1442 | * Set up the shared hash join state with no batches initially. |
| 1443 | * ExecHashTableCreate() will prepare at least one later and set nbatch |
| 1444 | * and space_allowed. |
| 1445 | */ |
| 1446 | pstate->nbatch = 0; |
| 1447 | pstate->space_allowed = 0; |
| 1448 | pstate->batches = InvalidDsaPointer; |
| 1449 | pstate->old_batches = InvalidDsaPointer; |
| 1450 | pstate->nbuckets = 0; |
| 1451 | pstate->growth = PHJ_GROWTH_OK; |
| 1452 | pstate->chunk_work_queue = InvalidDsaPointer; |
| 1453 | pg_atomic_init_u32(&pstate->distributor, 0); |
| 1454 | pstate->nparticipants = pcxt->nworkers + 1; |
| 1455 | pstate->total_tuples = 0; |
| 1456 | LWLockInitialize(&pstate->lock, |
| 1457 | LWTRANCHE_PARALLEL_HASH_JOIN); |
| 1458 | BarrierInit(&pstate->build_barrier, 0); |
| 1459 | BarrierInit(&pstate->grow_batches_barrier, 0); |
| 1460 | BarrierInit(&pstate->grow_buckets_barrier, 0); |
| 1461 | |
| 1462 | /* Set up the space we'll use for shared temporary files. */ |
| 1463 | SharedFileSetInit(&pstate->fileset, pcxt->seg); |
| 1464 | |
| 1465 | /* Initialize the shared state in the hash node. */ |
| 1466 | hashNode = (HashState *) innerPlanState(state); |
| 1467 | hashNode->parallel_state = pstate; |
| 1468 | } |
| 1469 | |
| 1470 | /* ---------------------------------------------------------------- |
| 1471 | * ExecHashJoinReInitializeDSM |
| 1472 | * |
| 1473 | * Reset shared state before beginning a fresh scan. |
| 1474 | * ---------------------------------------------------------------- |
| 1475 | */ |
| 1476 | void |
| 1477 | ExecHashJoinReInitializeDSM(HashJoinState *state, ParallelContext *cxt) |
| 1478 | { |
| 1479 | int plan_node_id = state->js.ps.plan->plan_node_id; |
| 1480 | ParallelHashJoinState *pstate = |
| 1481 | shm_toc_lookup(cxt->toc, plan_node_id, false); |
| 1482 | |
| 1483 | /* |
| 1484 | * It would be possible to reuse the shared hash table in single-batch |
| 1485 | * cases by resetting and then fast-forwarding build_barrier to |
| 1486 | * PHJ_BUILD_DONE and batch 0's batch_barrier to PHJ_BATCH_PROBING, but |
| 1487 | * currently shared hash tables are already freed by now (by the last |
| 1488 | * participant to detach from the batch). We could consider keeping it |
| 1489 | * around for single-batch joins. We'd also need to adjust |
| 1490 | * finalize_plan() so that it doesn't record a dummy dependency for |
| 1491 | * Parallel Hash nodes, preventing the rescan optimization. For now we |
| 1492 | * don't try. |
| 1493 | */ |
| 1494 | |
| 1495 | /* Detach, freeing any remaining shared memory. */ |
| 1496 | if (state->hj_HashTable != NULL) |
| 1497 | { |
| 1498 | ExecHashTableDetachBatch(state->hj_HashTable); |
| 1499 | ExecHashTableDetach(state->hj_HashTable); |
| 1500 | } |
| 1501 | |
| 1502 | /* Clear any shared batch files. */ |
| 1503 | SharedFileSetDeleteAll(&pstate->fileset); |
| 1504 | |
| 1505 | /* Reset build_barrier to PHJ_BUILD_ELECTING so we can go around again. */ |
| 1506 | BarrierInit(&pstate->build_barrier, 0); |
| 1507 | } |
| 1508 | |
| 1509 | void |
| 1510 | ExecHashJoinInitializeWorker(HashJoinState *state, |
| 1511 | ParallelWorkerContext *pwcxt) |
| 1512 | { |
| 1513 | HashState *hashNode; |
| 1514 | int plan_node_id = state->js.ps.plan->plan_node_id; |
| 1515 | ParallelHashJoinState *pstate = |
| 1516 | shm_toc_lookup(pwcxt->toc, plan_node_id, false); |
| 1517 | |
| 1518 | /* Attach to the space for shared temporary files. */ |
| 1519 | SharedFileSetAttach(&pstate->fileset, pwcxt->seg); |
| 1520 | |
| 1521 | /* Attach to the shared state in the hash node. */ |
| 1522 | hashNode = (HashState *) innerPlanState(state); |
| 1523 | hashNode->parallel_state = pstate; |
| 1524 | |
| 1525 | ExecSetExecProcNode(&state->js.ps, ExecParallelHashJoin); |
| 1526 | } |
| 1527 | |