/*-------------------------------------------------------------------------
 *
 * nodeHashjoin.c
 *	  Routines to handle hash join nodes
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/executor/nodeHashjoin.c
 *
 * PARALLELISM
 *
 * Hash joins can participate in parallel query execution in several ways.  A
 * parallel-oblivious hash join is one where the node is unaware that it is
 * part of a parallel plan.  In this case, a copy of the inner plan is used to
 * build a copy of the hash table in every backend, and the outer plan could
 * either be built from a partial or complete path, so that the results of the
 * hash join are correspondingly either partial or complete.  A parallel-aware
 * hash join is one that behaves differently, coordinating work between
 * backends, and appears as Parallel Hash Join in EXPLAIN output.  A Parallel
 * Hash Join always appears with a Parallel Hash node.
 *
 * Parallel-aware hash joins use the same per-backend state machine to track
 * progress through the hash join algorithm as parallel-oblivious hash joins.
 * In a parallel-aware hash join, there is also a shared state machine that
 * co-operating backends use to synchronize their local state machines and
 * program counters.  The shared state machine is managed with a Barrier IPC
 * primitive.  When all attached participants arrive at a barrier, the phase
 * advances and all waiting participants are released.
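 *
 * As an illustration only (not literal code from this file), the canonical
 * use of that primitive looks roughly like this, for some Barrier in shared
 * memory and a hypothetical wait event:
 *
 *     int phase = BarrierAttach(barrier);    -- join in, learn current phase
 *     ... do the work appropriate to 'phase' ...
 *     if (BarrierArriveAndWait(barrier, wait_event))
 *         ... we were elected to do any serial work for the next phase ...
 *     BarrierDetach(barrier);                -- when finished with it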
 *
 * When a participant begins working on a parallel hash join, it must first
 * figure out how much progress has already been made, because participants
 * don't wait for each other to begin.  For this reason there are switch
 * statements at key points in the code where we have to synchronize our local
 * state machine with the phase, and then jump to the correct part of the
 * algorithm so that we can get started.
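 *
 * A minimal sketch of that pattern, with hypothetical phase names
 * (see ExecParallelHashJoinNewBatch() below for a real instance):
 *
 *     switch (BarrierAttach(barrier))
 *     {
 *         case PHJ_PHASE_ONE:
 *             ... help with phase one ...
 *             -- fall through
 *         case PHJ_PHASE_TWO:
 *             ... help with phase two ...
 *             break;
 *         case PHJ_PHASE_DONE:
 *             ... nothing left to help with ...
 *     }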
 *
 * One barrier called build_barrier is used to coordinate the hashing phases.
 * The phase is represented by an integer which begins at zero and increments
 * one by one, but in the code it is referred to by symbolic names as follows:
 *
 *   PHJ_BUILD_ELECTING              -- initial state
 *   PHJ_BUILD_ALLOCATING            -- one sets up the batches and table 0
 *   PHJ_BUILD_HASHING_INNER         -- all hash the inner rel
 *   PHJ_BUILD_HASHING_OUTER         -- (multi-batch only) all hash the outer
 *   PHJ_BUILD_DONE                  -- building done, probing can begin
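 *
 * Since the phase number starts at zero and the names above are listed in
 * phase order, the mapping to raw phase numbers is direct; a sketch of the
 * definitions (the authoritative ones live in executor/hashjoin.h):
 *
 *     #define PHJ_BUILD_ELECTING    0
 *     #define PHJ_BUILD_ALLOCATING  1
 *     ...
 *     #define PHJ_BUILD_DONE        4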
 *
 * While in the phase PHJ_BUILD_HASHING_INNER a separate pair of barriers may
 * be used repeatedly as required to coordinate expansions in the number of
 * batches or buckets.  Their phases are as follows:
 *
 *   PHJ_GROW_BATCHES_ELECTING       -- initial state
 *   PHJ_GROW_BATCHES_ALLOCATING     -- one allocates new batches
 *   PHJ_GROW_BATCHES_REPARTITIONING -- all repartition
 *   PHJ_GROW_BATCHES_FINISHING      -- one cleans up, detects skew
 *
 *   PHJ_GROW_BUCKETS_ELECTING       -- initial state
 *   PHJ_GROW_BUCKETS_ALLOCATING     -- one allocates new buckets
 *   PHJ_GROW_BUCKETS_REINSERTING    -- all insert tuples
 *
 * If the planner got the number of batches and buckets right, those won't be
 * necessary, but on the other hand we might finish up needing to expand the
 * buckets or batches multiple times while hashing the inner relation to stay
 * within our memory budget and load factor target.  For that reason it's a
 * separate pair of barriers using circular phases.
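 *
 * "Circular" means these barriers may cycle through their phase lists
 * repeatedly, so a raw phase number must be reduced modulo the number of
 * phases before it can be compared with the symbolic names; hashjoin.h uses
 * macros of roughly this shape (sketch, illustrative names):
 *
 *     #define GROW_PHASE(n) ((n) % NPHASES)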
 *
 * The PHJ_BUILD_HASHING_OUTER phase is required only for multi-batch joins,
 * because we need to divide the outer relation into batches up front in order
 * to be able to process batches entirely independently.  In contrast, the
 * parallel-oblivious algorithm simply throws tuples 'forward' to 'later'
 * batches whenever it encounters them while scanning and probing, which it
 * can do because it processes batches in serial order.
 *
 * Once PHJ_BUILD_DONE is reached, backends then split up and process
 * different batches, or gang up and work together on probing batches if there
 * aren't enough to go around.  For each batch there is a separate barrier
 * with the following phases:
 *
 *   PHJ_BATCH_ELECTING       -- initial state
 *   PHJ_BATCH_ALLOCATING     -- one allocates buckets
 *   PHJ_BATCH_LOADING        -- all load the hash table from disk
 *   PHJ_BATCH_PROBING        -- all probe
 *   PHJ_BATCH_DONE           -- end
 *
 * Batch 0 is a special case, because it starts out in phase
 * PHJ_BATCH_PROBING; populating batch 0's hash table is done during
 * PHJ_BUILD_HASHING_INNER so we can skip loading.
 *
 * Initially we try to plan for a single-batch hash join using the combined
 * work_mem of all participants to create a large shared hash table.  If that
 * turns out either at planning or execution time to be impossible then we
 * fall back to regular work_mem sized hash tables.
 *
 * To avoid deadlocks, we never wait for any barrier unless it is known that
 * all other backends attached to it are actively executing the node or have
 * already arrived.  Practically, that means that we never return a tuple
 * while attached to a barrier, unless the barrier has reached its final
 * state.  In the slightly special case of the per-batch barrier, we return
 * tuples while in PHJ_BATCH_PROBING phase, but that's OK because we use
 * BarrierArriveAndDetach() to advance it to PHJ_BATCH_DONE without waiting.
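 *
 * A sketch of that non-blocking exit from the probing phase (the real code
 * for this lives in ExecHashTableDetachBatch(), in nodeHash.c; 'batch' here
 * is an illustrative pointer to the shared per-batch state):
 *
 *     if (BarrierArriveAndDetach(&batch->batch_barrier))
 *         ... we were the last attached, so we may free the hash table ...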
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/htup_details.h"
#include "access/parallel.h"
#include "executor/executor.h"
#include "executor/hashjoin.h"
#include "executor/nodeHash.h"
#include "executor/nodeHashjoin.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "utils/memutils.h"
#include "utils/sharedtuplestore.h"


/*
 * States of the ExecHashJoin state machine
 */
#define HJ_BUILD_HASHTABLE		1
#define HJ_NEED_NEW_OUTER		2
#define HJ_SCAN_BUCKET			3
#define HJ_FILL_OUTER_TUPLE		4
#define HJ_FILL_INNER_TUPLES	5
#define HJ_NEED_NEW_BATCH		6

/* Returns true if doing null-fill on outer relation */
#define HJ_FILL_OUTER(hjstate)	((hjstate)->hj_NullInnerTupleSlot != NULL)
/* Returns true if doing null-fill on inner relation */
#define HJ_FILL_INNER(hjstate)	((hjstate)->hj_NullOuterTupleSlot != NULL)
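
/*
 * Rough sketch of the transitions among these states (a summary of the
 * switch statement in ExecHashJoinImpl() below, not an exhaustive diagram):
 *
 *   HJ_BUILD_HASHTABLE   -> HJ_NEED_NEW_OUTER (parallel: HJ_NEED_NEW_BATCH)
 *   HJ_NEED_NEW_OUTER    -> HJ_SCAN_BUCKET, or at end of batch either
 *                           HJ_FILL_INNER_TUPLES or HJ_NEED_NEW_BATCH
 *   HJ_SCAN_BUCKET       -> HJ_FILL_OUTER_TUPLE when matches run out
 *   HJ_FILL_OUTER_TUPLE  -> HJ_NEED_NEW_OUTER
 *   HJ_FILL_INNER_TUPLES -> HJ_NEED_NEW_BATCH when no unmatched remain
 *   HJ_NEED_NEW_BATCH    -> HJ_NEED_NEW_OUTER, or the join is finished
 */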

static TupleTableSlot *ExecHashJoinOuterGetTuple(PlanState *outerNode,
												 HashJoinState *hjstate,
												 uint32 *hashvalue);
static TupleTableSlot *ExecParallelHashJoinOuterGetTuple(PlanState *outerNode,
														 HashJoinState *hjstate,
														 uint32 *hashvalue);
static TupleTableSlot *ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
												 BufFile *file,
												 uint32 *hashvalue,
												 TupleTableSlot *tupleSlot);
static bool ExecHashJoinNewBatch(HashJoinState *hjstate);
static bool ExecParallelHashJoinNewBatch(HashJoinState *hjstate);
static void ExecParallelHashJoinPartitionOuter(HashJoinState *node);


/* ----------------------------------------------------------------
 *		ExecHashJoinImpl
 *
 *		This function implements the Hybrid Hashjoin algorithm.  It is marked
 *		with an always-inline attribute so that ExecHashJoin() and
 *		ExecParallelHashJoin() can inline it.  Compilers that respect the
 *		attribute should create versions specialized for parallel == true and
 *		parallel == false with unnecessary branches removed.
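 *
 *		Illustrative sketch of the effect (not literal compiler output):
 *		once ExecHashJoinImpl(pstate, false) is inlined, a branch such as
 *
 *			if (parallel)
 *				... parallel-aware path ...
 *			else
 *				... parallel-oblivious path ...
 *
 *		constant-folds so that only the else arm survives in ExecHashJoin().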
 *
 *		Note: the relation we build hash table on is the "inner"
 *			  the other one is "outer".
 * ----------------------------------------------------------------
 */
static pg_attribute_always_inline TupleTableSlot *
ExecHashJoinImpl(PlanState *pstate, bool parallel)
{
	HashJoinState *node = castNode(HashJoinState, pstate);
	PlanState  *outerNode;
	HashState  *hashNode;
	ExprState  *joinqual;
	ExprState  *otherqual;
	ExprContext *econtext;
	HashJoinTable hashtable;
	TupleTableSlot *outerTupleSlot;
	uint32		hashvalue;
	int			batchno;
	ParallelHashJoinState *parallel_state;

	/*
	 * get information from HashJoin node
	 */
	joinqual = node->js.joinqual;
	otherqual = node->js.ps.qual;
	hashNode = (HashState *) innerPlanState(node);
	outerNode = outerPlanState(node);
	hashtable = node->hj_HashTable;
	econtext = node->js.ps.ps_ExprContext;
	parallel_state = hashNode->parallel_state;

	/*
	 * Reset per-tuple memory context to free any expression evaluation
	 * storage allocated in the previous tuple cycle.
	 */
	ResetExprContext(econtext);

	/*
	 * run the hash join state machine
	 */
	for (;;)
	{
		/*
		 * It's possible to iterate this loop many times before returning a
		 * tuple, in some pathological cases such as needing to move much of
		 * the current batch to a later batch.  So let's check for interrupts
		 * each time through.
		 */
		CHECK_FOR_INTERRUPTS();

		switch (node->hj_JoinState)
		{
			case HJ_BUILD_HASHTABLE:

				/*
				 * First time through: build hash table for inner relation.
				 */
				Assert(hashtable == NULL);

				/*
				 * If the outer relation is completely empty, and it's not
				 * right/full join, we can quit without building the hash
				 * table.  However, for an inner join it is only a win to
				 * check this when the outer relation's startup cost is less
				 * than the projected cost of building the hash table.
				 * Otherwise it's best to build the hash table first and see
				 * if the inner relation is empty.  (When it's a left join, we
				 * should always make this check, since we aren't going to be
				 * able to skip the join on the strength of an empty inner
				 * relation anyway.)
				 *
				 * If we are rescanning the join, we make use of information
				 * gained on the previous scan: don't bother to try the
				 * prefetch if the previous scan found the outer relation
				 * nonempty.  This is not 100% reliable since with new
				 * parameters the outer relation might yield different
				 * results, but it's a good heuristic.
				 *
				 * The only way to make the check is to try to fetch a tuple
				 * from the outer plan node.  If we succeed, we have to stash
				 * it away for later consumption by ExecHashJoinOuterGetTuple.
				 */
				if (HJ_FILL_INNER(node))
				{
					/* no chance to not build the hash table */
					node->hj_FirstOuterTupleSlot = NULL;
				}
				else if (parallel)
				{
					/*
					 * The empty-outer optimization is not implemented for
					 * shared hash tables, because no one participant can
					 * determine that there are no outer tuples, and it's not
					 * yet clear that it's worth the synchronization overhead
					 * of reaching consensus to figure that out.  So we have
					 * to build the hash table.
					 */
					node->hj_FirstOuterTupleSlot = NULL;
				}
				else if (HJ_FILL_OUTER(node) ||
						 (outerNode->plan->startup_cost < hashNode->ps.plan->total_cost &&
						  !node->hj_OuterNotEmpty))
				{
					node->hj_FirstOuterTupleSlot = ExecProcNode(outerNode);
					if (TupIsNull(node->hj_FirstOuterTupleSlot))
					{
						node->hj_OuterNotEmpty = false;
						return NULL;
					}
					else
						node->hj_OuterNotEmpty = true;
				}
				else
					node->hj_FirstOuterTupleSlot = NULL;

				/*
				 * Create the hash table.  If using Parallel Hash, then
				 * whoever gets here first will create the hash table and any
				 * later arrivals will merely attach to it.
				 */
				hashtable = ExecHashTableCreate(hashNode,
												node->hj_HashOperators,
												node->hj_Collations,
												HJ_FILL_INNER(node));
				node->hj_HashTable = hashtable;

				/*
				 * Execute the Hash node, to build the hash table.  If using
				 * Parallel Hash, then we'll try to help hashing unless we
				 * arrived too late.
				 */
				hashNode->hashtable = hashtable;
				(void) MultiExecProcNode((PlanState *) hashNode);

				/*
				 * If the inner relation is completely empty, and we're not
				 * doing a left outer join, we can quit without scanning the
				 * outer relation.
				 */
				if (hashtable->totalTuples == 0 && !HJ_FILL_OUTER(node))
					return NULL;

				/*
				 * need to remember whether nbatch has increased since we
				 * began scanning the outer relation
				 */
				hashtable->nbatch_outstart = hashtable->nbatch;

				/*
				 * Reset OuterNotEmpty for scan.  (It's OK if we fetched a
				 * tuple above, because ExecHashJoinOuterGetTuple will
				 * immediately set it again.)
				 */
				node->hj_OuterNotEmpty = false;

				if (parallel)
				{
					Barrier    *build_barrier;

					build_barrier = &parallel_state->build_barrier;
					Assert(BarrierPhase(build_barrier) == PHJ_BUILD_HASHING_OUTER ||
						   BarrierPhase(build_barrier) == PHJ_BUILD_DONE);
					if (BarrierPhase(build_barrier) == PHJ_BUILD_HASHING_OUTER)
					{
						/*
						 * If multi-batch, we need to hash the outer relation
						 * up front.
						 */
						if (hashtable->nbatch > 1)
							ExecParallelHashJoinPartitionOuter(node);
						BarrierArriveAndWait(build_barrier,
											 WAIT_EVENT_HASH_BUILD_HASHING_OUTER);
					}
					Assert(BarrierPhase(build_barrier) == PHJ_BUILD_DONE);

					/* Each backend should now select a batch to work on. */
					hashtable->curbatch = -1;
					node->hj_JoinState = HJ_NEED_NEW_BATCH;

					continue;
				}
				else
					node->hj_JoinState = HJ_NEED_NEW_OUTER;

				/* FALL THRU */

			case HJ_NEED_NEW_OUTER:

				/*
				 * We don't have an outer tuple, try to get the next one
				 */
				if (parallel)
					outerTupleSlot =
						ExecParallelHashJoinOuterGetTuple(outerNode, node,
														  &hashvalue);
				else
					outerTupleSlot =
						ExecHashJoinOuterGetTuple(outerNode, node, &hashvalue);

				if (TupIsNull(outerTupleSlot))
				{
					/* end of batch, or maybe whole join */
					if (HJ_FILL_INNER(node))
					{
						/* set up to scan for unmatched inner tuples */
						ExecPrepHashTableForUnmatched(node);
						node->hj_JoinState = HJ_FILL_INNER_TUPLES;
					}
					else
						node->hj_JoinState = HJ_NEED_NEW_BATCH;
					continue;
				}

				econtext->ecxt_outertuple = outerTupleSlot;
				node->hj_MatchedOuter = false;

				/*
				 * Find the corresponding bucket for this tuple in the main
				 * hash table or skew hash table.
				 */
				node->hj_CurHashValue = hashvalue;
				ExecHashGetBucketAndBatch(hashtable, hashvalue,
										  &node->hj_CurBucketNo, &batchno);
				node->hj_CurSkewBucketNo = ExecHashGetSkewBucket(hashtable,
																 hashvalue);
				node->hj_CurTuple = NULL;

				/*
				 * The tuple might not belong to the current batch (where
				 * "current batch" includes the skew buckets if any).
				 */
				if (batchno != hashtable->curbatch &&
					node->hj_CurSkewBucketNo == INVALID_SKEW_BUCKET_NO)
				{
					bool		shouldFree;
					MinimalTuple mintuple = ExecFetchSlotMinimalTuple(outerTupleSlot,
																	  &shouldFree);

					/*
					 * Need to postpone this outer tuple to a later batch.
					 * Save it in the corresponding outer-batch file.
					 */
					Assert(parallel_state == NULL);
					Assert(batchno > hashtable->curbatch);
					ExecHashJoinSaveTuple(mintuple, hashvalue,
										  &hashtable->outerBatchFile[batchno]);

					if (shouldFree)
						heap_free_minimal_tuple(mintuple);

					/* Loop around, staying in HJ_NEED_NEW_OUTER state */
					continue;
				}

				/* OK, let's scan the bucket for matches */
				node->hj_JoinState = HJ_SCAN_BUCKET;

				/* FALL THRU */

			case HJ_SCAN_BUCKET:

				/*
				 * Scan the selected hash bucket for matches to current outer
				 */
				if (parallel)
				{
					if (!ExecParallelScanHashBucket(node, econtext))
					{
						/* out of matches; check for possible outer-join fill */
						node->hj_JoinState = HJ_FILL_OUTER_TUPLE;
						continue;
					}
				}
				else
				{
					if (!ExecScanHashBucket(node, econtext))
					{
						/* out of matches; check for possible outer-join fill */
						node->hj_JoinState = HJ_FILL_OUTER_TUPLE;
						continue;
					}
				}

				/*
				 * We've got a match, but still need to test non-hashed quals.
				 * ExecScanHashBucket already set up all the state needed to
				 * call ExecQual.
				 *
				 * If we pass the qual, then save state for next call and have
				 * ExecProject form the projection, store it in the tuple
				 * table, and return the slot.
				 *
				 * Only the joinquals determine tuple match status, but all
				 * quals must pass to actually return the tuple.
				 */
				if (joinqual == NULL || ExecQual(joinqual, econtext))
				{
					node->hj_MatchedOuter = true;
					HeapTupleHeaderSetMatch(HJTUPLE_MINTUPLE(node->hj_CurTuple));

					/* In an antijoin, we never return a matched tuple */
					if (node->js.jointype == JOIN_ANTI)
					{
						node->hj_JoinState = HJ_NEED_NEW_OUTER;
						continue;
					}

					/*
					 * If we only need to join to the first matching inner
					 * tuple, then consider returning this one, but after that
					 * continue with next outer tuple.
					 */
					if (node->js.single_match)
						node->hj_JoinState = HJ_NEED_NEW_OUTER;

					if (otherqual == NULL || ExecQual(otherqual, econtext))
						return ExecProject(node->js.ps.ps_ProjInfo);
					else
						InstrCountFiltered2(node, 1);
				}
				else
					InstrCountFiltered1(node, 1);
				break;

			case HJ_FILL_OUTER_TUPLE:

				/*
				 * The current outer tuple has run out of matches, so check
				 * whether to emit a dummy outer-join tuple.  Whether we emit
				 * one or not, the next state is NEED_NEW_OUTER.
				 */
				node->hj_JoinState = HJ_NEED_NEW_OUTER;

				if (!node->hj_MatchedOuter &&
					HJ_FILL_OUTER(node))
				{
					/*
					 * Generate a fake join tuple with nulls for the inner
					 * tuple, and return it if it passes the non-join quals.
					 */
					econtext->ecxt_innertuple = node->hj_NullInnerTupleSlot;

					if (otherqual == NULL || ExecQual(otherqual, econtext))
						return ExecProject(node->js.ps.ps_ProjInfo);
					else
						InstrCountFiltered2(node, 1);
				}
				break;

			case HJ_FILL_INNER_TUPLES:

				/*
				 * We have finished a batch, but we are doing right/full join,
				 * so any unmatched inner tuples in the hashtable have to be
				 * emitted before we continue to the next batch.
				 */
				if (!ExecScanHashTableForUnmatched(node, econtext))
				{
					/* no more unmatched tuples */
					node->hj_JoinState = HJ_NEED_NEW_BATCH;
					continue;
				}

				/*
				 * Generate a fake join tuple with nulls for the outer tuple,
				 * and return it if it passes the non-join quals.
				 */
				econtext->ecxt_outertuple = node->hj_NullOuterTupleSlot;

				if (otherqual == NULL || ExecQual(otherqual, econtext))
					return ExecProject(node->js.ps.ps_ProjInfo);
				else
					InstrCountFiltered2(node, 1);
				break;

			case HJ_NEED_NEW_BATCH:

				/*
				 * Try to advance to next batch.  Done if there are no more.
				 */
				if (parallel)
				{
					if (!ExecParallelHashJoinNewBatch(node))
						return NULL;	/* end of parallel-aware join */
				}
				else
				{
					if (!ExecHashJoinNewBatch(node))
						return NULL;	/* end of parallel-oblivious join */
				}
				node->hj_JoinState = HJ_NEED_NEW_OUTER;
				break;

			default:
				elog(ERROR, "unrecognized hashjoin state: %d",
					 (int) node->hj_JoinState);
		}
	}
}

/* ----------------------------------------------------------------
 *		ExecHashJoin
 *
 *		Parallel-oblivious version.
 * ----------------------------------------------------------------
 */
static TupleTableSlot *			/* return: a tuple or NULL */
ExecHashJoin(PlanState *pstate)
{
	/*
	 * On sufficiently smart compilers this should be inlined with the
	 * parallel-aware branches removed.
	 */
	return ExecHashJoinImpl(pstate, false);
}

/* ----------------------------------------------------------------
 *		ExecParallelHashJoin
 *
 *		Parallel-aware version.
 * ----------------------------------------------------------------
 */
static TupleTableSlot *			/* return: a tuple or NULL */
ExecParallelHashJoin(PlanState *pstate)
{
	/*
	 * On sufficiently smart compilers this should be inlined with the
	 * parallel-oblivious branches removed.
	 */
	return ExecHashJoinImpl(pstate, true);
}

/* ----------------------------------------------------------------
 *		ExecInitHashJoin
 *
 *		Init routine for HashJoin node.
 * ----------------------------------------------------------------
 */
HashJoinState *
ExecInitHashJoin(HashJoin *node, EState *estate, int eflags)
{
	HashJoinState *hjstate;
	Plan	   *outerNode;
	Hash	   *hashNode;
	TupleDesc	outerDesc,
				innerDesc;
	const TupleTableSlotOps *ops;

	/* check for unsupported flags */
	Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));

	/*
	 * create state structure
	 */
	hjstate = makeNode(HashJoinState);
	hjstate->js.ps.plan = (Plan *) node;
	hjstate->js.ps.state = estate;

	/*
	 * See ExecHashJoinInitializeDSM() and ExecHashJoinInitializeWorker()
	 * where this function may be replaced with a parallel version, if we
	 * managed to launch a parallel query.
	 */
	hjstate->js.ps.ExecProcNode = ExecHashJoin;
	hjstate->js.jointype = node->join.jointype;

	/*
	 * Miscellaneous initialization
	 *
	 * create expression context for node
	 */
	ExecAssignExprContext(estate, &hjstate->js.ps);

	/*
	 * initialize child nodes
	 *
	 * Note: we could suppress the REWIND flag for the inner input, which
	 * would amount to betting that the hash will be a single batch.  Not
	 * clear if this would be a win or not.
	 */
	outerNode = outerPlan(node);
	hashNode = (Hash *) innerPlan(node);

	outerPlanState(hjstate) = ExecInitNode(outerNode, estate, eflags);
	outerDesc = ExecGetResultType(outerPlanState(hjstate));
	innerPlanState(hjstate) = ExecInitNode((Plan *) hashNode, estate, eflags);
	innerDesc = ExecGetResultType(innerPlanState(hjstate));

	/*
	 * Initialize result slot, type and projection.
	 */
	ExecInitResultTupleSlotTL(&hjstate->js.ps, &TTSOpsVirtual);
	ExecAssignProjectionInfo(&hjstate->js.ps, NULL);

	/*
	 * tuple table initialization
	 */
	ops = ExecGetResultSlotOps(outerPlanState(hjstate), NULL);
	hjstate->hj_OuterTupleSlot = ExecInitExtraTupleSlot(estate, outerDesc,
														ops);

	/*
	 * detect whether we need only consider the first matching inner tuple
	 */
	hjstate->js.single_match = (node->join.inner_unique ||
								node->join.jointype == JOIN_SEMI);

	/* set up null tuples for outer joins, if needed */
	switch (node->join.jointype)
	{
		case JOIN_INNER:
		case JOIN_SEMI:
			break;
		case JOIN_LEFT:
		case JOIN_ANTI:
			hjstate->hj_NullInnerTupleSlot =
				ExecInitNullTupleSlot(estate, innerDesc, &TTSOpsVirtual);
			break;
		case JOIN_RIGHT:
			hjstate->hj_NullOuterTupleSlot =
				ExecInitNullTupleSlot(estate, outerDesc, &TTSOpsVirtual);
			break;
		case JOIN_FULL:
			hjstate->hj_NullOuterTupleSlot =
				ExecInitNullTupleSlot(estate, outerDesc, &TTSOpsVirtual);
			hjstate->hj_NullInnerTupleSlot =
				ExecInitNullTupleSlot(estate, innerDesc, &TTSOpsVirtual);
			break;
		default:
			elog(ERROR, "unrecognized join type: %d",
				 (int) node->join.jointype);
	}

	/*
	 * now for some voodoo.  our temporary tuple slot is actually the result
	 * tuple slot of the Hash node (which is our inner plan).  we can do this
	 * because Hash nodes don't return tuples via ExecProcNode() -- instead
	 * the hash join node uses ExecScanHashBucket() to get at the contents of
	 * the hash table.  -cim 6/9/91
	 */
	{
		HashState  *hashstate = (HashState *) innerPlanState(hjstate);
		TupleTableSlot *slot = hashstate->ps.ps_ResultTupleSlot;

		hjstate->hj_HashTupleSlot = slot;
	}

	/*
	 * initialize child expressions
	 */
	hjstate->js.ps.qual =
		ExecInitQual(node->join.plan.qual, (PlanState *) hjstate);
	hjstate->js.joinqual =
		ExecInitQual(node->join.joinqual, (PlanState *) hjstate);
	hjstate->hashclauses =
		ExecInitQual(node->hashclauses, (PlanState *) hjstate);

	/*
	 * initialize hash-specific info
	 */
	hjstate->hj_HashTable = NULL;
	hjstate->hj_FirstOuterTupleSlot = NULL;

	hjstate->hj_CurHashValue = 0;
	hjstate->hj_CurBucketNo = 0;
	hjstate->hj_CurSkewBucketNo = INVALID_SKEW_BUCKET_NO;
	hjstate->hj_CurTuple = NULL;

	hjstate->hj_OuterHashKeys = ExecInitExprList(node->hashkeys,
												 (PlanState *) hjstate);
	hjstate->hj_HashOperators = node->hashoperators;
	hjstate->hj_Collations = node->hashcollations;

	hjstate->hj_JoinState = HJ_BUILD_HASHTABLE;
	hjstate->hj_MatchedOuter = false;
	hjstate->hj_OuterNotEmpty = false;

	return hjstate;
}

/* ----------------------------------------------------------------
 *		ExecEndHashJoin
 *
 *		clean up routine for HashJoin node
 * ----------------------------------------------------------------
 */
void
ExecEndHashJoin(HashJoinState *node)
{
	/*
	 * Free hash table
	 */
	if (node->hj_HashTable)
	{
		ExecHashTableDestroy(node->hj_HashTable);
		node->hj_HashTable = NULL;
	}

	/*
	 * Free the exprcontext
	 */
	ExecFreeExprContext(&node->js.ps);

	/*
	 * clean out the tuple table
	 */
	ExecClearTuple(node->js.ps.ps_ResultTupleSlot);
	ExecClearTuple(node->hj_OuterTupleSlot);
	ExecClearTuple(node->hj_HashTupleSlot);

	/*
	 * clean up subtrees
	 */
	ExecEndNode(outerPlanState(node));
	ExecEndNode(innerPlanState(node));
}

/*
 * ExecHashJoinOuterGetTuple
 *
 *		get the next outer tuple for a parallel oblivious hashjoin: either by
 *		executing the outer plan node in the first pass, or from the temp
 *		files for the hashjoin batches.
 *
 * Returns a null slot if no more outer tuples (within the current batch).
 *
 * On success, the tuple's hash value is stored at *hashvalue --- this is
 * either originally computed, or re-read from the temp file.
 */
static TupleTableSlot *
ExecHashJoinOuterGetTuple(PlanState *outerNode,
						  HashJoinState *hjstate,
						  uint32 *hashvalue)
{
	HashJoinTable hashtable = hjstate->hj_HashTable;
	int			curbatch = hashtable->curbatch;
	TupleTableSlot *slot;

	if (curbatch == 0)			/* if it is the first pass */
	{
		/*
		 * Check to see if first outer tuple was already fetched by
		 * ExecHashJoin() and not used yet.
		 */
		slot = hjstate->hj_FirstOuterTupleSlot;
		if (!TupIsNull(slot))
			hjstate->hj_FirstOuterTupleSlot = NULL;
		else
			slot = ExecProcNode(outerNode);

		while (!TupIsNull(slot))
		{
			/*
			 * We have to compute the tuple's hash value.
			 */
			ExprContext *econtext = hjstate->js.ps.ps_ExprContext;

			econtext->ecxt_outertuple = slot;
			if (ExecHashGetHashValue(hashtable, econtext,
									 hjstate->hj_OuterHashKeys,
									 true,	/* outer tuple */
									 HJ_FILL_OUTER(hjstate),
									 hashvalue))
			{
				/* remember outer relation is not empty for possible rescan */
				hjstate->hj_OuterNotEmpty = true;

				return slot;
			}

			/*
			 * That tuple couldn't match because of a NULL, so discard it and
			 * continue with the next one.
			 */
			slot = ExecProcNode(outerNode);
		}
	}
	else if (curbatch < hashtable->nbatch)
	{
		BufFile    *file = hashtable->outerBatchFile[curbatch];

		/*
		 * In outer-join cases, we could get here even though the batch file
		 * is empty.
		 */
		if (file == NULL)
			return NULL;

		slot = ExecHashJoinGetSavedTuple(hjstate,
										 file,
										 hashvalue,
										 hjstate->hj_OuterTupleSlot);
		if (!TupIsNull(slot))
			return slot;
	}

	/* End of this batch */
	return NULL;
}

/*
 * ExecHashJoinOuterGetTuple variant for the parallel case.
 */
static TupleTableSlot *
ExecParallelHashJoinOuterGetTuple(PlanState *outerNode,
								  HashJoinState *hjstate,
								  uint32 *hashvalue)
{
	HashJoinTable hashtable = hjstate->hj_HashTable;
	int			curbatch = hashtable->curbatch;
	TupleTableSlot *slot;

	/*
	 * In the Parallel Hash case we only run the outer plan directly for
	 * single-batch hash joins.  Otherwise we have to go to batch files, even
	 * for batch 0.
	 */
	if (curbatch == 0 && hashtable->nbatch == 1)
	{
		slot = ExecProcNode(outerNode);

		while (!TupIsNull(slot))
		{
			ExprContext *econtext = hjstate->js.ps.ps_ExprContext;

			econtext->ecxt_outertuple = slot;
			if (ExecHashGetHashValue(hashtable, econtext,
									 hjstate->hj_OuterHashKeys,
									 true,	/* outer tuple */
									 HJ_FILL_OUTER(hjstate),
									 hashvalue))
				return slot;

			/*
			 * That tuple couldn't match because of a NULL, so discard it and
			 * continue with the next one.
			 */
			slot = ExecProcNode(outerNode);
		}
	}
	else if (curbatch < hashtable->nbatch)
	{
		MinimalTuple tuple;

		tuple = sts_parallel_scan_next(hashtable->batches[curbatch].outer_tuples,
									   hashvalue);
		if (tuple != NULL)
		{
			ExecForceStoreMinimalTuple(tuple,
									   hjstate->hj_OuterTupleSlot,
									   false);
			slot = hjstate->hj_OuterTupleSlot;
			return slot;
		}
		else
			ExecClearTuple(hjstate->hj_OuterTupleSlot);
	}

	/* End of this batch */
	return NULL;
}

/*
 * ExecHashJoinNewBatch
 *		switch to a new hashjoin batch
 *
 * Returns true if successful, false if there are no more batches.
 */
static bool
ExecHashJoinNewBatch(HashJoinState *hjstate)
{
	HashJoinTable hashtable = hjstate->hj_HashTable;
	int			nbatch;
	int			curbatch;
	BufFile    *innerFile;
	TupleTableSlot *slot;
	uint32		hashvalue;

	nbatch = hashtable->nbatch;
	curbatch = hashtable->curbatch;

	if (curbatch > 0)
	{
		/*
		 * We no longer need the previous outer batch file; close it right
		 * away to free disk space.
		 */
		if (hashtable->outerBatchFile[curbatch])
			BufFileClose(hashtable->outerBatchFile[curbatch]);
		hashtable->outerBatchFile[curbatch] = NULL;
	}
	else						/* we just finished the first batch */
	{
		/*
		 * Reset some of the skew optimization state variables, since we no
		 * longer need to consider skew tuples after the first batch.  The
		 * memory context reset we are about to do will release the skew
		 * hashtable itself.
		 */
		hashtable->skewEnabled = false;
		hashtable->skewBucket = NULL;
		hashtable->skewBucketNums = NULL;
		hashtable->nSkewBuckets = 0;
		hashtable->spaceUsedSkew = 0;
	}

	/*
	 * We can always skip over any batches that are completely empty on both
	 * sides.  We can sometimes skip over batches that are empty on only one
	 * side, but there are exceptions:
	 *
	 * 1. In a left/full outer join, we have to process outer batches even if
	 * the inner batch is empty.  Similarly, in a right/full outer join, we
	 * have to process inner batches even if the outer batch is empty.
	 *
	 * 2. If we have increased nbatch since the initial estimate, we have to
	 * scan inner batches since they might contain tuples that need to be
	 * reassigned to later inner batches.
	 *
	 * 3. Similarly, if we have increased nbatch since starting the outer
	 * scan, we have to rescan outer batches in case they contain tuples that
	 * need to be reassigned.
	 */
	curbatch++;
	while (curbatch < nbatch &&
		   (hashtable->outerBatchFile[curbatch] == NULL ||
			hashtable->innerBatchFile[curbatch] == NULL))
	{
		if (hashtable->outerBatchFile[curbatch] &&
			HJ_FILL_OUTER(hjstate))
			break;				/* must process due to rule 1 */
		if (hashtable->innerBatchFile[curbatch] &&
			HJ_FILL_INNER(hjstate))
			break;				/* must process due to rule 1 */
		if (hashtable->innerBatchFile[curbatch] &&
			nbatch != hashtable->nbatch_original)
			break;				/* must process due to rule 2 */
		if (hashtable->outerBatchFile[curbatch] &&
			nbatch != hashtable->nbatch_outstart)
			break;				/* must process due to rule 3 */
		/* We can ignore this batch. */
		/* Release associated temp files right away. */
		if (hashtable->innerBatchFile[curbatch])
			BufFileClose(hashtable->innerBatchFile[curbatch]);
		hashtable->innerBatchFile[curbatch] = NULL;
		if (hashtable->outerBatchFile[curbatch])
			BufFileClose(hashtable->outerBatchFile[curbatch]);
		hashtable->outerBatchFile[curbatch] = NULL;
		curbatch++;
	}

	if (curbatch >= nbatch)
		return false;			/* no more batches */

	hashtable->curbatch = curbatch;

	/*
	 * Reload the hash table with the new inner batch (which could be empty)
	 */
	ExecHashTableReset(hashtable);

	innerFile = hashtable->innerBatchFile[curbatch];

	if (innerFile != NULL)
	{
		if (BufFileSeek(innerFile, 0, 0L, SEEK_SET))
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not rewind hash-join temporary file: %m")));

		while ((slot = ExecHashJoinGetSavedTuple(hjstate,
												 innerFile,
												 &hashvalue,
												 hjstate->hj_HashTupleSlot)))
		{
			/*
			 * NOTE: some tuples may be sent to future batches.  Also, it is
			 * possible for hashtable->nbatch to be increased here!
			 */
			ExecHashTableInsert(hashtable, slot, hashvalue);
		}

		/*
		 * after we build the hash table, the inner batch file is no longer
		 * needed
		 */
		BufFileClose(innerFile);
		hashtable->innerBatchFile[curbatch] = NULL;
	}

	/*
	 * Rewind outer batch file (if present), so that we can start reading it.
	 */
	if (hashtable->outerBatchFile[curbatch] != NULL)
	{
		if (BufFileSeek(hashtable->outerBatchFile[curbatch], 0, 0L, SEEK_SET))
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not rewind hash-join temporary file: %m")));
	}

	return true;
}

/*
 * Choose a batch to work on, and attach to it.  Returns true if successful,
 * false if there are no more batches.
 */
static bool
ExecParallelHashJoinNewBatch(HashJoinState *hjstate)
{
	HashJoinTable hashtable = hjstate->hj_HashTable;
	int			start_batchno;
	int			batchno;

	/*
	 * If we started up so late that the batch tracking array has been freed
	 * already by ExecHashTableDetach(), then we are finished.  See also
	 * ExecParallelHashEnsureBatchAccessors().
	 */
	if (hashtable->batches == NULL)
		return false;

	/*
	 * If we were already attached to a batch, remember not to bother checking
	 * it again, and detach from it (possibly freeing the hash table if we are
	 * last to detach).
	 */
	if (hashtable->curbatch >= 0)
	{
		hashtable->batches[hashtable->curbatch].done = true;
		ExecHashTableDetachBatch(hashtable);
	}

	/*
	 * Search for a batch that isn't done.  We use an atomic counter to start
	 * our search at a different batch in every participant when there are
	 * more batches than participants.
	 */
	batchno = start_batchno =
		pg_atomic_fetch_add_u32(&hashtable->parallel_state->distributor, 1) %
		hashtable->nbatch;
	do
	{
		uint32		hashvalue;
		MinimalTuple tuple;
		TupleTableSlot *slot;

		if (!hashtable->batches[batchno].done)
		{
			SharedTuplestoreAccessor *inner_tuples;
			Barrier    *batch_barrier =
			&hashtable->batches[batchno].shared->batch_barrier;

			switch (BarrierAttach(batch_barrier))
			{
				case PHJ_BATCH_ELECTING:

					/* One backend allocates the hash table. */
					if (BarrierArriveAndWait(batch_barrier,
											 WAIT_EVENT_HASH_BATCH_ELECTING))
						ExecParallelHashTableAlloc(hashtable, batchno);
					/* Fall through. */

				case PHJ_BATCH_ALLOCATING:
					/* Wait for allocation to complete. */
					BarrierArriveAndWait(batch_barrier,
										 WAIT_EVENT_HASH_BATCH_ALLOCATING);
					/* Fall through. */

				case PHJ_BATCH_LOADING:
					/* Start (or join in) loading tuples. */
					ExecParallelHashTableSetCurrentBatch(hashtable, batchno);
					inner_tuples = hashtable->batches[batchno].inner_tuples;
					sts_begin_parallel_scan(inner_tuples);
					while ((tuple = sts_parallel_scan_next(inner_tuples,
														   &hashvalue)))
					{
						ExecForceStoreMinimalTuple(tuple,
												   hjstate->hj_HashTupleSlot,
												   false);
						slot = hjstate->hj_HashTupleSlot;
						ExecParallelHashTableInsertCurrentBatch(hashtable, slot,
																hashvalue);
					}
					sts_end_parallel_scan(inner_tuples);
					BarrierArriveAndWait(batch_barrier,
										 WAIT_EVENT_HASH_BATCH_LOADING);
					/* Fall through. */

				case PHJ_BATCH_PROBING:

					/*
					 * This batch is ready to probe.  Return control to
					 * caller.  We stay attached to batch_barrier so that the
					 * hash table stays alive until everyone's finished
					 * probing it, but no participant is allowed to wait at
					 * this barrier again (or else a deadlock could occur).
					 * All attached participants must eventually call
					 * BarrierArriveAndDetach() so that the final phase
					 * PHJ_BATCH_DONE can be reached.
					 */
					ExecParallelHashTableSetCurrentBatch(hashtable, batchno);
					sts_begin_parallel_scan(hashtable->batches[batchno].outer_tuples);
					return true;

				case PHJ_BATCH_DONE:

					/*
					 * Already done.  Detach and go around again (if any
					 * remain).
					 */
					BarrierDetach(batch_barrier);
					hashtable->batches[batchno].done = true;
					hashtable->curbatch = -1;
					break;

				default:
					elog(ERROR, "unexpected batch phase %d",
						 BarrierPhase(batch_barrier));
			}
		}
		batchno = (batchno + 1) % hashtable->nbatch;
	} while (batchno != start_batchno);

	return false;
}

/*
 * ExecHashJoinSaveTuple
 *		save a tuple to a batch file.
 *
 * The data recorded in the file for each tuple is its hash value,
 * then the tuple in MinimalTuple format.
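 *
 * A sketch of the resulting on-disk record layout (note that t_len is the
 * first field of a MinimalTuple, which is what lets the reader fetch the
 * hash value and the length together in a single read):
 *
 *     +------------------+--------------+--------------------------+
 *     | uint32 hashvalue | uint32 t_len | rest of the MinimalTuple |
 *     +------------------+--------------+--------------------------+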
 *
 * Note: it is important always to call this in the regular executor
 * context, not in a shorter-lived context; else the temp file buffers
 * will get messed up.
 */
void
ExecHashJoinSaveTuple(MinimalTuple tuple, uint32 hashvalue,
					  BufFile **fileptr)
{
	BufFile    *file = *fileptr;
	size_t		written;

	if (file == NULL)
	{
		/* First write to this batch file, so open it. */
		file = BufFileCreateTemp(false);
		*fileptr = file;
	}

	written = BufFileWrite(file, (void *) &hashvalue, sizeof(uint32));
	if (written != sizeof(uint32))
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not write to hash-join temporary file: %m")));

	written = BufFileWrite(file, (void *) tuple, tuple->t_len);
	if (written != tuple->t_len)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not write to hash-join temporary file: %m")));
}

/*
 * ExecHashJoinGetSavedTuple
 *		read the next tuple from a batch file.  Return NULL if no more.
 *
 * On success, *hashvalue is set to the tuple's hash value, and the tuple
 * itself is stored in the given slot.
 */
static TupleTableSlot *
ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
						  BufFile *file,
						  uint32 *hashvalue,
						  TupleTableSlot *tupleSlot)
{
	uint32		header[2];
	size_t		nread;
	MinimalTuple tuple;

	/*
	 * We check for interrupts here because this is typically taken as an
	 * alternative code path to an ExecProcNode() call, which would include
	 * such a check.
	 */
	CHECK_FOR_INTERRUPTS();

	/*
	 * Since both the hash value and the MinimalTuple length word are uint32,
	 * we can read them both in one BufFileRead() call without any type
	 * cheating.
	 */
	nread = BufFileRead(file, (void *) header, sizeof(header));
	if (nread == 0)				/* end of file */
	{
		ExecClearTuple(tupleSlot);
		return NULL;
	}
	if (nread != sizeof(header))
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not read from hash-join temporary file: %m")));
	*hashvalue = header[0];
	tuple = (MinimalTuple) palloc(header[1]);
	tuple->t_len = header[1];
	nread = BufFileRead(file,
						(void *) ((char *) tuple + sizeof(uint32)),
						header[1] - sizeof(uint32));
	if (nread != header[1] - sizeof(uint32))
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not read from hash-join temporary file: %m")));
	ExecForceStoreMinimalTuple(tuple, tupleSlot, true);
	return tupleSlot;
}


void
ExecReScanHashJoin(HashJoinState *node)
{
	/*
	 * In a multi-batch join, we currently have to do rescans the hard way,
	 * primarily because batch temp files may have already been released.  But
	 * if it's a single-batch join, and there is no parameter change for the
	 * inner subnode, then we can just re-use the existing hash table without
	 * rebuilding it.
	 */
	if (node->hj_HashTable != NULL)
	{
		if (node->hj_HashTable->nbatch == 1 &&
			node->js.ps.righttree->chgParam == NULL)
		{
			/*
			 * Okay to reuse the hash table; needn't rescan inner, either.
			 *
			 * However, if it's a right/full join, we'd better reset the
			 * inner-tuple match flags contained in the table.
			 */
			if (HJ_FILL_INNER(node))
				ExecHashTableResetMatchFlags(node->hj_HashTable);

			/*
			 * Also, we need to reset our state about the emptiness of the
			 * outer relation, so that the new scan of the outer will update
			 * it correctly if it turns out to be empty this time.  (There's
			 * no harm in clearing it now because ExecHashJoin won't need the
			 * info.  In the other cases, where the hash table doesn't exist
			 * or we are destroying it, we leave this state alone because
			 * ExecHashJoin will need it the first time through.)
			 */
			node->hj_OuterNotEmpty = false;

			/* ExecHashJoin can skip the BUILD_HASHTABLE step */
			node->hj_JoinState = HJ_NEED_NEW_OUTER;
		}
		else
		{
			/* must destroy and rebuild hash table */
			ExecHashTableDestroy(node->hj_HashTable);
			node->hj_HashTable = NULL;
			node->hj_JoinState = HJ_BUILD_HASHTABLE;

			/*
			 * if chgParam of subnode is not null then plan will be re-scanned
			 * by first ExecProcNode.
			 */
			if (node->js.ps.righttree->chgParam == NULL)
				ExecReScan(node->js.ps.righttree);
		}
	}

	/* Always reset intra-tuple state */
	node->hj_CurHashValue = 0;
	node->hj_CurBucketNo = 0;
	node->hj_CurSkewBucketNo = INVALID_SKEW_BUCKET_NO;
	node->hj_CurTuple = NULL;

	node->hj_MatchedOuter = false;
	node->hj_FirstOuterTupleSlot = NULL;

	/*
	 * if chgParam of subnode is not null then plan will be re-scanned by
	 * first ExecProcNode.
	 */
	if (node->js.ps.lefttree->chgParam == NULL)
		ExecReScan(node->js.ps.lefttree);
}

void
ExecShutdownHashJoin(HashJoinState *node)
{
	if (node->hj_HashTable)
	{
		/*
		 * Detach from shared state before DSM memory goes away.  This makes
		 * sure that we don't have any pointers into DSM memory by the time
		 * ExecEndHashJoin runs.
		 */
		ExecHashTableDetachBatch(node->hj_HashTable);
		ExecHashTableDetach(node->hj_HashTable);
	}
}

static void
ExecParallelHashJoinPartitionOuter(HashJoinState *hjstate)
{
	PlanState  *outerState = outerPlanState(hjstate);
	ExprContext *econtext = hjstate->js.ps.ps_ExprContext;
	HashJoinTable hashtable = hjstate->hj_HashTable;
	TupleTableSlot *slot;
	uint32		hashvalue;
	int			i;

	Assert(hjstate->hj_FirstOuterTupleSlot == NULL);

	/* Execute outer plan, writing all tuples to shared tuplestores. */
	for (;;)
	{
		slot = ExecProcNode(outerState);
		if (TupIsNull(slot))
			break;
		econtext->ecxt_outertuple = slot;
		if (ExecHashGetHashValue(hashtable, econtext,
								 hjstate->hj_OuterHashKeys,
								 true,	/* outer tuple */
								 HJ_FILL_OUTER(hjstate),
								 &hashvalue))
		{
			int			batchno;
			int			bucketno;
			bool		shouldFree;
			MinimalTuple mintup = ExecFetchSlotMinimalTuple(slot, &shouldFree);

			ExecHashGetBucketAndBatch(hashtable, hashvalue, &bucketno,
									  &batchno);
			sts_puttuple(hashtable->batches[batchno].outer_tuples,
						 &hashvalue, mintup);

			if (shouldFree)
				heap_free_minimal_tuple(mintup);
		}
		CHECK_FOR_INTERRUPTS();
	}

	/* Make sure all outer partitions are readable by any backend. */
	for (i = 0; i < hashtable->nbatch; ++i)
		sts_end_write(hashtable->batches[i].outer_tuples);
}

void
ExecHashJoinEstimate(HashJoinState *state, ParallelContext *pcxt)
{
	shm_toc_estimate_chunk(&pcxt->estimator, sizeof(ParallelHashJoinState));
	shm_toc_estimate_keys(&pcxt->estimator, 1);
}

void
ExecHashJoinInitializeDSM(HashJoinState *state, ParallelContext *pcxt)
{
	int			plan_node_id = state->js.ps.plan->plan_node_id;
	HashState  *hashNode;
	ParallelHashJoinState *pstate;

	/*
	 * Disable shared hash table mode if we failed to create a real DSM
	 * segment, because that means that we don't have a DSA area to work with.
	 */
	if (pcxt->seg == NULL)
		return;

	ExecSetExecProcNode(&state->js.ps, ExecParallelHashJoin);

	/*
	 * Set up the state needed to coordinate access to the shared hash
	 * table(s), using the plan node ID as the toc key.
	 */
	pstate = shm_toc_allocate(pcxt->toc, sizeof(ParallelHashJoinState));
	shm_toc_insert(pcxt->toc, plan_node_id, pstate);

	/*
	 * Set up the shared hash join state with no batches initially.
	 * ExecHashTableCreate() will prepare at least one later and set nbatch
	 * and space_allowed.
	 */
	pstate->nbatch = 0;
	pstate->space_allowed = 0;
	pstate->batches = InvalidDsaPointer;
	pstate->old_batches = InvalidDsaPointer;
	pstate->nbuckets = 0;
	pstate->growth = PHJ_GROWTH_OK;
	pstate->chunk_work_queue = InvalidDsaPointer;
	pg_atomic_init_u32(&pstate->distributor, 0);
	pstate->nparticipants = pcxt->nworkers + 1;
	pstate->total_tuples = 0;
	LWLockInitialize(&pstate->lock,
					 LWTRANCHE_PARALLEL_HASH_JOIN);
	BarrierInit(&pstate->build_barrier, 0);
	BarrierInit(&pstate->grow_batches_barrier, 0);
	BarrierInit(&pstate->grow_buckets_barrier, 0);

	/* Set up the space we'll use for shared temporary files. */
	SharedFileSetInit(&pstate->fileset, pcxt->seg);

	/* Initialize the shared state in the hash node. */
	hashNode = (HashState *) innerPlanState(state);
	hashNode->parallel_state = pstate;
}

/* ----------------------------------------------------------------
 *		ExecHashJoinReInitializeDSM
 *
 *		Reset shared state before beginning a fresh scan.
 * ----------------------------------------------------------------
 */
void
ExecHashJoinReInitializeDSM(HashJoinState *state, ParallelContext *cxt)
{
	int			plan_node_id = state->js.ps.plan->plan_node_id;
	ParallelHashJoinState *pstate =
	shm_toc_lookup(cxt->toc, plan_node_id, false);

	/*
	 * It would be possible to reuse the shared hash table in single-batch
	 * cases by resetting and then fast-forwarding build_barrier to
	 * PHJ_BUILD_DONE and batch 0's batch_barrier to PHJ_BATCH_PROBING, but
	 * currently shared hash tables are already freed by now (by the last
	 * participant to detach from the batch).  We could consider keeping it
	 * around for single-batch joins.  We'd also need to adjust
	 * finalize_plan() so that it doesn't record a dummy dependency for
	 * Parallel Hash nodes, preventing the rescan optimization.  For now we
	 * don't try.
	 */

	/* Detach, freeing any remaining shared memory. */
	if (state->hj_HashTable != NULL)
	{
		ExecHashTableDetachBatch(state->hj_HashTable);
		ExecHashTableDetach(state->hj_HashTable);
	}

	/* Clear any shared batch files. */
	SharedFileSetDeleteAll(&pstate->fileset);

	/* Reset build_barrier to PHJ_BUILD_ELECTING so we can go around again. */
	BarrierInit(&pstate->build_barrier, 0);
}

void
ExecHashJoinInitializeWorker(HashJoinState *state,
							 ParallelWorkerContext *pwcxt)
{
	HashState  *hashNode;
	int			plan_node_id = state->js.ps.plan->plan_node_id;
	ParallelHashJoinState *pstate =
	shm_toc_lookup(pwcxt->toc, plan_node_id, false);

	/* Attach to the space for shared temporary files. */
	SharedFileSetAttach(&pstate->fileset, pwcxt->seg);

	/* Attach to the shared state in the hash node. */
	hashNode = (HashState *) innerPlanState(state);
	hashNode->parallel_state = pstate;

	ExecSetExecProcNode(&state->js.ps, ExecParallelHashJoin);
}