1 /*-------------------------------------------------------------------------
4 * Routines to handle hash join nodes
6 * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
11 * $PostgreSQL: pgsql/src/backend/executor/nodeHashjoin.c,v 1.75 2005/10/18 01:06:24 tgl Exp $
13 *-------------------------------------------------------------------------
18 #include "executor/executor.h"
19 #include "executor/hashjoin.h"
20 #include "executor/nodeHash.h"
21 #include "executor/nodeHashjoin.h"
22 #include "optimizer/clauses.h"
23 #include "utils/memutils.h"
26 static TupleTableSlot *ExecHashJoinOuterGetTuple(PlanState *outerNode,
27 HashJoinState *hjstate,
29 static TupleTableSlot *ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
32 TupleTableSlot *tupleSlot);
33 static int ExecHashJoinNewBatch(HashJoinState *hjstate);
36 /* ----------------------------------------------------------------
39 * This function implements the Hybrid Hashjoin algorithm.
41 * Note: the relation we build the hash table on is the "inner" relation;
42 * the other one is the "outer" relation.
43 * ----------------------------------------------------------------
/*
 * ExecHashJoin: produce the next joined tuple for a hash join node.
 *
 * What the visible statements do, in order:
 *
 *   1. If ps_TupFromTlist is set, ExecProject is called again so that a
 *      set-returning projection can continue; the flag is cleared once
 *      that source tuple is finished.
 *   2. For JOIN_IN with hj_MatchedOuter already set, hj_NeedNewOuter is
 *      forced on so the current outer tuple is not probed again.
 *   3. The per-tuple expression context is reset.
 *   4. When hj_HashTable is NULL (first call): one outer tuple may be
 *      prefetched into hj_FirstOuterTupleSlot (when the outer startup
 *      cost is below the Hash child total cost, or for JOIN_LEFT); the
 *      table is created with ExecHashTableCreate; the Hash child is run
 *      through MultiExecProcNode; nbatch is remembered in
 *      nbatch_outstart.  If the table holds zero tuples and the join is
 *      not JOIN_LEFT, the table is destroyed and hj_HashTable and
 *      hj_FirstOuterTupleSlot are cleared.
 *   5. Main loop.  Whenever hj_NeedNewOuter is set,
 *      ExecHashJoinOuterGetTuple supplies the next outer tuple, which is
 *      installed in ps_OuterTupleSlot and ecxt_outertuple, and
 *      ExecHashGetBucketAndBatch yields its bucket and batch number.
 *      A tuple whose batch differs from curbatch is written to
 *      outerBatchFile[batchno] with ExecHashJoinSaveTuple and the loop
 *      continues; the Assert records that such a tuple always belongs to
 *      a later batch.  Otherwise ExecScanHashBucket is called until it
 *      returns NULL; each candidate is stored in hj_HashTupleSlot (the
 *      slot is told not to free it), joinqual is tested first and sets
 *      hj_MatchedOuter, then otherqual gates ExecProject.  For JOIN_IN
 *      the bucket scan is abandoned after a joinqual match by setting
 *      hj_NeedNewOuter.
 *   6. After the bucket is exhausted hj_NeedNewOuter is set.  For
 *      JOIN_LEFT with no joinqual match, ecxt_innertuple is pointed at
 *      hj_NullInnerTupleSlot and otherqual alone decides whether the
 *      null-extended row is projected.
 *
 * NOTE(review): in step 6 otherqual is passed to ExecQual without the
 * NIL pre-check used in step 5; presumably ExecQual accepts an empty
 * list -- confirm.
 *
 * NOTE(review): the numbering embedded in this listing skips lines
 * (for example 46 -> 53 and 170 -> 175), so several declarations, the
 * braces, the loop header and every return statement are not shown.
 * Behaviour at those points cannot be confirmed from this text.
 */
45 TupleTableSlot * /* return: a tuple or NULL */
46 ExecHashJoin(HashJoinState *node)
53 TupleTableSlot *inntuple;
54 ExprContext *econtext;
56 HashJoinTable hashtable;
58 TupleTableSlot *outerTupleSlot;
63 * get information from HashJoin node
65 estate = node->js.ps.state;
66 joinqual = node->js.joinqual;
67 otherqual = node->js.ps.qual;
68 hashNode = (HashState *) innerPlanState(node);
69 outerNode = outerPlanState(node);
72 * get information from HashJoin state
74 hashtable = node->hj_HashTable;
75 econtext = node->js.ps.ps_ExprContext;
78 * Check to see if we're still projecting out tuples from a previous join
79 * tuple (because there is a function-returning-set in the projection
80 * expressions). If so, try to project another one.
82 if (node->js.ps.ps_TupFromTlist)
84 TupleTableSlot *result;
86 result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
87 if (isDone == ExprMultipleResult)
89 /* Done with that source tuple... */
90 node->js.ps.ps_TupFromTlist = false;
94 * If we're doing an IN join, we want to return at most one row per outer
95 * tuple; so we can stop scanning the inner scan if we matched on the
98 if (node->js.jointype == JOIN_IN && node->hj_MatchedOuter)
99 node->hj_NeedNewOuter = true;
102 * Reset per-tuple memory context to free any expression evaluation
103 * storage allocated in the previous tuple cycle. Note this can't happen
104 * until we're done projecting out tuples from a join tuple.
106 ResetExprContext(econtext);
109 * if this is the first call, build the hash table for inner relation
111 if (hashtable == NULL)
114 * If the outer relation is completely empty, we can quit without
115 * building the hash table. However, for an inner join it is only a
116 * win to check this when the outer relation's startup cost is less
117 * than the projected cost of building the hash table. Otherwise it's
118 * best to build the hash table first and see if the inner relation is
119 * empty. (When it's an outer join, we should always make this check,
120 * since we aren't going to be able to skip the join on the strength
121 * of an empty inner relation anyway.)
123 * The only way to make the check is to try to fetch a tuple from the
124 * outer plan node. If we succeed, we have to stash it away for later
125 * consumption by ExecHashJoinOuterGetTuple.
127 if (outerNode->plan->startup_cost < hashNode->ps.plan->total_cost ||
128 node->js.jointype == JOIN_LEFT)
130 node->hj_FirstOuterTupleSlot = ExecProcNode(outerNode);
131 if (TupIsNull(node->hj_FirstOuterTupleSlot))
135 node->hj_FirstOuterTupleSlot = NULL;
138 * create the hash table
140 hashtable = ExecHashTableCreate((Hash *) hashNode->ps.plan,
141 node->hj_HashOperators);
142 node->hj_HashTable = hashtable;
145 * execute the Hash node, to build the hash table
147 hashNode->hashtable = hashtable;
148 (void) MultiExecProcNode((PlanState *) hashNode);
151 * If the inner relation is completely empty, and we're not doing an
152 * outer join, we can quit without scanning the outer relation.
154 if (hashtable->totalTuples == 0 && node->js.jointype != JOIN_LEFT)
156 ExecHashTableDestroy(hashtable);
157 node->hj_HashTable = NULL;
158 node->hj_FirstOuterTupleSlot = NULL;
163 * need to remember whether nbatch has increased since we began
164 * scanning the outer relation
166 hashtable->nbatch_outstart = hashtable->nbatch;
170 * run the hash join process
175 * If we don't have an outer tuple, get the next one
177 if (node->hj_NeedNewOuter)
179 outerTupleSlot = ExecHashJoinOuterGetTuple(outerNode,
182 if (TupIsNull(outerTupleSlot))
188 node->js.ps.ps_OuterTupleSlot = outerTupleSlot;
189 econtext->ecxt_outertuple = outerTupleSlot;
190 node->hj_NeedNewOuter = false;
191 node->hj_MatchedOuter = false;
194 * now we have an outer tuple, find the corresponding bucket for
195 * this tuple from the hash table
197 node->hj_CurHashValue = hashvalue;
198 ExecHashGetBucketAndBatch(hashtable, hashvalue,
199 &node->hj_CurBucketNo, &batchno);
200 node->hj_CurTuple = NULL;
203 * Now we've got an outer tuple and the corresponding hash bucket,
204 * but this tuple may not belong to the current batch.
206 if (batchno != hashtable->curbatch)
209 * Need to postpone this outer tuple to a later batch. Save it
210 * in the corresponding outer-batch file.
212 Assert(batchno > hashtable->curbatch);
213 ExecHashJoinSaveTuple(ExecFetchSlotTuple(outerTupleSlot),
215 &hashtable->outerBatchFile[batchno]);
216 node->hj_NeedNewOuter = true;
217 continue; /* loop around for a new outer tuple */
222 * OK, scan the selected hash bucket for matches
226 curtuple = ExecScanHashBucket(node, econtext);
227 if (curtuple == NULL)
228 break; /* out of matches */
231 * we've got a match, but still need to test non-hashed quals
233 inntuple = ExecStoreTuple(curtuple,
234 node->hj_HashTupleSlot,
236 false); /* don't pfree this tuple */
237 econtext->ecxt_innertuple = inntuple;
239 /* reset temp memory each time to avoid leaks from qual expr */
240 ResetExprContext(econtext);
243 * if we pass the qual, then save state for next call and have
244 * ExecProject form the projection, store it in the tuple table,
245 * and return the slot.
247 * Only the joinquals determine MatchedOuter status, but all quals
248 * must pass to actually return the tuple.
250 if (joinqual == NIL || ExecQual(joinqual, econtext, false))
252 node->hj_MatchedOuter = true;
254 if (otherqual == NIL || ExecQual(otherqual, econtext, false))
256 TupleTableSlot *result;
258 result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
260 if (isDone != ExprEndResult)
262 node->js.ps.ps_TupFromTlist =
263 (isDone == ExprMultipleResult);
269 * If we didn't return a tuple, may need to set NeedNewOuter
271 if (node->js.jointype == JOIN_IN)
273 node->hj_NeedNewOuter = true;
274 break; /* out of loop over hash bucket */
280 * Now the current outer tuple has run out of matches, so check
281 * whether to emit a dummy outer-join tuple. If not, loop around to
282 * get a new outer tuple.
284 node->hj_NeedNewOuter = true;
286 if (!node->hj_MatchedOuter &&
287 node->js.jointype == JOIN_LEFT)
290 * We are doing an outer join and there were no join matches for
291 * this outer tuple. Generate a fake join tuple with nulls for
292 * the inner tuple, and return it if it passes the non-join quals.
294 econtext->ecxt_innertuple = node->hj_NullInnerTupleSlot;
296 if (ExecQual(otherqual, econtext, false))
299 * qualification was satisfied so we project and return the
300 * slot containing the result tuple using ExecProject().
302 TupleTableSlot *result;
304 result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
306 if (isDone != ExprEndResult)
308 node->js.ps.ps_TupFromTlist =
309 (isDone == ExprMultipleResult);
317 /* ----------------------------------------------------------------
320 * Init routine for HashJoin node.
321 * ----------------------------------------------------------------
/*
 * ExecInitHashJoin: build the HashJoinState for a HashJoin plan node.
 *
 *   node   - the HashJoin plan node; stored in js.ps.plan.
 *   estate - executor state; stored in js.ps.state and handed to the
 *            expression-context, child-node and slot initialisation calls.
 *
 * Visible steps, in order:
 *   - makeNode(HashJoinState), then ExecAssignExprContext.
 *   - ExecInitExpr on the plan targetlist, qual, joinqual and hashclauses;
 *     jointype is copied from the plan node.
 *   - ExecInitNode on the outer plan and on the inner plan (cast to Hash *).
 *   - Slot setup: result slot, an extra slot for hj_OuterTupleSlot, and,
 *     under the jointype switch, a null tuple slot shaped like the inner
 *     child result type for hj_NullInnerTupleSlot.  Unrecognised join
 *     types raise elog(ERROR).
 *   - hj_HashTupleSlot is made an alias of the Hash child
 *     ps_ResultTupleSlot rather than a slot of its own.
 *   - Result type and projection info are assigned; hj_OuterTupleSlot gets
 *     the outer child result type as its descriptor.
 *   - Hash state is cleared (hj_HashTable, hj_FirstOuterTupleSlot,
 *     hj_CurHashValue, hj_CurBucketNo, hj_CurTuple).
 *   - Each hash clause is split: its first argument is appended to the
 *     outer key list, its second to the inner key list, and its opno to
 *     the operator OID list.  The inner key list is also stored in the
 *     Hash child as hashkeys.
 *   - ps_OuterTupleSlot = NULL, ps_TupFromTlist = false,
 *     hj_NeedNewOuter = true, hj_MatchedOuter = false.
 *
 * NOTE(review): the return-type line, local declarations, switch case
 * labels and the return statement are absent from this listing (the
 * embedded numbering skips them).  Presumably the function returns
 * hjstate and the null-inner slot belongs to the JOIN_LEFT case --
 * confirm against the complete file.
 */
324 ExecInitHashJoin(HashJoin *node, EState *estate)
326 HashJoinState *hjstate;
335 * create state structure
337 hjstate = makeNode(HashJoinState);
338 hjstate->js.ps.plan = (Plan *) node;
339 hjstate->js.ps.state = estate;
342 * Miscellaneous initialization
344 * create expression context for node
346 ExecAssignExprContext(estate, &hjstate->js.ps);
349 * initialize child expressions
351 hjstate->js.ps.targetlist = (List *)
352 ExecInitExpr((Expr *) node->join.plan.targetlist,
353 (PlanState *) hjstate);
354 hjstate->js.ps.qual = (List *)
355 ExecInitExpr((Expr *) node->join.plan.qual,
356 (PlanState *) hjstate);
357 hjstate->js.jointype = node->join.jointype;
358 hjstate->js.joinqual = (List *)
359 ExecInitExpr((Expr *) node->join.joinqual,
360 (PlanState *) hjstate);
361 hjstate->hashclauses = (List *)
362 ExecInitExpr((Expr *) node->hashclauses,
363 (PlanState *) hjstate);
366 * initialize child nodes
368 outerNode = outerPlan(node);
369 hashNode = (Hash *) innerPlan(node);
371 outerPlanState(hjstate) = ExecInitNode(outerNode, estate);
372 innerPlanState(hjstate) = ExecInitNode((Plan *) hashNode, estate);
/*
 * Up to three slot-creating calls follow: the result slot, the extra
 * outer slot, and the null-inner slot.
 * NOTE(review): presumably this constant is the amount added by
 * ExecCountSlotsHashJoin -- confirm, that line is not visible here.
 */
374 #define HASHJOIN_NSLOTS 3
377 * tuple table initialization
379 ExecInitResultTupleSlot(estate, &hjstate->js.ps);
380 hjstate->hj_OuterTupleSlot = ExecInitExtraTupleSlot(estate);
382 switch (node->join.jointype)
388 hjstate->hj_NullInnerTupleSlot =
389 ExecInitNullTupleSlot(estate,
390 ExecGetResultType(innerPlanState(hjstate)));
393 elog(ERROR, "unrecognized join type: %d",
394 (int) node->join.jointype);
398 * now for some voodoo. our temporary tuple slot is actually the result
399 * tuple slot of the Hash node (which is our inner plan). we do this
400 * because Hash nodes don't return tuples via ExecProcNode() -- instead
401 * the hash join node uses ExecScanHashBucket() to get at the contents of
402 * the hash table. -cim 6/9/91
405 HashState *hashstate = (HashState *) innerPlanState(hjstate);
406 TupleTableSlot *slot = hashstate->ps.ps_ResultTupleSlot;
408 hjstate->hj_HashTupleSlot = slot;
412 * initialize tuple type and projection info
414 ExecAssignResultTypeFromTL(&hjstate->js.ps);
415 ExecAssignProjectionInfo(&hjstate->js.ps);
417 ExecSetSlotDescriptor(hjstate->hj_OuterTupleSlot,
418 ExecGetResultType(outerPlanState(hjstate)),
422 * initialize hash-specific info
424 hjstate->hj_HashTable = NULL;
425 hjstate->hj_FirstOuterTupleSlot = NULL;
427 hjstate->hj_CurHashValue = 0;
428 hjstate->hj_CurBucketNo = 0;
429 hjstate->hj_CurTuple = NULL;
432 * Deconstruct the hash clauses into outer and inner argument values, so
433 * that we can evaluate those subexpressions separately. Also make a list
434 * of the hash operator OIDs, in preparation for looking up the hash
440 foreach(l, hjstate->hashclauses)
442 FuncExprState *fstate = (FuncExprState *) lfirst(l);
445 Assert(IsA(fstate, FuncExprState));
446 hclause = (OpExpr *) fstate->xprstate.expr;
447 Assert(IsA(hclause, OpExpr));
448 lclauses = lappend(lclauses, linitial(fstate->args));
449 rclauses = lappend(rclauses, lsecond(fstate->args));
450 hoperators = lappend_oid(hoperators, hclause->opno);
452 hjstate->hj_OuterHashKeys = lclauses;
453 hjstate->hj_InnerHashKeys = rclauses;
454 hjstate->hj_HashOperators = hoperators;
455 /* child Hash node needs to evaluate inner hash keys, too */
456 ((HashState *) innerPlanState(hjstate))->hashkeys = rclauses;
458 hjstate->js.ps.ps_OuterTupleSlot = NULL;
459 hjstate->js.ps.ps_TupFromTlist = false;
460 hjstate->hj_NeedNewOuter = true;
461 hjstate->hj_MatchedOuter = false;
/*
 * ExecCountSlotsHashJoin: tuple-slot count for a HashJoin plan subtree.
 *
 * Adds the ExecCountSlotsNode results of the outer and inner subplans.
 * NOTE(review): the expression ends on a trailing plus sign in this
 * listing, so the final addend and the return type are not shown;
 * presumably the addend is HASHJOIN_NSLOTS -- confirm.
 */
467 ExecCountSlotsHashJoin(HashJoin *node)
469 return ExecCountSlotsNode(outerPlan(node)) +
470 ExecCountSlotsNode(innerPlan(node)) +
474 /* ----------------------------------------------------------------
477 * clean up routine for HashJoin node
478 * ----------------------------------------------------------------
/*
 * ExecEndHashJoin: release what the hash join node holds.
 *
 * Order of the visible calls: destroy the hash table if one exists (and
 * clear hj_HashTable and hj_FirstOuterTupleSlot), free the expression
 * context, clear the result, outer and hash tuple slots, then end the
 * outer and inner child nodes.
 *
 * NOTE(review): the return-type line and the braces are not part of
 * this listing.
 */
481 ExecEndHashJoin(HashJoinState *node)
/* Only tear the table down if a scan actually built one. */
486 if (node->hj_HashTable)
488 ExecHashTableDestroy(node->hj_HashTable);
489 node->hj_HashTable = NULL;
490 node->hj_FirstOuterTupleSlot = NULL;
494 * Free the exprcontext
496 ExecFreeExprContext(&node->js.ps);
499 * clean out the tuple table
501 ExecClearTuple(node->js.ps.ps_ResultTupleSlot);
502 ExecClearTuple(node->hj_OuterTupleSlot);
503 ExecClearTuple(node->hj_HashTupleSlot);
/* Children are shut down last, after this node has let go of its slots. */
508 ExecEndNode(outerPlanState(node));
509 ExecEndNode(innerPlanState(node));
513 * ExecHashJoinOuterGetTuple
515 * get the next outer tuple for hashjoin: either by
516 * executing a plan node in the first pass, or from
517 * the temp files for the hashjoin batches.
519 * Returns a null slot if no more outer tuples. On success, the tuple's
520 * hash value is stored at *hashvalue --- this is either originally computed,
521 * or re-read from the temp file.
/*
 * ExecHashJoinOuterGetTuple: fetch the next outer tuple and its hash value.
 *
 *   outerNode - outer child plan state, executed with ExecProcNode during
 *               the first pass.
 *   hjstate   - the hash join state; supplies the hash table, the stashed
 *               first outer tuple, the outer hash keys and the slot used
 *               for tuples read back from batch files.
 *   hashvalue - output; written through as *hashvalue (its parameter
 *               line is not shown in this listing).
 *
 * First pass: a non-null hj_FirstOuterTupleSlot is consumed (and the
 * field reset to NULL) before ExecProcNode is called; for a non-null
 * tuple the hash value is computed with ExecHashGetHashValue over
 * hj_OuterHashKeys.  When the first pass ends, ExecHashJoinNewBatch
 * selects the next batch.
 *
 * Later passes: while curbatch < nbatch, tuples are read from
 * outerBatchFile[curbatch] with ExecHashJoinGetSavedTuple into
 * hj_OuterTupleSlot; an exhausted file advances to the next batch.
 * nbatch is re-read from the hash table on every iteration.
 *
 * NOTE(review): the first-pass condition, the else branch and every
 * return statement are missing from this listing (embedded numbers
 * 526-527, 531-532, 542 and others are skipped).
 */
523 static TupleTableSlot *
524 ExecHashJoinOuterGetTuple(PlanState *outerNode,
525 HashJoinState *hjstate,
528 HashJoinTable hashtable = hjstate->hj_HashTable;
529 int curbatch = hashtable->curbatch;
530 TupleTableSlot *slot;
533 { /* if it is the first pass */
536 * Check to see if first outer tuple was already fetched by
537 * ExecHashJoin() and not used yet.
539 slot = hjstate->hj_FirstOuterTupleSlot;
540 if (!TupIsNull(slot))
541 hjstate->hj_FirstOuterTupleSlot = NULL;
543 slot = ExecProcNode(outerNode);
544 if (!TupIsNull(slot))
547 * We have to compute the tuple's hash value.
549 ExprContext *econtext = hjstate->js.ps.ps_ExprContext;
551 econtext->ecxt_outertuple = slot;
552 *hashvalue = ExecHashGetHashValue(hashtable, econtext,
553 hjstate->hj_OuterHashKeys);
559 * We have just reached the end of the first pass. Try to switch to a
562 curbatch = ExecHashJoinNewBatch(hjstate);
566 * Try to read from a temp file. Loop allows us to advance to new batches
567 * as needed. NOTE: nbatch could increase inside ExecHashJoinNewBatch, so
568 * don't try to optimize this loop.
570 while (curbatch < hashtable->nbatch)
572 slot = ExecHashJoinGetSavedTuple(hjstate,
573 hashtable->outerBatchFile[curbatch],
575 hjstate->hj_OuterTupleSlot);
576 if (!TupIsNull(slot))
578 curbatch = ExecHashJoinNewBatch(hjstate);
581 /* Out of batches... */
586 * ExecHashJoinNewBatch
587 * switch to a new hashjoin batch
589 * Returns the number of the new batch (1..nbatch-1), or nbatch if no more.
590 * We will never return a batch number that has an empty outer batch file.
/*
 * ExecHashJoinNewBatch: advance the hash join to its next batch.
 *
 *   hjstate - hash join state; its hash table is modified in place.
 *
 * Visible behaviour:
 *   - The outer batch file of the batch being left is closed and its
 *     array entry set to NULL.
 *   - Batches with a missing outer or inner file are skipped, closing any
 *     file they still hold, unless one of three conditions forces
 *     processing: an outer file exists and the join is JOIN_LEFT; an
 *     inner file exists and nbatch differs from nbatch_original; an outer
 *     file exists and nbatch differs from nbatch_outstart.
 *   - If curbatch reaches nbatch, curbatch is returned without touching
 *     hashtable->curbatch.
 *   - Otherwise hashtable->curbatch is updated, the table is reset, and
 *     the inner batch file (if any) is rewound and replayed through
 *     ExecHashJoinGetSavedTuple and ExecHashTableInsert, then closed.
 *   - When an outer file exists it is rewound for reading.  A nonzero
 *     result from BufFileSeek is reported as a file-access error.
 *
 * NOTE(review): the static int return-type line, several local
 * declarations, the statements that advance curbatch, the ereport
 * openers and the final return are absent from this listing; the
 * nbatch local is a snapshot taken on entry, while the surrounding
 * comments warn that hashtable->nbatch can grow during the reload.
 */
593 ExecHashJoinNewBatch(HashJoinState *hjstate)
595 HashJoinTable hashtable = hjstate->hj_HashTable;
599 TupleTableSlot *slot;
603 nbatch = hashtable->nbatch;
604 curbatch = hashtable->curbatch;
609 * We no longer need the previous outer batch file; close it right
610 * away to free disk space.
612 if (hashtable->outerBatchFile[curbatch])
613 BufFileClose(hashtable->outerBatchFile[curbatch]);
614 hashtable->outerBatchFile[curbatch] = NULL;
618 * We can always skip over any batches that are completely empty on both
619 * sides. We can sometimes skip over batches that are empty on only one
620 * side, but there are exceptions:
622 * 1. In a LEFT JOIN, we have to process outer batches even if the inner
625 * 2. If we have increased nbatch since the initial estimate, we have to scan
626 * inner batches since they might contain tuples that need to be
627 * reassigned to later inner batches.
629 * 3. Similarly, if we have increased nbatch since starting the outer scan,
630 * we have to rescan outer batches in case they contain tuples that need
634 while (curbatch < nbatch &&
635 (hashtable->outerBatchFile[curbatch] == NULL ||
636 hashtable->innerBatchFile[curbatch] == NULL))
638 if (hashtable->outerBatchFile[curbatch] &&
639 hjstate->js.jointype == JOIN_LEFT)
640 break; /* must process due to rule 1 */
641 if (hashtable->innerBatchFile[curbatch] &&
642 nbatch != hashtable->nbatch_original)
643 break; /* must process due to rule 2 */
644 if (hashtable->outerBatchFile[curbatch] &&
645 nbatch != hashtable->nbatch_outstart)
646 break; /* must process due to rule 3 */
647 /* We can ignore this batch. */
648 /* Release associated temp files right away. */
649 if (hashtable->innerBatchFile[curbatch])
650 BufFileClose(hashtable->innerBatchFile[curbatch]);
651 hashtable->innerBatchFile[curbatch] = NULL;
652 if (hashtable->outerBatchFile[curbatch])
653 BufFileClose(hashtable->outerBatchFile[curbatch]);
654 hashtable->outerBatchFile[curbatch] = NULL;
658 if (curbatch >= nbatch)
659 return curbatch; /* no more batches */
661 hashtable->curbatch = curbatch;
664 * Reload the hash table with the new inner batch (which could be empty)
666 ExecHashTableReset(hashtable);
668 innerFile = hashtable->innerBatchFile[curbatch];
670 if (innerFile != NULL)
672 if (BufFileSeek(innerFile, 0, 0L, SEEK_SET))
674 (errcode_for_file_access(),
675 errmsg("could not rewind hash-join temporary file: %m")));
677 while ((slot = ExecHashJoinGetSavedTuple(hjstate,
680 hjstate->hj_HashTupleSlot)))
683 * NOTE: some tuples may be sent to future batches. Also, it is
684 * possible for hashtable->nbatch to be increased here!
686 ExecHashTableInsert(hashtable,
687 ExecFetchSlotTuple(slot),
692 * after we build the hash table, the inner batch file is no longer
695 BufFileClose(innerFile);
696 hashtable->innerBatchFile[curbatch] = NULL;
700 * If there's no outer batch file, advance to next batch.
702 if (hashtable->outerBatchFile[curbatch] == NULL)
706 * Rewind outer batch file, so that we can start reading it.
708 if (BufFileSeek(hashtable->outerBatchFile[curbatch], 0, 0L, SEEK_SET))
710 (errcode_for_file_access(),
711 errmsg("could not rewind hash-join temporary file: %m")));
717 * ExecHashJoinSaveTuple
718 * save a tuple to a batch file.
720 * The data recorded in the file for each tuple is its hash value,
721 * then an image of its HeapTupleData (with meaningless t_data pointer)
722 * followed by the HeapTupleHeader and tuple data.
724 * Note: it is important always to call this in the regular executor
725 * context, not in a shorter-lived context; else the temp file buffers
726 * will get messed up.
/*
 * ExecHashJoinSaveTuple: append one tuple and its hash value to a batch file.
 *
 *   heapTuple - tuple to write; its HeapTupleData struct and t_len bytes
 *               starting at t_data are written.
 *   hashvalue - hash value stored in front of the tuple.
 *   fileptr   - address of the batch-file pointer (dereferenced on entry;
 *               its parameter line is not shown in this listing).
 *
 * Record layout as written below: a uint32 hash value, then a raw
 * HeapTupleData image, then t_len bytes of tuple data.  Every
 * BufFileWrite result is compared with the requested size, and a
 * mismatch is reported as a file-access error.
 *
 * NOTE(review): the return-type line, the test that decides when
 * BufFileCreateTemp is called, the store back through fileptr and the
 * ereport openers are missing from this listing; presumably the file is
 * created only when *fileptr is NULL -- confirm.
 */
729 ExecHashJoinSaveTuple(HeapTuple heapTuple, uint32 hashvalue,
732 BufFile *file = *fileptr;
737 /* First write to this batch file, so open it. */
738 file = BufFileCreateTemp(false);
/* Field 1 of the record: the hash value. */
742 written = BufFileWrite(file, (void *) &hashvalue, sizeof(uint32));
743 if (written != sizeof(uint32))
745 (errcode_for_file_access(),
746 errmsg("could not write to hash-join temporary file: %m")));
/* Field 2: the HeapTupleData struct itself, pointer members included. */
748 written = BufFileWrite(file, (void *) heapTuple, sizeof(HeapTupleData));
749 if (written != sizeof(HeapTupleData))
751 (errcode_for_file_access(),
752 errmsg("could not write to hash-join temporary file: %m")));
/* Field 3: t_len bytes starting at t_data. */
754 written = BufFileWrite(file, (void *) heapTuple->t_data, heapTuple->t_len);
755 if (written != (size_t) heapTuple->t_len)
757 (errcode_for_file_access(),
758 errmsg("could not write to hash-join temporary file: %m")));
762 * ExecHashJoinGetSavedTuple
763 * read the next tuple from a batch file. Return NULL if no more.
765 * On success, *hashvalue is set to the tuple's hash value, and the tuple
766 * itself is stored in the given slot.
/*
 * ExecHashJoinGetSavedTuple: read the next saved tuple from a batch file.
 *
 *   hjstate   - hash join state (not referenced by the visible statements).
 *   file      - batch file to read from (parameter line not shown here).
 *   hashvalue - output; the stored uint32 hash value is read into it
 *               (parameter line not shown here).
 *   tupleSlot - slot that receives the reconstructed tuple.
 *
 * Reads the three fields in the order ExecHashJoinSaveTuple writes them:
 * hash value, HeapTupleData image, then t_len bytes of tuple data.  The
 * tuple is rebuilt in a single palloc of HEAPTUPLESIZE + t_len bytes:
 * the struct image is copied to the front, t_datamcxt is set to
 * CurrentMemoryContext, and t_data is pointed just past HEAPTUPLESIZE.
 * The result is stored in tupleSlot with InvalidBuffer and a final
 * argument of true; judging by the comment at the other ExecStoreTuple
 * call in this file, that flag presumably lets the slot free the tuple.
 *
 * Returns NULL at end of file, otherwise the slot returned by
 * ExecStoreTuple.  A short read of any field is reported as a
 * file-access error.
 *
 * NOTE(review): the local declarations, the test that detects end of
 * file (embedded line 779) and the ereport openers are missing from
 * this listing.
 */
768 static TupleTableSlot *
769 ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
772 TupleTableSlot *tupleSlot)
778 nread = BufFileRead(file, (void *) hashvalue, sizeof(uint32));
780 return NULL; /* end of file */
781 if (nread != sizeof(uint32))
783 (errcode_for_file_access(),
784 errmsg("could not read from hash-join temporary file: %m")));
785 nread = BufFileRead(file, (void *) &htup, sizeof(HeapTupleData));
786 if (nread != sizeof(HeapTupleData))
788 (errcode_for_file_access(),
789 errmsg("could not read from hash-join temporary file: %m")));
/* One allocation holds both the tuple struct and the tuple body. */
790 heapTuple = palloc(HEAPTUPLESIZE + htup.t_len);
791 memcpy((char *) heapTuple, (char *) &htup, sizeof(HeapTupleData));
/* The pointer members copied from disk are stale; re-point them here. */
792 heapTuple->t_datamcxt = CurrentMemoryContext;
793 heapTuple->t_data = (HeapTupleHeader)
794 ((char *) heapTuple + HEAPTUPLESIZE);
795 nread = BufFileRead(file, (void *) heapTuple->t_data, htup.t_len);
796 if (nread != (size_t) htup.t_len)
798 (errcode_for_file_access(),
799 errmsg("could not read from hash-join temporary file: %m")));
800 return ExecStoreTuple(heapTuple, tupleSlot, InvalidBuffer, true);
/*
 * ExecReScanHashJoin: prepare the hash join node to be scanned again.
 *
 *   node     - hash join state to reset.
 *   exprCtxt - passed through unchanged to ExecReScan on the children.
 *
 * Visible behaviour:
 *   - With no hash table built yet there is nothing to undo.
 *   - A single-batch table (nbatch == 1) whose inner subtree (righttree)
 *     has chgParam == NULL is kept for reuse.  Going by the surviving
 *     comments, any other case destroys the table, clears hj_HashTable and
 *     hj_FirstOuterTupleSlot, and rescans the inner subtree when its
 *     chgParam is NULL.
 *   - Intra-tuple state is always reset: hj_CurHashValue, hj_CurBucketNo,
 *     hj_CurTuple, ps_OuterTupleSlot, ps_TupFromTlist, hj_NeedNewOuter
 *     (true) and hj_MatchedOuter (false).
 *   - The outer subtree (lefttree) is rescanned when its chgParam is NULL.
 *
 * NOTE(review): the return-type line, the early return, the braces and
 * the else keyword are missing from this listing, so the branch
 * structure above is inferred from the comments -- confirm against the
 * complete file.
 */
805 ExecReScanHashJoin(HashJoinState *node, ExprContext *exprCtxt)
808 * If we haven't yet built the hash table then we can just return; nothing
809 * done yet, so nothing to undo.
811 if (node->hj_HashTable == NULL)
815 * In a multi-batch join, we currently have to do rescans the hard way,
816 * primarily because batch temp files may have already been released. But
817 * if it's a single-batch join, and there is no parameter change for the
818 * inner subnode, then we can just re-use the existing hash table without
821 if (node->hj_HashTable->nbatch == 1 &&
822 ((PlanState *) node)->righttree->chgParam == NULL)
824 /* okay to reuse the hash table; needn't rescan inner, either */
828 /* must destroy and rebuild hash table */
829 ExecHashTableDestroy(node->hj_HashTable);
830 node->hj_HashTable = NULL;
831 node->hj_FirstOuterTupleSlot = NULL;
834 * if chgParam of subnode is not null then plan will be re-scanned by
835 * first ExecProcNode.
837 if (((PlanState *) node)->righttree->chgParam == NULL)
838 ExecReScan(((PlanState *) node)->righttree, exprCtxt);
841 /* Always reset intra-tuple state */
842 node->hj_CurHashValue = 0;
843 node->hj_CurBucketNo = 0;
844 node->hj_CurTuple = NULL;
846 node->js.ps.ps_OuterTupleSlot = NULL;
847 node->js.ps.ps_TupFromTlist = false;
848 node->hj_NeedNewOuter = true;
849 node->hj_MatchedOuter = false;
852 * if chgParam of subnode is not null then plan will be re-scanned by
853 * first ExecProcNode.
855 if (((PlanState *) node)->lefttree->chgParam == NULL)
856 ExecReScan(((PlanState *) node)->lefttree, exprCtxt);