1 /*-------------------------------------------------------------------------
4 * Routines to handle hash join nodes
6 * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
11 * $PostgreSQL: pgsql/src/backend/executor/nodeHashjoin.c,v 1.74 2005/10/15 02:49:17 momjian Exp $
13 *-------------------------------------------------------------------------
18 #include "executor/executor.h"
19 #include "executor/hashjoin.h"
20 #include "executor/nodeHash.h"
21 #include "executor/nodeHashjoin.h"
22 #include "optimizer/clauses.h"
23 #include "utils/memutils.h"
26 static TupleTableSlot *ExecHashJoinOuterGetTuple(PlanState *outerNode,
27 HashJoinState *hjstate,
29 static TupleTableSlot *ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
32 TupleTableSlot *tupleSlot);
33 static int ExecHashJoinNewBatch(HashJoinState *hjstate);
36 /* ----------------------------------------------------------------
39 * This function implements the Hybrid Hashjoin algorithm.
41 * Note: the relation we build hash table on is the "inner"
42 * the other one is "outer".
43 * ----------------------------------------------------------------
45 TupleTableSlot * /* return: a tuple or NULL */
46 ExecHashJoin(HashJoinState *node)
54 TupleTableSlot *inntuple;
55 ExprContext *econtext;
57 HashJoinTable hashtable;
59 TupleTableSlot *outerTupleSlot;
64 * get information from HashJoin node
66 estate = node->js.ps.state;
67 joinqual = node->js.joinqual;
68 otherqual = node->js.ps.qual;
69 hashNode = (HashState *) innerPlanState(node);
70 outerNode = outerPlanState(node);
71 dir = estate->es_direction;
74 * get information from HashJoin state
76 hashtable = node->hj_HashTable;
77 econtext = node->js.ps.ps_ExprContext;
80 * Check to see if we're still projecting out tuples from a previous join
81 * tuple (because there is a function-returning-set in the projection
82 * expressions). If so, try to project another one.
84 if (node->js.ps.ps_TupFromTlist)
86 TupleTableSlot *result;
88 result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
89 if (isDone == ExprMultipleResult)
91 /* Done with that source tuple... */
92 node->js.ps.ps_TupFromTlist = false;
96 * If we're doing an IN join, we want to return at most one row per outer
97 * tuple; so we can stop scanning the inner scan if we matched on the
100 if (node->js.jointype == JOIN_IN && node->hj_MatchedOuter)
101 node->hj_NeedNewOuter = true;
104 * Reset per-tuple memory context to free any expression evaluation
105 * storage allocated in the previous tuple cycle. Note this can't happen
106 * until we're done projecting out tuples from a join tuple.
108 ResetExprContext(econtext);
111 * if this is the first call, build the hash table for inner relation
113 if (hashtable == NULL)
116 * If the outer relation is completely empty, we can quit without
117 * building the hash table. However, for an inner join it is only a
118 * win to check this when the outer relation's startup cost is less
119 * than the projected cost of building the hash table. Otherwise it's
120 * best to build the hash table first and see if the inner relation is
121 * empty. (When it's an outer join, we should always make this check,
122 * since we aren't going to be able to skip the join on the strength
123 * of an empty inner relation anyway.)
125 * The only way to make the check is to try to fetch a tuple from the
126 * outer plan node. If we succeed, we have to stash it away for later
127 * consumption by ExecHashJoinOuterGetTuple.
129 if (outerNode->plan->startup_cost < hashNode->ps.plan->total_cost ||
130 node->js.jointype == JOIN_LEFT)
132 node->hj_FirstOuterTupleSlot = ExecProcNode(outerNode);
133 if (TupIsNull(node->hj_FirstOuterTupleSlot))
137 node->hj_FirstOuterTupleSlot = NULL;
140 * create the hash table
142 hashtable = ExecHashTableCreate((Hash *) hashNode->ps.plan,
143 node->hj_HashOperators);
144 node->hj_HashTable = hashtable;
147 * execute the Hash node, to build the hash table
149 hashNode->hashtable = hashtable;
150 (void) MultiExecProcNode((PlanState *) hashNode);
153 * If the inner relation is completely empty, and we're not doing an
154 * outer join, we can quit without scanning the outer relation.
156 if (hashtable->totalTuples == 0 && node->js.jointype != JOIN_LEFT)
158 ExecHashTableDestroy(hashtable);
159 node->hj_HashTable = NULL;
160 node->hj_FirstOuterTupleSlot = NULL;
165 * need to remember whether nbatch has increased since we began
166 * scanning the outer relation
168 hashtable->nbatch_outstart = hashtable->nbatch;
172 * run the hash join process
177 * If we don't have an outer tuple, get the next one
179 if (node->hj_NeedNewOuter)
181 outerTupleSlot = ExecHashJoinOuterGetTuple(outerNode,
184 if (TupIsNull(outerTupleSlot))
190 node->js.ps.ps_OuterTupleSlot = outerTupleSlot;
191 econtext->ecxt_outertuple = outerTupleSlot;
192 node->hj_NeedNewOuter = false;
193 node->hj_MatchedOuter = false;
196 * now we have an outer tuple, find the corresponding bucket for
197 * this tuple from the hash table
199 node->hj_CurHashValue = hashvalue;
200 ExecHashGetBucketAndBatch(hashtable, hashvalue,
201 &node->hj_CurBucketNo, &batchno);
202 node->hj_CurTuple = NULL;
205 * Now we've got an outer tuple and the corresponding hash bucket,
206 * but this tuple may not belong to the current batch.
208 if (batchno != hashtable->curbatch)
211 * Need to postpone this outer tuple to a later batch. Save it
212 * in the corresponding outer-batch file.
214 Assert(batchno > hashtable->curbatch);
215 ExecHashJoinSaveTuple(ExecFetchSlotTuple(outerTupleSlot),
217 &hashtable->outerBatchFile[batchno]);
218 node->hj_NeedNewOuter = true;
219 continue; /* loop around for a new outer tuple */
224 * OK, scan the selected hash bucket for matches
228 curtuple = ExecScanHashBucket(node, econtext);
229 if (curtuple == NULL)
230 break; /* out of matches */
233 * we've got a match, but still need to test non-hashed quals
235 inntuple = ExecStoreTuple(curtuple,
236 node->hj_HashTupleSlot,
238 false); /* don't pfree this tuple */
239 econtext->ecxt_innertuple = inntuple;
241 /* reset temp memory each time to avoid leaks from qual expr */
242 ResetExprContext(econtext);
245 * if we pass the qual, then save state for next call and have
246 * ExecProject form the projection, store it in the tuple table,
247 * and return the slot.
249 * Only the joinquals determine MatchedOuter status, but all quals
250 * must pass to actually return the tuple.
252 if (joinqual == NIL || ExecQual(joinqual, econtext, false))
254 node->hj_MatchedOuter = true;
256 if (otherqual == NIL || ExecQual(otherqual, econtext, false))
258 TupleTableSlot *result;
260 result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
262 if (isDone != ExprEndResult)
264 node->js.ps.ps_TupFromTlist =
265 (isDone == ExprMultipleResult);
271 * If we didn't return a tuple, may need to set NeedNewOuter
273 if (node->js.jointype == JOIN_IN)
275 node->hj_NeedNewOuter = true;
276 break; /* out of loop over hash bucket */
282 * Now the current outer tuple has run out of matches, so check
283 * whether to emit a dummy outer-join tuple. If not, loop around to
284 * get a new outer tuple.
286 node->hj_NeedNewOuter = true;
288 if (!node->hj_MatchedOuter &&
289 node->js.jointype == JOIN_LEFT)
292 * We are doing an outer join and there were no join matches for
293 * this outer tuple. Generate a fake join tuple with nulls for
294 * the inner tuple, and return it if it passes the non-join quals.
296 econtext->ecxt_innertuple = node->hj_NullInnerTupleSlot;
298 if (ExecQual(otherqual, econtext, false))
301 * qualification was satisfied so we project and return the
302 * slot containing the result tuple using ExecProject().
304 TupleTableSlot *result;
306 result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
308 if (isDone != ExprEndResult)
310 node->js.ps.ps_TupFromTlist =
311 (isDone == ExprMultipleResult);
319 /* ----------------------------------------------------------------
322 * Init routine for HashJoin node.
323 * ----------------------------------------------------------------
326 ExecInitHashJoin(HashJoin *node, EState *estate)
328 HashJoinState *hjstate;
337 * create state structure
339 hjstate = makeNode(HashJoinState);
340 hjstate->js.ps.plan = (Plan *) node;
341 hjstate->js.ps.state = estate;
344 * Miscellaneous initialization
346 * create expression context for node
348 ExecAssignExprContext(estate, &hjstate->js.ps);
351 * initialize child expressions
353 hjstate->js.ps.targetlist = (List *)
354 ExecInitExpr((Expr *) node->join.plan.targetlist,
355 (PlanState *) hjstate);
356 hjstate->js.ps.qual = (List *)
357 ExecInitExpr((Expr *) node->join.plan.qual,
358 (PlanState *) hjstate);
359 hjstate->js.jointype = node->join.jointype;
360 hjstate->js.joinqual = (List *)
361 ExecInitExpr((Expr *) node->join.joinqual,
362 (PlanState *) hjstate);
363 hjstate->hashclauses = (List *)
364 ExecInitExpr((Expr *) node->hashclauses,
365 (PlanState *) hjstate);
368 * initialize child nodes
370 outerNode = outerPlan(node);
371 hashNode = (Hash *) innerPlan(node);
373 outerPlanState(hjstate) = ExecInitNode(outerNode, estate);
374 innerPlanState(hjstate) = ExecInitNode((Plan *) hashNode, estate);
376 #define HASHJOIN_NSLOTS 3
379 * tuple table initialization
381 ExecInitResultTupleSlot(estate, &hjstate->js.ps);
382 hjstate->hj_OuterTupleSlot = ExecInitExtraTupleSlot(estate);
384 switch (node->join.jointype)
390 hjstate->hj_NullInnerTupleSlot =
391 ExecInitNullTupleSlot(estate,
392 ExecGetResultType(innerPlanState(hjstate)));
395 elog(ERROR, "unrecognized join type: %d",
396 (int) node->join.jointype);
400 * now for some voodoo. our temporary tuple slot is actually the result
401 * tuple slot of the Hash node (which is our inner plan). we do this
402 * because Hash nodes don't return tuples via ExecProcNode() -- instead
403 * the hash join node uses ExecScanHashBucket() to get at the contents of
404 * the hash table. -cim 6/9/91
407 HashState *hashstate = (HashState *) innerPlanState(hjstate);
408 TupleTableSlot *slot = hashstate->ps.ps_ResultTupleSlot;
410 hjstate->hj_HashTupleSlot = slot;
414 * initialize tuple type and projection info
416 ExecAssignResultTypeFromTL(&hjstate->js.ps);
417 ExecAssignProjectionInfo(&hjstate->js.ps);
419 ExecSetSlotDescriptor(hjstate->hj_OuterTupleSlot,
420 ExecGetResultType(outerPlanState(hjstate)),
424 * initialize hash-specific info
426 hjstate->hj_HashTable = NULL;
427 hjstate->hj_FirstOuterTupleSlot = NULL;
429 hjstate->hj_CurHashValue = 0;
430 hjstate->hj_CurBucketNo = 0;
431 hjstate->hj_CurTuple = NULL;
434 * Deconstruct the hash clauses into outer and inner argument values, so
435 * that we can evaluate those subexpressions separately. Also make a list
436 * of the hash operator OIDs, in preparation for looking up the hash
442 foreach(l, hjstate->hashclauses)
444 FuncExprState *fstate = (FuncExprState *) lfirst(l);
447 Assert(IsA(fstate, FuncExprState));
448 hclause = (OpExpr *) fstate->xprstate.expr;
449 Assert(IsA(hclause, OpExpr));
450 lclauses = lappend(lclauses, linitial(fstate->args));
451 rclauses = lappend(rclauses, lsecond(fstate->args));
452 hoperators = lappend_oid(hoperators, hclause->opno);
454 hjstate->hj_OuterHashKeys = lclauses;
455 hjstate->hj_InnerHashKeys = rclauses;
456 hjstate->hj_HashOperators = hoperators;
457 /* child Hash node needs to evaluate inner hash keys, too */
458 ((HashState *) innerPlanState(hjstate))->hashkeys = rclauses;
460 hjstate->js.ps.ps_OuterTupleSlot = NULL;
461 hjstate->js.ps.ps_TupFromTlist = false;
462 hjstate->hj_NeedNewOuter = true;
463 hjstate->hj_MatchedOuter = false;
469 ExecCountSlotsHashJoin(HashJoin *node)
471 return ExecCountSlotsNode(outerPlan(node)) +
472 ExecCountSlotsNode(innerPlan(node)) +
476 /* ----------------------------------------------------------------
479 * clean up routine for HashJoin node
480 * ----------------------------------------------------------------
483 ExecEndHashJoin(HashJoinState *node)
488 if (node->hj_HashTable)
490 ExecHashTableDestroy(node->hj_HashTable);
491 node->hj_HashTable = NULL;
492 node->hj_FirstOuterTupleSlot = NULL;
496 * Free the exprcontext
498 ExecFreeExprContext(&node->js.ps);
501 * clean out the tuple table
503 ExecClearTuple(node->js.ps.ps_ResultTupleSlot);
504 ExecClearTuple(node->hj_OuterTupleSlot);
505 ExecClearTuple(node->hj_HashTupleSlot);
510 ExecEndNode(outerPlanState(node));
511 ExecEndNode(innerPlanState(node));
515 * ExecHashJoinOuterGetTuple
517 * get the next outer tuple for hashjoin: either by
518 * executing a plan node in the first pass, or from
519 * the temp files for the hashjoin batches.
521 * Returns a null slot if no more outer tuples. On success, the tuple's
522 * hash value is stored at *hashvalue --- this is either originally computed,
523 * or re-read from the temp file.
525 static TupleTableSlot *
526 ExecHashJoinOuterGetTuple(PlanState *outerNode,
527 HashJoinState *hjstate,
530 HashJoinTable hashtable = hjstate->hj_HashTable;
531 int curbatch = hashtable->curbatch;
532 TupleTableSlot *slot;
535 { /* if it is the first pass */
538 * Check to see if first outer tuple was already fetched by
539 * ExecHashJoin() and not used yet.
541 slot = hjstate->hj_FirstOuterTupleSlot;
542 if (!TupIsNull(slot))
543 hjstate->hj_FirstOuterTupleSlot = NULL;
545 slot = ExecProcNode(outerNode);
546 if (!TupIsNull(slot))
549 * We have to compute the tuple's hash value.
551 ExprContext *econtext = hjstate->js.ps.ps_ExprContext;
553 econtext->ecxt_outertuple = slot;
554 *hashvalue = ExecHashGetHashValue(hashtable, econtext,
555 hjstate->hj_OuterHashKeys);
561 * We have just reached the end of the first pass. Try to switch to a
564 curbatch = ExecHashJoinNewBatch(hjstate);
568 * Try to read from a temp file. Loop allows us to advance to new batches
569 * as needed. NOTE: nbatch could increase inside ExecHashJoinNewBatch, so
570 * don't try to optimize this loop.
572 while (curbatch < hashtable->nbatch)
574 slot = ExecHashJoinGetSavedTuple(hjstate,
575 hashtable->outerBatchFile[curbatch],
577 hjstate->hj_OuterTupleSlot);
578 if (!TupIsNull(slot))
580 curbatch = ExecHashJoinNewBatch(hjstate);
583 /* Out of batches... */
588 * ExecHashJoinNewBatch
589 * switch to a new hashjoin batch
591 * Returns the number of the new batch (1..nbatch-1), or nbatch if no more.
592 * We will never return a batch number that has an empty outer batch file.
595 ExecHashJoinNewBatch(HashJoinState *hjstate)
597 HashJoinTable hashtable = hjstate->hj_HashTable;
601 TupleTableSlot *slot;
605 nbatch = hashtable->nbatch;
606 curbatch = hashtable->curbatch;
611 * We no longer need the previous outer batch file; close it right
612 * away to free disk space.
614 if (hashtable->outerBatchFile[curbatch])
615 BufFileClose(hashtable->outerBatchFile[curbatch]);
616 hashtable->outerBatchFile[curbatch] = NULL;
620 * We can always skip over any batches that are completely empty on both
621 * sides. We can sometimes skip over batches that are empty on only one
622 * side, but there are exceptions:
624 * 1. In a LEFT JOIN, we have to process outer batches even if the inner
627 * 2. If we have increased nbatch since the initial estimate, we have to scan
628 * inner batches since they might contain tuples that need to be
629 * reassigned to later inner batches.
631 * 3. Similarly, if we have increased nbatch since starting the outer scan,
632 * we have to rescan outer batches in case they contain tuples that need
636 while (curbatch < nbatch &&
637 (hashtable->outerBatchFile[curbatch] == NULL ||
638 hashtable->innerBatchFile[curbatch] == NULL))
640 if (hashtable->outerBatchFile[curbatch] &&
641 hjstate->js.jointype == JOIN_LEFT)
642 break; /* must process due to rule 1 */
643 if (hashtable->innerBatchFile[curbatch] &&
644 nbatch != hashtable->nbatch_original)
645 break; /* must process due to rule 2 */
646 if (hashtable->outerBatchFile[curbatch] &&
647 nbatch != hashtable->nbatch_outstart)
648 break; /* must process due to rule 3 */
649 /* We can ignore this batch. */
650 /* Release associated temp files right away. */
651 if (hashtable->innerBatchFile[curbatch])
652 BufFileClose(hashtable->innerBatchFile[curbatch]);
653 hashtable->innerBatchFile[curbatch] = NULL;
654 if (hashtable->outerBatchFile[curbatch])
655 BufFileClose(hashtable->outerBatchFile[curbatch]);
656 hashtable->outerBatchFile[curbatch] = NULL;
660 if (curbatch >= nbatch)
661 return curbatch; /* no more batches */
663 hashtable->curbatch = curbatch;
666 * Reload the hash table with the new inner batch (which could be empty)
668 ExecHashTableReset(hashtable);
670 innerFile = hashtable->innerBatchFile[curbatch];
672 if (innerFile != NULL)
674 if (BufFileSeek(innerFile, 0, 0L, SEEK_SET))
676 (errcode_for_file_access(),
677 errmsg("could not rewind hash-join temporary file: %m")));
679 while ((slot = ExecHashJoinGetSavedTuple(hjstate,
682 hjstate->hj_HashTupleSlot)))
685 * NOTE: some tuples may be sent to future batches. Also, it is
686 * possible for hashtable->nbatch to be increased here!
688 ExecHashTableInsert(hashtable,
689 ExecFetchSlotTuple(slot),
694 * after we build the hash table, the inner batch file is no longer
697 BufFileClose(innerFile);
698 hashtable->innerBatchFile[curbatch] = NULL;
702 * If there's no outer batch file, advance to next batch.
704 if (hashtable->outerBatchFile[curbatch] == NULL)
708 * Rewind outer batch file, so that we can start reading it.
710 if (BufFileSeek(hashtable->outerBatchFile[curbatch], 0, 0L, SEEK_SET))
712 (errcode_for_file_access(),
713 errmsg("could not rewind hash-join temporary file: %m")));
719 * ExecHashJoinSaveTuple
720 * save a tuple to a batch file.
722 * The data recorded in the file for each tuple is its hash value,
723 * then an image of its HeapTupleData (with meaningless t_data pointer)
724 * followed by the HeapTupleHeader and tuple data.
726 * Note: it is important always to call this in the regular executor
727 * context, not in a shorter-lived context; else the temp file buffers
728 * will get messed up.
731 ExecHashJoinSaveTuple(HeapTuple heapTuple, uint32 hashvalue,
734 BufFile *file = *fileptr;
739 /* First write to this batch file, so open it. */
740 file = BufFileCreateTemp(false);
744 written = BufFileWrite(file, (void *) &hashvalue, sizeof(uint32));
745 if (written != sizeof(uint32))
747 (errcode_for_file_access(),
748 errmsg("could not write to hash-join temporary file: %m")));
750 written = BufFileWrite(file, (void *) heapTuple, sizeof(HeapTupleData));
751 if (written != sizeof(HeapTupleData))
753 (errcode_for_file_access(),
754 errmsg("could not write to hash-join temporary file: %m")));
756 written = BufFileWrite(file, (void *) heapTuple->t_data, heapTuple->t_len);
757 if (written != (size_t) heapTuple->t_len)
759 (errcode_for_file_access(),
760 errmsg("could not write to hash-join temporary file: %m")));
764 * ExecHashJoinGetSavedTuple
765 * read the next tuple from a batch file. Return NULL if no more.
767 * On success, *hashvalue is set to the tuple's hash value, and the tuple
768 * itself is stored in the given slot.
770 static TupleTableSlot *
771 ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
774 TupleTableSlot *tupleSlot)
780 nread = BufFileRead(file, (void *) hashvalue, sizeof(uint32));
782 return NULL; /* end of file */
783 if (nread != sizeof(uint32))
785 (errcode_for_file_access(),
786 errmsg("could not read from hash-join temporary file: %m")));
787 nread = BufFileRead(file, (void *) &htup, sizeof(HeapTupleData));
788 if (nread != sizeof(HeapTupleData))
790 (errcode_for_file_access(),
791 errmsg("could not read from hash-join temporary file: %m")));
792 heapTuple = palloc(HEAPTUPLESIZE + htup.t_len);
793 memcpy((char *) heapTuple, (char *) &htup, sizeof(HeapTupleData));
794 heapTuple->t_datamcxt = CurrentMemoryContext;
795 heapTuple->t_data = (HeapTupleHeader)
796 ((char *) heapTuple + HEAPTUPLESIZE);
797 nread = BufFileRead(file, (void *) heapTuple->t_data, htup.t_len);
798 if (nread != (size_t) htup.t_len)
800 (errcode_for_file_access(),
801 errmsg("could not read from hash-join temporary file: %m")));
802 return ExecStoreTuple(heapTuple, tupleSlot, InvalidBuffer, true);
807 ExecReScanHashJoin(HashJoinState *node, ExprContext *exprCtxt)
810 * If we haven't yet built the hash table then we can just return; nothing
811 * done yet, so nothing to undo.
813 if (node->hj_HashTable == NULL)
817 * In a multi-batch join, we currently have to do rescans the hard way,
818 * primarily because batch temp files may have already been released. But
819 * if it's a single-batch join, and there is no parameter change for the
820 * inner subnode, then we can just re-use the existing hash table without
823 if (node->hj_HashTable->nbatch == 1 &&
824 ((PlanState *) node)->righttree->chgParam == NULL)
826 /* okay to reuse the hash table; needn't rescan inner, either */
830 /* must destroy and rebuild hash table */
831 ExecHashTableDestroy(node->hj_HashTable);
832 node->hj_HashTable = NULL;
833 node->hj_FirstOuterTupleSlot = NULL;
836 * if chgParam of subnode is not null then plan will be re-scanned by
837 * first ExecProcNode.
839 if (((PlanState *) node)->righttree->chgParam == NULL)
840 ExecReScan(((PlanState *) node)->righttree, exprCtxt);
843 /* Always reset intra-tuple state */
844 node->hj_CurHashValue = 0;
845 node->hj_CurBucketNo = 0;
846 node->hj_CurTuple = NULL;
848 node->js.ps.ps_OuterTupleSlot = NULL;
849 node->js.ps.ps_TupFromTlist = false;
850 node->hj_NeedNewOuter = true;
851 node->hj_MatchedOuter = false;
854 * if chgParam of subnode is not null then plan will be re-scanned by
855 * first ExecProcNode.
857 if (((PlanState *) node)->lefttree->chgParam == NULL)
858 ExecReScan(((PlanState *) node)->lefttree, exprCtxt);