OSDN Git Service

Standard pgindent run for 8.1.
[pg-rex/syncrep.git] / src / backend / executor / nodeHashjoin.c
1 /*-------------------------------------------------------------------------
2  *
3  * nodeHashjoin.c
4  *        Routines to handle hash join nodes
5  *
6  * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  *        $PostgreSQL: pgsql/src/backend/executor/nodeHashjoin.c,v 1.74 2005/10/15 02:49:17 momjian Exp $
12  *
13  *-------------------------------------------------------------------------
14  */
15
16 #include "postgres.h"
17
18 #include "executor/executor.h"
19 #include "executor/hashjoin.h"
20 #include "executor/nodeHash.h"
21 #include "executor/nodeHashjoin.h"
22 #include "optimizer/clauses.h"
23 #include "utils/memutils.h"
24
25
/*
 * Forward declarations for helpers that are private to this file (all are
 * declared static).  ExecHashJoinOuterGetTuple fetches the next outer tuple,
 * ExecHashJoinGetSavedTuple reads a tuple back from a batch temp file, and
 * ExecHashJoinNewBatch advances to the next batch and returns its number.
 */
26 static TupleTableSlot *ExecHashJoinOuterGetTuple(PlanState *outerNode,
27                                                   HashJoinState *hjstate,
28                                                   uint32 *hashvalue);
29 static TupleTableSlot *ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
30                                                   BufFile *file,
31                                                   uint32 *hashvalue,
32                                                   TupleTableSlot *tupleSlot);
33 static int      ExecHashJoinNewBatch(HashJoinState *hjstate);
34
35
36 /* ----------------------------------------------------------------
37  *              ExecHashJoin
38  *
39  *              This function implements the Hybrid Hashjoin algorithm.
40  *
41  *              Note: the relation we build the hash table on is the "inner"
42  *                        relation; the other one is the "outer" relation.
43  * ----------------------------------------------------------------
44  */
/*
 * ExecHashJoin
 *
 * Produce the next result tuple of the join, or NULL when the join is done.
 *
 * node: run-time state of the hash join.  The function is resumable: the
 *       fields hj_NeedNewOuter, hj_MatchedOuter, hj_CurHashValue,
 *       hj_CurBucketNo, hj_CurTuple and ps_TupFromTlist record where the
 *       previous call stopped.
 *
 * On the first call (hj_HashTable is still NULL) the hash table is created
 * and loaded by running the inner Hash node.  After that, each call takes
 * the current or next outer tuple, scans its hash bucket for matches, and
 * returns the first projected tuple that passes both joinqual and otherqual.
 * For JOIN_LEFT, an outer tuple with no joinqual match is paired with
 * hj_NullInnerTupleSlot instead.
 */
45 TupleTableSlot *                                /* return: a tuple or NULL */
46 ExecHashJoin(HashJoinState *node)
47 {
48         EState     *estate;
49         PlanState  *outerNode;
50         HashState  *hashNode;
51         List       *joinqual;
52         List       *otherqual;
53         ScanDirection dir;
54         TupleTableSlot *inntuple;
55         ExprContext *econtext;
56         ExprDoneCond isDone;
57         HashJoinTable hashtable;
58         HeapTuple       curtuple;
59         TupleTableSlot *outerTupleSlot;
60         uint32          hashvalue;
61         int                     batchno;
62
63         /*
64          * get information from HashJoin node
65          */
66         estate = node->js.ps.state;
67         joinqual = node->js.joinqual;
68         otherqual = node->js.ps.qual;
69         hashNode = (HashState *) innerPlanState(node);
70         outerNode = outerPlanState(node);
           /* NOTE(review): dir is assigned here but never read again in this function. */
71         dir = estate->es_direction;
72
73         /*
74          * get information from HashJoin state
75          */
76         hashtable = node->hj_HashTable;
77         econtext = node->js.ps.ps_ExprContext;
78
79         /*
80          * Check to see if we're still projecting out tuples from a previous join
81          * tuple (because there is a function-returning-set in the projection
82          * expressions).  If so, try to project another one.
83          */
84         if (node->js.ps.ps_TupFromTlist)
85         {
86                 TupleTableSlot *result;
87
88                 result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
89                 if (isDone == ExprMultipleResult)
90                         return result;
91                 /* Done with that source tuple... */
92                 node->js.ps.ps_TupFromTlist = false;
93         }
94
95         /*
96          * If we're doing an IN join, we want to return at most one row per outer
97          * tuple; so we can stop scanning the inner scan if we matched on the
98          * previous try.
99          */
100         if (node->js.jointype == JOIN_IN && node->hj_MatchedOuter)
101                 node->hj_NeedNewOuter = true;
102
103         /*
104          * Reset per-tuple memory context to free any expression evaluation
105          * storage allocated in the previous tuple cycle.  Note this can't happen
106          * until we're done projecting out tuples from a join tuple.
107          */
108         ResetExprContext(econtext);
109
110         /*
111          * if this is the first call, build the hash table for inner relation
112          */
113         if (hashtable == NULL)
114         {
115                 /*
116                  * If the outer relation is completely empty, we can quit without
117                  * building the hash table.  However, for an inner join it is only a
118                  * win to check this when the outer relation's startup cost is less
119                  * than the projected cost of building the hash table.  Otherwise it's
120                  * best to build the hash table first and see if the inner relation is
121                  * empty.  (When it's an outer join, we should always make this check,
122                  * since we aren't going to be able to skip the join on the strength
123                  * of an empty inner relation anyway.)
124                  *
125                  * The only way to make the check is to try to fetch a tuple from the
126                  * outer plan node.  If we succeed, we have to stash it away for later
127                  * consumption by ExecHashJoinOuterGetTuple.
128                  */
129                 if (outerNode->plan->startup_cost < hashNode->ps.plan->total_cost ||
130                         node->js.jointype == JOIN_LEFT)
131                 {
132                         node->hj_FirstOuterTupleSlot = ExecProcNode(outerNode);
133                         if (TupIsNull(node->hj_FirstOuterTupleSlot))
134                                 return NULL;
135                 }
136                 else
137                         node->hj_FirstOuterTupleSlot = NULL;
138
139                 /*
140                  * create the hash table
141                  */
142                 hashtable = ExecHashTableCreate((Hash *) hashNode->ps.plan,
143                                                                                 node->hj_HashOperators);
144                 node->hj_HashTable = hashtable;
145
146                 /*
147                  * execute the Hash node, to build the hash table
148                  */
149                 hashNode->hashtable = hashtable;
150                 (void) MultiExecProcNode((PlanState *) hashNode);
151
152                 /*
153                  * If the inner relation is completely empty, and we're not doing an
154                  * outer join, we can quit without scanning the outer relation.
155                  */
156                 if (hashtable->totalTuples == 0 && node->js.jointype != JOIN_LEFT)
157                 {
                           /*
                            * The table is destroyed and both state fields are cleared,
                            * so an outer tuple pre-fetched above is simply dropped.
                            */
158                         ExecHashTableDestroy(hashtable);
159                         node->hj_HashTable = NULL;
160                         node->hj_FirstOuterTupleSlot = NULL;
161                         return NULL;
162                 }
163
164                 /*
165                  * need to remember whether nbatch has increased since we began
166                  * scanning the outer relation
167                  */
168                 hashtable->nbatch_outstart = hashtable->nbatch;
169         }
170
171         /*
172          * run the hash join process
173          */
174         for (;;)
175         {
176                 /*
177                  * If we don't have an outer tuple, get the next one
178                  */
179                 if (node->hj_NeedNewOuter)
180                 {
181                         outerTupleSlot = ExecHashJoinOuterGetTuple(outerNode,
182                                                                                                            node,
183                                                                                                            &hashvalue);
184                         if (TupIsNull(outerTupleSlot))
185                         {
186                                 /* end of join */
187                                 return NULL;
188                         }
189
190                         node->js.ps.ps_OuterTupleSlot = outerTupleSlot;
191                         econtext->ecxt_outertuple = outerTupleSlot;
192                         node->hj_NeedNewOuter = false;
193                         node->hj_MatchedOuter = false;
194
195                         /*
196                          * now we have an outer tuple, find the corresponding bucket for
197                          * this tuple from the hash table
198                          */
199                         node->hj_CurHashValue = hashvalue;
200                         ExecHashGetBucketAndBatch(hashtable, hashvalue,
201                                                                           &node->hj_CurBucketNo, &batchno);
                           /* NULL here means the bucket scan has not started for this outer tuple. */
202                         node->hj_CurTuple = NULL;
203
204                         /*
205                          * Now we've got an outer tuple and the corresponding hash bucket,
206                          * but this tuple may not belong to the current batch.
207                          */
208                         if (batchno != hashtable->curbatch)
209                         {
210                                 /*
211                                  * Need to postpone this outer tuple to a later batch. Save it
212                                  * in the corresponding outer-batch file.
213                                  */
214                                 Assert(batchno > hashtable->curbatch);
215                                 ExecHashJoinSaveTuple(ExecFetchSlotTuple(outerTupleSlot),
216                                                                           hashvalue,
217                                                                           &hashtable->outerBatchFile[batchno]);
218                                 node->hj_NeedNewOuter = true;
219                                 continue;               /* loop around for a new outer tuple */
220                         }
221                 }
222
223                 /*
224                  * OK, scan the selected hash bucket for matches
225                  */
226                 for (;;)
227                 {
228                         curtuple = ExecScanHashBucket(node, econtext);
229                         if (curtuple == NULL)
230                                 break;                  /* out of matches */
231
232                         /*
233                          * we've got a match, but still need to test non-hashed quals
234                          */
235                         inntuple = ExecStoreTuple(curtuple,
236                                                                           node->hj_HashTupleSlot,
237                                                                           InvalidBuffer,
238                                                                           false);       /* don't pfree this tuple */
239                         econtext->ecxt_innertuple = inntuple;
240
241                         /* reset temp memory each time to avoid leaks from qual expr */
242                         ResetExprContext(econtext);
243
244                         /*
245                          * if we pass the qual, then save state for next call and have
246                          * ExecProject form the projection, store it in the tuple table,
247                          * and return the slot.
248                          *
249                          * Only the joinquals determine MatchedOuter status, but all quals
250                          * must pass to actually return the tuple.
251                          */
252                         if (joinqual == NIL || ExecQual(joinqual, econtext, false))
253                         {
                                   /*
                                    * Recorded before otherqual is tested: this flag alone
                                    * suppresses the null-extended tuple further down.
                                    */
254                                 node->hj_MatchedOuter = true;
255
256                                 if (otherqual == NIL || ExecQual(otherqual, econtext, false))
257                                 {
258                                         TupleTableSlot *result;
259
260                                         result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
261
262                                         if (isDone != ExprEndResult)
263                                         {
264                                                 node->js.ps.ps_TupFromTlist =
265                                                         (isDone == ExprMultipleResult);
266                                                 return result;
267                                         }
268                                 }
269
270                                 /*
271                                  * If we didn't return a tuple, may need to set NeedNewOuter
272                                  */
273                                 if (node->js.jointype == JOIN_IN)
274                                 {
275                                         node->hj_NeedNewOuter = true;
276                                         break;          /* out of loop over hash bucket */
277                                 }
278                         }
279                 }
280
281                 /*
282                  * Now the current outer tuple has run out of matches, so check
283                  * whether to emit a dummy outer-join tuple. If not, loop around to
284                  * get a new outer tuple.
285                  */
                   /*
                    * Set before the outer-join test so that the next iteration, or the
                    * next call if we return below, starts with a fresh outer tuple.
                    */
286                 node->hj_NeedNewOuter = true;
287
288                 if (!node->hj_MatchedOuter &&
289                         node->js.jointype == JOIN_LEFT)
290                 {
291                         /*
292                          * We are doing an outer join and there were no join matches for
293                          * this outer tuple.  Generate a fake join tuple with nulls for
294                          * the inner tuple, and return it if it passes the non-join quals.
295                          */
296                         econtext->ecxt_innertuple = node->hj_NullInnerTupleSlot;
297
298                         if (ExecQual(otherqual, econtext, false))
299                         {
300                                 /*
301                                  * qualification was satisfied so we project and return the
302                                  * slot containing the result tuple using ExecProject().
303                                  */
304                                 TupleTableSlot *result;
305
306                                 result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
307
308                                 if (isDone != ExprEndResult)
309                                 {
310                                         node->js.ps.ps_TupFromTlist =
311                                                 (isDone == ExprMultipleResult);
312                                         return result;
313                                 }
314                         }
315                 }
316         }
317 }
318
319 /* ----------------------------------------------------------------
320  *              ExecInitHashJoin
321  *
322  *              Init routine for HashJoin node.
323  * ----------------------------------------------------------------
324  */
325 HashJoinState *
326 ExecInitHashJoin(HashJoin *node, EState *estate)
327 {
328         HashJoinState *hjstate;
329         Plan       *outerNode;
330         Hash       *hashNode;
331         List       *lclauses;
332         List       *rclauses;
333         List       *hoperators;
334         ListCell   *l;
335
336         /*
337          * create state structure
338          */
339         hjstate = makeNode(HashJoinState);
340         hjstate->js.ps.plan = (Plan *) node;
341         hjstate->js.ps.state = estate;
342
343         /*
344          * Miscellaneous initialization
345          *
346          * create expression context for node
347          */
348         ExecAssignExprContext(estate, &hjstate->js.ps);
349
350         /*
351          * initialize child expressions
352          */
353         hjstate->js.ps.targetlist = (List *)
354                 ExecInitExpr((Expr *) node->join.plan.targetlist,
355                                          (PlanState *) hjstate);
356         hjstate->js.ps.qual = (List *)
357                 ExecInitExpr((Expr *) node->join.plan.qual,
358                                          (PlanState *) hjstate);
359         hjstate->js.jointype = node->join.jointype;
360         hjstate->js.joinqual = (List *)
361                 ExecInitExpr((Expr *) node->join.joinqual,
362                                          (PlanState *) hjstate);
363         hjstate->hashclauses = (List *)
364                 ExecInitExpr((Expr *) node->hashclauses,
365                                          (PlanState *) hjstate);
366
367         /*
368          * initialize child nodes
369          */
370         outerNode = outerPlan(node);
371         hashNode = (Hash *) innerPlan(node);
372
373         outerPlanState(hjstate) = ExecInitNode(outerNode, estate);
374         innerPlanState(hjstate) = ExecInitNode((Plan *) hashNode, estate);
375
376 #define HASHJOIN_NSLOTS 3
377
378         /*
379          * tuple table initialization
380          */
381         ExecInitResultTupleSlot(estate, &hjstate->js.ps);
382         hjstate->hj_OuterTupleSlot = ExecInitExtraTupleSlot(estate);
383
384         switch (node->join.jointype)
385         {
386                 case JOIN_INNER:
387                 case JOIN_IN:
388                         break;
389                 case JOIN_LEFT:
390                         hjstate->hj_NullInnerTupleSlot =
391                                 ExecInitNullTupleSlot(estate,
392                                                                  ExecGetResultType(innerPlanState(hjstate)));
393                         break;
394                 default:
395                         elog(ERROR, "unrecognized join type: %d",
396                                  (int) node->join.jointype);
397         }
398
399         /*
400          * now for some voodoo.  our temporary tuple slot is actually the result
401          * tuple slot of the Hash node (which is our inner plan).  we do this
402          * because Hash nodes don't return tuples via ExecProcNode() -- instead
403          * the hash join node uses ExecScanHashBucket() to get at the contents of
404          * the hash table.      -cim 6/9/91
405          */
406         {
407                 HashState  *hashstate = (HashState *) innerPlanState(hjstate);
408                 TupleTableSlot *slot = hashstate->ps.ps_ResultTupleSlot;
409
410                 hjstate->hj_HashTupleSlot = slot;
411         }
412
413         /*
414          * initialize tuple type and projection info
415          */
416         ExecAssignResultTypeFromTL(&hjstate->js.ps);
417         ExecAssignProjectionInfo(&hjstate->js.ps);
418
419         ExecSetSlotDescriptor(hjstate->hj_OuterTupleSlot,
420                                                   ExecGetResultType(outerPlanState(hjstate)),
421                                                   false);
422
423         /*
424          * initialize hash-specific info
425          */
426         hjstate->hj_HashTable = NULL;
427         hjstate->hj_FirstOuterTupleSlot = NULL;
428
429         hjstate->hj_CurHashValue = 0;
430         hjstate->hj_CurBucketNo = 0;
431         hjstate->hj_CurTuple = NULL;
432
433         /*
434          * Deconstruct the hash clauses into outer and inner argument values, so
435          * that we can evaluate those subexpressions separately.  Also make a list
436          * of the hash operator OIDs, in preparation for looking up the hash
437          * functions to use.
438          */
439         lclauses = NIL;
440         rclauses = NIL;
441         hoperators = NIL;
442         foreach(l, hjstate->hashclauses)
443         {
444                 FuncExprState *fstate = (FuncExprState *) lfirst(l);
445                 OpExpr     *hclause;
446
447                 Assert(IsA(fstate, FuncExprState));
448                 hclause = (OpExpr *) fstate->xprstate.expr;
449                 Assert(IsA(hclause, OpExpr));
450                 lclauses = lappend(lclauses, linitial(fstate->args));
451                 rclauses = lappend(rclauses, lsecond(fstate->args));
452                 hoperators = lappend_oid(hoperators, hclause->opno);
453         }
454         hjstate->hj_OuterHashKeys = lclauses;
455         hjstate->hj_InnerHashKeys = rclauses;
456         hjstate->hj_HashOperators = hoperators;
457         /* child Hash node needs to evaluate inner hash keys, too */
458         ((HashState *) innerPlanState(hjstate))->hashkeys = rclauses;
459
460         hjstate->js.ps.ps_OuterTupleSlot = NULL;
461         hjstate->js.ps.ps_TupFromTlist = false;
462         hjstate->hj_NeedNewOuter = true;
463         hjstate->hj_MatchedOuter = false;
464
465         return hjstate;
466 }
467
468 int
469 ExecCountSlotsHashJoin(HashJoin *node)
470 {
471         return ExecCountSlotsNode(outerPlan(node)) +
472                 ExecCountSlotsNode(innerPlan(node)) +
473                 HASHJOIN_NSLOTS;
474 }
475
476 /* ----------------------------------------------------------------
477  *              ExecEndHashJoin
478  *
479  *              clean up routine for HashJoin node
480  * ----------------------------------------------------------------
481  */
482 void
483 ExecEndHashJoin(HashJoinState *node)
484 {
485         /*
486          * Free hash table
487          */
488         if (node->hj_HashTable)
489         {
490                 ExecHashTableDestroy(node->hj_HashTable);
491                 node->hj_HashTable = NULL;
492                 node->hj_FirstOuterTupleSlot = NULL;
493         }
494
495         /*
496          * Free the exprcontext
497          */
498         ExecFreeExprContext(&node->js.ps);
499
500         /*
501          * clean out the tuple table
502          */
503         ExecClearTuple(node->js.ps.ps_ResultTupleSlot);
504         ExecClearTuple(node->hj_OuterTupleSlot);
505         ExecClearTuple(node->hj_HashTupleSlot);
506
507         /*
508          * clean up subtrees
509          */
510         ExecEndNode(outerPlanState(node));
511         ExecEndNode(innerPlanState(node));
512 }
513
514 /*
515  * ExecHashJoinOuterGetTuple
516  *
517  *              get the next outer tuple for hashjoin: either by
518  *              executing a plan node in the first pass, or from
519  *              the temp files for the hashjoin batches.
520  *
521  * Returns a null slot if no more outer tuples.  On success, the tuple's
522  * hash value is stored at *hashvalue --- this is either originally computed,
523  * or re-read from the temp file.
524  */
525 static TupleTableSlot *
526 ExecHashJoinOuterGetTuple(PlanState *outerNode,
527                                                   HashJoinState *hjstate,
528                                                   uint32 *hashvalue)
529 {
530         HashJoinTable hashtable = hjstate->hj_HashTable;
531         int                     curbatch = hashtable->curbatch;
532         TupleTableSlot *slot;
533
534         if (curbatch == 0)
535         {                                                       /* if it is the first pass */
536
537                 /*
538                  * Check to see if first outer tuple was already fetched by
539                  * ExecHashJoin() and not used yet.
540                  */
541                 slot = hjstate->hj_FirstOuterTupleSlot;
542                 if (!TupIsNull(slot))
543                         hjstate->hj_FirstOuterTupleSlot = NULL;
544                 else
545                         slot = ExecProcNode(outerNode);
546                 if (!TupIsNull(slot))
547                 {
548                         /*
549                          * We have to compute the tuple's hash value.
550                          */
551                         ExprContext *econtext = hjstate->js.ps.ps_ExprContext;
552
553                         econtext->ecxt_outertuple = slot;
554                         *hashvalue = ExecHashGetHashValue(hashtable, econtext,
555                                                                                           hjstate->hj_OuterHashKeys);
556
557                         return slot;
558                 }
559
560                 /*
561                  * We have just reached the end of the first pass. Try to switch to a
562                  * saved batch.
563                  */
564                 curbatch = ExecHashJoinNewBatch(hjstate);
565         }
566
567         /*
568          * Try to read from a temp file. Loop allows us to advance to new batches
569          * as needed.  NOTE: nbatch could increase inside ExecHashJoinNewBatch, so
570          * don't try to optimize this loop.
571          */
572         while (curbatch < hashtable->nbatch)
573         {
574                 slot = ExecHashJoinGetSavedTuple(hjstate,
575                                                                                  hashtable->outerBatchFile[curbatch],
576                                                                                  hashvalue,
577                                                                                  hjstate->hj_OuterTupleSlot);
578                 if (!TupIsNull(slot))
579                         return slot;
580                 curbatch = ExecHashJoinNewBatch(hjstate);
581         }
582
583         /* Out of batches... */
584         return NULL;
585 }
586
587 /*
588  * ExecHashJoinNewBatch
589  *              switch to a new hashjoin batch
590  *
591  * Returns the number of the new batch (1..nbatch-1), or nbatch if no more.
592  * We will never return a batch number that has an empty outer batch file.
593  */
594 static int
595 ExecHashJoinNewBatch(HashJoinState *hjstate)
596 {
597         HashJoinTable hashtable = hjstate->hj_HashTable;
598         int                     nbatch;
599         int                     curbatch;
600         BufFile    *innerFile;
601         TupleTableSlot *slot;
602         uint32          hashvalue;
603
604 start_over:
605         nbatch = hashtable->nbatch;
606         curbatch = hashtable->curbatch;
607
608         if (curbatch > 0)
609         {
610                 /*
611                  * We no longer need the previous outer batch file; close it right
612                  * away to free disk space.
613                  */
614                 if (hashtable->outerBatchFile[curbatch])
615                         BufFileClose(hashtable->outerBatchFile[curbatch]);
616                 hashtable->outerBatchFile[curbatch] = NULL;
617         }
618
619         /*
620          * We can always skip over any batches that are completely empty on both
621          * sides.  We can sometimes skip over batches that are empty on only one
622          * side, but there are exceptions:
623          *
624          * 1. In a LEFT JOIN, we have to process outer batches even if the inner
625          * batch is empty.
626          *
627          * 2. If we have increased nbatch since the initial estimate, we have to scan
628          * inner batches since they might contain tuples that need to be
629          * reassigned to later inner batches.
630          *
631          * 3. Similarly, if we have increased nbatch since starting the outer scan,
632          * we have to rescan outer batches in case they contain tuples that need
633          * to be reassigned.
634          */
635         curbatch++;
636         while (curbatch < nbatch &&
637                    (hashtable->outerBatchFile[curbatch] == NULL ||
638                         hashtable->innerBatchFile[curbatch] == NULL))
639         {
640                 if (hashtable->outerBatchFile[curbatch] &&
641                         hjstate->js.jointype == JOIN_LEFT)
642                         break;                          /* must process due to rule 1 */
643                 if (hashtable->innerBatchFile[curbatch] &&
644                         nbatch != hashtable->nbatch_original)
645                         break;                          /* must process due to rule 2 */
646                 if (hashtable->outerBatchFile[curbatch] &&
647                         nbatch != hashtable->nbatch_outstart)
648                         break;                          /* must process due to rule 3 */
649                 /* We can ignore this batch. */
650                 /* Release associated temp files right away. */
651                 if (hashtable->innerBatchFile[curbatch])
652                         BufFileClose(hashtable->innerBatchFile[curbatch]);
653                 hashtable->innerBatchFile[curbatch] = NULL;
654                 if (hashtable->outerBatchFile[curbatch])
655                         BufFileClose(hashtable->outerBatchFile[curbatch]);
656                 hashtable->outerBatchFile[curbatch] = NULL;
657                 curbatch++;
658         }
659
660         if (curbatch >= nbatch)
661                 return curbatch;                /* no more batches */
662
663         hashtable->curbatch = curbatch;
664
665         /*
666          * Reload the hash table with the new inner batch (which could be empty)
667          */
668         ExecHashTableReset(hashtable);
669
670         innerFile = hashtable->innerBatchFile[curbatch];
671
672         if (innerFile != NULL)
673         {
674                 if (BufFileSeek(innerFile, 0, 0L, SEEK_SET))
675                         ereport(ERROR,
676                                         (errcode_for_file_access(),
677                                    errmsg("could not rewind hash-join temporary file: %m")));
678
679                 while ((slot = ExecHashJoinGetSavedTuple(hjstate,
680                                                                                                  innerFile,
681                                                                                                  &hashvalue,
682                                                                                                  hjstate->hj_HashTupleSlot)))
683                 {
684                         /*
685                          * NOTE: some tuples may be sent to future batches.  Also, it is
686                          * possible for hashtable->nbatch to be increased here!
687                          */
688                         ExecHashTableInsert(hashtable,
689                                                                 ExecFetchSlotTuple(slot),
690                                                                 hashvalue);
691                 }
692
693                 /*
694                  * after we build the hash table, the inner batch file is no longer
695                  * needed
696                  */
697                 BufFileClose(innerFile);
698                 hashtable->innerBatchFile[curbatch] = NULL;
699         }
700
701         /*
702          * If there's no outer batch file, advance to next batch.
703          */
704         if (hashtable->outerBatchFile[curbatch] == NULL)
705                 goto start_over;
706
707         /*
708          * Rewind outer batch file, so that we can start reading it.
709          */
710         if (BufFileSeek(hashtable->outerBatchFile[curbatch], 0, 0L, SEEK_SET))
711                 ereport(ERROR,
712                                 (errcode_for_file_access(),
713                                  errmsg("could not rewind hash-join temporary file: %m")));
714
715         return curbatch;
716 }
717
718 /*
719  * ExecHashJoinSaveTuple
720  *              save a tuple to a batch file.
721  *
722  * The data recorded in the file for each tuple is its hash value,
723  * then an image of its HeapTupleData (with meaningless t_data pointer)
724  * followed by the HeapTupleHeader and tuple data.
725  *
726  * Note: it is important always to call this in the regular executor
727  * context, not in a shorter-lived context; else the temp file buffers
728  * will get messed up.
729  */
730 void
731 ExecHashJoinSaveTuple(HeapTuple heapTuple, uint32 hashvalue,
732                                           BufFile **fileptr)
733 {
734         BufFile    *file = *fileptr;
735         size_t          written;
736
737         if (file == NULL)
738         {
739                 /* First write to this batch file, so open it. */
740                 file = BufFileCreateTemp(false);
741                 *fileptr = file;
742         }
743
744         written = BufFileWrite(file, (void *) &hashvalue, sizeof(uint32));
745         if (written != sizeof(uint32))
746                 ereport(ERROR,
747                                 (errcode_for_file_access(),
748                                  errmsg("could not write to hash-join temporary file: %m")));
749
750         written = BufFileWrite(file, (void *) heapTuple, sizeof(HeapTupleData));
751         if (written != sizeof(HeapTupleData))
752                 ereport(ERROR,
753                                 (errcode_for_file_access(),
754                                  errmsg("could not write to hash-join temporary file: %m")));
755
756         written = BufFileWrite(file, (void *) heapTuple->t_data, heapTuple->t_len);
757         if (written != (size_t) heapTuple->t_len)
758                 ereport(ERROR,
759                                 (errcode_for_file_access(),
760                                  errmsg("could not write to hash-join temporary file: %m")));
761 }
762
763 /*
764  * ExecHashJoinGetSavedTuple
765  *              read the next tuple from a batch file.  Return NULL if no more.
766  *
767  * On success, *hashvalue is set to the tuple's hash value, and the tuple
768  * itself is stored in the given slot.
769  */
770 static TupleTableSlot *
771 ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
772                                                   BufFile *file,
773                                                   uint32 *hashvalue,
774                                                   TupleTableSlot *tupleSlot)
775 {
776         HeapTupleData htup;
777         size_t          nread;
778         HeapTuple       heapTuple;
779
780         nread = BufFileRead(file, (void *) hashvalue, sizeof(uint32));
781         if (nread == 0)
782                 return NULL;                    /* end of file */
783         if (nread != sizeof(uint32))
784                 ereport(ERROR,
785                                 (errcode_for_file_access(),
786                                  errmsg("could not read from hash-join temporary file: %m")));
787         nread = BufFileRead(file, (void *) &htup, sizeof(HeapTupleData));
788         if (nread != sizeof(HeapTupleData))
789                 ereport(ERROR,
790                                 (errcode_for_file_access(),
791                                  errmsg("could not read from hash-join temporary file: %m")));
792         heapTuple = palloc(HEAPTUPLESIZE + htup.t_len);
793         memcpy((char *) heapTuple, (char *) &htup, sizeof(HeapTupleData));
794         heapTuple->t_datamcxt = CurrentMemoryContext;
795         heapTuple->t_data = (HeapTupleHeader)
796                 ((char *) heapTuple + HEAPTUPLESIZE);
797         nread = BufFileRead(file, (void *) heapTuple->t_data, htup.t_len);
798         if (nread != (size_t) htup.t_len)
799                 ereport(ERROR,
800                                 (errcode_for_file_access(),
801                                  errmsg("could not read from hash-join temporary file: %m")));
802         return ExecStoreTuple(heapTuple, tupleSlot, InvalidBuffer, true);
803 }
804
805
806 void
807 ExecReScanHashJoin(HashJoinState *node, ExprContext *exprCtxt)
808 {
809         /*
810          * If we haven't yet built the hash table then we can just return; nothing
811          * done yet, so nothing to undo.
812          */
813         if (node->hj_HashTable == NULL)
814                 return;
815
816         /*
817          * In a multi-batch join, we currently have to do rescans the hard way,
818          * primarily because batch temp files may have already been released. But
819          * if it's a single-batch join, and there is no parameter change for the
820          * inner subnode, then we can just re-use the existing hash table without
821          * rebuilding it.
822          */
823         if (node->hj_HashTable->nbatch == 1 &&
824                 ((PlanState *) node)->righttree->chgParam == NULL)
825         {
826                 /* okay to reuse the hash table; needn't rescan inner, either */
827         }
828         else
829         {
830                 /* must destroy and rebuild hash table */
831                 ExecHashTableDestroy(node->hj_HashTable);
832                 node->hj_HashTable = NULL;
833                 node->hj_FirstOuterTupleSlot = NULL;
834
835                 /*
836                  * if chgParam of subnode is not null then plan will be re-scanned by
837                  * first ExecProcNode.
838                  */
839                 if (((PlanState *) node)->righttree->chgParam == NULL)
840                         ExecReScan(((PlanState *) node)->righttree, exprCtxt);
841         }
842
843         /* Always reset intra-tuple state */
844         node->hj_CurHashValue = 0;
845         node->hj_CurBucketNo = 0;
846         node->hj_CurTuple = NULL;
847
848         node->js.ps.ps_OuterTupleSlot = NULL;
849         node->js.ps.ps_TupFromTlist = false;
850         node->hj_NeedNewOuter = true;
851         node->hj_MatchedOuter = false;
852
853         /*
854          * if chgParam of subnode is not null then plan will be re-scanned by
855          * first ExecProcNode.
856          */
857         if (((PlanState *) node)->lefttree->chgParam == NULL)
858                 ExecReScan(((PlanState *) node)->lefttree, exprCtxt);
859 }