src/backend/executor/nodeHashjoin.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * nodeHashjoin.c
   4  *        Routines to handle hash join nodes
   5  *
   6  * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
   7  * Portions Copyright (c) 1994, Regents of the University of California
   8  *
   9  *
  10  * IDENTIFICATION
  11  *        $PostgreSQL: pgsql/src/backend/executor/nodeHashjoin.c,v 1.74 2005/10/15 02:49:17 momjian Exp $
  12  *
  13  *-------------------------------------------------------------------------
  14  */
  15
  16 #include "postgres.h"
  17
  18 #include "executor/executor.h"
  19 #include "executor/hashjoin.h"
  20 #include "executor/nodeHash.h"
  21 #include "executor/nodeHashjoin.h"
  22 #include "optimizer/clauses.h"
  23 #include "utils/memutils.h"
  24
  25
   26 static TupleTableSlot *ExecHashJoinOuterGetTuple(PlanState *outerNode,
   27                                                   HashJoinState *hjstate,
   28                                                   uint32 *hashvalue);   /* next outer tuple, with its hash value */
   29 static TupleTableSlot *ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
   30                                                   BufFile *file,
   31                                                   uint32 *hashvalue,
   32                                                   TupleTableSlot *tupleSlot);   /* read one saved tuple back from a batch temp file */
   33 static int      ExecHashJoinNewBatch(HashJoinState *hjstate);   /* advance to the next batch; returns its number */
  34
  35
  36 /* ----------------------------------------------------------------
  37  *              ExecHashJoin
  38  *
  39  *              This function implements the Hybrid Hashjoin algorithm.
  40  *
   41  *              Note: the relation we build the hash table on is the "inner";
   42  *                        the other one is the "outer".
  43  * ----------------------------------------------------------------
  44  */
   45 TupleTableSlot *                                /* return: a tuple or NULL */
   46 ExecHashJoin(HashJoinState *node)
   47 {
              /*
               * Produces the next join result on each call, or NULL once the
               * outer input (including any outer tuples saved to batch files)
               * is exhausted.  Progress is carried between calls in the node:
               * hj_NeedNewOuter says whether a fresh outer tuple must be fetched,
               * hj_MatchedOuter whether the current outer tuple has passed the
               * join quals at least once, and ps_TupFromTlist whether the last
               * projection still has more set results to deliver.
               */
   48         EState     *estate;
   49         PlanState  *outerNode;
   50         HashState  *hashNode;
   51         List       *joinqual;
   52         List       *otherqual;
   53         ScanDirection dir;
   54         TupleTableSlot *inntuple;
   55         ExprContext *econtext;
   56         ExprDoneCond isDone;
   57         HashJoinTable hashtable;
   58         HeapTuple       curtuple;
   59         TupleTableSlot *outerTupleSlot;
   60         uint32          hashvalue;
   61         int                     batchno;
   62
   63         /*
   64          * get information from HashJoin node
   65          */
   66         estate = node->js.ps.state;
   67         joinqual = node->js.joinqual;
   68         otherqual = node->js.ps.qual;
   69         hashNode = (HashState *) innerPlanState(node);
   70         outerNode = outerPlanState(node);
              /* NOTE(review): dir is fetched here but not referenced again in this function */
   71         dir = estate->es_direction;
   72
   73         /*
   74          * get information from HashJoin state
   75          */
   76         hashtable = node->hj_HashTable;
   77         econtext = node->js.ps.ps_ExprContext;
   78
   79         /*
   80          * Check to see if we're still projecting out tuples from a previous join
   81          * tuple (because there is a function-returning-set in the projection
   82          * expressions).  If so, try to project another one.
   83          */
   84         if (node->js.ps.ps_TupFromTlist)
   85         {
   86                 TupleTableSlot *result;
   87
   88                 result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
   89                 if (isDone == ExprMultipleResult)
   90                         return result;
   91                 /* Done with that source tuple... */
   92                 node->js.ps.ps_TupFromTlist = false;
   93         }
   94
   95         /*
   96          * If we're doing an IN join, we want to return at most one row per outer
   97          * tuple; so we can stop scanning the inner scan if we matched on the
   98          * previous try.
   99          */
  100         if (node->js.jointype == JOIN_IN && node->hj_MatchedOuter)
  101                 node->hj_NeedNewOuter = true;
  102
  103         /*
  104          * Reset per-tuple memory context to free any expression evaluation
  105          * storage allocated in the previous tuple cycle.  Note this can't happen
  106          * until we're done projecting out tuples from a join tuple.
  107          */
  108         ResetExprContext(econtext);
  109
  110         /*
  111          * if this is the first call, build the hash table for inner relation
  112          */
  113         if (hashtable == NULL)
  114         {
  115                 /*
  116                  * If the outer relation is completely empty, we can quit without
  117                  * building the hash table.  However, for an inner join it is only a
  118                  * win to check this when the outer relation's startup cost is less
  119                  * than the projected cost of building the hash table.  Otherwise it's
  120                  * best to build the hash table first and see if the inner relation is
  121                  * empty.  (When it's an outer join, we should always make this check,
  122                  * since we aren't going to be able to skip the join on the strength
  123                  * of an empty inner relation anyway.)
  124                  *
  125                  * The only way to make the check is to try to fetch a tuple from the
  126                  * outer plan node.  If we succeed, we have to stash it away for later
  127                  * consumption by ExecHashJoinOuterGetTuple.
  128                  */
  129                 if (outerNode->plan->startup_cost < hashNode->ps.plan->total_cost ||
  130                         node->js.jointype == JOIN_LEFT)
  131                 {
  132                         node->hj_FirstOuterTupleSlot = ExecProcNode(outerNode);
  133                         if (TupIsNull(node->hj_FirstOuterTupleSlot))
  134                                 return NULL;
  135                 }
  136                 else
  137                         node->hj_FirstOuterTupleSlot = NULL;
  138
  139                 /*
  140                  * create the hash table
  141                  */
  142                 hashtable = ExecHashTableCreate((Hash *) hashNode->ps.plan,
  143                                                                                 node->hj_HashOperators);
  144                 node->hj_HashTable = hashtable;
  145
  146                 /*
  147                  * execute the Hash node, to build the hash table
  148                  */
  149                 hashNode->hashtable = hashtable;
  150                 (void) MultiExecProcNode((PlanState *) hashNode);
  151
  152                 /*
  153                  * If the inner relation is completely empty, and we're not doing an
  154                  * outer join, we can quit without scanning the outer relation.
  155                  */
  156                 if (hashtable->totalTuples == 0 && node->js.jointype != JOIN_LEFT)
  157                 {
                              /*
                               * hj_HashTable goes back to NULL here, so a later call
                               * would take the first-call path above again; any outer
                               * tuple stashed by the emptiness check is forgotten too.
                               */
  158                         ExecHashTableDestroy(hashtable);
  159                         node->hj_HashTable = NULL;
  160                         node->hj_FirstOuterTupleSlot = NULL;
  161                         return NULL;
  162                 }
  163
  164                 /*
  165                  * need to remember whether nbatch has increased since we began
  166                  * scanning the outer relation
  167                  */
  168                 hashtable->nbatch_outstart = hashtable->nbatch;
  169         }
  170
  171         /*
  172          * run the hash join process
  173          */
  174         for (;;)
  175         {
  176                 /*
  177                  * If we don't have an outer tuple, get the next one
  178                  */
  179                 if (node->hj_NeedNewOuter)
  180                 {
  181                         outerTupleSlot = ExecHashJoinOuterGetTuple(outerNode,
  182                                                                                                            node,
  183                                                                                                            &hashvalue);
  184                         if (TupIsNull(outerTupleSlot))
  185                         {
  186                                 /* end of join */
  187                                 return NULL;
  188                         }
  189
  190                         node->js.ps.ps_OuterTupleSlot = outerTupleSlot;
  191                         econtext->ecxt_outertuple = outerTupleSlot;
  192                         node->hj_NeedNewOuter = false;
  193                         node->hj_MatchedOuter = false;
  194
  195                         /*
  196                          * now we have an outer tuple, find the corresponding bucket for
  197                          * this tuple from the hash table
  198                          */
  199                         node->hj_CurHashValue = hashvalue;
  200                         ExecHashGetBucketAndBatch(hashtable, hashvalue,
  201                                                                           &node->hj_CurBucketNo, &batchno);
                              /* NULL restarts the bucket scan for the new outer tuple */
  202                         node->hj_CurTuple = NULL;
  203
  204                         /*
  205                          * Now we've got an outer tuple and the corresponding hash bucket,
  206                          * but this tuple may not belong to the current batch.
  207                          */
  208                         if (batchno != hashtable->curbatch)
  209                         {
  210                                 /*
  211                                  * Need to postpone this outer tuple to a later batch. Save it
  212                                  * in the corresponding outer-batch file.
  213                                  */
  214                                 Assert(batchno > hashtable->curbatch);
  215                                 ExecHashJoinSaveTuple(ExecFetchSlotTuple(outerTupleSlot),
  216                                                                           hashvalue,
  217                                                                           &hashtable->outerBatchFile[batchno]);
  218                                 node->hj_NeedNewOuter = true;
  219                                 continue;               /* loop around for a new outer tuple */
  220                         }
  221                 }
  222
  223                 /*
  224                  * OK, scan the selected hash bucket for matches
  225                  */
  226                 for (;;)
  227                 {
  228                         curtuple = ExecScanHashBucket(node, econtext);
  229                         if (curtuple == NULL)
  230                                 break;                  /* out of matches */
  231
  232                         /*
  233                          * we've got a match, but still need to test non-hashed quals
  234                          */
  235                         inntuple = ExecStoreTuple(curtuple,
  236                                                                           node->hj_HashTupleSlot,
  237                                                                           InvalidBuffer,
  238                                                                           false);       /* don't pfree this tuple */
  239                         econtext->ecxt_innertuple = inntuple;
  240
  241                         /* reset temp memory each time to avoid leaks from qual expr */
  242                         ResetExprContext(econtext);
  243
  244                         /*
  245                          * if we pass the qual, then save state for next call and have
  246                          * ExecProject form the projection, store it in the tuple table,
  247                          * and return the slot.
  248                          *
  249                          * Only the joinquals determine MatchedOuter status, but all quals
  250                          * must pass to actually return the tuple.
  251                          */
  252                         if (joinqual == NIL || ExecQual(joinqual, econtext, false))
  253                         {
  254                                 node->hj_MatchedOuter = true;
  255
  256                                 if (otherqual == NIL || ExecQual(otherqual, econtext, false))
  257                                 {
  258                                         TupleTableSlot *result;
  259
  260                                         result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
  261
                                              /*
                                               * ExprEndResult means the projection produced no
                                               * row; fall through and keep scanning the bucket.
                                               */
  262                                         if (isDone != ExprEndResult)
  263                                         {
  264                                                 node->js.ps.ps_TupFromTlist =
  265                                                         (isDone == ExprMultipleResult);
  266                                                 return result;
  267                                         }
  268                                 }
  269
  270                                 /*
  271                                  * If we didn't return a tuple, may need to set NeedNewOuter
  272                                  */
                                      /*
                                       * An IN join wants at most one row per outer tuple, so
                                       * once the join quals have matched we stop scanning this
                                       * bucket even though no row was returned for this match.
                                       */
  273                                 if (node->js.jointype == JOIN_IN)
  274                                 {
  275                                         node->hj_NeedNewOuter = true;
  276                                         break;          /* out of loop over hash bucket */
  277                                 }
  278                         }
  279                 }
  280
  281                 /*
  282                  * Now the current outer tuple has run out of matches, so check
  283                  * whether to emit a dummy outer-join tuple. If not, loop around to
  284                  * get a new outer tuple.
  285                  */
                      /*
                       * Set unconditionally: whether we return a null-extended row
                       * below or loop again, the next step starts with a new outer
                       * tuple.
                       */
  286                 node->hj_NeedNewOuter = true;
  287
  288                 if (!node->hj_MatchedOuter &&
  289                         node->js.jointype == JOIN_LEFT)
  290                 {
  291                         /*
  292                          * We are doing an outer join and there were no join matches for
  293                          * this outer tuple.  Generate a fake join tuple with nulls for
  294                          * the inner tuple, and return it if it passes the non-join quals.
  295                          */
  296                         econtext->ecxt_innertuple = node->hj_NullInnerTupleSlot;
  297
                              /*
                               * NOTE(review): unlike the matched path above, otherqual is
                               * handed to ExecQual without a NIL test; presumably ExecQual
                               * accepts an empty list -- confirm.
                               */
  298                         if (ExecQual(otherqual, econtext, false))
  299                         {
  300                                 /*
  301                                  * qualification was satisfied so we project and return the
  302                                  * slot containing the result tuple using ExecProject().
  303                                  */
  304                                 TupleTableSlot *result;
  305
  306                                 result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
  307
  308                                 if (isDone != ExprEndResult)
  309                                 {
  310                                         node->js.ps.ps_TupFromTlist =
  311                                                 (isDone == ExprMultipleResult);
  312                                         return result;
  313                                 }
  314                         }
  315                 }
  316         }
  317 }
 318
 319 /* ----------------------------------------------------------------
 320  *              ExecInitHashJoin
 321  *
 322  *              Init routine for HashJoin node.
 323  * ----------------------------------------------------------------
 324  */
 325 HashJoinState *
 326 ExecInitHashJoin(HashJoin *node, EState *estate)
 327 {
 328         HashJoinState *hjstate;
 329         Plan       *outerNode;
 330         Hash       *hashNode;
 331         List       *lclauses;
 332         List       *rclauses;
 333         List       *hoperators;
 334         ListCell   *l;
 335
 336         /*
 337          * create state structure
 338          */
 339         hjstate = makeNode(HashJoinState);
 340         hjstate->js.ps.plan = (Plan *) node;
 341         hjstate->js.ps.state = estate;
 342
 343         /*
 344          * Miscellaneous initialization
 345          *
 346          * create expression context for node
 347          */
 348         ExecAssignExprContext(estate, &hjstate->js.ps);
 349
 350         /*
 351          * initialize child expressions
 352          */
 353         hjstate->js.ps.targetlist = (List *)
 354                 ExecInitExpr((Expr *) node->join.plan.targetlist,
 355                                          (PlanState *) hjstate);
 356         hjstate->js.ps.qual = (List *)
 357                 ExecInitExpr((Expr *) node->join.plan.qual,
 358                                          (PlanState *) hjstate);
 359         hjstate->js.jointype = node->join.jointype;
 360         hjstate->js.joinqual = (List *)
 361                 ExecInitExpr((Expr *) node->join.joinqual,
 362                                          (PlanState *) hjstate);
 363         hjstate->hashclauses = (List *)
 364                 ExecInitExpr((Expr *) node->hashclauses,
 365                                          (PlanState *) hjstate);
 366
 367         /*
 368          * initialize child nodes
 369          */
 370         outerNode = outerPlan(node);
 371         hashNode = (Hash *) innerPlan(node);
 372
 373         outerPlanState(hjstate) = ExecInitNode(outerNode, estate);
 374         innerPlanState(hjstate) = ExecInitNode((Plan *) hashNode, estate);
 375
 376 #define HASHJOIN_NSLOTS 3
 377
 378         /*
 379          * tuple table initialization
 380          */
 381         ExecInitResultTupleSlot(estate, &hjstate->js.ps);
 382         hjstate->hj_OuterTupleSlot = ExecInitExtraTupleSlot(estate);
 383
 384         switch (node->join.jointype)
 385         {
 386                 case JOIN_INNER:
 387                 case JOIN_IN:
 388                         break;
 389                 case JOIN_LEFT:
 390                         hjstate->hj_NullInnerTupleSlot =
 391                                 ExecInitNullTupleSlot(estate,
 392                                                                  ExecGetResultType(innerPlanState(hjstate)));
 393                         break;
 394                 default:
 395                         elog(ERROR, "unrecognized join type: %d",
 396                                  (int) node->join.jointype);
 397         }
 398
 399         /*
 400          * now for some voodoo.  our temporary tuple slot is actually the result
 401          * tuple slot of the Hash node (which is our inner plan).  we do this
 402          * because Hash nodes don't return tuples via ExecProcNode() -- instead
 403          * the hash join node uses ExecScanHashBucket() to get at the contents of
 404          * the hash table.      -cim 6/9/91
 405          */
 406         {
 407                 HashState  *hashstate = (HashState *) innerPlanState(hjstate);
 408                 TupleTableSlot *slot = hashstate->ps.ps_ResultTupleSlot;
 409
 410                 hjstate->hj_HashTupleSlot = slot;
 411         }
 412
 413         /*
 414          * initialize tuple type and projection info
 415          */
 416         ExecAssignResultTypeFromTL(&hjstate->js.ps);
 417         ExecAssignProjectionInfo(&hjstate->js.ps);
 418
 419         ExecSetSlotDescriptor(hjstate->hj_OuterTupleSlot,
 420                                                   ExecGetResultType(outerPlanState(hjstate)),
 421                                                   false);
 422
 423         /*
 424          * initialize hash-specific info
 425          */
 426         hjstate->hj_HashTable = NULL;
 427         hjstate->hj_FirstOuterTupleSlot = NULL;
 428
 429         hjstate->hj_CurHashValue = 0;
 430         hjstate->hj_CurBucketNo = 0;
 431         hjstate->hj_CurTuple = NULL;
 432
 433         /*
 434          * Deconstruct the hash clauses into outer and inner argument values, so
 435          * that we can evaluate those subexpressions separately.  Also make a list
 436          * of the hash operator OIDs, in preparation for looking up the hash
 437          * functions to use.
 438          */
 439         lclauses = NIL;
 440         rclauses = NIL;
 441         hoperators = NIL;
 442         foreach(l, hjstate->hashclauses)
 443         {
 444                 FuncExprState *fstate = (FuncExprState *) lfirst(l);
 445                 OpExpr     *hclause;
 446
 447                 Assert(IsA(fstate, FuncExprState));
 448                 hclause = (OpExpr *) fstate->xprstate.expr;
 449                 Assert(IsA(hclause, OpExpr));
 450                 lclauses = lappend(lclauses, linitial(fstate->args));
 451                 rclauses = lappend(rclauses, lsecond(fstate->args));
 452                 hoperators = lappend_oid(hoperators, hclause->opno);
 453         }
 454         hjstate->hj_OuterHashKeys = lclauses;
 455         hjstate->hj_InnerHashKeys = rclauses;
 456         hjstate->hj_HashOperators = hoperators;
 457         /* child Hash node needs to evaluate inner hash keys, too */
 458         ((HashState *) innerPlanState(hjstate))->hashkeys = rclauses;
 459
 460         hjstate->js.ps.ps_OuterTupleSlot = NULL;
 461         hjstate->js.ps.ps_TupFromTlist = false;
 462         hjstate->hj_NeedNewOuter = true;
 463         hjstate->hj_MatchedOuter = false;
 464
 465         return hjstate;
 466 }
 467
 468 int
 469 ExecCountSlotsHashJoin(HashJoin *node)
 470 {
 471         return ExecCountSlotsNode(outerPlan(node)) +
 472                 ExecCountSlotsNode(innerPlan(node)) +
 473                 HASHJOIN_NSLOTS;
 474 }
 475
 476 /* ----------------------------------------------------------------
 477  *              ExecEndHashJoin
 478  *
 479  *              clean up routine for HashJoin node
 480  * ----------------------------------------------------------------
 481  */
 482 void
 483 ExecEndHashJoin(HashJoinState *node)
 484 {
 485         /*
 486          * Free hash table
 487          */
 488         if (node->hj_HashTable)
 489         {
 490                 ExecHashTableDestroy(node->hj_HashTable);
 491                 node->hj_HashTable = NULL;
 492                 node->hj_FirstOuterTupleSlot = NULL;
 493         }
 494
 495         /*
 496          * Free the exprcontext
 497          */
 498         ExecFreeExprContext(&node->js.ps);
 499
 500         /*
 501          * clean out the tuple table
 502          */
 503         ExecClearTuple(node->js.ps.ps_ResultTupleSlot);
 504         ExecClearTuple(node->hj_OuterTupleSlot);
 505         ExecClearTuple(node->hj_HashTupleSlot);
 506
 507         /*
 508          * clean up subtrees
 509          */
 510         ExecEndNode(outerPlanState(node));
 511         ExecEndNode(innerPlanState(node));
 512 }
 513
 514 /*
 515  * ExecHashJoinOuterGetTuple
 516  *
 517  *              get the next outer tuple for hashjoin: either by
 518  *              executing a plan node in the first pass, or from
 519  *              the temp files for the hashjoin batches.
 520  *
 521  * Returns a null slot if no more outer tuples.  On success, the tuple's
 522  * hash value is stored at *hashvalue --- this is either originally computed,
 523  * or re-read from the temp file.
 524  */
 525 static TupleTableSlot *
 526 ExecHashJoinOuterGetTuple(PlanState *outerNode,
 527                                                   HashJoinState *hjstate,
 528                                                   uint32 *hashvalue)
 529 {
 530         HashJoinTable hashtable = hjstate->hj_HashTable;
 531         int                     curbatch = hashtable->curbatch;
 532         TupleTableSlot *slot;
 533
 534         if (curbatch == 0)
 535         {                                                       /* if it is the first pass */
 536
 537                 /*
 538                  * Check to see if first outer tuple was already fetched by
 539                  * ExecHashJoin() and not used yet.
 540                  */
 541                 slot = hjstate->hj_FirstOuterTupleSlot;
 542                 if (!TupIsNull(slot))
 543                         hjstate->hj_FirstOuterTupleSlot = NULL;
 544                 else
 545                         slot = ExecProcNode(outerNode);
 546                 if (!TupIsNull(slot))
 547                 {
 548                         /*
 549                          * We have to compute the tuple's hash value.
 550                          */
 551                         ExprContext *econtext = hjstate->js.ps.ps_ExprContext;
 552
 553                         econtext->ecxt_outertuple = slot;
 554                         *hashvalue = ExecHashGetHashValue(hashtable, econtext,
 555                                                                                           hjstate->hj_OuterHashKeys);
 556
 557                         return slot;
 558                 }
 559
 560                 /*
 561                  * We have just reached the end of the first pass. Try to switch to a
 562                  * saved batch.
 563                  */
 564                 curbatch = ExecHashJoinNewBatch(hjstate);
 565         }
 566
 567         /*
 568          * Try to read from a temp file. Loop allows us to advance to new batches
 569          * as needed.  NOTE: nbatch could increase inside ExecHashJoinNewBatch, so
 570          * don't try to optimize this loop.
 571          */
 572         while (curbatch < hashtable->nbatch)
 573         {
 574                 slot = ExecHashJoinGetSavedTuple(hjstate,
 575                                                                                  hashtable->outerBatchFile[curbatch],
 576                                                                                  hashvalue,
 577                                                                                  hjstate->hj_OuterTupleSlot);
 578                 if (!TupIsNull(slot))
 579                         return slot;
 580                 curbatch = ExecHashJoinNewBatch(hjstate);
 581         }
 582
 583         /* Out of batches... */
 584         return NULL;
 585 }
 586
 587 /*
 588  * ExecHashJoinNewBatch
 589  *              switch to a new hashjoin batch
 590  *
 591  * Returns the number of the new batch (1..nbatch-1), or nbatch if no more.
 592  * We will never return a batch number that has an empty outer batch file.
 593  */
 594 static int
 595 ExecHashJoinNewBatch(HashJoinState *hjstate)
 596 {
 597         HashJoinTable hashtable = hjstate->hj_HashTable;
 598         int                     nbatch;
 599         int                     curbatch;
 600         BufFile    *innerFile;
 601         TupleTableSlot *slot;
 602         uint32          hashvalue;
 603
 604 start_over:
 605         nbatch = hashtable->nbatch;
 606         curbatch = hashtable->curbatch;
 607
 608         if (curbatch > 0)
 609         {
 610                 /*
 611                  * We no longer need the previous outer batch file; close it right
 612                  * away to free disk space.
 613                  */
 614                 if (hashtable->outerBatchFile[curbatch])
 615                         BufFileClose(hashtable->outerBatchFile[curbatch]);
 616                 hashtable->outerBatchFile[curbatch] = NULL;
 617         }
 618
 619         /*
 620          * We can always skip over any batches that are completely empty on both
 621          * sides.  We can sometimes skip over batches that are empty on only one
 622          * side, but there are exceptions:
 623          *
 624          * 1. In a LEFT JOIN, we have to process outer batches even if the inner
 625          * batch is empty.
 626          *
 627          * 2. If we have increased nbatch since the initial estimate, we have to scan
 628          * inner batches since they might contain tuples that need to be
 629          * reassigned to later inner batches.
 630          *
 631          * 3. Similarly, if we have increased nbatch since starting the outer scan,
 632          * we have to rescan outer batches in case they contain tuples that need
 633          * to be reassigned.
 634          */
 635         curbatch++;
 636         while (curbatch < nbatch &&
 637                    (hashtable->outerBatchFile[curbatch] == NULL ||
 638                         hashtable->innerBatchFile[curbatch] == NULL))
 639         {
 640                 if (hashtable->outerBatchFile[curbatch] &&
 641                         hjstate->js.jointype == JOIN_LEFT)
 642                         break;                          /* must process due to rule 1 */
 643                 if (hashtable->innerBatchFile[curbatch] &&
 644                         nbatch != hashtable->nbatch_original)
 645                         break;                          /* must process due to rule 2 */
 646                 if (hashtable->outerBatchFile[curbatch] &&
 647                         nbatch != hashtable->nbatch_outstart)
 648                         break;                          /* must process due to rule 3 */
 649                 /* We can ignore this batch. */
 650                 /* Release associated temp files right away. */
 651                 if (hashtable->innerBatchFile[curbatch])
 652                         BufFileClose(hashtable->innerBatchFile[curbatch]);
 653                 hashtable->innerBatchFile[curbatch] = NULL;
 654                 if (hashtable->outerBatchFile[curbatch])
 655                         BufFileClose(hashtable->outerBatchFile[curbatch]);
 656                 hashtable->outerBatchFile[curbatch] = NULL;
 657                 curbatch++;
 658         }
 659
 660         if (curbatch >= nbatch)
 661                 return curbatch;                /* no more batches */
 662
 663         hashtable->curbatch = curbatch;
 664
 665         /*
 666          * Reload the hash table with the new inner batch (which could be empty)
 667          */
 668         ExecHashTableReset(hashtable);
 669
 670         innerFile = hashtable->innerBatchFile[curbatch];
 671
 672         if (innerFile != NULL)
 673         {
 674                 if (BufFileSeek(innerFile, 0, 0L, SEEK_SET))
 675                         ereport(ERROR,
 676                                         (errcode_for_file_access(),
 677                                    errmsg("could not rewind hash-join temporary file: %m")));
 678
 679                 while ((slot = ExecHashJoinGetSavedTuple(hjstate,
 680                                                                                                  innerFile,
 681                                                                                                  &hashvalue,
 682                                                                                                  hjstate->hj_HashTupleSlot)))
 683                 {
 684                         /*
 685                          * NOTE: some tuples may be sent to future batches.  Also, it is
 686                          * possible for hashtable->nbatch to be increased here!
 687                          */
 688                         ExecHashTableInsert(hashtable,
 689                                                                 ExecFetchSlotTuple(slot),
 690                                                                 hashvalue);
 691                 }
 692
 693                 /*
 694                  * after we build the hash table, the inner batch file is no longer
 695                  * needed
 696                  */
 697                 BufFileClose(innerFile);
 698                 hashtable->innerBatchFile[curbatch] = NULL;
 699         }
 700
 701         /*
 702          * If there's no outer batch file, advance to next batch.
 703          */
 704         if (hashtable->outerBatchFile[curbatch] == NULL)
 705                 goto start_over;
 706
 707         /*
 708          * Rewind outer batch file, so that we can start reading it.
 709          */
 710         if (BufFileSeek(hashtable->outerBatchFile[curbatch], 0, 0L, SEEK_SET))
 711                 ereport(ERROR,
 712                                 (errcode_for_file_access(),
 713                                  errmsg("could not rewind hash-join temporary file: %m")));
 714
 715         return curbatch;
 716 }
 717
 718 /*
 719  * ExecHashJoinSaveTuple
 720  *              save a tuple to a batch file.
 721  *
 722  * The data recorded in the file for each tuple is its hash value,
 723  * then an image of its HeapTupleData (with meaningless t_data pointer)
 724  * followed by the HeapTupleHeader and tuple data.
 725  *
 726  * Note: it is important always to call this in the regular executor
 727  * context, not in a shorter-lived context; else the temp file buffers
 728  * will get messed up.
 729  */
 730 void
 731 ExecHashJoinSaveTuple(HeapTuple heapTuple, uint32 hashvalue,
 732                                           BufFile **fileptr)
 733 {
 734         BufFile    *file = *fileptr;
 735         size_t          written;
 736
 737         if (file == NULL)
 738         {
 739                 /* First write to this batch file, so open it. */
 740                 file = BufFileCreateTemp(false);
 741                 *fileptr = file;
 742         }
 743
 744         written = BufFileWrite(file, (void *) &hashvalue, sizeof(uint32));
 745         if (written != sizeof(uint32))
 746                 ereport(ERROR,
 747                                 (errcode_for_file_access(),
 748                                  errmsg("could not write to hash-join temporary file: %m")));
 749
 750         written = BufFileWrite(file, (void *) heapTuple, sizeof(HeapTupleData));
 751         if (written != sizeof(HeapTupleData))
 752                 ereport(ERROR,
 753                                 (errcode_for_file_access(),
 754                                  errmsg("could not write to hash-join temporary file: %m")));
 755
 756         written = BufFileWrite(file, (void *) heapTuple->t_data, heapTuple->t_len);
 757         if (written != (size_t) heapTuple->t_len)
 758                 ereport(ERROR,
 759                                 (errcode_for_file_access(),
 760                                  errmsg("could not write to hash-join temporary file: %m")));
 761 }
 762
 763 /*
 764  * ExecHashJoinGetSavedTuple
 765  *              read the next tuple from a batch file.  Return NULL if no more.
 766  *
 767  * On success, *hashvalue is set to the tuple's hash value, and the tuple
 768  * itself is stored in the given slot.
 769  */
 770 static TupleTableSlot *
 771 ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
 772                                                   BufFile *file,
 773                                                   uint32 *hashvalue,
 774                                                   TupleTableSlot *tupleSlot)
 775 {
 776         HeapTupleData htup;
 777         size_t          nread;
 778         HeapTuple       heapTuple;
 779
 780         nread = BufFileRead(file, (void *) hashvalue, sizeof(uint32));
 781         if (nread == 0)
 782                 return NULL;                    /* end of file */
 783         if (nread != sizeof(uint32))
 784                 ereport(ERROR,
 785                                 (errcode_for_file_access(),
 786                                  errmsg("could not read from hash-join temporary file: %m")));
 787         nread = BufFileRead(file, (void *) &htup, sizeof(HeapTupleData));
 788         if (nread != sizeof(HeapTupleData))
 789                 ereport(ERROR,
 790                                 (errcode_for_file_access(),
 791                                  errmsg("could not read from hash-join temporary file: %m")));
 792         heapTuple = palloc(HEAPTUPLESIZE + htup.t_len);
 793         memcpy((char *) heapTuple, (char *) &htup, sizeof(HeapTupleData));
 794         heapTuple->t_datamcxt = CurrentMemoryContext;
 795         heapTuple->t_data = (HeapTupleHeader)
 796                 ((char *) heapTuple + HEAPTUPLESIZE);
 797         nread = BufFileRead(file, (void *) heapTuple->t_data, htup.t_len);
 798         if (nread != (size_t) htup.t_len)
 799                 ereport(ERROR,
 800                                 (errcode_for_file_access(),
 801                                  errmsg("could not read from hash-join temporary file: %m")));
 802         return ExecStoreTuple(heapTuple, tupleSlot, InvalidBuffer, true);
 803 }
 804
 805
 806 void
 807 ExecReScanHashJoin(HashJoinState *node, ExprContext *exprCtxt)
 808 {
 809         /*
 810          * If we haven't yet built the hash table then we can just return; nothing
 811          * done yet, so nothing to undo.
 812          */
 813         if (node->hj_HashTable == NULL)
 814                 return;
 815
 816         /*
 817          * In a multi-batch join, we currently have to do rescans the hard way,
 818          * primarily because batch temp files may have already been released. But
 819          * if it's a single-batch join, and there is no parameter change for the
 820          * inner subnode, then we can just re-use the existing hash table without
 821          * rebuilding it.
 822          */
 823         if (node->hj_HashTable->nbatch == 1 &&
 824                 ((PlanState *) node)->righttree->chgParam == NULL)
 825         {
 826                 /* okay to reuse the hash table; needn't rescan inner, either */
 827         }
 828         else
 829         {
 830                 /* must destroy and rebuild hash table */
 831                 ExecHashTableDestroy(node->hj_HashTable);
 832                 node->hj_HashTable = NULL;
 833                 node->hj_FirstOuterTupleSlot = NULL;
 834
 835                 /*
 836                  * if chgParam of subnode is not null then plan will be re-scanned by
 837                  * first ExecProcNode.
 838                  */
 839                 if (((PlanState *) node)->righttree->chgParam == NULL)
 840                         ExecReScan(((PlanState *) node)->righttree, exprCtxt);
 841         }
 842
 843         /* Always reset intra-tuple state */
 844         node->hj_CurHashValue = 0;
 845         node->hj_CurBucketNo = 0;
 846         node->hj_CurTuple = NULL;
 847
 848         node->js.ps.ps_OuterTupleSlot = NULL;
 849         node->js.ps.ps_TupFromTlist = false;
 850         node->hj_NeedNewOuter = true;
 851         node->hj_MatchedOuter = false;
 852
 853         /*
 854          * if chgParam of subnode is not null then plan will be re-scanned by
 855          * first ExecProcNode.
 856          */
 857         if (((PlanState *) node)->lefttree->chgParam == NULL)
 858                 ExecReScan(((PlanState *) node)->lefttree, exprCtxt);
 859 }