/*------------------------------------------------------------------------- * * nodeHashjoin.c * Routines to handle hash join nodes * * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION * $PostgreSQL: pgsql/src/backend/executor/nodeHashjoin.c,v 1.72 2005/06/15 07:27:44 neilc Exp $ * *------------------------------------------------------------------------- */ #include "postgres.h" #include "executor/executor.h" #include "executor/hashjoin.h" #include "executor/nodeHash.h" #include "executor/nodeHashjoin.h" #include "optimizer/clauses.h" #include "utils/memutils.h" static TupleTableSlot *ExecHashJoinOuterGetTuple(PlanState *outerNode, HashJoinState *hjstate, uint32 *hashvalue); static TupleTableSlot *ExecHashJoinGetSavedTuple(HashJoinState *hjstate, BufFile *file, uint32 *hashvalue, TupleTableSlot *tupleSlot); static int ExecHashJoinNewBatch(HashJoinState *hjstate); static TupleTableSlot *ExecHashJoinReadOuterPlan(HashJoinState *hjstate); /* ---------------------------------------------------------------- * ExecHashJoin * * This function implements the Hybrid Hashjoin algorithm. * * Note: the relation we build hash table on is the "inner" * the other one is "outer". * ---------------------------------------------------------------- */ TupleTableSlot * /* return: a tuple or NULL */ ExecHashJoin(HashJoinState *node) { EState *estate; PlanState *outerNode; HashState *hashNode; List *joinqual; List *otherqual; ScanDirection dir; TupleTableSlot *inntuple; ExprContext *econtext; ExprDoneCond isDone; HashJoinTable hashtable; HeapTuple curtuple; TupleTableSlot *outerTupleSlot; /* * get information from HashJoin node */ estate = node->js.ps.state; joinqual = node->js.joinqual; otherqual = node->js.ps.qual; hashNode = (HashState *) innerPlanState(node); outerNode = outerPlanState(node); dir = estate->es_direction; /* * get information from HashJoin state */ hashtable = node->hj_HashTable; econtext = node->js.ps.ps_ExprContext; /* * Check to see if we're still projecting out tuples from a previous * join tuple (because there is a function-returning-set in the * projection expressions). If so, try to project another one. */ if (node->js.ps.ps_TupFromTlist) { TupleTableSlot *result; result = ExecProject(node->js.ps.ps_ProjInfo, &isDone); if (isDone == ExprMultipleResult) return result; /* Done with that source tuple... */ node->js.ps.ps_TupFromTlist = false; } /* * If we're doing an IN join, we want to return at most one row per * outer tuple; so we can stop scanning the inner scan if we matched * on the previous try. */ if (node->js.jointype == JOIN_IN && node->hj_MatchedOuter) node->hj_NeedNewOuter = true; /* * Reset per-tuple memory context to free any expression evaluation * storage allocated in the previous tuple cycle. Note this can't * happen until we're done projecting out tuples from a join tuple. */ ResetExprContext(econtext); if (hashtable == NULL) { /* * This is the first call to the node. When _either_ of the * hash join inputs are empty, we want to avoid doing * unnecessary work (e.g. building the hash table for the * inner join relation). We therefore read a single tuple from * both inputs before proceeding further. We choose which * input to probe first based on the startup cost of the plan * node. * * Note that if we're executing an outer join and the inner * relation is empty, we still have work to do. */ /* Consider probing the inner relation first */ if (hashNode->ps.plan->startup_cost <= outerNode->plan->startup_cost && node->js.jointype != JOIN_LEFT) { /* * ExecHash() lets us get a single tuple from the inner * relation without building the entire hash table */ TupleTableSlot *tup = ExecProcNode(&hashNode->ps); if (TupIsNull(tup)) return NULL; } /* * Before we can check the outer relation, we need to build * the hash table. This is somewhat a waste of time if the * outer relation is empty, but it would be awkward to avoid. */ hashtable = ExecHashTableCreate((Hash *) hashNode->ps.plan, node->hj_HashOperators); node->hj_HashTable = hashtable; hashNode->hashtable = hashtable; /* Now check the outer relation */ outerTupleSlot = ExecHashJoinReadOuterPlan(node); if (TupIsNull(outerTupleSlot)) { ExecHashTableDestroy(node->hj_HashTable); node->hj_HashTable = NULL; return NULL; } /* * Okay, we can't avoid it, so execute the Hash node to build * the hash table */ (void) MultiExecProcNode((PlanState *) hashNode); /* * If the inner relation is empty but its startup cost was * less than the outer relation's startup cost, we can arrive * here -- we're done unless this is an outer join */ if (hashtable->totalTuples == 0 && node->js.jointype != JOIN_LEFT) { ExecHashTableDestroy(node->hj_HashTable); node->hj_HashTable = NULL; return NULL; } /* * need to remember whether nbatch has increased since we began * scanning the outer relation */ hashtable->nbatch_outstart = hashtable->nbatch; } /* * run the hash join process */ for (;;) { /* * If we don't have an outer tuple, get the next one */ if (node->hj_NeedNewOuter) { outerTupleSlot = ExecHashJoinReadOuterPlan(node); if (TupIsNull(outerTupleSlot)) return NULL; /* end of join */ } /* * OK, scan the selected hash bucket for matches */ for (;;) { curtuple = ExecScanHashBucket(node, econtext); if (curtuple == NULL) break; /* out of matches */ /* * we've got a match, but still need to test non-hashed quals */ inntuple = ExecStoreTuple(curtuple, node->hj_HashTupleSlot, InvalidBuffer, false); /* don't pfree this tuple */ econtext->ecxt_innertuple = inntuple; /* reset temp memory each time to avoid leaks from qual expr */ ResetExprContext(econtext); /* * if we pass the qual, then save state for next call and have * ExecProject form the projection, store it in the tuple * table, and return the slot. * * Only the joinquals determine MatchedOuter status, but all * quals must pass to actually return the tuple. */ if (joinqual == NIL || ExecQual(joinqual, econtext, false)) { node->hj_MatchedOuter = true; if (otherqual == NIL || ExecQual(otherqual, econtext, false)) { TupleTableSlot *result; result = ExecProject(node->js.ps.ps_ProjInfo, &isDone); if (isDone != ExprEndResult) { node->js.ps.ps_TupFromTlist = (isDone == ExprMultipleResult); return result; } } /* * If we didn't return a tuple, may need to set * NeedNewOuter */ if (node->js.jointype == JOIN_IN) { node->hj_NeedNewOuter = true; break; /* out of loop over hash bucket */ } } } /* * Now the current outer tuple has run out of matches, so check * whether to emit a dummy outer-join tuple. If not, loop around * to get a new outer tuple. */ node->hj_NeedNewOuter = true; if (!node->hj_MatchedOuter && node->js.jointype == JOIN_LEFT) { /* * We are doing an outer join and there were no join matches * for this outer tuple. Generate a fake join tuple with * nulls for the inner tuple, and return it if it passes the * non-join quals. */ econtext->ecxt_innertuple = node->hj_NullInnerTupleSlot; if (ExecQual(otherqual, econtext, false)) { /* * qualification was satisfied so we project and return * the slot containing the result tuple using * ExecProject(). */ TupleTableSlot *result; result = ExecProject(node->js.ps.ps_ProjInfo, &isDone); if (isDone != ExprEndResult) { node->js.ps.ps_TupFromTlist = (isDone == ExprMultipleResult); return result; } } } } } /* ---------------------------------------------------------------- * ExecInitHashJoin * * Init routine for HashJoin node. * ---------------------------------------------------------------- */ HashJoinState * ExecInitHashJoin(HashJoin *node, EState *estate) { HashJoinState *hjstate; Plan *outerNode; Hash *hashNode; List *lclauses; List *rclauses; List *hoperators; ListCell *l; /* * create state structure */ hjstate = makeNode(HashJoinState); hjstate->js.ps.plan = (Plan *) node; hjstate->js.ps.state = estate; /* * Miscellaneous initialization * * create expression context for node */ ExecAssignExprContext(estate, &hjstate->js.ps); /* * initialize child expressions */ hjstate->js.ps.targetlist = (List *) ExecInitExpr((Expr *) node->join.plan.targetlist, (PlanState *) hjstate); hjstate->js.ps.qual = (List *) ExecInitExpr((Expr *) node->join.plan.qual, (PlanState *) hjstate); hjstate->js.jointype = node->join.jointype; hjstate->js.joinqual = (List *) ExecInitExpr((Expr *) node->join.joinqual, (PlanState *) hjstate); hjstate->hashclauses = (List *) ExecInitExpr((Expr *) node->hashclauses, (PlanState *) hjstate); /* * initialize child nodes */ outerNode = outerPlan(node); hashNode = (Hash *) innerPlan(node); outerPlanState(hjstate) = ExecInitNode(outerNode, estate); innerPlanState(hjstate) = ExecInitNode((Plan *) hashNode, estate); #define HASHJOIN_NSLOTS 3 /* * tuple table initialization */ ExecInitResultTupleSlot(estate, &hjstate->js.ps); hjstate->hj_OuterTupleSlot = ExecInitExtraTupleSlot(estate); switch (node->join.jointype) { case JOIN_INNER: case JOIN_IN: break; case JOIN_LEFT: hjstate->hj_NullInnerTupleSlot = ExecInitNullTupleSlot(estate, ExecGetResultType(innerPlanState(hjstate))); break; default: elog(ERROR, "unrecognized join type: %d", (int) node->join.jointype); } /* * now for some voodoo. our temporary tuple slot is actually the * result tuple slot of the Hash node (which is our inner plan). we * do this because Hash nodes don't return tuples via ExecProcNode() * -- instead the hash join node uses ExecScanHashBucket() to get at * the contents of the hash table. -cim 6/9/91 */ { HashState *hashstate = (HashState *) innerPlanState(hjstate); TupleTableSlot *slot = hashstate->ps.ps_ResultTupleSlot; hjstate->hj_HashTupleSlot = slot; } /* * initialize tuple type and projection info */ ExecAssignResultTypeFromTL(&hjstate->js.ps); ExecAssignProjectionInfo(&hjstate->js.ps); ExecSetSlotDescriptor(hjstate->hj_OuterTupleSlot, ExecGetResultType(outerPlanState(hjstate)), false); /* * initialize hash-specific info */ hjstate->hj_HashTable = NULL; hjstate->hj_CurHashValue = 0; hjstate->hj_CurBucketNo = 0; hjstate->hj_CurTuple = NULL; /* * Deconstruct the hash clauses into outer and inner argument values, * so that we can evaluate those subexpressions separately. Also make * a list of the hash operator OIDs, in preparation for looking up the * hash functions to use. */ lclauses = NIL; rclauses = NIL; hoperators = NIL; foreach(l, hjstate->hashclauses) { FuncExprState *fstate = (FuncExprState *) lfirst(l); OpExpr *hclause; Assert(IsA(fstate, FuncExprState)); hclause = (OpExpr *) fstate->xprstate.expr; Assert(IsA(hclause, OpExpr)); lclauses = lappend(lclauses, linitial(fstate->args)); rclauses = lappend(rclauses, lsecond(fstate->args)); hoperators = lappend_oid(hoperators, hclause->opno); } hjstate->hj_OuterHashKeys = lclauses; hjstate->hj_InnerHashKeys = rclauses; hjstate->hj_HashOperators = hoperators; /* child Hash node needs to evaluate inner hash keys, too */ ((HashState *) innerPlanState(hjstate))->hashkeys = rclauses; hjstate->js.ps.ps_OuterTupleSlot = NULL; hjstate->js.ps.ps_TupFromTlist = false; hjstate->hj_NeedNewOuter = true; hjstate->hj_MatchedOuter = false; return hjstate; } int ExecCountSlotsHashJoin(HashJoin *node) { return ExecCountSlotsNode(outerPlan(node)) + ExecCountSlotsNode(innerPlan(node)) + HASHJOIN_NSLOTS; } /* ---------------------------------------------------------------- * ExecEndHashJoin * * clean up routine for HashJoin node * ---------------------------------------------------------------- */ void ExecEndHashJoin(HashJoinState *node) { /* * Free hash table */ if (node->hj_HashTable) { ExecHashTableDestroy(node->hj_HashTable); node->hj_HashTable = NULL; } /* * Free the exprcontext */ ExecFreeExprContext(&node->js.ps); /* * clean out the tuple table */ ExecClearTuple(node->js.ps.ps_ResultTupleSlot); ExecClearTuple(node->hj_OuterTupleSlot); ExecClearTuple(node->hj_HashTupleSlot); /* * clean up subtrees */ ExecEndNode(outerPlanState(node)); ExecEndNode(innerPlanState(node)); } /* * ExecHashJoinReadOuterPlan * * do all the work necessary to produce the next tuple from the * outer hash join relation that is in the current batch. Returns * NULL if there are no more tuples in the outer relation. */ static TupleTableSlot * ExecHashJoinReadOuterPlan(HashJoinState *hjstate) { PlanState *outerNode; ExprContext *econtext; HashJoinTable hashtable; outerNode = outerPlanState(hjstate); econtext = hjstate->js.ps.ps_ExprContext; hashtable = hjstate->hj_HashTable; for (;;) { TupleTableSlot *result; uint32 hashvalue; int batchno; result = ExecHashJoinOuterGetTuple(outerNode, hjstate, &hashvalue); if (TupIsNull(result)) { /* end of join */ return NULL; } hjstate->js.ps.ps_OuterTupleSlot = result; econtext->ecxt_outertuple = result; hjstate->hj_NeedNewOuter = false; hjstate->hj_MatchedOuter = false; /* * now we have an outer tuple, find the corresponding bucket * for this tuple from the hash table */ hjstate->hj_CurHashValue = hashvalue; ExecHashGetBucketAndBatch(hashtable, hashvalue, &hjstate->hj_CurBucketNo, &batchno); hjstate->hj_CurTuple = NULL; /* * Now we've got an outer tuple and the corresponding hash * bucket, but this tuple may not belong to the current batch. */ if (batchno != hashtable->curbatch) { /* * Need to postpone this outer tuple to a later batch. * Save it in the corresponding outer-batch file. */ Assert(batchno > hashtable->curbatch); ExecHashJoinSaveTuple(ExecFetchSlotTuple(result), hashvalue, &hashtable->outerBatchFile[batchno]); hjstate->hj_NeedNewOuter = true; continue; /* Get the next outer tuple */ } /* * Otherwise, we have a tuple in the current batch, so we're * done */ return result; } } /* * ExecHashJoinOuterGetTuple * * get the next outer tuple for hashjoin: either by * executing a plan node in the first pass, or from * the temp files for the hashjoin batches. * * Returns a null slot if no more outer tuples. On success, the tuple's * hash value is stored at *hashvalue --- this is either originally computed, * or re-read from the temp file. */ static TupleTableSlot * ExecHashJoinOuterGetTuple(PlanState *outerNode, HashJoinState *hjstate, uint32 *hashvalue) { HashJoinTable hashtable = hjstate->hj_HashTable; int curbatch = hashtable->curbatch; TupleTableSlot *slot; if (curbatch == 0) { /* if it is the first pass */ slot = ExecProcNode(outerNode); if (!TupIsNull(slot)) { /* * We have to compute the tuple's hash value. */ ExprContext *econtext = hjstate->js.ps.ps_ExprContext; econtext->ecxt_outertuple = slot; *hashvalue = ExecHashGetHashValue(hashtable, econtext, hjstate->hj_OuterHashKeys); return slot; } /* * We have just reached the end of the first pass. Try to switch * to a saved batch. */ curbatch = ExecHashJoinNewBatch(hjstate); } /* * Try to read from a temp file. Loop allows us to advance to new * batches as needed. NOTE: nbatch could increase inside * ExecHashJoinNewBatch, so don't try to optimize this loop. */ while (curbatch < hashtable->nbatch) { slot = ExecHashJoinGetSavedTuple(hjstate, hashtable->outerBatchFile[curbatch], hashvalue, hjstate->hj_OuterTupleSlot); if (!TupIsNull(slot)) return slot; curbatch = ExecHashJoinNewBatch(hjstate); } /* Out of batches... */ return NULL; } /* * ExecHashJoinNewBatch * switch to a new hashjoin batch * * Returns the number of the new batch (1..nbatch-1), or nbatch if no more. * We will never return a batch number that has an empty outer batch file. */ static int ExecHashJoinNewBatch(HashJoinState *hjstate) { HashJoinTable hashtable = hjstate->hj_HashTable; int nbatch; int curbatch; BufFile *innerFile; TupleTableSlot *slot; uint32 hashvalue; start_over: nbatch = hashtable->nbatch; curbatch = hashtable->curbatch; if (curbatch > 0) { /* * We no longer need the previous outer batch file; close it right * away to free disk space. */ if (hashtable->outerBatchFile[curbatch]) BufFileClose(hashtable->outerBatchFile[curbatch]); hashtable->outerBatchFile[curbatch] = NULL; } /* * We can always skip over any batches that are completely empty on both * sides. We can sometimes skip over batches that are empty on only one * side, but there are exceptions: * * 1. In a LEFT JOIN, we have to process outer batches even if the * inner batch is empty. * * 2. If we have increased nbatch since the initial estimate, we have * to scan inner batches since they might contain tuples that need to * be reassigned to later inner batches. * * 3. Similarly, if we have increased nbatch since starting the outer * scan, we have to rescan outer batches in case they contain tuples * that need to be reassigned. */ curbatch++; while (curbatch < nbatch && (hashtable->outerBatchFile[curbatch] == NULL || hashtable->innerBatchFile[curbatch] == NULL)) { if (hashtable->outerBatchFile[curbatch] && hjstate->js.jointype == JOIN_LEFT) break; /* must process due to rule 1 */ if (hashtable->innerBatchFile[curbatch] && nbatch != hashtable->nbatch_original) break; /* must process due to rule 2 */ if (hashtable->outerBatchFile[curbatch] && nbatch != hashtable->nbatch_outstart) break; /* must process due to rule 3 */ /* We can ignore this batch. */ /* Release associated temp files right away. */ if (hashtable->innerBatchFile[curbatch]) BufFileClose(hashtable->innerBatchFile[curbatch]); hashtable->innerBatchFile[curbatch] = NULL; if (hashtable->outerBatchFile[curbatch]) BufFileClose(hashtable->outerBatchFile[curbatch]); hashtable->outerBatchFile[curbatch] = NULL; curbatch++; } if (curbatch >= nbatch) return curbatch; /* no more batches */ hashtable->curbatch = curbatch; /* * Reload the hash table with the new inner batch (which could be empty) */ ExecHashTableReset(hashtable); innerFile = hashtable->innerBatchFile[curbatch]; if (innerFile != NULL) { if (BufFileSeek(innerFile, 0, 0L, SEEK_SET)) ereport(ERROR, (errcode_for_file_access(), errmsg("could not rewind hash-join temporary file: %m"))); while ((slot = ExecHashJoinGetSavedTuple(hjstate, innerFile, &hashvalue, hjstate->hj_HashTupleSlot))) { /* * NOTE: some tuples may be sent to future batches. Also, * it is possible for hashtable->nbatch to be increased here! */ ExecHashTableInsert(hashtable, ExecFetchSlotTuple(slot), hashvalue); } /* * after we build the hash table, the inner batch file is no longer * needed */ BufFileClose(innerFile); hashtable->innerBatchFile[curbatch] = NULL; } /* * If there's no outer batch file, advance to next batch. */ if (hashtable->outerBatchFile[curbatch] == NULL) goto start_over; /* * Rewind outer batch file, so that we can start reading it. */ if (BufFileSeek(hashtable->outerBatchFile[curbatch], 0, 0L, SEEK_SET)) ereport(ERROR, (errcode_for_file_access(), errmsg("could not rewind hash-join temporary file: %m"))); return curbatch; } /* * ExecHashJoinSaveTuple * save a tuple to a batch file. * * The data recorded in the file for each tuple is its hash value, * then an image of its HeapTupleData (with meaningless t_data pointer) * followed by the HeapTupleHeader and tuple data. * * Note: it is important always to call this in the regular executor * context, not in a shorter-lived context; else the temp file buffers * will get messed up. */ void ExecHashJoinSaveTuple(HeapTuple heapTuple, uint32 hashvalue, BufFile **fileptr) { BufFile *file = *fileptr; size_t written; if (file == NULL) { /* First write to this batch file, so open it. */ file = BufFileCreateTemp(false); *fileptr = file; } written = BufFileWrite(file, (void *) &hashvalue, sizeof(uint32)); if (written != sizeof(uint32)) ereport(ERROR, (errcode_for_file_access(), errmsg("could not write to hash-join temporary file: %m"))); written = BufFileWrite(file, (void *) heapTuple, sizeof(HeapTupleData)); if (written != sizeof(HeapTupleData)) ereport(ERROR, (errcode_for_file_access(), errmsg("could not write to hash-join temporary file: %m"))); written = BufFileWrite(file, (void *) heapTuple->t_data, heapTuple->t_len); if (written != (size_t) heapTuple->t_len) ereport(ERROR, (errcode_for_file_access(), errmsg("could not write to hash-join temporary file: %m"))); } /* * ExecHashJoinGetSavedTuple * read the next tuple from a batch file. Return NULL if no more. * * On success, *hashvalue is set to the tuple's hash value, and the tuple * itself is stored in the given slot. */ static TupleTableSlot * ExecHashJoinGetSavedTuple(HashJoinState *hjstate, BufFile *file, uint32 *hashvalue, TupleTableSlot *tupleSlot) { HeapTupleData htup; size_t nread; HeapTuple heapTuple; nread = BufFileRead(file, (void *) hashvalue, sizeof(uint32)); if (nread == 0) return NULL; /* end of file */ if (nread != sizeof(uint32)) ereport(ERROR, (errcode_for_file_access(), errmsg("could not read from hash-join temporary file: %m"))); nread = BufFileRead(file, (void *) &htup, sizeof(HeapTupleData)); if (nread != sizeof(HeapTupleData)) ereport(ERROR, (errcode_for_file_access(), errmsg("could not read from hash-join temporary file: %m"))); heapTuple = palloc(HEAPTUPLESIZE + htup.t_len); memcpy((char *) heapTuple, (char *) &htup, sizeof(HeapTupleData)); heapTuple->t_datamcxt = CurrentMemoryContext; heapTuple->t_data = (HeapTupleHeader) ((char *) heapTuple + HEAPTUPLESIZE); nread = BufFileRead(file, (void *) heapTuple->t_data, htup.t_len); if (nread != (size_t) htup.t_len) ereport(ERROR, (errcode_for_file_access(), errmsg("could not read from hash-join temporary file: %m"))); return ExecStoreTuple(heapTuple, tupleSlot, InvalidBuffer, true); } void ExecReScanHashJoin(HashJoinState *node, ExprContext *exprCtxt) { /* * If we haven't yet built the hash table then we can just return; * nothing done yet, so nothing to undo. */ if (node->hj_HashTable == NULL) return; /* * In a multi-batch join, we currently have to do rescans the hard * way, primarily because batch temp files may have already been * released. But if it's a single-batch join, and there is no * parameter change for the inner subnode, then we can just re-use the * existing hash table without rebuilding it. */ if (node->hj_HashTable->nbatch == 1 && ((PlanState *) node)->righttree->chgParam == NULL) { /* okay to reuse the hash table; needn't rescan inner, either */ } else { /* must destroy and rebuild hash table */ ExecHashTableDestroy(node->hj_HashTable); node->hj_HashTable = NULL; /* * if chgParam of subnode is not null then plan will be re-scanned * by first ExecProcNode. */ if (((PlanState *) node)->righttree->chgParam == NULL) ExecReScan(((PlanState *) node)->righttree, exprCtxt); } /* Always reset intra-tuple state */ node->hj_CurHashValue = 0; node->hj_CurBucketNo = 0; node->hj_CurTuple = NULL; node->js.ps.ps_OuterTupleSlot = NULL; node->js.ps.ps_TupFromTlist = false; node->hj_NeedNewOuter = true; node->hj_MatchedOuter = false; /* * if chgParam of subnode is not null then plan will be re-scanned by * first ExecProcNode. */ if (((PlanState *) node)->lefttree->chgParam == NULL) ExecReScan(((PlanState *) node)->lefttree, exprCtxt); }