Diffstat (limited to 'src/backend/executor/nodeSetOp.c')
-rw-r--r-- | src/backend/executor/nodeSetOp.c | 602
1 file changed, 457 insertions, 145 deletions
diff --git a/src/backend/executor/nodeSetOp.c b/src/backend/executor/nodeSetOp.c
index c60ceff1259..2421b0a6df4 100644
--- a/src/backend/executor/nodeSetOp.c
+++ b/src/backend/executor/nodeSetOp.c
@@ -4,33 +4,38 @@
  * Routines to handle INTERSECT and EXCEPT selection
  *
  * The input of a SetOp node consists of tuples from two relations,
- * which have been combined into one dataset and sorted on all the nonjunk
- * attributes.  In addition there is a junk attribute that shows which
- * relation each tuple came from.  The SetOp node scans each group of
- * identical tuples to determine how many came from each input relation.
- * Then it is a simple matter to emit the output demanded by the SQL spec
- * for INTERSECT, INTERSECT ALL, EXCEPT, or EXCEPT ALL.
+ * which have been combined into one dataset, with a junk attribute added
+ * that shows which relation each tuple came from.  In SETOP_SORTED mode,
+ * the input has furthermore been sorted according to all the grouping
+ * columns (ie, all the non-junk attributes).  The SetOp node scans each
+ * group of identical tuples to determine how many came from each input
+ * relation.  Then it is a simple matter to emit the output demanded by the
+ * SQL spec for INTERSECT, INTERSECT ALL, EXCEPT, or EXCEPT ALL.
+ *
+ * In SETOP_HASHED mode, the input is delivered in no particular order.
+ * We build a hash table in memory with one entry for each group of
+ * identical tuples, and count the number of tuples in the group from
+ * each relation.  After seeing all the input, we scan the hashtable and
+ * generate the correct output using those counts.
  *
  * This node type is not used for UNION or UNION ALL, since those can be
  * implemented more cheaply (there's no need for the junk attribute to
  * identify the source relation).
  *
+ * Note that SetOp does no qual checking nor projection.  The delivered
+ * output tuples are just copies of the first-to-arrive tuple in each
+ * input group.
+ *
  *
  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  *
  * IDENTIFICATION
- *    $PostgreSQL: pgsql/src/backend/executor/nodeSetOp.c,v 1.25 2008/01/01 19:45:49 momjian Exp $
+ *    $PostgreSQL: pgsql/src/backend/executor/nodeSetOp.c,v 1.26 2008/08/07 03:04:03 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
-/*
- * INTERFACE ROUTINES
- *    ExecSetOp        - filter input to generate INTERSECT/EXCEPT results
- *    ExecInitSetOp    - initialize node and subnodes..
- *    ExecEndSetOp     - shutdown node and subnodes
- */
 
 #include "postgres.h"
 
@@ -39,6 +44,160 @@
 #include "utils/memutils.h"
 
 
+/*
+ * SetOpStatePerGroupData - per-group working state
+ *
+ * These values are working state that is initialized at the start of
+ * an input tuple group and updated for each input tuple.
+ *
+ * In SETOP_SORTED mode, we need only one of these structs, and it's kept in
+ * the plan state node.  In SETOP_HASHED mode, the hash table contains one
+ * of these for each tuple group.
+ */
+typedef struct SetOpStatePerGroupData
+{
+    long        numLeft;        /* number of left-input dups in group */
+    long        numRight;       /* number of right-input dups in group */
+} SetOpStatePerGroupData;
+
+/*
+ * To implement hashed mode, we need a hashtable that stores a
+ * representative tuple and the duplicate counts for each distinct set
+ * of grouping columns.  We compute the hash key from the grouping columns.
+ */
+typedef struct SetOpHashEntryData *SetOpHashEntry;
+
+typedef struct SetOpHashEntryData
+{
+    TupleHashEntryData shared;  /* common header for hash table entries */
+    SetOpStatePerGroupData pergroup;
+} SetOpHashEntryData;
+
+
+static TupleTableSlot *setop_retrieve_direct(SetOpState *setopstate);
+static void setop_fill_hash_table(SetOpState *setopstate);
+static TupleTableSlot *setop_retrieve_hash_table(SetOpState *setopstate);
+
+
+/*
+ * Initialize state for a new group of input values.
+ */
+static void
+initialize_counts(SetOpState *setopstate, SetOpStatePerGroup pergroup)
+{
+    pergroup->numLeft = pergroup->numRight = 0;
+}
+
+/*
+ * Advance the appropriate counter for one input tuple.
+ */
+static void
+advance_counts(SetOpState *setopstate, SetOpStatePerGroup pergroup,
+               TupleTableSlot *inputslot)
+{
+    SetOp      *node = (SetOp *) setopstate->ps.plan;
+    int         flag;
+    bool        isNull;
+
+    flag = DatumGetInt32(slot_getattr(inputslot,
+                                      node->flagColIdx,
+                                      &isNull));
+    Assert(!isNull);
+    if (flag)
+        pergroup->numRight++;
+    else
+        pergroup->numLeft++;
+}
+
+/*
+ * Initialize the hash table to empty.
+ */
+static void
+build_hash_table(SetOpState *setopstate)
+{
+    SetOp      *node = (SetOp *) setopstate->ps.plan;
+
+    Assert(node->strategy == SETOP_HASHED);
+    Assert(node->numGroups > 0);
+
+    setopstate->hashtable = BuildTupleHashTable(node->numCols,
+                                                node->dupColIdx,
+                                                setopstate->eqfunctions,
+                                                setopstate->hashfunctions,
+                                                node->numGroups,
+                                                sizeof(SetOpHashEntryData),
+                                                setopstate->tableContext,
+                                                setopstate->tempContext);
+}
+
+/*
+ * Find or create a hashtable entry for the tuple group containing the
+ * given tuple.
+ */
+static SetOpHashEntry
+lookup_hash_entry(SetOpState *setopstate, TupleTableSlot *inputslot)
+{
+    SetOpHashEntry entry;
+    bool        isnew;
+
+    /* find or create the hashtable entry */
+    entry = (SetOpHashEntry) LookupTupleHashEntry(setopstate->hashtable,
+                                                  inputslot,
+                                                  &isnew);
+
+    if (isnew)
+    {
+        /* initialize counts for new tuple group */
+        initialize_counts(setopstate, &entry->pergroup);
+    }
+
+    /* Must reset temp context after each hashtable lookup */
+    MemoryContextReset(setopstate->tempContext);
+
+    return entry;
+}
+
+/*
+ * We've completed processing a tuple group.  Decide how many copies (if any)
+ * of its representative row to emit, and store the count into numOutput.
+ * This logic is straight from the SQL92 specification.
+ */
+static void
+set_output_count(SetOpState *setopstate, SetOpStatePerGroup pergroup)
+{
+    SetOp      *plannode = (SetOp *) setopstate->ps.plan;
+
+    switch (plannode->cmd)
+    {
+        case SETOPCMD_INTERSECT:
+            if (pergroup->numLeft > 0 && pergroup->numRight > 0)
+                setopstate->numOutput = 1;
+            else
+                setopstate->numOutput = 0;
+            break;
+        case SETOPCMD_INTERSECT_ALL:
+            setopstate->numOutput =
+                (pergroup->numLeft < pergroup->numRight) ?
+                pergroup->numLeft : pergroup->numRight;
+            break;
+        case SETOPCMD_EXCEPT:
+            if (pergroup->numLeft > 0 && pergroup->numRight == 0)
+                setopstate->numOutput = 1;
+            else
+                setopstate->numOutput = 0;
+            break;
+        case SETOPCMD_EXCEPT_ALL:
+            setopstate->numOutput =
+                (pergroup->numLeft < pergroup->numRight) ?
+                0 : (pergroup->numLeft - pergroup->numRight);
+            break;
+        default:
+            elog(ERROR, "unrecognized set op: %d", (int) plannode->cmd);
+            break;
+    }
+}
+
+
 /* ----------------------------------------------------------------
  *        ExecSetOp
  * ----------------------------------------------------------------
@@ -47,14 +206,7 @@
 TupleTableSlot *            /* return: a tuple or NULL */
 ExecSetOp(SetOpState *node)
 {
     SetOp      *plannode = (SetOp *) node->ps.plan;
-    TupleTableSlot *resultTupleSlot;
-    PlanState  *outerPlan;
-
-    /*
-     * get information from the node
-     */
-    outerPlan = outerPlanState(node);
-    resultTupleSlot = node->ps.ps_ResultTupleSlot;
+    TupleTableSlot *resultTupleSlot = node->ps.ps_ResultTupleSlot;
 
     /*
      * If the previously-returned tuple needs to be returned more than once,
@@ -66,142 +218,218 @@ ExecSetOp(SetOpState *node)
         return resultTupleSlot;
     }
 
-    /* Flag that we have no current tuple */
-    ExecClearTuple(resultTupleSlot);
+    /* Otherwise, we're done if we are out of groups */
+    if (node->setop_done)
+        return NULL;
+
+    /* Fetch the next tuple group according to the correct strategy */
+    if (plannode->strategy == SETOP_HASHED)
+    {
+        if (!node->table_filled)
+            setop_fill_hash_table(node);
+        return setop_retrieve_hash_table(node);
+    }
+    else
+        return setop_retrieve_direct(node);
+}
+
+/*
+ * ExecSetOp for non-hashed case
+ */
+static TupleTableSlot *
+setop_retrieve_direct(SetOpState *setopstate)
+{
+    SetOp      *node = (SetOp *) setopstate->ps.plan;
+    PlanState  *outerPlan;
+    SetOpStatePerGroup pergroup;
+    TupleTableSlot *outerslot;
+    TupleTableSlot *resultTupleSlot;
 
     /*
-     * Absorb groups of duplicate tuples, counting them, and saving the first
-     * of each group as a possible return value.  At the end of each group,
-     * decide whether to return anything.
-     *
-     * We assume that the tuples arrive in sorted order so we can detect
-     * duplicates easily.
+     * get state info from node
      */
-    for (;;)
-    {
-        TupleTableSlot *inputTupleSlot;
-        bool        endOfGroup;
+    outerPlan = outerPlanState(setopstate);
+    pergroup = setopstate->pergroup;
+    resultTupleSlot = setopstate->ps.ps_ResultTupleSlot;
 
+    /*
+     * We loop retrieving groups until we find one we should return
+     */
+    while (!setopstate->setop_done)
+    {
         /*
-         * fetch a tuple from the outer subplan, unless we already did.
+         * If we don't already have the first tuple of the new group, fetch it
+         * from the outer plan.
          */
-        if (node->ps.ps_OuterTupleSlot == NULL &&
-            !node->subplan_done)
-        {
-            node->ps.ps_OuterTupleSlot =
-                ExecProcNode(outerPlan);
-            if (TupIsNull(node->ps.ps_OuterTupleSlot))
-                node->subplan_done = true;
-        }
-        inputTupleSlot = node->ps.ps_OuterTupleSlot;
-
-        if (TupIsNull(resultTupleSlot))
+        if (setopstate->grp_firstTuple == NULL)
         {
-            /*
-             * First of group: save a copy in result slot, and reset
-             * duplicate-counters for new group.
-             */
-            if (node->subplan_done)
-                return NULL;    /* no more tuples */
-            ExecCopySlot(resultTupleSlot, inputTupleSlot);
-            node->numLeft = 0;
-            node->numRight = 0;
-            endOfGroup = false;
-        }
-        else if (node->subplan_done)
-        {
-            /*
-             * Reached end of input, so finish processing final group
-             */
-            endOfGroup = true;
-        }
-        else
-        {
-            /*
-             * Else test if the new tuple and the previously saved tuple
-             * match.
-             */
-            if (execTuplesMatch(inputTupleSlot,
-                                resultTupleSlot,
-                                plannode->numCols, plannode->dupColIdx,
-                                node->eqfunctions,
-                                node->tempContext))
-                endOfGroup = false;
+            outerslot = ExecProcNode(outerPlan);
+            if (!TupIsNull(outerslot))
+            {
+                /* Make a copy of the first input tuple */
+                setopstate->grp_firstTuple = ExecCopySlotTuple(outerslot);
+            }
             else
-                endOfGroup = true;
+            {
+                /* outer plan produced no tuples at all */
+                setopstate->setop_done = true;
+                return NULL;
+            }
         }
 
-        if (endOfGroup)
+        /*
+         * Store the copied first input tuple in the tuple table slot
+         * reserved for it.  The tuple will be deleted when it is cleared
+         * from the slot.
+         */
+        ExecStoreTuple(setopstate->grp_firstTuple,
+                       resultTupleSlot,
+                       InvalidBuffer,
+                       true);
+        setopstate->grp_firstTuple = NULL;    /* don't keep two pointers */
+
+        /* Initialize working state for a new input tuple group */
+        initialize_counts(setopstate, pergroup);
+
+        /* Count the first input tuple */
+        advance_counts(setopstate, pergroup, resultTupleSlot);
+
+        /*
+         * Scan the outer plan until we exhaust it or cross a group boundary.
+         */
+        for (;;)
         {
+            outerslot = ExecProcNode(outerPlan);
+            if (TupIsNull(outerslot))
+            {
+                /* no more outer-plan tuples available */
+                setopstate->setop_done = true;
+                break;
+            }
+
             /*
-             * We've reached the end of the group containing resultTuple.
-             * Decide how many copies (if any) to emit.  This logic is
-             * straight from the SQL92 specification.
+             * Check whether we've crossed a group boundary.
              */
-            switch (plannode->cmd)
+            if (!execTuplesMatch(resultTupleSlot,
+                                 outerslot,
+                                 node->numCols, node->dupColIdx,
+                                 setopstate->eqfunctions,
+                                 setopstate->tempContext))
             {
-                case SETOPCMD_INTERSECT:
-                    if (node->numLeft > 0 && node->numRight > 0)
-                        node->numOutput = 1;
-                    else
-                        node->numOutput = 0;
-                    break;
-                case SETOPCMD_INTERSECT_ALL:
-                    node->numOutput =
-                        (node->numLeft < node->numRight) ?
-                        node->numLeft : node->numRight;
-                    break;
-                case SETOPCMD_EXCEPT:
-                    if (node->numLeft > 0 && node->numRight == 0)
-                        node->numOutput = 1;
-                    else
-                        node->numOutput = 0;
-                    break;
-                case SETOPCMD_EXCEPT_ALL:
-                    node->numOutput =
-                        (node->numLeft < node->numRight) ?
-                        0 : (node->numLeft - node->numRight);
-                    break;
-                default:
-                    elog(ERROR, "unrecognized set op: %d",
-                         (int) plannode->cmd);
-                    break;
-            }
-            /* Fall out of for-loop if we have tuples to emit */
-            if (node->numOutput > 0)
+                /*
+                 * Save the first input tuple of the next group.
+                 */
+                setopstate->grp_firstTuple = ExecCopySlotTuple(outerslot);
                 break;
-            /* Else flag that we have no current tuple, and loop around */
-            ExecClearTuple(resultTupleSlot);
+            }
+
+            /* Still in same group, so count this tuple */
+            advance_counts(setopstate, pergroup, outerslot);
         }
-        else
+
+        /*
+         * Done scanning input tuple group.  See if we should emit any
+         * copies of result tuple, and if so return the first copy.
+         */
+        set_output_count(setopstate, pergroup);
+
+        if (setopstate->numOutput > 0)
         {
-            /*
-             * Current tuple is member of same group as resultTuple.  Count it
-             * in the appropriate counter.
-             */
-            int         flag;
-            bool        isNull;
-
-            flag = DatumGetInt32(slot_getattr(inputTupleSlot,
-                                              plannode->flagColIdx,
-                                              &isNull));
-            Assert(!isNull);
-            if (flag)
-                node->numRight++;
-            else
-                node->numLeft++;
-            /* Set flag to fetch a new input tuple, and loop around */
-            node->ps.ps_OuterTupleSlot = NULL;
+            setopstate->numOutput--;
+            return resultTupleSlot;
         }
     }
 
+    /* No more groups */
+    ExecClearTuple(resultTupleSlot);
+    return NULL;
+}
+
+/*
+ * ExecSetOp for hashed case: phase 1, read input and build hash table
+ */
+static void
+setop_fill_hash_table(SetOpState *setopstate)
+{
+    PlanState  *outerPlan;
+    SetOpHashEntry entry;
+    TupleTableSlot *outerslot;
+
+    /*
+     * get state info from node
+     */
+    outerPlan = outerPlanState(setopstate);
+
     /*
-     * If we fall out of loop, then we need to emit at least one copy of
-     * resultTuple.
+     * Process each outer-plan tuple, and then fetch the next one, until we
+     * exhaust the outer plan.
      */
-    Assert(node->numOutput > 0);
-    node->numOutput--;
-    return resultTupleSlot;
+    for (;;)
+    {
+        outerslot = ExecProcNode(outerPlan);
+        if (TupIsNull(outerslot))
+            break;
+
+        /* Find or build hashtable entry for this tuple's group */
+        entry = lookup_hash_entry(setopstate, outerslot);
+
+        /* Advance the counts */
+        advance_counts(setopstate, &entry->pergroup, outerslot);
+    }
+
+    setopstate->table_filled = true;
+    /* Initialize to walk the hash table */
+    ResetTupleHashIterator(setopstate->hashtable, &setopstate->hashiter);
+}
+
+/*
+ * ExecSetOp for hashed case: phase 2, retrieving groups from hash table
+ */
+static TupleTableSlot *
+setop_retrieve_hash_table(SetOpState *setopstate)
+{
+    SetOpHashEntry entry;
+    TupleTableSlot *resultTupleSlot;
+
+    /*
+     * get state info from node
+     */
+    resultTupleSlot = setopstate->ps.ps_ResultTupleSlot;
+
+    /*
+     * We loop retrieving groups until we find one we should return
+     */
+    while (!setopstate->setop_done)
+    {
+        /*
+         * Find the next entry in the hash table
+         */
+        entry = (SetOpHashEntry) ScanTupleHashTable(&setopstate->hashiter);
+        if (entry == NULL)
+        {
+            /* No more entries in hashtable, so done */
+            setopstate->setop_done = true;
+            return NULL;
+        }
+
+        /*
+         * See if we should emit any copies of this tuple, and if so return
+         * the first copy.
+         */
+        set_output_count(setopstate, &entry->pergroup);
+
+        if (setopstate->numOutput > 0)
+        {
+            setopstate->numOutput--;
+            return ExecStoreMinimalTuple(entry->shared.firstTuple,
+                                         resultTupleSlot,
+                                         false);
+        }
+    }
+
+    /* No more groups */
+    ExecClearTuple(resultTupleSlot);
+    return NULL;
 }
 
 /* ----------------------------------------------------------------
@@ -226,9 +454,14 @@ ExecInitSetOp(SetOp *node, EState *estate, int eflags)
     setopstate->ps.plan = (Plan *) node;
     setopstate->ps.state = estate;
 
-    setopstate->ps.ps_OuterTupleSlot = NULL;
-    setopstate->subplan_done = false;
+    setopstate->eqfunctions = NULL;
+    setopstate->hashfunctions = NULL;
+    setopstate->setop_done = false;
     setopstate->numOutput = 0;
+    setopstate->pergroup = NULL;
+    setopstate->grp_firstTuple = NULL;
+    setopstate->hashtable = NULL;
+    setopstate->tableContext = NULL;
 
     /*
      * Miscellaneous initialization
@@ -244,6 +477,19 @@ ExecInitSetOp(SetOp *node, EState *estate, int eflags)
                                           ALLOCSET_DEFAULT_INITSIZE,
                                           ALLOCSET_DEFAULT_MAXSIZE);
 
+    /*
+     * If hashing, we also need a longer-lived context to store the hash
+     * table.  The table can't just be kept in the per-query context because
+     * we want to be able to throw it away in ExecReScanSetOp.
+     */
+    if (node->strategy == SETOP_HASHED)
+        setopstate->tableContext =
+            AllocSetContextCreate(CurrentMemoryContext,
+                                  "SetOp hash table",
+                                  ALLOCSET_DEFAULT_MINSIZE,
+                                  ALLOCSET_DEFAULT_INITSIZE,
+                                  ALLOCSET_DEFAULT_MAXSIZE);
+
 #define SETOP_NSLOTS 1
 
     /*
@@ -252,8 +498,13 @@ ExecInitSetOp(SetOp *node, EState *estate, int eflags)
     ExecInitResultTupleSlot(estate, &setopstate->ps);
 
     /*
-     * then initialize outer plan
+     * initialize child nodes
+     *
+     * If we are hashing then the child plan does not need
+     * to handle REWIND efficiently; see ExecReScanSetOp.
      */
+    if (node->strategy == SETOP_HASHED)
+        eflags &= ~EXEC_FLAG_REWIND;
     outerPlanState(setopstate) = ExecInitNode(outerPlan(node), estate, eflags);
 
     /*
@@ -264,11 +515,30 @@ ExecInitSetOp(SetOp *node, EState *estate, int eflags)
     setopstate->ps.ps_ProjInfo = NULL;
 
     /*
-     * Precompute fmgr lookup data for inner loop
+     * Precompute fmgr lookup data for inner loop.  We need both equality and
+     * hashing functions to do it by hashing, but only equality if not
+     * hashing.
      */
-    setopstate->eqfunctions =
-        execTuplesMatchPrepare(node->numCols,
-                               node->dupOperators);
+    if (node->strategy == SETOP_HASHED)
+        execTuplesHashPrepare(node->numCols,
+                              node->dupOperators,
+                              &setopstate->eqfunctions,
+                              &setopstate->hashfunctions);
+    else
+        setopstate->eqfunctions =
+            execTuplesMatchPrepare(node->numCols,
+                                   node->dupOperators);
+
+    if (node->strategy == SETOP_HASHED)
+    {
+        build_hash_table(setopstate);
+        setopstate->table_filled = false;
+    }
+    else
+    {
+        setopstate->pergroup =
+            (SetOpStatePerGroup) palloc0(sizeof(SetOpStatePerGroupData));
+    }
 
     return setopstate;
 }
@@ -293,9 +563,11 @@ ExecEndSetOp(SetOpState *node)
 {
     /* clean up tuple table */
     ExecClearTuple(node->ps.ps_ResultTupleSlot);
-    node->ps.ps_OuterTupleSlot = NULL;
 
+    /* free subsidiary stuff including hashtable */
     MemoryContextDelete(node->tempContext);
+    if (node->tableContext)
+        MemoryContextDelete(node->tableContext);
 
     ExecEndNode(outerPlanState(node));
 }
@@ -305,10 +577,50 @@ void
 ExecReScanSetOp(SetOpState *node, ExprContext *exprCtxt)
 {
     ExecClearTuple(node->ps.ps_ResultTupleSlot);
-    node->ps.ps_OuterTupleSlot = NULL;
-    node->subplan_done = false;
+    node->setop_done = false;
     node->numOutput = 0;
 
+    if (((SetOp *) node->ps.plan)->strategy == SETOP_HASHED)
+    {
+        /*
+         * In the hashed case, if we haven't yet built the hash table then we
+         * can just return; nothing done yet, so nothing to undo.  If
+         * subnode's chgParam is not NULL then it will be re-scanned by
+         * ExecProcNode, else no reason to re-scan it at all.
+         */
+        if (!node->table_filled)
+            return;
+
+        /*
+         * If we do have the hash table and the subplan does not have any
+         * parameter changes, then we can just rescan the existing hash
+         * table; no need to build it again.
+         */
+        if (((PlanState *) node)->lefttree->chgParam == NULL)
+        {
+            ResetTupleHashIterator(node->hashtable, &node->hashiter);
+            return;
+        }
+    }
+
+    /* Release first tuple of group, if we have made a copy */
+    if (node->grp_firstTuple != NULL)
+    {
+        heap_freetuple(node->grp_firstTuple);
+        node->grp_firstTuple = NULL;
+    }
+
+    /* Release any hashtable storage */
+    if (node->tableContext)
+        MemoryContextResetAndDeleteChildren(node->tableContext);
+
+    /* And rebuild empty hashtable if needed */
+    if (((SetOp *) node->ps.plan)->strategy == SETOP_HASHED)
+    {
+        build_hash_table(node);
+        node->table_filled = false;
+    }
+
     /*
      * if chgParam of subnode is not null then plan will be re-scanned by
     * first ExecProcNode.
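
A note on the emission rules: set_output_count() in the patch is pure arithmetic over the two per-group counters, so it is easy to check in isolation. The standalone sketch below mirrors those SQL92 rules outside the executor; the enum and function names here are hypothetical stand-ins for illustration, not PostgreSQL symbols.

#include <stdio.h>

/* Mirrors the four SetOpCmd cases handled by set_output_count() */
typedef enum
{
    CMD_INTERSECT,
    CMD_INTERSECT_ALL,
    CMD_EXCEPT,
    CMD_EXCEPT_ALL
} setop_cmd;

/* Given the per-group counts from each input, how many copies to emit */
static long
output_count(setop_cmd cmd, long numLeft, long numRight)
{
    switch (cmd)
    {
        case CMD_INTERSECT:
            return (numLeft > 0 && numRight > 0) ? 1 : 0;
        case CMD_INTERSECT_ALL:
            /* min of the two counts */
            return (numLeft < numRight) ? numLeft : numRight;
        case CMD_EXCEPT:
            return (numLeft > 0 && numRight == 0) ? 1 : 0;
        case CMD_EXCEPT_ALL:
            /* difference, clamped at zero */
            return (numLeft < numRight) ? 0 : (numLeft - numRight);
    }
    return 0;
}

int
main(void)
{
    /* a group seen 3 times in the left input and once in the right */
    printf("INTERSECT:     %ld\n", output_count(CMD_INTERSECT, 3, 1));     /* 1 */
    printf("INTERSECT ALL: %ld\n", output_count(CMD_INTERSECT_ALL, 3, 1)); /* 1 */
    printf("EXCEPT:        %ld\n", output_count(CMD_EXCEPT, 3, 1));        /* 0 */
    printf("EXCEPT ALL:    %ld\n", output_count(CMD_EXCEPT_ALL, 3, 1));    /* 2 */
    return 0;
}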
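
The SETOP_HASHED strategy itself is just "count per group, then emit": phase 1 absorbs the whole input while bumping one of two counters per group (the junk flag column says which side a tuple came from), and phase 2 walks the table and emits copies. Here is a toy model over integer keys, assuming a small fixed-size linear-probing table in place of the executor's TupleHashTable; it is purely illustrative, none of the names below are the real API.

#include <stdio.h>

#define TABLE_SIZE 64           /* toy size; the real table is sized from the
                                 * planner's numGroups estimate */

typedef struct
{
    int     key;                /* stands in for the grouping-column values */
    long    numLeft;            /* tuples seen from the left input */
    long    numRight;           /* tuples seen from the right input */
    int     used;
} GroupEntry;

static GroupEntry groups[TABLE_SIZE];

/* Phase 1 helper: find or create the entry for a key (linear probing) */
static GroupEntry *
lookup(int key)
{
    int     i = (unsigned) key % TABLE_SIZE;

    while (groups[i].used && groups[i].key != key)
        i = (i + 1) % TABLE_SIZE;
    if (!groups[i].used)
    {
        groups[i].used = 1;     /* new group; counters start at zero */
        groups[i].key = key;
    }
    return &groups[i];
}

int
main(void)
{
    int     left[] = {1, 1, 2, 3};  /* tuples with flag = 0 */
    int     right[] = {1, 3, 3};    /* tuples with flag = 1 */
    int     i;

    /* Phase 1: absorb all input in any order, advancing the counts */
    for (i = 0; i < 4; i++)
        lookup(left[i])->numLeft++;
    for (i = 0; i < 3; i++)
        lookup(right[i])->numRight++;

    /* Phase 2: scan the table; emit per INTERSECT ALL (min of the counts) */
    for (i = 0; i < TABLE_SIZE; i++)
    {
        long    n;

        if (!groups[i].used)
            continue;
        n = (groups[i].numLeft < groups[i].numRight) ?
            groups[i].numLeft : groups[i].numRight;
        while (n-- > 0)
            printf("%d\n", groups[i].key);  /* prints 1 and 3 once each */
    }
    return 0;
}

Because only the counters and one representative tuple per group are kept, memory grows with the number of distinct groups rather than with the input size, which is what makes the hashed strategy attractive when the planner's numGroups estimate fits in work_mem.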