diff options
author | Tom Lane <tgl@sss.pgh.pa.us> | 2008-08-07 03:04:04 +0000 |
---|---|---|
committer | Tom Lane <tgl@sss.pgh.pa.us> | 2008-08-07 03:04:04 +0000 |
commit | 368df3042783778031ece2b8580324516cd42de1 (patch) | |
tree | dd6f9877cdc12654647a97785c4e7267cc7ac945 /src/backend/executor/nodeSetOp.c | |
parent | 2d1d96b1cea8f67a095e8f28372af4081605f681 (diff) | |
download | postgresql-368df3042783778031ece2b8580324516cd42de1.tar.gz postgresql-368df3042783778031ece2b8580324516cd42de1.zip |
Support hashing for duplicate-elimination in INTERSECT and EXCEPT queries.
This completes my project of improving usage of hashing for duplicate
elimination (aggregate functions with DISTINCT remain undone, but that's
for some other day).
As with the previous patches, this means we can INTERSECT/EXCEPT on datatypes
that can hash but not sort, and it means that INTERSECT/EXCEPT without ORDER
BY are no longer certain to produce sorted output.
Diffstat (limited to 'src/backend/executor/nodeSetOp.c')
-rw-r--r-- | src/backend/executor/nodeSetOp.c | 602 |
1 files changed, 457 insertions, 145 deletions
diff --git a/src/backend/executor/nodeSetOp.c b/src/backend/executor/nodeSetOp.c index c60ceff1259..2421b0a6df4 100644 --- a/src/backend/executor/nodeSetOp.c +++ b/src/backend/executor/nodeSetOp.c @@ -4,33 +4,38 @@ * Routines to handle INTERSECT and EXCEPT selection * * The input of a SetOp node consists of tuples from two relations, - * which have been combined into one dataset and sorted on all the nonjunk - * attributes. In addition there is a junk attribute that shows which - * relation each tuple came from. The SetOp node scans each group of - * identical tuples to determine how many came from each input relation. - * Then it is a simple matter to emit the output demanded by the SQL spec - * for INTERSECT, INTERSECT ALL, EXCEPT, or EXCEPT ALL. + * which have been combined into one dataset, with a junk attribute added + * that shows which relation each tuple came from. In SETOP_SORTED mode, + * the input has furthermore been sorted according to all the grouping + * columns (ie, all the non-junk attributes). The SetOp node scans each + * group of identical tuples to determine how many came from each input + * relation. Then it is a simple matter to emit the output demanded by the + * SQL spec for INTERSECT, INTERSECT ALL, EXCEPT, or EXCEPT ALL. + * + * In SETOP_HASHED mode, the input is delivered in no particular order. + * We build a hash table in memory with one entry for each group of + * identical tuples, and count the number of tuples in the group from + * each relation. After seeing all the input, we scan the hashtable and + * generate the correct output using those counts. * * This node type is not used for UNION or UNION ALL, since those can be * implemented more cheaply (there's no need for the junk attribute to * identify the source relation). * + * Note that SetOp does no qual checking nor projection. The delivered + * output tuples are just copies of the first-to-arrive tuple in each + * input group. + * * * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/executor/nodeSetOp.c,v 1.25 2008/01/01 19:45:49 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/executor/nodeSetOp.c,v 1.26 2008/08/07 03:04:03 tgl Exp $ * *------------------------------------------------------------------------- */ -/* - * INTERFACE ROUTINES - * ExecSetOp - filter input to generate INTERSECT/EXCEPT results - * ExecInitSetOp - initialize node and subnodes.. - * ExecEndSetOp - shutdown node and subnodes - */ #include "postgres.h" @@ -39,6 +44,160 @@ #include "utils/memutils.h" +/* + * SetOpStatePerGroupData - per-group working state + * + * These values are working state that is initialized at the start of + * an input tuple group and updated for each input tuple. + * + * In SETOP_SORTED mode, we need only one of these structs, and it's kept in + * the plan state node. In SETOP_HASHED mode, the hash table contains one + * of these for each tuple group. + */ +typedef struct SetOpStatePerGroupData +{ + long numLeft; /* number of left-input dups in group */ + long numRight; /* number of right-input dups in group */ +} SetOpStatePerGroupData; + +/* + * To implement hashed mode, we need a hashtable that stores a + * representative tuple and the duplicate counts for each distinct set + * of grouping columns. We compute the hash key from the grouping columns. + */ +typedef struct SetOpHashEntryData *SetOpHashEntry; + +typedef struct SetOpHashEntryData +{ + TupleHashEntryData shared; /* common header for hash table entries */ + SetOpStatePerGroupData pergroup; +} SetOpHashEntryData; + + +static TupleTableSlot *setop_retrieve_direct(SetOpState *setopstate); +static void setop_fill_hash_table(SetOpState *setopstate); +static TupleTableSlot *setop_retrieve_hash_table(SetOpState *setopstate); + + +/* + * Initialize state for a new group of input values. + */ +static void +initialize_counts(SetOpState *setopstate, SetOpStatePerGroup pergroup) +{ + pergroup->numLeft = pergroup->numRight = 0; +} + +/* + * Advance the appropriate counter for one input tuple. + */ +static void +advance_counts(SetOpState *setopstate, SetOpStatePerGroup pergroup, + TupleTableSlot *inputslot) +{ + SetOp *node = (SetOp *) setopstate->ps.plan; + int flag; + bool isNull; + + flag = DatumGetInt32(slot_getattr(inputslot, + node->flagColIdx, + &isNull)); + Assert(!isNull); + if (flag) + pergroup->numRight++; + else + pergroup->numLeft++; +} + +/* + * Initialize the hash table to empty. + */ +static void +build_hash_table(SetOpState *setopstate) +{ + SetOp *node = (SetOp *) setopstate->ps.plan; + + Assert(node->strategy == SETOP_HASHED); + Assert(node->numGroups > 0); + + setopstate->hashtable = BuildTupleHashTable(node->numCols, + node->dupColIdx, + setopstate->eqfunctions, + setopstate->hashfunctions, + node->numGroups, + sizeof(SetOpHashEntryData), + setopstate->tableContext, + setopstate->tempContext); +} + +/* + * Find or create a hashtable entry for the tuple group containing the + * given tuple. + */ +static SetOpHashEntry +lookup_hash_entry(SetOpState *setopstate, TupleTableSlot *inputslot) +{ + SetOpHashEntry entry; + bool isnew; + + /* find or create the hashtable entry */ + entry = (SetOpHashEntry) LookupTupleHashEntry(setopstate->hashtable, + inputslot, + &isnew); + + if (isnew) + { + /* initialize counts for new tuple group */ + initialize_counts(setopstate, &entry->pergroup); + } + + /* Must reset temp context after each hashtable lookup */ + MemoryContextReset(setopstate->tempContext); + + return entry; +} + +/* + * We've completed processing a tuple group. Decide how many copies (if any) + * of its representative row to emit, and store the count into numOutput. + * This logic is straight from the SQL92 specification. + */ +static void +set_output_count(SetOpState *setopstate, SetOpStatePerGroup pergroup) +{ + SetOp *plannode = (SetOp *) setopstate->ps.plan; + + switch (plannode->cmd) + { + case SETOPCMD_INTERSECT: + if (pergroup->numLeft > 0 && pergroup->numRight > 0) + setopstate->numOutput = 1; + else + setopstate->numOutput = 0; + break; + case SETOPCMD_INTERSECT_ALL: + setopstate->numOutput = + (pergroup->numLeft < pergroup->numRight) ? + pergroup->numLeft : pergroup->numRight; + break; + case SETOPCMD_EXCEPT: + if (pergroup->numLeft > 0 && pergroup->numRight == 0) + setopstate->numOutput = 1; + else + setopstate->numOutput = 0; + break; + case SETOPCMD_EXCEPT_ALL: + setopstate->numOutput = + (pergroup->numLeft < pergroup->numRight) ? + 0 : (pergroup->numLeft - pergroup->numRight); + break; + default: + elog(ERROR, "unrecognized set op: %d", (int) plannode->cmd); + break; + } +} + + /* ---------------------------------------------------------------- * ExecSetOp * ---------------------------------------------------------------- @@ -47,14 +206,7 @@ TupleTableSlot * /* return: a tuple or NULL */ ExecSetOp(SetOpState *node) { SetOp *plannode = (SetOp *) node->ps.plan; - TupleTableSlot *resultTupleSlot; - PlanState *outerPlan; - - /* - * get information from the node - */ - outerPlan = outerPlanState(node); - resultTupleSlot = node->ps.ps_ResultTupleSlot; + TupleTableSlot *resultTupleSlot = node->ps.ps_ResultTupleSlot; /* * If the previously-returned tuple needs to be returned more than once, @@ -66,142 +218,218 @@ ExecSetOp(SetOpState *node) return resultTupleSlot; } - /* Flag that we have no current tuple */ - ExecClearTuple(resultTupleSlot); + /* Otherwise, we're done if we are out of groups */ + if (node->setop_done) + return NULL; + + /* Fetch the next tuple group according to the correct strategy */ + if (plannode->strategy == SETOP_HASHED) + { + if (!node->table_filled) + setop_fill_hash_table(node); + return setop_retrieve_hash_table(node); + } + else + return setop_retrieve_direct(node); +} + +/* + * ExecSetOp for non-hashed case + */ +static TupleTableSlot * +setop_retrieve_direct(SetOpState *setopstate) +{ + SetOp *node = (SetOp *) setopstate->ps.plan; + PlanState *outerPlan; + SetOpStatePerGroup pergroup; + TupleTableSlot *outerslot; + TupleTableSlot *resultTupleSlot; /* - * Absorb groups of duplicate tuples, counting them, and saving the first - * of each group as a possible return value. At the end of each group, - * decide whether to return anything. - * - * We assume that the tuples arrive in sorted order so we can detect - * duplicates easily. + * get state info from node */ - for (;;) - { - TupleTableSlot *inputTupleSlot; - bool endOfGroup; + outerPlan = outerPlanState(setopstate); + pergroup = setopstate->pergroup; + resultTupleSlot = setopstate->ps.ps_ResultTupleSlot; + /* + * We loop retrieving groups until we find one we should return + */ + while (!setopstate->setop_done) + { /* - * fetch a tuple from the outer subplan, unless we already did. + * If we don't already have the first tuple of the new group, fetch it + * from the outer plan. */ - if (node->ps.ps_OuterTupleSlot == NULL && - !node->subplan_done) - { - node->ps.ps_OuterTupleSlot = - ExecProcNode(outerPlan); - if (TupIsNull(node->ps.ps_OuterTupleSlot)) - node->subplan_done = true; - } - inputTupleSlot = node->ps.ps_OuterTupleSlot; - - if (TupIsNull(resultTupleSlot)) + if (setopstate->grp_firstTuple == NULL) { - /* - * First of group: save a copy in result slot, and reset - * duplicate-counters for new group. - */ - if (node->subplan_done) - return NULL; /* no more tuples */ - ExecCopySlot(resultTupleSlot, inputTupleSlot); - node->numLeft = 0; - node->numRight = 0; - endOfGroup = false; - } - else if (node->subplan_done) - { - /* - * Reached end of input, so finish processing final group - */ - endOfGroup = true; - } - else - { - /* - * Else test if the new tuple and the previously saved tuple - * match. - */ - if (execTuplesMatch(inputTupleSlot, - resultTupleSlot, - plannode->numCols, plannode->dupColIdx, - node->eqfunctions, - node->tempContext)) - endOfGroup = false; + outerslot = ExecProcNode(outerPlan); + if (!TupIsNull(outerslot)) + { + /* Make a copy of the first input tuple */ + setopstate->grp_firstTuple = ExecCopySlotTuple(outerslot); + } else - endOfGroup = true; + { + /* outer plan produced no tuples at all */ + setopstate->setop_done = true; + return NULL; + } } - if (endOfGroup) + /* + * Store the copied first input tuple in the tuple table slot + * reserved for it. The tuple will be deleted when it is cleared + * from the slot. + */ + ExecStoreTuple(setopstate->grp_firstTuple, + resultTupleSlot, + InvalidBuffer, + true); + setopstate->grp_firstTuple = NULL; /* don't keep two pointers */ + + /* Initialize working state for a new input tuple group */ + initialize_counts(setopstate, pergroup); + + /* Count the first input tuple */ + advance_counts(setopstate, pergroup, resultTupleSlot); + + /* + * Scan the outer plan until we exhaust it or cross a group boundary. + */ + for (;;) { + outerslot = ExecProcNode(outerPlan); + if (TupIsNull(outerslot)) + { + /* no more outer-plan tuples available */ + setopstate->setop_done = true; + break; + } + /* - * We've reached the end of the group containing resultTuple. - * Decide how many copies (if any) to emit. This logic is - * straight from the SQL92 specification. + * Check whether we've crossed a group boundary. */ - switch (plannode->cmd) + if (!execTuplesMatch(resultTupleSlot, + outerslot, + node->numCols, node->dupColIdx, + setopstate->eqfunctions, + setopstate->tempContext)) { - case SETOPCMD_INTERSECT: - if (node->numLeft > 0 && node->numRight > 0) - node->numOutput = 1; - else - node->numOutput = 0; - break; - case SETOPCMD_INTERSECT_ALL: - node->numOutput = - (node->numLeft < node->numRight) ? - node->numLeft : node->numRight; - break; - case SETOPCMD_EXCEPT: - if (node->numLeft > 0 && node->numRight == 0) - node->numOutput = 1; - else - node->numOutput = 0; - break; - case SETOPCMD_EXCEPT_ALL: - node->numOutput = - (node->numLeft < node->numRight) ? - 0 : (node->numLeft - node->numRight); - break; - default: - elog(ERROR, "unrecognized set op: %d", - (int) plannode->cmd); - break; - } - /* Fall out of for-loop if we have tuples to emit */ - if (node->numOutput > 0) + /* + * Save the first input tuple of the next group. + */ + setopstate->grp_firstTuple = ExecCopySlotTuple(outerslot); break; - /* Else flag that we have no current tuple, and loop around */ - ExecClearTuple(resultTupleSlot); + } + + /* Still in same group, so count this tuple */ + advance_counts(setopstate, pergroup, outerslot); } - else + + /* + * Done scanning input tuple group. See if we should emit any + * copies of result tuple, and if so return the first copy. + */ + set_output_count(setopstate, pergroup); + + if (setopstate->numOutput > 0) { - /* - * Current tuple is member of same group as resultTuple. Count it - * in the appropriate counter. - */ - int flag; - bool isNull; - - flag = DatumGetInt32(slot_getattr(inputTupleSlot, - plannode->flagColIdx, - &isNull)); - Assert(!isNull); - if (flag) - node->numRight++; - else - node->numLeft++; - /* Set flag to fetch a new input tuple, and loop around */ - node->ps.ps_OuterTupleSlot = NULL; + setopstate->numOutput--; + return resultTupleSlot; } } + /* No more groups */ + ExecClearTuple(resultTupleSlot); + return NULL; +} + +/* + * ExecSetOp for hashed case: phase 1, read input and build hash table + */ +static void +setop_fill_hash_table(SetOpState *setopstate) +{ + PlanState *outerPlan; + SetOpHashEntry entry; + TupleTableSlot *outerslot; + + /* + * get state info from node + */ + outerPlan = outerPlanState(setopstate); + /* - * If we fall out of loop, then we need to emit at least one copy of - * resultTuple. + * Process each outer-plan tuple, and then fetch the next one, until we + * exhaust the outer plan. */ - Assert(node->numOutput > 0); - node->numOutput--; - return resultTupleSlot; + for (;;) + { + outerslot = ExecProcNode(outerPlan); + if (TupIsNull(outerslot)) + break; + + /* Find or build hashtable entry for this tuple's group */ + entry = lookup_hash_entry(setopstate, outerslot); + + /* Advance the counts */ + advance_counts(setopstate, &entry->pergroup, outerslot); + } + + setopstate->table_filled = true; + /* Initialize to walk the hash table */ + ResetTupleHashIterator(setopstate->hashtable, &setopstate->hashiter); +} + +/* + * ExecSetOp for hashed case: phase 2, retrieving groups from hash table + */ +static TupleTableSlot * +setop_retrieve_hash_table(SetOpState *setopstate) +{ + SetOpHashEntry entry; + TupleTableSlot *resultTupleSlot; + + /* + * get state info from node + */ + resultTupleSlot = setopstate->ps.ps_ResultTupleSlot; + + /* + * We loop retrieving groups until we find one we should return + */ + while (!setopstate->setop_done) + { + /* + * Find the next entry in the hash table + */ + entry = (SetOpHashEntry) ScanTupleHashTable(&setopstate->hashiter); + if (entry == NULL) + { + /* No more entries in hashtable, so done */ + setopstate->setop_done = true; + return NULL; + } + + /* + * See if we should emit any copies of this tuple, and if so return + * the first copy. + */ + set_output_count(setopstate, &entry->pergroup); + + if (setopstate->numOutput > 0) + { + setopstate->numOutput--; + return ExecStoreMinimalTuple(entry->shared.firstTuple, + resultTupleSlot, + false); + } + } + + /* No more groups */ + ExecClearTuple(resultTupleSlot); + return NULL; } /* ---------------------------------------------------------------- @@ -226,9 +454,14 @@ ExecInitSetOp(SetOp *node, EState *estate, int eflags) setopstate->ps.plan = (Plan *) node; setopstate->ps.state = estate; - setopstate->ps.ps_OuterTupleSlot = NULL; - setopstate->subplan_done = false; + setopstate->eqfunctions = NULL; + setopstate->hashfunctions = NULL; + setopstate->setop_done = false; setopstate->numOutput = 0; + setopstate->pergroup = NULL; + setopstate->grp_firstTuple = NULL; + setopstate->hashtable = NULL; + setopstate->tableContext = NULL; /* * Miscellaneous initialization @@ -244,6 +477,19 @@ ExecInitSetOp(SetOp *node, EState *estate, int eflags) ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); + /* + * If hashing, we also need a longer-lived context to store the hash + * table. The table can't just be kept in the per-query context because + * we want to be able to throw it away in ExecReScanSetOp. + */ + if (node->strategy == SETOP_HASHED) + setopstate->tableContext = + AllocSetContextCreate(CurrentMemoryContext, + "SetOp hash table", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + #define SETOP_NSLOTS 1 /* @@ -252,8 +498,13 @@ ExecInitSetOp(SetOp *node, EState *estate, int eflags) ExecInitResultTupleSlot(estate, &setopstate->ps); /* - * then initialize outer plan + * initialize child nodes + * + * If we are hashing then the child plan does not need + * to handle REWIND efficiently; see ExecReScanSetOp. */ + if (node->strategy == SETOP_HASHED) + eflags &= ~EXEC_FLAG_REWIND; outerPlanState(setopstate) = ExecInitNode(outerPlan(node), estate, eflags); /* @@ -264,11 +515,30 @@ ExecInitSetOp(SetOp *node, EState *estate, int eflags) setopstate->ps.ps_ProjInfo = NULL; /* - * Precompute fmgr lookup data for inner loop + * Precompute fmgr lookup data for inner loop. We need both equality and + * hashing functions to do it by hashing, but only equality if not + * hashing. */ - setopstate->eqfunctions = - execTuplesMatchPrepare(node->numCols, - node->dupOperators); + if (node->strategy == SETOP_HASHED) + execTuplesHashPrepare(node->numCols, + node->dupOperators, + &setopstate->eqfunctions, + &setopstate->hashfunctions); + else + setopstate->eqfunctions = + execTuplesMatchPrepare(node->numCols, + node->dupOperators); + + if (node->strategy == SETOP_HASHED) + { + build_hash_table(setopstate); + setopstate->table_filled = false; + } + else + { + setopstate->pergroup = + (SetOpStatePerGroup) palloc0(sizeof(SetOpStatePerGroupData)); + } return setopstate; } @@ -293,9 +563,11 @@ ExecEndSetOp(SetOpState *node) { /* clean up tuple table */ ExecClearTuple(node->ps.ps_ResultTupleSlot); - node->ps.ps_OuterTupleSlot = NULL; + /* free subsidiary stuff including hashtable */ MemoryContextDelete(node->tempContext); + if (node->tableContext) + MemoryContextDelete(node->tableContext); ExecEndNode(outerPlanState(node)); } @@ -305,10 +577,50 @@ void ExecReScanSetOp(SetOpState *node, ExprContext *exprCtxt) { ExecClearTuple(node->ps.ps_ResultTupleSlot); - node->ps.ps_OuterTupleSlot = NULL; - node->subplan_done = false; + node->setop_done = false; node->numOutput = 0; + if (((SetOp *) node->ps.plan)->strategy == SETOP_HASHED) + { + /* + * In the hashed case, if we haven't yet built the hash table then we + * can just return; nothing done yet, so nothing to undo. If subnode's + * chgParam is not NULL then it will be re-scanned by ExecProcNode, + * else no reason to re-scan it at all. + */ + if (!node->table_filled) + return; + + /* + * If we do have the hash table and the subplan does not have any + * parameter changes, then we can just rescan the existing hash table; + * no need to build it again. + */ + if (((PlanState *) node)->lefttree->chgParam == NULL) + { + ResetTupleHashIterator(node->hashtable, &node->hashiter); + return; + } + } + + /* Release first tuple of group, if we have made a copy */ + if (node->grp_firstTuple != NULL) + { + heap_freetuple(node->grp_firstTuple); + node->grp_firstTuple = NULL; + } + + /* Release any hashtable storage */ + if (node->tableContext) + MemoryContextResetAndDeleteChildren(node->tableContext); + + /* And rebuild empty hashtable if needed */ + if (((SetOp *) node->ps.plan)->strategy == SETOP_HASHED) + { + build_hash_table(node); + node->table_filled = false; + } + /* * if chgParam of subnode is not null then plan will be re-scanned by * first ExecProcNode. |