Diffstat (limited to 'src/backend/executor/nodeSetOp.c')
-rw-r--r-- | src/backend/executor/nodeSetOp.c | 602
1 file changed, 457 insertions, 145 deletions
diff --git a/src/backend/executor/nodeSetOp.c b/src/backend/executor/nodeSetOp.c
index c60ceff1259..2421b0a6df4 100644
--- a/src/backend/executor/nodeSetOp.c
+++ b/src/backend/executor/nodeSetOp.c
@@ -4,33 +4,38 @@
  * Routines to handle INTERSECT and EXCEPT selection
  *
  * The input of a SetOp node consists of tuples from two relations,
- * which have been combined into one dataset and sorted on all the nonjunk
- * attributes.  In addition there is a junk attribute that shows which
- * relation each tuple came from.  The SetOp node scans each group of
- * identical tuples to determine how many came from each input relation.
- * Then it is a simple matter to emit the output demanded by the SQL spec
- * for INTERSECT, INTERSECT ALL, EXCEPT, or EXCEPT ALL.
+ * which have been combined into one dataset, with a junk attribute added
+ * that shows which relation each tuple came from.  In SETOP_SORTED mode,
+ * the input has furthermore been sorted according to all the grouping
+ * columns (ie, all the non-junk attributes).  The SetOp node scans each
+ * group of identical tuples to determine how many came from each input
+ * relation.  Then it is a simple matter to emit the output demanded by the
+ * SQL spec for INTERSECT, INTERSECT ALL, EXCEPT, or EXCEPT ALL.
+ *
+ * In SETOP_HASHED mode, the input is delivered in no particular order.
+ * We build a hash table in memory with one entry for each group of
+ * identical tuples, and count the number of tuples in the group from
+ * each relation.  After seeing all the input, we scan the hashtable and
+ * generate the correct output using those counts.
  *
  * This node type is not used for UNION or UNION ALL, since those can be
  * implemented more cheaply (there's no need for the junk attribute to
  * identify the source relation).
  *
+ * Note that SetOp does no qual checking nor projection.  The delivered
+ * output tuples are just copies of the first-to-arrive tuple in each
+ * input group.
+ *
  *
  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  *
  * IDENTIFICATION
- *    $PostgreSQL: pgsql/src/backend/executor/nodeSetOp.c,v 1.25 2008/01/01 19:45:49 momjian Exp $
+ *    $PostgreSQL: pgsql/src/backend/executor/nodeSetOp.c,v 1.26 2008/08/07 03:04:03 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
-/*
- * INTERFACE ROUTINES
- *    ExecSetOp        - filter input to generate INTERSECT/EXCEPT results
- *    ExecInitSetOp    - initialize node and subnodes..
- *    ExecEndSetOp     - shutdown node and subnodes
- */
 
 #include "postgres.h"
 
@@ -39,6 +44,160 @@
 #include "utils/memutils.h"
 
 
+/*
+ * SetOpStatePerGroupData - per-group working state
+ *
+ * These values are working state that is initialized at the start of
+ * an input tuple group and updated for each input tuple.
+ *
+ * In SETOP_SORTED mode, we need only one of these structs, and it's kept in
+ * the plan state node.  In SETOP_HASHED mode, the hash table contains one
+ * of these for each tuple group.
+ */
+typedef struct SetOpStatePerGroupData
+{
+    long        numLeft;        /* number of left-input dups in group */
+    long        numRight;       /* number of right-input dups in group */
+} SetOpStatePerGroupData;
+
+/*
+ * To implement hashed mode, we need a hashtable that stores a
+ * representative tuple and the duplicate counts for each distinct set
+ * of grouping columns.  We compute the hash key from the grouping columns.
+ */
+typedef struct SetOpHashEntryData *SetOpHashEntry;
+
+typedef struct SetOpHashEntryData
+{
+    TupleHashEntryData shared;  /* common header for hash table entries */
+    SetOpStatePerGroupData pergroup;
+} SetOpHashEntryData;
+
+
+static TupleTableSlot *setop_retrieve_direct(SetOpState *setopstate);
+static void setop_fill_hash_table(SetOpState *setopstate);
+static TupleTableSlot *setop_retrieve_hash_table(SetOpState *setopstate);
+
+
+/*
+ * Initialize state for a new group of input values.
+ */
+static void
+initialize_counts(SetOpState *setopstate, SetOpStatePerGroup pergroup)
+{
+    pergroup->numLeft = pergroup->numRight = 0;
+}
+
+/*
+ * Advance the appropriate counter for one input tuple.
+ */
+static void
+advance_counts(SetOpState *setopstate, SetOpStatePerGroup pergroup,
+               TupleTableSlot *inputslot)
+{
+    SetOp      *node = (SetOp *) setopstate->ps.plan;
+    int         flag;
+    bool        isNull;
+
+    flag = DatumGetInt32(slot_getattr(inputslot,
+                                      node->flagColIdx,
+                                      &isNull));
+    Assert(!isNull);
+    if (flag)
+        pergroup->numRight++;
+    else
+        pergroup->numLeft++;
+}
+
+/*
+ * Initialize the hash table to empty.
+ */
+static void
+build_hash_table(SetOpState *setopstate)
+{
+    SetOp      *node = (SetOp *) setopstate->ps.plan;
+
+    Assert(node->strategy == SETOP_HASHED);
+    Assert(node->numGroups > 0);
+
+    setopstate->hashtable = BuildTupleHashTable(node->numCols,
+                                                node->dupColIdx,
+                                                setopstate->eqfunctions,
+                                                setopstate->hashfunctions,
+                                                node->numGroups,
+                                                sizeof(SetOpHashEntryData),
+                                                setopstate->tableContext,
+                                                setopstate->tempContext);
+}
+
+/*
+ * Find or create a hashtable entry for the tuple group containing the
+ * given tuple.
+ */
+static SetOpHashEntry
+lookup_hash_entry(SetOpState *setopstate, TupleTableSlot *inputslot)
+{
+    SetOpHashEntry entry;
+    bool        isnew;
+
+    /* find or create the hashtable entry */
+    entry = (SetOpHashEntry) LookupTupleHashEntry(setopstate->hashtable,
+                                                  inputslot,
+                                                  &isnew);
+
+    if (isnew)
+    {
+        /* initialize counts for new tuple group */
+        initialize_counts(setopstate, &entry->pergroup);
+    }
+
+    /* Must reset temp context after each hashtable lookup */
+    MemoryContextReset(setopstate->tempContext);
+
+    return entry;
+}
+
+/*
+ * We've completed processing a tuple group.  Decide how many copies (if any)
+ * of its representative row to emit, and store the count into numOutput.
+ * This logic is straight from the SQL92 specification.
+ */
+static void
+set_output_count(SetOpState *setopstate, SetOpStatePerGroup pergroup)
+{
+    SetOp      *plannode = (SetOp *) setopstate->ps.plan;
+
+    switch (plannode->cmd)
+    {
+        case SETOPCMD_INTERSECT:
+            if (pergroup->numLeft > 0 && pergroup->numRight > 0)
+                setopstate->numOutput = 1;
+            else
+                setopstate->numOutput = 0;
+            break;
+        case SETOPCMD_INTERSECT_ALL:
+            setopstate->numOutput =
+                (pergroup->numLeft < pergroup->numRight) ?
+                pergroup->numLeft : pergroup->numRight;
+            break;
+        case SETOPCMD_EXCEPT:
+            if (pergroup->numLeft > 0 && pergroup->numRight == 0)
+                setopstate->numOutput = 1;
+            else
+                setopstate->numOutput = 0;
+            break;
+        case SETOPCMD_EXCEPT_ALL:
+            setopstate->numOutput =
+                (pergroup->numLeft < pergroup->numRight) ?
+                0 : (pergroup->numLeft - pergroup->numRight);
+            break;
+        default:
+            elog(ERROR, "unrecognized set op: %d", (int) plannode->cmd);
+            break;
+    }
+}
+
+
 /* ----------------------------------------------------------------
  *        ExecSetOp
  * ----------------------------------------------------------------
@@ -47,14 +206,7 @@
 TupleTableSlot *            /* return: a tuple or NULL */
 ExecSetOp(SetOpState *node)
 {
     SetOp      *plannode = (SetOp *) node->ps.plan;
-    TupleTableSlot *resultTupleSlot;
-    PlanState  *outerPlan;
-
-    /*
-     * get information from the node
-     */
-    outerPlan = outerPlanState(node);
-    resultTupleSlot = node->ps.ps_ResultTupleSlot;
+    TupleTableSlot *resultTupleSlot = node->ps.ps_ResultTupleSlot;
 
     /*
      * If the previously-returned tuple needs to be returned more than once,
@@ -66,142 +218,218 @@ ExecSetOp(SetOpState *node)
         return resultTupleSlot;
     }
 
-    /* Flag that we have no current tuple */
-    ExecClearTuple(resultTupleSlot);
+    /* Otherwise, we're done if we are out of groups */
+    if (node->setop_done)
+        return NULL;
+
+    /* Fetch the next tuple group according to the correct strategy */
+    if (plannode->strategy == SETOP_HASHED)
+    {
+        if (!node->table_filled)
+            setop_fill_hash_table(node);
+        return setop_retrieve_hash_table(node);
+    }
+    else
+        return setop_retrieve_direct(node);
+}
+
+/*
+ * ExecSetOp for non-hashed case
+ */
+static TupleTableSlot *
+setop_retrieve_direct(SetOpState *setopstate)
+{
+    SetOp      *node = (SetOp *) setopstate->ps.plan;
+    PlanState  *outerPlan;
+    SetOpStatePerGroup pergroup;
+    TupleTableSlot *outerslot;
+    TupleTableSlot *resultTupleSlot;
 
     /*
-     * Absorb groups of duplicate tuples, counting them, and saving the first
-     * of each group as a possible return value.  At the end of each group,
-     * decide whether to return anything.
-     *
-     * We assume that the tuples arrive in sorted order so we can detect
-     * duplicates easily.
+     * get state info from node
      */
-    for (;;)
-    {
-        TupleTableSlot *inputTupleSlot;
-        bool        endOfGroup;
+    outerPlan = outerPlanState(setopstate);
+    pergroup = setopstate->pergroup;
+    resultTupleSlot = setopstate->ps.ps_ResultTupleSlot;
 
+    /*
+     * We loop retrieving groups until we find one we should return
+     */
+    while (!setopstate->setop_done)
+    {
         /*
-         * fetch a tuple from the outer subplan, unless we already did.
+         * If we don't already have the first tuple of the new group, fetch it
+         * from the outer plan.
          */
-        if (node->ps.ps_OuterTupleSlot == NULL &&
-            !node->subplan_done)
-        {
-            node->ps.ps_OuterTupleSlot =
-                ExecProcNode(outerPlan);
-            if (TupIsNull(node->ps.ps_OuterTupleSlot))
-                node->subplan_done = true;
-        }
-        inputTupleSlot = node->ps.ps_OuterTupleSlot;
-
-        if (TupIsNull(resultTupleSlot))
+        if (setopstate->grp_firstTuple == NULL)
         {
-            /*
-             * First of group: save a copy in result slot, and reset
-             * duplicate-counters for new group.
-             */
-            if (node->subplan_done)
-                return NULL;    /* no more tuples */
-            ExecCopySlot(resultTupleSlot, inputTupleSlot);
-            node->numLeft = 0;
-            node->numRight = 0;
-            endOfGroup = false;
-        }
-        else if (node->subplan_done)
-        {
-            /*
-             * Reached end of input, so finish processing final group
-             */
-            endOfGroup = true;
-        }
-        else
-        {
-            /*
-             * Else test if the new tuple and the previously saved tuple
-             * match.
-             */
-            if (execTuplesMatch(inputTupleSlot,
-                                resultTupleSlot,
-                                plannode->numCols, plannode->dupColIdx,
-                                node->eqfunctions,
-                                node->tempContext))
-                endOfGroup = false;
+            outerslot = ExecProcNode(outerPlan);
+            if (!TupIsNull(outerslot))
+            {
+                /* Make a copy of the first input tuple */
+                setopstate->grp_firstTuple = ExecCopySlotTuple(outerslot);
+            }
             else
-                endOfGroup = true;
+            {
+                /* outer plan produced no tuples at all */
+                setopstate->setop_done = true;
+                return NULL;
+            }
         }
 
-        if (endOfGroup)
+        /*
+         * Store the copied first input tuple in the tuple table slot
+         * reserved for it.  The tuple will be deleted when it is cleared
+         * from the slot.
+         */
+        ExecStoreTuple(setopstate->grp_firstTuple,
+                       resultTupleSlot,
+                       InvalidBuffer,
+                       true);
+        setopstate->grp_firstTuple = NULL;    /* don't keep two pointers */
+
+        /* Initialize working state for a new input tuple group */
+        initialize_counts(setopstate, pergroup);
+
+        /* Count the first input tuple */
+        advance_counts(setopstate, pergroup, resultTupleSlot);
+
+        /*
+         * Scan the outer plan until we exhaust it or cross a group boundary.
+         */
+        for (;;)
         {
+            outerslot = ExecProcNode(outerPlan);
+            if (TupIsNull(outerslot))
+            {
+                /* no more outer-plan tuples available */
+                setopstate->setop_done = true;
+                break;
+            }
+
             /*
-             * We've reached the end of the group containing resultTuple.
-             * Decide how many copies (if any) to emit.  This logic is
-             * straight from the SQL92 specification.
+             * Check whether we've crossed a group boundary.
              */
-            switch (plannode->cmd)
+            if (!execTuplesMatch(resultTupleSlot,
+                                 outerslot,
+                                 node->numCols, node->dupColIdx,
+                                 setopstate->eqfunctions,
+                                 setopstate->tempContext))
             {
-                case SETOPCMD_INTERSECT:
-                    if (node->numLeft > 0 && node->numRight > 0)
-                        node->numOutput = 1;
-                    else
-                        node->numOutput = 0;
-                    break;
-                case SETOPCMD_INTERSECT_ALL:
-                    node->numOutput =
-                        (node->numLeft < node->numRight) ?
-                        node->numLeft : node->numRight;
-                    break;
-                case SETOPCMD_EXCEPT:
-                    if (node->numLeft > 0 && node->numRight == 0)
-                        node->numOutput = 1;
-                    else
-                        node->numOutput = 0;
-                    break;
-                case SETOPCMD_EXCEPT_ALL:
-                    node->numOutput =
-                        (node->numLeft < node->numRight) ?
-                        0 : (node->numLeft - node->numRight);
-                    break;
-                default:
-                    elog(ERROR, "unrecognized set op: %d",
-                         (int) plannode->cmd);
-                    break;
-            }
-            /* Fall out of for-loop if we have tuples to emit */
-            if (node->numOutput > 0)
+                /*
+                 * Save the first input tuple of the next group.
+                 */
+                setopstate->grp_firstTuple = ExecCopySlotTuple(outerslot);
                 break;
-            /* Else flag that we have no current tuple, and loop around */
-            ExecClearTuple(resultTupleSlot);
+            }
+
+            /* Still in same group, so count this tuple */
+            advance_counts(setopstate, pergroup, outerslot);
         }
-        else
+
+        /*
+         * Done scanning input tuple group.  See if we should emit any
+         * copies of result tuple, and if so return the first copy.
+         */
+        set_output_count(setopstate, pergroup);
+
+        if (setopstate->numOutput > 0)
         {
-            /*
-             * Current tuple is member of same group as resultTuple.  Count it
-             * in the appropriate counter.
-             */
-            int         flag;
-            bool        isNull;
-
-            flag = DatumGetInt32(slot_getattr(inputTupleSlot,
-                                              plannode->flagColIdx,
-                                              &isNull));
-            Assert(!isNull);
-            if (flag)
-                node->numRight++;
-            else
-                node->numLeft++;
-            /* Set flag to fetch a new input tuple, and loop around */
-            node->ps.ps_OuterTupleSlot = NULL;
+            setopstate->numOutput--;
+            return resultTupleSlot;
         }
     }
 
+    /* No more groups */
+    ExecClearTuple(resultTupleSlot);
+    return NULL;
+}
+
+/*
+ * ExecSetOp for hashed case: phase 1, read input and build hash table
+ */
+static void
+setop_fill_hash_table(SetOpState *setopstate)
+{
+    PlanState  *outerPlan;
+    SetOpHashEntry entry;
+    TupleTableSlot *outerslot;
+
+    /*
+     * get state info from node
+     */
+    outerPlan = outerPlanState(setopstate);
+
     /*
-     * If we fall out of loop, then we need to emit at least one copy of
-     * resultTuple.
+     * Process each outer-plan tuple, and then fetch the next one, until we
+     * exhaust the outer plan.
      */
-    Assert(node->numOutput > 0);
-    node->numOutput--;
-    return resultTupleSlot;
+    for (;;)
+    {
+        outerslot = ExecProcNode(outerPlan);
+        if (TupIsNull(outerslot))
+            break;
+
+        /* Find or build hashtable entry for this tuple's group */
+        entry = lookup_hash_entry(setopstate, outerslot);
+
+        /* Advance the counts */
+        advance_counts(setopstate, &entry->pergroup, outerslot);
+    }
+
+    setopstate->table_filled = true;
+    /* Initialize to walk the hash table */
+    ResetTupleHashIterator(setopstate->hashtable, &setopstate->hashiter);
+}
+
+/*
+ * ExecSetOp for hashed case: phase 2, retrieving groups from hash table
+ */
+static TupleTableSlot *
+setop_retrieve_hash_table(SetOpState *setopstate)
+{
+    SetOpHashEntry entry;
+    TupleTableSlot *resultTupleSlot;
+
+    /*
+     * get state info from node
+     */
+    resultTupleSlot = setopstate->ps.ps_ResultTupleSlot;
+
+    /*
+     * We loop retrieving groups until we find one we should return
+     */
+    while (!setopstate->setop_done)
+    {
+        /*
+         * Find the next entry in the hash table
+         */
+        entry = (SetOpHashEntry) ScanTupleHashTable(&setopstate->hashiter);
+        if (entry == NULL)
+        {
+            /* No more entries in hashtable, so done */
+            setopstate->setop_done = true;
+            return NULL;
+        }
+
+        /*
+         * See if we should emit any copies of this tuple, and if so return
+         * the first copy.
+         */
+        set_output_count(setopstate, &entry->pergroup);
+
+        if (setopstate->numOutput > 0)
+        {
+            setopstate->numOutput--;
+            return ExecStoreMinimalTuple(entry->shared.firstTuple,
+                                         resultTupleSlot,
+                                         false);
+        }
+    }
+
+    /* No more groups */
+    ExecClearTuple(resultTupleSlot);
+    return NULL;
 }
 
 /* ----------------------------------------------------------------
@@ -226,9 +454,14 @@ ExecInitSetOp(SetOp *node, EState *estate, int eflags)
     setopstate->ps.plan = (Plan *) node;
     setopstate->ps.state = estate;
 
-    setopstate->ps.ps_OuterTupleSlot = NULL;
-    setopstate->subplan_done = false;
+    setopstate->eqfunctions = NULL;
+    setopstate->hashfunctions = NULL;
+    setopstate->setop_done = false;
     setopstate->numOutput = 0;
+    setopstate->pergroup = NULL;
+    setopstate->grp_firstTuple = NULL;
+    setopstate->hashtable = NULL;
+    setopstate->tableContext = NULL;
 
     /*
      * Miscellaneous initialization
@@ -244,6 +477,19 @@ ExecInitSetOp(SetOp *node, EState *estate, int eflags)
                                           ALLOCSET_DEFAULT_INITSIZE,
                                           ALLOCSET_DEFAULT_MAXSIZE);
 
+    /*
+     * If hashing, we also need a longer-lived context to store the hash
+     * table.  The table can't just be kept in the per-query context because
+     * we want to be able to throw it away in ExecReScanSetOp.
+     */
+    if (node->strategy == SETOP_HASHED)
+        setopstate->tableContext =
+            AllocSetContextCreate(CurrentMemoryContext,
+                                  "SetOp hash table",
+                                  ALLOCSET_DEFAULT_MINSIZE,
+                                  ALLOCSET_DEFAULT_INITSIZE,
+                                  ALLOCSET_DEFAULT_MAXSIZE);
+
 #define SETOP_NSLOTS 1
 
     /*
@@ -252,8 +498,13 @@ ExecInitSetOp(SetOp *node, EState *estate, int eflags)
     ExecInitResultTupleSlot(estate, &setopstate->ps);
 
     /*
-     * then initialize outer plan
+     * initialize child nodes
+     *
+     * If we are hashing then the child plan does not need
+     * to handle REWIND efficiently; see ExecReScanSetOp.
      */
+    if (node->strategy == SETOP_HASHED)
+        eflags &= ~EXEC_FLAG_REWIND;
     outerPlanState(setopstate) = ExecInitNode(outerPlan(node), estate, eflags);
 
     /*
@@ -264,11 +515,30 @@ ExecInitSetOp(SetOp *node, EState *estate, int eflags)
     setopstate->ps.ps_ProjInfo = NULL;
 
     /*
-     * Precompute fmgr lookup data for inner loop
+     * Precompute fmgr lookup data for inner loop.  We need both equality and
+     * hashing functions to do it by hashing, but only equality if not
+     * hashing.
      */
-    setopstate->eqfunctions =
-        execTuplesMatchPrepare(node->numCols,
-                               node->dupOperators);
+    if (node->strategy == SETOP_HASHED)
+        execTuplesHashPrepare(node->numCols,
+                              node->dupOperators,
+                              &setopstate->eqfunctions,
+                              &setopstate->hashfunctions);
+    else
+        setopstate->eqfunctions =
+            execTuplesMatchPrepare(node->numCols,
+                                   node->dupOperators);
+
+    if (node->strategy == SETOP_HASHED)
+    {
+        build_hash_table(setopstate);
+        setopstate->table_filled = false;
+    }
+    else
+    {
+        setopstate->pergroup =
+            (SetOpStatePerGroup) palloc0(sizeof(SetOpStatePerGroupData));
+    }
 
     return setopstate;
 }
@@ -293,9 +563,11 @@ ExecEndSetOp(SetOpState *node)
 {
     /* clean up tuple table */
     ExecClearTuple(node->ps.ps_ResultTupleSlot);
-    node->ps.ps_OuterTupleSlot = NULL;
 
+    /* free subsidiary stuff including hashtable */
     MemoryContextDelete(node->tempContext);
+    if (node->tableContext)
+        MemoryContextDelete(node->tableContext);
 
     ExecEndNode(outerPlanState(node));
 }
@@ -305,10 +577,50 @@ void
 ExecReScanSetOp(SetOpState *node, ExprContext *exprCtxt)
 {
     ExecClearTuple(node->ps.ps_ResultTupleSlot);
-    node->ps.ps_OuterTupleSlot = NULL;
-    node->subplan_done = false;
+    node->setop_done = false;
     node->numOutput = 0;
 
+    if (((SetOp *) node->ps.plan)->strategy == SETOP_HASHED)
+    {
+        /*
+         * In the hashed case, if we haven't yet built the hash table then we
+         * can just return; nothing done yet, so nothing to undo.  If
+         * subnode's chgParam is not NULL then it will be re-scanned by
+         * ExecProcNode, else no reason to re-scan it at all.
+         */
+        if (!node->table_filled)
+            return;
+
+        /*
+         * If we do have the hash table and the subplan does not have any
+         * parameter changes, then we can just rescan the existing hash
+         * table; no need to build it again.
+         */
+        if (((PlanState *) node)->lefttree->chgParam == NULL)
+        {
+            ResetTupleHashIterator(node->hashtable, &node->hashiter);
+            return;
+        }
+    }
+
+    /* Release first tuple of group, if we have made a copy */
+    if (node->grp_firstTuple != NULL)
+    {
+        heap_freetuple(node->grp_firstTuple);
+        node->grp_firstTuple = NULL;
+    }
+
+    /* Release any hashtable storage */
+    if (node->tableContext)
+        MemoryContextResetAndDeleteChildren(node->tableContext);
+
+    /* And rebuild empty hashtable if needed */
+    if (((SetOp *) node->ps.plan)->strategy == SETOP_HASHED)
+    {
+        build_hash_table(node);
+        node->table_filled = false;
+    }
+
     /*
      * if chgParam of subnode is not null then plan will be re-scanned by
     * first ExecProcNode.
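
A note on the emission rules: set_output_count() in the patch is pure arithmetic over the two per-group counters, so it is easy to check in isolation. The standalone sketch below mirrors those SQL92 rules outside the executor; the enum and function names here are hypothetical stand-ins for illustration, not PostgreSQL symbols.

#include <stdio.h>

/* Mirrors the four SetOpCmd cases handled by set_output_count() */
typedef enum
{
    CMD_INTERSECT,
    CMD_INTERSECT_ALL,
    CMD_EXCEPT,
    CMD_EXCEPT_ALL
} setop_cmd;

/* Given the per-group counts from each input, how many copies to emit */
static long
output_count(setop_cmd cmd, long numLeft, long numRight)
{
    switch (cmd)
    {
        case CMD_INTERSECT:
            return (numLeft > 0 && numRight > 0) ? 1 : 0;
        case CMD_INTERSECT_ALL:
            /* min of the two counts */
            return (numLeft < numRight) ? numLeft : numRight;
        case CMD_EXCEPT:
            return (numLeft > 0 && numRight == 0) ? 1 : 0;
        case CMD_EXCEPT_ALL:
            /* difference, clamped at zero */
            return (numLeft < numRight) ? 0 : (numLeft - numRight);
    }
    return 0;
}

int
main(void)
{
    /* a group seen 3 times in the left input and once in the right */
    printf("INTERSECT:     %ld\n", output_count(CMD_INTERSECT, 3, 1));     /* 1 */
    printf("INTERSECT ALL: %ld\n", output_count(CMD_INTERSECT_ALL, 3, 1)); /* 1 */
    printf("EXCEPT:        %ld\n", output_count(CMD_EXCEPT, 3, 1));        /* 0 */
    printf("EXCEPT ALL:    %ld\n", output_count(CMD_EXCEPT_ALL, 3, 1));    /* 2 */
    return 0;
}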
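
The SETOP_HASHED strategy itself is just "count per group, then emit": phase 1 absorbs the whole input while bumping one of two counters per group (the junk flag column says which side a tuple came from), and phase 2 walks the table and emits copies. Here is a toy model over integer keys, assuming a small fixed-size linear-probing table in place of the executor's TupleHashTable; it is purely illustrative, none of the names below are the real API.

#include <stdio.h>

#define TABLE_SIZE 64           /* toy size; the real table is sized from the
                                 * planner's numGroups estimate */

typedef struct
{
    int     key;                /* stands in for the grouping-column values */
    long    numLeft;            /* tuples seen from the left input */
    long    numRight;           /* tuples seen from the right input */
    int     used;
} GroupEntry;

static GroupEntry groups[TABLE_SIZE];

/* Phase 1 helper: find or create the entry for a key (linear probing) */
static GroupEntry *
lookup(int key)
{
    int     i = (unsigned) key % TABLE_SIZE;

    while (groups[i].used && groups[i].key != key)
        i = (i + 1) % TABLE_SIZE;
    if (!groups[i].used)
    {
        groups[i].used = 1;     /* new group; counters start at zero */
        groups[i].key = key;
    }
    return &groups[i];
}

int
main(void)
{
    int     left[] = {1, 1, 2, 3};  /* tuples with flag = 0 */
    int     right[] = {1, 3, 3};    /* tuples with flag = 1 */
    int     i;

    /* Phase 1: absorb all input in any order, advancing the counts */
    for (i = 0; i < 4; i++)
        lookup(left[i])->numLeft++;
    for (i = 0; i < 3; i++)
        lookup(right[i])->numRight++;

    /* Phase 2: scan the table; emit per INTERSECT ALL (min of the counts) */
    for (i = 0; i < TABLE_SIZE; i++)
    {
        long    n;

        if (!groups[i].used)
            continue;
        n = (groups[i].numLeft < groups[i].numRight) ?
            groups[i].numLeft : groups[i].numRight;
        while (n-- > 0)
            printf("%d\n", groups[i].key);  /* prints 1 and 3 once each */
    }
    return 0;
}

Because only the counters and one representative tuple per group are kept, memory grows with the number of distinct groups rather than with the input size, which is what makes the hashed strategy attractive when the planner's numGroups estimate fits in work_mem.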