Fix performance problems in multi-batch hash joins by ensuring that we select

a well-randomized batch number even when given a poorly-randomized hash value. This is a bit inefficient but seems the only practical solution given the constraint that we can't change the hash functions in released branches. Per report from Joseph Shraibman. Applied to 8.1 and 8.2 only --- HEAD is getting a cleaner fix, and 8.0 and before use different coding that seems less vulnerable.
author: Tom Lane <tgl@sss.pgh.pa.us> 2007-06-01 15:58:09 +0000
committer: Tom Lane <tgl@sss.pgh.pa.us> 2007-06-01 15:58:09 +0000
commit: 38bbcb3888d8d93dfc369fbbac49888860596010 (patch)
tree: 18deb70ac768ef7925b8dc008b96877ab8ee80f3 /src/backend
parent: 4c2158bf0c309099c1d185405b3d10d41708eb2e (diff)
download: postgresql-38bbcb3888d8d93dfc369fbbac49888860596010.tar.gz
postgresql-38bbcb3888d8d93dfc369fbbac49888860596010.zip
2 files changed, 34 insertions, 6 deletions
diff --git a/src/backend/access/hash/hashfunc.c b/src/backend/access/hash/hashfunc.c
index a4803708ad4..f03df3d5999 100644
--- a/src/backend/access/hash/hashfunc.c
+++ b/src/backend/access/hash/hashfunc.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/hash/hashfunc.c,v 1.45.2.1 2005/12/22 22:50:06 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/hash/hashfunc.c,v 1.45.2.2 2007/06/01 15:58:09 tgl Exp $
  *
  * NOTES
  *	  These functions are stored in pg_amproc.	For each operator class
@@ -267,6 +267,31 @@ hash_any(register const unsigned char *k, register int keylen)
 			/* case 0: nothing left to add */
 	}
 	mix(a, b, c);
+
+	/* report the result */
+	return UInt32GetDatum(c);
+}
+
+/*
+ * hash_uint32() -- hash a 32-bit value
+ *
+ * This has the same result (at least on little-endian machines) as
+ *		hash_any(&k, sizeof(uint32))
+ * but is faster and doesn't force the caller to store k into memory.
+ */
+Datum
+hash_uint32(uint32 k)
+{
+	register uint32 a,
+				b,
+				c;
+
+	a = 0x9e3779b9 + k;
+	b = 0x9e3779b9;
+	c = 3923095 + (uint32) sizeof(uint32);
+
+	mix(a, b, c);
+
 	/* report the result */
 	return UInt32GetDatum(c);
 }
diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
index a40423179d1..826cafe8737 100644
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/executor/nodeHash.c,v 1.96.2.2 2005/11/23 20:28:04 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/executor/nodeHash.c,v 1.96.2.3 2007/06/01 15:58:09 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -20,6 +20,7 @@
  */
 #include "postgres.h"
 
+#include "access/hash.h"
 #include "executor/execdebug.h"
 #include "executor/hashjoin.h"
 #include "executor/instrument.h"
@@ -713,9 +714,11 @@ ExecHashGetHashValue(HashJoinTable hashtable,
  * chains), and must only cause the batch number to remain the same or
  * increase.  Our algorithm is
  *		bucketno = hashvalue MOD nbuckets
- *		batchno = (hashvalue DIV nbuckets) MOD nbatch
- * where nbuckets should preferably be prime so that all bits of the
- * hash value can affect both bucketno and batchno.
+ *		batchno = hash_uint32(hashvalue) MOD nbatch
+ * which gives reasonably independent bucket and batch numbers in the face
+ * of some rather poorly-implemented hash functions in hashfunc.c.  (This
+ * will change in PG 8.3.)
+ *
  * nbuckets doesn't change over the course of the join.
  *
  * nbatch is always a power of 2; we increase it only by doubling it.  This
@@ -734,7 +737,7 @@ ExecHashGetBucketAndBatch(HashJoinTable hashtable,
 	{
 		*bucketno = hashvalue % nbuckets;
 		/* since nbatch is a power of 2, can do MOD by masking */
-		*batchno = (hashvalue / nbuckets) & (nbatch - 1);
+		*batchno = hash_uint32(hashvalue) & (nbatch - 1);
 	}
 	else
 	{
author	Tom Lane <tgl@sss.pgh.pa.us>	2007-06-01 15:58:09 +0000
committer	Tom Lane <tgl@sss.pgh.pa.us>	2007-06-01 15:58:09 +0000
commit	38bbcb3888d8d93dfc369fbbac49888860596010 (patch)
tree	18deb70ac768ef7925b8dc008b96877ab8ee80f3 /src/backend
parent	4c2158bf0c309099c1d185405b3d10d41708eb2e (diff)
download	postgresql-38bbcb3888d8d93dfc369fbbac49888860596010.tar.gz postgresql-38bbcb3888d8d93dfc369fbbac49888860596010.zip