Change hash indexes to store only the hash code rather than the whole indexed

value. This means that hash index lookups are always lossy and have to be rechecked when the heap is visited; however, the gain in index compactness outweighs this when the indexed values are wide. Also, we only need to perform datatype comparisons when the hash codes match exactly, rather than for every entry in the hash bucket; so it could also win for datatypes that have expensive comparison functions. A small additional win is gained by keeping hash index pages sorted by hash code and using binary search to reduce the number of index tuples we have to look at. Xiao Meng This commit also incorporates Zdenek Kotala's patch to isolate hash metapages and hash bitmaps a bit better from the page header datastructures.
author: Tom Lane <tgl@sss.pgh.pa.us> 2008-09-15 18:43:41 +0000
committer: Tom Lane <tgl@sss.pgh.pa.us> 2008-09-15 18:43:41 +0000
commit: 4adc2f72a4ccd6e55e594aca837f09130a6af62b (patch)
tree: 6da4349e66c02ce2d76fe9600ff7ac8aeee741cb /src/backend
parent: 440b3384b0741199b4f56a8aac773ecd16aba137 (diff)
download: postgresql-4adc2f72a4ccd6e55e594aca837f09130a6af62b.tar.gz
postgresql-4adc2f72a4ccd6e55e594aca837f09130a6af62b.zip
8 files changed, 256 insertions, 96 deletions
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index 41607c54dc3..af4c4c058fd 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/hash/hash.c,v 1.104 2008/06/19 00:46:03 alvherre Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/hash/hash.c,v 1.105 2008/09/15 18:43:41 tgl Exp $
  *
  * NOTES
  *	  This file contains only the public interface routines.
@@ -79,12 +79,12 @@ hashbuild(PG_FUNCTION_ARGS)
 	 * then we'll thrash horribly.  To prevent that scenario, we can sort the
 	 * tuples by (expected) bucket number.  However, such a sort is useless
 	 * overhead when the index does fit in RAM.  We choose to sort if the
-	 * initial index size exceeds effective_cache_size.
+	 * initial index size exceeds NBuffers.
 	 *
 	 * NOTE: this test will need adjustment if a bucket is ever different
 	 * from one page.
 	 */
-	if (num_buckets >= (uint32) effective_cache_size)
+	if (num_buckets >= (uint32) NBuffers)
 		buildstate.spool = _h_spoolinit(index, num_buckets);
 	else
 		buildstate.spool = NULL;
@@ -129,7 +129,7 @@ hashbuildCallback(Relation index,
 	IndexTuple	itup;
 
 	/* form an index tuple and point it at the heap tuple */
-	itup = index_form_tuple(RelationGetDescr(index), values, isnull);
+	itup = _hash_form_tuple(index, values, isnull);
 	itup->t_tid = htup->t_self;
 
 	/* Hash indexes don't index nulls, see notes in hashinsert */
@@ -153,8 +153,8 @@ hashbuildCallback(Relation index,
 /*
  *	hashinsert() -- insert an index tuple into a hash table.
  *
- *	Hash on the index tuple's key, find the appropriate location
- *	for the new tuple, and put it there.
+ *	Hash on the heap tuple's key, form an index tuple with hash code.
+ *	Find the appropriate location for the new tuple, and put it there.
  */
 Datum
 hashinsert(PG_FUNCTION_ARGS)
@@ -171,7 +171,7 @@ hashinsert(PG_FUNCTION_ARGS)
 	IndexTuple	itup;
 
 	/* generate an index tuple */
-	itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
+	itup = _hash_form_tuple(rel, values, isnull);
 	itup->t_tid = *ht_ctid;
 
 	/*
@@ -211,8 +211,8 @@ hashgettuple(PG_FUNCTION_ARGS)
 	OffsetNumber offnum;
 	bool		res;
 
-	/* Hash indexes are never lossy (at the moment anyway) */
-	scan->xs_recheck = false;
+	/* Hash indexes are always lossy since we store only the hash code */
+	scan->xs_recheck = true;
 
 	/*
 	 * We hold pin but not lock on current buffer while outside the hash AM.
@@ -317,7 +317,8 @@ hashgetbitmap(PG_FUNCTION_ARGS)
 		/* Save tuple ID, and continue scanning */
 		if (add_tuple) 
 		{
-			tbm_add_tuples(tbm, &scan->xs_ctup.t_self, 1, false);
+			/* Note we mark the tuple ID as requiring recheck */
+			tbm_add_tuples(tbm, &scan->xs_ctup.t_self, 1, true);
 			ntids++;
 		}
 
@@ -527,7 +528,7 @@ hashbulkdelete(PG_FUNCTION_ARGS)
 	 * each bucket.
 	 */
 	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
-	metap = (HashMetaPage) BufferGetPage(metabuf);
+	metap =  HashPageGetMeta(BufferGetPage(metabuf));
 	orig_maxbucket = metap->hashm_maxbucket;
 	orig_ntuples = metap->hashm_ntuples;
 	memcpy(&local_metapage, metap, sizeof(local_metapage));
@@ -629,7 +630,7 @@ loop_top:
 
 	/* Write-lock metapage and check for split since we started */
 	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE, LH_META_PAGE);
-	metap = (HashMetaPage) BufferGetPage(metabuf);
+	metap = HashPageGetMeta(BufferGetPage(metabuf));
 
 	if (cur_maxbucket != metap->hashm_maxbucket)
 	{
diff --git a/src/backend/access/hash/hashinsert.c b/src/backend/access/hash/hashinsert.c
index 7f68318f1a6..6195c8a2ac2 100644
--- a/src/backend/access/hash/hashinsert.c
+++ b/src/backend/access/hash/hashinsert.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/hash/hashinsert.c,v 1.50 2008/06/19 00:46:03 alvherre Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/hash/hashinsert.c,v 1.51 2008/09/15 18:43:41 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -43,18 +43,11 @@ _hash_doinsert(Relation rel, IndexTuple itup)
 	bool		do_expand;
 	uint32		hashkey;
 	Bucket		bucket;
-	Datum		datum;
-	bool		isnull;
 
 	/*
-	 * Compute the hash key for the item.  We do this first so as not to need
-	 * to hold any locks while running the hash function.
+	 * Get the hash key for the item (it's stored in the index tuple itself).
 	 */
-	if (rel->rd_rel->relnatts != 1)
-		elog(ERROR, "hash indexes support only one index key");
-	datum = index_getattr(itup, 1, RelationGetDescr(rel), &isnull);
-	Assert(!isnull);
-	hashkey = _hash_datum2hashkey(rel, datum);
+	hashkey = _hash_get_indextuple_hashkey(itup);
 
 	/* compute item size too */
 	itemsz = IndexTupleDSize(*itup);
@@ -69,12 +62,14 @@ _hash_doinsert(Relation rel, IndexTuple itup)
 
 	/* Read the metapage */
 	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
-	metap = (HashMetaPage) BufferGetPage(metabuf);
+	metap = HashPageGetMeta(BufferGetPage(metabuf));
 
 	/*
 	 * Check whether the item can fit on a hash page at all. (Eventually, we
 	 * ought to try to apply TOAST methods if not.)  Note that at this point,
 	 * itemsz doesn't include the ItemId.
+	 *
+	 * XXX this is useless code if we are only storing hash keys.
 	 */
 	if (itemsz > HashMaxItemSize((Page) metap))
 		ereport(ERROR,
@@ -197,11 +192,15 @@ _hash_pgaddtup(Relation rel,
 {
 	OffsetNumber itup_off;
 	Page		page;
+	uint32		hashkey;
 
 	_hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
 	page = BufferGetPage(buf);
 
-	itup_off = OffsetNumberNext(PageGetMaxOffsetNumber(page));
+	/* Find where to insert the tuple (preserving page's hashkey ordering) */
+	hashkey = _hash_get_indextuple_hashkey(itup);
+	itup_off = _hash_binsearch(page, hashkey);
+
 	if (PageAddItem(page, (Item) itup, itemsize, itup_off, false, false)
 		== InvalidOffsetNumber)
 		elog(ERROR, "failed to add index item to \"%s\"",
diff --git a/src/backend/access/hash/hashovfl.c b/src/backend/access/hash/hashovfl.c
index 06958ec8657..37315dbf378 100644
--- a/src/backend/access/hash/hashovfl.c
+++ b/src/backend/access/hash/hashovfl.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/hash/hashovfl.c,v 1.64 2008/06/19 00:46:03 alvherre Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/hash/hashovfl.c,v 1.65 2008/09/15 18:43:41 tgl Exp $
  *
  * NOTES
  *	  Overflow pages look like ordinary relation pages.
@@ -187,7 +187,7 @@ _hash_getovflpage(Relation rel, Buffer metabuf)
 	_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);
 
 	_hash_checkpage(rel, metabuf, LH_META_PAGE);
-	metap = (HashMetaPage) BufferGetPage(metabuf);
+	metap = HashPageGetMeta(BufferGetPage(metabuf));
 
 	/* start search at hashm_firstfree */
 	orig_firstfree = metap->hashm_firstfree;
@@ -450,7 +450,7 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf,
 
 	/* Read the metapage so we can determine which bitmap page to use */
 	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
-	metap = (HashMetaPage) BufferGetPage(metabuf);
+	metap = HashPageGetMeta(BufferGetPage(metabuf));
 
 	/* Identify which bit to set */
 	ovflbitno = blkno_to_bitno(metap, ovflblkno);
diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c
index 43ec69cab32..c5edf6dcfb9 100644
--- a/src/backend/access/hash/hashpage.c
+++ b/src/backend/access/hash/hashpage.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.76 2008/08/11 11:05:10 heikki Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.77 2008/09/15 18:43:41 tgl Exp $
  *
  * NOTES
  *	  Postgres hash pages look like ordinary relation pages.  The opaque
@@ -348,11 +348,9 @@ _hash_metapinit(Relation rel, double num_tuples)
 	 * Determine the target fill factor (in tuples per bucket) for this index.
 	 * The idea is to make the fill factor correspond to pages about as full
 	 * as the user-settable fillfactor parameter says.	We can compute it
-	 * exactly if the index datatype is fixed-width, but for var-width there's
-	 * some guessing involved.
+	 * exactly since the index datatype (i.e. uint32 hash key) is fixed-width.
 	 */
-	data_width = get_typavgwidth(RelationGetDescr(rel)->attrs[0]->atttypid,
-								 RelationGetDescr(rel)->attrs[0]->atttypmod);
+	data_width = sizeof(uint32);
 	item_width = MAXALIGN(sizeof(IndexTupleData)) + MAXALIGN(data_width) +
 		sizeof(ItemIdData);		/* include the line pointer */
 	ffactor = RelationGetTargetPageUsage(rel, HASH_DEFAULT_FILLFACTOR) / item_width;
@@ -395,20 +393,18 @@ _hash_metapinit(Relation rel, double num_tuples)
 	pageopaque->hasho_flag = LH_META_PAGE;
 	pageopaque->hasho_page_id = HASHO_PAGE_ID;
 
-	metap = (HashMetaPage) pg;
+	metap = HashPageGetMeta(pg);
 
 	metap->hashm_magic = HASH_MAGIC;
 	metap->hashm_version = HASH_VERSION;
 	metap->hashm_ntuples = 0;
 	metap->hashm_nmaps = 0;
 	metap->hashm_ffactor = ffactor;
-	metap->hashm_bsize = BufferGetPageSize(metabuf);
+	metap->hashm_bsize = HashGetMaxBitmapSize(pg);
 	/* find largest bitmap array size that will fit in page size */
 	for (i = _hash_log2(metap->hashm_bsize); i > 0; --i)
 	{
-		if ((1 << i) <= (metap->hashm_bsize -
-						 (MAXALIGN(sizeof(PageHeaderData)) +
-						  MAXALIGN(sizeof(HashPageOpaqueData)))))
+		if ((1 << i) <= metap->hashm_bsize)
 			break;
 	}
 	Assert(i > 0);
@@ -532,7 +528,7 @@ _hash_expandtable(Relation rel, Buffer metabuf)
 	_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);
 
 	_hash_checkpage(rel, metabuf, LH_META_PAGE);
-	metap = (HashMetaPage) BufferGetPage(metabuf);
+	metap = HashPageGetMeta(BufferGetPage(metabuf));
 
 	/*
 	 * Check to see if split is still needed; someone else might have already
@@ -774,8 +770,6 @@ _hash_splitbucket(Relation rel,
 	Buffer		nbuf;
 	BlockNumber oblkno;
 	BlockNumber nblkno;
-	bool		null;
-	Datum		datum;
 	HashPageOpaque oopaque;
 	HashPageOpaque nopaque;
 	IndexTuple	itup;
@@ -785,7 +779,6 @@ _hash_splitbucket(Relation rel,
 	OffsetNumber omaxoffnum;
 	Page		opage;
 	Page		npage;
-	TupleDesc	itupdesc = RelationGetDescr(rel);
 
 	/*
 	 * It should be okay to simultaneously write-lock pages from each bucket,
@@ -846,16 +839,11 @@ _hash_splitbucket(Relation rel,
 		}
 
 		/*
-		 * Re-hash the tuple to determine which bucket it now belongs in.
-		 *
-		 * It is annoying to call the hash function while holding locks, but
-		 * releasing and relocking the page for each tuple is unappealing too.
+		 * Fetch the item's hash key (conveniently stored in the item)
+		 * and determine which bucket it now belongs in.
 		 */
 		itup = (IndexTuple) PageGetItem(opage, PageGetItemId(opage, ooffnum));
-		datum = index_getattr(itup, 1, itupdesc, &null);
-		Assert(!null);
-
-		bucket = _hash_hashkey2bucket(_hash_datum2hashkey(rel, datum),
+		bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
 									  maxbucket, highmask, lowmask);
 
 		if (bucket == nbucket)
diff --git a/src/backend/access/hash/hashsearch.c b/src/backend/access/hash/hashsearch.c
index 1e05558523f..85368393423 100644
--- a/src/backend/access/hash/hashsearch.c
+++ b/src/backend/access/hash/hashsearch.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/hash/hashsearch.c,v 1.53 2008/06/19 00:46:03 alvherre Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/hash/hashsearch.c,v 1.54 2008/09/15 18:43:41 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -178,6 +178,8 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
 		hashkey = _hash_datum2hashkey_type(rel, cur->sk_argument,
 										   cur->sk_subtype);
 
+	so->hashso_sk_hash = hashkey;
+
 	/*
 	 * Acquire shared split lock so we can compute the target bucket safely
 	 * (see README).
@@ -186,7 +188,7 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
 
 	/* Read the metapage */
 	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
-	metap = (HashMetaPage) BufferGetPage(metabuf);
+	metap = HashPageGetMeta(BufferGetPage(metabuf));
 
 	/*
 	 * Compute the target bucket number, and convert to block number.
@@ -284,7 +286,7 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
 		offnum = InvalidOffsetNumber;
 
 	/*
-	 * 'offnum' now points to the last tuple we have seen (if any).
+	 * 'offnum' now points to the last tuple we examined (if any).
 	 *
 	 * continue to step through tuples until: 1) we get to the end of the
 	 * bucket chain or 2) we find a valid tuple.
@@ -297,25 +299,39 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
 				if (offnum != InvalidOffsetNumber)
 					offnum = OffsetNumberNext(offnum);	/* move forward */
 				else
-					offnum = FirstOffsetNumber; /* new page */
+				{
+					/* new page, locate starting position by binary search */
+					offnum = _hash_binsearch(page, so->hashso_sk_hash);
+				}
 
-				while (offnum > maxoff)
+				for (;;)
 				{
 					/*
-					 * either this page is empty (maxoff ==
-					 * InvalidOffsetNumber) or we ran off the end.
+					 * check if we're still in the range of items with
+					 * the target hash key
+					 */
+					if (offnum <= maxoff)
+					{
+						Assert(offnum >= FirstOffsetNumber);
+						itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+						if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup))
+							break;				/* yes, so exit for-loop */
+					}
+
+					/*
+					 * ran off the end of this page, try the next
 					 */
 					_hash_readnext(rel, &buf, &page, &opaque);
 					if (BufferIsValid(buf))
 					{
 						maxoff = PageGetMaxOffsetNumber(page);
-						offnum = FirstOffsetNumber;
+						offnum = _hash_binsearch(page, so->hashso_sk_hash);
 					}
 					else
 					{
 						/* end of bucket */
-						maxoff = offnum = InvalidOffsetNumber;
-						break;	/* exit while */
+						itup = NULL;
+						break;	/* exit for-loop */
 					}
 				}
 				break;
@@ -324,22 +340,39 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
 				if (offnum != InvalidOffsetNumber)
 					offnum = OffsetNumberPrev(offnum);	/* move back */
 				else
-					offnum = maxoff;	/* new page */
+				{
+					/* new page, locate starting position by binary search */
+					offnum = _hash_binsearch_last(page, so->hashso_sk_hash);
+				}
 
-				while (offnum < FirstOffsetNumber)
+				for (;;)
 				{
 					/*
-					 * either this page is empty (offnum ==
-					 * InvalidOffsetNumber) or we ran off the end.
+					 * check if we're still in the range of items with
+					 * the target hash key
+					 */
+					if (offnum >= FirstOffsetNumber)
+					{
+						Assert(offnum <= maxoff);
+						itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+						if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup))
+							break;				/* yes, so exit for-loop */
+					}
+
+					/*
+					 * ran off the end of this page, try the next
 					 */
 					_hash_readprev(rel, &buf, &page, &opaque);
 					if (BufferIsValid(buf))
-						maxoff = offnum = PageGetMaxOffsetNumber(page);
+					{
+						maxoff = PageGetMaxOffsetNumber(page);
+						offnum = _hash_binsearch_last(page, so->hashso_sk_hash);
+					}
 					else
 					{
 						/* end of bucket */
-						maxoff = offnum = InvalidOffsetNumber;
-						break;	/* exit while */
+						itup = NULL;
+						break;	/* exit for-loop */
 					}
 				}
 				break;
@@ -347,19 +380,19 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
 			default:
 				/* NoMovementScanDirection */
 				/* this should not be reached */
+				itup = NULL;
 				break;
 		}
 
-		/* we ran off the end of the world without finding a match */
-		if (offnum == InvalidOffsetNumber)
+		if (itup == NULL)
 		{
+			/* we ran off the end of the bucket without finding a match */
 			*bufP = so->hashso_curbuf = InvalidBuffer;
 			ItemPointerSetInvalid(current);
 			return false;
 		}
 
-		/* get ready to check this tuple */
-		itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+		/* check the tuple quals, loop around if not met */
 	} while (!_hash_checkqual(scan, itup));
 
 	/* if we made it to here, we've found a valid tuple */
diff --git a/src/backend/access/hash/hashutil.c b/src/backend/access/hash/hashutil.c
index 29cdf24529a..7a1e3a8ad0b 100644
--- a/src/backend/access/hash/hashutil.c
+++ b/src/backend/access/hash/hashutil.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/hash/hashutil.c,v 1.56 2008/07/13 20:45:47 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/hash/hashutil.c,v 1.57 2008/09/15 18:43:41 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -28,12 +28,21 @@
 bool
 _hash_checkqual(IndexScanDesc scan, IndexTuple itup)
 {
+	/*
+	 * Currently, we can't check any of the scan conditions since we do
+	 * not have the original index entry value to supply to the sk_func.
+	 * Always return true; we expect that hashgettuple already set the
+	 * recheck flag to make the main indexscan code do it.
+	 */
+#ifdef NOT_USED
 	TupleDesc	tupdesc = RelationGetDescr(scan->indexRelation);
 	ScanKey		key = scan->keyData;
 	int			scanKeySize = scan->numberOfKeys;
+#endif
 
 	IncrIndexProcessed();
 
+#ifdef NOT_USED
 	while (scanKeySize > 0)
 	{
 		Datum		datum;
@@ -59,6 +68,7 @@ _hash_checkqual(IndexScanDesc scan, IndexTuple itup)
 		key++;
 		scanKeySize--;
 	}
+#endif
 
 	return true;
 }
@@ -190,7 +200,7 @@ _hash_checkpage(Relation rel, Buffer buf, int flags)
 	 */
 	if (flags == LH_META_PAGE)
 	{
-		HashMetaPage metap = (HashMetaPage) page;
+		HashMetaPage metap = HashPageGetMeta(page);
 
 		if (metap->hashm_magic != HASH_MAGIC)
 			ereport(ERROR,
@@ -221,3 +231,123 @@ hashoptions(PG_FUNCTION_ARGS)
 		PG_RETURN_BYTEA_P(result);
 	PG_RETURN_NULL();
 }
+
+/*
+ * _hash_get_indextuple_hashkey - get the hash index tuple's hash key value
+ */
+uint32
+_hash_get_indextuple_hashkey(IndexTuple itup)
+{
+	char	   *attp;
+
+	/*
+	 * We assume the hash key is the first attribute and can't be null,
+	 * so this can be done crudely but very very cheaply ...
+	 */
+	attp = (char *) itup + IndexInfoFindDataOffset(itup->t_info);
+	return *((uint32 *) attp);
+}
+
+/*
+ * _hash_form_tuple - form an index tuple containing hash code only
+ */
+IndexTuple
+_hash_form_tuple(Relation index, Datum *values, bool *isnull)
+{
+	IndexTuple		itup;
+	uint32			hashkey;
+	Datum			hashkeydatum;
+	TupleDesc		hashdesc;
+
+	if (isnull[0])
+		hashkeydatum = (Datum) 0;
+	else
+	{
+		hashkey = _hash_datum2hashkey(index, values[0]);
+		hashkeydatum = UInt32GetDatum(hashkey);
+	}
+	hashdesc = RelationGetDescr(index);
+	Assert(hashdesc->natts == 1);
+	itup = index_form_tuple(hashdesc, &hashkeydatum, isnull);
+	return itup;
+}
+
+/*
+ * _hash_binsearch - Return the offset number in the page where the
+ *					 specified hash value should be sought or inserted.
+ *
+ * We use binary search, relying on the assumption that the existing entries
+ * are ordered by hash key.
+ *
+ * Returns the offset of the first index entry having hashkey >= hash_value,
+ * or the page's max offset plus one if hash_value is greater than all
+ * existing hash keys in the page.  This is the appropriate place to start
+ * a search, or to insert a new item.
+ */
+OffsetNumber
+_hash_binsearch(Page page, uint32 hash_value)
+{
+	OffsetNumber	upper;
+	OffsetNumber	lower;
+
+	/* Loop invariant: lower <= desired place <= upper */
+	upper = PageGetMaxOffsetNumber(page) + 1;
+	lower = FirstOffsetNumber;
+
+	while (upper > lower)
+	{
+		OffsetNumber	off;
+		IndexTuple		itup;
+		uint32			hashkey;
+
+		off = (upper + lower) / 2;
+		Assert(OffsetNumberIsValid(off));
+
+		itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
+		hashkey = _hash_get_indextuple_hashkey(itup);
+		if (hashkey < hash_value)
+			lower = off + 1;
+		else
+			upper = off;
+	}
+
+	return lower;
+}
+
+/*
+ * _hash_binsearch_last
+ *
+ * Same as above, except that if there are multiple matching items in the
+ * page, we return the offset of the last one instead of the first one,
+ * and the possible range of outputs is 0..maxoffset not 1..maxoffset+1.
+ * This is handy for starting a new page in a backwards scan.
+ */
+OffsetNumber
+_hash_binsearch_last(Page page, uint32 hash_value)
+{
+	OffsetNumber	upper;
+	OffsetNumber	lower;
+
+	/* Loop invariant: lower <= desired place <= upper */
+	upper = PageGetMaxOffsetNumber(page);
+	lower = FirstOffsetNumber - 1;
+
+	while (upper > lower)
+	{
+		IndexTuple		itup;
+		OffsetNumber	off;
+		uint32			hashkey;
+
+		off = (upper + lower + 1) / 2;
+		Assert(OffsetNumberIsValid(off));
+
+		itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
+		hashkey = _hash_get_indextuple_hashkey(itup);
+		if (hashkey > hash_value)
+			upper = off - 1;
+		else
+			lower = off;
+	}
+
+	return lower;
+}
diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c
index 1847f023e4a..301e7d1f2d5 100644
--- a/src/backend/catalog/index.c
+++ b/src/backend/catalog/index.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/catalog/index.c,v 1.303 2008/08/25 22:42:32 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/catalog/index.c,v 1.304 2008/09/15 18:43:41 tgl Exp $
  *
  *
  * INTERFACE ROUTINES
@@ -76,6 +76,7 @@ typedef struct
 /* non-export function prototypes */
 static TupleDesc ConstructTupleDescriptor(Relation heapRelation,
 						 IndexInfo *indexInfo,
+						 Oid accessMethodObjectId,
 						 Oid *classObjectId);
 static void InitializeAttributeOids(Relation indexRelation,
 						int numatts, Oid indexoid);
@@ -105,15 +106,28 @@ static Oid	IndexGetRelation(Oid indexId);
 static TupleDesc
 ConstructTupleDescriptor(Relation heapRelation,
 						 IndexInfo *indexInfo,
+						 Oid accessMethodObjectId,
 						 Oid *classObjectId)
 {
 	int			numatts = indexInfo->ii_NumIndexAttrs;
 	ListCell   *indexpr_item = list_head(indexInfo->ii_Expressions);
+	HeapTuple	amtuple;
+	Form_pg_am	amform;
 	TupleDesc	heapTupDesc;
 	TupleDesc	indexTupDesc;
 	int			natts;			/* #atts in heap rel --- for error checks */
 	int			i;
 
+	/* We need access to the index AM's pg_am tuple */
+	amtuple = SearchSysCache(AMOID,
+							 ObjectIdGetDatum(accessMethodObjectId),
+							 0, 0, 0);
+	if (!HeapTupleIsValid(amtuple))
+		elog(ERROR, "cache lookup failed for access method %u",
+			 accessMethodObjectId);
+	amform = (Form_pg_am) GETSTRUCT(amtuple);
+
+	/* ... and to the table's tuple descriptor */
 	heapTupDesc = RelationGetDescr(heapRelation);
 	natts = RelationGetForm(heapRelation)->relnatts;
 
@@ -133,6 +147,7 @@ ConstructTupleDescriptor(Relation heapRelation,
 		Form_pg_attribute to = indexTupDesc->attrs[i];
 		HeapTuple	tuple;
 		Form_pg_type typeTup;
+		Form_pg_opclass opclassTup;
 		Oid			keyType;
 
 		if (atnum != 0)
@@ -231,8 +246,8 @@ ConstructTupleDescriptor(Relation heapRelation,
 		to->attrelid = InvalidOid;
 
 		/*
-		 * Check the opclass to see if it provides a keytype (overriding the
-		 * attribute type).
+		 * Check the opclass and index AM to see if either provides a keytype
+		 * (overriding the attribute type).  Opclass takes precedence.
 		 */
 		tuple = SearchSysCache(CLAOID,
 							   ObjectIdGetDatum(classObjectId[i]),
@@ -240,7 +255,11 @@ ConstructTupleDescriptor(Relation heapRelation,
 		if (!HeapTupleIsValid(tuple))
 			elog(ERROR, "cache lookup failed for opclass %u",
 				 classObjectId[i]);
-		keyType = ((Form_pg_opclass) GETSTRUCT(tuple))->opckeytype;
+		opclassTup = (Form_pg_opclass) GETSTRUCT(tuple);
+		if (OidIsValid(opclassTup->opckeytype))
+			keyType = opclassTup->opckeytype;
+		else
+			keyType = amform->amkeytype;
 		ReleaseSysCache(tuple);
 
 		if (OidIsValid(keyType) && keyType != to->atttypid)
@@ -264,6 +283,8 @@ ConstructTupleDescriptor(Relation heapRelation,
 		}
 	}
 
+	ReleaseSysCache(amtuple);
+
 	return indexTupDesc;
 }
 
@@ -577,6 +598,7 @@ index_create(Oid heapRelationId,
 	 */
 	indexTupDesc = ConstructTupleDescriptor(heapRelation,
 											indexInfo,
+											accessMethodObjectId,
 											classObjectId);
 
 	/*
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index 775840da185..29a076e1384 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -91,7 +91,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/sort/tuplesort.c,v 1.86 2008/08/01 13:16:09 alvherre Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/sort/tuplesort.c,v 1.87 2008/09/15 18:43:41 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -101,7 +101,6 @@
 #include <limits.h>
 
 #include "access/genam.h"
-#include "access/hash.h"
 #include "access/nbtree.h"
 #include "catalog/pg_amop.h"
 #include "catalog/pg_operator.h"
@@ -353,7 +352,6 @@ struct Tuplesortstate
 	bool		enforceUnique;	/* complain if we find duplicate tuples */
 
 	/* These are specific to the index_hash subcase: */
-	FmgrInfo   *hash_proc;		/* call info for the hash function */
 	uint32		hash_mask;		/* mask for sortable part of hash code */
 
 	/*
@@ -689,13 +687,6 @@ tuplesort_begin_index_hash(Relation indexRel,
 
 	state->indexRel = indexRel;
 
-	/*
-	 * We look up the index column's hash function just once, to avoid
-	 * chewing lots of cycles in repeated index_getprocinfo calls.  This
-	 * assumes that our caller holds the index relation open throughout the
-	 * sort, else the pointer obtained here might cease to be valid.
-	 */
-	state->hash_proc = index_getprocinfo(indexRel, 1, HASHPROC);
 	state->hash_mask = hash_mask;
 
 	MemoryContextSwitchTo(oldcontext);
@@ -2821,11 +2812,6 @@ static int
 comparetup_index_hash(const SortTuple *a, const SortTuple *b,
 					  Tuplesortstate *state)
 {
-	/*
-	 * It's slightly annoying to redo the hash function each time, although
-	 * most hash functions ought to be cheap.  Is it worth having a variant
-	 * tuple storage format so we can store the hash code?
-	 */
 	uint32		hash1;
 	uint32		hash2;
 	IndexTuple	tuple1;
@@ -2834,13 +2820,14 @@ comparetup_index_hash(const SortTuple *a, const SortTuple *b,
 	/* Allow interrupting long sorts */
 	CHECK_FOR_INTERRUPTS();
 
-	/* Compute hash codes and mask off bits we don't want to sort by */
+	/*
+	 * Fetch hash keys and mask off bits we don't want to sort by.
+	 * We know that the first column of the index tuple is the hash key.
+	 */
 	Assert(!a->isnull1);
-	hash1 = DatumGetUInt32(FunctionCall1(state->hash_proc, a->datum1))
-		& state->hash_mask;
+	hash1 = DatumGetUInt32(a->datum1) & state->hash_mask;
 	Assert(!b->isnull1);
-	hash2 = DatumGetUInt32(FunctionCall1(state->hash_proc, b->datum1))
-		& state->hash_mask;
+	hash2 = DatumGetUInt32(b->datum1) & state->hash_mask;
 
 	if (hash1 > hash2)
 		return 1;
author	Tom Lane <tgl@sss.pgh.pa.us>	2008-09-15 18:43:41 +0000
committer	Tom Lane <tgl@sss.pgh.pa.us>	2008-09-15 18:43:41 +0000
commit	4adc2f72a4ccd6e55e594aca837f09130a6af62b (patch)
tree	6da4349e66c02ce2d76fe9600ff7ac8aeee741cb /src/backend
parent	440b3384b0741199b4f56a8aac773ecd16aba137 (diff)
download	postgresql-4adc2f72a4ccd6e55e594aca837f09130a6af62b.tar.gz postgresql-4adc2f72a4ccd6e55e594aca837f09130a6af62b.zip