4 files changed, 106 insertions, 7 deletions
diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README
index 76bbe01730a..9fd6c5731c9 100644
--- a/src/backend/access/nbtree/README
+++ b/src/backend/access/nbtree/README
@@ -1,4 +1,4 @@
-$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.9 2006/01/17 00:09:00 tgl Exp $
+$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.10 2006/04/25 22:46:05 tgl Exp $
 
 This directory contains a correct implementation of Lehman and Yao's
 high-concurrency B-tree management algorithm (P. Lehman and S. Yao,
@@ -316,7 +316,17 @@ Other things that are handy to know
 
 Page zero of every btree is a meta-data page.  This page stores the
 location of the root page --- both the true root and the current effective
-root ("fast" root).
+root ("fast" root).  To avoid fetching the metapage for every single index
+search, we cache a copy of the meta-data information in the index's
+relcache entry (rd_amcache).  This is a bit ticklish since using the cache
+implies following a root page pointer that could be stale.  We require
+every metapage update to send out an SI "relcache inval" message on the
+index relation.  That ensures that each backend will flush its cached copy
+not later than the start of its next transaction.  Therefore, stale
+pointers cannot be used for longer than the current transaction, which
+reduces the problem to the same one already dealt with for concurrent
+VACUUM --- we can just imagine that each open transaction is potentially
+"already in flight" to the old root.
 
 The algorithm assumes we can fit at least three items per page
 (a "high key" and two real data items).  Therefore it's unsafe
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index 6d8cf324ecf..ad6b632b2d2 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.135 2006/04/13 03:53:05 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.136 2006/04/25 22:46:05 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -18,6 +18,7 @@
 #include "access/heapam.h"
 #include "access/nbtree.h"
 #include "miscadmin.h"
+#include "utils/inval.h"
 
 
 typedef struct
@@ -638,9 +639,12 @@ _bt_insertonpg(Relation rel,
 
 		END_CRIT_SECTION();
 
-		/* release pin/lock */
+		/* release buffers; send out relcache inval if metapage changed */
 		if (BufferIsValid(metabuf))
+		{
+			CacheInvalidateRelcache(rel);
 			_bt_relbuf(rel, metabuf);
+		}
 
 		_bt_relbuf(rel, buf);
 	}
@@ -1526,6 +1530,9 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
 
 	END_CRIT_SECTION();
 
+	/* send out relcache inval for metapage change */
+	CacheInvalidateRelcache(rel);
+
 	/* done with metapage */
 	_bt_relbuf(rel, metabuf);
 
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index 3160a982cc1..3c76353e2fd 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -9,7 +9,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.95 2006/04/01 03:03:36 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.96 2006/04/25 22:46:05 tgl Exp $
  *
  *	NOTES
  *	   Postgres btree pages look like ordinary relation pages.	The opaque
@@ -26,6 +26,7 @@
 #include "miscadmin.h"
 #include "storage/freespace.h"
 #include "storage/lmgr.h"
+#include "utils/inval.h"
 
 
 /*
@@ -99,6 +100,49 @@ _bt_getroot(Relation rel, int access)
 	uint32		rootlevel;
 	BTMetaPageData *metad;
 
+	/*
+	 * Try to use previously-cached metapage data to find the root.  This
+	 * normally saves one buffer access per index search, which is a very
+	 * helpful savings in bufmgr traffic and hence contention.
+	 */
+	if (rel->rd_amcache != NULL)
+	{
+		metad = (BTMetaPageData *) rel->rd_amcache;
+		/* We shouldn't have cached it if any of these fail */
+		Assert(metad->btm_magic == BTREE_MAGIC);
+		Assert(metad->btm_version == BTREE_VERSION);
+		Assert(metad->btm_root != P_NONE);
+
+		rootblkno = metad->btm_fastroot;
+		Assert(rootblkno != P_NONE);
+		rootlevel = metad->btm_fastlevel;
+
+		rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
+		rootpage = BufferGetPage(rootbuf);
+		rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
+
+		/*
+		 * Since the cache might be stale, we check the page more carefully
+		 * here than normal.  We *must* check that it's not deleted.
+		 * If it's not alone on its level, then we reject too --- this
+		 * may be overly paranoid but better safe than sorry.  Note we
+		 * don't check P_ISROOT, because that's not set in a "fast root".
+		 */
+		if (!P_IGNORE(rootopaque) &&
+			rootopaque->btpo.level == rootlevel &&
+			P_LEFTMOST(rootopaque) &&
+			P_RIGHTMOST(rootopaque))
+		{
+			/* OK, accept cached page as the root */
+			return rootbuf;
+		}
+		_bt_relbuf(rel, rootbuf);
+		/* Cache is stale, throw it away */
+		if (rel->rd_amcache)
+			pfree(rel->rd_amcache);
+		rel->rd_amcache = NULL;
+	}
+
 	metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
 	metapg = BufferGetPage(metabuf);
 	metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
@@ -201,6 +245,12 @@ _bt_getroot(Relation rel, int access)
 		END_CRIT_SECTION();
 
 		/*
+		 * Send out relcache inval for metapage change (probably unnecessary
+		 * here, but let's be safe).
+		 */
+		CacheInvalidateRelcache(rel);
+
+		/*
 		 * swap root write lock for read lock.	There is no danger of anyone
 		 * else accessing the new root page while it's unlocked, since no one
 		 * else knows where it is yet.
@@ -218,6 +268,13 @@ _bt_getroot(Relation rel, int access)
 		rootlevel = metad->btm_fastlevel;
 
 		/*
+		 * Cache the metapage data for next time
+		 */
+		rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
+											 sizeof(BTMetaPageData));
+		memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
+
+		/*
 		 * We are done with the metapage; arrange to release it via first
 		 * _bt_relandgetbuf call
 		 */
@@ -280,6 +337,16 @@ _bt_gettrueroot(Relation rel)
 	uint32		rootlevel;
 	BTMetaPageData *metad;
 
+	/*
+	 * We don't try to use cached metapage data here, since (a) this path is
+	 * not performance-critical, and (b) if we are here it suggests our cache
+	 * is out-of-date anyway.  In light of point (b), it's probably safest to
+	 * actively flush any cached metapage info.
+	 */
+	if (rel->rd_amcache)
+		pfree(rel->rd_amcache);
+	rel->rd_amcache = NULL;
+
 	metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
 	metapg = BufferGetPage(metabuf);
 	metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
@@ -1052,9 +1119,12 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full)
 
 	END_CRIT_SECTION();
 
-	/* release buffers */
+	/* release buffers; send out relcache inval if metapage changed */
 	if (BufferIsValid(metabuf))
+	{
+		CacheInvalidateRelcache(rel);
 		_bt_relbuf(rel, metabuf);
+	}
 	_bt_relbuf(rel, pbuf);
 	_bt_relbuf(rel, rbuf);
 	_bt_relbuf(rel, buf);
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index cfd7629bb06..4f6915e32b4 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -12,7 +12,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.144 2006/04/01 03:03:37 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.145 2006/04/25 22:46:05 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -26,6 +26,7 @@
 #include "miscadmin.h"
 #include "storage/freespace.h"
 #include "storage/smgr.h"
+#include "utils/inval.h"
 #include "utils/memutils.h"
 
 
@@ -127,6 +128,17 @@ btbuild(PG_FUNCTION_ARGS)
 	}
 #endif   /* BTREE_BUILD_STATS */
 
+	/*
+	 * If we are reindexing a pre-existing index, it is critical to send out
+	 * a relcache invalidation SI message to ensure all backends re-read the
+	 * index metapage.  In most circumstances the update-stats operation will
+	 * cause that to happen, but at the moment there are corner cases where
+	 * no pg_class update will occur, so force an inval here.  XXX FIXME:
+	 * the upper levels of CREATE INDEX should handle the stats update as
+	 * well as guaranteeing relcache inval.
+	 */
+	CacheInvalidateRelcache(index);
+
 	/* since we just counted the # of tuples, may as well update stats */
 	IndexCloseAndUpdateStats(heap, reltuples, index, buildstate.indtuples);