Diffstat (limited to 'src/backend/access')
-rw-r--r-- | src/backend/access/common/indexvalid.c |  37
-rw-r--r-- | src/backend/access/gist/gist.c         | 408
-rw-r--r-- | src/backend/access/gist/gistscan.c     |  30
-rw-r--r-- | src/backend/access/hash/hash.c         | 317
-rw-r--r-- | src/backend/access/hash/hashovfl.c     |  46
-rw-r--r-- | src/backend/access/hash/hashpage.c     |  49
-rw-r--r-- | src/backend/access/hash/hashscan.c     |  27
-rw-r--r-- | src/backend/access/index/indexam.c     |  60
-rw-r--r-- | src/backend/access/nbtree/Makefile     |   4
-rw-r--r-- | src/backend/access/nbtree/README       |  16
-rw-r--r-- | src/backend/access/nbtree/nbtinsert.c  |  63
-rw-r--r-- | src/backend/access/nbtree/nbtpage.c    |  41
-rw-r--r-- | src/backend/access/nbtree/nbtree.c     | 504
-rw-r--r-- | src/backend/access/nbtree/nbtscan.c    | 224
-rw-r--r-- | src/backend/access/nbtree/nbtsearch.c  |  26
-rw-r--r-- | src/backend/access/rtree/rtree.c       | 343
-rw-r--r-- | src/backend/access/rtree/rtscan.c      |  30
-rw-r--r-- | src/backend/access/transam/xact.c      |  11
18 files changed, 1036 insertions(+), 1200 deletions(-)
diff --git a/src/backend/access/common/indexvalid.c b/src/backend/access/common/indexvalid.c index 6a7c08b4506..94e7efd522e 100644 --- a/src/backend/access/common/indexvalid.c +++ b/src/backend/access/common/indexvalid.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/common/Attic/indexvalid.c,v 1.26 2001/01/24 19:42:47 momjian Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/common/Attic/indexvalid.c,v 1.27 2001/07/15 22:48:15 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -24,12 +24,9 @@ */ int NIndexTupleProcessed; + /* ---------------- - * index_keytest - * - * old comments - * May eventually combine with other tests (like timeranges)? - * Should have Buffer buffer; as an argument and pass it to amgetattr. + * index_keytest - does this index tuple satisfy the scan key(s)? * ---------------- */ bool @@ -38,16 +35,16 @@ index_keytest(IndexTuple tuple, int scanKeySize, ScanKey key) { - bool isNull; - Datum datum; - Datum test; - IncrIndexProcessed(); while (scanKeySize > 0) { + Datum datum; + bool isNull; + Datum test; + datum = index_getattr(tuple, - key[0].sk_attno, + key->sk_attno, tupdesc, &isNull); @@ -57,25 +54,19 @@ index_keytest(IndexTuple tuple, return false; } - if (key[0].sk_flags & SK_ISNULL) + if (key->sk_flags & SK_ISNULL) return false; - if (key[0].sk_flags & SK_COMMUTE) - { - test = FunctionCall2(&key[0].sk_func, - key[0].sk_argument, datum); - } + if (key->sk_flags & SK_COMMUTE) + test = FunctionCall2(&key->sk_func, key->sk_argument, datum); else - { - test = FunctionCall2(&key[0].sk_func, - datum, key[0].sk_argument); - } + test = FunctionCall2(&key->sk_func, datum, key->sk_argument); - if (DatumGetBool(test) == !!(key[0].sk_flags & SK_NEGATE)) + if (DatumGetBool(test) == !!(key->sk_flags & SK_NEGATE)) return false; - scanKeySize -= 1; key++; + scanKeySize--; } return true; diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c index 9d6e2040f6c..c99c4a7e6e3 100644 --- a/src/backend/access/gist/gist.c +++ b/src/backend/access/gist/gist.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/gist/gist.c,v 1.79 2001/06/11 05:00:56 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/gist/gist.c,v 1.80 2001/07/15 22:48:15 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -43,7 +43,23 @@ #define RIGHT_ADDED 0x02 #define BOTH_ADDED ( LEFT_ADDED | RIGHT_ADDED ) + +/* Working state for gistbuild and its callback */ +typedef struct +{ + GISTSTATE giststate; + int numindexattrs; + double indtuples; +} GISTBuildState; + + /* non-export function prototypes */ +static void gistbuildCallback(Relation index, + HeapTuple htup, + Datum *attdata, + char *nulls, + bool tupleIsAlive, + void *state); static void gistdoinsert(Relation r, IndexTuple itup, InsertIndexResult *res, @@ -89,6 +105,7 @@ static void GISTInitBuffer(Buffer b, uint32 f); static OffsetNumber gistchoose(Relation r, Page p, IndexTuple it, GISTSTATE *giststate); +static void gistdelete(Relation r, ItemPointer tid); #ifdef GIST_PAGEADDITEM static IndexTuple gist_tuple_replacekey(Relation r, GISTENTRY entry, IndexTuple t); @@ -116,184 +133,36 @@ gistbuild(PG_FUNCTION_ARGS) Relation heap = (Relation) PG_GETARG_POINTER(0); Relation index = (Relation) PG_GETARG_POINTER(1); IndexInfo *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2); - Node *oldPred = (Node *) PG_GETARG_POINTER(3); 
- -#ifdef NOT_USED - IndexStrategy istrat = (IndexStrategy) PG_GETARG_POINTER(4); - -#endif - HeapScanDesc hscan; - HeapTuple htup; - IndexTuple itup; - TupleDesc htupdesc, - itupdesc; - Datum attdata[INDEX_MAX_KEYS]; - char nulls[INDEX_MAX_KEYS]; - double nhtups, - nitups; - Node *pred = indexInfo->ii_Predicate; - -#ifndef OMIT_PARTIAL_INDEX - TupleTable tupleTable; - TupleTableSlot *slot; - -#endif - ExprContext *econtext; - GISTSTATE giststate; - GISTENTRY tmpcentry; - Buffer buffer = InvalidBuffer; - bool *compvec; - int i; + double reltuples; + GISTBuildState buildstate; + Buffer buffer; /* no locking is needed */ - initGISTstate(&giststate, index); + initGISTstate(&buildstate.giststate, index); /* * We expect to be called exactly once for any index relation. If * that's not the case, big trouble's what we have. */ - if (oldPred == NULL && RelationGetNumberOfBlocks(index) != 0) - elog(ERROR, "%s already contains data", RelationGetRelationName(index)); - - /* initialize the root page (if this is a new index) */ - if (oldPred == NULL) - { - buffer = ReadBuffer(index, P_NEW); - GISTInitBuffer(buffer, F_LEAF); - WriteBuffer(buffer); - } + if (RelationGetNumberOfBlocks(index) != 0) + elog(ERROR, "%s already contains data", + RelationGetRelationName(index)); - /* get tuple descriptors for heap and index relations */ - htupdesc = RelationGetDescr(heap); - itupdesc = RelationGetDescr(index); - - /* - * If this is a predicate (partial) index, we will need to evaluate - * the predicate using ExecQual, which requires the current tuple to - * be in a slot of a TupleTable. In addition, ExecQual must have an - * ExprContext referring to that slot. Here, we initialize dummy - * TupleTable and ExprContext objects for this purpose. --Nels, Feb 92 - * - * We construct the ExprContext anyway since we need a per-tuple - * temporary memory context for function evaluation -- tgl July 00 - */ -#ifndef OMIT_PARTIAL_INDEX - if (pred != NULL || oldPred != NULL) - { - tupleTable = ExecCreateTupleTable(1); - slot = ExecAllocTableSlot(tupleTable); - ExecSetSlotDescriptor(slot, htupdesc, false); - } - else - { - tupleTable = NULL; - slot = NULL; - } - econtext = MakeExprContext(slot, TransactionCommandContext); -#else - econtext = MakeExprContext(NULL, TransactionCommandContext); -#endif /* OMIT_PARTIAL_INDEX */ + /* initialize the root page */ + buffer = ReadBuffer(index, P_NEW); + GISTInitBuffer(buffer, F_LEAF); + WriteBuffer(buffer); /* build the index */ - nhtups = nitups = 0.0; - - compvec = (bool *) palloc(sizeof(bool) * indexInfo->ii_NumIndexAttrs); - - /* start a heap scan */ - hscan = heap_beginscan(heap, 0, SnapshotNow, 0, (ScanKey) NULL); - - while (HeapTupleIsValid(htup = heap_getnext(hscan, 0))) - { - MemoryContextReset(econtext->ecxt_per_tuple_memory); - - nhtups += 1.0; - -#ifndef OMIT_PARTIAL_INDEX - - /* - * If oldPred != NULL, this is an EXTEND INDEX command, so skip - * this tuple if it was already in the existing partial index - */ - if (oldPred != NULL) - { - slot->val = htup; - if (ExecQual((List *) oldPred, econtext, false)) - { - nitups += 1.0; - continue; - } - } - - /* - * Skip this tuple if it doesn't satisfy the partial-index - * predicate - */ - if (pred != NULL) - { - slot->val = htup; - if (!ExecQual((List *) pred, econtext, false)) - continue; - } -#endif /* OMIT_PARTIAL_INDEX */ - - nitups += 1.0; - - /* - * For the current heap tuple, extract all the attributes we use - * in this index, and note which are null. 
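[Editor's note: the indexvalid.c hunk above tightens index_keytest into a pointer-walking loop over the scan keys, with the SK_COMMUTE and SK_NEGATE flags controlling argument order and result inversion. A minimal standalone model of that control flow; ScanKeyData, the SK_* flags, and the function-pointer comparison here are simplified stand-ins, not the real PostgreSQL definitions.]

#include <stdbool.h>
#include <stdio.h>

#define SK_ISNULL  0x01
#define SK_COMMUTE 0x02
#define SK_NEGATE  0x04

typedef struct
{
    int     sk_flags;
    bool  (*sk_func) (int a, int b);   /* stand-in for FunctionCall2 */
    int     sk_argument;
} ScanKeyData, *ScanKey;

static bool int_lt(int a, int b) { return a < b; }

/* Toy analogue of index_keytest: advance a key pointer instead of
 * indexing key[0], as the patch does. */
static bool
keytest(int datum, bool isNull, int scanKeySize, ScanKey key)
{
    while (scanKeySize > 0)
    {
        bool    test;

        if (isNull || (key->sk_flags & SK_ISNULL))
            return false;           /* a null never satisfies a key */

        if (key->sk_flags & SK_COMMUTE)
            test = key->sk_func(key->sk_argument, datum);
        else
            test = key->sk_func(datum, key->sk_argument);

        /* fail if the result disagrees with the NEGATE flag */
        if (test == !!(key->sk_flags & SK_NEGATE))
            return false;

        key++;
        scanKeySize--;
    }
    return true;
}

int
main(void)
{
    ScanKeyData keys[1] = {{0, int_lt, 10}};     /* "attribute < 10" */

    printf("%d\n", keytest(5, false, 1, keys));  /* 1: 5 < 10 */
    printf("%d\n", keytest(42, false, 1, keys)); /* 0 */
    return 0;
}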
- */ - FormIndexDatum(indexInfo, - htup, - htupdesc, - econtext->ecxt_per_tuple_memory, - attdata, - nulls); - - /* immediately compress keys to normalize */ - for (i = 0; i < indexInfo->ii_NumIndexAttrs; i++) - { - gistcentryinit(&giststate, i, &tmpcentry, attdata[i], - (Relation) NULL, (Page) NULL, (OffsetNumber) 0, - -1 /* size is currently bogus */ , TRUE); - if (attdata[i] != tmpcentry.key && - !(giststate.keytypbyval)) - compvec[i] = TRUE; - else - compvec[i] = FALSE; - attdata[i] = tmpcentry.key; - } - - /* form an index tuple and point it at the heap tuple */ - itup = index_formtuple(itupdesc, attdata, nulls); - itup->t_tid = htup->t_self; - - /* - * Since we already have the index relation locked, we call - * gistdoinsert directly. Normal access method calls dispatch - * through gistinsert, which locks the relation for write. This - * is the right thing to do if you're inserting single tups, but - * not when you're initializing the whole index at once. - */ - gistdoinsert(index, itup, NULL, &giststate); + buildstate.numindexattrs = indexInfo->ii_NumIndexAttrs; + buildstate.indtuples = 0; - for (i = 0; i < indexInfo->ii_NumIndexAttrs; i++) - if (compvec[i]) - pfree(DatumGetPointer(attdata[i])); - - pfree(itup); - } + /* do the heap scan */ + reltuples = IndexBuildHeapScan(heap, index, indexInfo, + gistbuildCallback, (void *) &buildstate); /* okay, all heap tuples are indexed */ - heap_endscan(hscan); - - pfree(compvec); - -#ifndef OMIT_PARTIAL_INDEX - if (pred != NULL || oldPred != NULL) - ExecDropTupleTable(tupleTable, true); -#endif /* OMIT_PARTIAL_INDEX */ - FreeExprContext(econtext); /* * Since we just counted the tuples in the heap, we update its stats @@ -313,14 +182,8 @@ gistbuild(PG_FUNCTION_ARGS) heap_close(heap, NoLock); index_close(index); - UpdateStats(hrelid, nhtups); - UpdateStats(irelid, nitups); - if (oldPred != NULL) - { - if (nitups == nhtups) - pred = NULL; - UpdateIndexPredicate(irelid, oldPred, pred); - } + UpdateStats(hrelid, reltuples); + UpdateStats(irelid, buildstate.indtuples); } #ifdef GISTDEBUG @@ -331,6 +194,63 @@ gistbuild(PG_FUNCTION_ARGS) } /* + * Per-tuple callback from IndexBuildHeapScan + */ +static void +gistbuildCallback(Relation index, + HeapTuple htup, + Datum *attdata, + char *nulls, + bool tupleIsAlive, + void *state) +{ + GISTBuildState *buildstate = (GISTBuildState *) state; + IndexTuple itup; + bool compvec[INDEX_MAX_KEYS]; + GISTENTRY tmpcentry; + int i; + + /* immediately compress keys to normalize */ + for (i = 0; i < buildstate->numindexattrs; i++) + { + gistcentryinit(&buildstate->giststate, i, &tmpcentry, attdata[i], + (Relation) NULL, (Page) NULL, (OffsetNumber) 0, + -1 /* size is currently bogus */ , TRUE); + if (attdata[i] != tmpcentry.key && + !(buildstate->giststate.keytypbyval)) + compvec[i] = TRUE; + else + compvec[i] = FALSE; + attdata[i] = tmpcentry.key; + } + + /* form an index tuple and point it at the heap tuple */ + itup = index_formtuple(RelationGetDescr(index), attdata, nulls); + itup->t_tid = htup->t_self; + + /* GIST indexes don't index nulls, see notes in gistinsert */ + if (! IndexTupleHasNulls(itup)) + { + /* + * Since we already have the index relation locked, we call + * gistdoinsert directly. Normal access method calls dispatch + * through gistinsert, which locks the relation for write. This + * is the right thing to do if you're inserting single tups, but + * not when you're initializing the whole index at once. 
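[Editor's note: the structural change in gistbuild is the one this commit applies to every access method: the AM no longer runs its own heap scan and partial-index machinery, it hands IndexBuildHeapScan a per-tuple callback plus a private state struct. A compilable toy of that inversion of control; the scan driver, tuple type, and callback signature are simplified stand-ins for the real catalog machinery.]

#include <stdio.h>

typedef struct { int id; int live; } Tuple;    /* stand-in heap tuple */

typedef void (*BuildCallback) (const Tuple *tup, void *state);

/* Stand-in for IndexBuildHeapScan: visit every heap tuple, invoke
 * the AM's callback, and return the heap tuple count for stats. */
static double
build_heap_scan(const Tuple *heap, int n, BuildCallback cb, void *state)
{
    for (int i = 0; i < n; i++)
        cb(&heap[i], state);
    return (double) n;
}

/* Per-AM working state, threaded through the void pointer the same
 * way GISTBuildState/HashBuildState/BTBuildState are. */
typedef struct { double indtuples; } BuildState;

static void
build_callback(const Tuple *tup, void *state)
{
    BuildState *bs = (BuildState *) state;

    if (!tup->live)
        return;                 /* e.g. skip entries the AM rejects */
    /* ... form an index tuple and insert it here ... */
    bs->indtuples += 1;
}

int
main(void)
{
    Tuple      heap[] = {{1, 1}, {2, 0}, {3, 1}};
    BuildState bs = {0};
    double     reltuples = build_heap_scan(heap, 3, build_callback, &bs);

    /* both counts feed UpdateStats() in the real code */
    printf("heap=%.0f indexed=%.0f\n", reltuples, bs.indtuples);
    return 0;
}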
+ */ + gistdoinsert(index, itup, NULL, &buildstate->giststate); + + buildstate->indtuples += 1; + } + + for (i = 0; i < buildstate->numindexattrs; i++) + if (compvec[i]) + pfree(DatumGetPointer(attdata[i])); + + pfree(itup); +} + +/* * gistinsert -- wrapper for GiST tuple insertion. * * This is the public interface routine for tuple insertion in GiSTs. @@ -343,25 +263,28 @@ gistinsert(PG_FUNCTION_ARGS) Datum *datum = (Datum *) PG_GETARG_POINTER(1); char *nulls = (char *) PG_GETARG_POINTER(2); ItemPointer ht_ctid = (ItemPointer) PG_GETARG_POINTER(3); - #ifdef NOT_USED Relation heapRel = (Relation) PG_GETARG_POINTER(4); - #endif InsertIndexResult res; IndexTuple itup; GISTSTATE giststate; GISTENTRY tmpentry; int i; - bool *compvec; + bool compvec[INDEX_MAX_KEYS]; + + /* + * Since GIST is not marked "amconcurrent" in pg_am, caller should + * have acquired exclusive lock on index relation. We need no locking + * here. + */ initGISTstate(&giststate, r); /* immediately compress keys to normalize */ - compvec = (bool *) palloc(sizeof(bool) * r->rd_att->natts); for (i = 0; i < r->rd_att->natts; i++) { - gistcentryinit(&giststate, i,&tmpentry, datum[i], + gistcentryinit(&giststate, i, &tmpentry, datum[i], (Relation) NULL, (Page) NULL, (OffsetNumber) 0, -1 /* size is currently bogus */ , TRUE); if (datum[i] != tmpentry.key && !(giststate.keytypbyval)) @@ -374,18 +297,24 @@ gistinsert(PG_FUNCTION_ARGS) itup->t_tid = *ht_ctid; /* - * Notes in ExecUtils:ExecOpenIndices() - * - * RelationSetLockForWrite(r); + * Currently, GIST indexes do not support indexing NULLs; considerable + * infrastructure work would have to be done to do anything reasonable + * with a NULL. */ + if (IndexTupleHasNulls(itup)) + { + res = NULL; + } + else + { + res = (InsertIndexResult) palloc(sizeof(InsertIndexResultData)); + gistdoinsert(r, itup, &res, &giststate); + } - res = (InsertIndexResult) palloc(sizeof(InsertIndexResultData)); - gistdoinsert(r, itup, &res, &giststate); for (i = 0; i < r->rd_att->natts; i++) if (compvec[i] == TRUE) pfree(DatumGetPointer(datum[i])); pfree(itup); - pfree(compvec); PG_RETURN_POINTER(res); } @@ -527,9 +456,7 @@ gistlayerinsert(Relation r, BlockNumber blkno, /* key is modified, so old version must be deleted */ ItemPointerSet(&oldtid, blkno, child); - DirectFunctionCall2(gistdelete, - PointerGetDatum(r), - PointerGetDatum(&oldtid)); + gistdelete(r, &oldtid); } ret = INSERTED; @@ -1416,29 +1343,31 @@ gistfreestack(GISTSTACK *s) /* -** remove an entry from a page -*/ -Datum -gistdelete(PG_FUNCTION_ARGS) + * Retail deletion of a single tuple. + * + * NB: this is no longer called externally, but is still needed by + * gistlayerinsert(). That dependency will have to be fixed if GIST + * is ever going to allow concurrent insertions. + */ +static void +gistdelete(Relation r, ItemPointer tid) { - Relation r = (Relation) PG_GETARG_POINTER(0); - ItemPointer tid = (ItemPointer) PG_GETARG_POINTER(1); BlockNumber blkno; OffsetNumber offnum; Buffer buf; Page page; /* - * Notes in ExecUtils:ExecOpenIndices() Also note that only vacuum - * deletes index tuples now... - * - * RelationSetLockForWrite(r); + * Since GIST is not marked "amconcurrent" in pg_am, caller should + * have acquired exclusive lock on index relation. We need no locking + * here. */ blkno = ItemPointerGetBlockNumber(tid); offnum = ItemPointerGetOffsetNumber(tid); /* adjust any scans that will be affected by this deletion */ + /* NB: this works only for scans in *this* backend! 
*/ gistadjscans(r, GISTOP_DEL, blkno, offnum); /* delete the index tuple */ @@ -1448,10 +1377,93 @@ gistdelete(PG_FUNCTION_ARGS) PageIndexTupleDelete(page, offnum); WriteBuffer(buf); +} - PG_RETURN_VOID(); +/* + * Bulk deletion of all index entries pointing to a set of heap tuples. + * The set of target tuples is specified via a callback routine that tells + * whether any given heap tuple (identified by ItemPointer) is being deleted. + * + * Result: a palloc'd struct containing statistical info for VACUUM displays. + */ +Datum +gistbulkdelete(PG_FUNCTION_ARGS) +{ + Relation rel = (Relation) PG_GETARG_POINTER(0); + IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(1); + void *callback_state = (void *) PG_GETARG_POINTER(2); + IndexBulkDeleteResult *result; + BlockNumber num_pages; + double tuples_removed; + double num_index_tuples; + RetrieveIndexResult res; + IndexScanDesc iscan; + + tuples_removed = 0; + num_index_tuples = 0; + + /* + * Since GIST is not marked "amconcurrent" in pg_am, caller should + * have acquired exclusive lock on index relation. We need no locking + * here. + */ + + /* + * XXX generic implementation --- should be improved! + */ + + /* walk through the entire index */ + iscan = index_beginscan(rel, false, 0, (ScanKey) NULL); + + while ((res = index_getnext(iscan, ForwardScanDirection)) + != (RetrieveIndexResult) NULL) + { + ItemPointer heapptr = &res->heap_iptr; + + if (callback(heapptr, callback_state)) + { + ItemPointer indexptr = &res->index_iptr; + BlockNumber blkno; + OffsetNumber offnum; + Buffer buf; + Page page; + + blkno = ItemPointerGetBlockNumber(indexptr); + offnum = ItemPointerGetOffsetNumber(indexptr); + + /* adjust any scans that will be affected by this deletion */ + gistadjscans(rel, GISTOP_DEL, blkno, offnum); + + /* delete the index tuple */ + buf = ReadBuffer(rel, blkno); + page = BufferGetPage(buf); + + PageIndexTupleDelete(page, offnum); + + WriteBuffer(buf); + + tuples_removed += 1; + } + else + num_index_tuples += 1; + + pfree(res); + } + + index_endscan(iscan); + + /* return statistics */ + num_pages = RelationGetNumberOfBlocks(rel); + + result = (IndexBulkDeleteResult *) palloc(sizeof(IndexBulkDeleteResult)); + result->num_pages = num_pages; + result->tuples_removed = tuples_removed; + result->num_index_tuples = num_index_tuples; + + PG_RETURN_POINTER(result); } + void initGISTstate(GISTSTATE *giststate, Relation index) { diff --git a/src/backend/access/gist/gistscan.c b/src/backend/access/gist/gistscan.c index 672b121693a..9358692a53c 100644 --- a/src/backend/access/gist/gistscan.c +++ b/src/backend/access/gist/gistscan.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/gist/gistscan.c,v 1.37 2001/06/28 16:00:07 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/gist/gistscan.c,v 1.38 2001/07/15 22:48:15 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -59,13 +59,8 @@ gistbeginscan(PG_FUNCTION_ARGS) ScanKey key = (ScanKey) PG_GETARG_POINTER(3); IndexScanDesc s; - /* - * Let index_beginscan does its work... - * - * RelationSetLockForRead(r); - */ - s = RelationGetIndexScan(r, fromEnd, nkeys, key); + gistregscan(s); PG_RETURN_POINTER(s); @@ -283,6 +278,27 @@ gistdropscan(IndexScanDesc s) pfree(l); } +/* + * AtEOXact_gist() --- clean up gist subsystem at xact abort or commit. + * + * This is here because it needs to touch this module's static var GISTScans. 
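[Editor's note: gistdelete above must call gistadjscans before removing the item because a scan belonging to this backend may be stopped on the page. The invariant such *adjscans routines maintain is simple: if an item at or before a scan's saved offset on the same page is deleted, the saved offset shifts left by one. A standalone sketch of that rule; the scan list here is a toy, not the real GISTScanList.]

#include <stdio.h>

typedef struct Scan
{
    int          blkno;     /* page the scan is stopped on */
    int          offnum;    /* current item offset, 1-based */
    struct Scan *next;
} Scan;

/* Toy analogue of gistadjscans/_hash_adjscans: after deleting
 * (blkno, offnum), pull back any scan of THIS backend whose saved
 * position on that page would otherwise point one item too far. */
static void
adjust_scans(Scan *scans, int blkno, int offnum)
{
    for (Scan *s = scans; s != NULL; s = s->next)
    {
        if (s->blkno == blkno && s->offnum >= offnum)
            s->offnum--;
    }
}

int
main(void)
{
    Scan s2 = {7, 2, NULL};        /* stopped before the victim */
    Scan s1 = {7, 5, &s2};         /* stopped after the victim */

    adjust_scans(&s1, 7, 3);       /* delete item 3 on page 7 */
    printf("s1=%d s2=%d\n", s1.offnum, s2.offnum);   /* 4 and 2 */
    return 0;
}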
+ */ +void +AtEOXact_gist(void) +{ + /* + * Note: these actions should only be necessary during xact abort; but + * they can't hurt during a commit. + */ + + /* + * Reset the active-scans list to empty. We do not need to free the + * list elements, because they're all palloc()'d, so they'll go away + * at end of transaction anyway. + */ + GISTScans = NULL; +} + void gistadjscans(Relation rel, int op, BlockNumber blkno, OffsetNumber offnum) { diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index 9617fcc33a6..9b0e6cf28ee 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/hash/hash.c,v 1.51 2001/05/07 00:43:15 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/hash/hash.c,v 1.52 2001/07/15 22:48:15 tgl Exp $ * * NOTES * This file contains only the public interface routines. @@ -21,13 +21,27 @@ #include "access/genam.h" #include "access/hash.h" #include "access/heapam.h" +#include "access/xlogutils.h" #include "catalog/index.h" #include "executor/executor.h" #include "miscadmin.h" + bool BuildingHash = false; -#include "access/xlogutils.h" + +/* Working state for hashbuild and its callback */ +typedef struct +{ + double indtuples; +} HashBuildState; + +static void hashbuildCallback(Relation index, + HeapTuple htup, + Datum *attdata, + char *nulls, + bool tupleIsAlive, + void *state); /* @@ -44,161 +58,32 @@ hashbuild(PG_FUNCTION_ARGS) Relation heap = (Relation) PG_GETARG_POINTER(0); Relation index = (Relation) PG_GETARG_POINTER(1); IndexInfo *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2); - Node *oldPred = (Node *) PG_GETARG_POINTER(3); - -#ifdef NOT_USED - IndexStrategy istrat = (IndexStrategy) PG_GETARG_POINTER(4); - -#endif - HeapScanDesc hscan; - HeapTuple htup; - IndexTuple itup; - TupleDesc htupdesc, - itupdesc; - Datum attdata[INDEX_MAX_KEYS]; - char nulls[INDEX_MAX_KEYS]; - double nhtups, - nitups; - HashItem hitem; - Node *pred = indexInfo->ii_Predicate; - -#ifndef OMIT_PARTIAL_INDEX - TupleTable tupleTable; - TupleTableSlot *slot; + double reltuples; + HashBuildState buildstate; -#endif - ExprContext *econtext; - InsertIndexResult res = NULL; - - /* note that this is a new hash */ + /* set flag to disable locking */ BuildingHash = true; - /* initialize the hash index metadata page (if this is a new index) */ - if (oldPred == NULL) - _hash_metapinit(index); - - /* get tuple descriptors for heap and index relations */ - htupdesc = RelationGetDescr(heap); - itupdesc = RelationGetDescr(index); - /* - * If this is a predicate (partial) index, we will need to evaluate - * the predicate using ExecQual, which requires the current tuple to - * be in a slot of a TupleTable. In addition, ExecQual must have an - * ExprContext referring to that slot. Here, we initialize dummy - * TupleTable and ExprContext objects for this purpose. --Nels, Feb 92 - * - * We construct the ExprContext anyway since we need a per-tuple - * temporary memory context for function evaluation -- tgl July 00 + * We expect to be called exactly once for any index relation. If + * that's not the case, big trouble's what we have. 
*/ -#ifndef OMIT_PARTIAL_INDEX - if (pred != NULL || oldPred != NULL) - { - tupleTable = ExecCreateTupleTable(1); - slot = ExecAllocTableSlot(tupleTable); - ExecSetSlotDescriptor(slot, htupdesc, false); - } - else - { - tupleTable = NULL; - slot = NULL; - } - econtext = MakeExprContext(slot, TransactionCommandContext); -#else - econtext = MakeExprContext(NULL, TransactionCommandContext); -#endif /* OMIT_PARTIAL_INDEX */ - - /* build the index */ - nhtups = nitups = 0.0; - - /* start a heap scan */ - hscan = heap_beginscan(heap, 0, SnapshotNow, 0, (ScanKey) NULL); + if (RelationGetNumberOfBlocks(index) != 0) + elog(ERROR, "%s already contains data", + RelationGetRelationName(index)); - while (HeapTupleIsValid(htup = heap_getnext(hscan, 0))) - { - MemoryContextReset(econtext->ecxt_per_tuple_memory); + /* initialize the hash index metadata page */ + _hash_metapinit(index); - nhtups += 1.0; - -#ifndef OMIT_PARTIAL_INDEX - - /* - * If oldPred != NULL, this is an EXTEND INDEX command, so skip - * this tuple if it was already in the existing partial index - */ - if (oldPred != NULL) - { - slot->val = htup; - if (ExecQual((List *) oldPred, econtext, false)) - { - nitups += 1.0; - continue; - } - } - - /* - * Skip this tuple if it doesn't satisfy the partial-index - * predicate - */ - if (pred != NULL) - { - slot->val = htup; - if (!ExecQual((List *) pred, econtext, false)) - continue; - } -#endif /* OMIT_PARTIAL_INDEX */ - - nitups += 1.0; - - /* - * For the current heap tuple, extract all the attributes we use - * in this index, and note which are null. - */ - FormIndexDatum(indexInfo, - htup, - htupdesc, - econtext->ecxt_per_tuple_memory, - attdata, - nulls); - - /* form an index tuple and point it at the heap tuple */ - itup = index_formtuple(itupdesc, attdata, nulls); - - /* - * If the single index key is null, we don't insert it into the - * index. Hash tables support scans on '='. Relational algebra - * says that A = B returns null if either A or B is null. This - * means that no qualification used in an index scan could ever - * return true on a null attribute. It also means that indices - * can't be used by ISNULL or NOTNULL scans, but that's an - * artifact of the strategy map architecture chosen in 1986, not - * of the way nulls are handled here. 
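[Editor's note: the comment block relocated here encodes SQL's three-valued logic: A = B is null, hence not true, whenever either operand is null, so a null key can never satisfy a hash '=' qual. A tiny model of that rule, using an invented TriValue type for illustration.]

#include <stdio.h>

typedef enum { TV_FALSE, TV_TRUE, TV_NULL } TriValue;

/* SQL equality under three-valued logic: a null poisons the result. */
static TriValue
sql_eq(int a, int a_isnull, int b, int b_isnull)
{
    if (a_isnull || b_isnull)
        return TV_NULL;
    return (a == b) ? TV_TRUE : TV_FALSE;
}

int
main(void)
{
    /* an index qual only accepts rows where the test is TV_TRUE, so a
     * row with a null key can never be returned by an '=' scan --
     * which is why hashbuild/hashinsert skip null keys entirely */
    printf("%d\n", sql_eq(1, 0, 1, 0) == TV_TRUE);   /* 1 */
    printf("%d\n", sql_eq(1, 1, 1, 0) == TV_TRUE);   /* 0: null = 1 */
    return 0;
}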
- */ - - if (IndexTupleHasNulls(itup)) - { - pfree(itup); - continue; - } - - itup->t_tid = htup->t_self; - hitem = _hash_formitem(itup); - - res = _hash_doinsert(index, hitem); - - pfree(hitem); - pfree(itup); - pfree(res); - } + /* build the index */ + buildstate.indtuples = 0; - /* okay, all heap tuples are indexed */ - heap_endscan(hscan); + /* do the heap scan */ + reltuples = IndexBuildHeapScan(heap, index, indexInfo, + hashbuildCallback, (void *) &buildstate); -#ifndef OMIT_PARTIAL_INDEX - if (pred != NULL || oldPred != NULL) - ExecDropTupleTable(tupleTable, true); -#endif /* OMIT_PARTIAL_INDEX */ - FreeExprContext(econtext); + /* all done */ + BuildingHash = false; /* * Since we just counted the tuples in the heap, we update its stats @@ -218,23 +103,54 @@ hashbuild(PG_FUNCTION_ARGS) heap_close(heap, NoLock); index_close(index); - UpdateStats(hrelid, nhtups); - UpdateStats(irelid, nitups); - if (oldPred != NULL) - { - if (nitups == nhtups) - pred = NULL; - UpdateIndexPredicate(irelid, oldPred, pred); - } + UpdateStats(hrelid, reltuples); + UpdateStats(irelid, buildstate.indtuples); } - /* all done */ - BuildingHash = false; - PG_RETURN_VOID(); } /* + * Per-tuple callback from IndexBuildHeapScan + */ +static void +hashbuildCallback(Relation index, + HeapTuple htup, + Datum *attdata, + char *nulls, + bool tupleIsAlive, + void *state) +{ + HashBuildState *buildstate = (HashBuildState *) state; + IndexTuple itup; + HashItem hitem; + InsertIndexResult res; + + /* form an index tuple and point it at the heap tuple */ + itup = index_formtuple(RelationGetDescr(index), attdata, nulls); + itup->t_tid = htup->t_self; + + /* Hash indexes don't index nulls, see notes in hashinsert */ + if (IndexTupleHasNulls(itup)) + { + pfree(itup); + return; + } + + hitem = _hash_formitem(itup); + + res = _hash_doinsert(index, hitem); + + if (res) + pfree(res); + + buildstate->indtuples += 1; + + pfree(hitem); + pfree(itup); +} + +/* * hashinsert() -- insert an index tuple into a hash table. * * Hash on the index tuple's key, find the appropriate location @@ -248,10 +164,8 @@ hashinsert(PG_FUNCTION_ARGS) Datum *datum = (Datum *) PG_GETARG_POINTER(1); char *nulls = (char *) PG_GETARG_POINTER(2); ItemPointer ht_ctid = (ItemPointer) PG_GETARG_POINTER(3); - #ifdef NOT_USED Relation heapRel = (Relation) PG_GETARG_POINTER(4); - #endif InsertIndexResult res; HashItem hitem; @@ -261,8 +175,21 @@ hashinsert(PG_FUNCTION_ARGS) itup = index_formtuple(RelationGetDescr(rel), datum, nulls); itup->t_tid = *ht_ctid; + /* + * If the single index key is null, we don't insert it into the + * index. Hash tables support scans on '='. Relational algebra + * says that A = B returns null if either A or B is null. This + * means that no qualification used in an index scan could ever + * return true on a null attribute. It also means that indices + * can't be used by ISNULL or NOTNULL scans, but that's an + * artifact of the strategy map architecture chosen in 1986, not + * of the way nulls are handled here. + */ if (IndexTupleHasNulls(itup)) + { + pfree(itup); PG_RETURN_POINTER((InsertIndexResult) NULL); + } hitem = _hash_formitem(itup); @@ -471,22 +398,74 @@ hashrestrpos(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } -/* stubs */ +/* + * Bulk deletion of all index entries pointing to a set of heap tuples. + * The set of target tuples is specified via a callback routine that tells + * whether any given heap tuple (identified by ItemPointer) is being deleted. + * + * Result: a palloc'd struct containing statistical info for VACUUM displays. 
+ */ Datum -hashdelete(PG_FUNCTION_ARGS) +hashbulkdelete(PG_FUNCTION_ARGS) { Relation rel = (Relation) PG_GETARG_POINTER(0); - ItemPointer tid = (ItemPointer) PG_GETARG_POINTER(1); + IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(1); + void *callback_state = (void *) PG_GETARG_POINTER(2); + IndexBulkDeleteResult *result; + BlockNumber num_pages; + double tuples_removed; + double num_index_tuples; + RetrieveIndexResult res; + IndexScanDesc iscan; - /* adjust any active scans that will be affected by this deletion */ - _hash_adjscans(rel, tid); + tuples_removed = 0; + num_index_tuples = 0; - /* delete the data from the page */ - _hash_pagedel(rel, tid); + /* + * XXX generic implementation --- should be improved! + */ - PG_RETURN_VOID(); + /* walk through the entire index */ + iscan = index_beginscan(rel, false, 0, (ScanKey) NULL); + + while ((res = index_getnext(iscan, ForwardScanDirection)) + != (RetrieveIndexResult) NULL) + { + ItemPointer heapptr = &res->heap_iptr; + + if (callback(heapptr, callback_state)) + { + ItemPointer indexptr = &res->index_iptr; + + /* adjust any active scans that will be affected by deletion */ + /* (namely, my own scan) */ + _hash_adjscans(rel, indexptr); + + /* delete the data from the page */ + _hash_pagedel(rel, indexptr); + + tuples_removed += 1; + } + else + num_index_tuples += 1; + + pfree(res); + } + + index_endscan(iscan); + + /* return statistics */ + num_pages = RelationGetNumberOfBlocks(rel); + + result = (IndexBulkDeleteResult *) palloc(sizeof(IndexBulkDeleteResult)); + result->num_pages = num_pages; + result->tuples_removed = tuples_removed; + result->num_index_tuples = num_index_tuples; + + PG_RETURN_POINTER(result); } + void hash_redo(XLogRecPtr lsn, XLogRecord *record) { diff --git a/src/backend/access/hash/hashovfl.c b/src/backend/access/hash/hashovfl.c index 8e2ed1bb8af..c9fb065dbd2 100644 --- a/src/backend/access/hash/hashovfl.c +++ b/src/backend/access/hash/hashovfl.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/hash/hashovfl.c,v 1.29 2001/03/07 21:20:26 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/hash/hashovfl.c,v 1.30 2001/07/15 22:48:15 tgl Exp $ * * NOTES * Overflow pages look like ordinary relation pages. @@ -112,14 +112,14 @@ _hash_getovfladdr(Relation rel, Buffer *metabufp) metap = (HashMetaPage) _hash_chgbufaccess(rel, metabufp, HASH_READ, HASH_WRITE); - splitnum = metap->OVFL_POINT; - max_free = metap->SPARES[splitnum]; + splitnum = metap->hashm_ovflpoint; + max_free = metap->hashm_spares[splitnum]; free_page = (max_free - 1) >> (metap->hashm_bshift + BYTE_TO_BIT); free_bit = (max_free - 1) & (BMPGSZ_BIT(metap) - 1); /* Look through all the free maps to find the first free block */ - first_page = metap->LAST_FREED >> (metap->hashm_bshift + BYTE_TO_BIT); + first_page = metap->hashm_lastfreed >> (metap->hashm_bshift + BYTE_TO_BIT); for (i = first_page; i <= free_page; i++) { Page mappage; @@ -138,7 +138,7 @@ _hash_getovfladdr(Relation rel, Buffer *metabufp) if (i == first_page) { - bit = metap->LAST_FREED & (BMPGSZ_BIT(metap) - 1); + bit = metap->hashm_lastfreed & (BMPGSZ_BIT(metap) - 1); j = bit / BITS_PER_MAP; bit = bit & ~(BITS_PER_MAP - 1); } @@ -153,10 +153,10 @@ _hash_getovfladdr(Relation rel, Buffer *metabufp) } /* No Free Page Found - have to allocate a new page */ - metap->LAST_FREED = metap->SPARES[splitnum]; - metap->SPARES[splitnum]++; - offset = metap->SPARES[splitnum] - - (splitnum ? 
metap->SPARES[splitnum - 1] : 0); + metap->hashm_lastfreed = metap->hashm_spares[splitnum]; + metap->hashm_spares[splitnum]++; + offset = metap->hashm_spares[splitnum] - + (splitnum ? metap->hashm_spares[splitnum - 1] : 0); #define OVMSG "HASH: Out of overflow pages. Out of luck.\n" @@ -164,9 +164,9 @@ _hash_getovfladdr(Relation rel, Buffer *metabufp) { if (++splitnum >= NCACHED) elog(ERROR, OVMSG); - metap->OVFL_POINT = splitnum; - metap->SPARES[splitnum] = metap->SPARES[splitnum - 1]; - metap->SPARES[splitnum - 1]--; + metap->hashm_ovflpoint = splitnum; + metap->hashm_spares[splitnum] = metap->hashm_spares[splitnum - 1]; + metap->hashm_spares[splitnum - 1]--; offset = 0; } @@ -194,15 +194,15 @@ _hash_getovfladdr(Relation rel, Buffer *metabufp) if (_hash_initbitmap(rel, metap, OADDR_OF(splitnum, offset), 1, free_page)) elog(ERROR, "overflow_page: problem with _hash_initbitmap."); - metap->SPARES[splitnum]++; + metap->hashm_spares[splitnum]++; offset++; if (offset > SPLITMASK) { if (++splitnum >= NCACHED) elog(ERROR, OVMSG); - metap->OVFL_POINT = splitnum; - metap->SPARES[splitnum] = metap->SPARES[splitnum - 1]; - metap->SPARES[splitnum - 1]--; + metap->hashm_ovflpoint = splitnum; + metap->hashm_spares[splitnum] = metap->hashm_spares[splitnum - 1]; + metap->hashm_spares[splitnum - 1]--; offset = 0; } } @@ -235,13 +235,13 @@ found: */ bit = 1 + bit + (i * BMPGSZ_BIT(metap)); - if (bit >= metap->LAST_FREED) - metap->LAST_FREED = bit - 1; + if (bit >= metap->hashm_lastfreed) + metap->hashm_lastfreed = bit - 1; /* Calculate the split number for this page */ - for (i = 0; (i < splitnum) && (bit > metap->SPARES[i]); i++) + for (i = 0; (i < splitnum) && (bit > metap->hashm_spares[i]); i++) ; - offset = (i ? bit - metap->SPARES[i - 1] : bit); + offset = (i ? bit - metap->hashm_spares[i - 1] : bit); if (offset >= SPLITMASK) elog(ERROR, OVMSG); @@ -355,10 +355,10 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf) * element hashm_mapp[bitmappage]. */ splitnum = (addr >> SPLITSHIFT); - ovflpgno = (splitnum ? metap->SPARES[splitnum - 1] : 0) + (addr & SPLITMASK) - 1; + ovflpgno = (splitnum ? metap->hashm_spares[splitnum - 1] : 0) + (addr & SPLITMASK) - 1; - if (ovflpgno < metap->LAST_FREED) - metap->LAST_FREED = ovflpgno; + if (ovflpgno < metap->hashm_lastfreed) + metap->hashm_lastfreed = ovflpgno; bitmappage = (ovflpgno >> (metap->hashm_bshift + BYTE_TO_BIT)); bitmapbit = ovflpgno & (BMPGSZ_BIT(metap) - 1); diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c index d1b3aaa2325..b8c520e3c0d 100644 --- a/src/backend/access/hash/hashpage.c +++ b/src/backend/access/hash/hashpage.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/hash/hashpage.c,v 1.31 2001/06/27 23:31:37 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/hash/hashpage.c,v 1.32 2001/07/15 22:48:15 tgl Exp $ * * NOTES * Postgres hash pages look like ordinary relation pages. The opaque @@ -18,7 +18,7 @@ * address of the page if it is an overflow page. * * The first page in a hash relation, page zero, is special -- it stores - * information describing the hash table; it is referred to as teh + * information describing the hash table; it is referred to as the * "meta page." Pages one and higher store the actual data. * *------------------------------------------------------------------------- @@ -48,6 +48,19 @@ static void _hash_splitpage(Relation rel, Buffer metabuf, Bucket obucket, Bucket * before the lock table is fully initialized, so we can't use it. 
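[Editor's note: the _hash_getovfladdr hunks above, hashm_spares bookkeeping aside, revolve around a first-free-bit search across bitmap words. A standalone sketch of that inner search, assuming 32-bit map words; the constants are illustrative, not the real BMPGSZ/BITS_PER_MAP values.]

#include <stdint.h>
#include <stdio.h>

#define BITS_PER_MAP 32

/* Return the index of the first 0 bit in the map, or -1 if all
 * tracked overflow pages are in use. */
static int
first_free_bit(const uint32_t *map, int nwords)
{
    for (int i = 0; i < nwords; i++)
    {
        if (map[i] != UINT32_MAX)       /* some bit is free in here */
        {
            uint32_t w = map[i];

            for (int j = 0; j < BITS_PER_MAP; j++)
                if (!(w & ((uint32_t) 1 << j)))
                    return i * BITS_PER_MAP + j;
        }
    }
    return -1;                          /* must allocate a new page */
}

int
main(void)
{
    uint32_t map[2] = {UINT32_MAX, 0x0000000F};  /* bits 32..35 used */

    printf("%d\n", first_free_bit(map, 2));      /* 36 */
    return 0;
}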
* Strictly speaking, this violates 2pl, but we don't do 2pl on the * system catalogs anyway. + * + * Note that our page locks are actual lockmanager locks, not buffer + * locks (as are used by btree, for example). This is a good idea because + * the algorithms are not deadlock-free, and we'd better be able to detect + * and recover from deadlocks. + * + * Another important difference from btree is that a hash indexscan + * retains both a lock and a buffer pin on the current index page + * between hashgettuple() calls (btree keeps only a buffer pin). + * Because of this, it's safe to do item deletions with only a regular + * write lock on a hash page --- there cannot be an indexscan stopped on + * the page being deleted, other than an indexscan of our own backend, + * which will be taken care of by _hash_adjscans. */ @@ -350,6 +363,16 @@ _hash_unsetpagelock(Relation rel, } } +/* + * Delete a hash index item. + * + * It is safe to delete an item after acquiring a regular WRITE lock on + * the page, because no other backend can hold a READ lock on the page, + * and that means no other backend currently has an indexscan stopped on + * any item of the item being deleted. Our own backend might have such + * an indexscan (in fact *will*, since that's how VACUUM found the item + * in the first place), but _hash_adjscans will fix the scan position. + */ void _hash_pagedel(Relation rel, ItemPointer tid) { @@ -384,7 +407,7 @@ _hash_pagedel(Relation rel, ItemPointer tid) metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE); metap = (HashMetaPage) BufferGetPage(metabuf); _hash_checkpage((Page) metap, LH_META_PAGE); - ++metap->hashm_nkeys; + metap->hashm_nkeys--; _hash_wrtbuf(rel, metabuf); } @@ -402,32 +425,32 @@ _hash_expandtable(Relation rel, Buffer metabuf) _hash_checkpage((Page) metap, LH_META_PAGE); metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_READ, HASH_WRITE); - new_bucket = ++metap->MAX_BUCKET; + new_bucket = ++metap->hashm_maxbucket; metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_WRITE, HASH_READ); - old_bucket = (metap->MAX_BUCKET & metap->LOW_MASK); + old_bucket = (metap->hashm_maxbucket & metap->hashm_lowmask); /* - * If the split point is increasing (MAX_BUCKET's log base 2 * + * If the split point is increasing (hashm_maxbucket's log base 2 * * increases), we need to copy the current contents of the spare split * bucket to the next bucket. 
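[Editor's note: the new hashpage.c comments argue that a plain write lock suffices for item deletion, because it excludes every reader in other backends; the only scan that can still be parked on the page is our own, which _hash_adjscans repairs. The reader/writer exclusion itself is ordinary rwlock semantics; a minimal pthread sketch of the claim (compile with -pthread); the "page" here is just a counter.]

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t page_lock = PTHREAD_RWLOCK_INITIALIZER;
static int items = 3;

static void *
reader(void *arg)
{
    (void) arg;
    pthread_rwlock_rdlock(&page_lock);   /* an indexscan on the page */
    printf("reader sees %d items\n", items);
    pthread_rwlock_unlock(&page_lock);
    return NULL;
}

int
main(void)
{
    pthread_t t;

    pthread_create(&t, NULL, reader, NULL);

    /* The deleter's write lock cannot be granted while any reader
     * holds the page, so no other backend can be mid-read on it
     * at the moment the item disappears. */
    pthread_rwlock_wrlock(&page_lock);
    items--;                             /* PageIndexTupleDelete analogue */
    pthread_rwlock_unlock(&page_lock);

    pthread_join(t, NULL);
    printf("after delete: %d items\n", items);
    return 0;
}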
*/ - spare_ndx = _hash_log2(metap->MAX_BUCKET + 1); - if (spare_ndx > metap->OVFL_POINT) + spare_ndx = _hash_log2(metap->hashm_maxbucket + 1); + if (spare_ndx > metap->hashm_ovflpoint) { metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_READ, HASH_WRITE); - metap->SPARES[spare_ndx] = metap->SPARES[metap->OVFL_POINT]; - metap->OVFL_POINT = spare_ndx; + metap->hashm_spares[spare_ndx] = metap->hashm_spares[metap->hashm_ovflpoint]; + metap->hashm_ovflpoint = spare_ndx; metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_WRITE, HASH_READ); } - if (new_bucket > metap->HIGH_MASK) + if (new_bucket > metap->hashm_highmask) { /* Starting a new doubling */ metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_READ, HASH_WRITE); - metap->LOW_MASK = metap->HIGH_MASK; - metap->HIGH_MASK = new_bucket | metap->LOW_MASK; + metap->hashm_lowmask = metap->hashm_highmask; + metap->hashm_highmask = new_bucket | metap->hashm_lowmask; metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_WRITE, HASH_READ); } diff --git a/src/backend/access/hash/hashscan.c b/src/backend/access/hash/hashscan.c index 649e42fbeb0..f4a91b5710f 100644 --- a/src/backend/access/hash/hashscan.c +++ b/src/backend/access/hash/hashscan.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/hash/hashscan.c,v 1.24 2001/01/24 19:42:47 momjian Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/hash/hashscan.c,v 1.25 2001/07/15 22:48:15 tgl Exp $ * * NOTES * Because we can be doing an index scan on a relation while we @@ -45,6 +45,31 @@ typedef HashScanListData *HashScanList; static HashScanList HashScans = (HashScanList) NULL; + +/* + * AtEOXact_hash() --- clean up hash subsystem at xact abort or commit. + * + * This is here because it needs to touch this module's static var HashScans. + */ +void +AtEOXact_hash(void) +{ + /* + * Note: these actions should only be necessary during xact abort; but + * they can't hurt during a commit. + */ + + /* + * Reset the active-scans list to empty. We do not need to free the + * list elements, because they're all palloc()'d, so they'll go away + * at end of transaction anyway. + */ + HashScans = NULL; + + /* If we were building a hash, we ain't anymore. */ + BuildingHash = false; +} + /* * _Hash_regscan() -- register a new scan. 
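[Editor's note: the mask manipulation in _hash_expandtable (hashm_maxbucket, hashm_highmask, hashm_lowmask) implements classic linear hashing: addresses are formed with the larger mask, and any address beyond the current maximum bucket folds back with the smaller mask. A self-contained sketch of that addressing rule; the field names are borrowed from the patch, the rest is illustrative.]

#include <stdint.h>
#include <stdio.h>

typedef struct
{
    uint32_t maxbucket;   /* highest bucket allocated so far */
    uint32_t highmask;    /* mask for the doubling in progress */
    uint32_t lowmask;     /* mask for the previous doubling */
} HashMeta;

/* Map a hash value to a bucket, linear-hashing style. */
static uint32_t
hash_to_bucket(const HashMeta *m, uint32_t hash)
{
    uint32_t bucket = hash & m->highmask;

    if (bucket > m->maxbucket)
        bucket &= m->lowmask;      /* not split yet: use smaller table */
    return bucket;
}

/* One step of table growth, as in _hash_expandtable. */
static void
expand(HashMeta *m)
{
    uint32_t new_bucket = ++m->maxbucket;

    if (new_bucket > m->highmask)       /* starting a new doubling */
    {
        m->lowmask = m->highmask;
        m->highmask = new_bucket | m->lowmask;
    }
}

int
main(void)
{
    HashMeta m = {3, 3, 1};             /* 4 buckets: masks 0b11/0b1 */

    expand(&m);                         /* bucket 4 exists; masks 7/3 */
    printf("hash 6 -> bucket %u\n", hash_to_bucket(&m, 6)); /* 6>4 -> 2 */
    printf("hash 4 -> bucket %u\n", hash_to_bucket(&m, 4)); /* 4 */
    return 0;
}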
*/ diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index adeccf5cc84..2b6be06168f 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/index/indexam.c,v 1.51 2001/06/22 19:16:21 wieck Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/index/indexam.c,v 1.52 2001/07/15 22:48:15 tgl Exp $ * * INTERFACE ROUTINES * index_open - open an index relation by relationId @@ -18,23 +18,17 @@ * index_rescan - restart a scan of an index * index_endscan - end a scan * index_insert - insert an index tuple into a relation - * index_delete - delete an item from an index relation * index_markpos - mark a scan position * index_restrpos - restore a scan position * index_getnext - get the next tuple from a scan - * ** index_fetch - retrieve tuple with tid - * ** index_replace - replace a tuple - * ** index_getattr - get an attribute from an index tuple - * index_getprocid - get a support procedure id from the rel tuple - * - * IndexScanIsValid - check index scan + * index_bulk_delete - bulk deletion of index tuples + * index_cost_estimator - fetch amcostestimate procedure OID + * index_getprocid - get a support procedure OID * * NOTES * This file contains the index_ routines which used * to be a scattered collection of stuff in access/genam. * - * The ** routines: index_fetch, index_replace, and index_getattr - * have not yet been implemented. They may not be needed. * * old comments * Scans are implemented as follows: @@ -211,23 +205,6 @@ index_insert(Relation relation, } /* ---------------- - * index_delete - delete an item from an index relation - * ---------------- - */ -void -index_delete(Relation relation, ItemPointer indexItem) -{ - RegProcedure procedure; - - RELATION_CHECKS; - GET_REL_PROCEDURE(delete, amdelete); - - OidFunctionCall2(procedure, - PointerGetDatum(relation), - PointerGetDatum(indexItem)); -} - -/* ---------------- * index_beginscan - start a scan of an index * ---------------- */ @@ -379,6 +356,35 @@ index_getnext(IndexScanDesc scan, } /* ---------------- + * index_bulk_delete - do mass deletion of index entries + * + * callback routine tells whether a given main-heap tuple is + * to be deleted + * + * return value is an optional palloc'd struct of statistics + * ---------------- + */ +IndexBulkDeleteResult * +index_bulk_delete(Relation relation, + IndexBulkDeleteCallback callback, + void *callback_state) +{ + RegProcedure procedure; + IndexBulkDeleteResult *result; + + RELATION_CHECKS; + GET_REL_PROCEDURE(bulk_delete, ambulkdelete); + + result = (IndexBulkDeleteResult *) + DatumGetPointer(OidFunctionCall3(procedure, + PointerGetDatum(relation), + PointerGetDatum((Pointer) callback), + PointerGetDatum(callback_state))); + + return result; +} + +/* ---------------- * index_cost_estimator * * Fetch the amcostestimate procedure OID for an index. 
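[Editor's note: index_bulk_delete above gives every AM a single entry point for VACUUM: walk your own index, ask the callback about each heap TID, delete on "yes", and report statistics in a palloc'd struct. A compilable toy of that contract; ItemPointer, the result struct, and the "index" are simplified stand-ins.]

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

typedef int ItemPointer;                        /* stand-in heap TID */
typedef bool (*BulkDeleteCallback) (ItemPointer tid, void *state);

typedef struct
{
    double tuples_removed;
    double num_index_tuples;
} BulkDeleteResult;

/* Generic bulk delete: scan the whole index and let VACUUM's callback
 * decide per heap pointer, as gist/hash/rtree now do. */
static BulkDeleteResult *
bulk_delete(ItemPointer *index, int *nitems,
            BulkDeleteCallback callback, void *state)
{
    BulkDeleteResult *r = malloc(sizeof *r);
    int keep = 0;

    r->tuples_removed = r->num_index_tuples = 0;

    for (int i = 0; i < *nitems; i++)
    {
        if (callback(index[i], state))
            r->tuples_removed += 1;             /* drop the entry */
        else
        {
            index[keep++] = index[i];
            r->num_index_tuples += 1;
        }
    }
    *nitems = keep;
    return r;
}

/* VACUUM-side callback: is this TID in the dead-tuple list? */
static bool
is_dead(ItemPointer tid, void *state)
{
    return tid == *(ItemPointer *) state;
}

int
main(void)
{
    ItemPointer index[] = {10, 11, 12};
    int         n = 3;
    ItemPointer dead = 11;
    BulkDeleteResult *r = bulk_delete(index, &n, is_dead, &dead);

    printf("removed=%.0f kept=%.0f\n",
           r->tuples_removed, r->num_index_tuples);
    free(r);
    return 0;
}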
diff --git a/src/backend/access/nbtree/Makefile b/src/backend/access/nbtree/Makefile index eba9bd4eefe..bdc366dd0ad 100644 --- a/src/backend/access/nbtree/Makefile +++ b/src/backend/access/nbtree/Makefile @@ -4,7 +4,7 @@ # Makefile for access/nbtree # # IDENTIFICATION -# $Header: /cvsroot/pgsql/src/backend/access/nbtree/Makefile,v 1.10 2000/08/31 16:09:41 petere Exp $ +# $Header: /cvsroot/pgsql/src/backend/access/nbtree/Makefile,v 1.11 2001/07/15 22:48:16 tgl Exp $ # #------------------------------------------------------------------------- @@ -12,7 +12,7 @@ subdir = src/backend/access/nbtree top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global -OBJS = nbtcompare.o nbtinsert.o nbtpage.o nbtree.o nbtscan.o nbtsearch.o \ +OBJS = nbtcompare.o nbtinsert.o nbtpage.o nbtree.o nbtsearch.o \ nbtstrat.o nbtutils.o nbtsort.o all: SUBSYS.o diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index cff7ff0d655..d8ec739b2a8 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -1,4 +1,4 @@ -$Header: /cvsroot/pgsql/src/backend/access/nbtree/README,v 1.4 2000/07/25 05:26:40 tgl Exp $ +$Header: /cvsroot/pgsql/src/backend/access/nbtree/README,v 1.5 2001/07/15 22:48:16 tgl Exp $ This directory contains a correct implementation of Lehman and Yao's high-concurrency B-tree management algorithm (P. Lehman and S. Yao, @@ -109,15 +109,11 @@ In addition, the following things are handy to know: is too high a price). Rebuilding corrupted indexes during restart seems more attractive. -+ On deletions, we need to adjust the position of active scans on - the index. The code in nbtscan.c handles this. We don't need to - do this for insertions or splits because _bt_restscan can find the - new position of the previously-found item. NOTE that nbtscan.c - only copes with deletions issued by the current backend. This - essentially means that concurrent deletions are not supported, but - that's true already in the Lehman and Yao algorithm. nbtscan.c - exists only to support VACUUM and allow it to delete items while - it's scanning the index. ++ Deletions are handled by getting a super-exclusive lock on the target + page, so that no other backend has a pin on the page when the deletion + starts. This means no scan is pointing at the page. This is OK for + deleting leaf items, probably not OK for deleting internal nodes; + will need to think harder when it's time to support index compaction. + "ScanKey" data structures are used in two fundamentally different ways in this code. Searches for the initial position for a scan, as well as diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index 8ffb9b9043c..c91c568ed2f 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.83 2001/06/22 19:16:21 wieck Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.84 2001/07/15 22:48:16 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -126,7 +126,7 @@ top: if (TransactionIdIsValid(xwait)) { /* Have to wait for the other guy ... */ - _bt_relbuf(rel, buf, BT_WRITE); + _bt_relbuf(rel, buf); XactLockTableWait(xwait); /* start over... 
*/ _bt_freestack(stack); @@ -234,7 +234,7 @@ _bt_check_unique(Relation rel, BTItem btitem, Relation heapRel, if (TransactionIdIsValid(xwait)) { if (nbuf != InvalidBuffer) - _bt_relbuf(rel, nbuf, BT_READ); + _bt_relbuf(rel, nbuf); /* Tell _bt_doinsert to wait... */ return xwait; } @@ -263,7 +263,7 @@ _bt_check_unique(Relation rel, BTItem btitem, Relation heapRel, break; nblkno = opaque->btpo_next; if (nbuf != InvalidBuffer) - _bt_relbuf(rel, nbuf, BT_READ); + _bt_relbuf(rel, nbuf); nbuf = _bt_getbuf(rel, nblkno, BT_READ); page = BufferGetPage(nbuf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); @@ -273,7 +273,7 @@ _bt_check_unique(Relation rel, BTItem btitem, Relation heapRel, } if (nbuf != InvalidBuffer) - _bt_relbuf(rel, nbuf, BT_READ); + _bt_relbuf(rel, nbuf); return NullTransactionId; } @@ -397,7 +397,7 @@ _bt_insertonpg(Relation rel, /* step right one page */ BlockNumber rblkno = lpageop->btpo_next; - _bt_relbuf(rel, buf, BT_WRITE); + _bt_relbuf(rel, buf); buf = _bt_getbuf(rel, rblkno, BT_WRITE); page = BufferGetPage(buf); lpageop = (BTPageOpaque) PageGetSpecialPointer(page); @@ -1175,12 +1175,12 @@ _bt_getstackbuf(Relation rel, BTStack stack, int access) */ if (P_RIGHTMOST(opaque)) { - _bt_relbuf(rel, buf, access); + _bt_relbuf(rel, buf); return (InvalidBuffer); } blkno = opaque->btpo_next; - _bt_relbuf(rel, buf, access); + _bt_relbuf(rel, buf); buf = _bt_getbuf(rel, blkno, access); page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); @@ -1449,7 +1449,7 @@ _bt_fixroot(Relation rel, Buffer oldrootbuf, bool release) &itup_off, &itup_blkno); /* Keep lock on new "root" buffer ! */ if (buf != rootbuf) - _bt_relbuf(rel, buf, BT_WRITE); + _bt_relbuf(rel, buf); buf = newbuf; page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); @@ -1525,7 +1525,7 @@ _bt_fixtree(Relation rel, BlockNumber blkno) if (P_ISROOT(opaque)) { /* Tree is Ok now */ - _bt_relbuf(rel, buf, BT_WRITE); + _bt_relbuf(rel, buf); return; } /* Call _bt_fixroot() if there is no upper level */ @@ -1533,12 +1533,12 @@ _bt_fixtree(Relation rel, BlockNumber blkno) { elog(NOTICE, "bt_fixtree[%s]: fixing root page", RelationGetRelationName(rel)); buf = _bt_fixroot(rel, buf, true); - _bt_relbuf(rel, buf, BT_WRITE); + _bt_relbuf(rel, buf); return; } /* Have to go up one level */ pblkno = opaque->btpo_parent; - _bt_relbuf(rel, buf, BT_WRITE); + _bt_relbuf(rel, buf); } blkno = pblkno; } @@ -1571,7 +1571,7 @@ _bt_fixlevel(Relation rel, Buffer buf, BlockNumber limit) page = BufferGetPage(buf); /* copy page to temp storage */ memmove(tbuf, page, PageGetPageSize(page)); - _bt_relbuf(rel, buf, BT_READ); + _bt_relbuf(rel, buf); page = (Page) tbuf; opaque = (BTPageOpaque) PageGetSpecialPointer(page); @@ -1682,7 +1682,7 @@ _bt_fixlevel(Relation rel, Buffer buf, BlockNumber limit) { if (coff[i] != P_FIRSTDATAKEY(newopaque)) elog(ERROR, "bt_fixlevel[%s]: invalid item order(3) (need to recreate index)", RelationGetRelationName(rel)); - _bt_relbuf(rel, buf, BT_WRITE); + _bt_relbuf(rel, buf); buf = newbuf; page = newpage; opaque = newopaque; @@ -1691,7 +1691,7 @@ _bt_fixlevel(Relation rel, Buffer buf, BlockNumber limit) continue; } /* unfound - need to insert on current page */ - _bt_relbuf(rel, newbuf, BT_WRITE); + _bt_relbuf(rel, newbuf); } /* insert pointer */ ritem = (BTItem) PageGetItem(cpage[i - 1], @@ -1718,10 +1718,10 @@ _bt_fixlevel(Relation rel, Buffer buf, BlockNumber limit) &itup_off, &itup_blkno); /* what buffer we need in ? 
*/ if (newitemonleft) - _bt_relbuf(rel, newbuf, BT_WRITE); + _bt_relbuf(rel, newbuf); else { - _bt_relbuf(rel, buf, BT_WRITE); + _bt_relbuf(rel, buf); buf = newbuf; page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); @@ -1741,7 +1741,7 @@ _bt_fixlevel(Relation rel, Buffer buf, BlockNumber limit) /* copy page with pointer to cblkno[cidx] to temp storage */ memmove(tbuf, page, PageGetPageSize(page)); - _bt_relbuf(rel, buf, BT_WRITE); + _bt_relbuf(rel, buf); page = (Page) tbuf; opaque = (BTPageOpaque) PageGetSpecialPointer(page); } @@ -1751,13 +1751,13 @@ _bt_fixlevel(Relation rel, Buffer buf, BlockNumber limit) goodbye = false; /* Pointers to child pages are Ok - right end of child level ? */ - _bt_relbuf(rel, cbuf[0], BT_READ); - _bt_relbuf(rel, cbuf[1], BT_READ); + _bt_relbuf(rel, cbuf[0]); + _bt_relbuf(rel, cbuf[1]); if (cidx == 1 || (cidx == 2 && (P_RIGHTMOST(copaque[2]) || goodbye))) { if (cidx == 2) - _bt_relbuf(rel, cbuf[2], BT_READ); + _bt_relbuf(rel, cbuf[2]); return; } if (cblkno[0] == limit || cblkno[1] == limit) @@ -1819,7 +1819,7 @@ _bt_fixbranch(Relation rel, BlockNumber lblkno, { if (offnum <= stack.bts_offset) elog(ERROR, "bt_fixbranch[%s]: invalid item order (need to recreate index)", RelationGetRelationName(rel)); - _bt_relbuf(rel, buf, BT_READ); + _bt_relbuf(rel, buf); return; } @@ -1837,7 +1837,7 @@ _bt_fixbranch(Relation rel, BlockNumber lblkno, if (rbuf == InvalidBuffer) elog(ERROR, "bt_fixbranch[%s]: right pointer unfound(2) (need to recreate index)", RelationGetRelationName(rel)); rblkno = BufferGetBlockNumber(rbuf); - _bt_relbuf(rel, rbuf, BT_READ); + _bt_relbuf(rel, rbuf); /* * If we have parent item in true_stack then go up one level and @@ -1845,7 +1845,7 @@ _bt_fixbranch(Relation rel, BlockNumber lblkno, */ if (true_stack) { - _bt_relbuf(rel, buf, BT_READ); + _bt_relbuf(rel, buf); blkno = true_stack->bts_blkno; true_stack = true_stack->bts_parent; continue; @@ -1860,19 +1860,19 @@ _bt_fixbranch(Relation rel, BlockNumber lblkno, if (!BTreeInvalidParent(opaque)) { blkno = opaque->btpo_parent; - _bt_relbuf(rel, buf, BT_READ); + _bt_relbuf(rel, buf); continue; } /* Have to switch to excl buf lock and re-check btpo_parent */ - _bt_relbuf(rel, buf, BT_READ); + _bt_relbuf(rel, buf); buf = _bt_getbuf(rel, blkno, BT_WRITE); page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (!BTreeInvalidParent(opaque)) { blkno = opaque->btpo_parent; - _bt_relbuf(rel, buf, BT_WRITE); + _bt_relbuf(rel, buf); continue; } @@ -1913,7 +1913,7 @@ _bt_fixup(Relation rel, Buffer buf) if (!BTreeInvalidParent(opaque)) { blkno = opaque->btpo_parent; - _bt_relbuf(rel, buf, BT_WRITE); + _bt_relbuf(rel, buf); elog(NOTICE, "bt_fixup[%s]: checking/fixing upper levels", RelationGetRelationName(rel)); _bt_fixtree(rel, blkno); return; @@ -1921,8 +1921,7 @@ _bt_fixup(Relation rel, Buffer buf) if (P_LEFTMOST(opaque)) break; blkno = opaque->btpo_prev; - LockBuffer(buf, BUFFER_LOCK_UNLOCK); - ReleaseBuffer(buf); + _bt_relbuf(rel, buf); buf = _bt_getbuf(rel, blkno, BT_WRITE); } @@ -1932,9 +1931,7 @@ _bt_fixup(Relation rel, Buffer buf) */ elog(NOTICE, "bt_fixup[%s]: fixing root page", RelationGetRelationName(rel)); buf = _bt_fixroot(rel, buf, true); - _bt_relbuf(rel, buf, BT_WRITE); - - return; + _bt_relbuf(rel, buf); } static OffsetNumber diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index 67e1407b22b..376274c5621 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -9,7 
+9,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.52 2001/06/27 23:31:38 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.53 2001/07/15 22:48:16 tgl Exp $ * * NOTES * Postgres btree pages look like ordinary relation pages. The opaque @@ -138,7 +138,7 @@ _bt_getroot(Relation rel, int access) /* If access = BT_READ, caller doesn't want us to create root yet */ if (access == BT_READ) { - _bt_relbuf(rel, metabuf, BT_READ); + _bt_relbuf(rel, metabuf); return InvalidBuffer; } @@ -215,14 +215,14 @@ _bt_getroot(Relation rel, int access) * guarantee no deadlocks, we have to release the metadata * page and start all over again. */ - _bt_relbuf(rel, metabuf, BT_WRITE); + _bt_relbuf(rel, metabuf); return _bt_getroot(rel, access); } } else { rootblkno = metad->btm_root; - _bt_relbuf(rel, metabuf, BT_READ); /* done with the meta page */ + _bt_relbuf(rel, metabuf); /* done with the meta page */ rootbuf = _bt_getbuf(rel, rootblkno, BT_READ); } @@ -270,8 +270,8 @@ _bt_getroot(Relation rel, int access) goto check_parent; } else -/* someone else already fixed root */ { + /* someone else already fixed root */ LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK); LockBuffer(rootbuf, BT_READ); } @@ -283,7 +283,7 @@ _bt_getroot(Relation rel, int access) * chance that parent is root page. */ newrootbuf = _bt_getbuf(rel, rootopaque->btpo_parent, BT_READ); - _bt_relbuf(rel, rootbuf, BT_READ); + _bt_relbuf(rel, rootbuf); rootbuf = newrootbuf; rootpage = BufferGetPage(rootbuf); rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage); @@ -293,7 +293,7 @@ _bt_getroot(Relation rel, int access) } /* try again */ - _bt_relbuf(rel, rootbuf, BT_READ); + _bt_relbuf(rel, rootbuf); return _bt_getroot(rel, access); } @@ -350,10 +350,12 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access) /* * _bt_relbuf() -- release a locked buffer. * - * Lock and pin (refcount) are both dropped. + * Lock and pin (refcount) are both dropped. Note that either read or + * write lock can be dropped this way, but if we modified the buffer, + * this is NOT the right way to release a write lock. */ void -_bt_relbuf(Relation rel, Buffer buf, int access) +_bt_relbuf(Relation rel, Buffer buf) { LockBuffer(buf, BUFFER_LOCK_UNLOCK); ReleaseBuffer(buf); @@ -449,24 +451,23 @@ _bt_metaproot(Relation rel, BlockNumber rootbknum, int level) } /* - * Delete an item from a btree. It had better be a leaf item... + * Delete an item from a btree page. + * + * This routine assumes that the caller has pinned and locked the buffer, + * and will write the buffer afterwards. 
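[Editor's note: the simplified _bt_relbuf leans on the rule spelled out in its new comment: unlock-then-unpin is fine for a buffer that was only read, but a modified buffer must instead go through _bt_wrtbuf so it is marked for writing first. A toy model of the two release paths; the buffer manager here is invented for illustration.]

#include <stdbool.h>
#include <stdio.h>

typedef struct
{
    int  refcount;    /* pin count */
    bool locked;
    bool dirty;
} Buffer;

static void relbuf(Buffer *buf)      /* _bt_relbuf analogue */
{
    buf->locked = false;             /* LockBuffer(BUFFER_LOCK_UNLOCK) */
    buf->refcount--;                 /* ReleaseBuffer() */
}

static void wrtbuf(Buffer *buf)      /* _bt_wrtbuf analogue */
{
    buf->dirty = true;               /* a modified page must be written */
    buf->locked = false;
    buf->refcount--;
}

int
main(void)
{
    Buffer b = {1, true, false};

    relbuf(&b);                      /* read-only use: plain release */
    printf("pins=%d dirty=%d\n", b.refcount, b.dirty);

    /* after modifying the page, relbuf() would lose the change;
     * the write path must run instead */
    b = (Buffer) {1, true, false};
    wrtbuf(&b);
    printf("pins=%d dirty=%d\n", b.refcount, b.dirty);
    return 0;
}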
*/ void -_bt_pagedel(Relation rel, ItemPointer tid) +_bt_itemdel(Relation rel, Buffer buf, ItemPointer tid) { - Buffer buf; - Page page; - BlockNumber blkno; + Page page = BufferGetPage(buf); OffsetNumber offno; - blkno = ItemPointerGetBlockNumber(tid); offno = ItemPointerGetOffsetNumber(tid); - buf = _bt_getbuf(rel, blkno, BT_WRITE); - page = BufferGetPage(buf); - START_CRIT_SECTION(); + PageIndexTupleDelete(page, offno); + /* XLOG stuff */ { xl_btree_delete xlrec; @@ -490,8 +491,6 @@ _bt_pagedel(Relation rel, ItemPointer tid) PageSetLSN(page, recptr); PageSetSUI(page, ThisStartUpID); } - END_CRIT_SECTION(); - /* write the buffer and release the lock */ - _bt_wrtbuf(rel, buf); + END_CRIT_SECTION(); } diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index b714296c8f7..b1426456241 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -12,7 +12,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.81 2001/05/18 21:24:17 momjian Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.82 2001/07/15 22:48:16 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -28,11 +28,27 @@ #include "storage/sinval.h" #include "access/xlogutils.h" -bool BuildingBtree = false; /* see comment in btbuild() */ -bool FastBuild = true; /* use sort/build instead */ - /* of insertion build */ +/* Working state for btbuild and its callback */ +typedef struct +{ + bool usefast; + bool isUnique; + bool haveDead; + Relation heapRel; + BTSpool *spool; + /* + * spool2 is needed only when the index is an unique index. Dead + * tuples are put into spool2 instead of spool in order to avoid + * uniqueness check. + */ + BTSpool *spool2; + double indtuples; +} BTBuildState; + +bool BuildingBtree = false; /* see comment in btbuild() */ +bool FastBuild = true; /* use SORT instead of insertion build */ /* * TEMPORARY FLAG FOR TESTING NEW FIX TREE @@ -41,6 +57,29 @@ bool FastBuild = true; /* use sort/build instead */ bool FixBTree = true; static void _bt_restscan(IndexScanDesc scan); +static void btbuildCallback(Relation index, + HeapTuple htup, + Datum *attdata, + char *nulls, + bool tupleIsAlive, + void *state); + + +/* + * AtEOXact_nbtree() --- clean up nbtree subsystem at xact abort or commit. + */ +void +AtEOXact_nbtree(void) +{ + /* + * Note: these actions should only be necessary during xact abort; but + * they can't hurt during a commit. + */ + + /* If we were building a btree, we ain't anymore. */ + BuildingBtree = false; +} + /* * btbuild() -- build a new btree index. 
@@ -56,42 +95,10 @@ btbuild(PG_FUNCTION_ARGS) Relation heap = (Relation) PG_GETARG_POINTER(0); Relation index = (Relation) PG_GETARG_POINTER(1); IndexInfo *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2); - Node *oldPred = (Node *) PG_GETARG_POINTER(3); -#ifdef NOT_USED - IndexStrategy istrat = (IndexStrategy) PG_GETARG_POINTER(4); -#endif - HeapScanDesc hscan; - HeapTuple htup; - IndexTuple itup; - TupleDesc htupdesc, - itupdesc; - Datum attdata[INDEX_MAX_KEYS]; - char nulls[INDEX_MAX_KEYS]; - double nhtups, - nitups; - Node *pred = indexInfo->ii_Predicate; -#ifndef OMIT_PARTIAL_INDEX - TupleTable tupleTable; - TupleTableSlot *slot; -#endif - ExprContext *econtext; - InsertIndexResult res = NULL; - BTSpool *spool = NULL; - BTItem btitem; - bool usefast; - Snapshot snapshot; - TransactionId XmaxRecent; + double reltuples; + BTBuildState buildstate; - /* - * spool2 is needed only when the index is an unique index. Dead - * tuples are put into spool2 instead of spool in order to avoid - * uniqueness check. - */ - BTSpool *spool2 = NULL; - bool tupleIsAlive; - int dead_count; - - /* note that this is a new btree */ + /* set flag to disable locking */ BuildingBtree = true; /* @@ -100,220 +107,63 @@ btbuild(PG_FUNCTION_ARGS) * look harder at this. (there is some kind of incremental processing * going on there.) -- pma 08/29/95 */ - usefast = (FastBuild && IsNormalProcessingMode()); + buildstate.usefast = (FastBuild && IsNormalProcessingMode()); + buildstate.isUnique = indexInfo->ii_Unique; + buildstate.haveDead = false; + buildstate.heapRel = heap; + buildstate.spool = NULL; + buildstate.spool2 = NULL; + buildstate.indtuples = 0; #ifdef BTREE_BUILD_STATS if (Show_btree_build_stats) ResetUsage(); #endif /* BTREE_BUILD_STATS */ - /* initialize the btree index metadata page (if this is a new index) */ - if (oldPred == NULL) - _bt_metapinit(index); - - /* get tuple descriptors for heap and index relations */ - htupdesc = RelationGetDescr(heap); - itupdesc = RelationGetDescr(index); - /* - * If this is a predicate (partial) index, we will need to evaluate - * the predicate using ExecQual, which requires the current tuple to - * be in a slot of a TupleTable. In addition, ExecQual must have an - * ExprContext referring to that slot. Here, we initialize dummy - * TupleTable and ExprContext objects for this purpose. --Nels, Feb 92 - * - * We construct the ExprContext anyway since we need a per-tuple - * temporary memory context for function evaluation -- tgl July 00 + * We expect to be called exactly once for any index relation. If + * that's not the case, big trouble's what we have. */ -#ifndef OMIT_PARTIAL_INDEX - if (pred != NULL || oldPred != NULL) - { - tupleTable = ExecCreateTupleTable(1); - slot = ExecAllocTableSlot(tupleTable); - ExecSetSlotDescriptor(slot, htupdesc, false); - - /* - * we never want to use sort/build if we are extending an existing - * partial index -- it works by inserting the newly-qualifying - * tuples into the existing index. (sort/build would overwrite the - * existing index with one consisting of the newly-qualifying - * tuples.) 
- */ - usefast = false; - } - else - { - tupleTable = NULL; - slot = NULL; - } - econtext = MakeExprContext(slot, TransactionCommandContext); -#else - econtext = MakeExprContext(NULL, TransactionCommandContext); -#endif /* OMIT_PARTIAL_INDEX */ + if (RelationGetNumberOfBlocks(index) != 0) + elog(ERROR, "%s already contains data", + RelationGetRelationName(index)); - /* build the index */ - nhtups = nitups = 0.0; + /* initialize the btree index metadata page */ + _bt_metapinit(index); - if (usefast) + if (buildstate.usefast) { - spool = _bt_spoolinit(index, indexInfo->ii_Unique); - + buildstate.spool = _bt_spoolinit(index, indexInfo->ii_Unique); /* - * Different from spool,the uniqueness isn't checked for spool2. + * Different from spool, the uniqueness isn't checked for spool2. */ if (indexInfo->ii_Unique) - spool2 = _bt_spoolinit(index, false); + buildstate.spool2 = _bt_spoolinit(index, false); } - /* start a heap scan */ - dead_count = 0; - snapshot = (IsBootstrapProcessingMode() ? SnapshotNow : SnapshotAny); - hscan = heap_beginscan(heap, 0, snapshot, 0, (ScanKey) NULL); - XmaxRecent = 0; - if (snapshot == SnapshotAny) - GetXmaxRecent(&XmaxRecent); - - while (HeapTupleIsValid(htup = heap_getnext(hscan, 0))) - { - if (snapshot == SnapshotAny) - { - tupleIsAlive = HeapTupleSatisfiesNow(htup->t_data); - if (!tupleIsAlive) - { - if ((htup->t_data->t_infomask & HEAP_XMIN_INVALID) != 0) - continue; - if (htup->t_data->t_infomask & HEAP_XMAX_COMMITTED && - htup->t_data->t_xmax < XmaxRecent) - continue; - } - } - else - tupleIsAlive = true; - - MemoryContextReset(econtext->ecxt_per_tuple_memory); - - nhtups += 1.0; - -#ifndef OMIT_PARTIAL_INDEX - - /* - * If oldPred != NULL, this is an EXTEND INDEX command, so skip - * this tuple if it was already in the existing partial index - */ - if (oldPred != NULL) - { - slot->val = htup; - if (ExecQual((List *) oldPred, econtext, false)) - { - nitups += 1.0; - continue; - } - } - - /* - * Skip this tuple if it doesn't satisfy the partial-index - * predicate - */ - if (pred != NULL) - { - slot->val = htup; - if (!ExecQual((List *) pred, econtext, false)) - continue; - } -#endif /* OMIT_PARTIAL_INDEX */ - - nitups += 1.0; - - /* - * For the current heap tuple, extract all the attributes we use - * in this index, and note which are null. - */ - FormIndexDatum(indexInfo, - htup, - htupdesc, - econtext->ecxt_per_tuple_memory, - attdata, - nulls); - - /* form an index tuple and point it at the heap tuple */ - itup = index_formtuple(itupdesc, attdata, nulls); - - /* - * If the single index key is null, we don't insert it into the - * index. Btrees support scans on <, <=, =, >=, and >. Relational - * algebra says that A op B (where op is one of the operators - * above) returns null if either A or B is null. This means that - * no qualification used in an index scan could ever return true - * on a null attribute. It also means that indices can't be used - * by ISNULL or NOTNULL scans, but that's an artifact of the - * strategy map architecture chosen in 1986, not of the way nulls - * are handled here. - */ - - /* - * New comments: NULLs handling. While we can't do NULL - * comparison, we can follow simple rule for ordering items on - * btree pages - NULLs greater NOT_NULLs and NULL = NULL is TRUE. - * Sure, it's just rule for placing/finding items and no more - - * keytest'll return FALSE for a = 5 for items having 'a' isNULL. - * Look at _bt_compare for how it works. 
- vadim 03/23/97 - * - * if (itup->t_info & INDEX_NULL_MASK) { pfree(itup); continue; } - */ - - itup->t_tid = htup->t_self; - btitem = _bt_formitem(itup); - - /* - * if we are doing bottom-up btree build, we insert the index into - * a spool file for subsequent processing. otherwise, we insert - * into the btree. - */ - if (usefast) - { - if (tupleIsAlive || !spool2) - _bt_spool(btitem, spool); - else -/* dead tuples are put into spool2 */ - { - dead_count++; - _bt_spool(btitem, spool2); - } - } - else - res = _bt_doinsert(index, btitem, indexInfo->ii_Unique, heap); - - pfree(btitem); - pfree(itup); - if (res) - pfree(res); - } + /* do the heap scan */ + reltuples = IndexBuildHeapScan(heap, index, indexInfo, + btbuildCallback, (void *) &buildstate); /* okay, all heap tuples are indexed */ - heap_endscan(hscan); - if (spool2 && !dead_count) /* spool2 was found to be unnecessary */ + if (buildstate.spool2 && !buildstate.haveDead) { - _bt_spooldestroy(spool2); - spool2 = NULL; + /* spool2 turns out to be unnecessary */ + _bt_spooldestroy(buildstate.spool2); + buildstate.spool2 = NULL; } -#ifndef OMIT_PARTIAL_INDEX - if (pred != NULL || oldPred != NULL) - ExecDropTupleTable(tupleTable, true); -#endif /* OMIT_PARTIAL_INDEX */ - FreeExprContext(econtext); - /* * if we are doing bottom-up btree build, finish the build by (1) * completing the sort of the spool file, (2) inserting the sorted * tuples into btree pages and (3) building the upper levels. */ - if (usefast) + if (buildstate.usefast) { - _bt_leafbuild(spool, spool2); - _bt_spooldestroy(spool); - if (spool2) - _bt_spooldestroy(spool2); + _bt_leafbuild(buildstate.spool, buildstate.spool2); + _bt_spooldestroy(buildstate.spool); + if (buildstate.spool2) + _bt_spooldestroy(buildstate.spool2); } #ifdef BTREE_BUILD_STATS @@ -325,6 +175,9 @@ btbuild(PG_FUNCTION_ARGS) } #endif /* BTREE_BUILD_STATS */ + /* all done */ + BuildingBtree = false; + /* * Since we just counted the tuples in the heap, we update its stats * in pg_class to guarantee that the planner takes advantage of the @@ -343,20 +196,63 @@ btbuild(PG_FUNCTION_ARGS) heap_close(heap, NoLock); index_close(index); - UpdateStats(hrelid, nhtups); - UpdateStats(irelid, nitups); - if (oldPred != NULL) + UpdateStats(hrelid, reltuples); + UpdateStats(irelid, buildstate.indtuples); + } + + PG_RETURN_VOID(); +} + +/* + * Per-tuple callback from IndexBuildHeapScan + */ +static void +btbuildCallback(Relation index, + HeapTuple htup, + Datum *attdata, + char *nulls, + bool tupleIsAlive, + void *state) +{ + BTBuildState *buildstate = (BTBuildState *) state; + IndexTuple itup; + BTItem btitem; + InsertIndexResult res; + + /* form an index tuple and point it at the heap tuple */ + itup = index_formtuple(RelationGetDescr(index), attdata, nulls); + itup->t_tid = htup->t_self; + + btitem = _bt_formitem(itup); + + /* + * if we are doing bottom-up btree build, we insert the index into + * a spool file for subsequent processing. otherwise, we insert + * into the btree. 
+ */ + if (buildstate->usefast) + { + if (tupleIsAlive || buildstate->spool2 == NULL) + _bt_spool(btitem, buildstate->spool); + else { - if (nitups == nhtups) - pred = NULL; - UpdateIndexPredicate(irelid, oldPred, pred); + /* dead tuples are put into spool2 */ + buildstate->haveDead = true; + _bt_spool(btitem, buildstate->spool2); } } + else + { + res = _bt_doinsert(index, btitem, + buildstate->isUnique, buildstate->heapRel); + if (res) + pfree(res); + } - /* all done */ - BuildingBtree = false; + buildstate->indtuples += 1; - PG_RETURN_VOID(); + pfree(btitem); + pfree(itup); } /* @@ -423,8 +319,10 @@ btgettuple(PG_FUNCTION_ARGS) /* * Save heap TID to use it in _bt_restscan. Then release the read - * lock on the buffer so that we aren't blocking other backends. NOTE: - * we do keep the pin on the buffer! + * lock on the buffer so that we aren't blocking other backends. + * + * NOTE: we do keep the pin on the buffer! This is essential to ensure + * that someone else doesn't delete the index entry we are stopped on. */ if (res) { @@ -451,9 +349,6 @@ btbeginscan(PG_FUNCTION_ARGS) /* get the scan */ scan = RelationGetIndexScan(rel, fromEnd, keysz, scankey); - /* register scan in case we change pages it's using */ - _bt_regscan(scan); - PG_RETURN_POINTER(scan); } @@ -571,8 +466,6 @@ btendscan(PG_FUNCTION_ARGS) pfree(so->keyData); pfree(so); - _bt_dropscan(scan); - PG_RETURN_VOID(); } @@ -640,20 +533,127 @@ btrestrpos(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } -/* stubs */ +/* + * Bulk deletion of all index entries pointing to a set of heap tuples. + * The set of target tuples is specified via a callback routine that tells + * whether any given heap tuple (identified by ItemPointer) is being deleted. + * + * Result: a palloc'd struct containing statistical info for VACUUM displays. + */ Datum -btdelete(PG_FUNCTION_ARGS) +btbulkdelete(PG_FUNCTION_ARGS) { Relation rel = (Relation) PG_GETARG_POINTER(0); - ItemPointer tid = (ItemPointer) PG_GETARG_POINTER(1); + IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(1); + void *callback_state = (void *) PG_GETARG_POINTER(2); + IndexBulkDeleteResult *result; + BlockNumber num_pages; + double tuples_removed; + double num_index_tuples; + RetrieveIndexResult res; + IndexScanDesc scan; + BTScanOpaque so; + ItemPointer current; + + tuples_removed = 0; + num_index_tuples = 0; + + /* + * We use a standard IndexScanDesc scan object, but to speed up the loop, + * we skip most of the wrapper layers of index_getnext and instead call + * _bt_step directly. This implies holding buffer lock on a target page + * throughout the loop over the page's tuples. Initially, we have a read + * lock acquired by _bt_step when we stepped onto the page. If we find + * a tuple we need to delete, we trade in the read lock for an exclusive + * write lock; after that, we hold the write lock until we step off the + * page (fortunately, _bt_relbuf doesn't care which kind of lock it's + * releasing). This should minimize the amount of work needed per page. 
+ */ + scan = index_beginscan(rel, false, 0, (ScanKey) NULL); + so = (BTScanOpaque) scan->opaque; + current = &(scan->currentItemData); - /* adjust any active scans that will be affected by this deletion */ - _bt_adjscans(rel, tid); + /* Use _bt_first to get started, then _bt_step to remaining tuples */ + res = _bt_first(scan, ForwardScanDirection); - /* delete the data from the page */ - _bt_pagedel(rel, tid); + if (res != NULL) + { + Buffer buf; + BlockNumber lockedBlock = InvalidBlockNumber; - PG_RETURN_VOID(); + pfree(res); + /* we have the buffer pinned and locked */ + buf = so->btso_curbuf; + Assert(BufferIsValid(buf)); + + do + { + Page page; + BlockNumber blkno; + OffsetNumber offnum; + BTItem btitem; + IndexTuple itup; + ItemPointer htup; + + /* current is the next index tuple */ + blkno = ItemPointerGetBlockNumber(current); + offnum = ItemPointerGetOffsetNumber(current); + page = BufferGetPage(buf); + btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum)); + itup = &btitem->bti_itup; + htup = &(itup->t_tid); + + if (callback(htup, callback_state)) + { + /* + * If this is first deletion on this page, trade in read + * lock for a really-exclusive write lock. Then, step back + * one and re-examine the item, because someone else might + * have inserted an item while we weren't holding the lock! + */ + if (blkno != lockedBlock) + { + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + LockBufferForCleanup(buf); + lockedBlock = blkno; + } + else + { + /* Delete the item from the page */ + _bt_itemdel(rel, buf, current); + + /* Mark buffer dirty, but keep the lock and pin */ + WriteNoReleaseBuffer(buf); + + tuples_removed += 1; + } + + /* + * We need to back up the scan one item so that the next + * cycle will re-examine the same offnum on this page. + * + * For now, just hack the current-item index. Will need + * to be smarter when deletion includes removal of empty + * index pages. + */ + current->ip_posid--; + } + else + num_index_tuples += 1; + } while (_bt_step(scan, &buf, ForwardScanDirection)); + } + + index_endscan(scan); + + /* return statistics */ + num_pages = RelationGetNumberOfBlocks(rel); + + result = (IndexBulkDeleteResult *) palloc(sizeof(IndexBulkDeleteResult)); + result->num_pages = num_pages; + result->tuples_removed = tuples_removed; + result->num_index_tuples = num_index_tuples; + + PG_RETURN_POINTER(result); } /* @@ -676,7 +676,7 @@ _bt_restscan(IndexScanDesc scan) /* * Get back the read lock we were holding on the buffer. (We still - * have a reference-count pin on it, though.) + * have a reference-count pin on it, so need not get that.) */ LockBuffer(buf, BT_READ); @@ -729,7 +729,7 @@ _bt_restscan(IndexScanDesc scan) "\n\tRecreate index %s.", RelationGetRelationName(rel)); blkno = opaque->btpo_next; - _bt_relbuf(rel, buf, BT_READ); + _bt_relbuf(rel, buf); buf = _bt_getbuf(rel, blkno, BT_READ); page = BufferGetPage(buf); maxoff = PageGetMaxOffsetNumber(page); diff --git a/src/backend/access/nbtree/nbtscan.c b/src/backend/access/nbtree/nbtscan.c deleted file mode 100644 index e07914b3440..00000000000 --- a/src/backend/access/nbtree/nbtscan.c +++ /dev/null @@ -1,224 +0,0 @@ -/*------------------------------------------------------------------------- - * - * btscan.c - * manage scans on btrees. 
- * - * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group - * Portions Copyright (c) 1994, Regents of the University of California - * - * - * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/nbtree/Attic/nbtscan.c,v 1.33 2001/01/24 19:42:48 momjian Exp $ - * - * - * NOTES - * Because we can be doing an index scan on a relation while we update - * it, we need to avoid missing data that moves around in the index. - * Insertions and page splits are no problem because _bt_restscan() - * can figure out where the current item moved to, but if a deletion - * happens at or before the current scan position, we'd better do - * something to stay in sync. - * - * The routines in this file handle the problem for deletions issued - * by the current backend. Currently, that's all we need, since - * deletions are only done by VACUUM and it gets an exclusive lock. - * - * The scheme is to manage a list of active scans in the current backend. - * Whenever we remove a record from an index, we check the list of active - * scans to see if any has been affected. A scan is affected only if it - * is on the same relation, and the same page, as the update. - * - *------------------------------------------------------------------------- - */ - -#include "postgres.h" - -#include "access/nbtree.h" - -typedef struct BTScanListData -{ - IndexScanDesc btsl_scan; - struct BTScanListData *btsl_next; -} BTScanListData; - -typedef BTScanListData *BTScanList; - -static BTScanList BTScans = (BTScanList) NULL; - -static void _bt_scandel(IndexScanDesc scan, BlockNumber blkno, OffsetNumber offno); - -/* - * AtEOXact_nbtree() --- clean up nbtree subsystem at xact abort or commit. - * - * This is here because it needs to touch this module's static var BTScans. - */ -void -AtEOXact_nbtree(void) -{ - - /* - * Note: these actions should only be necessary during xact abort; but - * they can't hurt during a commit. - */ - - /* - * Reset the active-scans list to empty. We do not need to free the - * list elements, because they're all palloc()'d, so they'll go away - * at end of transaction anyway. - */ - BTScans = NULL; - - /* If we were building a btree, we ain't anymore. */ - BuildingBtree = false; -} - -/* - * _bt_regscan() -- register a new scan. 
- */ -void -_bt_regscan(IndexScanDesc scan) -{ - BTScanList new_el; - - new_el = (BTScanList) palloc(sizeof(BTScanListData)); - new_el->btsl_scan = scan; - new_el->btsl_next = BTScans; - BTScans = new_el; -} - -/* - * _bt_dropscan() -- drop a scan from the scan list - */ -void -_bt_dropscan(IndexScanDesc scan) -{ - BTScanList chk, - last; - - last = (BTScanList) NULL; - for (chk = BTScans; - chk != (BTScanList) NULL && chk->btsl_scan != scan; - chk = chk->btsl_next) - last = chk; - - if (chk == (BTScanList) NULL) - elog(ERROR, "btree scan list trashed; can't find 0x%p", (void *) scan); - - if (last == (BTScanList) NULL) - BTScans = chk->btsl_next; - else - last->btsl_next = chk->btsl_next; - - pfree(chk); -} - -/* - * _bt_adjscans() -- adjust all scans in the scan list to compensate - * for a given deletion - */ -void -_bt_adjscans(Relation rel, ItemPointer tid) -{ - BTScanList l; - Oid relid; - - relid = RelationGetRelid(rel); - for (l = BTScans; l != (BTScanList) NULL; l = l->btsl_next) - { - if (relid == RelationGetRelid(l->btsl_scan->relation)) - _bt_scandel(l->btsl_scan, - ItemPointerGetBlockNumber(tid), - ItemPointerGetOffsetNumber(tid)); - } -} - -/* - * _bt_scandel() -- adjust a single scan on deletion - * - */ -static void -_bt_scandel(IndexScanDesc scan, BlockNumber blkno, OffsetNumber offno) -{ - ItemPointer current; - Buffer buf; - BTScanOpaque so; - OffsetNumber start; - Page page; - BTPageOpaque opaque; - - so = (BTScanOpaque) scan->opaque; - buf = so->btso_curbuf; - - current = &(scan->currentItemData); - if (ItemPointerIsValid(current) - && ItemPointerGetBlockNumber(current) == blkno - && ItemPointerGetOffsetNumber(current) >= offno) - { - page = BufferGetPage(buf); - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - start = P_FIRSTDATAKEY(opaque); - if (ItemPointerGetOffsetNumber(current) == start) - ItemPointerSetInvalid(&(so->curHeapIptr)); - else - { - - /* - * We have to lock buffer before _bt_step and unlock it after - * that. 
- */ - LockBuffer(buf, BT_READ); - _bt_step(scan, &buf, BackwardScanDirection); - if (ItemPointerIsValid(current)) - { - Page pg = BufferGetPage(buf); - BTItem btitem = (BTItem) PageGetItem(pg, - PageGetItemId(pg, ItemPointerGetOffsetNumber(current))); - - so->curHeapIptr = btitem->bti_itup.t_tid; - LockBuffer(buf, BUFFER_LOCK_UNLOCK); - } - } - } - - current = &(scan->currentMarkData); - if (ItemPointerIsValid(current) - && ItemPointerGetBlockNumber(current) == blkno - && ItemPointerGetOffsetNumber(current) >= offno) - { - page = BufferGetPage(so->btso_mrkbuf); - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - start = P_FIRSTDATAKEY(opaque); - - if (ItemPointerGetOffsetNumber(current) == start) - ItemPointerSetInvalid(&(so->mrkHeapIptr)); - else - { - ItemPointerData tmp; - - tmp = *current; - *current = scan->currentItemData; - scan->currentItemData = tmp; - so->btso_curbuf = so->btso_mrkbuf; - so->btso_mrkbuf = buf; - buf = so->btso_curbuf; - LockBuffer(buf, BT_READ); /* as above */ - - _bt_step(scan, &buf, BackwardScanDirection); - - so->btso_curbuf = so->btso_mrkbuf; - so->btso_mrkbuf = buf; - tmp = *current; - *current = scan->currentItemData; - scan->currentItemData = tmp; - if (ItemPointerIsValid(current)) - { - Page pg = BufferGetPage(buf); - BTItem btitem = (BTItem) PageGetItem(pg, - PageGetItemId(pg, ItemPointerGetOffsetNumber(current))); - - so->mrkHeapIptr = btitem->bti_itup.t_tid; - LockBuffer(buf, BUFFER_LOCK_UNLOCK); /* as above */ - } - } - } -} diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 59bf5358e4f..295387ed517 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.66 2001/03/23 04:49:51 momjian Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.67 2001/07/15 22:48:16 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -94,7 +94,7 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, new_stack->bts_parent = stack_in; /* drop the read lock on the parent page, acquire one on the child */ - _bt_relbuf(rel, *bufP, BT_READ); + _bt_relbuf(rel, *bufP); *bufP = _bt_getbuf(rel, blkno, BT_READ); /* @@ -155,7 +155,7 @@ _bt_moveright(Relation rel, /* step right one page */ BlockNumber rblkno = opaque->btpo_next; - _bt_relbuf(rel, buf, access); + _bt_relbuf(rel, buf); buf = _bt_getbuf(rel, rblkno, access); page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); @@ -406,7 +406,7 @@ _bt_next(IndexScanDesc scan, ScanDirection dir) /* No more items, so close down the current-item info */ ItemPointerSetInvalid(current); so->btso_curbuf = InvalidBuffer; - _bt_relbuf(rel, buf, BT_READ); + _bt_relbuf(rel, buf); return (RetrieveIndexResult) NULL; } @@ -760,7 +760,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) nomatches: ItemPointerSetInvalid(current); so->btso_curbuf = InvalidBuffer; - _bt_relbuf(rel, buf, BT_READ); + _bt_relbuf(rel, buf); res = (RetrieveIndexResult) NULL; } @@ -815,14 +815,14 @@ _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) /* if we're at end of scan, release the buffer and return */ if (P_RIGHTMOST(opaque)) { - _bt_relbuf(rel, *bufP, BT_READ); + _bt_relbuf(rel, *bufP); ItemPointerSetInvalid(current); *bufP = so->btso_curbuf = InvalidBuffer; return false; } /* step right one page */ blkno = 
opaque->btpo_next; - _bt_relbuf(rel, *bufP, BT_READ); + _bt_relbuf(rel, *bufP); *bufP = _bt_getbuf(rel, blkno, BT_READ); page = BufferGetPage(*bufP); opaque = (BTPageOpaque) PageGetSpecialPointer(page); @@ -846,7 +846,7 @@ _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) /* if we're at end of scan, release the buffer and return */ if (P_LEFTMOST(opaque)) { - _bt_relbuf(rel, *bufP, BT_READ); + _bt_relbuf(rel, *bufP); ItemPointerSetInvalid(current); *bufP = so->btso_curbuf = InvalidBuffer; return false; @@ -854,7 +854,7 @@ _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) /* step left */ obknum = BufferGetBlockNumber(*bufP); blkno = opaque->btpo_prev; - _bt_relbuf(rel, *bufP, BT_READ); + _bt_relbuf(rel, *bufP); *bufP = _bt_getbuf(rel, blkno, BT_READ); page = BufferGetPage(*bufP); opaque = (BTPageOpaque) PageGetSpecialPointer(page); @@ -868,7 +868,7 @@ _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) while (opaque->btpo_next != obknum) { blkno = opaque->btpo_next; - _bt_relbuf(rel, *bufP, BT_READ); + _bt_relbuf(rel, *bufP); *bufP = _bt_getbuf(rel, blkno, BT_READ); page = BufferGetPage(*bufP); opaque = (BTPageOpaque) PageGetSpecialPointer(page); @@ -952,7 +952,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) itup = &(btitem->bti_itup); blkno = ItemPointerGetBlockNumber(&(itup->t_tid)); - _bt_relbuf(rel, buf, BT_READ); + _bt_relbuf(rel, buf); buf = _bt_getbuf(rel, blkno, BT_READ); page = BufferGetPage(buf); @@ -968,7 +968,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) do { blkno = opaque->btpo_next; - _bt_relbuf(rel, buf, BT_READ); + _bt_relbuf(rel, buf); buf = _bt_getbuf(rel, blkno, BT_READ); page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); @@ -1035,7 +1035,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) /* no tuples in the index match this scan key */ ItemPointerSetInvalid(current); so->btso_curbuf = InvalidBuffer; - _bt_relbuf(rel, buf, BT_READ); + _bt_relbuf(rel, buf); res = (RetrieveIndexResult) NULL; } diff --git a/src/backend/access/rtree/rtree.c b/src/backend/access/rtree/rtree.c index a8c6a13ea3c..21831ef5d61 100644 --- a/src/backend/access/rtree/rtree.c +++ b/src/backend/access/rtree/rtree.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/rtree/Attic/rtree.c,v 1.62 2001/05/07 00:43:16 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/rtree/Attic/rtree.c,v 1.63 2001/07/15 22:48:16 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -62,7 +62,20 @@ typedef struct RTSTATE FmgrInfo interFn; /* intersection function */ } RTSTATE; +/* Working state for rtbuild and its callback */ +typedef struct +{ + RTSTATE rtState; + double indtuples; +} RTBuildState; + /* non-export function prototypes */ +static void rtbuildCallback(Relation index, + HeapTuple htup, + Datum *attdata, + char *nulls, + bool tupleIsAlive, + void *state); static InsertIndexResult rtdoinsert(Relation r, IndexTuple itup, RTSTATE *rtstate); static void rttighten(Relation r, RTSTACK *stk, Datum datum, int att_size, @@ -81,165 +94,44 @@ static int nospace(Page p, IndexTuple it); static void initRtstate(RTSTATE *rtstate, Relation index); +/* + * routine to build an index. 
Basically calls insert over and over + */ Datum rtbuild(PG_FUNCTION_ARGS) { Relation heap = (Relation) PG_GETARG_POINTER(0); Relation index = (Relation) PG_GETARG_POINTER(1); IndexInfo *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2); - Node *oldPred = (Node *) PG_GETARG_POINTER(3); - -#ifdef NOT_USED - IndexStrategy istrat = (IndexStrategy) PG_GETARG_POINTER(4); - -#endif - HeapScanDesc hscan; - HeapTuple htup; - IndexTuple itup; - TupleDesc htupdesc, - itupdesc; - Datum attdata[INDEX_MAX_KEYS]; - char nulls[INDEX_MAX_KEYS]; - double nhtups, - nitups; - Node *pred = indexInfo->ii_Predicate; - -#ifndef OMIT_PARTIAL_INDEX - TupleTable tupleTable; - TupleTableSlot *slot; + double reltuples; + RTBuildState buildstate; + Buffer buffer; -#endif - ExprContext *econtext; - InsertIndexResult res = NULL; - Buffer buffer = InvalidBuffer; - RTSTATE rtState; + /* no locking is needed */ - initRtstate(&rtState, index); + initRtstate(&buildstate.rtState, index); /* * We expect to be called exactly once for any index relation. If * that's not the case, big trouble's what we have. */ - if (oldPred == NULL && RelationGetNumberOfBlocks(index) != 0) - elog(ERROR, "%s already contains data", RelationGetRelationName(index)); - - /* initialize the root page (if this is a new index) */ - if (oldPred == NULL) - { - buffer = ReadBuffer(index, P_NEW); - RTInitBuffer(buffer, F_LEAF); - WriteBuffer(buffer); - } - - /* get tuple descriptors for heap and index relations */ - htupdesc = RelationGetDescr(heap); - itupdesc = RelationGetDescr(index); - - /* - * If this is a predicate (partial) index, we will need to evaluate - * the predicate using ExecQual, which requires the current tuple to - * be in a slot of a TupleTable. In addition, ExecQual must have an - * ExprContext referring to that slot. Here, we initialize dummy - * TupleTable and ExprContext objects for this purpose. --Nels, Feb 92 - * - * We construct the ExprContext anyway since we need a per-tuple - * temporary memory context for function evaluation -- tgl July 00 - */ -#ifndef OMIT_PARTIAL_INDEX - if (pred != NULL || oldPred != NULL) - { - tupleTable = ExecCreateTupleTable(1); - slot = ExecAllocTableSlot(tupleTable); - ExecSetSlotDescriptor(slot, htupdesc, false); - } - else - { - tupleTable = NULL; - slot = NULL; - } - econtext = MakeExprContext(slot, TransactionCommandContext); -#else - econtext = MakeExprContext(NULL, TransactionCommandContext); -#endif /* OMIT_PARTIAL_INDEX */ - - /* count the tuples as we insert them */ - nhtups = nitups = 0.0; - - /* start a heap scan */ - hscan = heap_beginscan(heap, 0, SnapshotNow, 0, (ScanKey) NULL); - - while (HeapTupleIsValid(htup = heap_getnext(hscan, 0))) - { - MemoryContextReset(econtext->ecxt_per_tuple_memory); + if (RelationGetNumberOfBlocks(index) != 0) + elog(ERROR, "%s already contains data", + RelationGetRelationName(index)); - nhtups += 1.0; - -#ifndef OMIT_PARTIAL_INDEX - - /* - * If oldPred != NULL, this is an EXTEND INDEX command, so skip - * this tuple if it was already in the existing partial index - */ - if (oldPred != NULL) - { - slot->val = htup; - if (ExecQual((List *) oldPred, econtext, false)) - { - nitups += 1.0; - continue; - } - } - - /* - * Skip this tuple if it doesn't satisfy the partial-index - * predicate - */ - if (pred != NULL) - { - slot->val = htup; - if (!ExecQual((List *) pred, econtext, false)) - continue; - } -#endif /* OMIT_PARTIAL_INDEX */ - - nitups += 1.0; - - /* - * For the current heap tuple, extract all the attributes we use - * in this index, and note which are null. 
- */ - FormIndexDatum(indexInfo, - htup, - htupdesc, - econtext->ecxt_per_tuple_memory, - attdata, - nulls); - - /* form an index tuple and point it at the heap tuple */ - itup = index_formtuple(itupdesc, attdata, nulls); - itup->t_tid = htup->t_self; + /* initialize the root page */ + buffer = ReadBuffer(index, P_NEW); + RTInitBuffer(buffer, F_LEAF); + WriteBuffer(buffer); - /* - * Since we already have the index relation locked, we call - * rtdoinsert directly. Normal access method calls dispatch - * through rtinsert, which locks the relation for write. This is - * the right thing to do if you're inserting single tups, but not - * when you're initializing the whole index at once. - */ + /* build the index */ + buildstate.indtuples = 0; - res = rtdoinsert(index, itup, &rtState); - pfree(itup); - pfree(res); - } + /* do the heap scan */ + reltuples = IndexBuildHeapScan(heap, index, indexInfo, + rtbuildCallback, (void *) &buildstate); /* okay, all heap tuples are indexed */ - heap_endscan(hscan); - -#ifndef OMIT_PARTIAL_INDEX - if (pred != NULL || oldPred != NULL) - ExecDropTupleTable(tupleTable, true); -#endif /* OMIT_PARTIAL_INDEX */ - FreeExprContext(econtext); /* * Since we just counted the tuples in the heap, we update its stats @@ -259,20 +151,57 @@ rtbuild(PG_FUNCTION_ARGS) heap_close(heap, NoLock); index_close(index); - UpdateStats(hrelid, nhtups); - UpdateStats(irelid, nitups); - if (oldPred != NULL) - { - if (nitups == nhtups) - pred = NULL; - UpdateIndexPredicate(irelid, oldPred, pred); - } + UpdateStats(hrelid, reltuples); + UpdateStats(irelid, buildstate.indtuples); } PG_RETURN_VOID(); } /* + * Per-tuple callback from IndexBuildHeapScan + */ +static void +rtbuildCallback(Relation index, + HeapTuple htup, + Datum *attdata, + char *nulls, + bool tupleIsAlive, + void *state) +{ + RTBuildState *buildstate = (RTBuildState *) state; + IndexTuple itup; + InsertIndexResult res; + + /* form an index tuple and point it at the heap tuple */ + itup = index_formtuple(RelationGetDescr(index), attdata, nulls); + itup->t_tid = htup->t_self; + + /* rtree indexes don't index nulls, see notes in rtinsert */ + if (IndexTupleHasNulls(itup)) + { + pfree(itup); + return; + } + + /* + * Since we already have the index relation locked, we call + * rtdoinsert directly. Normal access method calls dispatch + * through rtinsert, which locks the relation for write. This is + * the right thing to do if you're inserting single tups, but not + * when you're initializing the whole index at once. + */ + res = rtdoinsert(index, itup, &buildstate->rtState); + + if (res) + pfree(res); + + buildstate->indtuples += 1; + + pfree(itup); +} + +/* * rtinsert -- wrapper for rtree tuple insertion. * * This is the public interface routine for tuple insertion in rtrees. @@ -285,10 +214,8 @@ rtinsert(PG_FUNCTION_ARGS) Datum *datum = (Datum *) PG_GETARG_POINTER(1); char *nulls = (char *) PG_GETARG_POINTER(2); ItemPointer ht_ctid = (ItemPointer) PG_GETARG_POINTER(3); - #ifdef NOT_USED Relation heapRel = (Relation) PG_GETARG_POINTER(4); - #endif InsertIndexResult res; IndexTuple itup; @@ -297,12 +224,24 @@ rtinsert(PG_FUNCTION_ARGS) /* generate an index tuple */ itup = index_formtuple(RelationGetDescr(r), datum, nulls); itup->t_tid = *ht_ctid; + + /* + * Currently, rtrees do not support indexing NULLs; considerable + * infrastructure work would have to be done to do anything reasonable + * with a NULL. 
+ */ + if (IndexTupleHasNulls(itup)) + { + pfree(itup); + PG_RETURN_POINTER((InsertIndexResult) NULL); + } + initRtstate(&rtState, r); /* - * Notes in ExecUtils:ExecOpenIndices() - * - * RelationSetLockForWrite(r); + * Since rtree is not marked "amconcurrent" in pg_am, caller should + * have acquired exclusive lock on index relation. We need no locking + * here. */ res = rtdoinsert(r, itup, &rtState); @@ -1104,40 +1043,92 @@ freestack(RTSTACK *s) } } +/* + * Bulk deletion of all index entries pointing to a set of heap tuples. + * The set of target tuples is specified via a callback routine that tells + * whether any given heap tuple (identified by ItemPointer) is being deleted. + * + * Result: a palloc'd struct containing statistical info for VACUUM displays. + */ Datum -rtdelete(PG_FUNCTION_ARGS) +rtbulkdelete(PG_FUNCTION_ARGS) { - Relation r = (Relation) PG_GETARG_POINTER(0); - ItemPointer tid = (ItemPointer) PG_GETARG_POINTER(1); - BlockNumber blkno; - OffsetNumber offnum; - Buffer buf; - Page page; + Relation rel = (Relation) PG_GETARG_POINTER(0); + IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(1); + void *callback_state = (void *) PG_GETARG_POINTER(2); + IndexBulkDeleteResult *result; + BlockNumber num_pages; + double tuples_removed; + double num_index_tuples; + RetrieveIndexResult res; + IndexScanDesc iscan; + + tuples_removed = 0; + num_index_tuples = 0; /* - * Notes in ExecUtils:ExecOpenIndices() Also note that only vacuum - * deletes index tuples now... - * - * RelationSetLockForWrite(r); + * Since rtree is not marked "amconcurrent" in pg_am, caller should + * have acquired exclusive lock on index relation. We need no locking + * here. */ - blkno = ItemPointerGetBlockNumber(tid); - offnum = ItemPointerGetOffsetNumber(tid); + /* + * XXX generic implementation --- should be improved! 
+ */ - /* adjust any scans that will be affected by this deletion */ - rtadjscans(r, RTOP_DEL, blkno, offnum); + /* walk through the entire index */ + iscan = index_beginscan(rel, false, 0, (ScanKey) NULL); - /* delete the index tuple */ - buf = ReadBuffer(r, blkno); - page = BufferGetPage(buf); + while ((res = index_getnext(iscan, ForwardScanDirection)) + != (RetrieveIndexResult) NULL) + { + ItemPointer heapptr = &res->heap_iptr; - PageIndexTupleDelete(page, offnum); + if (callback(heapptr, callback_state)) + { + ItemPointer indexptr = &res->index_iptr; + BlockNumber blkno; + OffsetNumber offnum; + Buffer buf; + Page page; - WriteBuffer(buf); + blkno = ItemPointerGetBlockNumber(indexptr); + offnum = ItemPointerGetOffsetNumber(indexptr); - PG_RETURN_VOID(); + /* adjust any scans that will be affected by this deletion */ + /* (namely, my own scan) */ + rtadjscans(rel, RTOP_DEL, blkno, offnum); + + /* delete the index tuple */ + buf = ReadBuffer(rel, blkno); + page = BufferGetPage(buf); + + PageIndexTupleDelete(page, offnum); + + WriteBuffer(buf); + + tuples_removed += 1; + } + else + num_index_tuples += 1; + + pfree(res); + } + + index_endscan(iscan); + + /* return statistics */ + num_pages = RelationGetNumberOfBlocks(rel); + + result = (IndexBulkDeleteResult *) palloc(sizeof(IndexBulkDeleteResult)); + result->num_pages = num_pages; + result->tuples_removed = tuples_removed; + result->num_index_tuples = num_index_tuples; + + PG_RETURN_POINTER(result); } + static void initRtstate(RTSTATE *rtstate, Relation index) { diff --git a/src/backend/access/rtree/rtscan.c b/src/backend/access/rtree/rtscan.c index c9f1ab7b893..1311cfdc29a 100644 --- a/src/backend/access/rtree/rtscan.c +++ b/src/backend/access/rtree/rtscan.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/rtree/Attic/rtscan.c,v 1.37 2001/06/09 18:16:56 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/rtree/Attic/rtscan.c,v 1.38 2001/07/15 22:48:16 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -60,13 +60,8 @@ rtbeginscan(PG_FUNCTION_ARGS) ScanKey key = (ScanKey) PG_GETARG_POINTER(3); IndexScanDesc s; - /* - * Let index_beginscan does its work... - * - * RelationSetLockForRead(r); - */ - s = RelationGetIndexScan(r, fromEnd, nkeys, key); + rtregscan(s); PG_RETURN_POINTER(s); @@ -282,6 +277,27 @@ rtdropscan(IndexScanDesc s) pfree(l); } +/* + * AtEOXact_rtree() --- clean up rtree subsystem at xact abort or commit. + * + * This is here because it needs to touch this module's static var RTScans. + */ +void +AtEOXact_rtree(void) +{ + /* + * Note: these actions should only be necessary during xact abort; but + * they can't hurt during a commit. + */ + + /* + * Reset the active-scans list to empty. We do not need to free the + * list elements, because they're all palloc()'d, so they'll go away + * at end of transaction anyway. 
+ */ + RTScans = NULL; +} + void rtadjscans(Relation r, int op, BlockNumber blkno, OffsetNumber offnum) { diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 64671792315..d32a6dda978 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/transam/xact.c,v 1.106 2001/07/12 04:11:13 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/transam/xact.c,v 1.107 2001/07/15 22:48:16 tgl Exp $ * * NOTES * Transaction aborts can now occur two ways: @@ -156,7 +156,10 @@ #include <sys/time.h> +#include "access/gistscan.h" +#include "access/hash.h" #include "access/nbtree.h" +#include "access/rtree.h" #include "access/xact.h" #include "catalog/heap.h" #include "catalog/index.h" @@ -1040,7 +1043,10 @@ CommitTransaction(void) smgrDoPendingDeletes(true); AtEOXact_SPI(); + AtEOXact_gist(); + AtEOXact_hash(); AtEOXact_nbtree(); + AtEOXact_rtree(); AtCommit_Cache(); AtCommit_Locks(); AtEOXact_CatCache(true); @@ -1147,7 +1153,10 @@ AbortTransaction(void) smgrDoPendingDeletes(false); AtEOXact_SPI(); + AtEOXact_gist(); + AtEOXact_hash(); AtEOXact_nbtree(); + AtEOXact_rtree(); AtAbort_Cache(); AtEOXact_CatCache(false); AtAbort_Memory();
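Seen together, the btbulkdelete and rtbulkdelete routines above and the AtEOXact_* calls wired into xact.c replace the old one-tuple-at-a-time btdelete/rtdelete entry points: VACUUM now passes a predicate over heap item pointers, the access method makes a single pass over its own entries, and the result comes back as a palloc'd IndexBulkDeleteResult. A minimal standalone model of that contract, assuming invented stand-in types (ItemPtr, BulkDeleteResult) in place of the real ones:

#include <stdio.h>
#include <stdlib.h>

typedef int ItemPtr;            /* stand-in for ItemPointer */

/* returns nonzero if the heap tuple this entry points at is being deleted */
typedef int (*BulkDeleteCallback)(ItemPtr heapptr, void *state);

typedef struct
{
    double tuples_removed;
    double num_index_tuples;
} BulkDeleteResult;

/* plays the role of btbulkdelete/rtbulkdelete */
static BulkDeleteResult *
bulk_delete(ItemPtr *entries, int *nentries,
            BulkDeleteCallback callback, void *state)
{
    BulkDeleteResult *r = malloc(sizeof *r);
    int keep = 0;

    r->tuples_removed = 0;
    r->num_index_tuples = 0;

    for (int i = 0; i < *nentries; i++)
    {
        if (callback(entries[i], state))
            r->tuples_removed += 1;       /* drop this entry */
        else
        {
            entries[keep++] = entries[i]; /* keep this entry */
            r->num_index_tuples += 1;
        }
    }
    *nentries = keep;
    return r;
}

/* VACUUM's side: here, condemn even-numbered heap pointers */
static int
is_dead(ItemPtr heapptr, void *state)
{
    (void) state;
    return heapptr % 2 == 0;
}

int
main(void)
{
    ItemPtr index_entries[] = {1, 2, 3, 4, 5};
    int n = 5;
    BulkDeleteResult *r = bulk_delete(index_entries, &n, is_dead, NULL);

    printf("removed=%.0f kept=%.0f\n",
           r->tuples_removed, r->num_index_tuples);
    free(r);
    return 0;
}

The in-place compaction stands in for PageIndexTupleDelete; the real routines must also manage buffer locks (trading the read lock for LockBufferForCleanup in the btree case) and back the scan up over just-deleted slots, as the hunks above show.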