Diffstat (limited to 'src'): 61 files changed, 3440 insertions, 501 deletions
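
The change that recurs throughout the GIN, GiST, hash, and heap hunks below is purely mechanical: PageAddItem() gains one extra trailing boolean, which the index access methods pass as false and the heap WAL-redo paths pass as true. As a reading aid only (page, itup, offnum, rel are generic placeholders, not lifted from any one call site), the before/after call pattern is:

    /* 8.2-style call, as removed by this patch */
    off = PageAddItem(page, (Item) itup, IndexTupleSize(itup), offnum, false);

    /* 8.3-style call, as added by this patch; index AMs pass (..., false, false),
     * while heap_xlog_insert/heap_xlog_update below pass (..., true, true) */
    off = PageAddItem(page, (Item) itup, IndexTupleSize(itup), offnum, false, false);
    if (off == InvalidOffsetNumber)
        elog(ERROR, "failed to add item to index page in \"%s\"",
             RelationGetRelationName(rel));
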
diff --git a/src/backend/access/gin/ginentrypage.c b/src/backend/access/gin/ginentrypage.c index a5253da0211..70867ac40ba 100644 --- a/src/backend/access/gin/ginentrypage.c +++ b/src/backend/access/gin/ginentrypage.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gin/ginentrypage.c,v 1.8 2007/09/12 22:10:25 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/gin/ginentrypage.c,v 1.9 2007/09/20 17:56:30 tgl Exp $ *------------------------------------------------------------------------- */ @@ -359,7 +359,7 @@ entryPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off, XLogRecData **prd *prdata = rdata; data.updateBlkno = entryPreparePage(btree, page, off); - placed = PageAddItem(page, (Item) btree->entry, IndexTupleSize(btree->entry), off, false); + placed = PageAddItem(page, (Item) btree->entry, IndexTupleSize(btree->entry), off, false, false); if (placed != off) elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(btree->index)); @@ -488,7 +488,7 @@ entrySplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off, XLogR lsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData); } - if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false) == InvalidOffsetNumber) + if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(btree->index)); ptr += MAXALIGN(IndexTupleSize(itup)); @@ -563,11 +563,11 @@ entryFillRoot(GinBtree btree, Buffer root, Buffer lbuf, Buffer rbuf) page = BufferGetPage(root); itup = ginPageGetLinkItup(lbuf); - if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false) == InvalidOffsetNumber) + if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add item to index root page"); itup = ginPageGetLinkItup(rbuf); - if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false) == InvalidOffsetNumber) + if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add item to index root page"); } diff --git a/src/backend/access/gin/ginvacuum.c b/src/backend/access/gin/ginvacuum.c index 91f7f3e5f8b..1f26869d646 100644 --- a/src/backend/access/gin/ginvacuum.c +++ b/src/backend/access/gin/ginvacuum.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gin/ginvacuum.c,v 1.16 2007/09/12 22:10:25 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/gin/ginvacuum.c,v 1.17 2007/09/20 17:56:30 tgl Exp $ *------------------------------------------------------------------------- */ @@ -544,7 +544,7 @@ ginVacuumEntryPage(GinVacuumState *gvs, Buffer buffer, BlockNumber *roots, uint3 itup = GinFormTuple(&gvs->ginstate, value, GinGetPosting(itup), newN); PageIndexTupleDelete(tmppage, i); - if (PageAddItem(tmppage, (Item) itup, IndexTupleSize(itup), i, false) != i) + if (PageAddItem(tmppage, (Item) itup, IndexTupleSize(itup), i, false, false) != i) elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(gvs->index)); diff --git a/src/backend/access/gin/ginxlog.c b/src/backend/access/gin/ginxlog.c index db2d6b39336..bf2174c37c7 100644 --- 
a/src/backend/access/gin/ginxlog.c +++ b/src/backend/access/gin/ginxlog.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gin/ginxlog.c,v 1.8 2007/09/12 22:10:25 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/gin/ginxlog.c,v 1.9 2007/09/20 17:56:30 tgl Exp $ *------------------------------------------------------------------------- */ #include "postgres.h" @@ -199,7 +199,7 @@ ginRedoInsert(XLogRecPtr lsn, XLogRecord *record) itup = (IndexTuple) (XLogRecGetData(record) + sizeof(ginxlogInsert)); - if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), data->offset, false) == InvalidOffsetNumber) + if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), data->offset, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add item to index page in %u/%u/%u", data->node.spcNode, data->node.dbNode, data->node.relNode); @@ -281,7 +281,7 @@ ginRedoSplit(XLogRecPtr lsn, XLogRecord *record) for (i = 0; i < data->separator; i++) { - if (PageAddItem(lpage, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false) == InvalidOffsetNumber) + if (PageAddItem(lpage, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add item to index page in %u/%u/%u", data->node.spcNode, data->node.dbNode, data->node.relNode); itup = (IndexTuple) (((char *) itup) + MAXALIGN(IndexTupleSize(itup))); @@ -289,7 +289,7 @@ ginRedoSplit(XLogRecPtr lsn, XLogRecord *record) for (i = data->separator; i < data->nitem; i++) { - if (PageAddItem(rpage, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false) == InvalidOffsetNumber) + if (PageAddItem(rpage, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add item to index page in %u/%u/%u", data->node.spcNode, data->node.dbNode, data->node.relNode); itup = (IndexTuple) (((char *) itup) + MAXALIGN(IndexTupleSize(itup))); @@ -375,7 +375,7 @@ ginRedoVacuumPage(XLogRecPtr lsn, XLogRecord *record) for (i = 0; i < data->nitem; i++) { - if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false) == InvalidOffsetNumber) + if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add item to index page in %u/%u/%u", data->node.spcNode, data->node.dbNode, data->node.relNode); itup = (IndexTuple) (((char *) itup) + MAXALIGN(IndexTupleSize(itup))); diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c index fce9a94ebae..0c1b94d7d38 100644 --- a/src/backend/access/gist/gist.c +++ b/src/backend/access/gist/gist.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gist/gist.c,v 1.146 2007/09/12 22:10:25 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/gist/gist.c,v 1.147 2007/09/20 17:56:30 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -366,7 +366,7 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate) data = (char *) (ptr->list); for (i = 0; i < ptr->block.num; i++) { - if (PageAddItem(ptr->page, (Item) data, IndexTupleSize((IndexTuple) data), i + FirstOffsetNumber, false) == InvalidOffsetNumber) + if (PageAddItem(ptr->page, (Item) data, IndexTupleSize((IndexTuple) data), i + FirstOffsetNumber, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add item 
to index page in \"%s\"", RelationGetRelationName(state->r)); data += IndexTupleSize((IndexTuple) data); } diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c index 6d4f31d53b2..409377d1d14 100644 --- a/src/backend/access/gist/gistutil.c +++ b/src/backend/access/gist/gistutil.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gist/gistutil.c,v 1.23 2007/09/12 22:10:25 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/gist/gistutil.c,v 1.24 2007/09/20 17:56:30 tgl Exp $ *------------------------------------------------------------------------- */ #include "postgres.h" @@ -42,7 +42,7 @@ gistfillbuffer(Relation r, Page page, IndexTuple *itup, for (i = 0; i < len; i++) { l = PageAddItem(page, (Item) itup[i], IndexTupleSize(itup[i]), - off, false); + off, false, false); if (l == InvalidOffsetNumber) elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(r)); diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c index 0abd0197ad3..212995e7c57 100644 --- a/src/backend/access/gist/gistvacuum.c +++ b/src/backend/access/gist/gistvacuum.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gist/gistvacuum.c,v 1.31 2007/09/12 22:10:25 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/gist/gistvacuum.c,v 1.32 2007/09/20 17:56:30 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -201,7 +201,7 @@ vacuumSplitPage(GistVacuum *gv, Page tempPage, Buffer buffer, IndexTuple *addon, data = (char *) (ptr->list); for (i = 0; i < ptr->block.num; i++) { - if (PageAddItem(ptr->page, (Item) data, IndexTupleSize((IndexTuple) data), i + FirstOffsetNumber, false) == InvalidOffsetNumber) + if (PageAddItem(ptr->page, (Item) data, IndexTupleSize((IndexTuple) data), i + FirstOffsetNumber, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(gv->index)); data += IndexTupleSize((IndexTuple) data); } diff --git a/src/backend/access/hash/hashinsert.c b/src/backend/access/hash/hashinsert.c index c82ad0ad9fb..8cbf0294acf 100644 --- a/src/backend/access/hash/hashinsert.c +++ b/src/backend/access/hash/hashinsert.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/hash/hashinsert.c,v 1.46 2007/09/12 22:10:25 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/hash/hashinsert.c,v 1.47 2007/09/20 17:56:30 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -200,7 +200,7 @@ _hash_pgaddtup(Relation rel, page = BufferGetPage(buf); itup_off = OffsetNumberNext(PageGetMaxOffsetNumber(page)); - if (PageAddItem(page, (Item) itup, itemsize, itup_off, false) + if (PageAddItem(page, (Item) itup, itemsize, itup_off, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add index item to \"%s\"", RelationGetRelationName(rel)); diff --git a/src/backend/access/hash/hashovfl.c b/src/backend/access/hash/hashovfl.c index 7e87f308b26..e4ea24a62d1 100644 --- a/src/backend/access/hash/hashovfl.c +++ b/src/backend/access/hash/hashovfl.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/hash/hashovfl.c,v 1.59 2007/09/12 22:10:25 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/hash/hashovfl.c,v 1.60 2007/09/20 17:56:30 tgl Exp $ * * NOTES * Overflow 
pages look like ordinary relation pages. @@ -684,7 +684,7 @@ _hash_squeezebucket(Relation rel, * we have found room so insert on the "write" page. */ woffnum = OffsetNumberNext(PageGetMaxOffsetNumber(wpage)); - if (PageAddItem(wpage, (Item) itup, itemsz, woffnum, false) + if (PageAddItem(wpage, (Item) itup, itemsz, woffnum, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add index item to \"%s\"", RelationGetRelationName(rel)); diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c index 4b1450926d3..807dbed8a8c 100644 --- a/src/backend/access/hash/hashpage.c +++ b/src/backend/access/hash/hashpage.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.69 2007/09/12 22:10:25 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.70 2007/09/20 17:56:30 tgl Exp $ * * NOTES * Postgres hash pages look like ordinary relation pages. The opaque @@ -830,7 +830,7 @@ _hash_splitbucket(Relation rel, } noffnum = OffsetNumberNext(PageGetMaxOffsetNumber(npage)); - if (PageAddItem(npage, (Item) itup, itemsz, noffnum, false) + if (PageAddItem(npage, (Item) itup, itemsz, noffnum, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add index item to \"%s\"", RelationGetRelationName(rel)); diff --git a/src/backend/access/heap/Makefile b/src/backend/access/heap/Makefile index ac2401232bb..aff2847bab5 100644 --- a/src/backend/access/heap/Makefile +++ b/src/backend/access/heap/Makefile @@ -4,7 +4,7 @@ # Makefile for access/heap # # IDENTIFICATION -# $PostgreSQL: pgsql/src/backend/access/heap/Makefile,v 1.16 2007/06/08 18:23:52 tgl Exp $ +# $PostgreSQL: pgsql/src/backend/access/heap/Makefile,v 1.17 2007/09/20 17:56:30 tgl Exp $ # #------------------------------------------------------------------------- @@ -12,7 +12,7 @@ subdir = src/backend/access/heap top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global -OBJS = heapam.o hio.o rewriteheap.o syncscan.o tuptoaster.o +OBJS = heapam.o hio.o pruneheap.o rewriteheap.o syncscan.o tuptoaster.o all: SUBSYS.o diff --git a/src/backend/access/heap/README.HOT b/src/backend/access/heap/README.HOT new file mode 100644 index 00000000000..8cf0fa44de6 --- /dev/null +++ b/src/backend/access/heap/README.HOT @@ -0,0 +1,489 @@ +$PostgreSQL: pgsql/src/backend/access/heap/README.HOT,v 1.1 2007/09/20 17:56:30 tgl Exp $ + + Heap Only Tuples (HOT) + +Introduction +------------ + +The Heap Only Tuple (HOT) feature eliminates redundant index entries and +allows the re-use of space taken by DELETEd or obsoleted UPDATEd tuples +without performing a table-wide vacuum. It does this by allowing +single-page vacuuming, also called "defragmentation". + +Note: there is a Glossary at the end of this document that may be helpful +for first-time readers. + + +Technical Challenges +-------------------- + +Page-at-a-time vacuuming is normally impractical because of the costs of +finding and removing the index entries that link to the tuples to be +reclaimed. Standard vacuuming scans the indexes to ensure all such index +entries are removed, amortizing the index scan cost across as many dead +tuples as possible; this approach does not scale down well to the case of +reclaiming just a few tuples. In principle one could recompute the index +keys and do standard index searches to find the index entries, but this is +risky in the presence of possibly-buggy user-defined functions in +functional indexes. 
An allegedly immutable function that in fact is not +immutable might prevent us from re-finding an index entry (and we cannot +throw an error for not finding it, in view of the fact that dead index +entries are sometimes reclaimed early). That would lead to a seriously +corrupt index, in the form of entries pointing to tuple slots that by now +contain some unrelated content. In any case we would prefer to be able +to do vacuuming without invoking any user-written code. + +HOT solves this problem for a restricted but useful special case: +where a tuple is repeatedly updated in ways that do not change its +indexed columns. (Here, "indexed column" means any column referenced +at all in an index definition, including for example columns that are +tested in a partial-index predicate but are not stored in the index.) + +An additional property of HOT is that it reduces index size by avoiding +the creation of identically-keyed index entries. This improves search +speeds. + + +Update Chains With a Single Index Entry +--------------------------------------- + +Without HOT, every version of a row in an update chain has its own index +entries, even if all indexed columns are the same. With HOT, a new tuple +placed on the same page and with all indexed columns the same as its +parent row version does not get new index entries. This means there is +only one index entry for the entire update chain on the heap page. +An index-entry-less tuple is marked with the HEAP_ONLY_TUPLE flag. +The prior row version is marked HEAP_HOT_UPDATED, and (as always in an +update chain) its t_ctid field links forward to the newer version. + +For example: + + Index points to 1 + lp [1] [2] + + [111111111]->[2222222222] + +In the above diagram, the index points to line pointer 1, and tuple 1 is +marked as HEAP_HOT_UPDATED. Tuple 2 is a HOT tuple, meaning it has +no index entry pointing to it, and is marked as HEAP_ONLY_TUPLE. +Although tuple 2 is not directly referenced by the index, it can still be +found by an index search: after traversing from the index to tuple 1, +the index search proceeds forward to child tuples as long as it sees the +HEAP_HOT_UPDATED flag set. Since we restrict the HOT chain to lie within +a single page, this requires no additional page fetches and doesn't +introduce much performance penalty. + +Eventually, tuple 1 will no longer be visible to any transaction. +At that point its space could be reclaimed, but its line pointer cannot, +since the index still links to that line pointer and we still need to +be able to find tuple 2 in an index search. HOT handles this by turning +line pointer 1 into a "redirecting line pointer", which links to tuple 2 +but has no actual tuple attached. This state of affairs looks like + + Index points to 1 + lp [1]->[2] + + [2222222222] + +If now the row is updated again, to version 3, the page looks like this: + + Index points to 1 + lp [1]->[2] [3] + + [2222222222]->[3333333333] + +At some later time when no transaction can see tuple 2 in its snapshot, +tuple 2 and its line pointer can be pruned entirely: + + Index points to 1 + lp [1]------>[3] + + [3333333333] + +This is safe because no index entry points to line pointer 2. Subsequent +insertions into the page can now recycle both line pointer 2 and the +space formerly used by tuple 2. + +If an update changes any indexed column, or there is not room on the +same page for the new tuple, then the HOT chain ends: the last member +has a regular t_ctid link to the next version and is not marked +HEAP_HOT_UPDATED. 
(In principle we could continue a HOT chain across +pages, but this would destroy the desired property of being able to +reclaim space with just page-local manipulations. Anyway, we don't +want to have to chase through multiple heap pages to get from an index +entry to the desired tuple, so it seems better to create a new index +entry for the new tuple.) If further updates occur, the next version +could become the root of a new HOT chain. + +Line pointer 1 has to remain as long as there is any non-dead member of +the chain on the page. When there is not, it is marked "dead". +This lets us reclaim the last child line pointer and associated tuple +immediately. The next regular VACUUM pass can reclaim the index entries +pointing at the line pointer and then the line pointer itself. Since a +line pointer is small compared to a tuple, this does not represent an +undue space cost. + +Note: we can use a "dead" line pointer for any DELETEd tuple, +whether it was part of a HOT chain or not. This allows space reclamation +in advance of running VACUUM for plain DELETEs as well as HOT updates. + +The requirement for doing a HOT update is that none of the indexed +columns are changed. This is checked at execution time by comparing the +binary representation of the old and new values. We insist on bitwise +equality rather than using datatype-specific equality routines. The +main reason to avoid the latter is that there might be multiple notions +of equality for a datatype, and we don't know exactly which one is +relevant for the indexes at hand. We assume that bitwise equality +guarantees equality for all purposes. + + +Abort Cases +----------- + +If a heap-only tuple's xmin is aborted, then it can be removed immediately: +it was never visible to any other transaction, and all descendant row +versions must be aborted as well. Therefore we need not consider it part +of a HOT chain. By the same token, if a HOT-updated tuple's xmax is +aborted, there is no need to follow the chain link. However, there is a +race condition here: the transaction that did the HOT update might abort +between the time we inspect the HOT-updated tuple and the time we reach +the descendant heap-only tuple. It is conceivable that someone prunes +the heap-only tuple before that, and even conceivable that the line pointer +is re-used for another purpose. Therefore, when following a HOT chain, +it is always necessary to be prepared for the possibility that the +linked-to item pointer is unused, dead, or redirected; and if it is a +normal item pointer, we still have to check that XMIN of the tuple matches +the XMAX of the tuple we left. Otherwise we should assume that we have +come to the end of the HOT chain. Note that this sort of XMIN/XMAX +matching is required when following ordinary update chains anyway. + +(Early versions of the HOT code assumed that holding pin on the page +buffer while following a HOT link would prevent this type of problem, +but checking XMIN/XMAX matching is a much more robust solution.) + + +Index/Sequential Scans +---------------------- + +When doing an index scan, whenever we reach a HEAP_HOT_UPDATED tuple whose +xmax is not aborted, we need to follow its t_ctid link and check that +entry as well; possibly repeatedly until we reach the end of the HOT +chain. (When using an MVCC snapshot it is possible to optimize this a +bit: there can be at most one visible tuple in the chain, so we can stop +when we find it. This rule does not work for non-MVCC snapshots, though.) 
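
The chain-following rules just described (follow a redirect only at the chain start, verify that each member's xmin matches the previous member's xmax, stop at the first tuple visible under the snapshot) are implemented by heap_hot_search_buffer(), added to heapam.c later in this diff. A condensed sketch of that loop, with the all_dead bookkeeping and some sanity checks omitted for brevity:

    /* dp is the page, tid the root TID, buffer is pinned and share-locked */
    offnum = ItemPointerGetOffsetNumber(tid);
    at_chain_start = true;
    prev_xmax = InvalidTransactionId;

    for (;;)
    {
        ItemId        lp = PageGetItemId(dp, offnum);
        HeapTupleData heapTuple;

        if (!ItemIdIsNormal(lp))
        {
            /* a redirect is only expected at the start of the chain */
            if (ItemIdIsRedirected(lp) && at_chain_start)
            {
                offnum = ItemIdGetRedirect(lp);
                at_chain_start = false;
                continue;
            }
            break;                  /* unused or dead: end of chain */
        }

        heapTuple.t_data = (HeapTupleHeader) PageGetItem(dp, lp);
        heapTuple.t_len = ItemIdGetLength(lp);

        /* xmin must match the prior member's xmax, else the chain is broken */
        if (TransactionIdIsValid(prev_xmax) &&
            !TransactionIdEquals(prev_xmax,
                                 HeapTupleHeaderGetXmin(heapTuple.t_data)))
            break;

        if (HeapTupleSatisfiesVisibility(&heapTuple, snapshot, buffer))
        {
            ItemPointerSetOffsetNumber(tid, offnum);
            return true;            /* found the visible chain member */
        }

        if (!HeapTupleIsHotUpdated(&heapTuple))
            break;                  /* end of chain */

        /* advance to the next member, which must be on the same page */
        offnum = ItemPointerGetOffsetNumber(&heapTuple.t_data->t_ctid);
        at_chain_start = false;
        prev_xmax = HeapTupleHeaderGetXmax(heapTuple.t_data);
    }
    return false;
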
+ +Sequential scans do not need to pay attention to the HOT links because +they scan every item pointer on the page anyway. The same goes for a +bitmap heap scan with a lossy bitmap. + + +Pruning +------- + +HOT pruning means updating item pointers so that HOT chains are +reduced in length, by collapsing out line pointers for intermediate dead +tuples. Although this makes those line pointers available for re-use, +it does not immediately make the space occupied by their tuples available. + + +Defragmentation +--------------- + +Defragmentation centralizes unused space. After we have converted root +line pointers to redirected line pointers and pruned away any dead +intermediate line pointers, the tuples they linked to are free space. +But unless that space is adjacent to the central "hole" on the page +(the pd_lower-to-pd_upper area) it cannot be used by tuple insertion. +Defragmentation moves the surviving tuples to coalesce all the free +space into one "hole". This is done with the same PageRepairFragmentation +function that regular VACUUM uses. + + +When can/should we prune or defragment? +--------------------------------------- + +This is the most interesting question in HOT implementation, since there +is no simple right answer: we must use heuristics to determine when it's +most efficient to perform pruning and/or defragmenting. + +We cannot prune or defragment unless we can get a "buffer cleanup lock" +on the target page; otherwise, pruning might destroy line pointers that +other backends have live references to, and defragmenting might move +tuples that other backends have live pointers to. Thus the general +approach must be to heuristically decide if we should try to prune +or defragment, and if so try to acquire the buffer cleanup lock without +blocking. If we succeed we can proceed with our housekeeping work. +If we cannot get the lock (which should not happen often, except under +very heavy contention) then the housekeeping has to be postponed till +some other time. The worst-case consequence of this is only that an +UPDATE cannot be made HOT but has to link to a new tuple version placed on +some other page, for lack of centralized space on the original page. + +Ideally we would do defragmenting only when we are about to attempt +heap_update on a HOT-safe tuple. The difficulty with this approach +is that the update query has certainly got a pin on the old tuple, and +therefore our attempt to acquire a buffer cleanup lock will always fail. +(This corresponds to the idea that we don't want to move the old tuple +out from under where the query's HeapTuple pointer points. It might +be possible to finesse that, but it seems fragile.) + +Pruning, however, is potentially useful even when we are not about to +insert a new tuple, since shortening a HOT chain reduces the cost of +subsequent index searches. However it is unclear that this gain is +large enough to accept any extra maintenance burden for. + +The currently planned heuristic is to prune and defrag when first accessing +a page that potentially has prunable tuples (flagged by the PD_PRUNABLE +page hint bit) and that either has free space less than MAX(fillfactor +target free space, BLCKSZ/10) *or* has recently had an UPDATE fail to +find enough free space to store an updated tuple version. (These rules +are subject to change.) 
+ +We have effectively implemented the "truncate dead tuples to just line +pointer" idea that has been proposed and rejected before because of fear +of line pointer bloat: we might end up with huge numbers of line pointers +and just a few actual tuples on a page. To limit the damage in the worst +case, and to keep various work arrays as well as the bitmaps in bitmap +scans reasonably sized, the maximum number of line pointers per page +is arbitrarily capped at MaxHeapTuplesPerPage (the most tuples that +could fit without HOT pruning). + + +VACUUM +------ + +There is little change to regular vacuum. It performs pruning to remove +dead heap-only tuples, and cleans up any dead line pointers as if they were +regular dead tuples. + + +VACUUM FULL +----------- + +VACUUM FULL performs an extra operation of collapsing out redirecting line +pointers, by moving the first non-DEAD tuple of each HOT chain to the root +position and clearing its heap-only-tuple flag. This effectively changes +the user-visible CTID of that tuple. This would be completely unsafe +during normal concurrent operation, but since VACUUM FULL takes full +exclusive lock on the table, it should be OK. (Note that VACUUM FULL has +always felt free to change tuples' CTIDs by moving them across pages.) +Eliminating redirection links means that the main body of VACUUM FULL +doesn't have to deal with them, which seems a good thing since VACUUM FULL +is horrendously complex already. + +When VACUUM FULL tries to move tuple chains, it does not distinguish regular +and heap-only tuples, but just moves both types the same. This is OK because +it will move the entire non-DEAD tail of an update chain and remove index +entries for each item moved. At worst, we'll uselessly search for index +entries matching the heap-only tuples included in the move. + + +Statistics +---------- + +Currently, we count HOT updates the same as cold updates for statistics +purposes, though there is an additional per-table counter that counts +only HOT updates. When a page pruning operation is able to remove a +physical tuple by eliminating an intermediate heap-only tuple or +replacing a physical root tuple by a redirect pointer, a decrement in +the table's number of dead tuples is reported to pgstats, which may +postpone autovacuuming. Note that we do not count replacing a root tuple +by a DEAD item pointer as decrementing n_dead_tuples; we still want +autovacuum to run to clean up the index entries and DEAD item. + +This area probably needs further work ... + + +CREATE INDEX +------------ + +CREATE INDEX presents a problem for HOT updates. While the existing HOT +chains all have the same index values for existing indexes, the columns +in the new index might change within a pre-existing HOT chain, creating +a "broken" chain that can't be indexed properly. + +To address this issue, regular (non-concurrent) CREATE INDEX makes the +new index usable only by transactions newer than the CREATE INDEX +command. This prevents transactions that can see the inconsistent HOT +chains from trying to use the new index and getting incorrect results. +New transactions can only see the rows visible after the index was +created, hence the HOT chains are consistent for them. + +Entries in the new index point to root tuples (tuples with current index +pointers) so that our index uses the same index pointers as all other +indexes on the table. However the row we want to index is actually at +the *end* of the chain, ie, the most recent live tuple on the HOT chain. 
+That is the one we compute the index entry values for, but the TID +we put into the index is that of the root tuple. Since transactions that +will be allowed to use the new index cannot see any of the older tuple +versions in the chain, the fact that they might not match the index entry +isn't a problem. (Such transactions will check the tuple visibility +information of the older versions and ignore them, without ever looking at +their contents, so the content inconsistency is OK.) Subsequent updates +to the live tuple will be allowed to extend the HOT chain only if they are +HOT-safe for all the indexes. + +Because we have ShareLock on the table, any DELETE_IN_PROGRESS or +INSERT_IN_PROGRESS tuples should have come from our own transaction. +Therefore we can consider them committed since if the CREATE INDEX +commits, they will be committed, and if it aborts the index is discarded. +An exception to this is that early lock release is customary for system +catalog updates, and so we might find such tuples when reindexing a system +catalog. In that case we deal with it by waiting for the source +transaction to commit or roll back. (We could do that for user tables +too, but since the case is unexpected we prefer to throw an error.) + +Practically, we prevent old transactions from using the new index by +setting pg_index.indcheckxmin to TRUE. Queries are allowed to use such an +index only after pg_index.xmin is below their TransactionXmin horizon, +thereby ensuring that any incompatible rows in HOT chains are dead to them. +(pg_index.xmin will be the XID of the CREATE INDEX transaction. The reason +for using xmin rather than a normal column is that the regular vacuum +freezing mechanism will take care of converting xmin to FrozenTransactionId +before it can wrap around.) + +This means in particular that the transaction creating the index will be +unable to use the index. We alleviate that problem somewhat by not setting +indcheckxmin unless the table actually contains HOT chains with +RECENTLY_DEAD members. (In 8.4 we may be able to improve the situation, +at least for non-serializable transactions, because we expect to be able to +advance TransactionXmin intratransaction.) + +Another unpleasant consequence is that it is now risky to use SnapshotAny +in an index scan: if the index was created more recently than the last +vacuum, it's possible that some of the visited tuples do not match the +index entry they are linked to. This does not seem to be a fatal +objection, since there are few users of SnapshotAny and most use seqscans. +The only exception at this writing is CLUSTER, which is okay because it +does not require perfect ordering of the indexscan readout (and especially +so because CLUSTER tends to write recently-dead tuples out of order anyway). + + +CREATE INDEX CONCURRENTLY +------------------------- + +In the concurrent case we must take a different approach. We create the +pg_index entry immediately, before we scan the table. The pg_index entry +is marked as "not ready for inserts". Then we commit and wait for any +transactions which have the table open to finish. This ensures that no +new HOT updates will change the key value for our new index, because all +transactions will see the existence of the index and will respect its +constraint on which updates can be HOT. Other transactions must include +such an index when determining HOT-safety of updates, even though they +must ignore it for both insertion and searching purposes. 
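
The "determining HOT-safety of updates" step referred to here is the binary column comparison described earlier and added later in this diff as HeapSatisfiesHOTUpdate(). Condensed, the check walks the set of indexed columns (obtained from RelationGetIndexAttrBitmap and consumed destructively by bms_first_member) and requires bitwise equality for each:

    while ((attrnum = bms_first_member(hot_attrs)) >= 0)
    {
        /* adjust for system attributes */
        attrnum += FirstLowInvalidHeapAttributeNumber;

        if (!heap_tuple_attr_equals(RelationGetDescr(relation), attrnum,
                                    oldtup, newtup))
            return false;       /* an indexed column changed: not HOT-safe */
    }
    return true;                /* bitwise-equal on every indexed column */
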
+ +We must do this to avoid making incorrect index entries. For example, +suppose we are building an index on column X and we make an index entry for +a non-HOT tuple with X=1. Then some other backend, unaware that X is an +indexed column, HOT-updates the row to have X=2, and commits. We now have +an index entry for X=1 pointing at a HOT chain whose live row has X=2. +We could make an index entry with X=2 during the validation pass, but +there is no nice way to get rid of the wrong entry with X=1. So we must +have the HOT-safety property enforced before we start to build the new +index. + +After waiting for transactions which had the table open, we build the index +for all rows that are valid in a fresh snapshot. Any tuples visible in the +snapshot will have only valid forward-growing HOT chains. (They might have +older HOT updates behind them which are broken, but this is OK for the same +reason it's OK in a regular index build.) As above, we point the index +entry at the root of the HOT-update chain but we use the key value from the +live tuple. + +We mark the index open for inserts (but still not ready for reads) then +we again wait for transactions which have the table open. Then we take +a second reference snapshot and validate the index. This searches for +tuples missing from the index, and inserts any missing ones. Again, +the index entries have to have TIDs equal to HOT-chain root TIDs, but +the value to be inserted is the one from the live tuple. + +Then we wait until every transaction that could have a snapshot older than +the second reference snapshot is finished. This ensures that nobody is +alive any longer who could need to see any tuples that might be missing +from the index, as well as ensuring that no one can see any inconsistent +rows in a broken HOT chain (the first condition is stronger than the +second). Finally, we can mark the index valid for searches. + + +Limitations and Restrictions +---------------------------- + +It is worth noting that HOT forever forecloses alternative approaches +to vacuuming, specifically the recompute-the-index-keys approach alluded +to in Technical Challenges above. It'll be tough to recompute the index +keys for a root line pointer you don't have data for anymore ... + + +Glossary +-------- + +Broken HOT Chain + + A HOT chain in which the key value for an index has changed. + + This is not allowed to occur normally but if a new index is created + it can happen. In that case various strategies are used to ensure + that no transaction for which the older tuples are visible can + use the index. + +Cold update + + A normal, non-HOT update, in which index entries are made for + the new version of the tuple. + +Dead line pointer + + A stub line pointer, that does not point to anything, but cannot + be removed or reused yet because there are index pointers to it. + Semantically same as a dead tuple. It has state LP_DEAD. + +Heap-only tuple + + A heap tuple with no index pointers, which can only be reached + from indexes indirectly through its ancestral root tuple. + Marked with HEAP_ONLY_TUPLE flag. + +HOT-safe + + A proposed tuple update is said to be HOT-safe if it changes + none of the tuple's indexed columns. It will only become an + actual HOT update if we can find room on the same page for + the new tuple version. + +HOT update + + An UPDATE where the new tuple becomes a heap-only tuple, and no + new index entries are made. + +HOT-updated tuple + + An updated tuple, for which the next tuple in the chain is a + heap-only tuple. 
Marked with HEAP_HOT_UPDATED flag. + +Indexed column + + A column used in an index definition. The column might not + actually be stored in the index --- it could be used in a + functional index's expression, or used in a partial index + predicate. HOT treats all these cases alike. + +Redirecting line pointer + + A line pointer that points to another line pointer and has no + associated tuple. It has the special lp_flags state LP_REDIRECT, + and lp_off is the OffsetNumber of the line pointer it links to. + This is used when a root tuple becomes dead but we cannot prune + the line pointer because there are non-dead heap-only tuples + further down the chain. + +Root tuple + + The first tuple in a HOT update chain; the one that indexes point to. + +Update chain + + A chain of updated tuples, in which each tuple's ctid points to + the next tuple in the chain. A HOT update chain is an update chain + (or portion of an update chain) that consists of a root tuple and + one or more heap-only tuples. A complete update chain can contain + both HOT and non-HOT (cold) updated tuples. diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 09a70d813f7..d5a2f9a43d1 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.240 2007/09/12 22:10:26 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.241 2007/09/20 17:56:30 tgl Exp $ * * * INTERFACE ROUTINES @@ -52,6 +52,7 @@ #include "pgstat.h" #include "storage/procarray.h" #include "storage/smgr.h" +#include "utils/datum.h" #include "utils/inval.h" #include "utils/lsyscache.h" #include "utils/relcache.h" @@ -64,6 +65,8 @@ static HeapScanDesc heap_beginscan_internal(Relation relation, bool is_bitmapscan); static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf, ItemPointerData from, Buffer newbuf, HeapTuple newtup, bool move); +static bool HeapSatisfiesHOTUpdate(Relation relation, Bitmapset *hot_attrs, + HeapTuple oldtup, HeapTuple newtup); /* ---------------------------------------------------------------- @@ -184,6 +187,11 @@ heapgetpage(HeapScanDesc scan, BlockNumber page) snapshot = scan->rs_snapshot; /* + * Prune and repair fragmentation for the whole page, if possible. + */ + heap_page_prune_opt(scan->rs_rd, buffer, RecentGlobalXmin); + + /* * We must hold share lock on the buffer content while examining tuple * visibility. Afterwards, however, the tuples we have found to be * visible are guaranteed good as long as we hold the buffer pin. @@ -316,7 +324,7 @@ heapgettup(HeapScanDesc scan, * forward scanners. */ scan->rs_syncscan = false; - /* start from last page of the scan */ + /* start from last page of the scan */ if (scan->rs_startblock > 0) page = scan->rs_startblock - 1; else @@ -368,6 +376,7 @@ heapgettup(HeapScanDesc scan, dp = (Page) BufferGetPage(scan->rs_cbuf); lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self)); lpp = PageGetItemId(dp, lineoff); + Assert(ItemIdIsNormal(lpp)); tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp); tuple->t_len = ItemIdGetLength(lpp); @@ -583,7 +592,7 @@ heapgettup_pagemode(HeapScanDesc scan, * forward scanners. 
*/ scan->rs_syncscan = false; - /* start from last page of the scan */ + /* start from last page of the scan */ if (scan->rs_startblock > 0) page = scan->rs_startblock - 1; else @@ -632,6 +641,7 @@ heapgettup_pagemode(HeapScanDesc scan, dp = (Page) BufferGetPage(scan->rs_cbuf); lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self)); lpp = PageGetItemId(dp, lineoff); + Assert(ItemIdIsNormal(lpp)); tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp); tuple->t_len = ItemIdGetLength(lpp); @@ -1246,6 +1256,9 @@ heap_getnext(HeapScanDesc scan, ScanDirection direction) * for statistical purposes. (This could be the heap rel itself, an * associated index, or NULL to not count the fetch at all.) * + * heap_fetch does not follow HOT chains: only the exact TID requested will + * be fetched. + * * It is somewhat inconsistent that we ereport() on invalid block number but * return false on invalid item number. There are a couple of reasons though. * One is that the caller can relatively easily check the block number for @@ -1390,6 +1403,143 @@ heap_release_fetch(Relation relation, } /* + * heap_hot_search_buffer - search HOT chain for tuple satisfying snapshot + * + * On entry, *tid is the TID of a tuple (either a simple tuple, or the root + * of a HOT chain), and buffer is the buffer holding this tuple. We search + * for the first chain member satisfying the given snapshot. If one is + * found, we update *tid to reference that tuple's offset number, and + * return TRUE. If no match, return FALSE without modifying *tid. + * + * If all_dead is not NULL, we check non-visible tuples to see if they are + * globally dead; *all_dead is set TRUE if all members of the HOT chain + * are vacuumable, FALSE if not. + * + * Unlike heap_fetch, the caller must already have pin and (at least) share + * lock on the buffer; it is still pinned/locked at exit. Also unlike + * heap_fetch, we do not report any pgstats count; caller may do so if wanted. + */ +bool +heap_hot_search_buffer(ItemPointer tid, Buffer buffer, Snapshot snapshot, + bool *all_dead) +{ + Page dp = (Page) BufferGetPage(buffer); + TransactionId prev_xmax = InvalidTransactionId; + OffsetNumber offnum; + bool at_chain_start; + + if (all_dead) + *all_dead = true; + + Assert(ItemPointerGetBlockNumber(tid) == BufferGetBlockNumber(buffer)); + offnum = ItemPointerGetOffsetNumber(tid); + at_chain_start = true; + + /* Scan through possible multiple members of HOT-chain */ + for (;;) + { + ItemId lp; + HeapTupleData heapTuple; + + /* check for bogus TID */ + if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(dp)) + break; + + lp = PageGetItemId(dp, offnum); + + /* check for unused, dead, or redirected items */ + if (!ItemIdIsNormal(lp)) + { + /* We should only see a redirect at start of chain */ + if (ItemIdIsRedirected(lp) && at_chain_start) + { + /* Follow the redirect */ + offnum = ItemIdGetRedirect(lp); + at_chain_start = false; + continue; + } + /* else must be end of chain */ + break; + } + + heapTuple.t_data = (HeapTupleHeader) PageGetItem(dp, lp); + heapTuple.t_len = ItemIdGetLength(lp); + + /* + * Shouldn't see a HEAP_ONLY tuple at chain start. + */ + if (at_chain_start && HeapTupleIsHeapOnly(&heapTuple)) + break; + + /* + * The xmin should match the previous xmax value, else chain is broken. 
+ */ + if (TransactionIdIsValid(prev_xmax) && + !TransactionIdEquals(prev_xmax, + HeapTupleHeaderGetXmin(heapTuple.t_data))) + break; + + /* If it's visible per the snapshot, we must return it */ + if (HeapTupleSatisfiesVisibility(&heapTuple, snapshot, buffer)) + { + ItemPointerSetOffsetNumber(tid, offnum); + if (all_dead) + *all_dead = false; + return true; + } + + /* + * If we can't see it, maybe no one else can either. At caller + * request, check whether all chain members are dead to all + * transactions. + */ + if (all_dead && *all_dead && + HeapTupleSatisfiesVacuum(heapTuple.t_data, RecentGlobalXmin, + buffer) != HEAPTUPLE_DEAD) + *all_dead = false; + + /* + * Check to see if HOT chain continues past this tuple; if so + * fetch the next offnum and loop around. + */ + if (HeapTupleIsHotUpdated(&heapTuple)) + { + Assert(ItemPointerGetBlockNumber(&heapTuple.t_data->t_ctid) == + ItemPointerGetBlockNumber(tid)); + offnum = ItemPointerGetOffsetNumber(&heapTuple.t_data->t_ctid); + at_chain_start = false; + prev_xmax = HeapTupleHeaderGetXmax(heapTuple.t_data); + } + else + break; /* end of chain */ + } + + return false; +} + +/* + * heap_hot_search - search HOT chain for tuple satisfying snapshot + * + * This has the same API as heap_hot_search_buffer, except that the caller + * does not provide the buffer containing the page, rather we access it + * locally. + */ +bool +heap_hot_search(ItemPointer tid, Relation relation, Snapshot snapshot, + bool *all_dead) +{ + bool result; + Buffer buffer; + + buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid)); + LockBuffer(buffer, BUFFER_LOCK_SHARE); + result = heap_hot_search_buffer(tid, buffer, snapshot, all_dead); + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + ReleaseBuffer(buffer); + return result; +} + +/* * heap_get_latest_tid - get the latest tid of a specified tuple * * Actually, this gets the latest version that is visible according to @@ -1594,6 +1744,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, } tup->t_data->t_infomask &= ~(HEAP_XACT_MASK); + tup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK); tup->t_data->t_infomask |= HEAP_XMAX_INVALID; HeapTupleHeaderSetXmin(tup->t_data, xid); HeapTupleHeaderSetCmin(tup->t_data, cid); @@ -1628,6 +1779,17 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, RelationPutHeapTuple(relation, buffer, heaptup); + /* + * XXX Should we set PageSetPrunable on this page ? + * + * The inserting transaction may eventually abort thus making this tuple + * DEAD and hence available for pruning. Though we don't want to optimize + * for aborts, if no other tuple in this page is UPDATEd/DELETEd, the + * aborted tuple will never be pruned until next vacuum is triggered. + * + * If you do add PageSetPrunable here, add it in heap_xlog_insert too. + */ + MarkBufferDirty(buffer); /* XLOG stuff */ @@ -1904,12 +2066,21 @@ l1: START_CRIT_SECTION(); + /* + * If this transaction commits, the tuple will become DEAD sooner or + * later. Set hint bit that this page is a candidate for pruning. If + * the transaction finally aborts, the subsequent page pruning will be + * a no-op and the hint will be cleared. 
+ */ + PageSetPrunable((Page) dp); + /* store transaction information of xact deleting the tuple */ tp.t_data->t_infomask &= ~(HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID | HEAP_XMAX_IS_MULTI | HEAP_IS_LOCKED | HEAP_MOVED); + HeapTupleHeaderClearHotUpdated(tp.t_data); HeapTupleHeaderSetXmax(tp.t_data, xid); HeapTupleHeaderSetCmax(tp.t_data, cid, iscombo); /* Make sure there is no forward chain link in t_ctid */ @@ -2045,7 +2216,8 @@ simple_heap_delete(Relation relation, ItemPointer tid) * * On success, the header fields of *newtup are updated to match the new * stored tuple; in particular, newtup->t_self is set to the TID where the - * new tuple was inserted. However, any TOAST changes in the new tuple's + * new tuple was inserted, and its HEAP_ONLY_TUPLE flag is set iff a HOT + * update was done. However, any TOAST changes in the new tuple's * data are not reflected into *newtup. * * In the failure cases, the routine returns the tuple's t_ctid and t_xmax. @@ -2060,6 +2232,7 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, { HTSU_Result result; TransactionId xid = GetCurrentTransactionId(); + Bitmapset *hot_attrs; ItemId lp; HeapTupleData oldtup; HeapTuple heaptup; @@ -2072,9 +2245,24 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, pagefree; bool have_tuple_lock = false; bool iscombo; + bool use_hot_update = false; Assert(ItemPointerIsValid(otid)); + /* + * Fetch the list of attributes to be checked for HOT update. This is + * wasted effort if we fail to update or have to put the new tuple on + * a different page. But we must compute the list before obtaining + * buffer lock --- in the worst case, if we are doing an update on one + * of the relevant system catalogs, we could deadlock if we try to + * fetch the list later. In any case, the relcache caches the data + * so this is usually pretty cheap. + * + * Note that we get a copy here, so we need not worry about relcache + * flush happening midway through. + */ + hot_attrs = RelationGetIndexAttrBitmap(relation); + buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(otid)); LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); @@ -2208,6 +2396,7 @@ l2: UnlockReleaseBuffer(buffer); if (have_tuple_lock) UnlockTuple(relation, &(oldtup.t_self), ExclusiveLock); + bms_free(hot_attrs); return result; } @@ -2227,6 +2416,7 @@ l2: } newtup->t_data->t_infomask &= ~(HEAP_XACT_MASK); + newtup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK); newtup->t_data->t_infomask |= (HEAP_XMAX_INVALID | HEAP_UPDATED); HeapTupleHeaderSetXmin(newtup->t_data, xid); HeapTupleHeaderSetCmin(newtup->t_data, cid); @@ -2261,17 +2451,20 @@ l2: HeapTupleHasExternal(newtup) || newtup->t_len > TOAST_TUPLE_THRESHOLD); - pagefree = PageGetFreeSpace((Page) dp); + pagefree = PageGetHeapFreeSpace((Page) dp); newtupsize = MAXALIGN(newtup->t_len); if (need_toast || newtupsize > pagefree) { + /* Clear obsolete visibility flags ... */ oldtup.t_data->t_infomask &= ~(HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID | HEAP_XMAX_IS_MULTI | HEAP_IS_LOCKED | HEAP_MOVED); + HeapTupleClearHotUpdated(&oldtup); + /* ... and store info about transaction updating this tuple */ HeapTupleHeaderSetXmax(oldtup.t_data, xid); HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo); /* temporarily make it look not-updated */ @@ -2324,7 +2517,7 @@ l2: /* Re-acquire the lock on the old tuple's page. 
*/ LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); /* Re-check using the up-to-date free space */ - pagefree = PageGetFreeSpace((Page) dp); + pagefree = PageGetHeapFreeSpace((Page) dp); if (newtupsize > pagefree) { /* @@ -2357,18 +2550,66 @@ l2: * one pin is held. */ + if (newbuf == buffer) + { + /* + * Since the new tuple is going into the same page, we might be able + * to do a HOT update. Check if any of the index columns have been + * changed. If not, then HOT update is possible. + */ + if (HeapSatisfiesHOTUpdate(relation, hot_attrs, &oldtup, heaptup)) + use_hot_update = true; + } + else + { + /* Set a hint that the old page could use prune/defrag */ + PageSetFull(dp); + } + /* NO EREPORT(ERROR) from here till changes are logged */ START_CRIT_SECTION(); + /* + * If this transaction commits, the old tuple will become DEAD sooner or + * later. Set hint bit that this page is a candidate for pruning. If + * the transaction finally aborts, the subsequent page pruning will be + * a no-op and the hint will be cleared. + * + * XXX Should we set hint on newbuf as well? If the transaction + * aborts, there would be a prunable tuple in the newbuf; but for now + * we choose not to optimize for aborts. Note that heap_xlog_update + * must be kept in sync if this changes. + */ + PageSetPrunable(dp); + + if (use_hot_update) + { + /* Mark the old tuple as HOT-updated */ + HeapTupleSetHotUpdated(&oldtup); + /* And mark the new tuple as heap-only */ + HeapTupleSetHeapOnly(heaptup); + /* Mark the caller's copy too, in case different from heaptup */ + HeapTupleSetHeapOnly(newtup); + } + else + { + /* Make sure tuples are correctly marked as not-HOT */ + HeapTupleClearHotUpdated(&oldtup); + HeapTupleClearHeapOnly(heaptup); + HeapTupleClearHeapOnly(newtup); + } + RelationPutHeapTuple(relation, newbuf, heaptup); /* insert new tuple */ if (!already_marked) { + /* Clear obsolete visibility flags ... */ oldtup.t_data->t_infomask &= ~(HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID | HEAP_XMAX_IS_MULTI | HEAP_IS_LOCKED | HEAP_MOVED); + /* ... and store info about transaction updating this tuple */ HeapTupleHeaderSetXmax(oldtup.t_data, xid); HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo); } @@ -2427,7 +2668,7 @@ l2: if (have_tuple_lock) UnlockTuple(relation, &(oldtup.t_self), ExclusiveLock); - pgstat_count_heap_update(relation); + pgstat_count_heap_update(relation, use_hot_update); /* * If heaptup is a private copy, release it. Don't forget to copy t_self @@ -2439,10 +2680,120 @@ l2: heap_freetuple(heaptup); } + bms_free(hot_attrs); + return HeapTupleMayBeUpdated; } /* + * Check if the specified attribute's value is same in both given tuples. + * Subroutine for HeapSatisfiesHOTUpdate. + */ +static bool +heap_tuple_attr_equals(TupleDesc tupdesc, int attrnum, + HeapTuple tup1, HeapTuple tup2) +{ + Datum value1, value2; + bool isnull1, isnull2; + Form_pg_attribute att; + + /* + * If it's a whole-tuple reference, say "not equal". It's not really + * worth supporting this case, since it could only succeed after a + * no-op update, which is hardly a case worth optimizing for. + */ + if (attrnum == 0) + return false; + + /* + * Likewise, automatically say "not equal" for any system attribute + * other than OID and tableOID; we cannot expect these to be consistent + * in a HOT chain, or even to be set correctly yet in the new tuple. + */ + if (attrnum < 0) + { + if (attrnum != ObjectIdAttributeNumber && + attrnum != TableOidAttributeNumber) + return false; + } + + /* + * Extract the corresponding values. 
XXX this is pretty inefficient + * if there are many indexed columns. Should HeapSatisfiesHOTUpdate + * do a single heap_deform_tuple call on each tuple, instead? But + * that doesn't work for system columns ... + */ + value1 = heap_getattr(tup1, attrnum, tupdesc, &isnull1); + value2 = heap_getattr(tup2, attrnum, tupdesc, &isnull2); + + /* + * If one value is NULL and other is not, then they are certainly + * not equal + */ + if (isnull1 != isnull2) + return false; + + /* + * If both are NULL, they can be considered equal. + */ + if (isnull1) + return true; + + /* + * We do simple binary comparison of the two datums. This may be overly + * strict because there can be multiple binary representations for the + * same logical value. But we should be OK as long as there are no false + * positives. Using a type-specific equality operator is messy because + * there could be multiple notions of equality in different operator + * classes; furthermore, we cannot safely invoke user-defined functions + * while holding exclusive buffer lock. + */ + if (attrnum <= 0) + { + /* The only allowed system columns are OIDs, so do this */ + return (DatumGetObjectId(value1) == DatumGetObjectId(value2)); + } + else + { + Assert(attrnum <= tupdesc->natts); + att = tupdesc->attrs[attrnum - 1]; + return datumIsEqual(value1, value2, att->attbyval, att->attlen); + } +} + +/* + * Check if the old and new tuples represent a HOT-safe update. To be able + * to do a HOT update, we must not have changed any columns used in index + * definitions. + * + * The set of attributes to be checked is passed in (we dare not try to + * compute it while holding exclusive buffer lock...) NOTE that hot_attrs + * is destructively modified! That is OK since this is invoked at most once + * by heap_update(). + * + * Returns true if safe to do HOT update. + */ +static bool +HeapSatisfiesHOTUpdate(Relation relation, Bitmapset *hot_attrs, + HeapTuple oldtup, HeapTuple newtup) +{ + int attrnum; + + while ((attrnum = bms_first_member(hot_attrs)) >= 0) + { + /* Adjust for system attributes */ + attrnum += FirstLowInvalidHeapAttributeNumber; + + /* If the attribute value has changed, we can't do HOT update */ + if (!heap_tuple_attr_equals(RelationGetDescr(relation), attrnum, + oldtup, newtup)) + return false; + } + + return true; +} + +/* * simple_heap_update - replace a tuple * * This routine may be used to update a tuple when concurrent updates of @@ -2865,6 +3216,7 @@ l3: * avoids possibly generating a useless combo CID. */ tuple->t_data->t_infomask = new_infomask; + HeapTupleHeaderClearHotUpdated(tuple->t_data); HeapTupleHeaderSetXmax(tuple->t_data, xid); /* Make sure there is no forward chain link in t_ctid */ tuple->t_data->t_ctid = *tid; @@ -3110,6 +3462,7 @@ recheck_xmax: */ tuple->t_infomask &= ~HEAP_XMAX_COMMITTED; tuple->t_infomask |= HEAP_XMAX_INVALID; + HeapTupleHeaderClearHotUpdated(tuple); changed = true; } } @@ -3245,21 +3598,29 @@ heap_restrpos(HeapScanDesc scan) * Perform XLogInsert for a heap-clean operation. Caller must already * have modified the buffer and marked it dirty. * - * Note: for historical reasons, the entries in the unused[] array should - * be zero-based tuple indexes, not one-based. + * Note: prior to Postgres 8.3, the entries in the nowunused[] array were + * zero-based tuple indexes. Now they are one-based like other uses + * of OffsetNumber. 
*/ XLogRecPtr -log_heap_clean(Relation reln, Buffer buffer, OffsetNumber *unused, int uncnt) +log_heap_clean(Relation reln, Buffer buffer, + OffsetNumber *redirected, int nredirected, + OffsetNumber *nowdead, int ndead, + OffsetNumber *nowunused, int nunused, + bool redirect_move) { xl_heap_clean xlrec; + uint8 info; XLogRecPtr recptr; - XLogRecData rdata[2]; + XLogRecData rdata[4]; /* Caller should not call me on a temp relation */ Assert(!reln->rd_istemp); xlrec.node = reln->rd_node; xlrec.block = BufferGetBlockNumber(buffer); + xlrec.nredirected = nredirected; + xlrec.ndead = ndead; rdata[0].data = (char *) &xlrec; rdata[0].len = SizeOfHeapClean; @@ -3267,14 +3628,17 @@ log_heap_clean(Relation reln, Buffer buffer, OffsetNumber *unused, int uncnt) rdata[0].next = &(rdata[1]); /* - * The unused-offsets array is not actually in the buffer, but pretend - * that it is. When XLogInsert stores the whole buffer, the offsets array - * need not be stored too. + * The OffsetNumber arrays are not actually in the buffer, but we pretend + * that they are. When XLogInsert stores the whole buffer, the offset + * arrays need not be stored too. Note that even if all three arrays + * are empty, we want to expose the buffer as a candidate for whole-page + * storage, since this record type implies a defragmentation operation + * even if no item pointers changed state. */ - if (uncnt > 0) + if (nredirected > 0) { - rdata[1].data = (char *) unused; - rdata[1].len = uncnt * sizeof(OffsetNumber); + rdata[1].data = (char *) redirected; + rdata[1].len = nredirected * sizeof(OffsetNumber) * 2; } else { @@ -3283,9 +3647,38 @@ log_heap_clean(Relation reln, Buffer buffer, OffsetNumber *unused, int uncnt) } rdata[1].buffer = buffer; rdata[1].buffer_std = true; - rdata[1].next = NULL; + rdata[1].next = &(rdata[2]); + + if (ndead > 0) + { + rdata[2].data = (char *) nowdead; + rdata[2].len = ndead * sizeof(OffsetNumber); + } + else + { + rdata[2].data = NULL; + rdata[2].len = 0; + } + rdata[2].buffer = buffer; + rdata[2].buffer_std = true; + rdata[2].next = &(rdata[3]); + + if (nunused > 0) + { + rdata[3].data = (char *) nowunused; + rdata[3].len = nunused * sizeof(OffsetNumber); + } + else + { + rdata[3].data = NULL; + rdata[3].len = 0; + } + rdata[3].buffer = buffer; + rdata[3].buffer_std = true; + rdata[3].next = NULL; - recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_CLEAN, rdata); + info = redirect_move ? XLOG_HEAP2_CLEAN_MOVE : XLOG_HEAP2_CLEAN; + recptr = XLogInsert(RM_HEAP2_ID, info, rdata); return recptr; } @@ -3293,8 +3686,6 @@ log_heap_clean(Relation reln, Buffer buffer, OffsetNumber *unused, int uncnt) /* * Perform XLogInsert for a heap-freeze operation. Caller must already * have modified the buffer and marked it dirty. - * - * Unlike log_heap_clean(), the offsets[] entries are one-based. */ XLogRecPtr log_heap_freeze(Relation reln, Buffer buffer, @@ -3363,17 +3754,28 @@ log_heap_update(Relation reln, Buffer oldbuf, ItemPointerData from, } xlhdr; int hsize = SizeOfHeapHeader; xl_heap_update xlrec; + uint8 info; XLogRecPtr recptr; XLogRecData rdata[4]; Page page = BufferGetPage(newbuf); - uint8 info = (move) ? 
XLOG_HEAP_MOVE : XLOG_HEAP_UPDATE; /* Caller should not call me on a temp relation */ Assert(!reln->rd_istemp); + if (move) + { + Assert(!HeapTupleIsHeapOnly(newtup)); + info = XLOG_HEAP_MOVE; + } + else if (HeapTupleIsHeapOnly(newtup)) + info = XLOG_HEAP_HOT_UPDATE; + else + info = XLOG_HEAP_UPDATE; + xlrec.target.node = reln->rd_node; xlrec.target.tid = from; xlrec.newtid = newtup->t_self; + rdata[0].data = (char *) &xlrec; rdata[0].len = SizeOfHeapUpdate; rdata[0].buffer = InvalidBuffer; @@ -3489,13 +3891,21 @@ log_newpage(RelFileNode *rnode, BlockNumber blkno, Page page) return recptr; } +/* + * Handles CLEAN and CLEAN_MOVE record types + */ static void -heap_xlog_clean(XLogRecPtr lsn, XLogRecord *record) +heap_xlog_clean(XLogRecPtr lsn, XLogRecord *record, bool clean_move) { xl_heap_clean *xlrec = (xl_heap_clean *) XLogRecGetData(record); Relation reln; Buffer buffer; Page page; + OffsetNumber *offnum; + OffsetNumber *end; + int nredirected; + int ndead; + int i; if (record->xl_info & XLR_BKP_BLOCK_1) return; @@ -3512,25 +3922,63 @@ heap_xlog_clean(XLogRecPtr lsn, XLogRecord *record) return; } - if (record->xl_len > SizeOfHeapClean) - { - OffsetNumber *unused; - OffsetNumber *unend; - ItemId lp; + nredirected = xlrec->nredirected; + ndead = xlrec->ndead; + offnum = (OffsetNumber *) ((char *) xlrec + SizeOfHeapClean); + end = (OffsetNumber *) ((char *) xlrec + record->xl_len); - unused = (OffsetNumber *) ((char *) xlrec + SizeOfHeapClean); - unend = (OffsetNumber *) ((char *) xlrec + record->xl_len); + /* Update all redirected or moved line pointers */ + for (i = 0; i < nredirected; i++) + { + OffsetNumber fromoff = *offnum++; + OffsetNumber tooff = *offnum++; + ItemId fromlp = PageGetItemId(page, fromoff); - while (unused < unend) + if (clean_move) { - /* unused[] entries are zero-based */ - lp = PageGetItemId(page, *unused + 1); - ItemIdSetUnused(lp); - unused++; + /* Physically move the "to" item to the "from" slot */ + ItemId tolp = PageGetItemId(page, tooff); + HeapTupleHeader htup; + + *fromlp = *tolp; + ItemIdSetUnused(tolp); + + /* We also have to clear the tuple's heap-only bit */ + Assert(ItemIdIsNormal(fromlp)); + htup = (HeapTupleHeader) PageGetItem(page, fromlp); + Assert(HeapTupleHeaderIsHeapOnly(htup)); + HeapTupleHeaderClearHeapOnly(htup); + } + else + { + /* Just insert a REDIRECT link at fromoff */ + ItemIdSetRedirect(fromlp, tooff); } } - PageRepairFragmentation(page, NULL); + /* Update all now-dead line pointers */ + for (i = 0; i < ndead; i++) + { + OffsetNumber off = *offnum++; + ItemId lp = PageGetItemId(page, off); + + ItemIdSetDead(lp); + } + + /* Update all now-unused line pointers */ + while (offnum < end) + { + OffsetNumber off = *offnum++; + ItemId lp = PageGetItemId(page, off); + + ItemIdSetUnused(lp); + } + + /* + * Finally, repair any fragmentation, and update the page's hint bit + * about whether it has free pointers. 
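In replay, the three lists drive three kinds of line-pointer updates: redirected pairs become redirect links (or, in the CLEAN_MOVE variant, a physical copy of the target item over the root slot), the dead list marks items dead, and the rest become unused, after which the page is defragmented. Below is a simplified model of those state transitions using a toy slot struct, not the backend's ItemIdData; it only shows the ordinary (non-move) case.

    #include <stdio.h>

    typedef enum { SLOT_UNUSED, SLOT_NORMAL, SLOT_REDIRECT, SLOT_DEAD } SlotState;

    typedef struct
    {
        SlotState state;
        int       link;   /* redirect target when state == SLOT_REDIRECT */
    } ToySlot;

    static void set_redirect(ToySlot *s, int to) { s->state = SLOT_REDIRECT; s->link = to; }
    static void set_dead(ToySlot *s)             { s->state = SLOT_DEAD; }
    static void set_unused(ToySlot *s)           { s->state = SLOT_UNUSED; }

    int
    main(void)
    {
        ToySlot page[8] = {{SLOT_UNUSED, 0}};
        int     redirected[] = {1, 4};     /* one (from, to) pair */
        int     nowdead[] = {2};
        int     nowunused[] = {3};

        page[1].state = page[2].state = page[3].state = page[4].state = SLOT_NORMAL;

        set_redirect(&page[redirected[0]], redirected[1]);
        set_dead(&page[nowdead[0]]);
        set_unused(&page[nowunused[0]]);

        printf("slot1=%d->%d slot2=%d slot3=%d\n",
               page[1].state, page[1].link, page[2].state, page[3].state);
        return 0;
    }
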
+ */ + PageRepairFragmentation(page); PageSetLSN(page, lsn); PageSetTLI(page, ThisTimeLineID); @@ -3655,8 +4103,13 @@ heap_xlog_delete(XLogRecPtr lsn, XLogRecord *record) HEAP_XMAX_IS_MULTI | HEAP_IS_LOCKED | HEAP_MOVED); + HeapTupleHeaderClearHotUpdated(htup); HeapTupleHeaderSetXmax(htup, record->xl_xid); HeapTupleHeaderSetCmax(htup, FirstCommandId, false); + + /* Mark the page as a candidate for pruning */ + PageSetPrunable(page); + /* Make sure there is no forward chain link in t_ctid */ htup->t_ctid = xlrec->target.tid; PageSetLSN(page, lsn); @@ -3736,7 +4189,7 @@ heap_xlog_insert(XLogRecPtr lsn, XLogRecord *record) HeapTupleHeaderSetCmin(htup, FirstCommandId); htup->t_ctid = xlrec->target.tid; - offnum = PageAddItem(page, (Item) htup, newlen, offnum, true); + offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true); if (offnum == InvalidOffsetNumber) elog(PANIC, "heap_insert_redo: failed to add tuple"); PageSetLSN(page, lsn); @@ -3746,10 +4199,10 @@ heap_xlog_insert(XLogRecPtr lsn, XLogRecord *record) } /* - * Handles UPDATE & MOVE + * Handles UPDATE, HOT_UPDATE & MOVE */ static void -heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool move) +heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool move, bool hot_update) { xl_heap_update *xlrec = (xl_heap_update *) XLogRecGetData(record); Relation reln = XLogOpenRelation(xlrec->target.node); @@ -3808,6 +4261,7 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool move) HEAP_XMIN_INVALID | HEAP_MOVED_IN); htup->t_infomask |= HEAP_MOVED_OFF; + HeapTupleHeaderClearHotUpdated(htup); HeapTupleHeaderSetXvac(htup, record->xl_xid); /* Make sure there is no forward chain link in t_ctid */ htup->t_ctid = xlrec->target.tid; @@ -3819,12 +4273,19 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool move) HEAP_XMAX_IS_MULTI | HEAP_IS_LOCKED | HEAP_MOVED); + if (hot_update) + HeapTupleHeaderSetHotUpdated(htup); + else + HeapTupleHeaderClearHotUpdated(htup); HeapTupleHeaderSetXmax(htup, record->xl_xid); HeapTupleHeaderSetCmax(htup, FirstCommandId, false); /* Set forward chain link in t_ctid */ htup->t_ctid = xlrec->newtid; } + /* Mark the page as a candidate for pruning */ + PageSetPrunable(page); + /* * this test is ugly, but necessary to avoid thinking that insert change * is already applied @@ -3914,7 +4375,7 @@ newsame:; /* Make sure there is no forward chain link in t_ctid */ htup->t_ctid = xlrec->newtid; - offnum = PageAddItem(page, (Item) htup, newlen, offnum, true); + offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true); if (offnum == InvalidOffsetNumber) elog(PANIC, "heap_update_redo: failed to add tuple"); PageSetLSN(page, lsn); @@ -3971,6 +4432,7 @@ heap_xlog_lock(XLogRecPtr lsn, XLogRecord *record) htup->t_infomask |= HEAP_XMAX_SHARED_LOCK; else htup->t_infomask |= HEAP_XMAX_EXCL_LOCK; + HeapTupleHeaderClearHotUpdated(htup); HeapTupleHeaderSetXmax(htup, xlrec->locking_xid); HeapTupleHeaderSetCmax(htup, FirstCommandId, false); /* Make sure there is no forward chain link in t_ctid */ @@ -4039,25 +4501,35 @@ heap_redo(XLogRecPtr lsn, XLogRecord *record) { uint8 info = record->xl_info & ~XLR_INFO_MASK; - info &= XLOG_HEAP_OPMASK; - if (info == XLOG_HEAP_INSERT) - heap_xlog_insert(lsn, record); - else if (info == XLOG_HEAP_DELETE) - heap_xlog_delete(lsn, record); - else if (info == XLOG_HEAP_UPDATE) - heap_xlog_update(lsn, record, false); - else if (info == XLOG_HEAP_MOVE) - heap_xlog_update(lsn, record, true); - else if (info == XLOG_HEAP_CLEAN) - heap_xlog_clean(lsn, record); - else if (info == 
XLOG_HEAP_NEWPAGE) - heap_xlog_newpage(lsn, record); - else if (info == XLOG_HEAP_LOCK) - heap_xlog_lock(lsn, record); - else if (info == XLOG_HEAP_INPLACE) - heap_xlog_inplace(lsn, record); - else - elog(PANIC, "heap_redo: unknown op code %u", info); + switch (info & XLOG_HEAP_OPMASK) + { + case XLOG_HEAP_INSERT: + heap_xlog_insert(lsn, record); + break; + case XLOG_HEAP_DELETE: + heap_xlog_delete(lsn, record); + break; + case XLOG_HEAP_UPDATE: + heap_xlog_update(lsn, record, false, false); + break; + case XLOG_HEAP_MOVE: + heap_xlog_update(lsn, record, true, false); + break; + case XLOG_HEAP_HOT_UPDATE: + heap_xlog_update(lsn, record, false, true); + break; + case XLOG_HEAP_NEWPAGE: + heap_xlog_newpage(lsn, record); + break; + case XLOG_HEAP_LOCK: + heap_xlog_lock(lsn, record); + break; + case XLOG_HEAP_INPLACE: + heap_xlog_inplace(lsn, record); + break; + default: + elog(PANIC, "heap_redo: unknown op code %u", info); + } } void @@ -4065,11 +4537,20 @@ heap2_redo(XLogRecPtr lsn, XLogRecord *record) { uint8 info = record->xl_info & ~XLR_INFO_MASK; - info &= XLOG_HEAP_OPMASK; - if (info == XLOG_HEAP2_FREEZE) - heap_xlog_freeze(lsn, record); - else - elog(PANIC, "heap2_redo: unknown op code %u", info); + switch (info & XLOG_HEAP_OPMASK) + { + case XLOG_HEAP2_FREEZE: + heap_xlog_freeze(lsn, record); + break; + case XLOG_HEAP2_CLEAN: + heap_xlog_clean(lsn, record, false); + break; + case XLOG_HEAP2_CLEAN_MOVE: + heap_xlog_clean(lsn, record, true); + break; + default: + elog(PANIC, "heap2_redo: unknown op code %u", info); + } } static void @@ -4130,13 +4611,18 @@ heap_desc(StringInfo buf, uint8 xl_info, char *rec) ItemPointerGetBlockNumber(&(xlrec->newtid)), ItemPointerGetOffsetNumber(&(xlrec->newtid))); } - else if (info == XLOG_HEAP_CLEAN) + else if (info == XLOG_HEAP_HOT_UPDATE) { - xl_heap_clean *xlrec = (xl_heap_clean *) rec; + xl_heap_update *xlrec = (xl_heap_update *) rec; - appendStringInfo(buf, "clean: rel %u/%u/%u; blk %u", - xlrec->node.spcNode, xlrec->node.dbNode, - xlrec->node.relNode, xlrec->block); + if (xl_info & XLOG_HEAP_INIT_PAGE) /* can this case happen? 
*/ + appendStringInfo(buf, "hot_update(init): "); + else + appendStringInfo(buf, "hot_update: "); + out_target(buf, &(xlrec->target)); + appendStringInfo(buf, "; new %u/%u", + ItemPointerGetBlockNumber(&(xlrec->newtid)), + ItemPointerGetOffsetNumber(&(xlrec->newtid))); } else if (info == XLOG_HEAP_NEWPAGE) { @@ -4187,6 +4673,22 @@ heap2_desc(StringInfo buf, uint8 xl_info, char *rec) xlrec->node.relNode, xlrec->block, xlrec->cutoff_xid); } + else if (info == XLOG_HEAP2_CLEAN) + { + xl_heap_clean *xlrec = (xl_heap_clean *) rec; + + appendStringInfo(buf, "clean: rel %u/%u/%u; blk %u", + xlrec->node.spcNode, xlrec->node.dbNode, + xlrec->node.relNode, xlrec->block); + } + else if (info == XLOG_HEAP2_CLEAN_MOVE) + { + xl_heap_clean *xlrec = (xl_heap_clean *) rec; + + appendStringInfo(buf, "clean_move: rel %u/%u/%u; blk %u", + xlrec->node.spcNode, xlrec->node.dbNode, + xlrec->node.relNode, xlrec->block); + } else appendStringInfo(buf, "UNKNOWN"); } diff --git a/src/backend/access/heap/hio.c b/src/backend/access/heap/hio.c index 6dbdf13fbe0..cd13d8f87c9 100644 --- a/src/backend/access/heap/hio.c +++ b/src/backend/access/heap/hio.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/heap/hio.c,v 1.66 2007/09/12 22:10:26 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/heap/hio.c,v 1.67 2007/09/20 17:56:30 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -41,7 +41,7 @@ RelationPutHeapTuple(Relation relation, pageHeader = BufferGetPage(buffer); offnum = PageAddItem(pageHeader, (Item) tuple->t_data, - tuple->t_len, InvalidOffsetNumber, false); + tuple->t_len, InvalidOffsetNumber, false, true); if (offnum == InvalidOffsetNumber) elog(PANIC, "failed to add tuple to page"); @@ -218,7 +218,7 @@ RelationGetBufferForTuple(Relation relation, Size len, * we're done. 
*/ pageHeader = (Page) BufferGetPage(buffer); - pageFreeSpace = PageGetFreeSpace(pageHeader); + pageFreeSpace = PageGetHeapFreeSpace(pageHeader); if (len + saveFreeSpace <= pageFreeSpace) { /* use this page as future insert target, too */ @@ -311,7 +311,7 @@ RelationGetBufferForTuple(Relation relation, Size len, PageInit(pageHeader, BufferGetPageSize(buffer), 0); - if (len > PageGetFreeSpace(pageHeader)) + if (len > PageGetHeapFreeSpace(pageHeader)) { /* We should not get here given the test at the top */ elog(PANIC, "tuple is too big: size %lu", (unsigned long) len); diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c new file mode 100644 index 00000000000..d5496689003 --- /dev/null +++ b/src/backend/access/heap/pruneheap.c @@ -0,0 +1,702 @@ +/*------------------------------------------------------------------------- + * + * pruneheap.c + * heap page pruning and HOT-chain management code + * + * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $PostgreSQL: pgsql/src/backend/access/heap/pruneheap.c,v 1.1 2007/09/20 17:56:30 tgl Exp $ + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/heapam.h" +#include "access/transam.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "utils/inval.h" + + +/* Local functions */ +static int heap_prune_chain(Relation relation, Buffer buffer, + OffsetNumber rootoffnum, + TransactionId OldestXmin, + OffsetNumber *redirected, int *nredirected, + OffsetNumber *nowdead, int *ndead, + OffsetNumber *nowunused, int *nunused, + bool redirect_move); +static void heap_prune_record_redirect(OffsetNumber *redirected, + int *nredirected, + OffsetNumber offnum, + OffsetNumber rdoffnum); +static void heap_prune_record_dead(OffsetNumber *nowdead, int *ndead, + OffsetNumber offnum); +static void heap_prune_record_unused(OffsetNumber *nowunused, int *nunused, + OffsetNumber offnum); + + +/* + * Optionally prune and repair fragmentation in the specified page. + * + * This is an opportunistic function. It will perform housekeeping + * only if the page heuristically looks like a candidate for pruning and we + * can acquire buffer cleanup lock without blocking. + * + * Note: this is called quite often. It's important that it fall out quickly + * if there's not any use in pruning. + * + * Caller must have pin on the buffer, and must *not* have a lock on it. + * + * OldestXmin is the cutoff XID used to distinguish whether tuples are DEAD + * or RECENTLY_DEAD (see HeapTupleSatisfiesVacuum). + */ +void +heap_page_prune_opt(Relation relation, Buffer buffer, TransactionId OldestXmin) +{ + PageHeader dp = (PageHeader) BufferGetPage(buffer); + Size minfree; + + /* + * Let's see if we really need pruning. + * + * Forget it if page is not hinted to contain something prunable + */ + if (!PageIsPrunable(dp)) + return; + + /* + * We prune when a previous UPDATE failed to find enough space on the + * page for a new tuple version, or when free space falls below the + * relation's fill-factor target (but not less than 10%). + * + * Checking free space here is questionable since we aren't holding + * any lock on the buffer; in the worst case we could get a bogus + * answer. It's unlikely to be *seriously* wrong, though, since + * reading either pd_lower or pd_upper is probably atomic. 
Avoiding + * taking a lock seems better than sometimes getting a wrong answer + * in what is after all just a heuristic estimate. + */ + minfree = RelationGetTargetPageFreeSpace(relation, + HEAP_DEFAULT_FILLFACTOR); + minfree = Max(minfree, BLCKSZ / 10); + + if (PageIsFull(dp) || PageGetHeapFreeSpace((Page) dp) < minfree) + { + /* OK, try to get exclusive buffer lock */ + if (!ConditionalLockBufferForCleanup(buffer)) + return; + + /* + * Now that we have buffer lock, get accurate information about the + * page's free space, and recheck the heuristic about whether to prune. + */ + if (PageIsFull(dp) || PageGetHeapFreeSpace((Page) dp) < minfree) + { + /* OK to prune (though not to remove redirects) */ + (void) heap_page_prune(relation, buffer, OldestXmin, false, true); + } + + /* And release buffer lock */ + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + } +} + + +/* + * Prune and repair fragmentation in the specified page. + * + * Caller must have pin and buffer cleanup lock on the page. + * + * OldestXmin is the cutoff XID used to distinguish whether tuples are DEAD + * or RECENTLY_DEAD (see HeapTupleSatisfiesVacuum). + * + * If redirect_move is set, we remove redirecting line pointers by + * updating the root line pointer to point directly to the first non-dead + * tuple in the chain. NOTE: eliminating the redirect changes the first + * tuple's effective CTID, and is therefore unsafe except within VACUUM FULL. + * The only reason we support this capability at all is that by using it, + * VACUUM FULL need not cope with LP_REDIRECT items at all; which seems a + * good thing since VACUUM FULL is overly complicated already. + * + * If report_stats is true then we send the number of reclaimed heap-only + * tuples to pgstats. (This must be FALSE during vacuum, since vacuum will + * send its own new total to pgstats, and we don't want this delta applied + * on top of that.) + * + * Returns the number of tuples deleted from the page. + */ +int +heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin, + bool redirect_move, bool report_stats) +{ + int ndeleted = 0; + Page page = BufferGetPage(buffer); + OffsetNumber offnum, + maxoff; + OffsetNumber redirected[MaxHeapTuplesPerPage * 2]; + OffsetNumber nowdead[MaxHeapTuplesPerPage]; + OffsetNumber nowunused[MaxHeapTuplesPerPage]; + int nredirected = 0; + int ndead = 0; + int nunused = 0; + + START_CRIT_SECTION(); + + /* + * Mark the page as clear of prunable tuples. If we find a tuple which + * may soon become prunable, we shall set the hint again. Also clear + * the "page is full" flag, since there's no point in repeating the + * prune/defrag process until something else happens to the page. + */ + PageClearPrunable(page); + PageClearFull(page); + + /* Scan the page */ + maxoff = PageGetMaxOffsetNumber(page); + for (offnum = FirstOffsetNumber; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemid = PageGetItemId(page, offnum); + + /* Nothing to do if slot is empty or already dead */ + if (!ItemIdIsUsed(itemid) || ItemIdIsDead(itemid)) + continue; + + /* Process this item or chain of items */ + ndeleted += heap_prune_chain(relation, buffer, offnum, + OldestXmin, + redirected, &nredirected, + nowdead, &ndead, + nowunused, &nunused, + redirect_move); + } + + /* Have we pruned any items? */ + if (nredirected > 0 || ndead > 0 || nunused > 0) + { + /* + * Repair page fragmentation, and update the page's hint bit about + * whether it has free line pointers. 
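The pruning trigger checked above is deliberately cheap: prune when the page has been marked full, or when its free space has dropped below the larger of the relation's fill-factor target and one tenth of the block size. A standalone sketch of that threshold test follows; the constants and function names here are illustrative toys, not the backend's macros.

    #include <stdio.h>
    #include <stdbool.h>
    #include <stddef.h>

    #define TOY_BLCKSZ 8192

    static size_t
    prune_threshold(size_t fillfactor_target)
    {
        size_t minfree = fillfactor_target;

        /* never let the trigger point fall below 10% of the page */
        if (minfree < TOY_BLCKSZ / 10)
            minfree = TOY_BLCKSZ / 10;
        return minfree;
    }

    static bool
    should_try_prune(bool page_is_full, size_t free_space, size_t fillfactor_target)
    {
        return page_is_full || free_space < prune_threshold(fillfactor_target);
    }

    int
    main(void)
    {
        /* e.g. a 10% fill-factor reservation is 819 bytes on an 8K page */
        printf("prune? %d\n", should_try_prune(false, 500, TOY_BLCKSZ / 10));
        printf("prune? %d\n", should_try_prune(false, 2000, TOY_BLCKSZ / 10));
        return 0;
    }
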
+ */ + PageRepairFragmentation((Page) page); + + MarkBufferDirty(buffer); + + /* + * Emit a WAL HEAP_CLEAN or HEAP_CLEAN_MOVE record showing what we did + */ + if (!relation->rd_istemp) + { + XLogRecPtr recptr; + + recptr = log_heap_clean(relation, buffer, + redirected, nredirected, + nowdead, ndead, + nowunused, nunused, + redirect_move); + PageSetTLI(BufferGetPage(buffer), ThisTimeLineID); + PageSetLSN(BufferGetPage(buffer), recptr); + } + } + + END_CRIT_SECTION(); + + /* + * If requested, report the number of tuples reclaimed to pgstats. + * This is ndeleted minus ndead, because we don't want to count a now-DEAD + * root item as a deletion for this purpose. + */ + if (report_stats && ndeleted > ndead) + pgstat_update_heap_dead_tuples(relation, ndeleted - ndead); + + /* + * XXX Should we update the FSM information of this page ? + * + * There are two schools of thought here. We may not want to update + * FSM information so that the page is not used for unrelated + * UPDATEs/INSERTs and any free space in this page will remain + * available for further UPDATEs in *this* page, thus improving + * chances for doing HOT updates. + * + * But for a large table and where a page does not receive further + * UPDATEs for a long time, we might waste this space by not + * updating the FSM information. The relation may get extended and + * fragmented further. + * + * One possibility is to leave "fillfactor" worth of space in this + * page and update FSM with the remaining space. + * + * In any case, the current FSM implementation doesn't accept + * one-page-at-a-time updates, so this is all academic for now. + */ + + return ndeleted; +} + + +/* + * Prune specified item pointer or a HOT chain originating at that item. + * + * If the item is an index-referenced tuple (i.e. not a heap-only tuple), + * the HOT chain is pruned by removing all DEAD tuples at the start of the HOT + * chain. We also prune any RECENTLY_DEAD tuples preceding a DEAD tuple. + * This is OK because a RECENTLY_DEAD tuple preceding a DEAD tuple is really + * DEAD, the OldestXmin test is just too coarse to detect it. + * + * The root line pointer is redirected to the tuple immediately after the + * latest DEAD tuple. If all tuples in the chain are DEAD, the root line + * pointer is marked LP_DEAD. (This includes the case of a DEAD simple + * tuple, which we treat as a chain of length 1.) + * + * OldestXmin is the cutoff XID used to identify dead tuples. + * + * Redirected items are added to the redirected[] array (two entries per + * redirection); items set to LP_DEAD state are added to nowdead[]; and + * items set to LP_UNUSED state are added to nowunused[]. (These arrays + * will be used to generate a WAL record after all chains are pruned.) + * + * If redirect_move is true, we get rid of redirecting line pointers. + * + * Returns the number of tuples deleted from the page. 
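The chain-pruning rule described in the header comment above can be stated compactly: remember the latest DEAD member seen while scanning, advancing past RECENTLY_DEAD members only in case a DEAD one follows; if the last dead member ends the chain the root becomes LP_DEAD, otherwise the root is redirected to the first surviving member. The toy below applies that rule to an array of visibility states; it mirrors the latestdead bookkeeping only in spirit, since the real code also validates xmin/xmax linkage and handles aborted heap-only tuples.

    #include <stdio.h>

    typedef enum { TUP_DEAD, TUP_RECENTLY_DEAD, TUP_LIVE } ToyVis;

    /*
     * Return the chain index the root should point at after pruning,
     * or -1 if the whole chain is dead (root becomes LP_DEAD).
     */
    static int
    prune_target(const ToyVis *chain, int nchain)
    {
        int latestdead = -1;
        int i;

        for (i = 0; i < nchain; i++)
        {
            if (chain[i] == TUP_DEAD)
                latestdead = i;
            else if (chain[i] != TUP_RECENTLY_DEAD)
                break;              /* a live member ends the scan */
        }

        if (latestdead < 0)
            return 0;               /* nothing prunable; keep pointing at start */
        if (latestdead == nchain - 1)
            return -1;              /* whole chain dead */
        return latestdead + 1;      /* redirect past the last dead member */
    }

    int
    main(void)
    {
        ToyVis chain1[] = {TUP_DEAD, TUP_RECENTLY_DEAD, TUP_DEAD, TUP_LIVE};
        ToyVis chain2[] = {TUP_DEAD, TUP_DEAD};

        printf("chain1 -> %d\n", prune_target(chain1, 4));  /* 3: the live tuple */
        printf("chain2 -> %d\n", prune_target(chain2, 2));  /* -1: mark root dead */
        return 0;
    }
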
+ */ +static int +heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum, + TransactionId OldestXmin, + OffsetNumber *redirected, int *nredirected, + OffsetNumber *nowdead, int *ndead, + OffsetNumber *nowunused, int *nunused, + bool redirect_move) +{ + int ndeleted = 0; + Page dp = (Page) BufferGetPage(buffer); + TransactionId priorXmax = InvalidTransactionId; + ItemId rootlp; + HeapTupleHeader htup; + OffsetNumber latestdead = InvalidOffsetNumber, + maxoff = PageGetMaxOffsetNumber(dp), + offnum; + OffsetNumber chainitems[MaxHeapTuplesPerPage]; + int nchain = 0, + i; + + rootlp = PageGetItemId(dp, rootoffnum); + + /* + * If it's a heap-only tuple, then it is not the start of a HOT chain. + */ + if (ItemIdIsNormal(rootlp)) + { + htup = (HeapTupleHeader) PageGetItem(dp, rootlp); + if (HeapTupleHeaderIsHeapOnly(htup)) + { + /* + * If the tuple is DEAD and doesn't chain to anything else, mark it + * unused immediately. (If it does chain, we can only remove it as + * part of pruning its chain.) + * + * We need this primarily to handle aborted HOT updates, that is, + * XMIN_INVALID heap-only tuples. Those might not be linked to + * by any chain, since the parent tuple might be re-updated before + * any pruning occurs. So we have to be able to reap them + * separately from chain-pruning. + * + * Note that we might first arrive at a dead heap-only tuple + * either here or while following a chain below. Whichever path + * gets there first will mark the tuple unused. + */ + if (HeapTupleSatisfiesVacuum(htup, OldestXmin, buffer) + == HEAPTUPLE_DEAD && !HeapTupleHeaderIsHotUpdated(htup)) + { + ItemIdSetUnused(rootlp); + heap_prune_record_unused(nowunused, nunused, rootoffnum); + ndeleted++; + } + + /* Nothing more to do */ + return ndeleted; + } + } + + /* Start from the root tuple */ + offnum = rootoffnum; + + /* while not end of the chain */ + for (;;) + { + ItemId lp; + bool tupdead, + recent_dead; + + /* Some sanity checks */ + if (offnum < FirstOffsetNumber || offnum > maxoff) + break; + + lp = PageGetItemId(dp, offnum); + + if (!ItemIdIsUsed(lp)) + break; + + /* + * If we are looking at the redirected root line pointer, + * jump to the first normal tuple in the chain. If we find + * a redirect somewhere else, stop --- it must not be same chain. + */ + if (ItemIdIsRedirected(lp)) + { + if (nchain > 0) + break; /* not at start of chain */ + chainitems[nchain++] = offnum; + offnum = ItemIdGetRedirect(rootlp); + continue; + } + + /* + * Likewise, a dead item pointer can't be part of the chain. + * (We already eliminated the case of dead root tuple outside + * this function.) + */ + if (ItemIdIsDead(lp)) + break; + + Assert(ItemIdIsNormal(lp)); + htup = (HeapTupleHeader) PageGetItem(dp, lp); + + /* + * Check the tuple XMIN against prior XMAX, if any + */ + if (TransactionIdIsValid(priorXmax) && + !TransactionIdEquals(HeapTupleHeaderGetXmin(htup), priorXmax)) + break; + + /* + * OK, this tuple is indeed a member of the chain. + */ + chainitems[nchain++] = offnum; + + /* + * Check tuple's visibility status. + */ + tupdead = recent_dead = false; + + switch (HeapTupleSatisfiesVacuum(htup, OldestXmin, buffer)) + { + case HEAPTUPLE_DEAD: + tupdead = true; + break; + + case HEAPTUPLE_RECENTLY_DEAD: + recent_dead = true; + /* + * This tuple may soon become DEAD. Re-set the hint bit so + * that the page is reconsidered for pruning in future. + */ + PageSetPrunable(dp); + break; + + case HEAPTUPLE_DELETE_IN_PROGRESS: + /* + * This tuple may soon become DEAD. 
Re-set the hint bit so + * that the page is reconsidered for pruning in future. + */ + PageSetPrunable(dp); + break; + + case HEAPTUPLE_LIVE: + case HEAPTUPLE_INSERT_IN_PROGRESS: + /* + * If we wanted to optimize for aborts, we might consider + * marking the page prunable when we see INSERT_IN_PROGRESS. + * But we don't. See related decisions about when to mark + * the page prunable in heapam.c. + */ + break; + + default: + elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); + break; + } + + /* + * Remember the last DEAD tuple seen. We will advance past + * RECENTLY_DEAD tuples just in case there's a DEAD one after them; + * but we can't advance past anything else. (XXX is it really worth + * continuing to scan beyond RECENTLY_DEAD? The case where we will + * find another DEAD tuple is a fairly unusual corner case.) + */ + if (tupdead) + latestdead = offnum; + else if (!recent_dead) + break; + + /* + * If the tuple is not HOT-updated, then we are at the end of this + * HOT-update chain. + */ + if (!HeapTupleHeaderIsHotUpdated(htup)) + break; + + /* + * Advance to next chain member. + */ + Assert(ItemPointerGetBlockNumber(&htup->t_ctid) == + BufferGetBlockNumber(buffer)); + offnum = ItemPointerGetOffsetNumber(&htup->t_ctid); + priorXmax = HeapTupleHeaderGetXmax(htup); + } + + /* + * If we found a DEAD tuple in the chain, adjust the HOT chain so that all + * the DEAD tuples at the start of the chain are removed and the root line + * pointer is appropriately redirected. + */ + if (OffsetNumberIsValid(latestdead)) + { + /* + * Mark as unused each intermediate item that we are able to remove + * from the chain. + * + * When the previous item is the last dead tuple seen, we are at + * the right candidate for redirection. + */ + for (i = 1; (i < nchain) && (chainitems[i - 1] != latestdead); i++) + { + ItemId lp = PageGetItemId(dp, chainitems[i]); + + ItemIdSetUnused(lp); + heap_prune_record_unused(nowunused, nunused, chainitems[i]); + ndeleted++; + } + + /* + * If the root entry had been a normal tuple, we are deleting it, + * so count it in the result. But changing a redirect (even to + * DEAD state) doesn't count. + */ + if (ItemIdIsNormal(rootlp)) + ndeleted++; + + /* + * If the DEAD tuple is at the end of the chain, the entire chain is + * dead and the root line pointer can be marked dead. Otherwise + * just redirect the root to the correct chain member. + */ + if (i >= nchain) + { + ItemIdSetDead(rootlp); + heap_prune_record_dead(nowdead, ndead, rootoffnum); + } + else + { + ItemIdSetRedirect(rootlp, chainitems[i]); + heap_prune_record_redirect(redirected, nredirected, + rootoffnum, + chainitems[i]); + } + } + else if (nchain < 2 && ItemIdIsRedirected(rootlp)) + { + /* + * We found a redirect item that doesn't point to a valid follow-on + * item. This can happen if the loop in heap_page_prune caused us + * to visit the dead successor of a redirect item before visiting + * the redirect item. We can clean up by setting the redirect item + * to DEAD state. + */ + ItemIdSetDead(rootlp); + heap_prune_record_dead(nowdead, ndead, rootoffnum); + } + + /* + * If requested, eliminate LP_REDIRECT items by moving tuples. Note that + * if the root item is LP_REDIRECT and doesn't point to a valid follow-on + * item, we already killed it above. 
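Under redirect_move (used only by VACUUM FULL), an LP_REDIRECT root is eliminated by copying the first chain member's line pointer into the root slot, freeing the old slot, and clearing the tuple's heap-only flag, since index entries now point straight at it. A toy version of that collapse is sketched below with simplified slot structs; the real code additionally issues a cache invalidation because the tuple's effective CTID changes.

    #include <stdio.h>
    #include <stdbool.h>

    typedef enum { SLOT_UNUSED, SLOT_NORMAL, SLOT_REDIRECT } SlotState;

    typedef struct
    {
        SlotState state;
        int       link;        /* redirect target, or a tuple id for NORMAL */
        bool      heap_only;   /* models the pointed-to tuple's HEAP_ONLY flag */
    } ToySlot;

    /* Collapse root -> first: root takes over first's item, first is freed */
    static void
    collapse_redirect(ToySlot *slots, int root)
    {
        int first = slots[root].link;

        slots[root] = slots[first];
        slots[root].heap_only = false;   /* now directly index-referenced */
        slots[first].state = SLOT_UNUSED;
    }

    int
    main(void)
    {
        ToySlot slots[6] = {{SLOT_UNUSED, 0, false}};

        slots[1] = (ToySlot){SLOT_REDIRECT, 3, false};
        slots[3] = (ToySlot){SLOT_NORMAL, 42, true};

        collapse_redirect(slots, 1);
        printf("slot1: state=%d tuple=%d heap_only=%d; slot3: state=%d\n",
               slots[1].state, slots[1].link, (int) slots[1].heap_only,
               slots[3].state);
        return 0;
    }
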
+ */ + if (redirect_move && ItemIdIsRedirected(rootlp)) + { + OffsetNumber firstoffnum = ItemIdGetRedirect(rootlp); + ItemId firstlp = PageGetItemId(dp, firstoffnum); + HeapTupleData firsttup; + + Assert(ItemIdIsNormal(firstlp)); + /* Set up firsttup to reference the tuple at its existing CTID */ + firsttup.t_data = (HeapTupleHeader) PageGetItem(dp, firstlp); + firsttup.t_len = ItemIdGetLength(firstlp); + ItemPointerSet(&firsttup.t_self, + BufferGetBlockNumber(buffer), + firstoffnum); + firsttup.t_tableOid = RelationGetRelid(relation); + + /* + * Mark the tuple for invalidation. Needed because we're changing + * its CTID. + */ + CacheInvalidateHeapTuple(relation, &firsttup); + + /* + * Change heap-only status of the tuple because after the line + * pointer manipulation, it's no longer a heap-only tuple, but is + * directly pointed to by index entries. + */ + Assert(HeapTupleIsHeapOnly(&firsttup)); + HeapTupleClearHeapOnly(&firsttup); + + /* Now move the item pointer */ + *rootlp = *firstlp; + ItemIdSetUnused(firstlp); + + /* + * If latestdead is valid, we have already recorded the redirection + * above. Otherwise, do it now. + * + * We don't record firstlp in the nowunused[] array, since the + * redirection entry is enough to tell heap_xlog_clean what to do. + */ + if (!OffsetNumberIsValid(latestdead)) + heap_prune_record_redirect(redirected, nredirected, rootoffnum, + firstoffnum); + } + + return ndeleted; +} + + +/* Record newly-redirected item pointer */ +static void +heap_prune_record_redirect(OffsetNumber *redirected, int *nredirected, + OffsetNumber offnum, OffsetNumber rdoffnum) +{ + Assert(*nredirected < MaxHeapTuplesPerPage); + redirected[*nredirected * 2] = offnum; + redirected[*nredirected * 2 + 1] = rdoffnum; + (*nredirected)++; +} + +/* Record newly-dead item pointer */ +static void +heap_prune_record_dead(OffsetNumber *nowdead, int *ndead, + OffsetNumber offnum) +{ + Assert(*ndead < MaxHeapTuplesPerPage); + nowdead[*ndead] = offnum; + (*ndead)++; +} + +/* Record newly-unused item pointer */ +static void +heap_prune_record_unused(OffsetNumber *nowunused, int *nunused, + OffsetNumber offnum) +{ + Assert(*nunused < MaxHeapTuplesPerPage); + nowunused[*nunused] = offnum; + (*nunused)++; +} + + +/* + * For all items in this page, find their respective root line pointers. + * If item k is part of a HOT-chain with root at item j, then we set + * root_offsets[k - 1] = j. + * + * The passed-in root_offsets array must have MaxHeapTuplesPerPage entries. + * We zero out all unused entries. + * + * The function must be called with at least share lock on the buffer, to + * prevent concurrent prune operations. + * + * Note: The information collected here is valid only as long as the caller + * holds a pin on the buffer. Once pin is released, a tuple might be pruned + * and reused by a completely unrelated tuple. + */ +void +heap_get_root_tuples(Page page, OffsetNumber *root_offsets) +{ + OffsetNumber offnum, maxoff; + + MemSet(root_offsets, 0, MaxHeapTuplesPerPage * sizeof(OffsetNumber)); + + maxoff = PageGetMaxOffsetNumber(page); + for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum++) + { + ItemId lp = PageGetItemId(page, offnum); + HeapTupleHeader htup; + OffsetNumber nextoffnum; + TransactionId priorXmax; + + /* skip unused and dead items */ + if (!ItemIdIsUsed(lp) || ItemIdIsDead(lp)) + continue; + + if (ItemIdIsNormal(lp)) + { + htup = (HeapTupleHeader) PageGetItem(page, lp); + + /* + * Check if this tuple is part of a HOT-chain rooted at some other + * tuple. 
If so, skip it for now; we'll process it when we find + * its root. + */ + if (HeapTupleHeaderIsHeapOnly(htup)) + continue; + + /* + * This is either a plain tuple or the root of a HOT-chain. + * Remember it in the mapping. + */ + root_offsets[offnum - 1] = offnum; + + /* If it's not the start of a HOT-chain, we're done with it */ + if (!HeapTupleHeaderIsHotUpdated(htup)) + continue; + + /* Set up to scan the HOT-chain */ + nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid); + priorXmax = HeapTupleHeaderGetXmax(htup); + } + else + { + /* Must be a redirect item. We do not set its root_offsets entry */ + Assert(ItemIdIsRedirected(lp)); + /* Set up to scan the HOT-chain */ + nextoffnum = ItemIdGetRedirect(lp); + priorXmax = InvalidTransactionId; + } + + /* + * Now follow the HOT-chain and collect other tuples in the chain. + * + * Note: Even though this is a nested loop, the complexity of the + * function is O(N) because a tuple in the page should be visited not + * more than twice, once in the outer loop and once in HOT-chain + * chases. + */ + for (;;) + { + lp = PageGetItemId(page, nextoffnum); + + /* Check for broken chains */ + if (!ItemIdIsNormal(lp)) + break; + + htup = (HeapTupleHeader) PageGetItem(page, lp); + + if (TransactionIdIsValid(priorXmax) && + !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(htup))) + break; + + /* Remember the root line pointer for this item */ + root_offsets[nextoffnum - 1] = offnum; + + /* Advance to next chain member, if any */ + if (!HeapTupleHeaderIsHotUpdated(htup)) + break; + + nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid); + priorXmax = HeapTupleHeaderGetXmax(htup); + } + } +} diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c index 60aab58de38..e8c5eec50ac 100644 --- a/src/backend/access/heap/rewriteheap.c +++ b/src/backend/access/heap/rewriteheap.c @@ -96,7 +96,7 @@ * Portions Copyright (c) 1994-5, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/heap/rewriteheap.c,v 1.6 2007/09/12 22:10:26 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/heap/rewriteheap.c,v 1.7 2007/09/20 17:56:30 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -320,12 +320,14 @@ rewrite_heap_tuple(RewriteState state, * Copy the original tuple's visibility information into new_tuple. * * XXX we might later need to copy some t_infomask2 bits, too? + * Right now, we intentionally clear the HOT status bits. */ memcpy(&new_tuple->t_data->t_choice.t_heap, &old_tuple->t_data->t_choice.t_heap, sizeof(HeapTupleFields)); new_tuple->t_data->t_infomask &= ~HEAP_XACT_MASK; + new_tuple->t_data->t_infomask2 &= ~HEAP2_XACT_MASK; new_tuple->t_data->t_infomask |= old_tuple->t_data->t_infomask & HEAP_XACT_MASK; @@ -593,7 +595,7 @@ raw_heap_insert(RewriteState state, HeapTuple tup) /* Now we can check to see if there's enough free space already. 
*/ if (state->rs_buffer_valid) { - pageFreeSpace = PageGetFreeSpace(page); + pageFreeSpace = PageGetHeapFreeSpace(page); if (len + saveFreeSpace > pageFreeSpace) { @@ -628,7 +630,7 @@ raw_heap_insert(RewriteState state, HeapTuple tup) /* And now we can insert the tuple into the page */ newoff = PageAddItem(page, (Item) heaptup->t_data, len, - InvalidOffsetNumber, false); + InvalidOffsetNumber, false, true); if (newoff == InvalidOffsetNumber) elog(ERROR, "failed to add tuple"); diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index 0009739180c..7bf1e43cb45 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/index/genam.c,v 1.62 2007/05/27 03:50:38 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/index/genam.c,v 1.63 2007/09/20 17:56:30 tgl Exp $ * * NOTES * many of the old access method routines have been turned into @@ -21,6 +21,7 @@ #include "access/genam.h" #include "access/heapam.h" +#include "access/transam.h" #include "miscadmin.h" #include "pgstat.h" @@ -95,6 +96,9 @@ RelationGetIndexScan(Relation indexRelation, ItemPointerSetInvalid(&scan->xs_ctup.t_self); scan->xs_ctup.t_data = NULL; scan->xs_cbuf = InvalidBuffer; + scan->xs_prev_xmax = InvalidTransactionId; + scan->xs_next_hot = InvalidOffsetNumber; + scan->xs_hot_dead = false; /* * Let the AM fill in the key and any opaque data it wants. diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index d905013a5fc..fd727ca68c8 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/index/indexam.c,v 1.98 2007/05/27 03:50:38 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/index/indexam.c,v 1.99 2007/09/20 17:56:30 tgl Exp $ * * INTERFACE ROUTINES * index_open - open an index relation by relation OID @@ -64,6 +64,7 @@ #include "access/genam.h" #include "access/heapam.h" +#include "access/transam.h" #include "pgstat.h" #include "utils/relcache.h" @@ -313,6 +314,8 @@ index_rescan(IndexScanDesc scan, ScanKey key) scan->xs_cbuf = InvalidBuffer; } + scan->xs_next_hot = InvalidOffsetNumber; + scan->kill_prior_tuple = false; /* for safety */ FunctionCall2(procedure, @@ -370,6 +373,14 @@ index_markpos(IndexScanDesc scan) * NOTE: this only restores the internal scan state of the index AM. * The current result tuple (scan->xs_ctup) doesn't change. See comments * for ExecRestrPos(). + * + * NOTE: in the presence of HOT chains, mark/restore only works correctly + * if the scan's snapshot is MVCC-safe; that ensures that there's at most one + * returnable tuple in each HOT chain, and so restoring the prior state at the + * granularity of the index AM is sufficient. Since the only current user + * of mark/restore functionality is nodeMergejoin.c, this effectively means + * that merge-join plans only work for MVCC snapshots. This could be fixed + * if necessary, but for now it seems unimportant. 
* ---------------- */ void @@ -377,9 +388,13 @@ index_restrpos(IndexScanDesc scan) { FmgrInfo *procedure; + Assert(IsMVCCSnapshot(scan->xs_snapshot)); + SCAN_CHECKS; GET_SCAN_PROCEDURE(amrestrpos); + scan->xs_next_hot = InvalidOffsetNumber; + scan->kill_prior_tuple = false; /* for safety */ FunctionCall1(procedure, PointerGetDatum(scan)); @@ -398,72 +413,224 @@ HeapTuple index_getnext(IndexScanDesc scan, ScanDirection direction) { HeapTuple heapTuple = &scan->xs_ctup; + ItemPointer tid = &heapTuple->t_self; FmgrInfo *procedure; SCAN_CHECKS; GET_SCAN_PROCEDURE(amgettuple); - /* just make sure this is false... */ - scan->kill_prior_tuple = false; + /* + * We always reset xs_hot_dead; if we are here then either we are just + * starting the scan, or we previously returned a visible tuple, and in + * either case it's inappropriate to kill the prior index entry. + */ + scan->xs_hot_dead = false; for (;;) { - bool found; + OffsetNumber offnum; + bool at_chain_start; + Page dp; - /* - * The AM's gettuple proc finds the next tuple matching the scan keys. - */ - found = DatumGetBool(FunctionCall2(procedure, - PointerGetDatum(scan), - Int32GetDatum(direction))); + if (scan->xs_next_hot != InvalidOffsetNumber) + { + /* + * We are resuming scan of a HOT chain after having returned + * an earlier member. Must still hold pin on current heap page. + */ + Assert(BufferIsValid(scan->xs_cbuf)); + Assert(ItemPointerGetBlockNumber(tid) == + BufferGetBlockNumber(scan->xs_cbuf)); + Assert(TransactionIdIsValid(scan->xs_prev_xmax)); + offnum = scan->xs_next_hot; + at_chain_start = false; + scan->xs_next_hot = InvalidOffsetNumber; + } + else + { + bool found; + Buffer prev_buf; + + /* + * If we scanned a whole HOT chain and found only dead tuples, + * tell index AM to kill its entry for that TID. + */ + scan->kill_prior_tuple = scan->xs_hot_dead; + + /* + * The AM's gettuple proc finds the next index entry matching the + * scan keys, and puts the TID in xs_ctup.t_self (ie, *tid). 
+ */ + found = DatumGetBool(FunctionCall2(procedure, + PointerGetDatum(scan), + Int32GetDatum(direction))); + + /* Reset kill flag immediately for safety */ + scan->kill_prior_tuple = false; + + /* If we're out of index entries, break out of outer loop */ + if (!found) + break; + + pgstat_count_index_tuples(scan->indexRelation, 1); + + /* Switch to correct buffer if we don't have it already */ + prev_buf = scan->xs_cbuf; + scan->xs_cbuf = ReleaseAndReadBuffer(scan->xs_cbuf, + scan->heapRelation, + ItemPointerGetBlockNumber(tid)); + + /* + * Prune page, but only if we weren't already on this page + */ + if (prev_buf != scan->xs_cbuf) + heap_page_prune_opt(scan->heapRelation, scan->xs_cbuf, + RecentGlobalXmin); + + /* Prepare to scan HOT chain starting at index-referenced offnum */ + offnum = ItemPointerGetOffsetNumber(tid); + at_chain_start = true; + + /* We don't know what the first tuple's xmin should be */ + scan->xs_prev_xmax = InvalidTransactionId; + + /* Initialize flag to detect if all entries are dead */ + scan->xs_hot_dead = true; + } + + /* Obtain share-lock on the buffer so we can examine visibility */ + LockBuffer(scan->xs_cbuf, BUFFER_LOCK_SHARE); - /* Reset kill flag immediately for safety */ - scan->kill_prior_tuple = false; + dp = (Page) BufferGetPage(scan->xs_cbuf); - if (!found) + /* Scan through possible multiple members of HOT-chain */ + for (;;) { - /* Release any held pin on a heap page */ - if (BufferIsValid(scan->xs_cbuf)) - { - ReleaseBuffer(scan->xs_cbuf); - scan->xs_cbuf = InvalidBuffer; - } - return NULL; /* failure exit */ - } + ItemId lp; + ItemPointer ctid; - pgstat_count_index_tuples(scan->indexRelation, 1); + /* check for bogus TID */ + if (offnum < FirstOffsetNumber || + offnum > PageGetMaxOffsetNumber(dp)) + break; - /* - * Fetch the heap tuple and see if it matches the snapshot. - */ - if (heap_release_fetch(scan->heapRelation, scan->xs_snapshot, - heapTuple, &scan->xs_cbuf, true, - scan->indexRelation)) - break; + lp = PageGetItemId(dp, offnum); - /* Skip if no undeleted tuple at this location */ - if (heapTuple->t_data == NULL) - continue; + /* check for unused, dead, or redirected items */ + if (!ItemIdIsNormal(lp)) + { + /* We should only see a redirect at start of chain */ + if (ItemIdIsRedirected(lp) && at_chain_start) + { + /* Follow the redirect */ + offnum = ItemIdGetRedirect(lp); + at_chain_start = false; + continue; + } + /* else must be end of chain */ + break; + } - /* - * If we can't see it, maybe no one else can either. Check to see if - * the tuple is dead to all transactions. If so, signal the index AM - * to not return it on future indexscans. - * - * We told heap_release_fetch to keep a pin on the buffer, so we can - * re-access the tuple here. But we must re-lock the buffer first. - */ - LockBuffer(scan->xs_cbuf, BUFFER_LOCK_SHARE); + /* + * We must initialize all of *heapTuple (ie, scan->xs_ctup) + * since it is returned to the executor on success. + */ + heapTuple->t_data = (HeapTupleHeader) PageGetItem(dp, lp); + heapTuple->t_len = ItemIdGetLength(lp); + ItemPointerSetOffsetNumber(tid, offnum); + heapTuple->t_tableOid = RelationGetRelid(scan->heapRelation); + ctid = &heapTuple->t_data->t_ctid; + + /* + * Shouldn't see a HEAP_ONLY tuple at chain start. (This test + * should be unnecessary, since the chain root can't be removed + * while we have pin on the index entry, but let's make it anyway.) 
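The chain walk in index_getnext begins at the offset the index entry references, with at_chain_start set; a redirect is followed only in that state (a redirect met later cannot belong to the same chain), and a heap-only tuple at the very start of a chain means the entry is stale, so the walk stops. The sketch below models just those entry checks with toy slot kinds rather than the backend's ItemId flags.

    #include <stdio.h>
    #include <stdbool.h>

    typedef enum { SLOT_UNUSED, SLOT_NORMAL, SLOT_REDIRECT, SLOT_DEAD } SlotKind;

    typedef struct
    {
        SlotKind kind;
        int      link;        /* redirect target when kind == SLOT_REDIRECT */
        bool     heap_only;   /* tuple's HEAP_ONLY flag when kind == SLOT_NORMAL */
    } ToySlot;

    /*
     * Resolve the first chain member to visit, starting from the offset the
     * index entry points at.  Returns -1 if the chain cannot be followed.
     */
    static int
    chain_entry(const ToySlot *slots, int offnum)
    {
        bool at_chain_start = true;

        if (slots[offnum].kind == SLOT_REDIRECT)
        {
            offnum = slots[offnum].link;   /* follow a redirect at chain start */
            at_chain_start = false;
        }

        if (slots[offnum].kind != SLOT_NORMAL)
            return -1;                     /* dead, unused, or double redirect */
        if (at_chain_start && slots[offnum].heap_only)
            return -1;                     /* heap-only tuple can't start a chain */
        return offnum;
    }

    int
    main(void)
    {
        ToySlot slots[4] = {
            {SLOT_UNUSED, 0, false},
            {SLOT_REDIRECT, 3, false},     /* root was pruned and redirected */
            {SLOT_NORMAL, 0, true},        /* heap-only member */
            {SLOT_NORMAL, 0, true},        /* heap-only member (chain tip) */
        };

        printf("start at %d\n", chain_entry(slots, 1));   /* follows 1 -> 3 */
        printf("start at %d\n", chain_entry(slots, 2));   /* -1: bogus entry */
        return 0;
    }
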
+ */ + if (at_chain_start && HeapTupleIsHeapOnly(heapTuple)) + break; + + /* + * The xmin should match the previous xmax value, else chain is + * broken. (Note: this test is not optional because it protects + * us against the case where the prior chain member's xmax + * aborted since we looked at it.) + */ + if (TransactionIdIsValid(scan->xs_prev_xmax) && + !TransactionIdEquals(scan->xs_prev_xmax, + HeapTupleHeaderGetXmin(heapTuple->t_data))) + break; + + /* If it's visible per the snapshot, we must return it */ + if (HeapTupleSatisfiesVisibility(heapTuple, scan->xs_snapshot, + scan->xs_cbuf)) + { + /* + * If the snapshot is MVCC, we know that it could accept + * at most one member of the HOT chain, so we can skip + * examining any more members. Otherwise, check for + * continuation of the HOT-chain, and set state for next time. + */ + if (IsMVCCSnapshot(scan->xs_snapshot)) + scan->xs_next_hot = InvalidOffsetNumber; + else if (HeapTupleIsHotUpdated(heapTuple)) + { + Assert(ItemPointerGetBlockNumber(ctid) == + ItemPointerGetBlockNumber(tid)); + scan->xs_next_hot = ItemPointerGetOffsetNumber(ctid); + scan->xs_prev_xmax = HeapTupleHeaderGetXmax(heapTuple->t_data); + } + else + scan->xs_next_hot = InvalidOffsetNumber; + + LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK); + + pgstat_count_heap_fetch(scan->indexRelation); + + return heapTuple; + } - if (HeapTupleSatisfiesVacuum(heapTuple->t_data, RecentGlobalXmin, - scan->xs_cbuf) == HEAPTUPLE_DEAD) - scan->kill_prior_tuple = true; + /* + * If we can't see it, maybe no one else can either. Check to see + * if the tuple is dead to all transactions. If we find that all + * the tuples in the HOT chain are dead, we'll signal the index AM + * to not return that TID on future indexscans. + */ + if (scan->xs_hot_dead && + HeapTupleSatisfiesVacuum(heapTuple->t_data, RecentGlobalXmin, + scan->xs_cbuf) != HEAPTUPLE_DEAD) + scan->xs_hot_dead = false; + + /* + * Check to see if HOT chain continues past this tuple; if so + * fetch the next offnum (we don't bother storing it into + * xs_next_hot, but must store xs_prev_xmax), and loop around. 
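Each subsequent hop is validated by checking that the member's xmin equals the xmax remembered from the previous member; a mismatch means the chain is broken (for example because the prior member's updater aborted) and the walk must stop rather than return an unrelated tuple. Here is a standalone sketch of that linkage check over a toy chain, with plain integers standing in for TransactionIds.

    #include <stdio.h>

    typedef struct
    {
        unsigned xmin;
        unsigned xmax;      /* 0 means "invalid" in this toy */
        int      next;      /* next chain member, or -1 at end of chain */
    } ToyTuple;

    /* Walk the chain from 'start', stopping if the xmin/xmax linkage breaks */
    static int
    walk_chain(const ToyTuple *tuples, int start)
    {
        unsigned prior_xmax = 0;
        int      visited = 0;
        int      i = start;

        while (i >= 0)
        {
            if (prior_xmax != 0 && tuples[i].xmin != prior_xmax)
                break;      /* broken chain: not a member after all */
            visited++;
            prior_xmax = tuples[i].xmax;
            i = tuples[i].next;
        }
        return visited;
    }

    int
    main(void)
    {
        ToyTuple chain[] = {
            {100, 101, 1},
            {101, 102, 2},
            {999, 103, -1},   /* xmin does not match prior xmax: stop before it */
        };

        printf("members visited: %d\n", walk_chain(chain, 0));   /* prints 2 */
        return 0;
    }
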
+ */ + if (HeapTupleIsHotUpdated(heapTuple)) + { + Assert(ItemPointerGetBlockNumber(ctid) == + ItemPointerGetBlockNumber(tid)); + offnum = ItemPointerGetOffsetNumber(ctid); + at_chain_start = false; + scan->xs_prev_xmax = HeapTupleHeaderGetXmax(heapTuple->t_data); + } + else + break; /* end of chain */ + } /* loop over a single HOT chain */ LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK); + + /* Loop around to ask index AM for another TID */ + scan->xs_next_hot = InvalidOffsetNumber; + } + + /* Release any held pin on a heap page */ + if (BufferIsValid(scan->xs_cbuf)) + { + ReleaseBuffer(scan->xs_cbuf); + scan->xs_cbuf = InvalidBuffer; } - /* Success exit */ - return heapTuple; + return NULL; /* failure exit */ } /* ---------------- diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index 7dbaa2c245f..5f7ecbe16da 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.159 2007/09/12 22:10:26 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.160 2007/09/20 17:56:30 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -193,8 +193,6 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, */ for (;;) { - HeapTupleData htup; - Buffer hbuffer; ItemId curitemid; IndexTuple curitup; BlockNumber nblkno; @@ -223,6 +221,9 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, */ if (!ItemIdIsDead(curitemid)) { + ItemPointerData htid; + bool all_dead; + /* * _bt_compare returns 0 for (1,NULL) and (1,NULL) - this's * how we handling NULLs - and so we must not use _bt_compare @@ -234,17 +235,20 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, /* okay, we gotta fetch the heap tuple ... */ curitup = (IndexTuple) PageGetItem(page, curitemid); - htup.t_self = curitup->t_tid; - if (heap_fetch(heapRel, &SnapshotDirty, &htup, &hbuffer, - true, NULL)) + htid = curitup->t_tid; + + /* + * We check the whole HOT-chain to see if there is any tuple + * that satisfies SnapshotDirty. This is necessary because + * we have just a single index entry for the entire chain. + */ + if (heap_hot_search(&htid, heapRel, &SnapshotDirty, &all_dead)) { /* it is a duplicate */ TransactionId xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ? SnapshotDirty.xmin : SnapshotDirty.xmax; - ReleaseBuffer(hbuffer); - /* * If this tuple is being updated by other transaction * then we have to wait for its commit/abort. @@ -263,15 +267,22 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, * is itself now committed dead --- if so, don't complain. * This is a waste of time in normal scenarios but we must * do it to support CREATE INDEX CONCURRENTLY. + * + * We must follow HOT-chains here because during + * concurrent index build, we insert the root TID though + * the actual tuple may be somewhere in the HOT-chain. + * While following the chain we might not stop at the exact + * tuple which triggered the insert, but that's OK because + * if we find a live tuple anywhere in this chain, we have + * a unique key conflict. The other live tuple is not part + * of this chain because it had a different index entry. 
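Because there is now a single index entry per HOT chain, the uniqueness check asks whether any member of the chain satisfies the dirty snapshot, and also learns whether the whole chain is dead so the index entry can be marked killed. The toy below imitates that interface in a heavily simplified form; real visibility and SnapshotDirty semantics are of course richer, and the actual helper stops at the first visible member.

    #include <stdio.h>
    #include <stdbool.h>

    typedef enum { VIS_DEAD, VIS_VISIBLE } ToyVis;

    /*
     * Toy counterpart of a chain search: report whether any member of the
     * chain is visible, and optionally whether every member is dead.
     */
    static bool
    chain_search(const ToyVis *chain, int nchain, bool *all_dead)
    {
        bool found = false;
        bool dead = true;
        int  i;

        for (i = 0; i < nchain; i++)
        {
            if (chain[i] == VIS_VISIBLE)
                found = true;
            if (chain[i] != VIS_DEAD)
                dead = false;
        }
        if (all_dead)
            *all_dead = dead;
        return found;
    }

    int
    main(void)
    {
        ToyVis chain[] = {VIS_DEAD, VIS_VISIBLE};
        bool   all_dead;

        if (chain_search(chain, 2, &all_dead))
            printf("duplicate key: some chain member is live\n");
        else if (all_dead)
            printf("chain entirely dead: mark index entry killed\n");
        return 0;
    }
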
*/ - htup.t_self = itup->t_tid; - if (heap_fetch(heapRel, SnapshotSelf, &htup, &hbuffer, - false, NULL)) + htid = itup->t_tid; + if (heap_hot_search(&htid, heapRel, SnapshotSelf, NULL)) { /* Normal case --- it's still live */ - ReleaseBuffer(hbuffer); } - else if (htup.t_data != NULL) + else { /* * It's been deleted, so no error, and no need to @@ -279,39 +290,27 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, */ break; } - else - { - /* couldn't find the tuple?? */ - elog(ERROR, "failed to fetch tuple being inserted"); - } ereport(ERROR, (errcode(ERRCODE_UNIQUE_VIOLATION), errmsg("duplicate key value violates unique constraint \"%s\"", RelationGetRelationName(rel)))); } - else if (htup.t_data != NULL) + else if (all_dead) { /* - * Hmm, if we can't see the tuple, maybe it can be marked - * killed. This logic should match index_getnext and - * btgettuple. + * The conflicting tuple (or whole HOT chain) is dead to + * everyone, so we may as well mark the index entry + * killed. */ - LockBuffer(hbuffer, BUFFER_LOCK_SHARE); - if (HeapTupleSatisfiesVacuum(htup.t_data, RecentGlobalXmin, - hbuffer) == HEAPTUPLE_DEAD) - { - ItemIdMarkDead(curitemid); - opaque->btpo_flags |= BTP_HAS_GARBAGE; - /* be sure to mark the proper buffer dirty... */ - if (nbuf != InvalidBuffer) - SetBufferCommitInfoNeedsSave(nbuf); - else - SetBufferCommitInfoNeedsSave(buf); - } - LockBuffer(hbuffer, BUFFER_LOCK_UNLOCK); + ItemIdMarkDead(curitemid); + opaque->btpo_flags |= BTP_HAS_GARBAGE; + /* be sure to mark the proper buffer dirty... */ + if (nbuf != InvalidBuffer) + SetBufferCommitInfoNeedsSave(nbuf); + else + SetBufferCommitInfoNeedsSave(buf); } - ReleaseBuffer(hbuffer); } } @@ -840,7 +839,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, itemsz = ItemIdGetLength(itemid); item = (IndexTuple) PageGetItem(origpage, itemid); if (PageAddItem(rightpage, (Item) item, itemsz, rightoff, - false) == InvalidOffsetNumber) + false, false) == InvalidOffsetNumber) elog(PANIC, "failed to add hikey to the right sibling"); rightoff = OffsetNumberNext(rightoff); } @@ -865,7 +864,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, item = (IndexTuple) PageGetItem(origpage, itemid); } if (PageAddItem(leftpage, (Item) item, itemsz, leftoff, - false) == InvalidOffsetNumber) + false, false) == InvalidOffsetNumber) elog(PANIC, "failed to add hikey to the left sibling"); leftoff = OffsetNumberNext(leftoff); @@ -1700,7 +1699,7 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) * benefit of _bt_restore_page(). */ if (PageAddItem(rootpage, (Item) new_item, itemsz, P_HIKEY, - false) == InvalidOffsetNumber) + false, false) == InvalidOffsetNumber) elog(PANIC, "failed to add leftkey to new root page"); pfree(new_item); @@ -1718,7 +1717,7 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) * insert the right page pointer into the new root page. 
*/ if (PageAddItem(rootpage, (Item) new_item, itemsz, P_FIRSTKEY, - false) == InvalidOffsetNumber) + false, false) == InvalidOffsetNumber) elog(PANIC, "failed to add rightkey to new root page"); pfree(new_item); @@ -1805,7 +1804,7 @@ _bt_pgaddtup(Relation rel, } if (PageAddItem(page, (Item) itup, itemsize, itup_off, - false) == InvalidOffsetNumber) + false, false) == InvalidOffsetNumber) elog(PANIC, "failed to add item to the %s for \"%s\"", where, RelationGetRelationName(rel)); } diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 118dc22bb35..6293792b9f5 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -57,7 +57,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsort.c,v 1.112 2007/09/12 22:10:26 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsort.c,v 1.113 2007/09/20 17:56:30 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -400,7 +400,7 @@ _bt_sortaddtup(Page page, } if (PageAddItem(page, (Item) itup, itemsize, itup_off, - false) == InvalidOffsetNumber) + false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add item to the index page"); } diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index db64422b19f..499129c48f1 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.45 2007/09/12 22:10:26 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.46 2007/09/20 17:56:30 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -141,8 +141,8 @@ _bt_restore_page(Page page, char *from, int len) memcpy(&itupdata, from, sizeof(IndexTupleData)); itemsz = IndexTupleDSize(itupdata); itemsz = MAXALIGN(itemsz); - if (PageAddItem(page, (Item) from, itemsz, - FirstOffsetNumber, false) == InvalidOffsetNumber) + if (PageAddItem(page, (Item) from, itemsz, FirstOffsetNumber, + false, false) == InvalidOffsetNumber) elog(PANIC, "_bt_restore_page: cannot add item to page"); from += itemsz; } @@ -238,7 +238,7 @@ btree_xlog_insert(bool isleaf, bool ismeta, { if (PageAddItem(page, (Item) datapos, datalen, ItemPointerGetOffsetNumber(&(xlrec->target.tid)), - false) == InvalidOffsetNumber) + false, false) == InvalidOffsetNumber) elog(PANIC, "btree_insert_redo: failed to add item"); PageSetLSN(page, lsn); @@ -389,7 +389,7 @@ btree_xlog_split(bool onleft, bool isroot, if (onleft) { if (PageAddItem(lpage, newitem, newitemsz, newitemoff, - false) == InvalidOffsetNumber) + false, false) == InvalidOffsetNumber) elog(PANIC, "failed to add new item to left page after split"); } @@ -398,7 +398,7 @@ btree_xlog_split(bool onleft, bool isroot, hiItem = PageGetItem(rpage, hiItemId); if (PageAddItem(lpage, hiItem, ItemIdGetLength(hiItemId), - P_HIKEY, false) == InvalidOffsetNumber) + P_HIKEY, false, false) == InvalidOffsetNumber) elog(PANIC, "failed to add high key to left page after split"); /* Fix opaque fields */ diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 9aa58e35f9a..8137377e7a5 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/catalog/index.c,v 1.284 
2007/05/30 20:11:55 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/catalog/index.c,v 1.285 2007/09/20 17:56:30 tgl Exp $ * * * INTERFACE ROUTINES @@ -410,6 +410,9 @@ UpdateIndexRelation(Oid indexoid, values[Anum_pg_index_indisprimary - 1] = BoolGetDatum(primary); values[Anum_pg_index_indisclustered - 1] = BoolGetDatum(false); values[Anum_pg_index_indisvalid - 1] = BoolGetDatum(isvalid); + values[Anum_pg_index_indcheckxmin - 1] = BoolGetDatum(false); + /* we set isvalid and isready the same way */ + values[Anum_pg_index_indisready - 1] = BoolGetDatum(isvalid); values[Anum_pg_index_indkey - 1] = PointerGetDatum(indkey); values[Anum_pg_index_indclass - 1] = PointerGetDatum(indclass); values[Anum_pg_index_indoption - 1] = PointerGetDatum(indoption); @@ -944,7 +947,11 @@ BuildIndexInfo(Relation index) /* other info */ ii->ii_Unique = indexStruct->indisunique; - ii->ii_Concurrent = false; /* assume normal case */ + ii->ii_ReadyForInserts = indexStruct->indisready; + + /* initialize index-build state to default */ + ii->ii_Concurrent = false; + ii->ii_BrokenHotChain = false; return ii; } @@ -1309,6 +1316,35 @@ index_build(Relation heapRelation, Assert(PointerIsValid(stats)); /* + * If we found any potentially broken HOT chains, mark the index as + * not being usable until the current transaction is below the event + * horizon. See src/backend/access/heap/README.HOT for discussion. + */ + if (indexInfo->ii_BrokenHotChain) + { + Oid indexId = RelationGetRelid(indexRelation); + Relation pg_index; + HeapTuple indexTuple; + Form_pg_index indexForm; + + pg_index = heap_open(IndexRelationId, RowExclusiveLock); + + indexTuple = SearchSysCacheCopy(INDEXRELID, + ObjectIdGetDatum(indexId), + 0, 0, 0); + if (!HeapTupleIsValid(indexTuple)) + elog(ERROR, "cache lookup failed for index %u", indexId); + indexForm = (Form_pg_index) GETSTRUCT(indexTuple); + + indexForm->indcheckxmin = true; + simple_heap_update(pg_index, &indexTuple->t_self, indexTuple); + CatalogUpdateIndexes(pg_index, indexTuple); + + heap_freetuple(indexTuple); + heap_close(pg_index, RowExclusiveLock); + } + + /* * Update heap and index pg_class rows */ index_update_stats(heapRelation, @@ -1346,6 +1382,11 @@ index_build(Relation heapRelation, * must keep track of the number of index tuples; we don't do so here because * the AM might reject some of the tuples for its own reasons, such as being * unable to store NULLs. + * + * A side effect is to set indexInfo->ii_BrokenHotChain to true if we detect + * any potentially broken HOT chains. Currently, we set this if there are + * any RECENTLY_DEAD entries in a HOT chain, without trying very hard to + * detect whether they're really incompatible with the chain tip. */ double IndexBuildHeapScan(Relation heapRelation, @@ -1365,6 +1406,8 @@ IndexBuildHeapScan(Relation heapRelation, ExprContext *econtext; Snapshot snapshot; TransactionId OldestXmin; + BlockNumber root_blkno = InvalidBlockNumber; + OffsetNumber root_offsets[MaxHeapTuplesPerPage]; /* * sanity checks @@ -1427,15 +1470,47 @@ IndexBuildHeapScan(Relation heapRelation, CHECK_FOR_INTERRUPTS(); + /* + * When dealing with a HOT-chain of updated tuples, we want to + * index the values of the live tuple (if any), but index it + * under the TID of the chain's root tuple. This approach is + * necessary to preserve the HOT-chain structure in the heap. + * So we need to be able to find the root item offset for every + * tuple that's in a HOT-chain. 
When first reaching a new page + * of the relation, call heap_get_root_tuples() to build a map + * of root item offsets on the page. + * + * It might look unsafe to use this information across buffer + * lock/unlock. However, we hold ShareLock on the table so no + * ordinary insert/update/delete should occur; and we hold pin on + * the buffer continuously while visiting the page, so no pruning + * operation can occur either. + * + * Note the implied assumption that there is no more than one live + * tuple per HOT-chain ... + */ + if (scan->rs_cblock != root_blkno) + { + Page page = BufferGetPage(scan->rs_cbuf); + + LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); + heap_get_root_tuples(page, root_offsets); + LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); + + root_blkno = scan->rs_cblock; + } + if (snapshot == SnapshotAny) { /* do our own time qual check */ bool indexIt; + recheck: /* * We could possibly get away with not locking the buffer here, * since caller should hold ShareLock on the relation, but let's - * be conservative about it. + * be conservative about it. (This remark is still correct + * even with HOT-pruning: our pin on the buffer prevents pruning.) */ LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); @@ -1458,10 +1533,29 @@ IndexBuildHeapScan(Relation heapRelation, * If tuple is recently deleted then we must index it * anyway to preserve MVCC semantics. (Pre-existing * transactions could try to use the index after we finish - * building it, and may need to see such tuples.) Exclude - * it from unique-checking, however. + * building it, and may need to see such tuples.) + * + * However, if it was HOT-updated then we must only index + * the live tuple at the end of the HOT-chain. Since this + * breaks semantics for pre-existing snapshots, mark + * the index as unusable for them. + * + * If we've already decided that the index will be unsafe + * for old snapshots, we may as well stop indexing + * recently-dead tuples, since there's no longer any + * point. */ - indexIt = true; + if (HeapTupleIsHotUpdated(heapTuple)) + { + indexIt = false; + /* mark the index as unsafe for old snapshots */ + indexInfo->ii_BrokenHotChain = true; + } + else if (indexInfo->ii_BrokenHotChain) + indexIt = false; + else + indexIt = true; + /* In any case, exclude the tuple from unique-checking */ tupleIsAlive = false; break; case HEAPTUPLE_INSERT_IN_PROGRESS: @@ -1473,12 +1567,31 @@ IndexBuildHeapScan(Relation heapRelation, * followed by CREATE INDEX within a transaction.) An * exception occurs when reindexing a system catalog, * because we often release lock on system catalogs before - * committing. + * committing. In that case we wait for the inserting + * transaction to finish and check again. (We could do + * that on user tables too, but since the case is not + * expected it seems better to throw an error.) */ if (!TransactionIdIsCurrentTransactionId( - HeapTupleHeaderGetXmin(heapTuple->t_data)) - && !IsSystemRelation(heapRelation)) - elog(ERROR, "concurrent insert in progress"); + HeapTupleHeaderGetXmin(heapTuple->t_data))) + { + if (!IsSystemRelation(heapRelation)) + elog(ERROR, "concurrent insert in progress"); + else + { + /* + * Must drop the lock on the buffer before we wait + */ + TransactionId xwait = HeapTupleHeaderGetXmin(heapTuple->t_data); + LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); + XactLockTableWait(xwait); + goto recheck; + } + } + /* + * We must index such tuples, since if the index build + * commits then they're good. 
+ */ indexIt = true; tupleIsAlive = true; break; @@ -1491,19 +1604,48 @@ IndexBuildHeapScan(Relation heapRelation, * followed by CREATE INDEX within a transaction.) An * exception occurs when reindexing a system catalog, * because we often release lock on system catalogs before - * committing. + * committing. In that case we wait for the deleting + * transaction to finish and check again. (We could do + * that on user tables too, but since the case is not + * expected it seems better to throw an error.) */ Assert(!(heapTuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI)); if (!TransactionIdIsCurrentTransactionId( - HeapTupleHeaderGetXmax(heapTuple->t_data)) - && !IsSystemRelation(heapRelation)) - elog(ERROR, "concurrent delete in progress"); - indexIt = true; + HeapTupleHeaderGetXmax(heapTuple->t_data))) + { + if (!IsSystemRelation(heapRelation)) + elog(ERROR, "concurrent delete in progress"); + else + { + /* + * Must drop the lock on the buffer before we wait + */ + TransactionId xwait = HeapTupleHeaderGetXmax(heapTuple->t_data); + LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); + XactLockTableWait(xwait); + goto recheck; + } + } + /* + * Otherwise, we have to treat these tuples just like + * RECENTLY_DELETED ones. + */ + if (HeapTupleIsHotUpdated(heapTuple)) + { + indexIt = false; + /* mark the index as unsafe for old snapshots */ + indexInfo->ii_BrokenHotChain = true; + } + else if (indexInfo->ii_BrokenHotChain) + indexIt = false; + else + indexIt = true; + /* In any case, exclude the tuple from unique-checking */ tupleIsAlive = false; break; default: elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); - indexIt = tupleIsAlive = false; /* keep compiler quiet */ + indexIt = tupleIsAlive = false; /* keep compiler quiet */ break; } @@ -1552,9 +1694,33 @@ IndexBuildHeapScan(Relation heapRelation, * pass the values[] and isnull[] arrays, instead. */ - /* Call the AM's callback routine to process the tuple */ - callback(indexRelation, heapTuple, values, isnull, tupleIsAlive, - callback_state); + if (HeapTupleIsHeapOnly(heapTuple)) + { + /* + * For a heap-only tuple, pretend its TID is that of the root. + * See src/backend/access/heap/README.HOT for discussion. + */ + HeapTupleData rootTuple; + OffsetNumber offnum; + + rootTuple = *heapTuple; + offnum = ItemPointerGetOffsetNumber(&heapTuple->t_self); + + Assert(OffsetNumberIsValid(root_offsets[offnum - 1])); + + ItemPointerSetOffsetNumber(&rootTuple.t_self, + root_offsets[offnum - 1]); + + /* Call the AM's callback routine to process the tuple */ + callback(indexRelation, &rootTuple, values, isnull, tupleIsAlive, + callback_state); + } + else + { + /* Call the AM's callback routine to process the tuple */ + callback(indexRelation, heapTuple, values, isnull, tupleIsAlive, + callback_state); + } } heap_endscan(scan); @@ -1574,8 +1740,15 @@ IndexBuildHeapScan(Relation heapRelation, /* * validate_index - support code for concurrent index builds * - * We do a concurrent index build by first building the index normally via - * index_create(), while holding a weak lock that allows concurrent + * We do a concurrent index build by first inserting the catalog entry for the + * index via index_create(), marking it not indisready and not indisvalid. + * Then we commit our transaction and start a new one, then we wait for all + * transactions that could have been modifying the table to terminate. 
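/*
 * Editorial summary (not patch text) of the pg_index flag timeline that the
 * rewritten comment here describes for CREATE INDEX CONCURRENTLY:
 *
 *   txn 1: insert catalog row            indisready = false  indisvalid = false
 *          commit, wait out writers
 *   txn 2: build the index, then set     indisready = true   indisvalid = false
 *          commit, wait out writers
 *   txn 3: validate_index() under a
 *          reference snapshot, wait out
 *          older snapshots, then set     indisready = true   indisvalid = true
 *
 * The "wait out" step is the loop DefineIndex() runs between phases; the
 * declaration of old_lockholders is added only to make this fragment
 * self-contained:
 */
VirtualTransactionId *old_lockholders;

old_lockholders = GetLockConflicts(&heaplocktag, ShareLock);
while (VirtualTransactionIdIsValid(*old_lockholders))
{
	VirtualXactLockTableWait(*old_lockholders);
	old_lockholders++;
}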
Now + * we know that any subsequently-started transactions will see the index and + * honor its constraints on HOT updates; so while existing HOT-chains might + * be broken with respect to the index, no currently live tuple will have an + * incompatible HOT update done to it. We now build the index normally via + * index_build(), while holding a weak lock that allows concurrent * insert/update/delete. Also, we index only tuples that are valid * as of the start of the scan (see IndexBuildHeapScan), whereas a normal * build takes care to include recently-dead tuples. This is OK because @@ -1586,11 +1759,10 @@ IndexBuildHeapScan(Relation heapRelation, * if we used HeapTupleSatisfiesVacuum). This leaves us with an index that * does not contain any tuples added to the table while we built the index. * - * Next, we commit the transaction so that the index becomes visible to other - * backends, but it is marked not "indisvalid" to prevent the planner from - * relying on it for indexscans. Then we wait for all transactions that - * could have been modifying the table to terminate. At this point we - * know that any subsequently-started transactions will see the index and + * Next, we mark the index "indisready" (but still not "indisvalid") and + * commit the second transaction and start a third. Again we wait for all + * transactions that could have been modifying the table to terminate. Now + * we know that any subsequently-started transactions will see the index and * insert their new tuples into it. We then take a new reference snapshot * which is passed to validate_index(). Any tuples that are valid according * to this snap, but are not in the index, must be added to the index. @@ -1610,7 +1782,7 @@ IndexBuildHeapScan(Relation heapRelation, * Building a unique index this way is tricky: we might try to insert a * tuple that is already dead or is in process of being deleted, and we * mustn't have a uniqueness failure against an updated version of the same - * row. We can check the tuple to see if it's already dead and tell + * row. We could try to check the tuple to see if it's already dead and tell * index_insert() not to do the uniqueness check, but that still leaves us * with a race condition against an in-progress update. To handle that, * we expect the index AM to recheck liveness of the to-be-inserted tuple @@ -1620,7 +1792,8 @@ IndexBuildHeapScan(Relation heapRelation, * were alive at the time of the reference snapshot are gone; this is * necessary to be sure there are none left with a serializable snapshot * older than the reference (and hence possibly able to see tuples we did - * not index). Then we mark the index valid and commit. + * not index). Then we mark the index "indisvalid" and commit. Subsequent + * transactions will be able to use it for queries. * * Doing two full table scans is a brute-force strategy. 
We could try to be * cleverer, eg storing new tuples in a special area of the table (perhaps @@ -1727,6 +1900,9 @@ validate_index_heapscan(Relation heapRelation, TupleTableSlot *slot; EState *estate; ExprContext *econtext; + BlockNumber root_blkno = InvalidBlockNumber; + OffsetNumber root_offsets[MaxHeapTuplesPerPage]; + bool in_index[MaxHeapTuplesPerPage]; /* state variables for the merge */ ItemPointer indexcursor = NULL; @@ -1768,39 +1944,86 @@ validate_index_heapscan(Relation heapRelation, while ((heapTuple = heap_getnext(scan, ForwardScanDirection)) != NULL) { ItemPointer heapcursor = &heapTuple->t_self; + ItemPointerData rootTuple; + OffsetNumber root_offnum; CHECK_FOR_INTERRUPTS(); state->htups += 1; /* + * As commented in IndexBuildHeapScan, we should index heap-only tuples + * under the TIDs of their root tuples; so when we advance onto a new + * heap page, build a map of root item offsets on the page. + * + * This complicates merging against the tuplesort output: we will + * visit the live tuples in order by their offsets, but the root + * offsets that we need to compare against the index contents might + * be ordered differently. So we might have to "look back" within + * the tuplesort output, but only within the current page. We handle + * that by keeping a bool array in_index[] showing all the + * already-passed-over tuplesort output TIDs of the current page. + * We clear that array here, when advancing onto a new heap page. + */ + if (scan->rs_cblock != root_blkno) + { + Page page = BufferGetPage(scan->rs_cbuf); + + LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); + heap_get_root_tuples(page, root_offsets); + LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); + + memset(in_index, 0, sizeof(in_index)); + + root_blkno = scan->rs_cblock; + } + + /* Convert actual tuple TID to root TID */ + rootTuple = *heapcursor; + root_offnum = ItemPointerGetOffsetNumber(heapcursor); + + if (HeapTupleIsHeapOnly(heapTuple)) + { + root_offnum = root_offsets[root_offnum - 1]; + Assert(OffsetNumberIsValid(root_offnum)); + ItemPointerSetOffsetNumber(&rootTuple, root_offnum); + } + + /* * "merge" by skipping through the index tuples until we find or pass - * the current heap tuple. + * the current root tuple. */ while (!tuplesort_empty && (!indexcursor || - ItemPointerCompare(indexcursor, heapcursor) < 0)) + ItemPointerCompare(indexcursor, &rootTuple) < 0)) { Datum ts_val; bool ts_isnull; if (indexcursor) + { + /* + * Remember index items seen earlier on the current heap page + */ + if (ItemPointerGetBlockNumber(indexcursor) == root_blkno) + in_index[ItemPointerGetOffsetNumber(indexcursor) - 1] = true; pfree(indexcursor); + } + tuplesort_empty = !tuplesort_getdatum(state->tuplesort, true, &ts_val, &ts_isnull); Assert(tuplesort_empty || !ts_isnull); indexcursor = (ItemPointer) DatumGetPointer(ts_val); } - if (tuplesort_empty || - ItemPointerCompare(indexcursor, heapcursor) > 0) + /* + * If the tuplesort has overshot *and* we didn't see a match earlier, + * then this tuple is missing from the index, so insert it. + */ + if ((tuplesort_empty || + ItemPointerCompare(indexcursor, &rootTuple) > 0) && + !in_index[root_offnum - 1]) { - /* - * We've overshot which means this heap tuple is missing from the - * index, so insert it. 
- */ - bool check_unique; - MemoryContextReset(econtext->ecxt_per_tuple_memory); /* Set up for predicate or expression evaluation */ @@ -1828,39 +2051,29 @@ validate_index_heapscan(Relation heapRelation, isnull); /* - * If the tuple is already committed dead, we still have to put it - * in the index (because some xacts might be able to see it), but - * we might as well suppress uniqueness checking. This is just an - * optimization because the index AM is not supposed to raise a - * uniqueness failure anyway. - */ - if (indexInfo->ii_Unique) - { - /* must lock buffer to call HeapTupleSatisfiesVisibility */ - LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); - - if (HeapTupleSatisfiesVisibility(heapTuple, SnapshotNow, - scan->rs_cbuf)) - check_unique = true; - else - check_unique = false; - - LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); - } - else - check_unique = false; - - /* * You'd think we should go ahead and build the index tuple here, * but some index AMs want to do further processing on the data * first. So pass the values[] and isnull[] arrays, instead. */ + + /* + * If the tuple is already committed dead, you might think we + * could suppress uniqueness checking, but this is no longer + * true in the presence of HOT, because the insert is actually + * a proxy for a uniqueness check on the whole HOT-chain. That + * is, the tuple we have here could be dead because it was already + * HOT-updated, and if so the updating transaction will not have + * thought it should insert index entries. The index AM will + * check the whole HOT-chain and correctly detect a conflict + * if there is one. + */ + index_insert(indexRelation, values, isnull, - heapcursor, + &rootTuple, heapRelation, - check_unique); + indexInfo->ii_Unique); state->tups_inserted += 1; } @@ -1983,9 +2196,9 @@ reindex_index(Oid indexId) ResetReindexProcessing(); /* - * If the index is marked invalid (ie, it's from a failed CREATE INDEX - * CONCURRENTLY), we can now mark it valid. This allows REINDEX to be - * used to clean up in such cases. + * If the index is marked invalid or not ready (ie, it's from a failed + * CREATE INDEX CONCURRENTLY), we can now mark it valid. This allows + * REINDEX to be used to clean up in such cases. */ pg_index = heap_open(IndexRelationId, RowExclusiveLock); @@ -1996,9 +2209,10 @@ reindex_index(Oid indexId) elog(ERROR, "cache lookup failed for index %u", indexId); indexForm = (Form_pg_index) GETSTRUCT(indexTuple); - if (!indexForm->indisvalid) + if (!indexForm->indisvalid || !indexForm->indisready) { indexForm->indisvalid = true; + indexForm->indisready = true; simple_heap_update(pg_index, &indexTuple->t_self, indexTuple); CatalogUpdateIndexes(pg_index, indexTuple); } diff --git a/src/backend/catalog/indexing.c b/src/backend/catalog/indexing.c index e6ef88fd4ab..6f71022ffd2 100644 --- a/src/backend/catalog/indexing.c +++ b/src/backend/catalog/indexing.c @@ -9,7 +9,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/catalog/indexing.c,v 1.114 2007/01/05 22:19:24 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/catalog/indexing.c,v 1.115 2007/09/20 17:56:30 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -78,6 +78,10 @@ CatalogIndexInsert(CatalogIndexState indstate, HeapTuple heapTuple) Datum values[INDEX_MAX_KEYS]; bool isnull[INDEX_MAX_KEYS]; + /* HOT update does not require index inserts */ + if (HeapTupleIsHeapOnly(heapTuple)) + return; + /* * Get information from the state structure. Fall out if nothing to do. 
*/ @@ -101,6 +105,10 @@ CatalogIndexInsert(CatalogIndexState indstate, HeapTuple heapTuple) indexInfo = indexInfoArray[i]; + /* If the index is marked as read-only, ignore it */ + if (!indexInfo->ii_ReadyForInserts) + continue; + /* * Expressional and partial indexes on system catalogs are not * supported diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index e4ae0f39d4e..3e76bd17253 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -3,7 +3,7 @@ * * Copyright (c) 1996-2007, PostgreSQL Global Development Group * - * $PostgreSQL: pgsql/src/backend/catalog/system_views.sql,v 1.44 2007/09/11 08:51:22 teodor Exp $ + * $PostgreSQL: pgsql/src/backend/catalog/system_views.sql,v 1.45 2007/09/20 17:56:30 tgl Exp $ */ CREATE VIEW pg_roles AS @@ -207,6 +207,7 @@ CREATE VIEW pg_stat_all_tables AS pg_stat_get_tuples_inserted(C.oid) AS n_tup_ins, pg_stat_get_tuples_updated(C.oid) AS n_tup_upd, pg_stat_get_tuples_deleted(C.oid) AS n_tup_del, + pg_stat_get_tuples_hot_updated(C.oid) AS n_tup_hot_upd, pg_stat_get_live_tuples(C.oid) AS n_live_tup, pg_stat_get_dead_tuples(C.oid) AS n_dead_tup, pg_stat_get_last_vacuum_time(C.oid) as last_vacuum, diff --git a/src/backend/catalog/toasting.c b/src/backend/catalog/toasting.c index 2fe44f59f8c..86d1def1cc8 100644 --- a/src/backend/catalog/toasting.c +++ b/src/backend/catalog/toasting.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/catalog/toasting.c,v 1.7 2007/07/25 22:16:18 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/catalog/toasting.c,v 1.8 2007/09/20 17:56:30 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -225,7 +225,9 @@ create_toast_table(Relation rel, Oid toastOid, Oid toastIndexOid) indexInfo->ii_Predicate = NIL; indexInfo->ii_PredicateState = NIL; indexInfo->ii_Unique = true; + indexInfo->ii_ReadyForInserts = true; indexInfo->ii_Concurrent = false; + indexInfo->ii_BrokenHotChain = false; classObjectId[0] = OID_BTREE_OPS_OID; classObjectId[1] = INT4_BTREE_OPS_OID; diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index ebac5957bd2..943978e589a 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/commands/indexcmds.c,v 1.165 2007/09/10 21:59:37 alvherre Exp $ + * $PostgreSQL: pgsql/src/backend/commands/indexcmds.c,v 1.166 2007/09/20 17:56:31 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -119,6 +119,7 @@ DefineIndex(RangeVar *heapRelation, Oid namespaceId; Oid tablespaceId; Relation rel; + Relation indexRelation; HeapTuple tuple; Form_pg_am accessMethodForm; bool amcanorder; @@ -420,7 +421,10 @@ DefineIndex(RangeVar *heapRelation, indexInfo->ii_Predicate = make_ands_implicit(predicate); indexInfo->ii_PredicateState = NIL; indexInfo->ii_Unique = unique; + /* In a concurrent build, mark it not-ready-for-inserts */ + indexInfo->ii_ReadyForInserts = !concurrent; indexInfo->ii_Concurrent = concurrent; + indexInfo->ii_BrokenHotChain = false; classObjectId = (Oid *) palloc(numberOfAttributes * sizeof(Oid)); coloptions = (int16 *) palloc(numberOfAttributes * sizeof(int16)); @@ -439,23 +443,38 @@ DefineIndex(RangeVar *heapRelation, primary ? 
"PRIMARY KEY" : "UNIQUE", indexRelationName, RelationGetRelationName(rel)))); - /* save lockrelid for below, then close rel */ + /* save lockrelid and locktag for below, then close rel */ heaprelid = rel->rd_lockInfo.lockRelId; + SET_LOCKTAG_RELATION(heaplocktag, heaprelid.dbId, heaprelid.relId); heap_close(rel, NoLock); + if (!concurrent) + { + indexRelationId = + index_create(relationId, indexRelationName, indexRelationId, + indexInfo, accessMethodId, tablespaceId, classObjectId, + coloptions, reloptions, primary, isconstraint, + allowSystemTableMods, skip_build, concurrent); + + return; /* We're done, in the standard case */ + } + + /* + * For a concurrent build, we next insert the catalog entry and add + * constraints. We don't build the index just yet; we must first make + * the catalog entry so that the new index is visible to updating + * transactions. That will prevent them from making incompatible HOT + * updates. The new index will be marked not indisready and not + * indisvalid, so that no one else tries to either insert into it or use + * it for queries. We pass skip_build = true to prevent the build. + */ indexRelationId = index_create(relationId, indexRelationName, indexRelationId, indexInfo, accessMethodId, tablespaceId, classObjectId, coloptions, reloptions, primary, isconstraint, - allowSystemTableMods, skip_build, concurrent); - - if (!concurrent) - return; /* We're done, in the standard case */ + allowSystemTableMods, true, concurrent); /* - * Phase 2 of concurrent index build (see comments for validate_index() - * for an overview of how this works) - * * We must commit our current transaction so that the index becomes * visible; then start another. Note that all the data structures we just * built are lost in the commit. The only data we keep past here are the @@ -476,6 +495,9 @@ DefineIndex(RangeVar *heapRelation, StartTransactionCommand(); /* + * Phase 2 of concurrent index build (see comments for validate_index() + * for an overview of how this works) + * * Now we must wait until no running transaction could have the table open * with the old list of indexes. To do this, inquire which xacts * currently would conflict with ShareLock on the table -- ie, which ones @@ -494,7 +516,91 @@ DefineIndex(RangeVar *heapRelation, * check for that. Also, prepared xacts are not reported, which is * fine since they certainly aren't going to do anything more. */ - SET_LOCKTAG_RELATION(heaplocktag, heaprelid.dbId, heaprelid.relId); + old_lockholders = GetLockConflicts(&heaplocktag, ShareLock); + + while (VirtualTransactionIdIsValid(*old_lockholders)) + { + VirtualXactLockTableWait(*old_lockholders); + old_lockholders++; + } + + /* + * At this moment we are sure that there are no transactions with the + * table open for write that don't have this new index in their list of + * indexes. We have waited out all the existing transactions and any new + * transaction will have the new index in its list, but the index is still + * marked as "not-ready-for-inserts". The index is consulted while + * deciding HOT-safety though. This arrangement ensures that no new HOT + * chains can be created where the new tuple and the old tuple in the + * chain have different index keys. + * + * We now take a new snapshot, and build the index using all tuples that + * are visible in this snapshot. We can be sure that any HOT updates + * to these tuples will be compatible with the index, since any updates + * made by transactions that didn't know about the index are now committed + * or rolled back. 
Thus, each visible tuple is either the end of its + * HOT-chain or the extension of the chain is HOT-safe for this index. + */ + + /* Open and lock the parent heap relation */ + rel = heap_openrv(heapRelation, ShareUpdateExclusiveLock); + + /* And the target index relation */ + indexRelation = index_open(indexRelationId, RowExclusiveLock); + + /* Set ActiveSnapshot since functions in the indexes may need it */ + ActiveSnapshot = CopySnapshot(GetTransactionSnapshot()); + + /* We have to re-build the IndexInfo struct, since it was lost in commit */ + indexInfo = BuildIndexInfo(indexRelation); + Assert(!indexInfo->ii_ReadyForInserts); + indexInfo->ii_Concurrent = true; + indexInfo->ii_BrokenHotChain = false; + + /* Now build the index */ + index_build(rel, indexRelation, indexInfo, primary); + + /* Close both the relations, but keep the locks */ + heap_close(rel, NoLock); + index_close(indexRelation, NoLock); + + /* + * Update the pg_index row to mark the index as ready for inserts. + * Once we commit this transaction, any new transactions that + * open the table must insert new entries into the index for insertions + * and non-HOT updates. + */ + pg_index = heap_open(IndexRelationId, RowExclusiveLock); + + indexTuple = SearchSysCacheCopy(INDEXRELID, + ObjectIdGetDatum(indexRelationId), + 0, 0, 0); + if (!HeapTupleIsValid(indexTuple)) + elog(ERROR, "cache lookup failed for index %u", indexRelationId); + indexForm = (Form_pg_index) GETSTRUCT(indexTuple); + + Assert(!indexForm->indisready); + Assert(!indexForm->indisvalid); + + indexForm->indisready = true; + + simple_heap_update(pg_index, &indexTuple->t_self, indexTuple); + CatalogUpdateIndexes(pg_index, indexTuple); + + heap_close(pg_index, RowExclusiveLock); + + /* + * Commit this transaction to make the indisready update visible. + */ + CommitTransactionCommand(); + StartTransactionCommand(); + + /* + * Phase 3 of concurrent index build + * + * We once again wait until no transaction can have the table open with + * the index marked as read-only for updates. + */ old_lockholders = GetLockConflicts(&heaplocktag, ShareLock); while (VirtualTransactionIdIsValid(*old_lockholders)) @@ -505,7 +611,7 @@ DefineIndex(RangeVar *heapRelation, /* * Now take the "reference snapshot" that will be used by validate_index() - * to filter candidate tuples. Beware! There might be still snapshots + * to filter candidate tuples. Beware! There might still be snapshots * in use that treat some transaction as in-progress that our reference * snapshot treats as committed. If such a recently-committed transaction * deleted tuples in the table, we will not include them in the index; yet @@ -560,7 +666,7 @@ DefineIndex(RangeVar *heapRelation, elog(ERROR, "cache lookup failed for index %u", indexRelationId); indexForm = (Form_pg_index) GETSTRUCT(indexTuple); - Assert(indexForm->indexrelid = indexRelationId); + Assert(indexForm->indisready); Assert(!indexForm->indisvalid); indexForm->indisvalid = true; @@ -575,7 +681,8 @@ DefineIndex(RangeVar *heapRelation, * relcache entries for the index itself, but we should also send a * relcache inval on the parent table to force replanning of cached plans. * Otherwise existing sessions might fail to use the new index where it - * would be useful. + * would be useful. (Note that our earlier commits did not create + * reasons to replan; relcache flush on the index itself was sufficient.) 
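/*
 * Editorial cross-reference (not patch text): the "not ready for inserts"
 * state managed here is honored at the two index-insertion choke points this
 * commit touches, CatalogIndexInsert() and ExecInsertIndexTuples().  Both
 * add the same guard inside their per-index loop; the loop-bound name below
 * follows the executor code loosely and is otherwise an assumption.
 */
for (i = 0; i < numIndices; i++)
{
	indexInfo = indexInfoArray[i];

	/* If the index is marked as read-only (not yet ready), ignore it */
	if (!indexInfo->ii_ReadyForInserts)
		continue;

	/* ... form values[]/isnull[] and call index_insert() as before ... */
}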
*/ CacheInvalidateRelcacheByRelid(heaprelid.relId); diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c index 73024a7e703..25d1e2311b6 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/commands/sequence.c,v 1.145 2007/09/12 22:10:26 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/commands/sequence.c,v 1.146 2007/09/20 17:56:31 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -1281,7 +1281,7 @@ seq_redo(XLogRecPtr lsn, XLogRecord *record) itemsz = record->xl_len - sizeof(xl_seq_rec); itemsz = MAXALIGN(itemsz); if (PageAddItem(page, (Item) item, itemsz, - FirstOffsetNumber, false) == InvalidOffsetNumber) + FirstOffsetNumber, false, false) == InvalidOffsetNumber) elog(PANIC, "seq_redo: failed to add item to page"); PageSetLSN(page, lsn); diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index f9b9423534e..5630fc2730d 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -13,7 +13,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.358 2007/09/12 22:10:26 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.359 2007/09/20 17:56:31 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -124,10 +124,11 @@ typedef VTupleMoveData *VTupleMove; typedef struct VRelStats { /* miscellaneous statistics */ - BlockNumber rel_pages; - double rel_tuples; - Size min_tlen; - Size max_tlen; + BlockNumber rel_pages; /* pages in relation */ + double rel_tuples; /* tuples that remain after vacuuming */ + double rel_indexed_tuples; /* indexed tuples that remain */ + Size min_tlen; /* min surviving tuple size */ + Size max_tlen; /* max surviving tuple size */ bool hasindex; /* vtlinks array for tuple chain following - sorted by new_tid */ int num_vtlinks; @@ -1177,6 +1178,7 @@ full_vacuum_rel(Relation onerel, VacuumStmt *vacstmt) vacrelstats = (VRelStats *) palloc(sizeof(VRelStats)); vacrelstats->rel_pages = 0; vacrelstats->rel_tuples = 0; + vacrelstats->rel_indexed_tuples = 0; vacrelstats->hasindex = false; /* scan the heap */ @@ -1195,13 +1197,13 @@ full_vacuum_rel(Relation onerel, VacuumStmt *vacstmt) { for (i = 0; i < nindexes; i++) vacuum_index(&vacuum_pages, Irel[i], - vacrelstats->rel_tuples, 0); + vacrelstats->rel_indexed_tuples, 0); } else { /* just scan indexes to update statistic */ for (i = 0; i < nindexes; i++) - scan_index(Irel[i], vacrelstats->rel_tuples); + scan_index(Irel[i], vacrelstats->rel_indexed_tuples); } } @@ -1256,6 +1258,7 @@ scan_heap(VRelStats *vacrelstats, Relation onerel, BlockNumber empty_pages, empty_end_pages; double num_tuples, + num_indexed_tuples, tups_vacuumed, nkeep, nunused; @@ -1278,7 +1281,7 @@ scan_heap(VRelStats *vacrelstats, Relation onerel, relname))); empty_pages = empty_end_pages = 0; - num_tuples = tups_vacuumed = nkeep = nunused = 0; + num_tuples = num_indexed_tuples = tups_vacuumed = nkeep = nunused = 0; free_space = 0; nblocks = RelationGetNumberOfBlocks(onerel); @@ -1313,9 +1316,13 @@ scan_heap(VRelStats *vacrelstats, Relation onerel, * background writer will try to write the page if it's already marked * dirty. To ensure that invalid data doesn't get written to disk, we * must take exclusive buffer lock wherever we potentially modify - * pages. + * pages. In fact, we insist on cleanup lock so that we can safely + * call heap_page_prune(). 
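/*
 * Editorial sketch (not patch text) of the pruning call VACUUM FULL now
 * issues once it holds the cleanup lock.  The fourth argument is the
 * redirect_move option named in the surrounding comment; the fifth is taken
 * here to be a report-statistics flag, an assumption inferred from the call
 * sites rather than stated in this diff.  Lazy VACUUM and the opportunistic
 * path pass false for both.
 */
LockBufferForCleanup(buf);
tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin,
								 true,		/* redirect_move: collapse redirects */
								 false);	/* assumed: don't report stats */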
(This might be overkill, since the bgwriter + * pays no attention to individual tuples, but on the other hand it's + * unlikely that the bgwriter has this particular page pinned at this + * instant. So violating the coding rule would buy us little anyway.) */ - LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + LockBufferForCleanup(buf); vacpage->blkno = blkno; vacpage->offsets_used = 0; @@ -1356,6 +1363,21 @@ scan_heap(VRelStats *vacrelstats, Relation onerel, continue; } + /* + * Prune all HOT-update chains in this page. + * + * We use the redirect_move option so that redirecting line pointers + * get collapsed out; this allows us to not worry about them below. + * + * We count tuples removed by the pruning step as removed by VACUUM. + */ + tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin, + true, false); + + /* + * Now scan the page to collect vacuumable items and check for + * tuples requiring freezing. + */ nfrozen = 0; notup = true; maxoff = PageGetMaxOffsetNumber(page); @@ -1369,7 +1391,9 @@ scan_heap(VRelStats *vacrelstats, Relation onerel, /* * Collect un-used items too - it's possible to have indexes - * pointing here after crash. + * pointing here after crash. (That's an ancient comment and + * is likely obsolete with WAL, but we might as well continue + * to check for such problems.) */ if (!ItemIdIsUsed(itemid)) { @@ -1378,6 +1402,23 @@ scan_heap(VRelStats *vacrelstats, Relation onerel, continue; } + /* + * DEAD item pointers are to be vacuumed normally; but we don't + * count them in tups_vacuumed, else we'd be double-counting + * (at least in the common case where heap_page_prune() just + * freed up a non-HOT tuple). + */ + if (ItemIdIsDead(itemid)) + { + vacpage->offsets[vacpage->offsets_free++] = offnum; + continue; + } + + /* Shouldn't have any redirected items anymore */ + if (!ItemIdIsNormal(itemid)) + elog(ERROR, "relation \"%s\" TID %u/%u: unexpected redirect item", + relname, blkno, offnum); + tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); tuple.t_len = ItemIdGetLength(itemid); ItemPointerSet(&(tuple.t_self), blkno, offnum); @@ -1410,12 +1451,45 @@ scan_heap(VRelStats *vacrelstats, Relation onerel, } break; case HEAPTUPLE_DEAD: - tupgone = true; /* we can delete the tuple */ /* - * We need not require XMIN_COMMITTED or XMAX_COMMITTED to - * be set, since we will remove the tuple without any - * further examination of its hint bits. + * Ordinarily, DEAD tuples would have been removed by + * heap_page_prune(), but it's possible that the tuple + * state changed since heap_page_prune() looked. In + * particular an INSERT_IN_PROGRESS tuple could have + * changed to DEAD if the inserter aborted. So this + * cannot be considered an error condition, though it + * does suggest that someone released a lock early. + * + * If the tuple is HOT-updated then it must only be + * removed by a prune operation; so we keep it as if it + * were RECENTLY_DEAD, and abandon shrinking. (XXX is it + * worth trying to make the shrinking code smart enough + * to handle this? It's an unusual corner case.) + * + * DEAD heap-only tuples can safely be removed if they + * aren't themselves HOT-updated, although this is a bit + * inefficient since we'll uselessly try to remove + * index entries for them. 
*/ + if (HeapTupleIsHotUpdated(&tuple)) + { + nkeep += 1; + if (do_shrinking) + ereport(LOG, + (errmsg("relation \"%s\" TID %u/%u: dead HOT-updated tuple --- cannot shrink relation", + relname, blkno, offnum))); + do_shrinking = false; + } + else + { + tupgone = true; /* we can delete the tuple */ + /* + * We need not require XMIN_COMMITTED or + * XMAX_COMMITTED to be set, since we will remove the + * tuple without any further examination of its hint + * bits. + */ + } break; case HEAPTUPLE_RECENTLY_DEAD: @@ -1530,6 +1604,8 @@ scan_heap(VRelStats *vacrelstats, Relation onerel, else { num_tuples += 1; + if (!HeapTupleIsHeapOnly(&tuple)) + num_indexed_tuples += 1; notup = false; if (tuple.t_len < min_tlen) min_tlen = tuple.t_len; @@ -1549,7 +1625,7 @@ scan_heap(VRelStats *vacrelstats, Relation onerel, if (tempPage != NULL) { /* Some tuples are removable; figure free space after removal */ - PageRepairFragmentation(tempPage, NULL); + PageRepairFragmentation(tempPage); vacpage->free = PageGetFreeSpaceWithFillFactor(onerel, tempPage); pfree(tempPage); do_reap = true; @@ -1558,7 +1634,7 @@ scan_heap(VRelStats *vacrelstats, Relation onerel, { /* Just use current available space */ vacpage->free = PageGetFreeSpaceWithFillFactor(onerel, page); - /* Need to reap the page if it has LP_UNUSED line pointers */ + /* Need to reap the page if it has UNUSED or DEAD line pointers */ do_reap = (vacpage->offsets_free > 0); } @@ -1621,6 +1697,7 @@ scan_heap(VRelStats *vacrelstats, Relation onerel, /* save stats in the rel list for use later */ vacrelstats->rel_tuples = num_tuples; + vacrelstats->rel_indexed_tuples = num_indexed_tuples; vacrelstats->rel_pages = nblocks; if (num_tuples == 0) min_tlen = max_tlen = 0; @@ -1720,6 +1797,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, num_fraged_pages, vacuumed_pages; int keep_tuples = 0; + int keep_indexed_tuples = 0; PGRUsage ru0; pg_rusage_init(&ru0); @@ -1845,6 +1923,16 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, if (!ItemIdIsUsed(itemid)) continue; + if (ItemIdIsDead(itemid)) + { + /* just remember it for vacuum_page() */ + vacpage->offsets[vacpage->offsets_free++] = offnum; + continue; + } + + /* Shouldn't have any redirected items now */ + Assert(ItemIdIsNormal(itemid)); + tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); tuple_len = tuple.t_len = ItemIdGetLength(itemid); ItemPointerSet(&(tuple.t_self), blkno, offnum); @@ -1906,12 +1994,28 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, if (i >= vacpage->offsets_free) /* not found */ { vacpage->offsets[vacpage->offsets_free++] = offnum; + /* + * If this is not a heap-only tuple, there must be an + * index entry for this item which will be removed in + * the index cleanup. Decrement the keep_indexed_tuples + * count to remember this. + */ + if (!HeapTupleHeaderIsHeapOnly(tuple.t_data)) + keep_indexed_tuples--; keep_tuples--; } } else { vacpage->offsets[vacpage->offsets_free++] = offnum; + /* + * If this is not a heap-only tuple, there must be an + * index entry for this item which will be removed in + * the index cleanup. Decrement the keep_indexed_tuples + * count to remember this. 
+ */ + if (!HeapTupleHeaderIsHeapOnly(tuple.t_data)) + keep_indexed_tuples--; keep_tuples--; } continue; @@ -2028,7 +2132,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, break; } nextItemid = PageGetItemId(nextPage, nextOffnum); - if (!ItemIdIsUsed(nextItemid)) + if (!ItemIdIsNormal(nextItemid)) { ReleaseBuffer(nextBuf); break; @@ -2166,7 +2270,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, Pitemid = PageGetItemId(Ppage, ItemPointerGetOffsetNumber(&(tp.t_self))); /* this can't happen since we saw tuple earlier: */ - if (!ItemIdIsUsed(Pitemid)) + if (!ItemIdIsNormal(Pitemid)) elog(ERROR, "parent itemid marked as unused"); PTdata = (HeapTupleHeader) PageGetItem(Ppage, Pitemid); @@ -2268,6 +2372,14 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, dst_buffer, dst_page, destvacpage, &ec, &Ctid, vtmove[ti].cleanVpd); + /* + * If the tuple we are moving is a heap-only tuple, + * this move will generate an additional index entry, + * so increment the rel_indexed_tuples count. + */ + if (HeapTupleHeaderIsHeapOnly(tuple.t_data)) + vacrelstats->rel_indexed_tuples++; + num_moved++; if (destvacpage->blkno > last_move_dest_block) last_move_dest_block = destvacpage->blkno; @@ -2280,7 +2392,31 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, vacpage->offsets[vacpage->offsets_free++] = ItemPointerGetOffsetNumber(&(tuple.t_self)); else + { + /* + * When we move tuple chains, we may need to move + * tuples from a block that we haven't yet scanned in + * the outer walk-along-the-relation loop. Note that we + * can't be moving a tuple from a block that we have + * already scanned because if such a tuple exists, then + * we must have moved the chain along with that tuple + * when we scanned that block. IOW the test of + * (Cbuf != buf) guarantees that the tuple we are + * looking at right now is in a block which is yet to + * be scanned. + * + * We maintain two counters to correctly count the + * moved-off tuples from blocks that are not yet + * scanned (keep_tuples) and how many of them have + * index pointers (keep_indexed_tuples). The main + * reason to track the latter is to help verify + * that indexes have the expected number of entries + * when all the dust settles. + */ + if (!HeapTupleHeaderIsHeapOnly(tuple.t_data)) + keep_indexed_tuples++; keep_tuples++; + } ReleaseBuffer(dst_buffer); ReleaseBuffer(Cbuf); @@ -2328,6 +2464,14 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, move_plain_tuple(onerel, buf, page, &tuple, dst_buffer, dst_page, dst_vacpage, &ec); + /* + * If the tuple we are moving is a heap-only tuple, + * this move will generate an additional index entry, + * so increment the rel_indexed_tuples count. + */ + if (HeapTupleHeaderIsHeapOnly(tuple.t_data)) + vacrelstats->rel_indexed_tuples++; + num_moved++; if (dst_vacpage->blkno > last_move_dest_block) last_move_dest_block = dst_vacpage->blkno; @@ -2361,6 +2505,9 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, if (!ItemIdIsUsed(itemid)) continue; + /* Shouldn't be any DEAD or REDIRECT items anymore */ + Assert(ItemIdIsNormal(itemid)); + htup = (HeapTupleHeader) PageGetItem(page, itemid); if (htup->t_infomask & HEAP_XMIN_COMMITTED) continue; @@ -2389,6 +2536,14 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, { vacpage->offsets[vacpage->offsets_free++] = off; Assert(keep_tuples > 0); + /* + * If this is not a heap-only tuple, there must be an + * index entry for this item which will be removed in + * the index cleanup. Decrement the keep_indexed_tuples + * count to remember this. 
+ */ + if (!HeapTupleHeaderIsHeapOnly(htup)) + keep_indexed_tuples--; keep_tuples--; } } @@ -2396,6 +2551,8 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, { vacpage->offsets[vacpage->offsets_free++] = off; Assert(keep_tuples > 0); + if (!HeapTupleHeaderIsHeapOnly(htup)) + keep_indexed_tuples--; keep_tuples--; } } @@ -2529,11 +2686,14 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, * page during chain moves but not been scanned over subsequently. * The tuple ids of these tuples are not recorded as free offsets * for any VacPage, so they will not be cleared from the indexes. + * keep_indexed_tuples is the portion of these that are expected + * to have index entries. */ Assert(keep_tuples >= 0); for (i = 0; i < nindexes; i++) vacuum_index(&Nvacpagelist, Irel[i], - vacrelstats->rel_tuples, keep_tuples); + vacrelstats->rel_indexed_tuples, + keep_indexed_tuples); } /* @@ -2551,7 +2711,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, OffsetNumber unused[MaxOffsetNumber]; OffsetNumber offnum, maxoff; - int uncnt; + int uncnt = 0; int num_tuples = 0; buf = ReadBufferWithStrategy(onerel, vacpage->blkno, vac_strategy); @@ -2567,6 +2727,9 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, if (!ItemIdIsUsed(itemid)) continue; + /* Shouldn't be any DEAD or REDIRECT items anymore */ + Assert(ItemIdIsNormal(itemid)); + htup = (HeapTupleHeader) PageGetItem(page, itemid); if (htup->t_infomask & HEAP_XMIN_COMMITTED) continue; @@ -2584,12 +2747,14 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, ItemIdSetUnused(itemid); num_tuples++; + + unused[uncnt++] = offnum; } Assert(vacpage->offsets_free == num_tuples); START_CRIT_SECTION(); - uncnt = PageRepairFragmentation(page, unused); + PageRepairFragmentation(page); MarkBufferDirty(buf); @@ -2598,7 +2763,10 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, { XLogRecPtr recptr; - recptr = log_heap_clean(onerel, buf, unused, uncnt); + recptr = log_heap_clean(onerel, buf, + NULL, 0, NULL, 0, + unused, uncnt, + false); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); } @@ -2706,15 +2874,17 @@ move_chain_tuple(Relation rel, /* * Update the state of the copied tuple, and store it on the destination - * page. + * page. The copied tuple is never part of a HOT chain. */ newtup.t_data->t_infomask &= ~(HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID | HEAP_MOVED_OFF); newtup.t_data->t_infomask |= HEAP_MOVED_IN; + HeapTupleHeaderClearHotUpdated(newtup.t_data); + HeapTupleHeaderClearHeapOnly(newtup.t_data); HeapTupleHeaderSetXvac(newtup.t_data, myXID); newoff = PageAddItem(dst_page, (Item) newtup.t_data, tuple_len, - InvalidOffsetNumber, false); + InvalidOffsetNumber, false, true); if (newoff == InvalidOffsetNumber) elog(PANIC, "failed to add item with len = %lu to page %u while moving tuple chain", (unsigned long) tuple_len, dst_vacpage->blkno); @@ -2809,17 +2979,19 @@ move_plain_tuple(Relation rel, START_CRIT_SECTION(); /* - * Mark new tuple as MOVED_IN by me. + * Mark new tuple as MOVED_IN by me; also mark it not HOT. 
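/*
 * Editorial note (not patch text) on the widened PageAddItem() call used
 * throughout this commit: the fifth argument is the pre-existing overwrite
 * flag, and the new sixth argument appears to say whether the target is a
 * heap page, so the routine can enforce MaxHeapTuplesPerPage; that reading
 * and the parameter name are assumptions, since only call sites are visible
 * here.  Index and WAL-replay callers pass false, while the tuple-moving
 * code below passes true:
 */
newoff = PageAddItem(dst_page, (Item) newtup.t_data, tuple_len,
					 InvalidOffsetNumber,
					 false,		/* don't overwrite an existing item */
					 true);		/* assumed: target is a heap page */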
*/ newtup.t_data->t_infomask &= ~(HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID | HEAP_MOVED_OFF); newtup.t_data->t_infomask |= HEAP_MOVED_IN; + HeapTupleHeaderClearHotUpdated(newtup.t_data); + HeapTupleHeaderClearHeapOnly(newtup.t_data); HeapTupleHeaderSetXvac(newtup.t_data, myXID); /* add tuple to the page */ newoff = PageAddItem(dst_page, (Item) newtup.t_data, tuple_len, - InvalidOffsetNumber, false); + InvalidOffsetNumber, false, true); if (newoff == InvalidOffsetNumber) elog(PANIC, "failed to add item with len = %lu to page %u (free space %lu, nusd %u, noff %u)", (unsigned long) tuple_len, @@ -2934,6 +3106,9 @@ update_hint_bits(Relation rel, VacPageList fraged_pages, int num_fraged_pages, if (!ItemIdIsUsed(itemid)) continue; + /* Shouldn't be any DEAD or REDIRECT items anymore */ + Assert(ItemIdIsNormal(itemid)); + htup = (HeapTupleHeader) PageGetItem(page, itemid); if (htup->t_infomask & HEAP_XMIN_COMMITTED) continue; @@ -3019,10 +3194,7 @@ vacuum_heap(VRelStats *vacrelstats, Relation onerel, VacPageList vacuum_pages) static void vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage) { - OffsetNumber unused[MaxOffsetNumber]; - int uncnt; Page page = BufferGetPage(buffer); - ItemId itemid; int i; /* There shouldn't be any tuples moved onto the page yet! */ @@ -3032,11 +3204,12 @@ vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage) for (i = 0; i < vacpage->offsets_free; i++) { - itemid = PageGetItemId(page, vacpage->offsets[i]); + ItemId itemid = PageGetItemId(page, vacpage->offsets[i]); + ItemIdSetUnused(itemid); } - uncnt = PageRepairFragmentation(page, unused); + PageRepairFragmentation(page); MarkBufferDirty(buffer); @@ -3045,7 +3218,10 @@ vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage) { XLogRecPtr recptr; - recptr = log_heap_clean(onerel, buffer, unused, uncnt); + recptr = log_heap_clean(onerel, buffer, + NULL, 0, NULL, 0, + vacpage->offsets, vacpage->offsets_free, + false); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); } @@ -3527,8 +3703,7 @@ enough_space(VacPage vacpage, Size len) static Size PageGetFreeSpaceWithFillFactor(Relation relation, Page page) { - PageHeader pd = (PageHeader) page; - Size freespace = pd->pd_upper - pd->pd_lower; + Size freespace = PageGetHeapFreeSpace(page); Size targetfree; targetfree = RelationGetTargetPageFreeSpace(relation, diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c index 3faf172acbf..b9050719cb4 100644 --- a/src/backend/commands/vacuumlazy.c +++ b/src/backend/commands/vacuumlazy.c @@ -36,7 +36,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/commands/vacuumlazy.c,v 1.96 2007/09/16 02:37:46 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/commands/vacuumlazy.c,v 1.97 2007/09/20 17:56:31 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -326,8 +326,8 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, buf = ReadBufferWithStrategy(onerel, blkno, vac_strategy); - /* Initially, we only need shared access to the buffer */ - LockBuffer(buf, BUFFER_LOCK_SHARE); + /* We need buffer cleanup lock so that we can prune HOT chains. */ + LockBufferForCleanup(buf); page = BufferGetPage(buf); @@ -341,11 +341,10 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, * We have to be careful here because we could be looking at a * page that someone has just added to the relation and not yet * been able to initialize (see RelationGetBufferForTuple). 
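/*
 * Editorial note (not patch text): lazy VACUUM's free-space accounting now
 * calls PageGetHeapFreeSpace() instead of PageGetFreeSpace().  The
 * heap-specific variant is understood to also treat a page as full once no
 * further line pointers can be added (MaxHeapTuplesPerPage) -- a behavior
 * assumed here, not shown in this diff -- which matters now that pruning can
 * leave extra redirect and dead line pointers behind.  Hypothetical usage:
 */
Size		freespace = PageGetHeapFreeSpace(page);

lazy_record_free_space(vacrelstats, blkno, freespace);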
To - * interlock against that, release the buffer read lock (which we - * must do anyway) and grab the relation extension lock before - * re-locking in exclusive mode. If the page is still - * uninitialized by then, it must be left over from a crashed - * backend, and we can initialize it. + * protect against that, release the buffer lock, grab the + * relation extension lock momentarily, and re-lock the buffer. + * If the page is still uninitialized by then, it must be left + * over from a crashed backend, and we can initialize it. * * We don't really need the relation lock when this is a new or * temp relation, but it's probably not worth the code space to @@ -357,7 +356,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, LockBuffer(buf, BUFFER_LOCK_UNLOCK); LockRelationForExtension(onerel, ExclusiveLock); UnlockRelationForExtension(onerel, ExclusiveLock); - LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + LockBufferForCleanup(buf); if (PageIsNew(page)) { ereport(WARNING, @@ -366,7 +365,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, PageInit(page, BufferGetPageSize(buf), 0); empty_pages++; lazy_record_free_space(vacrelstats, blkno, - PageGetFreeSpace(page)); + PageGetHeapFreeSpace(page)); } MarkBufferDirty(buf); UnlockReleaseBuffer(buf); @@ -377,11 +376,23 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, { empty_pages++; lazy_record_free_space(vacrelstats, blkno, - PageGetFreeSpace(page)); + PageGetHeapFreeSpace(page)); UnlockReleaseBuffer(buf); continue; } + /* + * Prune all HOT-update chains in this page. + * + * We count tuples removed by the pruning step as removed by VACUUM. + */ + tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin, + false, false); + + /* + * Now scan the page to collect vacuumable items and check for + * tuples requiring freezing. + */ nfrozen = 0; hastup = false; prev_dead_count = vacrelstats->num_dead_tuples; @@ -394,22 +405,64 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, itemid = PageGetItemId(page, offnum); + /* Unused items require no processing, but we count 'em */ if (!ItemIdIsUsed(itemid)) { nunused += 1; continue; } + /* Redirect items mustn't be touched */ + if (ItemIdIsRedirected(itemid)) + { + hastup = true; /* this page won't be truncatable */ + continue; + } + + ItemPointerSet(&(tuple.t_self), blkno, offnum); + + /* + * DEAD item pointers are to be vacuumed normally; but we don't + * count them in tups_vacuumed, else we'd be double-counting + * (at least in the common case where heap_page_prune() just + * freed up a non-HOT tuple). + */ + if (ItemIdIsDead(itemid)) + { + lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); + continue; + } + + Assert(ItemIdIsNormal(itemid)); + tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); tuple.t_len = ItemIdGetLength(itemid); - ItemPointerSet(&(tuple.t_self), blkno, offnum); tupgone = false; switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin, buf)) { case HEAPTUPLE_DEAD: - tupgone = true; /* we can delete the tuple */ + /* + * Ordinarily, DEAD tuples would have been removed by + * heap_page_prune(), but it's possible that the tuple + * state changed since heap_page_prune() looked. In + * particular an INSERT_IN_PROGRESS tuple could have + * changed to DEAD if the inserter aborted. So this + * cannot be considered an error condition. + * + * If the tuple is HOT-updated then it must only be + * removed by a prune operation; so we keep it just as + * if it were RECENTLY_DEAD. 
Also, if it's a heap-only + * tuple, we choose to keep it, because it'll be a + * lot cheaper to get rid of it in the next pruning pass + * than to treat it like an indexed tuple. + */ + if (HeapTupleIsHotUpdated(&tuple) || + HeapTupleIsHeapOnly(&tuple)) + nkeep += 1; + else + tupgone = true; /* we can delete the tuple */ break; case HEAPTUPLE_LIVE: /* Tuple is good --- but let's do some validity checks */ @@ -449,11 +502,10 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, /* * Each non-removable tuple must be checked to see if it - * needs freezing. If we already froze anything, then - * we've already switched the buffer lock to exclusive. + * needs freezing. Note we already have exclusive buffer lock. */ if (heap_freeze_tuple(tuple.t_data, FreezeLimit, - (nfrozen > 0) ? InvalidBuffer : buf)) + InvalidBuffer)) frozen[nfrozen++] = offnum; } } /* scan along page */ @@ -485,9 +537,6 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, if (nindexes == 0 && vacrelstats->num_dead_tuples > 0) { - /* Trade in buffer share lock for super-exclusive lock */ - LockBuffer(buf, BUFFER_LOCK_UNLOCK); - LockBufferForCleanup(buf); /* Remove tuples from heap */ lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats); /* Forget the now-vacuumed tuples, and press on */ @@ -505,7 +554,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, if (vacrelstats->num_dead_tuples == prev_dead_count) { lazy_record_free_space(vacrelstats, blkno, - PageGetFreeSpace(page)); + PageGetHeapFreeSpace(page)); } /* Remember the location of the last page with nonremovable tuples */ @@ -598,7 +647,7 @@ lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats) /* Now that we've compacted the page, record its available space */ page = BufferGetPage(buf); lazy_record_free_space(vacrelstats, tblk, - PageGetFreeSpace(page)); + PageGetHeapFreeSpace(page)); UnlockReleaseBuffer(buf); npages++; } @@ -615,7 +664,7 @@ lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats) * lazy_vacuum_page() -- free dead tuples on a page * and repair its fragmentation. * - * Caller must hold pin and lock on the buffer. + * Caller must hold pin and buffer cleanup lock on the buffer. * * tupindex is the index in vacrelstats->dead_tuples of the first dead * tuple for this page. We assume the rest follow sequentially. 
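/*
 * Editorial sketch (not patch text) of the reshaped log_heap_clean() call
 * the vacuum paths now make.  The three array/count pairs are read here as
 * describing, in order, redirected item pairs, items marked dead, and items
 * marked unused, with the trailing bool flagging a redirect_move prune;
 * these meanings are inferred from the call sites, not stated in this diff.
 * The vacuum paths only have now-unused items to report:
 */
recptr = log_heap_clean(onerel, buffer,
						NULL, 0,		/* no redirected items */
						NULL, 0,		/* no newly dead items */
						unused, uncnt,	/* items just set unused */
						false);			/* assumed: not a redirect_move prune */
PageSetLSN(page, recptr);
PageSetTLI(page, ThisTimeLineID);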
@@ -625,10 +674,9 @@ static int lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer, int tupindex, LVRelStats *vacrelstats) { - OffsetNumber unused[MaxOffsetNumber]; - int uncnt; Page page = BufferGetPage(buffer); - ItemId itemid; + OffsetNumber unused[MaxOffsetNumber]; + int uncnt = 0; START_CRIT_SECTION(); @@ -636,6 +684,7 @@ lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer, { BlockNumber tblk; OffsetNumber toff; + ItemId itemid; tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]); if (tblk != blkno) @@ -643,9 +692,10 @@ lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer, toff = ItemPointerGetOffsetNumber(&vacrelstats->dead_tuples[tupindex]); itemid = PageGetItemId(page, toff); ItemIdSetUnused(itemid); + unused[uncnt++] = toff; } - uncnt = PageRepairFragmentation(page, unused); + PageRepairFragmentation(page); MarkBufferDirty(buffer); @@ -654,7 +704,10 @@ lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer, { XLogRecPtr recptr; - recptr = log_heap_clean(onerel, buffer, unused, uncnt); + recptr = log_heap_clean(onerel, buffer, + NULL, 0, NULL, 0, + unused, uncnt, + false); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); } @@ -980,7 +1033,7 @@ lazy_record_dead_tuple(LVRelStats *vacrelstats, /* * The array shouldn't overflow under normal behavior, but perhaps it * could if we are given a really small maintenance_work_mem. In that - * case, just forget the last few tuples. + * case, just forget the last few tuples (we'll get 'em next time). */ if (vacrelstats->num_dead_tuples < vacrelstats->max_dead_tuples) { diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 7e5873b89df..485f6ddc1ee 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -26,7 +26,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/executor/execMain.c,v 1.297 2007/09/07 20:59:26 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/executor/execMain.c,v 1.298 2007/09/20 17:56:31 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -1813,8 +1813,10 @@ lreplace:; * * Note: heap_update returns the tid (location) of the new tuple in the * t_self field. + * + * If it's a HOT update, we mustn't insert new index entries. */ - if (resultRelInfo->ri_NumIndices > 0) + if (resultRelInfo->ri_NumIndices > 0 && !HeapTupleIsHeapOnly(tuple)) ExecInsertIndexTuples(slot, &(tuple->t_self), estate, false); /* AFTER ROW UPDATE Triggers */ diff --git a/src/backend/executor/execUtils.c b/src/backend/executor/execUtils.c index 1d478062998..790a9dccc10 100644 --- a/src/backend/executor/execUtils.c +++ b/src/backend/executor/execUtils.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/executor/execUtils.c,v 1.150 2007/08/15 21:39:50 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/executor/execUtils.c,v 1.151 2007/09/20 17:56:31 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -981,6 +981,10 @@ ExecCloseIndices(ResultRelInfo *resultRelInfo) * stuff as it only exists here because the genam stuff * doesn't provide the functionality needed by the * executor.. -cim 9/27/89 + * + * CAUTION: this must not be called for a HOT update. + * We can't defend against that here for lack of info. + * Should we change the API to make it safer? 
* ---------------------------------------------------------------- */ void @@ -1029,6 +1033,10 @@ ExecInsertIndexTuples(TupleTableSlot *slot, indexInfo = indexInfoArray[i]; + /* If the index is marked as read-only, ignore it */ + if (!indexInfo->ii_ReadyForInserts) + continue; + /* Check for partial index */ if (indexInfo->ii_Predicate != NIL) { diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c index f1e30aeb8f0..87e0063a03a 100644 --- a/src/backend/executor/nodeBitmapHeapscan.c +++ b/src/backend/executor/nodeBitmapHeapscan.c @@ -21,7 +21,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/executor/nodeBitmapHeapscan.c,v 1.19 2007/09/12 22:10:26 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/executor/nodeBitmapHeapscan.c,v 1.20 2007/09/20 17:56:31 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -240,12 +240,7 @@ bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres) BlockNumber page = tbmres->blockno; Buffer buffer; Snapshot snapshot; - Page dp; int ntup; - int curslot; - int minslot; - int maxslot; - int maxoff; /* * Acquire pin on the target heap page, trading in any pin we held before. @@ -258,6 +253,13 @@ bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres) buffer = scan->rs_cbuf; snapshot = scan->rs_snapshot; + ntup = 0; + + /* + * Prune and repair fragmentation for the whole page, if possible. + */ + heap_page_prune_opt(scan->rs_rd, buffer, RecentGlobalXmin); + /* * We must hold share lock on the buffer content while examining tuple * visibility. Afterwards, however, the tuples we have found to be @@ -265,71 +267,51 @@ bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres) */ LockBuffer(buffer, BUFFER_LOCK_SHARE); - dp = (Page) BufferGetPage(buffer); - maxoff = PageGetMaxOffsetNumber(dp); - /* - * Determine how many entries we need to look at on this page. If the - * bitmap is lossy then we need to look at each physical item pointer; - * otherwise we just look through the offsets listed in tbmres. + * We need two separate strategies for lossy and non-lossy cases. */ if (tbmres->ntuples >= 0) { - /* non-lossy case */ - minslot = 0; - maxslot = tbmres->ntuples - 1; - } - else - { - /* lossy case */ - minslot = FirstOffsetNumber; - maxslot = maxoff; - } - - ntup = 0; - for (curslot = minslot; curslot <= maxslot; curslot++) - { - OffsetNumber targoffset; - ItemId lp; - HeapTupleData loctup; - bool valid; - - if (tbmres->ntuples >= 0) - { - /* non-lossy case */ - targoffset = tbmres->offsets[curslot]; - } - else - { - /* lossy case */ - targoffset = (OffsetNumber) curslot; - } - /* - * We'd better check for out-of-range offnum in case of VACUUM since - * the TID was obtained. + * Bitmap is non-lossy, so we just look through the offsets listed in + * tbmres; but we have to follow any HOT chain starting at each such + * offset. */ - if (targoffset < FirstOffsetNumber || targoffset > maxoff) - continue; + int curslot; - lp = PageGetItemId(dp, targoffset); + for (curslot = 0; curslot < tbmres->ntuples; curslot++) + { + OffsetNumber offnum = tbmres->offsets[curslot]; + ItemPointerData tid; + ItemPointerSet(&tid, page, offnum); + if (heap_hot_search_buffer(&tid, buffer, snapshot, NULL)) + scan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid); + } + } + else + { /* - * Must check for deleted tuple. + * Bitmap is lossy, so we must examine each item pointer on the page. + * But we can ignore HOT chains, since we'll check each tuple anyway. 
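/*
 * Editorial note (not patch text) on heap_hot_search_buffer() as used in the
 * non-lossy bitmap path above: given a candidate root TID and a pinned,
 * share-locked buffer, it follows the HOT chain to the member visible under
 * the snapshot, updating the TID in place and returning true if one is
 * found.  The final NULL argument is taken to be an optional output
 * reporting whether the whole chain is dead; that reading is an assumption,
 * since only the call site appears in this diff.
 */
ItemPointerData tid;

ItemPointerSet(&tid, page, offnum);
if (heap_hot_search_buffer(&tid, buffer, snapshot, NULL))
	scan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid);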
*/ - if (!ItemIdIsNormal(lp)) - continue; + Page dp = (Page) BufferGetPage(buffer); + OffsetNumber maxoff = PageGetMaxOffsetNumber(dp); + OffsetNumber offnum; - /* - * check time qualification of tuple, remember it if valid - */ - loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp); - loctup.t_len = ItemIdGetLength(lp); - ItemPointerSet(&(loctup.t_self), page, targoffset); + for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum++) + { + ItemId lp; + HeapTupleData loctup; - valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer); - if (valid) - scan->rs_vistuples[ntup++] = targoffset; + lp = PageGetItemId(dp, offnum); + if (!ItemIdIsNormal(lp)) + continue; + loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp); + loctup.t_len = ItemIdGetLength(lp); + if (HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer)) + scan->rs_vistuples[ntup++] = offnum; + } } LockBuffer(buffer, BUFFER_LOCK_UNLOCK); diff --git a/src/backend/executor/spi.c b/src/backend/executor/spi.c index af94ad1a3b8..875e4da2914 100644 --- a/src/backend/executor/spi.c +++ b/src/backend/executor/spi.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/executor/spi.c,v 1.180 2007/08/15 19:15:46 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/executor/spi.c,v 1.181 2007/09/20 17:56:31 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -1407,6 +1407,7 @@ _SPI_prepare_plan(const char *src, SPIPlanPtr plan) plansource->num_params = nargs; plansource->fully_planned = true; plansource->fixed_result = false; + /* no need to set search_path, generation or saved_xmin */ plansource->resultDesc = PlanCacheComputeResultDesc(stmt_list); plansource->plan = cplan; @@ -1973,6 +1974,7 @@ _SPI_copy_plan(SPIPlanPtr plan, MemoryContext parentcxt) newsource->num_params = newplan->nargs; newsource->fully_planned = plansource->fully_planned; newsource->fixed_result = plansource->fixed_result; + /* no need to worry about seach_path, generation or saved_xmin */ if (plansource->resultDesc) newsource->resultDesc = CreateTupleDescCopy(plansource->resultDesc); newsource->plan = newcplan; diff --git a/src/backend/nodes/tidbitmap.c b/src/backend/nodes/tidbitmap.c index 2f3e00d6a26..53e35b01acc 100644 --- a/src/backend/nodes/tidbitmap.c +++ b/src/backend/nodes/tidbitmap.c @@ -23,7 +23,7 @@ * Copyright (c) 2003-2007, PostgreSQL Global Development Group * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/nodes/tidbitmap.c,v 1.12 2007/04/26 23:24:44 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/nodes/tidbitmap.c,v 1.13 2007/09/20 17:56:31 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -32,6 +32,7 @@ #include <limits.h> #include "access/htup.h" +#include "nodes/bitmapset.h" #include "nodes/tidbitmap.h" #include "storage/bufpage.h" #include "utils/hsearch.h" @@ -61,9 +62,7 @@ */ #define PAGES_PER_CHUNK (BLCKSZ / 32) -/* The bitmap unit size can be adjusted by changing these declarations: */ -#define BITS_PER_BITMAPWORD 32 -typedef uint32 bitmapword; /* must be an unsigned type */ +/* We use BITS_PER_BITMAPWORD and typedef bitmapword from nodes/bitmapset.h */ #define WORDNUM(x) ((x) / BITS_PER_BITMAPWORD) #define BITNUM(x) ((x) % BITS_PER_BITMAPWORD) diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index e2396d42ca6..e36ba97f6b8 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: 
pgsql/src/backend/optimizer/plan/planner.c,v 1.221 2007/05/26 18:23:01 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/optimizer/plan/planner.c,v 1.222 2007/09/20 17:56:31 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -134,6 +134,7 @@ standard_planner(Query *parse, int cursorOptions, ParamListInfo boundParams) glob->subrtables = NIL; glob->rewindPlanIDs = NULL; glob->finalrtable = NIL; + glob->transientPlan = false; /* Determine what fraction of the plan is likely to be scanned */ if (cursorOptions & CURSOR_OPT_FAST_PLAN) @@ -183,6 +184,7 @@ standard_planner(Query *parse, int cursorOptions, ParamListInfo boundParams) result->commandType = parse->commandType; result->canSetTag = parse->canSetTag; + result->transientPlan = glob->transientPlan; result->planTree = top_plan; result->rtable = glob->finalrtable; result->resultRelations = root->resultRelations; diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index 70b3d7d43f5..21dd342593a 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -9,7 +9,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/optimizer/util/plancat.c,v 1.136 2007/05/31 16:57:34 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/optimizer/util/plancat.c,v 1.137 2007/09/20 17:56:31 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -19,6 +19,7 @@ #include "access/genam.h" #include "access/heapam.h" +#include "access/transam.h" #include "catalog/pg_inherits.h" #include "nodes/makefuncs.h" #include "optimizer/clauses.h" @@ -164,6 +165,20 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, continue; } + /* + * If the index is valid, but cannot yet be used, ignore it; + * but mark the plan we are generating as transient. + * See src/backend/access/heap/README.HOT for discussion. + */ + if (index->indcheckxmin && + !TransactionIdPrecedes(HeapTupleHeaderGetXmin(indexRelation->rd_indextuple->t_data), + TransactionXmin)) + { + root->glob->transientPlan = true; + index_close(indexRelation, NoLock); + continue; + } + info = makeNode(IndexOptInfo); info->indexoid = index->indexrelid; diff --git a/src/backend/optimizer/util/var.c b/src/backend/optimizer/util/var.c index c501c827922..efb1ad9343d 100644 --- a/src/backend/optimizer/util/var.c +++ b/src/backend/optimizer/util/var.c @@ -8,12 +8,13 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/optimizer/util/var.c,v 1.70 2007/06/11 01:16:23 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/optimizer/util/var.c,v 1.71 2007/09/20 17:56:31 tgl Exp $ * *------------------------------------------------------------------------- */ #include "postgres.h" +#include "access/htup.h" #include "optimizer/clauses.h" #include "optimizer/prep.h" #include "optimizer/var.h" @@ -54,6 +55,7 @@ typedef struct static bool pull_varnos_walker(Node *node, pull_varnos_context *context); +static bool pull_varattnos_walker(Node *node, Bitmapset **varattnos); static bool contain_var_reference_walker(Node *node, contain_var_reference_context *context); static bool contain_var_clause_walker(Node *node, void *context); @@ -134,6 +136,47 @@ pull_varnos_walker(Node *node, pull_varnos_context *context) (void *) context); } +/* + * pull_varattnos + * Find all the distinct attribute numbers present in an expression tree, + * and add them to the initial contents of *varattnos. + * Only Vars that reference RTE 1 of rtable level zero are considered. 
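A minimal sketch of the new indcheckxmin test in get_relation_info(): an index whose pg_index row is not yet old enough is left out of the plan, and the plan is flagged transient so it gets rebuilt once TransactionXmin advances. TransactionId comparison is reduced to plain integers here, ignoring wraparound, and the struct is a stand-in.

    #include <stdbool.h>
    #include <stdio.h>

    typedef unsigned int TransactionId;

    struct toy_index
    {
        bool          indcheckxmin;   /* must we wait for xmin to be old? */
        TransactionId xmin;           /* xmin of the pg_index row */
    };

    int main(void)
    {
        TransactionId TransactionXmin = 100;   /* illustrative horizon */
        struct toy_index idx = { true, 150 };  /* index created "recently" */
        bool transientPlan = false;
        bool usable = true;

        if (idx.indcheckxmin && !(idx.xmin < TransactionXmin))
        {
            transientPlan = true;   /* replan later, when the horizon moves */
            usable = false;         /* don't use this index now */
        }

        printf("index usable: %d, plan transient: %d\n", usable, transientPlan);
        return 0;
    }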
+ * + * Attribute numbers are offset by FirstLowInvalidHeapAttributeNumber so that + * we can include system attributes (e.g., OID) in the bitmap representation. + * + * Currently, this does not support subqueries nor expressions containing + * references to multiple tables; not needed since it's only applied to + * index expressions and predicates. + */ +void +pull_varattnos(Node *node, Bitmapset **varattnos) +{ + (void) pull_varattnos_walker(node, varattnos); +} + +static bool +pull_varattnos_walker(Node *node, Bitmapset **varattnos) +{ + if (node == NULL) + return false; + if (IsA(node, Var)) + { + Var *var = (Var *) node; + + Assert(var->varno == 1); + *varattnos = bms_add_member(*varattnos, + var->varattno - FirstLowInvalidHeapAttributeNumber); + return false; + } + /* Should not find a subquery or subplan */ + Assert(!IsA(node, Query)); + Assert(!is_subplan(node)); + + return expression_tree_walker(node, pull_varattnos_walker, + (void *) varattnos); +} + /* * contain_var_reference diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 0d59d2e3463..9e088780d4c 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -13,7 +13,7 @@ * * Copyright (c) 2001-2007, PostgreSQL Global Development Group * - * $PostgreSQL: pgsql/src/backend/postmaster/pgstat.c,v 1.163 2007/09/11 03:28:05 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/postmaster/pgstat.c,v 1.164 2007/09/20 17:56:31 tgl Exp $ * ---------- */ #include "postgres.h" @@ -1294,7 +1294,7 @@ pgstat_count_heap_insert(Relation rel) * pgstat_count_heap_update - count a tuple update */ void -pgstat_count_heap_update(Relation rel) +pgstat_count_heap_update(Relation rel, bool hot) { PgStat_TableStatus *pgstat_info = rel->pgstat_info; @@ -1304,6 +1304,9 @@ pgstat_count_heap_update(Relation rel) /* t_tuples_updated is nontransactional, so just advance it */ pgstat_info->t_counts.t_tuples_updated++; + /* ditto for the hot_update counter */ + if (hot) + pgstat_info->t_counts.t_tuples_hot_updated++; /* We have to log the transactional effect at the proper level */ if (pgstat_info->trans == NULL || @@ -1340,6 +1343,23 @@ pgstat_count_heap_delete(Relation rel) } } +/* + * pgstat_update_heap_dead_tuples - update dead-tuples count + * + * The semantics of this are that we are reporting the nontransactional + * recovery of "delta" dead tuples; so t_new_dead_tuples decreases + * rather than increasing, and the change goes straight into the per-table + * counter, not into transactional state. 
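A small standalone illustration of why pull_varattnos() (and RelationGetIndexAttrBitmap() later in this patch) offsets attribute numbers by FirstLowInvalidHeapAttributeNumber: system columns have negative attnos, and a bitmap can only hold non-negative members. The offset value and the single-word bitset below are stand-ins for the real Bitmapset API.

    #include <stdio.h>

    #define FIRST_LOW_INVALID_HEAP_ATTNO (-8)   /* illustrative stand-in */

    int main(void)
    {
        unsigned int bitmap = 0;
        int attnos[] = { -2 /* e.g. oid */, 1, 3 };  /* columns used by an index */

        for (int i = 0; i < 3; i++)
        {
            int member = attnos[i] - FIRST_LOW_INVALID_HEAP_ATTNO;  /* >= 0 */
            bitmap |= 1u << member;
        }

        /* membership test mirrors bms_is_member(attno - offset, bitmap) */
        int probe = 3 - FIRST_LOW_INVALID_HEAP_ATTNO;
        printf("column 3 referenced by an index: %s\n",
               (bitmap & (1u << probe)) ? "yes" : "no");
        return 0;
    }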
+ */ +void +pgstat_update_heap_dead_tuples(Relation rel, int delta) +{ + PgStat_TableStatus *pgstat_info = rel->pgstat_info; + + if (pgstat_collect_tuplelevel && pgstat_info != NULL) + pgstat_info->t_counts.t_new_dead_tuples -= delta; +} + /* ---------- * AtEOXact_PgStat @@ -2901,6 +2921,7 @@ pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len) tabentry->tuples_inserted = tabmsg[i].t_counts.t_tuples_inserted; tabentry->tuples_updated = tabmsg[i].t_counts.t_tuples_updated; tabentry->tuples_deleted = tabmsg[i].t_counts.t_tuples_deleted; + tabentry->tuples_hot_updated = tabmsg[i].t_counts.t_tuples_hot_updated; tabentry->n_live_tuples = tabmsg[i].t_counts.t_new_live_tuples; tabentry->n_dead_tuples = tabmsg[i].t_counts.t_new_dead_tuples; tabentry->blocks_fetched = tabmsg[i].t_counts.t_blocks_fetched; @@ -2923,6 +2944,7 @@ pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len) tabentry->tuples_inserted += tabmsg[i].t_counts.t_tuples_inserted; tabentry->tuples_updated += tabmsg[i].t_counts.t_tuples_updated; tabentry->tuples_deleted += tabmsg[i].t_counts.t_tuples_deleted; + tabentry->tuples_hot_updated += tabmsg[i].t_counts.t_tuples_hot_updated; tabentry->n_live_tuples += tabmsg[i].t_counts.t_new_live_tuples; tabentry->n_dead_tuples += tabmsg[i].t_counts.t_new_dead_tuples; tabentry->blocks_fetched += tabmsg[i].t_counts.t_blocks_fetched; @@ -2931,6 +2953,8 @@ pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len) /* Clamp n_live_tuples in case of negative new_live_tuples */ tabentry->n_live_tuples = Max(tabentry->n_live_tuples, 0); + /* Likewise for n_dead_tuples */ + tabentry->n_dead_tuples = Max(tabentry->n_dead_tuples, 0); /* * Add per-table stats to the per-database entry, too. @@ -3115,6 +3139,7 @@ pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len) else tabentry->vacuum_timestamp = msg->m_vacuumtime; tabentry->n_live_tuples = msg->m_tuples; + /* Resetting dead_tuples to 0 is an approximation ... */ tabentry->n_dead_tuples = 0; if (msg->m_analyze) { diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 12564a69ee4..9c0ef67f6bb 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.223 2007/06/30 19:12:01 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.224 2007/09/20 17:56:31 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -2067,6 +2067,55 @@ LockBufferForCleanup(Buffer buffer) } /* + * ConditionalLockBufferForCleanup - as above, but don't wait to get the lock + * + * We won't loop, but just check once to see if the pin count is OK. If + * not, return FALSE with no lock held. 
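A toy model of the dead-tuple accounting just added: pruning reports a nontransactional recovery of N dead tuples, so the per-table counter is decremented and later clamped to zero when the collector folds the message in (the counts are only approximate). The single variable below stands in for both the backend-side and collector-side counters.

    #include <stdio.h>

    static long clamp_nonnegative(long v)
    {
        return v < 0 ? 0 : v;
    }

    int main(void)
    {
        long n_dead_tuples = 5;

        /* pgstat_update_heap_dead_tuples(rel, 8) in the real code */
        n_dead_tuples -= 8;

        /* the collector clamps when folding tabstat messages into the entry */
        n_dead_tuples = clamp_nonnegative(n_dead_tuples);

        printf("n_dead_tuples = %ld\n", n_dead_tuples);
        return 0;
    }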
+ */ +bool +ConditionalLockBufferForCleanup(Buffer buffer) +{ + volatile BufferDesc *bufHdr; + + Assert(BufferIsValid(buffer)); + + if (BufferIsLocal(buffer)) + { + /* There should be exactly one pin */ + Assert(LocalRefCount[-buffer - 1] > 0); + if (LocalRefCount[-buffer - 1] != 1) + return false; + /* Nobody else to wait for */ + return true; + } + + /* There should be exactly one local pin */ + Assert(PrivateRefCount[buffer - 1] > 0); + if (PrivateRefCount[buffer - 1] != 1) + return false; + + /* Try to acquire lock */ + if (!ConditionalLockBuffer(buffer)) + return false; + + bufHdr = &BufferDescriptors[buffer - 1]; + LockBufHdr(bufHdr); + Assert(bufHdr->refcount > 0); + if (bufHdr->refcount == 1) + { + /* Successfully acquired exclusive lock with pincount 1 */ + UnlockBufHdr(bufHdr); + return true; + } + + /* Failed, so release the lock */ + UnlockBufHdr(bufHdr); + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + return false; +} + + +/* * Functions for buffer I/O handling * * Note: We assume that nested buffer I/O never occurs. diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c index 3ce2f04bd8e..b382e4d0240 100644 --- a/src/backend/storage/page/bufpage.c +++ b/src/backend/storage/page/bufpage.c @@ -8,12 +8,13 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/storage/page/bufpage.c,v 1.73 2007/09/12 22:10:26 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/storage/page/bufpage.c,v 1.74 2007/09/20 17:56:31 tgl Exp $ * *------------------------------------------------------------------------- */ #include "postgres.h" +#include "access/htup.h" #include "storage/bufpage.h" @@ -108,6 +109,9 @@ PageHeaderIsValid(PageHeader page) * If offsetNumber is not valid, then assign one by finding the first * one that is both unused and deallocated. * + * If is_heap is true, we enforce that there can't be more than + * MaxHeapTuplesPerPage line pointers on the page. + * * !!! EREPORT(ERROR) IS DISALLOWED HERE !!! */ OffsetNumber @@ -115,7 +119,8 @@ PageAddItem(Page page, Item item, Size size, OffsetNumber offsetNumber, - bool overwrite) + bool overwrite, + bool is_heap) { PageHeader phdr = (PageHeader) page; Size alignedSize; @@ -200,6 +205,12 @@ PageAddItem(Page page, return InvalidOffsetNumber; } + if (is_heap && offsetNumber > MaxHeapTuplesPerPage) + { + elog(WARNING, "can't put more than MaxHeapTuplesPerPage items in a heap page"); + return InvalidOffsetNumber; + } + /* * Compute new lower and upper pointers for page, see if it'll fit. * @@ -315,11 +326,10 @@ itemoffcompare(const void *itemidp1, const void *itemidp2) * * This routine is usable for heap pages only, but see PageIndexMultiDelete. * - * Returns number of unused line pointers on page. If "unused" is not NULL - * then the unused[] array is filled with indexes of unused line pointers. + * As a side effect, the page's PD_HAS_FREE_LINES hint bit is updated. 
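A sketch of the intended calling pattern for ConditionalLockBufferForCleanup(): try for a cleanup lock without waiting, and simply skip the optional work if anyone else holds a pin. The helper below only models the pin-count rule; the real function also takes the buffer content lock conditionally.

    #include <stdbool.h>
    #include <stdio.h>

    static bool conditional_cleanup_lock(int pincount)
    {
        /* cleanup requires being the sole pin holder */
        return pincount == 1;
    }

    int main(void)
    {
        int pincount = 2;   /* pretend another backend has the page pinned */

        if (conditional_cleanup_lock(pincount))
            printf("got cleanup lock: prune and defragment the page\n");
        else
            printf("page is busy: skip opportunistic cleanup this time\n");
        return 0;
    }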
*/ -int -PageRepairFragmentation(Page page, OffsetNumber *unused) +void +PageRepairFragmentation(Page page) { Offset pd_lower = ((PageHeader) page)->pd_lower; Offset pd_upper = ((PageHeader) page)->pd_upper; @@ -329,7 +339,7 @@ PageRepairFragmentation(Page page, OffsetNumber *unused) ItemId lp; int nline, nstorage, - nused; + nunused; int i; Size totallen; Offset upper; @@ -352,13 +362,12 @@ PageRepairFragmentation(Page page, OffsetNumber *unused) pd_lower, pd_upper, pd_special))); nline = PageGetMaxOffsetNumber(page); - nused = nstorage = 0; - for (i = 0; i < nline; i++) + nunused = nstorage = 0; + for (i = FirstOffsetNumber; i <= nline; i++) { - lp = PageGetItemId(page, i + 1); + lp = PageGetItemId(page, i); if (ItemIdIsUsed(lp)) { - nused++; if (ItemIdHasStorage(lp)) nstorage++; } @@ -366,9 +375,7 @@ PageRepairFragmentation(Page page, OffsetNumber *unused) { /* Unused entries should have lp_len = 0, but make sure */ ItemIdSetUnused(lp); - /* Report to caller if asked for */ - if (unused) - unused[i - nused] = (OffsetNumber) i; + nunused++; } } @@ -431,18 +438,19 @@ PageRepairFragmentation(Page page, OffsetNumber *unused) } /* Set hint bit for PageAddItem */ - if (nused < nline) + if (nunused > 0) PageSetHasFreeLinePointers(page); else PageClearHasFreeLinePointers(page); - - return (nline - nused); } /* * PageGetFreeSpace * Returns the size of the free (allocatable) space on a page, * reduced by the space needed for a new line pointer. + * + * Note: this should usually only be used on index pages. Use + * PageGetHeapFreeSpace on heap pages. */ Size PageGetFreeSpace(Page page) @@ -465,7 +473,8 @@ PageGetFreeSpace(Page page) /* * PageGetExactFreeSpace - * Returns the size of the free (allocatable) space on a page. + * Returns the size of the free (allocatable) space on a page, + * without any consideration for adding/removing line pointers. */ Size PageGetExactFreeSpace(Page page) @@ -484,6 +493,73 @@ PageGetExactFreeSpace(Page page) /* + * PageGetHeapFreeSpace + * Returns the size of the free (allocatable) space on a page, + * reduced by the space needed for a new line pointer. + * + * The difference between this and PageGetFreeSpace is that this will return + * zero if there are already MaxHeapTuplesPerPage line pointers in the page + * and none are free. We use this to enforce that no more than + * MaxHeapTuplesPerPage line pointers are created on a heap page. (Although + * no more tuples than that could fit anyway, in the presence of redirected + * or dead line pointers it'd be possible to have too many line pointers. + * To avoid breaking code that assumes MaxHeapTuplesPerPage is a hard limit + * on the number of line pointers, we make this extra check.) + */ +Size +PageGetHeapFreeSpace(Page page) +{ + Size space; + + space = PageGetFreeSpace(page); + if (space > 0) + { + OffsetNumber offnum, nline; + + /* + * Are there already MaxHeapTuplesPerPage line pointers in the page? + */ + nline = PageGetMaxOffsetNumber(page); + if (nline >= MaxHeapTuplesPerPage) + { + if (PageHasFreeLinePointers((PageHeader) page)) + { + /* + * Since this is just a hint, we must confirm that there is + * indeed a free line pointer + */ + for (offnum = FirstOffsetNumber; offnum <= nline; offnum++) + { + ItemId lp = PageGetItemId(page, offnum); + + if (!ItemIdIsUsed(lp)) + break; + } + + if (offnum > nline) + { + /* + * The hint is wrong, but we can't clear it here since + * we don't have the ability to mark the page dirty. 
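A toy version of the rule PageGetHeapFreeSpace() adds on top of PageGetFreeSpace(): report zero space once a page already carries the maximum number of line pointers and none of them is reusable, even if bytes are free. The limit and sizes are invented; the real code also consults the PD_HAS_FREE_LINES hint before scanning.

    #include <stdio.h>

    #define MAX_HEAP_TUPLES_PER_PAGE 4   /* tiny stand-in for the real limit */

    static unsigned heap_free_space(unsigned freebytes, int nline, int nfree_lp)
    {
        if (freebytes > 0 &&
            nline >= MAX_HEAP_TUPLES_PER_PAGE &&
            nfree_lp == 0)
            return 0;            /* no room for another line pointer */
        return freebytes;
    }

    int main(void)
    {
        printf("%u\n", heap_free_space(512, 4, 0));  /* 0: line-pointer limit hit */
        printf("%u\n", heap_free_space(512, 4, 1));  /* 512: a pointer is reusable */
        return 0;
    }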
+ */ + space = 0; + } + } + else + { + /* + * Although the hint might be wrong, PageAddItem will believe + * it anyway, so we must believe it too. + */ + space = 0; + } + } + } + return space; +} + + +/* * PageIndexTupleDelete * * This routine does the work of removing a tuple from an index page. diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c index 55951cf71b9..954e174bb71 100644 --- a/src/backend/utils/adt/pgstatfuncs.c +++ b/src/backend/utils/adt/pgstatfuncs.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/pgstatfuncs.c,v 1.44 2007/09/11 03:28:05 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/pgstatfuncs.c,v 1.45 2007/09/20 17:56:31 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -28,6 +28,7 @@ extern Datum pg_stat_get_tuples_fetched(PG_FUNCTION_ARGS); extern Datum pg_stat_get_tuples_inserted(PG_FUNCTION_ARGS); extern Datum pg_stat_get_tuples_updated(PG_FUNCTION_ARGS); extern Datum pg_stat_get_tuples_deleted(PG_FUNCTION_ARGS); +extern Datum pg_stat_get_tuples_hot_updated(PG_FUNCTION_ARGS); extern Datum pg_stat_get_live_tuples(PG_FUNCTION_ARGS); extern Datum pg_stat_get_dead_tuples(PG_FUNCTION_ARGS); extern Datum pg_stat_get_blocks_fetched(PG_FUNCTION_ARGS); @@ -170,6 +171,22 @@ pg_stat_get_tuples_deleted(PG_FUNCTION_ARGS) Datum +pg_stat_get_tuples_hot_updated(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + int64 result; + PgStat_StatTabEntry *tabentry; + + if ((tabentry = pgstat_fetch_stat_tabentry(relid)) == NULL) + result = 0; + else + result = (int64) (tabentry->tuples_hot_updated); + + PG_RETURN_INT64(result); +} + + +Datum pg_stat_get_live_tuples(PG_FUNCTION_ARGS) { Oid relid = PG_GETARG_OID(0); diff --git a/src/backend/utils/cache/plancache.c b/src/backend/utils/cache/plancache.c index 21aed6eadbe..43297281f5f 100644 --- a/src/backend/utils/cache/plancache.c +++ b/src/backend/utils/cache/plancache.c @@ -33,13 +33,14 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/cache/plancache.c,v 1.10 2007/06/05 20:00:41 wieck Exp $ + * $PostgreSQL: pgsql/src/backend/utils/cache/plancache.c,v 1.11 2007/09/20 17:56:31 tgl Exp $ * *------------------------------------------------------------------------- */ #include "postgres.h" #include "utils/plancache.h" +#include "access/transam.h" #include "catalog/namespace.h" #include "executor/executor.h" #include "optimizer/clauses.h" @@ -79,6 +80,7 @@ static void ScanQueryForRelids(Query *parsetree, void *arg); static bool ScanQueryWalker(Node *node, ScanQueryWalkerContext *context); static bool rowmark_member(List *rowMarks, int rt_index); +static bool plan_list_is_transient(List *stmt_list); static void PlanCacheCallback(Datum arg, Oid relid); static void InvalRelid(Oid relid, LOCKMODE lockmode, InvalRelidContext *context); @@ -322,6 +324,13 @@ StoreCachedPlan(CachedPlanSource *plansource, plan->stmt_list = stmt_list; plan->fully_planned = plansource->fully_planned; plan->dead = false; + if (plansource->fully_planned && plan_list_is_transient(stmt_list)) + { + Assert(TransactionIdIsNormal(TransactionXmin)); + plan->saved_xmin = TransactionXmin; + } + else + plan->saved_xmin = InvalidTransactionId; plan->refcount = 1; /* for the parent's link */ plan->generation = ++(plansource->generation); plan->context = plan_context; @@ -412,6 +421,15 @@ RevalidateCachedPlan(CachedPlanSource *plansource, bool useResOwner) 
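A minimal sketch of the saved_xmin bookkeeping in plancache.c: StoreCachedPlan() records the TransactionXmin a transient plan was built under, and RevalidateCachedPlan() marks the plan dead once that value has moved on. TransactionIds are plain integers here and the struct is a stand-in.

    #include <stdbool.h>
    #include <stdio.h>

    typedef unsigned int TransactionId;
    #define InvalidTransactionId 0

    struct toy_plan
    {
        bool          dead;
        TransactionId saved_xmin;
    };

    int main(void)
    {
        TransactionId TransactionXmin = 100;
        struct toy_plan plan = { false, InvalidTransactionId };
        bool transient = true;

        /* StoreCachedPlan(): remember the horizon for transient plans only */
        plan.saved_xmin = transient ? TransactionXmin : InvalidTransactionId;

        /* ... later, RevalidateCachedPlan() runs with an advanced horizon ... */
        TransactionXmin = 120;
        if (!plan.dead &&
            plan.saved_xmin != InvalidTransactionId &&
            plan.saved_xmin != TransactionXmin)
            plan.dead = true;    /* force a replan */

        printf("plan dead: %d\n", plan.dead);
        return 0;
    }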
AcquirePlannerLocks(plan->stmt_list, true); /* + * If plan was transient, check to see if TransactionXmin has + * advanced, and if so invalidate it. + */ + if (!plan->dead && + TransactionIdIsValid(plan->saved_xmin) && + !TransactionIdEquals(plan->saved_xmin, TransactionXmin)) + plan->dead = true; + + /* * By now, if any invalidation has happened, PlanCacheCallback * will have marked the plan dead. */ @@ -790,6 +808,28 @@ rowmark_member(List *rowMarks, int rt_index) } /* + * plan_list_is_transient: check if any of the plans in the list are transient. + */ +static bool +plan_list_is_transient(List *stmt_list) +{ + ListCell *lc; + + foreach(lc, stmt_list) + { + PlannedStmt *plannedstmt = (PlannedStmt *) lfirst(lc); + + if (!IsA(plannedstmt, PlannedStmt)) + continue; /* Ignore utility statements */ + + if (plannedstmt->transientPlan) + return true; + } + + return false; +} + +/* * PlanCacheComputeResultDesc: given a list of either fully-planned statements * or Queries, determine the result tupledesc it will produce. Returns NULL * if the execution will not return tuples. diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index f69fb0c9362..8efa9e6c4e7 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/cache/relcache.c,v 1.262 2007/07/25 22:16:18 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/utils/cache/relcache.c,v 1.263 2007/09/20 17:56:31 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -34,6 +34,7 @@ #include "access/reloptions.h" #include "access/xact.h" #include "catalog/catalog.h" +#include "catalog/index.h" #include "catalog/indexing.h" #include "catalog/namespace.h" #include "catalog/pg_amop.h" @@ -51,6 +52,7 @@ #include "optimizer/clauses.h" #include "optimizer/planmain.h" #include "optimizer/prep.h" +#include "optimizer/var.h" #include "rewrite/rewriteDefine.h" #include "storage/fd.h" #include "storage/smgr.h" @@ -1658,6 +1660,10 @@ RelationReloadIndexInfo(Relation relation) index = (Form_pg_index) GETSTRUCT(tuple); relation->rd_index->indisvalid = index->indisvalid; + relation->rd_index->indcheckxmin = index->indcheckxmin; + relation->rd_index->indisready = index->indisready; + HeapTupleHeaderSetXmin(relation->rd_indextuple->t_data, + HeapTupleHeaderGetXmin(tuple->t_data)); ReleaseSysCache(tuple); } @@ -1762,6 +1768,7 @@ RelationClearRelation(Relation relation, bool rebuild) if (relation->rd_options) pfree(relation->rd_options); list_free(relation->rd_indexlist); + bms_free(relation->rd_indexattr); if (relation->rd_indexcxt) MemoryContextDelete(relation->rd_indexcxt); @@ -2969,6 +2976,7 @@ RelationSetIndexList(Relation relation, List *indexIds, Oid oidIndex) relation->rd_indexvalid = 2; /* mark list as forced */ /* must flag that we have a forced index list */ need_eoxact_work = true; + /* we deliberately do not change rd_indexattr */ } /* @@ -3140,6 +3148,91 @@ RelationGetIndexPredicate(Relation relation) return result; } +/* + * RelationGetIndexAttrBitmap -- get a bitmap of index attribute numbers + * + * The result has a bit set for each attribute used anywhere in the index + * definitions of all the indexes on this relation. (This includes not only + * simple index keys, but attributes used in expressions and partial-index + * predicates.) 
+ * + * Attribute numbers are offset by FirstLowInvalidHeapAttributeNumber so that + * we can include system attributes (e.g., OID) in the bitmap representation. + * + * The returned result is palloc'd in the caller's memory context and should + * be bms_free'd when not needed anymore. + */ +Bitmapset * +RelationGetIndexAttrBitmap(Relation relation) +{ + Bitmapset *indexattrs; + List *indexoidlist; + ListCell *l; + MemoryContext oldcxt; + + /* Quick exit if we already computed the result. */ + if (relation->rd_indexattr != NULL) + return bms_copy(relation->rd_indexattr); + + /* Fast path if definitely no indexes */ + if (!RelationGetForm(relation)->relhasindex) + return NULL; + + /* + * Get cached list of index OIDs + */ + indexoidlist = RelationGetIndexList(relation); + + /* Fall out if no indexes (but relhasindex was set) */ + if (indexoidlist == NIL) + return NULL; + + /* + * For each index, add referenced attributes to indexattrs. + */ + indexattrs = NULL; + foreach(l, indexoidlist) + { + Oid indexOid = lfirst_oid(l); + Relation indexDesc; + IndexInfo *indexInfo; + int i; + + indexDesc = index_open(indexOid, AccessShareLock); + + /* Extract index key information from the index's pg_index row */ + indexInfo = BuildIndexInfo(indexDesc); + + /* Collect simple attribute references */ + for (i = 0; i < indexInfo->ii_NumIndexAttrs; i++) + { + int attrnum = indexInfo->ii_KeyAttrNumbers[i]; + + if (attrnum != 0) + indexattrs = bms_add_member(indexattrs, + attrnum - FirstLowInvalidHeapAttributeNumber); + } + + /* Collect all attributes used in expressions, too */ + pull_varattnos((Node *) indexInfo->ii_Expressions, &indexattrs); + + /* Collect all attributes in the index predicate, too */ + pull_varattnos((Node *) indexInfo->ii_Predicate, &indexattrs); + + index_close(indexDesc, AccessShareLock); + } + + list_free(indexoidlist); + + /* Now save a copy of the bitmap in the relcache entry. 
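A rough sketch of the caching pattern RelationGetIndexAttrBitmap() follows: compute the attribute set once, keep a private copy attached to the long-lived relcache entry, and hand callers their own copy to modify or free. The static pointer and single-word set below are stand-ins for rd_indexattr and the Bitmapset API.

    #include <stdio.h>
    #include <stdlib.h>

    static unsigned int *cached;   /* stand-in for relation->rd_indexattr */

    static unsigned int *get_index_attrs(void)
    {
        unsigned int *result = malloc(sizeof *result);

        if (cached)                      /* fast path: already computed */
        {
            *result = *cached;
            return result;
        }

        *result = (1u << 3) | (1u << 9); /* pretend these attrs are indexed */

        cached = malloc(sizeof *cached); /* save a private copy for next time */
        *cached = *result;

        return result;                   /* caller may modify or free this one */
    }

    int main(void)
    {
        unsigned int *a = get_index_attrs();
        unsigned int *b = get_index_attrs();   /* served from the cache */

        printf("%#x %#x\n", *a, *b);
        free(a);
        free(b);
        return 0;
    }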
*/ + oldcxt = MemoryContextSwitchTo(CacheMemoryContext); + relation->rd_indexattr = bms_copy(indexattrs); + MemoryContextSwitchTo(oldcxt); + + /* We return our original working copy for caller to play with */ + return indexattrs; +} + /* * load_relcache_init_file, write_relcache_init_file @@ -3465,6 +3558,7 @@ load_relcache_init_file(void) rel->rd_refcnt = 0; rel->rd_indexvalid = 0; rel->rd_indexlist = NIL; + rel->rd_indexattr = NULL; rel->rd_oidindex = InvalidOid; rel->rd_createSubid = InvalidSubTransactionId; rel->rd_newRelfilenodeSubid = InvalidSubTransactionId; diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index cade6a26aa9..fbe24c8e45f 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/access/heapam.h,v 1.126 2007/06/09 18:49:55 tgl Exp $ + * $PostgreSQL: pgsql/src/include/access/heapam.h,v 1.127 2007/09/20 17:56:32 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -153,6 +153,10 @@ extern bool heap_fetch(Relation relation, Snapshot snapshot, extern bool heap_release_fetch(Relation relation, Snapshot snapshot, HeapTuple tuple, Buffer *userbuf, bool keep_buf, Relation stats_relation); +extern bool heap_hot_search_buffer(ItemPointer tid, Buffer buffer, + Snapshot snapshot, bool *all_dead); +extern bool heap_hot_search(ItemPointer tid, Relation relation, + Snapshot snapshot, bool *all_dead); extern void heap_get_latest_tid(Relation relation, Snapshot snapshot, ItemPointer tid); @@ -183,6 +187,8 @@ extern void simple_heap_update(Relation relation, ItemPointer otid, extern void heap_markpos(HeapScanDesc scan); extern void heap_restrpos(HeapScanDesc scan); +extern void heap_sync(Relation relation); + extern void heap_redo(XLogRecPtr lsn, XLogRecord *rptr); extern void heap_desc(StringInfo buf, uint8 xl_info, char *rec); extern void heap2_redo(XLogRecPtr lsn, XLogRecord *rptr); @@ -192,7 +198,10 @@ extern XLogRecPtr log_heap_move(Relation reln, Buffer oldbuf, ItemPointerData from, Buffer newbuf, HeapTuple newtup); extern XLogRecPtr log_heap_clean(Relation reln, Buffer buffer, - OffsetNumber *unused, int uncnt); + OffsetNumber *redirected, int nredirected, + OffsetNumber *nowdead, int ndead, + OffsetNumber *nowunused, int nunused, + bool redirect_move); extern XLogRecPtr log_heap_freeze(Relation reln, Buffer buffer, TransactionId cutoff_xid, OffsetNumber *offsets, int offcnt); @@ -240,7 +249,13 @@ extern MinimalTuple minimal_tuple_from_heap_tuple(HeapTuple htup); extern HeapTuple heap_addheader(int natts, bool withoid, Size structlen, void *structure); -extern void heap_sync(Relation relation); +/* in heap/pruneheap.c */ +extern void heap_page_prune_opt(Relation relation, Buffer buffer, + TransactionId OldestXmin); +extern int heap_page_prune(Relation relation, Buffer buffer, + TransactionId OldestXmin, + bool redirect_move, bool report_stats); +extern void heap_get_root_tuples(Page page, OffsetNumber *root_offsets); /* in heap/syncscan.c */ extern void ss_report_location(Relation rel, BlockNumber location); diff --git a/src/include/access/htup.h b/src/include/access/htup.h index ee816c568a8..32a27c972cb 100644 --- a/src/include/access/htup.h +++ b/src/include/access/htup.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the 
University of California * - * $PostgreSQL: pgsql/src/include/access/htup.h,v 1.93 2007/04/06 04:21:43 tgl Exp $ + * $PostgreSQL: pgsql/src/include/access/htup.h,v 1.94 2007/09/20 17:56:32 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -184,8 +184,12 @@ typedef HeapTupleHeaderData *HeapTupleHeader; /* * information stored in t_infomask2: */ -#define HEAP_NATTS_MASK 0x7FF /* 11 bits for number of attributes */ -/* bits 0xF800 are currently unused */ +#define HEAP_NATTS_MASK 0x07FF /* 11 bits for number of attributes */ +/* bits 0x3800 are available */ +#define HEAP_HOT_UPDATED 0x4000 /* tuple was HOT-updated */ +#define HEAP_ONLY_TUPLE 0x8000 /* this is heap-only tuple */ + +#define HEAP2_XACT_MASK 0xC000 /* visibility-related bits */ /* * HeapTupleHeader accessor macros @@ -201,7 +205,7 @@ typedef HeapTupleHeaderData *HeapTupleHeader; #define HeapTupleHeaderSetXmin(tup, xid) \ ( \ - TransactionIdStore((xid), &(tup)->t_choice.t_heap.t_xmin) \ + (tup)->t_choice.t_heap.t_xmin = (xid) \ ) #define HeapTupleHeaderGetXmax(tup) \ @@ -211,7 +215,7 @@ typedef HeapTupleHeaderData *HeapTupleHeader; #define HeapTupleHeaderSetXmax(tup, xid) \ ( \ - TransactionIdStore((xid), &(tup)->t_choice.t_heap.t_xmax) \ + (tup)->t_choice.t_heap.t_xmax = (xid) \ ) /* @@ -255,7 +259,7 @@ do { \ #define HeapTupleHeaderSetXvac(tup, xid) \ do { \ Assert((tup)->t_infomask & HEAP_MOVED); \ - TransactionIdStore((xid), &(tup)->t_choice.t_heap.t_field3.t_xvac); \ + (tup)->t_choice.t_heap.t_field3.t_xvac = (xid); \ } while (0) #define HeapTupleHeaderGetDatumLength(tup) \ @@ -298,6 +302,43 @@ do { \ *((Oid *) ((char *)(tup) + (tup)->t_hoff - sizeof(Oid))) = (oid); \ } while (0) +/* + * Note that we stop considering a tuple HOT-updated as soon as it is known + * aborted or the would-be updating transaction is known aborted. For best + * efficiency, check tuple visibility before using this macro, so that the + * INVALID bits will be as up to date as possible. + */ +#define HeapTupleHeaderIsHotUpdated(tup) \ +( \ + ((tup)->t_infomask2 & HEAP_HOT_UPDATED) != 0 && \ + ((tup)->t_infomask & (HEAP_XMIN_INVALID | HEAP_XMAX_INVALID)) == 0 \ +) + +#define HeapTupleHeaderSetHotUpdated(tup) \ +( \ + (tup)->t_infomask2 |= HEAP_HOT_UPDATED \ +) + +#define HeapTupleHeaderClearHotUpdated(tup) \ +( \ + (tup)->t_infomask2 &= ~HEAP_HOT_UPDATED \ +) + +#define HeapTupleHeaderIsHeapOnly(tup) \ +( \ + (tup)->t_infomask2 & HEAP_ONLY_TUPLE \ +) + +#define HeapTupleHeaderSetHeapOnly(tup) \ +( \ + (tup)->t_infomask2 |= HEAP_ONLY_TUPLE \ +) + +#define HeapTupleHeaderClearHeapOnly(tup) \ +( \ + (tup)->t_infomask2 &= ~HEAP_ONLY_TUPLE \ +) + #define HeapTupleHeaderGetNatts(tup) \ ((tup)->t_infomask2 & HEAP_NATTS_MASK) @@ -331,6 +372,11 @@ do { \ * fit on one heap page. (Note that indexes could have more, because they * use a smaller tuple header.) We arrive at the divisor because each tuple * must be maxaligned, and it must have an associated item pointer. + * + * Note: with HOT, there could theoretically be more line pointers (not actual + * tuples) than this on a heap page. However we constrain the number of line + * pointers to this anyway, to avoid excessive line-pointer bloat and not + * require increases in the size of work arrays. 
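A toy demonstration of the two new t_infomask2 bits defined above: HEAP_HOT_UPDATED marks a tuple whose successor lives on the same page, and HEAP_ONLY_TUPLE marks a tuple reachable only through its HOT chain (it has no index entries of its own). The bit values are the ones from the patch; the struct is a stand-in for the tuple header.

    #include <stdio.h>

    #define HEAP_NATTS_MASK   0x07FF
    #define HEAP_HOT_UPDATED  0x4000
    #define HEAP_ONLY_TUPLE   0x8000

    struct toy_header
    {
        unsigned short t_infomask2;
    };

    int main(void)
    {
        struct toy_header oldtup = { 12 };   /* 12 attributes, no flags */
        struct toy_header newtup = { 12 };

        oldtup.t_infomask2 |= HEAP_HOT_UPDATED;  /* HeapTupleHeaderSetHotUpdated */
        newtup.t_infomask2 |= HEAP_ONLY_TUPLE;   /* HeapTupleHeaderSetHeapOnly */

        printf("old: natts=%u hot-updated=%d\n",
               (unsigned) (oldtup.t_infomask2 & HEAP_NATTS_MASK),
               (oldtup.t_infomask2 & HEAP_HOT_UPDATED) != 0);
        printf("new: natts=%u heap-only=%d\n",
               (unsigned) (newtup.t_infomask2 & HEAP_NATTS_MASK),
               (newtup.t_infomask2 & HEAP_ONLY_TUPLE) != 0);
        return 0;
    }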
*/ #define MaxHeapTuplesPerPage \ ((int) ((BLCKSZ - offsetof(PageHeaderData, pd_linp)) / \ @@ -484,6 +530,24 @@ typedef HeapTupleData *HeapTuple; #define HeapTupleHasExternal(tuple) \ (((tuple)->t_data->t_infomask & HEAP_HASEXTERNAL) != 0) +#define HeapTupleIsHotUpdated(tuple) \ + HeapTupleHeaderIsHotUpdated((tuple)->t_data) + +#define HeapTupleSetHotUpdated(tuple) \ + HeapTupleHeaderSetHotUpdated((tuple)->t_data) + +#define HeapTupleClearHotUpdated(tuple) \ + HeapTupleHeaderClearHotUpdated((tuple)->t_data) + +#define HeapTupleIsHeapOnly(tuple) \ + HeapTupleHeaderIsHeapOnly((tuple)->t_data) + +#define HeapTupleSetHeapOnly(tuple) \ + HeapTupleHeaderSetHeapOnly((tuple)->t_data) + +#define HeapTupleClearHeapOnly(tuple) \ + HeapTupleHeaderClearHeapOnly((tuple)->t_data) + #define HeapTupleGetOid(tuple) \ HeapTupleHeaderGetOid((tuple)->t_data) @@ -497,27 +561,30 @@ typedef HeapTupleData *HeapTuple; * XLOG allows to store some information in high 4 bits of log * record xl_info field. We use 3 for opcode and one for init bit. */ -#define XLOG_HEAP_INSERT 0x00 -#define XLOG_HEAP_DELETE 0x10 -#define XLOG_HEAP_UPDATE 0x20 -#define XLOG_HEAP_MOVE 0x30 -#define XLOG_HEAP_CLEAN 0x40 -#define XLOG_HEAP_NEWPAGE 0x50 -#define XLOG_HEAP_LOCK 0x60 -#define XLOG_HEAP_INPLACE 0x70 -#define XLOG_HEAP_OPMASK 0x70 +#define XLOG_HEAP_INSERT 0x00 +#define XLOG_HEAP_DELETE 0x10 +#define XLOG_HEAP_UPDATE 0x20 +#define XLOG_HEAP_MOVE 0x30 +#define XLOG_HEAP_HOT_UPDATE 0x40 +#define XLOG_HEAP_NEWPAGE 0x50 +#define XLOG_HEAP_LOCK 0x60 +#define XLOG_HEAP_INPLACE 0x70 + +#define XLOG_HEAP_OPMASK 0x70 /* * When we insert 1st item on new page in INSERT/UPDATE * we can (and we do) restore entire page in redo */ -#define XLOG_HEAP_INIT_PAGE 0x80 +#define XLOG_HEAP_INIT_PAGE 0x80 /* * We ran out of opcodes, so heapam.c now has a second RmgrId. These opcodes * are associated with RM_HEAP2_ID, but are not logically different from * the ones above associated with RM_HEAP_ID. We apply XLOG_HEAP_OPMASK, * although currently XLOG_HEAP_INIT_PAGE is not used for any of these. */ -#define XLOG_HEAP2_FREEZE 0x00 +#define XLOG_HEAP2_FREEZE 0x00 +#define XLOG_HEAP2_CLEAN 0x10 +#define XLOG_HEAP2_CLEAN_MOVE 0x20 /* * All what we need to find changed tuple @@ -569,7 +636,7 @@ typedef struct xl_heap_insert #define SizeOfHeapInsert (offsetof(xl_heap_insert, target) + SizeOfHeapTid) -/* This is what we need to know about update|move */ +/* This is what we need to know about update|move|hot_update */ typedef struct xl_heap_update { xl_heaptid target; /* deleted tuple id */ @@ -580,15 +647,34 @@ typedef struct xl_heap_update #define SizeOfHeapUpdate (offsetof(xl_heap_update, newtid) + SizeOfIptrData) -/* This is what we need to know about vacuum page cleanup */ +/* + * This is what we need to know about vacuum page cleanup/redirect + * + * The array of OffsetNumbers following the fixed part of the record contains: + * * for each redirected item: the item offset, then the offset redirected to + * * for each now-dead item: the item offset + * * for each now-unused item: the item offset + * The total number of OffsetNumbers is therefore 2*nredirected+ndead+nunused. + * Note that nunused is not explicitly stored, but may be found by reference + * to the total record length. + * + * If the opcode is CLEAN_MOVE instead of CLEAN, then each redirection pair + * should be interpreted as physically moving the "to" item pointer to the + * "from" slot, rather than placing a redirection item in the "from" slot. 
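A small sketch of the offset-number layout the reworked xl_heap_clean record describes above: 2*nredirected entries forming (from, to) pairs, then ndead entries, then the now-unused entries whose count is implied by the total record length. Plain arrays stand in for the WAL record.

    #include <stdio.h>

    typedef unsigned short OffsetNumber;

    int main(void)
    {
        OffsetNumber offsets[] = { 2, 5,    /* redirect item 2 -> item 5 */
                                   7,       /* now dead */
                                   3, 4 };  /* now unused */
        int nredirected = 1;
        int ndead = 1;
        int total = (int) (sizeof(offsets) / sizeof(offsets[0]));
        int nunused = total - 2 * nredirected - ndead;   /* derived, not stored */

        printf("redirected pairs: %d, dead: %d, unused: %d\n",
               nredirected, ndead, nunused);
        for (int i = 0; i < nredirected; i++)
            printf("redirect %u -> %u\n",
                   (unsigned) offsets[2 * i], (unsigned) offsets[2 * i + 1]);
        return 0;
    }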
+ * The moved pointers should be replaced by LP_UNUSED items (there will not + * be explicit entries in the "now-unused" list for this). Also, the + * HEAP_ONLY bit in the moved tuples must be turned off. + */ typedef struct xl_heap_clean { RelFileNode node; BlockNumber block; - /* UNUSED OFFSET NUMBERS FOLLOW AT THE END */ + uint16 nredirected; + uint16 ndead; + /* OFFSET NUMBERS FOLLOW */ } xl_heap_clean; -#define SizeOfHeapClean (offsetof(xl_heap_clean, block) + sizeof(BlockNumber)) +#define SizeOfHeapClean (offsetof(xl_heap_clean, ndead) + sizeof(uint16)) /* This is for replacing a page's contents in toto */ /* NB: this is used for indexes as well as heaps */ diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index 15b9b8a3374..b145e09e36a 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/access/relscan.h,v 1.56 2007/06/09 18:49:55 tgl Exp $ + * $PostgreSQL: pgsql/src/include/access/relscan.h,v 1.57 2007/09/20 17:56:32 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -82,6 +82,9 @@ typedef struct IndexScanDescData HeapTupleData xs_ctup; /* current heap tuple, if any */ Buffer xs_cbuf; /* current heap buffer in scan, if any */ /* NB: if xs_cbuf is not InvalidBuffer, we hold a pin on that buffer */ + TransactionId xs_prev_xmax; /* previous HOT chain member's XMAX, if any */ + OffsetNumber xs_next_hot; /* next member of HOT chain, if any */ + bool xs_hot_dead; /* T if all members of HOT chain are dead */ } IndexScanDescData; typedef IndexScanDescData *IndexScanDesc; diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index e21606a1259..c295aab857c 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -37,7 +37,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.425 2007/09/18 17:41:17 adunstan Exp $ + * $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.426 2007/09/20 17:56:32 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -53,6 +53,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 200709181 +#define CATALOG_VERSION_NO 200709201 #endif diff --git a/src/include/catalog/pg_attribute.h b/src/include/catalog/pg_attribute.h index 7970cac3c48..006a8e4392d 100644 --- a/src/include/catalog/pg_attribute.h +++ b/src/include/catalog/pg_attribute.h @@ -8,7 +8,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/catalog/pg_attribute.h,v 1.132 2007/09/03 00:39:21 tgl Exp $ + * $PostgreSQL: pgsql/src/include/catalog/pg_attribute.h,v 1.133 2007/09/20 17:56:32 tgl Exp $ * * NOTES * the genbki.sh script reads this file and generates .bki @@ -471,10 +471,12 @@ DATA(insert ( 1259 tableoid 26 0 4 -7 0 -1 -1 t p i t f f t 0)); { 0, {"indisprimary"}, 16, -1, 1, 5, 0, -1, -1, true, 'p', 'c', true, false, false, true, 0 }, \ { 0, {"indisclustered"}, 16, -1, 1, 6, 0, -1, -1, true, 'p', 'c', true, false, false, true, 0 }, \ { 0, {"indisvalid"}, 16, -1, 1, 7, 0, -1, -1, true, 'p', 'c', true, false, false, true, 0 }, \ -{ 0, 
{"indkey"}, 22, -1, -1, 8, 1, -1, -1, false, 'p', 'i', true, false, false, true, 0 }, \ -{ 0, {"indclass"}, 30, -1, -1, 9, 1, -1, -1, false, 'p', 'i', true, false, false, true, 0 }, \ -{ 0, {"indoption"}, 22, -1, -1, 10, 1, -1, -1, false, 'p', 'i', true, false, false, true, 0 }, \ -{ 0, {"indexprs"}, 25, -1, -1, 11, 0, -1, -1, false, 'x', 'i', false, false, false, true, 0 }, \ -{ 0, {"indpred"}, 25, -1, -1, 12, 0, -1, -1, false, 'x', 'i', false, false, false, true, 0 } +{ 0, {"indcheckxmin"}, 16, -1, 1, 8, 0, -1, -1, true, 'p', 'c', true, false, false, true, 0 }, \ +{ 0, {"indisready"}, 16, -1, 1, 9, 0, -1, -1, true, 'p', 'c', true, false, false, true, 0 }, \ +{ 0, {"indkey"}, 22, -1, -1, 10, 1, -1, -1, false, 'p', 'i', true, false, false, true, 0 }, \ +{ 0, {"indclass"}, 30, -1, -1, 11, 1, -1, -1, false, 'p', 'i', true, false, false, true, 0 }, \ +{ 0, {"indoption"}, 22, -1, -1, 12, 1, -1, -1, false, 'p', 'i', true, false, false, true, 0 }, \ +{ 0, {"indexprs"}, 25, -1, -1, 13, 0, -1, -1, false, 'x', 'i', false, false, false, true, 0 }, \ +{ 0, {"indpred"}, 25, -1, -1, 14, 0, -1, -1, false, 'x', 'i', false, false, false, true, 0 } #endif /* PG_ATTRIBUTE_H */ diff --git a/src/include/catalog/pg_index.h b/src/include/catalog/pg_index.h index 31c6e25fb0d..f74ff4af0b9 100644 --- a/src/include/catalog/pg_index.h +++ b/src/include/catalog/pg_index.h @@ -8,7 +8,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/catalog/pg_index.h,v 1.43 2007/01/09 02:14:15 tgl Exp $ + * $PostgreSQL: pgsql/src/include/catalog/pg_index.h,v 1.44 2007/09/20 17:56:32 tgl Exp $ * * NOTES * the genbki.sh script reads this file and generates .bki @@ -42,6 +42,8 @@ CATALOG(pg_index,2610) BKI_WITHOUT_OIDS bool indisprimary; /* is this index for primary key? */ bool indisclustered; /* is this the index last clustered by? */ bool indisvalid; /* is this index valid for use by queries? */ + bool indcheckxmin; /* must we wait for xmin to be old? */ + bool indisready; /* is this index ready for inserts? 
*/ /* VARIABLE LENGTH FIELDS: */ int2vector indkey; /* column numbers of indexed cols, or 0 */ @@ -65,7 +67,7 @@ typedef FormData_pg_index *Form_pg_index; * compiler constants for pg_index * ---------------- */ -#define Natts_pg_index 12 +#define Natts_pg_index 14 #define Anum_pg_index_indexrelid 1 #define Anum_pg_index_indrelid 2 #define Anum_pg_index_indnatts 3 @@ -73,11 +75,13 @@ typedef FormData_pg_index *Form_pg_index; #define Anum_pg_index_indisprimary 5 #define Anum_pg_index_indisclustered 6 #define Anum_pg_index_indisvalid 7 -#define Anum_pg_index_indkey 8 -#define Anum_pg_index_indclass 9 -#define Anum_pg_index_indoption 10 -#define Anum_pg_index_indexprs 11 -#define Anum_pg_index_indpred 12 +#define Anum_pg_index_indcheckxmin 8 +#define Anum_pg_index_indisready 9 +#define Anum_pg_index_indkey 10 +#define Anum_pg_index_indclass 11 +#define Anum_pg_index_indoption 12 +#define Anum_pg_index_indexprs 13 +#define Anum_pg_index_indpred 14 /* * Index AMs that support ordered scans must support these two indoption diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index 5f534839b23..3eaead16bda 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/catalog/pg_proc.h,v 1.470 2007/09/18 17:41:17 adunstan Exp $ + * $PostgreSQL: pgsql/src/include/catalog/pg_proc.h,v 1.471 2007/09/20 17:56:32 tgl Exp $ * * NOTES * The script catalog/genbki.sh reads this file and generates .bki @@ -2873,6 +2873,8 @@ DATA(insert OID = 1932 ( pg_stat_get_tuples_updated PGNSP PGUID 12 1 0 f f t f DESCR("statistics: number of tuples updated"); DATA(insert OID = 1933 ( pg_stat_get_tuples_deleted PGNSP PGUID 12 1 0 f f t f s 1 20 "26" _null_ _null_ _null_ pg_stat_get_tuples_deleted - _null_ _null_ )); DESCR("statistics: number of tuples deleted"); +DATA(insert OID = 1972 ( pg_stat_get_tuples_hot_updated PGNSP PGUID 12 1 0 f f t f s 1 20 "26" _null_ _null_ _null_ pg_stat_get_tuples_hot_updated - _null_ _null_ )); +DESCR("statistics: number of tuples hot updated"); DATA(insert OID = 2878 ( pg_stat_get_live_tuples PGNSP PGUID 12 1 0 f f t f s 1 20 "26" _null_ _null_ _null_ pg_stat_get_live_tuples - _null_ _null_ )); DESCR("statistics: number of live tuples"); DATA(insert OID = 2879 ( pg_stat_get_dead_tuples PGNSP PGUID 12 1 0 f f t f s 1 20 "26" _null_ _null_ _null_ pg_stat_get_dead_tuples - _null_ _null_ )); diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index d886c0149fc..82f851eeacc 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.177 2007/08/15 21:39:50 tgl Exp $ + * $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.178 2007/09/20 17:56:32 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -37,7 +37,12 @@ * Predicate partial-index predicate, or NIL if none * PredicateState exec state for predicate, or NIL if none * Unique is it a unique index? + * ReadyForInserts is it valid for inserts? * Concurrent are we doing a concurrent index build? + * BrokenHotChain did we detect any broken HOT chains? 
+ * + * ii_Concurrent and ii_BrokenHotChain are used only during index build; + * they're conventionally set to false otherwise. * ---------------- */ typedef struct IndexInfo @@ -50,7 +55,9 @@ typedef struct IndexInfo List *ii_Predicate; /* list of Expr */ List *ii_PredicateState; /* list of ExprState */ bool ii_Unique; + bool ii_ReadyForInserts; bool ii_Concurrent; + bool ii_BrokenHotChain; } IndexInfo; /* ---------------- diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index 9d336e4b889..992b47f58d8 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/nodes/plannodes.h,v 1.94 2007/04/27 22:05:49 tgl Exp $ + * $PostgreSQL: pgsql/src/include/nodes/plannodes.h,v 1.95 2007/09/20 17:56:32 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -39,6 +39,8 @@ typedef struct PlannedStmt bool canSetTag; /* do I set the command result tag? */ + bool transientPlan; /* redo plan when TransactionXmin changes? */ + struct Plan *planTree; /* tree of Plan nodes */ List *rtable; /* list of RangeTblEntry nodes */ diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index ab26491a629..32c699b6de6 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/nodes/relation.h,v 1.145 2007/08/31 01:44:06 tgl Exp $ + * $PostgreSQL: pgsql/src/include/nodes/relation.h,v 1.146 2007/09/20 17:56:32 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -71,6 +71,8 @@ typedef struct PlannerGlobal Bitmapset *rewindPlanIDs; /* indices of subplans that require REWIND */ List *finalrtable; /* "flat" rangetable for executor */ + + bool transientPlan; /* redo plan when TransactionXmin changes? 
*/ } PlannerGlobal; /* macro for fetching the Plan associated with a SubPlan node */ diff --git a/src/include/optimizer/var.h b/src/include/optimizer/var.h index 34f8c73f3f6..824ba5a1a4b 100644 --- a/src/include/optimizer/var.h +++ b/src/include/optimizer/var.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/optimizer/var.h,v 1.35 2007/01/05 22:19:56 momjian Exp $ + * $PostgreSQL: pgsql/src/include/optimizer/var.h,v 1.36 2007/09/20 17:56:32 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -18,6 +18,7 @@ extern Relids pull_varnos(Node *node); +extern void pull_varattnos(Node *node, Bitmapset **varattnos); extern bool contain_var_reference(Node *node, int varno, int varattno, int levelsup); extern bool contain_var_clause(Node *node); diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 93f08cd2fbf..9cdeb2ee909 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -5,7 +5,7 @@ * * Copyright (c) 2001-2007, PostgreSQL Global Development Group * - * $PostgreSQL: pgsql/src/include/pgstat.h,v 1.65 2007/09/11 03:28:05 tgl Exp $ + * $PostgreSQL: pgsql/src/include/pgstat.h,v 1.66 2007/09/20 17:56:32 tgl Exp $ * ---------- */ #ifndef PGSTAT_H @@ -55,10 +55,10 @@ typedef int64 PgStat_Counter; * the index AM, while tuples_fetched is the number of tuples successfully * fetched by heap_fetch under the control of simple indexscans for this index. * - * tuples_inserted/tuples_updated/tuples_deleted count attempted actions, + * tuples_inserted/updated/deleted/hot_updated count attempted actions, * regardless of whether the transaction committed. new_live_tuples and * new_dead_tuples are properly adjusted depending on commit or abort. - * Note that new_live_tuples can be negative! + * Note that new_live_tuples and new_dead_tuples can be negative! 
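A toy model of the split introduced in pgstat_count_heap_update(): every update still bumps tuples_updated, and a HOT update additionally bumps the new tuples_hot_updated counter, so the difference between the two is the number of updates that had to insert new index entries.

    #include <stdbool.h>
    #include <stdio.h>

    struct toy_counts
    {
        long updated;
        long hot_updated;
    };

    static void count_update(struct toy_counts *c, bool hot)
    {
        c->updated++;
        if (hot)
            c->hot_updated++;
    }

    int main(void)
    {
        struct toy_counts c = { 0, 0 };

        count_update(&c, true);
        count_update(&c, true);
        count_update(&c, false);

        printf("updated=%ld hot=%ld index-touching=%ld\n",
               c.updated, c.hot_updated, c.updated - c.hot_updated);
        return 0;
    }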
* ---------- */ typedef struct PgStat_TableCounts @@ -71,6 +71,7 @@ typedef struct PgStat_TableCounts PgStat_Counter t_tuples_inserted; PgStat_Counter t_tuples_updated; PgStat_Counter t_tuples_deleted; + PgStat_Counter t_tuples_hot_updated; PgStat_Counter t_new_live_tuples; PgStat_Counter t_new_dead_tuples; @@ -323,7 +324,7 @@ typedef union PgStat_Msg * ------------------------------------------------------------ */ -#define PGSTAT_FILE_FORMAT_ID 0x01A5BC96 +#define PGSTAT_FILE_FORMAT_ID 0x01A5BC97 /* ---------- * PgStat_StatDBEntry The collector's data per database @@ -367,6 +368,7 @@ typedef struct PgStat_StatTabEntry PgStat_Counter tuples_inserted; PgStat_Counter tuples_updated; PgStat_Counter tuples_deleted; + PgStat_Counter tuples_hot_updated; PgStat_Counter n_live_tuples; PgStat_Counter n_dead_tuples; @@ -545,8 +547,9 @@ extern void pgstat_initstats(Relation rel); } while (0) extern void pgstat_count_heap_insert(Relation rel); -extern void pgstat_count_heap_update(Relation rel); +extern void pgstat_count_heap_update(Relation rel, bool hot); extern void pgstat_count_heap_delete(Relation rel); +extern void pgstat_update_heap_dead_tuples(Relation rel, int delta); extern void AtEOXact_PgStat(bool isCommit); extern void AtEOSubXact_PgStat(bool isCommit, int nestDepth); diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index 7a4190d044b..1324befa1e2 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/storage/bufmgr.h,v 1.106 2007/07/25 12:22:53 mha Exp $ + * $PostgreSQL: pgsql/src/include/storage/bufmgr.h,v 1.107 2007/09/20 17:56:32 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -156,6 +156,7 @@ extern void UnlockBuffers(void); extern void LockBuffer(Buffer buffer, int mode); extern bool ConditionalLockBuffer(Buffer buffer); extern void LockBufferForCleanup(Buffer buffer); +extern bool ConditionalLockBufferForCleanup(Buffer buffer); extern void AbortBufferIO(void); diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h index 7e6e429108d..8ca2dd8e38f 100644 --- a/src/include/storage/bufpage.h +++ b/src/include/storage/bufpage.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/storage/bufpage.h,v 1.73 2007/09/12 22:10:26 tgl Exp $ + * $PostgreSQL: pgsql/src/include/storage/bufpage.h,v 1.74 2007/09/20 17:56:32 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -140,10 +140,21 @@ typedef PageHeaderData *PageHeader; * PD_HAS_FREE_LINES is set if there are any LP_UNUSED line pointers before * pd_lower. This should be considered a hint rather than the truth, since * changes to it are not WAL-logged. + * + * PD_PRUNABLE is set if there are any prunable tuples in the page. + * This should be considered a hint rather than the truth, since + * the transaction which generates a prunable tuple may or may not commit. + * Also there is a lag before a tuple is declared dead. + * + * PD_PAGE_FULL is set if an UPDATE doesn't find enough free space in the + * page for its new tuple version; this suggests that a prune is needed. + * Again, this is just a hint. 
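A toy demonstration of the new pd_flags hint bits described above; the bit values match the definitions that follow, and the "page header" is reduced to a single field.

    #include <stdio.h>

    #define PD_HAS_FREE_LINES  0x0001
    #define PD_PRUNABLE        0x0002
    #define PD_PAGE_FULL       0x0004
    #define PD_VALID_FLAG_BITS 0x0007

    struct toy_page
    {
        unsigned short pd_flags;
    };

    int main(void)
    {
        struct toy_page page = { 0 };

        page.pd_flags |= PD_PRUNABLE;    /* PageSetPrunable: a prunable tuple may exist */
        page.pd_flags |= PD_PAGE_FULL;   /* PageSetFull: an UPDATE ran out of room */

        printf("prunable=%d full=%d all-bits-valid=%d\n",
               (page.pd_flags & PD_PRUNABLE) != 0,
               (page.pd_flags & PD_PAGE_FULL) != 0,
               (page.pd_flags & ~PD_VALID_FLAG_BITS) == 0);
        return 0;
    }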
*/ #define PD_HAS_FREE_LINES 0x0001 /* are there any unused line pointers? */ +#define PD_PRUNABLE 0x0002 /* are there any prunable tuples? */ +#define PD_PAGE_FULL 0x0004 /* not enough free space for new tuple? */ -#define PD_VALID_FLAG_BITS 0x0001 /* OR of all valid pd_flags bits */ +#define PD_VALID_FLAG_BITS 0x0007 /* OR of all valid pd_flags bits */ /* * Page layout version number 0 is for pre-7.3 Postgres releases. @@ -337,6 +348,20 @@ typedef PageHeaderData *PageHeader; #define PageClearHasFreeLinePointers(page) \ (((PageHeader) (page))->pd_flags &= ~PD_HAS_FREE_LINES) +#define PageIsPrunable(page) \ + (((PageHeader) (page))->pd_flags & PD_PRUNABLE) +#define PageSetPrunable(page) \ + (((PageHeader) (page))->pd_flags |= PD_PRUNABLE) +#define PageClearPrunable(page) \ + (((PageHeader) (page))->pd_flags &= ~PD_PRUNABLE) + +#define PageIsFull(page) \ + (((PageHeader) (page))->pd_flags & PD_PAGE_FULL) +#define PageSetFull(page) \ + (((PageHeader) (page))->pd_flags |= PD_PAGE_FULL) +#define PageClearFull(page) \ + (((PageHeader) (page))->pd_flags &= ~PD_PAGE_FULL) + /* ---------------------------------------------------------------- * extern declarations @@ -346,12 +371,13 @@ typedef PageHeaderData *PageHeader; extern void PageInit(Page page, Size pageSize, Size specialSize); extern bool PageHeaderIsValid(PageHeader page); extern OffsetNumber PageAddItem(Page page, Item item, Size size, - OffsetNumber offsetNumber, bool overwrite); + OffsetNumber offsetNumber, bool overwrite, bool is_heap); extern Page PageGetTempPage(Page page, Size specialSize); extern void PageRestoreTempPage(Page tempPage, Page oldPage); -extern int PageRepairFragmentation(Page page, OffsetNumber *unused); +extern void PageRepairFragmentation(Page page); extern Size PageGetFreeSpace(Page page); extern Size PageGetExactFreeSpace(Page page); +extern Size PageGetHeapFreeSpace(Page page); extern void PageIndexTupleDelete(Page page, OffsetNumber offset); extern void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems); diff --git a/src/include/utils/plancache.h b/src/include/utils/plancache.h index d8152142aac..0a91e886e29 100644 --- a/src/include/utils/plancache.h +++ b/src/include/utils/plancache.h @@ -8,7 +8,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/utils/plancache.h,v 1.7 2007/06/05 20:00:41 wieck Exp $ + * $PostgreSQL: pgsql/src/include/utils/plancache.h,v 1.8 2007/09/20 17:56:32 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -75,6 +75,8 @@ typedef struct CachedPlan List *stmt_list; /* list of statement or Query nodes */ bool fully_planned; /* do we cache planner or rewriter output? 
diff --git a/src/include/utils/plancache.h b/src/include/utils/plancache.h
index d8152142aac..0a91e886e29 100644
--- a/src/include/utils/plancache.h
+++ b/src/include/utils/plancache.h
@@ -8,7 +8,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/utils/plancache.h,v 1.7 2007/06/05 20:00:41 wieck Exp $
+ * $PostgreSQL: pgsql/src/include/utils/plancache.h,v 1.8 2007/09/20 17:56:32 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -75,6 +75,8 @@ typedef struct CachedPlan
 	List	   *stmt_list;		/* list of statement or Query nodes */
 	bool		fully_planned;	/* do we cache planner or rewriter output? */
 	bool		dead;			/* if true, do not use */
+	TransactionId saved_xmin;	/* if valid, replan when TransactionXmin
+								 * changes from this value */
 	int			refcount;		/* count of live references to this struct */
 	int			generation;		/* counter, starting at 1, for replans */
 	MemoryContext context;		/* context containing this CachedPlan */
diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h
index bc6bf190b86..48569c583b2 100644
--- a/src/include/utils/rel.h
+++ b/src/include/utils/rel.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/utils/rel.h,v 1.101 2007/05/27 03:50:39 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/utils/rel.h,v 1.102 2007/09/20 17:56:32 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -19,6 +19,7 @@
 #include "catalog/pg_class.h"
 #include "catalog/pg_index.h"
 #include "fmgr.h"
+#include "nodes/bitmapset.h"
 #include "rewrite/prs2lock.h"
 #include "storage/block.h"
 #include "storage/relfilenode.h"
@@ -145,6 +146,7 @@ typedef struct RelationData
 	TupleDesc	rd_att;			/* tuple descriptor */
 	Oid			rd_id;			/* relation's object id */
 	List	   *rd_indexlist;	/* list of OIDs of indexes on relation */
+	Bitmapset  *rd_indexattr;	/* identifies columns used in indexes */
 	Oid			rd_oidindex;	/* OID of unique index on OID, if any */
 	LockInfoData rd_lockInfo;	/* lock mgr's info for locking relation */
 	RuleLock   *rd_rules;		/* rewrite rules */
diff --git a/src/include/utils/relcache.h b/src/include/utils/relcache.h
index 25b60082a09..a2b6f21248f 100644
--- a/src/include/utils/relcache.h
+++ b/src/include/utils/relcache.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/utils/relcache.h,v 1.59 2007/03/29 00:15:39 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/utils/relcache.h,v 1.60 2007/09/20 17:56:32 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -29,6 +29,7 @@ extern List *RelationGetIndexList(Relation relation);
 extern Oid	RelationGetOidIndex(Relation relation);
 extern List *RelationGetIndexExpressions(Relation relation);
 extern List *RelationGetIndexPredicate(Relation relation);
+extern Bitmapset *RelationGetIndexAttrBitmap(Relation relation);
 extern void RelationSetIndexList(Relation relation,
 					 List *indexIds, Oid oidIndex);
diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out
index adec0e6c847..3483ba15554 100644
--- a/src/test/regress/expected/create_index.out
+++ b/src/test/regress/expected/create_index.out
@@ -415,6 +415,7 @@ Table "public.concur_heap"
  f2     | text |
 Indexes:
     "concur_index2" UNIQUE, btree (f1)
+    "concur_index3" UNIQUE, btree (f2) INVALID
     "concur_index1" btree (f2, f1)
     "concur_index4" btree (f2) WHERE f1 = 'a'::text
     "concur_index5" btree (f2) WHERE f1 = 'x'::text
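Editor's note: the new rd_indexattr field and RelationGetIndexAttrBitmap() expose a cached bitmap of every column referenced by the relation's indexes, which is exactly the information needed to decide whether an UPDATE touches any indexed column. A rough sketch of such a check; the helper name is hypothetical, and the offset convention used for bitmap members is an assumption (it is stated in a comment so it can be adjusted if the real convention differs).

#include "postgres.h"
#include "access/htup.h"
#include "nodes/bitmapset.h"
#include "utils/rel.h"
#include "utils/relcache.h"

/*
 * Hypothetical helper, for illustration only: does any index of "rel"
 * reference attribute "attnum"?  Assumes the bitmap stores attribute
 * numbers offset by FirstLowInvalidHeapAttributeNumber so that system
 * columns can be represented.
 */
static bool
attr_is_indexed(Relation rel, AttrNumber attnum)
{
	Bitmapset  *indexattrs = RelationGetIndexAttrBitmap(rel);

	return bms_is_member(attnum - FirstLowInvalidHeapAttributeNumber,
						 indexattrs);
}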
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
index bafce821eba..3fc65ea2350 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -1291,13 +1291,13 @@ SELECT viewname, definition FROM pg_views WHERE schemaname <> 'information_schem
 pg_shadow | SELECT pg_authid.rolname AS usename, pg_authid.oid AS usesysid, pg_authid.rolcreatedb AS usecreatedb, pg_authid.rolsuper AS usesuper, pg_authid.rolcatupdate AS usecatupd, pg_authid.rolpassword AS passwd, (pg_authid.rolvaliduntil)::abstime AS valuntil, pg_authid.rolconfig AS useconfig FROM pg_authid WHERE pg_authid.rolcanlogin;
 pg_stat_activity | SELECT d.oid AS datid, d.datname, pg_stat_get_backend_pid(s.backendid) AS procpid, pg_stat_get_backend_userid(s.backendid) AS usesysid, u.rolname AS usename, pg_stat_get_backend_activity(s.backendid) AS current_query, pg_stat_get_backend_waiting(s.backendid) AS waiting, pg_stat_get_backend_xact_start(s.backendid) AS xact_start, pg_stat_get_backend_activity_start(s.backendid) AS query_start, pg_stat_get_backend_start(s.backendid) AS backend_start, pg_stat_get_backend_client_addr(s.backendid) AS client_addr, pg_stat_get_backend_client_port(s.backendid) AS client_port FROM pg_database d, (SELECT pg_stat_get_backend_idset() AS backendid) s, pg_authid u WHERE ((pg_stat_get_backend_dbid(s.backendid) = d.oid) AND (pg_stat_get_backend_userid(s.backendid) = u.oid));
 pg_stat_all_indexes | SELECT c.oid AS relid, i.oid AS indexrelid, n.nspname AS schemaname, c.relname, i.relname AS indexrelname, pg_stat_get_numscans(i.oid) AS idx_scan, pg_stat_get_tuples_returned(i.oid) AS idx_tup_read, pg_stat_get_tuples_fetched(i.oid) AS idx_tup_fetch FROM (((pg_class c JOIN pg_index x ON ((c.oid = x.indrelid))) JOIN pg_class i ON ((i.oid = x.indexrelid))) LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace))) WHERE (c.relkind = ANY (ARRAY['r'::"char", 't'::"char"]));
- pg_stat_all_tables | SELECT c.oid AS relid, n.nspname AS schemaname, c.relname, pg_stat_get_numscans(c.oid) AS seq_scan, pg_stat_get_tuples_returned(c.oid) AS seq_tup_read, (sum(pg_stat_get_numscans(i.indexrelid)))::bigint AS idx_scan, ((sum(pg_stat_get_tuples_fetched(i.indexrelid)))::bigint + pg_stat_get_tuples_fetched(c.oid)) AS idx_tup_fetch, pg_stat_get_tuples_inserted(c.oid) AS n_tup_ins, pg_stat_get_tuples_updated(c.oid) AS n_tup_upd, pg_stat_get_tuples_deleted(c.oid) AS n_tup_del, pg_stat_get_live_tuples(c.oid) AS n_live_tup, pg_stat_get_dead_tuples(c.oid) AS n_dead_tup, pg_stat_get_last_vacuum_time(c.oid) AS last_vacuum, pg_stat_get_last_autovacuum_time(c.oid) AS last_autovacuum, pg_stat_get_last_analyze_time(c.oid) AS last_analyze, pg_stat_get_last_autoanalyze_time(c.oid) AS last_autoanalyze FROM ((pg_class c LEFT JOIN pg_index i ON ((c.oid = i.indrelid))) LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace))) WHERE (c.relkind = ANY (ARRAY['r'::"char", 't'::"char"])) GROUP BY c.oid, n.nspname, c.relname;
+ pg_stat_all_tables | SELECT c.oid AS relid, n.nspname AS schemaname, c.relname, pg_stat_get_numscans(c.oid) AS seq_scan, pg_stat_get_tuples_returned(c.oid) AS seq_tup_read, (sum(pg_stat_get_numscans(i.indexrelid)))::bigint AS idx_scan, ((sum(pg_stat_get_tuples_fetched(i.indexrelid)))::bigint + pg_stat_get_tuples_fetched(c.oid)) AS idx_tup_fetch, pg_stat_get_tuples_inserted(c.oid) AS n_tup_ins, pg_stat_get_tuples_updated(c.oid) AS n_tup_upd, pg_stat_get_tuples_deleted(c.oid) AS n_tup_del, pg_stat_get_tuples_hot_updated(c.oid) AS n_tup_hot_upd, pg_stat_get_live_tuples(c.oid) AS n_live_tup, pg_stat_get_dead_tuples(c.oid) AS n_dead_tup, pg_stat_get_last_vacuum_time(c.oid) AS last_vacuum, pg_stat_get_last_autovacuum_time(c.oid) AS last_autovacuum, pg_stat_get_last_analyze_time(c.oid) AS last_analyze, pg_stat_get_last_autoanalyze_time(c.oid) AS last_autoanalyze FROM ((pg_class c LEFT JOIN pg_index i ON ((c.oid = i.indrelid))) LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace))) WHERE (c.relkind = ANY (ARRAY['r'::"char", 't'::"char"])) GROUP BY c.oid, n.nspname, c.relname;
 pg_stat_bgwriter | SELECT pg_stat_get_bgwriter_timed_checkpoints() AS checkpoints_timed, pg_stat_get_bgwriter_requested_checkpoints() AS checkpoints_req, pg_stat_get_bgwriter_buf_written_checkpoints() AS buffers_checkpoint, pg_stat_get_bgwriter_buf_written_clean() AS buffers_clean, pg_stat_get_bgwriter_maxwritten_clean() AS maxwritten_clean;
 pg_stat_database | SELECT d.oid AS datid, d.datname, pg_stat_get_db_numbackends(d.oid) AS numbackends, pg_stat_get_db_xact_commit(d.oid) AS xact_commit, pg_stat_get_db_xact_rollback(d.oid) AS xact_rollback, (pg_stat_get_db_blocks_fetched(d.oid) - pg_stat_get_db_blocks_hit(d.oid)) AS blks_read, pg_stat_get_db_blocks_hit(d.oid) AS blks_hit, pg_stat_get_db_tuples_returned(d.oid) AS tup_returned, pg_stat_get_db_tuples_fetched(d.oid) AS tup_fetched, pg_stat_get_db_tuples_inserted(d.oid) AS tup_inserted, pg_stat_get_db_tuples_updated(d.oid) AS tup_updated, pg_stat_get_db_tuples_deleted(d.oid) AS tup_deleted FROM pg_database d;
 pg_stat_sys_indexes | SELECT pg_stat_all_indexes.relid, pg_stat_all_indexes.indexrelid, pg_stat_all_indexes.schemaname, pg_stat_all_indexes.relname, pg_stat_all_indexes.indexrelname, pg_stat_all_indexes.idx_scan, pg_stat_all_indexes.idx_tup_read, pg_stat_all_indexes.idx_tup_fetch FROM pg_stat_all_indexes WHERE ((pg_stat_all_indexes.schemaname = ANY (ARRAY['pg_catalog'::name, 'information_schema'::name])) OR (pg_stat_all_indexes.schemaname ~ '^pg_toast'::text));
- pg_stat_sys_tables | SELECT pg_stat_all_tables.relid, pg_stat_all_tables.schemaname, pg_stat_all_tables.relname, pg_stat_all_tables.seq_scan, pg_stat_all_tables.seq_tup_read, pg_stat_all_tables.idx_scan, pg_stat_all_tables.idx_tup_fetch, pg_stat_all_tables.n_tup_ins, pg_stat_all_tables.n_tup_upd, pg_stat_all_tables.n_tup_del, pg_stat_all_tables.n_live_tup, pg_stat_all_tables.n_dead_tup, pg_stat_all_tables.last_vacuum, pg_stat_all_tables.last_autovacuum, pg_stat_all_tables.last_analyze, pg_stat_all_tables.last_autoanalyze FROM pg_stat_all_tables WHERE ((pg_stat_all_tables.schemaname = ANY (ARRAY['pg_catalog'::name, 'information_schema'::name])) OR (pg_stat_all_tables.schemaname ~ '^pg_toast'::text));
+ pg_stat_sys_tables | SELECT pg_stat_all_tables.relid, pg_stat_all_tables.schemaname, pg_stat_all_tables.relname, pg_stat_all_tables.seq_scan, pg_stat_all_tables.seq_tup_read, pg_stat_all_tables.idx_scan, pg_stat_all_tables.idx_tup_fetch, pg_stat_all_tables.n_tup_ins, pg_stat_all_tables.n_tup_upd, pg_stat_all_tables.n_tup_del, pg_stat_all_tables.n_tup_hot_upd, pg_stat_all_tables.n_live_tup, pg_stat_all_tables.n_dead_tup, pg_stat_all_tables.last_vacuum, pg_stat_all_tables.last_autovacuum, pg_stat_all_tables.last_analyze, pg_stat_all_tables.last_autoanalyze FROM pg_stat_all_tables WHERE ((pg_stat_all_tables.schemaname = ANY (ARRAY['pg_catalog'::name, 'information_schema'::name])) OR (pg_stat_all_tables.schemaname ~ '^pg_toast'::text));
 pg_stat_user_indexes | SELECT pg_stat_all_indexes.relid, pg_stat_all_indexes.indexrelid, pg_stat_all_indexes.schemaname, pg_stat_all_indexes.relname, pg_stat_all_indexes.indexrelname, pg_stat_all_indexes.idx_scan, pg_stat_all_indexes.idx_tup_read, pg_stat_all_indexes.idx_tup_fetch FROM pg_stat_all_indexes WHERE ((pg_stat_all_indexes.schemaname <> ALL (ARRAY['pg_catalog'::name, 'information_schema'::name])) AND (pg_stat_all_indexes.schemaname !~ '^pg_toast'::text));
- pg_stat_user_tables | SELECT pg_stat_all_tables.relid, pg_stat_all_tables.schemaname, pg_stat_all_tables.relname, pg_stat_all_tables.seq_scan, pg_stat_all_tables.seq_tup_read, pg_stat_all_tables.idx_scan, pg_stat_all_tables.idx_tup_fetch, pg_stat_all_tables.n_tup_ins, pg_stat_all_tables.n_tup_upd, pg_stat_all_tables.n_tup_del, pg_stat_all_tables.n_live_tup, pg_stat_all_tables.n_dead_tup, pg_stat_all_tables.last_vacuum, pg_stat_all_tables.last_autovacuum, pg_stat_all_tables.last_analyze, pg_stat_all_tables.last_autoanalyze FROM pg_stat_all_tables WHERE ((pg_stat_all_tables.schemaname <> ALL (ARRAY['pg_catalog'::name, 'information_schema'::name])) AND (pg_stat_all_tables.schemaname !~ '^pg_toast'::text));
+ pg_stat_user_tables | SELECT pg_stat_all_tables.relid, pg_stat_all_tables.schemaname, pg_stat_all_tables.relname, pg_stat_all_tables.seq_scan, pg_stat_all_tables.seq_tup_read, pg_stat_all_tables.idx_scan, pg_stat_all_tables.idx_tup_fetch, pg_stat_all_tables.n_tup_ins, pg_stat_all_tables.n_tup_upd, pg_stat_all_tables.n_tup_del, pg_stat_all_tables.n_tup_hot_upd, pg_stat_all_tables.n_live_tup, pg_stat_all_tables.n_dead_tup, pg_stat_all_tables.last_vacuum, pg_stat_all_tables.last_autovacuum, pg_stat_all_tables.last_analyze, pg_stat_all_tables.last_autoanalyze FROM pg_stat_all_tables WHERE ((pg_stat_all_tables.schemaname <> ALL (ARRAY['pg_catalog'::name, 'information_schema'::name])) AND (pg_stat_all_tables.schemaname !~ '^pg_toast'::text));
 pg_statio_all_indexes | SELECT c.oid AS relid, i.oid AS indexrelid, n.nspname AS schemaname, c.relname, i.relname AS indexrelname, (pg_stat_get_blocks_fetched(i.oid) - pg_stat_get_blocks_hit(i.oid)) AS idx_blks_read, pg_stat_get_blocks_hit(i.oid) AS idx_blks_hit FROM (((pg_class c JOIN pg_index x ON ((c.oid = x.indrelid))) JOIN pg_class i ON ((i.oid = x.indexrelid))) LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace))) WHERE (c.relkind = ANY (ARRAY['r'::"char", 't'::"char"]));
 pg_statio_all_sequences | SELECT c.oid AS relid, n.nspname AS schemaname, c.relname, (pg_stat_get_blocks_fetched(c.oid) - pg_stat_get_blocks_hit(c.oid)) AS blks_read, pg_stat_get_blocks_hit(c.oid) AS blks_hit FROM (pg_class c LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace))) WHERE (c.relkind = 'S'::"char");
 pg_statio_all_tables | SELECT c.oid AS relid, n.nspname AS schemaname, c.relname, (pg_stat_get_blocks_fetched(c.oid) - pg_stat_get_blocks_hit(c.oid)) AS heap_blks_read, pg_stat_get_blocks_hit(c.oid) AS heap_blks_hit, (sum((pg_stat_get_blocks_fetched(i.indexrelid) - pg_stat_get_blocks_hit(i.indexrelid))))::bigint AS idx_blks_read, (sum(pg_stat_get_blocks_hit(i.indexrelid)))::bigint AS idx_blks_hit, (pg_stat_get_blocks_fetched(t.oid) - pg_stat_get_blocks_hit(t.oid)) AS toast_blks_read, pg_stat_get_blocks_hit(t.oid) AS toast_blks_hit, (pg_stat_get_blocks_fetched(x.oid) - pg_stat_get_blocks_hit(x.oid)) AS tidx_blks_read, pg_stat_get_blocks_hit(x.oid) AS tidx_blks_hit FROM ((((pg_class c LEFT JOIN pg_index i ON ((c.oid = i.indrelid))) LEFT JOIN pg_class t ON ((c.reltoastrelid = t.oid))) LEFT JOIN pg_class x ON ((t.reltoastidxid = x.oid))) LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace))) WHERE (c.relkind = ANY (ARRAY['r'::"char", 't'::"char"])) GROUP BY c.oid, n.nspname, c.relname, t.oid, x.oid;
 |
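Editor's note: the updated views rely on a new SQL-callable counter accessor, pg_stat_get_tuples_hot_updated(), whose implementation is not part of this excerpt. A sketch of what it presumably looks like, following the template of the existing per-table pg_stat_get_* accessors and the tuples_hot_updated field added to PgStat_StatTabEntry above.

#include "postgres.h"
#include "fmgr.h"
#include "pgstat.h"

/*
 * Sketch only: the real function (in pgstatfuncs.c) is not shown in this
 * excerpt.  It should mirror the other per-table counter accessors: look up
 * the table's stats entry and return the counter, or 0 if there is none.
 */
Datum
pg_stat_get_tuples_hot_updated(PG_FUNCTION_ARGS)
{
	Oid			relid = PG_GETARG_OID(0);
	int64		result;
	PgStat_StatTabEntry *tabentry;

	if ((tabentry = pgstat_fetch_stat_tabentry(relid)) == NULL)
		result = 0;
	else
		result = (int64) (tabentry->tuples_hot_updated);

	PG_RETURN_INT64(result);
}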