6 files changed, 430 insertions, 67 deletions
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 57acaf2bb8c..12775cc2db7 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.220 2006/10/04 00:29:48 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.221 2006/11/05 22:42:07 tgl Exp $
  *
  *
  * INTERFACE ROUTINES
@@ -2809,6 +2809,166 @@ heap_inplace_update(Relation relation, HeapTuple tuple)
 }
 
 
+/*
+ * heap_freeze_tuple
+ *
+ * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
+ * are older than the specified cutoff XID.  If so, replace them with
+ * FrozenTransactionId or InvalidTransactionId as appropriate, and return
+ * TRUE.  Return FALSE if nothing was changed.
+ *
+ * It is assumed that the caller has checked the tuple with
+ * HeapTupleSatisfiesVacuum() and determined that it is not HEAPTUPLE_DEAD
+ * (else we should be removing the tuple, not freezing it).
+ *
+ * NB: cutoff_xid *must* be <= the current global xmin, to ensure that any
+ * XID older than it could neither be running nor seen as running by any
+ * open transaction.  This ensures that the replacement will not change
+ * anyone's idea of the tuple state.  Also, since we assume the tuple is
+ * not HEAPTUPLE_DEAD, the fact that an XID is not still running allows us
+ * to assume that it is either committed good or aborted, as appropriate;
+ * so we need no external state checks to decide what to do.  (This is good
+ * because this function is applied during WAL recovery, when we don't have
+ * access to any such state, and can't depend on the hint bits to be set.)
+ *
+ * In lazy VACUUM, we call this while initially holding only a shared lock
+ * on the tuple's buffer.  If any change is needed, we trade that in for an
+ * exclusive lock before making the change.  Caller should pass the buffer ID
+ * if shared lock is held, InvalidBuffer if exclusive lock is already held.
+ *
+ * Note: it might seem we could make the changes without exclusive lock, since
+ * TransactionId read/write is assumed atomic anyway.  However there is a race
+ * condition: someone who just fetched an old XID that we overwrite here could
+ * conceivably not finish checking the XID against pg_clog before we finish
+ * the VACUUM and perhaps truncate off the part of pg_clog he needs.  Getting
+ * exclusive lock ensures no other backend is in process of checking the
+ * tuple status.  Also, getting exclusive lock makes it safe to adjust the
+ * infomask bits.
+ */
+bool
+heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid,
+				  Buffer buf)
+{
+	bool		changed = false;
+	TransactionId xid;
+
+	xid = HeapTupleHeaderGetXmin(tuple);
+	if (TransactionIdIsNormal(xid) &&
+		TransactionIdPrecedes(xid, cutoff_xid))
+	{
+		if (buf != InvalidBuffer)
+		{
+			/* trade in share lock for exclusive lock */
+			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+			LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+			buf = InvalidBuffer;	/* exclusive lock is now held */
+		}
+		HeapTupleHeaderSetXmin(tuple, FrozenTransactionId);
+		/*
+		 * Might as well fix the hint bits too; usually XMIN_COMMITTED will
+		 * already be set here, but there's a small chance not.
+		 */
+		Assert(!(tuple->t_infomask & HEAP_XMIN_INVALID));
+		tuple->t_infomask |= HEAP_XMIN_COMMITTED;
+		changed = true;
+	}
+
+	/*
+	 * When we release shared lock, it's possible for someone else to change
+	 * xmax before we get the lock back, so repeat the check after acquiring
+	 * exclusive lock.  (We don't need this recheck for xmin, because only
+	 * VACUUM could be interested in changing an existing tuple's xmin,
+	 * and there's only one VACUUM allowed on a table at a time.)
+	 */
+recheck_xmax:
+	if (!(tuple->t_infomask & HEAP_XMAX_IS_MULTI))
+	{
+		xid = HeapTupleHeaderGetXmax(tuple);
+		if (TransactionIdIsNormal(xid) &&
+			TransactionIdPrecedes(xid, cutoff_xid))
+		{
+			if (buf != InvalidBuffer)
+			{
+				/* trade in share lock for exclusive lock */
+				LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+				LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+				buf = InvalidBuffer;
+				goto recheck_xmax;			/* see comment above */
+			}
+			HeapTupleHeaderSetXmax(tuple, InvalidTransactionId);
+			/*
+			 * The tuple might be marked either XMAX_INVALID or
+			 * XMAX_COMMITTED + LOCKED.  Normalize to INVALID just to be
+			 * sure no one gets confused.
+			 */
+			tuple->t_infomask &= ~HEAP_XMAX_COMMITTED;
+			tuple->t_infomask |= HEAP_XMAX_INVALID;
+			changed = true;
+		}
+	}
+	else
+	{
+		/*----------
+		 * XXX perhaps someday we should zero out very old MultiXactIds here?
+		 *
+		 * The only way a stale MultiXactId could pose a problem is if a
+		 * tuple, having once been multiply-share-locked, is not touched by
+		 * any vacuum or attempted lock or deletion for just over 4G MultiXact
+		 * creations, and then in the probably-narrow window where its xmax
+		 * is again a live MultiXactId, someone tries to lock or delete it.
+		 * Even then, another share-lock attempt would work fine.  An
+		 * exclusive-lock or delete attempt would face unexpected delay, or
+		 * in the very worst case get a deadlock error.  This seems an
+		 * extremely low-probability scenario with minimal downside even if
+		 * it does happen, so for now we don't do the extra bookkeeping that
+		 * would be needed to clean out MultiXactIds.
+		 *----------
+		 */
+	}
+
+	/*
+	 * Although xvac per se could only be set by VACUUM, it shares physical
+	 * storage space with cmax, and so could be wiped out by someone setting
+	 * xmax.  Hence recheck after changing lock, same as for xmax itself.
+	 */
+recheck_xvac:
+	if (tuple->t_infomask & HEAP_MOVED)
+	{
+		xid = HeapTupleHeaderGetXvac(tuple);
+		if (TransactionIdIsNormal(xid) &&
+			TransactionIdPrecedes(xid, cutoff_xid))
+		{
+			if (buf != InvalidBuffer)
+			{
+				/* trade in share lock for exclusive lock */
+				LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+				LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+				buf = InvalidBuffer;
+				goto recheck_xvac;			/* see comment above */
+			}
+			/*
+			 * If a MOVED_OFF tuple is not dead, the xvac transaction must
+			 * have failed; whereas a non-dead MOVED_IN tuple must mean the
+			 * xvac transaction succeeded.
+			 */
+			if (tuple->t_infomask & HEAP_MOVED_OFF)
+				HeapTupleHeaderSetXvac(tuple, InvalidTransactionId);
+			else
+				HeapTupleHeaderSetXvac(tuple, FrozenTransactionId);
+			/*
+			 * As in the xmin case, set the XMIN_COMMITTED hint bit too; it
+			 * will usually be set already, but there's a small chance not.
+			 */
+			Assert(!(tuple->t_infomask & HEAP_XMIN_INVALID));
+			tuple->t_infomask |= HEAP_XMIN_COMMITTED;
+			changed = true;
+		}
+	}
+
+	return changed;
+}
+
+
 /* ----------------
  *		heap_markpos	- mark scan position
  * ----------------
@@ -2877,6 +3037,9 @@ heap_restrpos(HeapScanDesc scan)
 /*
  * Perform XLogInsert for a heap-clean operation.  Caller must already
  * have modified the buffer and marked it dirty.
+ *
+ * Note: for historical reasons, the entries in the unused[] array should
+ * be zero-based tuple indexes, not one-based.
  */
 XLogRecPtr
 log_heap_clean(Relation reln, Buffer buffer, OffsetNumber *unused, int uncnt)
@@ -2921,6 +3084,57 @@ log_heap_clean(Relation reln, Buffer buffer, OffsetNumber *unused, int uncnt)
 }
 
 /*
+ * Perform XLogInsert for a heap-freeze operation.  Caller must already
+ * have modified the buffer and marked it dirty.
+ *
+ * Unlike log_heap_clean(), the offsets[] entries are one-based.
+ */
+XLogRecPtr
+log_heap_freeze(Relation reln, Buffer buffer,
+				TransactionId cutoff_xid,
+				OffsetNumber *offsets, int offcnt)
+{
+	xl_heap_freeze xlrec;
+	XLogRecPtr	recptr;
+	XLogRecData rdata[2];
+
+	/* Caller should not call me on a temp relation */
+	Assert(!reln->rd_istemp);
+
+	xlrec.node = reln->rd_node;
+	xlrec.block = BufferGetBlockNumber(buffer);
+	xlrec.cutoff_xid = cutoff_xid;
+
+	rdata[0].data = (char *) &xlrec;
+	rdata[0].len = SizeOfHeapFreeze;
+	rdata[0].buffer = InvalidBuffer;
+	rdata[0].next = &(rdata[1]);
+
+	/*
+	 * The tuple-offsets array is not actually in the buffer, but pretend
+	 * that it is.  That way, when XLogInsert stores the whole buffer, the
+	 * offsets array need not be stored too.
+	 */
+	if (offcnt > 0)
+	{
+		rdata[1].data = (char *) offsets;
+		rdata[1].len = offcnt * sizeof(OffsetNumber);
+	}
+	else
+	{
+		rdata[1].data = NULL;
+		rdata[1].len = 0;
+	}
+	rdata[1].buffer = buffer;	/* tie the offsets array to the page */
+	rdata[1].buffer_std = true;
+	rdata[1].next = NULL;
+
+	recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_FREEZE, rdata);
+
+	return recptr;
+}
+
+/*
  * Perform XLogInsert for a heap-update operation.	Caller must already
  * have modified the buffer(s) and marked them dirty.
  */
@@ -3057,6 +3271,7 @@ heap_xlog_clean(XLogRecPtr lsn, XLogRecord *record)
 
 		while (unused < unend)
 		{
+			/* unused[] entries are zero-based */
 			lp = PageGetItemId(page, *unused + 1);
 			lp->lp_flags &= ~LP_USED;
 			unused++;
@@ -3072,6 +3287,55 @@ heap_xlog_clean(XLogRecPtr lsn, XLogRecord *record)
 }
 
 static void
+heap_xlog_freeze(XLogRecPtr lsn, XLogRecord *record)
+{
+	xl_heap_freeze *xlrec = (xl_heap_freeze *) XLogRecGetData(record);
+	TransactionId cutoff_xid = xlrec->cutoff_xid;
+	Relation	reln;
+	Buffer		buffer;
+	Page		page;
+
+	if (record->xl_info & XLR_BKP_BLOCK_1)
+		return;
+
+	reln = XLogOpenRelation(xlrec->node);
+	buffer = XLogReadBuffer(reln, xlrec->block, false);
+	if (!BufferIsValid(buffer))
+		return;
+	page = (Page) BufferGetPage(buffer);
+
+	if (XLByteLE(lsn, PageGetLSN(page)))
+	{
+		UnlockReleaseBuffer(buffer);
+		return;
+	}
+
+	if (record->xl_len > SizeOfHeapFreeze)	/* offsets follow the header */
+	{
+		OffsetNumber *offsets;
+		OffsetNumber *offsets_end;
+
+		offsets = (OffsetNumber *) ((char *) xlrec + SizeOfHeapFreeze);
+		offsets_end = (OffsetNumber *) ((char *) xlrec + record->xl_len);
+
+		while (offsets < offsets_end)
+		{
+			/* offsets[] entries are one-based, so use them unadjusted */
+			ItemId		lp = PageGetItemId(page, *offsets);
+			HeapTupleHeader tuple = (HeapTupleHeader) PageGetItem(page, lp);
+
+			(void) heap_freeze_tuple(tuple, cutoff_xid, InvalidBuffer);
+			offsets++;
+		}
+	}
+
+	PageSetLSN(page, lsn);
+	PageSetTLI(page, ThisTimeLineID);
+	MarkBufferDirty(buffer);
+	UnlockReleaseBuffer(buffer);
+}
+
+static void
 heap_xlog_newpage(XLogRecPtr lsn, XLogRecord *record)
 {
 	xl_heap_newpage *xlrec = (xl_heap_newpage *) XLogRecGetData(record);
@@ -3546,6 +3810,18 @@ heap_redo(XLogRecPtr lsn, XLogRecord *record)
 		elog(PANIC, "heap_redo: unknown op code %u", info);
 }
 
+void
+heap2_redo(XLogRecPtr lsn, XLogRecord *record)
+{
+	uint8		info = record->xl_info & ~XLR_INFO_MASK;
+
+	info &= XLOG_HEAP_OPMASK;	/* reduce to the opcode compared below */
+	if (info == XLOG_HEAP2_FREEZE)
+		heap_xlog_freeze(lsn, record);
+	else
+		elog(PANIC, "heap2_redo: unknown op code %u", info);
+}
+
 static void
 out_target(StringInfo buf, xl_heaptid *target)
 {
@@ -3645,3 +3921,22 @@ heap_desc(StringInfo buf, uint8 xl_info, char *rec)
 	else
 		appendStringInfo(buf, "UNKNOWN");
 }
+
+void
+heap2_desc(StringInfo buf, uint8 xl_info, char *rec)
+{
+	uint8		info = xl_info & ~XLR_INFO_MASK;
+
+	info &= XLOG_HEAP_OPMASK;	/* reduce to the opcode compared below */
+	if (info == XLOG_HEAP2_FREEZE)
+	{
+		xl_heap_freeze *xlrec = (xl_heap_freeze *) rec;
+
+		appendStringInfo(buf, "freeze: rel %u/%u/%u; blk %u; cutoff %u",
+						 xlrec->node.spcNode, xlrec->node.dbNode,
+						 xlrec->node.relNode, xlrec->block,
+						 xlrec->cutoff_xid);
+	}
+	else
+		appendStringInfo(buf, "UNKNOWN");
+}
diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c
index f57bdefa3a0..5817239a374 100644
--- a/src/backend/access/transam/clog.c
+++ b/src/backend/access/transam/clog.c
@@ -24,7 +24,7 @@
  * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/backend/access/transam/clog.c,v 1.40 2006/10/04 00:29:49 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/clog.c,v 1.41 2006/11/05 22:42:07 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -69,6 +69,7 @@ static SlruCtlData ClogCtlData;
 static int	ZeroCLOGPage(int pageno, bool writeXlog);
 static bool CLOGPagePrecedes(int page1, int page2);
 static void WriteZeroPageXlogRec(int pageno);
+static void WriteTruncateXlogRec(int pageno);
 
 
 /*
@@ -309,16 +310,17 @@ ExtendCLOG(TransactionId newestXact)
 /*
  * Remove all CLOG segments before the one holding the passed transaction ID
  *
- * When this is called, we know that the database logically contains no
- * reference to transaction IDs older than oldestXact.	However, we must
- * not truncate the CLOG until we have performed a checkpoint, to ensure
- * that no such references remain on disk either; else a crash just after
- * the truncation might leave us with a problem.  Since CLOG segments hold
- * a large number of transactions, the opportunity to actually remove a
- * segment is fairly rare, and so it seems best not to do the checkpoint
- * unless we have confirmed that there is a removable segment.	Therefore
- * we issue the checkpoint command here, not in higher-level code as might
- * seem cleaner.
+ * Before removing any CLOG data, we must flush XLOG to disk, to ensure
+ * that any recently-emitted HEAP_FREEZE records have reached disk; otherwise
+ * a crash and restart might leave us with some unfrozen tuples referencing
+ * removed CLOG data.  We choose to emit a special TRUNCATE XLOG record too.
+ * Replaying the deletion from XLOG is not critical, since the files could
+ * just as well be removed later, but doing so prevents a long-running hot
+ * standby server from acquiring an unreasonably bloated CLOG directory.
+ *
+ * Since CLOG segments hold a large number of transactions, the opportunity to
+ * actually remove a segment is fairly rare, and so it seems best not to do
+ * the XLOG flush unless we have confirmed that there is a removable segment.
  */
 void
 TruncateCLOG(TransactionId oldestXact)
@@ -335,8 +337,8 @@ TruncateCLOG(TransactionId oldestXact)
 	if (!SlruScanDirectory(ClogCtl, cutoffPage, false))
 		return;					/* nothing to remove */
 
-	/* Perform a CHECKPOINT */
-	RequestCheckpoint(true, false);
+	/* Write XLOG record and flush XLOG to disk */
+	WriteTruncateXlogRec(cutoffPage);
 
 	/* Now we can remove the old CLOG segment(s) */
 	SimpleLruTruncate(ClogCtl, cutoffPage);
@@ -387,6 +389,29 @@ WriteZeroPageXlogRec(int pageno)
 }
 
 /*
+ * Write a TRUNCATE xlog record
+ *
+ * We must flush the xlog record to disk before returning --- see notes
+ * in TruncateCLOG().
+ *
+ * Note: xlog record is marked as outside transaction control, since we
+ * want it to be redone whether the invoking transaction commits or not.
+ */
+static void
+WriteTruncateXlogRec(int pageno)
+{
+	XLogRecData rdata;
+	XLogRecPtr	recptr;
+
+	rdata.data = (char *) (&pageno);	/* payload is just the page number */
+	rdata.len = sizeof(int);
+	rdata.buffer = InvalidBuffer;
+	rdata.next = NULL;
+	recptr = XLogInsert(RM_CLOG_ID, CLOG_TRUNCATE | XLOG_NO_TRAN, &rdata);
+	XLogFlush(recptr);			/* must reach disk before we return */
+}
+
+/*
  * CLOG resource manager's routines
  */
 void
@@ -409,6 +434,22 @@ clog_redo(XLogRecPtr lsn, XLogRecord *record)
 
 		LWLockRelease(CLogControlLock);
 	}
+	else if (info == CLOG_TRUNCATE)
+	{
+		int			pageno;
+
+		memcpy(&pageno, XLogRecGetData(record), sizeof(int));
+
+		/*
+		 * During XLOG replay, latest_page_number isn't set up yet; insert
+		 * a suitable value to bypass the sanity test in SimpleLruTruncate.
+		 */
+		ClogCtl->shared->latest_page_number = pageno;
+
+		SimpleLruTruncate(ClogCtl, pageno);
+	}
+	else
+		elog(PANIC, "clog_redo: unknown op code %u", info);
 }
 
 void
@@ -423,6 +464,13 @@ clog_desc(StringInfo buf, uint8 xl_info, char *rec)
 		memcpy(&pageno, rec, sizeof(int));
 		appendStringInfo(buf, "zeropage: %d", pageno);
 	}
+	else if (info == CLOG_TRUNCATE)
+	{
+		int			pageno;
+
+		memcpy(&pageno, rec, sizeof(int));
+		appendStringInfo(buf, "truncate before: %d", pageno);
+	}
 	else
 		appendStringInfo(buf, "UNKNOWN");
 }
diff --git a/src/backend/access/transam/rmgr.c b/src/backend/access/transam/rmgr.c
index 7db05c9c482..08de22eaa4a 100644
--- a/src/backend/access/transam/rmgr.c
+++ b/src/backend/access/transam/rmgr.c
@@ -3,7 +3,7 @@
  *
  * Resource managers definition
  *
- * $PostgreSQL: pgsql/src/backend/access/transam/rmgr.c,v 1.24 2006/08/07 16:57:56 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/rmgr.c,v 1.25 2006/11/05 22:42:07 tgl Exp $
  */
 #include "postgres.h"
 
@@ -32,7 +32,7 @@ const RmgrData RmgrTable[RM_MAX_ID + 1] = {
 	{"MultiXact", multixact_redo, multixact_desc, NULL, NULL, NULL},
 	{"Reserved 7", NULL, NULL, NULL, NULL, NULL},
 	{"Reserved 8", NULL, NULL, NULL, NULL, NULL},
-	{"Reserved 9", NULL, NULL, NULL, NULL, NULL},
+	{"Heap2", heap2_redo, heap2_desc, NULL, NULL, NULL},
 	{"Heap", heap_redo, heap_desc, NULL, NULL, NULL},
 	{"Btree", btree_redo, btree_desc, btree_xlog_startup, btree_xlog_cleanup, btree_safe_restartpoint},
 	{"Hash", hash_redo, hash_desc, NULL, NULL, NULL},
diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c
index 04e9840cb5d..a8abaaea35c 100644
--- a/src/backend/access/transam/varsup.c
+++ b/src/backend/access/transam/varsup.c
@@ -6,7 +6,7 @@
  * Copyright (c) 2000-2006, PostgreSQL Global Development Group
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/transam/varsup.c,v 1.75 2006/10/04 00:29:49 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/transam/varsup.c,v 1.76 2006/11/05 22:42:07 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -17,6 +17,8 @@
 #include "access/subtrans.h"
 #include "access/transam.h"
 #include "miscadmin.h"
+#include "postmaster/autovacuum.h"
+#include "storage/pmsignal.h"
 #include "storage/proc.h"
 #include "utils/builtins.h"
 
@@ -47,20 +49,31 @@ GetNewTransactionId(bool isSubXact)
 
 	xid = ShmemVariableCache->nextXid;
 
-	/*
+	/*----------
 	 * Check to see if it's safe to assign another XID.  This protects against
 	 * catastrophic data loss due to XID wraparound.  The basic rules are:
-	 * warn if we're past xidWarnLimit, and refuse to execute transactions if
-	 * we're past xidStopLimit, unless we are running in a standalone backend
-	 * (which gives an escape hatch to the DBA who ignored all those
-	 * warnings).
+	 *
+	 * If we're past xidVacLimit, start trying to force autovacuum cycles.
+	 * If we're past xidWarnLimit, start issuing warnings.
+	 * If we're past xidStopLimit, refuse to execute transactions, unless
+	 * we are running in a standalone backend (which gives an escape hatch
+	 * to the DBA who somehow got past the earlier defenses).
 	 *
 	 * Test is coded to fall out as fast as possible during normal operation,
-	 * ie, when the warn limit is set and we haven't violated it.
+	 * ie, when the vac limit is set and we haven't violated it.
+	 *----------
 	 */
-	if (TransactionIdFollowsOrEquals(xid, ShmemVariableCache->xidWarnLimit) &&
-		TransactionIdIsValid(ShmemVariableCache->xidWarnLimit))
+	if (TransactionIdFollowsOrEquals(xid, ShmemVariableCache->xidVacLimit) &&
+		TransactionIdIsValid(ShmemVariableCache->xidVacLimit))
 	{
+		/*
+		 * To avoid swamping the postmaster with signals, we issue the
+		 * autovac request only once per 64K transaction starts.  This
+		 * still gives plenty of chances before we get into real trouble.
+		 */
+		if (IsUnderPostmaster && (xid % 65536) == 0)
+			SendPostmasterSignal(PMSIGNAL_START_AUTOVAC);
+
 		if (IsUnderPostmaster &&
 		 TransactionIdFollowsOrEquals(xid, ShmemVariableCache->xidStopLimit))
 			ereport(ERROR,
@@ -69,7 +82,7 @@ GetNewTransactionId(bool isSubXact)
 							NameStr(ShmemVariableCache->limit_datname)),
 					 errhint("Stop the postmaster and use a standalone backend to vacuum database \"%s\".",
 							 NameStr(ShmemVariableCache->limit_datname))));
-		else
+		else if (TransactionIdFollowsOrEquals(xid, ShmemVariableCache->xidWarnLimit))
 			ereport(WARNING,
 			(errmsg("database \"%s\" must be vacuumed within %u transactions",
 					NameStr(ShmemVariableCache->limit_datname),
@@ -178,28 +191,29 @@ ReadNewTransactionId(void)
 
 /*
  * Determine the last safe XID to allocate given the currently oldest
- * datminxid (ie, the oldest XID that might exist in any database
+ * datfrozenxid (ie, the oldest XID that might exist in any database
  * of our cluster).
  */
 void
-SetTransactionIdLimit(TransactionId oldest_datminxid,
+SetTransactionIdLimit(TransactionId oldest_datfrozenxid,
 					  Name oldest_datname)
 {
+	TransactionId xidVacLimit;
 	TransactionId xidWarnLimit;
 	TransactionId xidStopLimit;
 	TransactionId xidWrapLimit;
 	TransactionId curXid;
 
-	Assert(TransactionIdIsValid(oldest_datminxid));
+	Assert(TransactionIdIsNormal(oldest_datfrozenxid));
 
 	/*
 	 * The place where we actually get into deep trouble is halfway around
-	 * from the oldest existing XID.  (This calculation is probably off by one
-	 * or two counts, because the special XIDs reduce the size of the loop a
-	 * little bit.	But we throw in plenty of slop below, so it doesn't
-	 * matter.)
+	 * from the oldest potentially-existing XID.  (This calculation is
+	 * probably off by one or two counts, because the special XIDs reduce the
+	 * size of the loop a little bit.  But we throw in plenty of slop below,
+	 * so it doesn't matter.)
 	 */
-	xidWrapLimit = oldest_datminxid + (MaxTransactionId >> 1);
+	xidWrapLimit = oldest_datfrozenxid + (MaxTransactionId >> 1);
 	if (xidWrapLimit < FirstNormalTransactionId)
 		xidWrapLimit += FirstNormalTransactionId;
 
@@ -229,8 +243,28 @@ SetTransactionIdLimit(TransactionId oldest_datminxid,
 	if (xidWarnLimit < FirstNormalTransactionId)
 		xidWarnLimit -= FirstNormalTransactionId;
 
+	/*
+	 * We'll start trying to force autovacuums when oldest_datfrozenxid
+	 * gets to be more than autovacuum_freeze_max_age transactions old.
+	 *
+	 * Note: guc.c ensures that autovacuum_freeze_max_age is in a sane
+	 * range, so that xidVacLimit will be well before xidWarnLimit.
+	 *
+	 * Note: autovacuum_freeze_max_age is a PGC_POSTMASTER parameter so that
+	 * we don't have to worry about dealing with on-the-fly changes in its
+	 * value.  It doesn't look practical to update shared state from a GUC
+	 * assign hook (too many processes would try to execute the hook,
+	 * resulting in race conditions as well as crashes of those not
+	 * connected to shared memory).  Perhaps this can be improved someday.
+	 */
+	xidVacLimit = oldest_datfrozenxid + autovacuum_freeze_max_age;
+	if (xidVacLimit < FirstNormalTransactionId)
+		xidVacLimit += FirstNormalTransactionId;
+
 	/* Grab lock for just long enough to set the new limit values */
 	LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
+	ShmemVariableCache->oldestXid = oldest_datfrozenxid;
+	ShmemVariableCache->xidVacLimit = xidVacLimit;
 	ShmemVariableCache->xidWarnLimit = xidWarnLimit;
 	ShmemVariableCache->xidStopLimit = xidStopLimit;
 	ShmemVariableCache->xidWrapLimit = xidWrapLimit;
@@ -242,6 +276,18 @@ SetTransactionIdLimit(TransactionId oldest_datminxid,
 	ereport(DEBUG1,
 	   (errmsg("transaction ID wrap limit is %u, limited by database \"%s\"",
 			   xidWrapLimit, NameStr(*oldest_datname))));
+
+	/*
+	 * If past the autovacuum force point, immediately signal an autovac
+	 * request.  The reason for this is that autovac only processes one
+	 * database per invocation.  Once it's finished cleaning up the oldest
+	 * database, it'll call here, and we'll signal the postmaster to start
+	 * another iteration immediately if there are still any old databases.
+	 */
+	if (TransactionIdFollowsOrEquals(curXid, xidVacLimit) &&
+		IsUnderPostmaster)
+		SendPostmasterSignal(PMSIGNAL_START_AUTOVAC);
+
 	/* Give an immediate warning if past the wrap warn point */
 	if (TransactionIdFollowsOrEquals(curXid, xidWarnLimit))
 		ereport(WARNING,
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index 8e1724989cb..3c6e2ebf5cd 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -10,7 +10,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.227 2006/10/04 00:29:49 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.228 2006/11/05 22:42:07 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -468,8 +468,12 @@ TransactionIdIsCurrentTransactionId(TransactionId xid)
 	 * is what we need during bootstrap.  (Bootstrap mode only inserts tuples,
 	 * it never updates or deletes them, so all tuples can be presumed good
 	 * immediately.)
+	 *
+	 * Likewise, InvalidTransactionId and FrozenTransactionId are certainly
+	 * not my transaction ID, so we can just return "false" immediately for
+	 * any non-normal XID.
 	 */
-	if (xid == BootstrapTransactionId)
+	if (!TransactionIdIsNormal(xid))
 		return false;
 
 	/*
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index ba029397f87..03440cbf48b 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.252 2006/10/18 22:44:11 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.253 2006/11/05 22:42:08 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -5344,36 +5344,6 @@ GetLastSegSwitchTime(void)
 }
 
 /*
- * GetRecentNextXid - get the nextXid value saved by the most recent checkpoint
- *
- * This is currently used only by the autovacuum daemon.  To check for
- * impending XID wraparound, autovac needs an approximate idea of the current
- * XID counter, and it needs it before choosing which DB to attach to, hence
- * before it sets up a PGPROC, hence before it can take any LWLocks.  But it
- * has attached to shared memory, and so we can let it reach into the shared
- * ControlFile structure and pull out the last checkpoint nextXID.
- *
- * Since we don't take any sort of lock, we have to assume that reading a
- * TransactionId is atomic ... but that assumption is made elsewhere, too,
- * and in any case the worst possible consequence of a bogus result is that
- * autovac issues an unnecessary database-wide VACUUM.
- *
- * Note: we could also choose to read ShmemVariableCache->nextXid in an
- * unlocked fashion, thus getting a more up-to-date result; but since that
- * changes far more frequently than the controlfile checkpoint copy, it would
- * pose a far higher risk of bogus result if we did have a nonatomic-read
- * problem.
- *
- * A (theoretically) completely safe answer is to read the actual pg_control
- * file into local process memory, but that certainly seems like overkill.
- */
-TransactionId
-GetRecentNextXid(void)
-{
-	return ControlFile->checkPointCopy.nextXid;
-}
-
-/*
  * GetNextXidAndEpoch - get the current nextXid value and associated epoch
  *
  * This is exported for use by code that would like to have 64-bit XIDs.