aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- src/backend/access/transam/xact.c 28
-rw-r--r-- src/backend/access/transam/xlog.c 15
-rw-r--r-- src/backend/storage/ipc/procarray.c 262
-rw-r--r-- src/backend/storage/ipc/standby.c 58
-rw-r--r-- src/include/storage/procarray.h 5
-rw-r--r-- src/include/storage/standby.h 4
6 files changed, 252 insertions, 120 deletions
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index 43966d5ab6f..91fbbd0be6b 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -10,7 +10,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.289 2010/02/26 02:00:34 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.290 2010/05/13 11:15:38 sriggs Exp $
*
*-------------------------------------------------------------------------
*/
@@ -4378,7 +4378,7 @@ xact_redo_commit(xl_xact_commit *xlrec, TransactionId xid, XLogRecPtr lsn)
LWLockRelease(XidGenLock);
}
- if (!InHotStandby)
+ if (standbyState == STANDBY_DISABLED)
{
/*
* Mark the transaction committed in pg_clog.
@@ -4412,12 +4412,12 @@ xact_redo_commit(xl_xact_commit *xlrec, TransactionId xid, XLogRecPtr lsn)
/*
* We must mark clog before we update the ProcArray.
*/
- ExpireTreeKnownAssignedTransactionIds(xid, xlrec->nsubxacts, sub_xids);
+ ExpireTreeKnownAssignedTransactionIds(xid, xlrec->nsubxacts, sub_xids, max_xid);
/*
* Send any cache invalidations attached to the commit. We must
* maintain the same order of invalidation then release locks as
- * occurs in .
+ * occurs in CommitTransaction().
*/
ProcessCommittedInvalidationMessages(inval_msgs, xlrec->nmsgs,
XactCompletionRelcacheInitFileInval(xlrec),
@@ -4499,7 +4499,12 @@ xact_redo_abort(xl_xact_abort *xlrec, TransactionId xid)
LWLockRelease(XidGenLock);
}
- if (InHotStandby)
+ if (standbyState == STANDBY_DISABLED)
+ {
+ /* Mark the transaction aborted in pg_clog, no need for async stuff */
+ TransactionIdAbortTree(xid, xlrec->nsubxacts, sub_xids);
+ }
+ else
{
/*
* If a transaction completion record arrives that has as-yet
@@ -4511,17 +4516,14 @@ xact_redo_abort(xl_xact_abort *xlrec, TransactionId xid)
* already. Leave it in.
*/
RecordKnownAssignedTransactionIds(max_xid);
- }
- /* Mark the transaction aborted in pg_clog, no need for async stuff */
- TransactionIdAbortTree(xid, xlrec->nsubxacts, sub_xids);
+ /* Mark the transaction aborted in pg_clog, no need for async stuff */
+ TransactionIdAbortTree(xid, xlrec->nsubxacts, sub_xids);
- if (InHotStandby)
- {
/*
- * We must mark clog before we update the ProcArray.
+ * We must update the ProcArray after we have marked clog.
*/
- ExpireTreeKnownAssignedTransactionIds(xid, xlrec->nsubxacts, sub_xids);
+ ExpireTreeKnownAssignedTransactionIds(xid, xlrec->nsubxacts, sub_xids, max_xid);
/*
* There are no flat files that need updating, nor invalidation
@@ -4596,7 +4598,7 @@ xact_redo(XLogRecPtr lsn, XLogRecord *record)
{
xl_xact_assignment *xlrec = (xl_xact_assignment *) XLogRecGetData(record);
- if (InHotStandby)
+ if (standbyState >= STANDBY_INITIALIZED)
ProcArrayApplyXidAssignment(xlrec->xtop,
xlrec->nsubxacts, xlrec->xsub);
}
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index a39d455a7f0..3253bdad57c 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.409 2010/05/03 11:17:52 heikki Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.410 2010/05/13 11:15:38 sriggs Exp $
*
*-------------------------------------------------------------------------
*/
@@ -5995,6 +5995,7 @@ StartupXLOG(void)
if (wasShutdown)
{
RunningTransactionsData running;
+ TransactionId latestCompletedXid;
/*
* Construct a RunningTransactions snapshot representing a shut
@@ -6006,6 +6007,9 @@ StartupXLOG(void)
running.subxid_overflow = false;
running.nextXid = checkPoint.nextXid;
running.oldestRunningXid = oldestActiveXID;
+ latestCompletedXid = checkPoint.nextXid;
+ TransactionIdRetreat(latestCompletedXid);
+ running.latestCompletedXid = latestCompletedXid;
running.xids = xids;
ProcArrayApplyRecoveryInfo(&running);
@@ -6154,8 +6158,9 @@ StartupXLOG(void)
xlogctl->recoveryLastXTime = recoveryLastXTime;
SpinLockRelease(&xlogctl->info_lck);
- /* In Hot Standby mode, keep track of XIDs we've seen */
- if (InHotStandby && TransactionIdIsValid(record->xl_xid))
+ /* If we are attempting to enter Hot Standby mode, process XIDs we see */
+ if (standbyState >= STANDBY_INITIALIZED &&
+ TransactionIdIsValid(record->xl_xid))
RecordKnownAssignedTransactionIds(record->xl_xid);
RmgrTable[record->xl_rmid].rm_redo(EndRecPtr, record);
@@ -7803,6 +7808,7 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
TransactionId *xids;
int nxids;
TransactionId oldestActiveXID;
+ TransactionId latestCompletedXid;
RunningTransactionsData running;
oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
@@ -7817,6 +7823,9 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
running.subxid_overflow = false;
running.nextXid = checkPoint.nextXid;
running.oldestRunningXid = oldestActiveXID;
+ latestCompletedXid = checkPoint.nextXid;
+ TransactionIdRetreat(latestCompletedXid);
+ running.latestCompletedXid = latestCompletedXid;
running.xids = xids;
ProcArrayApplyRecoveryInfo(&running);
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c
index 4fc1fc430be..11b28098338 100644
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -37,7 +37,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/storage/ipc/procarray.c,v 1.68 2010/04/29 21:36:19 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/storage/ipc/procarray.c,v 1.69 2010/05/13 11:15:38 sriggs Exp $
*
*-------------------------------------------------------------------------
*/
@@ -105,12 +105,6 @@ static TransactionId latestObservedXid = InvalidTransactionId;
*/
static TransactionId standbySnapshotPendingXmin;
-/*
- * Oldest transaction still running according to the running-xacts snapshot
- * we initialized standby mode from.
- */
-static TransactionId snapshotOldestActiveXid;
-
#ifdef XIDCACHE_DEBUG
/* counters for XidCache measurement */
@@ -158,7 +152,7 @@ static void KnownAssignedXidsRemove(TransactionId xid);
static void KnownAssignedXidsRemoveTree(TransactionId xid, int nsubxids,
TransactionId *subxids);
static void KnownAssignedXidsRemovePreceding(TransactionId xid);
-static int KnownAssignedXidsGet(TransactionId *xarray, TransactionId xmax);
+static int KnownAssignedXidsGet(TransactionId *xarray, TransactionId xmax);
static int KnownAssignedXidsGetAndSetXmin(TransactionId *xarray,
TransactionId *xmin,
TransactionId xmax);
@@ -439,10 +433,17 @@ ProcArrayClearTransaction(PGPROC *proc)
proc->subxids.overflowed = false;
}
+/*
+ * ProcArrayInitRecoveryInfo
+ *
+ * When trying to assemble our snapshot we only care about xids after this value.
+ * See comments for LogStandbySnapshot().
+ */
void
ProcArrayInitRecoveryInfo(TransactionId oldestActiveXid)
{
- snapshotOldestActiveXid = oldestActiveXid;
+ latestObservedXid = oldestActiveXid;
+ TransactionIdRetreat(latestObservedXid);
}
/*
@@ -458,16 +459,15 @@ ProcArrayInitRecoveryInfo(TransactionId oldestActiveXid)
* with FATAL errors fail to write abort records, which could cause eventual
* overflow.
*
- * Only used during recovery. Notice the signature is very similar to a
- * _redo function and its difficult to decide exactly where this code should
- * reside.
+ * See comments for LogStandbySnapshot().
*/
void
ProcArrayApplyRecoveryInfo(RunningTransactions running)
{
- int xid_index; /* main loop */
TransactionId *xids;
- int nxids;
+ int nxids;
+ TransactionId nextXid;
+ int i;
Assert(standbyState >= STANDBY_INITIALIZED);
@@ -505,41 +505,40 @@ ProcArrayApplyRecoveryInfo(RunningTransactions running)
elog(trace_recovery(DEBUG2),
"recovery snapshots are now enabled");
}
+ else
+ elog(trace_recovery(DEBUG2),
+ "recovery snapshot waiting for %u oldest active xid on standby is %u",
+ standbySnapshotPendingXmin,
+ running->oldestRunningXid);
return;
}
+ Assert(standbyState == STANDBY_INITIALIZED);
+
/*
* OK, we need to initialise from the RunningXactData record
*/
- latestObservedXid = running->nextXid;
- TransactionIdRetreat(latestObservedXid);
/*
- * If the snapshot overflowed, then we still initialise with what we know,
- * but the recovery snapshot isn't fully valid yet because we know there
- * are some subxids missing (ergo we don't know which ones)
+ * Remove all xids except xids later than the snapshot. We don't know
+ * exactly which ones that is until precisely now, so that is why we
+ * allow xids to be added only to remove most of them again here.
*/
- if (!running->subxid_overflow)
- {
- standbyState = STANDBY_SNAPSHOT_READY;
- standbySnapshotPendingXmin = InvalidTransactionId;
- }
- else
- {
- standbyState = STANDBY_SNAPSHOT_PENDING;
- standbySnapshotPendingXmin = latestObservedXid;
- ereport(LOG,
- (errmsg("consistent state delayed because recovery snapshot incomplete")));
- }
+ ExpireOldKnownAssignedTransactionIds(running->nextXid);
+ StandbyReleaseOldLocks(running->nextXid);
- nxids = running->xcnt;
- xids = running->xids;
-
- KnownAssignedXidsDisplay(trace_recovery(DEBUG3));
+ /*
+ * Nobody else is running yet, but take locks anyhow
+ */
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
/*
- * Scan through the incoming array of RunningXacts and collect xids. We
- * don't use SubtransSetParent because it doesn't matter yet. If we aren't
+ * Combine the running xact data with already known xids, if any exist.
+ * KnownAssignedXids is sorted so we cannot just add new xids, we have
+ * to combine them first, sort them and then re-add to KnownAssignedXids.
+ *
+ * Some of the new xids are top-level xids and some are subtransactions. We
+ * don't call SubtransSetParent because it doesn't matter yet. If we aren't
* overflowed then all xids will fit in snapshot and so we don't need
* subtrans. If we later overflow, an xid assignment record will add xids
* to subtrans. If RunningXacts is overflowed then we don't have enough
@@ -547,59 +546,148 @@ ProcArrayApplyRecoveryInfo(RunningTransactions running)
*/
/*
- * Nobody else is running yet, but take locks anyhow
+ * Allocate a temporary array so we can combine xids. The total
+ * of both arrays should never normally exceed TOTAL_MAX_CACHED_SUBXIDS.
*/
- LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+ xids = palloc(sizeof(TransactionId) * TOTAL_MAX_CACHED_SUBXIDS);
+
+ /*
+ * Get the remaining KnownAssignedXids. In most cases there won't
+ * be any at all since this exists only to catch a theoretical
+ * race condition.
+ */
+ nxids = KnownAssignedXidsGet(xids, InvalidTransactionId);
+ if (nxids > 0)
+ KnownAssignedXidsDisplay(trace_recovery(DEBUG3));
- /* Reset latestCompletedXid */
- ShmemVariableCache->latestCompletedXid = running->nextXid;
- TransactionIdRetreat(ShmemVariableCache->latestCompletedXid);
+ /*
+ * Now that we have a copy of any KnownAssignedXids, we can zero the
+ * array before re-inserting the combined snapshot.
+ */
+ KnownAssignedXidsRemovePreceding(InvalidTransactionId);
/*
- * Add our new xids into the array
+ * Add to the temp array any xids which have not already completed,
+ * taking care not to overflow in extreme cases.
*/
- for (xid_index = 0; xid_index < running->xcnt; xid_index++)
+ for (i = 0; i < running->xcnt; i++)
{
- TransactionId xid = running->xids[xid_index];
+ TransactionId xid = running->xids[i];
/*
- * The running-xacts snapshot can contain xids that did finish between
- * when the snapshot was taken and when it was written to WAL. Such
- * transactions are not running anymore, so ignore them.
+ * The running-xacts snapshot can contain xids that were running at
+ * the time of the snapshot, yet completed before the snapshot was
+ * written to WAL. They're no longer running, so ignore them.
*/
if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid))
continue;
- KnownAssignedXidsAdd(xid, xid, true);
+ xids[nxids++] = xid;
+
+ /*
+ * Test for overflow only after we have filtered out already complete
+ * transactions.
+ */
+ if (nxids > TOTAL_MAX_CACHED_SUBXIDS)
+ elog(ERROR, "too many xids to add into KnownAssignedXids");
}
- KnownAssignedXidsDisplay(trace_recovery(DEBUG3));
+ if (nxids > 0)
+ {
+ /*
+ * Sort the array so that we can add them safely into KnownAssignedXids.
+ */
+ qsort(xids, nxids, sizeof(TransactionId), xidComparator);
+
+ /*
+ * Re-initialise latestObservedXid to the highest xid we've seen.
+ */
+ latestObservedXid = xids[nxids - 1];
+
+ /*
+ * Add the sorted snapshot into KnownAssignedXids
+ */
+ for (i = 0; i < nxids; i++)
+ {
+ TransactionId xid = xids[i];
+
+ KnownAssignedXidsAdd(xid, xid, true);
+ }
+
+ KnownAssignedXidsDisplay(trace_recovery(DEBUG3));
+ }
+
+ pfree(xids);
/*
- * Update lastOverflowedXid if the snapshot has any missing subxids.
+ * Now we've got the running xids we need to set the global values
+ * that are used to track snapshots as they evolve further
+ *
+ * * latestCompletedXid which will be the xmax for snapshots
+ * * lastOverflowedXid which shows whether snapshots overflow
+ * * nextXid
+ *
+ * If the snapshot overflowed, then we still initialise with what we know,
+ * but the recovery snapshot isn't fully valid yet because we know there
+ * are some subxids missing.
* We don't know the specific subxids that are missing, so conservatively
* assume the last one is latestObservedXid. If no missing subxids,
* try to clear lastOverflowedXid.
+ *
+ * If the snapshot didn't overflow it's still possible that an overflow
+ * occurred in the gap between taking snapshot and logging record, so
+ * we also need to check if lastOverflowedXid is already ahead of us.
*/
if (running->subxid_overflow)
{
+ standbyState = STANDBY_SNAPSHOT_PENDING;
+
+ standbySnapshotPendingXmin = latestObservedXid;
if (TransactionIdFollows(latestObservedXid,
procArray->lastOverflowedXid))
procArray->lastOverflowedXid = latestObservedXid;
}
- else if (TransactionIdFollows(running->oldestRunningXid,
+ else if (TransactionIdFollows(procArray->lastOverflowedXid,
+ latestObservedXid))
+ {
+ standbyState = STANDBY_SNAPSHOT_PENDING;
+
+ standbySnapshotPendingXmin = procArray->lastOverflowedXid;
+ }
+ else
+ {
+ standbyState = STANDBY_SNAPSHOT_READY;
+
+ standbySnapshotPendingXmin = InvalidTransactionId;
+ if (TransactionIdFollows(running->oldestRunningXid,
procArray->lastOverflowedXid))
- procArray->lastOverflowedXid = InvalidTransactionId;
+ procArray->lastOverflowedXid = InvalidTransactionId;
+ }
+
+ /*
+ * If a transaction completed in the gap between taking and logging the
+ * snapshot then latestCompletedXid may already be higher than the value
+ * from the snapshot, so check before we use the incoming value.
+ */
+ if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid,
+ running->latestCompletedXid))
+ ShmemVariableCache->latestCompletedXid = running->latestCompletedXid;
/* nextXid must be beyond any observed xid */
- if (TransactionIdFollows(running->nextXid, ShmemVariableCache->nextXid))
- ShmemVariableCache->nextXid = running->nextXid;
+ nextXid = latestObservedXid;
+ TransactionIdAdvance(nextXid);
+ if (TransactionIdFollows(nextXid, ShmemVariableCache->nextXid))
+ ShmemVariableCache->nextXid = nextXid;
LWLockRelease(ProcArrayLock);
elog(trace_recovery(DEBUG2), "running transaction data initialized");
+ KnownAssignedXidsDisplay(trace_recovery(DEBUG3));
if (standbyState == STANDBY_SNAPSHOT_READY)
elog(trace_recovery(DEBUG2), "recovery snapshots are now enabled");
+ else
+ ereport(LOG,
+ (errmsg("consistent state delayed because recovery snapshot incomplete")));
}
/*
@@ -613,8 +701,7 @@ ProcArrayApplyXidAssignment(TransactionId topxid,
TransactionId max_xid;
int i;
- if (standbyState < STANDBY_SNAPSHOT_PENDING)
- return;
+ Assert(standbyState >= STANDBY_INITIALIZED);
max_xid = TransactionIdLatest(topxid, nsubxids, subxids);
@@ -1410,6 +1497,7 @@ GetRunningTransactionData(void)
CurrentRunningXacts->subxid_overflow = suboverflowed;
CurrentRunningXacts->nextXid = ShmemVariableCache->nextXid;
CurrentRunningXacts->oldestRunningXid = oldestRunningXid;
+ CurrentRunningXacts->latestCompletedXid = latestCompletedXid;
LWLockRelease(XidGenLock);
LWLockRelease(ProcArrayLock);
@@ -2219,35 +2307,16 @@ DisplayXidCache(void)
*
* RecordKnownAssignedTransactionIds() should be run for *every* WAL record
* type apart from XLOG_XACT_RUNNING_XACTS (since that initialises the first
- * snapshot so that RecordKnownAssignedTransactionIds() can be called).
+ * snapshot so that RecordKnownAssignedTransactionIds() can be called). Must
+ * be called for each record after we have executed StartupCLog() et al,
+ * since we must ExtendCLOG() etc.
*
- * Must only be called in Startup process.
+ * Called during recovery in analogy with and in place of GetNewTransactionId()
*/
void
RecordKnownAssignedTransactionIds(TransactionId xid)
{
- /*
- * Skip processing if the current snapshot is not initialized.
- */
- if (standbyState < STANDBY_SNAPSHOT_PENDING)
- return;
-
- /*
- * We can see WAL records before the running-xacts snapshot that contain
- * XIDs that are not in the running-xacts snapshot, but that we know to
- * have finished before the running-xacts snapshot was taken. Don't waste
- * precious shared memory by keeping them in the hash table.
- *
- * We can also see WAL records before the running-xacts snapshot that
- * contain XIDs that are not in the running-xacts snapshot for a different
- * reason: the transaction started *after* the running-xacts snapshot was
- * taken, but before it was written to WAL. We must be careful to not
- * ignore such XIDs. Because such a transaction started after the
- * running-xacts snapshot was taken, it must have an XID larger than the
- * oldest XID according to the running-xacts snapshot.
- */
- if (TransactionIdPrecedes(xid, snapshotOldestActiveXid))
- return;
+ Assert(standbyState >= STANDBY_INITIALIZED);
elog(trace_recovery(DEBUG4), "record known xact %u latestObservedXid %u",
xid, latestObservedXid);
@@ -2287,31 +2356,25 @@ RecordKnownAssignedTransactionIds(TransactionId xid)
* Now we can advance latestObservedXid
*/
latestObservedXid = xid;
- }
- /* nextXid must be beyond any observed xid */
- if (TransactionIdFollowsOrEquals(latestObservedXid,
- ShmemVariableCache->nextXid))
- {
- ShmemVariableCache->nextXid = latestObservedXid;
- TransactionIdAdvance(ShmemVariableCache->nextXid);
+ /* ShmemVariableCache->nextXid must be beyond any observed xid */
+ next_expected_xid = latestObservedXid;
+ TransactionIdAdvance(next_expected_xid);
+ ShmemVariableCache->nextXid = next_expected_xid;
}
}
/*
* ExpireTreeKnownAssignedTransactionIds
* Remove the given XIDs from KnownAssignedXids.
+ *
+ * Called during recovery in analogy with and in place of ProcArrayEndTransaction()
*/
void
ExpireTreeKnownAssignedTransactionIds(TransactionId xid, int nsubxids,
- TransactionId *subxids)
+ TransactionId *subxids, TransactionId max_xid)
{
- TransactionId max_xid;
-
- if (standbyState == STANDBY_DISABLED)
- return; /* nothing to do */
-
- max_xid = TransactionIdLatest(xid, nsubxids, subxids);
+ Assert(standbyState >= STANDBY_INITIALIZED);
/*
* Uses same locking as transaction commit
@@ -2882,8 +2945,6 @@ KnownAssignedXidsGetAndSetXmin(TransactionId *xarray, TransactionId *xmin,
int head, tail;
int i;
- Assert(TransactionIdIsValid(xmax));
-
/*
* Fetch head just once, since it may change while we loop.
* We can stop once we reach the initially seen head, since
@@ -2894,8 +2955,8 @@ KnownAssignedXidsGetAndSetXmin(TransactionId *xarray, TransactionId *xmin,
* Must take spinlock to ensure we see up-to-date array contents.
*/
SpinLockAcquire(&pArray->known_assigned_xids_lck);
- head = pArray->tailKnownAssignedXids;
- tail = pArray->headKnownAssignedXids;
+ tail = pArray->tailKnownAssignedXids;
+ head = pArray->headKnownAssignedXids;
SpinLockRelease(&pArray->known_assigned_xids_lck);
for (i = tail; i < head; i++)
@@ -2917,7 +2978,8 @@ KnownAssignedXidsGetAndSetXmin(TransactionId *xarray, TransactionId *xmin,
* Filter out anything >= xmax, again relying on sorted property
* of array.
*/
- if (TransactionIdPrecedesOrEquals(xmax, knownXid))
+ if (TransactionIdIsValid(xmax) &&
+ TransactionIdFollowsOrEquals(knownXid, xmax))
break;
/* Add knownXid into output array */
diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c
index 6bd156845fb..a313ee50f1c 100644
--- a/src/backend/storage/ipc/standby.c
+++ b/src/backend/storage/ipc/standby.c
@@ -11,7 +11,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/storage/ipc/standby.c,v 1.21 2010/05/02 02:10:33 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/storage/ipc/standby.c,v 1.22 2010/05/13 11:15:38 sriggs Exp $
*
*-------------------------------------------------------------------------
*/
@@ -776,6 +776,51 @@ standby_desc(StringInfo buf, uint8 xl_info, char *rec)
/*
* Log details of the current snapshot to WAL. This allows the snapshot state
* to be reconstructed on the standby.
+ *
+ * We can move directly to STANDBY_SNAPSHOT_READY at startup if we
+ * start from a shutdown checkpoint because we know nothing was running
+ * at that time and our recovery snapshot is known empty. In the more
+ * typical case of an online checkpoint we need to jump through a few
+ * hoops to get a correct recovery snapshot and this requires a two or
+ * sometimes a three stage process.
+ *
+ * The initial snapshot must contain all running xids and all current
+ * AccessExclusiveLocks at a point in time on the standby. Assembling
+ * that information while the server is running requires many and
+ * various LWLocks, so we choose to derive that information piece by
+ * piece and then re-assemble that info on the standby. When that
+ * information is fully assembled we move to STANDBY_SNAPSHOT_READY.
+ *
+ * Since locking on the primary when we derive the information is not
+ * strict, we note that there is a time window between the derivation and
+ * writing to WAL of the derived information. That allows race conditions
+ * that we must resolve, since xids and locks may enter or leave the
+ * snapshot during that window. This creates the issue that an xid or
+ * lock may start *after* the snapshot has been derived yet *before* the
+ * snapshot is logged in the running xacts WAL record. We resolve this by
+ * starting to accumulate changes at a point just prior to when we derive
+ * the snapshot on the primary, then ignore duplicates when we later apply
+ * the snapshot from the running xacts record. This is implemented during
+ * CreateCheckpoint() where we use the logical checkpoint location as
+ * our starting point and then write the running xacts record immediately
+ * before writing the main checkpoint WAL record. Since we always start
+ * up from a checkpoint and are immediately at our starting point, we
+ * unconditionally move to STANDBY_INITIALIZED. After this point we
+ * must do 4 things:
+ * * move shared nextXid forwards as we see new xids
+ * * extend the clog and subtrans with each new xid
+ * * keep track of uncommitted known assigned xids
+ * * keep track of uncommitted AccessExclusiveLocks
+ *
+ * When we see a commit/abort we must remove known assigned xids and locks
+ * from the completing transaction. Attempted removals that cannot locate
+ * an entry are expected and must not cause an error when we are in state
+ * STANDBY_INITIALIZED. This is implemented in StandbyReleaseLocks() and
+ * KnownAssignedXidsRemove().
+ *
+ * Later, when we apply the running xact data we must be careful to ignore
+ * transactions already committed, since those commits raced ahead when
+ * making WAL entries.
*/
void
LogStandbySnapshot(TransactionId *oldestActiveXid, TransactionId *nextXid)
@@ -788,6 +833,12 @@ LogStandbySnapshot(TransactionId *oldestActiveXid, TransactionId *nextXid)
/*
* Get details of any AccessExclusiveLocks being held at the moment.
+ *
+ * XXX GetRunningTransactionLocks() currently holds a lock on all partitions
+ * though it is possible to further optimise the locking. By reference
+ * counting locks and storing the value on the ProcArray entry for each backend
+ * we can easily tell if any locks need recording without trying to acquire
+ * the partition locks and scanning the lock table.
*/
locks = GetRunningTransactionLocks(&nlocks);
if (nlocks > 0)
@@ -798,6 +849,11 @@ LogStandbySnapshot(TransactionId *oldestActiveXid, TransactionId *nextXid)
* record we write, because standby will open up when it sees this.
*/
running = GetRunningTransactionData();
+ /*
+ * The gap between GetRunningTransactionData() and LogCurrentRunningXacts()
+ * is what most of the fuss is about here, so artificially extending this
+ * interval is a great way to test the little-used parts of the code.
+ */
LogCurrentRunningXacts(running);
*oldestActiveXid = running->oldestRunningXid;
diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h
index ed68be6f271..e92809d58d0 100644
--- a/src/include/storage/procarray.h
+++ b/src/include/storage/procarray.h
@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/include/storage/procarray.h,v 1.31 2010/01/23 16:37:12 sriggs Exp $
+ * $PostgreSQL: pgsql/src/include/storage/procarray.h,v 1.32 2010/05/13 11:15:38 sriggs Exp $
*
*-------------------------------------------------------------------------
*/
@@ -35,7 +35,8 @@ extern void ProcArrayApplyXidAssignment(TransactionId topxid,
extern void RecordKnownAssignedTransactionIds(TransactionId xid);
extern void ExpireTreeKnownAssignedTransactionIds(TransactionId xid,
- int nsubxids, TransactionId *subxids);
+ int nsubxids, TransactionId *subxids,
+ TransactionId max_xid);
extern void ExpireAllKnownAssignedTransactionIds(void);
extern void ExpireOldKnownAssignedTransactionIds(TransactionId xid);
diff --git a/src/include/storage/standby.h b/src/include/storage/standby.h
index fd2dfacd351..9159301a168 100644
--- a/src/include/storage/standby.h
+++ b/src/include/storage/standby.h
@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/include/storage/standby.h,v 1.9 2010/02/26 02:01:28 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/storage/standby.h,v 1.10 2010/05/13 11:15:38 sriggs Exp $
*
*-------------------------------------------------------------------------
*/
@@ -68,6 +68,7 @@ typedef struct xl_running_xacts
bool subxid_overflow; /* snapshot overflowed, subxids missing */
TransactionId nextXid; /* copy of ShmemVariableCache->nextXid */
TransactionId oldestRunningXid; /* *not* oldestXmin */
+ TransactionId latestCompletedXid; /* so we can set xmax */
TransactionId xids[1]; /* VARIABLE LENGTH ARRAY */
} xl_running_xacts;
@@ -97,6 +98,7 @@ typedef struct RunningTransactionsData
bool subxid_overflow; /* snapshot overflowed, subxids missing */
TransactionId nextXid; /* copy of ShmemVariableCache->nextXid */
TransactionId oldestRunningXid; /* *not* oldestXmin */
+ TransactionId latestCompletedXid; /* so we can set xmax */
TransactionId *xids; /* array of (sub)xids still running */
} RunningTransactionsData;