aboutsummaryrefslogtreecommitdiff
path: root/src/backend/utils
diff options
context:
space:
mode:
author: Simon Riggs <simon@2ndQuadrant.com> 2009-12-19 01:32:45 +0000
committer: Simon Riggs <simon@2ndQuadrant.com> 2009-12-19 01:32:45 +0000
commit: efc16ea520679d713d98a2c7bf1453c4ff7b91ec (patch)
tree: 6a39d2af0704a36281dc7df3ec10823eb3e6de75 /src/backend/utils
parent: 78a09145e0f8322e625bbc7d69fcb865ce4f3034 (diff)
download: postgresql-efc16ea520679d713d98a2c7bf1453c4ff7b91ec.tar.gz
postgresql-efc16ea520679d713d98a2c7bf1453c4ff7b91ec.zip
Allow read only connections during recovery, known as Hot Standby.
Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
Diffstat (limited to 'src/backend/utils')
-rw-r--r-- src/backend/utils/adt/txid.c 12
-rw-r--r-- src/backend/utils/adt/xid.c 21
-rw-r--r-- src/backend/utils/cache/inval.c 164
-rw-r--r-- src/backend/utils/error/elog.c 20
-rw-r--r-- src/backend/utils/init/postinit.c 10
-rw-r--r-- src/backend/utils/misc/guc.c 64
-rw-r--r-- src/backend/utils/misc/postgresql.conf.sample 3
-rw-r--r-- src/backend/utils/time/snapmgr.c 12
-rw-r--r-- src/backend/utils/time/tqual.c 90
9 files changed, 290 insertions, 106 deletions
diff --git a/src/backend/utils/adt/txid.c b/src/backend/utils/adt/txid.c
index a4a5b866768..fe9f7c5d396 100644
--- a/src/backend/utils/adt/txid.c
+++ b/src/backend/utils/adt/txid.c
@@ -14,7 +14,7 @@
* Author: Jan Wieck, Afilias USA INC.
* 64-bit txids: Marko Kreen, Skype Technologies
*
- * $PostgreSQL: pgsql/src/backend/utils/adt/txid.c,v 1.8 2009/01/01 17:23:50 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/utils/adt/txid.c,v 1.9 2009/12/19 01:32:36 sriggs Exp $
*
*-------------------------------------------------------------------------
*/
@@ -24,6 +24,7 @@
#include "access/transam.h"
#include "access/xact.h"
#include "funcapi.h"
+#include "miscadmin.h"
#include "libpq/pqformat.h"
#include "utils/builtins.h"
#include "utils/snapmgr.h"
@@ -338,6 +339,15 @@ txid_current(PG_FUNCTION_ARGS)
txid val;
TxidEpoch state;
+ /*
+ * Must prevent during recovery because if an xid is
+ * not assigned we try to assign one, which would fail.
+ * Programs already rely on this function to always
+ * return a valid current xid, so we should not change
+ * this to return NULL or similar invalid xid.
+ */
+ PreventCommandDuringRecovery();
+
load_xid_epoch(&state);
val = convert_xid(GetTopTransactionId(), &state);
diff --git a/src/backend/utils/adt/xid.c b/src/backend/utils/adt/xid.c
index 0fbd394bbfd..2cb197fbff8 100644
--- a/src/backend/utils/adt/xid.c
+++ b/src/backend/utils/adt/xid.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/utils/adt/xid.c,v 1.12 2009/01/01 17:23:50 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/utils/adt/xid.c,v 1.13 2009/12/19 01:32:36 sriggs Exp $
*
*-------------------------------------------------------------------------
*/
@@ -102,6 +102,25 @@ xid_age(PG_FUNCTION_ARGS)
PG_RETURN_INT32((int32) (now - xid));
}
+/*
+ * xidComparator
+ * qsort comparison function for XIDs
+ *
+ * We can't use wraparound comparison for XIDs because that does not respect
+ * the triangle inequality! Any old sort order will do.
+ *
+ * arg1 and arg2 each point to a TransactionId. The two values are compared
+ * with the plain > and < operators; the result is 1 if the first is
+ * greater, -1 if it is smaller, and 0 if they are equal.
+ */
+int
+xidComparator(const void *arg1, const void *arg2)
+{
+ TransactionId xid1 = *(const TransactionId *) arg1;
+ TransactionId xid2 = *(const TransactionId *) arg2;
+
+ if (xid1 > xid2)
+ return 1;
+ if (xid1 < xid2)
+ return -1;
+ return 0;
+}
/*****************************************************************************
* COMMAND IDENTIFIER ROUTINES *
diff --git a/src/backend/utils/cache/inval.c b/src/backend/utils/cache/inval.c
index 5fac924207d..5e59d0ab8e6 100644
--- a/src/backend/utils/cache/inval.c
+++ b/src/backend/utils/cache/inval.c
@@ -80,7 +80,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/utils/cache/inval.c,v 1.89 2009/06/11 14:49:05 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/utils/cache/inval.c,v 1.90 2009/12/19 01:32:36 sriggs Exp $
*
*-------------------------------------------------------------------------
*/
@@ -155,6 +155,11 @@ typedef struct TransInvalidationInfo
static TransInvalidationInfo *transInvalInfo = NULL;
+static SharedInvalidationMessage *SharedInvalidMessagesArray;
+static int numSharedInvalidMessagesArray;
+static int maxSharedInvalidMessagesArray;
+
+
/*
* Dynamically-registered callback functions. Current implementation
* assumes there won't be very many of these at once; could improve if needed.
@@ -180,14 +185,6 @@ static struct RELCACHECALLBACK
static int relcache_callback_count = 0;
-/* info values for 2PC callback */
-#define TWOPHASE_INFO_MSG 0 /* SharedInvalidationMessage */
-#define TWOPHASE_INFO_FILE_BEFORE 1 /* relcache file inval */
-#define TWOPHASE_INFO_FILE_AFTER 2 /* relcache file inval */
-
-static void PersistInvalidationMessage(SharedInvalidationMessage *msg);
-
-
/* ----------------------------------------------------------------
* Invalidation list support functions
*
@@ -741,38 +738,8 @@ AtStart_Inval(void)
MemoryContextAllocZero(TopTransactionContext,
sizeof(TransInvalidationInfo));
transInvalInfo->my_level = GetCurrentTransactionNestLevel();
-}
-
-/*
- * AtPrepare_Inval
- * Save the inval lists state at 2PC transaction prepare.
- *
- * In this phase we just generate 2PC records for all the pending invalidation
- * work.
- */
-void
-AtPrepare_Inval(void)
-{
- /* Must be at top of stack */
- Assert(transInvalInfo != NULL && transInvalInfo->parent == NULL);
-
- /*
- * Relcache init file invalidation requires processing both before and
- * after we send the SI messages.
- */
- if (transInvalInfo->RelcacheInitFileInval)
- RegisterTwoPhaseRecord(TWOPHASE_RM_INVAL_ID, TWOPHASE_INFO_FILE_BEFORE,
- NULL, 0);
-
- AppendInvalidationMessages(&transInvalInfo->PriorCmdInvalidMsgs,
- &transInvalInfo->CurrentCmdInvalidMsgs);
-
- ProcessInvalidationMessages(&transInvalInfo->PriorCmdInvalidMsgs,
- PersistInvalidationMessage);
-
- if (transInvalInfo->RelcacheInitFileInval)
- RegisterTwoPhaseRecord(TWOPHASE_RM_INVAL_ID, TWOPHASE_INFO_FILE_AFTER,
- NULL, 0);
+ SharedInvalidMessagesArray = NULL;
+ numSharedInvalidMessagesArray = 0;
}
/*
@@ -812,45 +779,97 @@ AtSubStart_Inval(void)
}
/*
- * PersistInvalidationMessage
- * Write an invalidation message to the 2PC state file.
+ * Collect invalidation messages into SharedInvalidMessagesArray array.
+ *
+ * Appends the n messages at msgs to the array, allocating it on first use
+ * (while SharedInvalidMessagesArray is NULL) and doubling its capacity until
+ * the new messages fit.
*/
static void
-PersistInvalidationMessage(SharedInvalidationMessage *msg)
+MakeSharedInvalidMessagesArray(const SharedInvalidationMessage *msgs, int n)
{
- RegisterTwoPhaseRecord(TWOPHASE_RM_INVAL_ID, TWOPHASE_INFO_MSG,
- msg, sizeof(SharedInvalidationMessage));
+ /*
+ * Initialise array first time through in each commit
+ */
+ if (SharedInvalidMessagesArray == NULL)
+ {
+ maxSharedInvalidMessagesArray = FIRSTCHUNKSIZE;
+ numSharedInvalidMessagesArray = 0;
+
+ /*
+ * Although this is being palloc'd we don't actually free it directly.
+ * We're so close to EOXact that we know we're going to lose it anyhow.
+ */
+ SharedInvalidMessagesArray = palloc(maxSharedInvalidMessagesArray
+ * sizeof(SharedInvalidationMessage));
+ }
+
+ if ((numSharedInvalidMessagesArray + n) > maxSharedInvalidMessagesArray)
+ {
+ while ((numSharedInvalidMessagesArray + n) > maxSharedInvalidMessagesArray)
+ maxSharedInvalidMessagesArray *= 2;
+
+ SharedInvalidMessagesArray = repalloc(SharedInvalidMessagesArray,
+ maxSharedInvalidMessagesArray
+ * sizeof(SharedInvalidationMessage));
+ }
+
+ /*
+ * Append the next chunk onto the array
+ */
+ memcpy(SharedInvalidMessagesArray + numSharedInvalidMessagesArray,
+ msgs, n * sizeof(SharedInvalidationMessage));
+ numSharedInvalidMessagesArray += n;
}
/*
- * inval_twophase_postcommit
- * Process an invalidation message from the 2PC state file.
+ * xactGetCommittedInvalidationMessages() is executed by
+ * RecordTransactionCommit() to add invalidation messages onto the
+ * commit record. This applies only to commit message types, never to
+ * abort records. Must always run before AtEOXact_Inval(), since that
+ * removes the data we need to see.
+ *
+ * Remember that this runs before we have officially committed, so we
+ * must not do anything here to change what might occur *if* we should
+ * fail between here and the actual commit.
+ *
+ * Stores the collected array in *msgs and the transaction's
+ * RelcacheInitFileInval flag in *RelcacheInitFileInval, and returns the
+ * number of messages collected.
+ *
+ * see also xact_redo_commit() and xact_desc_commit()
*/
-void
-inval_twophase_postcommit(TransactionId xid, uint16 info,
- void *recdata, uint32 len)
+int
+xactGetCommittedInvalidationMessages(SharedInvalidationMessage **msgs,
+ bool *RelcacheInitFileInval)
{
- SharedInvalidationMessage *msg;
+ MemoryContext oldcontext;
- switch (info)
- {
- case TWOPHASE_INFO_MSG:
- msg = (SharedInvalidationMessage *) recdata;
- Assert(len == sizeof(SharedInvalidationMessage));
- SendSharedInvalidMessages(msg, 1);
- break;
- case TWOPHASE_INFO_FILE_BEFORE:
- RelationCacheInitFileInvalidate(true);
- break;
- case TWOPHASE_INFO_FILE_AFTER:
- RelationCacheInitFileInvalidate(false);
- break;
- default:
- Assert(false);
- break;
- }
-}
+ /* Must be at top of stack */
+ Assert(transInvalInfo != NULL && transInvalInfo->parent == NULL);
+
+ /*
+ * Relcache init file invalidation requires processing both before and
+ * after we send the SI messages. However, we need not do anything
+ * unless we committed.
+ */
+ *RelcacheInitFileInval = transInvalInfo->RelcacheInitFileInval;
+ /*
+ * Walk through TransInvalidationInfo to collect all the messages
+ * into a single contiguous array of invalidation messages. It must
+ * be contiguous so we can copy directly into WAL message. Maintain the
+ * order that they would be processed in by AtEOXact_Inval(), to ensure
+ * emulated behaviour in redo is as similar as possible to original.
+ * We want the same bugs, if any, not new ones.
+ */
+ oldcontext = MemoryContextSwitchTo(CurTransactionContext);
+
+ ProcessInvalidationMessagesMulti(&transInvalInfo->CurrentCmdInvalidMsgs,
+ MakeSharedInvalidMessagesArray);
+ ProcessInvalidationMessagesMulti(&transInvalInfo->PriorCmdInvalidMsgs,
+ MakeSharedInvalidMessagesArray);
+ MemoryContextSwitchTo(oldcontext);
+
+ Assert(!(numSharedInvalidMessagesArray > 0 &&
+ SharedInvalidMessagesArray == NULL));
+
+ *msgs = SharedInvalidMessagesArray;
+
+ return numSharedInvalidMessagesArray;
+}
/*
* AtEOXact_Inval
@@ -1028,6 +1047,8 @@ CommandEndInvalidationMessages(void)
* no need to worry about cleaning up if there's an elog(ERROR) before
* reaching EndNonTransactionalInvalidation (the invals will just be thrown
* away if that happens).
+ *
+ * Note that these are not replayed in standby mode.
*/
void
BeginNonTransactionalInvalidation(void)
@@ -1041,6 +1062,9 @@ BeginNonTransactionalInvalidation(void)
Assert(transInvalInfo->CurrentCmdInvalidMsgs.cclist == NULL);
Assert(transInvalInfo->CurrentCmdInvalidMsgs.rclist == NULL);
Assert(transInvalInfo->RelcacheInitFileInval == false);
+
+ SharedInvalidMessagesArray = NULL;
+ numSharedInvalidMessagesArray = 0;
}
/*
diff --git a/src/backend/utils/error/elog.c b/src/backend/utils/error/elog.c
index 59fa07a379a..552b3392da7 100644
--- a/src/backend/utils/error/elog.c
+++ b/src/backend/utils/error/elog.c
@@ -42,7 +42,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/utils/error/elog.c,v 1.219 2009/11/28 23:38:07 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/utils/error/elog.c,v 1.220 2009/12/19 01:32:37 sriggs Exp $
*
*-------------------------------------------------------------------------
*/
@@ -2794,3 +2794,21 @@ is_log_level_output(int elevel, int log_min_level)
return false;
}
+
+/*
+ * If trace_recovery_messages is set to make this visible, then show as LOG,
+ * else display as whatever level is set. It may still be shown, but only
+ * if log_min_messages is set lower than trace_recovery_messages.
+ *
+ * Returns LOG when trace_level is below LOG and at or above
+ * trace_recovery_messages; otherwise returns trace_level unchanged.
+ *
+ * Intention is to keep this for at least the whole of the 8.5 production
+ * release, so we can more easily diagnose production problems in the field.
+ */
+int
+trace_recovery(int trace_level)
+{
+ if (trace_level < LOG &&
+ trace_level >= trace_recovery_messages)
+ return LOG;
+
+ return trace_level;
+}
diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c
index b6c93c7f8eb..120405cae54 100644
--- a/src/backend/utils/init/postinit.c
+++ b/src/backend/utils/init/postinit.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/utils/init/postinit.c,v 1.198 2009/10/07 22:14:23 alvherre Exp $
+ * $PostgreSQL: pgsql/src/backend/utils/init/postinit.c,v 1.199 2009/12/19 01:32:37 sriggs Exp $
*
*
*-------------------------------------------------------------------------
@@ -481,7 +481,7 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username,
*/
MyBackendId = InvalidBackendId;
- SharedInvalBackendInit();
+ SharedInvalBackendInit(false);
if (MyBackendId > MaxBackends || MyBackendId <= 0)
elog(FATAL, "bad backend id: %d", MyBackendId);
@@ -495,11 +495,11 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username,
InitBufferPoolBackend();
/*
- * Initialize local process's access to XLOG. In bootstrap case we may
- * skip this since StartupXLOG() was run instead.
+ * Initialize local process's access to XLOG, if appropriate. In bootstrap
+ * case we skip this since StartupXLOG() was run instead.
*/
if (!bootstrap)
- InitXLOGAccess();
+ (void) RecoveryInProgress();
/*
* Initialize the relation cache and the system catalog caches. Note that
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 900c3662786..0c9998614fa 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -10,7 +10,7 @@
* Written by Peter Eisentraut <peter_e@gmx.net>.
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/utils/misc/guc.c,v 1.527 2009/12/11 03:34:56 itagaki Exp $
+ * $PostgreSQL: pgsql/src/backend/utils/misc/guc.c,v 1.528 2009/12/19 01:32:37 sriggs Exp $
*
*--------------------------------------------------------------------
*/
@@ -114,6 +114,9 @@ extern char *default_tablespace;
extern char *temp_tablespaces;
extern bool synchronize_seqscans;
extern bool fullPageWrites;
+extern int vacuum_defer_cleanup_age;
+
+int trace_recovery_messages = LOG;
#ifdef TRACE_SORT
extern bool trace_sort;
@@ -1207,6 +1210,17 @@ static struct config_bool ConfigureNamesBool[] =
},
{
+ {"recovery_connections", PGC_POSTMASTER, WAL_SETTINGS,
+ gettext_noop("During recovery, allows connections and queries. "
+ " During normal running, causes additional info to be written"
+ " to WAL to enable hot standby mode on WAL standby nodes."),
+ NULL
+ },
+ &XLogRequestRecoveryConnections,
+ true, NULL, NULL
+ },
+
+ {
{"allow_system_table_mods", PGC_POSTMASTER, DEVELOPER_OPTIONS,
gettext_noop("Allows modifications of the structure of system tables."),
NULL,
@@ -1347,6 +1361,8 @@ static struct config_int ConfigureNamesInt[] =
* plus autovacuum_max_workers plus one (for the autovacuum launcher).
*
* Likewise we have to limit NBuffers to INT_MAX/2.
+ *
+ * See also CheckRequiredParameterValues() if this parameter changes
*/
{
{"max_connections", PGC_POSTMASTER, CONN_AUTH_SETTINGS,
@@ -1358,6 +1374,15 @@ static struct config_int ConfigureNamesInt[] =
},
{
+ {"max_standby_delay", PGC_SIGHUP, WAL_SETTINGS,
+ gettext_noop("Sets the maximum delay to avoid conflict processing on Hot Standby servers."),
+ NULL
+ },
+ &MaxStandbyDelay,
+ 30, -1, INT_MAX, NULL, NULL
+ },
+
+ {
{"superuser_reserved_connections", PGC_POSTMASTER, CONN_AUTH_SETTINGS,
gettext_noop("Sets the number of connection slots reserved for superusers."),
NULL
@@ -1514,6 +1539,9 @@ static struct config_int ConfigureNamesInt[] =
1000, 25, INT_MAX, NULL, NULL
},
+ /*
+ * See also CheckRequiredParameterValues() if this parameter changes
+ */
{
{"max_prepared_transactions", PGC_POSTMASTER, RESOURCES,
gettext_noop("Sets the maximum number of simultaneously prepared transactions."),
@@ -1573,6 +1601,18 @@ static struct config_int ConfigureNamesInt[] =
},
{
+ {"vacuum_defer_cleanup_age", PGC_USERSET, CLIENT_CONN_STATEMENT,
+ gettext_noop("Age by which VACUUM and HOT cleanup should be deferred, if any."),
+ NULL
+ },
+ &vacuum_defer_cleanup_age,
+ 0, 0, 1000000, NULL, NULL
+ },
+
+ /*
+ * See also CheckRequiredParameterValues() if this parameter changes
+ */
+ {
{"max_locks_per_transaction", PGC_POSTMASTER, LOCK_MANAGEMENT,
gettext_noop("Sets the maximum number of locks per transaction."),
gettext_noop("The shared lock table is sized on the assumption that "
@@ -2685,6 +2725,16 @@ static struct config_enum ConfigureNamesEnum[] =
},
{
+ {"trace_recovery_messages", PGC_SUSET, LOGGING_WHEN,
+ gettext_noop("Sets the message levels that are logged during recovery."),
+ gettext_noop("Each level includes all the levels that follow it. The later"
+ " the level, the fewer messages are sent.")
+ },
+ &trace_recovery_messages,
+ DEBUG1, server_message_level_options, NULL, NULL
+ },
+
+ {
{"track_functions", PGC_SUSET, STATS_COLLECTOR,
gettext_noop("Collects function-level statistics on database activity."),
NULL
@@ -7511,6 +7561,18 @@ assign_transaction_read_only(bool newval, bool doit, GucSource source)
if (source != PGC_S_OVERRIDE)
return false;
}
+
+ /* Can't go to r/w mode while recovery is still active */
+ if (newval == false && XactReadOnly && RecoveryInProgress())
+ {
+ ereport(GUC_complaint_elevel(source),
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("cannot set transaction read-write mode during recovery")));
+ /* source == PGC_S_OVERRIDE means do it anyway, eg at xact abort */
+ if (source != PGC_S_OVERRIDE)
+ return false;
+ }
+
return true;
}
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index d2da9b9c3d2..c4ddeaf2bca 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -181,6 +181,9 @@
#archive_timeout = 0 # force a logfile segment switch after this
# number of seconds; 0 disables
+#recovery_connections = on # allows connections during recovery
+#max_standby_delay = 30 # max acceptable standby lag (s) to help queries
+ # complete without conflict; -1 disables
#------------------------------------------------------------------------------
# QUERY TUNING
diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c
index 41ff3405a39..cf3479e0640 100644
--- a/src/backend/utils/time/snapmgr.c
+++ b/src/backend/utils/time/snapmgr.c
@@ -19,7 +19,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/utils/time/snapmgr.c,v 1.12 2009/10/07 16:27:18 alvherre Exp $
+ * $PostgreSQL: pgsql/src/backend/utils/time/snapmgr.c,v 1.13 2009/12/19 01:32:37 sriggs Exp $
*
*-------------------------------------------------------------------------
*/
@@ -224,8 +224,14 @@ CopySnapshot(Snapshot snapshot)
else
newsnap->xip = NULL;
- /* setup subXID array */
- if (snapshot->subxcnt > 0)
+ /*
+ * Setup subXID array. Don't bother to copy it if it had overflowed,
+ * though, because it's not used anywhere in that case. Except if it's
+ * a snapshot taken during recovery; all the top-level XIDs are in subxip
+ * as well in that case, so we mustn't lose them.
+ */
+ if (snapshot->subxcnt > 0 &&
+ (!snapshot->suboverflowed || snapshot->takenDuringRecovery))
{
newsnap->subxip = (TransactionId *) ((char *) newsnap + subxipoff);
memcpy(newsnap->subxip, snapshot->subxip,
diff --git a/src/backend/utils/time/tqual.c b/src/backend/utils/time/tqual.c
index 6d8f86acc96..32eeabb9994 100644
--- a/src/backend/utils/time/tqual.c
+++ b/src/backend/utils/time/tqual.c
@@ -50,7 +50,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/utils/time/tqual.c,v 1.113 2009/06/11 14:49:06 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/utils/time/tqual.c,v 1.114 2009/12/19 01:32:37 sriggs Exp $
*
*-------------------------------------------------------------------------
*/
@@ -1257,42 +1257,84 @@ XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
return true;
/*
- * If the snapshot contains full subxact data, the fastest way to check
- * things is just to compare the given XID against both subxact XIDs and
- * top-level XIDs. If the snapshot overflowed, we have to use pg_subtrans
- * to convert a subxact XID to its parent XID, but then we need only look
- * at top-level XIDs not subxacts.
+ * Snapshot information is stored slightly differently in snapshots
+ * taken during recovery.
*/
- if (snapshot->subxcnt >= 0)
+ if (!snapshot->takenDuringRecovery)
{
- /* full data, so search subxip */
- int32 j;
+ /*
+ * If the snapshot contains full subxact data, the fastest way to check
+ * things is just to compare the given XID against both subxact XIDs and
+ * top-level XIDs. If the snapshot overflowed, we have to use pg_subtrans
+ * to convert a subxact XID to its parent XID, but then we need only look
+ * at top-level XIDs not subxacts.
+ */
+ if (!snapshot->suboverflowed)
+ {
+ /* full data, so search subxip */
+ int32 j;
- for (j = 0; j < snapshot->subxcnt; j++)
+ for (j = 0; j < snapshot->subxcnt; j++)
+ {
+ if (TransactionIdEquals(xid, snapshot->subxip[j]))
+ return true;
+ }
+
+ /* not there, fall through to search xip[] */
+ }
+ else
{
- if (TransactionIdEquals(xid, snapshot->subxip[j]))
- return true;
+ /* overflowed, so convert xid to top-level */
+ xid = SubTransGetTopmostTransaction(xid);
+
+ /*
+ * If xid was indeed a subxact, we might now have an xid < xmin, so
+ * recheck to avoid an array scan. No point in rechecking xmax.
+ */
+ if (TransactionIdPrecedes(xid, snapshot->xmin))
+ return false;
}
- /* not there, fall through to search xip[] */
+ for (i = 0; i < snapshot->xcnt; i++)
+ {
+ if (TransactionIdEquals(xid, snapshot->xip[i]))
+ return true;
+ }
}
else
{
- /* overflowed, so convert xid to top-level */
- xid = SubTransGetTopmostTransaction(xid);
+ int32 j;
/*
- * If xid was indeed a subxact, we might now have an xid < xmin, so
- * recheck to avoid an array scan. No point in rechecking xmax.
+ * In recovery we store all xids in the subxact array because it
+ * is by far the bigger array, and we mostly don't know which xids
+ * are top-level and which are subxacts. The xip array is empty.
+ *
+ * We start by searching subtrans, if we overflowed.
*/
- if (TransactionIdPrecedes(xid, snapshot->xmin))
- return false;
- }
+ if (snapshot->suboverflowed)
+ {
+ /* overflowed, so convert xid to top-level */
+ xid = SubTransGetTopmostTransaction(xid);
- for (i = 0; i < snapshot->xcnt; i++)
- {
- if (TransactionIdEquals(xid, snapshot->xip[i]))
- return true;
+ /*
+ * If xid was indeed a subxact, we might now have an xid < xmin, so
+ * recheck to avoid an array scan. No point in rechecking xmax.
+ */
+ if (TransactionIdPrecedes(xid, snapshot->xmin))
+ return false;
+ }
+
+ /*
+ * We now have either a top-level xid higher than xmin or an
+ * indeterminate xid. We don't know whether it's top level or subxact
+ * but it doesn't matter. If it's present in subxip, the xid is in the snapshot.
+ */
+ for (j = 0; j < snapshot->subxcnt; j++)
+ {
+ if (TransactionIdEquals(xid, snapshot->subxip[j]))
+ return true;
+ }
}
return false;