Prevent concurrent SimpleLruTruncate() for any given SLRU.

The SimpleLruTruncate() header comment states the new coding rule. To achieve this, add locktype "frozenid" and two LWLocks. This closes a rare opportunity for data loss, which manifested as "apparent wraparound" or "could not access status of transaction" errors. Data loss is more likely in pg_multixact, due to released branches' thin margin between multiStopLimit and multiWrapLimit. If a user's physical replication primary logged ": apparent wraparound" messages, the user should rebuild standbys of that primary regardless of symptoms. At less risk is a cluster having emitted "not accepting commands" errors or "must be vacuumed" warnings at some point. One can test a cluster for this data loss by running VACUUM FREEZE in every database. Back-patch to 9.5 (all supported versions). Discussion: https://postgr.es/m/20190218073103.GA1434723@rfd.leadboat.com
author: Noah Misch <noah@leadboat.com> 2020-08-15 10:15:53 -0700
committer: Noah Misch <noah@leadboat.com> 2020-08-15 10:15:57 -0700
commit: d4031d78460cbbb4ed2fb7be635f84bea0e9a0c1 (patch)
tree: d7ee345e4522f52d62b51f33db15d96ba9a8535a /src
parent: 9d472b51e98777102d72f8ccdfb8cef10e087f74 (diff)
download: postgresql-d4031d78460cbbb4ed2fb7be635f84bea0e9a0c1.tar.gz
postgresql-d4031d78460cbbb4ed2fb7be635f84bea0e9a0c1.zip
9 files changed, 97 insertions, 12 deletions
diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c
index fad5d363e32..67387979bdb 100644
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -1163,6 +1163,14 @@ SimpleLruFlush(SlruCtl ctl, bool allow_redirtied)
 
 /*
  * Remove all segments before the one holding the passed page number
+ *
+ * All SLRUs prevent concurrent calls to this function, either with an LWLock
+ * or by calling it only as part of a checkpoint.  Mutual exclusion must begin
+ * before computing cutoffPage.  Mutual exclusion must end after any limit
+ * update that would permit other backends to write fresh data into the
+ * segment immediately preceding the one containing cutoffPage.  Otherwise,
+ * when the SLRU is quite full, SimpleLruTruncate() might delete that segment
+ * after it has accrued freshly-written data.
  */
 void
 SimpleLruTruncate(SlruCtl ctl, int cutoffPage)
diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c
index 4faa21f5aef..ef63b6d98a4 100644
--- a/src/backend/access/transam/subtrans.c
+++ b/src/backend/access/transam/subtrans.c
@@ -347,8 +347,8 @@ ExtendSUBTRANS(TransactionId newestXact)
 /*
  * Remove all SUBTRANS segments before the one holding the passed transaction ID
  *
- * This is normally called during checkpoint, with oldestXact being the
- * oldest TransactionXmin of any running transaction.
+ * oldestXact is the oldest TransactionXmin of any running transaction.  This
+ * is called only during checkpoint.
  */
 void
 TruncateSUBTRANS(TransactionId oldestXact)
diff --git a/src/backend/commands/async.c b/src/backend/commands/async.c
index 4cd1f4b95c3..8ef0aad808e 100644
--- a/src/backend/commands/async.c
+++ b/src/backend/commands/async.c
@@ -224,19 +224,22 @@ typedef struct QueueBackendStatus
 /*
  * Shared memory state for LISTEN/NOTIFY (excluding its SLRU stuff)
  *
- * The AsyncQueueControl structure is protected by the AsyncQueueLock.
+ * The AsyncQueueControl structure is protected by the AsyncQueueLock and
+ * NotifyQueueTailLock.
  *
- * When holding the lock in SHARED mode, backends may only inspect their own
- * entries as well as the head and tail pointers. Consequently we can allow a
- * backend to update its own record while holding only SHARED lock (since no
- * other backend will inspect it).
+ * When holding AsyncQueueLock in SHARED mode, backends may only inspect their
+ * own entries as well as the head and tail pointers. Consequently we can
+ * allow a backend to update its own record while holding only SHARED lock
+ * (since no other backend will inspect it).
  *
- * When holding the lock in EXCLUSIVE mode, backends can inspect the entries
- * of other backends and also change the head and tail pointers.
+ * When holding AsyncQueueLock in EXCLUSIVE mode, backends can inspect the
+ * entries of other backends and also change the head pointer. When holding
+ * both AsyncQueueLock and NotifyQueueTailLock in EXCLUSIVE mode, backends can
+ * change the tail pointer.
  *
  * AsyncCtlLock is used as the control lock for the pg_notify SLRU buffers.
- * In order to avoid deadlocks, whenever we need both locks, we always first
- * get AsyncQueueLock and then AsyncCtlLock.
+ * In order to avoid deadlocks, whenever we need multiple locks, we first get
+ * NotifyQueueTailLock, then AsyncQueueLock, and lastly AsyncCtlLock.
  *
  * Each backend uses the backend[] array entry with index equal to its
  * BackendId (which can range from 1 to MaxBackends).  We rely on this to make
@@ -2013,6 +2016,10 @@ asyncQueueAdvanceTail(void)
 	int			newtailpage;
 	int			boundary;
 
+	/* Restrict task to one backend per cluster; see SimpleLruTruncate(). */
+	LWLockAcquire(NotifyQueueTailLock, LW_EXCLUSIVE);
+
+	/* Compute the new tail. */
 	LWLockAcquire(AsyncQueueLock, LW_EXCLUSIVE);
 	min = QUEUE_HEAD;
 	for (i = 1; i <= MaxBackends; i++)
@@ -2021,7 +2028,6 @@ asyncQueueAdvanceTail(void)
 			min = QUEUE_POS_MIN(min, QUEUE_BACKEND_POS(i));
 	}
 	oldtailpage = QUEUE_POS_PAGE(QUEUE_TAIL);
-	QUEUE_TAIL = min;
 	LWLockRelease(AsyncQueueLock);
 
 	/*
@@ -2041,6 +2047,17 @@ asyncQueueAdvanceTail(void)
 		 */
 		SimpleLruTruncate(AsyncCtl, newtailpage);
 	}
+
+	/*
+	 * Advertise the new tail.  This changes asyncQueueIsFull()'s verdict for
+	 * the segment immediately prior to the new tail, allowing fresh data into
+	 * that segment.
+	 */
+	LWLockAcquire(AsyncQueueLock, LW_EXCLUSIVE);
+	QUEUE_TAIL = min;
+	LWLockRelease(AsyncQueueLock);
+
+	LWLockRelease(NotifyQueueTailLock);
 }
 
 /*
diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c
index 70fa5e114bd..b45661672e3 100644
--- a/src/backend/commands/vacuum.c
+++ b/src/backend/commands/vacuum.c
@@ -1018,6 +1018,14 @@ vac_update_datfrozenxid(void)
 	bool		dirty = false;
 
 	/*
+	 * Restrict this task to one backend per database.  This avoids race
+	 * conditions that would move datfrozenxid or datminmxid backward.  It
+	 * avoids calling vac_truncate_clog() with a datfrozenxid preceding a
+	 * datfrozenxid passed to an earlier vac_truncate_clog() call.
+	 */
+	LockDatabaseFrozenIds(ExclusiveLock);
+
+	/*
 	 * Initialize the "min" calculation with GetOldestXmin, which is a
 	 * reasonable approximation to the minimum relfrozenxid for not-yet-
 	 * committed pg_class entries for new tables; see AddNewRelationTuple().
@@ -1181,6 +1189,9 @@ vac_truncate_clog(TransactionId frozenXID,
 	bool		bogus = false;
 	bool		frozenAlreadyWrapped = false;
 
+	/* Restrict task to one backend per cluster; see SimpleLruTruncate(). */
+	LWLockAcquire(WrapLimitsVacuumLock, LW_EXCLUSIVE);
+
 	/* init oldest datoids to sync with my frozenXID/minMulti values */
 	oldestxid_datoid = MyDatabaseId;
 	minmulti_datoid = MyDatabaseId;
@@ -1290,6 +1301,8 @@ vac_truncate_clog(TransactionId frozenXID,
 	 */
 	SetTransactionIdLimit(frozenXID, oldestxid_datoid);
 	SetMultiXactIdLimit(minMulti, minmulti_datoid, false);
+
+	LWLockRelease(WrapLimitsVacuumLock);
 }
 
 
diff --git a/src/backend/storage/lmgr/lmgr.c b/src/backend/storage/lmgr/lmgr.c
index dc0a4396388..044b0189e7e 100644
--- a/src/backend/storage/lmgr/lmgr.c
+++ b/src/backend/storage/lmgr/lmgr.c
@@ -413,6 +413,21 @@ UnlockRelationForExtension(Relation relation, LOCKMODE lockmode)
 }
 
 /*
+ *		LockDatabaseFrozenIds
+ *
+ * This allows one backend per database to execute vac_update_datfrozenxid().
+ */
+void
+LockDatabaseFrozenIds(LOCKMODE lockmode)
+{
+	LOCKTAG		tag;
+
+	SET_LOCKTAG_DATABASE_FROZEN_IDS(tag, MyDatabaseId);
+
+	(void) LockAcquire(&tag, lockmode, false, false);
+}
+
+/*
  *		LockPage
  *
  * Obtain a page-level lock.  This is currently used by some index access
@@ -1015,6 +1030,11 @@ DescribeLockTag(StringInfo buf, const LOCKTAG *tag)
 							 tag->locktag_field2,
 							 tag->locktag_field1);
 			break;
+		case LOCKTAG_DATABASE_FROZEN_IDS:
+			appendStringInfo(buf,
+							 _("pg_database.datfrozenxid of database %u"),
+							 tag->locktag_field1);
+			break;
 		case LOCKTAG_PAGE:
 			appendStringInfo(buf,
 							 _("page %u of relation %u of database %u"),
diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt
index e6025ecedb3..04a1786d372 100644
--- a/src/backend/storage/lmgr/lwlocknames.txt
+++ b/src/backend/storage/lmgr/lwlocknames.txt
@@ -50,3 +50,5 @@ OldSnapshotTimeMapLock				42
 BackendRandomLock					43
 LogicalRepWorkerLock				44
 CLogTruncationLock					45
+WrapLimitsVacuumLock				46
+NotifyQueueTailLock					47
diff --git a/src/backend/utils/adt/lockfuncs.c b/src/backend/utils/adt/lockfuncs.c
index 66c09a1f316..01c66a2538b 100644
--- a/src/backend/utils/adt/lockfuncs.c
+++ b/src/backend/utils/adt/lockfuncs.c
@@ -26,6 +26,7 @@
 const char *const LockTagTypeNames[] = {
 	"relation",
 	"extend",
+	"frozenid",
 	"page",
 	"tuple",
 	"transactionid",
@@ -245,6 +246,17 @@ pg_lock_status(PG_FUNCTION_ARGS)
 				nulls[8] = true;
 				nulls[9] = true;
 				break;
+			case LOCKTAG_DATABASE_FROZEN_IDS:
+				values[1] = ObjectIdGetDatum(instance->locktag.locktag_field1);
+				nulls[2] = true;
+				nulls[3] = true;
+				nulls[4] = true;
+				nulls[5] = true;
+				nulls[6] = true;
+				nulls[7] = true;
+				nulls[8] = true;
+				nulls[9] = true;
+				break;
 			case LOCKTAG_PAGE:
 				values[1] = ObjectIdGetDatum(instance->locktag.locktag_field1);
 				values[2] = ObjectIdGetDatum(instance->locktag.locktag_field2);
diff --git a/src/include/storage/lmgr.h b/src/include/storage/lmgr.h
index a217de97166..13667584ea0 100644
--- a/src/include/storage/lmgr.h
+++ b/src/include/storage/lmgr.h
@@ -57,6 +57,9 @@ extern bool ConditionalLockRelationForExtension(Relation relation,
 									LOCKMODE lockmode);
 extern int	RelationExtensionLockWaiterCount(Relation relation);
 
+/* Lock to recompute pg_database.datfrozenxid in the current database */
+extern void LockDatabaseFrozenIds(LOCKMODE lockmode);
+
 /* Lock a page (currently only used within indexes) */
 extern void LockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode);
 extern bool ConditionalLockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode);
diff --git a/src/include/storage/lock.h b/src/include/storage/lock.h
index ff4df7fec51..6b56fa58743 100644
--- a/src/include/storage/lock.h
+++ b/src/include/storage/lock.h
@@ -141,6 +141,8 @@ typedef enum LockTagType
 	/* ID info for a relation is DB OID + REL OID; DB OID = 0 if shared */
 	LOCKTAG_RELATION_EXTEND,	/* the right to extend a relation */
 	/* same ID info as RELATION */
+	LOCKTAG_DATABASE_FROZEN_IDS,	/* pg_database.datfrozenxid */
+	/* ID info for frozen IDs is DB OID */
 	LOCKTAG_PAGE,				/* one page of a relation */
 	/* ID info for a page is RELATION info + BlockNumber */
 	LOCKTAG_TUPLE,				/* one physical tuple */
@@ -206,6 +208,14 @@ typedef struct LOCKTAG
 	 (locktag).locktag_type = LOCKTAG_RELATION_EXTEND, \
 	 (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD)
 
+#define SET_LOCKTAG_DATABASE_FROZEN_IDS(locktag,dboid) \
+	((locktag).locktag_field1 = (dboid), \
+	 (locktag).locktag_field2 = 0, \
+	 (locktag).locktag_field3 = 0, \
+	 (locktag).locktag_field4 = 0, \
+	 (locktag).locktag_type = LOCKTAG_DATABASE_FROZEN_IDS, \
+	 (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD)
+
 #define SET_LOCKTAG_PAGE(locktag,dboid,reloid,blocknum) \
 	((locktag).locktag_field1 = (dboid), \
 	 (locktag).locktag_field2 = (reloid), \
author	Noah Misch <noah@leadboat.com>	2020-08-15 10:15:53 -0700
committer	Noah Misch <noah@leadboat.com>	2020-08-15 10:15:57 -0700
commit	d4031d78460cbbb4ed2fb7be635f84bea0e9a0c1 (patch)
tree	d7ee345e4522f52d62b51f33db15d96ba9a8535a /src
parent	9d472b51e98777102d72f8ccdfb8cef10e087f74 (diff)
download	postgresql-d4031d78460cbbb4ed2fb7be635f84bea0e9a0c1.tar.gz postgresql-d4031d78460cbbb4ed2fb7be635f84bea0e9a0c1.zip