Log long wait time on recovery conflict when it's resolved.

This is a follow-up of the work done in commit 0650ff2303. This commit extends log_recovery_conflict_waits so that a log message is produced also when recovery conflict has already been resolved after deadlock_timeout passes, i.e., when the startup process finishes waiting for recovery conflict after deadlock_timeout. This is useful in investigating how long recovery conflicts prevented the recovery from applying WAL. Author: Fujii Masao Reviewed-by: Kyotaro Horiguchi, Bertrand Drouvot Discussion: https://postgr.es/m/9a60178c-a853-1440-2cdc-c3af916cff59@amazon.com
author: Fujii Masao <fujii@postgresql.org> 2021-01-13 22:59:17 +0900
committer: Fujii Masao <fujii@postgresql.org> 2021-01-13 22:59:17 +0900
commit: 39b03690b529935a3c33024ee68f08e2d347cf4f (patch)
tree: 535e89716c5be4aebec5debb1db1e37e6f0d43c3 /src
parent: 6ecaaf810b8b9e3132f00549e630400f38a7ca37 (diff)
download: postgresql-39b03690b529935a3c33024ee68f08e2d347cf4f.tar.gz
postgresql-39b03690b529935a3c33024ee68f08e2d347cf4f.zip
4 files changed, 60 insertions, 14 deletions
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index c46b8abad12..561c212092f 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -4034,6 +4034,16 @@ LockBufferForCleanup(Buffer buffer)
 			/* Successfully acquired exclusive lock with pincount 1 */
 			UnlockBufHdr(bufHdr, buf_state);
 
+			/*
+			 * Emit the log message if recovery conflict on buffer pin was
+			 * resolved but the startup process waited longer than
+			 * deadlock_timeout for it.
+			 */
+			if (logged_recovery_conflict)
+				LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
+									waitStart, GetCurrentTimestamp(),
+									NULL, false);
+
 			/* Report change to non-waiting status */
 			if (new_status)
 			{
@@ -4088,7 +4098,7 @@ LockBufferForCleanup(Buffer buffer)
 											   DeadlockTimeout))
 				{
 					LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
-										waitStart, now, NULL);
+										waitStart, now, NULL, true);
 					logged_recovery_conflict = true;
 				}
 			}
diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c
index d4b0f65ba20..39a30c00f7a 100644
--- a/src/backend/storage/ipc/standby.c
+++ b/src/backend/storage/ipc/standby.c
@@ -226,11 +226,14 @@ WaitExceedsMaxStandbyDelay(uint32 wait_event_info)
  * wait_start is the timestamp when the caller started to wait.
  * now is the timestamp when this function has been called.
  * wait_list is the list of virtual transaction ids assigned to
- * conflicting processes.
+ * conflicting processes. still_waiting indicates whether
+ * the startup process is still waiting for the recovery conflict
+ * to be resolved or not.
  */
 void
 LogRecoveryConflict(ProcSignalReason reason, TimestampTz wait_start,
-					TimestampTz now, VirtualTransactionId *wait_list)
+					TimestampTz now, VirtualTransactionId *wait_list,
+					bool still_waiting)
 {
 	long		secs;
 	int			usecs;
@@ -238,6 +241,12 @@ LogRecoveryConflict(ProcSignalReason reason, TimestampTz wait_start,
 	StringInfoData buf;
 	int			nprocs = 0;
 
+	/*
+	 * There must be no conflicting processes when the recovery conflict has
+	 * already been resolved.
+	 */
+	Assert(still_waiting || wait_list == NULL);
+
 	TimestampDifference(wait_start, now, &secs, &usecs);
 	msecs = secs * 1000 + usecs / 1000;
 	usecs = usecs % 1000;
@@ -275,12 +284,21 @@ LogRecoveryConflict(ProcSignalReason reason, TimestampTz wait_start,
 	 * conflicting backends in a detail message. Note that if all the backends
 	 * in the list are not active, no detail message is logged.
 	 */
-	ereport(LOG,
-			errmsg("recovery still waiting after %ld.%03d ms: %s",
-				   msecs, usecs, _(get_recovery_conflict_desc(reason))),
-			nprocs > 0 ? errdetail_log_plural("Conflicting process: %s.",
-											  "Conflicting processes: %s.",
-											  nprocs, buf.data) : 0);
+	if (still_waiting)
+	{
+		ereport(LOG,
+				errmsg("recovery still waiting after %ld.%03d ms: %s",
+					   msecs, usecs, _(get_recovery_conflict_desc(reason))),
+				nprocs > 0 ? errdetail_log_plural("Conflicting process: %s.",
+												  "Conflicting processes: %s.",
+												  nprocs, buf.data) : 0);
+	}
+	else
+	{
+		ereport(LOG,
+				errmsg("recovery finished waiting after %ld.%03d ms: %s",
+					   msecs, usecs, _(get_recovery_conflict_desc(reason))));
+	}
 
 	if (nprocs > 0)
 		pfree(buf.data);
@@ -375,13 +393,12 @@ ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
 
 				/*
 				 * Emit the log message if the startup process is waiting
-				 * longer than deadlock_timeout for recovery conflict on
-				 * buffer pin.
+				 * longer than deadlock_timeout for recovery conflict.
 				 */
 				if (maybe_log_conflict &&
 					TimestampDifferenceExceeds(waitStart, now, DeadlockTimeout))
 				{
-					LogRecoveryConflict(reason, waitStart, now, waitlist);
+					LogRecoveryConflict(reason, waitStart, now, waitlist, true);
 					logged_recovery_conflict = true;
 				}
 			}
@@ -391,6 +408,14 @@ ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
 		waitlist++;
 	}
 
+	/*
+	 * Emit the log message if recovery conflict was resolved but the startup
+	 * process waited longer than deadlock_timeout for it.
+	 */
+	if (logged_recovery_conflict)
+		LogRecoveryConflict(reason, waitStart, GetCurrentTimestamp(),
+							NULL, false);
+
 	/* Reset ps display if we changed it */
 	if (new_status)
 	{
diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c
index db0cfaa3600..c87ffc65491 100644
--- a/src/backend/storage/lmgr/proc.c
+++ b/src/backend/storage/lmgr/proc.c
@@ -1322,7 +1322,8 @@ ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable)
 					 * longer than deadlock_timeout.
 					 */
 					LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_LOCK,
-										standbyWaitStart, now, cnt > 0 ? vxids : NULL);
+										standbyWaitStart, now,
+										cnt > 0 ? vxids : NULL, true);
 					logged_recovery_conflict = true;
 				}
 			}
@@ -1608,6 +1609,15 @@ ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable)
 	}
 
 	/*
+	 * Emit the log message if recovery conflict on lock was resolved but the
+	 * startup process waited longer than deadlock_timeout for it.
+	 */
+	if (InHotStandby && logged_recovery_conflict)
+		LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_LOCK,
+							standbyWaitStart, GetCurrentTimestamp(),
+							NULL, false);
+
+	/*
 	 * Re-acquire the lock table's partition lock.  We have to do this to hold
 	 * off cancel/die interrupts before we can mess with lockAwaited (else we
 	 * might have a missed or duplicated locallock update).
diff --git a/src/include/storage/standby.h b/src/include/storage/standby.h
index a0f3e0bdf09..2b1f340b82b 100644
--- a/src/include/storage/standby.h
+++ b/src/include/storage/standby.h
@@ -40,7 +40,8 @@ extern void StandbyDeadLockHandler(void);
 extern void StandbyTimeoutHandler(void);
 extern void StandbyLockTimeoutHandler(void);
 extern void LogRecoveryConflict(ProcSignalReason reason, TimestampTz wait_start,
-								TimestampTz cur_ts, VirtualTransactionId *wait_list);
+								TimestampTz cur_ts, VirtualTransactionId *wait_list,
+								bool still_waiting);
 
 /*
  * Standby Rmgr (RM_STANDBY_ID)
author	Fujii Masao <fujii@postgresql.org>	2021-01-13 22:59:17 +0900
committer	Fujii Masao <fujii@postgresql.org>	2021-01-13 22:59:17 +0900
commit	39b03690b529935a3c33024ee68f08e2d347cf4f (patch)
tree	535e89716c5be4aebec5debb1db1e37e6f0d43c3 /src
parent	6ecaaf810b8b9e3132f00549e630400f38a7ca37 (diff)
download	postgresql-39b03690b529935a3c33024ee68f08e2d347cf4f.tar.gz postgresql-39b03690b529935a3c33024ee68f08e2d347cf4f.zip