6 files changed, 117 insertions, 7 deletions
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index bab048d38a1..b467b5c89d1 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -34,6 +34,7 @@
 #include "commands/trigger.h"
 #include "executor/spi.h"
 #include "libpq/be-fsstubs.h"
+#include "libpq/pqsignal.h"
 #include "miscadmin.h"
 #include "pgstat.h"
 #include "replication/walsender.h"
@@ -52,6 +53,7 @@
 #include "utils/memutils.h"
 #include "utils/relmapper.h"
 #include "utils/snapmgr.h"
+#include "utils/timeout.h"
 #include "utils/timestamp.h"
 #include "pg_trace.h"
 
@@ -2297,6 +2299,22 @@ AbortTransaction(void)
 	LockErrorCleanup();
 
 	/*
+	 * If any timeout events are still active, make sure the timeout interrupt
+	 * is scheduled.  This covers possible loss of a timeout interrupt due to
+	 * longjmp'ing out of the SIGINT handler (see notes in handle_sig_alarm).
+	 * We delay this till after LockErrorCleanup so that we don't uselessly
+	 * reschedule lock or deadlock check timeouts.
+	 */
+	reschedule_timeouts();
+
+	/*
+	 * Re-enable signals, in case we got here by longjmp'ing out of a signal
+	 * handler.  We do this fairly early in the sequence so that the timeout
+	 * infrastructure will be functional if needed while aborting.
+	 */
+	PG_SETMASK(&UnBlockSig);
+
+	/*
 	 * check the current transaction state
 	 */
 	if (s->state != TRANS_INPROGRESS && s->state != TRANS_PREPARE)
@@ -4222,9 +4240,29 @@ AbortSubTransaction(void)
 	AbortBufferIO();
 	UnlockBuffers();
 
+	/*
+	 * Also clean up any open wait for lock, since the lock manager will choke
+	 * if we try to wait for another lock before doing this.
+	 */
 	LockErrorCleanup();
 
 	/*
+	 * If any timeout events are still active, make sure the timeout interrupt
+	 * is scheduled.  This covers possible loss of a timeout interrupt due to
+	 * longjmp'ing out of the SIGINT handler (see notes in handle_sig_alarm).
+	 * We delay this till after LockErrorCleanup so that we don't uselessly
+	 * reschedule lock or deadlock check timeouts.
+	 */
+	reschedule_timeouts();
+
+	/*
+	 * Re-enable signals, in case we got here by longjmp'ing out of a signal
+	 * handler.  We do this fairly early in the sequence so that the timeout
+	 * infrastructure will be functional if needed while aborting.
+	 */
+	PG_SETMASK(&UnBlockSig);
+
+	/*
 	 * check the current transaction state
 	 */
 	ShowTransactionState("AbortSubTransaction");
diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c
index 8c14d0f4e44..88ecd3834b3 100644
--- a/src/backend/postmaster/autovacuum.c
+++ b/src/backend/postmaster/autovacuum.c
@@ -493,9 +493,8 @@ AutoVacLauncherMain(int argc, char *argv[])
 		HOLD_INTERRUPTS();
 
 		/* Forget any pending QueryCancel or timeout request */
-		QueryCancelPending = false;
 		disable_all_timeouts(false);
-		QueryCancelPending = false;		/* again in case timeout occurred */
+		QueryCancelPending = false;		/* second to avoid race condition */
 
 		/* Report the error to the server log */
 		EmitErrorReport();
diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c
index 222251df659..122afb24a8b 100644
--- a/src/backend/storage/lmgr/proc.c
+++ b/src/backend/storage/lmgr/proc.c
@@ -1266,7 +1266,10 @@ ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable)
 	} while (myWaitStatus == STATUS_WAITING);
 
 	/*
-	 * Disable the timers, if they are still running
+	 * Disable the timers, if they are still running.  As in LockErrorCleanup,
+	 * we must preserve the LOCK_TIMEOUT indicator flag: if a lock timeout has
+	 * already caused QueryCancelPending to become set, we want the cancel to
+	 * be reported as a lock timeout, not a user cancel.
 	 */
 	if (LockTimeout > 0)
 	{
@@ -1275,7 +1278,7 @@ ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable)
 		timeouts[0].id = DEADLOCK_TIMEOUT;
 		timeouts[0].keep_indicator = false;
 		timeouts[1].id = LOCK_TIMEOUT;
-		timeouts[1].keep_indicator = false;
+		timeouts[1].keep_indicator = true;
 		disable_timeouts(timeouts, 2);
 	}
 	else
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index 3d74654c827..e6428b69f43 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -3803,6 +3803,13 @@ PostgresMain(int argc, char *argv[],
 	 * always active, we have at least some chance of recovering from an error
 	 * during error recovery.  (If we get into an infinite loop thereby, it
 	 * will soon be stopped by overflow of elog.c's internal state stack.)
+	 *
+	 * Note that we use sigsetjmp(..., 1), so that this function's signal mask
+	 * (to wit, UnBlockSig) will be restored when longjmp'ing to here.  This
+	 * is essential in case we longjmp'd out of a signal handler on a platform
+	 * where that leaves the signal blocked.  It's not redundant with the
+	 * unblock in AbortTransaction() because the latter is only called if we
+	 * were inside a transaction.
 	 */
 
 	if (sigsetjmp(local_sigjmp_buf, 1) != 0)
@@ -3823,11 +3830,17 @@ PostgresMain(int argc, char *argv[],
 
 		/*
 		 * Forget any pending QueryCancel request, since we're returning to
-		 * the idle loop anyway, and cancel any active timeout requests.
+		 * the idle loop anyway, and cancel any active timeout requests.  (In
+		 * future we might want to allow some timeout requests to survive, but
+		 * at minimum it'd be necessary to do reschedule_timeouts(), in case
+		 * we got here because of a query cancel interrupting the SIGALRM
+		 * interrupt handler.)	Note in particular that we must clear the
+		 * statement and lock timeout indicators, to prevent any future plain
+		 * query cancels from being misreported as timeouts in case we're
+		 * forgetting a timeout cancel.
 		 */
-		QueryCancelPending = false;
 		disable_all_timeouts(false);
-		QueryCancelPending = false;		/* again in case timeout occurred */
+		QueryCancelPending = false;		/* second to avoid race condition */
 
 		/*
 		 * Turn off these interrupts too.  This is only needed here and not in
diff --git a/src/backend/utils/misc/timeout.c b/src/backend/utils/misc/timeout.c
index 3c3e41eca91..aae947e601b 100644
--- a/src/backend/utils/misc/timeout.c
+++ b/src/backend/utils/misc/timeout.c
@@ -16,6 +16,7 @@
 
 #include <sys/time.h>
 
+#include "miscadmin.h"
 #include "storage/proc.h"
 #include "utils/timeout.h"
 #include "utils/timestamp.h"
@@ -260,6 +261,23 @@ handle_sig_alarm(SIGNAL_ARGS)
 	int			save_errno = errno;
 
 	/*
+	 * We may be executing while ImmediateInterruptOK is true (e.g., when
+	 * mainline is waiting for a lock).  If SIGINT or similar arrives while
+	 * this code is running, we'd lose control and perhaps leave our data
+	 * structures in an inconsistent state.  Hold off interrupts to prevent
+	 * that.
+	 *
+	 * Note: it's possible for a SIGINT to interrupt handle_sig_alarm even
+	 * before we reach HOLD_INTERRUPTS(); the net effect would be as if the
+	 * SIGALRM event had been silently lost.  Therefore error recovery must
+	 * include some action that will allow any lost interrupt to be
+	 * rescheduled.  Disabling some or all timeouts is sufficient, or if
+	 * that's not appropriate, reschedule_timeouts() can be called.  Also, the
+	 * signal blocking hazard described below applies here too.
+	 */
+	HOLD_INTERRUPTS();
+
+	/*
 	 * SIGALRM is always cause for waking anything waiting on the process
 	 * latch.  Cope with MyProc not being there, as the startup process also
 	 * uses this signal handler.
@@ -311,6 +329,20 @@ handle_sig_alarm(SIGNAL_ARGS)
 		}
 	}
 
+	/*
+	 * Re-allow query cancel, and then service any cancel request that arrived
+	 * meanwhile (this might in particular include a cancel request fired by
+	 * one of the timeout handlers).
+	 *
+	 * Note: a longjmp from here is safe so far as our own data structures are
+	 * concerned; but on platforms that block a signal before calling the
+	 * handler and then un-block it on return, longjmping out of the signal
+	 * handler leaves SIGALRM still blocked.  Error cleanup is responsible for
+	 * unblocking any blocked signals.
+	 */
+	RESUME_INTERRUPTS();
+	CHECK_FOR_INTERRUPTS();
+
 	errno = save_errno;
 }
 
@@ -388,6 +420,30 @@ RegisterTimeout(TimeoutId id, timeout_handler_proc handler)
 }
 
 /*
+ * Reschedule any pending SIGALRM interrupt.
+ *
+ * This can be used during error recovery in case query cancel resulted in loss
+ * of a SIGALRM event (due to longjmp'ing out of handle_sig_alarm before it
+ * could do anything).	But note it's not necessary if any of the public
+ * enable_ or disable_timeout functions are called in the same area, since
+ * those all do schedule_alarm() internally if needed.
+ */
+void
+reschedule_timeouts(void)
+{
+	/* For flexibility, allow this to be called before we're initialized. */
+	if (!all_timeouts_initialized)
+		return;
+
+	/* Disable timeout interrupts for safety. */
+	disable_alarm();
+
+	/* Reschedule the interrupt, if any timeouts remain active. */
+	if (num_active_timeouts > 0)
+		schedule_alarm(GetCurrentTimestamp());
+}
+
+/*
  * Enable the specified timeout to fire after the specified delay.
  *
  * Delay is given in milliseconds.
diff --git a/src/include/utils/timeout.h b/src/include/utils/timeout.h
index 06bcc5a0f3d..759ee24774b 100644
--- a/src/include/utils/timeout.h
+++ b/src/include/utils/timeout.h
@@ -67,6 +67,7 @@ typedef struct
 /* timeout setup */
 extern void InitializeTimeouts(void);
 extern TimeoutId RegisterTimeout(TimeoutId id, timeout_handler_proc handler);
+extern void reschedule_timeouts(void);
 
 /* timeout operation */
 extern void enable_timeout_after(TimeoutId id, int delay_ms);