about | summary | refs | log | tree | commit | diff
path: root/src/backend/utils/init/miscinit.c
diff options
context:
space:
mode:
author: Tom Lane <tgl@sss.pgh.pa.us>, 2015-10-06 17:15:27 -0400
committer: Tom Lane <tgl@sss.pgh.pa.us>, 2015-10-06 17:15:27 -0400
commit: 02580df6c3ac288f2ed5e38ed42532512993d468 (patch)
tree: f7cfb448ee5af5fb2351f388c394e0ac3272b860 /src/backend/utils/init/miscinit.c
parent: b96df2c61710b39d24e98767cfe17b920b9319a6 (diff)
download: postgresql-02580df6c3ac288f2ed5e38ed42532512993d468.tar.gz
postgresql-02580df6c3ac288f2ed5e38ed42532512993d468.zip
Perform an immediate shutdown if the postmaster.pid file is removed.
The postmaster now checks every minute or so (worst case, at most two minutes) that postmaster.pid is still there and still contains its own PID. If not, it performs an immediate shutdown, as though it had received SIGQUIT. The original goal behind this change was to ensure that failed buildfarm runs would get fully cleaned up, even if the test scripts had left a postmaster running, which is not an infrequent occurrence. When the buildfarm script removes a test postmaster's $PGDATA directory, its next check on postmaster.pid will fail and cause it to exit. Previously, manual intervention was often needed to get rid of such orphaned postmasters, since they'd block new test postmasters from obtaining the expected socket address. However, by checking postmaster.pid and not something else, we can provide additional robustness: manual removal of postmaster.pid is a frequent DBA mistake, and now we can at least limit the damage that will ensue if a new postmaster is started while the old one is still alive. Back-patch to all supported branches, since we won't get the desired improvement in buildfarm reliability otherwise.
Diffstat (limited to 'src/backend/utils/init/miscinit.c')
-rw-r--r-- src/backend/utils/init/miscinit.c 70
1 files changed, 70 insertions, 0 deletions
diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c
index e871fef7faa..fb3cb6eb3d5 100644
--- a/src/backend/utils/init/miscinit.c
+++ b/src/backend/utils/init/miscinit.c
@@ -1218,6 +1218,76 @@ AddToDataDirLockFile(int target_line, const char *str)
}
+/*
+ * Recheck that the data directory lock file still exists with expected
+ * content. Return TRUE if the lock file appears OK, FALSE if it isn't.
+ *
+ * We call this periodically in the postmaster. The idea is that if the
+ * lock file has been removed or replaced by another postmaster, we should
+ * do a panic database shutdown. Therefore, we should return TRUE if there
+ * is any doubt: we do not want to cause a panic shutdown unnecessarily.
+ * Transient failures like EINTR or ENFILE should not cause us to fail.
+ * (If there really is something wrong, we'll detect it on a future recheck.)
+ */
+bool
+RecheckDataDirLockFile(void)
+{
+ int fd; /* descriptor for the opened lock file */
+ int len; /* byte count returned by read() */
+ long file_pid; /* leading integer parsed from the file contents */
+ char buffer[BLCKSZ]; /* file contents plus a terminating NUL */
+ /* Open the lock file; any failure is classified by errno just below. */
+ fd = open(DIRECTORY_LOCK_FILE, O_RDWR | PG_BINARY, 0);
+ if (fd < 0)
+ {
+ /*
+ * There are many foreseeable false-positive error conditions. For
+ * safety, fail only on enumerated clearly-something-is-wrong
+ * conditions.
+ */
+ switch (errno)
+ {
+ case ENOENT: /* the file itself no longer exists */
+ case ENOTDIR: /* a component of the path is not a directory */
+ /* disaster */
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\": %m",
+ DIRECTORY_LOCK_FILE)));
+ return false; /* lock file is definitely not OK */
+ default:
+ /* non-fatal, at least for now */
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\": %m; continuing anyway",
+ DIRECTORY_LOCK_FILE)));
+ return true; /* in doubt, so report the lock file as OK */
+ }
+ }
+ len = read(fd, buffer, sizeof(buffer) - 1); /* reserve one byte for the NUL */
+ if (len < 0)
+ {
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not read from file \"%s\": %m",
+ DIRECTORY_LOCK_FILE)));
+ close(fd); /* closed only after the report, so %m still sees read()'s errno */
+ return true; /* treat read failure as nonfatal */
+ }
+ buffer[len] = '\0'; /* terminate so atol() sees a proper C string */
+ close(fd);
+ file_pid = atol(buffer); /* atol() yields 0 when no leading digits are found */
+ if (file_pid == getpid())
+ return true; /* all is well */
+
+ /* Trouble: someone's overwritten the lock file */
+ ereport(LOG,
+ (errmsg("lock file \"%s\" contains wrong PID: %ld instead of %ld",
+ DIRECTORY_LOCK_FILE, file_pid, (long) getpid())));
+ return false; /* PID mismatch: lock file is not ours */
+}
+
+
/*-------------------------------------------------------------------------
* Version checking support
*-------------------------------------------------------------------------