about summary refs log tree commit diff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/backend/access/transam/xlog.c | 13
-rw-r--r-- src/backend/postmaster/pgstat.c | 3
-rw-r--r-- src/backend/postmaster/postmaster.c | 33
-rw-r--r-- src/backend/postmaster/syslogger.c | 5
-rw-r--r-- src/backend/storage/file/fd.c | 116
-rw-r--r-- src/backend/storage/ipc/dsm_impl.c | 15
-rw-r--r-- src/backend/storage/ipc/latch.c | 44
-rw-r--r-- src/backend/utils/misc/guc.c | 2
-rw-r--r-- src/backend/utils/misc/postgresql.conf.sample | 2
-rw-r--r-- src/include/storage/fd.h | 11
10 files changed, 222 insertions, 22 deletions
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index fd527f20c50..d19408b3be3 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -774,6 +774,7 @@ static const char *const xlogSourceNames[] = {"any", "archive", "pg_wal", "strea
* openLogFile is -1 or a kernel FD for an open log file segment.
* openLogSegNo identifies the segment. These variables are only used to
* write the XLOG, and so will normally refer to the active segment.
+ * Note: call Reserve/ReleaseExternalFD to track consumption of this FD.
*/
static int openLogFile = -1;
static XLogSegNo openLogSegNo = 0;
@@ -785,6 +786,9 @@ static XLogSegNo openLogSegNo = 0;
* will be just past that page. readLen indicates how much of the current
* page has been read into readBuf, and readSource indicates where we got
* the currently open file from.
+ * Note: we could use Reserve/ReleaseExternalFD to track consumption of
+ * this FD too; but it doesn't currently seem worthwhile, since the XLOG is
+ * not read by general-purpose sessions.
*/
static int readFile = -1;
static XLogSegNo readSegNo = 0;
@@ -2447,6 +2451,7 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
/* create/use new log file */
use_existent = true;
openLogFile = XLogFileInit(openLogSegNo, &use_existent, true);
+ ReserveExternalFD();
}
/* Make sure we have the current logfile open */
@@ -2455,6 +2460,7 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
wal_segment_size);
openLogFile = XLogFileOpen(openLogSegNo);
+ ReserveExternalFD();
}
/* Add current page to the set of pending pages-to-dump */
@@ -2605,6 +2611,7 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
wal_segment_size);
openLogFile = XLogFileOpen(openLogSegNo);
+ ReserveExternalFD();
}
issue_xlog_fsync(openLogFile, openLogSegNo);
@@ -3811,6 +3818,7 @@ XLogFileClose(void)
}
openLogFile = -1;
+ ReleaseExternalFD();
}
/*
@@ -5224,6 +5232,11 @@ BootStrapXLOG(void)
use_existent = false;
openLogFile = XLogFileInit(1, &use_existent, false);
+ /*
+ * We needn't bother with Reserve/ReleaseExternalFD here, since we'll
+ * close the file again in a moment.
+ */
+
/* Write the first page with the initial record */
errno = 0;
pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_WRITE);
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 59dc4f31abd..462b4d7e069 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -602,6 +602,9 @@ retry2:
pg_freeaddrinfo_all(hints.ai_family, addrs);
+ /* Now that we have a long-lived socket, tell fd.c about it. */
+ ReserveExternalFD();
+
return;
startup_failed:
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index b3986bee75f..cd61665eea3 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -2554,9 +2554,14 @@ ClosePostmasterPorts(bool am_syslogger)
(errcode_for_file_access(),
errmsg_internal("could not close postmaster death monitoring pipe in child process: %m")));
postmaster_alive_fds[POSTMASTER_FD_OWN] = -1;
+ /* Notify fd.c that we released one pipe FD. */
+ ReleaseExternalFD();
#endif
- /* Close the listen sockets */
+ /*
+ * Close the postmaster's listen sockets. These aren't tracked by fd.c,
+ * so we don't call ReleaseExternalFD() here.
+ */
for (i = 0; i < MAXLISTEN; i++)
{
if (ListenSocket[i] != PGINVALID_SOCKET)
@@ -2566,7 +2571,10 @@ ClosePostmasterPorts(bool am_syslogger)
}
}
- /* If using syslogger, close the read side of the pipe */
+ /*
+ * If using syslogger, close the read side of the pipe. We don't bother
+ * tracking this in fd.c, either.
+ */
if (!am_syslogger)
{
#ifndef WIN32
@@ -4279,6 +4287,9 @@ BackendInitialize(Port *port)
/* Save port etc. for ps status */
MyProcPort = port;
+ /* Tell fd.c about the long-lived FD associated with the port */
+ ReserveExternalFD();
+
/*
* PreAuthDelay is a debugging aid for investigating problems in the
* authentication cycle: it can be set in postgresql.conf to allow time to
@@ -6442,6 +6453,20 @@ restore_backend_variables(BackendParameters *param, Port *port)
strlcpy(pkglib_path, param->pkglib_path, MAXPGPATH);
strlcpy(ExtraOptions, param->ExtraOptions, MAXPGPATH);
+
+ /*
+ * We need to restore fd.c's counts of externally-opened FDs; to avoid
+ * confusion, be sure to do this after restoring max_safe_fds. (Note:
+ * BackendInitialize will handle this for port->sock.)
+ */
+#ifndef WIN32
+ if (postmaster_alive_fds[0] >= 0)
+ ReserveExternalFD();
+ if (postmaster_alive_fds[1] >= 0)
+ ReserveExternalFD();
+#endif
+ if (pgStatSock != PGINVALID_SOCKET)
+ ReserveExternalFD();
}
@@ -6584,6 +6609,10 @@ InitPostmasterDeathWatchHandle(void)
(errcode_for_file_access(),
errmsg_internal("could not create pipe to monitor postmaster death: %m")));
+ /* Notify fd.c that we've eaten two FDs for the pipe. */
+ ReserveExternalFD();
+ ReserveExternalFD();
+
/*
* Set O_NONBLOCK to allow testing for the fd's presence with a read()
* call.
diff --git a/src/backend/postmaster/syslogger.c b/src/backend/postmaster/syslogger.c
index b2b69a72079..cf7b535e4e3 100644
--- a/src/backend/postmaster/syslogger.c
+++ b/src/backend/postmaster/syslogger.c
@@ -562,6 +562,11 @@ SysLogger_Start(void)
* This means the postmaster must continue to hold the read end of the
* pipe open, so we can pass it down to the reincarnated syslogger. This
* is a bit klugy but we have little choice.
+ *
+ * Also note that we don't bother counting the pipe FDs by calling
+ * Reserve/ReleaseExternalFD. There's no real need to account for them
+ * accurately in the postmaster or syslogger process, and both ends of the
+ * pipe will wind up closed in all other postmaster children.
*/
#ifndef WIN32
if (syslogPipe[0] < 0)
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index b5f4df6a485..34f7443110a 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -61,6 +61,12 @@
* BasicOpenFile, it is solely the caller's responsibility to close the file
* descriptor by calling close(2).
*
+ * If a non-virtual file descriptor needs to be held open for any length of
+ * time, report it to fd.c by calling AcquireExternalFD or ReserveExternalFD
+ * (and eventually ReleaseExternalFD), so that we can take it into account
+ * while deciding how many VFDs can be open. This applies to FDs obtained
+ * with BasicOpenFile as well as those obtained without use of any fd.c API.
+ *
*-------------------------------------------------------------------------
*/
@@ -103,8 +109,8 @@
/*
* We must leave some file descriptors free for system(), the dynamic loader,
* and other code that tries to open files without consulting fd.c. This
- * is the number left free. (While we can be pretty sure we won't get
- * EMFILE, there's never any guarantee that we won't get ENFILE due to
+ * is the number left free. (While we try fairly hard to prevent EMFILE
+ * errors, there's never any guarantee that we won't get ENFILE due to
* other processes chewing up FDs. So it's a bad idea to try to open files
* without consulting fd.c. Nonetheless we cannot control all code.)
*
@@ -119,9 +125,12 @@
/*
* If we have fewer than this many usable FDs after allowing for the reserved
- * ones, choke.
+ * ones, choke. (This value is chosen to work with "ulimit -n 64", but not
+ * much less than that. Note that this value ensures numExternalFDs can be
+ * at least 16; as of this writing, the contrib/postgres_fdw regression tests
+ * will not pass unless that can grow to at least 14.)
*/
-#define FD_MINFREE 10
+#define FD_MINFREE 48
/*
* A number of platforms allow individual processes to open many more files
@@ -132,8 +141,8 @@
int max_files_per_process = 1000;
/*
- * Maximum number of file descriptors to open for either VFD entries or
- * AllocateFile/AllocateDir/OpenTransientFile operations. This is initialized
+ * Maximum number of file descriptors to open for operations that fd.c knows
+ * about (VFDs, AllocateFile etc, or "external" FDs). This is initialized
* to a conservative value, and remains that way indefinitely in bootstrap or
* standalone-backend cases. In normal postmaster operation, the postmaster
* calls set_max_safe_fds() late in initialization to update the value, and
@@ -142,7 +151,7 @@ int max_files_per_process = 1000;
* Note: the value of max_files_per_process is taken into account while
* setting this variable, and so need not be tested separately.
*/
-int max_safe_fds = 32; /* default if not changed */
+int max_safe_fds = FD_MINFREE; /* default if not changed */
/* Whether it is safe to continue running after fsync() fails. */
bool data_sync_retry = false;
@@ -244,6 +253,11 @@ static int maxAllocatedDescs = 0;
static AllocateDesc *allocatedDescs = NULL;
/*
+ * Number of open "external" FDs reported to Reserve/ReleaseExternalFD.
+ */
+static int numExternalFDs = 0;
+
+/*
* Number of temporary files opened during the current session;
* this is used in generation of tempfile names.
*/
@@ -1025,6 +1039,80 @@ tryAgain:
return -1; /* failure */
}
+/*
+ * AcquireExternalFD - attempt to reserve an external file descriptor
+ *
+ * This should be used by callers that need to hold a file descriptor open
+ * over more than a short interval, but cannot use any of the other facilities
+ * provided by this module.
+ *
+ * The difference between this and the underlying ReserveExternalFD function
+ * is that this will report failure (by setting errno to EMFILE and returning
+ * false) if "too many" external FDs are already reserved. Use this in any
+ * code where the total number of FDs to be reserved is not both predictable
+ * and small.
+ */
+bool
+AcquireExternalFD(void)
+{
+ /*
+ * We don't want more than max_safe_fds / 3 FDs to be consumed for
+ * "external" FDs; the remainder is left for fd.c's other users.
+ */
+ if (numExternalFDs < max_safe_fds / 3)
+ {
+ ReserveExternalFD();
+ return true;
+ }
+ errno = EMFILE; /* let callers report this like a real EMFILE */
+ return false;
+}
+
+/*
+ * ReserveExternalFD - report external consumption of a file descriptor
+ *
+ * This should be used by callers that need to hold a file descriptor open
+ * over more than a short interval, but cannot use any of the other facilities
+ * provided by this module. This just tracks the use of the FD and closes
+ * VFDs if needed to ensure we keep NUM_RESERVED_FDS FDs available.
+ *
+ * Call this directly only in code where failure to reserve the FD would be
+ * fatal; for example, the WAL-writing code does so, since the alternative is
+ * session failure. Also, it's very unwise to do so in code that could
+ * consume more than one FD per process.
+ *
+ * Note: as long as everybody plays nice so that NUM_RESERVED_FDS FDs remain
+ * available, it doesn't matter too much whether this is called before or
+ * after actually opening the FD; but doing so beforehand reduces the risk of
+ * an EMFILE failure if not everybody played nice. In any case, it's solely the
+ * caller's responsibility to keep the external-FD count in sync with reality.
+ */
+void
+ReserveExternalFD(void)
+{
+ /*
+ * Release VFDs if needed to stay safe. Doing so before incrementing
+ * numExternalFDs yields the desired final state (if enough VFDs could be
+ * closed): nfile + numAllocatedDescs + numExternalFDs <= max_safe_fds.
+ */
+ ReleaseLruFiles();
+
+ numExternalFDs++;
+}
+
+/*
+ * ReleaseExternalFD - report release of an external file descriptor
+ *
+ * This is guaranteed not to change errno, so it can be used in failure paths.
+ */
+void
+ReleaseExternalFD(void)
+{
+ Assert(numExternalFDs > 0); /* a release must pair with an earlier reserve */
+ numExternalFDs--;
+}
+
+
#if defined(FDDEBUG)
static void
@@ -1185,7 +1273,7 @@ ReleaseLruFile(void)
static void
ReleaseLruFiles(void)
{
- while (nfile + numAllocatedDescs >= max_safe_fds)
+ while (nfile + numAllocatedDescs + numExternalFDs >= max_safe_fds)
{
if (!ReleaseLruFile())
break;
@@ -2176,13 +2264,13 @@ reserveAllocatedDesc(void)
/*
* If the array hasn't yet been created in the current process, initialize
- * it with FD_MINFREE / 2 elements. In many scenarios this is as many as
+ * it with FD_MINFREE / 3 elements. In many scenarios this is as many as
* we will ever need, anyway. We don't want to look at max_safe_fds
* immediately because set_max_safe_fds() may not have run yet.
*/
if (allocatedDescs == NULL)
{
- newMax = FD_MINFREE / 2;
+ newMax = FD_MINFREE / 3;
newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
/* Out of memory already? Treat as fatal error. */
if (newDescs == NULL)
@@ -2200,10 +2288,12 @@ reserveAllocatedDesc(void)
*
* We mustn't let allocated descriptors hog all the available FDs, and in
* practice we'd better leave a reasonable number of FDs for VFD use. So
- * set the maximum to max_safe_fds / 2. (This should certainly be at
- * least as large as the initial size, FD_MINFREE / 2.)
+ * set the maximum to max_safe_fds / 3. (This should certainly be at
+ * least as large as the initial size, FD_MINFREE / 3, so we aren't
+ * tightening the restriction here.) Recall that "external" FDs are
+ * allowed to consume another third of max_safe_fds.
*/
- newMax = max_safe_fds / 2;
+ newMax = max_safe_fds / 3;
if (newMax > maxAllocatedDescs)
{
newDescs = (AllocateDesc *) realloc(allocatedDescs,
diff --git a/src/backend/storage/ipc/dsm_impl.c b/src/backend/storage/ipc/dsm_impl.c
index 138bdec47e9..1972aecbedc 100644
--- a/src/backend/storage/ipc/dsm_impl.c
+++ b/src/backend/storage/ipc/dsm_impl.c
@@ -247,14 +247,17 @@ dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
/*
* Create new segment or open an existing one for attach.
*
- * Even though we're not going through fd.c, we should be safe against
- * running out of file descriptors, because of NUM_RESERVED_FDS. We're
- * only opening one extra descriptor here, and we'll close it before
- * returning.
+ * Even though we will close the FD before returning, it seems desirable
+ * to use Reserve/ReleaseExternalFD, to reduce the probability of EMFILE
+ * failure. The fact that we won't hold the FD open long justifies using
+ * ReserveExternalFD rather than AcquireExternalFD, though.
*/
+ ReserveExternalFD();
+
flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0);
if ((fd = shm_open(name, flags, PG_FILE_MODE_OWNER)) == -1)
{
+ ReleaseExternalFD();
if (errno != EEXIST)
ereport(elevel,
(errcode_for_dynamic_shared_memory(),
@@ -278,6 +281,7 @@ dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
/* Back out what's already been done. */
save_errno = errno;
close(fd);
+ ReleaseExternalFD();
errno = save_errno;
ereport(elevel,
@@ -295,6 +299,7 @@ dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
/* Back out what's already been done. */
save_errno = errno;
close(fd);
+ ReleaseExternalFD();
shm_unlink(name);
errno = save_errno;
@@ -323,6 +328,7 @@ dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
/* Back out what's already been done. */
save_errno = errno;
close(fd);
+ ReleaseExternalFD();
if (op == DSM_OP_CREATE)
shm_unlink(name);
errno = save_errno;
@@ -336,6 +342,7 @@ dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
*mapped_address = address;
*mapped_size = request_size;
close(fd);
+ ReleaseExternalFD();
return true;
}
diff --git a/src/backend/storage/ipc/latch.c b/src/backend/storage/ipc/latch.c
index cbd495225ca..046ca5c6c7e 100644
--- a/src/backend/storage/ipc/latch.c
+++ b/src/backend/storage/ipc/latch.c
@@ -51,6 +51,7 @@
#include "port/atomics.h"
#include "portability/instr_time.h"
#include "postmaster/postmaster.h"
+#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/latch.h"
#include "storage/pmsignal.h"
@@ -187,6 +188,9 @@ InitializeLatchSupport(void)
/* Clean up, just for safety's sake; we'll set these below */
selfpipe_readfd = selfpipe_writefd = -1;
selfpipe_owner_pid = 0;
+ /* Keep fd.c's accounting straight */
+ ReleaseExternalFD();
+ ReleaseExternalFD();
}
else
{
@@ -194,6 +198,7 @@ InitializeLatchSupport(void)
* Postmaster didn't create a self-pipe ... or else we're in an
* EXEC_BACKEND build, in which case it doesn't matter since the
* postmaster's pipe FDs were closed by the action of FD_CLOEXEC.
+ * fd.c won't have state to clean up, either.
*/
Assert(selfpipe_readfd == -1);
}
@@ -228,6 +233,10 @@ InitializeLatchSupport(void)
selfpipe_readfd = pipefd[0];
selfpipe_writefd = pipefd[1];
selfpipe_owner_pid = MyProcPid;
+
+ /* Tell fd.c about these two long-lived FDs */
+ ReserveExternalFD();
+ ReserveExternalFD();
#else
/* currently, nothing to do here for Windows */
#endif
@@ -604,24 +613,57 @@ CreateWaitEventSet(MemoryContext context, int nevents)
set->exit_on_postmaster_death = false;
#if defined(WAIT_USE_EPOLL)
+ if (!AcquireExternalFD())
+ {
+ /* treat this as though epoll_create1 itself returned EMFILE */
+ elog(ERROR, "epoll_create1 failed: %m");
+ }
#ifdef EPOLL_CLOEXEC
set->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
if (set->epoll_fd < 0)
+ {
+ ReleaseExternalFD();
elog(ERROR, "epoll_create1 failed: %m");
+ }
#else
/* cope with ancient glibc lacking epoll_create1 (e.g., RHEL5) */
set->epoll_fd = epoll_create(nevents);
if (set->epoll_fd < 0)
+ {
+ ReleaseExternalFD();
elog(ERROR, "epoll_create failed: %m");
+ }
if (fcntl(set->epoll_fd, F_SETFD, FD_CLOEXEC) == -1)
+ {
+ int save_errno = errno;
+
+ close(set->epoll_fd);
+ ReleaseExternalFD();
+ errno = save_errno;
elog(ERROR, "fcntl(F_SETFD) failed on epoll descriptor: %m");
+ }
#endif /* EPOLL_CLOEXEC */
#elif defined(WAIT_USE_KQUEUE)
+ if (!AcquireExternalFD())
+ {
+ /* treat this as though kqueue itself returned EMFILE */
+ elog(ERROR, "kqueue failed: %m");
+ }
set->kqueue_fd = kqueue();
if (set->kqueue_fd < 0)
+ {
+ ReleaseExternalFD();
elog(ERROR, "kqueue failed: %m");
+ }
if (fcntl(set->kqueue_fd, F_SETFD, FD_CLOEXEC) == -1)
+ {
+ int save_errno = errno;
+
+ close(set->kqueue_fd);
+ ReleaseExternalFD();
+ errno = save_errno;
elog(ERROR, "fcntl(F_SETFD) failed on kqueue descriptor: %m");
+ }
set->report_postmaster_not_running = false;
#elif defined(WAIT_USE_WIN32)
@@ -655,8 +697,10 @@ FreeWaitEventSet(WaitEventSet *set)
{
#if defined(WAIT_USE_EPOLL)
close(set->epoll_fd);
+ ReleaseExternalFD();
#elif defined(WAIT_USE_KQUEUE)
close(set->kqueue_fd);
+ ReleaseExternalFD();
#elif defined(WAIT_USE_WIN32)
WaitEvent *cur_event;
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 8228e1f3903..464f264d9a2 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -2431,7 +2431,7 @@ static struct config_int ConfigureNamesInt[] =
NULL
},
&max_files_per_process,
- 1000, 25, INT_MAX,
+ 1000, 64, INT_MAX,
NULL, NULL, NULL
},
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index e1048c0047b..e58e4788a8e 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -153,7 +153,7 @@
# - Kernel Resources -
-#max_files_per_process = 1000 # min 25
+#max_files_per_process = 1000 # min 64
# (change requires restart)
# - Cost-Based Vacuum Delay -
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index 51e2ece3c9d..2085c62b41d 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -35,6 +35,10 @@
* Likewise, use AllocateDir/FreeDir, not opendir/closedir, to allocate
* open directories (DIR*), and OpenTransientFile/CloseTransientFile for an
* unbuffered file descriptor.
+ *
+ * If you really can't use any of the above, at least call AcquireExternalFD
+ * or ReserveExternalFD to report any file descriptors that are held for any
+ * length of time. Failure to do so risks unnecessary EMFILE errors.
*/
#ifndef FD_H
#define FD_H
@@ -120,7 +124,12 @@ extern int CloseTransientFile(int fd);
extern int BasicOpenFile(const char *fileName, int fileFlags);
extern int BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode);
- /* Make a directory with default permissions */
+/* Use these for other cases, and also for long-lived BasicOpenFile FDs */
+extern bool AcquireExternalFD(void);
+extern void ReserveExternalFD(void);
+extern void ReleaseExternalFD(void);
+
+/* Make a directory with default permissions */
extern int MakePGDirectory(const char *directoryName);
/* Miscellaneous support routines */