aboutsummaryrefslogtreecommitdiff
path: root/src/backend/storage/file/fd.c
diff options
context:
space:
mode:
authorTom Lane <tgl@sss.pgh.pa.us>2020-02-24 17:28:33 -0500
committerTom Lane <tgl@sss.pgh.pa.us>2020-02-24 17:28:33 -0500
commit3d475515a15f70a4a3f36fbbba93db6877ff8346 (patch)
treeb927e13255bef34d548a0bb9cc93a85da4d2b0a3 /src/backend/storage/file/fd.c
parent1420617b14e2e3722367b826986a50ea33ff62ec (diff)
downloadpostgresql-3d475515a15f70a4a3f36fbbba93db6877ff8346.tar.gz
postgresql-3d475515a15f70a4a3f36fbbba93db6877ff8346.zip
Account explicitly for long-lived FDs that are allocated outside fd.c.
The comments in fd.c have long claimed that all file allocations should go through that module, but in reality that's not always practical. fd.c doesn't supply APIs for invoking some FD-producing syscalls like pipe() or epoll_create(); and the APIs it does supply for non-virtual FDs are mostly insistent on releasing those FDs at transaction end; and in some cases the actual open() call is in code that can't be made to use fd.c, such as libpq. This has led to a situation where, in a modern server, there are likely to be seven or so long-lived FDs per backend process that are not known to fd.c. Since NUM_RESERVED_FDS is only 10, that meant we had *very* few spare FDs if max_files_per_process is >= the system ulimit and fd.c had opened all the files it thought it safely could. The contrib/postgres_fdw regression test, in particular, could easily be made to fall over by running it under a restrictive ulimit. To improve matters, invent functions Acquire/Reserve/ReleaseExternalFD that allow outside callers to tell fd.c that they have or want to allocate a FD that's not directly managed by fd.c. Add calls to track all the fixed FDs in a standard backend session, so that we are honestly guaranteeing that NUM_RESERVED_FDS FDs remain unused below the EMFILE limit in a backend's idle state. The coding rules for these functions say that there's no need to call them in code that just allocates one FD over a fairly short interval; we can dip into NUM_RESERVED_FDS for such cases. That means that there aren't all that many places where we need to worry. But postgres_fdw and dblink must use this facility to account for long-lived FDs consumed by libpq connections. There may be other places where it's worth doing such accounting, too, but this seems like enough to solve the immediate problem. Internally to fd.c, "external" FDs are limited to max_safe_fds/3 FDs. (Callers can choose to ignore this limit, but of course it's unwise to do so except for fixed file allocations.) I also reduced the limit on "allocated" files to max_safe_fds/3 FDs (it had been max_safe_fds/2). Conceivably a smarter rule could be used here --- but in practice, on reasonable systems, max_safe_fds should be large enough that this isn't much of an issue, so KISS for now. To avoid possible regression in the number of external or allocated files that can be opened, increase FD_MINFREE and the lower limit on max_files_per_process a little bit; we now insist that the effective "ulimit -n" be at least 64. This seems like pretty clearly a bug fix, but in view of the lack of field complaints, I'll refrain from risking a back-patch. Discussion: https://postgr.es/m/E1izCmM-0005pV-Co@gemulon.postgresql.org
Diffstat (limited to 'src/backend/storage/file/fd.c')
-rw-r--r--src/backend/storage/file/fd.c116
1 files changed, 103 insertions, 13 deletions
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index b5f4df6a485..34f7443110a 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -61,6 +61,12 @@
* BasicOpenFile, it is solely the caller's responsibility to close the file
* descriptor by calling close(2).
*
+ * If a non-virtual file descriptor needs to be held open for any length of
+ * time, report it to fd.c by calling AcquireExternalFD or ReserveExternalFD
+ * (and eventually ReleaseExternalFD), so that we can take it into account
+ * while deciding how many VFDs can be open. This applies to FDs obtained
+ * with BasicOpenFile as well as those obtained without use of any fd.c API.
+ *
*-------------------------------------------------------------------------
*/
@@ -103,8 +109,8 @@
/*
* We must leave some file descriptors free for system(), the dynamic loader,
* and other code that tries to open files without consulting fd.c. This
- * is the number left free. (While we can be pretty sure we won't get
- * EMFILE, there's never any guarantee that we won't get ENFILE due to
+ * is the number left free. (While we try fairly hard to prevent EMFILE
+ * errors, there's never any guarantee that we won't get ENFILE due to
* other processes chewing up FDs. So it's a bad idea to try to open files
* without consulting fd.c. Nonetheless we cannot control all code.)
*
@@ -119,9 +125,12 @@
/*
* If we have fewer than this many usable FDs after allowing for the reserved
- * ones, choke.
+ * ones, choke. (This value is chosen to work with "ulimit -n 64", but not
+ * much less than that. Note that this value ensures numExternalFDs can be
+ * at least 16; as of this writing, the contrib/postgres_fdw regression tests
+ * will not pass unless that can grow to at least 14.)
*/
-#define FD_MINFREE 10
+#define FD_MINFREE 48
/*
* A number of platforms allow individual processes to open many more files
@@ -132,8 +141,8 @@
int max_files_per_process = 1000;
/*
- * Maximum number of file descriptors to open for either VFD entries or
- * AllocateFile/AllocateDir/OpenTransientFile operations. This is initialized
+ * Maximum number of file descriptors to open for operations that fd.c knows
+ * about (VFDs, AllocateFile etc, or "external" FDs). This is initialized
* to a conservative value, and remains that way indefinitely in bootstrap or
* standalone-backend cases. In normal postmaster operation, the postmaster
* calls set_max_safe_fds() late in initialization to update the value, and
@@ -142,7 +151,7 @@ int max_files_per_process = 1000;
* Note: the value of max_files_per_process is taken into account while
* setting this variable, and so need not be tested separately.
*/
-int max_safe_fds = 32; /* default if not changed */
+int max_safe_fds = FD_MINFREE; /* default if not changed */
/* Whether it is safe to continue running after fsync() fails. */
bool data_sync_retry = false;
@@ -244,6 +253,11 @@ static int maxAllocatedDescs = 0;
static AllocateDesc *allocatedDescs = NULL;
/*
+ * Number of open "external" FDs reported to Reserve/ReleaseExternalFD.
+ */
+static int numExternalFDs = 0;
+
+/*
* Number of temporary files opened during the current session;
* this is used in generation of tempfile names.
*/
@@ -1025,6 +1039,80 @@ tryAgain:
return -1; /* failure */
}
+/*
+ * AcquireExternalFD - attempt to reserve an external file descriptor
+ *
+ * This should be used by callers that need to hold a file descriptor open
+ * over more than a short interval, but cannot use any of the other facilities
+ * provided by this module.
+ *
+ * The difference between this and the underlying ReserveExternalFD function
+ * is that this will report failure (by setting errno and returning false)
+ * if "too many" external FDs are already reserved. This should be used in
+ * any code where the total number of FDs to be reserved is not predictable
+ * and small.
+ */
+bool
+AcquireExternalFD(void)
+{
+ /*
+ * We don't want more than max_safe_fds / 3 FDs to be consumed for
+ * "external" FDs.
+ */
+ if (numExternalFDs < max_safe_fds / 3)
+ {
+ ReserveExternalFD();
+ return true;
+ }
+ errno = EMFILE;
+ return false;
+}
+
+/*
+ * ReserveExternalFD - report external consumption of a file descriptor
+ *
+ * This should be used by callers that need to hold a file descriptor open
+ * over more than a short interval, but cannot use any of the other facilities
+ * provided by this module. This just tracks the use of the FD and closes
+ * VFDs if needed to ensure we keep NUM_RESERVED_FDS FDs available.
+ *
+ * Call this directly only in code where failure to reserve the FD would be
+ * fatal; for example, the WAL-writing code does so, since the alternative is
+ * session failure. Also, it's very unwise to do so in code that could
+ * consume more than one FD per process.
+ *
+ * Note: as long as everybody plays nice so that NUM_RESERVED_FDS FDs remain
+ * available, it doesn't matter too much whether this is called before or
+ * after actually opening the FD; but doing so beforehand reduces the risk of
+ * an EMFILE failure if not everybody played nice. In any case, it's solely
+ * caller's responsibility to keep the external-FD count in sync with reality.
+ */
+void
+ReserveExternalFD(void)
+{
+ /*
+ * Release VFDs if needed to stay safe. Because we do this before
+ * incrementing numExternalFDs, the final state will be as desired, i.e.,
+ * nfile + numAllocatedDescs + numExternalFDs <= max_safe_fds.
+ */
+ ReleaseLruFiles();
+
+ numExternalFDs++;
+}
+
+/*
+ * ReleaseExternalFD - report release of an external file descriptor
+ *
+ * This is guaranteed not to change errno, so it can be used in failure paths.
+ */
+void
+ReleaseExternalFD(void)
+{
+ Assert(numExternalFDs > 0);
+ numExternalFDs--;
+}
+
+
#if defined(FDDEBUG)
static void
@@ -1185,7 +1273,7 @@ ReleaseLruFile(void)
static void
ReleaseLruFiles(void)
{
- while (nfile + numAllocatedDescs >= max_safe_fds)
+ while (nfile + numAllocatedDescs + numExternalFDs >= max_safe_fds)
{
if (!ReleaseLruFile())
break;
@@ -2176,13 +2264,13 @@ reserveAllocatedDesc(void)
/*
* If the array hasn't yet been created in the current process, initialize
- * it with FD_MINFREE / 2 elements. In many scenarios this is as many as
+ * it with FD_MINFREE / 3 elements. In many scenarios this is as many as
* we will ever need, anyway. We don't want to look at max_safe_fds
* immediately because set_max_safe_fds() may not have run yet.
*/
if (allocatedDescs == NULL)
{
- newMax = FD_MINFREE / 2;
+ newMax = FD_MINFREE / 3;
newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
/* Out of memory already? Treat as fatal error. */
if (newDescs == NULL)
@@ -2200,10 +2288,12 @@ reserveAllocatedDesc(void)
*
* We mustn't let allocated descriptors hog all the available FDs, and in
* practice we'd better leave a reasonable number of FDs for VFD use. So
- * set the maximum to max_safe_fds / 2. (This should certainly be at
- * least as large as the initial size, FD_MINFREE / 2.)
+ * set the maximum to max_safe_fds / 3. (This should certainly be at
+ * least as large as the initial size, FD_MINFREE / 3, so we aren't
+ * tightening the restriction here.) Recall that "external" FDs are
+ * allowed to consume another third of max_safe_fds.
*/
- newMax = max_safe_fds / 2;
+ newMax = max_safe_fds / 3;
if (newMax > maxAllocatedDescs)
{
newDescs = (AllocateDesc *) realloc(allocatedDescs,