aboutsummaryrefslogtreecommitdiff
path: root/src/backend/storage/file/fd.c
diff options
context:
space:
mode:
authorThomas Munro <tmunro@postgresql.org>2021-03-20 11:46:32 +1300
committerThomas Munro <tmunro@postgresql.org>2021-03-20 12:07:28 +1300
commit61752afb26404dfc99a535c7a53f7f04dc110263 (patch)
treedbb477a1f01f495a180e891028e3d1545532881d /src/backend/storage/file/fd.c
parentb822ae13ea93c18326d58d47829bbc66d36fae5c (diff)
downloadpostgresql-61752afb26404dfc99a535c7a53f7f04dc110263.tar.gz
postgresql-61752afb26404dfc99a535c7a53f7f04dc110263.zip
Provide recovery_init_sync_method=syncfs.
Since commit 2ce439f3 we have opened every file in the data directory and called fsync() at the start of crash recovery. This can be very slow if there are many files, leading to field complaints of systems taking minutes or even hours to begin crash recovery. Provide an alternative method, for Linux only, where we call syncfs() on every possibly different filesystem under the data directory. This is equivalent, but avoids faulting in potentially many inodes from potentially slow storage. The new mode comes with some caveats, described in the documentation, so the default value for the new setting is "fsync", preserving the older behavior. Reported-by: Michael Brown <michael.brown@discourse.org> Reviewed-by: Fujii Masao <masao.fujii@oss.nttdata.com> Reviewed-by: Paul Guo <guopa@vmware.com> Reviewed-by: Bruce Momjian <bruce@momjian.us> Reviewed-by: Justin Pryzby <pryzby@telsasoft.com> Reviewed-by: David Steele <david@pgmasters.net> Discussion: https://postgr.es/m/11bc2bb7-ecb5-3ad0-b39f-df632734cd81%40discourse.org Discussion: https://postgr.es/m/CAEET0ZHGnbXmi8yF3ywsDZvb3m9CbdsGZgfTXscQ6agcbzcZAw%40mail.gmail.com
Diffstat (limited to 'src/backend/storage/file/fd.c')
-rw-r--r--src/backend/storage/file/fd.c65
1 files changed, 64 insertions, 1 deletions
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index 110ba31517a..28933f8bbe1 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -72,9 +72,11 @@
#include "postgres.h"
+#include <dirent.h>
#include <sys/file.h>
#include <sys/param.h>
#include <sys/stat.h>
+#include <sys/types.h>
#ifndef WIN32
#include <sys/mman.h>
#endif
@@ -158,6 +160,9 @@ int max_safe_fds = FD_MINFREE; /* default if not changed */
/* Whether it is safe to continue running after fsync() fails. */
bool data_sync_retry = false;
+/*
+ * How SyncDataDirectory() should do its job.  The default is
+ * RECOVERY_INIT_SYNC_METHOD_FSYNC; RECOVERY_INIT_SYNC_METHOD_SYNCFS is only
+ * acted upon in builds where HAVE_SYNCFS is defined.
+ */
+int recovery_init_sync_method = RECOVERY_INIT_SYNC_METHOD_FSYNC;
+
/* Debugging.... */
#ifdef FDDEBUG
@@ -3265,9 +3270,31 @@ looks_like_temp_rel_name(const char *name)
return true;
}
+#ifdef HAVE_SYNCFS
+/*
+ * do_syncfs -- call syncfs() for the filesystem that "path" lives on.
+ *
+ * "path" is opened read-only and the resulting descriptor is handed to
+ * syncfs().  Failure to open the path or to sync is reported at LOG level
+ * only; nothing is returned to the caller, so a failure here does not stop
+ * the caller from going on to its remaining paths.
+ */
+static void
+do_syncfs(const char *path)
+{
+	int			fd;
+
+	fd = OpenTransientFile(path, O_RDONLY);
+	if (fd < 0)
+	{
+		/* Quote the path, consistent with the syncfs() failure message. */
+		ereport(LOG,
+				(errcode_for_file_access(),
+				 errmsg("could not open file \"%s\": %m", path)));
+		return;
+	}
+	if (syncfs(fd) < 0)
+		ereport(LOG,
+				(errcode_for_file_access(),
+				 errmsg("could not sync filesystem for \"%s\": %m", path)));
+	CloseTransientFile(fd);
+}
+#endif
/*
- * Issue fsync recursively on PGDATA and all its contents.
+ * Issue fsync recursively on PGDATA and all its contents, or issue syncfs for
+ * all potential filesystems, depending on the recovery_init_sync_method setting.
*
* We fsync regular files and directories wherever they are, but we
* follow symlinks only for pg_wal and immediately under pg_tblspc.
@@ -3319,6 +3346,42 @@ SyncDataDirectory(void)
xlog_is_symlink = true;
#endif
+#ifdef HAVE_SYNCFS
+ if (recovery_init_sync_method == RECOVERY_INIT_SYNC_METHOD_SYNCFS)
+ {
+ DIR *dir;
+ struct dirent *de;
+
+ /*
+ * On Linux, we don't have to open every single file one by one. We
+ * can use syncfs() to sync whole filesystems. We only expect
+ * filesystem boundaries to exist where we tolerate symlinks, namely
+ * pg_wal and the tablespaces, so we call syncfs() for each of those
+ * directories.
+ */
+
+ /* Sync the top level pgdata directory. */
+ do_syncfs(".");
+ /* If any tablespaces are configured, sync each of those. */
+ dir = AllocateDir("pg_tblspc");
+ while ((de = ReadDirExtended(dir, "pg_tblspc", LOG)))
+ {
+ char path[MAXPGPATH];
+
+ if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+ continue;
+
+ snprintf(path, MAXPGPATH, "pg_tblspc/%s", de->d_name);
+ do_syncfs(path);
+ }
+ FreeDir(dir);
+ /* If pg_wal is a symlink, process that too. */
+ if (xlog_is_symlink)
+ do_syncfs("pg_wal");
+ return;
+ }
+#endif /* HAVE_SYNCFS */
+
/*
* If possible, hint to the kernel that we're soon going to fsync the data
* directory and its contents. Errors in this step are even less