aboutsummaryrefslogtreecommitdiff
path: root/src/backend/storage/file/fd.c
diff options
context:
space:
mode:
authorAndres Freund <andres@anarazel.de>2016-02-19 12:13:05 -0800
committerAndres Freund <andres@anarazel.de>2016-03-10 17:04:34 -0800
commit428b1d6b29ca599c5700d4bc4f4ce4c5880369bf (patch)
tree55a957b82069476471bc9966ce81a9ffec8f4ba5 /src/backend/storage/file/fd.c
parentc82c92b111b7b636e80f8a432de10c62011b35b6 (diff)
downloadpostgresql-428b1d6b29ca599c5700d4bc4f4ce4c5880369bf.tar.gz
postgresql-428b1d6b29ca599c5700d4bc4f4ce4c5880369bf.zip
Allow to trigger kernel writeback after a configurable number of writes.
Currently writes to the main data files of PostgreSQL all go through the OS page cache. This means that some operating systems can end up collecting a large number of dirty buffers in their respective page caches. When these dirty buffers are flushed to storage rapidly, be it because of fsync(), timeouts, or dirty ratios, latency for other reads and writes can increase massively. This is the primary reason for regular massive stalls observed in real world scenarios and artificial benchmarks; on rotating disks stalls on the order of hundreds of seconds have been observed. On Linux it is possible to control this by reducing the global dirty limits significantly, reducing the above problem. But global configuration is rather problematic because it'll affect other applications; also PostgreSQL itself doesn't always want this behavior, e.g. for temporary files it's undesirable. Several operating systems allow some control over the kernel page cache. Linux has sync_file_range(2), several POSIX systems have msync(2) and posix_fadvise(2). sync_file_range(2) is preferable because it requires no special setup, whereas msync() requires the to-be-flushed range to be mmap'ed. For the purpose of flushing dirty data posix_fadvise(2) is the worst alternative, as flushing dirty data is just a side-effect of POSIX_FADV_DONTNEED, which also removes the pages from the page cache. Thus the feature is enabled by default only on Linux, but can be enabled on all systems that have any of the above APIs. While desirable and likely possible, this patch does not contain an implementation for Windows. With the infrastructure added, writes made via checkpointer, bgwriter and normal user backends can be flushed after a configurable number of writes.
Each of these sources of writes is controlled by a separate GUC, checkpointer_flush_after, bgwriter_flush_after and backend_flush_after respectively; they're separate because the number of writes after which flushing is worthwhile differs for each, and because the performance considerations of controlled flushing for each of these are different. A later patch will add checkpoint sorting - after that flushes from the checkpoint will almost always be desirable. Bgwriter flushes are most of the time going to be random, which is slow on lots of storage hardware. Flushing in backends works well if the storage and bgwriter can keep up, but if not it can have negative consequences. This patch is likely to have negative performance consequences without checkpoint sorting, but unfortunately so has sorting without flush control. Discussion: alpine.DEB.2.10.1506011320000.28433@sto Author: Fabien Coelho and Andres Freund
Diffstat (limited to 'src/backend/storage/file/fd.c')
-rw-r--r--src/backend/storage/file/fd.c157
1 files changed, 139 insertions, 18 deletions
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index c1076992a33..046d1b3cc30 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -61,6 +61,9 @@
#include <sys/file.h>
#include <sys/param.h>
#include <sys/stat.h>
+#ifndef WIN32
+#include <sys/mman.h>
+#endif
#include <unistd.h>
#include <fcntl.h>
#ifdef HAVE_SYS_RESOURCE_H
@@ -82,6 +85,8 @@
/* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
#if defined(HAVE_SYNC_FILE_RANGE)
#define PG_FLUSH_DATA_WORKS 1
+#elif !defined(WIN32) && defined(MS_ASYNC)
+#define PG_FLUSH_DATA_WORKS 1
#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
#define PG_FLUSH_DATA_WORKS 1
#endif
@@ -383,29 +388,126 @@ pg_fdatasync(int fd)
}
/*
- * pg_flush_data --- advise OS that the data described won't be needed soon
+ * pg_flush_data --- advise OS that the described dirty data should be flushed
*
- * Not all platforms have sync_file_range or posix_fadvise; treat as no-op
- * if not available. Also, treat as no-op if enableFsync is off; this is
- * because the call isn't free, and some platforms such as Linux will actually
- * block the requestor until the write is scheduled.
+ * An offset of 0 with an nbytes of 0 means that the entire file should be
+ * flushed.
*/
-int
-pg_flush_data(int fd, off_t offset, off_t amount)
+void
+pg_flush_data(int fd, off_t offset, off_t nbytes)
{
-#ifdef PG_FLUSH_DATA_WORKS
- if (enableFsync)
- {
+ /*
+ * Right now file flushing is primarily used to make later
+ * fsync()/fdatasync() calls have less impact. Thus don't trigger
+ * flushes if fsyncs are disabled - that's a decision we might want to
+ * make configurable at some point.
+ */
+ if (!enableFsync)
+ return;
+
+ /*
+ * XXX: compile all alternatives, to find portability problems more easily
+ */
#if defined(HAVE_SYNC_FILE_RANGE)
- return sync_file_range(fd, offset, amount, SYNC_FILE_RANGE_WRITE);
-#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
- return posix_fadvise(fd, offset, amount, POSIX_FADV_DONTNEED);
-#else
-#error PG_FLUSH_DATA_WORKS should not have been defined
+ {
+ int rc = 0;
+
+ /*
+ * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
+ * tells the OS that writeback for the passed in blocks should be
+ * started, but that we don't want to wait for completion. Note that
+ * this call might block if too much dirty data exists in the range.
+ * This is the preferable method on OSs supporting it, as it works
+ * reliably when available (contrast to msync()) and doesn't flush out
+ * clean data (like FADV_DONTNEED).
+ */
+ rc = sync_file_range(fd, offset, nbytes,
+ SYNC_FILE_RANGE_WRITE);
+
+ /* don't error out, this is just a performance optimization */
+ if (rc != 0)
+ {
+ ereport(WARNING,
+ (errcode_for_file_access(),
+ errmsg("could not flush dirty data: %m")));
+ }
+
+ return;
+ }
#endif
+#if !defined(WIN32) && defined(MS_ASYNC)
+ {
+ int rc = 0;
+ void *p;
+
+ /*
+ * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
+ * writeback. On linux it only does so if MS_SYNC is specified, but
+ * then it does the writeback synchronously. Luckily all common linux
+ * systems have sync_file_range(). This is preferable over
+ * FADV_DONTNEED because it doesn't flush out clean data.
+ *
+ * We map the file (mmap()), tell the kernel to sync back the contents
+ * (msync()), and then remove the mapping again (munmap()).
+ */
+ p = mmap(NULL, nbytes,
+ PROT_READ | PROT_WRITE, MAP_SHARED,
+ fd, offset);
+ if (p == MAP_FAILED)
+ {
+ ereport(WARNING,
+ (errcode_for_file_access(),
+ errmsg("could not mmap while flushing dirty data: %m")));
+ return;
+ }
+
+ rc = msync(p, nbytes, MS_ASYNC);
+ if (rc != 0)
+ {
+ ereport(WARNING,
+ (errcode_for_file_access(),
+ errmsg("could not flush dirty data: %m")));
+ /* NB: need to fall through to munmap()! */
+ }
+
+ rc = munmap(p, nbytes);
+ if (rc != 0)
+ {
+ /* FATAL error because mapping would remain */
+ ereport(FATAL,
+ (errcode_for_file_access(),
+ errmsg("could not munmap while flushing blocks: %m")));
+ }
+
+ return;
+ }
+#endif
+#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
+ {
+ int rc = 0;
+
+ /*
+ * Signal the kernel that the passed in range should not be cached
+ * anymore. This has the, desired, side effect of writing out dirty
+ * data, and the, undesired, side effect of likely discarding useful
+ * clean cached blocks. For the latter reason this is the least
+ * preferable method.
+ */
+
+ rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
+
+ /* don't error out, this is just a performance optimization */
+ if (rc != 0)
+ {
+ ereport(WARNING,
+ (errcode_for_file_access(),
+ errmsg("could not flush dirty data: %m")));
+ return;
+ }
+
+ return;
}
#endif
- return 0;
}
@@ -1396,6 +1498,24 @@ FilePrefetch(File file, off_t offset, int amount)
#endif
}
+void
+FileWriteback(File file, off_t offset, int amount)
+{
+ int returnCode;
+
+ Assert(FileIsValid(file));
+
+ DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " %d",
+ file, VfdCache[file].fileName,
+ (int64) offset, amount));
+
+ returnCode = FileAccess(file);
+ if (returnCode < 0)
+ return;
+
+ pg_flush_data(VfdCache[file].fd, offset, amount);
+}
+
int
FileRead(File file, char *buffer, int amount)
{
@@ -2796,9 +2916,10 @@ pre_sync_fname(const char *fname, bool isdir, int elevel)
}
/*
- * We ignore errors from pg_flush_data() because this is only a hint.
+ * pg_flush_data() ignores errors, which is ok because this is only a
+ * hint.
*/
- (void) pg_flush_data(fd, 0, 0);
+ pg_flush_data(fd, 0, 0);
(void) CloseTransientFile(fd);
}