diff options
author | Andres Freund <andres@anarazel.de> | 2016-02-19 12:13:05 -0800 |
---|---|---|
committer | Andres Freund <andres@anarazel.de> | 2016-03-10 17:04:34 -0800 |
commit | 428b1d6b29ca599c5700d4bc4f4ce4c5880369bf (patch) | |
tree | 55a957b82069476471bc9966ce81a9ffec8f4ba5 /src/backend/storage/file/fd.c | |
parent | c82c92b111b7b636e80f8a432de10c62011b35b6 (diff) | |
download | postgresql-428b1d6b29ca599c5700d4bc4f4ce4c5880369bf.tar.gz postgresql-428b1d6b29ca599c5700d4bc4f4ce4c5880369bf.zip |
Allow triggering kernel writeback after a configurable number of writes.
Currently writes to the main data files of postgres all go through the
OS page cache. This means that some operating systems can end up
collecting a large number of dirty buffers in their respective page
caches. When these dirty buffers are flushed to storage rapidly, be it
because of fsync(), timeouts, or dirty ratios, latency for other reads
and writes can increase massively. This is the primary reason for
regular massive stalls observed in real world scenarios and artificial
benchmarks; on rotating disks stalls on the order of hundreds of seconds
have been observed.
On Linux it is possible to control this by reducing the global dirty
limits significantly, reducing the above problem. But global
configuration is rather problematic because it'll affect other
applications; also PostgreSQL itself doesn't always want this
behavior, e.g. for temporary files it's undesirable.
Several operating systems allow some control over the kernel page
cache. Linux has sync_file_range(2), several POSIX systems have msync(2)
and posix_fadvise(2). sync_file_range(2) is preferable because it
requires no special setup, whereas msync() requires the to-be-flushed
range to be mmap'ed. For the purpose of flushing dirty data
posix_fadvise(2) is the worst alternative, as flushing dirty data is
just a side-effect of POSIX_FADV_DONTNEED, which also removes the pages
from the page cache. Thus the feature is enabled by default only on
Linux, but can be enabled on all systems that have any of the above
APIs.
While desirable and likely possible, this patch does not contain an
implementation for Windows.
With the infrastructure added, writes made via checkpointer, bgwriter
and normal user backends can be flushed after a configurable number of
writes. Each of these sources of writes is controlled by a separate GUC,
checkpointer_flush_after, bgwriter_flush_after and backend_flush_after
respectively; they're separate because the number of writes after which
flushing is beneficial differs between them, and because the performance
considerations of controlled flushing for each of these are different.
A later patch will add checkpoint sorting - after that flushes from the
checkpoint will almost always be desirable. Bgwriter flushes are most of
the time going to be random, which is slow on lots of storage hardware.
Flushing in backends works well if the storage and bgwriter can keep up,
but if not it can have negative consequences. This patch is likely to
have negative performance consequences without checkpoint sorting, but
unfortunately so has sorting without flush control.
Discussion: alpine.DEB.2.10.1506011320000.28433@sto
Author: Fabien Coelho and Andres Freund
Diffstat (limited to 'src/backend/storage/file/fd.c')
-rw-r--r-- | src/backend/storage/file/fd.c | 157 |
1 files changed, 139 insertions, 18 deletions
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index c1076992a33..046d1b3cc30 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -61,6 +61,9 @@ #include <sys/file.h> #include <sys/param.h> #include <sys/stat.h> +#ifndef WIN32 +#include <sys/mman.h> +#endif #include <unistd.h> #include <fcntl.h> #ifdef HAVE_SYS_RESOURCE_H @@ -82,6 +85,8 @@ /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */ #if defined(HAVE_SYNC_FILE_RANGE) #define PG_FLUSH_DATA_WORKS 1 +#elif !defined(WIN32) && defined(MS_ASYNC) +#define PG_FLUSH_DATA_WORKS 1 #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED) #define PG_FLUSH_DATA_WORKS 1 #endif @@ -383,29 +388,126 @@ pg_fdatasync(int fd) } /* - * pg_flush_data --- advise OS that the data described won't be needed soon + * pg_flush_data --- advise OS that the described dirty data should be flushed * - * Not all platforms have sync_file_range or posix_fadvise; treat as no-op - * if not available. Also, treat as no-op if enableFsync is off; this is - * because the call isn't free, and some platforms such as Linux will actually - * block the requestor until the write is scheduled. + * An offset of 0 with an nbytes 0 means that the entire file should be + * flushed. */ -int -pg_flush_data(int fd, off_t offset, off_t amount) +void +pg_flush_data(int fd, off_t offset, off_t nbytes) { -#ifdef PG_FLUSH_DATA_WORKS - if (enableFsync) - { + /* + * Right now file flushing is primarily used to avoid making later + * fsync()/fdatasync() calls have a less impact. Thus don't trigger + * flushes if fsyncs are disabled - that's a decision we might want to + * make configurable at some point. 
+ */ + if (!enableFsync) + return; + + /* + * XXX: compile all alternatives, to find portability problems more easily + */ #if defined(HAVE_SYNC_FILE_RANGE) - return sync_file_range(fd, offset, amount, SYNC_FILE_RANGE_WRITE); -#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED) - return posix_fadvise(fd, offset, amount, POSIX_FADV_DONTNEED); -#else -#error PG_FLUSH_DATA_WORKS should not have been defined + { + int rc = 0; + + /* + * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific, + * tells the OS that writeback for the passed in blocks should be + * started, but that we don't want to wait for completion. Note that + * this call might block if too much dirty data exists in the range. + * This is the preferrable method on OSs supporting it, as it works + * reliably when available (contrast to msync()) and doesn't flush out + * clean data (like FADV_DONTNEED). + */ + rc = sync_file_range(fd, offset, nbytes, + SYNC_FILE_RANGE_WRITE); + + /* don't error out, this is just a performance optimization */ + if (rc != 0) + { + ereport(WARNING, + (errcode_for_file_access(), + errmsg("could not flush dirty data: %m"))); + } + + return; + } #endif +#if !defined(WIN32) && defined(MS_ASYNC) + { + int rc = 0; + void *p; + + /* + * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers + * writeback. On linux it only does so with MS_SYNC is specified, but + * then it does the writeback synchronously. Luckily all common linux + * systems have sync_file_range(). This is preferrable over + * FADV_DONTNEED because it doesn't flush out clean data. + * + * We map the file (mmap()), tell the kernel to sync back the contents + * (msync()), and then remove the mapping again (munmap()). 
+ */ + p = mmap(NULL, nbytes, + PROT_READ | PROT_WRITE, MAP_SHARED, + fd, offset); + if (p == MAP_FAILED) + { + ereport(WARNING, + (errcode_for_file_access(), + errmsg("could not mmap while flushing dirty data: %m"))); + return; + } + + rc = msync(p, nbytes, MS_ASYNC); + if (rc != 0) + { + ereport(WARNING, + (errcode_for_file_access(), + errmsg("could not flush dirty data: %m"))); + /* NB: need to fall through to munmap()! */ + } + + rc = munmap(p, nbytes); + if (rc != 0) + { + /* FATAL error because mapping would remain */ + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not munmap while flushing blocks: %m"))); + } + + return; + } +#endif +#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED) + { + int rc = 0; + + /* + * Signal the kernel that the passed in range should not be cached + * anymore. This has the, desired, side effect of writing out dirty + * data, and the, undesired, side effect of likely discarding useful + * clean cached blocks. For the latter reason this is the least + * preferrable method. 
+ */ + + rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED); + + /* don't error out, this is just a performance optimization */ + if (rc != 0) + { + ereport(WARNING, + (errcode_for_file_access(), + errmsg("could not flush dirty data: %m"))); + return; + } + + return; } #endif - return 0; } @@ -1396,6 +1498,24 @@ FilePrefetch(File file, off_t offset, int amount) #endif } +void +FileWriteback(File file, off_t offset, int amount) +{ + int returnCode; + + Assert(FileIsValid(file)); + + DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " %d", + file, VfdCache[file].fileName, + (int64) offset, amount)); + + returnCode = FileAccess(file); + if (returnCode < 0) + return; + + pg_flush_data(VfdCache[file].fd, offset, amount); +} + int FileRead(File file, char *buffer, int amount) { @@ -2796,9 +2916,10 @@ pre_sync_fname(const char *fname, bool isdir, int elevel) } /* - * We ignore errors from pg_flush_data() because this is only a hint. + * pg_flush_data() ignores errors, which is ok because this is only a + * hint. */ - (void) pg_flush_data(fd, 0, 0); + pg_flush_data(fd, 0, 0); (void) CloseTransientFile(fd); } |