author     Andres Freund <andres@anarazel.de>    2016-02-15 23:52:38 +0100
committer  Andres Freund <andres@anarazel.de>    2016-02-16 00:56:34 +0100
commit     7975c5e0a992ae9a45e03d145e0d37e2b5a707f5
tree       fa09f94f77fd54482b7fbd3ae411534ac1d7a85d /src/backend/access/transam/xlog.c
parent     5df44d14ba9fd3f6149c3fa0919745c9e24bcffe
Allow the WAL writer to flush WAL at a reduced rate.
Commit 4de82f7d7 increased the WAL flush rate, mainly to increase the likelihood that hint bits can be set quickly. More quickly set hint bits can reduce contention around the clog et al. But unfortunately the increased flush rate can have a significant negative performance impact; I have measured slowdowns of up to a factor of ~4. The reason for this slowdown is that if there are independent writes to the underlying devices, for example because shared buffers is a lot smaller than the hot data set, or because a checkpoint is ongoing, the fdatasync() calls force cache flushes to be emitted to the storage.

The reduced flush rate is achieved by flushing WAL only if the last flush was longer than wal_writer_delay ago, or if more than wal_writer_flush_after (new GUC) unflushed blocks are pending. Based on some tests the default for wal_writer_flush_after is 1MB, which seems to work well both on SSD and rotational media.

To avoid a negative performance impact due to 4de82f7d7, an earlier commit (db76b1e) made SetHintBits() more likely to succeed, preventing performance regressions in the pgbench tests I performed.

Discussion: 20160118163908.GW10941@awork2.anarazel.de
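For illustration, the flush-throttling rule described above boils down to a small decision function. The following is a minimal standalone C sketch, not PostgreSQL code; the names (should_flush, pending_blocks, flush_after_blocks, delay_ms) and the millisecond/block units are assumptions made for this example:

#include <stdbool.h>
#include <stdint.h>

/*
 * Sketch of the throttling rule: flush if block-based throttling is
 * disabled or this is the first call, if at least delay_ms has passed
 * since the last flush, or if enough unflushed blocks have accumulated.
 */
static bool
should_flush(int64_t now_ms, int64_t last_flush_ms,
             int pending_blocks, int flush_after_blocks, int delay_ms)
{
    if (flush_after_blocks == 0 || last_flush_ms == 0)
        return true;            /* first call, or throttling disabled */
    if (now_ms - last_flush_ms >= delay_ms)
        return true;            /* bound async-commit latency */
    if (pending_blocks >= flush_after_blocks)
        return true;            /* enough unflushed WAL has accumulated */
    return false;               /* skip the flush this round */
}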
Diffstat (limited to 'src/backend/access/transam/xlog.c')
-rw-r--r--  src/backend/access/transam/xlog.c | 104
1 file changed, 76 insertions(+), 28 deletions(-)
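The two GUCs involved can be set in postgresql.conf. A hypothetical configuration fragment, assuming the 200ms and 1MB defaults described for these settings:

# postgresql.conf (illustrative values)
wal_writer_delay = 200ms         # upper bound on time between walwriter flushes
wal_writer_flush_after = 1MB     # flush once this much unflushed WAL is pending;
                                 # 0 disables the block-based throttling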
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 8d480f7ce24..94b79ac49d5 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -42,6 +42,7 @@
#include "miscadmin.h"
#include "pgstat.h"
#include "postmaster/bgwriter.h"
+#include "postmaster/walwriter.h"
#include "postmaster/startup.h"
#include "replication/basebackup.h"
#include "replication/logical.h"
@@ -2729,28 +2730,37 @@ XLogFlush(XLogRecPtr record)
}
/*
- * Flush xlog, but without specifying exactly where to flush to.
+ * Write & flush xlog, but without specifying exactly where to.
*
- * We normally flush only completed blocks; but if there is nothing to do on
- * that basis, we check for unflushed async commits in the current incomplete
- * block, and flush through the latest one of those. Thus, if async commits
- * are not being used, we will flush complete blocks only. We can guarantee
- * that async commits reach disk after at most three cycles; normally only
- * one or two. (When flushing complete blocks, we allow XLogWrite to write
- * "flexibly", meaning it can stop at the end of the buffer ring; this makes a
- * difference only with very high load or long wal_writer_delay, but imposes
- * one extra cycle for the worst case for async commits.)
+ * We normally write only completed blocks; but if there is nothing to do on
+ * that basis, we check for unwritten async commits in the current incomplete
+ * block, and write through the latest one of those. Thus, if async commits
+ * are not being used, we will write complete blocks only.
+ *
+ * If, based on the above, there's anything to write, we do so immediately. But
+ * to avoid calling fsync, fdatasync et al. at a rate that'd impact
+ * concurrent IO, we only flush WAL every wal_writer_delay ms, or if there are
+ * more than wal_writer_flush_after unflushed blocks.
+ *
+ * We can guarantee that async commits reach disk after at most three
+ * wal_writer_delay cycles. (When flushing complete blocks, we allow XLogWrite
+ * to write "flexibly", meaning it can stop at the end of the buffer ring;
+ * this makes a difference only with very high load or long wal_writer_delay,
+ * but imposes one extra cycle for the worst case for async commits.)
*
* This routine is invoked periodically by the background walwriter process.
*
- * Returns TRUE if we flushed anything.
+ * Returns TRUE if there was any work to do, even if we skipped flushing due
+ * to wal_writer_delay/wal_writer_flush_after.
*/
bool
XLogBackgroundFlush(void)
{
- XLogRecPtr WriteRqstPtr;
+ XLogwrtRqst WriteRqst;
bool flexible = true;
- bool wrote_something = false;
+ static TimestampTz lastflush;
+ TimestampTz now;
+ int flushbytes;
/* XLOG doesn't need flushing during recovery */
if (RecoveryInProgress())
@@ -2759,17 +2769,17 @@ XLogBackgroundFlush(void)
/* read LogwrtResult and update local state */
SpinLockAcquire(&XLogCtl->info_lck);
LogwrtResult = XLogCtl->LogwrtResult;
- WriteRqstPtr = XLogCtl->LogwrtRqst.Write;
+ WriteRqst = XLogCtl->LogwrtRqst;
SpinLockRelease(&XLogCtl->info_lck);
/* back off to last completed page boundary */
- WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ;
+ WriteRqst.Write -= WriteRqst.Write % XLOG_BLCKSZ;
/* if we have already flushed that far, consider async commit records */
- if (WriteRqstPtr <= LogwrtResult.Flush)
+ if (WriteRqst.Write <= LogwrtResult.Flush)
{
SpinLockAcquire(&XLogCtl->info_lck);
- WriteRqstPtr = XLogCtl->asyncXactLSN;
+ WriteRqst.Write = XLogCtl->asyncXactLSN;
SpinLockRelease(&XLogCtl->info_lck);
flexible = false; /* ensure it all gets written */
}
@@ -2779,7 +2789,7 @@ XLogBackgroundFlush(void)
* holding an open file handle to a logfile that's no longer in use,
* preventing the file from being deleted.
*/
- if (WriteRqstPtr <= LogwrtResult.Flush)
+ if (WriteRqst.Write <= LogwrtResult.Flush)
{
if (openLogFile >= 0)
{
@@ -2791,10 +2801,47 @@ XLogBackgroundFlush(void)
return false;
}
+ /*
+ * Determine how far to flush WAL, based on the wal_writer_delay and
+ * wal_writer_flush_after GUCs.
+ */
+ now = GetCurrentTimestamp();
+ flushbytes =
+ WriteRqst.Write / XLOG_BLCKSZ - LogwrtResult.Flush / XLOG_BLCKSZ;
+
+ if (WalWriterFlushAfter == 0 || lastflush == 0)
+ {
+ /* first call, or block-based limits disabled */
+ WriteRqst.Flush = WriteRqst.Write;
+ lastflush = now;
+ }
+ else if (TimestampDifferenceExceeds(lastflush, now, WalWriterDelay))
+ {
+ /*
+ * Flush the writes at least every WalWriterDelay ms. This is important
+ * to bound the amount of time it takes for an asynchronous commit to
+ * hit disk.
+ */
+ WriteRqst.Flush = WriteRqst.Write;
+ lastflush = now;
+ }
+ else if (flushbytes >= WalWriterFlushAfter)
+ {
+ /* exceeded wal_writer_flush_after blocks, flush */
+ WriteRqst.Flush = WriteRqst.Write;
+ lastflush = now;
+ }
+ else
+ {
+ /* no flushing this time round */
+ WriteRqst.Flush = 0;
+ }
+
#ifdef WAL_DEBUG
if (XLOG_DEBUG)
- elog(LOG, "xlog bg flush request %X/%X; write %X/%X; flush %X/%X",
- (uint32) (WriteRqstPtr >> 32), (uint32) WriteRqstPtr,
+ elog(LOG, "xlog bg flush request write %X/%X; flush: %X/%X, current is write %X/%X; flush %X/%X",
+ (uint32) (WriteRqst.Write >> 32), (uint32) WriteRqst.Write,
+ (uint32) (WriteRqst.Flush >> 32), (uint32) WriteRqst.Flush,
(uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
(uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
#endif
@@ -2802,17 +2849,13 @@ XLogBackgroundFlush(void)
START_CRIT_SECTION();
/* now wait for any in-progress insertions to finish and get write lock */
- WaitXLogInsertionsToFinish(WriteRqstPtr);
+ WaitXLogInsertionsToFinish(WriteRqst.Write);
LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
LogwrtResult = XLogCtl->LogwrtResult;
- if (WriteRqstPtr > LogwrtResult.Flush)
+ if (WriteRqst.Write > LogwrtResult.Write ||
+ WriteRqst.Flush > LogwrtResult.Flush)
{
- XLogwrtRqst WriteRqst;
-
- WriteRqst.Write = WriteRqstPtr;
- WriteRqst.Flush = WriteRqstPtr;
XLogWrite(WriteRqst, flexible);
- wrote_something = true;
}
LWLockRelease(WALWriteLock);
@@ -2827,7 +2870,12 @@ XLogBackgroundFlush(void)
*/
AdvanceXLInsertBuffer(InvalidXLogRecPtr, true);
- return wrote_something;
+ /*
+ * If we determined that we need to write data, but somebody else
+ * wrote/flushed it already, this cycle should still be considered active,
+ * to avoid hibernating too early.
+ */
+ return true;
}
/*