aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMichael Paquier <michael@paquier.xyz>2025-02-26 09:49:59 +0900
committerMichael Paquier <michael@paquier.xyz>2025-02-26 09:49:59 +0900
commit6c349d83b607cfe20a685a5f66a4fa0e24dbe458 (patch)
treef40e52d4a8571f996976ffe0876b989921e9a998
parenta5cbdeb98af9a8d4f683fbffa69cd74be1f4a084 (diff)
downloadpostgresql-6c349d83b607cfe20a685a5f66a4fa0e24dbe458.tar.gz
postgresql-6c349d83b607cfe20a685a5f66a4fa0e24dbe458.zip
Re-add GUC track_wal_io_timing
This commit is a rework of 2421e9a51d20, about which Andres Freund has raised some concerns as it is valuable to have both track_io_timing and track_wal_io_timing in some cases, as the WAL write and fsync paths can be a major bottleneck for some workloads. Hence, it can be relevant to not calculate the WAL timings in environments where pg_test_timing performs poorly while capturing some IO data under track_io_timing for the non-WAL IO paths. The opposite can be also true: it should be possible to disable the non-WAL timings and enable the WAL timings (the previous GUC setups allowed this possibility). track_wal_io_timing is added back in this commit, controlling if WAL timings should be calculated in pg_stat_io for the read, fsync and write paths, as done previously with pg_stat_wal. pg_stat_wal previously tracked only the sync and write parts (now removed), read stats is new data tracked in pg_stat_io, all three are aggregated if track_wal_io_timing is enabled. The read part matters during recovery or if a XLogReader is used. Extra note: more control over if the types of timings calculated in pg_stat_io could be done with a GUC that lists pairs of (IOObject,IOOp). Reported-by: Andres Freund <andres@anarazel.de> Author: Bertrand Drouvot <bertranddrouvot.pg@gmail.com> Co-authored-by: Michael Paquier <michael@paquier.xyz> Discussion: https://postgr.es/m/3opf2wh2oljco6ldyqf7ukabw3jijnnhno6fjb4mlu6civ5h24@fcwmhsgmlmzu
-rw-r--r--doc/src/sgml/config.sgml32
-rw-r--r--doc/src/sgml/monitoring.sgml31
-rw-r--r--doc/src/sgml/wal.sgml4
-rw-r--r--src/backend/access/transam/xlog.c9
-rw-r--r--src/backend/access/transam/xlogreader.c2
-rw-r--r--src/backend/access/transam/xlogrecovery.c2
-rw-r--r--src/backend/storage/buffer/bufmgr.c10
-rw-r--r--src/backend/storage/buffer/localbuf.c4
-rw-r--r--src/backend/storage/smgr/md.c4
-rw-r--r--src/backend/utils/activity/pgstat_backend.c3
-rw-r--r--src/backend/utils/activity/pgstat_io.c14
-rw-r--r--src/backend/utils/misc/guc_tables.c9
-rw-r--r--src/backend/utils/misc/postgresql.conf.sample1
-rw-r--r--src/include/access/xlog.h1
-rw-r--r--src/include/pgstat.h2
15 files changed, 96 insertions, 32 deletions
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index a8354576108..e55700f35b8 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -8378,9 +8378,11 @@ COPY postgres_log FROM '/full/path/to/logfile.csv' WITH csv;
displayed in <link linkend="monitoring-pg-stat-database-view">
<structname>pg_stat_database</structname></link>,
<link linkend="monitoring-pg-stat-io-view">
- <structname>pg_stat_io</structname></link>, in the output of the
+ <structname>pg_stat_io</structname></link> (if <varname>object</varname>
+ is not <literal>wal</literal>), in the output of the
<link linkend="pg-stat-get-backend-io">
- <function>pg_stat_get_backend_io()</function></link> function, in the
+ <function>pg_stat_get_backend_io()</function></link> function (if
+ <varname>object</varname> is not <literal>wal</literal>), in the
output of <xref linkend="sql-explain"/> when the <literal>BUFFERS</literal>
option is used, in the output of <xref linkend="sql-vacuum"/> when
the <literal>VERBOSE</literal> option is used, by autovacuum
@@ -8393,6 +8395,32 @@ COPY postgres_log FROM '/full/path/to/logfile.csv' WITH csv;
</listitem>
</varlistentry>
+ <varlistentry id="guc-track-wal-io-timing" xreflabel="track_wal_io_timing">
+ <term><varname>track_wal_io_timing</varname> (<type>boolean</type>)
+ <indexterm>
+ <primary><varname>track_wal_io_timing</varname> configuration parameter</primary>
+ </indexterm>
+ </term>
+ <listitem>
+ <para>
+ Enables timing of WAL I/O calls. This parameter is off by default,
+ as it will repeatedly query the operating system for the current time,
+ which may cause significant overhead on some platforms.
+ You can use the <application>pg_test_timing</application> tool to
+ measure the overhead of timing on your system.
+ I/O timing information is displayed in
+ <link linkend="monitoring-pg-stat-io-view">
+ <structname>pg_stat_io</structname></link> for the
+ <varname>object</varname> <literal>wal</literal> and in the output of
+ the <link linkend="pg-stat-get-backend-io">
+ <function>pg_stat_get_backend_io()</function></link> function for the
+ <varname>object</varname> <literal>wal</literal>.
+ Only superusers and users with the appropriate <literal>SET</literal>
+ privilege can change this setting.
+ </para>
+ </listitem>
+ </varlistentry>
+
<varlistentry id="guc-track-functions" xreflabel="track_functions">
<term><varname>track_functions</varname> (<type>enum</type>)
<indexterm>
diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml
index 3dfd059b7ee..9178f1d34ef 100644
--- a/doc/src/sgml/monitoring.sgml
+++ b/doc/src/sgml/monitoring.sgml
@@ -184,6 +184,11 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser
</para>
<para>
+ The parameter <xref linkend="guc-track-wal-io-timing"/> enables monitoring
+ of WAL read, write and fsync times.
+ </para>
+
+ <para>
Normally these parameters are set in <filename>postgresql.conf</filename> so
that they apply to all server processes, but it is possible to turn
them on or off in individual sessions using the <xref
@@ -2723,7 +2728,11 @@ description | Waiting for a newly initialized WAL file to reach durable storage
</para>
<para>
Time spent in read operations in milliseconds (if
- <xref linkend="guc-track-io-timing"/> is enabled, otherwise zero)
+ <xref linkend="guc-track-io-timing"/> is enabled and
+ <varname>object</varname> is not <literal>wal</literal>,
+ or if <xref linkend="guc-track-wal-io-timing"/> is enabled
+ and <varname>object</varname> is <literal>wal</literal>,
+ otherwise zero)
</para>
</entry>
</row>
@@ -2757,7 +2766,11 @@ description | Waiting for a newly initialized WAL file to reach durable storage
</para>
<para>
Time spent in write operations in milliseconds (if
- <xref linkend="guc-track-io-timing"/> is enabled, otherwise zero)
+ <xref linkend="guc-track-io-timing"/> is enabled and
+ <varname>object</varname> is not <literal>wal</literal>,
+ or if <xref linkend="guc-track-wal-io-timing"/> is enabled
+ and <varname>object</varname> is <literal>wal</literal>,
+ otherwise zero)
</para>
</entry>
</row>
@@ -2816,8 +2829,12 @@ description | Waiting for a newly initialized WAL file to reach durable storage
<structfield>extend_time</structfield> <type>double precision</type>
</para>
<para>
- Time spent in extend operations in milliseconds (if
- <xref linkend="guc-track-io-timing"/> is enabled, otherwise zero)
+ Time spent in extend operations in milliseconds. (if
+ <xref linkend="guc-track-io-timing"/> is enabled and
+ <varname>object</varname> is not <literal>wal</literal>,
+ or if <xref linkend="guc-track-wal-io-timing"/> is enabled
+ and <varname>object</varname> is <literal>wal</literal>,
+ otherwise zero)
</para>
</entry>
</row>
@@ -2887,7 +2904,11 @@ description | Waiting for a newly initialized WAL file to reach durable storage
</para>
<para>
Time spent in fsync operations in milliseconds (if
- <xref linkend="guc-track-io-timing"/> is enabled, otherwise zero)
+ <xref linkend="guc-track-io-timing"/> is enabled and
+ <varname>object</varname> is not <literal>wal</literal>,
+ or if <xref linkend="guc-track-wal-io-timing"/> is enabled
+ and <varname>object</varname> is <literal>wal</literal>,
+ otherwise zero)
</para>
</entry>
</row>
diff --git a/doc/src/sgml/wal.sgml b/doc/src/sgml/wal.sgml
index 4fc09761115..f3b86b26be9 100644
--- a/doc/src/sgml/wal.sgml
+++ b/doc/src/sgml/wal.sgml
@@ -808,7 +808,7 @@
<para>
There are two internal functions to write WAL data to disk:
<function>XLogWrite</function> and <function>issue_xlog_fsync</function>.
- When <xref linkend="guc-track-io-timing"/> is enabled, the total
+ When <xref linkend="guc-track-wal-io-timing"/> is enabled, the total
amounts of time <function>XLogWrite</function> writes and
<function>issue_xlog_fsync</function> syncs WAL data to disk are counted as
<varname>write_time</varname> and <varname>fsync_time</varname> in
@@ -828,7 +828,7 @@
<literal>fsync</literal>, or <literal>fsync_writethrough</literal>,
the write operation moves WAL buffers to kernel cache and
<function>issue_xlog_fsync</function> syncs them to disk. Regardless
- of the setting of <varname>track_io_timing</varname>, the number
+ of the setting of <varname>track_wal_io_timing</varname>, the number
of times <function>XLogWrite</function> writes and
<function>issue_xlog_fsync</function> syncs WAL data to disk are also
counted as <varname>writes</varname> and <varname>fsyncs</varname>
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 919314f8258..799fc739e18 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -134,6 +134,7 @@ int CommitSiblings = 5; /* # concurrent xacts needed to sleep */
int wal_retrieve_retry_interval = 5000;
int max_slot_wal_keep_size_mb = -1;
int wal_decode_buffer_size = 512 * 1024;
+bool track_wal_io_timing = false;
#ifdef WAL_DEBUG
bool XLOG_DEBUG = false;
@@ -2437,7 +2438,7 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible)
/*
* Measure I/O timing to write WAL data, for pg_stat_io.
*/
- start = pgstat_prepare_io_time();
+ start = pgstat_prepare_io_time(track_wal_io_timing);
pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE);
written = pg_pwrite(openLogFile, from, nleft, startoffset);
@@ -3248,7 +3249,7 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
errmsg("could not create file \"%s\": %m", tmppath)));
/* Measure I/O timing when initializing segment */
- io_start = pgstat_prepare_io_time();
+ io_start = pgstat_prepare_io_time(track_wal_io_timing);
pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE);
save_errno = 0;
@@ -3310,7 +3311,7 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
}
/* Measure I/O timing when flushing segment */
- io_start = pgstat_prepare_io_time();
+ io_start = pgstat_prepare_io_time(track_wal_io_timing);
pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_SYNC);
if (pg_fsync(fd) != 0)
@@ -8744,7 +8745,7 @@ issue_xlog_fsync(int fd, XLogSegNo segno, TimeLineID tli)
/*
* Measure I/O timing to sync the WAL file for pg_stat_io.
*/
- start = pgstat_prepare_io_time();
+ start = pgstat_prepare_io_time(track_wal_io_timing);
pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC);
switch (wal_sync_method)
diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c
index 995e4f7c26f..5483d4f0dd2 100644
--- a/src/backend/access/transam/xlogreader.c
+++ b/src/backend/access/transam/xlogreader.c
@@ -1557,7 +1557,7 @@ WALRead(XLogReaderState *state,
#ifndef FRONTEND
/* Measure I/O timing when reading segment */
- io_start = pgstat_prepare_io_time();
+ io_start = pgstat_prepare_io_time(track_wal_io_timing);
pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
#endif
diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c
index 915cb330295..c5475aaa97a 100644
--- a/src/backend/access/transam/xlogrecovery.c
+++ b/src/backend/access/transam/xlogrecovery.c
@@ -3401,7 +3401,7 @@ retry:
readOff = targetPageOff;
/* Measure I/O timing when reading segment */
- io_start = pgstat_prepare_io_time();
+ io_start = pgstat_prepare_io_time(track_wal_io_timing);
pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff);
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 40e9efec312..7915ed624c1 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -1509,7 +1509,7 @@ WaitReadBuffers(ReadBuffersOperation *operation)
io_pages[io_buffers_len++] = BufferGetBlock(buffers[i]);
}
- io_start = pgstat_prepare_io_time();
+ io_start = pgstat_prepare_io_time(track_io_timing);
smgrreadv(operation->smgr, forknum, io_first_block, io_pages, io_buffers_len);
pgstat_count_io_op_time(io_object, io_context, IOOP_READ, io_start,
1, io_buffers_len * BLCKSZ);
@@ -2402,7 +2402,7 @@ ExtendBufferedRelShared(BufferManagerRelation bmr,
}
}
- io_start = pgstat_prepare_io_time();
+ io_start = pgstat_prepare_io_time(track_io_timing);
/*
* Note: if smgrzeroextend fails, we will end up with buffers that are
@@ -3857,7 +3857,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
*/
bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
- io_start = pgstat_prepare_io_time();
+ io_start = pgstat_prepare_io_time(track_io_timing);
/*
* bufToWrite is either the shared buffer or a copy, as appropriate.
@@ -4459,7 +4459,7 @@ FlushRelationBuffers(Relation rel)
PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
- io_start = pgstat_prepare_io_time();
+ io_start = pgstat_prepare_io_time(track_io_timing);
smgrwrite(srel,
BufTagGetForkNum(&bufHdr->tag),
@@ -5905,7 +5905,7 @@ IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context)
sort_pending_writebacks(wb_context->pending_writebacks,
wb_context->nr_pending);
- io_start = pgstat_prepare_io_time();
+ io_start = pgstat_prepare_io_time(track_io_timing);
/*
* Coalesce neighbouring writes, but nothing else. For that we iterate
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
index d2f8158d697..80b83444eb2 100644
--- a/src/backend/storage/buffer/localbuf.c
+++ b/src/backend/storage/buffer/localbuf.c
@@ -244,7 +244,7 @@ GetLocalVictimBuffer(void)
PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
- io_start = pgstat_prepare_io_time();
+ io_start = pgstat_prepare_io_time(track_io_timing);
/* And write... */
smgrwrite(oreln,
@@ -414,7 +414,7 @@ ExtendBufferedRelLocal(BufferManagerRelation bmr,
}
}
- io_start = pgstat_prepare_io_time();
+ io_start = pgstat_prepare_io_time(track_io_timing);
/* actually extend relation */
smgrzeroextend(bmr.smgr, fork, first_block, extend_by, false);
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 4994c97795c..f3220f98dc4 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -1391,7 +1391,7 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
ereport(DEBUG1,
(errmsg_internal("could not forward fsync request because request queue is full")));
- io_start = pgstat_prepare_io_time();
+ io_start = pgstat_prepare_io_time(track_io_timing);
if (FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) < 0)
ereport(data_sync_elevel(ERROR),
@@ -1790,7 +1790,7 @@ mdsyncfiletag(const FileTag *ftag, char *path)
need_to_close = true;
}
- io_start = pgstat_prepare_io_time();
+ io_start = pgstat_prepare_io_time(track_io_timing);
/* Sync the file. */
result = FileSync(file, WAIT_EVENT_DATA_FILE_SYNC);
diff --git a/src/backend/utils/activity/pgstat_backend.c b/src/backend/utils/activity/pgstat_backend.c
index 4a667e7019c..338da73a9a9 100644
--- a/src/backend/utils/activity/pgstat_backend.c
+++ b/src/backend/utils/activity/pgstat_backend.c
@@ -24,6 +24,7 @@
#include "postgres.h"
+#include "access/xlog.h"
#include "storage/bufmgr.h"
#include "utils/memutils.h"
#include "utils/pgstat_internal.h"
@@ -43,7 +44,7 @@ void
pgstat_count_backend_io_op_time(IOObject io_object, IOContext io_context,
IOOp io_op, instr_time io_time)
{
- Assert(track_io_timing);
+ Assert(track_io_timing || track_wal_io_timing);
if (!pgstat_tracks_backend_bktype(MyBackendType))
return;
diff --git a/src/backend/utils/activity/pgstat_io.c b/src/backend/utils/activity/pgstat_io.c
index e7362b52a37..ba11545a17f 100644
--- a/src/backend/utils/activity/pgstat_io.c
+++ b/src/backend/utils/activity/pgstat_io.c
@@ -83,20 +83,22 @@ pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op,
}
/*
- * Initialize the internal timing for an IO operation.
+ * Initialize the internal timing for an IO operation, depending on an
+ * IO timing GUC.
*/
instr_time
-pgstat_prepare_io_time(void)
+pgstat_prepare_io_time(bool track_io_guc)
{
instr_time io_start;
- if (track_io_timing)
+ if (track_io_guc)
INSTR_TIME_SET_CURRENT(io_start);
else
{
/*
- * There is no need to set io_start when an IO timing GUC is disabled,
- * still initialize it to zero to avoid compiler warnings.
+ * There is no need to set io_start when an IO timing GUC is disabled.
+ * Initialize it to zero to avoid compiler warnings and to let
+ * pgstat_count_io_op_time() know that timings should be ignored.
*/
INSTR_TIME_SET_ZERO(io_start);
}
@@ -119,7 +121,7 @@ void
pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op,
instr_time start_time, uint32 cnt, uint64 bytes)
{
- if (track_io_timing)
+ if (!INSTR_TIME_IS_ZERO(start_time))
{
instr_time io_time;
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index 690bf96ef03..ad25cbb39c5 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -1501,6 +1501,15 @@ struct config_bool ConfigureNamesBool[] =
false,
NULL, NULL, NULL
},
+ {
+ {"track_wal_io_timing", PGC_SUSET, STATS_CUMULATIVE,
+ gettext_noop("Collects timing statistics for WAL I/O activity."),
+ NULL
+ },
+ &track_wal_io_timing,
+ false,
+ NULL, NULL, NULL
+ },
{
{"update_process_title", PGC_SUSET, PROCESS_TITLE,
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index e771d87da1f..5362ff80519 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -641,6 +641,7 @@
#track_counts = on
#track_cost_delay_timing = off
#track_io_timing = off
+#track_wal_io_timing = off
#track_functions = none # none, pl, all
#stats_fetch_consistency = cache # cache, none, snapshot
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 0a50d160bba..d313099c027 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -54,6 +54,7 @@ extern PGDLLIMPORT char *wal_consistency_checking_string;
extern PGDLLIMPORT bool log_checkpoints;
extern PGDLLIMPORT int CommitDelay;
extern PGDLLIMPORT int CommitSiblings;
+extern PGDLLIMPORT bool track_wal_io_timing;
extern PGDLLIMPORT int wal_decode_buffer_size;
extern PGDLLIMPORT int CheckPointSegments;
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index fc651d03cf9..e64c527265b 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -564,7 +564,7 @@ extern bool pgstat_bktype_io_stats_valid(PgStat_BktypeIO *backend_io,
BackendType bktype);
extern void pgstat_count_io_op(IOObject io_object, IOContext io_context,
IOOp io_op, uint32 cnt, uint64 bytes);
-extern instr_time pgstat_prepare_io_time(void);
+extern instr_time pgstat_prepare_io_time(bool track_io_guc);
extern void pgstat_count_io_op_time(IOObject io_object, IOContext io_context,
IOOp io_op, instr_time start_time,
uint32 cnt, uint64 bytes);