diff options
-rw-r--r-- | doc/src/sgml/monitoring.sgml | 321 | ||||
-rw-r--r-- | src/backend/catalog/system_views.sql | 15 | ||||
-rw-r--r-- | src/backend/utils/adt/pgstatfuncs.c | 141 | ||||
-rw-r--r-- | src/include/catalog/catversion.h | 2 | ||||
-rw-r--r-- | src/include/catalog/pg_proc.dat | 9 | ||||
-rw-r--r-- | src/test/regress/expected/rules.out | 12 | ||||
-rw-r--r-- | src/tools/pgindent/typedefs.list | 1 |
7 files changed, 486 insertions, 15 deletions
diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index b246ddc6341..dca50707ad4 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -470,6 +470,16 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser </row> <row> + <entry><structname>pg_stat_io</structname><indexterm><primary>pg_stat_io</primary></indexterm></entry> + <entry> + One row for each combination of backend type, context, and target object + containing cluster-wide I/O statistics. + See <link linkend="monitoring-pg-stat-io-view"> + <structname>pg_stat_io</structname></link> for details. + </entry> + </row> + + <row> <entry><structname>pg_stat_replication_slots</structname><indexterm><primary>pg_stat_replication_slots</primary></indexterm></entry> <entry>One row per replication slot, showing statistics about the replication slot's usage. See @@ -665,20 +675,16 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser </para> <para> - The <structname>pg_statio_</structname> views are primarily useful to - determine the effectiveness of the buffer cache. When the number - of actual disk reads is much smaller than the number of buffer - hits, then the cache is satisfying most read requests without - invoking a kernel call. However, these statistics do not give the - entire story: due to the way in which <productname>PostgreSQL</productname> - handles disk I/O, data that is not in the - <productname>PostgreSQL</productname> buffer cache might still reside in the - kernel's I/O cache, and might therefore still be fetched without - requiring a physical read. Users interested in obtaining more - detailed information on <productname>PostgreSQL</productname> I/O behavior are - advised to use the <productname>PostgreSQL</productname> statistics views - in combination with operating system utilities that allow insight - into the kernel's handling of I/O. + The <structname>pg_stat_io</structname> and + <structname>pg_statio_</structname> set of views are useful for determining + the effectiveness of the buffer cache. They can be used to calculate a cache + hit ratio. Note that while <productname>PostgreSQL</productname>'s I/O + statistics capture most instances in which the kernel was invoked in order + to perform I/O, they do not differentiate between data which had to be + fetched from disk and that which already resided in the kernel page cache. + Users are advised to use the <productname>PostgreSQL</productname> + statistics views in combination with operating system utilities for a more + complete picture of their database's I/O performance. </para> </sect2> @@ -3669,6 +3675,293 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i <structfield>last_archived_wal</structfield> have also been successfully archived. </para> + </sect2> + + <sect2 id="monitoring-pg-stat-io-view"> + <title><structname>pg_stat_io</structname></title> + + <indexterm> + <primary>pg_stat_io</primary> + </indexterm> + + <para> + The <structname>pg_stat_io</structname> view will contain one row for each + combination of backend type, target I/O object, and I/O context, showing + cluster-wide I/O statistics. Combinations which do not make sense are + omitted. + </para> + + <para> + Currently, I/O on relations (e.g. tables, indexes) is tracked. However, + relation I/O which bypasses shared buffers (e.g. when moving a table from one + tablespace to another) is currently not tracked. + </para> + + <table id="pg-stat-io-view" xreflabel="pg_stat_io"> + <title><structname>pg_stat_io</structname> View</title> + <tgroup cols="1"> + <thead> + <row> + <entry role="catalog_table_entry"> + <para role="column_definition"> + Column Type + </para> + <para> + Description + </para> + </entry> + </row> + </thead> + <tbody> + <row> + <entry role="catalog_table_entry"> + <para role="column_definition"> + <structfield>backend_type</structfield> <type>text</type> + </para> + <para> + Type of backend (e.g. background worker, autovacuum worker). See <link + linkend="monitoring-pg-stat-activity-view"> + <structname>pg_stat_activity</structname></link> for more information + on <varname>backend_type</varname>s. Some + <varname>backend_type</varname>s do not accumulate I/O operation + statistics and will not be included in the view. + </para> + </entry> + </row> + + <row> + <entry role="catalog_table_entry"> + <para role="column_definition"> + <structfield>io_object</structfield> <type>text</type> + </para> + <para> + Target object of an I/O operation. Possible values are: + <itemizedlist> + <listitem> + <para> + <literal>relation</literal>: Permanent relations. + </para> + </listitem> + <listitem> + <para> + <literal>temp relation</literal>: Temporary relations. + </para> + </listitem> + </itemizedlist> + </para> + </entry> + </row> + + <row> + <entry role="catalog_table_entry"> + <para role="column_definition"> + <structfield>io_context</structfield> <type>text</type> + </para> + <para> + The context of an I/O operation. Possible values are: + </para> + <itemizedlist> + <listitem> + <para> + <literal>normal</literal>: The default or standard + <varname>io_context</varname> for a type of I/O operation. For + example, by default, relation data is read into and written out from + shared buffers. Thus, reads and writes of relation data to and from + shared buffers are tracked in <varname>io_context</varname> + <literal>normal</literal>. + </para> + </listitem> + <listitem> + <para> + <literal>vacuum</literal>: I/O operations performed outside of shared + buffers while vacuuming and analyzing permanent relations. Temporary + table vacuums use the same local buffer pool as other temporary table + IO operations and are tracked in <varname>io_context</varname> + <literal>normal</literal>. + </para> + </listitem> + <listitem> + <para> + <literal>bulkread</literal>: Certain large read I/O operations + done outside of shared buffers, for example, a sequential scan of a + large table. + </para> + </listitem> + <listitem> + <para> + <literal>bulkwrite</literal>: Certain large write I/O operations + done outside of shared buffers, such as <command>COPY</command>. + </para> + </listitem> + </itemizedlist> + </entry> + </row> + + <row> + <entry role="catalog_table_entry"> + <para role="column_definition"> + <structfield>reads</structfield> <type>bigint</type> + </para> + <para> + Number of read operations, each of the size specified in + <varname>op_bytes</varname>. + </para> + </entry> + </row> + + <row> + <entry role="catalog_table_entry"> + <para role="column_definition"> + <structfield>writes</structfield> <type>bigint</type> + </para> + <para> + Number of write operations, each of the size specified in + <varname>op_bytes</varname>. + </para> + </entry> + </row> + + <row> + <entry role="catalog_table_entry"> + <para role="column_definition"> + <structfield>extends</structfield> <type>bigint</type> + </para> + <para> + Number of relation extend operations, each of the size specified in + <varname>op_bytes</varname>. + </para> + </entry> + </row> + + <row> + <entry role="catalog_table_entry"> + <para role="column_definition"> + <structfield>op_bytes</structfield> <type>bigint</type> + </para> + <para> + The number of bytes per unit of I/O read, written, or extended. + </para> + <para> + Relation data reads, writes, and extends are done in + <varname>block_size</varname> units, derived from the build-time + parameter <symbol>BLCKSZ</symbol>, which is <literal>8192</literal> by + default. + </para> + </entry> + </row> + + <row> + <entry role="catalog_table_entry"> + <para role="column_definition"> + <structfield>evictions</structfield> <type>bigint</type> + </para> + <para> + Number of times a block has been written out from a shared or local + buffer in order to make it available for another use. + </para> + <para> + In <varname>io_context</varname> <literal>normal</literal>, this counts + the number of times a block was evicted from a buffer and replaced with + another block. In <varname>io_context</varname>s + <literal>bulkwrite</literal>, <literal>bulkread</literal>, and + <literal>vacuum</literal>, this counts the number of times a block was + evicted from shared buffers in order to add the shared buffer to a + separate, size-limited ring buffer for use in a bulk I/O operation. + </para> + </entry> + </row> + + <row> + <entry role="catalog_table_entry"> + <para role="column_definition"> + <structfield>reuses</structfield> <type>bigint</type> + </para> + <para> + The number of times an existing buffer in a size-limited ring buffer + outside of shared buffers was reused as part of an I/O operation in the + <literal>bulkread</literal>, <literal>bulkwrite</literal>, or + <literal>vacuum</literal> <varname>io_context</varname>s. + </para> + </entry> + </row> + + <row> + <entry role="catalog_table_entry"> + <para role="column_definition"> + <structfield>fsyncs</structfield> <type>bigint</type> + </para> + <para> + Number of <literal>fsync</literal> calls. These are only tracked in + <varname>io_context</varname> <literal>normal</literal>. + </para> + </entry> + </row> + + <row> + <entry role="catalog_table_entry"> + <para role="column_definition"> + <structfield>stats_reset</structfield> <type>timestamp with time zone</type> + </para> + <para> + Time at which these statistics were last reset. + </para> + </entry> + </row> + </tbody> + </tgroup> + </table> + + <para> + Some backend types never perform I/O operations on some I/O objects and/or + in some I/O contexts. These rows are omitted from the view. For example, the + checkpointer does not checkpoint temporary tables, so there will be no rows + for <varname>backend_type</varname> <literal>checkpointer</literal> and + <varname>io_object</varname> <literal>temp relation</literal>. + </para> + + <para> + In addition, some I/O operations will never be performed either by certain + backend types or on certain I/O objects and/or in certain I/O contexts. + These cells will be NULL. For example, temporary tables are not + <literal>fsync</literal>ed, so <varname>fsyncs</varname> will be NULL for + <varname>io_object</varname> <literal>temp relation</literal>. Also, the + background writer does not perform reads, so <varname>reads</varname> will + be NULL in rows for <varname>backend_type</varname> <literal>background + writer</literal>. + </para> + + <para> + <structname>pg_stat_io</structname> can be used to inform database tuning. + For example: + <itemizedlist> + <listitem> + <para> + A high <varname>evictions</varname> count can indicate that shared + buffers should be increased. + </para> + </listitem> + <listitem> + <para> + Client backends rely on the checkpointer to ensure data is persisted to + permanent storage. Large numbers of <varname>fsyncs</varname> by + <literal>client backend</literal>s could indicate a misconfiguration of + shared buffers or of the checkpointer. More information on configuring + the checkpointer can be found in <xref linkend="wal-configuration"/>. + </para> + </listitem> + <listitem> + <para> + Normally, client backends should be able to rely on auxiliary processes + like the checkpointer and the background writer to write out dirty data + as much as possible. Large numbers of writes by client backends could + indicate a misconfiguration of shared buffers or of the checkpointer. + More information on configuring the checkpointer can be found in <xref + linkend="wal-configuration"/>. + </para> + </listitem> + </itemizedlist> + </para> + </sect2> diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 8608e3fa5b1..34ca0e739f5 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -1117,6 +1117,21 @@ CREATE VIEW pg_stat_bgwriter AS pg_stat_get_buf_alloc() AS buffers_alloc, pg_stat_get_bgwriter_stat_reset_time() AS stats_reset; +CREATE VIEW pg_stat_io AS +SELECT + b.backend_type, + b.io_object, + b.io_context, + b.reads, + b.writes, + b.extends, + b.op_bytes, + b.evictions, + b.reuses, + b.fsyncs, + b.stats_reset +FROM pg_stat_get_io() b; + CREATE VIEW pg_stat_wal AS SELECT w.wal_records, diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c index 924698e6ae4..9d707c35216 100644 --- a/src/backend/utils/adt/pgstatfuncs.c +++ b/src/backend/utils/adt/pgstatfuncs.c @@ -1246,6 +1246,147 @@ pg_stat_get_buf_alloc(PG_FUNCTION_ARGS) } /* +* When adding a new column to the pg_stat_io view, add a new enum value +* here above IO_NUM_COLUMNS. +*/ +typedef enum io_stat_col +{ + IO_COL_BACKEND_TYPE, + IO_COL_IO_OBJECT, + IO_COL_IO_CONTEXT, + IO_COL_READS, + IO_COL_WRITES, + IO_COL_EXTENDS, + IO_COL_CONVERSION, + IO_COL_EVICTIONS, + IO_COL_REUSES, + IO_COL_FSYNCS, + IO_COL_RESET_TIME, + IO_NUM_COLUMNS, +} io_stat_col; + +/* + * When adding a new IOOp, add a new io_stat_col and add a case to this + * function returning the corresponding io_stat_col. + */ +static io_stat_col +pgstat_get_io_op_index(IOOp io_op) +{ + switch (io_op) + { + case IOOP_EVICT: + return IO_COL_EVICTIONS; + case IOOP_READ: + return IO_COL_READS; + case IOOP_REUSE: + return IO_COL_REUSES; + case IOOP_WRITE: + return IO_COL_WRITES; + case IOOP_EXTEND: + return IO_COL_EXTENDS; + case IOOP_FSYNC: + return IO_COL_FSYNCS; + } + + elog(ERROR, "unrecognized IOOp value: %d", io_op); + pg_unreachable(); +} + +Datum +pg_stat_get_io(PG_FUNCTION_ARGS) +{ + ReturnSetInfo *rsinfo; + PgStat_IO *backends_io_stats; + Datum reset_time; + + InitMaterializedSRF(fcinfo, 0); + rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + + backends_io_stats = pgstat_fetch_stat_io(); + + reset_time = TimestampTzGetDatum(backends_io_stats->stat_reset_timestamp); + + for (BackendType bktype = B_INVALID; bktype < BACKEND_NUM_TYPES; bktype++) + { + Datum bktype_desc = CStringGetTextDatum(GetBackendTypeDesc(bktype)); + PgStat_BktypeIO *bktype_stats = &backends_io_stats->stats[bktype]; + + /* + * In Assert builds, we can afford an extra loop through all of the + * counters checking that only expected stats are non-zero, since it + * keeps the non-Assert code cleaner. + */ + Assert(pgstat_bktype_io_stats_valid(bktype_stats, bktype)); + + /* + * For those BackendTypes without IO Operation stats, skip + * representing them in the view altogether. + */ + if (!pgstat_tracks_io_bktype(bktype)) + continue; + + for (IOObject io_obj = IOOBJECT_FIRST; + io_obj < IOOBJECT_NUM_TYPES; io_obj++) + { + const char *obj_name = pgstat_get_io_object_name(io_obj); + + for (IOContext io_context = IOCONTEXT_FIRST; + io_context < IOCONTEXT_NUM_TYPES; io_context++) + { + const char *context_name = pgstat_get_io_context_name(io_context); + + Datum values[IO_NUM_COLUMNS] = {0}; + bool nulls[IO_NUM_COLUMNS] = {0}; + + /* + * Some combinations of BackendType, IOObject, and IOContext + * are not valid for any type of IOOp. In such cases, omit the + * entire row from the view. + */ + if (!pgstat_tracks_io_object(bktype, io_obj, io_context)) + continue; + + values[IO_COL_BACKEND_TYPE] = bktype_desc; + values[IO_COL_IO_CONTEXT] = CStringGetTextDatum(context_name); + values[IO_COL_IO_OBJECT] = CStringGetTextDatum(obj_name); + values[IO_COL_RESET_TIME] = TimestampTzGetDatum(reset_time); + + /* + * Hard-code this to the value of BLCKSZ for now. Future + * values could include XLOG_BLCKSZ, once WAL IO is tracked, + * and constant multipliers, once non-block-oriented IO (e.g. + * temporary file IO) is tracked. + */ + values[IO_COL_CONVERSION] = Int64GetDatum(BLCKSZ); + + for (IOOp io_op = IOOP_FIRST; io_op < IOOP_NUM_TYPES; io_op++) + { + int col_idx = pgstat_get_io_op_index(io_op); + + /* + * Some combinations of BackendType and IOOp, of IOContext + * and IOOp, and of IOObject and IOOp are not tracked. Set + * these cells in the view NULL. + */ + nulls[col_idx] = !pgstat_tracks_io_op(bktype, io_obj, io_context, io_op); + + if (nulls[col_idx]) + continue; + + values[col_idx] = + Int64GetDatum(bktype_stats->data[io_obj][io_context][io_op]); + } + + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, + values, nulls); + } + } + } + + return (Datum) 0; +} + +/* * Returns statistics of WAL activity */ Datum diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index c1ce0b76e14..9c298cb1253 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -57,6 +57,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 202301301 +#define CATALOG_VERSION_NO 202302111 #endif diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index c0f2a8a77c8..66b73c3900d 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -5717,6 +5717,15 @@ proname => 'pg_stat_get_buf_alloc', provolatile => 's', proparallel => 'r', prorettype => 'int8', proargtypes => '', prosrc => 'pg_stat_get_buf_alloc' }, +{ oid => '8459', descr => 'statistics: per backend type IO statistics', + proname => 'pg_stat_get_io', provolatile => 'v', + prorows => '30', proretset => 't', + proparallel => 'r', prorettype => 'record', proargtypes => '', + proallargtypes => '{text,text,text,int8,int8,int8,int8,int8,int8,int8,timestamptz}', + proargmodes => '{o,o,o,o,o,o,o,o,o,o,o}', + proargnames => '{backend_type,io_object,io_context,reads,writes,extends,op_bytes,evictions,reuses,fsyncs,stats_reset}', + prosrc => 'pg_stat_get_io' }, + { oid => '1136', descr => 'statistics: information about WAL activity', proname => 'pg_stat_get_wal', proisstrict => 'f', provolatile => 's', proparallel => 'r', prorettype => 'record', proargtypes => '', diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index e7a2f5856aa..174b725fff1 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -1876,6 +1876,18 @@ pg_stat_gssapi| SELECT pid, gss_enc AS encrypted FROM pg_stat_get_activity(NULL::integer) s(datid, pid, usesysid, application_name, state, query, wait_event_type, wait_event, xact_start, query_start, backend_start, state_change, client_addr, client_hostname, client_port, backend_xid, backend_xmin, backend_type, ssl, sslversion, sslcipher, sslbits, ssl_client_dn, ssl_client_serial, ssl_issuer_dn, gss_auth, gss_princ, gss_enc, leader_pid, query_id) WHERE (client_port IS NOT NULL); +pg_stat_io| SELECT backend_type, + io_object, + io_context, + reads, + writes, + extends, + op_bytes, + evictions, + reuses, + fsyncs, + stats_reset + FROM pg_stat_get_io() b(backend_type, io_object, io_context, reads, writes, extends, op_bytes, evictions, reuses, fsyncs, stats_reset); pg_stat_progress_analyze| SELECT s.pid, s.datid, d.datname, diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 36d1dc01177..45fc5759ced 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -3378,6 +3378,7 @@ intset_internal_node intset_leaf_node intset_node intvKEY +io_stat_col itemIdCompact itemIdCompactData iterator |