diff options
-rw-r--r-- | doc/src/sgml/catalogs.sgml | 432 | ||||
-rw-r--r-- | doc/src/sgml/perform.sgml | 214 | ||||
-rw-r--r-- | doc/src/sgml/planstats.sgml | 42 | ||||
-rw-r--r-- | doc/src/sgml/ref/create_statistics.sgml | 43 |
4 files changed, 397 insertions, 334 deletions
diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml index aa5e705e574..b2fae027f5f 100644 --- a/doc/src/sgml/catalogs.sgml +++ b/doc/src/sgml/catalogs.sgml @@ -221,13 +221,13 @@ </row> <row> - <entry><link linkend="catalog-pg-pltemplate"><structname>pg_pltemplate</structname></link></entry> - <entry>template data for procedural languages</entry> + <entry><link linkend="catalog-pg-partitioned-table"><structname>pg_partitioned_table</structname></link></entry> + <entry>information about partition key of tables</entry> </row> <row> - <entry><link linkend="catalog-pg-partitioned-table"><structname>pg_partitioned_table</structname></link></entry> - <entry>information about partition key of tables</entry> + <entry><link linkend="catalog-pg-pltemplate"><structname>pg_pltemplate</structname></link></entry> + <entry>template data for procedural languages</entry> </row> <row> @@ -4271,108 +4271,6 @@ SCRAM-SHA-256$<replaceable><iteration count></>:<replaceable><salt>< </table> </sect1> - <sect1 id="catalog-pg-statistic-ext"> - <title><structname>pg_statistic_ext</structname></title> - - <indexterm zone="catalog-pg-statistic-ext"> - <primary>pg_statistic_ext</primary> - </indexterm> - - <para> - The catalog <structname>pg_statistic_ext</structname> - holds extended planner statistics. - </para> - - <table> - <title><structname>pg_statistic_ext</> Columns</title> - - <tgroup cols="4"> - <thead> - <row> - <entry>Name</entry> - <entry>Type</entry> - <entry>References</entry> - <entry>Description</entry> - </row> - </thead> - - <tbody> - - <row> - <entry><structfield>stxrelid</structfield></entry> - <entry><type>oid</type></entry> - <entry><literal><link linkend="catalog-pg-class"><structname>pg_class</structname></link>.oid</literal></entry> - <entry>The table that the described columns belongs to</entry> - </row> - - <row> - <entry><structfield>stxname</structfield></entry> - <entry><type>name</type></entry> - <entry></entry> - <entry>Name of the statistic.</entry> - </row> - - <row> - <entry><structfield>stxnamespace</structfield></entry> - <entry><type>oid</type></entry> - <entry><literal><link linkend="catalog-pg-namespace"><structname>pg_namespace</structname></link>.oid</literal></entry> - <entry> - The OID of the namespace that contains this statistic - </entry> - </row> - - <row> - <entry><structfield>stxowner</structfield></entry> - <entry><type>oid</type></entry> - <entry><literal><link linkend="catalog-pg-authid"><structname>pg_authid</structname></link>.oid</literal></entry> - <entry>Owner of the statistic</entry> - </row> - - <row> - <entry><structfield>stxkeys</structfield></entry> - <entry><type>int2vector</type></entry> - <entry><literal><link linkend="catalog-pg-attribute"><structname>pg_attribute</structname></link>.attnum</literal></entry> - <entry> - This is an array of values that indicate which table columns this - statistic covers. For example a value of <literal>1 3</literal> would - mean that the first and the third table columns make up the statistic key. - </entry> - </row> - - <row> - <entry><structfield>stxkind</structfield></entry> - <entry><type>char[]</type></entry> - <entry></entry> - <entry> - An array with the modes of the enabled statistic types. Valid values - are: - <literal>d</literal> for ndistinct coefficients, - <literal>f</literal> for functional dependencies. - </entry> - </row> - - <row> - <entry><structfield>stxndistinct</structfield></entry> - <entry><type>pg_ndistinct</type></entry> - <entry></entry> - <entry> - N-distinct coefficients, serialized as <structname>pg_ndistinct</> type. - </entry> - </row> - - <row> - <entry><structfield>stxdependencies</structfield></entry> - <entry><type>pg_dependencies</type></entry> - <entry></entry> - <entry> - Functional dependencies, serialized as <structname>pg_dependencies</> type. - </entry> - </row> - - </tbody> - </tgroup> - </table> - </sect1> <sect1 id="catalog-pg-namespace"> <title><structname>pg_namespace</structname></title> @@ -4790,6 +4688,111 @@ SCRAM-SHA-256$<replaceable><iteration count></>:<replaceable><salt>< </sect1> + <sect1 id="catalog-pg-partitioned-table"> + <title><structname>pg_partitioned_table</structname></title> + + <indexterm zone="catalog-pg-partitioned-table"> + <primary>pg_partitioned_table</primary> + </indexterm> + + <para> + The catalog <structname>pg_partitioned_table</structname> stores + information about how tables are partitioned. + </para> + + <table> + <title><structname>pg_partitioned_table</> Columns</title> + + <tgroup cols="4"> + <thead> + <row> + <entry>Name</entry> + <entry>Type</entry> + <entry>References</entry> + <entry>Description</entry> + </row> + </thead> + + <tbody> + + <row> + <entry><structfield>partrelid</structfield></entry> + <entry><type>oid</type></entry> + <entry><literal><link linkend="catalog-pg-class"><structname>pg_class</structname></link>.oid</literal></entry> + <entry>The OID of the <structname>pg_class</> entry for this partitioned table</entry> + </row> + + <row> + <entry><structfield>partstrat</structfield></entry> + <entry><type>char</type></entry> + <entry></entry> + <entry> + Partitioning strategy; <literal>l</> = list partitioned table, + <literal>r</> = range partitioned table + </entry> + </row> + + <row> + <entry><structfield>partnatts</structfield></entry> + <entry><type>int2</type></entry> + <entry></entry> + <entry>The number of columns in partition key</entry> + </row> + + <row> + <entry><structfield>partattrs</structfield></entry> + <entry><type>int2vector</type></entry> + <entry><literal><link linkend="catalog-pg-attribute"><structname>pg_attribute</structname></link>.attnum</literal></entry> + <entry> + This is an array of <structfield>partnatts</structfield> values that + indicate which table columns are part of the partition key. For + example, a value of <literal>1 3</literal> would mean that the first + and the third table columns make up the partition key. A zero in this + array indicates that the corresponding partition key column is an + expression, rather than a simple column reference. + </entry> + </row> + + <row> + <entry><structfield>partclass</structfield></entry> + <entry><type>oidvector</type></entry> + <entry><literal><link linkend="catalog-pg-opclass"><structname>pg_opclass</structname></link>.oid</literal></entry> + <entry> + For each column in the partition key, this contains the OID of the + operator class to use. See + <link linkend="catalog-pg-opclass"><structname>pg_opclass</structname></link> for details. + </entry> + </row> + + <row> + <entry><structfield>partcollation</structfield></entry> + <entry><type>oidvector</type></entry> + <entry><literal><link linkend="catalog-pg-opclass"><structname>pg_opclass</structname></link>.oid</literal></entry> + <entry> + For each column in the partition key, this contains the OID of the + the collation to use for partitioning. + </entry> + </row> + + <row> + <entry><structfield>partexprs</structfield></entry> + <entry><type>pg_node_tree</type></entry> + <entry></entry> + <entry> + Expression trees (in <function>nodeToString()</function> + representation) for partition key columns that are not simple column + references. This is a list with one element for each zero + entry in <structfield>partattrs</>. Null if all partition key columns + are simple references. + </entry> + </row> + + </tbody> + </tgroup> + </table> + </sect1> + + <sect1 id="catalog-pg-pltemplate"> <title><structname>pg_pltemplate</structname></title> @@ -4896,109 +4899,6 @@ SCRAM-SHA-256$<replaceable><iteration count></>:<replaceable><salt>< </sect1> - <sect1 id="catalog-pg-partitioned-table"> - <title><structname>pg_partitioned_table</structname></title> - - <indexterm zone="catalog-pg-partitioned-table"> - <primary>pg_partitioned_table</primary> - </indexterm> - - <para> - The catalog <structname>pg_partitioned_table</structname> stores - information about how tables are partitioned. - </para> - - <table> - <title><structname>pg_partitioned_table</> Columns</title> - - <tgroup cols="4"> - <thead> - <row> - <entry>Name</entry> - <entry>Type</entry> - <entry>References</entry> - <entry>Description</entry> - </row> - </thead> - - <tbody> - - <row> - <entry><structfield>partrelid</structfield></entry> - <entry><type>oid</type></entry> - <entry><literal><link linkend="catalog-pg-class"><structname>pg_class</structname></link>.oid</literal></entry> - <entry>The OID of the <structname>pg_class</> entry for this partitioned table</entry> - </row> - - <row> - <entry><structfield>partstrat</structfield></entry> - <entry><type>char</type></entry> - <entry></entry> - <entry> - Partitioning strategy; <literal>l</> = list partitioned table, - <literal>r</> = range partitioned table - </entry> - </row> - - <row> - <entry><structfield>partnatts</structfield></entry> - <entry><type>int2</type></entry> - <entry></entry> - <entry>The number of columns in partition key</entry> - </row> - - <row> - <entry><structfield>partattrs</structfield></entry> - <entry><type>int2vector</type></entry> - <entry><literal><link linkend="catalog-pg-attribute"><structname>pg_attribute</structname></link>.attnum</literal></entry> - <entry> - This is an array of <structfield>partnatts</structfield> values that - indicate which table columns are part of the partition key. For - example, a value of <literal>1 3</literal> would mean that the first - and the third table columns make up the partition key. A zero in this - array indicates that the corresponding partition key column is an - expression, rather than a simple column reference. - </entry> - </row> - - <row> - <entry><structfield>partclass</structfield></entry> - <entry><type>oidvector</type></entry> - <entry><literal><link linkend="catalog-pg-opclass"><structname>pg_opclass</structname></link>.oid</literal></entry> - <entry> - For each column in the partition key, this contains the OID of the - operator class to use. See - <link linkend="catalog-pg-opclass"><structname>pg_opclass</structname></link> for details. - </entry> - </row> - - <row> - <entry><structfield>partcollation</structfield></entry> - <entry><type>oidvector</type></entry> - <entry><literal><link linkend="catalog-pg-opclass"><structname>pg_opclass</structname></link>.oid</literal></entry> - <entry> - For each column in the partition key, this contains the OID of the - the collation to use for partitioning. - </entry> - </row> - - <row> - <entry><structfield>partexprs</structfield></entry> - <entry><type>pg_node_tree</type></entry> - <entry></entry> - <entry> - Expression trees (in <function>nodeToString()</function> - representation) for partition key columns that are not simple column - references. This is a list with one element for each zero - entry in <structfield>partattrs</>. Null if all partition key columns - are simple references. - </entry> - </row> - - </tbody> - </tgroup> - </table> - </sect1> <sect1 id="catalog-pg-policy"> <title><structname>pg_policy</structname></title> @@ -6466,6 +6366,120 @@ SCRAM-SHA-256$<replaceable><iteration count></>:<replaceable><salt>< </sect1> + <sect1 id="catalog-pg-statistic-ext"> + <title><structname>pg_statistic_ext</structname></title> + + <indexterm zone="catalog-pg-statistic-ext"> + <primary>pg_statistic_ext</primary> + </indexterm> + + <para> + The catalog <structname>pg_statistic_ext</structname> + holds extended planner statistics. + Each row in this catalog corresponds to a <firstterm>statistics object</> + created with <xref linkend="sql-createstatistics">. + </para> + + <table> + <title><structname>pg_statistic_ext</> Columns</title> + + <tgroup cols="4"> + <thead> + <row> + <entry>Name</entry> + <entry>Type</entry> + <entry>References</entry> + <entry>Description</entry> + </row> + </thead> + + <tbody> + + <row> + <entry><structfield>stxrelid</structfield></entry> + <entry><type>oid</type></entry> + <entry><literal><link linkend="catalog-pg-class"><structname>pg_class</structname></link>.oid</literal></entry> + <entry>Table containing the columns described by this object</entry> + </row> + + <row> + <entry><structfield>stxname</structfield></entry> + <entry><type>name</type></entry> + <entry></entry> + <entry>Name of the statistics object</entry> + </row> + + <row> + <entry><structfield>stxnamespace</structfield></entry> + <entry><type>oid</type></entry> + <entry><literal><link linkend="catalog-pg-namespace"><structname>pg_namespace</structname></link>.oid</literal></entry> + <entry> + The OID of the namespace that contains this statistics object + </entry> + </row> + + <row> + <entry><structfield>stxowner</structfield></entry> + <entry><type>oid</type></entry> + <entry><literal><link linkend="catalog-pg-authid"><structname>pg_authid</structname></link>.oid</literal></entry> + <entry>Owner of the statistics object</entry> + </row> + + <row> + <entry><structfield>stxkeys</structfield></entry> + <entry><type>int2vector</type></entry> + <entry><literal><link linkend="catalog-pg-attribute"><structname>pg_attribute</structname></link>.attnum</literal></entry> + <entry> + An array of attribute numbers, indicating which table columns are + covered by this statistics object; + for example a value of <literal>1 3</literal> would + mean that the first and the third table columns are covered + </entry> + </row> + + <row> + <entry><structfield>stxkind</structfield></entry> + <entry><type>char[]</type></entry> + <entry></entry> + <entry> + An array containing codes for the enabled statistic types; + valid values are: + <literal>d</literal> for n-distinct statistics, + <literal>f</literal> for functional dependency statistics + </entry> + </row> + + <row> + <entry><structfield>stxndistinct</structfield></entry> + <entry><type>pg_ndistinct</type></entry> + <entry></entry> + <entry> + N-distinct counts, serialized as <structname>pg_ndistinct</> type + </entry> + </row> + + <row> + <entry><structfield>stxdependencies</structfield></entry> + <entry><type>pg_dependencies</type></entry> + <entry></entry> + <entry> + Functional dependency statistics, serialized + as <structname>pg_dependencies</> type + </entry> + </row> + + </tbody> + </tgroup> + </table> + + <para> + The <structfield>stxkind</structfield> field is filled at creation of the + statistics object, indicating which statistic type(s) are desired. + The fields after it are initially NULL and are filled only when the + corresponding statistic has been computed by <command>ANALYZE</>. + </para> + </sect1> + <sect1 id="catalog-pg-subscription"> <title><structname>pg_subscription</structname></title> diff --git a/doc/src/sgml/perform.sgml b/doc/src/sgml/perform.sgml index 32e17ee5f8e..b4b8f8dcb8f 100644 --- a/doc/src/sgml/perform.sgml +++ b/doc/src/sgml/perform.sgml @@ -1071,26 +1071,42 @@ WHERE tablename = 'road'; are independent of each other, an assumption that does not hold when column values are correlated. Regular statistics, because of their per-individual-column nature, - do not capture the knowledge of cross-column correlation; - <firstterm>multivariate statistics</firstterm> can be used to instruct - the server to obtain statistics across such a set of columns, - which are later used by the query optimizer - to determine cardinality and selectivity - of clauses involving those columns. - Multivariate statistics are currently the only use of - <firstterm>extended statistics</firstterm>. + cannot capture any knowledge about cross-column correlation. + However, <productname>PostgreSQL</> has the ability to compute + <firstterm>multivariate statistics</firstterm>, which can capture + such information. </para> <para> - Extended statistics are created using + Because the number of possible column combinations is very large, + it's impractical to compute multivariate statistics automatically. + Instead, <firstterm>extended statistics objects</firstterm>, more often + called just <firstterm>statistics objects</>, can be created to instruct + the server to obtain statistics across interesting sets of columns. + </para> + + <para> + Statistics objects are created using <xref linkend="sql-createstatistics">, which see for more details. - Data collection is deferred until the next <command>ANALYZE</command> - on the table, after which the stored values can be examined in the + Creation of such an object merely creates a catalog entry expressing + interest in the statistics. Actual data collection is performed + by <command>ANALYZE</command> (either a manual command, or background + auto-analyze). The collected values can be examined in the <link linkend="catalog-pg-statistic-ext"><structname>pg_statistic_ext</structname></link> catalog. </para> <para> + <command>ANALYZE</command> computes extended statistics based on the same + sample of table rows that it takes for computing regular single-column + statistics. Since the sample size is increased by increasing the + statistics target for the table or any of its columns (as described in + the previous section), a larger statistics target will normally result in + more accurate extended statistics, as well as more time spent calculating + them. + </para> + + <para> The following subsections describe the types of extended statistics that are currently supported. </para> @@ -1099,142 +1115,162 @@ WHERE tablename = 'road'; <title>Functional Dependencies</title> <para> - The simplest type of extended statistics are functional dependencies, - a concept used in definitions of database normal forms. - Put simply, it is said that column <literal>b</> is functionally - dependent on column <literal>a</> if knowledge of the value of - <literal>a</> is sufficient to determine the value of <literal>b</>. - In normalized databases, functional dependencies are allowed only on - primary keys and superkeys. However, many data sets are in practice not - fully normalized for various reasons; intentional denormalization for - performance reasons is a common example. + The simplest type of extended statistics tracks <firstterm>functional + dependencies</>, a concept used in definitions of database normal forms. + We say that column <structfield>b</> is functionally dependent on + column <structfield>a</> if knowledge of the value of + <structfield>a</> is sufficient to determine the value + of <structfield>b</>, that is there are no two rows having the same value + of <structfield>a</> but different values of <structfield>b</>. + In a fully normalized database, functional dependencies should exist + only on primary keys and superkeys. However, in practice many data sets + are not fully normalized for various reasons; intentional + denormalization for performance reasons is a common example. + Even in a fully normalized database, there may be partial correlation + between some columns, which can be expressed as partial functional + dependency. </para> <para> - The existance of functional dependencies directly affects the accuracy - of estimates in certain queries. - The reason is that conditions on the dependent columns do not - restrict the result set, but the query planner (lacking functional - dependency knowledge) considers them independent, resulting in - underestimates. - To inform the planner about the functional dependencies, we collect - measurements of dependency during <command>ANALYZE</>. Assessing - the degree of dependency between all sets of columns would be - prohibitively expensive, so the search is limited to potential - dependencies defined using the <literal>dependencies</> option of - extended statistics. It is advisable to create - <literal>dependencies</> statistics if and only if functional - dependencies actually exist, to avoid unnecessary overhead on both - <command>ANALYZE</> and query planning. + The existence of functional dependencies directly affects the accuracy + of estimates in certain queries. If a query contains conditions on + both the independent and the dependent column(s), the + conditions on the dependent columns do not further reduce the result + size; but without knowledge of the functional dependency, the query + planner will assume that the conditions are independent, resulting + in underestimating the result size. </para> <para> - To inspect functional dependencies on a statistics - <literal>stts</literal>, you may do this: + To inform the planner about functional dependencies, <command>ANALYZE</> + can collect measurements of cross-column dependency. Assessing the + degree of dependency between all sets of columns would be prohibitively + expensive, so data collection is limited to those groups of columns + appearing together in a statistics object defined with + the <literal>dependencies</> option. It is advisable to create + <literal>dependencies</> statistics only for column groups that are + strongly correlated, to avoid unnecessary overhead in both + <command>ANALYZE</> and later query planning. + </para> + + <para> + Here is an example of collecting functional-dependency statistics: <programlisting> -CREATE STATISTICS stts (dependencies) - ON zip, city FROM zipcodes; +CREATE STATISTICS stts (dependencies) ON zip, city FROM zipcodes; + ANALYZE zipcodes; + SELECT stxname, stxkeys, stxdependencies FROM pg_statistic_ext - WHERE stxname = 'stts'; + WHERE stxname = 'stts'; stxname | stxkeys | stxdependencies ---------+---------+------------------------------------------ stts | 1 5 | {"1 => 5": 1.000000, "5 => 1": 0.423130} (1 row) </programlisting> - where it can be seen that column 1 (a zip code) fully determines column + Here it can be seen that column 1 (zip code) fully determines column 5 (city) so the coefficient is 1.0, while city only determines zip code about 42% of the time, meaning that there are many cities (58%) that are represented by more than a single ZIP code. </para> <para> - When computing the selectivity, the planner inspects all conditions and - attempts to identify which conditions are already implied by other - conditions. The selectivity estimates from any redundant conditions are - ignored from a selectivity point of view. In the example query above, - the selectivity estimates for either of the conditions may be eliminated, - thus improving the overall estimate. + When computing the selectivity for a query involving functionally + dependent columns, the planner adjusts the per-condition selectivity + estimates using the dependency coefficients so as not to produce + an underestimate. </para> <sect4> <title>Limitations of Functional Dependencies</title> <para> - Functional dependencies are a very simple type of statistics, and - as such have several limitations. The first limitation is that they - only work with simple equality conditions, comparing columns and constant - values. It's not possible to use them to eliminate equality conditions - comparing two columns or a column to an expression, range clauses, - <literal>LIKE</> or any other type of conditions. + Functional dependencies are currently only applied when considering + simple equality conditions that compare columns to constant values. + They are not used to improve estimates for equality conditions + comparing two columns or comparing a column to an expression, nor for + range clauses, <literal>LIKE</> or any other type of condition. </para> <para> - When eliminating the implied conditions, the planner assumes that the - conditions are compatible. Consider the following example, where - this assumption does not hold: - + When estimating with functional dependencies, the planner assumes that + conditions on the involved columns are compatible and hence redundant. + If they are incompatible, the correct estimate would be zero rows, but + that possibility is not considered. For example, given a query like <programlisting> -EXPLAIN (ANALYZE, TIMING OFF) SELECT * FROM t WHERE a = 1 AND b = 10; - QUERY PLAN ------------------------------------------------------------------------------ - Seq Scan on t (cost=0.00..195.00 rows=100 width=8) (actual rows=0 loops=1) - Filter: ((a = 1) AND (b = 10)) - Rows Removed by Filter: 10000 +SELECT * FROM zipcodes WHERE city = 'San Francisco' AND zip = '94105'; </programlisting> - - While there are no rows with such combination of values, the planner - is unable to verify whether the values match — it only knows that - the columns are functionally dependent. + the planner will disregard the <structfield>city</> clause as not + changing the selectivity, which is correct. However, it will make + the same assumption about +<programlisting> +SELECT * FROM zipcodes WHERE city = 'San Francisco' AND zip = '90210'; +</programlisting> + even though there will really be zero rows satisfying this query. + Functional dependency statistics do not provide enough information + to conclude that, however. </para> <para> - This assumption is related to queries executed on the database; in many - cases, it's actually satisfied (e.g. when the GUI only allows selecting - compatible values). But if that's not the case, functional dependencies - may not be a viable option. + In many practical situations, this assumption is usually satisfied; + for example, there might be a GUI in the application that only allows + selecting compatible city and zipcode values to use in a query. + But if that's not the case, functional dependencies may not be a viable + option. </para> </sect4> </sect3> <sect3> - <title>Multivariate N-Distinct Coefficients</title> + <title>Multivariate N-Distinct Counts</title> <para> Single-column statistics store the number of distinct values in each - column. Estimates of the number of distinct values on more than one - column (for example, for <literal>GROUP BY a, b</literal>) are + column. Estimates of the number of distinct values when combining more + than one column (for example, for <literal>GROUP BY a, b</literal>) are frequently wrong when the planner only has single-column statistical - data, however, causing it to select bad plans. - In order to improve n-distinct estimation when multiple columns are - grouped together, the <literal>ndistinct</> option of extended statistics - can be used, which instructs <command>ANALYZE</> to collect n-distinct - estimates for all possible combinations of two or more columns of the set - of columns in the statistics object (the per-column estimates are already - available in <structname>pg_statistic</>). + data, causing it to select bad plans. + </para> + + <para> + To improve such estimates, <command>ANALYZE</> can collect n-distinct + statistics for groups of columns. As before, it's impractical to do + this for every possible column grouping, so data is collected only for + those groups of columns appearing together in a statistics object + defined with the <literal>ndistinct</> option. Data will be collected + for each possible combination of two or more columns from the set of + listed columns. </para> <para> - Continuing the above example, the n-distinct coefficients in a ZIP - code table may look like the following: + Continuing the previous example, the n-distinct counts in a + table of ZIP codes might look like the following: <programlisting> -CREATE STATISTICS stts2 (ndistinct) - ON zip, state, city FROM zipcodes; +CREATE STATISTICS stts2 (ndistinct) ON zip, state, city FROM zipcodes; + ANALYZE zipcodes; + SELECT stxkeys AS k, stxndistinct AS nd FROM pg_statistic_ext - WHERE stxname = 'stts2'; + WHERE stxname = 'stts2'; -[ RECORD 1 ]-------------------------------------------------------- k | 1 2 5 nd | {"1, 2": 33178, "1, 5": 33178, "2, 5": 27435, "1, 2, 5": 33178} (1 row) </programlisting> - which indicates that there are three combinations of columns that + This indicates that there are three combinations of columns that have 33178 distinct values: ZIP code and state; ZIP code and city; and ZIP code, city and state (the fact that they are all equal is - expected given the nature of ZIP-code data). On the other hand, - the combination of city and state only has 27435 distinct values. + expected given that ZIP code alone is unique in this table). On the + other hand, the combination of city and state has only 27435 distinct + values. + </para> + + <para> + It's advisable to create <literal>ndistinct</> statistics objects only + on combinations of columns that are actually used for grouping, and + for which misestimation of the number of groups is resulting in bad + plans. Otherwise, the <command>ANALYZE</> cycles are just wasted. </para> </sect3> </sect2> diff --git a/doc/src/sgml/planstats.sgml b/doc/src/sgml/planstats.sgml index ef847b96333..8caf297f859 100644 --- a/doc/src/sgml/planstats.sgml +++ b/doc/src/sgml/planstats.sgml @@ -456,10 +456,11 @@ rows = (outer_cardinality * inner_cardinality) * selectivity </indexterm> <sect2> - <title>Functional dependencies</title> + <title>Functional Dependencies</title> + <para> - Multivariate correlation can be seen with a very simple data set — a - table with two columns, both containing the same values: + Multivariate correlation can be demonstrated with a very simple data set + — a table with two columns, both containing the same values: <programlisting> CREATE TABLE t (a INT, b INT); @@ -501,8 +502,8 @@ EXPLAIN (ANALYZE, TIMING OFF) SELECT * FROM t WHERE a = 1; number of rows, we see that the estimate is very accurate (in fact exact, as the table is very small). Changing the <literal>WHERE</> to use the <structfield>b</> column, an identical - plan is generated. Observe what happens if we apply the same - condition on both columns combining them with <literal>AND</>: + plan is generated. But observe what happens if we apply the same + condition on both columns, combining them with <literal>AND</>: <programlisting> EXPLAIN (ANALYZE, TIMING OFF) SELECT * FROM t WHERE a = 1 AND b = 1; @@ -514,15 +515,16 @@ EXPLAIN (ANALYZE, TIMING OFF) SELECT * FROM t WHERE a = 1 AND b = 1; </programlisting> The planner estimates the selectivity for each condition individually, - arriving to the 1% estimates as above, and then multiplies them, getting - the final 0.01% estimate. The <quote>actual</quote> figures, however, - show that this results in a significant underestimate, as the actual - number of rows matching the conditions (100) is two orders of magnitude - higher than the estimated value. + arriving at the same 1% estimates as above. Then it assumes that the + conditions are independent, and so it multiplies their selectivities, + producing a final selectivity estimate of just 0.01%. + This is a significant underestimate, as the actual number of rows + matching the conditions (100) is two orders of magnitude higher. </para> <para> - This problem can be fixed by applying functional-dependency + This problem can be fixed by creating a statistics object that + directs <command>ANALYZE</> to calculate functional-dependency multivariate statistics on the two columns: <programlisting> @@ -539,13 +541,15 @@ EXPLAIN (ANALYZE, TIMING OFF) SELECT * FROM t WHERE a = 1 AND b = 1; </sect2> <sect2> - <title>Multivariate N-Distinct coefficients</title> + <title>Multivariate N-Distinct Counts</title> + <para> - A similar problem occurs with estimation of the cardinality of distinct - elements, used to determine the number of groups that would be generated - by a <command>GROUP BY</command> clause. When <command>GROUP BY</command> - lists a single column, the n-distinct estimate (which can be seen as the - number of rows returned by the aggregate execution node) is very accurate: + A similar problem occurs with estimation of the cardinality of sets of + multiple columns, such as the number of groups that would be generated by + a <command>GROUP BY</command> clause. When <command>GROUP BY</command> + lists a single column, the n-distinct estimate (which is visible as the + estimated number of rows returned by the HashAggregate node) is very + accurate: <programlisting> EXPLAIN (ANALYZE, TIMING OFF) SELECT COUNT(*) FROM t GROUP BY a; QUERY PLAN @@ -565,8 +569,8 @@ EXPLAIN (ANALYZE, TIMING OFF) SELECT COUNT(*) FROM t GROUP BY a, b; Group Key: a, b -> Seq Scan on t (cost=0.00..145.00 rows=10000 width=8) (actual rows=10000 loops=1) </programlisting> - By dropping the existing statistics and re-creating it to include n-distinct - calculation, the estimate is much improved: + By redefining the statistics object to include n-distinct counts for the + two columns, the estimate is much improved: <programlisting> DROP STATISTICS stts; CREATE STATISTICS stts (dependencies, ndistinct) ON a, b FROM t; diff --git a/doc/src/sgml/ref/create_statistics.sgml b/doc/src/sgml/ref/create_statistics.sgml index 92ee4e4efa7..854746de24d 100644 --- a/doc/src/sgml/ref/create_statistics.sgml +++ b/doc/src/sgml/ref/create_statistics.sgml @@ -79,11 +79,13 @@ CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="PARAMETER">statistics_na <term><replaceable class="PARAMETER">statistic_type</replaceable></term> <listitem> <para> - A statistic type to be computed in this statistics object. Currently - supported types are <literal>ndistinct</literal>, which enables - n-distinct coefficient tracking, - and <literal>dependencies</literal>, which enables functional - dependencies. + A statistic type to be computed in this statistics object. + Currently supported types are + <literal>ndistinct</literal>, which enables n-distinct statistics, and + <literal>dependencies</literal>, which enables functional + dependency statistics. + For more information, see <xref linkend="planner-stats-extended"> + and <xref linkend="multivariate-statistics-examples">. </para> </listitem> </varlistentry> @@ -92,7 +94,8 @@ CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="PARAMETER">statistics_na <term><replaceable class="PARAMETER">column_name</replaceable></term> <listitem> <para> - The name of a table column to be included in the statistics object. + The name of a table column to be covered by the computed statistics. + At least two column names must be given. </para> </listitem> </varlistentry> @@ -114,7 +117,9 @@ CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="PARAMETER">statistics_na <title>Notes</title> <para> - You must be the owner of a table to create or change statistics on it. + You must be the owner of a table to create a statistics object + reading it. Once created, however, the ownership of the statistics + object is independent of the underlying table(s). </para> </refsect1> @@ -124,8 +129,8 @@ CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="PARAMETER">statistics_na <para> Create table <structname>t1</> with two functionally dependent columns, i.e. knowledge of a value in the first column is sufficient for determining the - value in the other column. Then functional dependencies are built on those - columns: + value in the other column. Then functional dependency statistics are built + on those columns: <programlisting> CREATE TABLE t1 ( @@ -136,21 +141,25 @@ CREATE TABLE t1 ( INSERT INTO t1 SELECT i/100, i/500 FROM generate_series(1,1000000) s(i); +ANALYZE t1; + +-- the number of matching rows will be drastically underestimated: +EXPLAIN ANALYZE SELECT * FROM t1 WHERE (a = 1) AND (b = 0); + CREATE STATISTICS s1 (dependencies) ON a, b FROM t1; ANALYZE t1; --- valid combination of values +-- now the rowcount estimate is more accurate: EXPLAIN ANALYZE SELECT * FROM t1 WHERE (a = 1) AND (b = 0); - --- invalid combination of values -EXPLAIN ANALYZE SELECT * FROM t1 WHERE (a = 1) AND (b = 1); </programlisting> - Without functional-dependency statistics, the planner would make the - same estimate of the number of matching rows for these two queries. - With such statistics, it is able to tell that one case has matches - and the other does not. + Without functional-dependency statistics, the planner would assume + that the two <literal>WHERE</> conditions are independent, and would + multiply their selectivities together to arrive at a much-too-small + rowcount estimate. + With such statistics, the planner recognizes that the <literal>WHERE</> + conditions are redundant and does not underestimate the rowcount. </para> </refsect1> |