are independent of each other,
an assumption that does not hold when column values are correlated.
Regular statistics, because of their per-individual-column nature,
- do not capture the knowledge of cross-column correlation;
- multivariate statistics can be used to instruct
- the server to obtain statistics across such a set of columns,
- which are later used by the query optimizer
- to determine cardinality and selectivity
- of clauses involving those columns.
- Multivariate statistics are currently the only use of
- extended statistics.
+ cannot capture any knowledge about cross-column correlation.
+ However,
PostgreSQL has the ability to compute
+ multivariate statistics, which can capture
+ such information.
- Extended statistics are created using
+ Because the number of possible column combinations is very large,
+ it's impractical to compute multivariate statistics automatically.
+ Instead, extended statistics objects, more often
+ called just statistics objects, can be created to instruct
+ the server to obtain statistics across interesting sets of columns.
+
+
+ Statistics objects are created using
CREATE STATISTICS, which see for more details.
- Data collection is deferred until the next ANALYZE
- on the table, after which the stored values can be examined in the
+ Creation of such an object merely creates a catalog entry expressing
+ interest in the statistics. Actual data collection is performed
+ by ANALYZE (either a manual command, or background
+ auto-analyze). The collected values can be examined in the
pg_statistic_ext
catalog.
+ ANALYZE computes extended statistics based on the same
+ sample of table rows that it takes for computing regular single-column
+ statistics. Since the sample size is increased by increasing the
+ statistics target for the table or any of its columns (as described in
+ the previous section), a larger statistics target will normally result in
+ more accurate extended statistics, as well as more time spent calculating
+ them.
+
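+ For example, the statistics target can be raised for one of a table's
+ columns before it is analyzed; the table and column here are the ones from
+ the zipcodes example below, and the target value of 1000 is arbitrary:
+
+-- Illustration only: a larger per-column target enlarges the row sample
+-- that ANALYZE takes for the whole table, and therefore also the sample
+-- used for any extended statistics defined on it.
+ALTER TABLE zipcodes ALTER COLUMN zip SET STATISTICS 1000;
+ANALYZE zipcodes;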
+
The following subsections describe the types of extended statistics
that are currently supported.
Functional Dependencies
- The simplest type of extended statistics are functional dependencies,
- a concept used in definitions of database normal forms.
- Put simply, it is said that column b> is functionally
- dependent on column a> if knowledge of the value of
- a> is sufficient to determine the value of b>.
- In normalized databases, functional dependencies are allowed only on
- primary keys and superkeys. However, many data sets are in practice not
- fully normalized for various reasons; intentional denormalization for
- performance reasons is a common example.
+ The simplest type of extended statistics tracks functional
+ dependencies, a concept used in definitions of database normal forms.
+ We say that column b is functionally dependent on
+ column a if knowledge of the value of
+ a is sufficient to determine the value
+ of b, that is, there are no two rows having the same value
+ of a but different values of b.
+ In a fully normalized database, functional dependencies should exist
+ only on primary keys and superkeys. However, in practice many data sets
+ are not fully normalized for various reasons; intentional
+ denormalization for performance reasons is a common example.
+ Even in a fully normalized database, there may be partial correlation
+ between some columns, which can be expressed as a partial functional
+ dependency.
- The existance of functional dependencies directly affects the accuracy
- of estimates in certain queries.
- The reason is that conditions on the dependent columns do not
- restrict the result set, but the query planner (lacking functional
- dependency knowledge) considers them independent, resulting in
- underestimates.
- To inform the planner about the functional dependencies, we collect
- measurements of dependency during ANALYZE>. Assessing
- the degree of dependency between all sets of columns would be
- prohibitively expensive, so the search is limited to potential
- dependencies defined using the dependencies> option of
- extended statistics. It is advisable to create
- dependencies> statistics if and only if functional
- dependencies actually exist, to avoid unnecessary overhead on both
- ANALYZE> and query planning.
+ The existence of functional dependencies directly affects the accuracy
+ of estimates in certain queries. If a query contains conditions on
+ both the independent and the dependent column(s), the
+ conditions on the dependent columns do not further reduce the result
+ size; but without knowledge of the functional dependency, the query
+ planner will assume that the conditions are independent, resulting
+ in underestimating the result size.
- To inspect functional dependencies on a statistics
- stts, you may do this:
+ To inform the planner about functional dependencies, ANALYZE
+ can collect measurements of cross-column dependency. Assessing the
+ degree of dependency between all sets of columns would be prohibitively
+ expensive, so data collection is limited to those groups of columns
+ appearing together in a statistics object defined with
+ the dependencies option. It is advisable to create
+ dependencies statistics only for column groups that are
+ strongly correlated, to avoid unnecessary overhead in both
+ ANALYZE and later query planning.
+
+
+ Here is an example of collecting functional-dependency statistics:
-CREATE STATISTICS stts (dependencies)
- ON zip, city FROM zipcodes;
+CREATE STATISTICS stts (dependencies) ON zip, city FROM zipcodes;
+
ANALYZE zipcodes;
+
SELECT stxname, stxkeys, stxdependencies
FROM pg_statistic_ext
- WHERE stxname = 'stts';
+ WHERE stxname = 'stts';
stxname | stxkeys | stxdependencies
---------+---------+------------------------------------------
stts | 1 5 | {"1 => 5": 1.000000, "5 => 1": 0.423130}
(1 row)
- where it can be seen that column 1 (a zip code) fully determines column
+ Here it can be seen that column 1 (zip code) fully determines column
5 (city) so the coefficient is 1.0, while city only determines zip code
about 42% of the time, meaning that there are many cities (58%) that are
represented by more than a single ZIP code.
- When computing the selectivity, the planner inspects all conditions and
- attempts to identify which conditions are already implied by other
- conditions. The selectivity estimates from any redundant conditions are
- ignored from a selectivity point of view. In the example query above,
- the selectivity estimates for either of the conditions may be eliminated,
- thus improving the overall estimate.
+ When computing the selectivity for a query involving functionally
+ dependent columns, the planner adjusts the per-condition selectivity
+ estimates using the dependency coefficients so as not to produce
+ an underestimate.
Limitations of Functional Dependencies
- Functional dependencies are a very simple type of statistics, and
- as such have several limitations. The first limitation is that they
- only work with simple equality conditions, comparing columns and constant
- values. It's not possible to use them to eliminate equality conditions
- comparing two columns or a column to an expression, range clauses,
- LIKE> or any other type of conditions.
+ Functional dependencies are currently only applied when considering
+ simple equality conditions that compare columns to constant values.
+ They are not used to improve estimates for equality conditions
+ comparing two columns or comparing a column to an expression, nor for
+ range clauses, LIKE or any other type of condition.
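+
+ For example (an illustrative query against the zipcodes table above), the
+ dependencies statistics collected earlier would not improve the estimate
+ for a query such as:
+
+-- The LIKE pattern below is not a simple column-equals-constant clause,
+-- so the functional-dependency statistics are not applied to it.
+SELECT * FROM zipcodes WHERE city = 'San Francisco' AND zip LIKE '941%';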
- When eliminating the implied conditions, the planner assumes that the
- conditions are compatible. Consider the following example, where
- this assumption does not hold:
-
+ When estimating with functional dependencies, the planner assumes that
+ conditions on the involved columns are compatible and hence redundant.
+ If they are incompatible, the correct estimate would be zero rows, but
+ that possibility is not considered. For example, given a query like
-EXPLAIN (ANALYZE, TIMING OFF) SELECT * FROM t WHERE a = 1 AND b = 10;
- QUERY PLAN
------------------------------------------------------------------------------
- Seq Scan on t (cost=0.00..195.00 rows=100 width=8) (actual rows=0 loops=1)
- Filter: ((a = 1) AND (b = 10))
- Rows Removed by Filter: 10000
+SELECT * FROM zipcodes WHERE city = 'San Francisco' AND zip = '94105';
-
- While there are no rows with such combination of values, the planner
- is unable to verify whether the values match — it only knows that
- the columns are functionally dependent.
+ the planner will disregard the city clause as not
+ changing the selectivity, which is correct. However, it will make
+ the same assumption about
+SELECT * FROM zipcodes WHERE city = 'San Francisco' AND zip = '90210';
+
+ even though there will really be zero rows satisfying this query.
+ Functional dependency statistics do not provide enough information
+ to conclude that, however.
- This assumption is related to queries executed on the database; in many
- cases, it's actually satisfied (e.g. when the GUI only allows selecting
- compatible values). But if that's not the case, functional dependencies
- may not be a viable option.
+ In practice, this assumption is usually satisfied;
+ for example, there might be a GUI in the application that only allows
+ selecting compatible city and ZIP code values to use in a query.
+ But if that's not the case, functional dependencies may not be a viable
+ option.
- Multivariate N-Distinct Coefficients
+ Multivariate N-Distinct Counts
Single-column statistics store the number of distinct values in each
- column. Estimates of the number of distinct values on more than one
- column (for example, for GROUP BY a, b) are
+ column. Estimates of the number of distinct values when combining more
+ than one column (for example, for GROUP BY a, b) are
frequently wrong when the planner only has single-column statistical
- data, however, causing it to select bad plans.
- In order to improve n-distinct estimation when multiple columns are
- grouped together, the ndistinct> option of extended statistics
- can be used, which instructs ANALYZE> to collect n-distinct
- estimates for all possible combinations of two or more columns of the set
- of columns in the statistics object (the per-column estimates are already
- available in pg_statistic>).
+ data, causing it to select bad plans.
+
+
+ To improve such estimates, ANALYZE can collect n-distinct
+ statistics for groups of columns. As before, it's impractical to do
+ this for every possible column grouping, so data is collected only for
+ those groups of columns appearing together in a statistics object
+ defined with the ndistinct option. Data will be collected
+ for each possible combination of two or more columns from the set of
+ listed columns.
- Continuing the above example, the n-distinct coefficients in a ZIP
- code table may look like the following:
+ Continuing the previous example, the n-distinct counts in a
+ table of ZIP codes might look like the following:
-CREATE STATISTICS stts2 (ndistinct)
- ON zip, state, city FROM zipcodes;
+CREATE STATISTICS stts2 (ndistinct) ON zip, state, city FROM zipcodes;
+
ANALYZE zipcodes;
+
SELECT stxkeys AS k, stxndistinct AS nd
FROM pg_statistic_ext
- WHERE stxname = 'stts2';
+ WHERE stxname = 'stts2';
-[ RECORD 1 ]--------------------------------------------------------
k | 1 2 5
nd | {"1, 2": 33178, "1, 5": 33178, "2, 5": 27435, "1, 2, 5": 33178}
(1 row)
- which indicates that there are three combinations of columns that
+ This indicates that there are three combinations of columns that
have 33178 distinct values: ZIP code and state; ZIP code and city;
and ZIP code, city and state (the fact that they are all equal is
- expected given the nature of ZIP-code data). On the other hand,
- the combination of city and state only has 27435 distinct values.
+ expected given that ZIP code alone is unique in this table). On the
+ other hand, the combination of city and state has only 27435 distinct
+ values.
+
+
+ It's advisable to create ndistinct statistics objects only
+ on combinations of columns that are actually used for grouping, and
+ for which misestimation of the number of groups is resulting in bad
+ plans. Otherwise, the ANALYZE cycles are just wasted.
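+
+ For example (an illustrative query; the stts2 object above already covers
+ the city and state columns), such statistics could help the planner
+ estimate the number of groups produced by:
+
+SELECT city, state, count(*) FROM zipcodes GROUP BY city, state;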
- Functional dependencies
+ Functional Dependencies
+
- Multivariate correlation can be seen with a very simple data set — a
- table with two columns, both containing the same values:
+ Multivariate correlation can be demonstrated with a very simple data set
+ — a table with two columns, both containing the same values:
CREATE TABLE t (a INT, b INT);
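-- A data set with the properties assumed below (10,000 rows, a and b always
-- equal, and each distinct value appearing in 100 rows) could be created
-- like this; the exact INSERT statement is only a sketch:
INSERT INTO t SELECT i % 100, i % 100 FROM generate_series(1, 10000) s(i);
ANALYZE t;

-- The estimate discussed next is for a query of this form; with this data
-- set both the estimated and the actual number of matching rows are 100.
EXPLAIN (ANALYZE, TIMING OFF) SELECT * FROM t WHERE a = 1;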
Comparing the estimated and actual number of rows, we see that the estimate is very accurate
(in fact exact, as the table is very small). Changing the
WHERE condition to use the b column, an identical
- plan is generated. Observe what happens if we apply the same
- condition on both columns combining them with AND>:
+ plan is generated. But observe what happens if we apply the same
+ condition on both columns, combining them with AND:
EXPLAIN (ANALYZE, TIMING OFF) SELECT * FROM t WHERE a = 1 AND b = 1;
The planner estimates the selectivity for each condition individually,
- arriving to the 1% estimates as above, and then multiplies them, getting
- the final 0.01% estimate. The actual
- figures, however,
- show that this results in a significant underestimate, as the actual
- number of rows matching the conditions (100) is two orders of magnitude
- higher than the estimated value.
+ arriving at the same 1% estimates as above. Then it assumes that the
+ conditions are independent, and so it multiplies their selectivities,
+ producing a final selectivity estimate of just 0.01%.
+ This is a significant underestimate, as the actual number of rows
+ matching the conditions (100) is two orders of magnitude higher.
- This problem can be fixed by applying functional-dependency
+ This problem can be fixed by creating a statistics object that
+ directs ANALYZE to calculate functional-dependency
multivariate statistics on the two columns:
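-- Sketch of the statements this refers to; the object name stts and the
-- column list are taken from the DROP/CREATE statements further below:
CREATE STATISTICS stts (dependencies) ON a, b FROM t;
ANALYZE t;

-- Repeating the earlier query should now yield a row estimate close to the
-- actual 100 matching rows, since b is fully determined by a.
EXPLAIN (ANALYZE, TIMING OFF) SELECT * FROM t WHERE a = 1 AND b = 1;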
- Multivariate N-Distinct coefficients
+ Multivariate N-Distinct Counts
+
- A similar problem occurs with estimation of the cardinality of distinct
- elements, used to determine the number of groups that would be generated
- by a GROUP BY clause. When GROUP BY
- lists a single column, the n-distinct estimate (which can be seen as the
- number of rows returned by the aggregate execution node) is very accurate:
+ A similar problem occurs with estimation of the cardinality of sets of
+ multiple columns, such as the number of groups that would be generated by
+ a GROUP BY clause. When GROUP BY
+ lists a single column, the n-distinct estimate (which is visible as the
+ estimated number of rows returned by the HashAggregate node) is very
+ accurate:
EXPLAIN (ANALYZE, TIMING OFF) SELECT COUNT(*) FROM t GROUP BY a;
QUERY PLAN
Group Key: a, b
-> Seq Scan on t (cost=0.00..145.00 rows=10000 width=8) (actual rows=10000 loops=1)
- By dropping the existing statistics and re-creating it to include n-distinct
- calculation, the estimate is much improved:
+ By redefining the statistics object to include n-distinct counts for the
+ two columns, the estimate is much improved:
DROP STATISTICS stts;
CREATE STATISTICS stts (dependencies, ndistinct) ON a, b FROM t;
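-- The redefined statistics object takes effect only after the table is
-- analyzed again; presumably the example continues along these lines:
ANALYZE t;
EXPLAIN (ANALYZE, TIMING OFF) SELECT COUNT(*) FROM t GROUP BY a, b;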