Add an at-least-marginally-plausible method of estimating the number of groups produced by GROUP BY.

author Tom Lane

Tue, 19 Nov 2002 23:22:00 +0000 (23:22 +0000)

committer Tom Lane

Tue, 19 Nov 2002 23:22:00 +0000 (23:22 +0000)
author Tom Lane
Tue, 19 Nov 2002 23:22:00 +0000 (23:22 +0000)
committer Tom Lane
Tue, 19 Nov 2002 23:22:00 +0000 (23:22 +0000)
diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c

index 5fa82ee9faddfa8dafa24386bf3c1d5dd9955b14..0216f8ebde7fe83f122af02ff1682614204a5630 100644 (file)
--- a/src/backend/executor/nodeAgg.c
+++ b/src/backend/executor/nodeAgg.c
@@ -45,7 +45,7 @@
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
- *   $Header: /cvsroot/pgsql/src/backend/executor/nodeAgg.c,v 1.95 2002/11/13 00:39:47 momjian Exp $
+ *   $Header: /cvsroot/pgsql/src/backend/executor/nodeAgg.c,v 1.96 2002/11/19 23:21:57 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -619,6 +619,9 @@ lookup_hash_entry(Agg *node, TupleTableSlot *slot)
         Datum       attr;
         bool        isNull;
  
+       /* rotate hashkey left 1 bit at each step */
+       hashkey = (hashkey << 1) | ((hashkey & 0x80000000) ? 1 : 0);
+
         attr = heap_getattr(tuple, att, tupdesc, &isNull);
         if (isNull)
             continue;           /* treat nulls as having hash key 0 */
diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c

index 9dc29584e82ddc2fce048508073b8a6223b42c17..2c345b9f7856fa5be556bb0b60fadcd2c6cfe96e 100644 (file)
--- a/src/backend/nodes/copyfuncs.c
+++ b/src/backend/nodes/copyfuncs.c
@@ -15,7 +15,7 @@
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
- *   $Header: /cvsroot/pgsql/src/backend/nodes/copyfuncs.c,v 1.218 2002/11/15 02:50:06 momjian Exp $
+ *   $Header: /cvsroot/pgsql/src/backend/nodes/copyfuncs.c,v 1.219 2002/11/19 23:21:58 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -1865,8 +1865,8 @@ _copyQuery(Query *from)
  
     /*
      * We do not copy the planner internal fields: base_rel_list,
-    * other_rel_list, join_rel_list, equi_key_list, query_pathkeys. Not
-    * entirely clear if this is right?
+    * other_rel_list, join_rel_list, equi_key_list, query_pathkeys,
+    * hasJoinRTEs.  Not entirely clear if this is right?
      */
  
     return newnode;
diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c

index 68e93e48b08c47bb7ed291c30ddb82cfe2e4aca6..61e314ff186197e21f365cdb9e75d9d1987029de 100644 (file)
--- a/src/backend/nodes/equalfuncs.c
+++ b/src/backend/nodes/equalfuncs.c
@@ -20,7 +20,7 @@
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
- *   $Header: /cvsroot/pgsql/src/backend/nodes/equalfuncs.c,v 1.164 2002/11/15 02:50:06 momjian Exp $
+ *   $Header: /cvsroot/pgsql/src/backend/nodes/equalfuncs.c,v 1.165 2002/11/19 23:21:58 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -628,9 +628,9 @@ _equalQuery(Query *a, Query *b)
  
     /*
      * We do not check the internal-to-the-planner fields: base_rel_list,
-    * other_rel_list, join_rel_list, equi_key_list, query_pathkeys. They
-    * might not be set yet, and in any case they should be derivable from
-    * the other fields.
+    * other_rel_list, join_rel_list, equi_key_list, query_pathkeys,
+    * hasJoinRTEs.  They might not be set yet, and in any case they should
+    * be derivable from the other fields.
      */
     return true;
  }
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c

index 717fcfa3cec75c300daa871d0059460fb60ff071..74e6f237b3ed90b33e3dee6d896f5da6793820d5 100644 (file)
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -10,7 +10,7 @@
   *
   *
   * IDENTIFICATION
- *   $Header: /cvsroot/pgsql/src/backend/optimizer/plan/createplan.c,v 1.122 2002/11/15 02:36:53 tgl Exp $
+ *   $Header: /cvsroot/pgsql/src/backend/optimizer/plan/createplan.c,v 1.123 2002/11/19 23:21:58 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -1684,7 +1684,8 @@ make_material(List *tlist, Plan *lefttree)
  
  Agg *
  make_agg(List *tlist, List *qual, AggStrategy aggstrategy,
-        int ngrp, AttrNumber *grpColIdx, Plan *lefttree)
+        int ngrp, AttrNumber *grpColIdx, long numGroups, int numAggs,
+        Plan *lefttree)
  {
     Agg        *node = makeNode(Agg);
     Plan       *plan = &node->plan;
@@ -1692,6 +1693,7 @@ make_agg(List *tlist, List *qual, AggStrategy aggstrategy,
     node->aggstrategy = aggstrategy;
     node->numCols = ngrp;
     node->grpColIdx = grpColIdx;
+   node->numGroups = numGroups;
  
     copy_plan_costsize(plan, lefttree);
  
@@ -1699,15 +1701,11 @@ make_agg(List *tlist, List *qual, AggStrategy aggstrategy,
      * Charge one cpu_operator_cost per aggregate function per input
      * tuple.
      */
-   plan->total_cost += cpu_operator_cost * plan->plan_rows *
-       (length(pull_agg_clause((Node *) tlist)) +
-        length(pull_agg_clause((Node *) qual)));
+   plan->total_cost += cpu_operator_cost * plan->plan_rows * numAggs;
  
     /*
      * We will produce a single output tuple if not grouping,
-    * and a tuple per group otherwise.  For now, estimate the number of
-    * groups as 10% of the number of tuples --- bogus, but how to do
-    * better?
+    * and a tuple per group otherwise.
      */
     if (aggstrategy == AGG_PLAIN)
     {
@@ -1716,10 +1714,7 @@ make_agg(List *tlist, List *qual, AggStrategy aggstrategy,
     }
     else
     {
-       plan->plan_rows *= 0.1;
-       if (plan->plan_rows < 1)
-           plan->plan_rows = 1;
-       node->numGroups = (long) plan->plan_rows;
+       plan->plan_rows = numGroups;
     }
  
     plan->state = (EState *) NULL;
@@ -1735,6 +1730,7 @@ Group *
  make_group(List *tlist,
            int ngrp,
            AttrNumber *grpColIdx,
+          double numGroups,
            Plan *lefttree)
  {
     Group      *node = makeNode(Group);
@@ -1748,13 +1744,8 @@ make_group(List *tlist,
      */
     plan->total_cost += cpu_operator_cost * plan->plan_rows * ngrp;
  
-   /*
-    * Estimate the number of groups as 10% of the number of tuples
-    * --- bogus, but how to do better?
-    */
-   plan->plan_rows *= 0.1;
-   if (plan->plan_rows < 1)
-       plan->plan_rows = 1;
+   /* One output tuple per estimated result group */
+   plan->plan_rows = numGroups;
  
     plan->state = (EState *) NULL;
     plan->qual = NULL;
@@ -1786,17 +1777,16 @@ make_unique(List *tlist, Plan *lefttree, List *distinctList)
  
     /*
      * Charge one cpu_operator_cost per comparison per input tuple. We
-    * assume all columns get compared at most of the tuples.
+    * assume all columns get compared at most of the tuples.  (XXX probably
+    * this is an overestimate.)
      */
     plan->total_cost += cpu_operator_cost * plan->plan_rows * numCols;
  
     /*
-    * As for Group, we make the unsupported assumption that there will be
-    * 10% as many tuples out as in.
+    * plan->plan_rows is left as a copy of the input subplan's plan_rows;
+    * ie, we assume the filter removes nothing.  The caller must alter this
+    * if he has a better idea.
      */
-   plan->plan_rows *= 0.1;
-   if (plan->plan_rows < 1)
-       plan->plan_rows = 1;
  
     plan->state = (EState *) NULL;
     plan->targetlist = tlist;
@@ -1850,8 +1840,8 @@ make_setop(SetOpCmd cmd, List *tlist, Plan *lefttree,
     plan->total_cost += cpu_operator_cost * plan->plan_rows * numCols;
  
     /*
-    * As for Group, we make the unsupported assumption that there will be
-    * 10% as many tuples out as in.
+    * We make the unsupported assumption that there will be 10% as many
+    * tuples out as in.  Any way to do better?
      */
     plan->plan_rows *= 0.1;
     if (plan->plan_rows < 1)
diff --git a/src/backend/optimizer/plan/initsplan.c b/src/backend/optimizer/plan/initsplan.c

index e06282c126533c829f05ae2129270049997e3282..e43c52f6dfe0d7bc97963d01106e711060d0e679 100644 (file)
--- a/src/backend/optimizer/plan/initsplan.c
+++ b/src/backend/optimizer/plan/initsplan.c
@@ -8,7 +8,7 @@
   *
   *
   * IDENTIFICATION
- *   $Header: /cvsroot/pgsql/src/backend/optimizer/plan/initsplan.c,v 1.75 2002/09/04 20:31:21 momjian Exp $
+ *   $Header: /cvsroot/pgsql/src/backend/optimizer/plan/initsplan.c,v 1.76 2002/11/19 23:21:58 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -784,6 +784,71 @@ process_implied_equality(Query *root, Node *item1, Node *item2,
                             pull_varnos((Node *) clause));
  }
  
+/*
+ * vars_known_equal
+ *   Detect whether two Vars are known equal due to equijoin clauses.
+ *   Returns true iff a mergejoinable clause has var1 and var2 as operands.
+ * This is not completely accurate since we avoid adding redundant restriction
+ * clauses to individual base rels (see qual_is_redundant).  However, after
+ * the implied-equality-deduction phase, it is complete for Vars of different
+ * rels; that's sufficient for planned uses.
+ */
+bool
+vars_known_equal(Query *root, Var *var1, Var *var2)
+{
+   Index       irel1;
+   Index       irel2;
+   RelOptInfo *rel1;
+   List       *restrictlist;
+   List       *itm;
+
+   /*
+    * Would need more work here if we wanted to check for known equality
+    * of general clauses: there might be multiple base rels involved.
+    */
+   Assert(IsA(var1, Var));
+   irel1 = var1->varno;
+   Assert(IsA(var2, Var));
+   irel2 = var2->varno;
+
+   /*
+    * If both vars belong to same rel, we need to look at that rel's
+    * baserestrictinfo list.  If different rels, each will have a
+    * joininfo node for the other, and we can scan either list.
+    */
+   rel1 = find_base_rel(root, irel1);
+   if (irel1 == irel2)
+       restrictlist = rel1->baserestrictinfo;
+   else
+   {
+       JoinInfo   *joininfo = find_joininfo_node(rel1,
+                                                 makeListi1(irel2));
+
+       restrictlist = joininfo->jinfo_restrictinfo;
+   }
+
+   /*
+    * Scan for a mergejoinable clause equating the two Vars, either order.
+    */
+   foreach(itm, restrictlist)
+   {
+       RestrictInfo *restrictinfo = (RestrictInfo *) lfirst(itm);
+       Node       *left,
+                  *right;
+
+       if (restrictinfo->mergejoinoperator == InvalidOid)
+           continue;           /* ignore non-mergejoinable clauses */
+       /* We now know the restrictinfo clause is a binary opclause */
+       left = (Node *) get_leftop(restrictinfo->clause);
+       right = (Node *) get_rightop(restrictinfo->clause);
+       if ((equal(var1, left) && equal(var2, right)) ||
+           (equal(var2, left) && equal(var1, right)))
+           return true;        /* found a matching clause */
+   }
+
+   return false;
+}
+
  /*
   * qual_is_redundant
   *   Detect whether an implied-equality qual that turns out to be a
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c

index ab51f0cedbb2cf8c39c39c1925d70683e7791bad..baccf2ffbda32342fcffc91e2ddb9653231c34e5 100644 (file)
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -8,14 +8,17 @@
   *
   *
   * IDENTIFICATION
- *   $Header: /cvsroot/pgsql/src/backend/optimizer/plan/planner.c,v 1.128 2002/11/14 19:00:36 tgl Exp $
+ *   $Header: /cvsroot/pgsql/src/backend/optimizer/plan/planner.c,v 1.129 2002/11/19 23:21:59 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
  
  #include "postgres.h"
  
+#include <limits.h>
+
  #include "catalog/pg_type.h"
+#include "miscadmin.h"
  #include "nodes/makefuncs.h"
  #ifdef OPTIMIZER_DEBUG
  #include "nodes/print.h"
@@ -35,6 +38,7 @@
  #include "parser/parse_expr.h"
  #include "rewrite/rewriteManip.h"
  #include "utils/lsyscache.h"
+#include "utils/selfuncs.h"
  
  
  /* Expression kind codes for preprocess_expression */
@@ -160,6 +164,23 @@ subquery_planner(Query *parse, double tuple_fraction)
     parse->jointree = (FromExpr *)
         preprocess_jointree(parse, (Node *) parse->jointree);
  
+   /*
+    * Detect whether any rangetable entries are RTE_JOIN kind; if not,
+    * we can avoid the expense of doing flatten_join_alias_vars().
+    * This must be done after we have done pull_up_subqueries, of course.
+    */
+   parse->hasJoinRTEs = false;
+   foreach(lst, parse->rtable)
+   {
+       RangeTblEntry *rte = (RangeTblEntry *) lfirst(lst);
+
+       if (rte->rtekind == RTE_JOIN)
+       {
+           parse->hasJoinRTEs = true;
+           break;
+       }
+   }
+
     /*
      * Do expression preprocessing on targetlist and quals.
      */
@@ -694,9 +715,6 @@ preprocess_jointree(Query *parse, Node *jtnode)
  static Node *
  preprocess_expression(Query *parse, Node *expr, int kind)
  {
-   bool        has_join_rtes;
-   List       *rt;
-
     /*
      * Simplify constant expressions.
      *
@@ -737,22 +755,8 @@ preprocess_expression(Query *parse, Node *expr, int kind)
      * with base-relation variables, to allow quals to be pushed down. We
      * must do this after sublink processing, since it does not recurse
      * into sublinks.
-    *
-    * The flattening pass is expensive enough that it seems worthwhile to
-    * scan the rangetable to see if we can avoid it.
      */
-   has_join_rtes = false;
-   foreach(rt, parse->rtable)
-   {
-       RangeTblEntry *rte = lfirst(rt);
-
-       if (rte->rtekind == RTE_JOIN)
-       {
-           has_join_rtes = true;
-           break;
-       }
-   }
-   if (has_join_rtes)
+   if (parse->hasJoinRTEs)
         expr = flatten_join_alias_vars(expr, parse->rtable, false);
  
     return expr;
@@ -931,6 +935,9 @@ grouping_planner(Query *parse, double tuple_fraction)
         AttrNumber *groupColIdx = NULL;
         Path       *cheapest_path;
         Path       *sorted_path;
+       double      dNumGroups = 0;
+       long        numGroups = 0;
+       int         numAggs = 0;
         bool        use_hashed_grouping = false;
  
         /* Preprocess targetlist in case we are inside an INSERT/UPDATE. */
@@ -1006,6 +1013,19 @@ grouping_planner(Query *parse, double tuple_fraction)
         sort_pathkeys = make_pathkeys_for_sortclauses(parse->sortClause,
                                                       tlist);
  
+       /*
+        * Will need actual number of aggregates for estimating costs.
+        * Also, it's possible that optimization has eliminated all
+        * aggregates, and we may as well check for that here.
+        */
+       if (parse->hasAggs)
+       {
+           numAggs = length(pull_agg_clause((Node *) tlist)) +
+               length(pull_agg_clause(parse->havingQual));
+           if (numAggs == 0)
+               parse->hasAggs = false;
+       }
+
         /*
          * Figure out whether we need a sorted result from query_planner.
          *
@@ -1215,6 +1235,14 @@ grouping_planner(Query *parse, double tuple_fraction)
          */
         if (parse->groupClause)
         {
+           /*
+            * Always estimate the number of groups.
+            */
+           dNumGroups = estimate_num_groups(parse,
+                                            parse->groupClause,
+                                            cheapest_path->parent->rows);
+           numGroups = (long) Min(dNumGroups, (double) LONG_MAX);
+
             /*
              * Executor doesn't support hashed aggregation with DISTINCT
              * aggregates.  (Doing so would imply storing *all* the input
@@ -1226,10 +1254,30 @@ grouping_planner(Query *parse, double tuple_fraction)
                 use_hashed_grouping = false;
             else
             {
-#if 0                          /* much more to do here */
-               /* TEMPORARY HOTWIRE FOR TESTING */
-               use_hashed_grouping = true;
+               /*
+                * Use hashed grouping if (a) we think we can fit the
+                * hashtable into SortMem, *and* (b) the estimated cost
+                * is no more than doing it the other way.  While avoiding
+                * the need for sorted input is usually a win, the fact
+                * that the output won't be sorted may be a loss; so we
+                * need to do an actual cost comparison.
+                *
+                * In most cases we have no good way to estimate the size of
+                * the transition value needed by an aggregate; arbitrarily
+                * assume it is 100 bytes.  Also set the overhead per hashtable
+                * entry at 64 bytes.
+                */
+               int     hashentrysize = cheapest_path->parent->width + 64 +
+                   numAggs * 100;
+
+               if (hashentrysize * dNumGroups <= SortMem * 1024L)
+               {
+                   /* much more to do here */
+#if 0
+                   /* TEMPORARY HOTWIRE FOR TESTING */
+                   use_hashed_grouping = true;
  #endif
+               }
             }
         }
  
@@ -1319,6 +1367,8 @@ grouping_planner(Query *parse, double tuple_fraction)
                                             AGG_HASHED,
                                             length(parse->groupClause),
                                             groupColIdx,
+                                           numGroups,
+                                           numAggs,
                                             result_plan);
             /* Hashed aggregation produces randomly-ordered results */
             current_pathkeys = NIL;
@@ -1356,6 +1406,8 @@ grouping_planner(Query *parse, double tuple_fraction)
                                             aggstrategy,
                                             length(parse->groupClause),
                                             groupColIdx,
+                                           numGroups,
+                                           numAggs,
                                             result_plan);
         }
         else
@@ -1387,6 +1439,7 @@ grouping_planner(Query *parse, double tuple_fraction)
                 result_plan = (Plan *) make_group(tlist,
                                                   length(parse->groupClause),
                                                   groupColIdx,
+                                                 dNumGroups,
                                                   result_plan);
             }
         }
@@ -1410,6 +1463,16 @@ grouping_planner(Query *parse, double tuple_fraction)
     {
         result_plan = (Plan *) make_unique(tlist, result_plan,
                                            parse->distinctClause);
+       /*
+        * If there was grouping or aggregation, leave plan_rows as-is
+        * (ie, assume the result was already mostly unique).  If not,
+        * it's reasonable to assume the UNIQUE filter has effects
+        * comparable to GROUP BY.
+        */
+       if (!parse->groupClause && !parse->hasAggs)
+           result_plan->plan_rows = estimate_num_groups(parse,
+                                                        parse->distinctClause,
+                                                        result_plan->plan_rows);
     }
  
     /*
diff --git a/src/backend/optimizer/plan/setrefs.c b/src/backend/optimizer/plan/setrefs.c

index 66998b036f945774c2fe3bc63dca7f0fadcaf613..4239d9c3c12cef8dfccba4ac5f623df92167c74c 100644 (file)
--- a/src/backend/optimizer/plan/setrefs.c
+++ b/src/backend/optimizer/plan/setrefs.c
@@ -9,7 +9,7 @@
   *
   *
   * IDENTIFICATION
- *   $Header: /cvsroot/pgsql/src/backend/optimizer/plan/setrefs.c,v 1.81 2002/09/04 20:31:21 momjian Exp $
+ *   $Header: /cvsroot/pgsql/src/backend/optimizer/plan/setrefs.c,v 1.82 2002/11/19 23:21:59 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -439,7 +439,14 @@ join_references_mutator(Node *node,
             return (Node *) newvar;
         }
  
-       /* Perhaps it's a join alias that can be resolved to input vars? */
+       /* Return the Var unmodified, if it's for acceptable_rel */
+       if (var->varno == context->acceptable_rel)
+           return (Node *) copyObject(var);
+
+       /*
+        * Perhaps it's a join alias that can be resolved to input vars?
+        * We try this last since it's relatively slow.
+        */
         newnode = flatten_join_alias_vars((Node *) var,
                                           context->rtable,
                                           true);
@@ -450,13 +457,8 @@ join_references_mutator(Node *node,
             return newnode;
         }
  
-       /*
-        * No referent found for Var --- either raise an error, or return
-        * the Var unmodified if it's for acceptable_rel.
-        */
-       if (var->varno != context->acceptable_rel)
-           elog(ERROR, "join_references: variable not in subplan target lists");
-       return (Node *) copyObject(var);
+       /* No referent found for Var */
+       elog(ERROR, "join_references: variable not in subplan target lists");
     }
     return expression_tree_mutator(node,
                                    join_references_mutator,
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c

index 936b9ad99c04751e94e25f691df309b3003588dd..23e012c64e9deac9474246b1b78a245b3fc5202f 100644 (file)
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -15,7 +15,7 @@
   *
   *
   * IDENTIFICATION
- *   $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.120 2002/11/08 20:23:57 momjian Exp $
+ *   $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.121 2002/11/19 23:21:59 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -85,7 +85,10 @@
  #include "optimizer/cost.h"
  #include "optimizer/pathnode.h"
  #include "optimizer/plancat.h"
+#include "optimizer/planmain.h"
  #include "optimizer/prep.h"
+#include "optimizer/tlist.h"
+#include "optimizer/var.h"
  #include "parser/parse_func.h"
  #include "parser/parse_oper.h"
  #include "parser/parsetree.h"
@@ -1809,6 +1812,251 @@ mergejoinscansel(Query *root, Node *clause,
         *rightscan = 1.0;
  }
  
+/*
+ * estimate_num_groups     - Estimate number of groups in a grouped query
+ *
+ * Given a query having a GROUP BY clause, estimate how many groups there
+ * will be --- ie, the number of distinct combinations of the GROUP BY
+ * expressions.
+ *
+ * This routine is also used to estimate the number of rows emitted by
+ * a DISTINCT filtering step; that is an isomorphic problem.  (Note:
+ * actually, we only use it for DISTINCT when there's no grouping or
+ * aggregation ahead of the DISTINCT.)
+ *
+ * Inputs:
+ * root - the query
+ * groupClauses - list of GroupClauses (or SortClauses for the DISTINCT
+ *     case, but those are equivalent structs)
+ * input_rows - number of rows estimated to arrive at the group/unique
+ *     filter step
+ *
+ * Given the lack of any cross-correlation statistics in the system, it's
+ * impossible to do anything really trustworthy with GROUP BY conditions
+ * involving multiple Vars.  We should however avoid assuming the worst
+ * case (all possible cross-product terms actually appear as groups) since
+ * very often the grouped-by Vars are highly correlated.  Our current approach
+ * is as follows:
+ * 1.  Reduce the given expressions to a list of unique Vars used.  For
+ *     example, GROUP BY a, a + b is treated the same as GROUP BY a, b.
+ *     It is clearly correct not to count the same Var more than once.
+ *     It is also reasonable to treat f(x) the same as x: f() cannot
+ *     increase the number of distinct values (unless it is volatile,
+ *     which we consider unlikely for grouping), but it probably won't
+ *     reduce the number of distinct values much either.
+ * 2.  If the list contains Vars of different relations that are known equal
+ *     due to equijoin clauses, then drop all but one of the Vars from each
+ *     known-equal set, keeping the one with smallest estimated # of values
+ *     (since the extra values of the others can't appear in joined rows).
+ *     Note the reason we only consider Vars of different relations is that
+ *     if we considered ones of the same rel, we'd be double-counting the
+ *     restriction selectivity of the equality in the next step.
+ * 3.  For Vars within a single source rel, we multiply together the numbers
+ *     of values, clamp to the number of rows in the rel, and then multiply
+ *     by the selectivity of the restriction clauses for that rel.  The
+ *     initial product is probably too high (it's the worst case) but since
+ *     we can clamp to the rel's rows it won't be hugely bad.  Multiplying
+ *     by the restriction selectivity is effectively assuming that the
+ *     restriction clauses are independent of the grouping, which is a crummy
+ *     assumption, but it's hard to do better.
+ * 4.  If there are Vars from multiple rels, we repeat step 3 for each such
+ *     rel, and multiply the results together.
+ * Note that rels not containing grouped Vars are ignored completely, as are
+ * join clauses other than the equijoin clauses used in step 2.  Such rels
+ * cannot increase the number of groups, and we assume such clauses do not
+ * reduce the number either (somewhat bogus, but we don't have the info to
+ * do better).
+ */
+double
+estimate_num_groups(Query *root, List *groupClauses, double input_rows)
+{
+   List       *allvars = NIL;
+   List       *varinfos = NIL;
+   double      numdistinct;
+   List       *l;
+   typedef struct {            /* varinfos is a List of these */
+       Var    *var;
+       double  ndistinct;
+   } MyVarInfo;
+
+   /* We should not be called unless query has GROUP BY (or DISTINCT) */
+   Assert(groupClauses != NIL);
+
+   /* Step 1: get the unique Vars used */
+   foreach(l, groupClauses)
+   {
+       GroupClause *grpcl = (GroupClause *) lfirst(l);
+       Node       *groupexpr = get_sortgroupclause_expr(grpcl,
+                                                        root->targetList);
+       List       *varshere;
+
+       varshere = pull_var_clause(groupexpr, false);
+       /*
+        * Replace any JOIN alias Vars with the underlying Vars.  (This
+        * is not really right for FULL JOIN ...)
+        */
+       if (root->hasJoinRTEs)
+       {
+           varshere = (List *) flatten_join_alias_vars((Node *) varshere,
+                                                       root->rtable,
+                                                       true);
+           varshere = pull_var_clause((Node *) varshere, false);
+       }
+       /*
+        * If we find any variable-free GROUP BY item, then either it is
+        * a constant (and we can ignore it) or it contains a volatile
+        * function; in the latter case we punt and assume that each input
+        * row will yield a distinct group.
+        */
+       if (varshere == NIL)
+       {
+           if (contain_volatile_functions(groupexpr))
+               return input_rows;
+           continue;
+       }
+       allvars = nconc(allvars, varshere);
+   }
+
+   /* If now no Vars, we must have an all-constant GROUP BY list. */
+   if (allvars == NIL)
+       return 1.0;
+
+   /* Use set_union() to discard duplicates */
+   allvars = set_union(NIL, allvars);
+
+   /*
+    * Step 2: acquire statistical estimate of number of distinct values
+    * of each Var (total in its table, without regard for filtering).
+    * Also, detect known-equal Vars and discard the ones we don't want.
+    */
+   foreach(l, allvars)
+   {
+       Var    *var = (Var *) lfirst(l);
+       Oid     relid = getrelid(var->varno, root->rtable);
+       HeapTuple   statsTuple = NULL;
+       Form_pg_statistic stats = NULL;
+       double ndistinct;
+       bool    keep = true;
+       List   *l2;
+
+       if (OidIsValid(relid))
+       {
+           statsTuple = SearchSysCache(STATRELATT,
+                                       ObjectIdGetDatum(relid),
+                                       Int16GetDatum(var->varattno),
+                                       0, 0);
+           if (HeapTupleIsValid(statsTuple))
+               stats = (Form_pg_statistic) GETSTRUCT(statsTuple);
+       }
+       ndistinct = get_att_numdistinct(root, var, stats);
+       if (HeapTupleIsValid(statsTuple))
+           ReleaseSysCache(statsTuple);
+
+       foreach(l2, varinfos)
+       {
+           MyVarInfo  *varinfo = (MyVarInfo *) lfirst(l2);
+
+           if (var->varno != varinfo->var->varno &&
+               vars_known_equal(root, var, varinfo->var))
+           {
+               /* Found a match */
+               if (varinfo->ndistinct <= ndistinct)
+               {
+                   /* Keep older item, forget new one */
+                   keep = false;
+                   break;
+               }
+               else
+               {
+                   /*
+                    * Delete the older item.  We assume lremove() will not
+                    * break the lnext link of the item...
+                    */
+                   varinfos = lremove(varinfo, varinfos);
+               }
+           }
+       }
+
+       if (keep)
+       {
+           MyVarInfo  *varinfo = (MyVarInfo *) palloc(sizeof(MyVarInfo));
+
+           varinfo->var = var;
+           varinfo->ndistinct = ndistinct;
+           varinfos = lcons(varinfo, varinfos);
+       }
+   }
+
+   /*
+    * Steps 3/4: group Vars by relation and estimate total numdistinct.
+    *
+    * For each iteration of the outer loop, we process the frontmost
+    * Var in varinfos, plus all other Vars in the same relation.  We
+    * omit these Vars from the newvarinfos list built for the next pass.
+    * This is the easiest way to group Vars of same rel together.
+    */
+   Assert(varinfos != NIL);
+   numdistinct = 1.0;
+
+   do
+   {
+       MyVarInfo  *varinfo1 = (MyVarInfo *) lfirst(varinfos);
+       RelOptInfo *rel = find_base_rel(root, varinfo1->var->varno);
+       double  reldistinct = varinfo1->ndistinct;
+       List   *newvarinfos = NIL;
+
+       /*
+        * Multiply together the numdistinct estimates of this rel's Vars.
+        * Also, construct new varinfos list of remaining Vars.
+        */
+       foreach(l, lnext(varinfos))
+       {
+           MyVarInfo  *varinfo2 = (MyVarInfo *) lfirst(l);
+
+           if (varinfo2->var->varno == varinfo1->var->varno)
+           {
+               reldistinct *= varinfo2->ndistinct;
+           }
+           else
+           {
+               /* not time to process varinfo2 yet */
+               newvarinfos = lcons(varinfo2, newvarinfos);
+           }
+       }
+
+       /* Clamp to size of rel, multiply by restriction selectivity. */
+       Assert(rel->reloptkind == RELOPT_BASEREL);
+       if (reldistinct > rel->tuples)
+           reldistinct = rel->tuples;
+       /* Empty rel: skip the scaling, since 0/0 would yield NaN */
+       if (rel->tuples > 0)
+           reldistinct *= rel->rows / rel->tuples;
+
+       /*
+        * Update estimate of total distinct groups.
+        */
+       numdistinct *= reldistinct;
+
+       varinfos = newvarinfos;
+   } while (varinfos != NIL);
+
+   /* Guard against out-of-range answers */
+   if (numdistinct > input_rows)
+       numdistinct = input_rows;
+   if (numdistinct < 1.0)
+       numdistinct = 1.0;
+
+   return numdistinct;
+}
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Support routines
+ *
+ *-------------------------------------------------------------------------
+ */
+
  /*
   * get_var_maximum
   *     Estimate the maximum value of the specified variable.
@@ -3271,7 +3519,7 @@ pattern_selectivity(Const *patt, Pattern_Type ptype)
  
  
  /*
- * We want test whether the database's LC_COLLATE setting is safe for
+ * We want to test whether the database's LC_COLLATE setting is safe for
   * LIKE/regexp index optimization.
   *
   * The key requirement here is that given a prefix string, say "foo",
@@ -3284,7 +3532,7 @@ pattern_selectivity(Const *patt, Pattern_Type ptype)
   *
   * (In theory, locales other than C may be LIKE-safe so this function
   * could be different from lc_collate_is_c(), but in a different
- * theory, non-C locales are completely unpredicable so it's unlikely
+ * theory, non-C locales are completely unpredictable so it's unlikely
   * to happen.)
   *
   * Be sure to maintain the correspondence with the code in initdb.
diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h

index cd976cd1a14ca5c6c1e01731148cc17617094c8d..92501196f938440f266e0378c9ab6be08d54b864 100644 (file)
--- a/src/include/nodes/parsenodes.h
+++ b/src/include/nodes/parsenodes.h
@@ -7,7 +7,7 @@
   * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
- * $Id: parsenodes.h,v 1.215 2002/11/15 03:09:39 momjian Exp $
+ * $Id: parsenodes.h,v 1.216 2002/11/19 23:21:59 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -102,6 +102,7 @@ typedef struct Query
     List       *equi_key_list;  /* list of lists of equijoined
                                  * PathKeyItems */
     List       *query_pathkeys; /* desired pathkeys for query_planner() */
+   bool        hasJoinRTEs;    /* true if any RTEs are RTE_JOIN kind */
  } Query;
  
  
diff --git a/src/include/optimizer/planmain.h b/src/include/optimizer/planmain.h

index c927d54074038e6a1ed9e361b782debc5fa1dbb2..bd4bcddd308bfe3dc0b61566b16b4e4af8eb338a 100644 (file)
--- a/src/include/optimizer/planmain.h
+++ b/src/include/optimizer/planmain.h
@@ -7,7 +7,7 @@
   * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
- * $Id: planmain.h,v 1.61 2002/11/06 00:00:45 tgl Exp $
+ * $Id: planmain.h,v 1.62 2002/11/19 23:22:00 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -35,8 +35,11 @@ extern Sort *make_sort(Query *root, List *tlist,
  extern Sort *make_sort_from_pathkeys(Query *root, List *tlist,
                         Plan *lefttree, List *pathkeys);
  extern Agg *make_agg(List *tlist, List *qual, AggStrategy aggstrategy,
-                    int ngrp, AttrNumber *grpColIdx, Plan *lefttree);
-extern Group *make_group(List *tlist, int ngrp, AttrNumber *grpColIdx,
+                    int ngrp, AttrNumber *grpColIdx,
+                    long numGroups, int numAggs,
+                    Plan *lefttree);
+extern Group *make_group(List *tlist,
+                        int ngrp, AttrNumber *grpColIdx, double numGroups,
                          Plan *lefttree);
  extern Material *make_material(List *tlist, Plan *lefttree);
  extern Unique *make_unique(List *tlist, Plan *lefttree, List *distinctList);
@@ -54,6 +57,7 @@ extern void build_base_rel_tlists(Query *root, List *tlist);
  extern Relids distribute_quals_to_rels(Query *root, Node *jtnode);
  extern void process_implied_equality(Query *root, Node *item1, Node *item2,
                          Oid sortop1, Oid sortop2);
+extern bool vars_known_equal(Query *root, Var *var1, Var *var2);
  
  /*
   * prototypes for plan/setrefs.c
diff --git a/src/include/utils/selfuncs.h b/src/include/utils/selfuncs.h

index 8e73e61ffdc8b998030e439f98f0ce7161fe5c95..49f3bc7e005ff2fad73cc1fe3f480ff7a816b950 100644 (file)
--- a/src/include/utils/selfuncs.h
+++ b/src/include/utils/selfuncs.h
@@ -8,7 +8,7 @@
   * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
- * $Id: selfuncs.h,v 1.9 2002/10/19 02:56:16 tgl Exp $
+ * $Id: selfuncs.h,v 1.10 2002/11/19 23:22:00 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -75,6 +75,9 @@ extern void mergejoinscansel(Query *root, Node *clause,
                  Selectivity *leftscan,
                  Selectivity *rightscan);
  
+extern double estimate_num_groups(Query *root, List *groupClauses,
+                                 double input_rows);
+
  extern Datum btcostestimate(PG_FUNCTION_ARGS);
  extern Datum rtcostestimate(PG_FUNCTION_ARGS);
  extern Datum hashcostestimate(PG_FUNCTION_ARGS);
author	Tom Lane
	Tue, 19 Nov 2002 23:22:00 +0000 (23:22 +0000)
committer	Tom Lane
	Tue, 19 Nov 2002 23:22:00 +0000 (23:22 +0000)
src/backend/executor/nodeAgg.c		patch \| blob \| blame \| history
src/backend/nodes/copyfuncs.c		patch \| blob \| blame \| history
src/backend/nodes/equalfuncs.c		patch \| blob \| blame \| history
src/backend/optimizer/plan/createplan.c		patch \| blob \| blame \| history
src/backend/optimizer/plan/initsplan.c		patch \| blob \| blame \| history
src/backend/optimizer/plan/planner.c		patch \| blob \| blame \| history
src/backend/optimizer/plan/setrefs.c		patch \| blob \| blame \| history
src/backend/utils/adt/selfuncs.c		patch \| blob \| blame \| history
src/include/nodes/parsenodes.h		patch \| blob \| blame \| history
src/include/optimizer/planmain.h		patch \| blob \| blame \| history
src/include/utils/selfuncs.h		patch \| blob \| blame \| history