Modify prefix_selectivity() so that it will never estimate the selectivity
authorTom Lane
Sat, 8 Mar 2008 22:41:38 +0000 (22:41 +0000)
committerTom Lane
Sat, 8 Mar 2008 22:41:38 +0000 (22:41 +0000)
of the generated range condition var >= 'foo' AND var < 'fop' as being less
than what eqsel() would estimate for var = 'foo'.  This is intuitively
reasonable and it gets rid of the need for some entirely ad-hoc coding we
formerly used to reject bogus estimates.  The basic problem here is that
if the prefix is more than a few characters long, the two boundary values
are too close together to be distinguishable by comparison to the column
histogram, resulting in a selectivity estimate of zero, which is often
not very sane.  Change motivated by an example from Peter Eisentraut.

Arguably this is a bug fix, but I'll refrain from back-patching it
for the moment.

src/backend/utils/adt/selfuncs.c

index 37f33d56fa0fd553be09434b62d0ed5d27c17408..b5558687d287ee4c8389ef142d9320ab65cd1774 100644 (file)
@@ -15,7 +15,7 @@
  *
  *
  * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.243 2008/01/01 19:45:52 momjian Exp $
+ *   $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.244 2008/03/08 22:41:38 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
 #include "utils/syscache.h"
 
 
+static double var_eq_const(VariableStatData *vardata, Oid operator,
+            Datum constval, bool constisnull,
+            bool varonleft);
+static double var_eq_non_const(VariableStatData *vardata, Oid operator,
+                Node *other,
+                bool varonleft);
 static double ineq_histogram_selectivity(VariableStatData *vardata,
                           FmgrInfo *opproc, bool isgt,
                           Datum constval, Oid consttype);
@@ -156,10 +162,6 @@ eqsel(PG_FUNCTION_ARGS)
    VariableStatData vardata;
    Node       *other;
    bool        varonleft;
-   Datum      *values;
-   int         nvalues;
-   float4     *numbers;
-   int         nnumbers;
    double      selec;
 
    /*
@@ -171,149 +173,139 @@ eqsel(PG_FUNCTION_ARGS)
        PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);
 
    /*
-    * If the something is a NULL constant, assume operator is strict and
+    * We can do a lot better if the something is a constant.  (Note: the
+    * Const might result from estimation rather than being a simple constant
+    * in the query.)
+    */
+   if (IsA(other, Const))
+       selec = var_eq_const(&vardata, operator,
+                            ((Const *) other)->constvalue,
+                            ((Const *) other)->constisnull,
+                            varonleft);
+   else
+       selec = var_eq_non_const(&vardata, operator, other,
+                                varonleft);
+
+   ReleaseVariableStats(vardata);
+
+   PG_RETURN_FLOAT8((float8) selec);
+}
+
+/*
+ * var_eq_const --- eqsel for var = const case
+ *
+ * This is split out so that some other estimation functions can use it.
+ */
+static double
+var_eq_const(VariableStatData *vardata, Oid operator,
+            Datum constval, bool constisnull,
+            bool varonleft)
+{
+   double      selec;
+
+   /*
+    * If the constant is NULL, assume operator is strict and
     * return zero, ie, operator will never return TRUE.
     */
-   if (IsA(other, Const) &&
-       ((Const *) other)->constisnull)
-   {
-       ReleaseVariableStats(vardata);
-       PG_RETURN_FLOAT8(0.0);
-   }
+   if (constisnull)
+       return 0.0;
 
-   if (HeapTupleIsValid(vardata.statsTuple))
+   if (HeapTupleIsValid(vardata->statsTuple))
    {
        Form_pg_statistic stats;
+       Datum      *values;
+       int         nvalues;
+       float4     *numbers;
+       int         nnumbers;
+       bool        match = false;
+       int         i;
 
-       stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
+       stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
 
-       if (IsA(other, Const))
+       /*
+        * Is the constant "=" to any of the column's most common values?
+        * (Although the given operator may not really be "=", we will
+        * assume that seeing whether it returns TRUE is an appropriate
+        * test.  If you don't like this, maybe you shouldn't be using
+        * eqsel for your operator...)
+        */
+       if (get_attstatsslot(vardata->statsTuple,
+                            vardata->atttype, vardata->atttypmod,
+                            STATISTIC_KIND_MCV, InvalidOid,
+                            &values, &nvalues,
+                            &numbers, &nnumbers))
        {
-           /* Variable is being compared to a known non-null constant */
-           Datum       constval = ((Const *) other)->constvalue;
-           bool        match = false;
-           int         i;
+           FmgrInfo    eqproc;
 
-           /*
-            * Is the constant "=" to any of the column's most common values?
-            * (Although the given operator may not really be "=", we will
-            * assume that seeing whether it returns TRUE is an appropriate
-            * test.  If you don't like this, maybe you shouldn't be using
-            * eqsel for your operator...)
-            */
-           if (get_attstatsslot(vardata.statsTuple,
-                                vardata.atttype, vardata.atttypmod,
-                                STATISTIC_KIND_MCV, InvalidOid,
-                                &values, &nvalues,
-                                &numbers, &nnumbers))
-           {
-               FmgrInfo    eqproc;
-
-               fmgr_info(get_opcode(operator), &eqproc);
-
-               for (i = 0; i < nvalues; i++)
-               {
-                   /* be careful to apply operator right way 'round */
-                   if (varonleft)
-                       match = DatumGetBool(FunctionCall2(&eqproc,
-                                                          values[i],
-                                                          constval));
-                   else
-                       match = DatumGetBool(FunctionCall2(&eqproc,
-                                                          constval,
-                                                          values[i]));
-                   if (match)
-                       break;
-               }
-           }
-           else
-           {
-               /* no most-common-value info available */
-               values = NULL;
-               numbers = NULL;
-               i = nvalues = nnumbers = 0;
-           }
+           fmgr_info(get_opcode(operator), &eqproc);
 
-           if (match)
+           for (i = 0; i < nvalues; i++)
            {
-               /*
-                * Constant is "=" to this common value.  We know selectivity
-                * exactly (or as exactly as ANALYZE could calculate it,
-                * anyway).
-                */
-               selec = numbers[i];
-           }
-           else
-           {
-               /*
-                * Comparison is against a constant that is neither NULL nor
-                * any of the common values.  Its selectivity cannot be more
-                * than this:
-                */
-               double      sumcommon = 0.0;
-               double      otherdistinct;
-
-               for (i = 0; i < nnumbers; i++)
-                   sumcommon += numbers[i];
-               selec = 1.0 - sumcommon - stats->stanullfrac;
-               CLAMP_PROBABILITY(selec);
-
-               /*
-                * and in fact it's probably a good deal less. We approximate
-                * that all the not-common values share this remaining
-                * fraction equally, so we divide by the number of other
-                * distinct values.
-                */
-               otherdistinct = get_variable_numdistinct(&vardata)
-                   - nnumbers;
-               if (otherdistinct > 1)
-                   selec /= otherdistinct;
-
-               /*
-                * Another cross-check: selectivity shouldn't be estimated as
-                * more than the least common "most common value".
-                */
-               if (nnumbers > 0 && selec > numbers[nnumbers - 1])
-                   selec = numbers[nnumbers - 1];
+               /* be careful to apply operator right way 'round */
+               if (varonleft)
+                   match = DatumGetBool(FunctionCall2(&eqproc,
+                                                      values[i],
+                                                      constval));
+               else
+                   match = DatumGetBool(FunctionCall2(&eqproc,
+                                                      constval,
+                                                      values[i]));
+               if (match)
+                   break;
            }
+       }
+       else
+       {
+           /* no most-common-value info available */
+           values = NULL;
+           numbers = NULL;
+           i = nvalues = nnumbers = 0;
+       }
 
-           free_attstatsslot(vardata.atttype, values, nvalues,
-                             numbers, nnumbers);
+       if (match)
+       {
+           /*
+            * Constant is "=" to this common value.  We know selectivity
+            * exactly (or as exactly as ANALYZE could calculate it,
+            * anyway).
+            */
+           selec = numbers[i];
        }
        else
        {
-           double      ndistinct;
+           /*
+            * Comparison is against a constant that is neither NULL nor
+            * any of the common values.  Its selectivity cannot be more
+            * than this:
+            */
+           double      sumcommon = 0.0;
+           double      otherdistinct;
+
+           for (i = 0; i < nnumbers; i++)
+               sumcommon += numbers[i];
+           selec = 1.0 - sumcommon - stats->stanullfrac;
+           CLAMP_PROBABILITY(selec);
 
            /*
-            * Search is for a value that we do not know a priori, but we will
-            * assume it is not NULL.  Estimate the selectivity as non-null
-            * fraction divided by number of distinct values, so that we get a
-            * result averaged over all possible values whether common or
-            * uncommon.  (Essentially, we are assuming that the not-yet-known
-            * comparison value is equally likely to be any of the possible
-            * values, regardless of their frequency in the table.  Is that a
-            * good idea?)
+            * and in fact it's probably a good deal less. We approximate
+            * that all the not-common values share this remaining
+            * fraction equally, so we divide by the number of other
+            * distinct values.
             */
-           selec = 1.0 - stats->stanullfrac;
-           ndistinct = get_variable_numdistinct(&vardata);
-           if (ndistinct > 1)
-               selec /= ndistinct;
+           otherdistinct = get_variable_numdistinct(vardata) - nnumbers;
+           if (otherdistinct > 1)
+               selec /= otherdistinct;
 
            /*
-            * Cross-check: selectivity should never be estimated as more than
-            * the most common value's.
+            * Another cross-check: selectivity shouldn't be estimated as
+            * more than the least common "most common value".
             */
-           if (get_attstatsslot(vardata.statsTuple,
-                                vardata.atttype, vardata.atttypmod,
-                                STATISTIC_KIND_MCV, InvalidOid,
-                                NULL, NULL,
-                                &numbers, &nnumbers))
-           {
-               if (nnumbers > 0 && selec > numbers[0])
-                   selec = numbers[0];
-               free_attstatsslot(vardata.atttype, NULL, 0, numbers, nnumbers);
-           }
+           if (nnumbers > 0 && selec > numbers[nnumbers - 1])
+               selec = numbers[nnumbers - 1];
        }
+
+       free_attstatsslot(vardata->atttype, values, nvalues,
+                         numbers, nnumbers);
    }
    else
    {
@@ -322,15 +314,78 @@ eqsel(PG_FUNCTION_ARGS)
         * of distinct values and assuming they are equally common. (The guess
         * is unlikely to be very good, but we do know a few special cases.)
         */
-       selec = 1.0 / get_variable_numdistinct(&vardata);
+       selec = 1.0 / get_variable_numdistinct(vardata);
    }
 
-   ReleaseVariableStats(vardata);
+   /* result should be in range, but make sure... */
+   CLAMP_PROBABILITY(selec);
+
+   return selec;
+}
+
+/*
+ * var_eq_non_const --- eqsel for var = something-other-than-const case
+ */
+static double
+var_eq_non_const(VariableStatData *vardata, Oid operator,
+                Node *other,
+                bool varonleft)
+{
+   double      selec;
+
+   if (HeapTupleIsValid(vardata->statsTuple))
+   {
+       Form_pg_statistic stats;
+       double      ndistinct;
+       float4     *numbers;
+       int         nnumbers;
+
+       stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
+
+       /*
+        * Search is for a value that we do not know a priori, but we will
+        * assume it is not NULL.  Estimate the selectivity as non-null
+        * fraction divided by number of distinct values, so that we get a
+        * result averaged over all possible values whether common or
+        * uncommon.  (Essentially, we are assuming that the not-yet-known
+        * comparison value is equally likely to be any of the possible
+        * values, regardless of their frequency in the table.  Is that a
+        * good idea?)
+        */
+       selec = 1.0 - stats->stanullfrac;
+       ndistinct = get_variable_numdistinct(vardata);
+       if (ndistinct > 1)
+           selec /= ndistinct;
+
+       /*
+        * Cross-check: selectivity should never be estimated as more than
+        * the most common value's.
+        */
+       if (get_attstatsslot(vardata->statsTuple,
+                            vardata->atttype, vardata->atttypmod,
+                            STATISTIC_KIND_MCV, InvalidOid,
+                            NULL, NULL,
+                            &numbers, &nnumbers))
+       {
+           if (nnumbers > 0 && selec > numbers[0])
+               selec = numbers[0];
+           free_attstatsslot(vardata->atttype, NULL, 0, numbers, nnumbers);
+       }
+   }
+   else
+   {
+       /*
+        * No ANALYZE stats available, so make a guess using estimated number
+        * of distinct values and assuming they are equally common. (The guess
+        * is unlikely to be very good, but we do know a few special cases.)
+        */
+       selec = 1.0 / get_variable_numdistinct(vardata);
+   }
 
    /* result should be in range, but make sure... */
    CLAMP_PROBABILITY(selec);
 
-   PG_RETURN_FLOAT8((float8) selec);
+   return selec;
 }
 
 /*
@@ -1047,16 +1102,11 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
         */
        Oid         eqopr = get_opfamily_member(opfamily, vartype, vartype,
                                                BTEqualStrategyNumber);
-       List       *eqargs;
 
        if (eqopr == InvalidOid)
            elog(ERROR, "no = operator for opfamily %u", opfamily);
-       eqargs = list_make2(variable, prefix);
-       result = DatumGetFloat8(DirectFunctionCall4(eqsel,
-                                                   PointerGetDatum(root),
-                                                   ObjectIdGetDatum(eqopr),
-                                                   PointerGetDatum(eqargs),
-                                                   Int32GetDatum(varRelid)));
+       result = var_eq_const(&vardata, eqopr, prefix->constvalue,
+                             false, true);
    }
    else
    {
@@ -4430,6 +4480,7 @@ prefix_selectivity(VariableStatData *vardata,
    Oid         cmpopr;
    FmgrInfo    opproc;
    Const      *greaterstrcon;
+   Selectivity eq_sel;
 
    cmpopr = get_opfamily_member(opfamily, vartype, vartype,
                                 BTGreaterEqualStrategyNumber);
@@ -4444,7 +4495,7 @@ prefix_selectivity(VariableStatData *vardata,
    if (prefixsel <= 0.0)
    {
        /* No histogram is present ... return a suitable default estimate */
-       return 0.005;
+       return DEFAULT_MATCH_SEL;
    }
 
    /*-------
@@ -4452,17 +4503,17 @@ prefix_selectivity(VariableStatData *vardata,
     *  "x < greaterstr".
     *-------
     */
-   cmpopr = get_opfamily_member(opfamily, vartype, vartype,
-                                BTLessStrategyNumber);
-   if (cmpopr == InvalidOid)
-       elog(ERROR, "no < operator for opfamily %u", opfamily);
-   fmgr_info(get_opcode(cmpopr), &opproc);
-
    greaterstrcon = make_greater_string(prefixcon, &opproc);
    if (greaterstrcon)
    {
        Selectivity topsel;
 
+       cmpopr = get_opfamily_member(opfamily, vartype, vartype,
+                                    BTLessStrategyNumber);
+       if (cmpopr == InvalidOid)
+           elog(ERROR, "no < operator for opfamily %u", opfamily);
+       fmgr_info(get_opcode(cmpopr), &opproc);
+
        topsel = ineq_histogram_selectivity(vardata, &opproc, false,
                                            greaterstrcon->constvalue,
                                            greaterstrcon->consttype);
@@ -4477,16 +4528,30 @@ prefix_selectivity(VariableStatData *vardata,
         * doesn't count those anyway.
         */
        prefixsel = topsel + prefixsel - 1.0;
-
-       /*
-        * A zero or negative prefixsel should be converted into a small
-        * positive value; we probably are dealing with a very tight range and
-        * got a bogus result due to roundoff errors.
-        */
-       if (prefixsel <= 0.0)
-           prefixsel = 1.0e-10;
    }
 
+   /*
+    * If the prefix is long then the two bounding values might be too
+    * close together for the histogram to distinguish them usefully,
+    * resulting in a zero estimate (plus or minus roundoff error).
+    * To avoid returning a ridiculously small estimate, compute the
+    * estimated selectivity for "variable = 'foo'", and clamp to that.
+    * (Obviously, the resultant estimate should be at least that.)
+    *
+    * We apply this even if we couldn't make a greater string.  That case
+    * suggests that the prefix is near the maximum possible, and thus
+    * probably off the end of the histogram, and thus we probably got a
+    * very small estimate from the >= condition; so we still need to clamp.
+    */
+   cmpopr = get_opfamily_member(opfamily, vartype, vartype,
+                                BTEqualStrategyNumber);
+   if (cmpopr == InvalidOid)
+       elog(ERROR, "no = operator for opfamily %u", opfamily);
+   eq_sel = var_eq_const(vardata, cmpopr, prefixcon->constvalue,
+                         false, true);
+
+   prefixsel = Max(prefixsel, eq_sel);
+
    return prefixsel;
 }