First step in fixing selectivity-estimation code. eqsel and
author: Tom Lane
Sun, 1 Aug 1999 04:54:25 +0000 (04:54 +0000)
committer: Tom Lane
Sun, 1 Aug 1999 04:54:25 +0000 (04:54 +0000)
neqsel now behave as per my suggestions in pghackers a few days ago.
Selectivity for < > <= >= should work OK for integral types as well, but
still need work for nonintegral types.  Since these routines have never
actually executed before :-(, this may result in some significant changes
in the optimizer's choices of execution plans.  Let me know if you see
any serious misbehavior.
CAUTION: THESE CHANGES REQUIRE INITDB.  pg_statistic table has changed.

src/backend/commands/vacuum.c
src/backend/utils/adt/selfuncs.c
src/include/catalog/pg_statistic.h
src/include/commands/vacuum.h
src/include/utils/builtins.h

index 204bd8e2693351457505478f4d7e43b4f2f0b839..12d2bc9bf99b856d2308f842562dcb61bb07adfc 100644 (file)
@@ -7,7 +7,7 @@
  *
  *
  * IDENTIFICATION
- *   $Header: /cvsroot/pgsql/src/backend/commands/vacuum.c,v 1.115 1999/07/19 07:07:20 momjian Exp $
+ *   $Header: /cvsroot/pgsql/src/backend/commands/vacuum.c,v 1.116 1999/08/01 04:54:24 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -78,7 +78,7 @@ static void vc_vacpage(Page page, VPageDescr vpd);
 static void vc_vaconeind(VPageList vpl, Relation indrel, int num_tuples, int keep_tuples);
 static void vc_scanoneind(Relation indrel, int num_tuples);
 static void vc_attrstats(Relation onerel, VRelStats *vacrelstats, HeapTuple tuple);
-static void vc_bucketcpy(Form_pg_attribute attr, Datum value, Datum *bucket, int16 *bucket_len);
+static void vc_bucketcpy(Form_pg_attribute attr, Datum value, Datum *bucket, int *bucket_len);
 static void vc_updstats(Oid relid, int num_pages, int num_tuples, bool hasindex, VRelStats *vacrelstats);
 static void vc_delhilowstats(Oid relid, int attcnt, int *attnums);
 static VPageDescr vc_tidreapped(ItemPointer itemptr, VPageList vpl);
@@ -473,9 +473,13 @@ vc_vacone(Oid relid, bool analyze, List *va_cols)
            {
                pgopform = (Form_pg_operator) GETSTRUCT(func_operator);
                fmgr_info(pgopform->oprcode, &(stats->f_cmplt));
+               stats->op_cmplt = oprid(func_operator);
            }
            else
+           {
                stats->f_cmplt.fn_addr = NULL;
+               stats->op_cmplt = InvalidOid;
+           }
 
            func_operator = oper(">", stats->attr->atttypid, stats->attr->atttypid, true);
            if (func_operator != NULL)
@@ -2200,8 +2204,8 @@ vc_attrstats(Relation onerel, VRelStats *vacrelstats, HeapTuple tuple)
            {
                swapDatum(stats->guess1, stats->guess2);
                swapInt(stats->guess1_len, stats->guess2_len);
-               stats->guess1_cnt = stats->guess2_hits;
                swapLong(stats->guess1_hits, stats->guess2_hits);
+               stats->guess1_cnt = stats->guess1_hits;
            }
            if (stats->guess1_cnt > stats->best_cnt)
            {
@@ -2227,7 +2231,7 @@ vc_attrstats(Relation onerel, VRelStats *vacrelstats, HeapTuple tuple)
  *
  */
 static void
-vc_bucketcpy(Form_pg_attribute attr, Datum value, Datum *bucket, int16 *bucket_len)
+vc_bucketcpy(Form_pg_attribute attr, Datum value, Datum *bucket, int *bucket_len)
 {
    if (attr->attbyval && attr->attlen != -1)
        *bucket = value;
@@ -2340,13 +2344,14 @@ vc_updstats(Oid relid, int num_pages, int num_tuples, bool hasindex, VRelStats *
                    selratio = 0;
                else if (VacAttrStatsLtGtValid(stats) && stats->min_cnt + stats->max_cnt == stats->nonnull_cnt)
                {
+                   /* exact result when there are just 1 or 2 values... */
                    double      min_cnt_d = stats->min_cnt,
                                max_cnt_d = stats->max_cnt,
                                null_cnt_d = stats->null_cnt,
-                               nonnullcnt_d = stats->nonnull_cnt;      /* prevent overflow */
+                               nonnull_cnt_d = stats->nonnull_cnt;     /* prevent overflow */
 
                    selratio = (min_cnt_d * min_cnt_d + max_cnt_d * max_cnt_d + null_cnt_d * null_cnt_d) /
-                       (nonnullcnt_d + null_cnt_d) / (nonnullcnt_d + null_cnt_d);
+                       (nonnull_cnt_d + null_cnt_d) / (nonnull_cnt_d + null_cnt_d);
                }
                else
                {
@@ -2359,7 +2364,9 @@ vc_updstats(Oid relid, int num_pages, int num_tuples, bool hasindex, VRelStats *
                     */
                    selratio = (most * most + 0.20 * most * (total - most)) / total / total;
                }
-               if (selratio > 1.0)
+               if (selratio < 0.0)
+                   selratio = 0.0;
+               else if (selratio > 1.0)
                    selratio = 1.0;
                attp->attdisbursion = selratio;
 
@@ -2375,13 +2382,22 @@ vc_updstats(Oid relid, int num_pages, int num_tuples, bool hasindex, VRelStats *
                 * doing system relations, especially pg_statistic is a
                 * problem
                 */
-               if (VacAttrStatsLtGtValid(stats) && stats->initialized  /* &&
-                                                                        * !IsSystemRelationName(
-                                                                        *
-                    pgcform->relname.data) */ )
+               if (VacAttrStatsLtGtValid(stats) && stats->initialized
+                   /* && !IsSystemRelationName(pgcform->relname.data)
+                    */ )
                {
+                   float32data nullratio;
+                   float32data bestratio;
                    FmgrInfo    out_function;
                    char       *out_string;
+                   double      best_cnt_d = stats->best_cnt,
+                               null_cnt_d = stats->null_cnt,
+                               nonnull_cnt_d = stats->nonnull_cnt;     /* prevent overflow */
+
+                   nullratio = null_cnt_d / (nonnull_cnt_d + null_cnt_d);
+                   bestratio = best_cnt_d / (nonnull_cnt_d + null_cnt_d);
+
+                   fmgr_info(stats->outfunc, &out_function);
 
                    for (i = 0; i < Natts_pg_statistic; ++i)
                        nulls[i] = ' ';
@@ -2391,26 +2407,34 @@ vc_updstats(Oid relid, int num_pages, int num_tuples, bool hasindex, VRelStats *
                     * ----------------
                     */
                    i = 0;
-                   values[i++] = (Datum) relid;        /* 1 */
-                   values[i++] = (Datum) attp->attnum; /* 2 */
-                   values[i++] = (Datum) InvalidOid;   /* 3 */
-                   fmgr_info(stats->outfunc, &out_function);
-                   out_string = (*fmgr_faddr(&out_function)) (stats->min, stats->attr->atttypid);
-                   values[i++] = (Datum) fmgr(F_TEXTIN, out_string);
+                   values[i++] = (Datum) relid;        /* starelid */
+                   values[i++] = (Datum) attp->attnum; /* staattnum */
+                   values[i++] = (Datum) stats->op_cmplt;  /* staop */
+                   /* hack: this code knows float4 is pass-by-ref */
+                   values[i++] = PointerGetDatum(&nullratio);  /* stanullfrac */
+                   values[i++] = PointerGetDatum(&bestratio);  /* stacommonfrac */
+                   out_string = (*fmgr_faddr(&out_function)) (stats->best, stats->attr->atttypid, stats->attr->atttypmod);
+                   values[i++] = PointerGetDatum(textin(out_string)); /* stacommonval */
                    pfree(out_string);
-                   out_string = (char *) (*fmgr_faddr(&out_function)) (stats->max, stats->attr->atttypid);
-                   values[i++] = (Datum) fmgr(F_TEXTIN, out_string);
+                   out_string = (*fmgr_faddr(&out_function)) (stats->min, stats->attr->atttypid, stats->attr->atttypmod);
+                   values[i++] = PointerGetDatum(textin(out_string)); /* staloval */
+                   pfree(out_string);
+                   out_string = (char *) (*fmgr_faddr(&out_function)) (stats->max, stats->attr->atttypid, stats->attr->atttypmod);
+                   values[i++] = PointerGetDatum(textin(out_string)); /* stahival */
                    pfree(out_string);
 
                    stup = heap_formtuple(sd->rd_att, values, nulls);
 
                    /* ----------------
-                    *  insert the tuple in the relation and get the tuple's oid.
+                    *  insert the tuple in the relation.
                     * ----------------
                     */
                    heap_insert(sd, stup);
-                   pfree(DatumGetPointer(values[3]));
-                   pfree(DatumGetPointer(values[4]));
+
+                   /* release allocated space */
+                   pfree(DatumGetPointer(values[Anum_pg_statistic_stacommonval-1]));
+                   pfree(DatumGetPointer(values[Anum_pg_statistic_staloval-1]));
+                   pfree(DatumGetPointer(values[Anum_pg_statistic_stahival-1]));
                    pfree(stup);
                }
            }
index db78c48525649537e49dea428040a276a08c8182..0b6afc814b6ed849811f48cf10309e6dedd6ecd6 100644 (file)
@@ -6,13 +6,11 @@
  *   These routines are registered in the operator catalog in the
  *   "oprrest" and "oprjoin" attributes.
  *
- *   XXX check all the functions--I suspect them to be 1-based.
- *
  * Copyright (c) 1994, Regents of the University of California
  *
  *
  * IDENTIFICATION
- *   $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.35 1999/07/17 20:17:59 momjian Exp $
+ *   $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.36 1999/08/01 04:54:22 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
 
 #include "access/heapam.h"
 #include "catalog/catname.h"
+#include "catalog/pg_operator.h"
 #include "catalog/pg_statistic.h"
+#include "catalog/pg_type.h"
+#include "parser/parse_oper.h"
 #include "utils/builtins.h"
 #include "utils/lsyscache.h"
 #include "utils/syscache.h"
 /* N is not a valid var/constant or relation id */
 #define NONVALUE(N)        ((N) == -1)
 
-/*
- * generalize the test for functional index selectivity request
- */
-#define FunctionalSelectivity(nIndKeys,attNum) (attNum==InvalidAttrNumber)
+/* are we looking at a functional index selectivity request? */
+#define FunctionalSelectivity(nIndKeys,attNum) ((attNum)==InvalidAttrNumber)
 
-static float32data getattdisbursion(Oid relid, AttrNumber attnum);
-static void gethilokey(Oid relid, AttrNumber attnum, Oid opid,
-          char **high, char **low);
+/* default selectivity estimate for inequalities such as "A < b" */
+#define DEFAULT_INEQ_SEL  (1.0 / 3.0)
+
+static void getattproperties(Oid relid, AttrNumber attnum,
+                            Oid *typid,
+                            int *typlen,
+                            bool *typbyval,
+                            int32 *typmod);
+static bool getattstatistics(Oid relid, AttrNumber attnum,
+                            Oid typid, int32 typmod,
+                            double *nullfrac,
+                            double *commonfrac,
+                            Datum *commonval,
+                            Datum *loval,
+                            Datum *hival);
+static double getattdisbursion(Oid relid, AttrNumber attnum);
 
 
 /*
- *     eqsel           - Selectivity of "=" for any data type.
+ *     eqsel           - Selectivity of "=" for any data types.
  */
 float64
 eqsel(Oid opid,
      Oid relid,
      AttrNumber attno,
-     char *value,
+     Datum value,
      int32 flag)
 {
    float64     result;
@@ -55,18 +67,124 @@ eqsel(Oid opid,
    if (NONVALUE(attno) || NONVALUE(relid))
        *result = 0.1;
    else
-       *result = (float64data) getattdisbursion(relid, (int) attno);
+   {
+       Oid         typid;
+       int         typlen;
+       bool        typbyval;
+       int32       typmod;
+       double      nullfrac;
+       double      commonfrac;
+       Datum       commonval;
+       double      selec;
+
+       /* get info about the attribute */
+       getattproperties(relid, attno,
+                        &typid, &typlen, &typbyval, &typmod);
+
+       if (getattstatistics(relid, attno, typid, typmod,
+                            &nullfrac, &commonfrac, &commonval,
+                            NULL, NULL))
+       {
+           if (flag & SEL_CONSTANT)
+           {
+               /* Is the constant the same as the most common value? */
+               HeapTuple   oprtuple;
+               Oid         ltype,
+                           rtype;
+               Operator    func_operator;
+               bool        mostcommon = false;
+
+               /* get left and right datatypes of the operator */
+               oprtuple = get_operator_tuple(opid);
+               if (! HeapTupleIsValid(oprtuple))
+                   elog(ERROR, "eqsel: no tuple for operator %u", opid);
+               ltype = ((Form_pg_operator) GETSTRUCT(oprtuple))->oprleft;
+               rtype = ((Form_pg_operator) GETSTRUCT(oprtuple))->oprright;
+
+               /* and find appropriate equality operator (no, it ain't
+                * necessarily opid itself...)
+                */
+               func_operator = oper("=", ltype, rtype, true);
+
+               if (func_operator != NULL)
+               {
+                   RegProcedure eqproc = ((Form_pg_operator) GETSTRUCT(func_operator))->oprcode;
+                   if (flag & SEL_RIGHT) /* given value on the right? */
+                       mostcommon = (bool)
+                           DatumGetUInt8(fmgr(eqproc, commonval, value));
+                   else
+                       mostcommon = (bool)
+                           DatumGetUInt8(fmgr(eqproc, value, commonval));
+               }
+
+               if (mostcommon)
+               {
+                   /* Search is for the most common value.  We know the
+                    * selectivity exactly (or as exactly as VACUUM could
+                    * calculate it, anyway).
+                    */
+                   selec = commonfrac;
+               }
+               else
+               {
+                   /* Comparison is against a constant that is neither the
+                    * most common value nor null.  Its selectivity cannot
+                    * be more than this:
+                    */
+                   selec = 1.0 - commonfrac - nullfrac;
+                   if (selec > commonfrac)
+                       selec = commonfrac;
+                   /* and in fact it's probably less, so apply a fudge
+                    * factor.
+                    */
+                   selec *= 0.5;
+               }
+           }
+           else
+           {
+               /* Search is for a value that we do not know a priori,
+                * but we will assume it is not NULL.  Selectivity
+                * cannot be more than this:
+                */
+               selec = 1.0 - nullfrac;
+               if (selec > commonfrac)
+                   selec = commonfrac;
+               /* and in fact it's probably less, so apply a fudge
+                * factor.
+                */
+               selec *= 0.5;
+           }
+
+           /* result should be in range, but make sure... */
+           if (selec < 0.0)
+               selec = 0.0;
+           else if (selec > 1.0)
+               selec = 1.0;
+
+           if (! typbyval)
+               pfree(DatumGetPointer(commonval));
+       }
+       else
+       {
+           /* No VACUUM ANALYZE stats available, so make a guess using
+            * the disbursion stat (if we have that, which is unlikely...)
+            */
+           selec = getattdisbursion(relid, attno);
+       }
+
+       *result = (float64data) selec;
+   }
    return result;
 }
 
 /*
- *     neqsel          - Selectivity of "!=" for any data type.
+ *     neqsel          - Selectivity of "!=" for any data types.
  */
 float64
 neqsel(Oid opid,
       Oid relid,
       AttrNumber attno,
-      char *value,
+      Datum value,
       int32 flag)
 {
    float64     result;
@@ -77,96 +195,164 @@ neqsel(Oid opid,
 }
 
 /*
- *     intltsel        - Selectivity of "<" for integers.
+ *     intltsel        - Selectivity of "<" (also "<=") for integers.
  *                       Should work for both longs and shorts.
  */
 float64
 intltsel(Oid opid,
         Oid relid,
         AttrNumber attno,
-        int32 value,
+        Datum value,
         int32 flag)
 {
    float64     result;
-   char       *highchar,
-              *lowchar;
-   long        val,
-               high,
-               low,
-               top,
-               bottom;
 
    result = (float64) palloc(sizeof(float64data));
-   if (NONVALUE(attno) || NONVALUE(relid))
-       *result = 1.0 / 3;
+   if (! (flag & SEL_CONSTANT) || NONVALUE(attno) || NONVALUE(relid))
+       *result = DEFAULT_INEQ_SEL;
    else
    {
-       /* XXX          val = atol(value); */
-       val = value;
-       gethilokey(relid, (int) attno, opid, &highchar, &lowchar);
-       if (*highchar == 'n' || *lowchar == 'n')
+       HeapTuple   oprtuple;
+       Oid         ltype,
+                   rtype;
+       Oid         typid;
+       int         typlen;
+       bool        typbyval;
+       int32       typmod;
+       Datum       hival,
+                   loval;
+       long        val,
+                   high,
+                   low,
+                   numerator,
+                   denominator;
+
+       /* get left and right datatypes of the operator */
+       oprtuple = get_operator_tuple(opid);
+       if (! HeapTupleIsValid(oprtuple))
+           elog(ERROR, "intltsel: no tuple for operator %u", opid);
+       ltype = ((Form_pg_operator) GETSTRUCT(oprtuple))->oprleft;
+       rtype = ((Form_pg_operator) GETSTRUCT(oprtuple))->oprright;
+
+       /*
+        * TEMPORARY HACK: this code is currently getting called for
+        * a bunch of non-integral types.  Give a default estimate if
+        * either side is not pass-by-val.  Need better solution.
+        */
+       if (! get_typbyval(ltype) || ! get_typbyval(rtype))
        {
-           *result = 1.0 / 3.0;
+           *result = DEFAULT_INEQ_SEL;
            return result;
        }
-       high = atol(highchar);
-       low = atol(lowchar);
-       if ((flag & SEL_RIGHT && val < low) ||
-           (!(flag & SEL_RIGHT) && val > high))
+
+       /* Deduce type of the constant, and convert to uniform "long" format.
+        * Note that constant might well be a different type than attribute.
+        * XXX this ought to use a type-specific "convert to double" op.
+        */
+       typid = (flag & SEL_RIGHT) ? rtype : ltype;
+       switch (get_typlen(typid))
        {
-           float32data nvals;
+           case 1:
+               val = (long) DatumGetUInt8(value);
+               break;
+           case 2:
+               val = (long) DatumGetInt16(value);
+               break;
+           case 4:
+               val = (long) DatumGetInt32(value);
+               break;
+           default:
+               elog(ERROR, "intltsel: unsupported type %u", typid);
+               *result = DEFAULT_INEQ_SEL;
+               return result;
+       }
 
-           nvals = getattdisbursion(relid, (int) attno);
-           if (nvals == 0)
-               *result = 1.0 / 3.0;
-           else
-           {
-               *result = 3.0 * (float64data) nvals;
-               if (*result > 1.0)
-                   *result = 1;
-           }
+       /* Now get info about the attribute */
+       getattproperties(relid, attno,
+                        &typid, &typlen, &typbyval, &typmod);
+
+       if (! getattstatistics(relid, attno, typid, typmod,
+                              NULL, NULL, NULL,
+                              &loval, &hival))
+       {
+           *result = DEFAULT_INEQ_SEL;
+           return result;
+       }
+       /*
+        * Convert loval/hival to common "long int" representation.
+        */
+       switch (typlen)
+       {
+           case 1:
+               low = (long) DatumGetUInt8(loval);
+               high = (long) DatumGetUInt8(hival);
+               break;
+           case 2:
+               low = (long) DatumGetInt16(loval);
+               high = (long) DatumGetInt16(hival);
+               break;
+           case 4:
+               low = (long) DatumGetInt32(loval);
+               high = (long) DatumGetInt32(hival);
+               break;
+           default:
+               elog(ERROR, "intltsel: unsupported type %u", typid);
+               *result = DEFAULT_INEQ_SEL;
+               return result;
+       }
+       if (val < low || val > high)
+       {
+           /* If given value is outside the statistical range,
+            * assume we have out-of-date stats and return a default guess.
+            * We could return a small or large value if we trusted the stats
+            * more.   XXX change this eventually.
+            */
+           *result = DEFAULT_INEQ_SEL;
        }
        else
        {
-           bottom = high - low;
-           if (bottom == 0)
-               ++bottom;
+           denominator = high - low;
+           if (denominator <= 0)
+               denominator = 1;
            if (flag & SEL_RIGHT)
-               top = val - low;
+               numerator = val - low;
            else
-               top = high - val;
-           if (top > bottom)
+               numerator = high - val;
+           if (numerator <= 0) /* never return a zero estimate! */
+               numerator = 1;
+           if (numerator >= denominator)
                *result = 1.0;
            else
-           {
-               if (top == 0)
-                   ++top;
-               *result = ((1.0 * top) / bottom);
-           }
+               *result = (double) numerator / (double) denominator;
+       }
+       if (! typbyval)
+       {
+           pfree(DatumGetPointer(hival));
+           pfree(DatumGetPointer(loval));
        }
    }
    return result;
 }
 
 /*
- *     intgtsel        - Selectivity of ">" for integers.
+ *     intgtsel        - Selectivity of ">" (also ">=") for integers.
  *                       Should work for both longs and shorts.
  */
 float64
 intgtsel(Oid opid,
         Oid relid,
         AttrNumber attno,
-        int32 value,
+        Datum value,
         int32 flag)
 {
    float64     result;
-   int         notflag;
 
-   if (flag & 0)
-       notflag = flag & ~SEL_RIGHT;
-   else
-       notflag = flag | SEL_RIGHT;
-   result = intltsel(opid, relid, attno, value, (int32) notflag);
+   /* Compute selectivity of "<", then invert --- but only if we
+    * were able to produce a non-default estimate.
+    */
+   result = intltsel(opid, relid, attno, value, flag);
+   if (*result != DEFAULT_INEQ_SEL)
+       *result = 1.0 - *result;
    return result;
 }
 
@@ -181,7 +367,7 @@ eqjoinsel(Oid opid,
          AttrNumber attno2)
 {
    float64     result;
-   float32data num1,
+   float64data num1,
                num2,
                max;
 
@@ -191,13 +377,13 @@ eqjoinsel(Oid opid,
        *result = 0.1;
    else
    {
-       num1 = getattdisbursion(relid1, (int) attno1);
-       num2 = getattdisbursion(relid2, (int) attno2);
+       num1 = getattdisbursion(relid1, attno1);
+       num2 = getattdisbursion(relid2, attno2);
        max = (num1 > num2) ? num1 : num2;
-       if (max == 0)
+       if (max <= 0)
            *result = 1.0;
        else
-           *result = (float64data) max;
+           *result = max;
    }
    return result;
 }
@@ -220,7 +406,7 @@ neqjoinsel(Oid opid,
 }
 
 /*
- *     intltjoinsel    - Join selectivity of "<"
+ *     intltjoinsel    - Join selectivity of "<" and "<="
  */
 float64
 intltjoinsel(Oid opid,
@@ -232,12 +418,12 @@ intltjoinsel(Oid opid,
    float64     result;
 
    result = (float64) palloc(sizeof(float64data));
-   *result = 1.0 / 3.0;
+   *result = DEFAULT_INEQ_SEL;
    return result;
 }
 
 /*
- *     intgtjoinsel    - Join selectivity of ">"
+ *     intgtjoinsel    - Join selectivity of ">" and ">="
  */
 float64
 intgtjoinsel(Oid opid,
@@ -249,129 +435,230 @@ intgtjoinsel(Oid opid,
    float64     result;
 
    result = (float64) palloc(sizeof(float64data));
-   *result = 1.0 / 3.0;
+   *result = DEFAULT_INEQ_SEL;
    return result;
 }
 
 /*
- *     getattdisbursion        - Retrieves the number of values within an attribute.
- *
- *     Note:
- *             getattdisbursion and gethilokey both currently use keyed
- *             relation scans and amgetattr.  Alternatively,
- *             the relation scan could be non-keyed and the tuple
- *             returned could be cast (struct X *) tuple + tuple->t_hoff.
- *             The first method is good for testing the implementation,
- *             but the second may ultimately be faster?!?  In any case,
- *             using the cast instead of amgetattr would be
- *             more efficient.  However, the cast will not work
- *             for gethilokey which accesses stahikey in struct statistic.
+ * getattproperties
+ *   Retrieve pg_attribute properties for an attribute,
+ *   including type OID, type len, type byval flag, typmod.
  */
-static float32data
-getattdisbursion(Oid relid, AttrNumber attnum)
+static void
+getattproperties(Oid relid, AttrNumber attnum,
+                Oid *typid, int *typlen, bool *typbyval, int32 *typmod)
 {
    HeapTuple   atp;
-   float32data nvals;
-   int32       ntuples;
+   Form_pg_attribute att_tup;
 
    atp = SearchSysCacheTuple(ATTNUM,
                              ObjectIdGetDatum(relid),
                              Int16GetDatum(attnum),
                              0, 0);
-   if (!HeapTupleIsValid(atp))
-   {
-       elog(ERROR, "getattdisbursion: no attribute tuple %u %d",
-            relid, attnum);
-       return 0;
-   }
-   nvals = ((Form_pg_attribute) GETSTRUCT(atp))->attdisbursion;
-   if (nvals > 0)
-       return nvals;
-
-   atp = SearchSysCacheTuple(RELOID,
-                             ObjectIdGetDatum(relid),
-                             0, 0, 0);
-
-   /*
-    * XXX -- use number of tuples as number of distinctive values just
-    * for now, in case number of distinctive values is not cached
-    */
-   if (!HeapTupleIsValid(atp))
-   {
-       elog(ERROR, "getattdisbursion: no relation tuple %u", relid);
-       return 0;
-   }
-   ntuples = ((Form_pg_class) GETSTRUCT(atp))->reltuples;
-   /* Look above how nvals is used.    - vadim 04/09/97 */
-   if (ntuples > 0)
-       nvals = 1.0 / ntuples;
-
-   return nvals;
+   if (! HeapTupleIsValid(atp))
+       elog(ERROR, "getattproperties: no attribute tuple %u %d",
+            relid, (int) attnum);
+   att_tup = (Form_pg_attribute) GETSTRUCT(atp);
+
+   *typid = att_tup->atttypid;
+   *typlen = att_tup->attlen;
+   *typbyval = att_tup->attbyval;
+   *typmod = att_tup->atttypmod;
 }
 
 /*
- *     gethilokey      - Returns a pointer to strings containing
- *                       the high and low keys within an attribute.
+ * getattstatistics
+ *   Retrieve the pg_statistic data for an attribute.
+ *   Returns 'false' if no stats are available.
+ *
+ * Inputs:
+ * 'relid' and 'attnum' are the relation and attribute number.
+ * 'typid' and 'typmod' are the type and typmod of the column,
+ * which the caller must already have looked up.
  *
- *     Currently returns "0", and "0" in high and low if the statistic
- *     catalog does not contain the proper tuple.  Eventually, the
- *     statistic demon should have the tuple maintained, and it should
- *     elog() if the tuple is missing.
+ * Outputs:
+ * The available stats are nullfrac, commonfrac, commonval, loval, hival.
+ * The caller need not retrieve all five --- pass NULL pointers for the
+ * unwanted values.
  *
- *     XXX Question: is this worth sticking in the catalog caches,
- *         or will this get invalidated too often?
+ * commonval, loval, hival are returned as Datums holding the internal
+ * representation of the values.  (Note that these should be pfree'd
+ * after use if the data type is not by-value.)
+ *
+ * XXX currently, this does a linear search of pg_statistic because there
+ * is no index nor syscache for pg_statistic.  FIX THIS!
  */
-static void
-gethilokey(Oid relid,
-          AttrNumber attnum,
-          Oid opid,
-          char **high,
-          char **low)
+static bool
+getattstatistics(Oid relid, AttrNumber attnum, Oid typid, int32 typmod,
+                double *nullfrac,
+                double *commonfrac,
+                Datum *commonval,
+                Datum *loval,
+                Datum *hival)
 {
    Relation    rel;
    HeapScanDesc scan;
-   static ScanKeyData key[3] = {
+   static ScanKeyData key[2] = {
        {0, Anum_pg_statistic_starelid, F_OIDEQ, {0, 0, F_OIDEQ}},
-       {0, Anum_pg_statistic_staattnum, F_INT2EQ, {0, 0, F_INT2EQ}},
-       {0, Anum_pg_statistic_staop, F_OIDEQ, {0, 0, F_OIDEQ}}
+       {0, Anum_pg_statistic_staattnum, F_INT2EQ, {0, 0, F_INT2EQ}}
    };
    bool        isnull;
    HeapTuple   tuple;
+   HeapTuple   typeTuple;
+   FmgrInfo    inputproc;
 
    rel = heap_openr(StatisticRelationName);
 
    key[0].sk_argument = ObjectIdGetDatum(relid);
    key[1].sk_argument = Int16GetDatum((int16) attnum);
-   key[2].sk_argument = ObjectIdGetDatum(opid);
-   scan = heap_beginscan(rel, 0, SnapshotNow, 3, key);
+
+   scan = heap_beginscan(rel, 0, SnapshotNow, 2, key);
    tuple = heap_getnext(scan, 0);
    if (!HeapTupleIsValid(tuple))
    {
-       *high = "n";
-       *low = "n";
+       /* no such stats entry */
+       heap_endscan(scan);
+       heap_close(rel);
+       return false;
+   }
 
-       /*
-        * XXX          elog(ERROR, "gethilokey: statistic tuple not
-        * found");
-        */
-       return;
+   /* We assume that there will only be one entry in pg_statistic
+    * for the given rel/att.  Someday, VACUUM might store more than one...
+    */
+   if (nullfrac)
+       *nullfrac = ((Form_pg_statistic) GETSTRUCT(tuple))->stanullfrac;
+   if (commonfrac)
+       *commonfrac = ((Form_pg_statistic) GETSTRUCT(tuple))->stacommonfrac;
+
+   /* Get the type input proc for the column datatype */
+   typeTuple = SearchSysCacheTuple(TYPOID,
+                                   ObjectIdGetDatum(typid),
+                                   0, 0, 0);
+   if (! HeapTupleIsValid(typeTuple))
+       elog(ERROR, "getattstatistics: Cache lookup failed for type %u",
+            typid);
+   fmgr_info(((Form_pg_type) GETSTRUCT(typeTuple))->typinput, &inputproc);
+
+   /* Values are variable-length fields, so cannot access as struct fields.
+    * Must do it the hard way with heap_getattr.
+    */
+   if (commonval)
+   {
+       text *val = (text *) heap_getattr(tuple,
+                                         Anum_pg_statistic_stacommonval,
+                                         RelationGetDescr(rel),
+                                         &isnull);
+       if (isnull)
+       {
+           elog(DEBUG, "getattstatistics: stacommonval is null");
+           *commonval = PointerGetDatum(NULL);
+       }
+       else
+       {
+           char *strval = textout(val);
+           *commonval = (Datum)
+               (*fmgr_faddr(&inputproc)) (strval, typid, typmod);
+           pfree(strval);
+       }
    }
-   *high = textout((struct varlena *)
-                   heap_getattr(tuple,
-                                Anum_pg_statistic_stahikey,
-                                RelationGetDescr(rel),
-                                &isnull));
-   if (isnull)
-       elog(DEBUG, "gethilokey: high key is null");
-   *low = textout((struct varlena *)
-                  heap_getattr(tuple,
-                               Anum_pg_statistic_stalokey,
-                               RelationGetDescr(rel),
-                               &isnull));
-   if (isnull)
-       elog(DEBUG, "gethilokey: low key is null");
+
+   if (loval)
+   {
+       text *val = (text *) heap_getattr(tuple,
+                                         Anum_pg_statistic_staloval,
+                                         RelationGetDescr(rel),
+                                         &isnull);
+       if (isnull)
+       {
+           elog(DEBUG, "getattstatistics: staloval is null");
+           *loval = PointerGetDatum(NULL);
+       }
+       else
+       {
+           char *strval = textout(val);
+           *loval = (Datum)
+               (*fmgr_faddr(&inputproc)) (strval, typid, typmod);
+           pfree(strval);
+       }
+   }
+
+   if (hival)
+   {
+       text *val = (text *) heap_getattr(tuple,
+                                         Anum_pg_statistic_stahival,
+                                         RelationGetDescr(rel),
+                                         &isnull);
+       if (isnull)
+       {
+           elog(DEBUG, "getattstatistics: stahival is null");
+           *hival = PointerGetDatum(NULL);
+       }
+       else
+       {
+           char *strval = textout(val);
+           *hival = (Datum)
+               (*fmgr_faddr(&inputproc)) (strval, typid, typmod);
+           pfree(strval);
+       }
+   }
+
    heap_endscan(scan);
    heap_close(rel);
+   return true;
+}
+
+/*
+ * getattdisbursion
+ *   Retrieve the disbursion statistic for an attribute,
+ *   or produce an estimate if no info is available.
+ */
+static double
+getattdisbursion(Oid relid, AttrNumber attnum)
+{
+   HeapTuple   atp;
+   double      disbursion;
+   int32       ntuples;
+
+   atp = SearchSysCacheTuple(ATTNUM,
+                             ObjectIdGetDatum(relid),
+                             Int16GetDatum(attnum),
+                             0, 0);
+   if (!HeapTupleIsValid(atp))
+   {
+       /* this should not happen */
+       elog(ERROR, "getattdisbursion: no attribute tuple %u %d",
+            relid, attnum);
+       return 0.1;
+   }
+
+   disbursion = ((Form_pg_attribute) GETSTRUCT(atp))->attdisbursion;
+   if (disbursion > 0.0)
+       return disbursion;
+
+   /* VACUUM ANALYZE has not stored a disbursion statistic for us.
+    * Produce an estimate = 1/numtuples.  This may produce
+    * unreasonably small estimates for large tables, so limit
+    * the estimate to no less than 0.01.
+    */
+   atp = SearchSysCacheTuple(RELOID,
+                             ObjectIdGetDatum(relid),
+                             0, 0, 0);
+   if (!HeapTupleIsValid(atp))
+   {
+       /* this should not happen */
+       elog(ERROR, "getattdisbursion: no relation tuple %u", relid);
+       return 0.1;
+   }
+
+   ntuples = ((Form_pg_class) GETSTRUCT(atp))->reltuples;
+
+   if (ntuples > 0)
+       disbursion = 1.0 / (double) ntuples;
+
+   if (disbursion < 0.01)
+       disbursion = 0.01;
+
+   return disbursion;
 }
 
 float64
index 19b87b68b1b09c37d5fa8fe876308bdb239fd3ae..1c719443282f433c295918d23d9c91695e745f2b 100644 (file)
@@ -7,7 +7,7 @@
  *
  * Copyright (c) 1994, Regents of the University of California
  *
- * $Id: pg_statistic.h,v 1.6 1999/02/13 23:21:15 momjian Exp $
+ * $Id: pg_statistic.h,v 1.7 1999/08/01 04:54:21 tgl Exp $
  *
  * NOTES
  *   the genbki.sh script reads this file and generates .bki
  */
 CATALOG(pg_statistic)
 {
-   Oid         starelid;
-   int2        staattnum;
-   Oid         staop;
-   text        stalokey;       /* VARIABLE LENGTH FIELD */
-   text        stahikey;       /* VARIABLE LENGTH FIELD */
+   /* These fields form the unique key for the entry: */
+   Oid         starelid;       /* relation containing attribute */
+   int2        staattnum;      /* attribute (column) stats are for */
+   Oid         staop;          /* '<' comparison op used for lo/hi vals */
+   /* Note: the current VACUUM code will never produce more than one entry
+    * per column, but in theory there could be multiple entries if a datatype
+    * has more than one useful ordering operator.  Also, the current code
+    * will not write an entry unless it found at least one non-NULL value
+    * in the column; so the remaining fields will never be NULL.
+    */
+
+   /* These fields contain the stats about the column indicated by the key */
+   float4      stanullfrac;    /* the fraction of the entries that are NULL */
+   float4      stacommonfrac;  /* the fraction that are the most common val */
+
+   /* THE REST OF THESE ARE VARIABLE LENGTH FIELDS.
+    * They cannot be accessed as C struct entries; you have to use the
+    * full field access machinery (heap_getattr) for them.
+    *
+    * All three of these are text representations of data values of the
+    * column's data type.  To re-create the actual Datum, pass
+    * textout(givenvalue) to the data type's input function.
+    */
+   text        stacommonval;   /* most common non-null value in column */
+   text        staloval;       /* smallest non-null value in column */
+   text        stahival;       /* largest non-null value in column */
 } FormData_pg_statistic;
 
 /* ----------------
@@ -50,11 +71,14 @@ typedef FormData_pg_statistic *Form_pg_statistic;
  *     compiler constants for pg_statistic
  * ----------------
  */
-#define Natts_pg_statistic             5
+#define Natts_pg_statistic             8
 #define Anum_pg_statistic_starelid     1
 #define Anum_pg_statistic_staattnum        2
 #define Anum_pg_statistic_staop            3
-#define Anum_pg_statistic_stalokey     4
-#define Anum_pg_statistic_stahikey     5
+#define Anum_pg_statistic_stanullfrac  4
+#define Anum_pg_statistic_stacommonfrac    5
+#define Anum_pg_statistic_stacommonval 6
+#define Anum_pg_statistic_staloval     7
+#define Anum_pg_statistic_stahival     8
 
 #endif  /* PG_STATISTIC_H */
index 59a72bbb15b949228ed7b059964f61f419a44027..59a7fe4a5063e4e82e7a0ee8c854b4f4238b5cc1 100644 (file)
@@ -6,7 +6,7 @@
  *
  * Copyright (c) 1994, Regents of the University of California
  *
- * $Id: vacuum.h,v 1.22 1999/07/15 15:21:03 momjian Exp $
+ * $Id: vacuum.h,v 1.23 1999/08/01 04:54:25 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -67,22 +67,23 @@ typedef struct
                guess2,
                max,
                min;
-   int16       best_len,
+   int         best_len,
                guess1_len,
                guess2_len,
                max_len,
                min_len;
-   int32       best_cnt,
+   long        best_cnt,
                guess1_cnt,
                guess1_hits,
                guess2_hits,
                null_cnt,
-               nonnull_cnt;
-   int32       max_cnt,
+               nonnull_cnt,
+               max_cnt,
                min_cnt;
    FmgrInfo    f_cmpeq,
                f_cmplt,
                f_cmpgt;
+   Oid         op_cmplt;
    regproc     outfunc;
    bool        initialized;
 } VacAttrStats;
index e6a0b4157d5ea5ce013ed614513925f0975bbd8b..dfe1897cbe22e7d364972c0c4cbcafafd86ec260 100644 (file)
@@ -6,7 +6,7 @@
  *
  * Copyright (c) 1994, Regents of the University of California
  *
- * $Id: builtins.h,v 1.84 1999/07/16 17:07:39 momjian Exp $
+ * $Id: builtins.h,v 1.85 1999/08/01 04:54:20 tgl Exp $
  *
  * NOTES
  *   This should normally only be included by fmgr.h.
@@ -372,10 +372,10 @@ extern Oid    regproctooid(RegProcedure rp);
 #define RegprocToOid(rp) regproctooid(rp)
 
 /* selfuncs.c */
-extern float64 eqsel(Oid opid, Oid relid, AttrNumber attno, char *value, int32 flag);
-extern float64 neqsel(Oid opid, Oid relid, AttrNumber attno, char *value, int32 flag);
-extern float64 intltsel(Oid opid, Oid relid, AttrNumber attno, int32 value, int32 flag);
-extern float64 intgtsel(Oid opid, Oid relid, AttrNumber attno, int32 value, int32 flag);
+extern float64 eqsel(Oid opid, Oid relid, AttrNumber attno, Datum value, int32 flag);
+extern float64 neqsel(Oid opid, Oid relid, AttrNumber attno, Datum value, int32 flag);
+extern float64 intltsel(Oid opid, Oid relid, AttrNumber attno, Datum value, int32 flag);
+extern float64 intgtsel(Oid opid, Oid relid, AttrNumber attno, Datum value, int32 flag);
 extern float64 eqjoinsel(Oid opid, Oid relid1, AttrNumber attno1, Oid relid2, AttrNumber attno2);
 extern float64 neqjoinsel(Oid opid, Oid relid1, AttrNumber attno1, Oid relid2, AttrNumber attno2);
 extern float64 intltjoinsel(Oid opid, Oid relid1, AttrNumber attno1, Oid relid2, AttrNumber attno2);