Add selectivity estimation functions for intarray operators.
authorHeikki Linnakangas
Tue, 21 Jul 2015 17:54:18 +0000 (20:54 +0300)
committerHeikki Linnakangas
Tue, 21 Jul 2015 17:59:24 +0000 (20:59 +0300)
Uriy Zhuravlev and Alexander Korotkov, reviewed by Jeff Janes, some cleanup
by me.

contrib/intarray/Makefile
contrib/intarray/_int_selfuncs.c [new file with mode: 0644]
contrib/intarray/expected/_int.out
contrib/intarray/intarray--1.0--1.1.sql [new file with mode: 0644]
contrib/intarray/intarray--1.1.sql [moved from contrib/intarray/intarray--1.0.sql with 88% similarity]
contrib/intarray/intarray.control
contrib/intarray/sql/_int.sql

index 920c5b1ba032217db405b5dba5a7128db43cd689..5ea7f2aedf8e57b7dcaa46aa7d7f90fd997ac5ac 100644 (file)
@@ -2,10 +2,10 @@
 
 MODULE_big = _int
 OBJS = _int_bool.o _int_gist.o _int_op.o _int_tool.o \
-   _intbig_gist.o _int_gin.o $(WIN32RES)
+   _intbig_gist.o _int_gin.o _int_selfuncs.o $(WIN32RES)
 
 EXTENSION = intarray
-DATA = intarray--1.0.sql intarray--unpackaged--1.0.sql
+DATA = intarray--1.1.sql intarray--1.0--1.1.sql intarray--unpackaged--1.0.sql
 PGFILEDESC = "intarray - functions and operators for arrays of integers"
 
 REGRESS = _int
diff --git a/contrib/intarray/_int_selfuncs.c b/contrib/intarray/_int_selfuncs.c
new file mode 100644 (file)
index 0000000..2af1c9b
--- /dev/null
@@ -0,0 +1,341 @@
+/*-------------------------------------------------------------------------
+ *
+ * _int_selfuncs.c
+ *   Functions for selectivity estimation of intarray operators
+ *
+ * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *   contrib/intarray/_int_selfuncs.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+#include "_int.h"
+
+#include "access/htup_details.h"
+#include "catalog/pg_operator.h"
+#include "catalog/pg_statistic.h"
+#include "catalog/pg_type.h"
+#include "utils/selfuncs.h"
+#include "utils/syscache.h"
+#include "utils/lsyscache.h"
+#include "miscadmin.h"
+
+PG_FUNCTION_INFO_V1(_int_overlap_sel);
+PG_FUNCTION_INFO_V1(_int_contains_sel);
+PG_FUNCTION_INFO_V1(_int_contained_sel);
+PG_FUNCTION_INFO_V1(_int_overlap_joinsel);
+PG_FUNCTION_INFO_V1(_int_contains_joinsel);
+PG_FUNCTION_INFO_V1(_int_contained_joinsel);
+PG_FUNCTION_INFO_V1(_int_matchsel);
+
+Datum      _int_overlap_sel(PG_FUNCTION_ARGS);
+Datum      _int_contains_sel(PG_FUNCTION_ARGS);
+Datum      _int_contained_sel(PG_FUNCTION_ARGS);
+Datum      _int_overlap_joinsel(PG_FUNCTION_ARGS);
+Datum      _int_contains_joinsel(PG_FUNCTION_ARGS);
+Datum      _int_contained_joinsel(PG_FUNCTION_ARGS);
+Datum      _int_matchsel(PG_FUNCTION_ARGS);
+
+
+static Selectivity int_query_opr_selec(ITEM *item, Datum *values, float4 *freqs,
+                   int nmncelems, float4 minfreq);
+static int compare_val_int4(const void *a, const void *b);
+
+/*
+ * Wrappers around the default array selectivity estimation functions.
+ *
+ * The default array selectivity operators for the @>, && and @< operators
+ * work fine for integer arrays. However, if we tried to just use arraycontsel
+ * and arracontjoinsel directly as the cost estimator functions for our
+ * operators, they would not work as intended, because they look at the
+ * operator's OID. Our operators behave exactly like the built-in anyarray
+ * versions, but we must tell the cost estimator functions which built-in
+ * operators they correspond to. These wrappers just replace the operator
+ * OID with the corresponding built-in operator's OID, and call the built-in
+ * function.
+ */
+
+Datum
+_int_overlap_sel(PG_FUNCTION_ARGS)
+{
+   PG_RETURN_DATUM(DirectFunctionCall4(arraycontsel,
+                                       PG_GETARG_DATUM(0),
+                                     ObjectIdGetDatum(OID_ARRAY_OVERLAP_OP),
+                                       PG_GETARG_DATUM(2),
+                                       PG_GETARG_DATUM(3)));
+}
+
+Datum
+_int_contains_sel(PG_FUNCTION_ARGS)
+{
+   PG_RETURN_DATUM(DirectFunctionCall4(arraycontsel,
+                                       PG_GETARG_DATUM(0),
+                                    ObjectIdGetDatum(OID_ARRAY_CONTAINS_OP),
+                                       PG_GETARG_DATUM(2),
+                                       PG_GETARG_DATUM(3)));
+}
+
+Datum
+_int_contained_sel(PG_FUNCTION_ARGS)
+{
+   PG_RETURN_DATUM(DirectFunctionCall4(arraycontsel,
+                                       PG_GETARG_DATUM(0),
+                                   ObjectIdGetDatum(OID_ARRAY_CONTAINED_OP),
+                                       PG_GETARG_DATUM(2),
+                                       PG_GETARG_DATUM(3)));
+}
+
+Datum
+_int_overlap_joinsel(PG_FUNCTION_ARGS)
+{
+   PG_RETURN_DATUM(DirectFunctionCall5(arraycontjoinsel,
+                                       PG_GETARG_DATUM(0),
+                                     ObjectIdGetDatum(OID_ARRAY_OVERLAP_OP),
+                                       PG_GETARG_DATUM(2),
+                                       PG_GETARG_DATUM(3),
+                                       PG_GETARG_DATUM(4)));
+}
+
+Datum
+_int_contains_joinsel(PG_FUNCTION_ARGS)
+{
+   PG_RETURN_DATUM(DirectFunctionCall5(arraycontjoinsel,
+                                       PG_GETARG_DATUM(0),
+                                    ObjectIdGetDatum(OID_ARRAY_CONTAINS_OP),
+                                       PG_GETARG_DATUM(2),
+                                       PG_GETARG_DATUM(3),
+                                       PG_GETARG_DATUM(4)));
+}
+
+Datum
+_int_contained_joinsel(PG_FUNCTION_ARGS)
+{
+   PG_RETURN_DATUM(DirectFunctionCall5(arraycontjoinsel,
+                                       PG_GETARG_DATUM(0),
+                                   ObjectIdGetDatum(OID_ARRAY_CONTAINED_OP),
+                                       PG_GETARG_DATUM(2),
+                                       PG_GETARG_DATUM(3),
+                                       PG_GETARG_DATUM(4)));
+}
+
+
+/*
+ * _int_matchsel -- restriction selectivity function for intarray @@ query_int
+ */
+Datum
+_int_matchsel(PG_FUNCTION_ARGS)
+{
+   PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
+
+   List       *args = (List *) PG_GETARG_POINTER(2);
+   int         varRelid = PG_GETARG_INT32(3);
+   VariableStatData vardata;
+   Node       *other;
+   bool        varonleft;
+   Selectivity selec;
+   QUERYTYPE  *query;
+   Datum      *mcelems = NULL;
+   float4     *mcefreqs = NULL;
+   int         nmcelems = 0;
+   float4      minfreq = 0.0;
+   float4      nullfrac = 0.0;
+   Form_pg_statistic stats;
+   Datum      *values = NULL;
+   int         nvalues = 0;
+   float4     *numbers = NULL;
+   int         nnumbers = 0;
+
+   /*
+    * If expression is not "variable @@ something" or "something @@ variable"
+    * then punt and return a default estimate.
+    */
+   if (!get_restriction_variable(root, args, varRelid,
+                                 &vardata, &other, &varonleft))
+       PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);
+
+   /*
+    * Variable should be int[]. We don't support cases where variable is
+    * query_int.
+    */
+   if (vardata.vartype != INT4ARRAYOID)
+       PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);
+
+   /*
+    * Can't do anything useful if the something is not a constant, either.
+    */
+   if (!IsA(other, Const))
+   {
+       ReleaseVariableStats(vardata);
+       PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);
+   }
+
+   /*
+    * The "@@" operator is strict, so we can cope with NULL right away.
+    */
+   if (((Const *) other)->constisnull)
+   {
+       ReleaseVariableStats(vardata);
+       PG_RETURN_FLOAT8(0.0);
+   }
+
+   /* The caller made sure the const is a query, so get it now */
+   query = DatumGetQueryTypeP(((Const *) other)->constvalue);
+
+   /* Empty query matches nothing */
+   if (query->size == 0)
+   {
+       ReleaseVariableStats(vardata);
+       return (Selectivity) 0.0;
+   }
+
+   /*
+    * Get the statistics for the intarray column.
+    *
+    * We're interested in the Most-Common-Elements list, and the NULL
+    * fraction.
+    */
+   if (HeapTupleIsValid(vardata.statsTuple))
+   {
+       stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
+       nullfrac = stats->stanullfrac;
+
+       /*
+        * For an int4 array, the default array type analyze function will
+        * collect a Most Common Elements list, which is an array of int4s.
+        */
+       if (get_attstatsslot(vardata.statsTuple,
+                            INT4OID, -1,
+                            STATISTIC_KIND_MCELEM, InvalidOid,
+                            NULL,
+                            &values, &nvalues,
+                            &numbers, &nnumbers))
+       {
+           /*
+            * There should be three more Numbers than Values, because the
+            * last three (for intarray) cells are taken for minimal, maximal
+            * and nulls frequency. Punt if not.
+            */
+           if (nnumbers == nvalues + 3)
+           {
+               /* Grab the lowest frequency. */
+               minfreq = numbers[nnumbers - (nnumbers - nvalues)];
+
+               mcelems = values;
+               mcefreqs = numbers;
+               nmcelems = nvalues;
+           }
+       }
+   }
+
+   /* Process the logical expression in the query, using the stats */
+   selec = int_query_opr_selec(GETQUERY(query) + query->size - 1,
+                               mcelems, mcefreqs, nmcelems, minfreq);
+
+   /* MCE stats count only non-null rows, so adjust for null rows. */
+   selec *= (1.0 - nullfrac);
+
+   free_attstatsslot(INT4OID, values, nvalues, numbers, nnumbers);
+   ReleaseVariableStats(vardata);
+
+   CLAMP_PROBABILITY(selec);
+
+   PG_RETURN_FLOAT8((float8) selec);
+}
+
+/*
+ * Estimate selectivity of single intquery operator
+ */
+static Selectivity
+int_query_opr_selec(ITEM *item, Datum *mcelems, float4 *mcefreqs,
+                   int nmcelems, float4 minfreq)
+{
+   Selectivity selec;
+
+   /* since this function recurses, it could be driven to stack overflow */
+   check_stack_depth();
+
+   if (item->type == VAL)
+   {
+       Datum      *searchres;
+
+       if (mcelems == NULL)
+           return (Selectivity) DEFAULT_EQ_SEL;
+
+       searchres = (Datum *) bsearch(&item->val, mcelems, nmcelems,
+                                     sizeof(Datum), compare_val_int4);
+       if (searchres)
+       {
+           /*
+            * The element is in MCELEM.  Return precise selectivity (or at
+            * least as precise as ANALYZE could find out).
+            */
+           selec = mcefreqs[searchres - mcelems];
+       }
+       else
+       {
+           /*
+            * The element is not in MCELEM.  Punt, but assume that the
+            * selectivity cannot be more than minfreq / 2.
+            */
+           selec = Min(DEFAULT_EQ_SEL, minfreq / 2);
+       }
+   }
+   else if (item->type == OPR)
+   {
+       /* Current query node is an operator */
+       Selectivity s1,
+                   s2;
+
+       s1 = int_query_opr_selec(item - 1, mcelems, mcefreqs, nmcelems,
+                                minfreq);
+       switch (item->val)
+       {
+           case (int32) '!':
+               selec = 1.0 - s1;
+               break;
+
+           case (int32) '&':
+               s2 = int_query_opr_selec(item + item->left, mcelems, mcefreqs,
+                                        nmcelems, minfreq);
+               selec = s1 * s2;
+               break;
+
+           case (int32) '|':
+               s2 = int_query_opr_selec(item + item->left, mcelems, mcefreqs,
+                                        nmcelems, minfreq);
+               selec = s1 + s2 - s1 * s2;
+               break;
+
+           default:
+               elog(ERROR, "unrecognized operator: %d", item->val);
+               selec = 0;      /* keep compiler quiet */
+               break;
+       }
+   }
+   else
+   {
+       elog(ERROR, "unrecognized int query item type: %u", item->type);
+       selec = 0;              /* keep compiler quiet */
+   }
+
+   /* Clamp intermediate results to stay sane despite roundoff error */
+   CLAMP_PROBABILITY(selec);
+
+   return selec;
+}
+
+/*
+ * Comparison function for binary search in mcelem array.
+ */
+static int
+compare_val_int4(const void *a, const void *b)
+{
+   int32       key = *(int32 *) a;
+   const Datum *t = (const Datum *) b;
+
+   return key - DatumGetInt32(*t);
+}
index 4080b9633fe98a91861684e0d82f1297c21b91af..962e5c6a4b1ddae2cf7eaa4bf46f41877c45dcde 100644 (file)
@@ -368,6 +368,7 @@ SELECT '1&(2&(4&(5|!6)))'::query_int;
 
 CREATE TABLE test__int( a int[] );
 \copy test__int from 'data/test__int.data'
+ANALYZE test__int;
 SELECT count(*) from test__int WHERE a && '{23,50}';
  count 
 -------
diff --git a/contrib/intarray/intarray--1.0--1.1.sql b/contrib/intarray/intarray--1.0--1.1.sql
new file mode 100644 (file)
index 0000000..fecebdd
--- /dev/null
@@ -0,0 +1,49 @@
+/* contrib/intarray/intarray--1.0--1.1.sql */
+
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "ALTER EXTENSION intarray UPDATE TO '1.1'" to load this file. \quit
+
+CREATE FUNCTION _int_matchsel(internal, oid, internal, integer)
+RETURNS float8
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT STABLE;
+
+ALTER OPERATOR @@ (_int4, query_int) SET (RESTRICT = _int_matchsel);
+ALTER OPERATOR ~~ (query_int, _int4) SET (RESTRICT = _int_matchsel);
+
+CREATE FUNCTION _int_overlap_sel(internal, oid, internal, integer)
+RETURNS float8
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT STABLE;
+
+CREATE FUNCTION _int_contains_sel(internal, oid, internal, integer)
+RETURNS float8
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT STABLE;
+
+CREATE FUNCTION _int_contained_sel(internal, oid, internal, integer)
+RETURNS float8
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT STABLE;
+
+CREATE FUNCTION _int_overlap_joinsel(internal, oid, internal, smallint, internal)
+RETURNS float8
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT STABLE;
+
+CREATE FUNCTION _int_contains_joinsel(internal, oid, internal, smallint, internal)
+RETURNS float8
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT STABLE;
+
+CREATE FUNCTION _int_contained_joinsel(internal, oid, internal, smallint, internal)
+RETURNS float8
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT STABLE;
+
+ALTER OPERATOR && (_int4, _int4) SET (RESTRICT = _int_overlap_sel, JOIN = _int_overlap_joinsel);
+ALTER OPERATOR @> (_int4, _int4) SET (RESTRICT = _int_contains_sel, JOIN = _int_contains_joinsel);
+ALTER OPERATOR <@ (_int4, _int4) SET (RESTRICT = _int_contained_sel, JOIN = _int_contained_joinsel);
+
+ALTER OPERATOR @ (_int4, _int4) SET (RESTRICT = _int_contains_sel, JOIN = _int_contains_joinsel);
+ALTER OPERATOR ~ (_int4, _int4) SET (RESTRICT = _int_contained_sel, JOIN = _int_contained_joinsel);
similarity index 88%
rename from contrib/intarray/intarray--1.0.sql
rename to contrib/intarray/intarray--1.1.sql
index 0b89e0f55e52ac74a41239f70bc4be9ca1ad6a09..817625e54a0fb9f51ef62881f63a75e04f97f799 100644 (file)
@@ -1,4 +1,4 @@
-/* contrib/intarray/intarray--1.0.sql */
+/* contrib/intarray/intarray--1.1.sql */
 
 -- complain if script is sourced in psql, rather than via CREATE EXTENSION
 \echo Use "CREATE EXTENSION intarray" to load this file. \quit
@@ -45,12 +45,17 @@ LANGUAGE C STRICT IMMUTABLE;
 
 COMMENT ON FUNCTION rboolop(query_int, _int4) IS 'boolean operation with array';
 
+CREATE FUNCTION _int_matchsel(internal, oid, internal, integer)
+RETURNS float8
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT STABLE;
+
 CREATE OPERATOR @@ (
    LEFTARG = _int4,
    RIGHTARG = query_int,
    PROCEDURE = boolop,
    COMMUTATOR = '~~',
-   RESTRICT = contsel,
+   RESTRICT = _int_matchsel,
    JOIN = contjoinsel
 );
 
@@ -59,7 +64,7 @@ CREATE OPERATOR ~~ (
    RIGHTARG = _int4,
    PROCEDURE = rboolop,
    COMMUTATOR = '@@',
-   RESTRICT = contsel,
+   RESTRICT = _int_matchsel,
    JOIN = contjoinsel
 );
 
@@ -117,6 +122,36 @@ RETURNS _int4
 AS 'MODULE_PATHNAME'
 LANGUAGE C STRICT IMMUTABLE;
 
+CREATE FUNCTION _int_overlap_sel(internal, oid, internal, integer)
+RETURNS float8
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT STABLE;
+
+CREATE FUNCTION _int_contains_sel(internal, oid, internal, integer)
+RETURNS float8
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT STABLE;
+
+CREATE FUNCTION _int_contained_sel(internal, oid, internal, integer)
+RETURNS float8
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT STABLE;
+
+CREATE FUNCTION _int_overlap_joinsel(internal, oid, internal, smallint, internal)
+RETURNS float8
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT STABLE;
+
+CREATE FUNCTION _int_contains_joinsel(internal, oid, internal, smallint, internal)
+RETURNS float8
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT STABLE;
+
+CREATE FUNCTION _int_contained_joinsel(internal, oid, internal, smallint, internal)
+RETURNS float8
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT STABLE;
+
 --
 -- OPERATORS
 --
@@ -126,8 +161,8 @@ CREATE OPERATOR && (
    RIGHTARG = _int4,
    PROCEDURE = _int_overlap,
    COMMUTATOR = '&&',
-   RESTRICT = contsel,
-   JOIN = contjoinsel
+   RESTRICT = _int_overlap_sel,
+   JOIN = _int_overlap_joinsel
 );
 
 --CREATE OPERATOR = (
@@ -157,8 +192,8 @@ CREATE OPERATOR @> (
    RIGHTARG = _int4,
    PROCEDURE = _int_contains,
    COMMUTATOR = '<@',
-   RESTRICT = contsel,
-   JOIN = contjoinsel
+   RESTRICT = _int_contains_sel,
+   JOIN = _int_contains_joinsel
 );
 
 CREATE OPERATOR <@ (
@@ -166,8 +201,8 @@ CREATE OPERATOR <@ (
    RIGHTARG = _int4,
    PROCEDURE = _int_contained,
    COMMUTATOR = '@>',
-   RESTRICT = contsel,
-   JOIN = contjoinsel
+   RESTRICT = _int_contained_sel,
+   JOIN = _int_contained_joinsel
 );
 
 -- obsolete:
@@ -176,8 +211,8 @@ CREATE OPERATOR @ (
    RIGHTARG = _int4,
    PROCEDURE = _int_contains,
    COMMUTATOR = '~',
-   RESTRICT = contsel,
-   JOIN = contjoinsel
+   RESTRICT = _int_contains_sel,
+   JOIN = _int_contains_joinsel
 );
 
 CREATE OPERATOR ~ (
@@ -185,8 +220,8 @@ CREATE OPERATOR ~ (
    RIGHTARG = _int4,
    PROCEDURE = _int_contained,
    COMMUTATOR = '@',
-   RESTRICT = contsel,
-   JOIN = contjoinsel
+   RESTRICT = _int_contained_sel,
+   JOIN = _int_contained_joinsel
 );
 
 --------------
index 7b3d4f78f053adc3c046f77d45f1bdb78cd54498..8c23e8d5e2ed799c6bf9831f6bb5f3fe07d05875 100644 (file)
@@ -1,5 +1,5 @@
 # intarray extension
 comment = 'functions, operators, and index support for 1-D arrays of integers'
-default_version = '1.0'
+default_version = '1.1'
 module_pathname = '$libdir/_int'
 relocatable = true
index 216c5c58d615a7cd1a5fe3714c9a5c91fb255138..f6fe2de55c587b7d0f3f89fb2af764f1deab291d 100644 (file)
@@ -68,8 +68,8 @@ SELECT '1&(2&(4&(5|!6)))'::query_int;
 
 
 CREATE TABLE test__int( a int[] );
-
 \copy test__int from 'data/test__int.data'
+ANALYZE test__int;
 
 SELECT count(*) from test__int WHERE a && '{23,50}';
 SELECT count(*) from test__int WHERE a @@ '23|50';