Code review for EXPLAIN patch. Fix some typos, make it behave sanely

author Tom Lane

Tue, 30 May 2006 19:24:25 +0000 (19:24 +0000)

committer Tom Lane

Tue, 30 May 2006 19:24:25 +0000 (19:24 +0000)
author Tom Lane
Tue, 30 May 2006 19:24:25 +0000 (19:24 +0000)
committer Tom Lane
Tue, 30 May 2006 19:24:25 +0000 (19:24 +0000)
diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c

index 41b061bd526c6f4129c4b0d00e5e752193844553..d094ffbbcf6fd8d0acd3b4969fa725a99333e797 100644 (file)
--- a/src/backend/executor/instrument.c
+++ b/src/backend/executor/instrument.c
@@ -7,90 +7,117 @@
   * Copyright (c) 2001-2006, PostgreSQL Global Development Group
   *
   * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/executor/instrument.c,v 1.15 2006/05/30 14:01:58 momjian Exp $
+ *   $PostgreSQL: pgsql/src/backend/executor/instrument.c,v 1.16 2006/05/30 19:24:25 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
  #include "postgres.h"
  
-#include 
  #include 
+#include 
  
  #include "executor/instrument.h"
  
-/* This is the function that is used to determine the sampling intervals. In
- * general, if the function is f(x), then for N tuples we will take on the
- * order of integral(1/f(x), x=0..N) samples. Some examples follow, with the
- * number of samples that would be collected over 1,000,000 tuples.
  
-  f(x) = x         =>   log2(N)                        20
-  f(x) = x^(1/2)   =>   2 * N^(1/2)                  2000
-  f(x) = x^(1/3)   =>   1.5 * N^(2/3)               15000
-  
+/*
+ * As of PostgreSQL 8.2, we try to reduce the overhead of EXPLAIN ANALYZE
+ * by not calling INSTR_TIME_SET_CURRENT() for every single node execution.
+ * (Because that requires a kernel call on most systems, it's expensive.)
+ *
+ * This macro determines the sampling interval: that is, after how many more
+ * iterations will we take the next time sample, given that niters iterations
+ * have occurred already.  In general, if the function is f(x), then for N
+ * iterations we will take on the order of integral(1/f(x), x=0..N)
+ * samples.  Some examples follow, with the number of samples that would be
+ * collected over 1,000,000 iterations:
+ *
+ *     f(x) = x         =>   log2(N)                        20
+ *     f(x) = x^(1/2)   =>   2 * N^(1/2)                  2000
+ *     f(x) = x^(1/3)   =>   1.5 * N^(2/3)               15000
+ *
   * I've chosen the last one as it seems to provide a good compromise between
   * low overhead but still getting a meaningful number of samples. However,
   * not all machines have the cbrt() function so on those we substitute
   * sqrt(). The difference is not very significant in the tests I made.
-*/ 
+ *
+ * The actual sampling interval is randomized with the SampleFunc() value
+ * as the mean; this hopefully will reduce any measurement bias due to
+ * cyclic variation in the node execution time.
+ */
  #ifdef HAVE_CBRT
-#define SampleFunc cbrt
+#define SampleFunc(niters) cbrt(niters)
  #else
-#define SampleFunc sqrt
+#define SampleFunc(niters) sqrt(niters)
  #endif
  
+#define SampleInterval(niters) \
+   (SampleFunc(niters) * (double) random() / (double) (MAX_RANDOM_VALUE/2))
+
+/*
+ * We sample at every node iteration until we've reached this threshold,
+ * so that nodes not called a large number of times are completely accurate.
+ * (Perhaps this should be a GUC variable?  But beware of messing up
+ * CalculateSampleOverhead if value is too small.)
+ */
  #define SAMPLE_THRESHOLD 50
  
+/*
+ * Determine the sampling overhead.  This only needs to be done once per
+ * backend (actually, probably once per postmaster would do ...)
+ */
  static double SampleOverhead;
-static bool SampleOverheadCalculated;
+static bool SampleOverheadCalculated = false;
  
  static void
-CalculateSampleOverhead()
+CalculateSampleOverhead(void)
  {
     Instrumentation instr;
     int i;
-   
-   /* We want to determine the sampling overhead, to correct
-    * calculations later. This only needs to be done once per backend.
-    * Is this the place? A wrong value here (due to a mistimed
-    * task-switch) will cause bad calculations later.
-    *
+
+   /*
+    * We could get a wrong result due to being interrupted by task switch.
      * To minimize the risk we do it a few times and take the lowest.
      */
-   
     SampleOverhead = 1.0e6;
-   
-   for( i = 0; i<5; i++ )
+
+   for (i = 0; i < 5; i++)
     {
         int j;
         double overhead;
-       
-       memset( &instr, 0, sizeof(instr) );
-   
-       /* Loop SAMPLE_THRESHOLD times or 100 microseconds, whichever is faster */
-       for( j=0; j
+
+       memset(&instr, 0, sizeof(instr));
+       /*
+        * We know that samples will actually be taken up to SAMPLE_THRESHOLD,
+        * so that's as far as we can test.
+        */
+       for (j=0; j < SAMPLE_THRESHOLD; j++)
        {
-           InstrStartNode( &instr );
-           InstrStopNode( &instr, 1 );
+           InstrStartNode(&instr);
+           InstrStopNode(&instr, 1);
        }
        overhead = INSTR_TIME_GET_DOUBLE(instr.counter) / instr.samplecount;
-       if( overhead < SampleOverhead )
+       if (overhead < SampleOverhead)
            SampleOverhead = overhead;
    }
-       
+
    SampleOverheadCalculated = true;
 }
 
+
 /* Allocate new instrumentation structure(s) */
 Instrumentation *
 InstrAlloc(int n)
 {
-   Instrumentation *instr = palloc0(n * sizeof(Instrumentation));
+   Instrumentation *instr;
 
-   /* we don't need to do any initialization except zero 'em */
-   
-   /* Calculate overhead, if not done yet */
-   if( !SampleOverheadCalculated )
+   /* Calculate sampling overhead, if not done yet in this backend */
+   if (!SampleOverheadCalculated)
        CalculateSampleOverhead();
+
+   instr = palloc0(n * sizeof(Instrumentation));
+
+   /* we don't need to do any initialization except zero 'em */
+
    return instr;
 }
 
@@ -100,18 +127,17 @@ InstrStartNode(Instrumentation *instr)
 {
    if (INSTR_TIME_IS_ZERO(instr->starttime))
    {
-       /* We always sample the first SAMPLE_THRESHOLD tuples, so small nodes are always accurate */
-       if (instr->tuplecount < SAMPLE_THRESHOLD)
+       /*
+        * Always sample if not yet up to threshold, else check whether
+        * next threshold has been reached
+        */
+       if (instr->itercount < SAMPLE_THRESHOLD)
            instr->sampling = true;
-       else
+       else if (instr->itercount >= instr->nextsample)
        {
-           /* Otherwise we go to sampling, see the comments on SampleFunc at the top of the file */
-           if( instr->tuplecount > instr->nextsample )
-           {
-               instr->sampling = true;
-               /* The doubling is so the random will average 1 over time */
-               instr->nextsample += 2.0 * SampleFunc(instr->tuplecount) * (double)rand() / (double)RAND_MAX;
-           }
+           instr->sampling = true;
+           instr->nextsample =
+               instr->itercount + SampleInterval(instr->itercount);
        }
        if (instr->sampling)
            INSTR_TIME_SET_CURRENT(instr->starttime);
@@ -124,13 +150,15 @@ InstrStartNode(Instrumentation *instr)
 void
 InstrStopNode(Instrumentation *instr, double nTuples)
 {
-   instr_time  endtime;
-
-   /* count the returned tuples */
+   /* count the returned tuples and iterations */
    instr->tuplecount += nTuples;
+   instr->itercount += 1;
 
+   /* measure runtime if appropriate */
    if (instr->sampling)
    {
+       instr_time  endtime;
+
        if (INSTR_TIME_IS_ZERO(instr->starttime))
        {
            elog(DEBUG2, "InstrStopNode called without start");
@@ -159,7 +187,8 @@ InstrStopNode(Instrumentation *instr, double nTuples)
 #endif
 
        INSTR_TIME_SET_ZERO(instr->starttime);
-       instr->samplecount += nTuples;
+
+       instr->samplecount += 1;
        instr->sampling = false;
    }
 
@@ -184,35 +213,44 @@ InstrEndLoop(Instrumentation *instr)
    if (!INSTR_TIME_IS_ZERO(instr->starttime))
        elog(DEBUG2, "InstrEndLoop called on running node");
 
-   /* Accumulate per-cycle statistics into totals */
+   /* Compute time spent in node */
    totaltime = INSTR_TIME_GET_DOUBLE(instr->counter);
 
-   instr->startup += instr->firsttuple;
-
-   /* Here we take into account sampling effects. Doing it naively ends
-    * up assuming the sampling overhead applies to all tuples, even the
-    * ones we didn't measure. We've calculated an overhead, so we
-    * subtract that for all samples we didn't measure. The first tuple
-    * is also special cased, because it usually takes longer. */
-
-   if( instr->samplecount < instr->tuplecount )
+   /*
+    * If we didn't measure runtime on every iteration, then we must increase
+    * the measured total to account for the other iterations.  Naively
+    * multiplying totaltime by itercount/samplecount would be wrong because
+    * it effectively assumes the sampling overhead applies to all iterations,
+    * even the ones we didn't measure.  (Note that what we are trying to
+    * estimate here is the actual time spent in the node, including the
+    * actual measurement overhead; not the time exclusive of measurement
+    * overhead.)  We exclude the first iteration from the correction basis,
+    * because it often takes longer than others.
+    */
+   if (instr->itercount > instr->samplecount)
    {
-       double pertuple = (totaltime - instr->firsttuple) / (instr->samplecount - 1);
-       instr->total += instr->firsttuple + (pertuple * (instr->samplecount - 1))
-                                         + ((pertuple - SampleOverhead) * (instr->tuplecount - instr->samplecount));
+       double per_iter;
+
+       per_iter = (totaltime - instr->firsttuple) / (instr->samplecount - 1)
+           - SampleOverhead;
+       if (per_iter > 0)               /* sanity check */
+           totaltime += per_iter * (instr->itercount - instr->samplecount);
    }
-   else
-       instr->total += totaltime;
-       
+
+   /* Accumulate per-cycle statistics into totals */
+   instr->startup += instr->firsttuple;
+   instr->total += totaltime;
    instr->ntuples += instr->tuplecount;
-   instr->nsamples += instr->samplecount;
    instr->nloops += 1;
 
    /* Reset for next cycle (if any) */
    instr->running = false;
+   instr->sampling = false;
    INSTR_TIME_SET_ZERO(instr->starttime);
    INSTR_TIME_SET_ZERO(instr->counter);
    instr->firsttuple = 0;
-   instr->samplecount = 0;
    instr->tuplecount = 0;
+   instr->itercount = 0;
+   instr->samplecount = 0;
+   instr->nextsample = 0;
 }
+
+       memset(&instr, 0, sizeof(instr));
+       /*
+        * We know that samples will actually be taken up to SAMPLE_THRESHOLD,
+        * so that's as far as we can test.
+        */
+       for (j=0; j < SAMPLE_THRESHOLD; j++)
         {
-           InstrStartNode( &instr );
-           InstrStopNode( &instr, 1 );
+           InstrStartNode(&instr);
+           InstrStopNode(&instr, 1);
         }
         overhead = INSTR_TIME_GET_DOUBLE(instr.counter) / instr.samplecount;
-       if( overhead < SampleOverhead )
+       if (overhead < SampleOverhead)
             SampleOverhead = overhead;
     }
-       
+
     SampleOverheadCalculated = true;
  }
  
+
  /* Allocate new instrumentation structure(s) */
  Instrumentation *
  InstrAlloc(int n)
  {
-   Instrumentation *instr = palloc0(n * sizeof(Instrumentation));
+   Instrumentation *instr;
  
-   /* we don't need to do any initialization except zero 'em */
-   
-   /* Calculate overhead, if not done yet */
-   if( !SampleOverheadCalculated )
+   /* Calculate sampling overhead, if not done yet in this backend */
+   if (!SampleOverheadCalculated)
         CalculateSampleOverhead();
+
+   instr = palloc0(n * sizeof(Instrumentation));
+
+   /* we don't need to do any initialization except zero 'em */
+
     return instr;
  }
  
@@ -100,18 +127,17 @@ InstrStartNode(Instrumentation *instr)
  {
     if (INSTR_TIME_IS_ZERO(instr->starttime))
     {
-       /* We always sample the first SAMPLE_THRESHOLD tuples, so small nodes are always accurate */
-       if (instr->tuplecount < SAMPLE_THRESHOLD)
+       /*
+        * Always sample if not yet up to threshold, else check whether
+        * next threshold has been reached
+        */
+       if (instr->itercount < SAMPLE_THRESHOLD)
             instr->sampling = true;
-       else
+       else if (instr->itercount >= instr->nextsample)
         {
-           /* Otherwise we go to sampling, see the comments on SampleFunc at the top of the file */
-           if( instr->tuplecount > instr->nextsample )
-           {
-               instr->sampling = true;
-               /* The doubling is so the random will average 1 over time */
-               instr->nextsample += 2.0 * SampleFunc(instr->tuplecount) * (double)rand() / (double)RAND_MAX;
-           }
+           instr->sampling = true;
+           instr->nextsample =
+               instr->itercount + SampleInterval(instr->itercount);
         }
         if (instr->sampling)
             INSTR_TIME_SET_CURRENT(instr->starttime);
@@ -124,13 +150,15 @@ InstrStartNode(Instrumentation *instr)
  void
  InstrStopNode(Instrumentation *instr, double nTuples)
  {
-   instr_time  endtime;
-
-   /* count the returned tuples */
+   /* count the returned tuples and iterations */
     instr->tuplecount += nTuples;
+   instr->itercount += 1;
  
+   /* measure runtime if appropriate */
     if (instr->sampling)
     {
+       instr_time  endtime;
+
         if (INSTR_TIME_IS_ZERO(instr->starttime))
         {
             elog(DEBUG2, "InstrStopNode called without start");
@@ -159,7 +187,8 @@ InstrStopNode(Instrumentation *instr, double nTuples)
  #endif
  
         INSTR_TIME_SET_ZERO(instr->starttime);
-       instr->samplecount += nTuples;
+
+       instr->samplecount += 1;
         instr->sampling = false;
     }
  
@@ -184,35 +213,44 @@ InstrEndLoop(Instrumentation *instr)
     if (!INSTR_TIME_IS_ZERO(instr->starttime))
         elog(DEBUG2, "InstrEndLoop called on running node");
  
-   /* Accumulate per-cycle statistics into totals */
+   /* Compute time spent in node */
     totaltime = INSTR_TIME_GET_DOUBLE(instr->counter);
  
-   instr->startup += instr->firsttuple;
-
-   /* Here we take into account sampling effects. Doing it naively ends
-    * up assuming the sampling overhead applies to all tuples, even the
-    * ones we didn't measure. We've calculated an overhead, so we
-    * subtract that for all samples we didn't measure. The first tuple
-    * is also special cased, because it usually takes longer. */
-
-   if( instr->samplecount < instr->tuplecount )
+   /*
+    * If we didn't measure runtime on every iteration, then we must increase
+    * the measured total to account for the other iterations.  Naively
+    * multiplying totaltime by itercount/samplecount would be wrong because
+    * it effectively assumes the sampling overhead applies to all iterations,
+    * even the ones we didn't measure.  (Note that what we are trying to
+    * estimate here is the actual time spent in the node, including the
+    * actual measurement overhead; not the time exclusive of measurement
+    * overhead.)  We exclude the first iteration from the correction basis,
+    * because it often takes longer than others.
+    */
+   if (instr->itercount > instr->samplecount)
     {
-       double pertuple = (totaltime - instr->firsttuple) / (instr->samplecount - 1);
-       instr->total += instr->firsttuple + (pertuple * (instr->samplecount - 1))
-                                         + ((pertuple - SampleOverhead) * (instr->tuplecount - instr->samplecount));
+       double per_iter;
+
+       per_iter = (totaltime - instr->firsttuple) / (instr->samplecount - 1)
+           - SampleOverhead;
+       if (per_iter > 0)               /* sanity check */
+           totaltime += per_iter * (instr->itercount - instr->samplecount);
     }
-   else
-       instr->total += totaltime;
-       
+
+   /* Accumulate per-cycle statistics into totals */
+   instr->startup += instr->firsttuple;
+   instr->total += totaltime;
     instr->ntuples += instr->tuplecount;
-   instr->nsamples += instr->samplecount;
     instr->nloops += 1;
  
     /* Reset for next cycle (if any) */
     instr->running = false;
+   instr->sampling = false;
     INSTR_TIME_SET_ZERO(instr->starttime);
     INSTR_TIME_SET_ZERO(instr->counter);
     instr->firsttuple = 0;
-   instr->samplecount = 0;
     instr->tuplecount = 0;
+   instr->itercount = 0;
+   instr->samplecount = 0;
+   instr->nextsample = 0;
  }
diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h

index fbf578d473f8c7001f92ee633b0fa20d6ea518e6..fd57f8657884697510adf9d342dbb7eefa9d8e15 100644 (file)
--- a/src/include/executor/instrument.h
+++ b/src/include/executor/instrument.h
@@ -6,7 +6,7 @@
   *
   * Copyright (c) 2001-2006, PostgreSQL Global Development Group
   *
- * $PostgreSQL: pgsql/src/include/executor/instrument.h,v 1.14 2006/05/30 14:01:58 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/executor/instrument.h,v 1.15 2006/05/30 19:24:25 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -57,20 +57,19 @@ typedef struct Instrumentation
  {
     /* Info about current plan cycle: */
     bool        running;        /* TRUE if we've completed first tuple */
+   bool        sampling;       /* Are we sampling in current iteration? */
     instr_time  starttime;      /* Start time of current iteration of node */
     instr_time  counter;        /* Accumulated runtime for this node */
     double      firsttuple;     /* Time for first tuple of this cycle */
     double      tuplecount;     /* Tuples emitted so far this cycle */
-   double      samplecount;        /* Samples collected this cycle */
+   double      itercount;      /* Plan node iterations this cycle */
+   double      samplecount;    /* Iterations in which we sampled runtime */
+   double      nextsample;     /* Next itercount to sample at */
     /* Accumulated statistics across all completed cycles: */
     double      startup;        /* Total startup time (in seconds) */
     double      total;          /* Total total time (in seconds) */
     double      ntuples;        /* Total tuples produced */
     double      nloops;         /* # of run cycles for this node */
-   double      nsamples;       /* # of samples taken */
-   /* Tracking for sampling */
-   bool        sampling;       /* Are we sampling this iteration */
-   double      nextsample;     /* The next tuplecount we're going to sample */
  } Instrumentation;
  
  extern Instrumentation *InstrAlloc(int n);
author	Tom Lane
	Tue, 30 May 2006 19:24:25 +0000 (19:24 +0000)
committer	Tom Lane
	Tue, 30 May 2006 19:24:25 +0000 (19:24 +0000)
src/backend/executor/instrument.c		patch \| blob \| blame \| history
src/include/executor/instrument.h		patch \| blob \| blame \| history