pgbench: Function to generate random permutations.

author Dean Rasheed

Tue, 6 Apr 2021 10:50:42 +0000 (11:50 +0100)

committer Dean Rasheed

Tue, 6 Apr 2021 10:50:42 +0000 (11:50 +0100)
author Dean Rasheed
Tue, 6 Apr 2021 10:50:42 +0000 (11:50 +0100)
committer Dean Rasheed
Tue, 6 Apr 2021 10:50:42 +0000 (11:50 +0100)
diff --git a/doc/src/sgml/ref/pgbench.sgml b/doc/src/sgml/ref/pgbench.sgml

index 50cf22ba6baddf4c9efd08c95f139db0283382c5..8eb4f538d5b65a7ed34c6e8a00c424018efe2406 100644 (file)
--- a/doc/src/sgml/ref/pgbench.sgml
+++ b/doc/src/sgml/ref/pgbench.sgml
@@ -1057,7 +1057,7 @@ pgbench  options  d
  
        
          default_seed 
-       seed used in hash functions by default
+       seed used in hash and pseudorandom permutation functions by default
        
  
        
@@ -1864,6 +1864,24 @@ SELECT 4 AS four \; SELECT 5 AS five \aset
         
        
  
+      
+       
+        permute ( i, size [, seed ] )
+        integer
+       
+       
+        Permuted value of i, in the range
+        [0, size).  This is the new position of
+        i (modulo size) in a
+        pseudorandom permutation of the integers 0...size-1,
+        parameterized by seed, see below.
+       
+       
+        permute(0, 4)
+        an integer between 0 and 3
+       
+      
+
        
         
          pi ()
@@ -2071,29 +2089,70 @@ f(x) = PHI(2.0 * parameter * (x - mu) / (max - min + 1)) /
      
     
  
+   
+    
+      When designing a benchmark which selects rows non-uniformly, be aware
+      that the rows chosen may be correlated with other data such as IDs from
+      a sequence or the physical row ordering, which may skew performance
+      measurements.
+    
+    
+      To avoid this, you may wish to use the permute
+      function, or some other additional step with similar effect, to shuffle
+      the selected rows and remove such correlations.
+    
+   
+
    
      Hash functions hash, hash_murmur2 and
      hash_fnv1a accept an input value and an optional seed parameter.
      In case the seed isn't provided the value of :default_seed
      is used, which is initialized randomly unless set by the command-line
-    -D option. Hash functions can be used to scatter the
-    distribution of random functions such as random_zipfian or
-    random_exponential. For instance, the following pgbench
-    script simulates possible real world workload typical for social media and
-    blogging platforms where few accounts generate excessive load:
+    -D option.
+  
+
+  
+    permute accepts an input value, a size, and an optional
+    seed parameter.  It generates a pseudorandom permutation of integers in
+    the range [0, size), and returns the index of the input
+    value in the permuted values.  The permutation chosen is parameterized by
+    the seed, which defaults to :default_seed if not
+    specified.  Unlike the hash functions, permute ensures
+    that there are no collisions or holes in the output values.  Input values
+    outside the interval are interpreted modulo the size.  The function raises
+    an error if the size is not positive.  permute can be
+    used to scatter the distribution of non-uniform random functions such as
+    random_zipfian or random_exponential
+    so that values drawn more often are not trivially correlated.  For
+    instance, the following pgbench script
+    simulates a possible real world workload typical for social media and
+    blogging platforms where a few accounts generate excessive load:
  
  
-\set r random_zipfian(0, 100000000, 1.07)
-\set k abs(hash(:r)) % 1000000
+\set size 1000000
+\set r random_zipfian(1, :size, 1.07)
+\set k 1 + permute(:r, :size)
  
  
      In some cases several distinct distributions are needed which don't correlate
-    with each other and this is when implicit seed parameter comes in handy:
+    with each other and this is when the optional seed parameter comes in handy:
  
  
-\set k1 abs(hash(:r, :default_seed + 123)) % 1000000
-\set k2 abs(hash(:r, :default_seed + 321)) % 1000000
+\set k1 1 + permute(:r, :size, :default_seed + 123)
+\set k2 1 + permute(:r, :size, :default_seed + 321)
  
+
+    A similar behavior can also be approximated with hash:
+
+
+\set size 1000000
+\set r random_zipfian(1, 100 * :size, 1.07)
+\set k 1 + abs(hash(:r)) % :size
+
+
+    However, since hash generates collisions, some values
+    will not be reachable and others will be more frequent than expected from
+    the original distribution.
    
  
    
diff --git a/src/bin/pgbench/exprparse.y b/src/bin/pgbench/exprparse.y

index 4d529ea5500106127980613345837e34de66c87b..56f75ccd253e74bc2e40064f12a87e933a267633 100644 (file)
--- a/src/bin/pgbench/exprparse.y
+++ b/src/bin/pgbench/exprparse.y
@@ -19,6 +19,7 @@
  #define PGBENCH_NARGS_VARIABLE (-1)
  #define PGBENCH_NARGS_CASE     (-2)
  #define PGBENCH_NARGS_HASH     (-3)
+#define PGBENCH_NARGS_PERMUTE  (-4)
  
  PgBenchExpr *expr_parse_result;
  
@@ -370,6 +371,9 @@ static const struct
     {
         "hash_fnv1a", PGBENCH_NARGS_HASH, PGBENCH_HASH_FNV1A
     },
+   {
+       "permute", PGBENCH_NARGS_PERMUTE, PGBENCH_PERMUTE
+   },
     /* keep as last array element */
     {
         NULL, 0, 0
@@ -482,6 +486,19 @@ make_func(yyscan_t yyscanner, int fnumber, PgBenchExprList *args)
             }
             break;
  
+       /* pseudorandom permutation function with optional seed argument */
+       case PGBENCH_NARGS_PERMUTE:
+           if (len < 2 || len > 3)
+               expr_yyerror_more(yyscanner, "unexpected number of arguments",
+                                 PGBENCH_FUNCTIONS[fnumber].fname);
+
+           if (len == 2)
+           {
+               PgBenchExpr *var = make_variable("default_seed");
+               args = make_elist(var, args);
+           }
+           break;
+
         /* common case: positive arguments number */
         default:
             Assert(PGBENCH_FUNCTIONS[fnumber].nargs >= 0);
diff --git a/src/bin/pgbench/pgbench.c b/src/bin/pgbench/pgbench.c

index 48ce1712cc4caf35c878b8632ba14997349b6cfa..da1d9ec535178362b0f24d986aa03c7e655bd96f 100644 (file)
--- a/src/bin/pgbench/pgbench.c
+++ b/src/bin/pgbench/pgbench.c
@@ -66,6 +66,7 @@
  #include "getopt_long.h"
  #include "libpq-fe.h"
  #include "pgbench.h"
+#include "port/pg_bitutils.h"
  #include "portability/instr_time.h"
  
  #ifndef M_PI
@@ -1127,6 +1128,113 @@ getHashMurmur2(int64 val, uint64 seed)
     return (int64) result;
  }
  
+/*
+ * Pseudorandom permutation function
+ *
+ * Returns the position of val (cast to unsigned and reduced modulo isize)
+ * in a pseudorandom permutation of [0, isize) selected by seed, or 0 when
+ * isize < 2.  For small sizes, each of the (size!) permutations is produced
+ * with roughly equal probability.  Above size 20 there are more permutations
+ * than distinct states of the internal pseudorandom number generators, so
+ * not all can be generated, but the output should still appear random.
+ *
+ * THIS FUNCTION IS NOT CRYPTOGRAPHICALLY SECURE.
+ * DO NOT USE FOR SUCH PURPOSE.
+ */
+static int64
+permute(const int64 val, const int64 isize, const int64 seed)
+{
+   RandomState random_state1;
+   RandomState random_state2;
+   uint64      size;
+   uint64      v;
+   int         masklen;
+   uint64      mask;
+   int         i;
+
+   if (isize < 2)
+       return 0;               /* nothing to permute */
+
+   /* Fill two generator states from different 16-bit slices of the seed */
+   random_state1.xseed[0] = seed & 0xFFFF;
+   random_state1.xseed[1] = (seed >> 16) & 0xFFFF;
+   random_state1.xseed[2] = (seed >> 32) & 0xFFFF;
+
+   random_state2.xseed[0] = (((uint64) seed) >> 48) & 0xFFFF;
+   random_state2.xseed[1] = seed & 0xFFFF;
+   random_state2.xseed[2] = (seed >> 16) & 0xFFFF;
+
+   /* Work on unsigned values; a negative val wraps modulo 2^64 in the cast */
+   size = (uint64) isize;
+   v = (uint64) val % size;
+
+   /* Mask for arithmetic modulo the largest power of 2 <= size */
+   masklen = pg_leftmost_one_pos64(size);
+   mask = (((uint64) 1) << masklen) - 1;
+
+   /*
+    * Permute the input value by applying several rounds of pseudorandom
+    * bijective transformations.  The intention here is to distribute each
+    * input uniformly randomly across the range, and separate adjacent inputs
+    * approximately uniformly randomly from each other, leading to a fairly
+    * random overall choice of permutation.
+    *
+    * To separate adjacent inputs, we multiply by a random number modulo
+    * (mask + 1), which is a power of 2.  For this to be a bijection, the
+    * multiplier must be odd.  Since this is known to lead to less randomness
+    * in the lower bits, we also apply a rotation that shifts the topmost bit
+    * into the least significant bit.  In the special cases where size <= 3,
+    * mask = 1 and each of these operations is actually a no-op, so we also
+    * XOR the value with a different random number to inject additional
+    * randomness.  Since the size is generally not a power of 2, we apply
+    * this bijection on overlapping upper and lower halves of the input.
+    *
+    * To distribute the inputs uniformly across the range, we then also apply
+    * a random offset modulo the full range.
+    *
+    * Taken together, these operations resemble a modified linear
+    * congruential generator, as is commonly used in pseudorandom number
+    * generators.  The number of rounds is fairly arbitrary, but six has been
+    * found empirically to give a fairly good tradeoff between performance
+    * and uniform randomness.  For small sizes it selects each of the (size!)
+    * possible permutations with roughly equal probability.  For larger
+    * sizes, not all permutations can be generated, but the intended random
+    * spread is still produced.
+    */
+   for (i = 0; i < 6; i++)
+   {
+       uint64      m,
+                   r,
+                   t;
+
+       /* Random multiply (by an odd number), XOR and rotate of lower half */
+       m = (uint64) getrand(&random_state1, 0, mask) | 1;
+       r = (uint64) getrand(&random_state2, 0, mask);
+       if (v <= mask)
+       {
+           v = ((v * m) ^ r) & mask;
+           v = ((v << 1) & mask) | (v >> (masklen - 1));
+       }
+
+       /* Random multiply (by an odd number), XOR and rotate of upper half */
+       m = (uint64) getrand(&random_state1, 0, mask) | 1;
+       r = (uint64) getrand(&random_state2, 0, mask);
+       t = size - 1 - v; /* v measured from the top of the range */
+       if (t <= mask)
+       {
+           t = ((t * m) ^ r) & mask;
+           t = ((t << 1) & mask) | (t >> (masklen - 1));
+           v = size - 1 - t;
+       }
+
+       /* Random offset, applied modulo the full size */
+       r = (uint64) getrand(&random_state2, 0, size - 1);
+       v = (v + r) % size;
+   }
+
+   return (int64) v;
+}
+
  /*
   * Initialize the given SimpleStats struct to all zeroes
   */
@@ -2475,6 +2583,29 @@ evalStandardFunc(CState *st,
                 return true;
             }
  
+       case PGBENCH_PERMUTE:
+           {
+               int64       val,
+                           size,
+                           seed;
+
+               Assert(nargs == 3);
+
+               if (!coerceToInt(&vargs[0], &val) ||
+                   !coerceToInt(&vargs[1], &size) ||
+                   !coerceToInt(&vargs[2], &seed))
+                   return false;
+
+               if (size <= 0)
+               {
+                   pg_log_error("permute size parameter must be greater than zero");
+                   return false;
+               }
+
+               setIntValue(retval, permute(val, size, seed));
+               return true;
+           }
+
         default:
             /* cannot get here */
             Assert(0);
diff --git a/src/bin/pgbench/pgbench.h b/src/bin/pgbench/pgbench.h

index 3a9d89e6f1509f2fbc3912fa894cf1bb4867d08f..6ce1c98649ad334da6a6cc33b136395e034386df 100644 (file)
--- a/src/bin/pgbench/pgbench.h
+++ b/src/bin/pgbench/pgbench.h
@@ -99,7 +99,8 @@ typedef enum PgBenchFunction
     PGBENCH_IS,
     PGBENCH_CASE,
     PGBENCH_HASH_FNV1A,
-   PGBENCH_HASH_MURMUR2
+   PGBENCH_HASH_MURMUR2,
+   PGBENCH_PERMUTE
  } PgBenchFunction;
  
  typedef struct PgBenchExpr PgBenchExpr;
diff --git a/src/bin/pgbench/t/001_pgbench_with_server.pl b/src/bin/pgbench/t/001_pgbench_with_server.pl

index 82a46c72b6af8611459b48e2cbb391fbde00ea7d..c2482dea1752d65029ac2cccd7c43bab2460cc7c 100644 (file)
--- a/src/bin/pgbench/t/001_pgbench_with_server.pl
+++ b/src/bin/pgbench/t/001_pgbench_with_server.pl
@@ -4,6 +4,7 @@ use warnings;
  use PostgresNode;
  use TestLib;
  use Test::More;
+use Config;
  
  # start a pgbench specific server
  my $node = get_new_node('main');
@@ -483,6 +484,17 @@ pgbench(
         qr{command=98.: int 5432\b},                    # :random_seed
         qr{command=99.: int -9223372036854775808\b},    # min int
         qr{command=100.: int 9223372036854775807\b},    # max int
+       # pseudorandom permutation tests
+       qr{command=101.: boolean true\b},
+       qr{command=102.: boolean true\b},
+       qr{command=103.: boolean true\b},
+       qr{command=104.: boolean true\b},
+       qr{command=105.: boolean true\b},
+       qr{command=109.: boolean true\b},
+       qr{command=110.: boolean true\b},
+       qr{command=111.: boolean true\b},
+       qr{command=112.: int 9223372036854775797\b},
+       qr{command=113.: boolean true\b},
     ],
     'pgbench expressions',
     {
@@ -610,6 +622,33 @@ SELECT :v0, :v1, :v2, :v3;
  -- minint constant parsing
  \set min debug(-9223372036854775808)
  \set max debug(-(:min + 1))
+-- parametric pseudorandom permutation function
+\set t debug(permute(0, 2) + permute(1, 2) = 1)
+\set t debug(permute(0, 3) + permute(1, 3) + permute(2, 3) = 3)
+\set t debug(permute(0, 4) + permute(1, 4) + permute(2, 4) + permute(3, 4) = 6)
+\set t debug(permute(0, 5) + permute(1, 5) + permute(2, 5) + permute(3, 5) + permute(4, 5) = 10)
+\set t debug(permute(0, 16) + permute(1, 16) + permute(2, 16) + permute(3, 16) + \
+             permute(4, 16) + permute(5, 16) + permute(6, 16) + permute(7, 16) + \
+             permute(8, 16) + permute(9, 16) + permute(10, 16) + permute(11, 16) + \
+             permute(12, 16) + permute(13, 16) + permute(14, 16) + permute(15, 16) = 120)
+-- random sanity checks
+\set size random(2, 1000)
+\set v random(0, :size - 1)
+\set p permute(:v, :size)
+\set t debug(0 <= :p and :p < :size and :p = permute(:v + :size, :size) and :p <> permute(:v + 1, :size))
+-- actual values
+\set t debug(permute(:v, 1) = 0)
+\set t debug(permute(0, 2, 5432) = 0 and permute(1, 2, 5432) = 1 and \
+             permute(0, 2, 5435) = 1 and permute(1, 2, 5435) = 0)
+-- 63 bits tests
+\set size debug(:max - 10)
+\set t debug(permute(:size-1, :size, 5432) = 5301702756001087507 and \
+             permute(:size-2, :size, 5432) = 8968485976055840695 and \
+             permute(:size-3, :size, 5432) = 6708495591295582115 and \
+             permute(:size-4, :size, 5432) = 2801794404574855121 and \
+             permute(:size-5, :size, 5432) = 1489011409218895840 and \
+             permute(:size-6, :size, 5432) = 2267749475878240183 and \
+             permute(:size-7, :size, 5432) = 1300324176838786780)
  }
     });
  
@@ -1048,6 +1087,10 @@ SELECT LEAST(} . join(', ', (':i') x 256) . q{)}
         'bad boolean',                     2,
         [qr{malformed variable.*trueXXX}], q{\set b :badtrue or true}
     ],
+   [
+       'invalid permute size',             2,
+       [qr{permute size parameter must be greater than zero}], q{\set i permute(0, 0)}
+   ],
  
     # GSET
     [
diff --git a/src/bin/pgbench/t/002_pgbench_no_server.pl b/src/bin/pgbench/t/002_pgbench_no_server.pl

index e38c7d77d1c01ff9f298594a80d1e9d24d06fab2..4027e68dfac592440729aa430bd8b9a23191c7b5 100644 (file)
--- a/src/bin/pgbench/t/002_pgbench_no_server.pl
+++ b/src/bin/pgbench/t/002_pgbench_no_server.pl
@@ -341,6 +341,16 @@ my @script_tests = (
         'set i',
         [ qr{set i 1 }, qr{\^ error found here} ],
         { 'set_i_op' => "\\set i 1 +\n" }
+   ],
+   [
+       'not enough arguments to permute',
+       [qr{unexpected number of arguments \(permute\)}],
+       { 'bad-permute-1.sql' => "\\set i permute(1)\n" }
+   ],
+   [
+       'too many arguments to permute',
+       [qr{unexpected number of arguments \(permute\)}],
+       { 'bad-permute-2.sql' => "\\set i permute(1, 2, 3, 4)\n" }
     ],);
  
  for my $t (@script_tests)
author	Dean Rasheed
	Tue, 6 Apr 2021 10:50:42 +0000 (11:50 +0100)
committer	Dean Rasheed
	Tue, 6 Apr 2021 10:50:42 +0000 (11:50 +0100)
doc/src/sgml/ref/pgbench.sgml		patch \| blob \| blame \| history
src/bin/pgbench/exprparse.y		patch \| blob \| blame \| history
src/bin/pgbench/pgbench.c		patch \| blob \| blame \| history
src/bin/pgbench/pgbench.h		patch \| blob \| blame \| history
src/bin/pgbench/t/001_pgbench_with_server.pl		patch \| blob \| blame \| history
src/bin/pgbench/t/002_pgbench_no_server.pl		patch \| blob \| blame \| history