Avoid race in RelationBuildDesc() affecting CREATE INDEX CONCURRENTLY.

author Noah Misch

Sun, 24 Oct 2021 01:36:38 +0000 (18:36 -0700)

committer Noah Misch

Sun, 24 Oct 2021 01:36:42 +0000 (18:36 -0700)
author Noah Misch
Sun, 24 Oct 2021 01:36:38 +0000 (18:36 -0700)
committer Noah Misch
Sun, 24 Oct 2021 01:36:42 +0000 (18:36 -0700)
diff --git a/contrib/amcheck/Makefile b/contrib/amcheck/Makefile

index a2b1b1036b3e310b4329e5c746ce568f2501519b..9a62e84defc7261c4f941432d34fedfac7680fdd 100644 (file)
--- a/contrib/amcheck/Makefile
+++ b/contrib/amcheck/Makefile
@@ -11,6 +11,8 @@ PGFILEDESC = "amcheck - function for verifying relation integrity"
  
  REGRESS = check check_btree
  
+TAP_TESTS = 1
+
  ifdef USE_PGXS
  PG_CONFIG = pg_config
  PGXS := $(shell $(PG_CONFIG) --pgxs)
diff --git a/contrib/amcheck/t/002_cic.pl b/contrib/amcheck/t/002_cic.pl

new file mode 100644 (file)

index 0000000..26b5605
--- /dev/null
+++ b/contrib/amcheck/t/002_cic.pl
@@ -0,0 +1,78 @@
+
+# Copyright (c) 2021, PostgreSQL Global Development Group
+
+# Test CREATE INDEX CONCURRENTLY with concurrent modifications
+use strict;
+use warnings;
+
+use Config;
+use PostgresNode;
+use TestLib;
+
+use Test::More tests => 4;
+
+my ($node, $result);
+
+#
+# Test set-up
+#
+$node = get_new_node('CIC_test');
+$node->init;
+$node->append_conf('postgresql.conf', 'lock_timeout = 180000');
+$node->start;
+$node->safe_psql('postgres', q(CREATE EXTENSION amcheck));
+$node->safe_psql('postgres', q(CREATE TABLE tbl(i int)));
+$node->safe_psql('postgres', q(CREATE INDEX idx ON tbl(i)));
+
+#
+# Stress CIC with pgbench
+#
+
+# Run background pgbench with CIC. We cannot mix-in this script into single
+# pgbench: CIC will deadlock with itself occasionally.
+my $pgbench_out   = '';
+my $pgbench_timer = IPC::Run::timeout(180);
+my $pgbench_h     = $node->background_pgbench(
+   '--no-vacuum --client=1 --transactions=200',
+   {
+       '002_pgbench_concurrent_cic' => q(
+           DROP INDEX CONCURRENTLY idx;
+           CREATE INDEX CONCURRENTLY idx ON tbl(i);
+           SELECT bt_index_check('idx',true);
+          )
+   },
+   \$pgbench_out,
+   $pgbench_timer);
+
+# Run pgbench.
+$node->pgbench(
+   '--no-vacuum --client=5 --transactions=200',
+   0,
+   [qr{actually processed}],
+   [qr{^$}],
+   'concurrent INSERTs',
+   {
+       '002_pgbench_concurrent_transaction' => q(
+           BEGIN;
+           INSERT INTO tbl VALUES(0);
+           COMMIT;
+         ),
+       '002_pgbench_concurrent_transaction_savepoints' => q(
+           BEGIN;
+           SAVEPOINT s1;
+           INSERT INTO tbl VALUES(0);
+           COMMIT;
+         )
+   });
+
+$pgbench_h->pump_nb;
+$pgbench_h->finish();
+$result =
+    ($Config{osname} eq "MSWin32")
+  ? ($pgbench_h->full_results)[0]
+  : $pgbench_h->result(0);
+is($result, 0, "pgbench with CIC works");
+
+# done
+$node->stop;
+done_testing();
diff --git a/src/backend/utils/cache/inval.c b/src/backend/utils/cache/inval.c

index 4fbc6d0720d240ff0b9f1f77715558f8c448940f..809c3a0972cf8a741336f2a999f248cb75157f16 100644 (file)
--- a/src/backend/utils/cache/inval.c
+++ b/src/backend/utils/cache/inval.c
@@ -584,7 +584,7 @@ LocalExecuteInvalidationMessage(SharedInvalidationMessage *msg)
             int         i;
  
             if (msg->rc.relId == InvalidOid)
-               RelationCacheInvalidate();
+               RelationCacheInvalidate(false);
             else
                 RelationCacheInvalidateEntry(msg->rc.relId);
  
@@ -641,12 +641,18 @@ LocalExecuteInvalidationMessage(SharedInvalidationMessage *msg)
   */
  void
  InvalidateSystemCaches(void)
+{
+   InvalidateSystemCachesExtended(false);
+}
+
+void
+InvalidateSystemCachesExtended(bool debug_discard)
  {
     int         i;
  
     InvalidateCatalogSnapshot();
     ResetCatalogCaches();
-   RelationCacheInvalidate();  /* gets smgr and relmap too */
+   RelationCacheInvalidate(debug_discard); /* gets smgr and relmap too */
  
     for (i = 0; i < syscache_callback_count; i++)
     {
@@ -717,7 +723,7 @@ AcceptInvalidationMessages(void)
         if (recursion_depth < 3)
         {
             recursion_depth++;
-           InvalidateSystemCaches();
+           InvalidateSystemCachesExtended(true);
             recursion_depth--;
         }
     }
diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c

index 32e75560ead9ae9ea87643538026819486e800da..7976a0d66d4d8bade595fc490eaba20d7d9d936f 100644 (file)
--- a/src/backend/utils/cache/relcache.c
+++ b/src/backend/utils/cache/relcache.c
@@ -150,6 +150,24 @@ bool       criticalSharedRelcachesBuilt = false;
   */
  static long relcacheInvalsReceived = 0L;
  
+/*
+ * in_progress_list is a stack of ongoing RelationBuildDesc() calls.  CREATE
+ * INDEX CONCURRENTLY makes catalog changes under ShareUpdateExclusiveLock.
+ * It critically relies on each backend absorbing those changes no later than
+ * next transaction start.  Hence, RelationBuildDesc() loops until it finishes
+ * without accepting a relevant invalidation.  (Most invalidation consumers
+ * don't do this.)
+ */
+typedef struct inprogressent
+{
+   Oid         reloid;         /* OID of relation being built */
+   bool        invalidated;    /* whether an invalidation arrived for it */
+} InProgressEnt;
+
+static InProgressEnt *in_progress_list;
+static int in_progress_list_len;
+static int in_progress_list_maxlen;
+
  /*
   * eoxact_list[] stores the OIDs of relations that (might) need AtEOXact
   * cleanup work.  This list intentionally has limited size; if it overflows,
@@ -1043,6 +1061,7 @@ equalRSDesc(RowSecurityDesc *rsdesc1, RowSecurityDesc *rsdesc2)
  static Relation
  RelationBuildDesc(Oid targetRelId, bool insertIt)
  {
+   int         in_progress_offset;
     Relation    relation;
     Oid         relid;
     HeapTuple   pg_class_tuple;
@@ -1070,6 +1089,21 @@ RelationBuildDesc(Oid targetRelId, bool insertIt)
     oldcxt = MemoryContextSwitchTo(tmpcxt);
  #endif
  
+   /* Register to catch invalidation messages */
+   if (in_progress_list_len >= in_progress_list_maxlen)
+   {
+       int         allocsize;
+
+       allocsize = in_progress_list_maxlen * 2;
+       in_progress_list = repalloc(in_progress_list,
+                                   allocsize * sizeof(*in_progress_list));
+       in_progress_list_maxlen = allocsize;
+   }
+   in_progress_offset = in_progress_list_len++;
+   in_progress_list[in_progress_offset].reloid = targetRelId;
+retry:
+   in_progress_list[in_progress_offset].invalidated = false;
+
     /*
      * find the tuple in pg_class corresponding to the given relation id
      */
@@ -1085,6 +1119,8 @@ RelationBuildDesc(Oid targetRelId, bool insertIt)
         MemoryContextSwitchTo(oldcxt);
         MemoryContextDelete(tmpcxt);
  #endif
+       Assert(in_progress_offset + 1 == in_progress_list_len);
+       in_progress_list_len--;
         return NULL;
     }
  
@@ -1244,6 +1280,21 @@ RelationBuildDesc(Oid targetRelId, bool insertIt)
      */
     heap_freetuple(pg_class_tuple);
  
+   /*
+    * If an invalidation arrived mid-build, start over.  Between here and the
+    * end of this function, don't add code that does or reasonably could read
+    * system catalogs.  That range must be free from invalidation processing
+    * for the !insertIt case.  For the insertIt case, RelationCacheInsert()
+    * will enroll this relation in ordinary relcache invalidation processing.
+    */
+   if (in_progress_list[in_progress_offset].invalidated)
+   {
+       RelationDestroyRelation(relation, false);
+       goto retry;
+   }
+   Assert(in_progress_offset + 1 == in_progress_list_len);
+   in_progress_list_len--;
+
     /*
      * Insert newly created relation into relcache hash table, if requested.
      *
@@ -2586,6 +2637,14 @@ RelationClearRelation(Relation relation, bool rebuild)
  
         /* Build temporary entry, but don't link it into hashtable */
         newrel = RelationBuildDesc(save_relid, false);
+
+       /*
+        * Between here and the end of the swap, don't add code that does or
+        * reasonably could read system catalogs.  That range must be free
+        * from invalidation processing.  See RelationBuildDesc() manipulation
+        * of in_progress_list.
+        */
+
         if (newrel == NULL)
         {
             /*
@@ -2816,6 +2875,14 @@ RelationCacheInvalidateEntry(Oid relationId)
         relcacheInvalsReceived++;
         RelationFlushRelation(relation);
     }
+   else
+   {
+       int         i;
+
+       for (i = 0; i < in_progress_list_len; i++)
+           if (in_progress_list[i].reloid == relationId)
+               in_progress_list[i].invalidated = true;
+   }
  }
  
  /*
@@ -2824,11 +2891,11 @@ RelationCacheInvalidateEntry(Oid relationId)
   *  and rebuild those with positive reference counts.  Also reset the smgr
   *  relation cache and re-read relation mapping data.
   *
- *  This is currently used only to recover from SI message buffer overflow,
- *  so we do not touch relations having new-in-transaction relfilenodes; they
- *  cannot be targets of cross-backend SI updates (and our own updates now go
- *  through a separate linked list that isn't limited by the SI message
- *  buffer size).
+ *  Apart from debug_discard_caches, this is currently used only to recover
+ *  from SI message buffer overflow, so we do not touch relations having
+ *  new-in-transaction relfilenodes; they cannot be targets of cross-backend
+ *  SI updates (and our own updates now go through a separate linked list
+ *  that isn't limited by the SI message buffer size).
   *
   *  We do this in two phases: the first pass deletes deletable items, and
   *  the second one rebuilds the rebuildable items.  This is essential for
@@ -2846,9 +2913,14 @@ RelationCacheInvalidateEntry(Oid relationId)
   *  second pass processes nailed-in-cache items before other nondeletable
   *  items.  This should ensure that system catalogs are up to date before
   *  we attempt to use them to reload information about other open relations.
+ *
+ *  After those two phases of work having immediate effects, we normally
+ *  signal any RelationBuildDesc() on the stack to start over.  However, we
+ *  don't do this if called as part of debug_discard_caches.  Otherwise,
+ *  RelationBuildDesc() would become an infinite loop.
   */
  void
-RelationCacheInvalidate(void)
+RelationCacheInvalidate(bool debug_discard)
  {
     HASH_SEQ_STATUS status;
     RelIdCacheEnt *idhentry;
@@ -2856,6 +2928,7 @@ RelationCacheInvalidate(void)
     List       *rebuildFirstList = NIL;
     List       *rebuildList = NIL;
     ListCell   *l;
+   int         i;
  
     /*
      * Reload relation mapping data before starting to reconstruct cache.
@@ -2942,6 +3015,11 @@ RelationCacheInvalidate(void)
         RelationClearRelation(relation, true);
     }
     list_free(rebuildList);
+
+   if (!debug_discard)
+       /* Any RelationBuildDesc() on the stack must start over. */
+       for (i = 0; i < in_progress_list_len; i++)
+           in_progress_list[i].invalidated = true;
  }
  
  /*
@@ -3092,6 +3170,13 @@ AtEOXact_RelationCache(bool isCommit)
     RelIdCacheEnt *idhentry;
     int         i;
  
+   /*
+    * Forget in_progress_list.  This is relevant when we're aborting due to
+    * an error during RelationBuildDesc().
+    */
+   Assert(in_progress_list_len == 0 || !isCommit);
+   in_progress_list_len = 0;
+
     /*
      * Unless the eoxact_list[] overflowed, we only need to examine the rels
      * listed in it.  Otherwise fall back on a hash_seq_search scan.
@@ -3238,6 +3323,14 @@ AtEOSubXact_RelationCache(bool isCommit, SubTransactionId mySubid,
     RelIdCacheEnt *idhentry;
     int         i;
  
+   /*
+    * Forget in_progress_list.  This is relevant when we're aborting due to
+    * an error during RelationBuildDesc().  We don't commit subtransactions
+    * during RelationBuildDesc().
+    */
+   Assert(in_progress_list_len == 0 || !isCommit);
+   in_progress_list_len = 0;
+
     /*
      * Unless the eoxact_list[] overflowed, we only need to examine the rels
      * listed in it.  Otherwise fall back on a hash_seq_search scan.  Same
@@ -3786,6 +3879,7 @@ void
  RelationCacheInitialize(void)
  {
     HASHCTL     ctl;
+   int         allocsize;
  
     /*
      * make sure cache memory context exists
@@ -3802,6 +3896,15 @@ RelationCacheInitialize(void)
     RelationIdCache = hash_create("Relcache by OID", INITRELCACHESIZE,
                                   &ctl, HASH_ELEM | HASH_BLOBS);
  
+   /*
+    * reserve enough in_progress_list slots for many cases
+    */
+   allocsize = 4;
+   in_progress_list =
+       MemoryContextAlloc(CacheMemoryContext,
+                          allocsize * sizeof(*in_progress_list));
+   in_progress_list_maxlen = allocsize;
+
     /*
      * relation mapper needs to be initialized too
      */
diff --git a/src/include/utils/inval.h b/src/include/utils/inval.h

index bc5081cf7210b7b3c36a77c27cafc747bf6d77dd..4c6b86c9610c5fda6abbccac9feeeaade6e16ef2 100644 (file)
--- a/src/include/utils/inval.h
+++ b/src/include/utils/inval.h
@@ -61,4 +61,5 @@ extern void CacheRegisterRelcacheCallback(RelcacheCallbackFunction func,
  extern void CallSyscacheCallbacks(int cacheid, uint32 hashvalue);
  
  extern void InvalidateSystemCaches(void);
+extern void InvalidateSystemCachesExtended(bool debug_discard);
  #endif                         /* INVAL_H */
diff --git a/src/include/utils/relcache.h b/src/include/utils/relcache.h

index a432a407b08038f52e7146eea9d6c59e172c315f..ff478c3ba7f2baa98a9bb0cb693170b2aaa2ec70 100644 (file)
--- a/src/include/utils/relcache.h
+++ b/src/include/utils/relcache.h
@@ -120,7 +120,7 @@ extern void RelationForgetRelation(Oid rid);
  
  extern void RelationCacheInvalidateEntry(Oid relationId);
  
-extern void RelationCacheInvalidate(void);
+extern void RelationCacheInvalidate(bool debug_discard);
  
  extern void RelationCloseSmgrByOid(Oid relationId);
  
diff --git a/src/test/perl/PostgresNode.pm b/src/test/perl/PostgresNode.pm

index 52f552be85b13059d4d581faa257b1fbdfb93542..cbcb74866b9016a0efcc5ace6dd450e3952604b0 100644 (file)
--- a/src/test/perl/PostgresNode.pm
+++ b/src/test/perl/PostgresNode.pm
@@ -1638,6 +1638,141 @@ sub interactive_psql
     return $harness;
  }
  
+# Common sub of pgbench-invoking interfaces.  Makes any requested script files
+# and returns pgbench command-line options causing use of those files.
+sub _pgbench_make_files
+{
+   my ($self, $files) = @_;
+   my @file_opts;
+
+   if (defined $files)
+   {
+
+       # note: files are ordered for determinism
+       for my $fn (sort keys %$files)
+       {
+           my $filename = $self->basedir . '/' . $fn;
+           push @file_opts, '-f', $filename;
+
+           # cleanup file weight
+           $filename =~ s/\@\d+$//;
+
+           #push @filenames, $filename;
+           # filenames are expected to be unique on a test
+           if (-e $filename)
+           {
+               ok(0, "$filename must not already exist");
+               unlink $filename or die "cannot unlink $filename: $!";
+           }
+           TestLib::append_to_file($filename, $$files{$fn});
+       }
+   }
+
+   return @file_opts;
+}
+
+=pod
+
+=item $node->pgbench($opts, $stat, $out, $err, $name, $files, @args)
+
+Invoke B<pgbench>, with parameters and files.
+
+=over
+
+=item $opts
+
+Options as a string to be split on spaces.
+
+=item $stat
+
+Expected exit status.
+
+=item $out
+
+Reference to a regexp list that must match stdout.
+
+=item $err
+
+Reference to a regexp list that must match stderr.
+
+=item $name
+
+Name of test for error messages.
+
+=item $files
+
+Reference to filename/contents dictionary.
+
+=item @args
+
+Further raw options or arguments.
+
+=back
+
+=cut
+
+sub pgbench
+{
+   local $Test::Builder::Level = $Test::Builder::Level + 1;
+
+   my ($self, $opts, $stat, $out, $err, $name, $files, @args) = @_;
+   my @cmd = (
+       'pgbench',
+       split(/\s+/, $opts),
+       $self->_pgbench_make_files($files), @args);
+
+   $self->command_checks_all(\@cmd, $stat, $out, $err, $name);
+}
+
+=pod
+
+=item $node->background_pgbench($opts, $files, \$stdout, $timer) => harness
+
+Invoke B<pgbench> and return an IPC::Run harness object.  The process's stdin
+is empty, and its stdout and stderr go to the $stdout scalar reference.  This
+allows the caller to act on other parts of the system while B<pgbench> is
+running.  Errors from B<pgbench> are the caller's problem.
+
+The specified timer object is attached to the harness, as well.  It's the caller's
+responsibility to select the timeout length, and to restart the timer after
+each command if the timeout is per-command.
+
+Be sure to "finish" the harness when done with it.
+
+=over
+
+=item $opts
+
+Options as a string to be split on spaces.
+
+=item $files
+
+Reference to filename/contents dictionary.
+
+=back
+
+=cut
+
+sub background_pgbench
+{
+   my ($self, $opts, $files, $stdout, $timer) = @_;
+
+   my @cmd =
+     ('pgbench', split(/\s+/, $opts), $self->_pgbench_make_files($files));
+
+   local $ENV{PGHOST} = $self->host;
+   local $ENV{PGPORT} = $self->port;
+
+   my $stdin = "";
+   # IPC::Run would otherwise append to existing contents:
+   $$stdout = "" if ref($stdout);
+
+   my $harness = IPC::Run::start \@cmd, '<', \$stdin, '>', $stdout, '2>&1',
+     $timer;
+
+   return $harness;
+}
+
  =pod
  
  =item $node->poll_query_until($dbname, $query [, $expected ])
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list

index 918d14bcae29d18f057b7691a1ed2a9b99e8db92..50fdb17fa3f2313004d9c990a15383a270a402b6 100644 (file)
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -1044,6 +1044,7 @@ ImportForeignSchemaStmt
  ImportForeignSchemaType
  ImportForeignSchema_function
  ImportQual
+InProgressEnt
  IncludeWal
  InclusionOpaque
  IncrementVarSublevelsUp_context
author	Noah Misch
	Sun, 24 Oct 2021 01:36:38 +0000 (18:36 -0700)
committer	Noah Misch
	Sun, 24 Oct 2021 01:36:42 +0000 (18:36 -0700)
contrib/amcheck/Makefile		patch \| blob \| blame \| history
contrib/amcheck/t/002_cic.pl	[new file with mode: 0644]	patch \| blob
src/backend/utils/cache/inval.c		patch \| blob \| blame \| history
src/backend/utils/cache/relcache.c		patch \| blob \| blame \| history
src/include/utils/inval.h		patch \| blob \| blame \| history
src/include/utils/relcache.h		patch \| blob \| blame \| history
src/test/perl/PostgresNode.pm		patch \| blob \| blame \| history
src/tools/pgindent/typedefs.list		patch \| blob \| blame \| history