Add logging for excessive ProcSignalBarrier waits.
author Thomas Munro
Wed, 11 May 2022 06:03:03 +0000 (18:03 +1200)
committer Thomas Munro
Wed, 11 May 2022 06:03:03 +0000 (18:03 +1200)
To enable diagnosis of systems that are not processing ProcSignalBarrier
requests promptly, add a LOG message every 5 seconds if we seem to be
wedged.  Although you could already see this state as a wait event in
pg_stat_activity, the log message also shows the PID of the process that
is preventing progress.

Also add DEBUG1 logging around the whole wait loop.

Reviewed-by: Robert Haas
Discussion: https://postgr.es/m/CA%2BTgmoYJ03r5359gQutRGP9BtigYCg3_UskcmnVjBf-QO3-0pQ%40mail.gmail.com

src/backend/storage/ipc/procsignal.c

index 00d66902d8bfd8aa1ece8e456c2136aa1b6ffe5a..21a9fc0fdd2edc18d7160d1500b4e86994742383 100644 (file)
@@ -393,6 +393,11 @@ WaitForProcSignalBarrier(uint64 generation)
 {
    Assert(generation <= pg_atomic_read_u64(&ProcSignal->psh_barrierGeneration));
 
+   elog(DEBUG1,
+        "waiting for all backends to process ProcSignalBarrier generation "
+        UINT64_FORMAT,
+        generation);
+
    for (int i = NumProcSignalSlots - 1; i >= 0; i--)
    {
        ProcSignalSlot *slot = &ProcSignal->psh_slot[i];
@@ -407,13 +412,22 @@ WaitForProcSignalBarrier(uint64 generation)
        oldval = pg_atomic_read_u64(&slot->pss_barrierGeneration);
        while (oldval < generation)
        {
-           ConditionVariableSleep(&slot->pss_barrierCV,
-                                  WAIT_EVENT_PROC_SIGNAL_BARRIER);
+           if (ConditionVariableTimedSleep(&slot->pss_barrierCV,
+                                           5000,
+                                           WAIT_EVENT_PROC_SIGNAL_BARRIER))
+               ereport(LOG,
+                       (errmsg("still waiting for backend with PID %lu to accept ProcSignalBarrier",
+                               (unsigned long) slot->pss_pid)));
            oldval = pg_atomic_read_u64(&slot->pss_barrierGeneration);
        }
        ConditionVariableCancelSleep();
    }
 
+   elog(DEBUG1,
+        "finished waiting for all backends to process ProcSignalBarrier generation "
+        UINT64_FORMAT,
+        generation);
+
    /*
     * The caller is probably calling this function because it wants to read
     * the shared state or perform further writes to shared state once all