lwlock: Fix quadratic behavior with very long wait lists
author Andres Freund
Sun, 20 Nov 2022 19:56:32 +0000 (11:56 -0800)
committer Michael Paquier
Thu, 18 Jan 2024 02:12:43 +0000 (11:12 +0900)
Until now LWLockDequeueSelf() sequentially searched the list of waiters to see
if the current proc is still on the list of waiters, or has already been
removed. In extreme workloads, where the wait lists are very long, this leads
to quadratic behavior: #backends processes each iterating over a list that is
#backends long. Additionally, the likelihood of needing to call LWLockDequeueSelf() in
the first place also increases with the increased length of the wait queue, as
it becomes more likely that a lock is released while waiting for the wait list
lock, which is held for longer during lock release.

Due to the exponential back-off in perform_spin_delay() this is surprisingly
hard to detect. We should make that easier, e.g. by adding a wait event around
the pg_usleep() - but that's a separate patch.

The fix is simple - track in PGPROC->lwWaiting whether a proc is currently
waiting in the wait list, or already removed but waiting to be woken up.

In some workloads with a lot of clients contending for a small number of
lwlocks (e.g. WALWriteLock), the fix can substantially increase throughput.

This was originally fixed for 16~ with a4adc31f6902 without a
backpatch, and we have since heard complaints from users impacted by this
quadratic behavior in older versions as well.

Author: Andres Freund 
Reviewed-by: Bharath Rupireddy
Discussion: https://postgr.es/m/20221027165914[email protected]
Discussion: https://postgr.es/m/CALj2ACXktNbG=K8Xi7PSqbofTZozavhaxjatVc14iYaLu4Maag@mail.gmail.com
Backpatch-through: 12

src/backend/access/transam/twophase.c
src/backend/storage/lmgr/lwlock.c
src/backend/storage/lmgr/proc.c
src/include/storage/lwlock.h
src/include/storage/proc.h

index d375b1012beb9c0a0cbd13b39786326e43d36358..f8272c2f0d04a8af2b94bd603f2b7f071c0cdcd8 100644 (file)
@@ -484,7 +484,7 @@ MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, const char *gid,
    proc->roleId = owner;
    proc->tempNamespaceId = InvalidOid;
    proc->isBackgroundWorker = false;
-   proc->lwWaiting = false;
+   proc->lwWaiting = LW_WS_NOT_WAITING;
    proc->lwWaitMode = 0;
    proc->waitLock = NULL;
    proc->waitProcLock = NULL;
index db89137c033df9e4237c77208483676f7ab7b029..49423ea4d22b8da22b3cd12299a37b7658a2cf7b 100644 (file)
@@ -998,6 +998,15 @@ LWLockWakeup(LWLock *lock)
            wokeup_somebody = true;
        }
 
+       /*
+        * Signal that the process isn't on the wait list anymore. This allows
+        * LWLockDequeueSelf() to remove itself from the waitlist with a
+        * proclist_delete(), rather than having to check if it has been
+        * removed from the list.
+        */
+       Assert(waiter->lwWaiting == LW_WS_WAITING);
+       waiter->lwWaiting = LW_WS_PENDING_WAKEUP;
+
        /*
         * Once we've woken up an exclusive lock, there's no point in waking
         * up anybody else.
@@ -1055,7 +1064,7 @@ LWLockWakeup(LWLock *lock)
         * another lock.
         */
        pg_write_barrier();
-       waiter->lwWaiting = false;
+       waiter->lwWaiting = LW_WS_NOT_WAITING;
        PGSemaphoreUnlock(waiter->sem);
    }
 }
@@ -1076,7 +1085,7 @@ LWLockQueueSelf(LWLock *lock, LWLockMode mode)
    if (MyProc == NULL)
        elog(PANIC, "cannot wait without a PGPROC structure");
 
-   if (MyProc->lwWaiting)
+   if (MyProc->lwWaiting != LW_WS_NOT_WAITING)
        elog(PANIC, "queueing for lock while waiting on another one");
 
    LWLockWaitListLock(lock);
@@ -1084,7 +1093,7 @@ LWLockQueueSelf(LWLock *lock, LWLockMode mode)
    /* setting the flag is protected by the spinlock */
    pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_HAS_WAITERS);
 
-   MyProc->lwWaiting = true;
+   MyProc->lwWaiting = LW_WS_WAITING;
    MyProc->lwWaitMode = mode;
 
    /* LW_WAIT_UNTIL_FREE waiters are always at the front of the queue */
@@ -1112,8 +1121,7 @@ LWLockQueueSelf(LWLock *lock, LWLockMode mode)
 static void
 LWLockDequeueSelf(LWLock *lock)
 {
-   bool        found = false;
-   proclist_mutable_iter iter;
+   bool        on_waitlist;
 
 #ifdef LWLOCK_STATS
    lwlock_stats *lwstats;
@@ -1126,18 +1134,13 @@ LWLockDequeueSelf(LWLock *lock)
    LWLockWaitListLock(lock);
 
    /*
-    * Can't just remove ourselves from the list, but we need to iterate over
-    * all entries as somebody else could have dequeued us.
+    * Remove ourselves from the waitlist, unless we've already been
+    * removed. The removal happens with the wait list lock held, so there's
+    * no race in this check.
     */
-   proclist_foreach_modify(iter, &lock->waiters, lwWaitLink)
-   {
-       if (iter.cur == MyProc->pgprocno)
-       {
-           found = true;
-           proclist_delete(&lock->waiters, iter.cur, lwWaitLink);
-           break;
-       }
-   }
+   on_waitlist = MyProc->lwWaiting == LW_WS_WAITING;
+   if (on_waitlist)
+       proclist_delete(&lock->waiters, MyProc->pgprocno, lwWaitLink);
 
    if (proclist_is_empty(&lock->waiters) &&
        (pg_atomic_read_u32(&lock->state) & LW_FLAG_HAS_WAITERS) != 0)
@@ -1149,8 +1152,8 @@ LWLockDequeueSelf(LWLock *lock)
    LWLockWaitListUnlock(lock);
 
    /* clear waiting state again, nice for debugging */
-   if (found)
-       MyProc->lwWaiting = false;
+   if (on_waitlist)
+       MyProc->lwWaiting = LW_WS_NOT_WAITING;
    else
    {
        int         extraWaits = 0;
@@ -1174,7 +1177,7 @@ LWLockDequeueSelf(LWLock *lock)
        for (;;)
        {
            PGSemaphoreLock(MyProc->sem);
-           if (!MyProc->lwWaiting)
+           if (MyProc->lwWaiting == LW_WS_NOT_WAITING)
                break;
            extraWaits++;
        }
@@ -1325,7 +1328,7 @@ LWLockAcquire(LWLock *lock, LWLockMode mode)
        for (;;)
        {
            PGSemaphoreLock(proc->sem);
-           if (!proc->lwWaiting)
+           if (proc->lwWaiting == LW_WS_NOT_WAITING)
                break;
            extraWaits++;
        }
@@ -1490,7 +1493,7 @@ LWLockAcquireOrWait(LWLock *lock, LWLockMode mode)
            for (;;)
            {
                PGSemaphoreLock(proc->sem);
-               if (!proc->lwWaiting)
+               if (proc->lwWaiting == LW_WS_NOT_WAITING)
                    break;
                extraWaits++;
            }
@@ -1706,7 +1709,7 @@ LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval)
        for (;;)
        {
            PGSemaphoreLock(proc->sem);
-           if (!proc->lwWaiting)
+           if (proc->lwWaiting == LW_WS_NOT_WAITING)
                break;
            extraWaits++;
        }
@@ -1787,6 +1790,10 @@ LWLockUpdateVar(LWLock *lock, uint64 *valptr, uint64 val)
 
        proclist_delete(&lock->waiters, iter.cur, lwWaitLink);
        proclist_push_tail(&wakeup, iter.cur, lwWaitLink);
+
+       /* see LWLockWakeup() */
+       Assert(waiter->lwWaiting == LW_WS_WAITING);
+       waiter->lwWaiting = LW_WS_PENDING_WAKEUP;
    }
 
    /* We are done updating shared state of the lock itself. */
@@ -1802,7 +1809,7 @@ LWLockUpdateVar(LWLock *lock, uint64 *valptr, uint64 val)
        proclist_delete(&wakeup, iter.cur, lwWaitLink);
        /* check comment in LWLockWakeup() about this barrier */
        pg_write_barrier();
-       waiter->lwWaiting = false;
+       waiter->lwWaiting = LW_WS_NOT_WAITING;
        PGSemaphoreUnlock(waiter->sem);
    }
 }
index 199c0eae078cae0cd4aba040ce8e71a064770853..81a2025eb507317c0aac8d3062bc45d5747e291e 100644 (file)
@@ -402,7 +402,7 @@ InitProcess(void)
    /* NB -- autovac launcher intentionally does not set IS_AUTOVACUUM */
    if (IsAutoVacuumWorkerProcess())
        MyPgXact->vacuumFlags |= PROC_IS_AUTOVACUUM;
-   MyProc->lwWaiting = false;
+   MyProc->lwWaiting = LW_WS_NOT_WAITING;
    MyProc->lwWaitMode = 0;
    MyProc->waitLock = NULL;
    MyProc->waitProcLock = NULL;
@@ -582,7 +582,7 @@ InitAuxiliaryProcess(void)
    MyProc->delayChkpt = false;
    MyProc->delayChkptEnd = false;
    MyPgXact->vacuumFlags = 0;
-   MyProc->lwWaiting = false;
+   MyProc->lwWaiting = LW_WS_NOT_WAITING;
    MyProc->lwWaitMode = 0;
    MyProc->waitLock = NULL;
    MyProc->waitProcLock = NULL;
index cdbfbed118375e0ac4ad23553e6019fadfc29fb0..a1197a8007814f45ef3f26cf76739752e99d9513 100644 (file)
 
 struct PGPROC;
 
+/* which state of the lwlock wait process a backend is in (stored in PGPROC->lwWaiting) */
+typedef enum LWLockWaitState
+{
+   LW_WS_NOT_WAITING, /* not currently waiting / already woken up */
+   LW_WS_WAITING, /* currently waiting, i.e. queued on a lock's wait list */
+   LW_WS_PENDING_WAKEUP /* removed from waitlist, but not yet signalled via its semaphore */
+} LWLockWaitState;
+
 /*
  * Code outside of lwlock.c should not manipulate the contents of this
  * structure directly, but we have to declare it here to allow LWLocks to be
index 7c85b5645b76d8b5e88736b5e926ca7233c80b90..ffb185d3df5df5e2c99aef323a599fc712bf6365 100644 (file)
@@ -140,7 +140,7 @@ struct PGPROC
    bool        recoveryConflictPending;
 
    /* Info about LWLock the process is currently waiting for, if any. */
-   bool        lwWaiting;      /* true if waiting for an LW lock */
+   uint8       lwWaiting;      /* see LWLockWaitState */
    uint8       lwWaitMode;     /* lwlock mode being waited for */
    proclist_node lwWaitLink;   /* position in LW lock wait list */