Divide the lock manager's shared state into 'partitions', so as to reduce contention for the former single LockMgrLock.

author Tom Lane
Sun, 11 Dec 2005 21:02:18 +0000 (21:02 +0000)
committer Tom Lane
Sun, 11 Dec 2005 21:02:18 +0000 (21:02 +0000)
diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c

index ffdee8388b3a96e851be2ddb034bff473fbf549c..0898df623375e62a0b9e734e59d6a0cb7d1d8f70 100644 (file)
--- a/src/backend/access/transam/twophase.c
+++ b/src/backend/access/transam/twophase.c
@@ -7,7 +7,7 @@
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
- *     $PostgreSQL: pgsql/src/backend/access/transam/twophase.c,v 1.17 2005/11/22 18:17:07 momjian Exp $
+ *     $PostgreSQL: pgsql/src/backend/access/transam/twophase.c,v 1.18 2005/12/11 21:02:17 tgl Exp $
   *
   * NOTES
   *     Each global transaction is associated with a global transaction
@@ -284,7 +284,8 @@ MarkAsPreparing(TransactionId xid, const char *gid,
     gxact->proc.lwWaitLink = NULL;
     gxact->proc.waitLock = NULL;
     gxact->proc.waitProcLock = NULL;
-   SHMQueueInit(&(gxact->proc.procLocks));
+   for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
+       SHMQueueInit(&(gxact->proc.myProcLocks[i]));
     /* subxid data must be filled later by GXactLoadSubxactData */
     gxact->proc.subxids.overflowed = false;
     gxact->proc.subxids.nxids = 0;
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c

index 7ac8084f6a33fc88e4b2372e0629e50cca104795..cafadeb90542d3167ac3c2669fb656ffdb54a19b 100644 (file)
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -14,8 +14,8 @@
   *
   * The process array now also includes PGPROC structures representing
   * prepared transactions.  The xid and subxids fields of these are valid,
- * as is the procLocks list.  They can be distinguished from regular backend
- * PGPROCs at need by checking for pid == 0.
+ * as are the myProcLocks lists.  They can be distinguished from regular
+ * backend PGPROCs at need by checking for pid == 0.
   *
   *
   * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
@@ -23,7 +23,7 @@
   *
   *
   * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/storage/ipc/procarray.c,v 1.8 2005/11/22 18:17:20 momjian Exp $
+ *   $PostgreSQL: pgsql/src/backend/storage/ipc/procarray.c,v 1.9 2005/12/11 21:02:18 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
diff --git a/src/backend/storage/lmgr/README b/src/backend/storage/lmgr/README

index 25820f4b73d9e418713a1d85a78ec6273b34e261..fdda5bf82a44d2e3c30fbf8c550a121adabde367 100644 (file)
--- a/src/backend/storage/lmgr/README
+++ b/src/backend/storage/lmgr/README
@@ -1,4 +1,4 @@
-$PostgreSQL: pgsql/src/backend/storage/lmgr/README,v 1.18 2005/12/09 01:22:04 tgl Exp $
+$PostgreSQL: pgsql/src/backend/storage/lmgr/README,v 1.19 2005/12/11 21:02:18 tgl Exp $
  
  
  LOCKING OVERVIEW
@@ -50,9 +50,12 @@ LOCK DATA STRUCTURES
  Lock methods describe the overall locking behavior.  Currently there are
  two lock methods: DEFAULT and USER.  (USER locks are non-blocking.)
  
-Lock modes describe the type of the lock (read/write or shared/exclusive). 
-See src/tools/backend/index.html and src/include/storage/lock.h for more
-details.
+Lock modes describe the type of the lock (read/write or shared/exclusive).
+In principle, each lock method can have its own set of lock modes with
+different conflict rules, but currently DEFAULT and USER methods use
+identical lock mode sets.  See src/tools/backend/index.html and
+src/include/storage/lock.h for more details.  (Lock modes are also called
+lock types in some places in the code and documentation.)
  
  There are two fundamental lock structures in shared memory: the
  per-lockable-object LOCK struct, and the per-lock-and-requestor PROCLOCK
@@ -67,7 +70,7 @@ be made per lockable object/lock mode/backend.  Internally to a backend,
  however, the same lock may be requested and perhaps released multiple times
  in a transaction, and it can also be held both transactionally and session-
  wide.  The internal request counts are held in LOCALLOCK so that the shared
-LockMgrLock need not be obtained to alter them.
+data structures need not be accessed to alter them.
  
  ---------------------------------------------------------------------------
  
@@ -103,10 +106,10 @@ procLocks -
      be waiting for more!).
  
  waitProcs -
-    This is a shared memory queue of all process structures corresponding to
-    a backend that is waiting (sleeping) until another backend releases this
+    This is a shared memory queue of all PGPROC structures corresponding to
+    backends that are waiting (sleeping) until another backend releases this
      lock.  The process structure holds the information needed to determine
-    if it should be woken up when this lock is released.
+    if it should be woken up when the lock is released.
  
  nRequested -
      Keeps a count of how many times this lock has been attempted to be
@@ -131,12 +134,12 @@ nGranted -
  granted -
      Keeps count of how many locks of each type are currently held.  Once again
      only elements 1 through MAX_LOCKMODES-1 are used (0 is not).  Also, like
-    requested, summing the values of granted should total to the value
+    requested[], summing the values of granted[] should total to the value
      of nGranted.
  
  We should always have 0 <= nGranted <= nRequested, and
-0 <= granted[i] <= requested[i] for each i.  If the request counts go to
-zero, the lock object is no longer needed and can be freed.
+0 <= granted[i] <= requested[i] for each i.  When all the request counts
+go to zero, the LOCK object is no longer needed and can be freed.
  
  ---------------------------------------------------------------------------
  
@@ -154,15 +157,16 @@ tag -
          SHMEM offset of PGPROC of backend process that owns this PROCLOCK.
  
  holdMask -
-    A bitmask for the lock types successfully acquired by this PROCLOCK.
+    A bitmask for the lock modes successfully acquired by this PROCLOCK.
      This should be a subset of the LOCK object's grantMask, and also a
-    subset of the PGPROC object's heldLocks mask.
+    subset of the PGPROC object's heldLocks mask (if the PGPROC is
+    currently waiting for another lock mode on this lock).
  
  releaseMask -
-    A bitmask for the lock types due to be released during LockReleaseAll.
+    A bitmask for the lock modes due to be released during LockReleaseAll.
      This must be a subset of the holdMask.  Note that it is modified without
-    taking the LockMgrLock, and therefore it is unsafe for any backend except
-    the one owning the PROCLOCK to examine/change it.
+    taking the partition LWLock, and therefore it is unsafe for any
+    backend except the one owning the PROCLOCK to examine/change it.
  
  lockLink -
      List link for shared memory queue of all the PROCLOCK objects for the
@@ -174,7 +178,60 @@ procLink -
  
  ---------------------------------------------------------------------------
  
-The deadlock detection algorithm:
+
+LOCK MANAGER INTERNAL LOCKING
+
+Before PostgreSQL 8.2, all of the shared-memory data structures used by
+the lock manager were protected by a single LWLock, the LockMgrLock;
+any operation involving these data structures had to exclusively lock
+LockMgrLock.  Not too surprisingly, this became a contention bottleneck.
+To reduce contention, the lock manager's data structures have been split
+into multiple "partitions", each protected by an independent LWLock.
+Most operations only need to lock the single partition they are working in.
+Here are the details:
+
+* Each possible lock is assigned to one partition according to a hash of
+its LOCKTAG value (see LockTagToPartition()).  The partition's LWLock is
+considered to protect all the LOCK objects of that partition as well as
+their subsidiary PROCLOCKs.  The shared-memory hash tables for LOCKs and
+PROCLOCKs are divided into separate hash tables for each partition, and
+operations on each hash table are likewise protected by the partition
+lock.
+
+* Formerly, each PGPROC had a single list of PROCLOCKs belonging to it.
+This has now been split into per-partition lists, so that access to a
+particular PROCLOCK list can be protected by the associated partition's
+LWLock.  (This is not strictly necessary at the moment, because at this
+writing a PGPROC's PROCLOCK list is only accessed by the owning backend
+anyway.  But it seems forward-looking to maintain a convention for how
+other backends could access it.  In any case LockReleaseAll needs to be
+able to quickly determine which partition each LOCK belongs to, and
+for the currently contemplated number of partitions, this way takes less
+shared memory than explicitly storing a partition number in LOCK structs
+would require.)
+
+* The other lock-related fields of a PGPROC are only interesting when
+the PGPROC is waiting for a lock, so we consider that they are protected
+by the partition LWLock of the awaited lock.
+
+For normal lock acquisition and release, it is sufficient to lock the
+partition containing the desired lock.  Deadlock checking needs to touch
+multiple partitions in general; for simplicity, we just make it lock all
+the partitions in partition-number order.  (To prevent LWLock deadlock,
+we establish the rule that any backend needing to lock more than one
+partition at once must lock them in partition-number order.)  It's
+possible that deadlock checking could be done without touching every
+partition in typical cases, but since in a properly functioning system
+deadlock checking should not occur often enough to be performance-critical,
+trying to make this work does not seem a productive use of effort.
+
+A backend's internal LOCALLOCK hash table is not partitioned.  We do store
+the partition number in LOCALLOCK table entries, but this is a straight
+speed-for-space tradeoff: we could instead recalculate the partition
+number from the LOCKTAG when needed.
+
+
+THE DEADLOCK DETECTION ALGORITHM
  
  Since we allow user transactions to request locks in any order, deadlock
  is possible.  We use a deadlock detection/breaking algorithm that is
diff --git a/src/backend/storage/lmgr/deadlock.c b/src/backend/storage/lmgr/deadlock.c

index adbd373bb7f00642d2ad9ac641e2a0210b0b3682..e72ab00b5b03a2eb36efb8ecc70662c51d4c2125 100644 (file)
--- a/src/backend/storage/lmgr/deadlock.c
+++ b/src/backend/storage/lmgr/deadlock.c
@@ -12,7 +12,7 @@
   *
   *
   * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/storage/lmgr/deadlock.c,v 1.37 2005/12/09 01:22:04 tgl Exp $
+ *   $PostgreSQL: pgsql/src/backend/storage/lmgr/deadlock.c,v 1.38 2005/12/11 21:02:18 tgl Exp $
   *
   * Interface:
   *
@@ -53,9 +53,9 @@ typedef struct
   * Information saved about each edge in a detected deadlock cycle. This
   * is used to print a diagnostic message upon failure.
   *
- * Note: because we want to examine this info after releasing the LockMgrLock,
- * we can't just store LOCK and PGPROC pointers; we must extract out all the
- * info we want to be able to print.
+ * Note: because we want to examine this info after releasing the lock
+ * manager's partition locks, we can't just store LOCK and PGPROC pointers;
+ * we must extract out all the info we want to be able to print.
   */
  typedef struct
  {
@@ -188,19 +188,11 @@ InitDeadLockChecking(void)
   * deadlock.  If resolution is impossible, return TRUE --- the caller
   * is then expected to abort the given proc's transaction.
   *
- * We can't block on user locks, so no sense testing for deadlock
- * because there is no blocking, and no timer for the block.  So,
- * only look at regular locks.
- *
- * We must have already locked the master lock before being called.
- * NOTE: although the lockmethod structure appears to allow each lock
- * table to have a different masterLock, all locks that can block had
- * better use the same LWLock, else this code will not be adequately
- * interlocked!
+ * Caller must already have locked all partitions of the lock tables.
   *
   * On failure, deadlock details are recorded in deadlockDetails[] for
   * subsequent printing by DeadLockReport().  That activity is separate
- * because we don't want to do it while holding the master lock.
+ * because we don't want to do it while holding all those LWLocks.
   */
  bool
  DeadLockCheck(PGPROC *proc)
diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c

index 344d677cd2f29104e2e20f27370fe54b577202e8..7f42b477cc607f6f960e61c8874dd15a5b9678d6 100644 (file)
--- a/src/backend/storage/lmgr/lock.c
+++ b/src/backend/storage/lmgr/lock.c
@@ -1,14 +1,14 @@
  /*-------------------------------------------------------------------------
   *
   * lock.c
- *   POSTGRES low-level lock mechanism
+ *   POSTGRES primary lock mechanism
   *
   * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   *
   * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/storage/lmgr/lock.c,v 1.161 2005/12/09 01:22:04 tgl Exp $
+ *   $PostgreSQL: pgsql/src/backend/storage/lmgr/lock.c,v 1.162 2005/12/11 21:02:18 tgl Exp $
   *
   * NOTES
   *   A lock table is a shared memory hash table.  When
@@ -163,10 +163,13 @@ typedef struct TwoPhaseLockRecord
  
  
  /*
- * Links to hash tables containing lock state
+ * Pointers to hash tables containing lock state
+ *
+ * The LockMethodLockHash and LockMethodProcLockHash hash tables are in
+ * shared memory; LockMethodLocalHash is local to each backend.
   */
-static HTAB *LockMethodLockHash;
-static HTAB *LockMethodProcLockHash;
+static HTAB *LockMethodLockHash[NUM_LOCK_PARTITIONS];
+static HTAB *LockMethodProcLockHash[NUM_LOCK_PARTITIONS];
  static HTAB *LockMethodLocalHash;
  
  
@@ -255,16 +258,25 @@ PROCLOCK_PRINT(const char *where, const PROCLOCK *proclockP)
  
  static void RemoveLocalLock(LOCALLOCK *locallock);
  static void GrantLockLocal(LOCALLOCK *locallock, ResourceOwner owner);
-static void WaitOnLock(LOCKMETHODID lockmethodid, LOCALLOCK *locallock,
-          ResourceOwner owner);
+static void WaitOnLock(LOCALLOCK *locallock, ResourceOwner owner);
  static bool UnGrantLock(LOCK *lock, LOCKMODE lockmode,
             PROCLOCK *proclock, LockMethod lockMethodTable);
-static void CleanUpLock(LOCKMETHODID lockmethodid, LOCK *lock,
-           PROCLOCK *proclock, bool wakeupNeeded);
+static void CleanUpLock(LOCK *lock, PROCLOCK *proclock,
+           LockMethod lockMethodTable, int partition,
+           bool wakeupNeeded);
  
  
  /*
- * InitLocks -- Initialize the lock module's shared memory.
+ * InitLocks -- Initialize the lock manager's data structures.
+ *
+ * This is called from CreateSharedMemoryAndSemaphores(), which see for
+ * more comments.  In the normal postmaster case, the shared hash tables
+ * are created here, as well as a locallock hash table that will remain
+ * unused and empty in the postmaster itself.  Backends inherit the pointers
+ * to the shared tables via fork(), and also inherit an image of the locallock
+ * hash table, which they proceed to use.  In the EXEC_BACKEND case, each
+ * backend re-executes this code to obtain pointers to the already existing
+ * shared hash tables and to create its locallock hash table.
   */
  void
  InitLocks(void)
@@ -274,13 +286,18 @@ InitLocks(void)
     int         hash_flags;
     long        init_table_size,
                 max_table_size;
+   int         i;
  
-   /* Compute init/max size to request for lock hashtables */
+   /*
+    * Compute init/max size to request for lock hashtables.  Note these
+    * calculations must agree with LockShmemSize!
+    */
     max_table_size = NLOCKENTS();
+   max_table_size = (max_table_size - 1) / NUM_LOCK_PARTITIONS + 1;
     init_table_size = max_table_size / 2;
  
     /*
-    * allocate a hash table for LOCK structs.  This is used to store
+    * Allocate hash tables for LOCK structs.  These are used to store
      * per-locked-object information.
      */
     MemSet(&info, 0, sizeof(info));
@@ -289,37 +306,45 @@ InitLocks(void)
     info.hash = tag_hash;
     hash_flags = (HASH_ELEM | HASH_FUNCTION);
  
-   sprintf(shmemName, "LOCK hash");
-   LockMethodLockHash = ShmemInitHash(shmemName,
-                                      init_table_size,
-                                      max_table_size,
-                                      &info,
-                                      hash_flags);
+   for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
+   {
+       sprintf(shmemName, "LOCK hash %d", i);
+       LockMethodLockHash[i] = ShmemInitHash(shmemName,
+                                             init_table_size,
+                                             max_table_size,
+                                             &info,
+                                             hash_flags);
+       if (!LockMethodLockHash[i])
+           elog(FATAL, "could not initialize lock table \"%s\"", shmemName);
+   }
  
-   if (!LockMethodLockHash)
-       elog(FATAL, "could not initialize lock table \"%s\"", shmemName);
+   /* Assume an average of 2 holders per lock */
+   max_table_size *= 2;
+   init_table_size *= 2;
  
     /*
-    * allocate a hash table for PROCLOCK structs.  This is used to store
-    * per-lock-holder information.
+    * Allocate hash tables for PROCLOCK structs.  These are used to store
+    * per-lock-per-holder information.
      */
     info.keysize = sizeof(PROCLOCKTAG);
     info.entrysize = sizeof(PROCLOCK);
     info.hash = tag_hash;
     hash_flags = (HASH_ELEM | HASH_FUNCTION);
  
-   sprintf(shmemName, "PROCLOCK hash");
-   LockMethodProcLockHash = ShmemInitHash(shmemName,
-                                          init_table_size,
-                                          max_table_size,
-                                          &info,
-                                          hash_flags);
-
-   if (!LockMethodProcLockHash)
-       elog(FATAL, "could not initialize lock table \"%s\"", shmemName);
+   for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
+   {
+       sprintf(shmemName, "PROCLOCK hash %d", i);
+       LockMethodProcLockHash[i] = ShmemInitHash(shmemName,
+                                                 init_table_size,
+                                                 max_table_size,
+                                                 &info,
+                                                 hash_flags);
+       if (!LockMethodProcLockHash[i])
+           elog(FATAL, "could not initialize lock table \"%s\"", shmemName);
+   }
  
     /*
-    * allocate a non-shared hash table for LOCALLOCK structs.  This is used
+    * Allocate one non-shared hash table for LOCALLOCK structs.  This is used
      * to store lock counts and resource owner information.
      *
      * The non-shared table could already exist in this process (this occurs
@@ -355,6 +380,39 @@ GetLocksMethodTable(const LOCK *lock)
  }
  
  
+/*
+ * Given a LOCKTAG, return its partition number (0 .. NUM_LOCK_PARTITIONS-1).
+ *
+ * Basically what we want to do here is hash the locktag.  However, it
+ * seems unwise to use hash_any() because that is the same function that
+ * will be used to distribute the locks within each partition's hash table;
+ * if we use it, we run a big risk of having uneven distribution of hash
+ * codes within each hash table.  Instead, we XOR together all the bytes of
+ * the locktag and then fold that byte down to the partition-number width.
+ */
+int
+LockTagToPartition(const LOCKTAG *locktag)
+{
+   const uint8 *ptr = (const uint8 *) locktag;
+   int         result = 0;
+   int         i;
+
+   for (i = 0; i < sizeof(LOCKTAG); i++)
+       result ^= *ptr++;   /* result stays within 8 bits */
+#if NUM_LOCK_PARTITIONS == 16
+   result ^= result >> 4;  /* fold high nibble into low nibble */
+   result &= 0x0F;
+#elif NUM_LOCK_PARTITIONS == 4
+   result ^= result >> 4;  /* fold 8 bits to 4 ... */
+   result ^= result >> 2;  /* ... then 4 bits to 2 */
+   result &= 0x03;
+#else
+#error unsupported NUM_LOCK_PARTITIONS
+#endif
+   return result;
+}
+
+
  /*
   * LockAcquire -- Check for lock conflicts, sleep if conflict found,
   *     set lock if/when no conflicts.
@@ -397,7 +455,8 @@ LockAcquire(const LOCKTAG *locktag,
     PROCLOCKTAG proclocktag;
     bool        found;
     ResourceOwner owner;
-   LWLockId    masterLock;
+   int         partition;
+   LWLockId    partitionLock;
     int         status;
  
     if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods))
@@ -438,6 +497,7 @@ LockAcquire(const LOCKTAG *locktag,
         locallock->lock = NULL;
         locallock->proclock = NULL;
         locallock->isTempObject = isTempObject;
+       locallock->partition = LockTagToPartition(&(localtag.lock));
         locallock->nLocks = 0;
         locallock->numLockOwners = 0;
         locallock->maxLockOwners = 8;
@@ -474,9 +534,10 @@ LockAcquire(const LOCKTAG *locktag,
     /*
      * Otherwise we've got to mess with the shared lock table.
      */
-   masterLock = LockMgrLock;
+   partition = locallock->partition;
+   partitionLock = FirstLockMgrLock + partition;
  
-   LWLockAcquire(masterLock, LW_EXCLUSIVE);
+   LWLockAcquire(partitionLock, LW_EXCLUSIVE);
  
     /*
      * Find or create a lock with this tag.
@@ -486,12 +547,12 @@ LockAcquire(const LOCKTAG *locktag,
      * pointer is valid, since a lock object with no locks can go away
      * anytime.
      */
-   lock = (LOCK *) hash_search(LockMethodLockHash,
+   lock = (LOCK *) hash_search(LockMethodLockHash[partition],
                                 (void *) locktag,
                                 HASH_ENTER_NULL, &found);
     if (!lock)
     {
-       LWLockRelease(masterLock);
+       LWLockRelease(partitionLock);
         ereport(ERROR,
                 (errcode(ERRCODE_OUT_OF_MEMORY),
                  errmsg("out of shared memory"),
@@ -532,7 +593,7 @@ LockAcquire(const LOCKTAG *locktag,
     /*
      * Find or create a proclock entry with this tag
      */
-   proclock = (PROCLOCK *) hash_search(LockMethodProcLockHash,
+   proclock = (PROCLOCK *) hash_search(LockMethodProcLockHash[partition],
                                         (void *) &proclocktag,
                                         HASH_ENTER_NULL, &found);
     if (!proclock)
@@ -547,12 +608,12 @@ LockAcquire(const LOCKTAG *locktag,
              * anyone to release the lock object later.
              */
             Assert(SHMQueueEmpty(&(lock->procLocks)));
-           if (!hash_search(LockMethodLockHash,
+           if (!hash_search(LockMethodLockHash[partition],
                              (void *) &(lock->tag),
                              HASH_REMOVE, NULL))
                 elog(PANIC, "lock table corrupted");
         }
-       LWLockRelease(masterLock);
+       LWLockRelease(partitionLock);
         ereport(ERROR,
                 (errcode(ERRCODE_OUT_OF_MEMORY),
                  errmsg("out of shared memory"),
@@ -569,7 +630,8 @@ LockAcquire(const LOCKTAG *locktag,
         proclock->releaseMask = 0;
         /* Add proclock to appropriate lists */
         SHMQueueInsertBefore(&lock->procLocks, &proclock->lockLink);
-       SHMQueueInsertBefore(&MyProc->procLocks, &proclock->procLink);
+       SHMQueueInsertBefore(&(MyProc->myProcLocks[partition]),
+                            &proclock->procLink);
         PROCLOCK_PRINT("LockAcquire: new", proclock);
     }
     else
@@ -666,7 +728,7 @@ LockAcquire(const LOCKTAG *locktag,
             {
                 SHMQueueDelete(&proclock->lockLink);
                 SHMQueueDelete(&proclock->procLink);
-               if (!hash_search(LockMethodProcLockHash,
+               if (!hash_search(LockMethodProcLockHash[partition],
                                  (void *) &(proclock->tag),
                                  HASH_REMOVE, NULL))
                     elog(PANIC, "proclock table corrupted");
@@ -678,7 +740,7 @@ LockAcquire(const LOCKTAG *locktag,
             LOCK_PRINT("LockAcquire: conditional lock failed", lock, lockmode);
             Assert((lock->nRequested > 0) && (lock->requested[lockmode] >= 0));
             Assert(lock->nGranted <= lock->nRequested);
-           LWLockRelease(masterLock);
+           LWLockRelease(partitionLock);
             if (locallock->nLocks == 0)
                 RemoveLocalLock(locallock);
             return LOCKACQUIRE_NOT_AVAIL;
@@ -692,7 +754,7 @@ LockAcquire(const LOCKTAG *locktag,
         /*
          * Sleep till someone wakes me up.
          */
-       WaitOnLock(lockmethodid, locallock, owner);
+       WaitOnLock(locallock, owner);
  
         /*
          * NOTE: do not do any material change of state between here and
@@ -709,14 +771,14 @@ LockAcquire(const LOCKTAG *locktag,
             PROCLOCK_PRINT("LockAcquire: INCONSISTENT", proclock);
             LOCK_PRINT("LockAcquire: INCONSISTENT", lock, lockmode);
             /* Should we retry ? */
-           LWLockRelease(masterLock);
+           LWLockRelease(partitionLock);
             elog(ERROR, "LockAcquire failed");
         }
         PROCLOCK_PRINT("LockAcquire: granted", proclock);
         LOCK_PRINT("LockAcquire: granted", lock, lockmode);
     }
  
-   LWLockRelease(masterLock);
+   LWLockRelease(partitionLock);
  
     return LOCKACQUIRE_OK;
  }
@@ -894,11 +956,12 @@ UnGrantLock(LOCK *lock, LOCKMODE lockmode,
   * should be called after UnGrantLock, and wakeupNeeded is the result from
   * UnGrantLock.)
   *
- * The locktable's masterLock must be held at entry, and will be
+ * The lock table's partition lock must be held at entry, and will be
   * held at exit.
   */
  static void
-CleanUpLock(LOCKMETHODID lockmethodid, LOCK *lock, PROCLOCK *proclock,
+CleanUpLock(LOCK *lock, PROCLOCK *proclock,
+           LockMethod lockMethodTable, int partition,
             bool wakeupNeeded)
  {
     /*
@@ -910,7 +973,7 @@ CleanUpLock(LOCKMETHODID lockmethodid, LOCK *lock, PROCLOCK *proclock,
         PROCLOCK_PRINT("CleanUpLock: deleting", proclock);
         SHMQueueDelete(&proclock->lockLink);
         SHMQueueDelete(&proclock->procLink);
-       if (!hash_search(LockMethodProcLockHash,
+       if (!hash_search(LockMethodProcLockHash[partition],
                          (void *) &(proclock->tag),
                          HASH_REMOVE, NULL))
             elog(PANIC, "proclock table corrupted");
@@ -924,7 +987,7 @@ CleanUpLock(LOCKMETHODID lockmethodid, LOCK *lock, PROCLOCK *proclock,
          */
         LOCK_PRINT("CleanUpLock: deleting", lock, 0);
         Assert(SHMQueueEmpty(&(lock->procLocks)));
-       if (!hash_search(LockMethodLockHash,
+       if (!hash_search(LockMethodLockHash[partition],
                          (void *) &(lock->tag),
                          HASH_REMOVE, NULL))
             elog(PANIC, "lock table corrupted");
@@ -932,7 +995,7 @@ CleanUpLock(LOCKMETHODID lockmethodid, LOCK *lock, PROCLOCK *proclock,
     else if (wakeupNeeded)
     {
         /* There are waiters on this lock, so wake them up. */
-       ProcLockWakeup(LockMethods[lockmethodid], lock);
+       ProcLockWakeup(lockMethodTable, lock);
     }
  }
  
@@ -988,12 +1051,12 @@ GrantAwaitedLock(void)
   * Caller must have set MyProc->heldLocks to reflect locks already held
   * on the lockable object by this process.
   *
- * The locktable's masterLock must be held at entry.
+ * The appropriate partition lock must be held at entry.
   */
  static void
-WaitOnLock(LOCKMETHODID lockmethodid, LOCALLOCK *locallock,
-          ResourceOwner owner)
+WaitOnLock(LOCALLOCK *locallock, ResourceOwner owner)
  {
+   LOCKMETHODID lockmethodid = LOCALLOCK_LOCKMETHOD(*locallock);
     LockMethod  lockMethodTable = LockMethods[lockmethodid];
     const char *old_status;
     char       *new_status;
@@ -1025,10 +1088,7 @@ WaitOnLock(LOCKMETHODID lockmethodid, LOCALLOCK *locallock,
      * will also happen in the cancel/die case.
      */
  
-   if (ProcSleep(lockMethodTable,
-                 locallock->tag.mode,
-                 locallock->lock,
-                 locallock->proclock) != STATUS_OK)
+   if (ProcSleep(locallock, lockMethodTable) != STATUS_OK)
     {
         /*
          * We failed as a result of a deadlock, see CheckDeadLock(). Quit now.
@@ -1036,10 +1096,10 @@ WaitOnLock(LOCKMETHODID lockmethodid, LOCALLOCK *locallock,
         awaitedLock = NULL;
         LOCK_PRINT("WaitOnLock: aborting on lock",
                    locallock->lock, locallock->tag.mode);
-       LWLockRelease(LockMgrLock);
+       LWLockRelease(FirstLockMgrLock + locallock->partition);
  
         /*
-        * Now that we aren't holding the LockMgrLock, we can give an error
+        * Now that we aren't holding the partition lock, we can give an error
          * report including details about the detected deadlock.
          */
         DeadLockReport();
@@ -1059,12 +1119,12 @@ WaitOnLock(LOCKMETHODID lockmethodid, LOCALLOCK *locallock,
   * Remove a proc from the wait-queue it is on
   * (caller must know it is on one).
   *
- * Locktable lock must be held by caller.
+ * Appropriate partition lock must be held by caller.
   *
   * NB: this does not clean up any locallock object that may exist for the lock.
   */
  void
-RemoveFromWaitQueue(PGPROC *proc)
+RemoveFromWaitQueue(PGPROC *proc, int partition)
  {
     LOCK       *waitLock = proc->waitLock;
     PROCLOCK   *proclock = proc->waitProcLock;
@@ -1102,7 +1162,9 @@ RemoveFromWaitQueue(PGPROC *proc)
      * LockRelease expects there to be no remaining proclocks.) Then see if
      * any other waiters for the lock can be woken up now.
      */
-   CleanUpLock(lockmethodid, waitLock, proclock, true);
+   CleanUpLock(waitLock, proclock,
+               LockMethods[lockmethodid], partition,
+               true);
  }
  
  /*
@@ -1125,7 +1187,8 @@ LockRelease(const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock)
     LOCALLOCK  *locallock;
     LOCK       *lock;
     PROCLOCK   *proclock;
-   LWLockId    masterLock;
+   int         partition;
+   LWLockId    partitionLock;
     bool        wakeupNeeded;
  
     if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods))
@@ -1212,9 +1275,10 @@ LockRelease(const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock)
     /*
      * Otherwise we've got to mess with the shared lock table.
      */
-   masterLock = LockMgrLock;
+   partition = locallock->partition;
+   partitionLock = FirstLockMgrLock + partition;
  
-   LWLockAcquire(masterLock, LW_EXCLUSIVE);
+   LWLockAcquire(partitionLock, LW_EXCLUSIVE);
  
     /*
      * We don't need to re-find the lock or proclock, since we kept their
@@ -1233,7 +1297,7 @@ LockRelease(const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock)
     if (!(proclock->holdMask & LOCKBIT_ON(lockmode)))
     {
         PROCLOCK_PRINT("LockRelease: WRONGTYPE", proclock);
-       LWLockRelease(masterLock);
+       LWLockRelease(partitionLock);
         elog(WARNING, "you don't own a lock of type %s",
              lockMethodTable->lockModeNames[lockmode]);
         RemoveLocalLock(locallock);
@@ -1245,9 +1309,11 @@ LockRelease(const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock)
      */
     wakeupNeeded = UnGrantLock(lock, lockmode, proclock, lockMethodTable);
  
-   CleanUpLock(lockmethodid, lock, proclock, wakeupNeeded);
+   CleanUpLock(lock, proclock,
+               lockMethodTable, partition,
+               wakeupNeeded);
  
-   LWLockRelease(masterLock);
+   LWLockRelease(partitionLock);
  
     RemoveLocalLock(locallock);
     return TRUE;
@@ -1265,14 +1331,13 @@ void
  LockReleaseAll(LOCKMETHODID lockmethodid, bool allLocks)
  {
     HASH_SEQ_STATUS status;
-   SHM_QUEUE  *procLocks = &(MyProc->procLocks);
-   LWLockId    masterLock;
     LockMethod  lockMethodTable;
     int         i,
                 numLockModes;
     LOCALLOCK  *locallock;
-   PROCLOCK   *proclock;
     LOCK       *lock;
+   PROCLOCK   *proclock;
+   int         partition;
  
     if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods))
         elog(ERROR, "unrecognized lock method: %d", lockmethodid);
@@ -1284,7 +1349,6 @@ LockReleaseAll(LOCKMETHODID lockmethodid, bool allLocks)
  #endif
  
     numLockModes = lockMethodTable->numLockModes;
-   masterLock = LockMgrLock;
  
     /*
      * First we run through the locallock table and get rid of unwanted
@@ -1351,74 +1415,89 @@ LockReleaseAll(LOCKMETHODID lockmethodid, bool allLocks)
         RemoveLocalLock(locallock);
     }
  
-   LWLockAcquire(masterLock, LW_EXCLUSIVE);
+   /*
+    * Now, scan each lock partition separately.
+    */
+   for (partition = 0; partition < NUM_LOCK_PARTITIONS; partition++)
+   {
+       LWLockId    partitionLock = FirstLockMgrLock + partition;
+       SHM_QUEUE  *procLocks = &(MyProc->myProcLocks[partition]);
  
-   proclock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks,
-                                        offsetof(PROCLOCK, procLink));
+       proclock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks,
+                                            offsetof(PROCLOCK, procLink));
  
-   while (proclock)
-   {
-       bool        wakeupNeeded = false;
-       PROCLOCK   *nextplock;
+       if (!proclock)
+           continue;           /* needn't examine this partition */
  
-       /* Get link first, since we may unlink/delete this proclock */
-       nextplock = (PROCLOCK *) SHMQueueNext(procLocks, &proclock->procLink,
-                                             offsetof(PROCLOCK, procLink));
+       LWLockAcquire(partitionLock, LW_EXCLUSIVE);
  
-       Assert(proclock->tag.proc == MAKE_OFFSET(MyProc));
+       while (proclock)
+       {
+           bool        wakeupNeeded = false;
+           PROCLOCK   *nextplock;
  
-       lock = (LOCK *) MAKE_PTR(proclock->tag.lock);
+           /* Get link first, since we may unlink/delete this proclock */
+           nextplock = (PROCLOCK *)
+               SHMQueueNext(procLocks, &proclock->procLink,
+                            offsetof(PROCLOCK, procLink));
  
-       /* Ignore items that are not of the lockmethod to be removed */
-       if (LOCK_LOCKMETHOD(*lock) != lockmethodid)
-           goto next_item;
+           Assert(proclock->tag.proc == MAKE_OFFSET(MyProc));
  
-       /*
-        * In allLocks mode, force release of all locks even if locallock
-        * table had problems
-        */
-       if (allLocks)
-           proclock->releaseMask = proclock->holdMask;
-       else
-           Assert((proclock->releaseMask & ~proclock->holdMask) == 0);
+           lock = (LOCK *) MAKE_PTR(proclock->tag.lock);
  
-       /*
-        * Ignore items that have nothing to be released, unless they have
-        * holdMask == 0 and are therefore recyclable
-        */
-       if (proclock->releaseMask == 0 && proclock->holdMask != 0)
-           goto next_item;
+           /* Ignore items that are not of the lockmethod to be removed */
+           if (LOCK_LOCKMETHOD(*lock) != lockmethodid)
+               goto next_item;
  
-       PROCLOCK_PRINT("LockReleaseAll", proclock);
-       LOCK_PRINT("LockReleaseAll", lock, 0);
-       Assert(lock->nRequested >= 0);
-       Assert(lock->nGranted >= 0);
-       Assert(lock->nGranted <= lock->nRequested);
-       Assert((proclock->holdMask & ~lock->grantMask) == 0);
+           /*
+            * In allLocks mode, force release of all locks even if locallock
+            * table had problems
+            */
+           if (allLocks)
+               proclock->releaseMask = proclock->holdMask;
+           else
+               Assert((proclock->releaseMask & ~proclock->holdMask) == 0);
  
-       /*
-        * Release the previously-marked lock modes
-        */
-       for (i = 1; i <= numLockModes; i++)
-       {
-           if (proclock->releaseMask & LOCKBIT_ON(i))
-               wakeupNeeded |= UnGrantLock(lock, i, proclock,
-                                           lockMethodTable);
-       }
-       Assert((lock->nRequested >= 0) && (lock->nGranted >= 0));
-       Assert(lock->nGranted <= lock->nRequested);
-       LOCK_PRINT("LockReleaseAll: updated", lock, 0);
+           /*
+            * Ignore items that have nothing to be released, unless they have
+            * holdMask == 0 and are therefore recyclable
+            */
+           if (proclock->releaseMask == 0 && proclock->holdMask != 0)
+               goto next_item;
  
-       proclock->releaseMask = 0;
+           PROCLOCK_PRINT("LockReleaseAll", proclock);
+           LOCK_PRINT("LockReleaseAll", lock, 0);
+           Assert(lock->nRequested >= 0);
+           Assert(lock->nGranted >= 0);
+           Assert(lock->nGranted <= lock->nRequested);
+           Assert((proclock->holdMask & ~lock->grantMask) == 0);
+
+           /*
+            * Release the previously-marked lock modes
+            */
+           for (i = 1; i <= numLockModes; i++)
+           {
+               if (proclock->releaseMask & LOCKBIT_ON(i))
+                   wakeupNeeded |= UnGrantLock(lock, i, proclock,
+                                               lockMethodTable);
+           }
+           Assert((lock->nRequested >= 0) && (lock->nGranted >= 0));
+           Assert(lock->nGranted <= lock->nRequested);
+           LOCK_PRINT("LockReleaseAll: updated", lock, 0);
  
-       /* CleanUpLock will wake up waiters if needed. */
-       CleanUpLock(lockmethodid, lock, proclock, wakeupNeeded);
+           proclock->releaseMask = 0;
  
-next_item:
-       proclock = nextplock;
-   }
+           /* CleanUpLock will wake up waiters if needed. */
+           CleanUpLock(lock, proclock,
+                       lockMethodTable, partition,
+                       wakeupNeeded);
  
-   LWLockRelease(masterLock);
+       next_item:
+           proclock = nextplock;
+       } /* loop over PROCLOCKs within this partition */
+
+       LWLockRelease(partitionLock);
+   } /* loop over partitions */
  
  #ifdef LOCK_DEBUG
     if (*(lockMethodTable->trace_flag))
@@ -1627,19 +1706,16 @@ PostPrepare_Locks(TransactionId xid)
  {
     PGPROC     *newproc = TwoPhaseGetDummyProc(xid);
     HASH_SEQ_STATUS status;
-   SHM_QUEUE  *procLocks = &(MyProc->procLocks);
-   LWLockId    masterLock;
     LOCALLOCK  *locallock;
+   LOCK       *lock;
     PROCLOCK   *proclock;
     PROCLOCKTAG proclocktag;
     bool        found;
-   LOCK       *lock;
+   int         partition;
  
     /* This is a critical section: any error means big trouble */
     START_CRIT_SECTION();
  
-   masterLock = LockMgrLock;
-
     /*
      * First we run through the locallock table and get rid of unwanted
      * entries, then we scan the process's proclocks and transfer them to the
@@ -1678,105 +1754,121 @@ PostPrepare_Locks(TransactionId xid)
         RemoveLocalLock(locallock);
     }
  
-   LWLockAcquire(masterLock, LW_EXCLUSIVE);
+   /*
+    * Now, scan each lock partition separately.
+    */
+   for (partition = 0; partition < NUM_LOCK_PARTITIONS; partition++)
+   {
+       LWLockId    partitionLock = FirstLockMgrLock + partition;
+       SHM_QUEUE  *procLocks = &(MyProc->myProcLocks[partition]);
  
-   proclock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks,
-                                        offsetof(PROCLOCK, procLink));
+       proclock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks,
+                                            offsetof(PROCLOCK, procLink));
  
-   while (proclock)
-   {
-       PROCLOCK   *nextplock;
-       LOCKMASK    holdMask;
-       PROCLOCK   *newproclock;
+       if (!proclock)
+           continue;           /* needn't examine this partition */
  
-       /* Get link first, since we may unlink/delete this proclock */
-       nextplock = (PROCLOCK *) SHMQueueNext(procLocks, &proclock->procLink,
-                                             offsetof(PROCLOCK, procLink));
+       LWLockAcquire(partitionLock, LW_EXCLUSIVE);
  
-       Assert(proclock->tag.proc == MAKE_OFFSET(MyProc));
+       while (proclock)
+       {
+           PROCLOCK   *nextplock;
+           LOCKMASK    holdMask;
+           PROCLOCK   *newproclock;
  
-       lock = (LOCK *) MAKE_PTR(proclock->tag.lock);
+           /* Get link first, since we may unlink/delete this proclock */
+           nextplock = (PROCLOCK *)
+               SHMQueueNext(procLocks, &proclock->procLink,
+                            offsetof(PROCLOCK, procLink));
  
-       /* Ignore nontransactional locks */
-       if (!LockMethods[LOCK_LOCKMETHOD(*lock)]->transactional)
-           goto next_item;
+           Assert(proclock->tag.proc == MAKE_OFFSET(MyProc));
  
-       PROCLOCK_PRINT("PostPrepare_Locks", proclock);
-       LOCK_PRINT("PostPrepare_Locks", lock, 0);
-       Assert(lock->nRequested >= 0);
-       Assert(lock->nGranted >= 0);
-       Assert(lock->nGranted <= lock->nRequested);
-       Assert((proclock->holdMask & ~lock->grantMask) == 0);
+           lock = (LOCK *) MAKE_PTR(proclock->tag.lock);
  
-       /*
-        * Since there were no session locks, we should be releasing all locks
-        */
-       if (proclock->releaseMask != proclock->holdMask)
-           elog(PANIC, "we seem to have dropped a bit somewhere");
+           /* Ignore nontransactional locks */
+           if (!LockMethods[LOCK_LOCKMETHOD(*lock)]->transactional)
+               goto next_item;
  
-       holdMask = proclock->holdMask;
+           PROCLOCK_PRINT("PostPrepare_Locks", proclock);
+           LOCK_PRINT("PostPrepare_Locks", lock, 0);
+           Assert(lock->nRequested >= 0);
+           Assert(lock->nGranted >= 0);
+           Assert(lock->nGranted <= lock->nRequested);
+           Assert((proclock->holdMask & ~lock->grantMask) == 0);
  
-       /*
-        * We cannot simply modify proclock->tag.proc to reassign ownership of
-        * the lock, because that's part of the hash key and the proclock
-        * would then be in the wrong hash chain.  So, unlink and delete the
-        * old proclock; create a new one with the right contents; and link it
-        * into place.  We do it in this order to be certain we won't run out
-        * of shared memory (the way dynahash.c works, the deleted object is
-        * certain to be available for reallocation).
-        */
-       SHMQueueDelete(&proclock->lockLink);
-       SHMQueueDelete(&proclock->procLink);
-       if (!hash_search(LockMethodProcLockHash,
-                        (void *) &(proclock->tag),
-                        HASH_REMOVE, NULL))
-           elog(PANIC, "proclock table corrupted");
+           /*
+            * Since there were no session locks, we should be releasing all
+            * locks
+            */
+           if (proclock->releaseMask != proclock->holdMask)
+               elog(PANIC, "we seem to have dropped a bit somewhere");
  
-       /*
-        * Create the hash key for the new proclock table.
-        */
-       MemSet(&proclocktag, 0, sizeof(PROCLOCKTAG));
-       proclocktag.lock = MAKE_OFFSET(lock);
-       proclocktag.proc = MAKE_OFFSET(newproc);
-
-       newproclock = (PROCLOCK *) hash_search(LockMethodProcLockHash,
-                                              (void *) &proclocktag,
-                                              HASH_ENTER_NULL, &found);
-       if (!newproclock)
-           ereport(PANIC,      /* should not happen */
-                   (errcode(ERRCODE_OUT_OF_MEMORY),
-                    errmsg("out of shared memory"),
-                    errdetail("Not enough memory for reassigning the prepared transaction's locks.")));
+           holdMask = proclock->holdMask;
  
-       /*
-        * If new, initialize the new entry
-        */
-       if (!found)
-       {
-           newproclock->holdMask = 0;
-           newproclock->releaseMask = 0;
-           /* Add new proclock to appropriate lists */
-           SHMQueueInsertBefore(&lock->procLocks, &newproclock->lockLink);
-           SHMQueueInsertBefore(&newproc->procLocks, &newproclock->procLink);
-           PROCLOCK_PRINT("PostPrepare_Locks: new", newproclock);
-       }
-       else
-       {
-           PROCLOCK_PRINT("PostPrepare_Locks: found", newproclock);
-           Assert((newproclock->holdMask & ~lock->grantMask) == 0);
-       }
+           /*
+            * We cannot simply modify proclock->tag.proc to reassign
+            * ownership of the lock, because that's part of the hash key and
+            * the proclock would then be in the wrong hash chain.  So, unlink
+            * and delete the old proclock; create a new one with the right
+            * contents; and link it into place.  We do it in this order to be
+            * certain we won't run out of shared memory (the way dynahash.c
+            * works, the deleted object is certain to be available for
+            * reallocation).
+            */
+           SHMQueueDelete(&proclock->lockLink);
+           SHMQueueDelete(&proclock->procLink);
+           if (!hash_search(LockMethodProcLockHash[partition],
+                            (void *) &(proclock->tag),
+                            HASH_REMOVE, NULL))
+               elog(PANIC, "proclock table corrupted");
  
-       /*
-        * Pass over the identified lock ownership.
-        */
-       Assert((newproclock->holdMask & holdMask) == 0);
-       newproclock->holdMask |= holdMask;
+           /*
+            * Create the hash key for the new proclock entry.
+            */
+           MemSet(&proclocktag, 0, sizeof(PROCLOCKTAG));
+           proclocktag.lock = MAKE_OFFSET(lock);
+           proclocktag.proc = MAKE_OFFSET(newproc);
+
+           newproclock = (PROCLOCK *) hash_search(LockMethodProcLockHash[partition],
+                                                  (void *) &proclocktag,
+                                                  HASH_ENTER_NULL, &found);
+           if (!newproclock)
+               ereport(PANIC,      /* should not happen */
+                       (errcode(ERRCODE_OUT_OF_MEMORY),
+                        errmsg("out of shared memory"),
+                        errdetail("Not enough memory for reassigning the prepared transaction's locks.")));
  
-next_item:
-       proclock = nextplock;
-   }
+           /*
+            * If new, initialize the new entry
+            */
+           if (!found)
+           {
+               newproclock->holdMask = 0;
+               newproclock->releaseMask = 0;
+               /* Add new proclock to appropriate lists */
+               SHMQueueInsertBefore(&lock->procLocks, &newproclock->lockLink);
+               SHMQueueInsertBefore(&(newproc->myProcLocks[partition]),
+                                    &newproclock->procLink);
+               PROCLOCK_PRINT("PostPrepare_Locks: new", newproclock);
+           }
+           else
+           {
+               PROCLOCK_PRINT("PostPrepare_Locks: found", newproclock);
+               Assert((newproclock->holdMask & ~lock->grantMask) == 0);
+           }
+
+           /*
+            * Pass over the identified lock ownership.
+            */
+           Assert((newproclock->holdMask & holdMask) == 0);
+           newproclock->holdMask |= holdMask;
+
+       next_item:
+           proclock = nextplock;
+       } /* loop over PROCLOCKs within this partition */
  
-   LWLockRelease(masterLock);
+       LWLockRelease(partitionLock);
+   } /* loop over partitions */
  
     END_CRIT_SECTION();
  }
@@ -1789,20 +1881,23 @@ Size
  LockShmemSize(void)
  {
     Size        size = 0;
-   long        max_table_size = NLOCKENTS();
+   Size        tabsize;
+   long        max_table_size;
  
-   /* lockHash table */
-   size = add_size(size, hash_estimate_size(max_table_size, sizeof(LOCK)));
+   /* lock hash tables */
+   max_table_size = NLOCKENTS();
+   max_table_size = (max_table_size - 1) / NUM_LOCK_PARTITIONS + 1;
+   tabsize = hash_estimate_size(max_table_size, sizeof(LOCK));
+   size = add_size(size, mul_size(tabsize, NUM_LOCK_PARTITIONS));
  
-   /* proclockHash table */
-   size = add_size(size, hash_estimate_size(max_table_size, sizeof(PROCLOCK)));
+   /* proclock hash tables */
+   max_table_size *= 2;
+   tabsize = hash_estimate_size(max_table_size, sizeof(PROCLOCK));
+   size = add_size(size, mul_size(tabsize, NUM_LOCK_PARTITIONS));
  
     /*
-    * Note we count only one pair of hash tables, since the userlocks table
-    * actually overlays the main one.
-    *
-    * Since the lockHash entry count above is only an estimate, add 10%
-    * safety margin.
+    * Since there is likely to be some space wastage due to uneven use
+    * of the partitions, add 10% safety margin.
      */
     size = add_size(size, size / 10);
  
@@ -1818,9 +1913,9 @@ LockShmemSize(void)
   * copies of the same PGPROC and/or LOCK objects are likely to appear.
   * It is the caller's responsibility to match up duplicates if wanted.
   *
- * The design goal is to hold the LockMgrLock for as short a time as possible;
+ * The design goal is to hold the LWLocks for as short a time as possible;
   * thus, this function simply makes a copy of the necessary data and releases
- * the lock, allowing the caller to contemplate and format the data for as
+ * the locks, allowing the caller to contemplate and format the data for as
   * long as it pleases.
   */
  LockData *
@@ -1830,40 +1925,67 @@ GetLockStatusData(void)
     HTAB       *proclockTable;
     PROCLOCK   *proclock;
     HASH_SEQ_STATUS seqstat;
+   int         els;
+   int         el;
     int         i;
  
     data = (LockData *) palloc(sizeof(LockData));
  
-   LWLockAcquire(LockMgrLock, LW_EXCLUSIVE);
-
-   proclockTable = LockMethodProcLockHash;
-
-   data->nelements = i = proclockTable->hctl->nentries;
+   /*
+    * Acquire lock on the entire shared lock data structures.  We can't
+    * operate one partition at a time if we want to deliver a self-consistent
+    * view of the state.
+    *
+    * Since this is a read-only operation, we take shared instead of exclusive
+    * lock.  There's not a whole lot of point to this, because all the normal
+    * operations require exclusive lock, but it doesn't hurt anything either.
+    * It will at least allow two backends to do GetLockStatusData in parallel.
+    *
+    * Must grab LWLocks in partition-number order to avoid LWLock deadlock.
+    *
+    * Use same loop to count up the total number of PROCLOCK objects.
+    */
+   els = 0;
+   for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
+   {
+       LWLockAcquire(FirstLockMgrLock + i, LW_SHARED);
+       proclockTable = LockMethodProcLockHash[i];
+       els += proclockTable->hctl->nentries;
+   }
  
-   data->proclockaddrs = (SHMEM_OFFSET *) palloc(sizeof(SHMEM_OFFSET) * i);
-   data->proclocks = (PROCLOCK *) palloc(sizeof(PROCLOCK) * i);
-   data->procs = (PGPROC *) palloc(sizeof(PGPROC) * i);
-   data->locks = (LOCK *) palloc(sizeof(LOCK) * i);
+   data->nelements = els;
+   data->proclockaddrs = (SHMEM_OFFSET *) palloc(sizeof(SHMEM_OFFSET) * els);
+   data->proclocks = (PROCLOCK *) palloc(sizeof(PROCLOCK) * els);
+   data->procs = (PGPROC *) palloc(sizeof(PGPROC) * els);
+   data->locks = (LOCK *) palloc(sizeof(LOCK) * els);
  
-   hash_seq_init(&seqstat, proclockTable);
+   el = 0;
  
-   i = 0;
-   while ((proclock = hash_seq_search(&seqstat)))
+   /* Now scan the tables to copy the data */
+   for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
     {
-       PGPROC     *proc = (PGPROC *) MAKE_PTR(proclock->tag.proc);
-       LOCK       *lock = (LOCK *) MAKE_PTR(proclock->tag.lock);
+       proclockTable = LockMethodProcLockHash[i];
+       hash_seq_init(&seqstat, proclockTable);
  
-       data->proclockaddrs[i] = MAKE_OFFSET(proclock);
-       memcpy(&(data->proclocks[i]), proclock, sizeof(PROCLOCK));
-       memcpy(&(data->procs[i]), proc, sizeof(PGPROC));
-       memcpy(&(data->locks[i]), lock, sizeof(LOCK));
+       while ((proclock = hash_seq_search(&seqstat)))
+       {
+           PGPROC     *proc = (PGPROC *) MAKE_PTR(proclock->tag.proc);
+           LOCK       *lock = (LOCK *) MAKE_PTR(proclock->tag.lock);
+
+           data->proclockaddrs[el] = MAKE_OFFSET(proclock);
+           memcpy(&(data->proclocks[el]), proclock, sizeof(PROCLOCK));
+           memcpy(&(data->procs[el]), proc, sizeof(PGPROC));
+           memcpy(&(data->locks[el]), lock, sizeof(LOCK));
  
-       i++;
+           el++;
+       }
     }
  
-   LWLockRelease(LockMgrLock);
+   /* And release locks */
+   for (i = NUM_LOCK_PARTITIONS; --i >= 0; )
+       LWLockRelease(FirstLockMgrLock + i);
  
-   Assert(i == data->nelements);
+   Assert(el == data->nelements);
  
     return data;
  }
@@ -1879,7 +2001,7 @@ GetLockmodeName(LOCKMETHODID lockmethodid, LOCKMODE mode)
  
  #ifdef LOCK_DEBUG
  /*
- * Dump all locks in the given proc's procLocks list.
+ * Dump all locks in the given proc's myProcLocks lists.
   *
   * Caller is responsible for having acquired appropriate LWLocks.
   */
@@ -1889,29 +2011,34 @@ DumpLocks(PGPROC *proc)
     SHM_QUEUE  *procLocks;
     PROCLOCK   *proclock;
     LOCK       *lock;
+   int         i;
  
     if (proc == NULL)
         return;
  
-   procLocks = &proc->procLocks;
-
     if (proc->waitLock)
         LOCK_PRINT("DumpLocks: waiting on", proc->waitLock, 0);
  
-   proclock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks,
-                                        offsetof(PROCLOCK, procLink));
-
-   while (proclock)
+   for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
     {
-       Assert(proclock->tag.proc == MAKE_OFFSET(proc));
+       procLocks = &(proc->myProcLocks[i]);
  
-       lock = (LOCK *) MAKE_PTR(proclock->tag.lock);
+       proclock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks,
+                                            offsetof(PROCLOCK, procLink));
  
-       PROCLOCK_PRINT("DumpLocks", proclock);
-       LOCK_PRINT("DumpLocks", lock, 0);
+       while (proclock)
+       {
+           Assert(proclock->tag.proc == MAKE_OFFSET(proc));
  
-       proclock = (PROCLOCK *) SHMQueueNext(procLocks, &proclock->procLink,
-                                            offsetof(PROCLOCK, procLink));
+           lock = (LOCK *) MAKE_PTR(proclock->tag.lock);
+
+           PROCLOCK_PRINT("DumpLocks", proclock);
+           LOCK_PRINT("DumpLocks", lock, 0);
+
+           proclock = (PROCLOCK *)
+               SHMQueueNext(procLocks, &proclock->procLink,
+                            offsetof(PROCLOCK, procLink));
+       }
     }
  }
  
@@ -1928,25 +2055,30 @@ DumpAllLocks(void)
     LOCK       *lock;
     HTAB       *proclockTable;
     HASH_SEQ_STATUS status;
+   int         i;
  
     proc = MyProc;
-   proclockTable = LockMethodProcLockHash;
  
     if (proc && proc->waitLock)
         LOCK_PRINT("DumpAllLocks: waiting on", proc->waitLock, 0);
  
-   hash_seq_init(&status, proclockTable);
-   while ((proclock = (PROCLOCK *) hash_seq_search(&status)) != NULL)
+   for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
     {
-       PROCLOCK_PRINT("DumpAllLocks", proclock);
+       proclockTable = LockMethodProcLockHash[i];
+       hash_seq_init(&status, proclockTable);
  
-       if (proclock->tag.lock)
+       while ((proclock = (PROCLOCK *) hash_seq_search(&status)) != NULL)
         {
-           lock = (LOCK *) MAKE_PTR(proclock->tag.lock);
-           LOCK_PRINT("DumpAllLocks", lock, 0);
+           PROCLOCK_PRINT("DumpAllLocks", proclock);
+
+           if (proclock->tag.lock)
+           {
+               lock = (LOCK *) MAKE_PTR(proclock->tag.lock);
+               LOCK_PRINT("DumpAllLocks", lock, 0);
+           }
+           else
+               elog(LOG, "DumpAllLocks: proclock->tag.lock = NULL");
         }
-       else
-           elog(LOG, "DumpAllLocks: proclock->tag.lock = NULL");
     }
  }
  #endif   /* LOCK_DEBUG */
@@ -1975,7 +2107,8 @@ lock_twophase_recover(TransactionId xid, uint16 info,
     PROCLOCK   *proclock;
     PROCLOCKTAG proclocktag;
     bool        found;
-   LWLockId    masterLock;
+   int         partition;
+   LWLockId    partitionLock;
     LockMethod  lockMethodTable;
  
     Assert(len == sizeof(TwoPhaseLockRecord));
@@ -1987,19 +2120,20 @@ lock_twophase_recover(TransactionId xid, uint16 info,
         elog(ERROR, "unrecognized lock method: %d", lockmethodid);
     lockMethodTable = LockMethods[lockmethodid];
  
-   masterLock = LockMgrLock;
+   partition = LockTagToPartition(locktag);
+   partitionLock = FirstLockMgrLock + partition;
  
-   LWLockAcquire(masterLock, LW_EXCLUSIVE);
+   LWLockAcquire(partitionLock, LW_EXCLUSIVE);
  
     /*
      * Find or create a lock with this tag.
      */
-   lock = (LOCK *) hash_search(LockMethodLockHash,
+   lock = (LOCK *) hash_search(LockMethodLockHash[partition],
                                 (void *) locktag,
                                 HASH_ENTER_NULL, &found);
     if (!lock)
     {
-       LWLockRelease(masterLock);
+       LWLockRelease(partitionLock);
         ereport(ERROR,
                 (errcode(ERRCODE_OUT_OF_MEMORY),
                  errmsg("out of shared memory"),
@@ -2039,7 +2173,7 @@ lock_twophase_recover(TransactionId xid, uint16 info,
     /*
      * Find or create a proclock entry with this tag
      */
-   proclock = (PROCLOCK *) hash_search(LockMethodProcLockHash,
+   proclock = (PROCLOCK *) hash_search(LockMethodProcLockHash[partition],
                                         (void *) &proclocktag,
                                         HASH_ENTER_NULL, &found);
     if (!proclock)
@@ -2054,12 +2188,12 @@ lock_twophase_recover(TransactionId xid, uint16 info,
              * anyone to release the lock object later.
              */
             Assert(SHMQueueEmpty(&(lock->procLocks)));
-           if (!hash_search(LockMethodLockHash,
+           if (!hash_search(LockMethodLockHash[partition],
                              (void *) &(lock->tag),
                              HASH_REMOVE, NULL))
                 elog(PANIC, "lock table corrupted");
         }
-       LWLockRelease(masterLock);
+       LWLockRelease(partitionLock);
         ereport(ERROR,
                 (errcode(ERRCODE_OUT_OF_MEMORY),
                  errmsg("out of shared memory"),
@@ -2075,7 +2209,8 @@ lock_twophase_recover(TransactionId xid, uint16 info,
         proclock->releaseMask = 0;
         /* Add proclock to appropriate lists */
         SHMQueueInsertBefore(&lock->procLocks, &proclock->lockLink);
-       SHMQueueInsertBefore(&proc->procLocks, &proclock->procLink);
+       SHMQueueInsertBefore(&(proc->myProcLocks[partition]),
+                            &proclock->procLink);
         PROCLOCK_PRINT("lock_twophase_recover: new", proclock);
     }
     else
@@ -2106,7 +2241,7 @@ lock_twophase_recover(TransactionId xid, uint16 info,
      */
     GrantLock(lock, proclock, lockmode);
  
-   LWLockRelease(masterLock);
+   LWLockRelease(partitionLock);
  }
  
  /*
@@ -2123,10 +2258,11 @@ lock_twophase_postcommit(TransactionId xid, uint16 info,
     LOCKTAG    *locktag;
     LOCKMODE    lockmode;
     LOCKMETHODID lockmethodid;
-   PROCLOCKTAG proclocktag;
     LOCK       *lock;
     PROCLOCK   *proclock;
-   LWLockId    masterLock;
+   PROCLOCKTAG proclocktag;
+   int         partition;
+   LWLockId    partitionLock;
     LockMethod  lockMethodTable;
     bool        wakeupNeeded;
  
@@ -2139,14 +2275,15 @@ lock_twophase_postcommit(TransactionId xid, uint16 info,
         elog(ERROR, "unrecognized lock method: %d", lockmethodid);
     lockMethodTable = LockMethods[lockmethodid];
  
-   masterLock = LockMgrLock;
+   partition = LockTagToPartition(locktag);
+   partitionLock = FirstLockMgrLock + partition;
  
-   LWLockAcquire(masterLock, LW_EXCLUSIVE);
+   LWLockAcquire(partitionLock, LW_EXCLUSIVE);
  
     /*
      * Re-find the lock object (it had better be there).
      */
-   lock = (LOCK *) hash_search(LockMethodLockHash,
+   lock = (LOCK *) hash_search(LockMethodLockHash[partition],
                                 (void *) locktag,
                                 HASH_FIND, NULL);
     if (!lock)
@@ -2158,7 +2295,7 @@ lock_twophase_postcommit(TransactionId xid, uint16 info,
     MemSet(&proclocktag, 0, sizeof(PROCLOCKTAG));       /* must clear padding */
     proclocktag.lock = MAKE_OFFSET(lock);
     proclocktag.proc = MAKE_OFFSET(proc);
-   proclock = (PROCLOCK *) hash_search(LockMethodProcLockHash,
+   proclock = (PROCLOCK *) hash_search(LockMethodProcLockHash[partition],
                                         (void *) &proclocktag,
                                         HASH_FIND, NULL);
     if (!proclock)
@@ -2171,7 +2308,7 @@ lock_twophase_postcommit(TransactionId xid, uint16 info,
     if (!(proclock->holdMask & LOCKBIT_ON(lockmode)))
     {
         PROCLOCK_PRINT("lock_twophase_postcommit: WRONGTYPE", proclock);
-       LWLockRelease(masterLock);
+       LWLockRelease(partitionLock);
         elog(WARNING, "you don't own a lock of type %s",
              lockMethodTable->lockModeNames[lockmode]);
         return;
@@ -2182,9 +2319,11 @@ lock_twophase_postcommit(TransactionId xid, uint16 info,
      */
     wakeupNeeded = UnGrantLock(lock, lockmode, proclock, lockMethodTable);
  
-   CleanUpLock(lockmethodid, lock, proclock, wakeupNeeded);
+   CleanUpLock(lock, proclock,
+               lockMethodTable, partition,
+               wakeupNeeded);
  
-   LWLockRelease(masterLock);
+   LWLockRelease(partitionLock);
  }
  
  /*
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c

index a215a65285511dca9c7ca6f26145d1bcacd67a12..e1edabde905efa7004c02285742fb455a6b2843b 100644 (file)
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -8,14 +8,14 @@
   * exclusive and shared lock modes (to support read/write and read-only
   * access to a shared object). There are few other frammishes.  User-level
   * locking should be done with the full lock manager --- which depends on
- * an LWLock to protect its shared state.
+ * LWLocks to protect its shared state.
   *
   *
   * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/storage/lmgr/lwlock.c,v 1.35 2005/12/06 23:08:33 tgl Exp $
+ *   $PostgreSQL: pgsql/src/backend/storage/lmgr/lwlock.c,v 1.36 2005/12/11 21:02:18 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -125,7 +125,10 @@ NumLWLocks(void)
      */
  
     /* Predefined LWLocks */
-   numLocks = (int) NumFixedLWLocks;
+   numLocks = (int) FirstLockMgrLock;
+
+   /* lock.c gets the ones starting at FirstLockMgrLock */
+   numLocks += NUM_LOCK_PARTITIONS;
  
     /* bufmgr.c needs two for each shared buffer */
     numLocks += 2 * NBuffers;
@@ -204,10 +207,11 @@ CreateLWLocks(void)
  
     /*
      * Initialize the dynamic-allocation counter, which is stored just before
-    * the first LWLock.
+    * the first LWLock.  The LWLocks used by lock.c are not dynamically
+    * allocated; lock.c just assumes it has them.
      */
     LWLockCounter = (int *) ((char *) LWLockArray - 2 * sizeof(int));
-   LWLockCounter[0] = (int) NumFixedLWLocks;
+   LWLockCounter[0] = (int) FirstLockMgrLock + NUM_LOCK_PARTITIONS;
     LWLockCounter[1] = numLocks;
  }
  
diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c

index 8d8269041e7f2ba24fcb38603930dc8de5f7e262..34d80bfceeacf8ea4e5007831964dab095e8440b 100644 (file)
--- a/src/backend/storage/lmgr/proc.c
+++ b/src/backend/storage/lmgr/proc.c
@@ -8,7 +8,7 @@
   *
   *
   * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/storage/lmgr/proc.c,v 1.169 2005/12/09 01:22:04 tgl Exp $
+ *   $PostgreSQL: pgsql/src/backend/storage/lmgr/proc.c,v 1.170 2005/12/11 21:02:18 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -18,9 +18,8 @@
   *     ProcQueueAlloc() -- create a shm queue for sleeping processes
   *     ProcQueueInit() -- create a queue without allocing memory
   *
- * Locking and waiting for buffers can cause the backend to be
- * put to sleep.  Whoever releases the lock, etc. wakes the
- * process up again (and gives it an error code so it knows
+ * Waiting for a lock causes the backend to be put to sleep.  Whoever releases
+ * the lock wakes the process up again (and gives it an error code so it knows
   * whether it was awoken on an error condition).
   *
   * Interface (b):
@@ -28,7 +27,7 @@
   * ProcReleaseLocks -- frees the locks associated with current transaction
   *
   * ProcKill -- destroys the shared memory state (and locks)
- *     associated with the process.
+ * associated with the process.
   */
  #include "postgres.h"
  
@@ -65,7 +64,8 @@ NON_EXEC_STATIC slock_t *ProcStructLock = NULL;
  static PROC_HDR *ProcGlobal = NULL;
  static PGPROC *DummyProcs = NULL;
  
-static bool waitingForLock = false;
+/* If we are waiting for a lock, this points to the associated LOCALLOCK */
+static LOCALLOCK *lockAwaited = NULL;
  
  /* Mark these volatile because they can be changed by signal handler */
  static volatile bool statement_timeout_active = false;
@@ -200,10 +200,10 @@ InitProcGlobal(void)
  void
  InitProcess(void)
  {
-   SHMEM_OFFSET myOffset;
-
     /* use volatile pointer to prevent code rearrangement */
     volatile PROC_HDR *procglobal = ProcGlobal;
+   SHMEM_OFFSET myOffset;
+   int         i;
  
     /*
      * ProcGlobal should be set by a previous call to InitProcGlobal (if we
@@ -264,7 +264,8 @@ InitProcess(void)
     MyProc->lwWaitLink = NULL;
     MyProc->waitLock = NULL;
     MyProc->waitProcLock = NULL;
-   SHMQueueInit(&(MyProc->procLocks));
+   for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
+       SHMQueueInit(&(MyProc->myProcLocks[i]));
  
     /*
      * Add our PGPROC to the PGPROC array in shared memory.
@@ -304,6 +305,7 @@ void
  InitDummyProcess(int proctype)
  {
     PGPROC     *dummyproc;
+   int         i;
  
     /*
      * ProcGlobal should be set by a previous call to InitProcGlobal (we
@@ -360,7 +362,8 @@ InitDummyProcess(int proctype)
     MyProc->lwWaitLink = NULL;
     MyProc->waitLock = NULL;
     MyProc->waitProcLock = NULL;
-   SHMQueueInit(&(MyProc->procLocks));
+   for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
+       SHMQueueInit(&(MyProc->myProcLocks[i]));
  
     /*
      * Arrange to clean up at process exit.
@@ -416,21 +419,24 @@ HaveNFreeProcs(int n)
  bool
  LockWaitCancel(void)
  {
+   LWLockId    partitionLock;
+
     /* Nothing to do if we weren't waiting for a lock */
-   if (!waitingForLock)
+   if (lockAwaited == NULL)
         return false;
  
     /* Turn off the deadlock timer, if it's still running (see ProcSleep) */
     disable_sig_alarm(false);
  
     /* Unlink myself from the wait queue, if on it (might not be anymore!) */
-   LWLockAcquire(LockMgrLock, LW_EXCLUSIVE);
+   partitionLock = FirstLockMgrLock + lockAwaited->partition;
+   LWLockAcquire(partitionLock, LW_EXCLUSIVE);
  
     if (MyProc->links.next != INVALID_OFFSET)
     {
         /* We could not have been granted the lock yet */
         Assert(MyProc->waitStatus == STATUS_ERROR);
-       RemoveFromWaitQueue(MyProc);
+       RemoveFromWaitQueue(MyProc, lockAwaited->partition);
     }
     else
     {
@@ -444,9 +450,9 @@ LockWaitCancel(void)
             GrantAwaitedLock();
     }
  
-   waitingForLock = false;
+   lockAwaited = NULL;
  
-   LWLockRelease(LockMgrLock);
+   LWLockRelease(partitionLock);
  
     /*
      * Reset the proc wait semaphore to zero.  This is necessary in the
@@ -606,18 +612,18 @@ ProcQueueInit(PROC_QUEUE *queue)
  
  
  /*
- * ProcSleep -- put a process to sleep
+ * ProcSleep -- put a process to sleep on the specified lock
   *
   * Caller must have set MyProc->heldLocks to reflect locks already held
   * on the lockable object by this process (under all XIDs).
   *
- * Locktable's masterLock must be held at entry, and will be held
+ * The lock table's partition lock must be held at entry, and will be held
   * at exit.
   *
   * Result: STATUS_OK if we acquired the lock, STATUS_ERROR if not (deadlock).
   *
   * ASSUME: that no one will fiddle with the queue until after
- *     we release the masterLock.
+ *     we release the partition lock.
   *
   * NOTES: The process queue is now a priority queue for locking.
   *
@@ -625,12 +631,13 @@ ProcQueueInit(PROC_QUEUE *queue)
   * semaphore is normally zero, so when we try to acquire it, we sleep.
   */
  int
-ProcSleep(LockMethod lockMethodTable,
-         LOCKMODE lockmode,
-         LOCK *lock,
-         PROCLOCK *proclock)
+ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable)
  {
-   LWLockId    masterLock = LockMgrLock;
+   LOCKMODE    lockmode = locallock->tag.mode;
+   LOCK       *lock = locallock->lock;
+   PROCLOCK   *proclock = locallock->proclock;
+   int         partition = locallock->partition;
+   LWLockId    partitionLock = FirstLockMgrLock + partition;
     PROC_QUEUE *waitQueue = &(lock->waitProcs);
     LOCKMASK    myHeldLocks = MyProc->heldLocks;
     bool        early_deadlock = false;
@@ -732,22 +739,22 @@ ProcSleep(LockMethod lockMethodTable,
      */
     if (early_deadlock)
     {
-       RemoveFromWaitQueue(MyProc);
+       RemoveFromWaitQueue(MyProc, partition);
         return STATUS_ERROR;
     }
  
     /* mark that we are waiting for a lock */
-   waitingForLock = true;
+   lockAwaited = locallock;
  
     /*
-    * Release the locktable's masterLock.
+    * Release the lock table's partition lock.
      *
      * NOTE: this may also cause us to exit critical-section state, possibly
      * allowing a cancel/die interrupt to be accepted. This is OK because we
      * have recorded the fact that we are waiting for a lock, and so
      * LockWaitCancel will clean up if cancel/die happens.
      */
-   LWLockRelease(masterLock);
+   LWLockRelease(partitionLock);
  
     /*
      * Set timer so we can wake up after awhile and check for a deadlock. If a
@@ -785,16 +792,16 @@ ProcSleep(LockMethod lockMethodTable,
         elog(FATAL, "could not disable timer for process wakeup");
  
     /*
-    * Re-acquire the locktable's masterLock.  We have to do this to hold off
-    * cancel/die interrupts before we can mess with waitingForLock (else we
-    * might have a missed or duplicated locallock update).
+    * Re-acquire the lock table's partition lock.  We have to do this to
+    * hold off cancel/die interrupts before we can mess with lockAwaited
+    * (else we might have a missed or duplicated locallock update).
      */
-   LWLockAcquire(masterLock, LW_EXCLUSIVE);
+   LWLockAcquire(partitionLock, LW_EXCLUSIVE);
  
     /*
      * We no longer want LockWaitCancel to do anything.
      */
-   waitingForLock = false;
+   lockAwaited = NULL;
  
     /*
      * If we got the lock, be sure to remember it in the locallock table.
@@ -816,6 +823,8 @@ ProcSleep(LockMethod lockMethodTable,
   *  Also remove the process from the wait queue and set its links invalid.
   *  RETURN: the next process in the wait queue.
   *
+ * The appropriate lock partition lock must be held by caller.
+ *
   * XXX: presently, this code is only used for the "success" case, and only
   * works correctly for that case.  To clean up in failure case, would need
   * to twiddle the lock's request counts too --- see RemoveFromWaitQueue.
@@ -825,8 +834,6 @@ ProcWakeup(PGPROC *proc, int waitStatus)
  {
     PGPROC     *retProc;
  
-   /* assume that masterLock has been acquired */
-
     /* Proc should be sleeping ... */
     if (proc->links.prev == INVALID_OFFSET ||
         proc->links.next == INVALID_OFFSET)
@@ -854,6 +861,8 @@ ProcWakeup(PGPROC *proc, int waitStatus)
   * ProcLockWakeup -- routine for waking up processes when a lock is
   *     released (or a prior waiter is aborted).  Scan all waiters
   *     for lock, waken any that are no longer blocked.
+ *
+ * The appropriate lock partition lock must be held by caller.
   */
  void
  ProcLockWakeup(LockMethod lockMethodTable, LOCK *lock)
@@ -908,25 +917,32 @@ ProcLockWakeup(LockMethod lockMethodTable, LOCK *lock)
     Assert(waitQueue->size >= 0);
  }
  
-/* --------------------
+/*
+ * CheckDeadLock
+ *
   * We only get to this routine if we got SIGALRM after DeadlockTimeout
   * while waiting for a lock to be released by some other process.  Look
   * to see if there's a deadlock; if not, just return and continue waiting.
   * If we have a real deadlock, remove ourselves from the lock's wait queue
   * and signal an error to ProcSleep.
- * --------------------
   */
  static void
  CheckDeadLock(void)
  {
+   int         i;
+
     /*
-    * Acquire locktable lock.  Note that the deadlock check interrupt had
-    * better not be enabled anywhere that this process itself holds the
-    * locktable lock, else this will wait forever.  Also note that
-    * LWLockAcquire creates a critical section, so that this routine cannot
-    * be interrupted by cancel/die interrupts.
+    * Acquire exclusive lock on the entire shared lock data structures.
+    * Must grab LWLocks in partition-number order to avoid LWLock deadlock.
+    *
+    * Note that the deadlock check interrupt had better not be enabled
+    * anywhere that this process itself holds lock partition locks, else this
+    * will wait forever.  Also note that LWLockAcquire creates a critical
+    * section, so that this routine cannot be interrupted by cancel/die
+    * interrupts.
      */
-   LWLockAcquire(LockMgrLock, LW_EXCLUSIVE);
+   for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
+       LWLockAcquire(FirstLockMgrLock + i, LW_EXCLUSIVE);
  
     /*
      * Check to see if we've been awoken by anyone in the interim.
@@ -937,14 +953,11 @@ CheckDeadLock(void)
      *
      * We check by looking to see if we've been unlinked from the wait queue.
      * This is quicker than checking our semaphore's state, since no kernel
-    * call is needed, and it is safe because we hold the locktable lock.
+    * call is needed, and it is safe because we hold the lock partition locks.
      */
     if (MyProc->links.prev == INVALID_OFFSET ||
         MyProc->links.next == INVALID_OFFSET)
-   {
-       LWLockRelease(LockMgrLock);
-       return;
-   }
+       goto check_done;
  
  #ifdef LOCK_DEBUG
     if (Debug_deadlocks)
@@ -954,16 +967,19 @@ CheckDeadLock(void)
     if (!DeadLockCheck(MyProc))
     {
         /* No deadlock, so keep waiting */
-       LWLockRelease(LockMgrLock);
-       return;
+       goto check_done;
     }
  
     /*
      * Oops.  We have a deadlock.
      *
-    * Get this process out of wait state.
+    * Get this process out of wait state.  (Note: we could do this more
+    * efficiently by relying on lockAwaited, but use this coding to preserve
+    * the flexibility to kill some other transaction than the one detecting
+    * the deadlock.)
      */
-   RemoveFromWaitQueue(MyProc);
+   Assert(MyProc->waitLock != NULL);
+   RemoveFromWaitQueue(MyProc, LockTagToPartition(&(MyProc->waitLock->tag)));
  
     /*
      * Set MyProc->waitStatus to STATUS_ERROR so that ProcSleep will report an
@@ -987,7 +1003,15 @@ CheckDeadLock(void)
      * them anymore.  However, RemoveFromWaitQueue took care of waking up any
      * such processes.
      */
-   LWLockRelease(LockMgrLock);
+
+   /*
+    * Release locks acquired at head of routine.  Order is not critical,
+    * so do it back-to-front to avoid waking another CheckDeadLock instance
+    * before it can get all the locks.
+    */
+check_done:
+   for (i = NUM_LOCK_PARTITIONS; --i >= 0; )
+       LWLockRelease(FirstLockMgrLock + i);
  }
  
  
diff --git a/src/include/storage/lock.h b/src/include/storage/lock.h

index e289632054cc6fe70ad83454866005a3b73dfb24..9af03fb4742785828c8b2ea11153f9e8c2f7c47f 100644 (file)
--- a/src/include/storage/lock.h
+++ b/src/include/storage/lock.h
@@ -7,7 +7,7 @@
   * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
- * $PostgreSQL: pgsql/src/include/storage/lock.h,v 1.92 2005/12/09 01:22:04 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/storage/lock.h,v 1.93 2005/12/11 21:02:18 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -19,6 +19,13 @@
  #include "storage/shmem.h"
  
  
+/*
+ * Number of partitions the shared lock tables are divided into.
+ *
+ * See LockTagToPartition() if you change this.
+ */
+#define NUM_LOCK_PARTITIONS  16
+
  /* originally in procq.h */
  typedef struct PROC_QUEUE
  {
@@ -348,6 +355,7 @@ typedef struct LOCALLOCK
     LOCK       *lock;           /* associated LOCK object in shared mem */
     PROCLOCK   *proclock;       /* associated PROCLOCK object in shmem */
     bool        isTempObject;   /* true if lock is on a temporary object */
+   int         partition;      /* ID of partition containing this lock */
     int         nLocks;         /* total number of times lock is held */
     int         numLockOwners;  /* # of relevant ResourceOwners */
     int         maxLockOwners;  /* allocated size of array */
@@ -389,6 +397,7 @@ typedef enum
   */
  extern void InitLocks(void);
  extern LockMethod GetLocksMethodTable(const LOCK *lock);
+extern int LockTagToPartition(const LOCKTAG *locktag);
  extern LockAcquireResult LockAcquire(const LOCKTAG *locktag,
             bool isTempObject,
             LOCKMODE lockmode,
@@ -406,7 +415,7 @@ extern int LockCheckConflicts(LockMethod lockMethodTable,
                    LOCK *lock, PROCLOCK *proclock, PGPROC *proc);
  extern void GrantLock(LOCK *lock, PROCLOCK *proclock, LOCKMODE lockmode);
  extern void GrantAwaitedLock(void);
-extern void RemoveFromWaitQueue(PGPROC *proc);
+extern void RemoveFromWaitQueue(PGPROC *proc, int partition);
  extern Size LockShmemSize(void);
  extern bool DeadLockCheck(PGPROC *proc);
  extern void DeadLockReport(void);
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h

index 4291e0b2e747b9fc3b4682e63a325f9bf423e7ed..c318e60b5771fbeb35845dad0d0eea98226278f4 100644 (file)
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -7,7 +7,7 @@
   * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
- * $PostgreSQL: pgsql/src/include/storage/lwlock.h,v 1.23 2005/10/15 02:49:46 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/storage/lwlock.h,v 1.24 2005/12/11 21:02:18 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -16,9 +16,9 @@
  
  /*
   * We have a number of predefined LWLocks, plus a bunch of LWLocks that are
- * dynamically assigned (for shared buffers).  The LWLock structures live
- * in shared memory (since they contain shared data) and are identified by
- * values of this enumerated type. We abuse the notion of an enum somewhat
+ * dynamically assigned (e.g., for shared buffers).  The LWLock structures
+ * live in shared memory (since they contain shared data) and are identified
+ * by values of this enumerated type.  We abuse the notion of an enum somewhat
   * by allowing values not listed in the enum declaration to be assigned.
   * The extra value MaxDynamicLWLock is there to keep the compiler from
   * deciding that the enum can be represented as char or short ...
@@ -27,7 +27,6 @@ typedef enum LWLockId
  {
     BufMappingLock,
     BufFreelistLock,
-   LockMgrLock,
     OidGenLock,
     XidGenLock,
     ProcArrayLock,
@@ -46,8 +45,7 @@ typedef enum LWLockId
     RelCacheInitLock,
     BgWriterCommLock,
     TwoPhaseStateLock,
-
-   NumFixedLWLocks,            /* must be last except for MaxDynamicLWLock */
+   FirstLockMgrLock,           /* must be last except for MaxDynamicLWLock */
  
     MaxDynamicLWLock = 1000000000
  } LWLockId;
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h

index 4cba391048eb582b803e5d1e02081ea12ace211e..2cfee41eff91e85f6767aa8098402e36ea69148a 100644 (file)
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -7,7 +7,7 @@
   * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
- * $PostgreSQL: pgsql/src/include/storage/proc.h,v 1.84 2005/10/15 02:49:46 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/storage/proc.h,v 1.85 2005/12/11 21:02:18 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -52,7 +52,8 @@ struct XidCache
   * so that the prepared transactions appear to be still running and are
   * correctly shown as holding locks.  A prepared transaction PGPROC can be
   * distinguished from a real one at need by the fact that it has pid == 0.
- * The semaphore and lock-related fields in a prepared-xact PGPROC are unused.
+ * The semaphore and lock-activity fields in a prepared-xact PGPROC are unused,
+ * but its myProcLocks[] lists are valid.
   */
  struct PGPROC
  {
@@ -86,8 +87,12 @@ struct PGPROC
     LOCKMASK    heldLocks;      /* bitmask for lock types already held on this
                                  * lock object by this backend */
  
-   SHM_QUEUE   procLocks;      /* list of PROCLOCK objects for locks held or
-                                * awaited by this backend */
+   /*
+    * All PROCLOCK objects for locks held or awaited by this backend are
+    * linked into one of these lists, according to the partition number of
+    * their lock.
+    */
+   SHM_QUEUE   myProcLocks[NUM_LOCK_PARTITIONS];
  
     struct XidCache subxids;    /* cache for subtransaction XIDs */
  };
@@ -99,7 +104,7 @@ extern DLLIMPORT PGPROC *MyProc;
  
  
  /*
- * There is one ProcGlobal struct for the whole installation.
+ * There is one ProcGlobal struct for the whole database cluster.
   */
  typedef struct PROC_HDR
  {
@@ -134,8 +139,7 @@ extern bool HaveNFreeProcs(int n);
  extern void ProcReleaseLocks(bool isCommit);
  
  extern void ProcQueueInit(PROC_QUEUE *queue);
-extern int ProcSleep(LockMethod lockMethodTable, LOCKMODE lockmode,
-         LOCK *lock, PROCLOCK *proclock);
+extern int ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable);
  extern PGPROC *ProcWakeup(PGPROC *proc, int waitStatus);
  extern void ProcLockWakeup(LockMethod lockMethodTable, LOCK *lock);
  extern bool LockWaitCancel(void);
author	Tom Lane
	Sun, 11 Dec 2005 21:02:18 +0000 (21:02 +0000)
committer	Tom Lane
	Sun, 11 Dec 2005 21:02:18 +0000 (21:02 +0000)
src/backend/access/transam/twophase.c		patch \| blob \| blame \| history
src/backend/storage/ipc/procarray.c		patch \| blob \| blame \| history
src/backend/storage/lmgr/README		patch \| blob \| blame \| history
src/backend/storage/lmgr/deadlock.c		patch \| blob \| blame \| history
src/backend/storage/lmgr/lock.c		patch \| blob \| blame \| history
src/backend/storage/lmgr/lwlock.c		patch \| blob \| blame \| history
src/backend/storage/lmgr/proc.c		patch \| blob \| blame \| history
src/include/storage/lock.h		patch \| blob \| blame \| history
src/include/storage/lwlock.h		patch \| blob \| blame \| history
src/include/storage/proc.h		patch \| blob \| blame \| history