Cache hash index's metapage in rel->rd_amcache.

author Robert Haas

Tue, 7 Feb 2017 17:24:25 +0000 (12:24 -0500)

committer Robert Haas

Tue, 7 Feb 2017 17:35:45 +0000 (12:35 -0500)
author Robert Haas
Tue, 7 Feb 2017 17:24:25 +0000 (12:24 -0500)
committer Robert Haas
Tue, 7 Feb 2017 17:35:45 +0000 (12:35 -0500)
diff --git a/contrib/pageinspect/expected/hash.out b/contrib/pageinspect/expected/hash.out

index 31b67999fc8ac2aaa7174ca2e0bbfb1db533b4b5..7eb1537b29ec2e2cccca499269d56e6a205770a5 100644 (file)
--- a/contrib/pageinspect/expected/hash.out
+++ b/contrib/pageinspect/expected/hash.out
@@ -98,7 +98,7 @@ hash_page_stats(get_raw_page('test_hash_a_idx', 1));
  live_items      | 0
  dead_items      | 0
  page_size       | 8192
-hasho_prevblkno | 4294967295
+hasho_prevblkno | 3
  hasho_nextblkno | 4294967295
  hasho_bucket    | 0
  hasho_flag      | 2
@@ -111,7 +111,7 @@ hash_page_stats(get_raw_page('test_hash_a_idx', 2));
  live_items      | 0
  dead_items      | 0
  page_size       | 8192
-hasho_prevblkno | 4294967295
+hasho_prevblkno | 3
  hasho_nextblkno | 4294967295
  hasho_bucket    | 1
  hasho_flag      | 2
@@ -124,7 +124,7 @@ hash_page_stats(get_raw_page('test_hash_a_idx', 3));
  live_items      | 1
  dead_items      | 0
  page_size       | 8192
-hasho_prevblkno | 4294967295
+hasho_prevblkno | 3
  hasho_nextblkno | 4294967295
  hasho_bucket    | 2
  hasho_flag      | 2
@@ -137,7 +137,7 @@ hash_page_stats(get_raw_page('test_hash_a_idx', 4));
  live_items      | 0
  dead_items      | 0
  page_size       | 8192
-hasho_prevblkno | 4294967295
+hasho_prevblkno | 3
  hasho_nextblkno | 4294967295
  hasho_bucket    | 3
  hasho_flag      | 2
diff --git a/doc/src/sgml/pageinspect.sgml b/doc/src/sgml/pageinspect.sgml

index 4c201e75b0de33bb5ca36ec568b159daf7ac7a8c..5e6712f9cdee7c4943a64157b97bdc9423c20df9 100644 (file)
--- a/doc/src/sgml/pageinspect.sgml
+++ b/doc/src/sgml/pageinspect.sgml
@@ -539,7 +539,7 @@ live_items      | 407
  dead_items      | 0
  page_size       | 8192
  free_size       | 8
-hasho_prevblkno | 4294967295
+hasho_prevblkno | 4096
  hasho_nextblkno | 8474
  hasho_bucket    | 0
  hasho_flag      | 66
diff --git a/src/backend/access/hash/README b/src/backend/access/hash/README

index 01ea115f4d4791c3ea2b4d8f725ca69da4f4b93c..703ae982071e4f55041891e9f90e27171583bd1b 100644 (file)
--- a/src/backend/access/hash/README
+++ b/src/backend/access/hash/README
@@ -149,6 +149,50 @@ We choose to always lock the lower-numbered bucket first.  The metapage is
  only ever locked after all bucket locks have been taken.
  
  
+Metapage Caching
+----------------
+
+Both scanning the index and inserting tuples require locating the bucket
+where a given tuple ought to be located.  To do this, we need the bucket
+count, highmask, and lowmask from the metapage; however, it's undesirable
+for performance reasons to have to lock and pin the metapage for
+every such operation.  Instead, we retain a cached copy of the metapage
+in each backend's relcache entry.  This will produce the correct
+bucket mapping as long as the target bucket hasn't been split since the
+last cache refresh.
+
+To guard against the possibility that such a split has occurred, the
+primary page of each bucket chain stores the number of buckets that
+existed as of the time the bucket was last split, or if never split as
+of the time it was created, in the space normally used for the
+previous block number (that is, hasho_prevblkno).  This doesn't cost
+anything because the primary bucket page is always the first page in
+the chain, and the previous block number is therefore always, in
+reality, InvalidBlockNumber.
+
+After computing the ostensibly-correct bucket number based on our cached
+copy of the metapage, we lock the corresponding primary bucket page and
+check whether the bucket count stored in hasho_prevblkno is greater than
+the number of buckets stored in our cached copy of the metapage.  If
+so, the bucket has certainly been split, because the count must originally
+have been less than the number of buckets that existed at that time and
+can't have increased except due to a split.  If not, the bucket can't have
+been split, because a split would have created a new bucket with a higher
+bucket number than any we'd seen previously.  In the latter case, we've
+locked the correct bucket and can proceed; in the former case, we must
+release the lock on this bucket, lock the metapage, update our cache,
+unlock the metapage, and retry.
+
+Needing to retry occasionally might seem expensive, but the number of times
+any given bucket can be split is limited to a few dozen no matter how
+many times the hash index is accessed, because the total number of
+buckets is limited to less than 2^32.  On the other hand, the number of
+times we access a bucket is unbounded and will be several orders of
+magnitude larger even in unsympathetic cases.
+
+(The metapage cache is new in v10.  Older hash indexes had the primary
+bucket page's hasho_prevblkno initialized to InvalidBlockNumber.)
+
  Pseudocode Algorithms
  ---------------------
  
@@ -188,17 +232,7 @@ track of available overflow pages.
  
  The reader algorithm is:
  
-   pin meta page and take buffer content lock in shared mode
-   loop:
-       compute bucket number for target hash key
-       release meta page buffer content lock
-       if (correct bucket page is already locked)
-           break
-       release any existing bucket page buffer content lock (if a concurrent
-         split happened)
-       take the buffer content lock on bucket page in shared mode
-       retake meta page buffer content lock in shared mode
-   release pin on metapage
+    lock the primary bucket page of the target bucket
     if the target bucket is still being populated by a split:
         release the buffer content lock on current bucket page
         pin and acquire the buffer content lock on old bucket in shared mode
@@ -238,17 +272,7 @@ which this bucket is formed by split.
  
  The insertion algorithm is rather similar:
  
-   pin meta page and take buffer content lock in shared mode
-   loop:
-       compute bucket number for target hash key
-       release meta page buffer content lock
-       if (correct bucket page is already locked)
-           break
-       release any existing bucket page buffer content lock (if a concurrent
-         split happened)
-       take the buffer content lock on bucket page in exclusive mode
-       retake meta page buffer content lock in shared mode
-   release pin on metapage
+    lock the primary bucket page of the target bucket
  -- (so far same as reader, except for acquisition of buffer content lock in
     exclusive mode on primary bucket page)
     if the bucket-being-split flag is set for a bucket and pin count on it is
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c

index ec8ed33c7087e22f90973d713b1a2daaf96160d5..97ad22aa6f3aa88e341962a14c9e2353a7545a7f 100644 (file)
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -507,28 +507,24 @@ hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
     Bucket      orig_maxbucket;
     Bucket      cur_maxbucket;
     Bucket      cur_bucket;
-   Buffer      metabuf;
+   Buffer      metabuf = InvalidBuffer;
     HashMetaPage metap;
-   HashMetaPageData local_metapage;
+   HashMetaPage cachedmetap;
  
     tuples_removed = 0;
     num_index_tuples = 0;
  
     /*
-    * Read the metapage to fetch original bucket and tuple counts.  Also, we
-    * keep a copy of the last-seen metapage so that we can use its
-    * hashm_spares[] values to compute bucket page addresses.  This is a bit
-    * hokey but perfectly safe, since the interesting entries in the spares
-    * array cannot change under us; and it beats rereading the metapage for
-    * each bucket.
+    * We need a copy of the metapage so that we can use its hashm_spares[]
+    * values to compute bucket page addresses, but a cached copy should be
+    * good enough.  (If not, we'll detect that further down and refresh the
+    * cache as necessary.)
      */
-   metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
-   metap = HashPageGetMeta(BufferGetPage(metabuf));
-   orig_maxbucket = metap->hashm_maxbucket;
-   orig_ntuples = metap->hashm_ntuples;
-   memcpy(&local_metapage, metap, sizeof(local_metapage));
-   /* release the lock, but keep pin */
-   LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
+   cachedmetap = _hash_getcachedmetap(rel, &metabuf, false);
+   Assert(cachedmetap != NULL);
+
+   orig_maxbucket = cachedmetap->hashm_maxbucket;
+   orig_ntuples = cachedmetap->hashm_ntuples;
  
     /* Scan the buckets that we know exist */
     cur_bucket = 0;
@@ -546,7 +542,7 @@ loop_top:
         bool        split_cleanup = false;
  
         /* Get address of bucket's start page */
-       bucket_blkno = BUCKET_TO_BLKNO(&local_metapage, cur_bucket);
+       bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket);
  
         blkno = bucket_blkno;
  
@@ -577,20 +573,27 @@ loop_top:
              * hashm_lowmask might be old enough to cause us to fail to remove
              * tuples left behind by the most recent split.  To prevent that,
              * now that the primary page of the target bucket has been locked
-            * (and thus can't be further split), update our cached metapage
-            * data.
+            * (and thus can't be further split), check whether we need to
+            * update our cached metapage data.
+            *
+            * NB: The check for InvalidBlockNumber is only needed for
+            * on-disk compatibility with indexes created before we started
+            * storing hashm_maxbucket in the primary page's hasho_prevblkno.
              */
-           LockBuffer(metabuf, BUFFER_LOCK_SHARE);
-           memcpy(&local_metapage, metap, sizeof(local_metapage));
-           LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
+           if (bucket_opaque->hasho_prevblkno != InvalidBlockNumber &&
+               bucket_opaque->hasho_prevblkno > cachedmetap->hashm_maxbucket)
+           {
+               cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
+               Assert(cachedmetap != NULL);
+           }
         }
  
         bucket_buf = buf;
  
         hashbucketcleanup(rel, cur_bucket, bucket_buf, blkno, info->strategy,
-                         local_metapage.hashm_maxbucket,
-                         local_metapage.hashm_highmask,
-                         local_metapage.hashm_lowmask, &tuples_removed,
+                         cachedmetap->hashm_maxbucket,
+                         cachedmetap->hashm_highmask,
+                         cachedmetap->hashm_lowmask, &tuples_removed,
                           &num_index_tuples, split_cleanup,
                           callback, callback_state);
  
@@ -600,6 +603,9 @@ loop_top:
         cur_bucket++;
     }
  
+   if (BufferIsInvalid(metabuf))
+       metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);
+
     /* Write-lock metapage and check for split since we started */
     LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
     metap = HashPageGetMeta(BufferGetPage(metabuf));
@@ -607,9 +613,10 @@ loop_top:
     if (cur_maxbucket != metap->hashm_maxbucket)
     {
         /* There's been a split, so process the additional bucket(s) */
-       cur_maxbucket = metap->hashm_maxbucket;
-       memcpy(&local_metapage, metap, sizeof(local_metapage));
         LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
+       cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
+       Assert(cachedmetap != NULL);
+       cur_maxbucket = cachedmetap->hashm_maxbucket;
         goto loop_top;
     }
  
diff --git a/src/backend/access/hash/hashinsert.c b/src/backend/access/hash/hashinsert.c

index 39c70d3a80f42f02336ccd2d6d980b5ecf31d2f5..dc63063ac1fbdf372ede7caaaa7d467bb9216514 100644 (file)
--- a/src/backend/access/hash/hashinsert.c
+++ b/src/backend/access/hash/hashinsert.c
@@ -32,9 +32,7 @@ _hash_doinsert(Relation rel, IndexTuple itup)
     Buffer      bucket_buf;
     Buffer      metabuf;
     HashMetaPage metap;
-   BlockNumber blkno;
-   BlockNumber oldblkno;
-   bool        retry;
+   HashMetaPage usedmetap = NULL;
     Page        metapage;
     Page        page;
     HashPageOpaque pageopaque;
@@ -42,9 +40,6 @@ _hash_doinsert(Relation rel, IndexTuple itup)
     bool        do_expand;
     uint32      hashkey;
     Bucket      bucket;
-   uint32      maxbucket;
-   uint32      highmask;
-   uint32      lowmask;
  
     /*
      * Get the hash key for the item (it's stored in the index tuple itself).
@@ -57,10 +52,14 @@ _hash_doinsert(Relation rel, IndexTuple itup)
                                  * need to be consistent */
  
  restart_insert:
-   /* Read the metapage */
-   metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
+
+   /*
+    * Read the metapage.  We don't lock it yet; HashMaxItemSize() will
+    * examine pd_pagesize_version, but that can't change so we can examine
+    * it without a lock.
+    */
+   metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);
     metapage = BufferGetPage(metabuf);
-   metap = HashPageGetMeta(metapage);
  
     /*
      * Check whether the item can fit on a hash page at all. (Eventually, we
@@ -76,66 +75,17 @@ restart_insert:
                         itemsz, HashMaxItemSize(metapage)),
             errhint("Values larger than a buffer page cannot be indexed.")));
  
-   oldblkno = InvalidBlockNumber;
-   retry = false;
-
-   /*
-    * Loop until we get a lock on the correct target bucket.
-    */
-   for (;;)
-   {
-       /*
-        * Compute the target bucket number, and convert to block number.
-        */
-       bucket = _hash_hashkey2bucket(hashkey,
-                                     metap->hashm_maxbucket,
-                                     metap->hashm_highmask,
-                                     metap->hashm_lowmask);
-
-       blkno = BUCKET_TO_BLKNO(metap, bucket);
-
-       /*
-        * Copy bucket mapping info now; refer the comment in
-        * _hash_expandtable where we copy this information before calling
-        * _hash_splitbucket to see why this is okay.
-        */
-       maxbucket = metap->hashm_maxbucket;
-       highmask = metap->hashm_highmask;
-       lowmask = metap->hashm_lowmask;
-
-       /* Release metapage lock, but keep pin. */
-       LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
-
-       /*
-        * If the previous iteration of this loop locked the primary page of
-        * what is still the correct target bucket, we are done.  Otherwise,
-        * drop any old lock before acquiring the new one.
-        */
-       if (retry)
-       {
-           if (oldblkno == blkno)
-               break;
-           _hash_relbuf(rel, buf);
-       }
-
-       /* Fetch and lock the primary bucket page for the target bucket */
-       buf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BUCKET_PAGE);
-
-       /*
-        * Reacquire metapage lock and check that no bucket split has taken
-        * place while we were awaiting the bucket lock.
-        */
-       LockBuffer(metabuf, BUFFER_LOCK_SHARE);
-       oldblkno = blkno;
-       retry = true;
-   }
+   /* Lock the primary bucket page for the target bucket. */
+   buf = _hash_getbucketbuf_from_hashkey(rel, hashkey, HASH_WRITE,
+                                         &usedmetap);
+   Assert(usedmetap != NULL);
  
     /* remember the primary bucket buffer to release the pin on it at end. */
     bucket_buf = buf;
  
     page = BufferGetPage(buf);
     pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
-   Assert(pageopaque->hasho_bucket == bucket);
+   bucket = pageopaque->hasho_bucket;
  
     /*
      * If this bucket is in the process of being split, try to finish the
@@ -151,8 +101,10 @@ restart_insert:
         /* release the lock on bucket buffer, before completing the split. */
         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
  
-       _hash_finish_split(rel, metabuf, buf, pageopaque->hasho_bucket,
-                          maxbucket, highmask, lowmask);
+       _hash_finish_split(rel, metabuf, buf, bucket,
+                          usedmetap->hashm_maxbucket,
+                          usedmetap->hashm_highmask,
+                          usedmetap->hashm_lowmask);
  
         /* release the pin on old and meta buffer.  retry for insert. */
         _hash_dropbuf(rel, buf);
@@ -225,6 +177,7 @@ restart_insert:
      */
     LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
  
+   metap = HashPageGetMeta(metapage);
     metap->hashm_ntuples += 1;
  
     /* Make sure this stays in sync with _hash_expandtable() */
diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c

index 69676eba95389675562753575b300b1071741fe4..d52f149389b3c1e666c64477b39c3dc71b7d22a2 100644 (file)
--- a/src/backend/access/hash/hashpage.c
+++ b/src/backend/access/hash/hashpage.c
@@ -434,7 +434,13 @@ _hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum)
         buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i), forkNum);
         pg = BufferGetPage(buf);
         pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
-       pageopaque->hasho_prevblkno = InvalidBlockNumber;
+
+       /*
+        * Set hasho_prevblkno with current hashm_maxbucket. This value will
+        * be used to validate cached HashMetaPageData. See
+        * _hash_getbucketbuf_from_hashkey().
+        */
+       pageopaque->hasho_prevblkno = metap->hashm_maxbucket;
         pageopaque->hasho_nextblkno = InvalidBlockNumber;
         pageopaque->hasho_bucket = i;
         pageopaque->hasho_flag = LH_BUCKET_PAGE;
@@ -840,10 +846,14 @@ _hash_splitbucket(Relation rel,
     oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
  
     /*
-    * Mark the old bucket to indicate that split is in progress.  At
-    * operation end, we clear split-in-progress flag.
+    * Mark the old bucket to indicate that split is in progress.  (At
+    * operation end, we will clear the split-in-progress flag.)  Also,
+    * for a primary bucket page, hasho_prevblkno stores the number of
+    * buckets that existed as of the last split, so we must update that
+    * value here.
      */
     oopaque->hasho_flag |= LH_BUCKET_BEING_SPLIT;
+   oopaque->hasho_prevblkno = maxbucket;
  
     npage = BufferGetPage(nbuf);
  
@@ -852,7 +862,7 @@ _hash_splitbucket(Relation rel,
      * split is in progress.
      */
     nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
-   nopaque->hasho_prevblkno = InvalidBlockNumber;
+   nopaque->hasho_prevblkno = maxbucket;
     nopaque->hasho_nextblkno = InvalidBlockNumber;
     nopaque->hasho_bucket = nbucket;
     nopaque->hasho_flag = LH_BUCKET_PAGE | LH_BUCKET_BEING_POPULATED;
@@ -1191,3 +1201,136 @@ _hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf, Bucket obucket,
     LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
     hash_destroy(tidhtab);
  }
+
+/*
+ * _hash_getcachedmetap() -- Returns cached metapage data.
+ *
+ * If metabuf is not InvalidBuffer, caller must hold a pin, but no lock, on
+ *  the metapage.  If not set, we'll set it before returning if we have to
+ *  refresh the cache, and return with a pin but no lock on it; caller is
+ *  responsible for releasing the pin.
+ *
+ *  We refresh the cache if it's not initialized yet or force_refresh is true.
+ */
+HashMetaPage
+_hash_getcachedmetap(Relation rel, Buffer *metabuf, bool force_refresh)
+{
+   Page        page;
+
+   Assert(metabuf);
+   if (force_refresh || rel->rd_amcache == NULL)
+   {
+       char   *cache;
+
+       /*
+        * It's important that we don't set rd_amcache to an invalid
+        * value.  Either MemoryContextAlloc or _hash_getbuf could fail,
+        * so don't install a pointer to the newly-allocated storage in the
+        * actual relcache entry until both have succeeded.
+        */
+       if (rel->rd_amcache == NULL)
+           cache = MemoryContextAlloc(rel->rd_indexcxt,
+                                      sizeof(HashMetaPageData));
+
+       /* Read the metapage. */
+       if (BufferIsValid(*metabuf))
+           LockBuffer(*metabuf, BUFFER_LOCK_SHARE);
+       else
+           *metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ,
+                                   LH_META_PAGE);
+       page = BufferGetPage(*metabuf);
+
+       /* Populate the cache. */
+       if (rel->rd_amcache == NULL)
+           rel->rd_amcache = cache;
+       memcpy(rel->rd_amcache, HashPageGetMeta(page),
+              sizeof(HashMetaPageData));
+
+       /* Release metapage lock, but keep the pin. */
+       LockBuffer(*metabuf, BUFFER_LOCK_UNLOCK);
+   }
+
+   return (HashMetaPage) rel->rd_amcache;
+}
+
+/*
+ * _hash_getbucketbuf_from_hashkey() -- Get the bucket's buffer for the given
+ *                                      hashkey.
+ *
+ * Bucket pages do not move or get removed once they are allocated. This gives
+ * us an opportunity to use the previously saved metapage contents to reach
+ * the target bucket buffer, instead of reading from the metapage every time.
+ * This saves one buffer access every time we want to reach the target bucket
+ *  buffer, which is very helpful savings in bufmgr traffic and contention.
+ *
+ * The access type parameter (HASH_READ or HASH_WRITE) indicates whether the
+ * bucket buffer has to be locked for reading or writing.
+ *
+ * The out parameter cachedmetap is set with metapage contents used for
+ * hashkey to bucket buffer mapping. Some callers need this info to reach the
+ * old bucket in case of bucket split, see _hash_doinsert().
+ */
+Buffer
+_hash_getbucketbuf_from_hashkey(Relation rel, uint32 hashkey, int access,
+                               HashMetaPage *cachedmetap)
+{
+   HashMetaPage metap;
+   Buffer      buf;
+   Buffer      metabuf = InvalidBuffer;
+   Page        page;
+   Bucket      bucket;
+   BlockNumber blkno;
+   HashPageOpaque opaque;
+
+   /* We read from target bucket buffer, hence locking is a must. */
+   Assert(access == HASH_READ || access == HASH_WRITE);
+
+   metap = _hash_getcachedmetap(rel, &metabuf, false);
+   Assert(metap != NULL);
+
+   /*
+    * Loop until we get a lock on the correct target bucket.
+    */
+   for (;;)
+   {
+       /*
+        * Compute the target bucket number, and convert to block number.
+        */
+       bucket = _hash_hashkey2bucket(hashkey,
+                                     metap->hashm_maxbucket,
+                                     metap->hashm_highmask,
+                                     metap->hashm_lowmask);
+
+       blkno = BUCKET_TO_BLKNO(metap, bucket);
+
+       /* Fetch the primary bucket page for the bucket */
+       buf = _hash_getbuf(rel, blkno, access, LH_BUCKET_PAGE);
+       page = BufferGetPage(buf);
+       opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+       Assert(opaque->hasho_bucket == bucket);
+
+       /*
+        * If this bucket hasn't been split, we're done.
+        *
+        * NB: The check for InvalidBlockNumber is only needed for on-disk
+        * compatibility with indexes created before we started storing
+        * hashm_maxbucket in the primary page's hasho_prevblkno.
+        */
+       if (opaque->hasho_prevblkno == InvalidBlockNumber ||
+           opaque->hasho_prevblkno <= metap->hashm_maxbucket)
+           break;
+
+       /* Drop lock on this buffer, update cached metapage, and retry. */
+       _hash_relbuf(rel, buf);
+       metap = _hash_getcachedmetap(rel, &metabuf, true);
+       Assert(metap != NULL);
+   }
+
+   if (BufferIsValid(metabuf))
+       _hash_dropbuf(rel, metabuf);
+
+   if (cachedmetap)
+       *cachedmetap = metap;
+
+   return buf;
+}
diff --git a/src/backend/access/hash/hashsearch.c b/src/backend/access/hash/hashsearch.c

index a59ad6ff7079f2af0e491ceff37849e99fc79c91..9e5d7e4babe09317d3377d75a471ba4fc06b8e21 100644 (file)
--- a/src/backend/access/hash/hashsearch.c
+++ b/src/backend/access/hash/hashsearch.c
@@ -139,6 +139,7 @@ _hash_readprev(IndexScanDesc scan,
     BlockNumber blkno;
     Relation    rel = scan->indexRelation;
     HashScanOpaque so = (HashScanOpaque) scan->opaque;
+   bool        haveprevblk;
  
     blkno = (*opaquep)->hasho_prevblkno;
  
@@ -147,15 +148,23 @@ _hash_readprev(IndexScanDesc scan,
      * comments in _hash_first to know the reason of retaining pin.
      */
     if (*bufp == so->hashso_bucket_buf || *bufp == so->hashso_split_bucket_buf)
+   {
         LockBuffer(*bufp, BUFFER_LOCK_UNLOCK);
+       haveprevblk = false;
+   }
     else
+   {
         _hash_relbuf(rel, *bufp);
+       haveprevblk = true;
+   }
  
     *bufp = InvalidBuffer;
     /* check for interrupts while we're not holding any buffer lock */
     CHECK_FOR_INTERRUPTS();
-   if (BlockNumberIsValid(blkno))
+
+   if (haveprevblk)
     {
+       Assert(BlockNumberIsValid(blkno));
         *bufp = _hash_getbuf(rel, blkno, HASH_READ,
                              LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
         *pagep = BufferGetPage(*bufp);
@@ -215,14 +224,9 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
     ScanKey     cur;
     uint32      hashkey;
     Bucket      bucket;
-   BlockNumber blkno;
-   BlockNumber oldblkno = InvalidBuffer;
-   bool        retry = false;
     Buffer      buf;
-   Buffer      metabuf;
     Page        page;
     HashPageOpaque opaque;
-   HashMetaPage metap;
     IndexTuple  itup;
     ItemPointer current;
     OffsetNumber offnum;
@@ -277,59 +281,10 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
  
     so->hashso_sk_hash = hashkey;
  
-   /* Read the metapage */
-   metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
-   page = BufferGetPage(metabuf);
-   metap = HashPageGetMeta(page);
-
-   /*
-    * Loop until we get a lock on the correct target bucket.
-    */
-   for (;;)
-   {
-       /*
-        * Compute the target bucket number, and convert to block number.
-        */
-       bucket = _hash_hashkey2bucket(hashkey,
-                                     metap->hashm_maxbucket,
-                                     metap->hashm_highmask,
-                                     metap->hashm_lowmask);
-
-       blkno = BUCKET_TO_BLKNO(metap, bucket);
-
-       /* Release metapage lock, but keep pin. */
-       LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
-
-       /*
-        * If the previous iteration of this loop locked what is still the
-        * correct target bucket, we are done.  Otherwise, drop any old lock
-        * and lock what now appears to be the correct bucket.
-        */
-       if (retry)
-       {
-           if (oldblkno == blkno)
-               break;
-           _hash_relbuf(rel, buf);
-       }
-
-       /* Fetch the primary bucket page for the bucket */
-       buf = _hash_getbuf(rel, blkno, HASH_READ, LH_BUCKET_PAGE);
-
-       /*
-        * Reacquire metapage lock and check that no bucket split has taken
-        * place while we were awaiting the bucket lock.
-        */
-       LockBuffer(metabuf, BUFFER_LOCK_SHARE);
-       oldblkno = blkno;
-       retry = true;
-   }
-
-   /* done with the metapage */
-   _hash_dropbuf(rel, metabuf);
-
+   buf = _hash_getbucketbuf_from_hashkey(rel, hashkey, HASH_READ, NULL);
     page = BufferGetPage(buf);
     opaque = (HashPageOpaque) PageGetSpecialPointer(page);
-   Assert(opaque->hasho_bucket == bucket);
+   bucket = opaque->hasho_bucket;
  
     so->hashso_bucket_buf = buf;
  
diff --git a/src/include/access/hash.h b/src/include/access/hash.h

index 1a9b91f9f53dc64db64e2c39d1ea51c684be763f..c0455851f466d3361804f85aa6fe4c40f62589df 100644 (file)
--- a/src/include/access/hash.h
+++ b/src/include/access/hash.h
@@ -61,10 +61,21 @@ typedef uint32 Bucket;
  #define LH_PAGE_TYPE \
     (LH_OVERFLOW_PAGE|LH_BUCKET_PAGE|LH_BITMAP_PAGE|LH_META_PAGE)
  
+/*
+ * In an overflow page, hasho_prevblkno stores the block number of the previous
+ * page in the bucket chain; in a bucket page, hasho_prevblkno stores the
+ * hashm_maxbucket value as of the last time the bucket was split, or
+ * else as of the time the bucket was created.  The latter convention is used
+ * to determine whether a cached copy of the metapage is too stale to be used
+ * without needing to lock or pin the metapage.
+ *
+ * hasho_nextblkno is always the block number of the next page in the
+ * bucket chain, or InvalidBlockNumber if there are no more such pages.
+ */
  typedef struct HashPageOpaqueData
  {
-   BlockNumber hasho_prevblkno;    /* previous ovfl (or bucket) blkno */
-   BlockNumber hasho_nextblkno;    /* next ovfl blkno */
+   BlockNumber hasho_prevblkno;    /* see above */
+   BlockNumber hasho_nextblkno;    /* see above */
     Bucket      hasho_bucket;   /* bucket number this pg belongs to */
     uint16      hasho_flag;     /* page type code, see above */
     uint16      hasho_page_id;  /* for identification of hash indexes */
@@ -309,6 +320,11 @@ extern Buffer _hash_getbuf(Relation rel, BlockNumber blkno,
              int access, int flags);
  extern Buffer _hash_getbuf_with_condlock_cleanup(Relation rel,
                                    BlockNumber blkno, int flags);
+extern HashMetaPage _hash_getcachedmetap(Relation rel, Buffer *metabuf,
+                    bool force_refresh);
+extern Buffer _hash_getbucketbuf_from_hashkey(Relation rel, uint32 hashkey,
+                               int access,
+                               HashMetaPage *cachedmetap);
  extern Buffer _hash_getinitbuf(Relation rel, BlockNumber blkno);
  extern Buffer _hash_getnewbuf(Relation rel, BlockNumber blkno,
                 ForkNumber forkNum);
author	Robert Haas
	Tue, 7 Feb 2017 17:24:25 +0000 (12:24 -0500)
committer	Robert Haas
	Tue, 7 Feb 2017 17:35:45 +0000 (12:35 -0500)
contrib/pageinspect/expected/hash.out		patch \| blob \| blame \| history
doc/src/sgml/pageinspect.sgml		patch \| blob \| blame \| history
src/backend/access/hash/README		patch \| blob \| blame \| history
src/backend/access/hash/hash.c		patch \| blob \| blame \| history
src/backend/access/hash/hashinsert.c		patch \| blob \| blame \| history
src/backend/access/hash/hashpage.c		patch \| blob \| blame \| history
src/backend/access/hash/hashsearch.c		patch \| blob \| blame \| history
src/include/access/hash.h		patch \| blob \| blame \| history