Check for conflicting queries during replay of gistvacuumpage()
author: Alexander Korotkov
Thu, 20 Dec 2018 23:37:37 +0000 (02:37 +0300)
committer: Alexander Korotkov
Thu, 20 Dec 2018 23:37:37 +0000 (02:37 +0300)
013ebc0a7b implements so-called GiST microvacuum.  That is gistgettuple() marks
index tuples as dead when kill_prior_tuple is set.  Later, when new tuple
insertion claims page space, those dead index tuples are physically deleted
from page.  When this deletion is replayed on standby, it might conflict with
read-only queries.  But 013ebc0a7b doesn't handle this.  That may lead to
disappearance of some tuples from read-only snapshots on standby.

This commit implements resolving of conflicts between replay of GiST microvacuum
and standby queries.  On the master we implement new WAL record type
XLOG_GIST_DELETE, which comprises necessary information.  On stable releases
we have to be tricky to keep WAL compatibility.  Information required for conflict
processing is just appended to data of XLOG_GIST_PAGE_UPDATE record.  So,
PostgreSQL version, which doesn't know about conflict processing, will just
ignore that.

Reported-by: Andres Freund
Diagnosed-by: Andres Freund
Discussion: https://postgr.es/m/20181212224524.scafnlyjindmrbe6%40alap3.anarazel.de
Author: Alexander Korotkov
Backpatch-through: 9.6

src/backend/access/gist/gist.c
src/backend/access/gist/gistbuild.c
src/backend/access/gist/gistxlog.c
src/backend/access/rmgrdesc/gistdesc.c
src/include/access/gist_private.h
src/include/access/gistxlog.h

index 8a42effdf7a49e61eb3aa44b7fe687cd9f5d8e5c..a2cb84800e8d4b56b65b3628433611aa84e68858 100644 (file)
@@ -38,7 +38,8 @@ static bool gistinserttuples(GISTInsertState *state, GISTInsertStack *stack,
                 bool unlockbuf, bool unlockleftchild);
 static void gistfinishsplit(GISTInsertState *state, GISTInsertStack *stack,
                GISTSTATE *giststate, List *splitinfo, bool releasebuf);
-static void gistvacuumpage(Relation rel, Page page, Buffer buffer);
+static void gistvacuumpage(Relation rel, Page page, Buffer buffer,
+              Relation heapRel);
 
 
 #define ROTATEDIST(d) do { \
@@ -172,7 +173,7 @@ gistinsert(Relation r, Datum *values, bool *isnull,
                         values, isnull, true /* size is currently bogus */ );
    itup->t_tid = *ht_ctid;
 
-   gistdoinsert(r, itup, 0, giststate);
+   gistdoinsert(r, itup, 0, giststate, heapRel);
 
    /* cleanup */
    MemoryContextSwitchTo(oldCxt);
@@ -218,7 +219,8 @@ gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
                BlockNumber *newblkno,
                Buffer leftchildbuf,
                List **splitinfo,
-               bool markfollowright)
+               bool markfollowright,
+               Relation heapRel)
 {
    BlockNumber blkno = BufferGetBlockNumber(buffer);
    Page        page = BufferGetPage(buffer);
@@ -259,7 +261,7 @@ gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
     */
    if (is_split && GistPageIsLeaf(page) && GistPageHasGarbage(page))
    {
-       gistvacuumpage(rel, page, buffer);
+       gistvacuumpage(rel, page, buffer, heapRel);
        is_split = gistnospace(page, itup, ntup, oldoffnum, freespace);
    }
 
@@ -604,7 +606,8 @@ gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
  * so it does not bother releasing palloc'd allocations.
  */
 void
-gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate)
+gistdoinsert(Relation r, IndexTuple itup, Size freespace,
+            GISTSTATE *giststate, Relation heapRel)
 {
    ItemId      iid;
    IndexTuple  idxtuple;
@@ -616,6 +619,7 @@ gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate)
    memset(&state, 0, sizeof(GISTInsertState));
    state.freespace = freespace;
    state.r = r;
+   state.heapRel = heapRel;
 
    /* Start from the root */
    firststack.blkno = GIST_ROOT_BLKNO;
@@ -1232,7 +1236,8 @@ gistinserttuples(GISTInsertState *state, GISTInsertStack *stack,
                               oldoffnum, NULL,
                               leftchild,
                               &splitinfo,
-                              true);
+                              true,
+                              state->heapRel);
 
    /*
     * Before recursing up in case the page was split, release locks on the
@@ -1543,7 +1548,7 @@ freeGISTstate(GISTSTATE *giststate)
  * Function assumes that buffer is exclusively locked.
  */
 static void
-gistvacuumpage(Relation rel, Page page, Buffer buffer)
+gistvacuumpage(Relation rel, Page page, Buffer buffer, Relation heapRel)
 {
    OffsetNumber deletable[MaxIndexTuplesPerPage];
    int         ndeletable = 0;
@@ -1589,9 +1594,9 @@ gistvacuumpage(Relation rel, Page page, Buffer buffer)
        {
            XLogRecPtr  recptr;
 
-           recptr = gistXLogUpdate(buffer,
+           recptr = gistXLogDelete(buffer,
                                    deletable, ndeletable,
-                                   NULL, 0, InvalidBuffer);
+                                   heapRel->rd_node);
 
            PageSetLSN(page, recptr);
        }
index 434f15f0148e0ff90dd131d36783edb405982960..b9c4e27e1a5ff881293595b93574fa68d5fe67c6 100644 (file)
@@ -56,6 +56,7 @@ typedef enum
 typedef struct
 {
    Relation    indexrel;
+   Relation    heaprel;
    GISTSTATE  *giststate;
 
    int64       indtuples;      /* number of tuples indexed */
@@ -122,6 +123,7 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo)
    int         fillfactor;
 
    buildstate.indexrel = index;
+   buildstate.heaprel = heap;
    if (index->rd_options)
    {
        /* Get buffering mode from the options string */
@@ -484,7 +486,7 @@ gistBuildCallback(Relation index,
         * locked, we call gistdoinsert directly.
         */
        gistdoinsert(index, itup, buildstate->freespace,
-                    buildstate->giststate);
+                    buildstate->giststate, buildstate->heaprel);
    }
 
    /* Update tuple count and total size. */
@@ -690,7 +692,8 @@ gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer, int level,
                               itup, ntup, oldoffnum, &placed_to_blk,
                               InvalidBuffer,
                               &splitinfo,
-                              false);
+                              false,
+                              buildstate->heaprel);
 
    /*
     * If this is a root split, update the root path item kept in memory. This
index 1e091269785b8ce9de6b3f57dbcea0cef3aec1d6..01e025d5fdb586ab9f3a0d329c2ee2776c715922 100644 (file)
 #include "access/bufmask.h"
 #include "access/gist_private.h"
 #include "access/gistxlog.h"
+#include "access/heapam_xlog.h"
+#include "access/transam.h"
 #include "access/xloginsert.h"
 #include "access/xlogutils.h"
+#include "miscadmin.h"
+#include "storage/procarray.h"
 #include "utils/memutils.h"
 
 static MemoryContext opCtx;        /* working memory for operations */
@@ -160,6 +164,210 @@ gistRedoPageUpdateRecord(XLogReaderState *record)
        UnlockReleaseBuffer(buffer);
 }
 
+/*
+ * Get the latestRemovedXid from the heap pages pointed at by the index
+ * tuples being deleted. See also btree_xlog_delete_get_latestRemovedXid,
+ * on which this function is based.
+ */
+static TransactionId
+gistRedoDeleteRecordGetLatestRemovedXid(XLogReaderState *record)
+{
+   gistxlogDelete *xlrec = (gistxlogDelete *) XLogRecGetData(record);
+   OffsetNumber *todelete;
+   Buffer      ibuffer,
+               hbuffer;
+   Page        ipage,
+               hpage;
+   RelFileNode rnode;
+   BlockNumber blkno;
+   ItemId      iitemid,
+               hitemid;
+   IndexTuple  itup;
+   HeapTupleHeader htuphdr;
+   BlockNumber hblkno;
+   OffsetNumber hoffnum;
+   TransactionId latestRemovedXid = InvalidTransactionId;
+   int         i;
+
+   /*
+    * If there's nothing running on the standby we don't need to derive a
+    * full latestRemovedXid value, so use a fast path out of here.  This
+    * returns InvalidTransactionId, and so will conflict with all HS
+    * transactions; but since we just worked out that that's zero people,
+    * it's OK.
+    *
+    * XXX There is a race condition here, which is that a new backend might
+    * start just after we look.  If so, it cannot need to conflict, but this
+    * coding will result in throwing a conflict anyway.
+    */
+   if (CountDBBackends(InvalidOid) == 0)
+       return latestRemovedXid;
+
+   /*
+    * In what follows, we have to examine the previous state of the index
+    * page, as well as the heap page(s) it points to.  This is only valid if
+    * WAL replay has reached a consistent database state; which means that
+    * the preceding check is not just an optimization, but is *necessary*. We
+    * won't have let in any user sessions before we reach consistency.
+    */
+   if (!reachedConsistency)
+       elog(PANIC, "gistRedoDeleteRecordGetLatestRemovedXid: cannot operate with inconsistent data");
+
+   /*
+    * Get index page.  If the DB is consistent, this should not fail, nor
+    * should any of the heap page fetches below.  If one does, we return
+    * InvalidTransactionId to cancel all HS transactions.  That's probably
+    * overkill, but it's safe, and certainly better than panicking here.
+    */
+   XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno);
+   ibuffer = XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno, RBM_NORMAL);
+   if (!BufferIsValid(ibuffer))
+       return InvalidTransactionId;
+   LockBuffer(ibuffer, BUFFER_LOCK_EXCLUSIVE);
+   ipage = (Page) BufferGetPage(ibuffer);
+
+   /*
+    * Loop through the deleted index items to obtain the TransactionId from
+    * the heap items they point to.
+    */
+   todelete = (OffsetNumber *) ((char *) xlrec + SizeOfGistxlogDelete);
+
+   for (i = 0; i < xlrec->ntodelete; i++)
+   {
+       /*
+        * Identify the index tuple about to be deleted
+        */
+       iitemid = PageGetItemId(ipage, todelete[i]);
+       itup = (IndexTuple) PageGetItem(ipage, iitemid);
+
+       /*
+        * Locate the heap page that the index tuple points at
+        */
+       hblkno = ItemPointerGetBlockNumber(&(itup->t_tid));
+       hbuffer = XLogReadBufferExtended(xlrec->hnode, MAIN_FORKNUM, hblkno, RBM_NORMAL);
+       if (!BufferIsValid(hbuffer))
+       {
+           UnlockReleaseBuffer(ibuffer);
+           return InvalidTransactionId;
+       }
+       LockBuffer(hbuffer, BUFFER_LOCK_SHARE);
+       hpage = (Page) BufferGetPage(hbuffer);
+
+       /*
+        * Look up the heap tuple header that the index tuple points at by
+        * using the heap node supplied with the xlrec. We can't use
+        * heap_fetch, since it uses ReadBuffer rather than XLogReadBuffer.
+        * Note that we are not looking at tuple data here, just headers.
+        */
+       hoffnum = ItemPointerGetOffsetNumber(&(itup->t_tid));
+       hitemid = PageGetItemId(hpage, hoffnum);
+
+       /*
+        * Follow any redirections until we find something useful.
+        */
+       while (ItemIdIsRedirected(hitemid))
+       {
+           hoffnum = ItemIdGetRedirect(hitemid);
+           hitemid = PageGetItemId(hpage, hoffnum);
+           CHECK_FOR_INTERRUPTS();
+       }
+
+       /*
+        * If the heap item has storage, then read the header and use that to
+        * set latestRemovedXid.
+        *
+        * Some LP_DEAD items may not be accessible, so we ignore them.
+        */
+       if (ItemIdHasStorage(hitemid))
+       {
+           htuphdr = (HeapTupleHeader) PageGetItem(hpage, hitemid);
+
+           HeapTupleHeaderAdvanceLatestRemovedXid(htuphdr, &latestRemovedXid);
+       }
+       else if (ItemIdIsDead(hitemid))
+       {
+           /*
+            * Conjecture: if hitemid is dead then it had xids before the xids
+            * marked on LP_NORMAL items. So we just ignore this item and move
+            * onto the next, for the purposes of calculating
+            * latestRemovedXid.
+            */
+       }
+       else
+           Assert(!ItemIdIsUsed(hitemid));
+
+       UnlockReleaseBuffer(hbuffer);
+   }
+
+   UnlockReleaseBuffer(ibuffer);
+
+   /*
+    * If all heap tuples were LP_DEAD then we will be returning
+    * InvalidTransactionId here, which avoids conflicts. This matches
+    * existing logic which assumes that LP_DEAD tuples must already be older
+    * than the latestRemovedXid on the cleanup record that set them as
+    * LP_DEAD, hence must already have generated a conflict.
+    */
+   return latestRemovedXid;
+}
+
+/*
+ * Redo delete on a GiST index page: remove tuples that were marked as DEAD
+ * during index tuple insertion (microvacuum).
+ */
+static void
+gistRedoDeleteRecord(XLogReaderState *record)
+{
+   XLogRecPtr  lsn = record->EndRecPtr;
+   gistxlogDelete *xldata = (gistxlogDelete *) XLogRecGetData(record);
+   Buffer      buffer;
+   Page        page;
+
+   /*
+    * If we have any conflict processing to do, it must happen before we
+    * update the page.
+    *
+    * GiST delete records can conflict with standby queries.  You might think
+    * that vacuum records would conflict as well, but we've handled that
+    * already.  XLOG_HEAP2_CLEANUP_INFO records provide the highest xid
+    * cleaned by the vacuum of the heap and so we can resolve any conflicts
+    * just once when that arrives.  After that we know that no conflicts
+    * exist from individual gist vacuum records on that index.
+    */
+   if (InHotStandby)
+   {
+       TransactionId latestRemovedXid = gistRedoDeleteRecordGetLatestRemovedXid(record);
+       RelFileNode rnode;
+
+       XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL);
+
+       ResolveRecoveryConflictWithSnapshot(latestRemovedXid, rnode);
+   }
+
+   if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
+   {
+       page = (Page) BufferGetPage(buffer);
+
+       if (XLogRecGetDataLen(record) > SizeOfGistxlogDelete)
+       {
+           OffsetNumber *todelete;
+
+           todelete = (OffsetNumber *) ((char *) xldata + SizeOfGistxlogDelete);
+
+           PageIndexMultiDelete(page, todelete, xldata->ntodelete);
+       }
+
+       GistClearPageHasGarbage(page);
+       GistMarkTuplesDeleted(page);
+
+       PageSetLSN(page, lsn);
+       MarkBufferDirty(buffer);
+   }
+
+   if (BufferIsValid(buffer))
+       UnlockReleaseBuffer(buffer);
+}
+
 /*
  * Returns an array of index pointers.
  */
@@ -318,6 +526,9 @@ gist_redo(XLogReaderState *record)
        case XLOG_GIST_PAGE_UPDATE:
            gistRedoPageUpdateRecord(record);
            break;
+       case XLOG_GIST_DELETE:
+           gistRedoDeleteRecord(record);
+           break;
        case XLOG_GIST_PAGE_SPLIT:
            gistRedoPageSplitRecord(record);
            break;
@@ -487,3 +698,35 @@ gistXLogUpdate(Buffer buffer,
 
    return recptr;
 }
+
+/*
+ * Write XLOG record describing a delete of leaf index tuples marked as DEAD
+ * during new tuple insertion.  One might think that this case is already covered
+ * by gistXLogUpdate().  But deletion of index tuples might conflict with
+ * standby queries and needs special handling.
+ */
+XLogRecPtr
+gistXLogDelete(Buffer buffer, OffsetNumber *todelete, int ntodelete,
+              RelFileNode hnode)
+{
+   gistxlogDelete xlrec;
+   XLogRecPtr  recptr;
+
+   xlrec.hnode = hnode;
+   xlrec.ntodelete = ntodelete;
+
+   XLogBeginInsert();
+   XLogRegisterData((char *) &xlrec, SizeOfGistxlogDelete);
+
+   /*
+    * We need the target-offsets array whether or not we store the whole
+    * buffer, to allow us to find the latestRemovedXid on a standby server.
+    */
+   XLogRegisterData((char *) todelete, ntodelete * sizeof(OffsetNumber));
+
+   XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
+
+   recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_DELETE);
+
+   return recptr;
+}
index e5e925e0c5a6bcf29201d9ff46fe35b5c7a07ee3..b79ed1dfdc85cb6de00a68e8256f67958191c067 100644 (file)
@@ -23,6 +23,11 @@ out_gistxlogPageUpdate(StringInfo buf, gistxlogPageUpdate *xlrec)
 {
 }
 
+static void
+out_gistxlogDelete(StringInfo buf, gistxlogPageUpdate *xlrec)
+{
+}
+
 static void
 out_gistxlogPageSplit(StringInfo buf, gistxlogPageSplit *xlrec)
 {
@@ -41,6 +46,9 @@ gist_desc(StringInfo buf, XLogReaderState *record)
        case XLOG_GIST_PAGE_UPDATE:
            out_gistxlogPageUpdate(buf, (gistxlogPageUpdate *) rec);
            break;
+       case XLOG_GIST_DELETE:
+           out_gistxlogDelete(buf, (gistxlogPageUpdate *) rec);
+           break;
        case XLOG_GIST_PAGE_SPLIT:
            out_gistxlogPageSplit(buf, (gistxlogPageSplit *) rec);
            break;
@@ -59,6 +67,9 @@ gist_identify(uint8 info)
        case XLOG_GIST_PAGE_UPDATE:
            id = "PAGE_UPDATE";
            break;
+       case XLOG_GIST_DELETE:
+           id = "DELETE";
+           break;
        case XLOG_GIST_PAGE_SPLIT:
            id = "PAGE_SPLIT";
            break;
index 36ed7244ba02c9c6b4f18d1b6ef6672b21e11947..a73716d6eaac41890938ab6ee28df73bcba7988a 100644 (file)
@@ -240,6 +240,7 @@ typedef struct GistSplitVector
 typedef struct
 {
    Relation    r;
+   Relation    heapRel;
    Size        freespace;      /* free space to be left */
 
    GISTInsertStack *stack;
@@ -389,7 +390,8 @@ extern void freeGISTstate(GISTSTATE *giststate);
 extern void gistdoinsert(Relation r,
             IndexTuple itup,
             Size freespace,
-            GISTSTATE *GISTstate);
+            GISTSTATE *GISTstate,
+            Relation heapRel);
 
 /* A List of these is returned from gistplacetopage() in *splitinfo */
 typedef struct
@@ -404,7 +406,8 @@ extern bool gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
                OffsetNumber oldoffnum, BlockNumber *newblkno,
                Buffer leftchildbuf,
                List **splitinfo,
-               bool markleftchild);
+               bool markleftchild,
+               Relation heapRel);
 
 extern SplitedPageLayout *gistSplit(Relation r, Page page, IndexTuple *itup,
          int len, GISTSTATE *giststate);
@@ -414,6 +417,9 @@ extern XLogRecPtr gistXLogUpdate(Buffer buffer,
               IndexTuple *itup, int ntup,
               Buffer leftchild);
 
+XLogRecPtr gistXLogDelete(Buffer buffer, OffsetNumber *todelete,
+              int ntodelete, RelFileNode hnode);
+
 extern XLogRecPtr gistXLogSplit(bool page_is_leaf,
              SplitedPageLayout *dist,
              BlockNumber origrlink, GistNSN oldnsn,
index 1a2b9496d0d87aff4c275a2a3331b687e5a353b2..b67c7100500d4ba6feccc911bad468a58c194a96 100644 (file)
@@ -18,6 +18,7 @@
 #include "lib/stringinfo.h"
 
 #define XLOG_GIST_PAGE_UPDATE      0x00
+#define XLOG_GIST_DELETE           0x10 /* delete leaf index tuples for a page */
  /* #define XLOG_GIST_NEW_ROOT          0x20 */    /* not used anymore */
 #define XLOG_GIST_PAGE_SPLIT       0x30
  /* #define XLOG_GIST_INSERT_COMPLETE   0x40 */    /* not used anymore */
@@ -40,6 +41,22 @@ typedef struct gistxlogPageUpdate
     */
 } gistxlogPageUpdate;
 
+/*
+ * Backup Blk 0: Leaf page whose index tuples are deleted.
+ */
+typedef struct gistxlogDelete
+{
+   RelFileNode hnode;          /* RelFileNode of the heap the index currently
+                                * points at */
+   uint16      ntodelete;      /* number of deleted offsets */
+
+   /*
+    * In payload of blk 0: todelete OffsetNumbers
+    */
+} gistxlogDelete;
+
+#define SizeOfGistxlogDelete   (offsetof(gistxlogDelete, ntodelete) + sizeof(uint16))
+
 /*
  * Backup Blk 0: If this operation completes a page split, by inserting a
  *              downlink for the split page, the left half of the split