Avoid WAL-logging individual tuple insertions during CREATE TABLE AS
author Tom Lane
Mon, 20 Jun 2005 18:37:02 +0000 (18:37 +0000)
committer Tom Lane
Mon, 20 Jun 2005 18:37:02 +0000 (18:37 +0000)
Avoid WAL-logging individual tuple insertions during CREATE TABLE AS
(a/k/a SELECT INTO).  Instead, flush and fsync the whole relation before
committing.  We do still need the WAL log when PITR is active, however.
Simon Riggs and Tom Lane.

src/backend/access/heap/heapam.c
src/backend/access/heap/hio.c
src/backend/executor/execMain.c
src/backend/executor/execUtils.c
src/backend/storage/smgr/md.c
src/backend/storage/smgr/smgr.c
src/include/access/heapam.h
src/include/access/hio.h
src/include/nodes/execnodes.h

index 74f76c1d16aeeef14506ca83d78b4b6a9a809874..843b2909ef27afdb0a1a93249c601fc20cfabb45 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.194 2005/06/08 15:50:21 tgl Exp $
+ *   $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.195 2005/06/20 18:37:01 tgl Exp $
  *
  *
  * INTERFACE ROUTINES
@@ -1034,9 +1034,20 @@ heap_get_latest_tid(Relation relation,
  *
  * The new tuple is stamped with current transaction ID and the specified
  * command ID.
+ *
+ * If use_wal is false, the new tuple is not logged in WAL, even for a
+ * non-temp relation.  Safe usage of this behavior requires that we arrange
+ * that all new tuples go into new pages not containing any tuples from other
+ * transactions, that the relation gets fsync'd before commit, and that the
+ * transaction emits at least one WAL record to ensure RecordTransactionCommit
+ * will decide to WAL-log the commit.
+ *
+ * use_fsm is passed directly to RelationGetBufferForTuple, which see for
+ * more info.
  */
 Oid
-heap_insert(Relation relation, HeapTuple tup, CommandId cid)
+heap_insert(Relation relation, HeapTuple tup, CommandId cid,
+           bool use_wal, bool use_fsm)
 {
    TransactionId xid = GetCurrentTransactionId();
    Buffer      buffer;
@@ -1086,7 +1097,8 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid)
        heap_tuple_toast_attrs(relation, tup, NULL);
 
    /* Find buffer to insert this tuple into */
-   buffer = RelationGetBufferForTuple(relation, tup->t_len, InvalidBuffer);
+   buffer = RelationGetBufferForTuple(relation, tup->t_len,
+                                      InvalidBuffer, use_fsm);
 
    /* NO EREPORT(ERROR) from here till changes are logged */
    START_CRIT_SECTION();
@@ -1096,7 +1108,12 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid)
    pgstat_count_heap_insert(&relation->pgstat_info);
 
    /* XLOG stuff */
-   if (!relation->rd_istemp)
+   if (relation->rd_istemp)
+   {
+       /* No XLOG record, but still need to flag that XID exists on disk */
+       MyXactMadeTempRelUpdate = true;
+   }
+   else if (use_wal)
    {
        xl_heap_insert xlrec;
        xl_heap_header xlhdr;
@@ -1151,11 +1168,6 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid)
        PageSetLSN(page, recptr);
        PageSetTLI(page, ThisTimeLineID);
    }
-   else
-   {
-       /* No XLOG record, but still need to flag that XID exists on disk */
-       MyXactMadeTempRelUpdate = true;
-   }
 
    END_CRIT_SECTION();
 
@@ -1183,7 +1195,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid)
 Oid
 simple_heap_insert(Relation relation, HeapTuple tup)
 {
-   return heap_insert(relation, tup, GetCurrentCommandId());
+   return heap_insert(relation, tup, GetCurrentCommandId(), true, true);
 }
 
 /*
@@ -1743,7 +1755,7 @@ l2:
        {
            /* Assume there's no chance to put newtup on same page. */
            newbuf = RelationGetBufferForTuple(relation, newtup->t_len,
-                                              buffer);
+                                              buffer, true);
        }
        else
        {
@@ -1760,7 +1772,7 @@ l2:
                 */
                LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
                newbuf = RelationGetBufferForTuple(relation, newtup->t_len,
-                                                  buffer);
+                                                  buffer, true);
            }
            else
            {
index 583bb209336c47e5b3cee00a05abbdf156e9771f..fc1b0afd21e8b5fa57913d5acc22349a5357924a 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/access/heap/hio.c,v 1.56 2005/05/07 21:32:23 tgl Exp $
+ *   $PostgreSQL: pgsql/src/backend/access/heap/hio.c,v 1.57 2005/06/20 18:37:01 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -79,12 +79,26 @@ RelationPutHeapTuple(Relation relation,
  * happen if space is freed in that page after heap_update finds there's not
  * enough there).  In that case, the page will be pinned and locked only once.
  *
+ * If use_fsm is true (the normal case), we use FSM to help us find free
+ * space.  If use_fsm is false, we always append a new empty page to the
+ * end of the relation if the tuple won't fit on the current target page.
+ * This can save some cycles when we know the relation is new and doesn't
+ * contain useful amounts of free space.
+ *
+ * The use_fsm = false case is also useful for non-WAL-logged additions to a
+ * relation, if the caller holds exclusive lock and is careful to invalidate
+ * relation->rd_targblock before the first insertion --- that ensures that
+ * all insertions will occur into newly added pages and not be intermixed
+ * with tuples from other transactions.  That way, a crash can't risk losing
+ * any committed data of other transactions.  (See heap_insert's comments
+ * for additional constraints needed for safe usage of this behavior.)
+ *
  * ereport(ERROR) is allowed here, so this routine *must* be called
  * before any (unlogged) changes are made in buffer pool.
  */
 Buffer
 RelationGetBufferForTuple(Relation relation, Size len,
-                         Buffer otherBuffer)
+                         Buffer otherBuffer, bool use_fsm)
 {
    Buffer      buffer = InvalidBuffer;
    Page        pageHeader;
@@ -121,11 +135,14 @@ RelationGetBufferForTuple(Relation relation, Size len,
     * on each page that proves not to be suitable.)  If the FSM has no
     * record of a page with enough free space, we give up and extend the
     * relation.
+    *
+    * When use_fsm is false, we either put the tuple onto the existing
+    * target page or extend the relation.
     */
 
    targetBlock = relation->rd_targblock;
 
-   if (targetBlock == InvalidBlockNumber)
+   if (targetBlock == InvalidBlockNumber && use_fsm)
    {
        /*
         * We have no cached target page, so ask the FSM for an initial
@@ -209,6 +226,10 @@ RelationGetBufferForTuple(Relation relation, Size len,
            ReleaseBuffer(buffer);
        }
 
+       /* Without FSM, always fall out of the loop and extend */
+       if (!use_fsm)
+           break;
+
        /*
         * Update FSM as to condition of this page, and ask for another
         * page to try.
index a390829bb8ef749dbb5882ce3b88ceb3d0fa202b..938474610ae80df3fcb45d2ac22118d6a2248481 100644 (file)
  *
  *
  * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/executor/execMain.c,v 1.249 2005/05/22 22:30:19 tgl Exp $
+ *   $PostgreSQL: pgsql/src/backend/executor/execMain.c,v 1.250 2005/06/20 18:37:01 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
 #include "postgres.h"
 
 #include "access/heapam.h"
+#include "access/xlog.h"
 #include "catalog/heap.h"
 #include "catalog/namespace.h"
 #include "commands/tablecmds.h"
@@ -44,6 +45,7 @@
 #include "optimizer/clauses.h"
 #include "optimizer/var.h"
 #include "parser/parsetree.h"
+#include "storage/smgr.h"
 #include "utils/acl.h"
 #include "utils/guc.h"
 #include "utils/lsyscache.h"
@@ -784,6 +786,20 @@ InitPlan(QueryDesc *queryDesc, bool explainOnly)
         * And open the constructed table for writing.
         */
        intoRelationDesc = heap_open(intoRelationId, AccessExclusiveLock);
+
+       /* use_wal off requires rd_targblock be initially invalid */
+       Assert(intoRelationDesc->rd_targblock == InvalidBlockNumber);
+
+       /*
+        * We can skip WAL-logging the insertions, unless PITR is in use.
+        *
+        * Note that for a non-temp INTO table, this is safe only because
+        * we know that the catalog changes above will have been WAL-logged,
+        * and so RecordTransactionCommit will think it needs to WAL-log the
+        * eventual transaction commit.  Else the commit might be lost, even
+        * though all the data is safely fsync'd ...
+        */
+       estate->es_into_relation_use_wal = XLogArchivingActive();
    }
 
    estate->es_into_relation_descriptor = intoRelationDesc;
@@ -979,7 +995,22 @@ ExecEndPlan(PlanState *planstate, EState *estate)
     * close the "into" relation if necessary, again keeping lock
     */
    if (estate->es_into_relation_descriptor != NULL)
+   {
+       /*
+        * If we skipped using WAL, and it's not a temp relation,
+        * we must force the relation down to disk before it's
+        * safe to commit the transaction.  This requires forcing
+        * out any dirty buffers and then doing a forced fsync.
+        */
+       if (!estate->es_into_relation_use_wal &&
+           !estate->es_into_relation_descriptor->rd_istemp)
+       {
+           FlushRelationBuffers(estate->es_into_relation_descriptor);
+           smgrimmedsync(estate->es_into_relation_descriptor->rd_smgr);
+       }
+
        heap_close(estate->es_into_relation_descriptor, NoLock);
+   }
 
    /*
     * close any relations selected FOR UPDATE/FOR SHARE, again keeping locks
@@ -1307,7 +1338,9 @@ ExecSelect(TupleTableSlot *slot,
 
        tuple = ExecCopySlotTuple(slot);
        heap_insert(estate->es_into_relation_descriptor, tuple,
-                   estate->es_snapshot->curcid);
+                   estate->es_snapshot->curcid,
+                   estate->es_into_relation_use_wal,
+                   false);     /* never any point in using FSM */
        /* we know there are no indexes to update */
        heap_freetuple(tuple);
        IncrAppended();
@@ -1386,7 +1419,8 @@ ExecInsert(TupleTableSlot *slot,
     * insert the tuple
     */
    newId = heap_insert(resultRelationDesc, tuple,
-                       estate->es_snapshot->curcid);
+                       estate->es_snapshot->curcid,
+                       true, true);
 
    IncrAppended();
    (estate->es_processed)++;
@@ -2089,6 +2123,7 @@ EvalPlanQualStart(evalPlanQual *epq, EState *estate, evalPlanQual *priorepq)
    epqstate->es_result_relation_info = estate->es_result_relation_info;
    epqstate->es_junkFilter = estate->es_junkFilter;
    epqstate->es_into_relation_descriptor = estate->es_into_relation_descriptor;
+   epqstate->es_into_relation_use_wal = estate->es_into_relation_use_wal;
    epqstate->es_param_list_info = estate->es_param_list_info;
    if (estate->es_topPlan->nParamExec > 0)
        epqstate->es_param_exec_vals = (ParamExecData *)
index 133bf57bca23724dc5c1351c45dabbe8cf151b9a..8eaff494e3ea1d8e738aabc93adee353a376e1d9 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/executor/execUtils.c,v 1.123 2005/04/28 21:47:12 tgl Exp $
+ *   $PostgreSQL: pgsql/src/backend/executor/execUtils.c,v 1.124 2005/06/20 18:37:01 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -186,7 +186,9 @@ CreateExecutorState(void)
    estate->es_result_relation_info = NULL;
 
    estate->es_junkFilter = NULL;
+
    estate->es_into_relation_descriptor = NULL;
+   estate->es_into_relation_use_wal = false;
 
    estate->es_param_list_info = NULL;
    estate->es_param_exec_vals = NULL;
index 1c0cb7e240b9b61a65cfef399a9251da056bfd9f..fa7913aff74bd5db0e3559e008200a9abf1cd874 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.115 2005/05/29 04:23:05 tgl Exp $
+ *   $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.116 2005/06/20 18:37:01 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -660,6 +660,9 @@ mdtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
 
 /*
  * mdimmedsync() -- Immediately sync a relation to stable storage.
+ *
+ * Note that only writes already issued are synced; this routine knows
+ * nothing of dirty buffers that may exist inside the buffer manager.
  */
 bool
 mdimmedsync(SMgrRelation reln)
index 2c8cf07eec83993d063ad7fffb6b3ca90c3053ce..f286b20ee2533fdb4765504f0524ab2d652788a5 100644 (file)
@@ -11,7 +11,7 @@
  *
  *
  * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.90 2005/06/17 22:32:46 tgl Exp $
+ *   $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.91 2005/06/20 18:37:01 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -650,7 +650,8 @@ smgrtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
 /*
  * smgrimmedsync() -- Force the specified relation to stable storage.
  *
- *     Synchronously force all of the specified relation down to disk.
+ *     Synchronously force all previous writes to the specified relation
+ *     down to disk.
  *
  *     This is useful for building completely new relations (eg, new
  *     indexes).  Instead of incrementally WAL-logging the index build
@@ -664,6 +665,10 @@ smgrtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
  *
  *     The preceding writes should specify isTemp = true to avoid
  *     duplicative fsyncs.
+ *
+ *     Note that you need to do FlushRelationBuffers() first if there is
+ *     any possibility that there are dirty buffers for the relation;
+ *     otherwise the sync is not very meaningful.
  */
 void
 smgrimmedsync(SMgrRelation reln)
index 151a62f9b68b02f9c67878a3c9f68b8561ba257e..dde6fe8ecd89e23e362d1cca54c278196038543f 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/heapam.h,v 1.101 2005/06/06 17:01:24 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/access/heapam.h,v 1.102 2005/06/20 18:37:01 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -156,7 +156,8 @@ extern ItemPointer heap_get_latest_tid(Relation relation, Snapshot snapshot,
                    ItemPointer tid);
 extern void setLastTid(const ItemPointer tid);
 
-extern Oid heap_insert(Relation relation, HeapTuple tup, CommandId cid);
+extern Oid heap_insert(Relation relation, HeapTuple tup, CommandId cid,
+                       bool use_wal, bool use_fsm);
 extern HTSU_Result heap_delete(Relation relation, ItemPointer tid, ItemPointer ctid,
            CommandId cid, Snapshot crosscheck, bool wait);
 extern HTSU_Result heap_update(Relation relation, ItemPointer otid, HeapTuple tup,
index 49091eb202fae1f16ecb1ed5be1dd1f766780a67..e706fea4ca136454753b67a6d72e3816074a0406 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/hio.h,v 1.27 2004/12/31 22:03:21 pgsql Exp $
+ * $PostgreSQL: pgsql/src/include/access/hio.h,v 1.28 2005/06/20 18:37:01 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -19,6 +19,6 @@
 extern void RelationPutHeapTuple(Relation relation, Buffer buffer,
                     HeapTuple tuple);
 extern Buffer RelationGetBufferForTuple(Relation relation, Size len,
-                         Buffer otherBuffer);
+                                       Buffer otherBuffer, bool use_fsm);
 
 #endif   /* HIO_H */
index 19f264119c3c0b3814ae4db2f238103f0304676a..df41c8561084b511a237a5ab134c443c438b3eb7 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.134 2005/06/15 07:27:44 neilc Exp $
+ * $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.135 2005/06/20 18:37:02 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -304,7 +304,9 @@ typedef struct EState
    ResultRelInfo *es_result_relation_info;     /* currently active array
                                                 * elt */
    JunkFilter *es_junkFilter;  /* currently active junk filter */
+
    Relation    es_into_relation_descriptor;    /* for SELECT INTO */
+   bool        es_into_relation_use_wal;
 
    /* Parameter info: */
    ParamListInfo es_param_list_info;   /* values of external params */