Create a new dedicated Postgres process, "wal writer", which exists to write
authorTom Lane
Tue, 24 Jul 2007 04:54:09 +0000 (04:54 +0000)
committerTom Lane
Tue, 24 Jul 2007 04:54:09 +0000 (04:54 +0000)
and fsync WAL at convenient intervals.  For the moment it just tries to
offload this work from backends, but soon it will be responsible for
guaranteeing a maximum delay before asynchronously-committed transactions
will be flushed to disk.

This is a portion of Simon Riggs' async-commit patch, committed to CVS
separately because a background WAL writer seems like it might be a good idea
independently of the async-commit feature.  I rebased walwriter.c on
bgwriter.c because it seemed like a more appropriate way of handling signals;
while the startup/shutdown logic in postmaster.c is more like autovac because
we want walwriter to quit before we start the shutdown checkpoint.

doc/src/sgml/config.sgml
src/backend/access/transam/xlog.c
src/backend/bootstrap/bootstrap.c
src/backend/postmaster/Makefile
src/backend/postmaster/postmaster.c
src/backend/postmaster/walwriter.c [new file with mode: 0644]
src/backend/utils/misc/guc.c
src/backend/utils/misc/postgresql.conf.sample
src/include/access/xlog.h
src/include/bootstrap/bootstrap.h
src/include/postmaster/walwriter.h [new file with mode: 0644]

index a3331bdef6eb1fd686e076af8265d64ff6e40ab9..0e49ba321780d07a3e30d9f5260be1810d2472cd 100644 (file)
@@ -1,4 +1,4 @@
-
+
 
 
   Server Configuration
@@ -1413,7 +1413,7 @@ SET ENABLE_SEQSCAN TO OFF;
        
       
      
-     
+
      
       wal_buffers (integer)
       
@@ -1438,7 +1438,27 @@ SET ENABLE_SEQSCAN TO OFF;
        
       
      
-                
+
+     
+      wal_writer_delay (integer)
+      
+       wal_writer_delay configuration parameter
+      
+      
+       
+        Specifies the delay between activity rounds for the WAL writer.
+        In each round the writer will flush WAL to disk. It then sleeps for
+        wal_writer_delay milliseconds, and repeats.  The default
+        value is 200 milliseconds (200ms).  Note that on many
+        systems, the effective resolution of sleep delays is 10 milliseconds;
+        setting wal_writer_delay to a value that is not a multiple
+        of 10 might have the same results as setting it to the next higher
+        multiple of 10. This parameter can only be set in the
+        postgresql.conf file or on the server command line.
+       
+      
+     
+
      
       commit_delay (integer)
       
@@ -1521,7 +1541,7 @@ SET ENABLE_SEQSCAN TO OFF;
       
       
        
-        Specifies the target length of checkpoints, as a fraction of 
+        Specifies the target length of checkpoints, as a fraction of
         the checkpoint interval. The default is 0.5.
 
         This parameter can only be set in the postgresql.conf
index 15c9f310a63a57611f983d1b25319b9b9c5eaf2f..25789ddaa68bfb752753d0c7ab6863fdd7c5f148 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.274 2007/06/30 19:12:01 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.275 2007/07/24 04:54:08 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -484,7 +484,6 @@ XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
    uint32      len,
                write_len;
    unsigned    i;
-   XLogwrtRqst LogwrtRqst;
    bool        updrqst;
    bool        doPageWrites;
    bool        isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
@@ -643,43 +642,6 @@ begin:;
 
    START_CRIT_SECTION();
 
-   /* update LogwrtResult before doing cache fill check */
-   {
-       /* use volatile pointer to prevent code rearrangement */
-       volatile XLogCtlData *xlogctl = XLogCtl;
-
-       SpinLockAcquire(&xlogctl->info_lck);
-       LogwrtRqst = xlogctl->LogwrtRqst;
-       LogwrtResult = xlogctl->LogwrtResult;
-       SpinLockRelease(&xlogctl->info_lck);
-   }
-
-   /*
-    * If cache is half filled then try to acquire write lock and do
-    * XLogWrite. Ignore any fractional blocks in performing this check.
-    */
-   LogwrtRqst.Write.xrecoff -= LogwrtRqst.Write.xrecoff % XLOG_BLCKSZ;
-   if (LogwrtRqst.Write.xlogid != LogwrtResult.Write.xlogid ||
-       (LogwrtRqst.Write.xrecoff >= LogwrtResult.Write.xrecoff +
-        XLogCtl->XLogCacheByte / 2))
-   {
-       if (LWLockConditionalAcquire(WALWriteLock, LW_EXCLUSIVE))
-       {
-           /*
-            * Since the amount of data we write here is completely optional
-            * anyway, tell XLogWrite it can be "flexible" and stop at a
-            * convenient boundary.  This allows writes triggered by this
-            * mechanism to synchronize with the cache boundaries, so that in
-            * a long transaction we'll basically dump alternating halves of
-            * the buffer array.
-            */
-           LogwrtResult = XLogCtl->Write.LogwrtResult;
-           if (XLByteLT(LogwrtResult.Write, LogwrtRqst.Write))
-               XLogWrite(LogwrtRqst, true, false);
-           LWLockRelease(WALWriteLock);
-       }
-   }
-
    /* Now wait to get insert lock */
    LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
 
@@ -1800,6 +1762,85 @@ XLogFlush(XLogRecPtr record)
             LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
 }
 
+/*
+ * Flush xlog, but without specifying exactly where to flush to.
+ *
+ * We normally flush only completed blocks; but if there is nothing to do on
+ * that basis, we check for unflushed async commits in the current incomplete
+ * block, and flush through the latest one of those.  Thus, if async commits
+ * are not being used, we will flush complete blocks only.  We can guarantee
+ * that async commits reach disk after at most three cycles; normally only
+ * one or two.  (We allow XLogWrite to write "flexibly", meaning it can stop
+ * at the end of the buffer ring; this makes a difference only with very high
+ * load or long wal_writer_delay, but imposes one extra cycle for the worst
+ * case for async commits.)
+ *
+ * This routine is invoked periodically by the background walwriter process.
+ */
+void
+XLogBackgroundFlush(void)
+{
+   XLogRecPtr  WriteRqstPtr;
+   bool        flexible = true;
+
+   /* copy shared LogwrtResult and the requested write pointer under info_lck */
+   {
+       /* use volatile pointer to prevent code rearrangement */
+       volatile XLogCtlData *xlogctl = XLogCtl;
+
+       SpinLockAcquire(&xlogctl->info_lck);
+       LogwrtResult = xlogctl->LogwrtResult;
+       WriteRqstPtr = xlogctl->LogwrtRqst.Write;
+       SpinLockRelease(&xlogctl->info_lck);
+   }
+
+   /* back off to last completed page boundary */
+   WriteRqstPtr.xrecoff -= WriteRqstPtr.xrecoff % XLOG_BLCKSZ;
+
+#ifdef NOT_YET                 /* async commit patch is still to come */
+   /* if we have already flushed that far, consider async commit records */
+   if (XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
+   {
+       /* use volatile pointer to prevent code rearrangement */
+       volatile XLogCtlData *xlogctl = XLogCtl;
+
+       SpinLockAcquire(&xlogctl->async_commit_lck);
+       WriteRqstPtr = xlogctl->asyncCommitLSN;
+       SpinLockRelease(&xlogctl->async_commit_lck);
+       flexible = false;       /* ensure it all gets written */
+   }
+#endif
+
+   /* Done if already known flushed */
+   if (XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
+       return;
+
+#ifdef WAL_DEBUG
+   if (XLOG_DEBUG)
+       elog(LOG, "xlog bg flush request %X/%X; write %X/%X; flush %X/%X",
+            WriteRqstPtr.xlogid, WriteRqstPtr.xrecoff,
+            LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
+            LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
+#endif
+
+   START_CRIT_SECTION();
+
+   /* wait for the write lock, then repeat the test on a fresh LogwrtResult */
+   LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
+   LogwrtResult = XLogCtl->Write.LogwrtResult;
+   if (!XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
+   {
+       XLogwrtRqst WriteRqst;
+
+       WriteRqst.Write = WriteRqstPtr;
+       WriteRqst.Flush = WriteRqstPtr;
+       XLogWrite(WriteRqst, flexible, false);
+   }
+   LWLockRelease(WALWriteLock);
+
+   END_CRIT_SECTION();
+}
+
 /*
  * Test whether XLOG data has been flushed up to (at least) the given position.
  *
index 78eb6797db43883893f5417f0cd158fb704bdd16..3ffff2a2cce3fe0ee2bb50fd079a758bcf2b747b 100644 (file)
@@ -8,7 +8,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/bootstrap/bootstrap.c,v 1.234 2007/06/28 00:02:37 tgl Exp $
+ *   $PostgreSQL: pgsql/src/backend/bootstrap/bootstrap.c,v 1.235 2007/07/24 04:54:09 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -30,6 +30,7 @@
 #include "miscadmin.h"
 #include "nodes/makefuncs.h"
 #include "postmaster/bgwriter.h"
+#include "postmaster/walwriter.h"
 #include "storage/freespace.h"
 #include "storage/ipc.h"
 #include "storage/proc.h"
@@ -195,7 +196,7 @@ static IndexList *ILHead = NULL;
  *  AuxiliaryProcessMain
  *
  *  The main entry point for auxiliary processes, such as the bgwriter,
- *  bootstrapper and the shared memory checker code.
+ *  walwriter, bootstrapper and the shared memory checker code.
  *
  *  This code is here just because of historical reasons.
  */
@@ -331,6 +332,9 @@ AuxiliaryProcessMain(int argc, char *argv[])
            case BgWriterProcess:
                statmsg = "writer process";
                break;
+           case WalWriterProcess:
+               statmsg = "wal writer process";
+               break;
            default:
                statmsg = "??? process";
                break;
@@ -419,6 +423,12 @@ AuxiliaryProcessMain(int argc, char *argv[])
            InitXLOGAccess();
            BackgroundWriterMain();
            proc_exit(1);       /* should never return */
+
+       case WalWriterProcess:
+           /* don't set signals, walwriter has its own agenda */
+           InitXLOGAccess();
+           WalWriterMain();
+           proc_exit(1);       /* should never return */
            
        default:
            elog(PANIC, "unrecognized process type: %d", auxType);
index a49e0e393bfc6ffcb575697351589a2051e845d3..7ccba285f2149e173921e7e3a6e9debf1074ad7b 100644 (file)
@@ -4,7 +4,7 @@
 #    Makefile for src/backend/postmaster
 #
 # IDENTIFICATION
-#    $PostgreSQL: pgsql/src/backend/postmaster/Makefile,v 1.22 2007/01/20 17:16:12 petere Exp $
+#    $PostgreSQL: pgsql/src/backend/postmaster/Makefile,v 1.23 2007/07/24 04:54:09 tgl Exp $
 #
 #-------------------------------------------------------------------------
 
@@ -12,8 +12,8 @@ subdir = src/backend/postmaster
 top_builddir = ../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = bgwriter.o autovacuum.o pgarch.o pgstat.o postmaster.o syslogger.o \
-   fork_process.o
+OBJS = autovacuum.o bgwriter.o fork_process.o pgarch.o pgstat.o postmaster.o \
+   syslogger.o walwriter.o
 
 all: SUBSYS.o
 
index 7a1270b0149eb0b10e7b518d252b430b66463c68..f1f9effae77085b7ec2c23aa25e0d570e3a350ac 100644 (file)
@@ -37,7 +37,7 @@
  *
  *
  * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/postmaster/postmaster.c,v 1.534 2007/07/23 10:16:54 mha Exp $
+ *   $PostgreSQL: pgsql/src/backend/postmaster/postmaster.c,v 1.535 2007/07/24 04:54:09 tgl Exp $
  *
  * NOTES
  *
@@ -136,7 +136,7 @@ typedef struct bkend
 {
    pid_t       pid;            /* process id of backend */
    long        cancel_key;     /* cancel key for cancels for this backend */
-   bool        is_autovacuum;  /* is it an autovacuum process */
+   bool        is_autovacuum;  /* is it an autovacuum process? */
 } Backend;
 
 static Dllist *BackendList;
@@ -144,9 +144,9 @@ static Dllist *BackendList;
 #ifdef EXEC_BACKEND
 /*
  * Number of entries in the backend table. Twice the number of backends,
- * plus four other subprocesses (stats, bgwriter, autovac, logger).
+ * plus five other subprocesses (stats, bgwriter, walwriter, autovac, logger).
  */
-#define NUM_BACKENDARRAY_ELEMS (2*MaxBackends + 4)
+#define NUM_BACKENDARRAY_ELEMS (2*MaxBackends + 5)
 static Backend *ShmemBackendArray;
 #endif
 
@@ -201,6 +201,7 @@ char       *bonjour_name;
 /* PIDs of special child processes; 0 when not running */
 static pid_t StartupPID = 0,
            BgWriterPID = 0,
+           WalWriterPID = 0,
            AutoVacPID = 0,
            PgArchPID = 0,
            PgStatPID = 0,
@@ -221,7 +222,7 @@ bool        ClientAuthInProgress = false;       /* T during new-client
 bool redirection_done = false; 
 
 /* received START_AUTOVAC_LAUNCHER signal */
-static bool start_autovac_launcher = false;
+static volatile sig_atomic_t start_autovac_launcher = false;
 
 /*
  * State for assigning random salts and cancel keys.
@@ -365,6 +366,7 @@ static void ShmemBackendArrayRemove(pid_t pid);
 
 #define StartupDataBase()      StartChildProcess(StartupProcess)
 #define StartBackgroundWriter() StartChildProcess(BgWriterProcess)
+#define StartWalWriter()       StartChildProcess(WalWriterProcess)
 
 /* Macros to check exit status of a child process */
 #define EXIT_STATUS_0(st)  ((st) == 0)
@@ -909,8 +911,9 @@ PostmasterMain(int argc, char *argv[])
     *
     * CAUTION: when changing this list, check for side-effects on the signal
     * handling setup of child processes.  See tcop/postgres.c,
-    * bootstrap/bootstrap.c, postmaster/bgwriter.c, postmaster/autovacuum.c,
-    * postmaster/pgarch.c, postmaster/pgstat.c, and postmaster/syslogger.c.
+    * bootstrap/bootstrap.c, postmaster/bgwriter.c, postmaster/walwriter.c,
+    * postmaster/autovacuum.c, postmaster/pgarch.c, postmaster/pgstat.c, and
+    * postmaster/syslogger.c.
     */
    pqinitmask();
    PG_SETMASK(&BlockSig);
@@ -1244,6 +1247,15 @@ ServerLoop(void)
                signal_child(BgWriterPID, SIGUSR2);
        }
 
+       /*
+        * Likewise, if we have lost the walwriter process, try to start a
+        * new one.  We don't need walwriter to complete a shutdown, so
+        * don't start it if shutdown already initiated.
+        */
+       if (WalWriterPID == 0 &&
+           StartupPID == 0 && !FatalError && Shutdown == NoShutdown)
+           WalWriterPID = StartWalWriter();
+
        /* If we have lost the autovacuum launcher, try to start a new one */
        if (AutoVacPID == 0 &&
            (AutoVacuumingActive() || start_autovac_launcher) &&
@@ -1251,7 +1263,7 @@ ServerLoop(void)
        {
            AutoVacPID = StartAutoVacLauncher();
            if (AutoVacPID != 0)
-               start_autovac_launcher = false; /* signal successfully processed */
+               start_autovac_launcher = false; /* signal processed */
        }
 
        /* If we have lost the archiver, try to start a new one */
@@ -1842,6 +1854,8 @@ SIGHUP_handler(SIGNAL_ARGS)
        SignalChildren(SIGHUP);
        if (BgWriterPID != 0)
            signal_child(BgWriterPID, SIGHUP);
+       if (WalWriterPID != 0)
+           signal_child(WalWriterPID, SIGHUP);
        if (AutoVacPID != 0)
            signal_child(AutoVacPID, SIGHUP);
        if (PgArchPID != 0)
@@ -1901,8 +1915,11 @@ pmdie(SIGNAL_ARGS)
            /* and the autovac launcher too */
            if (AutoVacPID != 0)
                signal_child(AutoVacPID, SIGTERM);
+           /* and the walwriter too */
+           if (WalWriterPID != 0)
+               signal_child(WalWriterPID, SIGTERM);
 
-           if (DLGetHead(BackendList) || AutoVacPID != 0)
+           if (DLGetHead(BackendList) || AutoVacPID != 0 || WalWriterPID != 0)
                break;          /* let reaper() handle this */
 
            /*
@@ -1938,7 +1955,7 @@ pmdie(SIGNAL_ARGS)
            ereport(LOG,
                    (errmsg("received fast shutdown request")));
 
-           if (DLGetHead(BackendList) || AutoVacPID != 0)
+           if (DLGetHead(BackendList) || AutoVacPID != 0 || WalWriterPID != 0)
            {
                if (!FatalError)
                {
@@ -1947,6 +1964,8 @@ pmdie(SIGNAL_ARGS)
                    SignalChildren(SIGTERM);
                    if (AutoVacPID != 0)
                        signal_child(AutoVacPID, SIGTERM);
+                   if (WalWriterPID != 0)
+                       signal_child(WalWriterPID, SIGTERM);
                    /* reaper() does the rest */
                }
                break;
@@ -1957,6 +1976,7 @@ pmdie(SIGNAL_ARGS)
             *
             * Note: if we previously got SIGTERM then we may send SIGUSR2 to
             * the bgwriter a second time here.  This should be harmless.
+            * Ditto for the signals to the other special children.
             */
            if (StartupPID != 0)
            {
@@ -1993,6 +2013,8 @@ pmdie(SIGNAL_ARGS)
                signal_child(StartupPID, SIGQUIT);
            if (BgWriterPID != 0)
                signal_child(BgWriterPID, SIGQUIT);
+           if (WalWriterPID != 0)
+               signal_child(WalWriterPID, SIGQUIT);
            if (AutoVacPID != 0)
                signal_child(AutoVacPID, SIGQUIT);
            if (PgArchPID != 0)
@@ -2091,13 +2113,14 @@ reaper(SIGNAL_ARGS)
 
            /*
             * Go to shutdown mode if a shutdown request was pending.
-            * Otherwise, try to start the archiver, stats collector and
-            * autovacuum launcher.
+            * Otherwise, try to start the other special children.
             */
            if (Shutdown > NoShutdown && BgWriterPID != 0)
                signal_child(BgWriterPID, SIGUSR2);
            else if (Shutdown == NoShutdown)
            {
+               if (WalWriterPID == 0)
+                   WalWriterPID = StartWalWriter();
                if (XLogArchivingActive() && PgArchPID == 0)
                    PgArchPID = pgarch_start();
                if (PgStatPID == 0)
@@ -2121,7 +2144,8 @@ reaper(SIGNAL_ARGS)
            BgWriterPID = 0;
            if (EXIT_STATUS_0(exitstatus) &&
                Shutdown > NoShutdown && !FatalError &&
-               !DLGetHead(BackendList) && AutoVacPID == 0)
+               !DLGetHead(BackendList) &&
+               WalWriterPID == 0 && AutoVacPID == 0)
            {
                /*
                 * Normal postmaster exit is here: we've seen normal exit of
@@ -2150,7 +2174,8 @@ reaper(SIGNAL_ARGS)
             * required will happen on next postmaster start.
             */
            if (Shutdown > NoShutdown &&
-               !DLGetHead(BackendList) && AutoVacPID == 0)
+               !DLGetHead(BackendList) &&
+               WalWriterPID == 0 && AutoVacPID == 0)
            {
                ereport(LOG,
                        (errmsg("abnormal database system shutdown")));
@@ -2161,6 +2186,20 @@ reaper(SIGNAL_ARGS)
            continue;
        }
 
+       /*
+        * Was it the wal writer?  Normal exit can be ignored; we'll
+        * start a new one at the next iteration of the postmaster's main loop,
+        * if necessary.  Any other exit condition is treated as a crash.
+        */
+       if (WalWriterPID != 0 && pid == WalWriterPID)
+       {
+           WalWriterPID = 0;
+           if (!EXIT_STATUS_0(exitstatus))
+               HandleChildCrash(pid, exitstatus,
+                                _("wal writer process"));
+           continue;
+       }
+
        /*
         * Was it the autovacuum launcher?  Normal exit can be ignored; we'll
         * start a new one at the next iteration of the postmaster's main loop,
@@ -2233,7 +2272,8 @@ reaper(SIGNAL_ARGS)
         * StartupDataBase.  (We can ignore the archiver and stats processes
         * here since they are not connected to shmem.)
         */
-       if (DLGetHead(BackendList) || StartupPID != 0 || BgWriterPID != 0 ||
+       if (DLGetHead(BackendList) || StartupPID != 0 ||
+           BgWriterPID != 0 || WalWriterPID != 0 ||
            AutoVacPID != 0)
            goto reaper_done;
        ereport(LOG,
@@ -2249,7 +2289,8 @@ reaper(SIGNAL_ARGS)
 
    if (Shutdown > NoShutdown)
    {
-       if (DLGetHead(BackendList) || StartupPID != 0 || AutoVacPID != 0)
+       if (DLGetHead(BackendList) || StartupPID != 0 || AutoVacPID != 0 ||
+           WalWriterPID != 0)
            goto reaper_done;
        /* Start the bgwriter if not running */
        if (BgWriterPID == 0)
@@ -2315,7 +2356,8 @@ CleanupBackend(int pid,
 }
 
 /*
- * HandleChildCrash -- cleanup after failed backend, bgwriter, or autovacuum.
+ * HandleChildCrash -- cleanup after failed backend, bgwriter, walwriter,
+ * or autovacuum.
  *
  * The objectives here are to clean up our local state about the child
  * process, and to signal all other remaining children to quickdie.
@@ -2390,6 +2432,18 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
        signal_child(BgWriterPID, (SendStop ? SIGSTOP : SIGQUIT));
    }
 
+   /* Take care of the walwriter too */
+   if (pid == WalWriterPID)
+       WalWriterPID = 0;
+   else if (WalWriterPID != 0 && !FatalError)
+   {
+       ereport(DEBUG2,
+               (errmsg_internal("sending %s to process %d",
+                                (SendStop ? "SIGSTOP" : "SIGQUIT"),
+                                (int) WalWriterPID)));
+       signal_child(WalWriterPID, (SendStop ? SIGSTOP : SIGQUIT));
+   }
+
    /* Take care of the autovacuum launcher too */
    if (pid == AutoVacPID)
        AutoVacPID = 0;
@@ -3622,9 +3676,11 @@ sigusr1_handler(SIGNAL_ARGS)
        start_autovac_launcher = true;
    }
 
-   /* The autovacuum launcher wants us to start a worker process. */
    if (CheckPostmasterSignal(PMSIGNAL_START_AUTOVAC_WORKER))
+   {
+       /* The autovacuum launcher wants us to start a worker process. */
        StartAutovacuumWorker();
+   }
 
    PG_SETMASK(&UnBlockSig);
 
@@ -3805,6 +3861,10 @@ StartChildProcess(AuxProcType type)
                ereport(LOG,
                   (errmsg("could not fork background writer process: %m")));
                break;
+           case WalWriterProcess:
+               ereport(LOG,
+                  (errmsg("could not fork wal writer process: %m")));
+               break;
            default:
                ereport(LOG,
                        (errmsg("could not fork process: %m")));
diff --git a/src/backend/postmaster/walwriter.c b/src/backend/postmaster/walwriter.c
new file mode 100644 (file)
index 0000000..b4d5946
--- /dev/null
@@ -0,0 +1,311 @@
+/*-------------------------------------------------------------------------
+ *
+ * walwriter.c
+ *
+ * The WAL writer background process is new as of Postgres 8.3.  It attempts
+ * to keep regular backends from having to write out (and fsync) WAL pages.
+ * Also, it guarantees that transaction commit records that weren't synced
+ * to disk immediately upon commit (ie, were "asynchronously committed")
+ * will reach disk within a knowable time --- which, as it happens, is at
+ * most three times the wal_writer_delay cycle time.
+ *
+ * Note that as with the bgwriter for shared buffers, regular backends are
+ * still empowered to issue WAL writes and fsyncs when the walwriter doesn't
+ * keep up.
+ *
+ * Because the walwriter's cycle is directly linked to the maximum delay
+ * before async-commit transactions are guaranteed committed, it's probably
+ * unwise to load additional functionality onto it.  For instance, if you've
+ * got a yen to create xlog segments further in advance, that'd be better done
+ * in bgwriter than in walwriter.
+ *
+ * The walwriter is started by the postmaster as soon as the startup subprocess
+ * finishes.  It remains alive until the postmaster commands it to terminate.
+ * Normal termination is by SIGTERM, which instructs the walwriter to exit(0).
+ * Emergency termination is by SIGQUIT; like any backend, the walwriter will
+ * simply abort and exit on SIGQUIT.
+ *
+ * If the walwriter exits unexpectedly, the postmaster treats that the same
+ * as a backend crash: shared memory may be corrupted, so remaining backends
+ * should be killed by SIGQUIT and then a recovery cycle started.
+ *
+ *
+ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ *   $PostgreSQL: pgsql/src/backend/postmaster/walwriter.c,v 1.1 2007/07/24 04:54:09 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include 
+#include 
+#include 
+#include 
+
+#include "access/xlog.h"
+#include "libpq/pqsignal.h"
+#include "miscadmin.h"
+#include "postmaster/walwriter.h"
+#include "storage/bufmgr.h"
+#include "storage/ipc.h"
+#include "storage/lwlock.h"
+#include "storage/pmsignal.h"
+#include "storage/smgr.h"
+#include "utils/guc.h"
+#include "utils/memutils.h"
+#include "utils/resowner.h"
+
+
+/*
+ * GUC parameters
+ */
+int            WalWriterDelay = 200;
+
+/*
+ * Flags set by interrupt handlers for later service in the main loop.
+ */
+static volatile sig_atomic_t got_SIGHUP = false;
+static volatile sig_atomic_t shutdown_requested = false;
+
+/* Signal handlers */
+static void wal_quickdie(SIGNAL_ARGS);
+static void WalSigHupHandler(SIGNAL_ARGS);
+static void WalShutdownHandler(SIGNAL_ARGS);
+
+
+/*
+ * Main entry point for walwriter process
+ *
+ * This is invoked from AuxiliaryProcessMain, which has already created the
+ * basic execution environment, but not enabled signals yet.
+ */
+void
+WalWriterMain(void)
+{
+   sigjmp_buf  local_sigjmp_buf;
+   MemoryContext walwriter_context;
+
+   /*
+    * If possible, make this process a group leader, so that the postmaster
+    * can signal any child processes too.  (walwriter probably never has
+    * any child processes, but for consistency we make all postmaster
+    * child processes do this.)
+    */
+#ifdef HAVE_SETSID
+   if (setsid() < 0)
+       elog(FATAL, "setsid() failed: %m");
+#endif
+
+   /*
+    * Properly accept or ignore signals the postmaster might send us
+    *
+    * We have no particular use for SIGINT at the moment, but seems
+    * reasonable to treat like SIGTERM.
+    */
+   pqsignal(SIGHUP, WalSigHupHandler); /* set flag to read config file */
+   pqsignal(SIGINT, WalShutdownHandler);       /* request shutdown */
+   pqsignal(SIGTERM, WalShutdownHandler);      /* request shutdown */
+   pqsignal(SIGQUIT, wal_quickdie);        /* hard crash time */
+   pqsignal(SIGALRM, SIG_IGN);
+   pqsignal(SIGPIPE, SIG_IGN);
+   pqsignal(SIGUSR1, SIG_IGN); /* reserve for sinval */
+   pqsignal(SIGUSR2, SIG_IGN); /* not used */
+
+   /*
+    * Reset some signals that are accepted by postmaster but not here
+    */
+   pqsignal(SIGCHLD, SIG_DFL);
+   pqsignal(SIGTTIN, SIG_DFL);
+   pqsignal(SIGTTOU, SIG_DFL);
+   pqsignal(SIGCONT, SIG_DFL);
+   pqsignal(SIGWINCH, SIG_DFL);
+
+   /* We allow SIGQUIT (quickdie) at all times */
+#ifdef HAVE_SIGPROCMASK
+   sigdelset(&BlockSig, SIGQUIT);
+#else
+   BlockSig &= ~(sigmask(SIGQUIT));
+#endif
+
+   /*
+    * Create a resource owner to keep track of our resources (not clear
+    * that we need this, but may as well have one).
+    */
+   CurrentResourceOwner = ResourceOwnerCreate(NULL, "Wal Writer");
+
+   /*
+    * Create a memory context that we will do all our work in.  We do this so
+    * that we can reset the context during error recovery and thereby avoid
+    * possible memory leaks.  We don't simply run in TopMemoryContext,
+    * because resetting that would be a really bad idea.
+    */
+   walwriter_context = AllocSetContextCreate(TopMemoryContext,
+                                             "Wal Writer",
+                                             ALLOCSET_DEFAULT_MINSIZE,
+                                             ALLOCSET_DEFAULT_INITSIZE,
+                                             ALLOCSET_DEFAULT_MAXSIZE);
+   MemoryContextSwitchTo(walwriter_context);
+
+   /*
+    * If an exception is encountered, processing resumes here.
+    *
+    * This code is heavily based on bgwriter.c, q.v.
+    */
+   if (sigsetjmp(local_sigjmp_buf, 1) != 0)
+   {
+       /* Since not using PG_TRY, must reset error stack by hand */
+       error_context_stack = NULL;
+
+       /* Prevent interrupts while cleaning up */
+       HOLD_INTERRUPTS();
+
+       /* Report the error to the server log */
+       EmitErrorReport();
+
+       /*
+        * These operations are really just a minimal subset of
+        * AbortTransaction().  We don't have very many resources to worry
+        * about in walwriter, but we do have LWLocks, and perhaps buffers?
+        */
+       LWLockReleaseAll();
+       AbortBufferIO();
+       UnlockBuffers();
+       /* buffer pins are released here: */
+       ResourceOwnerRelease(CurrentResourceOwner,
+                            RESOURCE_RELEASE_BEFORE_LOCKS,
+                            false, true);
+       /* we needn't bother with the other ResourceOwnerRelease phases */
+       AtEOXact_Buffers(false);
+
+       /*
+        * Now return to normal top-level context and clear ErrorContext for
+        * next time.
+        */
+       MemoryContextSwitchTo(walwriter_context);
+       FlushErrorState();
+
+       /* Flush any leaked data in the top-level context */
+       MemoryContextResetAndDeleteChildren(walwriter_context);
+
+       /* Now we can allow interrupts again */
+       RESUME_INTERRUPTS();
+
+       /*
+        * Sleep at least 1 second after any error.  A write error is likely
+        * to be repeated, and we don't want to be filling the error logs as
+        * fast as we can.
+        */
+       pg_usleep(1000000L);
+
+       /*
+        * Close all open files after any error.  This is helpful on Windows,
+        * where holding deleted files open causes various strange errors.
+        * It's not clear we need it elsewhere, but shouldn't hurt.
+        */
+       smgrcloseall();
+   }
+
+   /* We can now handle ereport(ERROR) */
+   PG_exception_stack = &local_sigjmp_buf;
+
+   /*
+    * Unblock signals (they were blocked when the postmaster forked us)
+    */
+   PG_SETMASK(&UnBlockSig);
+
+   /*
+    * Loop forever
+    */
+   for (;;)
+   {
+       long        udelay;
+
+       /*
+        * Emergency bailout if postmaster has died.  This is to avoid the
+        * necessity for manual cleanup of all postmaster children.
+        */
+       if (!PostmasterIsAlive(true))
+           exit(1);
+
+       /*
+        * Process any requests or signals received recently.
+        */
+       if (got_SIGHUP)
+       {
+           got_SIGHUP = false;
+           ProcessConfigFile(PGC_SIGHUP);
+       }
+       if (shutdown_requested)
+       {
+           /* Normal exit from the walwriter is here */
+           proc_exit(0);       /* done */
+       }
+
+       /*
+        * Do what we're here for...
+        */
+       XLogBackgroundFlush();
+
+       /*
+        * Delay until time to do something more, but fall out of delay
+        * reasonably quickly if signaled.
+        */
+       udelay = WalWriterDelay * 1000L;
+       while (udelay > 999999L)
+       {
+           if (got_SIGHUP || shutdown_requested)
+               break;
+           pg_usleep(1000000L);
+           udelay -= 1000000L;
+       }
+       if (!(got_SIGHUP || shutdown_requested))
+           pg_usleep(udelay);
+   }
+}
+
+
+/* --------------------------------
+ *     signal handler routines
+ * --------------------------------
+ */
+
+/*
+ * wal_quickdie() is the SIGQUIT handler installed by WalWriterMain.
+ *
+ * The postmaster sends SIGQUIT after another child process has crashed,
+ * so we need to stop what we're doing and exit.
+ */
+static void
+wal_quickdie(SIGNAL_ARGS)
+{
+   PG_SETMASK(&BlockSig);
+
+   /*
+    * DO NOT proc_exit() -- we're here because shared memory may be
+    * corrupted, so we don't want to try to clean up our transaction. Just
+    * nail the windows shut and get out of town.
+    *
+    * Note we do exit(2) not exit(0).  This is to force the postmaster into a
+    * system reset cycle if someone sends a manual SIGQUIT to the walwriter.
+    * This is necessary precisely because we don't clean up our
+    * shared memory state.
+    */
+   exit(2);
+}
+
+/* SIGHUP: set got_SIGHUP so the main loop re-reads the config file */
+static void
+WalSigHupHandler(SIGNAL_ARGS)
+{
+   got_SIGHUP = true;
+}
+
+/* SIGINT or SIGTERM: set flag so the main loop exits normally */
+static void
+WalShutdownHandler(SIGNAL_ARGS)
+{
+   shutdown_requested = true;
+}
index 06915017e6e794fe16bce272fa091bfed97b83ad..b2d0ea9cae597f6d6397e2695bceecb6795b9fc3 100644 (file)
@@ -10,7 +10,7 @@
  * Written by Peter Eisentraut .
  *
  * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/utils/misc/guc.c,v 1.406 2007/07/24 01:53:56 alvherre Exp $
+ *   $PostgreSQL: pgsql/src/backend/utils/misc/guc.c,v 1.407 2007/07/24 04:54:09 tgl Exp $
  *
  *--------------------------------------------------------------------
  */
@@ -54,6 +54,7 @@
 #include "postmaster/bgwriter.h"
 #include "postmaster/postmaster.h"
 #include "postmaster/syslogger.h"
+#include "postmaster/walwriter.h"
 #include "storage/fd.h"
 #include "storage/freespace.h"
 #include "tcop/tcopprot.h"
@@ -1509,6 +1510,16 @@ static struct config_int ConfigureNamesInt[] =
        8, 4, INT_MAX, NULL, NULL
    },
 
+   {
+       {"wal_writer_delay", PGC_SIGHUP, WAL_SETTINGS,
+           gettext_noop("WAL writer sleep time between WAL flushes."),
+           NULL,
+           GUC_UNIT_MS
+       },
+       &WalWriterDelay,
+       200, 1, 10000, NULL, NULL
+   },
+
    {
        {"commit_delay", PGC_USERSET, WAL_CHECKPOINTS,
            gettext_noop("Sets the delay in microseconds between transaction commit and "
index 51c83ade0afdf324af8d616576ef26c29228fadd..8bfad997ff38cf96b49d0eb333b899e270d0ae99 100644 (file)
 #full_page_writes = on         # recover from partial page writes
 #wal_buffers = 64kB            # min 32kB
                    # (change requires restart)
+#wal_writer_delay = 200ms      # range 1-10000, in milliseconds
+
 #commit_delay = 0          # range 0-100000, in microseconds
 #commit_siblings = 5           # range 1-1000
 
index 1b4fecdb966f11f485360e28345c109cedcce15c..adc99a6eb0610e0f551279e8d721003bfd992106 100644 (file)
@@ -6,7 +6,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/xlog.h,v 1.80 2007/06/30 19:12:02 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/access/xlog.h,v 1.81 2007/07/24 04:54:09 tgl Exp $
  */
 #ifndef XLOG_H
 #define XLOG_H
@@ -196,6 +196,7 @@ extern CheckpointStatsData CheckpointStats;
 
 extern XLogRecPtr XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata);
 extern void XLogFlush(XLogRecPtr RecPtr);
+extern void XLogBackgroundFlush(void);
 extern bool XLogNeedsFlush(XLogRecPtr RecPtr);
 
 extern void xlog_redo(XLogRecPtr lsn, XLogRecord *record);
index bbde68ea1b14f0211640498f3fc35de50e931da6..d75626c8d257f5de764265270c9c689811472496 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/bootstrap/bootstrap.h,v 1.46 2007/03/07 13:35:03 alvherre Exp $
+ * $PostgreSQL: pgsql/src/include/bootstrap/bootstrap.h,v 1.47 2007/07/24 04:54:09 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -69,7 +69,8 @@ typedef enum
    CheckerProcess,
    BootstrapProcess,
    StartupProcess,
-   BgWriterProcess
+   BgWriterProcess,
+   WalWriterProcess
 } AuxProcType;
 
 #endif   /* BOOTSTRAP_H */
diff --git a/src/include/postmaster/walwriter.h b/src/include/postmaster/walwriter.h
new file mode 100644 (file)
index 0000000..3cefe9a
--- /dev/null
@@ -0,0 +1,20 @@
+/*-------------------------------------------------------------------------
+ *
+ * walwriter.h
+ *   Exports from postmaster/walwriter.c.
+ *
+ *   Declares the WAL writer's configuration variable and its exported
+ *   routine.
+ *
+ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
+ *
+ * $PostgreSQL: pgsql/src/include/postmaster/walwriter.h,v 1.1 2007/07/24 04:54:09 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef _WALWRITER_H
+#define _WALWRITER_H
+
+/*
+ * GUC options
+ *
+ * WalWriterDelay backs the wal_writer_delay setting: the time, in
+ * milliseconds, that the WAL writer sleeps between WAL flushes.
+ */
+extern int WalWriterDelay;
+
+extern void WalWriterMain(void);   /* WAL writer process code, in postmaster/walwriter.c */
+
+#endif   /* _WALWRITER_H */