+ linkend="guc-max-standby-streaming-delay">, that define the maximum
+ allowed delay in WAL application. Conflicting queries will be canceled
+ once it has taken longer than the relevant delay setting to apply any
+ newly-received WAL data. There are two parameters so that different delay
+ values can be specified for the case of reading WAL data from an archive
+ (i.e., initial recovery from a base backup or catching up a
+ standby server that has fallen far behind) versus reading WAL data via
+ streaming replication.
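For illustration only, a standby whose catch-up from archive should finish
quickly but whose streaming-time queries deserve more leeway might use
settings along these lines (the values are examples, not recommendations):

    max_standby_archive_delay = 10s     # keep catch-up from archive fast
    max_standby_streaming_delay = 5min  # give standby queries more room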
- Experienced users should note that both row version cleanup and row version
- freezing will potentially conflict with recovery queries. Running a
- manual VACUUM FREEZE> is likely to cause conflicts even on tables
- with no updated or deleted rows.
+ In a standby server that exists primarily for high availability, it's
+ best to set the delay parameters relatively short, so that the server
+ cannot fall far behind the primary due to delays caused by standby
+ queries. However, if the standby server is meant for executing
+ long-running queries, then a high or even infinite delay value may be
+ preferable. Keep in mind, though, that a long-running query that
+ delays application of WAL records can prevent other sessions on the
+ standby server from seeing recent changes made on the primary.
- There are a number of choices for resolving query conflicts. The default
- is to wait and hope the query finishes. The server will wait
- automatically until the lag between primary and standby is at most
- (30 seconds by default).
- Once that grace period expires,
- one of the following actions is taken:
-
-
-
- If the conflict is caused by a lock, the conflicting standby
- transaction is cancelled immediately. If the transaction is
- idle-in-transaction, then the session is aborted instead.
- This behavior might change in the future.
-
-
-
-
- If the conflict is caused by cleanup records, the standby query is informed
- a conflict has occurred and that it must cancel itself to avoid the
- risk that it silently fails to read relevant data because
- that data has been removed. Some cleanup
- records only conflict with older queries, while others
- can affect all queries.
-
-
- Cancelled queries may be retried immediately (after beginning a new
- transaction, of course). Since query cancellation depends on
- the nature of the WAL records being replayed, a query that was
- cancelled may succeed if it is executed again.
-
-
-
+ The most common reason for conflict between standby queries and WAL replay
+ is
early cleanup>. Normally, PostgreSQL> allows
+ cleanup of old row versions when there are no transactions that need to
+ see them to ensure correct visibility of data according to MVCC rules.
+ However, this rule can only be applied to transactions executing on the
+ master. So it is possible that cleanup on the master will remove row
+ versions that are still visible to a transaction on the standby.
- Keep in mind that max_standby_delay> is compared to the
- difference between the standby server's clock and the transaction
- commit timestamps read from the WAL log. Thus, the grace period
- allowed to any one query on the standby is never more than
- max_standby_delay>, and could be considerably less if the
- standby has already fallen behind as a result of waiting for previous
- queries to complete, or as a result of being unable to keep up with a
- heavy update load.
+ Experienced users should note that both row version cleanup and row version
+ freezing will potentially conflict with standby queries. Running a manual
+ VACUUM FREEZE> is likely to cause conflicts even on tables with
+ no updated or deleted rows.
-
- Be sure that the primary and standby servers' clocks are kept in sync;
- otherwise the values compared to max_standby_delay> will be
- erroneous, possibly leading to additional query cancellations.
- If the clocks are intentionally not in sync, or if there is a large
- propagation delay from primary to standby, it is advisable to set
- max_standby_delay> to -1. In any case the value should be
- larger than the largest expected clock skew between primary and standby.
-
-
+ Once the delay specified by max_standby_archive_delay> or
+ max_standby_streaming_delay> has been exceeded, conflicting
+ queries will be cancelled. This usually results just in a cancellation
+ error, although in the case of replaying a DROP DATABASE>
+ the entire conflicting session will be terminated. Also, if the conflict
+ is over a lock held by an idle transaction, the conflicting session is
+ terminated (this behavior might change in the future).
+
- Users should be clear that tables that are regularly and heavily updated on the
- primary server will quickly cause cancellation of longer running queries on
- the standby. In those cases max_standby_delay> can be
- considered similar to setting
- statement_timeout>.
-
+ Cancelled queries may be retried immediately (after beginning a new
+ transaction, of course). Since query cancellation depends on
+ the nature of the WAL records being replayed, a query that was
+ cancelled may well succeed if it is executed again.
+
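As a sketch of that sequence (the table name and exact error text here are
illustrative), a hot-standby session might see:

    -- On the standby, inside a hot-standby session
    BEGIN;
    SELECT count(*) FROM reporting_data;   -- long-running query
    -- If WAL replay has waited longer than the configured delay, the query
    -- is cancelled with an error along the lines of:
    --   ERROR:  canceling statement due to conflict with recovery
    ROLLBACK;
    -- Retrying in a new transaction frequently succeeds:
    BEGIN;
    SELECT count(*) FROM reporting_data;
    COMMIT;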
- Other remedial actions exist if the number of cancellations is unacceptable.
- The first option is to connect to the primary server and keep a query active
- for as long as needed to run queries on the standby. This guarantees that
- a WAL cleanup record is never generated and query conflicts do not occur,
- as described above. This could be done using contrib/dblink>
- and pg_sleep()>, or via other mechanisms. If you do this, you
- should note that this will delay cleanup of dead rows on the primary by
- vacuum or HOT, which may be undesirable. However, remember
- that the primary and standby nodes are linked via the WAL, so the cleanup
- situation is no different from the case where the query ran on the primary
- node itself, and you are still getting the benefit of off-loading the
- execution onto the standby. max_standby_delay> should
- not be used in this case because delayed WAL files might already
- contain entries that invalidate the current snapshot.
+ Keep in mind that the delay parameters are compared to the elapsed time
+ since the WAL data was received by the standby server. Thus, the grace
+ period allowed to any one query on the standby is never more than the
+ delay parameter, and could be considerably less if the standby has already
+ fallen behind as a result of waiting for previous queries to complete, or
+ as a result of being unable to keep up with a heavy update load.
- It is also possible to set vacuum_defer_cleanup_age> on the primary
- to defer the cleanup of records by autovacuum, VACUUM>
- and HOT. This might allow
- more time for queries to execute before they are cancelled on the standby,
- without the need for setting a high max_standby_delay>.
+ Users should be clear that tables that are regularly and heavily updated
+ on the primary server will quickly cause cancellation of longer running
+ queries on the standby. In such cases the setting of a finite value for
+ max_standby_archive_delay> or
+ max_standby_streaming_delay> can be considered similar to
+ setting statement_timeout>.
- Three-way deadlocks are possible between AccessExclusiveLocks> arriving from
- the primary, cleanup WAL records that require buffer cleanup locks, and
- user requests that are waiting behind replayed AccessExclusiveLocks>.
- Deadlocks are resolved automatically after deadlock_timeout>
- seconds, though they are thought to be rare in practice.
+ Remedial possibilities exist if the number of standby-query cancellations
+ is found to be unacceptable. The first option is to connect to the
+ primary server and keep a query active for as long as needed to
+ run queries on the standby. This prevents VACUUM> from removing
+ recently-dead rows and so cleanup conflicts do not occur.
+ This could be done using contrib/dblink> and
+ pg_sleep()>, or via other mechanisms. If you do this, you
+ should note that this will delay cleanup of dead rows on the primary,
+ which may result in undesirable table bloat. However, the cleanup
+ situation will be no worse than if the standby queries were running
+ directly on the primary server, and you are still getting the benefit of
+ off-loading execution onto the standby.
+ max_standby_archive_delay> must be kept large in this case,
+ because delayed WAL files might already contain entries that conflict with
+ the desired standby queries.
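For example, assuming contrib/dblink is installed and using an illustrative
connection string, a session could hold a query open on the primary for
roughly the duration of the intended standby workload:

    -- Keep a query (and therefore a snapshot) active on the primary for an
    -- hour, so that VACUUM there cannot yet remove row versions the standby
    -- queries may still need to see.
    SELECT *
      FROM dblink('host=primary.example.com dbname=app',
                  'SELECT pg_sleep(3600)') AS t(dummy text);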
- Dropping tablespaces or databases is discussed in the administrator's
- section since they are not typical user situations.
+ Another option is to increase vacuum_defer_cleanup_age>
+ on the primary server, so that dead rows will not be cleaned up as quickly
+ as they normally would be. This will allow more time for queries to
+ execute before they are cancelled on the standby, without having to set
+ a high max_standby_streaming_delay>. However, it is
+ difficult to guarantee any specific execution-time window with this
+ approach, since vacuum_defer_cleanup_age> is measured in
+ transactions executed on the primary server.
- It is important that the administrator consider the appropriate setting
- of max_standby_delay>,
- which can be set in postgresql.conf>.
- There is no optimal setting, so it should be set according to business
- priorities. For example if the server is primarily tasked as a High
- Availability server, then you may wish to lower
- max_standby_delay> or even set it to zero, though that is a
- very aggressive setting. If the standby server is tasked as an additional
- server for decision support queries then it might be acceptable to set this
- to a value of many hours. It is also possible to set
- max_standby_delay> to -1 which means wait forever for queries
- to complete; this will be useful when performing
- an archive recovery from a backup.
+ It is important that the administrator select appropriate settings for
+ linkend="guc-max-standby-archive-delay"> and
+ linkend="guc-max-standby-streaming-delay">. The best choices vary
+ depending on business priorities. For example, if the server is primarily
+ tasked as a high availability server, then you will want low delay
+ settings, perhaps even zero, though that is a very aggressive setting. If
+ the standby server is tasked as an additional server for decision support
+ queries then it might be acceptable to set the maximum delay values to
+ many hours, or even -1 which means wait forever for queries to complete.
- Running DROP DATABASE>, ALTER DATABASE ... SET TABLESPACE>,
- or ALTER DATABASE ... RENAME> on primary will generate a log message
- that will cause all users connected to that database on the standby to be
- forcibly disconnected. This action occurs immediately, whatever the setting of
- max_standby_delay>.
+ Running DROP DATABASE>, ALTER DATABASE ... SET
+ TABLESPACE>, or ALTER DATABASE ... RENAME> on the primary
+ will generate a WAL entry that will cause all users connected to that
+ database on the standby to be forcibly disconnected. This action occurs
+ immediately, whatever the setting of
+ max_standby_streaming_delay>.
- Autovacuum is not active during recovery, it will start normally at the
+ Autovacuum is not active during recovery. It will start normally at the
end of recovery.
Various parameters have been mentioned above in
- admin">
- and ">.
+ conflict"> and
+ ">.
On the primary, parameters and
can be used.
- has no effect if set on the primary.
+ and
+ have no effect if set on
+ the primary.
- On the standby, parameters and
- can be used.
- has no effect during
- recovery.
+ On the standby, parameters ,
+ and
+ can be used.
+ has no effect
+ as long as the server remains in standby mode, though it will
+ become relevant if the standby becomes primary.
* Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.427 2010/06/28 19:46:19 rhaas Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.428 2010/07/03 20:43:57 tgl Exp $
*
*-------------------------------------------------------------------------
*/
bool XLogArchiveMode = false;
char *XLogArchiveCommand = NULL;
bool EnableHotStandby = false;
-int MaxStandbyDelay = 30 * 1000;
bool fullPageWrites = true;
bool log_checkpoints = false;
int sync_method = DEFAULT_SYNC_METHOD;
*/
static XLogwrtResult LogwrtResult = {{0, 0}, {0, 0}};
+/*
+ * Codes indicating where we got a WAL file from during recovery, or where
+ * to attempt to get one. These are chosen so that they can be OR'd together
+ * in a bitmask state variable.
+ */
+#define XLOG_FROM_ARCHIVE (1<<0) /* Restored using restore_command */
+#define XLOG_FROM_PG_XLOG (1<<1) /* Existing file in pg_xlog */
+#define XLOG_FROM_STREAM (1<<2) /* Streamed from master */
+
/*
* openLogFile is -1 or a kernel FD for an open log file segment.
* When it's open, openLogOff is the current seek offset in the file.
static uint32 openLogSeg = 0;
static uint32 openLogOff = 0;
-/*
- * Codes indicating where we got a WAL file from during recovery, or where
- * to attempt to get one.
- */
-#define XLOG_FROM_ARCHIVE (1<<0) /* Restored using restore_command */
-#define XLOG_FROM_PG_XLOG (1<<1) /* Existing file in pg_xlog */
-#define XLOG_FROM_STREAM (1<<2) /* Streamed from master */
-
/*
* These variables are used similarly to the ones above, but for reading
* the XLOG. Note, however, that readOff generally represents the offset
* Keeps track of which sources we've tried to read the current WAL
* record from and failed.
*/
-static int failedSources = 0;
+static int failedSources = 0; /* OR of XLOG_FROM_* codes */
+
+/*
+ * These variables track when we last obtained some WAL data to process,
+ * and where we got it from. (XLogReceiptSource is initially the same as
+ * readSource, but readSource gets reset to zero when we don't have data
+ * to process right now.)
+ */
+static TimestampTz XLogReceiptTime = 0;
+static int XLogReceiptSource = 0; /* XLOG_FROM_* code */
/* Buffer for currently read page (XLOG_BLCKSZ bytes) */
static char *readBuf = NULL;
* Open a logfile segment for reading (during recovery).
*
* If source = XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
- * If source = XLOG_FROM_PG_XLOG, it's read from pg_xlog.
+ * Otherwise, it's assumed to be already available in pg_xlog.
*/
static int
XLogFileRead(uint32 log, uint32 seg, int emode, TimeLineID tli,
break;
case XLOG_FROM_PG_XLOG:
+ case XLOG_FROM_STREAM:
XLogFilePath(path, tli, log, seg);
restoredFromArchive = false;
break;
xlogfname);
set_ps_display(activitymsg, false);
+ /* Track source of data in assorted state variables */
readSource = source;
+ XLogReceiptSource = source;
+ /* In FROM_STREAM case, caller tracks receipt time, not me */
+ if (source != XLOG_FROM_STREAM)
+ XLogReceiptTime = GetCurrentTimestamp();
+
return fd;
}
if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
/*
* Returns timestamp of last recovered commit/abort record.
*/
-TimestampTz
+static TimestampTz
GetLatestXLogTime(void)
{
/* use volatile pointer to prevent code rearrangement */
return recoveryLastXTime;
}
+/*
+ * Returns time of receipt of current chunk of XLOG data, as well as
+ * whether it was received from streaming replication or from archives.
+ */
+void
+GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
+{
+ /*
+ * This must be executed in the startup process, since we don't export
+ * the relevant state to shared memory.
+ */
+ Assert(InRecovery);
+
+ *rtime = XLogReceiptTime;
+ *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
+}
+
/*
* Note that text field supplied is a parameter name and does not require
* translation
xlogctl->recoveryLastRecPtr = ReadRecPtr;
SpinLockRelease(&xlogctl->info_lck);
+ /* Also ensure XLogReceiptTime has a sane value */
+ XLogReceiptTime = GetCurrentTimestamp();
+
/*
* Let postmaster know we've started redo now, so that it can
* launch bgwriter to perform restartpoints. We don't bother
XLogRecPtr endptr;
/* Get the current (or recent) end of xlog */
- endptr = GetWalRcvWriteRecPtr();
+ endptr = GetWalRcvWriteRecPtr(NULL);
PrevLogSeg(_logId, _logSeg);
RemoveOldXlogFiles(_logId, _logSeg, endptr);
XLogRecPtr recptr;
char location[MAXFNAMELEN];
- recptr = GetWalRcvWriteRecPtr();
+ recptr = GetWalRcvWriteRecPtr(NULL);
if (recptr.xlogid == 0 && recptr.xrecoff == 0)
PG_RETURN_NULL();
{
if (WalRcvInProgress())
{
+ bool havedata;
+
/*
* If we find an invalid record in the WAL streamed from
* master, something is seriously wrong. There's little
}
/*
- * While walreceiver is active, wait for new WAL to arrive
- * from primary.
+ * Walreceiver is active, so see if new data has arrived.
+ *
+ * We only advance XLogReceiptTime when we obtain fresh
+ * WAL from walreceiver and observe that we had already
+ * processed everything before the most recent "chunk"
+ * that it flushed to disk. In steady state where we are
+ * keeping up with the incoming data, XLogReceiptTime
+ * will be updated on each cycle. When we are behind,
+ * XLogReceiptTime will not advance, so the grace time
+ * allotted to conflicting queries will decrease.
*/
- receivedUpto = GetWalRcvWriteRecPtr();
if (XLByteLT(*RecPtr, receivedUpto))
+ havedata = true;
+ else
+ {
+ XLogRecPtr latestChunkStart;
+
+ receivedUpto = GetWalRcvWriteRecPtr(&latestChunkStart);
+ if (XLByteLT(*RecPtr, receivedUpto))
+ {
+ havedata = true;
+ if (!XLByteLT(*RecPtr, latestChunkStart))
+ XLogReceiptTime = GetCurrentTimestamp();
+ }
+ else
+ havedata = false;
+ }
+ if (havedata)
{
/*
* Great, streamed far enough. Open the file if it's
- * not open already.
+ * not open already. Use XLOG_FROM_STREAM so that
+ * source info is set correctly and XLogReceiptTime
+ * isn't changed.
*/
if (readFile < 0)
{
readFile =
XLogFileRead(readId, readSeg, PANIC,
recoveryTargetTLI,
- XLOG_FROM_PG_XLOG, false);
+ XLOG_FROM_STREAM, false);
+ Assert(readFile >= 0);
switched_segment = true;
+ }
+ else
+ {
+ /* just make sure source info is correct... */
readSource = XLOG_FROM_STREAM;
+ XLogReceiptSource = XLOG_FROM_STREAM;
}
break;
}
+ /*
+ * Data not here yet, so check for trigger then sleep.
+ */
if (CheckForStandbyTrigger())
goto triggered;
readFile = XLogFileReadAnyTLI(readId, readSeg, DEBUG2,
sources);
switched_segment = true;
- if (readFile != -1)
+ if (readFile >= 0)
break;
/*
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/replication/walreceiver.c,v 1.14 2010/06/09 15:04:07 heikki Exp $
+ * $PostgreSQL: pgsql/src/backend/replication/walreceiver.c,v 1.15 2010/07/03 20:43:57 tgl Exp $
*
*-------------------------------------------------------------------------
*/
/* Update shared-memory status */
SpinLockAcquire(&walrcv->mutex);
+ walrcv->latestChunkStart = walrcv->receivedUpto;
walrcv->receivedUpto = LogstreamResult.Flush;
SpinLockRelease(&walrcv->mutex);
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/replication/walreceiverfuncs.c,v 1.5 2010/04/28 16:54:15 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/replication/walreceiverfuncs.c,v 1.6 2010/07/03 20:43:57 tgl Exp $
*
*-------------------------------------------------------------------------
*/
if (recptr.xrecoff % XLogSegSize != 0)
recptr.xrecoff -= recptr.xrecoff % XLogSegSize;
+ SpinLockAcquire(&walrcv->mutex);
+
/* It better be stopped before we try to restart it */
Assert(walrcv->walRcvState == WALRCV_STOPPED);
- SpinLockAcquire(&walrcv->mutex);
if (conninfo != NULL)
strlcpy((char *) walrcv->conninfo, conninfo, MAXCONNINFO);
else
walrcv->startTime = now;
walrcv->receivedUpto = recptr;
+ walrcv->latestChunkStart = recptr;
+
SpinLockRelease(&walrcv->mutex);
SendPostmasterSignal(PMSIGNAL_START_WALRECEIVER);
}
/*
- * Returns the byte position that walreceiver has written
+ * Returns the last+1 byte position that walreceiver has written.
+ *
+ * Optionally, returns the previous chunk start, that is the first byte
+ * written in the most recent walreceiver flush cycle. Callers not
+ * interested in that value may pass NULL for latestChunkStart.
*/
XLogRecPtr
-GetWalRcvWriteRecPtr(void)
+GetWalRcvWriteRecPtr(XLogRecPtr *latestChunkStart)
{
/* use volatile pointer to prevent code rearrangement */
volatile WalRcvData *walrcv = WalRcv;
SpinLockAcquire(&walrcv->mutex);
recptr = walrcv->receivedUpto;
+ if (latestChunkStart)
+ *latestChunkStart = walrcv->latestChunkStart;
SpinLockRelease(&walrcv->mutex);
return recptr;
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/storage/ipc/standby.c,v 1.25 2010/06/14 00:49:24 itagaki Exp $
+ * $PostgreSQL: pgsql/src/backend/storage/ipc/standby.c,v 1.26 2010/07/03 20:43:58 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "storage/standby.h"
#include "utils/ps_status.h"
+/* User-settable GUC parameters */
int vacuum_defer_cleanup_age;
+int max_standby_archive_delay = 30 * 1000;
+int max_standby_streaming_delay = 30 * 1000;
static List *RecoveryLockList;
static void LogCurrentRunningXacts(RunningTransactions CurrRunningXacts);
static void LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks);
+
/*
* InitRecoveryTransactionEnvironment
- * Initiallize tracking of in-progress transactions in master
+ * Initialize tracking of in-progress transactions in master
*
* We need to issue shared invalidations and hold locks. Holding locks
- * means others may want to wait on us, so we need to make lock table
- * inserts to appear like a transaction. We could create and delete
+ * means others may want to wait on us, so we need to make a lock table
+ * vxact entry like a real transaction. We could create and delete
* lock table entries for each transaction but its simpler just to create
* one permanent entry and leave it there all the time. Locks are then
* acquired and released as needed. Yes, this means you can see the
VirtualTransactionId vxid;
/*
- * Initialise shared invalidation management for Startup process, being
+ * Initialize shared invalidation management for Startup process, being
* careful to register ourselves as a sendOnly process so we don't need to
* read messages, nor will we get signalled when the queue starts filling
* up.
* -----------------------------------------------------
*/
+/*
+ * Determine the cutoff time at which we want to start canceling conflicting
+ * transactions. Returns zero (a time safely in the past) if we are willing
+ * to wait forever.
+ */
+static TimestampTz
+GetStandbyLimitTime(void)
+{
+ TimestampTz rtime;
+ bool fromStream;
+
+ /*
+ * The cutoff time is the last WAL data receipt time plus the appropriate
+ * delay variable. Delay of -1 means wait forever.
+ */
+ GetXLogReceiptTime(&rtime, &fromStream);
+ if (fromStream)
+ {
+ if (max_standby_streaming_delay < 0)
+ return 0; /* wait forever */
+ return TimestampTzPlusMilliseconds(rtime, max_standby_streaming_delay);
+ }
+ else
+ {
+ if (max_standby_archive_delay < 0)
+ return 0; /* wait forever */
+ return TimestampTzPlusMilliseconds(rtime, max_standby_archive_delay);
+ }
+}
+
#define STANDBY_INITIAL_WAIT_US 1000
static int standbyWait_us = STANDBY_INITIAL_WAIT_US;
static bool
WaitExceedsMaxStandbyDelay(void)
{
- /* Are we past max_standby_delay? */
- if (MaxStandbyDelay >= 0 &&
- TimestampDifferenceExceeds(GetLatestXLogTime(), GetCurrentTimestamp(),
- MaxStandbyDelay))
+ TimestampTz ltime;
+
+ /* Are we past the limit time? */
+ ltime = GetStandbyLimitTime();
+ if (ltime && GetCurrentTimestamp() >= ltime)
return true;
/*
pid = CancelVirtualTransaction(*waitlist, reason);
/*
- * Wait awhile for it to die so that we avoid flooding an
- * unresponsive backend when system is heavily loaded.
+ * Wait a little bit for it to die so that we avoid flooding
+ * an unresponsive backend when system is heavily loaded.
*/
if (pid != 0)
pg_usleep(5000L);
ResolveRecoveryConflictWithDatabase(Oid dbid)
{
/*
- * We don't do ResolveRecoveryConflictWithVirutalXIDs() here since that
+ * We don't do ResolveRecoveryConflictWithVirtualXIDs() here since that
* only waits for transactions and completely idle sessions would block
* us. This is rare enough that we do this as simply as possible: no wait,
* just force them off immediately.
* the limit of our patience. The sleep in LockBufferForCleanup() is
* performed here, for code clarity.
*
- * Resolve conflict by sending a SIGUSR1 reason to all backends to check if
+ * Resolve conflicts by sending a PROCSIG signal to all backends to check if
* they hold one of the buffer pins that is blocking Startup process. If so,
* backends will take an appropriate error action, ERROR or FATAL.
*
- * We also check for deadlocks before we wait, though applications that cause
- * these will be extremely rare. Deadlocks occur because if queries
+ * We also must check for deadlocks. Deadlocks occur because if queries
* wait on a lock, that must be behind an AccessExclusiveLock, which can only
* be cleared if the Startup process replays a transaction completion record.
* If Startup process is also waiting then that is a deadlock. The deadlock
* Startup is sleeping and the query waits on a lock. We protect against
* only the former sequence here, the latter sequence is checked prior to
* the query sleeping, in CheckRecoveryConflictDeadlock().
+ *
+ * Deadlocks are extremely rare, and relatively expensive to check for,
+ * so we don't do a deadlock check right away ... only if we have had to wait
+ * at least deadlock_timeout. Most of the logic about that is in proc.c.
*/
void
ResolveRecoveryConflictWithBufferPin(void)
{
bool sig_alarm_enabled = false;
+ TimestampTz ltime;
+ TimestampTz now;
Assert(InHotStandby);
- if (MaxStandbyDelay == 0)
- {
- /*
- * We don't want to wait, so just tell everybody holding the pin to
- * get out of town.
- */
- SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN);
- }
- else if (MaxStandbyDelay < 0)
- {
- TimestampTz now = GetCurrentTimestamp();
+ ltime = GetStandbyLimitTime();
+ now = GetCurrentTimestamp();
+ if (!ltime)
+ {
/*
- * Set timeout for deadlock check (only)
+ * We're willing to wait forever for conflicts, so set timeout for
+ * deadlock check (only)
*/
if (enable_standby_sig_alarm(now, now, true))
sig_alarm_enabled = true;
else
elog(FATAL, "could not set timer for process wakeup");
}
+ else if (now >= ltime)
+ {
+ /*
+ * We're already behind, so clear a path as quickly as possible.
+ */
+ SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN);
+ }
else
{
- TimestampTz then = GetLatestXLogTime();
- TimestampTz now = GetCurrentTimestamp();
-
- /* Are we past max_standby_delay? */
- if (TimestampDifferenceExceeds(then, now, MaxStandbyDelay))
- {
- /*
- * We're already behind, so clear a path as quickly as possible.
- */
- SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN);
- }
+ /*
+ * Wake up at ltime, and check for deadlocks as well if we will be
+ * waiting longer than deadlock_timeout
+ */
+ if (enable_standby_sig_alarm(now, ltime, false))
+ sig_alarm_enabled = true;
else
- {
- TimestampTz max_standby_time;
-
- /*
- * At what point in the future do we hit MaxStandbyDelay?
- */
- max_standby_time = TimestampTzPlusMilliseconds(then, MaxStandbyDelay);
- Assert(max_standby_time > now);
-
- /*
- * Wake up at MaxStandby delay, and check for deadlocks as well
- * if we will be waiting longer than deadlock_timeout
- */
- if (enable_standby_sig_alarm(now, max_standby_time, false))
- sig_alarm_enabled = true;
- else
- elog(FATAL, "could not set timer for process wakeup");
- }
+ elog(FATAL, "could not set timer for process wakeup");
}
/* Wait to be signaled by UnpinBuffer() */
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/storage/lmgr/proc.c,v 1.219 2010/05/26 19:52:52 sriggs Exp $
+ * $PostgreSQL: pgsql/src/backend/storage/lmgr/proc.c,v 1.220 2010/07/03 20:43:58 tgl Exp $
*
*-------------------------------------------------------------------------
*/
bool
enable_standby_sig_alarm(TimestampTz now, TimestampTz fin_time, bool deadlock_only)
{
- TimestampTz deadlock_time = TimestampTzPlusMilliseconds(now, DeadlockTimeout);
+ TimestampTz deadlock_time = TimestampTzPlusMilliseconds(now,
+ DeadlockTimeout);
if (deadlock_only)
{
/*
- * Wake up at DeadlockTimeout only, then wait forever
+ * Wake up at deadlock_time only, then wait forever
*/
statement_fin_time = deadlock_time;
deadlock_timeout_active = true;
else if (fin_time > deadlock_time)
{
/*
- * Wake up at DeadlockTimeout, then again at MaxStandbyDelay
+ * Wake up at deadlock_time, then again at fin_time
*/
statement_fin_time = deadlock_time;
statement_fin_time2 = fin_time;
else
{
/*
- * Wake only at MaxStandbyDelay because its fairly soon
+ * Wake only at fin_time because it's fairly soon
*/
statement_fin_time = fin_time;
deadlock_timeout_active = false;
if (deadlock_timeout_active)
{
/*
- * We're still waiting when we reach DeadlockTimeout, so send out a request
- * to have other backends check themselves for deadlock. Then continue
- * waiting until MaxStandbyDelay.
+ * We're still waiting when we reach deadlock timeout, so send out
+ * a request to have other backends check themselves for
+ * deadlock. Then continue waiting until statement_fin_time,
+ * if that's set.
*/
SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK);
deadlock_timeout_active = false;
/*
- * Begin second waiting period to MaxStandbyDelay if required.
+ * Begin second waiting period if required.
*/
if (statement_timeout_active)
{
else
{
/*
- * We've now reached MaxStandbyDelay, so ask all conflicts to leave, cos
- * its time for us to press ahead with applying changes in recovery.
+ * We've now reached statement_fin_time, so ask all conflicts to
+ * leave, so we can press ahead with applying changes in recovery.
*/
SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN);
}
* Written by Peter Eisentraut
.
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/utils/misc/guc.c,v 1.557 2010/06/25 13:11:25 sriggs Exp $
+ * $PostgreSQL: pgsql/src/backend/utils/misc/guc.c,v 1.558 2010/07/03 20:43:58 tgl Exp $
*
*--------------------------------------------------------------------
*/
#include "postmaster/walwriter.h"
#include "replication/walsender.h"
#include "storage/bufmgr.h"
+#include "storage/standby.h"
#include "storage/fd.h"
#include "tcop/tcopprot.h"
#include "tsearch/ts_cache.h"
extern char *temp_tablespaces;
extern bool synchronize_seqscans;
extern bool fullPageWrites;
-extern int vacuum_defer_cleanup_age;
extern int ssl_renegotiation_limit;
#ifdef TRACE_SORT
1000, 1, INT_MAX / 1000, NULL, NULL
},
+ {
+ {"max_standby_archive_delay", PGC_SIGHUP, WAL_STANDBY_SERVERS,
+ gettext_noop("Sets the maximum delay before canceling queries when a hot standby server is processing archived WAL data."),
+ NULL,
+ GUC_UNIT_MS
+ },
+ &max_standby_archive_delay,
+ 30 * 1000, -1, INT_MAX / 1000, NULL, NULL
+ },
+
+ {
+ {"max_standby_streaming_delay", PGC_SIGHUP, WAL_STANDBY_SERVERS,
+ gettext_noop("Sets the maximum delay before canceling queries when a hot standby server is processing streamed WAL data."),
+ NULL,
+ GUC_UNIT_MS
+ },
+ &max_standby_streaming_delay,
+ 30 * 1000, -1, INT_MAX / 1000, NULL, NULL
+ },
+
/*
* Note: MaxBackends is limited to INT_MAX/4 because some places compute
* 4*MaxBackends without any overflow check. This check is made in
100, 1, INT_MAX / 4, assign_maxconnections, NULL
},
- {
- {"max_standby_delay", PGC_SIGHUP, WAL_STANDBY_SERVERS,
- gettext_noop("Sets the maximum delay to avoid conflict processing on hot standby servers."),
- NULL,
- GUC_UNIT_MS
- },
- &MaxStandbyDelay,
- 30 * 1000, -1, INT_MAX / 1000, NULL, NULL
- },
-
{
{"superuser_reserved_connections", PGC_POSTMASTER, CONN_AUTH_SETTINGS,
gettext_noop("Sets the number of connection slots reserved for superusers."),
# - Streaming Replication -
#max_wal_senders = 0 # max number of walsender processes
-#wal_sender_delay = 200ms # 1-10000 milliseconds
+#wal_sender_delay = 200ms # walsender cycle time, 1-10000 milliseconds
#wal_keep_segments = 0 # in logfile segments, 16MB each; 0 disables
# - Standby Servers -
-#hot_standby = off # allows queries during recovery
-#max_standby_delay = 30s # max acceptable lag to allow queries to
- # complete without conflict; -1 means forever
-#vacuum_defer_cleanup_age = 0 # num transactions by which cleanup is deferred
+#hot_standby = off # "on" allows queries during recovery
+#max_standby_archive_delay = 30s # max delay before canceling queries
+ # when reading WAL from archive;
+ # -1 allows indefinite delay
+#max_standby_streaming_delay = 30s # max delay before canceling queries
+ # when reading streaming WAL;
+ # -1 allows indefinite delay
+#vacuum_defer_cleanup_age = 0 # number of transactions by which cleanup is deferred
#------------------------------------------------------------------------------
* Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/include/access/xlog.h,v 1.113 2010/06/17 16:41:25 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/access/xlog.h,v 1.114 2010/07/03 20:43:58 tgl Exp $
*/
#ifndef XLOG_H
#define XLOG_H
extern PGDLLIMPORT TimeLineID ThisTimeLineID; /* current TLI */
/*
- * Prior to 8.4, all activity during recovery was carried out by Startup
+ * Prior to 8.4, all activity during recovery was carried out by the startup
* process. This local variable continues to be used in many parts of the
- * code to indicate actions taken by RecoveryManagers. Other processes who
- * potentially perform work during recovery should check RecoveryInProgress()
- * see XLogCtl notes in xlog.c
+ * code to indicate actions taken by RecoveryManagers. Other processes that
+ * potentially perform work during recovery should check RecoveryInProgress().
+ * See XLogCtl notes in xlog.c.
*/
extern bool InRecovery;
/*
* Like InRecovery, standbyState is only valid in the startup process.
+ * In all other processes it will have the value STANDBY_DISABLED (so
+ * InHotStandby will read as FALSE).
*
* In DISABLED state, we're performing crash recovery or hot standby was
* disabled in recovery.conf.
*
- * In INITIALIZED state, we haven't yet received a RUNNING_XACTS or shutdown
- * checkpoint record to initialize our master transaction tracking system.
+ * In INITIALIZED state, we've run InitRecoveryTransactionEnvironment, but
+ * we haven't yet processed a RUNNING_XACTS or shutdown-checkpoint WAL record
+ * to initialize our master-transaction tracking system.
*
* When the transaction tracking is initialized, we enter the SNAPSHOT_PENDING
* state. The tracked information might still be incomplete, so we can't allow
STANDBY_SNAPSHOT_PENDING,
STANDBY_SNAPSHOT_READY
} HotStandbyState;
+
extern HotStandbyState standbyState;
#define InHotStandby (standbyState >= STANDBY_SNAPSHOT_PENDING)
extern bool XLogArchiveMode;
extern char *XLogArchiveCommand;
extern bool EnableHotStandby;
-extern int MaxStandbyDelay;
extern bool log_checkpoints;
/* WAL levels */
extern bool RecoveryInProgress(void);
extern bool XLogInsertAllowed(void);
-extern TimestampTz GetLatestXLogTime(void);
+extern void GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream);
extern void UpdateControlFile(void);
extern uint64 GetSystemIdentifier(void);
*
* Portions Copyright (c) 2010-2010, PostgreSQL Global Development Group
*
- * $PostgreSQL: pgsql/src/include/replication/walreceiver.h,v 1.9 2010/06/03 22:17:32 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/replication/walreceiver.h,v 1.10 2010/07/03 20:43:58 tgl Exp $
*
*-------------------------------------------------------------------------
*/
typedef struct
{
/*
- * connection string; is used for walreceiver to connect with the primary.
- */
- char conninfo[MAXCONNINFO];
-
- /*
- * PID of currently active walreceiver process, and the current state.
+ * PID of currently active walreceiver process, its current state and
+ * start time (actually, the time at which it was requested to be started).
*/
pid_t pid;
WalRcvState walRcvState;
pg_time_t startTime;
/*
- * receivedUpto-1 is the last byte position that has been already
- * received. When startup process starts the walreceiver, it sets this to
- * the point where it wants the streaming to begin. After that,
- * walreceiver updates this whenever it flushes the received WAL.
+ * receivedUpto-1 is the last byte position that has already been
+ * received. When startup process starts the walreceiver, it sets
+ * receivedUpto to the point where it wants the streaming to begin.
+ * After that, walreceiver updates this whenever it flushes the received
+ * WAL to disk.
*/
XLogRecPtr receivedUpto;
+ /*
+ * latestChunkStart is the starting byte position of the current "batch"
+ * of received WAL. It's actually the same as the previous value of
+ * receivedUpto before the last flush to disk. Startup process can use
+ * this to detect whether it's keeping up or not.
+ */
+ XLogRecPtr latestChunkStart;
+
+ /*
+ * connection string; is used for walreceiver to connect with the primary.
+ */
+ char conninfo[MAXCONNINFO];
+
slock_t mutex; /* locks shared variables shown above */
} WalRcvData;
extern bool WalRcvInProgress(void);
extern XLogRecPtr WaitNextXLogAvailable(XLogRecPtr recptr, bool *finished);
extern void RequestXLogStreaming(XLogRecPtr recptr, const char *conninfo);
-extern XLogRecPtr GetWalRcvWriteRecPtr(void);
+extern XLogRecPtr GetWalRcvWriteRecPtr(XLogRecPtr *latestChunkStart);
#endif /* _WALRECEIVER_H */
* Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/include/storage/standby.h,v 1.10 2010/05/13 11:15:38 sriggs Exp $
+ * $PostgreSQL: pgsql/src/include/storage/standby.h,v 1.11 2010/07/03 20:43:58 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "storage/procsignal.h"
#include "storage/relfilenode.h"
+/* User-settable GUC parameters */
extern int vacuum_defer_cleanup_age;
+extern int max_standby_archive_delay;
+extern int max_standby_streaming_delay;
extern void InitRecoveryTransactionEnvironment(void);
extern void ShutdownRecoveryTransactionEnvironment(void);
/*
* Declarations for GetRunningTransactionData(). Similar to Snapshots, but
* not quite. This has nothing at all to do with visibility on this server,
- * so this is completely separate from snapmgr.c and snapmgr.h
+ * so this is completely separate from snapmgr.c and snapmgr.h.
* This data is important for creating the initial snapshot state on a
* standby server. We need lots more information than a normal snapshot,
* hence we use a specific data structure for our needs. This data