parameter is greater than zero, the server will switch to a new
segment file whenever this many seconds have elapsed since the last
segment file switch, and there has been any database activity,
- including a single checkpoint. (Increasing
- checkpoint_timeout> will reduce unnecessary
- checkpoints on an idle system.)
- Note that archived files that are closed early
- due to a forced switch are still the same length as completely full
- files. Therefore, it is unwise to use a very short
+ including a single checkpoint (checkpoints are skipped if there is
+ no database activity). Note that archived files that are closed
+ early due to a forced switch are still the same length as completely
+ full files. Therefore, it is unwise to use a very short
<varname>archive_timeout</varname> — it will bloat your archive
storage. <varname>archive_timeout</varname> settings of a minute or so are
usually reasonable. You should consider using streaming replication,
heaptup->t_len - SizeofHeapTupleHeader);
/* filtering by origin on a row level is much more efficient */
- XLogIncludeOrigin();
+ XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
recptr = XLogInsert(RM_HEAP_ID, info);
XLogRegisterBufData(0, tupledata, totaldatalen);
/* filtering by origin on a row level is much more efficient */
- XLogIncludeOrigin();
+ XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
recptr = XLogInsert(RM_HEAP2_ID, info);
}
/* filtering by origin on a row level is much more efficient */
- XLogIncludeOrigin();
+ XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE);
XLogBeginInsert();
/* We want the same filtering on this as on a plain insert */
- XLogIncludeOrigin();
+ XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
XLogRegisterData((char *) &xlrec, SizeOfHeapConfirm);
XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
}
/* filtering by origin on a row level is much more efficient */
- XLogIncludeOrigin();
+ XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
recptr = XLogInsert(RM_HEAP_ID, info);
XLogRegisterData((char *) (&xl_origin), sizeof(xl_xact_origin));
/* we allow filtering by xacts */
- XLogIncludeOrigin();
+ XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
return XLogInsert(RM_XACT_ID, info);
}
* the WAL record is just copied to the page and the lock is released. But
* to avoid the deadlock-scenario explained above, the indicator is always
* updated before sleeping while holding an insertion lock.
+ *
+ * lastImportantAt contains the start LSN of the last important WAL record
+ * inserted using a given lock. This value is used to detect whether there
+ * has been important WAL activity since the last time some action, like a
+ * checkpoint, was performed, so that the action can be skipped when there
+ * has been none. The LSN is updated for all insertions, unless the
+ * XLOG_MARK_UNIMPORTANT flag was set. lastImportantAt is never cleared, only
+ * overwritten by the LSN of newer records. Tracking the WAL activity
+ * directly in WALInsertLock has the advantage of not needing any additional
+ * locks to update the value.
*/
typedef struct
{
LWLock lock; /* the insertion lock itself; also held while reading lastImportantAt */
XLogRecPtr insertingAt; /* NOTE(review): presumably the progress indicator described above - confirm */
+ XLogRecPtr lastImportantAt; /* LSN of last important record inserted under this lock */
} WALInsertLock;
/*
XLogRecPtr unloggedLSN;
slock_t ulsn_lck;
- /* Time of last xlog segment switch. Protected by WALWriteLock. */
+ /* Time and LSN of last xlog segment switch. Protected by WALWriteLock. */
pg_time_t lastSegSwitchTime;
+ XLogRecPtr lastSegSwitchLSN;
/*
* Protected by info_lck and WALWriteLock (you must hold either lock to
* which pages need a full-page image, and retry. If fpw_lsn is invalid, the
* record is always inserted.
*
+ * 'flags' gives more in-depth control on the record being inserted. See
+ * XLogSetRecordFlags() for details.
+ *
* The first XLogRecData in the chain must be for the record header, and its
* data must be MAXALIGNed. XLogInsertRecord fills in the xl_prev and
* xl_crc fields in the header, the rest of the header must already be filled
* WAL rule "write the log before the data".)
*/
XLogRecPtr
-XLogInsertRecord(XLogRecData *rdata, XLogRecPtr fpw_lsn)
+XLogInsertRecord(XLogRecData *rdata,
+ XLogRecPtr fpw_lsn,
+ uint8 flags)
{
XLogCtlInsert *Insert = &XLogCtl->Insert;
pg_crc32c rdata_crc;
*/
CopyXLogRecordToWAL(rechdr->xl_tot_len, isLogSwitch, rdata,
StartPos, EndPos);
+
+ /*
+ * Unless record is flagged as not important, update LSN of last
+ * important record in the current slot. When holding all locks, just
+ * update the first one.
+ */
+ if ((flags & XLOG_MARK_UNIMPORTANT) == 0)
+ {
+ int lockno = holdingAllLocks ? 0 : MyLockNo;
+
+ WALInsertLocks[lockno].l.lastImportantAt = StartPos;
+ }
}
else
{
XLogArchiveNotifySeg(openLogSegNo);
XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
+ XLogCtl->lastSegSwitchLSN = LogwrtResult.Flush;
/*
* Request a checkpoint if we've consumed too much xlog since
{
LWLockInitialize(&WALInsertLocks[i].l.lock, LWTRANCHE_WAL_INSERT);
WALInsertLocks[i].l.insertingAt = InvalidXLogRecPtr;
+ WALInsertLocks[i].l.lastImportantAt = InvalidXLogRecPtr;
}
/*
*/
InRecovery = false;
- /* start the archive_timeout timer running */
+ /* start the archive_timeout timer running, and initialize the last-switch LSN */
XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
+ XLogCtl->lastSegSwitchLSN = EndOfLog;
/* also initialize latestCompletedXid, to nextXid - 1 */
LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
}
/*
- * Get the time of the last xlog segment switch
+ * GetLastImportantRecPtr -- report the LSN of the newest important record.
+ *
+ * A record counts as important unless it was explicitly marked as
+ * unimportant when it was inserted.
+ *
+ * Every WAL insertion lock remembers the last important record inserted
+ * under it in lastImportantAt; the result is the largest of those per-lock
+ * values. The locks are visited one at a time, so the result is not an
+ * atomic snapshot across all of them.
+ */
+XLogRecPtr
+GetLastImportantRecPtr(void)
+{
+	XLogRecPtr	newest = InvalidXLogRecPtr;
+	int			lockno;
+
+	for (lockno = 0; lockno < NUM_XLOGINSERT_LOCKS; lockno++)
+	{
+		LWLock	   *lock = &WALInsertLocks[lockno].l.lock;
+		XLogRecPtr	candidate;
+
+		/*
+		 * Reading the LSN without the lock could yield a torn value on some
+		 * of the supported platforms. WAL insert locks offer exclusive mode
+		 * only, so that is what we take.
+		 */
+		LWLockAcquire(lock, LW_EXCLUSIVE);
+		candidate = WALInsertLocks[lockno].l.lastImportantAt;
+		LWLockRelease(lock);
+
+		/* keep the running maximum */
+		if (candidate > newest)
+			newest = candidate;
+	}
+
+	return newest;
+}
+
+/*
+ * Get the time and LSN of the last xlog segment switch
*/
pg_time_t
-GetLastSegSwitchTime(void)
+GetLastSegSwitchData(XLogRecPtr *lastSwitchLSN)
{
pg_time_t result;
/* Need WALWriteLock, but shared lock is sufficient */
LWLockAcquire(WALWriteLock, LW_SHARED);
result = XLogCtl->lastSegSwitchTime;
+ *lastSwitchLSN = XLogCtl->lastSegSwitchLSN;
LWLockRelease(WALWriteLock);
return result;
* record will go to the next XLOG file and won't be archived (yet).
*/
if (XLogArchivingActive() && XLogArchiveCommandSet())
- RequestXLogSwitch();
+ RequestXLogSwitch(false);
CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
}
uint32 freespace;
XLogRecPtr PriorRedoPtr;
XLogRecPtr curInsert;
- XLogRecPtr prevPtr;
+ XLogRecPtr last_important_lsn;
VirtualTransactionId *vxids;
int nvxids;
else
checkPoint.oldestActiveXid = InvalidTransactionId;
+ /*
+ * Get location of last important record before acquiring insert locks (as
+ * GetLastImportantRecPtr() also locks WAL locks).
+ */
+ last_important_lsn = GetLastImportantRecPtr();
+
/*
* We must block concurrent insertions while examining insert state to
* determine the checkpoint REDO pointer.
*/
WALInsertLockAcquireExclusive();
curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
- prevPtr = XLogBytePosToRecPtr(Insert->PrevBytePos);
/*
- * If this isn't a shutdown or forced checkpoint, and we have not inserted
- * any XLOG records since the start of the last checkpoint, skip the
- * checkpoint. The idea here is to avoid inserting duplicate checkpoints
- * when the system is idle. That wastes log space, and more importantly it
- * exposes us to possible loss of both current and previous checkpoint
- * records if the machine crashes just as we're writing the update.
- * (Perhaps it'd make even more sense to checkpoint only when the previous
- * checkpoint record is in a different xlog page?)
- *
- * If the previous checkpoint crossed a WAL segment, however, we create
- * the checkpoint anyway, to have the latest checkpoint fully contained in
- * the new segment. This is for a little bit of extra robustness: it's
- * better if you don't need to keep two WAL segments around to recover the
- * checkpoint.
+ * If this isn't a shutdown or forced checkpoint, and if there has been no
+ * WAL activity requiring a checkpoint, skip it. The idea here is to
+ * avoid inserting duplicate checkpoints when the system is idle.
*/
if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
CHECKPOINT_FORCE)) == 0)
{
- if (prevPtr == ControlFile->checkPointCopy.redo &&
- prevPtr / XLOG_SEG_SIZE == curInsert / XLOG_SEG_SIZE)
+ if (last_important_lsn == ControlFile->checkPoint)
{
WALInsertLockRelease();
LWLockRelease(CheckpointLock);
END_CRIT_SECTION();
+ ereport(DEBUG1,
+ (errmsg("checkpoint skipped due to an idle system")));
return;
}
}
* write a switch record because we are already at segment start.
*/
XLogRecPtr
-RequestXLogSwitch(void)
+RequestXLogSwitch(bool mark_unimportant)
{
XLogRecPtr RecPtr;
/* XLOG SWITCH has no data */
XLogBeginInsert();
+
+ if (mark_unimportant)
+ XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH);
return RecPtr;
* recovery case described above.
*/
if (!backup_started_in_recovery)
- RequestXLogSwitch();
+ RequestXLogSwitch(false);
do
{
* Force a switch to a new xlog segment file, so that the backup is valid
* as soon as archiver moves out the current segment file.
*/
- RequestXLogSwitch();
+ RequestXLogSwitch(false);
XLByteToPrevSeg(stoppoint, _logSegNo);
XLogFileName(stopxlogfilename, ThisTimeLineID, _logSegNo);
errmsg("recovery is in progress"),
errhint("WAL control functions cannot be executed during recovery.")));
- switchpoint = RequestXLogSwitch();
+ switchpoint = RequestXLogSwitch(false);
/*
* As a convenience, return the WAL location of the switch record
static XLogRecData *mainrdata_last = (XLogRecData *) &mainrdata_head;
static uint32 mainrdata_len; /* total # of bytes in chain */
-/* Should the in-progress insertion log the origin? */
-static bool include_origin = false;
+/* flags for the in-progress insertion */
+static uint8 curinsert_flags = 0;
/*
* These are used to hold the record header while constructing a record.
max_registered_block_id = 0;
mainrdata_len = 0;
mainrdata_last = (XLogRecData *) &mainrdata_head;
- include_origin = false;
+ curinsert_flags = 0;
begininsert_called = false;
}
}
/*
- * Should this record include the replication origin if one is set up?
+ * Set insert status flags for the upcoming WAL record.
+ *
+ * The given bits are ORed into the flags already set for the in-progress
+ * insertion, so this may be called more than once for the same record
+ * without a later call discarding a flag set by an earlier one. The
+ * accumulated flags are zeroed again when the insertion state is reset.
+ *
+ * The flags that can be used here are:
+ * - XLOG_INCLUDE_ORIGIN, to determine if the replication origin should be
+ * included in the record.
+ * - XLOG_MARK_UNIMPORTANT, to signal that the record is not important for
+ * durability, so that it does not by itself trigger WAL archiving and
+ * other background activity.
*/
void
-XLogIncludeOrigin(void)
+XLogSetRecordFlags(uint8 flags)
{
Assert(begininsert_called);
- include_origin = true;
+ curinsert_flags |= flags;
}
/*
rdt = XLogRecordAssemble(rmid, info, RedoRecPtr, doPageWrites,
&fpw_lsn);
- EndPos = XLogInsertRecord(rdt, fpw_lsn);
+ EndPos = XLogInsertRecord(rdt, fpw_lsn, curinsert_flags);
} while (EndPos == InvalidXLogRecPtr);
XLogResetInsertion();
}
/* followed by the record's origin, if any */
- if (include_origin && replorigin_session_origin != InvalidRepOriginId)
+ if ((curinsert_flags & XLOG_INCLUDE_ORIGIN) &&
+ replorigin_session_origin != InvalidRepOriginId)
{
*(scratch++) = XLR_BLOCK_ID_ORIGIN;
memcpy(scratch, &replorigin_session_origin, sizeof(replorigin_session_origin));
* check whether there has been any WAL inserted since the last time
* we've logged a running xacts.
*
- * We do this logging in the bgwriter as its the only process that is
+ * We do this logging in the bgwriter as it is the only process that is
* run regularly and returns to its mainloop all the time. E.g.
* Checkpointer, when active, is barely ever in its mainloop and thus
* makes it hard to log regularly.
LOG_SNAPSHOT_INTERVAL_MS);
/*
- * only log if enough time has passed and some xlog record has
- * been inserted.
+ * Only log if enough time has passed and interesting records have
+ * been inserted since the last snapshot. The comparison has to be <=
+ * rather than <: GetLastImportantRecPtr() reports the start of the
+ * last important record, whereas last_snapshot_lsn is the position
+ * returned when the snapshot record was inserted, i.e. just past its
+ * end. An important record inserted directly after the snapshot
+ * therefore starts exactly at last_snapshot_lsn and must not be
+ * missed.
*/
if (now >= timeout &&
- last_snapshot_lsn != GetXLogInsertRecPtr())
+ last_snapshot_lsn <= GetLastImportantRecPtr())
{
last_snapshot_lsn = LogStandbySnapshot();
last_snapshot_ts = now;
/*
* CheckArchiveTimeout -- check for archive_timeout and switch xlog files
*
- * This will switch to a new WAL file and force an archive file write
- * if any activity is recorded in the current WAL file, including just
- * a single checkpoint record.
+ * This will switch to a new WAL file and force an archive file write if
+ * meaningful activity is recorded in the current WAL file. This includes most
+ * writes, including just a single checkpoint record, but excludes WAL records
+ * that were inserted with the XLOG_MARK_UNIMPORTANT flag being set (like
+ * snapshots of running transactions). Such records, depending on
+ * configuration, occur on regular intervals and don't contain important
+ * information. This avoids generating archives with a few unimportant
+ * records.
*/
static void
CheckArchiveTimeout(void)
{
pg_time_t now;
pg_time_t last_time;
+ XLogRecPtr last_switch_lsn;
if (XLogArchiveTimeout <= 0 || RecoveryInProgress())
return;
* Update local state ... note that last_xlog_switch_time is the last time
* a switch was performed *or requested*.
*/
- last_time = GetLastSegSwitchTime();
+ last_time = GetLastSegSwitchData(&last_switch_lsn);
last_xlog_switch_time = Max(last_xlog_switch_time, last_time);
- /* Now we can do the real check */
+ /* Now we can do the real checks */
if ((int) (now - last_xlog_switch_time) >= XLogArchiveTimeout)
{
- XLogRecPtr switchpoint;
-
- /* OK, it's time to switch */
- switchpoint = RequestXLogSwitch();
-
/*
- * If the returned pointer points exactly to a segment boundary,
- * assume nothing happened.
+ * Switch segment only when "important" WAL has been logged since the
+ * last segment switch.
*/
- if ((switchpoint % XLogSegSize) != 0)
- ereport(DEBUG1,
- (errmsg("transaction log switch forced (archive_timeout=%d)",
- XLogArchiveTimeout)));
+ if (GetLastImportantRecPtr() > last_switch_lsn)
+ {
+ XLogRecPtr switchpoint;
+
+ /* mark switch as unimportant, avoids triggering checkpoints */
+ switchpoint = RequestXLogSwitch(true);
+
+ /*
+ * If the returned pointer points exactly to a segment boundary,
+ * assume nothing happened.
+ */
+ if ((switchpoint % XLogSegSize) != 0)
+ ereport(DEBUG1,
+ (errmsg("transaction log switch forced (archive_timeout=%d)",
+ XLogArchiveTimeout)));
+ }
/*
* Update state in any case, so we don't retry constantly when the
XLogRegisterData((char *) message, size);
/* allow origin filtering */
- XLogIncludeOrigin();
+ XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
return XLogInsert(RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE);
}
/*
* Record an enhanced snapshot of running transactions into WAL.
*
- * The definitions of RunningTransactionsData and xl_xact_running_xacts
- * are similar. We keep them separate because xl_xact_running_xacts
- * is a contiguous chunk of memory and never exists fully until it is
- * assembled in WAL.
+ * The definitions of RunningTransactionsData and xl_xact_running_xacts are
+ * similar. We keep them separate because xl_xact_running_xacts is a
+ * contiguous chunk of memory and never exists fully until it is assembled in
+ * WAL. The inserted records are marked as not being important for durability,
+ * to avoid triggering superfluous checkpoint / archiving activity.
*/
static XLogRecPtr
LogCurrentRunningXacts(RunningTransactions CurrRunningXacts)
/* Header */
XLogBeginInsert();
+ XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
XLogRegisterData((char *) (&xlrec), MinSizeOfXactRunningXacts);
/* array of TransactionIds */
XLogBeginInsert();
XLogRegisterData((char *) &xlrec, offsetof(xl_standby_locks, locks));
XLogRegisterData((char *) locks, nlocks * sizeof(xl_standby_lock));
+ XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
(void) XLogInsert(RM_STANDBY_ID, XLOG_STANDBY_LOCK);
}
#define CHECKPOINT_CAUSE_XLOG 0x0040 /* XLOG consumption */
#define CHECKPOINT_CAUSE_TIME 0x0080 /* Elapsed time */
+/*
+ * Flag bits for the record being inserted, set using XLogSetRecordFlags().
+ */
+#define XLOG_INCLUDE_ORIGIN 0x01 /* include the replication origin */
+#define XLOG_MARK_UNIMPORTANT 0x02 /* record not important for durability */
+
+
/* Checkpoint statistics */
typedef struct CheckpointStatsData
{
struct XLogRecData;
-extern XLogRecPtr XLogInsertRecord(struct XLogRecData *rdata, XLogRecPtr fpw_lsn);
+extern XLogRecPtr XLogInsertRecord(struct XLogRecData *rdata,
+ XLogRecPtr fpw_lsn,
+ uint8 flags);
extern void XLogFlush(XLogRecPtr RecPtr);
extern bool XLogBackgroundFlush(void);
extern bool XLogNeedsFlush(XLogRecPtr RecPtr);
extern XLogRecPtr GetRedoRecPtr(void);
extern XLogRecPtr GetInsertRecPtr(void);
extern XLogRecPtr GetFlushRecPtr(void);
+extern XLogRecPtr GetLastImportantRecPtr(void);
extern void GetNextXidAndEpoch(TransactionId *xid, uint32 *epoch);
extern void RemovePromoteSignalFiles(void);
/*
* Exported to support xlog switching from checkpointer
+ *
+ * GetLastSegSwitchData returns the time of the last segment switch and
+ * stores the matching LSN in *lastSwitchLSN. RequestXLogSwitch inserts an
+ * XLOG_SWITCH record, flagged XLOG_MARK_UNIMPORTANT when mark_unimportant
+ * is true.
*/
-extern pg_time_t GetLastSegSwitchTime(void);
-extern XLogRecPtr RequestXLogSwitch(void);
+extern pg_time_t GetLastSegSwitchData(XLogRecPtr *lastSwitchLSN);
+extern XLogRecPtr RequestXLogSwitch(bool mark_unimportant);
extern void GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli);
/* prototypes for public functions in xloginsert.c: */
extern void XLogBeginInsert(void);
-extern void XLogIncludeOrigin(void);
+extern void XLogSetRecordFlags(uint8 flags);
extern XLogRecPtr XLogInsert(RmgrId rmid, uint8 info);
extern void XLogEnsureRecordSpace(int nbuffers, int ndatas);
extern void XLogRegisterData(char *data, int len);