temp relation: Temporary relations.
+
+ wal: Write Ahead Logs.
+
+
normal.
+
+ init: I/O operations performed while creating the
+ WAL segments are tracked in context
+ init.
+
+
vacuum: I/O operations performed outside of shared
{
errno = 0;
- /* Measure I/O timing to write WAL data */
- if (track_wal_io_timing)
- INSTR_TIME_SET_CURRENT(start);
- else
- INSTR_TIME_SET_ZERO(start);
+ /*
+ * Measure I/O timing to write WAL data, for pg_stat_io and/or
+ * pg_stat_wal.
+ */
+ start = pgstat_prepare_io_time(track_io_timing || track_wal_io_timing);
pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE);
written = pg_pwrite(openLogFile, from, nleft, startoffset);
pgstat_report_wait_end();
+ pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL,
+ IOOP_WRITE, start, 1, written);
+
/*
* Increment the I/O timing and the number of times WAL data
* were written out to disk.
int fd;
int save_errno;
int open_flags = O_RDWR | O_CREAT | O_EXCL | PG_BINARY;
+ instr_time io_start;
Assert(logtli != 0);
(errcode_for_file_access(),
errmsg("could not create file \"%s\": %m", tmppath)));
+ /* Measure I/O timing when initializing segment */
+ io_start = pgstat_prepare_io_time(track_io_timing);
+
pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE);
save_errno = 0;
if (wal_init_zero)
}
pgstat_report_wait_end();
+ /*
+ * A full segment worth of data is written when using wal_init_zero. One
+ * byte is written when not using it.
+ */
+ pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_INIT, IOOP_WRITE,
+ io_start, 1,
+ wal_init_zero ? wal_segment_size : 1);
+
if (save_errno)
{
/*
errmsg("could not write to file \"%s\": %m", tmppath)));
}
+ /* Measure I/O timing when flushing segment */
+ io_start = pgstat_prepare_io_time(track_io_timing);
+
pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_SYNC);
if (pg_fsync(fd) != 0)
{
}
pgstat_report_wait_end();
+ pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_INIT,
+ IOOP_FSYNC, io_start, 1, 0);
+
if (close(fd) != 0)
ereport(ERROR,
(errcode_for_file_access(),
wal_sync_method == WAL_SYNC_METHOD_OPEN_DSYNC)
return;
- /* Measure I/O timing to sync the WAL file */
- if (track_wal_io_timing)
- INSTR_TIME_SET_CURRENT(start);
- else
- INSTR_TIME_SET_ZERO(start);
+ /*
+ * Measure I/O timing to sync the WAL file for pg_stat_io and/or
+ * pg_stat_wal.
+ */
+ start = pgstat_prepare_io_time(track_io_timing || track_wal_io_timing);
pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC);
switch (wal_sync_method)
INSTR_TIME_ACCUM_DIFF(PendingWalStats.wal_sync_time, end, start);
}
+ pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_FSYNC,
+ start, 1, 0);
+
PendingWalStats.wal_sync++;
}
#ifndef FRONTEND
#include "pgstat.h"
+#include "storage/bufmgr.h"
#else
#include "common/logging.h"
#endif
char *p;
XLogRecPtr recptr;
Size nbytes;
+#ifndef FRONTEND
+ instr_time io_start;
+#endif
p = buf;
recptr = startptr;
segbytes = nbytes;
#ifndef FRONTEND
+ /* Measure I/O timing when reading segment */
+ io_start = pgstat_prepare_io_time(track_io_timing);
+
pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
#endif
#ifndef FRONTEND
pgstat_report_wait_end();
+
+ pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_READ,
+ io_start, 1, readbytes);
#endif
if (readbytes <= 0)
#include "utils/datetime.h"
#include "utils/fmgrprotos.h"
#include "utils/guc_hooks.h"
+#include "utils/pgstat_internal.h"
#include "utils/pg_lsn.h"
#include "utils/ps_status.h"
#include "utils/pg_rusage.h"
uint32 targetPageOff;
XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY;
int r;
+ instr_time io_start;
XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size);
targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size);
/* Read the requested page */
readOff = targetPageOff;
+ /* Measure I/O timing when reading segment */
+ io_start = pgstat_prepare_io_time(track_io_timing);
+
pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff);
if (r != XLOG_BLCKSZ)
int save_errno = errno;
pgstat_report_wait_end();
+
+ pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_READ,
+ io_start, 1, r);
+
XLogFileName(fname, curFileTLI, readSegNo, wal_segment_size);
if (r < 0)
{
}
pgstat_report_wait_end();
+ pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_READ,
+ io_start, 1, r);
+
Assert(targetSegNo == readSegNo);
Assert(targetPageOff == readOff);
Assert(reqLen <= readLen);
case B_DEAD_END_BACKEND:
case B_ARCHIVER:
case B_LOGGER:
- case B_WAL_RECEIVER:
- case B_WAL_WRITER:
- case B_WAL_SUMMARIZER:
case B_BG_WRITER:
case B_CHECKPOINTER:
case B_STARTUP:
case B_BG_WORKER:
case B_STANDALONE_BACKEND:
case B_SLOTSYNC_WORKER:
+ case B_WAL_RECEIVER:
case B_WAL_SENDER:
+ case B_WAL_SUMMARIZER:
+ case B_WAL_WRITER:
return true;
}
INSTR_TIME_SET_CURRENT(io_time);
INSTR_TIME_SUBTRACT(io_time, start_time);
- if (io_op == IOOP_WRITE || io_op == IOOP_EXTEND)
+ if (io_object != IOOBJECT_WAL)
{
- pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
- if (io_object == IOOBJECT_RELATION)
- INSTR_TIME_ADD(pgBufferUsage.shared_blk_write_time, io_time);
- else if (io_object == IOOBJECT_TEMP_RELATION)
- INSTR_TIME_ADD(pgBufferUsage.local_blk_write_time, io_time);
- }
- else if (io_op == IOOP_READ)
- {
- pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time));
- if (io_object == IOOBJECT_RELATION)
- INSTR_TIME_ADD(pgBufferUsage.shared_blk_read_time, io_time);
- else if (io_object == IOOBJECT_TEMP_RELATION)
- INSTR_TIME_ADD(pgBufferUsage.local_blk_read_time, io_time);
+ if (io_op == IOOP_WRITE || io_op == IOOP_EXTEND)
+ {
+ pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
+ if (io_object == IOOBJECT_RELATION)
+ INSTR_TIME_ADD(pgBufferUsage.shared_blk_write_time, io_time);
+ else if (io_object == IOOBJECT_TEMP_RELATION)
+ INSTR_TIME_ADD(pgBufferUsage.local_blk_write_time, io_time);
+ }
+ else if (io_op == IOOP_READ)
+ {
+ pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time));
+ if (io_object == IOOBJECT_RELATION)
+ INSTR_TIME_ADD(pgBufferUsage.shared_blk_read_time, io_time);
+ else if (io_object == IOOBJECT_TEMP_RELATION)
+ INSTR_TIME_ADD(pgBufferUsage.local_blk_read_time, io_time);
+ }
}
INSTR_TIME_ADD(PendingIOStats.pending_times[io_object][io_context][io_op],
return "bulkread";
case IOCONTEXT_BULKWRITE:
return "bulkwrite";
+ case IOCONTEXT_INIT:
+ return "init";
case IOCONTEXT_NORMAL:
return "normal";
case IOCONTEXT_VACUUM:
return "relation";
case IOOBJECT_TEMP_RELATION:
return "temp relation";
+ case IOOBJECT_WAL:
+ return "wal";
}
elog(ERROR, "unrecognized IOObject value: %d", io_object);
* - Syslogger because it is not connected to shared memory
* - Archiver because most relevant archiving IO is delegated to a
* specialized command or module
-* - WAL Receiver, WAL Writer, and WAL Summarizer IO are not tracked in
-* pg_stat_io for now
*
* Function returns true if BackendType participates in the cumulative stats
* subsystem for IO and false if it does not.
case B_DEAD_END_BACKEND:
case B_ARCHIVER:
case B_LOGGER:
- case B_WAL_RECEIVER:
- case B_WAL_WRITER:
- case B_WAL_SUMMARIZER:
return false;
case B_AUTOVAC_LAUNCHER:
case B_SLOTSYNC_WORKER:
case B_STANDALONE_BACKEND:
case B_STARTUP:
+ case B_WAL_RECEIVER:
case B_WAL_SENDER:
+ case B_WAL_SUMMARIZER:
+ case B_WAL_WRITER:
return true;
}
if (!pgstat_tracks_io_bktype(bktype))
return false;
+ /*
+ * Currently, IO on IOOBJECT_WAL objects can only occur in the
+ * IOCONTEXT_NORMAL and IOCONTEXT_INIT IOContexts.
+ */
+ if (io_object == IOOBJECT_WAL &&
+ (io_context != IOCONTEXT_NORMAL &&
+ io_context != IOCONTEXT_INIT))
+ return false;
+
/*
* Currently, IO on temporary relations can only occur in the
* IOCONTEXT_NORMAL IOContext.
/*
* Some BackendTypes will not do certain IOOps.
*/
- if ((bktype == B_BG_WRITER || bktype == B_CHECKPOINTER) &&
+ if (bktype == B_BG_WRITER &&
(io_op == IOOP_READ || io_op == IOOP_EVICT || io_op == IOOP_HIT))
return false;
+ if (bktype == B_CHECKPOINTER &&
+ ((io_object != IOOBJECT_WAL && io_op == IOOP_READ) ||
+ (io_op == IOOP_EVICT || io_op == IOOP_HIT)))
+ return false;
+
if ((bktype == B_AUTOVAC_LAUNCHER || bktype == B_BG_WRITER ||
bktype == B_CHECKPOINTER) && io_op == IOOP_EXTEND)
return false;
+ /*
+ * Some BackendTypes do not perform reads with IOOBJECT_WAL.
+ */
+ if (io_object == IOOBJECT_WAL && io_op == IOOP_READ &&
+ (bktype == B_WAL_RECEIVER || bktype == B_BG_WRITER ||
+ bktype == B_AUTOVAC_WORKER || bktype == B_AUTOVAC_WORKER ||
+ bktype == B_WAL_WRITER))
+ return false;
+
/*
* Temporary tables are not logged and thus do not require fsync'ing.
* Writeback is not requested for temporary tables.
if (!strategy_io_context && io_op == IOOP_REUSE)
return false;
+ /*
+ * IOOBJECT_WAL IOObject will not do certain IOOps depending on IOContext.
+ */
+ if (io_object == IOOBJECT_WAL && io_context == IOCONTEXT_INIT &&
+ !(io_op == IOOP_WRITE || io_op == IOOP_FSYNC))
+ return false;
+
+ if (io_object == IOOBJECT_WAL && io_context == IOCONTEXT_NORMAL &&
+ !(io_op == IOOP_WRITE || io_op == IOOP_READ || io_op == IOOP_FSYNC))
+ return false;
+
/*
* IOOP_FSYNC IOOps done by a backend using a BufferAccessStrategy are
* counted in the IOCONTEXT_NORMAL IOContext. See comment in
* ------------------------------------------------------------
*/
-#define PGSTAT_FILE_FORMAT_ID 0x01A5BCB2
+#define PGSTAT_FILE_FORMAT_ID 0x01A5BCB3
typedef struct PgStat_ArchiverStats
{
{
IOOBJECT_RELATION,
IOOBJECT_TEMP_RELATION,
+ IOOBJECT_WAL,
} IOObject;
-#define IOOBJECT_NUM_TYPES (IOOBJECT_TEMP_RELATION + 1)
+#define IOOBJECT_NUM_TYPES (IOOBJECT_WAL + 1)
typedef enum IOContext
{
IOCONTEXT_BULKREAD,
IOCONTEXT_BULKWRITE,
+ IOCONTEXT_INIT,
IOCONTEXT_NORMAL,
IOCONTEXT_VACUUM,
} IOContext;
t
(1 row)
+-- Test pg_stat_io for WAL in an init context, that should do writes
+-- and syncs.
+SELECT sum(writes) AS writes, sum(fsyncs) AS fsyncs
+ FROM pg_stat_io
+ WHERE context = 'init' AND object = 'wal' \gset io_sum_wal_init_
+SELECT :io_sum_wal_init_writes > 0;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT current_setting('fsync') = 'off'
+ OR :io_sum_wal_init_fsyncs > 0;
+ ?column?
+----------
+ t
+(1 row)
+
+-- Test pg_stat_io for WAL in a normal context, that should do reads as well.
+SELECT SUM(reads) > 0
+ FROM pg_stat_io
+ WHERE context = 'normal' AND object = 'wal';
+ ?column?
+----------
+ t
+(1 row)
+
-----
-- Test that resetting stats works for reset timestamp
-----
-- - extends of relations using shared buffers
-- - fsyncs done to ensure the durability of data dirtying shared buffers
-- - shared buffer hits
+-- - WAL writes and fsyncs in IOContext IOCONTEXT_NORMAL
-- There is no test for blocks evicted from shared buffers, because we cannot
-- be sure of the state of shared buffers at the point the test is run.
-- Create a regular table and insert some data to generate IOCONTEXT_NORMAL
SELECT sum(writes) AS writes, sum(fsyncs) AS fsyncs
FROM pg_stat_get_backend_io(pg_backend_pid())
WHERE object = 'relation' \gset my_io_sum_shared_before_
+SELECT sum(writes) AS writes, sum(fsyncs) AS fsyncs
+ FROM pg_stat_io
+ WHERE context = 'normal' AND object = 'wal' \gset io_sum_wal_normal_before_
CREATE TABLE test_io_shared(a int);
INSERT INTO test_io_shared SELECT i FROM generate_series(1,100)i;
SELECT pg_stat_force_next_flush();
t
(1 row)
+SELECT sum(writes) AS writes, sum(fsyncs) AS fsyncs
+ FROM pg_stat_io
+ WHERE context = 'normal' AND object = 'wal' \gset io_sum_wal_normal_after_
+SELECT current_setting('synchronous_commit') = 'on';
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT :io_sum_wal_normal_after_writes > :io_sum_wal_normal_before_writes;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT current_setting('fsync') = 'off'
+ OR :io_sum_wal_normal_after_fsyncs > :io_sum_wal_normal_before_fsyncs;
+ ?column?
+----------
+ t
+(1 row)
+
-- Change the tablespace so that the table is rewritten directly, then SELECT
-- from it to cause it to be read back into shared buffers.
SELECT sum(reads) AS io_sum_shared_before_reads
FROM pg_stat_get_backend_idset() beid
WHERE pg_stat_get_backend_pid(beid) = pg_backend_pid();
+-- Test pg_stat_io for WAL in an init context, that should do writes
+-- and syncs.
+SELECT sum(writes) AS writes, sum(fsyncs) AS fsyncs
+ FROM pg_stat_io
+ WHERE context = 'init' AND object = 'wal' \gset io_sum_wal_init_
+SELECT :io_sum_wal_init_writes > 0;
+SELECT current_setting('fsync') = 'off'
+ OR :io_sum_wal_init_fsyncs > 0;
+
+-- Test pg_stat_io for WAL in a normal context, that should do reads as well.
+SELECT SUM(reads) > 0
+ FROM pg_stat_io
+ WHERE context = 'normal' AND object = 'wal';
+
-----
-- Test that resetting stats works for reset timestamp
-----
-- - extends of relations using shared buffers
-- - fsyncs done to ensure the durability of data dirtying shared buffers
-- - shared buffer hits
+-- - WAL writes and fsyncs in IOContext IOCONTEXT_NORMAL
-- There is no test for blocks evicted from shared buffers, because we cannot
-- be sure of the state of shared buffers at the point the test is run.
SELECT sum(writes) AS writes, sum(fsyncs) AS fsyncs
FROM pg_stat_get_backend_io(pg_backend_pid())
WHERE object = 'relation' \gset my_io_sum_shared_before_
+SELECT sum(writes) AS writes, sum(fsyncs) AS fsyncs
+ FROM pg_stat_io
+ WHERE context = 'normal' AND object = 'wal' \gset io_sum_wal_normal_before_
CREATE TABLE test_io_shared(a int);
INSERT INTO test_io_shared SELECT i FROM generate_series(1,100)i;
SELECT pg_stat_force_next_flush();
SELECT :my_io_sum_shared_after_writes >= :my_io_sum_shared_before_writes;
SELECT current_setting('fsync') = 'off'
OR :my_io_sum_shared_after_fsyncs >= :my_io_sum_shared_before_fsyncs;
+SELECT sum(writes) AS writes, sum(fsyncs) AS fsyncs
+ FROM pg_stat_io
+ WHERE context = 'normal' AND object = 'wal' \gset io_sum_wal_normal_after_
+SELECT current_setting('synchronous_commit') = 'on';
+SELECT :io_sum_wal_normal_after_writes > :io_sum_wal_normal_before_writes;
+SELECT current_setting('fsync') = 'off'
+ OR :io_sum_wal_normal_after_fsyncs > :io_sum_wal_normal_before_fsyncs;
-- Change the tablespace so that the table is rewritten directly, then SELECT
-- from it to cause it to be read back into shared buffers.