*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/transam/xact.c,v 1.62 2000/03/17 02:36:05 tgl Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/transam/xact.c,v 1.63 2000/04/09 04:43:16 tgl Exp $
*
* NOTES
* Transaction aborts can now occur two ways:
{
FlushBufferPool();
if (leak)
- ResetBufferPool();
+ ResetBufferPool(true);
/*
* have the transaction access methods record the status
}
if (leak)
- ResetBufferPool();
+ ResetBufferPool(true);
}
if (SharedBufferChanged && !TransactionIdDidCommit(xid))
TransactionIdAbort(xid);
- ResetBufferPool();
+ /*
+ * Tell bufmgr and smgr to release resources.
+ */
+ ResetBufferPool(false); /* false -> is abort */
}
/* --------------------------------
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/catalog/catalog.c,v 1.30 2000/01/26 05:56:10 momjian Exp $
+ * $Header: /cvsroot/pgsql/src/backend/catalog/catalog.c,v 1.31 2000/04/09 04:43:15 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "utils/syscache.h"
/*
- * relpath - path to the relation
- * Perhaps this should be in-line code in relopen().
+ * relpath - construct path to a relation's file
+ *
+ * Note that this only works with relations that are visible to the current
+ * backend, ie, either in the current database or shared system relations.
+ *
+ * Result is a palloc'd string.
*/
char *
relpath(const char *relname)
{
char *path;
- size_t bufsize = 0;
if (IsSharedSystemRelationName(relname))
{
- bufsize = strlen(DataDir) + sizeof(NameData) + 2;
+ /*
+ * Shared system relations live in DataDir. The buffer is sized for
+ * DataDir, one separator, a name of up to sizeof(NameData) bytes and
+ * the terminating NUL; snprintf keeps the write within bufsize.
+ */
+ size_t bufsize = strlen(DataDir) + sizeof(NameData) + 2;
+
path = (char *) palloc(bufsize);
- snprintf(path, bufsize, "%s/%s", DataDir, relname);
+ snprintf(path, bufsize, "%s%c%s", DataDir, SEP_CHAR, relname);
return path;
}
+ /*
+ * If it is in the current database, assume it is in current working
+ * directory, so a copy of the bare relation name serves as the path.
+ * NB: this does not work during bootstrap!
+ */
return pstrdup(relname);
}
+/*
+ * relpath_blind - construct path to a relation's file
+ *
+ * Construct the path using only the info available to smgrblindwrt,
+ * namely the names and OIDs of the database and relation.  (Shared system
+ * relations are identified with dbid = 0.)  Note that we may have to
+ * access a relation belonging to a different database!
+ *
+ * Three cases are distinguished:
+ *	- dbid == 0: the file lives directly under DataDir.
+ *	- dbid == MyDatabaseId: the file lives under DatabasePath.
+ *	- otherwise: the database's path is looked up by name with
+ *	  GetRawDatabaseInfo and expanded with ExpandDatabasePath.  A FATAL
+ *	  error is raised if the OID found does not match dbid or if the
+ *	  path cannot be expanded.
+ *
+ * relid is part of the interface but is currently not used to build
+ * the path.
+ *
+ * Each buffer has room for the directory, one separator, a name of up to
+ * sizeof(NameData) bytes and the terminating NUL.  As in relpath(), the
+ * string is formatted with snprintf so that an unexpectedly long relname
+ * is truncated rather than written past the end of the allocation.
+ *
+ * Result is a palloc'd string.
+ */
+
+char *
+relpath_blind(const char *dbname, const char *relname,
+			  Oid dbid, Oid relid)
+{
+	char	   *path;
+	size_t		bufsize;
+
+	if (dbid == (Oid) 0)
+	{
+		/* Shared system relations live in DataDir */
+		bufsize = strlen(DataDir) + sizeof(NameData) + 2;
+		path = (char *) palloc(bufsize);
+		snprintf(path, bufsize, "%s%c%s", DataDir, SEP_CHAR, relname);
+	}
+	else if (dbid == MyDatabaseId)
+	{
+		/* XXX why is this inconsistent with relpath() ? */
+		bufsize = strlen(DatabasePath) + sizeof(NameData) + 2;
+		path = (char *) palloc(bufsize);
+		snprintf(path, bufsize, "%s%c%s", DatabasePath, SEP_CHAR, relname);
+	}
+	else
+	{
+		/* This is only a workaround: look the other database up by name. */
+		char		dbpathtmp[MAXPGPATH];
+		Oid			id = (Oid) 0;	/* defensive: dbid is nonzero here, so a
+									 * lookup that leaves id untouched is
+									 * reported as a mismatch below */
+		char	   *dbpath;
+
+		GetRawDatabaseInfo(dbname, &id, dbpathtmp);
+
+		if (id != dbid)
+			elog(FATAL, "relpath_blind: oid of db %s is not %u",
+				 dbname, dbid);
+		dbpath = ExpandDatabasePath(dbpathtmp);
+		if (dbpath == NULL)
+			elog(FATAL, "relpath_blind: can't expand path for db %s",
+				 dbname);
+		bufsize = strlen(dbpath) + sizeof(NameData) + 2;
+		path = (char *) palloc(bufsize);
+		snprintf(path, bufsize, "%s%c%s", dbpath, SEP_CHAR, relname);
+		/* the expanded path was only needed to build the result */
+		pfree(dbpath);
+	}
+	return path;
+}
+
+
/*
* IsSystemRelationName
* True iff name is the name of a system catalog relation.
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/storage/buffer/buf_init.c,v 1.32 2000/01/26 05:56:50 momjian Exp $
+ * $Header: /cvsroot/pgsql/src/backend/storage/buffer/buf_init.c,v 1.33 2000/04/09 04:43:18 tgl Exp $
*
*-------------------------------------------------------------------------
*/
extern IpcSemaphoreId WaitIOSemId;
long *PrivateRefCount; /* also used in freelist.c */
-bits8 *BufferLocks; /* */
-long *CommitInfoNeedsSave;/* to write buffers where we have filled
- * in t_infomask */
+bits8 *BufferLocks; /* flag bits showing locks I have set */
+BufferTag *BufferTagLastDirtied; /* tag buffer had when last dirtied by me */
+BufferBlindId *BufferBlindLastDirtied; /* and its BlindId too */
+bool *BufferDirtiedByMe; /* T if buf has been dirtied in cur xact */
+
/*
* Data Structures:
#endif
PrivateRefCount = (long *) calloc(NBuffers, sizeof(long));
BufferLocks = (bits8 *) calloc(NBuffers, sizeof(bits8));
- CommitInfoNeedsSave = (long *) calloc(NBuffers, sizeof(long));
+ BufferTagLastDirtied = (BufferTag *) calloc(NBuffers, sizeof(BufferTag));
+ BufferBlindLastDirtied = (BufferBlindId *) calloc(NBuffers, sizeof(BufferBlindId));
+ BufferDirtiedByMe = (bool *) calloc(NBuffers, sizeof(bool));
}
/* -----------------------------------------------------
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.77 2000/03/31 02:43:31 tgl Exp $
+ * $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.78 2000/04/09 04:43:18 tgl Exp $
*
*-------------------------------------------------------------------------
*/
bool bufferLockHeld);
static BufferDesc *BufferAlloc(Relation reln, BlockNumber blockNum,
bool *foundPtr, bool bufferLockHeld);
+static void SetBufferDirtiedByMe(Buffer buffer, BufferDesc *bufHdr);
+static void ClearBufferDirtiedByMe(Buffer buffer, BufferDesc *bufHdr);
static void BufferSync(void);
-static int BufferReplace(BufferDesc *bufHdr, bool bufferLockHeld);
+static int BufferReplace(BufferDesc *bufHdr);
void PrintBufferDescs(void);
/* ---------------------------------------------------
{
BufferDesc *buf = &BufferDescriptors[buffer - 1];
- if (IsSystemRelationName(buf->sb_relname))
+ if (IsSystemRelationName(buf->blind.relname))
return false;
return true;
}
fprintf(stderr, "PIN(RD) %ld relname = %s, blockNum = %d, \
refcount = %ld, file: %s, line: %d\n",
- buffer, buf->sb_relname, buf->tag.blockNum,
+ buffer, buf->blind.relname, buf->tag.blockNum,
PrivateRefCount[buffer - 1], file, line);
}
return buffer;
* If there's no IO for the buffer and the buffer
* is BROKEN,it should be read again. So start a
* new buffer IO here.
-
- *
- * wierd race condition:
- *
- * We were waiting for someone else to read the buffer. While
- * we were waiting, the reader boof'd in some way, so the
- * contents of the buffer are still invalid. By saying
- * that we didn't find it, we can make the caller
- * reinitialize the buffer. If two processes are waiting
- * for this block, both will read the block. The second
- * one to finish may overwrite any updates made by the
- * first. (Assume higher level synchronization prevents
- * this from happening).
- *
- * This is never going to happen, don't worry about it.
- */
+ *
+ * weird race condition:
+ *
+ * We were waiting for someone else to read the buffer. While
+ * we were waiting, the reader boof'd in some way, so the
+ * contents of the buffer are still invalid. By saying
+ * that we didn't find it, we can make the caller
+ * reinitialize the buffer. If two processes are waiting
+ * for this block, both will read the block. The second
+ * one to finish may overwrite any updates made by the
+ * first. (Assume higher level synchronization prevents
+ * this from happening).
+ *
+ * This is never going to happen, don't worry about it.
+ */
*foundPtr = FALSE;
}
#ifdef BMTRACE
* in WaitIO until we're done.
*/
inProgress = TRUE;
-#ifdef HAS_TEST_AND_SET
/*
* All code paths that acquire this lock pin the buffer first;
* since no one had it pinned (it just came off the free
* list), no one else can have this lock.
*/
-#endif /* HAS_TEST_AND_SET */
StartBufferIO(buf, false);
/*
* Write the buffer out, being careful to release BufMgrLock
* before starting the I/O.
- *
- * This #ifndef is here because a few extra semops REALLY kill
- * you on machines that don't have spinlocks. If you don't
- * operate with much concurrency, well...
*/
- smok = BufferReplace(buf, true);
-#ifndef OPTIMIZE_SINGLE
- SpinAcquire(BufMgrLock);
-#endif /* OPTIMIZE_SINGLE */
+ smok = BufferReplace(buf);
if (smok == FALSE)
{
elog(NOTICE, "BufferAlloc: cannot write block %u for %s/%s",
- buf->tag.blockNum, buf->sb_dbname, buf->sb_relname);
+ buf->tag.blockNum, buf->blind.dbname, buf->blind.relname);
inProgress = FALSE;
buf->flags |= BM_IO_ERROR;
buf->flags &= ~BM_IO_IN_PROGRESS;
if (buf->flags & BM_JUST_DIRTIED)
{
elog(FATAL, "BufferAlloc: content of block %u (%s) changed while flushing",
- buf->tag.blockNum, buf->sb_relname);
+ buf->tag.blockNum, buf->blind.relname);
}
else
buf->flags &= ~BM_DIRTY;
*/
if (buf != NULL)
{
+ buf->flags &= ~BM_IO_IN_PROGRESS;
TerminateBufferIO(buf);
/* give up the buffer since we don't need it any more */
PrivateRefCount[BufferDescriptorGetBuffer(buf) - 1] = 0;
AddBufferToFreelist(buf);
buf->flags |= BM_FREE;
}
- buf->flags &= ~BM_IO_IN_PROGRESS;
}
PinBuffer(buf2);
}
/* record the database name and relation name for this buffer */
- strcpy(buf->sb_relname, RelationGetPhysicalRelationName(reln));
- strcpy(buf->sb_dbname, DatabaseName);
+ strcpy(buf->blind.dbname, DatabaseName);
+ strcpy(buf->blind.relname, RelationGetPhysicalRelationName(reln));
INIT_BUFFERTAG(&(buf->tag), reln, blockNum);
if (!BufTableInsert(buf))
SpinAcquire(BufMgrLock);
Assert(bufHdr->refcount > 0);
bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
+ SetBufferDirtiedByMe(buffer, bufHdr);
UnpinBuffer(bufHdr);
SpinRelease(BufMgrLock);
- CommitInfoNeedsSave[buffer - 1] = 0;
return TRUE;
}
buf = &BufferDescriptors[buffer - 1];
fprintf(stderr, "UNPIN(WR) %ld relname = %s, blockNum = %d, \
refcount = %ld, file: %s, line: %d\n",
- buffer, buf->sb_relname, buf->tag.blockNum,
+ buffer, buf->blind.relname, buf->tag.blockNum,
PrivateRefCount[buffer - 1], file, line);
}
}
*
* 'buffer' is known to be dirty/pinned, so there should not be a
* problem reading the BufferDesc members without the BufMgrLock
- * (nobody should be able to change tags, flags, etc. out from under
- * us). Unpin if 'release' is TRUE.
+ * (nobody should be able to change tags out from under us).
+ *
+ * Unpin if 'release' is TRUE.
*/
int
FlushBuffer(Buffer buffer, bool release)
if (BAD_BUFFER_ID(buffer))
return STATUS_ERROR;
+ Assert(PrivateRefCount[buffer - 1] > 0); /* else caller didn't pin */
+
bufHdr = &BufferDescriptors[buffer - 1];
bufdb = bufHdr->tag.relId.dbId;
if (status == SM_FAIL)
{
elog(ERROR, "FlushBuffer: cannot flush block %u of the relation %s",
- bufHdr->tag.blockNum, bufHdr->sb_relname);
+ bufHdr->tag.blockNum, bufHdr->blind.relname);
return STATUS_ERROR;
}
BufferFlushCount++;
/*
* If this buffer was marked by someone as DIRTY while we were
- * flushing it out we must not clear DIRTY flag - vadim 01/17/97
+ * flushing it out we must not clear shared DIRTY flag - vadim 01/17/97
+ *
+ * ... but we can clear BufferDirtiedByMe anyway - tgl 3/31/00
*/
if (bufHdr->flags & BM_JUST_DIRTIED)
{
elog(NOTICE, "FlushBuffer: content of block %u (%s) changed while flushing",
- bufHdr->tag.blockNum, bufHdr->sb_relname);
+ bufHdr->tag.blockNum, bufHdr->blind.relname);
}
else
bufHdr->flags &= ~BM_DIRTY;
+ ClearBufferDirtiedByMe(buffer, bufHdr);
if (release)
UnpinBuffer(bufHdr);
SpinRelease(BufMgrLock);
- CommitInfoNeedsSave[buffer - 1] = 0;
return STATUS_OK;
}
SharedBufferChanged = true;
SpinAcquire(BufMgrLock);
+ Assert(bufHdr->refcount > 0);
bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
+ SetBufferDirtiedByMe(buffer, bufHdr);
SpinRelease(BufMgrLock);
- CommitInfoNeedsSave[buffer - 1] = 0;
return STATUS_OK;
}
AddBufferToFreelist(bufHdr);
bufHdr->flags |= BM_FREE;
}
- if (CommitInfoNeedsSave[buffer - 1])
- {
- bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
- CommitInfoNeedsSave[buffer - 1] = 0;
- }
retbuf = ReadBufferWithBufferLock(relation, blockNum, true);
return retbuf;
}
return ReadBuffer(relation, blockNum);
}
+/*
+ * SetBufferDirtiedByMe -- mark a shared buffer as being dirtied by this xact
+ *
+ * The per-backend flag set here records that the page must be written and
+ * fsync'd before the current transaction may commit.  Some other backend
+ * might perform the physical write, but the fsync remains our job (else
+ * we could commit before the data actually reaches disk).  No fsync is
+ * issued at write time; instead the storage manager keeps track of which
+ * files need to be fsync'd before commit.  Because the tag and BlindId
+ * the buffer had when we dirtied it are saved alongside the flag, the
+ * storage manager can still be told about the pending fsync after another
+ * backend has written the page and reused the buffer for something else.
+ *
+ * NB: we must be holding the bufmgr lock at entry, and the buffer must be
+ * pinned so that no other backend can take it away from us.  The lock is
+ * released around the storage manager call (unless OPTIMIZE_SINGLE is
+ * defined) and is held again on normal return.
+ */
+static void
+SetBufferDirtiedByMe(Buffer buffer, BufferDesc *bufHdr)
+{
+	BufferTag  *prevTag = &BufferTagLastDirtied[buffer - 1];
+	BufferBlindId *prevBlind = &BufferBlindLastDirtied[buffer - 1];
+	bool	   *dirtiedByMe = &BufferDirtiedByMe[buffer - 1];
+
+	if (*dirtiedByMe)
+	{
+		Relation	reln;
+		int			status;
+
+		/* Flag already set for this very page: nothing more to record. */
+		if (prevTag->relId.dbId == bufHdr->tag.relId.dbId &&
+			prevTag->relId.relId == bufHdr->tag.relId.relId &&
+			prevTag->blockNum == bufHdr->tag.blockNum)
+			return;
+
+		/*
+		 * The buffer now holds a different page, so some other backend
+		 * already wrote out the data we dirtied.  Before the remembered
+		 * tag is overwritten, ask the storage manager to make an fsync
+		 * pending on the old file.
+		 */
+#ifndef OPTIMIZE_SINGLE
+		SpinRelease(BufMgrLock);
+#endif	 /* OPTIMIZE_SINGLE */
+
+		reln = RelationIdCacheGetRelation(prevTag->relId.relId);
+
+		if (reln != (Relation) NULL)
+		{
+			status = smgrmarkdirty(DEFAULT_SMGR, reln, prevTag->blockNum);
+			/* drop relcache refcnt incremented by RelationIdCacheGetRelation */
+			RelationDecrementReferenceCount(reln);
+		}
+		else
+		{
+			/* not in the relcache: identify the file by its saved BlindId */
+			status = smgrblindmarkdirty(DEFAULT_SMGR,
+										prevBlind->dbname,
+										prevBlind->relname,
+										prevTag->relId.dbId,
+										prevTag->relId.relId,
+										prevTag->blockNum);
+		}
+
+		if (status == SM_FAIL)
+			elog(ERROR, "SetBufferDirtiedByMe: cannot mark %u for %s",
+				 prevTag->blockNum, prevBlind->relname);
+
+#ifndef OPTIMIZE_SINGLE
+		SpinAcquire(BufMgrLock);
+#endif	 /* OPTIMIZE_SINGLE */
+	}
+
+	/* Remember what the buffer contains now that we have dirtied it. */
+	*prevTag = bufHdr->tag;
+	*prevBlind = bufHdr->blind;
+	*dirtiedByMe = true;
+}
+
+/*
+ * ClearBufferDirtiedByMe -- mark a shared buffer as no longer needing fsync
+ *
+ * Once we have written a buffer out ourselves, the storage manager sets
+ * its own needs-fsync flag for that file, so our private reminder to do
+ * it later can be dropped.
+ *
+ * NB: we must be holding the bufmgr lock at entry.
+ */
+static void
+ClearBufferDirtiedByMe(Buffer buffer, BufferDesc *bufHdr)
+{
+	BufferTag  *rememberedTag = &BufferTagLastDirtied[buffer - 1];
+
+	/*
+	 * Leave the flag alone when it refers to a different buffertag than
+	 * the data just written.  That is unlikely, but can happen if some
+	 * other backend replaced the buffer contents after we set our flag.
+	 */
+	if (bufHdr->tag.relId.dbId != rememberedTag->relId.dbId ||
+		bufHdr->tag.relId.relId != rememberedTag->relId.relId ||
+		bufHdr->tag.blockNum != rememberedTag->blockNum)
+		return;
+
+	BufferDirtiedByMe[buffer - 1] = false;
+}
+
/*
* BufferSync -- Flush all dirty buffers in the pool.
*
- * This is called at transaction commit time. It does the wrong thing,
- * right now. We should flush only our own changes to stable storage,
- * and we should obey the lock protocol on the buffer manager metadata
- * as we do it. Also, we need to be sure that no other transaction is
+ * This is called at transaction commit time. We find all buffers
+ * that have been dirtied by the current xact and flush them to disk.
+ * We do *not* flush dirty buffers that have been dirtied by other xacts.
+ * (This is a substantial change from pre-7.0 behavior.)
+ *
+ * OLD COMMENTS (do these still apply?)
+ *
+ * Also, we need to be sure that no other transaction is
* modifying the page as we flush it. This is only a problem for objects
* that use a non-two-phase locking protocol, like btree indices. For
* those objects, we would like to set a write lock for the duration of
BufferSync()
{
int i;
- Oid bufdb;
- Oid bufrel;
- Relation reln;
BufferDesc *bufHdr;
int status;
+ Relation reln;
+ bool didwrite;
- SpinAcquire(BufMgrLock);
for (i = 0, bufHdr = BufferDescriptors; i < NBuffers; i++, bufHdr++)
{
+ /* Ignore buffers that were not dirtied by me */
+ if (! BufferDirtiedByMe[i])
+ continue;
+
+ SpinAcquire(BufMgrLock);
+
+ /*
+ * We only need to write if the buffer is still dirty and still
+ * contains the same disk page that it contained when we dirtied it.
+ * Otherwise, someone else has already written our changes for us,
+ * and we need only fsync.
+ *
+ * (NOTE: it's still possible to do an unnecessary write, if other
+ * xacts have written and then re-dirtied the page since our last
+ * change to it. But that should be pretty uncommon, and there's
+ * no easy way to detect it anyway.)
+ */
+ reln = NULL;
+ didwrite = false;
if ((bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
{
+ Oid bufdb;
+ Oid bufrel;
+
bufdb = bufHdr->tag.relId.dbId;
bufrel = bufHdr->tag.relId.relId;
- if (bufdb == MyDatabaseId || bufdb == (Oid) 0)
+ if (bufdb == BufferTagLastDirtied[i].relId.dbId &&
+ bufrel == BufferTagLastDirtied[i].relId.relId &&
+ bufHdr->tag.blockNum == BufferTagLastDirtied[i].blockNum)
{
+ /*
+ * Try to find relation for buf. This could fail, if the
+ * rel has been flushed from the relcache since we dirtied
+ * the page. That should be uncommon, so paying the extra
+ * cost of a blind write when it happens seems OK.
+ */
reln = RelationIdCacheGetRelation(bufrel);
/*
if (bufHdr->flags & BM_IO_ERROR)
{
elog(ERROR, "BufferSync: write error %u for %s",
- bufHdr->tag.blockNum, bufHdr->sb_relname);
+ bufHdr->tag.blockNum, bufHdr->blind.relname);
}
- /* drop refcnt from RelationIdCacheGetRelation */
- if (reln != (Relation) NULL)
- RelationDecrementReferenceCount(reln);
- continue;
- }
-
- /*
- * To check if block content changed while flushing (see
- * below). - vadim 01/17/97
- */
- WaitIO(bufHdr, BufMgrLock); /* confirm end of IO */
- bufHdr->flags &= ~BM_JUST_DIRTIED;
- StartBufferIO(bufHdr, false); /* output IO start */
-
- /*
- * If we didn't have the reldesc in our local cache, flush
- * this page out using the 'blind write' storage manager
- * routine. If we did find it, use the standard
- * interface.
- */
-
-#ifndef OPTIMIZE_SINGLE
- SpinRelease(BufMgrLock);
-#endif /* OPTIMIZE_SINGLE */
- if (reln == (Relation) NULL)
- {
- status = smgrblindwrt(DEFAULT_SMGR, bufHdr->sb_dbname,
- bufHdr->sb_relname, bufdb, bufrel,
- bufHdr->tag.blockNum,
- (char *) MAKE_PTR(bufHdr->data));
}
else
{
- status = smgrwrite(DEFAULT_SMGR, reln,
- bufHdr->tag.blockNum,
- (char *) MAKE_PTR(bufHdr->data));
- }
+ /*
+ * To check if block content changed while flushing (see
+ * below). - vadim 01/17/97
+ */
+ WaitIO(bufHdr, BufMgrLock); /* confirm end of IO */
+ bufHdr->flags &= ~BM_JUST_DIRTIED;
+ StartBufferIO(bufHdr, false); /* output IO start */
+
+ /*
+ * If we didn't have the reldesc in our local cache, write
+ * this page out using the 'blind write' storage manager
+ * routine. If we did find it, use the standard
+ * interface.
+ */
#ifndef OPTIMIZE_SINGLE
- SpinAcquire(BufMgrLock);
+ SpinRelease(BufMgrLock);
+#endif /* OPTIMIZE_SINGLE */
+ if (reln == (Relation) NULL)
+ {
+ status = smgrblindwrt(DEFAULT_SMGR,
+ bufHdr->blind.dbname,
+ bufHdr->blind.relname,
+ bufdb, bufrel,
+ bufHdr->tag.blockNum,
+ (char *) MAKE_PTR(bufHdr->data));
+ }
+ else
+ {
+ status = smgrwrite(DEFAULT_SMGR, reln,
+ bufHdr->tag.blockNum,
+ (char *) MAKE_PTR(bufHdr->data));
+ }
+#ifndef OPTIMIZE_SINGLE
+ SpinAcquire(BufMgrLock);
#endif /* OPTIMIZE_SINGLE */
- UnpinBuffer(bufHdr);
- if (status == SM_FAIL)
- {
- bufHdr->flags |= BM_IO_ERROR;
- elog(ERROR, "BufferSync: cannot write %u for %s",
- bufHdr->tag.blockNum, bufHdr->sb_relname);
+ UnpinBuffer(bufHdr);
+ if (status == SM_FAIL)
+ {
+ bufHdr->flags |= BM_IO_ERROR;
+ elog(ERROR, "BufferSync: cannot write %u for %s",
+ bufHdr->tag.blockNum, bufHdr->blind.relname);
+ }
+ bufHdr->flags &= ~BM_IO_IN_PROGRESS; /* mark IO finished */
+ TerminateBufferIO(bufHdr); /* Sync IO finished */
+ BufferFlushCount++;
+ didwrite = true;
+
+ /*
+ * If this buffer was marked by someone as DIRTY while we
+ * were flushing it out we must not clear DIRTY flag -
+ * vadim 01/17/97
+ *
+ * but it is OK to clear BufferDirtiedByMe - tgl 3/31/00
+ */
+ if (!(bufHdr->flags & BM_JUST_DIRTIED))
+ bufHdr->flags &= ~BM_DIRTY;
}
- bufHdr->flags &= ~BM_IO_IN_PROGRESS; /* mark IO finished */
- TerminateBufferIO(bufHdr); /* Sync IO finished */
- BufferFlushCount++;
- /*
- * If this buffer was marked by someone as DIRTY while we
- * were flushing it out we must not clear DIRTY flag -
- * vadim 01/17/97
- */
- if (!(bufHdr->flags & BM_JUST_DIRTIED))
- bufHdr->flags &= ~BM_DIRTY;
- /* drop refcnt from RelationIdCacheGetRelation */
+ /* drop refcnt obtained by RelationIdCacheGetRelation */
if (reln != (Relation) NULL)
RelationDecrementReferenceCount(reln);
}
}
+
+ /*
+ * If we did not write the buffer (because someone else did),
+ * we must still fsync the file containing it, to ensure that the
+ * write is down to disk before we commit.
+ */
+ if (! didwrite)
+ {
+#ifndef OPTIMIZE_SINGLE
+ SpinRelease(BufMgrLock);
+#endif /* OPTIMIZE_SINGLE */
+
+ reln = RelationIdCacheGetRelation(BufferTagLastDirtied[i].relId.relId);
+ if (reln == (Relation) NULL)
+ {
+ status = smgrblindmarkdirty(DEFAULT_SMGR,
+ BufferBlindLastDirtied[i].dbname,
+ BufferBlindLastDirtied[i].relname,
+ BufferTagLastDirtied[i].relId.dbId,
+ BufferTagLastDirtied[i].relId.relId,
+ BufferTagLastDirtied[i].blockNum);
+ }
+ else
+ {
+ status = smgrmarkdirty(DEFAULT_SMGR, reln,
+ BufferTagLastDirtied[i].blockNum);
+ /* drop relcache refcnt incremented by RelationIdCacheGetRelation */
+ RelationDecrementReferenceCount(reln);
+
+ }
+#ifndef OPTIMIZE_SINGLE
+ SpinAcquire(BufMgrLock);
+#endif /* OPTIMIZE_SINGLE */
+ }
+
+ BufferDirtiedByMe[i] = false;
+
+ SpinRelease(BufMgrLock);
}
- SpinRelease(BufMgrLock);
LocalBufferSync();
}
/* ----------------------------------------------
* ResetBufferPool
*
- * this routine is supposed to be called when a transaction aborts.
+ * This routine is supposed to be called when a transaction aborts.
* it will release all the buffer pins held by the transaction.
+ * Currently, we also call it during commit if BufferPoolCheckLeak
+ * detected a problem --- in that case, isCommit is TRUE, and we
+ * only clean up buffer pin counts.
+ *
+ * During abort, we also forget any pending fsync requests. Dirtied buffers
+ * will still get written, eventually, but there will be no fsync for them.
*
* ----------------------------------------------
*/
void
-ResetBufferPool()
+ResetBufferPool(bool isCommit)
{
int i;
SpinRelease(BufMgrLock);
}
PrivateRefCount[i] = 0;
- CommitInfoNeedsSave[i] = 0;
+
+ if (! isCommit)
+ BufferDirtiedByMe[i] = false;
}
ResetLocalBufferPool();
+
+ if (! isCommit)
+ smgrabort();
}
/* -----------------------------------------------
"Buffer Leak: [%03d] (freeNext=%ld, freePrev=%ld, \
relname=%s, blockNum=%d, flags=0x%x, refcount=%d %ld)",
i - 1, buf->freeNext, buf->freePrev,
- buf->sb_relname, buf->tag.blockNum, buf->flags,
+ buf->blind.relname, buf->tag.blockNum, buf->flags,
buf->refcount, PrivateRefCount[i - 1]);
result = 1;
}
/*
* BufferReplace
*
- * Flush the buffer corresponding to 'bufHdr'
+ * Write out the buffer corresponding to 'bufHdr'
*
+ * This routine used to flush the data to disk (ie, force immediate fsync)
+ * but that's no longer necessary because BufferSync is smarter than before.
+ *
+ * BufMgrLock must be held at entry, and the buffer must be pinned.
*/
static int
-BufferReplace(BufferDesc *bufHdr, bool bufferLockHeld)
+BufferReplace(BufferDesc *bufHdr)
{
Relation reln;
Oid bufdb,
bufrel;
int status;
- if (!bufferLockHeld)
- SpinAcquire(BufMgrLock);
-
/*
* first try to find the reldesc in the cache, if no luck, don't
* bother to build the reldesc from scratch, just do a blind write.
*/
-
bufdb = bufHdr->tag.relId.dbId;
bufrel = bufHdr->tag.relId.relId;
/* To check if block content changed while flushing. - vadim 01/17/97 */
bufHdr->flags &= ~BM_JUST_DIRTIED;
+#ifndef OPTIMIZE_SINGLE
SpinRelease(BufMgrLock);
+#endif /* OPTIMIZE_SINGLE */
if (reln != (Relation) NULL)
{
- status = smgrflush(DEFAULT_SMGR, reln, bufHdr->tag.blockNum,
+ status = smgrwrite(DEFAULT_SMGR, reln, bufHdr->tag.blockNum,
(char *) MAKE_PTR(bufHdr->data));
}
else
{
- /* blind write always flushes */
- status = smgrblindwrt(DEFAULT_SMGR, bufHdr->sb_dbname,
- bufHdr->sb_relname, bufdb, bufrel,
+ status = smgrblindwrt(DEFAULT_SMGR, bufHdr->blind.dbname,
+ bufHdr->blind.relname, bufdb, bufrel,
bufHdr->tag.blockNum,
(char *) MAKE_PTR(bufHdr->data));
}
+#ifndef OPTIMIZE_SINGLE
+ SpinAcquire(BufMgrLock);
+#endif /* OPTIMIZE_SINGLE */
+
/* drop relcache refcnt incremented by RelationIdCacheGetRelation */
if (reln != (Relation) NULL)
RelationDecrementReferenceCount(reln);
if (status == SM_FAIL)
return FALSE;
+ /* If we had marked this buffer as needing to be fsync'd, we can forget
+ * about that, because it's now the storage manager's responsibility.
+ */
+ ClearBufferDirtiedByMe(BufferDescriptorGetBuffer(bufHdr), bufHdr);
+
BufferFlushCount++;
return TRUE;
}
/* Now we can do what we came for */
buf->flags &= ~ ( BM_DIRTY | BM_JUST_DIRTIED);
- CommitInfoNeedsSave[i - 1] = 0;
+ ClearBufferDirtiedByMe(i, buf);
/*
* Release any refcount we may have.
*
}
/* Now we can do what we came for */
buf->flags &= ~ ( BM_DIRTY | BM_JUST_DIRTIED);
+ ClearBufferDirtiedByMe(i, buf);
/*
* The thing should be free, if caller has checked that
* no backends are running in that database.
elog(DEBUG, "[%02d] (freeNext=%ld, freePrev=%ld, relname=%s, \
blockNum=%d, flags=0x%x, refcount=%d %ld)",
i, buf->freeNext, buf->freePrev,
- buf->sb_relname, buf->tag.blockNum, buf->flags,
+ buf->blind.relname, buf->tag.blockNum, buf->flags,
buf->refcount, PrivateRefCount[i]);
}
SpinRelease(BufMgrLock);
for (i = 0; i < NBuffers; ++i, ++buf)
{
printf("[%-2d] (%s, %d) flags=0x%x, refcnt=%d %ld)\n",
- i, buf->sb_relname, buf->tag.blockNum,
+ i, buf->blind.relname, buf->tag.blockNum,
buf->flags, buf->refcount, PrivateRefCount[i]);
}
}
if (PrivateRefCount[i] > 0)
elog(NOTICE, "[%02d] (freeNext=%ld, freePrev=%ld, relname=%s, \
blockNum=%d, flags=0x%x, refcount=%d %ld)\n",
- i, buf->freeNext, buf->freePrev, buf->sb_relname,
+ i, buf->freeNext, buf->freePrev, buf->blind.relname,
buf->tag.blockNum, buf->flags,
buf->refcount, PrivateRefCount[i]);
}
* FlushRelationBuffers
*
* This function removes from the buffer pool all pages of a relation
- * that have blocknumber >= specified block. If doFlush is true,
- * dirty buffers are written out --- otherwise it's an error for any
- * of the buffers to be dirty.
+ * that have blocknumber >= specified block. Pages that are dirty are
+ * written out first. If expectDirty is false, a notice is emitted
+ * warning of dirty buffers, but we proceed anyway. An error code is
+ * returned if we fail to dump a dirty buffer or if we find one of
+ * the target pages is pinned into the cache.
*
* This is used by VACUUM before truncating the relation to the given
- * number of blocks. For VACUUM, we pass doFlush = false since it would
- * mean a bug in VACUUM if any of the unwanted pages were still dirty.
- * (TRUNCATE TABLE also uses it in the same way.)
+ * number of blocks. For VACUUM, we pass expectDirty = false since it
+ * could mean a bug in VACUUM if any of the unwanted pages were still
+ * dirty. (TRUNCATE TABLE also uses it in the same way.)
*
- * This is also used by RENAME TABLE (with block = 0 and doFlush = true)
+ * This is also used by RENAME TABLE (with block=0 and expectDirty=true)
* to clear out the buffer cache before renaming the physical files of
* a relation. Without that, some other backend might try to do a
- * blind write of a buffer page (relying on the sb_relname of the buffer)
+ * blind write of a buffer page (relying on the BlindId of the buffer)
* and fail because it's not got the right filename anymore.
*
* In both cases, the caller should be holding AccessExclusiveLock on
* the target relation to ensure that no other backend is busy reading
- * more blocks of the relation...
+ * more blocks of the relation.
+ *
+ * Formerly, we considered it an error condition if we found unexpectedly
+ * dirty buffers. However, since BufferSync no longer forces out all
+ * dirty buffers at every xact commit, it's possible for dirty buffers
+ * to still be present in the cache due to failure of an earlier
+ * transaction. So, downgrade the error to a mere notice. Maybe we
+ * shouldn't even emit a notice...
*
- * Returns: 0 - Ok, -1 - DIRTY, -2 - PINNED
+ * Returns: 0 - Ok, -1 - FAILED TO WRITE DIRTY BUFFER, -2 - PINNED
*
* XXX currently it sequentially searches the buffer pool, should be
* changed to more clever ways of searching.
* --------------------------------------------------------------------
*/
int
-FlushRelationBuffers(Relation rel, BlockNumber block, bool doFlush)
+FlushRelationBuffers(Relation rel, BlockNumber block, bool expectDirty)
{
int i;
BufferDesc *buf;
{
if (buf->flags & BM_DIRTY)
{
- if (doFlush)
- {
- if (FlushBuffer(-i-1, false) != STATUS_OK)
- {
- elog(NOTICE, "FlushRelationBuffers(%s (local), %u): block %u is dirty, could not flush it",
- RelationGetRelationName(rel),
- block, buf->tag.blockNum);
- return -1;
- }
- }
- else
- {
+ if (! expectDirty)
elog(NOTICE, "FlushRelationBuffers(%s (local), %u): block %u is dirty",
RelationGetRelationName(rel),
block, buf->tag.blockNum);
+ if (FlushBuffer(-i-1, false) != STATUS_OK)
+ {
+ elog(NOTICE, "FlushRelationBuffers(%s (local), %u): block %u is dirty, could not flush it",
+ RelationGetRelationName(rel),
+ block, buf->tag.blockNum);
return -1;
}
}
SpinAcquire(BufMgrLock);
for (i = 0; i < NBuffers; i++)
{
+ recheck:
buf = &BufferDescriptors[i];
- if (buf->tag.relId.dbId == MyDatabaseId &&
- buf->tag.relId.relId == RelationGetRelid(rel) &&
+ if (buf->tag.relId.relId == RelationGetRelid(rel) &&
+ (buf->tag.relId.dbId == MyDatabaseId ||
+ buf->tag.relId.dbId == (Oid) NULL) &&
buf->tag.blockNum >= block)
{
if (buf->flags & BM_DIRTY)
{
- if (doFlush)
- {
- SpinRelease(BufMgrLock);
- if (FlushBuffer(i+1, false) != STATUS_OK)
- {
- elog(NOTICE, "FlushRelationBuffers(%s, %u): block %u is dirty (private %ld, global %d), could not flush it",
- buf->sb_relname, block, buf->tag.blockNum,
- PrivateRefCount[i], buf->refcount);
- return -1;
- }
- SpinAcquire(BufMgrLock);
- }
- else
- {
- SpinRelease(BufMgrLock);
+ PinBuffer(buf);
+ SpinRelease(BufMgrLock);
+ if (! expectDirty)
elog(NOTICE, "FlushRelationBuffers(%s, %u): block %u is dirty (private %ld, global %d)",
- buf->sb_relname, block, buf->tag.blockNum,
+ RelationGetRelationName(rel), block,
+ buf->tag.blockNum,
+ PrivateRefCount[i], buf->refcount);
+ if (FlushBuffer(i+1, true) != STATUS_OK)
+ {
+ elog(NOTICE, "FlushRelationBuffers(%s, %u): block %u is dirty (private %ld, global %d), could not flush it",
+ RelationGetRelationName(rel), block,
+ buf->tag.blockNum,
PrivateRefCount[i], buf->refcount);
return -1;
}
+ SpinAcquire(BufMgrLock);
+ /* Buffer could already be reassigned, so must recheck
+ * whether it still belongs to rel before freeing it!
+ */
+ goto recheck;
}
if (!(buf->flags & BM_FREE))
{
SpinRelease(BufMgrLock);
elog(NOTICE, "FlushRelationBuffers(%s, %u): block %u is referenced (private %ld, global %d)",
- buf->sb_relname, block, buf->tag.blockNum,
+ RelationGetRelationName(rel), block,
+ buf->tag.blockNum,
PrivateRefCount[i], buf->refcount);
return -2;
}
AddBufferToFreelist(bufHdr);
bufHdr->flags |= BM_FREE;
}
- if (CommitInfoNeedsSave[buffer - 1])
- {
- bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
- CommitInfoNeedsSave[buffer - 1] = 0;
- }
SpinRelease(BufMgrLock);
}
fprintf(stderr, "PIN(Incr) %ld relname = %s, blockNum = %d, \
refcount = %ld, file: %s, line: %d\n",
- buffer, buf->sb_relname, buf->tag.blockNum,
+ buffer, buf->blind.relname, buf->tag.blockNum,
PrivateRefCount[buffer - 1], file, line);
}
}
fprintf(stderr, "UNPIN(Rel) %ld relname = %s, blockNum = %d, \
refcount = %ld, file: %s, line: %d\n",
- buffer, buf->sb_relname, buf->tag.blockNum,
+ buffer, buf->blind.relname, buf->tag.blockNum,
PrivateRefCount[buffer - 1], file, line);
}
}
fprintf(stderr, "UNPIN(Rel&Rd) %ld relname = %s, blockNum = %d, \
refcount = %ld, file: %s, line: %d\n",
- buffer, buf->sb_relname, buf->tag.blockNum,
+ buffer, buf->blind.relname, buf->tag.blockNum,
PrivateRefCount[buffer - 1], file, line);
}
if (ShowPinTrace && BufferIsLocal(buffer) && is_userbuffer(buffer))
fprintf(stderr, "PIN(Rel&Rd) %ld relname = %s, blockNum = %d, \
refcount = %ld, file: %s, line: %d\n",
- b, buf->sb_relname, buf->tag.blockNum,
+ b, buf->blind.relname, buf->tag.blockNum,
PrivateRefCount[b - 1], file, line);
}
return b;
#endif /* BMTRACE */
+/*
+ * SetBufferCommitInfoNeedsSave
+ *
+ * Mark a buffer dirty when we have updated tuple commit-status bits in it.
+ *
+ * This is similar to WriteNoReleaseBuffer, except that we do not set
+ * SharedBufferChanged or BufferDirtiedByMe, because we have not made a
+ * critical change that has to be flushed to disk before xact commit --- the
+ * status-bit update could be redone by someone else just as easily. The
+ * buffer will be marked dirty, but it will not be written to disk until
+ * there is another reason to write it.
+ *
+ * This routine might get called many times on the same page, if we are making
+ * the first scan after commit of an xact that added/deleted many tuples.
+ * So, be as quick as we can if the buffer is already dirty.
+ */
void
SetBufferCommitInfoNeedsSave(Buffer buffer)
{
- if (!BufferIsLocal(buffer))
- CommitInfoNeedsSave[buffer - 1]++;
+ BufferDesc *bufHdr;
+
+ if (BufferIsLocal(buffer)) /* local buffers are left untouched */
+ return;
+
+ if (BAD_BUFFER_ID(buffer)) /* out-of-range buffer id: silently ignored */
+ return;
+
+ bufHdr = &BufferDescriptors[buffer - 1]; /* Buffer ids are 1-based */
+
+ if ((bufHdr->flags & (BM_DIRTY | BM_JUST_DIRTIED)) != /* unlocked test first: skip the spinlock when both bits are already set */
+ (BM_DIRTY | BM_JUST_DIRTIED))
+ {
+ SpinAcquire(BufMgrLock);
+ Assert(bufHdr->refcount > 0); /* shared refcount must be nonzero here */
+ bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
+ SpinRelease(BufMgrLock);
+ }
}
void
Assert(!(buf->flags & BM_IO_IN_PROGRESS));
buf->flags |= BM_IO_IN_PROGRESS;
#ifdef HAS_TEST_AND_SET
- Assert(S_LOCK_FREE(&(buf->io_in_progress_lock)))
+ /*
+ * There used to be
+ *
+ * Assert(S_LOCK_FREE(&(buf->io_in_progress_lock)));
+ *
+ * here, but that's wrong because of the way WaitIO works: someone else
+ * waiting for the I/O to complete will succeed in grabbing the lock for
+ * a few instructions, and if we context-swap back to here the Assert
+ * could fail. Tiny window for failure, but I've seen it happen -- tgl
+ */
S_LOCK(&(buf->io_in_progress_lock));
#endif /* HAS_TEST_AND_SET */
InProgressBuf = buf;
IsForInput = forInput;
}
-extern void InitBufferIO(void)
+void InitBufferIO(void) /* clears InProgressBuf: no buffer I/O is recorded as in progress */
{
InProgressBuf = (BufferDesc *)0;
}
* set in case of output,this routine would kill all
* backends and reset postmaster.
*/
-extern void AbortBufferIO(void)
+void AbortBufferIO(void)
{
BufferDesc *buf = InProgressBuf;
if (buf)
buf->flags |= BM_DIRTY;
}
buf->flags |= BM_IO_ERROR;
- TerminateBufferIO(buf);
buf->flags &= ~BM_IO_IN_PROGRESS;
+ TerminateBufferIO(buf);
SpinRelease(BufMgrLock);
}
}
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/storage/buffer/freelist.c,v 1.20 2000/01/26 05:56:52 momjian Exp $
+ * $Header: /cvsroot/pgsql/src/backend/storage/buffer/freelist.c,v 1.21 2000/04/09 04:43:19 tgl Exp $
*
*-------------------------------------------------------------------------
*/
fprintf(stderr, "PIN(Pin) %ld relname = %s, blockNum = %d, \
refcount = %ld, file: %s, line: %d\n",
- buffer, buf->sb_relname, buf->tag.blockNum,
+ buffer, buf->blind.relname, buf->tag.blockNum,
PrivateRefCount[buffer - 1], file, line);
}
}
fprintf(stderr, "UNPIN(Unpin) %ld relname = %s, blockNum = %d, \
refcount = %ld, file: %s, line: %d\n",
- buffer, buf->sb_relname, buf->tag.blockNum,
+ buffer, buf->blind.relname, buf->tag.blockNum,
PrivateRefCount[buffer - 1], file, line);
}
}
int i = (buf - BufferDescriptors);
printf("[%-2d] (%s, %d) flags=0x%x, refcnt=%d %ld, nxt=%ld prv=%ld)\n",
- i, buf->sb_relname, buf->tag.blockNum,
+ i, buf->blind.relname, buf->tag.blockNum,
buf->flags, buf->refcount, PrivateRefCount[i],
buf->freeNext, buf->freePrev);
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/storage/file/fd.c,v 1.54 2000/03/17 02:36:19 tgl Exp $
+ * $Header: /cvsroot/pgsql/src/backend/storage/file/fd.c,v 1.55 2000/04/09 04:43:19 tgl Exp $
*
* NOTES:
*
vfdP->seekPos = (long) lseek(vfdP->fd, 0L, SEEK_CUR);
Assert(vfdP->seekPos != -1);
- /* if we have written to the file, sync it */
+ /* if we have written to the file, sync it before closing */
if (vfdP->fdstate & FD_DIRTY)
{
returnValue = pg_fsync(vfdP->fd);
returnValue = lseek(vfdP->fd, vfdP->seekPos, SEEK_SET);
Assert(returnValue != -1);
}
-
- /* Update state as appropriate for re-open (needed?) */
- vfdP->fdstate &= ~FD_DIRTY;
}
/*
if (returnCode > 0)
VfdCache[file].seekPos += returnCode;
- /* record the write */
+ /* mark the file as needing fsync */
VfdCache[file].fdstate |= FD_DIRTY;
return returnCode;
return returnCode;
}
+/*
+ * FileSync --- if a file is marked as dirty, fsync it.
+ *
+ * The FD_DIRTY bit is slightly misnamed: it doesn't mean that we need to
+ * write the file, but that we *have* written it and need to execute an
+ * fsync() to ensure the changes are down on disk before we mark the current
+ * transaction committed.
+ *
+ * FD_DIRTY is set by FileWrite or by an explicit FileMarkDirty() call.
+ * It is cleared after successfully fsync'ing the file. FileClose() will
+ * fsync a dirty File that is about to be closed, since there will be no
+ * other place to remember the need to fsync after the VFD is gone.
+ *
+ * Note that the DIRTY bit is logically associated with the actual disk file,
+ * not with any particular kernel FD we might have open for it. We assume
+ * that fsync will force out any dirty buffers for that file, whether or not
+ * they were written through the FD being used for the fsync call --- they
+ * might even have been written by some other backend!
+ *
+ * Note also that LruDelete currently fsyncs a dirty file that it is about
+ * to close the kernel file descriptor for. The idea there is to avoid
+ * having to re-open the kernel descriptor later. But it's not real clear
+ * that this is a performance win; we could end up fsyncing the same file
+ * multiple times in a transaction, which would probably cost more time
+ * than is saved by avoiding an open() call. This should be studied.
+ *
+ * This routine used to think it could skip the fsync if the file is
+ * physically closed, but that is now WRONG; see comments for FileMarkDirty.
+ */
int
FileSync(File file)
{
Assert(FileIsValid(file));
- /*
- * If the file isn't open, then we don't need to sync it; we always
- * sync files when we close them. Also, if we haven't done any writes
- * that we haven't already synced, we can ignore the request.
- */
-
- if (VfdCache[file].fd < 0 || !(VfdCache[file].fdstate & FD_DIRTY))
+ if (!(VfdCache[file].fdstate & FD_DIRTY))
+ {
+ /* Need not sync if file is not dirty. */
returnCode = 0;
- else
+ }
+ else if (disableFsync)
{
- returnCode = pg_fsync(VfdCache[file].fd);
+ /* Don't force the file open if pg_fsync isn't gonna sync it. */
+ returnCode = 0;
VfdCache[file].fdstate &= ~FD_DIRTY;
}
+ else
+ {
+ /* We don't use FileAccess() because we don't want to force the
+ * file to the front of the LRU ring; we aren't expecting to
+ * access it again soon.
+ */
+ if (FileIsNotOpen(file))
+ {
+ returnCode = LruInsert(file);
+ if (returnCode != 0)
+ return returnCode;
+ }
+ returnCode = pg_fsync(VfdCache[file].fd);
+ if (returnCode == 0)
+ VfdCache[file].fdstate &= ~FD_DIRTY;
+ }
return returnCode;
}
+/*
+ * FileMarkDirty --- mark a file as needing fsync at transaction commit.
+ *
+ * Since FileWrite marks the file dirty, this routine is not needed in
+ * normal use. It is called when the buffer manager detects that some other
+ * backend has written out a shared buffer that this backend dirtied (but
+ * didn't write) in the current xact. In that scenario, we need to fsync
+ * the file before we can commit. We cannot assume that the other backend
+ * has fsync'd the file yet; we need to do our own fsync to ensure that
+ * (a) the disk page is written and (b) this backend's commit is delayed
+ * until the write is complete.
+ *
+ * Note we are assuming that an fsync issued by this backend will write
+ * kernel disk buffers that were dirtied by another backend. Furthermore,
+ * it doesn't matter whether we currently have the file physically open;
+ * we must fsync even if we have to re-open the file to do it.
+ */
+void
+FileMarkDirty(File file)
+{
+ Assert(FileIsValid(file));
+
+ DO_DB(elog(DEBUG, "FileMarkDirty: %d (%s)",
+ file, VfdCache[file].fileName));
+
+ VfdCache[file].fdstate |= FD_DIRTY; /* only sets the flag; FileSync() does the fsync and clears it on success */
+}
+
+
/*
* Routines that want to use stdio (ie, FILE*) should use AllocateFile
* rather than plain fopen(). This lets fd.c deal with freeing FDs if
* exit (it doesn't particularly care which). All still-open temporary-file
* VFDs are closed, which also causes the underlying files to be deleted.
* Furthermore, all "allocated" stdio files are closed.
+ *
+ * This routine is not involved in fsync'ing non-temporary files at xact
+ * commit; that is done by FileSync under control of the buffer manager.
+ * During a commit, that is done *before* control gets here. If we still
+ * have any needs-fsync bits set when we get here, we assume this is abort
+ * and clear them.
*/
void
AtEOXact_Files(void)
if ((VfdCache[i].fdstate & FD_TEMPORARY) &&
VfdCache[i].fileName != NULL)
FileClose(i);
+ else
+ VfdCache[i].fdstate &= ~FD_DIRTY;
}
}
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/storage/smgr/md.c,v 1.64 2000/02/07 02:38:18 inoue Exp $
+ * $Header: /cvsroot/pgsql/src/backend/storage/smgr/md.c,v 1.65 2000/04/09 04:43:20 tgl Exp $
*
*-------------------------------------------------------------------------
*/
typedef struct _MdfdVec
{
int mdfd_vfd; /* fd number in vfd pool */
- uint16 mdfd_flags; /* clean, dirty, free */
+ int mdfd_flags; /* free, temporary */
+
+/* these are the assigned bits in mdfd_flags: */
+#define MDFD_FREE (1 << 0)/* unused entry */
+#define MDFD_TEMP (1 << 1)/* close this entry at transaction end */
+
int mdfd_lstbcnt; /* most recent block count */
int mdfd_nextFree; /* next free vector */
#ifndef LET_OS_MANAGE_FILESIZE
static int CurFd = 0; /* first never-used fdvec index */
static MemoryContext MdCxt; /* context for all my allocations */
-#define MDFD_DIRTY (uint16) 0x01
-#define MDFD_FREE (uint16) 0x02
-
/* routines declared here */
+static void mdclose_fd(int fd);
static int _mdfd_getrelnfd(Relation reln);
static MdfdVec *_mdfd_openseg(Relation reln, int segno, int oflags);
static MdfdVec *_mdfd_getseg(Relation reln, int blkno);
+static MdfdVec *_mdfd_blind_getseg(char *dbname, char *relname,
+ Oid dbid, Oid relid, int blkno);
static int _fdvec_alloc(void);
static void _fdvec_free(int);
static BlockNumber _mdnblocks(File file, Size blcksz);
#endif
Md_fdvec[vfd].mdfd_lstbcnt = 0;
+ pfree(path);
+
return vfd;
}
return SM_FAIL;
}
- /* remember that we did a write, so we can sync at xact commit */
- v->mdfd_flags |= MDFD_DIRTY;
-
/* try to keep the last block count current, though it's just a hint */
#ifndef LET_OS_MANAGE_FILESIZE
if ((v->mdfd_lstbcnt = (++nblocks % RELSEG_SIZE)) == 0)
#endif
#endif
+ pfree(path);
+
return vfd;
}
mdclose(Relation reln)
{
int fd;
- MdfdVec *v;
- MemoryContext oldcxt;
fd = RelationGetFile(reln);
if (fd < 0)
return SM_SUCCESS; /* already closed, so no work */
+ mdclose_fd(fd);
+
+ reln->rd_fd = -1;
+
+ return SM_SUCCESS;
+}
+
+static void
+mdclose_fd(int fd)
+{
+ MdfdVec *v;
+ MemoryContext oldcxt;
+
oldcxt = MemoryContextSwitchTo(MdCxt);
#ifndef LET_OS_MANAGE_FILESIZE
for (v = &Md_fdvec[fd]; v != (MdfdVec *) NULL;)
/* if not closed already */
if (v->mdfd_vfd >= 0)
{
-
/*
* We sync the file descriptor so that we don't need to reopen
- * it at transaction commit to force changes to disk.
+ * it at transaction commit to force changes to disk. (This
+ * is not really optional, because we are about to forget that
+ * the file even exists...)
*/
-
FileSync(v->mdfd_vfd);
FileClose(v->mdfd_vfd);
-
- /* mark this file descriptor as clean in our private table */
- v->mdfd_flags &= ~MDFD_DIRTY;
}
/* Now free vector */
v = v->mdfd_chain;
{
if (v->mdfd_vfd >= 0)
{
-
/*
* We sync the file descriptor so that we don't need to reopen
- * it at transaction commit to force changes to disk.
+ * it at transaction commit to force changes to disk. (This
+ * is not really optional, because we are about to forget that
+ * the file even exists...)
*/
-
FileSync(v->mdfd_vfd);
FileClose(v->mdfd_vfd);
-
- /* mark this file descriptor as clean in our private table */
- v->mdfd_flags &= ~MDFD_DIRTY;
}
}
#endif
MemoryContextSwitchTo(oldcxt);
_fdvec_free(fd);
-
- /* be sure to mark relation closed */
- reln->rd_fd = -1;
-
- return SM_SUCCESS;
}
/*
if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ)
status = SM_FAIL;
- v->mdfd_flags |= MDFD_DIRTY;
-
return status;
}
|| FileSync(v->mdfd_vfd) < 0)
status = SM_FAIL;
- /*
- * By here, the block is written and changes have been forced to
- * stable storage. Mark the descriptor as clean until the next write,
- * so we don't sync it again unnecessarily at transaction commit.
- */
-
- v->mdfd_flags &= ~MDFD_DIRTY;
-
return status;
}
* mdblindwrt() -- Write a block to disk blind.
*
* We have to be able to do this using only the name and OID of
- * the database and relation in which the block belongs. This
- * is a synchronous write.
+ * the database and relation in which the block belongs. Otherwise
+ * this is just like mdwrite().
*/
int
-mdblindwrt(char *dbstr,
- char *relstr,
+mdblindwrt(char *dbname,
+ char *relname,
Oid dbid,
Oid relid,
BlockNumber blkno,
char *buffer)
{
- int fd;
- int segno;
- long seekpos;
int status;
- char *path;
-
-#ifndef LET_OS_MANAGE_FILESIZE
- int nchars;
-
- /* be sure we have enough space for the '.segno', if any */
- segno = blkno / RELSEG_SIZE;
- if (segno > 0)
- nchars = 10;
- else
- nchars = 0;
+ long seekpos;
+ MdfdVec *v;
- /* construct the path to the file and open it */
- /* system table? then put in system area... */
- if (dbid == (Oid) 0)
- {
- path = (char *) palloc(strlen(DataDir) + sizeof(NameData) + 2 + nchars);
- if (segno == 0)
- sprintf(path, "%s/%s", DataDir, relstr);
- else
- sprintf(path, "%s/%s.%d", DataDir, relstr, segno);
- }
- /* user table? then put in user database area... */
- else if (dbid == MyDatabaseId)
- {
- path = (char *) palloc(strlen(DatabasePath) + 2 * sizeof(NameData) + 2 + nchars);
- if (segno == 0)
- sprintf(path, "%s%c%s", DatabasePath, SEP_CHAR, relstr);
- else
- sprintf(path, "%s%c%s.%d", DatabasePath, SEP_CHAR, relstr, segno);
- }
- else
-/* this is work arround only !!! */
- {
- char dbpath[MAXPGPATH];
- Oid id;
- char *tmpPath;
-
- GetRawDatabaseInfo(dbstr, &id, dbpath);
-
- if (id != dbid)
- elog(FATAL, "mdblindwrt: oid of db %s is not %u", dbstr, dbid);
- tmpPath = ExpandDatabasePath(dbpath);
- if (tmpPath == NULL)
- elog(FATAL, "mdblindwrt: can't expand path for db %s", dbstr);
- path = (char *) palloc(strlen(tmpPath) + 2 * sizeof(NameData) + 2 + nchars);
- if (segno == 0)
- sprintf(path, "%s%c%s", tmpPath, SEP_CHAR, relstr);
- else
- sprintf(path, "%s%c%s.%d", tmpPath, SEP_CHAR, relstr, segno);
- pfree(tmpPath);
- }
-#else
- /* construct the path to the file and open it */
- /* system table? then put in system area... */
- if (dbid == (Oid) 0)
- {
- path = (char *) palloc(strlen(DataDir) + sizeof(NameData) + 2);
- sprintf(path, "%s/%s", DataDir, relstr);
- }
- /* user table? then put in user database area... */
- else if (dbid == MyDatabaseId)
- {
- path = (char *) palloc(strlen(DatabasePath) + 2 * sizeof(NameData) + 2);
- sprintf(path, "%s%c%s", DatabasePath, SEP_CHAR, relstr);
- }
- else
-/* this is work arround only !!! */
- {
- char dbpath[MAXPGPATH];
- Oid id;
- char *tmpPath;
-
- GetRawDatabaseInfo(dbstr, &id, dbpath);
-
- if (id != dbid)
- elog(FATAL, "mdblindwrt: oid of db %s is not %u", dbstr, dbid);
- tmpPath = ExpandDatabasePath(dbpath);
- if (tmpPath == NULL)
- elog(FATAL, "mdblindwrt: can't expand path for db %s", dbstr);
- path = (char *) palloc(strlen(tmpPath) + 2 * sizeof(NameData) + 2);
- sprintf(path, "%s%c%s", tmpPath, SEP_CHAR, relstr);
- pfree(tmpPath);
- }
-#endif
+ v = _mdfd_blind_getseg(dbname, relname, dbid, relid, blkno); /* opens the target segment through the VFD layer; its fdvec entry is flagged MDFD_TEMP */
-#ifndef __CYGWIN32__
- if ((fd = open(path, O_RDWR, 0600)) < 0)
-#else
- if ((fd = open(path, O_RDWR | O_BINARY, 0600)) < 0)
-#endif
+ if (v == NULL)
return SM_FAIL;
- /* seek to the right spot */
#ifndef LET_OS_MANAGE_FILESIZE
seekpos = (long) (BLCKSZ * (blkno % RELSEG_SIZE));
+#ifdef DIAGNOSTIC
+ if (seekpos >= BLCKSZ * RELSEG_SIZE)
+ elog(FATAL, "seekpos too big!");
+#endif
#else
seekpos = (long) (BLCKSZ * (blkno));
#endif
- if (lseek(fd, seekpos, SEEK_SET) != seekpos)
- {
- close(fd);
+ if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
return SM_FAIL;
- }
status = SM_SUCCESS;
-
- /* write and sync the block */
- if (write(fd, buffer, BLCKSZ) != BLCKSZ || (pg_fsync(fd) < 0))
+ if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ) /* no fsync here any more; FileWrite flags the VFD so a later FileSync() does it */
status = SM_FAIL;
- if (close(fd) < 0)
- status = SM_FAIL;
+ return status;
+}
- pfree(path);
+/*
+ * mdmarkdirty() -- Mark the specified block "dirty" (ie, needs fsync).
+ *
+ * Looks up the segment that holds blkno with _mdfd_getseg() and calls
+ * FileMarkDirty() on that segment's virtual file descriptor. No data is
+ * written here.
+ *
+ * Returns SM_SUCCESS or SM_FAIL. (As written, SM_SUCCESS is the only
+ * value this routine itself returns.)
+ *
+ * NOTE(review): the result of _mdfd_getseg() is dereferenced without a
+ * NULL check, unlike mdblindmarkdirty(); presumably _mdfd_getseg() reports
+ * failure on its own and never returns NULL -- confirm.
+ */
+int
+mdmarkdirty(Relation reln, BlockNumber blkno)
+{
+ MdfdVec *v;
- return status;
+ v = _mdfd_getseg(reln, blkno);
+
+ FileMarkDirty(v->mdfd_vfd);
+
+ return SM_SUCCESS;
+}
+
+/*
+ * mdblindmarkdirty() -- Mark the specified block "dirty" (ie, needs fsync).
+ *
+ * We have to be able to do this using only the name and OID of
+ * the database and relation in which the block belongs. Otherwise
+ * this is just like mdmarkdirty().
+ *
+ * The segment holding blkno is located with _mdfd_blind_getseg(), which
+ * opens it afresh and flags the resulting fdvec entry MDFD_TEMP.
+ *
+ * Returns SM_FAIL if the segment cannot be obtained, else SM_SUCCESS.
+ */
+int
+mdblindmarkdirty(char *dbname,
+ char *relname,
+ Oid dbid,
+ Oid relid,
+ BlockNumber blkno)
+{
+ MdfdVec *v;
+
+ v = _mdfd_blind_getseg(dbname, relname, dbid, relid, blkno);
+
+ if (v == NULL)
+ return SM_FAIL;
+
+ FileMarkDirty(v->mdfd_vfd); /* sets FD_DIRTY only; nothing is written or synced here */
+
+ return SM_SUCCESS;
}
/*
for (i = 0; i < CurFd; i++)
{
+ v = &Md_fdvec[i];
+ if (v->mdfd_flags & MDFD_FREE)
+ continue;
+ if (v->mdfd_flags & MDFD_TEMP)
+ {
+ /* Sync and close the file */
+ mdclose_fd(i);
+ }
+ else
+ {
+ /* Sync, but keep the file entry */
+
#ifndef LET_OS_MANAGE_FILESIZE
- for (v = &Md_fdvec[i]; v != (MdfdVec *) NULL; v = v->mdfd_chain)
+ for ( ; v != (MdfdVec *) NULL; v = v->mdfd_chain)
#else
- v = &Md_fdvec[i];
- if (v != (MdfdVec *) NULL)
+ if (v != (MdfdVec *) NULL)
#endif
- {
- if (v->mdfd_flags & MDFD_DIRTY)
{
if (FileSync(v->mdfd_vfd) < 0)
return SM_FAIL;
-
- v->mdfd_flags &= ~MDFD_DIRTY;
}
}
}
for (i = 0; i < CurFd; i++)
{
-#ifndef LET_OS_MANAGE_FILESIZE
- for (v = &Md_fdvec[i]; v != (MdfdVec *) NULL; v = v->mdfd_chain)
- v->mdfd_flags &= ~MDFD_DIRTY;
-#else
v = &Md_fdvec[i];
- v->mdfd_flags &= ~MDFD_DIRTY;
-#endif
+ if (v->mdfd_flags & MDFD_FREE)
+ continue;
+ if (v->mdfd_flags & MDFD_TEMP)
+ {
+ /* Close the file */
+ mdclose_fd(i);
+ }
}
return SM_SUCCESS;
Md_fdvec[fdvec].mdfd_nextFree = Md_Free;
Md_fdvec[fdvec].mdfd_flags = MDFD_FREE;
Md_Free = fdvec;
-
}
static MdfdVec *
MemoryContext oldcxt;
MdfdVec *v;
int fd;
- bool dofree;
char *path,
*fullpath;
/* be sure we have enough space for the '.segno', if any */
path = relpath(RelationGetPhysicalRelationName(reln));
- dofree = false;
if (segno > 0)
{
- dofree = true;
fullpath = (char *) palloc(strlen(path) + 12);
sprintf(fullpath, "%s.%d", path, segno);
+ pfree(path);
}
else
fullpath = path;
fd = FileNameOpenFile(fullpath, O_RDWR | O_BINARY | oflags, 0600);
#endif
- if (dofree)
- pfree(fullpath);
+ pfree(fullpath);
if (fd < 0)
return (MdfdVec *) NULL;
return v;
}
+/*
+ * _mdfd_blind_getseg() -- Find the segment of the relation holding the
+ *		specified block.
+ *
+ * This is the same as _mdfd_getseg() except that we must work
+ * "blind" with no Relation struct.
+ *
+ * Returns the MdfdVec of the segment containing blkno, or NULL if the
+ * relation file (or a needed segment file) cannot be opened or no fdvec
+ * slot can be allocated.  The path string obtained from relpath_blind()
+ * is released on every exit path, and a VFD that could not be given an
+ * fdvec slot is closed again instead of being left without an owner.
+ *
+ * NOTE: we have no easy way to tell whether a FD already exists for the
+ * target relation, so we always make a new one.  This should probably
+ * be improved somehow, but I doubt it's a significant performance issue
+ * under normal circumstances.  The FD is marked to be closed at end of xact
+ * so that we don't accumulate a lot of dead FDs.
+ *
+ * If a later segment cannot be opened, the entries built so far remain
+ * chained to the MDFD_TEMP fdvec slot, so they are disposed of by the
+ * same end-of-xact handling of MDFD_TEMP entries.
+ */
+
+static MdfdVec *
+_mdfd_blind_getseg(char *dbname, char *relname, Oid dbid, Oid relid,
+				   int blkno)
+{
+	MdfdVec    *v;
+	char	   *path;
+	int			fd;
+	int			vfd;
+#ifndef LET_OS_MANAGE_FILESIZE
+	int			segno;
+	int			targsegno;
+#endif
+
+	/* construct the path to the file and open it */
+	path = relpath_blind(dbname, relname, dbid, relid);
+
+#ifndef __CYGWIN32__
+	fd = FileNameOpenFile(path, O_RDWR, 0600);
+#else
+	fd = FileNameOpenFile(path, O_RDWR | O_BINARY, 0600);
+#endif
+
+	if (fd < 0)
+	{
+		pfree(path);
+		return NULL;
+	}
+
+	vfd = _fdvec_alloc();
+	if (vfd < 0)
+	{
+		/* no fdvec entry will ever own this VFD, so close it right here */
+		FileClose(fd);
+		pfree(path);
+		return NULL;
+	}
+
+	Md_fdvec[vfd].mdfd_vfd = fd;
+	Md_fdvec[vfd].mdfd_flags = MDFD_TEMP;
+	Md_fdvec[vfd].mdfd_lstbcnt = _mdnblocks(fd, BLCKSZ);
+#ifndef LET_OS_MANAGE_FILESIZE
+	Md_fdvec[vfd].mdfd_chain = (MdfdVec *) NULL;
+
+#ifdef DIAGNOSTIC
+	if (Md_fdvec[vfd].mdfd_lstbcnt > RELSEG_SIZE)
+		elog(FATAL, "segment too big on relopen!");
+#endif
+
+	/* walk (and open) segments 1..targsegno, chaining each onto the last */
+	targsegno = blkno / RELSEG_SIZE;
+	for (v = &Md_fdvec[vfd], segno = 1; segno <= targsegno; segno++)
+	{
+		char	   *segpath;
+		MdfdVec    *newv;
+		MemoryContext oldcxt;
+
+		/* room for '.', the segment number, and the terminator */
+		segpath = (char *) palloc(strlen(path) + 12);
+		sprintf(segpath, "%s.%d", path, segno);
+
+#ifndef __CYGWIN32__
+		fd = FileNameOpenFile(segpath, O_RDWR | O_CREAT, 0600);
+#else
+		fd = FileNameOpenFile(segpath, O_RDWR | O_BINARY | O_CREAT, 0600);
+#endif
+
+		pfree(segpath);
+
+		if (fd < 0)
+		{
+			/* the partial chain stays on the MDFD_TEMP slot; see above */
+			pfree(path);
+			return (MdfdVec *) NULL;
+		}
+
+		/* allocate an mdfdvec entry for it */
+		oldcxt = MemoryContextSwitchTo(MdCxt);
+		newv = (MdfdVec *) palloc(sizeof(MdfdVec));
+		MemoryContextSwitchTo(oldcxt);
+
+		/* fill the entry */
+		newv->mdfd_vfd = fd;
+		newv->mdfd_flags = MDFD_TEMP;
+		newv->mdfd_lstbcnt = _mdnblocks(fd, BLCKSZ);
+		newv->mdfd_chain = (MdfdVec *) NULL;
+#ifdef DIAGNOSTIC
+		if (newv->mdfd_lstbcnt > RELSEG_SIZE)
+			elog(FATAL, "segment too big on open!");
+#endif
+		v->mdfd_chain = newv;
+		v = newv;
+	}
+#else
+	v = &Md_fdvec[vfd];
+#endif
+
+	pfree(path);
+
+	return v;
+}
+
static BlockNumber
_mdnblocks(File file, Size blcksz)
{
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/storage/smgr/smgr.c,v 1.32 2000/01/26 05:57:05 momjian Exp $
+ * $Header: /cvsroot/pgsql/src/backend/storage/smgr/smgr.c,v 1.33 2000/04/09 04:43:20 tgl Exp $
*
*-------------------------------------------------------------------------
*/
typedef struct f_smgr
{
- int (*smgr_init) ();/* may be NULL */
- int (*smgr_shutdown) (); /* may be NULL */
- int (*smgr_create) ();
- int (*smgr_unlink) ();
- int (*smgr_extend) ();
- int (*smgr_open) ();
- int (*smgr_close) ();
- int (*smgr_read) ();
- int (*smgr_write) ();
- int (*smgr_flush) ();
- int (*smgr_blindwrt) ();
- int (*smgr_nblocks) ();
- int (*smgr_truncate) ();
- int (*smgr_commit) (); /* may be NULL */
- int (*smgr_abort) (); /* may be NULL */
+ int (*smgr_init) (void); /* may be NULL */
+ int (*smgr_shutdown) (void); /* may be NULL */
+ int (*smgr_create) (Relation reln);
+ int (*smgr_unlink) (Relation reln);
+ int (*smgr_extend) (Relation reln, char *buffer);
+ int (*smgr_open) (Relation reln);
+ int (*smgr_close) (Relation reln);
+ int (*smgr_read) (Relation reln, BlockNumber blocknum,
+ char *buffer);
+ int (*smgr_write) (Relation reln, BlockNumber blocknum,
+ char *buffer);
+ int (*smgr_flush) (Relation reln, BlockNumber blocknum,
+ char *buffer);
+ int (*smgr_blindwrt) (char *dbname, char *relname,
+ Oid dbid, Oid relid,
+ BlockNumber blkno, char *buffer);
+ int (*smgr_markdirty) (Relation reln, BlockNumber blkno); /* flag the block's file as needing fsync; called via smgrmarkdirty() */
+ int (*smgr_blindmarkdirty) (char *dbname, char *relname,
+ Oid dbid, Oid relid,
+ BlockNumber blkno); /* same, but identified by names and OIDs; called via smgrblindmarkdirty() */
+ int (*smgr_nblocks) (Relation reln);
+ int (*smgr_truncate) (Relation reln, int nblocks);
+ int (*smgr_commit) (void); /* may be NULL */
+ int (*smgr_abort) (void); /* may be NULL */
} f_smgr;
/*
/* magnetic disk */
{mdinit, NULL, mdcreate, mdunlink, mdextend, mdopen, mdclose,
- mdread, mdwrite, mdflush, mdblindwrt, mdnblocks, mdtruncate,
- mdcommit, mdabort},
+ mdread, mdwrite, mdflush, mdblindwrt, mdmarkdirty, mdblindmarkdirty,
+ mdnblocks, mdtruncate, mdcommit, mdabort},
#ifdef STABLE_MEMORY_STORAGE
/* main memory */
{mminit, mmshutdown, mmcreate, mmunlink, mmextend, mmopen, mmclose,
- mmread, mmwrite, mmflush, mmblindwrt, mmnblocks, NULL,
- mmcommit, mmabort},
+ mmread, mmwrite, mmflush, mmblindwrt, mmmarkdirty, mmblindmarkdirty,
+ mmnblocks, NULL, mmcommit, mmabort},
#endif
};
char *relstr;
int status;
+ /* strdup here is probably redundant */
dbstr = pstrdup(dbname);
relstr = pstrdup(relname);
return status;
}
+/*
+ * smgrmarkdirty() -- Mark a page dirty (needs fsync).
+ *
+ * Mark the specified page as needing to be fsync'd before commit.
+ * Ordinarily, the storage manager will do this implicitly during
+ * smgrwrite(). However, the buffer manager may discover that some
+ * other backend has written a buffer that we dirtied in the current
+ * transaction. In that case, we still need to fsync the file to be
+ * sure the page is down to disk before we commit.
+ *
+ * "which" selects the entry of smgrsw[] whose smgr_markdirty routine
+ * is called. If that routine reports SM_FAIL, elog(ERROR) is invoked;
+ * otherwise its status is returned to the caller.
+ */
+int
+smgrmarkdirty(int16 which,
+ Relation reln,
+ BlockNumber blkno)
+{
+ int status;
+
+ status = (*(smgrsw[which].smgr_markdirty)) (reln, blkno); /* dispatch through the storage manager switch */
+
+ if (status == SM_FAIL)
+ elog(ERROR, "cannot mark block %d of %s",
+ blkno, RelationGetRelationName(reln));
+
+ return status;
+}
+
+/*
+ * smgrblindmarkdirty() -- Mark a page dirty, "blind".
+ *
+ * Just like smgrmarkdirty, except we don't have a reldesc.
+ *
+ * The relation is identified by database/relation name and OID instead.
+ * "which" selects the entry of smgrsw[] whose smgr_blindmarkdirty routine
+ * is called, and that routine's status is returned. The routine is handed
+ * private copies of the two names, which are freed again before returning.
+ */
+int
+smgrblindmarkdirty(int16 which,
+ char *dbname,
+ char *relname,
+ Oid dbid,
+ Oid relid,
+ BlockNumber blkno)
+{
+ char *dbstr;
+ char *relstr;
+ int status;
+
+ /* strdup here is probably redundant */
+ dbstr = pstrdup(dbname);
+ relstr = pstrdup(relname);
+
+ status = (*(smgrsw[which].smgr_blindmarkdirty)) (dbstr, relstr,
+ dbid, relid,
+ blkno);
+
+ if (status == SM_FAIL)
+ elog(ERROR, "cannot mark block %d of %s [%s] blind",
+ blkno, relstr, dbstr);
+
+ pfree(dbstr); /* NOTE(review): presumably not reached when elog(ERROR) fires -- confirm */
+ pfree(relstr);
+
+ return status;
+}
+
/*
* smgrnblocks() -- Calculate the number of POSTGRES blocks in the
* supplied relation.
return SM_SUCCESS;
}
-#ifdef NOT_USED
int
smgrabort()
{
return SM_SUCCESS;
}
-#endif
-
#ifdef NOT_USED
bool
smgriswo(int16 smgrno)
* Portions Copyright (c) 1996-2000, PostgreSQL, Inc
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $Id: catalog.h,v 1.10 2000/01/26 05:57:56 momjian Exp $
+ * $Id: catalog.h,v 1.11 2000/04/09 04:43:14 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "access/tupdesc.h"
extern char *relpath(const char *relname);
+extern char *relpath_blind(const char *dbname, const char *relname,
+ Oid dbid, Oid relid);
extern bool IsSystemRelationName(const char *relname);
extern bool IsSharedSystemRelationName(const char *relname);
extern Oid newoid(void);
* Portions Copyright (c) 1996-2000, PostgreSQL, Inc
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $Id: buf_internals.h,v 1.35 2000/01/26 05:58:32 momjian Exp $
+ * $Id: buf_internals.h,v 1.36 2000/04/09 04:43:18 tgl Exp $
*
*-------------------------------------------------------------------------
*/
(a)->relId = (xx_reln)->rd_lockInfo.lockRelId \
)
+/* If we have to write a buffer "blind" (without a relcache entry),
+ * the BufferTag is not enough information. BufferBlindId carries the
+ * additional information needed.
+ *
+ * It holds the database and relation names as fixed-size NAMEDATALEN
+ * arrays. One copy is embedded in every BufferDesc as the "blind" member,
+ * replacing the former sb_dbname/sb_relname fields.
+ */
+typedef struct bufblindid
+{
+ char dbname[NAMEDATALEN]; /* name of db in which buf belongs */
+ char relname[NAMEDATALEN]; /* name of reln */
+} BufferBlindId;
+
#define BAD_BUFFER_ID(bid) ((bid) < 1 || (bid) > NBuffers)
#define INVALID_DESCRIPTOR (-3)
bool ri_lock; /* read-intent lock */
bool w_lock; /* context exclusively locked */
- char sb_dbname[NAMEDATALEN]; /* name of db in which buf belongs */
- char sb_relname[NAMEDATALEN]; /* name of reln */
+ BufferBlindId blind; /* extra info to support blind write */
} BufferDesc;
/*
extern BufferBlock BufferBlocks;
extern long *PrivateRefCount;
extern bits8 *BufferLocks;
-extern long *CommitInfoNeedsSave;
+extern BufferTag *BufferTagLastDirtied;
+extern BufferBlindId *BufferBlindLastDirtied;
+extern bool *BufferDirtiedByMe;
extern SPINLOCK BufMgrLock;
/* localbuf.c */
* Portions Copyright (c) 1996-2000, PostgreSQL, Inc
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $Id: bufmgr.h,v 1.35 2000/03/31 02:43:30 tgl Exp $
+ * $Id: bufmgr.h,v 1.36 2000/04/09 04:43:18 tgl Exp $
*
*-------------------------------------------------------------------------
*/
extern void InitBufferPool(IPCKey key);
extern void PrintBufferUsage(FILE *statfp);
extern void ResetBufferUsage(void);
-extern void ResetBufferPool(void);
+extern void ResetBufferPool(bool isCommit);
extern int BufferPoolCheckLeak(void);
extern void FlushBufferPool(void);
extern BlockNumber BufferGetBlockNumber(Buffer buffer);
* Portions Copyright (c) 1996-2000, PostgreSQL, Inc
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $Id: fd.h,v 1.19 2000/01/26 05:58:32 momjian Exp $
+ * $Id: fd.h,v 1.20 2000/04/09 04:43:18 tgl Exp $
*
*-------------------------------------------------------------------------
*/
/*
* calls:
*
- * File {Close, Read, Write, Seek, Tell, Sync}
+ * File {Close, Read, Write, Seek, Tell, MarkDirty, Sync}
* {File Name Open, Allocate, Free} File
*
* These are NOT JUST RENAMINGS OF THE UNIX ROUTINES.
extern long FileSeek(File file, long offset, int whence);
extern int FileTruncate(File file, long offset);
extern int FileSync(File file);
+extern void FileMarkDirty(File file);
/* Operations that allow use of regular stdio --- USE WITH CAUTION */
extern FILE *AllocateFile(char *name, char *mode);
* Portions Copyright (c) 1996-2000, PostgreSQL, Inc
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $Id: smgr.h,v 1.17 2000/01/26 05:58:33 momjian Exp $
+ * $Id: smgr.h,v 1.18 2000/04/09 04:43:18 tgl Exp $
*
*-------------------------------------------------------------------------
*/
extern int smgrextend(int16 which, Relation reln, char *buffer);
extern int smgropen(int16 which, Relation reln);
extern int smgrclose(int16 which, Relation reln);
-extern int smgrread(int16 which, Relation reln, BlockNumber blocknum,
- char *buffer);
-extern int smgrwrite(int16 which, Relation reln, BlockNumber blocknum,
- char *buffer);
-extern int smgrflush(int16 which, Relation reln, BlockNumber blocknum,
- char *buffer);
-extern int smgrblindwrt(int16 which, char *dbname, char *relname, Oid dbid,
- Oid relid, BlockNumber blkno, char *buffer);
+extern int smgrread(int16 which, Relation reln, BlockNumber blocknum,
+ char *buffer);
+extern int smgrwrite(int16 which, Relation reln, BlockNumber blocknum,
+ char *buffer);
+extern int smgrflush(int16 which, Relation reln, BlockNumber blocknum,
+ char *buffer);
+extern int smgrblindwrt(int16 which, char *dbname, char *relname,
+ Oid dbid, Oid relid,
+ BlockNumber blkno, char *buffer);
+extern int smgrmarkdirty(int16 which, Relation reln, BlockNumber blkno);
+extern int smgrblindmarkdirty(int16 which, char *dbname, char *relname,
+ Oid dbid, Oid relid,
+ BlockNumber blkno);
extern int smgrnblocks(int16 which, Relation reln);
extern int smgrtruncate(int16 which, Relation reln, int nblocks);
extern int smgrcommit(void);
+extern int smgrabort(void);
extern int mdread(Relation reln, BlockNumber blocknum, char *buffer);
extern int mdwrite(Relation reln, BlockNumber blocknum, char *buffer);
extern int mdflush(Relation reln, BlockNumber blocknum, char *buffer);
-extern int mdblindwrt(char *dbstr, char *relstr, Oid dbid, Oid relid,
- BlockNumber blkno, char *buffer);
+extern int mdblindwrt(char *dbname, char *relname, Oid dbid, Oid relid,
+ BlockNumber blkno, char *buffer);
+extern int mdmarkdirty(Relation reln, BlockNumber blkno);
+extern int mdblindmarkdirty(char *dbname, char *relname, Oid dbid, Oid relid,
+ BlockNumber blkno);
extern int mdnblocks(Relation reln);
extern int mdtruncate(Relation reln, int nblocks);
extern int mdcommit(void);
extern SPINLOCK MMCacheLock;
extern int mminit(void);
-extern int mmshutdown(void);
extern int mmcreate(Relation reln);
extern int mmunlink(Relation reln);
extern int mmextend(Relation reln, char *buffer);
extern int mmread(Relation reln, BlockNumber blocknum, char *buffer);
extern int mmwrite(Relation reln, BlockNumber blocknum, char *buffer);
extern int mmflush(Relation reln, BlockNumber blocknum, char *buffer);
-extern int mmblindwrt(char *dbstr, char *relstr, Oid dbid, Oid relid,
- BlockNumber blkno, char *buffer);
+extern int mmblindwrt(char *dbname, char *relname, Oid dbid, Oid relid,
+ BlockNumber blkno, char *buffer);
+extern int mmmarkdirty(Relation reln, BlockNumber blkno);
+extern int mmblindmarkdirty(char *dbname, char *relname, Oid dbid, Oid relid,
+ BlockNumber blkno);
extern int mmnblocks(Relation reln);
+extern int mmtruncate(Relation reln, int nblocks);
extern int mmcommit(void);
extern int mmabort(void);
+
+extern int mmshutdown(void);
extern int MMShmemSize(void);
/* smgrtype.c */