aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/backend/postmaster/checkpointer.c51
-rw-r--r--src/backend/storage/buffer/bufmgr.c4
-rw-r--r--src/backend/storage/smgr/md.c87
-rw-r--r--src/include/postmaster/bgwriter.h2
-rw-r--r--src/include/storage/relfilenode.h16
-rw-r--r--src/include/storage/smgr.h7
6 files changed, 109 insertions, 58 deletions
diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c
index 87aa4e1ba0a..fe4f4053588 100644
--- a/src/backend/postmaster/checkpointer.c
+++ b/src/backend/postmaster/checkpointer.c
@@ -105,7 +105,7 @@
*/
typedef struct
{
- RelFileNodeBackend rnode;
+ RelFileNode rnode;
ForkNumber forknum;
BlockNumber segno; /* see md.c for special values */
/* might add a real request-type field later; not needed yet */
@@ -927,17 +927,22 @@ CheckpointerShmemSize(void)
void
CheckpointerShmemInit(void)
{
+ Size size = CheckpointerShmemSize();
bool found;
CheckpointerShmem = (CheckpointerShmemStruct *)
ShmemInitStruct("Checkpointer Data",
- CheckpointerShmemSize(),
+ size,
&found);
if (!found)
{
- /* First time through, so initialize */
- MemSet(CheckpointerShmem, 0, sizeof(CheckpointerShmemStruct));
+ /*
+ * First time through, so initialize. Note that we zero the whole
+ * requests array; this is so that CompactCheckpointerRequestQueue
+ * can assume that any pad bytes in the request structs are zeroes.
+ */
+ MemSet(CheckpointerShmem, 0, size);
SpinLockInit(&CheckpointerShmem->ckpt_lck);
CheckpointerShmem->max_requests = NBuffers;
}
@@ -1094,11 +1099,15 @@ RequestCheckpoint(int flags)
* Forward a file-fsync request from a backend to the checkpointer
*
* Whenever a backend is compelled to write directly to a relation
- * (which should be seldom, if the checkpointer is getting its job done),
+ * (which should be seldom, if the background writer is getting its job done),
* the backend calls this routine to pass over knowledge that the relation
* is dirty and must be fsync'd before next checkpoint. We also use this
* opportunity to count such writes for statistical purposes.
*
+ * This functionality is only supported for regular (not backend-local)
+ * relations, so the rnode argument is intentionally RelFileNode not
+ * RelFileNodeBackend.
+ *
* segno specifies which segment (not block!) of the relation needs to be
* fsync'd. (Since the valid range is much less than BlockNumber, we can
* use high values for special flags; that's all internal to md.c, which
@@ -1115,8 +1124,7 @@ RequestCheckpoint(int flags)
* let the backend know by returning false.
*/
bool
-ForwardFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum,
- BlockNumber segno)
+ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
{
CheckpointerRequest *request;
bool too_full;
@@ -1172,6 +1180,7 @@ ForwardFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum,
/*
* CompactCheckpointerRequestQueue
* Remove duplicates from the request queue to avoid backend fsyncs.
+ * Returns "true" if any entries were removed.
*
* Although a full fsync request queue is not common, it can lead to severe
* performance problems when it does happen. So far, this situation has
@@ -1181,7 +1190,7 @@ ForwardFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum,
* gets very expensive and can slow down the whole system.
*
* Trying to do this every time the queue is full could lose if there
- * aren't any removable entries. But should be vanishingly rare in
+ * aren't any removable entries. But that should be vanishingly rare in
* practice: there's one queue entry per shared buffer.
*/
static bool
@@ -1203,18 +1212,20 @@ CompactCheckpointerRequestQueue(void)
/* must hold CheckpointerCommLock in exclusive mode */
Assert(LWLockHeldByMe(CheckpointerCommLock));
+ /* Initialize skip_slot array */
+ skip_slot = palloc0(sizeof(bool) * CheckpointerShmem->num_requests);
+
/* Initialize temporary hash table */
MemSet(&ctl, 0, sizeof(ctl));
ctl.keysize = sizeof(CheckpointerRequest);
ctl.entrysize = sizeof(struct CheckpointerSlotMapping);
ctl.hash = tag_hash;
+ ctl.hcxt = CurrentMemoryContext;
+
htab = hash_create("CompactCheckpointerRequestQueue",
CheckpointerShmem->num_requests,
&ctl,
- HASH_ELEM | HASH_FUNCTION);
-
- /* Initialize skip_slot array */
- skip_slot = palloc0(sizeof(bool) * CheckpointerShmem->num_requests);
+ HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
/*
* The basic idea here is that a request can be skipped if it's followed
@@ -1229,19 +1240,28 @@ CompactCheckpointerRequestQueue(void)
* anyhow), but it's not clear that the extra complexity would buy us
* anything.
*/
- for (n = 0; n < CheckpointerShmem->num_requests; ++n)
+ for (n = 0; n < CheckpointerShmem->num_requests; n++)
{
CheckpointerRequest *request;
struct CheckpointerSlotMapping *slotmap;
bool found;
+ /*
+ * We use the request struct directly as a hashtable key. This
+ * assumes that any padding bytes in the structs are consistently the
+ * same, which should be okay because we zeroed them in
+ * CheckpointerShmemInit. Note also that RelFileNode had better
+ * contain no pad bytes.
+ */
request = &CheckpointerShmem->requests[n];
slotmap = hash_search(htab, request, HASH_ENTER, &found);
if (found)
{
+ /* Duplicate, so mark the previous occurrence as skippable */
skip_slot[slotmap->slot] = true;
- ++num_skipped;
+ num_skipped++;
}
+ /* Remember slot containing latest occurrence of this request value */
slotmap->slot = n;
}
@@ -1256,7 +1276,8 @@ CompactCheckpointerRequestQueue(void)
}
/* We found some duplicates; remove them. */
- for (n = 0, preserve_count = 0; n < CheckpointerShmem->num_requests; ++n)
+ preserve_count = 0;
+ for (n = 0; n < CheckpointerShmem->num_requests; n++)
{
if (skip_slot[n])
continue;
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 78145472e16..a3bf9a4d44e 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -2049,7 +2049,7 @@ DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber forkNum,
int i;
/* If it's a local relation, it's localbuf.c's problem. */
- if (rnode.backend != InvalidBackendId)
+ if (RelFileNodeBackendIsTemp(rnode))
{
if (rnode.backend == MyBackendId)
DropRelFileNodeLocalBuffers(rnode.node, forkNum, firstDelBlock);
@@ -2103,7 +2103,7 @@ DropRelFileNodeAllBuffers(RelFileNodeBackend rnode)
int i;
/* If it's a local relation, it's localbuf.c's problem. */
- if (rnode.backend != InvalidBackendId)
+ if (RelFileNodeBackendIsTemp(rnode))
{
if (rnode.backend == MyBackendId)
DropRelFileNodeAllLocalBuffers(rnode.node);
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index e5dec9d2a32..c05315d8849 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -38,7 +38,7 @@
/*
* Special values for the segno arg to RememberFsyncRequest.
*
- * Note that CompactcheckpointerRequestQueue assumes that it's OK to remove an
+ * Note that CompactCheckpointerRequestQueue assumes that it's OK to remove an
* fsync request from the queue if an identical, subsequent request is found.
* See comments there before making changes here.
*/
@@ -122,13 +122,17 @@ static MemoryContext MdCxt; /* context for all md.c allocations */
* be deleted after the next checkpoint, but we use a linked list instead of
* a hash table, because we don't expect there to be any duplicate requests.
*
+ * These mechanisms are only used for non-temp relations; we never fsync
+ * temp rels, nor do we need to postpone their deletion (see comments in
+ * mdunlink).
+ *
* (Regular backends do not track pending operations locally, but forward
* them to the checkpointer.)
*/
typedef struct
{
- RelFileNodeBackend rnode; /* the targeted relation */
- ForkNumber forknum;
+ RelFileNode rnode; /* the targeted relation */
+ ForkNumber forknum; /* which fork */
BlockNumber segno; /* which segment */
} PendingOperationTag;
@@ -143,7 +147,7 @@ typedef struct
typedef struct
{
- RelFileNodeBackend rnode; /* the dead relation to delete */
+ RelFileNode rnode; /* the dead relation to delete */
CycleCtr cycle_ctr; /* mdckpt_cycle_ctr when request was made */
} PendingUnlinkEntry;
@@ -302,11 +306,11 @@ mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
/*
* mdunlink() -- Unlink a relation.
*
- * Note that we're passed a RelFileNode --- by the time this is called,
+ * Note that we're passed a RelFileNodeBackend --- by the time this is called,
* there won't be an SMgrRelation hashtable entry anymore.
*
- * Actually, we don't unlink the first segment file of the relation, but
- * just truncate it to zero length, and record a request to unlink it after
+ * For regular relations, we don't unlink the first segment file of the rel,
+ * but just truncate it to zero length, and record a request to unlink it after
* the next checkpoint. Additional segments can be unlinked immediately,
* however. Leaving the empty file in place prevents that relfilenode
* number from being reused. The scenario this protects us from is:
@@ -323,6 +327,12 @@ mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
* number until it's safe, because relfilenode assignment skips over any
* existing file.
*
+ * We do not need to go through this dance for temp relations, though, because
+ * we never make WAL entries for temp rels, and so a temp rel poses no threat
+ * to the health of a regular rel that has taken over its relfilenode number.
+ * The fact that temp rels and regular rels have different file naming
+ * patterns provides additional safety.
+ *
* All the above applies only to the relation's main fork; other forks can
* just be removed immediately, since they are not needed to prevent the
* relfilenode number from being recycled. Also, we do not carefully
@@ -345,16 +355,18 @@ mdunlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
/*
* We have to clean out any pending fsync requests for the doomed
- * relation, else the next mdsync() will fail.
+ * relation, else the next mdsync() will fail. There can't be any such
+ * requests for a temp relation, though.
*/
- ForgetRelationFsyncRequests(rnode, forkNum);
+ if (!RelFileNodeBackendIsTemp(rnode))
+ ForgetRelationFsyncRequests(rnode.node, forkNum);
path = relpath(rnode, forkNum);
/*
* Delete or truncate the first segment.
*/
- if (isRedo || forkNum != MAIN_FORKNUM)
+ if (isRedo || forkNum != MAIN_FORKNUM || RelFileNodeBackendIsTemp(rnode))
{
ret = unlink(path);
if (ret < 0 && errno != ENOENT)
@@ -1077,12 +1089,8 @@ mdsync(void)
* This fact justifies our not closing the reln in the success
* path either, which is a good thing since in
* non-checkpointer cases we couldn't safely do that.)
- * Furthermore, in many cases the relation will have been
- * dirtied through this same smgr relation, and so we can save
- * a file open/close cycle.
*/
- reln = smgropen(entry->tag.rnode.node,
- entry->tag.rnode.backend);
+ reln = smgropen(entry->tag.rnode, InvalidBackendId);
/*
* It is possible that the relation has been dropped or
@@ -1228,7 +1236,7 @@ mdpostckpt(void)
Assert((CycleCtr) (entry->cycle_ctr + 1) == mdckpt_cycle_ctr);
/* Unlink the file */
- path = relpath(entry->rnode, MAIN_FORKNUM);
+ path = relpathperm(entry->rnode, MAIN_FORKNUM);
if (unlink(path) < 0)
{
/*
@@ -1255,21 +1263,24 @@ mdpostckpt(void)
*
* If there is a local pending-ops table, just make an entry in it for
* mdsync to process later. Otherwise, try to pass off the fsync request
- * to the background writer process. If that fails, just do the fsync
- * locally before returning (we expect this will not happen often enough
+ * to the checkpointer process. If that fails, just do the fsync
+ * locally before returning (we hope this will not happen often enough
* to be a performance problem).
*/
static void
register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
{
+ /* Temp relations should never be fsync'd */
+ Assert(!SmgrIsTemp(reln));
+
if (pendingOpsTable)
{
/* push it into local pending-ops table */
- RememberFsyncRequest(reln->smgr_rnode, forknum, seg->mdfd_segno);
+ RememberFsyncRequest(reln->smgr_rnode.node, forknum, seg->mdfd_segno);
}
else
{
- if (ForwardFsyncRequest(reln->smgr_rnode, forknum, seg->mdfd_segno))
+ if (ForwardFsyncRequest(reln->smgr_rnode.node, forknum, seg->mdfd_segno))
return; /* passed it off successfully */
ereport(DEBUG1,
@@ -1286,16 +1297,23 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
/*
* register_unlink() -- Schedule a file to be deleted after next checkpoint
*
+ * We don't bother passing in the fork number, because this is only used
+ * with main forks.
+ *
* As with register_dirty_segment, this could involve either a local or
* a remote pending-ops table.
*/
static void
register_unlink(RelFileNodeBackend rnode)
{
+ /* Should never be used with temp relations */
+ Assert(!RelFileNodeBackendIsTemp(rnode));
+
if (pendingOpsTable)
{
/* push it into local pending-ops table */
- RememberFsyncRequest(rnode, MAIN_FORKNUM, UNLINK_RELATION_REQUEST);
+ RememberFsyncRequest(rnode.node, MAIN_FORKNUM,
+ UNLINK_RELATION_REQUEST);
}
else
{
@@ -1307,7 +1325,7 @@ register_unlink(RelFileNodeBackend rnode)
* XXX should we just leave the file orphaned instead?
*/
Assert(IsUnderPostmaster);
- while (!ForwardFsyncRequest(rnode, MAIN_FORKNUM,
+ while (!ForwardFsyncRequest(rnode.node, MAIN_FORKNUM,
UNLINK_RELATION_REQUEST))
pg_usleep(10000L); /* 10 msec seems a good number */
}
@@ -1333,8 +1351,7 @@ register_unlink(RelFileNodeBackend rnode)
* structure for them.)
*/
void
-RememberFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum,
- BlockNumber segno)
+RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
{
Assert(pendingOpsTable);
@@ -1347,7 +1364,7 @@ RememberFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum,
hash_seq_init(&hstat, pendingOpsTable);
while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
{
- if (RelFileNodeBackendEquals(entry->tag.rnode, rnode) &&
+ if (RelFileNodeEquals(entry->tag.rnode, rnode) &&
entry->tag.forknum == forknum)
{
/* Okay, cancel this entry */
@@ -1368,7 +1385,7 @@ RememberFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum,
hash_seq_init(&hstat, pendingOpsTable);
while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
{
- if (entry->tag.rnode.node.dbNode == rnode.node.dbNode)
+ if (entry->tag.rnode.dbNode == rnode.dbNode)
{
/* Okay, cancel this entry */
entry->canceled = true;
@@ -1382,7 +1399,7 @@ RememberFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum,
PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);
next = lnext(cell);
- if (entry->rnode.node.dbNode == rnode.node.dbNode)
+ if (entry->rnode.dbNode == rnode.dbNode)
{
pendingUnlinks = list_delete_cell(pendingUnlinks, cell, prev);
pfree(entry);
@@ -1397,6 +1414,9 @@ RememberFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum,
MemoryContext oldcxt = MemoryContextSwitchTo(MdCxt);
PendingUnlinkEntry *entry;
+ /* PendingUnlinkEntry doesn't store forknum, since it's always MAIN */
+ Assert(forknum == MAIN_FORKNUM);
+
entry = palloc(sizeof(PendingUnlinkEntry));
entry->rnode = rnode;
entry->cycle_ctr = mdckpt_cycle_ctr;
@@ -1446,10 +1466,10 @@ RememberFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum,
}
/*
- * ForgetRelationFsyncRequests -- forget any fsyncs for a rel
+ * ForgetRelationFsyncRequests -- forget any fsyncs for a relation fork
*/
void
-ForgetRelationFsyncRequests(RelFileNodeBackend rnode, ForkNumber forknum)
+ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum)
{
if (pendingOpsTable)
{
@@ -1484,12 +1504,11 @@ ForgetRelationFsyncRequests(RelFileNodeBackend rnode, ForkNumber forknum)
void
ForgetDatabaseFsyncRequests(Oid dbid)
{
- RelFileNodeBackend rnode;
+ RelFileNode rnode;
- rnode.node.dbNode = dbid;
- rnode.node.spcNode = 0;
- rnode.node.relNode = 0;
- rnode.backend = InvalidBackendId;
+ rnode.dbNode = dbid;
+ rnode.spcNode = 0;
+ rnode.relNode = 0;
if (pendingOpsTable)
{
diff --git a/src/include/postmaster/bgwriter.h b/src/include/postmaster/bgwriter.h
index f46d4ad7077..be5012a3b48 100644
--- a/src/include/postmaster/bgwriter.h
+++ b/src/include/postmaster/bgwriter.h
@@ -31,7 +31,7 @@ extern void CheckpointerMain(void);
extern void RequestCheckpoint(int flags);
extern void CheckpointWriteDelay(int flags, double progress);
-extern bool ForwardFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum,
+extern bool ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum,
BlockNumber segno);
extern void AbsorbFsyncRequests(void);
diff --git a/src/include/storage/relfilenode.h b/src/include/storage/relfilenode.h
index 60c38295375..5ec1d8f7177 100644
--- a/src/include/storage/relfilenode.h
+++ b/src/include/storage/relfilenode.h
@@ -69,6 +69,10 @@ typedef enum ForkNumber
* Note: in pg_class, relfilenode can be zero to denote that the relation
* is a "mapped" relation, whose current true filenode number is available
* from relmapper.c. Again, this case is NOT allowed in RelFileNodes.
+ *
+ * Note: various places use RelFileNode in hashtable keys. Therefore,
+ * there *must not* be any unused padding bytes in this struct. That
+ * should be safe as long as all the fields are of type Oid.
*/
typedef struct RelFileNode
{
@@ -79,7 +83,11 @@ typedef struct RelFileNode
/*
* Augmenting a relfilenode with the backend ID provides all the information
- * we need to locate the physical storage.
+ * we need to locate the physical storage. The backend ID is InvalidBackendId
+ * for regular relations (those accessible to more than one backend), or the
+ * owning backend's ID for backend-local relations. Backend-local relations
+ * are always transient and removed in case of a database crash; they are
+ * never WAL-logged or fsync'd.
*/
typedef struct RelFileNodeBackend
{
@@ -87,11 +95,15 @@ typedef struct RelFileNodeBackend
BackendId backend;
} RelFileNodeBackend;
+#define RelFileNodeBackendIsTemp(rnode) \
+ ((rnode).backend != InvalidBackendId)
+
/*
* Note: RelFileNodeEquals and RelFileNodeBackendEquals compare relNode first
* since that is most likely to be different in two unequal RelFileNodes. It
* is probably redundant to compare spcNode if the other fields are found equal,
- * but do it anyway to be sure.
+ * but do it anyway to be sure. Likewise for checking the backend ID in
+ * RelFileNodeBackendEquals.
*/
#define RelFileNodeEquals(node1, node2) \
((node1).relNode == (node2).relNode && \
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index f8fc2b2d6e8..3560d539076 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -69,7 +69,7 @@ typedef struct SMgrRelationData
typedef SMgrRelationData *SMgrRelation;
#define SmgrIsTemp(smgr) \
- ((smgr)->smgr_rnode.backend != InvalidBackendId)
+ RelFileNodeBackendIsTemp((smgr)->smgr_rnode)
extern void smgrinit(void);
extern SMgrRelation smgropen(RelFileNode rnode, BackendId backend);
@@ -124,10 +124,9 @@ extern void mdsync(void);
extern void mdpostckpt(void);
extern void SetForwardFsyncRequests(void);
-extern void RememberFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum,
+extern void RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum,
BlockNumber segno);
-extern void ForgetRelationFsyncRequests(RelFileNodeBackend rnode,
- ForkNumber forknum);
+extern void ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum);
extern void ForgetDatabaseFsyncRequests(Oid dbid);
/* smgrtype.c */