about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- src/backend/storage/smgr/md.c 95
1 files changed, 60 insertions, 35 deletions
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 73194ec4ef3..744601f1150 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.114 2004/12/31 22:01:13 pgsql Exp $
+ * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.114.4.1 2006/11/20 01:08:10 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -33,23 +33,44 @@
* descriptors in its own descriptor pool. This is done to make it
* easier to support relations that are larger than the operating
* system's file size limit (often 2GBytes). In order to do that,
- * we break relations up into chunks of < 2GBytes and store one chunk
- * in each of several files that represent the relation. See the
- * BLCKSZ and RELSEG_SIZE configuration constants in pg_config_manual.h.
- * All chunks except the last MUST have size exactly equal to RELSEG_SIZE
- * blocks --- see mdnblocks() and mdtruncate().
+ * we break relations up into "segment" files that are each shorter than
+ * the OS file size limit. The segment size is set by the RELSEG_SIZE
+ * configuration constant in pg_config_manual.h.
+ *
+ * On disk, a relation must consist of consecutively numbered segment
+ * files in the pattern
+ * -- Zero or more full segments of exactly RELSEG_SIZE blocks each
+ * -- Exactly one partial segment of size 0 <= size < RELSEG_SIZE blocks
+ * -- Optionally, any number of inactive segments of size 0 blocks.
+ * The full and partial segments are collectively the "active" segments.
+ * Inactive segments are those that once contained data but are currently
+ * not needed because of an mdtruncate() operation. The reason for leaving
+ * them present at size zero, rather than unlinking them, is that other
+ * backends and/or the bgwriter might be holding open file references to
+ * such segments. If the relation expands again after mdtruncate(), such
+ * that a deactivated segment becomes active again, it is important that
+ * such file references still be valid --- else data might get written
+ * out to an unlinked old copy of a segment file that will eventually
+ * disappear.
*
* The file descriptor pointer (md_fd field) stored in the SMgrRelation
- * cache is, therefore, just the head of a list of MdfdVec objects.
- * But note the md_fd pointer can be NULL, indicating relation not open.
+ * cache is, therefore, just the head of a list of MdfdVec objects, one
+ * per segment. But note the md_fd pointer can be NULL, indicating
+ * relation not open.
*
- * Note that mdfd_chain == NULL does not necessarily mean the relation
+ * Also note that mdfd_chain == NULL does not necessarily mean the relation
* doesn't have another segment after this one; we may just not have
* opened the next segment yet. (We could not have "all segments are
* in the chain" as an invariant anyway, since another backend could
- * extend the relation when we weren't looking.)
+ * extend the relation when we weren't looking.) We do not make chain
+ * entries for inactive segments, however; as soon as we find a partial
+ * segment, we assume that any subsequent segments are inactive.
*
* All MdfdVec objects are palloc'd in the MdCxt memory context.
+ *
+ * Defining LET_OS_MANAGE_FILESIZE disables the segmentation logic,
+ * for use on machines that support large files. Beware: that code
+ * has not been tested in a long time and is probably bit-rotted.
*/
typedef struct _MdfdVec
@@ -75,8 +96,6 @@ static MemoryContext MdCxt; /* context for all md.c allocations */
*
* (Regular backends do not track pending operations locally, but forward
* them to the bgwriter.)
- *
- * XXX for WIN32, may want to expand this to track pending deletes, too.
*/
typedef struct
{
@@ -221,12 +240,16 @@ mdunlink(RelFileNode rnode, bool isRedo)
}
#ifndef LET_OS_MANAGE_FILESIZE
- /* Get the additional segments, if any */
+ /* Delete the additional segments, if any */
if (status)
{
char *segpath = (char *) palloc(strlen(path) + 12);
BlockNumber segno;
+ /*
+ * Note that because we loop until getting ENOENT, we will
+ * correctly remove all inactive segments as well as active ones.
+ */
for (segno = 1;; segno++)
{
sprintf(segpath, "%s.%u", path, segno);
@@ -256,15 +279,10 @@ mdunlink(RelFileNode rnode, bool isRedo)
*
* The semantics are basically the same as mdwrite(): write at the
* specified position. However, we are expecting to extend the
- * relation (ie, blocknum is the current EOF), and so in case of
+ * relation (i.e., blocknum is >= the current EOF), and so in case of
* failure we clean up by truncating.
*
* This routine returns true or false, with errno set as appropriate.
- *
- * Note: this routine used to call mdnblocks() to get the block position
- * to write at, but that's pretty silly since the caller needs to know where
- * the block will be written, and accordingly must have done mdnblocks()
- * already. Might as well pass in the position and save a seek.
*/
bool
mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
@@ -498,10 +516,10 @@ mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
/*
* mdnblocks() -- Get the number of blocks stored in a relation.
*
- * Important side effect: all segments of the relation are opened
+ * Important side effect: all active segments of the relation are opened
* and added to the mdfd_chain list. If this routine has not been
* called, then only segments up to the last one actually touched
- * are present in the chain...
+ * are present in the chain.
*
* Returns # of blocks, or InvalidBlockNumber on error.
*/
@@ -518,10 +536,14 @@ mdnblocks(SMgrRelation reln)
* Skip through any segments that aren't the last one, to avoid
* redundant seeks on them. We have previously verified that these
* segments are exactly RELSEG_SIZE long, and it's useless to recheck
- * that each time. (NOTE: this assumption could only be wrong if
- * another backend has truncated the relation. We rely on higher code
- * levels to handle that scenario by closing and re-opening the md
- * fd.)
+ * that each time.
+ *
+ * NOTE: this assumption could only be wrong if another backend has
+ * truncated the relation. We rely on higher code levels to handle that
+ * scenario by closing and re-opening the md fd, which is handled via
+ * relcache flush. (Since the bgwriter doesn't participate in relcache
+ * flush, it could have segment chain entries for inactive segments;
+ * that's OK because the bgwriter never needs to compute relation size.)
*/
while (v->mdfd_chain != NULL)
{
@@ -579,8 +601,8 @@ mdtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
#endif
/*
- * NOTE: mdnblocks makes sure we have opened all existing segments, so
- * that truncate/delete loop will get them all!
+ * NOTE: mdnblocks makes sure we have opened all active segments, so
+ * that the truncation loop will get them all!
*/
curnblk = mdnblocks(reln);
if (curnblk == InvalidBlockNumber)
@@ -601,14 +623,17 @@ mdtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
if (priorblocks > nblocks)
{
/*
- * This segment is no longer wanted at all (and has already
- * been unlinked from the mdfd_chain). We truncate the file
- * before deleting it because if other backends are holding
- * the file open, the unlink will fail on some platforms.
- * Better a zero-size file gets left around than a big file...
+ * This segment is no longer active (and has already been
+ * detached from the mdfd_chain). We truncate the file, but do
+ * not delete it, for reasons explained in the header comments.
*/
- FileTruncate(v->mdfd_vfd, 0);
- FileUnlink(v->mdfd_vfd);
+ if (FileTruncate(v->mdfd_vfd, 0) < 0)
+ return InvalidBlockNumber;
+ if (!isTemp)
+ {
+ if (!register_dirty_segment(reln, v))
+ return InvalidBlockNumber;
+ }
v = v->mdfd_chain;
Assert(ov != reln->md_fd); /* we never drop the 1st segment */
pfree(ov);
@@ -668,7 +693,7 @@ mdimmedsync(SMgrRelation reln)
BlockNumber curnblk;
/*
- * NOTE: mdnblocks makes sure we have opened all existing segments, so
+ * NOTE: mdnblocks makes sure we have opened all active segments, so
* that fsync loop will get them all!
*/
curnblk = mdnblocks(reln);