about summary refs log tree commit diff
path: root/src/backend/storage/smgr/md.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/storage/smgr/md.c')
-rw-r--r-- src/backend/storage/smgr/md.c 507
1 files changed, 316 insertions, 191 deletions
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index f58ab03ce42..e0899a54600 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.123 2006/11/20 01:07:56 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.124 2007/01/03 18:11:01 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -22,6 +22,7 @@
#include "miscadmin.h"
#include "postmaster/bgwriter.h"
#include "storage/fd.h"
+#include "storage/bufmgr.h"
#include "storage/smgr.h"
#include "utils/hsearch.h"
#include "utils/memutils.h"
@@ -108,9 +109,16 @@ typedef struct
static HTAB *pendingOpsTable = NULL;
+typedef enum /* behavior for mdopen & _mdfd_getseg */
+{
+ EXTENSION_FAIL, /* ereport if segment not present */
+ EXTENSION_RETURN_NULL, /* return NULL if not present */
+ EXTENSION_CREATE /* create new segments as needed */
+} ExtensionBehavior;
+
/* local routines */
-static MdfdVec *mdopen(SMgrRelation reln, bool allowNotFound);
-static bool register_dirty_segment(SMgrRelation reln, MdfdVec *seg);
+static MdfdVec *mdopen(SMgrRelation reln, ExtensionBehavior behavior);
+static void register_dirty_segment(SMgrRelation reln, MdfdVec *seg);
static MdfdVec *_fdvec_alloc(void);
#ifndef LET_OS_MANAGE_FILESIZE
@@ -118,14 +126,14 @@ static MdfdVec *_mdfd_openseg(SMgrRelation reln, BlockNumber segno,
int oflags);
#endif
static MdfdVec *_mdfd_getseg(SMgrRelation reln, BlockNumber blkno,
- bool allowNotFound);
-static BlockNumber _mdnblocks(File file, Size blcksz);
+ bool isTemp, ExtensionBehavior behavior);
+static BlockNumber _mdnblocks(SMgrRelation reln, MdfdVec *seg);
/*
* mdinit() -- Initialize private state for magnetic disk storage manager.
*/
-bool
+void
mdinit(void)
{
MdCxt = AllocSetContextCreate(TopMemoryContext,
@@ -154,8 +162,6 @@ mdinit(void)
&hash_ctl,
HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
}
-
- return true;
}
/*
@@ -163,14 +169,14 @@ mdinit(void)
*
* If isRedo is true, it's okay for the relation to exist already.
*/
-bool
+void
mdcreate(SMgrRelation reln, bool isRedo)
{
char *path;
File fd;
if (isRedo && reln->md_fd != NULL)
- return true; /* created and opened already... */
+ return; /* created and opened already... */
Assert(reln->md_fd == NULL);
@@ -193,11 +199,15 @@ mdcreate(SMgrRelation reln, bool isRedo)
if (fd < 0)
{
pfree(path);
- /* be sure to return the error reported by create, not open */
+ /* be sure to report the error from the create attempt, not the one from open */
errno = save_errno;
- return false;
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not create relation %u/%u/%u: %m",
+ reln->smgr_rnode.spcNode,
+ reln->smgr_rnode.dbNode,
+ reln->smgr_rnode.relNode)));
}
- errno = 0;
}
pfree(path);
@@ -209,8 +219,6 @@ mdcreate(SMgrRelation reln, bool isRedo)
#ifndef LET_OS_MANAGE_FILESIZE
reln->md_fd->mdfd_chain = NULL;
#endif
-
- return true;
}
/*
@@ -220,12 +228,12 @@ mdcreate(SMgrRelation reln, bool isRedo)
* there won't be an SMgrRelation hashtable entry anymore.
*
* If isRedo is true, it's okay for the relation to be already gone.
+ * Also, any failure should be reported as WARNING not ERROR, because
+ * we are usually not in a transaction anymore when this is called.
*/
-bool
+void
mdunlink(RelFileNode rnode, bool isRedo)
{
- bool status = true;
- int save_errno = 0;
char *path;
path = relpath(rnode);
@@ -234,15 +242,17 @@ mdunlink(RelFileNode rnode, bool isRedo)
if (unlink(path) < 0)
{
if (!isRedo || errno != ENOENT)
- {
- status = false;
- save_errno = errno;
- }
+ ereport(WARNING,
+ (errcode_for_file_access(),
+ errmsg("could not remove relation %u/%u/%u: %m",
+ rnode.spcNode,
+ rnode.dbNode,
+ rnode.relNode)));
}
#ifndef LET_OS_MANAGE_FILESIZE
/* Delete the additional segments, if any */
- if (status)
+ else
{
char *segpath = (char *) palloc(strlen(path) + 12);
BlockNumber segno;
@@ -258,10 +268,13 @@ mdunlink(RelFileNode rnode, bool isRedo)
{
/* ENOENT is expected after the last segment... */
if (errno != ENOENT)
- {
- status = false;
- save_errno = errno;
- }
+ ereport(WARNING,
+ (errcode_for_file_access(),
+ errmsg("could not remove segment %u of relation %u/%u/%u: %m",
+ segno,
+ rnode.spcNode,
+ rnode.dbNode,
+ rnode.relNode)));
break;
}
}
@@ -270,29 +283,44 @@ mdunlink(RelFileNode rnode, bool isRedo)
#endif
pfree(path);
-
- errno = save_errno;
- return status;
}
/*
* mdextend() -- Add a block to the specified relation.
*
- * The semantics are basically the same as mdwrite(): write at the
- * specified position. However, we are expecting to extend the
- * relation (ie, blocknum is >= the current EOF), and so in case of
- * failure we clean up by truncating.
- *
- * This routine returns true or false, with errno set as appropriate.
+ * The semantics are nearly the same as mdwrite(): write at the
+ * specified position. However, this is to be used for the case of
+ * extending a relation (i.e., blocknum is at or beyond the current
+ * EOF). Note that we assume writing a block beyond current EOF
+ * causes intervening file space to become filled with zeroes.
*/
-bool
+void
mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
{
long seekpos;
int nbytes;
MdfdVec *v;
- v = _mdfd_getseg(reln, blocknum, false);
+ /* This assert is too expensive to have on normally ... */
+#ifdef CHECK_WRITE_VS_EXTEND
+ Assert(blocknum >= mdnblocks(reln));
+#endif
+
+ /*
+ * If a relation manages to grow to 2^32-1 blocks, refuse to extend it
+ * any more --- we mustn't create a block whose number
+ * actually is InvalidBlockNumber.
+ */
+ if (blocknum == InvalidBlockNumber)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("cannot extend relation %u/%u/%u beyond %u blocks",
+ reln->smgr_rnode.spcNode,
+ reln->smgr_rnode.dbNode,
+ reln->smgr_rnode.relNode,
+ InvalidBlockNumber)));
+
+ v = _mdfd_getseg(reln, blocknum, isTemp, EXTENSION_CREATE);
#ifndef LET_OS_MANAGE_FILESIZE
seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
@@ -302,52 +330,64 @@ mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
#endif
/*
- * Note: because caller obtained blocknum by calling _mdnblocks, which did
- * a seek(SEEK_END), this seek is often redundant and will be optimized
- * away by fd.c. It's not redundant, however, if there is a partial page
- * at the end of the file. In that case we want to try to overwrite the
- * partial page with a full page. It's also not redundant if bufmgr.c had
- * to dump another buffer of the same file to make room for the new page's
- * buffer.
+ * Note: because caller usually obtained blocknum by calling mdnblocks,
+ * which did a seek(SEEK_END), this seek is often redundant and will be
+ * optimized away by fd.c. It's not redundant, however, if there is a
+ * partial page at the end of the file. In that case we want to try to
+ * overwrite the partial page with a full page. It's also not redundant
+ * if bufmgr.c had to dump another buffer of the same file to make room
+ * for the new page's buffer.
*/
if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
- return false;
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not seek to block %u of relation %u/%u/%u: %m",
+ blocknum,
+ reln->smgr_rnode.spcNode,
+ reln->smgr_rnode.dbNode,
+ reln->smgr_rnode.relNode)));
if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
{
- if (nbytes > 0)
- {
- int save_errno = errno;
-
- /* Remove the partially-written page */
- FileTruncate(v->mdfd_vfd, seekpos);
- FileSeek(v->mdfd_vfd, seekpos, SEEK_SET);
- errno = save_errno;
- }
- return false;
+ if (nbytes < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not extend relation %u/%u/%u: %m",
+ reln->smgr_rnode.spcNode,
+ reln->smgr_rnode.dbNode,
+ reln->smgr_rnode.relNode),
+ errhint("Check free disk space.")));
+ /* short write: complain appropriately */
+ ereport(ERROR,
+ (errcode(ERRCODE_DISK_FULL),
+ errmsg("could not extend relation %u/%u/%u: wrote only %d of %d bytes at block %u",
+ reln->smgr_rnode.spcNode,
+ reln->smgr_rnode.dbNode,
+ reln->smgr_rnode.relNode,
+ nbytes, BLCKSZ, blocknum),
+ errhint("Check free disk space.")));
}
if (!isTemp)
- {
- if (!register_dirty_segment(reln, v))
- return false;
- }
+ register_dirty_segment(reln, v);
#ifndef LET_OS_MANAGE_FILESIZE
- Assert(_mdnblocks(v->mdfd_vfd, BLCKSZ) <= ((BlockNumber) RELSEG_SIZE));
+ Assert(_mdnblocks(reln, v) <= ((BlockNumber) RELSEG_SIZE));
#endif
-
- return true;
}
/*
- * mdopen() -- Open the specified relation. ereport's on failure.
- * (Optionally, can return NULL instead of ereport for ENOENT.)
+ * mdopen() -- Open the specified relation.
*
* Note we only open the first segment, when there are multiple segments.
+ *
+ * If first segment is not present, either ereport or return NULL according
+ * to "behavior". We treat EXTENSION_CREATE the same as EXTENSION_FAIL;
+ * EXTENSION_CREATE means it's OK to extend an existing relation, not to
+ * invent one out of whole cloth.
*/
static MdfdVec *
-mdopen(SMgrRelation reln, bool allowNotFound)
+mdopen(SMgrRelation reln, ExtensionBehavior behavior)
{
MdfdVec *mdfd;
char *path;
@@ -374,7 +414,7 @@ mdopen(SMgrRelation reln, bool allowNotFound)
if (fd < 0)
{
pfree(path);
- if (allowNotFound && errno == ENOENT)
+ if (behavior == EXTENSION_RETURN_NULL && errno == ENOENT)
return NULL;
ereport(ERROR,
(errcode_for_file_access(),
@@ -393,7 +433,7 @@ mdopen(SMgrRelation reln, bool allowNotFound)
mdfd->mdfd_segno = 0;
#ifndef LET_OS_MANAGE_FILESIZE
mdfd->mdfd_chain = NULL;
- Assert(_mdnblocks(fd, BLCKSZ) <= ((BlockNumber) RELSEG_SIZE));
+ Assert(_mdnblocks(reln, mdfd) <= ((BlockNumber) RELSEG_SIZE));
#endif
return mdfd;
@@ -401,17 +441,15 @@ mdopen(SMgrRelation reln, bool allowNotFound)
/*
* mdclose() -- Close the specified relation, if it isn't closed already.
- *
- * Returns true or false with errno set as appropriate.
*/
-bool
+void
mdclose(SMgrRelation reln)
{
MdfdVec *v = reln->md_fd;
/* No work if already closed */
if (v == NULL)
- return true;
+ return;
reln->md_fd = NULL; /* prevent dangling pointer after error */
@@ -432,22 +470,19 @@ mdclose(SMgrRelation reln)
FileClose(v->mdfd_vfd);
pfree(v);
#endif
-
- return true;
}
/*
* mdread() -- Read the specified block from a relation.
*/
-bool
+void
mdread(SMgrRelation reln, BlockNumber blocknum, char *buffer)
{
- bool status;
long seekpos;
int nbytes;
MdfdVec *v;
- v = _mdfd_getseg(reln, blocknum, false);
+ v = _mdfd_getseg(reln, blocknum, false, EXTENSION_FAIL);
#ifndef LET_OS_MANAGE_FILESIZE
seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
@@ -457,39 +492,66 @@ mdread(SMgrRelation reln, BlockNumber blocknum, char *buffer)
#endif
if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
- return false;
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not seek to block %u of relation %u/%u/%u: %m",
+ blocknum,
+ reln->smgr_rnode.spcNode,
+ reln->smgr_rnode.dbNode,
+ reln->smgr_rnode.relNode)));
- status = true;
if ((nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
{
+ if (nbytes < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not read block %u of relation %u/%u/%u: %m",
+ blocknum,
+ reln->smgr_rnode.spcNode,
+ reln->smgr_rnode.dbNode,
+ reln->smgr_rnode.relNode)));
/*
- * If we are at or past EOF, return zeroes without complaining. Also
- * substitute zeroes if we found a partial block at EOF.
- *
- * XXX this is really ugly, bad design. However the current
- * implementation of hash indexes requires it, because hash index
- * pages are initialized out-of-order.
+ * Short read: we are at or past EOF, or we read a partial block at
+ * EOF. Normally this is an error; upper levels should never try to
+ * read a nonexistent block. However, if zero_damaged_pages is ON
+ * or we are InRecovery, we should instead return zeroes without
+ * complaining. This allows, for example, the case of trying to
+ * update a block that was later truncated away.
*/
- if (nbytes == 0 ||
- (nbytes > 0 && mdnblocks(reln) == blocknum))
+ if (zero_damaged_pages || InRecovery)
MemSet(buffer, 0, BLCKSZ);
else
- status = false;
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("could not read block %u of relation %u/%u/%u: read only %d of %d bytes",
+ blocknum,
+ reln->smgr_rnode.spcNode,
+ reln->smgr_rnode.dbNode,
+ reln->smgr_rnode.relNode,
+ nbytes, BLCKSZ)));
}
-
- return status;
}
/*
* mdwrite() -- Write the supplied block at the appropriate location.
+ *
+ * This is to be used only for updating already-existing blocks of a
+ * relation (i.e., those before the current EOF). To extend a relation,
+ * use mdextend().
*/
-bool
+void
mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
{
long seekpos;
+ int nbytes;
MdfdVec *v;
- v = _mdfd_getseg(reln, blocknum, false);
+ /* This assert is too expensive to have on normally ... */
+#ifdef CHECK_WRITE_VS_EXTEND
+ Assert(blocknum < mdnblocks(reln));
+#endif
+
+ v = _mdfd_getseg(reln, blocknum, isTemp, EXTENSION_FAIL);
#ifndef LET_OS_MANAGE_FILESIZE
seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
@@ -499,18 +561,38 @@ mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
#endif
if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
- return false;
-
- if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ)
- return false;
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not seek to block %u of relation %u/%u/%u: %m",
+ blocknum,
+ reln->smgr_rnode.spcNode,
+ reln->smgr_rnode.dbNode,
+ reln->smgr_rnode.relNode)));
- if (!isTemp)
+ if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
{
- if (!register_dirty_segment(reln, v))
- return false;
+ if (nbytes < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not write block %u of relation %u/%u/%u: %m",
+ blocknum,
+ reln->smgr_rnode.spcNode,
+ reln->smgr_rnode.dbNode,
+ reln->smgr_rnode.relNode)));
+ /* short write: complain appropriately */
+ ereport(ERROR,
+ (errcode(ERRCODE_DISK_FULL),
+ errmsg("could not write block %u of relation %u/%u/%u: wrote only %d of %d bytes",
+ blocknum,
+ reln->smgr_rnode.spcNode,
+ reln->smgr_rnode.dbNode,
+ reln->smgr_rnode.relNode,
+ nbytes, BLCKSZ),
+ errhint("Check free disk space.")));
}
- return true;
+ if (!isTemp)
+ register_dirty_segment(reln, v);
}
/*
@@ -520,13 +602,11 @@ mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
* and added to the mdfd_chain list. If this routine has not been
* called, then only segments up to the last one actually touched
* are present in the chain.
- *
- * Returns # of blocks, or InvalidBlockNumber on error.
*/
BlockNumber
mdnblocks(SMgrRelation reln)
{
- MdfdVec *v = mdopen(reln, false);
+ MdfdVec *v = mdopen(reln, EXTENSION_FAIL);
#ifndef LET_OS_MANAGE_FILESIZE
BlockNumber nblocks;
@@ -552,7 +632,7 @@ mdnblocks(SMgrRelation reln)
for (;;)
{
- nblocks = _mdnblocks(v->mdfd_vfd, BLCKSZ);
+ nblocks = _mdnblocks(reln, v);
if (nblocks > ((BlockNumber) RELSEG_SIZE))
elog(FATAL, "segment too big");
if (nblocks < ((BlockNumber) RELSEG_SIZE))
@@ -573,22 +653,26 @@ mdnblocks(SMgrRelation reln)
*/
v->mdfd_chain = _mdfd_openseg(reln, segno, O_CREAT);
if (v->mdfd_chain == NULL)
- return InvalidBlockNumber; /* failed? */
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not open segment %u of relation %u/%u/%u: %m",
+ segno,
+ reln->smgr_rnode.spcNode,
+ reln->smgr_rnode.dbNode,
+ reln->smgr_rnode.relNode)));
}
v = v->mdfd_chain;
}
#else
- return _mdnblocks(v->mdfd_vfd, BLCKSZ);
+ return _mdnblocks(reln, v);
#endif
}
/*
* mdtruncate() -- Truncate relation to specified number of blocks.
- *
- * Returns # of blocks or InvalidBlockNumber on error.
*/
-BlockNumber
+void
mdtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
{
MdfdVec *v;
@@ -603,14 +687,22 @@ mdtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
* that truncation loop will get them all!
*/
curnblk = mdnblocks(reln);
- if (curnblk == InvalidBlockNumber)
- return InvalidBlockNumber; /* mdnblocks failed */
if (nblocks > curnblk)
- return InvalidBlockNumber; /* bogus request */
+ {
+ /* Bogus request ... but no complaint if InRecovery */
+ if (InRecovery)
+ return;
+ ereport(ERROR,
+ (errmsg("could not truncate relation %u/%u/%u to %u blocks: it's only %u blocks now",
+ reln->smgr_rnode.spcNode,
+ reln->smgr_rnode.dbNode,
+ reln->smgr_rnode.relNode,
+ nblocks, curnblk)));
+ }
if (nblocks == curnblk)
- return nblocks; /* no work */
+ return; /* no work */
- v = mdopen(reln, false);
+ v = mdopen(reln, EXTENSION_FAIL);
#ifndef LET_OS_MANAGE_FILESIZE
priorblocks = 0;
@@ -626,12 +718,15 @@ mdtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
* not delete it, for reasons explained in the header comments.
*/
if (FileTruncate(v->mdfd_vfd, 0) < 0)
- return InvalidBlockNumber;
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not truncate relation %u/%u/%u to %u blocks: %m",
+ reln->smgr_rnode.spcNode,
+ reln->smgr_rnode.dbNode,
+ reln->smgr_rnode.relNode,
+ nblocks)));
if (!isTemp)
- {
- if (!register_dirty_segment(reln, v))
- return InvalidBlockNumber;
- }
+ register_dirty_segment(reln, v);
v = v->mdfd_chain;
Assert(ov != reln->md_fd); /* we never drop the 1st segment */
pfree(ov);
@@ -649,12 +744,15 @@ mdtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
BlockNumber lastsegblocks = nblocks - priorblocks;
if (FileTruncate(v->mdfd_vfd, lastsegblocks * BLCKSZ) < 0)
- return InvalidBlockNumber;
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not truncate relation %u/%u/%u to %u blocks: %m",
+ reln->smgr_rnode.spcNode,
+ reln->smgr_rnode.dbNode,
+ reln->smgr_rnode.relNode,
+ nblocks)));
if (!isTemp)
- {
- if (!register_dirty_segment(reln, v))
- return InvalidBlockNumber;
- }
+ register_dirty_segment(reln, v);
v = v->mdfd_chain;
ov->mdfd_chain = NULL;
}
@@ -670,15 +768,16 @@ mdtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
}
#else
if (FileTruncate(v->mdfd_vfd, nblocks * BLCKSZ) < 0)
- return InvalidBlockNumber;
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not truncate relation %u/%u/%u to %u blocks: %m",
+ reln->smgr_rnode.spcNode,
+ reln->smgr_rnode.dbNode,
+ reln->smgr_rnode.relNode,
+ nblocks)));
if (!isTemp)
- {
- if (!register_dirty_segment(reln, v))
- return InvalidBlockNumber;
- }
+ register_dirty_segment(reln, v);
#endif
-
- return nblocks;
}
/*
@@ -687,7 +786,7 @@ mdtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
* Note that only writes already issued are synced; this routine knows
* nothing of dirty buffers that may exist inside the buffer manager.
*/
-bool
+void
mdimmedsync(SMgrRelation reln)
{
MdfdVec *v;
@@ -698,24 +797,32 @@ mdimmedsync(SMgrRelation reln)
* that fsync loop will get them all!
*/
curnblk = mdnblocks(reln);
- if (curnblk == InvalidBlockNumber)
- return false; /* mdnblocks failed */
- v = mdopen(reln, false);
+ v = mdopen(reln, EXTENSION_FAIL);
#ifndef LET_OS_MANAGE_FILESIZE
while (v != NULL)
{
if (FileSync(v->mdfd_vfd) < 0)
- return false;
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
+ v->mdfd_segno,
+ reln->smgr_rnode.spcNode,
+ reln->smgr_rnode.dbNode,
+ reln->smgr_rnode.relNode)));
v = v->mdfd_chain;
}
#else
if (FileSync(v->mdfd_vfd) < 0)
- return false;
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
+ v->mdfd_segno,
+ reln->smgr_rnode.spcNode,
+ reln->smgr_rnode.dbNode,
+ reln->smgr_rnode.relNode)));
#endif
-
- return true;
}
/*
@@ -724,7 +831,7 @@ mdimmedsync(SMgrRelation reln)
* This is only called during checkpoints, and checkpoints should only
* occur in processes that have created a pendingOpsTable.
*/
-bool
+void
mdsync(void)
{
HASH_SEQ_STATUS hstat;
@@ -732,7 +839,7 @@ mdsync(void)
int absorb_counter;
if (!pendingOpsTable)
- return false;
+ elog(ERROR, "cannot sync without a pendingOpsTable");
/*
* If we are in the bgwriter, the sync had better include all fsync
@@ -795,21 +902,18 @@ mdsync(void)
*/
seg = _mdfd_getseg(reln,
entry->segno * ((BlockNumber) RELSEG_SIZE),
- true);
+ false, EXTENSION_RETURN_NULL);
if (seg)
{
if (FileSync(seg->mdfd_vfd) < 0 &&
errno != ENOENT)
- {
- ereport(LOG,
+ ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
entry->segno,
entry->rnode.spcNode,
entry->rnode.dbNode,
entry->rnode.relNode)));
- return false;
- }
}
}
@@ -818,8 +922,6 @@ mdsync(void)
HASH_REMOVE, NULL) == NULL)
elog(ERROR, "pendingOpsTable corrupted");
}
-
- return true;
}
/*
@@ -830,11 +932,8 @@ mdsync(void)
* to the background writer process. If that fails, just do the fsync
* locally before returning (we expect this will not happen often enough
* to be a performance problem).
- *
- * A false result implies I/O failure during local fsync. errno will be
- * valid for error reporting.
*/
-static bool
+static void
register_dirty_segment(SMgrRelation reln, MdfdVec *seg)
{
if (pendingOpsTable)
@@ -847,17 +946,21 @@ register_dirty_segment(SMgrRelation reln, MdfdVec *seg)
entry.segno = seg->mdfd_segno;
(void) hash_search(pendingOpsTable, &entry, HASH_ENTER, NULL);
- return true;
}
else
{
if (ForwardFsyncRequest(reln->smgr_rnode, seg->mdfd_segno))
- return true;
- }
+ return; /* passed it off successfully */
- if (FileSync(seg->mdfd_vfd) < 0)
- return false;
- return true;
+ if (FileSync(seg->mdfd_vfd) < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
+ seg->mdfd_segno,
+ reln->smgr_rnode.spcNode,
+ reln->smgr_rnode.dbNode,
+ reln->smgr_rnode.relNode)));
+ }
}
/*
@@ -931,7 +1034,7 @@ _mdfd_openseg(SMgrRelation reln, BlockNumber segno, int oflags)
v->mdfd_vfd = fd;
v->mdfd_segno = segno;
v->mdfd_chain = NULL;
- Assert(_mdnblocks(fd, BLCKSZ) <= ((BlockNumber) RELSEG_SIZE));
+ Assert(_mdnblocks(reln, v) <= ((BlockNumber) RELSEG_SIZE));
/* all done */
return v;
@@ -940,51 +1043,66 @@ _mdfd_openseg(SMgrRelation reln, BlockNumber segno, int oflags)
/*
* _mdfd_getseg() -- Find the segment of the relation holding the
- * specified block. ereport's on failure.
- * (Optionally, can return NULL instead of ereport for ENOENT.)
+ * specified block.
+ *
+ * If the segment doesn't exist, we ereport, return NULL, or create the
+ * segment, according to "behavior". Note: isTemp need only be correct
+ * in the EXTENSION_CREATE case.
*/
static MdfdVec *
-_mdfd_getseg(SMgrRelation reln, BlockNumber blkno, bool allowNotFound)
+_mdfd_getseg(SMgrRelation reln, BlockNumber blkno, bool isTemp,
+ ExtensionBehavior behavior)
{
- MdfdVec *v = mdopen(reln, allowNotFound);
+ MdfdVec *v = mdopen(reln, behavior);
#ifndef LET_OS_MANAGE_FILESIZE
- BlockNumber segstogo;
+ BlockNumber targetseg;
BlockNumber nextsegno;
if (!v)
- return NULL; /* only possible if allowNotFound */
+ return NULL; /* only possible if EXTENSION_RETURN_NULL */
- for (segstogo = blkno / ((BlockNumber) RELSEG_SIZE), nextsegno = 1;
- segstogo > 0;
- nextsegno++, segstogo--)
+ targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
+ for (nextsegno = 1; nextsegno <= targetseg; nextsegno++)
{
+ Assert(nextsegno == v->mdfd_segno + 1);
+
if (v->mdfd_chain == NULL)
{
/*
- * We will create the next segment only if the target block is
- * within it. This prevents Sorcerer's Apprentice syndrome if a
- * bug at higher levels causes us to be handed a ridiculously
- * large blkno --- otherwise we could create many thousands of
- * empty segment files before reaching the "target" block. We
- * should never need to create more than one new segment per call,
- * so this restriction seems reasonable.
+ * Normally we will create new segments only if authorized by
+ * the caller (i.e., we are doing mdextend()). But when doing
+ * WAL recovery, create segments anyway; this allows cases such as
+ * replaying WAL data that has a write into a high-numbered
+ * segment of a relation that was later deleted. We want to go
+ * ahead and create the segments so we can finish out the replay.
*
- * BUT: when doing WAL recovery, disable this logic and create
- * segments unconditionally. In this case it seems better to
- * assume the given blkno is good (it presumably came from a
- * CRC-checked WAL record); furthermore this lets us cope in the
- * case where we are replaying WAL data that has a write into a
- * high-numbered segment of a relation that was later deleted. We
- * want to go ahead and create the segments so we can finish out
- * the replay.
+ * We have to maintain the invariant that segments before the
+ * last active segment are of size RELSEG_SIZE; therefore, pad
+ * them out with zeroes if needed. (This only matters if caller
+ * is extending the relation discontiguously, but that can happen
+ * in hash indexes.)
*/
- v->mdfd_chain = _mdfd_openseg(reln,
- nextsegno,
- (segstogo == 1 || InRecovery) ? O_CREAT : 0);
+ if (behavior == EXTENSION_CREATE || InRecovery)
+ {
+ if (_mdnblocks(reln, v) < RELSEG_SIZE)
+ {
+ char *zerobuf = palloc0(BLCKSZ);
+
+ mdextend(reln, nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
+ zerobuf, isTemp);
+ pfree(zerobuf);
+ }
+ v->mdfd_chain = _mdfd_openseg(reln, nextsegno, O_CREAT);
+ }
+ else
+ {
+ /* We won't create the segment if it doesn't already exist */
+ v->mdfd_chain = _mdfd_openseg(reln, nextsegno, 0);
+ }
if (v->mdfd_chain == NULL)
{
- if (allowNotFound && errno == ENOENT)
+ if (behavior == EXTENSION_RETURN_NULL && errno == ENOENT)
return NULL;
ereport(ERROR,
(errcode_for_file_access(),
@@ -1007,12 +1125,19 @@ _mdfd_getseg(SMgrRelation reln, BlockNumber blkno, bool allowNotFound)
* Get number of blocks present in a single disk file
*/
static BlockNumber
-_mdnblocks(File file, Size blcksz)
+_mdnblocks(SMgrRelation reln, MdfdVec *seg)
{
long len;
- len = FileSeek(file, 0L, SEEK_END);
+ len = FileSeek(seg->mdfd_vfd, 0L, SEEK_END);
if (len < 0)
- return 0; /* on failure, assume file is empty */
- return (BlockNumber) (len / blcksz);
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not seek to end of segment %u of relation %u/%u/%u: %m",
+ seg->mdfd_segno,
+ reln->smgr_rnode.spcNode,
+ reln->smgr_rnode.dbNode,
+ reln->smgr_rnode.relNode)));
+ /* note that this calculation will ignore any partial block at EOF */
+ return (BlockNumber) (len / BLCKSZ);
}