aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/backend/access/transam/xlog.c 74
-rw-r--r-- src/backend/storage/buffer/bufmgr.c 9
-rw-r--r-- src/backend/storage/buffer/s_lock.c 58
-rw-r--r-- src/backend/storage/file/fd.c 24
-rw-r--r-- src/include/config.h.in 18
-rw-r--r-- src/include/storage/fd.h 3
-rw-r--r-- src/include/storage/s_lock.h 17
7 files changed, 140 insertions, 63 deletions
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 7c18f0ff5a6..fbc4223034a 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -6,7 +6,7 @@
* Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $Header: /cvsroot/pgsql/src/backend/access/transam/xlog.c,v 1.53 2001/02/13 20:40:25 vadim Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/transam/xlog.c,v 1.54 2001/02/18 04:39:42 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -39,6 +39,13 @@
#include "miscadmin.h"
+
+/* Max time to wait to acquire XLog activity locks */
+#define XLOG_LOCK_TIMEOUT (5*60*1000000) /* 5 minutes */
+/* Max time to wait to acquire checkpoint lock */
+#define CHECKPOINT_LOCK_TIMEOUT (10*60*1000000) /* 10 minutes */
+
+
int XLOGbuffers = 8;
int XLOGfiles = 0; /* how many files to pre-allocate */
XLogRecPtr MyLastRecPtr = {0, 0};
@@ -178,8 +185,8 @@ typedef struct BkpBlock
/*
* We break each log file in 16Mb segments
*/
-#define XLogSegSize (16*1024*1024)
-#define XLogLastSeg (0xffffffff / XLogSegSize)
+#define XLogSegSize ((uint32) (16*1024*1024))
+#define XLogLastSeg (((uint32) 0xffffffff) / XLogSegSize)
#define XLogFileSize (XLogLastSeg * XLogSegSize)
#define NextLogSeg(_logId, _logSeg) \
@@ -423,7 +430,7 @@ begin:;
}
}
}
- S_LOCK_SLEEP(&(XLogCtl->insert_lck), i++);
+ S_LOCK_SLEEP(&(XLogCtl->insert_lck), i++, XLOG_LOCK_TIMEOUT);
if (!TAS(&(XLogCtl->insert_lck)))
break;
}
@@ -721,7 +728,7 @@ XLogFlush(XLogRecPtr record)
break;
}
}
- S_LOCK_SLEEP(&(XLogCtl->lgwr_lck), spins++);
+ S_LOCK_SLEEP(&(XLogCtl->lgwr_lck), spins++, XLOG_LOCK_TIMEOUT);
}
if (logFile >= 0 && (LgwrResult.Write.xlogid != logId ||
@@ -741,7 +748,7 @@ XLogFlush(XLogRecPtr record)
logFile = XLogFileOpen(logId, logSeg, false);
}
- if (pg_fsync(logFile) != 0)
+ if (pg_fdatasync(logFile) != 0)
elog(STOP, "fsync(logfile %u seg %u) failed: %m",
logId, logSeg);
LgwrResult.Flush = LgwrResult.Write;
@@ -826,7 +833,7 @@ GetFreeXLBuffer()
InitXLBuffer(curridx);
return;
}
- S_LOCK_SLEEP(&(XLogCtl->lgwr_lck), spins++);
+ S_LOCK_SLEEP(&(XLogCtl->lgwr_lck), spins++, XLOG_LOCK_TIMEOUT);
}
}
@@ -846,7 +853,7 @@ XLogWrite(char *buffer)
{
if (wcnt > 0)
{
- if (pg_fsync(logFile) != 0)
+ if (pg_fdatasync(logFile) != 0)
elog(STOP, "fsync(logfile %u seg %u) failed: %m",
logId, logSeg);
if (LgwrResult.Write.xlogid != logId)
@@ -928,7 +935,7 @@ XLogWrite(char *buffer)
if (XLByteLT(LgwrResult.Flush, LgwrRqst.Flush) &&
XLByteLE(LgwrRqst.Flush, LgwrResult.Write))
{
- if (pg_fsync(logFile) != 0)
+ if (pg_fdatasync(logFile) != 0)
elog(STOP, "fsync(logfile %u seg %u) failed: %m",
logId, logSeg);
LgwrResult.Flush = LgwrResult.Write;
@@ -948,13 +955,14 @@ XLogFileInit(uint32 log, uint32 seg, bool *usexistent)
{
char path[MAXPGPATH];
char tpath[MAXPGPATH];
+ char zbuffer[BLCKSZ];
int fd;
+ int nbytes;
XLogFileName(path, log, seg);
/*
- * Try to use existent file (checkpoint maker
- * creates it sometime).
+ * Try to use existent file (checkpoint maker creates it sometimes).
*/
if (*usexistent)
{
@@ -963,7 +971,7 @@ XLogFileInit(uint32 log, uint32 seg, bool *usexistent)
{
if (errno != ENOENT)
elog(STOP, "InitOpen(logfile %u seg %u) failed: %m",
- logId, logSeg);
+ logId, logSeg);
}
else
return(fd);
@@ -979,33 +987,44 @@ XLogFileInit(uint32 log, uint32 seg, bool *usexistent)
elog(STOP, "InitCreate(logfile %u seg %u) failed: %m",
logId, logSeg);
- if (lseek(fd, XLogSegSize - 1, SEEK_SET) != (off_t) (XLogSegSize - 1))
- elog(STOP, "lseek(logfile %u seg %u) failed: %m",
- logId, logSeg);
-
- if (write(fd, "", 1) != 1)
- elog(STOP, "write(logfile %u seg %u) failed: %m",
- logId, logSeg);
+ /*
+ * Zero-fill the file. We have to do this the hard way to ensure that
+ * all the file space has really been allocated --- on platforms that
+ * allow "holes" in files, just seeking to the end doesn't allocate
+ * intermediate space. This way, we know that we have all the space
+ * and (after the fsync below) that all the indirect blocks are down
+ * on disk. Therefore, fdatasync(2) will be sufficient to sync future
+ * writes to the log file.
+ */
+ MemSet(zbuffer, 0, sizeof(zbuffer));
+ for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(zbuffer))
+ {
+ if ((int) write(fd, zbuffer, sizeof(zbuffer)) != (int) sizeof(zbuffer))
+ elog(STOP, "ZeroFill(logfile %u seg %u) failed: %m",
+ logId, logSeg);
+ }
if (pg_fsync(fd) != 0)
elog(STOP, "fsync(logfile %u seg %u) failed: %m",
logId, logSeg);
- if (lseek(fd, 0, SEEK_SET) < 0)
- elog(STOP, "lseek(logfile %u seg %u off %u) failed: %m",
- log, seg, 0);
-
close(fd);
+ /*
+ * Prefer link() to rename() here just to be sure that we don't overwrite
+ * an existing logfile. However, there shouldn't be one, so rename()
+ * is an acceptable substitute except for the truly paranoid.
+ */
#ifndef __BEOS__
if (link(tpath, path) < 0)
+ elog(STOP, "InitRelink(logfile %u seg %u) failed: %m",
+ logId, logSeg);
+ unlink(tpath);
#else
if (rename(tpath, path) < 0)
-#endif
elog(STOP, "InitRelink(logfile %u seg %u) failed: %m",
logId, logSeg);
-
- unlink(tpath);
+#endif
fd = BasicOpenFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
if (fd < 0)
@@ -2101,7 +2120,8 @@ CreateCheckPoint(bool shutdown)
/* Grab lock, using larger than normal sleep between tries (1 sec) */
while (TAS(&(XLogCtl->chkp_lck)))
{
- S_LOCK_SLEEP_INTERVAL(&(XLogCtl->chkp_lck), spins++, 1000000);
+ S_LOCK_SLEEP_INTERVAL(&(XLogCtl->chkp_lck), spins++,
+ CHECKPOINT_LOCK_TIMEOUT, 1000000);
}
memset(&checkPoint, 0, sizeof(checkPoint));
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 6af9b4065d3..5c5b6b8875c 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.106 2001/01/24 19:43:05 momjian Exp $
+ * $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.107 2001/02/18 04:39:42 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -1990,6 +1990,9 @@ UnlockBuffers(void)
}
}
+/* Max time to wait to acquire a buffer read or write lock */
+#define BUFFER_LOCK_TIMEOUT (10*60*1000000) /* 10 minutes */
+
void
LockBuffer(Buffer buffer, int mode)
{
@@ -2041,7 +2044,7 @@ LockBuffer(Buffer buffer, int mode)
{
S_UNLOCK(&(buf->cntx_lock));
RESUME_INTERRUPTS();
- S_LOCK_SLEEP(&(buf->cntx_lock), i++);
+ S_LOCK_SLEEP(&(buf->cntx_lock), i++, BUFFER_LOCK_TIMEOUT);
HOLD_INTERRUPTS();
S_LOCK(&(buf->cntx_lock));
}
@@ -2069,7 +2072,7 @@ LockBuffer(Buffer buffer, int mode)
}
S_UNLOCK(&(buf->cntx_lock));
RESUME_INTERRUPTS();
- S_LOCK_SLEEP(&(buf->cntx_lock), i++);
+ S_LOCK_SLEEP(&(buf->cntx_lock), i++, BUFFER_LOCK_TIMEOUT);
HOLD_INTERRUPTS();
S_LOCK(&(buf->cntx_lock));
}
diff --git a/src/backend/storage/buffer/s_lock.c b/src/backend/storage/buffer/s_lock.c
index 3918bf00767..ef70f45d887 100644
--- a/src/backend/storage/buffer/s_lock.c
+++ b/src/backend/storage/buffer/s_lock.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/storage/buffer/Attic/s_lock.c,v 1.32 2001/01/24 19:43:06 momjian Exp $
+ * $Header: /cvsroot/pgsql/src/backend/storage/buffer/Attic/s_lock.c,v 1.33 2001/02/18 04:39:42 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -21,23 +21,39 @@
#include "storage/s_lock.h"
-/*
+/*----------
* Each time we busy spin we select the next element of this array as the
* number of microseconds to wait. This accomplishes pseudo random back-off.
- * Values are not critical but 10 milliseconds is a common platform
- * granularity.
*
- * Total time to cycle through all 20 entries might be about .07 sec,
- * so the given value of S_MAX_BUSY results in timeout after ~70 sec.
+ * Note that on most platforms, specified values will be rounded up to the
+ * next multiple of a clock tick, which is often ten milliseconds (10000).
+ * So, we are being way overoptimistic to assume that these different values
+ * are really different, other than the last. But there are a few platforms
+ * with better-than-usual timekeeping, and on these we will get pretty good
+ * pseudo-random behavior.
+ *
+ * Total time to cycle through all 20 entries will be at least 100 msec,
+ * more commonly (10 msec resolution) 220 msec, and on some platforms
+ * as much as 420 msec (when the remainder of the current tick cycle is
+ * ignored in deciding when to time out, as on FreeBSD and older Linuxen).
+ * We use the 100msec figure to figure max_spins, so actual timeouts may
+ * be as much as four times the nominal value, but will never be less.
+ *----------
*/
#define S_NSPINCYCLE 20
-#define S_MAX_BUSY 1000 * S_NSPINCYCLE
int s_spincycle[S_NSPINCYCLE] =
-{ 0, 0, 0, 0, 10000, 0, 0, 0, 10000, 0,
- 0, 10000, 0, 0, 10000, 0, 10000, 0, 10000, 10000
+{ 1, 10, 100, 1000,
+ 10000, 1000, 1000, 1000,
+ 10000, 1000, 1000, 10000,
+ 1000, 1000, 10000, 1000,
+ 10000, 1000, 10000, 30000
};
+#define AVG_SPINCYCLE 5000 /* average entry in microsec: 100ms / 20 */
+
+#define DEFAULT_TIMEOUT (100*1000000) /* default timeout: 100 sec */
+
/*
* s_lock_stuck() - complain about a stuck spinlock
@@ -58,34 +74,40 @@ s_lock_stuck(volatile slock_t *lock, const char *file, const int line)
/*
* s_lock_sleep() - sleep a pseudo-random amount of time, check for timeout
*
- * Normally 'microsec' is 0, specifying to use the next s_spincycle[] value.
+ * The 'timeout' is given in microsec, or may be 0 for "infinity". Note that
+ * this will be a lower bound (a fairly loose lower bound, on most platforms).
+ *
+ * 'microsec' is the number of microsec to delay per loop. Normally
+ * 'microsec' is 0, specifying to use the next s_spincycle[] value.
* Some callers may pass a nonzero interval, specifying to use exactly that
* delay value rather than a pseudo-random delay.
*/
void
-s_lock_sleep(unsigned spins, int microsec,
+s_lock_sleep(unsigned spins, int timeout, int microsec,
volatile slock_t *lock,
const char *file, const int line)
{
struct timeval delay;
- unsigned max_spins;
if (microsec > 0)
{
- delay.tv_sec = 0;
- delay.tv_usec = microsec;
- /* two-minute timeout in this case */
- max_spins = 120000000 / microsec;
+ delay.tv_sec = microsec / 1000000; /* callers may ask for >= 1 sec ... */
+ delay.tv_usec = microsec % 1000000; /* ... so keep tv_usec below 1000000 */
}
else
{
delay.tv_sec = 0;
delay.tv_usec = s_spincycle[spins % S_NSPINCYCLE];
- max_spins = S_MAX_BUSY;
+ microsec = AVG_SPINCYCLE; /* use average to figure timeout */
}
- if (spins > max_spins)
- s_lock_stuck(lock, file, line);
+ if (timeout > 0) /* timeout of 0 means wait forever */
+ {
+ unsigned max_spins = timeout / microsec; /* loops allowed before giving up */
+
+ if (spins > max_spins)
+ s_lock_stuck(lock, file, line);
+ }
(void) select(0, NULL, NULL, NULL, &delay);
}
@@ -110,7 +132,7 @@ s_lock(volatile slock_t *lock, const char *file, const int line)
*/
while (TAS(lock))
{
- s_lock_sleep(spins++, 0, lock, file, line);
+ s_lock_sleep(spins++, DEFAULT_TIMEOUT, 0, lock, file, line);
CHECK_FOR_INTERRUPTS();
}
}
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index 1feac25f428..c6a72b8f25d 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -7,7 +7,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/storage/file/fd.c,v 1.72 2001/02/17 01:00:04 tgl Exp $
+ * $Header: /cvsroot/pgsql/src/backend/storage/file/fd.c,v 1.73 2001/02/18 04:39:42 tgl Exp $
*
* NOTES:
*
@@ -193,7 +193,7 @@ static char *filepath(char *filename);
static long pg_nofile(void);
/*
- * pg_fsync --- same as fsync except does nothing if -F switch was given
+ * pg_fsync --- same as fsync except does nothing if enableFsync is off
*/
int
pg_fsync(int fd)
@@ -205,6 +205,26 @@ pg_fsync(int fd)
}
/*
+ * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
+ * (0 is then returned without touching fd; otherwise the sync call's own
+ * result is passed back). Platforms without fdatasync fall back to fsync.
+ */
+int
+pg_fdatasync(int fd)
+{
+ if (enableFsync)
+ {
+#ifdef HAVE_FDATASYNC
+ return fdatasync(fd); /* pass the call's result straight back */
+#else
+ return fsync(fd); /* no fdatasync here: do a full fsync instead */
+#endif
+ }
+ else
+ return 0; /* syncing disabled: callers treat 0 as success */
+}
+
+/*
* BasicOpenFile --- same as open(2) except can free other FDs if needed
*
* This is exported for use by places that really want a plain kernel FD,
diff --git a/src/include/config.h.in b/src/include/config.h.in
index 5c2dc088ef2..68e15d067b7 100644
--- a/src/include/config.h.in
+++ b/src/include/config.h.in
@@ -8,7 +8,7 @@
* or in config.h afterwards. Of course, if you edit config.h, then your
* changes will be overwritten the next time you run configure.
*
- * $Id: config.h.in,v 1.157 2001/01/22 23:28:52 tgl Exp $
+ * $Id: config.h.in,v 1.158 2001/02/18 04:39:42 tgl Exp $
*/
#ifndef CONFIG_H
@@ -548,6 +548,19 @@ extern void srandom(unsigned int seed);
*/
#define MAX_RANDOM_VALUE (0x7FFFFFFF)
+/* Define if you have dlopen() */
+#undef HAVE_DLOPEN
+
+/* Define if you have fdatasync() */
+#undef HAVE_FDATASYNC
+
+/* Define if the standard header unistd.h declares fdatasync() */
+#undef HAVE_FDATASYNC_DECL
+
+#if defined(HAVE_FDATASYNC) && !defined(HAVE_FDATASYNC_DECL)
+extern int fdatasync(int fildes);
+#endif
+
/* Set to 1 if you have libz.a */
#undef HAVE_LIBZ
@@ -611,9 +624,6 @@ extern void srandom(unsigned int seed);
/* Define if C++ compiler accepts "#include <string>" */
#undef HAVE_CXX_STRING_HEADER
-/* Define if you have dlopen() */
-#undef HAVE_DLOPEN
-
/* Define if you have the optreset variable */
#undef HAVE_INT_OPTRESET
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index fb8486b0758..46ec1fdb944 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $Id: fd.h,v 1.26 2001/01/24 19:43:27 momjian Exp $
+ * $Id: fd.h,v 1.27 2001/02/18 04:39:42 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -71,5 +71,6 @@ extern int BasicOpenFile(FileName fileName, int fileFlags, int fileMode);
extern void closeAllVfds(void);
extern void AtEOXact_Files(void);
extern int pg_fsync(int fd);
+extern int pg_fdatasync(int fd);
#endif /* FD_H */
diff --git a/src/include/storage/s_lock.h b/src/include/storage/s_lock.h
index 9cf73163d60..5e3a15524b4 100644
--- a/src/include/storage/s_lock.h
+++ b/src/include/storage/s_lock.h
@@ -9,7 +9,7 @@
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/include/storage/s_lock.h,v 1.89 2001/02/16 23:50:40 tgl Exp $
+ * $Header: /cvsroot/pgsql/src/include/storage/s_lock.h,v 1.90 2001/02/18 04:39:42 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -48,11 +48,12 @@
* unsigned spins = 0;
*
* while (TAS(lock))
- * S_LOCK_SLEEP(lock, spins++);
+ * S_LOCK_SLEEP(lock, spins++, timeout);
* }
*
* where S_LOCK_SLEEP() checks for timeout and sleeps for a short
- * interval. Callers that want to perform useful work while waiting
+ * interval. (The timeout is expressed in microseconds, or can be 0 for
+ * "infinity".) Callers that want to perform useful work while waiting
* can write out this entire loop and insert the "useful work" inside
* the loop.
*
@@ -86,7 +87,7 @@
/* Platform-independent out-of-line support routines */
extern void s_lock(volatile slock_t *lock,
const char *file, const int line);
-extern void s_lock_sleep(unsigned spins, int microsec,
+extern void s_lock_sleep(unsigned spins, int timeout, int microsec,
volatile slock_t *lock,
const char *file, const int line);
@@ -518,13 +519,13 @@ extern int tas_sema(volatile slock_t *lock);
#endif /* S_LOCK */
#if !defined(S_LOCK_SLEEP)
-#define S_LOCK_SLEEP(lock,spins) \
- s_lock_sleep((spins), 0, (lock), __FILE__, __LINE__)
+#define S_LOCK_SLEEP(lock,spins,timeout) \
+ s_lock_sleep((spins), (timeout), 0, (lock), __FILE__, __LINE__)
#endif /* S_LOCK_SLEEP */
#if !defined(S_LOCK_SLEEP_INTERVAL)
-#define S_LOCK_SLEEP_INTERVAL(lock,spins,microsec) \
- s_lock_sleep((spins), (microsec), (lock), __FILE__, __LINE__)
+#define S_LOCK_SLEEP_INTERVAL(lock,spins,timeout,microsec) \
+ s_lock_sleep((spins), (timeout), (microsec), (lock), __FILE__, __LINE__)
#endif /* S_LOCK_SLEEP_INTERVAL */
#if !defined(S_LOCK_FREE)