diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/backend/access/transam/xlog.c | 74 | ||||
-rw-r--r-- | src/backend/storage/buffer/bufmgr.c | 9 | ||||
-rw-r--r-- | src/backend/storage/buffer/s_lock.c | 58 | ||||
-rw-r--r-- | src/backend/storage/file/fd.c | 24 | ||||
-rw-r--r-- | src/include/config.h.in | 18 | ||||
-rw-r--r-- | src/include/storage/fd.h | 3 | ||||
-rw-r--r-- | src/include/storage/s_lock.h | 17 |
7 files changed, 140 insertions, 63 deletions
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 7c18f0ff5a6..fbc4223034a 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $Header: /cvsroot/pgsql/src/backend/access/transam/xlog.c,v 1.53 2001/02/13 20:40:25 vadim Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/transam/xlog.c,v 1.54 2001/02/18 04:39:42 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -39,6 +39,13 @@ #include "miscadmin.h" + +/* Max time to wait to acquire XLog activity locks */ +#define XLOG_LOCK_TIMEOUT (5*60*1000000) /* 5 minutes */ +/* Max time to wait to acquire checkpoint lock */ +#define CHECKPOINT_LOCK_TIMEOUT (10*60*1000000) /* 10 minutes */ + + int XLOGbuffers = 8; int XLOGfiles = 0; /* how many files to pre-allocate */ XLogRecPtr MyLastRecPtr = {0, 0}; @@ -178,8 +185,8 @@ typedef struct BkpBlock /* * We break each log file in 16Mb segments */ -#define XLogSegSize (16*1024*1024) -#define XLogLastSeg (0xffffffff / XLogSegSize) +#define XLogSegSize ((uint32) (16*1024*1024)) +#define XLogLastSeg (((uint32) 0xffffffff) / XLogSegSize) #define XLogFileSize (XLogLastSeg * XLogSegSize) #define NextLogSeg(_logId, _logSeg) \ @@ -423,7 +430,7 @@ begin:; } } } - S_LOCK_SLEEP(&(XLogCtl->insert_lck), i++); + S_LOCK_SLEEP(&(XLogCtl->insert_lck), i++, XLOG_LOCK_TIMEOUT); if (!TAS(&(XLogCtl->insert_lck))) break; } @@ -721,7 +728,7 @@ XLogFlush(XLogRecPtr record) break; } } - S_LOCK_SLEEP(&(XLogCtl->lgwr_lck), spins++); + S_LOCK_SLEEP(&(XLogCtl->lgwr_lck), spins++, XLOG_LOCK_TIMEOUT); } if (logFile >= 0 && (LgwrResult.Write.xlogid != logId || @@ -741,7 +748,7 @@ XLogFlush(XLogRecPtr record) logFile = XLogFileOpen(logId, logSeg, false); } - if (pg_fsync(logFile) != 0) + if (pg_fdatasync(logFile) != 0) elog(STOP, "fsync(logfile %u seg %u) failed: %m", logId, logSeg); LgwrResult.Flush = LgwrResult.Write; @@ -826,7 +833,7 @@ GetFreeXLBuffer() InitXLBuffer(curridx); return; } - S_LOCK_SLEEP(&(XLogCtl->lgwr_lck), spins++); + S_LOCK_SLEEP(&(XLogCtl->lgwr_lck), spins++, XLOG_LOCK_TIMEOUT); } } @@ -846,7 +853,7 @@ XLogWrite(char *buffer) { if (wcnt > 0) { - if (pg_fsync(logFile) != 0) + if (pg_fdatasync(logFile) != 0) elog(STOP, "fsync(logfile %u seg %u) failed: %m", logId, logSeg); if (LgwrResult.Write.xlogid != logId) @@ -928,7 +935,7 @@ XLogWrite(char *buffer) if (XLByteLT(LgwrResult.Flush, LgwrRqst.Flush) && XLByteLE(LgwrRqst.Flush, LgwrResult.Write)) { - if (pg_fsync(logFile) != 0) + if (pg_fdatasync(logFile) != 0) elog(STOP, "fsync(logfile %u seg %u) failed: %m", logId, logSeg); LgwrResult.Flush = LgwrResult.Write; @@ -948,13 +955,14 @@ XLogFileInit(uint32 log, uint32 seg, bool *usexistent) { char path[MAXPGPATH]; char tpath[MAXPGPATH]; + char zbuffer[BLCKSZ]; int fd; + int nbytes; XLogFileName(path, log, seg); /* - * Try to use existent file (checkpoint maker - * creates it sometime). + * Try to use existent file (checkpoint maker creates it sometimes). */ if (*usexistent) { @@ -963,7 +971,7 @@ XLogFileInit(uint32 log, uint32 seg, bool *usexistent) { if (errno != ENOENT) elog(STOP, "InitOpen(logfile %u seg %u) failed: %m", - logId, logSeg); + logId, logSeg); } else return(fd); @@ -979,33 +987,44 @@ XLogFileInit(uint32 log, uint32 seg, bool *usexistent) elog(STOP, "InitCreate(logfile %u seg %u) failed: %m", logId, logSeg); - if (lseek(fd, XLogSegSize - 1, SEEK_SET) != (off_t) (XLogSegSize - 1)) - elog(STOP, "lseek(logfile %u seg %u) failed: %m", - logId, logSeg); - - if (write(fd, "", 1) != 1) - elog(STOP, "write(logfile %u seg %u) failed: %m", - logId, logSeg); + /* + * Zero-fill the file. We have to do this the hard way to ensure that + * all the file space has really been allocated --- on platforms that + * allow "holes" in files, just seeking to the end doesn't allocate + * intermediate space. This way, we know that we have all the space + * and (after the fsync below) that all the indirect blocks are down + * on disk. Therefore, fdatasync(2) will be sufficient to sync future + * writes to the log file. + */ + MemSet(zbuffer, 0, sizeof(zbuffer)); + for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(zbuffer)) + { + if ((int) write(fd, zbuffer, sizeof(zbuffer)) != (int) sizeof(zbuffer)) + elog(STOP, "ZeroFill(logfile %u seg %u) failed: %m", + logId, logSeg); + } if (pg_fsync(fd) != 0) elog(STOP, "fsync(logfile %u seg %u) failed: %m", logId, logSeg); - if (lseek(fd, 0, SEEK_SET) < 0) - elog(STOP, "lseek(logfile %u seg %u off %u) failed: %m", - log, seg, 0); - close(fd); + /* + * Prefer link() to rename() here just to be sure that we don't overwrite + * an existing logfile. However, there shouldn't be one, so rename() + * is an acceptable substitute except for the truly paranoid. + */ #ifndef __BEOS__ if (link(tpath, path) < 0) + elog(STOP, "InitRelink(logfile %u seg %u) failed: %m", + logId, logSeg); + unlink(tpath); #else if (rename(tpath, path) < 0) -#endif elog(STOP, "InitRelink(logfile %u seg %u) failed: %m", logId, logSeg); - - unlink(tpath); +#endif fd = BasicOpenFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR); if (fd < 0) @@ -2101,7 +2120,8 @@ CreateCheckPoint(bool shutdown) /* Grab lock, using larger than normal sleep between tries (1 sec) */ while (TAS(&(XLogCtl->chkp_lck))) { - S_LOCK_SLEEP_INTERVAL(&(XLogCtl->chkp_lck), spins++, 1000000); + S_LOCK_SLEEP_INTERVAL(&(XLogCtl->chkp_lck), spins++, + CHECKPOINT_LOCK_TIMEOUT, 1000000); } memset(&checkPoint, 0, sizeof(checkPoint)); diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 6af9b4065d3..5c5b6b8875c 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.106 2001/01/24 19:43:05 momjian Exp $ + * $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.107 2001/02/18 04:39:42 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -1990,6 +1990,9 @@ UnlockBuffers(void) } } +/* Max time to wait to acquire a buffer read or write lock */ +#define BUFFER_LOCK_TIMEOUT (10*60*1000000) /* 10 minutes */ + void LockBuffer(Buffer buffer, int mode) { @@ -2041,7 +2044,7 @@ LockBuffer(Buffer buffer, int mode) { S_UNLOCK(&(buf->cntx_lock)); RESUME_INTERRUPTS(); - S_LOCK_SLEEP(&(buf->cntx_lock), i++); + S_LOCK_SLEEP(&(buf->cntx_lock), i++, BUFFER_LOCK_TIMEOUT); HOLD_INTERRUPTS(); S_LOCK(&(buf->cntx_lock)); } @@ -2069,7 +2072,7 @@ LockBuffer(Buffer buffer, int mode) } S_UNLOCK(&(buf->cntx_lock)); RESUME_INTERRUPTS(); - S_LOCK_SLEEP(&(buf->cntx_lock), i++); + S_LOCK_SLEEP(&(buf->cntx_lock), i++, BUFFER_LOCK_TIMEOUT); HOLD_INTERRUPTS(); S_LOCK(&(buf->cntx_lock)); } diff --git a/src/backend/storage/buffer/s_lock.c b/src/backend/storage/buffer/s_lock.c index 3918bf00767..ef70f45d887 100644 --- a/src/backend/storage/buffer/s_lock.c +++ b/src/backend/storage/buffer/s_lock.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/storage/buffer/Attic/s_lock.c,v 1.32 2001/01/24 19:43:06 momjian Exp $ + * $Header: /cvsroot/pgsql/src/backend/storage/buffer/Attic/s_lock.c,v 1.33 2001/02/18 04:39:42 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -21,23 +21,39 @@ #include "storage/s_lock.h" -/* +/*---------- * Each time we busy spin we select the next element of this array as the * number of microseconds to wait. This accomplishes pseudo random back-off. - * Values are not critical but 10 milliseconds is a common platform - * granularity. * - * Total time to cycle through all 20 entries might be about .07 sec, - * so the given value of S_MAX_BUSY results in timeout after ~70 sec. + * Note that on most platforms, specified values will be rounded up to the + * next multiple of a clock tick, which is often ten milliseconds (10000). + * So, we are being way overoptimistic to assume that these different values + * are really different, other than the last. But there are a few platforms + * with better-than-usual timekeeping, and on these we will get pretty good + * pseudo-random behavior. + * + * Total time to cycle through all 20 entries will be at least 100 msec, + * more commonly (10 msec resolution) 220 msec, and on some platforms + * as much as 420 msec (when the remainder of the current tick cycle is + * ignored in deciding when to time out, as on FreeBSD and older Linuxen). + * We use the 100msec figure to figure max_spins, so actual timeouts may + * be as much as four times the nominal value, but will never be less. + *---------- */ #define S_NSPINCYCLE 20 -#define S_MAX_BUSY 1000 * S_NSPINCYCLE int s_spincycle[S_NSPINCYCLE] = -{ 0, 0, 0, 0, 10000, 0, 0, 0, 10000, 0, - 0, 10000, 0, 0, 10000, 0, 10000, 0, 10000, 10000 +{ 1, 10, 100, 1000, + 10000, 1000, 1000, 1000, + 10000, 1000, 1000, 10000, + 1000, 1000, 10000, 1000, + 10000, 1000, 10000, 30000 }; +#define AVG_SPINCYCLE 5000 /* average entry in microsec: 100ms / 20 */ + +#define DEFAULT_TIMEOUT (100*1000000) /* default timeout: 100 sec */ + /* * s_lock_stuck() - complain about a stuck spinlock @@ -58,34 +74,40 @@ s_lock_stuck(volatile slock_t *lock, const char *file, const int line) /* * s_lock_sleep() - sleep a pseudo-random amount of time, check for timeout * - * Normally 'microsec' is 0, specifying to use the next s_spincycle[] value. + * The 'timeout' is given in microsec, or may be 0 for "infinity". Note that + * this will be a lower bound (a fairly loose lower bound, on most platforms). + * + * 'microsec' is the number of microsec to delay per loop. Normally + * 'microsec' is 0, specifying to use the next s_spincycle[] value. * Some callers may pass a nonzero interval, specifying to use exactly that * delay value rather than a pseudo-random delay. */ void -s_lock_sleep(unsigned spins, int microsec, +s_lock_sleep(unsigned spins, int timeout, int microsec, volatile slock_t *lock, const char *file, const int line) { struct timeval delay; - unsigned max_spins; if (microsec > 0) { delay.tv_sec = 0; delay.tv_usec = microsec; - /* two-minute timeout in this case */ - max_spins = 120000000 / microsec; } else { delay.tv_sec = 0; delay.tv_usec = s_spincycle[spins % S_NSPINCYCLE]; - max_spins = S_MAX_BUSY; + microsec = AVG_SPINCYCLE; /* use average to figure timeout */ } - if (spins > max_spins) - s_lock_stuck(lock, file, line); + if (timeout > 0) + { + unsigned max_spins = timeout / microsec; + + if (spins > max_spins) + s_lock_stuck(lock, file, line); + } (void) select(0, NULL, NULL, NULL, &delay); } @@ -110,7 +132,7 @@ s_lock(volatile slock_t *lock, const char *file, const int line) */ while (TAS(lock)) { - s_lock_sleep(spins++, 0, lock, file, line); + s_lock_sleep(spins++, DEFAULT_TIMEOUT, 0, lock, file, line); CHECK_FOR_INTERRUPTS(); } } diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index 1feac25f428..c6a72b8f25d 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -7,7 +7,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/storage/file/fd.c,v 1.72 2001/02/17 01:00:04 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/storage/file/fd.c,v 1.73 2001/02/18 04:39:42 tgl Exp $ * * NOTES: * @@ -193,7 +193,7 @@ static char *filepath(char *filename); static long pg_nofile(void); /* - * pg_fsync --- same as fsync except does nothing if -F switch was given + * pg_fsync --- same as fsync except does nothing if enableFsync is off */ int pg_fsync(int fd) @@ -205,6 +205,26 @@ pg_fsync(int fd) } /* + * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off + * + * Not all platforms have fdatasync; treat as fsync if not available. + */ +int +pg_fdatasync(int fd) +{ + if (enableFsync) + { +#ifdef HAVE_FDATASYNC + return fdatasync(fd); +#else + return fsync(fd); +#endif + } + else + return 0; +} + +/* * BasicOpenFile --- same as open(2) except can free other FDs if needed * * This is exported for use by places that really want a plain kernel FD, diff --git a/src/include/config.h.in b/src/include/config.h.in index 5c2dc088ef2..68e15d067b7 100644 --- a/src/include/config.h.in +++ b/src/include/config.h.in @@ -8,7 +8,7 @@ * or in config.h afterwards. Of course, if you edit config.h, then your * changes will be overwritten the next time you run configure. * - * $Id: config.h.in,v 1.157 2001/01/22 23:28:52 tgl Exp $ + * $Id: config.h.in,v 1.158 2001/02/18 04:39:42 tgl Exp $ */ #ifndef CONFIG_H @@ -548,6 +548,19 @@ extern void srandom(unsigned int seed); */ #define MAX_RANDOM_VALUE (0x7FFFFFFF) +/* Define if you have dlopen() */ +#undef HAVE_DLOPEN + +/* Define if you have fdatasync() */ +#undef HAVE_FDATASYNC + +/* Define if the standard header unistd.h declares fdatasync() */ +#undef HAVE_FDATASYNC_DECL + +#if defined(HAVE_FDATASYNC) && !defined(HAVE_FDATASYNC_DECL) +extern int fdatasync(int fildes); +#endif + /* Set to 1 if you have libz.a */ #undef HAVE_LIBZ @@ -611,9 +624,6 @@ extern void srandom(unsigned int seed); /* Define if C++ compiler accepts "#include <string>" */ #undef HAVE_CXX_STRING_HEADER -/* Define if you have dlopen() */ -#undef HAVE_DLOPEN - /* Define if you have the optreset variable */ #undef HAVE_INT_OPTRESET diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h index fb8486b0758..46ec1fdb944 100644 --- a/src/include/storage/fd.h +++ b/src/include/storage/fd.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $Id: fd.h,v 1.26 2001/01/24 19:43:27 momjian Exp $ + * $Id: fd.h,v 1.27 2001/02/18 04:39:42 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -71,5 +71,6 @@ extern int BasicOpenFile(FileName fileName, int fileFlags, int fileMode); extern void closeAllVfds(void); extern void AtEOXact_Files(void); extern int pg_fsync(int fd); +extern int pg_fdatasync(int fd); #endif /* FD_H */ diff --git a/src/include/storage/s_lock.h b/src/include/storage/s_lock.h index 9cf73163d60..5e3a15524b4 100644 --- a/src/include/storage/s_lock.h +++ b/src/include/storage/s_lock.h @@ -9,7 +9,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/include/storage/s_lock.h,v 1.89 2001/02/16 23:50:40 tgl Exp $ + * $Header: /cvsroot/pgsql/src/include/storage/s_lock.h,v 1.90 2001/02/18 04:39:42 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -48,11 +48,12 @@ * unsigned spins = 0; * * while (TAS(lock)) - * S_LOCK_SLEEP(lock, spins++); + * S_LOCK_SLEEP(lock, spins++, timeout); * } * * where S_LOCK_SLEEP() checks for timeout and sleeps for a short - * interval. Callers that want to perform useful work while waiting + * interval. (The timeout is expressed in microseconds, or can be 0 for + * "infinity".) Callers that want to perform useful work while waiting * can write out this entire loop and insert the "useful work" inside * the loop. * @@ -86,7 +87,7 @@ /* Platform-independent out-of-line support routines */ extern void s_lock(volatile slock_t *lock, const char *file, const int line); -extern void s_lock_sleep(unsigned spins, int microsec, +extern void s_lock_sleep(unsigned spins, int timeout, int microsec, volatile slock_t *lock, const char *file, const int line); @@ -518,13 +519,13 @@ extern int tas_sema(volatile slock_t *lock); #endif /* S_LOCK */ #if !defined(S_LOCK_SLEEP) -#define S_LOCK_SLEEP(lock,spins) \ - s_lock_sleep((spins), 0, (lock), __FILE__, __LINE__) +#define S_LOCK_SLEEP(lock,spins,timeout) \ + s_lock_sleep((spins), (timeout), 0, (lock), __FILE__, __LINE__) #endif /* S_LOCK_SLEEP */ #if !defined(S_LOCK_SLEEP_INTERVAL) -#define S_LOCK_SLEEP_INTERVAL(lock,spins,microsec) \ - s_lock_sleep((spins), (microsec), (lock), __FILE__, __LINE__) +#define S_LOCK_SLEEP_INTERVAL(lock,spins,timeout,microsec) \ + s_lock_sleep((spins), (timeout), (microsec), (lock), __FILE__, __LINE__) #endif /* S_LOCK_SLEEP_INTERVAL */ #if !defined(S_LOCK_FREE) |