diff options
Diffstat (limited to 'src/backend/storage/lmgr/s_lock.c')
-rw-r--r-- | src/backend/storage/lmgr/s_lock.c | 206 |
1 files changed, 111 insertions, 95 deletions
diff --git a/src/backend/storage/lmgr/s_lock.c b/src/backend/storage/lmgr/s_lock.c index cc0bf5e01fb..4a6ffb4f890 100644 --- a/src/backend/storage/lmgr/s_lock.c +++ b/src/backend/storage/lmgr/s_lock.c @@ -3,6 +3,38 @@ * s_lock.c * Hardware-dependent implementation of spinlocks. * + * When waiting for a contended spinlock we loop tightly for awhile, then + * delay using pg_usleep() and try again. Preferably, "awhile" should be a + * small multiple of the maximum time we expect a spinlock to be held. 100 + * iterations seems about right as an initial guess. However, on a + * uniprocessor the loop is a waste of cycles, while in a multi-CPU scenario + * it's usually better to spin a bit longer than to call the kernel, so we try + * to adapt the spin loop count depending on whether we seem to be in a + * uniprocessor or multiprocessor. + * + * Note: you might think MIN_SPINS_PER_DELAY should be just 1, but you'd + * be wrong; there are platforms where that can result in a "stuck + * spinlock" failure. This has been seen particularly on Alphas; it seems + * that the first TAS after returning from kernel space will always fail + * on that hardware. + * + * Once we do decide to block, we use randomly increasing pg_usleep() + * delays. The first delay is 1 msec, then the delay randomly increases to + * about one second, after which we reset to 1 msec and start again. The + * idea here is that in the presence of heavy contention we need to + * increase the delay, else the spinlock holder may never get to run and + * release the lock. (Consider situation where spinlock holder has been + * nice'd down in priority by the scheduler --- it will not get scheduled + * until all would-be acquirers are sleeping, so if we always use a 1-msec + * sleep, there is a real possibility of starvation.) But we can't just + * clamp the delay to an upper bound, else it would take a long time to + * make a reasonable number of tries. + * + * We time out and declare error after NUM_DELAYS delays (thus, exactly + * that many tries). With the given settings, this will usually take 2 or + * so minutes. It seems better to fix the total number of tries (and thus + * the probability of unintended failure) than to fix the total time + * spent. * * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California @@ -21,6 +53,14 @@ #include "storage/s_lock.h" #include "storage/barrier.h" + +#define MIN_SPINS_PER_DELAY 10 +#define MAX_SPINS_PER_DELAY 1000 +#define NUM_DELAYS 1000 +#define MIN_DELAY_USEC 1000L +#define MAX_DELAY_USEC 1000000L + + slock_t dummy_spinlock; static int spins_per_delay = DEFAULT_SPINS_PER_DELAY; @@ -30,117 +70,107 @@ static int spins_per_delay = DEFAULT_SPINS_PER_DELAY; * s_lock_stuck() - complain about a stuck spinlock */ static void -s_lock_stuck(volatile slock_t *lock, const char *file, int line) +s_lock_stuck(void *p, const char *file, int line) { #if defined(S_LOCK_TEST) fprintf(stderr, "\nStuck spinlock (%p) detected at %s:%d.\n", - lock, file, line); + p, file, line); exit(1); #else elog(PANIC, "stuck spinlock (%p) detected at %s:%d", - lock, file, line); + p, file, line); #endif } - /* * s_lock(lock) - platform-independent portion of waiting for a spinlock. */ int s_lock(volatile slock_t *lock, const char *file, int line) { - /* - * We loop tightly for awhile, then delay using pg_usleep() and try again. - * Preferably, "awhile" should be a small multiple of the maximum time we - * expect a spinlock to be held. 100 iterations seems about right as an - * initial guess. However, on a uniprocessor the loop is a waste of - * cycles, while in a multi-CPU scenario it's usually better to spin a bit - * longer than to call the kernel, so we try to adapt the spin loop count - * depending on whether we seem to be in a uniprocessor or multiprocessor. - * - * Note: you might think MIN_SPINS_PER_DELAY should be just 1, but you'd - * be wrong; there are platforms where that can result in a "stuck - * spinlock" failure. This has been seen particularly on Alphas; it seems - * that the first TAS after returning from kernel space will always fail - * on that hardware. - * - * Once we do decide to block, we use randomly increasing pg_usleep() - * delays. The first delay is 1 msec, then the delay randomly increases to - * about one second, after which we reset to 1 msec and start again. The - * idea here is that in the presence of heavy contention we need to - * increase the delay, else the spinlock holder may never get to run and - * release the lock. (Consider situation where spinlock holder has been - * nice'd down in priority by the scheduler --- it will not get scheduled - * until all would-be acquirers are sleeping, so if we always use a 1-msec - * sleep, there is a real possibility of starvation.) But we can't just - * clamp the delay to an upper bound, else it would take a long time to - * make a reasonable number of tries. - * - * We time out and declare error after NUM_DELAYS delays (thus, exactly - * that many tries). With the given settings, this will usually take 2 or - * so minutes. It seems better to fix the total number of tries (and thus - * the probability of unintended failure) than to fix the total time - * spent. - */ -#define MIN_SPINS_PER_DELAY 10 -#define MAX_SPINS_PER_DELAY 1000 -#define NUM_DELAYS 1000 -#define MIN_DELAY_USEC 1000L -#define MAX_DELAY_USEC 1000000L - - int spins = 0; - int delays = 0; - int cur_delay = 0; + SpinDelayStatus delayStatus = init_spin_delay((void *) lock); while (TAS_SPIN(lock)) { - /* CPU-specific delay each time through the loop */ - SPIN_DELAY(); + perform_spin_delay(&delayStatus); + } - /* Block the process every spins_per_delay tries */ - if (++spins >= spins_per_delay) - { - if (++delays > NUM_DELAYS) - s_lock_stuck(lock, file, line); + finish_spin_delay(&delayStatus); - if (cur_delay == 0) /* first time to delay? */ - cur_delay = MIN_DELAY_USEC; + return delayStatus.delays; +} - pg_usleep(cur_delay); +#ifdef USE_DEFAULT_S_UNLOCK +void +s_unlock(volatile slock_t *lock) +{ +#ifdef TAS_ACTIVE_WORD + /* HP's PA-RISC */ + *TAS_ACTIVE_WORD(lock) = -1; +#else + *lock = 0; +#endif +} +#endif + +/* + * Wait while spinning on a contended spinlock. + */ +void +perform_spin_delay(SpinDelayStatus *status) +{ + /* CPU-specific delay each time through the loop */ + SPIN_DELAY(); + + /* Block the process every spins_per_delay tries */ + if (++(status->spins) >= spins_per_delay) + { + if (++(status->delays) > NUM_DELAYS) + s_lock_stuck(status->ptr, status->file, status->line); + + if (status->cur_delay == 0) /* first time to delay? */ + status->cur_delay = MIN_DELAY_USEC; + + pg_usleep(status->cur_delay); #if defined(S_LOCK_TEST) - fprintf(stdout, "*"); - fflush(stdout); + fprintf(stdout, "*"); + fflush(stdout); #endif - /* increase delay by a random fraction between 1X and 2X */ - cur_delay += (int) (cur_delay * + /* increase delay by a random fraction between 1X and 2X */ + status->cur_delay += (int) (status->cur_delay * ((double) random() / (double) MAX_RANDOM_VALUE) + 0.5); - /* wrap back to minimum delay when max is exceeded */ - if (cur_delay > MAX_DELAY_USEC) - cur_delay = MIN_DELAY_USEC; + /* wrap back to minimum delay when max is exceeded */ + if (status->cur_delay > MAX_DELAY_USEC) + status->cur_delay = MIN_DELAY_USEC; - spins = 0; - } + status->spins = 0; } +} - /* - * If we were able to acquire the lock without delaying, it's a good - * indication we are in a multiprocessor. If we had to delay, it's a sign - * (but not a sure thing) that we are in a uniprocessor. Hence, we - * decrement spins_per_delay slowly when we had to delay, and increase it - * rapidly when we didn't. It's expected that spins_per_delay will - * converge to the minimum value on a uniprocessor and to the maximum - * value on a multiprocessor. - * - * Note: spins_per_delay is local within our current process. We want to - * average these observations across multiple backends, since it's - * relatively rare for this function to even get entered, and so a single - * backend might not live long enough to converge on a good value. That - * is handled by the two routines below. - */ - if (cur_delay == 0) +/* + * After acquiring a spinlock, update estimates about how long to loop. + * + * If we were able to acquire the lock without delaying, it's a good + * indication we are in a multiprocessor. If we had to delay, it's a sign + * (but not a sure thing) that we are in a uniprocessor. Hence, we + * decrement spins_per_delay slowly when we had to delay, and increase it + * rapidly when we didn't. It's expected that spins_per_delay will + * converge to the minimum value on a uniprocessor and to the maximum + * value on a multiprocessor. + * + * Note: spins_per_delay is local within our current process. We want to + * average these observations across multiple backends, since it's + * relatively rare for this function to even get entered, and so a single + * backend might not live long enough to converge on a good value. That + * is handled by the two routines below. + */ +void +finish_spin_delay(SpinDelayStatus *status) +{ + if (status->cur_delay == 0) { /* we never had to delay */ if (spins_per_delay < MAX_SPINS_PER_DELAY) @@ -151,22 +181,8 @@ s_lock(volatile slock_t *lock, const char *file, int line) if (spins_per_delay > MIN_SPINS_PER_DELAY) spins_per_delay = Max(spins_per_delay - 1, MIN_SPINS_PER_DELAY); } - return delays; } -#ifdef USE_DEFAULT_S_UNLOCK -void -s_unlock(volatile slock_t *lock) -{ -#ifdef TAS_ACTIVE_WORD - /* HP's PA-RISC */ - *TAS_ACTIVE_WORD(lock) = -1; -#else - *lock = 0; -#endif -} -#endif - /* * Set local copy of spins_per_delay during backend startup. * |