/*-------------------------------------------------------------------------
 *
 * s_lock.c
 *     Hardware-dependent implementation of spinlocks.
 *
 * When waiting for a contended spinlock we loop tightly for a while, then
 * delay using pg_usleep() and try again.  Preferably, "a while" should be a
 * small multiple of the maximum time we expect a spinlock to be held.  100
 * iterations seems about right as an initial guess.  However, on a
 * uniprocessor the loop is a waste of cycles, while in a multi-CPU scenario
 * it's usually better to spin a bit longer than to call the kernel, so we
 * try to adapt the spin loop count depending on whether we seem to be
 * running on a uniprocessor or a multiprocessor.
 *
 * Note: you might think MIN_SPINS_PER_DELAY should be just 1, but you'd
 * be wrong; there are platforms where that can result in a "stuck
 * spinlock" failure.  This has been seen particularly on Alphas; it seems
 * that the first TAS after returning from kernel space will always fail
 * on that hardware.
 *
 * Once we do decide to block, we use randomly increasing pg_usleep()
 * delays.  The first delay is 1 msec, then the delay randomly increases to
 * about one second, after which we reset to 1 msec and start again.  The
 * idea here is that in the presence of heavy contention we need to
 * increase the delay, else the spinlock holder may never get to run and
 * release the lock.  (Consider the situation where the spinlock holder has
 * been nice'd down in priority by the scheduler --- it will not get
 * scheduled until all would-be acquirers are sleeping, so if we always use
 * a 1-msec sleep, there is a real possibility of starvation.)  But we
 * can't just clamp the delay to an upper bound, else it would take a long
 * time to make a reasonable number of tries.
 *
 * We time out and declare an error after NUM_DELAYS delays (thus, exactly
 * that many tries).  With the given settings, this will usually take 2 or
 * so minutes.  It seems better to fix the total number of tries (and thus
 * the probability of unintended failure) than to fix the total time
 * spent.
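 *
 * (A rough sanity check of that figure, assuming each delay grows by its
 * expected factor of 1.5X: climbing from 1 msec to 1 sec takes about
 * log(1000)/log(1.5) = 17 delays, whose expected sum is about
 * 1 msec * (1.5^18 - 1)/0.5, i.e. roughly 3 seconds.  That averages out
 * to ~160 msec per delay, so NUM_DELAYS = 1000 delays indeed come to a
 * couple of minutes.)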
 *
 * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *     src/backend/storage/lmgr/s_lock.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <time.h>
#include <unistd.h>

#include "common/pg_prng.h"
#include "port/atomics.h"
#include "storage/s_lock.h"
#include "utils/wait_event.h"

#define MIN_SPINS_PER_DELAY 10       /* lower bound for adaptive spins_per_delay */
#define MAX_SPINS_PER_DELAY 1000     /* upper bound for adaptive spins_per_delay */
#define NUM_DELAYS          1000     /* sleeps allowed before "stuck spinlock" */
#define MIN_DELAY_USEC      1000L    /* initial and minimum sleep: 1 msec */
#define MAX_DELAY_USEC      1000000L /* longest sleep: 1 sec */


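/*
 * A spinlock that never protects any data.  Our understanding (from
 * s_lock.h) is that it exists for platforms whose fallback memory-barrier
 * implementation takes and releases a spinlock purely for its barrier
 * effect.
 */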
slock_t     dummy_spinlock;

static int  spins_per_delay = DEFAULT_SPINS_PER_DELAY;


/*
 * s_lock_stuck() - complain about a stuck spinlock
 */
static void
s_lock_stuck(const char *file, int line, const char *func)
{
    if (!func)
        func = "(unknown)";
#if defined(S_LOCK_TEST)
    fprintf(stderr,
            "\nStuck spinlock detected at %s, %s:%d.\n",
            func, file, line);
    exit(1);
#else
    elog(PANIC, "stuck spinlock detected at %s, %s:%d",
         func, file, line);
#endif
}

/*
 * s_lock(lock) - platform-independent portion of waiting for a spinlock.
 */
int
s_lock(volatile slock_t *lock, const char *file, int line, const char *func)
{
    SpinDelayStatus delayStatus;

    init_spin_delay(&delayStatus, file, line, func);

    while (TAS_SPIN(lock))
    {
        perform_spin_delay(&delayStatus);
    }

    finish_spin_delay(&delayStatus);

    return delayStatus.delays;
}
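
/*
 * For illustration: callers do not normally reach s_lock() directly.  They
 * use the SpinLockAcquire()/SpinLockRelease() macros from storage/spin.h,
 * which fall into this function only when the initial TAS() fails.  A
 * typical caller looks roughly like this (a sketch; "MyShared" and its
 * fields are hypothetical):
 *
 *     volatile MyShared *shared = ...;
 *
 *     SpinLockAcquire(&shared->mutex);
 *     shared->counter++;      (only short, straight-line code while held)
 *     SpinLockRelease(&shared->mutex);
 */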

#ifdef USE_DEFAULT_S_UNLOCK
void
s_unlock(volatile slock_t *lock)
{
#ifdef TAS_ACTIVE_WORD
    /* HP's PA-RISC */
    *TAS_ACTIVE_WORD(lock) = -1;
#else
    *lock = 0;
#endif
}
#endif

/*
 * Wait while spinning on a contended spinlock.
 */
void
perform_spin_delay(SpinDelayStatus *status)
{
    /* CPU-specific delay each time through the loop */
    SPIN_DELAY();

    /* Block the process every spins_per_delay tries */
    if (++(status->spins) >= spins_per_delay)
    {
        if (++(status->delays) > NUM_DELAYS)
            s_lock_stuck(status->file, status->line, status->func);

        if (status->cur_delay == 0) /* first time to delay? */
            status->cur_delay = MIN_DELAY_USEC;

        /*
         * Once we start sleeping, the overhead of reporting a wait event is
         * justified.  Actively spinning easily stands out in profilers, but
         * sleeping with an exponential backoff is harder to spot...
         *
         * We might want to report something more granular at some point, but
         * this is better than nothing.
         */
        pgstat_report_wait_start(WAIT_EVENT_SPIN_DELAY);
        pg_usleep(status->cur_delay);
        pgstat_report_wait_end();

#if defined(S_LOCK_TEST)
        fprintf(stdout, "*");
        fflush(stdout);
#endif

        /*
         * Increase the delay by a random fraction of itself; that is,
         * multiply it by a uniformly-random factor between 1X and 2X.
         */
        status->cur_delay += (int) (status->cur_delay *
                                    pg_prng_double(&pg_global_prng_state) + 0.5);
        /* wrap back to minimum delay when max is exceeded */
        if (status->cur_delay > MAX_DELAY_USEC)
            status->cur_delay = MIN_DELAY_USEC;

        status->spins = 0;
    }
}
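
/*
 * For example, one possible sequence of slept cur_delay values (in usec)
 * is
 *
 *     1000, 1800, 2900, 5500, ..., 640000, 990000,
 *
 * after which the next increase exceeds MAX_DELAY_USEC and cur_delay
 * wraps back to MIN_DELAY_USEC.  (Illustrative numbers only; each step
 * multiplies the delay by a random factor between 1 and 2.)
 */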

/*
 * After acquiring a spinlock, update estimates about how long to loop.
 *
 * If we were able to acquire the lock without delaying, it's a good
 * indication we are running on a multiprocessor.  If we had to delay, it's
 * a sign (but not a sure thing) that we are on a uniprocessor.  Hence, we
 * decrement spins_per_delay slowly when we had to delay, and increase it
 * rapidly when we didn't.  It's expected that spins_per_delay will
 * converge to the minimum value on a uniprocessor and to the maximum
 * value on a multiprocessor.
 *
 * Note: spins_per_delay is local within our current process.  We want to
 * average these observations across multiple backends, since it's
 * relatively rare for this function to even get entered, and so a single
 * backend might not live long enough to converge on a good value.  That
 * is handled by the two routines below.
 */
void
finish_spin_delay(SpinDelayStatus *status)
{
    if (status->cur_delay == 0)
    {
        /* we never had to delay */
        if (spins_per_delay < MAX_SPINS_PER_DELAY)
            spins_per_delay = Min(spins_per_delay + 100, MAX_SPINS_PER_DELAY);
    }
    else
    {
        if (spins_per_delay > MIN_SPINS_PER_DELAY)
            spins_per_delay = Max(spins_per_delay - 1, MIN_SPINS_PER_DELAY);
    }
}
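
/*
 * For instance, starting from DEFAULT_SPINS_PER_DELAY (currently 100, per
 * s_lock.h): a single acquisition that spun but never slept bumps the
 * local estimate to 200, and nine of them saturate it at
 * MAX_SPINS_PER_DELAY (1000); by contrast, it takes 90 delayed
 * acquisitions to grind it down to MIN_SPINS_PER_DELAY (10).
 */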

/*
 * Set the local copy of spins_per_delay during backend startup.
 *
 * NB: this has to be pretty fast as it is called while holding a spinlock.
 */
void
set_spins_per_delay(int shared_spins_per_delay)
{
    spins_per_delay = shared_spins_per_delay;
}

/*
 * Recompute the shared estimate of spins_per_delay during backend exit.
 *
 * NB: this has to be pretty fast as it is called while holding a spinlock.
 */
int
recompute_spins_per_delay(int shared_spins_per_delay)
{
    /*
     * We use an exponential moving average with a relatively slow adaptation
     * rate, so that noise in any one backend's result won't affect the
     * shared value too much.  As long as both inputs are within the allowed
     * range, the result must be too, so we need not worry about clamping the
     * result.
     *
     * We deliberately truncate rather than rounding; this is so that single
     * adjustments inside a backend can affect the shared estimate (see the
     * asymmetric adjustment rules above).
     */
    return (shared_spins_per_delay * 15 + spins_per_delay) / 16;
}
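
/*
 * A worked example of why truncation matters: with a shared value of 100
 * and a local value of 99 (one delayed acquisition), (100 * 15 + 99) / 16
 * = 1599 / 16 = 99 after truncation, so even a single -1 adjustment moves
 * the shared estimate, whereas rounding would leave it stuck at 100.
 * Meanwhile a local value of 1000 yields (100 * 15 + 1000) / 16 = 156,
 * showing the deliberately slow 1/16 adaptation rate.
 */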


/*****************************************************************************/
#if defined(S_LOCK_TEST)

/*
 * test program for verifying a port's spinlock support.
 */
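
/*
 * One plausible way to build and run it (a sketch only --- the exact
 * include paths, libraries, and any TAS assembler file required vary by
 * platform and build tree):
 *
 *     cc -DS_LOCK_TEST -Isrc/include s_lock.c -o s_lock_test
 *     ./s_lock_test
 *
 * If the port is working, it prints NUM_DELAYS stars and then exits with
 * a "stuck spinlock" message, as the printfs below describe.
 */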

struct test_lock_struct
{
    char        pad1;
    slock_t     lock;
    char        pad2;
};

volatile struct test_lock_struct test_lock;

int
main()
{
    pg_prng_seed(&pg_global_prng_state, (uint64) time(NULL));

    test_lock.pad1 = test_lock.pad2 = 0x44;

    S_INIT_LOCK(&test_lock.lock);

    if (test_lock.pad1 != 0x44 || test_lock.pad2 != 0x44)
    {
        printf("S_LOCK_TEST: failed, declared datatype is wrong size\n");
        return 1;
    }

    if (!S_LOCK_FREE(&test_lock.lock))
    {
        printf("S_LOCK_TEST: failed, lock not initialized\n");
        return 1;
    }

    S_LOCK(&test_lock.lock);

    if (test_lock.pad1 != 0x44 || test_lock.pad2 != 0x44)
    {
        printf("S_LOCK_TEST: failed, declared datatype is wrong size\n");
        return 1;
    }

    if (S_LOCK_FREE(&test_lock.lock))
    {
        printf("S_LOCK_TEST: failed, lock not locked\n");
        return 1;
    }

    S_UNLOCK(&test_lock.lock);

    if (test_lock.pad1 != 0x44 || test_lock.pad2 != 0x44)
    {
        printf("S_LOCK_TEST: failed, declared datatype is wrong size\n");
        return 1;
    }

    if (!S_LOCK_FREE(&test_lock.lock))
    {
        printf("S_LOCK_TEST: failed, lock not unlocked\n");
        return 1;
    }

    S_LOCK(&test_lock.lock);

    if (test_lock.pad1 != 0x44 || test_lock.pad2 != 0x44)
    {
        printf("S_LOCK_TEST: failed, declared datatype is wrong size\n");
        return 1;
    }

    if (S_LOCK_FREE(&test_lock.lock))
    {
        printf("S_LOCK_TEST: failed, lock not re-locked\n");
        return 1;
    }

    printf("S_LOCK_TEST: this will print %d stars and then\n", NUM_DELAYS);
    printf("             exit with a 'stuck spinlock' message\n");
    printf("             if S_LOCK() and TAS() are working.\n");
    fflush(stdout);

    s_lock(&test_lock.lock, __FILE__, __LINE__, __func__);

    printf("S_LOCK_TEST: failed, lock not locked\n");
    return 1;
}

#endif                          /* S_LOCK_TEST */