| /*------------------------------------------------------------------------- |
| * |
| * lwlock.c |
| * Lightweight lock manager |
| * |
| * Lightweight locks are intended primarily to provide mutual exclusion of |
| * access to shared-memory data structures. Therefore, they offer both |
| * exclusive and shared lock modes (to support read/write and read-only |
| * access to a shared object). There are few other frammishes. User-level |
| * locking should be done with the full lock manager --- which depends on |
| * LWLocks to protect its shared state. |
| * |
| * In addition to exclusive and shared modes, lightweight locks can be used to |
| * wait until a variable changes value. The variable is not reset when the |
| * lock is acquired with LWLockAcquire, i.e. it keeps the value it was set |
| * to when the lock was last released, and can be updated |
| * without releasing the lock by calling LWLockUpdateVar. LWLockWaitForVar |
| * waits for the variable to be updated, or until the lock is free. When |
| * releasing the lock with LWLockReleaseClearVar() the value can be set to an |
| * appropriate value for a free lock. The meaning of the variable is up to |
| * the caller, the lightweight lock code just assigns and compares it. |
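| * |
| * A minimal usage sketch of that protocol (the variable and the concrete |
| * values are hypothetical, chosen only for illustration): |
| * |
| *    holder:  LWLockAcquire(lock, LW_EXCLUSIVE); |
| *    holder:  LWLockUpdateVar(lock, &var, 42);    -- wakes variable waiters |
| *    waiter:  if (!LWLockWaitForVar(lock, &var, oldval, &newval)) |
| *                 -- lock still held, but the variable changed; *newval |
| *                 -- contains the updated value |
| *    holder:  LWLockReleaseClearVar(lock, &var, 0); |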
| * |
| * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group |
| * Portions Copyright (c) 1994, Regents of the University of California |
| * |
| * IDENTIFICATION |
| * src/backend/storage/lmgr/lwlock.c |
| * |
| * NOTES: |
| * |
| * This used to be a pretty straightforward reader-writer lock |
| * implementation, in which the internal state was protected by a |
| * spinlock. Unfortunately the overhead of taking the spinlock proved to be |
| * too high for workloads/locks that were taken in shared mode very |
| * frequently. Often we were spinning in the (obviously exclusive) spinlock, |
| * while trying to acquire a shared lock that was actually free. |
| * |
| * Thus a new implementation was devised that provides wait-free shared lock |
| * acquisition for locks that aren't exclusively locked. |
| * |
| * The basic idea is to have a single atomic variable 'lockcount' instead of |
| * the formerly separate shared and exclusive counters and to use atomic |
| * operations to acquire the lock. That's fairly easy to do for plain |
| * rw-spinlocks, but a lot harder for something like LWLocks that want to wait |
| * in the OS. |
| * |
| * For lock acquisition we use an atomic compare-and-exchange on the lockcount |
| * variable. For exclusive lock we swap in a sentinel value |
| * (LW_VAL_EXCLUSIVE), for shared locks we count the number of holders. |
| * |
| * To release the lock we use an atomic decrement. If the new value is zero |
| * (we get that atomically), we know we can/have to release waiters. |
| * |
| * Obviously it is important that the sentinel value for exclusive locks |
| * doesn't conflict with the maximum number of possible share lockers - |
| * luckily MAX_BACKENDS makes that easily possible. |
| * |
| * |
| * The attentive reader might have noticed that naively doing the above has a |
| * glaring race condition: We try to lock using the atomic operations and |
| * notice that we have to wait. Unfortunately by the time we have finished |
| * queuing, the former locker very well might have already finished its |
| * work. That's problematic because we're now stuck waiting inside the OS. |
| * |
| * To mitigate those races we use a multi-phase attempt at locking: |
| * Phase 1: Try to do it atomically, if we succeed, nice |
| * Phase 2: Add ourselves to the waitqueue of the lock |
| * Phase 3: Try to grab the lock again, if we succeed, remove ourselves from |
| * the queue |
| * Phase 4: Sleep till wake-up, goto Phase 1 |
| * |
| * This protects us against the problem from above, as nobody can release |
| * too quickly before we're queued: after Phase 2 we're already queued. |
| * ------------------------------------------------------------------------- |
| */ |
| #include "postgres.h" |
| |
| #include "miscadmin.h" |
| #include "pg_trace.h" |
| #include "pgstat.h" |
| #include "port/pg_bitutils.h" |
| #include "postmaster/postmaster.h" |
| #include "replication/slot.h" |
| #include "storage/ipc.h" |
| #include "storage/predicate.h" |
| #include "storage/proc.h" |
| #include "storage/proclist.h" |
| #include "storage/spin.h" |
| #include "utils/memutils.h" |
| |
| #ifdef LWLOCK_STATS |
| #include "utils/hsearch.h" |
| #endif |
| |
| |
| /* We use the ShmemLock spinlock to protect LWLockCounter */ |
| extern slock_t *ShmemLock; |
| |
| #define LW_FLAG_HAS_WAITERS ((uint32) 1 << 30) |
| #define LW_FLAG_RELEASE_OK ((uint32) 1 << 29) |
| #define LW_FLAG_LOCKED ((uint32) 1 << 28) |
| |
| #define LW_VAL_EXCLUSIVE ((uint32) 1 << 24) |
| #define LW_VAL_SHARED 1 |
| |
| #define LW_LOCK_MASK ((uint32) ((1 << 25)-1)) |
| /* Must be greater than MAX_BACKENDS - which is 2^23-1, so we're fine. */ |
| #define LW_SHARED_MASK ((uint32) ((1 << 24)-1)) |
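| |
| /* |
| * Resulting layout of the state word, as a sketch derived from the defines |
| * above (bits 25..27 and bit 31 are currently unused): |
| * |
| *    bit 30       LW_FLAG_HAS_WAITERS |
| *    bit 29       LW_FLAG_RELEASE_OK |
| *    bit 28       LW_FLAG_LOCKED |
| *    bit 24       LW_VAL_EXCLUSIVE (sentinel marking an exclusive holder) |
| *    bits 0..23   count of shared holders, each adding LW_VAL_SHARED == 1 |
| */ |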
| |
| StaticAssertDecl(LW_VAL_EXCLUSIVE > (uint32) MAX_BACKENDS, |
| "MAX_BACKENDS too big for lwlock.c"); |
| |
| /* |
| * There are three sorts of LWLock "tranches": |
| * |
| * 1. The individually-named locks defined in lwlocknames.h each have their |
| * own tranche. The names of these tranches appear in IndividualLWLockNames[] |
| * in lwlocknames.c. |
| * |
| * 2. There are some predefined tranches for built-in groups of locks. |
| * These are listed in enum BuiltinTrancheIds in lwlock.h, and their names |
| * appear in BuiltinTrancheNames[] below. |
| * |
| * 3. Extensions can create new tranches, via either RequestNamedLWLockTranche |
| * or LWLockRegisterTranche. The names of these that are known in the current |
| * process appear in LWLockTrancheNames[]. |
| * |
| * All these names are user-visible as wait event names, so choose with care |
| * ... and do not forget to update the documentation's list of wait events. |
| */ |
| extern const char *const IndividualLWLockNames[]; /* in lwlocknames.c */ |
| |
| static const char *const BuiltinTrancheNames[] = { |
| /* LWTRANCHE_XACT_BUFFER: */ |
| "XactBuffer", |
| /* LWTRANCHE_COMMITTS_BUFFER: */ |
| "CommitTsBuffer", |
| /* LWTRANCHE_SUBTRANS_BUFFER: */ |
| "SubtransBuffer", |
| /* LWTRANCHE_MULTIXACTOFFSET_BUFFER: */ |
| "MultiXactOffsetBuffer", |
| /* LWTRANCHE_MULTIXACTMEMBER_BUFFER: */ |
| "MultiXactMemberBuffer", |
| /* LWTRANCHE_NOTIFY_BUFFER: */ |
| "NotifyBuffer", |
| /* LWTRANCHE_SERIAL_BUFFER: */ |
| "SerialBuffer", |
| /* LWTRANCHE_WAL_INSERT: */ |
| "WALInsert", |
| /* LWTRANCHE_BUFFER_CONTENT: */ |
| "BufferContent", |
| /* LWTRANCHE_REPLICATION_ORIGIN_STATE: */ |
| "ReplicationOriginState", |
| /* LWTRANCHE_REPLICATION_SLOT_IO: */ |
| "ReplicationSlotIO", |
| /* LWTRANCHE_LOCK_FASTPATH: */ |
| "LockFastPath", |
| /* LWTRANCHE_BUFFER_MAPPING: */ |
| "BufferMapping", |
| /* LWTRANCHE_LOCK_MANAGER: */ |
| "LockManager", |
| /* LWTRANCHE_PREDICATE_LOCK_MANAGER: */ |
| "PredicateLockManager", |
| /* LWTRANCHE_PARALLEL_HASH_JOIN: */ |
| "ParallelHashJoin", |
| /* LWTRANCHE_PARALLEL_QUERY_DSA: */ |
| "ParallelQueryDSA", |
| /* LWTRANCHE_PER_SESSION_DSA: */ |
| "PerSessionDSA", |
| /* LWTRANCHE_PER_SESSION_RECORD_TYPE: */ |
| "PerSessionRecordType", |
| /* LWTRANCHE_PER_SESSION_RECORD_TYPMOD: */ |
| "PerSessionRecordTypmod", |
| /* LWTRANCHE_SHARED_TUPLESTORE: */ |
| "SharedTupleStore", |
| /* LWTRANCHE_SHARED_TIDBITMAP: */ |
| "SharedTidBitmap", |
| /* LWTRANCHE_PARALLEL_APPEND: */ |
| "ParallelAppend", |
| /* LWTRANCHE_PER_XACT_PREDICATE_LIST: */ |
| "PerXactPredicateList", |
| /* LWTRANCHE_PGSTATS_DSA: */ |
| "PgStatsDSA", |
| /* LWTRANCHE_PGSTATS_HASH: */ |
| "PgStatsHash", |
| /* LWTRANCHE_PGSTATS_DATA: */ |
| "PgStatsData", |
| /* LWTRANCHE_LAUNCHER_DSA: */ |
| "LogicalRepLauncherDSA", |
| /* LWTRANCHE_LAUNCHER_HASH: */ |
| "LogicalRepLauncherHash", |
| /* LWTRANCHE_DISTRIBUTEDLOG_BUFFERS: */ |
| "DistributedLogBuffer" |
| }; |
| |
| StaticAssertDecl(lengthof(BuiltinTrancheNames) == |
| LWTRANCHE_FIRST_USER_DEFINED - NUM_INDIVIDUAL_LWLOCKS, |
| "missing entries in BuiltinTrancheNames[]"); |
| |
| /* |
| * This is indexed by tranche ID minus LWTRANCHE_FIRST_USER_DEFINED, and |
| * stores the names of all dynamically-created tranches known to the current |
| * process. Any unused entries in the array will contain NULL. |
| */ |
| static const char **LWLockTrancheNames = NULL; |
| static int LWLockTrancheNamesAllocated = 0; |
| |
| /* |
| * This points to the main array of LWLocks in shared memory. Backends inherit |
| * the pointer by fork from the postmaster (except in the EXEC_BACKEND case, |
| * where we have special measures to pass it down). |
| */ |
| LWLockPadded *MainLWLockArray = NULL; |
| |
| /* |
| * We use this structure to keep track of locked LWLocks for release |
| * during error recovery. Normally, only a few will be held at once, but |
| * occasionally the number can be much higher; for example, the pg_buffercache |
| * extension locks all buffer partitions simultaneously. |
| */ |
| #define MAX_SIMUL_LWLOCKS 200 |
| |
| /* struct representing the LWLocks we're holding */ |
| typedef struct LWLockHandle |
| { |
| LWLock *lock; |
| LWLockMode mode; |
| } LWLockHandle; |
| |
| static int num_held_lwlocks = 0; |
| static LWLockHandle held_lwlocks[MAX_SIMUL_LWLOCKS]; |
| |
| /* struct representing an LWLock tranche request for a named tranche */ |
| typedef struct NamedLWLockTrancheRequest |
| { |
| char tranche_name[NAMEDATALEN]; |
| int num_lwlocks; |
| } NamedLWLockTrancheRequest; |
| |
| static NamedLWLockTrancheRequest *NamedLWLockTrancheRequestArray = NULL; |
| static int NamedLWLockTrancheRequestsAllocated = 0; |
| |
| /* |
| * NamedLWLockTrancheRequests is both the valid length of the request array, |
| * and the length of the shared-memory NamedLWLockTrancheArray later on. |
| * This variable and NamedLWLockTrancheArray are non-static so that |
| * postmaster.c can copy them to child processes in EXEC_BACKEND builds. |
| */ |
| int NamedLWLockTrancheRequests = 0; |
| |
| /* points to data in shared memory: */ |
| NamedLWLockTranche *NamedLWLockTrancheArray = NULL; |
| |
| static void InitializeLWLocks(void); |
| static inline void LWLockReportWaitStart(LWLock *lock); |
| static inline void LWLockReportWaitEnd(void); |
| static const char *GetLWTrancheName(uint16 trancheId); |
| |
| #define T_NAME(lock) \ |
| GetLWTrancheName((lock)->tranche) |
| |
| #ifdef LWLOCK_STATS |
| typedef struct lwlock_stats_key |
| { |
| int tranche; |
| void *instance; |
| } lwlock_stats_key; |
| |
| typedef struct lwlock_stats |
| { |
| lwlock_stats_key key; |
| int sh_acquire_count; |
| int ex_acquire_count; |
| int block_count; |
| int dequeue_self_count; |
| int spin_delay_count; |
| } lwlock_stats; |
| |
| static HTAB *lwlock_stats_htab; |
| static lwlock_stats lwlock_stats_dummy; |
| #endif |
| |
| #ifdef LOCK_DEBUG |
| bool Trace_lwlocks = false; |
| |
| inline static void |
| PRINT_LWDEBUG(const char *where, LWLock *lock, LWLockMode mode) |
| { |
| /* hide statement & context here, otherwise the log is just too verbose */ |
| if (Trace_lwlocks) |
| { |
| uint32 state = pg_atomic_read_u32(&lock->state); |
| |
| ereport(LOG, |
| (errhidestmt(true), |
| errhidecontext(true), |
| errmsg_internal("%d: %s(%s %p): excl %u shared %u haswaiters %u waiters %u rOK %d", |
| MyProcPid, |
| where, T_NAME(lock), lock, |
| (state & LW_VAL_EXCLUSIVE) != 0, |
| state & LW_SHARED_MASK, |
| (state & LW_FLAG_HAS_WAITERS) != 0, |
| pg_atomic_read_u32(&lock->nwaiters), |
| (state & LW_FLAG_RELEASE_OK) != 0))); |
| } |
| } |
| |
| inline static void |
| LOG_LWDEBUG(const char *where, LWLock *lock, const char *msg) |
| { |
| /* hide statement & context here, otherwise the log is just too verbose */ |
| if (Trace_lwlocks) |
| { |
| ereport(LOG, |
| (errhidestmt(true), |
| errhidecontext(true), |
| errmsg_internal("%s(%s %p): %s", where, |
| T_NAME(lock), lock, msg))); |
| } |
| } |
| |
| #else /* not LOCK_DEBUG */ |
| #define PRINT_LWDEBUG(a,b,c) ((void)0) |
| #define LOG_LWDEBUG(a,b,c) ((void)0) |
| #endif /* LOCK_DEBUG */ |
| |
| #ifdef LWLOCK_STATS |
| |
| static void init_lwlock_stats(void); |
| static void print_lwlock_stats(int code, Datum arg); |
| static lwlock_stats * get_lwlock_stats_entry(LWLock *lock); |
| |
| static void |
| init_lwlock_stats(void) |
| { |
| HASHCTL ctl; |
| static MemoryContext lwlock_stats_cxt = NULL; |
| static bool exit_registered = false; |
| |
| if (lwlock_stats_cxt != NULL) |
| MemoryContextDelete(lwlock_stats_cxt); |
| |
| /* |
| * The LWLock stats will be updated within a critical section, which |
| * requires allocating new hash entries. Allocations within a critical |
| * section are normally not allowed because running out of memory would |
| * lead to a PANIC, but LWLOCK_STATS is debugging code that's not normally |
| * turned on in production, so that's an acceptable risk. The hash entries |
| * are small, so the risk of running out of memory is minimal in practice. |
| */ |
| lwlock_stats_cxt = AllocSetContextCreate(TopMemoryContext, |
| "LWLock stats", |
| ALLOCSET_DEFAULT_SIZES); |
| MemoryContextAllowInCriticalSection(lwlock_stats_cxt, true); |
| |
| ctl.keysize = sizeof(lwlock_stats_key); |
| ctl.entrysize = sizeof(lwlock_stats); |
| ctl.hcxt = lwlock_stats_cxt; |
| lwlock_stats_htab = hash_create("lwlock stats", 16384, &ctl, |
| HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); |
| if (!exit_registered) |
| { |
| on_shmem_exit(print_lwlock_stats, 0); |
| exit_registered = true; |
| } |
| } |
| |
| static void |
| print_lwlock_stats(int code, Datum arg) |
| { |
| HASH_SEQ_STATUS scan; |
| lwlock_stats *lwstats; |
| |
| hash_seq_init(&scan, lwlock_stats_htab); |
| |
| /* Grab an LWLock to keep different backends from mixing reports */ |
| LWLockAcquire(&MainLWLockArray[0].lock, LW_EXCLUSIVE); |
| |
| while ((lwstats = (lwlock_stats *) hash_seq_search(&scan)) != NULL) |
| { |
| fprintf(stderr, |
| "PID %d lwlock %s %p: shacq %u exacq %u blk %u spindelay %u dequeue self %u\n", |
| MyProcPid, GetLWTrancheName(lwstats->key.tranche), |
| lwstats->key.instance, lwstats->sh_acquire_count, |
| lwstats->ex_acquire_count, lwstats->block_count, |
| lwstats->spin_delay_count, lwstats->dequeue_self_count); |
| } |
| |
| LWLockRelease(&MainLWLockArray[0].lock); |
| } |
| |
| static lwlock_stats * |
| get_lwlock_stats_entry(LWLock *lock) |
| { |
| lwlock_stats_key key; |
| lwlock_stats *lwstats; |
| bool found; |
| |
| /* |
| * During shared memory initialization, the hash table doesn't exist yet. |
| * Stats of that phase aren't very interesting, so just collect operations |
| * on all locks in a single dummy entry. |
| */ |
| if (lwlock_stats_htab == NULL) |
| return &lwlock_stats_dummy; |
| |
| /* Fetch or create the entry. */ |
| MemSet(&key, 0, sizeof(key)); |
| key.tranche = lock->tranche; |
| key.instance = lock; |
| lwstats = hash_search(lwlock_stats_htab, &key, HASH_ENTER, &found); |
| if (!found) |
| { |
| lwstats->sh_acquire_count = 0; |
| lwstats->ex_acquire_count = 0; |
| lwstats->block_count = 0; |
| lwstats->dequeue_self_count = 0; |
| lwstats->spin_delay_count = 0; |
| } |
| return lwstats; |
| } |
| #endif /* LWLOCK_STATS */ |
| |
| |
| /* |
| * Compute number of LWLocks required by named tranches. These will be |
| * allocated in the main array. |
| */ |
| static int |
| NumLWLocksForNamedTranches(void) |
| { |
| int numLocks = 0; |
| int i; |
| |
| for (i = 0; i < NamedLWLockTrancheRequests; i++) |
| numLocks += NamedLWLockTrancheRequestArray[i].num_lwlocks; |
| |
| return numLocks; |
| } |
| |
| /* |
| * Compute shmem space needed for LWLocks and named tranches. |
| */ |
| Size |
| LWLockShmemSize(void) |
| { |
| Size size; |
| int i; |
| int numLocks = NUM_FIXED_LWLOCKS; |
| |
| /* Calculate total number of locks needed in the main array. */ |
| numLocks += NumLWLocksForNamedTranches(); |
| |
| /* Space for the LWLock array. */ |
| size = mul_size(numLocks, sizeof(LWLockPadded)); |
| |
| /* Space for dynamic allocation counter, plus room for alignment. */ |
| size = add_size(size, sizeof(int) + LWLOCK_PADDED_SIZE); |
| |
| /* Space for named tranches. */ |
| size = add_size(size, mul_size(NamedLWLockTrancheRequests, sizeof(NamedLWLockTranche))); |
| |
| /* Space for the name of each tranche. */ |
| for (i = 0; i < NamedLWLockTrancheRequests; i++) |
| size = add_size(size, strlen(NamedLWLockTrancheRequestArray[i].tranche_name) + 1); |
| |
| return size; |
| } |
| |
| /* |
| * Allocate shmem space for the main LWLock array and all tranches and |
| * initialize it. We also register extension LWLock tranches here. |
| */ |
| void |
| CreateLWLocks(void) |
| { |
| if (!IsUnderPostmaster) |
| { |
| Size spaceLocks = LWLockShmemSize(); |
| int *LWLockCounter; |
| char *ptr; |
| |
| /* Allocate space */ |
| ptr = (char *) ShmemAlloc(spaceLocks); |
| |
| /* Leave room for dynamic allocation of tranches */ |
| ptr += sizeof(int); |
| |
| /* Ensure desired alignment of LWLock array */ |
| ptr += LWLOCK_PADDED_SIZE - ((uintptr_t) ptr) % LWLOCK_PADDED_SIZE; |
| |
| MainLWLockArray = (LWLockPadded *) ptr; |
| |
| /* |
| * Initialize the dynamic-allocation counter for tranches, which is |
| * stored just before the first LWLock. |
| */ |
| LWLockCounter = (int *) ((char *) MainLWLockArray - sizeof(int)); |
| *LWLockCounter = LWTRANCHE_FIRST_USER_DEFINED; |
| |
| /* Initialize all LWLocks */ |
| InitializeLWLocks(); |
| } |
| |
| /* Register named extension LWLock tranches in the current process. */ |
| for (int i = 0; i < NamedLWLockTrancheRequests; i++) |
| LWLockRegisterTranche(NamedLWLockTrancheArray[i].trancheId, |
| NamedLWLockTrancheArray[i].trancheName); |
| } |
| |
| /* |
| * Initialize LWLocks that are fixed and those belonging to named tranches. |
| */ |
| static void |
| InitializeLWLocks(void) |
| { |
| int numNamedLocks = NumLWLocksForNamedTranches(); |
| int id; |
| int i; |
| int j; |
| LWLockPadded *lock; |
| |
| /* Initialize all individual LWLocks in main array */ |
| for (id = 0, lock = MainLWLockArray; id < NUM_INDIVIDUAL_LWLOCKS; id++, lock++) |
| LWLockInitialize(&lock->lock, id); |
| |
| /* Initialize buffer mapping LWLocks in main array */ |
| lock = MainLWLockArray + BUFFER_MAPPING_LWLOCK_OFFSET; |
| for (id = 0; id < NUM_BUFFER_PARTITIONS; id++, lock++) |
| LWLockInitialize(&lock->lock, LWTRANCHE_BUFFER_MAPPING); |
| |
| /* Initialize lmgrs' LWLocks in main array */ |
| lock = MainLWLockArray + LOCK_MANAGER_LWLOCK_OFFSET; |
| for (id = 0; id < NUM_LOCK_PARTITIONS; id++, lock++) |
| LWLockInitialize(&lock->lock, LWTRANCHE_LOCK_MANAGER); |
| |
| /* Initialize predicate lmgrs' LWLocks in main array */ |
| lock = MainLWLockArray + PREDICATELOCK_MANAGER_LWLOCK_OFFSET; |
| for (id = 0; id < NUM_PREDICATELOCK_PARTITIONS; id++, lock++) |
| LWLockInitialize(&lock->lock, LWTRANCHE_PREDICATE_LOCK_MANAGER); |
| |
| /* |
| * Copy the info about any named tranches into shared memory (so that |
| * other processes can see it), and initialize the requested LWLocks. |
| */ |
| if (NamedLWLockTrancheRequests > 0) |
| { |
| char *trancheNames; |
| |
| NamedLWLockTrancheArray = (NamedLWLockTranche *) |
| &MainLWLockArray[NUM_FIXED_LWLOCKS + numNamedLocks]; |
| |
| trancheNames = (char *) NamedLWLockTrancheArray + |
| (NamedLWLockTrancheRequests * sizeof(NamedLWLockTranche)); |
| lock = &MainLWLockArray[NUM_FIXED_LWLOCKS]; |
| |
| for (i = 0; i < NamedLWLockTrancheRequests; i++) |
| { |
| NamedLWLockTrancheRequest *request; |
| NamedLWLockTranche *tranche; |
| char *name; |
| |
| request = &NamedLWLockTrancheRequestArray[i]; |
| tranche = &NamedLWLockTrancheArray[i]; |
| |
| name = trancheNames; |
| trancheNames += strlen(request->tranche_name) + 1; |
| strcpy(name, request->tranche_name); |
| tranche->trancheId = LWLockNewTrancheId(); |
| tranche->trancheName = name; |
| |
| for (j = 0; j < request->num_lwlocks; j++, lock++) |
| LWLockInitialize(&lock->lock, tranche->trancheId); |
| } |
| } |
| } |
| |
| /* |
| * InitLWLockAccess - initialize backend-local state needed to hold LWLocks |
| */ |
| void |
| InitLWLockAccess(void) |
| { |
| #ifdef LWLOCK_STATS |
| init_lwlock_stats(); |
| #endif |
| } |
| |
| /* |
| * GetNamedLWLockTranche - returns the base address of the LWLocks |
| * belonging to the specified tranche. |
| * |
| * Caller needs to retrieve the requested number of LWLocks starting from |
| * the base lock address returned by this API. This can be used for |
| * tranches that are requested by using RequestNamedLWLockTranche() API. |
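| * |
| * A hypothetical usage sketch from extension code (the tranche name is |
| * invented for illustration): |
| * |
| *    LWLockPadded *locks = GetNamedLWLockTranche("my_extension"); |
| * |
| *    LWLockAcquire(&locks[0].lock, LW_EXCLUSIVE); |
| *    ... manipulate the extension's shared state ... |
| *    LWLockRelease(&locks[0].lock); |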
| */ |
| LWLockPadded * |
| GetNamedLWLockTranche(const char *tranche_name) |
| { |
| int lock_pos; |
| int i; |
| |
| /* |
| * Obtain the position of the base address of the LWLocks belonging to the |
| * requested tranche_name in MainLWLockArray. LWLocks for named tranches |
| * are placed in MainLWLockArray after the fixed locks. |
| */ |
| lock_pos = NUM_FIXED_LWLOCKS; |
| for (i = 0; i < NamedLWLockTrancheRequests; i++) |
| { |
| if (strcmp(NamedLWLockTrancheRequestArray[i].tranche_name, |
| tranche_name) == 0) |
| return &MainLWLockArray[lock_pos]; |
| |
| lock_pos += NamedLWLockTrancheRequestArray[i].num_lwlocks; |
| } |
| |
| elog(ERROR, "requested tranche is not registered"); |
| |
| /* just to keep compiler quiet */ |
| return NULL; |
| } |
| |
| /* |
| * Allocate a new tranche ID. |
| */ |
| int |
| LWLockNewTrancheId(void) |
| { |
| int result; |
| int *LWLockCounter; |
| |
| LWLockCounter = (int *) ((char *) MainLWLockArray - sizeof(int)); |
| SpinLockAcquire(ShmemLock); |
| result = (*LWLockCounter)++; |
| SpinLockRelease(ShmemLock); |
| |
| return result; |
| } |
| |
| /* |
| * Register a dynamic tranche name in the lookup table of the current process. |
| * |
| * This routine will save a pointer to the tranche name passed as an argument, |
| * so the name should be allocated in a backend-lifetime context |
| * (shared memory, TopMemoryContext, static constant, or similar). |
| * |
| * The tranche name will be user-visible as a wait event name, so try to |
| * use a name that fits the style for those. |
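| * |
| * A hypothetical registration sketch (names invented; this is typically |
| * done once per process, e.g. at extension load time): |
| * |
| *    static int my_tranche_id = 0; |
| * |
| *    my_tranche_id = LWLockNewTrancheId(); |
| *    LWLockRegisterTranche(my_tranche_id, "MyExtension"); |
| *    LWLockInitialize(&my_shared_state->lock, my_tranche_id); |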
| */ |
| void |
| LWLockRegisterTranche(int tranche_id, const char *tranche_name) |
| { |
| /* This should only be called for user-defined tranches. */ |
| if (tranche_id < LWTRANCHE_FIRST_USER_DEFINED) |
| return; |
| |
| /* Convert to array index. */ |
| tranche_id -= LWTRANCHE_FIRST_USER_DEFINED; |
| |
| /* If necessary, create or enlarge array. */ |
| if (tranche_id >= LWLockTrancheNamesAllocated) |
| { |
| int newalloc; |
| |
| newalloc = pg_nextpower2_32(Max(8, tranche_id + 1)); |
| |
| if (LWLockTrancheNames == NULL) |
| LWLockTrancheNames = (const char **) |
| MemoryContextAllocZero(TopMemoryContext, |
| newalloc * sizeof(char *)); |
| else |
| LWLockTrancheNames = |
| repalloc0_array(LWLockTrancheNames, const char *, LWLockTrancheNamesAllocated, newalloc); |
| LWLockTrancheNamesAllocated = newalloc; |
| } |
| |
| LWLockTrancheNames[tranche_id] = tranche_name; |
| } |
| |
| /* |
| * RequestNamedLWLockTranche |
| * Request that extra LWLocks be allocated during postmaster |
| * startup. |
| * |
| * This may only be called via the shmem_request_hook of a library that is |
| * loaded into the postmaster via shared_preload_libraries. Calls from |
| * elsewhere will fail. |
| * |
| * The tranche name will be user-visible as a wait event name, so try to |
| * use a name that fits the style for those. |
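| * |
| * A hypothetical request sketch from a shared_preload_libraries module's |
| * shmem_request_hook (name and lock count invented): |
| * |
| *    static void |
| *    my_shmem_request(void) |
| *    { |
| *        RequestNamedLWLockTranche("my_extension", 4); |
| *    } |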
| */ |
| void |
| RequestNamedLWLockTranche(const char *tranche_name, int num_lwlocks) |
| { |
| NamedLWLockTrancheRequest *request; |
| |
| if (!process_shmem_requests_in_progress) |
| elog(FATAL, "cannot request additional LWLocks outside shmem_request_hook"); |
| |
| if (NamedLWLockTrancheRequestArray == NULL) |
| { |
| NamedLWLockTrancheRequestsAllocated = 16; |
| NamedLWLockTrancheRequestArray = (NamedLWLockTrancheRequest *) |
| MemoryContextAlloc(TopMemoryContext, |
| NamedLWLockTrancheRequestsAllocated |
| * sizeof(NamedLWLockTrancheRequest)); |
| } |
| |
| if (NamedLWLockTrancheRequests >= NamedLWLockTrancheRequestsAllocated) |
| { |
| int i = pg_nextpower2_32(NamedLWLockTrancheRequests + 1); |
| |
| NamedLWLockTrancheRequestArray = (NamedLWLockTrancheRequest *) |
| repalloc(NamedLWLockTrancheRequestArray, |
| i * sizeof(NamedLWLockTrancheRequest)); |
| NamedLWLockTrancheRequestsAllocated = i; |
| } |
| |
| request = &NamedLWLockTrancheRequestArray[NamedLWLockTrancheRequests]; |
| Assert(strlen(tranche_name) + 1 <= NAMEDATALEN); |
| strlcpy(request->tranche_name, tranche_name, NAMEDATALEN); |
| request->num_lwlocks = num_lwlocks; |
| NamedLWLockTrancheRequests++; |
| } |
| |
| /* |
| * LWLockInitialize - initialize a new lwlock; it's initially unlocked |
| */ |
| void |
| LWLockInitialize(LWLock *lock, int tranche_id) |
| { |
| pg_atomic_init_u32(&lock->state, LW_FLAG_RELEASE_OK); |
| #ifdef LOCK_DEBUG |
| pg_atomic_init_u32(&lock->nwaiters, 0); |
| #endif |
| lock->tranche = tranche_id; |
| proclist_init(&lock->waiters); |
| } |
| |
| /* |
| * Report start of wait event for light-weight locks. |
| * |
| * This function will be used by all the light-weight lock calls which |
| * need to wait to acquire the lock. This function distinguishes wait |
| * events based on tranche and lock id. |
| */ |
| static inline void |
| LWLockReportWaitStart(LWLock *lock) |
| { |
| pgstat_report_wait_start(PG_WAIT_LWLOCK | lock->tranche); |
| } |
| |
| /* |
| * Report end of wait event for light-weight locks. |
| */ |
| static inline void |
| LWLockReportWaitEnd(void) |
| { |
| pgstat_report_wait_end(); |
| } |
| |
| /* |
| * Return the name of an LWLock tranche. |
| */ |
| static const char * |
| GetLWTrancheName(uint16 trancheId) |
| { |
| /* Individual LWLock? */ |
| if (trancheId < NUM_INDIVIDUAL_LWLOCKS) |
| return IndividualLWLockNames[trancheId]; |
| |
| /* Built-in tranche? */ |
| if (trancheId < LWTRANCHE_FIRST_USER_DEFINED) |
| return BuiltinTrancheNames[trancheId - NUM_INDIVIDUAL_LWLOCKS]; |
| |
| /* |
| * It's an extension tranche, so look in LWLockTrancheNames[]. However, |
| * it's possible that the tranche has never been registered in the current |
| * process, in which case give up and return "extension". |
| */ |
| trancheId -= LWTRANCHE_FIRST_USER_DEFINED; |
| |
| if (trancheId >= LWLockTrancheNamesAllocated || |
| LWLockTrancheNames[trancheId] == NULL) |
| return "extension"; |
| |
| return LWLockTrancheNames[trancheId]; |
| } |
| |
| /* |
| * Return an identifier for an LWLock based on the wait class and event. |
| */ |
| const char * |
| GetLWLockIdentifier(uint32 classId, uint16 eventId) |
| { |
| Assert(classId == PG_WAIT_LWLOCK); |
| /* The event IDs are just tranche numbers. */ |
| return GetLWTrancheName(eventId); |
| } |
| |
| /* |
| * Internal function that tries to atomically acquire the lwlock in the |
| * passed-in mode. |
| * |
| * This function will not block waiting for a lock to become free - that's |
| * the caller's job. |
| * |
| * Returns true if the lock isn't free and we need to wait. |
| */ |
| static bool |
| LWLockAttemptLock(LWLock *lock, LWLockMode mode) |
| { |
| uint32 old_state; |
| |
| Assert(mode == LW_EXCLUSIVE || mode == LW_SHARED); |
| |
| /* |
| * Read once outside the loop, later iterations will get the newer value |
| * via compare & exchange. |
| */ |
| old_state = pg_atomic_read_u32(&lock->state); |
| |
| /* loop until we've determined whether we could acquire the lock or not */ |
| while (true) |
| { |
| uint32 desired_state; |
| bool lock_free; |
| |
| desired_state = old_state; |
| |
| if (mode == LW_EXCLUSIVE) |
| { |
| lock_free = (old_state & LW_LOCK_MASK) == 0; |
| if (lock_free) |
| desired_state += LW_VAL_EXCLUSIVE; |
| } |
| else |
| { |
| lock_free = (old_state & LW_VAL_EXCLUSIVE) == 0; |
| if (lock_free) |
| desired_state += LW_VAL_SHARED; |
| } |
| |
| /* |
| * Attempt to swap in the state we are expecting. If we didn't see the |
| * lock as free, that's just the old value. If we saw it as free, we'll |
| * attempt to mark it acquired. The reason that we always swap in the |
| * value is that this doubles as a memory barrier. We could try to be |
| * smarter and only swap in values if we saw the lock as free, but |
| * benchmarks haven't shown that to be beneficial so far. |
| * |
| * Retry if the value changed since we last looked at it. |
| */ |
| if (pg_atomic_compare_exchange_u32(&lock->state, |
| &old_state, desired_state)) |
| { |
| if (lock_free) |
| { |
| /* Great! Got the lock. */ |
| #ifdef LOCK_DEBUG |
| if (mode == LW_EXCLUSIVE) |
| lock->owner = MyProc; |
| #endif |
| return false; |
| } |
| else |
| return true; /* somebody else has the lock */ |
| } |
| } |
| pg_unreachable(); |
| } |
| |
| /* |
| * Lock the LWLock's wait list against concurrent activity. |
| * |
| * NB: even though the wait list is locked, non-conflicting lock operations |
| * may still happen concurrently. |
| * |
| * Time spent holding the mutex should be short! |
| */ |
| static void |
| LWLockWaitListLock(LWLock *lock) |
| { |
| uint32 old_state; |
| #ifdef LWLOCK_STATS |
| lwlock_stats *lwstats; |
| uint32 delays = 0; |
| |
| lwstats = get_lwlock_stats_entry(lock); |
| #endif |
| |
| while (true) |
| { |
| /* always try once to acquire lock directly */ |
| old_state = pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_LOCKED); |
| if (!(old_state & LW_FLAG_LOCKED)) |
| break; /* got lock */ |
| |
| /* and then spin without atomic operations until lock is released */ |
| { |
| SpinDelayStatus delayStatus; |
| |
| init_local_spin_delay(&delayStatus); |
| |
| while (old_state & LW_FLAG_LOCKED) |
| { |
| perform_spin_delay(&delayStatus); |
| old_state = pg_atomic_read_u32(&lock->state); |
| } |
| #ifdef LWLOCK_STATS |
| delays += delayStatus.delays; |
| #endif |
| finish_spin_delay(&delayStatus); |
| } |
| |
| /* |
| * Retry. The lock might obviously already have been re-acquired by the |
| * time we attempt to get it again. |
| */ |
| } |
| |
| #ifdef LWLOCK_STATS |
| lwstats->spin_delay_count += delays; |
| #endif |
| } |
| |
| /* |
| * Unlock the LWLock's wait list. |
| * |
| * Note that it can be more efficient to manipulate flags and release the |
| * wait list lock in a single atomic operation, as LWLockWakeup() does. |
| */ |
| static void |
| LWLockWaitListUnlock(LWLock *lock) |
| { |
| uint32 old_state PG_USED_FOR_ASSERTS_ONLY; |
| |
| old_state = pg_atomic_fetch_and_u32(&lock->state, ~LW_FLAG_LOCKED); |
| |
| Assert(old_state & LW_FLAG_LOCKED); |
| } |
| |
| /* |
| * Wake up all the lockers that currently have a chance to acquire the lock. |
| */ |
| static void |
| LWLockWakeup(LWLock *lock) |
| { |
| bool new_release_ok; |
| bool wokeup_somebody = false; |
| proclist_head wakeup; |
| proclist_mutable_iter iter; |
| |
| proclist_init(&wakeup); |
| |
| new_release_ok = true; |
| |
| /* lock wait list while collecting backends to wake up */ |
| LWLockWaitListLock(lock); |
| |
| proclist_foreach_modify(iter, &lock->waiters, lwWaitLink) |
| { |
| PGPROC *waiter = GetPGProcByNumber(iter.cur); |
| |
| if (wokeup_somebody && waiter->lwWaitMode == LW_EXCLUSIVE) |
| continue; |
| |
| proclist_delete(&lock->waiters, iter.cur, lwWaitLink); |
| proclist_push_tail(&wakeup, iter.cur, lwWaitLink); |
| |
| if (waiter->lwWaitMode != LW_WAIT_UNTIL_FREE) |
| { |
| /* |
| * Prevent additional wakeups until the retryer gets to run. Backends |
| * that are just waiting for the lock to become free don't retry |
| * automatically. |
| */ |
| new_release_ok = false; |
| |
| /* |
| * Don't wakeup (further) exclusive locks. |
| */ |
| wokeup_somebody = true; |
| } |
| |
| /* |
| * Signal that the process isn't on the wait list anymore. This allows |
| * LWLockDequeueSelf() to remove itself from the waitlist with a |
| * proclist_delete(), rather than having to check if it has been |
| * removed from the list. |
| */ |
| Assert(waiter->lwWaiting == LW_WS_WAITING); |
| waiter->lwWaiting = LW_WS_PENDING_WAKEUP; |
| |
| /* |
| * Once we've woken up an exclusive locker, there's no point in waking |
| * up anybody else. |
| */ |
| if (waiter->lwWaitMode == LW_EXCLUSIVE) |
| break; |
| } |
| |
| Assert(proclist_is_empty(&wakeup) || pg_atomic_read_u32(&lock->state) & LW_FLAG_HAS_WAITERS); |
| |
| /* unset required flags, and release lock, in one fell swoop */ |
| { |
| uint32 old_state; |
| uint32 desired_state; |
| |
| old_state = pg_atomic_read_u32(&lock->state); |
| while (true) |
| { |
| desired_state = old_state; |
| |
| /* compute desired flags */ |
| |
| if (new_release_ok) |
| desired_state |= LW_FLAG_RELEASE_OK; |
| else |
| desired_state &= ~LW_FLAG_RELEASE_OK; |
| |
| if (proclist_is_empty(&wakeup)) |
| desired_state &= ~LW_FLAG_HAS_WAITERS; |
| |
| desired_state &= ~LW_FLAG_LOCKED; /* release lock */ |
| |
| if (pg_atomic_compare_exchange_u32(&lock->state, &old_state, |
| desired_state)) |
| break; |
| } |
| } |
| |
| /* Awaken any waiters I removed from the queue. */ |
| proclist_foreach_modify(iter, &wakeup, lwWaitLink) |
| { |
| PGPROC *waiter = GetPGProcByNumber(iter.cur); |
| |
| LOG_LWDEBUG("LWLockRelease", lock, "release waiter"); |
| proclist_delete(&wakeup, iter.cur, lwWaitLink); |
| |
| /* |
| * Guarantee that lwWaiting being unset only becomes visible once the |
| * unlink from the list has completed. Otherwise the target backend |
| * could be woken up for another reason and enqueue for a new lock - if |
| * that happens before the list unlink happens, the list would end up |
| * being corrupted. |
| * |
| * The barrier pairs with the LWLockWaitListLock() when enqueuing for |
| * another lock. |
| */ |
| pg_write_barrier(); |
| waiter->lwWaiting = LW_WS_NOT_WAITING; |
| PGSemaphoreUnlock(waiter->sem); |
| } |
| } |
| |
| /* |
| * Add ourselves to the end of the queue. |
| * |
| * NB: Mode can be LW_WAIT_UNTIL_FREE here! |
| */ |
| static void |
| LWLockQueueSelf(LWLock *lock, LWLockMode mode) |
| { |
| /* |
| * If we don't have a PGPROC structure, there's no way to wait. This |
| * should never occur, since MyProc should only be null during shared |
| * memory initialization. |
| */ |
| if (MyProc == NULL) |
| elog(PANIC, "cannot wait without a PGPROC structure"); |
| |
| if (MyProc->lwWaiting != LW_WS_NOT_WAITING) |
| elog(PANIC, "queueing for lock while waiting on another one"); |
| |
| LWLockWaitListLock(lock); |
| |
| /* setting the flag is protected by the spinlock */ |
| pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_HAS_WAITERS); |
| |
| MyProc->lwWaiting = LW_WS_WAITING; |
| MyProc->lwWaitMode = mode; |
| |
| /* LW_WAIT_UNTIL_FREE waiters are always at the front of the queue */ |
| if (mode == LW_WAIT_UNTIL_FREE) |
| proclist_push_head(&lock->waiters, MyProc->pgprocno, lwWaitLink); |
| else |
| proclist_push_tail(&lock->waiters, MyProc->pgprocno, lwWaitLink); |
| |
| /* Can release the mutex now */ |
| LWLockWaitListUnlock(lock); |
| |
| #ifdef LOCK_DEBUG |
| pg_atomic_fetch_add_u32(&lock->nwaiters, 1); |
| #endif |
| } |
| |
| /* |
| * Remove ourselves from the waitlist. |
| * |
| * This is used if we queued ourselves because we thought we needed to sleep |
| * but, after further checking, we discovered that we don't actually need to |
| * do so. |
| */ |
| static void |
| LWLockDequeueSelf(LWLock *lock) |
| { |
| bool on_waitlist; |
| |
| #ifdef LWLOCK_STATS |
| lwlock_stats *lwstats; |
| |
| lwstats = get_lwlock_stats_entry(lock); |
| |
| lwstats->dequeue_self_count++; |
| #endif |
| |
| LWLockWaitListLock(lock); |
| |
| /* |
| * Remove ourselves from the waitlist, unless we've already been removed. |
| * The removal happens with the wait list lock held, so there's no race in |
| * this check. |
| */ |
| on_waitlist = MyProc->lwWaiting == LW_WS_WAITING; |
| if (on_waitlist) |
| proclist_delete(&lock->waiters, MyProc->pgprocno, lwWaitLink); |
| |
| if (proclist_is_empty(&lock->waiters) && |
| (pg_atomic_read_u32(&lock->state) & LW_FLAG_HAS_WAITERS) != 0) |
| { |
| pg_atomic_fetch_and_u32(&lock->state, ~LW_FLAG_HAS_WAITERS); |
| } |
| |
| /* XXX: combine with fetch_and above? */ |
| LWLockWaitListUnlock(lock); |
| |
| /* clear waiting state again, nice for debugging */ |
| if (on_waitlist) |
| MyProc->lwWaiting = LW_WS_NOT_WAITING; |
| else |
| { |
| int extraWaits = 0; |
| |
| /* |
| * Somebody else dequeued us and has or will wake us up. Deal with the |
| * superfluous absorption of a wakeup. |
| */ |
| |
| /* |
| * Reset RELEASE_OK flag if somebody woke us before we removed |
| * ourselves - they'll have set it to false. |
| */ |
| pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_RELEASE_OK); |
| |
| /* |
| * Now wait for the scheduled wakeup, otherwise our ->lwWaiting would |
| * get reset at some inconvenient point later. Most of the time this |
| * will immediately return. |
| */ |
| for (;;) |
| { |
| PGSemaphoreLock(MyProc->sem); |
| if (MyProc->lwWaiting == LW_WS_NOT_WAITING) |
| break; |
| extraWaits++; |
| } |
| |
| /* |
| * Fix the process wait semaphore's count for any absorbed wakeups. |
| */ |
| while (extraWaits-- > 0) |
| PGSemaphoreUnlock(MyProc->sem); |
| } |
| |
| #ifdef LOCK_DEBUG |
| { |
| /* not waiting anymore */ |
| uint32 nwaiters PG_USED_FOR_ASSERTS_ONLY = pg_atomic_fetch_sub_u32(&lock->nwaiters, 1); |
| |
| Assert(nwaiters < MAX_BACKENDS); |
| } |
| #endif |
| } |
| |
| /* |
| * LWLockAcquire - acquire a lightweight lock in the specified mode |
| * |
| * If the lock is not available, sleep until it is. Returns true if the lock |
| * was available immediately, false if we had to sleep. |
| * |
| * Side effect: cancel/die interrupts are held off until lock release. |
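| * |
| * The canonical calling pattern is a sketch like the following ('lock' |
| * stands for any LWLock pointer): |
| * |
| *    LWLockAcquire(lock, LW_SHARED); |
| *    ... read the shared data structure ... |
| *    LWLockRelease(lock); |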
| */ |
| bool |
| LWLockAcquire(LWLock *lock, LWLockMode mode) |
| { |
| PGPROC *proc = MyProc; |
| bool result = true; |
| int extraWaits = 0; |
| #ifdef LWLOCK_STATS |
| lwlock_stats *lwstats; |
| |
| lwstats = get_lwlock_stats_entry(lock); |
| #endif |
| |
| Assert(mode == LW_SHARED || mode == LW_EXCLUSIVE); |
| |
| PRINT_LWDEBUG("LWLockAcquire", lock, mode); |
| |
| #ifdef LWLOCK_STATS |
| /* Count lock acquisition attempts */ |
| if (mode == LW_EXCLUSIVE) |
| lwstats->ex_acquire_count++; |
| else |
| lwstats->sh_acquire_count++; |
| #endif /* LWLOCK_STATS */ |
| |
| /* |
| * We can't wait if we haven't got a PGPROC. This should only occur |
| * during bootstrap or shared memory initialization. Put an Assert here |
| * to catch unsafe coding practices. |
| */ |
| Assert(!(proc == NULL && IsUnderPostmaster)); |
| |
| /* Ensure we will have room to remember the lock */ |
| if (num_held_lwlocks >= MAX_SIMUL_LWLOCKS) |
| elog(ERROR, "too many LWLocks taken"); |
| |
| /* |
| * Lock out cancel/die interrupts until we exit the code section protected |
| * by the LWLock. This ensures that interrupts will not interfere with |
| * manipulations of data structures in shared memory. |
| */ |
| HOLD_INTERRUPTS(); |
| |
| /* |
| * Loop here to try to acquire lock after each time we are signaled by |
| * LWLockRelease. |
| * |
| * NOTE: it might seem better to have LWLockRelease actually grant us the |
| * lock, rather than retrying and possibly having to go back to sleep. But |
| * in practice that is no good because it means a process swap for every |
| * lock acquisition when two or more processes are contending for the same |
| * lock. Since LWLocks are normally used to protect not-very-long |
| * sections of computation, a process needs to be able to acquire and |
| * release the same lock many times during a single CPU time slice, even |
| * in the presence of contention. The efficiency of being able to do that |
| * outweighs the inefficiency of sometimes wasting a process dispatch |
| * cycle because the lock is not free when a released waiter finally gets |
| * to run. See pgsql-hackers archives for 29-Dec-01. |
| */ |
| for (;;) |
| { |
| bool mustwait; |
| |
| /* |
| * Try to grab the lock the first time, we're not in the waitqueue |
| * yet/anymore. |
| */ |
| mustwait = LWLockAttemptLock(lock, mode); |
| |
| if (!mustwait) |
| { |
| LOG_LWDEBUG("LWLockAcquire", lock, "immediately acquired lock"); |
| break; /* got the lock */ |
| } |
| |
| /* |
| * Ok, at this point we couldn't grab the lock on the first try. We |
| * cannot simply queue ourselves to the end of the list and wait to be |
| * woken up because by now the lock could long since have been released. |
| * Instead add us to the queue and try to grab the lock again. If we |
| * succeed we need to revert the queuing and be happy, otherwise we |
| * recheck the lock. If we still couldn't grab it, we know that the |
| * other locker will see our queue entries when releasing since they |
| * existed before we checked for the lock. |
| */ |
| |
| /* add to the queue */ |
| LWLockQueueSelf(lock, mode); |
| |
| /* we're now guaranteed to be woken up if necessary */ |
| mustwait = LWLockAttemptLock(lock, mode); |
| |
| /* ok, grabbed the lock the second time round, need to undo queueing */ |
| if (!mustwait) |
| { |
| LOG_LWDEBUG("LWLockAcquire", lock, "acquired, undoing queue"); |
| |
| LWLockDequeueSelf(lock); |
| break; |
| } |
| |
| /* |
| * Wait until awakened. |
| * |
| * It is possible that we get awakened for a reason other than being |
| * signaled by LWLockRelease. If so, loop back and wait again. Once |
| * we've gotten the LWLock, re-increment the sema by the number of |
| * additional signals received. |
| */ |
| LOG_LWDEBUG("LWLockAcquire", lock, "waiting"); |
| |
| #ifdef LWLOCK_STATS |
| lwstats->block_count++; |
| #endif |
| |
| LWLockReportWaitStart(lock); |
| if (TRACE_POSTGRESQL_LWLOCK_WAIT_START_ENABLED()) |
| TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), mode); |
| |
| for (;;) |
| { |
| PGSemaphoreLock(proc->sem); |
| if (proc->lwWaiting == LW_WS_NOT_WAITING) |
| break; |
| extraWaits++; |
| } |
| |
| /* Retrying, allow LWLockRelease to release waiters again. */ |
| pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_RELEASE_OK); |
| |
| #ifdef LOCK_DEBUG |
| { |
| /* not waiting anymore */ |
| uint32 nwaiters PG_USED_FOR_ASSERTS_ONLY = pg_atomic_fetch_sub_u32(&lock->nwaiters, 1); |
| |
| Assert(nwaiters < MAX_BACKENDS); |
| } |
| #endif |
| |
| if (TRACE_POSTGRESQL_LWLOCK_WAIT_DONE_ENABLED()) |
| TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(T_NAME(lock), mode); |
| LWLockReportWaitEnd(); |
| |
| LOG_LWDEBUG("LWLockAcquire", lock, "awakened"); |
| |
| /* Now loop back and try to acquire lock again. */ |
| result = false; |
| } |
| |
| if (TRACE_POSTGRESQL_LWLOCK_ACQUIRE_ENABLED()) |
| TRACE_POSTGRESQL_LWLOCK_ACQUIRE(T_NAME(lock), mode); |
| |
| /* Add lock to list of locks held by this backend */ |
| held_lwlocks[num_held_lwlocks].lock = lock; |
| held_lwlocks[num_held_lwlocks++].mode = mode; |
| |
| /* |
| * Fix the process wait semaphore's count for any absorbed wakeups. |
| */ |
| while (extraWaits-- > 0) |
| PGSemaphoreUnlock(proc->sem); |
| |
| return result; |
| } |
| |
| /* |
| * LWLockConditionalAcquire - acquire a lightweight lock in the specified mode |
| * |
| * If the lock is not available, return false with no side-effects. |
| * |
| * If successful, cancel/die interrupts are held off until lock release. |
| */ |
| bool |
| LWLockConditionalAcquire(LWLock *lock, LWLockMode mode) |
| { |
| bool mustwait; |
| |
| Assert(mode == LW_SHARED || mode == LW_EXCLUSIVE); |
| |
| PRINT_LWDEBUG("LWLockConditionalAcquire", lock, mode); |
| |
| /* Ensure we will have room to remember the lock */ |
| if (num_held_lwlocks >= MAX_SIMUL_LWLOCKS) |
| elog(ERROR, "too many LWLocks taken"); |
| |
| /* |
| * Lock out cancel/die interrupts until we exit the code section protected |
| * by the LWLock. This ensures that interrupts will not interfere with |
| * manipulations of data structures in shared memory. |
| */ |
| HOLD_INTERRUPTS(); |
| |
| /* Check for the lock */ |
| mustwait = LWLockAttemptLock(lock, mode); |
| |
| if (mustwait) |
| { |
| /* Failed to get lock, so release interrupt holdoff */ |
| RESUME_INTERRUPTS(); |
| |
| LOG_LWDEBUG("LWLockConditionalAcquire", lock, "failed"); |
| if (TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE_FAIL_ENABLED()) |
| TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE_FAIL(T_NAME(lock), mode); |
| } |
| else |
| { |
| /* Add lock to list of locks held by this backend */ |
| held_lwlocks[num_held_lwlocks].lock = lock; |
| held_lwlocks[num_held_lwlocks++].mode = mode; |
| if (TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE_ENABLED()) |
| TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE(T_NAME(lock), mode); |
| } |
| return !mustwait; |
| } |
| |
| /* |
| * LWLockAcquireOrWait - Acquire lock, or wait until it's free |
| * |
| * The semantics of this function are a bit funky. If the lock is currently |
| * free, it is acquired in the given mode, and the function returns true. If |
| * the lock isn't immediately free, the function waits until it is released |
| * and returns false, but does not acquire the lock. |
| * |
| * This is currently used for WALWriteLock: when a backend flushes the WAL, |
| * holding WALWriteLock, it can flush the commit records of many other |
| * backends as a side-effect. Those other backends need to wait until the |
| * flush finishes, but don't need to acquire the lock anymore. They can just |
| * wake up, observe that their records have already been flushed, and return. |
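| * |
| * Schematically (a simplified sketch, not the actual xlog.c code): |
| * |
| *    if (LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE)) |
| *    { |
| *        ... flush WAL up to the requested point ... |
| *        LWLockRelease(WALWriteLock); |
| *    } |
| *    else |
| *        ... recheck whether our records were flushed by someone else ... |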
| */ |
| bool |
| LWLockAcquireOrWait(LWLock *lock, LWLockMode mode) |
| { |
| PGPROC *proc = MyProc; |
| bool mustwait; |
| int extraWaits = 0; |
| #ifdef LWLOCK_STATS |
| lwlock_stats *lwstats; |
| |
| lwstats = get_lwlock_stats_entry(lock); |
| #endif |
| |
| Assert(mode == LW_SHARED || mode == LW_EXCLUSIVE); |
| |
| PRINT_LWDEBUG("LWLockAcquireOrWait", lock, mode); |
| |
| /* Ensure we will have room to remember the lock */ |
| if (num_held_lwlocks >= MAX_SIMUL_LWLOCKS) |
| elog(ERROR, "too many LWLocks taken"); |
| |
| /* |
| * Lock out cancel/die interrupts until we exit the code section protected |
| * by the LWLock. This ensures that interrupts will not interfere with |
| * manipulations of data structures in shared memory. |
| */ |
| HOLD_INTERRUPTS(); |
| |
| /* |
| * NB: We're using nearly the same twice-in-a-row lock acquisition |
| * protocol as LWLockAcquire(). Check its comments for details. |
| */ |
| mustwait = LWLockAttemptLock(lock, mode); |
| |
| if (mustwait) |
| { |
| LWLockQueueSelf(lock, LW_WAIT_UNTIL_FREE); |
| |
| mustwait = LWLockAttemptLock(lock, mode); |
| |
| if (mustwait) |
| { |
| /* |
| * Wait until awakened. Like in LWLockAcquire, be prepared for |
| * bogus wakeups. |
| */ |
| LOG_LWDEBUG("LWLockAcquireOrWait", lock, "waiting"); |
| |
| #ifdef LWLOCK_STATS |
| lwstats->block_count++; |
| #endif |
| |
| LWLockReportWaitStart(lock); |
| if (TRACE_POSTGRESQL_LWLOCK_WAIT_START_ENABLED()) |
| TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), mode); |
| |
| for (;;) |
| { |
| PGSemaphoreLock(proc->sem); |
| if (proc->lwWaiting == LW_WS_NOT_WAITING) |
| break; |
| extraWaits++; |
| } |
| |
| #ifdef LOCK_DEBUG |
| { |
| /* not waiting anymore */ |
| uint32 nwaiters PG_USED_FOR_ASSERTS_ONLY = pg_atomic_fetch_sub_u32(&lock->nwaiters, 1); |
| |
| Assert(nwaiters < MAX_BACKENDS); |
| } |
| #endif |
| if (TRACE_POSTGRESQL_LWLOCK_WAIT_DONE_ENABLED()) |
| TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(T_NAME(lock), mode); |
| LWLockReportWaitEnd(); |
| |
| LOG_LWDEBUG("LWLockAcquireOrWait", lock, "awakened"); |
| } |
| else |
| { |
| LOG_LWDEBUG("LWLockAcquireOrWait", lock, "acquired, undoing queue"); |
| |
| /* |
| * Got lock in the second attempt, undo queueing. We need to treat |
| * this as having successfully acquired the lock, otherwise we'd |
| * not necessarily wake up people we've prevented from acquiring |
| * the lock. |
| */ |
| LWLockDequeueSelf(lock); |
| } |
| } |
| |
| /* |
| * Fix the process wait semaphore's count for any absorbed wakeups. |
| */ |
| while (extraWaits-- > 0) |
| PGSemaphoreUnlock(proc->sem); |
| |
| if (mustwait) |
| { |
| /* Failed to get lock, so release interrupt holdoff */ |
| RESUME_INTERRUPTS(); |
| LOG_LWDEBUG("LWLockAcquireOrWait", lock, "failed"); |
| if (TRACE_POSTGRESQL_LWLOCK_ACQUIRE_OR_WAIT_FAIL_ENABLED()) |
| TRACE_POSTGRESQL_LWLOCK_ACQUIRE_OR_WAIT_FAIL(T_NAME(lock), mode); |
| } |
| else |
| { |
| LOG_LWDEBUG("LWLockAcquireOrWait", lock, "succeeded"); |
| /* Add lock to list of locks held by this backend */ |
| held_lwlocks[num_held_lwlocks].lock = lock; |
| held_lwlocks[num_held_lwlocks++].mode = mode; |
| if (TRACE_POSTGRESQL_LWLOCK_ACQUIRE_OR_WAIT_ENABLED()) |
| TRACE_POSTGRESQL_LWLOCK_ACQUIRE_OR_WAIT(T_NAME(lock), mode); |
| } |
| |
| return !mustwait; |
| } |
| |
| /* |
| * Does the lwlock in its current state need to wait for the variable value to |
| * change? |
| * |
| * If we don't need to wait, and it's because the value of the variable has |
| * changed, store the current value in *newval. |
| * |
| * *result is set to true if the lock was free, and false otherwise. |
| */ |
| static bool |
| LWLockConflictsWithVar(LWLock *lock, |
| uint64 *valptr, uint64 oldval, uint64 *newval, |
| bool *result) |
| { |
| bool mustwait; |
| uint64 value; |
| |
| /* |
| * Test first to see if the lock is free right now. |
| * |
| * XXX: the caller uses a spinlock before this, so we don't need a memory |
| * barrier here as far as the current usage is concerned. But that might |
| * not be safe in general. |
| */ |
| mustwait = (pg_atomic_read_u32(&lock->state) & LW_VAL_EXCLUSIVE) != 0; |
| |
| if (!mustwait) |
| { |
| *result = true; |
| return false; |
| } |
| |
| *result = false; |
| |
| /* |
| * Read value using the lwlock's wait list lock, as we can't generally |
| * rely on atomic 64 bit reads/stores. TODO: On platforms with a way to |
| * do atomic 64 bit reads/writes the spinlock should be optimized away. |
| */ |
| LWLockWaitListLock(lock); |
| value = *valptr; |
| LWLockWaitListUnlock(lock); |
| |
| if (value != oldval) |
| { |
| mustwait = false; |
| *newval = value; |
| } |
| else |
| { |
| mustwait = true; |
| } |
| |
| return mustwait; |
| } |
| |
| /* |
| * LWLockWaitForVar - Wait until lock is free, or a variable is updated. |
| * |
| * If the lock is held and *valptr equals oldval, waits until the lock is |
| * either freed, or the lock holder updates *valptr by calling |
| * LWLockUpdateVar. If the lock is free on exit (immediately or after |
| * waiting), returns true. If the lock is still held, but *valptr no longer |
| * matches oldval, returns false and sets *newval to the current value in |
| * *valptr. |
| * |
| * Note: this function ignores shared lock holders; if the lock is held |
| * in shared mode, returns 'true'. |
| */ |
| bool |
| LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval) |
| { |
| PGPROC *proc = MyProc; |
| int extraWaits = 0; |
| bool result = false; |
| #ifdef LWLOCK_STATS |
| lwlock_stats *lwstats; |
| |
| lwstats = get_lwlock_stats_entry(lock); |
| #endif |
| |
| PRINT_LWDEBUG("LWLockWaitForVar", lock, LW_WAIT_UNTIL_FREE); |
| |
| /* |
| * Lock out cancel/die interrupts while we sleep on the lock. There is no |
| * cleanup mechanism to remove us from the wait queue if we got |
| * interrupted. |
| */ |
| HOLD_INTERRUPTS(); |
| |
| /* |
| * Loop here to check the lock's status after each time we are signaled. |
| */ |
| for (;;) |
| { |
| bool mustwait; |
| |
| mustwait = LWLockConflictsWithVar(lock, valptr, oldval, newval, |
| &result); |
| |
| if (!mustwait) |
| break; /* the lock was free or value didn't match */ |
| |
| /* |
| * Add myself to the wait queue. Note that this is racy: the lock could |
| * be released, or the variable updated, before we're finished queueing. |
| * NB: We're using nearly the same twice-in-a-row lock acquisition |
| * protocol as LWLockAcquire(). Check its comments for details. The only |
| * difference is that we also have to check the variable's values when |
| * checking the state of the lock. |
| */ |
| LWLockQueueSelf(lock, LW_WAIT_UNTIL_FREE); |
| |
| /* |
| * Set RELEASE_OK flag, to make sure we get woken up as soon as the |
| * lock is released. |
| */ |
| pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_RELEASE_OK); |
| |
| /* |
| * We're now guaranteed to be woken up if necessary. Recheck the lock |
| * and variables state. |
| */ |
| mustwait = LWLockConflictsWithVar(lock, valptr, oldval, newval, |
| &result); |
| |
| /* Ok, no conflict after we queued ourselves. Undo queueing. */ |
| if (!mustwait) |
| { |
| LOG_LWDEBUG("LWLockWaitForVar", lock, "free, undoing queue"); |
| |
| LWLockDequeueSelf(lock); |
| break; |
| } |
| |
| /* |
| * Wait until awakened. |
| * |
| * It is possible that we get awakened for a reason other than being |
| * signaled by LWLockRelease. If so, loop back and wait again. Once |
| * we've gotten the LWLock, re-increment the sema by the number of |
| * additional signals received. |
| */ |
| LOG_LWDEBUG("LWLockWaitForVar", lock, "waiting"); |
| |
| #ifdef LWLOCK_STATS |
| lwstats->block_count++; |
| #endif |
| |
| LWLockReportWaitStart(lock); |
| if (TRACE_POSTGRESQL_LWLOCK_WAIT_START_ENABLED()) |
| TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), LW_EXCLUSIVE); |
| |
| for (;;) |
| { |
| PGSemaphoreLock(proc->sem); |
| if (proc->lwWaiting == LW_WS_NOT_WAITING) |
| break; |
| extraWaits++; |
| } |
| |
| #ifdef LOCK_DEBUG |
| { |
| /* not waiting anymore */ |
| uint32 nwaiters PG_USED_FOR_ASSERTS_ONLY = pg_atomic_fetch_sub_u32(&lock->nwaiters, 1); |
| |
| Assert(nwaiters < MAX_BACKENDS); |
| } |
| #endif |
| |
| if (TRACE_POSTGRESQL_LWLOCK_WAIT_DONE_ENABLED()) |
| TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(T_NAME(lock), LW_EXCLUSIVE); |
| LWLockReportWaitEnd(); |
| |
| LOG_LWDEBUG("LWLockWaitForVar", lock, "awakened"); |
| |
| /* Now loop back and check the status of the lock again. */ |
| } |
| |
| /* |
| * Fix the process wait semaphore's count for any absorbed wakeups. |
| */ |
| while (extraWaits-- > 0) |
| PGSemaphoreUnlock(proc->sem); |
| |
| /* |
| * Now okay to allow cancel/die interrupts. |
| */ |
| RESUME_INTERRUPTS(); |
| |
| return result; |
| } |
| |
| |
| /* |
| * LWLockUpdateVar - Update a variable and wake up waiters atomically |
| * |
| * Sets *valptr to 'val', and wakes up all processes waiting for us with |
| * LWLockWaitForVar(). Setting the value and waking up the processes happen |
| * atomically so that any process calling LWLockWaitForVar() on the same lock |
| * is guaranteed to see the new value, and act accordingly. |
| * |
| * The caller must be holding the lock in exclusive mode. |
| */ |
| void |
| LWLockUpdateVar(LWLock *lock, uint64 *valptr, uint64 val) |
| { |
| proclist_head wakeup; |
| proclist_mutable_iter iter; |
| |
| PRINT_LWDEBUG("LWLockUpdateVar", lock, LW_EXCLUSIVE); |
| |
| proclist_init(&wakeup); |
| |
| LWLockWaitListLock(lock); |
| |
| Assert(pg_atomic_read_u32(&lock->state) & LW_VAL_EXCLUSIVE); |
| |
| /* Update the lock's value */ |
| *valptr = val; |
| |
| /* |
| * See if there are any LW_WAIT_UNTIL_FREE waiters that need to be woken |
| * up. They are always in the front of the queue. |
| */ |
| proclist_foreach_modify(iter, &lock->waiters, lwWaitLink) |
| { |
| PGPROC *waiter = GetPGProcByNumber(iter.cur); |
| |
| if (waiter->lwWaitMode != LW_WAIT_UNTIL_FREE) |
| break; |
| |
| proclist_delete(&lock->waiters, iter.cur, lwWaitLink); |
| proclist_push_tail(&wakeup, iter.cur, lwWaitLink); |
| |
| /* see LWLockWakeup() */ |
| Assert(waiter->lwWaiting == LW_WS_WAITING); |
| waiter->lwWaiting = LW_WS_PENDING_WAKEUP; |
| } |
| |
| /* We are done updating shared state of the lock itself. */ |
| LWLockWaitListUnlock(lock); |
| |
| /* |
| * Awaken any waiters I removed from the queue. |
| */ |
| proclist_foreach_modify(iter, &wakeup, lwWaitLink) |
| { |
| PGPROC *waiter = GetPGProcByNumber(iter.cur); |
| |
| proclist_delete(&wakeup, iter.cur, lwWaitLink); |
| /* check comment in LWLockWakeup() about this barrier */ |
| pg_write_barrier(); |
| waiter->lwWaiting = LW_WS_NOT_WAITING; |
| PGSemaphoreUnlock(waiter->sem); |
| } |
| } |
| |
| |
| /* |
| * LWLockRelease - release a previously acquired lock |
| */ |
| void |
| LWLockRelease(LWLock *lock) |
| { |
| LWLockMode mode; |
| uint32 oldstate; |
| bool check_waiters; |
| int i; |
| |
| /* |
| * Remove lock from list of locks held. Usually, but not always, it will |
| * be the latest-acquired lock; so search array backwards. |
| */ |
| for (i = num_held_lwlocks; --i >= 0;) |
| if (lock == held_lwlocks[i].lock) |
| break; |
| |
| if (i < 0) |
| elog(ERROR, "lock %s is not held", T_NAME(lock)); |
| |
| mode = held_lwlocks[i].mode; |
| |
| num_held_lwlocks--; |
| for (; i < num_held_lwlocks; i++) |
| held_lwlocks[i] = held_lwlocks[i + 1]; |
| |
| PRINT_LWDEBUG("LWLockRelease", lock, mode); |
| |
| /* |
| * Release my hold on the lock; after that it can immediately be acquired |
| * by others, even if we still have to wake up other waiters. |
| */ |
| if (mode == LW_EXCLUSIVE) |
| oldstate = pg_atomic_sub_fetch_u32(&lock->state, LW_VAL_EXCLUSIVE); |
| else |
| oldstate = pg_atomic_sub_fetch_u32(&lock->state, LW_VAL_SHARED); |
| |
| /* nobody else can have that kind of lock */ |
| Assert(!(oldstate & LW_VAL_EXCLUSIVE)); |
| |
| if (TRACE_POSTGRESQL_LWLOCK_RELEASE_ENABLED()) |
| TRACE_POSTGRESQL_LWLOCK_RELEASE(T_NAME(lock)); |
| |
| /* |
| * We're still waiting for backends to get scheduled, don't wake them up |
| * again. |
| */ |
| if ((oldstate & (LW_FLAG_HAS_WAITERS | LW_FLAG_RELEASE_OK)) == |
| (LW_FLAG_HAS_WAITERS | LW_FLAG_RELEASE_OK) && |
| (oldstate & LW_LOCK_MASK) == 0) |
| check_waiters = true; |
| else |
| check_waiters = false; |
| |
| /* |
| * As waking up waiters requires the spinlock to be acquired, only do so |
| * if necessary. |
| */ |
| if (check_waiters) |
| { |
| /* XXX: remove before commit? */ |
| LOG_LWDEBUG("LWLockRelease", lock, "releasing waiters"); |
| LWLockWakeup(lock); |
| } |
| |
| /* |
| * Now okay to allow cancel/die interrupts. |
| */ |
| RESUME_INTERRUPTS(); |
| } |
| |
| /* |
| * LWLockReleaseClearVar - release a previously acquired lock, reset variable |
| */ |
| void |
| LWLockReleaseClearVar(LWLock *lock, uint64 *valptr, uint64 val) |
| { |
| LWLockWaitListLock(lock); |
| |
| /* |
| * Set the variable's value before releasing the lock; that prevents a |
| * race condition wherein a new locker acquires the lock, but hasn't yet |
| * set the variable's value. |
| */ |
| *valptr = val; |
| LWLockWaitListUnlock(lock); |
| |
| LWLockRelease(lock); |
| } |
| |
| |
| /* |
| * LWLockReleaseAll - release all currently-held locks |
| * |
| * Used to clean up after ereport(ERROR). An important difference between this |
| * function and retail LWLockRelease calls is that InterruptHoldoffCount is |
| * unchanged by this operation. This is necessary since InterruptHoldoffCount |
| * has been set to an appropriate level earlier in error recovery. We could |
| * decrement it below zero if we allow it to drop for each released lock! |
| */ |
| void |
| LWLockReleaseAll(void) |
| { |
| while (num_held_lwlocks > 0) |
| { |
| HOLD_INTERRUPTS(); /* match the upcoming RESUME_INTERRUPTS */ |
| |
| LWLockRelease(held_lwlocks[num_held_lwlocks - 1].lock); |
| } |
| } |
| |
| |
| /* |
| * LWLockHeldByMe - test whether my process holds a lock in any mode |
| * |
| * This is meant as debug support only. |
| */ |
| bool |
| LWLockHeldByMe(LWLock *lock) |
| { |
| int i; |
| |
| for (i = 0; i < num_held_lwlocks; i++) |
| { |
| if (held_lwlocks[i].lock == lock) |
| return true; |
| } |
| return false; |
| } |
| |
| /* |
| * LWLockAnyHeldByMe - test whether my process holds any of an array of locks |
| * |
| * This is meant as debug support only. |
| */ |
| bool |
| LWLockAnyHeldByMe(LWLock *lock, int nlocks, size_t stride) |
| { |
| char *held_lock_addr; |
| char *begin; |
| char *end; |
| int i; |
| |
| begin = (char *) lock; |
| end = begin + nlocks * stride; |
| for (i = 0; i < num_held_lwlocks; i++) |
| { |
| held_lock_addr = (char *) held_lwlocks[i].lock; |
| if (held_lock_addr >= begin && |
| held_lock_addr < end && |
| (held_lock_addr - begin) % stride == 0) |
| return true; |
| } |
| return false; |
| } |
| |
| /* |
| * LWLockHeldByMeInMode - test whether my process holds a lock in given mode |
| * |
| * This is meant as debug support only. |
| */ |
| bool |
| LWLockHeldByMeInMode(LWLock *lock, LWLockMode mode) |
| { |
| int i; |
| |
| for (i = 0; i < num_held_lwlocks; i++) |
| { |
| if (held_lwlocks[i].lock == lock && held_lwlocks[i].mode == mode) |
| return true; |
| } |
| return false; |
| } |