/*-------------------------------------------------------------------------
 *
 * freelist.c
 *	  routines for managing the buffer pool's replacement strategy.
 *
 *
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/storage/buffer/freelist.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "port/atomics.h"
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
#include "storage/proc.h"

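/*
 * Read an int variable exactly once, through a volatile-qualified pointer,
 * so the compiler cannot cache the value or fetch it twice.  Used below for
 * lockless reads of fields that other processes update concurrently.
 */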
#define INT_ACCESS_ONCE(var)	((int) (*((volatile int *) &(var))))


/*
 * The shared freelist control information.
 */
typedef struct
{
	/* Spinlock: protects the values below */
	slock_t		buffer_strategy_lock;

	/*
	 * Clock sweep hand: index of next buffer to consider grabbing. Note that
	 * this isn't a concrete buffer - we only ever increase the value. So, to
	 * get an actual buffer, it needs to be used modulo NBuffers.
	 */
	pg_atomic_uint32 nextVictimBuffer;

	int			firstFreeBuffer;	/* Head of list of unused buffers */
	int			lastFreeBuffer; /* Tail of list of unused buffers */

	/*
	 * NOTE: lastFreeBuffer is undefined when firstFreeBuffer is -1 (that is,
	 * when the list is empty)
	 */

	/*
	 * Statistics.  These counters should be wide enough that they can't
	 * overflow during a single bgwriter cycle.
	 */
	uint32		completePasses; /* Complete cycles of the clock sweep */
	pg_atomic_uint32 numBufferAllocs;	/* Buffers allocated since last reset */

	/*
	 * Bgworker process to be notified upon activity or -1 if none. See
	 * StrategyNotifyBgWriter.
	 */
	int			bgwprocno;
} BufferStrategyControl;

/* Pointer to shared state */
static BufferStrategyControl *StrategyControl = NULL;

/*
 * Private (non-shared) state for managing a ring of shared buffers to re-use.
 * This is currently the only kind of BufferAccessStrategy object, but someday
 * we might have more kinds.
 */
typedef struct BufferAccessStrategyData
{
	/* Overall strategy type */
	BufferAccessStrategyType btype;
	/* Number of elements in buffers[] array */
	int			ring_size;

	/*
	 * Index of the "current" slot in the ring, ie, the one most recently
	 * returned by GetBufferFromRing.
	 */
	int			current;

	/*
	 * True if the buffer just returned by StrategyGetBuffer had been in the
	 * ring already.
	 */
	bool		current_was_in_ring;

	/*
	 * Array of buffer numbers.  InvalidBuffer (that is, zero) indicates we
	 * have not yet selected a buffer for this ring slot.  For allocation
	 * simplicity this is palloc'd together with the fixed fields of the
	 * struct.
	 */
	Buffer		buffers[FLEXIBLE_ARRAY_MEMBER];
} BufferAccessStrategyData;


/* Prototypes for internal functions */
static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy,
									 uint32 *buf_state);
static void AddBufferToRing(BufferAccessStrategy strategy,
							BufferDesc *buf);

/*
 * ClockSweepTick - Helper routine for StrategyGetBuffer()
 *
 * Move the clock hand one buffer ahead of its current position and return the
 * id of the buffer now under the hand.
 */
static inline uint32
ClockSweepTick(void)
{
	uint32		victim;

	/*
	 * Atomically move the hand ahead one buffer - if several processes are
	 * doing this concurrently, buffers can be returned slightly out of
	 * apparent order.
	 */
	victim =
		pg_atomic_fetch_add_u32(&StrategyControl->nextVictimBuffer, 1);

	if (victim >= NBuffers)
	{
		uint32		originalVictim = victim;

		/* always wrap what we look up in BufferDescriptors */
		victim = victim % NBuffers;

		/*
		 * If we're the one that just caused a wraparound, force
		 * completePasses to be incremented while holding the spinlock. We
		 * need the spinlock so StrategySyncStart() can return a consistent
		 * value consisting of nextVictimBuffer and completePasses.
		 */
		if (victim == 0)
		{
			uint32		expected;
			uint32		wrapped;
			bool		success = false;

			expected = originalVictim + 1;

			while (!success)
			{
				/*
				 * Acquire the spinlock while increasing completePasses. That
				 * allows other readers to read nextVictimBuffer and
				 * completePasses in a consistent manner which is required for
				 * StrategySyncStart().  In theory delaying the increment
				 * could lead to an overflow of nextVictimBuffer, but that's
				 * highly unlikely and wouldn't be particularly harmful.
				 */
				SpinLockAcquire(&StrategyControl->buffer_strategy_lock);

				wrapped = expected % NBuffers;

				success = pg_atomic_compare_exchange_u32(&StrategyControl->nextVictimBuffer,
														 &expected, wrapped);
				if (success)
					StrategyControl->completePasses++;
				SpinLockRelease(&StrategyControl->buffer_strategy_lock);
			}
		}
	}
	return victim;
}
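
/*
 * Illustrative trace (not part of the implementation): with a hypothetical
 * NBuffers of 4, successive ClockSweepTick() calls see the raw counter
 * return 0, 1, 2, 3, 4, ... while the returned victim ids are 0, 1, 2, 3,
 * 0, ...  The caller that fetched raw value 4 computes victim == 0, so it
 * is the one that CASes nextVictimBuffer from 5 down to 1 and increments
 * completePasses under the spinlock.
 */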

/*
 * have_free_buffer -- a lockless check to see if there is a free buffer in
 *					   the buffer pool.
 *
 * Because no lock is held, a true result may already be stale by the time
 * the caller acts on it: another backend can empty the freelist at any
 * moment.  Callers that strictly require a free buffer must not rely on
 * this check.
 */
bool
have_free_buffer(void)
{
	if (StrategyControl->firstFreeBuffer >= 0)
		return true;
	else
		return false;
}
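
/*
 * Minimal usage sketch (illustrative only): a prewarming worker could use
 * the check as a cheap stop condition, tolerating a stale answer:
 *
 *		while (have_free_buffer() && blkno < nblocks)
 *			(void) ReadBufferExtended(rel, MAIN_FORKNUM, blkno++,
 *									  RBM_NORMAL, NULL);
 *
 * rel, blkno and nblocks are placeholders; the point is only that callers
 * must treat the result as advisory, never as a reservation.
 */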

/*
 * StrategyGetBuffer
 *
 *	Called by the bufmgr to get the next candidate buffer to use in
 *	BufferAlloc(). The only hard requirement BufferAlloc() has is that
 *	the selected buffer must not currently be pinned by anyone.
 *
 *	strategy is a BufferAccessStrategy object, or NULL for default strategy.
 *
 *	To ensure that no one else can pin the buffer before we do, we must
 *	return the buffer with the buffer header spinlock still held.
 */
BufferDesc *
StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state)
{
	BufferDesc *buf;
	int			bgwprocno;
	int			trycounter;
	uint32		local_buf_state;	/* to avoid repeated (de-)referencing */

	/*
	 * If given a strategy object, see whether it can select a buffer. We
	 * assume strategy objects don't need buffer_strategy_lock.
	 */
	if (strategy != NULL)
	{
		buf = GetBufferFromRing(strategy, buf_state);
		if (buf != NULL)
			return buf;
	}

	/*
	 * If asked, we need to wake the bgwriter. Since we don't want to rely on
	 * a spinlock for this we force a read from shared memory once, and then
	 * set the latch based on that value.  We have to go to this length
	 * because otherwise bgwprocno might be reset while/after we check, since
	 * the compiler might simply reread it from memory.
	 *
	 * This can possibly set the latch of the wrong process if the bgwriter
	 * dies at the wrong moment.  But since PGPROC->procLatch is never
	 * deallocated the worst consequence of that is that we set the latch of
	 * some arbitrary process.
	 */
	bgwprocno = INT_ACCESS_ONCE(StrategyControl->bgwprocno);
	if (bgwprocno != -1)
	{
		/* reset bgwprocno first, before setting the latch */
		StrategyControl->bgwprocno = -1;

		/*
		 * Not acquiring ProcArrayLock here which is slightly icky. It's
		 * actually fine because procLatch isn't ever freed, so we just can
		 * potentially set the wrong process' (or no process') latch.
		 */
		SetLatch(&ProcGlobal->allProcs[bgwprocno].procLatch);
	}

	/*
	 * We count buffer allocation requests so that the bgwriter can estimate
	 * the rate of buffer consumption.  Note that buffers recycled by a
	 * strategy object are intentionally not counted here.
	 */
	pg_atomic_fetch_add_u32(&StrategyControl->numBufferAllocs, 1);

	/*
	 * First check, without acquiring the lock, whether there are buffers in
	 * the freelist.  Since we otherwise don't require the spinlock in every
	 * StrategyGetBuffer() invocation, it'd be sad to acquire it here -
	 * uselessly in most cases.  That obviously leaves a race where a buffer
	 * is put on the freelist but we don't see the store yet - but that's
	 * pretty harmless, it'll just get used during the next buffer
	 * acquisition.
	 *
	 * If there are buffers on the freelist, acquire the spinlock to pop one
	 * buffer off the freelist.  Then check whether that buffer is usable and
	 * repeat if not.
	 *
	 * Note that the freeNext fields are considered to be protected by the
	 * buffer_strategy_lock, not the individual buffer spinlocks, so it's OK
	 * to manipulate them without holding the buffer spinlock.
	 */
	if (StrategyControl->firstFreeBuffer >= 0)
	{
		while (true)
		{
			/* Acquire the spinlock to remove element from the freelist */
			SpinLockAcquire(&StrategyControl->buffer_strategy_lock);

			if (StrategyControl->firstFreeBuffer < 0)
			{
				SpinLockRelease(&StrategyControl->buffer_strategy_lock);
				break;
			}

			buf = GetBufferDescriptor(StrategyControl->firstFreeBuffer);
			Assert(buf->freeNext != FREENEXT_NOT_IN_LIST);

			/* Unconditionally remove buffer from freelist */
			StrategyControl->firstFreeBuffer = buf->freeNext;
			buf->freeNext = FREENEXT_NOT_IN_LIST;

			/*
			 * Release the lock so someone else can access the freelist while
			 * we check out this buffer.
			 */
			SpinLockRelease(&StrategyControl->buffer_strategy_lock);

			/*
			 * If the buffer is pinned or has a nonzero usage_count, we cannot
			 * use it; discard it and retry.  (This can only happen if VACUUM
			 * put a valid buffer in the freelist and then someone else used
			 * it before we got to it.  It's probably impossible altogether as
			 * of 8.3, but we'd better check anyway.)
			 */
			local_buf_state = LockBufHdr(buf);
			if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0
				&& BUF_STATE_GET_USAGECOUNT(local_buf_state) == 0)
			{
				if (strategy != NULL)
					AddBufferToRing(strategy, buf);
				*buf_state = local_buf_state;
				return buf;
			}
			UnlockBufHdr(buf, local_buf_state);
		}
	}

	/* Nothing on the freelist, so run the "clock sweep" algorithm */
	trycounter = NBuffers;
	for (;;)
	{
		buf = GetBufferDescriptor(ClockSweepTick());

		/*
		 * If the buffer is pinned or has a nonzero usage_count, we cannot use
		 * it; decrement the usage_count (unless pinned) and keep scanning.
		 */
		local_buf_state = LockBufHdr(buf);

		if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0)
		{
			if (BUF_STATE_GET_USAGECOUNT(local_buf_state) != 0)
			{
				local_buf_state -= BUF_USAGECOUNT_ONE;

				trycounter = NBuffers;
			}
			else
			{
				/* Found a usable buffer */
				if (strategy != NULL)
					AddBufferToRing(strategy, buf);
				*buf_state = local_buf_state;
				return buf;
			}
		}
		else if (--trycounter == 0)
		{
			/*
			 * We've scanned all the buffers without making any state changes,
			 * so all the buffers are pinned (or were when we looked at them).
			 * We could hope that someone will free one eventually, but it's
			 * probably better to fail than to risk getting stuck in an
			 * infinite loop.
			 */
			UnlockBufHdr(buf, local_buf_state);
			ereport(ERROR,
					(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
					 errmsg("no unpinned buffers available (shared_buffers=%d)",
							NBuffers),
					 errhint("You might need to increase shared_buffers.")));
		}
		UnlockBufHdr(buf, local_buf_state);
	}
}
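
/*
 * Caller-side sketch (simplified, illustrative): BufferAlloc() receives the
 * buffer with its header spinlock still held, so it can take the pin before
 * any other backend can see the buffer as a victim.  Roughly:
 *
 *		uint32		buf_state;
 *		BufferDesc *buf = StrategyGetBuffer(strategy, &buf_state);
 *
 *		Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
 *		PinBuffer_Locked(buf);	-- pins and releases the header spinlock
 *
 * PinBuffer_Locked() is the bufmgr.c primitive used for this; the
 * surrounding eviction logic of BufferAlloc() is omitted here.
 */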

/*
 * StrategyFreeBuffer: put a buffer on the freelist
 */
void
StrategyFreeBuffer(BufferDesc *buf)
{
	SpinLockAcquire(&StrategyControl->buffer_strategy_lock);

	/*
	 * It is possible that we are told to put something in the freelist that
	 * is already in it; don't screw up the list if so.
	 */
	if (buf->freeNext == FREENEXT_NOT_IN_LIST)
	{
		buf->freeNext = StrategyControl->firstFreeBuffer;
		if (buf->freeNext < 0)
			StrategyControl->lastFreeBuffer = buf->buf_id;
		StrategyControl->firstFreeBuffer = buf->buf_id;
	}

	SpinLockRelease(&StrategyControl->buffer_strategy_lock);
}

/*
 * StrategySyncStart -- tell BufferSync where to start syncing
 *
 * The result is the buffer index of the best buffer to sync first.
 * BufferSync() will proceed circularly around the buffer array from there.
 *
 * In addition, we return the completed-pass count (which is effectively
 * the higher-order bits of nextVictimBuffer) and the count of recent buffer
 * allocs if non-NULL pointers are passed.  The alloc count is reset after
 * being read.
 */
int
StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
{
	uint32		nextVictimBuffer;
	int			result;

	SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
	nextVictimBuffer = pg_atomic_read_u32(&StrategyControl->nextVictimBuffer);
	result = nextVictimBuffer % NBuffers;

	if (complete_passes)
	{
		*complete_passes = StrategyControl->completePasses;

		/*
		 * Additionally add the number of wraparounds that happened before
		 * completePasses could be incremented.  Cf. ClockSweepTick().
		 */
		*complete_passes += nextVictimBuffer / NBuffers;
	}

	if (num_buf_alloc)
	{
		*num_buf_alloc = pg_atomic_exchange_u32(&StrategyControl->numBufferAllocs, 0);
	}
	SpinLockRelease(&StrategyControl->buffer_strategy_lock);
	return result;
}
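
/*
 * Usage sketch (illustrative): BgBufferSync() in bufmgr.c combines the two
 * outputs into one linear "strategy position" so progress across
 * wraparounds can be measured by subtraction:
 *
 *		uint32		strategy_passes;
 *		uint32		recent_alloc;
 *		int			strategy_buf_id;
 *
 *		strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
 *		-- logical position ~ strategy_passes * NBuffers + strategy_buf_id
 *
 * The variable names match bufmgr.c; the overflow-aware arithmetic used
 * there is elided.
 */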

/*
 * StrategyNotifyBgWriter -- set or clear allocation notification latch
 *
 * If bgwprocno isn't -1, the next invocation of StrategyGetBuffer will
 * set that latch.  Pass -1 to clear the pending notification before it
 * happens.  This feature is used by the bgwriter process to wake itself up
 * from hibernation, and is not meant for anybody else to use.
 */
void
StrategyNotifyBgWriter(int bgwprocno)
{
	/*
	 * We acquire buffer_strategy_lock just to ensure that the store appears
	 * atomic to StrategyGetBuffer.  The bgwriter should call this rather
	 * infrequently, so there's no performance penalty from being safe.
	 */
	SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
	StrategyControl->bgwprocno = bgwprocno;
	SpinLockRelease(&StrategyControl->buffer_strategy_lock);
}
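
/*
 * Usage sketch (illustrative, condensed from the bgwriter main loop):
 * register before hibernating, de-register after waking:
 *
 *		StrategyNotifyBgWriter(MyProc->pgprocno);
 *		(void) WaitLatch(MyLatch, WL_LATCH_SET | WL_TIMEOUT | ...,
 *						 hibernate_delay_ms, WAIT_EVENT_BGWRITER_HIBERNATE);
 *		StrategyNotifyBgWriter(-1);
 *
 * hibernate_delay_ms stands in for bgwriter.c's actual delay computation;
 * the flag list is abbreviated.
 */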


/*
 * StrategyShmemSize
 *
 * estimate the size of shared memory used by the freelist-related structures.
 *
 * Note: for somewhat historical reasons, the buffer lookup hashtable size
 * is also determined here.
 */
Size
StrategyShmemSize(void)
{
	Size		size = 0;

	/* size of lookup hash table ... see comment in StrategyInitialize */
	size = add_size(size, BufTableShmemSize(NBuffers + NUM_BUFFER_PARTITIONS));

	/* size of the shared replacement strategy control block */
	size = add_size(size, MAXALIGN(sizeof(BufferStrategyControl)));

	return size;
}
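
/*
 * Worked example (illustrative): with the default NUM_BUFFER_PARTITIONS of
 * 128 and shared_buffers of 16384 pages (128MB at the default 8kB BLCKSZ),
 * the lookup table is sized for 16384 + 128 = 16512 entries - headroom for
 * one in-flight insertion per partition, per the comment in
 * StrategyInitialize() below.
 */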

/*
 * StrategyInitialize -- initialize the buffer cache replacement
 *		strategy.
 *
 * Assumes: All of the buffers are already built into a linked list.
 *		Only called by postmaster and only during initialization.
 */
void
StrategyInitialize(bool init)
{
	bool		found;

	/*
	 * Initialize the shared buffer lookup hashtable.
	 *
	 * Since we can't tolerate running out of lookup table entries, we must be
	 * sure to specify an adequate table size here.  The maximum steady-state
	 * usage is of course NBuffers entries, but BufferAlloc() tries to insert
	 * a new entry before deleting the old.  In principle this could be
	 * happening in each partition concurrently, so we could need as many as
	 * NBuffers + NUM_BUFFER_PARTITIONS entries.
	 */
	InitBufTable(NBuffers + NUM_BUFFER_PARTITIONS);

	/*
	 * Get or create the shared strategy control block
	 */
	StrategyControl = (BufferStrategyControl *)
		ShmemInitStruct("Buffer Strategy Status",
						sizeof(BufferStrategyControl),
						&found);

	if (!found)
	{
		/*
		 * Only done once, usually in postmaster
		 */
		Assert(init);

		SpinLockInit(&StrategyControl->buffer_strategy_lock);

		/*
		 * Grab the whole linked list of free buffers for our strategy. We
		 * assume it was previously set up by InitBufferPool().
		 */
		StrategyControl->firstFreeBuffer = 0;
		StrategyControl->lastFreeBuffer = NBuffers - 1;

		/* Initialize the clock sweep pointer */
		pg_atomic_init_u32(&StrategyControl->nextVictimBuffer, 0);

		/* Clear statistics */
		StrategyControl->completePasses = 0;
		pg_atomic_init_u32(&StrategyControl->numBufferAllocs, 0);

		/* No pending notification */
		StrategyControl->bgwprocno = -1;
	}
	else
		Assert(!init);
}


/* ----------------------------------------------------------------
 *				Backend-private buffer ring management
 * ----------------------------------------------------------------
 */


/*
 * GetAccessStrategy -- create a BufferAccessStrategy object
 *
 * The object is allocated in the current memory context.
 */
BufferAccessStrategy
GetAccessStrategy(BufferAccessStrategyType btype)
{
	BufferAccessStrategy strategy;
	int			ring_size;

	/*
	 * Select ring size to use.  See buffer/README for rationales.
	 *
	 * Note: if you change the ring size for BAS_BULKREAD, see also
	 * SYNC_SCAN_REPORT_INTERVAL in access/heap/syncscan.c.
	 */
	switch (btype)
	{
		case BAS_NORMAL:
			/* if someone asks for NORMAL, just give 'em a "default" object */
			return NULL;

		case BAS_BULKREAD:
			ring_size = 256 * 1024 / BLCKSZ;
			break;
		case BAS_BULKWRITE:
			ring_size = 16 * 1024 * 1024 / BLCKSZ;
			break;
		case BAS_VACUUM:
			ring_size = 256 * 1024 / BLCKSZ;
			break;

		default:
			elog(ERROR, "unrecognized buffer access strategy: %d",
				 (int) btype);
			return NULL;		/* keep compiler quiet */
	}
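
	/*
	 * Worked example (illustrative): at the default BLCKSZ of 8192, the
	 * cases above yield 256kB / 8kB = 32 buffers for BAS_BULKREAD and
	 * BAS_VACUUM, and 16MB / 8kB = 2048 buffers for BAS_BULKWRITE, before
	 * the NBuffers/8 clamp below is applied.
	 */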

	/* Make sure ring isn't an undue fraction of shared buffers */
	ring_size = Min(NBuffers / 8, ring_size);

	/* Allocate the object and initialize all elements to zeroes */
	strategy = (BufferAccessStrategy)
		palloc0(offsetof(BufferAccessStrategyData, buffers) +
				ring_size * sizeof(Buffer));

	/* Set fields that don't start out zero */
	strategy->btype = btype;
	strategy->ring_size = ring_size;

	return strategy;
}
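
/*
 * Usage sketch (illustrative): a bulk-writing caller creates a ring once,
 * passes it to each buffer access, and frees it when finished:
 *
 *		BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKWRITE);
 *		Buffer		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
 *											 RBM_NORMAL, bstrategy);
 *		...
 *		FreeAccessStrategy(bstrategy);
 *
 * rel and blkno are placeholders.  Note that GetAccessStrategy(BAS_NORMAL)
 * returns NULL, which strategy-aware code treats as "use default".
 */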

/*
 * FreeAccessStrategy -- release a BufferAccessStrategy object
 *
 * A simple pfree would do at the moment, but we would prefer that callers
 * don't assume that much about the representation of BufferAccessStrategy.
 */
void
FreeAccessStrategy(BufferAccessStrategy strategy)
{
	/* don't crash if called on a "default" strategy */
	if (strategy != NULL)
		pfree(strategy);
}

/*
 * GetBufferFromRing -- returns a buffer from the ring, or NULL if the
 *		ring is empty.
 *
 * The bufhdr spin lock is held on the returned buffer.
 */
static BufferDesc *
GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state)
{
	BufferDesc *buf;
	Buffer		bufnum;
	uint32		local_buf_state;	/* to avoid repeated (de-)referencing */

	/* Advance to next ring slot */
	if (++strategy->current >= strategy->ring_size)
		strategy->current = 0;

	/*
	 * If the slot hasn't been filled yet, tell the caller to allocate a new
	 * buffer with the normal allocation strategy.  The caller will then fill
	 * this slot by calling AddBufferToRing with the new buffer.
	 */
	bufnum = strategy->buffers[strategy->current];
	if (bufnum == InvalidBuffer)
	{
		strategy->current_was_in_ring = false;
		return NULL;
	}

	/*
	 * If the buffer is pinned we cannot use it under any circumstances.
	 *
	 * If usage_count is 0 or 1 then the buffer is fair game (we expect 1,
	 * since our own previous usage of the ring element would have left it
	 * there, but it might've been decremented by clock sweep since then). A
	 * higher usage_count indicates someone else has touched the buffer, so we
	 * shouldn't re-use it.
	 */
	buf = GetBufferDescriptor(bufnum - 1);
	local_buf_state = LockBufHdr(buf);
	if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0
		&& BUF_STATE_GET_USAGECOUNT(local_buf_state) <= 1)
	{
		strategy->current_was_in_ring = true;
		*buf_state = local_buf_state;
		return buf;
	}
	UnlockBufHdr(buf, local_buf_state);

	/*
	 * Tell caller to allocate a new buffer with the normal allocation
	 * strategy.  The caller will then replace this ring element via
	 * AddBufferToRing.
	 */
	strategy->current_was_in_ring = false;
	return NULL;
}

/*
 * AddBufferToRing -- add a buffer to the buffer ring
 *
 * Caller must hold the buffer header spinlock on the buffer.  Since this
 * is called with the spinlock held, it had better be quite cheap.
 */
static void
AddBufferToRing(BufferAccessStrategy strategy, BufferDesc *buf)
{
	strategy->buffers[strategy->current] = BufferDescriptorGetBuffer(buf);
}

/*
 * StrategyRejectBuffer -- consider rejecting a dirty buffer
 *
 * When a nondefault strategy is used, the buffer manager calls this function
 * when it turns out that the buffer selected by StrategyGetBuffer needs to
 * be written out and doing so would require flushing WAL too.  This gives us
 * a chance to choose a different victim.
 *
 * Returns true if buffer manager should ask for a new victim, and false
 * if this buffer should be written and re-used.
 */
bool
StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf)
{
	/* We only do this in bulkread mode */
	if (strategy->btype != BAS_BULKREAD)
		return false;

	/* Don't muck with behavior of normal buffer-replacement strategy */
	if (!strategy->current_was_in_ring ||
		strategy->buffers[strategy->current] != BufferDescriptorGetBuffer(buf))
		return false;

	/*
	 * Remove the dirty buffer from the ring; necessary to prevent infinite
	 * loop if all ring members are dirty.
	 */
	strategy->buffers[strategy->current] = InvalidBuffer;

	return true;
}
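
/*
 * Caller-side sketch (illustrative): in bufmgr.c's BufferAlloc(), a
 * rejection sends the search back to StrategyGetBuffer() for a new victim,
 * roughly:
 *
 *		if (strategy != NULL &&
 *			XLogNeedsFlush(lsn) &&
 *			StrategyRejectBuffer(strategy, buf))
 *		{
 *			-- drop the content lock and pin, then retry the outer loop
 *			UnpinBuffer(buf, true);
 *			continue;
 *		}
 *
 * XLogNeedsFlush() and UnpinBuffer() are the real routines involved; the
 * lock and LSN bookkeeping around them is condensed here.
 */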