| /*------------------------------------------------------------------------- |
| * |
| * buf_internals.h |
| * Internal definitions for buffer manager and the buffer replacement |
| * strategy. |
| * |
| * |
| * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group |
| * Portions Copyright (c) 1994, Regents of the University of California |
| * |
| * src/include/storage/buf_internals.h |
| * |
| *------------------------------------------------------------------------- |
| */ |
| #ifndef BUFMGR_INTERNALS_H |
| #define BUFMGR_INTERNALS_H |
| |
| #include "pgstat.h" |
| #include "port/atomics.h" |
| #include "storage/buf.h" |
| #include "storage/bufmgr.h" |
| #include "storage/condition_variable.h" |
| #include "storage/latch.h" |
| #include "storage/lwlock.h" |
| #include "storage/shmem.h" |
| #include "storage/smgr.h" |
| #include "storage/spin.h" |
| #include "utils/relcache.h" |
| |
| /* |
| * Buffer state is a single 32-bit variable where following data is combined. |
| * |
| * - 17 bits refcount in GPDB (18 bits in Postgres. We have one extra flag, BM_TEMP) |
| * - 4 bits usage count |
| * - 11 bits of flags (10 in Postgres, plus BM_TEMP in GPDB) |
| * |
| * Combining these values allows to perform some operations without locking |
| * the buffer header, by modifying them together with a CAS loop. |
| * |
| * The definition of buffer state components is below. |
| */ |
/* Refcount: the low 17 bits of the state word. */
#define BUF_REFCOUNT_ONE 1
#define BUF_REFCOUNT_MASK 0x0001FFFFU

/* Usage count: the 4 bits directly above the refcount. */
#define BUF_USAGECOUNT_SHIFT 17
#define BUF_USAGECOUNT_ONE (1U << BUF_USAGECOUNT_SHIFT)
#define BUF_USAGECOUNT_MASK (0xFU << BUF_USAGECOUNT_SHIFT)

/* Flags: every remaining bit above the usage count. */
#define BUF_FLAG_MASK 0xFFE00000U

/* Extract the refcount / usagecount component from a buffer state value */
#define BUF_STATE_GET_REFCOUNT(state) \
	((state) & BUF_REFCOUNT_MASK)
#define BUF_STATE_GET_USAGECOUNT(state) \
	(((state) & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT)
| |
| /* |
| * Flags for buffer descriptors |
| * |
| * Note: BM_TAG_VALID essentially means that there is a buffer hashtable |
| * entry associated with the buffer's tag. |
| */ |
/* Listed in ascending bit order; together they fill bits 21..31. */
#define BM_TEMP (1U << 21)				/* GPDB: temporary relation */
#define BM_LOCKED (1U << 22)			/* buffer header is locked */
#define BM_DIRTY (1U << 23)				/* data needs writing */
#define BM_VALID (1U << 24)				/* data is valid */
#define BM_TAG_VALID (1U << 25)			/* tag is assigned */
#define BM_IO_IN_PROGRESS (1U << 26)	/* read or write in progress */
#define BM_IO_ERROR (1U << 27)			/* previous I/O failed */
#define BM_JUST_DIRTIED (1U << 28)		/* dirtied since write started */
#define BM_PIN_COUNT_WAITER (1U << 29)	/* have waiter for sole pin */
#define BM_CHECKPOINT_NEEDED (1U << 30) /* must write for checkpoint */
#define BM_PERMANENT (1U << 31)			/* permanent buffer (not unlogged,
										 * or init fork) */
| |
| /* |
| * The maximum allowed value of usage_count represents a tradeoff between |
| * accuracy and speed of the clock-sweep buffer management algorithm. A |
| * large value (comparable to NBuffers) would approximate LRU semantics. |
| * But it can take as many as BM_MAX_USAGE_COUNT+1 complete cycles of |
| * clock sweeps to find a free buffer, so in practice we don't want the |
| * value to be very large. |
| */ |
#define BM_MAX_USAGE_COUNT 5	/* largest usage_count a buffer may reach */
| |
| /* |
| * Buffer tag identifies which disk block the buffer contains. |
| * |
| * Note: the BufferTag data must be sufficient to determine where to write the |
| * block, without reference to pg_class or pg_tablespace entries. It's |
| * possible that the backend flushing the buffer doesn't even believe the |
| * relation is visible yet (its xact may have started before the xact that |
| * created the rel). The storage manager must be able to cope anyway. |
| * |
| * Note: if there are any pad bytes in the struct, InitBufferTag will have |
| * to be fixed to zero them, since this struct is used as a hash key. |
| */ |
| typedef struct buftag |
| { |
| Oid spcOid; /* tablespace oid */ |
| Oid dbOid; /* database oid */ |
| RelFileNumber relNumber; /* relation file number */ |
| ForkNumber forkNum; /* fork number */ |
| BlockNumber blockNum; /* blknum relative to begin of reln */ |
| } BufferTag; |
| |
| static inline RelFileNumber |
| BufTagGetRelNumber(const BufferTag *tag) |
| { |
| return tag->relNumber; |
| } |
| |
| static inline ForkNumber |
| BufTagGetForkNum(const BufferTag *tag) |
| { |
| return tag->forkNum; |
| } |
| |
| static inline void |
| BufTagSetRelForkDetails(BufferTag *tag, RelFileNumber relnumber, |
| ForkNumber forknum) |
| { |
| tag->relNumber = relnumber; |
| tag->forkNum = forknum; |
| } |
| |
| static inline RelFileLocator |
| BufTagGetRelFileLocator(const BufferTag *tag) |
| { |
| RelFileLocator rlocator; |
| |
| rlocator.spcOid = tag->spcOid; |
| rlocator.dbOid = tag->dbOid; |
| rlocator.relNumber = BufTagGetRelNumber(tag); |
| |
| return rlocator; |
| } |
| |
| static inline void |
| ClearBufferTag(BufferTag *tag) |
| { |
| tag->spcOid = InvalidOid; |
| tag->dbOid = InvalidOid; |
| BufTagSetRelForkDetails(tag, InvalidRelFileNumber, InvalidForkNumber); |
| tag->blockNum = InvalidBlockNumber; |
| } |
| |
| static inline void |
| InitBufferTag(BufferTag *tag, const RelFileLocator *rlocator, |
| ForkNumber forkNum, BlockNumber blockNum) |
| { |
| tag->spcOid = rlocator->spcOid; |
| tag->dbOid = rlocator->dbOid; |
| BufTagSetRelForkDetails(tag, rlocator->relNumber, forkNum); |
| tag->blockNum = blockNum; |
| } |
| |
| static inline bool |
| BufferTagsEqual(const BufferTag *tag1, const BufferTag *tag2) |
| { |
| return (tag1->spcOid == tag2->spcOid) && |
| (tag1->dbOid == tag2->dbOid) && |
| (tag1->relNumber == tag2->relNumber) && |
| (tag1->blockNum == tag2->blockNum) && |
| (tag1->forkNum == tag2->forkNum); |
| } |
| |
| static inline bool |
| BufTagMatchesRelFileLocator(const BufferTag *tag, |
| const RelFileLocator *rlocator) |
| { |
| return (tag->spcOid == rlocator->spcOid) && |
| (tag->dbOid == rlocator->dbOid) && |
| (BufTagGetRelNumber(tag) == rlocator->relNumber); |
| } |
| |
| |
| /* |
| * The shared buffer mapping table is partitioned to reduce contention. |
| * To determine which partition lock a given tag requires, compute the tag's |
| * hash code with BufTableHashCode(), then apply BufMappingPartitionLock(). |
| * NB: NUM_BUFFER_PARTITIONS must be a power of 2! |
| */ |
| static inline uint32 |
| BufTableHashPartition(uint32 hashcode) |
| { |
| return hashcode % NUM_BUFFER_PARTITIONS; |
| } |
| |
| static inline LWLock * |
| BufMappingPartitionLock(uint32 hashcode) |
| { |
| return &MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET + |
| BufTableHashPartition(hashcode)].lock; |
| } |
| |
| static inline LWLock * |
| BufMappingPartitionLockByIndex(uint32 index) |
| { |
| return &MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET + index].lock; |
| } |
| |
| /* |
| * BufferDesc -- shared descriptor/state data for a single shared buffer. |
| * |
| * Note: Buffer header lock (BM_LOCKED flag) must be held to examine or change |
| * tag, state or wait_backend_pgprocno fields. In general, buffer header lock |
| * is a spinlock which is combined with flags, refcount and usagecount into |
| * single atomic variable. This layout allows us to do some operations in a |
| * single atomic operation, without actually acquiring and releasing spinlock; |
| * for instance, increase or decrease refcount. buf_id field never changes |
| * after initialization, so does not need locking. freeNext is protected by |
| * the buffer_strategy_lock not buffer header lock. The LWLock can take care |
| * of itself. The buffer header lock is *not* used to control access to the |
| * data in the buffer! |
| * |
| * It's assumed that nobody changes the state field while buffer header lock |
| * is held. Thus buffer header lock holder can do complex updates of the |
| * state variable in single write, simultaneously with lock release (cleaning |
| * BM_LOCKED flag). On the other hand, updating of state without holding |
| * buffer header lock is restricted to CAS, which ensures that BM_LOCKED flag |
| * is not set. Atomic increment/decrement, OR/AND etc. are not allowed. |
| * |
| * An exception is that if we have the buffer pinned, its tag can't change |
| * underneath us, so we can examine the tag without locking the buffer header. |
| * Also, in places we do one-time reads of the flags without bothering to |
| * lock the buffer header; this is generally for situations where we don't |
| * expect the flag bit being tested to be changing. |
| * |
| * We can't physically remove items from a disk page if another backend has |
| * the buffer pinned. Hence, a backend may need to wait for all other pins |
| * to go away. This is signaled by storing its own pgprocno into |
| * wait_backend_pgprocno and setting flag bit BM_PIN_COUNT_WAITER. At present, |
| * there can be only one such waiter per buffer. |
| * |
| * We use this same struct for local buffer headers, but the locks are not |
| * used and not all of the flag bits are useful either. To avoid unnecessary |
| * overhead, manipulations of the state field should be done without actual |
| * atomic operations (i.e. only pg_atomic_read_u32() and |
| * pg_atomic_unlocked_write_u32()). |
| * |
| * Be careful to avoid increasing the size of the struct when adding or |
| * reordering members. Keeping it below 64 bytes (the most common CPU |
| * cache line size) is fairly important for performance. |
| * |
| * Per-buffer I/O condition variables are currently kept outside this struct in |
| * a separate array. They could be moved in here and still fit within that |
| * limit on common systems, but for now that is not done. |
| */ |
| typedef struct BufferDesc |
| { |
| BufferTag tag; /* ID of page contained in buffer */ |
| int buf_id; /* buffer's index number (from 0) */ |
| |
| /* state of the tag, containing flags, refcount and usagecount */ |
| pg_atomic_uint32 state; |
| |
| int wait_backend_pgprocno; /* backend of pin-count waiter */ |
| int freeNext; /* link in freelist chain */ |
| LWLock content_lock; /* to lock access to buffer contents */ |
| } BufferDesc; |
| |
| /* |
| * Concurrent access to buffer headers has proven to be more efficient if |
| * they're cache line aligned. So we force the start of the BufferDescriptors |
| * array to be on a cache line boundary and force the elements to be cache |
| * line sized. |
| * |
| * XXX: As this primarily matters in highly concurrent workloads which |
| * probably all are 64bit these days, and the space wastage would be a bit |
| * more noticeable on 32bit systems, we don't force the stride to be cache |
| * line sized on those. If somebody does actual performance testing, we can |
| * reevaluate. |
| * |
| * Note that local buffer descriptors aren't forced to be aligned - as there's |
| * no concurrent access to those it's unlikely to be beneficial. |
| * |
| * We use a 64-byte cache line size here, because that's the most common |
| * size. Making it bigger would be a waste of memory. Even if running on a |
| * platform with either 32 or 128 byte line sizes, it's good to align to |
| * boundaries and avoid false sharing. |
| */ |
| #define BUFFERDESC_PAD_TO_SIZE (SIZEOF_VOID_P == 8 ? 64 : 1) |
| |
| typedef union BufferDescPadded |
| { |
| BufferDesc bufferdesc; |
| char pad[BUFFERDESC_PAD_TO_SIZE]; |
| } BufferDescPadded; |
| |
| /* |
| * The PendingWriteback & WritebackContext structures are used to keep |
| * information about pending flush requests to be issued to the OS. |
| */ |
| typedef struct PendingWriteback |
| { |
| /* could store different types of pending flushes here */ |
| BufferTag tag; |
| } PendingWriteback; |
| |
| /* struct forward declared in bufmgr.h */ |
| typedef struct WritebackContext |
| { |
| /* pointer to the max number of writeback requests to coalesce */ |
| int *max_pending; |
| |
| /* current number of pending writeback requests */ |
| int nr_pending; |
| |
| /* pending requests */ |
| PendingWriteback pending_writebacks[WRITEBACK_MAX_PENDING_FLUSHES]; |
| } WritebackContext; |
| |
| /* in buf_init.c */ |
| extern PGDLLIMPORT BufferDescPadded *BufferDescriptors; |
| extern PGDLLIMPORT ConditionVariableMinimallyPadded *BufferIOCVArray; |
| extern PGDLLIMPORT WritebackContext BackendWritebackContext; |
| |
| /* in localbuf.c */ |
| extern PGDLLIMPORT BufferDesc *LocalBufferDescriptors; |
| |
| |
| static inline BufferDesc * |
| GetBufferDescriptor(uint32 id) |
| { |
| return &(BufferDescriptors[id]).bufferdesc; |
| } |
| |
| static inline BufferDesc * |
| GetLocalBufferDescriptor(uint32 id) |
| { |
| return &LocalBufferDescriptors[id]; |
| } |
| |
| static inline Buffer |
| BufferDescriptorGetBuffer(const BufferDesc *bdesc) |
| { |
| return (Buffer) (bdesc->buf_id + 1); |
| } |
| |
| static inline ConditionVariable * |
| BufferDescriptorGetIOCV(const BufferDesc *bdesc) |
| { |
| return &(BufferIOCVArray[bdesc->buf_id]).cv; |
| } |
| |
| static inline LWLock * |
| BufferDescriptorGetContentLock(const BufferDesc *bdesc) |
| { |
| return (LWLock *) (&bdesc->content_lock); |
| } |
| |
| /* |
| * The freeNext field is either the index of the next freelist entry, |
| * or one of these special values: |
| */ |
#define FREENEXT_END_OF_LIST (-1)	/* no further freelist entry follows */
#define FREENEXT_NOT_IN_LIST (-2)	/* buffer is not on the freelist */
| |
| /* |
| * Functions for acquiring/releasing a shared buffer header's spinlock. Do |
| * not apply these to local buffers! |
| */ |
| extern uint32 LockBufHdr(BufferDesc *desc); |
| |
| static inline void |
| UnlockBufHdr(BufferDesc *desc, uint32 buf_state) |
| { |
| pg_write_barrier(); |
| pg_atomic_write_u32(&desc->state, buf_state & (~BM_LOCKED)); |
| } |
| |
| /* in bufmgr.c */ |
| |
| /* |
| * Structure to sort buffers per file on checkpoints. |
| * |
| * This structure is allocated per buffer in shared memory, so it should be |
| * kept as small as possible. |
| */ |
| typedef struct CkptSortItem |
| { |
| Oid tsId; |
| RelFileNumber relNumber; |
| ForkNumber forkNum; |
| BlockNumber blockNum; |
| int buf_id; |
| } CkptSortItem; |
| |
| extern PGDLLIMPORT CkptSortItem *CkptBufferIds; |
| |
| /* |
| * Internal buffer management routines |
| */ |
| /* bufmgr.c */ |
| extern void WritebackContextInit(WritebackContext *context, int *max_pending); |
| extern void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context); |
| extern void ScheduleBufferTagForWriteback(WritebackContext *wb_context, |
| IOContext io_context, BufferTag *tag); |
| |
| /* freelist.c */ |
| extern IOContext IOContextForStrategy(BufferAccessStrategy strategy); |
| extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy, |
| uint32 *buf_state, bool *from_ring); |
| extern void StrategyFreeBuffer(BufferDesc *buf); |
| extern bool StrategyRejectBuffer(BufferAccessStrategy strategy, |
| BufferDesc *buf, bool from_ring); |
| |
| extern int StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc); |
| extern void StrategyNotifyBgWriter(int bgwprocno); |
| |
| extern Size StrategyShmemSize(void); |
| extern void StrategyInitialize(bool init); |
| extern bool have_free_buffer(void); |
| |
| /* buf_table.c */ |
| extern Size BufTableShmemSize(int size); |
| extern void InitBufTable(int size); |
| extern uint32 BufTableHashCode(BufferTag *tagPtr); |
| extern int BufTableLookup(BufferTag *tagPtr, uint32 hashcode); |
| extern int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id); |
| extern void BufTableDelete(BufferTag *tagPtr, uint32 hashcode); |
| |
| /* localbuf.c */ |
| extern bool PinLocalBuffer(BufferDesc *buf_hdr, bool adjust_usagecount); |
| extern void UnpinLocalBuffer(Buffer buffer); |
| extern PrefetchBufferResult PrefetchLocalBuffer(SMgrRelation smgr, |
| ForkNumber forkNum, |
| BlockNumber blockNum); |
| extern BufferDesc *LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, |
| BlockNumber blockNum, bool *foundPtr, Buffer non_evited_buffer); |
| extern BlockNumber ExtendBufferedRelLocal(BufferManagerRelation bmr, |
| ForkNumber fork, |
| uint32 flags, |
| uint32 extend_by, |
| BlockNumber extend_upto, |
| Buffer *buffers, |
| uint32 *extended_by); |
| extern void MarkLocalBufferDirty(Buffer buffer); |
| extern void DropRelationLocalBuffers(RelFileLocator rlocator, |
| ForkNumber forkNum, |
| BlockNumber firstDelBlock); |
| extern void DropRelationAllLocalBuffers(RelFileLocator rlocator); |
| extern void AtEOXact_LocalBuffers(bool isCommit); |
| |
| #endif /* BUFMGR_INTERNALS_H */ |