| /*------------------------------------------------------------------------- |
| * |
| * freespace.c |
| * POSTGRES free space map for quickly finding free space in relations |
| * |
| * |
| * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group |
| * Portions Copyright (c) 1994, Regents of the University of California |
| * |
| * IDENTIFICATION |
| * src/backend/storage/freespace/freespace.c |
| * |
| * |
| * NOTES: |
| * |
| * Free Space Map keeps track of the amount of free space on pages, and |
| * allows quickly searching for a page with enough free space. The FSM is |
| * stored in a dedicated relation fork of all heap relations, and those |
| * index access methods that need it (see also indexfsm.c). See README for |
| * more information. |
| * |
| *------------------------------------------------------------------------- |
| */ |
| #include "postgres.h" |
| |
| #include "access/heapam_xlog.h" |
| #include "access/htup_details.h" |
| #include "access/xlog.h" |
| #include "access/xlogutils.h" |
| #include "miscadmin.h" |
| #include "storage/freespace.h" |
| #include "storage/fsm_internals.h" |
| #include "storage/lmgr.h" |
| #include "storage/smgr.h" |
| |
| |
| /* |
| * We use just one byte to store the amount of free space on a page, so we |
| * divide the amount of free space a page can have into 256 different |
| * categories. The highest category, 255, represents a page with at least |
| * MaxFSMRequestSize bytes of free space, and the second highest category |
| * represents the range from 254 * FSM_CAT_STEP, inclusive, to |
| * MaxFSMRequestSize, exclusive. |
| * |
| * MaxFSMRequestSize depends on the architecture and BLCKSZ, but assuming |
| * default 8k BLCKSZ, and that MaxFSMRequestSize is 8164 bytes, the |
| * categories look like this: |
| * |
| * |
| * Range Category |
| * 0 - 31 0 |
| * 32 - 63 1 |
| * ... ... ... |
| * 8096 - 8127 253 |
| * 8128 - 8163 254 |
| * 8164 - 8192 255 |
| * |
| * The reason that MaxFSMRequestSize is special is that if MaxFSMRequestSize |
| * isn't equal to a range boundary, a page with exactly MaxFSMRequestSize |
| * bytes of free space wouldn't satisfy a request for MaxFSMRequestSize |
| * bytes. If there isn't more than MaxFSMRequestSize bytes of free space on a |
| * completely empty page, that would mean that we could never satisfy a |
| * request of exactly MaxFSMRequestSize bytes. |
| */ |
| #define FSM_CATEGORIES 256 |
| #define FSM_CAT_STEP (BLCKSZ / FSM_CATEGORIES) |
| #define MaxFSMRequestSize MaxHeapTupleSize |
| |
| /* |
| * Depth of the on-disk tree. We need to be able to address 2^32-1 blocks, |
| * and 1626 is the smallest number that satisfies X^3 >= 2^32-1. Likewise, |
| * 216 is the smallest number that satisfies X^4 >= 2^32-1. In practice, |
| * this means that 4096 bytes is the smallest BLCKSZ that we can get away |
| * with a 3-level tree, and 512 is the smallest we support. |
| */ |
| #define FSM_TREE_DEPTH ((SlotsPerFSMPage >= 1626) ? 3 : 4) |
| |
| #define FSM_ROOT_LEVEL (FSM_TREE_DEPTH - 1) |
| #define FSM_BOTTOM_LEVEL 0 |
| |
| /* |
| * The internal FSM routines work on a logical addressing scheme. Each |
| * level of the tree can be thought of as a separately addressable file. |
| */ |
| typedef struct |
| { |
| int level; /* level */ |
| int logpageno; /* page number within the level */ |
| } FSMAddress; |
| |
| /* Address of the root page. */ |
| static const FSMAddress FSM_ROOT_ADDRESS = {FSM_ROOT_LEVEL, 0}; |
| |
| /* functions to navigate the tree */ |
| static FSMAddress fsm_get_child(FSMAddress parent, uint16 slot); |
| static FSMAddress fsm_get_parent(FSMAddress child, uint16 *slot); |
| static FSMAddress fsm_get_location(BlockNumber heapblk, uint16 *slot); |
| static BlockNumber fsm_get_heap_blk(FSMAddress addr, uint16 slot); |
| static BlockNumber fsm_logical_to_physical(FSMAddress addr); |
| |
| static Buffer fsm_readbuf(Relation rel, FSMAddress addr, bool extend); |
| static void fsm_extend(Relation rel, BlockNumber fsm_nblocks); |
| |
| /* functions to convert amount of free space to a FSM category */ |
| static uint8 fsm_space_avail_to_cat(Size avail); |
| static uint8 fsm_space_needed_to_cat(Size needed); |
| static Size fsm_space_cat_to_avail(uint8 cat); |
| |
| /* workhorse functions for various operations */ |
| static int fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot, |
| uint8 newValue, uint8 minValue); |
| static BlockNumber fsm_search(Relation rel, uint8 min_cat); |
| static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr, |
| BlockNumber start, BlockNumber end, |
| bool *eof); |
| |
| |
| /******** Public API ********/ |
| |
| /* |
| * GetPageWithFreeSpace - try to find a page in the given relation with |
| * at least the specified amount of free space. |
| * |
| * If successful, return the block number; if not, return InvalidBlockNumber. |
| * |
| * The caller must be prepared for the possibility that the returned page |
| * will turn out to have too little space available by the time the caller |
| * gets a lock on it. In that case, the caller should report the actual |
| * amount of free space available on that page and then try again (see |
| * RecordAndGetPageWithFreeSpace). If InvalidBlockNumber is returned, |
| * extend the relation. |
| */ |
| BlockNumber |
| GetPageWithFreeSpace(Relation rel, Size spaceNeeded) |
| { |
| uint8 min_cat = fsm_space_needed_to_cat(spaceNeeded); |
| |
| return fsm_search(rel, min_cat); |
| } |
| |
| /* |
| * RecordAndGetPageWithFreeSpace - update info about a page and try again. |
| * |
| * We provide this combo form to save some locking overhead, compared to |
| * separate RecordPageWithFreeSpace + GetPageWithFreeSpace calls. There's |
| * also some effort to return a page close to the old page; if there's a |
| * page with enough free space on the same FSM page where the old one page |
| * is located, it is preferred. |
| */ |
| BlockNumber |
| RecordAndGetPageWithFreeSpace(Relation rel, BlockNumber oldPage, |
| Size oldSpaceAvail, Size spaceNeeded) |
| { |
| int old_cat = fsm_space_avail_to_cat(oldSpaceAvail); |
| int search_cat = fsm_space_needed_to_cat(spaceNeeded); |
| FSMAddress addr; |
| uint16 slot; |
| int search_slot; |
| |
| /* Get the location of the FSM byte representing the heap block */ |
| addr = fsm_get_location(oldPage, &slot); |
| |
| search_slot = fsm_set_and_search(rel, addr, slot, old_cat, search_cat); |
| |
| /* |
| * If fsm_set_and_search found a suitable new block, return that. |
| * Otherwise, search as usual. |
| */ |
| if (search_slot != -1) |
| return fsm_get_heap_blk(addr, search_slot); |
| else |
| return fsm_search(rel, search_cat); |
| } |
| |
| /* |
| * RecordPageWithFreeSpace - update info about a page. |
| * |
| * Note that if the new spaceAvail value is higher than the old value stored |
| * in the FSM, the space might not become visible to searchers until the next |
| * FreeSpaceMapVacuum call, which updates the upper level pages. |
| */ |
| void |
| RecordPageWithFreeSpace(Relation rel, BlockNumber heapBlk, Size spaceAvail) |
| { |
| int new_cat = fsm_space_avail_to_cat(spaceAvail); |
| FSMAddress addr; |
| uint16 slot; |
| |
| /* Get the location of the FSM byte representing the heap block */ |
| addr = fsm_get_location(heapBlk, &slot); |
| |
| fsm_set_and_search(rel, addr, slot, new_cat, 0); |
| } |
| |
| /* |
| * XLogRecordPageWithFreeSpace - like RecordPageWithFreeSpace, for use in |
| * WAL replay |
| */ |
| void |
| XLogRecordPageWithFreeSpace(RelFileNode rnode, BlockNumber heapBlk, |
| Size spaceAvail) |
| { |
| int new_cat = fsm_space_avail_to_cat(spaceAvail); |
| FSMAddress addr; |
| uint16 slot; |
| BlockNumber blkno; |
| Buffer buf; |
| Page page; |
| |
| /* Get the location of the FSM byte representing the heap block */ |
| addr = fsm_get_location(heapBlk, &slot); |
| blkno = fsm_logical_to_physical(addr); |
| |
| /* If the page doesn't exist already, extend */ |
| buf = XLogReadBufferExtended(rnode, FSM_FORKNUM, blkno, RBM_ZERO_ON_ERROR); |
| LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); |
| |
| page = BufferGetPage(buf); |
| if (PageIsNew(page)) |
| PageInit(page, BLCKSZ, 0); |
| |
| if (fsm_set_avail(page, slot, new_cat)) |
| MarkBufferDirtyHint(buf, false); |
| UnlockReleaseBuffer(buf); |
| } |
| |
| /* |
| * GetRecordedFreeSpace - return the amount of free space on a particular page, |
| * according to the FSM. |
| */ |
| Size |
| GetRecordedFreeSpace(Relation rel, BlockNumber heapBlk) |
| { |
| FSMAddress addr; |
| uint16 slot; |
| Buffer buf; |
| uint8 cat; |
| |
| /* Get the location of the FSM byte representing the heap block */ |
| addr = fsm_get_location(heapBlk, &slot); |
| |
| buf = fsm_readbuf(rel, addr, false); |
| if (!BufferIsValid(buf)) |
| return 0; |
| cat = fsm_get_avail(BufferGetPage(buf), slot); |
| ReleaseBuffer(buf); |
| |
| return fsm_space_cat_to_avail(cat); |
| } |
| |
| /* |
| * FreeSpaceMapPrepareTruncateRel - prepare for truncation of a relation. |
| * |
| * nblocks is the new size of the heap. |
| * |
| * Return the number of blocks of new FSM. |
| * If it's InvalidBlockNumber, there is nothing to truncate; |
| * otherwise the caller is responsible for calling smgrtruncate() |
| * to truncate the FSM pages, and FreeSpaceMapVacuumRange() |
| * to update upper-level pages in the FSM. |
| */ |
| BlockNumber |
| FreeSpaceMapPrepareTruncateRel(Relation rel, BlockNumber nblocks) |
| { |
| BlockNumber new_nfsmblocks; |
| FSMAddress first_removed_address; |
| uint16 first_removed_slot; |
| Buffer buf; |
| |
| RelationOpenSmgr(rel); |
| |
| /* |
| * If no FSM has been created yet for this relation, there's nothing to |
| * truncate. |
| */ |
| if (!smgrexists(rel->rd_smgr, FSM_FORKNUM)) |
| return InvalidBlockNumber; |
| |
| /* Get the location in the FSM of the first removed heap block */ |
| first_removed_address = fsm_get_location(nblocks, &first_removed_slot); |
| |
| /* |
| * Zero out the tail of the last remaining FSM page. If the slot |
| * representing the first removed heap block is at a page boundary, as the |
| * first slot on the FSM page that first_removed_address points to, we can |
| * just truncate that page altogether. |
| */ |
| if (first_removed_slot > 0) |
| { |
| buf = fsm_readbuf(rel, first_removed_address, false); |
| if (!BufferIsValid(buf)) |
| return InvalidBlockNumber; /* nothing to do; the FSM was already |
| * smaller */ |
| LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); |
| |
| /* NO EREPORT(ERROR) from here till changes are logged */ |
| START_CRIT_SECTION(); |
| |
| fsm_truncate_avail(BufferGetPage(buf), first_removed_slot); |
| |
| /* |
| * Truncation of a relation is WAL-logged at a higher-level, and we |
| * will be called at WAL replay. But if checksums are enabled, we need |
| * to still write a WAL record to protect against a torn page, if the |
| * page is flushed to disk before the truncation WAL record. We cannot |
| * use MarkBufferDirtyHint here, because that will not dirty the page |
| * during recovery. |
| */ |
| MarkBufferDirty(buf); |
| if (!InRecovery && RelationNeedsWAL(rel) && XLogHintBitIsNeeded()) |
| log_newpage_buffer(buf, false); |
| |
| END_CRIT_SECTION(); |
| |
| UnlockReleaseBuffer(buf); |
| |
| new_nfsmblocks = fsm_logical_to_physical(first_removed_address) + 1; |
| } |
| else |
| { |
| new_nfsmblocks = fsm_logical_to_physical(first_removed_address); |
| if (smgrnblocks(rel->rd_smgr, FSM_FORKNUM) <= new_nfsmblocks) |
| return InvalidBlockNumber; /* nothing to do; the FSM was already |
| * smaller */ |
| } |
| |
| return new_nfsmblocks; |
| } |
| |
| /* |
| * FreeSpaceMapVacuum - update upper-level pages in the rel's FSM |
| * |
| * We assume that the bottom-level pages have already been updated with |
| * new free-space information. |
| */ |
| void |
| FreeSpaceMapVacuum(Relation rel) |
| { |
| bool dummy; |
| |
| /* Recursively scan the tree, starting at the root */ |
| (void) fsm_vacuum_page(rel, FSM_ROOT_ADDRESS, |
| (BlockNumber) 0, InvalidBlockNumber, |
| &dummy); |
| } |
| |
| /* |
| * FreeSpaceMapVacuumRange - update upper-level pages in the rel's FSM |
| * |
| * As above, but assume that only heap pages between start and end-1 inclusive |
| * have new free-space information, so update only the upper-level slots |
| * covering that block range. end == InvalidBlockNumber is equivalent to |
| * "all the rest of the relation". |
| */ |
| void |
| FreeSpaceMapVacuumRange(Relation rel, BlockNumber start, BlockNumber end) |
| { |
| bool dummy; |
| |
| /* Recursively scan the tree, starting at the root */ |
| if (end > start) |
| (void) fsm_vacuum_page(rel, FSM_ROOT_ADDRESS, start, end, &dummy); |
| } |
| |
| /******** Internal routines ********/ |
| |
| /* |
| * Return category corresponding x bytes of free space |
| */ |
| static uint8 |
| fsm_space_avail_to_cat(Size avail) |
| { |
| int cat; |
| |
| Assert(avail < BLCKSZ); |
| |
| if (avail >= MaxFSMRequestSize) |
| return 255; |
| |
| cat = avail / FSM_CAT_STEP; |
| |
| /* |
| * The highest category, 255, is reserved for MaxFSMRequestSize bytes or |
| * more. |
| */ |
| if (cat > 254) |
| cat = 254; |
| |
| return (uint8) cat; |
| } |
| |
| /* |
| * Return the lower bound of the range of free space represented by given |
| * category. |
| */ |
| static Size |
| fsm_space_cat_to_avail(uint8 cat) |
| { |
| /* The highest category represents exactly MaxFSMRequestSize bytes. */ |
| if (cat == 255) |
| return MaxFSMRequestSize; |
| else |
| return cat * FSM_CAT_STEP; |
| } |
| |
| /* |
| * Which category does a page need to have, to accommodate x bytes of data? |
| * While fsm_space_avail_to_cat() rounds down, this needs to round up. |
| */ |
| static uint8 |
| fsm_space_needed_to_cat(Size needed) |
| { |
| int cat; |
| |
| /* Can't ask for more space than the highest category represents */ |
| if (needed > MaxFSMRequestSize) |
| elog(ERROR, "invalid FSM request size %zu", needed); |
| |
| if (needed == 0) |
| return 1; |
| |
| cat = (needed + FSM_CAT_STEP - 1) / FSM_CAT_STEP; |
| |
| if (cat > 255) |
| cat = 255; |
| |
| return (uint8) cat; |
| } |
| |
| /* |
| * Returns the physical block number of a FSM page |
| */ |
| static BlockNumber |
| fsm_logical_to_physical(FSMAddress addr) |
| { |
| BlockNumber pages; |
| int leafno; |
| int l; |
| |
| /* |
| * Calculate the logical page number of the first leaf page below the |
| * given page. |
| */ |
| leafno = addr.logpageno; |
| for (l = 0; l < addr.level; l++) |
| leafno *= SlotsPerFSMPage; |
| |
| /* Count upper level nodes required to address the leaf page */ |
| pages = 0; |
| for (l = 0; l < FSM_TREE_DEPTH; l++) |
| { |
| pages += leafno + 1; |
| leafno /= SlotsPerFSMPage; |
| } |
| |
| /* |
| * If the page we were asked for wasn't at the bottom level, subtract the |
| * additional lower level pages we counted above. |
| */ |
| pages -= addr.level; |
| |
| /* Turn the page count into 0-based block number */ |
| return pages - 1; |
| } |
| |
| /* |
| * Return the FSM location corresponding to given heap block. |
| */ |
| static FSMAddress |
| fsm_get_location(BlockNumber heapblk, uint16 *slot) |
| { |
| FSMAddress addr; |
| |
| addr.level = FSM_BOTTOM_LEVEL; |
| addr.logpageno = heapblk / SlotsPerFSMPage; |
| *slot = heapblk % SlotsPerFSMPage; |
| |
| return addr; |
| } |
| |
| /* |
| * Return the heap block number corresponding to given location in the FSM. |
| */ |
| static BlockNumber |
| fsm_get_heap_blk(FSMAddress addr, uint16 slot) |
| { |
| Assert(addr.level == FSM_BOTTOM_LEVEL); |
| return ((unsigned int) addr.logpageno) * SlotsPerFSMPage + slot; |
| } |
| |
| /* |
| * Given a logical address of a child page, get the logical page number of |
| * the parent, and the slot within the parent corresponding to the child. |
| */ |
| static FSMAddress |
| fsm_get_parent(FSMAddress child, uint16 *slot) |
| { |
| FSMAddress parent; |
| |
| Assert(child.level < FSM_ROOT_LEVEL); |
| |
| parent.level = child.level + 1; |
| parent.logpageno = child.logpageno / SlotsPerFSMPage; |
| *slot = child.logpageno % SlotsPerFSMPage; |
| |
| return parent; |
| } |
| |
| /* |
| * Given a logical address of a parent page and a slot number, get the |
| * logical address of the corresponding child page. |
| */ |
| static FSMAddress |
| fsm_get_child(FSMAddress parent, uint16 slot) |
| { |
| FSMAddress child; |
| |
| Assert(parent.level > FSM_BOTTOM_LEVEL); |
| |
| child.level = parent.level - 1; |
| child.logpageno = parent.logpageno * SlotsPerFSMPage + slot; |
| |
| return child; |
| } |
| |
| /* |
| * Read a FSM page. |
| * |
| * If the page doesn't exist, InvalidBuffer is returned, or if 'extend' is |
| * true, the FSM file is extended. |
| */ |
| static Buffer |
| fsm_readbuf(Relation rel, FSMAddress addr, bool extend) |
| { |
| BlockNumber blkno = fsm_logical_to_physical(addr); |
| Buffer buf; |
| |
| RelationOpenSmgr(rel); |
| |
| /* |
| * If we haven't cached the size of the FSM yet, check it first. Also |
| * recheck if the requested block seems to be past end, since our cached |
| * value might be stale. (We send smgr inval messages on truncation, but |
| * not on extension.) |
| */ |
| if (rel->rd_smgr->smgr_cached_nblocks[FSM_FORKNUM] == InvalidBlockNumber || |
| blkno >= rel->rd_smgr->smgr_cached_nblocks[FSM_FORKNUM]) |
| { |
| /* Invalidate the cache so smgrnblocks asks the kernel. */ |
| rel->rd_smgr->smgr_cached_nblocks[FSM_FORKNUM] = InvalidBlockNumber; |
| if (smgrexists(rel->rd_smgr, FSM_FORKNUM)) |
| smgrnblocks(rel->rd_smgr, FSM_FORKNUM); |
| else |
| rel->rd_smgr->smgr_cached_nblocks[FSM_FORKNUM] = 0; |
| } |
| |
| /* Handle requests beyond EOF */ |
| if (blkno >= rel->rd_smgr->smgr_cached_nblocks[FSM_FORKNUM]) |
| { |
| if (extend) |
| fsm_extend(rel, blkno + 1); |
| else |
| return InvalidBuffer; |
| } |
| |
| /* |
| * Use ZERO_ON_ERROR mode, and initialize the page if necessary. The FSM |
| * information is not accurate anyway, so it's better to clear corrupt |
| * pages than error out. Since the FSM changes are not WAL-logged, the |
| * so-called torn page problem on crash can lead to pages with corrupt |
| * headers, for example. |
| * |
| * The initialize-the-page part is trickier than it looks, because of the |
| * possibility of multiple backends doing this concurrently, and our |
| * desire to not uselessly take the buffer lock in the normal path where |
| * the page is OK. We must take the lock to initialize the page, so |
| * recheck page newness after we have the lock, in case someone else |
| * already did it. Also, because we initially check PageIsNew with no |
| * lock, it's possible to fall through and return the buffer while someone |
| * else is still initializing the page (i.e., we might see pd_upper as set |
| * but other page header fields are still zeroes). This is harmless for |
| * callers that will take a buffer lock themselves, but some callers |
| * inspect the page without any lock at all. The latter is OK only so |
| * long as it doesn't depend on the page header having correct contents. |
| * Current usage is safe because PageGetContents() does not require that. |
| */ |
| buf = ReadBufferExtended(rel, FSM_FORKNUM, blkno, RBM_ZERO_ON_ERROR, NULL); |
| if (PageIsNew(BufferGetPage(buf))) |
| { |
| LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); |
| if (PageIsNew(BufferGetPage(buf))) |
| PageInit(BufferGetPage(buf), BLCKSZ, 0); |
| LockBuffer(buf, BUFFER_LOCK_UNLOCK); |
| } |
| return buf; |
| } |
| |
| /* |
| * Ensure that the FSM fork is at least fsm_nblocks long, extending |
| * it if necessary with empty pages. And by empty, I mean pages filled |
| * with zeros, meaning there's no free space. |
| */ |
| static void |
| fsm_extend(Relation rel, BlockNumber fsm_nblocks) |
| { |
| BlockNumber fsm_nblocks_now; |
| PGAlignedBlock pg; |
| |
| PageInit((Page) pg.data, BLCKSZ, 0); |
| |
| /* |
| * We use the relation extension lock to lock out other backends trying to |
| * extend the FSM at the same time. It also locks out extension of the |
| * main fork, unnecessarily, but extending the FSM happens seldom enough |
| * that it doesn't seem worthwhile to have a separate lock tag type for |
| * it. |
| * |
| * Note that another backend might have extended or created the relation |
| * by the time we get the lock. |
| */ |
| LockRelationForExtension(rel, ExclusiveLock); |
| |
| /* Might have to re-open if a cache flush happened */ |
| RelationOpenSmgr(rel); |
| |
| /* |
| * Create the FSM file first if it doesn't exist. If |
| * smgr_cached_nblocks[FSM_FORKNUM] is positive then it must exist, no |
| * need for an smgrexists call. |
| */ |
| if ((rel->rd_smgr->smgr_cached_nblocks[FSM_FORKNUM] == 0 || |
| rel->rd_smgr->smgr_cached_nblocks[FSM_FORKNUM] == InvalidBlockNumber) && |
| !smgrexists(rel->rd_smgr, FSM_FORKNUM)) |
| smgrcreate(rel->rd_smgr, FSM_FORKNUM, false); |
| |
| /* Invalidate cache so that smgrnblocks() asks the kernel. */ |
| rel->rd_smgr->smgr_cached_nblocks[FSM_FORKNUM] = InvalidBlockNumber; |
| fsm_nblocks_now = smgrnblocks(rel->rd_smgr, FSM_FORKNUM); |
| |
| while (fsm_nblocks_now < fsm_nblocks) |
| { |
| PageSetChecksumInplace((Page) pg.data, fsm_nblocks_now); |
| |
| smgrextend(rel->rd_smgr, FSM_FORKNUM, fsm_nblocks_now, |
| pg.data, false); |
| fsm_nblocks_now++; |
| } |
| |
| UnlockRelationForExtension(rel, ExclusiveLock); |
| } |
| |
| /* |
| * Set value in given FSM page and slot. |
| * |
| * If minValue > 0, the updated page is also searched for a page with at |
| * least minValue of free space. If one is found, its slot number is |
| * returned, -1 otherwise. |
| */ |
| static int |
| fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot, |
| uint8 newValue, uint8 minValue) |
| { |
| Buffer buf; |
| Page page; |
| int newslot = -1; |
| |
| buf = fsm_readbuf(rel, addr, true); |
| LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); |
| |
| page = BufferGetPage(buf); |
| |
| if (fsm_set_avail(page, slot, newValue)) |
| MarkBufferDirtyHint(buf, false); |
| |
| if (minValue != 0) |
| { |
| /* Search while we still hold the lock */ |
| newslot = fsm_search_avail(buf, minValue, |
| addr.level == FSM_BOTTOM_LEVEL, |
| true); |
| } |
| |
| UnlockReleaseBuffer(buf); |
| |
| return newslot; |
| } |
| |
| /* |
| * Search the tree for a heap page with at least min_cat of free space |
| */ |
| static BlockNumber |
| fsm_search(Relation rel, uint8 min_cat) |
| { |
| int restarts = 0; |
| FSMAddress addr = FSM_ROOT_ADDRESS; |
| |
| for (;;) |
| { |
| int slot; |
| Buffer buf; |
| uint8 max_avail = 0; |
| |
| /* Read the FSM page. */ |
| buf = fsm_readbuf(rel, addr, false); |
| |
| /* Search within the page */ |
| if (BufferIsValid(buf)) |
| { |
| LockBuffer(buf, BUFFER_LOCK_SHARE); |
| slot = fsm_search_avail(buf, min_cat, |
| (addr.level == FSM_BOTTOM_LEVEL), |
| false); |
| if (slot == -1) |
| max_avail = fsm_get_max_avail(BufferGetPage(buf)); |
| UnlockReleaseBuffer(buf); |
| } |
| else |
| slot = -1; |
| |
| if (slot != -1) |
| { |
| /* |
| * Descend the tree, or return the found block if we're at the |
| * bottom. |
| */ |
| if (addr.level == FSM_BOTTOM_LEVEL) |
| return fsm_get_heap_blk(addr, slot); |
| |
| addr = fsm_get_child(addr, slot); |
| } |
| else if (addr.level == FSM_ROOT_LEVEL) |
| { |
| /* |
| * At the root, failure means there's no page with enough free |
| * space in the FSM. Give up. |
| */ |
| return InvalidBlockNumber; |
| } |
| else |
| { |
| uint16 parentslot; |
| FSMAddress parent; |
| |
| /* |
| * At lower level, failure can happen if the value in the upper- |
| * level node didn't reflect the value on the lower page. Update |
| * the upper node, to avoid falling into the same trap again, and |
| * start over. |
| * |
| * There's a race condition here, if another backend updates this |
| * page right after we release it, and gets the lock on the parent |
| * page before us. We'll then update the parent page with the now |
| * stale information we had. It's OK, because it should happen |
| * rarely, and will be fixed by the next vacuum. |
| */ |
| parent = fsm_get_parent(addr, &parentslot); |
| fsm_set_and_search(rel, parent, parentslot, max_avail, 0); |
| |
| /* |
| * If the upper pages are badly out of date, we might need to loop |
| * quite a few times, updating them as we go. Any inconsistencies |
| * should eventually be corrected and the loop should end. Looping |
| * indefinitely is nevertheless scary, so provide an emergency |
| * valve. |
| */ |
| if (restarts++ > 10000) |
| return InvalidBlockNumber; |
| |
| /* Start search all over from the root */ |
| addr = FSM_ROOT_ADDRESS; |
| } |
| } |
| } |
| |
| |
| /* |
| * Recursive guts of FreeSpaceMapVacuum |
| * |
| * Examine the FSM page indicated by addr, as well as its children, updating |
| * upper-level nodes that cover the heap block range from start to end-1. |
| * (It's okay if end is beyond the actual end of the map.) |
| * Return the maximum freespace value on this page. |
| * |
| * If addr is past the end of the FSM, set *eof_p to true and return 0. |
| * |
| * This traverses the tree in depth-first order. The tree is stored |
| * physically in depth-first order, so this should be pretty I/O efficient. |
| */ |
| static uint8 |
| fsm_vacuum_page(Relation rel, FSMAddress addr, |
| BlockNumber start, BlockNumber end, |
| bool *eof_p) |
| { |
| Buffer buf; |
| Page page; |
| uint8 max_avail; |
| |
| /* Read the page if it exists, or return EOF */ |
| buf = fsm_readbuf(rel, addr, false); |
| if (!BufferIsValid(buf)) |
| { |
| *eof_p = true; |
| return 0; |
| } |
| else |
| *eof_p = false; |
| |
| page = BufferGetPage(buf); |
| |
| /* |
| * If we're above the bottom level, recurse into children, and fix the |
| * information stored about them at this level. |
| */ |
| if (addr.level > FSM_BOTTOM_LEVEL) |
| { |
| FSMAddress fsm_start, |
| fsm_end; |
| uint16 fsm_start_slot, |
| fsm_end_slot; |
| int slot, |
| start_slot, |
| end_slot; |
| bool eof = false; |
| |
| /* |
| * Compute the range of slots we need to update on this page, given |
| * the requested range of heap blocks to consider. The first slot to |
| * update is the one covering the "start" block, and the last slot is |
| * the one covering "end - 1". (Some of this work will be duplicated |
| * in each recursive call, but it's cheap enough to not worry about.) |
| */ |
| fsm_start = fsm_get_location(start, &fsm_start_slot); |
| fsm_end = fsm_get_location(end - 1, &fsm_end_slot); |
| |
| while (fsm_start.level < addr.level) |
| { |
| fsm_start = fsm_get_parent(fsm_start, &fsm_start_slot); |
| fsm_end = fsm_get_parent(fsm_end, &fsm_end_slot); |
| } |
| Assert(fsm_start.level == addr.level); |
| |
| if (fsm_start.logpageno == addr.logpageno) |
| start_slot = fsm_start_slot; |
| else if (fsm_start.logpageno > addr.logpageno) |
| start_slot = SlotsPerFSMPage; /* shouldn't get here... */ |
| else |
| start_slot = 0; |
| |
| if (fsm_end.logpageno == addr.logpageno) |
| end_slot = fsm_end_slot; |
| else if (fsm_end.logpageno > addr.logpageno) |
| end_slot = SlotsPerFSMPage - 1; |
| else |
| end_slot = -1; /* shouldn't get here... */ |
| |
| for (slot = start_slot; slot <= end_slot; slot++) |
| { |
| int child_avail; |
| |
| CHECK_FOR_INTERRUPTS(); |
| |
| /* After we hit end-of-file, just clear the rest of the slots */ |
| if (!eof) |
| child_avail = fsm_vacuum_page(rel, fsm_get_child(addr, slot), |
| start, end, |
| &eof); |
| else |
| child_avail = 0; |
| |
| /* Update information about the child */ |
| if (fsm_get_avail(page, slot) != child_avail) |
| { |
| LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); |
| fsm_set_avail(page, slot, child_avail); |
| MarkBufferDirtyHint(buf, false); |
| LockBuffer(buf, BUFFER_LOCK_UNLOCK); |
| } |
| } |
| } |
| |
| /* Now get the maximum value on the page, to return to caller */ |
| max_avail = fsm_get_max_avail(page); |
| |
| /* |
| * Reset the next slot pointer. This encourages the use of low-numbered |
| * pages, increasing the chances that a later vacuum can truncate the |
| * relation. We don't bother with a lock here, nor with marking the page |
| * dirty if it wasn't already, since this is just a hint. |
| */ |
| #ifdef MPROTECT_BUFFERS |
| /* |
| * When mprotect() is used to detect shared buffer access violations, lock |
| * must be acquired so that write access is allowed on this buffer. |
| */ |
| LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); |
| #endif |
| ((FSMPage) PageGetContents(page))->fp_next_slot = 0; |
| #ifdef MPROTECT_BUFFERS |
| LockBuffer(buf, BUFFER_LOCK_UNLOCK); |
| #endif |
| ReleaseBuffer(buf); |
| |
| return max_avail; |
| } |