| /*------------------------------------------------------------------------- |
| * |
| * hashovfl.c |
| * Overflow page management code for the Postgres hash access method |
| * |
| * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group |
| * Portions Copyright (c) 1994, Regents of the University of California |
| * |
| * |
| * IDENTIFICATION |
| * src/backend/access/hash/hashovfl.c |
| * |
| * NOTES |
| * Overflow pages look like ordinary relation pages. |
| * |
| *------------------------------------------------------------------------- |
| */ |
| #include "postgres.h" |
| |
| #include "access/hash.h" |
| #include "access/hash_xlog.h" |
| #include "miscadmin.h" |
| #include "utils/rel.h" |
| |
| |
| static uint32 _hash_firstfreebit(uint32 map); |
| |
| |
| /* |
| * Convert overflow page bit number (its index in the free-page bitmaps) |
| * to block number within the index. |
| */ |
| static BlockNumber |
| bitno_to_blkno(HashMetaPage metap, uint32 ovflbitnum) |
| { |
| uint32 splitnum = metap->hashm_ovflpoint; |
| uint32 i; |
| |
| /* Convert zero-based bitnumber to 1-based page number */ |
| ovflbitnum += 1; |
| |
| /* Determine the split number for this page (must be >= 1) */ |
| for (i = 1; |
| i < splitnum && ovflbitnum > metap->hashm_spares[i]; |
| i++) |
| /* loop */ ; |
| |
| /* |
| * Convert to absolute page number by adding the number of bucket pages |
| * that exist before this split point. |
| */ |
| return (BlockNumber) (_hash_get_totalbuckets(i) + ovflbitnum); |
| } |
| |
| /* |
| * _hash_ovflblkno_to_bitno |
| * |
| * Convert overflow page block number to bit number for free-page bitmap. |
| */ |
| uint32 |
| _hash_ovflblkno_to_bitno(HashMetaPage metap, BlockNumber ovflblkno) |
| { |
| uint32 splitnum = metap->hashm_ovflpoint; |
| uint32 i; |
| uint32 bitnum; |
| |
| /* Determine the split number containing this page */ |
| for (i = 1; i <= splitnum; i++) |
| { |
| if (ovflblkno <= (BlockNumber) _hash_get_totalbuckets(i)) |
| break; /* oops */ |
| bitnum = ovflblkno - _hash_get_totalbuckets(i); |
| |
| /* |
| * bitnum has to be greater than number of overflow page added in |
| * previous split point. The overflow page at this splitnum (i) if any |
| * should start from (_hash_get_totalbuckets(i) + |
| * metap->hashm_spares[i - 1] + 1). |
| */ |
| if (bitnum > metap->hashm_spares[i - 1] && |
| bitnum <= metap->hashm_spares[i]) |
| return bitnum - 1; /* -1 to convert 1-based to 0-based */ |
| } |
| |
| ereport(ERROR, |
| (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
| errmsg("invalid overflow block number %u", ovflblkno))); |
| return 0; /* keep compiler quiet */ |
| } |
| |
| /* |
| * _hash_addovflpage |
| * |
| * Add an overflow page to the bucket whose last page is pointed to by 'buf'. |
| * |
| * On entry, the caller must hold a pin but no lock on 'buf'. The pin is |
| * dropped before exiting (we assume the caller is not interested in 'buf' |
| * anymore) if not asked to retain. The pin will be retained only for the |
| * primary bucket. The returned overflow page will be pinned and |
| * write-locked; it is guaranteed to be empty. |
| * |
| * The caller must hold a pin, but no lock, on the metapage buffer. |
| * That buffer is returned in the same state. |
| * |
| * NB: since this could be executed concurrently by multiple processes, |
| * one should not assume that the returned overflow page will be the |
| * immediate successor of the originally passed 'buf'. Additional overflow |
| * pages might have been added to the bucket chain in between. |
| */ |
| Buffer |
| _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin) |
| { |
| Buffer ovflbuf; |
| Page page; |
| Page ovflpage; |
| HashPageOpaque pageopaque; |
| HashPageOpaque ovflopaque; |
| HashMetaPage metap; |
| Buffer mapbuf = InvalidBuffer; |
| Buffer newmapbuf = InvalidBuffer; |
| BlockNumber blkno; |
| uint32 orig_firstfree; |
| uint32 splitnum; |
| uint32 *freep = NULL; |
| uint32 max_ovflpg; |
| uint32 bit; |
| uint32 bitmap_page_bit; |
| uint32 first_page; |
| uint32 last_bit; |
| uint32 last_page; |
| uint32 i, |
| j; |
| bool page_found = false; |
| |
| /* |
| * Write-lock the tail page. Here, we need to maintain locking order such |
| * that, first acquire the lock on tail page of bucket, then on meta page |
| * to find and lock the bitmap page and if it is found, then lock on meta |
| * page is released, then finally acquire the lock on new overflow buffer. |
| * We need this locking order to avoid deadlock with backends that are |
| * doing inserts. |
| * |
| * Note: We could have avoided locking many buffers here if we made two |
| * WAL records for acquiring an overflow page (one to allocate an overflow |
| * page and another to add it to overflow bucket chain). However, doing |
| * so can leak an overflow page, if the system crashes after allocation. |
| * Needless to say, it is better to have a single record from a |
| * performance point of view as well. |
| */ |
| LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); |
| |
| /* probably redundant... */ |
| _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); |
| |
| /* loop to find current tail page, in case someone else inserted too */ |
| for (;;) |
| { |
| BlockNumber nextblkno; |
| |
| page = BufferGetPage(buf); |
| pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); |
| nextblkno = pageopaque->hasho_nextblkno; |
| |
| if (!BlockNumberIsValid(nextblkno)) |
| break; |
| |
| /* we assume we do not need to write the unmodified page */ |
| if (retain_pin) |
| { |
| /* pin will be retained only for the primary bucket page */ |
| Assert((pageopaque->hasho_flag & LH_PAGE_TYPE) == LH_BUCKET_PAGE); |
| LockBuffer(buf, BUFFER_LOCK_UNLOCK); |
| } |
| else |
| _hash_relbuf(rel, buf); |
| |
| retain_pin = false; |
| |
| buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE); |
| } |
| |
| /* Get exclusive lock on the meta page */ |
| LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); |
| |
| _hash_checkpage(rel, metabuf, LH_META_PAGE); |
| metap = HashPageGetMeta(BufferGetPage(metabuf)); |
| |
| /* start search at hashm_firstfree */ |
| orig_firstfree = metap->hashm_firstfree; |
| first_page = orig_firstfree >> BMPG_SHIFT(metap); |
| bit = orig_firstfree & BMPG_MASK(metap); |
| i = first_page; |
| j = bit / BITS_PER_MAP; |
| bit &= ~(BITS_PER_MAP - 1); |
| |
| /* outer loop iterates once per bitmap page */ |
| for (;;) |
| { |
| BlockNumber mapblkno; |
| Page mappage; |
| uint32 last_inpage; |
| |
| /* want to end search with the last existing overflow page */ |
| splitnum = metap->hashm_ovflpoint; |
| max_ovflpg = metap->hashm_spares[splitnum] - 1; |
| last_page = max_ovflpg >> BMPG_SHIFT(metap); |
| last_bit = max_ovflpg & BMPG_MASK(metap); |
| |
| if (i > last_page) |
| break; |
| |
| Assert(i < metap->hashm_nmaps); |
| mapblkno = metap->hashm_mapp[i]; |
| |
| if (i == last_page) |
| last_inpage = last_bit; |
| else |
| last_inpage = BMPGSZ_BIT(metap) - 1; |
| |
| /* Release exclusive lock on metapage while reading bitmap page */ |
| LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); |
| |
| mapbuf = _hash_getbuf(rel, mapblkno, HASH_WRITE, LH_BITMAP_PAGE); |
| mappage = BufferGetPage(mapbuf); |
| freep = HashPageGetBitmap(mappage); |
| |
| for (; bit <= last_inpage; j++, bit += BITS_PER_MAP) |
| { |
| if (freep[j] != ALL_SET) |
| { |
| page_found = true; |
| |
| /* Reacquire exclusive lock on the meta page */ |
| LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); |
| |
| /* convert bit to bit number within page */ |
| bit += _hash_firstfreebit(freep[j]); |
| bitmap_page_bit = bit; |
| |
| /* convert bit to absolute bit number */ |
| bit += (i << BMPG_SHIFT(metap)); |
| /* Calculate address of the recycled overflow page */ |
| blkno = bitno_to_blkno(metap, bit); |
| |
| /* Fetch and init the recycled page */ |
| ovflbuf = _hash_getinitbuf(rel, blkno); |
| |
| goto found; |
| } |
| } |
| |
| /* No free space here, try to advance to next map page */ |
| _hash_relbuf(rel, mapbuf); |
| mapbuf = InvalidBuffer; |
| i++; |
| j = 0; /* scan from start of next map page */ |
| bit = 0; |
| |
| /* Reacquire exclusive lock on the meta page */ |
| LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); |
| } |
| |
| /* |
| * No free pages --- have to extend the relation to add an overflow page. |
| * First, check to see if we have to add a new bitmap page too. |
| */ |
| if (last_bit == (uint32) (BMPGSZ_BIT(metap) - 1)) |
| { |
| /* |
| * We create the new bitmap page with all pages marked "in use". |
| * Actually two pages in the new bitmap's range will exist |
| * immediately: the bitmap page itself, and the following page which |
| * is the one we return to the caller. Both of these are correctly |
| * marked "in use". Subsequent pages do not exist yet, but it is |
| * convenient to pre-mark them as "in use" too. |
| */ |
| bit = metap->hashm_spares[splitnum]; |
| |
| /* metapage already has a write lock */ |
| if (metap->hashm_nmaps >= HASH_MAX_BITMAPS) |
| ereport(ERROR, |
| (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), |
| errmsg("out of overflow pages in hash index \"%s\"", |
| RelationGetRelationName(rel)))); |
| |
| newmapbuf = _hash_getnewbuf(rel, bitno_to_blkno(metap, bit), MAIN_FORKNUM); |
| } |
| else |
| { |
| /* |
| * Nothing to do here; since the page will be past the last used page, |
| * we know its bitmap bit was preinitialized to "in use". |
| */ |
| } |
| |
| /* Calculate address of the new overflow page */ |
| bit = BufferIsValid(newmapbuf) ? |
| metap->hashm_spares[splitnum] + 1 : metap->hashm_spares[splitnum]; |
| blkno = bitno_to_blkno(metap, bit); |
| |
| /* |
| * Fetch the page with _hash_getnewbuf to ensure smgr's idea of the |
| * relation length stays in sync with ours. XXX It's annoying to do this |
| * with metapage write lock held; would be better to use a lock that |
| * doesn't block incoming searches. |
| * |
| * It is okay to hold two buffer locks here (one on tail page of bucket |
| * and other on new overflow page) since there cannot be anyone else |
| * contending for access to ovflbuf. |
| */ |
| ovflbuf = _hash_getnewbuf(rel, blkno, MAIN_FORKNUM); |
| |
| found: |
| |
| /* |
| * Do the update. No ereport(ERROR) until changes are logged. We want to |
| * log the changes for bitmap page and overflow page together to avoid |
| * loss of pages in case the new page is added. |
| */ |
| START_CRIT_SECTION(); |
| |
| if (page_found) |
| { |
| Assert(BufferIsValid(mapbuf)); |
| |
| /* mark page "in use" in the bitmap */ |
| SETBIT(freep, bitmap_page_bit); |
| MarkBufferDirty(mapbuf); |
| } |
| else |
| { |
| /* update the count to indicate new overflow page is added */ |
| metap->hashm_spares[splitnum]++; |
| |
| if (BufferIsValid(newmapbuf)) |
| { |
| _hash_initbitmapbuffer(newmapbuf, metap->hashm_bmsize, false); |
| MarkBufferDirty(newmapbuf); |
| |
| /* add the new bitmap page to the metapage's list of bitmaps */ |
| metap->hashm_mapp[metap->hashm_nmaps] = BufferGetBlockNumber(newmapbuf); |
| metap->hashm_nmaps++; |
| metap->hashm_spares[splitnum]++; |
| } |
| |
| MarkBufferDirty(metabuf); |
| |
| /* |
| * for new overflow page, we don't need to explicitly set the bit in |
| * bitmap page, as by default that will be set to "in use". |
| */ |
| } |
| |
| /* |
| * Adjust hashm_firstfree to avoid redundant searches. But don't risk |
| * changing it if someone moved it while we were searching bitmap pages. |
| */ |
| if (metap->hashm_firstfree == orig_firstfree) |
| { |
| metap->hashm_firstfree = bit + 1; |
| MarkBufferDirty(metabuf); |
| } |
| |
| /* initialize new overflow page */ |
| ovflpage = BufferGetPage(ovflbuf); |
| ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage); |
| ovflopaque->hasho_prevblkno = BufferGetBlockNumber(buf); |
| ovflopaque->hasho_nextblkno = InvalidBlockNumber; |
| ovflopaque->hasho_bucket = pageopaque->hasho_bucket; |
| ovflopaque->hasho_flag = LH_OVERFLOW_PAGE; |
| ovflopaque->hasho_page_id = HASHO_PAGE_ID; |
| |
| MarkBufferDirty(ovflbuf); |
| |
| /* logically chain overflow page to previous page */ |
| pageopaque->hasho_nextblkno = BufferGetBlockNumber(ovflbuf); |
| |
| MarkBufferDirty(buf); |
| |
| /* XLOG stuff */ |
| if (RelationNeedsWAL(rel)) |
| { |
| XLogRecPtr recptr; |
| xl_hash_add_ovfl_page xlrec; |
| |
| xlrec.bmpage_found = page_found; |
| xlrec.bmsize = metap->hashm_bmsize; |
| |
| XLogBeginInsert(); |
| XLogRegisterData((char *) &xlrec, SizeOfHashAddOvflPage); |
| |
| XLogRegisterBuffer(0, ovflbuf, REGBUF_WILL_INIT); |
| XLogRegisterBufData(0, (char *) &pageopaque->hasho_bucket, sizeof(Bucket)); |
| |
| XLogRegisterBuffer(1, buf, REGBUF_STANDARD); |
| |
| if (BufferIsValid(mapbuf)) |
| { |
| XLogRegisterBuffer(2, mapbuf, REGBUF_STANDARD); |
| XLogRegisterBufData(2, (char *) &bitmap_page_bit, sizeof(uint32)); |
| } |
| |
| if (BufferIsValid(newmapbuf)) |
| XLogRegisterBuffer(3, newmapbuf, REGBUF_WILL_INIT); |
| |
| XLogRegisterBuffer(4, metabuf, REGBUF_STANDARD); |
| XLogRegisterBufData(4, (char *) &metap->hashm_firstfree, sizeof(uint32)); |
| |
| recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_ADD_OVFL_PAGE); |
| |
| PageSetLSN(BufferGetPage(ovflbuf), recptr); |
| PageSetLSN(BufferGetPage(buf), recptr); |
| |
| if (BufferIsValid(mapbuf)) |
| PageSetLSN(BufferGetPage(mapbuf), recptr); |
| |
| if (BufferIsValid(newmapbuf)) |
| PageSetLSN(BufferGetPage(newmapbuf), recptr); |
| |
| PageSetLSN(BufferGetPage(metabuf), recptr); |
| } |
| |
| END_CRIT_SECTION(); |
| |
| if (retain_pin) |
| LockBuffer(buf, BUFFER_LOCK_UNLOCK); |
| else |
| _hash_relbuf(rel, buf); |
| |
| if (BufferIsValid(mapbuf)) |
| _hash_relbuf(rel, mapbuf); |
| |
| LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); |
| |
| if (BufferIsValid(newmapbuf)) |
| _hash_relbuf(rel, newmapbuf); |
| |
| return ovflbuf; |
| } |
| |
| /* |
| * _hash_firstfreebit() |
| * |
| * Return the number of the first bit that is not set in the word 'map'. |
| */ |
| static uint32 |
| _hash_firstfreebit(uint32 map) |
| { |
| uint32 i, |
| mask; |
| |
| mask = 0x1; |
| for (i = 0; i < BITS_PER_MAP; i++) |
| { |
| if (!(mask & map)) |
| return i; |
| mask <<= 1; |
| } |
| |
| elog(ERROR, "firstfreebit found no free bit"); |
| |
| return 0; /* keep compiler quiet */ |
| } |
| |
| /* |
| * _hash_freeovflpage() - |
| * |
| * Remove this overflow page from its bucket's chain, and mark the page as |
| * free. On entry, ovflbuf is write-locked; it is released before exiting. |
| * |
| * Add the tuples (itups) to wbuf in this function. We could do that in the |
| * caller as well, but the advantage of doing it here is we can easily write |
| * the WAL for XLOG_HASH_SQUEEZE_PAGE operation. Addition of tuples and |
| * removal of overflow page has to done as an atomic operation, otherwise |
| * during replay on standby users might find duplicate records. |
| * |
| * Since this function is invoked in VACUUM, we provide an access strategy |
| * parameter that controls fetches of the bucket pages. |
| * |
| * Returns the block number of the page that followed the given page |
| * in the bucket, or InvalidBlockNumber if no following page. |
| * |
| * NB: caller must not hold lock on metapage, nor on page, that's next to |
| * ovflbuf in the bucket chain. We don't acquire the lock on page that's |
| * prior to ovflbuf in chain if it is same as wbuf because the caller already |
| * has a lock on same. |
| */ |
| BlockNumber |
| _hash_freeovflpage(Relation rel, Buffer bucketbuf, Buffer ovflbuf, |
| Buffer wbuf, IndexTuple *itups, OffsetNumber *itup_offsets, |
| Size *tups_size, uint16 nitups, |
| BufferAccessStrategy bstrategy) |
| { |
| HashMetaPage metap; |
| Buffer metabuf; |
| Buffer mapbuf; |
| BlockNumber ovflblkno; |
| BlockNumber prevblkno; |
| BlockNumber blkno; |
| BlockNumber nextblkno; |
| BlockNumber writeblkno; |
| HashPageOpaque ovflopaque; |
| Page ovflpage; |
| Page mappage; |
| uint32 *freep; |
| uint32 ovflbitno; |
| int32 bitmappage, |
| bitmapbit; |
| Bucket bucket PG_USED_FOR_ASSERTS_ONLY; |
| Buffer prevbuf = InvalidBuffer; |
| Buffer nextbuf = InvalidBuffer; |
| bool update_metap = false; |
| |
| /* Get information from the doomed page */ |
| _hash_checkpage(rel, ovflbuf, LH_OVERFLOW_PAGE); |
| ovflblkno = BufferGetBlockNumber(ovflbuf); |
| ovflpage = BufferGetPage(ovflbuf); |
| ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage); |
| nextblkno = ovflopaque->hasho_nextblkno; |
| prevblkno = ovflopaque->hasho_prevblkno; |
| writeblkno = BufferGetBlockNumber(wbuf); |
| bucket = ovflopaque->hasho_bucket; |
| |
| /* |
| * Fix up the bucket chain. this is a doubly-linked list, so we must fix |
| * up the bucket chain members behind and ahead of the overflow page being |
| * deleted. Concurrency issues are avoided by using lock chaining as |
| * described atop hashbucketcleanup. |
| */ |
| if (BlockNumberIsValid(prevblkno)) |
| { |
| if (prevblkno == writeblkno) |
| prevbuf = wbuf; |
| else |
| prevbuf = _hash_getbuf_with_strategy(rel, |
| prevblkno, |
| HASH_WRITE, |
| LH_BUCKET_PAGE | LH_OVERFLOW_PAGE, |
| bstrategy); |
| } |
| if (BlockNumberIsValid(nextblkno)) |
| nextbuf = _hash_getbuf_with_strategy(rel, |
| nextblkno, |
| HASH_WRITE, |
| LH_OVERFLOW_PAGE, |
| bstrategy); |
| |
| /* Note: bstrategy is intentionally not used for metapage and bitmap */ |
| |
| /* Read the metapage so we can determine which bitmap page to use */ |
| metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); |
| metap = HashPageGetMeta(BufferGetPage(metabuf)); |
| |
| /* Identify which bit to set */ |
| ovflbitno = _hash_ovflblkno_to_bitno(metap, ovflblkno); |
| |
| bitmappage = ovflbitno >> BMPG_SHIFT(metap); |
| bitmapbit = ovflbitno & BMPG_MASK(metap); |
| |
| if (bitmappage >= metap->hashm_nmaps) |
| elog(ERROR, "invalid overflow bit number %u", ovflbitno); |
| blkno = metap->hashm_mapp[bitmappage]; |
| |
| /* Release metapage lock while we access the bitmap page */ |
| LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); |
| |
| /* read the bitmap page to clear the bitmap bit */ |
| mapbuf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BITMAP_PAGE); |
| mappage = BufferGetPage(mapbuf); |
| freep = HashPageGetBitmap(mappage); |
| Assert(ISSET(freep, bitmapbit)); |
| |
| /* Get write-lock on metapage to update firstfree */ |
| LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); |
| |
| /* This operation needs to log multiple tuples, prepare WAL for that */ |
| if (RelationNeedsWAL(rel)) |
| XLogEnsureRecordSpace(HASH_XLOG_FREE_OVFL_BUFS, 4 + nitups); |
| |
| START_CRIT_SECTION(); |
| |
| /* |
| * we have to insert tuples on the "write" page, being careful to preserve |
| * hashkey ordering. (If we insert many tuples into the same "write" page |
| * it would be worth qsort'ing them). |
| */ |
| if (nitups > 0) |
| { |
| _hash_pgaddmultitup(rel, wbuf, itups, itup_offsets, nitups); |
| MarkBufferDirty(wbuf); |
| } |
| |
| /* |
| * Reinitialize the freed overflow page. Just zeroing the page won't |
| * work, because WAL replay routines expect pages to be initialized. See |
| * explanation of RBM_NORMAL mode atop XLogReadBufferExtended. We are |
| * careful to make the special space valid here so that tools like |
| * pageinspect won't get confused. |
| */ |
| _hash_pageinit(ovflpage, BufferGetPageSize(ovflbuf)); |
| |
| ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage); |
| |
| ovflopaque->hasho_prevblkno = InvalidBlockNumber; |
| ovflopaque->hasho_nextblkno = InvalidBlockNumber; |
| ovflopaque->hasho_bucket = -1; |
| ovflopaque->hasho_flag = LH_UNUSED_PAGE; |
| ovflopaque->hasho_page_id = HASHO_PAGE_ID; |
| |
| MarkBufferDirty(ovflbuf); |
| |
| if (BufferIsValid(prevbuf)) |
| { |
| Page prevpage = BufferGetPage(prevbuf); |
| HashPageOpaque prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage); |
| |
| Assert(prevopaque->hasho_bucket == bucket); |
| prevopaque->hasho_nextblkno = nextblkno; |
| MarkBufferDirty(prevbuf); |
| } |
| if (BufferIsValid(nextbuf)) |
| { |
| Page nextpage = BufferGetPage(nextbuf); |
| HashPageOpaque nextopaque = (HashPageOpaque) PageGetSpecialPointer(nextpage); |
| |
| Assert(nextopaque->hasho_bucket == bucket); |
| nextopaque->hasho_prevblkno = prevblkno; |
| MarkBufferDirty(nextbuf); |
| } |
| |
| /* Clear the bitmap bit to indicate that this overflow page is free */ |
| CLRBIT(freep, bitmapbit); |
| MarkBufferDirty(mapbuf); |
| |
| /* if this is now the first free page, update hashm_firstfree */ |
| if (ovflbitno < metap->hashm_firstfree) |
| { |
| metap->hashm_firstfree = ovflbitno; |
| update_metap = true; |
| MarkBufferDirty(metabuf); |
| } |
| |
| /* XLOG stuff */ |
| if (RelationNeedsWAL(rel)) |
| { |
| xl_hash_squeeze_page xlrec; |
| XLogRecPtr recptr; |
| int i; |
| |
| xlrec.prevblkno = prevblkno; |
| xlrec.nextblkno = nextblkno; |
| xlrec.ntups = nitups; |
| xlrec.is_prim_bucket_same_wrt = (wbuf == bucketbuf); |
| xlrec.is_prev_bucket_same_wrt = (wbuf == prevbuf); |
| |
| XLogBeginInsert(); |
| XLogRegisterData((char *) &xlrec, SizeOfHashSqueezePage); |
| |
| /* |
| * bucket buffer needs to be registered to ensure that we can acquire |
| * a cleanup lock on it during replay. |
| */ |
| if (!xlrec.is_prim_bucket_same_wrt) |
| XLogRegisterBuffer(0, bucketbuf, REGBUF_STANDARD | REGBUF_NO_IMAGE); |
| |
| XLogRegisterBuffer(1, wbuf, REGBUF_STANDARD); |
| if (xlrec.ntups > 0) |
| { |
| XLogRegisterBufData(1, (char *) itup_offsets, |
| nitups * sizeof(OffsetNumber)); |
| for (i = 0; i < nitups; i++) |
| XLogRegisterBufData(1, (char *) itups[i], tups_size[i]); |
| } |
| |
| XLogRegisterBuffer(2, ovflbuf, REGBUF_STANDARD); |
| |
| /* |
| * If prevpage and the writepage (block in which we are moving tuples |
| * from overflow) are same, then no need to separately register |
| * prevpage. During replay, we can directly update the nextblock in |
| * writepage. |
| */ |
| if (BufferIsValid(prevbuf) && !xlrec.is_prev_bucket_same_wrt) |
| XLogRegisterBuffer(3, prevbuf, REGBUF_STANDARD); |
| |
| if (BufferIsValid(nextbuf)) |
| XLogRegisterBuffer(4, nextbuf, REGBUF_STANDARD); |
| |
| XLogRegisterBuffer(5, mapbuf, REGBUF_STANDARD); |
| XLogRegisterBufData(5, (char *) &bitmapbit, sizeof(uint32)); |
| |
| if (update_metap) |
| { |
| XLogRegisterBuffer(6, metabuf, REGBUF_STANDARD); |
| XLogRegisterBufData(6, (char *) &metap->hashm_firstfree, sizeof(uint32)); |
| } |
| |
| recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SQUEEZE_PAGE); |
| |
| PageSetLSN(BufferGetPage(wbuf), recptr); |
| PageSetLSN(BufferGetPage(ovflbuf), recptr); |
| |
| if (BufferIsValid(prevbuf) && !xlrec.is_prev_bucket_same_wrt) |
| PageSetLSN(BufferGetPage(prevbuf), recptr); |
| if (BufferIsValid(nextbuf)) |
| PageSetLSN(BufferGetPage(nextbuf), recptr); |
| |
| PageSetLSN(BufferGetPage(mapbuf), recptr); |
| |
| if (update_metap) |
| PageSetLSN(BufferGetPage(metabuf), recptr); |
| } |
| |
| END_CRIT_SECTION(); |
| |
| /* release previous bucket if it is not same as write bucket */ |
| if (BufferIsValid(prevbuf) && prevblkno != writeblkno) |
| _hash_relbuf(rel, prevbuf); |
| |
| if (BufferIsValid(ovflbuf)) |
| _hash_relbuf(rel, ovflbuf); |
| |
| if (BufferIsValid(nextbuf)) |
| _hash_relbuf(rel, nextbuf); |
| |
| _hash_relbuf(rel, mapbuf); |
| _hash_relbuf(rel, metabuf); |
| |
| return nextblkno; |
| } |
| |
| |
| /* |
| * _hash_initbitmapbuffer() |
| * |
| * Initialize a new bitmap page. All bits in the new bitmap page are set to |
| * "1", indicating "in use". |
| */ |
| void |
| _hash_initbitmapbuffer(Buffer buf, uint16 bmsize, bool initpage) |
| { |
| Page pg; |
| HashPageOpaque op; |
| uint32 *freep; |
| |
| pg = BufferGetPage(buf); |
| |
| /* initialize the page */ |
| if (initpage) |
| _hash_pageinit(pg, BufferGetPageSize(buf)); |
| |
| /* initialize the page's special space */ |
| op = (HashPageOpaque) PageGetSpecialPointer(pg); |
| op->hasho_prevblkno = InvalidBlockNumber; |
| op->hasho_nextblkno = InvalidBlockNumber; |
| op->hasho_bucket = -1; |
| op->hasho_flag = LH_BITMAP_PAGE; |
| op->hasho_page_id = HASHO_PAGE_ID; |
| |
| /* set all of the bits to 1 */ |
| freep = HashPageGetBitmap(pg); |
| MemSet(freep, 0xFF, bmsize); |
| |
| /* |
| * Set pd_lower just past the end of the bitmap page data. We could even |
| * set pd_lower equal to pd_upper, but this is more precise and makes the |
| * page look compressible to xlog.c. |
| */ |
| ((PageHeader) pg)->pd_lower = ((char *) freep + bmsize) - (char *) pg; |
| } |
| |
| |
| /* |
| * _hash_squeezebucket(rel, bucket) |
| * |
| * Try to squeeze the tuples onto pages occurring earlier in the |
| * bucket chain in an attempt to free overflow pages. When we start |
| * the "squeezing", the page from which we start taking tuples (the |
| * "read" page) is the last bucket in the bucket chain and the page |
| * onto which we start squeezing tuples (the "write" page) is the |
| * first page in the bucket chain. The read page works backward and |
| * the write page works forward; the procedure terminates when the |
| * read page and write page are the same page. |
| * |
| * At completion of this procedure, it is guaranteed that all pages in |
| * the bucket are nonempty, unless the bucket is totally empty (in |
| * which case all overflow pages will be freed). The original implementation |
| * required that to be true on entry as well, but it's a lot easier for |
| * callers to leave empty overflow pages and let this guy clean it up. |
| * |
| * Caller must acquire cleanup lock on the primary page of the target |
| * bucket to exclude any scans that are in progress, which could easily |
| * be confused into returning the same tuple more than once or some tuples |
| * not at all by the rearrangement we are performing here. To prevent |
| * any concurrent scan to cross the squeeze scan we use lock chaining |
| * similar to hashbucketcleanup. Refer comments atop hashbucketcleanup. |
| * |
| * We need to retain a pin on the primary bucket to ensure that no concurrent |
| * split can start. |
| * |
| * Since this function is invoked in VACUUM, we provide an access strategy |
| * parameter that controls fetches of the bucket pages. |
| */ |
| void |
| _hash_squeezebucket(Relation rel, |
| Bucket bucket pg_attribute_unused(), |
| BlockNumber bucket_blkno, |
| Buffer bucket_buf, |
| BufferAccessStrategy bstrategy) |
| { |
| BlockNumber wblkno; |
| BlockNumber rblkno; |
| Buffer wbuf; |
| Buffer rbuf; |
| Page wpage; |
| Page rpage; |
| HashPageOpaque wopaque; |
| HashPageOpaque ropaque; |
| |
| /* |
| * start squeezing into the primary bucket page. |
| */ |
| wblkno = bucket_blkno; |
| wbuf = bucket_buf; |
| wpage = BufferGetPage(wbuf); |
| wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage); |
| |
| /* |
| * if there aren't any overflow pages, there's nothing to squeeze. caller |
| * is responsible for releasing the pin on primary bucket page. |
| */ |
| if (!BlockNumberIsValid(wopaque->hasho_nextblkno)) |
| { |
| LockBuffer(wbuf, BUFFER_LOCK_UNLOCK); |
| return; |
| } |
| |
| /* |
| * Find the last page in the bucket chain by starting at the base bucket |
| * page and working forward. Note: we assume that a hash bucket chain is |
| * usually smaller than the buffer ring being used by VACUUM, else using |
| * the access strategy here would be counterproductive. |
| */ |
| rbuf = InvalidBuffer; |
| ropaque = wopaque; |
| do |
| { |
| rblkno = ropaque->hasho_nextblkno; |
| if (rbuf != InvalidBuffer) |
| _hash_relbuf(rel, rbuf); |
| rbuf = _hash_getbuf_with_strategy(rel, |
| rblkno, |
| HASH_WRITE, |
| LH_OVERFLOW_PAGE, |
| bstrategy); |
| rpage = BufferGetPage(rbuf); |
| ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage); |
| Assert(ropaque->hasho_bucket == bucket); |
| } while (BlockNumberIsValid(ropaque->hasho_nextblkno)); |
| |
| /* |
| * squeeze the tuples. |
| */ |
| for (;;) |
| { |
| OffsetNumber roffnum; |
| OffsetNumber maxroffnum; |
| OffsetNumber deletable[MaxOffsetNumber]; |
| IndexTuple itups[MaxIndexTuplesPerPage]; |
| Size tups_size[MaxIndexTuplesPerPage]; |
| OffsetNumber itup_offsets[MaxIndexTuplesPerPage]; |
| uint16 ndeletable = 0; |
| uint16 nitups = 0; |
| Size all_tups_size = 0; |
| int i; |
| bool retain_pin = false; |
| |
| readpage: |
| /* Scan each tuple in "read" page */ |
| maxroffnum = PageGetMaxOffsetNumber(rpage); |
| for (roffnum = FirstOffsetNumber; |
| roffnum <= maxroffnum; |
| roffnum = OffsetNumberNext(roffnum)) |
| { |
| IndexTuple itup; |
| Size itemsz; |
| |
| /* skip dead tuples */ |
| if (ItemIdIsDead(PageGetItemId(rpage, roffnum))) |
| continue; |
| |
| itup = (IndexTuple) PageGetItem(rpage, |
| PageGetItemId(rpage, roffnum)); |
| itemsz = IndexTupleSize(itup); |
| itemsz = MAXALIGN(itemsz); |
| |
| /* |
| * Walk up the bucket chain, looking for a page big enough for |
| * this item and all other accumulated items. Exit if we reach |
| * the read page. |
| */ |
| while (PageGetFreeSpaceForMultipleTuples(wpage, nitups + 1) < (all_tups_size + itemsz)) |
| { |
| Buffer next_wbuf = InvalidBuffer; |
| bool tups_moved = false; |
| |
| Assert(!PageIsEmpty(wpage)); |
| |
| if (wblkno == bucket_blkno) |
| retain_pin = true; |
| |
| wblkno = wopaque->hasho_nextblkno; |
| Assert(BlockNumberIsValid(wblkno)); |
| |
| /* don't need to move to next page if we reached the read page */ |
| if (wblkno != rblkno) |
| next_wbuf = _hash_getbuf_with_strategy(rel, |
| wblkno, |
| HASH_WRITE, |
| LH_OVERFLOW_PAGE, |
| bstrategy); |
| |
| if (nitups > 0) |
| { |
| Assert(nitups == ndeletable); |
| |
| /* |
| * This operation needs to log multiple tuples, prepare |
| * WAL for that. |
| */ |
| if (RelationNeedsWAL(rel)) |
| XLogEnsureRecordSpace(0, 3 + nitups); |
| |
| START_CRIT_SECTION(); |
| |
| /* |
| * we have to insert tuples on the "write" page, being |
| * careful to preserve hashkey ordering. (If we insert |
| * many tuples into the same "write" page it would be |
| * worth qsort'ing them). |
| */ |
| _hash_pgaddmultitup(rel, wbuf, itups, itup_offsets, nitups); |
| MarkBufferDirty(wbuf); |
| |
| /* Delete tuples we already moved off read page */ |
| PageIndexMultiDelete(rpage, deletable, ndeletable); |
| MarkBufferDirty(rbuf); |
| |
| /* XLOG stuff */ |
| if (RelationNeedsWAL(rel)) |
| { |
| XLogRecPtr recptr; |
| xl_hash_move_page_contents xlrec; |
| |
| xlrec.ntups = nitups; |
| xlrec.is_prim_bucket_same_wrt = (wbuf == bucket_buf) ? true : false; |
| |
| XLogBeginInsert(); |
| XLogRegisterData((char *) &xlrec, SizeOfHashMovePageContents); |
| |
| /* |
| * bucket buffer needs to be registered to ensure that |
| * we can acquire a cleanup lock on it during replay. |
| */ |
| if (!xlrec.is_prim_bucket_same_wrt) |
| XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD | REGBUF_NO_IMAGE); |
| |
| XLogRegisterBuffer(1, wbuf, REGBUF_STANDARD); |
| XLogRegisterBufData(1, (char *) itup_offsets, |
| nitups * sizeof(OffsetNumber)); |
| for (i = 0; i < nitups; i++) |
| XLogRegisterBufData(1, (char *) itups[i], tups_size[i]); |
| |
| XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD); |
| XLogRegisterBufData(2, (char *) deletable, |
| ndeletable * sizeof(OffsetNumber)); |
| |
| recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_MOVE_PAGE_CONTENTS); |
| |
| PageSetLSN(BufferGetPage(wbuf), recptr); |
| PageSetLSN(BufferGetPage(rbuf), recptr); |
| } |
| |
| END_CRIT_SECTION(); |
| |
| tups_moved = true; |
| } |
| |
| /* |
| * release the lock on previous page after acquiring the lock |
| * on next page |
| */ |
| if (retain_pin) |
| LockBuffer(wbuf, BUFFER_LOCK_UNLOCK); |
| else |
| _hash_relbuf(rel, wbuf); |
| |
| /* nothing more to do if we reached the read page */ |
| if (rblkno == wblkno) |
| { |
| _hash_relbuf(rel, rbuf); |
| return; |
| } |
| |
| wbuf = next_wbuf; |
| wpage = BufferGetPage(wbuf); |
| wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage); |
| Assert(wopaque->hasho_bucket == bucket); |
| retain_pin = false; |
| |
| /* be tidy */ |
| for (i = 0; i < nitups; i++) |
| pfree(itups[i]); |
| nitups = 0; |
| all_tups_size = 0; |
| ndeletable = 0; |
| |
| /* |
| * after moving the tuples, rpage would have been compacted, |
| * so we need to rescan it. |
| */ |
| if (tups_moved) |
| goto readpage; |
| } |
| |
| /* remember tuple for deletion from "read" page */ |
| deletable[ndeletable++] = roffnum; |
| |
| /* |
| * we need a copy of index tuples as they can be freed as part of |
| * overflow page, however we need them to write a WAL record in |
| * _hash_freeovflpage. |
| */ |
| itups[nitups] = CopyIndexTuple(itup); |
| tups_size[nitups++] = itemsz; |
| all_tups_size += itemsz; |
| } |
| |
| /* |
| * If we reach here, there are no live tuples on the "read" page --- |
| * it was empty when we got to it, or we moved them all. So we can |
| * just free the page without bothering with deleting tuples |
| * individually. Then advance to the previous "read" page. |
| * |
| * Tricky point here: if our read and write pages are adjacent in the |
| * bucket chain, our write lock on wbuf will conflict with |
| * _hash_freeovflpage's attempt to update the sibling links of the |
| * removed page. In that case, we don't need to lock it again. |
| */ |
| rblkno = ropaque->hasho_prevblkno; |
| Assert(BlockNumberIsValid(rblkno)); |
| |
| /* free this overflow page (releases rbuf) */ |
| _hash_freeovflpage(rel, bucket_buf, rbuf, wbuf, itups, itup_offsets, |
| tups_size, nitups, bstrategy); |
| |
| /* be tidy */ |
| for (i = 0; i < nitups; i++) |
| pfree(itups[i]); |
| |
| /* are we freeing the page adjacent to wbuf? */ |
| if (rblkno == wblkno) |
| { |
| /* retain the pin on primary bucket page till end of bucket scan */ |
| if (wblkno == bucket_blkno) |
| LockBuffer(wbuf, BUFFER_LOCK_UNLOCK); |
| else |
| _hash_relbuf(rel, wbuf); |
| return; |
| } |
| |
| rbuf = _hash_getbuf_with_strategy(rel, |
| rblkno, |
| HASH_WRITE, |
| LH_OVERFLOW_PAGE, |
| bstrategy); |
| rpage = BufferGetPage(rbuf); |
| ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage); |
| Assert(ropaque->hasho_bucket == bucket); |
| } |
| |
| /* NOTREACHED */ |
| } |