| /*------------------------------------------------------------------------- |
| * |
| * hashinsert.c |
| * Item insertion in hash tables for Postgres. |
| * |
| * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group |
| * Portions Copyright (c) 1994, Regents of the University of California |
| * |
| * |
| * IDENTIFICATION |
| * src/backend/access/hash/hashinsert.c |
| * |
| *------------------------------------------------------------------------- |
| */ |
| |
| #include "postgres.h" |
| |
| #include "access/hash.h" |
| #include "access/hash_xlog.h" |
| #include "access/xloginsert.h" |
| #include "miscadmin.h" |
| #include "storage/buf_internals.h" |
| #include "storage/lwlock.h" |
| #include "storage/predicate.h" |
| #include "utils/rel.h" |
| |
| static void _hash_vacuum_one_page(Relation rel, Relation hrel, |
| Buffer metabuf, Buffer buf); |
| |
| /* |
| * _hash_doinsert() -- Handle insertion of a single index tuple. |
| * |
| * This routine is called by the public interface routines, hashbuild |
| * and hashinsert. By here, itup is completely filled in. |
| * |
| * 'sorted' must only be passed as 'true' when inserts are done in hashkey |
| * order. |
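 *
 * As a rough usage sketch (mirroring hashinsert() in hash.c, whose actual
 * code may differ in detail), a caller forms the index tuple and hands it
 * off like this:
 *
 *		itup = index_form_tuple(RelationGetDescr(rel), index_values,
 *								index_isnull);
 *		itup->t_tid = *ht_ctid;
 *		_hash_doinsert(rel, itup, heapRel, false);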
| */ |
| void |
| _hash_doinsert(Relation rel, IndexTuple itup, Relation heapRel, bool sorted) |
| { |
| Buffer buf = InvalidBuffer; |
| Buffer bucket_buf; |
| Buffer metabuf; |
| HashMetaPage metap; |
| HashMetaPage usedmetap = NULL; |
| Page metapage; |
| Page page; |
| HashPageOpaque pageopaque; |
| Size itemsz; |
| bool do_expand; |
| uint32 hashkey; |
| Bucket bucket; |
| OffsetNumber itup_off; |
| |
| /* |
| * Get the hash key for the item (it's stored in the index tuple itself). |
| */ |
| hashkey = _hash_get_indextuple_hashkey(itup); |
| |
| /* compute item size too */ |
| itemsz = IndexTupleSize(itup); |
| itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we |
| * need to be consistent */ |
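
	/*
	 * For example, on a platform with 8-byte maximum alignment a 20-byte
	 * tuple is rounded up to 24 bytes here, so the free-space checks below
	 * agree with the aligned size that PageAddItem() will actually use.
	 */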
| |
| restart_insert: |
| |
| /* |
| * Read the metapage. We don't lock it yet; HashMaxItemSize() will |
| * examine pd_pagesize_version, but that can't change so we can examine it |
| * without a lock. |
| */ |
| metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE); |
| metapage = BufferGetPage(metabuf); |
| |
| /* |
| * Check whether the item can fit on a hash page at all. (Eventually, we |
| * ought to try to apply TOAST methods if not.) Note that at this point, |
| * itemsz doesn't include the ItemId. |
| * |
| * XXX this is useless code if we are only storing hash keys. |
| */ |
| if (itemsz > HashMaxItemSize(metapage)) |
| ereport(ERROR, |
| (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), |
| errmsg("index row size %zu exceeds hash maximum %zu", |
| itemsz, HashMaxItemSize(metapage)), |
| errhint("Values larger than a buffer page cannot be indexed."))); |
| |
| /* Lock the primary bucket page for the target bucket. */ |
| buf = _hash_getbucketbuf_from_hashkey(rel, hashkey, HASH_WRITE, |
| &usedmetap); |
| Assert(usedmetap != NULL); |
| |
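	/*
	 * Check whether this insertion conflicts with predicate locks taken by
	 * concurrent serializable transactions on the target bucket page.
	 */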
| CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(buf)); |
| |
	/* remember the primary bucket buffer so we can release its pin at end */
| bucket_buf = buf; |
| |
| page = BufferGetPage(buf); |
| pageopaque = HashPageGetOpaque(page); |
| bucket = pageopaque->hasho_bucket; |
| |
| /* |
| * If this bucket is in the process of being split, try to finish the |
| * split before inserting, because that might create room for the |
| * insertion to proceed without allocating an additional overflow page. |
| * It's only interesting to finish the split if we're trying to insert |
| * into the bucket from which we're removing tuples (the "old" bucket), |
| * not if we're trying to insert into the bucket into which tuples are |
| * being moved (the "new" bucket). |
| */ |
| if (H_BUCKET_BEING_SPLIT(pageopaque) && IsBufferCleanupOK(buf)) |
| { |
		/* release the lock on the bucket buffer before completing the split */
| LockBuffer(buf, BUFFER_LOCK_UNLOCK); |
| |
| _hash_finish_split(rel, metabuf, buf, bucket, |
| usedmetap->hashm_maxbucket, |
| usedmetap->hashm_highmask, |
| usedmetap->hashm_lowmask); |
| |
		/* release the pins on the old bucket and meta buffers; retry the insert */
| _hash_dropbuf(rel, buf); |
| _hash_dropbuf(rel, metabuf); |
| goto restart_insert; |
| } |
| |
| /* Do the insertion */ |
| while (PageGetFreeSpace(page) < itemsz) |
| { |
| BlockNumber nextblkno; |
| |
| /* |
| * Check if current page has any DEAD tuples. If yes, delete these |
| * tuples and see if we can get a space for the new item to be |
| * inserted before moving to the next page in the bucket chain. |
| */ |
| if (H_HAS_DEAD_TUPLES(pageopaque)) |
| { |
| |
| if (IsBufferCleanupOK(buf)) |
| { |
| _hash_vacuum_one_page(rel, heapRel, metabuf, buf); |
| |
| if (PageGetFreeSpace(page) >= itemsz) |
| break; /* OK, now we have enough space */ |
| } |
| } |
| |
| /* |
| * no space on this page; check for an overflow page |
| */ |
| nextblkno = pageopaque->hasho_nextblkno; |
| |
| if (BlockNumberIsValid(nextblkno)) |
| { |
| /* |
| * ovfl page exists; go get it. if it doesn't have room, we'll |
| * find out next pass through the loop test above. we always |
| * release both the lock and pin if this is an overflow page, but |
| * only the lock if this is the primary bucket page, since the pin |
| * on the primary bucket must be retained throughout the scan. |
| */ |
| if (buf != bucket_buf) |
| _hash_relbuf(rel, buf); |
| else |
| LockBuffer(buf, BUFFER_LOCK_UNLOCK); |
| buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE); |
| page = BufferGetPage(buf); |
| } |
| else |
| { |
| /* |
| * we're at the end of the bucket chain and we haven't found a |
| * page with enough room. allocate a new overflow page. |
| */ |
| |
| /* release our write lock without modifying buffer */ |
| LockBuffer(buf, BUFFER_LOCK_UNLOCK); |
| |
| /* chain to a new overflow page */ |
| buf = _hash_addovflpage(rel, metabuf, buf, (buf == bucket_buf)); |
| page = BufferGetPage(buf); |
| |
| /* should fit now, given test above */ |
| Assert(PageGetFreeSpace(page) >= itemsz); |
| } |
| pageopaque = HashPageGetOpaque(page); |
| Assert((pageopaque->hasho_flag & LH_PAGE_TYPE) == LH_OVERFLOW_PAGE); |
| Assert(pageopaque->hasho_bucket == bucket); |
| } |
| |
| /* |
| * Write-lock the metapage so we can increment the tuple count. After |
| * incrementing it, check to see if it's time for a split. |
| */ |
| LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); |
| |
| /* Do the update. No ereport(ERROR) until changes are logged */ |
| START_CRIT_SECTION(); |
| |
| /* found page with enough space, so add the item here */ |
| itup_off = _hash_pgaddtup(rel, buf, itemsz, itup, sorted); |
| MarkBufferDirty(buf); |
| |
| /* metapage operations */ |
| metap = HashPageGetMeta(metapage); |
| metap->hashm_ntuples += 1; |
| |
| /* Make sure this stays in sync with _hash_expandtable() */ |
| do_expand = metap->hashm_ntuples > |
| (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1); |
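
	/*
	 * For example, with hashm_ffactor = 75 and hashm_maxbucket = 3 (i.e.
	 * four buckets), a split attempt is triggered once the tuple count
	 * first exceeds 300.  (The 75 is purely illustrative; the actual fill
	 * factor is computed at index creation time.)
	 */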
| |
| MarkBufferDirty(metabuf); |
| |
| /* XLOG stuff */ |
| if (RelationNeedsWAL(rel)) |
| { |
| xl_hash_insert xlrec; |
| XLogRecPtr recptr; |
| |
| xlrec.offnum = itup_off; |
| |
| XLogBeginInsert(); |
| XLogRegisterData((char *) &xlrec, SizeOfHashInsert); |
| |
| XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD); |
| |
| XLogRegisterBuffer(0, buf, REGBUF_STANDARD); |
| XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup)); |
| |
| recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INSERT); |
| |
| PageSetLSN(BufferGetPage(buf), recptr); |
| PageSetLSN(BufferGetPage(metabuf), recptr); |
| } |
| |
| END_CRIT_SECTION(); |
| |
| /* drop lock on metapage, but keep pin */ |
| LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); |
| |
| /* |
| * Release the modified page and ensure to release the pin on primary |
| * page. |
| */ |
| _hash_relbuf(rel, buf); |
| if (buf != bucket_buf) |
| _hash_dropbuf(rel, bucket_buf); |
| |
| /* Attempt to split if a split is needed */ |
| if (do_expand) |
| _hash_expandtable(rel, metabuf); |
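
	/*
	 * Note that _hash_expandtable() may decide not to split after all, for
	 * example if it can't get the locks it needs; that's fine, since the
	 * insertion itself is already complete.
	 */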
| |
| /* Finally drop our pin on the metapage */ |
| _hash_dropbuf(rel, metabuf); |
| } |
| |
| /* |
| * _hash_pgaddtup() -- add a tuple to a particular page in the index. |
| * |
| * This routine adds the tuple to the page as requested; it does not write out |
 * the page.  It is an error to call this function without a pin and write
 * lock on the target buffer.
| * |
 * Returns the offset number at which the tuple was inserted.  This function
 * is responsible for preserving the condition that tuples in a hash index
 * page are sorted by hashkey value; however, if the caller is certain that
 * the hashkey for the tuple being added is >= the hashkeys of all existing
 * tuples on the page, then the 'appendtup' flag may be passed as true.  This
 * saves us from having to binary-search for the correct location at which to
 * insert the tuple.
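 *
 * For example, the sorted index-build path can pass 'appendtup' as true
 * because it inserts tuples in increasing hashkey order, whereas ordinary
 * inserts pass false and incur the binary search.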
| */ |
| OffsetNumber |
| _hash_pgaddtup(Relation rel, Buffer buf, Size itemsize, IndexTuple itup, |
| bool appendtup) |
| { |
| OffsetNumber itup_off; |
| Page page; |
| |
| _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); |
| page = BufferGetPage(buf); |
| |
| /* |
| * Find where to insert the tuple (preserving page's hashkey ordering). If |
| * 'appendtup' is true then we just insert it at the end. |
| */ |
| if (appendtup) |
| { |
| itup_off = PageGetMaxOffsetNumber(page) + 1; |
| |
| #ifdef USE_ASSERT_CHECKING |
		/* ensure this tuple's hashkey is >= that of the last existing tuple */
| if (PageGetMaxOffsetNumber(page) > 0) |
| { |
| IndexTuple lasttup; |
| ItemId itemid; |
| |
| itemid = PageGetItemId(page, PageGetMaxOffsetNumber(page)); |
| lasttup = (IndexTuple) PageGetItem(page, itemid); |
| |
| Assert(_hash_get_indextuple_hashkey(lasttup) <= |
| _hash_get_indextuple_hashkey(itup)); |
| } |
| #endif |
| } |
| else |
| { |
| uint32 hashkey = _hash_get_indextuple_hashkey(itup); |
| |
| itup_off = _hash_binsearch(page, hashkey); |
| } |
| |
| if (PageAddItem(page, (Item) itup, itemsize, itup_off, false, false) |
| == InvalidOffsetNumber) |
| elog(ERROR, "failed to add index item to \"%s\"", |
| RelationGetRelationName(rel)); |
| |
| return itup_off; |
| } |
| |
| /* |
| * _hash_pgaddmultitup() -- add a tuple vector to a particular page in the |
| * index. |
| * |
 * This routine has the same requirements for locking and tuple ordering as
 * _hash_pgaddtup().
 *
 * The offset numbers at which the tuples were inserted are returned in the
 * caller-supplied 'itup_offsets' array, which must have room for 'nitups'
 * entries.
| */ |
| void |
| _hash_pgaddmultitup(Relation rel, Buffer buf, IndexTuple *itups, |
| OffsetNumber *itup_offsets, uint16 nitups) |
| { |
| OffsetNumber itup_off; |
| Page page; |
| uint32 hashkey; |
| int i; |
| |
| _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); |
| page = BufferGetPage(buf); |
| |
| for (i = 0; i < nitups; i++) |
| { |
| Size itemsize; |
| |
| itemsize = IndexTupleSize(itups[i]); |
| itemsize = MAXALIGN(itemsize); |
| |
		/* Find the insert position (preserving the page's hashkey ordering) */
| hashkey = _hash_get_indextuple_hashkey(itups[i]); |
| itup_off = _hash_binsearch(page, hashkey); |
| |
| itup_offsets[i] = itup_off; |
| |
| if (PageAddItem(page, (Item) itups[i], itemsize, itup_off, false, false) |
| == InvalidOffsetNumber) |
| elog(ERROR, "failed to add index item to \"%s\"", |
| RelationGetRelationName(rel)); |
| } |
| } |
| |
| /* |
| * _hash_vacuum_one_page - vacuum just one index page. |
| * |
 * Try to remove LP_DEAD items from the given page.  The caller must hold a
 * cleanup lock on the page being modified before calling this function.
| */ |
| |
| static void |
| _hash_vacuum_one_page(Relation rel, Relation hrel, Buffer metabuf, Buffer buf) |
| { |
| OffsetNumber deletable[MaxOffsetNumber]; |
| int ndeletable = 0; |
| OffsetNumber offnum, |
| maxoff; |
| Page page = BufferGetPage(buf); |
| HashPageOpaque pageopaque; |
| HashMetaPage metap; |
| |
	/* Scan each tuple on the page to see if it is marked LP_DEAD */
| maxoff = PageGetMaxOffsetNumber(page); |
| for (offnum = FirstOffsetNumber; |
| offnum <= maxoff; |
| offnum = OffsetNumberNext(offnum)) |
| { |
| ItemId itemId = PageGetItemId(page, offnum); |
| |
| if (ItemIdIsDead(itemId)) |
| deletable[ndeletable++] = offnum; |
| } |
| |
| if (ndeletable > 0) |
| { |
| TransactionId snapshotConflictHorizon; |
| |
| snapshotConflictHorizon = |
| index_compute_xid_horizon_for_tuples(rel, hrel, buf, |
| deletable, ndeletable); |
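
		/*
		 * The snapshotConflictHorizon computed above is carried in the WAL
		 * record so that recovery on a hot standby can cancel any queries
		 * whose snapshots might still need the deleted tuples.
		 */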
| |
| /* |
| * Write-lock the meta page so that we can decrement tuple count. |
| */ |
| LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); |
| |
| /* No ereport(ERROR) until changes are logged */ |
| START_CRIT_SECTION(); |
| |
| PageIndexMultiDelete(page, deletable, ndeletable); |
| |
| /* |
| * Mark the page as not containing any LP_DEAD items. This is not |
| * certainly true (there might be some that have recently been marked, |
| * but weren't included in our target-item list), but it will almost |
| * always be true and it doesn't seem worth an additional page scan to |
| * check it. Remember that LH_PAGE_HAS_DEAD_TUPLES is only a hint |
| * anyway. |
| */ |
| pageopaque = HashPageGetOpaque(page); |
| pageopaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES; |
| |
| metap = HashPageGetMeta(BufferGetPage(metabuf)); |
| metap->hashm_ntuples -= ndeletable; |
| |
| MarkBufferDirty(buf); |
| MarkBufferDirty(metabuf); |
| |
| /* XLOG stuff */ |
| if (RelationNeedsWAL(rel)) |
| { |
| xl_hash_vacuum_one_page xlrec; |
| XLogRecPtr recptr; |
| |
| xlrec.isCatalogRel = RelationIsAccessibleInLogicalDecoding(hrel); |
| xlrec.snapshotConflictHorizon = snapshotConflictHorizon; |
| xlrec.ntuples = ndeletable; |
| |
| XLogBeginInsert(); |
| XLogRegisterBuffer(0, buf, REGBUF_STANDARD); |
| XLogRegisterData((char *) &xlrec, SizeOfHashVacuumOnePage); |
| |
| /* |
| * We need the target-offsets array whether or not we store the |
| * whole buffer, to allow us to find the snapshotConflictHorizon |
| * on a standby server. |
| */ |
| XLogRegisterData((char *) deletable, |
| ndeletable * sizeof(OffsetNumber)); |
| |
| XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD); |
| |
| recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_VACUUM_ONE_PAGE); |
| |
| PageSetLSN(BufferGetPage(buf), recptr); |
| PageSetLSN(BufferGetPage(metabuf), recptr); |
| } |
| |
| END_CRIT_SECTION(); |
| |
| /* |
| * Releasing write lock on meta page as we have updated the tuple |
| * count. |
| */ |
| LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); |
| } |
| } |