| /*------------------------------------------------------------------------- |
| * |
| * hash.h |
| * header file for postgres hash access method implementation |
| * |
| * |
| * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group |
| * Portions Copyright (c) 1994, Regents of the University of California |
| * |
| * src/include/access/hash.h |
| * |
| * NOTES |
| * modeled after Margo Seltzer's hash implementation for unix. |
| * |
| *------------------------------------------------------------------------- |
| */ |
| #ifndef HASH_H |
| #define HASH_H |
| |
| #include "access/amapi.h" |
| #include "access/itup.h" |
| #include "access/sdir.h" |
| #include "catalog/pg_am_d.h" |
| #include "common/hashfn.h" |
| #include "lib/stringinfo.h" |
| #include "storage/bufmgr.h" |
| #include "storage/lockdefs.h" |
| #include "utils/hsearch.h" |
| #include "utils/relcache.h" |
| |
| /* |
| * Mapping from hash bucket number to physical block number of bucket's |
| * starting page. Beware of multiple evaluations of argument! |
| */ |
| typedef uint32 Bucket; |
| |
| #define InvalidBucket ((Bucket) 0xFFFFFFFF) |
| |
| #define BUCKET_TO_BLKNO(metap,B) \ |
| ((BlockNumber) ((B) + ((B) ? (metap)->hashm_spares[_hash_spareindex((B)+1)-1] : 0)) + 1) |
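| |
| /* |
|  * Illustrative example (not part of the original comment): because the |
|  * metapage occupies block 0, bucket 0 always maps to block 1, and in an |
|  * index that has never allocated an overflow page bucket B maps simply to |
|  * block B + 1, since every hashm_spares[] entry is still zero.  Note also |
|  * that B is evaluated three times above, so do not pass an expression with |
|  * side effects such as BUCKET_TO_BLKNO(metap, bucket++). |
|  */ |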
| |
| /* |
| * Special space for hash index pages. |
| * |
| * hasho_flag's LH_PAGE_TYPE bits tell us which type of page we're looking at. |
| * Additional bits in the flag word are used for more transient purposes. |
| * |
| * To test a page's type, do (hasho_flag & LH_PAGE_TYPE) == LH_xxx_PAGE. |
| * However, we ensure that each used page type has a distinct bit so that |
| * we can OR together page types for uses such as the allowable-page-types |
| * argument of _hash_checkpage(). |
| */ |
| #define LH_UNUSED_PAGE (0) |
| #define LH_OVERFLOW_PAGE (1 << 0) |
| #define LH_BUCKET_PAGE (1 << 1) |
| #define LH_BITMAP_PAGE (1 << 2) |
| #define LH_META_PAGE (1 << 3) |
| #define LH_BUCKET_BEING_POPULATED (1 << 4) |
| #define LH_BUCKET_BEING_SPLIT (1 << 5) |
| #define LH_BUCKET_NEEDS_SPLIT_CLEANUP (1 << 6) |
| #define LH_PAGE_HAS_DEAD_TUPLES (1 << 7) |
| |
| #define LH_PAGE_TYPE \ |
| (LH_OVERFLOW_PAGE | LH_BUCKET_PAGE | LH_BITMAP_PAGE | LH_META_PAGE) |
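| |
| /* |
|  * A sketch of both conventions described above (illustrative only), given a |
|  * page's opaque data pointer (see HashPageOpaqueData below): |
|  * |
|  *		if ((opaque->hasho_flag & LH_PAGE_TYPE) == LH_OVERFLOW_PAGE) |
|  *			... the page is an overflow page ... |
|  * |
|  * and, because each used page type occupies a distinct bit, a caller can |
|  * accept several page types at once, e.g. |
|  * |
|  *		_hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); |
|  */ |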
| |
| /* |
| * In an overflow page, hasho_prevblkno stores the block number of the previous |
| * page in the bucket chain; in a bucket page, hasho_prevblkno stores the |
| * hashm_maxbucket value as of the last time the bucket was split, or else as |
| * of the time the bucket was created.  The latter convention is used |
| * to determine whether a cached copy of the metapage is too stale to be used |
| * without needing to lock or pin the metapage. |
| * |
| * hasho_nextblkno is always the block number of the next page in the |
| * bucket chain, or InvalidBlockNumber if there are no more such pages. |
| */ |
| typedef struct HashPageOpaqueData |
| { |
| BlockNumber hasho_prevblkno; /* see above */ |
| BlockNumber hasho_nextblkno; /* see above */ |
| Bucket hasho_bucket; /* bucket number this pg belongs to */ |
| uint16 hasho_flag; /* page type code + flag bits, see above */ |
| uint16 hasho_page_id; /* for identification of hash indexes */ |
| } HashPageOpaqueData; |
| |
| typedef HashPageOpaqueData *HashPageOpaque; |
| |
| #define HashPageGetOpaque(page) ((HashPageOpaque) PageGetSpecialPointer(page)) |
| |
| #define H_NEEDS_SPLIT_CLEANUP(opaque) (((opaque)->hasho_flag & LH_BUCKET_NEEDS_SPLIT_CLEANUP) != 0) |
| #define H_BUCKET_BEING_SPLIT(opaque) (((opaque)->hasho_flag & LH_BUCKET_BEING_SPLIT) != 0) |
| #define H_BUCKET_BEING_POPULATED(opaque) (((opaque)->hasho_flag & LH_BUCKET_BEING_POPULATED) != 0) |
| #define H_HAS_DEAD_TUPLES(opaque) (((opaque)->hasho_flag & LH_PAGE_HAS_DEAD_TUPLES) != 0) |
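| |
| /* |
|  * Minimal sketch of how these fields are typically consumed (illustrative |
|  * only): fetch the special space with HashPageGetOpaque() and follow |
|  * hasho_nextblkno until it is InvalidBlockNumber, e.g. |
|  * |
|  *		HashPageOpaque opaque = HashPageGetOpaque(page); |
|  * |
|  *		while (BlockNumberIsValid(opaque->hasho_nextblkno)) |
|  *		{ |
|  *			... fetch the page at opaque->hasho_nextblkno, release the |
|  *			previous one, and repoint "opaque" at the new page ... |
|  *		} |
|  */ |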
| |
| /* |
| * The page ID is for the convenience of pg_filedump and similar utilities, |
| * which otherwise would have a hard time telling pages of different index |
| * types apart. It should be the last 2 bytes on the page. This is more or |
| * less "free" due to alignment considerations. |
| */ |
| #define HASHO_PAGE_ID 0xFF80 |
| |
| typedef struct HashScanPosItem /* what we remember about each match */ |
| { |
| ItemPointerData heapTid; /* TID of referenced heap item */ |
| OffsetNumber indexOffset; /* index item's location within page */ |
| } HashScanPosItem; |
| |
| typedef struct HashScanPosData |
| { |
| Buffer buf; /* if valid, the buffer is pinned */ |
| BlockNumber currPage; /* current hash index page */ |
| BlockNumber nextPage; /* next overflow page */ |
| BlockNumber prevPage; /* prev overflow or bucket page */ |
| |
| /* |
| * The items array is always ordered in index order (ie, increasing |
| * indexoffset). When scanning backwards it is convenient to fill the |
| * array back-to-front, so we start at the last slot and fill downwards. |
| * Hence we need both a first-valid-entry and a last-valid-entry counter. |
| * itemIndex is a cursor showing which entry was last returned to caller. |
| */ |
| int firstItem; /* first valid index in items[] */ |
| int lastItem; /* last valid index in items[] */ |
| int itemIndex; /* current index in items[] */ |
| |
| HashScanPosItem items[MaxIndexTuplesPerPage]; /* MUST BE LAST */ |
| } HashScanPosData; |
| |
| #define HashScanPosIsPinned(scanpos) \ |
| ( \ |
| AssertMacro(BlockNumberIsValid((scanpos).currPage) || \ |
| !BufferIsValid((scanpos).buf)), \ |
| BufferIsValid((scanpos).buf) \ |
| ) |
| |
| #define HashScanPosIsValid(scanpos) \ |
| ( \ |
| AssertMacro(BlockNumberIsValid((scanpos).currPage) || \ |
| !BufferIsValid((scanpos).buf)), \ |
| BlockNumberIsValid((scanpos).currPage) \ |
| ) |
| |
| #define HashScanPosInvalidate(scanpos) \ |
| do { \ |
| (scanpos).buf = InvalidBuffer; \ |
| (scanpos).currPage = InvalidBlockNumber; \ |
| (scanpos).nextPage = InvalidBlockNumber; \ |
| (scanpos).prevPage = InvalidBlockNumber; \ |
| (scanpos).firstItem = 0; \ |
| (scanpos).lastItem = 0; \ |
| (scanpos).itemIndex = 0; \ |
| } while (0) |
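| |
| /* |
|  * Typical usage, sketched for illustration (assumes "so" is the scan's |
|  * HashScanOpaque, defined below): the position is reset with |
|  * HashScanPosInvalidate() when a scan begins or is rescanned, and later code |
|  * checks HashScanPosIsValid() before trying to continue from it, e.g. |
|  * |
|  *		HashScanPosInvalidate(so->currPos); |
|  *		... |
|  *		if (!HashScanPosIsValid(so->currPos)) |
|  *			... nothing to resume; start over with _hash_first() ... |
|  */ |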
| |
| /* |
| * HashScanOpaqueData is private state for a hash index scan. |
| */ |
| typedef struct HashScanOpaqueData |
| { |
| /* Hash value of the scan key, ie, the hash key we seek */ |
| uint32 hashso_sk_hash; |
| |
| /* remember the buffer associated with primary bucket */ |
| Buffer hashso_bucket_buf; |
| |
| /* |
| * remember the buffer associated with the primary bucket page of the bucket |
| * being split.  It is required while scanning the bucket that is being |
| * populated during a split operation. |
| */ |
| Buffer hashso_split_bucket_buf; |
| |
| /* Whether scan starts on bucket being populated due to split */ |
| bool hashso_buc_populated; |
| |
| /* |
| * Whether we are scanning the bucket being split.  This field is consulted |
| * only when hashso_buc_populated is true. |
| */ |
| bool hashso_buc_split; |
| /* info about killed items if any (killedItems is NULL if never used) */ |
| int *killedItems; /* currPos.items indexes of killed items */ |
| int numKilled; /* number of currently stored items */ |
| |
| /* |
| * Identify all the matching items on a page and save them in |
| * HashScanPosData |
| */ |
| HashScanPosData currPos; /* current position data */ |
| } HashScanOpaqueData; |
| |
| typedef HashScanOpaqueData *HashScanOpaque; |
| |
| /* |
| * Definitions for metapage. |
| */ |
| |
| #define HASH_METAPAGE 0 /* metapage is always block 0 */ |
| |
| #define HASH_MAGIC 0x6440640 |
| #define HASH_VERSION 4 |
| |
| /* |
| * spares[] holds the number of overflow pages currently allocated at or |
| * before a certain splitpoint. For example, if spares[3] = 7 then there are |
| * 7 ovflpages before splitpoint 3 (compare BUCKET_TO_BLKNO macro). The |
| * value in spares[ovflpoint] increases as overflow pages are added at the |
| * end of the index. Once ovflpoint increases (ie, we have actually allocated |
| * the bucket pages belonging to that splitpoint) the number of spares at the |
| * prior splitpoint cannot change anymore. |
| * |
| * ovflpages that have been recycled for reuse can be found by looking at |
| * bitmaps that are stored within ovflpages dedicated for the purpose. |
| * The blknos of these bitmap pages are kept in hashm_mapp[]; hashm_nmaps is the |
| * number of currently existing bitmaps. |
| * |
| * The limitation on the size of spares[] comes from the fact that there's |
| * no point in having more than 2^32 buckets with only uint32 hashcodes. |
| * (Note: HASH_MAX_SPLITPOINTS, the size of spares[], is sized to accommodate |
| * the multi-phased allocation of buckets that begins after |
| * HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE groups.) |
| * |
| * There is no particular upper limit on the size of hashm_mapp[], other than |
| * needing to fit into the metapage.  (With an 8K block size, 1024 bitmaps |
| * limit us to 256 GB of overflow space...)  For smaller block sizes we cannot |
| * use 1024 bitmaps, as that would make the metapage data cross the block-size |
| * boundary, so BLCKSZ is used to cap the number of bitmaps. |
| */ |
| #define HASH_MAX_BITMAPS Min(BLCKSZ / 8, 1024) |
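| |
| /* |
|  * Worked example (illustrative): with the default 8 kB BLCKSZ this is |
|  * Min(8192 / 8, 1024) = 1024 bitmap pages, while a 4 kB BLCKSZ allows only |
|  * 512, keeping hashm_mapp[] small enough to fit in the metapage. |
|  */ |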
| |
| #define HASH_SPLITPOINT_PHASE_BITS 2 |
| #define HASH_SPLITPOINT_PHASES_PER_GRP (1 << HASH_SPLITPOINT_PHASE_BITS) |
| #define HASH_SPLITPOINT_PHASE_MASK (HASH_SPLITPOINT_PHASES_PER_GRP - 1) |
| #define HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE 10 |
| |
| /* defines max number of splitpoint phases a hash index can have */ |
| #define HASH_MAX_SPLITPOINT_GROUP 32 |
| #define HASH_MAX_SPLITPOINTS \ |
| (((HASH_MAX_SPLITPOINT_GROUP - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) * \ |
| HASH_SPLITPOINT_PHASES_PER_GRP) + \ |
| HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) |
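| |
| /* |
|  * Worked example (illustrative): with the values above, |
|  * HASH_SPLITPOINT_PHASES_PER_GRP is 1 << 2 = 4, so HASH_MAX_SPLITPOINTS is |
|  * (32 - 10) * 4 + 10 = 98, the number of entries in hashm_spares[]. |
|  */ |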
| |
| typedef struct HashMetaPageData |
| { |
| uint32 hashm_magic; /* magic no. for hash tables */ |
| uint32 hashm_version; /* version ID */ |
| double hashm_ntuples; /* number of tuples stored in the table */ |
| uint16 hashm_ffactor; /* target fill factor (tuples/bucket) */ |
| uint16 hashm_bsize; /* index page size (bytes) */ |
| uint16 hashm_bmsize; /* bitmap array size (bytes) - must be a power |
| * of 2 */ |
| uint16 hashm_bmshift; /* log2(bitmap array size in BITS) */ |
| uint32 hashm_maxbucket; /* ID of maximum bucket in use */ |
| uint32 hashm_highmask; /* mask to modulo into entire table */ |
| uint32 hashm_lowmask; /* mask to modulo into lower half of table */ |
| uint32 hashm_ovflpoint; /* splitpoint from which ovflpage being |
| * allocated */ |
| uint32 hashm_firstfree; /* lowest-number free ovflpage (bit#) */ |
| uint32 hashm_nmaps; /* number of bitmap pages */ |
| RegProcedure hashm_procid; /* hash function id from pg_proc */ |
| uint32 hashm_spares[HASH_MAX_SPLITPOINTS]; /* spare pages before each |
| * splitpoint */ |
| BlockNumber hashm_mapp[HASH_MAX_BITMAPS]; /* blknos of ovfl bitmaps */ |
| } HashMetaPageData; |
| |
| typedef HashMetaPageData *HashMetaPage; |
| |
| typedef struct HashOptions |
| { |
| int32 varlena_header_; /* varlena header (do not touch directly!) */ |
| int fillfactor; /* page fill factor in percent (0..100) */ |
| } HashOptions; |
| |
| #define HashGetFillFactor(relation) \ |
| (AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \ |
| relation->rd_rel->relam == HASH_AM_OID), \ |
| (relation)->rd_options ? \ |
| ((HashOptions *) (relation)->rd_options)->fillfactor : \ |
| HASH_DEFAULT_FILLFACTOR) |
| #define HashGetTargetPageUsage(relation) \ |
| (BLCKSZ * HashGetFillFactor(relation) / 100) |
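| |
| /* |
|  * Worked example (illustrative): with an 8 kB BLCKSZ and the default |
|  * fillfactor of 75, HashGetTargetPageUsage() evaluates to |
|  * 8192 * 75 / 100 = 6144 bytes per bucket page. |
|  */ |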
| |
| /* |
| * Maximum size of a hash index item (it's okay to have only one per page) |
| */ |
| #define HashMaxItemSize(page) \ |
| MAXALIGN_DOWN(PageGetPageSize(page) - \ |
| SizeOfPageHeaderData - \ |
| sizeof(ItemIdData) - \ |
| MAXALIGN(sizeof(HashPageOpaqueData))) |
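| |
| /* |
|  * Worked example (illustrative, assuming an 8 kB page and 8-byte MAXALIGN): |
|  * 8192 - 24 (page header) - 4 (line pointer) - 16 (special space) = 8148, |
|  * which MAXALIGN_DOWN rounds to 8144 bytes for the largest index tuple. |
|  */ |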
| |
| #define INDEX_MOVED_BY_SPLIT_MASK INDEX_AM_RESERVED_BIT |
| |
| #define HASH_MIN_FILLFACTOR 10 |
| #define HASH_DEFAULT_FILLFACTOR 75 |
| |
| /* |
| * Constants |
| */ |
| #define BYTE_TO_BIT 3 /* 2^3 bits/byte */ |
| #define ALL_SET ((uint32) ~0) |
| |
| /* |
| * Bitmap pages do not contain tuples. They do contain the standard |
| * page headers and trailers; however, everything in between is a |
| * giant bit array. The number of bits that fit on a page obviously |
| * depends on the page size and the header/trailer overhead. We require |
| * the number of bits per page to be a power of 2. |
| */ |
| #define BMPGSZ_BYTE(metap) ((metap)->hashm_bmsize) |
| #define BMPGSZ_BIT(metap) ((metap)->hashm_bmsize << BYTE_TO_BIT) |
| #define BMPG_SHIFT(metap) ((metap)->hashm_bmshift) |
| #define BMPG_MASK(metap) (BMPGSZ_BIT(metap) - 1) |
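| |
| /* |
|  * Worked example (illustrative): with the default 8 kB BLCKSZ, hashm_bmsize |
|  * comes out to 4096 bytes, so a bitmap page holds 4096 << 3 = 32768 bits, |
|  * BMPG_SHIFT is 15, and BMPG_MASK is 0x7FFF.  An overflow page's bit number |
|  * "bitno" is then found on bitmap page (bitno >> BMPG_SHIFT(metap)), at bit |
|  * (bitno & BMPG_MASK(metap)) within that page. |
|  */ |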
| |
| #define HashPageGetBitmap(page) \ |
| ((uint32 *) PageGetContents(page)) |
| |
| #define HashGetMaxBitmapSize(page) \ |
| (PageGetPageSize((Page) page) - \ |
| (MAXALIGN(SizeOfPageHeaderData) + MAXALIGN(sizeof(HashPageOpaqueData)))) |
| |
| #define HashPageGetMeta(page) \ |
| ((HashMetaPage) PageGetContents(page)) |
| |
| /* |
| * The number of bits in an ovflpage bitmap word. |
| */ |
| #define BITS_PER_MAP 32 /* Number of bits in uint32 */ |
| |
| /* Given the address of the beginning of a bit map, clear/set the nth bit */ |
| #define CLRBIT(A, N) ((A)[(N)/BITS_PER_MAP] &= ~(1<<((N)%BITS_PER_MAP))) |
| #define SETBIT(A, N) ((A)[(N)/BITS_PER_MAP] |= (1<<((N)%BITS_PER_MAP))) |
| #define ISSET(A, N) ((A)[(N)/BITS_PER_MAP] & (1<<((N)%BITS_PER_MAP))) |
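| |
| /* |
|  * Example (illustrative): SETBIT(map, 37) sets bit 37 % 32 = 5 within word |
|  * map[37 / 32] = map[1]; CLRBIT and ISSET address the same word and bit. |
|  */ |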
| |
| /* |
| * page-level and high-level locking modes (see README) |
| */ |
| #define HASH_READ BUFFER_LOCK_SHARE |
| #define HASH_WRITE BUFFER_LOCK_EXCLUSIVE |
| #define HASH_NOLOCK (-1) |
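| |
| /* |
|  * Illustrative sketch (not part of the original comments): these constants |
|  * are passed as the "access" argument of the buffer-fetching routines |
|  * declared below, e.g. |
|  * |
|  *		buf = _hash_getbuf(rel, blkno, HASH_READ, |
|  *						   LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); |
|  *		... inspect the page ... |
|  *		_hash_relbuf(rel, buf); |
|  * |
|  * while HASH_NOLOCK requests a pin without taking any content lock. |
|  */ |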
| |
| /* |
| * When a new operator class is declared, we require that the user supply |
| * us with an amproc function for hashing a key of the new type, returning |
| * a 32-bit hash value. We call this the "standard" hash function. We |
| * also allow an optional "extended" hash function which accepts a salt and |
| * returns a 64-bit hash value. This is highly recommended but, for reasons |
| * of backward compatibility, optional. |
| * |
| * When the salt is 0, the low 32 bits of the value returned by the extended |
| * hash function should match the value that would have been returned by the |
| * standard hash function. |
| */ |
| #define HASHSTANDARD_PROC 1 |
| #define HASHEXTENDED_PROC 2 |
| #define HASHOPTIONS_PROC 3 |
| #define HASHNProcs 3 |
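| |
| /* |
|  * Hypothetical sketch of the consistency rule above (the names h32 and h64 |
|  * are invented for illustration): if h32 is an opclass's standard support |
|  * function and h64 its extended one, then for every value v |
|  * |
|  *		(uint32) h64(v, 0) == h32(v) |
|  * |
|  * must hold, keeping the two functions interchangeable when no salt is used. |
|  */ |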
| |
| |
| /* public routines */ |
| |
| extern IndexBuildResult *hashbuild(Relation heap, Relation index, |
| struct IndexInfo *indexInfo); |
| extern void hashbuildempty(Relation index); |
| extern bool hashinsert(Relation rel, Datum *values, bool *isnull, |
| ItemPointer ht_ctid, Relation heapRel, |
| IndexUniqueCheck checkUnique, |
| bool indexUnchanged, |
| struct IndexInfo *indexInfo); |
| extern bool hashgettuple(IndexScanDesc scan, ScanDirection dir); |
| extern int64 hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm); |
| extern IndexScanDesc hashbeginscan(Relation rel, int nkeys, int norderbys); |
| extern void hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, |
| ScanKey orderbys, int norderbys); |
| extern void hashendscan(IndexScanDesc scan); |
| extern IndexBulkDeleteResult *hashbulkdelete(IndexVacuumInfo *info, |
| IndexBulkDeleteResult *stats, |
| IndexBulkDeleteCallback callback, |
| void *callback_state); |
| extern IndexBulkDeleteResult *hashvacuumcleanup(IndexVacuumInfo *info, |
| IndexBulkDeleteResult *stats); |
| extern bytea *hashoptions(Datum reloptions, bool validate); |
| extern bool hashvalidate(Oid opclassoid); |
| extern void hashadjustmembers(Oid opfamilyoid, |
| Oid opclassoid, |
| List *operators, |
| List *functions); |
| |
| /* private routines */ |
| |
| /* hashinsert.c */ |
| extern void _hash_doinsert(Relation rel, IndexTuple itup, Relation heapRel, |
| bool sorted); |
| extern OffsetNumber _hash_pgaddtup(Relation rel, Buffer buf, |
| Size itemsize, IndexTuple itup, |
| bool appendtup); |
| extern void _hash_pgaddmultitup(Relation rel, Buffer buf, IndexTuple *itups, |
| OffsetNumber *itup_offsets, uint16 nitups); |
| |
| /* hashovfl.c */ |
| extern Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin); |
| extern BlockNumber _hash_freeovflpage(Relation rel, Buffer bucketbuf, Buffer ovflbuf, |
| Buffer wbuf, IndexTuple *itups, OffsetNumber *itup_offsets, |
| Size *tups_size, uint16 nitups, BufferAccessStrategy bstrategy); |
| extern void _hash_initbitmapbuffer(Buffer buf, uint16 bmsize, bool initpage); |
| extern void _hash_squeezebucket(Relation rel, |
| Bucket bucket, BlockNumber bucket_blkno, |
| Buffer bucket_buf, |
| BufferAccessStrategy bstrategy); |
| extern uint32 _hash_ovflblkno_to_bitno(HashMetaPage metap, BlockNumber ovflblkno); |
| |
| /* hashpage.c */ |
| extern Buffer _hash_getbuf(Relation rel, BlockNumber blkno, |
| int access, int flags); |
| extern Buffer _hash_getbuf_with_condlock_cleanup(Relation rel, |
| BlockNumber blkno, int flags); |
| extern HashMetaPage _hash_getcachedmetap(Relation rel, Buffer *metabuf, |
| bool force_refresh); |
| extern Buffer _hash_getbucketbuf_from_hashkey(Relation rel, uint32 hashkey, |
| int access, |
| HashMetaPage *cachedmetap); |
| extern Buffer _hash_getinitbuf(Relation rel, BlockNumber blkno); |
| extern void _hash_initbuf(Buffer buf, uint32 max_bucket, uint32 num_bucket, |
| uint32 flag, bool initpage); |
| extern Buffer _hash_getnewbuf(Relation rel, BlockNumber blkno, |
| ForkNumber forkNum); |
| extern Buffer _hash_getbuf_with_strategy(Relation rel, BlockNumber blkno, |
| int access, int flags, |
| BufferAccessStrategy bstrategy); |
| extern void _hash_relbuf(Relation rel, Buffer buf); |
| extern void _hash_dropbuf(Relation rel, Buffer buf); |
| extern void _hash_dropscanbuf(Relation rel, HashScanOpaque so); |
| extern uint32 _hash_init(Relation rel, double num_tuples, |
| ForkNumber forkNum); |
| extern void _hash_init_metabuffer(Buffer buf, double num_tuples, |
| RegProcedure procid, uint16 ffactor, bool initpage); |
| extern void _hash_pageinit(Page page, Size size); |
| extern void _hash_expandtable(Relation rel, Buffer metabuf); |
| extern void _hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf, |
| Bucket obucket, uint32 maxbucket, uint32 highmask, |
| uint32 lowmask); |
| |
| /* hashsearch.c */ |
| extern bool _hash_next(IndexScanDesc scan, ScanDirection dir); |
| extern bool _hash_first(IndexScanDesc scan, ScanDirection dir); |
| |
| /* hashsort.c */ |
| typedef struct HSpool HSpool; /* opaque struct in hashsort.c */ |
| |
| extern HSpool *_h_spoolinit(Relation heap, Relation index, uint32 num_buckets); |
| extern void _h_spooldestroy(HSpool *hspool); |
| extern void _h_spool(HSpool *hspool, ItemPointer self, |
| Datum *values, bool *isnull); |
| extern void _h_indexbuild(HSpool *hspool, Relation heapRel); |
| |
| /* hashutil.c */ |
| extern bool _hash_checkqual(IndexScanDesc scan, IndexTuple itup); |
| extern uint32 _hash_datum2hashkey(Relation rel, Datum key); |
| extern uint32 _hash_datum2hashkey_type(Relation rel, Datum key, Oid keytype); |
| extern Bucket _hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket, |
| uint32 highmask, uint32 lowmask); |
| extern uint32 _hash_spareindex(uint32 num_bucket); |
| extern uint32 _hash_get_totalbuckets(uint32 splitpoint_phase); |
| extern void _hash_checkpage(Relation rel, Buffer buf, int flags); |
| extern uint32 _hash_get_indextuple_hashkey(IndexTuple itup); |
| extern bool _hash_convert_tuple(Relation index, |
| Datum *user_values, bool *user_isnull, |
| Datum *index_values, bool *index_isnull); |
| extern OffsetNumber _hash_binsearch(Page page, uint32 hash_value); |
| extern OffsetNumber _hash_binsearch_last(Page page, uint32 hash_value); |
| extern BlockNumber _hash_get_oldblock_from_newbucket(Relation rel, Bucket new_bucket); |
| extern BlockNumber _hash_get_newblock_from_oldbucket(Relation rel, Bucket old_bucket); |
| extern Bucket _hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket, |
| uint32 lowmask, uint32 maxbucket); |
| extern void _hash_kill_items(IndexScanDesc scan); |
| |
| /* hash.c */ |
| extern void hashbucketcleanup(Relation rel, Bucket cur_bucket, |
| Buffer bucket_buf, BlockNumber bucket_blkno, |
| BufferAccessStrategy bstrategy, |
| uint32 maxbucket, uint32 highmask, uint32 lowmask, |
| double *tuples_removed, double *num_index_tuples, |
| bool split_cleanup, |
| IndexBulkDeleteCallback callback, void *callback_state); |
| |
| #endif /* HASH_H */ |