| /*------------------------------------------------------------------------- |
| * |
| * hashutil.c |
| * Utility code for Postgres hash implementation. |
| * |
| * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group |
| * Portions Copyright (c) 1994, Regents of the University of California |
| * |
| * |
| * IDENTIFICATION |
| * src/backend/access/hash/hashutil.c |
| * |
| *------------------------------------------------------------------------- |
| */ |
| #include "postgres.h" |
| |
| #include "access/hash.h" |
| #include "access/reloptions.h" |
| #include "access/relscan.h" |
| #include "port/pg_bitutils.h" |
| #include "storage/buf_internals.h" |
| #include "utils/lsyscache.h" |
| #include "utils/rel.h" |
| |
| #define CALC_NEW_BUCKET(old_bucket, lowmask) \ |
| old_bucket | (lowmask + 1) |
| |
| /* |
| * _hash_checkqual -- does the index tuple satisfy the scan conditions? |
| */ |
| bool |
| _hash_checkqual(IndexScanDesc scan, IndexTuple itup) |
| { |
| /* |
| * Currently, we can't check any of the scan conditions since we do not |
| * have the original index entry value to supply to the sk_func. Always |
| * return true; we expect that hashgettuple already set the recheck flag |
| * to make the main indexscan code do it. |
| */ |
| #ifdef NOT_USED |
| TupleDesc tupdesc = RelationGetDescr(scan->indexRelation); |
| ScanKey key = scan->keyData; |
| int scanKeySize = scan->numberOfKeys; |
| |
| while (scanKeySize > 0) |
| { |
| Datum datum; |
| bool isNull; |
| Datum test; |
| |
| datum = index_getattr(itup, |
| key->sk_attno, |
| tupdesc, |
| &isNull); |
| |
| /* assume sk_func is strict */ |
| if (isNull) |
| return false; |
| if (key->sk_flags & SK_ISNULL) |
| return false; |
| |
| test = FunctionCall2Coll(&key->sk_func, key->sk_collation, |
| datum, key->sk_argument); |
| |
| if (!DatumGetBool(test)) |
| return false; |
| |
| key++; |
| scanKeySize--; |
| } |
| #endif |
| |
| return true; |
| } |
| |
| /* |
| * _hash_datum2hashkey -- given a Datum, call the index's hash function |
| * |
| * The Datum is assumed to be of the index's column type, so we can use the |
| * "primary" hash function that's tracked for us by the generic index code. |
| */ |
| uint32 |
| _hash_datum2hashkey(Relation rel, Datum key) |
| { |
| FmgrInfo *procinfo; |
| Oid collation; |
| |
| /* XXX assumes index has only one attribute */ |
| procinfo = index_getprocinfo(rel, 1, HASHSTANDARD_PROC); |
| collation = rel->rd_indcollation[0]; |
| |
| return DatumGetUInt32(FunctionCall1Coll(procinfo, collation, key)); |
| } |
| |
| /* |
| * _hash_datum2hashkey_type -- given a Datum of a specified type, |
| * hash it in a fashion compatible with this index |
| * |
| * This is much more expensive than _hash_datum2hashkey, so use it only in |
| * cross-type situations. |
| */ |
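| /* |
| * For example (illustrative): when the scan key's type differs from the |
| * index column's type, say an int8 hash index probed with an int4 constant, |
| * the caller passes the key's own type here and we look up the matching |
| * hash support function in the index's operator family instead of using the |
| * cached primary function. |
| */ |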
| uint32 |
| _hash_datum2hashkey_type(Relation rel, Datum key, Oid keytype) |
| { |
| RegProcedure hash_proc; |
| Oid collation; |
| |
| /* XXX assumes index has only one attribute */ |
| hash_proc = get_opfamily_proc(rel->rd_opfamily[0], |
| keytype, |
| keytype, |
| HASHSTANDARD_PROC); |
| if (!RegProcedureIsValid(hash_proc)) |
| elog(ERROR, "missing support function %d(%u,%u) for index \"%s\"", |
| HASHSTANDARD_PROC, keytype, keytype, |
| RelationGetRelationName(rel)); |
| collation = rel->rd_indcollation[0]; |
| |
| return DatumGetUInt32(OidFunctionCall1Coll(hash_proc, collation, key)); |
| } |
| |
| /* |
| * _hash_hashkey2bucket -- determine which bucket the hashkey maps to. |
| */ |
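| /* |
| * Worked example (illustrative values): suppose the table has grown to |
| * maxbucket = 5, with lowmask = 3 and highmask = 7. A hashkey of 13 gives |
| * 13 & 7 = 5, a bucket that exists, so it is used directly. A hashkey of |
| * 14 gives 14 & 7 = 6, which is beyond maxbucket because bucket 6 has not |
| * been created yet, so the key falls back to the lower half: 14 & 3 = 2. |
| * Once the table grows to include bucket 6, the same key maps there. |
| */ |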
| Bucket |
| _hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket, |
| uint32 highmask, uint32 lowmask) |
| { |
| Bucket bucket; |
| |
| bucket = hashkey & highmask; |
| if (bucket > maxbucket) |
| bucket = bucket & lowmask; |
| |
| return bucket; |
| } |
| |
| /* |
| * _hash_spareindex -- returns spare index / global splitpoint phase of the |
| * bucket |
| */ |
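| /* |
| * Worked example (assuming HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE = 10 and |
| * HASH_SPLITPOINT_PHASE_BITS = 2, as defined in hash.h): for num_bucket = |
| * 1500, splitpoint_group = ceil(log2(1500)) = 11, so we start from the 10 |
| * single-phase groups, add (11 - 10) << 2 = 4 for the phases of the |
| * multi-phase groups before group 11, and add ((1500 - 1) >> (11 - 3)) & 3 |
| * = 1 for the phase within group 11, giving a spare index of 15. Small |
| * tables are simpler: num_bucket = 5 yields ceil(log2(5)) = 3, below the |
| * single-phase threshold, and 3 is returned directly. |
| */ |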
| uint32 |
| _hash_spareindex(uint32 num_bucket) |
| { |
| uint32 splitpoint_group; |
| uint32 splitpoint_phases; |
| |
| splitpoint_group = pg_ceil_log2_32(num_bucket); |
| |
| if (splitpoint_group < HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) |
| return splitpoint_group; |
| |
| /* account for single-phase groups */ |
| splitpoint_phases = HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE; |
| |
| /* account for multi-phase groups before splitpoint_group */ |
| splitpoint_phases += |
| ((splitpoint_group - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) << |
| HASH_SPLITPOINT_PHASE_BITS); |
| |
| /* account for phases within current group */ |
| splitpoint_phases += |
| (((num_bucket - 1) >> |
| (splitpoint_group - (HASH_SPLITPOINT_PHASE_BITS + 1))) & |
| HASH_SPLITPOINT_PHASE_MASK); /* to 0-based value. */ |
| |
| return splitpoint_phases; |
| } |
| |
| /* |
| * _hash_get_totalbuckets -- returns the total number of buckets allocated |
| * up to and including the given splitpoint phase. |
| */ |
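| /* |
| * Worked example (same assumed constants as above): for splitpoint phase 15, |
| * splitpoint_group = 10 + ((15 - 10) >> 2) = 11; the groups before it hold |
| * 1 << 10 = 1024 buckets, and the two phases counted within group 11 add |
| * (1024 >> 2) * 2 = 512 more, for a total of 1536. This is consistent with |
| * _hash_spareindex(1500) = 15, since 1280 < 1500 <= 1536. For a phase below |
| * the single-phase threshold, e.g. 3, the result is simply 1 << 3 = 8. |
| */ |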
| uint32 |
| _hash_get_totalbuckets(uint32 splitpoint_phase) |
| { |
| uint32 splitpoint_group; |
| uint32 total_buckets; |
| uint32 phases_within_splitpoint_group; |
| |
| if (splitpoint_phase < HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) |
| return (1 << splitpoint_phase); |
| |
| /* get splitpoint's group */ |
| splitpoint_group = HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE; |
| splitpoint_group += |
| ((splitpoint_phase - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) >> |
| HASH_SPLITPOINT_PHASE_BITS); |
| |
| /* account for buckets before splitpoint_group */ |
| total_buckets = (1 << (splitpoint_group - 1)); |
| |
| /* account for buckets within splitpoint_group */ |
| phases_within_splitpoint_group = |
| (((splitpoint_phase - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) & |
| HASH_SPLITPOINT_PHASE_MASK) + 1); /* from 0-based to 1-based */ |
| total_buckets += |
| (((1 << (splitpoint_group - 1)) >> HASH_SPLITPOINT_PHASE_BITS) * |
| phases_within_splitpoint_group); |
| |
| return total_buckets; |
| } |
| |
| /* |
| * _hash_checkpage -- sanity checks on the format of all hash pages |
| * |
| * If flags is not zero, it is a bitwise OR of the acceptable page types |
| * (values of hasho_flag & LH_PAGE_TYPE). |
| */ |
| void |
| _hash_checkpage(Relation rel, Buffer buf, int flags) |
| { |
| Page page = BufferGetPage(buf); |
| |
| /* |
| * ReadBuffer verifies that every newly-read page passes |
| * PageHeaderIsValid, which means it either contains a reasonably sane |
| * page header or is all-zero. We have to defend against the all-zero |
| * case, however. |
| */ |
| if (PageIsNew(page)) |
| ereport(ERROR, |
| (errcode(ERRCODE_INDEX_CORRUPTED), |
| errmsg("index \"%s\" contains unexpected zero page at block %u", |
| RelationGetRelationName(rel), |
| BufferGetBlockNumber(buf)), |
| errhint("Please REINDEX it."))); |
| |
| /* |
| * Additionally check that the special area looks sane. |
| */ |
| if (PageGetSpecialSize(page) != MAXALIGN(sizeof(HashPageOpaqueData))) |
| ereport(ERROR, |
| (errcode(ERRCODE_INDEX_CORRUPTED), |
| errmsg("index \"%s\" contains corrupted page at block %u", |
| RelationGetRelationName(rel), |
| BufferGetBlockNumber(buf)), |
| errhint("Please REINDEX it."))); |
| |
| if (flags) |
| { |
| HashPageOpaque opaque = HashPageGetOpaque(page); |
| |
| if ((opaque->hasho_flag & flags) == 0) |
| ereport(ERROR, |
| (errcode(ERRCODE_INDEX_CORRUPTED), |
| errmsg("index \"%s\" contains corrupted page at block %u", |
| RelationGetRelationName(rel), |
| BufferGetBlockNumber(buf)), |
| errhint("Please REINDEX it."))); |
| } |
| |
| /* |
| * When checking the metapage, also verify magic number and version. |
| */ |
| if (flags == LH_META_PAGE) |
| { |
| HashMetaPage metap = HashPageGetMeta(page); |
| |
| if (metap->hashm_magic != HASH_MAGIC) |
| ereport(ERROR, |
| (errcode(ERRCODE_INDEX_CORRUPTED), |
| errmsg("index \"%s\" is not a hash index", |
| RelationGetRelationName(rel)))); |
| |
| if (metap->hashm_version != HASH_VERSION) |
| ereport(ERROR, |
| (errcode(ERRCODE_INDEX_CORRUPTED), |
| errmsg("index \"%s\" has wrong hash version", |
| RelationGetRelationName(rel)), |
| errhint("Please REINDEX it."))); |
| } |
| } |
| |
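| /* |
| * Parse the reloptions for a hash index. The only option handled here is |
| * fillfactor; for example (illustrative usage), a CREATE INDEX ... USING |
| * hash (...) WITH (fillfactor = 75) clause ends up being parsed by this |
| * routine. |
| */ |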
| bytea * |
| hashoptions(Datum reloptions, bool validate) |
| { |
| static const relopt_parse_elt tab[] = { |
| {"fillfactor", RELOPT_TYPE_INT, offsetof(HashOptions, fillfactor)}, |
| }; |
| |
| return (bytea *) build_reloptions(reloptions, validate, |
| RELOPT_KIND_HASH, |
| sizeof(HashOptions), |
| tab, lengthof(tab)); |
| } |
| |
| /* |
| * _hash_get_indextuple_hashkey - get the hash index tuple's hash key value |
| */ |
| uint32 |
| _hash_get_indextuple_hashkey(IndexTuple itup) |
| { |
| char *attp; |
| |
| /* |
| * We assume the hash key is the first attribute and can't be null, so |
| * this can be done crudely but very very cheaply ... |
| */ |
| attp = (char *) itup + IndexInfoFindDataOffset(itup->t_info); |
| return *((uint32 *) attp); |
| } |
| |
| /* |
| * _hash_convert_tuple - convert raw index data to hash key |
| * |
| * Inputs: values and isnull arrays for the user data column(s) |
| * Outputs: values and isnull arrays for the index tuple, suitable for |
| * passing to index_form_tuple(). |
| * |
| * Returns true if successful, false if not (because there are null values). |
| * On a false result, the given data need not be indexed. |
| * |
| * Note: callers know that the index-column arrays are always of length 1. |
| * In principle, there could be more than one input column, though we do not |
| * currently support that. |
| */ |
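| /* |
| * For example (illustrative): for a hash index on a text column, |
| * user_values[0] carries the original text datum, while the returned |
| * index_values[0] carries only its uint32 hash code; the original value is |
| * never stored in the index tuple. |
| */ |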
| bool |
| _hash_convert_tuple(Relation index, |
| Datum *user_values, bool *user_isnull, |
| Datum *index_values, bool *index_isnull) |
| { |
| uint32 hashkey; |
| |
| /* |
| * We do not insert null values into hash indexes. This is okay because |
| * the only supported search operator is '=', and we assume it is strict. |
| */ |
| if (user_isnull[0]) |
| return false; |
| |
| hashkey = _hash_datum2hashkey(index, user_values[0]); |
| index_values[0] = UInt32GetDatum(hashkey); |
| index_isnull[0] = false; |
| return true; |
| } |
| |
| /* |
| * _hash_binsearch - Return the offset number in the page where the |
| * specified hash value should be sought or inserted. |
| * |
| * We use binary search, relying on the assumption that the existing entries |
| * are ordered by hash key. |
| * |
| * Returns the offset of the first index entry having hashkey >= hash_value, |
| * or the page's max offset plus one if hash_value is greater than all |
| * existing hash keys in the page. This is the appropriate place to start |
| * a search, or to insert a new item. |
| */ |
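| /* |
| * Worked example (illustrative page contents): if the page holds tuples with |
| * hash keys 10, 20, 20, 30 at offsets 1..4, then searching for 20 returns |
| * offset 2 (the first entry >= 20), searching for 25 returns offset 4, and |
| * searching for 40 returns 5, i.e. one past the page's max offset. |
| */ |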
| OffsetNumber |
| _hash_binsearch(Page page, uint32 hash_value) |
| { |
| OffsetNumber upper; |
| OffsetNumber lower; |
| |
| /* Loop invariant: lower <= desired place <= upper */ |
| upper = PageGetMaxOffsetNumber(page) + 1; |
| lower = FirstOffsetNumber; |
| |
| while (upper > lower) |
| { |
| OffsetNumber off; |
| IndexTuple itup; |
| uint32 hashkey; |
| |
| off = (upper + lower) / 2; |
| Assert(OffsetNumberIsValid(off)); |
| |
| itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off)); |
| hashkey = _hash_get_indextuple_hashkey(itup); |
| if (hashkey < hash_value) |
| lower = off + 1; |
| else |
| upper = off; |
| } |
| |
| return lower; |
| } |
| |
| /* |
| * _hash_binsearch_last |
| * |
| * Same as above, except that if there are multiple matching items in the |
| * page, we return the offset of the last one instead of the first one, |
| * and the possible range of outputs is 0..maxoffset not 1..maxoffset+1. |
| * This is handy for starting a new page in a backwards scan. |
| */ |
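| /* |
| * With the same illustrative page as above (hash keys 10, 20, 20, 30 at |
| * offsets 1..4), searching for 20 returns offset 3 (the last entry <= 20), |
| * while searching for 5 returns 0, one before FirstOffsetNumber, meaning |
| * nothing on the page qualifies. |
| */ |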
| OffsetNumber |
| _hash_binsearch_last(Page page, uint32 hash_value) |
| { |
| OffsetNumber upper; |
| OffsetNumber lower; |
| |
| /* Loop invariant: lower <= desired place <= upper */ |
| upper = PageGetMaxOffsetNumber(page); |
| lower = FirstOffsetNumber - 1; |
| |
| while (upper > lower) |
| { |
| IndexTuple itup; |
| OffsetNumber off; |
| uint32 hashkey; |
| |
| off = (upper + lower + 1) / 2; |
| Assert(OffsetNumberIsValid(off)); |
| |
| itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off)); |
| hashkey = _hash_get_indextuple_hashkey(itup); |
| if (hashkey > hash_value) |
| upper = off - 1; |
| else |
| lower = off; |
| } |
| |
| return lower; |
| } |
| |
| /* |
| * _hash_get_oldblock_from_newbucket() -- get the block number of the bucket |
| * from which the current (new) bucket is being split. |
| */ |
| BlockNumber |
| _hash_get_oldblock_from_newbucket(Relation rel, Bucket new_bucket) |
| { |
| Bucket old_bucket; |
| uint32 mask; |
| Buffer metabuf; |
| HashMetaPage metap; |
| BlockNumber blkno; |
| |
| /* |
| * To get the old bucket from the current (new) bucket, we need a mask that |
| * maps it into the lower half of the table. Such a mask is stored in the |
| * meta page as hashm_lowmask, but we can't rely on that value here, because |
| * we need the lowmask that was in effect when this bucket's split was |
| * started. Clearing the most significant bit of the new bucket number gives |
| * us the old bucket. |
| */ |
| mask = (((uint32) 1) << pg_leftmost_one_pos32(new_bucket)) - 1; |
| old_bucket = new_bucket & mask; |
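| /* |
| * Worked example (illustrative): for new_bucket = 11 (binary 1011), |
| * pg_leftmost_one_pos32(11) = 3, so mask = (1 << 3) - 1 = 7 and old_bucket = |
| * 11 & 7 = 3; bucket 11 is indeed the bucket that splits off from bucket 3 |
| * when the table grows from 8 to 16 buckets. |
| */ |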
| |
| metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); |
| metap = HashPageGetMeta(BufferGetPage(metabuf)); |
| |
| blkno = BUCKET_TO_BLKNO(metap, old_bucket); |
| |
| _hash_relbuf(rel, metabuf); |
| |
| return blkno; |
| } |
| |
| /* |
| * _hash_get_newblock_from_oldbucket() -- get the block number of the bucket |
| * that will be generated by splitting the old bucket. |
| * |
| * This is used to find the new bucket from the old bucket based on the |
| * current table half. It is mainly needed to finish incomplete splits, where |
| * we know that no more than one split can be in progress from the old |
| * bucket. |
| */ |
| BlockNumber |
| _hash_get_newblock_from_oldbucket(Relation rel, Bucket old_bucket) |
| { |
| Bucket new_bucket; |
| Buffer metabuf; |
| HashMetaPage metap; |
| BlockNumber blkno; |
| |
| metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); |
| metap = HashPageGetMeta(BufferGetPage(metabuf)); |
| |
| new_bucket = _hash_get_newbucket_from_oldbucket(rel, old_bucket, |
| metap->hashm_lowmask, |
| metap->hashm_maxbucket); |
| blkno = BUCKET_TO_BLKNO(metap, new_bucket); |
| |
| _hash_relbuf(rel, metabuf); |
| |
| return blkno; |
| } |
| |
| /* |
| * _hash_get_newbucket_from_oldbucket() -- get the new bucket that will be |
| * generated after split from current (old) bucket. |
| * |
| * This is used to find the new bucket from the old bucket. The new bucket is |
| * obtained by OR'ing the old bucket with the most significant bit of the |
| * current table half (the lowmask passed to this function identifies that |
| * bit). More than one bucket may have been split from the current bucket; |
| * we need the first such bucket that actually exists. The caller must |
| * ensure that no more than one split has happened from the old bucket. |
| */ |
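| /* |
| * Worked example (illustrative): with lowmask = 7 and maxbucket = 10, |
| * old_bucket = 2 gives new_bucket = 2 | 8 = 10, which exists and is |
| * returned. For old_bucket = 3, 3 | 8 = 11 exceeds maxbucket, so we retry |
| * with the previous half's mask: 3 | 4 = 7. |
| */ |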
| Bucket |
| _hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket, |
| uint32 lowmask, uint32 maxbucket) |
| { |
| Bucket new_bucket; |
| |
| new_bucket = CALC_NEW_BUCKET(old_bucket, lowmask); |
| if (new_bucket > maxbucket) |
| { |
| lowmask = lowmask >> 1; |
| new_bucket = CALC_NEW_BUCKET(old_bucket, lowmask); |
| } |
| |
| return new_bucket; |
| } |
| |
| /* |
| * _hash_kill_items - set LP_DEAD state for items an indexscan caller has |
| * told us were killed. |
| * |
| * scan->opaque, referenced locally through so, contains information about the |
| * current page and killed tuples thereon (generally, this should only be |
| * called if so->numKilled > 0). |
| * |
| * The caller does not have a lock on the page and may or may not have the |
| * page pinned in a buffer. Note that read-lock is sufficient for setting |
| * LP_DEAD status (which is only a hint). |
| * |
| * The caller must have pin on bucket buffer, but may or may not have pin |
| * on overflow buffer, as indicated by HashScanPosIsPinned(so->currPos). |
| * |
| * We match items by heap TID before assuming they are the right ones to |
| * delete. |
| * |
| * There are never any scans active in a bucket at the time VACUUM begins, |
| * because VACUUM takes a cleanup lock on the primary bucket page and scans |
| * must hold a pin. A scan can begin after VACUUM leaves the primary bucket page |
| * but before it finishes the entire bucket, but it can never pass VACUUM, |
| * because VACUUM always locks the next page before releasing the lock on |
| * the previous one. Therefore, we don't have to worry about accidentally |
| * killing a TID that has been reused for an unrelated tuple. |
| */ |
| void |
| _hash_kill_items(IndexScanDesc scan) |
| { |
| HashScanOpaque so = (HashScanOpaque) scan->opaque; |
| Relation rel = scan->indexRelation; |
| BlockNumber blkno; |
| Buffer buf; |
| Page page; |
| HashPageOpaque opaque; |
| OffsetNumber offnum, |
| maxoff; |
| int numKilled = so->numKilled; |
| int i; |
| bool killedsomething = false; |
| bool havePin = false; |
| |
| Assert(so->numKilled > 0); |
| Assert(so->killedItems != NULL); |
| Assert(HashScanPosIsValid(so->currPos)); |
| |
| /* |
| * Always reset the scan state, so we don't look for the same items on |
| * other pages. |
| */ |
| so->numKilled = 0; |
| |
| blkno = so->currPos.currPage; |
| if (HashScanPosIsPinned(so->currPos)) |
| { |
| /* |
| * We already have a pin on this buffer, so all we need to do is acquire |
| * a lock on it. |
| */ |
| havePin = true; |
| buf = so->currPos.buf; |
| LockBuffer(buf, BUFFER_LOCK_SHARE); |
| } |
| else |
| buf = _hash_getbuf(rel, blkno, HASH_READ, LH_OVERFLOW_PAGE); |
| |
| page = BufferGetPage(buf); |
| opaque = HashPageGetOpaque(page); |
| maxoff = PageGetMaxOffsetNumber(page); |
| |
| for (i = 0; i < numKilled; i++) |
| { |
| int itemIndex = so->killedItems[i]; |
| HashScanPosItem *currItem = &so->currPos.items[itemIndex]; |
| |
| offnum = currItem->indexOffset; |
| |
| Assert(itemIndex >= so->currPos.firstItem && |
| itemIndex <= so->currPos.lastItem); |
| |
| while (offnum <= maxoff) |
| { |
| ItemId iid = PageGetItemId(page, offnum); |
| IndexTuple ituple = (IndexTuple) PageGetItem(page, iid); |
| |
| if (ItemPointerEquals(&ituple->t_tid, &currItem->heapTid)) |
| { |
| /* found the item */ |
| ItemIdMarkDead(iid); |
| killedsomething = true; |
| break; /* out of inner search loop */ |
| } |
| offnum = OffsetNumberNext(offnum); |
| } |
| } |
| |
| /* |
| * Since this can be redone later if needed, mark the buffer dirty only as |
| * a hint. Whenever we mark anything LP_DEAD, we also set the page's |
| * LH_PAGE_HAS_DEAD_TUPLES flag, which is likewise just a hint. |
| */ |
| if (killedsomething) |
| { |
| opaque->hasho_flag |= LH_PAGE_HAS_DEAD_TUPLES; |
| MarkBufferDirtyHint(buf, true); |
| } |
| |
| if (so->hashso_bucket_buf == so->currPos.buf || |
| havePin) |
| LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK); |
| else |
| _hash_relbuf(rel, buf); |
| } |