| /*------------------------------------------------------------------------- |
| * |
| * hash.c |
| * Implementation of Margo Seltzer's Hashing package for postgres. |
| * |
| * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group |
| * Portions Copyright (c) 1994, Regents of the University of California |
| * |
| * |
| * IDENTIFICATION |
| * src/backend/access/hash/hash.c |
| * |
| * NOTES |
| * This file contains only the public interface routines. |
| * |
| *------------------------------------------------------------------------- |
| */ |
| |
| #include "postgres.h" |
| |
| #include "access/hash.h" |
| #include "access/hash_xlog.h" |
| #include "access/relscan.h" |
| #include "access/tableam.h" |
| #include "catalog/index.h" |
| #include "commands/progress.h" |
| #include "commands/vacuum.h" |
| #include "miscadmin.h" |
| #include "optimizer/plancat.h" |
| #include "pgstat.h" |
| #include "utils/builtins.h" |
| #include "utils/index_selfuncs.h" |
| #include "utils/rel.h" |
| |
| /* Working state for hashbuild and its callback */ |
| typedef struct |
| { |
| HSpool *spool; /* NULL if not using spooling */ |
| double indtuples; /* # tuples accepted into index */ |
| Relation heapRel; /* heap relation descriptor */ |
| } HashBuildState; |
| |
| static void hashbuildCallback(Relation index, |
| ItemPointer tid, |
| Datum *values, |
| bool *isnull, |
| bool tupleIsAlive, |
| void *state); |
| |
| |
| /* |
| * Hash handler function: return IndexAmRoutine with access method parameters |
| * and callbacks. |
| */ |
| Datum |
| hashhandler(PG_FUNCTION_ARGS) |
| { |
| IndexAmRoutine *amroutine = makeNode(IndexAmRoutine); |
| |
| amroutine->amstrategies = HTMaxStrategyNumber; |
| amroutine->amsupport = HASHNProcs; |
| amroutine->amoptsprocnum = HASHOPTIONS_PROC; |
| amroutine->amcanorder = false; |
| amroutine->amcanorderbyop = false; |
| amroutine->amcanbackward = true; |
| amroutine->amcanunique = false; |
| amroutine->amcanmulticol = false; |
| amroutine->amoptionalkey = false; |
| amroutine->amsearcharray = false; |
| amroutine->amsearchnulls = false; |
| amroutine->amstorage = false; |
| amroutine->amclusterable = false; |
| amroutine->ampredlocks = true; |
| amroutine->amcanparallel = false; |
| amroutine->amcaninclude = false; |
| amroutine->amusemaintenanceworkmem = false; |
| amroutine->amparallelvacuumoptions = |
| VACUUM_OPTION_PARALLEL_BULKDEL; |
| amroutine->amkeytype = INT4OID; |
| |
| amroutine->ambuild = hashbuild; |
| amroutine->ambuildempty = hashbuildempty; |
| amroutine->aminsert = hashinsert; |
| amroutine->ambulkdelete = hashbulkdelete; |
| amroutine->amvacuumcleanup = hashvacuumcleanup; |
| amroutine->amcanreturn = NULL; |
| amroutine->amcostestimate = hashcostestimate; |
| amroutine->amoptions = hashoptions; |
| amroutine->amproperty = NULL; |
| amroutine->ambuildphasename = NULL; |
| amroutine->amvalidate = hashvalidate; |
| amroutine->amadjustmembers = hashadjustmembers; |
| amroutine->ambeginscan = hashbeginscan; |
| amroutine->amrescan = hashrescan; |
| amroutine->amgettuple = hashgettuple; |
| amroutine->amgetbitmap = hashgetbitmap; |
| amroutine->amendscan = hashendscan; |
| amroutine->ammarkpos = NULL; |
| amroutine->amrestrpos = NULL; |
| amroutine->amestimateparallelscan = NULL; |
| amroutine->aminitparallelscan = NULL; |
| amroutine->amparallelrescan = NULL; |
| |
| PG_RETURN_POINTER(amroutine); |
| } |
| |
| /* |
| * hashbuild() -- build a new hash index. |
| */ |
| IndexBuildResult * |
| hashbuild(Relation heap, Relation index, IndexInfo *indexInfo) |
| { |
| IndexBuildResult *result; |
| BlockNumber relpages; |
| double reltuples; |
| double allvisfrac; |
| uint32 num_buckets; |
| long sort_threshold; |
| HashBuildState buildstate; |
| |
| /* |
| * We expect to be called exactly once for any index relation. If that's |
| * not the case, big trouble's what we have. |
| */ |
| if (RelationGetNumberOfBlocks(index) != 0) |
| elog(ERROR, "index \"%s\" already contains data", |
| RelationGetRelationName(index)); |
| |
| /* Estimate the number of rows currently present in the table */ |
| estimate_rel_size(heap, NULL, &relpages, &reltuples, &allvisfrac); |
| |
| /* Initialize the hash index metadata page and initial buckets */ |
| num_buckets = _hash_init(index, reltuples, MAIN_FORKNUM); |
| |
| /* |
| * If we just insert the tuples into the index in scan order, then |
| * (assuming their hash codes are pretty random) there will be no locality |
| * of access to the index, and if the index is bigger than available RAM |
| * then we'll thrash horribly. To prevent that scenario, we can sort the |
| * tuples by (expected) bucket number. However, such a sort is useless |
| * overhead when the index does fit in RAM. We choose to sort if the |
| * initial index size exceeds maintenance_work_mem, or the number of |
| * buffers usable for the index, whichever is less. (Limiting by the |
| * number of buffers should reduce thrashing between PG buffers and kernel |
| * buffers, which seems useful even if no physical I/O results. Limiting |
| * by maintenance_work_mem is useful to allow easy testing of the sort |
| * code path, and may be useful to DBAs as an additional control knob.) |
| * |
| * NOTE: this test will need adjustment if a bucket is ever different from |
| * one page. Also, "initial index size" accounting does not include the |
| * metapage, nor the first bitmap page. |
| */ |
| sort_threshold = (maintenance_work_mem * 1024L) / BLCKSZ; |
| if (index->rd_rel->relpersistence != RELPERSISTENCE_TEMP) |
| sort_threshold = Min(sort_threshold, NBuffers); |
| else |
| sort_threshold = Min(sort_threshold, NLocBuffer); |
| |
| if (num_buckets >= (uint32) sort_threshold) |
| buildstate.spool = _h_spoolinit(heap, index, num_buckets); |
| else |
| buildstate.spool = NULL; |
| |
| /* prepare to build the index */ |
| buildstate.indtuples = 0; |
| buildstate.heapRel = heap; |
| |
| /* do the heap scan */ |
| reltuples = table_index_build_scan(heap, index, indexInfo, true, true, |
| hashbuildCallback, |
| (void *) &buildstate, NULL); |
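| |
| /* report the tuple count for CREATE INDEX progress reporting */ |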
| pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_TOTAL, |
| buildstate.indtuples); |
| |
| if (buildstate.spool) |
| { |
| /* sort the tuples and insert them into the index */ |
| _h_indexbuild(buildstate.spool, buildstate.heapRel); |
| _h_spooldestroy(buildstate.spool); |
| } |
| |
| /* |
| * Return statistics |
| */ |
| result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); |
| |
| result->heap_tuples = reltuples; |
| result->index_tuples = buildstate.indtuples; |
| |
| return result; |
| } |
| |
| /* |
| * hashbuildempty() -- build an empty hash index in the initialization fork |
| */ |
| void |
| hashbuildempty(Relation index) |
| { |
| _hash_init(index, 0, INIT_FORKNUM); |
| } |
| |
| /* |
| * Per-tuple callback for table_index_build_scan |
| */ |
| static void |
| hashbuildCallback(Relation index, |
| ItemPointer tid, |
| Datum *values, |
| bool *isnull, |
| bool tupleIsAlive, |
| void *state) |
| { |
| HashBuildState *buildstate = (HashBuildState *) state; |
| Datum index_values[1]; |
| bool index_isnull[1]; |
| IndexTuple itup; |
| |
| /* convert data to a hash key; on failure, do not insert anything */ |
| if (!_hash_convert_tuple(index, |
| values, isnull, |
| index_values, index_isnull)) |
| return; |
| |
| /* Either spool the tuple for sorting, or just put it into the index */ |
| if (buildstate->spool) |
| _h_spool(buildstate->spool, tid, index_values, index_isnull); |
| else |
| { |
| /* form an index tuple and point it at the heap tuple */ |
| itup = index_form_tuple(RelationGetDescr(index), |
| index_values, index_isnull); |
| itup->t_tid = *tid; |
| _hash_doinsert(index, itup, buildstate->heapRel); |
| pfree(itup); |
| } |
| |
| buildstate->indtuples += 1; |
| } |
| |
| /* |
| * hashinsert() -- insert an index tuple into a hash table. |
| * |
| * Hash on the heap tuple's key, form an index tuple with hash code. |
| * Find the appropriate location for the new tuple, and put it there. |
| */ |
| bool |
| hashinsert(Relation rel, Datum *values, bool *isnull, |
| ItemPointer ht_ctid, Relation heapRel, |
| IndexUniqueCheck checkUnique, |
| bool indexUnchanged, |
| IndexInfo *indexInfo) |
| { |
| Datum index_values[1]; |
| bool index_isnull[1]; |
| IndexTuple itup; |
| |
| /* convert data to a hash key; on failure, do not insert anything */ |
| if (!_hash_convert_tuple(rel, |
| values, isnull, |
| index_values, index_isnull)) |
| return false; |
| |
| /* form an index tuple and point it at the heap tuple */ |
| itup = index_form_tuple(RelationGetDescr(rel), index_values, index_isnull); |
| itup->t_tid = *ht_ctid; |
| |
| _hash_doinsert(rel, itup, heapRel); |
| |
| pfree(itup); |
| |
| return false; |
| } |
| |
| |
| /* |
| * hashgettuple() -- Get the next tuple in the scan. |
| */ |
| bool |
| hashgettuple(IndexScanDesc scan, ScanDirection dir) |
| { |
| HashScanOpaque so = (HashScanOpaque) scan->opaque; |
| bool res; |
| |
| /* Hash indexes are always lossy since we store only the hash code */ |
| scan->xs_recheck = true; |
| |
| /* |
| * If we've already initialized this scan, we can just advance it in the |
| * appropriate direction. If we haven't done so yet, we call a routine to |
| * get the first item in the scan. |
| */ |
| if (!HashScanPosIsValid(so->currPos)) |
| res = _hash_first(scan, dir); |
| else |
| { |
| /* |
| * Check to see if we should kill the previously-fetched tuple. |
| */ |
| if (scan->kill_prior_tuple) |
| { |
| /* |
| * Yes, so remember it for later. (We'll deal with all such tuples |
| * at once right after leaving the index page or at end of scan.) |
| * If the caller reverses the indexscan direction, it is quite |
| * possible that the same item might get entered multiple times. |
| * We don't detect that; instead, we just forget any excess |
| * entries. |
| */ |
| if (so->killedItems == NULL) |
| so->killedItems = (int *) |
| palloc(MaxIndexTuplesPerPage * sizeof(int)); |
| |
| if (so->numKilled < MaxIndexTuplesPerPage) |
| so->killedItems[so->numKilled++] = so->currPos.itemIndex; |
| } |
| |
| /* |
| * Now continue the scan. |
| */ |
| res = _hash_next(scan, dir); |
| } |
| |
| return res; |
| } |
| |
| |
| /* |
| * hashgetbitmap() -- get all tuples at once |
| */ |
| int64 |
| hashgetbitmap(IndexScanDesc scan, Node **bmNodeP) |
| { |
| TIDBitmap *tbm; |
| HashScanOpaque so = (HashScanOpaque) scan->opaque; |
| bool res; |
| int64 ntids = 0; |
| HashScanPosItem *currItem; |
| |
| /* |
| * GPDB-specific code. Since GPDB also supports StreamBitmap for |
| * bitmap index scans, we normally need to create the specific |
| * bitmap node here in the amgetbitmap AM. |
| */ |
| Assert(bmNodeP); |
| if (*bmNodeP == NULL) |
| { |
| /* XXX should we use less than work_mem for this? */ |
| tbm = tbm_create(work_mem * 1024L, scan->dsa); |
| *bmNodeP = (Node *) tbm; |
| } |
| else if (!IsA(*bmNodeP, TIDBitmap)) |
| elog(ERROR, "non hash bitmap"); |
| else |
| tbm = (TIDBitmap *)*bmNodeP; |
| |
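| /* fetch the first match; the loop below walks the rest of the scan */ |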
| res = _hash_first(scan, ForwardScanDirection); |
| |
| while (res) |
| { |
| currItem = &so->currPos.items[so->currPos.itemIndex]; |
| |
| /* |
| * _hash_first and _hash_next eliminate dead index entries |
| * whenever scan->ignore_killed_tuples is true. Therefore, there's |
| * nothing to do here except add the results to the TIDBitmap. |
| */ |
| tbm_add_tuples(tbm, &(currItem->heapTid), 1, true); |
| ntids++; |
| |
| res = _hash_next(scan, ForwardScanDirection); |
| } |
| |
| return ntids; |
| } |
| |
| |
| /* |
| * hashbeginscan() -- start a scan on a hash index |
| */ |
| IndexScanDesc |
| hashbeginscan(Relation rel, int nkeys, int norderbys) |
| { |
| IndexScanDesc scan; |
| HashScanOpaque so; |
| |
| /* no order by operators allowed */ |
| Assert(norderbys == 0); |
| |
| scan = RelationGetIndexScan(rel, nkeys, norderbys); |
| |
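| /* allocate and initialize the hash-specific scan state */ |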
| so = (HashScanOpaque) palloc(sizeof(HashScanOpaqueData)); |
| HashScanPosInvalidate(so->currPos); |
| so->hashso_bucket_buf = InvalidBuffer; |
| so->hashso_split_bucket_buf = InvalidBuffer; |
| |
| so->hashso_buc_populated = false; |
| so->hashso_buc_split = false; |
| |
| so->killedItems = NULL; |
| so->numKilled = 0; |
| |
| scan->opaque = so; |
| |
| return scan; |
| } |
| |
| /* |
| * hashrescan() -- rescan an index relation |
| */ |
| void |
| hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, |
| ScanKey orderbys, int norderbys) |
| { |
| HashScanOpaque so = (HashScanOpaque) scan->opaque; |
| Relation rel = scan->indexRelation; |
| |
| if (HashScanPosIsValid(so->currPos)) |
| { |
| /* Before leaving current page, deal with any killed items */ |
| if (so->numKilled > 0) |
| _hash_kill_items(scan); |
| } |
| |
| _hash_dropscanbuf(rel, so); |
| |
| /* set position invalid (this will cause _hash_first call) */ |
| HashScanPosInvalidate(so->currPos); |
| |
| /* Update scan key, if a new one is given */ |
| if (scankey && scan->numberOfKeys > 0) |
| { |
| memmove(scan->keyData, |
| scankey, |
| scan->numberOfKeys * sizeof(ScanKeyData)); |
| } |
| |
| so->hashso_buc_populated = false; |
| so->hashso_buc_split = false; |
| } |
| |
| /* |
| * hashendscan() -- close down a scan |
| */ |
| void |
| hashendscan(IndexScanDesc scan) |
| { |
| HashScanOpaque so = (HashScanOpaque) scan->opaque; |
| Relation rel = scan->indexRelation; |
| |
| if (HashScanPosIsValid(so->currPos)) |
| { |
| /* Before leaving current page, deal with any killed items */ |
| if (so->numKilled > 0) |
| _hash_kill_items(scan); |
| } |
| |
| _hash_dropscanbuf(rel, so); |
| |
| if (so->killedItems != NULL) |
| pfree(so->killedItems); |
| pfree(so); |
| scan->opaque = NULL; |
| } |
| |
| /* |
| * Bulk deletion of all index entries pointing to a set of heap tuples. |
| * The set of target tuples is specified via a callback routine that tells |
| * whether any given heap tuple (identified by ItemPointer) is being deleted. |
| * |
| * This function also deletes the tuples that were moved to another bucket |
| * by a split. |
| * |
| * Result: a palloc'd struct containing statistical info for VACUUM displays. |
| */ |
| IndexBulkDeleteResult * |
| hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, |
| IndexBulkDeleteCallback callback, void *callback_state) |
| { |
| Relation rel = info->index; |
| double tuples_removed; |
| double num_index_tuples; |
| double orig_ntuples; |
| Bucket orig_maxbucket; |
| Bucket cur_maxbucket; |
| Bucket cur_bucket; |
| Buffer metabuf = InvalidBuffer; |
| HashMetaPage metap; |
| HashMetaPage cachedmetap; |
| |
| tuples_removed = 0; |
| num_index_tuples = 0; |
| |
| /* |
| * We need a copy of the metapage so that we can use its hashm_spares[] |
| * values to compute bucket page addresses, but a cached copy should be |
| * good enough. (If not, we'll detect that further down and refresh the |
| * cache as necessary.) |
| */ |
| cachedmetap = _hash_getcachedmetap(rel, &metabuf, false); |
| Assert(cachedmetap != NULL); |
| |
| orig_maxbucket = cachedmetap->hashm_maxbucket; |
| orig_ntuples = cachedmetap->hashm_ntuples; |
| |
| /* Scan the buckets that we know exist */ |
| cur_bucket = 0; |
| cur_maxbucket = orig_maxbucket; |
| |
| loop_top: |
| while (cur_bucket <= cur_maxbucket) |
| { |
| BlockNumber bucket_blkno; |
| BlockNumber blkno; |
| Buffer bucket_buf; |
| Buffer buf; |
| HashPageOpaque bucket_opaque; |
| Page page; |
| bool split_cleanup = false; |
| |
| /* Get address of bucket's start page */ |
| bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket); |
| |
| blkno = bucket_blkno; |
| |
| /* |
| * We need to acquire a cleanup lock on the primary bucket page to wait |
| * out concurrent scans before deleting the dead tuples. |
| */ |
| buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy); |
| LockBufferForCleanup(buf); |
| _hash_checkpage(rel, buf, LH_BUCKET_PAGE); |
| |
| page = BufferGetPage(buf); |
| bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page); |
| |
| /* |
| * If the bucket contains tuples that were moved by a split, then we |
| * need to delete them. We can't do so while the split operation on the |
| * bucket is still in progress, as the tuples are needed by scans. |
| */ |
| if (!H_BUCKET_BEING_SPLIT(bucket_opaque) && |
| H_NEEDS_SPLIT_CLEANUP(bucket_opaque)) |
| { |
| split_cleanup = true; |
| |
| /* |
| * This bucket might have been split since we last held a lock on |
| * the metapage. If so, hashm_maxbucket, hashm_highmask and |
| * hashm_lowmask might be old enough to cause us to fail to remove |
| * tuples left behind by the most recent split. To prevent that, |
| * now that the primary page of the target bucket has been locked |
| * (and thus can't be further split), check whether we need to |
| * update our cached metapage data. |
| */ |
| Assert(bucket_opaque->hasho_prevblkno != InvalidBlockNumber); |
| if (bucket_opaque->hasho_prevblkno > cachedmetap->hashm_maxbucket) |
| { |
| cachedmetap = _hash_getcachedmetap(rel, &metabuf, true); |
| Assert(cachedmetap != NULL); |
| } |
| } |
| |
| bucket_buf = buf; |
| |
| hashbucketcleanup(rel, cur_bucket, bucket_buf, blkno, info->strategy, |
| cachedmetap->hashm_maxbucket, |
| cachedmetap->hashm_highmask, |
| cachedmetap->hashm_lowmask, &tuples_removed, |
| &num_index_tuples, split_cleanup, |
| callback, callback_state); |
| |
| _hash_dropbuf(rel, bucket_buf); |
| |
| /* Advance to next bucket */ |
| cur_bucket++; |
| } |
| |
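| /* pin the metapage if _hash_getcachedmetap() didn't do it for us */ |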
| if (BufferIsInvalid(metabuf)) |
| metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE); |
| |
| /* Write-lock metapage and check for split since we started */ |
| LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); |
| metap = HashPageGetMeta(BufferGetPage(metabuf)); |
| |
| if (cur_maxbucket != metap->hashm_maxbucket) |
| { |
| /* There's been a split, so process the additional bucket(s) */ |
| LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); |
| cachedmetap = _hash_getcachedmetap(rel, &metabuf, true); |
| Assert(cachedmetap != NULL); |
| cur_maxbucket = cachedmetap->hashm_maxbucket; |
| goto loop_top; |
| } |
| |
| /* Okay, we're really done. Update tuple count in metapage. */ |
| START_CRIT_SECTION(); |
| |
| if (orig_maxbucket == metap->hashm_maxbucket && |
| orig_ntuples == metap->hashm_ntuples) |
| { |
| /* |
| * No one has split or inserted anything since start of scan, so |
| * believe our count as gospel. |
| */ |
| metap->hashm_ntuples = num_index_tuples; |
| } |
| else |
| { |
| /* |
| * Otherwise, our count is untrustworthy since we may have |
| * double-scanned tuples in split buckets. Proceed by dead-reckoning. |
| * (Note: we still return estimated_count = false, because using this |
| * count is better than not updating reltuples at all.) |
| */ |
| if (metap->hashm_ntuples > tuples_removed) |
| metap->hashm_ntuples -= tuples_removed; |
| else |
| metap->hashm_ntuples = 0; |
| num_index_tuples = metap->hashm_ntuples; |
| } |
| |
| MarkBufferDirty(metabuf); |
| |
| /* XLOG stuff */ |
| if (RelationNeedsWAL(rel)) |
| { |
| xl_hash_update_meta_page xlrec; |
| XLogRecPtr recptr; |
| |
| xlrec.ntuples = metap->hashm_ntuples; |
| |
| XLogBeginInsert(); |
| XLogRegisterData((char *) &xlrec, SizeOfHashUpdateMetaPage); |
| |
| XLogRegisterBuffer(0, metabuf, REGBUF_STANDARD); |
| |
| recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_UPDATE_META_PAGE); |
| PageSetLSN(BufferGetPage(metabuf), recptr); |
| } |
| |
| END_CRIT_SECTION(); |
| |
| _hash_relbuf(rel, metabuf); |
| |
| /* return statistics */ |
| if (stats == NULL) |
| stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); |
| stats->estimated_count = false; |
| stats->num_index_tuples = num_index_tuples; |
| stats->tuples_removed += tuples_removed; |
| /* hashvacuumcleanup will fill in num_pages */ |
| |
| return stats; |
| } |
| |
| /* |
| * Post-VACUUM cleanup. |
| * |
| * Result: a palloc'd struct containing statistical info for VACUUM displays. |
| */ |
| IndexBulkDeleteResult * |
| hashvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) |
| { |
| Relation rel = info->index; |
| BlockNumber num_pages; |
| |
| /* If hashbulkdelete wasn't called, return NULL signifying no change */ |
| /* Note: this covers the analyze_only case too */ |
| if (stats == NULL) |
| return NULL; |
| |
| /* update statistics */ |
| num_pages = RelationGetNumberOfBlocks(rel); |
| stats->num_pages = num_pages; |
| |
| return stats; |
| } |
| |
| /* |
| * Helper function to perform deletion of index entries from a bucket. |
| * |
| * This function expects that the caller has acquired a cleanup lock on the |
| * primary bucket page, and will return with a write lock again held on the |
| * primary bucket page. The lock won't necessarily be held continuously, |
| * though, because we'll release it when visiting overflow pages. |
| * |
| * There can't be any concurrent scans in progress when we first enter this |
| * function because of the cleanup lock we hold on the primary bucket page, |
| * but as soon as we release that lock, there might be. If those scans got |
| * ahead of our cleanup scan, they might see a tuple before we kill it and |
| * wake up only after VACUUM has completed and the TID has been recycled for |
| * an unrelated tuple. To avoid that calamity, we prevent scans from passing |
| * our cleanup scan by locking the next page in the bucket chain before |
| * releasing the lock on the previous page. (This type of lock chaining is not |
| * ideal, so we might want to look for a better solution at some point.) |
| * |
| * We need to retain a pin on the primary bucket to ensure that no concurrent |
| * split can start. |
| */ |
| void |
| hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf, |
| BlockNumber bucket_blkno, BufferAccessStrategy bstrategy, |
| uint32 maxbucket, uint32 highmask, uint32 lowmask, |
| double *tuples_removed, double *num_index_tuples, |
| bool split_cleanup, |
| IndexBulkDeleteCallback callback, void *callback_state) |
| { |
| BlockNumber blkno; |
| Buffer buf; |
| Bucket new_bucket PG_USED_FOR_ASSERTS_ONLY = InvalidBucket; |
| bool bucket_dirty = false; |
| |
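| /* start the scan at the bucket's primary page */ |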
| blkno = bucket_blkno; |
| buf = bucket_buf; |
| |
| if (split_cleanup) |
| new_bucket = _hash_get_newbucket_from_oldbucket(rel, cur_bucket, |
| lowmask, maxbucket); |
| |
| /* Scan each page in bucket */ |
| for (;;) |
| { |
| HashPageOpaque opaque; |
| OffsetNumber offno; |
| OffsetNumber maxoffno; |
| Buffer next_buf; |
| Page page; |
| OffsetNumber deletable[MaxOffsetNumber]; |
| int ndeletable = 0; |
| bool retain_pin = false; |
| bool clear_dead_marking = false; |
| |
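| /* allow interrupts and cost-based vacuum delay */ |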
| vacuum_delay_point(); |
| |
| page = BufferGetPage(buf); |
| opaque = (HashPageOpaque) PageGetSpecialPointer(page); |
| |
| /* Scan each tuple in page */ |
| maxoffno = PageGetMaxOffsetNumber(page); |
| for (offno = FirstOffsetNumber; |
| offno <= maxoffno; |
| offno = OffsetNumberNext(offno)) |
| { |
| ItemPointer htup; |
| IndexTuple itup; |
| Bucket bucket; |
| bool kill_tuple = false; |
| |
| itup = (IndexTuple) PageGetItem(page, |
| PageGetItemId(page, offno)); |
| htup = &(itup->t_tid); |
| |
| /* |
| * To remove the dead tuples, we strictly want to rely on the results |
| * of the callback function; see btvacuumpage for the detailed reason. |
| */ |
| if (callback && callback(htup, callback_state)) |
| { |
| kill_tuple = true; |
| if (tuples_removed) |
| *tuples_removed += 1; |
| } |
| else if (split_cleanup) |
| { |
| /* delete the tuples that were moved to another bucket by a split */ |
| bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup), |
| maxbucket, |
| highmask, |
| lowmask); |
| /* mark the item for deletion */ |
| if (bucket != cur_bucket) |
| { |
| /* |
| * We expect tuples to belong either to the current bucket or |
| * to new_bucket. This is ensured because we don't allow |
| * further splits from a bucket that contains garbage. See |
| * comments in _hash_expandtable. |
| */ |
| Assert(bucket == new_bucket); |
| kill_tuple = true; |
| } |
| } |
| |
| if (kill_tuple) |
| { |
| /* mark the item for deletion */ |
| deletable[ndeletable++] = offno; |
| } |
| else |
| { |
| /* we're keeping it, so count it */ |
| if (num_index_tuples) |
| *num_index_tuples += 1; |
| } |
| } |
| |
| /* retain the pin on primary bucket page till end of bucket scan */ |
| if (blkno == bucket_blkno) |
| retain_pin = true; |
| else |
| retain_pin = false; |
| |
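| /* note the next page in the bucket chain before we modify this one */ |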
| blkno = opaque->hasho_nextblkno; |
| |
| /* |
| * Apply deletions, advance to next page and write page if needed. |
| */ |
| if (ndeletable > 0) |
| { |
| /* No ereport(ERROR) until changes are logged */ |
| START_CRIT_SECTION(); |
| |
| PageIndexMultiDelete(page, deletable, ndeletable); |
| bucket_dirty = true; |
| |
| /* |
| * Mark the page as clean if vacuum removed DEAD tuples from the |
| * index page; we do this by clearing the |
| * LH_PAGE_HAS_DEAD_TUPLES flag. |
| */ |
| if (tuples_removed && *tuples_removed > 0 && |
| H_HAS_DEAD_TUPLES(opaque)) |
| { |
| opaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES; |
| clear_dead_marking = true; |
| } |
| |
| MarkBufferDirty(buf); |
| |
| /* XLOG stuff */ |
| if (RelationNeedsWAL(rel)) |
| { |
| xl_hash_delete xlrec; |
| XLogRecPtr recptr; |
| |
| xlrec.clear_dead_marking = clear_dead_marking; |
| xlrec.is_primary_bucket_page = (buf == bucket_buf) ? true : false; |
| |
| XLogBeginInsert(); |
| XLogRegisterData((char *) &xlrec, SizeOfHashDelete); |
| |
| /* |
| * bucket buffer needs to be registered to ensure that we can |
| * acquire a cleanup lock on it during replay. |
| */ |
| if (!xlrec.is_primary_bucket_page) |
| XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD | REGBUF_NO_IMAGE); |
| |
| XLogRegisterBuffer(1, buf, REGBUF_STANDARD); |
| XLogRegisterBufData(1, (char *) deletable, |
| ndeletable * sizeof(OffsetNumber)); |
| |
| recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_DELETE); |
| PageSetLSN(BufferGetPage(buf), recptr); |
| } |
| |
| END_CRIT_SECTION(); |
| } |
| |
| /* bail out if there are no more pages to scan. */ |
| if (!BlockNumberIsValid(blkno)) |
| break; |
| |
| next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE, |
| LH_OVERFLOW_PAGE, |
| bstrategy); |
| |
| /* |
| * release the lock on previous page after acquiring the lock on next |
| * page |
| */ |
| if (retain_pin) |
| LockBuffer(buf, BUFFER_LOCK_UNLOCK); |
| else |
| _hash_relbuf(rel, buf); |
| |
| buf = next_buf; |
| } |
| |
| /* |
| * Lock the bucket page so we can clear the garbage flag and squeeze the |
| * bucket. If the current buffer is the same as the bucket buffer, then |
| * we already have a lock on the bucket page. |
| */ |
| if (buf != bucket_buf) |
| { |
| _hash_relbuf(rel, buf); |
| LockBuffer(bucket_buf, BUFFER_LOCK_EXCLUSIVE); |
| } |
| |
| /* |
| * Clear the garbage flag from the bucket after deleting the tuples that |
| * were moved by a split. We purposely clear the flag before squeezing |
| * the bucket, so that after a restart, vacuum won't again try to delete |
| * the moved-by-split tuples. |
| */ |
| if (split_cleanup) |
| { |
| HashPageOpaque bucket_opaque; |
| Page page; |
| |
| page = BufferGetPage(bucket_buf); |
| bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page); |
| |
| /* No ereport(ERROR) until changes are logged */ |
| START_CRIT_SECTION(); |
| |
| bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP; |
| MarkBufferDirty(bucket_buf); |
| |
| /* XLOG stuff */ |
| if (RelationNeedsWAL(rel)) |
| { |
| XLogRecPtr recptr; |
| |
| XLogBeginInsert(); |
| XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD); |
| |
| recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_CLEANUP); |
| PageSetLSN(page, recptr); |
| } |
| |
| END_CRIT_SECTION(); |
| } |
| |
| /* |
| * If we have deleted anything, try to compact free space. For squeezing |
| * the bucket, we must have a cleanup lock, else it could disturb the |
| * ordering of tuples seen by a scan that started before the squeeze. |
| */ |
| if (bucket_dirty && IsBufferCleanupOK(bucket_buf)) |
| _hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf, |
| bstrategy); |
| else |
| LockBuffer(bucket_buf, BUFFER_LOCK_UNLOCK); |
| } |