| /*------------------------------------------------------------------------- |
| * |
| * nbtree.c |
| * Implementation of Lehman and Yao's btree management algorithm for |
| * Postgres. |
| * |
| * NOTES |
| * This file contains only the public interface routines. |
| * |
| * |
| * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group |
| * Portions Copyright (c) 1994, Regents of the University of California |
| * |
| * IDENTIFICATION |
| * src/backend/access/nbtree/nbtree.c |
| * |
| *------------------------------------------------------------------------- |
| */ |
| #include "postgres.h" |
| |
| #include "access/nbtree.h" |
| #include "access/nbtxlog.h" |
| #include "access/relscan.h" |
| #include "access/xlog.h" |
| #include "access/xloginsert.h" |
| #include "commands/progress.h" |
| #include "commands/vacuum.h" |
| #include "miscadmin.h" |
| #include "nodes/execnodes.h" |
| #include "pgstat.h" |
| #include "postmaster/autovacuum.h" |
| #include "storage/condition_variable.h" |
| #include "storage/indexfsm.h" |
| #include "storage/ipc.h" |
| #include "storage/lmgr.h" |
| #include "storage/smgr.h" |
| #include "utils/builtins.h" |
| #include "utils/index_selfuncs.h" |
| #include "utils/memutils.h" |
| #include "utils/guc.h" |
| |
| #include "catalog/indexing.h" |
| #include "catalog/pg_namespace.h" |
| |
| /* |
| * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started. |
| * |
| * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to |
| * a new page; others must wait. |
| * |
| * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan |
| * to a new page; some process can start doing that. |
| * |
| * BTPARALLEL_DONE indicates that the scan is complete (including error exit). |
| * We reach this state once for every distinct combination of array keys. |
| */ |
| typedef enum |
| { |
| BTPARALLEL_NOT_INITIALIZED, |
| BTPARALLEL_ADVANCING, |
| BTPARALLEL_IDLE, |
| BTPARALLEL_DONE |
| } BTPS_State; |
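
/*
 * Sketch of the normal flow through these states, as driven by the
 * _bt_parallel_* functions later in this file (simplified; error paths and
 * concurrent waiters are omitted):
 *
 *     BTPARALLEL_NOT_INITIALIZED  --_bt_parallel_seize()-->    BTPARALLEL_ADVANCING
 *     BTPARALLEL_ADVANCING        --_bt_parallel_release()-->  BTPARALLEL_IDLE
 *     BTPARALLEL_IDLE             --_bt_parallel_seize()-->    BTPARALLEL_ADVANCING
 *     any of the above            --_bt_parallel_done()-->     BTPARALLEL_DONE
 *     BTPARALLEL_DONE             --_bt_parallel_advance_array_keys()-->
 *                                                BTPARALLEL_NOT_INITIALIZED
 */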
| |
| /* |
| * BTParallelScanDescData contains btree specific shared information required |
| * for parallel scan. |
| */ |
| typedef struct BTParallelScanDescData |
| { |
| BlockNumber btps_scanPage; /* latest or next page to be scanned */ |
| BTPS_State btps_pageStatus; /* indicates whether next page is |
| * available for scan. see above for |
| * possible states of parallel scan. */ |
| int btps_arrayKeyCount; /* count indicating number of array scan |
| * keys processed by parallel scan */ |
| slock_t btps_mutex; /* protects above variables */ |
| ConditionVariable btps_cv; /* used to synchronize parallel scan */ |
| } BTParallelScanDescData; |
| |
| typedef struct BTParallelScanDescData *BTParallelScanDesc; |
| |
| |
| static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, |
| IndexBulkDeleteCallback callback, void *callback_state, |
| BTCycleId cycleid); |
| static void btvacuumpage(BTVacState *vstate, BlockNumber scanblkno); |
| static BTVacuumPosting btreevacuumposting(BTVacState *vstate, |
| IndexTuple posting, |
| OffsetNumber updatedoffset, |
| int *nremaining); |
| |
| |
| /* |
| * Btree handler function: return IndexAmRoutine with access method parameters |
| * and callbacks. |
| */ |
| Datum |
| bthandler(PG_FUNCTION_ARGS) |
| { |
| IndexAmRoutine *amroutine = makeNode(IndexAmRoutine); |
| |
| amroutine->amstrategies = BTMaxStrategyNumber; |
| amroutine->amsupport = BTNProcs; |
| amroutine->amoptsprocnum = BTOPTIONS_PROC; |
| amroutine->amcanorder = true; |
| amroutine->amcanorderbyop = false; |
| amroutine->amcanbackward = true; |
| amroutine->amcanunique = true; |
| amroutine->amcanmulticol = true; |
| amroutine->amoptionalkey = true; |
| amroutine->amsearcharray = true; |
| amroutine->amsearchnulls = true; |
| amroutine->amstorage = false; |
| amroutine->amclusterable = true; |
| amroutine->ampredlocks = true; |
| amroutine->amcanparallel = true; |
| amroutine->amcaninclude = true; |
| amroutine->amusemaintenanceworkmem = false; |
| amroutine->amsummarizing = false; |
| amroutine->amparallelvacuumoptions = |
| VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP; |
| amroutine->amkeytype = InvalidOid; |
| |
| amroutine->ambuild = btbuild; |
| amroutine->ambuildempty = btbuildempty; |
| amroutine->aminsert = btinsert; |
| amroutine->ambulkdelete = btbulkdelete; |
| amroutine->amvacuumcleanup = btvacuumcleanup; |
| amroutine->amcanreturn = btcanreturn; |
| amroutine->amcostestimate = btcostestimate; |
| amroutine->amoptions = btoptions; |
| amroutine->amproperty = btproperty; |
| amroutine->ambuildphasename = btbuildphasename; |
| amroutine->amvalidate = btvalidate; |
| amroutine->amadjustmembers = btadjustmembers; |
| amroutine->ambeginscan = btbeginscan; |
| amroutine->amrescan = btrescan; |
| amroutine->amgettuple = btgettuple; |
| amroutine->amgetbitmap = btgetbitmap; |
| amroutine->amendscan = btendscan; |
| amroutine->ammarkpos = btmarkpos; |
| amroutine->amrestrpos = btrestrpos; |
| amroutine->amestimateparallelscan = btestimateparallelscan; |
| amroutine->aminitparallelscan = btinitparallelscan; |
| amroutine->amparallelrescan = btparallelrescan; |
| |
| PG_RETURN_POINTER(amroutine); |
| } |
| |
| /* |
| * btbuildempty() -- build an empty btree index in the initialization fork |
| */ |
| void |
| btbuildempty(Relation index) |
| { |
| bool allequalimage = _bt_allequalimage(index, false); |
| Buffer metabuf; |
| Page metapage; |
| |
| /* |
	 * Initialize the metapage.
| * |
| * Regular index build bypasses the buffer manager and uses smgr functions |
| * directly, with an smgrimmedsync() call at the end. That makes sense |
| * when the index is large, but for an empty index, it's better to use the |
| * buffer cache to avoid the smgrimmedsync(). |
| */ |
| |
| metabuf = ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL); |
| metapage = BufferGetPage(metabuf); |
| PageEncryptInplace(metapage, INIT_FORKNUM, |
| BTREE_METAPAGE); |
| Assert(BufferGetBlockNumber(metabuf) == BTREE_METAPAGE); |
| _bt_lockbuf(index, metabuf, BT_WRITE); |
| |
| START_CRIT_SECTION(); |
| |
| metapage = BufferGetPage(metabuf); |
| _bt_initmetapage(metapage, P_NONE, 0, allequalimage); |
| MarkBufferDirty(metabuf); |
| log_newpage_buffer(metabuf, true); |
| |
| END_CRIT_SECTION(); |
| |
| _bt_unlockbuf(index, metabuf); |
| ReleaseBuffer(metabuf); |
| } |
| |
| /* |
 * For a newly inserted heap TID, check whether an entry with this TID
 * already exists in a unique index.  If it does, abort the inserting
 * transaction.
| */ |
| static void |
| _bt_validate_tid(Relation irel, ItemPointer h_tid) |
| { |
| BlockNumber blkno; |
| BlockNumber num_pages; |
| Buffer buf; |
| Page page; |
| BTPageOpaque opaque; |
| IndexTuple itup; |
| OffsetNumber maxoff, |
| minoff, |
| offnum; |
| |
| elog(DEBUG1, "validating tid (%d,%d) for index (%s)", |
| ItemPointerGetBlockNumber(h_tid), ItemPointerGetOffsetNumber(h_tid), |
| RelationGetRelationName(irel)); |
| |
| blkno = BTREE_METAPAGE + 1; |
| num_pages = RelationGetNumberOfBlocks(irel); |
| |
| for (; blkno < num_pages; blkno++) |
| { |
| buf = ReadBuffer(irel, blkno); |
| page = BufferGetPage(buf); |
| opaque = (BTPageOpaque) PageGetSpecialPointer(page); |
| if (!PageIsNew(page)) |
| _bt_checkpage(irel, buf); |
| if (P_ISLEAF(opaque)) |
| { |
| minoff = P_FIRSTDATAKEY(opaque); |
| maxoff = PageGetMaxOffsetNumber(page); |
| for (offnum = minoff; |
| offnum <= maxoff; |
| offnum = OffsetNumberNext(offnum)) |
| { |
| itup = (IndexTuple) PageGetItem(page, |
| PageGetItemId(page, offnum)); |
| if (ItemPointerEquals(&itup->t_tid, h_tid)) |
| { |
| Form_pg_attribute key_att = TupleDescAttr(RelationGetDescr(irel), 0); |
| Oid key = InvalidOid; |
| bool isnull; |
| if (key_att->atttypid == OIDOID) |
| { |
						key = DatumGetObjectId(
											   index_getattr(itup, 1, RelationGetDescr(irel), &isnull));
						elog(ERROR, "found tid (%d,%d), %s (%u) already in index (%s)",
							 ItemPointerGetBlockNumber(h_tid), ItemPointerGetOffsetNumber(h_tid),
							 NameStr(key_att->attname), key, RelationGetRelationName(irel));
| } |
| else |
| { |
| elog(ERROR, "found tid (%d,%d) already in index (%s)", |
| ItemPointerGetBlockNumber(h_tid), ItemPointerGetOffsetNumber(h_tid), |
| RelationGetRelationName(irel)); |
| } |
| } |
| } |
| } |
| ReleaseBuffer(buf); |
| } |
| } |
| |
| /* |
| * btinsert() -- insert an index tuple into a btree. |
| * |
| * Descend the tree recursively, find the appropriate location for our |
| * new tuple, and put it there. |
| */ |
| bool |
| btinsert(Relation rel, Datum *values, bool *isnull, |
| ItemPointer ht_ctid, Relation heapRel, |
| IndexUniqueCheck checkUnique, |
| bool indexUnchanged, |
| IndexInfo *indexInfo) |
| { |
| bool result; |
| IndexTuple itup; |
| |
| if (checkUnique && ( |
| (gp_indexcheck_insert == INDEX_CHECK_ALL && RelationIsHeap(heapRel)) || |
| (gp_indexcheck_insert == INDEX_CHECK_SYSTEM && |
| PG_CATALOG_NAMESPACE == RelationGetNamespace(heapRel)))) |
| { |
| _bt_validate_tid(rel, ht_ctid); |
| } |
| |
| /* generate an index tuple */ |
| itup = index_form_tuple(RelationGetDescr(rel), values, isnull); |
| itup->t_tid = *ht_ctid; |
| |
| result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel); |
| |
| pfree(itup); |
| |
| return result; |
| } |
| |
| /* |
| * btgettuple() -- Get the next tuple in the scan. |
| */ |
| bool |
| btgettuple(IndexScanDesc scan, ScanDirection dir) |
| { |
| BTScanOpaque so = (BTScanOpaque) scan->opaque; |
| bool res; |
| |
| /* btree indexes are never lossy */ |
| scan->xs_recheck = false; |
| |
| /* |
| * If we have any array keys, initialize them during first call for a |
| * scan. We can't do this in btrescan because we don't know the scan |
| * direction at that time. |
| */ |
| if (so->numArrayKeys && !BTScanPosIsValid(so->currPos)) |
| { |
| /* punt if we have any unsatisfiable array keys */ |
| if (so->numArrayKeys < 0) |
| return false; |
| |
| _bt_start_array_keys(scan, dir); |
| } |
| |
| /* This loop handles advancing to the next array elements, if any */ |
| do |
| { |
| /* |
| * If we've already initialized this scan, we can just advance it in |
| * the appropriate direction. If we haven't done so yet, we call |
| * _bt_first() to get the first item in the scan. |
| */ |
| if (!BTScanPosIsValid(so->currPos)) |
| res = _bt_first(scan, dir); |
| else |
| { |
| /* |
| * Check to see if we should kill the previously-fetched tuple. |
| */ |
| if (scan->kill_prior_tuple) |
| { |
| /* |
| * Yes, remember it for later. (We'll deal with all such |
| * tuples at once right before leaving the index page.) The |
| * test for numKilled overrun is not just paranoia: if the |
| * caller reverses direction in the indexscan then the same |
| * item might get entered multiple times. It's not worth |
| * trying to optimize that, so we don't detect it, but instead |
| * just forget any excess entries. |
| */ |
| if (so->killedItems == NULL) |
| so->killedItems = (int *) |
| palloc(MaxTIDsPerBTreePage * sizeof(int)); |
| if (so->numKilled < MaxTIDsPerBTreePage) |
| so->killedItems[so->numKilled++] = so->currPos.itemIndex; |
| } |
| |
| /* |
| * Now continue the scan. |
| */ |
| res = _bt_next(scan, dir); |
| } |
| |
| /* If we have a tuple, return it ... */ |
| if (res) |
| break; |
| /* ... otherwise see if we have more array keys to deal with */ |
| } while (so->numArrayKeys && _bt_advance_array_keys(scan, dir)); |
| |
| return res; |
| } |
| |
| /* |
| * btgetbitmap() -- construct a TIDBitmap. |
 *
 * GPDB_14_MERGE_FIXME: the upstream code assumes that the second argument is
 * always a non-NULL TIDBitmap *.  It might be possible to align this code
 * with upstream.
| */ |
| int64 |
| btgetbitmap(IndexScanDesc scan, Node **bmNodeP) |
| { |
| TIDBitmap *tbm; |
| BTScanOpaque so = (BTScanOpaque) scan->opaque; |
| int64 ntids = 0; |
| ItemPointer heapTid; |
| |
| /* |
	 * GPDB-specific code.  GPDB also supports StreamBitmap for bitmap
	 * indexes, so the amgetbitmap routine normally needs to create the
	 * specific bitmap node type itself.
	 */
| Assert(bmNodeP); |
| if (*bmNodeP == NULL) |
| { |
| /* XXX should we use less than work_mem for this? */ |
| tbm = tbm_create(work_mem * 1024L, scan->dsa); |
| *bmNodeP = (Node *) tbm; |
| } |
| else if (!IsA(*bmNodeP, TIDBitmap)) |
| elog(ERROR, "non btree bitmap"); |
| else |
		tbm = (TIDBitmap *) *bmNodeP;
| |
| /* |
| * If we have any array keys, initialize them. |
| */ |
| if (so->numArrayKeys) |
| { |
| /* punt if we have any unsatisfiable array keys */ |
| if (so->numArrayKeys < 0) |
| return ntids; |
| |
| _bt_start_array_keys(scan, ForwardScanDirection); |
| } |
| |
| /* This loop handles advancing to the next array elements, if any */ |
| do |
| { |
| /* Fetch the first page & tuple */ |
| if (_bt_first(scan, ForwardScanDirection)) |
| { |
| /* Save tuple ID, and continue scanning */ |
| heapTid = &scan->xs_heaptid; |
| tbm_add_tuples(tbm, heapTid, 1, false); |
| ntids++; |
| |
| for (;;) |
| { |
| /* |
| * Advance to next tuple within page. This is the same as the |
| * easy case in _bt_next(). |
| */ |
| if (++so->currPos.itemIndex > so->currPos.lastItem) |
| { |
| /* let _bt_next do the heavy lifting */ |
| if (!_bt_next(scan, ForwardScanDirection)) |
| break; |
| } |
| |
| /* Save tuple ID, and continue scanning */ |
| heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid; |
| tbm_add_tuples(tbm, heapTid, 1, false); |
| ntids++; |
| } |
| } |
| /* Now see if we have more array keys to deal with */ |
| } while (so->numArrayKeys && _bt_advance_array_keys(scan, ForwardScanDirection)); |
| |
| return ntids; |
| } |
| |
| /* |
| * btbeginscan() -- start a scan on a btree index |
| */ |
| IndexScanDesc |
| btbeginscan(Relation rel, int nkeys, int norderbys) |
| { |
| IndexScanDesc scan; |
| BTScanOpaque so; |
| |
| /* no order by operators allowed */ |
| Assert(norderbys == 0); |
| |
| /* get the scan */ |
| scan = RelationGetIndexScan(rel, nkeys, norderbys); |
| |
| /* allocate private workspace */ |
| so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData)); |
| BTScanPosInvalidate(so->currPos); |
| BTScanPosInvalidate(so->markPos); |
| if (scan->numberOfKeys > 0) |
| so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData)); |
| else |
| so->keyData = NULL; |
| |
| so->arrayKeyData = NULL; /* assume no array keys for now */ |
| so->arraysStarted = false; |
| so->numArrayKeys = 0; |
| so->arrayKeys = NULL; |
| so->arrayContext = NULL; |
| |
| so->killedItems = NULL; /* until needed */ |
| so->numKilled = 0; |
| |
| /* |
| * We don't know yet whether the scan will be index-only, so we do not |
| * allocate the tuple workspace arrays until btrescan. However, we set up |
| * scan->xs_itupdesc whether we'll need it or not, since that's so cheap. |
| */ |
| so->currTuples = so->markTuples = NULL; |
| |
| scan->xs_itupdesc = RelationGetDescr(rel); |
| |
| scan->opaque = so; |
| |
| return scan; |
| } |
| |
| /* |
| * btrescan() -- rescan an index relation |
| */ |
| void |
| btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, |
| ScanKey orderbys, int norderbys) |
| { |
| BTScanOpaque so = (BTScanOpaque) scan->opaque; |
| |
| /* we aren't holding any read locks, but gotta drop the pins */ |
| if (BTScanPosIsValid(so->currPos)) |
| { |
| /* Before leaving current page, deal with any killed items */ |
| if (so->numKilled > 0) |
| _bt_killitems(scan); |
| BTScanPosUnpinIfPinned(so->currPos); |
| BTScanPosInvalidate(so->currPos); |
| } |
| |
| so->markItemIndex = -1; |
| so->arrayKeyCount = 0; |
| BTScanPosUnpinIfPinned(so->markPos); |
| BTScanPosInvalidate(so->markPos); |
| |
| /* |
| * Allocate tuple workspace arrays, if needed for an index-only scan and |
| * not already done in a previous rescan call. To save on palloc |
| * overhead, both workspaces are allocated as one palloc block; only this |
| * function and btendscan know that. |
| * |
| * NOTE: this data structure also makes it safe to return data from a |
| * "name" column, even though btree name_ops uses an underlying storage |
| * datatype of cstring. The risk there is that "name" is supposed to be |
| * padded to NAMEDATALEN, but the actual index tuple is probably shorter. |
| * However, since we only return data out of tuples sitting in the |
| * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some |
| * data out of the markTuples array --- running off the end of memory for |
| * a SIGSEGV is not possible. Yeah, this is ugly as sin, but it beats |
| * adding special-case treatment for name_ops elsewhere. |
| */ |
| if (scan->xs_want_itup && so->currTuples == NULL) |
| { |
| so->currTuples = (char *) palloc(BLCKSZ * 2); |
| so->markTuples = so->currTuples + BLCKSZ; |
| } |
| |
| /* |
| * Reset the scan keys |
| */ |
| if (scankey && scan->numberOfKeys > 0) |
| memmove(scan->keyData, |
| scankey, |
| scan->numberOfKeys * sizeof(ScanKeyData)); |
| so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */ |
| |
| /* If any keys are SK_SEARCHARRAY type, set up array-key info */ |
| _bt_preprocess_array_keys(scan); |
| } |
| |
| /* |
| * btendscan() -- close down a scan |
| */ |
| void |
| btendscan(IndexScanDesc scan) |
| { |
| BTScanOpaque so = (BTScanOpaque) scan->opaque; |
| |
| /* we aren't holding any read locks, but gotta drop the pins */ |
| if (BTScanPosIsValid(so->currPos)) |
| { |
| /* Before leaving current page, deal with any killed items */ |
| if (so->numKilled > 0) |
| _bt_killitems(scan); |
| BTScanPosUnpinIfPinned(so->currPos); |
| } |
| |
| so->markItemIndex = -1; |
| BTScanPosUnpinIfPinned(so->markPos); |
| |
| /* No need to invalidate positions, the RAM is about to be freed. */ |
| |
| /* Release storage */ |
| if (so->keyData != NULL) |
| pfree(so->keyData); |
| /* so->arrayKeyData and so->arrayKeys are in arrayContext */ |
| if (so->arrayContext != NULL) |
| MemoryContextDelete(so->arrayContext); |
| if (so->killedItems != NULL) |
| pfree(so->killedItems); |
| if (so->currTuples != NULL) |
| pfree(so->currTuples); |
| /* so->markTuples should not be pfree'd, see btrescan */ |
| pfree(so); |
| } |
| |
| /* |
| * btmarkpos() -- save current scan position |
| */ |
| void |
| btmarkpos(IndexScanDesc scan) |
| { |
| BTScanOpaque so = (BTScanOpaque) scan->opaque; |
| |
| /* There may be an old mark with a pin (but no lock). */ |
| BTScanPosUnpinIfPinned(so->markPos); |
| |
| /* |
| * Just record the current itemIndex. If we later step to next page |
| * before releasing the marked position, _bt_steppage makes a full copy of |
| * the currPos struct in markPos. If (as often happens) the mark is moved |
| * before we leave the page, we don't have to do that work. |
| */ |
| if (BTScanPosIsValid(so->currPos)) |
| so->markItemIndex = so->currPos.itemIndex; |
| else |
| { |
| BTScanPosInvalidate(so->markPos); |
| so->markItemIndex = -1; |
| } |
| |
| /* Also record the current positions of any array keys */ |
| if (so->numArrayKeys) |
| _bt_mark_array_keys(scan); |
| } |
| |
| /* |
| * btrestrpos() -- restore scan to last saved position |
| */ |
| void |
| btrestrpos(IndexScanDesc scan) |
| { |
| BTScanOpaque so = (BTScanOpaque) scan->opaque; |
| |
| /* Restore the marked positions of any array keys */ |
| if (so->numArrayKeys) |
| _bt_restore_array_keys(scan); |
| |
| if (so->markItemIndex >= 0) |
| { |
| /* |
| * The scan has never moved to a new page since the last mark. Just |
| * restore the itemIndex. |
| * |
| * NB: In this case we can't count on anything in so->markPos to be |
| * accurate. |
| */ |
| so->currPos.itemIndex = so->markItemIndex; |
| } |
| else |
| { |
| /* |
| * The scan moved to a new page after last mark or restore, and we are |
| * now restoring to the marked page. We aren't holding any read |
| * locks, but if we're still holding the pin for the current position, |
| * we must drop it. |
| */ |
| if (BTScanPosIsValid(so->currPos)) |
| { |
| /* Before leaving current page, deal with any killed items */ |
| if (so->numKilled > 0) |
| _bt_killitems(scan); |
| BTScanPosUnpinIfPinned(so->currPos); |
| } |
| |
| if (BTScanPosIsValid(so->markPos)) |
| { |
| /* bump pin on mark buffer for assignment to current buffer */ |
| if (BTScanPosIsPinned(so->markPos)) |
| IncrBufferRefCount(so->markPos.buf); |
| memcpy(&so->currPos, &so->markPos, |
| offsetof(BTScanPosData, items[1]) + |
| so->markPos.lastItem * sizeof(BTScanPosItem)); |
| if (so->currTuples) |
| memcpy(so->currTuples, so->markTuples, |
| so->markPos.nextTupleOffset); |
| } |
| else |
| BTScanPosInvalidate(so->currPos); |
| } |
| } |
| |
| /* |
| * btestimateparallelscan -- estimate storage for BTParallelScanDescData |
| */ |
| Size |
| btestimateparallelscan(void) |
| { |
| return sizeof(BTParallelScanDescData); |
| } |
| |
| /* |
| * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan |
| */ |
| void |
| btinitparallelscan(void *target) |
| { |
| BTParallelScanDesc bt_target = (BTParallelScanDesc) target; |
| |
| SpinLockInit(&bt_target->btps_mutex); |
| bt_target->btps_scanPage = InvalidBlockNumber; |
| bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED; |
| bt_target->btps_arrayKeyCount = 0; |
| ConditionVariableInit(&bt_target->btps_cv); |
| } |
| |
| /* |
| * btparallelrescan() -- reset parallel scan |
| */ |
| void |
| btparallelrescan(IndexScanDesc scan) |
| { |
| BTParallelScanDesc btscan; |
| ParallelIndexScanDesc parallel_scan = scan->parallel_scan; |
| |
| Assert(parallel_scan); |
| |
| btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan, |
| parallel_scan->ps_offset); |
| |
| /* |
| * In theory, we don't need to acquire the spinlock here, because there |
| * shouldn't be any other workers running at this point, but we do so for |
| * consistency. |
| */ |
| SpinLockAcquire(&btscan->btps_mutex); |
| btscan->btps_scanPage = InvalidBlockNumber; |
| btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED; |
| btscan->btps_arrayKeyCount = 0; |
| SpinLockRelease(&btscan->btps_mutex); |
| } |
| |
| /* |
| * _bt_parallel_seize() -- Begin the process of advancing the scan to a new |
| * page. Other scans must wait until we call _bt_parallel_release() |
| * or _bt_parallel_done(). |
| * |
| * The return value is true if we successfully seized the scan and false |
| * if we did not. The latter case occurs if no pages remain for the current |
| * set of scankeys. |
| * |
| * If the return value is true, *pageno returns the next or current page |
| * of the scan (depending on the scan direction). An invalid block number |
| * means the scan hasn't yet started, and P_NONE means we've reached the end. |
| * The first time a participating process reaches the last page, it will return |
| * true and set *pageno to P_NONE; after that, further attempts to seize the |
| * scan will return false. |
| * |
| * Callers should ignore the value of pageno if the return value is false. |
| */ |
| bool |
| _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno) |
| { |
| BTScanOpaque so = (BTScanOpaque) scan->opaque; |
| BTPS_State pageStatus; |
| bool exit_loop = false; |
| bool status = true; |
| ParallelIndexScanDesc parallel_scan = scan->parallel_scan; |
| BTParallelScanDesc btscan; |
| |
| *pageno = P_NONE; |
| |
| btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan, |
| parallel_scan->ps_offset); |
| |
| while (1) |
| { |
| SpinLockAcquire(&btscan->btps_mutex); |
| pageStatus = btscan->btps_pageStatus; |
| |
| if (so->arrayKeyCount < btscan->btps_arrayKeyCount) |
| { |
| /* Parallel scan has already advanced to a new set of scankeys. */ |
| status = false; |
| } |
| else if (pageStatus == BTPARALLEL_DONE) |
| { |
| /* |
| * We're done with this set of scankeys. This may be the end, or |
| * there could be more sets to try. |
| */ |
| status = false; |
| } |
| else if (pageStatus != BTPARALLEL_ADVANCING) |
| { |
| /* |
| * We have successfully seized control of the scan for the purpose |
| * of advancing it to a new page! |
| */ |
| btscan->btps_pageStatus = BTPARALLEL_ADVANCING; |
| *pageno = btscan->btps_scanPage; |
| exit_loop = true; |
| } |
| SpinLockRelease(&btscan->btps_mutex); |
| if (exit_loop || !status) |
| break; |
| ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE); |
| } |
| ConditionVariableCancelSleep(); |
| |
| return status; |
| } |
| |
| /* |
| * _bt_parallel_release() -- Complete the process of advancing the scan to a |
 *		new page.  We now have the new value of btps_scanPage; some other backend
| * can now begin advancing the scan. |
| */ |
| void |
| _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page) |
| { |
| ParallelIndexScanDesc parallel_scan = scan->parallel_scan; |
| BTParallelScanDesc btscan; |
| |
| btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan, |
| parallel_scan->ps_offset); |
| |
| SpinLockAcquire(&btscan->btps_mutex); |
| btscan->btps_scanPage = scan_page; |
| btscan->btps_pageStatus = BTPARALLEL_IDLE; |
| SpinLockRelease(&btscan->btps_mutex); |
| ConditionVariableSignal(&btscan->btps_cv); |
| } |
| |
| /* |
| * _bt_parallel_done() -- Mark the parallel scan as complete. |
| * |
| * When there are no pages left to scan, this function should be called to |
| * notify other workers. Otherwise, they might wait forever for the scan to |
| * advance to the next page. |
| */ |
| void |
| _bt_parallel_done(IndexScanDesc scan) |
| { |
| BTScanOpaque so = (BTScanOpaque) scan->opaque; |
| ParallelIndexScanDesc parallel_scan = scan->parallel_scan; |
| BTParallelScanDesc btscan; |
| bool status_changed = false; |
| |
| /* Do nothing, for non-parallel scans */ |
| if (parallel_scan == NULL) |
| return; |
| |
| btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan, |
| parallel_scan->ps_offset); |
| |
| /* |
| * Mark the parallel scan as done for this combination of scan keys, |
| * unless some other process already did so. See also |
| * _bt_advance_array_keys. |
| */ |
| SpinLockAcquire(&btscan->btps_mutex); |
| if (so->arrayKeyCount >= btscan->btps_arrayKeyCount && |
| btscan->btps_pageStatus != BTPARALLEL_DONE) |
| { |
| btscan->btps_pageStatus = BTPARALLEL_DONE; |
| status_changed = true; |
| } |
| SpinLockRelease(&btscan->btps_mutex); |
| |
| /* wake up all the workers associated with this parallel scan */ |
| if (status_changed) |
| ConditionVariableBroadcast(&btscan->btps_cv); |
| } |
| |
| /* |
| * _bt_parallel_advance_array_keys() -- Advances the parallel scan for array |
| * keys. |
| * |
| * Updates the count of array keys processed for both local and parallel |
| * scans. |
| */ |
| void |
| _bt_parallel_advance_array_keys(IndexScanDesc scan) |
| { |
| BTScanOpaque so = (BTScanOpaque) scan->opaque; |
| ParallelIndexScanDesc parallel_scan = scan->parallel_scan; |
| BTParallelScanDesc btscan; |
| |
| btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan, |
| parallel_scan->ps_offset); |
| |
| so->arrayKeyCount++; |
| SpinLockAcquire(&btscan->btps_mutex); |
| if (btscan->btps_pageStatus == BTPARALLEL_DONE) |
| { |
| btscan->btps_scanPage = InvalidBlockNumber; |
| btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED; |
| btscan->btps_arrayKeyCount++; |
| } |
| SpinLockRelease(&btscan->btps_mutex); |
| } |
| |
| /* |
| * Bulk deletion of all index entries pointing to a set of heap tuples. |
| * The set of target tuples is specified via a callback routine that tells |
| * whether any given heap tuple (identified by ItemPointer) is being deleted. |
| * |
| * Result: a palloc'd struct containing statistical info for VACUUM displays. |
| */ |
| IndexBulkDeleteResult * |
| btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, |
| IndexBulkDeleteCallback callback, void *callback_state) |
| { |
| Relation rel = info->index; |
| BTCycleId cycleid; |
| |
| /* allocate stats if first time through, else re-use existing struct */ |
| if (stats == NULL) |
| stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); |
| |
| /* Establish the vacuum cycle ID to use for this scan */ |
| /* The ENSURE stuff ensures we clean up shared memory on failure */ |
| PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel)); |
| { |
| cycleid = _bt_start_vacuum(rel); |
| |
| btvacuumscan(info, stats, callback, callback_state, cycleid); |
| } |
| PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel)); |
| _bt_end_vacuum(rel); |
| |
| return stats; |
| } |
| |
| /* |
| * Post-VACUUM cleanup. |
| * |
| * Result: a palloc'd struct containing statistical info for VACUUM displays. |
| */ |
| IndexBulkDeleteResult * |
| btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) |
| { |
| BlockNumber num_delpages; |
| |
| /* No-op in ANALYZE ONLY mode */ |
| if (info->analyze_only) |
| return stats; |
| |
| /* |
| * If btbulkdelete was called, we need not do anything (we just maintain |
| * the information used within _bt_vacuum_needs_cleanup() by calling |
| * _bt_set_cleanup_info() below). |
| * |
| * If btbulkdelete was _not_ called, then we have a choice to make: we |
| * must decide whether or not a btvacuumscan() call is needed now (i.e. |
| * whether the ongoing VACUUM operation can entirely avoid a physical scan |
| * of the index). A call to _bt_vacuum_needs_cleanup() decides it for us |
| * now. |
| */ |
| if (stats == NULL) |
| { |
| /* Check if VACUUM operation can entirely avoid btvacuumscan() call */ |
| if (!_bt_vacuum_needs_cleanup(info->index)) |
| return NULL; |
| |
| /* |
| * Since we aren't going to actually delete any leaf items, there's no |
| * need to go through all the vacuum-cycle-ID pushups here. |
| * |
| * Posting list tuples are a source of inaccuracy for cleanup-only |
| * scans. btvacuumscan() will assume that the number of index tuples |
| * from each page can be used as num_index_tuples, even though |
| * num_index_tuples is supposed to represent the number of TIDs in the |
| * index. This naive approach can underestimate the number of tuples |
| * in the index significantly. |
| * |
| * We handle the problem by making num_index_tuples an estimate in |
| * cleanup-only case. |
| */ |
| stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); |
| btvacuumscan(info, stats, NULL, NULL, 0); |
| } |
| |
| /* |
| * Maintain num_delpages value in metapage for _bt_vacuum_needs_cleanup(). |
| * |
| * num_delpages is the number of deleted pages now in the index that were |
| * not safe to place in the FSM to be recycled just yet. num_delpages is |
| * greater than 0 only when _bt_pagedel() actually deleted pages during |
| * our call to btvacuumscan(). Even then, _bt_pendingfsm_finalize() must |
| * have failed to place any newly deleted pages in the FSM just moments |
| * ago. (Actually, there are edge cases where recycling of the current |
| * VACUUM's newly deleted pages does not even become safe by the time the |
| * next VACUUM comes around. See nbtree/README.) |
| */ |
| Assert(stats->pages_deleted >= stats->pages_free); |
| num_delpages = stats->pages_deleted - stats->pages_free; |
| _bt_set_cleanup_info(info->index, num_delpages); |
| |
| /* |
| * It's quite possible for us to be fooled by concurrent page splits into |
| * double-counting some index tuples, so disbelieve any total that exceeds |
| * the underlying heap's count ... if we know that accurately. Otherwise |
| * this might just make matters worse. |
| * |
| * Posting list tuples are another source of inaccuracy. Cleanup-only |
| * btvacuumscan calls assume that the number of index tuples can be used |
| * as num_index_tuples, even though num_index_tuples is supposed to |
| * represent the number of TIDs in the index. This naive approach can |
| * underestimate the number of tuples in the index. |
| */ |
| if (!info->estimated_count) |
| { |
| if (stats->num_index_tuples > info->num_heap_tuples) |
| stats->num_index_tuples = info->num_heap_tuples; |
| } |
| |
| return stats; |
| } |
| |
| /* |
| * btvacuumscan --- scan the index for VACUUMing purposes |
| * |
| * This combines the functions of looking for leaf tuples that are deletable |
| * according to the vacuum callback, looking for empty pages that can be |
| * deleted, and looking for old deleted pages that can be recycled. Both |
| * btbulkdelete and btvacuumcleanup invoke this (the latter only if no |
| * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true). |
| * |
| * The caller is responsible for initially allocating/zeroing a stats struct |
| * and for obtaining a vacuum cycle ID if necessary. |
| */ |
| static void |
| btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, |
| IndexBulkDeleteCallback callback, void *callback_state, |
| BTCycleId cycleid) |
| { |
| Relation rel = info->index; |
| BTVacState vstate; |
| BlockNumber num_pages; |
| BlockNumber scanblkno; |
| bool needLock; |
| |
| /* |
| * Reset fields that track information about the entire index now. This |
| * avoids double-counting in the case where a single VACUUM command |
| * requires multiple scans of the index. |
| * |
| * Avoid resetting the tuples_removed and pages_newly_deleted fields here, |
| * since they track information about the VACUUM command, and so must last |
| * across each call to btvacuumscan(). |
| * |
| * (Note that pages_free is treated as state about the whole index, not |
| * the current VACUUM. This is appropriate because RecordFreeIndexPage() |
| * calls are idempotent, and get repeated for the same deleted pages in |
| * some scenarios. The point for us is to track the number of recyclable |
| * pages in the index at the end of the VACUUM command.) |
| */ |
| stats->num_pages = 0; |
| stats->num_index_tuples = 0; |
| stats->pages_deleted = 0; |
| stats->pages_free = 0; |
| |
| /* Set up info to pass down to btvacuumpage */ |
| vstate.info = info; |
| vstate.stats = stats; |
| vstate.callback = callback; |
| vstate.callback_state = callback_state; |
| vstate.cycleid = cycleid; |
| |
| /* Create a temporary memory context to run _bt_pagedel in */ |
| vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext, |
| "_bt_pagedel", |
| ALLOCSET_DEFAULT_SIZES); |
| |
| /* Initialize vstate fields used by _bt_pendingfsm_finalize */ |
| vstate.bufsize = 0; |
| vstate.maxbufsize = 0; |
| vstate.pendingpages = NULL; |
| vstate.npendingpages = 0; |
| /* Consider applying _bt_pendingfsm_finalize optimization */ |
| _bt_pendingfsm_init(rel, &vstate, (callback == NULL)); |
| |
| /* |
| * The outer loop iterates over all index pages except the metapage, in |
| * physical order (we hope the kernel will cooperate in providing |
| * read-ahead for speed). It is critical that we visit all leaf pages, |
| * including ones added after we start the scan, else we might fail to |
| * delete some deletable tuples. Hence, we must repeatedly check the |
| * relation length. We must acquire the relation-extension lock while |
| * doing so to avoid a race condition: if someone else is extending the |
| * relation, there is a window where bufmgr/smgr have created a new |
| * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If |
| * we manage to scan such a page here, we'll improperly assume it can be |
| * recycled. Taking the lock synchronizes things enough to prevent a |
| * problem: either num_pages won't include the new page, or _bt_getbuf |
| * already has write lock on the buffer and it will be fully initialized |
| * before we can examine it. Also, we need not worry if a page is added |
| * immediately after we look; the page splitting code already has |
| * write-lock on the left page before it adds a right page, so we must |
| * already have processed any tuples due to be moved into such a page. |
| * |
| * XXX: Now that new pages are locked with RBM_ZERO_AND_LOCK, I don't |
| * think the use of the extension lock is still required. |
| * |
| * We can skip locking for new or temp relations, however, since no one |
| * else could be accessing them. |
| */ |
| needLock = !RELATION_IS_LOCAL(rel); |
| |
| scanblkno = BTREE_METAPAGE + 1; |
| for (;;) |
| { |
| /* Get the current relation length */ |
| if (needLock) |
| LockRelationForExtension(rel, ExclusiveLock); |
| num_pages = RelationGetNumberOfBlocks(rel); |
| if (needLock) |
| UnlockRelationForExtension(rel, ExclusiveLock); |
| |
| if (info->report_progress) |
| pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL, |
| num_pages); |
| |
| /* Quit if we've scanned the whole relation */ |
| if (scanblkno >= num_pages) |
| break; |
| /* Iterate over pages, then loop back to recheck length */ |
| for (; scanblkno < num_pages; scanblkno++) |
| { |
| btvacuumpage(&vstate, scanblkno); |
| if (info->report_progress) |
| pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE, |
| scanblkno); |
| } |
| } |
| |
| /* Set statistics num_pages field to final size of index */ |
| stats->num_pages = num_pages; |
| |
| MemoryContextDelete(vstate.pagedelcontext); |
| |
| /* |
| * If there were any calls to _bt_pagedel() during scan of the index then |
| * see if any of the resulting pages can be placed in the FSM now. When |
| * it's not safe we'll have to leave it up to a future VACUUM operation. |
| * |
| * Finally, if we placed any pages in the FSM (either just now or during |
| * the scan), forcibly update the upper-level FSM pages to ensure that |
| * searchers can find them. |
| */ |
| _bt_pendingfsm_finalize(rel, &vstate); |
| if (stats->pages_free > 0) |
| IndexFreeSpaceMapVacuum(rel); |
| } |
| |
| /* |
| * btvacuumpage --- VACUUM one page |
| * |
| * This processes a single page for btvacuumscan(). In some cases we must |
| * backtrack to re-examine and VACUUM pages that were the scanblkno during |
| * a previous call here. This is how we handle page splits (that happened |
| * after our cycleid was acquired) whose right half page happened to reuse |
| * a block that we might have processed at some point before it was |
| * recycled (i.e. before the page split). |
| */ |
| static void |
| btvacuumpage(BTVacState *vstate, BlockNumber scanblkno) |
| { |
| IndexVacuumInfo *info = vstate->info; |
| IndexBulkDeleteResult *stats = vstate->stats; |
| IndexBulkDeleteCallback callback = vstate->callback; |
| void *callback_state = vstate->callback_state; |
| Relation rel = info->index; |
| Relation heaprel = info->heaprel; |
| bool attempt_pagedel; |
| BlockNumber blkno, |
| backtrack_to; |
| Buffer buf; |
| Page page; |
| BTPageOpaque opaque; |
| |
| blkno = scanblkno; |
| |
| backtrack: |
| |
| attempt_pagedel = false; |
| backtrack_to = P_NONE; |
| |
| /* call vacuum_delay_point while not holding any buffer lock */ |
| vacuum_delay_point(); |
| |
| /* |
| * We can't use _bt_getbuf() here because it always applies |
| * _bt_checkpage(), which will barf on an all-zero page. We want to |
| * recycle all-zero pages, not fail. Also, we want to use a nondefault |
| * buffer access strategy. |
| */ |
| buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, |
| info->strategy); |
| _bt_lockbuf(rel, buf, BT_READ); |
| page = BufferGetPage(buf); |
| opaque = NULL; |
| if (!PageIsNew(page)) |
| { |
| _bt_checkpage(rel, buf); |
| opaque = BTPageGetOpaque(page); |
| } |
| |
| Assert(blkno <= scanblkno); |
| if (blkno != scanblkno) |
| { |
| /* |
| * We're backtracking. |
| * |
| * We followed a right link to a sibling leaf page (a page that |
| * happens to be from a block located before scanblkno). The only |
| * case we want to do anything with is a live leaf page having the |
| * current vacuum cycle ID. |
| * |
| * The page had better be in a state that's consistent with what we |
| * expect. Check for conditions that imply corruption in passing. It |
| * can't be half-dead because only an interrupted VACUUM process can |
| * leave pages in that state, so we'd definitely have dealt with it |
| * back when the page was the scanblkno page (half-dead pages are |
| * always marked fully deleted by _bt_pagedel(), barring corruption). |
| */ |
| if (!opaque || !P_ISLEAF(opaque) || P_ISHALFDEAD(opaque)) |
| { |
| Assert(false); |
| ereport(LOG, |
| (errcode(ERRCODE_INDEX_CORRUPTED), |
| errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"", |
| blkno, scanblkno, RelationGetRelationName(rel)))); |
| _bt_relbuf(rel, buf); |
| return; |
| } |
| |
| /* |
| * We may have already processed the page in an earlier call, when the |
| * page was scanblkno. This happens when the leaf page split occurred |
| * after the scan began, but before the right sibling page became the |
| * scanblkno. |
| * |
| * Page may also have been deleted by current btvacuumpage() call, |
| * since _bt_pagedel() sometimes deletes the right sibling page of |
| * scanblkno in passing (it does so after we decided where to |
| * backtrack to). We don't need to process this page as a deleted |
| * page a second time now (in fact, it would be wrong to count it as a |
| * deleted page in the bulk delete statistics a second time). |
| */ |
| if (opaque->btpo_cycleid != vstate->cycleid || P_ISDELETED(opaque)) |
| { |
| /* Done with current scanblkno (and all lower split pages) */ |
| _bt_relbuf(rel, buf); |
| return; |
| } |
| } |
| |
| if (!opaque || BTPageIsRecyclable(page, heaprel)) |
| { |
| /* Okay to recycle this page (which could be leaf or internal) */ |
| RecordFreeIndexPage(rel, blkno); |
| stats->pages_deleted++; |
| stats->pages_free++; |
| } |
| else if (P_ISDELETED(opaque)) |
| { |
| /* |
| * Already deleted page (which could be leaf or internal). Can't |
| * recycle yet. |
| */ |
| stats->pages_deleted++; |
| } |
| else if (P_ISHALFDEAD(opaque)) |
| { |
| /* Half-dead leaf page (from interrupted VACUUM) -- finish deleting */ |
| attempt_pagedel = true; |
| |
| /* |
| * _bt_pagedel() will increment both pages_newly_deleted and |
| * pages_deleted stats in all cases (barring corruption) |
| */ |
| } |
| else if (P_ISLEAF(opaque)) |
| { |
| OffsetNumber deletable[MaxIndexTuplesPerPage]; |
| int ndeletable; |
| BTVacuumPosting updatable[MaxIndexTuplesPerPage]; |
| int nupdatable; |
| OffsetNumber offnum, |
| minoff, |
| maxoff; |
| int nhtidsdead, |
| nhtidslive; |
| |
| /* |
| * Trade in the initial read lock for a full cleanup lock on this |
| * page. We must get such a lock on every leaf page over the course |
| * of the vacuum scan, whether or not it actually contains any |
| * deletable tuples --- see nbtree/README. |
| */ |
| _bt_upgradelockbufcleanup(rel, buf); |
| |
| /* |
| * Check whether we need to backtrack to earlier pages. What we are |
| * concerned about is a page split that happened since we started the |
| * vacuum scan. If the split moved tuples on the right half of the |
| * split (i.e. the tuples that sort high) to a block that we already |
| * passed over, then we might have missed the tuples. We need to |
| * backtrack now. (Must do this before possibly clearing btpo_cycleid |
| * or deleting scanblkno page below!) |
| */ |
| if (vstate->cycleid != 0 && |
| opaque->btpo_cycleid == vstate->cycleid && |
| !(opaque->btpo_flags & BTP_SPLIT_END) && |
| !P_RIGHTMOST(opaque) && |
| opaque->btpo_next < scanblkno) |
| backtrack_to = opaque->btpo_next; |
| |
| ndeletable = 0; |
| nupdatable = 0; |
| minoff = P_FIRSTDATAKEY(opaque); |
| maxoff = PageGetMaxOffsetNumber(page); |
| nhtidsdead = 0; |
| nhtidslive = 0; |
| if (callback) |
| { |
| /* btbulkdelete callback tells us what to delete (or update) */ |
| for (offnum = minoff; |
| offnum <= maxoff; |
| offnum = OffsetNumberNext(offnum)) |
| { |
| IndexTuple itup; |
| |
| itup = (IndexTuple) PageGetItem(page, |
| PageGetItemId(page, offnum)); |
| |
| Assert(!BTreeTupleIsPivot(itup)); |
| if (!BTreeTupleIsPosting(itup)) |
| { |
| /* Regular tuple, standard table TID representation */ |
| if (callback(&itup->t_tid, callback_state)) |
| { |
| deletable[ndeletable++] = offnum; |
| nhtidsdead++; |
| } |
| else |
| nhtidslive++; |
| } |
| else |
| { |
| BTVacuumPosting vacposting; |
| int nremaining; |
| |
| /* Posting list tuple */ |
| vacposting = btreevacuumposting(vstate, itup, offnum, |
| &nremaining); |
| if (vacposting == NULL) |
| { |
| /* |
| * All table TIDs from the posting tuple remain, so no |
| * delete or update required |
| */ |
| Assert(nremaining == BTreeTupleGetNPosting(itup)); |
| } |
| else if (nremaining > 0) |
| { |
| |
| /* |
| * Store metadata about posting list tuple in |
| * updatable array for entire page. Existing tuple |
| * will be updated during the later call to |
| * _bt_delitems_vacuum(). |
| */ |
| Assert(nremaining < BTreeTupleGetNPosting(itup)); |
| updatable[nupdatable++] = vacposting; |
| nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining; |
| } |
| else |
| { |
| /* |
| * All table TIDs from the posting list must be |
| * deleted. We'll delete the index tuple completely |
| * (no update required). |
| */ |
| Assert(nremaining == 0); |
| deletable[ndeletable++] = offnum; |
| nhtidsdead += BTreeTupleGetNPosting(itup); |
| pfree(vacposting); |
| } |
| |
| nhtidslive += nremaining; |
| } |
| } |
| } |
| |
| /* |
| * Apply any needed deletes or updates. We issue just one |
| * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic. |
| */ |
| if (ndeletable > 0 || nupdatable > 0) |
| { |
| Assert(nhtidsdead >= ndeletable + nupdatable); |
| _bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable, |
| nupdatable); |
| |
| stats->tuples_removed += nhtidsdead; |
| /* must recompute maxoff */ |
| maxoff = PageGetMaxOffsetNumber(page); |
| |
| /* can't leak memory here */ |
| for (int i = 0; i < nupdatable; i++) |
| pfree(updatable[i]); |
| } |
| else |
| { |
| /* |
| * If the leaf page has been split during this vacuum cycle, it |
| * seems worth expending a write to clear btpo_cycleid even if we |
| * don't have any deletions to do. (If we do, _bt_delitems_vacuum |
| * takes care of this.) This ensures we won't process the page |
| * again. |
| * |
| * We treat this like a hint-bit update because there's no need to |
| * WAL-log it. |
| */ |
| Assert(nhtidsdead == 0); |
| if (vstate->cycleid != 0 && |
| opaque->btpo_cycleid == vstate->cycleid) |
| { |
| opaque->btpo_cycleid = 0; |
| MarkBufferDirtyHint(buf, true); |
| } |
| } |
| |
| /* |
| * If the leaf page is now empty, try to delete it; else count the |
| * live tuples (live table TIDs in posting lists are counted as |
| * separate live tuples). We don't delete when backtracking, though, |
| * since that would require teaching _bt_pagedel() about backtracking |
| * (doesn't seem worth adding more complexity to deal with that). |
| * |
| * We don't count the number of live TIDs during cleanup-only calls to |
| * btvacuumscan (i.e. when callback is not set). We count the number |
| * of index tuples directly instead. This avoids the expense of |
| * directly examining all of the tuples on each page. VACUUM will |
| * treat num_index_tuples as an estimate in cleanup-only case, so it |
| * doesn't matter that this underestimates num_index_tuples |
| * significantly in some cases. |
| */ |
| if (minoff > maxoff) |
| attempt_pagedel = (blkno == scanblkno); |
| else if (callback) |
| stats->num_index_tuples += nhtidslive; |
| else |
| stats->num_index_tuples += maxoff - minoff + 1; |
| |
| Assert(!attempt_pagedel || nhtidslive == 0); |
| } |
| |
| if (attempt_pagedel) |
| { |
| MemoryContext oldcontext; |
| |
| /* Run pagedel in a temp context to avoid memory leakage */ |
| MemoryContextReset(vstate->pagedelcontext); |
| oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext); |
| |
| /* |
| * _bt_pagedel maintains the bulk delete stats on our behalf; |
| * pages_newly_deleted and pages_deleted are likely to be incremented |
| * during call |
| */ |
| Assert(blkno == scanblkno); |
| _bt_pagedel(rel, buf, vstate); |
| |
| MemoryContextSwitchTo(oldcontext); |
| /* pagedel released buffer, so we shouldn't */ |
| } |
| else |
| _bt_relbuf(rel, buf); |
| |
| if (backtrack_to != P_NONE) |
| { |
| blkno = backtrack_to; |
| goto backtrack; |
| } |
| } |
| |
| /* |
| * btreevacuumposting --- determine TIDs still needed in posting list |
| * |
| * Returns metadata describing how to build replacement tuple without the TIDs |
| * that VACUUM needs to delete. Returned value is NULL in the common case |
| * where no changes are needed to caller's posting list tuple (we avoid |
| * allocating memory here as an optimization). |
| * |
| * The number of TIDs that should remain in the posting list tuple is set for |
| * caller in *nremaining. |
| */ |
| static BTVacuumPosting |
| btreevacuumposting(BTVacState *vstate, IndexTuple posting, |
| OffsetNumber updatedoffset, int *nremaining) |
| { |
| int live = 0; |
| int nitem = BTreeTupleGetNPosting(posting); |
| ItemPointer items = BTreeTupleGetPosting(posting); |
| BTVacuumPosting vacposting = NULL; |
| |
| for (int i = 0; i < nitem; i++) |
| { |
| if (!vstate->callback(items + i, vstate->callback_state)) |
| { |
| /* Live table TID */ |
| live++; |
| } |
| else if (vacposting == NULL) |
| { |
| /* |
| * First dead table TID encountered. |
| * |
| * It's now clear that we need to delete one or more dead table |
| * TIDs, so start maintaining metadata describing how to update |
| * existing posting list tuple. |
| */ |
| vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) + |
| nitem * sizeof(uint16)); |
| |
| vacposting->itup = posting; |
| vacposting->updatedoffset = updatedoffset; |
| vacposting->ndeletedtids = 0; |
| vacposting->deletetids[vacposting->ndeletedtids++] = i; |
| } |
| else |
| { |
| /* Second or subsequent dead table TID */ |
| vacposting->deletetids[vacposting->ndeletedtids++] = i; |
| } |
| } |
| |
| *nremaining = live; |
| return vacposting; |
| } |
| |
| /* |
| * btcanreturn() -- Check whether btree indexes support index-only scans. |
| * |
| * btrees always do, so this is trivial. |
| */ |
| bool |
| btcanreturn(Relation index, int attno) |
| { |
| return true; |
| } |