| /*------------------------------------------------------------------------- |
| * |
| * nbtree.c |
| * Implementation of Lehman and Yao's btree management algorithm for |
| * Postgres. |
| * |
| * NOTES |
| * This file contains only the public interface routines. |
| * |
| * |
| * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group |
| * Portions Copyright (c) 1994, Regents of the University of California |
| * |
| * IDENTIFICATION |
| * src/backend/access/nbtree/nbtree.c |
| * |
| *------------------------------------------------------------------------- |
| */ |
| #include "postgres.h" |
| |
| #include "access/nbtree.h" |
| #include "access/nbtxlog.h" |
| #include "access/relscan.h" |
| #include "access/xlog.h" |
| #include "access/xloginsert.h" |
| #include "commands/progress.h" |
| #include "commands/vacuum.h" |
| #include "miscadmin.h" |
| #include "nodes/execnodes.h" |
| #include "pgstat.h" |
| #include "postmaster/autovacuum.h" |
| #include "storage/condition_variable.h" |
| #include "storage/indexfsm.h" |
| #include "storage/ipc.h" |
| #include "storage/lmgr.h" |
| #include "storage/smgr.h" |
| #include "utils/builtins.h" |
| #include "utils/index_selfuncs.h" |
| #include "utils/memutils.h" |
| #include "utils/guc.h" |
| |
| #include "catalog/indexing.h" |
| #include "catalog/pg_namespace.h" |
| |
| /* |
| * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started. |
| * |
| * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to |
| * a new page; others must wait. |
| * |
| * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan |
| * to a new page; some process can start doing that. |
| * |
| * BTPARALLEL_DONE indicates that the scan is complete (including error exit). |
| * We reach this state once for every distinct combination of array keys. |
| */ |
| typedef enum |
| { |
| BTPARALLEL_NOT_INITIALIZED, |
| BTPARALLEL_ADVANCING, |
| BTPARALLEL_IDLE, |
| BTPARALLEL_DONE |
| } BTPS_State; |
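
/*
 * Sketch of the normal flow through these states, as driven by the
 * _bt_parallel_* functions later in this file (simplified; error paths and
 * concurrent waiters are omitted):
 *
 *     BTPARALLEL_NOT_INITIALIZED  --_bt_parallel_seize()-->    BTPARALLEL_ADVANCING
 *     BTPARALLEL_ADVANCING        --_bt_parallel_release()-->  BTPARALLEL_IDLE
 *     BTPARALLEL_IDLE             --_bt_parallel_seize()-->    BTPARALLEL_ADVANCING
 *     any of the above            --_bt_parallel_done()-->     BTPARALLEL_DONE
 *     BTPARALLEL_DONE             --_bt_parallel_advance_array_keys()-->
 *                                                BTPARALLEL_NOT_INITIALIZED
 */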
| |
| /* |
| * BTParallelScanDescData contains btree specific shared information required |
| * for parallel scan. |
| */ |
| typedef struct BTParallelScanDescData |
| { |
| BlockNumber btps_scanPage; /* latest or next page to be scanned */ |
| BTPS_State btps_pageStatus; /* indicates whether next page is |
| * available for scan. see above for |
| * possible states of parallel scan. */ |
| int btps_arrayKeyCount; /* count indicating number of array scan |
| * keys processed by parallel scan */ |
| slock_t btps_mutex; /* protects above variables */ |
| ConditionVariable btps_cv; /* used to synchronize parallel scan */ |
| } BTParallelScanDescData; |
| |
| typedef struct BTParallelScanDescData *BTParallelScanDesc; |
| |
| |
| static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, |
| IndexBulkDeleteCallback callback, void *callback_state, |
| BTCycleId cycleid); |
| static void btvacuumpage(BTVacState *vstate, BlockNumber scanblkno); |
| static BTVacuumPosting btreevacuumposting(BTVacState *vstate, |
| IndexTuple posting, |
| OffsetNumber updatedoffset, |
| int *nremaining); |
| |
| |
| /* |
| * Btree handler function: return IndexAmRoutine with access method parameters |
| * and callbacks. |
| */ |
| Datum |
| bthandler(PG_FUNCTION_ARGS) |
| { |
| IndexAmRoutine *amroutine = makeNode(IndexAmRoutine); |
| |
| amroutine->amstrategies = BTMaxStrategyNumber; |
| amroutine->amsupport = BTNProcs; |
| amroutine->amoptsprocnum = BTOPTIONS_PROC; |
| amroutine->amcanorder = true; |
| amroutine->amcanorderbyop = false; |
| amroutine->amcanbackward = true; |
| amroutine->amcanunique = true; |
| amroutine->amcanmulticol = true; |
| amroutine->amoptionalkey = true; |
| amroutine->amsearcharray = true; |
| amroutine->amsearchnulls = true; |
| amroutine->amstorage = false; |
| amroutine->amclusterable = true; |
| amroutine->ampredlocks = true; |
| amroutine->amcanparallel = true; |
| amroutine->amcaninclude = true; |
| amroutine->amusemaintenanceworkmem = false; |
| amroutine->amsummarizing = false; |
| amroutine->amparallelvacuumoptions = |
| VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP; |
| amroutine->amkeytype = InvalidOid; |
| |
| amroutine->ambuild = btbuild; |
| amroutine->ambuildempty = btbuildempty; |
| amroutine->aminsert = btinsert; |
| amroutine->ambulkdelete = btbulkdelete; |
| amroutine->amvacuumcleanup = btvacuumcleanup; |
| amroutine->amcanreturn = btcanreturn; |
| amroutine->amcostestimate = btcostestimate; |
| amroutine->amoptions = btoptions; |
| amroutine->amproperty = btproperty; |
| amroutine->ambuildphasename = btbuildphasename; |
| amroutine->amvalidate = btvalidate; |
| amroutine->amadjustmembers = btadjustmembers; |
| amroutine->ambeginscan = btbeginscan; |
| amroutine->amrescan = btrescan; |
| amroutine->amgettuple = btgettuple; |
| amroutine->amgetbitmap = btgetbitmap; |
| amroutine->amendscan = btendscan; |
| amroutine->ammarkpos = btmarkpos; |
| amroutine->amrestrpos = btrestrpos; |
| amroutine->amestimateparallelscan = btestimateparallelscan; |
| amroutine->aminitparallelscan = btinitparallelscan; |
| amroutine->amparallelrescan = btparallelrescan; |
| |
| PG_RETURN_POINTER(amroutine); |
| } |
| |
| /* |
| * btbuildempty() -- build an empty btree index in the initialization fork |
| */ |
| void |
| btbuildempty(Relation index) |
| { |
| bool allequalimage = _bt_allequalimage(index, false); |
| Buffer metabuf; |
| Page metapage; |
| |
| /* |
	 * Initialize the metapage.
| * |
| * Regular index build bypasses the buffer manager and uses smgr functions |
| * directly, with an smgrimmedsync() call at the end. That makes sense |
| * when the index is large, but for an empty index, it's better to use the |
| * buffer cache to avoid the smgrimmedsync(). |
| */ |
| |
| metabuf = ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL); |
| metapage = BufferGetPage(metabuf); |
| PageEncryptInplace(metapage, INIT_FORKNUM, |
| BTREE_METAPAGE); |
| Assert(BufferGetBlockNumber(metabuf) == BTREE_METAPAGE); |
| _bt_lockbuf(index, metabuf, BT_WRITE); |
| |
| START_CRIT_SECTION(); |
| |
| metapage = BufferGetPage(metabuf); |
| _bt_initmetapage(metapage, P_NONE, 0, allequalimage); |
| MarkBufferDirty(metabuf); |
| log_newpage_buffer(metabuf, true); |
| |
| END_CRIT_SECTION(); |
| |
| _bt_unlockbuf(index, metabuf); |
| ReleaseBuffer(metabuf); |
| } |
| |
| /* |
 * For a newly inserted heap TID, check whether an entry with this TID
 * already exists in a unique index.  If it does, abort the inserting
 * transaction.
| */ |
| static void |
| _bt_validate_tid(Relation irel, ItemPointer h_tid) |
| { |
| BlockNumber blkno; |
| BlockNumber num_pages; |
| Buffer buf; |
| Page page; |
| BTPageOpaque opaque; |
| IndexTuple itup; |
| OffsetNumber maxoff, |
| minoff, |
| offnum; |
| |
| elog(DEBUG1, "validating tid (%d,%d) for index (%s)", |
| ItemPointerGetBlockNumber(h_tid), ItemPointerGetOffsetNumber(h_tid), |
| RelationGetRelationName(irel)); |
| |
| blkno = BTREE_METAPAGE + 1; |
| num_pages = RelationGetNumberOfBlocks(irel); |
| |
| for (; blkno < num_pages; blkno++) |
| { |
| buf = ReadBuffer(irel, blkno); |
| page = BufferGetPage(buf); |
| opaque = (BTPageOpaque) PageGetSpecialPointer(page); |
| if (!PageIsNew(page)) |
| _bt_checkpage(irel, buf); |
| if (P_ISLEAF(opaque)) |
| { |
| minoff = P_FIRSTDATAKEY(opaque); |
| maxoff = PageGetMaxOffsetNumber(page); |
| for (offnum = minoff; |
| offnum <= maxoff; |
| offnum = OffsetNumberNext(offnum)) |
| { |
| itup = (IndexTuple) PageGetItem(page, |
| PageGetItemId(page, offnum)); |
| if (ItemPointerEquals(&itup->t_tid, h_tid)) |
| { |
| Form_pg_attribute key_att = TupleDescAttr(RelationGetDescr(irel), 0); |
| Oid key = InvalidOid; |
| bool isnull; |
| if (key_att->atttypid == OIDOID) |
| { |
						key = DatumGetObjectId(
											   index_getattr(itup, 1, RelationGetDescr(irel), &isnull));
						elog(ERROR, "found tid (%d,%d), %s (%u) already in index (%s)",
							 ItemPointerGetBlockNumber(h_tid), ItemPointerGetOffsetNumber(h_tid),
							 NameStr(key_att->attname), key, RelationGetRelationName(irel));
| } |
| else |
| { |
| elog(ERROR, "found tid (%d,%d) already in index (%s)", |
| ItemPointerGetBlockNumber(h_tid), ItemPointerGetOffsetNumber(h_tid), |
| RelationGetRelationName(irel)); |
| } |
| } |
| } |
| } |
| ReleaseBuffer(buf); |
| } |
| } |
| |
| /* |
| * btinsert() -- insert an index tuple into a btree. |
| * |
| * Descend the tree recursively, find the appropriate location for our |
| * new tuple, and put it there. |
| */ |
| bool |
| btinsert(Relation rel, Datum *values, bool *isnull, |
| ItemPointer ht_ctid, Relation heapRel, |
| IndexUniqueCheck checkUnique, |
| bool indexUnchanged, |
| IndexInfo *indexInfo) |
| { |
| bool result; |
| IndexTuple itup; |
| |
| if (checkUnique && ( |
| (gp_indexcheck_insert == INDEX_CHECK_ALL && RelationIsHeap(heapRel)) || |
| (gp_indexcheck_insert == INDEX_CHECK_SYSTEM && |
| PG_CATALOG_NAMESPACE == RelationGetNamespace(heapRel)))) |
| { |
| _bt_validate_tid(rel, ht_ctid); |
| } |
| |
| /* generate an index tuple */ |
| itup = index_form_tuple(RelationGetDescr(rel), values, isnull); |
| itup->t_tid = *ht_ctid; |
| |
| result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel); |
| |
| pfree(itup); |
| |
| return result; |
| } |
| |
| /* |
| * btgettuple() -- Get the next tuple in the scan. |
| */ |
| bool |
| btgettuple(IndexScanDesc scan, ScanDirection dir) |
| { |
| BTScanOpaque so = (BTScanOpaque) scan->opaque; |
| bool res; |
| |
| /* btree indexes are never lossy */ |
| scan->xs_recheck = false; |
| |
| /* |
| * If we have any array keys, initialize them during first call for a |
| * scan. We can't do this in btrescan because we don't know the scan |
| * direction at that time. |
| */ |
| if (so->numArrayKeys && !BTScanPosIsValid(so->currPos)) |
| { |
| /* punt if we have any unsatisfiable array keys */ |
| if (so->numArrayKeys < 0) |
| return false; |
| |
| _bt_start_array_keys(scan, dir); |
| } |
| |
| /* This loop handles advancing to the next array elements, if any */ |
| do |
| { |
| /* |
| * If we've already initialized this scan, we can just advance it in |
| * the appropriate direction. If we haven't done so yet, we call |
| * _bt_first() to get the first item in the scan. |
| */ |
| if (!BTScanPosIsValid(so->currPos)) |
| res = _bt_first(scan, dir); |
| else |
| { |
| /* |
| * Check to see if we should kill the previously-fetched tuple. |
| */ |
| if (scan->kill_prior_tuple) |
| { |
| /* |
| * Yes, remember it for later. (We'll deal with all such |
| * tuples at once right before leaving the index page.) The |
| * test for numKilled overrun is not just paranoia: if the |
| * caller reverses direction in the indexscan then the same |
| * item might get entered multiple times. It's not worth |
| * trying to optimize that, so we don't detect it, but instead |
| * just forget any excess entries. |
| */ |
| if (so->killedItems == NULL) |
| so->killedItems = (int *) |
| palloc(MaxTIDsPerBTreePage * sizeof(int)); |
| if (so->numKilled < MaxTIDsPerBTreePage) |
| so->killedItems[so->numKilled++] = so->currPos.itemIndex; |
| } |
| |
| /* |
| * Now continue the scan. |
| */ |
| res = _bt_next(scan, dir); |
| } |
| |
| /* If we have a tuple, return it ... */ |
| if (res) |
| break; |
| /* ... otherwise see if we have more array keys to deal with */ |
| } while (so->numArrayKeys && _bt_advance_array_keys(scan, dir)); |
| |
| return res; |
| } |
| |
| /* |
| * btgetbitmap() -- construct a TIDBitmap. |
 *
 * GPDB_14_MERGE_FIXME: the upstream code assumes that the second argument is
 * always a non-NULL TIDBitmap *.  It might be possible to align this code
 * with upstream.
| */ |
| int64 |
| btgetbitmap(IndexScanDesc scan, Node **bmNodeP) |
| { |
| TIDBitmap *tbm; |
| BTScanOpaque so = (BTScanOpaque) scan->opaque; |
| int64 ntids = 0; |
| ItemPointer heapTid; |
| |
| /* |
	 * GPDB-specific code.  GPDB also supports StreamBitmap for bitmap
	 * indexes, so the amgetbitmap routine normally needs to create the
	 * specific bitmap node type itself.
	 */
| Assert(bmNodeP); |
| if (*bmNodeP == NULL) |
| { |
| /* XXX should we use less than work_mem for this? */ |
| tbm = tbm_create(work_mem * 1024L, scan->dsa); |
| *bmNodeP = (Node *) tbm; |
| } |
| else if (!IsA(*bmNodeP, TIDBitmap)) |
| elog(ERROR, "non btree bitmap"); |
| else |
		tbm = (TIDBitmap *) *bmNodeP;
| |
| /* |
| * If we have any array keys, initialize them. |
| */ |
| if (so->numArrayKeys) |
| { |
| /* punt if we have any unsatisfiable array keys */ |
| if (so->numArrayKeys < 0) |
| return ntids; |
| |
| _bt_start_array_keys(scan, ForwardScanDirection); |
| } |
| |
| /* This loop handles advancing to the next array elements, if any */ |
| do |
| { |
| /* Fetch the first page & tuple */ |
| if (_bt_first(scan, ForwardScanDirection)) |
| { |
| /* Save tuple ID, and continue scanning */ |
| heapTid = &scan->xs_heaptid; |
| tbm_add_tuples(tbm, heapTid, 1, false); |
| ntids++; |
| |
| for (;;) |
| { |
| /* |
| * Advance to next tuple within page. This is the same as the |
| * easy case in _bt_next(). |
| */ |
| if (++so->currPos.itemIndex > so->currPos.lastItem) |
| { |
| /* let _bt_next do the heavy lifting */ |
| if (!_bt_next(scan, ForwardScanDirection)) |
| break; |
| } |
| |
| /* Save tuple ID, and continue scanning */ |
| heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid; |
| tbm_add_tuples(tbm, heapTid, 1, false); |
| ntids++; |
| } |
| } |
| /* Now see if we have more array keys to deal with */ |
| } while (so->numArrayKeys && _bt_advance_array_keys(scan, ForwardScanDirection)); |
| |
| return ntids; |
| } |
| |
| /* |
| * btbeginscan() -- start a scan on a btree index |
| */ |
| IndexScanDesc |
| btbeginscan(Relation rel, int nkeys, int norderbys) |
| { |
| IndexScanDesc scan; |
| BTScanOpaque so; |
| |
| /* no order by operators allowed */ |
| Assert(norderbys == 0); |
| |
| /* get the scan */ |
| scan = RelationGetIndexScan(rel, nkeys, norderbys); |
| |
| /* allocate private workspace */ |
| so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData)); |
| BTScanPosInvalidate(so->currPos); |
| BTScanPosInvalidate(so->markPos); |
| if (scan->numberOfKeys > 0) |
| so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData)); |
| else |
| so->keyData = NULL; |
| |
| so->arrayKeyData = NULL; /* assume no array keys for now */ |
| so->arraysStarted = false; |
| so->numArrayKeys = 0; |
| so->arrayKeys = NULL; |
| so->arrayContext = NULL; |
| |
| so->killedItems = NULL; /* until needed */ |
| so->numKilled = 0; |
| |
| /* |
| * We don't know yet whether the scan will be index-only, so we do not |
| * allocate the tuple workspace arrays until btrescan. However, we set up |
| * scan->xs_itupdesc whether we'll need it or not, since that's so cheap. |
| */ |
| so->currTuples = so->markTuples = NULL; |
| |
| scan->xs_itupdesc = RelationGetDescr(rel); |
| |
| scan->opaque = so; |
| |
| return scan; |
| } |
| |
| /* |
| * btrescan() -- rescan an index relation |
| */ |
| void |
| btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, |
| ScanKey orderbys, int norderbys) |
| { |
| BTScanOpaque so = (BTScanOpaque) scan->opaque; |
| |
| /* we aren't holding any read locks, but gotta drop the pins */ |
| if (BTScanPosIsValid(so->currPos)) |
| { |
| /* Before leaving current page, deal with any killed items */ |
| if (so->numKilled > 0) |
| _bt_killitems(scan); |
| BTScanPosUnpinIfPinned(so->currPos); |
| BTScanPosInvalidate(so->currPos); |
| } |
| |
| so->markItemIndex = -1; |
| so->arrayKeyCount = 0; |
| BTScanPosUnpinIfPinned(so->markPos); |
| BTScanPosInvalidate(so->markPos); |
| |
| /* |
| * Allocate tuple workspace arrays, if needed for an index-only scan and |
| * not already done in a previous rescan call. To save on palloc |
| * overhead, both workspaces are allocated as one palloc block; only this |
| * function and btendscan know that. |
| * |
| * NOTE: this data structure also makes it safe to return data from a |
| * "name" column, even though btree name_ops uses an underlying storage |
| * datatype of cstring. The risk there is that "name" is supposed to be |
| * padded to NAMEDATALEN, but the actual index tuple is probably shorter. |
| * However, since we only return data out of tuples sitting in the |
| * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some |
| * data out of the markTuples array --- running off the end of memory for |
| * a SIGSEGV is not possible. Yeah, this is ugly as sin, but it beats |
| * adding special-case treatment for name_ops elsewhere. |
| */ |
| if (scan->xs_want_itup && so->currTuples == NULL) |
| { |
| so->currTuples = (char *) palloc(BLCKSZ * 2); |
| so->markTuples = so->currTuples + BLCKSZ; |
| } |
| |
| /* |
| * Reset the scan keys |
| */ |
| if (scankey && scan->numberOfKeys > 0) |
| memmove(scan->keyData, |
| scankey, |
| scan->numberOfKeys * sizeof(ScanKeyData)); |
| so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */ |
| |
| /* If any keys are SK_SEARCHARRAY type, set up array-key info */ |
| _bt_preprocess_array_keys(scan); |
| } |
| |
| /* |
| * btendscan() -- close down a scan |
| */ |
| void |
| btendscan(IndexScanDesc scan) |
| { |
| BTScanOpaque so = (BTScanOpaque) scan->opaque; |
| |
| /* we aren't holding any read locks, but gotta drop the pins */ |
| if (BTScanPosIsValid(so->currPos)) |
| { |
| /* Before leaving current page, deal with any killed items */ |
| if (so->numKilled > 0) |
| _bt_killitems(scan); |
| BTScanPosUnpinIfPinned(so->currPos); |
| } |
| |
| so->markItemIndex = -1; |
| BTScanPosUnpinIfPinned(so->markPos); |
| |
| /* No need to invalidate positions, the RAM is about to be freed. */ |
| |
| /* Release storage */ |
| if (so->keyData != NULL) |
| pfree(so->keyData); |
| /* so->arrayKeyData and so->arrayKeys are in arrayContext */ |
| if (so->arrayContext != NULL) |
| MemoryContextDelete(so->arrayContext); |
| if (so->killedItems != NULL) |
| pfree(so->killedItems); |
| if (so->currTuples != NULL) |
| pfree(so->currTuples); |
| /* so->markTuples should not be pfree'd, see btrescan */ |
| pfree(so); |
| } |
| |
| /* |
| * btmarkpos() -- save current scan position |
| */ |
| void |
| btmarkpos(IndexScanDesc scan) |
| { |
| BTScanOpaque so = (BTScanOpaque) scan->opaque; |
| |
| /* There may be an old mark with a pin (but no lock). */ |
| BTScanPosUnpinIfPinned(so->markPos); |
| |
| /* |
| * Just record the current itemIndex. If we later step to next page |
| * before releasing the marked position, _bt_steppage makes a full copy of |
| * the currPos struct in markPos. If (as often happens) the mark is moved |
| * before we leave the page, we don't have to do that work. |
| */ |
| if (BTScanPosIsValid(so->currPos)) |
| so->markItemIndex = so->currPos.itemIndex; |
| else |
| { |
| BTScanPosInvalidate(so->markPos); |
| so->markItemIndex = -1; |
| } |
| |
| /* Also record the current positions of any array keys */ |
| if (so->numArrayKeys) |
| _bt_mark_array_keys(scan); |
| } |
| |
| /* |
| * btrestrpos() -- restore scan to last saved position |
| */ |
| void |
| btrestrpos(IndexScanDesc scan) |
| { |
| BTScanOpaque so = (BTScanOpaque) scan->opaque; |
| |
| /* Restore the marked positions of any array keys */ |
| if (so->numArrayKeys) |
| _bt_restore_array_keys(scan); |
| |
| if (so->markItemIndex >= 0) |
| { |
| /* |
| * The scan has never moved to a new page since the last mark. Just |
| * restore the itemIndex. |
| * |
| * NB: In this case we can't count on anything in so->markPos to be |
| * accurate. |
| */ |
| so->currPos.itemIndex = so->markItemIndex; |
| } |
| else |
| { |
| /* |
| * The scan moved to a new page after last mark or restore, and we are |
| * now restoring to the marked page. We aren't holding any read |
| * locks, but if we're still holding the pin for the current position, |
| * we must drop it. |
| */ |
| if (BTScanPosIsValid(so->currPos)) |
| { |
| /* Before leaving current page, deal with any killed items */ |
| if (so->numKilled > 0) |
| _bt_killitems(scan); |
| BTScanPosUnpinIfPinned(so->currPos); |
| } |
| |
| if (BTScanPosIsValid(so->markPos)) |
| { |
| /* bump pin on mark buffer for assignment to current buffer */ |
| if (BTScanPosIsPinned(so->markPos)) |
| IncrBufferRefCount(so->markPos.buf); |
| memcpy(&so->currPos, &so->markPos, |
| offsetof(BTScanPosData, items[1]) + |
| so->markPos.lastItem * sizeof(BTScanPosItem)); |
| if (so->currTuples) |
| memcpy(so->currTuples, so->markTuples, |
| so->markPos.nextTupleOffset); |
| } |
| else |
| BTScanPosInvalidate(so->currPos); |
| } |
| } |
| |
| /* |
| * btestimateparallelscan -- estimate storage for BTParallelScanDescData |
| */ |
| Size |
| btestimateparallelscan(void) |
| { |
| return sizeof(BTParallelScanDescData); |
| } |
| |
| /* |
| * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan |
| */ |
| void |
| btinitparallelscan(void *target) |
| { |
| BTParallelScanDesc bt_target = (BTParallelScanDesc) target; |
| |
| SpinLockInit(&bt_target->btps_mutex); |
| bt_target->btps_scanPage = InvalidBlockNumber; |
| bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED; |
| bt_target->btps_arrayKeyCount = 0; |
| ConditionVariableInit(&bt_target->btps_cv); |
| } |
| |
| /* |
| * btparallelrescan() -- reset parallel scan |
| */ |
| void |
| btparallelrescan(IndexScanDesc scan) |
| { |
| BTParallelScanDesc btscan; |
| ParallelIndexScanDesc parallel_scan = scan->parallel_scan; |
| |
| Assert(parallel_scan); |
| |
| btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan, |
| parallel_scan->ps_offset); |
| |
| /* |
| * In theory, we don't need to acquire the spinlock here, because there |
| * shouldn't be any other workers running at this point, but we do so for |
| * consistency. |
| */ |
| SpinLockAcquire(&btscan->btps_mutex); |
| btscan->btps_scanPage = InvalidBlockNumber; |
| btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED; |
| btscan->btps_arrayKeyCount = 0; |
| SpinLockRelease(&btscan->btps_mutex); |
| } |
| |
| /* |
| * _bt_parallel_seize() -- Begin the process of advancing the scan to a new |
| * page. Other scans must wait until we call _bt_parallel_release() |
| * or _bt_parallel_done(). |
| * |
| * The return value is true if we successfully seized the scan and false |
| * if we did not. The latter case occurs if no pages remain for the current |
| * set of scankeys. |
| * |
| * If the return value is true, *pageno returns the next or current page |
| * of the scan (depending on the scan direction). An invalid block number |
| * means the scan hasn't yet started, and P_NONE means we've reached the end. |
| * The first time a participating process reaches the last page, it will return |
| * true and set *pageno to P_NONE; after that, further attempts to seize the |
| * scan will return false. |
| * |
| * Callers should ignore the value of pageno if the return value is false. |
| */ |
| bool |
| _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno) |
| { |
| BTScanOpaque so = (BTScanOpaque) scan->opaque; |
| BTPS_State pageStatus; |
| bool exit_loop = false; |
| bool status = true; |
| ParallelIndexScanDesc parallel_scan = scan->parallel_scan; |
| BTParallelScanDesc btscan; |
| |
| *pageno = P_NONE; |
| |
| btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan, |
| parallel_scan->ps_offset); |
| |
| while (1) |
| { |
| SpinLockAcquire(&btscan->btps_mutex); |
| pageStatus = btscan->btps_pageStatus; |
| |
| if (so->arrayKeyCount < btscan->btps_arrayKeyCount) |
| { |
| /* Parallel scan has already advanced to a new set of scankeys. */ |
| status = false; |
| } |
| else if (pageStatus == BTPARALLEL_DONE) |
| { |
| /* |
| * We're done with this set of scankeys. This may be the end, or |
| * there could be more sets to try. |
| */ |
| status = false; |
| } |
| else if (pageStatus != BTPARALLEL_ADVANCING) |
| { |
| /* |
| * We have successfully seized control of the scan for the purpose |
| * of advancing it to a new page! |
| */ |
| btscan->btps_pageStatus = BTPARALLEL_ADVANCING; |
| *pageno = btscan->btps_scanPage; |
| exit_loop = true; |
| } |
| SpinLockRelease(&btscan->btps_mutex); |
| if (exit_loop || !status) |
| break; |
| ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE); |
| } |
| ConditionVariableCancelSleep(); |
| |
| return status; |
| } |
| |
| /* |
| * _bt_parallel_release() -- Complete the process of advancing the scan to a |
 *		new page.  We now have the new value of btps_scanPage; some other backend
| * can now begin advancing the scan. |
| */ |
| void |
| _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page) |
| { |
| ParallelIndexScanDesc parallel_scan = scan->parallel_scan; |
| BTParallelScanDesc btscan; |
| |
| btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan, |
| parallel_scan->ps_offset); |
| |
| SpinLockAcquire(&btscan->btps_mutex); |
| btscan->btps_scanPage = scan_page; |
| btscan->btps_pageStatus = BTPARALLEL_IDLE; |
| SpinLockRelease(&btscan->btps_mutex); |
| ConditionVariableSignal(&btscan->btps_cv); |
| } |
| |
| /* |
| * _bt_parallel_done() -- Mark the parallel scan as complete. |
| * |
| * When there are no pages left to scan, this function should be called to |
| * notify other workers. Otherwise, they might wait forever for the scan to |
| * advance to the next page. |
| */ |
| void |
| _bt_parallel_done(IndexScanDesc scan) |
| { |
| BTScanOpaque so = (BTScanOpaque) scan->opaque; |
| ParallelIndexScanDesc parallel_scan = scan->parallel_scan; |
| BTParallelScanDesc btscan; |
| bool status_changed = false; |
| |
| /* Do nothing, for non-parallel scans */ |
| if (parallel_scan == NULL) |
| return; |
| |
| btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan, |
| parallel_scan->ps_offset); |
| |
| /* |
| * Mark the parallel scan as done for this combination of scan keys, |
| * unless some other process already did so. See also |
| * _bt_advance_array_keys. |
| */ |
| SpinLockAcquire(&btscan->btps_mutex); |
| if (so->arrayKeyCount >= btscan->btps_arrayKeyCount && |
| btscan->btps_pageStatus != BTPARALLEL_DONE) |
| { |
| btscan->btps_pageStatus = BTPARALLEL_DONE; |
| status_changed = true; |
| } |
| SpinLockRelease(&btscan->btps_mutex); |
| |
| /* wake up all the workers associated with this parallel scan */ |
| if (status_changed) |
| ConditionVariableBroadcast(&btscan->btps_cv); |
| } |
| |
| /* |
| * _bt_parallel_advance_array_keys() -- Advances the parallel scan for array |
| * keys. |
| * |
| * Updates the count of array keys processed for both local and parallel |
| * scans. |
| */ |
| void |
| _bt_parallel_advance_array_keys(IndexScanDesc scan) |
| { |
| BTScanOpaque so = (BTScanOpaque) scan->opaque; |
| ParallelIndexScanDesc parallel_scan = scan->parallel_scan; |
| BTParallelScanDesc btscan; |
| |
| btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan, |
| parallel_scan->ps_offset); |
| |
| so->arrayKeyCount++; |
| SpinLockAcquire(&btscan->btps_mutex); |
| if (btscan->btps_pageStatus == BTPARALLEL_DONE) |
| { |
| btscan->btps_scanPage = InvalidBlockNumber; |
| btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED; |
| btscan->btps_arrayKeyCount++; |
| } |
| SpinLockRelease(&btscan->btps_mutex); |
| } |
| |
| /* |
| * Bulk deletion of all index entries pointing to a set of heap tuples. |
| * The set of target tuples is specified via a callback routine that tells |
| * whether any given heap tuple (identified by ItemPointer) is being deleted. |
| * |
| * Result: a palloc'd struct containing statistical info for VACUUM displays. |
| */ |
| IndexBulkDeleteResult * |
| btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, |
| IndexBulkDeleteCallback callback, void *callback_state) |
| { |
| Relation rel = info->index; |
| BTCycleId cycleid; |
| |
| /* allocate stats if first time through, else re-use existing struct */ |
| if (stats == NULL) |
| stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); |
| |
| /* Establish the vacuum cycle ID to use for this scan */ |
| /* The ENSURE stuff ensures we clean up shared memory on failure */ |
| PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel)); |
| { |
| cycleid = _bt_start_vacuum(rel); |
| |
| btvacuumscan(info, stats, callback, callback_state, cycleid); |
| } |
| PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel)); |
| _bt_end_vacuum(rel); |
| |
| return stats; |
| } |
| |
| /* |
| * Post-VACUUM cleanup. |
| * |
| * Result: a palloc'd struct containing statistical info for VACUUM displays. |
| */ |
| IndexBulkDeleteResult * |
| btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) |
| { |
| BlockNumber num_delpages; |
| |
| /* No-op in ANALYZE ONLY mode */ |
| if (info->analyze_only) |
| return stats; |
| |
| /* |
| * If btbulkdelete was called, we need not do anything (we just maintain |
| * the information used within _bt_vacuum_needs_cleanup() by calling |
| * _bt_set_cleanup_info() below). |
| * |
| * If btbulkdelete was _not_ called, then we have a choice to make: we |
| * must decide whether or not a btvacuumscan() call is needed now (i.e. |
| * whether the ongoing VACUUM operation can entirely avoid a physical scan |
| * of the index). A call to _bt_vacuum_needs_cleanup() decides it for us |
| * now. |
| */ |
| if (stats == NULL) |
| { |
| /* Check if VACUUM operation can entirely avoid btvacuumscan() call */ |
| if (!_bt_vacuum_needs_cleanup(info->index)) |
| return NULL; |
| |
| /* |
| * Since we aren't going to actually delete any leaf items, there's no |
| * need to go through all the vacuum-cycle-ID pushups here. |
| * |
| * Posting list tuples are a source of inaccuracy for cleanup-only |
| * scans. btvacuumscan() will assume that the number of index tuples |
| * from each page can be used as num_index_tuples, even though |
| * num_index_tuples is supposed to represent the number of TIDs in the |
| * index. This naive approach can underestimate the number of tuples |
| * in the index significantly. |
| * |
| * We handle the problem by making num_index_tuples an estimate in |
| * cleanup-only case. |
| */ |
| stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); |
| btvacuumscan(info, stats, NULL, NULL, 0); |
| } |
| |
| /* |
| * Maintain num_delpages value in metapage for _bt_vacuum_needs_cleanup(). |
| * |
| * num_delpages is the number of deleted pages now in the index that were |
| * not safe to place in the FSM to be recycled just yet. num_delpages is |
| * greater than 0 only when _bt_pagedel() actually deleted pages during |
| * our call to btvacuumscan(). Even then, _bt_pendingfsm_finalize() must |
| * have failed to place any newly deleted pages in the FSM just moments |
| * ago. (Actually, there are edge cases where recycling of the current |
| * VACUUM's newly deleted pages does not even become safe by the time the |
| * next VACUUM comes around. See nbtree/README.) |
| */ |
| Assert(stats->pages_deleted >= stats->pages_free); |
| num_delpages = stats->pages_deleted - stats->pages_free; |
| _bt_set_cleanup_info(info->index, num_delpages); |
| |
| /* |
| * It's quite possible for us to be fooled by concurrent page splits into |
| * double-counting some index tuples, so disbelieve any total that exceeds |
| * the underlying heap's count ... if we know that accurately. Otherwise |
| * this might just make matters worse. |
| * |
| * Posting list tuples are another source of inaccuracy. Cleanup-only |
| * btvacuumscan calls assume that the number of index tuples can be used |
| * as num_index_tuples, even though num_index_tuples is supposed to |
| * represent the number of TIDs in the index. This naive approach can |
| * underestimate the number of tuples in the index. |
| */ |
| if (!info->estimated_count) |
| { |
| if (stats->num_index_tuples > info->num_heap_tuples) |
| stats->num_index_tuples = info->num_heap_tuples; |
| } |
| |
| return stats; |
| } |
| |
| /* |
| * btvacuumscan --- scan the index for VACUUMing purposes |
| * |
| * This combines the functions of looking for leaf tuples that are deletable |
| * according to the vacuum callback, looking for empty pages that can be |
| * deleted, and looking for old deleted pages that can be recycled. Both |
| * btbulkdelete and btvacuumcleanup invoke this (the latter only if no |
| * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true). |
| * |
| * The caller is responsible for initially allocating/zeroing a stats struct |
| * and for obtaining a vacuum cycle ID if necessary. |
| */ |
| static void |
| btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, |
| IndexBulkDeleteCallback callback, void *callback_state, |
| BTCycleId cycleid) |
| { |
| Relation rel = info->index; |
| BTVacState vstate; |
| BlockNumber num_pages; |
| BlockNumber scanblkno; |
| bool needLock; |
| |
| /* |
| * Reset fields that track information about the entire index now. This |
| * avoids double-counting in the case where a single VACUUM command |
| * requires multiple scans of the index. |
| * |
| * Avoid resetting the tuples_removed and pages_newly_deleted fields here, |
| * since they track information about the VACUUM command, and so must last |
| * across each call to btvacuumscan(). |
| * |
| * (Note that pages_free is treated as state about the whole index, not |
| * the current VACUUM. This is appropriate because RecordFreeIndexPage() |
| * calls are idempotent, and get repeated for the same deleted pages in |
| * some scenarios. The point for us is to track the number of recyclable |
| * pages in the index at the end of the VACUUM command.) |
| */ |
| stats->num_pages = 0; |
| stats->num_index_tuples = 0; |
| stats->pages_deleted = 0; |
| stats->pages_free = 0; |
| |
| /* Set up info to pass down to btvacuumpage */ |
| vstate.info = info; |
| vstate.stats = stats; |
| vstate.callback = callback; |
| vstate.callback_state = callback_state; |
| vstate.cycleid = cycleid; |
| |
| /* Create a temporary memory context to run _bt_pagedel in */ |
| vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext, |
| "_bt_pagedel", |
| ALLOCSET_DEFAULT_SIZES); |
| |
| /* Initialize vstate fields used by _bt_pendingfsm_finalize */ |
| vstate.bufsize = 0; |
| vstate.maxbufsize = 0; |
| vstate.pendingpages = NULL; |
| vstate.npendingpages = 0; |
| /* Consider applying _bt_pendingfsm_finalize optimization */ |
| _bt_pendingfsm_init(rel, &vstate, (callback == NULL)); |
| |
| /* |
| * The outer loop iterates over all index pages except the metapage, in |
| * physical order (we hope the kernel will cooperate in providing |
| * read-ahead for speed). It is critical that we visit all leaf pages, |
| * including ones added after we start the scan, else we might fail to |
| * delete some deletable tuples. Hence, we must repeatedly check the |
| * relation length. We must acquire the relation-extension lock while |
| * doing so to avoid a race condition: if someone else is extending the |
| * relation, there is a window where bufmgr/smgr have created a new |
| * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If |
| * we manage to scan such a page here, we'll improperly assume it can be |
| * recycled. Taking the lock synchronizes things enough to prevent a |
| * problem: either num_pages won't include the new page, or _bt_getbuf |
| * already has write lock on the buffer and it will be fully initialized |
| * before we can examine it. Also, we need not worry if a page is added |
| * immediately after we look; the page splitting code already has |
| * write-lock on the left page before it adds a right page, so we must |
| * already have processed any tuples due to be moved into such a page. |
| * |
| * XXX: Now that new pages are locked with RBM_ZERO_AND_LOCK, I don't |
| * think the use of the extension lock is still required. |
| * |
| * We can skip locking for new or temp relations, however, since no one |
| * else could be accessing them. |
| */ |
| needLock = !RELATION_IS_LOCAL(rel); |
| |
| scanblkno = BTREE_METAPAGE + 1; |
| for (;;) |
| { |
| /* Get the current relation length */ |
| if (needLock) |
| LockRelationForExtension(rel, ExclusiveLock); |
| num_pages = RelationGetNumberOfBlocks(rel); |
| if (needLock) |
| UnlockRelationForExtension(rel, ExclusiveLock); |
| |
| if (info->report_progress) |
| pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL, |
| num_pages); |
| |
| /* Quit if we've scanned the whole relation */ |
| if (scanblkno >= num_pages) |
| break; |
| /* Iterate over pages, then loop back to recheck length */ |
| for (; scanblkno < num_pages; scanblkno++) |
| { |
| btvacuumpage(&vstate, scanblkno); |
| if (info->report_progress) |
| pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE, |
| scanblkno); |
| } |
| } |
| |
| /* Set statistics num_pages field to final size of index */ |
| stats->num_pages = num_pages; |
| |
| MemoryContextDelete(vstate.pagedelcontext); |
| |
| /* |
| * If there were any calls to _bt_pagedel() during scan of the index then |
| * see if any of the resulting pages can be placed in the FSM now. When |
| * it's not safe we'll have to leave it up to a future VACUUM operation. |
| * |
| * Finally, if we placed any pages in the FSM (either just now or during |
| * the scan), forcibly update the upper-level FSM pages to ensure that |
| * searchers can find them. |
| */ |
| _bt_pendingfsm_finalize(rel, &vstate); |
| if (stats->pages_free > 0) |
| IndexFreeSpaceMapVacuum(rel); |
| } |
| |
| /* |
| * btvacuumpage --- VACUUM one page |
| * |
| * This processes a single page for btvacuumscan(). In some cases we must |
| * backtrack to re-examine and VACUUM pages that were the scanblkno during |
| * a previous call here. This is how we handle page splits (that happened |
| * after our cycleid was acquired) whose right half page happened to reuse |
| * a block that we might have processed at some point before it was |
| * recycled (i.e. before the page split). |
| */ |
| static void |
| btvacuumpage(BTVacState *vstate, BlockNumber scanblkno) |
| { |
| IndexVacuumInfo *info = vstate->info; |
| IndexBulkDeleteResult *stats = vstate->stats; |
| IndexBulkDeleteCallback callback = vstate->callback; |
| void *callback_state = vstate->callback_state; |
| Relation rel = info->index; |
| Relation heaprel = info->heaprel; |
| bool attempt_pagedel; |
| BlockNumber blkno, |
| backtrack_to; |
| Buffer buf; |
| Page page; |
| BTPageOpaque opaque; |
| |
| blkno = scanblkno; |
| |
| backtrack: |
| |
| attempt_pagedel = false; |
| backtrack_to = P_NONE; |
| |
| /* call vacuum_delay_point while not holding any buffer lock */ |
| vacuum_delay_point(); |
| |
| /* |
| * We can't use _bt_getbuf() here because it always applies |
| * _bt_checkpage(), which will barf on an all-zero page. We want to |
| * recycle all-zero pages, not fail. Also, we want to use a nondefault |
| * buffer access strategy. |
| */ |
| buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, |
| info->strategy); |
| _bt_lockbuf(rel, buf, BT_READ); |
| page = BufferGetPage(buf); |
| opaque = NULL; |
| if (!PageIsNew(page)) |
| { |
| _bt_checkpage(rel, buf); |
| opaque = BTPageGetOpaque(page); |
| } |
| |
| Assert(blkno <= scanblkno); |
| if (blkno != scanblkno) |
| { |
| /* |
| * We're backtracking. |
| * |
| * We followed a right link to a sibling leaf page (a page that |
| * happens to be from a block located before scanblkno). The only |
| * case we want to do anything with is a live leaf page having the |
| * current vacuum cycle ID. |
| * |
| * The page had better be in a state that's consistent with what we |
| * expect. Check for conditions that imply corruption in passing. It |
| * can't be half-dead because only an interrupted VACUUM process can |
| * leave pages in that state, so we'd definitely have dealt with it |
| * back when the page was the scanblkno page (half-dead pages are |
| * always marked fully deleted by _bt_pagedel(), barring corruption). |
| */ |
| if (!opaque || !P_ISLEAF(opaque) || P_ISHALFDEAD(opaque)) |
| { |
| Assert(false); |
| ereport(LOG, |
| (errcode(ERRCODE_INDEX_CORRUPTED), |
| errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"", |
| blkno, scanblkno, RelationGetRelationName(rel)))); |
| _bt_relbuf(rel, buf); |
| return; |
| } |
| |
| /* |
| * We may have already processed the page in an earlier call, when the |
| * page was scanblkno. This happens when the leaf page split occurred |
| * after the scan began, but before the right sibling page became the |
| * scanblkno. |
| * |
| * Page may also have been deleted by current btvacuumpage() call, |
| * since _bt_pagedel() sometimes deletes the right sibling page of |
| * scanblkno in passing (it does so after we decided where to |
| * backtrack to). We don't need to process this page as a deleted |
| * page a second time now (in fact, it would be wrong to count it as a |
| * deleted page in the bulk delete statistics a second time). |
| */ |
| if (opaque->btpo_cycleid != vstate->cycleid || P_ISDELETED(opaque)) |
| { |
| /* Done with current scanblkno (and all lower split pages) */ |
| _bt_relbuf(rel, buf); |
| return; |
| } |
| } |
| |
| if (!opaque || BTPageIsRecyclable(page, heaprel)) |
| { |
| /* Okay to recycle this page (which could be leaf or internal) */ |
| RecordFreeIndexPage(rel, blkno); |
| stats->pages_deleted++; |
| stats->pages_free++; |
| } |
| else if (P_ISDELETED(opaque)) |
| { |
| /* |
| * Already deleted page (which could be leaf or internal). Can't |
| * recycle yet. |
| */ |
| stats->pages_deleted++; |
| } |
| else if (P_ISHALFDEAD(opaque)) |
| { |
| /* Half-dead leaf page (from interrupted VACUUM) -- finish deleting */ |
| attempt_pagedel = true; |
| |
| /* |
| * _bt_pagedel() will increment both pages_newly_deleted and |
| * pages_deleted stats in all cases (barring corruption) |
| */ |
| } |
| else if (P_ISLEAF(opaque)) |
| { |
| OffsetNumber deletable[MaxIndexTuplesPerPage]; |
| int ndeletable; |
| BTVacuumPosting updatable[MaxIndexTuplesPerPage]; |
| int nupdatable; |
| OffsetNumber offnum, |
| minoff, |
| maxoff; |
| int nhtidsdead, |
| nhtidslive; |
| |
| /* |
| * Trade in the initial read lock for a full cleanup lock on this |
| * page. We must get such a lock on every leaf page over the course |
| * of the vacuum scan, whether or not it actually contains any |
| * deletable tuples --- see nbtree/README. |
| */ |
| _bt_upgradelockbufcleanup(rel, buf); |
| |
| /* |
| * Check whether we need to backtrack to earlier pages. What we are |
| * concerned about is a page split that happened since we started the |
| * vacuum scan. If the split moved tuples on the right half of the |
| * split (i.e. the tuples that sort high) to a block that we already |
| * passed over, then we might have missed the tuples. We need to |
| * backtrack now. (Must do this before possibly clearing btpo_cycleid |
| * or deleting scanblkno page below!) |
| */ |
| if (vstate->cycleid != 0 && |
| opaque->btpo_cycleid == vstate->cycleid && |
| !(opaque->btpo_flags & BTP_SPLIT_END) && |
| !P_RIGHTMOST(opaque) && |
| opaque->btpo_next < scanblkno) |
| backtrack_to = opaque->btpo_next; |
| |
| ndeletable = 0; |
| nupdatable = 0; |
| minoff = P_FIRSTDATAKEY(opaque); |
| maxoff = PageGetMaxOffsetNumber(page); |
| nhtidsdead = 0; |
| nhtidslive = 0; |
| if (callback) |
| { |
| /* btbulkdelete callback tells us what to delete (or update) */ |
| for (offnum = minoff; |
| offnum <= maxoff; |
| offnum = OffsetNumberNext(offnum)) |
| { |
| IndexTuple itup; |
| |
| itup = (IndexTuple) PageGetItem(page, |
| PageGetItemId(page, offnum)); |
| |
| Assert(!BTreeTupleIsPivot(itup)); |
| if (!BTreeTupleIsPosting(itup)) |
| { |
| /* Regular tuple, standard table TID representation */ |
| if (callback(&itup->t_tid, callback_state)) |
| { |
| deletable[ndeletable++] = offnum; |
| nhtidsdead++; |
| } |
| else |
| nhtidslive++; |
| } |
| else |
| { |
| BTVacuumPosting vacposting; |
| int nremaining; |
| |
| /* Posting list tuple */ |
| vacposting = btreevacuumposting(vstate, itup, offnum, |
| &nremaining); |
| if (vacposting == NULL) |
| { |
| /* |
| * All table TIDs from the posting tuple remain, so no |
| * delete or update required |
| */ |
| Assert(nremaining == BTreeTupleGetNPosting(itup)); |
| } |
| else if (nremaining > 0) |
| { |
| |
| /* |
| * Store metadata about posting list tuple in |
| * updatable array for entire page. Existing tuple |
| * will be updated during the later call to |
| * _bt_delitems_vacuum(). |
| */ |
| Assert(nremaining < BTreeTupleGetNPosting(itup)); |
| updatable[nupdatable++] = vacposting; |
| nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining; |
| } |
| else |
| { |
| /* |
| * All table TIDs from the posting list must be |
| * deleted. We'll delete the index tuple completely |
| * (no update required). |
| */ |
| Assert(nremaining == 0); |
| deletable[ndeletable++] = offnum; |
| nhtidsdead += BTreeTupleGetNPosting(itup); |
| pfree(vacposting); |
| } |
| |
| nhtidslive += nremaining; |
| } |
| } |
| } |
| |
| /* |
| * Apply any needed deletes or updates. We issue just one |
| * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic. |
| */ |
| if (ndeletable > 0 || nupdatable > 0) |
| { |
| Assert(nhtidsdead >= ndeletable + nupdatable); |
| _bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable, |
| nupdatable); |
| |
| stats->tuples_removed += nhtidsdead; |
| /* must recompute maxoff */ |
| maxoff = PageGetMaxOffsetNumber(page); |
| |
| /* can't leak memory here */ |
| for (int i = 0; i < nupdatable; i++) |
| pfree(updatable[i]); |
| } |
| else |
| { |
| /* |
| * If the leaf page has been split during this vacuum cycle, it |
| * seems worth expending a write to clear btpo_cycleid even if we |
| * don't have any deletions to do. (If we do, _bt_delitems_vacuum |
| * takes care of this.) This ensures we won't process the page |
| * again. |
| * |
| * We treat this like a hint-bit update because there's no need to |
| * WAL-log it. |
| */ |
| Assert(nhtidsdead == 0); |
| if (vstate->cycleid != 0 && |
| opaque->btpo_cycleid == vstate->cycleid) |
| { |
| opaque->btpo_cycleid = 0; |
| MarkBufferDirtyHint(buf, true); |
| } |
| } |
| |
| /* |
| * If the leaf page is now empty, try to delete it; else count the |
| * live tuples (live table TIDs in posting lists are counted as |
| * separate live tuples). We don't delete when backtracking, though, |
| * since that would require teaching _bt_pagedel() about backtracking |
| * (doesn't seem worth adding more complexity to deal with that). |
| * |
| * We don't count the number of live TIDs during cleanup-only calls to |
| * btvacuumscan (i.e. when callback is not set). We count the number |
| * of index tuples directly instead. This avoids the expense of |
| * directly examining all of the tuples on each page. VACUUM will |
| * treat num_index_tuples as an estimate in cleanup-only case, so it |
| * doesn't matter that this underestimates num_index_tuples |
| * significantly in some cases. |
| */ |
| if (minoff > maxoff) |
| attempt_pagedel = (blkno == scanblkno); |
| else if (callback) |
| stats->num_index_tuples += nhtidslive; |
| else |
| stats->num_index_tuples += maxoff - minoff + 1; |
| |
| Assert(!attempt_pagedel || nhtidslive == 0); |
| } |
| |
| if (attempt_pagedel) |
| { |
| MemoryContext oldcontext; |
| |
| /* Run pagedel in a temp context to avoid memory leakage */ |
| MemoryContextReset(vstate->pagedelcontext); |
| oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext); |
| |
| /* |
| * _bt_pagedel maintains the bulk delete stats on our behalf; |
| * pages_newly_deleted and pages_deleted are likely to be incremented |
| * during call |
| */ |
| Assert(blkno == scanblkno); |
| _bt_pagedel(rel, buf, vstate); |
| |
| MemoryContextSwitchTo(oldcontext); |
| /* pagedel released buffer, so we shouldn't */ |
| } |
| else |
| _bt_relbuf(rel, buf); |
| |
| if (backtrack_to != P_NONE) |
| { |
| blkno = backtrack_to; |
| goto backtrack; |
| } |
| } |
| |
| /* |
| * btreevacuumposting --- determine TIDs still needed in posting list |
| * |
| * Returns metadata describing how to build replacement tuple without the TIDs |
| * that VACUUM needs to delete. Returned value is NULL in the common case |
| * where no changes are needed to caller's posting list tuple (we avoid |
| * allocating memory here as an optimization). |
| * |
| * The number of TIDs that should remain in the posting list tuple is set for |
| * caller in *nremaining. |
| */ |
| static BTVacuumPosting |
| btreevacuumposting(BTVacState *vstate, IndexTuple posting, |
| OffsetNumber updatedoffset, int *nremaining) |
| { |
| int live = 0; |
| int nitem = BTreeTupleGetNPosting(posting); |
| ItemPointer items = BTreeTupleGetPosting(posting); |
| BTVacuumPosting vacposting = NULL; |
| |
| for (int i = 0; i < nitem; i++) |
| { |
| if (!vstate->callback(items + i, vstate->callback_state)) |
| { |
| /* Live table TID */ |
| live++; |
| } |
| else if (vacposting == NULL) |
| { |
| /* |
| * First dead table TID encountered. |
| * |
| * It's now clear that we need to delete one or more dead table |
| * TIDs, so start maintaining metadata describing how to update |
| * existing posting list tuple. |
| */ |
| vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) + |
| nitem * sizeof(uint16)); |
| |
| vacposting->itup = posting; |
| vacposting->updatedoffset = updatedoffset; |
| vacposting->ndeletedtids = 0; |
| vacposting->deletetids[vacposting->ndeletedtids++] = i; |
| } |
| else |
| { |
| /* Second or subsequent dead table TID */ |
| vacposting->deletetids[vacposting->ndeletedtids++] = i; |
| } |
| } |
| |
| *nremaining = live; |
| return vacposting; |
| } |
| |
| /* |
| * btcanreturn() -- Check whether btree indexes support index-only scans. |
| * |
| * btrees always do, so this is trivial. |
| */ |
| bool |
| btcanreturn(Relation index, int attno) |
| { |
| return true; |
| } |