| /*------------------------------------------------------------------------- |
| * |
| * nbtree.c |
| * Implementation of Lehman and Yao's btree management algorithm for |
| * Postgres. |
| * |
| * NOTES |
| * This file contains only the public interface routines. |
| * |
| * |
| * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group |
| * Portions Copyright (c) 1994, Regents of the University of California |
| * |
| * IDENTIFICATION |
| * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.153 2006/11/01 19:43:17 tgl Exp $ |
| * |
| *------------------------------------------------------------------------- |
| */ |
| #include "postgres.h" |
| #include "miscadmin.h" |
| #include "access/genam.h" |
| #include "access/nbtree.h" |
| #include "catalog/index.h" |
| #include "commands/vacuum.h" |
| #include "storage/freespace.h" |
| #include "storage/ipc.h" |
| #include "storage/lmgr.h" |
| #include "utils/memutils.h" |
| #include "nodes/tidbitmap.h" |
| |
| /* Working state for btbuild and its callback */ |
| typedef struct |
| { |
| bool isUnique; |
| bool haveDead; |
| Relation heapRel; |
| BTSpool *spool; |
| |
| /* |
| * spool2 is needed only when the index is a unique index. Dead tuples |
| * are put into spool2 instead of spool in order to avoid the uniqueness |
| * check. |
| */ |
| BTSpool *spool2; |
| double indtuples; |
| } BTBuildState; |
| |
| /* Working state needed by btvacuumpage */ |
| typedef struct |
| { |
| IndexVacuumInfo *info; |
| IndexBulkDeleteResult *stats; |
| IndexBulkDeleteCallback callback; |
| void *callback_state; |
| BTCycleId cycleid; |
| BlockNumber *freePages; |
| int nFreePages; /* number of entries in freePages[] */ |
| int maxFreePages; /* allocated size of freePages[] */ |
| BlockNumber totFreePages; /* true total # of free pages */ |
| MemoryContext pagedelcontext; |
| } BTVacState; |
| |
| |
| static void btbuildCallback(Relation index, |
| ItemPointer tupleId, |
| Datum *values, |
| bool *isnull, |
| bool tupleIsAlive, |
| void *state); |
| static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, |
| IndexBulkDeleteCallback callback, void *callback_state, |
| BTCycleId cycleid); |
| static void btvacuumpage(BTVacState *vstate, BlockNumber blkno, |
| BlockNumber orig_blkno); |
| |
| |
| /* |
| * btbuild() -- build a new btree index. |
| */ |
| Datum |
| btbuild(PG_FUNCTION_ARGS) |
| { |
| MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_DECLARE; |
| |
| Relation heap = (Relation) PG_GETARG_POINTER(0); |
| Relation index = (Relation) PG_GETARG_POINTER(1); |
| IndexInfo *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2); |
| IndexBuildResult *result; |
| double reltuples; |
| BTBuildState buildstate; |
| |
| MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_ENTER; |
| |
| buildstate.isUnique = indexInfo->ii_Unique; |
| buildstate.haveDead = false; |
| buildstate.heapRel = heap; |
| buildstate.spool = NULL; |
| buildstate.spool2 = NULL; |
| buildstate.indtuples = 0; |
| |
| #ifdef BTREE_BUILD_STATS |
| if (log_btree_build_stats) |
| ResetUsage(); |
| #endif /* BTREE_BUILD_STATS */ |
| |
| /* |
| * We expect to be called exactly once for any index relation. If that's |
| * not the case, big trouble's what we have. |
| */ |
| if (RelationGetNumberOfBlocks(index) != 0) |
| elog(ERROR, "index \"%s\" already contains data", |
| RelationGetRelationName(index)); |
| |
| PG_TRY(); |
| { |
| buildstate.spool = _bt_spoolinit(index, indexInfo->ii_Unique, false); |
| |
| /* |
| * If building a unique index, put dead tuples in a second spool to keep |
| * them out of the uniqueness check. |
| */ |
| if (indexInfo->ii_Unique) |
| buildstate.spool2 = _bt_spoolinit(index, false, true); |
| |
| /* do the heap scan */ |
| reltuples = IndexBuildScan(heap, index, indexInfo, |
| btbuildCallback, (void *) &buildstate); |
| |
| /* okay, all heap tuples are indexed */ |
| if (buildstate.spool2 && !buildstate.haveDead) |
| { |
| /* spool2 turns out to be unnecessary */ |
| _bt_spooldestroy(buildstate.spool2); |
| buildstate.spool2 = NULL; |
| } |
| |
| /* |
| * Finish the build by (1) completing the sort of the spool file, (2) |
| * inserting the sorted tuples into btree pages and (3) building the upper |
| * levels. |
| */ |
| _bt_leafbuild(buildstate.spool, buildstate.spool2); |
| _bt_spooldestroy(buildstate.spool); |
| buildstate.spool = NULL; |
| |
| if (buildstate.spool2) |
| { |
| _bt_spooldestroy(buildstate.spool2); |
| buildstate.spool2 = NULL; |
| } |
| |
| } |
| PG_CATCH(); |
| { |
| /* Clean up the sort state on error */ |
| if (buildstate.spool) |
| { |
| _bt_spooldestroy(buildstate.spool); |
| buildstate.spool = NULL; |
| } |
| |
| if (buildstate.spool2) |
| { |
| _bt_spooldestroy(buildstate.spool2); |
| buildstate.spool2 = NULL; |
| } |
| |
| PG_RE_THROW(); |
| } |
| PG_END_TRY(); |
| |
| #ifdef BTREE_BUILD_STATS |
| if (log_btree_build_stats) |
| { |
| ShowUsage("BTREE BUILD STATS"); |
| ResetUsage(); |
| } |
| #endif /* BTREE_BUILD_STATS */ |
| |
| /* |
| * If we are reindexing a pre-existing index, it is critical to send out a |
| * relcache invalidation SI message to ensure all backends re-read the |
| * index metapage. We expect that the caller will ensure that happens |
| * (typically as a side effect of updating index stats, but it must happen |
| * even if the stats don't change!) |
| */ |
| |
| /* |
| * Return statistics |
| */ |
| result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); |
| |
| result->heap_tuples = reltuples; |
| result->index_tuples = buildstate.indtuples; |
| |
| MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_EXIT; |
| |
| PG_RETURN_POINTER(result); |
| } |
| |
| /* |
| * Per-tuple callback from IndexBuildScan |
| */ |
| static void |
| btbuildCallback(Relation index, |
| ItemPointer tupleId, |
| Datum *values, |
| bool *isnull, |
| bool tupleIsAlive, |
| void *state) |
| { |
| MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_DECLARE; |
| |
| BTBuildState *buildstate = (BTBuildState *) state; |
| IndexTuple itup; |
| |
| MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_ENTER; |
| |
| /* form an index tuple and point it at the heap tuple */ |
| itup = index_form_tuple(RelationGetDescr(index), values, isnull); |
| itup->t_tid = *tupleId; |
| |
| /* |
| * insert the index tuple into the appropriate spool file for subsequent |
| * processing: live tuples (and all tuples, if the index is not unique) go |
| * into spool, while dead tuples of a unique index go into spool2 |
| */ |
| if (tupleIsAlive || buildstate->spool2 == NULL) |
| _bt_spool(itup, buildstate->spool); |
| else |
| { |
| /* dead tuples are put into spool2 */ |
| buildstate->haveDead = true; |
| _bt_spool(itup, buildstate->spool2); |
| } |
| |
| buildstate->indtuples += 1; |
| |
| pfree(itup); |
| |
| MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_EXIT; |
| |
| } |
| |
| /* |
| * btinsert() -- insert an index tuple into a btree. |
| * |
| * Descend the tree recursively, find the appropriate location for our |
| * new tuple, and put it there. |
| */ |
| Datum |
| btinsert(PG_FUNCTION_ARGS) |
| { |
| MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_DECLARE; |
| |
| Relation rel = (Relation) PG_GETARG_POINTER(0); |
| Datum *values = (Datum *) PG_GETARG_POINTER(1); |
| bool *isnull = (bool *) PG_GETARG_POINTER(2); |
| ItemPointer ht_ctid = (ItemPointer) PG_GETARG_POINTER(3); |
| Relation heapRel = (Relation) PG_GETARG_POINTER(4); |
| bool checkUnique = PG_GETARG_BOOL(5); |
| IndexTuple itup; |
| |
| MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_ENTER; |
| |
| /* generate an index tuple */ |
| itup = index_form_tuple(RelationGetDescr(rel), values, isnull); |
| itup->t_tid = *ht_ctid; |
| |
| _bt_doinsert(rel, itup, checkUnique, heapRel); |
| |
| pfree(itup); |
| |
| MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_EXIT; |
| |
| PG_RETURN_BOOL(true); |
| } |
| |
| /* |
| * btgettuple() -- Get the next tuple in the scan. |
| */ |
| Datum |
| btgettuple(PG_FUNCTION_ARGS) |
| { |
| MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_DECLARE; |
| |
| IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); |
| ScanDirection dir = (ScanDirection) PG_GETARG_INT32(1); |
| BTScanOpaque so = (BTScanOpaque) scan->opaque; |
| bool res; |
| |
| MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_ENTER; |
| |
| /* |
| * If we've already initialized this scan, we can just advance it in the |
| * appropriate direction. If we haven't done so yet, we call a routine to |
| * get the first item in the scan. |
| */ |
| if (BTScanPosIsValid(so->currPos)) |
| { |
| /* |
| * Check to see if we should kill the previously-fetched tuple. |
| */ |
| if (scan->kill_prior_tuple) |
| { |
| /* |
| * Yes, remember it for later. (We'll deal with all such tuples |
| * at once right before leaving the index page.) The test for |
| * numKilled overrun is not just paranoia: if the caller reverses |
| * direction in the indexscan then the same item might get entered |
| * multiple times. It's not worth trying to optimize that, so we |
| * don't detect it, but instead just forget any excess entries. |
| */ |
| if (so->killedItems == NULL) |
| so->killedItems = (int *) |
| palloc(MaxIndexTuplesPerPage * sizeof(int)); |
| if (so->numKilled < MaxIndexTuplesPerPage) |
| so->killedItems[so->numKilled++] = so->currPos.itemIndex; |
| } |
| |
| /* |
| * Now continue the scan. |
| */ |
| res = _bt_next(scan, dir); |
| } |
| else |
| res = _bt_first(scan, dir); |
| |
| MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_EXIT; |
| |
| PG_RETURN_BOOL(res); |
| } |
| |
| /* |
| * btgetmulti() -- construct a bitmap of all tuples satisfying the scan keys. |
| * |
| * The caller may pass an existing HashBitmap to add to, a StreamBitmap to |
| * have the results OR-ed into, or NULL to get a freshly built HashBitmap. |
| */ |
| Datum |
| btgetmulti(PG_FUNCTION_ARGS) |
| { |
| MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_DECLARE; |
| |
| IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); |
| Node *n = (Node *)PG_GETARG_POINTER(1); |
| HashBitmap *hashBitmap; |
| |
| BTScanOpaque so = (BTScanOpaque) scan->opaque; |
| bool res = true; |
| |
| MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_ENTER; |
| |
| if (n == NULL || IsA(n, StreamBitmap)) |
| { |
| /* XXX should we use less than work_mem for this? */ |
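| /* work_mem is in kilobytes, but tbm_create() expects a byte count */ |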
| hashBitmap = tbm_create(work_mem * 1024L); |
| } |
| else |
| { |
| hashBitmap = (HashBitmap *)n; |
| } |
| /* If we haven't started the scan yet, fetch the first page & tuple. */ |
| if (!BTScanPosIsValid(so->currPos)) |
| { |
| res = _bt_first(scan, ForwardScanDirection); |
| if (res) |
| { |
| /* Save tuple ID, and continue scanning */ |
| tbm_add_tuples(hashBitmap, &(scan->xs_ctup.t_self), 1); |
| } |
| } |
| |
| while (res) |
| { |
| /* |
| * Advance to next tuple within page. This is the same as the |
| * easy case in _bt_next(). |
| */ |
| if (++so->currPos.itemIndex > so->currPos.lastItem) |
| { |
| /* let _bt_next do the heavy lifting */ |
| res = _bt_next(scan, ForwardScanDirection); |
| if (!res) |
| break; |
| } |
| |
| /* Save tuple ID, and continue scanning */ |
| tbm_add_tuples(hashBitmap, |
| &(so->currPos.items[so->currPos.itemIndex].heapTid), |
| 1); |
| } |
| |
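| /* |
| * If we were given a StreamBitmap, OR the results into it and return the |
| * stream instead of the intermediate HashBitmap. |
| */ |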
| if (n && IsA(n, StreamBitmap)) |
| { |
| stream_add_node((StreamBitmap *)n, |
| tbm_create_stream_node(hashBitmap), BMS_OR); |
| |
| MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_EXIT; |
| |
| PG_RETURN_POINTER(n); |
| } |
| |
| MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_EXIT; |
| |
| PG_RETURN_POINTER(hashBitmap); |
| } |
| |
| /* |
| * btbeginscan() -- start a scan on a btree index |
| */ |
| Datum |
| btbeginscan(PG_FUNCTION_ARGS) |
| { |
| MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_DECLARE; |
| |
| Relation rel = (Relation) PG_GETARG_POINTER(0); |
| int keysz = PG_GETARG_INT32(1); |
| ScanKey scankey = (ScanKey) PG_GETARG_POINTER(2); |
| IndexScanDesc scan; |
| |
| MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_ENTER; |
| |
| /* get the scan */ |
| scan = RelationGetIndexScan(rel, keysz, scankey); |
| |
| MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_EXIT; |
| |
| PG_RETURN_POINTER(scan); |
| } |
| |
| /* |
| * btrescan() -- rescan an index relation |
| */ |
| Datum |
| btrescan(PG_FUNCTION_ARGS) |
| { |
| MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_DECLARE; |
| |
| IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); |
| ScanKey scankey = (ScanKey) PG_GETARG_POINTER(1); |
| BTScanOpaque so; |
| |
| MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_ENTER; |
| |
| so = (BTScanOpaque) scan->opaque; |
| |
| if (so == NULL) /* first call after btbeginscan: initialize */ |
| { |
| so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData)); |
| so->currPos.buf = so->markPos.buf = InvalidBuffer; |
| if (scan->numberOfKeys > 0) |
| so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData)); |
| else |
| so->keyData = NULL; |
| so->killedItems = NULL; /* until needed */ |
| so->numKilled = 0; |
| scan->opaque = so; |
| } |
| |
| /* we aren't holding any read locks, but gotta drop the pins */ |
| if (BTScanPosIsValid(so->currPos)) |
| { |
| /* Before leaving current page, deal with any killed items */ |
| if (so->numKilled > 0) |
| _bt_killitems(scan, false); |
| ReleaseBuffer(so->currPos.buf); |
| so->currPos.buf = InvalidBuffer; |
| } |
| |
| if (BTScanPosIsValid(so->markPos)) |
| { |
| ReleaseBuffer(so->markPos.buf); |
| so->markPos.buf = InvalidBuffer; |
| } |
| so->markItemIndex = -1; |
| |
| /* |
| * Reset the scan keys. Note that keys ordering stuff moved to _bt_first. |
| * - vadim 05/05/97 |
| */ |
| if (scankey && scan->numberOfKeys > 0) |
| memmove(scan->keyData, |
| scankey, |
| scan->numberOfKeys * sizeof(ScanKeyData)); |
| so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */ |
| |
| MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_EXIT; |
| |
| PG_RETURN_VOID(); |
| } |
| |
| /* |
| * btendscan() -- close down a scan |
| */ |
| Datum |
| btendscan(PG_FUNCTION_ARGS) |
| { |
| MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_DECLARE; |
| |
| IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); |
| BTScanOpaque so = (BTScanOpaque) scan->opaque; |
| |
| MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_ENTER; |
| |
| /* we aren't holding any read locks, but gotta drop the pins */ |
| if (BTScanPosIsValid(so->currPos)) |
| { |
| /* Before leaving current page, deal with any killed items */ |
| if (so->numKilled > 0) |
| _bt_killitems(scan, false); |
| ReleaseBuffer(so->currPos.buf); |
| so->currPos.buf = InvalidBuffer; |
| } |
| |
| if (BTScanPosIsValid(so->markPos)) |
| { |
| ReleaseBuffer(so->markPos.buf); |
| so->markPos.buf = InvalidBuffer; |
| } |
| so->markItemIndex = -1; |
| |
| if (so->killedItems != NULL) |
| pfree(so->killedItems); |
| if (so->keyData != NULL) |
| pfree(so->keyData); |
| pfree(so); |
| |
| MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_EXIT; |
| |
| PG_RETURN_VOID(); |
| } |
| |
| /* |
| * btmarkpos() -- save current scan position |
| */ |
| Datum |
| btmarkpos(PG_FUNCTION_ARGS) |
| { |
| MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_DECLARE; |
| |
| IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); |
| BTScanOpaque so = (BTScanOpaque) scan->opaque; |
| |
| MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_ENTER; |
| |
| /* we aren't holding any read locks, but gotta drop the pin */ |
| if (BTScanPosIsValid(so->markPos)) |
| { |
| ReleaseBuffer(so->markPos.buf); |
| so->markPos.buf = InvalidBuffer; |
| } |
| |
| /* |
| * Just record the current itemIndex. If we later step to next page |
| * before releasing the marked position, _bt_steppage makes a full copy of |
| * the currPos struct in markPos. If (as often happens) the mark is moved |
| * before we leave the page, we don't have to do that work. |
| */ |
| if (BTScanPosIsValid(so->currPos)) |
| so->markItemIndex = so->currPos.itemIndex; |
| else |
| so->markItemIndex = -1; |
| |
| MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_EXIT; |
| |
| PG_RETURN_VOID(); |
| } |
| |
| /* |
| * btrestrpos() -- restore scan to last saved position |
| */ |
| Datum |
| btrestrpos(PG_FUNCTION_ARGS) |
| { |
| MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_DECLARE; |
| |
| IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); |
| BTScanOpaque so = (BTScanOpaque) scan->opaque; |
| |
| MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_ENTER; |
| |
| if (so->markItemIndex >= 0) |
| { |
| /* |
| * The mark position is on the same page we are currently on. Just |
| * restore the itemIndex. |
| */ |
| so->currPos.itemIndex = so->markItemIndex; |
| } |
| else |
| { |
| /* we aren't holding any read locks, but gotta drop the pin */ |
| if (BTScanPosIsValid(so->currPos)) |
| { |
| /* Before leaving current page, deal with any killed items */ |
| if (so->numKilled > 0 && |
| so->currPos.buf != so->markPos.buf) |
| _bt_killitems(scan, false); |
| ReleaseBuffer(so->currPos.buf); |
| so->currPos.buf = InvalidBuffer; |
| } |
| |
| if (BTScanPosIsValid(so->markPos)) |
| { |
| /* bump pin on mark buffer for assignment to current buffer */ |
| IncrBufferRefCount(so->markPos.buf); |
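| /* copy the fixed fields and only the items[] entries up through lastItem */ |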
| memcpy(&so->currPos, &so->markPos, |
| offsetof(BTScanPosData, items[1]) + |
| so->markPos.lastItem * sizeof(BTScanPosItem)); |
| } |
| } |
| |
| MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_EXIT; |
| |
| PG_RETURN_VOID(); |
| } |
| |
| /* |
| * Bulk deletion of all index entries pointing to a set of heap tuples. |
| * The set of target tuples is specified via a callback routine that tells |
| * whether any given heap tuple (identified by ItemPointer) is being deleted. |
| * |
| * Result: a palloc'd struct containing statistical info for VACUUM displays. |
| */ |
| Datum |
| btbulkdelete(PG_FUNCTION_ARGS) |
| { |
| MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_DECLARE; |
| |
| IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0); |
| IndexBulkDeleteResult *volatile stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1); |
| IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(2); |
| void *callback_state = (void *) PG_GETARG_POINTER(3); |
| Relation rel = info->index; |
| BTCycleId cycleid; |
| |
| MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_ENTER; |
| |
| // Fetch gp_persistent_relation_node information that will be added to the XLOG record. |
| RelationFetchGpRelationNodeForXLog(rel); |
| |
| /* allocate stats if first time through, else re-use existing struct */ |
| if (stats == NULL) |
| stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); |
| |
| /* Establish the vacuum cycle ID to use for this scan */ |
| /* The ENSURE stuff ensures we clean up shared memory on failure */ |
| PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel)); |
| { |
| cycleid = _bt_start_vacuum(rel); |
| |
| btvacuumscan(info, stats, callback, callback_state, cycleid); |
| } |
| PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel)); |
| _bt_end_vacuum(rel); |
| |
| MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_EXIT; |
| |
| PG_RETURN_POINTER(stats); |
| } |
| |
| /* |
| * Post-VACUUM cleanup. |
| * |
| * Result: a palloc'd struct containing statistical info for VACUUM displays. |
| */ |
| Datum |
| btvacuumcleanup(PG_FUNCTION_ARGS) |
| { |
| MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_DECLARE; |
| |
| IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0); |
| IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1); |
| |
| MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_ENTER; |
| |
| /* |
| * If btbulkdelete was called, we need not do anything, just return the |
| * stats from the latest btbulkdelete call. If it wasn't called, we must |
| * still do a pass over the index, to recycle any newly-recyclable pages |
| * and to obtain index statistics. |
| * |
| * Since we aren't going to actually delete any leaf items, there's no |
| * need to go through all the vacuum-cycle-ID pushups. |
| */ |
| if (stats == NULL) |
| { |
| stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); |
| btvacuumscan(info, stats, NULL, NULL, 0); |
| } |
| |
| /* |
| * During a non-FULL vacuum it's quite possible for us to be fooled by |
| * concurrent page splits into double-counting some index tuples, so |
| * disbelieve any total that exceeds the underlying heap's count. (We |
| * can't check this during btbulkdelete.) |
| */ |
| if (!info->vacuum_full) |
| { |
| if (stats->num_index_tuples > info->num_heap_tuples) |
| stats->num_index_tuples = info->num_heap_tuples; |
| } |
| |
| MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_EXIT; |
| |
| PG_RETURN_POINTER(stats); |
| } |
| |
| /* |
| * btvacuumscan --- scan the index for VACUUMing purposes |
| * |
| * This combines the functions of looking for leaf tuples that are deletable |
| * according to the vacuum callback, looking for empty pages that can be |
| * deleted, and looking for old deleted pages that can be recycled. Both |
| * btbulkdelete and btvacuumcleanup invoke this (the latter only if no |
| * btbulkdelete call occurred). |
| * |
| * The caller is responsible for initially allocating/zeroing a stats struct |
| * and for obtaining a vacuum cycle ID if necessary. |
| */ |
| static void |
| btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, |
| IndexBulkDeleteCallback callback, void *callback_state, |
| BTCycleId cycleid) |
| { |
| MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_DECLARE; |
| |
| Relation rel = info->index; |
| BTVacState vstate; |
| BlockNumber num_pages; |
| BlockNumber blkno; |
| bool needLock; |
| |
| MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_ENTER; |
| |
| /* |
| * Reset counts that will be incremented during the scan; needed in case |
| * of multiple scans during a single VACUUM command |
| */ |
| stats->num_index_tuples = 0; |
| stats->pages_deleted = 0; |
| |
| /* Set up info to pass down to btvacuumpage */ |
| vstate.info = info; |
| vstate.stats = stats; |
| vstate.callback = callback; |
| vstate.callback_state = callback_state; |
| vstate.cycleid = cycleid; |
| vstate.freePages = NULL; /* temporarily */ |
| vstate.nFreePages = 0; |
| vstate.maxFreePages = 0; |
| vstate.totFreePages = 0; |
| |
| /* Create a temporary memory context to run _bt_pagedel in */ |
| vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext, |
| "_bt_pagedel", |
| ALLOCSET_DEFAULT_MINSIZE, |
| ALLOCSET_DEFAULT_INITSIZE, |
| ALLOCSET_DEFAULT_MAXSIZE); |
| |
| /* |
| * The outer loop iterates over all index pages except the metapage, in |
| * physical order (we hope the kernel will cooperate in providing |
| * read-ahead for speed). It is critical that we visit all leaf pages, |
| * including ones added after we start the scan, else we might fail to |
| * delete some deletable tuples. Hence, we must repeatedly check the |
| * relation length. We must acquire the relation-extension lock while |
| * doing so to avoid a race condition: if someone else is extending the |
| * relation, there is a window where bufmgr/smgr have created a new |
| * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If |
| * we manage to scan such a page here, we'll improperly assume it can be |
| * recycled. Taking the lock synchronizes things enough to prevent a |
| * problem: either num_pages won't include the new page, or _bt_getbuf |
| * already has write lock on the buffer and it will be fully initialized |
| * before we can examine it. (See also vacuumlazy.c, which has the same |
| * issue.) Also, we need not worry if a page is added immediately after |
| * we look; the page splitting code already has write-lock on the left |
| * page before it adds a right page, so we must already have processed any |
| * tuples due to be moved into such a page. |
| * |
| * We can skip locking for new or temp relations, however, since no one |
| * else could be accessing them. |
| */ |
| needLock = !RELATION_IS_LOCAL(rel); |
| |
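| /* the metapage is block 0; start the scan just past it */ |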
| blkno = BTREE_METAPAGE + 1; |
| for (;;) |
| { |
| /* Get the current relation length */ |
| if (needLock) |
| LockRelationForExtension(rel, ExclusiveLock); |
| num_pages = RelationGetNumberOfBlocks(rel); |
| if (needLock) |
| UnlockRelationForExtension(rel, ExclusiveLock); |
| |
| /* Allocate freePages after we read num_pages the first time */ |
| if (vstate.freePages == NULL) |
| { |
| /* No point in remembering more than MaxFSMPages pages */ |
| vstate.maxFreePages = MaxFSMPages; |
| if ((BlockNumber) vstate.maxFreePages > num_pages) |
| vstate.maxFreePages = (int) num_pages; |
| vstate.freePages = (BlockNumber *) |
| palloc(vstate.maxFreePages * sizeof(BlockNumber)); |
| } |
| |
| /* Quit if we've scanned the whole relation */ |
| if (blkno >= num_pages) |
| break; |
| /* Iterate over pages, then loop back to recheck length */ |
| for (; blkno < num_pages; blkno++) |
| { |
| btvacuumpage(&vstate, blkno, blkno); |
| } |
| } |
| |
| /* |
| * During VACUUM FULL, we truncate off any recyclable pages at the end of |
| * the index. In a normal vacuum it'd be unsafe to do this except by |
| * acquiring exclusive lock on the index and then rechecking all the |
| * pages; doesn't seem worth it. |
| */ |
| if (info->vacuum_full && vstate.nFreePages > 0) |
| { |
| BlockNumber new_pages = num_pages; |
| |
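| /* strip recyclable pages off the physical end of the index, back to front */ |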
| while (vstate.nFreePages > 0 && |
| vstate.freePages[vstate.nFreePages - 1] == new_pages - 1) |
| { |
| new_pages--; |
| stats->pages_deleted--; |
| vstate.nFreePages--; |
| vstate.totFreePages = vstate.nFreePages; /* can't be more */ |
| } |
| if (new_pages != num_pages) |
| { |
| /* |
| * Okay to truncate. |
| */ |
| RelationTruncate( |
| rel, |
| new_pages, |
| /* markPersistentAsPhysicallyTruncated */ true); |
| |
| /* update statistics */ |
| stats->pages_removed += num_pages - new_pages; |
| |
| num_pages = new_pages; |
| } |
| } |
| |
| /* |
| * Update the shared Free Space Map with the info we now have about free |
| * pages in the index, discarding any old info the map may have. We do not |
| * need to sort the page numbers; they're in order already. |
| */ |
| RecordIndexFreeSpace(&rel->rd_node, vstate.totFreePages, |
| vstate.nFreePages, vstate.freePages); |
| |
| pfree(vstate.freePages); |
| |
| MemoryContextDelete(vstate.pagedelcontext); |
| |
| /* update statistics */ |
| stats->num_pages = num_pages; |
| stats->pages_free = vstate.totFreePages; |
| |
| MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_EXIT; |
| |
| } |
| |
| /* |
| * btvacuumpage --- VACUUM one page |
| * |
| * This processes a single page for btvacuumscan(). In some cases we |
| * must go back and re-examine previously-scanned pages; this routine |
| * recurses when necessary to handle that case. |
| * |
| * blkno is the page to process. orig_blkno is the highest block number |
| * reached by the outer btvacuumscan loop (the same as blkno, unless we |
| * are recursing to re-examine a previous page). |
| */ |
| static void |
| btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno) |
| { |
| MIRROREDLOCK_BUFMGR_DECLARE; |
| |
| IndexVacuumInfo *info = vstate->info; |
| IndexBulkDeleteResult *stats = vstate->stats; |
| IndexBulkDeleteCallback callback = vstate->callback; |
| void *callback_state = vstate->callback_state; |
| Relation rel = info->index; |
| bool delete_now; |
| BlockNumber recurse_to; |
| Buffer buf; |
| Page page; |
| BTPageOpaque opaque; |
| restart: |
| delete_now = false; |
| recurse_to = P_NONE; |
| |
| /* call vacuum_delay_point while not holding any buffer lock */ |
| vacuum_delay_point(); |
| |
| /* |
| * We can't use _bt_getbuf() here because it always applies |
| * _bt_checkpage(), which will barf on an all-zero page. We want to |
| * recycle all-zero pages, not fail. |
| */ |
| |
| // -------- MirroredLock ---------- |
| MIRROREDLOCK_BUFMGR_LOCK; |
| |
| buf = ReadBuffer(rel, blkno); |
| LockBuffer(buf, BT_READ); |
| page = BufferGetPage(buf); |
| opaque = (BTPageOpaque) PageGetSpecialPointer(page); |
| if (!PageIsNew(page)) |
| _bt_checkpage(rel, buf); |
| |
| /* |
| * If we are recursing, the only case we want to do anything with is a |
| * live leaf page having the current vacuum cycle ID. Any other state |
| * implies we already saw the page (eg, deleted it as being empty). In |
| * particular, we don't want to risk adding it to freePages twice. |
| */ |
| if (blkno != orig_blkno) |
| { |
| if (_bt_page_recyclable(page) || |
| P_IGNORE(opaque) || |
| !P_ISLEAF(opaque) || |
| opaque->btpo_cycleid != vstate->cycleid) |
| { |
| _bt_relbuf(rel, buf); |
| |
| MIRROREDLOCK_BUFMGR_UNLOCK; |
| // -------- MirroredLock ---------- |
| |
| return; |
| } |
| } |
| |
| /* Page is valid, see what to do with it */ |
| if (_bt_page_recyclable(page)) |
| { |
| /* Okay to recycle this page */ |
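| /* remember the page if there is room in freePages[]; count it regardless */ |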
| if (vstate->nFreePages < vstate->maxFreePages) |
| vstate->freePages[vstate->nFreePages++] = blkno; |
| vstate->totFreePages++; |
| stats->pages_deleted++; |
| } |
| else if (P_ISDELETED(opaque)) |
| { |
| /* Already deleted, but can't recycle yet */ |
| stats->pages_deleted++; |
| } |
| else if (P_ISHALFDEAD(opaque)) |
| { |
| /* Half-dead, try to delete */ |
| delete_now = true; |
| } |
| else if (P_ISLEAF(opaque)) |
| { |
| OffsetNumber deletable[MaxOffsetNumber]; |
| int ndeletable; |
| OffsetNumber offnum, |
| minoff, |
| maxoff; |
| |
| /* |
| * Trade in the initial read lock for a super-exclusive write lock on |
| * this page. We must get such a lock on every leaf page over the |
| * course of the vacuum scan, whether or not it actually contains any |
| * deletable tuples --- see nbtree/README. |
| */ |
| LockBuffer(buf, BUFFER_LOCK_UNLOCK); |
| LockBufferForCleanup(buf); |
| |
| /* |
| * Check whether we need to recurse back to earlier pages. What we |
| * are concerned about is a page split that happened since we started |
| * the vacuum scan. If the split moved some tuples to a lower page |
| * then we might have missed 'em. If so, set up for tail recursion. |
| * (Must do this before possibly clearing btpo_cycleid below!) |
| */ |
| if (vstate->cycleid != 0 && |
| opaque->btpo_cycleid == vstate->cycleid && |
| !(opaque->btpo_flags & BTP_SPLIT_END) && |
| !P_RIGHTMOST(opaque) && |
| opaque->btpo_next < orig_blkno) |
| recurse_to = opaque->btpo_next; |
| |
| /* |
| * Scan over all items to see which ones need to be deleted according to |
| * the callback function. |
| */ |
| ndeletable = 0; |
| minoff = P_FIRSTDATAKEY(opaque); |
| maxoff = PageGetMaxOffsetNumber(page); |
| if (callback) |
| { |
| for (offnum = minoff; |
| offnum <= maxoff; |
| offnum = OffsetNumberNext(offnum)) |
| { |
| IndexTuple itup; |
| ItemPointer htup; |
| |
| itup = (IndexTuple) PageGetItem(page, |
| PageGetItemId(page, offnum)); |
| htup = &(itup->t_tid); |
| if (callback(htup, callback_state)) |
| deletable[ndeletable++] = offnum; |
| } |
| } |
| |
| /* |
| * Apply any needed deletes. We issue just one _bt_delitems() call |
| * per page, so as to minimize WAL traffic. |
| */ |
| if (ndeletable > 0) |
| { |
| _bt_delitems(rel, buf, deletable, ndeletable); |
| stats->tuples_removed += ndeletable; |
| /* must recompute maxoff */ |
| maxoff = PageGetMaxOffsetNumber(page); |
| } |
| else |
| { |
| /* |
| * If the page has been split during this vacuum cycle, it seems |
| * worth expending a write to clear btpo_cycleid even if we don't |
| * have any deletions to do. (If we do, _bt_delitems takes care |
| * of this.) This ensures we won't process the page again. |
| * |
| * We treat this like a hint-bit update because there's no need to |
| * WAL-log it. |
| */ |
| if (vstate->cycleid != 0 && |
| opaque->btpo_cycleid == vstate->cycleid) |
| { |
| opaque->btpo_cycleid = 0; |
| SetBufferCommitInfoNeedsSave(buf); |
| } |
| } |
| |
| /* |
| * If it's now empty, try to delete; else count the live tuples. We |
| * don't delete when recursing, though, to avoid putting entries into |
| * freePages out-of-order (doesn't seem worth any extra code to handle |
| * the case). |
| */ |
| if (minoff > maxoff) |
| delete_now = (blkno == orig_blkno); |
| else |
| stats->num_index_tuples += maxoff - minoff + 1; |
| } |
| |
| if (delete_now) |
| { |
| MemoryContext oldcontext; |
| int ndel; |
| |
| /* Run pagedel in a temp context to avoid memory leakage */ |
| MemoryContextReset(vstate->pagedelcontext); |
| oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext); |
| |
| ndel = _bt_pagedel(rel, buf, NULL, info->vacuum_full); |
| |
| /* count only this page, else may double-count parent */ |
| if (ndel) |
| stats->pages_deleted++; |
| |
| /* |
| * During VACUUM FULL it's okay to recycle deleted pages immediately, |
| * since there can be no other transactions scanning the index. Note |
| * that we will only recycle the current page and not any parent pages |
| * that _bt_pagedel might have recursed to; this seems reasonable in |
| * the name of simplicity. (Trying to do otherwise would mean we'd |
| * have to sort the list of recyclable pages we're building.) |
| */ |
| if (ndel && info->vacuum_full) |
| { |
| if (vstate->nFreePages < vstate->maxFreePages) |
| vstate->freePages[vstate->nFreePages++] = blkno; |
| vstate->totFreePages++; |
| } |
| |
| MemoryContextSwitchTo(oldcontext); |
| /* pagedel released buffer, so we shouldn't */ |
| } |
| else |
| _bt_relbuf(rel, buf); |
| |
| MIRROREDLOCK_BUFMGR_UNLOCK; |
| // -------- MirroredLock ---------- |
| |
| /* |
| * This is really tail recursion, but if the compiler is too stupid to |
| * optimize it as such, we'd eat an uncomfortably large amount of stack |
| * space per recursion level (due to the deletable[] array). A failure is |
| * improbable since the number of levels isn't likely to be large ... but |
| * just in case, let's hand-optimize into a loop. |
| */ |
| if (recurse_to != P_NONE) |
| { |
| blkno = recurse_to; |
| goto restart; |
| } |
| } |