| /*------------------------------------------------------------------------- |
| * |
| * pruneheap.c |
| * heap page pruning and HOT-chain management code |
| * |
| * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group |
| * Portions Copyright (c) 1994, Regents of the University of California |
| * |
| * |
| * IDENTIFICATION |
| * src/backend/access/heap/pruneheap.c |
| * |
| *------------------------------------------------------------------------- |
| */ |
| #include "postgres.h" |
| |
| #include "access/heapam.h" |
| #include "access/heapam_xlog.h" |
| #include "access/htup_details.h" |
| #include "access/transam.h" |
| #include "access/xlog.h" |
| #include "access/xloginsert.h" |
| #include "catalog/catalog.h" |
| #include "miscadmin.h" |
| #include "pgstat.h" |
| #include "storage/bufmgr.h" |
| #include "utils/snapmgr.h" |
| #include "utils/rel.h" |
| #include "utils/snapmgr.h" |
| |
| /* Working data for heap_page_prune and subroutines */ |
| typedef struct |
| { |
| Relation rel; |
| |
| /* State used to test tuple visibility; initialized for the relation */ |
| TransactionId oldest_xmin; |
| GlobalVisState *vistest; |
| |
| /* |
| * Thresholds set by TransactionIdLimitedForOldSnapshots() if they have |
| * been computed (done on demand, and only if |
| * OldSnapshotThresholdActive()). The first time a tuple is about to be |
| * removed based on the limited horizon, old_snap_used is set to true, and |
| * SetOldSnapshotThresholdTimestamp() is called. See |
| * heap_prune_satisfies_vacuum(). |
| */ |
| TimestampTz old_snap_ts; |
| TransactionId old_snap_xmin; |
| bool old_snap_used; |
| |
| TransactionId new_prune_xid; /* new prune hint value for page */ |
| TransactionId snapshotConflictHorizon; /* latest xid removed */ |
| int nredirected; /* numbers of entries in arrays below */ |
| int ndead; |
| int nunused; |
| /* arrays that accumulate indexes of items to be changed */ |
| OffsetNumber redirected[MaxHeapTuplesPerPage * 2]; |
| OffsetNumber nowdead[MaxHeapTuplesPerPage]; |
| OffsetNumber nowunused[MaxHeapTuplesPerPage]; |
| |
| /* |
| * marked[i] is true if item i is entered in one of the above arrays. |
| * |
| * This needs to be MaxHeapTuplesPerPage + 1 long as FirstOffsetNumber is |
| * 1. Otherwise every access would need to subtract 1. |
| */ |
| bool marked[MaxHeapTuplesPerPage + 1]; |
| |
| /* |
| * Tuple visibility is only computed once for each tuple, for correctness |
| * and efficiency reasons; see comment in heap_page_prune() for details. |
| * This is of type int8[], instead of HTSV_Result[], so we can use -1 to |
| * indicate no visibility has been computed, e.g. for LP_DEAD items. |
| * |
| * Same indexing as ->marked. |
| */ |
| int8 htsv[MaxHeapTuplesPerPage + 1]; |
| } PruneState; |
| |
| /* Local functions */ |
| static HTSV_Result heap_prune_satisfies_vacuum(Relation relation, PruneState *prstate, |
| HeapTuple tup, |
| Buffer buffer); |
| static int heap_prune_chain(Buffer buffer, |
| OffsetNumber rootoffnum, |
| PruneState *prstate); |
| static void heap_prune_record_prunable(PruneState *prstate, TransactionId xid); |
| static void heap_prune_record_redirect(PruneState *prstate, |
| OffsetNumber offnum, OffsetNumber rdoffnum); |
| static void heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum); |
| static void heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum); |
| static void page_verify_redirects(Page page); |
| |
| |
| /* |
| * Optionally prune and repair fragmentation in the specified page. |
| * |
| * This is an opportunistic function. It will perform housekeeping |
| * only if the page heuristically looks like a candidate for pruning and we |
| * can acquire buffer cleanup lock without blocking. |
| * |
| * Note: this is called quite often. It's important that it fall out quickly |
| * if there's not any use in pruning. |
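| * (For example, heapam reaches this while reading heap pages in a scan and |
| * when fetching heap tuples from an index.) |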
| * |
| * Caller must have pin on the buffer, and must *not* have a lock on it. |
| */ |
| void |
| heap_page_prune_opt(Relation relation, Buffer buffer) |
| { |
| Page page = BufferGetPage(buffer); |
| TransactionId prune_xid; |
| GlobalVisState *vistest; |
| TransactionId limited_xmin = InvalidTransactionId; |
| TimestampTz limited_ts = 0; |
| Size minfree; |
| |
| /* |
| * We can't write WAL in recovery mode, so there's no point trying to |
| * clean the page. The primary will likely issue a cleaning WAL record |
| * soon anyway, so this is no particular loss. |
| */ |
| if (RecoveryInProgress()) |
| return; |
| |
| /* |
| * XXX: Magic to keep the old_snapshot_threshold tests appearing to "work". |
| * They are currently broken, and discussion of what to do about them is |
| * ongoing. See |
| * https://www.postgresql.org/message-id/20200403001235.e6jfdll3gh2ygbuc%40alap3.anarazel.de |
| */ |
| if (old_snapshot_threshold == 0) |
| SnapshotTooOldMagicForTest(); |
| |
| /* |
| * First check whether there's any chance there's something to prune; |
| * determining the appropriate horizon is a waste if there's no prune_xid |
| * (i.e. no updates/deletes left potentially dead tuples around). |
| */ |
| prune_xid = ((PageHeader) page)->pd_prune_xid; |
| if (!TransactionIdIsValid(prune_xid)) |
| return; |
| |
| /* |
| * Check whether prune_xid indicates that there may be dead rows that can |
| * be cleaned up. |
| * |
| * It is OK to check the old snapshot limit before acquiring the cleanup |
| * lock because the worst that can happen is that we are not quite as |
| * aggressive about the cleanup (by however many transaction IDs are |
| * consumed between this point and acquiring the lock). This allows us to |
| * save significant overhead in the case where the page is found not to be |
| * prunable. |
| * |
| * Even if old_snapshot_threshold is set, we first check whether the page |
| * can be pruned without it, both because |
| * TransactionIdLimitedForOldSnapshots() is not cheap, and because not |
| * relying on old_snapshot_threshold unnecessarily avoids causing |
| * conflicts. |
| */ |
| vistest = GlobalVisTestFor(relation); |
| |
| if (!GlobalVisTestIsRemovableXid(vistest, prune_xid)) |
| { |
| if (!OldSnapshotThresholdActive()) |
| return; |
| |
| if (!TransactionIdLimitedForOldSnapshots(GlobalVisTestNonRemovableHorizon(vistest), |
| relation, |
| &limited_xmin, &limited_ts)) |
| return; |
| |
| if (!TransactionIdPrecedes(prune_xid, limited_xmin)) |
| return; |
| } |
| |
| /* |
| * We prune when a previous UPDATE failed to find enough space on the page |
| * for a new tuple version, or when free space falls below the relation's |
| * fill-factor target (but not less than 10%). |
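| * |
| * For example, with the default heap fillfactor of 100 and 8kB pages, the |
| * fill-factor target is zero and minfree below gets clamped to |
| * BLCKSZ / 10 = 819 bytes. |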
| * |
| * Checking free space here is questionable since we aren't holding any |
| * lock on the buffer; in the worst case we could get a bogus answer. It's |
| * unlikely to be *seriously* wrong, though, since reading either pd_lower |
| * or pd_upper is probably atomic. Avoiding taking a lock seems more |
| * important than sometimes getting a wrong answer in what is after all |
| * just a heuristic estimate. |
| */ |
| minfree = RelationGetTargetPageFreeSpace(relation, |
| HEAP_DEFAULT_FILLFACTOR); |
| minfree = Max(minfree, BLCKSZ / 10); |
| |
| if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree) |
| { |
| /* OK, try to get exclusive buffer lock */ |
| if (!ConditionalLockBufferForCleanup(buffer)) |
| return; |
| |
| /* |
| * Now that we have buffer lock, get accurate information about the |
| * page's free space, and recheck the heuristic about whether to |
| * prune. |
| */ |
| if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree) |
| { |
| int ndeleted, |
| nnewlpdead; |
| |
| ndeleted = heap_page_prune(relation, buffer, InvalidTransactionId, |
| vistest, limited_xmin, |
| limited_ts, &nnewlpdead, NULL); |
| |
| /* |
| * Report the number of tuples reclaimed to pgstats. This is |
| * ndeleted minus the number of newly-LP_DEAD-set items. |
| * |
| * We derive the number of dead tuples like this to avoid totally |
| * forgetting about items that were set to LP_DEAD, since they |
| * still need to be cleaned up by VACUUM. We only want to count |
| * heap-only tuples that just became LP_UNUSED in our report, which |
| * don't need any such cleanup. |
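| * |
| * For example, if pruning deleted five tuples but two items were newly |
| * set LP_DEAD, we report only three reclaimed tuples here. |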
| * |
| * VACUUM doesn't have to compensate in the same way when it |
| * tracks ndeleted, since it will set the same LP_DEAD items to |
| * LP_UNUSED separately. |
| */ |
| if (ndeleted > nnewlpdead) |
| pgstat_update_heap_dead_tuples(relation, |
| ndeleted - nnewlpdead); |
| } |
| |
| /* And release buffer lock */ |
| LockBuffer(buffer, BUFFER_LOCK_UNLOCK); |
| |
| /* |
| * We avoid reuse of any free space created on the page by unrelated |
| * UPDATEs/INSERTs by opting to not update the FSM at this point. The |
| * free space should be reused by UPDATEs to *this* page. |
| */ |
| } |
| } |
| |
| |
| /* |
| * Prune and repair fragmentation in the specified page. |
| * |
| * Caller must have pin and buffer cleanup lock on the page. Note that we |
| * don't update the FSM information for the page on caller's behalf. Caller might |
| * also need to account for a reduction in the length of the line pointer |
| * array following array truncation by us. |
| * |
| * vistest and oldest_xmin are used to distinguish whether tuples are DEAD or |
| * RECENTLY_DEAD (see heap_prune_satisfies_vacuum and |
| * HeapTupleSatisfiesVacuum). If oldest_xmin is provided by the caller, it is |
| * used before consulting GlobalVisState. |
| * |
| * old_snap_xmin / old_snap_ts need to either have been set by |
| * TransactionIdLimitedForOldSnapshots, or InvalidTransactionId/0 |
| * respectively. |
| * |
| * Sets *nnewlpdead for the caller, indicating the number of items that were |
| * newly set LP_DEAD during the prune operation. |
| * |
| * off_loc, if not NULL, is kept updated with the offset of the item |
| * currently being processed, for the caller to use in its error callback. |
| * |
| * Returns the number of tuples deleted from the page during this call. |
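| * |
| * A minimal call, sketched after the on-access case in |
| * heap_page_prune_opt() (cleanup lock already held, vistest obtained from |
| * GlobalVisTestFor(), no old-snapshot limits, no error callback): |
| * |
| * ndeleted = heap_page_prune(rel, buf, InvalidTransactionId, vistest, |
| * InvalidTransactionId, 0, &nnewlpdead, NULL); |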
| */ |
| int |
| heap_page_prune(Relation relation, Buffer buffer, |
| TransactionId oldest_xmin, |
| GlobalVisState *vistest, |
| TransactionId old_snap_xmin, |
| TimestampTz old_snap_ts, |
| int *nnewlpdead, |
| OffsetNumber *off_loc) |
| { |
| int ndeleted = 0; |
| Page page = BufferGetPage(buffer); |
| BlockNumber blockno = BufferGetBlockNumber(buffer); |
| OffsetNumber offnum, |
| maxoff; |
| PruneState prstate; |
| HeapTupleData tup; |
| |
| /* |
| * Our strategy is to scan the page and make lists of items to change, |
| * then apply the changes within a critical section. This keeps as much |
| * logic as possible out of the critical section, and also ensures that |
| * WAL replay will work the same as the normal case. |
| * |
| * First, initialize the new pd_prune_xid value to zero (indicating no |
| * prunable tuples). If we find any tuples which may soon become |
| * prunable, we will save the lowest relevant XID in new_prune_xid. Also |
| * initialize the rest of our working state. |
| */ |
| prstate.new_prune_xid = InvalidTransactionId; |
| prstate.rel = relation; |
| prstate.oldest_xmin = oldest_xmin; |
| prstate.vistest = vistest; |
| prstate.old_snap_xmin = old_snap_xmin; |
| prstate.old_snap_ts = old_snap_ts; |
| prstate.old_snap_used = false; |
| prstate.snapshotConflictHorizon = InvalidTransactionId; |
| prstate.nredirected = prstate.ndead = prstate.nunused = 0; |
| memset(prstate.marked, 0, sizeof(prstate.marked)); |
| |
| maxoff = PageGetMaxOffsetNumber(page); |
| tup.t_tableOid = RelationGetRelid(prstate.rel); |
| |
| /* |
| * Determine HTSV for all tuples. |
| * |
| * This is required for correctness to deal with cases where running HTSV |
| * twice could lead to different results (e.g. RECENTLY_DEAD can turn to |
| * DEAD if another checked item causes GlobalVisTestIsRemovableFullXid() |
| * to update the horizon, INSERT_IN_PROGRESS can change to DEAD if the |
| * inserting transaction aborts, ...). That in turn could cause |
| * heap_prune_chain() to behave incorrectly if a tuple is reached twice, |
| * once directly via heap_prune_chain() and once while following a HOT chain. |
| * |
| * It's also good for performance. Most commonly tuples within a page are |
| * stored at decreasing offsets (while the items are stored at increasing |
| * offsets). When processing all tuples on a page this leads to reading |
| * memory at decreasing offsets within a page, with a variable stride. |
| * That's hard for CPU prefetchers to deal with. Processing the items in |
| * reverse order (and thus the tuples in increasing order) increases |
| * prefetching efficiency significantly / decreases the number of cache |
| * misses. |
| */ |
| for (offnum = maxoff; |
| offnum >= FirstOffsetNumber; |
| offnum = OffsetNumberPrev(offnum)) |
| { |
| ItemId itemid = PageGetItemId(page, offnum); |
| HeapTupleHeader htup; |
| |
| /* Nothing to do if slot doesn't contain a tuple */ |
| if (!ItemIdIsNormal(itemid)) |
| { |
| prstate.htsv[offnum] = -1; |
| continue; |
| } |
| |
| htup = (HeapTupleHeader) PageGetItem(page, itemid); |
| tup.t_data = htup; |
| tup.t_len = ItemIdGetLength(itemid); |
| ItemPointerSet(&(tup.t_self), blockno, offnum); |
| |
| /* |
| * Set the offset number so that we can display it along with any |
| * error that occurred while processing this tuple. |
| */ |
| if (off_loc) |
| *off_loc = offnum; |
| |
| prstate.htsv[offnum] = heap_prune_satisfies_vacuum(relation, &prstate, &tup, |
| buffer); |
| } |
| |
| /* Scan the page */ |
| for (offnum = FirstOffsetNumber; |
| offnum <= maxoff; |
| offnum = OffsetNumberNext(offnum)) |
| { |
| ItemId itemid; |
| |
| /* Ignore items already processed as part of an earlier chain */ |
| if (prstate.marked[offnum]) |
| continue; |
| |
| /* see preceding loop */ |
| if (off_loc) |
| *off_loc = offnum; |
| |
| /* Nothing to do if slot is empty or already dead */ |
| itemid = PageGetItemId(page, offnum); |
| if (!ItemIdIsUsed(itemid) || ItemIdIsDead(itemid)) |
| continue; |
| |
| /* Process this item or chain of items */ |
| ndeleted += heap_prune_chain(buffer, offnum, &prstate); |
| } |
| |
| /* Clear the offset information once we have processed the given page. */ |
| if (off_loc) |
| *off_loc = InvalidOffsetNumber; |
| |
| /* Any error while applying the changes is critical */ |
| START_CRIT_SECTION(); |
| |
| /* Have we found any prunable items? */ |
| if (prstate.nredirected > 0 || prstate.ndead > 0 || prstate.nunused > 0) |
| { |
| /* |
| * Apply the planned item changes, then repair page fragmentation, and |
| * update the page's hint bit about whether it has free line pointers. |
| */ |
| heap_page_prune_execute(buffer, |
| prstate.redirected, prstate.nredirected, |
| prstate.nowdead, prstate.ndead, |
| prstate.nowunused, prstate.nunused); |
| |
| /* |
| * Update the page's pd_prune_xid field to either zero, or the lowest |
| * XID of any soon-prunable tuple. |
| */ |
| ((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid; |
| |
| /* |
| * Also clear the "page is full" flag, since there's no point in |
| * repeating the prune/defrag process until something else happens to |
| * the page. |
| */ |
| PageClearFull(page); |
| |
| MarkBufferDirty(buffer); |
| |
| /* |
| * Emit a WAL XLOG_HEAP2_PRUNE record showing what we did |
| */ |
| if (RelationNeedsWAL(relation)) |
| { |
| xl_heap_prune xlrec; |
| XLogRecPtr recptr; |
| |
| xlrec.isCatalogRel = RelationIsAccessibleInLogicalDecoding(relation); |
| xlrec.snapshotConflictHorizon = prstate.snapshotConflictHorizon; |
| xlrec.nredirected = prstate.nredirected; |
| xlrec.ndead = prstate.ndead; |
| |
| XLogBeginInsert(); |
| XLogRegisterData((char *) &xlrec, SizeOfHeapPrune); |
| |
| XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); |
| |
| /* |
| * The OffsetNumber arrays are not actually in the buffer, but we |
| * pretend that they are. When XLogInsert stores the whole |
| * buffer, the offset arrays need not be stored too. |
| */ |
| if (prstate.nredirected > 0) |
| XLogRegisterBufData(0, (char *) prstate.redirected, |
| prstate.nredirected * |
| sizeof(OffsetNumber) * 2); |
| |
| if (prstate.ndead > 0) |
| XLogRegisterBufData(0, (char *) prstate.nowdead, |
| prstate.ndead * sizeof(OffsetNumber)); |
| |
| if (prstate.nunused > 0) |
| XLogRegisterBufData(0, (char *) prstate.nowunused, |
| prstate.nunused * sizeof(OffsetNumber)); |
| |
| recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_PRUNE); |
| |
| PageSetLSN(BufferGetPage(buffer), recptr); |
| } |
| } |
| else |
| { |
| /* |
| * If we didn't prune anything, but have found a new value for the |
| * pd_prune_xid field, update it and mark the buffer dirty. This is |
| * treated as a non-WAL-logged hint. |
| * |
| * Also clear the "page is full" flag if it is set, since there's no |
| * point in repeating the prune/defrag process until something else |
| * happens to the page. |
| */ |
| if (((PageHeader) page)->pd_prune_xid != prstate.new_prune_xid || |
| PageIsFull(page)) |
| { |
| ((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid; |
| PageClearFull(page); |
| MarkBufferDirtyHint(buffer, true); |
| } |
| } |
| |
| END_CRIT_SECTION(); |
| |
| /* Record number of newly-set-LP_DEAD items for caller */ |
| *nnewlpdead = prstate.ndead; |
| |
| return ndeleted; |
| } |
| |
| |
| /* |
| * Perform visibility checks for heap pruning. |
| * |
| * This is more complicated than just using GlobalVisTestIsRemovableXid() |
| * because of old_snapshot_threshold. We only want to increase the threshold |
| * that triggers errors for old snapshots when we actually decide to remove a |
| * row based on the limited horizon. |
| * |
| * Due to its cost we also only want to call |
| * TransactionIdLimitedForOldSnapshots() if necessary, i.e. we might not have |
| * done so in heap_page_prune_opt() if pd_prune_xid was old enough. But we |
| * still want to be able to remove rows that are too new to be removed |
| * according to prstate->vistest, but that can be removed based on |
| * old_snapshot_threshold. So we call TransactionIdLimitedForOldSnapshots() on |
| * demand in here, if appropriate. |
| */ |
| static HTSV_Result |
| heap_prune_satisfies_vacuum(Relation relation, PruneState *prstate, |
| HeapTuple tup, Buffer buffer) |
| { |
| HTSV_Result res; |
| TransactionId dead_after; |
| |
| res = HeapTupleSatisfiesVacuumHorizon(relation, tup, buffer, &dead_after); |
| |
| if (res != HEAPTUPLE_RECENTLY_DEAD) |
| return res; |
| |
| /* |
| * If we are already relying on the limited xmin, there is no need to |
| * delay doing so anymore. |
| */ |
| if (prstate->old_snap_used) |
| { |
| Assert(TransactionIdIsValid(prstate->old_snap_xmin)); |
| |
| if (TransactionIdPrecedes(dead_after, prstate->old_snap_xmin)) |
| res = HEAPTUPLE_DEAD; |
| return res; |
| } |
| |
| /* |
| * For VACUUM, we must be sure to prune tuples with xmax older than |
| * oldest_xmin -- a visibility cutoff determined at the beginning of |
| * vacuuming the relation. oldest_xmin is used for freezing determination |
| * and we cannot freeze dead tuples' xmaxes. |
| */ |
| if (TransactionIdIsValid(prstate->oldest_xmin) && |
| NormalTransactionIdPrecedes(dead_after, prstate->oldest_xmin)) |
| return HEAPTUPLE_DEAD; |
| |
| /* |
| * Determine whether or not the tuple is considered dead when compared |
| * with the provided GlobalVisState. On-access pruning does not provide |
| * oldest_xmin. And for vacuum, even if the tuple's xmax is not older than |
| * oldest_xmin, GlobalVisTestIsRemovableXid() could find the row dead if |
| * the GlobalVisState has been updated since the beginning of vacuuming |
| * the relation. |
| */ |
| if (GlobalVisTestIsRemovableXid(prstate->vistest, dead_after)) |
| return HEAPTUPLE_DEAD; |
| |
| /* |
| * If GlobalVisTestIsRemovableXid() is not sufficient to find the row dead |
| * and old_snapshot_threshold is enabled, try to use the lowered horizon. |
| */ |
| if (OldSnapshotThresholdActive()) |
| { |
| /* haven't determined the limited horizon yet, so do so now */ |
| if (!TransactionIdIsValid(prstate->old_snap_xmin)) |
| { |
| TransactionId horizon = |
| GlobalVisTestNonRemovableHorizon(prstate->vistest); |
| |
| TransactionIdLimitedForOldSnapshots(horizon, prstate->rel, |
| &prstate->old_snap_xmin, |
| &prstate->old_snap_ts); |
| } |
| |
| if (TransactionIdIsValid(prstate->old_snap_xmin) && |
| TransactionIdPrecedes(dead_after, prstate->old_snap_xmin)) |
| { |
| /* |
| * About to remove a row based on snapshot_too_old. Need to raise |
| * the threshold so that problematic accesses will error. |
| */ |
| Assert(!prstate->old_snap_used); |
| SetOldSnapshotThresholdTimestamp(prstate->old_snap_ts, |
| prstate->old_snap_xmin); |
| prstate->old_snap_used = true; |
| res = HEAPTUPLE_DEAD; |
| } |
| } |
| |
| return res; |
| } |
| |
| |
| /* |
| * Prune specified line pointer or a HOT chain originating at line pointer. |
| * |
| * If the item is an index-referenced tuple (i.e. not a heap-only tuple), |
| * the HOT chain is pruned by removing all DEAD tuples at the start of the HOT |
| * chain. We also prune any RECENTLY_DEAD tuples preceding a DEAD tuple. |
| * This is OK because a RECENTLY_DEAD tuple preceding a DEAD tuple is really |
| * DEAD; our visibility test is just too coarse to detect it. |
| * |
| * In general, pruning must never leave behind a DEAD tuple that still has |
| * tuple storage. VACUUM isn't prepared to deal with that case. That's why |
| * VACUUM prunes the same heap page a second time (without dropping its lock |
| * in the interim) when it sees a newly DEAD tuple that we initially saw as |
| * in-progress. Retrying pruning like this can only happen when an inserting |
| * transaction concurrently aborts. |
| * |
| * The root line pointer is redirected to the tuple immediately after the |
| * latest DEAD tuple. If all tuples in the chain are DEAD, the root line |
| * pointer is marked LP_DEAD. (This includes the case of a DEAD simple |
| * tuple, which we treat as a chain of length 1.) |
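| * |
| * For example, if a chain's root tuple at offset 1 is DEAD, its successor |
| * at offset 2 is DEAD, and the final member at offset 3 is LIVE, we mark |
| * item 2 unused and redirect item 1 to item 3. |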
| * |
| * We don't actually change the page here. We just add entries to the arrays in |
| * prstate showing the changes to be made. Items to be redirected are added |
| * to the redirected[] array (two entries per redirection); items to be set to |
| * LP_DEAD state are added to nowdead[]; and items to be set to LP_UNUSED |
| * state are added to nowunused[]. |
| * |
| * Returns the number of tuples (to be) deleted from the page. |
| */ |
| static int |
| heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) |
| { |
| int ndeleted = 0; |
| Page dp = (Page) BufferGetPage(buffer); |
| TransactionId priorXmax = InvalidTransactionId; |
| ItemId rootlp; |
| HeapTupleHeader htup; |
| OffsetNumber latestdead = InvalidOffsetNumber, |
| maxoff = PageGetMaxOffsetNumber(dp), |
| offnum; |
| OffsetNumber chainitems[MaxHeapTuplesPerPage]; |
| int nchain = 0, |
| i; |
| |
| rootlp = PageGetItemId(dp, rootoffnum); |
| |
| /* |
| * If it's a heap-only tuple, then it is not the start of a HOT chain. |
| */ |
| if (ItemIdIsNormal(rootlp)) |
| { |
| Assert(prstate->htsv[rootoffnum] != -1); |
| htup = (HeapTupleHeader) PageGetItem(dp, rootlp); |
| |
| if (HeapTupleHeaderIsHeapOnly(htup)) |
| { |
| /* |
| * If the tuple is DEAD and doesn't chain to anything else, mark |
| * it unused immediately. (If it does chain, we can only remove |
| * it as part of pruning its chain.) |
| * |
| * We need this primarily to handle aborted HOT updates, that is, |
| * XMIN_INVALID heap-only tuples. Those might not be linked to by |
| * any chain, since the parent tuple might be re-updated before |
| * any pruning occurs. So we have to be able to reap them |
| * separately from chain-pruning. (Note that |
| * HeapTupleHeaderIsHotUpdated will never return true for an |
| * XMIN_INVALID tuple, so this code will work even when there were |
| * sequential updates within the aborted transaction.) |
| * |
| * Note that we might first arrive at a dead heap-only tuple |
| * either here or while following a chain below. Whichever path |
| * gets there first will mark the tuple unused. |
| */ |
| if (prstate->htsv[rootoffnum] == HEAPTUPLE_DEAD && |
| !HeapTupleHeaderIsHotUpdated(htup)) |
| { |
| heap_prune_record_unused(prstate, rootoffnum); |
| HeapTupleHeaderAdvanceConflictHorizon(htup, |
| &prstate->snapshotConflictHorizon); |
| ndeleted++; |
| } |
| |
| /* Nothing more to do */ |
| return ndeleted; |
| } |
| } |
| |
| /* Start from the root tuple */ |
| offnum = rootoffnum; |
| |
| /* while not end of the chain */ |
| for (;;) |
| { |
| ItemId lp; |
| bool tupdead, |
| recent_dead; |
| |
| /* Sanity check (pure paranoia) */ |
| if (offnum < FirstOffsetNumber) |
| break; |
| |
| /* |
| * An offset past the end of page's line pointer array is possible |
| * when the array was truncated (original item must have been unused) |
| */ |
| if (offnum > maxoff) |
| break; |
| |
| /* If item is already processed, stop --- it must not be the same chain */ |
| if (prstate->marked[offnum]) |
| break; |
| |
| lp = PageGetItemId(dp, offnum); |
| |
| /* Unused item obviously isn't part of the chain */ |
| if (!ItemIdIsUsed(lp)) |
| break; |
| |
| /* |
| * If we are looking at the redirected root line pointer, jump to the |
| * first normal tuple in the chain. If we find a redirect somewhere |
| * else, stop --- it must not be the same chain. |
| */ |
| if (ItemIdIsRedirected(lp)) |
| { |
| if (nchain > 0) |
| break; /* not at start of chain */ |
| chainitems[nchain++] = offnum; |
| offnum = ItemIdGetRedirect(rootlp); |
| continue; |
| } |
| |
| /* |
| * Likewise, a dead line pointer can't be part of the chain. (We |
| * already eliminated the case of dead root tuple outside this |
| * function.) |
| */ |
| if (ItemIdIsDead(lp)) |
| break; |
| |
| Assert(ItemIdIsNormal(lp)); |
| Assert(prstate->htsv[offnum] != -1); |
| htup = (HeapTupleHeader) PageGetItem(dp, lp); |
| |
| /* |
| * Check the tuple XMIN against prior XMAX, if any |
| */ |
| if (TransactionIdIsValid(priorXmax) && |
| !TransactionIdEquals(HeapTupleHeaderGetXmin(htup), priorXmax)) |
| break; |
| |
| /* |
| * OK, this tuple is indeed a member of the chain. |
| */ |
| chainitems[nchain++] = offnum; |
| |
| /* |
| * Check tuple's visibility status. |
| */ |
| tupdead = recent_dead = false; |
| |
| switch ((HTSV_Result) prstate->htsv[offnum]) |
| { |
| case HEAPTUPLE_DEAD: |
| tupdead = true; |
| break; |
| |
| case HEAPTUPLE_RECENTLY_DEAD: |
| recent_dead = true; |
| |
| /* |
| * This tuple may soon become DEAD. Update the hint field so |
| * that the page is reconsidered for pruning in future. |
| */ |
| heap_prune_record_prunable(prstate, |
| HeapTupleHeaderGetUpdateXid(htup)); |
| break; |
| |
| case HEAPTUPLE_DELETE_IN_PROGRESS: |
| |
| /* |
| * This tuple may soon become DEAD. Update the hint field so |
| * that the page is reconsidered for pruning in future. |
| */ |
| heap_prune_record_prunable(prstate, |
| HeapTupleHeaderGetUpdateXid(htup)); |
| break; |
| |
| case HEAPTUPLE_LIVE: |
| case HEAPTUPLE_INSERT_IN_PROGRESS: |
| |
| /* |
| * If we wanted to optimize for aborts, we might consider |
| * marking the page prunable when we see INSERT_IN_PROGRESS. |
| * But we don't. See related decisions about when to mark the |
| * page prunable in heapam.c. |
| */ |
| break; |
| |
| default: |
| elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); |
| break; |
| } |
| |
| /* |
| * Remember the last DEAD tuple seen. We will advance past |
| * RECENTLY_DEAD tuples just in case there's a DEAD one after them; |
| * but we can't advance past anything else. We have to make sure that |
| * we don't miss any DEAD tuples, since DEAD tuples that still have |
| * tuple storage after pruning will confuse VACUUM. |
| */ |
| if (tupdead) |
| { |
| latestdead = offnum; |
| HeapTupleHeaderAdvanceConflictHorizon(htup, |
| &prstate->snapshotConflictHorizon); |
| } |
| else if (!recent_dead) |
| break; |
| |
| /* |
| * If the tuple is not HOT-updated, then we are at the end of this |
| * HOT-update chain. |
| */ |
| if (!HeapTupleHeaderIsHotUpdated(htup)) |
| break; |
| |
| /* HOT implies it can't have moved to different partition */ |
| Assert(!HeapTupleHeaderIndicatesMovedPartitions(htup)); |
| |
| /* |
| * Advance to next chain member. |
| */ |
| Assert(ItemPointerGetBlockNumber(&htup->t_ctid) == |
| BufferGetBlockNumber(buffer)); |
| offnum = ItemPointerGetOffsetNumber(&htup->t_ctid); |
| priorXmax = HeapTupleHeaderGetUpdateXid(htup); |
| } |
| |
| /* |
| * If we found a DEAD tuple in the chain, adjust the HOT chain so that all |
| * the DEAD tuples at the start of the chain are removed and the root line |
| * pointer is appropriately redirected. |
| */ |
| if (OffsetNumberIsValid(latestdead)) |
| { |
| /* |
| * Mark as unused each intermediate item that we are able to remove |
| * from the chain. |
| * |
| * When the previous item is the last dead tuple seen, we are at the |
| * right candidate for redirection. |
| */ |
| for (i = 1; (i < nchain) && (chainitems[i - 1] != latestdead); i++) |
| { |
| heap_prune_record_unused(prstate, chainitems[i]); |
| ndeleted++; |
| } |
| |
| /* |
| * If the root entry had been a normal tuple, we are deleting it, so |
| * count it in the result. But changing a redirect (even to DEAD |
| * state) doesn't count. |
| */ |
| if (ItemIdIsNormal(rootlp)) |
| ndeleted++; |
| |
| /* |
| * If the DEAD tuple is at the end of the chain, the entire chain is |
| * dead and the root line pointer can be marked dead. Otherwise just |
| * redirect the root to the correct chain member. |
| */ |
| if (i >= nchain) |
| heap_prune_record_dead(prstate, rootoffnum); |
| else |
| heap_prune_record_redirect(prstate, rootoffnum, chainitems[i]); |
| } |
| else if (nchain < 2 && ItemIdIsRedirected(rootlp)) |
| { |
| /* |
| * We found a redirect item that doesn't point to a valid follow-on |
| * item. This can happen if the loop in heap_page_prune caused us to |
| * visit the dead successor of a redirect item before visiting the |
| * redirect item. We can clean up by setting the redirect item to |
| * DEAD state. |
| */ |
| heap_prune_record_dead(prstate, rootoffnum); |
| } |
| |
| return ndeleted; |
| } |
| |
| /* Record lowest soon-prunable XID */ |
| static void |
| heap_prune_record_prunable(PruneState *prstate, TransactionId xid) |
| { |
| /* |
| * This should exactly match the PageSetPrunable macro. We can't store |
| * directly into the page header yet, so we update working state. |
| */ |
| Assert(TransactionIdIsNormal(xid)); |
| if (!TransactionIdIsValid(prstate->new_prune_xid) || |
| TransactionIdPrecedes(xid, prstate->new_prune_xid)) |
| prstate->new_prune_xid = xid; |
| } |
| |
| /* Record line pointer to be redirected */ |
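| /* Pairs are stored flat: redirected[] = {from1, to1, from2, to2, ...} */ |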
| static void |
| heap_prune_record_redirect(PruneState *prstate, |
| OffsetNumber offnum, OffsetNumber rdoffnum) |
| { |
| Assert(prstate->nredirected < MaxHeapTuplesPerPage); |
| prstate->redirected[prstate->nredirected * 2] = offnum; |
| prstate->redirected[prstate->nredirected * 2 + 1] = rdoffnum; |
| prstate->nredirected++; |
| Assert(!prstate->marked[offnum]); |
| prstate->marked[offnum] = true; |
| Assert(!prstate->marked[rdoffnum]); |
| prstate->marked[rdoffnum] = true; |
| } |
| |
| /* Record line pointer to be marked dead */ |
| static void |
| heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum) |
| { |
| Assert(prstate->ndead < MaxHeapTuplesPerPage); |
| prstate->nowdead[prstate->ndead] = offnum; |
| prstate->ndead++; |
| Assert(!prstate->marked[offnum]); |
| prstate->marked[offnum] = true; |
| } |
| |
| /* Record line pointer to be marked unused */ |
| static void |
| heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum) |
| { |
| Assert(prstate->nunused < MaxHeapTuplesPerPage); |
| prstate->nowunused[prstate->nunused] = offnum; |
| prstate->nunused++; |
| Assert(!prstate->marked[offnum]); |
| prstate->marked[offnum] = true; |
| } |
| |
| |
| /* |
| * Perform the actual page changes needed by heap_page_prune. |
| * It is expected that the caller has a full cleanup lock on the |
| * buffer. |
| */ |
| void |
| heap_page_prune_execute(Buffer buffer, |
| OffsetNumber *redirected, int nredirected, |
| OffsetNumber *nowdead, int ndead, |
| OffsetNumber *nowunused, int nunused) |
| { |
| Page page = (Page) BufferGetPage(buffer); |
| OffsetNumber *offnum; |
| HeapTupleHeader htup PG_USED_FOR_ASSERTS_ONLY; |
| |
| /* Shouldn't be called unless there's something to do */ |
| Assert(nredirected > 0 || ndead > 0 || nunused > 0); |
| |
| /* Update all redirected line pointers */ |
| offnum = redirected; |
| for (int i = 0; i < nredirected; i++) |
| { |
| OffsetNumber fromoff = *offnum++; |
| OffsetNumber tooff = *offnum++; |
| ItemId fromlp = PageGetItemId(page, fromoff); |
| ItemId tolp PG_USED_FOR_ASSERTS_ONLY; |
| |
| #ifdef USE_ASSERT_CHECKING |
| |
| /* |
| * Any existing item that we set as an LP_REDIRECT (any 'from' item) |
| * must be the first item from a HOT chain. If the item has tuple |
| * storage then it can't be a heap-only tuple. Otherwise we are just |
| * maintaining an existing LP_REDIRECT from an existing HOT chain that |
| * has been pruned at least once before now. |
| */ |
| if (!ItemIdIsRedirected(fromlp)) |
| { |
| Assert(ItemIdHasStorage(fromlp) && ItemIdIsNormal(fromlp)); |
| |
| htup = (HeapTupleHeader) PageGetItem(page, fromlp); |
| Assert(!HeapTupleHeaderIsHeapOnly(htup)); |
| } |
| else |
| { |
| /* We shouldn't need to redundantly set the redirect */ |
| Assert(ItemIdGetRedirect(fromlp) != tooff); |
| } |
| |
| /* |
| * The item that we're about to set as an LP_REDIRECT (the 'from' |
| * item) will point to an existing item (the 'to' item) that is |
| * already a heap-only tuple. There can be at most one LP_REDIRECT |
| * item per HOT chain. |
| * |
| * We need to keep around an LP_REDIRECT item (after original |
| * non-heap-only root tuple gets pruned away) so that it's always |
| * possible for VACUUM to easily figure out what TID to delete from |
| * indexes when an entire HOT chain becomes dead. A heap-only tuple |
| * can never become LP_DEAD; an LP_REDIRECT item or a regular heap |
| * tuple can. |
| * |
| * This check may miss problems, e.g. the target of a redirect could |
| * be marked as unused subsequently. The page_verify_redirects() check |
| * below will catch such problems. |
| */ |
| tolp = PageGetItemId(page, tooff); |
| Assert(ItemIdHasStorage(tolp) && ItemIdIsNormal(tolp)); |
| htup = (HeapTupleHeader) PageGetItem(page, tolp); |
| Assert(HeapTupleHeaderIsHeapOnly(htup)); |
| #endif |
| |
| ItemIdSetRedirect(fromlp, tooff); |
| } |
| |
| /* Update all now-dead line pointers */ |
| offnum = nowdead; |
| for (int i = 0; i < ndead; i++) |
| { |
| OffsetNumber off = *offnum++; |
| ItemId lp = PageGetItemId(page, off); |
| |
| #ifdef USE_ASSERT_CHECKING |
| |
| /* |
| * An LP_DEAD line pointer must be left behind when the original item |
| * (which is dead to everybody) could still be referenced by a TID in |
| * an index. This should never be necessary with any individual |
| * heap-only tuple item, though. (It's not clear how much of a problem |
| * that would be, but there is no reason to allow it.) |
| */ |
| if (ItemIdHasStorage(lp)) |
| { |
| Assert(ItemIdIsNormal(lp)); |
| htup = (HeapTupleHeader) PageGetItem(page, lp); |
| Assert(!HeapTupleHeaderIsHeapOnly(htup)); |
| } |
| else |
| { |
| /* Whole HOT chain becomes dead */ |
| Assert(ItemIdIsRedirected(lp)); |
| } |
| #endif |
| |
| ItemIdSetDead(lp); |
| } |
| |
| /* Update all now-unused line pointers */ |
| offnum = nowunused; |
| for (int i = 0; i < nunused; i++) |
| { |
| OffsetNumber off = *offnum++; |
| ItemId lp = PageGetItemId(page, off); |
| |
| #ifdef USE_ASSERT_CHECKING |
| |
| /* |
| * Only heap-only tuples can become LP_UNUSED during pruning. They |
| * don't need to be left in place as LP_DEAD items until VACUUM gets |
| * around to doing index vacuuming. |
| */ |
| Assert(ItemIdHasStorage(lp) && ItemIdIsNormal(lp)); |
| htup = (HeapTupleHeader) PageGetItem(page, lp); |
| Assert(HeapTupleHeaderIsHeapOnly(htup)); |
| #endif |
| |
| ItemIdSetUnused(lp); |
| } |
| |
| /* |
| * Finally, repair any fragmentation, and update the page's hint bit about |
| * whether it has free line pointers. |
| */ |
| PageRepairFragmentation(page); |
| |
| /* |
| * Now that the page has been modified, assert that redirect items still |
| * point to valid targets. |
| */ |
| page_verify_redirects(page); |
| } |
| |
| |
| /* |
| * If built with assertions, verify that all LP_REDIRECT items point to a |
| * valid item. |
| * |
| * One way that bugs related to HOT pruning show up is redirect items pointing |
| * to removed tuples. It's not trivial to reliably check that marking an item |
| * unused will not orphan a redirect item during heap_prune_chain() / |
| * heap_page_prune_execute(), so we additionally check the whole page after |
| * pruning. Without this check such bugs would typically only cause asserts |
| * later, potentially well after the corruption has been introduced. |
| * |
| * Also check comments in heap_page_prune_execute()'s redirection loop. |
| */ |
| static void |
| page_verify_redirects(Page page) |
| { |
| #ifdef USE_ASSERT_CHECKING |
| OffsetNumber offnum; |
| OffsetNumber maxoff; |
| |
| maxoff = PageGetMaxOffsetNumber(page); |
| for (offnum = FirstOffsetNumber; |
| offnum <= maxoff; |
| offnum = OffsetNumberNext(offnum)) |
| { |
| ItemId itemid = PageGetItemId(page, offnum); |
| OffsetNumber targoff; |
| ItemId targitem; |
| HeapTupleHeader htup; |
| |
| if (!ItemIdIsRedirected(itemid)) |
| continue; |
| |
| targoff = ItemIdGetRedirect(itemid); |
| targitem = PageGetItemId(page, targoff); |
| |
| Assert(ItemIdIsUsed(targitem)); |
| Assert(ItemIdIsNormal(targitem)); |
| Assert(ItemIdHasStorage(targitem)); |
| htup = (HeapTupleHeader) PageGetItem(page, targitem); |
| Assert(HeapTupleHeaderIsHeapOnly(htup)); |
| } |
| #endif |
| } |
| |
| |
| /* |
| * For all items in this page, find their respective root line pointers. |
| * If item k is part of a HOT-chain with root at item j, then we set |
| * root_offsets[k - 1] = j. |
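| * |
| * For example, if the chain rooted at item 2 contains tuples at offsets |
| * 2, 5 and 7, then root_offsets[1], root_offsets[4] and root_offsets[6] |
| * are all set to 2. |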
| * |
| * The passed-in root_offsets array must have MaxHeapTuplesPerPage entries. |
| * Unused entries are filled with InvalidOffsetNumber (zero). |
| * |
| * The function must be called with at least share lock on the buffer, to |
| * prevent concurrent prune operations. |
| * |
| * Note: The information collected here is valid only as long as the caller |
| * holds a pin on the buffer. Once pin is released, a tuple might be pruned |
| * and reused by a completely unrelated tuple. |
| */ |
| void |
| heap_get_root_tuples(Page page, OffsetNumber *root_offsets) |
| { |
| OffsetNumber offnum, |
| maxoff; |
| |
| MemSet(root_offsets, InvalidOffsetNumber, |
| MaxHeapTuplesPerPage * sizeof(OffsetNumber)); |
| |
| maxoff = PageGetMaxOffsetNumber(page); |
| for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) |
| { |
| ItemId lp = PageGetItemId(page, offnum); |
| HeapTupleHeader htup; |
| OffsetNumber nextoffnum; |
| TransactionId priorXmax; |
| |
| /* skip unused and dead items */ |
| if (!ItemIdIsUsed(lp) || ItemIdIsDead(lp)) |
| continue; |
| |
| if (ItemIdIsNormal(lp)) |
| { |
| htup = (HeapTupleHeader) PageGetItem(page, lp); |
| |
| /* |
| * Check if this tuple is part of a HOT-chain rooted at some other |
| * tuple. If so, skip it for now; we'll process it when we find |
| * its root. |
| */ |
| if (HeapTupleHeaderIsHeapOnly(htup)) |
| continue; |
| |
| /* |
| * This is either a plain tuple or the root of a HOT-chain. |
| * Remember it in the mapping. |
| */ |
| root_offsets[offnum - 1] = offnum; |
| |
| /* If it's not the start of a HOT-chain, we're done with it */ |
| if (!HeapTupleHeaderIsHotUpdated(htup)) |
| continue; |
| |
| /* Set up to scan the HOT-chain */ |
| nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid); |
| priorXmax = HeapTupleHeaderGetUpdateXid(htup); |
| } |
| else |
| { |
| /* Must be a redirect item. We do not set its root_offsets entry */ |
| Assert(ItemIdIsRedirected(lp)); |
| /* Set up to scan the HOT-chain */ |
| nextoffnum = ItemIdGetRedirect(lp); |
| priorXmax = InvalidTransactionId; |
| } |
| |
| /* |
| * Now follow the HOT-chain and collect other tuples in the chain. |
| * |
| * Note: Even though this is a nested loop, the complexity of the |
| * function is O(N) because a tuple in the page should be visited not |
| * more than twice, once in the outer loop and once in HOT-chain |
| * chases. |
| */ |
| for (;;) |
| { |
| /* Sanity check (pure paranoia) */ |
| if (nextoffnum < FirstOffsetNumber) |
| break; |
| |
| /* |
| * An offset past the end of page's line pointer array is possible |
| * when the array was truncated; note it is nextoffnum, the chain |
| * member we are about to examine, that must be range-checked here. |
| */ |
| if (nextoffnum > maxoff) |
| break; |
| |
| lp = PageGetItemId(page, nextoffnum); |
| |
| /* Check for broken chains */ |
| if (!ItemIdIsNormal(lp)) |
| break; |
| |
| htup = (HeapTupleHeader) PageGetItem(page, lp); |
| |
| if (TransactionIdIsValid(priorXmax) && |
| !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(htup))) |
| break; |
| |
| /* Remember the root line pointer for this item */ |
| root_offsets[nextoffnum - 1] = offnum; |
| |
| /* Advance to next chain member, if any */ |
| if (!HeapTupleHeaderIsHotUpdated(htup)) |
| break; |
| |
| /* HOT implies it can't have moved to different partition */ |
| Assert(!HeapTupleHeaderIndicatesMovedPartitions(htup)); |
| |
| nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid); |
| priorXmax = HeapTupleHeaderGetUpdateXid(htup); |
| } |
| } |
| } |