| /*------------------------------------------------------------------------- |
| * |
| * nbtinsert.c |
| * Item insertion in Lehman and Yao btrees for Postgres. |
| * |
| * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group |
| * Portions Copyright (c) 1994, Regents of the University of California |
| * |
| * |
| * IDENTIFICATION |
| * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.146.2.2 2007/12/31 04:52:20 tgl Exp $ |
| * |
| *------------------------------------------------------------------------- |
| */ |
| |
| #include "postgres.h" |
| |
| #include "access/heapam.h" |
| #include "access/nbtree.h" |
| #include "access/transam.h" |
| #include "cdb/cdbappendonlyam.h" |
| #include "cdb/cdbvars.h" /*Gp_is_primary*/ |
| #include "miscadmin.h" |
| #include "utils/inval.h" |
| |
| |
| typedef struct |
| { |
| /* context data for _bt_checksplitloc */ |
| Size newitemsz; /* size of new item to be inserted */ |
| int fillfactor; /* needed when splitting rightmost page */ |
| bool is_leaf; /* T if splitting a leaf page */ |
| bool is_rightmost; /* T if splitting a rightmost page */ |
| |
| bool have_split; /* found a valid split? */ |
| |
| /* these fields valid only if have_split is true */ |
| bool newitemonleft; /* new item on left or right of best split */ |
| OffsetNumber firstright; /* best split point */ |
| int best_delta; /* best size delta so far */ |
| } FindSplitData; |
| |
| |
| static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf); |
| |
| static TransactionId _bt_check_unique(Relation rel, IndexTuple itup, |
| Relation heapRel, Buffer buf, |
| ScanKey itup_scankey); |
| static void _bt_insertonpg(Relation rel, Buffer buf, |
| BTStack stack, |
| int keysz, ScanKey scankey, |
| IndexTuple itup, |
| OffsetNumber afteritem, |
| bool split_only_page); |
| static Buffer _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, |
| OffsetNumber newitemoff, Size newitemsz, |
| IndexTuple newitem, bool newitemonleft); |
| static OffsetNumber _bt_findsplitloc(Relation rel, Page page, |
| OffsetNumber newitemoff, |
| Size newitemsz, |
| bool *newitemonleft); |
| static void _bt_checksplitloc(FindSplitData *state, OffsetNumber firstright, |
| int leftfree, int rightfree, |
| bool newitemonleft, Size firstrightitemsz); |
| static void _bt_pgaddtup(Relation rel, Page page, |
| Size itemsize, IndexTuple itup, |
| OffsetNumber itup_off, const char *where); |
| static bool _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum, |
| int keysz, ScanKey scankey); |
| static void _bt_vacuum_one_page(Relation rel, Buffer buffer); |
| |
| |
| /* |
| * _bt_doinsert() -- Handle insertion of a single index tuple in the tree. |
| * |
| * This routine is called by the public interface routines, btbuild |
| * and btinsert. By here, itup is filled in, including the TID. |
| */ |
| void |
| _bt_doinsert(Relation rel, IndexTuple itup, |
| bool index_is_unique, Relation heapRel) |
| { |
| MIRROREDLOCK_BUFMGR_DECLARE; |
| |
| int natts = rel->rd_rel->relnatts; |
| ScanKey itup_scankey; |
| BTStack stack; |
| Buffer buf; |
| |
| /* we need an insertion scan key to do our search, so build one */ |
| itup_scankey = _bt_mkscankey(rel, itup); |
| |
| top: |
| // -------- MirroredLock ---------- |
| MIRROREDLOCK_BUFMGR_LOCK; |
| |
| /* find the first page containing this key */ |
| stack = _bt_search(rel, natts, itup_scankey, false, &buf, BT_WRITE); |
| |
| /* trade in our read lock for a write lock */ |
| LockBuffer(buf, BUFFER_LOCK_UNLOCK); |
| LockBuffer(buf, BT_WRITE); |
| |
| /* |
| * If the page was split between the time that we surrendered our read |
| * lock and acquired our write lock, then this page may no longer be the |
| * right place for the key we want to insert. In this case, we need to |
| * move right in the tree. See Lehman and Yao for an excruciatingly |
| * precise description. |
| */ |
| buf = _bt_moveright(rel, buf, natts, itup_scankey, false, BT_WRITE); |
| |
| /* |
| * If we're not allowing duplicates, make sure the key isn't already in |
| * the index. |
| * |
| * NOTE: obviously, _bt_check_unique can only detect keys that are already |
| * in the index; so it cannot defend against concurrent insertions of the |
| * same key. We protect against that by means of holding a write lock on |
| * the target page. Any other would-be inserter of the same key must |
| * acquire a write lock on the same target page, so only one would-be |
| * inserter can be making the check at one time. Furthermore, once we are |
| * past the check we hold write locks continuously until we have performed |
| * our insertion, so no later inserter can fail to see our insertion. |
| * (This requires some care in _bt_insertonpg.) |
| * |
| * If we must wait for another xact, we release the lock while waiting, |
| * and then must start over completely. |
| */ |
| if (index_is_unique) |
| { |
| TransactionId xwait; |
| |
| xwait = _bt_check_unique(rel, itup, heapRel, buf, itup_scankey); |
| |
		if (TransactionIdIsValid(xwait))
| { |
| /* Have to wait for the other guy ... */ |
| _bt_relbuf(rel, buf); |
| /* |
			 * We have to unlock it to re-enable interrupts: if we block on
			 * the lock in XactLockTableWait while a cancellation is
			 * requested, we must be able to respond to it.
| */ |
| MIRROREDLOCK_BUFMGR_UNLOCK; |
| XactLockTableWait(xwait); |
| /* start over... */ |
| _bt_freestack(stack); |
| goto top; |
| } |
| } |
| |
| /* do the insertion */ |
| _bt_insertonpg(rel, buf, stack, natts, itup_scankey, itup, 0, false); |
| |
| MIRROREDLOCK_BUFMGR_UNLOCK; |
| // -------- MirroredLock ---------- |
| |
| /* be tidy */ |
| _bt_freestack(stack); |
| _bt_freeskey(itup_scankey); |
| } |
| |
| /* |
| * _bt_check_unique() -- Check for violation of unique index constraint |
| * |
| * Returns InvalidTransactionId if there is no conflict, else an xact ID |
| * we must wait for to see if it commits a conflicting tuple. If an actual |
| * conflict is detected, no return --- just ereport(). |
| */ |
| static TransactionId |
| _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, |
| Buffer buf, ScanKey itup_scankey) |
| { |
| TupleDesc itupdesc = RelationGetDescr(rel); |
| int natts = rel->rd_rel->relnatts; |
| OffsetNumber offset, |
| maxoff; |
| Page page; |
| BTPageOpaque opaque; |
| Buffer nbuf = InvalidBuffer; |
| |
| MIRROREDLOCK_BUFMGR_MUST_ALREADY_BE_HELD; |
| |
| page = BufferGetPage(buf); |
| opaque = (BTPageOpaque) PageGetSpecialPointer(page); |
| maxoff = PageGetMaxOffsetNumber(page); |
| |
| /* |
| * Find first item >= proposed new item. Note we could also get a pointer |
| * to end-of-page here. |
| */ |
| offset = _bt_binsrch(rel, buf, natts, itup_scankey, false); |
| |
| /* |
| * Scan over all equal tuples, looking for live conflicts. |
| */ |
| for (;;) |
| { |
| HeapTupleData htup; |
| Buffer hbuffer; |
| ItemId curitemid; |
| IndexTuple curitup; |
| BlockNumber nblkno; |
| |
| /* |
| * make sure the offset points to an actual item before trying to |
| * examine it... |
| */ |
| if (offset <= maxoff) |
| { |
| curitemid = PageGetItemId(page, offset); |
| |
| /* |
| * We can skip items that are marked killed. |
| * |
| * Formerly, we applied _bt_isequal() before checking the kill |
| * flag, so as to fall out of the item loop as soon as possible. |
| * However, in the presence of heavy update activity an index may |
| * contain many killed items with the same key; running |
| * _bt_isequal() on each killed item gets expensive. Furthermore |
| * it is likely that the non-killed version of each key appears |
| * first, so that we didn't actually get to exit any sooner |
| * anyway. So now we just advance over killed items as quickly as |
| * we can. We only apply _bt_isequal() when we get to a non-killed |
| * item or the end of the page. |
| */ |
| if (!ItemIdDeleted(curitemid)) |
| { |
| /* |
				 * _bt_compare returns 0 for (1,NULL) and (1,NULL) --- this
				 * is how we handle NULLs --- so we must not use _bt_compare
				 * for true equality tests, but only for ordering/finding
				 * items on pages. - vadim 03/24/97
| */ |
| if (!_bt_isequal(itupdesc, page, offset, natts, itup_scankey)) |
| break; /* we're past all the equal tuples */ |
| |
| /* okay, we gotta fetch the heap tuple ... */ |
| curitup = (IndexTuple) PageGetItem(page, curitemid); |
| |
| /* |
| * If the parent relation is an AO/CO table, we have to find out |
| * if this tuple is actually in the table. |
| */ |
| { |
| htup.t_self = curitup->t_tid; |
| if (heap_fetch(heapRel, SnapshotDirty, &htup, &hbuffer, |
| true, NULL)) |
| { |
| /* it is a duplicate */ |
| TransactionId xwait = |
| (TransactionIdIsValid(SnapshotDirty->xmin)) ? |
| SnapshotDirty->xmin : SnapshotDirty->xmax; |
| |
| ReleaseBuffer(hbuffer); |
| |
| /* |
						 * If this tuple is being updated by another
						 * transaction, we have to wait for its commit/abort.
| */ |
| if (TransactionIdIsValid(xwait)) |
| { |
| if (nbuf != InvalidBuffer) |
| _bt_relbuf(rel, nbuf); |
| /* Tell _bt_doinsert to wait... */ |
| return xwait; |
| } |
| |
| /* |
| * Otherwise we have a definite conflict. But before |
| * complaining, look to see if the tuple we want to insert |
| * is itself now committed dead --- if so, don't complain. |
| * This is a waste of time in normal scenarios but we must |
| * do it to support CREATE INDEX CONCURRENTLY. |
| */ |
| htup.t_self = itup->t_tid; |
| if (heap_fetch(heapRel, SnapshotSelf, &htup, &hbuffer, |
| false, NULL)) |
| { |
| /* Normal case --- it's still live */ |
| ReleaseBuffer(hbuffer); |
| } |
| else if (htup.t_data != NULL) |
| { |
| /* |
| * It's been deleted, so no error, and no need to |
| * continue searching |
| */ |
| break; |
| } |
| else |
| { |
| /* couldn't find the tuple?? */ |
| elog(ERROR, "failed to fetch tuple being inserted"); |
| } |
| |
| ereport(ERROR, |
| (errcode(ERRCODE_UNIQUE_VIOLATION), |
| errmsg("duplicate key violates unique constraint \"%s\"", |
| RelationGetRelationName(rel)), |
| errOmitLocation(true))); |
| } |
| else if (htup.t_data != NULL) |
| { |
| /* |
| * Hmm, if we can't see the tuple, maybe it can be marked |
| * killed. This logic should match index_getnext and |
| * btgettuple. |
| */ |
| LockBuffer(hbuffer, BUFFER_LOCK_SHARE); |
| if (HeapTupleSatisfiesVacuum(htup.t_data, RecentGlobalXmin, |
| hbuffer, true) == HEAPTUPLE_DEAD) |
| { |
| curitemid->lp_flags |= LP_DELETE; |
| opaque->btpo_flags |= BTP_HAS_GARBAGE; |
| /* be sure to mark the proper buffer dirty... */ |
| if (nbuf != InvalidBuffer) |
| SetBufferCommitInfoNeedsSave(nbuf); |
| else |
| SetBufferCommitInfoNeedsSave(buf); |
| } |
| LockBuffer(hbuffer, BUFFER_LOCK_UNLOCK); |
| } |
| ReleaseBuffer(hbuffer); |
| } |
| } |
| } |
| |
| /* |
| * Advance to next tuple to continue checking. |
| */ |
| if (offset < maxoff) |
| offset = OffsetNumberNext(offset); |
| else |
| { |
| /* If scankey == hikey we gotta check the next page too */ |
| if (P_RIGHTMOST(opaque)) |
| break; |
| if (!_bt_isequal(itupdesc, page, P_HIKEY, |
| natts, itup_scankey)) |
| break; |
| /* Advance to next non-dead page --- there must be one */ |
| for (;;) |
| { |
| nblkno = opaque->btpo_next; |
| nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ); |
| page = BufferGetPage(nbuf); |
| opaque = (BTPageOpaque) PageGetSpecialPointer(page); |
| if (!P_IGNORE(opaque)) |
| break; |
| if (P_RIGHTMOST(opaque)) |
| elog(ERROR, "fell off the end of index \"%s\"", |
| RelationGetRelationName(rel)); |
| } |
| maxoff = PageGetMaxOffsetNumber(page); |
| offset = P_FIRSTDATAKEY(opaque); |
| } |
| } |
| |
| if (nbuf != InvalidBuffer) |
| _bt_relbuf(rel, nbuf); |
| |
| return InvalidTransactionId; |
| } |
| |
| /*---------- |
| * _bt_insertonpg() -- Insert a tuple on a particular page in the index. |
| * |
| * This recursive procedure does the following things: |
| * |
| * + finds the right place to insert the tuple. |
| * + if necessary, splits the target page (making sure that the |
| * split is equitable as far as post-insert free space goes). |
| * + inserts the tuple. |
| * + if the page was split, pops the parent stack, and finds the |
| * right place to insert the new child pointer (by walking |
| * right using information stored in the parent stack). |
| * + invokes itself with the appropriate tuple for the right |
| * child page on the parent. |
| * + updates the metapage if a true root or fast root is split. |
| * |
| * On entry, we must have the right buffer in which to do the |
| * insertion, and the buffer must be pinned and write-locked. On return, |
| * we will have dropped both the pin and the lock on the buffer. |
| * |
| * If 'afteritem' is >0 then the new tuple must be inserted after the |
 *		existing item of that number, and nowhere else.  If 'afteritem' is 0
| * then the procedure finds the exact spot to insert it by searching. |
| * (keysz and scankey parameters are used ONLY if afteritem == 0. |
| * The scankey must be an insertion-type scankey.) |
| * |
| * NOTE: if the new key is equal to one or more existing keys, we can |
| * legitimately place it anywhere in the series of equal keys --- in fact, |
| * if the new key is equal to the page's "high key" we can place it on |
| * the next page. If it is equal to the high key, and there's not room |
| * to insert the new tuple on the current page without splitting, then |
| * we can move right hoping to find more free space and avoid a split. |
| * (We should not move right indefinitely, however, since that leads to |
| * O(N^2) insertion behavior in the presence of many equal keys.) |
| * Once we have chosen the page to put the key on, we'll insert it before |
| * any existing equal keys because of the way _bt_binsrch() works. |
| * |
| * The locking interactions in this code are critical. You should |
| * grok Lehman and Yao's paper before making any changes. In addition, |
| * you need to understand how we disambiguate duplicate keys in this |
| * implementation, in order to be able to find our location using |
| * L&Y "move right" operations. Since we may insert duplicate user |
| * keys, and since these dups may propagate up the tree, we use the |
| * 'afteritem' parameter to position ourselves correctly for the |
| * insertion on internal pages. |
| *---------- |
| */ |
| static void |
| _bt_insertonpg(Relation rel, |
| Buffer buf, |
| BTStack stack, |
| int keysz, |
| ScanKey scankey, |
| IndexTuple itup, |
| OffsetNumber afteritem, |
| bool split_only_page) |
| { |
| Page page; |
| BTPageOpaque lpageop; |
| OffsetNumber newitemoff; |
| OffsetNumber firstright = InvalidOffsetNumber; |
| Size itemsz; |
| |
| MIRROREDLOCK_BUFMGR_MUST_ALREADY_BE_HELD; |
| |
	/* Fetch gp_persistent_relation_node information that will be added to the XLOG record. */
| RelationFetchGpRelationNodeForXLog(rel); |
| |
| page = BufferGetPage(buf); |
| lpageop = (BTPageOpaque) PageGetSpecialPointer(page); |
| |
| itemsz = IndexTupleDSize(*itup); |
| itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we |
| * need to be consistent */ |
| |
| /* |
| * Check whether the item can fit on a btree page at all. (Eventually, we |
| * ought to try to apply TOAST methods if not.) We actually need to be |
| * able to fit three items on every page, so restrict any one item to 1/3 |
| * the per-page available space. Note that at this point, itemsz doesn't |
| * include the ItemId. |
| */ |
| if (itemsz > BTMaxItemSize(page)) |
| ereport(ERROR, |
| (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), |
| errmsg("index row size %lu exceeds btree maximum, %lu", |
| (unsigned long) itemsz, |
| (unsigned long) BTMaxItemSize(page)), |
| errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n" |
| "Consider a function index of an MD5 hash of the value, " |
| "or use full text indexing."))); |
| |
| /* |
| * Determine exactly where new item will go. |
| */ |
| if (afteritem > 0) |
| newitemoff = afteritem + 1; |
| else |
| { |
| /*---------- |
| * If we will need to split the page to put the item here, |
| * check whether we can put the tuple somewhere to the right, |
| * instead. Keep scanning right until we |
| * (a) find a page with enough free space, |
| * (b) reach the last page where the tuple can legally go, or |
| * (c) get tired of searching. |
| * (c) is not flippant; it is important because if there are many |
| * pages' worth of equal keys, it's better to split one of the early |
| * pages than to scan all the way to the end of the run of equal keys |
| * on every insert. We implement "get tired" as a random choice, |
| * since stopping after scanning a fixed number of pages wouldn't work |
| * well (we'd never reach the right-hand side of previously split |
| * pages). Currently the probability of moving right is set at 0.99, |
| * which may seem too high to change the behavior much, but it does an |
| * excellent job of preventing O(N^2) behavior with many equal keys. |
| *---------- |
| */ |
| bool movedright = false; |
| |
| while (PageGetFreeSpace(page) < itemsz) |
| { |
| Buffer rbuf; |
| |
| /* |
| * before considering moving right, see if we can obtain enough |
| * space by erasing LP_DELETE items |
| */ |
| if (P_ISLEAF(lpageop) && P_HAS_GARBAGE(lpageop)) |
| { |
| _bt_vacuum_one_page(rel, buf); |
| if (PageGetFreeSpace(page) >= itemsz) |
| break; /* OK, now we have enough space */ |
| } |
| |
| /* |
| * nope, so check conditions (b) and (c) enumerated above |
| */ |
| if (P_RIGHTMOST(lpageop) || |
| _bt_compare(rel, keysz, scankey, page, P_HIKEY) != 0 || |
| random() <= (MAX_RANDOM_VALUE / 100)) |
| break; |
| |
| /* |
| * step right to next non-dead page |
| * |
| * must write-lock that page before releasing write lock on |
| * current page; else someone else's _bt_check_unique scan could |
| * fail to see our insertion. write locks on intermediate dead |
| * pages won't do because we don't know when they will get |
| * de-linked from the tree. |
| */ |
| rbuf = InvalidBuffer; |
| |
| for (;;) |
| { |
| BlockNumber rblkno = lpageop->btpo_next; |
| |
| rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE); |
| page = BufferGetPage(rbuf); |
| lpageop = (BTPageOpaque) PageGetSpecialPointer(page); |
| if (!P_IGNORE(lpageop)) |
| break; |
| if (P_RIGHTMOST(lpageop)) |
| elog(ERROR, "fell off the end of index \"%s\"", |
| RelationGetRelationName(rel)); |
| } |
| _bt_relbuf(rel, buf); |
| buf = rbuf; |
| movedright = true; |
| } |
| |
| /* |
| * Now we are on the right page, so find the insert position. If we |
| * moved right at all, we know we should insert at the start of the |
| * page, else must find the position by searching. |
| */ |
| if (movedright) |
| newitemoff = P_FIRSTDATAKEY(lpageop); |
| else |
| newitemoff = _bt_binsrch(rel, buf, keysz, scankey, false); |
| } |
| |
| /* |
| * Do we need to split the page to fit the item on it? |
| * |
| * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result, |
| * so this comparison is correct even though we appear to be accounting |
| * only for the item and not for its line pointer. |
| */ |
| if (PageGetFreeSpace(page) < itemsz) |
| { |
| bool is_root = P_ISROOT(lpageop); |
| bool is_only = P_LEFTMOST(lpageop) && P_RIGHTMOST(lpageop); |
| bool newitemonleft; |
| Buffer rbuf; |
| |
| /* Choose the split point */ |
| firstright = _bt_findsplitloc(rel, page, |
| newitemoff, itemsz, |
| &newitemonleft); |
| |
| /* split the buffer into left and right halves */ |
| rbuf = _bt_split(rel, buf, firstright, |
| newitemoff, itemsz, itup, newitemonleft); |
| |
| /*---------- |
| * By here, |
| * |
| * + our target page has been split; |
| * + the original tuple has been inserted; |
| * + we have write locks on both the old (left half) |
| * and new (right half) buffers, after the split; and |
| * + we know the key we want to insert into the parent |
| * (it's the "high key" on the left child page). |
| * |
| * We're ready to do the parent insertion. We need to hold onto the |
| * locks for the child pages until we locate the parent, but we can |
| * release them before doing the actual insertion (see Lehman and Yao |
| * for the reasoning). |
| *---------- |
| */ |
| _bt_insert_parent(rel, buf, rbuf, stack, is_root, is_only); |
| } |
| else |
| { |
| Buffer metabuf = InvalidBuffer; |
| Page metapg = NULL; |
| BTMetaPageData *metad = NULL; |
| OffsetNumber itup_off; |
| BlockNumber itup_blkno; |
| |
| itup_off = newitemoff; |
| itup_blkno = BufferGetBlockNumber(buf); |
| |
| /* |
| * If we are doing this insert because we split a page that was the |
| * only one on its tree level, but was not the root, it may have been |
| * the "fast root". We need to ensure that the fast root link points |
| * at or above the current page. We can safely acquire a lock on the |
| * metapage here --- see comments for _bt_newroot(). |
| */ |
| if (split_only_page) |
| { |
| Assert(!P_ISLEAF(lpageop)); |
| |
| metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE); |
| metapg = BufferGetPage(metabuf); |
| metad = BTPageGetMeta(metapg); |
| |
| if (metad->btm_fastlevel >= lpageop->btpo.level) |
| { |
| /* no update wanted */ |
| _bt_relbuf(rel, metabuf); |
| metabuf = InvalidBuffer; |
| } |
| } |
| |
| /* Do the update. No ereport(ERROR) until changes are logged */ |
| START_CRIT_SECTION(); |
| |
| _bt_pgaddtup(rel, page, itemsz, itup, newitemoff, "page"); |
| |
| MarkBufferDirty(buf); |
| |
| if (BufferIsValid(metabuf)) |
| { |
| metad->btm_fastroot = itup_blkno; |
| metad->btm_fastlevel = lpageop->btpo.level; |
| MarkBufferDirty(metabuf); |
| } |
| |
| /* XLOG stuff */ |
| if (!rel->rd_istemp) |
| { |
| xl_btree_insert xlrec; |
| BlockNumber xldownlink; |
| xl_btree_metadata xlmeta; |
| uint8 xlinfo; |
| XLogRecPtr recptr; |
| XLogRecData rdata[4]; |
| XLogRecData *nextrdata; |
| IndexTupleData trunctuple; |
| |
| xl_btreetid_set(&(xlrec.target), rel, itup_blkno, itup_off); |
| |
| rdata[0].data = (char *) &xlrec; |
| rdata[0].len = SizeOfBtreeInsert; |
| rdata[0].buffer = InvalidBuffer; |
| rdata[0].next = nextrdata = &(rdata[1]); |
| |
| if (P_ISLEAF(lpageop)) |
| xlinfo = XLOG_BTREE_INSERT_LEAF; |
| else |
| { |
| xldownlink = ItemPointerGetBlockNumber(&(itup->t_tid)); |
| Assert(ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY); |
| |
| nextrdata->data = (char *) &xldownlink; |
| nextrdata->len = sizeof(BlockNumber); |
| nextrdata->buffer = InvalidBuffer; |
| nextrdata->next = nextrdata + 1; |
| nextrdata++; |
| |
| xlinfo = XLOG_BTREE_INSERT_UPPER; |
| } |
| |
| if (BufferIsValid(metabuf)) |
| { |
| xlmeta.root = metad->btm_root; |
| xlmeta.level = metad->btm_level; |
| xlmeta.fastroot = metad->btm_fastroot; |
| xlmeta.fastlevel = metad->btm_fastlevel; |
| |
| nextrdata->data = (char *) &xlmeta; |
| nextrdata->len = sizeof(xl_btree_metadata); |
| nextrdata->buffer = InvalidBuffer; |
| nextrdata->next = nextrdata + 1; |
| nextrdata++; |
| |
| xlinfo = XLOG_BTREE_INSERT_META; |
| } |
| |
| /* Read comments in _bt_pgaddtup */ |
| if (!P_ISLEAF(lpageop) && newitemoff == P_FIRSTDATAKEY(lpageop)) |
| { |
| trunctuple = *itup; |
| trunctuple.t_info = sizeof(IndexTupleData); |
| nextrdata->data = (char *) &trunctuple; |
| nextrdata->len = sizeof(IndexTupleData); |
| } |
| else |
| { |
| nextrdata->data = (char *) itup; |
| nextrdata->len = IndexTupleDSize(*itup); |
| } |
| nextrdata->buffer = buf; |
| nextrdata->buffer_std = true; |
| nextrdata->next = NULL; |
| |
| recptr = XLogInsert(RM_BTREE_ID, xlinfo, rdata); |
| |
| if (BufferIsValid(metabuf)) |
| { |
| PageSetLSN(metapg, recptr); |
| PageSetTLI(metapg, ThisTimeLineID); |
| } |
| |
| PageSetLSN(page, recptr); |
| PageSetTLI(page, ThisTimeLineID); |
| } |
| |
| END_CRIT_SECTION(); |
| |
| /* release buffers; send out relcache inval if metapage changed */ |
| if (BufferIsValid(metabuf)) |
| { |
| if (!InRecovery) |
| CacheInvalidateRelcache(rel); |
| _bt_relbuf(rel, metabuf); |
| } |
| |
| _bt_relbuf(rel, buf); |
| } |
| } |
| |
| /* |
| * _bt_split() -- split a page in the btree. |
| * |
| * On entry, buf is the page to split, and is pinned and write-locked. |
| * firstright is the item index of the first item to be moved to the |
| * new right page. newitemoff etc. tell us about the new item that |
| * must be inserted along with the data from the old page. |
| * |
| * Returns the new right sibling of buf, pinned and write-locked. |
| * The pin and lock on buf are maintained. |
| */ |
| static Buffer |
| _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, |
| OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, |
| bool newitemonleft) |
| { |
| Buffer rbuf; |
| Page origpage; |
| Page leftpage, |
| rightpage; |
| BTPageOpaque ropaque, |
| lopaque, |
| oopaque; |
| Buffer sbuf = InvalidBuffer; |
| Page spage = NULL; |
| BTPageOpaque sopaque = NULL; |
| OffsetNumber itup_off = 0; |
| BlockNumber itup_blkno = 0; |
| Size itemsz; |
| ItemId itemid; |
| IndexTuple item; |
| OffsetNumber leftoff, |
| rightoff; |
| OffsetNumber maxoff; |
| OffsetNumber i; |
| |
| MIRROREDLOCK_BUFMGR_MUST_ALREADY_BE_HELD; |
| |
	/* Fetch gp_persistent_relation_node information that will be added to the XLOG record. */
| RelationFetchGpRelationNodeForXLog(rel); |
| |
| rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE); |
| origpage = BufferGetPage(buf); |
| leftpage = PageGetTempPage(origpage, sizeof(BTPageOpaqueData)); |
| rightpage = BufferGetPage(rbuf); |
| |
| _bt_pageinit(leftpage, BufferGetPageSize(buf)); |
| /* rightpage was already initialized by _bt_getbuf */ |
| |
| /* init btree private data */ |
| oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage); |
| lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage); |
| ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage); |
| |
| /* if we're splitting this page, it won't be the root when we're done */ |
| /* also, clear the SPLIT_END and HAS_GARBAGE flags in both pages */ |
| lopaque->btpo_flags = oopaque->btpo_flags; |
| lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE); |
| ropaque->btpo_flags = lopaque->btpo_flags; |
| lopaque->btpo_prev = oopaque->btpo_prev; |
| lopaque->btpo_next = BufferGetBlockNumber(rbuf); |
| ropaque->btpo_prev = BufferGetBlockNumber(buf); |
| ropaque->btpo_next = oopaque->btpo_next; |
| lopaque->btpo.level = ropaque->btpo.level = oopaque->btpo.level; |
| /* Since we already have write-lock on both pages, ok to read cycleid */ |
| lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel); |
| ropaque->btpo_cycleid = lopaque->btpo_cycleid; |
| |
| /* |
| * If the page we're splitting is not the rightmost page at its level in |
| * the tree, then the first entry on the page is the high key for the |
| * page. We need to copy that to the right half. Otherwise (meaning the |
| * rightmost page case), all the items on the right half will be user |
| * data. |
| */ |
| rightoff = P_HIKEY; |
| |
| if (!P_RIGHTMOST(oopaque)) |
| { |
| itemid = PageGetItemId(origpage, P_HIKEY); |
| itemsz = ItemIdGetLength(itemid); |
| item = (IndexTuple) PageGetItem(origpage, itemid); |
| if (PageAddItem(rightpage, (Item) item, itemsz, rightoff, |
| LP_USED) == InvalidOffsetNumber) |
| elog(PANIC, "failed to add hikey to the right sibling" |
| " while splitting block %u of index \"%s\"", |
| BufferGetBlockNumber(buf), RelationGetRelationName(rel)); |
| rightoff = OffsetNumberNext(rightoff); |
| } |
| |
| /* |
| * The "high key" for the new left page will be the first key that's going |
| * to go into the new right page. This might be either the existing data |
| * item at position firstright, or the incoming tuple. |
| */ |
| leftoff = P_HIKEY; |
| if (!newitemonleft && newitemoff == firstright) |
| { |
| /* incoming tuple will become first on right page */ |
| itemsz = newitemsz; |
| item = newitem; |
| } |
| else |
| { |
| /* existing item at firstright will become first on right page */ |
| itemid = PageGetItemId(origpage, firstright); |
| itemsz = ItemIdGetLength(itemid); |
| item = (IndexTuple) PageGetItem(origpage, itemid); |
| } |
| if (PageAddItem(leftpage, (Item) item, itemsz, leftoff, |
| LP_USED) == InvalidOffsetNumber) |
| elog(PANIC, "failed to add hikey to the left sibling" |
| " while splitting block %u of index \"%s\"", |
| BufferGetBlockNumber(buf), RelationGetRelationName(rel)); |
| leftoff = OffsetNumberNext(leftoff); |
| |
| /* |
| * Now transfer all the data items to the appropriate page |
| */ |
| maxoff = PageGetMaxOffsetNumber(origpage); |
| |
| for (i = P_FIRSTDATAKEY(oopaque); i <= maxoff; i = OffsetNumberNext(i)) |
| { |
| itemid = PageGetItemId(origpage, i); |
| itemsz = ItemIdGetLength(itemid); |
| item = (IndexTuple) PageGetItem(origpage, itemid); |
| |
| /* does new item belong before this one? */ |
| if (i == newitemoff) |
| { |
| if (newitemonleft) |
| { |
| _bt_pgaddtup(rel, leftpage, newitemsz, newitem, leftoff, |
| "left sibling"); |
| itup_off = leftoff; |
| itup_blkno = BufferGetBlockNumber(buf); |
| leftoff = OffsetNumberNext(leftoff); |
| } |
| else |
| { |
| _bt_pgaddtup(rel, rightpage, newitemsz, newitem, rightoff, |
| "right sibling"); |
| itup_off = rightoff; |
| itup_blkno = BufferGetBlockNumber(rbuf); |
| rightoff = OffsetNumberNext(rightoff); |
| } |
| } |
| |
| /* decide which page to put it on */ |
| if (i < firstright) |
| { |
| _bt_pgaddtup(rel, leftpage, itemsz, item, leftoff, |
| "left sibling"); |
| leftoff = OffsetNumberNext(leftoff); |
| } |
| else |
| { |
| _bt_pgaddtup(rel, rightpage, itemsz, item, rightoff, |
| "right sibling"); |
| rightoff = OffsetNumberNext(rightoff); |
| } |
| } |
| |
| /* cope with possibility that newitem goes at the end */ |
| if (i <= newitemoff) |
| { |
| if (newitemonleft) |
| { |
| _bt_pgaddtup(rel, leftpage, newitemsz, newitem, leftoff, |
| "left sibling"); |
| itup_off = leftoff; |
| itup_blkno = BufferGetBlockNumber(buf); |
| leftoff = OffsetNumberNext(leftoff); |
| } |
| else |
| { |
| _bt_pgaddtup(rel, rightpage, newitemsz, newitem, rightoff, |
| "right sibling"); |
| itup_off = rightoff; |
| itup_blkno = BufferGetBlockNumber(rbuf); |
| rightoff = OffsetNumberNext(rightoff); |
| } |
| } |
| |
| /* |
| * We have to grab the right sibling (if any) and fix the prev pointer |
| * there. We are guaranteed that this is deadlock-free since no other |
| * writer will be holding a lock on that page and trying to move left, and |
| * all readers release locks on a page before trying to fetch its |
| * neighbors. |
| */ |
| |
| if (!P_RIGHTMOST(ropaque)) |
| { |
| sbuf = _bt_getbuf(rel, ropaque->btpo_next, BT_WRITE); |
| spage = BufferGetPage(sbuf); |
| sopaque = (BTPageOpaque) PageGetSpecialPointer(spage); |
| if (sopaque->btpo_prev != ropaque->btpo_prev) |
| elog(PANIC, "right sibling's left-link doesn't match: " |
| "block %u links to %u instead of expected %u in index \"%s\"", |
| ropaque->btpo_next, sopaque->btpo_prev, ropaque->btpo_prev, |
| RelationGetRelationName(rel)); |
| |
| /* |
| * Check to see if we can set the SPLIT_END flag in the right-hand |
| * split page; this can save some I/O for vacuum since it need not |
| * proceed to the right sibling. We can set the flag if the right |
| * sibling has a different cycleid: that means it could not be part of |
| * a group of pages that were all split off from the same ancestor |
| * page. If you're confused, imagine that page A splits to A B and |
| * then again, yielding A C B, while vacuum is in progress. Tuples |
| * originally in A could now be in either B or C, hence vacuum must |
| * examine both pages. But if D, our right sibling, has a different |
| * cycleid then it could not contain any tuples that were in A when |
| * the vacuum started. |
| */ |
| if (sopaque->btpo_cycleid != ropaque->btpo_cycleid) |
| ropaque->btpo_flags |= BTP_SPLIT_END; |
| } |
| |
| /* |
| * Right sibling is locked, new siblings are prepared, but original page |
| * is not updated yet. Log changes before continuing. |
| * |
| * NO EREPORT(ERROR) till right sibling is updated. We can get away with |
| * not starting the critical section till here because we haven't been |
| * scribbling on the original page yet, and we don't care about the new |
| * sibling until it's linked into the btree. |
| */ |
| START_CRIT_SECTION(); |
| |
| MarkBufferDirty(buf); |
| MarkBufferDirty(rbuf); |
| |
| if (!P_RIGHTMOST(ropaque)) |
| { |
| sopaque->btpo_prev = BufferGetBlockNumber(rbuf); |
| MarkBufferDirty(sbuf); |
| } |
| |
| /* XLOG stuff */ |
| if (!rel->rd_istemp) |
| { |
| xl_btree_split xlrec; |
| uint8 xlinfo; |
| XLogRecPtr recptr; |
| XLogRecData rdata[4]; |
| |
| xl_btreetid_set(&(xlrec.target), rel, itup_blkno, itup_off); |
| if (newitemonleft) |
| xlrec.otherblk = BufferGetBlockNumber(rbuf); |
| else |
| xlrec.otherblk = BufferGetBlockNumber(buf); |
| xlrec.leftblk = lopaque->btpo_prev; |
| xlrec.rightblk = ropaque->btpo_next; |
| xlrec.level = lopaque->btpo.level; |
| |
| /* |
		 * Direct access to the page is not good style, but it is faster; we
		 * should implement a new function in the page API.  Note we only
		 * store the tuples themselves, knowing that the item pointers are
		 * in the same order and can be reconstructed by scanning the
		 * tuples.  See comments for _bt_restore_page().
| */ |
| xlrec.leftlen = ((PageHeader) leftpage)->pd_special - |
| ((PageHeader) leftpage)->pd_upper; |
| |
| rdata[0].data = (char *) &xlrec; |
| rdata[0].len = SizeOfBtreeSplit; |
| rdata[0].buffer = InvalidBuffer; |
| rdata[0].next = &(rdata[1]); |
| |
| rdata[1].data = (char *) leftpage + ((PageHeader) leftpage)->pd_upper; |
| rdata[1].len = xlrec.leftlen; |
| rdata[1].buffer = InvalidBuffer; |
| rdata[1].next = &(rdata[2]); |
| |
| rdata[2].data = (char *) rightpage + ((PageHeader) rightpage)->pd_upper; |
| rdata[2].len = ((PageHeader) rightpage)->pd_special - |
| ((PageHeader) rightpage)->pd_upper; |
| rdata[2].buffer = InvalidBuffer; |
| rdata[2].next = NULL; |
| |
| if (!P_RIGHTMOST(ropaque)) |
| { |
| rdata[2].next = &(rdata[3]); |
| rdata[3].data = NULL; |
| rdata[3].len = 0; |
| rdata[3].buffer = sbuf; |
| rdata[3].buffer_std = true; |
| rdata[3].next = NULL; |
| } |
| |
| if (P_ISROOT(oopaque)) |
| xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L_ROOT : XLOG_BTREE_SPLIT_R_ROOT; |
| else |
| xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R; |
| |
| recptr = XLogInsert(RM_BTREE_ID, xlinfo, rdata); |
| |
| PageSetLSN(leftpage, recptr); |
| PageSetTLI(leftpage, ThisTimeLineID); |
| PageSetLSN(rightpage, recptr); |
| PageSetTLI(rightpage, ThisTimeLineID); |
| if (!P_RIGHTMOST(ropaque)) |
| { |
| PageSetLSN(spage, recptr); |
| PageSetTLI(spage, ThisTimeLineID); |
| } |
| } |
| |
| /* |
| * By here, the original data page has been split into two new halves, and |
| * these are correct. The algorithm requires that the left page never |
| * move during a split, so we copy the new left page back on top of the |
| * original. Note that this is not a waste of time, since we also require |
| * (in the page management code) that the center of a page always be |
| * clean, and the most efficient way to guarantee this is just to compact |
| * the data by reinserting it into a new left page. (XXX the latter |
| * comment is probably obsolete.) |
| * |
| * It's a bit weird that we don't fill in the left page till after writing |
| * the XLOG entry, but not really worth changing. Note that we use the |
| * origpage data (specifically its BTP_ROOT bit) while preparing the XLOG |
| * entry, so simply reshuffling the code won't do. |
| */ |
| |
| PageRestoreTempPage(leftpage, origpage); |
| |
| END_CRIT_SECTION(); |
| |
| /* release the old right sibling */ |
| if (!P_RIGHTMOST(ropaque)) |
| _bt_relbuf(rel, sbuf); |
| |
| /* split's done */ |
| return rbuf; |
| } |
| |
| /* |
| * _bt_findsplitloc() -- find an appropriate place to split a page. |
| * |
| * The idea here is to equalize the free space that will be on each split |
| * page, *after accounting for the inserted tuple*. (If we fail to account |
| * for it, we might find ourselves with too little room on the page that |
| * it needs to go into!) |
| * |
| * If the page is the rightmost page on its level, we instead try to arrange |
| * to leave the left split page fillfactor% full. In this way, when we are |
| * inserting successively increasing keys (consider sequences, timestamps, |
| * etc) we will end up with a tree whose pages are about fillfactor% full, |
| * instead of the 50% full result that we'd get without this special case. |
| * This is the same as nbtsort.c produces for a newly-created tree. Note |
| * that leaf and nonleaf pages use different fillfactors. |
| * |
| * We are passed the intended insert position of the new tuple, expressed as |
| * the offsetnumber of the tuple it must go in front of. (This could be |
| * maxoff+1 if the tuple is to go at the end.) |
| * |
| * We return the index of the first existing tuple that should go on the |
| * righthand page, plus a boolean indicating whether the new tuple goes on |
| * the left or right page. The bool is necessary to disambiguate the case |
| * where firstright == newitemoff. |
| */ |
| static OffsetNumber |
| _bt_findsplitloc(Relation rel, |
| Page page, |
| OffsetNumber newitemoff, |
| Size newitemsz, |
| bool *newitemonleft) |
| { |
| BTPageOpaque opaque; |
| OffsetNumber offnum; |
| OffsetNumber maxoff; |
| ItemId itemid; |
| FindSplitData state; |
| int leftspace, |
| rightspace, |
| goodenough, |
| dataitemtotal, |
| dataitemstoleft; |
| |
| opaque = (BTPageOpaque) PageGetSpecialPointer(page); |
| |
| /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */ |
| newitemsz += sizeof(ItemIdData); |
| state.newitemsz = newitemsz; |
| state.is_leaf = P_ISLEAF(opaque); |
| state.is_rightmost = P_RIGHTMOST(opaque); |
| state.have_split = false; |
| if (state.is_leaf) |
| state.fillfactor = RelationGetFillFactor(rel, |
| BTREE_DEFAULT_FILLFACTOR); |
| else |
| state.fillfactor = BTREE_NONLEAF_FILLFACTOR; |
| state.newitemonleft = false; /* these just to keep compiler quiet */ |
| state.firstright = 0; |
| state.best_delta = 0; |
| |
| /* Total free space available on a btree page, after fixed overhead */ |
| leftspace = rightspace = |
| PageGetPageSize(page) - SizeOfPageHeaderData - |
| MAXALIGN(sizeof(BTPageOpaqueData)); |
| |
| /* |
| * Finding the best possible split would require checking all the possible |
| * split points, because of the high-key and left-key special cases. |
| * That's probably more work than it's worth; instead, stop as soon as we |
| * find a "good-enough" split, where good-enough is defined as an |
| * imbalance in free space of no more than pagesize/16 (arbitrary...) This |
| * should let us stop near the middle on most pages, instead of plowing to |
| * the end. |
| */ |
| goodenough = leftspace / 16; |
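	/*
	 * As a rough figure (assuming the default 8 kB page size): leftspace is
	 * on the order of 8150 bytes, so goodenough comes out around 510 bytes.
	 */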
| |
| /* The right page will have the same high key as the old page */ |
| if (!P_RIGHTMOST(opaque)) |
| { |
| itemid = PageGetItemId(page, P_HIKEY); |
| rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) + |
| sizeof(ItemIdData)); |
| } |
| |
| /* Count up total space in data items without actually scanning 'em */ |
| dataitemtotal = rightspace - (int) PageGetFreeSpace(page); |
| |
| /* |
| * Scan through the data items and calculate space usage for a split at |
| * each possible position. |
| */ |
| dataitemstoleft = 0; |
| maxoff = PageGetMaxOffsetNumber(page); |
| |
| for (offnum = P_FIRSTDATAKEY(opaque); |
| offnum <= maxoff; |
| offnum = OffsetNumberNext(offnum)) |
| { |
| Size itemsz; |
| int leftfree, |
| rightfree; |
| |
| itemid = PageGetItemId(page, offnum); |
| itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData); |
| |
| /* |
| * We have to allow for the current item becoming the high key of the |
| * left page; therefore it counts against left space as well as right |
| * space. |
| */ |
| leftfree = leftspace - dataitemstoleft - (int) itemsz; |
| rightfree = rightspace - (dataitemtotal - dataitemstoleft); |
| |
| /* |
| * Will the new item go to left or right of split? |
| */ |
| if (offnum > newitemoff) |
| _bt_checksplitloc(&state, offnum, leftfree, rightfree, |
| true, itemsz); |
| else if (offnum < newitemoff) |
| _bt_checksplitloc(&state, offnum, leftfree, rightfree, |
| false, itemsz); |
| else |
| { |
| /* need to try it both ways! */ |
| _bt_checksplitloc(&state, offnum, leftfree, rightfree, |
| true, itemsz); |
| /* |
| * Here we are contemplating newitem as first on right. In this |
| * case it, not the current item, will become the high key of the |
| * left page, and so we have to correct the allowance made above. |
| */ |
| leftfree += (int) itemsz - (int) newitemsz; |
| _bt_checksplitloc(&state, offnum, leftfree, rightfree, |
| false, newitemsz); |
| } |
| |
| /* Abort scan once we find a good-enough choice */ |
| if (state.have_split && state.best_delta <= goodenough) |
| break; |
| |
| dataitemstoleft += itemsz; |
| } |
| |
| /* |
| * I believe it is not possible to fail to find a feasible split, but just |
| * in case ... |
| */ |
| if (!state.have_split) |
| elog(ERROR, "could not find a feasible split point for index \"%s\"", |
| RelationGetRelationName(rel)); |
| |
| *newitemonleft = state.newitemonleft; |
| return state.firstright; |
| } |
| |
| /* |
| * Subroutine to analyze a particular possible split choice (ie, firstright |
| * and newitemonleft settings), and record the best split so far in *state. |
| */ |
| static void |
| _bt_checksplitloc(FindSplitData *state, OffsetNumber firstright, |
| int leftfree, int rightfree, |
| bool newitemonleft, Size firstrightitemsz) |
| { |
| /* |
| * Account for the new item on whichever side it is to be put. |
| */ |
| if (newitemonleft) |
| leftfree -= (int) state->newitemsz; |
| else |
| rightfree -= (int) state->newitemsz; |
| |
| /* |
| * If we are not on the leaf level, we will be able to discard the key |
| * data from the first item that winds up on the right page. |
| */ |
| if (!state->is_leaf) |
| rightfree += (int) firstrightitemsz - |
| (int) (MAXALIGN(sizeof(IndexTupleData)) + sizeof(ItemIdData)); |
| |
| /* |
| * If feasible split point, remember best delta. |
| */ |
| if (leftfree >= 0 && rightfree >= 0) |
| { |
| int delta; |
| |
| if (state->is_rightmost) |
| { |
| /* |
| * If splitting a rightmost page, try to put (100-fillfactor)% of |
| * free space on left page. See comments for _bt_findsplitloc. |
| */ |
| delta = (state->fillfactor * leftfree) |
| - ((100 - state->fillfactor) * rightfree); |
| } |
| else |
| { |
| /* Otherwise, aim for equal free space on both sides */ |
| delta = leftfree - rightfree; |
| } |
| |
| if (delta < 0) |
| delta = -delta; |
| if (!state->have_split || delta < state->best_delta) |
| { |
| state->have_split = true; |
| state->newitemonleft = newitemonleft; |
| state->firstright = firstright; |
| state->best_delta = delta; |
| } |
| } |
| } |
| |
| /* |
| * _bt_insert_parent() -- Insert downlink into parent after a page split. |
| * |
| * On entry, buf and rbuf are the left and right split pages, which we |
| * still hold write locks on per the L&Y algorithm. We release the |
| * write locks once we have write lock on the parent page. (Any sooner, |
| * and it'd be possible for some other process to try to split or delete |
| * one of these pages, and get confused because it cannot find the downlink.) |
| * |
| * stack - stack showing how we got here. May be NULL in cases that don't |
| * have to be efficient (concurrent ROOT split, WAL recovery) |
| * is_root - we split the true root |
| * is_only - we split a page alone on its level (might have been fast root) |
| * |
| * This is exported so it can be called by nbtxlog.c. |
| */ |
| void |
| _bt_insert_parent(Relation rel, |
| Buffer buf, |
| Buffer rbuf, |
| BTStack stack, |
| bool is_root, |
| bool is_only) |
| { |
| MIRROREDLOCK_BUFMGR_MUST_ALREADY_BE_HELD; |
| |
| /* |
| * Here we have to do something Lehman and Yao don't talk about: deal with |
| * a root split and construction of a new root. If our stack is empty |
| * then we have just split a node on what had been the root level when we |
| * descended the tree. If it was still the root then we perform a |
| * new-root construction. If it *wasn't* the root anymore, search to find |
| * the next higher level that someone constructed meanwhile, and find the |
| * right place to insert as for the normal case. |
| * |
| * If we have to search for the parent level, we do so by re-descending |
| * from the root. This is not super-efficient, but it's rare enough not |
| * to matter. (This path is also taken when called from WAL recovery --- |
| * we have no stack in that case.) |
| */ |
| if (is_root) |
| { |
| Buffer rootbuf; |
| |
| Assert(stack == NULL); |
| Assert(is_only); |
| /* create a new root node and update the metapage */ |
| rootbuf = _bt_newroot(rel, buf, rbuf); |
| /* release the split buffers */ |
| _bt_relbuf(rel, rootbuf); |
| _bt_relbuf(rel, rbuf); |
| _bt_relbuf(rel, buf); |
| } |
| else |
| { |
| BlockNumber bknum = BufferGetBlockNumber(buf); |
| BlockNumber rbknum = BufferGetBlockNumber(rbuf); |
| Page page = BufferGetPage(buf); |
| IndexTuple new_item; |
| BTStackData fakestack; |
| IndexTuple ritem; |
| Buffer pbuf; |
| |
| if (stack == NULL) |
| { |
| BTPageOpaque lpageop; |
| |
| if (!InRecovery) |
| elog(DEBUG2, "concurrent ROOT page split"); |
| lpageop = (BTPageOpaque) PageGetSpecialPointer(page); |
| /* Find the leftmost page at the next level up */ |
| pbuf = _bt_get_endpoint(rel, lpageop->btpo.level + 1, false); |
| /* Set up a phony stack entry pointing there */ |
| stack = &fakestack; |
| stack->bts_blkno = BufferGetBlockNumber(pbuf); |
| stack->bts_offset = InvalidOffsetNumber; |
| /* bts_btentry will be initialized below */ |
| stack->bts_parent = NULL; |
| _bt_relbuf(rel, pbuf); |
| } |
| |
| /* get high key from left page == lowest key on new right page */ |
| ritem = (IndexTuple) PageGetItem(page, |
| PageGetItemId(page, P_HIKEY)); |
| |
| /* form an index tuple that points at the new right page */ |
| new_item = CopyIndexTuple(ritem); |
| ItemPointerSet(&(new_item->t_tid), rbknum, P_HIKEY); |
| |
| /* |
| * Find the parent buffer and get the parent page. |
| * |
		 * Note: if we were moved right, we need to update the stack entry,
		 * since we want to find the parent that points to where we are
		 * now. - vadim 05/27/97
| */ |
| ItemPointerSet(&(stack->bts_btentry.t_tid), bknum, P_HIKEY); |
| |
| pbuf = _bt_getstackbuf(rel, stack, BT_WRITE); |
| |
| /* Now we can unlock the children */ |
| _bt_relbuf(rel, rbuf); |
| _bt_relbuf(rel, buf); |
| |
| /* Check for error only after writing children */ |
| if (pbuf == InvalidBuffer) |
| elog(ERROR, "failed to re-find parent key in index \"%s\" for split pages %u/%u", |
| RelationGetRelationName(rel), bknum, rbknum); |
| |
| /* Recursively update the parent */ |
| _bt_insertonpg(rel, pbuf, stack->bts_parent, |
| 0, NULL, new_item, stack->bts_offset, |
| is_only); |
| |
| /* be tidy */ |
| pfree(new_item); |
| } |
| } |
| |
| /* |
| * _bt_getstackbuf() -- Walk back up the tree one step, and find the item |
| * we last looked at in the parent. |
| * |
| * This is possible because we save the downlink from the parent item, |
| * which is enough to uniquely identify it. Insertions into the parent |
| * level could cause the item to move right; deletions could cause it |
| * to move left, but not left of the page we previously found it in. |
| * |
| * Adjusts bts_blkno & bts_offset if changed. |
| * |
| * Returns InvalidBuffer if item not found (should not happen). |
| */ |
| Buffer |
| _bt_getstackbuf(Relation rel, BTStack stack, int access) |
| { |
| BlockNumber blkno; |
| OffsetNumber start; |
| |
| MIRROREDLOCK_BUFMGR_MUST_ALREADY_BE_HELD; |
| |
| blkno = stack->bts_blkno; |
| start = stack->bts_offset; |
| |
| for (;;) |
| { |
| Buffer buf; |
| Page page; |
| BTPageOpaque opaque; |
| |
| buf = _bt_getbuf(rel, blkno, access); |
| page = BufferGetPage(buf); |
| opaque = (BTPageOpaque) PageGetSpecialPointer(page); |
| |
| if (!P_IGNORE(opaque)) |
| { |
| OffsetNumber offnum, |
| minoff, |
| maxoff; |
| ItemId itemid; |
| IndexTuple item; |
| |
| minoff = P_FIRSTDATAKEY(opaque); |
| maxoff = PageGetMaxOffsetNumber(page); |
| |
| /* |
			 * start = InvalidOffsetNumber means "search the whole page". We
			 * need this test anyway, due to the possibility that the page
			 * has a high key now when it didn't before.
| */ |
| if (start < minoff) |
| start = minoff; |
| |
| /* |
			 * Need this check too, to guard against the possibility that
			 * the page has split since we visited it originally.
| */ |
| if (start > maxoff) |
| start = OffsetNumberNext(maxoff); |
| |
| /* |
| * These loops will check every item on the page --- but in an |
| * order that's attuned to the probability of where it actually |
| * is. Scan to the right first, then to the left. |
| */ |
| for (offnum = start; |
| offnum <= maxoff; |
| offnum = OffsetNumberNext(offnum)) |
| { |
| itemid = PageGetItemId(page, offnum); |
| item = (IndexTuple) PageGetItem(page, itemid); |
| if (BTEntrySame(item, &stack->bts_btentry)) |
| { |
| /* Return accurate pointer to where link is now */ |
| stack->bts_blkno = blkno; |
| stack->bts_offset = offnum; |
| return buf; |
| } |
| } |
| |
| for (offnum = OffsetNumberPrev(start); |
| offnum >= minoff; |
| offnum = OffsetNumberPrev(offnum)) |
| { |
| itemid = PageGetItemId(page, offnum); |
| item = (IndexTuple) PageGetItem(page, itemid); |
| if (BTEntrySame(item, &stack->bts_btentry)) |
| { |
| /* Return accurate pointer to where link is now */ |
| stack->bts_blkno = blkno; |
| stack->bts_offset = offnum; |
| return buf; |
| } |
| } |
| } |
| |
| /* |
| * The item we're looking for moved right at least one page. |
| */ |
| if (P_RIGHTMOST(opaque)) |
| { |
| _bt_relbuf(rel, buf); |
| return InvalidBuffer; |
| } |
| blkno = opaque->btpo_next; |
| start = InvalidOffsetNumber; |
| _bt_relbuf(rel, buf); |
| } |
| } |
| |
| /* |
| * _bt_newroot() -- Create a new root page for the index. |
| * |
| * We've just split the old root page and need to create a new one. |
| * In order to do this, we add a new root page to the file, then lock |
| * the metadata page and update it. This is guaranteed to be deadlock- |
| * free, because all readers release their locks on the metadata page |
| * before trying to lock the root, and all writers lock the root before |
| * trying to lock the metadata page. We have a write lock on the old |
| * root page, so we have not introduced any cycles into the waits-for |
| * graph. |
| * |
| * On entry, lbuf (the old root) and rbuf (its new peer) are write- |
| * locked. On exit, a new root page exists with entries for the |
| * two new children, metapage is updated and unlocked/unpinned. |
| * The new root buffer is returned to caller which has to unlock/unpin |
| * lbuf, rbuf & rootbuf. |
| */ |
| static Buffer |
| _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) |
| { |
| Buffer rootbuf; |
| Page lpage, |
| rootpage; |
| BlockNumber lbkno, |
| rbkno; |
| BlockNumber rootblknum; |
| BTPageOpaque rootopaque; |
| ItemId itemid; |
| IndexTuple item; |
| Size itemsz; |
| IndexTuple new_item; |
| Buffer metabuf; |
| Page metapg; |
| BTMetaPageData *metad; |
| |
| MIRROREDLOCK_BUFMGR_MUST_ALREADY_BE_HELD; |
| |
	/* Fetch gp_persistent_relation_node information that will be added to the XLOG record. */
| RelationFetchGpRelationNodeForXLog(rel); |
| |
| lbkno = BufferGetBlockNumber(lbuf); |
| rbkno = BufferGetBlockNumber(rbuf); |
| lpage = BufferGetPage(lbuf); |
| |
| /* get a new root page */ |
| rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE); |
| rootpage = BufferGetPage(rootbuf); |
| rootblknum = BufferGetBlockNumber(rootbuf); |
| |
| /* acquire lock on the metapage */ |
| metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE); |
| metapg = BufferGetPage(metabuf); |
| metad = BTPageGetMeta(metapg); |
| |
| /* NO EREPORT(ERROR) from here till newroot op is logged */ |
| START_CRIT_SECTION(); |
| |
| /* set btree special data */ |
| rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage); |
| rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE; |
| rootopaque->btpo_flags = BTP_ROOT; |
| rootopaque->btpo.level = |
| ((BTPageOpaque) PageGetSpecialPointer(lpage))->btpo.level + 1; |
| rootopaque->btpo_cycleid = 0; |
| |
| /* update metapage data */ |
| metad->btm_root = rootblknum; |
| metad->btm_level = rootopaque->btpo.level; |
| metad->btm_fastroot = rootblknum; |
| metad->btm_fastlevel = rootopaque->btpo.level; |
| |
| /* |
| * Create downlink item for left page (old root). Since this will be the |
| * first item in a non-leaf page, it implicitly has minus-infinity key |
| * value, so we need not store any actual key in it. |
| */ |
| itemsz = sizeof(IndexTupleData); |
| new_item = (IndexTuple) palloc(itemsz); |
| new_item->t_info = itemsz; |
| ItemPointerSet(&(new_item->t_tid), lbkno, P_HIKEY); |
| |
| /* |
| * Insert the left page pointer into the new root page. The root page is |
| * the rightmost page on its level so there is no "high key" in it; the |
| * two items will go into positions P_HIKEY and P_FIRSTKEY. |
| * |
| * Note: we *must* insert the two items in item-number order, for the |
| * benefit of _bt_restore_page(). |
| */ |
| if (PageAddItem(rootpage, (Item) new_item, itemsz, P_HIKEY, LP_USED) == InvalidOffsetNumber) |
| elog(PANIC, "failed to add leftkey to new root page" |
| " while splitting block %u of index \"%s\"", |
| BufferGetBlockNumber(lbuf), RelationGetRelationName(rel)); |
| pfree(new_item); |
| |
| /* |
| * Create downlink item for right page. The key for it is obtained from |
| * the "high key" position in the left page. |
| */ |
| itemid = PageGetItemId(lpage, P_HIKEY); |
| itemsz = ItemIdGetLength(itemid); |
| item = (IndexTuple) PageGetItem(lpage, itemid); |
| new_item = CopyIndexTuple(item); |
| ItemPointerSet(&(new_item->t_tid), rbkno, P_HIKEY); |
| |
| /* |
| * insert the right page pointer into the new root page. |
| */ |
| if (PageAddItem(rootpage, (Item) new_item, itemsz, P_FIRSTKEY, LP_USED) == InvalidOffsetNumber) |
| elog(PANIC, "failed to add rightkey to new root page" |
| " while splitting block %u of index \"%s\"", |
| BufferGetBlockNumber(lbuf), RelationGetRelationName(rel)); |
| pfree(new_item); |
| |
| MarkBufferDirty(rootbuf); |
| MarkBufferDirty(metabuf); |
| |
| /* XLOG stuff */ |
| if (!rel->rd_istemp) |
| { |
| xl_btree_newroot xlrec; |
| XLogRecPtr recptr; |
| XLogRecData rdata[2]; |
| |
| xl_btreenode_set(&(xlrec.btreenode), rel); |
| xlrec.rootblk = rootblknum; |
| xlrec.level = metad->btm_level; |
| |
| rdata[0].data = (char *) &xlrec; |
| rdata[0].len = SizeOfBtreeNewroot; |
| rdata[0].buffer = InvalidBuffer; |
| rdata[0].next = &(rdata[1]); |
| |
| /* |
		 * Direct access to the page is not good style, but it is faster; we
		 * should implement a new function in the page API.
| */ |
| rdata[1].data = (char *) rootpage + ((PageHeader) rootpage)->pd_upper; |
| rdata[1].len = ((PageHeader) rootpage)->pd_special - |
| ((PageHeader) rootpage)->pd_upper; |
| rdata[1].buffer = InvalidBuffer; |
| rdata[1].next = NULL; |
| |
| recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT, rdata); |
| |
| PageSetLSN(rootpage, recptr); |
| PageSetTLI(rootpage, ThisTimeLineID); |
| PageSetLSN(metapg, recptr); |
| PageSetTLI(metapg, ThisTimeLineID); |
| } |
| |
| END_CRIT_SECTION(); |
| |
| /* send out relcache inval for metapage change */ |
| if (!InRecovery) |
| CacheInvalidateRelcache(rel); |
| |
| /* done with metapage */ |
| _bt_relbuf(rel, metabuf); |
| |
| return rootbuf; |
| } |
| |
| /* |
| * _bt_pgaddtup() -- add a tuple to a particular page in the index. |
| * |
| * This routine adds the tuple to the page as requested. It does |
| * not affect pin/lock status, but you'd better have a write lock |
| * and pin on the target buffer! Don't forget to write and release |
| * the buffer afterwards, either. |
| * |
| * The main difference between this routine and a bare PageAddItem call |
| * is that this code knows that the leftmost index tuple on a non-leaf |
| * btree page doesn't need to have a key. Therefore, it strips such |
| * tuples down to just the tuple header. CAUTION: this works ONLY if |
| * we insert the tuples in order, so that the given itup_off does |
| * represent the final position of the tuple! |
| */ |
| static void |
| _bt_pgaddtup(Relation rel, |
| Page page, |
| Size itemsize, |
| IndexTuple itup, |
| OffsetNumber itup_off, |
| const char *where) |
| { |
| BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); |
| IndexTupleData trunctuple; |
| |
| if (!P_ISLEAF(opaque) && itup_off == P_FIRSTDATAKEY(opaque)) |
| { |
| trunctuple = *itup; |
| trunctuple.t_info = sizeof(IndexTupleData); |
| itup = &trunctuple; |
| itemsize = sizeof(IndexTupleData); |
| } |
| |
| if (PageAddItem(page, (Item) itup, itemsize, itup_off, |
| LP_USED) == InvalidOffsetNumber) |
| elog(PANIC, "failed to add item to the %s in index \"%s\"", |
| where, RelationGetRelationName(rel)); |
| } |
| |
| /* |
| * _bt_isequal - used in _bt_doinsert in check for duplicates. |
| * |
| * This is very similar to _bt_compare, except for NULL handling. |
 * The rule is simple: NOT_NULL is not equal to NULL, and NULL is not equal
 * to NULL either.
| */ |
| static bool |
| _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum, |
| int keysz, ScanKey scankey) |
| { |
| IndexTuple itup; |
| int i; |
| |
| /* Better be comparing to a leaf item */ |
| Assert(P_ISLEAF((BTPageOpaque) PageGetSpecialPointer(page))); |
| |
| itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); |
| |
| for (i = 1; i <= keysz; i++) |
| { |
| AttrNumber attno; |
| Datum datum; |
| bool isNull; |
| int32 result; |
| |
| attno = scankey->sk_attno; |
| Assert(attno == i); |
| datum = index_getattr(itup, attno, itupdesc, &isNull); |
| |
| /* NULLs are never equal to anything */ |
| if (isNull || (scankey->sk_flags & SK_ISNULL)) |
| return false; |
| |
| result = DatumGetInt32(FunctionCall2(&scankey->sk_func, |
| datum, |
| scankey->sk_argument)); |
| |
| if (result != 0) |
| return false; |
| |
| scankey++; |
| } |
| |
| /* if we get here, the keys are equal */ |
| return true; |
| } |
| |
| /* |
| * _bt_vacuum_one_page - vacuum just one index page. |
| * |
| * Try to remove LP_DELETE items from the given page. The passed buffer |
| * must be exclusive-locked, but unlike a real VACUUM, we don't need a |
| * super-exclusive "cleanup" lock (see nbtree/README). |
| */ |
| static void |
| _bt_vacuum_one_page(Relation rel, Buffer buffer) |
| { |
| OffsetNumber deletable[MaxOffsetNumber]; |
| int ndeletable = 0; |
| OffsetNumber offnum, |
| minoff, |
| maxoff; |
| Page page = BufferGetPage(buffer); |
| BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); |
| |
| /* |
	 * Scan over all items to see which ones need to be deleted according to
| * LP_DELETE flags. |
| */ |
| minoff = P_FIRSTDATAKEY(opaque); |
| maxoff = PageGetMaxOffsetNumber(page); |
| for (offnum = minoff; |
| offnum <= maxoff; |
| offnum = OffsetNumberNext(offnum)) |
| { |
| ItemId itemId = PageGetItemId(page, offnum); |
| |
| if (ItemIdDeleted(itemId)) |
| deletable[ndeletable++] = offnum; |
| } |
| |
| if (ndeletable > 0) |
| _bt_delitems(rel, buffer, deletable, ndeletable); |
| |
| /* |
| * Note: if we didn't find any LP_DELETE items, then the page's |
| * BTP_HAS_GARBAGE hint bit is falsely set. We do not bother expending a |
| * separate write to clear it, however. We will clear it when we split |
| * the page. |
| */ |
| } |