src/backend/access/nbtree/nbtutils.c - hawq - Git at Google

 /*-------------------------------------------------------------------------
  *
  * nbtutils.c
  *	  Utility code for Postgres btree implementation.
  *
  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  *
  * IDENTIFICATION
  *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtutils.c,v 1.79.2.1 2007/03/30 00:13:05 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */

 #include "postgres.h"

 #include <time.h>

 #include "access/genam.h"
 #include "access/nbtree.h"
 #include "access/reloptions.h"
 #include "executor/execdebug.h"
 #include "miscadmin.h"
 #include "storage/lwlock.h"
 #include "storage/shmem.h"


 static void _bt_mark_scankey_required(ScanKey skey);
 static bool _bt_check_rowcompare(ScanKey skey,
 					 IndexTuple tuple, TupleDesc tupdesc,
 					 ScanDirection dir, bool *continuescan);


 /*
  * _bt_mkscankey
  *		Build an insertion scan key that contains comparison data from itup
  *		as well as comparator routines appropriate to the key datatypes.
  *
  *		The result is intended for use with _bt_compare().
  */
 ScanKey
 _bt_mkscankey(Relation rel, IndexTuple itup)
 {
 	ScanKey		skey;
 	TupleDesc	itupdesc;
 	int			natts;
 	int			i;

 	itupdesc = RelationGetDescr(rel);
 	natts = RelationGetNumberOfAttributes(rel);

 	skey = (ScanKey) palloc(natts * sizeof(ScanKeyData));

 	for (i = 0; i < natts; i++)
 	{
 		FmgrInfo   *procinfo;
 		Datum		arg;
 		bool		null;

 		/*
 		 * We can use the cached (default) support procs since no cross-type
 		 * comparison can be needed.
 		 */
 		procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC);
 		arg = index_getattr(itup, i + 1, itupdesc, &null);
 		ScanKeyEntryInitializeWithInfo(&skey[i],
 									   null ? SK_ISNULL : 0,
 									   (AttrNumber) (i + 1),
 									   InvalidStrategy,
 									   InvalidOid,
 									   procinfo,
 									   arg);
 	}

 	return skey;
 }

 /*
  * _bt_mkscankey_nodata
  *		Build an insertion scan key that contains 3-way comparator routines
  *		appropriate to the key datatypes, but no comparison data.  The
  *		comparison data ultimately used must match the key datatypes.
  *
  *		The result cannot be used with _bt_compare(), unless comparison
  *		data is first stored into the key entries.	Currently this
  *		routine is only called by nbtsort.c and tuplesort.c, which have
  *		their own comparison routines.
  */
 ScanKey
 _bt_mkscankey_nodata(Relation rel)
 {
 	ScanKey		skey;
 	int			natts;
 	int			i;

 	natts = RelationGetNumberOfAttributes(rel);

 	skey = (ScanKey) palloc(natts * sizeof(ScanKeyData));

 	for (i = 0; i < natts; i++)
 	{
 		FmgrInfo   *procinfo;

 		/*
 		 * We can use the cached (default) support procs since no cross-type
 		 * comparison can be needed.
 		 */
 		procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC);
 		ScanKeyEntryInitializeWithInfo(&skey[i],
 									   SK_ISNULL,
 									   (AttrNumber) (i + 1),
 									   InvalidStrategy,
 									   InvalidOid,
 									   procinfo,
 									   (Datum) 0);
 	}

 	return skey;
 }

 /*
  * free a scan key made by either _bt_mkscankey or _bt_mkscankey_nodata.
  */
 void
 _bt_freeskey(ScanKey skey)
 {
 	pfree(skey);
 }

 /*
  * free a retracement stack made by _bt_search.
  */
 void
 _bt_freestack(BTStack stack)
 {
 	BTStack		ostack;

 	while (stack != NULL)
 	{
 		ostack = stack;
 		stack = stack->bts_parent;
 		pfree(ostack);
 	}
 }


 /*----------
  *	_bt_preprocess_keys() -- Preprocess scan keys
  *
  * The caller-supplied search-type keys (in scan->keyData[]) are copied to
  * so->keyData[] with possible transformation.	scan->numberOfKeys is
  * the number of input keys, so->numberOfKeys gets the number of output
  * keys (possibly less, never greater).
  *
  * The primary purpose of this routine is to discover how many scan keys
  * must be satisfied to continue the scan.	It also attempts to eliminate
  * redundant keys and detect contradictory keys.  At present, redundant and
  * contradictory keys can only be detected for same-data-type comparisons,
  * but that's the usual case so it seems worth doing.
  *
  * The output keys must be sorted by index attribute.  Presently we expect
  * (but verify) that the input keys are already so sorted --- this is done
  * by group_clauses_by_indexkey() in indxpath.c.  Some reordering of the keys
  * within each attribute may be done as a byproduct of the processing here,
  * but no other code depends on that.
  *
  * The output keys are marked with flags SK_BT_REQFWD and/or SK_BT_REQBKWD
  * if they must be satisfied in order to continue the scan forward or backward
  * respectively.  _bt_checkkeys uses these flags.  For example, if the quals
  * are "x = 1 AND y < 4 AND z < 5", then _bt_checkkeys will reject a tuple
  * (1,2,7), but we must continue the scan in case there are tuples (1,3,z).
  * But once we reach tuples like (1,4,z) we can stop scanning because no
  * later tuples could match.  This is reflected by marking the x and y keys,
  * but not the z key, with SK_BT_REQFWD.  In general, the keys for leading
  * attributes with "=" keys are marked both SK_BT_REQFWD and SK_BT_REQBKWD.
  * For the first attribute without an "=" key, any "<" and "<=" keys are
  * marked SK_BT_REQFWD while any ">" and ">=" keys are marked SK_BT_REQBKWD.
  * This can be seen to be correct by considering the above example.  Note
  * in particular that if there are no keys for a given attribute, the keys for
  * subsequent attributes can never be required; for instance "WHERE y = 4"
  * requires a full-index scan.
  *
  * If possible, redundant keys are eliminated: we keep only the tightest
  * >/>= bound and the tightest </<= bound, and if there's an = key then
  * that's the only one returned.  (So, we return either a single = key,
  * or one or two boundary-condition keys for each attr.)  However, we can
  * only detect redundant keys when the right-hand datatypes are all equal
  * to the index datatype, because we do not know suitable operators for
  * comparing right-hand values of two different datatypes.	(In theory
  * we could handle comparison of a RHS of the index datatype with a RHS of
  * another type, but that seems too much pain for too little gain.)  So,
  * keys whose operator has a nondefault subtype (ie, its RHS is not of the
  * index datatype) are ignored here, except for noting whether they include
  * an "=" condition or not.  The logic about required keys still works if
  * we don't eliminate redundant keys.
  *
  * As a byproduct of this work, we can detect contradictory quals such
  * as "x = 1 AND x > 2".  If we see that, we return so->quals_ok = FALSE,
  * indicating the scan need not be run at all since no tuples can match.
  * Again though, only keys with RHS datatype equal to the index datatype
  * can be checked for contradictions.
  *
  * Row comparison keys are treated the same as comparisons to nondefault
  * datatypes: we just transfer them into the preprocessed array without any
  * editorialization.  We can treat them the same as an ordinary inequality
  * comparison on the row's first index column, for the purposes of the logic
  * about required keys.
  *
  * Note: the reason we have to copy the preprocessed scan keys into private
  * storage is that we are modifying the array based on comparisons of the
  * key argument values, which could change on a rescan.  Therefore we can't
  * overwrite the caller's data structure.
  *----------
  */
 void
 _bt_preprocess_keys(IndexScanDesc scan)
 {
 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
 	int			numberOfKeys = scan->numberOfKeys;
 	int			new_numberOfKeys;
 	int			numberOfEqualCols;
 	ScanKey		inkeys;
 	ScanKey		outkeys;
 	ScanKey		cur;
 	ScanKey		xform[BTMaxStrategyNumber];
 	bool		hasOtherTypeEqual;
 	Datum		test;
 	int			i,
 				j;
 	AttrNumber	attno;

 	/* initialize result variables */
 	so->qual_ok = true;
 	so->numberOfKeys = 0;

 	if (numberOfKeys < 1)
 		return;					/* done if qual-less scan */

 	inkeys = scan->keyData;
 	outkeys = so->keyData;
 	cur = &inkeys[0];
 	/* we check that input keys are correctly ordered */
 	if (cur->sk_attno < 1)
 		elog(ERROR, "btree index keys must be ordered by attribute");

 	/* We can short-circuit most of the work if there's just one key */
 	if (numberOfKeys == 1)
 	{
 		/*
 		 * We don't use indices for 'A is null' and 'A is not null' currently
 		 * and 'A < = > <> NULL' will always fail - so qual is not OK if
 		 * comparison value is NULL.	  - vadim 03/21/97
 		 */
 		if (cur->sk_flags & SK_ISNULL)
 			so->qual_ok = false;
 		memcpy(outkeys, inkeys, sizeof(ScanKeyData));
 		so->numberOfKeys = 1;
 		/* We can mark the qual as required if it's for first index col */
 		if (cur->sk_attno == 1)
 			_bt_mark_scankey_required(outkeys);
 		return;
 	}

 	/*
 	 * Otherwise, do the full set of pushups.
 	 */
 	new_numberOfKeys = 0;
 	numberOfEqualCols = 0;

 	/*
 	 * Initialize for processing of keys for attr 1.
 	 *
 	 * xform[i] points to the currently best scan key of strategy type i+1, if
 	 * any is found with a default operator subtype; it is NULL if we haven't
 	 * yet found such a key for this attr.	Scan keys of nondefault subtypes
 	 * are transferred to the output with no processing except for noting if
 	 * they are of "=" type.
 	 */
 	attno = 1;
 	memset(xform, 0, sizeof(xform));
 	hasOtherTypeEqual = false;

 	/*
 	 * Loop iterates from 0 to numberOfKeys inclusive; we use the last pass to
 	 * handle after-last-key processing.  Actual exit from the loop is at the
 	 * "break" statement below.
 	 */
 	for (i = 0;; cur++, i++)
 	{
 		if (i < numberOfKeys)
 		{
 			/* See comments above: any NULL implies cannot match qual */
 			/* Note: we assume SK_ISNULL is never set in a row header key */
 			if (cur->sk_flags & SK_ISNULL)
 			{
 				so->qual_ok = false;

 				/*
 				 * Quit processing so we don't try to invoke comparison
 				 * routines on NULLs.
 				 */
 				return;
 			}
 		}

 		/*
 		 * If we are at the end of the keys for a particular attr, finish up
 		 * processing and emit the cleaned-up keys.
 		 */
 		if (i == numberOfKeys || cur->sk_attno != attno)
 		{
 			int			priorNumberOfEqualCols = numberOfEqualCols;

 			/* check input keys are correctly ordered */
 			if (i < numberOfKeys && cur->sk_attno < attno)
 				elog(ERROR, "btree index keys must be ordered by attribute");

 			/*
 			 * If = has been specified, no other key will be used. In case of
 			 * key > 2 && key == 1 and so on we have to set qual_ok to false
 			 * before discarding the other keys.
 			 */
 			if (xform[BTEqualStrategyNumber - 1])
 			{
 				ScanKey		eq = xform[BTEqualStrategyNumber - 1];

 				for (j = BTMaxStrategyNumber; --j >= 0;)
 				{
 					ScanKey		chk = xform[j];

 					if (!chk || j == (BTEqualStrategyNumber - 1))
 						continue;
 					test = FunctionCall2(&chk->sk_func,
 										 eq->sk_argument,
 										 chk->sk_argument);
 					if (!DatumGetBool(test))
 					{
 						so->qual_ok = false;
 						break;
 					}
 				}
 				xform[BTLessStrategyNumber - 1] = NULL;
 				xform[BTLessEqualStrategyNumber - 1] = NULL;
 				xform[BTGreaterEqualStrategyNumber - 1] = NULL;
 				xform[BTGreaterStrategyNumber - 1] = NULL;
 				/* track number of attrs for which we have "=" keys */
 				numberOfEqualCols++;
 			}
 			else
 			{
 				/* track number of attrs for which we have "=" keys */
 				if (hasOtherTypeEqual)
 					numberOfEqualCols++;
 			}

 			/* keep only one of <, <= */
 			if (xform[BTLessStrategyNumber - 1]
 				&& xform[BTLessEqualStrategyNumber - 1])
 			{
 				ScanKey		lt = xform[BTLessStrategyNumber - 1];
 				ScanKey		le = xform[BTLessEqualStrategyNumber - 1];

 				test = FunctionCall2(&le->sk_func,
 									 lt->sk_argument,
 									 le->sk_argument);
 				if (DatumGetBool(test))
 					xform[BTLessEqualStrategyNumber - 1] = NULL;
 				else
 					xform[BTLessStrategyNumber - 1] = NULL;
 			}

 			/* keep only one of >, >= */
 			if (xform[BTGreaterStrategyNumber - 1]
 				&& xform[BTGreaterEqualStrategyNumber - 1])
 			{
 				ScanKey		gt = xform[BTGreaterStrategyNumber - 1];
 				ScanKey		ge = xform[BTGreaterEqualStrategyNumber - 1];

 				test = FunctionCall2(&ge->sk_func,
 									 gt->sk_argument,
 									 ge->sk_argument);
 				if (DatumGetBool(test))
 					xform[BTGreaterEqualStrategyNumber - 1] = NULL;
 				else
 					xform[BTGreaterStrategyNumber - 1] = NULL;
 			}

 			/*
 			 * Emit the cleaned-up keys into the outkeys[] array, and then
 			 * mark them if they are required.	They are required (possibly
 			 * only in one direction) if all attrs before this one had "=".
 			 */
 			for (j = BTMaxStrategyNumber; --j >= 0;)
 			{
 				if (xform[j])
 				{
 					ScanKey		outkey = &outkeys[new_numberOfKeys++];

 					memcpy(outkey, xform[j], sizeof(ScanKeyData));
 					if (priorNumberOfEqualCols == attno - 1)
 						_bt_mark_scankey_required(outkey);
 				}
 			}

 			/*
 			 * Exit loop here if done.
 			 */
 			if (i == numberOfKeys)
 				break;

 			/* Re-initialize for new attno */
 			attno = cur->sk_attno;
 			memset(xform, 0, sizeof(xform));
 			hasOtherTypeEqual = false;
 		}

 		/* check strategy this key's operator corresponds to */
 		j = cur->sk_strategy - 1;

 		/* if row comparison or wrong RHS data type, punt */
 		if ((cur->sk_flags & SK_ROW_HEADER) || cur->sk_subtype != InvalidOid)
 		{
 			ScanKey		outkey = &outkeys[new_numberOfKeys++];

 			memcpy(outkey, cur, sizeof(ScanKeyData));
 			if (numberOfEqualCols == attno - 1)
 				_bt_mark_scankey_required(outkey);
 			if (j == (BTEqualStrategyNumber - 1))
 				hasOtherTypeEqual = true;
 			continue;
 		}

 		/* have we seen one of these before? */
 		if (xform[j])
 		{
 			/* yup, keep the more restrictive key */
 			test = FunctionCall2(&cur->sk_func,
 								 cur->sk_argument,
 								 xform[j]->sk_argument);
 			if (DatumGetBool(test))
 				xform[j] = cur;
 			else if (j == (BTEqualStrategyNumber - 1))
 			{
 				/* key == a && key == b, but a != b */
 				so->qual_ok = false;
 				return;
 			}
 		}
 		else
 		{
 			/* nope, so remember this scankey */
 			xform[j] = cur;
 		}
 	}

 	so->numberOfKeys = new_numberOfKeys;
 }

 /*
  * Mark a scankey as "required to continue the scan".
  *
  * Depending on the operator type, the key may be required for both scan
  * directions or just one.	Also, if the key is a row comparison header,
  * we have to mark the appropriate subsidiary ScanKeys as required.  In
  * such cases, the first subsidiary key is required, but subsequent ones
  * are required only as long as they correspond to successive index columns.
  * Otherwise the row comparison ordering is different from the index ordering
  * and so we can't stop the scan on the basis of those lower-order columns.
  *
  * Note: when we set required-key flag bits in a subsidiary scankey, we are
  * scribbling on a data structure belonging to the index AM's caller, not on
  * our private copy.  This should be OK because the marking will not change
  * from scan to scan within a query, and so we'd just re-mark the same way
  * anyway on a rescan.	Something to keep an eye on though.
  */
 static void
 _bt_mark_scankey_required(ScanKey skey)
 {
 	int			addflags;

 	switch (skey->sk_strategy)
 	{
 		case BTLessStrategyNumber:
 		case BTLessEqualStrategyNumber:
 			addflags = SK_BT_REQFWD;
 			break;
 		case BTEqualStrategyNumber:
 			addflags = SK_BT_REQFWD | SK_BT_REQBKWD;
 			break;
 		case BTGreaterEqualStrategyNumber:
 		case BTGreaterStrategyNumber:
 			addflags = SK_BT_REQBKWD;
 			break;
 		default:
 			elog(ERROR, "unrecognized StrategyNumber: %d",
 				 (int) skey->sk_strategy);
 			addflags = 0;		/* keep compiler quiet */
 			break;
 	}

 	skey->sk_flags |= addflags;

 	if (skey->sk_flags & SK_ROW_HEADER)
 	{
 		ScanKey		subkey = (ScanKey) DatumGetPointer(skey->sk_argument);
 		AttrNumber	attno = skey->sk_attno;

 		/* First subkey should be same as the header says */
 		Assert(subkey->sk_attno == attno);

 		for (;;)
 		{
 			Assert(subkey->sk_flags & SK_ROW_MEMBER);
 			Assert(subkey->sk_strategy == skey->sk_strategy);
 			if (subkey->sk_attno != attno)
 				break;			/* non-adjacent key, so not required */
 			subkey->sk_flags |= addflags;
 			if (subkey->sk_flags & SK_ROW_END)
 				break;
 			subkey++;
 			attno++;
 		}
 	}
 }

 /*
  * Test whether an indextuple satisfies all the scankey conditions.
  *
  * If so, copy its TID into scan->xs_ctup.t_self, and return TRUE.
  * If not, return FALSE (xs_ctup is not changed).
  *
  * If the tuple fails to pass the qual, we also determine whether there's
  * any need to continue the scan beyond this tuple, and set *continuescan
  * accordingly.  See comments for _bt_preprocess_keys(), above, about how
  * this is done.
  *
  * scan: index scan descriptor (containing a search-type scankey)
  * page: buffer page containing index tuple
  * offnum: offset number of index tuple (must be a valid item!)
  * dir: direction we are scanning in
  * continuescan: output parameter (will be set correctly in all cases)
  */
 bool
 _bt_checkkeys(IndexScanDesc scan,
 			  Page page, OffsetNumber offnum,
 			  ScanDirection dir, bool *continuescan)
 {
 	ItemId		iid = PageGetItemId(page, offnum);
 	bool		tuple_valid;
 	IndexTuple	tuple;
 	TupleDesc	tupdesc;
 	BTScanOpaque so;
 	int			keysz;
 	int			ikey;
 	ScanKey		key;

 	*continuescan = true;		/* default assumption */

 	/*
 	 * If the scan specifies not to return killed tuples, then we treat a
 	 * killed tuple as not passing the qual.  Most of the time, it's a win to
 	 * not bother examining the tuple's index keys, but just return
 	 * immediately with continuescan = true to proceed to the next tuple.
 	 * However, if this is the last tuple on the page, we should check the
 	 * index keys to prevent uselessly advancing to the next page.
 	 */
 	if (scan->ignore_killed_tuples && ItemIdDeleted(iid))
 	{
 		/* return immediately if there are more tuples on the page */
 		if (ScanDirectionIsForward(dir))
 		{
 			if (offnum < PageGetMaxOffsetNumber(page))
 				return false;
 		}
 		else
 		{
 			BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);

 			if (offnum > P_FIRSTDATAKEY(opaque))
 				return false;
 		}

 		/*
 		 * OK, we want to check the keys, but we'll return FALSE even if the
 		 * tuple passes the key tests.
 		 */
 		tuple_valid = false;
 	}
 	else
 		tuple_valid = true;

 	tuple = (IndexTuple) PageGetItem(page, iid);

 	IncrIndexProcessed();

 	tupdesc = RelationGetDescr(scan->indexRelation);
 	so = (BTScanOpaque) scan->opaque;
 	keysz = so->numberOfKeys;

 	for (key = so->keyData, ikey = 0; ikey < keysz; key++, ikey++)
 	{
 		Datum		datum;
 		bool		isNull;
 		Datum		test;

 		/* row-comparison keys need special processing */
 		if (key->sk_flags & SK_ROW_HEADER)
 		{
 			if (_bt_check_rowcompare(key, tuple, tupdesc, dir, continuescan))
 				continue;
 			return false;
 		}

 		datum = index_getattr(tuple,
 							  key->sk_attno,
 							  tupdesc,
 							  &isNull);

 		/* btree doesn't support 'A is null' clauses, yet */
 		if (key->sk_flags & SK_ISNULL)
 		{
 			/* we shouldn't get here, really; see _bt_preprocess_keys() */
 			*continuescan = false;
 			return false;
 		}

 		if (isNull)
 		{
 			/*
 			 * Since NULLs are sorted after non-NULLs, we know we have reached
 			 * the upper limit of the range of values for this index attr.	On
 			 * a forward scan, we can stop if this qual is one of the "must
 			 * match" subset.  On a backward scan, however, we should keep
 			 * going.
 			 */
 			if ((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsForward(dir))
 				*continuescan = false;

 			/*
 			 * In any case, this indextuple doesn't match the qual.
 			 */
 			return false;
 		}

 		test = FunctionCall2(&key->sk_func, datum, key->sk_argument);

 		if (!DatumGetBool(test))
 		{
 			/*
 			 * Tuple fails this qual.  If it's a required qual for the current
 			 * scan direction, then we can conclude no further tuples will
 			 * pass, either.
 			 *
 			 * Note: because we stop the scan as soon as any required equality
 			 * qual fails, it is critical that equality quals be used for the
 			 * initial positioning in _bt_first() when they are available. See
 			 * comments in _bt_first().
 			 */
 			if ((key->sk_flags & SK_BT_REQFWD) &&
 				ScanDirectionIsForward(dir))
 				*continuescan = false;
 			else if ((key->sk_flags & SK_BT_REQBKWD) &&
 					 ScanDirectionIsBackward(dir))
 				*continuescan = false;

 			/*
 			 * In any case, this indextuple doesn't match the qual.
 			 */
 			return false;
 		}
 	}

 	/* If we get here, the tuple passes all index quals. */
 	if (tuple_valid)
 		scan->xs_ctup.t_self = tuple->t_tid;

 	return tuple_valid;
 }

 /*
  * Test whether an indextuple satisfies a row-comparison scan condition.
  *
  * Return true if so, false if not.  If not, also clear *continuescan if
  * it's not possible for any future tuples in the current scan direction
  * to pass the qual.
  *
  * This is a subroutine for _bt_checkkeys, which see for more info.
  */
 static bool
 _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, TupleDesc tupdesc,
 					 ScanDirection dir, bool *continuescan)
 {
 	ScanKey		subkey = (ScanKey) DatumGetPointer(skey->sk_argument);
 	int32		cmpresult = 0;
 	bool		result;

 	/* First subkey should be same as the header says */
 	Assert(subkey->sk_attno == skey->sk_attno);

 	/* Loop over columns of the row condition */
 	for (;;)
 	{
 		Datum		datum;
 		bool		isNull;

 		Assert(subkey->sk_flags & SK_ROW_MEMBER);
 		Assert(subkey->sk_strategy == skey->sk_strategy);

 		datum = index_getattr(tuple,
 							  subkey->sk_attno,
 							  tupdesc,
 							  &isNull);

 		if (isNull)
 		{
 			/*
 			 * Since NULLs are sorted after non-NULLs, we know we have reached
 			 * the upper limit of the range of values for this index attr.	On
 			 * a forward scan, we can stop if this qual is one of the "must
 			 * match" subset.  On a backward scan, however, we should keep
 			 * going.
 			 */
 			if ((subkey->sk_flags & SK_BT_REQFWD) &&
 				ScanDirectionIsForward(dir))
 				*continuescan = false;

 			/*
 			 * In any case, this indextuple doesn't match the qual.
 			 */
 			return false;
 		}

 		if (subkey->sk_flags & SK_ISNULL)
 		{
 			/*
 			 * Unlike the simple-scankey case, this isn't a disallowed case.
 			 * But it can never match.	If all the earlier row comparison
 			 * columns are required for the scan direction, we can stop the
 			 * scan, because there can't be another tuple that will succeed.
 			 */
 			if (subkey != (ScanKey) DatumGetPointer(skey->sk_argument))
 				subkey--;
 			if ((subkey->sk_flags & SK_BT_REQFWD) &&
 				ScanDirectionIsForward(dir))
 				*continuescan = false;
 			else if ((subkey->sk_flags & SK_BT_REQBKWD) &&
 					 ScanDirectionIsBackward(dir))
 				*continuescan = false;
 			return false;
 		}

 		/* Perform the test --- three-way comparison not bool operator */
 		cmpresult = DatumGetInt32(FunctionCall2(&subkey->sk_func,
 												datum,
 												subkey->sk_argument));

 		/* Done comparing if unequal, else advance to next column */
 		if (cmpresult != 0)
 			break;

 		if (subkey->sk_flags & SK_ROW_END)
 			break;
 		subkey++;
 	}

 	/*
 	 * At this point cmpresult indicates the overall result of the row
 	 * comparison, and subkey points to the deciding column (or the last
 	 * column if the result is "=").
 	 */
 	switch (subkey->sk_strategy)
 	{
 			/* EQ and NE cases aren't allowed here */
 		case BTLessStrategyNumber:
 			result = (cmpresult < 0);
 			break;
 		case BTLessEqualStrategyNumber:
 			result = (cmpresult <= 0);
 			break;
 		case BTGreaterEqualStrategyNumber:
 			result = (cmpresult >= 0);
 			break;
 		case BTGreaterStrategyNumber:
 			result = (cmpresult > 0);
 			break;
 		default:
 			elog(ERROR, "unrecognized RowCompareType: %d",
 				 (int) subkey->sk_strategy);
 			result = 0;			/* keep compiler quiet */
 			break;
 	}

 	if (!result)
 	{
 		/*
 		 * Tuple fails this qual.  If it's a required qual for the current
 		 * scan direction, then we can conclude no further tuples will pass,
 		 * either.	Note we have to look at the deciding column, not
 		 * necessarily the first or last column of the row condition.
 		 */
 		if ((subkey->sk_flags & SK_BT_REQFWD) &&
 			ScanDirectionIsForward(dir))
 			*continuescan = false;
 		else if ((subkey->sk_flags & SK_BT_REQBKWD) &&
 				 ScanDirectionIsBackward(dir))
 			*continuescan = false;
 	}

 	return result;
 }

 /*
  * _bt_killitems - set LP_DELETE bit for items an indexscan caller has
  * told us were killed
  *
  * scan->so contains information about the current page and killed tuples
  * thereon (generally, this should only be called if so->numKilled > 0).
  *
  * The caller must have pin on so->currPos.buf, but may or may not have
  * read-lock, as indicated by haveLock.  Note that we assume read-lock
  * is sufficient for setting LP_DELETE hint bits.
  *
  * We match items by heap TID before assuming they are the right ones to
  * delete.	We cope with cases where items have moved right due to insertions.
  * If an item has moved off the current page due to a split, we'll fail to
  * find it and do nothing (this is not an error case --- we assume the item
  * will eventually get marked in a future indexscan).  Note that because we
  * hold pin on the target page continuously from initially reading the items
  * until applying this function, VACUUM cannot have deleted any items from
  * the page, and so there is no need to search left from the recorded offset.
  * (This observation also guarantees that the item is still the right one
  * to delete, which might otherwise be questionable since heap TIDs can get
  * recycled.)
  */
 void
 _bt_killitems(IndexScanDesc scan, bool haveLock)
 {
 	MIRROREDLOCK_BUFMGR_DECLARE;

 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
 	Page		page;
 	BTPageOpaque opaque;
 	OffsetNumber minoff;
 	OffsetNumber maxoff;
 	int			i;
 	bool		killedsomething = false;

 	Assert(BufferIsValid(so->currPos.buf));

 	if (!haveLock)
 	{
 		// -------- MirroredLock ----------
 		MIRROREDLOCK_BUFMGR_LOCK;

 		LockBuffer(so->currPos.buf, BT_READ);
 	}
 	else
 	{
 		MIRROREDLOCK_BUFMGR_MUST_ALREADY_BE_HELD;
 	}

 	page = BufferGetPage(so->currPos.buf);
 	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
 	minoff = P_FIRSTDATAKEY(opaque);
 	maxoff = PageGetMaxOffsetNumber(page);

 	for (i = 0; i < so->numKilled; i++)
 	{
 		int			itemIndex = so->killedItems[i];
 		BTScanPosItem *kitem = &so->currPos.items[itemIndex];
 		OffsetNumber offnum = kitem->indexOffset;

 		Assert(itemIndex >= so->currPos.firstItem &&
 			   itemIndex <= so->currPos.lastItem);
 		if (offnum < minoff)
 			continue;			/* pure paranoia */
 		while (offnum <= maxoff)
 		{
 			ItemId		iid = PageGetItemId(page, offnum);
 			IndexTuple	ituple = (IndexTuple) PageGetItem(page, iid);

 			if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid))
 			{
 				/* found the item */
 				iid->lp_flags |= LP_DELETE;
 				killedsomething = true;
 				break;			/* out of inner search loop */
 			}
 			offnum = OffsetNumberNext(offnum);
 		}
 	}

 	/*
 	 * Since this can be redone later if needed, it's treated the same as a
 	 * commit-hint-bit status update for heap tuples: we mark the buffer dirty
 	 * but don't make a WAL log entry.
 	 *
 	 * Whenever we mark anything LP_DELETEd, we also set the page's
 	 * BTP_HAS_GARBAGE flag, which is likewise just a hint.
 	 */
 	if (killedsomething)
 	{
 		opaque->btpo_flags |= BTP_HAS_GARBAGE;
 		SetBufferCommitInfoNeedsSave(so->currPos.buf);
 	}

 	if (!haveLock)
 	{
 		LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);

 		MIRROREDLOCK_BUFMGR_UNLOCK;
 		// -------- MirroredLock ----------

 	}

 	/*
 	 * Always reset the scan state, so we don't look for same items on other
 	 * pages.
 	 */
 	so->numKilled = 0;
 }


 /*
  * The following routines manage a shared-memory area in which we track
  * assignment of "vacuum cycle IDs" to currently-active btree vacuuming
  * operations.	There is a single counter which increments each time we
  * start a vacuum to assign it a cycle ID.	Since multiple vacuums could
  * be active concurrently, we have to track the cycle ID for each active
  * vacuum; this requires at most MaxBackends entries (usually far fewer).
  * We assume at most one vacuum can be active for a given index.
  *
  * Access to the shared memory area is controlled by BtreeVacuumLock.
  * In principle we could use a separate lmgr locktag for each index,
  * but a single LWLock is much cheaper, and given the short time that
  * the lock is ever held, the concurrency hit should be minimal.
  */

 typedef struct BTOneVacInfo
 {
 	LockRelId	relid;			/* global identifier of an index */
 	BTCycleId	cycleid;		/* cycle ID for its active VACUUM */
 } BTOneVacInfo;

 typedef struct BTVacInfo
 {
 	BTCycleId	cycle_ctr;		/* cycle ID most recently assigned */
 	int			num_vacuums;	/* number of currently active VACUUMs */
 	int			max_vacuums;	/* allocated length of vacuums[] array */
 	BTOneVacInfo vacuums[1];	/* VARIABLE LENGTH ARRAY */
 } BTVacInfo;

 static BTVacInfo *btvacinfo;


 /*
  * _bt_vacuum_cycleid --- get the active vacuum cycle ID for an index,
  *		or zero if there is no active VACUUM
  *
  * Note: for correct interlocking, the caller must already hold pin and
  * exclusive lock on each buffer it will store the cycle ID into.  This
  * ensures that even if a VACUUM starts immediately afterwards, it cannot
  * process those pages until the page split is complete.
  */
 BTCycleId
 _bt_vacuum_cycleid(Relation rel)
 {
 	BTCycleId	result = 0;
 	int			i;

 	/* Share lock is enough since this is a read-only operation */
 	LWLockAcquire(BtreeVacuumLock, LW_SHARED);

 	for (i = 0; i < btvacinfo->num_vacuums; i++)
 	{
 		BTOneVacInfo *vac = &btvacinfo->vacuums[i];

 		if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
 			vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
 		{
 			result = vac->cycleid;
 			break;
 		}
 	}

 	LWLockRelease(BtreeVacuumLock);
 	return result;
 }

 /*
  * _bt_start_vacuum --- assign a cycle ID to a just-starting VACUUM operation
  *
  * Note: the caller must guarantee that it will eventually call
  * _bt_end_vacuum, else we'll permanently leak an array slot.  To ensure
  * that this happens even in elog(FATAL) scenarios, the appropriate coding
  * is not just a PG_TRY, but
  *		PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel))
  */
 BTCycleId
 _bt_start_vacuum(Relation rel)
 {
 	BTCycleId	result;
 	int			i;
 	BTOneVacInfo *vac;

 	LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);

 	/* Assign the next cycle ID, being careful to avoid zero */
 	do
 	{
 		result = ++(btvacinfo->cycle_ctr);
 	} while (result == 0);

 	/* Let's just make sure there's no entry already for this index */
 	for (i = 0; i < btvacinfo->num_vacuums; i++)
 	{
 		vac = &btvacinfo->vacuums[i];
 		if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
 			vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
 		{
 			/*
 			 * Unlike most places in the backend, we have to explicitly
 			 * release our LWLock before throwing an error.  This is because
 			 * we expect _bt_end_vacuum() to be called before transaction
 			 * abort cleanup can run to release LWLocks.
 			 */
 			LWLockRelease(BtreeVacuumLock);
 			elog(ERROR, "multiple active vacuums for index \"%s\"",
 				 RelationGetRelationName(rel));
 		}
 	}

 	/* OK, add an entry */
 	if (btvacinfo->num_vacuums >= btvacinfo->max_vacuums)
 	{
 		LWLockRelease(BtreeVacuumLock);
 		elog(ERROR, "out of btvacinfo slots");
 	}
 	vac = &btvacinfo->vacuums[btvacinfo->num_vacuums];
 	vac->relid = rel->rd_lockInfo.lockRelId;
 	vac->cycleid = result;
 	btvacinfo->num_vacuums++;

 	LWLockRelease(BtreeVacuumLock);
 	return result;
 }

 /*
  * _bt_end_vacuum --- mark a btree VACUUM operation as done
  *
  * Note: this is deliberately coded not to complain if no entry is found;
  * this allows the caller to put PG_TRY around the start_vacuum operation.
  */
 void
 _bt_end_vacuum(Relation rel)
 {
 	int			i;

 	LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);

 	/* Find the array entry */
 	for (i = 0; i < btvacinfo->num_vacuums; i++)
 	{
 		BTOneVacInfo *vac = &btvacinfo->vacuums[i];

 		if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
 			vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
 		{
 			/* Remove it by shifting down the last entry */
 			*vac = btvacinfo->vacuums[btvacinfo->num_vacuums - 1];
 			btvacinfo->num_vacuums--;
 			break;
 		}
 	}

 	LWLockRelease(BtreeVacuumLock);
 }

 /*
  * _bt_end_vacuum wrapped as an on_shmem_exit callback function
  */
 void
 _bt_end_vacuum_callback(int code __attribute__((unused)), Datum arg)
 {
 	_bt_end_vacuum((Relation) DatumGetPointer(arg));
 }

 /*
  * BTreeShmemSize --- report amount of shared memory space needed
  */
 Size
 BTreeShmemSize(void)
 {
 	Size		size;

 	size = offsetof(BTVacInfo, vacuums[0]);
 	size = add_size(size, mul_size(MaxBackends, sizeof(BTOneVacInfo)));
 	return size;
 }

 /*
  * BTreeShmemInit --- initialize this module's shared memory
  */
 void
 BTreeShmemInit(void)
 {
 	bool		found;

 	btvacinfo = (BTVacInfo *) ShmemInitStruct("BTree Vacuum State",
 											  BTreeShmemSize(),
 											  &found);

 	if (!IsUnderPostmaster)
 	{
 		/* Initialize shared memory area */
 		Assert(!found);

 		/*
 		 * It doesn't really matter what the cycle counter starts at, but
 		 * having it always start the same doesn't seem good.  Seed with
 		 * low-order bits of time() instead.
 		 */
 		btvacinfo->cycle_ctr = (BTCycleId) time(NULL);

 		btvacinfo->num_vacuums = 0;
 		btvacinfo->max_vacuums = MaxBackends;
 	}
 	else
 	{
 		Assert(found);
 	}
 }

 Datum
 btoptions(PG_FUNCTION_ARGS)
 {
 	Datum		reloptions = PG_GETARG_DATUM(0);
 	bool		validate = PG_GETARG_BOOL(1);
 	bytea	   *result;

 	result = default_reloptions(reloptions, validate,
 								RELKIND_INDEX,
 								BTREE_MIN_FILLFACTOR,
 								BTREE_DEFAULT_FILLFACTOR);
 	if (result)
 		PG_RETURN_BYTEA_P(result);
 	PG_RETURN_NULL();
 }