src/backend/access/gin/ginget.c - cloudberry - Git at Google

 /*-------------------------------------------------------------------------
  *
  * ginget.c
  *	  fetch tuples from a GIN scan.
  *
  *
  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
  *			src/backend/access/gin/ginget.c
  *-------------------------------------------------------------------------
  */

 #include "postgres.h"

 #include "access/gin_private.h"
 #include "access/relscan.h"
 #include "common/pg_prng.h"
 #include "miscadmin.h"
 #include "storage/predicate.h"
 #include "utils/datum.h"
 #include "utils/memutils.h"
 #include "utils/rel.h"

 /* GUC parameter */
 int			GinFuzzySearchLimit = 0;

 typedef struct pendingPosition
 {
 	Buffer		pendingBuffer;
 	OffsetNumber firstOffset;
 	OffsetNumber lastOffset;
 	ItemPointerData item;
 	bool	   *hasMatchKey;
 } pendingPosition;


 /*
  * Goes to the next page if current offset is outside of bounds
  */
 static bool
 moveRightIfItNeeded(GinBtreeData *btree, GinBtreeStack *stack, Snapshot snapshot)
 {
 	Page		page = BufferGetPage(stack->buffer);

 	if (stack->off > PageGetMaxOffsetNumber(page))
 	{
 		/*
 		 * We scanned the whole page, so we should take right page
 		 */
 		if (GinPageRightMost(page))
 			return false;		/* no more pages */

 		stack->buffer = ginStepRight(stack->buffer, btree->index, GIN_SHARE);
 		stack->blkno = BufferGetBlockNumber(stack->buffer);
 		stack->off = FirstOffsetNumber;
 		PredicateLockPage(btree->index, stack->blkno, snapshot);
 	}

 	return true;
 }

 /*
  * Scan all pages of a posting tree and save all its heap ItemPointers
  * in scanEntry->matchBitmap
  */
 static void
 scanPostingTree(Relation index, GinScanEntry scanEntry,
 				BlockNumber rootPostingTree, Snapshot snapshot)
 {
 	GinBtreeData btree;
 	GinBtreeStack *stack;
 	Buffer		buffer;
 	Page		page;

 	/* Descend to the leftmost leaf page */
 	stack = ginScanBeginPostingTree(&btree, index, rootPostingTree, snapshot);
 	buffer = stack->buffer;

 	IncrBufferRefCount(buffer); /* prevent unpin in freeGinBtreeStack */

 	freeGinBtreeStack(stack);

 	/*
 	 * Loop iterates through all leaf pages of posting tree
 	 */
 	for (;;)
 	{
 		page = BufferGetPage(buffer);
 		if ((GinPageGetOpaque(page)->flags & GIN_DELETED) == 0)
 		{
 			int			n = GinDataLeafPageGetItemsToTbm(page, scanEntry->matchBitmap);

 			scanEntry->predictNumberResult += n;
 		}

 		if (GinPageRightMost(page))
 			break;				/* no more pages */

 		buffer = ginStepRight(buffer, index, GIN_SHARE);
 	}

 	UnlockReleaseBuffer(buffer);
 }

 /*
  * Collects TIDs into scanEntry->matchBitmap for all heap tuples that
  * match the search entry.  This supports three different match modes:
  *
  * 1. Partial-match support: scan from current point until the
  *	  comparePartialFn says we're done.
  * 2. SEARCH_MODE_ALL: scan from current point (which should be first
  *	  key for the current attnum) until we hit null items or end of attnum
  * 3. SEARCH_MODE_EVERYTHING: scan from current point (which should be first
  *	  key for the current attnum) until we hit end of attnum
  *
  * Returns true if done, false if it's necessary to restart scan from scratch
  */
 static bool
 collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack,
 				   GinScanEntry scanEntry, Snapshot snapshot)
 {
 	OffsetNumber attnum;
 	Form_pg_attribute attr;

 	/* Initialize empty bitmap result */
 	scanEntry->matchBitmap = tbm_create(work_mem * 1024L, NULL);

 	/* Null query cannot partial-match anything */
 	if (scanEntry->isPartialMatch &&
 		scanEntry->queryCategory != GIN_CAT_NORM_KEY)
 		return true;

 	/* Locate tupdesc entry for key column (for attbyval/attlen data) */
 	attnum = scanEntry->attnum;
 	attr = TupleDescAttr(btree->ginstate->origTupdesc, attnum - 1);

 	/*
 	 * Predicate lock entry leaf page, following pages will be locked by
 	 * moveRightIfItNeeded()
 	 */
 	PredicateLockPage(btree->index,
 					  BufferGetBlockNumber(stack->buffer),
 					  snapshot);

 	for (;;)
 	{
 		Page		page;
 		IndexTuple	itup;
 		Datum		idatum;
 		GinNullCategory icategory;

 		/*
 		 * stack->off points to the interested entry, buffer is already locked
 		 */
 		if (moveRightIfItNeeded(btree, stack, snapshot) == false)
 			return true;

 		page = BufferGetPage(stack->buffer);
 		TestForOldSnapshot(snapshot, btree->index, page);
 		itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stack->off));

 		/*
 		 * If tuple stores another attribute then stop scan
 		 */
 		if (gintuple_get_attrnum(btree->ginstate, itup) != attnum)
 			return true;

 		/* Safe to fetch attribute value */
 		idatum = gintuple_get_key(btree->ginstate, itup, &icategory);

 		/*
 		 * Check for appropriate scan stop conditions
 		 */
 		if (scanEntry->isPartialMatch)
 		{
 			int32		cmp;

 			/*
 			 * In partial match, stop scan at any null (including
 			 * placeholders); partial matches never match nulls
 			 */
 			if (icategory != GIN_CAT_NORM_KEY)
 				return true;

 			/*----------
 			 * Check of partial match.
 			 * case cmp == 0 => match
 			 * case cmp > 0 => not match and finish scan
 			 * case cmp < 0 => not match and continue scan
 			 *----------
 			 */
 			cmp = DatumGetInt32(FunctionCall4Coll(&btree->ginstate->comparePartialFn[attnum - 1],
 												  btree->ginstate->supportCollation[attnum - 1],
 												  scanEntry->queryKey,
 												  idatum,
 												  UInt16GetDatum(scanEntry->strategy),
 												  PointerGetDatum(scanEntry->extra_data)));

 			if (cmp > 0)
 				return true;
 			else if (cmp < 0)
 			{
 				stack->off++;
 				continue;
 			}
 		}
 		else if (scanEntry->searchMode == GIN_SEARCH_MODE_ALL)
 		{
 			/*
 			 * In ALL mode, we are not interested in null items, so we can
 			 * stop if we get to a null-item placeholder (which will be the
 			 * last entry for a given attnum).  We do want to include NULL_KEY
 			 * and EMPTY_ITEM entries, though.
 			 */
 			if (icategory == GIN_CAT_NULL_ITEM)
 				return true;
 		}

 		/*
 		 * OK, we want to return the TIDs listed in this entry.
 		 */
 		if (GinIsPostingTree(itup))
 		{
 			BlockNumber rootPostingTree = GinGetPostingTree(itup);

 			/*
 			 * We should unlock current page (but not unpin) during tree scan
 			 * to prevent deadlock with vacuum processes.
 			 *
 			 * We save current entry value (idatum) to be able to re-find our
 			 * tuple after re-locking
 			 */
 			if (icategory == GIN_CAT_NORM_KEY)
 				idatum = datumCopy(idatum, attr->attbyval, attr->attlen);

 			LockBuffer(stack->buffer, GIN_UNLOCK);

 			/*
 			 * Acquire predicate lock on the posting tree.  We already hold a
 			 * lock on the entry page, but insertions to the posting tree
 			 * don't check for conflicts on that level.
 			 */
 			PredicateLockPage(btree->index, rootPostingTree, snapshot);

 			/* Collect all the TIDs in this entry's posting tree */
 			scanPostingTree(btree->index, scanEntry, rootPostingTree,
 							snapshot);

 			/*
 			 * We lock again the entry page and while it was unlocked insert
 			 * might have occurred, so we need to re-find our position.
 			 */
 			LockBuffer(stack->buffer, GIN_SHARE);
 			page = BufferGetPage(stack->buffer);
 			if (!GinPageIsLeaf(page))
 			{
 				/*
 				 * Root page becomes non-leaf while we unlock it. We will
 				 * start again, this situation doesn't occur often - root can
 				 * became a non-leaf only once per life of index.
 				 */
 				return false;
 			}

 			/* Search forward to re-find idatum */
 			for (;;)
 			{
 				if (moveRightIfItNeeded(btree, stack, snapshot) == false)
 					ereport(ERROR,
 							(errcode(ERRCODE_INTERNAL_ERROR),
 							 errmsg("failed to re-find tuple within index \"%s\"",
 									RelationGetRelationName(btree->index))));

 				page = BufferGetPage(stack->buffer);
 				itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stack->off));

 				if (gintuple_get_attrnum(btree->ginstate, itup) == attnum)
 				{
 					Datum		newDatum;
 					GinNullCategory newCategory;

 					newDatum = gintuple_get_key(btree->ginstate, itup,
 												&newCategory);

 					if (ginCompareEntries(btree->ginstate, attnum,
 										  newDatum, newCategory,
 										  idatum, icategory) == 0)
 						break;	/* Found! */
 				}

 				stack->off++;
 			}

 			if (icategory == GIN_CAT_NORM_KEY && !attr->attbyval)
 				pfree(DatumGetPointer(idatum));
 		}
 		else
 		{
 			ItemPointer ipd;
 			int			nipd;

 			ipd = ginReadTuple(btree->ginstate, scanEntry->attnum, itup, &nipd);
 			tbm_add_tuples(scanEntry->matchBitmap, ipd, nipd, false);
 			scanEntry->predictNumberResult += GinGetNPosting(itup);
 			pfree(ipd);
 		}

 		/*
 		 * Done with this entry, go to the next
 		 */
 		stack->off++;
 	}
 }

 /*
  * Start* functions setup beginning state of searches: finds correct buffer and pins it.
  */
 static void
 startScanEntry(GinState *ginstate, GinScanEntry entry, Snapshot snapshot)
 {
 	GinBtreeData btreeEntry;
 	GinBtreeStack *stackEntry;
 	Page		page;
 	bool		needUnlock;

 restartScanEntry:
 	entry->buffer = InvalidBuffer;
 	ItemPointerSetMin(&entry->curItem);
 	entry->offset = InvalidOffsetNumber;
 	if (entry->list)
 		pfree(entry->list);
 	entry->list = NULL;
 	entry->nlist = 0;
 	entry->matchBitmap = NULL;
 	entry->matchResult = NULL;
 	entry->reduceResult = false;
 	entry->predictNumberResult = 0;

 	/*
 	 * we should find entry, and begin scan of posting tree or just store
 	 * posting list in memory
 	 */
 	ginPrepareEntryScan(&btreeEntry, entry->attnum,
 						entry->queryKey, entry->queryCategory,
 						ginstate);
 	stackEntry = ginFindLeafPage(&btreeEntry, true, false, snapshot);
 	page = BufferGetPage(stackEntry->buffer);

 	/* ginFindLeafPage() will have already checked snapshot age. */
 	needUnlock = true;

 	entry->isFinished = true;

 	if (entry->isPartialMatch ||
 		entry->queryCategory == GIN_CAT_EMPTY_QUERY)
 	{
 		/*
 		 * btreeEntry.findItem locates the first item >= given search key.
 		 * (For GIN_CAT_EMPTY_QUERY, it will find the leftmost index item
 		 * because of the way the GIN_CAT_EMPTY_QUERY category code is
 		 * assigned.)  We scan forward from there and collect all TIDs needed
 		 * for the entry type.
 		 */
 		btreeEntry.findItem(&btreeEntry, stackEntry);
 		if (collectMatchBitmap(&btreeEntry, stackEntry, entry, snapshot)
 			== false)
 		{
 			/*
 			 * GIN tree was seriously restructured, so we will cleanup all
 			 * found data and rescan. See comments near 'return false' in
 			 * collectMatchBitmap()
 			 */
 			if (entry->matchBitmap)
 			{
 				if (entry->matchIterator)
 					tbm_end_iterate(entry->matchIterator);
 				entry->matchIterator = NULL;
 				tbm_free(entry->matchBitmap);
 				entry->matchBitmap = NULL;
 			}
 			LockBuffer(stackEntry->buffer, GIN_UNLOCK);
 			freeGinBtreeStack(stackEntry);
 			goto restartScanEntry;
 		}

 		if (entry->matchBitmap && !tbm_is_empty(entry->matchBitmap))
 		{
 			entry->matchIterator = tbm_begin_iterate(entry->matchBitmap);
 			entry->isFinished = false;
 		}
 	}
 	else if (btreeEntry.findItem(&btreeEntry, stackEntry))
 	{
 		IndexTuple	itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stackEntry->off));

 		if (GinIsPostingTree(itup))
 		{
 			BlockNumber rootPostingTree = GinGetPostingTree(itup);
 			GinBtreeStack *stack;
 			Page		entrypage;
 			ItemPointerData minItem;

 			/*
 			 * This is an equality scan, so lock the root of the posting tree.
 			 * It represents a lock on the exact key value, and covers all the
 			 * items in the posting tree.
 			 */
 			PredicateLockPage(ginstate->index, rootPostingTree, snapshot);

 			/*
 			 * We should unlock entry page before touching posting tree to
 			 * prevent deadlocks with vacuum processes. Because entry is never
 			 * deleted from page and posting tree is never reduced to the
 			 * posting list, we can unlock page after getting BlockNumber of
 			 * root of posting tree.
 			 */
 			LockBuffer(stackEntry->buffer, GIN_UNLOCK);
 			needUnlock = false;

 			stack = ginScanBeginPostingTree(&entry->btree, ginstate->index,
 											rootPostingTree, snapshot);
 			entry->buffer = stack->buffer;

 			/*
 			 * We keep buffer pinned because we need to prevent deletion of
 			 * page during scan. See GIN's vacuum implementation. RefCount is
 			 * increased to keep buffer pinned after freeGinBtreeStack() call.
 			 */
 			IncrBufferRefCount(entry->buffer);

 			entrypage = BufferGetPage(entry->buffer);

 			/*
 			 * Load the first page into memory.
 			 */
 			ItemPointerSetMin(&minItem);
 			entry->list = GinDataLeafPageGetItems(entrypage, &entry->nlist, minItem);

 			entry->predictNumberResult = stack->predictNumber * entry->nlist;

 			LockBuffer(entry->buffer, GIN_UNLOCK);
 			freeGinBtreeStack(stack);
 			entry->isFinished = false;
 		}
 		else
 		{
 			/*
 			 * Lock the entry leaf page.  This is more coarse-grained than
 			 * necessary, because it will conflict with any insertions that
 			 * land on the same leaf page, not only the exact key we searched
 			 * for.  But locking an individual tuple would require updating
 			 * that lock whenever it moves because of insertions or vacuums,
 			 * which seems too complicated.
 			 */
 			PredicateLockPage(ginstate->index,
 							  BufferGetBlockNumber(stackEntry->buffer),
 							  snapshot);
 			if (GinGetNPosting(itup) > 0)
 			{
 				entry->list = ginReadTuple(ginstate, entry->attnum, itup,
 										   &entry->nlist);
 				entry->predictNumberResult = entry->nlist;

 				entry->isFinished = false;
 			}
 		}
 	}
 	else
 	{
 		/*
 		 * No entry found.  Predicate lock the leaf page, to lock the place
 		 * where the entry would've been, had there been one.
 		 */
 		PredicateLockPage(ginstate->index,
 						  BufferGetBlockNumber(stackEntry->buffer), snapshot);
 	}

 	if (needUnlock)
 		LockBuffer(stackEntry->buffer, GIN_UNLOCK);
 	freeGinBtreeStack(stackEntry);
 }

 /*
  * Comparison function for scan entry indexes. Sorts by predictNumberResult,
  * least frequent items first.
  */
 static int
 entryIndexByFrequencyCmp(const void *a1, const void *a2, void *arg)
 {
 	const GinScanKey key = (const GinScanKey) arg;
 	int			i1 = *(const int *) a1;
 	int			i2 = *(const int *) a2;
 	uint32		n1 = key->scanEntry[i1]->predictNumberResult;
 	uint32		n2 = key->scanEntry[i2]->predictNumberResult;

 	if (n1 < n2)
 		return -1;
 	else if (n1 == n2)
 		return 0;
 	else
 		return 1;
 }

 static void
 startScanKey(GinState *ginstate, GinScanOpaque so, GinScanKey key)
 {
 	MemoryContext oldCtx = CurrentMemoryContext;
 	int			i;
 	int			j;
 	int		   *entryIndexes;

 	ItemPointerSetMin(&key->curItem);
 	key->curItemMatches = false;
 	key->recheckCurItem = false;
 	key->isFinished = false;

 	/*
 	 * Divide the entries into two distinct sets: required and additional.
 	 * Additional entries are not enough for a match alone, without any items
 	 * from the required set, but are needed by the consistent function to
 	 * decide if an item matches. When scanning, we can skip over items from
 	 * additional entries that have no corresponding matches in any of the
 	 * required entries. That speeds up queries like "frequent & rare"
 	 * considerably, if the frequent term can be put in the additional set.
 	 *
 	 * There can be many legal ways to divide them entries into these two
 	 * sets. A conservative division is to just put everything in the required
 	 * set, but the more you can put in the additional set, the more you can
 	 * skip during the scan. To maximize skipping, we try to put as many
 	 * frequent items as possible into additional, and less frequent ones into
 	 * required. To do that, sort the entries by frequency
 	 * (predictNumberResult), and put entries into the required set in that
 	 * order, until the consistent function says that none of the remaining
 	 * entries can form a match, without any items from the required set. The
 	 * rest go to the additional set.
 	 *
 	 * Exclude-only scan keys are known to have no required entries.
 	 */
 	if (key->excludeOnly)
 	{
 		MemoryContextSwitchTo(so->keyCtx);

 		key->nrequired = 0;
 		key->nadditional = key->nentries;
 		key->additionalEntries = palloc(key->nadditional * sizeof(GinScanEntry));
 		for (i = 0; i < key->nadditional; i++)
 			key->additionalEntries[i] = key->scanEntry[i];
 	}
 	else if (key->nentries > 1)
 	{
 		MemoryContextSwitchTo(so->tempCtx);

 		entryIndexes = (int *) palloc(sizeof(int) * key->nentries);
 		for (i = 0; i < key->nentries; i++)
 			entryIndexes[i] = i;
 		qsort_arg(entryIndexes, key->nentries, sizeof(int),
 				  entryIndexByFrequencyCmp, key);

 		for (i = 1; i < key->nentries; i++)
 			key->entryRes[entryIndexes[i]] = GIN_MAYBE;
 		for (i = 0; i < key->nentries - 1; i++)
 		{
 			/* Pass all entries <= i as FALSE, and the rest as MAYBE */
 			key->entryRes[entryIndexes[i]] = GIN_FALSE;

 			if (key->triConsistentFn(key) == GIN_FALSE)
 				break;

 			/* Make this loop interruptible in case there are many keys */
 			CHECK_FOR_INTERRUPTS();
 		}
 		/* i is now the last required entry. */

 		MemoryContextSwitchTo(so->keyCtx);

 		key->nrequired = i + 1;
 		key->nadditional = key->nentries - key->nrequired;
 		key->requiredEntries = palloc(key->nrequired * sizeof(GinScanEntry));
 		key->additionalEntries = palloc(key->nadditional * sizeof(GinScanEntry));

 		j = 0;
 		for (i = 0; i < key->nrequired; i++)
 			key->requiredEntries[i] = key->scanEntry[entryIndexes[j++]];
 		for (i = 0; i < key->nadditional; i++)
 			key->additionalEntries[i] = key->scanEntry[entryIndexes[j++]];

 		/* clean up after consistentFn calls (also frees entryIndexes) */
 		MemoryContextReset(so->tempCtx);
 	}
 	else
 	{
 		MemoryContextSwitchTo(so->keyCtx);

 		key->nrequired = 1;
 		key->nadditional = 0;
 		key->requiredEntries = palloc(1 * sizeof(GinScanEntry));
 		key->requiredEntries[0] = key->scanEntry[0];
 	}
 	MemoryContextSwitchTo(oldCtx);
 }

 static void
 startScan(IndexScanDesc scan)
 {
 	GinScanOpaque so = (GinScanOpaque) scan->opaque;
 	GinState   *ginstate = &so->ginstate;
 	uint32		i;

 	for (i = 0; i < so->totalentries; i++)
 		startScanEntry(ginstate, so->entries[i], scan->xs_snapshot);

 	if (GinFuzzySearchLimit > 0)
 	{
 		/*
 		 * If all of keys more than threshold we will try to reduce result, we
 		 * hope (and only hope, for intersection operation of array our
 		 * supposition isn't true), that total result will not more than
 		 * minimal predictNumberResult.
 		 */
 		bool		reduce = true;

 		for (i = 0; i < so->totalentries; i++)
 		{
 			if (so->entries[i]->predictNumberResult <= so->totalentries * GinFuzzySearchLimit)
 			{
 				reduce = false;
 				break;
 			}
 		}
 		if (reduce)
 		{
 			for (i = 0; i < so->totalentries; i++)
 			{
 				so->entries[i]->predictNumberResult /= so->totalentries;
 				so->entries[i]->reduceResult = true;
 			}
 		}
 	}

 	/*
 	 * Now that we have the estimates for the entry frequencies, finish
 	 * initializing the scan keys.
 	 */
 	for (i = 0; i < so->nkeys; i++)
 		startScanKey(ginstate, so, so->keys + i);
 }

 /*
  * Load the next batch of item pointers from a posting tree.
  *
  * Note that we copy the page into GinScanEntry->list array and unlock it, but
  * keep it pinned to prevent interference with vacuum.
  */
 static void
 entryLoadMoreItems(GinState *ginstate, GinScanEntry entry,
 				   ItemPointerData advancePast, Snapshot snapshot)
 {
 	Page		page;
 	int			i;
 	bool		stepright;

 	if (!BufferIsValid(entry->buffer))
 	{
 		entry->isFinished = true;
 		return;
 	}

 	/*
 	 * We have two strategies for finding the correct page: step right from
 	 * the current page, or descend the tree again from the root. If
 	 * advancePast equals the current item, the next matching item should be
 	 * on the next page, so we step right. Otherwise, descend from root.
 	 */
 	if (ginCompareItemPointers(&entry->curItem, &advancePast) == 0)
 	{
 		stepright = true;
 		LockBuffer(entry->buffer, GIN_SHARE);
 	}
 	else
 	{
 		GinBtreeStack *stack;

 		ReleaseBuffer(entry->buffer);

 		/*
 		 * Set the search key, and find the correct leaf page.
 		 */
 		if (ItemPointerIsLossyPage(&advancePast))
 		{
 			ItemPointerSet(&entry->btree.itemptr,
 						   GinItemPointerGetBlockNumber(&advancePast) + 1,
 						   FirstOffsetNumber);
 		}
 		else
 		{
 			ItemPointerSet(&entry->btree.itemptr,
 						   GinItemPointerGetBlockNumber(&advancePast),
 						   OffsetNumberNext(GinItemPointerGetOffsetNumber(&advancePast)));
 		}
 		entry->btree.fullScan = false;
 		stack = ginFindLeafPage(&entry->btree, true, false, snapshot);

 		/* we don't need the stack, just the buffer. */
 		entry->buffer = stack->buffer;
 		IncrBufferRefCount(entry->buffer);
 		freeGinBtreeStack(stack);
 		stepright = false;
 	}

 	elog(DEBUG2, "entryLoadMoreItems, %u/%u, skip: %d",
 		 GinItemPointerGetBlockNumber(&advancePast),
 		 GinItemPointerGetOffsetNumber(&advancePast),
 		 !stepright);

 	page = BufferGetPage(entry->buffer);
 	for (;;)
 	{
 		entry->offset = InvalidOffsetNumber;
 		if (entry->list)
 		{
 			pfree(entry->list);
 			entry->list = NULL;
 			entry->nlist = 0;
 		}

 		if (stepright)
 		{
 			/*
 			 * We've processed all the entries on this page. If it was the
 			 * last page in the tree, we're done.
 			 */
 			if (GinPageRightMost(page))
 			{
 				UnlockReleaseBuffer(entry->buffer);
 				entry->buffer = InvalidBuffer;
 				entry->isFinished = true;
 				return;
 			}

 			/*
 			 * Step to next page, following the right link. then find the
 			 * first ItemPointer greater than advancePast.
 			 */
 			entry->buffer = ginStepRight(entry->buffer,
 										 ginstate->index,
 										 GIN_SHARE);
 			page = BufferGetPage(entry->buffer);
 		}
 		stepright = true;

 		if (GinPageGetOpaque(page)->flags & GIN_DELETED)
 			continue;			/* page was deleted by concurrent vacuum */

 		/*
 		 * The first item > advancePast might not be on this page, but
 		 * somewhere to the right, if the page was split, or a non-match from
 		 * another key in the query allowed us to skip some items from this
 		 * entry. Keep following the right-links until we re-find the correct
 		 * page.
 		 */
 		if (!GinPageRightMost(page) &&
 			ginCompareItemPointers(&advancePast, GinDataPageGetRightBound(page)) >= 0)
 		{
 			/*
 			 * the item we're looking is > the right bound of the page, so it
 			 * can't be on this page.
 			 */
 			continue;
 		}

 		entry->list = GinDataLeafPageGetItems(page, &entry->nlist, advancePast);

 		for (i = 0; i < entry->nlist; i++)
 		{
 			if (ginCompareItemPointers(&advancePast, &entry->list[i]) < 0)
 			{
 				entry->offset = i;

 				if (GinPageRightMost(page))
 				{
 					/* after processing the copied items, we're done. */
 					UnlockReleaseBuffer(entry->buffer);
 					entry->buffer = InvalidBuffer;
 				}
 				else
 					LockBuffer(entry->buffer, GIN_UNLOCK);
 				return;
 			}
 		}
 	}
 }

 #define gin_rand() pg_prng_double(&pg_global_prng_state)
 #define dropItem(e) ( gin_rand() > ((double)GinFuzzySearchLimit)/((double)((e)->predictNumberResult)) )

 /*
  * Sets entry->curItem to next heap item pointer > advancePast, for one entry
  * of one scan key, or sets entry->isFinished to true if there are no more.
  *
  * Item pointers are returned in ascending order.
  *
  * Note: this can return a "lossy page" item pointer, indicating that the
  * entry potentially matches all items on that heap page.  However, it is
  * not allowed to return both a lossy page pointer and exact (regular)
  * item pointers for the same page.  (Doing so would break the key-combination
  * logic in keyGetItem and scanGetItem; see comment in scanGetItem.)  In the
  * current implementation this is guaranteed by the behavior of tidbitmaps.
  */
 static void
 entryGetItem(GinState *ginstate, GinScanEntry entry,
 			 ItemPointerData advancePast, Snapshot snapshot)
 {
 	Assert(!entry->isFinished);

 	Assert(!ItemPointerIsValid(&entry->curItem) ||
 		   ginCompareItemPointers(&entry->curItem, &advancePast) <= 0);

 	if (entry->matchBitmap)
 	{
 		/* A bitmap result */
 		BlockNumber advancePastBlk = GinItemPointerGetBlockNumber(&advancePast);
 		OffsetNumber advancePastOff = GinItemPointerGetOffsetNumber(&advancePast);

 		for (;;)
 		{
 			/*
 			 * If we've exhausted all items on this block, move to next block
 			 * in the bitmap.
 			 */
 			while (entry->matchResult == NULL ||
 				   (entry->matchResult->ntuples >= 0 &&
 					entry->offset >= entry->matchResult->ntuples) ||
 				   entry->matchResult->blockno < advancePastBlk ||
 				   (ItemPointerIsLossyPage(&advancePast) &&
 					entry->matchResult->blockno == advancePastBlk))
 			{
 				entry->matchResult = tbm_iterate(entry->matchIterator);

 				if (entry->matchResult == NULL)
 				{
 					ItemPointerSetInvalid(&entry->curItem);
 					tbm_end_iterate(entry->matchIterator);
 					entry->matchIterator = NULL;
 					entry->isFinished = true;
 					break;
 				}

 				/*
 				 * Reset counter to the beginning of entry->matchResult. Note:
 				 * entry->offset is still greater than matchResult->ntuples if
 				 * matchResult is lossy.  So, on next call we will get next
 				 * result from TIDBitmap.
 				 */
 				entry->offset = 0;
 			}
 			if (entry->isFinished)
 				break;

 			/*
 			 * We're now on the first page after advancePast which has any
 			 * items on it. If it's a lossy result, return that.
 			 */
 			if (entry->matchResult->ntuples < 0)
 			{
 				ItemPointerSetLossyPage(&entry->curItem,
 										entry->matchResult->blockno);

 				/*
 				 * We might as well fall out of the loop; we could not
 				 * estimate number of results on this page to support correct
 				 * reducing of result even if it's enabled.
 				 */
 				break;
 			}

 			/*
 			 * Not a lossy page. Skip over any offsets <= advancePast, and
 			 * return that.
 			 */
 			if (entry->matchResult->blockno == advancePastBlk)
 			{
 				/*
 				 * First, do a quick check against the last offset on the
 				 * page. If that's > advancePast, so are all the other
 				 * offsets, so just go back to the top to get the next page.
 				 */
 				if (entry->matchResult->offsets[entry->matchResult->ntuples - 1] <= advancePastOff)
 				{
 					entry->offset = entry->matchResult->ntuples;
 					continue;
 				}

 				/* Otherwise scan to find the first item > advancePast */
 				while (entry->matchResult->offsets[entry->offset] <= advancePastOff)
 					entry->offset++;
 			}

 			ItemPointerSet(&entry->curItem,
 						   entry->matchResult->blockno,
 						   entry->matchResult->offsets[entry->offset]);
 			entry->offset++;

 			/* Done unless we need to reduce the result */
 			if (!entry->reduceResult || !dropItem(entry))
 				break;
 		}
 	}
 	else if (!BufferIsValid(entry->buffer))
 	{
 		/*
 		 * A posting list from an entry tuple, or the last page of a posting
 		 * tree.
 		 */
 		for (;;)
 		{
 			if (entry->offset >= entry->nlist)
 			{
 				ItemPointerSetInvalid(&entry->curItem);
 				entry->isFinished = true;
 				break;
 			}

 			entry->curItem = entry->list[entry->offset++];

 			/* If we're not past advancePast, keep scanning */
 			if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
 				continue;

 			/* Done unless we need to reduce the result */
 			if (!entry->reduceResult || !dropItem(entry))
 				break;
 		}
 	}
 	else
 	{
 		/* A posting tree */
 		for (;;)
 		{
 			/* If we've processed the current batch, load more items */
 			while (entry->offset >= entry->nlist)
 			{
 				entryLoadMoreItems(ginstate, entry, advancePast, snapshot);

 				if (entry->isFinished)
 				{
 					ItemPointerSetInvalid(&entry->curItem);
 					return;
 				}
 			}

 			entry->curItem = entry->list[entry->offset++];

 			/* If we're not past advancePast, keep scanning */
 			if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
 				continue;

 			/* Done unless we need to reduce the result */
 			if (!entry->reduceResult || !dropItem(entry))
 				break;

 			/*
 			 * Advance advancePast (so that entryLoadMoreItems will load the
 			 * right data), and keep scanning
 			 */
 			advancePast = entry->curItem;
 		}
 	}
 }

 /*
  * Identify the "current" item among the input entry streams for this scan key
  * that is greater than advancePast, and test whether it passes the scan key
  * qual condition.
  *
  * The current item is the smallest curItem among the inputs.  key->curItem
  * is set to that value.  key->curItemMatches is set to indicate whether that
  * TID passes the consistentFn test.  If so, key->recheckCurItem is set true
  * iff recheck is needed for this item pointer (including the case where the
  * item pointer is a lossy page pointer).
  *
  * If all entry streams are exhausted, sets key->isFinished to true.
  *
  * Item pointers must be returned in ascending order.
  *
  * Note: this can return a "lossy page" item pointer, indicating that the
  * key potentially matches all items on that heap page.  However, it is
  * not allowed to return both a lossy page pointer and exact (regular)
  * item pointers for the same page.  (Doing so would break the key-combination
  * logic in scanGetItem.)
  */
 static void
 keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key,
 		   ItemPointerData advancePast, Snapshot snapshot)
 {
 	ItemPointerData minItem;
 	ItemPointerData curPageLossy;
 	uint32		i;
 	bool		haveLossyEntry;
 	GinScanEntry entry;
 	GinTernaryValue res;
 	MemoryContext oldCtx;
 	bool		allFinished;

 	Assert(!key->isFinished);

 	/*
 	 * We might have already tested this item; if so, no need to repeat work.
 	 * (Note: the ">" case can happen, if advancePast is exact but we
 	 * previously had to set curItem to a lossy-page pointer.)
 	 */
 	if (ginCompareItemPointers(&key->curItem, &advancePast) > 0)
 		return;

 	/*
 	 * Find the minimum item > advancePast among the active entry streams.
 	 *
 	 * Note: a lossy-page entry is encoded by a ItemPointer with max value for
 	 * offset (0xffff), so that it will sort after any exact entries for the
 	 * same page.  So we'll prefer to return exact pointers not lossy
 	 * pointers, which is good.
 	 */
 	ItemPointerSetMax(&minItem);
 	allFinished = true;
 	for (i = 0; i < key->nrequired; i++)
 	{
 		entry = key->requiredEntries[i];

 		if (entry->isFinished)
 			continue;

 		/*
 		 * Advance this stream if necessary.
 		 *
 		 * In particular, since entry->curItem was initialized with
 		 * ItemPointerSetMin, this ensures we fetch the first item for each
 		 * entry on the first call.
 		 */
 		if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
 		{
 			entryGetItem(ginstate, entry, advancePast, snapshot);
 			if (entry->isFinished)
 				continue;
 		}

 		allFinished = false;
 		if (ginCompareItemPointers(&entry->curItem, &minItem) < 0)
 			minItem = entry->curItem;
 	}

 	if (allFinished && !key->excludeOnly)
 	{
 		/* all entries are finished */
 		key->isFinished = true;
 		return;
 	}

 	if (!key->excludeOnly)
 	{
 		/*
 		 * For a normal scan key, we now know there are no matches < minItem.
 		 *
 		 * If minItem is lossy, it means that there were no exact items on the
 		 * page among requiredEntries, because lossy pointers sort after exact
 		 * items. However, there might be exact items for the same page among
 		 * additionalEntries, so we mustn't advance past them.
 		 */
 		if (ItemPointerIsLossyPage(&minItem))
 		{
 			if (GinItemPointerGetBlockNumber(&advancePast) <
 				GinItemPointerGetBlockNumber(&minItem))
 			{
 				ItemPointerSet(&advancePast,
 							   GinItemPointerGetBlockNumber(&minItem),
 							   InvalidOffsetNumber);
 			}
 		}
 		else
 		{
 			Assert(GinItemPointerGetOffsetNumber(&minItem) > 0);
 			ItemPointerSet(&advancePast,
 						   GinItemPointerGetBlockNumber(&minItem),
 						   OffsetNumberPrev(GinItemPointerGetOffsetNumber(&minItem)));
 		}
 	}
 	else
 	{
 		/*
 		 * excludeOnly scan keys don't have any entries that are necessarily
 		 * present in matching items.  So, we consider the item just after
 		 * advancePast.
 		 */
 		Assert(key->nrequired == 0);
 		ItemPointerSet(&minItem,
 					   GinItemPointerGetBlockNumber(&advancePast),
 					   OffsetNumberNext(GinItemPointerGetOffsetNumber(&advancePast)));
 	}

 	/*
 	 * We might not have loaded all the entry streams for this TID yet. We
 	 * could call the consistent function, passing MAYBE for those entries, to
 	 * see if it can decide if this TID matches based on the information we
 	 * have. But if the consistent-function is expensive, and cannot in fact
 	 * decide with partial information, that could be a big loss. So, load all
 	 * the additional entries, before calling the consistent function.
 	 */
 	for (i = 0; i < key->nadditional; i++)
 	{
 		entry = key->additionalEntries[i];

 		if (entry->isFinished)
 			continue;

 		if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
 		{
 			entryGetItem(ginstate, entry, advancePast, snapshot);
 			if (entry->isFinished)
 				continue;
 		}

 		/*
 		 * Normally, none of the items in additionalEntries can have a curItem
 		 * larger than minItem. But if minItem is a lossy page, then there
 		 * might be exact items on the same page among additionalEntries.
 		 */
 		if (ginCompareItemPointers(&entry->curItem, &minItem) < 0)
 		{
 			Assert(ItemPointerIsLossyPage(&minItem));
 			minItem = entry->curItem;
 		}
 	}

 	/*
 	 * Ok, we've advanced all the entries up to minItem now. Set key->curItem,
 	 * and perform consistentFn test.
 	 *
 	 * Lossy-page entries pose a problem, since we don't know the correct
 	 * entryRes state to pass to the consistentFn, and we also don't know what
 	 * its combining logic will be (could be AND, OR, or even NOT). If the
 	 * logic is OR then the consistentFn might succeed for all items in the
 	 * lossy page even when none of the other entries match.
 	 *
 	 * Our strategy is to call the tri-state consistent function, with the
 	 * lossy-page entries set to MAYBE, and all the other entries FALSE. If it
 	 * returns FALSE, none of the lossy items alone are enough for a match, so
 	 * we don't need to return a lossy-page pointer. Otherwise, return a
 	 * lossy-page pointer to indicate that the whole heap page must be
 	 * checked.  (On subsequent calls, we'll do nothing until minItem is past
 	 * the page altogether, thus ensuring that we never return both regular
 	 * and lossy pointers for the same page.)
 	 *
 	 * An exception is that it doesn't matter what we pass for lossy pointers
 	 * in "hidden" entries, because the consistentFn's result can't depend on
 	 * them. We could pass them as MAYBE as well, but if we're using the
 	 * "shim" implementation of a tri-state consistent function (see
 	 * ginlogic.c), it's better to pass as few MAYBEs as possible. So pass
 	 * them as true.
 	 *
 	 * Note that only lossy-page entries pointing to the current item's page
 	 * should trigger this processing; we might have future lossy pages in the
 	 * entry array, but they aren't relevant yet.
 	 */
 	key->curItem = minItem;
 	ItemPointerSetLossyPage(&curPageLossy,
 							GinItemPointerGetBlockNumber(&key->curItem));
 	haveLossyEntry = false;
 	for (i = 0; i < key->nentries; i++)
 	{
 		entry = key->scanEntry[i];
 		if (entry->isFinished == false &&
 			ginCompareItemPointers(&entry->curItem, &curPageLossy) == 0)
 		{
 			if (i < key->nuserentries)
 				key->entryRes[i] = GIN_MAYBE;
 			else
 				key->entryRes[i] = GIN_TRUE;
 			haveLossyEntry = true;
 		}
 		else
 			key->entryRes[i] = GIN_FALSE;
 	}

 	/* prepare for calling consistentFn in temp context */
 	oldCtx = MemoryContextSwitchTo(tempCtx);

 	if (haveLossyEntry)
 	{
 		/* Have lossy-page entries, so see if whole page matches */
 		res = key->triConsistentFn(key);

 		if (res == GIN_TRUE || res == GIN_MAYBE)
 		{
 			/* Yes, so clean up ... */
 			MemoryContextSwitchTo(oldCtx);
 			MemoryContextReset(tempCtx);

 			/* and return lossy pointer for whole page */
 			key->curItem = curPageLossy;
 			key->curItemMatches = true;
 			key->recheckCurItem = true;
 			return;
 		}
 	}

 	/*
 	 * At this point we know that we don't need to return a lossy whole-page
 	 * pointer, but we might have matches for individual exact item pointers,
 	 * possibly in combination with a lossy pointer. Pass lossy pointers as
 	 * MAYBE to the ternary consistent function, to let it decide if this
 	 * tuple satisfies the overall key, even though we don't know if the lossy
 	 * entries match.
 	 *
 	 * Prepare entryRes array to be passed to consistentFn.
 	 */
 	for (i = 0; i < key->nentries; i++)
 	{
 		entry = key->scanEntry[i];
 		if (entry->isFinished)
 			key->entryRes[i] = GIN_FALSE;
 #if 0

 		/*
 		 * This case can't currently happen, because we loaded all the entries
 		 * for this item earlier.
 		 */
 		else if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
 			key->entryRes[i] = GIN_MAYBE;
 #endif
 		else if (ginCompareItemPointers(&entry->curItem, &curPageLossy) == 0)
 			key->entryRes[i] = GIN_MAYBE;
 		else if (ginCompareItemPointers(&entry->curItem, &minItem) == 0)
 			key->entryRes[i] = GIN_TRUE;
 		else
 			key->entryRes[i] = GIN_FALSE;
 	}

 	res = key->triConsistentFn(key);

 	switch (res)
 	{
 		case GIN_TRUE:
 			key->curItemMatches = true;
 			/* triConsistentFn set recheckCurItem */
 			break;

 		case GIN_FALSE:
 			key->curItemMatches = false;
 			break;

 		case GIN_MAYBE:
 			key->curItemMatches = true;
 			key->recheckCurItem = true;
 			break;

 		default:

 			/*
 			 * the 'default' case shouldn't happen, but if the consistent
 			 * function returns something bogus, this is the safe result
 			 */
 			key->curItemMatches = true;
 			key->recheckCurItem = true;
 			break;
 	}

 	/*
 	 * We have a tuple, and we know if it matches or not. If it's a non-match,
 	 * we could continue to find the next matching tuple, but let's break out
 	 * and give scanGetItem a chance to advance the other keys. They might be
 	 * able to skip past to a much higher TID, allowing us to save work.
 	 */

 	/* clean up after consistentFn calls */
 	MemoryContextSwitchTo(oldCtx);
 	MemoryContextReset(tempCtx);
 }

 /*
  * Get next heap item pointer (after advancePast) from scan.
  * Returns true if anything found.
  * On success, *item and *recheck are set.
  *
  * Note: this is very nearly the same logic as in keyGetItem(), except
  * that we know the keys are to be combined with AND logic, whereas in
  * keyGetItem() the combination logic is known only to the consistentFn.
  */
 static bool
 scanGetItem(IndexScanDesc scan, ItemPointerData advancePast,
 			ItemPointerData *item, bool *recheck)
 {
 	GinScanOpaque so = (GinScanOpaque) scan->opaque;
 	uint32		i;
 	bool		match;

 	/*----------
 	 * Advance the scan keys in lock-step, until we find an item that matches
 	 * all the keys. If any key reports isFinished, meaning its subset of the
 	 * entries is exhausted, we can stop.  Otherwise, set *item to the next
 	 * matching item.
 	 *
 	 * This logic works only if a keyGetItem stream can never contain both
 	 * exact and lossy pointers for the same page.  Else we could have a
 	 * case like
 	 *
 	 *		stream 1		stream 2
 	 *		...             ...
 	 *		42/6			42/7
 	 *		50/1			42/0xffff
 	 *		...             ...
 	 *
 	 * We would conclude that 42/6 is not a match and advance stream 1,
 	 * thus never detecting the match to the lossy pointer in stream 2.
 	 * (keyGetItem has a similar problem versus entryGetItem.)
 	 *----------
 	 */
 	do
 	{
 		ItemPointerSetMin(item);
 		match = true;
 		for (i = 0; i < so->nkeys && match; i++)
 		{
 			GinScanKey	key = so->keys + i;

 			/*
 			 * If we're considering a lossy page, skip excludeOnly keys. They
 			 * can't exclude the whole page anyway.
 			 */
 			if (ItemPointerIsLossyPage(item) && key->excludeOnly)
 			{
 				/*
 				 * ginNewScanKey() should never mark the first key as
 				 * excludeOnly.
 				 */
 				Assert(i > 0);
 				continue;
 			}

 			/* Fetch the next item for this key that is > advancePast. */
 			keyGetItem(&so->ginstate, so->tempCtx, key, advancePast,
 					   scan->xs_snapshot);

 			if (key->isFinished)
 				return false;

 			/*
 			 * If it's not a match, we can immediately conclude that nothing
 			 * <= this item matches, without checking the rest of the keys.
 			 */
 			if (!key->curItemMatches)
 			{
 				advancePast = key->curItem;
 				match = false;
 				break;
 			}

 			/*
 			 * It's a match. We can conclude that nothing < matches, so the
 			 * other key streams can skip to this item.
 			 *
 			 * Beware of lossy pointers, though; from a lossy pointer, we can
 			 * only conclude that nothing smaller than this *block* matches.
 			 */
 			if (ItemPointerIsLossyPage(&key->curItem))
 			{
 				if (GinItemPointerGetBlockNumber(&advancePast) <
 					GinItemPointerGetBlockNumber(&key->curItem))
 				{
 					ItemPointerSet(&advancePast,
 								   GinItemPointerGetBlockNumber(&key->curItem),
 								   InvalidOffsetNumber);
 				}
 			}
 			else
 			{
 				Assert(GinItemPointerGetOffsetNumber(&key->curItem) > 0);
 				ItemPointerSet(&advancePast,
 							   GinItemPointerGetBlockNumber(&key->curItem),
 							   OffsetNumberPrev(GinItemPointerGetOffsetNumber(&key->curItem)));
 			}

 			/*
 			 * If this is the first key, remember this location as a potential
 			 * match, and proceed to check the rest of the keys.
 			 *
 			 * Otherwise, check if this is the same item that we checked the
 			 * previous keys for (or a lossy pointer for the same page). If
 			 * not, loop back to check the previous keys for this item (we
 			 * will check this key again too, but keyGetItem returns quickly
 			 * for that)
 			 */
 			if (i == 0)
 			{
 				*item = key->curItem;
 			}
 			else
 			{
 				if (ItemPointerIsLossyPage(&key->curItem) ||
 					ItemPointerIsLossyPage(item))
 				{
 					Assert(GinItemPointerGetBlockNumber(&key->curItem) >= GinItemPointerGetBlockNumber(item));
 					match = (GinItemPointerGetBlockNumber(&key->curItem) ==
 							 GinItemPointerGetBlockNumber(item));
 				}
 				else
 				{
 					Assert(ginCompareItemPointers(&key->curItem, item) >= 0);
 					match = (ginCompareItemPointers(&key->curItem, item) == 0);
 				}
 			}
 		}
 	} while (!match);

 	Assert(!ItemPointerIsMin(item));

 	/*
 	 * Now *item contains the first ItemPointer after previous result that
 	 * satisfied all the keys for that exact TID, or a lossy reference to the
 	 * same page.
 	 *
 	 * We must return recheck = true if any of the keys are marked recheck.
 	 */
 	*recheck = false;
 	for (i = 0; i < so->nkeys; i++)
 	{
 		GinScanKey	key = so->keys + i;

 		if (key->recheckCurItem)
 		{
 			*recheck = true;
 			break;
 		}
 	}

 	return true;
 }


 /*
  * Functions for scanning the pending list
  */


 /*
  * Get ItemPointer of next heap row to be checked from pending list.
  * Returns false if there are no more. On pages with several heap rows
  * it returns each row separately, on page with part of heap row returns
  * per page data.  pos->firstOffset and pos->lastOffset are set to identify
  * the range of pending-list tuples belonging to this heap row.
  *
  * The pendingBuffer is presumed pinned and share-locked on entry, and is
  * pinned and share-locked on success exit.  On failure exit it's released.
  */
 static bool
 scanGetCandidate(IndexScanDesc scan, pendingPosition *pos)
 {
 	OffsetNumber maxoff;
 	Page		page;
 	IndexTuple	itup;

 	ItemPointerSetInvalid(&pos->item);
 	for (;;)
 	{
 		page = BufferGetPage(pos->pendingBuffer);
 		TestForOldSnapshot(scan->xs_snapshot, scan->indexRelation, page);

 		maxoff = PageGetMaxOffsetNumber(page);
 		if (pos->firstOffset > maxoff)
 		{
 			BlockNumber blkno = GinPageGetOpaque(page)->rightlink;

 			if (blkno == InvalidBlockNumber)
 			{
 				UnlockReleaseBuffer(pos->pendingBuffer);
 				pos->pendingBuffer = InvalidBuffer;

 				return false;
 			}
 			else
 			{
 				/*
 				 * Here we must prevent deletion of next page by insertcleanup
 				 * process, which may be trying to obtain exclusive lock on
 				 * current page.  So, we lock next page before releasing the
 				 * current one
 				 */
 				Buffer		tmpbuf = ReadBuffer(scan->indexRelation, blkno);

 				LockBuffer(tmpbuf, GIN_SHARE);
 				UnlockReleaseBuffer(pos->pendingBuffer);

 				pos->pendingBuffer = tmpbuf;
 				pos->firstOffset = FirstOffsetNumber;
 			}
 		}
 		else
 		{
 			itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, pos->firstOffset));
 			pos->item = itup->t_tid;
 			if (GinPageHasFullRow(page))
 			{
 				/*
 				 * find itempointer to the next row
 				 */
 				for (pos->lastOffset = pos->firstOffset + 1; pos->lastOffset <= maxoff; pos->lastOffset++)
 				{
 					itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, pos->lastOffset));
 					if (!ItemPointerEquals(&pos->item, &itup->t_tid))
 						break;
 				}
 			}
 			else
 			{
 				/*
 				 * All itempointers are the same on this page
 				 */
 				pos->lastOffset = maxoff + 1;
 			}

 			/*
 			 * Now pos->firstOffset points to the first tuple of current heap
 			 * row, pos->lastOffset points to the first tuple of next heap row
 			 * (or to the end of page)
 			 */
 			break;
 		}
 	}

 	return true;
 }

 /*
  * Scan pending-list page from current tuple (off) up till the first of:
  * - match is found (then returns true)
  * - no later match is possible
  * - tuple's attribute number is not equal to entry's attrnum
  * - reach end of page
  *
  * datum[]/category[]/datumExtracted[] arrays are used to cache the results
  * of gintuple_get_key() on the current page.
  */
 static bool
 matchPartialInPendingList(GinState *ginstate, Page page,
 						  OffsetNumber off, OffsetNumber maxoff,
 						  GinScanEntry entry,
 						  Datum *datum, GinNullCategory *category,
 						  bool *datumExtracted)
 {
 	IndexTuple	itup;
 	int32		cmp;

 	/* Partial match to a null is not possible */
 	if (entry->queryCategory != GIN_CAT_NORM_KEY)
 		return false;

 	while (off < maxoff)
 	{
 		itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));

 		if (gintuple_get_attrnum(ginstate, itup) != entry->attnum)
 			return false;

 		if (datumExtracted[off - 1] == false)
 		{
 			datum[off - 1] = gintuple_get_key(ginstate, itup,
 											  &category[off - 1]);
 			datumExtracted[off - 1] = true;
 		}

 		/* Once we hit nulls, no further match is possible */
 		if (category[off - 1] != GIN_CAT_NORM_KEY)
 			return false;

 		/*----------
 		 * Check partial match.
 		 * case cmp == 0 => match
 		 * case cmp > 0 => not match and end scan (no later match possible)
 		 * case cmp < 0 => not match and continue scan
 		 *----------
 		 */
 		cmp = DatumGetInt32(FunctionCall4Coll(&ginstate->comparePartialFn[entry->attnum - 1],
 											  ginstate->supportCollation[entry->attnum - 1],
 											  entry->queryKey,
 											  datum[off - 1],
 											  UInt16GetDatum(entry->strategy),
 											  PointerGetDatum(entry->extra_data)));
 		if (cmp == 0)
 			return true;
 		else if (cmp > 0)
 			return false;

 		off++;
 	}

 	return false;
 }

 /*
  * Set up the entryRes array for each key by looking at
  * every entry for current heap row in pending list.
  *
  * Returns true if each scan key has at least one entryRes match.
  * This corresponds to the situations where the normal index search will
  * try to apply the key's consistentFn.  (A tuple not meeting that requirement
  * cannot be returned by the normal search since no entry stream will
  * source its TID.)
  *
  * The pendingBuffer is presumed pinned and share-locked on entry.
  */
 static bool
 collectMatchesForHeapRow(IndexScanDesc scan, pendingPosition *pos)
 {
 	GinScanOpaque so = (GinScanOpaque) scan->opaque;
 	OffsetNumber attrnum;
 	Page		page;
 	IndexTuple	itup;
 	int			i,
 				j;

 	/*
 	 * Reset all entryRes and hasMatchKey flags
 	 */
 	for (i = 0; i < so->nkeys; i++)
 	{
 		GinScanKey	key = so->keys + i;

 		memset(key->entryRes, GIN_FALSE, key->nentries);
 	}
 	memset(pos->hasMatchKey, false, so->nkeys);

 	/*
 	 * Outer loop iterates over multiple pending-list pages when a single heap
 	 * row has entries spanning those pages.
 	 */
 	for (;;)
 	{
 		Datum		datum[BLCKSZ / sizeof(IndexTupleData)];
 		GinNullCategory category[BLCKSZ / sizeof(IndexTupleData)];
 		bool		datumExtracted[BLCKSZ / sizeof(IndexTupleData)];

 		Assert(pos->lastOffset > pos->firstOffset);
 		memset(datumExtracted + pos->firstOffset - 1, 0,
 			   sizeof(bool) * (pos->lastOffset - pos->firstOffset));

 		page = BufferGetPage(pos->pendingBuffer);
 		TestForOldSnapshot(scan->xs_snapshot, scan->indexRelation, page);

 		for (i = 0; i < so->nkeys; i++)
 		{
 			GinScanKey	key = so->keys + i;

 			for (j = 0; j < key->nentries; j++)
 			{
 				GinScanEntry entry = key->scanEntry[j];
 				OffsetNumber StopLow = pos->firstOffset,
 							StopHigh = pos->lastOffset,
 							StopMiddle;

 				/* If already matched on earlier page, do no extra work */
 				if (key->entryRes[j])
 					continue;

 				/*
 				 * Interesting tuples are from pos->firstOffset to
 				 * pos->lastOffset and they are ordered by (attnum, Datum) as
 				 * it's done in entry tree.  So we can use binary search to
 				 * avoid linear scanning.
 				 */
 				while (StopLow < StopHigh)
 				{
 					int			res;

 					StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);

 					itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, StopMiddle));

 					attrnum = gintuple_get_attrnum(&so->ginstate, itup);

 					if (key->attnum < attrnum)
 					{
 						StopHigh = StopMiddle;
 						continue;
 					}
 					if (key->attnum > attrnum)
 					{
 						StopLow = StopMiddle + 1;
 						continue;
 					}

 					if (datumExtracted[StopMiddle - 1] == false)
 					{
 						datum[StopMiddle - 1] =
 							gintuple_get_key(&so->ginstate, itup,
 											 &category[StopMiddle - 1]);
 						datumExtracted[StopMiddle - 1] = true;
 					}

 					if (entry->queryCategory == GIN_CAT_EMPTY_QUERY)
 					{
 						/* special behavior depending on searchMode */
 						if (entry->searchMode == GIN_SEARCH_MODE_ALL)
 						{
 							/* match anything except NULL_ITEM */
 							if (category[StopMiddle - 1] == GIN_CAT_NULL_ITEM)
 								res = -1;
 							else
 								res = 0;
 						}
 						else
 						{
 							/* match everything */
 							res = 0;
 						}
 					}
 					else
 					{
 						res = ginCompareEntries(&so->ginstate,
 												entry->attnum,
 												entry->queryKey,
 												entry->queryCategory,
 												datum[StopMiddle - 1],
 												category[StopMiddle - 1]);
 					}

 					if (res == 0)
 					{
 						/*
 						 * Found exact match (there can be only one, except in
 						 * EMPTY_QUERY mode).
 						 *
 						 * If doing partial match, scan forward from here to
 						 * end of page to check for matches.
 						 *
 						 * See comment above about tuple's ordering.
 						 */
 						if (entry->isPartialMatch)
 							key->entryRes[j] =
 								matchPartialInPendingList(&so->ginstate,
 														  page,
 														  StopMiddle,
 														  pos->lastOffset,
 														  entry,
 														  datum,
 														  category,
 														  datumExtracted);
 						else
 							key->entryRes[j] = true;

 						/* done with binary search */
 						break;
 					}
 					else if (res < 0)
 						StopHigh = StopMiddle;
 					else
 						StopLow = StopMiddle + 1;
 				}

 				if (StopLow >= StopHigh && entry->isPartialMatch)
 				{
 					/*
 					 * No exact match on this page.  If doing partial match,
 					 * scan from the first tuple greater than target value to
 					 * end of page.  Note that since we don't remember whether
 					 * the comparePartialFn told us to stop early on a
 					 * previous page, we will uselessly apply comparePartialFn
 					 * to the first tuple on each subsequent page.
 					 */
 					key->entryRes[j] =
 						matchPartialInPendingList(&so->ginstate,
 												  page,
 												  StopHigh,
 												  pos->lastOffset,
 												  entry,
 												  datum,
 												  category,
 												  datumExtracted);
 				}

 				pos->hasMatchKey[i] |= key->entryRes[j];
 			}
 		}

 		/* Advance firstOffset over the scanned tuples */
 		pos->firstOffset = pos->lastOffset;

 		if (GinPageHasFullRow(page))
 		{
 			/*
 			 * We have examined all pending entries for the current heap row.
 			 * Break out of loop over pages.
 			 */
 			break;
 		}
 		else
 		{
 			/*
 			 * Advance to next page of pending entries for the current heap
 			 * row.  Complain if there isn't one.
 			 */
 			ItemPointerData item = pos->item;

 			if (scanGetCandidate(scan, pos) == false ||
 				!ItemPointerEquals(&pos->item, &item))
 				elog(ERROR, "could not find additional pending pages for same heap tuple");
 		}
 	}

 	/*
 	 * All scan keys except excludeOnly require at least one entry to match.
 	 * excludeOnly keys are an exception, because their implied
 	 * GIN_CAT_EMPTY_QUERY scanEntry always matches.  So return "true" if all
 	 * non-excludeOnly scan keys have at least one match.
 	 */
 	for (i = 0; i < so->nkeys; i++)
 	{
 		if (pos->hasMatchKey[i] == false && !so->keys[i].excludeOnly)
 			return false;
 	}

 	return true;
 }

 /*
  * Collect all matched rows from pending list into bitmap.
  */
 static void
 scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids)
 {
 	GinScanOpaque so = (GinScanOpaque) scan->opaque;
 	MemoryContext oldCtx;
 	bool		recheck,
 				match;
 	int			i;
 	pendingPosition pos;
 	Buffer		metabuffer = ReadBuffer(scan->indexRelation, GIN_METAPAGE_BLKNO);
 	Page		page;
 	BlockNumber blkno;

 	*ntids = 0;

 	/*
 	 * Acquire predicate lock on the metapage, to conflict with any fastupdate
 	 * insertions.
 	 */
 	PredicateLockPage(scan->indexRelation, GIN_METAPAGE_BLKNO, scan->xs_snapshot);

 	LockBuffer(metabuffer, GIN_SHARE);
 	page = BufferGetPage(metabuffer);
 	TestForOldSnapshot(scan->xs_snapshot, scan->indexRelation, page);
 	blkno = GinPageGetMeta(page)->head;

 	/*
 	 * fetch head of list before unlocking metapage. head page must be pinned
 	 * to prevent deletion by vacuum process
 	 */
 	if (blkno == InvalidBlockNumber)
 	{
 		/* No pending list, so proceed with normal scan */
 		UnlockReleaseBuffer(metabuffer);
 		return;
 	}

 	pos.pendingBuffer = ReadBuffer(scan->indexRelation, blkno);
 	LockBuffer(pos.pendingBuffer, GIN_SHARE);
 	pos.firstOffset = FirstOffsetNumber;
 	UnlockReleaseBuffer(metabuffer);
 	pos.hasMatchKey = palloc(sizeof(bool) * so->nkeys);

 	/*
 	 * loop for each heap row. scanGetCandidate returns full row or row's
 	 * tuples from first page.
 	 */
 	while (scanGetCandidate(scan, &pos))
 	{
 		/*
 		 * Check entries in tuple and set up entryRes array.
 		 *
 		 * If pending tuples belonging to the current heap row are spread
 		 * across several pages, collectMatchesForHeapRow will read all of
 		 * those pages.
 		 */
 		if (!collectMatchesForHeapRow(scan, &pos))
 			continue;

 		/*
 		 * Matching of entries of one row is finished, so check row using
 		 * consistent functions.
 		 */
 		oldCtx = MemoryContextSwitchTo(so->tempCtx);
 		recheck = false;
 		match = true;

 		for (i = 0; i < so->nkeys; i++)
 		{
 			GinScanKey	key = so->keys + i;

 			if (!key->boolConsistentFn(key))
 			{
 				match = false;
 				break;
 			}
 			recheck |= key->recheckCurItem;
 		}

 		MemoryContextSwitchTo(oldCtx);
 		MemoryContextReset(so->tempCtx);

 		if (match)
 		{
 			tbm_add_tuples(tbm, &pos.item, 1, recheck);
 			(*ntids)++;
 		}
 	}

 	pfree(pos.hasMatchKey);
 }


 #define GinIsVoidRes(s)		( ((GinScanOpaque) scan->opaque)->isVoidRes )

 int64
 gingetbitmap(IndexScanDesc scan, Node **bmNodeP)
 {

 	TIDBitmap  *tbm;
 	GinScanOpaque so = (GinScanOpaque) scan->opaque;
 	int64		ntids;
 	ItemPointerData iptr;
 	bool		recheck;

 	/*
 	 * GPDB specific code. Since GPDB also support StreamBitmap
 	 * in bitmap index. So normally we need to create specific bitmap
 	 * node in the amgetbitmap AM.
 	 */
 	Assert(bmNodeP);
 	if (*bmNodeP == NULL)
 	{
 		/* XXX should we use less than work_mem for this? */
 		tbm = tbm_create(work_mem * 1024L, scan->dsa);
 		*bmNodeP = (Node *) tbm;
 	}
 	else if (!IsA(*bmNodeP, TIDBitmap))
 		elog(ERROR, "non gin bitmap");
 	else
 		tbm = (TIDBitmap *)*bmNodeP;

 	/*
 	 * Set up the scan keys, and check for unsatisfiable query.
 	 */
 	ginFreeScanKeys(so);		/* there should be no keys yet, but just to be
 								 * sure */
 	ginNewScanKey(scan);

 	if (GinIsVoidRes(scan))
 		return 0;

 	ntids = 0;

 	/*
 	 * First, scan the pending list and collect any matching entries into the
 	 * bitmap.  After we scan a pending item, some other backend could post it
 	 * into the main index, and so we might visit it a second time during the
 	 * main scan.  This is okay because we'll just re-set the same bit in the
 	 * bitmap.  (The possibility of duplicate visits is a major reason why GIN
 	 * can't support the amgettuple API, however.) Note that it would not do
 	 * to scan the main index before the pending list, since concurrent
 	 * cleanup could then make us miss entries entirely.
 	 */
 	scanPendingInsert(scan, tbm, &ntids);

 	/*
 	 * Now scan the main index.
 	 */
 	startScan(scan);

 	ItemPointerSetMin(&iptr);

 	for (;;)
 	{
 		CHECK_FOR_INTERRUPTS();

 		if (!scanGetItem(scan, iptr, &iptr, &recheck))
 			break;

 		if (ItemPointerIsLossyPage(&iptr))
 			tbm_add_page(tbm, ItemPointerGetBlockNumber(&iptr));
 		else
 			tbm_add_tuples(tbm, &iptr, 1, recheck);
 		ntids++;
 	}

 	return ntids;
 }