src/backend/nodes/tidbitmap.c - cloudberry - Git at Google

 /*-------------------------------------------------------------------------
  *
  * tidbitmap.c
  *	  PostgreSQL tuple-id (TID) bitmap package
  *
  * This module provides bitmap data structures that are spiritually
  * similar to Bitmapsets, but are specially adapted to store sets of
  * tuple identifiers (TIDs), or ItemPointers.  In particular, the division
  * of an ItemPointer into BlockNumber and OffsetNumber is catered for.
  * Also, since we wish to be able to store very large tuple sets in
  * memory with this data structure, we support "lossy" storage, in which
  * we no longer remember individual tuple offsets on a page but only the
  * fact that a particular page needs to be visited.
  *
  * The "lossy" storage uses one bit per disk page, so at the standard 8K
  * BLCKSZ, we can represent all pages in 64Gb of disk space in about 1Mb
  * of memory.  People pushing around tables of that size should have a
  * couple of Mb to spare, so we don't worry about providing a second level
  * of lossiness.  In theory we could fall back to page ranges at some
  * point, but for now that seems useless complexity.
  *
  *
  * Copyright (c) 2003-2021, PostgreSQL Global Development Group
  *
  * IDENTIFICATION
  *	  src/backend/nodes/tidbitmap.c
  *
  *-------------------------------------------------------------------------
  */
 #include "postgres.h"

 #include <limits.h>

 #include "access/htup.h"
 #include "access/htup_details.h"
 #include "access/bitmap.h"		/* XXX: remove once pull_stream is generic */
 #include "common/hashfn.h"
 #include "executor/instrument.h"	/* Instrumentation */
 #include "nodes/tidbitmap.h"
 #include "storage/lwlock.h"
 #include "utils/dsa.h"

 #define WORDNUM(x)	((x) / TBM_BITS_PER_BITMAPWORD)
 #define BITNUM(x)	((x) % TBM_BITS_PER_BITMAPWORD)

 static bool tbm_iterate_page(PagetableEntry *page, TBMIterateResult *output);
 static PagetableEntry *tbm_next_page(TBMIterator *iterator, bool *more);
 static void tbm_upd_instrument(TIDBitmap *tbm);

 /*
  * Holds array of pagetable entries.
  */
 typedef struct PTEntryArray
 {
 	pg_atomic_uint32 refcount;	/* no. of iterator attached */
 	PagetableEntry ptentry[FLEXIBLE_ARRAY_MEMBER];
 } PTEntryArray;

 /*
  * We want to avoid the overhead of creating the hashtable, which is
  * comparatively large, when not necessary. Particularly when we are using a
  * bitmap scan on the inside of a nestloop join: a bitmap may well live only
  * long enough to accumulate one entry in such cases.  We therefore avoid
  * creating an actual hashtable until we need two pagetable entries.  When
  * just one pagetable entry is needed, we store it in a fixed field of
  * TIDBitMap.  (NOTE: we don't get rid of the hashtable if the bitmap later
  * shrinks down to zero or one page again.  So, status can be TBM_HASH even
  * when nentries is zero or one.)
  */
 typedef enum
 {
 	TBM_EMPTY,					/* no hashtable, nentries == 0 */
 	TBM_ONE_PAGE,				/* entry1 contains the single entry */
 	TBM_HASH					/* pagetable is valid, entry1 is not */
 } TBMStatus;

 /*
  * Current iterating state of the TBM.
  */
 typedef enum
 {
 	TBM_NOT_ITERATING,			/* not yet converted to page and chunk array */
 	TBM_ITERATING_PRIVATE,		/* converted to local page and chunk array */
 	TBM_ITERATING_SHARED		/* converted to shared page and chunk array */
 } TBMIteratingState;

 /*
  * Here is the representation for a whole TIDBitMap:
  */
 struct TIDBitmap
 {
 	NodeTag		type;			/* to make it a valid Node */
 	MemoryContext mcxt;			/* memory context containing me */
 	TBMStatus	status;			/* see codes above */
 	struct pagetable_hash *pagetable;	/* hash table of PagetableEntry's */
 	int			nentries;		/* number of entries in pagetable */
 	int			nentries_hwm;	/* high-water mark for number of entries */
 	int			maxentries;		/* limit on same to meet maxbytes */
 	int			npages;			/* number of exact entries in pagetable */
 	int			nchunks;		/* number of lossy entries in pagetable */
 	TBMIteratingState iterating;	/* tbm_begin_iterate called? */
 	uint32		lossify_start;	/* offset to start lossifying hashtable at */
 	PagetableEntry entry1;		/* used when status == TBM_ONE_PAGE */
 	/* these are valid when iterating is true: */
 	PagetableEntry **spages;	/* sorted exact-page list, or NULL */
 	PagetableEntry **schunks;	/* sorted lossy-chunk list, or NULL */
     dsa_pointer dsapagetable;	/* dsa_pointer to the element array */
     dsa_pointer dsapagetableold;	/* dsa_pointer to the old element array */
     dsa_pointer ptpages;		/* dsa_pointer to the page array */
     dsa_pointer ptchunks;		/* dsa_pointer to the chunk array */
     dsa_area   *dsa;			/* reference to per-query dsa area */

 	/* CDB: Statistics for EXPLAIN ANALYZE */
 	struct Instrumentation *instrument;
 	Size		bytesperentry;
 };

 /*
  * When iterating over a bitmap in sorted order, a TBMIterator is used to
  * track our progress.  There can be several iterators scanning the same
  * bitmap concurrently.  Note that the bitmap becomes read-only as soon as
  * any iterator is created.
  */
 struct TBMIterator
 {
 	TIDBitmap  *tbm;			/* TIDBitmap we're iterating over */
 	int			spageptr;		/* next spages index */
 	int			schunkptr;		/* next schunks index */
 	int			schunkbit;		/* next bit to check in current schunk */
 	TBMIterateResult output;	/* MUST BE LAST (because variable-size) */
 };

 struct GenericBMIterator
 {
 	const Node *bm;				/* [TID|Stream]Bitmap we're iterating over */
 	union
 	{
 		TBMIterator		 *hash;		/* iterator for TIDBitmap implementation */
 		StreamBMIterator *stream;	/* iterator for StreamBitmap implementation */
 	} impl;
 };

 /*
  * Holds the shared members of the iterator so that multiple processes
  * can jointly iterate.
  */
 typedef struct TBMSharedIteratorState
 {
 	int			nentries;		/* number of entries in pagetable */
 	int			maxentries;		/* limit on same to meet maxbytes */
 	int			npages;			/* number of exact entries in pagetable */
 	int			nchunks;		/* number of lossy entries in pagetable */
 	dsa_pointer pagetable;		/* dsa pointers to head of pagetable data */
 	dsa_pointer spages;			/* dsa pointer to page array */
 	dsa_pointer schunks;		/* dsa pointer to chunk array */
 	LWLock		lock;			/* lock to protect below members */
 	int			spageptr;		/* next spages index */
 	int			schunkptr;		/* next schunks index */
 	int			schunkbit;		/* next bit to check in current schunk */
 } TBMSharedIteratorState;

 /*
  * pagetable iteration array.
  */
 typedef struct PTIterationArray
 {
 	pg_atomic_uint32 refcount;	/* no. of iterator attached */
 	int			index[FLEXIBLE_ARRAY_MEMBER];	/* index array */
 } PTIterationArray;

 /*
  * same as TBMIterator, but it is used for joint iteration, therefore this
  * also holds a reference to the shared state.
  */
 struct TBMSharedIterator
 {
 	TBMSharedIteratorState *state;	/* shared state */
 	PTEntryArray *ptbase;		/* pagetable element array */
 	PTIterationArray *ptpages;	/* sorted exact page index list */
 	PTIterationArray *ptchunks; /* sorted lossy page index list */
 	TBMIterateResult output;	/* MUST BE LAST (because variable-size) */
 };

 /* Local function prototypes */
 static void tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage);
 static bool tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage,
 							   const TIDBitmap *b);
 static const PagetableEntry *tbm_find_pageentry(const TIDBitmap *tbm,
 												BlockNumber pageno);
 static PagetableEntry *tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno);
 static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno);
 static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno);
 static void tbm_lossify(TIDBitmap *tbm);
 static int	tbm_comparator(const void *left, const void *right);
 static int	tbm_shared_comparator(const void *left, const void *right,
 								  void *arg);
 static void tbm_stream_set_instrument(StreamNode * self, struct Instrumentation *instr);
 static void tbm_stream_upd_instrument(StreamNode * self);

 /* define hashtable mapping block numbers to PagetableEntry's */
 #define SH_USE_NONDEFAULT_ALLOCATOR
 #define SH_PREFIX pagetable
 #define SH_ELEMENT_TYPE PagetableEntry
 #define SH_KEY_TYPE BlockNumber
 #define SH_KEY blockno
 #define SH_HASH_KEY(tb, key) murmurhash32(key)
 #define SH_EQUAL(tb, a, b) a == b
 #define SH_SCOPE static inline
 #define SH_DEFINE
 #define SH_DECLARE
 #include "lib/simplehash.h"

 static StreamBMIterator *tbm_stream_begin_iterate(StreamNode *node);
 static void tbm_stream_end_iterate(StreamBMIterator *iterator);

 /* IndexStream callbacks */
 static void index_stream_begin_iterate(StreamNode *self, StreamBMIterator *iterator);
 static bool tbm_stream_block(StreamBMIterator *iterator, PagetableEntry *e);
 static void index_stream_end_iterate(StreamBMIterator *self);
 static void tbm_stream_free(StreamNode *self);

 /* OpStream callbacks */
 static void opstream_begin_iterate(StreamNode *self, StreamBMIterator *iterator);
 static bool opstream_iterate(StreamBMIterator *iterator, PagetableEntry *e);
 static void opstream_end_iterate(StreamBMIterator *self);
 static void opstream_free(StreamNode *self);

 /*
  * tbm_create - create an initially-empty bitmap
  *
  * The bitmap will live in the memory context that is CurrentMemoryContext
  * at the time of this call.  It will be limited to (approximately) maxbytes
  * total memory consumption.  If the DSA passed to this function is not NULL
  * then the memory for storing elements of the underlying page table will
  * be allocated from the DSA.
  */
 TIDBitmap *
 tbm_create(long maxbytes, dsa_area *dsa)
 {
 	TIDBitmap  *tbm;

 	/*
 	 * Ensure that we don't have heap tuple offsets going beyond (INT16_MAX +
 	 * 1) or 32768. The executor iterates only over the first 32K tuples for
 	 * lossy bitmap pages [MPP-24326].
 	 */
 	COMPILE_ASSERT(MaxHeapTuplesPerPage <= (INT16_MAX + 1));

 	/* Create the TIDBitmap struct and zero all its fields */
 	tbm = makeNode(TIDBitmap);

 	tbm->mcxt = CurrentMemoryContext;
 	tbm->status = TBM_EMPTY;
 	tbm->instrument = NULL;

 	tbm->maxentries = (int) tbm_calculate_entries(maxbytes);
 	tbm->lossify_start = 0;
 	tbm->dsa = dsa;
 	tbm->dsapagetable = InvalidDsaPointer;
 	tbm->dsapagetableold = InvalidDsaPointer;
 	tbm->ptpages = InvalidDsaPointer;
 	tbm->ptchunks = InvalidDsaPointer;

 	return tbm;
 }

 /*
  * Actually create the hashtable.  Since this is a moderately expensive
  * proposition, we don't do it until we have to.
  */
 static void
 tbm_create_pagetable(TIDBitmap *tbm)
 {
 	Assert(tbm->status != TBM_HASH);
 	Assert(tbm->pagetable == NULL);

 	tbm->pagetable = pagetable_create(tbm->mcxt, 128, tbm);

 	/* If entry1 is valid, push it into the hashtable */
 	if (tbm->status == TBM_ONE_PAGE)
 	{
 		PagetableEntry *page;
 		bool		found;
 		char		oldstatus;

 		page = pagetable_insert(tbm->pagetable,
 								tbm->entry1.blockno,
 								&found);
 		Assert(!found);
 		oldstatus = page->status;
 		memcpy(page, &tbm->entry1, sizeof(PagetableEntry));
 		page->status = oldstatus;
 	}

 	tbm->status = TBM_HASH;
 }

 /*
  * tbm_free - free a TIDBitmap
  */
 void
 tbm_free(TIDBitmap *tbm)
 {
 	if (tbm->instrument)
 		tbm_upd_instrument(tbm);
 	if (tbm->pagetable)
 		pagetable_destroy(tbm->pagetable);
 	if (tbm->spages)
 		pfree(tbm->spages);
 	if (tbm->schunks)
 		pfree(tbm->schunks);
 	pfree(tbm);
 }


 /*
  * tbm_upd_instrument - Update the Instrumentation attached to a TIDBitmap.
  */
 static void
 tbm_upd_instrument(TIDBitmap *tbm)
 {
 	Instrumentation *instr = tbm->instrument;
 	Size		workmemused;

 	if (!instr)
 		return;

 	/* Update page table high-water mark. */
 	tbm->nentries_hwm = Max(tbm->nentries_hwm, tbm->nentries);

 	/* How much of our work_mem quota was actually used? */
 	workmemused = tbm->nentries_hwm * tbm->bytesperentry;
 	instr->workmemused = Max(instr->workmemused, workmemused);
 }	/* tbm_upd_instrument */


 /*
  * tbm_set_instrument
  *	Attach caller's Instrumentation object to a TIDBitmap, unless the
  *	TIDBitmap already has one.  We want the statistics to be associated
  *	with the plan node which originally created the bitmap, rather than a
  *	downstream consumer of the bitmap.
  */
 static void
 tbm_set_instrument(TIDBitmap *tbm, struct Instrumentation *instr)
 {
 	if (instr == NULL ||
 		tbm->instrument == NULL)
 	{
 		tbm->instrument = instr;
 		tbm_upd_instrument(tbm);
 	}
 }	/* tbm_set_instrument */


 /*
  * tbm_free_shared_area - free shared state
  *
  * Free shared iterator state, Also free shared pagetable and iterator arrays
  * memory if they are not referred by any of the shared iterator i.e recount
  * is becomes 0.
  */
 void
 tbm_free_shared_area(dsa_area *dsa, dsa_pointer dp)
 {
 	TBMSharedIteratorState *istate = dsa_get_address(dsa, dp);
 	PTEntryArray *ptbase;
 	PTIterationArray *ptpages;
 	PTIterationArray *ptchunks;

 	if (DsaPointerIsValid(istate->pagetable))
 	{
 		ptbase = dsa_get_address(dsa, istate->pagetable);
 		if (pg_atomic_sub_fetch_u32(&ptbase->refcount, 1) == 0)
 			dsa_free(dsa, istate->pagetable);
 	}
 	if (DsaPointerIsValid(istate->spages))
 	{
 		ptpages = dsa_get_address(dsa, istate->spages);
 		if (pg_atomic_sub_fetch_u32(&ptpages->refcount, 1) == 0)
 			dsa_free(dsa, istate->spages);
 	}
 	if (DsaPointerIsValid(istate->schunks))
 	{
 		ptchunks = dsa_get_address(dsa, istate->schunks);
 		if (pg_atomic_sub_fetch_u32(&ptchunks->refcount, 1) == 0)
 			dsa_free(dsa, istate->schunks);
 	}

 	dsa_free(dsa, dp);
 }

 /*
  * tbm_add_tuples - add some tuple IDs to a TIDBitmap
  */
 void
 tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids,
 			   bool recheck)
 {
 	BlockNumber currblk = InvalidBlockNumber;
 	PagetableEntry *page = NULL;	/* only valid when currblk is valid */
 	int			i;

 	Assert(tbm->iterating == TBM_NOT_ITERATING);
 	for (i = 0; i < ntids; i++)
 	{
 		BlockNumber blk = ItemPointerGetBlockNumber(tids + i);
 		OffsetNumber off = ItemPointerGetOffsetNumber(tids + i);
 		int			wordnum,
 					bitnum;

 		/* safety check to ensure we don't overrun bit array bounds */

 		/* UNDONE: Turn this off until we convert this module to AO TIDs. */
 #if 0
 		if (off < 1 || off > MAX_TUPLES_PER_PAGE)
 			elog(ERROR, "tuple offset out of range: %u", off);
 #endif

 		/*
 		 * Look up target page unless we already did.  This saves cycles when
 		 * the input includes consecutive tuples on the same page, which is
 		 * common enough to justify an extra test here.
 		 */
 		if (blk != currblk)
 		{
 			if (tbm_page_is_lossy(tbm, blk))
 				page = NULL;	/* remember page is lossy */
 			else
 				page = tbm_get_pageentry(tbm, blk);
 			currblk = blk;
 		}

 		if (page == NULL)
 			continue;			/* whole page is already marked */

 		if (page->ischunk)
 		{
 			/* The page is a lossy chunk header, set bit for itself */
 			wordnum = bitnum = 0;
 		}
 		else
 		{
 			/* Page is exact, so set bit for individual tuple */
 			wordnum = WORDNUM(off - 1);
 			bitnum = BITNUM(off - 1);
 		}
 		page->words[wordnum] |= ((tbm_bitmapword) 1 << bitnum);
 		page->recheck |= recheck;

 		if (tbm->nentries > tbm->maxentries)
 		{
 			tbm_lossify(tbm);
 			/* Page could have been converted to lossy, so force new lookup */
 			currblk = InvalidBlockNumber;
 		}
 	}
 }

 /*
  * tbm_add_page - add a whole page to a TIDBitmap
  *
  * This causes the whole page to be reported (with the recheck flag)
  * when the TIDBitmap is scanned.
  */
 void
 tbm_add_page(TIDBitmap *tbm, BlockNumber pageno)
 {
 	/* Enter the page in the bitmap, or mark it lossy if already present */
 	tbm_mark_page_lossy(tbm, pageno);
 	/* If we went over the memory limit, lossify some more pages */
 	if (tbm->nentries > tbm->maxentries)
 		tbm_lossify(tbm);
 }

 /*
  * tbm_union - set union
  *
  * a is modified in-place, b is not changed
  */
 void
 tbm_union(TIDBitmap *a, const TIDBitmap *b)
 {
 	Assert(!a->iterating);
 	/* Nothing to do if b is empty */
 	if (b->nentries == 0)
 		return;
 	/* Scan through chunks and pages in b, merge into a */
 	if (b->status == TBM_ONE_PAGE)
 		tbm_union_page(a, &b->entry1);
 	else
 	{
 		pagetable_iterator i;
 		PagetableEntry *bpage;

 		Assert(b->status == TBM_HASH);
 		pagetable_start_iterate(b->pagetable, &i);
 		while ((bpage = pagetable_iterate(b->pagetable, &i)) != NULL)
 			tbm_union_page(a, bpage);
 	}
 }

 /* Process one page of b during a union op */
 static void
 tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage)
 {
 	PagetableEntry *apage;
 	int			wordnum;

 	if (bpage->ischunk)
 	{
 		/* Scan b's chunk, mark each indicated page lossy in a */
 		for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
 		{
 			tbm_bitmapword w = bpage->words[wordnum];

 			if (w != 0)
 			{
 				BlockNumber pg;

 				pg = bpage->blockno + (wordnum * TBM_BITS_PER_BITMAPWORD);
 				while (w != 0)
 				{
 					if (w & 1)
 						tbm_mark_page_lossy(a, pg);
 					pg++;
 					w >>= 1;
 				}
 			}
 		}
 	}
 	else if (tbm_page_is_lossy(a, bpage->blockno))
 	{
 		/* page is already lossy in a, nothing to do */
 		return;
 	}
 	else
 	{
 		apage = tbm_get_pageentry(a, bpage->blockno);
 		if (apage->ischunk)
 		{
 			/* The page is a lossy chunk header, set bit for itself */
 			apage->words[0] |= ((tbm_bitmapword) 1 << 0);
 		}
 		else
 		{
 			/* Both pages are exact, merge at the bit level */
 			for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
 				apage->words[wordnum] |= bpage->words[wordnum];
 			apage->recheck |= bpage->recheck;
 		}
 	}

 	if (a->nentries > a->maxentries)
 		tbm_lossify(a);
 }

 /*
  * tbm_intersect - set intersection
  *
  * a is modified in-place, b is not changed
  */
 void
 tbm_intersect(TIDBitmap *a, const TIDBitmap *b)
 {
 	Assert(!a->iterating);
 	/* Nothing to do if a is empty */
 	if (a->nentries == 0)
 		return;

 	a->nentries_hwm = Max(a->nentries_hwm, a->nentries);

 	/* Scan through chunks and pages in a, try to match to b */
 	if (a->status == TBM_ONE_PAGE)
 	{
 		if (tbm_intersect_page(a, &a->entry1, b))
 		{
 			/* Page is now empty, remove it from a */
 			Assert(!a->entry1.ischunk);
 			a->npages--;
 			a->nentries--;
 			Assert(a->nentries == 0);
 			a->status = TBM_EMPTY;
 		}
 	}
 	else
 	{
 		pagetable_iterator i;
 		PagetableEntry *apage;

 		Assert(a->status == TBM_HASH);
 		pagetable_start_iterate(a->pagetable, &i);
 		while ((apage = pagetable_iterate(a->pagetable, &i)) != NULL)
 		{
 			if (tbm_intersect_page(a, apage, b))
 			{
 				/* Page or chunk is now empty, remove it from a */
 				if (apage->ischunk)
 					a->nchunks--;
 				else
 					a->npages--;
 				a->nentries--;
 				if (!pagetable_delete(a->pagetable, apage->blockno))
 					elog(ERROR, "hash table corrupted");
 			}
 		}
 	}
 }

 /*
  * Process one page of a during an intersection op
  *
  * Returns true if apage is now empty and should be deleted from a
  */
 static bool
 tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage, const TIDBitmap *b)
 {
 	const PagetableEntry *bpage;
 	int			wordnum;

 	if (apage->ischunk)
 	{
 		/* Scan each bit in chunk, try to clear */
 		bool		candelete = true;

 		for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
 		{
 			tbm_bitmapword w = apage->words[wordnum];

 			if (w != 0)
 			{
 				tbm_bitmapword neww = w;
 				BlockNumber pg;
 				int			bitnum;

 				pg = apage->blockno + (wordnum * TBM_BITS_PER_BITMAPWORD);
 				bitnum = 0;
 				while (w != 0)
 				{
 					if (w & 1)
 					{
 						if (!tbm_page_is_lossy(b, pg) &&
 							tbm_find_pageentry(b, pg) == NULL)
 						{
 							/* Page is not in b at all, lose lossy bit */
 							neww &= ~((tbm_bitmapword) 1 << bitnum);
 						}
 					}
 					pg++;
 					bitnum++;
 					w >>= 1;
 				}
 				apage->words[wordnum] = neww;
 				if (neww != 0)
 					candelete = false;
 			}
 		}
 		return candelete;
 	}
 	else if (tbm_page_is_lossy(b, apage->blockno))
 	{
 		/*
 		 * Some of the tuples in 'a' might not satisfy the quals for 'b', but
 		 * because the page 'b' is lossy, we don't know which ones. Therefore
 		 * we mark 'a' as requiring rechecks, to indicate that at most those
 		 * tuples set in 'a' are matches.
 		 */
 		apage->recheck = true;
 		return false;
 	}
 	else
 	{
 		bool		candelete = true;

 		bpage = tbm_find_pageentry(b, apage->blockno);
 		if (bpage != NULL)
 		{
 			/* Both pages are exact, merge at the bit level */
 			Assert(!bpage->ischunk);
 			for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
 			{
 				apage->words[wordnum] &= bpage->words[wordnum];
 				if (apage->words[wordnum] != 0)
 					candelete = false;
 			}
 			apage->recheck |= bpage->recheck;
 		}
 		/* If there is no matching b page, we can just delete the a page */
 		return candelete;
 	}
 }

 /*
  * tbm_is_empty - is a TIDBitmap completely empty?
  */
 bool
 tbm_is_empty(const TIDBitmap *tbm)
 {
 	return (tbm->nentries == 0);
 }

 /*
  * tbm_begin_iterate - prepare to iterate through a TIDBitmap
  *
  * The TBMIterator struct is created in the caller's memory context.
  * For a clean shutdown of the iteration, call tbm_end_iterate; but it's
  * okay to just allow the memory context to be released, too.  It is caller's
  * responsibility not to touch the TBMIterator anymore once the TIDBitmap
  * is freed.
  *
  * NB: after this is called, it is no longer allowed to modify the contents
  * of the bitmap.  However, you can call this multiple times to scan the
  * contents repeatedly, including parallel scans.
  */
 TBMIterator *
 tbm_begin_iterate(TIDBitmap *tbm)
 {
 	TBMIterator *iterator;

 	Assert(tbm->iterating != TBM_ITERATING_SHARED);

 	/*
 	 * Create the TBMIterator struct, with enough trailing space to serve the
 	 * needs of the TBMIterateResult sub-struct.
 	 */
 	iterator = (TBMIterator *) palloc(sizeof(TBMIterator) +
 									  MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
 	iterator->tbm = tbm;

 	/*
 	 * Initialize iteration pointers.
 	 */
 	iterator->spageptr = 0;
 	iterator->schunkptr = 0;
 	iterator->schunkbit = 0;

 	/*
 	 * If we have a hashtable, create and fill the sorted page lists, unless
 	 * we already did that for a previous iterator.  Note that the lists are
 	 * attached to the bitmap not the iterator, so they can be used by more
 	 * than one iterator.
 	 */
 	if (tbm->status == TBM_HASH && tbm->iterating == TBM_NOT_ITERATING)
 	{
 		pagetable_iterator i;
 		PagetableEntry *page;
 		int			npages;
 		int			nchunks;

 		if (!tbm->spages && tbm->npages > 0)
 			tbm->spages = (PagetableEntry **)
 				MemoryContextAlloc(tbm->mcxt,
 								   tbm->npages * sizeof(PagetableEntry *));
 		if (!tbm->schunks && tbm->nchunks > 0)
 			tbm->schunks = (PagetableEntry **)
 				MemoryContextAlloc(tbm->mcxt,
 								   tbm->nchunks * sizeof(PagetableEntry *));

 		npages = nchunks = 0;
 		pagetable_start_iterate(tbm->pagetable, &i);
 		while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
 		{
 			if (page->ischunk)
 				tbm->schunks[nchunks++] = page;
 			else
 				tbm->spages[npages++] = page;
 		}
 		Assert(npages == tbm->npages);
 		Assert(nchunks == tbm->nchunks);
 		if (npages > 1)
 			qsort(tbm->spages, npages, sizeof(PagetableEntry *),
 				  tbm_comparator);
 		if (nchunks > 1)
 			qsort(tbm->schunks, nchunks, sizeof(PagetableEntry *),
 				  tbm_comparator);
 	}

 	tbm->iterating = TBM_ITERATING_PRIVATE;

 	return iterator;
 }

 /*
  * tbm_stream_begin_iterate - prepare to iterate through a StreamBitmap
  */
 static StreamBMIterator *
 tbm_stream_begin_iterate(StreamNode *node)
 {
 	/*
 	 * Create the StreamBMIterator struct, with enough trailing space to serve
 	 * the needs of the TBMIterateResult sub-struct.
 	 */
 	StreamBMIterator *iterator = palloc0(sizeof(StreamBMIterator) +
 								 MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));

 	iterator->node = node;
 	node->begin_iterate(node, iterator);

 	return iterator;
 }

 /*
  * tbm_generic_begin_iterate - prepare to iterate through either a TIDBitmap or
  * a StreamBitmap
  *
  * The GenericBMIterator struct is created in the caller's memory context.
  * For a clean shutdown of the iteration, call tbm_generic_end_iterate.
  * It is caller's responsibility not to touch the iterator anymore once the
  * Node passed to this function is freed.
  *
  * Similarly to tbm_begin_iterate, after this is called, it is no longer allowed
  * to modify the contents of the bitmap.  However, you can call this multiple
  * times to scan the contents repeatedly, including parallel scans.
  */
 GenericBMIterator *
 tbm_generic_begin_iterate(Node *bm)
 {
 	GenericBMIterator *iterator;

 	Assert(IsA(bm, TIDBitmap) || IsA(bm, StreamBitmap));

 	/* Allocate space. */
 	iterator = palloc(sizeof(*iterator));
 	iterator->bm = bm;

 	switch (bm->type)
 	{
 		case T_TIDBitmap:
 			iterator->impl.hash = tbm_begin_iterate((TIDBitmap *) bm);
 			break;

 		case T_StreamBitmap:
 		{
 			StreamBitmap *sbm = (StreamBitmap *) bm;
 			iterator->impl.stream = tbm_stream_begin_iterate(sbm->streamNode);
 			break;
 		}

 		default:
 			elog(ERROR, "invalid node type");
 	}

 	return iterator;
 }

 /*
  * tbm_generic_iterate - scan through next page of a TIDBitmap or a
  * StreamBitmap.
  */
 TBMIterateResult *
 tbm_generic_iterate(GenericBMIterator *iterator)
 {
 	const Node *tbm = iterator->bm;

 	Assert(IsA(tbm, TIDBitmap) || IsA(tbm, StreamBitmap));

 	switch (tbm->type)
 	{
 		case T_TIDBitmap:
 			{
 #ifdef USE_ASSERT_CHECKING
 				const TIDBitmap *hashBitmap = (const TIDBitmap *) tbm;
 #endif
 				TBMIterator *hashIterator = iterator->impl.hash;

 				Assert(hashIterator->tbm == hashBitmap);
 				Assert(hashBitmap->iterating);

 				return tbm_iterate(hashIterator);
 			}
 		case T_StreamBitmap:
 			{
 				StreamBMIterator *streamIterator = iterator->impl.stream;
 				TBMIterateResult *output = NULL;

 				MemSet(&streamIterator->entry, 0, sizeof(PagetableEntry));
 				if (streamIterator->pull(streamIterator, &streamIterator->entry))
 				{
 					output = &streamIterator->output;
 					tbm_iterate_page(&streamIterator->entry, output);
 				}

 				return output;
 			}
 		default:
 			elog(ERROR, "unrecoganized node type");
 	}

 	return NULL;
 }

 /*
  * tbm_prepare_shared_iterate - prepare shared iteration state for a TIDBitmap.
  *
  * The necessary shared state will be allocated from the DSA passed to
  * tbm_create, so that multiple processes can attach to it and iterate jointly.
  *
  * This will convert the pagetable hash into page and chunk array of the index
  * into pagetable array.
  */
 dsa_pointer
 tbm_prepare_shared_iterate(TIDBitmap *tbm)
 {
 	dsa_pointer dp;
 	TBMSharedIteratorState *istate;
 	PTEntryArray *ptbase = NULL;
 	PTIterationArray *ptpages = NULL;
 	PTIterationArray *ptchunks = NULL;

 	Assert(tbm->dsa != NULL);
 	Assert(tbm->iterating != TBM_ITERATING_PRIVATE);

 	/*
 	 * Allocate TBMSharedIteratorState from DSA to hold the shared members and
 	 * lock, this will also be used by multiple worker for shared iterate.
 	 */
 	dp = dsa_allocate0(tbm->dsa, sizeof(TBMSharedIteratorState));
 	istate = dsa_get_address(tbm->dsa, dp);

 	/*
 	 * If we're not already iterating, create and fill the sorted page lists.
 	 * (If we are, the sorted page lists are already stored in the TIDBitmap,
 	 * and we can just reuse them.)
 	 */
 	if (tbm->iterating == TBM_NOT_ITERATING)
 	{
 		pagetable_iterator i;
 		PagetableEntry *page;
 		int			idx;
 		int			npages;
 		int			nchunks;

 		/*
 		 * Allocate the page and chunk array memory from the DSA to share
 		 * across multiple processes.
 		 */
 		if (tbm->npages)
 		{
 			tbm->ptpages = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
 										tbm->npages * sizeof(int));
 			ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
 			pg_atomic_init_u32(&ptpages->refcount, 0);
 		}
 		if (tbm->nchunks)
 		{
 			tbm->ptchunks = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
 										 tbm->nchunks * sizeof(int));
 			ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);
 			pg_atomic_init_u32(&ptchunks->refcount, 0);
 		}

 		/*
 		 * If TBM status is TBM_HASH then iterate over the pagetable and
 		 * convert it to page and chunk arrays.  But if it's in the
 		 * TBM_ONE_PAGE mode then directly allocate the space for one entry
 		 * from the DSA.
 		 */
 		npages = nchunks = 0;
 		if (tbm->status == TBM_HASH)
 		{
 			ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);

 			pagetable_start_iterate(tbm->pagetable, &i);
 			while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
 			{
 				idx = page - ptbase->ptentry;
 				if (page->ischunk)
 					ptchunks->index[nchunks++] = idx;
 				else
 					ptpages->index[npages++] = idx;
 			}

 			Assert(npages == tbm->npages);
 			Assert(nchunks == tbm->nchunks);
 		}
 		else if (tbm->status == TBM_ONE_PAGE)
 		{
 			/*
 			 * In one page mode allocate the space for one pagetable entry,
 			 * initialize it, and directly store its index (i.e. 0) in the
 			 * page array.
 			 */
 			tbm->dsapagetable = dsa_allocate(tbm->dsa, sizeof(PTEntryArray) +
 											 sizeof(PagetableEntry));
 			ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
 			memcpy(ptbase->ptentry, &tbm->entry1, sizeof(PagetableEntry));
 			ptpages->index[0] = 0;
 		}

 		if (ptbase != NULL)
 			pg_atomic_init_u32(&ptbase->refcount, 0);
 		if (npages > 1)
 			qsort_arg((void *) (ptpages->index), npages, sizeof(int),
 					  tbm_shared_comparator, (void *) ptbase->ptentry);
 		if (nchunks > 1)
 			qsort_arg((void *) (ptchunks->index), nchunks, sizeof(int),
 					  tbm_shared_comparator, (void *) ptbase->ptentry);
 	}

 	/*
 	 * Store the TBM members in the shared state so that we can share them
 	 * across multiple processes.
 	 */
 	istate->nentries = tbm->nentries;
 	istate->maxentries = tbm->maxentries;
 	istate->npages = tbm->npages;
 	istate->nchunks = tbm->nchunks;
 	istate->pagetable = tbm->dsapagetable;
 	istate->spages = tbm->ptpages;
 	istate->schunks = tbm->ptchunks;

 	ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
 	ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
 	ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);

 	/*
 	 * For every shared iterator, referring to pagetable and iterator array,
 	 * increase the refcount by 1 so that while freeing the shared iterator we
 	 * don't free pagetable and iterator array until its refcount becomes 0.
 	 */
 	if (ptbase != NULL)
 		pg_atomic_add_fetch_u32(&ptbase->refcount, 1);
 	if (ptpages != NULL)
 		pg_atomic_add_fetch_u32(&ptpages->refcount, 1);
 	if (ptchunks != NULL)
 		pg_atomic_add_fetch_u32(&ptchunks->refcount, 1);

 	/* Initialize the iterator lock */
 	LWLockInitialize(&istate->lock, LWTRANCHE_SHARED_TIDBITMAP);

 	/* Initialize the shared iterator state */
 	istate->schunkbit = 0;
 	istate->schunkptr = 0;
 	istate->spageptr = 0;

 	tbm->iterating = TBM_ITERATING_SHARED;

 	return dp;
 }

 /*
  * tbm_extract_page_tuple - extract the tuple offsets from a page
  *
  * The extracted offsets are stored into TBMIterateResult.
  */
 static inline int
 tbm_extract_page_tuple(PagetableEntry *page, TBMIterateResult *output)
 {
 	int			wordnum;
 	int			ntuples = 0;

 	for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
 	{
 		tbm_bitmapword	w = page->words[wordnum];

 		if (w != 0)
 		{
 			int			off = wordnum * TBM_BITS_PER_BITMAPWORD + 1;

 			while (w != 0)
 			{
 				if (w & 1)
 					output->offsets[ntuples++] = (OffsetNumber) off;
 				off++;
 				w >>= 1;
 			}
 		}
 	}

 	return ntuples;
 }

 /*
  * tbm_iterate_page - get a TBMIterateResult from a given PagetableEntry.
  */
 static bool
 tbm_iterate_page(PagetableEntry *page, TBMIterateResult *output)
 {
 	int			ntuples;

 	if (page->ischunk)
 	{
 		ntuples = -1;
 		output->recheck = true;
 	}
 	else
 	{
 		/* scan bitmap to extract individual offset numbers */
 		ntuples = tbm_extract_page_tuple(page, output);
 		output->recheck = page->recheck;
 	}

 	output->blockno = page->blockno;
 	output->ntuples = ntuples;

 	return true;
 }

 /*
  *	tbm_advance_schunkbit - Advance the schunkbit
  */
 static inline void
 tbm_advance_schunkbit(PagetableEntry *chunk, int *schunkbitp)
 {
 	int			schunkbit = *schunkbitp;

 	while (schunkbit < PAGES_PER_CHUNK)
 	{
 		int			wordnum = WORDNUM(schunkbit);
 		int			bitnum = BITNUM(schunkbit);

 		if ((chunk->words[wordnum] & ((tbm_bitmapword) 1 << bitnum)) != 0)
 			break;
 		schunkbit++;
 	}

 	*schunkbitp = schunkbit;
 }

 /*
  * tbm_iterate - scan through next page of a TIDBitmap
  *
  * Gets a TBMIterateResult representing one page, or NULL if there are
  * no more pages to scan.  Pages are guaranteed to be delivered in numerical
  * order.  If result->ntuples < 0, then the bitmap is "lossy" and failed to
  * remember the exact tuples to look at on this page --- the caller must
  * examine all tuples on the page and check if they meet the intended
  * condition.
  */
 TBMIterateResult *
 tbm_iterate(TBMIterator *iterator)
 {
 	PagetableEntry *e;
 	bool		more;
 	TBMIterateResult *output = &(iterator->output);

 	e = tbm_next_page(iterator, &more);
 	if (more && e)
 	{
 		tbm_iterate_page(e, output);
 		return output;
 	}
 	return NULL;
 }

 /*
  * tbm_next_page - actually traverse the TIDBitmap
  *
  * Store the next block of matches in nextpage.
  */

 static PagetableEntry *
 tbm_next_page(TBMIterator *iterator, bool *more)
 {
 	TIDBitmap  *tbm = iterator->tbm;
 	Assert(tbm->iterating == TBM_ITERATING_PRIVATE);

 	*more = true;

 	/*
 	 * If lossy chunk pages remain, make sure we've advanced schunkptr/
 	 * schunkbit to the next set bit.
 	 */
 	while (iterator->schunkptr < tbm->nchunks)
 	{
 		PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
 		int			schunkbit = iterator->schunkbit;

 		tbm_advance_schunkbit(chunk, &schunkbit);
 		if (schunkbit < PAGES_PER_CHUNK)
 		{
 			iterator->schunkbit = schunkbit;
 			break;
 		}
 		/* advance to next chunk */
 		iterator->schunkptr++;
 		iterator->schunkbit = 0;
 	}

 	/*
 	 * If both chunk and per-page data remain, must output the numerically
 	 * earlier page.
 	 */
 	if (iterator->schunkptr < tbm->nchunks)
 	{
 		PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
 		PagetableEntry *nextpage;
 		BlockNumber chunk_blockno;

 		chunk_blockno = chunk->blockno + iterator->schunkbit;
 		if (iterator->spageptr >= tbm->npages ||
 			chunk_blockno < tbm->spages[iterator->spageptr]->blockno)
 		{
 			/* Return a lossy page indicator from the chunk */
 			nextpage = (PagetableEntry *) palloc(sizeof(PagetableEntry));
 			nextpage->ischunk = true;
 			nextpage->blockno = chunk_blockno;
 			iterator->schunkbit++;
 			return nextpage;
 		}
 	}

 	if (iterator->spageptr < tbm->npages)
 	{
 		PagetableEntry *e;

 		/* In TBM_ONE_PAGE state, we don't allocate an spages[] array */
 		if (tbm->status == TBM_ONE_PAGE)
 			e = &tbm->entry1;
 		else
 			e = tbm->spages[iterator->spageptr];

 		iterator->spageptr++;
 		return e;
 	}

 	/* Nothing more in the bitmap */
 	*more = false;
 	return NULL;
 }

 /*
  *	tbm_shared_iterate - scan through next page of a TIDBitmap
  *
  *	As above, but this will iterate using an iterator which is shared
  *	across multiple processes.  We need to acquire the iterator LWLock,
  *	before accessing the shared members.
  */
 TBMIterateResult *
 tbm_shared_iterate(TBMSharedIterator *iterator)
 {
 	TBMIterateResult *output = &iterator->output;
 	TBMSharedIteratorState *istate = iterator->state;
 	PagetableEntry *ptbase = NULL;
 	int		   *idxpages = NULL;
 	int		   *idxchunks = NULL;

 	if (iterator->ptbase != NULL)
 		ptbase = iterator->ptbase->ptentry;
 	if (iterator->ptpages != NULL)
 		idxpages = iterator->ptpages->index;
 	if (iterator->ptchunks != NULL)
 		idxchunks = iterator->ptchunks->index;

 	/* Acquire the LWLock before accessing the shared members */
 	LWLockAcquire(&istate->lock, LW_EXCLUSIVE);

 	/*
 	 * If lossy chunk pages remain, make sure we've advanced schunkptr/
 	 * schunkbit to the next set bit.
 	 */
 	while (istate->schunkptr < istate->nchunks)
 	{
 		PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]];
 		int			schunkbit = istate->schunkbit;

 		tbm_advance_schunkbit(chunk, &schunkbit);
 		if (schunkbit < PAGES_PER_CHUNK)
 		{
 			istate->schunkbit = schunkbit;
 			break;
 		}
 		/* advance to next chunk */
 		istate->schunkptr++;
 		istate->schunkbit = 0;
 	}

 	/*
 	 * If both chunk and per-page data remain, must output the numerically
 	 * earlier page.
 	 */
 	if (istate->schunkptr < istate->nchunks)
 	{
 		PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]];
 		BlockNumber chunk_blockno;

 		chunk_blockno = chunk->blockno + istate->schunkbit;

 		if (istate->spageptr >= istate->npages ||
 			chunk_blockno < ptbase[idxpages[istate->spageptr]].blockno)
 		{
 			/* Return a lossy page indicator from the chunk */
 			output->blockno = chunk_blockno;
 			output->ntuples = -1;
 			output->recheck = true;
 			istate->schunkbit++;

 			LWLockRelease(&istate->lock);
 			return output;
 		}
 	}

 	if (istate->spageptr < istate->npages)
 	{
 		PagetableEntry *page = &ptbase[idxpages[istate->spageptr]];
 		int			ntuples;

 		/* scan bitmap to extract individual offset numbers */
 		ntuples = tbm_extract_page_tuple(page, output);
 		output->blockno = page->blockno;
 		output->ntuples = ntuples;
 		output->recheck = page->recheck;
 		istate->spageptr++;

 		LWLockRelease(&istate->lock);

 		return output;
 	}

 	LWLockRelease(&istate->lock);

 	/* Nothing more in the bitmap */
 	return NULL;
 }

 /*
  * tbm_end_iterate - finish an iteration over a TIDBitmap
  *
  * Currently this is just a pfree, but it might do more someday.  (For
  * instance, it could be useful to count open iterators and allow the
  * bitmap to return to read/write status when there are no more iterators.)
  */
 void
 tbm_end_iterate(TBMIterator *iterator)
 {
 	pfree(iterator);
 }

 /*
  * tbm_stream_end_iterate - finish an iteration over a StreamBitmap
  */
 static void
 tbm_stream_end_iterate(StreamBMIterator *iterator)
 {
 	/* end_iterate() will clean up whatever begin_iterate() set up. */
 	iterator->end_iterate(iterator);
 	pfree(iterator);
 }

 /*
  * tbm_generic_end_iterate - finish an iteration over a TIDBitmap or
  * StreamBitmap
  */
 void
 tbm_generic_end_iterate(GenericBMIterator *iterator)
 {
 	const Node *bm = iterator->bm;

 	switch (bm->type)
 	{
 		case T_TIDBitmap:
 			tbm_end_iterate(iterator->impl.hash);
 			break;

 		case T_StreamBitmap:
 		{
 			tbm_stream_end_iterate(iterator->impl.stream);
 			break;
 		}

 		default:
 			Assert((bm->type == T_TIDBitmap)
 				   || (bm->type == T_StreamBitmap));
 	}
 }

 /*
  * tbm_end_shared_iterate - finish a shared iteration over a TIDBitmap
  *
  * This doesn't free any of the shared state associated with the iterator,
  * just our backend-private state.
  */
 void
 tbm_end_shared_iterate(TBMSharedIterator *iterator)
 {
 	pfree(iterator);
 }

 /*
  * tbm_find_pageentry - find a PagetableEntry for the pageno
  *
  * Returns NULL if there is no non-lossy entry for the pageno.
  */
 static const PagetableEntry *
 tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno)
 {
 	const PagetableEntry *page;

 	if (tbm->nentries == 0)		/* in case pagetable doesn't exist */
 		return NULL;

 	if (tbm->status == TBM_ONE_PAGE)
 	{
 		page = &tbm->entry1;
 		if (page->blockno != pageno)
 			return NULL;
 		Assert(!page->ischunk);
 		return page;
 	}

 	page = pagetable_lookup(tbm->pagetable, pageno);
 	if (page == NULL)
 		return NULL;
 	if (page->ischunk)
 		return NULL;			/* don't want a lossy chunk header */
 	return page;
 }

 /*
  * tbm_get_pageentry - find or create a PagetableEntry for the pageno
  *
  * If new, the entry is marked as an exact (non-chunk) entry.
  *
  * This may cause the table to exceed the desired memory size.  It is
  * up to the caller to call tbm_lossify() at the next safe point if so.
  */
 static PagetableEntry *
 tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno)
 {
 	PagetableEntry *page;
 	bool		found;

 	if (tbm->status == TBM_EMPTY)
 	{
 		/* Use the fixed slot */
 		page = &tbm->entry1;
 		found = false;
 		tbm->status = TBM_ONE_PAGE;
 	}
 	else
 	{
 		if (tbm->status == TBM_ONE_PAGE)
 		{
 			page = &tbm->entry1;
 			if (page->blockno == pageno)
 				return page;
 			/* Time to switch from one page to a hashtable */
 			tbm_create_pagetable(tbm);
 		}

 		/* Look up or create an entry */
 		page = pagetable_insert(tbm->pagetable, pageno, &found);
 	}

 	/* Initialize it if not present before */
 	if (!found)
 	{
 		char		oldstatus = page->status;

 		MemSet(page, 0, sizeof(PagetableEntry));
 		page->status = oldstatus;
 		page->blockno = pageno;
 		/* must count it too */
 		tbm->nentries++;
 		tbm->npages++;
 	}

 	return page;
 }

 /*
  * tbm_page_is_lossy - is the page marked as lossily stored?
  */
 static bool
 tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno)
 {
 	PagetableEntry *page;
 	BlockNumber chunk_pageno;
 	int			bitno;

 	/* we can skip the lookup if there are no lossy chunks */
 	if (tbm->nchunks == 0)
 		return false;
 	Assert(tbm->status == TBM_HASH);

 	bitno = pageno % PAGES_PER_CHUNK;
 	chunk_pageno = pageno - bitno;

 	page = pagetable_lookup(tbm->pagetable, chunk_pageno);

 	if (page != NULL && page->ischunk)
 	{
 		int			wordnum = WORDNUM(bitno);
 		int			bitnum = BITNUM(bitno);

 		if ((page->words[wordnum] & ((tbm_bitmapword) 1 << bitnum)) != 0)
 			return true;
 	}
 	return false;
 }

 /*
  * tbm_mark_page_lossy - mark the page number as lossily stored
  *
  * This may cause the table to exceed the desired memory size.  It is
  * up to the caller to call tbm_lossify() at the next safe point if so.
  */
 static void
 tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
 {
 	PagetableEntry *page;
 	bool		found;
 	BlockNumber chunk_pageno;
 	int			bitno;
 	int			wordnum;
 	int			bitnum;

 	/* We force the bitmap into hashtable mode whenever it's lossy */
 	if (tbm->status != TBM_HASH)
 		tbm_create_pagetable(tbm);

 	bitno = pageno % PAGES_PER_CHUNK;
 	chunk_pageno = pageno - bitno;

 	/*
 	 * Remove any extant non-lossy entry for the page.  If the page is its own
 	 * chunk header, however, we skip this and handle the case below.
 	 */
 	if (bitno != 0)
 	{
 		if (pagetable_delete(tbm->pagetable, pageno))
 		{
 			/* It was present, so adjust counts */
 			tbm->nentries_hwm = Max(tbm->nentries_hwm, tbm->nentries);
 			tbm->nentries--;
 			tbm->npages--;		/* assume it must have been non-lossy */
 		}
 	}

 	/* Look up or create entry for chunk-header page */
 	page = pagetable_insert(tbm->pagetable, chunk_pageno, &found);

 	/* Initialize it if not present before */
 	if (!found)
 	{
 		char		oldstatus = page->status;

 		MemSet(page, 0, sizeof(PagetableEntry));
 		page->status = oldstatus;
 		page->blockno = chunk_pageno;
 		page->ischunk = true;
 		/* must count it too */
 		tbm->nentries++;
 		tbm->nchunks++;
 	}
 	else if (!page->ischunk)
 	{
 		char		oldstatus = page->status;

 		/* chunk header page was formerly non-lossy, make it lossy */
 		MemSet(page, 0, sizeof(PagetableEntry));
 		page->status = oldstatus;
 		page->blockno = chunk_pageno;
 		page->ischunk = true;
 		/* we assume it had some tuple bit(s) set, so mark it lossy */
 		page->words[0] = ((tbm_bitmapword) 1 << 0);
 		/* adjust counts */
 		tbm->nchunks++;
 		tbm->npages--;
 	}

 	/* Now set the original target page's bit */
 	wordnum = WORDNUM(bitno);
 	bitnum = BITNUM(bitno);
 	page->words[wordnum] |= ((tbm_bitmapword) 1 << bitnum);
 }

 /*
  * tbm_lossify - lose some information to get back under the memory limit
  */
 static void
 tbm_lossify(TIDBitmap *tbm)
 {
 	pagetable_iterator i;
 	PagetableEntry *page;

 	/*
 	 * XXX Really stupid implementation: this just lossifies pages in
 	 * essentially random order.  We should be paying some attention to the
 	 * number of bits set in each page, instead.
 	 *
 	 * Since we are called as soon as nentries exceeds maxentries, we should
 	 * push nentries down to significantly less than maxentries, or else we'll
 	 * just end up doing this again very soon.  We shoot for maxentries/2.
 	 */
 	Assert(tbm->iterating == TBM_NOT_ITERATING);
 	Assert(tbm->status == TBM_HASH);

 	pagetable_start_iterate_at(tbm->pagetable, &i, tbm->lossify_start);
 	while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
 	{
 		if (page->ischunk)
 			continue;			/* already a chunk header */

 		/*
 		 * If the page would become a chunk header, we won't save anything by
 		 * converting it to lossy, so skip it.
 		 */
 		if ((page->blockno % PAGES_PER_CHUNK) == 0)
 			continue;

 		/* This does the dirty work ... */
 		tbm_mark_page_lossy(tbm, page->blockno);

 		if (tbm->nentries <= tbm->maxentries / 2)
 		{
 			/*
 			 * We have made enough room. Remember where to start lossifying
 			 * next round, so we evenly iterate over the hashtable.
 			 */
 			tbm->lossify_start = i.cur;
 			break;
 		}

 		/*
 		 * Note: tbm_mark_page_lossy may have inserted a lossy chunk into the
 		 * hashtable and may have deleted the non-lossy chunk.  We can
 		 * continue the same hash table scan, since failure to visit one
 		 * element or visiting the newly inserted element, isn't fatal.
 		 */
 	}

 	/*
 	 * With a big bitmap and small work_mem, it's possible that we cannot get
 	 * under maxentries.  Again, if that happens, we'd end up uselessly
 	 * calling tbm_lossify over and over.  To prevent this from becoming a
 	 * performance sink, force maxentries up to at least double the current
 	 * number of entries.  (In essence, we're admitting inability to fit
 	 * within work_mem when we do this.)  Note that this test will not fire if
 	 * we broke out of the loop early; and if we didn't, the current number of
 	 * entries is simply not reducible any further.
 	 */
 	if (tbm->nentries > tbm->maxentries / 2)
 		tbm->maxentries = Min(tbm->nentries, (INT_MAX - 1) / 2) * 2;
 }

 /*
  * qsort comparator to handle PagetableEntry pointers.
  */
 static int
 tbm_comparator(const void *left, const void *right)
 {
 	BlockNumber l = (*((PagetableEntry *const *) left))->blockno;
 	BlockNumber r = (*((PagetableEntry *const *) right))->blockno;

 	if (l < r)
 		return -1;
 	else if (l > r)
 		return 1;
 	return 0;
 }

 /*
  * As above, but this will get index into PagetableEntry array.  Therefore,
  * it needs to get actual PagetableEntry using the index before comparing the
  * blockno.
  */
 static int
 tbm_shared_comparator(const void *left, const void *right, void *arg)
 {
 	PagetableEntry *base = (PagetableEntry *) arg;
 	PagetableEntry *lpage = &base[*(int *) left];
 	PagetableEntry *rpage = &base[*(int *) right];

 	if (lpage->blockno < rpage->blockno)
 		return -1;
 	else if (lpage->blockno > rpage->blockno)
 		return 1;
 	return 0;
 }

 /*
  *	tbm_attach_shared_iterate
  *
  *	Allocate a backend-private iterator and attach the shared iterator state
  *	to it so that multiple processed can iterate jointly.
  *
  *	We also converts the DSA pointers to local pointers and store them into
  *	our private iterator.
  */
 TBMSharedIterator *
 tbm_attach_shared_iterate(dsa_area *dsa, dsa_pointer dp)
 {
 	TBMSharedIterator *iterator;
 	TBMSharedIteratorState *istate;

 	/*
 	 * Create the TBMSharedIterator struct, with enough trailing space to
 	 * serve the needs of the TBMIterateResult sub-struct.
 	 */
 	iterator = (TBMSharedIterator *) palloc0(sizeof(TBMSharedIterator) +
 											 MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));

 	istate = (TBMSharedIteratorState *) dsa_get_address(dsa, dp);

 	iterator->state = istate;

 	iterator->ptbase = dsa_get_address(dsa, istate->pagetable);

 	if (istate->npages)
 		iterator->ptpages = dsa_get_address(dsa, istate->spages);
 	if (istate->nchunks)
 		iterator->ptchunks = dsa_get_address(dsa, istate->schunks);

 	return iterator;
 }

 /*
  * pagetable_allocate
  *
  * Callback function for allocating the memory for hashtable elements.
  * Allocate memory for hashtable elements, using DSA if available.
  */
 static inline void *
 pagetable_allocate(pagetable_hash *pagetable, Size size)
 {
 	TIDBitmap  *tbm = (TIDBitmap *) pagetable->private_data;
 	PTEntryArray *ptbase;

 	if (tbm->dsa == NULL)
 		return MemoryContextAllocExtended(pagetable->ctx, size,
 										  MCXT_ALLOC_HUGE | MCXT_ALLOC_ZERO);

 	/*
 	 * Save the dsapagetable reference in dsapagetableold before allocating
 	 * new memory so that pagetable_free can free the old entry.
 	 */
 	tbm->dsapagetableold = tbm->dsapagetable;
 	tbm->dsapagetable = dsa_allocate_extended(tbm->dsa,
 											  sizeof(PTEntryArray) + size,
 											  DSA_ALLOC_HUGE | DSA_ALLOC_ZERO);
 	ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);

 	return ptbase->ptentry;
 }

 /*
  * pagetable_free
  *
  * Callback function for freeing hash table elements.
  */
 static inline void
 pagetable_free(pagetable_hash *pagetable, void *pointer)
 {
 	TIDBitmap  *tbm = (TIDBitmap *) pagetable->private_data;

 	/* pfree the input pointer if DSA is not available */
 	if (tbm->dsa == NULL)
 		pfree(pointer);
 	else if (DsaPointerIsValid(tbm->dsapagetableold))
 	{
 		dsa_free(tbm->dsa, tbm->dsapagetableold);
 		tbm->dsapagetableold = InvalidDsaPointer;
 	}
 }

 /*
  * tbm_calculate_entries
  *
  * Estimate number of hashtable entries we can have within maxbytes.
  */
 long
 tbm_calculate_entries(double maxbytes)
 {
 	long		nbuckets;

 	/*
 	 * Estimate number of hashtable entries we can have within maxbytes. This
 	 * estimates the hash cost as sizeof(PagetableEntry), which is good enough
 	 * for our purpose.  Also count an extra Pointer per entry for the arrays
 	 * created during iteration readout.
 	 */
 	nbuckets = maxbytes /
 		(sizeof(PagetableEntry) + sizeof(Pointer) + sizeof(Pointer));
 	nbuckets = Min(nbuckets, INT_MAX - 1);	/* safety limit */
 	nbuckets = Max(nbuckets, 16);	/* sanity limit */

 	return nbuckets;
 }


 /*
  * functions related to streaming
  */

 static void
 opstream_free(StreamNode *self)
 {
 	ListCell   *cell;
 	List	   *input = self->opaque;

 	foreach(cell, input)
 	{
 		StreamNode *inp = (StreamNode *) lfirst(cell);

 		if (inp->free)
 			inp->free(inp);
 	}
 	list_free(input);
 	pfree(self);
 }

 static void
 opstream_set_instrument(StreamNode *self, struct Instrumentation *instr)
 {
 	ListCell   *cell;
 	List	   *input = self->opaque;

 	foreach(cell, input)
 	{
 		StreamNode *inp = (StreamNode *) lfirst(cell);

 		if (inp->set_instrument)
 			inp->set_instrument(inp, instr);
 	}
 }

 static void
 opstream_upd_instrument(StreamNode *self)
 {
 	ListCell   *cell;
 	List	   *input = self->opaque;

 	foreach(cell, input)
 	{
 		StreamNode *inp = (StreamNode *) lfirst(cell);

 		if (inp->upd_instrument)
 			inp->upd_instrument(inp);
 	}
 }

 static OpStream *
 make_opstream(StreamType kind, StreamNode *n1, StreamNode *n2)
 {
 	OpStream   *op;

 	Assert(kind == BMS_OR || kind == BMS_AND);
 	Assert(PointerIsValid(n1));

 	op = (OpStream *) palloc0(sizeof(OpStream));
 	op->type = kind;
 	op->opaque = list_make2(n1, n2);
 	op->begin_iterate = opstream_begin_iterate;
 	op->free = opstream_free;
 	op->set_instrument = opstream_set_instrument;
 	op->upd_instrument = opstream_upd_instrument;
 	return op;
 }

 /*
  * stream_move_node - move a streamnode from StreamBitMap (source)'s streamnode
  * to given StreamBitMap(destination). Also transfer the ownership of source streamnode by
  * resetting to NULL.
  */
 void
 stream_move_node(StreamBitmap *destination, StreamBitmap *source, StreamType kind)
 {
 	Assert(NULL != destination);
 	Assert(NULL != source);
 	stream_add_node(destination,
 			source->streamNode, kind);
 	/* destination owns the streamNode and hence resetting it to NULL for source->streamNode. */
 	source->streamNode = NULL;
 }


 /*
  * stream_add_node() - add a new node to a bitmap stream
  * node is a base node -- i.e., an index/external
  * kind is one of BMS_INDEX, BMS_OR or BMS_AND
  */

 void
 stream_add_node(StreamBitmap *sbm, StreamNode *node, StreamType kind)
 {
 	/* CDB: Tell node where to put its statistics for EXPLAIN ANALYZE. */
 	if (node->set_instrument)
 		node->set_instrument(node, sbm->instrument);

 	/* initialised */
 	if (sbm->streamNode)
 	{
 		StreamNode *n = sbm->streamNode;

 		/* StreamNode is already an index, transform to OpStream */
 		if ((n->type == BMS_AND && kind == BMS_AND) ||
 			(n->type == BMS_OR && kind == BMS_OR))
 		{
 			/* n->opaque is our list of inputs; append to it */
 			n->opaque = lappend(n->opaque, node);
 		}
 		else if ((n->type == BMS_AND && kind != BMS_AND) ||
 				 (n->type == BMS_OR && kind != BMS_OR) ||
 				 (n->type == BMS_INDEX))
 		{
 			sbm->streamNode = make_opstream(kind, sbm->streamNode, node);
 		}
 		else
 			elog(ERROR, "unknown stream type %i", (int) n->type);
 	}
 	else
 	{
 		if (kind == BMS_INDEX)
 			sbm->streamNode = node;
 		else
 			sbm->streamNode = make_opstream(kind, node, NULL);
 	}
 }

 /*
  * tbm_create_stream_node() - turn a TIDBitmap into a stream
  */

 StreamNode *
 tbm_create_stream_node(TIDBitmap *tbm)
 {
 	IndexStream *is;

 	is = (IndexStream *) palloc0(sizeof(IndexStream));

 	is->type = BMS_INDEX;
 	is->opaque = tbm;
 	is->begin_iterate = index_stream_begin_iterate;
 	is->free = tbm_stream_free;
 	is->set_instrument = tbm_stream_set_instrument;
 	is->upd_instrument = tbm_stream_upd_instrument;

 	return is;
 }

 /*
  * IndexStream iteration callbacks
  */

 static void
 index_stream_begin_iterate(StreamNode *self, StreamBMIterator *iterator)
 {
 	TIDBitmap *tbm = self->opaque;

 	iterator->pull = tbm_stream_block;
 	iterator->end_iterate = index_stream_end_iterate;

 	/* Begin iterating on the underlying TIDBitmap. */
 	iterator->input.hash = tbm_begin_iterate(tbm);
 }

 static void
 index_stream_end_iterate(StreamBMIterator *self)
 {
 	tbm_end_iterate(self->input.hash);
 }

 /*
  * tbm_stream_block() - Fetch the next block from TIDBitmap stream
  *
  * Notice that the IndexStream passed in as opaque will tell us the
  * desired block to stream. If the block requrested is greater than or equal
  * to the block we've cached inside the iterator, return that.
  */

 static bool
 tbm_stream_block(StreamBMIterator *iterator, PagetableEntry *e)
 {
 	TBMIterator *hashIterator = iterator->input.hash;
 	PagetableEntry *next = iterator->nextentry;
 	bool		more;

 	Assert(iterator->node->type == BMS_INDEX);

 	/* have we already got an entry? */
 	if (next && iterator->nextblock <= next->blockno)
 	{
 		memcpy(e, next, sizeof(PagetableEntry));
 		return true;
 	}

 	/* we need a new entry */
 	iterator->nextentry = tbm_next_page(hashIterator, &more);
 	if (more)
 	{
 		Assert(iterator->nextentry);
 		memcpy(e, iterator->nextentry, sizeof(PagetableEntry));
 	}
 	iterator->nextblock++;
 	return more;
 }

 static void
 tbm_stream_free(StreamNode *self)
 {
 	TIDBitmap *tbm = self->opaque;
 	Assert(self->type == BMS_INDEX);
 	tbm_free(tbm);
 	pfree(self);
 }

 static void
 tbm_stream_set_instrument(StreamNode *self, struct Instrumentation *instr)
 {
 	TIDBitmap *tbm = self->opaque;
 	Assert(self->type == BMS_INDEX);
 	tbm_set_instrument(tbm, instr);
 }

 static void
 tbm_stream_upd_instrument(StreamNode *self)
 {
 	TIDBitmap *tbm = self->opaque;
 	Assert(self->type == BMS_INDEX);
 	tbm_upd_instrument(tbm);
 }

 /*
  * OpStream iteration callbacks
  */

 static void
 opstream_begin_iterate(StreamNode *self, StreamBMIterator *iterator)
 {
 	List	 *input = self->opaque;
 	ListCell *cell;

 	iterator->pull = opstream_iterate;
 	iterator->end_iterate = opstream_end_iterate;

 	/* Recursively initialize an iterator for each StreamNode. */
 	foreach(cell, input)
 	{
 		StreamNode 		 *inNode = lfirst(cell);
 		StreamBMIterator *inIter = tbm_stream_begin_iterate(inNode);

 		iterator->input.stream = lappend(iterator->input.stream, inIter);
 	}
 }

 static void
 opstream_end_iterate(StreamBMIterator *self)
 {
 	ListCell *cell;

 	/* Recursively free all iterators in the stream "tree". */
 	foreach(cell, self->input.stream)
 	{
 		StreamBMIterator *inIter = lfirst(cell);
 		tbm_stream_end_iterate(inIter);
 	}
 	list_free(self->input.stream);
 }

 /*
  * opstream_iterate()
  *
  * This is an iterator for OpStreams. The function doesn't
  * know anything about the streams it is actually iterating.
  *
  * Returns false when no more results can be obtained, otherwise true.
  */
 static bool
 opstream_iterate(StreamBMIterator *iterator, PagetableEntry *e)
 {
 	const StreamNode   *n = iterator->node;
 	bool				res = false;

 	/*
 	 * There are two ways we can do this: either, we could maintain our
 	 * own top level BatchWords structure and pull blocks out of that OR
 	 * we could maintain batch words for each sub map and union/intersect
 	 * those together to get the resulting page entries.
 	 *
 	 * Now, BatchWords are specific to bitmap indexes so we'd have to
 	 * translate TIDBitmaps. All the infrastructure is available to
 	 * translate bitmap indexes into the TIDBitmap mechanism so we'll do
 	 * that for now.
 	 */
 	ListCell   *map;
 	BlockNumber minblockno;
 	ListCell   *cell;
 	int			wordnum;
 	List	   *matches;
 	bool		empty;

 	Assert(n->type == BMS_OR || n->type == BMS_AND);

 	/*
 	 * First, iterate through each input bitmap stream and save the block
 	 * which is returned. TIDBitmaps are designed such that they do not
 	 * return blocks with no matches -- that is, say a TIDBitmap has
 	 * matches for block 1, 4 and 5 it store matches only for those
 	 * blocks. Therefore, we may have one stream return a match for block
 	 * 10, another for block 15 and another yet for block 10 again. In
 	 * this case, we cannot include block 15 in the union/intersection
 	 * because it represents matches on some page later in the scan. We'll
 	 * get around to it in good time.
 	 *
 	 * In this case, if we're doing a union, we perform the operation
 	 * without reference to block 15. If we're performing an intersection
 	 * we cannot perform it on block 10 because we didn't get any matches
 	 * for block 10 for one of the streams: the intersection with fail.
 	 * So, we set the desired block (op->nextblock) to block 15 and loop
 	 * around to the `restart' label.
 	 */
 restart:
 	e->blockno = InvalidBlockNumber;
 	empty = false;
 	matches = NIL;
 	minblockno = InvalidBlockNumber;
 	Assert(PointerIsValid(iterator->input.stream));
 	foreach(map, iterator->input.stream)
 	{
 		StreamBMIterator *inIter = lfirst(map);
 		PagetableEntry *new;
 		bool		r;

 		new = (PagetableEntry *) palloc0(sizeof(PagetableEntry));

 		/* set the desired block */
 		inIter->nextblock = iterator->nextblock;
 		r = inIter->pull(inIter, new);

 		/*
 		 * Let to caller know we got a result from some input bitmap. This
 		 * doesn't hold true if we're doing an intersection, and that is
 		 * handled below
 		 */
 		res = res || r;

 		/* only include a match if the pull function tells us to */
 		if (r)
 		{
 			if (minblockno == InvalidBlockNumber)
 				minblockno = new->blockno;
 			else if (n->type == BMS_OR)
 				minblockno = Min(minblockno, new->blockno);
 			else
 				minblockno = Max(minblockno, new->blockno);
 			matches = lappend(matches, (void *) new);
 		}
 		else
 		{
 			pfree(new);

 			if (n->type == BMS_AND)
 			{
 				/*
 				 * No more results for this stream and since we're doing
 				 * an intersection we wont get any valid results from now
 				 * on, so tell our caller that
 				 */
 				iterator->nextblock = minblockno + 1;	/* seems safe */
 				return false;
 			}
 			else if (n->type == BMS_OR)
 				continue;
 		}
 	}

 	/*
 	 * Now we iterate through the actual matches and perform the desired
 	 * operation on those from the same minimum block
 	 */
 	foreach(cell, matches)
 	{
 		PagetableEntry *tmp = (PagetableEntry *) lfirst(cell);

 		if (tmp->blockno == minblockno)
 		{
 			if (e->blockno == InvalidBlockNumber)
 			{
 				memcpy(e, tmp, sizeof(PagetableEntry));
 				continue;
 			}

 			/* already initialised, so OR/AND together */
 			if (tmp->ischunk == true)
 			{
 				/*
 				 * Okay, new entry is lossy so match our output as lossy
 				 */
 				e->ischunk = true;
 				/* XXX: we can just return now... I think :) */
 				iterator->nextblock = minblockno + 1;
 				list_free_deep(matches);
 				return res;
 			}

 			/* union/intersect existing output and new matches */
 			for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
 			{
 				if (n->type == BMS_OR)
 					e->words[wordnum] |= tmp->words[wordnum];
 				else
 					e->words[wordnum] &= tmp->words[wordnum];
 			}
 			e->recheck |= tmp->recheck;
 		}
 		else if (n->type == BMS_AND)
 		{
 			/*
 			 * One of our input maps didn't return a block for the desired
 			 * block number so, we loop around again.
 			 *
 			 * Notice that we don't set the next block as minblockno + 1.
 			 * We don't know if the other streams will find a match for
 			 * minblockno, so we cannot skip past it yet.
 			 */

 			iterator->nextblock = minblockno;
 			empty = true;
 			break;
 		}
 	}
 	if (empty)
 	{
 		/* start again */
 		empty = false;
 		MemSet(e->words, 0, sizeof(tbm_bitmapword) * WORDS_PER_PAGE);
 		list_free_deep(matches);
 		goto restart;
 	}
 	else
 		list_free_deep(matches);
 	if (res)
 		iterator->nextblock = minblockno + 1;

 	return res;
 }


 /*
  * --------- These functions accept either TIDBitmap or StreamBitmap ---------
  */


 /*
  * tbm_generic_free - free a TIDBitmap or StreamBitmap
  */
 void
 tbm_generic_free(Node *bm)
 {
 	if (bm == NULL)
 		return;

 	switch (bm->type)
 	{
 		case T_TIDBitmap:
 			tbm_free((TIDBitmap *) bm);
 			break;
 		case T_StreamBitmap:
 			{
 				StreamBitmap *sbm = (StreamBitmap *) bm;
 				StreamNode *sn = sbm->streamNode;

 				sbm->streamNode = NULL;
 				if (sn && sn->free)
 					sn->free(sn);

 				pfree(sbm);

 				break;
 			}
 		default:
 			Assert(0);
 	}
 }	/* tbm_generic_free */


 /*
  * tbm_generic_set_instrument - attach caller's Instrumentation object to bitmap
  */
 void
 tbm_generic_set_instrument(Node *bm, struct Instrumentation *instr)
 {
 	if (bm == NULL)
 		return;

 	switch (bm->type)
 	{
 		case T_TIDBitmap:
 			tbm_set_instrument((TIDBitmap *) bm, instr);
 			break;
 		case T_StreamBitmap:
 			{
 				StreamBitmap *sbm = (StreamBitmap *) bm;

 				if (sbm->streamNode &&
 					sbm->streamNode->set_instrument)
 					sbm->streamNode->set_instrument(sbm->streamNode, instr);
 				break;
 			}
 		default:
 			Assert(0);
 	}
 }	/* tbm_generic_set_instrument */


 /*
  * tbm_generic_upd_instrument - update stats in caller's Instrumentation object
  *
  * Some callers don't bother to tbm_free() their bitmaps, but let the storage
  * be reclaimed when the MemoryContext is reset.  Such callers should use this
  * function to make sure the statistics are transferred to the Instrumentation
  * object before the bitmap goes away.
  */
 void
 tbm_generic_upd_instrument(Node *bm)
 {
 	if (bm == NULL)
 		return;

 	switch (bm->type)
 	{
 		case T_TIDBitmap:
 			tbm_upd_instrument((TIDBitmap *) bm);
 			break;
 		case T_StreamBitmap:
 			{
 				StreamBitmap *sbm = (StreamBitmap *) bm;

 				if (sbm->streamNode &&
 					sbm->streamNode->upd_instrument)
 					sbm->streamNode->upd_instrument(sbm->streamNode);
 				break;
 			}
 		default:
 			Assert(0);
 	}
 }	/* tbm_generic_upd_instrument */

 void
 tbm_convert_appendonly_tid_out(ItemPointer psudeoHeapTid, AOTupleId *aoTid)
 {
 	/* UNDONE: For now, just copy. */
 	memcpy(aoTid, psudeoHeapTid, sizeof(ItemPointerData));
 }