| /*------------------------------------------------------------------------- |
| * |
| * gistbuild.c |
| * implementation of the build algorithms for GiST indexes. |
| * |
| * There are two different strategies: |
| * |
| * 1. Sort all input tuples, pack them into GiST leaf pages in the sorted |
| * order, and create downlinks and internal pages as we go. This builds |
| * the index from the bottom up, similar to how B-tree index build |
| * works. |
| * |
| * 2. Start with an empty index, and insert all tuples one by one. |
| * |
| * The sorted method is used if the operator classes for all columns have |
| * a 'sortsupport' defined. Otherwise, we resort to the second strategy. |
| * |
| * The second strategy can optionally use buffers at different levels of |
| * the tree to reduce I/O, see "Buffering build algorithm" in the README |
| * for a more detailed explanation. It initially calls insert over and |
| * over, but switches to the buffered algorithm after a certain number of |
| * tuples (unless buffering mode is disabled). |
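| * |
| * For example (hypothetical table and column names, but the 'buffering' |
| * reloption itself is real GiST syntax), a buffered build can be forced |
| * with: |
| * |
| * CREATE INDEX ON points_tbl USING gist (p) WITH (buffering = on); |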
| * |
| * |
| * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group |
| * Portions Copyright (c) 1994, Regents of the University of California |
| * |
| * IDENTIFICATION |
| * src/backend/access/gist/gistbuild.c |
| * |
| *------------------------------------------------------------------------- |
| */ |
| #include "postgres.h" |
| |
| #include <math.h> |
| |
| #include "access/genam.h" |
| #include "access/gist_private.h" |
| #include "access/gistxlog.h" |
| #include "access/tableam.h" |
| #include "access/xloginsert.h" |
| #include "catalog/index.h" |
| #include "miscadmin.h" |
| #include "optimizer/optimizer.h" |
| #include "storage/bufmgr.h" |
| #include "storage/smgr.h" |
| #include "utils/memutils.h" |
| #include "utils/rel.h" |
| #include "utils/tuplesort.h" |
| |
| extern XLogRecPtr LSNForEncryption(bool use_wal_lsn); |
| |
| /* Interval, in index tuples, between checks whether to switch to buffering build mode */ |
| #define BUFFERING_MODE_SWITCH_CHECK_STEP 256 |
| |
| /* |
| * Number of tuples to process in the slow way before switching to buffering |
| * mode, when buffering is explicitly turned on. Also, the number of tuples |
| * to process between readjusting the buffer size parameter, while in |
| * buffering mode. |
| */ |
| #define BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET 4096 |
| |
| /* |
| * Strategy used to build the index. It can change between the |
| * GIST_BUFFERING_* modes on the fly, but if the Sorted method is used, |
| * that needs to be decided up-front and cannot be changed afterwards. |
| */ |
| typedef enum |
| { |
| GIST_SORTED_BUILD, /* bottom-up build by sorting */ |
| GIST_BUFFERING_DISABLED, /* in regular build mode and aren't going to |
| * switch */ |
| GIST_BUFFERING_AUTO, /* in regular build mode, but will switch to |
| * buffering build mode if the index grows too |
| * big */ |
| GIST_BUFFERING_STATS, /* gathering statistics of index tuple size |
| * before switching to the buffering build |
| * mode */ |
| GIST_BUFFERING_ACTIVE /* in buffering build mode */ |
| } GistBuildMode; |
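| |
| /* |
| * A sketch of the possible transitions, as implemented by gistBuildCallback() |
| * and gistInitBuffering() below: GIST_BUFFERING_AUTO switches once the index |
| * outgrows effective_cache_size, and GIST_BUFFERING_STATS switches once |
| * enough tuples have been seen; in both cases gistInitBuffering() lands in |
| * GIST_BUFFERING_ACTIVE on success, or in GIST_BUFFERING_DISABLED if there |
| * is not enough memory for buffering. GIST_SORTED_BUILD is decided up-front |
| * and never changes. |
| */ |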
| |
| /* Working state for gistbuild and its callback */ |
| typedef struct |
| { |
| Relation indexrel; |
| Relation heaprel; |
| GISTSTATE *giststate; |
| |
| Size freespace; /* amount of free space to leave on pages */ |
| |
| GistBuildMode buildMode; |
| |
| int64 indtuples; /* number of tuples indexed */ |
| |
| /* |
| * Extra data structures used during a buffering build. 'gfbb' contains |
| * information related to managing the build buffers. 'parentMap' is a |
| * lookup table of the parent of each internal page. |
| */ |
| int64 indtuplesSize; /* total size of all indexed tuples */ |
| GISTBuildBuffers *gfbb; |
| HTAB *parentMap; |
| |
| /* |
| * Extra data structures used during a sorting build. |
| */ |
| Tuplesortstate *sortstate; /* state data for tuplesort.c */ |
| |
| BlockNumber pages_allocated; |
| BlockNumber pages_written; |
| |
| int ready_num_pages; |
| BlockNumber ready_blknos[XLR_MAX_BLOCK_ID]; |
| Page ready_pages[XLR_MAX_BLOCK_ID]; |
| } GISTBuildState; |
| |
| #define GIST_SORTED_BUILD_PAGE_NUM 4 |
| |
| /* |
| * In a sorted build, we use a stack of these structs, one for each level, |
| * to hold an in-memory buffer of the last pages at that level. |
| * |
| * A sorted GiST build requires the sort opclass to provide a good |
| * linearization of the data, which is not always possible for |
| * multidimensional data. To mitigate the resulting anomalies, we buffer |
| * index tuples and apply a picksplit function that can be |
| * multidimension-aware. |
| */ |
| typedef struct GistSortedBuildLevelState |
| { |
| int current_page; |
| BlockNumber last_blkno; |
| struct GistSortedBuildLevelState *parent; /* Upper level, if any */ |
| Page pages[GIST_SORTED_BUILD_PAGE_NUM]; |
| } GistSortedBuildLevelState; |
| |
| /* prototypes for private functions */ |
| |
| static void gistSortedBuildCallback(Relation index, ItemPointer tid, |
| Datum *values, bool *isnull, |
| bool tupleIsAlive, void *state); |
| static void gist_indexsortbuild(GISTBuildState *state); |
| static void gist_indexsortbuild_levelstate_add(GISTBuildState *state, |
| GistSortedBuildLevelState *levelstate, |
| IndexTuple itup); |
| static void gist_indexsortbuild_levelstate_flush(GISTBuildState *state, |
| GistSortedBuildLevelState *levelstate); |
| static void gist_indexsortbuild_flush_ready_pages(GISTBuildState *state); |
| |
| static void gistInitBuffering(GISTBuildState *buildstate); |
| static int calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep); |
| static void gistBuildCallback(Relation index, |
| ItemPointer tid, |
| Datum *values, |
| bool *isnull, |
| bool tupleIsAlive, |
| void *state); |
| static void gistBufferingBuildInsert(GISTBuildState *buildstate, |
| IndexTuple itup); |
| static bool gistProcessItup(GISTBuildState *buildstate, IndexTuple itup, |
| BlockNumber startblkno, int startlevel); |
| static BlockNumber gistbufferinginserttuples(GISTBuildState *buildstate, |
| Buffer buffer, int level, |
| IndexTuple *itup, int ntup, OffsetNumber oldoffnum, |
| BlockNumber parentblk, OffsetNumber downlinkoffnum); |
| static Buffer gistBufferingFindCorrectParent(GISTBuildState *buildstate, |
| BlockNumber childblkno, int level, |
| BlockNumber *parentblkno, |
| OffsetNumber *downlinkoffnum); |
| static void gistProcessEmptyingQueue(GISTBuildState *buildstate); |
| static void gistEmptyAllBuffers(GISTBuildState *buildstate); |
| static int gistGetMaxLevel(Relation index); |
| |
| static void gistInitParentMap(GISTBuildState *buildstate); |
| static void gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child, |
| BlockNumber parent); |
| static void gistMemorizeAllDownlinks(GISTBuildState *buildstate, |
| Buffer parentbuf); |
| static BlockNumber gistGetParent(GISTBuildState *buildstate, BlockNumber child); |
| |
| |
| /* |
| * Main entry point to GiST index build. |
| */ |
| IndexBuildResult * |
| gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) |
| { |
| IndexBuildResult *result; |
| double reltuples; |
| GISTBuildState buildstate; |
| MemoryContext oldcxt = CurrentMemoryContext; |
| int fillfactor; |
| Oid SortSupportFnOids[INDEX_MAX_KEYS]; |
| GiSTOptions *options = (GiSTOptions *) index->rd_options; |
| |
| /* |
| * We expect to be called exactly once for any index relation. If that's |
| * not the case, big trouble's what we have. |
| */ |
| if (RelationGetNumberOfBlocks(index) != 0) |
| elog(ERROR, "index \"%s\" already contains data", |
| RelationGetRelationName(index)); |
| |
| buildstate.indexrel = index; |
| buildstate.heaprel = heap; |
| buildstate.sortstate = NULL; |
| buildstate.giststate = initGISTstate(index); |
| |
| /* |
| * Create a temporary memory context that is reset once for each tuple |
| * processed. (Note: we don't bother to make this a child of the |
| * giststate's scanCxt, so we have to delete it separately at the end.) |
| */ |
| buildstate.giststate->tempCxt = createTempGistContext(); |
| |
| /* |
| * Choose build strategy. First check whether the user specified to use |
| * buffering mode. (The use case for that in the field is perhaps |
| * questionable, but it's important for testing purposes.) |
| */ |
| if (options) |
| { |
| if (options->buffering_mode == GIST_OPTION_BUFFERING_ON) |
| buildstate.buildMode = GIST_BUFFERING_STATS; |
| else if (options->buffering_mode == GIST_OPTION_BUFFERING_OFF) |
| buildstate.buildMode = GIST_BUFFERING_DISABLED; |
| else /* must be "auto" */ |
| buildstate.buildMode = GIST_BUFFERING_AUTO; |
| } |
| else |
| { |
| buildstate.buildMode = GIST_BUFFERING_AUTO; |
| } |
| |
| /* |
| * Unless buffering mode was forced, see if we can use sorting instead. |
| */ |
| if (buildstate.buildMode != GIST_BUFFERING_STATS) |
| { |
| bool hasallsortsupports = true; |
| int keyscount = IndexRelationGetNumberOfKeyAttributes(index); |
| |
| for (int i = 0; i < keyscount; i++) |
| { |
| SortSupportFnOids[i] = index_getprocid(index, i + 1, |
| GIST_SORTSUPPORT_PROC); |
| if (!OidIsValid(SortSupportFnOids[i])) |
| { |
| hasallsortsupports = false; |
| break; |
| } |
| } |
| if (hasallsortsupports) |
| buildstate.buildMode = GIST_SORTED_BUILD; |
| } |
| |
| /* |
| * Calculate target amount of free space to leave on pages. |
| */ |
| fillfactor = options ? options->fillfactor : GIST_DEFAULT_FILLFACTOR; |
| buildstate.freespace = BLCKSZ * (100 - fillfactor) / 100; |
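| |
| /* |
| * For example, with the default 8 kB block size and the default GiST |
| * fillfactor of 90, this leaves 8192 * 10 / 100 = 819 bytes unused on |
| * each page. |
| */ |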
| |
| /* |
| * Build the index using the chosen strategy. |
| */ |
| buildstate.indtuples = 0; |
| buildstate.indtuplesSize = 0; |
| |
| if (buildstate.buildMode == GIST_SORTED_BUILD) |
| { |
| /* |
| * Sort all data, build the index from bottom up. |
| */ |
| buildstate.sortstate = tuplesort_begin_index_gist(heap, |
| index, |
| maintenance_work_mem, |
| NULL, |
| TUPLESORT_NONE); |
| |
| /* Scan the table, adding all tuples to the tuplesort */ |
| reltuples = table_index_build_scan(heap, index, indexInfo, true, true, |
| gistSortedBuildCallback, |
| (void *) &buildstate, NULL); |
| |
| /* |
| * Perform the sort and build index pages. |
| */ |
| tuplesort_performsort(buildstate.sortstate); |
| |
| gist_indexsortbuild(&buildstate); |
| |
| tuplesort_end(buildstate.sortstate); |
| } |
| else |
| { |
| /* |
| * Initialize an empty index and insert all tuples, possibly using |
| * buffers on intermediate levels. |
| */ |
| Buffer buffer; |
| Page page; |
| |
| /* initialize the root page */ |
| buffer = gistNewBuffer(index, heap); |
| Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO); |
| page = BufferGetPage(buffer); |
| |
| START_CRIT_SECTION(); |
| |
| GISTInitBuffer(buffer, F_LEAF); |
| |
| MarkBufferDirty(buffer); |
| PageSetLSN(page, GistBuildLSN); |
| |
| UnlockReleaseBuffer(buffer); |
| |
| END_CRIT_SECTION(); |
| |
| /* Scan the table, inserting all the tuples into the index. */ |
| reltuples = table_index_build_scan(heap, index, indexInfo, true, true, |
| gistBuildCallback, |
| (void *) &buildstate, NULL); |
| |
| /* |
| * If buffering was used, flush out all the tuples that are still in |
| * the buffers. |
| */ |
| if (buildstate.buildMode == GIST_BUFFERING_ACTIVE) |
| { |
| elog(DEBUG1, "all tuples processed, emptying buffers"); |
| gistEmptyAllBuffers(&buildstate); |
| gistFreeBuildBuffers(buildstate.gfbb); |
| } |
| |
| /* |
| * We didn't write WAL records as we built the index, so if |
| * WAL-logging is required, write all pages to the WAL now. |
| */ |
| if (RelationNeedsWAL(index)) |
| { |
| log_newpage_range(index, MAIN_FORKNUM, |
| 0, RelationGetNumberOfBlocks(index), |
| true); |
| } |
| } |
| |
| /* okay, all heap tuples are indexed */ |
| MemoryContextSwitchTo(oldcxt); |
| MemoryContextDelete(buildstate.giststate->tempCxt); |
| |
| freeGISTstate(buildstate.giststate); |
| |
| /* |
| * Return statistics |
| */ |
| result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); |
| |
| result->heap_tuples = reltuples; |
| result->index_tuples = (double) buildstate.indtuples; |
| |
| return result; |
| } |
| |
| /*------------------------------------------------------------------------- |
| * Routines for sorted build |
| *------------------------------------------------------------------------- |
| */ |
| |
| /* |
| * Per-tuple callback for table_index_build_scan. |
| */ |
| static void |
| gistSortedBuildCallback(Relation index, |
| ItemPointer tid, |
| Datum *values, |
| bool *isnull, |
| bool tupleIsAlive, |
| void *state) |
| { |
| GISTBuildState *buildstate = (GISTBuildState *) state; |
| MemoryContext oldCtx; |
| Datum compressed_values[INDEX_MAX_KEYS]; |
| |
| oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt); |
| |
| /* Form an index tuple and point it at the heap tuple */ |
| gistCompressValues(buildstate->giststate, index, |
| values, isnull, |
| true, compressed_values); |
| |
| tuplesort_putindextuplevalues(buildstate->sortstate, |
| buildstate->indexrel, |
| tid, |
| compressed_values, isnull); |
| |
| MemoryContextSwitchTo(oldCtx); |
| MemoryContextReset(buildstate->giststate->tempCxt); |
| |
| /* Update tuple count. */ |
| buildstate->indtuples += 1; |
| } |
| |
| /* |
| * Build the GiST index bottom-up from pre-sorted tuples. |
| */ |
| static void |
| gist_indexsortbuild(GISTBuildState *state) |
| { |
| IndexTuple itup; |
| GistSortedBuildLevelState *levelstate; |
| Page page; |
| |
| state->pages_allocated = 0; |
| state->pages_written = 0; |
| state->ready_num_pages = 0; |
| |
| /* |
| * Write an empty page as a placeholder for the root page. It will be |
| * replaced with the real root page at the end. |
| */ |
| page = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, MCXT_ALLOC_ZERO); |
| smgrextend(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, GIST_ROOT_BLKNO, |
| page, true); |
| state->pages_allocated++; |
| state->pages_written++; |
| |
| /* Allocate a temporary buffer for the first leaf page batch. */ |
| levelstate = palloc0(sizeof(GistSortedBuildLevelState)); |
| levelstate->pages[0] = page; |
| levelstate->parent = NULL; |
| gistinitpage(page, F_LEAF); |
| |
| /* |
| * Fill index pages with tuples in the sorted order. |
| */ |
| while ((itup = tuplesort_getindextuple(state->sortstate, true)) != NULL) |
| { |
| gist_indexsortbuild_levelstate_add(state, levelstate, itup); |
| MemoryContextReset(state->giststate->tempCxt); |
| } |
| |
| /* |
| * Write out the partially full non-root pages. |
| * |
| * Keep in mind that a flush can build a new root: if a level ends up with |
| * more than one page, a new root page is required above it. |
| */ |
| while (levelstate->parent != NULL || levelstate->current_page != 0) |
| { |
| GistSortedBuildLevelState *parent; |
| |
| gist_indexsortbuild_levelstate_flush(state, levelstate); |
| parent = levelstate->parent; |
| for (int i = 0; i < GIST_SORTED_BUILD_PAGE_NUM; i++) |
| if (levelstate->pages[i]) |
| pfree(levelstate->pages[i]); |
| pfree(levelstate); |
| levelstate = parent; |
| } |
| |
| gist_indexsortbuild_flush_ready_pages(state); |
| |
| /* Write out the root */ |
| PageSetLSN(levelstate->pages[0], GistBuildLSN); |
| PageEncryptInplace(levelstate->pages[0], MAIN_FORKNUM, |
| GIST_ROOT_BLKNO); |
| PageSetChecksumInplace(levelstate->pages[0], GIST_ROOT_BLKNO); |
| smgrwrite(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, GIST_ROOT_BLKNO, |
| levelstate->pages[0], true); |
| if (RelationNeedsWAL(state->indexrel)) |
| log_newpage(&state->indexrel->rd_locator, MAIN_FORKNUM, GIST_ROOT_BLKNO, |
| levelstate->pages[0], true); |
| |
| pfree(levelstate->pages[0]); |
| pfree(levelstate); |
| |
| /* |
| * Even though we WAL-logged the index pages, we must still fsync the index files. |
| * Since we're building outside shared buffers, a CHECKPOINT occurring |
| * during the build has no way to flush the previously written data to |
| * disk (indeed it won't know the index even exists). A crash later on |
| * would replay WAL from the checkpoint, therefore it wouldn't replay our |
| * earlier WAL entries. If we do not fsync those pages here, they might |
| * still not be on disk when the crash occurs. |
| */ |
| if (RelationNeedsWAL(state->indexrel)) |
| smgrimmedsync(RelationGetSmgr(state->indexrel), MAIN_FORKNUM); |
| } |
| |
| /* |
| * Add a tuple to the current page. If the page is full, advance to the next |
| * page in the batch, flushing the whole batch out first if all of its pages |
| * are full. |
| */ |
| static void |
| gist_indexsortbuild_levelstate_add(GISTBuildState *state, |
| GistSortedBuildLevelState *levelstate, |
| IndexTuple itup) |
| { |
| Size sizeNeeded; |
| |
| /* Check if tuple can be added to the current page */ |
| sizeNeeded = IndexTupleSize(itup) + sizeof(ItemIdData); /* fillfactor ignored */ |
| if (PageGetFreeSpace(levelstate->pages[levelstate->current_page]) < sizeNeeded) |
| { |
| Page newPage; |
| Page old_page = levelstate->pages[levelstate->current_page]; |
| uint16 old_page_flags = GistPageGetOpaque(old_page)->flags; |
| |
| if (levelstate->current_page + 1 == GIST_SORTED_BUILD_PAGE_NUM) |
| { |
| gist_indexsortbuild_levelstate_flush(state, levelstate); |
| } |
| else |
| levelstate->current_page++; |
| |
| if (levelstate->pages[levelstate->current_page] == NULL) |
| levelstate->pages[levelstate->current_page] = |
| palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0); |
| |
| newPage = levelstate->pages[levelstate->current_page]; |
| gistinitpage(newPage, old_page_flags); |
| } |
| |
| gistfillbuffer(levelstate->pages[levelstate->current_page], &itup, 1, InvalidOffsetNumber); |
| } |
| |
| static void |
| gist_indexsortbuild_levelstate_flush(GISTBuildState *state, |
| GistSortedBuildLevelState *levelstate) |
| { |
| GistSortedBuildLevelState *parent; |
| BlockNumber blkno; |
| MemoryContext oldCtx; |
| IndexTuple union_tuple; |
| SplitedPageLayout *dist; |
| IndexTuple *itvec; |
| int vect_len; |
| bool isleaf = GistPageIsLeaf(levelstate->pages[0]); |
| |
| CHECK_FOR_INTERRUPTS(); |
| |
| oldCtx = MemoryContextSwitchTo(state->giststate->tempCxt); |
| |
| /* Get index tuples from first page */ |
| itvec = gistextractpage(levelstate->pages[0], &vect_len); |
| if (levelstate->current_page > 0) |
| { |
| /* Append tuples from each page */ |
| for (int i = 1; i < levelstate->current_page + 1; i++) |
| { |
| int len_local; |
| IndexTuple *itvec_local = gistextractpage(levelstate->pages[i], &len_local); |
| |
| itvec = gistjoinvector(itvec, &vect_len, itvec_local, len_local); |
| pfree(itvec_local); |
| } |
| |
| /* Apply picksplit to list of all collected tuples */ |
| dist = gistSplit(state->indexrel, levelstate->pages[0], itvec, vect_len, state->giststate); |
| } |
| else |
| { |
| /* Create a split layout from a single page */ |
| dist = (SplitedPageLayout *) palloc0(sizeof(SplitedPageLayout)); |
| union_tuple = gistunion(state->indexrel, itvec, vect_len, |
| state->giststate); |
| dist->itup = union_tuple; |
| dist->list = gistfillitupvec(itvec, vect_len, &(dist->lenlist)); |
| dist->block.num = vect_len; |
| } |
| |
| MemoryContextSwitchTo(oldCtx); |
| |
| /* Reset page counter */ |
| levelstate->current_page = 0; |
| |
| /* Create pages for all partitions in split result */ |
| for (; dist != NULL; dist = dist->next) |
| { |
| char *data; |
| Page target; |
| |
| /* check once per page */ |
| CHECK_FOR_INTERRUPTS(); |
| |
| /* Create page and copy data */ |
| data = (char *) (dist->list); |
| target = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, MCXT_ALLOC_ZERO); |
| gistinitpage(target, isleaf ? F_LEAF : 0); |
| for (int i = 0; i < dist->block.num; i++) |
| { |
| IndexTuple thistup = (IndexTuple) data; |
| |
| if (PageAddItem(target, (Item) data, IndexTupleSize(thistup), i + FirstOffsetNumber, false, false) == InvalidOffsetNumber) |
| elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(state->indexrel)); |
| |
| data += IndexTupleSize(thistup); |
| } |
| union_tuple = dist->itup; |
| |
| if (state->ready_num_pages == XLR_MAX_BLOCK_ID) |
| gist_indexsortbuild_flush_ready_pages(state); |
| |
| /* |
| * The page is now complete. Assign a block number to it, and add it |
| * to the list of finished pages. (We don't write it out immediately, |
| * because we want to WAL-log the pages in batches.) |
| */ |
| blkno = state->pages_allocated++; |
| state->ready_blknos[state->ready_num_pages] = blkno; |
| state->ready_pages[state->ready_num_pages] = target; |
| state->ready_num_pages++; |
| ItemPointerSetBlockNumber(&(union_tuple->t_tid), blkno); |
| |
| /* |
| * Set the right link to point to the previous page. This is just for |
| * debugging purposes: GiST only follows the right link if a page is |
| * split concurrently to a scan, and that cannot happen during index |
| * build. |
| * |
| * It's a bit counterintuitive that we set the right link on the new |
| * page to point to the previous page, not the other way around. But |
| * GiST pages are not ordered like B-tree pages are, so as long as the |
| * right-links form a chain through all the pages at the same level, |
| * the order doesn't matter. |
| */ |
| if (levelstate->last_blkno) |
| GistPageGetOpaque(target)->rightlink = levelstate->last_blkno; |
| levelstate->last_blkno = blkno; |
| |
| /* |
| * Insert the downlink into the parent page. If this was the root, |
| * create a new page as the parent, which becomes the new root. |
| */ |
| parent = levelstate->parent; |
| if (parent == NULL) |
| { |
| parent = palloc0(sizeof(GistSortedBuildLevelState)); |
| parent->pages[0] = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0); |
| parent->parent = NULL; |
| gistinitpage(parent->pages[0], 0); |
| |
| levelstate->parent = parent; |
| } |
| gist_indexsortbuild_levelstate_add(state, parent, union_tuple); |
| } |
| } |
| |
| static void |
| gist_indexsortbuild_flush_ready_pages(GISTBuildState *state) |
| { |
| if (state->ready_num_pages == 0) |
| return; |
| |
| for (int i = 0; i < state->ready_num_pages; i++) |
| { |
| Page page = state->ready_pages[i]; |
| BlockNumber blkno = state->ready_blknos[i]; |
| |
| /* Currently, the blocks must be buffered in order. */ |
| if (blkno != state->pages_written) |
| elog(ERROR, "unexpected block number to flush GiST sorting build"); |
| |
| PageSetLSN(page, !FileEncryptionEnabled ? GistBuildLSN : |
| LSNForEncryption(RelationIsPermanent(state->indexrel))); |
| /* Make sure the LSN is valid and, if encryption is enabled, not constant. */ |
| Assert(!XLogRecPtrIsInvalid(PageGetLSN(page)) && |
| (!FileEncryptionEnabled || |
| PageGetLSN(page) != GistBuildLSN)); |
| PageEncryptInplace(page, MAIN_FORKNUM, |
| blkno); |
| PageSetChecksumInplace(page, blkno); |
| smgrextend(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, blkno, page, |
| true); |
| |
| state->pages_written++; |
| } |
| |
| if (RelationNeedsWAL(state->indexrel)) |
| log_newpages(&state->indexrel->rd_locator, MAIN_FORKNUM, state->ready_num_pages, |
| state->ready_blknos, state->ready_pages, true); |
| |
| for (int i = 0; i < state->ready_num_pages; i++) |
| pfree(state->ready_pages[i]); |
| |
| state->ready_num_pages = 0; |
| } |
| |
| |
| /*------------------------------------------------------------------------- |
| * Routines for non-sorted build |
| *------------------------------------------------------------------------- |
| */ |
| |
| /* |
| * Attempt to switch to buffering mode. |
| * |
| * If there is not enough memory for a buffering build, sets buildMode to |
| * GIST_BUFFERING_DISABLED, so that we don't bother to attempt the switch |
| * again. Otherwise initializes the build buffers, and sets buildMode to |
| * GIST_BUFFERING_ACTIVE. |
| */ |
| static void |
| gistInitBuffering(GISTBuildState *buildstate) |
| { |
| Relation index = buildstate->indexrel; |
| int pagesPerBuffer; |
| Size pageFreeSpace; |
| Size itupAvgSize, |
| itupMinSize; |
| double avgIndexTuplesPerPage, |
| maxIndexTuplesPerPage; |
| int i; |
| int levelStep; |
| |
| /* Calculate the space on an index page that is available for index tuples */ |
| pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData) |
| - sizeof(ItemIdData) |
| - buildstate->freespace; |
| |
| /* |
| * Calculate average size of already inserted index tuples using gathered |
| * statistics. |
| */ |
| itupAvgSize = (double) buildstate->indtuplesSize / |
| (double) buildstate->indtuples; |
| |
| /* |
| * Calculate the minimal possible size of an index tuple from the index |
| * metadata, taking the minimal possible size of a varlena to be VARHDRSZ. |
| * |
| * XXX: that's not actually true, as a short varlena can be just 2 bytes. |
| * And we should take padding into account here. |
| */ |
| itupMinSize = (Size) MAXALIGN(sizeof(IndexTupleData)); |
| for (i = 0; i < index->rd_att->natts; i++) |
| { |
| if (TupleDescAttr(index->rd_att, i)->attlen < 0) |
| itupMinSize += VARHDRSZ; |
| else |
| itupMinSize += TupleDescAttr(index->rd_att, i)->attlen; |
| } |
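| |
| /* |
| * For example, for a single-column index over a fixed-width 16-byte type, |
| * this yields MAXALIGN(sizeof(IndexTupleData)) + 16 = 8 + 16 = 24 bytes |
| * on typical 64-bit platforms. |
| */ |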
| |
| /* Calculate the average and maximal number of index tuples that fit on a page */ |
| avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize; |
| maxIndexTuplesPerPage = pageFreeSpace / itupMinSize; |
| |
| /* |
| * We need to calculate two parameters for the buffering algorithm: |
| * levelStep and pagesPerBuffer. |
| * |
| * levelStep determines the size of the subtree that we operate on while |
| * emptying a buffer. A higher value is better, as you need fewer buffer |
| * emptying steps to build the index. However, if you set it too high, the |
| * subtree doesn't fit in cache anymore, and you quickly lose the benefit |
| * of the buffers. |
| * |
| * In Arge et al's paper, levelStep is chosen as logB(M/4B), where B is |
| * the number of tuples per page (i.e. fanout), and M is the amount of |
| * internal memory available. Curiously, they don't explain *why* that |
| * setting is optimal. We calculate it by taking the highest levelStep so |
| * that a subtree still fits in cache. For a small B, our way of |
| * calculating levelStep is very close to Arge et al's formula. For a |
| * large B, our formula gives a value that is 2x higher. |
| * |
| * The average size (in pages) of a subtree of depth n can be calculated |
| * as a geometric series: |
| * |
| * B^0 + B^1 + B^2 + ... + B^n = (1 - B^(n + 1)) / (1 - B) |
| * |
| * where B is the average number of index tuples on page. The subtree is |
| * cached in the shared buffer cache and the OS cache, so we choose |
| * levelStep so that the subtree size is comfortably smaller than |
| * effective_cache_size, with a safety factor of 4. |
| * |
| * The estimate of the average number of index tuples per page is based on |
| * the average tuple sizes observed before switching to the buffered build, |
| * so the real subtree size can be somewhat larger. Also, it would be |
| * selfish to gobble up the whole cache for our index build. The safety |
| * factor of 4 should account for those effects. |
| * |
| * The other limiting factor for setting levelStep is that while |
| * processing a subtree, we need to hold one page for each buffer at the |
| * next lower buffered level. The max. number of buffers needed for that |
| * is maxIndexTuplesPerPage^levelStep. This is very conservative, but |
| * hopefully maintenance_work_mem is set high enough that you're |
| * constrained by effective_cache_size rather than maintenance_work_mem. |
| * |
| * XXX: the buffer hash table consumes a fair amount of memory too per |
| * buffer, but that is not currently taken into account. That scales with |
| * the total number of buffers used, i.e. with the index size and levelStep. |
| * Note that a higher levelStep *reduces* the amount of memory needed for |
| * the hash table. |
| */ |
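| |
| /* |
| * As a worked example (assuming, for simplicity, that the maximal fanout |
| * equals the average fanout of 100 tuples per page): levelStep = 2 implies |
| * subtrees of (1 - 100^3) / (1 - 100) = 10101 pages, or about 79 MB with |
| * 8 kB blocks, so that levelStep is acceptable only if |
| * effective_cache_size / 4 exceeds 10101 pages and maintenance_work_mem |
| * can hold 100^2 = 10000 lowest-level pages. |
| */ |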
| levelStep = 1; |
| for (;;) |
| { |
| double subtreesize; |
| double maxlowestlevelpages; |
| |
| /* size of an average subtree at this levelStep (in pages). */ |
| subtreesize = |
| (1 - pow(avgIndexTuplesPerPage, (double) (levelStep + 1))) / |
| (1 - avgIndexTuplesPerPage); |
| |
| /* max number of pages at the lowest level of a subtree */ |
| maxlowestlevelpages = pow(maxIndexTuplesPerPage, (double) levelStep); |
| |
| /* subtree must fit in cache (with safety factor of 4) */ |
| if (subtreesize > effective_cache_size / 4) |
| break; |
| |
| /* each node in the lowest level of a subtree has one page in memory */ |
| if (maxlowestlevelpages > ((double) maintenance_work_mem * 1024) / BLCKSZ) |
| break; |
| |
| /* Good, we can handle this levelStep. See if we can go one higher. */ |
| levelStep++; |
| } |
| |
| /* |
| * The loop above exited on the first unacceptable value of levelStep, so |
| * back off by one to get the last acceptable value. |
| */ |
| levelStep--; |
| |
| /* |
| * If there's not enough cache or maintenance_work_mem, fall back to plain |
| * inserts. |
| */ |
| if (levelStep <= 0) |
| { |
| elog(DEBUG1, "failed to switch to buffered GiST build"); |
| buildstate->buildMode = GIST_BUFFERING_DISABLED; |
| return; |
| } |
| |
| /* |
| * The second parameter to set is pagesPerBuffer, which determines the |
| * size of each buffer. We adjust pagesPerBuffer also during the build, |
| * which is why this calculation is in a separate function. |
| */ |
| pagesPerBuffer = calculatePagesPerBuffer(buildstate, levelStep); |
| |
| /* Initialize GISTBuildBuffers with these parameters */ |
| buildstate->gfbb = gistInitBuildBuffers(pagesPerBuffer, levelStep, |
| gistGetMaxLevel(index)); |
| |
| gistInitParentMap(buildstate); |
| |
| buildstate->buildMode = GIST_BUFFERING_ACTIVE; |
| |
| elog(DEBUG1, "switched to buffered GiST build; level step = %d, pagesPerBuffer = %d", |
| levelStep, pagesPerBuffer); |
| } |
| |
| /* |
| * Calculate pagesPerBuffer parameter for the buffering algorithm. |
| * |
| * The buffer size is chosen so that, assuming tuples are distributed |
| * randomly, emptying half a buffer fills on average one page in every buffer |
| * at the next lower level. |
| */ |
| static int |
| calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep) |
| { |
| double pagesPerBuffer; |
| double avgIndexTuplesPerPage; |
| double itupAvgSize; |
| Size pageFreeSpace; |
| |
| /* Calculate the space on an index page that is available for index tuples */ |
| pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData) |
| - sizeof(ItemIdData) |
| - buildstate->freespace; |
| |
| /* |
| * Calculate average size of already inserted index tuples using gathered |
| * statistics. |
| */ |
| itupAvgSize = (double) buildstate->indtuplesSize / |
| (double) buildstate->indtuples; |
| |
| avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize; |
| |
| /* |
| * Recalculate the required buffer size. |
| */ |
| pagesPerBuffer = 2 * pow(avgIndexTuplesPerPage, levelStep); |
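| |
| /* |
| * For example, with an average fanout of 100 tuples per page and |
| * levelStep = 1, each buffer is sized at 2 * 100^1 = 200 pages; emptying |
| * half of it spreads roughly 100 pages' worth of tuples over the ~100 |
| * child buffers below, one page each on average. |
| */ |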
| |
| return (int) rint(pagesPerBuffer); |
| } |
| |
| /* |
| * Per-tuple callback for table_index_build_scan. |
| */ |
| static void |
| gistBuildCallback(Relation index, |
| ItemPointer tid, |
| Datum *values, |
| bool *isnull, |
| bool tupleIsAlive, |
| void *state) |
| { |
| GISTBuildState *buildstate = (GISTBuildState *) state; |
| IndexTuple itup; |
| MemoryContext oldCtx; |
| |
| oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt); |
| |
| /* form an index tuple and point it at the heap tuple */ |
| itup = gistFormTuple(buildstate->giststate, index, |
| values, isnull, |
| true); |
| itup->t_tid = *tid; |
| |
| /* Update tuple count and total size. */ |
| buildstate->indtuples += 1; |
| buildstate->indtuplesSize += IndexTupleSize(itup); |
| |
| /* |
| * XXX In buffering builds, the tempCxt is also reset down inside |
| * gistProcessEmptyingQueue(). This is not great because it risks |
| * confusion and possible use of dangling pointers (for example, itup |
| * might be already freed when control returns here). It's generally |
| * better that a memory context be "owned" by only one function. However, |
| * currently this isn't causing issues so it doesn't seem worth the amount |
| * of refactoring that would be needed to avoid it. |
| */ |
| if (buildstate->buildMode == GIST_BUFFERING_ACTIVE) |
| { |
| /* We have buffers, so use them. */ |
| gistBufferingBuildInsert(buildstate, itup); |
| } |
| else |
| { |
| /* |
| * There are no buffers (yet). Since we already have the index relation |
| * locked, we call gistdoinsert directly. |
| */ |
| gistdoinsert(index, itup, buildstate->freespace, |
| buildstate->giststate, buildstate->heaprel, true); |
| } |
| |
| MemoryContextSwitchTo(oldCtx); |
| MemoryContextReset(buildstate->giststate->tempCxt); |
| |
| if (buildstate->buildMode == GIST_BUFFERING_ACTIVE && |
| buildstate->indtuples % BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET == 0) |
| { |
| /* Adjust the target buffer size now */ |
| buildstate->gfbb->pagesPerBuffer = |
| calculatePagesPerBuffer(buildstate, buildstate->gfbb->levelStep); |
| } |
| |
| /* |
| * In 'auto' mode, check if the index has grown too large to fit in cache, |
| * and switch to buffering mode if it has. |
| * |
| * To avoid excessive calls to smgrnblocks(), only check this every |
| * BUFFERING_MODE_SWITCH_CHECK_STEP index tuples. |
| * |
| * In 'stats' state, switch as soon as we have seen enough tuples to have |
| * some idea of the average tuple size. |
| */ |
| if ((buildstate->buildMode == GIST_BUFFERING_AUTO && |
| buildstate->indtuples % BUFFERING_MODE_SWITCH_CHECK_STEP == 0 && |
| effective_cache_size < smgrnblocks(RelationGetSmgr(index), |
| MAIN_FORKNUM)) || |
| (buildstate->buildMode == GIST_BUFFERING_STATS && |
| buildstate->indtuples >= BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET)) |
| { |
| /* |
| * The index no longer fits in the effective cache, or we have gathered |
| * enough statistics. Try to switch to buffering build mode. |
| */ |
| gistInitBuffering(buildstate); |
| } |
| } |
| |
| /* |
| * Insert function for buffering index build. |
| */ |
| static void |
| gistBufferingBuildInsert(GISTBuildState *buildstate, IndexTuple itup) |
| { |
| /* Insert the tuple to buffers. */ |
| gistProcessItup(buildstate, itup, 0, buildstate->gfbb->rootlevel); |
| |
| /* If we filled up (half of a) buffer, process buffer emptying. */ |
| gistProcessEmptyingQueue(buildstate); |
| } |
| |
| /* |
| * Process an index tuple. Runs the tuple down the tree until we reach a leaf |
| * page or node buffer, and inserts the tuple there. Returns true if we have |
| * to stop the buffer emptying process (because one of the child buffers |
| * can't accept any more index tuples). |
| */ |
| static bool |
| gistProcessItup(GISTBuildState *buildstate, IndexTuple itup, |
| BlockNumber startblkno, int startlevel) |
| { |
| GISTSTATE *giststate = buildstate->giststate; |
| GISTBuildBuffers *gfbb = buildstate->gfbb; |
| Relation indexrel = buildstate->indexrel; |
| BlockNumber childblkno; |
| Buffer buffer; |
| bool result = false; |
| BlockNumber blkno; |
| int level; |
| OffsetNumber downlinkoffnum = InvalidOffsetNumber; |
| BlockNumber parentblkno = InvalidBlockNumber; |
| |
| CHECK_FOR_INTERRUPTS(); |
| |
| /* |
| * Loop until we reach a leaf page (level == 0) or a level with buffers |
| * (not including the level we start at, because we would otherwise make |
| * no progress). |
| */ |
| blkno = startblkno; |
| level = startlevel; |
| for (;;) |
| { |
| ItemId iid; |
| IndexTuple idxtuple, |
| newtup; |
| Page page; |
| OffsetNumber childoffnum; |
| |
| /* Have we reached a level with buffers? */ |
| if (LEVEL_HAS_BUFFERS(level, gfbb) && level != startlevel) |
| break; |
| |
| /* Have we reached a leaf page? */ |
| if (level == 0) |
| break; |
| |
| /* |
| * Nope. Descend down to the next level then. Choose a child to |
| * descend down to. |
| */ |
| |
| buffer = ReadBuffer(indexrel, blkno); |
| LockBuffer(buffer, GIST_EXCLUSIVE); |
| |
| page = (Page) BufferGetPage(buffer); |
| childoffnum = gistchoose(indexrel, page, itup, giststate); |
| iid = PageGetItemId(page, childoffnum); |
| idxtuple = (IndexTuple) PageGetItem(page, iid); |
| childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); |
| |
| if (level > 1) |
| gistMemorizeParent(buildstate, childblkno, blkno); |
| |
| /* |
| * Check that the key representing the target child node is consistent |
| * with the key we're inserting. Update it if it's not. |
| */ |
| newtup = gistgetadjusted(indexrel, idxtuple, itup, giststate); |
| if (newtup) |
| { |
| blkno = gistbufferinginserttuples(buildstate, |
| buffer, |
| level, |
| &newtup, |
| 1, |
| childoffnum, |
| InvalidBlockNumber, |
| InvalidOffsetNumber); |
| /* gistbufferinginserttuples() released the buffer */ |
| } |
| else |
| UnlockReleaseBuffer(buffer); |
| |
| /* Descend to the child */ |
| parentblkno = blkno; |
| blkno = childblkno; |
| downlinkoffnum = childoffnum; |
| Assert(level > 0); |
| level--; |
| } |
| |
| if (LEVEL_HAS_BUFFERS(level, gfbb)) |
| { |
| /* |
| * We've reached a level with buffers. Place the index tuple into the |
| * buffer, and add the buffer to the emptying queue if it overflows. |
| */ |
| GISTNodeBuffer *childNodeBuffer; |
| |
| /* Find the buffer or create a new one */ |
| childNodeBuffer = gistGetNodeBuffer(gfbb, giststate, blkno, level); |
| |
| /* Add index tuple to it */ |
| gistPushItupToNodeBuffer(gfbb, childNodeBuffer, itup); |
| |
| if (BUFFER_OVERFLOWED(childNodeBuffer, gfbb)) |
| result = true; |
| } |
| else |
| { |
| /* |
| * We've reached a leaf page. Place the tuple here. |
| */ |
| Assert(level == 0); |
| buffer = ReadBuffer(indexrel, blkno); |
| LockBuffer(buffer, GIST_EXCLUSIVE); |
| gistbufferinginserttuples(buildstate, buffer, level, |
| &itup, 1, InvalidOffsetNumber, |
| parentblkno, downlinkoffnum); |
| /* gistbufferinginserttuples() released the buffer */ |
| } |
| |
| return result; |
| } |
| |
| /* |
| * Insert tuples to a given page. |
| * |
| * This is analogous to gistinserttuples() in the regular insertion code. |
| * |
| * Returns the block number of the page where the (first) new or updated tuple |
| * was inserted. Usually that's the original page, but might be a sibling page |
| * if the original page was split. |
| * |
| * Caller should hold a lock on 'buffer' on entry. This function will unlock |
| * and unpin it. |
| */ |
| static BlockNumber |
| gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer, int level, |
| IndexTuple *itup, int ntup, OffsetNumber oldoffnum, |
| BlockNumber parentblk, OffsetNumber downlinkoffnum) |
| { |
| GISTBuildBuffers *gfbb = buildstate->gfbb; |
| List *splitinfo; |
| bool is_split; |
| BlockNumber placed_to_blk = InvalidBlockNumber; |
| |
| is_split = gistplacetopage(buildstate->indexrel, |
| buildstate->freespace, |
| buildstate->giststate, |
| buffer, |
| itup, ntup, oldoffnum, &placed_to_blk, |
| InvalidBuffer, |
| &splitinfo, |
| false, |
| buildstate->heaprel, true); |
| |
| /* |
| * If this is a root split, update the root path item kept in memory. This |
| * ensures that all path stacks are always complete, including all parent |
| * nodes up to the root. That simplifies the algorithm for re-finding the |
| * correct parent. |
| */ |
| if (is_split && BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO) |
| { |
| Page page = BufferGetPage(buffer); |
| OffsetNumber off; |
| OffsetNumber maxoff; |
| |
| Assert(level == gfbb->rootlevel); |
| gfbb->rootlevel++; |
| |
| elog(DEBUG2, "splitting GiST root page, now %d levels deep", gfbb->rootlevel); |
| |
| /* |
| * All the downlinks on the old root page are now on one of the child |
| * pages. Visit all the new child pages to memorize the parents of the |
| * grandchildren. |
| */ |
| if (gfbb->rootlevel > 1) |
| { |
| maxoff = PageGetMaxOffsetNumber(page); |
| for (off = FirstOffsetNumber; off <= maxoff; off++) |
| { |
| ItemId iid = PageGetItemId(page, off); |
| IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid); |
| BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); |
| Buffer childbuf = ReadBuffer(buildstate->indexrel, childblkno); |
| |
| LockBuffer(childbuf, GIST_SHARE); |
| gistMemorizeAllDownlinks(buildstate, childbuf); |
| UnlockReleaseBuffer(childbuf); |
| |
| /* |
| * Also remember that the parent of the new child page is the |
| * root block. |
| */ |
| gistMemorizeParent(buildstate, childblkno, GIST_ROOT_BLKNO); |
| } |
| } |
| } |
| |
| if (splitinfo) |
| { |
| /* |
| * Insert the downlinks into the parent. This is analogous to |
| * gistfinishsplit() in the regular insertion code, but the locking is |
| * simpler, and we have to maintain the buffers on internal nodes and |
| * the parent map. |
| */ |
| IndexTuple *downlinks; |
| int ndownlinks, |
| i; |
| Buffer parentBuffer; |
| ListCell *lc; |
| |
| /* Parent may have changed since we memorized this path. */ |
| parentBuffer = |
| gistBufferingFindCorrectParent(buildstate, |
| BufferGetBlockNumber(buffer), |
| level, |
| &parentblk, |
| &downlinkoffnum); |
| |
| /* |
| * If there's a buffer associated with this page, that needs to be |
| * split too. gistRelocateBuildBuffersOnSplit() will also adjust the |
| * downlinks in 'splitinfo', to make sure they're consistent not only |
| * with the tuples already on the pages, but also the tuples in the |
| * buffers that will eventually be inserted to them. |
| */ |
| gistRelocateBuildBuffersOnSplit(gfbb, |
| buildstate->giststate, |
| buildstate->indexrel, |
| level, |
| buffer, splitinfo); |
| |
| /* Create an array of all the downlink tuples */ |
| ndownlinks = list_length(splitinfo); |
| downlinks = (IndexTuple *) palloc(sizeof(IndexTuple) * ndownlinks); |
| i = 0; |
| foreach(lc, splitinfo) |
| { |
| GISTPageSplitInfo *splitinfo = lfirst(lc); |
| |
| /* |
| * Remember the parent of each new child page in our parent map. |
| * This assumes that the downlinks fit on the parent page. If the |
| * parent page is split, too, when we recurse up to insert the |
| * downlinks, the recursive gistbufferinginserttuples() call will |
| * update the map again. |
| */ |
| if (level > 0) |
| gistMemorizeParent(buildstate, |
| BufferGetBlockNumber(splitinfo->buf), |
| BufferGetBlockNumber(parentBuffer)); |
| |
| /* |
| * Also update the parent map for all the downlinks that got moved |
| * to a different page. (actually this also loops through the |
| * downlinks that stayed on the original page, but it does no |
| * harm). |
| */ |
| if (level > 1) |
| gistMemorizeAllDownlinks(buildstate, splitinfo->buf); |
| |
| /* |
| * Since there's no concurrent access, we can release the lower |
| * level buffers immediately. This includes the original page. |
| */ |
| UnlockReleaseBuffer(splitinfo->buf); |
| downlinks[i++] = splitinfo->downlink; |
| } |
| |
| /* Insert them into parent. */ |
| gistbufferinginserttuples(buildstate, parentBuffer, level + 1, |
| downlinks, ndownlinks, downlinkoffnum, |
| InvalidBlockNumber, InvalidOffsetNumber); |
| |
| list_free_deep(splitinfo); /* we don't need this anymore */ |
| } |
| else |
| UnlockReleaseBuffer(buffer); |
| |
| return placed_to_blk; |
| } |
| |
| /* |
| * Find the downlink pointing to a child page. |
| * |
| * 'childblkno' indicates the child page to find the parent for. 'level' is |
| * the level of the child. On entry, *parentblkno and *downlinkoffnum can |
| * point to a location where the downlink used to be - we will check that |
| * location first, and save some cycles if it hasn't moved. The function |
| * returns a buffer containing the downlink, exclusively-locked, and |
| * *parentblkno and *downlinkoffnum are set to the real location of the |
| * downlink. |
| * |
| * If the child page is a leaf (level == 0), the caller must supply a correct |
| * parentblkno. Otherwise we use the parent map hash table to find the parent |
| * block. |
| * |
| * This function serves the same purpose as gistFindCorrectParent() during |
| * normal index inserts, but this is simpler because we don't need to deal |
| * with concurrent inserts. |
| */ |
| static Buffer |
| gistBufferingFindCorrectParent(GISTBuildState *buildstate, |
| BlockNumber childblkno, int level, |
| BlockNumber *parentblkno, |
| OffsetNumber *downlinkoffnum) |
| { |
| BlockNumber parent; |
| Buffer buffer; |
| Page page; |
| OffsetNumber maxoff; |
| OffsetNumber off; |
| |
| if (level > 0) |
| parent = gistGetParent(buildstate, childblkno); |
| else |
| { |
| /* |
| * For a leaf page, the caller must supply a correct parent block |
| * number. |
| */ |
| if (*parentblkno == InvalidBlockNumber) |
| elog(ERROR, "no parent buffer provided of child %u", childblkno); |
| parent = *parentblkno; |
| } |
| |
| buffer = ReadBuffer(buildstate->indexrel, parent); |
| page = BufferGetPage(buffer); |
| LockBuffer(buffer, GIST_EXCLUSIVE); |
| gistcheckpage(buildstate->indexrel, buffer); |
| maxoff = PageGetMaxOffsetNumber(page); |
| |
| /* Check whether the downlink is still where it used to be */ |
| if (parent == *parentblkno && *parentblkno != InvalidBlockNumber && |
| *downlinkoffnum != InvalidOffsetNumber && *downlinkoffnum <= maxoff) |
| { |
| ItemId iid = PageGetItemId(page, *downlinkoffnum); |
| IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid); |
| |
| if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno) |
| { |
| /* Still there */ |
| return buffer; |
| } |
| } |
| |
| /* |
| * Downlink was not at the offset where it used to be. Scan the page to |
| * find it. During normal gist insertions, it might've moved to another |
| * page, to the right, but during a buffering build, we keep track of the |
| * parent of each page in the lookup table so we should always know what |
| * page it's on. |
| */ |
| for (off = FirstOffsetNumber; off <= maxoff; off = OffsetNumberNext(off)) |
| { |
| ItemId iid = PageGetItemId(page, off); |
| IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid); |
| |
| if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno) |
| { |
| /* yes!!, found it */ |
| *downlinkoffnum = off; |
| return buffer; |
| } |
| } |
| |
| elog(ERROR, "failed to re-find parent for block %u", childblkno); |
| return InvalidBuffer; /* keep compiler quiet */ |
| } |
| |
| /* |
| * Process the buffer emptying queue. Emptying one buffer can cause emptying |
| * of other buffers. This function iterates until this cascading emptying |
| * process has finished, i.e. until the buffer emptying queue is empty. |
| */ |
| static void |
| gistProcessEmptyingQueue(GISTBuildState *buildstate) |
| { |
| GISTBuildBuffers *gfbb = buildstate->gfbb; |
| |
| /* Iterate while there are elements in the buffer emptying queue. */ |
| while (gfbb->bufferEmptyingQueue != NIL) |
| { |
| GISTNodeBuffer *emptyingNodeBuffer; |
| |
| /* Get the next node buffer from the emptying queue. */ |
| emptyingNodeBuffer = (GISTNodeBuffer *) linitial(gfbb->bufferEmptyingQueue); |
| gfbb->bufferEmptyingQueue = list_delete_first(gfbb->bufferEmptyingQueue); |
| emptyingNodeBuffer->queuedForEmptying = false; |
| |
| /* |
| * We are going to load the last pages of the buffers that the emptying |
| * will write to, so unload any previously loaded buffers first. |
| */ |
| gistUnloadNodeBuffers(gfbb); |
| |
| /* |
| * Pop tuples from the buffer and run them down to the buffers at |
| * lower level, or leaf pages. We continue until one of the lower |
| * level buffers fills up, or this buffer runs empty. |
| * |
| * In Arge et al's paper, the buffer emptying is stopped after |
| * processing 1/2 node buffer worth of tuples, to avoid overfilling |
| * any of the lower level buffers. However, it's more efficient to |
| * keep going until one of the lower level buffers actually fills up, |
| * so that's what we do. This doesn't need to be exact, if a buffer |
| * overfills by a few tuples, there's no harm done. |
| */ |
| while (true) |
| { |
| IndexTuple itup; |
| |
| /* Get next index tuple from the buffer */ |
| if (!gistPopItupFromNodeBuffer(gfbb, emptyingNodeBuffer, &itup)) |
| break; |
| |
| /* |
| * Run it down to the underlying node buffer or leaf page. |
| * |
| * Note: it's possible that the buffer we're emptying splits as a |
| * result of this call. If that happens, our emptyingNodeBuffer |
| * points to the left half of the split. After split, it's very |
| * likely that the new left buffer is no longer over the half-full |
| * threshold, but we might as well keep flushing tuples from it |
| * until we fill a lower-level buffer. |
| */ |
| if (gistProcessItup(buildstate, itup, emptyingNodeBuffer->nodeBlocknum, emptyingNodeBuffer->level)) |
| { |
| /* |
| * A lower level buffer filled up. Stop emptying this buffer, |
| * to avoid overflowing the lower level buffer. |
| */ |
| break; |
| } |
| |
| /* Free all the memory allocated during index tuple processing */ |
| MemoryContextReset(buildstate->giststate->tempCxt); |
| } |
| } |
| } |
| |
| /* |
| * Empty all node buffers, from top to bottom. This is done at the end of |
| * index build to flush all remaining tuples to the index. |
| * |
| * Note: this destroys the buffersOnLevels lists, so no more tuples should |
| * be inserted into the buffers after this call. |
| */ |
| static void |
| gistEmptyAllBuffers(GISTBuildState *buildstate) |
| { |
| GISTBuildBuffers *gfbb = buildstate->gfbb; |
| MemoryContext oldCtx; |
| int i; |
| |
| oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt); |
| |
| /* |
| * Iterate through the levels from top to bottom. |
| */ |
| for (i = gfbb->buffersOnLevelsLen - 1; i >= 0; i--) |
| { |
| /* |
| * Empty all buffers on this level. Note that new buffers can pop up |
| * in the list during the processing, as a result of page splits, so a |
| * simple walk through the list won't work. We remove buffers from the |
| * list when we see them empty; a buffer can't become non-empty once |
| * it's been fully emptied. |
| */ |
| while (gfbb->buffersOnLevels[i] != NIL) |
| { |
| GISTNodeBuffer *nodeBuffer; |
| |
| nodeBuffer = (GISTNodeBuffer *) linitial(gfbb->buffersOnLevels[i]); |
| |
| if (nodeBuffer->blocksCount != 0) |
| { |
| /* |
| * Add this buffer to the emptying queue, and proceed to empty |
| * the queue. |
| */ |
| if (!nodeBuffer->queuedForEmptying) |
| { |
| MemoryContextSwitchTo(gfbb->context); |
| nodeBuffer->queuedForEmptying = true; |
| gfbb->bufferEmptyingQueue = |
| lcons(nodeBuffer, gfbb->bufferEmptyingQueue); |
| MemoryContextSwitchTo(buildstate->giststate->tempCxt); |
| } |
| gistProcessEmptyingQueue(buildstate); |
| } |
| else |
| gfbb->buffersOnLevels[i] = |
| list_delete_first(gfbb->buffersOnLevels[i]); |
| } |
| elog(DEBUG2, "emptied all buffers at level %d", i); |
| } |
| MemoryContextSwitchTo(oldCtx); |
| } |
| |
| /* |
| * Get the depth of the GiST index. |
| */ |
| static int |
| gistGetMaxLevel(Relation index) |
| { |
| int maxLevel; |
| BlockNumber blkno; |
| |
| /* |
| * Traverse down the tree, starting from the root, until we hit the leaf |
| * level. |
| */ |
| maxLevel = 0; |
| blkno = GIST_ROOT_BLKNO; |
| while (true) |
| { |
| Buffer buffer; |
| Page page; |
| IndexTuple itup; |
| |
| buffer = ReadBuffer(index, blkno); |
| |
| /* |
| * There's no concurrent access during index build, so locking is just |
| * pro forma. |
| */ |
| LockBuffer(buffer, GIST_SHARE); |
| page = (Page) BufferGetPage(buffer); |
| |
| if (GistPageIsLeaf(page)) |
| { |
| /* We hit the bottom, so we're done. */ |
| UnlockReleaseBuffer(buffer); |
| break; |
| } |
| |
| /* |
| * Pick the first downlink on the page, and follow it. It doesn't |
| * matter which downlink we choose, the tree has the same depth |
| * everywhere, so we just pick the first one. |
| */ |
| itup = (IndexTuple) PageGetItem(page, |
| PageGetItemId(page, FirstOffsetNumber)); |
| blkno = ItemPointerGetBlockNumber(&(itup->t_tid)); |
| UnlockReleaseBuffer(buffer); |
| |
| /* |
| * We're descending the tree, which means there is yet one more |
| * level below this one. |
| */ |
| maxLevel++; |
| } |
| return maxLevel; |
| } |
| |
| |
| /* |
| * Routines for managing the parent map. |
| * |
| * Whenever a page is split, we need to insert the downlinks into the parent. |
| * We need to somehow find the parent page to do that. In normal insertions, |
| * we keep a stack of nodes visited when we descend the tree. However, in |
| * buffering build, we can start descending the tree from any internal node, |
| * when we empty a buffer by cascading tuples to its children. So we don't |
| * have a full stack up to the root available at that time. |
| * |
| * So instead, we maintain a hash table to track the parent of every internal |
| * page. We don't need to track the parents of leaf nodes, however. Whenever |
| * we insert into a leaf, we've just descended from its parent, so we know |
| * its immediate parent already. This helps a lot to limit the memory used |
| * by this hash table. |
| * |
| * Whenever an internal node is split, the parent map needs to be updated: |
| * the parent of the new child page needs to be recorded, and the entries |
| * for all pages whose downlinks are moved to the new page in the split |
| * need to be updated as well. |
| * |
| * We also update the parent map whenever we descend the tree. That might seem |
| * unnecessary, because we maintain the map whenever a downlink is moved or |
| * created, but it is needed because we switch to buffering mode after |
| * creating a tree with regular index inserts. Any pages created before |
| * switching to buffering mode will not be present in the parent map initially, |
| * but will be added there the first time we visit them. |
| */ |
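| |
| /* |
| * In outline: gistProcessItup() records parents via gistMemorizeParent() as |
| * it descends, gistbufferinginserttuples() updates the map through |
| * gistMemorizeParent() and gistMemorizeAllDownlinks() when pages split, and |
| * gistBufferingFindCorrectParent() consults it via gistGetParent(). |
| */ |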
| |
| typedef struct |
| { |
| BlockNumber childblkno; /* hash key */ |
| BlockNumber parentblkno; |
| } ParentMapEntry; |
| |
| static void |
| gistInitParentMap(GISTBuildState *buildstate) |
| { |
| HASHCTL hashCtl; |
| |
| hashCtl.keysize = sizeof(BlockNumber); |
| hashCtl.entrysize = sizeof(ParentMapEntry); |
| hashCtl.hcxt = CurrentMemoryContext; |
| buildstate->parentMap = hash_create("gistbuild parent map", |
| 1024, |
| &hashCtl, |
| HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); |
| } |
| |
| static void |
| gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child, BlockNumber parent) |
| { |
| ParentMapEntry *entry; |
| bool found; |
| |
| entry = (ParentMapEntry *) hash_search(buildstate->parentMap, |
| &child, |
| HASH_ENTER, |
| &found); |
| entry->parentblkno = parent; |
| } |
| |
| /* |
| * Scan all downlinks on a page, and memorize their parent. |
| */ |
| static void |
| gistMemorizeAllDownlinks(GISTBuildState *buildstate, Buffer parentbuf) |
| { |
| OffsetNumber maxoff; |
| OffsetNumber off; |
| BlockNumber parentblkno = BufferGetBlockNumber(parentbuf); |
| Page page = BufferGetPage(parentbuf); |
| |
| Assert(!GistPageIsLeaf(page)); |
| |
| maxoff = PageGetMaxOffsetNumber(page); |
| for (off = FirstOffsetNumber; off <= maxoff; off++) |
| { |
| ItemId iid = PageGetItemId(page, off); |
| IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid); |
| BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); |
| |
| gistMemorizeParent(buildstate, childblkno, parentblkno); |
| } |
| } |
| |
| static BlockNumber |
| gistGetParent(GISTBuildState *buildstate, BlockNumber child) |
| { |
| ParentMapEntry *entry; |
| bool found; |
| |
| /* Look up the parent block in the hash table */ |
| entry = (ParentMapEntry *) hash_search(buildstate->parentMap, |
| &child, |
| HASH_FIND, |
| &found); |
| if (!found) |
| elog(ERROR, "could not find parent of block %u in lookup table", child); |
| |
| return entry->parentblkno; |
| } |