| /*------------------------------------------------------------------------- |
| * |
| * nbtsort.c |
| * Build a btree from sorted input by loading leaf pages sequentially. |
| * |
| * NOTES |
| * |
| * We use tuplesort.c to sort the given index tuples into order. |
| * Then we scan the index tuples in order and build the btree pages |
| * for each level. We load source tuples into leaf-level pages. |
| * Whenever we fill a page at one level, we add a link to it to its |
| * parent level (starting a new parent level if necessary). When |
| * done, we write out each final page on each level, adding it to |
| * its parent level. When we have only one page on a level, it must be |
| * the root -- it can be attached to the btree metapage and we are done. |
| * |
| * It is not wise to pack the pages entirely full, since then *any* |
| * insertion would cause a split (and not only of the leaf page; the need |
| * for a split would cascade right up the tree). The steady-state load |
| * factor for btrees is usually estimated at 70%. We choose to pack leaf |
| * pages to the user-controllable fill factor (default 90%) while upper pages |
| * are always packed to 70%. This gives us reasonable density (there aren't |
| * many upper pages if the keys are reasonable-size) without risking a lot of |
| * cascading splits during early insertions. |
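| * |
| * As a rough illustration (assuming the default BLCKSZ of 8192, and the |
| * thresholds computed by _bt_pagestate() below): a leaf page built with |
| * the default fillfactor of 90 counts as "full" once less than about |
| * BLCKSZ * 10 / 100 = 819 bytes remain free, while an upper page counts |
| * as "full" once less than about BLCKSZ * 30 / 100 = 2457 bytes remain |
| * free. |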
| * |
| * Formerly the index pages being built were kept in shared buffers, but |
| * that is of no value (since other backends have no interest in them yet) |
| * and it created locking problems for CHECKPOINT, because the upper-level |
| * pages were held exclusive-locked for long periods. Now we just build |
| * the pages in local memory and smgrwrite or smgrextend them as we finish |
| * them. They will need to be re-read into shared buffers on first use after |
| * the build finishes. |
| * |
| * This code isn't concerned about the FSM at all. The caller is responsible |
| * for initializing that. |
| * |
| * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group |
| * Portions Copyright (c) 1994, Regents of the University of California |
| * |
| * IDENTIFICATION |
| * src/backend/access/nbtree/nbtsort.c |
| * |
| *------------------------------------------------------------------------- |
| */ |
| |
| #include "postgres.h" |
| |
| #include "access/nbtree.h" |
| #include "access/parallel.h" |
| #include "access/relscan.h" |
| #include "access/table.h" |
| #include "access/xact.h" |
| #include "access/xlog.h" |
| #include "access/xloginsert.h" |
| #include "catalog/index.h" |
| #include "commands/progress.h" |
| #include "executor/instrument.h" |
| #include "miscadmin.h" |
| #include "pgstat.h" |
| #include "storage/smgr.h" |
| #include "tcop/tcopprot.h" /* pgrminclude ignore */ |
| #include "utils/rel.h" |
| #include "utils/sortsupport.h" |
| #include "utils/tuplesort.h" |
| |
| |
| /* Magic numbers for parallel state sharing */ |
| #define PARALLEL_KEY_BTREE_SHARED UINT64CONST(0xA000000000000001) |
| #define PARALLEL_KEY_TUPLESORT UINT64CONST(0xA000000000000002) |
| #define PARALLEL_KEY_TUPLESORT_SPOOL2 UINT64CONST(0xA000000000000003) |
| #define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xA000000000000004) |
| #define PARALLEL_KEY_WAL_USAGE UINT64CONST(0xA000000000000005) |
| #define PARALLEL_KEY_BUFFER_USAGE UINT64CONST(0xA000000000000006) |
| |
| /* |
| * DISABLE_LEADER_PARTICIPATION disables the leader's participation in |
| * parallel index builds. This may be useful as a debugging aid. |
| #undef DISABLE_LEADER_PARTICIPATION |
| */ |
| |
| /* |
| * Status record for spooling/sorting phase. (Note we may have two of |
| * these due to the special requirements for uniqueness-checking with |
| * dead tuples.) |
| */ |
| typedef struct BTSpool |
| { |
| Tuplesortstate *sortstate; /* state data for tuplesort.c */ |
| Relation heap; |
| Relation index; |
| bool isunique; |
| bool nulls_not_distinct; |
| } BTSpool; |
| |
| /* |
| * Status for index builds performed in parallel. This is allocated in a |
| * dynamic shared memory segment. Note that there is a separate tuplesort TOC |
| * entry, private to tuplesort.c but allocated by this module on its behalf. |
| */ |
| typedef struct BTShared |
| { |
| /* |
| * These fields are not modified during the sort. They primarily exist |
| * for the benefit of worker processes that need to create BTSpool state |
| * corresponding to that used by the leader. |
| */ |
| Oid heaprelid; |
| Oid indexrelid; |
| bool isunique; |
| bool nulls_not_distinct; |
| bool isconcurrent; |
| int scantuplesortstates; |
| |
| /* |
| * workersdonecv is used to monitor the progress of workers. All parallel |
| * participants must indicate that they are done before leader can use |
| * mutable state that workers maintain during scan (and before leader can |
| * proceed to tuplesort_performsort()). |
| */ |
| ConditionVariable workersdonecv; |
| |
| /* |
| * mutex protects the mutable fields that follow it, up to (but not |
| * including) the trailing parallel table scan descriptor data. |
| * |
| * These fields contain status information of interest to B-Tree index |
| * builds that must work just the same when an index is built in parallel. |
| */ |
| slock_t mutex; |
| |
| /* |
| * Mutable state that is maintained by workers, and reported back to |
| * leader at end of parallel scan. |
| * |
| * nparticipantsdone is number of worker processes finished. |
| * |
| * reltuples is the total number of input heap tuples. |
| * |
| * havedead indicates if RECENTLY_DEAD tuples were encountered during |
| * build. |
| * |
| * indtuples is the total number of tuples that made it into the index. |
| * |
| * brokenhotchain indicates if any worker detected a broken HOT chain |
| * during build. |
| */ |
| int nparticipantsdone; |
| double reltuples; |
| bool havedead; |
| double indtuples; |
| bool brokenhotchain; |
| |
| /* |
| * ParallelTableScanDescData data follows. Can't directly embed here, as |
| * implementations of the parallel table scan desc interface might need |
| * stronger alignment. |
| */ |
| } BTShared; |
| |
| /* |
| * Return pointer to a BTShared's parallel table scan. |
| * |
| * c.f. shm_toc_allocate as to why BUFFERALIGN is used, rather than just |
| * MAXALIGN. |
| */ |
| #define ParallelTableScanFromBTShared(shared) \ |
| (ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(BTShared))) |
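| |
| /* |
| * To illustrate the layout implied above (a sketch, not additional code): |
| * |
| * +----------------------------+ <- start of shm_toc_allocate() chunk |
| * | BTShared                   | |
| * +----------------------------+ <- BUFFERALIGN(sizeof(BTShared)) |
| * | ParallelTableScanDescData  | |
| * | (the table AM's scan state | |
| * | may need alignment beyond  | |
| * | MAXALIGN)                  | |
| * +----------------------------+ |
| */ |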
| |
| /* |
| * Status for leader in parallel index build. |
| */ |
| typedef struct BTLeader |
| { |
| /* parallel context itself */ |
| ParallelContext *pcxt; |
| |
| /* |
| * nparticipanttuplesorts is the exact number of worker processes |
| * successfully launched, plus one leader process if it participates as a |
| * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader |
| * participating as a worker). |
| */ |
| int nparticipanttuplesorts; |
| |
| /* |
| * Leader process convenience pointers to shared state (leader avoids TOC |
| * lookups). |
| * |
| * btshared is the shared state for entire build. sharedsort is the |
| * shared, tuplesort-managed state passed to each process tuplesort. |
| * sharedsort2 is the corresponding btspool2 shared state, used only when |
| * building unique indexes. snapshot is the snapshot used by the scan iff |
| * an MVCC snapshot is required. |
| */ |
| BTShared *btshared; |
| Sharedsort *sharedsort; |
| Sharedsort *sharedsort2; |
| Snapshot snapshot; |
| WalUsage *walusage; |
| BufferUsage *bufferusage; |
| } BTLeader; |
| |
| /* |
| * Working state for btbuild and its callback. |
| * |
| * When parallel CREATE INDEX is used, there is a BTBuildState for each |
| * participant. |
| */ |
| typedef struct BTBuildState |
| { |
| bool isunique; |
| bool nulls_not_distinct; |
| bool havedead; |
| Relation heap; |
| BTSpool *spool; |
| |
| /* |
| * spool2 is needed only when the index is a unique index. Dead tuples are |
| * put into spool2 instead of spool in order to avoid the uniqueness check. |
| */ |
| BTSpool *spool2; |
| double indtuples; |
| |
| /* |
| * btleader is only present when a parallel index build is performed, and |
| * only in the leader process. (Actually, only the leader has a |
| * BTBuildState. Workers have their own spool and spool2, though.) |
| */ |
| BTLeader *btleader; |
| } BTBuildState; |
| |
| /* |
| * Status record for a btree page being built. We have one of these |
| * for each active tree level. |
| */ |
| typedef struct BTPageState |
| { |
| Page btps_page; /* workspace for page building */ |
| BlockNumber btps_blkno; /* block # to write this page at */ |
| IndexTuple btps_lowkey; /* page's strict lower bound pivot tuple */ |
| OffsetNumber btps_lastoff; /* last item offset loaded */ |
| Size btps_lastextra; /* last item's extra posting list space */ |
| uint32 btps_level; /* tree level (0 = leaf) */ |
| Size btps_full; /* "full" if less than this much free space */ |
| struct BTPageState *btps_next; /* link to parent level, if any */ |
| } BTPageState; |
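| |
| /* |
| * For example, while loading, the btps_next links chain each level's |
| * in-progress page state to the state for its parent level (a sketch): |
| * |
| * level 0 (leaf) -> level 1 -> level 2 -> NULL (no parent level yet) |
| */ |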
| |
| /* |
| * Overall status record for index writing phase. |
| */ |
| typedef struct BTWriteState |
| { |
| Relation heap; |
| Relation index; |
| BTScanInsert inskey; /* generic insertion scankey */ |
| bool btws_use_wal; /* dump pages to WAL? */ |
| BlockNumber btws_pages_alloced; /* # pages allocated */ |
| BlockNumber btws_pages_written; /* # pages written out */ |
| Page btws_zeropage; /* workspace for filling zeroes */ |
| } BTWriteState; |
| |
| |
| static double _bt_spools_heapscan(Relation heap, Relation index, |
| BTBuildState *buildstate, IndexInfo *indexInfo); |
| static void _bt_spooldestroy(BTSpool *btspool); |
| static void _bt_spool(BTSpool *btspool, ItemPointer self, |
| Datum *values, bool *isnull); |
| static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2); |
| static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values, |
| bool *isnull, bool tupleIsAlive, void *state); |
| static Page _bt_blnewpage(uint32 level); |
| static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level); |
| static void _bt_slideleft(Page rightmostpage); |
| static void _bt_sortaddtup(Page page, Size itemsize, |
| IndexTuple itup, OffsetNumber itup_off, |
| bool newfirstdataitem); |
| static void _bt_buildadd(BTWriteState *wstate, BTPageState *state, |
| IndexTuple itup, Size truncextra); |
| static void _bt_sort_dedup_finish_pending(BTWriteState *wstate, |
| BTPageState *state, |
| BTDedupState dstate); |
| static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state); |
| static void _bt_load(BTWriteState *wstate, |
| BTSpool *btspool, BTSpool *btspool2); |
| static void _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, |
| int request); |
| static void _bt_end_parallel(BTLeader *btleader); |
| static Size _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot); |
| static double _bt_parallel_heapscan(BTBuildState *buildstate, |
| bool *brokenhotchain); |
| static void _bt_leader_participate_as_worker(BTBuildState *buildstate); |
| static void _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, |
| BTShared *btshared, Sharedsort *sharedsort, |
| Sharedsort *sharedsort2, int sortmem, |
| bool progress); |
| |
| |
| /* |
| * btbuild() -- build a new btree index. |
| */ |
| IndexBuildResult * |
| btbuild(Relation heap, Relation index, IndexInfo *indexInfo) |
| { |
| IndexBuildResult *result; |
| BTBuildState buildstate; |
| double reltuples; |
| |
| #ifdef BTREE_BUILD_STATS |
| if (log_btree_build_stats) |
| ResetUsage(); |
| #endif /* BTREE_BUILD_STATS */ |
| |
| buildstate.isunique = indexInfo->ii_Unique; |
| buildstate.nulls_not_distinct = indexInfo->ii_NullsNotDistinct; |
| buildstate.havedead = false; |
| buildstate.heap = heap; |
| buildstate.spool = NULL; |
| buildstate.spool2 = NULL; |
| buildstate.indtuples = 0; |
| buildstate.btleader = NULL; |
| |
| /* |
| * We expect to be called exactly once for any index relation. If that's |
| * not the case, big trouble's what we have. |
| */ |
| if (RelationGetNumberOfBlocks(index) != 0) |
| elog(ERROR, "index \"%s\" already contains data", |
| RelationGetRelationName(index)); |
| |
| reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo); |
| |
| /* |
| * Finish the build by (1) completing the sort of the spool file, (2) |
| * inserting the sorted tuples into btree pages and (3) building the upper |
| * levels. Finally, it may also be necessary to end use of parallelism. |
| */ |
| _bt_leafbuild(buildstate.spool, buildstate.spool2); |
| _bt_spooldestroy(buildstate.spool); |
| if (buildstate.spool2) |
| _bt_spooldestroy(buildstate.spool2); |
| if (buildstate.btleader) |
| _bt_end_parallel(buildstate.btleader); |
| |
| result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); |
| |
| result->heap_tuples = reltuples; |
| result->index_tuples = buildstate.indtuples; |
| |
| #ifdef BTREE_BUILD_STATS |
| if (log_btree_build_stats) |
| { |
| ShowUsage("BTREE BUILD STATS"); |
| ResetUsage(); |
| } |
| #endif /* BTREE_BUILD_STATS */ |
| |
| return result; |
| } |
| |
| /* |
| * Create and initialize one or two spool structures, and save them in caller's |
| * buildstate argument. May also fill in fields within indexInfo used by index |
| * builds. |
| * |
| * Scans the heap, possibly in parallel, filling spools with IndexTuples. This |
| * routine encapsulates all aspects of managing parallelism. Caller need only |
| * call _bt_end_parallel() in parallel case after it is done with spool/spool2. |
| * |
| * Returns the total number of heap tuples scanned. |
| */ |
| static double |
| _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, |
| IndexInfo *indexInfo) |
| { |
| BTSpool *btspool = (BTSpool *) palloc0(sizeof(BTSpool)); |
| SortCoordinate coordinate = NULL; |
| double reltuples = 0; |
| |
| /* |
| * We size the sort area as maintenance_work_mem rather than work_mem to |
| * speed index creation. This should be OK since a single backend can't |
| * run multiple index creations in parallel (see also: notes on |
| * parallelism and maintenance_work_mem below). |
| */ |
| btspool->heap = heap; |
| btspool->index = index; |
| btspool->isunique = indexInfo->ii_Unique; |
| btspool->nulls_not_distinct = indexInfo->ii_NullsNotDistinct; |
| |
| /* Save as primary spool */ |
| buildstate->spool = btspool; |
| |
| /* Report table scan phase started */ |
| pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, |
| PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN); |
| |
| /* Attempt to launch parallel worker scan when required */ |
| if (indexInfo->ii_ParallelWorkers > 0) |
| _bt_begin_parallel(buildstate, indexInfo->ii_Concurrent, |
| indexInfo->ii_ParallelWorkers); |
| |
| /* |
| * If parallel build requested and at least one worker process was |
| * successfully launched, set up coordination state |
| */ |
| if (buildstate->btleader) |
| { |
| coordinate = (SortCoordinate) palloc0(sizeof(SortCoordinateData)); |
| coordinate->isWorker = false; |
| coordinate->nParticipants = |
| buildstate->btleader->nparticipanttuplesorts; |
| coordinate->sharedsort = buildstate->btleader->sharedsort; |
| } |
| |
| /* |
| * Begin serial/leader tuplesort. |
| * |
| * In cases where parallelism is involved, the leader receives the same |
| * share of maintenance_work_mem as a serial sort (it is generally treated |
| * in the same way as a serial sort once we return). Parallel worker |
| * Tuplesortstates will have received only a fraction of |
| * maintenance_work_mem, though. |
| * |
| * We rely on the lifetime of the Leader Tuplesortstate almost not |
| * overlapping with any worker Tuplesortstate's lifetime. There may be |
| * some small overlap, but that's okay because we rely on leader |
| * Tuplesortstate only allocating a small, fixed amount of memory here. |
| * When its tuplesort_performsort() is called (by our caller), and |
| * significant amounts of memory are likely to be used, all workers must |
| * have already freed almost all memory held by their Tuplesortstates |
| * (they are about to go away completely, too). The overall effect is |
| * that maintenance_work_mem always represents an absolute high watermark |
| * on the amount of memory used by a CREATE INDEX operation, regardless of |
| * the use of parallelism or any other factor. |
| */ |
| buildstate->spool->sortstate = |
| tuplesort_begin_index_btree(heap, index, buildstate->isunique, |
| buildstate->nulls_not_distinct, |
| maintenance_work_mem, coordinate, |
| TUPLESORT_NONE); |
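| |
| /* |
| * To make the budget concrete (an illustration, not exact accounting): |
| * with maintenance_work_mem = 64MB and 3 participant tuplesorts, each |
| * participant's scan-phase sort is limited to roughly 64MB / 3, while |
| * the leader Tuplesortstate begun above may use the full 64MB during |
| * its later merge, once the workers have released their memory. |
| */ |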
| |
| /* |
| * If building a unique index, put dead tuples in a second spool to keep |
| * them out of the uniqueness check. We expect that the second spool (for |
| * dead tuples) won't get very full, so we give it only work_mem. |
| */ |
| if (indexInfo->ii_Unique) |
| { |
| BTSpool *btspool2 = (BTSpool *) palloc0(sizeof(BTSpool)); |
| SortCoordinate coordinate2 = NULL; |
| |
| /* Initialize secondary spool */ |
| btspool2->heap = heap; |
| btspool2->index = index; |
| btspool2->isunique = false; |
| /* Save as secondary spool */ |
| buildstate->spool2 = btspool2; |
| |
| if (buildstate->btleader) |
| { |
| /* |
| * Set up non-private state that is passed to |
| * tuplesort_begin_index_btree() about the basic high level |
| * coordination of a parallel sort. |
| */ |
| coordinate2 = (SortCoordinate) palloc0(sizeof(SortCoordinateData)); |
| coordinate2->isWorker = false; |
| coordinate2->nParticipants = |
| buildstate->btleader->nparticipanttuplesorts; |
| coordinate2->sharedsort = buildstate->btleader->sharedsort2; |
| } |
| |
| /* |
| * We expect that the second one (for dead tuples) won't get very |
| * full, so we give it only work_mem |
| */ |
| buildstate->spool2->sortstate = |
| tuplesort_begin_index_btree(heap, index, false, false, work_mem, |
| coordinate2, TUPLESORT_NONE); |
| } |
| |
| /* Fill spool using either serial or parallel heap scan */ |
| if (!buildstate->btleader) |
| reltuples = table_index_build_scan(heap, index, indexInfo, true, true, |
| _bt_build_callback, (void *) buildstate, |
| NULL); |
| else |
| reltuples = _bt_parallel_heapscan(buildstate, |
| &indexInfo->ii_BrokenHotChain); |
| |
| /* |
| * Set the progress target for the next phase. Reset the block number |
| * values set by table_index_build_scan |
| */ |
| { |
| const int progress_index[] = { |
| PROGRESS_CREATEIDX_TUPLES_TOTAL, |
| PROGRESS_SCAN_BLOCKS_TOTAL, |
| PROGRESS_SCAN_BLOCKS_DONE |
| }; |
| const int64 progress_vals[] = { |
| buildstate->indtuples, |
| 0, 0 |
| }; |
| |
| pgstat_progress_update_multi_param(3, progress_index, progress_vals); |
| } |
| |
| /* okay, all heap tuples are spooled */ |
| if (buildstate->spool2 && !buildstate->havedead) |
| { |
| /* spool2 turns out to be unnecessary */ |
| _bt_spooldestroy(buildstate->spool2); |
| buildstate->spool2 = NULL; |
| } |
| |
| return reltuples; |
| } |
| |
| /* |
| * clean up a spool structure and its substructures. |
| */ |
| static void |
| _bt_spooldestroy(BTSpool *btspool) |
| { |
| tuplesort_end(btspool->sortstate); |
| pfree(btspool); |
| } |
| |
| /* |
| * spool an index entry into the sort file. |
| */ |
| static void |
| _bt_spool(BTSpool *btspool, ItemPointer self, Datum *values, bool *isnull) |
| { |
| tuplesort_putindextuplevalues(btspool->sortstate, btspool->index, |
| self, values, isnull); |
| } |
| |
| /* |
| * given a spool loaded by successive calls to _bt_spool, |
| * create an entire btree. |
| */ |
| static void |
| _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2) |
| { |
| BTWriteState wstate; |
| |
| #ifdef BTREE_BUILD_STATS |
| if (log_btree_build_stats) |
| { |
| ShowUsage("BTREE BUILD (Spool) STATISTICS"); |
| ResetUsage(); |
| } |
| #endif /* BTREE_BUILD_STATS */ |
| |
| /* Execute the sort */ |
| pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, |
| PROGRESS_BTREE_PHASE_PERFORMSORT_1); |
| tuplesort_performsort(btspool->sortstate); |
| if (btspool2) |
| { |
| pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, |
| PROGRESS_BTREE_PHASE_PERFORMSORT_2); |
| tuplesort_performsort(btspool2->sortstate); |
| } |
| |
| wstate.heap = btspool->heap; |
| wstate.index = btspool->index; |
| wstate.inskey = _bt_mkscankey(wstate.index, NULL); |
| /* _bt_mkscankey() won't set allequalimage without metapage */ |
| wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true); |
| wstate.btws_use_wal = RelationNeedsWAL(wstate.index); |
| |
| /* reserve the metapage */ |
| wstate.btws_pages_alloced = BTREE_METAPAGE + 1; |
| wstate.btws_pages_written = 0; |
| wstate.btws_zeropage = NULL; /* until needed */ |
| |
| pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, |
| PROGRESS_BTREE_PHASE_LEAF_LOAD); |
| _bt_load(&wstate, btspool, btspool2); |
| } |
| |
| /* |
| * Per-tuple callback for table_index_build_scan |
| */ |
| static void |
| _bt_build_callback(Relation index, |
| ItemPointer tid, |
| Datum *values, |
| bool *isnull, |
| bool tupleIsAlive, |
| void *state) |
| { |
| BTBuildState *buildstate = (BTBuildState *) state; |
| |
| /* |
| * insert the index tuple into the appropriate spool file for subsequent |
| * processing |
| */ |
| if (tupleIsAlive || buildstate->spool2 == NULL) |
| _bt_spool(buildstate->spool, tid, values, isnull); |
| else |
| { |
| /* dead tuples are put into spool2 */ |
| buildstate->havedead = true; |
| _bt_spool(buildstate->spool2, tid, values, isnull); |
| } |
| |
| buildstate->indtuples += 1; |
| } |
| |
| /* |
| * allocate workspace for a new, clean btree page, not linked to any siblings. |
| */ |
| static Page |
| _bt_blnewpage(uint32 level) |
| { |
| Page page; |
| BTPageOpaque opaque; |
| |
| page = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0); |
| |
| /* Zero the page and set up standard page header info */ |
| _bt_pageinit(page, BLCKSZ); |
| |
| /* Initialize BT opaque state */ |
| opaque = BTPageGetOpaque(page); |
| opaque->btpo_prev = opaque->btpo_next = P_NONE; |
| opaque->btpo_level = level; |
| opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF; |
| opaque->btpo_cycleid = 0; |
| |
| /* Make the P_HIKEY line pointer appear allocated */ |
| ((PageHeader) page)->pd_lower += sizeof(ItemIdData); |
| |
| return page; |
| } |
| |
| /* |
| * emit a completed btree page, and release the working storage. |
| */ |
| static void |
| _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno) |
| { |
| /* XLOG stuff */ |
| if (wstate->btws_use_wal) |
| { |
| /* We use the XLOG_FPI record type for this */ |
| log_newpage(&wstate->index->rd_locator, MAIN_FORKNUM, blkno, page, true); |
| } |
| |
| /* |
| * If we have to write pages nonsequentially, fill in the space with |
| * zeroes until we come back and overwrite. This is not logically |
| * necessary on standard Unix filesystems (unwritten space will read as |
| * zeroes anyway), but it should help to avoid fragmentation. The dummy |
| * pages aren't WAL-logged though. |
| */ |
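| /* |
| * For example: block 0 (the metapage) is reserved from the start, but |
| * is only written at the very end of the build by _bt_uppershutdown(). |
| * The first completed leaf page (block 1) therefore causes block 0 to |
| * be zero-filled here, before the real metapage contents overwrite it. |
| */ |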
| while (blkno > wstate->btws_pages_written) |
| { |
| if (!wstate->btws_zeropage) |
| wstate->btws_zeropage = (Page) palloc_aligned(BLCKSZ, |
| PG_IO_ALIGN_SIZE, |
| MCXT_ALLOC_ZERO); |
| /* don't set checksum for all-zero page */ |
| smgrextend(RelationGetSmgr(wstate->index), MAIN_FORKNUM, |
| wstate->btws_pages_written++, |
| wstate->btws_zeropage, |
| true); |
| } |
| |
| PageEncryptInplace(page, MAIN_FORKNUM, blkno); |
| PageSetChecksumInplace(page, blkno); |
| |
| /* |
| * Now write the page. There's no need for smgr to schedule an fsync for |
| * this write; we'll do it ourselves before ending the build. |
| */ |
| if (blkno == wstate->btws_pages_written) |
| { |
| /* extending the file... */ |
| smgrextend(RelationGetSmgr(wstate->index), MAIN_FORKNUM, blkno, |
| page, true); |
| wstate->btws_pages_written++; |
| } |
| else |
| { |
| /* overwriting a block we zero-filled before */ |
| smgrwrite(RelationGetSmgr(wstate->index), MAIN_FORKNUM, blkno, |
| page, true); |
| } |
| |
| pfree(page); |
| } |
| |
| /* |
| * allocate and initialize a new BTPageState. the returned structure |
| * is suitable for immediate use by _bt_buildadd. |
| */ |
| static BTPageState * |
| _bt_pagestate(BTWriteState *wstate, uint32 level) |
| { |
| BTPageState *state = (BTPageState *) palloc0(sizeof(BTPageState)); |
| |
| /* create initial page for level */ |
| state->btps_page = _bt_blnewpage(level); |
| |
| /* and assign it a page position */ |
| state->btps_blkno = wstate->btws_pages_alloced++; |
| |
| state->btps_lowkey = NULL; |
| /* initialize lastoff so first item goes into P_FIRSTKEY */ |
| state->btps_lastoff = P_HIKEY; |
| state->btps_lastextra = 0; |
| state->btps_level = level; |
| /* set "full" threshold based on level. See notes at head of file. */ |
| if (level > 0) |
| state->btps_full = (BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100); |
| else |
| state->btps_full = BTGetTargetPageFreeSpace(wstate->index); |
| |
| /* no parent level, yet */ |
| state->btps_next = NULL; |
| |
| return state; |
| } |
| |
| /* |
| * Slide the array of ItemIds from the page back one slot (from P_FIRSTKEY to |
| * P_HIKEY, overwriting P_HIKEY). |
| * |
| * _bt_blnewpage() makes the P_HIKEY line pointer appear allocated, but the |
| * rightmost page on its level is not supposed to get a high key. Now that |
| * it's clear that this page is a rightmost page, remove the unneeded empty |
| * P_HIKEY line pointer space. |
| */ |
| static void |
| _bt_slideleft(Page rightmostpage) |
| { |
| OffsetNumber off; |
| OffsetNumber maxoff; |
| ItemId previi; |
| |
| maxoff = PageGetMaxOffsetNumber(rightmostpage); |
| Assert(maxoff >= P_FIRSTKEY); |
| previi = PageGetItemId(rightmostpage, P_HIKEY); |
| for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off)) |
| { |
| ItemId thisii = PageGetItemId(rightmostpage, off); |
| |
| *previi = *thisii; |
| previi = thisii; |
| } |
| ((PageHeader) rightmostpage)->pd_lower -= sizeof(ItemIdData); |
| } |
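| |
| /* |
| * To illustrate _bt_slideleft() (a sketch): on the rightmost page, the |
| * line pointer array goes from |
| * |
| * [P_HIKEY: unused] [P_FIRSTKEY: item1] [item2] ... [itemN] |
| * |
| * to |
| * |
| * [P_HIKEY: item1] [item2] ... [itemN] |
| * |
| * with pd_lower shrunk by one ItemIdData. Only the line pointers move; |
| * the tuples themselves stay where they are. |
| */ |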
| |
| /* |
| * Add an item to a page being built. |
| * |
| * This is very similar to nbtinsert.c's _bt_pgaddtup(), but this variant |
| * raises an error directly. |
| * |
| * Note that our nbtsort.c caller does not know yet if the page will be |
| * rightmost. Offset P_FIRSTKEY is always assumed to be the first data key by |
| * caller. A page that turns out to be the rightmost on its level is fixed by |
| * calling _bt_slideleft(). |
| */ |
| static void |
| _bt_sortaddtup(Page page, |
| Size itemsize, |
| IndexTuple itup, |
| OffsetNumber itup_off, |
| bool newfirstdataitem) |
| { |
| IndexTupleData trunctuple; |
| |
| if (newfirstdataitem) |
| { |
| trunctuple = *itup; |
| trunctuple.t_info = sizeof(IndexTupleData); |
| BTreeTupleSetNAtts(&trunctuple, 0, false); |
| itup = &trunctuple; |
| itemsize = sizeof(IndexTupleData); |
| } |
| |
| if (PageAddItem(page, (Item) itup, itemsize, itup_off, |
| false, false) == InvalidOffsetNumber) |
| elog(ERROR, "failed to add item to the index page"); |
| } |
| |
| /*---------- |
| * Add an item to a disk page from the sort output (or add a posting list |
| * item formed from the sort output). |
| * |
| * We must be careful to observe the page layout conventions of nbtsearch.c: |
| * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY. |
| * - on non-leaf pages, the key portion of the first item need not be |
| * stored, we should store only the link. |
| * |
| * A leaf page being built looks like: |
| * |
| * +----------------+---------------------------------+ |
| * | PageHeaderData | linp0 linp1 linp2 ...           | |
| * +-----------+----+---------------------------------+ |
| * | ... linpN |                                      | |
| * +-----------+--------------------------------------+ |
| * |     ^ last                                       | |
| * |                                                  | |
| * +-------------+------------------------------------+ |
| * |             | itemN ...                          | |
| * +-------------+------------------+-----------------+ |
| * |          ... item3 item2 item1 | "special space" | |
| * +--------------------------------+-----------------+ |
| * |
| * Contrast this with the diagram in bufpage.h; note the mismatch |
| * between linps and items. This is because we reserve linp0 as a |
| * placeholder for the pointer to the "high key" item; when we have |
| * filled up the page, we will set linp0 to point to itemN and clear |
| * linpN. On the other hand, if we find this is the last (rightmost) |
| * page, we leave the items alone and slide the linp array over. If |
| * the high key is to be truncated, offset 1 is deleted, and we insert |
| * the truncated high key at offset 1. |
| * |
| * 'last' pointer indicates the last offset added to the page. |
| * |
| * 'truncextra' is the size of the posting list in itup, if any. This |
| * information is stashed for the next call here, when we may benefit |
| * from considering the impact of truncating away the posting list on |
| * the page before deciding to finish the page off. Posting lists are |
| * often relatively large, so it is worth going to the trouble of |
| * accounting for the saving from truncating away the posting list of |
| * the tuple that becomes the high key (that may be the only way to |
| * get close to target free space on the page). Note that this is |
| * only used for the soft fillfactor-wise limit, not the critical hard |
| * limit. |
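| * |
| * For example (illustrative numbers only): if itup is a 200-byte posting |
| * list tuple whose posting list takes up 150 bytes, truncextra is 150. |
| * Should that tuple later become the high key, suffix truncation strips |
| * the posting list, so the soft "full" test credits those 150 bytes back |
| * to the page's free space. |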
| *---------- |
| */ |
| static void |
| _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup, |
| Size truncextra) |
| { |
| Page npage; |
| BlockNumber nblkno; |
| OffsetNumber last_off; |
| Size last_truncextra; |
| Size pgspc; |
| Size itupsz; |
| bool isleaf; |
| |
| /* |
| * This is a handy place to check for cancel interrupts during the btree |
| * load phase of index creation. |
| */ |
| CHECK_FOR_INTERRUPTS(); |
| |
| npage = state->btps_page; |
| nblkno = state->btps_blkno; |
| last_off = state->btps_lastoff; |
| last_truncextra = state->btps_lastextra; |
| state->btps_lastextra = truncextra; |
| |
| pgspc = PageGetFreeSpace(npage); |
| itupsz = IndexTupleSize(itup); |
| itupsz = MAXALIGN(itupsz); |
| /* Leaf case has slightly different rules due to suffix truncation */ |
| isleaf = (state->btps_level == 0); |
| |
| /* |
| * Check whether the new item can fit on a btree page on current level at |
| * all. |
| * |
| * Every newly built index will treat heap TID as part of the keyspace, |
| * which imposes the requirement that new high keys must occasionally have |
| * a heap TID appended within _bt_truncate(). That may leave a new pivot |
| * tuple one or two MAXALIGN() quantums larger than the original |
| * firstright tuple it's derived from. v4 deals with the problem by |
| * decreasing the limit on the size of tuples inserted on the leaf level |
| * by the same small amount. Enforce the new v4+ limit on the leaf level, |
| * and the old limit on internal levels, since pivot tuples may need to |
| * make use of the reserved space. This should never fail on internal |
| * pages. |
| */ |
| if (unlikely(itupsz > BTMaxItemSize(npage))) |
| _bt_check_third_page(wstate->index, wstate->heap, isleaf, npage, |
| itup); |
| |
| /* |
| * Check to see if current page will fit new item, with space left over to |
| * append a heap TID during suffix truncation when page is a leaf page. |
| * |
| * It is guaranteed that we can fit at least 2 non-pivot tuples plus a |
| * high key with heap TID when finishing off a leaf page, since we rely on |
| * _bt_check_third_page() rejecting oversized non-pivot tuples. On |
| * internal pages we can always fit 3 pivot tuples with larger internal |
| * page tuple limit (includes page high key). |
| * |
| * Most of the time, a page is only "full" in the sense that the soft |
| * fillfactor-wise limit has been exceeded. However, we must always leave |
| * at least two items plus a high key on each page before starting a new |
| * page. Disregard fillfactor and insert on "full" current page if we |
| * don't have the minimum number of items yet. (Note that we deliberately |
| * assume that suffix truncation neither enlarges nor shrinks new high key |
| * when applying soft limit, except when last tuple has a posting list.) |
| */ |
| Assert(last_truncextra == 0 || isleaf); |
| if (pgspc < itupsz + (isleaf ? MAXALIGN(sizeof(ItemPointerData)) : 0) || |
| (pgspc + last_truncextra < state->btps_full && last_off > P_FIRSTKEY)) |
| { |
| /* |
| * Finish off the page and write it out. |
| */ |
| Page opage = npage; |
| BlockNumber oblkno = nblkno; |
| ItemId ii; |
| ItemId hii; |
| IndexTuple oitup; |
| |
| /* Create new page of same level */ |
| npage = _bt_blnewpage(state->btps_level); |
| |
| /* and assign it a page position */ |
| nblkno = wstate->btws_pages_alloced++; |
| |
| /* |
| * We copy the last item on the page into the new page, and then |
| * rearrange the old page so that the 'last item' becomes its high key |
| * rather than a true data item. There had better be at least two |
| * items on the page already, else the page would be empty of useful |
| * data. |
| */ |
| Assert(last_off > P_FIRSTKEY); |
| ii = PageGetItemId(opage, last_off); |
| oitup = (IndexTuple) PageGetItem(opage, ii); |
| _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY, |
| !isleaf); |
| |
| /* |
| * Move 'last' into the high key position on opage. _bt_blnewpage() |
| * allocated empty space for a line pointer when opage was first |
| * created, so this is a matter of rearranging already-allocated space |
| * on page, and initializing high key line pointer. (Actually, leaf |
| * pages must also swap oitup with a truncated version of oitup, which |
| * is sometimes larger than oitup, though never by more than the space |
| * needed to append a heap TID.) |
| */ |
| hii = PageGetItemId(opage, P_HIKEY); |
| *hii = *ii; |
| ItemIdSetUnused(ii); /* redundant */ |
| ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData); |
| |
| if (isleaf) |
| { |
| IndexTuple lastleft; |
| IndexTuple truncated; |
| |
| /* |
| * Truncate away any unneeded attributes from high key on leaf |
| * level. This is only done at the leaf level because downlinks |
| * in internal pages are either negative infinity items, or get |
| * their contents from copying from one level down. See also: |
| * _bt_split(). |
| * |
| * We don't try to bias our choice of split point to make it more |
| * likely that _bt_truncate() can truncate away more attributes, |
| * whereas the split point used within _bt_split() is chosen much |
| * more delicately. Even still, the lastleft and firstright |
| * tuples passed to _bt_truncate() here are at least not fully |
| * equal to each other when deduplication is used, unless there is |
| * a large group of duplicates (also, unique index builds usually |
| * have few or no spool2 duplicates). When the split point is |
| * between two unequal tuples, _bt_truncate() will avoid including |
| * a heap TID in the new high key, which is the most important |
| * benefit of suffix truncation. |
| * |
| * Overwrite the old item with new truncated high key directly. |
| * oitup is already located at the physical beginning of tuple |
| * space, so this should directly reuse the existing tuple space. |
| */ |
| ii = PageGetItemId(opage, OffsetNumberPrev(last_off)); |
| lastleft = (IndexTuple) PageGetItem(opage, ii); |
| |
| Assert(IndexTupleSize(oitup) > last_truncextra); |
| truncated = _bt_truncate(wstate->index, lastleft, oitup, |
| wstate->inskey); |
| if (!PageIndexTupleOverwrite(opage, P_HIKEY, (Item) truncated, |
| IndexTupleSize(truncated))) |
| elog(ERROR, "failed to add high key to the index page"); |
| pfree(truncated); |
| |
| /* oitup should continue to point to the page's high key */ |
| hii = PageGetItemId(opage, P_HIKEY); |
| oitup = (IndexTuple) PageGetItem(opage, hii); |
| } |
| |
| /* |
| * Link the old page into its parent, using its low key. If we don't |
| * have a parent, we have to create one; this adds a new btree level. |
| */ |
| if (state->btps_next == NULL) |
| state->btps_next = _bt_pagestate(wstate, state->btps_level + 1); |
| |
| Assert((BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) <= |
| IndexRelationGetNumberOfKeyAttributes(wstate->index) && |
| BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) > 0) || |
| P_LEFTMOST(BTPageGetOpaque(opage))); |
| Assert(BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) == 0 || |
| !P_LEFTMOST(BTPageGetOpaque(opage))); |
| BTreeTupleSetDownLink(state->btps_lowkey, oblkno); |
| _bt_buildadd(wstate, state->btps_next, state->btps_lowkey, 0); |
| pfree(state->btps_lowkey); |
| |
| /* |
| * Save a copy of the high key from the old page. It is also the low |
| * key for the new page. |
| */ |
| state->btps_lowkey = CopyIndexTuple(oitup); |
| |
| /* |
| * Set the sibling links for both pages. |
| */ |
| { |
| BTPageOpaque oopaque = BTPageGetOpaque(opage); |
| BTPageOpaque nopaque = BTPageGetOpaque(npage); |
| |
| oopaque->btpo_next = nblkno; |
| nopaque->btpo_prev = oblkno; |
| nopaque->btpo_next = P_NONE; /* redundant */ |
| } |
| |
| /* |
| * Write out the old page. We never need to touch it again, so we can |
| * free the opage workspace too. |
| */ |
| _bt_blwritepage(wstate, opage, oblkno); |
| |
| /* |
| * Reset last_off to point to new page |
| */ |
| last_off = P_FIRSTKEY; |
| } |
| |
| /* |
| * By here, either the original page is still the current page, or a new |
| * page was created that became the current page. Either way, the current |
| * page definitely has space for the new item. |
| * |
| * If the new item is the first for its page, it must also be the first |
| * item on its entire level. On later same-level pages, a low key for a |
| * page will be copied from the prior page in the code above. Generate a |
| * minus infinity low key here instead. |
| */ |
| if (last_off == P_HIKEY) |
| { |
| Assert(state->btps_lowkey == NULL); |
| state->btps_lowkey = palloc0(sizeof(IndexTupleData)); |
| state->btps_lowkey->t_info = sizeof(IndexTupleData); |
| BTreeTupleSetNAtts(state->btps_lowkey, 0, false); |
| } |
| |
| /* |
| * Add the new item into the current page. |
| */ |
| last_off = OffsetNumberNext(last_off); |
| _bt_sortaddtup(npage, itupsz, itup, last_off, |
| !isleaf && last_off == P_FIRSTKEY); |
| |
| state->btps_page = npage; |
| state->btps_blkno = nblkno; |
| state->btps_lastoff = last_off; |
| } |
| |
| /* |
| * Finalize pending posting list tuple, and add it to the index. Final tuple |
| * is based on saved base tuple, and saved list of heap TIDs. |
| * |
| * This is almost like _bt_dedup_finish_pending(), but it adds a new tuple |
| * using _bt_buildadd(). |
| */ |
| static void |
| _bt_sort_dedup_finish_pending(BTWriteState *wstate, BTPageState *state, |
| BTDedupState dstate) |
| { |
| Assert(dstate->nitems > 0); |
| |
| if (dstate->nitems == 1) |
| _bt_buildadd(wstate, state, dstate->base, 0); |
| else |
| { |
| IndexTuple postingtuple; |
| Size truncextra; |
| |
| /* form a tuple with a posting list */ |
| postingtuple = _bt_form_posting(dstate->base, |
| dstate->htids, |
| dstate->nhtids); |
| /* Calculate posting list overhead */ |
| truncextra = IndexTupleSize(postingtuple) - |
| BTreeTupleGetPostingOffset(postingtuple); |
| |
| _bt_buildadd(wstate, state, postingtuple, truncextra); |
| pfree(postingtuple); |
| } |
| |
| dstate->nmaxitems = 0; |
| dstate->nhtids = 0; |
| dstate->nitems = 0; |
| dstate->phystupsize = 0; |
| } |
| |
| /* |
| * Finish writing out the completed btree. |
| */ |
| static void |
| _bt_uppershutdown(BTWriteState *wstate, BTPageState *state) |
| { |
| BTPageState *s; |
| BlockNumber rootblkno = P_NONE; |
| uint32 rootlevel = 0; |
| Page metapage; |
| |
| /* |
| * Each iteration of this loop completes one more level of the tree. |
| */ |
| for (s = state; s != NULL; s = s->btps_next) |
| { |
| BlockNumber blkno; |
| BTPageOpaque opaque; |
| |
| blkno = s->btps_blkno; |
| opaque = BTPageGetOpaque(s->btps_page); |
| |
| /* |
| * We have to link the last page on this level to somewhere. |
| * |
| * If we're at the top, it's the root, so attach it to the metapage. |
| * Otherwise, add an entry for it to its parent using its low key. |
| * This may cause the last page of the parent level to split, but |
| * that's not a problem -- we haven't gotten to it yet. |
| */ |
| if (s->btps_next == NULL) |
| { |
| opaque->btpo_flags |= BTP_ROOT; |
| rootblkno = blkno; |
| rootlevel = s->btps_level; |
| } |
| else |
| { |
| Assert((BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) <= |
| IndexRelationGetNumberOfKeyAttributes(wstate->index) && |
| BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) > 0) || |
| P_LEFTMOST(opaque)); |
| Assert(BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) == 0 || |
| !P_LEFTMOST(opaque)); |
| BTreeTupleSetDownLink(s->btps_lowkey, blkno); |
| _bt_buildadd(wstate, s->btps_next, s->btps_lowkey, 0); |
| pfree(s->btps_lowkey); |
| s->btps_lowkey = NULL; |
| } |
| |
| /* |
| * This is the rightmost page, so the ItemId array needs to be slid |
| * back one slot. Then we can dump out the page. |
| */ |
| _bt_slideleft(s->btps_page); |
| _bt_blwritepage(wstate, s->btps_page, s->btps_blkno); |
| s->btps_page = NULL; /* writepage freed the workspace */ |
| } |
| |
| /* |
| * As the last step in the process, construct the metapage and make it |
| * point to the new root (unless we had no data at all, in which case it's |
| * set to point to "P_NONE"). This changes the index to the "valid" state |
| * by filling in a valid magic number in the metapage. |
| */ |
| metapage = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0); |
| _bt_initmetapage(metapage, rootblkno, rootlevel, |
| wstate->inskey->allequalimage); |
| _bt_blwritepage(wstate, metapage, BTREE_METAPAGE); |
| } |
| |
| /* |
| * Read tuples in correct sort order from tuplesort, and load them into |
| * btree leaves. |
| */ |
| static void |
| _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) |
| { |
| BTPageState *state = NULL; |
| bool merge = (btspool2 != NULL); |
| IndexTuple itup, |
| itup2 = NULL; |
| bool load1; |
| TupleDesc tupdes = RelationGetDescr(wstate->index); |
| int i, |
| keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index); |
| SortSupport sortKeys; |
| int64 tuples_done = 0; |
| bool deduplicate; |
| |
| deduplicate = wstate->inskey->allequalimage && !btspool->isunique && |
| BTGetDeduplicateItems(wstate->index); |
| |
| if (merge) |
| { |
| /* |
| * Another BTSpool for dead tuples exists. Now we have to merge |
| * btspool and btspool2. |
| */ |
| |
| /* prepare for the merge by fetching the first tuple from each spool */ |
| itup = tuplesort_getindextuple(btspool->sortstate, true); |
| itup2 = tuplesort_getindextuple(btspool2->sortstate, true); |
| |
| /* Prepare SortSupport data for each column */ |
| sortKeys = (SortSupport) palloc0(keysz * sizeof(SortSupportData)); |
| |
| for (i = 0; i < keysz; i++) |
| { |
| SortSupport sortKey = sortKeys + i; |
| ScanKey scanKey = wstate->inskey->scankeys + i; |
| int16 strategy; |
| |
| sortKey->ssup_cxt = CurrentMemoryContext; |
| sortKey->ssup_collation = scanKey->sk_collation; |
| sortKey->ssup_nulls_first = |
| (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0; |
| sortKey->ssup_attno = scanKey->sk_attno; |
| /* Abbreviation is not supported here */ |
| sortKey->abbreviate = false; |
| |
| Assert(sortKey->ssup_attno != 0); |
| |
| strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ? |
| BTGreaterStrategyNumber : BTLessStrategyNumber; |
| |
| PrepareSortSupportFromIndexRel(wstate->index, strategy, sortKey); |
| } |
| |
| for (;;) |
| { |
| load1 = true; /* load BTSpool next ? */ |
| if (itup2 == NULL) |
| { |
| if (itup == NULL) |
| break; |
| } |
| else if (itup != NULL) |
| { |
| int32 compare = 0; |
| |
| for (i = 1; i <= keysz; i++) |
| { |
| SortSupport entry; |
| Datum attrDatum1, |
| attrDatum2; |
| bool isNull1, |
| isNull2; |
| |
| entry = sortKeys + i - 1; |
| attrDatum1 = index_getattr(itup, i, tupdes, &isNull1); |
| attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2); |
| |
| compare = ApplySortComparator(attrDatum1, isNull1, |
| attrDatum2, isNull2, |
| entry); |
| if (compare > 0) |
| { |
| load1 = false; |
| break; |
| } |
| else if (compare < 0) |
| break; |
| } |
| |
| /* |
| * If key values are equal, we sort on ItemPointer. This is |
| * required for btree indexes, since heap TID is treated as an |
| * implicit last key attribute in order to ensure that all |
| * keys in the index are physically unique. |
| */ |
| if (compare == 0) |
| { |
| compare = ItemPointerCompare(&itup->t_tid, &itup2->t_tid); |
| Assert(compare != 0); |
| if (compare > 0) |
| load1 = false; |
| } |
| } |
| else |
| load1 = false; |
| |
| /* When we see first tuple, create first index page */ |
| if (state == NULL) |
| state = _bt_pagestate(wstate, 0); |
| |
| if (load1) |
| { |
| _bt_buildadd(wstate, state, itup, 0); |
| itup = tuplesort_getindextuple(btspool->sortstate, true); |
| } |
| else |
| { |
| _bt_buildadd(wstate, state, itup2, 0); |
| itup2 = tuplesort_getindextuple(btspool2->sortstate, true); |
| } |
| |
| /* Report progress */ |
| pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE, |
| ++tuples_done); |
| } |
| pfree(sortKeys); |
| } |
| else if (deduplicate) |
| { |
| /* merge is unnecessary, deduplicate into posting lists */ |
| BTDedupState dstate; |
| |
| dstate = (BTDedupState) palloc(sizeof(BTDedupStateData)); |
| dstate->deduplicate = true; /* unused */ |
| dstate->nmaxitems = 0; /* unused */ |
| dstate->maxpostingsize = 0; /* set later */ |
| /* Metadata about base tuple of current pending posting list */ |
| dstate->base = NULL; |
| dstate->baseoff = InvalidOffsetNumber; /* unused */ |
| dstate->basetupsize = 0; |
| /* Metadata about current pending posting list TIDs */ |
| dstate->htids = NULL; |
| dstate->nhtids = 0; |
| dstate->nitems = 0; |
| dstate->phystupsize = 0; /* unused */ |
| dstate->nintervals = 0; /* unused */ |
| |
| while ((itup = tuplesort_getindextuple(btspool->sortstate, |
| true)) != NULL) |
| { |
| /* When we see first tuple, create first index page */ |
| if (state == NULL) |
| { |
| state = _bt_pagestate(wstate, 0); |
| |
| /* |
| * Limit size of posting list tuples to 1/10 space we want to |
| * leave behind on the page, plus space for final item's line |
| * pointer. This is equal to the space that we'd like to |
| * leave behind on each leaf page when fillfactor is 90, |
| * allowing us to get close to fillfactor% space utilization |
| * when there happen to be a great many duplicates. (This |
| * makes higher leaf fillfactor settings ineffective when |
| * building indexes that have many duplicates, but packing |
| * leaf pages full with few very large tuples doesn't seem |
| * like a useful goal.) |
| */ |
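| /* |
| * Worked example, assuming BLCKSZ = 8192, 8-byte MAXALIGN, and 4-byte |
| * ItemIdData: BLCKSZ * 10 / 100 = 819, MAXALIGN_DOWN(819) = 816, so |
| * maxpostingsize ends up as 816 - 4 = 812 bytes. |
| */ |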
| dstate->maxpostingsize = MAXALIGN_DOWN((BLCKSZ * 10 / 100)) - |
| sizeof(ItemIdData); |
| Assert(dstate->maxpostingsize <= BTMaxItemSize(state->btps_page) && |
| dstate->maxpostingsize <= INDEX_SIZE_MASK); |
| dstate->htids = palloc(dstate->maxpostingsize); |
| |
| /* start new pending posting list with itup copy */ |
| _bt_dedup_start_pending(dstate, CopyIndexTuple(itup), |
| InvalidOffsetNumber); |
| } |
| else if (_bt_keep_natts_fast(wstate->index, dstate->base, |
| itup) > keysz && |
| _bt_dedup_save_htid(dstate, itup)) |
| { |
| /* |
| * Tuple is equal to base tuple of pending posting list. Heap |
| * TID from itup has been saved in state. |
| */ |
| } |
| else |
| { |
| /* |
| * Tuple is not equal to pending posting list tuple, or |
| * _bt_dedup_save_htid() opted to not merge current item into |
| * pending posting list. |
| */ |
| _bt_sort_dedup_finish_pending(wstate, state, dstate); |
| pfree(dstate->base); |
| |
| /* start new pending posting list with itup copy */ |
| _bt_dedup_start_pending(dstate, CopyIndexTuple(itup), |
| InvalidOffsetNumber); |
| } |
| |
| /* Report progress */ |
| pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE, |
| ++tuples_done); |
| } |
| |
| if (state) |
| { |
| /* |
| * Handle the last item (there must be a last item when the |
| * tuplesort returned one or more tuples) |
| */ |
| _bt_sort_dedup_finish_pending(wstate, state, dstate); |
| pfree(dstate->base); |
| pfree(dstate->htids); |
| } |
| |
| pfree(dstate); |
| } |
| else |
| { |
| /* merging and deduplication are both unnecessary */ |
| while ((itup = tuplesort_getindextuple(btspool->sortstate, |
| true)) != NULL) |
| { |
| /* When we see first tuple, create first index page */ |
| if (state == NULL) |
| state = _bt_pagestate(wstate, 0); |
| |
| _bt_buildadd(wstate, state, itup, 0); |
| |
| /* Report progress */ |
| pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE, |
| ++tuples_done); |
| } |
| } |
| |
| /* Close down final pages and write the metapage */ |
| _bt_uppershutdown(wstate, state); |
| |
| /* |
| * When we WAL-logged index pages, we must nonetheless fsync index files. |
| * Since we're building outside shared buffers, a CHECKPOINT occurring |
| * during the build has no way to flush the previously written data to |
| * disk (indeed it won't know the index even exists). A crash later on |
| * would replay WAL from the checkpoint, therefore it wouldn't replay our |
| * earlier WAL entries. If we do not fsync those pages here, they might |
| * still not be on disk when the crash occurs. |
| */ |
| if (wstate->btws_use_wal) |
| smgrimmedsync(RelationGetSmgr(wstate->index), MAIN_FORKNUM); |
| } |
| |
| /* |
| * Create parallel context, and launch workers for leader. |
| * |
| * buildstate argument should be initialized (with the exception of the |
| * tuplesort state in spools, which may later be created based on shared |
| * state initially set up here). |
| * |
| * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY. |
| * |
| * request is the target number of parallel worker processes to launch. |
| * |
| * Sets buildstate's BTLeader, which caller must use to shut down parallel |
| * mode by passing it to _bt_end_parallel() at the very end of its index |
| * build. If not even a single worker process can be launched, this is |
| * never set, and caller should proceed with a serial index build. |
| */ |
| static void |
| _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) |
| { |
| ParallelContext *pcxt; |
| int scantuplesortstates; |
| Snapshot snapshot; |
| Size estbtshared; |
| Size estsort; |
| BTShared *btshared; |
| Sharedsort *sharedsort; |
| Sharedsort *sharedsort2; |
| BTSpool *btspool = buildstate->spool; |
| BTLeader *btleader = (BTLeader *) palloc0(sizeof(BTLeader)); |
| WalUsage *walusage; |
| BufferUsage *bufferusage; |
| bool leaderparticipates = true; |
| int querylen; |
| |
| #ifdef DISABLE_LEADER_PARTICIPATION |
| leaderparticipates = false; |
| #endif |
| |
| /* |
| * Enter parallel mode, and create context for parallel build of btree |
| * index |
| */ |
| EnterParallelMode(); |
| Assert(request > 0); |
| pcxt = CreateParallelContext("postgres", "_bt_parallel_build_main", |
| request); |
| |
| scantuplesortstates = leaderparticipates ? request + 1 : request; |
| |
| /* |
| * Prepare for scan of the base relation. In a normal index build, we use |
| * SnapshotAny because we must retrieve all tuples and do our own time |
| * qual checks (because we have to index RECENTLY_DEAD tuples). In a |
| * concurrent build, we take a regular MVCC snapshot and index whatever's |
| * live according to that. |
| */ |
| if (!isconcurrent) |
| snapshot = SnapshotAny; |
| else |
| snapshot = RegisterSnapshot(GetTransactionSnapshot()); |
| |
| /* |
| * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and |
| * PARALLEL_KEY_TUPLESORT tuplesort workspace |
| */ |
| estbtshared = _bt_parallel_estimate_shared(btspool->heap, snapshot); |
| shm_toc_estimate_chunk(&pcxt->estimator, estbtshared); |
| estsort = tuplesort_estimate_shared(scantuplesortstates); |
| shm_toc_estimate_chunk(&pcxt->estimator, estsort); |
| |
| /* |
| * Unique case requires a second spool, and so we may have to account for |
| * another shared workspace for that -- PARALLEL_KEY_TUPLESORT_SPOOL2 |
| */ |
| if (!btspool->isunique) |
| shm_toc_estimate_keys(&pcxt->estimator, 2); |
| else |
| { |
| shm_toc_estimate_chunk(&pcxt->estimator, estsort); |
| shm_toc_estimate_keys(&pcxt->estimator, 3); |
| } |
| |
| /* |
| * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE |
| * and PARALLEL_KEY_BUFFER_USAGE. |
| * |
| * If there are no extensions loaded that care, we could skip this. We |
| * have no way of knowing whether anyone's looking at pgWalUsage or |
| * pgBufferUsage, so do it unconditionally. |
| */ |
| shm_toc_estimate_chunk(&pcxt->estimator, |
| mul_size(sizeof(WalUsage), pcxt->nworkers)); |
| shm_toc_estimate_keys(&pcxt->estimator, 1); |
| shm_toc_estimate_chunk(&pcxt->estimator, |
| mul_size(sizeof(BufferUsage), pcxt->nworkers)); |
| shm_toc_estimate_keys(&pcxt->estimator, 1); |
| |
| /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */ |
| if (debug_query_string) |
| { |
| querylen = strlen(debug_query_string); |
| shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1); |
| shm_toc_estimate_keys(&pcxt->estimator, 1); |
| } |
| else |
| querylen = 0; /* keep compiler quiet */ |
| |
| /* Everyone's had a chance to ask for space, so now create the DSM */ |
| InitializeParallelDSM(pcxt); |
| |
| /* If no DSM segment was available, back out (do serial build) */ |
| if (pcxt->seg == NULL) |
| { |
| if (IsMVCCSnapshot(snapshot)) |
| UnregisterSnapshot(snapshot); |
| DestroyParallelContext(pcxt); |
| ExitParallelMode(); |
| return; |
| } |
| |
| /* Store shared build state, for which we reserved space */ |
| btshared = (BTShared *) shm_toc_allocate(pcxt->toc, estbtshared); |
| /* Initialize immutable state */ |
| btshared->heaprelid = RelationGetRelid(btspool->heap); |
| btshared->indexrelid = RelationGetRelid(btspool->index); |
| btshared->isunique = btspool->isunique; |
| btshared->nulls_not_distinct = btspool->nulls_not_distinct; |
| btshared->isconcurrent = isconcurrent; |
| btshared->scantuplesortstates = scantuplesortstates; |
| ConditionVariableInit(&btshared->workersdonecv); |
| SpinLockInit(&btshared->mutex); |
| /* Initialize mutable state */ |
| btshared->nparticipantsdone = 0; |
| btshared->reltuples = 0.0; |
| btshared->havedead = false; |
| btshared->indtuples = 0.0; |
| btshared->brokenhotchain = false; |
| table_parallelscan_initialize(btspool->heap, |
| ParallelTableScanFromBTShared(btshared), |
| snapshot); |
| |
| /* |
| * Store shared tuplesort-private state, for which we reserved space. |
| * Then, initialize opaque state using tuplesort routine. |
| */ |
| sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort); |
| tuplesort_initialize_shared(sharedsort, scantuplesortstates, |
| pcxt->seg); |
| |
| shm_toc_insert(pcxt->toc, PARALLEL_KEY_BTREE_SHARED, btshared); |
| shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort); |
| |
| /* Unique case requires a second spool, and associated shared state */ |
| if (!btspool->isunique) |
| sharedsort2 = NULL; |
| else |
| { |
| /* |
| * Store additional shared tuplesort-private state, for which we |
| * reserved space. Then, initialize opaque state using tuplesort |
| * routine. |
| */ |
| sharedsort2 = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort); |
| tuplesort_initialize_shared(sharedsort2, scantuplesortstates, |
| pcxt->seg); |
| |
| shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT_SPOOL2, sharedsort2); |
| } |
| |
| /* Store query string for workers */ |
| if (debug_query_string) |
| { |
| char *sharedquery; |
| |
| sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1); |
| memcpy(sharedquery, debug_query_string, querylen + 1); |
| shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, sharedquery); |
| } |
| |
| /* |
| * Allocate space for each worker's WalUsage and BufferUsage; no need to |
| * initialize. |
| */ |
| walusage = shm_toc_allocate(pcxt->toc, |
| mul_size(sizeof(WalUsage), pcxt->nworkers)); |
| shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage); |
| bufferusage = shm_toc_allocate(pcxt->toc, |
| mul_size(sizeof(BufferUsage), pcxt->nworkers)); |
| shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage); |
| |
| /* Launch workers, saving status for leader/caller */ |
| LaunchParallelWorkers(pcxt); |
| btleader->pcxt = pcxt; |
| btleader->nparticipanttuplesorts = pcxt->nworkers_launched; |
| if (leaderparticipates) |
| btleader->nparticipanttuplesorts++; |
| btleader->btshared = btshared; |
| btleader->sharedsort = sharedsort; |
| btleader->sharedsort2 = sharedsort2; |
| btleader->snapshot = snapshot; |
| btleader->walusage = walusage; |
| btleader->bufferusage = bufferusage; |
| |
| /* If no workers were successfully launched, back out (do serial build) */ |
| if (pcxt->nworkers_launched == 0) |
| { |
| _bt_end_parallel(btleader); |
| return; |
| } |
| |
| /* Save leader state now that it's clear build will be parallel */ |
| buildstate->btleader = btleader; |
| |
| /* Join heap scan ourselves */ |
| if (leaderparticipates) |
| _bt_leader_participate_as_worker(buildstate); |
| |
| /* |
| * Caller needs to wait for all launched workers when we return. Make |
| * sure that the failure-to-start case will not hang forever. |
| */ |
| WaitForParallelWorkersToAttach(pcxt); |
| } |
| |
| /* |
| * Shut down workers, destroy parallel context, and end parallel mode. |
| */ |
| static void |
| _bt_end_parallel(BTLeader *btleader) |
| { |
| int i; |
| |
| /* Shutdown worker processes */ |
| WaitForParallelWorkersToFinish(btleader->pcxt); |
| |
| /* |
| * Next, accumulate WAL and buffer usage. (This must wait for the workers |
| * to finish, or we might get incomplete data.) |
| */ |
| for (i = 0; i < btleader->pcxt->nworkers_launched; i++) |
| InstrAccumParallelQuery(&btleader->bufferusage[i], &btleader->walusage[i]); |
| |
| /* Free last reference to MVCC snapshot, if one was used */ |
| if (IsMVCCSnapshot(btleader->snapshot)) |
| UnregisterSnapshot(btleader->snapshot); |
| DestroyParallelContext(btleader->pcxt); |
| ExitParallelMode(); |
| } |
| |
| /* |
| * Returns size of shared memory required to store state for a parallel |
| * btree index build based on the snapshot its parallel scan will use. |
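| * |
| * The caller pairs this estimate with a matching TOC reservation, roughly |
| * (a sketch of what _bt_begin_parallel() does above): |
| * |
| * estbtshared = _bt_parallel_estimate_shared(btspool->heap, snapshot); |
| * shm_toc_estimate_chunk(&pcxt->estimator, estbtshared); |
| * shm_toc_estimate_keys(&pcxt->estimator, 1); |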
| */ |
| static Size |
| _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot) |
| { |
| /* cf. shm_toc_allocate as to why BUFFERALIGN is used */ |
| return add_size(BUFFERALIGN(sizeof(BTShared)), |
| table_parallelscan_estimate(heap, snapshot)); |
| } |
| |
| /* |
| * Within leader, wait for end of heap scan. |
| * |
| * When called, the parallel heap scan started by _bt_begin_parallel() will |
| * already be underway within worker processes (when the leader participates |
| * as a worker, we should end up here just as workers are finishing). |
| * |
| * Fills in fields needed for ambuild statistics, and lets caller set |
| * field indicating that some worker encountered a broken HOT chain. |
| * |
| * Returns the total number of heap tuples scanned. |
| */ |
| static double |
| _bt_parallel_heapscan(BTBuildState *buildstate, bool *brokenhotchain) |
| { |
| BTShared *btshared = buildstate->btleader->btshared; |
| int nparticipanttuplesorts; |
| double reltuples; |
| |
| nparticipanttuplesorts = buildstate->btleader->nparticipanttuplesorts; |
| for (;;) |
| { |
| SpinLockAcquire(&btshared->mutex); |
| if (btshared->nparticipantsdone == nparticipanttuplesorts) |
| { |
| buildstate->havedead = btshared->havedead; |
| buildstate->indtuples = btshared->indtuples; |
| *brokenhotchain = btshared->brokenhotchain; |
| reltuples = btshared->reltuples; |
| SpinLockRelease(&btshared->mutex); |
| break; |
| } |
| SpinLockRelease(&btshared->mutex); |
| |
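| /* |
| * Wait to be signalled by a participant; recheck the done-counter under |
| * the spinlock after waking, since condition-variable wakeups can be |
| * spurious. |
| */ |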
| ConditionVariableSleep(&btshared->workersdonecv, |
| WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN); |
| } |
| |
| ConditionVariableCancelSleep(); |
| |
| return reltuples; |
| } |
| |
| /* |
| * Within leader, participate as a parallel worker. |
| */ |
| static void |
| _bt_leader_participate_as_worker(BTBuildState *buildstate) |
| { |
| BTLeader *btleader = buildstate->btleader; |
| BTSpool *leaderworker; |
| BTSpool *leaderworker2; |
| int sortmem; |
| |
| /* Allocate memory and initialize private spool */ |
| leaderworker = (BTSpool *) palloc0(sizeof(BTSpool)); |
| leaderworker->heap = buildstate->spool->heap; |
| leaderworker->index = buildstate->spool->index; |
| leaderworker->isunique = buildstate->spool->isunique; |
| leaderworker->nulls_not_distinct = buildstate->spool->nulls_not_distinct; |
| |
| /* Initialize second spool, if required */ |
| if (!btleader->btshared->isunique) |
| leaderworker2 = NULL; |
| else |
| { |
| /* Allocate memory for worker's own private secondary spool */ |
| leaderworker2 = (BTSpool *) palloc0(sizeof(BTSpool)); |
| |
| /* Initialize worker's own secondary spool */ |
| leaderworker2->heap = leaderworker->heap; |
| leaderworker2->index = leaderworker->index; |
| leaderworker2->isunique = false; |
| } |
| |
| /* |
| * Might as well use a reliable figure when doling out maintenance_work_mem |
| * (when the requested number of workers were not launched, this will be |
| * somewhat higher than it is for other workers). |
| */ |
| sortmem = maintenance_work_mem / btleader->nparticipanttuplesorts; |
| |
| /* Perform work common to all participants */ |
| _bt_parallel_scan_and_sort(leaderworker, leaderworker2, btleader->btshared, |
| btleader->sharedsort, btleader->sharedsort2, |
| sortmem, true); |
| |
| #ifdef BTREE_BUILD_STATS |
| if (log_btree_build_stats) |
| { |
| ShowUsage("BTREE BUILD (Leader Partial Spool) STATISTICS"); |
| ResetUsage(); |
| } |
| #endif /* BTREE_BUILD_STATS */ |
| } |
| |
| /* |
| * Perform work within a launched parallel process. |
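| * |
| * This is the entry point that _bt_begin_parallel() registers via |
| * CreateParallelContext(); the parallel machinery launches each worker |
| * on this function, passing the DSM segment and TOC the leader set up. |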
| */ |
| void |
| _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc) |
| { |
| char *sharedquery; |
| BTSpool *btspool; |
| BTSpool *btspool2; |
| BTShared *btshared; |
| Sharedsort *sharedsort; |
| Sharedsort *sharedsort2; |
| Relation heapRel; |
| Relation indexRel; |
| LOCKMODE heapLockmode; |
| LOCKMODE indexLockmode; |
| WalUsage *walusage; |
| BufferUsage *bufferusage; |
| int sortmem; |
| |
| #ifdef BTREE_BUILD_STATS |
| if (log_btree_build_stats) |
| ResetUsage(); |
| #endif /* BTREE_BUILD_STATS */ |
| |
| /* |
| * The only possible status flag that can be set for the parallel worker is |
| * PROC_IN_SAFE_IC. |
| */ |
| Assert((MyProc->statusFlags == 0) || |
| (MyProc->statusFlags == PROC_IN_SAFE_IC)); |
| |
| /* Set debug_query_string for individual workers first */ |
| sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, true); |
| debug_query_string = sharedquery; |
| |
| /* Report the query string from leader */ |
| pgstat_report_activity(STATE_RUNNING, debug_query_string); |
| |
| /* Look up nbtree shared state */ |
| btshared = shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false); |
| |
| /* Open relations using lock modes known to be obtained by index.c */ |
| if (!btshared->isconcurrent) |
| { |
| heapLockmode = ShareLock; |
| indexLockmode = AccessExclusiveLock; |
| } |
| else |
| { |
| heapLockmode = ShareUpdateExclusiveLock; |
| indexLockmode = RowExclusiveLock; |
| } |
| |
| /* Open relations within worker */ |
| heapRel = table_open(btshared->heaprelid, heapLockmode); |
| indexRel = index_open(btshared->indexrelid, indexLockmode); |
| |
| /* Initialize worker's own spool */ |
| btspool = (BTSpool *) palloc0(sizeof(BTSpool)); |
| btspool->heap = heapRel; |
| btspool->index = indexRel; |
| btspool->isunique = btshared->isunique; |
| btspool->nulls_not_distinct = btshared->nulls_not_distinct; |
| |
| /* Look up shared state private to tuplesort.c */ |
| sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false); |
| tuplesort_attach_shared(sharedsort, seg); |
| if (!btshared->isunique) |
| { |
| btspool2 = NULL; |
| sharedsort2 = NULL; |
| } |
| else |
| { |
| /* Allocate memory for worker's own private secondary spool */ |
| btspool2 = (BTSpool *) palloc0(sizeof(BTSpool)); |
| |
| /* Initialize worker's own secondary spool */ |
| btspool2->heap = btspool->heap; |
| btspool2->index = btspool->index; |
| btspool2->isunique = false; |
| /* Look up shared state private to tuplesort.c */ |
| sharedsort2 = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT_SPOOL2, false); |
| tuplesort_attach_shared(sharedsort2, seg); |
| } |
| |
| /* Prepare to track buffer/WAL usage during parallel execution */ |
| InstrStartParallelQuery(); |
| |
| /* Perform sorting of spool, and possibly a spool2 */ |
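| /* |
| * Workers divide maintenance_work_mem by the participant count that was |
| * requested up front (scantuplesortstates), not by the number actually |
| * launched, which is not available here; cf. the leader-side computation |
| * in _bt_leader_participate_as_worker(). |
| */ |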
| sortmem = maintenance_work_mem / btshared->scantuplesortstates; |
| _bt_parallel_scan_and_sort(btspool, btspool2, btshared, sharedsort, |
| sharedsort2, sortmem, false); |
| |
| /* Report WAL/buffer usage during parallel execution */ |
| bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); |
| walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false); |
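| /* |
| * ParallelWorkerNumber selects this worker's slot in the arrays the |
| * leader allocated (one element per requested worker). |
| */ |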
| InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber], |
| &walusage[ParallelWorkerNumber]); |
| |
| #ifdef BTREE_BUILD_STATS |
| if (log_btree_build_stats) |
| { |
| ShowUsage("BTREE BUILD (Worker Partial Spool) STATISTICS"); |
| ResetUsage(); |
| } |
| #endif /* BTREE_BUILD_STATS */ |
| |
| index_close(indexRel, indexLockmode); |
| table_close(heapRel, heapLockmode); |
| } |
| |
| /* |
| * Perform a worker's portion of a parallel sort. |
| * |
| * This generates a tuplesort for the passed btspool, and a second tuplesort |
| * state if a second btspool is needed (i.e. for unique index builds). All |
| * other spool fields should already be set when this is called. |
| * |
| * sortmem is the amount of working memory to use within each worker, |
| * expressed in KBs. |
| * |
| * When this returns, workers are done, and need only release resources. |
| */ |
| static void |
| _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, |
| BTShared *btshared, Sharedsort *sharedsort, |
| Sharedsort *sharedsort2, int sortmem, bool progress) |
| { |
| SortCoordinate coordinate; |
| BTBuildState buildstate; |
| TableScanDesc scan; |
| double reltuples; |
| IndexInfo *indexInfo; |
| |
| /* Initialize local tuplesort coordination state */ |
| coordinate = palloc0(sizeof(SortCoordinateData)); |
| coordinate->isWorker = true; |
| coordinate->nParticipants = -1; |
| coordinate->sharedsort = sharedsort; |
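| /* |
| * isWorker = true and nParticipants = -1 mark this as worker-side state; |
| * only the leader's own coordination state (set up when it begins its |
| * leader tuplesort) carries the actual participant count. |
| */ |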
| |
| /* Begin "partial" tuplesort */ |
| btspool->sortstate = tuplesort_begin_index_btree(btspool->heap, |
| btspool->index, |
| btspool->isunique, |
| btspool->nulls_not_distinct, |
| sortmem, coordinate, |
| TUPLESORT_NONE); |
| |
| /* |
| * Just as with the serial case, there may be a second spool. If so, a |
| * second, dedicated spool2 partial tuplesort is required. |
| */ |
| if (btspool2) |
| { |
| SortCoordinate coordinate2; |
| |
| /* |
| * We expect that the second one (for dead tuples) won't get very |
| * full, so we give it only work_mem (unless sortmem is less for this |
| * worker). Worker processes are generally permitted to allocate |
| * work_mem independently. |
| */ |
| coordinate2 = palloc0(sizeof(SortCoordinateData)); |
| coordinate2->isWorker = true; |
| coordinate2->nParticipants = -1; |
| coordinate2->sharedsort = sharedsort2; |
| btspool2->sortstate = |
| tuplesort_begin_index_btree(btspool->heap, btspool->index, false, false, |
| Min(sortmem, work_mem), coordinate2, |
| TUPLESORT_NONE); |
| } |
| |
| /* Fill in buildstate for _bt_build_callback() */ |
| buildstate.isunique = btshared->isunique; |
| buildstate.nulls_not_distinct = btshared->nulls_not_distinct; |
| buildstate.havedead = false; |
| buildstate.heap = btspool->heap; |
| buildstate.spool = btspool; |
| buildstate.spool2 = btspool2; |
| buildstate.indtuples = 0; |
| buildstate.btleader = NULL; |
| |
| /* Join parallel scan */ |
| indexInfo = BuildIndexInfo(btspool->index); |
| indexInfo->ii_Concurrent = btshared->isconcurrent; |
| scan = table_beginscan_parallel(btspool->heap, |
| ParallelTableScanFromBTShared(btshared)); |
| reltuples = table_index_build_scan(btspool->heap, btspool->index, indexInfo, |
| true, progress, _bt_build_callback, |
| (void *) &buildstate, scan); |
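| /* |
| * reltuples is just this participant's portion of the heap; every |
| * participant's contribution is summed under btshared->mutex below. |
| */ |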
| |
| /* Execute this worker's part of the sort */ |
| if (progress) |
| pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, |
| PROGRESS_BTREE_PHASE_PERFORMSORT_1); |
| tuplesort_performsort(btspool->sortstate); |
| if (btspool2) |
| { |
| if (progress) |
| pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, |
| PROGRESS_BTREE_PHASE_PERFORMSORT_2); |
| tuplesort_performsort(btspool2->sortstate); |
| } |
| |
| /* |
| * Done. Record ambuild statistics, and whether we encountered a broken |
| * HOT chain. |
| */ |
| SpinLockAcquire(&btshared->mutex); |
| btshared->nparticipantsdone++; |
| btshared->reltuples += reltuples; |
| if (buildstate.havedead) |
| btshared->havedead = true; |
| btshared->indtuples += buildstate.indtuples; |
| if (indexInfo->ii_BrokenHotChain) |
| btshared->brokenhotchain = true; |
| SpinLockRelease(&btshared->mutex); |
| |
| /* Notify leader */ |
| ConditionVariableSignal(&btshared->workersdonecv); |
| |
| /* We can end tuplesorts immediately */ |
| tuplesort_end(btspool->sortstate); |
| if (btspool2) |
| tuplesort_end(btspool2->sortstate); |
| } |