| /*------------------------------------------------------------------------- |
| * |
| * nodeIncrementalSort.c |
| * Routines to handle incremental sorting of relations. |
| * |
| * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group |
| * Portions Copyright (c) 1994, Regents of the University of California |
| * |
| * IDENTIFICATION |
| * src/backend/executor/nodeIncrementalSort.c |
| * |
| * DESCRIPTION |
| * |
| * Incremental sort is an optimized variant of multikey sort for cases |
| * when the input is already sorted by a prefix of the sort keys. For |
| * example, when a sort by (key1, key2 ... keyN) is requested, and the |
| * input is already sorted by (key1, key2 ... keyM), M < N, we can |
| * divide the input into groups where keys (key1, ... keyM) are equal, |
| * and only sort on the remaining columns. |
| * |
| * Consider the following example. We have input tuples consisting of |
| * two integers (X, Y) already presorted by X, while it's required to |
| * sort them by both X and Y. Let the input tuples be the following: |
| * |
| * (1, 5) |
| * (1, 2) |
| * (2, 9) |
| * (2, 1) |
| * (2, 5) |
| * (3, 3) |
| * (3, 7) |
| * |
| * An incremental sort algorithm would split the input into the following |
| * groups, which have equal X, and then sort them by Y individually: |
| * |
| * (1, 5) (1, 2) |
| * (2, 9) (2, 1) (2, 5) |
| * (3, 3) (3, 7) |
| * |
| * After sorting these groups and putting them back together, we would |
| * get the following result, which is sorted by X and Y as requested: |
| * |
| * (1, 2) |
| * (1, 5) |
| * (2, 1) |
| * (2, 5) |
| * (2, 9) |
| * (3, 3) |
| * (3, 7) |
| * |
| * Incremental sort may be more efficient than plain sort, particularly |
| * on large datasets, as it reduces the amount of data to sort at once, |
| * making it more likely that it fits into work_mem (eliminating the need |
| * to spill to disk). But the main advantage of incremental sort is that |
| * it can start producing rows early, before sorting the whole dataset, |
| * which is a significant benefit especially for queries with LIMIT. |
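| * |
| * As a hypothetical illustration (table and column names assumed), a |
| * query such as |
| * |
| *     SELECT * FROM tbl ORDER BY x, y LIMIT 10; |
| * |
| * can use incremental sort when an index on (x) already returns rows |
| * ordered by x: each group of equal x values only needs to be sorted |
| * by y, and execution can stop once 10 rows have been produced. |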
| * |
| * The algorithm we've implemented here is modified from the theoretical |
| * base described above by operating in two different modes: |
| * - Fetching a minimum number of tuples without checking prefix key |
| * group membership and sorting on all columns when safe. |
| * - Fetching all tuples for a single prefix key group and sorting on |
| * solely the unsorted columns. |
| * We always begin in the first mode, and employ a heuristic to switch |
| * into the second mode if we believe it's beneficial. |
| * |
| * Sorting incrementally can potentially use less memory, avoid fetching |
| * and sorting all tuples in the dataset, and begin returning tuples before |
| * the entire result set is available. |
| * |
| * The hybrid mode approach allows us to optimize for both very small |
| * groups (where the overhead of a new tuplesort is high) and very large |
| * groups (where we can lower cost by not having to sort on already sorted |
| * columns), albeit at some extra cost while switching between modes. |
| * |
| *------------------------------------------------------------------------- |
| */ |
| |
| #include "postgres.h" |
| |
| #include "access/htup_details.h" |
| #include "executor/execdebug.h" |
| #include "executor/nodeIncrementalSort.h" |
| #include "miscadmin.h" |
| #include "utils/lsyscache.h" |
| #include "utils/tuplesort.h" |
| |
| /* |
| * We need to store the instrumentation information either in the local |
| * node's sort info or, for a parallel worker process, in the shared info |
| * having to additionally memcpy the info from local memory to shared memory |
| * at each instrumentation call). This macro expands to choose the proper sort |
| * state and group info. |
| * |
| * Arguments: |
| * - node: type IncrementalSortState * |
| * - groupName: the token fullsort or prefixsort |
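| * |
| * For example, INSTRUMENT_SORT_GROUP(node, fullsort) records stats from |
| * node->fullsort_state into the corresponding fullsortGroupInfo in the |
| * local node's (or, in a worker, the shared) sort info. |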
| */ |
| #define INSTRUMENT_SORT_GROUP(node, groupName) \ |
| do { \ |
| if ((node)->ss.ps.instrument != NULL) \ |
| { \ |
| if ((node)->shared_info && (node)->am_worker) \ |
| { \ |
| Assert(IsParallelWorker()); \ |
| Assert(ParallelWorkerNumber <= (node)->shared_info->num_workers); \ |
| instrumentSortedGroup(&(node)->shared_info->sinfo[ParallelWorkerNumber].groupName##GroupInfo, \ |
| (node)->groupName##_state); \ |
| } \ |
| else \ |
| { \ |
| instrumentSortedGroup(&(node)->incsort_info.groupName##GroupInfo, \ |
| (node)->groupName##_state); \ |
| } \ |
| } \ |
| } while (0) |
| |
| |
| /* ---------------------------------------------------------------- |
| * instrumentSortedGroup |
| * |
| * Because incremental sort processes (potentially many) sort batches, we need |
| * to capture tuplesort stats each time we finalize a sort state. This summary |
| * data is later used for EXPLAIN ANALYZE output. |
| * ---------------------------------------------------------------- |
| */ |
| static void |
| instrumentSortedGroup(IncrementalSortGroupInfo *groupInfo, |
| Tuplesortstate *sortState) |
| { |
| TuplesortInstrumentation sort_instr; |
| |
| groupInfo->groupCount++; |
| |
| tuplesort_get_stats(sortState, &sort_instr); |
| |
| /* Calculate total and maximum memory and disk space used. */ |
| switch (sort_instr.spaceType) |
| { |
| case SORT_SPACE_TYPE_DISK: |
| groupInfo->totalDiskSpaceUsed += sort_instr.spaceUsed; |
| if (sort_instr.spaceUsed > groupInfo->maxDiskSpaceUsed) |
| groupInfo->maxDiskSpaceUsed = sort_instr.spaceUsed; |
| |
| break; |
| case SORT_SPACE_TYPE_MEMORY: |
| groupInfo->totalMemorySpaceUsed += sort_instr.spaceUsed; |
| if (sort_instr.spaceUsed > groupInfo->maxMemorySpaceUsed) |
| groupInfo->maxMemorySpaceUsed = sort_instr.spaceUsed; |
| |
| break; |
| } |
| |
| /* Track each sort method we've used. */ |
| groupInfo->sortMethods |= sort_instr.sortMethod; |
| } |
| |
| /* ---------------------------------------------------------------- |
| * preparePresortedCols |
| * |
| * Prepare information for presorted_keys comparisons. |
| * ---------------------------------------------------------------- |
| */ |
| static void |
| preparePresortedCols(IncrementalSortState *node) |
| { |
| IncrementalSort *plannode = castNode(IncrementalSort, node->ss.ps.plan); |
| |
| node->presorted_keys = |
| (PresortedKeyData *) palloc(plannode->nPresortedCols * |
| sizeof(PresortedKeyData)); |
| |
| /* Pre-cache comparison functions for each pre-sorted key. */ |
| for (int i = 0; i < plannode->nPresortedCols; i++) |
| { |
| Oid equalityOp, |
| equalityFunc; |
| PresortedKeyData *key; |
| |
| key = &node->presorted_keys[i]; |
| key->attno = plannode->sort.sortColIdx[i]; |
| |
| equalityOp = get_equality_op_for_ordering_op(plannode->sort.sortOperators[i], |
| NULL); |
| if (!OidIsValid(equalityOp)) |
| elog(ERROR, "missing equality operator for ordering operator %u", |
| plannode->sort.sortOperators[i]); |
| |
| equalityFunc = get_opcode(equalityOp); |
| if (!OidIsValid(equalityFunc)) |
| elog(ERROR, "missing function for operator %u", equalityOp); |
| |
| /* Look up the equality function */ |
| fmgr_info_cxt(equalityFunc, &key->flinfo, CurrentMemoryContext); |
| |
| /* We can initialize the callinfo just once and re-use it */ |
| key->fcinfo = palloc0(SizeForFunctionCallInfo(2)); |
| InitFunctionCallInfoData(*key->fcinfo, &key->flinfo, 2, |
| plannode->sort.collations[i], NULL, NULL); |
| key->fcinfo->args[0].isnull = false; |
| key->fcinfo->args[1].isnull = false; |
| } |
| } |
| |
| /* ---------------------------------------------------------------- |
| * isCurrentGroup |
| * |
| * Check whether a given tuple belongs to the current sort group by comparing |
| * the presorted column values to the pivot tuple of the current group. |
| * ---------------------------------------------------------------- |
| */ |
| static bool |
| isCurrentGroup(IncrementalSortState *node, TupleTableSlot *pivot, TupleTableSlot *tuple) |
| { |
| int nPresortedCols; |
| |
| nPresortedCols = castNode(IncrementalSort, node->ss.ps.plan)->nPresortedCols; |
| |
| /* |
| * The fact that the input is sorted by keys (0, ... n) implies that the |
| * tail keys are more likely to change. Therefore we do our comparison |
| * starting from the last pre-sorted column to optimize for early |
| * detection of inequality and to minimize the number of function calls. |
| */ |
| for (int i = nPresortedCols - 1; i >= 0; i--) |
| { |
| Datum datumA, |
| datumB, |
| result; |
| bool isnullA, |
| isnullB; |
| AttrNumber attno = node->presorted_keys[i].attno; |
| PresortedKeyData *key; |
| |
| datumA = slot_getattr(pivot, attno, &isnullA); |
| datumB = slot_getattr(tuple, attno, &isnullB); |
| |
| /* Special case for NULL-vs-NULL, else use standard comparison */ |
| if (isnullA || isnullB) |
| { |
| if (isnullA == isnullB) |
| continue; |
| else |
| return false; |
| } |
| |
| key = &node->presorted_keys[i]; |
| |
| key->fcinfo->args[0].value = datumA; |
| key->fcinfo->args[1].value = datumB; |
| |
| /* just for paranoia's sake, we reset isnull each time */ |
| key->fcinfo->isnull = false; |
| |
| result = FunctionCallInvoke(key->fcinfo); |
| |
| /* Check for null result, since caller is clearly not expecting one */ |
| if (key->fcinfo->isnull) |
| elog(ERROR, "function %u returned NULL", key->flinfo.fn_oid); |
| |
| if (!DatumGetBool(result)) |
| return false; |
| } |
| return true; |
| } |
| |
| /* ---------------------------------------------------------------- |
| * switchToPresortedPrefixMode |
| * |
| * When we determine that we've likely encountered a large batch of tuples all |
| * having the same presorted prefix values, we want to optimize tuplesort by |
| * only sorting on unsorted suffix keys. |
| * |
| * The problem is that we've already accumulated several tuples in another |
| * tuplesort configured to sort by all columns (assuming that there may be |
| * more than one prefix key group). So to switch to presorted prefix mode we |
| * have to go back and look at all the tuples we've already accumulated to |
| * verify they're all part of the same prefix key group before sorting them |
| * solely by unsorted suffix keys. |
| * |
| * While it's likely that all the tuples already fetched are part of a single |
| * prefix group, we also have to handle the possibility that there is at least |
| * one different prefix key group before the large prefix key group. |
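| * |
| * For example (hypothetical prefix values), if the full sort state holds |
| * tuples with prefix values 1,1,1,2,2, the first call here moves the |
| * three 1's into the prefix sort state and stops at the first 2, leaving |
| * n_fullsort_remaining at 2 (the 2 already read is carried over in the |
| * transfer_tuple slot); a later call picks up the remaining group. |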
| * ---------------------------------------------------------------- |
| */ |
| static void |
| switchToPresortedPrefixMode(PlanState *pstate) |
| { |
| IncrementalSortState *node = castNode(IncrementalSortState, pstate); |
| ScanDirection dir; |
| int64 nTuples; |
| TupleDesc tupDesc; |
| PlanState *outerNode; |
| IncrementalSort *plannode = castNode(IncrementalSort, node->ss.ps.plan); |
| |
| dir = node->ss.ps.state->es_direction; |
| outerNode = outerPlanState(node); |
| tupDesc = ExecGetResultType(outerNode); |
| |
| /* Configure the prefix sort state the first time around. */ |
| if (node->prefixsort_state == NULL) |
| { |
| Tuplesortstate *prefixsort_state; |
| int nPresortedCols = plannode->nPresortedCols; |
| |
| /* |
| * Optimize the sort by assuming the prefix columns are all equal and |
| * thus we only need to sort by any remaining columns. |
| */ |
| prefixsort_state = tuplesort_begin_heap(tupDesc, |
| plannode->sort.numCols - nPresortedCols, |
| &(plannode->sort.sortColIdx[nPresortedCols]), |
| &(plannode->sort.sortOperators[nPresortedCols]), |
| &(plannode->sort.collations[nPresortedCols]), |
| &(plannode->sort.nullsFirst[nPresortedCols]), |
| work_mem, |
| NULL, |
| false); |
| node->prefixsort_state = prefixsort_state; |
| } |
| else |
| { |
| /* Next group of presorted data */ |
| tuplesort_reset(node->prefixsort_state); |
| } |
| |
| /* |
| * If the current node has a bound, then it's reasonably likely that a |
| * large prefix key group will benefit from bounded sort, so configure the |
| * tuplesort to allow for that optimization. |
| */ |
| if (node->bounded) |
| { |
| SO1_printf("Setting bound on presorted prefix tuplesort to: " INT64_FORMAT "\n", |
| node->bound - node->bound_Done); |
| tuplesort_set_bound(node->prefixsort_state, |
| node->bound - node->bound_Done); |
| } |
| |
| /* |
| * Copy as many tuples as we can (i.e., in the same prefix key group) from |
| * the full sort state to the prefix sort state. |
| */ |
| for (nTuples = 0; nTuples < node->n_fullsort_remaining; nTuples++) |
| { |
| /* |
| * When we encounter multiple prefix key groups inside the full sort |
| * tuplesort we have to carry over the last read tuple into the next |
| * batch. |
| */ |
| if (nTuples == 0 && !TupIsNull(node->transfer_tuple)) |
| { |
| tuplesort_puttupleslot(node->prefixsort_state, node->transfer_tuple); |
| /* The carried over tuple is our new group pivot tuple. */ |
| ExecCopySlot(node->group_pivot, node->transfer_tuple); |
| } |
| else |
| { |
| tuplesort_gettupleslot(node->fullsort_state, |
| ScanDirectionIsForward(dir), |
| false, node->transfer_tuple, NULL); |
| |
| /* |
| * If this is our first time through the loop, then we need to |
| * save the first tuple we get as our new group pivot. |
| */ |
| if (TupIsNull(node->group_pivot)) |
| ExecCopySlot(node->group_pivot, node->transfer_tuple); |
| |
| if (isCurrentGroup(node, node->group_pivot, node->transfer_tuple)) |
| { |
| tuplesort_puttupleslot(node->prefixsort_state, node->transfer_tuple); |
| } |
| else |
| { |
| /* |
| * The tuple isn't part of the current batch so we need to |
| * carry it over into the next batch of tuples we transfer out |
| * of the full sort tuplesort into the presorted prefix |
| * tuplesort. We don't actually have to do anything special to |
| * save the tuple since we've already loaded it into the |
| * node->transfer_tuple slot, and, even though that slot |
| * points to memory inside the full sort tuplesort, we can't |
| * reset that tuplesort anyway until we've fully transferred |
| * out its tuples, so this reference is safe. We do need to |
| * reset the group pivot tuple though since we've finished the |
| * current prefix key group. |
| */ |
| ExecClearTuple(node->group_pivot); |
| |
| /* Break out of for-loop early */ |
| break; |
| } |
| } |
| } |
| |
| /* |
| * Track how many tuples remain in the full sort batch so that we know if |
| * we need to sort multiple prefix key groups before processing tuples |
| * remaining in the large single prefix key group we think we've |
| * encountered. |
| */ |
| SO1_printf("Moving " INT64_FORMAT " tuples to presorted prefix tuplesort\n", nTuples); |
| node->n_fullsort_remaining -= nTuples; |
| SO1_printf("Setting n_fullsort_remaining to " INT64_FORMAT "\n", node->n_fullsort_remaining); |
| |
| if (node->n_fullsort_remaining == 0) |
| { |
| /* |
| * We've found that all tuples remaining in the full sort batch are in |
| * the same prefix key group and moved all of those tuples into the |
| * presorted prefix tuplesort. We don't know that we've yet found the |
| * last tuple in the current prefix key group, so save our pivot |
| * comparison tuple and continue fetching tuples from the outer |
| * execution node to load into the presorted prefix tuplesort. |
| */ |
| ExecCopySlot(node->group_pivot, node->transfer_tuple); |
| SO_printf("Setting execution_status to INCSORT_LOADPREFIXSORT (switchToPresortedPrefixMode)\n"); |
| node->execution_status = INCSORT_LOADPREFIXSORT; |
| |
| /* |
| * Make sure we clear the transfer tuple slot so that next time we |
| * encounter a large prefix key group we don't incorrectly assume we |
| * have a tuple carried over from the previous group. |
| */ |
| ExecClearTuple(node->transfer_tuple); |
| } |
| else |
| { |
| /* |
| * We finished a group but didn't consume all of the tuples from the |
| * full sort state, so we'll sort this batch, let the outer node read |
| * out all of those tuples, and then come back around to find another |
| * batch. |
| */ |
| SO1_printf("Sorting presorted prefix tuplesort with " INT64_FORMAT " tuples\n", nTuples); |
| tuplesort_performsort(node->prefixsort_state); |
| |
| INSTRUMENT_SORT_GROUP(node, prefixsort); |
| |
| if (node->bounded) |
| { |
| /* |
| * If the current node has a bound and we've already sorted n |
| * tuples, then the functional bound remaining is (original bound |
| * - n), so store the current number of processed tuples for use |
| * in configuring sorting bound. |
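| * For example, with an original bound of 100 and bound_Done = 40, any |
| * later sort only needs to retain 60 tuples. |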
| */ |
| SO2_printf("Changing bound_Done from " INT64_FORMAT " to " INT64_FORMAT "\n", |
| node->bound_Done, Min(node->bound, node->bound_Done + nTuples)); |
| node->bound_Done = Min(node->bound, node->bound_Done + nTuples); |
| } |
| |
| SO_printf("Setting execution_status to INCSORT_READPREFIXSORT (switchToPresortedPrefixMode)\n"); |
| node->execution_status = INCSORT_READPREFIXSORT; |
| } |
| } |
| |
| /* |
| * Sorting many small groups with tuplesort is inefficient. In order to |
| * cope with this problem we don't start a new group until the current one |
| * contains at least DEFAULT_MIN_GROUP_SIZE tuples (unfortunately this also |
| * means we can't assume small groups of tuples all have the same prefix |
| * keys). When we have a bound that's less than DEFAULT_MIN_GROUP_SIZE we |
| * start looking for the new group as soon as we've met our bound, to avoid |
| * fetching more tuples than we absolutely have to. |
| */ |
| #define DEFAULT_MIN_GROUP_SIZE 32 |
| |
| /* |
| * While we've optimized for small prefix key groups by not starting our prefix |
| * key comparisons until we've reached a minimum number of tuples, we don't want |
| * that optimization to cause us to lose out on the benefits of being able to |
| * assume a large group of tuples is fully presorted by its prefix keys. |
| * Therefore we use the DEFAULT_MAX_FULL_SORT_GROUP_SIZE cutoff as a heuristic |
| * for determining when we believe we've encountered a large group, and, if we |
| * get to that point without finding a new prefix key group we transition to |
| * presorted prefix key mode. |
| */ |
| #define DEFAULT_MAX_FULL_SORT_GROUP_SIZE (2 * DEFAULT_MIN_GROUP_SIZE) |
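| |
| /* |
| * For example, with no bound we accumulate at least 32 tuples before |
| * checking prefix keys and switch to presorted prefix mode once more than |
| * 64 tuples arrive without finding a new prefix key group; with a bound |
| * of 5 the minimum group size drops to Min(32, 5) = 5. |
| */ |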
| |
| /* ---------------------------------------------------------------- |
| * ExecIncrementalSort |
| * |
| * Assuming that the outer subtree returns tuples presorted by some prefix |
| * of the target sort columns, performs an incremental sort. |
| * |
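| * Operates as a state machine over node->execution_status: tuples |
| * are loaded into and read back from either the full sort state |
| * (INCSORT_LOADFULLSORT/INCSORT_READFULLSORT) or the presorted prefix |
| * sort state (INCSORT_LOADPREFIXSORT/INCSORT_READPREFIXSORT). |
| * |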
| * Conditions: |
| * -- none. |
| * |
| * Initial States: |
| * -- the outer child is prepared to return the first tuple. |
| * ---------------------------------------------------------------- |
| */ |
| static TupleTableSlot * |
| ExecIncrementalSort(PlanState *pstate) |
| { |
| IncrementalSortState *node = castNode(IncrementalSortState, pstate); |
| EState *estate; |
| ScanDirection dir; |
| Tuplesortstate *read_sortstate; |
| Tuplesortstate *fullsort_state; |
| TupleTableSlot *slot; |
| IncrementalSort *plannode = (IncrementalSort *) node->ss.ps.plan; |
| PlanState *outerNode; |
| TupleDesc tupDesc; |
| int64 nTuples = 0; |
| int64 minGroupSize; |
| |
| CHECK_FOR_INTERRUPTS(); |
| |
| estate = node->ss.ps.state; |
| dir = estate->es_direction; |
| fullsort_state = node->fullsort_state; |
| |
| /* |
| * If a previous iteration has sorted a batch, then we need to check to |
| * see if there are any remaining tuples in that batch that we can return |
| * before moving on to other execution states. |
| */ |
| if (node->execution_status == INCSORT_READFULLSORT |
| || node->execution_status == INCSORT_READPREFIXSORT) |
| { |
| /* |
| * Return next tuple from the current sorted group set if available. |
| */ |
| read_sortstate = node->execution_status == INCSORT_READFULLSORT ? |
| fullsort_state : node->prefixsort_state; |
| slot = node->ss.ps.ps_ResultTupleSlot; |
| |
| /* |
| * We have to populate the slot from the tuplesort before checking |
| * outerNodeDone because it will set the slot to NULL if no more |
| * tuples remain. If the tuplesort is empty, but we don't have any |
| * more tuples available for sort from the outer node, then |
| * outerNodeDone will have been set so we'll return that now-empty |
| * slot to the caller. |
| */ |
| if (tuplesort_gettupleslot(read_sortstate, ScanDirectionIsForward(dir), |
| false, slot, NULL) || node->outerNodeDone) |
| |
| /* |
| * Note: there isn't a good test case for the node->outerNodeDone |
| * check directly, but we need it for any plan where the outer |
| * node will fail when trying to fetch too many tuples. |
| */ |
| return slot; |
| else if (node->n_fullsort_remaining > 0) |
| { |
| /* |
| * When we transition to presorted prefix mode, we might have |
| * accumulated at least one additional prefix key group in the |
| * full sort tuplesort. The first call to |
| * switchToPresortedPrefixMode() will have pulled the first one of |
| * those groups out, and we've returned those tuples to the parent |
| * node, but if at this point we still have tuples remaining in |
| * the full sort state (i.e., n_fullsort_remaining > 0), then we |
| * need to re-execute the prefix mode transition function to pull |
| * out the next prefix key group. |
| */ |
| SO1_printf("Re-calling switchToPresortedPrefixMode() because n_fullsort_remaining is > 0 (" INT64_FORMAT ")\n", |
| node->n_fullsort_remaining); |
| switchToPresortedPrefixMode(pstate); |
| } |
| else |
| { |
| /* |
| * If we don't have any sorted tuples to read and we're not |
| * currently transitioning into presorted prefix sort mode, then |
| * it's time to start the process all over again by building a new |
| * group in the full sort state. |
| */ |
| SO_printf("Setting execution_status to INCSORT_LOADFULLSORT (n_fullsort_remaining > 0)\n"); |
| node->execution_status = INCSORT_LOADFULLSORT; |
| } |
| } |
| |
| /* |
| * Scan the subplan in the forward direction while creating the sorted |
| * data. |
| */ |
| estate->es_direction = ForwardScanDirection; |
| |
| outerNode = outerPlanState(node); |
| tupDesc = ExecGetResultType(outerNode); |
| |
| /* Load tuples into the full sort state. */ |
| if (node->execution_status == INCSORT_LOADFULLSORT) |
| { |
| /* |
| * Initialize sorting structures. |
| */ |
| if (fullsort_state == NULL) |
| { |
| /* |
| * Initialize presorted column support structures for |
| * isCurrentGroup(). It's correct to do this along with the |
| * initial initialization for the full sort state (and not for the |
| * prefix sort state) since we always load the full sort state |
| * first. |
| */ |
| preparePresortedCols(node); |
| |
| /* |
| * Since we optimize small prefix key groups by accumulating a |
| * minimum number of tuples before sorting, we can't assume that a |
| * group of tuples all have the same prefix key values. Hence we |
| * set up the full sort tuplesort to sort by all requested sort |
| * keys. |
| */ |
| fullsort_state = tuplesort_begin_heap(tupDesc, |
| plannode->sort.numCols, |
| plannode->sort.sortColIdx, |
| plannode->sort.sortOperators, |
| plannode->sort.collations, |
| plannode->sort.nullsFirst, |
| work_mem, |
| NULL, |
| false); |
| node->fullsort_state = fullsort_state; |
| } |
| else |
| { |
| /* Reset sort for the next batch. */ |
| tuplesort_reset(fullsort_state); |
| } |
| |
| /* |
| * Calculate how many tuples remain if bounded, and configure both |
| * bounded sort and the minimum group size accordingly. |
| */ |
| if (node->bounded) |
| { |
| int64 currentBound = node->bound - node->bound_Done; |
| |
| /* |
| * Bounded sort isn't likely to be a useful optimization for full |
| * sort mode since we limit full sort mode to a relatively small |
| * number of tuples and tuplesort doesn't switch over to top-n |
| * heap sort anyway unless it hits (2 * bound) tuples. |
| */ |
| if (currentBound < DEFAULT_MIN_GROUP_SIZE) |
| tuplesort_set_bound(fullsort_state, currentBound); |
| |
| minGroupSize = Min(DEFAULT_MIN_GROUP_SIZE, currentBound); |
| } |
| else |
| minGroupSize = DEFAULT_MIN_GROUP_SIZE; |
| |
| /* |
| * Because we have to read the next tuple to find out that we've |
| * encountered a new prefix key group, on subsequent groups we have to |
| * carry over that extra tuple and add it to the new group's sort here |
| * before we read any new tuples from the outer node. |
| */ |
| if (!TupIsNull(node->group_pivot)) |
| { |
| tuplesort_puttupleslot(fullsort_state, node->group_pivot); |
| nTuples++; |
| |
| /* |
| * We're in full sort mode accumulating a minimum number of tuples |
| * and not checking for prefix key equality yet, so we can't |
| * assume the group pivot tuple will remain the same -- unless |
| * we're using a minimum group size of 1, in which case the pivot |
| * is obviously still the pivot. |
| */ |
| if (nTuples != minGroupSize) |
| ExecClearTuple(node->group_pivot); |
| } |
| |
| |
| /* |
| * Pull as many tuples from the outer node as possible given our |
| * current operating mode. |
| */ |
| for (;;) |
| { |
| slot = ExecProcNode(outerNode); |
| |
| /* |
| * If the outer node can't provide us any more tuples, then we can |
| * sort the current group and return those tuples. |
| */ |
| if (TupIsNull(slot)) |
| { |
| /* |
| * We need to know later if the outer node has completed to be |
| * able to distinguish between being done with a batch and |
| * being done with the whole node. |
| */ |
| node->outerNodeDone = true; |
| |
| SO1_printf("Sorting fullsort with " INT64_FORMAT " tuples\n", nTuples); |
| tuplesort_performsort(fullsort_state); |
| |
| INSTRUMENT_SORT_GROUP(node, fullsort); |
| |
| SO_printf("Setting execution_status to INCSORT_READFULLSORT (final tuple)\n"); |
| node->execution_status = INCSORT_READFULLSORT; |
| break; |
| } |
| |
| /* Accumulate the next group of presorted tuples. */ |
| if (nTuples < minGroupSize) |
| { |
| /* |
| * If we haven't yet hit our target minimum group size, then |
| * we don't need to bother checking for inclusion in the |
| * current prefix group since at this point we'll assume that |
| * we'll full sort this batch to avoid a large number of very |
| * tiny (and thus inefficient) sorts. |
| */ |
| tuplesort_puttupleslot(fullsort_state, slot); |
| nTuples++; |
| |
| /* |
| * If we've reached our minimum group size, then we need to |
| * store the most recent tuple as a pivot. |
| */ |
| if (nTuples == minGroupSize) |
| ExecCopySlot(node->group_pivot, slot); |
| } |
| else |
| { |
| /* |
| * If we've already accumulated enough tuples to reach our |
| * minimum group size, then we need to compare any additional |
| * tuples to our pivot tuple to see if we reach the end of |
| * that prefix key group. Only after we find changed prefix |
| * keys can we guarantee sort stability of the tuples we've |
| * already accumulated. |
| */ |
| if (isCurrentGroup(node, node->group_pivot, slot)) |
| { |
| /* |
| * As long as the prefix keys match the pivot tuple then |
| * load the tuple into the tuplesort. |
| */ |
| tuplesort_puttupleslot(fullsort_state, slot); |
| nTuples++; |
| } |
| else |
| { |
| /* |
| * Since the tuple we fetched isn't part of the current |
| * prefix key group we don't want to sort it as part of |
| * the current batch. Instead we use the group_pivot slot |
| * to carry it over to the next batch (even though we |
| * won't actually treat it as a group pivot). |
| */ |
| ExecCopySlot(node->group_pivot, slot); |
| |
| if (node->bounded) |
| { |
| /* |
| * If the current node has a bound, and we've already |
| * sorted n tuples, then the functional bound |
| * remaining is (original bound - n), so store the |
| * current number of processed tuples for later use |
| * configuring the sort state's bound. |
| */ |
| SO2_printf("Changing bound_Done from " INT64_FORMAT " to " INT64_FORMAT "\n", |
| node->bound_Done, |
| Min(node->bound, node->bound_Done + nTuples)); |
| node->bound_Done = Min(node->bound, node->bound_Done + nTuples); |
| } |
| |
| /* |
| * Once we find changed prefix keys we can complete the |
| * sort and transition modes to reading out the sorted |
| * tuples. |
| */ |
| SO1_printf("Sorting fullsort tuplesort with " INT64_FORMAT " tuples\n", |
| nTuples); |
| tuplesort_performsort(fullsort_state); |
| |
| INSTRUMENT_SORT_GROUP(node, fullsort); |
| |
| SO_printf("Setting execution_status to INCSORT_READFULLSORT (found end of group)\n"); |
| node->execution_status = INCSORT_READFULLSORT; |
| break; |
| } |
| } |
| |
| /* |
| * If we haven't already transitioned modes to reading from the |
| * full sort state, then we assume that having read at least |
| * DEFAULT_MAX_FULL_SORT_GROUP_SIZE tuples means it's likely we're |
| * processing a large group of tuples all having equal prefix keys |
| * (but haven't yet found the final tuple in that prefix key |
| * group), so we need to transition into presorted prefix mode. |
| */ |
| if (nTuples > DEFAULT_MAX_FULL_SORT_GROUP_SIZE && |
| node->execution_status != INCSORT_READFULLSORT) |
| { |
| /* |
| * The group pivot we have stored has already been put into |
| * the tuplesort; we don't want to carry it over. Since we |
| * haven't yet found the end of the prefix key group, it might |
| * seem like we should keep this, but we don't actually know |
| * how many prefix key groups might be represented in the full |
| * sort state, so we'll let the mode transition function |
| * manage this state for us. |
| */ |
| ExecClearTuple(node->group_pivot); |
| |
| /* |
| * Unfortunately the tuplesort API doesn't include a way to |
| * retrieve tuples unless a sort has been performed, so we |
| * perform the sort even though we could just as easily rely |
| * on FIFO retrieval semantics when transferring them to the |
| * presorted prefix tuplesort. |
| */ |
| SO1_printf("Sorting fullsort tuplesort with " INT64_FORMAT " tuples\n", nTuples); |
| tuplesort_performsort(fullsort_state); |
| |
| INSTRUMENT_SORT_GROUP(node, fullsort); |
| |
| /* |
| * If the full sort tuplesort happened to switch into top-n |
| * heapsort mode then we will only be able to retrieve |
| * currentBound tuples (since the tuplesort will have only |
| * retained the top-n tuples). This is safe even though we |
| * haven't yet completed fetching the current prefix key group |
| * because the tuples we've "lost" sort "below" the |
| * retained ones, and we're already contractually guaranteed |
| * to not need any more than the currentBound tuples. |
| */ |
| if (tuplesort_used_bound(node->fullsort_state)) |
| { |
| int64 currentBound = node->bound - node->bound_Done; |
| |
| SO2_printf("Read " INT64_FORMAT " tuples, but setting to " INT64_FORMAT " because we used bounded sort\n", |
| nTuples, Min(currentBound, nTuples)); |
| nTuples = Min(currentBound, nTuples); |
| } |
| |
| SO1_printf("Setting n_fullsort_remaining to " INT64_FORMAT " and calling switchToPresortedPrefixMode()\n", |
| nTuples); |
| |
| /* |
| * We might have multiple prefix key groups in the full sort |
| * state, so the mode transition function needs to know that |
| * it needs to move from the fullsort to presorted prefix |
| * sort. |
| */ |
| node->n_fullsort_remaining = nTuples; |
| |
| /* Transition the tuples to the presorted prefix tuplesort. */ |
| switchToPresortedPrefixMode(pstate); |
| |
| /* |
| * Since we know we had tuples to move to the presorted prefix |
| * tuplesort, we know that unless that transition has verified |
| * that all tuples belonged to the same prefix key group (in |
| * which case we can go straight to continuing to load tuples |
| * into that tuplesort), we should have a tuple to return |
| * here. |
| * |
| * Either way, the appropriate execution status should have |
| * been set by switchToPresortedPrefixMode(), so we can drop |
| * out of the loop here and let the appropriate path kick in. |
| */ |
| break; |
| } |
| } |
| } |
| |
| if (node->execution_status == INCSORT_LOADPREFIXSORT) |
| { |
| /* |
| * We only enter this state after the mode transition function has |
| * confirmed all remaining tuples from the full sort state have the |
| * same prefix and moved those tuples to the prefix sort state. That |
| * function has also set a group pivot tuple (which doesn't need to be |
| * carried over; it's already been put into the prefix sort state). |
| */ |
| Assert(!TupIsNull(node->group_pivot)); |
| |
| /* |
| * Read tuples from the outer node and load them into the prefix sort |
| * state until we encounter a tuple whose prefix keys don't match the |
| * current group_pivot tuple, since we can't guarantee sort stability |
| * until we have all tuples matching those prefix keys. |
| */ |
| for (;;) |
| { |
| slot = ExecProcNode(outerNode); |
| |
| /* |
| * If we've exhausted tuples from the outer node we're done |
| * loading the prefix sort state. |
| */ |
| if (TupIsNull(slot)) |
| { |
| /* |
| * We need to know later if the outer node has completed to be |
| * able to distinguish between being done with a batch and |
| * being done with the whole node. |
| */ |
| node->outerNodeDone = true; |
| break; |
| } |
| |
| /* |
| * If the tuple's prefix keys match our pivot tuple, we're not |
| * done yet and can load it into the prefix sort state. If not, we |
| * don't want to sort it as part of the current batch. Instead we |
| * use the group_pivot slot to carry it over to the next batch |
| * (even though we won't actually treat it as a group pivot). |
| */ |
| if (isCurrentGroup(node, node->group_pivot, slot)) |
| { |
| tuplesort_puttupleslot(node->prefixsort_state, slot); |
| nTuples++; |
| } |
| else |
| { |
| ExecCopySlot(node->group_pivot, slot); |
| break; |
| } |
| } |
| |
| /* |
| * Perform the sort and begin returning the tuples to the parent plan |
| * node. |
| */ |
| SO1_printf("Sorting presorted prefix tuplesort with " INT64_FORMAT " tuples\n", nTuples); |
| tuplesort_performsort(node->prefixsort_state); |
| |
| INSTRUMENT_SORT_GROUP(node, prefixsort); |
| |
| SO_printf("Setting execution_status to INCSORT_READPREFIXSORT (found end of group)\n"); |
| node->execution_status = INCSORT_READPREFIXSORT; |
| |
| if (node->bounded) |
| { |
| /* |
| * If the current node has a bound, and we've already sorted n |
| * tuples, then the functional bound remaining is (original bound |
| * - n), so store the current number of processed tuples for use |
| * in configuring sorting bound. |
| */ |
| SO2_printf("Changing bound_Done from " INT64_FORMAT " to " INT64_FORMAT "\n", |
| node->bound_Done, |
| Min(node->bound, node->bound_Done + nTuples)); |
| node->bound_Done = Min(node->bound, node->bound_Done + nTuples); |
| } |
| } |
| |
| /* Restore the user-specified scan direction. */ |
| estate->es_direction = dir; |
| |
| /* |
| * Get the first or next tuple from tuplesort. Returns NULL if no more |
| * tuples. |
| */ |
| read_sortstate = node->execution_status == INCSORT_READFULLSORT ? |
| fullsort_state : node->prefixsort_state; |
| slot = node->ss.ps.ps_ResultTupleSlot; |
| (void) tuplesort_gettupleslot(read_sortstate, ScanDirectionIsForward(dir), |
| false, slot, NULL); |
| return slot; |
| } |
| |
| /* ---------------------------------------------------------------- |
| * ExecInitIncrementalSort |
| * |
| * Creates the run-time state information for the sort node |
| * produced by the planner and initializes its outer subtree. |
| * ---------------------------------------------------------------- |
| */ |
| IncrementalSortState * |
| ExecInitIncrementalSort(IncrementalSort *node, EState *estate, int eflags) |
| { |
| IncrementalSortState *incrsortstate; |
| |
| SO_printf("ExecInitIncrementalSort: initializing sort node\n"); |
| |
| /* |
| * Incremental sort can't be used with EXEC_FLAG_BACKWARD or |
| * EXEC_FLAG_MARK, because the current sort state contains only one sort |
| * batch rather than the full result set. |
| */ |
| Assert((eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)) == 0); |
| |
| /* Initialize state structure. */ |
| incrsortstate = makeNode(IncrementalSortState); |
| incrsortstate->ss.ps.plan = (Plan *) node; |
| incrsortstate->ss.ps.state = estate; |
| incrsortstate->ss.ps.ExecProcNode = ExecIncrementalSort; |
| |
| incrsortstate->execution_status = INCSORT_LOADFULLSORT; |
| incrsortstate->bounded = false; |
| incrsortstate->outerNodeDone = false; |
| incrsortstate->bound_Done = 0; |
| incrsortstate->fullsort_state = NULL; |
| incrsortstate->prefixsort_state = NULL; |
| incrsortstate->group_pivot = NULL; |
| incrsortstate->transfer_tuple = NULL; |
| incrsortstate->n_fullsort_remaining = 0; |
| incrsortstate->presorted_keys = NULL; |
| |
| if (incrsortstate->ss.ps.instrument != NULL) |
| { |
| IncrementalSortGroupInfo *fullsortGroupInfo = |
| &incrsortstate->incsort_info.fullsortGroupInfo; |
| IncrementalSortGroupInfo *prefixsortGroupInfo = |
| &incrsortstate->incsort_info.prefixsortGroupInfo; |
| |
| fullsortGroupInfo->groupCount = 0; |
| fullsortGroupInfo->maxDiskSpaceUsed = 0; |
| fullsortGroupInfo->totalDiskSpaceUsed = 0; |
| fullsortGroupInfo->maxMemorySpaceUsed = 0; |
| fullsortGroupInfo->totalMemorySpaceUsed = 0; |
| fullsortGroupInfo->sortMethods = 0; |
| prefixsortGroupInfo->groupCount = 0; |
| prefixsortGroupInfo->maxDiskSpaceUsed = 0; |
| prefixsortGroupInfo->totalDiskSpaceUsed = 0; |
| prefixsortGroupInfo->maxMemorySpaceUsed = 0; |
| prefixsortGroupInfo->totalMemorySpaceUsed = 0; |
| prefixsortGroupInfo->sortMethods = 0; |
| } |
| |
| /* |
| * Miscellaneous initialization |
| * |
| * Sort nodes don't initialize their ExprContexts because they never call |
| * ExecQual or ExecProject. |
| */ |
| |
| /* |
| * Initialize child nodes. |
| * |
| * Incremental sort does not support backwards scans and mark/restore, so |
| * we don't bother removing the flags from eflags here. We allow passing a |
| * REWIND flag, because although incremental sort can't use it, the child |
| * nodes may be able to do something more useful. |
| */ |
| outerPlanState(incrsortstate) = ExecInitNode(outerPlan(node), estate, eflags); |
| |
| /* |
| * Initialize scan slot and type. |
| */ |
| ExecCreateScanSlotFromOuterPlan(estate, &incrsortstate->ss, &TTSOpsMinimalTuple); |
| |
| /* |
| * Initialize return slot and type. No need to initialize projection info |
| * because we don't do any projections. |
| */ |
| ExecInitResultTupleSlotTL(&incrsortstate->ss.ps, &TTSOpsMinimalTuple); |
| incrsortstate->ss.ps.ps_ProjInfo = NULL; |
| |
| /* |
| * Initialize standalone slots to store a tuple for pivot prefix keys and |
| * for carrying over a tuple from one batch to the next. |
| */ |
| incrsortstate->group_pivot = |
| MakeSingleTupleTableSlot(ExecGetResultType(outerPlanState(incrsortstate)), |
| &TTSOpsMinimalTuple); |
| incrsortstate->transfer_tuple = |
| MakeSingleTupleTableSlot(ExecGetResultType(outerPlanState(incrsortstate)), |
| &TTSOpsMinimalTuple); |
| |
| SO_printf("ExecInitIncrementalSort: sort node initialized\n"); |
| |
| return incrsortstate; |
| } |
| |
| /* ---------------------------------------------------------------- |
| * ExecEndIncrementalSort(node) |
| * ---------------------------------------------------------------- |
| */ |
| void |
| ExecEndIncrementalSort(IncrementalSortState *node) |
| { |
| SO_printf("ExecEndIncrementalSort: shutting down sort node\n"); |
| |
| /* clean out the scan tuple */ |
| ExecClearTuple(node->ss.ss_ScanTupleSlot); |
| /* must drop pointer to sort result tuple */ |
| ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); |
| /* must drop standalone tuple slots from outer node */ |
| ExecDropSingleTupleTableSlot(node->group_pivot); |
| ExecDropSingleTupleTableSlot(node->transfer_tuple); |
| |
| /* |
| * Release tuplesort resources. |
| */ |
| if (node->fullsort_state != NULL) |
| { |
| tuplesort_end(node->fullsort_state); |
| node->fullsort_state = NULL; |
| } |
| if (node->prefixsort_state != NULL) |
| { |
| tuplesort_end(node->prefixsort_state); |
| node->prefixsort_state = NULL; |
| } |
| |
| /* |
| * Shut down the subplan. |
| */ |
| ExecEndNode(outerPlanState(node)); |
| |
| SO_printf("ExecEndIncrementalSort: sort node shutdown\n"); |
| } |
| |
| void |
| ExecReScanIncrementalSort(IncrementalSortState *node) |
| { |
| PlanState *outerPlan = outerPlanState(node); |
| |
| /* |
| * Incremental sort doesn't support efficient rescan even when parameters |
| * haven't changed (e.g., rewind) because unlike regular sort we don't |
| * store all tuples at once for the full sort. |
| * |
| * So even if EXEC_FLAG_REWIND is set we just reset all of our state and |
| * re-execute the sort along with the child node. Incremental sort itself |
| * can't do anything smarter, but maybe the child nodes can. |
| * |
| * In theory if we've only filled the full sort with one batch (and |
| * haven't reset it for a new batch yet) then we could efficiently rewind, |
| * but that seems a narrow enough case that it's not worth handling |
| * specially at this time. |
| */ |
| |
| /* must drop pointer to sort result tuple */ |
| ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); |
| |
| if (node->group_pivot != NULL) |
| ExecClearTuple(node->group_pivot); |
| if (node->transfer_tuple != NULL) |
| ExecClearTuple(node->transfer_tuple); |
| |
| node->outerNodeDone = false; |
| node->n_fullsort_remaining = 0; |
| node->bound_Done = 0; |
| |
| node->execution_status = INCSORT_LOADFULLSORT; |
| |
| /* |
| * If we've already set up either of the sort states, we need to reset them. |
| * We could end them and null out the pointers, but there's no reason to |
| * repay the setup cost, and because ExecIncrementalSort guards presorted |
| * column functions by checking to see if the full sort state has been |
| * initialized yet, setting the sort states to null here might actually |
| * cause a leak. |
| */ |
| if (node->fullsort_state != NULL) |
| tuplesort_reset(node->fullsort_state); |
| if (node->prefixsort_state != NULL) |
| tuplesort_reset(node->prefixsort_state); |
| |
| /* |
| * If chgParam of the subnode is not null, then the plan will be re-scanned |
| * by the first ExecProcNode. |
| */ |
| if (outerPlan->chgParam == NULL) |
| ExecReScan(outerPlan); |
| } |
| |
| /* ---------------------------------------------------------------- |
| * Parallel Query Support |
| * ---------------------------------------------------------------- |
| */ |
| |
| /* ---------------------------------------------------------------- |
| * ExecIncrementalSortEstimate |
| * |
| * Estimate space required to propagate sort statistics. |
| * ---------------------------------------------------------------- |
| */ |
| void |
| ExecIncrementalSortEstimate(IncrementalSortState *node, ParallelContext *pcxt) |
| { |
| Size size; |
| |
| /* don't need this if not instrumenting or no workers */ |
| if (!node->ss.ps.instrument || pcxt->nworkers == 0) |
| return; |
| |
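| /* sinfo is a flexible array member: one IncrementalSortInfo per worker */ |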
| size = mul_size(pcxt->nworkers, sizeof(IncrementalSortInfo)); |
| size = add_size(size, offsetof(SharedIncrementalSortInfo, sinfo)); |
| shm_toc_estimate_chunk(&pcxt->estimator, size); |
| shm_toc_estimate_keys(&pcxt->estimator, 1); |
| } |
| |
| /* ---------------------------------------------------------------- |
| * ExecIncrementalSortInitializeDSM |
| * |
| * Initialize DSM space for sort statistics. |
| * ---------------------------------------------------------------- |
| */ |
| void |
| ExecIncrementalSortInitializeDSM(IncrementalSortState *node, ParallelContext *pcxt) |
| { |
| Size size; |
| |
| /* don't need this if not instrumenting or no workers */ |
| if (!node->ss.ps.instrument || pcxt->nworkers == 0) |
| return; |
| |
| size = offsetof(SharedIncrementalSortInfo, sinfo) |
| + pcxt->nworkers * sizeof(IncrementalSortInfo); |
| node->shared_info = shm_toc_allocate(pcxt->toc, size); |
| /* ensure any unfilled slots will contain zeroes */ |
| memset(node->shared_info, 0, size); |
| node->shared_info->num_workers = pcxt->nworkers; |
| shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, |
| node->shared_info); |
| } |
| |
| /* ---------------------------------------------------------------- |
| * ExecIncrementalSortInitializeWorker |
| * |
| * Attach worker to DSM space for sort statistics. |
| * ---------------------------------------------------------------- |
| */ |
| void |
| ExecIncrementalSortInitializeWorker(IncrementalSortState *node, ParallelWorkerContext *pwcxt) |
| { |
| node->shared_info = |
| shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, true); |
| node->am_worker = true; |
| } |
| |
| /* ---------------------------------------------------------------- |
| * ExecIncrementalSortRetrieveInstrumentation |
| * |
| * Transfer sort statistics from DSM to private memory. |
| * ---------------------------------------------------------------- |
| */ |
| void |
| ExecIncrementalSortRetrieveInstrumentation(IncrementalSortState *node) |
| { |
| Size size; |
| SharedIncrementalSortInfo *si; |
| |
| if (node->shared_info == NULL) |
| return; |
| |
| size = offsetof(SharedIncrementalSortInfo, sinfo) |
| + node->shared_info->num_workers * sizeof(IncrementalSortInfo); |
| si = palloc(size); |
| memcpy(si, node->shared_info, size); |
| node->shared_info = si; |
| } |