| /*------------------------------------------------------------------------- |
| * |
| * execPartition.c |
| * Support routines for partitioning. |
| * |
| * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group |
| * Portions Copyright (c) 1994, Regents of the University of California |
| * |
| * IDENTIFICATION |
| * src/backend/executor/execPartition.c |
| * |
| *------------------------------------------------------------------------- |
| */ |
| #include "postgres.h" |
| |
| #include "access/table.h" |
| #include "access/tableam.h" |
| #include "catalog/partition.h" |
| #include "catalog/pg_inherits.h" |
| #include "catalog/pg_type.h" |
| #include "executor/execPartition.h" |
| #include "executor/executor.h" |
| #include "executor/nodeModifyTable.h" |
| #include "foreign/fdwapi.h" |
| #include "mb/pg_wchar.h" |
| #include "miscadmin.h" |
| #include "nodes/makefuncs.h" |
| #include "partitioning/partbounds.h" |
| #include "partitioning/partdesc.h" |
| #include "partitioning/partprune.h" |
| #include "rewrite/rewriteManip.h" |
| #include "utils/acl.h" |
| #include "utils/lsyscache.h" |
| #include "utils/partcache.h" |
| #include "utils/rls.h" |
| #include "utils/ruleutils.h" |
| |
| #include "cdb/cdbaocsam.h" |
| #include "cdb/cdbappendonlyam.h" |
| |
| /* |
| * Helper macro to determine whether a ModifyTable node came from a dynamic |
| * scan (produced by Orca), which requires tuple routing to determine the |
| * correct partition. |
| */ |
| #define IsDynamicScan(plan) (castNode(ModifyTable, plan) != NULL && \ |
| castNode(ModifyTable, plan)->forceTupleRouting) |
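| /* |
| * IsDynamicScan() is consulted below in ExecFindPartition(), where DELETE |
| * tuples coming from an Orca dynamic scan need their attribute numbers |
| * remapped before the partition key can be extracted. |
| */ |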
| /*----------------------- |
| * PartitionTupleRouting - Encapsulates all information required to |
| * route a tuple inserted into a partitioned table to one of its leaf |
| * partitions. |
| * |
| * partition_root |
| * The partitioned table that's the target of the command. |
| * |
| * partition_dispatch_info |
| * Array of 'max_dispatch' elements containing a pointer to a |
| * PartitionDispatch object for every partitioned table touched by tuple |
| * routing. The entry for the target partitioned table is *always* |
| * present in the 0th element of this array. See comment for |
| * PartitionDispatchData->indexes for details on how this array is |
| * indexed. |
| * |
| * nonleaf_partitions |
| * Array of 'max_dispatch' elements containing pointers to fake |
| * ResultRelInfo objects for nonleaf partitions, useful for checking |
| * the partition constraint. |
| * |
| * num_dispatch |
| * The current number of items stored in the 'partition_dispatch_info' |
| * array. Also serves as the index of the next free array element for |
| * new PartitionDispatch objects that need to be stored. |
| * |
| * max_dispatch |
| * The current allocated size of the 'partition_dispatch_info' array. |
| * |
| * partitions |
| * Array of 'max_partitions' elements containing a pointer to a |
| * ResultRelInfo for every leaf partition touched by tuple routing. |
| * Some of these are pointers to ResultRelInfos which are borrowed out of |
| * the owning ModifyTableState node. The remainder have been built |
| * especially for tuple routing. See comment for |
| * PartitionDispatchData->indexes for details on how this array is |
| * indexed. |
| * |
| * is_borrowed_rel |
| * Array of 'max_partitions' booleans recording whether a given entry |
| * in 'partitions' is a ResultRelInfo pointer borrowed from the owning |
| * ModifyTableState node, rather than being built here. |
| * |
| * num_partitions |
| * The current number of items stored in the 'partitions' array. Also |
| * serves as the index of the next free array element for new |
| * ResultRelInfo objects that need to be stored. |
| * |
| * max_partitions |
| * The current allocated size of the 'partitions' array. |
| * |
| * memcxt |
| * Memory context used to allocate subsidiary structs. |
| *----------------------- |
| */ |
| struct PartitionTupleRouting |
| { |
| Relation partition_root; |
| PartitionDispatch *partition_dispatch_info; |
| ResultRelInfo **nonleaf_partitions; |
| int num_dispatch; |
| int max_dispatch; |
| ResultRelInfo **partitions; |
| bool *is_borrowed_rel; |
| int num_partitions; |
| int max_partitions; |
| MemoryContext memcxt; |
| }; |
| |
| /*----------------------- |
| * PartitionDispatch - information about one partitioned table in a partition |
| * hierarchy required to route a tuple to any of its partitions. A |
| * PartitionDispatch is always encapsulated inside a PartitionTupleRouting |
| * struct and stored inside its 'partition_dispatch_info' array. |
| * |
| * reldesc |
| * Relation descriptor of the table |
| * |
| * key |
| * Partition key information of the table |
| * |
| * keystate |
| * Execution state required for expressions in the partition key |
| * |
| * partdesc |
| * Partition descriptor of the table |
| * |
| * tupslot |
| * A standalone TupleTableSlot initialized with this table's tuple |
| * descriptor, or NULL if no tuple conversion from the parent's rowtype is |
| * required. |
| * |
| * tupmap |
| * TupleConversionMap to convert from the parent's rowtype to this table's |
| * rowtype (when extracting the partition key of a tuple just before |
| * routing it through this table). A NULL value is stored if no tuple |
| * conversion is required. |
| * |
| * indexes |
| * Array of partdesc->nparts elements. For leaf partitions the index |
| * corresponds to the partition's ResultRelInfo in the encapsulating |
| * PartitionTupleRouting's partitions array. For partitioned partitions, |
| * the index corresponds to the PartitionDispatch for it in its |
| * partition_dispatch_info array. -1 indicates we've not yet allocated |
| * anything in PartitionTupleRouting for the partition. |
| *----------------------- |
| */ |
| typedef struct PartitionDispatchData |
| { |
| Relation reldesc; |
| PartitionKey key; |
| List *keystate; /* list of ExprState */ |
| PartitionDesc partdesc; |
| TupleTableSlot *tupslot; |
| AttrMap *tupmap; |
| int indexes[FLEXIBLE_ARRAY_MEMBER]; |
| } PartitionDispatchData; |
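| |
| /* |
| * Illustrative example (hypothetical hierarchy): for a root table with two |
| * partitions p1 (a leaf) and p2 (itself sub-partitioned), the root's |
| * PartitionDispatch has an indexes[] entry for each. Once a tuple has been |
| * routed to p1, its entry holds an offset into PartitionTupleRouting's |
| * 'partitions' array; once a tuple has descended into p2, that entry holds |
| * an offset into 'partition_dispatch_info' instead. Until then, both |
| * entries remain -1. |
| */ |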
| |
| |
| static ResultRelInfo *ExecInitPartitionInfo(ModifyTableState *mtstate, |
| EState *estate, PartitionTupleRouting *proute, |
| PartitionDispatch dispatch, |
| ResultRelInfo *rootResultRelInfo, |
| int partidx); |
| static void ExecInitRoutingInfo(ModifyTableState *mtstate, |
| EState *estate, |
| PartitionTupleRouting *proute, |
| PartitionDispatch dispatch, |
| ResultRelInfo *partRelInfo, |
| int partidx, |
| bool is_borrowed_rel); |
| static PartitionDispatch ExecInitPartitionDispatchInfo(EState *estate, |
| PartitionTupleRouting *proute, |
| Oid partoid, PartitionDispatch parent_pd, |
| int partidx, ResultRelInfo *rootResultRelInfo); |
| static void FormPartitionKeyDatum(PartitionDispatch pd, |
| TupleTableSlot *slot, |
| EState *estate, |
| Datum *values, |
| bool *isnull, |
| AttrMap *attno_map); |
| |
| static char *ExecBuildSlotPartitionKeyDescription(Relation rel, |
| Datum *values, |
| bool *isnull, |
| int maxfieldlen); |
| static List *adjust_partition_colnos(List *colnos, ResultRelInfo *leaf_part_rri); |
| static List *adjust_partition_colnos_using_map(List *colnos, AttrMap *attrMap); |
| static void InitPartitionPruneContext(PartitionPruneContext *context, |
| List *pruning_steps, |
| PartitionDesc partdesc, |
| PartitionKey partkey, |
| PlanState *planstate, |
| ExprContext *econtext); |
| static void PartitionPruneFixSubPlanMap(PartitionPruneState *prunestate, |
| Bitmapset *initially_valid_subplans, |
| int n_total_subplans); |
| static void find_matching_subplans_recurse(PartitionPruningData *prunedata, |
| PartitionedRelPruningData *pprune, |
| bool initial_prune, |
| Bitmapset **validsubplans); |
| |
| |
| /* |
| * ExecSetupPartitionTupleRouting - sets up information needed during |
| * tuple routing for partitioned tables, encapsulates it in |
| * PartitionTupleRouting, and returns it. |
| * |
| * Callers must use the returned PartitionTupleRouting during calls to |
| * ExecFindPartition(). The actual ResultRelInfo for a partition is only |
| * allocated when the partition is found for the first time. |
| * |
| * The current memory context is used to allocate this struct and all |
| * subsidiary structs that will be allocated from it later on. Typically |
| * it should be estate->es_query_cxt. |
| */ |
| PartitionTupleRouting * |
| ExecSetupPartitionTupleRouting(EState *estate, Relation rel) |
| { |
| PartitionTupleRouting *proute; |
| |
| /* |
| * Here we attempt to expend as little effort as possible in setting up |
| * the PartitionTupleRouting. Each partition's ResultRelInfo is built on |
| * demand, only when we actually need to route a tuple to that partition. |
| * The reason for this is that a common case is for INSERT to insert a |
| * single tuple into a partitioned table and this must be fast. |
| */ |
| proute = (PartitionTupleRouting *) palloc0(sizeof(PartitionTupleRouting)); |
| proute->partition_root = rel; |
| proute->memcxt = CurrentMemoryContext; |
| /* Rest of members initialized by zeroing */ |
| |
| /* |
| * Initialize this table's PartitionDispatch object. Here we pass in the |
| * parent as NULL as we don't need to care about any parent of the target |
| * partitioned table. |
| */ |
| ExecInitPartitionDispatchInfo(estate, proute, RelationGetRelid(rel), |
| NULL, 0, NULL); |
| |
| return proute; |
| } |
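| |
| /* |
| * A rough usage sketch (hedged; the real call sites live in |
| * nodeModifyTable.c and in COPY FROM, and the variable names here are |
| * illustrative only): |
| * |
| * proute = ExecSetupPartitionTupleRouting(estate, rootRel); |
| * for each tuple to route: |
| * rri = ExecFindPartition(mtstate, rootResultRelInfo, proute, |
| * slot, estate); |
| * ... insert the routed tuple using rri ... |
| * ExecCleanupTupleRouting(mtstate, proute); |
| */ |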
| |
| /* |
| * ExecFindPartition -- Return the ResultRelInfo for the leaf partition that |
| * the tuple contained in *slot should belong to. |
| * |
| * If the partition's ResultRelInfo does not yet exist in 'proute' then we set |
| * one up or reuse one from mtstate's resultRelInfo array. When reusing a |
| * ResultRelInfo from the mtstate we verify that the relation is a valid |
| * target for INSERTs and initialize tuple routing information. |
| * |
| * rootResultRelInfo is the relation named in the query. |
| * |
| * estate must be non-NULL; we'll need it to compute any expressions in the |
| * partition keys. Also, its per-tuple contexts are used as evaluation |
| * scratch space. |
| * |
| * If no leaf partition is found, this routine errors out with the appropriate |
| * error message. An error may also be raised if the found target partition |
| * is not a valid target for an INSERT. |
| */ |
| ResultRelInfo * |
| ExecFindPartition(ModifyTableState *mtstate, |
| ResultRelInfo *rootResultRelInfo, |
| PartitionTupleRouting *proute, |
| TupleTableSlot *slot, EState *estate) |
| { |
| PartitionDispatch *pd = proute->partition_dispatch_info; |
| Datum values[PARTITION_MAX_KEYS]; |
| bool isnull[PARTITION_MAX_KEYS]; |
| Relation rel; |
| PartitionDispatch dispatch; |
| PartitionDesc partdesc; |
| ExprContext *ecxt = GetPerTupleExprContext(estate); |
| TupleTableSlot *ecxt_scantuple_saved = ecxt->ecxt_scantuple; |
| TupleTableSlot *rootslot = slot; |
| TupleTableSlot *myslot = NULL; |
| MemoryContext oldcxt; |
| ResultRelInfo *rri = NULL; |
| |
| /* use per-tuple context here to avoid leaking memory */ |
| oldcxt = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); |
| |
| /* |
| * First check the root table's partition constraint, if any. No point in |
| * routing the tuple if it doesn't belong in the root table itself. |
| */ |
| if (rootResultRelInfo->ri_RelationDesc->rd_rel->relispartition) |
| ExecPartitionCheck(rootResultRelInfo, slot, estate, true); |
| |
| /* start with the root partitioned table */ |
| dispatch = pd[0]; |
| while (dispatch != NULL) |
| { |
| int partidx = -1; |
| bool is_leaf; |
| |
| CHECK_FOR_INTERRUPTS(); |
| |
| rel = dispatch->reldesc; |
| partdesc = dispatch->partdesc; |
| |
| /* |
| * Extract partition key from tuple. Expression evaluation machinery |
| * that FormPartitionKeyDatum() invokes expects ecxt_scantuple to |
| * point to the correct tuple slot. The slot might have changed from |
| * what was used for the parent table if the table of the current |
| * partitioning level has a different tuple descriptor from the parent. |
| * So update ecxt_scantuple accordingly. |
| */ |
| ecxt->ecxt_scantuple = slot; |
| /* |
| * GPDB: if the operation is a DELETE and its child is a dynamic scan, we |
| * need to remap attribute numbers from the tuple to the relation. The |
| * tuple descriptor's columns do not exactly match the relation's |
| * attributes in the catalog -- the tuple carries only a subset of them. |
| * FormPartitionKeyDatum() uses this map to locate the partition key in |
| * the tuple descriptor and fetch its value from the tuple. |
| */ |
| AttrMap *attno_map = NULL; |
| if (IsDynamicScan(mtstate->ps.plan) |
| && mtstate->operation == CMD_DELETE) |
| { |
| attno_map = convert_tuples_by_name_map_missing_ok(slot->tts_tupleDescriptor, |
| RelationGetDescr(dispatch->reldesc)); |
| } |
| /* Populate values/isnull with partition key value from tuple */ |
| FormPartitionKeyDatum(dispatch, slot, estate, values, isnull, attno_map); |
| |
| /* |
| * If this partitioned table has no partitions or no partition for |
| * these values, error out. |
| */ |
| if (partdesc->nparts == 0 || |
| (partidx = get_partition_for_tuple(dispatch->key, dispatch->partdesc, values, isnull)) < 0) |
| { |
| char *val_desc; |
| |
| val_desc = ExecBuildSlotPartitionKeyDescription(rel, |
| values, isnull, 64); |
| Assert(OidIsValid(RelationGetRelid(rel))); |
| ereport(ERROR, |
| /* |
| * GPDB: use dedicated error code for this, not the generic |
| * ERRCODE_CHECK_VIOLATION as in upstream. The SREH stuff |
| * only catches errors in the ERRCODE_DATA_EXCEPTION class, |
| * so without this, this error would not be caught by SREH. |
| */ |
| (errcode(ERRCODE_NO_PARTITION_FOR_PARTITIONING_KEY), |
| errmsg("no partition of relation \"%s\" found for row", |
| RelationGetRelationName(rel)), |
| val_desc ? |
| errdetail("Partition key of the failing row contains %s.", |
| val_desc) : 0, |
| errtable(rel))); |
| } |
| |
| is_leaf = partdesc->is_leaf[partidx]; |
| if (is_leaf) |
| { |
| /* |
| * We've reached the leaf -- hurray, we're done. Look to see if |
| * we've already got a ResultRelInfo for this partition. |
| */ |
| if (likely(dispatch->indexes[partidx] >= 0)) |
| { |
| /* ResultRelInfo already built */ |
| Assert(dispatch->indexes[partidx] < proute->num_partitions); |
| rri = proute->partitions[dispatch->indexes[partidx]]; |
| } |
| else |
| { |
| /* |
| * If the partition is known in the owning ModifyTableState |
| * node, we can re-use that ResultRelInfo instead of creating |
| * a new one with ExecInitPartitionInfo(). |
| */ |
| rri = ExecLookupResultRelByOid(mtstate, |
| partdesc->oids[partidx], |
| true, false); |
| if (rri) |
| { |
| /* Verify this ResultRelInfo allows INSERTs */ |
| CheckValidResultRel(rri, CMD_INSERT, NULL); |
| |
| /* |
| * Initialize information needed to insert this and |
| * subsequent tuples routed to this partition. |
| */ |
| ExecInitRoutingInfo(mtstate, estate, proute, dispatch, |
| rri, partidx, true); |
| } |
| else |
| { |
| /* We need to create a new one. */ |
| rri = ExecInitPartitionInfo(mtstate, estate, proute, |
| dispatch, |
| rootResultRelInfo, partidx); |
| } |
| } |
| Assert(rri != NULL); |
| |
| /* Signal to terminate the loop */ |
| dispatch = NULL; |
| } |
| else |
| { |
| /* |
| * Partition is a sub-partitioned table; get the PartitionDispatch |
| */ |
| if (likely(dispatch->indexes[partidx] >= 0)) |
| { |
| /* Already built. */ |
| Assert(dispatch->indexes[partidx] < proute->num_dispatch); |
| |
| rri = proute->nonleaf_partitions[dispatch->indexes[partidx]]; |
| |
| /* |
| * Move down to the next partition level and search again |
| * until we find a leaf partition that matches this tuple |
| */ |
| dispatch = pd[dispatch->indexes[partidx]]; |
| } |
| else |
| { |
| /* Not yet built. Do that now. */ |
| PartitionDispatch subdispatch; |
| |
| /* |
| * Create the new PartitionDispatch. We pass the current one |
| * in as the parent PartitionDispatch |
| */ |
| subdispatch = ExecInitPartitionDispatchInfo(estate, |
| proute, |
| partdesc->oids[partidx], |
| dispatch, partidx, |
| mtstate->rootResultRelInfo); |
| Assert(dispatch->indexes[partidx] >= 0 && |
| dispatch->indexes[partidx] < proute->num_dispatch); |
| |
| rri = proute->nonleaf_partitions[dispatch->indexes[partidx]]; |
| dispatch = subdispatch; |
| } |
| |
| /* |
| * Convert the tuple to the new parent's layout, if different from |
| * the previous parent. |
| */ |
| if (dispatch->tupslot) |
| { |
| AttrMap *map = dispatch->tupmap; |
| TupleTableSlot *tempslot = myslot; |
| |
| myslot = dispatch->tupslot; |
| slot = execute_attr_map_slot(map, slot, myslot); |
| |
| if (tempslot != NULL) |
| ExecClearTuple(tempslot); |
| } |
| } |
| |
| /* |
| * If this partition is the default one, we must check its partition |
| * constraint now, which may have changed concurrently due to |
| * partitions being added to the parent. |
| * |
| * (We do this here, and do not rely on ExecInsert doing it, because |
| * we don't want to miss doing it for non-leaf partitions.) |
| */ |
| if (partidx == partdesc->boundinfo->default_index) |
| { |
| /* |
| * The tuple must match the partition's layout for the constraint |
| * expression to be evaluated successfully. If the partition is |
| * sub-partitioned, that would already be the case due to the code |
| * above, but for a leaf partition the tuple still matches the |
| * parent's layout. |
| * |
| * Note that we have a map to convert from root to current |
| * partition, but not from immediate parent to current partition. |
| * So if we have to convert, do it from the root slot; if not, use |
| * the root slot as-is. |
| */ |
| if (is_leaf) |
| { |
| TupleConversionMap *map = ExecGetRootToChildMap(rri, estate); |
| |
| if (map) |
| slot = execute_attr_map_slot(map->attrMap, rootslot, |
| rri->ri_PartitionTupleSlot); |
| else |
| slot = rootslot; |
| } |
| /* |
| * GPDB: if this is a DELETE operation with a dynamic scan child, convert |
| * the slot to be based on the relation descriptor, since |
| * ExecPartitionCheck() uses the partition bounds defined on the relation, |
| * not on the slot, whereas the tuple slot returned by the scan includes |
| * only the projection columns. |
| */ |
| if (IsDynamicScan(mtstate->ps.plan) |
| && mtstate->operation == CMD_DELETE) |
| { |
| TupleTableSlot *slotOut = MakeSingleTupleTableSlot(RelationGetDescr(rri->ri_RelationDesc), &TTSOpsVirtual); |
| slot = execute_attr_map_slot(attno_map, slot, slotOut); |
| ExecPartitionCheck(rri, slot, estate, true); |
| |
| /* |
| * Drop the standalone slot. At this point 'slot' points to slotOut, so |
| * dropping it releases the slot along with its reference to the |
| * relation's tuple descriptor. |
| */ |
| ExecDropSingleTupleTableSlot(slot); |
| } |
| else |
| { |
| ExecPartitionCheck(rri, slot, estate, true); |
| } |
| } |
| if (attno_map) |
| free_attrmap(attno_map); |
| } |
| |
| /* Release the tuple in the lowest parent's dedicated slot. */ |
| if (myslot != NULL) |
| ExecClearTuple(myslot); |
| /* and restore ecxt's scantuple */ |
| ecxt->ecxt_scantuple = ecxt_scantuple_saved; |
| MemoryContextSwitchTo(oldcxt); |
| |
| return rri; |
| } |
| |
| /* |
| * ExecInitPartitionInfo |
| * Lock the partition and initialize ResultRelInfo. Also set up other |
| * information for the partition and store it in the next empty slot in |
| * the proute->partitions array. |
| * |
| * Returns the ResultRelInfo |
| */ |
| static ResultRelInfo * |
| ExecInitPartitionInfo(ModifyTableState *mtstate, EState *estate, |
| PartitionTupleRouting *proute, |
| PartitionDispatch dispatch, |
| ResultRelInfo *rootResultRelInfo, |
| int partidx) |
| { |
| ModifyTable *node = (ModifyTable *) mtstate->ps.plan; |
| Oid partOid = dispatch->partdesc->oids[partidx]; |
| Relation partrel; |
| int firstVarno = mtstate->resultRelInfo[0].ri_RangeTableIndex; |
| Relation firstResultRel = mtstate->resultRelInfo[0].ri_RelationDesc; |
| ResultRelInfo *leaf_part_rri; |
| MemoryContext oldcxt; |
| AttrMap *part_attmap = NULL; |
| bool found_whole_row; |
| |
| oldcxt = MemoryContextSwitchTo(proute->memcxt); |
| |
| partrel = table_open(partOid, RowExclusiveLock); |
| |
| leaf_part_rri = makeNode(ResultRelInfo); |
| |
| /* |
| * Init the leaf partition's ResultRelInfo. |
| * |
| * GPDB: ri_RangeTableIndex is a dummy value here, because we will rebuild |
| * ri_RelationDesc later. Assign 1 instead of 0, since 0 causes failures |
| * during EvalPlanQual processing; foreign-table (FDW) partitions keep 0, |
| * since the FDW code paths handle 0 themselves. |
| * Related issue: https://github.com/greenplum-db/gpdb/issues/14935 |
| */ |
| InitResultRelInfo(leaf_part_rri, |
| partrel, |
| partrel->rd_rel->relkind != RELKIND_FOREIGN_TABLE ? 1 : 0, |
| rootResultRelInfo, |
| estate->es_instrument); |
| |
| /* |
| * Verify result relation is a valid target for an INSERT. An UPDATE of a |
| * partition-key becomes a DELETE+INSERT operation, so this check is still |
| * required when the operation is CMD_UPDATE. |
| */ |
| CheckValidResultRel(leaf_part_rri, CMD_INSERT, NULL); |
| |
| /* |
| * Open partition indices. The user may have asked to check for conflicts |
| * within this leaf partition and do "nothing" instead of throwing an |
| * error. Be prepared in that case by initializing the index information |
| * needed by ExecInsert() to perform speculative insertions. |
| */ |
| if (partrel->rd_rel->relhasindex && |
| leaf_part_rri->ri_IndexRelationDescs == NULL) |
| ExecOpenIndices(leaf_part_rri, |
| (node != NULL && |
| node->onConflictAction != ONCONFLICT_NONE)); |
| |
| /* |
| * Build WITH CHECK OPTION constraints for the partition. Note that we |
| * didn't build the withCheckOptionList for partitions within the planner, |
| * but simple translation of varattnos will suffice. This only occurs for |
| * the INSERT case or in the case of UPDATE/MERGE tuple routing where we |
| * didn't find a result rel to reuse. |
| */ |
| if (node && node->withCheckOptionLists != NIL) |
| { |
| List *wcoList; |
| List *wcoExprs = NIL; |
| ListCell *ll; |
| |
| /* |
| * In the case of INSERT on a partitioned table, there is only one |
| * plan. Likewise, there is only one WCO list, not one per partition. |
| * For UPDATE/MERGE, there are as many WCO lists as there are plans. |
| */ |
| Assert((node->operation == CMD_INSERT && |
| list_length(node->withCheckOptionLists) == 1 && |
| list_length(node->resultRelations) == 1) || |
| (node->operation == CMD_UPDATE && |
| list_length(node->withCheckOptionLists) == |
| list_length(node->resultRelations)) || |
| (node->operation == CMD_MERGE && |
| list_length(node->withCheckOptionLists) == |
| list_length(node->resultRelations))); |
| |
| /* |
| * Use the WCO list of the first plan as a reference to calculate |
| * attno's for the WCO list of this partition. In the INSERT case, |
| * that refers to the root partitioned table, whereas in the UPDATE |
| * tuple routing case, that refers to the first partition in the |
| * mtstate->resultRelInfo array. In any case, both that relation and |
| * this partition should have the same columns, so we should be able |
| * to map attributes successfully. |
| */ |
| wcoList = linitial(node->withCheckOptionLists); |
| |
| /* |
| * Convert Vars in it to contain this partition's attribute numbers. |
| */ |
| part_attmap = |
| build_attrmap_by_name(RelationGetDescr(partrel), |
| RelationGetDescr(firstResultRel), |
| false); |
| wcoList = (List *) |
| map_variable_attnos((Node *) wcoList, |
| firstVarno, 0, |
| part_attmap, |
| RelationGetForm(partrel)->reltype, |
| &found_whole_row); |
| /* We ignore the value of found_whole_row. */ |
| |
| foreach(ll, wcoList) |
| { |
| WithCheckOption *wco = lfirst_node(WithCheckOption, ll); |
| ExprState *wcoExpr = ExecInitQual(castNode(List, wco->qual), |
| &mtstate->ps); |
| |
| wcoExprs = lappend(wcoExprs, wcoExpr); |
| } |
| |
| leaf_part_rri->ri_WithCheckOptions = wcoList; |
| leaf_part_rri->ri_WithCheckOptionExprs = wcoExprs; |
| } |
| |
| /* |
| * Build the RETURNING projection for the partition. Note that we didn't |
| * build the returningList for partitions within the planner, but simple |
| * translation of varattnos will suffice. This only occurs for the INSERT |
| * case or in the case of UPDATE tuple routing where we didn't find a |
| * result rel to reuse. |
| */ |
| if (node && node->returningLists != NIL) |
| { |
| TupleTableSlot *slot; |
| ExprContext *econtext; |
| List *returningList; |
| |
| /* See the comment above for WCO lists. */ |
| /* (except no RETURNING support for MERGE yet) */ |
| Assert((node->operation == CMD_INSERT && |
| list_length(node->returningLists) == 1 && |
| list_length(node->resultRelations) == 1) || |
| (node->operation == CMD_UPDATE && |
| list_length(node->returningLists) == |
| list_length(node->resultRelations))); |
| |
| /* |
| * Use the RETURNING list of the first plan as a reference to |
| * calculate attno's for the RETURNING list of this partition. See |
| * the comment above for WCO lists for more details on why this is |
| * okay. |
| */ |
| returningList = linitial(node->returningLists); |
| |
| /* |
| * Convert Vars in it to contain this partition's attribute numbers. |
| */ |
| if (part_attmap == NULL) |
| part_attmap = |
| build_attrmap_by_name(RelationGetDescr(partrel), |
| RelationGetDescr(firstResultRel), |
| false); |
| returningList = (List *) |
| map_variable_attnos((Node *) returningList, |
| firstVarno, 0, |
| part_attmap, |
| RelationGetForm(partrel)->reltype, |
| &found_whole_row); |
| /* We ignore the value of found_whole_row. */ |
| |
| leaf_part_rri->ri_returningList = returningList; |
| |
| /* |
| * Initialize the projection itself. |
| * |
| * Use the slot and the expression context that would have been set up |
| * in ExecInitModifyTable() for projection's output. |
| */ |
| Assert(mtstate->ps.ps_ResultTupleSlot != NULL); |
| slot = mtstate->ps.ps_ResultTupleSlot; |
| Assert(mtstate->ps.ps_ExprContext != NULL); |
| econtext = mtstate->ps.ps_ExprContext; |
| leaf_part_rri->ri_projectReturning = |
| ExecBuildProjectionInfo(returningList, econtext, slot, |
| &mtstate->ps, RelationGetDescr(partrel)); |
| } |
| |
| /* Set up information needed for routing tuples to the partition. */ |
| ExecInitRoutingInfo(mtstate, estate, proute, dispatch, |
| leaf_part_rri, partidx, false); |
| |
| /* |
| * If there is an ON CONFLICT clause, initialize state for it. |
| */ |
| if (node && node->onConflictAction != ONCONFLICT_NONE) |
| { |
| TupleDesc partrelDesc = RelationGetDescr(partrel); |
| ExprContext *econtext = mtstate->ps.ps_ExprContext; |
| ListCell *lc; |
| List *arbiterIndexes = NIL; |
| |
| /* |
| * If there is a list of arbiter indexes, map it to a list of indexes |
| * in the partition. We do that by scanning the partition's index |
| * list and searching for ancestry relationships to each index in the |
| * ancestor table. |
| */ |
| if (rootResultRelInfo->ri_onConflictArbiterIndexes != NIL) |
| { |
| List *childIdxs; |
| |
| childIdxs = RelationGetIndexList(leaf_part_rri->ri_RelationDesc); |
| |
| foreach(lc, childIdxs) |
| { |
| Oid childIdx = lfirst_oid(lc); |
| List *ancestors; |
| ListCell *lc2; |
| |
| ancestors = get_partition_ancestors(childIdx); |
| foreach(lc2, rootResultRelInfo->ri_onConflictArbiterIndexes) |
| { |
| if (list_member_oid(ancestors, lfirst_oid(lc2))) |
| arbiterIndexes = lappend_oid(arbiterIndexes, childIdx); |
| } |
| list_free(ancestors); |
| } |
| } |
| |
| /* |
| * If the resulting lists are of unequal length, something is wrong. |
| * (This shouldn't happen, since arbiter index selection should not |
| * pick up an invalid index.) |
| */ |
| if (list_length(rootResultRelInfo->ri_onConflictArbiterIndexes) != |
| list_length(arbiterIndexes)) |
| elog(ERROR, "invalid arbiter index list"); |
| leaf_part_rri->ri_onConflictArbiterIndexes = arbiterIndexes; |
| |
| /* |
| * In the DO UPDATE case, we have some more state to initialize. |
| */ |
| if (node->onConflictAction == ONCONFLICT_UPDATE) |
| { |
| OnConflictSetState *onconfl = makeNode(OnConflictSetState); |
| TupleConversionMap *map; |
| |
| map = ExecGetRootToChildMap(leaf_part_rri, estate); |
| |
| Assert(node->onConflictSet != NIL); |
| Assert(rootResultRelInfo->ri_onConflict != NULL); |
| |
| leaf_part_rri->ri_onConflict = onconfl; |
| |
| /* |
| * Need a separate existing slot for each partition, as the |
| * partition could be of a different AM, even if the tuple |
| * descriptors match. |
| */ |
| onconfl->oc_Existing = |
| table_slot_create(leaf_part_rri->ri_RelationDesc, |
| &mtstate->ps.state->es_tupleTable); |
| |
| /* |
| * If the partition's tuple descriptor matches exactly the root |
| * parent (the common case), we can re-use most of the parent's ON |
| * CONFLICT SET state, skipping a bunch of work. Otherwise, we |
| * need to create state specific to this partition. |
| */ |
| if (map == NULL) |
| { |
| /* |
| * It's safe to reuse these from the partition root, as we |
| * only process one tuple at a time (therefore we won't |
| * overwrite needed data in slots), and the results of |
| * projections are independent of the underlying storage. |
| * Projections and where clauses themselves don't store state |
| * / are independent of the underlying storage. |
| */ |
| onconfl->oc_ProjSlot = |
| rootResultRelInfo->ri_onConflict->oc_ProjSlot; |
| onconfl->oc_ProjInfo = |
| rootResultRelInfo->ri_onConflict->oc_ProjInfo; |
| onconfl->oc_WhereClause = |
| rootResultRelInfo->ri_onConflict->oc_WhereClause; |
| } |
| else |
| { |
| List *onconflset; |
| List *onconflcols; |
| |
| /* |
| * Translate expressions in onConflictSet to account for |
| * different attribute numbers. For that, map partition |
| * varattnos twice: first to catch the EXCLUDED |
| * pseudo-relation (INNER_VAR), and second to handle the main |
| * target relation (firstVarno). |
| */ |
| onconflset = copyObject(node->onConflictSet); |
| if (part_attmap == NULL) |
| part_attmap = |
| build_attrmap_by_name(RelationGetDescr(partrel), |
| RelationGetDescr(firstResultRel), |
| false); |
| onconflset = (List *) |
| map_variable_attnos((Node *) onconflset, |
| INNER_VAR, 0, |
| part_attmap, |
| RelationGetForm(partrel)->reltype, |
| &found_whole_row); |
| /* We ignore the value of found_whole_row. */ |
| onconflset = (List *) |
| map_variable_attnos((Node *) onconflset, |
| firstVarno, 0, |
| part_attmap, |
| RelationGetForm(partrel)->reltype, |
| &found_whole_row); |
| /* We ignore the value of found_whole_row. */ |
| |
| /* Finally, adjust the target colnos to match the partition. */ |
| onconflcols = adjust_partition_colnos(node->onConflictCols, |
| leaf_part_rri); |
| |
| /* create the tuple slot for the UPDATE SET projection */ |
| onconfl->oc_ProjSlot = |
| table_slot_create(partrel, |
| &mtstate->ps.state->es_tupleTable); |
| |
| /* build UPDATE SET projection state */ |
| onconfl->oc_ProjInfo = |
| ExecBuildUpdateProjection(onconflset, |
| true, |
| onconflcols, |
| partrelDesc, |
| econtext, |
| onconfl->oc_ProjSlot, |
| &mtstate->ps); |
| |
| /* |
| * If there is a WHERE clause, initialize state where it will |
| * be evaluated, mapping the attribute numbers appropriately. |
| * As with onConflictSet, we need to map partition varattnos |
| * to the partition's tupdesc. |
| */ |
| if (node->onConflictWhere) |
| { |
| List *clause; |
| |
| clause = copyObject((List *) node->onConflictWhere); |
| clause = (List *) |
| map_variable_attnos((Node *) clause, |
| INNER_VAR, 0, |
| part_attmap, |
| RelationGetForm(partrel)->reltype, |
| &found_whole_row); |
| /* We ignore the value of found_whole_row. */ |
| clause = (List *) |
| map_variable_attnos((Node *) clause, |
| firstVarno, 0, |
| part_attmap, |
| RelationGetForm(partrel)->reltype, |
| &found_whole_row); |
| /* We ignore the value of found_whole_row. */ |
| onconfl->oc_WhereClause = |
| ExecInitQual((List *) clause, &mtstate->ps); |
| } |
| } |
| } |
| } |
| |
| /* |
| * Since we've just initialized this ResultRelInfo, it's not in any list |
| * attached to the estate as yet. Add it, so that it can be found later. |
| * |
| * Note that the entries in this list appear in no predetermined order, |
| * because partition result rels are initialized as and when they're |
| * needed. |
| */ |
| MemoryContextSwitchTo(estate->es_query_cxt); |
| estate->es_tuple_routing_result_relations = |
| lappend(estate->es_tuple_routing_result_relations, |
| leaf_part_rri); |
| |
| /* |
| * Initialize information about this partition that's needed to handle |
| * MERGE. We take the "first" result relation's mergeActionList as |
| * reference and make a copy for this relation, converting stuff that |
| * references attribute numbers to match this relation's. |
| * |
| * This duplicates much of the logic in ExecInitMerge(), so if something |
| * changes there, look here too. |
| */ |
| if (node && node->operation == CMD_MERGE) |
| { |
| List *firstMergeActionList = linitial(node->mergeActionLists); |
| ListCell *lc; |
| ExprContext *econtext = mtstate->ps.ps_ExprContext; |
| |
| if (part_attmap == NULL) |
| part_attmap = |
| build_attrmap_by_name(RelationGetDescr(partrel), |
| RelationGetDescr(firstResultRel), |
| false); |
| |
| if (unlikely(!leaf_part_rri->ri_projectNewInfoValid)) |
| ExecInitMergeTupleSlots(mtstate, leaf_part_rri); |
| |
| foreach(lc, firstMergeActionList) |
| { |
| /* Make a copy for this relation to be safe. */ |
| MergeAction *action = copyObject(lfirst(lc)); |
| MergeActionState *action_state; |
| List **list; |
| |
| /* Generate the action's state for this relation */ |
| action_state = makeNode(MergeActionState); |
| action_state->mas_action = action; |
| |
| /* And put the action in the appropriate list */ |
| if (action->matched) |
| list = &leaf_part_rri->ri_matchedMergeAction; |
| else |
| list = &leaf_part_rri->ri_notMatchedMergeAction; |
| *list = lappend(*list, action_state); |
| |
| switch (action->commandType) |
| { |
| case CMD_INSERT: |
| |
| /* |
| * ExecCheckPlanOutput() was already done on the targetlist when |
| * the "first" result relation was initialized, and it is the same |
| * for all result relations. |
| */ |
| action_state->mas_proj = |
| ExecBuildProjectionInfo(action->targetList, econtext, |
| leaf_part_rri->ri_newTupleSlot, |
| &mtstate->ps, |
| RelationGetDescr(partrel)); |
| break; |
| case CMD_UPDATE: |
| |
| /* |
| * Convert updateColnos from "first" result relation |
| * attribute numbers to this result rel's. |
| */ |
| if (part_attmap) |
| action->updateColnos = |
| adjust_partition_colnos_using_map(action->updateColnos, |
| part_attmap); |
| action_state->mas_proj = |
| ExecBuildUpdateProjection(action->targetList, |
| true, |
| action->updateColnos, |
| RelationGetDescr(leaf_part_rri->ri_RelationDesc), |
| econtext, |
| leaf_part_rri->ri_newTupleSlot, |
| NULL); |
| break; |
| case CMD_DELETE: |
| case CMD_NOTHING: |
| /* Nothing to do */ |
| break; |
| |
| default: |
| elog(ERROR, "unknown action in MERGE WHEN clause"); |
| } |
| |
| /* found_whole_row intentionally ignored. */ |
| action->qual = |
| map_variable_attnos(action->qual, |
| firstVarno, 0, |
| part_attmap, |
| RelationGetForm(partrel)->reltype, |
| &found_whole_row); |
| action_state->mas_whenqual = |
| ExecInitQual((List *) action->qual, &mtstate->ps); |
| } |
| } |
| |
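| /* |
| * GPDB: for append-optimized partitions (row- or column-oriented), |
| * initialize per-relation DML state for this operation now; extension |
| * table AMs get the same opportunity via ext_dml_init_hook. |
| */ |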
| if (RelationIsAoRows(leaf_part_rri->ri_RelationDesc)) |
| appendonly_dml_init(leaf_part_rri->ri_RelationDesc, mtstate->operation); |
| else if (RelationIsAoCols(leaf_part_rri->ri_RelationDesc)) |
| aoco_dml_init(leaf_part_rri->ri_RelationDesc, mtstate->operation); |
| else if (ext_dml_init_hook) |
| ext_dml_init_hook(leaf_part_rri->ri_RelationDesc, mtstate->operation); |
| |
| MemoryContextSwitchTo(oldcxt); |
| |
| return leaf_part_rri; |
| } |
| |
| /* |
| * ExecInitRoutingInfo |
| * Set up information needed for translating tuples between root |
| * partitioned table format and partition format, and keep track of it |
| * in PartitionTupleRouting. |
| */ |
| static void |
| ExecInitRoutingInfo(ModifyTableState *mtstate, |
| EState *estate, |
| PartitionTupleRouting *proute, |
| PartitionDispatch dispatch, |
| ResultRelInfo *partRelInfo, |
| int partidx, |
| bool is_borrowed_rel) |
| { |
| MemoryContext oldcxt; |
| int rri_index; |
| |
| oldcxt = MemoryContextSwitchTo(proute->memcxt); |
| |
| /* |
| * Set up tuple conversion between root parent and the partition if the |
| * two have different rowtypes. If conversion is indeed required, also |
| * initialize a slot dedicated to storing this partition's converted |
| * tuples. Various operations that are applied to tuples after routing, |
| * such as checking constraints, will refer to this slot. |
| */ |
| if (ExecGetRootToChildMap(partRelInfo, estate) != NULL) |
| { |
| Relation partrel = partRelInfo->ri_RelationDesc; |
| |
| /* |
| * This pins the partition's TupleDesc, which will be released at the |
| * end of the command. |
| */ |
| partRelInfo->ri_PartitionTupleSlot = |
| table_slot_create(partrel, &estate->es_tupleTable); |
| } |
| else |
| partRelInfo->ri_PartitionTupleSlot = NULL; |
| |
| /* |
| * If the partition is a foreign table, let the FDW init itself for |
| * routing tuples to the partition. |
| */ |
| if (partRelInfo->ri_FdwRoutine != NULL && |
| partRelInfo->ri_FdwRoutine->BeginForeignInsert != NULL) |
| partRelInfo->ri_FdwRoutine->BeginForeignInsert(mtstate, partRelInfo); |
| |
| /* |
| * Determine if the FDW supports batch insert and determine the batch size |
| * (a FDW may support batching, but it may be disabled for the |
| * server/table or for this particular query). |
| * |
| * If the FDW does not support batching, we set the batch size to 1. |
| */ |
| if (partRelInfo->ri_FdwRoutine != NULL && |
| partRelInfo->ri_FdwRoutine->GetForeignModifyBatchSize && |
| partRelInfo->ri_FdwRoutine->ExecForeignBatchInsert) |
| partRelInfo->ri_BatchSize = |
| partRelInfo->ri_FdwRoutine->GetForeignModifyBatchSize(partRelInfo); |
| else |
| partRelInfo->ri_BatchSize = 1; |
| |
| Assert(partRelInfo->ri_BatchSize >= 1); |
| |
| partRelInfo->ri_CopyMultiInsertBuffer = NULL; |
| |
| /* |
| * Keep track of it in the PartitionTupleRouting->partitions array. |
| */ |
| Assert(dispatch->indexes[partidx] == -1); |
| |
| rri_index = proute->num_partitions++; |
| |
| /* Allocate or enlarge the array, as needed */ |
| if (proute->num_partitions >= proute->max_partitions) |
| { |
| if (proute->max_partitions == 0) |
| { |
| proute->max_partitions = 8; |
| proute->partitions = (ResultRelInfo **) |
| palloc(sizeof(ResultRelInfo *) * proute->max_partitions); |
| proute->is_borrowed_rel = (bool *) |
| palloc(sizeof(bool) * proute->max_partitions); |
| } |
| else |
| { |
| proute->max_partitions *= 2; |
| proute->partitions = (ResultRelInfo **) |
| repalloc(proute->partitions, sizeof(ResultRelInfo *) * |
| proute->max_partitions); |
| proute->is_borrowed_rel = (bool *) |
| repalloc(proute->is_borrowed_rel, sizeof(bool) * |
| proute->max_partitions); |
| } |
| } |
| |
| proute->partitions[rri_index] = partRelInfo; |
| proute->is_borrowed_rel[rri_index] = is_borrowed_rel; |
| dispatch->indexes[partidx] = rri_index; |
| |
| MemoryContextSwitchTo(oldcxt); |
| } |
| |
| /* |
| * ExecInitPartitionDispatchInfo |
| * Lock the partitioned table (if not locked already) and initialize |
| * PartitionDispatch for a partitioned table and store it in the next |
| * available slot in the proute->partition_dispatch_info array. Also, |
| * record the index into this array in the parent_pd->indexes[] array in |
| * the partidx element so that we can properly retrieve the newly created |
| * PartitionDispatch later. |
| */ |
| static PartitionDispatch |
| ExecInitPartitionDispatchInfo(EState *estate, |
| PartitionTupleRouting *proute, Oid partoid, |
| PartitionDispatch parent_pd, int partidx, |
| ResultRelInfo *rootResultRelInfo) |
| { |
| Relation rel; |
| PartitionDesc partdesc; |
| PartitionDispatch pd; |
| int dispatchidx; |
| MemoryContext oldcxt; |
| |
| /* |
| * For data modification, it is better that executor does not include |
| * partitions being detached, except when running in snapshot-isolation |
| * mode. This means that a read-committed transaction immediately gets a |
| * "no partition for tuple" error when a tuple is inserted into a |
| * partition that's being detached concurrently, but a transaction in |
| * repeatable-read mode can still use such a partition. |
| */ |
| if (estate->es_partition_directory == NULL) |
| estate->es_partition_directory = |
| CreatePartitionDirectory(estate->es_query_cxt, |
| !IsolationUsesXactSnapshot()); |
| |
| oldcxt = MemoryContextSwitchTo(proute->memcxt); |
| |
| /* |
| * Only sub-partitioned tables need to be locked here. The root |
| * partitioned table will already have been locked as it's referenced in |
| * the query's rtable. |
| */ |
| if (partoid != RelationGetRelid(proute->partition_root)) |
| rel = table_open(partoid, RowExclusiveLock); |
| else |
| rel = proute->partition_root; |
| partdesc = PartitionDirectoryLookup(estate->es_partition_directory, rel); |
| |
| pd = (PartitionDispatch) palloc(offsetof(PartitionDispatchData, indexes) + |
| partdesc->nparts * sizeof(int)); |
| pd->reldesc = rel; |
| pd->key = RelationGetPartitionKey(rel); |
| pd->keystate = NIL; |
| pd->partdesc = partdesc; |
| if (parent_pd != NULL) |
| { |
| TupleDesc tupdesc = RelationGetDescr(rel); |
| |
| /* |
| * For sub-partitioned tables where the column order differs from its |
| * direct parent partitioned table, we must store a tuple table slot |
| * initialized with its tuple descriptor and a tuple conversion map to |
| * convert a tuple from its parent's rowtype to its own. This is to |
| * make sure that we are looking at the correct row using the correct |
| * tuple descriptor when computing its partition key for tuple |
| * routing. |
| */ |
| pd->tupmap = build_attrmap_by_name_if_req(RelationGetDescr(parent_pd->reldesc), |
| tupdesc, |
| false); |
| pd->tupslot = pd->tupmap ? |
| MakeSingleTupleTableSlot(tupdesc, &TTSOpsVirtual) : NULL; |
| } |
| else |
| { |
| /* Not required for the root partitioned table */ |
| pd->tupmap = NULL; |
| pd->tupslot = NULL; |
| } |
| |
| /* |
| * Initialize with -1 to signify that the corresponding partition's |
| * ResultRelInfo or PartitionDispatch has not been created yet. |
| */ |
| memset(pd->indexes, -1, sizeof(int) * partdesc->nparts); |
| |
| /* Track in PartitionTupleRouting for later use */ |
| dispatchidx = proute->num_dispatch++; |
| |
| /* Allocate or enlarge the array, as needed */ |
| if (proute->num_dispatch >= proute->max_dispatch) |
| { |
| if (proute->max_dispatch == 0) |
| { |
| proute->max_dispatch = 4; |
| proute->partition_dispatch_info = (PartitionDispatch *) |
| palloc(sizeof(PartitionDispatch) * proute->max_dispatch); |
| proute->nonleaf_partitions = (ResultRelInfo **) |
| palloc(sizeof(ResultRelInfo *) * proute->max_dispatch); |
| } |
| else |
| { |
| proute->max_dispatch *= 2; |
| proute->partition_dispatch_info = (PartitionDispatch *) |
| repalloc(proute->partition_dispatch_info, |
| sizeof(PartitionDispatch) * proute->max_dispatch); |
| proute->nonleaf_partitions = (ResultRelInfo **) |
| repalloc(proute->nonleaf_partitions, |
| sizeof(ResultRelInfo *) * proute->max_dispatch); |
| } |
| } |
| proute->partition_dispatch_info[dispatchidx] = pd; |
| |
| /* |
| * If setting up a PartitionDispatch for a sub-partitioned table, we may |
| * also need a minimally valid ResultRelInfo for checking the partition |
| * constraint later; set that up now. |
| */ |
| if (parent_pd) |
| { |
| ResultRelInfo *rri = makeNode(ResultRelInfo); |
| |
| InitResultRelInfo(rri, rel, 0, rootResultRelInfo, 0); |
| proute->nonleaf_partitions[dispatchidx] = rri; |
| } |
| else |
| proute->nonleaf_partitions[dispatchidx] = NULL; |
| |
| /* |
| * Finally, if setting up a PartitionDispatch for a sub-partitioned table, |
| * install a downlink in the parent to allow quick descent. |
| */ |
| if (parent_pd) |
| { |
| Assert(parent_pd->indexes[partidx] == -1); |
| parent_pd->indexes[partidx] = dispatchidx; |
| } |
| |
| MemoryContextSwitchTo(oldcxt); |
| |
| return pd; |
| } |
| |
| /* |
| * ExecCleanupTupleRouting -- Clean up objects allocated for partition tuple |
| * routing. |
| * |
| * Close all the partitioned tables, leaf partitions, and their indices. |
| */ |
| void |
| ExecCleanupTupleRouting(ModifyTableState *mtstate, |
| PartitionTupleRouting *proute) |
| { |
| int i; |
| |
| /* |
| * Remember, proute->partition_dispatch_info[0] corresponds to the root |
| * partitioned table, which we must not try to close, because it is the |
| * main target table of the query that will be closed by callers such as |
| * ExecEndPlan() or DoCopy(). Also, tupslot is NULL for the root |
| * partitioned table. |
| */ |
| for (i = 1; i < proute->num_dispatch; i++) |
| { |
| PartitionDispatch pd = proute->partition_dispatch_info[i]; |
| |
| table_close(pd->reldesc, NoLock); |
| |
| if (pd->tupslot) |
| ExecDropSingleTupleTableSlot(pd->tupslot); |
| } |
| |
| for (i = 0; i < proute->num_partitions; i++) |
| { |
| ResultRelInfo *resultRelInfo = proute->partitions[i]; |
| |
| /* Allow any FDWs to shut down */ |
| if (resultRelInfo->ri_FdwRoutine != NULL && |
| resultRelInfo->ri_FdwRoutine->EndForeignInsert != NULL) |
| resultRelInfo->ri_FdwRoutine->EndForeignInsert(mtstate->ps.state, |
| resultRelInfo); |
| |
| /* |
| * Close it if it's not one of the result relations borrowed from the |
| * owning ModifyTableState; those will be closed by ExecEndPlan(). |
| */ |
| if (proute->is_borrowed_rel[i]) |
| continue; |
| |
| /* |
| * Only leaf nodes can have a valid access method. If we find an |
| * append-optimized table, ensure the DML operation is finished. |
| */ |
| if (RelationIsAoRows(resultRelInfo->ri_RelationDesc)) |
| appendonly_dml_finish(resultRelInfo->ri_RelationDesc, mtstate->operation); |
| else if (RelationIsAoCols(resultRelInfo->ri_RelationDesc)) |
| aoco_dml_finish(resultRelInfo->ri_RelationDesc, mtstate->operation); |
| else if (ext_dml_finish_hook) |
| ext_dml_finish_hook(resultRelInfo->ri_RelationDesc, mtstate->operation); |
| |
| ExecCloseIndices(resultRelInfo); |
| table_close(resultRelInfo->ri_RelationDesc, NoLock); |
| } |
| } |
| |
| /* ---------------- |
| * FormPartitionKeyDatum |
| * Construct values[] and isnull[] arrays for the partition key |
| * of a tuple. |
| * |
| * pd Partition dispatch object of the partitioned table |
| * slot Heap tuple from which to extract partition key |
| * estate executor state for evaluating any partition key |
| * expressions (must be non-NULL) |
| * values Array of partition key Datums (output area) |
| * isnull Array of is-null indicators (output area) |
| * attno_map Map used to translate partition key attribute numbers (which |
| * refer to the relation) into the slot's tuple descriptor attribute |
| * numbers; needed since partition bound attnums correspond to the |
| * relation, not the tuple descriptor (GPDB only) |
| * |
| * the ecxt_scantuple slot of estate's per-tuple expr context must point to |
| * the heap tuple passed in. |
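| * |
| * As a hypothetical example of the GPDB-only attno_map: if the relation's |
| * columns are (a, b, c) with the partition key on c (attnum 3), but the |
| * routed slot carries only (b, c), then attno_map->attnums[2] == 2, so |
| * slot_getattr() fetches c from the slot's second column. |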
| * ---------------- |
| */ |
| static void |
| FormPartitionKeyDatum(PartitionDispatch pd, |
| TupleTableSlot *slot, |
| EState *estate, |
| Datum *values, |
| bool *isnull, |
| AttrMap *attno_map) |
| { |
| ListCell *partexpr_item; |
| int i; |
| |
| if (pd->key->partexprs != NIL && pd->keystate == NIL) |
| { |
| /* Check caller has set up context correctly */ |
| Assert(estate != NULL && |
| GetPerTupleExprContext(estate)->ecxt_scantuple == slot); |
| |
| /* First time through, set up expression evaluation state */ |
| pd->keystate = ExecPrepareExprList(pd->key->partexprs, estate); |
| } |
| |
| partexpr_item = list_head(pd->keystate); |
| for (i = 0; i < pd->key->partnatts; i++) |
| { |
| AttrNumber keycol = pd->key->partattrs[i]; |
| Datum datum; |
| bool isNull; |
| |
| /* |
| * Use the passed-in map to extract the partition key, as the slot's |
| * attributes may not match the relation's attributes. |
| */ |
| if (attno_map) |
| { |
| Assert(pd->key->partattrs[i] - 1 < attno_map->maplen); |
| keycol = attno_map->attnums[pd->key->partattrs[i] - 1]; |
| } |
| |
| if (keycol != 0) |
| { |
| /* Plain column; get the value directly from the heap tuple */ |
| datum = slot_getattr(slot, keycol, &isNull); |
| } |
| else |
| { |
| /* Expression; need to evaluate it */ |
| if (partexpr_item == NULL) |
| elog(ERROR, "wrong number of partition key expressions"); |
| datum = ExecEvalExprSwitchContext((ExprState *) lfirst(partexpr_item), |
| GetPerTupleExprContext(estate), |
| &isNull); |
| partexpr_item = lnext(pd->keystate, partexpr_item); |
| } |
| values[i] = datum; |
| isnull[i] = isNull; |
| } |
| |
| if (partexpr_item != NULL) |
| elog(ERROR, "wrong number of partition key expressions"); |
| } |
| |
| /* |
| * The number of times the same partition must be found in a row before we |
| * switch from a binary search for the given values to just checking if the |
| * values belong to the last found partition. This must be above 0. |
| */ |
| #define PARTITION_CACHED_FIND_THRESHOLD 16 |
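| |
| /* |
| * Illustrative example: when bulk-loading rows that all land in the same |
| * RANGE partition, the first PARTITION_CACHED_FIND_THRESHOLD lookups go |
| * through the binary search; after that, each lookup merely rechecks the |
| * cached partition's lower and, if needed, upper bound -- at most two bound |
| * comparisons per row instead of a full binary search over the bounds. |
| */ |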
| |
| /* |
| * get_partition_for_tuple |
| * Finds partition of relation which accepts the partition key specified |
| * in values and isnull. |
| * |
| * Calling this function can be quite expensive when LIST and RANGE |
| * partitioned tables have many partitions. This is due to the binary search |
| * that's done to find the correct partition. Many of the use cases for LIST |
| * and RANGE partitioned tables make it likely that the same partition is |
| * found in subsequent ExecFindPartition() calls. This is especially true for |
| * cases such as RANGE partitioned tables on a TIMESTAMP column where the |
| * partition key is the current time. When asked to find a partition for a |
| * RANGE or LIST partitioned table, we record the partition index and datum |
| * offset we've found for the given 'values' in the PartitionDesc (which is |
| * stored in relcache), and if we keep finding the same partition |
| * PARTITION_CACHED_FIND_THRESHOLD times in a row, then we'll enable caching |
| * logic and instead of performing a binary search to find the correct |
| * partition, we'll just double-check that 'values' still belong to the last |
| * found partition, and if so, we'll return that partition index, thus |
| * skipping the need for the binary search. If we fail to match the last |
| * partition when double checking, then we fall back on doing a binary search. |
| * In this case, unless we find 'values' belong to the DEFAULT partition, |
| * we'll reset the number of times we've hit the same partition so that we |
| * don't attempt to use the cache again until we've found that partition at |
| * least PARTITION_CACHED_FIND_THRESHOLD times in a row. |
| * |
| * For cases where the partition changes on each lookup, the amount of |
| * additional work required just amounts to recording the last found partition |
| * and bound offset then resetting the found counter. This is cheap and does |
| * not appear to cause any meaningful slowdowns for such cases. |
| * |
| * No caching of partitions is done when the last found partition is the |
| * DEFAULT or NULL partition. For the case of the DEFAULT partition, there |
| * is no bound offset storing the matching datum, so we cannot confirm the |
| * indexes match. For the NULL partition, this is just so cheap, there's no |
| * sense in caching. |
| * |
| * Return value is index of the partition (>= 0 and < partdesc->nparts) if one |
| * found or -1 if none found. |
| */ |
| int |
| get_partition_for_tuple(PartitionKey key, PartitionDesc partdesc, Datum *values, bool *isnull) |
| { |
| int bound_offset = -1; |
| int part_index = -1; |
| PartitionBoundInfo boundinfo = partdesc->boundinfo; |
| |
| if (partdesc->nparts == 0) |
| return part_index; |
| /* |
| * In the switch statement below, when we perform a cached lookup for |
| * RANGE and LIST partitioned tables, if we find that the last found |
| * partition matches the 'values', we return the partition index right |
| * away. We do this instead of breaking out of the switch as we don't |
| * want to execute the code about the DEFAULT partition or do any updates |
| * for any of the cache-related fields. That would be a waste of effort |
| * as we already know it's not the DEFAULT partition and have no need to |
| * increment the number of times we found the same partition any higher |
| * than PARTITION_CACHED_FIND_THRESHOLD. |
| */ |
| |
| /* Route as appropriate based on partitioning strategy. */ |
| switch (key->strategy) |
| { |
| case PARTITION_STRATEGY_HASH: |
| { |
| uint64 rowHash; |
| |
| /* hash partitioning is too cheap to bother caching */ |
| rowHash = compute_partition_hash_value(key->partnatts, |
| key->partsupfunc, |
| key->partcollation, |
| values, isnull); |
| |
| /* |
| * HASH partitions can't have a DEFAULT partition and we don't |
| * do any caching work for them, so just return the part index |
| */ |
| return boundinfo->indexes[rowHash % boundinfo->nindexes]; |
| } |
| |
| case PARTITION_STRATEGY_LIST: |
| if (isnull[0]) |
| { |
| /* this is far too cheap to bother doing any caching */ |
| if (partition_bound_accepts_nulls(boundinfo)) |
| { |
| /* |
| * When there is a NULL partition we just return that |
| * directly. We don't have a bound_offset so it's not |
| * valid to drop into the code after the switch which |
| * checks and updates the cache fields. We perhaps should |
| * be invalidating the details of the last cached |
| * partition but there's no real need to. Keeping those |
| * fields set gives a chance at matching to the cached |
| * partition on the next lookup. |
| */ |
| return boundinfo->null_index; |
| } |
| } |
| else |
| { |
| bool equal; |
| |
| if (partdesc->last_found_count >= PARTITION_CACHED_FIND_THRESHOLD) |
| { |
| int last_datum_offset = partdesc->last_found_datum_index; |
| Datum lastDatum = boundinfo->datums[last_datum_offset][0]; |
| int32 cmpval; |
| |
| /* does the last found datum index match this datum? */ |
| cmpval = DatumGetInt32(FunctionCall2Coll(&key->partsupfunc[0], |
| key->partcollation[0], |
| lastDatum, |
| values[0])); |
| |
| if (cmpval == 0) |
| return boundinfo->indexes[last_datum_offset]; |
| |
| /* fall-through and do a manual lookup */ |
| } |
| |
| bound_offset = partition_list_bsearch(key->partsupfunc, |
| key->partcollation, |
| boundinfo, |
| values[0], &equal); |
| if (bound_offset >= 0 && equal) |
| part_index = boundinfo->indexes[bound_offset]; |
| } |
| break; |
| |
| case PARTITION_STRATEGY_RANGE: |
| { |
| bool equal = false, |
| range_partkey_has_null = false; |
| int i; |
| |
| /* |
| * No range includes NULL, so this will be accepted by the |
| * default partition if there is one, and otherwise rejected. |
| */ |
| for (i = 0; i < key->partnatts; i++) |
| { |
| if (isnull[i]) |
| { |
| range_partkey_has_null = true; |
| break; |
| } |
| } |
| |
| /* NULLs belong in the DEFAULT partition */ |
| if (range_partkey_has_null) |
| break; |
| |
| if (partdesc->last_found_count >= PARTITION_CACHED_FIND_THRESHOLD) |
| { |
| int last_datum_offset = partdesc->last_found_datum_index; |
| Datum *lastDatums = boundinfo->datums[last_datum_offset]; |
| PartitionRangeDatumKind *kind = boundinfo->kind[last_datum_offset]; |
| int32 cmpval; |
| |
| /* check if the value is >= to the lower bound */ |
| cmpval = partition_rbound_datum_cmp(key->partsupfunc, |
| key->partcollation, |
| lastDatums, |
| kind, |
| values, |
| key->partnatts); |
| |
| /* |
| * If it's equal to the lower bound then no need to check |
| * the upper bound. |
| */ |
| if (cmpval == 0) |
| return boundinfo->indexes[last_datum_offset + 1]; |
| |
| if (cmpval < 0 && last_datum_offset + 1 < boundinfo->ndatums) |
| { |
| /* check if the value is below the upper bound */ |
| lastDatums = boundinfo->datums[last_datum_offset + 1]; |
| kind = boundinfo->kind[last_datum_offset + 1]; |
| cmpval = partition_rbound_datum_cmp(key->partsupfunc, |
| key->partcollation, |
| lastDatums, |
| kind, |
| values, |
| key->partnatts); |
| |
| if (cmpval > 0) |
| return boundinfo->indexes[last_datum_offset + 1]; |
| } |
| /* fall-through and do a manual lookup */ |
| } |
| |
| bound_offset = partition_range_datum_bsearch(key->partsupfunc, |
| key->partcollation, |
| boundinfo, |
| key->partnatts, |
| values, |
| &equal); |
| |
| /* |
| * The bound at bound_offset is less than or equal to the |
| * tuple value, so the bound at offset+1 is the upper bound of |
| * the partition we're looking for, if there actually exists |
| * one. |
| */ |
| part_index = boundinfo->indexes[bound_offset + 1]; |
| } |
| break; |
| |
| default: |
| elog(ERROR, "unexpected partition strategy: %d", |
| (int) key->strategy); |
| } |
| |
| /* |
| * part_index < 0 means we failed to find a partition of this parent. Use |
| * the default partition, if there is one. |
| */ |
| if (part_index < 0) |
| { |
| /* |
| * No need to reset the cache fields here. The next set of values |
| * might end up belonging to the cached partition, so leaving the |
| * cache alone improves the chances of a cache hit on the next lookup. |
| */ |
| return boundinfo->default_index; |
| } |
| |
| /* we should only make it here when the code above set bound_offset */ |
| Assert(bound_offset >= 0); |
| |
| /* |
| * Attend to the cache fields. If the bound_offset matches the last |
| * cached bound offset then we've found the same partition as last time, |
| * so bump the count by one. If all goes well, we'll eventually reach |
| * PARTITION_CACHED_FIND_THRESHOLD and try the cache path next time |
| * around. Otherwise, we'll reset the cache count back to 1 to mark that |
| * we've found this partition for the first time. |
| */ |
| if (bound_offset == partdesc->last_found_datum_index) |
| partdesc->last_found_count++; |
| else |
| { |
| partdesc->last_found_count = 1; |
| partdesc->last_found_part_index = part_index; |
| partdesc->last_found_datum_index = bound_offset; |
| } |
| |
| return part_index; |
| } |
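| |
| /* |
| * Illustration of the caching behavior above (descriptive only, not |
| * executed code): suppose a bulk INSERT routes most tuples to the same |
| * LIST partition.  The first several lookups fall through to |
| * partition_list_bsearch(); because bound_offset keeps matching |
| * last_found_datum_index, each of them increments last_found_count. |
| * Once the count reaches PARTITION_CACHED_FIND_THRESHOLD, later calls |
| * first compare the incoming value against the cached bound datum and, |
| * on a match, return the cached partition index without a binary |
| * search.  A tuple that routes to some other partition simply misses |
| * the cache, performs the bsearch as usual, and resets |
| * last_found_count to 1 for the newly found bound. |
| */ |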
| |
| /* |
| * ExecBuildSlotPartitionKeyDescription |
| * |
| * This works very much like BuildIndexValueDescription() and is currently |
| * used for building error messages when ExecFindPartition() fails to find a |
| * partition for a row. |
| */ |
| static char * |
| ExecBuildSlotPartitionKeyDescription(Relation rel, |
| Datum *values, |
| bool *isnull, |
| int maxfieldlen) |
| { |
| StringInfoData buf; |
| PartitionKey key = RelationGetPartitionKey(rel); |
| int partnatts = get_partition_natts(key); |
| int i; |
| Oid relid = RelationGetRelid(rel); |
| AclResult aclresult; |
| |
| if (check_enable_rls(relid, InvalidOid, true) == RLS_ENABLED) |
| return NULL; |
| |
| /* If the user has table-level access, just go build the description. */ |
| aclresult = pg_class_aclcheck(relid, GetUserId(), ACL_SELECT); |
| if (aclresult != ACLCHECK_OK) |
| { |
| /* |
| * Step through the columns of the partition key and make sure the |
| * user has SELECT rights on all of them. |
| */ |
| for (i = 0; i < partnatts; i++) |
| { |
| AttrNumber attnum = get_partition_col_attnum(key, i); |
| |
| /* |
| * If this partition key column is an expression, we return no |
| * detail rather than try to figure out what column(s) the |
| * expression includes and whether the user has SELECT rights on them. |
| */ |
| if (attnum == InvalidAttrNumber || |
| pg_attribute_aclcheck(relid, attnum, GetUserId(), |
| ACL_SELECT) != ACLCHECK_OK) |
| return NULL; |
| } |
| } |
| |
| initStringInfo(&buf); |
| appendStringInfo(&buf, "(%s) = (", |
| pg_get_partkeydef_columns(relid, true)); |
| |
| for (i = 0; i < partnatts; i++) |
| { |
| char *val; |
| int vallen; |
| |
| if (isnull[i]) |
| val = "null"; |
| else |
| { |
| Oid foutoid; |
| bool typisvarlena; |
| |
| getTypeOutputInfo(get_partition_col_typid(key, i), |
| &foutoid, &typisvarlena); |
| val = OidOutputFunctionCall(foutoid, values[i]); |
| } |
| |
| if (i > 0) |
| appendStringInfoString(&buf, ", "); |
| |
| /* truncate if needed */ |
| vallen = strlen(val); |
| if (vallen <= maxfieldlen) |
| appendBinaryStringInfo(&buf, val, vallen); |
| else |
| { |
| vallen = pg_mbcliplen(val, vallen, maxfieldlen); |
| appendBinaryStringInfo(&buf, val, vallen); |
| appendStringInfoString(&buf, "..."); |
| } |
| } |
| |
| appendStringInfoChar(&buf, ')'); |
| |
| return buf.data; |
| } |
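| |
| /* |
| * For example (illustrative values only): for a table partitioned by |
| * LIST (region), a row whose key value 'antarctica' matches no partition |
| * would yield a description such as "(region) = (antarctica)"; a NULL |
| * key value is printed as "null", and values longer than maxfieldlen are |
| * clipped and suffixed with "...". |
| */ |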
| |
| /* |
| * adjust_partition_colnos |
| * Adjust the list of UPDATE target column numbers to account for |
| * attribute differences between the parent and the partition. |
| * |
| * Note: mustn't be called if no adjustment is required. |
| */ |
| static List * |
| adjust_partition_colnos(List *colnos, ResultRelInfo *leaf_part_rri) |
| { |
| TupleConversionMap *map = ExecGetChildToRootMap(leaf_part_rri); |
| |
| Assert(map != NULL); |
| |
| return adjust_partition_colnos_using_map(colnos, map->attrMap); |
| } |
| |
| /* |
| * adjust_partition_colnos_using_map |
| * Like adjust_partition_colnos, but uses a caller-supplied attribute map |
| * instead of assuming the map is from the "root" result relation. |
| * |
| * Note: mustn't be called if no adjustment is required. |
| */ |
| static List * |
| adjust_partition_colnos_using_map(List *colnos, AttrMap *attrMap) |
| { |
| List *new_colnos = NIL; |
| ListCell *lc; |
| |
| Assert(attrMap != NULL); /* else we shouldn't be here */ |
| |
| foreach(lc, colnos) |
| { |
| AttrNumber parentattrno = lfirst_int(lc); |
| |
| if (parentattrno <= 0 || |
| parentattrno > attrMap->maplen || |
| attrMap->attnums[parentattrno - 1] == 0) |
| elog(ERROR, "unexpected attno %d in target column list", |
| parentattrno); |
| new_colnos = lappend_int(new_colnos, |
| attrMap->attnums[parentattrno - 1]); |
| } |
| |
| return new_colnos; |
| } |
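| |
| /* |
| * Worked example (hypothetical attribute numbers): if the parent's UPDATE |
| * target columns are (2, 4) and the supplied map has |
| * attrMap->attnums[1] = 3 and attrMap->attnums[3] = 2, the function |
| * returns (3, 2): parent attno 2 maps to partition attno 3 and parent |
| * attno 4 maps to partition attno 2.  A zero entry in attnums raises the |
| * "unexpected attno" error. |
| */ |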
| |
| /*------------------------------------------------------------------------- |
| * Run-Time Partition Pruning Support. |
| * |
| * The following series of functions exist to support the removal of unneeded |
| * subplans for queries against partitioned tables. The supporting functions |
| * here are designed to work with any plan type which supports an arbitrary |
| * number of subplans, e.g. Append, MergeAppend. |
| * |
| * When pruning involves comparison of a partition key to a constant, it's |
| * done by the planner. However, if we have a comparison to a non-constant |
| * but not volatile expression, that presents an opportunity for run-time |
| * pruning by the executor, allowing irrelevant partitions to be skipped |
| * dynamically. |
| * |
| * We must distinguish expressions containing PARAM_EXEC Params from |
| * expressions that don't contain those. Even though a PARAM_EXEC Param is |
| * considered to be a stable expression, it can change value from one plan |
| * node scan to the next during query execution. Stable comparison |
| * expressions that don't involve such Params allow partition pruning to be |
| * done once during executor startup. Expressions that do involve such Params |
| * require us to prune separately for each scan of the parent plan node. |
| * |
| * Note that pruning away unneeded subplans during executor startup has the |
| * added benefit of not having to initialize the unneeded subplans at all. |
| * |
| * |
| * Functions: |
| * |
| * ExecInitPartitionPruning: |
| * Creates the PartitionPruneState required by ExecFindMatchingSubPlans. |
| * Details stored include how to map the partition index returned by the |
| * partition pruning code into subplan indexes. Also determines the set |
| * of subplans to initialize considering the result of performing initial |
| * pruning steps if any. Maps in PartitionPruneState are updated to |
| * account for initial pruning possibly having eliminated some of the |
| * subplans. |
| * |
| * ExecFindMatchingSubPlans: |
| * Returns indexes of matching subplans after evaluating the expressions |
| * that are safe to evaluate at a given point. This function is first |
| * called during ExecInitPartitionPruning() to find the initially |
| * matching subplans based on performing the initial pruning steps and |
| * then must be called again each time the value of a Param listed in |
| * PartitionPruneState's 'execparamids' changes. |
| *------------------------------------------------------------------------- |
| */ |
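| |
| /* |
| * Sketch of the expected call sequence (descriptive only): a parent node |
| * such as Append or MergeAppend calls ExecInitPartitionPruning() while it |
| * is being initialized and creates executor subplans only for the indexes |
| * returned in *initially_valid_subplans.  If do_exec_prune is set, the |
| * node then calls ExecFindMatchingSubPlans() again (with initial_prune = |
| * false) whenever a Param listed in 'execparamids' may have changed, |
| * e.g. on a rescan, to decide which of the surviving subplans to actually |
| * execute. |
| */ |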
| |
| /* |
| * ExecInitPartitionPruning |
| * Initialize data structure needed for run-time partition pruning and |
| * do initial pruning if needed |
| * |
| * On return, *initially_valid_subplans is assigned the set of indexes of |
| * child subplans that must be initialized along with the parent plan node. |
| * Initial pruning is performed here if needed and in that case only the |
| * surviving subplans' indexes are added. |
| * |
| * If subplans are indeed pruned, subplan_map arrays contained in the returned |
| * PartitionPruneState are re-sequenced to not count those, though only if the |
| * maps will be needed for subsequent execution pruning passes. |
| */ |
| PartitionPruneState * |
| ExecInitPartitionPruning(PlanState *planstate, |
| int n_total_subplans, |
| PartitionPruneInfo *pruneinfo, |
| Bitmapset **initially_valid_subplans) |
| { |
| PartitionPruneState *prunestate; |
| EState *estate = planstate->state; |
| |
| /* We may need an expression context to evaluate partition exprs */ |
| ExecAssignExprContext(estate, planstate); |
| |
| /* Create the working data structure for pruning */ |
| prunestate = CreatePartitionPruneState(planstate, pruneinfo); |
| |
| /* |
| * Perform an initial partition prune pass, if required. |
| */ |
| if (prunestate->do_initial_prune) |
| *initially_valid_subplans = ExecFindMatchingSubPlans(prunestate, true, |
| NULL, -1, NULL); |
| else |
| { |
| /* No pruning, so we'll need to initialize all subplans */ |
| Assert(n_total_subplans > 0); |
| *initially_valid_subplans = bms_add_range(NULL, 0, |
| n_total_subplans - 1); |
| } |
| |
| /* |
| * Re-sequence subplan indexes contained in prunestate to account for any |
| * that were removed above due to initial pruning. No need to do this if |
| * no steps were removed. |
| */ |
| if (bms_num_members(*initially_valid_subplans) < n_total_subplans) |
| { |
| /* |
| * We can safely skip this when !do_exec_prune, even though that |
| * leaves invalid data in prunestate, because that data won't be |
| * consulted again (cf initial Assert in ExecFindMatchingSubPlans). |
| */ |
| if (prunestate->do_exec_prune) |
| PartitionPruneFixSubPlanMap(prunestate, |
| *initially_valid_subplans, |
| n_total_subplans); |
| } |
| |
| return prunestate; |
| } |
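| |
| /* |
| * Minimal caller sketch (variable names are illustrative; see the |
| * Append/MergeAppend initialization code for the real usage): |
| * |
| * Bitmapset *validsubplans; |
| * PartitionPruneState *prunestate; |
| * |
| * prunestate = ExecInitPartitionPruning(&state->ps, nplans, |
| * pruneinfo, &validsubplans); |
| * ... initialize only the subplans whose indexes appear in |
| * validsubplans; keep prunestate around if do_exec_prune is set so |
| * that ExecFindMatchingSubPlans() can be re-run later ... |
| */ |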
| |
| /* |
| * CreatePartitionPruneState |
| * Build the data structure required for calling ExecFindMatchingSubPlans |
| * |
| * 'planstate' is the parent plan node's execution state. |
| * |
| * 'pruneinfo' is a PartitionPruneInfo as generated by |
| * make_partition_pruneinfo. Here we build a PartitionPruneState containing a |
| * PartitionPruningData for each partitioning hierarchy (i.e., each sublist of |
| * pruneinfo->prune_infos), each of which contains a PartitionedRelPruningData |
| * for each PartitionedRelPruneInfo appearing in that sublist. This two-level |
| * system is needed to keep from confusing the different hierarchies when a |
| * UNION ALL contains multiple partitioned tables as children. The data |
| * stored in each PartitionedRelPruningData can be re-used each time we |
| * re-evaluate which partitions match the pruning steps provided in each |
| * PartitionedRelPruneInfo. |
| */ |
| PartitionPruneState * |
| CreatePartitionPruneState(PlanState *planstate, PartitionPruneInfo *pruneinfo) |
| { |
| EState *estate = planstate->state; |
| PartitionPruneState *prunestate; |
| int n_part_hierarchies; |
| ListCell *lc; |
| int i; |
| ExprContext *econtext = planstate->ps_ExprContext; |
| |
| /* For data reading, executor always includes detached partitions */ |
| if (estate->es_partition_directory == NULL) |
| estate->es_partition_directory = |
| CreatePartitionDirectory(estate->es_query_cxt, false); |
| |
| n_part_hierarchies = list_length(pruneinfo->prune_infos); |
| Assert(n_part_hierarchies > 0); |
| |
| /* |
| * Allocate the data structure |
| */ |
| prunestate = (PartitionPruneState *) |
| palloc(offsetof(PartitionPruneState, partprunedata) + |
| sizeof(PartitionPruningData *) * n_part_hierarchies); |
| |
| prunestate->execparamids = NULL; |
| /* other_subplans can change at runtime, so we need our own copy */ |
| prunestate->other_subplans = bms_copy(pruneinfo->other_subplans); |
| prunestate->do_initial_prune = false; /* may be set below */ |
| prunestate->do_exec_prune = false; /* may be set below */ |
| prunestate->num_partprunedata = n_part_hierarchies; |
| |
| /* |
| * Create a short-term memory context which we'll use when making calls to |
| * the partition pruning functions. This avoids possible memory leaks, |
| * since the pruning functions call comparison functions that aren't under |
| * our control. |
| */ |
| prunestate->prune_context = |
| AllocSetContextCreate(CurrentMemoryContext, |
| "Partition Prune", |
| ALLOCSET_DEFAULT_SIZES); |
| |
| i = 0; |
| foreach(lc, pruneinfo->prune_infos) |
| { |
| List *partrelpruneinfos = lfirst_node(List, lc); |
| int npartrelpruneinfos = list_length(partrelpruneinfos); |
| PartitionPruningData *prunedata; |
| ListCell *lc2; |
| int j; |
| |
| prunedata = (PartitionPruningData *) |
| palloc(offsetof(PartitionPruningData, partrelprunedata) + |
| npartrelpruneinfos * sizeof(PartitionedRelPruningData)); |
| prunestate->partprunedata[i] = prunedata; |
| prunedata->num_partrelprunedata = npartrelpruneinfos; |
| |
| j = 0; |
| foreach(lc2, partrelpruneinfos) |
| { |
| PartitionedRelPruneInfo *pinfo = lfirst_node(PartitionedRelPruneInfo, lc2); |
| PartitionedRelPruningData *pprune = &prunedata->partrelprunedata[j]; |
| Relation partrel; |
| PartitionDesc partdesc; |
| PartitionKey partkey; |
| |
| /* |
| * We can rely on the copies of the partitioned table's partition |
| * key and partition descriptor appearing in its relcache entry, |
| * because that entry will be held open and locked for the |
| * duration of this executor run. |
| */ |
| partrel = ExecGetRangeTableRelation(estate, pinfo->rtindex); |
| partkey = RelationGetPartitionKey(partrel); |
| partdesc = PartitionDirectoryLookup(estate->es_partition_directory, |
| partrel); |
| |
| /* |
| * Initialize the subplan_map and subpart_map. |
| * |
| * The set of partitions that exist now might not be the same that |
| * existed when the plan was made. The normal case is that it is; |
| * optimize for that case with a quick comparison, and just copy |
| * the subplan_map and make subpart_map point to the one in |
| * PruneInfo. |
| * |
| * For the case where they aren't identical, we could have more |
| * partitions on either side; or even exactly the same number of |
| * them on both but the set of OIDs doesn't match fully. Handle |
| * this by creating new subplan_map and subpart_map arrays that |
| * correspond to the ones in the PruneInfo where the new |
| * partition descriptor's OIDs match. Any that don't match can be |
| * set to -1, as if they were pruned. By construction, both |
| * arrays are in partition bounds order. |
| */ |
| pprune->nparts = partdesc->nparts; |
| pprune->subplan_map = palloc(sizeof(int) * partdesc->nparts); |
| |
| if (partdesc->nparts == pinfo->nparts && |
| memcmp(partdesc->oids, pinfo->relid_map, |
| sizeof(int) * partdesc->nparts) == 0) |
| { |
| pprune->subpart_map = pinfo->subpart_map; |
| memcpy(pprune->subplan_map, pinfo->subplan_map, |
| sizeof(int) * pinfo->nparts); |
| } |
| else |
| { |
| int pd_idx = 0; |
| int pp_idx; |
| |
| /* |
| * When the partition arrays are not identical, there could be |
| * some new ones but it's also possible that one was removed; |
| * we cope with both situations by walking the arrays and |
| * discarding those that don't match. |
| * |
| * If the number of partitions on both sides match, it's still |
| * possible that one partition has been detached and another |
| * attached. Cope with that by creating a map that skips any |
| * mismatches. |
| */ |
| pprune->subpart_map = palloc(sizeof(int) * partdesc->nparts); |
| |
| for (pp_idx = 0; pp_idx < partdesc->nparts; pp_idx++) |
| { |
| /* Skip any InvalidOid relid_map entries */ |
| while (pd_idx < pinfo->nparts && |
| !OidIsValid(pinfo->relid_map[pd_idx])) |
| pd_idx++; |
| |
| recheck: |
| if (pd_idx < pinfo->nparts && |
| pinfo->relid_map[pd_idx] == partdesc->oids[pp_idx]) |
| { |
| /* match... */ |
| pprune->subplan_map[pp_idx] = |
| pinfo->subplan_map[pd_idx]; |
| pprune->subpart_map[pp_idx] = |
| pinfo->subpart_map[pd_idx]; |
| pd_idx++; |
| continue; |
| } |
| |
| /* |
| * There isn't an exact match in the corresponding |
| * positions of both arrays. Peek ahead in |
| * pinfo->relid_map to see if we have a match for the |
| * current partition in partdesc. Normally if a match |
| * exists it's just one element ahead, and it means the |
| * planner saw one extra partition that we no longer see |
| * now (its concurrent detach finished just in between); |
| * so we skip that one by updating pd_idx to the new |
| * location and jumping above. We can then continue to |
| * match the rest of the elements after skipping the OID |
| * with no match; no future matches are tried for the |
| * element that was skipped, because we know the arrays to |
| * be in the same order. |
| * |
| * If we don't see a match anywhere in the rest of the |
| * pinfo->relid_map array, that means we see an element |
| * now that the planner didn't see, so mark that one as |
| * pruned and move on. |
| */ |
| for (int pd_idx2 = pd_idx + 1; pd_idx2 < pinfo->nparts; pd_idx2++) |
| { |
| if (pinfo->relid_map[pd_idx2] == partdesc->oids[pp_idx]) |
| { |
| pd_idx = pd_idx2; |
| goto recheck; |
| } |
| } |
| |
| pprune->subpart_map[pp_idx] = -1; |
| pprune->subplan_map[pp_idx] = -1; |
| } |
| } |
| |
| /* present_parts is also subject to later modification */ |
| pprune->present_parts = bms_copy(pinfo->present_parts); |
| |
| /* |
| * Initialize pruning contexts as needed. Note that we must skip |
| * execution-time partition pruning in EXPLAIN (GENERIC_PLAN), |
| * since parameter values may be missing. |
| */ |
| pprune->initial_pruning_steps = pinfo->initial_pruning_steps; |
| if (pinfo->initial_pruning_steps && |
| !(econtext->ecxt_estate->es_top_eflags & EXEC_FLAG_EXPLAIN_GENERIC)) |
| { |
| InitPartitionPruneContext(&pprune->initial_context, |
| pinfo->initial_pruning_steps, |
| partdesc, partkey, planstate, |
| econtext); |
| /* Record whether initial pruning is needed at any level */ |
| prunestate->do_initial_prune = true; |
| } |
| pprune->exec_pruning_steps = pinfo->exec_pruning_steps; |
| if (pinfo->exec_pruning_steps && |
| !(econtext->ecxt_estate->es_top_eflags & EXEC_FLAG_EXPLAIN_GENERIC)) |
| { |
| InitPartitionPruneContext(&pprune->exec_context, |
| pinfo->exec_pruning_steps, |
| partdesc, partkey, planstate, |
| econtext); |
| /* Record whether exec pruning is needed at any level */ |
| prunestate->do_exec_prune = true; |
| } |
| |
| /* |
| * Accumulate the IDs of all PARAM_EXEC Params affecting the |
| * partitioning decisions at this plan node. |
| */ |
| prunestate->execparamids = bms_add_members(prunestate->execparamids, |
| pinfo->execparamids); |
| |
| j++; |
| } |
| i++; |
| } |
| |
| return prunestate; |
| } |
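| |
| /* |
| * Example of the OID-matching fixup above (hypothetical OIDs): suppose the |
| * planner built pinfo->relid_map = {A, B, C} but the executor's partition |
| * descriptor now has oids = {A, C, D}, because B's concurrent detach |
| * completed in between and D is a partition the planner did not see. |
| * A matches at the same position; for C, the peek-ahead loop finds it one |
| * slot later in relid_map, so B's entry is skipped; D has no match at all |
| * and gets subplan_map and subpart_map entries of -1, as if pruned. |
| */ |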
| |
| /* |
| * Initialize a PartitionPruneContext for the given list of pruning steps. |
| */ |
| static void |
| InitPartitionPruneContext(PartitionPruneContext *context, |
| List *pruning_steps, |
| PartitionDesc partdesc, |
| PartitionKey partkey, |
| PlanState *planstate, |
| ExprContext *econtext) |
| { |
| int n_steps; |
| int partnatts; |
| ListCell *lc; |
| |
| n_steps = list_length(pruning_steps); |
| |
| context->strategy = partkey->strategy; |
| context->partnatts = partnatts = partkey->partnatts; |
| context->nparts = partdesc->nparts; |
| context->boundinfo = partdesc->boundinfo; |
| context->partcollation = partkey->partcollation; |
| context->partsupfunc = partkey->partsupfunc; |
| |
| /* We'll look up type-specific support functions as needed */ |
| context->stepcmpfuncs = (FmgrInfo *) |
| palloc0(sizeof(FmgrInfo) * n_steps * partnatts); |
| |
| context->ppccontext = CurrentMemoryContext; |
| context->planstate = planstate; |
| context->exprcontext = econtext; |
| |
| /* Initialize expression state for each expression we need */ |
| context->exprstates = (ExprState **) |
| palloc0(sizeof(ExprState *) * n_steps * partnatts); |
| foreach(lc, pruning_steps) |
| { |
| PartitionPruneStepOp *step = (PartitionPruneStepOp *) lfirst(lc); |
| ListCell *lc2 = list_head(step->exprs); |
| int keyno; |
| |
| /* not needed for other step kinds */ |
| if (!IsA(step, PartitionPruneStepOp)) |
| continue; |
| |
| Assert(list_length(step->exprs) <= partnatts); |
| |
| for (keyno = 0; keyno < partnatts; keyno++) |
| { |
| if (bms_is_member(keyno, step->nullkeys)) |
| continue; |
| |
| if (lc2 != NULL) |
| { |
| Expr *expr = lfirst(lc2); |
| |
| /* not needed for Consts */ |
| if (!IsA(expr, Const)) |
| { |
| int stateidx = PruneCxtStateIdx(partnatts, |
| step->step.step_id, |
| keyno); |
| |
| /* |
| * When planstate is NULL, pruning_steps is known not to |
| * contain any expressions that depend on the parent plan. |
| * Information about any available EXTERN parameters must be |
| * passed explicitly in that case; the caller must have made |
| * it available via econtext. |
| */ |
| if (planstate == NULL) |
| context->exprstates[stateidx] = |
| ExecInitExprWithParams(expr, |
| econtext->ecxt_param_list_info); |
| else |
| context->exprstates[stateidx] = |
| ExecInitExpr(expr, context->planstate); |
| } |
| lc2 = lnext(step->exprs, lc2); |
| } |
| } |
| } |
| } |
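| |
| /* |
| * Note on the layout used above (descriptive only): 'exprstates' is a flat |
| * array of n_steps * partnatts slots, addressed via |
| * PruneCxtStateIdx(partnatts, step_id, keyno).  For instance, with two |
| * partition key columns, the expression states for the step whose step_id |
| * is 3 occupy the slots PruneCxtStateIdx(2, 3, 0) and |
| * PruneCxtStateIdx(2, 3, 1); slots for Const expressions and NULL keys are |
| * simply left unused (palloc0 leaves them NULL). |
| */ |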
| |
| /* |
| * PartitionPruneFixSubPlanMap |
| * Fix mapping of partition indexes to subplan indexes contained in |
| * prunestate by considering the new list of subplans that survived |
| * initial pruning |
| * |
| * Current values of the indexes present in PartitionPruneState count all the |
| * subplans that would be present before initial pruning was done. If initial |
| * pruning got rid of some of the subplans, any subsequent pruning passes will |
| * be looking at a different set of target subplans to choose from than those |
| * in the pre-initial-pruning set, so the maps in PartitionPruneState |
| * containing those indexes must be updated to reflect the new indexes of |
| * subplans in the post-initial-pruning set. |
| */ |
| static void |
| PartitionPruneFixSubPlanMap(PartitionPruneState *prunestate, |
| Bitmapset *initially_valid_subplans, |
| int n_total_subplans) |
| { |
| int *new_subplan_indexes; |
| Bitmapset *new_other_subplans; |
| int i; |
| int newidx; |
| |
| /* |
| * First we must build a temporary array which maps old subplan indexes to |
| * new ones. For convenience of initialization, we use 1-based indexes in |
| * this array and leave pruned items as 0. |
| */ |
| new_subplan_indexes = (int *) palloc0(sizeof(int) * n_total_subplans); |
| newidx = 1; |
| i = -1; |
| while ((i = bms_next_member(initially_valid_subplans, i)) >= 0) |
| { |
| Assert(i < n_total_subplans); |
| new_subplan_indexes[i] = newidx++; |
| } |
| |
| /* |
| * Now we can update each PartitionedRelPruneInfo's subplan_map with new |
| * subplan indexes. We must also recompute its present_parts bitmap. |
| */ |
| for (i = 0; i < prunestate->num_partprunedata; i++) |
| { |
| PartitionPruningData *prunedata = prunestate->partprunedata[i]; |
| int j; |
| |
| /* |
| * Within each hierarchy, we perform this loop in back-to-front order |
| * so that we determine present_parts for the lowest-level partitioned |
| * tables first. This way we can tell whether a sub-partitioned |
| * table's partitions were entirely pruned so we can exclude it from |
| * the current level's present_parts. |
| */ |
| for (j = prunedata->num_partrelprunedata - 1; j >= 0; j--) |
| { |
| PartitionedRelPruningData *pprune = &prunedata->partrelprunedata[j]; |
| int nparts = pprune->nparts; |
| int k; |
| |
| /* We just rebuild present_parts from scratch */ |
| bms_free(pprune->present_parts); |
| pprune->present_parts = NULL; |
| |
| for (k = 0; k < nparts; k++) |
| { |
| int oldidx = pprune->subplan_map[k]; |
| int subidx; |
| |
| /* |
| * If this partition existed as a subplan then change the old |
| * subplan index to the new subplan index. The new index may |
| * become -1 if the partition was pruned above, or it may simply |
| * shift to an earlier position because subplans before it in |
| * the list were removed. If it's a subpartition, add |
| * it to present_parts unless it's entirely pruned. |
| */ |
| if (oldidx >= 0) |
| { |
| Assert(oldidx < n_total_subplans); |
| pprune->subplan_map[k] = new_subplan_indexes[oldidx] - 1; |
| |
| if (new_subplan_indexes[oldidx] > 0) |
| pprune->present_parts = |
| bms_add_member(pprune->present_parts, k); |
| } |
| else if ((subidx = pprune->subpart_map[k]) >= 0) |
| { |
| PartitionedRelPruningData *subprune; |
| |
| subprune = &prunedata->partrelprunedata[subidx]; |
| |
| if (!bms_is_empty(subprune->present_parts)) |
| pprune->present_parts = |
| bms_add_member(pprune->present_parts, k); |
| } |
| } |
| } |
| } |
| |
| /* |
| * We must also recompute the other_subplans set, since indexes in it may |
| * change. |
| */ |
| new_other_subplans = NULL; |
| i = -1; |
| while ((i = bms_next_member(prunestate->other_subplans, i)) >= 0) |
| new_other_subplans = bms_add_member(new_other_subplans, |
| new_subplan_indexes[i] - 1); |
| |
| bms_free(prunestate->other_subplans); |
| prunestate->other_subplans = new_other_subplans; |
| |
| pfree(new_subplan_indexes); |
| } |
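| |
| /* |
| * Worked example (illustrative numbers): with n_total_subplans = 4 and |
| * initially_valid_subplans = {0, 2, 3}, the temporary 1-based array |
| * becomes new_subplan_indexes = {1, 0, 2, 3}.  A subplan_map entry of 0 |
| * stays 0, an entry of 1 (pruned) becomes -1, 2 becomes 1, and 3 becomes |
| * 2; other_subplans is renumbered the same way. |
| */ |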
| |
| /* |
| * Like ExecFindMatchingSubPlans, but adds the matching partitions |
| * to an existing Bitmapset. |
| */ |
| Bitmapset * |
| ExecAddMatchingSubPlans(PartitionPruneState *prunestate, Bitmapset *result) |
| { |
| Bitmapset *thisresult; |
| |
| thisresult = ExecFindMatchingSubPlans(prunestate, true, NULL, -1, NIL); |
| |
| result = bms_add_members(result, thisresult); |
| |
| bms_free(thisresult); |
| |
| return result; |
| } |
| |
| /* |
| * ExecFindMatchingSubPlans |
| * Determine which subplans match the pruning steps detailed in |
| * 'prunestate' for the current comparison expression values. |
| * |
| * Here we assume we may evaluate PARAM_EXEC Params. |
| * |
| * GPDB: 'join_prune_paramids' can contain a list of PARAM_EXEC Param IDs |
| * whose values hold results that were computed earlier by PartitionSelector |
| * nodes. |
| */ |
| Bitmapset * |
| ExecFindMatchingSubPlans(PartitionPruneState *prunestate, |
| bool initial_prune, |
| EState *estate, |
| int nplans, List *join_prune_paramids) |
| { |
| Bitmapset *result = NULL; |
| MemoryContext oldcontext; |
| int i; |
| Bitmapset *join_selected = NULL; |
| |
| if (join_prune_paramids) |
| { |
| ListCell *lc; |
| |
| join_selected = bms_add_range(join_selected, 0, nplans - 1); |
| |
| foreach (lc, join_prune_paramids) |
| { |
| int paramid = lfirst_int(lc); |
| ParamExecData *param; |
| PartitionSelectorState *psstate; |
| |
| param = &(estate->es_param_exec_vals[paramid]); |
| Assert(param->execPlan == NULL); |
| Assert(!param->isnull); |
| psstate = (PartitionSelectorState *) DatumGetPointer(param->value); |
| |
| if (psstate == NULL) |
| { |
| /* |
| * The planner should have ensured that the Partition Selector |
| * is fully executed before the Append. |
| */ |
| elog(WARNING, "partition selector was not fully executed"); |
| } |
| else |
| { |
| Assert(IsA(psstate, PartitionSelectorState)); |
| |
| join_selected = bms_intersect(join_selected, |
| psstate->part_prune_result); |
| } |
| } |
| |
| if (!prunestate) |
| { |
| /* rely entirely on partition selectors */ |
| return join_selected; |
| } |
| } |
| |
| /* |
| * Either we're here on the initial prune done during pruning |
| * initialization, or we're at a point where PARAM_EXEC Params can be |
| * evaluated *and* there are steps in which to do so. |
| */ |
| Assert(initial_prune || prunestate->do_exec_prune); |
| |
| /* |
| * Switch to a temp context to avoid leaking memory in the executor's |
| * query-lifespan memory context. |
| */ |
| oldcontext = MemoryContextSwitchTo(prunestate->prune_context); |
| |
| /* |
| * For each hierarchy, do the pruning tests, and add nondeletable |
| * subplans' indexes to "result". |
| */ |
| for (i = 0; i < prunestate->num_partprunedata; i++) |
| { |
| PartitionPruningData *prunedata = prunestate->partprunedata[i]; |
| PartitionedRelPruningData *pprune; |
| |
| /* |
| * We pass the zeroth item, belonging to the root table of the |
| * hierarchy, and find_matching_subplans_recurse() takes care of |
| * recursing to other (lower-level) parents as needed. |
| */ |
| pprune = &prunedata->partrelprunedata[0]; |
| find_matching_subplans_recurse(prunedata, pprune, initial_prune, |
| &result); |
| |
| /* Expression eval may have used space in ExprContext too */ |
| if (pprune->exec_pruning_steps) |
| ResetExprContext(pprune->exec_context.exprcontext); |
| } |
| |
| /* Add in any subplans that partition pruning didn't account for */ |
| result = bms_add_members(result, prunestate->other_subplans); |
| |
| MemoryContextSwitchTo(oldcontext); |
| |
| /* Copy result out of the temp context before we reset it */ |
| result = bms_copy(result); |
| |
| if (join_prune_paramids) |
| { |
| result = bms_intersect(result, join_selected); |
| } |
| |
| MemoryContextReset(prunestate->prune_context); |
| |
| return result; |
| } |
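| |
| /* |
| * GPDB example of the join-pruning path above (illustrative numbers): with |
| * nplans = 6 and two PartitionSelector results {0, 2, 5} and {0, 5}, |
| * join_selected ends up as {0, 5}.  If the regular pruning steps (plus |
| * other_subplans) select {0, 1, 5}, the function returns the intersection |
| * {0, 5}. |
| */ |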
| |
| /* |
| * find_matching_subplans_recurse |
| * Recursive worker function for ExecFindMatchingSubPlans |
| * |
| * Adds valid (non-prunable) subplan IDs to *validsubplans |
| */ |
| static void |
| find_matching_subplans_recurse(PartitionPruningData *prunedata, |
| PartitionedRelPruningData *pprune, |
| bool initial_prune, |
| Bitmapset **validsubplans) |
| { |
| Bitmapset *partset; |
| int i; |
| |
| /* Guard against stack overflow due to overly deep partition hierarchy. */ |
| check_stack_depth(); |
| |
| /* |
| * Prune as appropriate, if we have pruning steps matching the current |
| * execution context. Otherwise just include all partitions at this |
| * level. |
| */ |
| if (initial_prune && pprune->initial_pruning_steps) |
| partset = get_matching_partitions(&pprune->initial_context, |
| pprune->initial_pruning_steps); |
| else if (!initial_prune && pprune->exec_pruning_steps) |
| partset = get_matching_partitions(&pprune->exec_context, |
| pprune->exec_pruning_steps); |
| else |
| partset = pprune->present_parts; |
| |
| /* Translate partset into subplan indexes */ |
| i = -1; |
| while ((i = bms_next_member(partset, i)) >= 0) |
| { |
| if (pprune->subplan_map[i] >= 0) |
| *validsubplans = bms_add_member(*validsubplans, |
| pprune->subplan_map[i]); |
| else |
| { |
| int partidx = pprune->subpart_map[i]; |
| |
| if (partidx >= 0) |
| find_matching_subplans_recurse(prunedata, |
| &prunedata->partrelprunedata[partidx], |
| initial_prune, validsubplans); |
| else |
| { |
| /* |
| * We get here if the planner already pruned all the sub- |
| * partitions for this partition. Silently ignore this |
| * partition in this case. The end result is the same: we |
| * would have pruned all partitions just the same, but we |
| * don't have any pruning steps to execute to verify this. |
| */ |
| } |
| } |
| } |
| } |