| /*------------------------------------------------------------------------- |
| * |
| * cluster.c |
| * CLUSTER a table on an index. This is now also used for VACUUM FULL. |
| * |
| * There is hardly anything left of Paul Brown's original implementation... |
| * |
| * |
| * Portions Copyright (c) 2006-2008, Greenplum inc |
| * Portions Copyright (c) 2012-Present VMware, Inc. or its affiliates. |
| * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group |
| * Portions Copyright (c) 1994-5, Regents of the University of California |
| * |
| * |
| * IDENTIFICATION |
| * src/backend/commands/cluster.c |
| * |
| *------------------------------------------------------------------------- |
| */ |
| #include "postgres.h" |
| |
| #include "access/amapi.h" |
| #include "access/heapam.h" |
| #include "access/multixact.h" |
| #include "access/reloptions.h" |
| #include "access/relscan.h" |
| #include "access/tableam.h" |
| #include "access/toast_internals.h" |
| #include "access/transam.h" |
| #include "access/xact.h" |
| #include "access/xlog.h" |
| #include "catalog/catalog.h" |
| #include "catalog/dependency.h" |
| #include "catalog/gp_matview_aux.h" |
| #include "catalog/heap.h" |
| #include "catalog/index.h" |
| #include "catalog/namespace.h" |
| #include "catalog/objectaccess.h" |
| #include "catalog/pg_appendonly.h" |
| #include "catalog/pg_attribute_encoding.h" |
| #include "catalog/pg_type.h" |
| #include "catalog/pg_namespace.h" |
| #include "catalog/pg_tablespace.h" |
| #include "catalog/pg_am.h" |
| #include "catalog/toasting.h" |
| #include "commands/cluster.h" |
| #include "commands/defrem.h" |
| #include "commands/progress.h" |
| #include "commands/tablecmds.h" |
| #include "commands/vacuum.h" |
| #include "miscadmin.h" |
| #include "optimizer/optimizer.h" |
| #include "pgstat.h" |
| #include "storage/bufmgr.h" |
| #include "storage/lmgr.h" |
| #include "storage/predicate.h" |
| #include "utils/acl.h" |
| #include "utils/builtins.h" |
| #include "utils/faultinjector.h" |
| #include "utils/fmgroids.h" |
| #include "utils/inval.h" |
| #include "utils/lsyscache.h" |
| #include "utils/memutils.h" |
| #include "utils/pg_rusage.h" |
| #include "utils/relmapper.h" |
| #include "utils/snapmgr.h" |
| #include "utils/syscache.h" |
| #include "utils/tuplesort.h" |
| |
| #include "catalog/aocatalog.h" |
| #include "catalog/oid_dispatch.h" |
| #include "cdb/cdbvars.h" |
| #include "cdb/cdbdisp_query.h" |
| #include "cdb/cdboidsync.h" |
| #include "libpq/pqformat.h" |
| |
| /* |
| * This struct is used to pass around the information on tables to be |
| * clustered. We need this so we can make a list of them when invoked without |
| * a specific table/index pair. |
| */ |
| typedef struct |
| { |
| Oid tableOid; |
| Oid indexOid; |
| } RelToCluster; |
| |
| |
| static void rebuild_relation(Relation OldHeap, Oid indexOid, bool verbose); |
| static void copy_table_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, |
| bool verbose, bool *pSwapToastByContent, |
| TransactionId *pFreezeXid, MultiXactId *pCutoffMulti); |
| static List *get_tables_to_cluster(MemoryContext cluster_context); |
| |
| |
| /*--------------------------------------------------------------------------- |
| * This cluster code allows for clustering multiple tables at once. Because |
| * of this, we cannot just run everything on a single transaction, or we |
| * would be forced to acquire exclusive locks on all the tables being |
| * clustered, simultaneously --- very likely leading to deadlock. |
| * |
| * To solve this we follow a similar strategy to VACUUM code, |
| * clustering each relation in a separate transaction. For this to work, |
| * we need to: |
| * - provide a separate memory context so that we can pass information in |
| * a way that survives across transactions |
| * - start a new transaction every time a new relation is clustered |
| * - check for validity of the information on to-be-clustered relations, |
| * as someone might have deleted a relation behind our back, or |
| * clustered one on a different index |
| * - end the transaction |
| * |
| * The single-relation case does not have any such overhead. |
| * |
| * We also allow a relation to be specified without index. In that case, |
| * the indisclustered bit will be looked up, and an ERROR will be thrown |
| * if there is no index with the bit set. |
| *--------------------------------------------------------------------------- |
| */ |
| void |
| cluster(ParseState *pstate, ClusterStmt *stmt, bool isTopLevel) |
| { |
| ListCell *lc; |
| ClusterParams params = {0}; |
| bool verbose = false; |
| |
| /* Parse option list */ |
| foreach(lc, stmt->params) |
| { |
| DefElem *opt = (DefElem *) lfirst(lc); |
| |
| if (strcmp(opt->defname, "verbose") == 0) |
| verbose = defGetBoolean(opt); |
| else |
| ereport(ERROR, |
| (errcode(ERRCODE_SYNTAX_ERROR), |
| errmsg("unrecognized CLUSTER option \"%s\"", |
| opt->defname), |
| parser_errposition(pstate, opt->location))); |
| } |
| |
| params.options = (verbose ? CLUOPT_VERBOSE : 0); |
| |
| if (stmt->relation != NULL) |
| { |
| /* This is the single-relation case. */ |
| Oid tableOid, |
| indexOid = InvalidOid; |
| Relation rel; |
| |
| /* Find, lock, and check permissions on the table */ |
| tableOid = RangeVarGetRelidExtended(stmt->relation, |
| AccessExclusiveLock, |
| 0, |
| RangeVarCallbackOwnsTable, NULL); |
| rel = table_open(tableOid, NoLock); |
| |
| /* |
| * Reject clustering a remote temp table ... their local buffer |
| * manager is not going to cope. |
| */ |
| if (RELATION_IS_OTHER_TEMP(rel)) |
| ereport(ERROR, |
| (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), |
| errmsg("cannot cluster temporary tables of other sessions"))); |
| |
| /* |
| * Reject clustering a partitioned table. |
| */ |
| if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) |
| ereport(ERROR, |
| (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), |
| errmsg("cannot cluster a partitioned table"))); |
| |
| if (stmt->indexname == NULL) |
| { |
| ListCell *index; |
| |
| /* We need to find the index that has indisclustered set. */ |
| foreach(index, RelationGetIndexList(rel)) |
| { |
| indexOid = lfirst_oid(index); |
| if (get_index_isclustered(indexOid)) |
| break; |
| indexOid = InvalidOid; |
| } |
| |
| if (!OidIsValid(indexOid)) |
| ereport(ERROR, |
| (errcode(ERRCODE_UNDEFINED_OBJECT), |
| errmsg("there is no previously clustered index for table \"%s\"", |
| stmt->relation->relname))); |
| } |
| else |
| { |
| /* |
| * The index is expected to be in the same namespace as the |
| * relation. |
| */ |
| indexOid = get_relname_relid(stmt->indexname, |
| rel->rd_rel->relnamespace); |
| if (!OidIsValid(indexOid)) |
| ereport(ERROR, |
| (errcode(ERRCODE_UNDEFINED_OBJECT), |
| errmsg("index \"%s\" for table \"%s\" does not exist", |
| stmt->indexname, stmt->relation->relname))); |
| } |
| |
| /* close relation, keep lock till commit */ |
| table_close(rel, NoLock); |
| |
| /* Do the job. */ |
| /* GPDB_14_MERGE_FIXME: do we need the return value of cluster_rel to dispatch ? */ |
| cluster_rel(tableOid, indexOid, ¶ms); |
| |
| if (Gp_role == GP_ROLE_DISPATCH) |
| { |
| CdbDispatchUtilityStatement((Node *) stmt, |
| DF_CANCEL_ON_ERROR| |
| DF_WITH_SNAPSHOT| |
| DF_NEED_TWO_PHASE, |
| GetAssignedOidsForDispatch(), |
| NULL); |
| } |
| |
| if (IS_QD_OR_SINGLENODE()) |
| { |
| /* |
| * Update view status. |
| * In principle, CLUSTER command won't change the ligical data of |
| * a table, it may change the physical pages by index. |
| * But for Append Agg Plan in SERVERLESS mode, we need to fetch |
| * delta tuples from base table which requires the ability of storage |
| * to distint the pages instead, since latest relative materialized |
| * view REFRESH. |
| */ |
| SetRelativeMatviewAuxStatus(tableOid, |
| MV_DATA_STATUS_UP_REORGANIZED, |
| MV_DATA_STATUS_TRANSFER_DIRECTION_ALL); |
| |
| } |
| } |
| else |
| { |
| /* |
| * This is the "multi relation" case. We need to cluster all tables |
| * that have some index with indisclustered set. |
| */ |
| MemoryContext cluster_context; |
| List *rvs; |
| ListCell *rv; |
| |
| /* |
| * We cannot run this form of CLUSTER inside a user transaction block; |
| * we'd be holding locks way too long. |
| */ |
| PreventInTransactionBlock(isTopLevel, "CLUSTER"); |
| |
| /* |
| * Create special memory context for cross-transaction storage. |
| * |
| * Since it is a child of PortalContext, it will go away even in case |
| * of error. |
| */ |
| cluster_context = AllocSetContextCreate(PortalContext, |
| "Cluster", |
| ALLOCSET_DEFAULT_SIZES); |
| |
| /* |
| * Build the list of relations to cluster. Note that this lives in |
| * cluster_context. |
| */ |
| rvs = get_tables_to_cluster(cluster_context); |
| |
| /* Commit to get out of starting transaction */ |
| PopActiveSnapshot(); |
| CommitTransactionCommand(); |
| |
| /* Ok, now that we've got them all, cluster them one by one */ |
| foreach(rv, rvs) |
| { |
| RelToCluster *rvtc = (RelToCluster *) lfirst(rv); |
| bool dispatch; |
| ClusterParams cluster_params = params; |
| |
| /* Start a new transaction for each relation. */ |
| StartTransactionCommand(); |
| /* functions in indexes may want a snapshot set */ |
| PushActiveSnapshot(GetTransactionSnapshot()); |
| /* Do the job. */ |
| cluster_params.options |= CLUOPT_RECHECK; |
| dispatch = cluster_rel(rvtc->tableOid, rvtc->indexOid, |
| &cluster_params); |
| |
| if (Gp_role == GP_ROLE_DISPATCH && dispatch) |
| { |
| stmt->relation = makeNode(RangeVar); |
| stmt->relation->schemaname = get_namespace_name(get_rel_namespace(rvtc->tableOid)); |
| stmt->relation->relname = get_rel_name(rvtc->tableOid); |
| /* other fields in stmt are same */ |
| CdbDispatchUtilityStatement((Node *) stmt, |
| DF_CANCEL_ON_ERROR| |
| DF_WITH_SNAPSHOT, |
| GetAssignedOidsForDispatch(), |
| NULL); |
| } |
| /* See comments above. */ |
| if (IS_QD_OR_SINGLENODE()) |
| SetRelativeMatviewAuxStatus(rvtc->tableOid, |
| MV_DATA_STATUS_UP_REORGANIZED, |
| MV_DATA_STATUS_TRANSFER_DIRECTION_ALL); |
| |
| PopActiveSnapshot(); |
| CommitTransactionCommand(); |
| } |
| |
| /* Start a new transaction for the cleanup work. */ |
| StartTransactionCommand(); |
| |
| /* Clean up working storage */ |
| MemoryContextDelete(cluster_context); |
| } |
| } |
| |
| /* |
| * cluster_rel |
| * |
| * This clusters the table by creating a new, clustered table and |
| * swapping the relfilenodes of the new table and the old table, so |
| * the OID of the original table is preserved. Thus we do not lose |
| * GRANT, inheritance nor references to this table (this was a bug |
| * in releases through 7.3). |
| * |
| * Indexes are rebuilt too, via REINDEX. Since we are effectively bulk-loading |
| * the new table, it's better to create the indexes afterwards than to fill |
| * them incrementally while we load the table. |
| * |
| * If indexOid is InvalidOid, the table will be rewritten in physical order |
| * instead of index order. This is the new implementation of VACUUM FULL, |
| * and error messages should refer to the operation as VACUUM not CLUSTER. |
| * |
| */ |
| bool |
| cluster_rel(Oid tableOid, Oid indexOid, ClusterParams *params) |
| { |
| Relation OldHeap; |
| Oid save_userid; |
| int save_sec_context; |
| int save_nestlevel; |
| bool verbose = ((params->options & CLUOPT_VERBOSE) != 0); |
| bool recheck = ((params->options & CLUOPT_RECHECK) != 0); |
| bool result = false; |
| |
| /* Check for user-requested abort. */ |
| CHECK_FOR_INTERRUPTS(); |
| |
| pgstat_progress_start_command(PROGRESS_COMMAND_CLUSTER, tableOid); |
| if (OidIsValid(indexOid)) |
| pgstat_progress_update_param(PROGRESS_CLUSTER_COMMAND, |
| PROGRESS_CLUSTER_COMMAND_CLUSTER); |
| else |
| pgstat_progress_update_param(PROGRESS_CLUSTER_COMMAND, |
| PROGRESS_CLUSTER_COMMAND_VACUUM_FULL); |
| |
| /* |
| * We grab exclusive access to the target rel and index for the duration |
| * of the transaction. (This is redundant for the single-transaction |
| * case, since cluster() already did it.) The index lock is taken inside |
| * check_index_is_clusterable. |
| */ |
| OldHeap = try_relation_open(tableOid, AccessExclusiveLock, false); |
| |
| /* If the table has gone away, we can skip processing it */ |
| if (!OldHeap) |
| { |
| pgstat_progress_end_command(); |
| return false; |
| } |
| |
| /* |
| * Switch to the table owner's userid, so that any index functions are run |
| * as that user. Also lock down security-restricted operations and |
| * arrange to make GUC variable changes local to this command. |
| */ |
| GetUserIdAndSecContext(&save_userid, &save_sec_context); |
| SetUserIdAndSecContext(OldHeap->rd_rel->relowner, |
| save_sec_context | SECURITY_RESTRICTED_OPERATION); |
| save_nestlevel = NewGUCNestLevel(); |
| |
| /* |
| * Since we may open a new transaction for each relation, we have to check |
| * that the relation still is what we think it is. |
| * |
| * If this is a single-transaction CLUSTER, we can skip these tests. We |
| * *must* skip the one on indisclustered since it would reject an attempt |
| * to cluster a not-previously-clustered index. |
| */ |
| if (recheck) |
| { |
| /* Check that the user still owns the relation */ |
| if (!pg_class_ownercheck(tableOid, save_userid)) |
| { |
| relation_close(OldHeap, AccessExclusiveLock); |
| goto out; |
| } |
| |
| /* |
| * Silently skip a temp table for a remote session. Only doing this |
| * check in the "recheck" case is appropriate (which currently means |
| * somebody is executing a database-wide CLUSTER), because there is |
| * another check in cluster() which will stop any attempt to cluster |
| * remote temp tables by name. There is another check in cluster_rel |
| * which is redundant, but we leave it for extra safety. |
| */ |
| if (RELATION_IS_OTHER_TEMP(OldHeap)) |
| { |
| relation_close(OldHeap, AccessExclusiveLock); |
| goto out; |
| } |
| |
| if (OidIsValid(indexOid)) |
| { |
| /* |
| * Check that the index still exists |
| */ |
| if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(indexOid))) |
| { |
| relation_close(OldHeap, AccessExclusiveLock); |
| goto out; |
| } |
| |
| /* |
| * Check that the index is still the one with indisclustered set. |
| */ |
| if (!get_index_isclustered(indexOid)) |
| { |
| relation_close(OldHeap, AccessExclusiveLock); |
| goto out; |
| } |
| } |
| } |
| |
| /* |
| * We allow VACUUM FULL, but not CLUSTER, on shared catalogs. CLUSTER |
| * would work in most respects, but the index would only get marked as |
| * indisclustered in the current database, leading to unexpected behavior |
| * if CLUSTER were later invoked in another database. |
| */ |
| if (OidIsValid(indexOid) && OldHeap->rd_rel->relisshared) |
| ereport(ERROR, |
| (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), |
| errmsg("cannot cluster a shared catalog"))); |
| |
| /* |
| * Don't process temp tables of other backends ... their local buffer |
| * manager is not going to cope. |
| */ |
| if (RELATION_IS_OTHER_TEMP(OldHeap)) |
| { |
| if (OidIsValid(indexOid)) |
| ereport(ERROR, |
| (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), |
| errmsg("cannot cluster temporary tables of other sessions"))); |
| else |
| ereport(ERROR, |
| (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), |
| errmsg("cannot vacuum temporary tables of other sessions"))); |
| } |
| |
| /* |
| * Also check for active uses of the relation in the current transaction, |
| * including open scans and pending AFTER trigger events. |
| */ |
| CheckTableNotInUse(OldHeap, OidIsValid(indexOid) ? "CLUSTER" : "VACUUM"); |
| |
| /* Check heap and index are valid to cluster on */ |
| if (OidIsValid(indexOid)) |
| check_index_is_clusterable(OldHeap, indexOid, recheck, AccessExclusiveLock); |
| |
| /* |
| * Quietly ignore the request if this is a materialized view which has not |
| * been populated from its query. No harm is done because there is no data |
| * to deal with, and we don't want to throw an error if this is part of a |
| * multi-relation request -- for example, CLUSTER was run on the entire |
| * database. |
| */ |
| if (OldHeap->rd_rel->relkind == RELKIND_MATVIEW && |
| !RelationIsPopulated(OldHeap)) |
| { |
| relation_close(OldHeap, AccessExclusiveLock); |
| goto out; |
| } |
| |
| /* |
| * All predicate locks on the tuples or pages are about to be made |
| * invalid, because we move tuples around. Promote them to relation |
| * locks. Predicate locks on indexes will be promoted when they are |
| * reindexed. |
| */ |
| TransferPredicateLocksToHeapRelation(OldHeap); |
| |
| /* rebuild_relation does all the dirty work */ |
| rebuild_relation(OldHeap, indexOid, verbose); |
| |
| /* NB: rebuild_relation does table_close() on OldHeap */ |
| |
| result = true; |
| |
| out: |
| /* Roll back any GUC changes executed by index functions */ |
| AtEOXact_GUC(false, save_nestlevel); |
| |
| /* Restore userid and security context */ |
| SetUserIdAndSecContext(save_userid, save_sec_context); |
| |
| pgstat_progress_end_command(); |
| |
| return result; |
| } |
| |
| /* |
| * Verify that the specified heap and index are valid to cluster on |
| * |
| * Side effect: obtains lock on the index. The caller may |
| * in some cases already have AccessExclusiveLock on the table, but |
| * not in all cases so we can't rely on the table-level lock for |
| * protection here. |
| */ |
| void |
| check_index_is_clusterable(Relation OldHeap, Oid indexOid, bool recheck, LOCKMODE lockmode) |
| { |
| Relation OldIndex; |
| |
| OldIndex = index_open(indexOid, lockmode); |
| |
| /* |
| * Check that index is in fact an index on the given relation |
| */ |
| if (OldIndex->rd_index == NULL || |
| OldIndex->rd_index->indrelid != RelationGetRelid(OldHeap)) |
| ereport(ERROR, |
| (errcode(ERRCODE_WRONG_OBJECT_TYPE), |
| errmsg("\"%s\" is not an index for table \"%s\"", |
| RelationGetRelationName(OldIndex), |
| RelationGetRelationName(OldHeap)))); |
| |
| /* Index AM must allow clustering */ |
| if (!OldIndex->rd_indam->amclusterable) |
| ereport(ERROR, |
| (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), |
| errmsg("cannot cluster on index \"%s\" because access method does not support clustering", |
| RelationGetRelationName(OldIndex)))); |
| |
| /* |
| * Disallow clustering on incomplete indexes (those that might not index |
| * every row of the relation). We could relax this by making a separate |
| * seqscan pass over the table to copy the missing rows, but that seems |
| * expensive and tedious. |
| */ |
| if (!heap_attisnull(OldIndex->rd_indextuple, Anum_pg_index_indpred, NULL)) |
| ereport(ERROR, |
| (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), |
| errmsg("cannot cluster on partial index \"%s\"", |
| RelationGetRelationName(OldIndex)))); |
| |
| /* |
| * Disallow if index is left over from a failed CREATE INDEX CONCURRENTLY; |
| * it might well not contain entries for every heap row, or might not even |
| * be internally consistent. (But note that we don't check indcheckxmin; |
| * the worst consequence of following broken HOT chains would be that we |
| * might put recently-dead tuples out-of-order in the new table, and there |
| * is little harm in that.) |
| */ |
| if (!OldIndex->rd_index->indisvalid) |
| ereport(ERROR, |
| (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), |
| errmsg("cannot cluster on invalid index \"%s\"", |
| RelationGetRelationName(OldIndex)))); |
| |
| /* Drop relcache refcnt on OldIndex, but keep lock */ |
| index_close(OldIndex, NoLock); |
| } |
| |
| /* |
| * mark_index_clustered: mark the specified index as the one clustered on |
| * |
| * With indexOid == InvalidOid, will mark all indexes of rel not-clustered. |
| */ |
| void |
| mark_index_clustered(Relation rel, Oid indexOid, bool is_internal) |
| { |
| HeapTuple indexTuple; |
| Form_pg_index indexForm; |
| Relation pg_index; |
| ListCell *index; |
| |
| /* Disallow applying to a partitioned table */ |
| if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) |
| ereport(ERROR, |
| (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), |
| errmsg("cannot mark index clustered in partitioned table"))); |
| |
| /* |
| * If the index is already marked clustered, no need to do anything. |
| */ |
| if (OidIsValid(indexOid)) |
| { |
| if (get_index_isclustered(indexOid)) |
| return; |
| } |
| |
| /* |
| * Check each index of the relation and set/clear the bit as needed. |
| */ |
| pg_index = table_open(IndexRelationId, RowExclusiveLock); |
| |
| foreach(index, RelationGetIndexList(rel)) |
| { |
| Oid thisIndexOid = lfirst_oid(index); |
| |
| indexTuple = SearchSysCacheCopy1(INDEXRELID, |
| ObjectIdGetDatum(thisIndexOid)); |
| if (!HeapTupleIsValid(indexTuple)) |
| elog(ERROR, "cache lookup failed for index %u", thisIndexOid); |
| indexForm = (Form_pg_index) GETSTRUCT(indexTuple); |
| |
| /* |
| * Unset the bit if set. We know it's wrong because we checked this |
| * earlier. |
| */ |
| if (indexForm->indisclustered) |
| { |
| indexForm->indisclustered = false; |
| CatalogTupleUpdate(pg_index, &indexTuple->t_self, indexTuple); |
| } |
| else if (thisIndexOid == indexOid) |
| { |
| /* this was checked earlier, but let's be real sure */ |
| if (!indexForm->indisvalid) |
| elog(ERROR, "cannot cluster on invalid index %u", indexOid); |
| indexForm->indisclustered = true; |
| CatalogTupleUpdate(pg_index, &indexTuple->t_self, indexTuple); |
| } |
| |
| InvokeObjectPostAlterHookArg(IndexRelationId, thisIndexOid, 0, |
| InvalidOid, is_internal); |
| |
| heap_freetuple(indexTuple); |
| } |
| |
| table_close(pg_index, RowExclusiveLock); |
| } |
| |
| /* |
| * rebuild_relation: rebuild an existing relation in index or physical order |
| * |
| * OldHeap: table to rebuild --- must be opened and exclusive-locked! |
| * indexOid: index to cluster by, or InvalidOid to rewrite in physical order. |
| * |
| * NB: this routine closes OldHeap at the right time; caller should not. |
| */ |
| static void |
| rebuild_relation(Relation OldHeap, Oid indexOid, bool verbose) |
| { |
| Oid tableOid = RelationGetRelid(OldHeap); |
| Oid accessMethod = OldHeap->rd_rel->relam; |
| Oid tableSpace = OldHeap->rd_rel->reltablespace; |
| Oid OIDNewHeap; |
| char relpersistence; |
| bool is_system_catalog; |
| bool swap_toast_by_content; |
| TransactionId frozenXid; |
| MultiXactId cutoffMulti; |
| /* |
| * GPDB_12_MERGE_FIXME: We use specific bool in abstract code. This should |
| * be somehow hidden by table am api or necessity of this switch should be |
| * revisited. |
| */ |
| bool is_ao = RelationIsAppendOptimized(OldHeap); |
| |
| /* Mark the correct index as clustered */ |
| if (OidIsValid(indexOid)) |
| mark_index_clustered(OldHeap, indexOid, true); |
| |
| /* Remember info about rel before closing OldHeap */ |
| relpersistence = OldHeap->rd_rel->relpersistence; |
| is_system_catalog = IsSystemRelation(OldHeap); |
| |
| /* Close relcache entry, but keep lock until transaction commit */ |
| table_close(OldHeap, NoLock); |
| |
| /* Create the transient table that will receive the re-ordered data */ |
| OIDNewHeap = make_new_heap(tableOid, tableSpace, |
| accessMethod, |
| relpersistence, |
| AccessExclusiveLock, |
| true /* createAoBlockDirectory */, |
| false); |
| |
| /* Copy the heap data into the new table in the desired order */ |
| copy_table_data(OIDNewHeap, tableOid, indexOid, verbose, |
| &swap_toast_by_content, &frozenXid, &cutoffMulti); |
| |
| /* |
| * Swap the physical files of the target and transient tables, then |
| * rebuild the target's indexes and throw away the transient table. |
| */ |
| finish_heap_swap(tableOid, OIDNewHeap, is_system_catalog, |
| swap_toast_by_content, |
| !is_ao /* swap_stats */, |
| false, true, |
| frozenXid, cutoffMulti, |
| relpersistence); |
| } |
| |
/*
 * make_column_name
 *		Build a column name by concatenating prefix and colname.
 *
 * The result is palloc'd in the current memory context; callers (see
 * make_new_heap_with_colname) are expected to pfree it when done.
 */
static char *
make_column_name(char *prefix, char *colname)
{
	/*
	 * psprintf is the idiomatic one-liner for "format into a fresh palloc'd
	 * string"; it replaces the previous manual StringInfo dance.
	 */
	return psprintf("%s%s", prefix, colname);
}
| |
| /* |
| * Create the transient table that will be filled with new data during |
| * CLUSTER, ALTER TABLE, and similar operations. The transient table |
| * duplicates the logical structure of the OldHeap; but will have the |
| * specified physical storage properties NewTableSpace, NewAccessMethod, and |
| * relpersistence. |
| * |
| * Specify a colprefix can create a table with different colname, incase |
| * column conflict issue happens in REFRESH MATERIALIZED VIEW operation. |
| * |
| * After this, the caller should load the new heap with transferred/modified |
| * data, then call finish_heap_swap to complete the operation. |
| */ |
| Oid |
| make_new_heap_with_colname(Oid OIDOldHeap, Oid NewTableSpace, Oid NewAccessMethod, |
| char relpersistence, |
| LOCKMODE lockmode, |
| bool createAoBlockDirectory, |
| bool makeCdbPolicy, |
| char *colprefix) |
| { |
| TupleDesc OldHeapDesc; |
| char NewHeapName[NAMEDATALEN]; |
| Oid OIDNewHeap; |
| Oid toastid; |
| Relation OldHeap; |
| HeapTuple tuple; |
| Datum reloptions; |
| bool isNull; |
| Oid namespaceid; |
| |
| OldHeap = table_open(OIDOldHeap, lockmode); |
| OldHeapDesc = RelationGetDescr(OldHeap); |
| |
| if (colprefix != NULL) |
| { |
| for (int i = 0; i < OldHeapDesc->natts; i++) |
| { |
| Form_pg_attribute attr = TupleDescAttr(OldHeapDesc, i); |
| char *attname = make_column_name(colprefix, NameStr(attr->attname)); |
| namestrcpy(&(attr->attname), attname); |
| pfree(attname); |
| } |
| } |
| /* |
| * Note that the NewHeap will not receive any of the defaults or |
| * constraints associated with the OldHeap; we don't need 'em, and there's |
| * no reason to spend cycles inserting them into the catalogs only to |
| * delete them. |
| */ |
| |
| /* |
| * But we do want to use reloptions of the old heap for new heap. |
| */ |
| tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(OIDOldHeap)); |
| if (!HeapTupleIsValid(tuple)) |
| elog(ERROR, "cache lookup failed for relation %u", OIDOldHeap); |
| reloptions = SysCacheGetAttr(RELOID, tuple, Anum_pg_class_reloptions, |
| &isNull); |
| if (isNull) |
| reloptions = (Datum) 0; |
| |
| /* |
| * Unless we are changing access method between heap and AO/CO, look further. |
| */ |
| /* |
| * GPDB: some considerations when AM is going to change between heap and AO/CO: |
| * |
| * If user has also requested setting new reloptions, the new reloptions should have |
| * replaced the old ones at this point. We just need to reuse those on the new table. |
| * |
| * If user does NOT request new reloptions, we should discard the existing reloptions. |
| * And one more consideration if we are changing the table from heap to AO: we should |
| * also pick up options from gp_default_storage_options, just like CREATE TABLE does. |
| */ |
| if (RelationIsHeap(OldHeap) && IsAccessMethodAO(NewAccessMethod)) |
| { |
| /* |
| * Heap to AO/CO: filter out any reloptions that belong to heap, |
| * and pick up from gp_default_storage_options. |
| */ |
| int numoptions; |
| relopt_value *options; |
| |
| /* |
| * Process the reloptions as for AO tables. And validate=false will silently |
| * filter out any reloptions that belong to heap. |
| */ |
| StdRdOptions *stdRdOptions = (StdRdOptions *)default_reloptions(reloptions, |
| false, /* validate */ |
| RELOPT_KIND_APPENDOPTIMIZED); |
| |
| /* Pick up from gp_default_storage_options. */ |
| options = parseRelOptions(reloptions, false, RELOPT_KIND_APPENDOPTIMIZED, &numoptions); |
| validate_and_refill_options(stdRdOptions, options, numoptions, RELOPT_KIND_APPENDOPTIMIZED, true); |
| |
| /* Update the reloptions string. */ |
| reloptions = transformAOStdRdOptions(stdRdOptions, reloptions); |
| } |
| else if (RelationIsAppendOptimized(OldHeap) && NewAccessMethod == HEAP_TABLE_AM_OID) |
| { |
| /* |
| * AO/CO to Heap: unfortunately we don't have a convenient routine to transform |
| * heap StdRdOptions back to reloption string. So we take a slightly different |
| * approach than the case of heap to AO/CO: we check if there is any AO reloptions: |
| * |
| * (1) If there is, just discard them (AO options do not apply to heap). |
| * (2) If there is none, that means we either have replaced it with heap reloptions |
| * or the reloptions field is just empty, and either way we will pass the existing |
| * reloptions on to the new table. |
| * |
| * This is possible because at this point we only have either AO/AOCO reloptions or |
| * heap reloptions, but we cannot have both (see ATExecSetRelOptions). |
| */ |
| Datum aoreloptions = (Datum) 0; |
| StdRdOptions *stdRdOptions = (StdRdOptions *)default_reloptions(reloptions, |
| false, /* validate */ |
| RELOPT_KIND_APPENDOPTIMIZED); |
| |
| /* |
| * Transform the stdRdOptions to get a reloptions string, from which we will |
| * know if there is any AO reloptions. |
| */ |
| aoreloptions = transformAOStdRdOptions(stdRdOptions, aoreloptions); |
| if (aoreloptions != (Datum) 0) |
| reloptions = (Datum) 0; |
| } |
| |
| if (relpersistence == RELPERSISTENCE_TEMP) |
| namespaceid = LookupCreationNamespace("pg_temp"); |
| else |
| namespaceid = RelationGetNamespace(OldHeap); |
| |
| /* |
| * Create the new heap, using a temporary name in the same namespace as |
| * the existing table. NOTE: there is some risk of collision with user |
| * relnames. Working around this seems more trouble than it's worth; in |
| * particular, we can't create the new heap in a different namespace from |
| * the old, or we will have problems with the TEMP status of temp tables. |
| * |
| * Note: the new heap is not a shared relation, even if we are rebuilding |
| * a shared rel. However, we do make the new heap mapped if the source is |
| * mapped. This simplifies swap_relation_files, and is absolutely |
| * necessary for rebuilding pg_class, for reasons explained there. |
| */ |
| snprintf(NewHeapName, sizeof(NewHeapName), "pg_temp_%u", OIDOldHeap); |
| |
| OIDNewHeap = heap_create_with_catalog(NewHeapName, |
| namespaceid, |
| NewTableSpace, |
| InvalidOid, |
| InvalidOid, |
| InvalidOid, |
| OldHeap->rd_rel->relowner, |
| NewAccessMethod, |
| OldHeapDesc, |
| NIL, |
| RELKIND_RELATION, |
| relpersistence, |
| false, |
| RelationIsMapped(OldHeap), |
| ONCOMMIT_NOOP, |
| makeCdbPolicy? OldHeap->rd_cdbpolicy: NULL,/*CDB*/ |
| reloptions, |
| false, |
| true, |
| true, |
| OIDOldHeap, |
| NULL, |
| true); |
| Assert(OIDNewHeap != InvalidOid); |
| |
| ReleaseSysCache(tuple); |
| |
| /* |
| * Advance command counter so that the newly-created relation's catalog |
| * tuples will be visible to table_open. |
| */ |
| CommandCounterIncrement(); |
| |
| /* |
| * If necessary, create a TOAST table for the new relation, or an Append |
| * Only segment table. |
| * |
| * If the relation doesn't have a TOAST table already, we can't need one |
| * for the new relation. The other way around is possible though: if some |
| * wide columns have been dropped, NewHeapCreateToastTable can decide that |
| * no TOAST table is needed for the new table. |
| * |
| * Note that NewHeapCreateToastTable ends with CommandCounterIncrement, so |
| * that the TOAST table will be visible for insertion. |
| */ |
| toastid = OldHeap->rd_rel->reltoastrelid; |
| if (OidIsValid(toastid)) |
| { |
| /* keep the existing toast table's reloptions, if any */ |
| tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(toastid)); |
| if (!HeapTupleIsValid(tuple)) |
| elog(ERROR, "cache lookup failed for relation %u", toastid); |
| reloptions = SysCacheGetAttr(RELOID, tuple, Anum_pg_class_reloptions, |
| &isNull); |
| if (isNull) |
| reloptions = (Datum) 0; |
| |
| NewHeapCreateToastTable(OIDNewHeap, reloptions, lockmode, toastid); |
| |
| ReleaseSysCache(tuple); |
| } |
| |
| if (IsAccessMethodAO(NewAccessMethod)) |
| NewRelationCreateAOAuxTables(OIDNewHeap, createAoBlockDirectory); |
| |
| CacheInvalidateRelcacheByRelid(OIDNewHeap); |
| |
| /* |
| * Copy the pg_attribute_encoding entries over if new table needs them. |
| * Note that in the case of AM change from heap/ao to aoco, we still need |
| * to do this since we created those entries for the heap/ao table at the |
| * phase 2 of ATSETAM (see ATExecCmd). |
| */ |
| if (NewAccessMethod == AO_COLUMN_TABLE_AM_OID) |
| cloneAttributeEncoding(OIDOldHeap, |
| OIDNewHeap, |
| RelationGetNumberOfAttributes(OldHeap)); |
| |
| table_close(OldHeap, NoLock); |
| |
| return OIDNewHeap; |
| } |
| |
| Oid |
| make_new_heap(Oid OIDOldHeap, Oid NewTableSpace, Oid NewAccessMethod, |
| char relpersistence, |
| LOCKMODE lockmode, |
| bool createAoBlockDirectory, |
| bool makeCdbPolicy) |
| { |
| return make_new_heap_with_colname(OIDOldHeap, NewTableSpace, NewAccessMethod, |
| relpersistence, lockmode, createAoBlockDirectory, makeCdbPolicy, |
| NULL); |
| |
| } |
| |
| /* |
| * Do the physical copying of table data. |
| * |
| * There are three output parameters: |
| * *pSwapToastByContent is set true if toast tables must be swapped by content. |
| * *pFreezeXid receives the TransactionId used as freeze cutoff point. |
| * *pCutoffMulti receives the MultiXactId used as a cutoff point. |
| */ |
static void
copy_table_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose,
				bool *pSwapToastByContent, TransactionId *pFreezeXid,
				MultiXactId *pCutoffMulti)
{
	Relation	NewHeap,
				OldHeap,
				OldIndex;
	Relation	relRelation;
	HeapTuple	reltup;
	Form_pg_class relform;
	TupleDesc	oldTupDesc PG_USED_FOR_ASSERTS_ONLY;
	TupleDesc	newTupDesc PG_USED_FOR_ASSERTS_ONLY;
	TransactionId OldestXmin;
	TransactionId FreezeXid;
	MultiXactId MultiXactCutoff;
	bool		use_sort;
	double		num_tuples = 0,
				tups_vacuumed = 0,
				tups_recently_dead = 0;
	BlockNumber num_pages;
	int			elevel = verbose ? INFO : DEBUG2;
	PGRUsage	ru0;

	/* Start resource-usage tracking for the final progress/log report */
	pg_rusage_init(&ru0);

	/*
	 * Open the relations we need.
	 */
	NewHeap = table_open(OIDNewHeap, AccessExclusiveLock);
	OldHeap = table_open(OIDOldHeap, AccessExclusiveLock);
	if (OidIsValid(OIDOldIndex))
		OldIndex = index_open(OIDOldIndex, AccessExclusiveLock);
	else
		OldIndex = NULL;

	/*
	 * Their tuple descriptors should be exactly alike, but here we only need
	 * assume that they have the same number of columns.
	 */
	oldTupDesc = RelationGetDescr(OldHeap);
	newTupDesc = RelationGetDescr(NewHeap);
	Assert(newTupDesc->natts == oldTupDesc->natts);

	/*
	 * If the OldHeap has a toast table, get lock on the toast table to keep
	 * it from being vacuumed.  This is needed because autovacuum processes
	 * toast tables independently of their main tables, with no lock on the
	 * latter.  If an autovacuum were to start on the toast table after we
	 * compute our OldestXmin below, it would use a later OldestXmin, and then
	 * possibly remove as DEAD toast tuples belonging to main tuples we think
	 * are only RECENTLY_DEAD.  Then we'd fail while trying to copy those
	 * tuples.
	 *
	 * We don't need to open the toast relation here, just lock it.  The lock
	 * will be held till end of transaction.
	 */
	if (OldHeap->rd_rel->reltoastrelid)
		LockRelationOid(OldHeap->rd_rel->reltoastrelid, AccessExclusiveLock);

	/*
	 * If both tables have TOAST tables, perform toast swap by content.  It is
	 * possible that the old table has a toast table but the new one doesn't,
	 * if toastable columns have been dropped.  In that case we have to do
	 * swap by links.  This is okay because swap by content is only essential
	 * for system catalogs, and we don't support schema changes for them.
	 */
	if (OldHeap->rd_rel->reltoastrelid && NewHeap->rd_rel->reltoastrelid)
	{
		*pSwapToastByContent = true;

		/*
		 * When doing swap by content, any toast pointers written into NewHeap
		 * must use the old toast table's OID, because that's where the toast
		 * data will eventually be found.  Set this up by setting rd_toastoid.
		 * This also tells toast_save_datum() to preserve the toast value
		 * OIDs, which we want so as not to invalidate toast pointers in
		 * system catalog caches, and to avoid making multiple copies of a
		 * single toast value.
		 *
		 * Note that we must hold NewHeap open until we are done writing data,
		 * since the relcache will not guarantee to remember this setting once
		 * the relation is closed.  Also, this technique depends on the fact
		 * that no one will try to read from the NewHeap until after we've
		 * finished writing it and swapping the rels --- otherwise they could
		 * follow the toast pointers to the wrong place.  (It would actually
		 * work for values copied over from the old toast table, but not for
		 * any values that we toast which were previously not toasted.)
		 */
		NewHeap->rd_toastoid = OldHeap->rd_rel->reltoastrelid;
	}
	else
		*pSwapToastByContent = false;

	/*
	 * Compute xids used to freeze and weed out dead tuples and multixacts.
	 * Since we're going to rewrite the whole table anyway, there's no reason
	 * not to be aggressive about this.  (Zero "age" arguments request the
	 * most aggressive limits.)
	 */
	vacuum_set_xid_limits(OldHeap, 0, 0, 0, 0,
						  &OldestXmin, &FreezeXid, NULL, &MultiXactCutoff,
						  NULL);

	/*
	 * FreezeXid will become the table's new relfrozenxid, and that mustn't go
	 * backwards, so take the max.
	 */
	if (TransactionIdIsValid(OldHeap->rd_rel->relfrozenxid) &&
		TransactionIdPrecedes(FreezeXid, OldHeap->rd_rel->relfrozenxid))
		FreezeXid = OldHeap->rd_rel->relfrozenxid;

	/*
	 * MultiXactCutoff, similarly, shouldn't go backwards either.
	 */
	if (MultiXactIdIsValid(OldHeap->rd_rel->relminmxid) &&
		MultiXactIdPrecedes(MultiXactCutoff, OldHeap->rd_rel->relminmxid))
		MultiXactCutoff = OldHeap->rd_rel->relminmxid;

	/*
	 * Decide whether to use an indexscan or seqscan-and-optional-sort to scan
	 * the OldHeap.  We know how to use a sort to duplicate the ordering of a
	 * btree index, and will use seqscan-and-sort for that case if the planner
	 * tells us it's cheaper.  Otherwise, always indexscan if an index is
	 * provided, else plain seqscan.
	 */
	if (OldIndex != NULL && IsIndexAccessMethod(OldIndex->rd_rel->relam, BTREE_AM_OID))
		use_sort = plan_cluster_use_sort(OIDOldHeap, OIDOldIndex);
	else
		use_sort = false;

	/* Log what we're doing */
	if (OldIndex != NULL && !use_sort)
		ereport(elevel,
				(errmsg("clustering \"%s.%s\" using index scan on \"%s\"",
						get_namespace_name(RelationGetNamespace(OldHeap)),
						RelationGetRelationName(OldHeap),
						RelationGetRelationName(OldIndex))));
	else if (use_sort)
		ereport(elevel,
				(errmsg("clustering \"%s.%s\" using sequential scan and sort",
						get_namespace_name(RelationGetNamespace(OldHeap)),
						RelationGetRelationName(OldHeap))));
	else
		ereport(elevel,
				(errmsg("vacuuming \"%s.%s\"",
						get_namespace_name(RelationGetNamespace(OldHeap)),
						RelationGetRelationName(OldHeap))));

	/*
	 * Hand off the actual copying to AM specific function, the generic code
	 * cannot know how to deal with visibility across AMs. Note that this
	 * routine is allowed to set FreezeXid / MultiXactCutoff to different
	 * values (e.g. because the AM doesn't use freezing).
	 */
	table_relation_copy_for_cluster(OldHeap, NewHeap, OldIndex, use_sort,
									OldestXmin, &FreezeXid, &MultiXactCutoff,
									&num_tuples, &tups_vacuumed,
									&tups_recently_dead);

	/* return selected values to caller, get set as relfrozenxid/minmxid */
	*pFreezeXid = FreezeXid;
	*pCutoffMulti = MultiXactCutoff;

	/* Reset rd_toastoid just to be tidy --- it shouldn't be looked at again */
	NewHeap->rd_toastoid = InvalidOid;

	num_pages = RelationGetNumberOfBlocks(NewHeap);

	/* Log what we did */
	ereport(elevel,
			(errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u pages",
					RelationGetRelationName(OldHeap),
					tups_vacuumed, num_tuples,
					RelationGetNumberOfBlocks(OldHeap)),
			 errdetail("%.0f dead row versions cannot be removed yet.\n"
					   "%s.",
					   tups_recently_dead,
					   pg_rusage_show(&ru0))));

	if (OldIndex != NULL)
		index_close(OldIndex, NoLock);
	table_close(OldHeap, NoLock);
	table_close(NewHeap, NoLock);

	/* Update pg_class to reflect the correct values of pages and tuples. */
	relRelation = table_open(RelationRelationId, RowExclusiveLock);

	reltup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(OIDNewHeap));
	if (!HeapTupleIsValid(reltup))
		elog(ERROR, "cache lookup failed for relation %u", OIDNewHeap);
	relform = (Form_pg_class) GETSTRUCT(reltup);

	relform->relpages = num_pages;
	relform->reltuples = num_tuples;

	/* Don't update the stats for pg_class.  See swap_relation_files. */
	if (OIDOldHeap != RelationRelationId)
		CatalogTupleUpdate(relRelation, &reltup->t_self, reltup);
	else
		CacheInvalidateRelcacheByTuple(reltup);

	/* Clean up. */
	heap_freetuple(reltup);
	table_close(relRelation, RowExclusiveLock);

	/* Make the update visible */
	CommandCounterIncrement();
}
| |
| /* |
| * Change dependency links for objects that are being swapped. |
| * |
| * 'tabletype' can be "TOAST table", "aoseg", "aoblkdir". |
| * It is used for printing error messages. |
| */ |
| static void |
| changeDependencyLinks(Oid baseOid1, Oid baseOid2, Oid oid1, Oid oid2, |
| const char *tabletype) |
| { |
| ObjectAddress baseobject, newobject; |
| long count; |
| |
| /* Delete old dependencies */ |
| if (oid1) |
| { |
| count = deleteDependencyRecordsFor(RelationRelationId, oid1, false); |
| if (count != 1) |
| elog(ERROR, "expected one dependency record for %s table, found %ld", |
| tabletype, count); |
| } |
| |
| if (oid2) |
| { |
| count = deleteDependencyRecordsFor(RelationRelationId, oid2, false); |
| if (count != 1) |
| elog(ERROR, "expected one dependency record for %s table, found %ld", |
| tabletype, count); |
| } |
| |
| /* Register new dependencies */ |
| baseobject.classId = RelationRelationId; |
| baseobject.objectSubId = 0; |
| newobject.classId = RelationRelationId; |
| newobject.objectSubId = 0; |
| |
| if (oid1) |
| { |
| baseobject.objectId = baseOid1; |
| newobject.objectId = oid1; |
| recordDependencyOn(&newobject, &baseobject, DEPENDENCY_INTERNAL); |
| } |
| |
| if (oid2) |
| { |
| baseobject.objectId = baseOid2; |
| newobject.objectId = oid2; |
| recordDependencyOn(&newobject, &baseobject, DEPENDENCY_INTERNAL); |
| } |
| } |
| |
| /* |
| * Swap the physical files of two given relations. |
| * |
| * We swap the physical identity (reltablespace, relfilenode) while keeping the |
| * same logical identities of the two relations. relpersistence is also |
| * swapped, which is critical since it determines where buffers live for each |
| * relation. |
| * |
| * We can swap associated TOAST data in either of two ways: recursively swap |
| * the physical content of the toast tables (and their indexes), or swap the |
| * TOAST links in the given relations' pg_class entries. The former is needed |
| * to manage rewrites of shared catalogs (where we cannot change the pg_class |
| * links) while the latter is the only way to handle cases in which a toast |
| * table is added or removed altogether. |
| * |
| * Additionally, the first relation is marked with relfrozenxid set to |
| * frozenXid. It seems a bit ugly to have this here, but the caller would |
| * have to do it anyway, so having it here saves a heap_update. Note: in |
| * the swap-toast-links case, we assume we don't need to change the toast |
| * table's relfrozenxid: the new version of the toast table should already |
| * have relfrozenxid set to RecentXmin, which is good enough. |
| * |
| * Lastly, if r2 and its toast table and toast index (if any) are mapped, |
| * their OIDs are emitted into mapped_tables[]. This is hacky but beats |
| * having to look the information up again later in finish_heap_swap. |
| * |
| * GPDB: also swap aoseg, aoblkdir links. |
| */ |
void
swap_relation_files(Oid r1, Oid r2, bool target_is_pg_class,
					bool swap_toast_by_content,
					bool swap_stats,
					bool is_internal,
					TransactionId frozenXid,
					MultiXactId cutoffMulti,
					Oid *mapped_tables)
{
	Relation	relRelation,
				rel;
	HeapTuple	reltup1,
				reltup2;
	Form_pg_class relform1,
				relform2;
	Oid			relfilenode1,
				relfilenode2;
	Oid			swaptemp;
	char		swptmpchr;

	/* We need writable copies of both pg_class tuples. */
	relRelation = table_open(RelationRelationId, RowExclusiveLock);

	reltup1 = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(r1));
	if (!HeapTupleIsValid(reltup1))
		elog(ERROR, "cache lookup failed for relation %u", r1);
	relform1 = (Form_pg_class) GETSTRUCT(reltup1);

	reltup2 = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(r2));
	if (!HeapTupleIsValid(reltup2))
		elog(ERROR, "cache lookup failed for relation %u", r2);
	relform2 = (Form_pg_class) GETSTRUCT(reltup2);

	/*
	 * GPDB: if either relation is append-optimized, adjust/swap the AO
	 * catalog entries as well (see ATAOEntries for details).
	 */
	if (IsAccessMethodAO(relform1->relam) || IsAccessMethodAO(relform2->relam))
		ATAOEntries(relform1, relform2, frozenXid, cutoffMulti);

	/* Also swap reloptions if we are swapping between heap and AO/AOCO tables. */
	if ((relform1->relam == HEAP_TABLE_AM_OID && IsAccessMethodAO(relform2->relam)) ||
		(relform2->relam == HEAP_TABLE_AM_OID && IsAccessMethodAO(relform1->relam)))
	{
		Datum		val[Natts_pg_class] = {0};
		bool		null[Natts_pg_class] = {0};
		bool		repl[Natts_pg_class] = {0};
		bool		isNull;

		/* Copy rel2's reloptions (possibly NULL) into rel1's tuple */
		val[Anum_pg_class_reloptions - 1] = SysCacheGetAttr(RELOID, reltup2, Anum_pg_class_reloptions, &isNull);
		null[Anum_pg_class_reloptions - 1] = isNull;
		repl[Anum_pg_class_reloptions - 1] = true;

		/* heap_modify_tuple returns a fresh copy; re-fetch relform1 from it */
		reltup1 = heap_modify_tuple(reltup1, RelationGetDescr(relRelation),
									val, null, repl);
		relform1 = (Form_pg_class) GETSTRUCT(reltup1);
	}

	/* InvalidOid relfilenode means the relation is mapped */
	relfilenode1 = relform1->relfilenode;
	relfilenode2 = relform2->relfilenode;

	if (OidIsValid(relfilenode1) && OidIsValid(relfilenode2))
	{
		/*
		 * Normal non-mapped relations: swap relfilenodes, reltablespaces,
		 * relpersistence
		 */
		Assert(!target_is_pg_class);

		swaptemp = relform1->relfilenode;
		relform1->relfilenode = relform2->relfilenode;
		relform2->relfilenode = swaptemp;

		swaptemp = relform1->reltablespace;
		relform1->reltablespace = relform2->reltablespace;
		relform2->reltablespace = swaptemp;

		/* GPDB: also swap the access method, to support ALTER TABLE SET AM */
		swaptemp = relform1->relam;
		relform1->relam = relform2->relam;
		relform2->relam = swaptemp;

		swptmpchr = relform1->relpersistence;
		relform1->relpersistence = relform2->relpersistence;
		relform2->relpersistence = swptmpchr;

		/* Also swap toast links, if we're swapping by links */
		if (!swap_toast_by_content)
		{
			swaptemp = relform1->reltoastrelid;
			relform1->reltoastrelid = relform2->reltoastrelid;
			relform2->reltoastrelid = swaptemp;
		}
	}
	else
	{
		/*
		 * Mapped-relation case.  Here we have to swap the relation mappings
		 * instead of modifying the pg_class columns.  Both must be mapped.
		 */
		if (OidIsValid(relfilenode1) || OidIsValid(relfilenode2))
			elog(ERROR, "cannot swap mapped relation \"%s\" with non-mapped relation",
				 NameStr(relform1->relname));

		/*
		 * We can't change the tablespace nor persistence of a mapped rel, and
		 * we can't handle toast link swapping for one either, because we must
		 * not apply any critical changes to its pg_class row.  These cases
		 * should be prevented by upstream permissions tests, so these checks
		 * are non-user-facing emergency backstop.
		 */
		if (relform1->reltablespace != relform2->reltablespace)
			elog(ERROR, "cannot change tablespace of mapped relation \"%s\"",
				 NameStr(relform1->relname));
		if (relform1->relpersistence != relform2->relpersistence)
			elog(ERROR, "cannot change persistence of mapped relation \"%s\"",
				 NameStr(relform1->relname));
		if (relform1->relam != relform2->relam)
			elog(ERROR, "cannot change access method of mapped relation \"%s\"",
				 NameStr(relform1->relname));
		if (!swap_toast_by_content &&
			(relform1->reltoastrelid || relform2->reltoastrelid))
			elog(ERROR, "cannot swap toast by links for mapped relation \"%s\"",
				 NameStr(relform1->relname));

		/*
		 * Fetch the mappings --- shouldn't fail, but be paranoid
		 */
		relfilenode1 = RelationMapOidToFilenode(r1, relform1->relisshared);
		if (!OidIsValid(relfilenode1))
			elog(ERROR, "could not find relation mapping for relation \"%s\", OID %u",
				 NameStr(relform1->relname), r1);
		relfilenode2 = RelationMapOidToFilenode(r2, relform2->relisshared);
		if (!OidIsValid(relfilenode2))
			elog(ERROR, "could not find relation mapping for relation \"%s\", OID %u",
				 NameStr(relform2->relname), r2);

		/*
		 * Send replacement mappings to relmapper.  Note these won't actually
		 * take effect until CommandCounterIncrement.
		 */
		RelationMapUpdateMap(r1, relfilenode2, relform1->relisshared, false);
		RelationMapUpdateMap(r2, relfilenode1, relform2->relisshared, false);

		/* Pass OIDs of mapped r2 tables back to caller */
		*mapped_tables++ = r2;
	}

	/*
	 * Recognize that rel1's relfilenode (swapped from rel2) is new in this
	 * subtransaction. The rel2 storage (swapped from rel1) may or may not be
	 * new.
	 */
	{
		Relation	rel1,
					rel2;

		rel1 = relation_open(r1, NoLock);
		rel2 = relation_open(r2, NoLock);
		rel2->rd_createSubid = rel1->rd_createSubid;
		rel2->rd_newRelfilenodeSubid = rel1->rd_newRelfilenodeSubid;
		rel2->rd_firstRelfilenodeSubid = rel1->rd_firstRelfilenodeSubid;
		RelationAssumeNewRelfilenode(rel1);
		relation_close(rel1, NoLock);
		relation_close(rel2, NoLock);
	}

	/*
	 * In the case of a shared catalog, these next few steps will only affect
	 * our own database's pg_class row; but that's okay, because they are all
	 * noncritical updates.  That's also an important fact for the case of a
	 * mapped catalog, because it's possible that we'll commit the map change
	 * and then fail to commit the pg_class update.
	 */

	/* set rel1's frozen Xid and minimum MultiXid */
	if (relform1->relkind != RELKIND_INDEX)
	{
		Assert(!TransactionIdIsValid(frozenXid) ||
			   TransactionIdIsNormal(frozenXid));
		relform1->relfrozenxid = frozenXid;
		Assert(MultiXactIdIsValid(cutoffMulti));
		relform1->relminmxid = cutoffMulti;
	}
	/*
	 * Cloudberry: append-optimized tables do not have a valid relfrozenxid.
	 * Overwrite the entry for both relations.
	 */
	if (relform1->relkind != RELKIND_INDEX && IsAccessMethodAO(relform1->relam))
		relform1->relfrozenxid = InvalidTransactionId;
	if (relform2->relkind != RELKIND_INDEX && IsAccessMethodAO(relform2->relam))
		relform2->relfrozenxid = InvalidTransactionId;

	/* swap size statistics too, since new rel has freshly-updated stats */
	if (swap_stats)
	{
		int32		swap_pages;
		float4		swap_tuples;
		int32		swap_allvisible;

		swap_pages = relform1->relpages;
		relform1->relpages = relform2->relpages;
		relform2->relpages = swap_pages;

		swap_tuples = relform1->reltuples;
		relform1->reltuples = relform2->reltuples;
		relform2->reltuples = swap_tuples;

		swap_allvisible = relform1->relallvisible;
		relform1->relallvisible = relform2->relallvisible;
		relform2->relallvisible = swap_allvisible;
	}

	/*
	 * Swap auxiliary tables if the table AM has non-standard structure.
	 * See the details of the callback swap_relation_files.
	 */
	if ((relform1->relkind == RELKIND_RELATION ||
		 relform1->relkind == RELKIND_MATVIEW)
		&& (relform1->relam == PAX_AM_OID ||
			relform2->relam == PAX_AM_OID))
	{
		const TableAmRoutine *tam;
		Oid			relam;

		relam = relform1->relam;
		if (relam != relform2->relam)
			elog(ERROR, "PAX not allow swap relation files for different AM");

		tam = GetTableAmRoutineByAmId(relam);
		if (tam->swap_relation_files)
			tam->swap_relation_files(r1, r2, frozenXid, cutoffMulti);
	}

	/*
	 * Update the tuples in pg_class --- unless the target relation of the
	 * swap is pg_class itself.  In that case, there is zero point in making
	 * changes because we'd be updating the old data that we're about to throw
	 * away.  Because the real work being done here for a mapped relation is
	 * just to change the relation map settings, it's all right to not update
	 * the pg_class rows in this case. The most important changes will instead
	 * performed later, in finish_heap_swap() itself.
	 */
	if (!target_is_pg_class)
	{
		CatalogIndexState indstate;

		indstate = CatalogOpenIndexes(relRelation);
		CatalogTupleUpdateWithInfo(relRelation, &reltup1->t_self, reltup1,
								   indstate);
		CatalogTupleUpdateWithInfo(relRelation, &reltup2->t_self, reltup2,
								   indstate);
		CatalogCloseIndexes(indstate);

		/*
		 * Increment counter to reflect the AM change as the caller might soon
		 * build the new relation descriptor which expects consistent AM and aux
		 * tables. This shouldn't be needed for other cases as of now, especially
		 * not for critical catalogs such as pg_attribute.
		 */
		if (relform1->relam != relform2->relam)
			CommandCounterIncrement();
	}
	else
	{
		/* no update ... but we do still need relcache inval */
		CacheInvalidateRelcacheByTuple(reltup1);
		CacheInvalidateRelcacheByTuple(reltup2);
	}

	/*
	 * Post alter hook for modified relations. The change to r2 is always
	 * internal, but r1 depends on the invocation context.
	 */
	InvokeObjectPostAlterHookArg(RelationRelationId, r1, 0,
								 InvalidOid, is_internal);
	InvokeObjectPostAlterHookArg(RelationRelationId, r2, 0,
								 InvalidOid, true);

	/*
	 * If we have toast tables associated with the relations being swapped,
	 * deal with them too.
	 */
	if (relform1->reltoastrelid || relform2->reltoastrelid)
	{
		if (swap_toast_by_content)
		{
			if (relform1->reltoastrelid && relform2->reltoastrelid)
			{
				/* Recursively swap the contents of the toast tables */
				swap_relation_files(relform1->reltoastrelid,
									relform2->reltoastrelid,
									target_is_pg_class,
									swap_toast_by_content,
									swap_stats,
									is_internal,
									frozenXid,
									cutoffMulti,
									mapped_tables);
			}
			else
			{
				/* caller messed up */
				elog(ERROR, "cannot swap toast files by content when there's only one");
			}
		}
		else
		{
			/*
			 * We swapped the ownership links, so we need to change dependency
			 * data to match.
			 *
			 * NOTE: it is possible that only one table has a toast table.
			 *
			 * NOTE: at present, a TOAST table's only dependency is the one on
			 * its owning table.  If more are ever created, we'd need to use
			 * something more selective than deleteDependencyRecordsFor() to
			 * get rid of just the link we want.
			 */

			/*
			 * We disallow this case for system catalogs, to avoid the
			 * possibility that the catalog we're rebuilding is one of the
			 * ones the dependency changes would change.  It's too late to be
			 * making any data changes to the target catalog.
			 */
			if (IsSystemClass(r1, relform1))
				elog(ERROR, "cannot swap toast files by links for system catalogs");

			/* Delete old dependencies */
			changeDependencyLinks(r1, r2,
								  relform1->reltoastrelid, relform2->reltoastrelid,
								  "TOAST");
		}
	}

#ifdef USE_ASSERT_CHECKING
	/*
	 * Check with assert if AO table's toast table kept existing relfrozenxid unchanged.
	 *
	 * CLUSTER operation on append-optimized tables does not
	 * compute freeze limit (frozenXid) because AO tables do not
	 * have relfrozenxid. The toast tables need to keep existing
	 * relfrozenxid value unchanged in this case.
	 */
	if (swap_toast_by_content
		&& frozenXid == InvalidTransactionId
		&& relform1->relkind == RELKIND_TOASTVALUE
		&& relform2->relkind == RELKIND_TOASTVALUE)
	{
		Assert(relform1->relfrozenxid == relform2->relfrozenxid);
	}
#endif

	/*
	 * If we're swapping two toast tables by content, do the same for their
	 * valid index. The swap can actually be safely done only if the relations
	 * have indexes.
	 */
	if (swap_toast_by_content &&
		relform1->relkind == RELKIND_TOASTVALUE &&
		relform2->relkind == RELKIND_TOASTVALUE)
	{
		Oid			toastIndex1,
					toastIndex2;

		/* Get valid index for each relation */
		toastIndex1 = toast_get_valid_index(r1,
											AccessExclusiveLock);
		toastIndex2 = toast_get_valid_index(r2,
											AccessExclusiveLock);

		swap_relation_files(toastIndex1,
							toastIndex2,
							target_is_pg_class,
							swap_toast_by_content,
							swap_stats,
							is_internal,
							InvalidTransactionId,
							InvalidMultiXactId,
							mapped_tables);
	}

	/* Send statistics from QE to QD */
	if (Gp_role == GP_ROLE_EXECUTE && swap_stats && !IsSystemClass(r1, relform1))
	{
		rel = relation_open(r1, AccessShareLock);

		vac_send_relstats_to_qd(rel,
								relform1->relpages,
								relform1->reltuples,
								relform1->relallvisible);

		relation_close(rel, AccessShareLock);
	}
	/* Clean up. */
	heap_freetuple(reltup1);
	heap_freetuple(reltup2);

	table_close(relRelation, RowExclusiveLock);

	/*
	 * Close both relcache entries' smgr links.  We need this kluge because
	 * both links will be invalidated during upcoming CommandCounterIncrement.
	 * Whichever of the rels is the second to be cleared will have a dangling
	 * reference to the other's smgr entry.  Rather than trying to avoid this
	 * by ordering operations just so, it's easiest to close the links first.
	 * (Fortunately, since one of the entries is local in our transaction,
	 * it's sufficient to clear out our own relcache this way; the problem
	 * cannot arise for other backends when they see our update on the
	 * non-transient relation.)
	 *
	 * Caution: the placement of this step interacts with the decision to
	 * handle toast rels by recursion.  When we are trying to rebuild pg_class
	 * itself, the smgr close on pg_class must happen after all accesses in
	 * this function.
	 */
	RelationCloseSmgrByOid(r1);
	RelationCloseSmgrByOid(r2);
}
| |
| /* |
| * Remove the transient table that was built by make_new_heap, and finish |
| * cleaning up (including rebuilding all indexes on the old heap). |
| */ |
void
finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap,
				 bool is_system_catalog,
				 bool swap_toast_by_content,
				 bool swap_stats,
				 bool check_constraints,
				 bool is_internal,
				 TransactionId frozenXid,
				 MultiXactId cutoffMulti,
				 char newrelpersistence)
{
	ObjectAddress object;
	Oid			mapped_tables[4];
	int			reindex_flags;
	ReindexParams reindex_params = {0};
	int			i;

	/* Report that we are now swapping relation files */
	pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
								 PROGRESS_CLUSTER_PHASE_SWAP_REL_FILES);

	/* Zero out possible results from swap_relation_files */
	memset(mapped_tables, 0, sizeof(mapped_tables));

	/*
	 * Swap the contents of the heap relations (including any toast tables).
	 * Also set old heap's relfrozenxid to frozenXid.
	 */
	swap_relation_files(OIDOldHeap, OIDNewHeap,
						(OIDOldHeap == RelationRelationId),
						swap_toast_by_content,
						swap_stats,
						is_internal,
						frozenXid, cutoffMulti, mapped_tables);

	/* GPDB: fault-injection point for regression tests */
	SIMPLE_FAULT_INJECTOR("after_swap_relation_files");

	/*
	 * If it's a system catalog, queue a sinval message to flush all catcaches
	 * on the catalog when we reach CommandCounterIncrement.
	 */
	if (is_system_catalog)
		CacheInvalidateCatalog(OIDOldHeap);

	/*
	 * Rebuild each index on the relation (but not the toast table, which is
	 * all-new at this point).  It is important to do this before the DROP
	 * step because if we are processing a system catalog that will be used
	 * during DROP, we want to have its indexes available.  There is no
	 * advantage to the other order anyway because this is all transactional,
	 * so no chance to reclaim disk space before commit.  We do not need a
	 * final CommandCounterIncrement() because reindex_relation does it.
	 *
	 * Note: because index_build is called via reindex_relation, it will never
	 * set indcheckxmin true for the indexes.  This is OK even though in some
	 * sense we are building new indexes rather than rebuilding existing ones,
	 * because the new heap won't contain any HOT chains at all, let alone
	 * broken ones, so it can't be necessary to set indcheckxmin.
	 */
	reindex_flags = REINDEX_REL_SUPPRESS_INDEX_USE;
	if (check_constraints)
		reindex_flags |= REINDEX_REL_CHECK_CONSTRAINTS;

	/*
	 * Ensure that the indexes have the same persistence as the parent
	 * relation.
	 */
	if (newrelpersistence == RELPERSISTENCE_UNLOGGED)
		reindex_flags |= REINDEX_REL_FORCE_INDEXES_UNLOGGED;
	else if (newrelpersistence == RELPERSISTENCE_PERMANENT)
		reindex_flags |= REINDEX_REL_FORCE_INDEXES_PERMANENT;

	/* Report that we are now reindexing relations */
	pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
								 PROGRESS_CLUSTER_PHASE_REBUILD_INDEX);

	reindex_relation(OIDOldHeap, reindex_flags, &reindex_params);

	/* Report that we are now doing clean up */
	pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
								 PROGRESS_CLUSTER_PHASE_FINAL_CLEANUP);

	/*
	 * If the relation being rebuilt is pg_class, swap_relation_files()
	 * couldn't update pg_class's own pg_class entry (check comments in
	 * swap_relation_files()), thus relfrozenxid was not updated. That's
	 * annoying because a potential reason for doing a VACUUM FULL is a
	 * imminent or actual anti-wraparound shutdown.  So, now that we can
	 * access the new relation using its indices, update relfrozenxid.
	 * pg_class doesn't have a toast relation, so we don't need to update the
	 * corresponding toast relation. Note that there's little point moving all
	 * relfrozenxid updates here since swap_relation_files() needs to write to
	 * pg_class for non-mapped relations anyway.
	 */
	if (OIDOldHeap == RelationRelationId)
	{
		Relation	relRelation;
		HeapTuple	reltup;
		Form_pg_class relform;

		relRelation = table_open(RelationRelationId, RowExclusiveLock);

		reltup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(OIDOldHeap));
		if (!HeapTupleIsValid(reltup))
			elog(ERROR, "cache lookup failed for relation %u", OIDOldHeap);
		relform = (Form_pg_class) GETSTRUCT(reltup);

		relform->relfrozenxid = frozenXid;
		relform->relminmxid = cutoffMulti;

		CatalogTupleUpdate(relRelation, &reltup->t_self, reltup);

		table_close(relRelation, RowExclusiveLock);
	}

	/* Destroy new heap with old filenode */
	object.classId = RelationRelationId;
	object.objectId = OIDNewHeap;
	object.objectSubId = 0;

	/*
	 * The new relation is local to our transaction and we know nothing
	 * depends on it, so DROP_RESTRICT should be OK.
	 */
	performDeletion(&object, DROP_RESTRICT, PERFORM_DELETION_INTERNAL);

	/* performDeletion does CommandCounterIncrement at end */

	/*
	 * Now we must remove any relation mapping entries that we set up for the
	 * transient table, as well as its toast table and toast index if any. If
	 * we fail to do this before commit, the relmapper will complain about new
	 * permanent map entries being added post-bootstrap.
	 */
	for (i = 0; OidIsValid(mapped_tables[i]); i++)
		RelationMapRemoveMapping(mapped_tables[i]);

	/*
	 * At this point, everything is kosher except that, if we did toast swap
	 * by links, the toast table's name corresponds to the transient table.
	 * The name is irrelevant to the backend because it's referenced by OID,
	 * but users looking at the catalogs could be confused.  Rename it to
	 * prevent this problem.
	 *
	 * Note no lock required on the relation, because we already hold an
	 * exclusive lock on it.
	 */
	if (!swap_toast_by_content)
	{
		Relation	newrel;

		newrel = table_open(OIDOldHeap, NoLock);
		if (OidIsValid(newrel->rd_rel->reltoastrelid))
		{
			Oid			toastidx;
			char		NewToastName[NAMEDATALEN];

			/* Get the associated valid index to be renamed */
			toastidx = toast_get_valid_index(newrel->rd_rel->reltoastrelid,
											 NoLock);

			/* rename the toast table ... */
			snprintf(NewToastName, NAMEDATALEN, "pg_toast_%u",
					 OIDOldHeap);
			RenameRelationInternal(newrel->rd_rel->reltoastrelid,
								   NewToastName, true, false);

			/* ... and its valid index too. */
			snprintf(NewToastName, NAMEDATALEN, "pg_toast_%u_index",
					 OIDOldHeap);

			RenameRelationInternal(toastidx,
								   NewToastName, true, true);

			/*
			 * Reset the relrewrite for the toast. The command-counter
			 * increment is required here as we are about to update
			 * the tuple that is updated as part of RenameRelationInternal.
			 */
			CommandCounterIncrement();
			ResetRelRewrite(newrel->rd_rel->reltoastrelid);
		}
		relation_close(newrel, NoLock);
	}

	/* if it's not a catalog table, clear any missing attribute settings */
	if (!is_system_catalog)
	{
		Relation	newrel;

		newrel = table_open(OIDOldHeap, NoLock);
		RelationClearMissing(newrel);
		relation_close(newrel, NoLock);
	}
}
| |
| |
| /* |
| * Get a list of tables that the current user owns and |
| * have indisclustered set. Return the list in a List * of RelToCluster |
| * (stored in the specified memory context), each one giving the tableOid |
| * and the indexOid on which the table is already clustered. |
| */ |
| static List * |
| get_tables_to_cluster(MemoryContext cluster_context) |
| { |
| Relation indRelation; |
| TableScanDesc scan; |
| ScanKeyData entry; |
| HeapTuple indexTuple; |
| Form_pg_index index; |
| MemoryContext old_context; |
| RelToCluster *rvtc; |
| List *rvs = NIL; |
| |
| /* |
| * Get all indexes that have indisclustered set and are owned by |
| * appropriate user. |
| */ |
| indRelation = table_open(IndexRelationId, AccessShareLock); |
| ScanKeyInit(&entry, |
| Anum_pg_index_indisclustered, |
| BTEqualStrategyNumber, F_BOOLEQ, |
| BoolGetDatum(true)); |
| scan = table_beginscan_catalog(indRelation, 1, &entry); |
| while ((indexTuple = heap_getnext(scan, ForwardScanDirection)) != NULL) |
| { |
| index = (Form_pg_index) GETSTRUCT(indexTuple); |
| |
| if (!pg_class_ownercheck(index->indrelid, GetUserId())) |
| continue; |
| |
| /* |
| * We have to build the list in a different memory context so it will |
| * survive the cross-transaction processing |
| */ |
| old_context = MemoryContextSwitchTo(cluster_context); |
| |
| rvtc = (RelToCluster *) palloc(sizeof(RelToCluster)); |
| rvtc->tableOid = index->indrelid; |
| rvtc->indexOid = index->indexrelid; |
| rvs = lappend(rvs, rvtc); |
| |
| MemoryContextSwitchTo(old_context); |
| } |
| table_endscan(scan); |
| |
| relation_close(indRelation, AccessShareLock); |
| |
| return rvs; |
| } |