src/backend/utils/cache/relcache.c - hawq - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 /*-------------------------------------------------------------------------
  *
  * relcache.c
  *	  POSTGRES relation descriptor cache code
  *
  * Portions Copyright (c) 2005-2009, Greenplum inc.
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  *
  * IDENTIFICATION
  *	  $PostgreSQL: pgsql/src/backend/utils/cache/relcache.c,v 1.250.2.1 2008/02/27 17:44:33 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
 /*
  * INTERFACE ROUTINES
  *		RelationCacheInitialize			- initialize relcache (to empty)
  *		RelationCacheInitializePhase2	- finish initializing relcache
  *		RelationIdGetRelation			- get a reldesc by relation id
  *		RelationClose					- close an open relation
  *
  * NOTES
  *		The following code contains many undocumented hacks.  Please be
  *		careful....
  */
 #include "postgres.h"

 #include <sys/file.h>
 #include <fcntl.h>
 #include <unistd.h>

 #include "access/genam.h"
 #include "access/heapam.h"
 #include "access/reloptions.h"
 #include "access/sysattr.h"
 #include "access/xact.h"
 #include "catalog/catalog.h"
 #include "catalog/catquery.h"
 #include "catalog/indexing.h"
 #include "catalog/namespace.h"
 #include "catalog/pg_amop.h"
 #include "catalog/pg_amproc.h"
 #include "catalog/pg_attrdef.h"
 #include "catalog/pg_authid.h"
 #include "catalog/pg_constraint.h"
 #include "catalog/pg_namespace.h"
 #include "catalog/pg_opclass.h"
 #include "catalog/pg_operator.h"
 #include "catalog/pg_proc.h"
 #include "catalog/pg_rewrite.h"
 #include "catalog/pg_tablespace.h"
 #include "catalog/pg_trigger.h"
 #include "catalog/pg_type.h"
 #include "commands/trigger.h"
 #include "commands/dbcommands.h"
 #include "miscadmin.h"
 #include "optimizer/clauses.h"
 #include "optimizer/planmain.h"
 #include "optimizer/prep.h"
 #include "rewrite/rewriteDefine.h"
 #include "storage/fd.h"
 #include "storage/smgr.h"
 #include "utils/builtins.h"
 #include "utils/fmgroids.h"
 #include "utils/inval.h"
 #include "utils/memutils.h"
 #include "utils/relcache.h"
 #include "utils/resowner.h"
 #include "utils/syscache.h"
 #include "utils/typcache.h"

 #include "cdb/cdbcat.h"         /* GpPolicy */
 #include "cdb/cdbvars.h"        /* Gp_role */
 #include "cdb/cdbmirroredflatfile.h"
 #include "cdb/cdbpersistentfilesysobj.h"
 #include "cdb/cdbpersistentrelation.h"
 #include "libpq/libpq-be.h"		/* MyProcPort: dereferencing pointer to incomplete type */


 /*
  * name of relcache init file, used to speed up backend startup
  */
 #define RELCACHE_INIT_FILENAME	"pg_internal.init"

 #define RELCACHE_INIT_FILEMAGIC		0x573263	/* version ID value */

 /*
  *		hardcoded tuple descriptors.  see include/catalog/pg_attribute.h
  */
 static const FormData_pg_attribute Desc_pg_class[Natts_pg_class] = {Schema_pg_class};
 static const FormData_pg_attribute Desc_pg_attribute[Natts_pg_attribute] = {Schema_pg_attribute};
 static const FormData_pg_attribute Desc_pg_proc[Natts_pg_proc] = {Schema_pg_proc};
 static const FormData_pg_attribute Desc_pg_type[Natts_pg_type] = {Schema_pg_type};
 static const FormData_pg_attribute Desc_pg_index[Natts_pg_index] = {Schema_pg_index};

 /*
  *		Hash tables that index the relation cache
  *
  *		We used to index the cache by both name and OID, but now there
  *		is only an index by OID.
  */
 typedef struct relidcacheent
 {
 	Oid			reloid;
 	Relation	reldesc;
 } RelIdCacheEnt;

 static HTAB *RelationIdCache;

 /*
  * This flag is false until we have prepared the critical relcache entries
  * that are needed to do indexscans on the tables read by relcache building.
  */
 bool		criticalRelcachesBuilt = false;

 /*
  * This counter counts relcache inval events received since backend startup
  * (but only for rels that are actually in cache).	Presently, we use it only
  * to detect whether data about to be written by write_relcache_init_file()
  * might already be obsolete.
  */
 static long relcacheInvalsReceived = 0L;

 /*
  * This list remembers the OIDs of the relations cached in the relcache
  * init file.
  */
 static List *initFileRelationIds = NIL;

 /*
  * This flag lets us optimize away work in AtEO(Sub)Xact_RelationCache().
  */
 static bool need_eoxact_work = false;


 /*
  *		macros to manipulate the lookup hashtables
  */
 #define RelationCacheInsert(RELATION)	\
 do { \
 	RelIdCacheEnt *idhentry; bool found; \
 	idhentry = (RelIdCacheEnt*)hash_search(RelationIdCache, \
 										   (void *) &(RELATION->rd_id), \
 										   HASH_ENTER, \
 										   &found); \
 	/* used to give notice if found -- now just keep quiet */ \
 	idhentry->reldesc = RELATION; \
 } while(0)

 #define RelationIdCacheLookup(ID, RELATION) \
 do { \
 	RelIdCacheEnt *hentry; \
 	hentry = (RelIdCacheEnt*)hash_search(RelationIdCache, \
 										 (void *) &(ID), HASH_FIND,NULL); \
 	if (hentry) \
 		RELATION = hentry->reldesc; \
 	else \
 		RELATION = NULL; \
 } while(0)

 #define RelationCacheDelete(RELATION) \
 do { \
 	RelIdCacheEnt *idhentry; \
 	idhentry = (RelIdCacheEnt*)hash_search(RelationIdCache, \
 										   (void *) &(RELATION->rd_id), \
 										   HASH_REMOVE, NULL); \
 	if (idhentry == NULL) \
 		elog(WARNING, "trying to delete a rd_id reldesc that does not exist"); \
 } while(0)


 /*
  * Special cache for opclass-related information
  *
  * Note: only default operators and support procs get cached, ie, those with
  * lefttype = righttype = opcintype.
  */
 typedef struct opclasscacheent
 {
 	Oid			opclassoid;		/* lookup key: OID of opclass */
 	bool		valid;			/* set TRUE after successful fill-in */
 	StrategyNumber numStrats;	/* max # of strategies (from pg_am) */
 	StrategyNumber numSupport;	/* max # of support procs (from pg_am) */
 	Oid		   *operatorOids;	/* strategy operators' OIDs */
 	RegProcedure *supportProcs; /* support procs */
 } OpClassCacheEnt;

 static HTAB *OpClassCache = NULL;


 /* non-export function prototypes */

 static void RelationDestroyRelation(Relation relation);
 static void RelationClearRelation(Relation relation, bool rebuild);

 static void RelationReloadClassinfo(Relation relation);
 static void RelationFlushRelation(Relation relation);
 static bool load_relcache_init_file(void);
 static void write_relcache_init_file(void);
 static void write_item(const void *data, Size len, FILE *fp);

 static void formrdesc(const char *relationName, Oid relationReltype,
 		  bool hasoids, int natts, const FormData_pg_attribute *att);

 static HeapTuple ScanPgRelation(Oid targetRelId, bool indexOK, Relation *pg_class_relation);
 static Relation AllocateRelationDesc(Form_pg_class relp);
 static void RelationParseRelOptions(Relation relation, HeapTuple tuple);
 static void RelationBuildTupleDesc(Relation relation);
 static Relation RelationBuildDesc(Oid targetRelId, bool insertIt);
 static void RelationInitPhysicalAddr(Relation relation);
 static void load_critical_index(Oid indexoid, Oid heapoid);
 static TupleDesc GetPgClassDescriptor(void);
 static TupleDesc GetPgIndexDescriptor(void);
 static void AttrDefaultFetch(Relation relation);
 static void CheckConstraintFetch(Relation relation);
 static List *insert_ordered_oid(List *list, Oid datum);
 static OpClassCacheEnt *LookupOpclassInfo(Oid operatorClassOid,
 				  StrategyNumber numStrats,
 				  StrategyNumber numSupport);


 /*
  *		ScanPgRelation
  *
  *		This is used by RelationBuildDesc to find a pg_class
  *		tuple matching targetRelId.  The caller must hold at least
  *		AccessShareLock on the target relid to prevent concurrent-update
  *		scenarios --- else our SnapshotNow scan might fail to find any
  *		version that it thinks is live.
  *
  *		NB: the returned tuple has been copied into palloc'd storage
  *		and must eventually be freed with heap_freetuple.
  */
 static HeapTuple
 ScanPgRelation(Oid targetRelId, bool indexOK, Relation *pg_class_relation)
 {
 	HeapTuple	pg_class_tuple;
 	Relation	pg_class_desc;
 	cqContext	cqc;

 	/*
 	 * If something goes wrong during backend startup, we might find ourselves
 	 * trying to read pg_class before we've selected a database.  That ain't
 	 * gonna work, so bail out with a useful error message.  If this happens,
 	 * it probably means a relcache entry that needs to be nailed isn't.
 	 */
 	if (!OidIsValid(MyDatabaseId))
 		elog(FATAL, "cannot read pg_class without having selected a database");

 	/*
 	 * form a scan key
 	 */

 	/*
 	 * Open pg_class and fetch a tuple.  Force heap scan if we haven't yet
 	 * built the critical relcache entries (this includes initdb and startup
 	 * without a pg_internal.init file).  The caller can also force a heap
 	 * scan by setting indexOK == false.
 	 */
 	pg_class_desc = heap_open(RelationRelationId, AccessShareLock);

 	pg_class_tuple = caql_getfirst(
 			caql_syscache(
 					caql_indexOK(caql_addrel(cqclr(&cqc), pg_class_desc),
 								 (indexOK && criticalRelcachesBuilt)),
 					false),
 			cql("SELECT * FROM pg_class "
 				" WHERE oid = :1 ",
 				ObjectIdGetDatum(targetRelId)));

 	/*
 	 * Must copy tuple before releasing buffer. -- already a copy
 	 */

 	/* all done */

 	if (pg_class_relation == NULL)
 		heap_close(pg_class_desc, AccessShareLock);
 	else
 		*pg_class_relation = pg_class_desc;

 	return pg_class_tuple;
 }

 void
 GpRelfileNodeBeginScan(
 	Relation 			gp_relfile_node,

 	Oid					relationId,

 	Oid 				relfilenode,

 	GpRelfileNodeScan 	*gpRelfileNodeScan)
 {
 	Assert (relfilenode != 0);

 	MemSet(gpRelfileNodeScan, 0, sizeof(GpRelfileNodeScan));

 	/*
 	 * form a scan key
 	 */
 	/* XXX XXX: break this out -- find callers - jic 2011/12/09 */
 	/* maybe it's ok - return a cql context ? */

 	/* XXX XXX XXX XXX XXX XXX XXX XXX XXX XXX XXX XXX XXX XXX */
 	/* no json defs for persistent tables ? */
 /*
 	cqxx("SELECT * FROM gp_relation_node_relfilenode "
 		 " WHERE oid = :1 ",
 		 ObjectIdGetDatum(relfilenode));
 */
 	/* XXX XXX XXX XXX XXX XXX XXX XXX XXX XXX XXX XXX XXX XXX */

 	ScanKeyInit(&gpRelfileNodeScan->scankey[0],
 				Anum_gp_relfile_node_relfilenode_oid,
 				BTEqualStrategyNumber, F_OIDEQ,
 				ObjectIdGetDatum(relfilenode));

 	/*
 	 * Open pg_class and fetch a tuple.  Force heap scan if we haven't yet
 	 * built the critical relcache entries (this includes initdb and startup
 	 * without a pg_internal.init file).  The caller can also force a heap
 	 * scan by setting indexOK == false.
 	 */
 	/*
 	 * in hawq, we change the schema of gp_relation_node,
 	 * disable index scan.
 	 */
 	gpRelfileNodeScan->scan = \
 		systable_beginscan(gp_relfile_node, InvalidOid,
 						   /* indexOK */ FALSE,
 						   SnapshotNow,
 						   /* nKeys */ 1,
 						   gpRelfileNodeScan->scankey);

 	gpRelfileNodeScan->gp_relfile_node = gp_relfile_node;
 	gpRelfileNodeScan->relationId = relationId;
 	gpRelfileNodeScan->relfilenode = relfilenode;
 }

 HeapTuple
 GpRelfileNodeGetNext(
 	GpRelfileNodeScan 	*gpRelfileNodeScan,

 	int32				*segmentFileNum,

 	ItemPointer			persistentTid,

 	int64				*persistentSerialNum)
 {
 	HeapTuple tuple;

 	bool			nulls[Natts_gp_relfile_node];
 	Datum			values[Natts_gp_relfile_node];

 	Oid actualRelationNode;

 	tuple = systable_getnext((SysScanDesc)gpRelfileNodeScan->scan);

 	/*
 	 * if no such tuple exists, return NULL
 	 */
 	if (!HeapTupleIsValid(tuple))
 	{
 		MemSet(persistentTid, 0, sizeof(ItemPointerData));
 		*persistentSerialNum = 0;
 		return tuple;
 	}

 	heap_deform_tuple(tuple, RelationGetDescr(gpRelfileNodeScan->gp_relfile_node), values, nulls);

 	GpRelfileNode_GetValues(
 						values,
 						&actualRelationNode,
 						segmentFileNum,
 						persistentTid,
 						persistentSerialNum);
 	if (actualRelationNode != gpRelfileNodeScan->relfilenode)
 		elog(FATAL, "Mismatch in node tuple for gp_relation_node for relation %u, relfilenode %u, relation node %u",
 			 gpRelfileNodeScan->relationId,
 			 gpRelfileNodeScan->relfilenode,
 			 actualRelationNode);

 	return tuple;
 }


 void
 GpRelfileNodeEndScan(
 	GpRelfileNodeScan 	*gpRelfileNodeScan)
 {
 	/* all done */
 	systable_endscan((SysScanDesc)gpRelfileNodeScan->scan);
 }


 HeapTuple
 ScanGpRelfileNodeTuple(
 	Relation 	gp_relfile_node,
 	Oid 		relfilenode,
 	int32		segmentFileNum)
 {
 	HeapTuple	tuple;
 	SysScanDesc scan;
 	ScanKeyData key[2];

 	Assert (relfilenode != 0);

 	/*
 	 * form a scan key
 	 */

 	/* XXX XXX XXX XXX XXX XXX XXX XXX XXX XXX XXX XXX XXX XXX */
 /*
 	cqxx("SELECT * FROM gp_relation_node "
 		 " WHERE relfilenode_oid = :1 "
 		 " AND segment_file_num = :2 ",
 		 ObjectIdGetDatum(relfilenode),
 		 Int32GetDatum(segmentFileNum));
 */
 	/* XXX XXX XXX XXX XXX XXX XXX XXX XXX XXX XXX XXX XXX XXX */

 	ScanKeyInit(&key[0],
 				Anum_gp_relfile_node_relfilenode_oid,
 				BTEqualStrategyNumber, F_OIDEQ,
 				ObjectIdGetDatum(relfilenode));
 	ScanKeyInit(&key[1],
 				Anum_gp_relfile_node_segment_file_num,
 				BTEqualStrategyNumber, F_INT4EQ,
 				Int32GetDatum(segmentFileNum));

 	/*
 	 * Open pg_class and fetch a tuple.  Force heap scan if we haven't yet
 	 * built the critical relcache entries (this includes initdb and startup
 	 * without a pg_internal.init file).  The caller can also force a heap
 	 * scan by setting indexOK == false.
 	 */
 	scan = systable_beginscan(gp_relfile_node, GpRelfileNodeOidIndexId,
 									   /* indexOK */ true,
 									   SnapshotNow,
 									   2, key);

 	tuple = systable_getnext(scan);

 	/*
 	 * Must copy tuple before releasing buffer.
 	 */
 	if (HeapTupleIsValid(tuple))
 		tuple = heap_copytuple(tuple);

 	/* all done */
 	systable_endscan(scan);

 	return tuple;
 }

 HeapTuple
 FetchGpRelfileNodeTuple(
 	Relation 		gp_relfile_node,
 	Oid 			relfilenode,
 	int32			segmentFileNum,
 	ItemPointer		persistentTid,
 	int64			*persistentSerialNum)
 {
 	HeapTuple tuple;

 	bool			nulls[Natts_gp_relfile_node];
 	Datum			values[Natts_gp_relfile_node];

 	Oid actualRelationNode;
 	int32 actualSegmentFileNum;

 	Assert (relfilenode != 0);

 	tuple = ScanGpRelfileNodeTuple(
 					gp_relfile_node,
 					relfilenode,
 					segmentFileNum);

 	/*
 	 * if no such tuple exists, return NULL
 	 */
 	if (!HeapTupleIsValid(tuple))
 	{
 		MemSet(persistentTid, 0, sizeof(ItemPointerData));
 		*persistentSerialNum = 0;
 		return tuple;
 	}

 	heap_deform_tuple(tuple, RelationGetDescr(gp_relfile_node), values, nulls);

 	GpRelfileNode_GetValues(
 						values,
 						&actualRelationNode,
 						&actualSegmentFileNum,
 						persistentTid,
 						persistentSerialNum);
 	Assert (actualRelationNode == relfilenode);

 	return tuple;
 }

 bool
 ReadGpRelfileNode(
 	Oid 			relfilenode,

 	int32			segmentFileNum,

 	ItemPointer		persistentTid,

 	int64			*persistentSerialNum)
 {
 	Relation gp_relfile_node;
 	HeapTuple tuple;
 	bool found;

 	MemSet(persistentTid, 0, sizeof(ItemPointerData));
 	*persistentSerialNum = 0;

 	gp_relfile_node = heap_open(GpRelfileNodeRelationId, AccessShareLock);

 	tuple = FetchGpRelfileNodeTuple(
 						gp_relfile_node,
 						relfilenode,
 						segmentFileNum,
 						persistentTid,
 						persistentSerialNum);

 	/*
 	 * if no such tuple exists, return NULL
 	 */
 	if (!HeapTupleIsValid(tuple))
 	{
 		found = false;
 	}
 	else
 	{
 		found = true;
 		heap_freetuple(tuple);
 	}

 	heap_close(gp_relfile_node, AccessShareLock);

 	return found;
 }

 void
 RelationFetchGpRelationNode(
 	Relation relation)
 {

 	if (!relation->rd_relationnodeinfo.isPresent)
 	{
 		if (Persistent_BeforePersistenceWork() || InRecovery)
 		{
 			MemSet(&relation->rd_relationnodeinfo.persistentTid, 0, sizeof(ItemPointerData));
 			relation->rd_relationnodeinfo.persistentSerialNum = 0;

 			relation->rd_relationnodeinfo.isPresent = true;
 			relation->rd_relationnodeinfo.tidAllowedToBeZero = true;

 			return; // The initdb process will load the persistent table once we out of bootstrap mode.
 		}

 		if (relation->rd_rel->relstorage != RELSTORAGE_AOROWS
 					&& relation->rd_rel->relstorage != RELSTORAGE_PARQUET)
 		{
 			if (!ReadGpRelfileNode(
 						relation->rd_node.relNode,
 						/* segmentFileNum */ 0,
 						&relation->rd_relationnodeinfo.persistentTid,
 						&relation->rd_relationnodeinfo.persistentSerialNum))
 			{
 				elog(ERROR, "Did not find gp_relfile_node entry for relation name %s, relation id %u, relfilenode %u",
 						relation->rd_rel->relname.data,
 						relation->rd_id,
 						relation->rd_node.relNode);
 			}
 		}
 		else
 		{
 			PersistentRelation_LookupTidAndSerialNum(
 											&relation->rd_node,
 											&relation->rd_relationnodeinfo.persistentTid,
 											&relation->rd_relationnodeinfo.persistentSerialNum);
 		}

 		Assert(!Persistent_BeforePersistenceWork());
 		if (Debug_check_for_invalid_persistent_tid &&
 			PersistentStore_IsZeroTid(&relation->rd_relationnodeinfo.persistentTid))
 		{
 			elog(ERROR,
 				 "RelationFetchGpRelationNode has invalid TID (0,0) into relation %u/%u/%u '%s', serial number " INT64_FORMAT,
 				 relation->rd_node.spcNode,
 				 relation->rd_node.dbNode,
 				 relation->rd_node.relNode,
 				 NameStr(relation->rd_rel->relname),
 				 relation->rd_relationnodeinfo.persistentSerialNum);
 		}

 		relation->rd_relationnodeinfo.isPresent = true;

 	}

 }

 // UNDONE: Temporary
 void
 RelationFetchGpRelationNodeForXLog_Index(
 	Relation relation)
 {
 	static int countInThisBackend = 0;
 	static int deep = 0;

 	deep++;

 	countInThisBackend++;

 	if (deep >= 2)
 	{
 		int saveDeep;

 		if (Debug_gp_relation_node_fetch_wait_for_debugging)
 		{
 			/* Code for investigating MPP-16395, will be removed as part of the fix */
 			elog(WARNING, "RelationFetchGpRelationNodeForXLog_Index [%d] for non-heap %u/%u/%u (deep %d) -- waiting for debug attach...",
 				 countInThisBackend,
 				 relation->rd_node.spcNode,
 				 relation->rd_node.dbNode,
 				 relation->rd_node.relNode,
 				 deep);

 			for (int i=0; i < 24 * 60; i++)
 			{
 				pg_usleep(60000000L); /* 60 sec */
 			}
 		}

 		/*
 		 * Reset counter in case the user continues to use the session.
 		 */
 		saveDeep = deep;
 		deep = 0;

 		elog(ERROR, "RelationFetchGpRelationNodeForXLog_Index [%d] for non-heap %u/%u/%u (deep %d)",
 			 countInThisBackend,
 			 relation->rd_node.spcNode,
 			 relation->rd_node.dbNode,
 			 relation->rd_node.relNode,
 			 saveDeep);
 	}

 	RelationFetchGpRelationNode(relation);

 	deep--;
 }

 /*
  *		AllocateRelationDesc
  *
  *		This is used to allocate memory for a new relation descriptor
  *		and initialize the rd_rel field from the given pg_class tuple.
  */
 static Relation
 AllocateRelationDesc(Form_pg_class relp)
 {
 	Relation	relation;
 	MemoryContext oldcxt;
 	Form_pg_class relationForm;

 	/* Relcache entries must live in CacheMemoryContext */
 	oldcxt = MemoryContextSwitchTo(CacheMemoryContext);

 	/*
 	 * allocate and zero space for new relation descriptor
 	 */
 	relation = (Relation) palloc0(sizeof(RelationData));

 	/*
 	 * clear fields of reldesc that should initialize to something non-zero
 	 */
 	relation->rd_targblock = InvalidBlockNumber;

 	/* make sure relation is marked as having no open file yet */
 	relation->rd_smgr = NULL;

 	/*
 	 * Copy the relation tuple form
 	 *
 	 * We only allocate space for the fixed fields, ie, CLASS_TUPLE_SIZE. The
 	 * variable-length fields (relacl, reloptions) are NOT stored in the
 	 * relcache --- there'd be little point in it, since we don't copy the
 	 * tuple's nulls bitmap and hence wouldn't know if the values are valid.
 	 * Bottom line is that relacl *cannot* be retrieved from the relcache. Get
 	 * it from the syscache if you need it.  The same goes for the original
 	 * form of reloptions (however, we do store the parsed form of reloptions
 	 * in rd_options).
 	 */
 	relationForm = (Form_pg_class) palloc(CLASS_TUPLE_SIZE);

 	memcpy(relationForm, relp, CLASS_TUPLE_SIZE);

 	/* initialize relation tuple form */
 	relation->rd_rel = relationForm;

 	/*
 	 * This part MUST be remain as a fetch on demand, otherwise you end up
 	 * needing it to open pg_class and then relation_open does infinite recursion...
 	 */
 	relation->rd_relationnodeinfo.isPresent = false;
 	relation->rd_relationnodeinfo.tidAllowedToBeZero = false;

 	/* and allocate attribute tuple form storage */
 	relation->rd_att = CreateTemplateTupleDesc(relationForm->relnatts,
 											   relationForm->relhasoids);
 	/* which we mark as a reference-counted tupdesc */
 	relation->rd_att->tdrefcount = 1;

 	MemoryContextSwitchTo(oldcxt);

 	return relation;
 }

 /*
  * RelationParseRelOptions
  *		Convert pg_class.reloptions into pre-parsed rd_options
  *
  * tuple is the real pg_class tuple (not rd_rel!) for relation
  *
  * Note: rd_rel and (if an index) rd_am must be valid already
  */
 static void
 RelationParseRelOptions(Relation relation, HeapTuple tuple)
 {
 	Datum		datum;
 	bool		isnull;
 	bytea	   *options;

 	relation->rd_options = NULL;

 	/* Fall out if relkind should not have options */
 	switch (relation->rd_rel->relkind)
 	{
 		case RELKIND_RELATION:
 		case RELKIND_TOASTVALUE:
 		case RELKIND_AOSEGMENTS:
 		case RELKIND_AOBLOCKDIR:
 		case RELKIND_INDEX:
 			break;
 		default:
 			return;
 	}

 	/*
 	 * Fetch reloptions from tuple; have to use a hardwired descriptor because
 	 * we might not have any other for pg_class yet (consider executing this
 	 * code for pg_class itself)
 	 */
 	datum = fastgetattr(tuple,
 						Anum_pg_class_reloptions,
 						GetPgClassDescriptor(),
 						&isnull);
 	if (isnull)
 		return;

 	/* Parse into appropriate format; don't error out here */
 	switch (relation->rd_rel->relkind)
 	{
 		case RELKIND_RELATION:
 		case RELKIND_TOASTVALUE:
 		case RELKIND_AOSEGMENTS:
 		case RELKIND_AOBLOCKDIR:
 		case RELKIND_UNCATALOGED:
 			options = heap_reloptions(relation->rd_rel->relkind, datum,
 									  false);
 			break;
 		case RELKIND_INDEX:
 			options = index_reloptions(relation->rd_am->amoptions, datum,
 									   false);
 			break;
 		default:
 			Assert(false);		/* can't get here */
 			options = NULL;		/* keep compiler quiet */
 			break;
 	}

 	/*
 	 * Copy parsed data into CacheMemoryContext.  To guard against the
 	 * possibility of leaks in the reloptions code, we want to do the actual
 	 * parsing in the caller's memory context and copy the results into
 	 * CacheMemoryContext after the fact.
 	 */
 	if (options)
 	{
 		relation->rd_options = MemoryContextAlloc(CacheMemoryContext,
 												  VARSIZE(options));
 		memcpy(relation->rd_options, options, VARSIZE(options));
 		pfree(options);
 	}
 }

 /*
  *		RelationBuildTupleDesc
  *
  *		Form the relation's tuple descriptor from information in
  *		the pg_attribute, pg_attrdef & pg_constraint system catalogs.
  */
 static void
 RelationBuildTupleDesc(Relation relation)
 {
 	HeapTuple	pg_attribute_tuple;
 	Relation	pg_attribute_desc;
 	cqContext	cqc;
 	cqContext  *pcqCtx;
 	int			need;
 	TupleConstr *constr;
 	AttrDefault *attrdef = NULL;
 	int			ndef = 0;

 	/* copy some fields from pg_class row to rd_att */
 	relation->rd_att->tdtypeid = relation->rd_rel->reltype;
 	relation->rd_att->tdtypmod = -1;	/* unnecessary, but... */
 	relation->rd_att->tdhasoid = relation->rd_rel->relhasoids;

 	constr = (TupleConstr *) MemoryContextAlloc(CacheMemoryContext,
 												sizeof(TupleConstr));
 	constr->has_not_null = false;

 	/*
 	 * Form a scan key that selects only user attributes (attnum > 0).
 	 * (Eliminating system attribute rows at the index level is lots faster
 	 * than fetching them.)
 	 */

 	/*
 	 * Open pg_attribute and begin a scan.	Force heap scan if we haven't yet
 	 * built the critical relcache entries (this includes initdb and startup
 	 * without a pg_internal.init file).
 	 */
 	pg_attribute_desc = heap_open(AttributeRelationId, AccessShareLock);

 	pcqCtx = caql_beginscan(
 			caql_syscache(
 					caql_indexOK(caql_addrel(cqclr(&cqc), pg_attribute_desc),
 								 criticalRelcachesBuilt),
 					false),
 			cql("SELECT * FROM pg_attribute "
 				" WHERE attrelid = :1 "
 				" AND attnum > :2 ",
 				ObjectIdGetDatum(RelationGetRelid(relation)),
 				Int16GetDatum(0)));

 	/*
 	 * add attribute data to relation->rd_att
 	 */
 	need = relation->rd_rel->relnatts;

 	while (HeapTupleIsValid(pg_attribute_tuple = caql_getnext(pcqCtx)))
 	{
 		Form_pg_attribute attp;

 		attp = (Form_pg_attribute) GETSTRUCT(pg_attribute_tuple);

 		if (attp->attnum <= 0 ||
 			attp->attnum > relation->rd_rel->relnatts)
 			elog(ERROR, "invalid attribute number %d for %s",
 				 attp->attnum, RelationGetRelationName(relation));

 		memcpy(relation->rd_att->attrs[attp->attnum - 1],
 			   attp,
 			   ATTRIBUTE_FIXED_PART_SIZE);

 		/* Update constraint/default info */
 		if (attp->attnotnull)
 			constr->has_not_null = true;

 		if (attp->atthasdef)
 		{
 			if (attrdef == NULL)
 				attrdef = (AttrDefault *)
 					MemoryContextAllocZero(CacheMemoryContext,
 										   relation->rd_rel->relnatts *
 										   sizeof(AttrDefault));
 			attrdef[ndef].adnum = attp->attnum;
 			attrdef[ndef].adbin = NULL;
 			ndef++;
 		}
 		need--;
 		if (need == 0)
 			break;
 	}

 	/*
 	 * end the scan and close the attribute relation
 	 */
 	caql_endscan(pcqCtx);
 	heap_close(pg_attribute_desc, AccessShareLock);

 	if (need != 0)
 		elog(ERROR, "catalog is missing %d attribute(s) for relid %u",
 			 need, RelationGetRelid(relation));

 	/*
 	 * The attcacheoff values we read from pg_attribute should all be -1
 	 * ("unknown").  Verify this if assert checking is on.	They will be
 	 * computed when and if needed during tuple access.
 	 */
 #ifdef USE_ASSERT_CHECKING
 	{
 		int			i;

 		for (i = 0; i < relation->rd_rel->relnatts; i++)
 			Assert(relation->rd_att->attrs[i]->attcacheoff == -1);
 	}
 #endif

 	/*
 	 * However, we can easily set the attcacheoff value for the first
 	 * attribute: it must be zero.	This eliminates the need for special cases
 	 * for attnum=1 that used to exist in fastgetattr() and index_getattr().
 	 */
 	if (relation->rd_rel->relnatts > 0)
 		relation->rd_att->attrs[0]->attcacheoff = 0;

 	/*
 	 * Set up constraint/default info
 	 */
 	if (constr->has_not_null || ndef > 0 || relation->rd_rel->relchecks)
 	{
 		relation->rd_att->constr = constr;

 		if (ndef > 0)			/* DEFAULTs */
 		{
 			if (ndef < relation->rd_rel->relnatts)
 				constr->defval = (AttrDefault *)
 					repalloc(attrdef, ndef * sizeof(AttrDefault));
 			else
 				constr->defval = attrdef;
 			constr->num_defval = ndef;
 			AttrDefaultFetch(relation);
 		}
 		else
 			constr->num_defval = 0;

 		if (relation->rd_rel->relchecks > 0)	/* CHECKs */
 		{
 			constr->num_check = relation->rd_rel->relchecks;
 			constr->check = (ConstrCheck *)
 				MemoryContextAllocZero(CacheMemoryContext,
 									constr->num_check * sizeof(ConstrCheck));
 			CheckConstraintFetch(relation);
 		}
 		else
 			constr->num_check = 0;
 	}
 	else
 	{
 		pfree(constr);
 		relation->rd_att->constr = NULL;
 	}
 }

 /*
  *		RelationBuildRuleLock
  *
  *		Form the relation's rewrite rules from information in
  *		the pg_rewrite system catalog.
  *
  * Note: The rule parsetrees are potentially very complex node structures.
  * To allow these trees to be freed when the relcache entry is flushed,
  * we make a private memory context to hold the RuleLock information for
  * each relcache entry that has associated rules.  The context is used
  * just for rule info, not for any other subsidiary data of the relcache
  * entry, because that keeps the update logic in RelationClearRelation()
  * manageable.	The other subsidiary data structures are simple enough
  * to be easy to free explicitly, anyway.
  */
 static void
 RelationBuildRuleLock(Relation relation)
 {
 	MemoryContext rulescxt;
 	MemoryContext oldcxt;
 	HeapTuple	rewrite_tuple;
 	Relation	rewrite_desc;
 	TupleDesc	rewrite_tupdesc;
 	cqContext	cqc;
 	cqContext  *pcqCtx;
 	RuleLock   *rulelock;
 	int			numlocks;
 	RewriteRule **rules;
 	int			maxlocks;

 	/*
 	 * Make the private context.  Parameters are set on the assumption that
 	 * it'll probably not contain much data.
 	 */
 	rulescxt = AllocSetContextCreate(CacheMemoryContext,
 									 RelationGetRelationName(relation),
 									 ALLOCSET_SMALL_MINSIZE,
 									 ALLOCSET_SMALL_INITSIZE,
 									 ALLOCSET_SMALL_MAXSIZE);
 	relation->rd_rulescxt = rulescxt;

 	/*
 	 * allocate an array to hold the rewrite rules (the array is extended if
 	 * necessary)
 	 */
 	maxlocks = 4;
 	rules = (RewriteRule **)
 		MemoryContextAlloc(rulescxt, sizeof(RewriteRule *) * maxlocks);
 	numlocks = 0;

 	/*
 	 * open pg_rewrite and begin a scan
 	 *
 	 * Note: since we scan the rules using RewriteRelRulenameIndexId, we will
 	 * be reading the rules in name order, except possibly during
 	 * emergency-recovery operations (ie, IgnoreSystemIndexes). This in turn
 	 * ensures that rules will be fired in name order.
 	 */
 	rewrite_desc = heap_open(RewriteRelationId, AccessShareLock);
 	rewrite_tupdesc = RelationGetDescr(rewrite_desc);

 	pcqCtx = caql_beginscan(
 			caql_syscache(
 					caql_indexOK(caql_addrel(cqclr(&cqc), rewrite_desc),
 								 true),
 					false),
 			cql("SELECT * FROM pg_rewrite "
 				" WHERE ev_class = :1 ",
 				ObjectIdGetDatum(RelationGetRelid(relation))));

 	while (HeapTupleIsValid(rewrite_tuple = caql_getnext(pcqCtx)))
 	{
 		Form_pg_rewrite rewrite_form = (Form_pg_rewrite) GETSTRUCT(rewrite_tuple);
 		bool		isnull;
 		Datum		rule_datum;
 		char	   *rule_str;
 		RewriteRule *rule;

 		rule = (RewriteRule *) MemoryContextAlloc(rulescxt,
 												  sizeof(RewriteRule));

 		rule->ruleId = HeapTupleGetOid(rewrite_tuple);

 		rule->event = rewrite_form->ev_type - '0';
 		rule->attrno = rewrite_form->ev_attr;
 		rule->isInstead = rewrite_form->is_instead;

 		/*
 		 * Must use heap_getattr to fetch ev_action and ev_qual.  Also, the
 		 * rule strings are often large enough to be toasted.  To avoid
 		 * leaking memory in the caller's context, do the detoasting here so
 		 * we can free the detoasted version.
 		 */
 		rule_datum = heap_getattr(rewrite_tuple,
 								  Anum_pg_rewrite_ev_action,
 								  rewrite_tupdesc,
 								  &isnull);
 		Assert(!isnull);
 		rule_str = TextDatumGetCString(rule_datum);
 		oldcxt = MemoryContextSwitchTo(rulescxt);
 		rule->actions = (List *) stringToNode(rule_str);
 		MemoryContextSwitchTo(oldcxt);
 		pfree(rule_str);

 		rule_datum = heap_getattr(rewrite_tuple,
 								  Anum_pg_rewrite_ev_qual,
 								  rewrite_tupdesc,
 								  &isnull);
 		Assert(!isnull);
 		rule_str = TextDatumGetCString(rule_datum);
 		oldcxt = MemoryContextSwitchTo(rulescxt);
 		rule->qual = (Node *) stringToNode(rule_str);
 		MemoryContextSwitchTo(oldcxt);
 		pfree(rule_str);

 		/*
 		 * We want the rule's table references to be checked as though by the
 		 * table owner, not the user referencing the rule.	Therefore, scan
 		 * through the rule's actions and set the checkAsUser field on all
 		 * rtable entries.	We have to look at the qual as well, in case it
 		 * contains sublinks.
 		 *
 		 * The reason for doing this when the rule is loaded, rather than when
 		 * it is stored, is that otherwise ALTER TABLE OWNER would have to
 		 * grovel through stored rules to update checkAsUser fields. Scanning
 		 * the rule tree during load is relatively cheap (compared to
 		 * constructing it in the first place), so we do it here.
 		 */
 		setRuleCheckAsUser((Node *) rule->actions, relation->rd_rel->relowner);
 		setRuleCheckAsUser(rule->qual, relation->rd_rel->relowner);

 		if (numlocks >= maxlocks)
 		{
 			maxlocks *= 2;
 			rules = (RewriteRule **)
 				repalloc(rules, sizeof(RewriteRule *) * maxlocks);
 		}
 		rules[numlocks++] = rule;
 	}

 	/*
 	 * end the scan and close the attribute relation
 	 */
 	caql_endscan(pcqCtx);
 	heap_close(rewrite_desc, AccessShareLock);

 	/*
 	 * form a RuleLock and insert into relation
 	 */
 	rulelock = (RuleLock *) MemoryContextAlloc(rulescxt, sizeof(RuleLock));
 	rulelock->numLocks = numlocks;
 	rulelock->rules = rules;

 	relation->rd_rules = rulelock;
 }

 /*
  *		equalRuleLocks
  *
  *		Determine whether two RuleLocks are equivalent
  *
  *		Probably this should be in the rules code someplace...
  */
 static bool
 equalRuleLocks(RuleLock *rlock1, RuleLock *rlock2)
 {
 	int			i;

 	/*
 	 * As of 7.3 we assume the rule ordering is repeatable, because
 	 * RelationBuildRuleLock should read 'em in a consistent order.  So just
 	 * compare corresponding slots.
 	 */
 	if (rlock1 != NULL)
 	{
 		if (rlock2 == NULL)
 			return false;
 		if (rlock1->numLocks != rlock2->numLocks)
 			return false;
 		for (i = 0; i < rlock1->numLocks; i++)
 		{
 			RewriteRule *rule1 = rlock1->rules[i];
 			RewriteRule *rule2 = rlock2->rules[i];

 			if (rule1->ruleId != rule2->ruleId)
 				return false;
 			if (rule1->event != rule2->event)
 				return false;
 			if (rule1->attrno != rule2->attrno)
 				return false;
 			if (rule1->isInstead != rule2->isInstead)
 				return false;
 			if (!equal(rule1->qual, rule2->qual))
 				return false;
 			if (!equal(rule1->actions, rule2->actions))
 				return false;
 		}
 	}
 	else if (rlock2 != NULL)
 		return false;
 	return true;
 }

 /*
  *		RelationBuildDesc
  *
  *		Build a relation descriptor.  The caller must hold at least
  *		AccessShareLock on the target relid.
  *
  *		The new descriptor is inserted into the hash table if insertIt is true.
  *
  *		Returns NULL if no pg_class row could be found for the given relid
  *		(suggesting we are trying to access a just-deleted relation).
  *		Any other error is reported via elog.
  */
 static Relation
 RelationBuildDesc(Oid targetRelId, bool insertIt)
 {
 	Relation	relation;
 	Oid			relid;
 	Relation    pg_class_relation;
 	HeapTuple	pg_class_tuple;
 	Form_pg_class relp;

 	/*
 	 * find the tuple in pg_class corresponding to the given relation id
 	 */
 	pg_class_tuple = ScanPgRelation(targetRelId, true, &pg_class_relation);

 	/*
 	 * if no such tuple exists, return NULL
 	 */
 	if (!HeapTupleIsValid(pg_class_tuple)){
 		if(RelationIsValid(pg_class_relation))
 			heap_close(pg_class_relation, AccessShareLock);
 		return NULL;
 	}
 	/*
 	 * get information from the pg_class_tuple
 	 */
 	relid = HeapTupleGetOid(pg_class_tuple);
 	relp = (Form_pg_class) GETSTRUCT(pg_class_tuple);
 	heap_close(pg_class_relation, AccessShareLock);

 	/*
 	 * allocate storage for the relation descriptor, and copy pg_class_tuple
 	 * to relation->rd_rel and new fields into relation->rd_newfields.
 	 */
 	relation = AllocateRelationDesc(relp);

 	/*
 	 * initialize the relation's relation id (relation->rd_id)
 	 */
 	RelationGetRelid(relation) = relid;

 	/*
 	 * normal relations are not nailed into the cache; nor can a pre-existing
 	 * relation be new.  It could be temp though.  (Actually, it could be new
 	 * too, but it's okay to forget that fact if forced to flush the entry.)
 	 */
 	relation->rd_refcnt = 0;
 	relation->rd_isnailed = false;
 	relation->rd_createSubid = InvalidSubTransactionId;
 	relation->rd_istemp = isTempNamespace(relation->rd_rel->relnamespace);
 	relation->rd_issyscat = (strncmp(relation->rd_rel->relname.data, "pg_", 3) == 0);

     /*
      * CDB: On QEs, temp relations must use shared buffer cache so data
      * will be visible to all segmates.  On QD, sequence objects must
      * use shared buffer cache so data will be visible to sequence server.
      */
     if (relation->rd_istemp &&
         relation->rd_rel->relkind != RELKIND_SEQUENCE &&
         Gp_role != GP_ROLE_EXECUTE)
         relation->rd_isLocalBuf = true;
     else
         relation->rd_isLocalBuf = false;

 	/*
 	 * initialize the tuple descriptor (relation->rd_att).
 	 */
 	RelationBuildTupleDesc(relation);

 	/*
 	 * Fetch rules and triggers that affect this relation
 	 */
 	if (relation->rd_rel->relhasrules)
 		RelationBuildRuleLock(relation);
 	else
 	{
 		relation->rd_rules = NULL;
 		relation->rd_rulescxt = NULL;
 	}

 	if (relation->rd_rel->reltriggers > 0)
 		RelationBuildTriggers(relation);
 	else
 		relation->trigdesc = NULL;

 	/*
 	 * if it's an index, initialize index-related information
 	 */
 	if (OidIsValid(relation->rd_rel->relam))
 		RelationInitIndexAccessInfo(relation);

 	/* extract reloptions if any */
 	RelationParseRelOptions(relation, pg_class_tuple);

 	/*
 	 * initialize the relation lock manager information
 	 */
 	RelationInitLockInfo(relation);		/* see lmgr.c */

 	/*
 	 * initialize physical addressing information for the relation
 	 */
 	RelationInitPhysicalAddr(relation);

 	/* make sure relation is marked as having no open file yet */
 	relation->rd_smgr = NULL;

     /*
      * initialize Greenplum Database partitioning info
      */
     if (relation->rd_rel->relkind == RELKIND_RELATION &&
         !IsSystemRelation(relation))
         relation->rd_cdbpolicy = GpPolicyFetch(CacheMemoryContext, targetRelId);

     relation->rd_cdbDefaultStatsWarningIssued = false;

 	/*
 	 * now we can free the memory allocated for pg_class_tuple
 	 */
 	heap_freetuple(pg_class_tuple);

 	/*
 	 * Insert newly created relation into relcache hash table, if requested.
 	 */
 	if (insertIt)
 		RelationCacheInsert(relation);

 	/* It's fully valid */
 	relation->rd_isvalid = true;

 	return relation;
 }

 /*
  * Initialize the physical addressing info (RelFileNode) for a relcache entry
  */
 static void
 RelationInitPhysicalAddr(Relation relation)
 {
 	if (relation->rd_rel->reltablespace)
 		relation->rd_node.spcNode = relation->rd_rel->reltablespace;
 	else if (relstorage_is_ao(relation->rd_rel->relstorage))
 	{
 		relation->rd_node.spcNode = Gp_role != GP_ROLE_EXECUTE ? get_database_dts(MyDatabaseId) : MyProcPort->dbdtsoid;
 	}
 	else
 		relation->rd_node.spcNode = MyDatabaseTableSpace;

 	if (relation->rd_rel->relisshared)
 		relation->rd_node.dbNode = InvalidOid;
 	else if (relation->rd_id < FirstNormalObjectId ||
 			(AmActiveMaster() && Gp_role != GP_ROLE_EXECUTE) ||
 			AmStandbyMaster())
 		relation->rd_node.dbNode = MyDatabaseId;
 	else
 		relation->rd_node.dbNode = MyProcPort->dboid;
 	relation->rd_node.relNode = relation->rd_rel->relfilenode;
 }

 /*
  * Initialize index-access-method support data for an index relation
  */
 void
 RelationInitIndexAccessInfo(Relation relation)
 {
 	HeapTuple	tuple;
 	Form_pg_am	aform;
 	Datum		indclassDatum;
 	bool		isnull;
 	MemoryContext indexcxt;
 	MemoryContext oldcontext;
 	Oid		   *operator;
 	RegProcedure *support;
 	FmgrInfo   *supportinfo;
 	int			natts;
 	uint16		amstrategies;
 	uint16		amsupport;

 	/*
 	 * Make a copy of the pg_index entry for the index.  Since pg_index
 	 * contains variable-length and possibly-null fields, we have to do this
 	 * honestly rather than just treating it as a Form_pg_index struct.
 	 */
 	tuple = SearchSysCache(INDEXRELID,
 						   ObjectIdGetDatum(RelationGetRelid(relation)),
 						   0, 0, 0);
 	if (!HeapTupleIsValid(tuple))
 		elog(ERROR, "cache lookup failed for index %u",
 			 RelationGetRelid(relation));
 	oldcontext = MemoryContextSwitchTo(CacheMemoryContext);
 	relation->rd_indextuple = heap_copytuple(tuple);
 	relation->rd_index = (Form_pg_index) GETSTRUCT(relation->rd_indextuple);
 	MemoryContextSwitchTo(oldcontext);
 	ReleaseSysCache(tuple);

 	/*
 	 * indclass cannot be referenced directly through the C struct, because it
 	 * is after the variable-width indkey field.  Therefore we extract the
 	 * datum the hard way and provide a direct link in the relcache.
 	 */
 	indclassDatum = fastgetattr(relation->rd_indextuple,
 								Anum_pg_index_indclass,
 								GetPgIndexDescriptor(),
 								&isnull);
 	Assert(!isnull);
 	relation->rd_indclass = (oidvector *) DatumGetPointer(indclassDatum);

 	/*
 	 * Make a copy of the pg_am entry for the index's access method
 	 */
 	tuple = SearchSysCache(AMOID,
 						   ObjectIdGetDatum(relation->rd_rel->relam),
 						   0, 0, 0);
 	if (!HeapTupleIsValid(tuple))
 		elog(ERROR, "cache lookup failed for access method %u",
 			 relation->rd_rel->relam);
 	aform = (Form_pg_am) MemoryContextAlloc(CacheMemoryContext, sizeof *aform);
 	memcpy(aform, GETSTRUCT(tuple), sizeof *aform);
 	ReleaseSysCache(tuple);
 	relation->rd_am = aform;

 	natts = relation->rd_rel->relnatts;
 	if (natts != relation->rd_index->indnatts)
 		elog(ERROR, "relnatts disagrees with indnatts for index %u",
 			 RelationGetRelid(relation));
 	amstrategies = aform->amstrategies;
 	amsupport = aform->amsupport;

 	/*
 	 * Make the private context to hold index access info.	The reason we need
 	 * a context, and not just a couple of pallocs, is so that we won't leak
 	 * any subsidiary info attached to fmgr lookup records.
 	 *
 	 * Context parameters are set on the assumption that it'll probably not
 	 * contain much data.
 	 */
 	indexcxt = AllocSetContextCreate(CacheMemoryContext,
 									 RelationGetRelationName(relation),
 									 ALLOCSET_SMALL_MINSIZE,
 									 ALLOCSET_SMALL_INITSIZE,
 									 ALLOCSET_SMALL_MAXSIZE);
 	relation->rd_indexcxt = indexcxt;

 	/*
 	 * Allocate arrays to hold data
 	 */
 	relation->rd_aminfo = (RelationAmInfo *)
 		MemoryContextAllocZero(indexcxt, sizeof(RelationAmInfo));

 	if (amstrategies > 0)
 		operator = (Oid *)
 			MemoryContextAllocZero(indexcxt,
 								   natts * amstrategies * sizeof(Oid));
 	else
 		operator = NULL;

 	if (amsupport > 0)
 	{
 		int			nsupport = natts * amsupport;

 		support = (RegProcedure *)
 			MemoryContextAllocZero(indexcxt, nsupport * sizeof(RegProcedure));
 		supportinfo = (FmgrInfo *)
 			MemoryContextAllocZero(indexcxt, nsupport * sizeof(FmgrInfo));
 	}
 	else
 	{
 		support = NULL;
 		supportinfo = NULL;
 	}

 	relation->rd_operator = operator;
 	relation->rd_support = support;
 	relation->rd_supportinfo = supportinfo;

 	/*
 	 * Fill the operator and support procedure OID arrays.	(aminfo and
 	 * supportinfo are left as zeroes, and are filled on-the-fly when used)
 	 */
 	IndexSupportInitialize(relation->rd_indclass,
 						   operator, support,
 						   amstrategies, amsupport, natts);

 	/*
 	 * expressions and predicate cache will be filled later
 	 */
 	relation->rd_indexprs = NIL;
 	relation->rd_indpred = NIL;
 	relation->rd_amcache = NULL;
 }

 /*
  * IndexSupportInitialize
  *		Initializes an index's cached opclass information,
  *		given the index's pg_index.indclass entry.
  *
  * Data is returned into *indexOperator and *indexSupport, which are arrays
  * allocated by the caller.
  *
  * The caller also passes maxStrategyNumber, maxSupportNumber, and
  * maxAttributeNumber, since these indicate the size of the arrays
  * it has allocated --- but in practice these numbers must always match
  * those obtainable from the system catalog entries for the index and
  * access method.
  */
 void
 IndexSupportInitialize(oidvector *indclass,
 					   Oid *indexOperator,
 					   RegProcedure *indexSupport,
 					   StrategyNumber maxStrategyNumber,
 					   StrategyNumber maxSupportNumber,
 					   AttrNumber maxAttributeNumber)
 {
 	int			attIndex;

 	for (attIndex = 0; attIndex < maxAttributeNumber; attIndex++)
 	{
 		OpClassCacheEnt *opcentry;

 		if (!OidIsValid(indclass->values[attIndex]))
 			elog(ERROR, "bogus pg_index tuple");

 		/* look up the info for this opclass, using a cache */
 		opcentry = LookupOpclassInfo(indclass->values[attIndex],
 									 maxStrategyNumber,
 									 maxSupportNumber);

 		/* copy cached data into relcache entry */
 		if (maxStrategyNumber > 0)
 			memcpy(&indexOperator[attIndex * maxStrategyNumber],
 				   opcentry->operatorOids,
 				   maxStrategyNumber * sizeof(Oid));
 		if (maxSupportNumber > 0)
 			memcpy(&indexSupport[attIndex * maxSupportNumber],
 				   opcentry->supportProcs,
 				   maxSupportNumber * sizeof(RegProcedure));
 	}
 }

 /*
  * LookupOpclassInfo
  *
  * This routine maintains a per-opclass cache of the information needed
  * by IndexSupportInitialize().  This is more efficient than relying on
  * the catalog cache, because we can load all the info about a particular
  * opclass in a single indexscan of pg_amproc or pg_amop.
  *
  * The information from pg_am about expected range of strategy and support
  * numbers is passed in, rather than being looked up, mainly because the
  * caller will have it already.
  *
  * Note there is no provision for flushing the cache.  This is OK at the
  * moment because there is no way to ALTER any interesting properties of an
  * existing opclass --- all you can do is drop it, which will result in
  * a useless but harmless dead entry in the cache.	To support altering
  * opclass membership (not the same as opfamily membership!), we'd need to
  * be able to flush this cache as well as the contents of relcache entries
  * for indexes.
  */
 static OpClassCacheEnt *
 LookupOpclassInfo(Oid operatorClassOid,
 				  StrategyNumber numStrats,
 				  StrategyNumber numSupport)
 {
 	OpClassCacheEnt *opcentry;
 	bool		found;
 	Relation	rel;
 	cqContext	cqc;
 	cqContext  *pcqCtx;
 	HeapTuple	htup;
 	bool		indexOK;

 	if (OpClassCache == NULL)
 	{
 		/* First time through: initialize the opclass cache */
 		HASHCTL		ctl;

 		if (!CacheMemoryContext)
 			CreateCacheMemoryContext();

 		MemSet(&ctl, 0, sizeof(ctl));
 		ctl.keysize = sizeof(Oid);
 		ctl.entrysize = sizeof(OpClassCacheEnt);
 		ctl.hash = oid_hash;
 		OpClassCache = hash_create("Operator class cache", 64,
 								   &ctl, HASH_ELEM | HASH_FUNCTION);
 	}

 	opcentry = (OpClassCacheEnt *) hash_search(OpClassCache,
 											   (void *) &operatorClassOid,
 											   HASH_ENTER, &found);

 	if (found && opcentry->valid)
 	{
 		/* Already made an entry for it */
 		Assert(numStrats == opcentry->numStrats);
 		Assert(numSupport == opcentry->numSupport);
 		return opcentry;
 	}

 	/* Need to fill in new entry */
 	opcentry->valid = false;	/* until known OK */
 	opcentry->numStrats = numStrats;
 	opcentry->numSupport = numSupport;

 	if (numStrats > 0)
 		opcentry->operatorOids = (Oid *)
 			MemoryContextAllocZero(CacheMemoryContext,
 								   numStrats * sizeof(Oid));
 	else
 		opcentry->operatorOids = NULL;

 	if (numSupport > 0)
 		opcentry->supportProcs = (RegProcedure *)
 			MemoryContextAllocZero(CacheMemoryContext,
 								   numSupport * sizeof(RegProcedure));
 	else
 		opcentry->supportProcs = NULL;

 	/*
 	 * To avoid infinite recursion during startup, force heap scans if we're
 	 * looking up info for the opclasses used by the indexes we would like to
 	 * reference here.
 	 */
 	indexOK = criticalRelcachesBuilt ||
 		(operatorClassOid != OID_BTREE_OPS_OID &&
 		 operatorClassOid != INT2_BTREE_OPS_OID);

 	/*
 	 * Scan pg_amop to obtain operators for the opclass.  We only fetch the
 	 * default ones (those with subtype zero).
 	 */
 	if (numStrats > 0)
 	{
 		rel = heap_open(AccessMethodOperatorRelationId, AccessShareLock);

 		pcqCtx = caql_beginscan(
 				caql_syscache(
 						caql_indexOK(caql_addrel(cqclr(&cqc), rel),
 									 indexOK),
 						false),
 				cql("SELECT * FROM pg_amop "
 					" WHERE amopclaid = :1 "
 					" AND amopsubtype = :2 ",
 					ObjectIdGetDatum(operatorClassOid),
 					ObjectIdGetDatum(InvalidOid)));

 		while (HeapTupleIsValid(htup = caql_getnext(pcqCtx)))
 		{
 			Form_pg_amop amopform = (Form_pg_amop) GETSTRUCT(htup);

 			if (amopform->amopstrategy <= 0 ||
 				(StrategyNumber) amopform->amopstrategy > numStrats)
 				elog(ERROR, "invalid amopstrategy number %d for opclass %u",
 					 amopform->amopstrategy, operatorClassOid);
 			opcentry->operatorOids[amopform->amopstrategy - 1] =
 				amopform->amopopr;
 		}

 		caql_endscan(pcqCtx);
 		heap_close(rel, AccessShareLock);
 	}

 	/*
 	 * Scan pg_amproc to obtain support procs for the opclass.	We only fetch
 	 * the default ones (those with lefttype = righttype = opcintype).
 	 */
 	if (numSupport > 0)
 	{
 		rel = heap_open(AccessMethodProcedureRelationId, AccessShareLock);

 		pcqCtx = caql_beginscan(
 				caql_syscache(
 						caql_indexOK(caql_addrel(cqclr(&cqc), rel),
 									 indexOK),
 						false),
 				cql("SELECT * FROM pg_amproc "
 					" WHERE amopclaid = :1 "
 					" AND amprocsubtype = :2 ",
 					ObjectIdGetDatum(operatorClassOid),
 					ObjectIdGetDatum(InvalidOid)));

 		while (HeapTupleIsValid(htup = caql_getnext(pcqCtx)))
 		{
 			Form_pg_amproc amprocform = (Form_pg_amproc) GETSTRUCT(htup);

 			if (amprocform->amprocnum <= 0 ||
 				(StrategyNumber) amprocform->amprocnum > numSupport)
 				elog(ERROR, "invalid amproc number %d for opclass %u",
 					 amprocform->amprocnum, operatorClassOid);

 			opcentry->supportProcs[amprocform->amprocnum - 1] =
 				amprocform->amproc;
 		}

 		caql_endscan(pcqCtx);
 		heap_close(rel, AccessShareLock);
 	}

 	opcentry->valid = true;
 	return opcentry;
 }


 /*
  *		formrdesc
  *
  *		This is a special cut-down version of RelationBuildDesc()
  *		used by RelationCacheInitializePhase2() in initializing the relcache.
  *		The relation descriptor is built just from the supplied parameters,
  *		without actually looking at any system table entries.  We cheat
  *		quite a lot since we only need to work for a few basic system
  *		catalogs.
  *
  * formrdesc is currently used for: pg_class, pg_attribute, pg_proc,
  * and pg_type (see RelationCacheInitializePhase2).
  *
  * Note that these catalogs can't have constraints (except attnotnull),
  * default values, rules, or triggers, since we don't cope with any of that.
  *
  * NOTE: we assume we are already switched into CacheMemoryContext.
  */
 static void
 formrdesc(const char *relationName, Oid relationReltype,
 		  bool hasoids, int natts, const FormData_pg_attribute *att)
 {
 	Relation	relation;
 	int			i;
 	bool		has_not_null;

 	/*
 	 * allocate new relation desc, clear all fields of reldesc
 	 */
 	relation = (Relation) palloc0(sizeof(RelationData));
 	relation->rd_targblock = InvalidBlockNumber;

 	/* make sure relation is marked as having no open file yet */
 	relation->rd_smgr = NULL;

 	/*
 	 * initialize reference count: 1 because it is nailed in cache
 	 */
 	relation->rd_refcnt = 1;

 	/*
 	 * all entries built with this routine are nailed-in-cache; none are for
 	 * new or temp relations.
 	 */
 	relation->rd_isnailed = true;
 	relation->rd_createSubid = InvalidSubTransactionId;
 	relation->rd_istemp = false;
 	relation->rd_issyscat = (strncmp(relationName, "pg_", 3) == 0);	/* GP */
     relation->rd_isLocalBuf = false;    /*CDB*/

 	/*
 	 * initialize relation tuple form
 	 *
 	 * The data we insert here is pretty incomplete/bogus, but it'll serve to
 	 * get us launched.  RelationCacheInitializePhase2() will read the real
 	 * data from pg_class and replace what we've done here.
 	 */
 	relation->rd_rel = (Form_pg_class) palloc0(CLASS_TUPLE_SIZE);

 	namestrcpy(&relation->rd_rel->relname, relationName);
 	relation->rd_rel->relnamespace = PG_CATALOG_NAMESPACE;
 	relation->rd_rel->reltype = relationReltype;

 	/*
 	 * It's important to distinguish between shared and non-shared relations,
 	 * even at bootstrap time, to make sure we know where they are stored.	At
 	 * present, all relations that formrdesc is used for are not shared.
 	 */
 	relation->rd_rel->relisshared = false;

 	relation->rd_rel->relpages = 1;
 	relation->rd_rel->reltuples = 1;
 	relation->rd_rel->relkind = RELKIND_RELATION;
 	relation->rd_rel->relstorage = RELSTORAGE_HEAP;
 	relation->rd_rel->relhasoids = hasoids;
 	relation->rd_rel->relnatts = (int16) natts;

 	/*
 	 * Physical file-system information.
 	 */
 	relation->rd_relationnodeinfo.isPresent = false;
 	relation->rd_relationnodeinfo.tidAllowedToBeZero = false;

 	/*
 	 * initialize attribute tuple form
 	 *
 	 * Unlike the case with the relation tuple, this data had better be right
 	 * because it will never be replaced.  The input values must be correctly
 	 * defined by macros in src/include/catalog/ headers.
 	 */
 	relation->rd_att = CreateTemplateTupleDesc(natts, hasoids);
 	relation->rd_att->tdrefcount = 1;	/* mark as refcounted */

 	relation->rd_att->tdtypeid = relationReltype;
 	relation->rd_att->tdtypmod = -1;	/* unnecessary, but... */

 	/*
 	 * initialize tuple desc info
 	 */
 	has_not_null = false;
 	for (i = 0; i < natts; i++)
 	{
 		memcpy(relation->rd_att->attrs[i],
 			   &att[i],
 			   ATTRIBUTE_FIXED_PART_SIZE);
 		has_not_null |= att[i].attnotnull;
 		/* make sure attcacheoff is valid */
 		relation->rd_att->attrs[i]->attcacheoff = -1;
 	}

 	/* initialize first attribute's attcacheoff, cf RelationBuildTupleDesc */
 	relation->rd_att->attrs[0]->attcacheoff = 0;

 	/* mark not-null status */
 	if (has_not_null)
 	{
 		TupleConstr *constr = (TupleConstr *) palloc0(sizeof(TupleConstr));

 		constr->has_not_null = true;
 		relation->rd_att->constr = constr;
 	}

 	/*
 	 * initialize relation id from info in att array (my, this is ugly)
 	 */
 	RelationGetRelid(relation) = relation->rd_att->attrs[0]->attrelid;
 	relation->rd_rel->relfilenode = RelationGetRelid(relation);

 	/*
 	 * initialize the relation lock manager information
 	 */
 	RelationInitLockInfo(relation);		/* see lmgr.c */

 	/*
 	 * initialize physical addressing information for the relation
 	 */
 	RelationInitPhysicalAddr(relation);

 	/*
 	 * initialize the rel-has-index flag, using hardwired knowledge
 	 */
 	if (IsBootstrapProcessingMode())
 	{
 		/* In bootstrap mode, we have no indexes */
 		relation->rd_rel->relhasindex = false;
 	}
 	else
 	{
 		/* Otherwise, all the rels formrdesc is used for have indexes */
 		relation->rd_rel->relhasindex = true;
 	}

 	/*
 	 * add new reldesc to relcache
 	 */
 	RelationCacheInsert(relation);

 	/* It's fully valid */
 	relation->rd_isvalid = true;
 }


 /* ----------------------------------------------------------------
  *				 Relation Descriptor Lookup Interface
  * ----------------------------------------------------------------
  */

 /*
  *		RelationIdGetRelation
  *
  *		Lookup a reldesc by OID; make one if not already in cache.
  *
  *		Returns NULL if no pg_class row could be found for the given relid
  *		(suggesting we are trying to access a just-deleted relation).
  *		Any other error is reported via elog.
  *
  *		NB: caller should already have at least AccessShareLock on the
  *		relation ID, else there are nasty race conditions.
  *
  *		NB: relation ref count is incremented, or set to 1 if new entry.
  *		Caller should eventually decrement count.  (Usually,
  *		that happens by calling RelationClose().)
  */
 Relation
 RelationIdGetRelation(Oid relationId)
 {
 	Relation	rd;

 	/*
 	 * first try to find reldesc in the cache
 	 */
 	RelationIdCacheLookup(relationId, rd);

 	if (RelationIsValid(rd))
 	{
 		RelationIncrementReferenceCount(rd);
 		/* revalidate cache entry if necessary */
 		if (!rd->rd_isvalid)
 		{
 			/*
 			 * Indexes only have a limited number of possible schema changes,
 			 * and we don't want to use the full-blown procedure because it's
 			 * a headache for indexes that reload itself depends on.
 			 */
 			if (rd->rd_rel->relkind == RELKIND_INDEX)
 				RelationReloadClassinfo(rd);
 			else
 				RelationClearRelation(rd, true);
 		 }

 		return rd;
 	}

 	/*
 	 * no reldesc in the cache, so have RelationBuildDesc() build one and add
 	 * it. Do not add relation to relcache if it is an external object that
 	 * comes from HCatalog
 	 */
 	rd = RelationBuildDesc(relationId, relationId < FirstExternalObjectId);
 	if (RelationIsValid(rd))
 		RelationIncrementReferenceCount(rd);

 	return rd;
 }

 /* ----------------------------------------------------------------
  *				cache invalidation support routines
  * ----------------------------------------------------------------
  */

 /*
  * RelationIncrementReferenceCount
  *		Increments relation reference count.
  *
  * Note: bootstrap mode has its own weird ideas about relation refcount
  * behavior; we ought to fix it someday, but for now, just disable
  * reference count ownership tracking in bootstrap mode.
  */
 void
 RelationIncrementReferenceCount(Relation rel)
 {
 	ResourceOwnerEnlargeRelationRefs(CurrentResourceOwner);
 	rel->rd_refcnt += 1;
 	if (!IsBootstrapProcessingMode())
 		ResourceOwnerRememberRelationRef(CurrentResourceOwner, rel);
 }

 /*
  * RelationDecrementReferenceCount
  *		Decrements relation reference count.
  */
 void
 RelationDecrementReferenceCount(Relation rel)
 {
 	if (rel->rd_refcnt <= 0)
 	{
 		elog(ERROR,
 			 "Relation decrement reference count found relation %u/%u/%u with bad count (reference count %d)",
 			 rel->rd_node.spcNode,
 			 rel->rd_node.dbNode,
 			 rel->rd_node.relNode,
 			 rel->rd_refcnt);
 	}

 	rel->rd_refcnt -= 1;
 	if (!IsBootstrapProcessingMode())
 		ResourceOwnerForgetRelationRef(CurrentResourceOwner, rel);
 }

 /*
  * RelationClose - close an open relation
  *
  *	Actually, we just decrement the refcount.
  *
  *	NOTE: if compiled with -DRELCACHE_FORCE_RELEASE then relcache entries
  *	will be freed as soon as their refcount goes to zero.  In combination
  *	with aset.c's CLOBBER_FREED_MEMORY option, this provides a good test
  *	to catch references to already-released relcache entries.  It slows
  *	things down quite a bit, however.
  */
 void
 RelationClose(Relation relation)
 {
 	/* Note: no locking manipulations needed */
 	RelationDecrementReferenceCount(relation);

 #ifdef RELCACHE_FORCE_RELEASE
 	if (RelationHasReferenceCountZero(relation) &&
 		relation->rd_createSubid == InvalidSubTransactionId)
 		RelationClearRelation(relation, false);
 #endif
 }

 /*
  * RelationReloadClassinfo - reload the pg_class row (only)
  *
  *	This function is used only for indexes.  We currently allow only the
  *	pg_class row of an existing index to change (to support changes of
  *	owner, tablespace, or relfilenode), not its pg_index row or other
  *	subsidiary index schema information.  Therefore it's sufficient to do
  *	this when we get an SI invalidation.  Furthermore, there are cases
  *	where it's necessary not to throw away the index information, especially
  *	for "nailed" indexes which we are unable to rebuild on-the-fly.
  *
  *	We can't necessarily reread the pg_class row right away; we might be
  *	in a failed transaction when we receive the SI notification.  If so,
  *	RelationClearRelation just marks the entry as invalid by setting
  *	rd_isvalid to false.  This routine is called to fix the entry when it
  *	is next needed.
  *
  *	We assume that at the time we are called, we have at least AccessShareLock
  *	on the target index.  (Note: in the calls from RelationClearRelation,
  *	this is legitimate because we know the rel has positive refcount.)
  */
 static void
 RelationReloadClassinfo(Relation relation)
 {
 	bool		indexOK;
 	HeapTuple	pg_class_tuple;
 	Form_pg_class relp;

 	/* Should be called only for invalidated indexes */
 	Assert(relation->rd_rel->relkind == RELKIND_INDEX &&
 		   !relation->rd_isvalid);
 	/* Should be closed at smgr level */
 	Assert(relation->rd_smgr == NULL);

 	/*
 	 * Read the pg_class row
 	 *
 	 * Don't try to use an indexscan of pg_class_oid_index to reload the info
 	 * for pg_class_oid_index ...
 	 */
 	indexOK = (RelationGetRelid(relation) != ClassOidIndexId);
 	pg_class_tuple = ScanPgRelation(RelationGetRelid(relation), indexOK, NULL);
 	if (!HeapTupleIsValid(pg_class_tuple))
 		elog(ERROR, "could not find pg_class tuple for index %u",
 			 RelationGetRelid(relation));
 	relp = (Form_pg_class) GETSTRUCT(pg_class_tuple);
 	memcpy(relation->rd_rel, relp, CLASS_TUPLE_SIZE);
 	/* Reload reloptions in case they changed */
 	if (relation->rd_options)
 		pfree(relation->rd_options);
 	RelationParseRelOptions(relation, pg_class_tuple);
 	/* done with pg_class tuple */
 	heap_freetuple(pg_class_tuple);
 	/* We must recalculate physical address in case it changed */
 	RelationInitPhysicalAddr(relation);
 	/* Make sure targblock is reset in case rel was truncated */
 	relation->rd_targblock = InvalidBlockNumber;
 	/* Must free any AM cached data, too */
 	if (relation->rd_amcache)
 		pfree(relation->rd_amcache);
 	relation->rd_amcache = NULL;

 	/* Forget gp_relation_node information -- it may have changed. */
 	MemSet(&relation->rd_relationnodeinfo, 0, sizeof(RelationNodeInfo));

 	/* Okay, now it's valid again */
 	relation->rd_isvalid = true;
 }

 /*
  * RelationDestroyRelation
  *
  *	Physically delete a relation cache entry and all subsidiary data.
  *	Caller must already have unhooked the entry from the hash table.
  */
 static void
 RelationDestroyRelation(Relation relation)
 {
 	Assert(RelationHasReferenceCountZero(relation));

 	/*
 	 * Make sure smgr and lower levels close the relation's files, if they
 	 * weren't closed already.  (This was probably done by caller, but let's
 	 * just be real sure.)
 	 */
 	RelationCloseSmgr(relation);

 	/*
 	 * Free all the subsidiary data structures of the relcache entry,
 	 * then the entry itself.
 	 */
 	if (relation->rd_rel)
 		pfree(relation->rd_rel);
 	/* can't use DecrTupleDescRefCount here */
 	Assert(relation->rd_att->tdrefcount > 0);
 	if (--relation->rd_att->tdrefcount == 0)
 		FreeTupleDesc(relation->rd_att);
 	list_free(relation->rd_indexlist);
 	FreeTriggerDesc(relation->trigdesc);
 	if (relation->rd_options)
 		pfree(relation->rd_options);
 	if (relation->rd_indextuple)
 		pfree(relation->rd_indextuple);
 	if (relation->rd_am)
 		pfree(relation->rd_am);
 	if (relation->rd_indexcxt)
 		MemoryContextDelete(relation->rd_indexcxt);
 	if (relation->rd_rulescxt)
 		MemoryContextDelete(relation->rd_rulescxt);
 	if (relation->rd_cdbpolicy)
 		pfree(relation->rd_cdbpolicy);

 	pfree(relation);
 }

 /*
  * RelationClearRelation
  *
  *	 Physically blow away a relation cache entry, or reset it and rebuild
  *	 it from scratch (that is, from catalog entries).  The latter path is
  *	 usually used when we are notified of a change to an open relation
  *	 (one with refcount > 0).  However, this routine just does whichever
  *	 it's told to do; callers must determine which they want.
  *
  *	 NB: when rebuilding, we'd better hold some lock on the relation.
  *	 In current usages this is presumed true because it has refcnt > 0.
  */
 static void
 RelationClearRelation(Relation relation, bool rebuild)
 {
 	Oid			old_reltype = relation->rd_rel->reltype;

 	/*
 	 * Make sure smgr and lower levels close the relation's files, if they
 	 * weren't closed already.  If the relation is not getting deleted, the
 	 * next smgr access should reopen the files automatically.	This ensures
 	 * that the low-level file access state is updated after, say, a vacuum
 	 * truncation.
 	 */
 	RelationCloseSmgr(relation);

 	/*
 	 * Never, never ever blow away a nailed-in system relation, because we'd
 	 * be unable to recover.  However, we must reset rd_targblock, in case we
 	 * got called because of a relation cache flush that was triggered by
 	 * VACUUM.  Likewise reset the fsm and vm size info.
 	 *
 	 * If it's a nailed index, then we need to re-read the pg_class row to see
 	 * if its relfilenode changed.	We can't necessarily do that here, because
 	 * we might be in a failed transaction.  We assume it's okay to do it if
 	 * there are open references to the relcache entry (cf notes for
 	 * AtEOXact_RelationCache).  Otherwise just mark the entry as possibly
 	 * invalid, and it'll be fixed when next opened.
 	 */
 	if (relation->rd_isnailed)
 	{
 		relation->rd_targblock = InvalidBlockNumber;
 		if (relation->rd_rel->relkind == RELKIND_INDEX)
 		{
 			relation->rd_isvalid = false;		/* needs to be revalidated */
 			if (relation->rd_refcnt > 1)
 				RelationReloadClassinfo(relation);
 		}
 		return;
 	}

 	/*
 	 * Even non-system indexes should not be blown away if they are open and
 	 * have valid index support information.  This avoids problems with active
 	 * use of the index support information.  As with nailed indexes, we
 	 * re-read the pg_class row to handle possible physical relocation of the
 	 * index.
 	 */
 	if (relation->rd_rel->relkind == RELKIND_INDEX &&
 		relation->rd_refcnt > 0 &&
 		relation->rd_indexcxt != NULL)
 	{
 		relation->rd_isvalid = false;	/* needs to be revalidated */
 		RelationReloadClassinfo(relation);
 		return;
 	}

 	/* Mark it invalid until we've finished rebuild */
 	relation->rd_isvalid = false;

 	/*
 	 * If we're really done with the relcache entry, blow it away. But if
 	 * someone is still using it, reconstruct the whole deal without moving
 	 * the physical RelationData record (so that the someone's pointer is
 	 * still valid).
 	 */
 	if (!rebuild)
 	{
 		/* Flush any rowtype cache entry */
 		flush_rowtype_cache(old_reltype);

 		/* Remove it from the hash table */
 		RelationCacheDelete(relation);

 		/* And release storage */
 		RelationDestroyRelation(relation);
 	}
 	else
 	{
 		/*
 		 * Our strategy for rebuilding an open relcache entry is to build
 		 * a new entry from scratch, swap its contents with the old entry,
 		 * and finally delete the new entry (along with any infrastructure
 		 * swapped over from the old entry).  This is to avoid trouble in case
 		 * an error causes us to lose control partway through.  The old entry
 		 * will still be marked !rd_isvalid, so we'll try to rebuild it again
 		 * on next access.  Meanwhile it's not any less valid than it was
 		 * before, so any code that might expect to continue accessing it
 		 * isn't hurt by the rebuild failure.  (Consider for example a
 		 * subtransaction that ALTERs a table and then gets cancelled partway
 		 * through the cache entry rebuild.  The outer transaction should
 		 * still see the not-modified cache entry as valid.)  The worst
 		 * consequence of an error is leaking the necessarily-unreferenced
 		 * new entry, and this shouldn't happen often enough for that to be
 		 * a big problem.
 		 *
 		 * When rebuilding an open relcache entry, we must preserve ref count
 		 * and rd_createSubid/rd_newRelfilenodeSubid state.  Also attempt to
 		 * preserve the pg_class entry (rd_rel), tupledesc, and rewrite-rule
 		 * substructures in place, because various places assume that these
 		 * structures won't move while they are working with an open relcache
 		 * entry.  (Note: the refcount mechanism for tupledescs might someday
 		 * allow us to remove this hack for the tupledesc.)
  		 *
  		 * Note that this process does not touch CurrentResourceOwner; which
  		 * is good because whatever ref counts the entry may have do not
  		 * necessarily belong to that resource owner.
  		 */
 		Relation	newrel;
  		Oid			save_relid = RelationGetRelid(relation);
 		bool		keep_tupdesc;
 		bool		keep_rules;

 		/* Build temporary entry, but don't link it into hashtable */
 		newrel = RelationBuildDesc(save_relid, false);
 		if (newrel == NULL)
  		{
  			/* Should only get here if relation was deleted */
  			flush_rowtype_cache(old_reltype);
 			RelationCacheDelete(relation);
 			RelationDestroyRelation(relation);
  			elog(ERROR, "relation %u deleted while still in use", save_relid);
  		}

 		keep_tupdesc = equalTupleDescs(relation->rd_att, newrel->rd_att, true);
 		keep_rules = equalRuleLocks(relation->rd_rules, newrel->rd_rules);
 		if (!keep_tupdesc)
  			flush_rowtype_cache(old_reltype);

 		/*
 		 * Perform swapping of the relcache entry contents.  Within this
 		 * process the old entry is momentarily invalid, so there *must*
 		 * be no possibility of CHECK_FOR_INTERRUPTS within this sequence.
 		 * Do it in all-in-line code for safety.
 		 *
 		 * Since the vast majority of fields should be swapped, our method
 		 * is to swap the whole structures and then re-swap those few fields
 		 * we didn't want swapped.
 		 */
 #define SWAPFIELD(fldtype, fldname) \
 		do { \
 			fldtype _tmp = newrel->fldname; \
 			newrel->fldname = relation->fldname; \
 			relation->fldname = _tmp; \
 		} while (0)

 		/* swap all Relation struct fields */
  		{
 			RelationData tmpstruct;

 			memcpy(&tmpstruct, newrel, sizeof(RelationData));
 			memcpy(newrel, relation, sizeof(RelationData));
 			memcpy(relation, &tmpstruct, sizeof(RelationData));
  		}

 		/* rd_smgr must not be swapped, due to back-links from smgr level */
 		SWAPFIELD(SMgrRelation, rd_smgr);
 		/* rd_refcnt must be preserved */
 		SWAPFIELD(int, rd_refcnt);
 		/* isnailed shouldn't change */
 		Assert(newrel->rd_isnailed == relation->rd_isnailed);
 		/* creation sub-XIDs must be preserved */
 		SWAPFIELD(SubTransactionId, rd_createSubid);
 		/* un-swap rd_rel pointers, swap contents instead */
 		SWAPFIELD(Form_pg_class, rd_rel);
 		/* ... but actually, we don't have to update newrel->rd_rel */
 		memcpy(relation->rd_rel, newrel->rd_rel, CLASS_TUPLE_SIZE);
 		/* preserve old tupledesc and rules if no logical change */
 		if (keep_tupdesc)
 			SWAPFIELD(TupleDesc, rd_att);
 		if (keep_rules)
  		{
 			SWAPFIELD(RuleLock *, rd_rules);
 			SWAPFIELD(MemoryContext, rd_rulescxt);
  		}
 		/* pgstat_info must be preserved */
 		SWAPFIELD(struct PgStat_TableStatus *, pgstat_info);

 		/* preserve rd_cdbpolicy, as there are probably pointers to it */
 		SWAPFIELD(struct GpPolicy *, rd_cdbpolicy);

 		SWAPFIELD(struct RelationNodeInfo, rd_relationnodeinfo);

 #undef SWAPFIELD

 		/* And now we can throw away the temporary entry */
 		RelationDestroyRelation(newrel);
 	}
 }

 /*
  * RelationFlushRelation
  *
  *	 Rebuild the relation if it is open (refcount > 0), else blow it away.
  */
 static void
 RelationFlushRelation(Relation relation)
 {
 	bool		rebuild;

 	if (relation->rd_createSubid != InvalidSubTransactionId)
 	{
 		/*
 		 * New relcache entries are always rebuilt, not flushed; else we'd
 		 * forget the "new" status of the relation, which is a useful
 		 * optimization to have.
 		 */
 		rebuild = true;
 	}
 	else
 	{
 		/*
 		 * Pre-existing rels can be dropped from the relcache if not open.
 		 */
 		rebuild = !RelationHasReferenceCountZero(relation);
 	}

 	RelationClearRelation(relation, rebuild);
 }

 /*
  * RelationForgetRelation - unconditionally remove a relcache entry
  *
  *		   External interface for destroying a relcache entry when we
  *		   drop the relation.
  */
 void
 RelationForgetRelation(Oid rid)
 {
 	Relation	relation;

 	RelationIdCacheLookup(rid, relation);

 	if (!PointerIsValid(relation))
 		return;					/* not in cache, nothing to do */

 	if (!RelationHasReferenceCountZero(relation))
 		elog(ERROR, "relation %u is still open", rid);

 	/* Unconditionally destroy the relcache entry */
 	RelationClearRelation(relation, false);
 }

 /*
  *		RelationCacheInvalidateEntry
  *
  *		This routine is invoked for SI cache flush messages.
  *
  * Any relcache entry matching the relid must be flushed.  (Note: caller has
  * already determined that the relid belongs to our database or is a shared
  * relation.)
  *
  * We used to skip local relations, on the grounds that they could
  * not be targets of cross-backend SI update messages; but it seems
  * safer to process them, so that our *own* SI update messages will
  * have the same effects during CommandCounterIncrement for both
  * local and nonlocal relations.
  */
 void
 RelationCacheInvalidateEntry(Oid relationId)
 {
 	Relation	relation;

 	RelationIdCacheLookup(relationId, relation);

 	if (PointerIsValid(relation))
 	{
 		relcacheInvalsReceived++;
 		RelationFlushRelation(relation);
 	}
 }

 /*
  * RelationCacheInvalidate
  *	 Blow away cached relation descriptors that have zero reference counts,
  *	 and rebuild those with positive reference counts.	Also reset the smgr
  *	 relation cache.
  *
  *	 This is currently used only to recover from SI message buffer overflow,
  *	 so we do not touch new-in-transaction relations; they cannot be targets
  *	 of cross-backend SI updates (and our own updates now go through a
  *	 separate linked list that isn't limited by the SI message buffer size).
  *
  *	 We do this in two phases: the first pass deletes deletable items, and
  *	 the second one rebuilds the rebuildable items.  This is essential for
  *	 safety, because hash_seq_search only copes with concurrent deletion of
  *	 the element it is currently visiting.	If a second SI overflow were to
  *	 occur while we are walking the table, resulting in recursive entry to
  *	 this routine, we could crash because the inner invocation blows away
  *	 the entry next to be visited by the outer scan.  But this way is OK,
  *	 because (a) during the first pass we won't process any more SI messages,
  *	 so hash_seq_search will complete safely; (b) during the second pass we
  *	 only hold onto pointers to nondeletable entries.
  *
  *	 The two-phase approach also makes it easy to ensure that we process
  *	 nailed-in-cache indexes before other nondeletable items, and that we
  *	 process pg_class_oid_index first of all.  In scenarios where a nailed
  *	 index has been given a new relfilenode, we have to detect that update
  *	 before the nailed index is used in reloading any other relcache entry.
  */
 void
 RelationCacheInvalidate(void)
 {
 	HASH_SEQ_STATUS status;
 	RelIdCacheEnt *idhentry;
 	Relation	relation;
 	List	   *rebuildFirstList = NIL;
 	List	   *rebuildList = NIL;
 	ListCell   *l;

 	/* Phase 1 */
 	hash_seq_init(&status, RelationIdCache);

 	while ((idhentry = (RelIdCacheEnt *) hash_seq_search(&status)) != NULL)
 	{
 		relation = idhentry->reldesc;

 		/* Must close all smgr references to avoid leaving dangling ptrs */
 		RelationCloseSmgr(relation);

 		/* Ignore new relations, since they are never SI targets */
 		if (relation->rd_createSubid != InvalidSubTransactionId)
 			continue;

 		relcacheInvalsReceived++;

 		if (RelationHasReferenceCountZero(relation))
 		{
 			/* Delete this entry immediately */
 			Assert(!relation->rd_isnailed);
 			RelationClearRelation(relation, false);
 		}
 		else
 		{
 			/*
 			 * Add this entry to list of stuff to rebuild in second pass.
 			 * pg_class_oid_index goes on the front of rebuildFirstList, other
 			 * nailed indexes on the back, and everything else into
 			 * rebuildList (in no particular order).
 			 */
 			if (relation->rd_isnailed &&
 				relation->rd_rel->relkind == RELKIND_INDEX)
 			{
 				if (RelationGetRelid(relation) == ClassOidIndexId)
 					rebuildFirstList = lcons(relation, rebuildFirstList);
 				else
 					rebuildFirstList = lappend(rebuildFirstList, relation);
 			}
 			else
 				rebuildList = lcons(relation, rebuildList);
 		}
 	}

 	/*
 	 * Now zap any remaining smgr cache entries.  This must happen before we
 	 * start to rebuild entries, since that may involve catalog fetches which
 	 * will re-open catalog files.
 	 */
 	smgrcloseall();

 	/* Phase 2: rebuild the items found to need rebuild in phase 1 */
 	foreach(l, rebuildFirstList)
 	{
 		relation = (Relation) lfirst(l);
 		RelationClearRelation(relation, true);
 	}
 	list_free(rebuildFirstList);
 	foreach(l, rebuildList)
 	{
 		relation = (Relation) lfirst(l);
 		RelationClearRelation(relation, true);
 	}
 	list_free(rebuildList);
 }

 /*
  * AtEOXact_RelationCache
  *
  *	Clean up the relcache at main-transaction commit or abort.
  *
  * Note: this must be called *before* processing invalidation messages.
  * In the case of abort, we don't want to try to rebuild any invalidated
  * cache entries (since we can't safely do database accesses).  Therefore
  * we must reset refcnts before handling pending invalidations.
  *
  * As of PostgreSQL 8.1, relcache refcnts should get released by the
  * ResourceOwner mechanism.  This routine just does a debugging
  * cross-check that no pins remain.  However, we also need to do special
  * cleanup when the current transaction created any relations or made use
  * of forced index lists.
  */
 void
 AtEOXact_RelationCache(bool isCommit)
 {
 	HASH_SEQ_STATUS status;
 	RelIdCacheEnt *idhentry;

 	/*
 	 * To speed up transaction exit, we want to avoid scanning the relcache
 	 * unless there is actually something for this routine to do.  Other than
 	 * the debug-only Assert checks, most transactions don't create any work
 	 * for us to do here, so we keep a static flag that gets set if there is
 	 * anything to do.	(Currently, this means either a relation is created in
 	 * the current xact, or an index list is forced.)  For simplicity, the
 	 * flag remains set till end of top-level transaction, even though we
 	 * could clear it at subtransaction end in some cases.
 	 *
 	 * MPP-3333: READERS need to *always* scan, otherwise they will not be able
 	 * to maintain a coherent view of the storage layer.
 	 */

 	/*
 	 * in hawq, QE should always cleanup the relcache.
 	 */
 	if (Gp_role != GP_ROLE_EXECUTE && !need_eoxact_work
 #ifdef USE_ASSERT_CHECKING
 		&& !assert_enabled
 #endif
 		)
 		return;

 	hash_seq_init(&status, RelationIdCache);

 	while ((idhentry = (RelIdCacheEnt *) hash_seq_search(&status)) != NULL)
 	{
 		Relation	relation = idhentry->reldesc;

 		/*
 		 * The relcache entry's ref count should be back to its normal
 		 * not-in-a-transaction state: 0 unless it's nailed in cache.
 		 *
 		 * In bootstrap mode, this is NOT true, so don't check it --- the
 		 * bootstrap code expects relations to stay open across start/commit
 		 * transaction calls.  (That seems bogus, but it's not worth fixing.)
 		 */
 #ifdef USE_ASSERT_CHECKING
 		if (!IsBootstrapProcessingMode())
 		{
 			int			expected_refcnt;

 			expected_refcnt = relation->rd_isnailed ? 1 : 0;
 			Assert(relation->rd_refcnt == expected_refcnt);
 		}
 #endif

 		/*
 		 * Is it a relation created in the current transaction?
 		 *
 		 * During commit, reset the flag to zero, since we are now out of the
 		 * creating transaction.  During abort, simply delete the relcache
 		 * entry --- it isn't interesting any longer.  (NOTE: if we have
 		 * forgotten the new-ness of a new relation due to a forced cache
 		 * flush, the entry will get deleted anyway by shared-cache-inval
 		 * processing of the aborted pg_class insertion.)
 		 */
 		if (relation->rd_createSubid != InvalidSubTransactionId)
 		{
 			if (isCommit)
 				relation->rd_createSubid = InvalidSubTransactionId;
 			else
 			{
 				RelationClearRelation(relation, false);
 				continue;
 			}
 		}

 		/*
 		 * Flush any temporary index list.
 		 */
 		if (relation->rd_indexvalid == 2)
 		{
 			list_free(relation->rd_indexlist);
 			relation->rd_indexlist = NIL;
 			relation->rd_oidindex = InvalidOid;
 			relation->rd_indexvalid = 0;
 		}
 	}

 	/* Once done with the transaction, we can reset need_eoxact_work */
 	need_eoxact_work = false;
 }

 /*
  * AtEOSubXact_RelationCache
  *
  *	Clean up the relcache at sub-transaction commit or abort.
  *
  * Note: this must be called *before* processing invalidation messages.
  */
 void
 AtEOSubXact_RelationCache(bool isCommit, SubTransactionId mySubid,
 						  SubTransactionId parentSubid)
 {
 	HASH_SEQ_STATUS status;
 	RelIdCacheEnt *idhentry;

 	/*
 	 * Skip the relcache scan if nothing to do --- see notes for
 	 * AtEOXact_RelationCache.
 	 */
 	if (!need_eoxact_work)
 		return;

 	hash_seq_init(&status, RelationIdCache);

 	while ((idhentry = (RelIdCacheEnt *) hash_seq_search(&status)) != NULL)
 	{
 		Relation	relation = idhentry->reldesc;

 		/*
 		 * Is it a relation created in the current subtransaction?
 		 *
 		 * During subcommit, mark it as belonging to the parent, instead.
 		 * During subabort, simply delete the relcache entry.
 		 */
 		if (relation->rd_createSubid == mySubid)
 		{
 			if (isCommit)
 				relation->rd_createSubid = parentSubid;
 			else
 			{
 				Assert(RelationHasReferenceCountZero(relation));
 				RelationClearRelation(relation, false);
 				continue;
 			}
 		}

 		/*
 		 * Flush any temporary index list.
 		 */
 		if (relation->rd_indexvalid == 2)
 		{
 			list_free(relation->rd_indexlist);
 			relation->rd_indexlist = NIL;
 			relation->rd_oidindex = InvalidOid;
 			relation->rd_indexvalid = 0;
 		}
 	}
 }

 /*
  *		RelationBuildLocalRelation
  *			Build a relcache entry for an about-to-be-created relation,
  *			and enter it into the relcache.
  */
 Relation
 RelationBuildLocalRelation(const char *relname,
 						   Oid relnamespace,
 						   TupleDesc tupDesc,
 						   Oid relid,
 						   Oid reltablespace,
 			               char relkind,            /*CDB*/
 			               char relstorage,
 						   bool shared_relation)
 {
 	Relation	rel;
 	MemoryContext oldcxt;
 	int			natts = tupDesc->natts;
 	int			i;
 	bool		has_not_null;
 	bool		nailit;

 	AssertArg(natts >= 0);

 	/*
 	 * check for creation of a rel that must be nailed in cache.
 	 *
 	 * XXX this list had better match RelationCacheInitializePhase2's list.
 	 */
 	switch (relid)
 	{
 		case RelationRelationId:
 		case AttributeRelationId:
 		case ProcedureRelationId:
 		case TypeRelationId:
 			nailit = true;
 			break;
 		default:
 			nailit = false;
 			break;
 	}

 	/*
 	 * check that hardwired list of shared rels matches what's in the
 	 * bootstrap .bki file.  If you get a failure here during initdb, you
 	 * probably need to fix IsSharedRelation() to match whatever you've done
 	 * to the set of shared relations.
 	 */
 	if (shared_relation != IsSharedRelation(relid))
 		elog(ERROR, "shared_relation flag for \"%s\" does not match IsSharedRelation(%u)",
 			 relname, relid);

 	/*
 	 * switch to the cache context to create the relcache entry.
 	 */
 	if (!CacheMemoryContext)
 		CreateCacheMemoryContext();

 	oldcxt = MemoryContextSwitchTo(CacheMemoryContext);

 	/*
 	 * allocate a new relation descriptor and fill in basic state fields.
 	 */
 	rel = (Relation) palloc0(sizeof(RelationData));

 	rel->rd_targblock = InvalidBlockNumber;

 	/* make sure relation is marked as having no open file yet */
 	rel->rd_smgr = NULL;

 	/* mark it nailed if appropriate */
 	rel->rd_isnailed = nailit;

 	rel->rd_refcnt = nailit ? 1 : 0;

 	/* it's being created in this transaction */
 	rel->rd_createSubid = GetCurrentSubTransactionId();

 	/* must flag that we have rels created in this transaction */
 	need_eoxact_work = true;

 	/* is it a temporary relation? */
 	rel->rd_istemp = isTempNamespace(relnamespace);

 	/* is it a system catalog? */
 	rel->rd_issyscat = (strncmp(relname, "pg_", 3) == 0);

     /*
      * CDB: On QEs, temp relations must use shared buffer cache so data
      * will be visible to all segmates.  On QD, sequence objects must
      * use shared buffer cache so data will be visible to sequence server.
      */
     if (rel->rd_istemp &&
         relkind != RELKIND_SEQUENCE &&
         Gp_role != GP_ROLE_EXECUTE)
         rel->rd_isLocalBuf = true;
     else
         rel->rd_isLocalBuf = false;

 	/*
 	 * create a new tuple descriptor from the one passed in.  We do this
 	 * partly to copy it into the cache context, and partly because the new
 	 * relation can't have any defaults or constraints yet; they have to be
 	 * added in later steps, because they require additions to multiple system
 	 * catalogs.  We can copy attnotnull constraints here, however.
 	 */
 	rel->rd_att = CreateTupleDescCopy(tupDesc);
 	rel->rd_att->tdrefcount = 1;	/* mark as refcounted */
 	has_not_null = false;
 	for (i = 0; i < natts; i++)
 	{
 		rel->rd_att->attrs[i]->attnotnull = tupDesc->attrs[i]->attnotnull;
 		has_not_null |= tupDesc->attrs[i]->attnotnull;
 	}

 	if (has_not_null)
 	{
 		TupleConstr *constr = (TupleConstr *) palloc0(sizeof(TupleConstr));

 		constr->has_not_null = true;
 		rel->rd_att->constr = constr;
 	}

 	/*
 	 * initialize relation tuple form (caller may add/override data later)
 	 */
 	rel->rd_rel = (Form_pg_class) palloc0(CLASS_TUPLE_SIZE);

 	namestrcpy(&rel->rd_rel->relname, relname);
 	rel->rd_rel->relnamespace = relnamespace;

 	rel->rd_rel->relkind = RELKIND_UNCATALOGED;
 	rel->rd_rel->relstorage = relstorage;
 	rel->rd_rel->relhasoids = rel->rd_att->tdhasoid;
 	rel->rd_rel->relnatts = natts;
 	rel->rd_rel->reltype = InvalidOid;
 	/* needed when bootstrapping: */
 	rel->rd_rel->relowner = BOOTSTRAP_SUPERUSERID;

 	/*
 	 * Create zeroed-out gp_relation_node data.  It will be filled in when the
 	 * disk file is created.
 	 */
 	rel->rd_relationnodeinfo.isPresent = false;
 	rel->rd_relationnodeinfo.tidAllowedToBeZero = false;

 	/*
 	 * Insert relation physical and logical identifiers (OIDs) into the right
 	 * places.	Note that the physical ID (relfilenode) is initially the same
 	 * as the logical ID (OID).
 	 */
 	rel->rd_rel->relisshared = shared_relation;

 	RelationGetRelid(rel) = relid;

 	for (i = 0; i < natts; i++)
 		rel->rd_att->attrs[i]->attrelid = relid;

 	rel->rd_rel->relfilenode = relid;
 	rel->rd_rel->reltablespace = reltablespace;

 	RelationInitLockInfo(rel);	/* see lmgr.c */

 	RelationInitPhysicalAddr(rel);

 	/*
 	 * Okay to insert into the relcache hash tables.
 	 */
 	RelationCacheInsert(rel);

 	/*
 	 * done building relcache entry.
 	 */
 	MemoryContextSwitchTo(oldcxt);

 	/* It's fully valid */
 	rel->rd_isvalid = true;

 	/*
 	 * Caller expects us to pin the returned entry.
 	 */
 	RelationIncrementReferenceCount(rel);

 	return rel;
 }

 /*
  *		RelationCacheInitialize
  *
  *		This initializes the relation descriptor cache.  At the time
  *		that this is invoked, we can't do database access yet (mainly
  *		because the transaction subsystem is not up); all we are doing
  *		is making an empty cache hashtable.  This must be done before
  *		starting the initialization transaction, because otherwise
  *		AtEOXact_RelationCache would crash if that transaction aborts
  *		before we can get the relcache set up.
  */

 #define INITRELCACHESIZE		400

 void
 RelationCacheInitialize(void)
 {
 	MemoryContext oldcxt;
 	HASHCTL		ctl;

 	/*
 	 * make sure cache memory context exists
 	 */
 	if (!CacheMemoryContext)
 		CreateCacheMemoryContext();

     /*
 	 * switch to cache memory context
 	 */
 	oldcxt = MemoryContextSwitchTo(CacheMemoryContext);

 	/*
 	 * create hashtable that indexes the relcache
 	 */
 	MemSet(&ctl, 0, sizeof(ctl));
 	ctl.keysize = sizeof(Oid);
 	ctl.entrysize = sizeof(RelIdCacheEnt);
 	ctl.hash = oid_hash;
 	RelationIdCache = hash_create("Relcache by OID", INITRELCACHESIZE,
 								  &ctl, HASH_ELEM | HASH_FUNCTION);

 	MemoryContextSwitchTo(oldcxt);
 }

 /*
  *		RelationCacheInitializePhase2
  *
  *		This is called as soon as the catcache and transaction system
  *		are functional.  At this point we can actually read data from
  *		the system catalogs.  We first try to read pre-computed relcache
  *		entries from the pg_internal.init file.  If that's missing or
  *		broken, make phony entries for the minimum set of nailed-in-cache
  *		relations.	Then (unless bootstrapping) make sure we have entries
  *		for the critical system indexes.  Once we've done all this, we
  *		have enough infrastructure to open any system catalog or use any
  *		catcache.  The last step is to rewrite pg_internal.init if needed.
  */
 void
 RelationCacheInitializePhase2(void)
 {
 	HASH_SEQ_STATUS status;
 	RelIdCacheEnt *idhentry;
 	MemoryContext oldcxt;
 	bool		needNewCacheFile = false;

 	/*
 	 * switch to cache memory context
 	 */
 	oldcxt = MemoryContextSwitchTo(CacheMemoryContext);

 	/*
 	 * Try to load the relcache cache file.  If unsuccessful, bootstrap the
 	 * cache with pre-made descriptors for the critical "nailed-in" system
 	 * catalogs.
 	 */
 	if (IsBootstrapProcessingMode() ||
 		!load_relcache_init_file())
 	{
 		needNewCacheFile = true;

 		formrdesc("pg_class", PG_CLASS_RELTYPE_OID,
 				  true, Natts_pg_class, Desc_pg_class);
 		formrdesc("pg_attribute", PG_ATTRIBUTE_RELTYPE_OID,
 				  false, Natts_pg_attribute, Desc_pg_attribute);
 		formrdesc("pg_proc", PG_PROC_RELTYPE_OID,
 				  true, Natts_pg_proc, Desc_pg_proc);
 		formrdesc("pg_type", PG_TYPE_RELTYPE_OID,
 				  true, Natts_pg_type, Desc_pg_type);

 #define NUM_CRITICAL_RELS	4	/* fix if you change list above */
 	}

 	MemoryContextSwitchTo(oldcxt);

 	/* In bootstrap mode, the faked-up formrdesc info is all we'll have */
 	if (IsBootstrapProcessingMode())
 		return;

 	/*
 	 * If we didn't get the critical system indexes loaded into relcache, do
 	 * so now.	These are critical because the catcache depends on them for
 	 * catcache fetches that are done during relcache load.  Thus, we have an
 	 * infinite-recursion problem.	We can break the recursion by doing
 	 * heapscans instead of indexscans at certain key spots. To avoid hobbling
 	 * performance, we only want to do that until we have the critical indexes
 	 * loaded into relcache.  Thus, the flag criticalRelcachesBuilt is used to
 	 * decide whether to do heapscan or indexscan at the key spots, and we set
 	 * it true after we've loaded the critical indexes.
 	 *
 	 * The critical indexes are marked as "nailed in cache", partly to make it
 	 * easy for load_relcache_init_file to count them, but mainly because we
 	 * cannot flush and rebuild them once we've set criticalRelcachesBuilt to
 	 * true.  (NOTE: perhaps it would be possible to reload them by
 	 * temporarily setting criticalRelcachesBuilt to false again.  For now,
 	 * though, we just nail 'em in.)
 	 *
 	 * RewriteRelRulenameIndexId and TriggerRelidNameIndexId are not critical
 	 * in the same way as the others, because the critical catalogs don't
 	 * (currently) have any rules or triggers, and so these indexes can be
 	 * rebuilt without inducing recursion.	However they are used during
 	 * relcache load when a rel does have rules or triggers, so we choose to
 	 * nail them for performance reasons.
 	 */
 	if (!criticalRelcachesBuilt)
 	{
 		load_critical_index(ClassOidIndexId,
 							RelationRelationId);
 		load_critical_index(AttributeRelidNumIndexId,
 							AttributeRelationId);
 		load_critical_index(IndexRelidIndexId,
 							IndexRelationId);
 		load_critical_index(AccessMethodStrategyIndexId,
 							AccessMethodOperatorRelationId);
 		load_critical_index(AccessMethodProcedureIndexId,
 							AccessMethodProcedureRelationId);
 		load_critical_index(OperatorOidIndexId,
 							OperatorRelationId);
 		load_critical_index(RewriteRelRulenameIndexId,
 							RewriteRelationId);
 		load_critical_index(TriggerRelidNameIndexId,
 							TriggerRelationId);

 #define NUM_CRITICAL_INDEXES	8		/* fix if you change list above */

 		criticalRelcachesBuilt = true;
 	}

 	/*
 	 * Now, scan all the relcache entries and update anything that might be
 	 * wrong in the results from formrdesc or the relcache cache file. If we
 	 * faked up relcache entries using formrdesc, then read the real pg_class
 	 * rows and replace the fake entries with them. Also, if any of the
 	 * relcache entries have rules or triggers, load that info the hard way
 	 * since it isn't recorded in the cache file.
 	 */
 	hash_seq_init(&status, RelationIdCache);

 	while ((idhentry = (RelIdCacheEnt *) hash_seq_search(&status)) != NULL)
 	{
 		Relation	relation = idhentry->reldesc;
 		bool		restart	 = false;

 		/*
 		 * Make sure *this* entry doesn't get flushed while we work with it.
 		 */
 		RelationIncrementReferenceCount(relation);

 		/*
 		 * If it's a faked-up entry, read the real pg_class tuple.
 		 */
 		if (relation->rd_rel->relowner == InvalidOid)
 		{
 			HeapTuple	htup;
 			Form_pg_class relp;

 			htup = SearchSysCache(RELOID,
 								ObjectIdGetDatum(RelationGetRelid(relation)),
 								  0, 0, 0);
 			if (!HeapTupleIsValid(htup))
 				elog(FATAL, "cache lookup failed for relation %u",
 					 RelationGetRelid(relation));
 			relp = (Form_pg_class) GETSTRUCT(htup);

 			/*
 			 * Copy tuple to relation->rd_rel. (See notes in
 			 * AllocateRelationDesc())
 			 */
 			Assert(relation->rd_rel != NULL);
 			memcpy((char *) relation->rd_rel, (char *) relp, CLASS_TUPLE_SIZE);

 			/* Update rd_options while we have the tuple */
 			if (relation->rd_options)
 				pfree(relation->rd_options);
 			RelationParseRelOptions(relation, htup);

 			/*
 			 * Check the values in rd_att were set up correctly.  (We cannot
 			 * just copy them over now: formrdesc must have set up the rd_att
 			 * data correctly to start with, because it may already have been
 			 * copied into one or more catcache entries.)
 			 */
 			Assert(relation->rd_att->tdtypeid == relp->reltype);
 			Assert(relation->rd_att->tdtypmod == -1);
 			Assert(relation->rd_att->tdhasoid == relp->relhasoids);

 			ReleaseSysCache(htup);

 			/* relowner had better be OK now, else we'll loop forever */
 			if (relation->rd_rel->relowner == InvalidOid)
 				elog(ERROR, "invalid relowner in pg_class entry for \"%s\"",
 					 RelationGetRelationName(relation));

 			restart = true;
 		}

 		/*
 		 * Fix data that isn't saved in relcache cache file.
 		 */
 		if (relation->rd_rel->relhasrules && relation->rd_rules == NULL)
 		{
 			RelationBuildRuleLock(relation);
 			if (relation->rd_rules == NULL)
 				relation->rd_rel->relhasrules = false;
 			restart = true;
 		}
 		if (relation->rd_rel->reltriggers > 0 && relation->trigdesc == NULL)
 		{
 			RelationBuildTriggers(relation);
 			if (relation->trigdesc == NULL)
 				relation->rd_rel->reltriggers = 0;
 			restart = true;
 		}

 		/* Release hold on the relation */
 		RelationDecrementReferenceCount(relation);

 		/* Now, restart the hashtable scan if needed */
 		if (restart)
 		{
 			hash_seq_term(&status);
 			hash_seq_init(&status, RelationIdCache);
 		}
 	}

 	/*
 	 * Lastly, write out a new relcache cache file if one is needed.
 	 */
 	if (needNewCacheFile)
 	{
 		/*
 		 * Force all the catcaches to finish initializing and thereby open the
 		 * catalogs and indexes they use.  This will preload the relcache with
 		 * entries for all the most important system catalogs and indexes, so
 		 * that the init file will be most useful for future backends.
 		 */
 		InitCatalogCachePhase2();

 		/* now write the file */
 		write_relcache_init_file();
 	}
 }

 /*
  * Load one critical system index into the relcache
  *
  * indexoid is the OID of the target index, heapoid is the OID of the catalog
  * it belongs to.
  */
 static void
 load_critical_index(Oid indexoid, Oid heapoid)
 {
 	Relation	ird;

 	/*
 	 * We must lock the underlying catalog before locking the index to avoid
 	 * deadlock, since RelationBuildDesc might well need to read the catalog,
 	 * and if anyone else is exclusive-locking this catalog and index they'll
 	 * be doing it in that order.
 	 */
 	LockRelationOid(heapoid, AccessShareLock);
 	LockRelationOid(indexoid, AccessShareLock);
 	ird = RelationBuildDesc(indexoid, true);
 	if (ird == NULL)
 		elog(PANIC, "could not open critical system index %u", indexoid);
 	ird->rd_isnailed = true;
 	ird->rd_refcnt = 1;
 	UnlockRelationOid(indexoid, AccessShareLock);
 	UnlockRelationOid(heapoid, AccessShareLock);
 }

 /*
  * GetPgClassDescriptor -- get a predefined tuple descriptor for pg_class
  * GetPgIndexDescriptor -- get a predefined tuple descriptor for pg_index
  *
  * We need this kluge because we have to be able to access non-fixed-width
  * fields of pg_class and pg_index before we have the standard catalog caches
  * available.  We use predefined data that's set up in just the same way as
  * the bootstrapped reldescs used by formrdesc().  The resulting tupdesc is
  * not 100% kosher: it does not have the correct rowtype OID in tdtypeid, nor
  * does it have a TupleConstr field.  But it's good enough for the purpose of
  * extracting fields.
  */
 static TupleDesc
 BuildHardcodedDescriptor(int natts, const FormData_pg_attribute *attrs,
 						 bool hasoids)
 {
 	TupleDesc	result;
 	MemoryContext oldcxt;
 	int			i;

 	oldcxt = MemoryContextSwitchTo(CacheMemoryContext);

 	result = CreateTemplateTupleDesc(natts, hasoids);
 	result->tdtypeid = RECORDOID;		/* not right, but we don't care */
 	result->tdtypmod = -1;

 	for (i = 0; i < natts; i++)
 	{
 		memcpy(result->attrs[i], &attrs[i], ATTRIBUTE_FIXED_PART_SIZE);
 		/* make sure attcacheoff is valid */
 		result->attrs[i]->attcacheoff = -1;
 	}

 	/* initialize first attribute's attcacheoff, cf RelationBuildTupleDesc */
 	result->attrs[0]->attcacheoff = 0;

 	/* Note: we don't bother to set up a TupleConstr entry */

 	MemoryContextSwitchTo(oldcxt);

 	return result;
 }

 static TupleDesc
 GetPgClassDescriptor(void)
 {
 	static TupleDesc pgclassdesc = NULL;

 	/* Already done? */
 	if (pgclassdesc == NULL)
 		pgclassdesc = BuildHardcodedDescriptor(Natts_pg_class,
 											   Desc_pg_class,
 											   true);

 	return pgclassdesc;
 }

 static TupleDesc
 GetPgIndexDescriptor(void)
 {
 	static TupleDesc pgindexdesc = NULL;

 	/* Already done? */
 	if (pgindexdesc == NULL)
 		pgindexdesc = BuildHardcodedDescriptor(Natts_pg_index,
 											   Desc_pg_index,
 											   false);

 	return pgindexdesc;
 }

 /*
  * Load any default attribute value definitions for the relation.
  */
 static void
 AttrDefaultFetch(Relation relation)
 {
 	AttrDefault *attrdef = relation->rd_att->constr->defval;
 	int			ndef = relation->rd_att->constr->num_defval;
 	Relation	adrel;
 	HeapTuple	htup;
 	cqContext	cqc;
 	cqContext  *pcqCtx;
 	Datum		val;
 	bool		isnull;
 	int			found;
 	int			i;

 	adrel = heap_open(AttrDefaultRelationId, AccessShareLock);
 	pcqCtx = caql_beginscan(
 			caql_syscache(
 					caql_indexOK(caql_addrel(cqclr(&cqc), adrel),
 								 true),
 					false),
 			cql("SELECT * FROM pg_attrdef "
 				" WHERE adrelid = :1 ",
 				ObjectIdGetDatum(RelationGetRelid(relation))));

 	found = 0;

 	while (HeapTupleIsValid(htup = caql_getnext(pcqCtx)))
 	{
 		Form_pg_attrdef adform = (Form_pg_attrdef) GETSTRUCT(htup);

 		for (i = 0; i < ndef; i++)
 		{
 			if (adform->adnum != attrdef[i].adnum)
 				continue;
 			if (attrdef[i].adbin != NULL)
 				elog(WARNING, "multiple attrdef records found for attr %s of rel %s",
 				NameStr(relation->rd_att->attrs[adform->adnum - 1]->attname),
 					 RelationGetRelationName(relation));
 			else
 				found++;

 			val = fastgetattr(htup,
 							  Anum_pg_attrdef_adbin,
 							  adrel->rd_att, &isnull);
 			if (isnull)
 				elog(WARNING, "null adbin for attr %s of rel %s",
 				NameStr(relation->rd_att->attrs[adform->adnum - 1]->attname),
 					 RelationGetRelationName(relation));
 			else
 				attrdef[i].adbin = MemoryContextStrdup(CacheMemoryContext,
 												   TextDatumGetCString(val));
 			break;
 		}

 		if (i >= ndef)
 			elog(WARNING, "unexpected attrdef record found for attr %d of rel %s",
 				 adform->adnum, RelationGetRelationName(relation));
 	}

 	caql_endscan(pcqCtx);
 	heap_close(adrel, AccessShareLock);

 	if (found != ndef && ( GP_ROLE_UTILITY==Gp_role || GP_ROLE_DISPATCH==Gp_role ))
 		elog(WARNING, "%d attrdef record(s) missing for rel %s",
 			 ndef - found, RelationGetRelationName(relation));
 }

 /*
  * Load any check constraints for the relation.
  */
 static void
 CheckConstraintFetch(Relation relation)
 {
 	ConstrCheck *check = relation->rd_att->constr->check;
 	int			ncheck = relation->rd_att->constr->num_check;
 	Relation	conrel;
 	HeapTuple	htup;
 	cqContext	cqc;
 	cqContext  *pcqCtx;
 	Datum		val;
 	bool		isnull;
 	int			found = 0;

 	conrel = heap_open(ConstraintRelationId, AccessShareLock);

 	pcqCtx = caql_beginscan(
 			caql_syscache(
 					caql_indexOK(caql_addrel(cqclr(&cqc), conrel),
 								 true),
 						false),
 			cql("SELECT * FROM pg_constraint "
 				" WHERE conrelid = :1 ",
 				ObjectIdGetDatum(RelationGetRelid(relation))));

 	while (HeapTupleIsValid(htup = caql_getnext(pcqCtx)))
 	{
 		Form_pg_constraint conform = (Form_pg_constraint) GETSTRUCT(htup);

 		/* We want check constraints only */
 		if (conform->contype != CONSTRAINT_CHECK)
 			continue;

 		if (found >= ncheck)
 			elog(ERROR, "unexpected constraint record found for rel %s",
 				 RelationGetRelationName(relation));

 		check[found].ccname = MemoryContextStrdup(CacheMemoryContext,
 												  NameStr(conform->conname));

 		/* Grab and test conbin is actually set */
 		val = fastgetattr(htup,
 						  Anum_pg_constraint_conbin,
 						  conrel->rd_att, &isnull);
 		if (isnull)
 			elog(ERROR, "null conbin for rel %s",
 				 RelationGetRelationName(relation));

 		check[found].ccbin = MemoryContextStrdup(CacheMemoryContext,
 												 TextDatumGetCString(val));
 		found++;
 	}

 	caql_endscan(pcqCtx);
 	heap_close(conrel, AccessShareLock);

 	if (found != ncheck)
 		elog(ERROR, "%d constraint record(s) missing for rel %s",
 			 ncheck - found, RelationGetRelationName(relation));
 }


 /*
  * RelationGetPartitioningKey -- get GpPolicy struct for distributed relation
  *
  * Returns a copy of the relation's GpPolicy object, palloc'd in
  * the caller's context.  Caller should pfree() it.  If NULL is
  * returned, relation should be accessed locally.
  */
 GpPolicy*
 RelationGetPartitioningKey(Relation relation)
 {
     return GpPolicyCopy(CurrentMemoryContext, relation->rd_cdbpolicy);
 }                                       /* RelationGetPartitioningKey */


 /*
  * RelationGetIndexList -- get a list of OIDs of indexes on this relation
  *
  * The index list is created only if someone requests it.  We scan pg_index
  * to find relevant indexes, and add the list to the relcache entry so that
  * we won't have to compute it again.  Note that shared cache inval of a
  * relcache entry will delete the old list and set rd_indexvalid to 0,
  * so that we must recompute the index list on next request.  This handles
  * creation or deletion of an index.
  *
  * The returned list is guaranteed to be sorted in order by OID.  This is
  * needed by the executor, since for index types that we obtain exclusive
  * locks on when updating the index, all backends must lock the indexes in
  * the same order or we will get deadlocks (see ExecOpenIndices()).  Any
  * consistent ordering would do, but ordering by OID is easy.
  *
  * Since shared cache inval causes the relcache's copy of the list to go away,
  * we return a copy of the list palloc'd in the caller's context.  The caller
  * may list_free() the returned list after scanning it. This is necessary
  * since the caller will typically be doing syscache lookups on the relevant
  * indexes, and syscache lookup could cause SI messages to be processed!
  *
  * We also update rd_oidindex, which this module treats as effectively part
  * of the index list.  rd_oidindex is valid when rd_indexvalid isn't zero;
  * it is the pg_class OID of a unique index on OID when the relation has one,
  * and InvalidOid if there is no such index.
  */
 List *
 RelationGetIndexList(Relation relation)
 {
 	Relation	indrel;
 	HeapTuple	htup;
 	cqContext	cqc;
 	cqContext  *pcqCtx;
 	List	   *result;
 	Oid			oidIndex;
 	MemoryContext oldcxt;

 	/* Quick exit if we already computed the list. */
 	if (relation->rd_indexvalid != 0)
 		return list_copy(relation->rd_indexlist);

 	/*
 	 * We build the list we intend to return (in the caller's context) while
 	 * doing the scan.	After successfully completing the scan, we copy that
 	 * list into the relcache entry.  This avoids cache-context memory leakage
 	 * if we get some sort of error partway through.
 	 */
 	result = NIL;
 	oidIndex = InvalidOid;

 	/* Prepare to scan pg_index for entries having indrelid = this rel. */

 	indrel = heap_open(IndexRelationId, AccessShareLock);

 	pcqCtx = caql_beginscan(
 			caql_syscache(
 					caql_indexOK(caql_addrel(cqclr(&cqc), indrel),
 								 true),
 						false),
 			cql("SELECT * FROM pg_index "
 				" WHERE indrelid = :1 ",
 				ObjectIdGetDatum(RelationGetRelid(relation))));

 	while (HeapTupleIsValid(htup = caql_getnext(pcqCtx)))
 	{
 		Form_pg_index index = (Form_pg_index) GETSTRUCT(htup);

 		/* Add index's OID to result list in the proper order */
 		result = insert_ordered_oid(result, index->indexrelid);

 		/* Check to see if it is a unique, non-partial btree index on OID */
 		if (index->indnatts == 1 &&
 			index->indisunique &&
 			index->indkey.values[0] == ObjectIdAttributeNumber &&
 			index->indclass.values[0] == OID_BTREE_OPS_OID &&
 			heap_attisnull(htup, Anum_pg_index_indpred))
 			oidIndex = index->indexrelid;
 	}

 	caql_endscan(pcqCtx);
 	heap_close(indrel, AccessShareLock);

 	/* Now save a copy of the completed list in the relcache entry. */
 	oldcxt = MemoryContextSwitchTo(CacheMemoryContext);
 	relation->rd_indexlist = list_copy(result);
 	relation->rd_oidindex = oidIndex;
 	relation->rd_indexvalid = 1;
 	MemoryContextSwitchTo(oldcxt);

 	return result;
 }

 /*
  * insert_ordered_oid
  *		Insert a new Oid into a sorted list of Oids, preserving ordering
  *
  * Building the ordered list this way is O(N^2), but with a pretty small
  * constant, so for the number of entries we expect it will probably be
  * faster than trying to apply qsort().  Most tables don't have very many
  * indexes...
  */
 static List *
 insert_ordered_oid(List *list, Oid datum)
 {
 	ListCell   *prev;

 	/* Does the datum belong at the front? */
 	if (list == NIL || datum < linitial_oid(list))
 		return lcons_oid(datum, list);
 	/* No, so find the entry it belongs after */
 	prev = list_head(list);
 	for (;;)
 	{
 		ListCell   *curr = lnext(prev);

 		if (curr == NULL || datum < lfirst_oid(curr))
 			break;				/* it belongs after 'prev', before 'curr' */

 		prev = curr;
 	}
 	/* Insert datum into list after 'prev' */
 	lappend_cell_oid(list, prev, datum);
 	return list;
 }

 /*
  * RelationSetIndexList -- externally force the index list contents
  *
  * This is used to temporarily override what we think the set of valid
  * indexes is (including the presence or absence of an OID index).
  * The forcing will be valid only until transaction commit or abort.
  *
  * This should only be applied to nailed relations, because in a non-nailed
  * relation the hacked index list could be lost at any time due to SI
  * messages.  In practice it is only used on pg_class (see REINDEX).
  *
  * It is up to the caller to make sure the given list is correctly ordered.
  */
 void
 RelationSetIndexList(Relation relation, List *indexIds, Oid oidIndex)
 {
 	MemoryContext oldcxt;

 	Assert(relation->rd_isnailed);
 	/* Copy the list into the cache context (could fail for lack of mem) */
 	oldcxt = MemoryContextSwitchTo(CacheMemoryContext);
 	indexIds = list_copy(indexIds);
 	MemoryContextSwitchTo(oldcxt);
 	/* Okay to replace old list */
 	list_free(relation->rd_indexlist);
 	relation->rd_indexlist = indexIds;
 	relation->rd_oidindex = oidIndex;
 	relation->rd_indexvalid = 2;	/* mark list as forced */
 	/* must flag that we have a forced index list */
 	need_eoxact_work = true;
 }

 /*
  * RelationGetOidIndex -- get the pg_class OID of the relation's OID index
  *
  * Returns InvalidOid if there is no such index.
  */
 Oid
 RelationGetOidIndex(Relation relation)
 {
 	List	   *ilist;

 	/*
 	 * If relation doesn't have OIDs at all, caller is probably confused. (We
 	 * could just silently return InvalidOid, but it seems better to throw an
 	 * assertion.)
 	 */
 	Assert(relation->rd_rel->relhasoids);

 	if (relation->rd_indexvalid == 0)
 	{
 		/* RelationGetIndexList does the heavy lifting. */
 		ilist = RelationGetIndexList(relation);
 		list_free(ilist);
 		Assert(relation->rd_indexvalid != 0);
 	}

 	return relation->rd_oidindex;
 }

 /*
  * RelationGetIndexExpressions -- get the index expressions for an index
  *
  * We cache the result of transforming pg_index.indexprs into a node tree.
  * If the rel is not an index or has no expressional columns, we return NIL.
  * Otherwise, the returned tree is copied into the caller's memory context.
  * (We don't want to return a pointer to the relcache copy, since it could
  * disappear due to relcache invalidation.)
  */
 List *
 RelationGetIndexExpressions(Relation relation)
 {
 	List	   *result;
 	Datum		exprsDatum;
 	bool		isnull;
 	char	   *exprsString;
 	MemoryContext oldcxt;

 	/* Quick exit if we already computed the result. */
 	if (relation->rd_indexprs)
 		return (List *) copyObject(relation->rd_indexprs);

 	/* Quick exit if there is nothing to do. */
 	if (relation->rd_indextuple == NULL ||
 		heap_attisnull(relation->rd_indextuple, Anum_pg_index_indexprs))
 		return NIL;

 	/*
 	 * We build the tree we intend to return in the caller's context. After
 	 * successfully completing the work, we copy it into the relcache entry.
 	 * This avoids problems if we get some sort of error partway through.
 	 */
 	exprsDatum = heap_getattr(relation->rd_indextuple,
 							  Anum_pg_index_indexprs,
 							  GetPgIndexDescriptor(),
 							  &isnull);
 	Assert(!isnull);
 	exprsString = TextDatumGetCString(exprsDatum);
 	result = (List *) stringToNode(exprsString);
 	pfree(exprsString);

 	/*
 	 * Run the expressions through eval_const_expressions. This is not just an
 	 * optimization, but is necessary, because the planner will be comparing
 	 * them to similarly-processed qual clauses, and may fail to detect valid
 	 * matches without this.  We don't bother with canonicalize_qual, however.
 	 */
 	result = (List *) eval_const_expressions(NULL, (Node *) result);

 	/*
 	 * Also mark any coercion format fields as "don't care", so that the
 	 * planner can match to both explicit and implicit coercions.
 	 */
 	set_coercionform_dontcare((Node *) result);

 	/* May as well fix opfuncids too */
 	fix_opfuncids((Node *) result);

 	/* Now save a copy of the completed tree in the relcache entry. */
 	oldcxt = MemoryContextSwitchTo(CacheMemoryContext);
 	relation->rd_indexprs = (List *) copyObject(result);
 	MemoryContextSwitchTo(oldcxt);

 	return result;
 }

 /*
  * RelationGetIndexPredicate -- get the index predicate for an index
  *
  * We cache the result of transforming pg_index.indpred into an implicit-AND
  * node tree (suitable for ExecQual).
  * If the rel is not an index or has no predicate, we return NIL.
  * Otherwise, the returned tree is copied into the caller's memory context.
  * (We don't want to return a pointer to the relcache copy, since it could
  * disappear due to relcache invalidation.)
  */
 List *
 RelationGetIndexPredicate(Relation relation)
 {
 	List	   *result;
 	Datum		predDatum;
 	bool		isnull;
 	char	   *predString;
 	MemoryContext oldcxt;

 	/* Quick exit if we already computed the result. */
 	if (relation->rd_indpred)
 		return (List *) copyObject(relation->rd_indpred);

 	/* Quick exit if there is nothing to do. */
 	if (relation->rd_indextuple == NULL ||
 		heap_attisnull(relation->rd_indextuple, Anum_pg_index_indpred))
 		return NIL;

 	/*
 	 * We build the tree we intend to return in the caller's context. After
 	 * successfully completing the work, we copy it into the relcache entry.
 	 * This avoids problems if we get some sort of error partway through.
 	 */
 	predDatum = heap_getattr(relation->rd_indextuple,
 							 Anum_pg_index_indpred,
 							 GetPgIndexDescriptor(),
 							 &isnull);
 	Assert(!isnull);
 	predString = TextDatumGetCString(predDatum);
 	result = (List *) stringToNode(predString);
 	pfree(predString);

 	/*
 	 * Run the expression through const-simplification and canonicalization.
 	 * This is not just an optimization, but is necessary, because the planner
 	 * will be comparing it to similarly-processed qual clauses, and may fail
 	 * to detect valid matches without this.  This must match the processing
 	 * done to qual clauses in preprocess_expression()!  (We can skip the
 	 * stuff involving subqueries, however, since we don't allow any in index
 	 * predicates.)
 	 */
 	result = (List *) eval_const_expressions(NULL, (Node *) result);

 	result = (List *) canonicalize_qual((Expr *) result);

 	/*
 	 * Also mark any coercion format fields as "don't care", so that the
 	 * planner can match to both explicit and implicit coercions.
 	 */
 	set_coercionform_dontcare((Node *) result);

 	/* Also convert to implicit-AND format */
 	result = make_ands_implicit((Expr *) result);

 	/* May as well fix opfuncids too */
 	fix_opfuncids((Node *) result);

 	/* Now save a copy of the completed tree in the relcache entry. */
 	oldcxt = MemoryContextSwitchTo(CacheMemoryContext);
 	relation->rd_indpred = (List *) copyObject(result);
 	MemoryContextSwitchTo(oldcxt);

 	return result;
 }


 /*
  *	load_relcache_init_file, write_relcache_init_file
  *
  *		In late 1992, we started regularly having databases with more than
  *		a thousand classes in them.  With this number of classes, it became
  *		critical to do indexed lookups on the system catalogs.
  *
  *		Bootstrapping these lookups is very hard.  We want to be able to
  *		use an index on pg_attribute, for example, but in order to do so,
  *		we must have read pg_attribute for the attributes in the index,
  *		which implies that we need to use the index.
  *
  *		In order to get around the problem, we do the following:
  *
  *		   +  When the database system is initialized (at initdb time), we
  *			  don't use indexes.  We do sequential scans.
  *
  *		   +  When the backend is started up in normal mode, we load an image
  *			  of the appropriate relation descriptors, in internal format,
  *			  from an initialization file in the data/base/... directory.
  *
  *		   +  If the initialization file isn't there, then we create the
  *			  relation descriptors using sequential scans and write 'em to
  *			  the initialization file for use by subsequent backends.
  *
  *		We could dispense with the initialization files and just build the
  *		critical reldescs the hard way on every backend startup, but that
  *		slows down backend startup noticeably.
  *
  *		We can in fact go further, and save more relcache entries than
  *		just the ones that are absolutely critical; this allows us to speed
  *		up backend startup by not having to build such entries the hard way.
  *		Presently, all the catalog and index entries that are referred to
  *		by catcaches are stored in the initialization files.
  *
  *		The same mechanism that detects when catcache and relcache entries
  *		need to be invalidated (due to catalog updates) also arranges to
  *		unlink the initialization files when the contents may be out of date.
  *		The files will then be rebuilt during the next backend startup.
  */

 /*
  * load_relcache_init_file -- attempt to load cache from the init file
  *
  * If successful, return TRUE and set criticalRelcachesBuilt to true.
  * If not successful, return FALSE.
  *
  * NOTE: we assume we are already switched into CacheMemoryContext.
  */
 static bool
 load_relcache_init_file(void)
 {
 	FILE	   *fp;
 	char		initfilename[MAXPGPATH];
 	Relation   *rels;
 	int			relno,
 				num_rels,
 				max_rels,
 				nailed_rels,
 				nailed_indexes,
 				magic;
 	int			i;

 	snprintf(initfilename, sizeof(initfilename), "%s/%s",
 			 DatabasePath, RELCACHE_INIT_FILENAME);

 	fp = AllocateFile(initfilename, PG_BINARY_R);
 	if (fp == NULL)
 		return false;

 	/*
 	 * Read the index relcache entries from the file.  Note we will not enter
 	 * any of them into the cache if the read fails partway through; this
 	 * helps to guard against broken init files.
 	 */
 	max_rels = 100;
 	rels = (Relation *) palloc(max_rels * sizeof(Relation));
 	num_rels = 0;
 	nailed_rels = nailed_indexes = 0;
 	initFileRelationIds = NIL;

 	/* check for correct magic number (compatible version) */
 	if (fread(&magic, 1, sizeof(magic), fp) != sizeof(magic))
 		goto read_failed;
 	if (magic != RELCACHE_INIT_FILEMAGIC)
 		goto read_failed;

 	for (relno = 0;; relno++)
 	{
 		Size		len;
 		size_t		nread;
 		Relation	rel;
 		Form_pg_class relform;
 		bool		has_not_null;
 		Datum		indclassDatum;
 		bool		isnull;

 		/* first read the relation descriptor length */
 		if ((nread = fread(&len, 1, sizeof(len), fp)) != sizeof(len))
 		{
 			if (nread == 0)
 				break;			/* end of file */
 			goto read_failed;
 		}

 		/* safety check for incompatible relcache layout */
 		if (len != sizeof(RelationData))
 			goto read_failed;

 		/* allocate another relcache header */
 		if (num_rels >= max_rels)
 		{
 			max_rels *= 2;
 			rels = (Relation *) repalloc(rels, max_rels * sizeof(Relation));
 		}

 		rel = rels[num_rels++] = (Relation) palloc(len);

 		/* then, read the Relation structure */
 		if ((nread = fread(rel, 1, len, fp)) != len)
 			goto read_failed;

 		/* next read the relation tuple form */
 		if ((nread = fread(&len, 1, sizeof(len), fp)) != sizeof(len))
 			goto read_failed;

 		relform = (Form_pg_class) palloc(len);
 		if ((nread = fread(relform, 1, len, fp)) != len)
 			goto read_failed;

 		rel->rd_rel = relform;

 		/* initialize attribute tuple forms */
 		rel->rd_att = CreateTemplateTupleDesc(relform->relnatts,
 											  relform->relhasoids);
 		rel->rd_att->tdrefcount = 1;	/* mark as refcounted */

 		rel->rd_att->tdtypeid = relform->reltype;
 		rel->rd_att->tdtypmod = -1;		/* unnecessary, but... */

 		/* next read all the attribute tuple form data entries */
 		has_not_null = false;
 		for (i = 0; i < relform->relnatts; i++)
 		{
 			if ((nread = fread(&len, 1, sizeof(len), fp)) != sizeof(len))
 				goto read_failed;
 			if (len != ATTRIBUTE_FIXED_PART_SIZE)
 				goto read_failed;
 			if ((nread = fread(rel->rd_att->attrs[i], 1, len, fp)) != len)
 				goto read_failed;

 			has_not_null |= rel->rd_att->attrs[i]->attnotnull;
 		}

 		/* next read the access method specific field */
 		if ((nread = fread(&len, 1, sizeof(len), fp)) != sizeof(len))
 			goto read_failed;
 		if (len > 0)
 		{
 			rel->rd_options = palloc(len);
 			if ((nread = fread(rel->rd_options, 1, len, fp)) != len)
 				goto read_failed;
 			if (len != VARSIZE(rel->rd_options))
 				goto read_failed;		/* sanity check */
 		}
 		else
 		{
 			rel->rd_options = NULL;
 		}

 		/* mark not-null status */
 		if (has_not_null)
 		{
 			TupleConstr *constr = (TupleConstr *) palloc0(sizeof(TupleConstr));

 			constr->has_not_null = true;
 			rel->rd_att->constr = constr;
 		}

 		/* If it's an index, there's more to do */
 		if (rel->rd_rel->relkind == RELKIND_INDEX)
 		{
 			Form_pg_am	am;
 			MemoryContext indexcxt;
 			Oid		   *operator;
 			RegProcedure *support;
 			int			nsupport;

 			/* Count nailed indexes to ensure we have 'em all */
 			if (rel->rd_isnailed)
 				nailed_indexes++;

 			/* next, read the pg_index tuple */
 			if ((nread = fread(&len, 1, sizeof(len), fp)) != sizeof(len))
 				goto read_failed;

 			rel->rd_indextuple = (HeapTuple) palloc(len);
 			if ((nread = fread(rel->rd_indextuple, 1, len, fp)) != len)
 				goto read_failed;

 			/* Fix up internal pointers in the tuple -- see heap_copytuple */
 			rel->rd_indextuple->t_data = (HeapTupleHeader) ((char *) rel->rd_indextuple + HEAPTUPLESIZE);
 			rel->rd_index = (Form_pg_index) GETSTRUCT(rel->rd_indextuple);

 			/* fix up indclass pointer too */
 			indclassDatum = fastgetattr(rel->rd_indextuple,
 										Anum_pg_index_indclass,
 										GetPgIndexDescriptor(),
 										&isnull);
 			Assert(!isnull);
 			rel->rd_indclass = (oidvector *) DatumGetPointer(indclassDatum);

 			/* next, read the access method tuple form */
 			if ((nread = fread(&len, 1, sizeof(len), fp)) != sizeof(len))
 				goto read_failed;

 			am = (Form_pg_am) palloc(len);
 			if ((nread = fread(am, 1, len, fp)) != len)
 				goto read_failed;
 			rel->rd_am = am;

 			/*
 			 * prepare index info context --- parameters should match
 			 * RelationInitIndexAccessInfo
 			 */
 			indexcxt = AllocSetContextCreate(CacheMemoryContext,
 											 RelationGetRelationName(rel),
 											 ALLOCSET_SMALL_MINSIZE,
 											 ALLOCSET_SMALL_INITSIZE,
 											 ALLOCSET_SMALL_MAXSIZE);
 			rel->rd_indexcxt = indexcxt;

 			/* next, read the vector of operator OIDs */
 			if ((nread = fread(&len, 1, sizeof(len), fp)) != sizeof(len))
 				goto read_failed;

 			operator = (Oid *) MemoryContextAlloc(indexcxt, len);
 			if ((nread = fread(operator, 1, len, fp)) != len)
 				goto read_failed;

 			rel->rd_operator = operator;

 			/* finally, read the vector of support procedures */
 			if ((nread = fread(&len, 1, sizeof(len), fp)) != sizeof(len))
 				goto read_failed;
 			support = (RegProcedure *) MemoryContextAlloc(indexcxt, len);
 			if ((nread = fread(support, 1, len, fp)) != len)
 				goto read_failed;

 			rel->rd_support = support;

 			/* set up zeroed fmgr-info vectors */
 			rel->rd_aminfo = (RelationAmInfo *)
 				MemoryContextAllocZero(indexcxt, sizeof(RelationAmInfo));
 			nsupport = relform->relnatts * am->amsupport;
 			rel->rd_supportinfo = (FmgrInfo *)
 				MemoryContextAllocZero(indexcxt, nsupport * sizeof(FmgrInfo));
 		}
 		else
 		{
 			/* Count nailed rels to ensure we have 'em all */
 			if (rel->rd_isnailed)
 				nailed_rels++;

 			Assert(rel->rd_index == NULL);
 			Assert(rel->rd_indextuple == NULL);
 			Assert(rel->rd_indclass == NULL);
 			Assert(rel->rd_am == NULL);
 			Assert(rel->rd_indexcxt == NULL);
 			Assert(rel->rd_aminfo == NULL);
 			Assert(rel->rd_operator == NULL);
 			Assert(rel->rd_support == NULL);
 			Assert(rel->rd_supportinfo == NULL);
 		}

 		/*
 		 * Rules and triggers are not saved (mainly because the internal
 		 * format is complex and subject to change).  They must be rebuilt if
 		 * needed by RelationCacheInitializePhase2.  This is not expected to
 		 * be a big performance hit since few system catalogs have such. Ditto
 		 * for index expressions and predicates.
 		 */
 		rel->rd_rules = NULL;
 		rel->rd_rulescxt = NULL;
 		rel->trigdesc = NULL;
 		rel->rd_indexprs = NIL;
 		rel->rd_indpred = NIL;

 		/*
 		 * Reset transient-state fields in the relcache entry
 		 */
 		rel->rd_smgr = NULL;
 		rel->rd_targblock = InvalidBlockNumber;
 		if (rel->rd_isnailed)
 			rel->rd_refcnt = 1;
 		else
 			rel->rd_refcnt = 0;
 		rel->rd_indexvalid = 0;
 		rel->rd_indexlist = NIL;
 		rel->rd_oidindex = InvalidOid;
 		rel->rd_createSubid = InvalidSubTransactionId;
 		rel->rd_amcache = NULL;
 		MemSet(&rel->pgstat_info, 0, sizeof(rel->pgstat_info));
         rel->rd_cdbpolicy = NULL;
         rel->rd_cdbDefaultStatsWarningIssued = false;

 		/*
 		 * Recompute lock and physical addressing info.  This is needed in
 		 * case the pg_internal.init file was copied from some other database
 		 * by CREATE DATABASE.
 		 */
 		RelationInitLockInfo(rel);
 		RelationInitPhysicalAddr(rel);
 	}

 	/*
 	 * We reached the end of the init file without apparent problem. Did we
 	 * get the right number of nailed items?  (This is a useful crosscheck in
 	 * case the set of critical rels or indexes changes.)
 	 */
 	if (nailed_rels != NUM_CRITICAL_RELS ||
 		nailed_indexes != NUM_CRITICAL_INDEXES)
 		goto read_failed;

 	/*
 	 * OK, all appears well.
 	 *
 	 * Now insert all the new relcache entries into the cache.
 	 */
 	for (relno = 0; relno < num_rels; relno++)
 	{
 		RelationCacheInsert(rels[relno]);
 		/* also make a list of their OIDs, for RelationIdIsInInitFile */
 		initFileRelationIds = lcons_oid(RelationGetRelid(rels[relno]),
 										initFileRelationIds);
 	}

 	pfree(rels);
 	FreeFile(fp);

 	criticalRelcachesBuilt = true;
 	return true;

 	/*
 	 * init file is broken, so do it the hard way.	We don't bother trying to
 	 * free the clutter we just allocated; it's not in the relcache so it
 	 * won't hurt.
 	 */
 read_failed:
 	pfree(rels);
 	FreeFile(fp);

 	return false;
 }

 /*
  * Write out a new initialization file with the current contents
  * of the relcache.
  */
 static void
 write_relcache_init_file(void)
 {
 	FILE	   *fp;
 	char		tempfilename[MAXPGPATH];
 	char		finalfilename[MAXPGPATH];
 	int			magic;
 	HASH_SEQ_STATUS status;
 	RelIdCacheEnt *idhentry;
 	MemoryContext oldcxt;
 	int			i;

 	/*
 	 * We must write a temporary file and rename it into place. Otherwise,
 	 * another backend starting at about the same time might crash trying to
 	 * read the partially-complete file.
 	 */
 	snprintf(tempfilename, sizeof(tempfilename), "%s/%s.%d",
 			 DatabasePath, RELCACHE_INIT_FILENAME, MyProcPid);
 	snprintf(finalfilename, sizeof(finalfilename), "%s/%s",
 			 DatabasePath, RELCACHE_INIT_FILENAME);

 	unlink(tempfilename);		/* in case it exists w/wrong permissions */

 	fp = AllocateFile(tempfilename, PG_BINARY_W);
 	if (fp == NULL)
 	{
 		/*
 		 * We used to consider this a fatal error, but we might as well
 		 * continue with backend startup ...
 		 */
 		ereport(WARNING,
 				(errcode_for_file_access(),
 				 errmsg("could not create relation-cache initialization file \"%s\": %m",
 						tempfilename),
 			  errdetail("Continuing anyway, but there's something wrong.")));
 		return;
 	}

 	/*
 	 * Write a magic number to serve as a file version identifier.	We can
 	 * change the magic number whenever the relcache layout changes.
 	 */
 	magic = RELCACHE_INIT_FILEMAGIC;
 	if (fwrite(&magic, 1, sizeof(magic), fp) != sizeof(magic))
 		elog(FATAL, "could not write init file");

 	/*
 	 * Write all the reldescs (in no particular order).
 	 */
 	hash_seq_init(&status, RelationIdCache);

 	initFileRelationIds = NIL;

 	while ((idhentry = (RelIdCacheEnt *) hash_seq_search(&status)) != NULL)
 	{
 		Relation	rel = idhentry->reldesc;
 		Form_pg_class relform = rel->rd_rel;

 		/* first write the relcache entry proper */
 		write_item(rel, sizeof(RelationData), fp);

 		/* next write the relation tuple form */
 		write_item(relform, CLASS_TUPLE_SIZE, fp);

 		/* next, do all the attribute tuple form data entries */
 		for (i = 0; i < relform->relnatts; i++)
 		{
 			write_item(rel->rd_att->attrs[i], ATTRIBUTE_FIXED_PART_SIZE, fp);
 		}

 		/* next, do the access method specific field */
 		write_item(rel->rd_options,
 				   (rel->rd_options ? VARSIZE(rel->rd_options) : 0),
 				   fp);

 		/* If it's an index, there's more to do */
 		if (rel->rd_rel->relkind == RELKIND_INDEX)
 		{
 			Form_pg_am	am = rel->rd_am;

 			/* write the pg_index tuple */
 			/* we assume this was created by heap_copytuple! */
 			write_item(rel->rd_indextuple,
 					   HEAPTUPLESIZE + rel->rd_indextuple->t_len,
 					   fp);

 			/* next, write the access method tuple form */
 			write_item(am, sizeof(FormData_pg_am), fp);

 			/* next, write the vector of operator OIDs */
 			write_item(rel->rd_operator,
 					   relform->relnatts * (am->amstrategies * sizeof(Oid)),
 					   fp);

 			/* finally, write the vector of support procedures */
 			write_item(rel->rd_support,
 				  relform->relnatts * (am->amsupport * sizeof(RegProcedure)),
 					   fp);
 		}

 		/* also make a list of their OIDs, for RelationIdIsInInitFile */
 		oldcxt = MemoryContextSwitchTo(CacheMemoryContext);
 		initFileRelationIds = lcons_oid(RelationGetRelid(rel),
 										initFileRelationIds);
 		MemoryContextSwitchTo(oldcxt);
 	}

 	if (FreeFile(fp))
 		elog(FATAL, "could not write init file");

 	/*
 	 * Now we have to check whether the data we've so painstakingly
 	 * accumulated is already obsolete due to someone else's just-committed
 	 * catalog changes.  If so, we just delete the temp file and leave it to
 	 * the next backend to try again.  (Our own relcache entries will be
 	 * updated by SI message processing, but we can't be sure whether what we
 	 * wrote out was up-to-date.)
 	 *
 	 * This mustn't run concurrently with RelationCacheInitFileInvalidate, so
 	 * grab a serialization lock for the duration.
 	 */
 	LWLockAcquire(RelCacheInitLock, LW_EXCLUSIVE);

 	/* Make sure we have seen all incoming SI messages */
 	AcceptInvalidationMessages();

 	/*
 	 * If we have received any SI relcache invals since backend start, assume
 	 * we may have written out-of-date data.
 	 */
 	if (relcacheInvalsReceived == 0L)
 	{
 		/*
 		 * OK, rename the temp file to its final name, deleting any
 		 * previously-existing init file.
 		 *
 		 * Note: a failure here is possible under Cygwin, if some other
 		 * backend is holding open an unlinked-but-not-yet-gone init file. So
 		 * treat this as a noncritical failure; just remove the useless temp
 		 * file on failure.
 		 */
 		if (rename(tempfilename, finalfilename) < 0)
 			unlink(tempfilename);
 	}
 	else
 	{
 		/* Delete the already-obsolete temp file */
 		unlink(tempfilename);
 	}

 	LWLockRelease(RelCacheInitLock);
 }

 /* write a chunk of data preceded by its length */
 static void
 write_item(const void *data, Size len, FILE *fp)
 {
 	if (fwrite(&len, 1, sizeof(len), fp) != sizeof(len))
 		elog(FATAL, "could not write init file");
 	if (fwrite(data, 1, len, fp) != len)
 		elog(FATAL, "could not write init file");
 }

 /*
  * Detect whether a given relation (identified by OID) is one of the ones
  * we store in the init file.
  *
  * Note that we effectively assume that all backends running in a database
  * would choose to store the same set of relations in the init file;
  * otherwise there are cases where we'd fail to detect the need for an init
  * file invalidation.  This does not seem likely to be a problem in practice.
  */
 bool
 RelationIdIsInInitFile(Oid relationId)
 {
 	return list_member_oid(initFileRelationIds, relationId);
 }

 /*
  * Invalidate (remove) the init file during commit of a transaction that
  * changed one or more of the relation cache entries that are kept in the
  * local init file.
  *
  * We actually need to remove the init file twice: once just before sending
  * the SI messages that include relcache inval for such relations, and once
  * just after sending them.  The unlink before ensures that a backend that's
  * currently starting cannot read the now-obsolete init file and then miss
  * the SI messages that will force it to update its relcache entries.  (This
  * works because the backend startup sequence gets into the PGPROC array before
  * trying to load the init file.)  The unlink after is to synchronize with a
  * backend that may currently be trying to write an init file based on data
  * that we've just rendered invalid.  Such a backend will see the SI messages,
  * but we can't leave the init file sitting around to fool later backends.
  *
  * Ignore any failure to unlink the file, since it might not be there if
  * no backend has been started since the last removal.
  *
  * Notice this deals only with the local init file, not the shared init file.
  * The reason is that there can never be a "significant" change to the
  * relcache entry of a shared relation; the most that could happen is
  * updates of noncritical fields such as relpages/reltuples.  So, while
  * it's worth updating the shared init file from time to time, it can never
  * be invalid enough to make it necessary to remove it.
  */
 void
 RelationCacheInitFileInvalidate(bool beforeSend)
 {
 	char		initfilename[MAXPGPATH];

 	snprintf(initfilename, sizeof(initfilename), "%s/%s",
 			 DatabasePath, RELCACHE_INIT_FILENAME);

 	if (beforeSend)
 	{
 		/* no interlock needed here */
 		unlink(initfilename);
 	}
 	else
 	{
 		/*
 		 * We need to interlock this against write_relcache_init_file, to
 		 * guard against possibility that someone renames a new-but-
 		 * already-obsolete init file into place just after we unlink. With
 		 * the interlock, it's certain that write_relcache_init_file will
 		 * notice our SI inval message before renaming into place, or else
 		 * that we will execute second and successfully unlink the file.
 		 */
 		LWLockAcquire(RelCacheInitLock, LW_EXCLUSIVE);
 		unlink(initfilename);
 		LWLockRelease(RelCacheInitLock);
 	}
 }

 /*
  * Remove the init file for a given database during postmaster startup.
  *
  * We used to keep the init file across restarts, but that is unsafe in PITR
  * scenarios, and even in simple crash-recovery cases there are windows for
  * the init file to become out-of-sync with the database.  So now we just
  * remove it during startup and expect the first backend launch to rebuild it.
  * Of course, this has to happen in each database of the cluster.  For
  * simplicity this is driven by flatfiles.c, which has to scan pg_database
  * anyway.
  */
 void
 RelationCacheInitFileRemove(const char *dbPath)
 {
 	char		initfilename[MAXPGPATH];

 	snprintf(initfilename, sizeof(initfilename), "%s/%s",
 			 dbPath, RELCACHE_INIT_FILENAME);
 	unlink(initfilename);
 	/* ignore any error, since it might not be there at all */
 }