src/backend/catalog/storage.c - cloudberry - Git at Google

 /*-------------------------------------------------------------------------
  *
  * storage.c
  *	  code to create and destroy physical storage for relations
  *
  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  *
  * IDENTIFICATION
  *	  src/backend/catalog/storage.c
  *
  * NOTES
  *	  Some of this code used to be in storage/smgr/smgr.c, and the
  *	  function names still reflect that.
  *
  *-------------------------------------------------------------------------
  */

 #include "postgres.h"

 #include "access/parallel.h"
 #include "access/visibilitymap.h"
 #include "access/xact.h"
 #include "access/xlog.h"
 #include "access/xloginsert.h"
 #include "access/xlogutils.h"
 #include "catalog/storage.h"
 #include "catalog/storage_directory_table.h"
 #include "catalog/storage_xlog.h"
 #include "common/relpath.h"
 #include "commands/dbcommands.h"
 #include "miscadmin.h"
 #include "storage/freespace.h"
 #include "storage/smgr.h"
 #include "utils/hsearch.h"
 #include "utils/memutils.h"
 #include "utils/rel.h"

 /* GUC variables */

 /*
  * wal_skip_threshold: size cutoff, in kilobytes, consulted by
  * smgrDoPendingSyncs() at commit.  Files whose total block count reaches the
  * cutoff are fsync'ed; smaller ones have their pages WAL-logged instead.
  */
 int			wal_skip_threshold = 2048;	/* in kilobytes */

 /*
  * We keep a list of all relations (represented as RelFileLocator values)
  * that have been created or deleted in the current transaction.  When
  * a relation is created, we create the physical file immediately, but
  * remember it so that we can delete the file again if the current
  * transaction is aborted.  Conversely, a deletion request is NOT
  * executed immediately, but is just entered in the list.  When and if
  * the transaction commits, we can delete the physical file.
  *
  * To handle subtransactions, every entry is marked with its transaction
  * nesting level.  At subtransaction commit, we reassign the subtransaction's
  * entries to the parent nesting level.  At subtransaction abort, we can
  * immediately execute the abort-time actions for all entries of the current
  * nesting level.
  *
  * NOTE: the list is kept in TopMemoryContext to be sure it won't disappear
  * unbetimes.  It'd probably be OK to keep it in TopTransactionContext,
  * but I'm being paranoid.
  */

 /*
  * One entry of pendingSyncHash: a relation file queued by AddPendingSync()
  * for sync (or WAL-logging) at commit.
  */
 typedef struct PendingRelSync
 {
 	RelFileLocator rlocator;	/* hash key (keysize is sizeof(RelFileLocator)) */
 	bool		is_truncated;	/* Has the file experienced truncation? */
 } PendingRelSync;

 static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */
 static HTAB *pendingSyncHash = NULL;	/* created lazily by AddPendingSync() */


 /*
  * destroy_pending_rel_delete callback for plain storage entries: the entry
  * is a single allocation, so releasing it is all there is to do.
  */
 static void
 StoargeDestroyPendingRelDelete(PendingRelDelete *reldelete)
 {
 	pfree(reldelete);
 }

 /*
  * do_pending_rel_delete callback for plain storage entries: open the
  * relation's smgr handle, unlink all of its forks, and close the handle.
  */
 static void
 StorageDoPendingRelDelete(PendingRelDelete *delete)
 {
 	BackendId	backend;
 	SMgrRelation reln;

 	/*
 	 * GPDB: temp relations are not tied to their creating backend, so the
 	 * backend of a relfile is either TempRelBackendId or InvalidBackendId.
 	 */
 	if (delete->rlocator.isTempRelation)
 		backend = TempRelBackendId;
 	else
 		backend = InvalidBackendId;

 	reln = smgropen(delete->rlocator.node, backend,
 					delete->rlocator.smgr_which, NULL);
 	smgrdounlinkall(&reln, 1, false);
 	smgrclose(reln);
 }

 /*
  * Action table attached to every pending-delete entry created by this file.
  * The flags opt these entries into RelationPreserveStorage(), into the list
  * returned by smgrGetPendingDeletes(), and into pending-sync filtering.
  */
 struct PendingRelDeleteAction storage_pending_rel_deletes_action = {
 	.do_pending_rel_delete = StorageDoPendingRelDelete,
 	.destroy_pending_rel_delete = StoargeDestroyPendingRelDelete,
 	.flags = PENDING_REL_DELETE_NEED_PRESERVE |
 		PENDING_REL_DELETE_NEED_XLOG |
 		PENDING_REL_DELETE_NEED_SYNC
 };


 /*
  * AddPendingSync
  *		Queue an at-commit fsync.
  */
 static void
 AddPendingSync(const RelFileLocator *rlocator)
 {
 	PendingRelSync *pending;
 	bool		found;

 	/* create the hash if not yet */
 	if (!pendingSyncHash)
 	{
 		HASHCTL		ctl;

 		ctl.keysize = sizeof(RelFileLocator);
 		ctl.entrysize = sizeof(PendingRelSync);
 		ctl.hcxt = TopTransactionContext;
 		pendingSyncHash = hash_create("pending sync hash", 16, &ctl,
 									  HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
 	}

 	pending = hash_search(pendingSyncHash, rlocator, HASH_ENTER, &found);
 	Assert(!found);
 	pending->is_truncated = false;
 }

 /*
  * RelationCreateStorage
  *		Create the on-disk main fork for a relation.
  *
  * Only the main fork is created here; other forks are created lazily by
  * whichever module needs them.
  *
  * The creation is transactional: it is WAL-logged for permanent relations,
  * and unless register_delete is false the new storage is queued for removal
  * should the transaction abort.
  *
  * Returns the SMgrRelation opened for the new storage.  An unrecognized
  * relpersistence value raises an error.
  */
 SMgrRelation
 RelationCreateStorage(RelFileLocator rlocator, char relpersistence, bool register_delete, SMgrImpl smgr_which, Relation rel)
 {
 	SMgrRelation reln;
 	BackendId	backend = InvalidBackendId;
 	bool		needs_wal = false;

 	Assert(!IsInParallelMode());	/* couldn't update pendingSyncHash */

 	/* Only temp relations get a backend; only permanent ones get WAL. */
 	if (relpersistence == RELPERSISTENCE_TEMP)
 		backend = BackendIdForTempRelations();
 	else if (relpersistence == RELPERSISTENCE_PERMANENT)
 		needs_wal = true;
 	else if (relpersistence != RELPERSISTENCE_UNLOGGED)
 	{
 		elog(ERROR, "invalid relpersistence: %c", relpersistence);
 		return NULL;		/* placate compiler */
 	}

 	reln = smgropen(rlocator, backend, smgr_which, rel);
 	smgrcreate(reln, MAIN_FORKNUM, false);

 	if (needs_wal)
 		log_smgrcreate(&reln->smgr_rlocator.locator, MAIN_FORKNUM, smgr_which);

 	/* Queue a delete-on-abort entry when the caller asked for one. */
 	if (register_delete)
 	{
 		PendingRelDelete *entry;

 		entry = (PendingRelDelete *)
 			MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
 		entry->action = &storage_pending_rel_deletes_action;
 		entry->rlocator.node = rlocator;
 		entry->rlocator.smgr_which = smgr_which;
 		entry->rlocator.isTempRelation = backend == TempRelBackendId;
 		entry->backend = backend;
 		entry->nestLevel = GetCurrentTransactionNestLevel();
 		entry->atCommit = false;	/* delete if abort */
 		RegisterPendingDelete(entry);
 	}

 	/*
 	 * A permanent relation created while WAL is not needed must be synced
 	 * (or WAL-logged) at commit instead.  needs_wal is true exactly for
 	 * permanent relations.
 	 */
 	if (needs_wal && !XLogIsNeeded())
 	{
 		Assert(backend == InvalidBackendId);
 		AddPendingSync(&rlocator);
 	}

 	return reln;
 }

 /*
  * log_smgrcreate
  *		Write an XLOG_SMGR_CREATE record describing the creation of one fork.
  *
  * The record carries the relation locator, the fork number and the storage
  * manager implementation.
  */
 void
 log_smgrcreate(const RelFileLocator *rlocator, ForkNumber forkNum, SMgrImpl impl)
 {
 	xl_smgr_create rec;

 	/* Fill in the record payload. */
 	rec.impl = impl;
 	rec.forkNum = forkNum;
 	rec.rlocator = *rlocator;

 	/* Hand it to the WAL machinery. */
 	XLogBeginInsert();
 	XLogRegisterData((char *) &rec, sizeof(rec));
 	XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE | XLR_SPECIAL_REL_UPDATE);
 }

 /*
  * RelationDropStorage
  *		Arrange for a relation's physical storage to be unlinked when the
  *		current transaction commits.
  *
  * Nothing is removed now; a delete-at-commit entry is queued and the
  * relation's smgr handle is closed.
  */
 void
 RelationDropStorage(Relation rel)
 {
 	PendingRelDelete *entry;

 	entry = (PendingRelDelete *)
 		MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));

 	entry->action = &storage_pending_rel_deletes_action;
 	entry->rlocator.node = rel->rd_locator;
 	entry->rlocator.smgr_which = smgr_get_impl(rel);
 	entry->rlocator.isTempRelation = rel->rd_backend == TempRelBackendId;
 	entry->backend = rel->rd_backend;
 	entry->nestLevel = GetCurrentTransactionNestLevel();
 	entry->atCommit = true;		/* delete if commit */
 	RegisterPendingDelete(entry);

 	/*
 	 * NOTE: a relation created earlier in this same transaction is now
 	 * listed twice, once as delete-on-abort and once as delete-on-commit.
 	 * Whichever way the transaction ends, one entry fires and
 	 * smgrDoPendingDeletes ignores the other, so the file goes away without
 	 * error.  Removing the older entry and unlinking at once would also
 	 * work, but the simple approach is kept.
 	 */

 	RelationCloseSmgr(rel);
 }

 /*
  * RelationPreserveStorage
  *		Mark a relation as not to be deleted after all.
  *
  * We need this function because relation mapping changes are committed
  * separately from commit of the whole transaction, so it's still possible
  * for the transaction to abort after the mapping update is done.
  * When a new physical relation is installed in the map, it would be
  * scheduled for delete-on-abort, so we'd delete it, and be in trouble.
  * The relation mapper fixes this by telling us to not delete such relations
  * after all as part of its commit.
  *
  * We also use this to reuse an old build of an index during ALTER TABLE, this
  * time removing the delete-at-commit entry.
  *
  * Every pending-delete entry whose action carries
  * PENDING_REL_DELETE_NEED_PRESERVE, whose locator equals rlocator and whose
  * atCommit flag equals the argument is unlinked from the list and freed.
  * Entries without that flag are never removed, but they must still be
  * tracked as the predecessor of later entries so that unlinking a later
  * match does not drop them from the list.
  *
  * No-op if the relation is not among those scheduled for deletion.
  */
 void
 RelationPreserveStorage(RelFileLocator rlocator, bool atCommit)
 {
 	PendingRelDelete *pending;
 	PendingRelDelete *prev;
 	PendingRelDelete *next;

 	prev = NULL;
 	for (pending = pendingDeletes; pending != NULL; pending = next)
 	{
 		next = pending->next;
 		Assert(pending->action);

 		if ((pending->action->flags & PENDING_REL_DELETE_NEED_PRESERVE) &&
 			RelFileLocatorEquals(rlocator, pending->rlocator.node) &&
 			pending->atCommit == atCommit)
 		{
 			/* unlink and delete list entry */
 			if (prev)
 				prev->next = next;
 			else
 				pendingDeletes = next;

 			/*
 			 * NOTE(review): other list walkers release entries through
 			 * action->destroy_pending_rel_delete; plain pfree is kept here
 			 * to preserve existing behavior -- confirm it is adequate for
 			 * every action that sets NEED_PRESERVE.
 			 */
 			pfree(pending);
 			/* prev does not change */
 		}
 		else
 		{
 			/*
 			 * Entry stays in the list (either unrelated or not subject to
 			 * preservation), so it becomes the predecessor of what follows.
 			 */
 			prev = pending;
 		}
 	}
 }

 /*
  * RelationTruncate
  *		Physically truncate a relation to the specified number of blocks.
  *
  * This includes getting rid of any buffers for the blocks that are to be
  * dropped.  The main fork is always truncated; the FSM and visibility map
  * forks are truncated as well when they exist and their prepare-truncate
  * routine returns a valid block number.
  *
  * The smgr handle is re-fetched with RelationGetSmgr() at each use instead
  * of being cached across calls, since a cached SMgrRelation pointer is not
  * guaranteed to stay valid.
  */
 void
 RelationTruncate(Relation rel, BlockNumber nblocks)
 {
 	bool		fsm;
 	bool		vm;
 	bool		need_fsm_vacuum = false;

 	/*
 	 * At most three entries (main, FSM, VM) are stored in these arrays.
 	 * NOTE(review): this assumes MAX_FORKNUM >= 3 -- confirm.
 	 */
 	ForkNumber	forks[MAX_FORKNUM];
 	BlockNumber old_blocks[MAX_FORKNUM];
 	BlockNumber blocks[MAX_FORKNUM];
 	int			nforks = 0;
 	SMgrRelation reln;

 	/*
 	 * Make sure smgr_targblock etc aren't pointing somewhere past new end.
 	 * (Note: don't rely on this reln pointer below this loop.)
 	 */
 	reln = RelationGetSmgr(rel);
 	reln->smgr_targblock = InvalidBlockNumber;
 	for (int i = 0; i <= MAX_FORKNUM; ++i)
 		reln->smgr_cached_nblocks[i] = InvalidBlockNumber;

 	/* Prepare for truncation of MAIN fork of the relation */
 	forks[nforks] = MAIN_FORKNUM;
 	old_blocks[nforks] = smgrnblocks(RelationGetSmgr(rel), MAIN_FORKNUM);
 	blocks[nforks] = nblocks;
 	nforks++;

 	/* Prepare for truncation of the FSM if it exists */
 	fsm = smgrexists(RelationGetSmgr(rel), FSM_FORKNUM);
 	if (fsm)
 	{
 		blocks[nforks] = FreeSpaceMapPrepareTruncateRel(rel, nblocks);
 		if (BlockNumberIsValid(blocks[nforks]))
 		{
 			forks[nforks] = FSM_FORKNUM;
 			old_blocks[nforks] = smgrnblocks(RelationGetSmgr(rel), FSM_FORKNUM);
 			nforks++;
 			need_fsm_vacuum = true;
 		}
 	}

 	/* Prepare for truncation of the visibility map too if it exists */
 	vm = smgrexists(RelationGetSmgr(rel), VISIBILITYMAP_FORKNUM);
 	if (vm)
 	{
 		blocks[nforks] = visibilitymap_prepare_truncate(rel, nblocks);
 		if (BlockNumberIsValid(blocks[nforks]))
 		{
 			forks[nforks] = VISIBILITYMAP_FORKNUM;
 			old_blocks[nforks] = smgrnblocks(RelationGetSmgr(rel),
 											 VISIBILITYMAP_FORKNUM);
 			nforks++;
 		}
 	}

 	/* Flag any pending-sync entry for this relation as truncated. */
 	RelationPreTruncate(rel);

 	/*
 	 * The code which follows can interact with concurrent checkpoints in two
 	 * separate ways.
 	 *
 	 * First, the truncation operation might drop buffers that the checkpoint
 	 * otherwise would have flushed. If it does, then it's essential that the
 	 * files actually get truncated on disk before the checkpoint record is
 	 * written. Otherwise, if replay begins from that checkpoint, the
 	 * to-be-truncated blocks might still exist on disk but have older
 	 * contents than expected, which can cause replay to fail. It's OK for the
 	 * blocks to not exist on disk at all, but not for them to have the wrong
 	 * contents. For this reason, we need to set DELAY_CHKPT_COMPLETE while
 	 * this code executes.
 	 *
 	 * Second, the call to smgrtruncate() below will in turn call
 	 * RegisterSyncRequest(). We need the sync request created by that call to
 	 * be processed before the checkpoint completes. CheckPointGuts() will
 	 * call ProcessSyncRequests(), but if we register our sync request after
 	 * that happens, then the WAL record for the truncation could end up
 	 * preceding the checkpoint record, while the actual sync doesn't happen
 	 * until the next checkpoint. To prevent that, we need to set
 	 * DELAY_CHKPT_START here. That way, if the XLOG_SMGR_TRUNCATE precedes
 	 * the redo pointer of a concurrent checkpoint, we're guaranteed that the
 	 * corresponding sync request will be processed before the checkpoint
 	 * completes.
 	 */
 	Assert((MyProc->delayChkptFlags & (DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE)) == 0);
 	MyProc->delayChkptFlags |= DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE;

 	/*
 	 * We WAL-log the truncation first and then truncate in a critical
 	 * section. Truncation drops buffers, even if dirty, and then truncates
 	 * disk files. All of that work needs to complete before the lock is
 	 * released, or else old versions of pages on disk that are missing recent
 	 * changes would become accessible again.  We'll try the whole operation
 	 * again in crash recovery if we panic, but even then we can't give up
 	 * because we don't want standbys' relation sizes to diverge and break
 	 * replay or visibility invariants downstream.  The critical section also
 	 * suppresses interrupts.
 	 *
 	 * (See also pg_visibilitymap.c if changing this code.)
 	 */
 	START_CRIT_SECTION();

 	if (RelationNeedsWAL(rel))
 	{
 		/*
 		 * Make an XLOG entry reporting the file truncation.
 		 */
 		XLogRecPtr	lsn;
 		xl_smgr_truncate xlrec;

 		xlrec.blkno = nblocks;
 		xlrec.rlocator = rel->rd_locator;
 		xlrec.flags = SMGR_TRUNCATE_ALL;

 		XLogBeginInsert();
 		XLogRegisterData((char *) &xlrec, sizeof(xlrec));

 		lsn = XLogInsert(RM_SMGR_ID,
 						 XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE);

 		/*
 		 * Flush, because otherwise the truncation of the main relation might
 		 * hit the disk before the WAL record, and the truncation of the FSM
 		 * or visibility map. If we crashed during that window, we'd be left
 		 * with a truncated heap, but the FSM or visibility map would still
 		 * contain entries for the non-existent heap pages, and standbys would
 		 * also never replay the truncation.
 		 */
 		XLogFlush(lsn);
 	}

 	/*
 	 * This will first remove any buffers from the buffer pool that should no
 	 * longer exist after truncation is complete, and then truncate the
 	 * corresponding files on disk.
 	 */
 	smgrtruncate2(RelationGetSmgr(rel), forks, nforks, old_blocks, blocks);

 	END_CRIT_SECTION();

 	/* We've done all the critical work, so checkpoints are OK now. */
 	MyProc->delayChkptFlags &= ~(DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE);

 	/*
 	 * Update upper-level FSM pages to account for the truncation. This is
 	 * important because the just-truncated pages were likely marked as
 	 * all-free, and would be preferentially selected.
 	 *
 	 * NB: There's no point in delaying checkpoints until this is done.
 	 * Because the FSM is not WAL-logged, we have to be prepared for the
 	 * possibility of corruption after a crash anyway.
 	 */
 	if (need_fsm_vacuum)
 		FreeSpaceMapVacuumRange(rel, nblocks, InvalidBlockNumber);
 }

 /*
  * RelationPreTruncate
  *		AM-independent bookkeeping to run before a physical truncation.
  *
  * If the relation has a pending-sync entry, flag it as truncated.  An access
  * method whose relation_nontransactional_truncate does not go through
  * RelationTruncate() must call this itself before shrinking the table.
  */
 void
 RelationPreTruncate(Relation rel)
 {
 	PendingRelSync *entry;

 	if (pendingSyncHash != NULL)
 	{
 		entry = hash_search(pendingSyncHash,
 							&(RelationGetSmgr(rel)->smgr_rlocator.locator),
 							HASH_FIND, NULL);
 		if (entry != NULL)
 			entry->is_truncated = true;
 	}
 }

 /*
  * RelationCopyStorage
  *		Copy one fork from src to dst, a block at a time.
  *
  * The source must have no dirty pages in shared buffers; callers that
  * cannot guarantee this should flush first, e.g. with
  * FlushRelationBuffers(rel).
  *
  * Callers commonly pass RelationGetSmgr(rel) directly as an argument.  That
  * is safe only because nothing but smgr and WAL operations happen here;
  * anything else could trigger a relcache flush and leave the SMgrRelation
  * argument dangling.
  *
  * An unreadable/invalid source page raises ERRCODE_DATA_CORRUPTED.
  */
 void
 RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
 					ForkNumber forkNum, char relpersistence)
 {
 	PGIOAlignedBlock pagebuf;
 	Page		page = (Page) pagebuf.data;
 	bool		is_initfork;
 	bool		wal_needed;
 	BlockNumber total;
 	BlockNumber cur;

 	/*
 	 * An unlogged relation's init fork is handled much like a permanent
 	 * relation: its changes are WAL-logged and it is synced to disk.
 	 */
 	is_initfork = (relpersistence == RELPERSISTENCE_UNLOGGED &&
 				   forkNum == INIT_FORKNUM);

 	/*
 	 * Pages are WAL-logged iff WAL archiving/streaming is enabled AND the
 	 * data is permanent (or an init fork).  Because the current operation
 	 * created the new storage, this matches
 	 * "RelationNeedsWAL(rel) || copying_initfork".
 	 */
 	wal_needed = XLogIsNeeded() &&
 		(relpersistence == RELPERSISTENCE_PERMANENT || is_initfork);

 	total = smgrnblocks(src, forkNum);

 	for (cur = 0; cur < total; cur++)
 	{
 		/* Allow the copy to be cancelled between blocks. */
 		CHECK_FOR_INTERRUPTS();

 		smgrread(src, forkNum, cur, pagebuf.data);

 		if (!PageIsVerifiedExtended(page, forkNum, cur,
 									PIV_LOG_WARNING | PIV_REPORT_STAT))
 			ereport(ERROR,
 					(errcode(ERRCODE_DATA_CORRUPTED),
 					 errmsg("invalid page in block %u of relation %s",
 							cur,
 							relpathbackend(src->smgr_rlocator.locator,
 										   src->smgr_rlocator.backend,
 										   forkNum))));

 		/*
 		 * The page type is unknown here, so the whole page, unused space
 		 * included, goes into WAL.
 		 */
 		if (wal_needed)
 			log_newpage(&dst->smgr_rlocator.locator, forkNum, cur, page, false);

 		PageEncryptInplace(page, forkNum, cur);
 		PageSetChecksumInplace(page, cur);

 		/*
 		 * Write with skipFsync = true: smgr need not queue an fsync because
 		 * one is issued explicitly after the loop.
 		 */
 		smgrextend(dst, forkNum, cur, pagebuf.data, true);
 	}

 	/*
 	 * Even WAL-logged pages need an fsync.  The copy bypasses shared
 	 * buffers, so a CHECKPOINT running meanwhile cannot flush what has been
 	 * written (it does not even know the new rel exists).  Replay after a
 	 * later crash would start from that checkpoint and skip our earlier WAL
 	 * records, so without this fsync the pages might not be on disk.
 	 */
 	if (wal_needed || is_initfork)
 		smgrimmedsync(dst, forkNum);
 }

 /*
  * RelFileLocatorSkippingWAL
  *		Report whether a BM_PERMANENT relfilelocator is currently skipping WAL.
  *
  * Some relations must not write WAL for their changes; see "Skipping WAL for
  * New RelFileLocator" in src/backend/access/transam/README.  A Relation
  * answers this cheaply, so this lookup exists for code paths that have only
  * the locator.  Returns true iff the locator has a pending-sync entry.
  */
 bool
 RelFileLocatorSkippingWAL(RelFileLocator rlocator)
 {
 	if (pendingSyncHash == NULL)
 		return false;

 	return hash_search(pendingSyncHash, &rlocator, HASH_FIND, NULL) != NULL;
 }

 /*
  * EstimatePendingSyncsSpace
  *		Estimate space needed to pass syncs to parallel workers.
  */
 Size
 EstimatePendingSyncsSpace(void)
 {
 	long		entries;

 	entries = pendingSyncHash ? hash_get_num_entries(pendingSyncHash) : 0;
 	return mul_size(1 + entries, sizeof(RelFileLocator));
 }

 /*
  * SerializePendingSyncs
  *		Serialize syncs for parallel workers.
  *
  * Writes to startAddress the RelFileLocator of every pending-sync entry that
  * is not cancelled by a pending delete, followed by an all-zeroes
  * RelFileLocator as terminator.  A pending delete cancels a sync when it is
  * a delete-at-commit entry or when its action lacks
  * PENDING_REL_DELETE_NEED_SYNC.
  *
  * maxSize is not consulted here.
  */
 void
 SerializePendingSyncs(Size maxSize, char *startAddress)
 {
 	HTAB	   *tmphash;
 	HASHCTL		ctl;
 	HASH_SEQ_STATUS scan;
 	PendingRelSync *sync;
 	PendingRelDelete *delete;
 	RelFileLocator *src;
 	RelFileLocator *dest = (RelFileLocator *) startAddress;

 	if (!pendingSyncHash)
 		goto terminate;

 	/* Create temporary hash to collect active relfilelocators */
 	ctl.keysize = sizeof(RelFileLocator);
 	ctl.entrysize = sizeof(RelFileLocator);
 	ctl.hcxt = CurrentMemoryContext;
 	tmphash = hash_create("tmp relfilelocators",
 						  hash_get_num_entries(pendingSyncHash), &ctl,
 						  HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);

 	/* collect all rlocator from pending syncs */
 	hash_seq_init(&scan, pendingSyncHash);
 	while ((sync = (PendingRelSync *) hash_seq_search(&scan)))
 		(void) hash_search(tmphash, &sync->rlocator, HASH_ENTER, NULL);

 	/* remove deleted rnodes */
 	for (delete = pendingDeletes; delete != NULL; delete = delete->next)
 	{
 		Assert(delete->action);

 		/*
 		 * The hash is keyed by RelFileLocator, so pass the locator member
 		 * itself rather than the enclosing pending-delete descriptor; that
 		 * keeps the lookup correct regardless of the descriptor's layout.
 		 */
 		if (delete->atCommit || !(delete->action->flags & PENDING_REL_DELETE_NEED_SYNC))
 			(void) hash_search(tmphash, &delete->rlocator.node,
 							   HASH_REMOVE, NULL);
 	}

 	/* copy the survivors into the caller's buffer */
 	hash_seq_init(&scan, tmphash);
 	while ((src = (RelFileLocator *) hash_seq_search(&scan)))
 		*dest++ = *src;

 	hash_destroy(tmphash);

 terminate:
 	/* zeroed locator marks the end of the list for RestorePendingSyncs() */
 	MemSet(dest, 0, sizeof(RelFileLocator));
 }

 /*
  * RestorePendingSyncs
  *		Rebuild the pending-sync table inside a parallel worker.
  *
  * Workers need RelationNeedsWAL() and RelFileLocatorSkippingWAL() to give
  * correct answers.  is_truncated is read only by smgrDoPendingSyncs() at end
  * of transaction, so it is not restored.
  *
  * The input is an array of RelFileLocators ending with an entry whose
  * relNumber is zero.
  */
 void
 RestorePendingSyncs(char *startAddress)
 {
 	RelFileLocator *cur = (RelFileLocator *) startAddress;

 	Assert(pendingSyncHash == NULL);

 	while (cur->relNumber != 0)
 	{
 		AddPendingSync(cur);
 		cur++;
 	}
 }

 /*
  * RegisterPendingDelete
  *		Push a fully initialized entry onto the head of the pending-delete
  *		list.  The entry must have its action set.
  */
 void
 RegisterPendingDelete(struct PendingRelDelete *pending)
 {
 	Assert(pending);
 	Assert(pending->action);

 	/* link in front of the current head */
 	pending->next = pendingDeletes;
 	pendingDeletes = pending;
 }

 /*
  *	smgrDoPendingDeletes() -- Process pending relation deletes at end of xact.
  *
  * Also used when a subtransaction aborts, so that a failed subxact is
  * cleaned up right away.  Only entries at the current nesting level or
  * deeper are handled; each is unlinked from the list, acted on if its
  * atCommit flag matches isCommit, and then destroyed.
  *
  * Note: the relation being removed may have no physical storage in any
  * fork, for instance an old temporary relation whose files
  * RemovePgTempFiles has already reclaimed.
  */
 void
 smgrDoPendingDeletes(bool isCommit)
 {
 	int			curLevel = GetCurrentTransactionNestLevel();
 	PendingRelDelete *entry;
 	PendingRelDelete *kept = NULL;	/* last entry left in the list */
 	PendingRelDelete *following;

 	for (entry = pendingDeletes; entry != NULL; entry = following)
 	{
 		following = entry->next;

 		/* entries of outer levels are not ours to process yet */
 		if (entry->nestLevel < curLevel)
 		{
 			kept = entry;
 			continue;
 		}

 		/* detach before acting, so a failure does not cause a retry */
 		if (kept != NULL)
 			kept->next = following;
 		else
 			pendingDeletes = following;

 		/* perform the deletion only when the outcome matches */
 		if (entry->atCommit == isCommit)
 		{
 			Assert(entry->action);
 			Assert(entry->action->do_pending_rel_delete);
 			entry->action->do_pending_rel_delete(entry);
 		}

 		/* the list entry itself is always released */
 		Assert(entry->action);
 		Assert(entry->action->destroy_pending_rel_delete);
 		entry->action->destroy_pending_rel_delete(entry);
 	}
 }

 /*
  *	smgrDoPendingSyncs() -- Take care of relation syncs at end of xact.
  *
  * On abort, or in a parallel worker, the pending-sync table is simply
  * forgotten.  On commit in a regular backend, each remaining entry is either
  * fsync'ed (if it was truncated or is at least wal_skip_threshold in size)
  * or has all of its pages WAL-logged.  Entries matching a delete-at-commit
  * pending delete, or a pending delete whose action lacks
  * PENDING_REL_DELETE_NEED_SYNC, are dropped first.
  */
 void
 smgrDoPendingSyncs(bool isCommit, bool isParallelWorker)
 {
 	PendingRelDelete *pending;
 	int			nrels = 0,
 				maxrels = 0;
 	SMgrRelation *srels = NULL;
 	HASH_SEQ_STATUS scan;
 	PendingRelSync *pendingsync;

 	Assert(GetCurrentTransactionNestLevel() == 1);

 	if (!pendingSyncHash)
 		return;					/* no relation needs sync */

 	/* Abort -- just throw away all pending syncs */
 	if (!isCommit)
 	{
 		pendingSyncHash = NULL;
 		return;
 	}

 	AssertPendingSyncs_RelationCache();

 	/* Parallel worker -- just throw away all pending syncs */
 	if (isParallelWorker)
 	{
 		pendingSyncHash = NULL;
 		return;
 	}

 	/*
 	 * Skip syncing nodes that smgrDoPendingDeletes() will delete. Also skip
 	 * the no need sync pending delete item.
 	 *
 	 * The hash is keyed by RelFileLocator, so the lookup key is the locator
 	 * member of the pending-delete descriptor, not the descriptor itself.
 	 */
 	for (pending = pendingDeletes; pending != NULL; pending = pending->next)
 	{
 		Assert(pending->action);
 		if (pending->atCommit || !(pending->action->flags & PENDING_REL_DELETE_NEED_SYNC))
 			(void) hash_search(pendingSyncHash, &pending->rlocator.node,
 							   HASH_REMOVE, NULL);
 	}

 	hash_seq_init(&scan, pendingSyncHash);
 	while ((pendingsync = (PendingRelSync *) hash_seq_search(&scan)))
 	{
 		ForkNumber	fork;
 		BlockNumber nblocks[MAX_FORKNUM + 1];
 		uint64		total_blocks = 0;
 		SMgrRelation srel;

 		srel = smgropen(pendingsync->rlocator, InvalidBackendId, SMGR_MD, NULL);

 		/*
 		 * We emit newpage WAL records for smaller relations.
 		 *
 		 * Small WAL records have a chance to be emitted along with other
 		 * backends' WAL records.  We emit WAL records instead of syncing for
 		 * files that are smaller than a certain threshold, expecting faster
 		 * commit.  The threshold is defined by the GUC wal_skip_threshold.
 		 *
 		 * nblocks[] is filled in only for non-truncated files; truncated
 		 * files always take the sync path below and never read it.
 		 */
 		if (!pendingsync->is_truncated)
 		{
 			for (fork = 0; fork <= MAX_FORKNUM; fork++)
 			{
 				if (smgrexists(srel, fork))
 				{
 					BlockNumber n = smgrnblocks(srel, fork);

 					/* we shouldn't come here for unlogged relations */
 					Assert(fork != INIT_FORKNUM);
 					nblocks[fork] = n;
 					total_blocks += n;
 				}
 				else
 					nblocks[fork] = InvalidBlockNumber;
 			}
 		}

 		/*
 		 * Sync file or emit WAL records for its contents.
 		 *
 		 * Although we emit WAL record if the file is small enough, do file
 		 * sync regardless of the size if the file has experienced a
 		 * truncation. It is because the file would be followed by trailing
 		 * garbage blocks after a crash recovery if, while a past longer file
 		 * had been flushed out, we omitted syncing-out of the file and
 		 * emitted WAL instead.  You might think that we could choose WAL if
 		 * the current main fork is longer than ever, but there's a case where
 		 * main fork is longer than ever but FSM fork gets shorter.
 		 */
 		if (pendingsync->is_truncated ||
 			total_blocks >= wal_skip_threshold * (uint64) 1024 / BLCKSZ)
 		{
 			/* allocate the initial array, or extend it, if needed */
 			if (maxrels == 0)
 			{
 				maxrels = 8;
 				srels = palloc(sizeof(SMgrRelation) * maxrels);
 			}
 			else if (maxrels <= nrels)
 			{
 				maxrels *= 2;
 				srels = repalloc(srels, sizeof(SMgrRelation) * maxrels);
 			}

 			srels[nrels++] = srel;
 		}
 		else
 		{
 			/* Emit WAL records for all blocks.  The file is small enough. */
 			for (fork = 0; fork <= MAX_FORKNUM; fork++)
 			{
 				BlockNumber n = nblocks[fork];
 				Relation	rel;

 				/* fork does not exist */
 				if (!BlockNumberIsValid(n))
 					continue;

 				/*
 				 * Emit WAL for the whole file.  Unfortunately we don't know
 				 * what kind of a page this is, so we have to log the full
 				 * page including any unused space.  ReadBufferExtended()
 				 * counts some pgstat events; unfortunately, we discard them.
 				 */
 				rel = CreateFakeRelcacheEntry(srel->smgr_rlocator.locator);
 				log_newpage_range(rel, fork, 0, n, false);
 				FreeFakeRelcacheEntry(rel);
 			}
 		}
 	}

 	/* the table is not needed any more; forget it */
 	pendingSyncHash = NULL;

 	if (nrels > 0)
 	{
 		smgrdosyncall(srels, nrels);
 		pfree(srels);
 	}
 }

 /*
  * smgrGetPendingDeletes() -- Get a list of non-temp relations to be deleted.
  *
  * The return value is the number of relations scheduled for termination.
  * *ptr is set to point to a freshly-palloc'd array of RelFileLocators.
  * If there are no relations to be deleted, *ptr is set to NULL.
  *
  * Only non-temporary relations are included in the returned list.  This is OK
  * because the list is used only in contexts where temporary relations don't
  * matter: we're either writing to the two-phase state file (and transactions
  * that have touched temp tables can't be prepared) or we're writing to xlog
  * (and all temporary files will be zapped if we restart anyway, so no need
  * for redo to do it also).
  *
  * Note that the list does not include anything scheduled for termination
  * by upper-level transactions.
  *
  * Cloudberry-specific notes: We *do* include temporary relations in the returned
  * list. Because unlike in Upstream Postgres, Cloudberry two-phase commits can
  * involve temporary tables, which necessitates including the temporary
  * relations in the two-phase state files (PREPARE xlog record). Otherwise the
  * relation files won't get unlink(2)'d, or the shared buffers won't be
  * dropped at the end of COMMIT phase.
  */
 int
 smgrGetPendingDeletes(bool forCommit, RelFileNodePendingDelete **ptr)
 {
 	int			nestLevel = GetCurrentTransactionNestLevel();
 	int			nrels;
 	RelFileNodePendingDelete *rptr;
 	PendingRelDelete *pending;

 	nrels = 0;
 	for (pending = pendingDeletes; pending != NULL; pending = pending->next)
 	{
 		Assert(pending->action);
 		if (!(pending->action->flags & PENDING_REL_DELETE_NEED_XLOG))
 		{
 			/* should not reocrd xlog expect pg relation */
 			continue;
 		}

 		if (pending->nestLevel >= nestLevel && pending->atCommit == forCommit
 			/*
 			 * Cloudberry allows transactions that access temporary tables to be
 			 * prepared.
 			 */
 			/* && pending->relnode.backend == InvalidBackendId) */
 				)
 			nrels++;
 	}
 	if (nrels == 0)
 	{
 		*ptr = NULL;
 		return 0;
 	}
 	rptr = (RelFileNodePendingDelete *) palloc(nrels * sizeof(RelFileNodePendingDelete));
 	*ptr = rptr;
 	for (pending = pendingDeletes; pending != NULL; pending = pending->next)
 	{
 		Assert(pending->action);
 		if (!(pending->action->flags & PENDING_REL_DELETE_NEED_XLOG))
 		{
 			continue;
 		}

 		if (pending->nestLevel >= nestLevel && pending->atCommit == forCommit
 			/*
 			 * Keep this loop condition identical to above
 			 */
 			/* && pending->relnode.backend == InvalidBackendId) */
 				)
 		{
 			*rptr = pending->rlocator;
 			rptr++;
 		}
 	}
 	return nrels;
 }

 /*
  *	PostPrepare_smgr -- Clean up after a successful PREPARE
  *
  * The pending relation deletes have been written to the 2PC state file, so
  * the in-memory copies are no longer smgr's concern and are discarded
  * without being executed.  Entries whose action is flagged
  * PENDING_REL_DELETE_NEED_DROP_DELAY_DELETE are the exception: they stay in
  * the list.
  */
 void
 PostPrepare_smgr(void)
 {
 	PendingRelDelete *entry;
 	PendingRelDelete *kept = NULL;	/* last entry left in the list */
 	PendingRelDelete *following;

 	for (entry = pendingDeletes; entry != NULL; entry = following)
 	{
 		following = entry->next;

 		Assert(entry->action);

 		/* delay-delete entries are not processed yet */
 		if (entry->action->flags & PENDING_REL_DELETE_NEED_DROP_DELAY_DELETE)
 		{
 			kept = entry;
 			continue;
 		}

 		/* detach first, so a failure does not cause a retry */
 		if (kept != NULL)
 			kept->next = following;
 		else
 			pendingDeletes = following;

 		/* release the entry; no storage is touched */
 		Assert(entry->action->destroy_pending_rel_delete);
 		entry->action->destroy_pending_rel_delete(entry);
 	}
 }

 /*
  * AtSubCommit_smgr() --- Handle subtransaction commit.
  *
  * Every pending-delete entry owned by the committing subtransaction (or by
  * anything nested deeper) is handed over to the parent nesting level.
  */
 void
 AtSubCommit_smgr(void)
 {
 	int			curLevel = GetCurrentTransactionNestLevel();
 	int			parentLevel = curLevel - 1;
 	PendingRelDelete *entry = pendingDeletes;

 	while (entry != NULL)
 	{
 		if (entry->nestLevel >= curLevel)
 			entry->nestLevel = parentLevel;
 		entry = entry->next;
 	}
 }

 /*
  * AtSubAbort_smgr() --- Handle subtransaction abort.
  *
  * Relations the subtransaction created are removed and its requested drops
  * are forgotten.  This can be done on the spot, since the subtransaction is
  * known not to commit.
  */
 void
 AtSubAbort_smgr(void)
 {
 	/* run the abort-side actions for the current nesting level */
 	smgrDoPendingDeletes(false);
 }

 /*
  * smgr_redo() --- Replay a storage-manager WAL record.
  *
  * Two op codes are understood:
  *
  *	XLOG_SMGR_CREATE	create the fork named in the record for the
  *						relation named in the record.
  *	XLOG_SMGR_TRUNCATE	truncate the MAIN, FSM and/or VM forks of the
  *						relation, as selected by the record's flags.
  *
  * Any other op code is reported with elog(PANIC).
  */
 void
 smgr_redo(XLogReaderState *record)
 {
 	XLogRecPtr	lsn = record->EndRecPtr;
 	/* mask off the XLR_INFO_MASK bits; the remainder is compared as op code */
 	uint8		info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;

 	/* Backup blocks are not used in smgr records */
 	Assert(!XLogRecHasAnyBlockRefs(record));

 	if (info == XLOG_SMGR_CREATE)
 	{
 		xl_smgr_create *xlrec = (xl_smgr_create *) XLogRecGetData(record);
 		SMgrRelation reln;

 		/* open with the smgr implementation that the record itself carries */
 		reln = smgropen(xlrec->rlocator, InvalidBackendId, xlrec->impl, NULL);
 		smgrcreate(reln, xlrec->forkNum, true);
 	}
 	else if (info == XLOG_SMGR_TRUNCATE)
 	{
 		xl_smgr_truncate *xlrec = (xl_smgr_truncate *) XLogRecGetData(record);
 		SMgrRelation reln;
 		Relation	rel;

 		/*
 		 * forks[], blocks[] and old_blocks[] are parallel arrays: slot i
 		 * holds the fork to truncate, its new length, and its length as
 		 * measured here before truncation.  nforks counts the filled slots.
 		 */
 		ForkNumber	forks[MAX_FORKNUM];
 		BlockNumber blocks[MAX_FORKNUM];
 		BlockNumber old_blocks[MAX_FORKNUM];
 		int			nforks = 0;
 		bool		need_fsm_vacuum = false;

 		/*
 		 * AO-specific implementation of SMGR is not needed because truncate
 		 * for AO takes a different code path, it does not involve emitting
 		 * SMGR_TRUNCATE WAL record.
 		 */
 		reln = smgropen(xlrec->rlocator, InvalidBackendId, SMGR_MD, NULL);

 		/*
 		 * Forcibly create relation if it doesn't exist (which suggests that
 		 * it was dropped somewhere later in the WAL sequence).  As in
 		 * XLogReadBufferForRedo, we prefer to recreate the rel and replay the
 		 * log as best we can until the drop is seen.
 		 */
 		smgrcreate(reln, MAIN_FORKNUM, true);

 		/*
 		 * Before we perform the truncation, update minimum recovery point to
 		 * cover this WAL record. Once the relation is truncated, there's no
 		 * going back. The buffer manager enforces the WAL-first rule for
 		 * normal updates to relation files, so that the minimum recovery
 		 * point is always updated before the corresponding change in the data
 		 * file is flushed to disk. We have to do the same manually here.
 		 *
 		 * Doing this before the truncation means that if the truncation fails
 		 * for some reason, you cannot start up the system even after restart,
 		 * until you fix the underlying situation so that the truncation will
 		 * succeed. Alternatively, we could update the minimum recovery point
 		 * after truncation, but that would leave a small window where the
 		 * WAL-first rule could be violated.
 		 */
 		XLogFlush(lsn);

 		/* Prepare for truncation of MAIN fork */
 		if ((xlrec->flags & SMGR_TRUNCATE_HEAP) != 0)
 		{
 			forks[nforks] = MAIN_FORKNUM;
 			old_blocks[nforks] = smgrnblocks(reln, MAIN_FORKNUM);
 			blocks[nforks] = xlrec->blkno;
 			nforks++;

 			/* Also tell xlogutils.c about it */
 			XLogTruncateRelation(xlrec->rlocator, MAIN_FORKNUM, xlrec->blkno);
 		}

 		/*
 		 * Prepare for truncation of FSM and VM too.  The FSM/VM helpers below
 		 * take a Relation, so build a fake relcache entry for them; it is
 		 * released at the end of this branch.
 		 */
 		rel = CreateFakeRelcacheEntry(xlrec->rlocator);

 		/*
 		 * For FSM and VM, blocks[nforks] is written tentatively.  The slot is
 		 * only committed (nforks advanced) when the helper returned a valid
 		 * block number; otherwise the slot is left to be overwritten or
 		 * ignored and that fork is not truncated.
 		 */
 		if ((xlrec->flags & SMGR_TRUNCATE_FSM) != 0 &&
 			smgrexists(reln, FSM_FORKNUM))
 		{
 			blocks[nforks] = FreeSpaceMapPrepareTruncateRel(rel, xlrec->blkno);
 			if (BlockNumberIsValid(blocks[nforks]))
 			{
 				forks[nforks] = FSM_FORKNUM;
 				old_blocks[nforks] = smgrnblocks(reln, FSM_FORKNUM);
 				nforks++;
 				need_fsm_vacuum = true;
 			}
 		}
 		if ((xlrec->flags & SMGR_TRUNCATE_VM) != 0 &&
 			smgrexists(reln, VISIBILITYMAP_FORKNUM))
 		{
 			blocks[nforks] = visibilitymap_prepare_truncate(rel, xlrec->blkno);
 			if (BlockNumberIsValid(blocks[nforks]))
 			{
 				forks[nforks] = VISIBILITYMAP_FORKNUM;
 				old_blocks[nforks] = smgrnblocks(reln, VISIBILITYMAP_FORKNUM);
 				nforks++;
 			}
 		}

 		/*
 		 * Do the real work to truncate relation forks.
 		 *
 		 * NOTE(review): the critical section presumably exists so that an
 		 * error part-way through the truncation is escalated instead of
 		 * leaving the forks in a half-truncated state -- confirm against
 		 * smgrtruncate2().
 		 */
 		if (nforks > 0)
 		{
 			START_CRIT_SECTION();
 			smgrtruncate2(reln, forks, nforks, old_blocks, blocks);
 			END_CRIT_SECTION();
 		}

 		/*
 		 * Update upper-level FSM pages to account for the truncation. This is
 		 * important because the just-truncated pages were likely marked as
 		 * all-free, and would be preferentially selected.
 		 */
 		if (need_fsm_vacuum)
 			FreeSpaceMapVacuumRange(rel, xlrec->blkno,
 									InvalidBlockNumber);

 		FreeFakeRelcacheEntry(rel);
 	}
 	else
 		elog(PANIC, "smgr_redo: unknown op code %u", info);
 }