/*-------------------------------------------------------------------------
 *
 * sync.c
 *	  File synchronization management code.
 *
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/storage/sync/sync.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <unistd.h>
#include <fcntl.h>
#include <sys/file.h>

#include "access/clog.h"
#include "access/commit_ts.h"
#include "access/distributedlog.h"
#include "access/multixact.h"
#include "access/xlog.h"
#include "access/xlogutils.h"
#include "commands/tablespace.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "portability/instr_time.h"
#include "postmaster/bgwriter.h"
#include "storage/bufmgr.h"
#include "storage/ipc.h"
#include "storage/latch.h"
#include "storage/md.h"
#include "utils/hsearch.h"
#include "utils/inval.h"
#include "utils/memutils.h"

#include "utils/faultinjector.h"

/*
 * In some contexts (currently, standalone backends and the checkpointer)
 * we keep track of pending fsync operations: we need to remember all relation
 * segments that have been written since the last checkpoint, so that we can
 * fsync them down to disk before completing the next checkpoint.  This hash
 * table remembers the pending operations.  We use a hash table mostly as
 * a convenient way of merging duplicate requests.
 *
 * We use a similar mechanism to remember no-longer-needed files that can
 * be deleted after the next checkpoint, but we use a linked list instead of
 * a hash table, because we don't expect there to be any duplicate requests.
 *
 * These mechanisms are only used for non-temp relations; we never fsync
 * temp rels, nor do we need to postpone their deletion (see comments in
 * mdunlink).
 *
 * (Regular backends do not track pending operations locally, but forward
 * them to the checkpointer.)
 */
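
/*
 * As a rough illustration of the flow described above (a sketch, not a
 * definitive call chain): a regular backend that writes out a dirty
 * relation segment registers the segment for fsync rather than syncing it
 * itself, and the checkpointer absorbs that request into pendingOps until
 * the next checkpoint processes it.
 *
 *	backend:       RegisterSyncRequest(&tag, SYNC_REQUEST, true);
 *	                 -> ForwardSyncRequest() puts it in the shared queue
 *	checkpointer:  AbsorbSyncRequests()
 *	                 -> RememberSyncRequest() enters it into pendingOps
 *	checkpoint:    ProcessSyncRequests() fsyncs every non-canceled entry
 *
 * Unlink requests follow the same route with SYNC_UNLINK_REQUEST, but land
 * in pendingUnlinks and are acted on by SyncPostCheckpoint().
 */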
typedef uint16 CycleCtr;		/* can be any convenient integer size */

typedef struct
{
	FileTag		tag;			/* identifies handler and file */
	CycleCtr	cycle_ctr;		/* sync_cycle_ctr of oldest request */
	bool		canceled;		/* canceled is true if we canceled "recently" */
} PendingFsyncEntry;

typedef struct
{
	FileTag		tag;			/* identifies handler and file */
	CycleCtr	cycle_ctr;		/* checkpoint_cycle_ctr when request was made */
	bool		canceled;		/* true if request has been canceled */
} PendingUnlinkEntry;

static HTAB *pendingOps = NULL;
static List *pendingUnlinks = NIL;
static MemoryContext pendingOpsCxt; /* context for the above */

static CycleCtr sync_cycle_ctr = 0;
static CycleCtr checkpoint_cycle_ctr = 0;

/* Intervals for calling AbsorbSyncRequests */
#define FSYNCS_PER_ABSORB		10
#define UNLINKS_PER_ABSORB		10

/*
 * Function pointers for handling sync and unlink requests.
 */
typedef struct SyncOps
{
	int			(*sync_syncfiletag) (const FileTag *ftag, char *path);
	int			(*sync_unlinkfiletag) (const FileTag *ftag, char *path);
	bool		(*sync_filetagmatches) (const FileTag *ftag,
										const FileTag *candidate);
} SyncOps;

/*
 * These indexes must correspond to the values of the SyncRequestHandler enum.
 */
static const SyncOps syncsw[] = {
	/* magnetic disk */
	[SYNC_HANDLER_MD] = {
		.sync_syncfiletag = mdsyncfiletag,
		.sync_unlinkfiletag = mdunlinkfiletag,
		.sync_filetagmatches = mdfiletagmatches
	},
	/* pg_xact */
	[SYNC_HANDLER_CLOG] = {
		.sync_syncfiletag = clogsyncfiletag
	},
	/* pg_commit_ts */
	[SYNC_HANDLER_COMMIT_TS] = {
		.sync_syncfiletag = committssyncfiletag
	},
	/* pg_multixact/offsets */
	[SYNC_HANDLER_MULTIXACT_OFFSET] = {
		.sync_syncfiletag = multixactoffsetssyncfiletag
	},
	/* pg_multixact/members */
	[SYNC_HANDLER_MULTIXACT_MEMBER] = {
		.sync_syncfiletag = multixactmemberssyncfiletag
	},
	/* append-optimized storage */
	[SYNC_HANDLER_AO] = {
		.sync_syncfiletag = aosyncfiletag,
		.sync_unlinkfiletag = mdunlinkfiletag,
		.sync_filetagmatches = mdfiletagmatches
	},
	[SYNC_HANDLER_DISTRIBUTED_LOG] = {
		.sync_syncfiletag = DistributedLog_syncfiletag
	}
};
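
/*
 * Hypothetical sketch (not part of the tree) of what wiring up an additional
 * handler would look like, assuming a new SYNC_HANDLER_FOO value were added
 * to the SyncRequestHandler enum in sync.h and a callback with the
 * sync_syncfiletag signature existed:
 *
 *	[SYNC_HANDLER_FOO] = {
 *		.sync_syncfiletag = foosyncfiletag
 *	}
 *
 * Only sync_syncfiletag is required for fsync-only handlers (as with the
 * SLRU handlers above); sync_unlinkfiletag and sync_filetagmatches are
 * needed only if the handler forwards SYNC_UNLINK_REQUEST or
 * SYNC_FILTER_REQUEST, respectively.
 */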

/*
 * Initialize data structures for the file sync tracking.
 */
void
InitSync(void)
{
	/*
	 * Create pending-operations hashtable if we need it.  Currently, we need
	 * it if we are standalone (not under a postmaster) or if we are a startup
	 * or checkpointer auxiliary process.
	 */
	if (!IsUnderPostmaster || AmStartupProcess() || AmCheckpointerProcess())
	{
		HASHCTL		hash_ctl;

		/*
		 * XXX: The checkpointer needs to add entries to the pending ops table
		 * when absorbing fsync requests.  That is done within a critical
		 * section, which isn't usually allowed, but we make an exception.  It
		 * means that there's a theoretical possibility that you run out of
		 * memory while absorbing fsync requests, which leads to a PANIC.
		 * Fortunately the hash table is small so that's unlikely to happen in
		 * practice.
		 */
		pendingOpsCxt = AllocSetContextCreate(TopMemoryContext,
											  "Pending ops context",
											  ALLOCSET_DEFAULT_SIZES);
		MemoryContextAllowInCriticalSection(pendingOpsCxt, true);

		hash_ctl.keysize = sizeof(FileTag);
		hash_ctl.entrysize = sizeof(PendingFsyncEntry);
		hash_ctl.hcxt = pendingOpsCxt;
		pendingOps = hash_create("Pending Ops Table",
								 100L,
								 &hash_ctl,
								 HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
		pendingUnlinks = NIL;
	}
}

/*
 * SyncPreCheckpoint() -- Do pre-checkpoint work
 *
 * To distinguish unlink requests that arrived before this checkpoint
 * started from those that arrived during the checkpoint, we use a cycle
 * counter similar to the one we use for fsync requests.  That cycle
 * counter is incremented here.
 *
 * This must be called *before* the checkpoint REDO point is determined.
 * That ensures that we won't delete files too soon.  Since this calls
 * AbsorbSyncRequests(), which performs memory allocations, it cannot be
 * called within a critical section.
 *
 * Note that we can't do anything here that depends on the assumption
 * that the checkpoint will be completed.
 */
void
SyncPreCheckpoint(void)
{
	/*
	 * Operations such as DROP TABLESPACE assume that the next checkpoint will
	 * process all recently forwarded unlink requests, but if they aren't
	 * absorbed prior to advancing the cycle counter, they won't be processed
	 * until a future checkpoint.  The following absorb ensures that any
	 * unlink requests forwarded before the checkpoint began will be processed
	 * in the current checkpoint.
	 */
	AbsorbSyncRequests();

	/*
	 * Any unlink requests arriving after this point will be assigned the next
	 * cycle counter, and won't be unlinked until next checkpoint.
	 */
	checkpoint_cycle_ctr++;
}
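
/*
 * For orientation, the checkpoint code is expected to drive these hooks
 * roughly as follows (a sketch of the intended ordering, not an exact
 * transcript of CreateCheckPoint()):
 *
 *	SyncPreCheckpoint();        before the REDO point is determined
 *	... write out dirty buffers and SLRUs ...
 *	ProcessSyncRequests();      fsync everything queued so far
 *	... write the checkpoint record ...
 *	SyncPostCheckpoint();       now the unlinks are safe to perform
 *
 * Deleting files only after the checkpoint record is safely on disk is what
 * lets SyncPostCheckpoint() remove them without risking that crash recovery
 * still needs their contents.
 */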

/*
 * SyncPostCheckpoint() -- Do post-checkpoint work
 *
 * Remove any lingering files that can now be safely removed.
 */
void
SyncPostCheckpoint(void)
{
	int			absorb_counter;
	ListCell   *lc;

	absorb_counter = UNLINKS_PER_ABSORB;
	foreach(lc, pendingUnlinks)
	{
		PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(lc);
		char		path[MAXPGPATH];

		/* Skip over any canceled entries */
		if (entry->canceled)
			continue;

		/*
		 * New entries are appended to the end, so if the entry is new we've
		 * reached the end of old entries.
		 *
		 * Note: if just the right number of consecutive checkpoints fail, we
		 * could be fooled here by cycle_ctr wraparound.  However, the only
		 * consequence is that we'd delay unlinking for one more checkpoint,
		 * which is perfectly tolerable.
		 */
		if (entry->cycle_ctr == checkpoint_cycle_ctr)
			break;

		/* Unlink the file */
		if (syncsw[entry->tag.handler].sync_unlinkfiletag(&entry->tag,
														  path) < 0)
		{
			/*
			 * There's a race condition, when the database is dropped at the
			 * same time that we process the pending unlink requests.  If the
			 * DROP DATABASE deletes the file before we do, we will get ENOENT
			 * here.  rmtree() also has to ignore ENOENT errors, to deal with
			 * the possibility that we delete the file first.
			 */
			if (errno != ENOENT)
				ereport(WARNING,
						(errcode_for_file_access(),
						 errmsg("could not remove file \"%s\": %m", path)));
		}

		/* Mark the list entry as canceled, just in case */
		entry->canceled = true;

		/*
		 * As in ProcessSyncRequests, we don't want to stop absorbing fsync
		 * requests for a long time when there are many deletions to be done.
		 * We can safely call AbsorbSyncRequests() at this point in the loop.
		 */
		if (--absorb_counter <= 0)
		{
			AbsorbSyncRequests();
			absorb_counter = UNLINKS_PER_ABSORB;
		}
	}

	/*
	 * If we reached the end of the list, we can just remove the whole list
	 * (remembering to pfree all the PendingUnlinkEntry objects).  Otherwise,
	 * we must keep the entries at or after "lc".
	 */
	if (lc == NULL)
	{
		list_free_deep(pendingUnlinks);
		pendingUnlinks = NIL;
	}
	else
	{
		int			ntodelete = list_cell_number(pendingUnlinks, lc);

		for (int i = 0; i < ntodelete; i++)
			pfree(list_nth(pendingUnlinks, i));

		pendingUnlinks = list_delete_first_n(pendingUnlinks, ntodelete);
	}
}

/*
 *	ProcessSyncRequests() -- Process queued fsync requests.
 */
void
ProcessSyncRequests(void)
{
	static bool sync_in_progress = false;

	HASH_SEQ_STATUS hstat;
	PendingFsyncEntry *entry;
	int			absorb_counter;

	/* Statistics on sync times */
	int			processed = 0;
	instr_time	sync_start,
				sync_end,
				sync_diff;
	uint64		elapsed;
	uint64		longest = 0;
	uint64		total_elapsed = 0;

	/*
	 * This is only called during checkpoints, and checkpoints should only
	 * occur in processes that have created a pendingOps.
	 */
	if (!pendingOps)
		elog(ERROR, "cannot sync without a pendingOps table");

	/*
	 * If we are in the checkpointer, the sync had better include all fsync
	 * requests that were queued by backends up to this point.  The tightest
	 * race condition that could occur is that a buffer that must be written
	 * and fsync'd for the checkpoint could have been dumped by a backend just
	 * before it was visited by BufferSync().  We know the backend will have
	 * queued an fsync request before clearing the buffer's dirtybit, so we
	 * are safe as long as we do an Absorb after completing BufferSync().
	 */
	AbsorbSyncRequests();

	/*
	 * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
	 * checkpoint), we want to ignore fsync requests that are entered into the
	 * hashtable after this point --- they should be processed next time,
	 * instead.  We use sync_cycle_ctr to tell old entries apart from new
	 * ones: new ones will have cycle_ctr equal to the incremented value of
	 * sync_cycle_ctr.
	 *
	 * In normal circumstances, all entries present in the table at this point
	 * will have cycle_ctr exactly equal to the current (about to be old)
	 * value of sync_cycle_ctr.  However, if we fail partway through the
	 * fsync'ing loop, then older values of cycle_ctr might remain when we
	 * come back here to try again.  Repeated checkpoint failures would
	 * eventually wrap the counter around to the point where an old entry
	 * might appear new, causing us to skip it, possibly allowing a checkpoint
	 * to succeed that should not have.  To forestall wraparound, any time the
	 * previous ProcessSyncRequests() failed to complete, run through the
	 * table and forcibly set cycle_ctr = sync_cycle_ctr.
	 *
	 * Think not to merge this loop with the main loop, as the problem is
	 * exactly that that loop may fail before having visited all the entries.
	 * From a performance point of view it doesn't matter anyway, as this path
	 * will never be taken in a system that's functioning normally.
	 */
	if (sync_in_progress)
	{
		/* prior try failed, so update any stale cycle_ctr values */
		hash_seq_init(&hstat, pendingOps);
		while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
		{
			entry->cycle_ctr = sync_cycle_ctr;
		}
	}

	/* Advance counter so that new hashtable entries are distinguishable */
	sync_cycle_ctr++;

	/* Set flag to detect failure if we don't reach the end of the loop */
	sync_in_progress = true;

	/* Now scan the hashtable for fsync requests to process */
	absorb_counter = FSYNCS_PER_ABSORB;
	hash_seq_init(&hstat, pendingOps);
	while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
	{
		int			failures;

#ifdef FAULT_INJECTOR
		if (entry->cycle_ctr != sync_cycle_ctr && !entry->canceled &&
			(SIMPLE_FAULT_INJECTOR("fsync_counter") == FaultInjectorTypeSkip
			 || (entry->tag.handler == SYNC_HANDLER_AO &&
				 SIMPLE_FAULT_INJECTOR("ao_fsync_counter") == FaultInjectorTypeSkip)))
		{
			if (MyAuxProcType == CheckpointerProcess)
			{
				if (entry->tag.segno == 0)
					elog(LOG, "checkpoint performing fsync for %d/%d/%u",
						 entry->tag.rnode.spcNode, entry->tag.rnode.dbNode,
						 entry->tag.rnode.relNode);
				else
					elog(LOG, "checkpoint performing fsync for %d/%d/%u.%d",
						 entry->tag.rnode.spcNode, entry->tag.rnode.dbNode,
						 entry->tag.rnode.relNode, entry->tag.segno);
			}
			else
			{
				int			level = (SIMPLE_FAULT_INJECTOR("fsync_counter") == FaultInjectorTypeSkip) ? ERROR : LOG;

				if (entry->tag.segno == 0)
					elog(level, "non checkpoint process trying to fsync "
						 "%d/%d/%u when fsync_counter fault is set",
						 entry->tag.rnode.spcNode, entry->tag.rnode.dbNode,
						 entry->tag.rnode.relNode);
				else
					elog(level, "non checkpoint process trying to fsync "
						 "%d/%d/%u.%d when fsync_counter fault is set",
						 entry->tag.rnode.spcNode, entry->tag.rnode.dbNode,
						 entry->tag.rnode.relNode, entry->tag.segno);
			}
		}
#endif

		/*
		 * If the entry is new then don't process it this time; it will be
		 * processed by the next checkpoint cycle.  Note "continue" bypasses
		 * the hash-remove call at the bottom of the loop.
		 */
		if (entry->cycle_ctr == sync_cycle_ctr)
			continue;

		/* Else assert we haven't missed it */
		Assert((CycleCtr) (entry->cycle_ctr + 1) == sync_cycle_ctr);

		/*
		 * If fsync is off then we don't have to bother opening the file at
		 * all.  (We delay checking until this point so that changing fsync on
		 * the fly behaves sensibly.)
		 */
		if (enableFsync)
		{
			/*
			 * If in checkpointer, we want to absorb pending requests every so
			 * often to prevent overflow of the fsync request queue.  It is
			 * unspecified whether newly-added entries will be visited by
			 * hash_seq_search, but we don't care since we don't need to
			 * process them anyway.
			 */
			if (--absorb_counter <= 0)
			{
				AbsorbSyncRequests();
				absorb_counter = FSYNCS_PER_ABSORB;
			}

			/*
			 * The fsync table could contain requests to fsync segments that
			 * have been deleted (unlinked) by the time we get to them.  Rather
			 * than just hoping an ENOENT (or EACCES on Windows) error can be
			 * ignored, what we do on error is absorb pending requests and
			 * then retry.  Since mdunlink() queues a "cancel" message before
			 * actually unlinking, the fsync request is guaranteed to be
			 * marked canceled after the absorb if it really was this case.
			 * DROP DATABASE likewise has to tell us to forget fsync requests
			 * before it starts deletions.
			 */
			for (failures = 0; !entry->canceled; failures++)
			{
				char		path[MAXPGPATH];

				INSTR_TIME_SET_CURRENT(sync_start);
				if (syncsw[entry->tag.handler].sync_syncfiletag(&entry->tag,
																path) == 0)
				{
					/* Success; update statistics about sync timing */
					INSTR_TIME_SET_CURRENT(sync_end);
					sync_diff = sync_end;
					INSTR_TIME_SUBTRACT(sync_diff, sync_start);
					elapsed = INSTR_TIME_GET_MICROSEC(sync_diff);
					if (elapsed > longest)
						longest = elapsed;
					total_elapsed += elapsed;
					processed++;

					if (log_checkpoints)
						elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f ms",
							 processed,
							 path,
							 (double) elapsed / 1000);

					break;		/* out of retry loop */
				}

				/*
				 * It is possible that the relation has been dropped or
				 * truncated since the fsync request was entered.  Therefore,
				 * allow ENOENT, but only if we didn't fail already on this
				 * file.
				 */
				if (!FILE_POSSIBLY_DELETED(errno) || failures > 0)
					ereport(data_sync_elevel(ERROR),
							(errcode_for_file_access(),
							 errmsg("could not fsync file \"%s\": %m",
									path)));
				else
					ereport(DEBUG1,
							(errcode_for_file_access(),
							 errmsg_internal("could not fsync file \"%s\" but retrying: %m",
											 path)));

				/*
				 * Absorb incoming requests and check to see if a cancel
				 * arrived for this relation fork.
				 */
				AbsorbSyncRequests();
				absorb_counter = FSYNCS_PER_ABSORB; /* might as well... */
			}					/* end retry loop */
		}

		/* We are done with this entry, remove it */
		if (hash_search(pendingOps, &entry->tag, HASH_REMOVE, NULL) == NULL)
			elog(ERROR, "pendingOps corrupted");
	}							/* end loop over hashtable entries */

	/* Return sync performance metrics for report at checkpoint end */
	CheckpointStats.ckpt_sync_rels = processed;
	CheckpointStats.ckpt_longest_sync = longest;
	CheckpointStats.ckpt_agg_sync_time = total_elapsed;

	/* Flag successful completion of ProcessSyncRequests */
	sync_in_progress = false;
}

/*
 * RememberSyncRequest() -- callback from checkpointer side of sync request
 *
 * We stuff fsync requests into the local hash table for execution
 * during the checkpointer's next checkpoint.  UNLINK requests go into a
 * separate linked list, however, because they get processed separately.
 *
 * See sync.h for more information on the types of sync requests supported.
 */
void
RememberSyncRequest(const FileTag *ftag, SyncRequestType type)
{
	Assert(pendingOps);

	if (type == SYNC_FORGET_REQUEST)
	{
		PendingFsyncEntry *entry;

		/* Cancel previously entered request */
		entry = (PendingFsyncEntry *) hash_search(pendingOps,
												  (void *) ftag,
												  HASH_FIND,
												  NULL);
		if (entry != NULL)
			entry->canceled = true;
	}
	else if (type == SYNC_FILTER_REQUEST)
	{
		HASH_SEQ_STATUS hstat;
		PendingFsyncEntry *entry;
		ListCell   *cell;

		/* Cancel matching fsync requests */
		hash_seq_init(&hstat, pendingOps);
		while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
		{
			if (entry->tag.handler == ftag->handler &&
				syncsw[ftag->handler].sync_filetagmatches(ftag, &entry->tag))
				entry->canceled = true;
		}

		/* Cancel matching unlink requests */
		foreach(cell, pendingUnlinks)
		{
			PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);

			if (entry->tag.handler == ftag->handler &&
				syncsw[ftag->handler].sync_filetagmatches(ftag, &entry->tag))
				entry->canceled = true;
		}
	}
	else if (type == SYNC_UNLINK_REQUEST)
	{
		/* Unlink request: put it in the linked list */
		MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
		PendingUnlinkEntry *entry;

		entry = palloc(sizeof(PendingUnlinkEntry));
		entry->tag = *ftag;
		entry->cycle_ctr = checkpoint_cycle_ctr;
		entry->canceled = false;

		pendingUnlinks = lappend(pendingUnlinks, entry);

		MemoryContextSwitchTo(oldcxt);
	}
	else
	{
		/* Normal case: enter a request to fsync this segment */
		MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
		PendingFsyncEntry *entry;
		bool		found;

		Assert(type == SYNC_REQUEST);

		entry = (PendingFsyncEntry *) hash_search(pendingOps,
												  (void *) ftag,
												  HASH_ENTER,
												  &found);
		/* if new entry, or was previously canceled, initialize it */
		if (!found || entry->canceled)
		{
			entry->cycle_ctr = sync_cycle_ctr;
			entry->canceled = false;
		}

		/*
		 * NB: it's intentional that we don't change cycle_ctr if the entry
		 * already exists.  The cycle_ctr must represent the oldest fsync
		 * request that could be in the entry.
		 */

		MemoryContextSwitchTo(oldcxt);
	}
}
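
/*
 * To summarize (a non-authoritative recap; sync.h is the reference), the
 * request types handled above are used along these lines:
 *
 *	SYNC_REQUEST         remember to fsync a segment at the next checkpoint
 *	SYNC_FORGET_REQUEST  cancel a previously registered fsync request,
 *	                     e.g. because the segment is about to be unlinked
 *	SYNC_FILTER_REQUEST  cancel all fsync/unlink requests whose tags match,
 *	                     e.g. everything belonging to a dropped database
 *	SYNC_UNLINK_REQUEST  delete the file, but only after the next checkpoint
 */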

/*
 * Register the sync request locally, or forward it to the checkpointer.
 *
 * If retryOnError is true, we'll keep trying if there is no space in the
 * queue.  Return true if we succeeded, or false if there wasn't space.
 */
bool
RegisterSyncRequest(const FileTag *ftag, SyncRequestType type,
					bool retryOnError)
{
	bool		ret;

	if (pendingOps != NULL)
	{
		/* standalone backend or startup process: fsync state is local */
		RememberSyncRequest(ftag, type);
		return true;
	}

	for (;;)
	{
		/*
		 * Notify the checkpointer about it.  If we fail to queue a message in
		 * retryOnError mode, we have to sleep and try again ... ugly, but
		 * hopefully won't happen often.
		 *
		 * XXX should we CHECK_FOR_INTERRUPTS in this loop?  Escaping with an
		 * error in the case of SYNC_UNLINK_REQUEST would leave the
		 * no-longer-used file still present on disk, which would be bad, so
		 * I'm inclined to assume that the checkpointer will always empty the
		 * queue soon.
		 */
		ret = ForwardSyncRequest(ftag, type);

		/*
		 * If we are successful in queueing the request, or we failed and were
		 * instructed not to retry on error, break.
		 */
		if (ret || !retryOnError)
			break;

		WaitLatch(NULL, WL_EXIT_ON_PM_DEATH | WL_TIMEOUT, 10,
				  WAIT_EVENT_REGISTER_SYNC_REQUEST);
	}

	return ret;
}
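
/*
 * Illustrative (hypothetical) caller pattern, modeled on how a storage
 * manager might register a dirty segment without blocking: forward the
 * request without retrying, and fall back to syncing the file itself if
 * the checkpointer's request queue happens to be full.
 *
 *	FileTag		tag;
 *
 *	INIT_MD_FILETAG(tag, reln->smgr_rnode.node, forknum, seg->mdfd_segno);
 *	if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false))
 *	{
 *		if (FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) < 0)
 *			ereport(data_sync_elevel(ERROR), ...);
 *	}
 *
 * The reln/seg variables above are assumed to come from the caller's own
 * context; they are not defined in this file.
 */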

/*
 * In archive recovery, we rely on checkpointer to do fsyncs, but we will have
 * already created the pendingOps during initialization of the startup
 * process.  Calling this function drops the local pendingOps so that
 * subsequent requests will be forwarded to checkpointer.
 */
void
EnableSyncRequestForwarding(void)
{
	/* Perform any pending fsyncs we may have queued up, then drop table */
	if (pendingOps)
	{
		ProcessSyncRequests();
		hash_destroy(pendingOps);
	}
	pendingOps = NULL;

	/*
	 * We should not have any pending unlink requests, since mdunlink doesn't
	 * queue unlink requests when isRedo.
	 */
	Assert(pendingUnlinks == NIL);
}