/* src/backend/backup/basebackup.c - cloudberry - Git at Google */

 /*-------------------------------------------------------------------------
  *
  * basebackup.c
  *	  code for taking a base backup and streaming it to a standby
  *
  * Portions Copyright (c) 2010-2023, PostgreSQL Global Development Group
  *
  * IDENTIFICATION
  *	  src/backend/backup/basebackup.c
  *
  *-------------------------------------------------------------------------
  */
 #include "postgres.h"

 #include <sys/stat.h>
 #include <unistd.h>
 #include <time.h>

 #include "access/xlog_internal.h"
 #include "access/xlogbackup.h"
 #include "backup/backup_manifest.h"
 #include "backup/basebackup.h"
 #include "backup/basebackup_sink.h"
 #include "backup/basebackup_target.h"
 #include "commands/defrem.h"
 #include "common/compression.h"
 #include "common/file_perm.h"
 #include "common/kmgr_utils.h"
 #include "commands/progress.h"
 #include "lib/stringinfo.h"
 #include "miscadmin.h"
 #include "nodes/pg_list.h"
 #include "pgstat.h"
 #include "pgtar.h"
 #include "port.h"
 #include "postmaster/syslogger.h"
 #include "replication/walsender.h"
 #include "replication/walsender_private.h"
 #include "storage/bufpage.h"
 #include "storage/checksum.h"
 #include "storage/dsm_impl.h"
 #include "storage/fd.h"
 #include "storage/ipc.h"
 #include "storage/reinit.h"
 #include "utils/builtins.h"
 #include "utils/guc.h"
 #include "utils/ps_status.h"
 #include "utils/relcache.h"
 #include "utils/resowner.h"
 #include "utils/timestamp.h"

 #include "access/genam.h"
 #include "access/hash.h"
 #include "access/xact.h"
 #include "cdb/cdbvars.h"
 #include "catalog/catalog.h"
 #include "catalog/indexing.h"
 #include "catalog/pg_database.h"
 #include "catalog/pg_tablespace.h"
 #include "storage/lmgr.h"
 #include "storage/proc.h"
 #include "utils/elog.h"
 #include "utils/fmgroids.h"
 #include "utils/faultinjector.h"
 #include "utils/snapmgr.h"
 #include "utils/tarrable.h"
 /*
  * How much data do we want to send in one CopyData message? Note that
  * this may also result in reading the underlying files in chunks of this
  * size.
  *
  * NB: The buffer size is required to be a multiple of the system block
  * size, so use that value instead if it's bigger than our preference.
  */
 #define SINK_BUFFER_LENGTH			Max(32768, BLCKSZ)

 /*
  * Options for one BASE_BACKUP command, as filled in by
  * parse_basebackup_options().
  */
 typedef struct
 {
 	const char *label;			/* backup label; "base backup" if not given */
 	bool		progress;		/* estimate the total size before sending? */
 	bool		fastcheckpoint; /* passed on to do_pg_backup_start() */
 	bool		nowait;			/* inverse of the "wait" option */
 	bool		includewal;		/* append required WAL to the main archive? */
 	uint32		maxrate;		/* "max_rate" option; 0 if not specified */
 	bool		sendtblspcmapfile;	/* include tablespace_map in base.tar? */
 	bool		send_to_client; /* passed on to bbsink_copystream_new() */
 	bool		use_copytblspc; /* set only when no target was specified */
 	BaseBackupTargetHandle *target_handle;	/* set for non-client targets */
 	backup_manifest_option manifest;	/* "manifest" option */
 	pg_compress_algorithm compression;	/* "compression" option */
 	pg_compress_specification compression_specification;	/* parsed detail */
 	pg_checksum_type manifest_checksum_type;	/* "manifest_checksums" option */
 	HTAB	   *exclude;		/* set of "exclude" paths, NULL if none given */
 } basebackup_options;

 /* Prototypes for functions local to this file. */
 static bool match_exclude_list(char *path, HTAB *exclude);

 static int64 sendTablespace(bbsink *sink, char *path, char *spcoid, bool sizeonly,
 							struct backup_manifest_info *manifest);
 static int64 sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeonly,
 					 List *tablespaces, bool sendtblspclinks,
 					 backup_manifest_info *manifest, const char *spcoid,
 					 HTAB *exclude);
 static bool sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
 					 struct stat *statbuf, bool missing_ok, Oid dboid,
 					 backup_manifest_info *manifest, const char *spcoid);
 static void sendFileWithContent(bbsink *sink, const char *filename,
 								const char *content,
 								backup_manifest_info *manifest);
 static int64 _tarWriteHeader(bbsink *sink, const char *filename,
 							 const char *linktarget, struct stat *statbuf,
 							 bool sizeonly);
 static void _tarWritePadding(bbsink *sink, int len);
 static void convert_link_to_directory(const char *pathbuf, struct stat *statbuf);
 static void perform_base_backup(basebackup_options *opt, bbsink *sink);
 static void parse_basebackup_options(List *options, basebackup_options *opt);
 static int	compareWalFileNames(const ListCell *a, const ListCell *b);
 static bool is_checksummed_file(const char *fullpath, const char *filename);
 static int	basebackup_read_file(int fd, char *buf, size_t nbytes, off_t offset,
 								 const char *filename, bool partial_read_ok);

 /*
  * Was the backup currently in-progress initiated in recovery mode?  Set from
  * RecoveryInProgress() at the start of perform_base_backup().
  */
 static bool backup_started_in_recovery = false;

 /*
  * Total number of checksum failures during base backup.  Reset to zero at
  * the start of perform_base_backup() and examined again once the backup has
  * been handed to the sink.
  */
 static long long int total_checksum_failures;

 /*
  * Do not verify checksums.  Set to the inverse of the "verify_checksums"
  * option by parse_basebackup_options().
  */
 static bool noverify_checksums = false;

 /*
  * One element of an exclusion list.  Such lists are used both for paths
  * skipped by checksum validation and for paths left out of base backups.
  *
  * "name" is the name of the file or path to check for exclusion.  If
  * "match_prefix" is true, any item whose name starts with "name" is
  * excluded; otherwise the name is expected to match as given.
  *
  * Lists of these items are terminated by an entry whose name is NULL.
  */
 struct exclude_list_item
 {
 	const char *name;			/* file or path name to exclude */
 	bool		match_prefix;	/* treat "name" as a prefix? */
 };

 /*
  * The contents of these directories are removed or recreated during server
  * start so they are not included in backups.  The directories themselves are
  * kept and included as empty to preserve access permissions.
  *
  * The array is terminated by a NULL entry.
  *
  * Note: this list should be kept in sync with the filter lists in pg_rewind's
  * filemap.c.
  */
 static const char *const excludeDirContents[] =
 {
 	/* Skip temporary crypto key directories */
 	NEW_KMGR_DIR,
 	OLD_KMGR_DIR,

 	/*
 	 * Skip temporary statistics files. PG_STAT_TMP_DIR must be skipped
 	 * because extensions like pg_stat_statements store data there.
 	 */
 	PG_STAT_TMP_DIR,

 	/*
 	 * It is generally not useful to backup the contents of this directory
 	 * even if the intention is to restore to another primary. See backup.sgml
 	 * for a more detailed description.
 	 */
 	"pg_replslot",

 	/* Contents removed on startup, see dsm_cleanup_for_mmap(). */
 	PG_DYNSHMEM_DIR,

 	/* Contents removed on startup, see AsyncShmemInit(). */
 	"pg_notify",

 	/*
 	 * Old contents are loaded for possible debugging but are not required for
 	 * normal operation, see SerialInit().
 	 */
 	"pg_serial",

 	/* Contents removed on startup, see DeleteAllExportedSnapshotFiles(). */
 	"pg_snapshots",

 	/* Contents zeroed on startup, see StartupSUBTRANS(). */
 	"pg_subtrans",

 	/* Contents unique to each segment instance. */
 	"log",

 	/*
 	 * GPDB: Default gpbackup directory (backup contents).  The same name is
 	 * also listed in excludeFiles.
 	 */
 	"backups",

 	/* end of list */
 	NULL
 };

 /*
  * List of files excluded from backups.
  *
  * Each entry is an exclude_list_item: a name plus a flag saying whether the
  * name is to be treated as a prefix.  The list ends with a NULL name.
  */
 static const struct exclude_list_item excludeFiles[] =
 {
 	/* Skip auto conf temporary file. */
 	{PG_AUTOCONF_FILENAME ".tmp", false},

 	/* Skip current log file temporary file */
 	{LOG_METAINFO_DATAFILE_TMP, false},

 	/*
 	 * Skip relation cache because it is rebuilt on startup.  This includes
 	 * temporary files, which is why the name is matched as a prefix.
 	 */
 	{RELCACHE_INIT_FILENAME, true},

 	/*
 	 * backup_label and tablespace_map should not exist in a running cluster
 	 * capable of doing an online backup, but exclude them just in case.
 	 */
 	{BACKUP_LABEL_FILE, false},
 	{TABLESPACE_MAP, false},

 	/*
 	 * If there's a backup_manifest, it belongs to a backup that was used to
 	 * start this server. It is *not* correct for this backup. Our
 	 * backup_manifest is injected into the backup separately if users want
 	 * it.
 	 */
 	{"backup_manifest", false},

 	/* Files describing the running postmaster. */
 	{"postmaster.pid", false},
 	{"postmaster.opts", false},

 	/*
 	 * GPDB: Default gpbackup directory (top-level directory).  The same name
 	 * is also listed in excludeDirContents.
 	 */
 	{"backups", false},

 	/* end of list */
 	{NULL, false}
 };

 /*
  * List of files excluded from checksum validation.
  *
  * Entries flagged "true" are matched as name prefixes; the list ends with a
  * NULL name.
  *
  * Note: this list should be kept in sync with what pg_checksums.c
  * includes.
  */
 static const struct exclude_list_item noChecksumFiles[] = {
 	{"pg_control", false},
 	{"pg_filenode.map", false},
 	{"pg_internal.init", true},
 	{"PG_VERSION", false},
 #ifdef EXEC_BACKEND
 	/* only listed in EXEC_BACKEND builds */
 	{"config_exec_params", true},
 #endif
 	{NULL, false}
 };

 /*
  * Actually do a base backup for the specified tablespaces.
  *
  * This is split out mainly to avoid complaints about "variable might be
  * clobbered by longjmp" from stupider versions of gcc.
  *
  * opt:  parsed BASE_BACKUP options (label, progress, WAL inclusion, ...).
  * sink: destination for the archive data, the manifest and the
  *       end-of-backup notification.
  *
  * Overview of the steps performed here:
  *  1. enter backup mode via do_pg_backup_start();
  *  2. if opt->progress, size every tablespace with a size-only pass;
  *  3. emit one archive per tablespace; the main data directory comes last
  *     and is sent as "base.tar";
  *  4. leave backup mode via do_pg_backup_stop();
  *  5. if opt->includewal, append the needed WAL segments and all timeline
  *     history files to the still-open base.tar;
  *  6. send the backup manifest and tell the sink the backup has ended;
  *  7. raise an ERROR if any checksum verification failure was counted.
  */
 static void
 perform_base_backup(basebackup_options *opt, bbsink *sink)
 {
 	bbsink_state state;
 	XLogRecPtr	endptr;
 	TimeLineID	endtli;
 	backup_manifest_info manifest;
 	BackupState *backup_state;
 	StringInfo	tablespace_map;

 	/* Initial backup state, insofar as we know it now. */
 	state.tablespaces = NIL;
 	state.tablespace_num = 0;
 	state.bytes_done = 0;
 	state.bytes_total = 0;
 	state.bytes_total_is_valid = false;

 	/* we're going to use a BufFile, so we need a ResourceOwner */
 	Assert(CurrentResourceOwner == NULL);
 	CurrentResourceOwner = ResourceOwnerCreate(NULL, "base backup");

 	backup_started_in_recovery = RecoveryInProgress();

 	InitializeBackupManifest(&manifest, opt->manifest,
 							 opt->manifest_checksum_type);

 	/* Start counting checksum failures afresh for this backup. */
 	total_checksum_failures = 0;

 	/* Allocate backup related variables. */
 	backup_state = (BackupState *) palloc0(sizeof(BackupState));
 	tablespace_map = makeStringInfo();

 	basebackup_progress_wait_checkpoint();
 	do_pg_backup_start(opt->label, opt->fastcheckpoint, &state.tablespaces,
 					   backup_state, tablespace_map);

 	/* Remember where the backup starts; needed for WAL and the manifest. */
 	state.startptr = backup_state->startpoint;
 	state.starttli = backup_state->starttli;

 	SIMPLE_FAULT_INJECTOR("base_backup_post_create_checkpoint");

 	/*
 	 * Once do_pg_backup_start has been called, ensure that any failure causes
 	 * us to abort the backup so we don't "leak" a backup counter. For this
 	 * reason, *all* functionality between do_pg_backup_start() and the end of
 	 * do_pg_backup_stop() should be inside the error cleanup block!
 	 */

 	PG_ENSURE_ERROR_CLEANUP(do_pg_abort_backup, BoolGetDatum(false));
 	{
 		ListCell   *lc;
 		tablespaceinfo *newti;

 		/*
 		 * Add a node for the base directory at the end.  palloc0 leaves its
 		 * path NULL, which is how the loops below recognize it.
 		 */
 		newti = palloc0(sizeof(tablespaceinfo));
 		newti->size = -1;
 		state.tablespaces = lappend(state.tablespaces, newti);

 		/*
 		 * Calculate the total backup size by summing up the size of each
 		 * tablespace
 		 */
 		if (opt->progress)
 		{
 			basebackup_progress_estimate_backup_size();

 			foreach(lc, state.tablespaces)
 			{
 				tablespaceinfo *tmp = (tablespaceinfo *) lfirst(lc);

 				/* sizeonly = true and no manifest: nothing is sent here */
 				if (tmp->path == NULL)
 					tmp->size = sendDir(sink, ".", 1, true, state.tablespaces,
 										true, NULL, NULL, opt->exclude);
 				else
 					tmp->size = sendTablespace(sink, tmp->path, tmp->oid, true,
 											   NULL);
 				state.bytes_total += tmp->size;
 			}
 			state.bytes_total_is_valid = true;
 		}

 		/* notify basebackup sink about start of backup */
 		bbsink_begin_backup(sink, &state, SINK_BUFFER_LENGTH);

 		/* Send off our tablespaces one by one */
 		foreach(lc, state.tablespaces)
 		{
 			tablespaceinfo *ti = (tablespaceinfo *) lfirst(lc);

 			if (ti->path == NULL)
 			{
 				/* This is the main data directory. */
 				struct stat statbuf;
 				bool		sendtblspclinks = true;
 				char	   *backup_label;

 				bbsink_begin_archive(sink, "base.tar");

 				/* In the main tar, include the backup_label first... */
 				backup_label = build_backup_content(backup_state, false);
 				sendFileWithContent(sink, BACKUP_LABEL_FILE,
 									backup_label, &manifest);
 				pfree(backup_label);

 				/* Then the tablespace_map file, if required... */
 				if (opt->sendtblspcmapfile)
 				{
 					sendFileWithContent(sink, TABLESPACE_MAP,
 										tablespace_map->data, &manifest);
 					sendtblspclinks = false;
 				}

 				/* Then the bulk of the files... */
 				sendDir(sink, ".", 1, false, state.tablespaces,
 						sendtblspclinks, &manifest, NULL, opt->exclude);

 				/* ... and pg_control after everything else. */
 				if (lstat(XLOG_CONTROL_FILE, &statbuf) != 0)
 					ereport(ERROR,
 							(errcode_for_file_access(),
 							 errmsg("could not stat file \"%s\": %m",
 									XLOG_CONTROL_FILE)));
 				sendFile(sink, XLOG_CONTROL_FILE, XLOG_CONTROL_FILE, &statbuf,
 						 false, InvalidOid, &manifest, NULL);
 			}
 			else
 			{
 				/* A non-default tablespace gets an archive named after it. */
 				char	   *archive_name = psprintf("%s.tar", ti->oid);

 				bbsink_begin_archive(sink, archive_name);

 				sendTablespace(sink, ti->path, ti->oid, false, &manifest);
 			}

 			/*
 			 * If we're including WAL, and this is the main data directory we
 			 * don't treat this as the end of the tablespace. Instead, we will
 			 * include the xlog files below and stop afterwards. This is safe
 			 * since the main data directory is always sent *last*.
 			 */
 			if (opt->includewal && ti->path == NULL)
 			{
 				Assert(lnext(state.tablespaces, lc) == NULL);
 			}
 			else
 			{
 				/* Properly terminate the tarfile. */
 				StaticAssertDecl(2 * TAR_BLOCK_SIZE <= BLCKSZ,
 								 "BLCKSZ too small for 2 tar blocks");
 				memset(sink->bbs_buffer, 0, 2 * TAR_BLOCK_SIZE);
 				bbsink_archive_contents(sink, 2 * TAR_BLOCK_SIZE);

 				/* OK, that's the end of the archive. */
 				bbsink_end_archive(sink);
 			}
 		}

 		basebackup_progress_wait_wal_archive(&state);
 		do_pg_backup_stop(backup_state, !opt->nowait);

 		/* Save the stop location before backup_state is released below. */
 		endptr = backup_state->stoppoint;
 		endtli = backup_state->stoptli;

 		/* Deallocate backup-related variables. */
 		pfree(tablespace_map->data);
 		pfree(tablespace_map);
 		pfree(backup_state);
 	}
 	PG_END_ENSURE_ERROR_CLEANUP(do_pg_abort_backup, BoolGetDatum(false));


 	if (opt->includewal)
 	{
 		/*
 		 * We've left the last tar file "open", so we can now append the
 		 * required WAL files to it.
 		 */
 		char		pathbuf[MAXPGPATH];
 		XLogSegNo	segno;
 		XLogSegNo	startsegno;
 		XLogSegNo	endsegno;
 		struct stat statbuf;
 		List	   *historyFileList = NIL;
 		List	   *walFileList = NIL;
 		char		firstoff[MAXFNAMELEN];
 		char		lastoff[MAXFNAMELEN];
 		DIR		   *dir;
 		struct dirent *de;
 		ListCell   *lc;
 		TimeLineID	tli;

 		basebackup_progress_transfer_wal();

 		/*
 		 * I'd rather not worry about timelines here, so scan pg_wal and
 		 * include all WAL files in the range between 'startptr' and 'endptr',
 		 * regardless of the timeline the file is stamped with. If there are
 		 * some spurious WAL files belonging to timelines that don't belong in
 		 * this server's history, they will be included too. Normally there
 		 * shouldn't be such files, but if there are, there's little harm in
 		 * including them.
 		 */
 		XLByteToSeg(state.startptr, startsegno, wal_segment_size);
 		XLogFileName(firstoff, state.starttli, startsegno, wal_segment_size);
 		XLByteToPrevSeg(endptr, endsegno, wal_segment_size);
 		XLogFileName(lastoff, endtli, endsegno, wal_segment_size);

 		dir = AllocateDir("pg_wal");
 		while ((de = ReadDir(dir, "pg_wal")) != NULL)
 		{
 			/*
 			 * Does it look like a WAL segment, and is it in the range?  The
 			 * "+ 8" offsets skip the leading timeline part of each name, so
 			 * only the log/seg portion is compared.
 			 */
 			if (IsXLogFileName(de->d_name) &&
 				strcmp(de->d_name + 8, firstoff + 8) >= 0 &&
 				strcmp(de->d_name + 8, lastoff + 8) <= 0)
 			{
 				walFileList = lappend(walFileList, pstrdup(de->d_name));
 			}
 			/* Does it look like a timeline history file? */
 			else if (IsTLHistoryFileName(de->d_name))
 			{
 				historyFileList = lappend(historyFileList, pstrdup(de->d_name));
 			}
 		}
 		FreeDir(dir);

 		/*
 		 * Before we go any further, check that none of the WAL segments we
 		 * need were removed.
 		 */
 		CheckXLogRemoved(startsegno, state.starttli);

 		/*
 		 * Sort the WAL filenames.  We want to send the files in order from
 		 * oldest to newest, to reduce the chance that a file is recycled
 		 * before we get a chance to send it over.
 		 */
 		list_sort(walFileList, compareWalFileNames);

 		/*
 		 * There must be at least one xlog file in the pg_wal directory, since
 		 * we are doing backup-including-xlog.
 		 */
 		if (walFileList == NIL)
 			ereport(ERROR,
 					(errmsg("could not find any WAL files")));

 		/*
 		 * Sanity check: the first and last segment should cover startptr and
 		 * endptr, with no gaps in between.
 		 */
 		XLogFromFileName((char *) linitial(walFileList),
 						 &tli, &segno, wal_segment_size);
 		if (segno != startsegno)
 		{
 			char		startfname[MAXFNAMELEN];

 			XLogFileName(startfname, state.starttli, startsegno,
 						 wal_segment_size);
 			ereport(ERROR,
 					(errmsg("could not find WAL file \"%s\"", startfname)));
 		}
 		foreach(lc, walFileList)
 		{
 			char	   *walFileName = (char *) lfirst(lc);
 			XLogSegNo	currsegno = segno;
 			XLogSegNo	nextsegno = segno + 1;

 			/*
 			 * Each file must carry either the same segment number as the
 			 * previous one (files of any timeline were accepted above) or
 			 * the immediately following one; anything else is a gap.
 			 */
 			XLogFromFileName(walFileName, &tli, &segno, wal_segment_size);
 			if (!(nextsegno == segno || currsegno == segno))
 			{
 				char		nextfname[MAXFNAMELEN];

 				XLogFileName(nextfname, tli, nextsegno, wal_segment_size);
 				ereport(ERROR,
 						(errmsg("could not find WAL file \"%s\"", nextfname)));
 			}
 		}
 		if (segno != endsegno)
 		{
 			char		endfname[MAXFNAMELEN];

 			XLogFileName(endfname, endtli, endsegno, wal_segment_size);
 			ereport(ERROR,
 					(errmsg("could not find WAL file \"%s\"", endfname)));
 		}

 		/* Ok, we have everything we need. Send the WAL files. */
 		foreach(lc, walFileList)
 		{
 			char	   *walFileName = (char *) lfirst(lc);
 			int			fd;
 			size_t		cnt;
 			pgoff_t		len = 0;	/* bytes of this segment sent so far */

 			snprintf(pathbuf, MAXPGPATH, XLOGDIR "/%s", walFileName);
 			XLogFromFileName(walFileName, &tli, &segno, wal_segment_size);

 			fd = OpenTransientFile(pathbuf, O_RDONLY | PG_BINARY);
 			if (fd < 0)
 			{
 				int			save_errno = errno;

 				/*
 				 * Most likely reason for this is that the file was already
 				 * removed by a checkpoint, so check for that to get a better
 				 * error message.
 				 */
 				CheckXLogRemoved(segno, tli);

 				/* restore errno so that %m below reports the open failure */
 				errno = save_errno;
 				ereport(ERROR,
 						(errcode_for_file_access(),
 						 errmsg("could not open file \"%s\": %m", pathbuf)));
 			}

 			if (fstat(fd, &statbuf) != 0)
 				ereport(ERROR,
 						(errcode_for_file_access(),
 						 errmsg("could not stat file \"%s\": %m",
 								pathbuf)));
 			if (statbuf.st_size != wal_segment_size)
 			{
 				CheckXLogRemoved(segno, tli);
 				ereport(ERROR,
 						(errcode_for_file_access(),
 						 errmsg("unexpected WAL file size \"%s\"", walFileName)));
 			}

 			/* send the WAL file itself */
 			_tarWriteHeader(sink, pathbuf, NULL, &statbuf, false);

 			/*
 			 * Copy the segment through the sink's buffer, never asking for
 			 * more than what is left of the segment.
 			 */
 			while ((cnt = basebackup_read_file(fd, sink->bbs_buffer,
 											   Min(sink->bbs_buffer_length,
 												   wal_segment_size - len),
 											   len, pathbuf, true)) > 0)
 			{
 				CheckXLogRemoved(segno, tli);
 				bbsink_archive_contents(sink, cnt);

 				len += cnt;

 				if (len == wal_segment_size)
 					break;
 			}

 			/* A short read means the file is not a complete segment. */
 			if (len != wal_segment_size)
 			{
 				CheckXLogRemoved(segno, tli);
 				ereport(ERROR,
 						(errcode_for_file_access(),
 						 errmsg("unexpected WAL file size \"%s\"", walFileName)));
 			}

 			elogif(debug_basebackup, LOG,
 				   "basebackup perform -- Sent xlog file %s", walFileName);

 			/*
 			 * wal_segment_size is a multiple of TAR_BLOCK_SIZE, so no need
 			 * for padding.
 			 */
 			Assert(wal_segment_size % TAR_BLOCK_SIZE == 0);

 			CloseTransientFile(fd);

 			/*
 			 * Mark file as archived, otherwise files can get archived again
 			 * after promotion of a new node. This is in line with
 			 * walreceiver.c always doing an XLogArchiveForceDone() after a
 			 * complete segment.
 			 */
 			StatusFilePath(pathbuf, walFileName, ".done");
 			sendFileWithContent(sink, pathbuf, "", &manifest);
 		}

 		/*
 		 * Send timeline history files too. Only the latest timeline history
 		 * file is required for recovery, and even that only if there happens
 		 * to be a timeline switch in the first WAL segment that contains the
 		 * checkpoint record, or if we're taking a base backup from a standby
 		 * server and the target timeline changes while the backup is taken.
 		 * But they are small and highly useful for debugging purposes, so
 		 * better include them all, always.
 		 */
 		foreach(lc, historyFileList)
 		{
 			char	   *fname = lfirst(lc);

 			snprintf(pathbuf, MAXPGPATH, XLOGDIR "/%s", fname);

 			if (lstat(pathbuf, &statbuf) != 0)
 				ereport(ERROR,
 						(errcode_for_file_access(),
 						 errmsg("could not stat file \"%s\": %m", pathbuf)));

 			sendFile(sink, pathbuf, pathbuf, &statbuf, false, InvalidOid,
 					 &manifest, NULL);

 			/* unconditionally mark file as archived */
 			StatusFilePath(pathbuf, fname, ".done");
 			sendFileWithContent(sink, pathbuf, "", &manifest);
 		}

 		/* Properly terminate the tar file. */
 		StaticAssertStmt(2 * TAR_BLOCK_SIZE <= BLCKSZ,
 						 "BLCKSZ too small for 2 tar blocks");
 		memset(sink->bbs_buffer, 0, 2 * TAR_BLOCK_SIZE);
 		bbsink_archive_contents(sink, 2 * TAR_BLOCK_SIZE);

 		/* OK, that's the end of the archive. */
 		bbsink_end_archive(sink);
 	}

 	/* Record the WAL range of this backup in the manifest, then send it. */
 	AddWALInfoToBackupManifest(&manifest, state.startptr, state.starttli,
 							   endptr, endtli);

 	SendBackupManifest(&manifest, sink);

 	bbsink_end_backup(sink, endptr, endtli);

 	if (total_checksum_failures)
 	{
 		/*
 		 * The summary WARNING is only issued for more than one failure; the
 		 * ERROR below is raised for any non-zero count.
 		 */
 		if (total_checksum_failures > 1)
 			ereport(WARNING,
 					(errmsg_plural("%lld total checksum verification failure",
 								   "%lld total checksum verification failures",
 								   total_checksum_failures,
 								   total_checksum_failures)));

 		ereport(ERROR,
 				(errcode(ERRCODE_DATA_CORRUPTED),
 				 errmsg("checksum verification failure during base backup")));
 	}

 	/*
 	 * Make sure to free the manifest before the resource owners as manifests
 	 * use cryptohash contexts that may depend on resource owners (like
 	 * OpenSSL).
 	 */
 	FreeBackupManifest(&manifest);

 	/* clean up the resource owner we created */
 	WalSndResourceCleanup(true);

 	basebackup_progress_done();
 }

 /*
  * Comparator for list_sort() over a list of WAL segment file names.
  *
  * The first 8 characters of each name (the timeline portion) are skipped, so
  * the ordering is decided by the log/seg portion alone.
  */
 static int
 compareWalFileNames(const ListCell *a, const ListCell *b)
 {
 	const char *lhs_name = (const char *) lfirst(a);
 	const char *rhs_name = (const char *) lfirst(b);
 	const char *lhs_logseg = &lhs_name[8];
 	const char *rhs_logseg = &rhs_name[8];

 	return strcmp(lhs_logseg, rhs_logseg);
 }

 /*
  * Hash function for the exclude set: hashes the whole NUL-terminated string
  * that "key" points to.  "keysize" is only sanity-checked; the number of
  * bytes hashed comes from strlen().
  */
 static uint32
 key_string_hash(const void *key, Size keysize)
 {
 	const char *str = (const char *) key;
 	Datum		hashed;

 	Assert(keysize == sizeof(char *));

 	hashed = hash_any((const unsigned char *) str, (int) strlen(str));

 	return DatumGetUInt32(hashed);
 }

 /*
  * Match function for the exclude set.
  *
  * "key1" points at a stored string pointer, while "key2" is itself the
  * string to compare against.  The result is that of strcmp() on the two
  * strings, so zero means equal.
  */
 static int
 key_string_compare(const void *key1, const void *key2, Size keysize)
 {
 	const char *stored;
 	const char *probe;

 	Assert(keysize == sizeof(char *));

 	stored = *((const char *const *) key1);
 	probe = (const char *) key2;

 	return strcmp(stored, probe);
 }

 /*
  * Key-copy function for the exclude set: stores the string pointer "src"
  * into the slot at "dest" instead of copying the characters.  The caller is
  * trusted to have allocated the string appropriately.
  */
 static void *
 key_string_copy(void *dest, const void *src, Size keysize)
 {
 	char	  **slot = (char **) dest;

 	Assert(keysize == sizeof(char *));

 	/* keep only the pointer; the string itself is not duplicated here */
 	*slot = (char *) src;

 	return NULL;				/* not used */
 }

 /*
  * Parse the base backup options passed down by the parser
  *
  * options: list of DefElem nodes, one per option given in the command.
  * opt:     output; zeroed and then filled in from "options".
  *
  * Every option except "exclude" may be given at most once; a repeat, an
  * unknown option name or an invalid value raises an ERROR.  Besides filling
  * in *opt, this also sets the file-level noverify_checksums flag from the
  * "verify_checksums" option (checksums are verified when it is absent).
  *
  * Cross-option rules enforced after the scan:
  *  - manifest checksums require a manifest;
  *  - a target detail requires a target, and target "client" accepts none;
  *  - a compression detail requires a compression algorithm.
  */
 static void
 parse_basebackup_options(List *options, basebackup_options *opt)
 {
 	ListCell   *lopt;
 	bool		o_label = false;
 	bool		o_progress = false;
 	bool		o_checkpoint = false;
 	bool		o_nowait = false;
 	bool		o_wal = false;
 	bool		o_maxrate = false;
 	bool		o_tablespace_map = false;
 	bool		o_noverify_checksums = false;
 	bool		o_manifest = false;
 	bool		o_manifest_checksums = false;
 	bool		o_target = false;
 	bool		o_target_detail = false;
 	char	   *target_str = NULL;
 	char	   *target_detail_str = NULL;
 	bool		o_compression = false;
 	bool		o_compression_detail = false;
 	char	   *compression_detail_str = NULL;

 	MemSet(opt, 0, sizeof(*opt));

 	/*
 	 * noverify_checksums is file-level state, so put it back to its default
 	 * here.  Otherwise a "verify_checksums false" given to an earlier
 	 * command in this session would silently carry over to this one.
 	 */
 	noverify_checksums = false;

 	/*
 	 * The exclude hash table is only created if EXCLUDE options are specified.
 	 * The matching function is optimized to run fast when the hash table is
 	 * NULL.
 	 */
 	opt->exclude = NULL;
 	opt->manifest = MANIFEST_OPTION_NO;
 	opt->manifest_checksum_type = CHECKSUM_TYPE_CRC32C;
 	opt->compression = PG_COMPRESSION_NONE;
 	opt->compression_specification.algorithm = PG_COMPRESSION_NONE;

 	foreach(lopt, options)
 	{
 		DefElem    *defel = (DefElem *) lfirst(lopt);

 		if (strcmp(defel->defname, "label") == 0)
 		{
 			if (o_label)
 				ereport(ERROR,
 						(errcode(ERRCODE_SYNTAX_ERROR),
 						 errmsg("duplicate option \"%s\"", defel->defname)));
 			opt->label = defGetString(defel);
 			o_label = true;
 		}
 		else if (strcmp(defel->defname, "progress") == 0)
 		{
 			if (o_progress)
 				ereport(ERROR,
 						(errcode(ERRCODE_SYNTAX_ERROR),
 						 errmsg("duplicate option \"%s\"", defel->defname)));
 			opt->progress = defGetBoolean(defel);
 			o_progress = true;
 		}
 		else if (strcmp(defel->defname, "checkpoint") == 0)
 		{
 			char	   *optval = defGetString(defel);

 			if (o_checkpoint)
 				ereport(ERROR,
 						(errcode(ERRCODE_SYNTAX_ERROR),
 						 errmsg("duplicate option \"%s\"", defel->defname)));
 			if (pg_strcasecmp(optval, "fast") == 0)
 				opt->fastcheckpoint = true;
 			else if (pg_strcasecmp(optval, "spread") == 0)
 				opt->fastcheckpoint = false;
 			else
 				ereport(ERROR,
 						(errcode(ERRCODE_SYNTAX_ERROR),
 						 errmsg("unrecognized checkpoint type: \"%s\"",
 								optval)));
 			o_checkpoint = true;
 		}
 		else if (strcmp(defel->defname, "wait") == 0)
 		{
 			if (o_nowait)
 				ereport(ERROR,
 						(errcode(ERRCODE_SYNTAX_ERROR),
 						 errmsg("duplicate option \"%s\"", defel->defname)));
 			/* the option is "wait", the stored flag is its inverse */
 			opt->nowait = !defGetBoolean(defel);
 			o_nowait = true;
 		}
 		else if (strcmp(defel->defname, "wal") == 0)
 		{
 			if (o_wal)
 				ereport(ERROR,
 						(errcode(ERRCODE_SYNTAX_ERROR),
 						 errmsg("duplicate option \"%s\"", defel->defname)));
 			opt->includewal = defGetBoolean(defel);
 			o_wal = true;
 		}
 		else if (strcmp(defel->defname, "max_rate") == 0)
 		{
 			int64		maxrate;

 			if (o_maxrate)
 				ereport(ERROR,
 						(errcode(ERRCODE_SYNTAX_ERROR),
 						 errmsg("duplicate option \"%s\"", defel->defname)));

 			maxrate = defGetInt64(defel);
 			if (maxrate < MAX_RATE_LOWER || maxrate > MAX_RATE_UPPER)
 				ereport(ERROR,
 						(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
 						 errmsg("%d is outside the valid range for parameter \"%s\" (%d .. %d)",
 								(int) maxrate, "MAX_RATE", MAX_RATE_LOWER, MAX_RATE_UPPER)));

 			opt->maxrate = (uint32) maxrate;
 			o_maxrate = true;
 		}
 		else if (strcmp(defel->defname, "exclude") == 0)
 		{
 			/* EXCLUDE option can be specified multiple times */
 			char	   *excl_path;
 			bool		found;

 			if (unlikely(opt->exclude == NULL))
 			{
 				HASHCTL		hashctl;

 				/*
 				 * The hash table stores the string keys in-place if the
 				 * `match` and `keycopy` functions are not explicitly
 				 * specified.  In our case MAXPGPATH bytes need to be reserved
 				 * for each key, which is too wasteful.
 				 *
 				 * By specifying the `match` and `keycopy` functions we could
 				 * allocate the strings separately and store only the string
 				 * pointers in the hash table.
 				 */
 				hashctl.hash = key_string_hash;
 				hashctl.match = key_string_compare;
 				hashctl.keycopy = key_string_copy;

 				/* The hash table is used as a set, only the keys are meaningful */
 				hashctl.keysize = sizeof(char *);
 				hashctl.entrysize = hashctl.keysize;

 				/*
 				 * Keep the table in the same memory context as the key
 				 * strings allocated below, so that both are released
 				 * together instead of the table outliving its keys.
 				 */
 				hashctl.hcxt = CurrentMemoryContext;

 				opt->exclude = hash_create("replication exclude",
 										   64 /* nelem */,
 										   &hashctl,
 										   HASH_ELEM | HASH_FUNCTION |
 										   HASH_COMPARE | HASH_KEYCOPY |
 										   HASH_CONTEXT);
 			}

 			/*
 			 * Use defGetString() like every other option does: it reports a
 			 * proper error when the option has no value and copes with
 			 * non-string value nodes, where a bare strVal() would not.
 			 */
 			excl_path = pstrdup(defGetString(defel));
 			hash_search(opt->exclude, excl_path, HASH_ENTER, &found);

 			/* An already-present key keeps its old string; drop our copy. */
 			if (found)
 				pfree(excl_path);
 		}
 		else if (strcmp(defel->defname, "tablespace_map") == 0)
 		{
 			if (o_tablespace_map)
 				ereport(ERROR,
 						(errcode(ERRCODE_SYNTAX_ERROR),
 						 errmsg("duplicate option \"%s\"", defel->defname)));
 			opt->sendtblspcmapfile = defGetBoolean(defel);
 			o_tablespace_map = true;
 		}
 		else if (strcmp(defel->defname, "verify_checksums") == 0)
 		{
 			if (o_noverify_checksums)
 				ereport(ERROR,
 						(errcode(ERRCODE_SYNTAX_ERROR),
 						 errmsg("duplicate option \"%s\"", defel->defname)));
 			noverify_checksums = !defGetBoolean(defel);
 			o_noverify_checksums = true;
 		}
 		else if (strcmp(defel->defname, "manifest") == 0)
 		{
 			char	   *optval = defGetString(defel);
 			bool		manifest_bool;

 			if (o_manifest)
 				ereport(ERROR,
 						(errcode(ERRCODE_SYNTAX_ERROR),
 						 errmsg("duplicate option \"%s\"", defel->defname)));

 			/* either a boolean or the special value "force-encode" */
 			if (parse_bool(optval, &manifest_bool))
 			{
 				if (manifest_bool)
 					opt->manifest = MANIFEST_OPTION_YES;
 				else
 					opt->manifest = MANIFEST_OPTION_NO;
 			}
 			else if (pg_strcasecmp(optval, "force-encode") == 0)
 				opt->manifest = MANIFEST_OPTION_FORCE_ENCODE;
 			else
 				ereport(ERROR,
 						(errcode(ERRCODE_SYNTAX_ERROR),
 						 errmsg("unrecognized manifest option: \"%s\"",
 								optval)));
 			o_manifest = true;
 		}
 		else if (strcmp(defel->defname, "manifest_checksums") == 0)
 		{
 			char	   *optval = defGetString(defel);

 			if (o_manifest_checksums)
 				ereport(ERROR,
 						(errcode(ERRCODE_SYNTAX_ERROR),
 						 errmsg("duplicate option \"%s\"", defel->defname)));
 			if (!pg_checksum_parse_type(optval,
 										&opt->manifest_checksum_type))
 				ereport(ERROR,
 						(errcode(ERRCODE_SYNTAX_ERROR),
 						 errmsg("unrecognized checksum algorithm: \"%s\"",
 								optval)));
 			o_manifest_checksums = true;
 		}
 		else if (strcmp(defel->defname, "target") == 0)
 		{
 			if (o_target)
 				ereport(ERROR,
 						(errcode(ERRCODE_SYNTAX_ERROR),
 						 errmsg("duplicate option \"%s\"", defel->defname)));
 			target_str = defGetString(defel);
 			o_target = true;
 		}
 		else if (strcmp(defel->defname, "target_detail") == 0)
 		{
 			char	   *optval = defGetString(defel);

 			if (o_target_detail)
 				ereport(ERROR,
 						(errcode(ERRCODE_SYNTAX_ERROR),
 						 errmsg("duplicate option \"%s\"", defel->defname)));
 			target_detail_str = optval;
 			o_target_detail = true;
 		}
 		else if (strcmp(defel->defname, "compression") == 0)
 		{
 			char	   *optval = defGetString(defel);

 			if (o_compression)
 				ereport(ERROR,
 						(errcode(ERRCODE_SYNTAX_ERROR),
 						 errmsg("duplicate option \"%s\"", defel->defname)));
 			if (!parse_compress_algorithm(optval, &opt->compression))
 				ereport(ERROR,
 						(errcode(ERRCODE_SYNTAX_ERROR),
 						 errmsg("unrecognized compression algorithm: \"%s\"",
 								optval)));
 			o_compression = true;
 		}
 		else if (strcmp(defel->defname, "compression_detail") == 0)
 		{
 			if (o_compression_detail)
 				ereport(ERROR,
 						(errcode(ERRCODE_SYNTAX_ERROR),
 						 errmsg("duplicate option \"%s\"", defel->defname)));
 			compression_detail_str = defGetString(defel);
 			o_compression_detail = true;
 		}
 		else
 			ereport(ERROR,
 					(errcode(ERRCODE_SYNTAX_ERROR),
 					 errmsg("unrecognized base backup option: \"%s\"",
 							defel->defname)));
 	}

 	if (opt->label == NULL)
 		opt->label = "base backup";

 	/* No further entries will be added to the exclude set. */
 	if (opt->exclude)
 		hash_freeze(opt->exclude);

 	elogif(debug_basebackup, LOG,
 			"basebackup options -- "
 			"label = %s, "
 			"progress = %s, "
 			"fastcheckpoint = %s, "
 			"nowait = %s, "
 			"wal = %s",
 			opt->label,
 			opt->progress ? "true" : "false",
 			opt->fastcheckpoint ? "true" : "false",
 			opt->nowait ? "true" : "false",
 			opt->includewal ? "true" : "false");

 	/* Without a manifest there is nothing to checksum. */
 	if (opt->manifest == MANIFEST_OPTION_NO)
 	{
 		if (o_manifest_checksums)
 			ereport(ERROR,
 					(errcode(ERRCODE_SYNTAX_ERROR),
 					 errmsg("manifest checksums require a backup manifest")));
 		opt->manifest_checksum_type = CHECKSUM_TYPE_NONE;
 	}

 	/*
 	 * Work out where the backup goes: to the client when no target or the
 	 * target "client" was given, otherwise to whatever the target handle
 	 * stands for.
 	 */
 	if (target_str == NULL)
 	{
 		if (target_detail_str != NULL)
 			ereport(ERROR,
 					(errcode(ERRCODE_SYNTAX_ERROR),
 					 errmsg("target detail cannot be used without target")));
 		opt->use_copytblspc = true;
 		opt->send_to_client = true;
 	}
 	else if (strcmp(target_str, "client") == 0)
 	{
 		if (target_detail_str != NULL)
 			ereport(ERROR,
 					(errcode(ERRCODE_SYNTAX_ERROR),
 					 errmsg("target \"%s\" does not accept a target detail",
 							target_str)));
 		opt->send_to_client = true;
 	}
 	else
 		opt->target_handle =
 			BaseBackupGetTargetHandle(target_str, target_detail_str);

 	if (o_compression_detail && !o_compression)
 		ereport(ERROR,
 				(errcode(ERRCODE_SYNTAX_ERROR),
 				 errmsg("compression detail cannot be specified unless compression is enabled")));

 	if (o_compression)
 	{
 		char	   *error_detail;

 		/* Turn algorithm + detail string into a checked specification. */
 		parse_compress_specification(opt->compression, compression_detail_str,
 									 &opt->compression_specification);
 		error_detail =
 			validate_compress_specification(&opt->compression_specification);
 		if (error_detail != NULL)
 			ereport(ERROR,
 					errcode(ERRCODE_SYNTAX_ERROR),
 					errmsg("invalid compression specification: %s",
 						   error_detail));
 	}
 }


 /*
  * SendBaseBackup() - send a complete base backup.
  *
  * Parses the options attached to 'cmd', stacks up the chain of bbsinks that
  * the options call for, and then hands control to perform_base_backup().
  *
  * The function will put the system into backup mode like pg_backup_start()
  * does, so that the backup is consistent even though we read directly from
  * the filesystem, bypassing the buffer cache.
  *
  * Raises an error if this session already has a backup in progress.
  */
 void
 SendBaseBackup(BaseBackupCmd *cmd)
 {
 	basebackup_options opt;
 	bbsink	   *sink;

 	/* Refuse to start a second backup within the same session. */
 	if (get_backup_status() == SESSION_BACKUP_RUNNING)
 		ereport(ERROR,
 				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
 				 errmsg("a backup is already in progress in this session")));

 	parse_basebackup_options(cmd->options, &opt);

 	WalSndSetState(WALSNDSTATE_BACKUP);

 	/* Advertise what we are doing in the process title, if enabled. */
 	if (update_process_title)
 	{
 		char		title[50];

 		snprintf(title, sizeof(title), "sending backup \"%s\"", opt.label);
 		set_ps_display(title);
 	}

 	/*
 	 * The innermost sink is always a copystream sink; whether it actually
 	 * forwards the archive data to the client depends on send_to_client.
 	 * When a target handle was supplied, BaseBackupGetSink layers the sink
 	 * for that target on top of it, so the data goes wherever the target
 	 * says it should.
 	 */
 	sink = bbsink_copystream_new(opt.send_to_client);
 	if (opt.target_handle != NULL)
 		sink = BaseBackupGetSink(opt.target_handle, sink);

 	/* Network throttling, only when a positive rate was requested. */
 	if (opt.maxrate > 0)
 		sink = bbsink_throttle_new(sink, opt.maxrate);

 	/* Server-side compression, only for the algorithms handled here. */
 	switch (opt.compression)
 	{
 		case PG_COMPRESSION_GZIP:
 			sink = bbsink_gzip_new(sink, &opt.compression_specification);
 			break;
 		case PG_COMPRESSION_LZ4:
 			sink = bbsink_lz4_new(sink, &opt.compression_specification);
 			break;
 		case PG_COMPRESSION_ZSTD:
 			sink = bbsink_zstd_new(sink, &opt.compression_specification);
 			break;
 		default:
 			/* anything else: no compression sink is added */
 			break;
 	}

 	/* Progress reporting is always the outermost sink. */
 	sink = bbsink_progress_new(sink, opt.progress);

 	/*
 	 * Run the backup. The bbsink must be cleaned up whether or not
 	 * perform_base_backup() throws, hence PG_FINALLY.
 	 */
 	PG_TRY();
 	{
 		perform_base_backup(&opt, sink);
 	}
 	PG_FINALLY();
 	{
 		bbsink_cleanup(sink);
 	}
 	PG_END_TRY();
 }

 /*
  * Inject a file with given name and content in the output tar stream.
  *
  * 'content' is a NUL-terminated string; its strlen() is the size of the
  * archive member.  A tar header is synthesized for it, the bytes are pushed
  * through 'sink' in buffer-sized pieces, the member is padded to a tar block
  * boundary, and an entry for it is added to 'manifest'.
  */
 static void
 sendFileWithContent(bbsink *sink, const char *filename, const char *content,
 					backup_manifest_info *manifest)
 {
 	pg_checksum_context checksum_ctx;
 	struct stat statbuf;
 	const char *cursor;
 	int			total;
 	int			sent;

 	if (pg_checksum_init(&checksum_ctx, manifest->checksum_type) < 0)
 		elog(ERROR, "could not initialize checksum of file \"%s\"",
 			 filename);

 	total = strlen(content);

 	/*
 	 * There is no file on disk to stat, so fill in by hand the fields that
 	 * the tar header is built from.
 	 */
 	statbuf.st_size = total;
 	statbuf.st_mode = pg_file_create_mode;
 	statbuf.st_mtime = time(NULL);
 #ifdef WIN32
 	/* Windows doesn't have the concept of uid and gid */
 	statbuf.st_uid = 0;
 	statbuf.st_gid = 0;
 #else
 	statbuf.st_uid = geteuid();
 	statbuf.st_gid = getegid();
 #endif

 	_tarWriteHeader(sink, filename, NULL, &statbuf, false);

 	elogif(debug_basebackup, LOG,
 			"basebackup send file -- Sent file '%s' with content \n%s.",
 			filename, content);

 	/* The manifest checksum covers the whole content in one go. */
 	if (pg_checksum_update(&checksum_ctx, (uint8 *) content, total) < 0)
 		elog(ERROR, "could not update checksum of file \"%s\"",
 			 filename);

 	/* Copy the content into the sink buffer one buffer-load at a time. */
 	cursor = content;
 	for (sent = 0; sent < total;)
 	{
 		size_t		chunk = total - sent;

 		if (sink->bbs_buffer_length < chunk)
 			chunk = sink->bbs_buffer_length;

 		memcpy(sink->bbs_buffer, cursor, chunk);
 		bbsink_archive_contents(sink, chunk);

 		sent += chunk;
 		cursor += chunk;
 	}

 	_tarWritePadding(sink, total);

 	AddFileToBackupManifest(manifest, NULL, filename, total,
 							(pg_time_t) statbuf.st_mtime, &checksum_ctx);
 }

 /*
  * Include the tablespace directory pointed to by 'path' in the output tar
  * stream.  If 'sizeonly' is true, we just calculate a total length and return
  * it, without actually sending anything.
  *
  * Returns the accumulated size, or 0 if the version directory does not exist.
  *
  * Only used to send auxiliary tablespaces, not PGDATA.
  */
 static int64
 sendTablespace(bbsink *sink, char *path, char *spcoid, bool sizeonly,
 			   backup_manifest_info *manifest)
 {
 	char		versiondir[MAXPGPATH];
 	struct stat st;
 	int64		total;

 	/*
 	 * The tablespace location may hold several version directories; only
 	 * the one that belongs to us goes into the backup.
 	 */
 	snprintf(versiondir, sizeof(versiondir), "%s/%s", path,
 			 GP_TABLESPACE_VERSION_DIRECTORY);

 	elogif(debug_basebackup, LOG,
 		   "sendTablespace -- Sending tablespace version directory = %s", versiondir);

 	if (lstat(versiondir, &st) != 0)
 	{
 		/* If the tablespace went away while scanning, it's no error. */
 		if (errno == ENOENT)
 			return 0;

 		ereport(ERROR,
 				(errcode_for_file_access(),
 				 errmsg("could not stat file or directory \"%s\": %m",
 						versiondir)));
 	}

 	/* A directory entry in the tar file, so permissions come out right. */
 	total = _tarWriteHeader(sink, GP_TABLESPACE_VERSION_DIRECTORY, NULL, &st,
 							sizeonly);

 	/* Followed by everything inside the version directory. */
 	total += sendDir(sink, versiondir, strlen(path), sizeonly, NIL, true,
 					 manifest, spcoid, NULL);

 	return total;
 }

 /*
  * Check if client EXCLUDE option matches this path.  Current implementation
  * is only the exact match for the relative path from the datadir root (e.g.
  * "./log" etc).
  *
  * Returns false when no exclude table was supplied at all.
  */
 static bool
 match_exclude_list(char *path, HTAB *exclude)
 {
 	bool		found = false;

 	/* The common case: the client did not ask to exclude anything. */
 	if (likely(exclude == NULL))
 		return false;

 	/* Only the found flag matters; the entry itself is not needed. */
 	(void) hash_search(exclude, path, HASH_FIND, &found);

 	return found;
 }

 /*
  * Include all files from the given directory in the output tar stream. If
  * 'sizeonly' is true, we just calculate a total length and return it, without
  * actually sending anything.
  *
  * Omit any directory in the tablespaces list, to avoid backing up
  * tablespaces twice when they were created inside PGDATA.
  *
  * If sendtblspclinks is true, we need to include symlink
  * information in the tar file. If not, we can skip that
  * as it will be sent separately in the tablespace_map file.
  *
  * GPDB: Also omit any files in the 'exclude' list.
  *
  * 'basepathlen' is the length of the leading part of 'path' that is stripped
  * (together with the following separator) to form archive member names.
  * 'manifest' and 'spcoid' are handed through to sendFile() and to recursive
  * calls.
  *
  * Returns the number of bytes accounted for by this directory's entries
  * (tar headers, file contents and padding), including subdirectories.
  */
 static int64
 sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeonly,
 		List *tablespaces, bool sendtblspclinks, backup_manifest_info *manifest,
 		const char *spcoid, HTAB *exclude)
 {
 	DIR		   *dir;
 	struct dirent *de;
 	char		pathbuf[MAXPGPATH * 2];
 	struct stat statbuf;
 	int64		size = 0;
 	const char *lastDir;		/* Split last dir from parent path. */
 	bool		isDbDir = false;	/* Does this directory contain relations? */

 	/*
 	 * Determine if the current path is a database directory that can contain
 	 * relations.
 	 *
 	 * Start by finding the location of the delimiter between the parent path
 	 * and the current path.
 	 */
 	lastDir = last_dir_separator(path);

 	/* Does this path look like a database path (i.e. all digits)? */
 	if (lastDir != NULL &&
 		strspn(lastDir + 1, "0123456789") == strlen(lastDir + 1))
 	{
 		/* Part of path that contains the parent directory. */
 		int			parentPathLen = lastDir - path;

 		/*
 		 * Mark path as a database directory if the parent path is either
 		 * $PGDATA/base or a tablespace version path.
 		 */
 		if (strncmp(path, "./base", parentPathLen) == 0 ||
 			(parentPathLen >= (sizeof(GP_TABLESPACE_VERSION_DIRECTORY) - 1) &&
 			 strncmp(lastDir - (sizeof(GP_TABLESPACE_VERSION_DIRECTORY) - 1),
 					 GP_TABLESPACE_VERSION_DIRECTORY,
 					 sizeof(GP_TABLESPACE_VERSION_DIRECTORY) - 1) == 0))
 			isDbDir = true;
 	}

 	dir = AllocateDir(path);
 	while ((de = ReadDir(dir, path)) != NULL)
 	{
 		int			excludeIdx;
 		bool		excludeFound;
 		ForkNumber	relForkNum; /* Type of fork if file is a relation */
 		int			relnumchars;	/* Chars in filename that are the
 									 * relnumber */

 		/* Skip special stuff */
 		if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
 			continue;

 		/* Skip temporary files */
 		if (strncmp(de->d_name,
 					PG_TEMP_FILE_PREFIX,
 					strlen(PG_TEMP_FILE_PREFIX)) == 0)
 			continue;

 		/* Skip macOS system files */
 		if (strcmp(de->d_name, ".DS_Store") == 0)
 			continue;

 		/*
 		 * Check if the postmaster has signaled us to exit, and abort with an
 		 * error in that case. The error handler further up will call
 		 * do_pg_abort_backup() for us. Also check that if the backup was
 		 * started while still in recovery, the server wasn't promoted.
 		 * do_pg_backup_stop() will check that too, but it's better to stop
 		 * the backup early than continue to the end and fail there.
 		 */
 		CHECK_FOR_INTERRUPTS();
 		if (RecoveryInProgress() != backup_started_in_recovery)
 			ereport(ERROR,
 					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
 					 errmsg("the standby was promoted during online backup"),
 					 errhint("This means that the backup being taken is corrupt "
 							 "and should not be used. "
 							 "Try taking another online backup.")));

 		/*
 		 * Scan for files that should be excluded.  Entries flagged
 		 * match_prefix are compared on their own length only; for the others
 		 * the terminating NUL is included in the comparison, which makes it
 		 * an exact match.
 		 */
 		excludeFound = false;
 		for (excludeIdx = 0; excludeFiles[excludeIdx].name != NULL; excludeIdx++)
 		{
 			int			cmplen = strlen(excludeFiles[excludeIdx].name);

 			if (!excludeFiles[excludeIdx].match_prefix)
 				cmplen++;
 			if (strncmp(de->d_name, excludeFiles[excludeIdx].name, cmplen) == 0)
 			{
 				elog(DEBUG1, "file \"%s\" excluded from backup", de->d_name);
 				excludeFound = true;
 				break;
 			}
 		}

 		if (excludeFound)
 			continue;

 		/* Exclude all forks for unlogged tables except the init fork */
 		if (isDbDir &&
 			parse_filename_for_nontemp_relation(de->d_name, &relnumchars,
 												&relForkNum))
 		{
 			/* Never exclude init forks */
 			if (relForkNum != INIT_FORKNUM)
 			{
 				char		initForkFile[MAXPGPATH];
 				char		relNumber[OIDCHARS + 1];

 				/*
 				 * If any other type of fork, check if there is an init fork
 				 * with the same RelFileNumber. If so, the file can be
 				 * excluded.
 				 */
 				memcpy(relNumber, de->d_name, relnumchars);
 				relNumber[relnumchars] = '\0';
 				snprintf(initForkFile, sizeof(initForkFile), "%s/%s_init",
 						 path, relNumber);

 				if (lstat(initForkFile, &statbuf) == 0)
 				{
 					elog(DEBUG2,
 						 "unlogged relation file \"%s\" excluded from backup",
 						 de->d_name);

 					continue;
 				}
 			}
 		}

 		/* Exclude temporary relations */
 		if (isDbDir && looks_like_temp_rel_name(de->d_name))
 		{
 			elog(DEBUG2,
 				 "temporary relation file \"%s\" excluded from backup",
 				 de->d_name);

 			continue;
 		}

 		snprintf(pathbuf, sizeof(pathbuf), "%s/%s", path, de->d_name);

 		/* Skip pg_control here to back up it last */
 		if (strcmp(pathbuf, "./global/pg_control") == 0)
 			continue;

 		if (lstat(pathbuf, &statbuf) != 0)
 		{
 			if (errno != ENOENT)
 				ereport(ERROR,
 						(errcode_for_file_access(),
 						 errmsg("could not stat file or directory \"%s\": %m",
 								pathbuf)));

 			/* If the file went away while scanning, it's not an error. */
 			continue;
 		}

 		/*
 		 * Scan for directories whose contents should be excluded.  The
 		 * directory entry itself is still written, just nothing below it.
 		 */
 		excludeFound = false;
 		for (excludeIdx = 0; excludeDirContents[excludeIdx] != NULL; excludeIdx++)
 		{
 			if (strcmp(de->d_name, excludeDirContents[excludeIdx]) == 0)
 			{
 				elog(DEBUG1, "contents of directory \"%s\" excluded from backup", de->d_name);
 				convert_link_to_directory(pathbuf, &statbuf);
 				size += _tarWriteHeader(sink, pathbuf + basepathlen + 1, NULL,
 										&statbuf, sizeonly);
 				excludeFound = true;
 				break;
 			}
 		}

 		if (excludeFound)
 			continue;

 		/*
 		 * We can skip pg_wal, the WAL segments need to be fetched from the
 		 * WAL archive anyway. But include it as an empty directory anyway, so
 		 * we get permissions right.
 		 */
 		if (strcmp(pathbuf, "./pg_wal") == 0)
 		{
 			/* If pg_wal is a symlink, write it as a directory anyway */
 			convert_link_to_directory(pathbuf, &statbuf);
 			size += _tarWriteHeader(sink, pathbuf + basepathlen + 1, NULL,
 									&statbuf, sizeonly);

 			/*
 			 * Also send archive_status directory (by hackishly reusing
 			 * statbuf from above ...).
 			 */
 			size += _tarWriteHeader(sink, "./pg_wal/archive_status", NULL,
 									&statbuf, sizeonly);

 			continue;			/* don't recurse into pg_wal */
 		}

 		/* Skip if client does not want */
 		if (match_exclude_list(pathbuf, exclude))
 			continue;

 		/* Allow symbolic links in pg_tblspc only */
 		if (strcmp(path, "./pg_tblspc") == 0 && S_ISLNK(statbuf.st_mode))
 		{
 			char		linkpath[MAXPGPATH];
 			char	   *dbid_sep;
 			int			rllen;

 			rllen = readlink(pathbuf, linkpath, sizeof(linkpath));
 			if (rllen < 0)
 				ereport(ERROR,
 						(errcode_for_file_access(),
 						 errmsg("could not read symbolic link \"%s\": %m",
 								pathbuf)));
 			if (rllen >= MAX_TARABLE_SYMLINK_PATH_LENGTH)
 				ereport(ERROR,
 						(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
 						 errmsg("symbolic link \"%s\" target is too long and will not be added to the backup",
 								pathbuf),
 						 errdetail("The symbolic link with target \"%s\" is too long. Symlink targets with length greater than %d characters would be truncated.", pathbuf, MAX_TARABLE_SYMLINK_PATH_LENGTH)));
 			linkpath[rllen] = '\0';

 			/*
 			 * Lop off the dbid before sending the link target.  The dbid is
 			 * taken to be the last path component, so the target must
 			 * contain at least one separator; strrchr() returns NULL
 			 * otherwise, and writing through that would crash the backend.
 			 */
 			dbid_sep = strrchr(linkpath, '/');
 			if (dbid_sep == NULL)
 				elog(ERROR,
 					 "symbolic link \"%s\" has target \"%s\" without a directory separator",
 					 pathbuf, linkpath);
 			*dbid_sep = '\0';

 			size += _tarWriteHeader(sink, pathbuf + basepathlen + 1, linkpath,
 									&statbuf, sizeonly);
 		}
 		else if (S_ISDIR(statbuf.st_mode))
 		{
 			bool		skip_this_dir = false;
 			ListCell   *lc;

 			/*
 			 * Store a directory entry in the tar file so we can get the
 			 * permissions right.
 			 */
 			size += _tarWriteHeader(sink, pathbuf + basepathlen + 1, NULL, &statbuf,
 									sizeonly);

 			/*
 			 * Call ourselves recursively for a directory, unless it happens
 			 * to be a separate tablespace located within PGDATA.
 			 */
 			foreach(lc, tablespaces)
 			{
 				tablespaceinfo *ti = (tablespaceinfo *) lfirst(lc);

 				/*
 				 * ti->rpath is the tablespace relative path within PGDATA, or
 				 * NULL if the tablespace has been properly located somewhere
 				 * else.
 				 *
 				 * Skip past the leading "./" in pathbuf when comparing.
 				 */
 				if (ti->rpath && strcmp(ti->rpath, pathbuf + 2) == 0)
 				{
 					skip_this_dir = true;
 					break;
 				}
 			}

 			/*
 			 * skip sending directories inside pg_tblspc, if not required.
 			 */
 			if (strcmp(pathbuf, "./pg_tblspc") == 0 && !sendtblspclinks)
 				skip_this_dir = true;

 			if (!skip_this_dir)
 				size += sendDir(sink, pathbuf, basepathlen, sizeonly, tablespaces,
 								sendtblspclinks, manifest, spcoid, exclude);
 		}
 		else if (S_ISREG(statbuf.st_mode))
 		{
 			bool		sent = false;

 			/*
 			 * When sending for real, the file only counts towards the size
 			 * if sendFile() reports that it was actually sent.
 			 */
 			if (!sizeonly)
 				sent = sendFile(sink, pathbuf, pathbuf + basepathlen + 1, &statbuf,
 								true, isDbDir ? atooid(lastDir + 1) : InvalidOid,
 								manifest, spcoid);

 			if (sent || sizeonly)
 			{
 				/* Add size. */
 				size += statbuf.st_size;

 				/* Pad to a multiple of the tar block size. */
 				size += tarPaddingBytesRequired(statbuf.st_size);

 				/* Size of the header for the file. */
 				size += TAR_BLOCK_SIZE;
 			}
 		}
 		else
 			ereport(WARNING,
 					(errmsg("skipping special file \"%s\"", pathbuf)));
 	}
 	FreeDir(dir);

 	elogif(debug_basebackup && !sizeonly, LOG,
 			"baseabckup send dir -- Sent directory %s", path);

 	return size;
 }

 /*
  * Check if a file should have its checksum validated.
  * We validate checksums on files in regular tablespaces
  * (including global and default) only, and in those there
  * are some files that are explicitly excluded.
  *
  * 'fullpath' decides whether the file is in a tablespace at all; 'filename'
  * (the last path component) is what gets compared against the skip list.
  */
 static bool
 is_checksummed_file(const char *fullpath, const char *filename)
 {
 	int			idx;

 	/*
 	 * Anything that is neither under ./global/ or ./base/ nor addressed by
 	 * an absolute path is not in a tablespace, and is never verified.
 	 */
 	if (strncmp(fullpath, "./global/", 9) != 0 &&
 		strncmp(fullpath, "./base/", 7) != 0 &&
 		strncmp(fullpath, "/", 1) != 0)
 		return false;

 	/*
 	 * Walk the noChecksumFiles skip list.  For entries that are not prefix
 	 * matches, the comparison length is bumped by one so that it takes in
 	 * the terminating NUL, i.e. the names must be equal.
 	 */
 	for (idx = 0; noChecksumFiles[idx].name != NULL; idx++)
 	{
 		int			cmplen = strlen(noChecksumFiles[idx].name);

 		if (!noChecksumFiles[idx].match_prefix)
 			cmplen++;

 		if (strncmp(filename, noChecksumFiles[idx].name, cmplen) == 0)
 			return false;
 	}

 	return true;
 }

 /*
  * Given the member, write the TAR header & send the file.
  *
  * 'readfilename' is the path that is opened and read; 'tarfilename' is the
  * name written into the tar header and the backup manifest.  'statbuf'
  * supplies the metadata for the header, and its st_size fixes how many bytes
  * are sent: data past that point is ignored, and a file that turns out to be
  * shorter is padded with zeros up to that size.
  *
  * If 'missing_ok' is true, will not throw an error if the file is not found.
  *
  * If dboid is anything other than InvalidOid then any checksum failures
  * detected will get reported to the cumulative stats system.
  *
  * 'manifest' determines the checksum type computed over the file and
  * receives an entry for it; 'spcoid' is passed along with that entry.
  *
  * Returns true if the file was successfully sent, false if 'missing_ok',
  * and the file did not exist.
  */
 static bool
 sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
 		 struct stat *statbuf, bool missing_ok, Oid dboid,
 		 backup_manifest_info *manifest, const char *spcoid)
 {
 	int			fd;
 	BlockNumber blkno = 0;
 	bool		block_retry = false;
 	uint16		checksum;
 	int			checksum_failures = 0;
 	off_t		cnt;
 	int			i;
 	pgoff_t		len = 0;
 	char	   *page;
 	PageHeader	phdr;
 	int			segmentno = 0;
 	char	   *segmentpath;
 	bool		verify_checksum = false;
 	pg_checksum_context checksum_ctx;

 	if (pg_checksum_init(&checksum_ctx, manifest->checksum_type) < 0)
 		elog(ERROR, "could not initialize checksum of file \"%s\"",
 			 readfilename);

 	fd = OpenTransientFile(readfilename, O_RDONLY | PG_BINARY);
 	if (fd < 0)
 	{
 		/* A vanished file is tolerated only when the caller said so. */
 		if (errno == ENOENT && missing_ok)
 			return false;
 		ereport(ERROR,
 				(errcode_for_file_access(),
 				 errmsg("could not open file \"%s\": %m", readfilename)));
 	}

 	_tarWriteHeader(sink, tarfilename, NULL, statbuf, false);

 	/*
 	 * Decide whether page checksums get verified for this file: only when
 	 * verification was not disabled, data checksums are enabled, and
 	 * is_checksummed_file() accepts the file.
 	 */
 	if (!noverify_checksums && DataChecksumsEnabled())
 	{
 		char	   *filename;

 		/*
 		 * Get the filename (excluding path).  As last_dir_separator()
 		 * includes the last directory separator, we chop that off by
 		 * incrementing the pointer.
 		 */
 		filename = last_dir_separator(readfilename) + 1;

 		if (is_checksummed_file(readfilename, filename))
 		{
 			verify_checksum = true;

 			/*
 			 * Cut off at the segment boundary (".") to get the segment number
 			 * in order to mix it into the checksum.
 			 */
 			segmentpath = strstr(filename, ".");
 			if (segmentpath != NULL)
 			{
 				segmentno = atoi(segmentpath + 1);
 				/* A name with a "." must carry a nonzero segment number. */
 				if (segmentno == 0)
 					ereport(ERROR,
 							(errmsg("invalid segment number %d in file \"%s\"",
 									segmentno, filename)));
 			}
 		}
 	}

 	/*
 	 * Loop until we read the amount of data the caller told us to expect. The
 	 * file could be longer, if it was extended while we were sending it, but
 	 * for a base backup we can ignore such extended data. It will be restored
 	 * from WAL.
 	 */
 	while (len < statbuf->st_size)
 	{
 		size_t		remaining = statbuf->st_size - len;

 		/* Try to read some more data. */
 		cnt = basebackup_read_file(fd, sink->bbs_buffer,
 								   Min(sink->bbs_buffer_length, remaining),
 								   len, readfilename, true);

 		/*
 		 * The checksums are verified at block level, so we iterate over the
 		 * buffer in chunks of BLCKSZ.  That requires the sink buffer length
 		 * to be divisible by BLCKSZ (asserted below) and the read to have
 		 * returned a multiple of BLCKSZ bytes; if the latter does not hold,
 		 * verification is switched off for the rest of this file.
 		 */
 		Assert((sink->bbs_buffer_length % BLCKSZ) == 0);

 		if (verify_checksum && (cnt % BLCKSZ != 0))
 		{
 			ereport(WARNING,
 					(errmsg("could not verify checksum in file \"%s\", block "
 							"%u: read buffer size %d and page size %d "
 							"differ",
 							readfilename, blkno, (int) cnt, BLCKSZ)));
 			verify_checksum = false;
 		}

 		if (verify_checksum)
 		{
 			for (i = 0; i < cnt / BLCKSZ; i++)
 			{
 				page = sink->bbs_buffer + BLCKSZ * i;

 				/*
 				 * Only check pages which have not been modified since the
 				 * start of the base backup. Otherwise, they might have been
 				 * written only halfway and the checksum would not be valid.
 				 * However, replaying WAL would reinstate the correct page in
 				 * this case. We also skip completely new pages, since they
 				 * don't have a checksum yet.
 				 */
 				if (!PageIsNew(page) && PageGetLSN(page) < sink->bbs_state->startptr)
 				{
 					/* blkno is per-file; add the segment's block offset. */
 					checksum = pg_checksum_page((char *) page, blkno + segmentno * RELSEG_SIZE);
 					phdr = (PageHeader) page;
 					if (phdr->pd_checksum != checksum)
 					{
 						/*
 						 * Retry the block on the first failure.  It's
 						 * possible that we read the first 4K page of the
 						 * block just before postgres updated the entire block
 						 * so it ends up looking torn to us. If, before we
 						 * retry the read, the concurrent write of the block
 						 * finishes, the page LSN will be updated and we'll
 						 * realize that we should ignore this block.
 						 *
 						 * There's no guarantee that this will actually
 						 * happen, though: the torn write could take an
 						 * arbitrarily long time to complete. Retrying
 						 * multiple times wouldn't fix this problem, either,
 						 * though it would reduce the chances of it happening
 						 * in practice. The only real fix here seems to be to
 						 * have some kind of interlock that allows us to wait
 						 * until we can be certain that no write to the block
 						 * is in progress. Since we don't have any such thing
 						 * right now, we just do this and hope for the best.
 						 */
 						if (block_retry == false)
 						{
 							int			reread_cnt;

 							/* Reread the failed block */
 							reread_cnt =
 								basebackup_read_file(fd,
 													 sink->bbs_buffer + BLCKSZ * i,
 													 BLCKSZ, len + BLCKSZ * i,
 													 readfilename,
 													 false);
 							if (reread_cnt == 0)
 							{
 								/*
 								 * If we hit end-of-file, a concurrent
 								 * truncation must have occurred, so break out
 								 * of this loop just as if the initial fread()
 								 * returned 0. We'll drop through to the same
 								 * code that handles that case. (We must fix
 								 * up cnt first, though.)
 								 */
 								cnt = BLCKSZ * i;
 								break;
 							}

 							/* Set flag so we know a retry was attempted */
 							block_retry = true;

 							/* Reset loop to validate the block again */
 							i--;
 							continue;
 						}

 						/* The retry failed too: count it as a real failure. */
 						checksum_failures++;

 						/* Only the first five failures per file are itemized. */
 						if (checksum_failures <= 5)
 							ereport(WARNING,
 									(errmsg("checksum verification failed in "
 											"file \"%s\", block %u: calculated "
 											"%X but expected %X",
 											readfilename, blkno, checksum,
 											phdr->pd_checksum)));
 						if (checksum_failures == 5)
 							ereport(WARNING,
 									(errmsg("further checksum verification "
 											"failures in file \"%s\" will not "
 											"be reported", readfilename)));
 					}
 				}
 				/* Done with this block; the next one gets its own retry. */
 				block_retry = false;
 				blkno++;
 			}
 		}

 		/*
 		 * If we hit end-of-file, a concurrent truncation must have occurred.
 		 * That's not an error condition, because WAL replay will fix things
 		 * up.
 		 */
 		if (cnt == 0)
 			break;

 		/* Archive the data we just read. */
 		bbsink_archive_contents(sink, cnt);

 		/* Also feed it to the checksum machinery. */
 		if (pg_checksum_update(&checksum_ctx,
 							   (uint8 *) sink->bbs_buffer, cnt) < 0)
 			elog(ERROR, "could not update checksum of base backup");

 		len += cnt;
 	}

 	/* If the file was truncated while we were sending it, pad it with zeros */
 	while (len < statbuf->st_size)
 	{
 		size_t		remaining = statbuf->st_size - len;
 		size_t		nbytes = Min(sink->bbs_buffer_length, remaining);

 		MemSet(sink->bbs_buffer, 0, nbytes);
 		if (pg_checksum_update(&checksum_ctx,
 							   (uint8 *) sink->bbs_buffer,
 							   nbytes) < 0)
 			elog(ERROR, "could not update checksum of base backup");
 		bbsink_archive_contents(sink, nbytes);
 		len += nbytes;
 	}

 	/*
 	 * Pad to a block boundary, per tar format requirements. (This small piece
 	 * of data is probably not worth throttling, and is not checksummed
 	 * because it's not actually part of the file.)
 	 */
 	_tarWritePadding(sink, len);

 	CloseTransientFile(fd);

 	/*
 	 * NOTE(review): with the "> 1" test below, a file with exactly one
 	 * checksum failure gets neither the summary warning nor the
 	 * pgstat_report_checksum_failures_in_db() call, and the singular form of
 	 * the message can never be chosen.  The header comment speaks of "any
 	 * checksum failures" being reported -- confirm whether "> 0" was
 	 * intended.  (The single failure is still added to
 	 * total_checksum_failures.)
 	 */
 	if (checksum_failures > 1)
 	{
 		ereport(WARNING,
 				(errmsg_plural("file \"%s\" has a total of %d checksum verification failure",
 							   "file \"%s\" has a total of %d checksum verification failures",
 							   checksum_failures,
 							   readfilename, checksum_failures)));

 		pgstat_report_checksum_failures_in_db(dboid, checksum_failures);
 	}

 	total_checksum_failures += checksum_failures;

 	/* The manifest records the size the caller expected, not bytes read. */
 	AddFileToBackupManifest(manifest, spcoid, tarfilename, statbuf->st_size,
 							(pg_time_t) statbuf->st_mtime, &checksum_ctx);

 	return true;
 }

 /*
  * Build a tar header for one archive member in the sink's buffer and archive
  * it.  'linktarget' is handed to tarCreateHeader() as the symlink target and
  * is NULL for members that are not symlinks; the remaining header fields come
  * from 'statbuf'.
  *
  * If 'sizeonly' is true, nothing is built or sent.  Either way the return
  * value is TAR_BLOCK_SIZE, the space the header takes up in the archive.
  */
 static int64
 _tarWriteHeader(bbsink *sink, const char *filename, const char *linktarget,
 				struct stat *statbuf, bool sizeonly)
 {
 	enum tarError status;

 	/*
 	 * As of this writing, the smallest supported block size is 1kB, which is
 	 * twice TAR_BLOCK_SIZE. Since the buffer size is required to be a
 	 * multiple of BLCKSZ, it should be safe to assume that the buffer is
 	 * large enough to fit an entire tar block. We double-check by means of
 	 * this static assertion and the run-time one further down.
 	 */
 	StaticAssertDecl(TAR_BLOCK_SIZE <= BLCKSZ,
 					 "BLCKSZ too small for tar block");

 	/* When only sizing things up, the header's length is all that matters. */
 	if (sizeonly)
 		return TAR_BLOCK_SIZE;

 	Assert(sink->bbs_buffer_length >= TAR_BLOCK_SIZE);

 	status = tarCreateHeader(sink->bbs_buffer, filename, linktarget,
 							 statbuf->st_size, statbuf->st_mode,
 							 statbuf->st_uid, statbuf->st_gid,
 							 statbuf->st_mtime);

 	switch (status)
 	{
 		case TAR_OK:
 			break;
 		case TAR_NAME_TOO_LONG:
 			ereport(ERROR,
 					(errmsg("file name too long for tar format: \"%s\"",
 							filename)));
 			break;
 		case TAR_SYMLINK_TOO_LONG:
 			ereport(ERROR,
 					(errmsg("symbolic link target too long for tar format: "
 							"file name \"%s\", target \"%s\"",
 							filename, linktarget)));
 			break;
 		default:
 			elog(ERROR, "unrecognized tar error: %d", status);
 	}

 	bbsink_archive_contents(sink, TAR_BLOCK_SIZE);

 	return TAR_BLOCK_SIZE;
 }

 /*
  * Pad with zero bytes out to a multiple of TAR_BLOCK_SIZE.
  *
  * 'len' is the length of the member data just written; nothing is sent when
  * tarPaddingBytesRequired() says no padding is needed.
  */
 static void
 _tarWritePadding(bbsink *sink, int len)
 {
 	int			npad = tarPaddingBytesRequired(len);

 	/*
 	 * As in _tarWriteHeader, it should be safe to assume that the buffer is
 	 * large enough that we don't need to do this in multiple chunks.
 	 */
 	Assert(sink->bbs_buffer_length >= TAR_BLOCK_SIZE);
 	Assert(npad <= TAR_BLOCK_SIZE);

 	/* Already on a block boundary: nothing to emit. */
 	if (npad <= 0)
 		return;

 	MemSet(sink->bbs_buffer, 0, npad);
 	bbsink_archive_contents(sink, npad);
 }

 /*
  * If the entry in statbuf is a link, then adjust statbuf to make it look like a
  * directory, so that it will be written that way.
  *
  * Only st_mode is touched, and only for symlinks; 'pathbuf' is not used.
  */
 static void
 convert_link_to_directory(const char *pathbuf, struct stat *statbuf)
 {
 	/* Anything that is not a symlink is left exactly as it was. */
 	if (!S_ISLNK(statbuf->st_mode))
 		return;

 	/* Symlink: present it as a directory with the default directory mode. */
 	statbuf->st_mode = S_IFDIR | pg_dir_create_mode;
 }

 /*
  * Read some data from a file, setting a wait event and reporting any error
  * encountered.
  *
  * Up to 'nbytes' bytes are read into 'buf' from position 'offset' of 'fd';
  * 'filename' is used only in error messages.
  *
  * If partial_read_ok is false, also report an error if the number of bytes
  * read is not equal to the number of bytes requested.  A read of zero bytes
  * is never treated as an error here; it is returned to the caller.
  *
  * Returns the number of bytes read.
  */
 static int
 basebackup_read_file(int fd, char *buf, size_t nbytes, off_t offset,
 					 const char *filename, bool partial_read_ok)
 {
 	int			nread;

 	/* Bracket the read with the wait event so it shows up in monitoring. */
 	pgstat_report_wait_start(WAIT_EVENT_BASEBACKUP_READ);
 	nread = pg_pread(fd, buf, nbytes, offset);
 	pgstat_report_wait_end();

 	if (nread < 0)
 		ereport(ERROR,
 				(errcode_for_file_access(),
 				 errmsg("could not read file \"%s\": %m", filename)));

 	/* A short, but nonempty, read is fatal unless the caller allows it. */
 	if (nread > 0 && nread != nbytes && !partial_read_ok)
 		ereport(ERROR,
 				(errcode_for_file_access(),
 				 errmsg("could not read file \"%s\": read %d of %zu",
 						filename, nread, nbytes)));

 	return nread;
 }