| /*------------------------------------------------------------------------- |
| * |
| * basebackup.c |
| * code for taking a base backup and streaming it to a standby |
| * |
| * Portions Copyright (c) 2010-2023, PostgreSQL Global Development Group |
| * |
| * IDENTIFICATION |
| * src/backend/backup/basebackup.c |
| * |
| *------------------------------------------------------------------------- |
| */ |
| #include "postgres.h" |
| |
| #include <sys/stat.h> |
| #include <unistd.h> |
| #include <time.h> |
| |
| #include "access/xlog_internal.h" |
| #include "access/xlogbackup.h" |
| #include "backup/backup_manifest.h" |
| #include "backup/basebackup.h" |
| #include "backup/basebackup_sink.h" |
| #include "backup/basebackup_target.h" |
| #include "commands/defrem.h" |
| #include "common/compression.h" |
| #include "common/file_perm.h" |
| #include "common/kmgr_utils.h" |
| #include "commands/progress.h" |
| #include "lib/stringinfo.h" |
| #include "miscadmin.h" |
| #include "nodes/pg_list.h" |
| #include "pgstat.h" |
| #include "pgtar.h" |
| #include "port.h" |
| #include "postmaster/syslogger.h" |
| #include "replication/walsender.h" |
| #include "replication/walsender_private.h" |
| #include "storage/bufpage.h" |
| #include "storage/checksum.h" |
| #include "storage/dsm_impl.h" |
| #include "storage/fd.h" |
| #include "storage/ipc.h" |
| #include "storage/reinit.h" |
| #include "utils/builtins.h" |
| #include "utils/guc.h" |
| #include "utils/ps_status.h" |
| #include "utils/relcache.h" |
| #include "utils/resowner.h" |
| #include "utils/timestamp.h" |
| |
| #include "access/genam.h" |
| #include "access/hash.h" |
| #include "access/xact.h" |
| #include "cdb/cdbvars.h" |
| #include "catalog/catalog.h" |
| #include "catalog/indexing.h" |
| #include "catalog/pg_database.h" |
| #include "catalog/pg_tablespace.h" |
| #include "storage/lmgr.h" |
| #include "storage/proc.h" |
| #include "utils/elog.h" |
| #include "utils/fmgroids.h" |
| #include "utils/faultinjector.h" |
| #include "utils/snapmgr.h" |
| #include "utils/tarrable.h" |
/*
 * Preferred amount of data, in bytes, to put into one CopyData message.
 * The same value may also end up being the chunk size used when reading
 * the underlying files.
 *
 * NB: the buffer length has to be a multiple of the system block size, so
 * BLCKSZ is used instead whenever it is larger than the 32kB preference.
 */
#define SINK_BUFFER_LENGTH			Max(32 * 1024, BLCKSZ)
| |
| typedef struct |
| { |
| const char *label; |
| bool progress; |
| bool fastcheckpoint; |
| bool nowait; |
| bool includewal; |
| uint32 maxrate; |
| bool sendtblspcmapfile; |
| bool send_to_client; |
| bool use_copytblspc; |
| BaseBackupTargetHandle *target_handle; |
| backup_manifest_option manifest; |
| pg_compress_algorithm compression; |
| pg_compress_specification compression_specification; |
| pg_checksum_type manifest_checksum_type; |
| HTAB *exclude; |
| } basebackup_options; |
| |
| static bool match_exclude_list(char *path, HTAB *exclude); |
| |
| static int64 sendTablespace(bbsink *sink, char *path, char *spcoid, bool sizeonly, |
| struct backup_manifest_info *manifest); |
| static int64 sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeonly, |
| List *tablespaces, bool sendtblspclinks, |
| backup_manifest_info *manifest, const char *spcoid, |
| HTAB *exclude); |
| static bool sendFile(bbsink *sink, const char *readfilename, const char *tarfilename, |
| struct stat *statbuf, bool missing_ok, Oid dboid, |
| backup_manifest_info *manifest, const char *spcoid); |
| static void sendFileWithContent(bbsink *sink, const char *filename, |
| const char *content, |
| backup_manifest_info *manifest); |
| static int64 _tarWriteHeader(bbsink *sink, const char *filename, |
| const char *linktarget, struct stat *statbuf, |
| bool sizeonly); |
| static void _tarWritePadding(bbsink *sink, int len); |
| static void convert_link_to_directory(const char *pathbuf, struct stat *statbuf); |
| static void perform_base_backup(basebackup_options *opt, bbsink *sink); |
| static void parse_basebackup_options(List *options, basebackup_options *opt); |
| static int compareWalFileNames(const ListCell *a, const ListCell *b); |
| static bool is_checksummed_file(const char *fullpath, const char *filename); |
| static int basebackup_read_file(int fd, char *buf, size_t nbytes, off_t offset, |
| const char *filename, bool partial_read_ok); |
| |
/*
 * True when the backup currently in progress was started while the server
 * was in recovery; assigned from RecoveryInProgress() at backup start.
 */
static bool backup_started_in_recovery = false;

/*
 * Number of checksum failures counted during the current base backup;
 * perform_base_backup() resets it to zero before starting.
 */
static long long int total_checksum_failures;

/*
 * When true, checksums are not verified.  Set from the "verify_checksums"
 * option in parse_basebackup_options().
 */
static bool noverify_checksums = false;
| |
/*
 * One entry of an exclusion list, as used for the paths skipped by
 * checksum validation or by base backups.
 */
struct exclude_list_item
{
	/* name of the file or path to check for exclusion */
	const char *name;
	/* if true, anything that has "name" as a prefix is excluded as well */
	bool		match_prefix;
};
| |
| /* |
| * The contents of these directories are removed or recreated during server |
| * start so they are not included in backups. The directories themselves are |
| * kept and included as empty to preserve access permissions. |
| * |
| * Note: this list should be kept in sync with the filter lists in pg_rewind's |
| * filemap.c. |
| */ |
| static const char *const excludeDirContents[] = |
| { |
| /* Skip temporary crypto key directories */ |
| NEW_KMGR_DIR, |
| OLD_KMGR_DIR, |
| /* |
| * Skip temporary statistics files. PG_STAT_TMP_DIR must be skipped |
| * because extensions like pg_stat_statements store data there. |
| */ |
| PG_STAT_TMP_DIR, |
| |
| /* |
| * It is generally not useful to backup the contents of this directory |
| * even if the intention is to restore to another primary. See backup.sgml |
| * for a more detailed description. |
| */ |
| "pg_replslot", |
| |
| /* Contents removed on startup, see dsm_cleanup_for_mmap(). */ |
| PG_DYNSHMEM_DIR, |
| |
| /* Contents removed on startup, see AsyncShmemInit(). */ |
| "pg_notify", |
| |
| /* |
| * Old contents are loaded for possible debugging but are not required for |
| * normal operation, see SerialInit(). |
| */ |
| "pg_serial", |
| |
| /* Contents removed on startup, see DeleteAllExportedSnapshotFiles(). */ |
| "pg_snapshots", |
| |
| /* Contents zeroed on startup, see StartupSUBTRANS(). */ |
| "pg_subtrans", |
| |
| /* Contents unique to each segment instance. */ |
| "log", |
| |
| /* GPDB: Default gpbackup directory (backup contents) */ |
| "backups", |
| |
| /* end of list */ |
| NULL |
| }; |
| |
| /* |
| * List of files excluded from backups. |
| */ |
| static const struct exclude_list_item excludeFiles[] = |
| { |
| /* Skip auto conf temporary file. */ |
| {PG_AUTOCONF_FILENAME ".tmp", false}, |
| |
| /* Skip current log file temporary file */ |
| {LOG_METAINFO_DATAFILE_TMP, false}, |
| |
| /* |
| * Skip relation cache because it is rebuilt on startup. This includes |
| * temporary files. |
| */ |
| {RELCACHE_INIT_FILENAME, true}, |
| |
| /* |
| * backup_label and tablespace_map should not exist in a running cluster |
| * capable of doing an online backup, but exclude them just in case. |
| */ |
| {BACKUP_LABEL_FILE, false}, |
| {TABLESPACE_MAP, false}, |
| |
| /* |
| * If there's a backup_manifest, it belongs to a backup that was used to |
| * start this server. It is *not* correct for this backup. Our |
| * backup_manifest is injected into the backup separately if users want |
| * it. |
| */ |
| {"backup_manifest", false}, |
| |
| {"postmaster.pid", false}, |
| {"postmaster.opts", false}, |
| |
| /* GPDB: Default gpbackup directory (top-level directory) */ |
| {"backups", false}, |
| |
| /* end of list */ |
| {NULL, false} |
| }; |
| |
| /* |
| * List of files excluded from checksum validation. |
| * |
| * Note: this list should be kept in sync with what pg_checksums.c |
| * includes. |
| */ |
| static const struct exclude_list_item noChecksumFiles[] = { |
| {"pg_control", false}, |
| {"pg_filenode.map", false}, |
| {"pg_internal.init", true}, |
| {"PG_VERSION", false}, |
| #ifdef EXEC_BACKEND |
| {"config_exec_params", true}, |
| #endif |
| {NULL, false} |
| }; |
| |
| /* |
| * Actually do a base backup for the specified tablespaces. |
| * |
| * This is split out mainly to avoid complaints about "variable might be |
| * clobbered by longjmp" from stupider versions of gcc. |
| * |
| * opt  - parsed backup options.  This function reads label, fastcheckpoint, |
| *        manifest, manifest_checksum_type, progress, exclude, |
| *        sendtblspcmapfile, includewal and nowait. |
| * sink - destination of the backup: every archive, the backup manifest and |
| *        the final end-of-backup notification are handed to it. |
| * |
| * Outline: |
| * 1. Create a resource owner, initialize the manifest and call |
| *    do_pg_backup_start(). |
| * 2. Inside the error-cleanup block: optionally sum up the size of each |
| *    tablespace, emit one archive per tablespace (the entry for the main |
| *    data directory is appended to the list, so it comes last), and call |
| *    do_pg_backup_stop(). |
| * 3. If WAL was requested, append the WAL segments between the start and |
| *    stop locations, plus all timeline history files, to the main archive |
| *    that was left open in step 2. |
| * 4. Add the WAL range to the manifest, send the manifest, end the backup, |
| *    and raise an ERROR if any checksum failures were counted. |
| * |
| * Nothing is returned; failures are reported with ereport(ERROR). |
| */ |
| static void |
| perform_base_backup(basebackup_options *opt, bbsink *sink) |
| { |
| bbsink_state state; |
| XLogRecPtr endptr; |
| TimeLineID endtli; |
| backup_manifest_info manifest; |
| BackupState *backup_state; |
| StringInfo tablespace_map; |
| |
| /* Initial backup state, insofar as we know it now. */ |
| state.tablespaces = NIL; |
| state.tablespace_num = 0; |
| state.bytes_done = 0; |
| state.bytes_total = 0; |
| state.bytes_total_is_valid = false; |
| |
| /* we're going to use a BufFile, so we need a ResourceOwner */ |
| Assert(CurrentResourceOwner == NULL); |
| CurrentResourceOwner = ResourceOwnerCreate(NULL, "base backup"); |
| |
| backup_started_in_recovery = RecoveryInProgress(); |
| |
| InitializeBackupManifest(&manifest, opt->manifest, |
| opt->manifest_checksum_type); |
| |
| total_checksum_failures = 0; |
| |
| /* |
| * Allocate backup related variables.  Both are freed again at the end |
| * of the error-cleanup block below. |
| */ |
| backup_state = (BackupState *) palloc0(sizeof(BackupState)); |
| tablespace_map = makeStringInfo(); |
| |
| basebackup_progress_wait_checkpoint(); |
| do_pg_backup_start(opt->label, opt->fastcheckpoint, &state.tablespaces, |
| backup_state, tablespace_map); |
| |
| state.startptr = backup_state->startpoint; |
| state.starttli = backup_state->starttli; |
| |
| SIMPLE_FAULT_INJECTOR("base_backup_post_create_checkpoint"); |
| |
| /* |
| * Once do_pg_backup_start has been called, ensure that any failure causes |
| * us to abort the backup so we don't "leak" a backup counter. For this |
| * reason, *all* functionality between do_pg_backup_start() and the end of |
| * do_pg_backup_stop() should be inside the error cleanup block! |
| */ |
| |
| PG_ENSURE_ERROR_CLEANUP(do_pg_abort_backup, BoolGetDatum(false)); |
| { |
| ListCell *lc; |
| tablespaceinfo *newti; |
| |
| /* |
| * Add a node for the base directory at the end.  palloc0 leaves its |
| * path NULL, which is how the loops below tell it apart from the |
| * other tablespaces. |
| */ |
| newti = palloc0(sizeof(tablespaceinfo)); |
| newti->size = -1; |
| state.tablespaces = lappend(state.tablespaces, newti); |
| |
| /* |
| * Calculate the total backup size by summing up the size of each |
| * tablespace.  Both calls below pass sizeonly = true. |
| */ |
| if (opt->progress) |
| { |
| basebackup_progress_estimate_backup_size(); |
| |
| foreach(lc, state.tablespaces) |
| { |
| tablespaceinfo *tmp = (tablespaceinfo *) lfirst(lc); |
| |
| if (tmp->path == NULL) |
| tmp->size = sendDir(sink, ".", 1, true, state.tablespaces, |
| true, NULL, NULL, opt->exclude); |
| else |
| tmp->size = sendTablespace(sink, tmp->path, tmp->oid, true, |
| NULL); |
| state.bytes_total += tmp->size; |
| } |
| state.bytes_total_is_valid = true; |
| } |
| |
| /* notify basebackup sink about start of backup */ |
| bbsink_begin_backup(sink, &state, SINK_BUFFER_LENGTH); |
| |
| /* Send off our tablespaces one by one */ |
| foreach(lc, state.tablespaces) |
| { |
| tablespaceinfo *ti = (tablespaceinfo *) lfirst(lc); |
| |
| if (ti->path == NULL) |
| { |
| struct stat statbuf; |
| bool sendtblspclinks = true; |
| char *backup_label; |
| |
| bbsink_begin_archive(sink, "base.tar"); |
| |
| /* In the main tar, include the backup_label first... */ |
| backup_label = build_backup_content(backup_state, false); |
| sendFileWithContent(sink, BACKUP_LABEL_FILE, |
| backup_label, &manifest); |
| pfree(backup_label); |
| |
| /* Then the tablespace_map file, if required... */ |
| if (opt->sendtblspcmapfile) |
| { |
| sendFileWithContent(sink, TABLESPACE_MAP, |
| tablespace_map->data, &manifest); |
| sendtblspclinks = false; |
| } |
| |
| /* Then the bulk of the files... */ |
| sendDir(sink, ".", 1, false, state.tablespaces, |
| sendtblspclinks, &manifest, NULL, opt->exclude); |
| |
| /* ... and pg_control after everything else. */ |
| if (lstat(XLOG_CONTROL_FILE, &statbuf) != 0) |
| ereport(ERROR, |
| (errcode_for_file_access(), |
| errmsg("could not stat file \"%s\": %m", |
| XLOG_CONTROL_FILE))); |
| sendFile(sink, XLOG_CONTROL_FILE, XLOG_CONTROL_FILE, &statbuf, |
| false, InvalidOid, &manifest, NULL); |
| } |
| else |
| { |
| char *archive_name = psprintf("%s.tar", ti->oid); |
| |
| bbsink_begin_archive(sink, archive_name); |
| |
| sendTablespace(sink, ti->path, ti->oid, false, &manifest); |
| } |
| |
| /* |
| * If we're including WAL, and this is the main data directory we |
| * don't treat this as the end of the tablespace. Instead, we will |
| * include the xlog files below and stop afterwards. This is safe |
| * since the main data directory is always sent *last*. |
| */ |
| if (opt->includewal && ti->path == NULL) |
| { |
| Assert(lnext(state.tablespaces, lc) == NULL); |
| } |
| else |
| { |
| /* Properly terminate the tarfile. */ |
| StaticAssertDecl(2 * TAR_BLOCK_SIZE <= BLCKSZ, |
| "BLCKSZ too small for 2 tar blocks"); |
| memset(sink->bbs_buffer, 0, 2 * TAR_BLOCK_SIZE); |
| bbsink_archive_contents(sink, 2 * TAR_BLOCK_SIZE); |
| |
| /* OK, that's the end of the archive. */ |
| bbsink_end_archive(sink); |
| } |
| } |
| |
| basebackup_progress_wait_wal_archive(&state); |
| do_pg_backup_stop(backup_state, !opt->nowait); |
| |
| endptr = backup_state->stoppoint; |
| endtli = backup_state->stoptli; |
| |
| /* Deallocate backup-related variables. */ |
| pfree(tablespace_map->data); |
| pfree(tablespace_map); |
| pfree(backup_state); |
| } |
| PG_END_ENSURE_ERROR_CLEANUP(do_pg_abort_backup, BoolGetDatum(false)); |
| |
| |
| if (opt->includewal) |
| { |
| /* |
| * We've left the last tar file "open", so we can now append the |
| * required WAL files to it. |
| */ |
| char pathbuf[MAXPGPATH]; |
| XLogSegNo segno; |
| XLogSegNo startsegno; |
| XLogSegNo endsegno; |
| struct stat statbuf; |
| List *historyFileList = NIL; |
| List *walFileList = NIL; |
| char firstoff[MAXFNAMELEN]; |
| char lastoff[MAXFNAMELEN]; |
| DIR *dir; |
| struct dirent *de; |
| ListCell *lc; |
| TimeLineID tli; |
| |
| basebackup_progress_transfer_wal(); |
| |
| /* |
| * I'd rather not worry about timelines here, so scan pg_wal and |
| * include all WAL files in the range between 'startptr' and 'endptr', |
| * regardless of the timeline the file is stamped with. If there are |
| * some spurious WAL files belonging to timelines that don't belong in |
| * this server's history, they will be included too. Normally there |
| * shouldn't be such files, but if there are, there's little harm in |
| * including them. |
| */ |
| XLByteToSeg(state.startptr, startsegno, wal_segment_size); |
| XLogFileName(firstoff, state.starttli, startsegno, wal_segment_size); |
| XLByteToPrevSeg(endptr, endsegno, wal_segment_size); |
| XLogFileName(lastoff, endtli, endsegno, wal_segment_size); |
| |
| dir = AllocateDir("pg_wal"); |
| while ((de = ReadDir(dir, "pg_wal")) != NULL) |
| { |
| /* |
| * Does it look like a WAL segment, and is it in the range?  The |
| * comparison starts at offset 8 of the names, the same offset |
| * compareWalFileNames() uses to ignore the timeline portion. |
| */ |
| if (IsXLogFileName(de->d_name) && |
| strcmp(de->d_name + 8, firstoff + 8) >= 0 && |
| strcmp(de->d_name + 8, lastoff + 8) <= 0) |
| { |
| walFileList = lappend(walFileList, pstrdup(de->d_name)); |
| } |
| /* Does it look like a timeline history file? */ |
| else if (IsTLHistoryFileName(de->d_name)) |
| { |
| historyFileList = lappend(historyFileList, pstrdup(de->d_name)); |
| } |
| } |
| FreeDir(dir); |
| |
| /* |
| * Before we go any further, check that none of the WAL segments we |
| * need were removed. |
| */ |
| CheckXLogRemoved(startsegno, state.starttli); |
| |
| /* |
| * Sort the WAL filenames. We want to send the files in order from |
| * oldest to newest, to reduce the chance that a file is recycled |
| * before we get a chance to send it over. |
| */ |
| list_sort(walFileList, compareWalFileNames); |
| |
| /* |
| * There must be at least one xlog file in the pg_wal directory, since |
| * we are doing backup-including-xlog. |
| */ |
| if (walFileList == NIL) |
| ereport(ERROR, |
| (errmsg("could not find any WAL files"))); |
| |
| /* |
| * Sanity check: the first and last segment should cover startptr and |
| * endptr, with no gaps in between. |
| * |
| * The loop below accepts a file whose segment number equals either the |
| * previous file's number or that number plus one.  A repeated number |
| * can occur because the directory scan above ignores the timeline |
| * part of the file name. |
| */ |
| XLogFromFileName((char *) linitial(walFileList), |
| &tli, &segno, wal_segment_size); |
| if (segno != startsegno) |
| { |
| char startfname[MAXFNAMELEN]; |
| |
| XLogFileName(startfname, state.starttli, startsegno, |
| wal_segment_size); |
| ereport(ERROR, |
| (errmsg("could not find WAL file \"%s\"", startfname))); |
| } |
| foreach(lc, walFileList) |
| { |
| char *walFileName = (char *) lfirst(lc); |
| XLogSegNo currsegno = segno; |
| XLogSegNo nextsegno = segno + 1; |
| |
| XLogFromFileName(walFileName, &tli, &segno, wal_segment_size); |
| if (!(nextsegno == segno || currsegno == segno)) |
| { |
| char nextfname[MAXFNAMELEN]; |
| |
| XLogFileName(nextfname, tli, nextsegno, wal_segment_size); |
| ereport(ERROR, |
| (errmsg("could not find WAL file \"%s\"", nextfname))); |
| } |
| } |
| if (segno != endsegno) |
| { |
| char endfname[MAXFNAMELEN]; |
| |
| XLogFileName(endfname, endtli, endsegno, wal_segment_size); |
| ereport(ERROR, |
| (errmsg("could not find WAL file \"%s\"", endfname))); |
| } |
| |
| /* Ok, we have everything we need. Send the WAL files. */ |
| foreach(lc, walFileList) |
| { |
| char *walFileName = (char *) lfirst(lc); |
| int fd; |
| size_t cnt; |
| pgoff_t len = 0; |
| |
| snprintf(pathbuf, MAXPGPATH, XLOGDIR "/%s", walFileName); |
| XLogFromFileName(walFileName, &tli, &segno, wal_segment_size); |
| |
| fd = OpenTransientFile(pathbuf, O_RDONLY | PG_BINARY); |
| if (fd < 0) |
| { |
| int save_errno = errno; |
| |
| /* |
| * Most likely reason for this is that the file was already |
| * removed by a checkpoint, so check for that to get a better |
| * error message. |
| */ |
| CheckXLogRemoved(segno, tli); |
| |
| errno = save_errno; |
| ereport(ERROR, |
| (errcode_for_file_access(), |
| errmsg("could not open file \"%s\": %m", pathbuf))); |
| } |
| |
| if (fstat(fd, &statbuf) != 0) |
| ereport(ERROR, |
| (errcode_for_file_access(), |
| errmsg("could not stat file \"%s\": %m", |
| pathbuf))); |
| if (statbuf.st_size != wal_segment_size) |
| { |
| CheckXLogRemoved(segno, tli); |
| ereport(ERROR, |
| (errcode_for_file_access(), |
| errmsg("unexpected WAL file size \"%s\"", walFileName))); |
| } |
| |
| /* send the WAL file itself */ |
| _tarWriteHeader(sink, pathbuf, NULL, &statbuf, false); |
| |
| while ((cnt = basebackup_read_file(fd, sink->bbs_buffer, |
| Min(sink->bbs_buffer_length, |
| wal_segment_size - len), |
| len, pathbuf, true)) > 0) |
| { |
| CheckXLogRemoved(segno, tli); |
| bbsink_archive_contents(sink, cnt); |
| |
| len += cnt; |
| |
| if (len == wal_segment_size) |
| break; |
| } |
| |
| if (len != wal_segment_size) |
| { |
| CheckXLogRemoved(segno, tli); |
| ereport(ERROR, |
| (errcode_for_file_access(), |
| errmsg("unexpected WAL file size \"%s\"", walFileName))); |
| } |
| |
| elogif(debug_basebackup, LOG, |
| "basebackup perform -- Sent xlog file %s", walFileName); |
| |
| /* |
| * wal_segment_size is a multiple of TAR_BLOCK_SIZE, so no need |
| * for padding. |
| */ |
| Assert(wal_segment_size % TAR_BLOCK_SIZE == 0); |
| |
| CloseTransientFile(fd); |
| |
| /* |
| * Mark file as archived, otherwise files can get archived again |
| * after promotion of a new node. This is in line with |
| * walreceiver.c always doing an XLogArchiveForceDone() after a |
| * complete segment. |
| */ |
| StatusFilePath(pathbuf, walFileName, ".done"); |
| sendFileWithContent(sink, pathbuf, "", &manifest); |
| } |
| |
| /* |
| * Send timeline history files too. Only the latest timeline history |
| * file is required for recovery, and even that only if there happens |
| * to be a timeline switch in the first WAL segment that contains the |
| * checkpoint record, or if we're taking a base backup from a standby |
| * server and the target timeline changes while the backup is taken. |
| * But they are small and highly useful for debugging purposes, so |
| * better include them all, always. |
| */ |
| foreach(lc, historyFileList) |
| { |
| char *fname = lfirst(lc); |
| |
| snprintf(pathbuf, MAXPGPATH, XLOGDIR "/%s", fname); |
| |
| if (lstat(pathbuf, &statbuf) != 0) |
| ereport(ERROR, |
| (errcode_for_file_access(), |
| errmsg("could not stat file \"%s\": %m", pathbuf))); |
| |
| sendFile(sink, pathbuf, pathbuf, &statbuf, false, InvalidOid, |
| &manifest, NULL); |
| |
| /* unconditionally mark file as archived */ |
| StatusFilePath(pathbuf, fname, ".done"); |
| sendFileWithContent(sink, pathbuf, "", &manifest); |
| } |
| |
| /* Properly terminate the tar file. */ |
| StaticAssertStmt(2 * TAR_BLOCK_SIZE <= BLCKSZ, |
| "BLCKSZ too small for 2 tar blocks"); |
| memset(sink->bbs_buffer, 0, 2 * TAR_BLOCK_SIZE); |
| bbsink_archive_contents(sink, 2 * TAR_BLOCK_SIZE); |
| |
| /* OK, that's the end of the archive. */ |
| bbsink_end_archive(sink); |
| } |
| |
| AddWALInfoToBackupManifest(&manifest, state.startptr, state.starttli, |
| endptr, endtli); |
| |
| SendBackupManifest(&manifest, sink); |
| |
| bbsink_end_backup(sink, endptr, endtli); |
| |
| if (total_checksum_failures) |
| { |
| if (total_checksum_failures > 1) |
| ereport(WARNING, |
| (errmsg_plural("%lld total checksum verification failure", |
| "%lld total checksum verification failures", |
| total_checksum_failures, |
| total_checksum_failures))); |
| |
| ereport(ERROR, |
| (errcode(ERRCODE_DATA_CORRUPTED), |
| errmsg("checksum verification failure during base backup"))); |
| } |
| |
| /* |
| * Make sure to free the manifest before the resource owners as manifests |
| * use cryptohash contexts that may depend on resource owners (like |
| * OpenSSL). |
| */ |
| FreeBackupManifest(&manifest); |
| |
| /* clean up the resource owner we created */ |
| WalSndResourceCleanup(true); |
| |
| basebackup_progress_done(); |
| } |
| |
| /* |
| * list_sort comparison function, to compare log/seg portion of WAL segment |
| * filenames, ignoring the timeline portion. |
| */ |
| static int |
| compareWalFileNames(const ListCell *a, const ListCell *b) |
| { |
| char *fna = (char *) lfirst(a); |
| char *fnb = (char *) lfirst(b); |
| |
| return strcmp(fna + 8, fnb + 8); |
| } |
| |
| /* Hash entire string */ |
| static uint32 |
| key_string_hash(const void *key, Size keysize) |
| { |
| Size s_len = strlen((const char *) key); |
| |
| Assert(keysize == sizeof(char *)); |
| return DatumGetUInt32(hash_any((const unsigned char *) key, (int) s_len)); |
| } |
| |
| /* Compare entire string. */ |
| static int |
| key_string_compare(const void *key1, const void *key2, Size keysize) |
| { |
| Assert(keysize == sizeof(char *)); |
| |
| return strcmp(*((const char **) key1), key2); |
| } |
| |
| /* Copy string by copying pointer. */ |
| static void * |
| key_string_copy(void *dest, const void *src, Size keysize) |
| { |
| Assert(keysize == sizeof(char *)); |
| |
| *((char **) dest) = (char *) src; /* trust caller re allocation */ |
| return NULL; /* not used */ |
| } |
| |
| /* |
| * Parse the base backup options passed down by the parser |
| */ |
| static void |
| parse_basebackup_options(List *options, basebackup_options *opt) |
| { |
| ListCell *lopt; |
| bool o_label = false; |
| bool o_progress = false; |
| bool o_checkpoint = false; |
| bool o_nowait = false; |
| bool o_wal = false; |
| bool o_maxrate = false; |
| bool o_tablespace_map = false; |
| bool o_noverify_checksums = false; |
| bool o_manifest = false; |
| bool o_manifest_checksums = false; |
| bool o_target = false; |
| bool o_target_detail = false; |
| char *target_str = NULL; |
| char *target_detail_str = NULL; |
| bool o_compression = false; |
| bool o_compression_detail = false; |
| char *compression_detail_str = NULL; |
| |
| MemSet(opt, 0, sizeof(*opt)); |
| |
| /* |
| * The exclude hash table is only created if EXCLUDE options are specified. |
| * The matching function is optimized to run fast when the hash table is |
| * NULL. |
| */ |
| opt->exclude = NULL; |
| opt->manifest = MANIFEST_OPTION_NO; |
| opt->manifest_checksum_type = CHECKSUM_TYPE_CRC32C; |
| opt->compression = PG_COMPRESSION_NONE; |
| opt->compression_specification.algorithm = PG_COMPRESSION_NONE; |
| |
| foreach(lopt, options) |
| { |
| DefElem *defel = (DefElem *) lfirst(lopt); |
| |
| if (strcmp(defel->defname, "label") == 0) |
| { |
| if (o_label) |
| ereport(ERROR, |
| (errcode(ERRCODE_SYNTAX_ERROR), |
| errmsg("duplicate option \"%s\"", defel->defname))); |
| opt->label = defGetString(defel); |
| o_label = true; |
| } |
| else if (strcmp(defel->defname, "progress") == 0) |
| { |
| if (o_progress) |
| ereport(ERROR, |
| (errcode(ERRCODE_SYNTAX_ERROR), |
| errmsg("duplicate option \"%s\"", defel->defname))); |
| opt->progress = defGetBoolean(defel); |
| o_progress = true; |
| } |
| else if (strcmp(defel->defname, "checkpoint") == 0) |
| { |
| char *optval = defGetString(defel); |
| |
| if (o_checkpoint) |
| ereport(ERROR, |
| (errcode(ERRCODE_SYNTAX_ERROR), |
| errmsg("duplicate option \"%s\"", defel->defname))); |
| if (pg_strcasecmp(optval, "fast") == 0) |
| opt->fastcheckpoint = true; |
| else if (pg_strcasecmp(optval, "spread") == 0) |
| opt->fastcheckpoint = false; |
| else |
| ereport(ERROR, |
| (errcode(ERRCODE_SYNTAX_ERROR), |
| errmsg("unrecognized checkpoint type: \"%s\"", |
| optval))); |
| o_checkpoint = true; |
| } |
| else if (strcmp(defel->defname, "wait") == 0) |
| { |
| if (o_nowait) |
| ereport(ERROR, |
| (errcode(ERRCODE_SYNTAX_ERROR), |
| errmsg("duplicate option \"%s\"", defel->defname))); |
| opt->nowait = !defGetBoolean(defel); |
| o_nowait = true; |
| } |
| else if (strcmp(defel->defname, "wal") == 0) |
| { |
| if (o_wal) |
| ereport(ERROR, |
| (errcode(ERRCODE_SYNTAX_ERROR), |
| errmsg("duplicate option \"%s\"", defel->defname))); |
| opt->includewal = defGetBoolean(defel); |
| o_wal = true; |
| } |
| else if (strcmp(defel->defname, "max_rate") == 0) |
| { |
| int64 maxrate; |
| |
| if (o_maxrate) |
| ereport(ERROR, |
| (errcode(ERRCODE_SYNTAX_ERROR), |
| errmsg("duplicate option \"%s\"", defel->defname))); |
| |
| maxrate = defGetInt64(defel); |
| if (maxrate < MAX_RATE_LOWER || maxrate > MAX_RATE_UPPER) |
| ereport(ERROR, |
| (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), |
| errmsg("%d is outside the valid range for parameter \"%s\" (%d .. %d)", |
| (int) maxrate, "MAX_RATE", MAX_RATE_LOWER, MAX_RATE_UPPER))); |
| |
| opt->maxrate = (uint32) maxrate; |
| o_maxrate = true; |
| } |
| else if (strcmp(defel->defname, "exclude") == 0) |
| { |
| /* EXCLUDE option can be specified multiple times */ |
| bool found; |
| |
| if (unlikely(opt->exclude == NULL)) |
| { |
| HASHCTL hashctl; |
| |
| /* |
| * The hash table stores the string keys in-place if the |
| * `match` and `keycopy` functions are not explicitly |
| * specified. In our case MAXPGPATH bytes need to be reserved |
| * for each key, which is too wasteful. |
| * |
| * By specifying the `match` and `keycopy` functions we could |
| * allocate the strings separately and store only the string |
| * pointers in the hash table. |
| */ |
| hashctl.hash = key_string_hash; |
| hashctl.match = key_string_compare; |
| hashctl.keycopy = key_string_copy; |
| |
| /* The hash table is used as a set, only the keys are meaningful */ |
| hashctl.keysize = sizeof(char *); |
| hashctl.entrysize = hashctl.keysize; |
| |
| opt->exclude = hash_create("replication exclude", |
| 64 /* nelem */, |
| &hashctl, |
| HASH_ELEM | HASH_FUNCTION | |
| HASH_COMPARE | HASH_KEYCOPY); |
| } |
| |
| hash_search(opt->exclude, pstrdup(strVal(defel->arg)), |
| HASH_ENTER, &found); |
| } |
| else if (strcmp(defel->defname, "tablespace_map") == 0) |
| { |
| if (o_tablespace_map) |
| ereport(ERROR, |
| (errcode(ERRCODE_SYNTAX_ERROR), |
| errmsg("duplicate option \"%s\"", defel->defname))); |
| opt->sendtblspcmapfile = defGetBoolean(defel); |
| o_tablespace_map = true; |
| } |
| else if (strcmp(defel->defname, "verify_checksums") == 0) |
| { |
| if (o_noverify_checksums) |
| ereport(ERROR, |
| (errcode(ERRCODE_SYNTAX_ERROR), |
| errmsg("duplicate option \"%s\"", defel->defname))); |
| noverify_checksums = !defGetBoolean(defel); |
| o_noverify_checksums = true; |
| } |
| else if (strcmp(defel->defname, "manifest") == 0) |
| { |
| char *optval = defGetString(defel); |
| bool manifest_bool; |
| |
| if (o_manifest) |
| ereport(ERROR, |
| (errcode(ERRCODE_SYNTAX_ERROR), |
| errmsg("duplicate option \"%s\"", defel->defname))); |
| if (parse_bool(optval, &manifest_bool)) |
| { |
| if (manifest_bool) |
| opt->manifest = MANIFEST_OPTION_YES; |
| else |
| opt->manifest = MANIFEST_OPTION_NO; |
| } |
| else if (pg_strcasecmp(optval, "force-encode") == 0) |
| opt->manifest = MANIFEST_OPTION_FORCE_ENCODE; |
| else |
| ereport(ERROR, |
| (errcode(ERRCODE_SYNTAX_ERROR), |
| errmsg("unrecognized manifest option: \"%s\"", |
| optval))); |
| o_manifest = true; |
| } |
| else if (strcmp(defel->defname, "manifest_checksums") == 0) |
| { |
| char *optval = defGetString(defel); |
| |
| if (o_manifest_checksums) |
| ereport(ERROR, |
| (errcode(ERRCODE_SYNTAX_ERROR), |
| errmsg("duplicate option \"%s\"", defel->defname))); |
| if (!pg_checksum_parse_type(optval, |
| &opt->manifest_checksum_type)) |
| ereport(ERROR, |
| (errcode(ERRCODE_SYNTAX_ERROR), |
| errmsg("unrecognized checksum algorithm: \"%s\"", |
| optval))); |
| o_manifest_checksums = true; |
| } |
| else if (strcmp(defel->defname, "target") == 0) |
| { |
| if (o_target) |
| ereport(ERROR, |
| (errcode(ERRCODE_SYNTAX_ERROR), |
| errmsg("duplicate option \"%s\"", defel->defname))); |
| target_str = defGetString(defel); |
| o_target = true; |
| } |
| else if (strcmp(defel->defname, "target_detail") == 0) |
| { |
| char *optval = defGetString(defel); |
| |
| if (o_target_detail) |
| ereport(ERROR, |
| (errcode(ERRCODE_SYNTAX_ERROR), |
| errmsg("duplicate option \"%s\"", defel->defname))); |
| target_detail_str = optval; |
| o_target_detail = true; |
| } |
| else if (strcmp(defel->defname, "compression") == 0) |
| { |
| char *optval = defGetString(defel); |
| |
| if (o_compression) |
| ereport(ERROR, |
| (errcode(ERRCODE_SYNTAX_ERROR), |
| errmsg("duplicate option \"%s\"", defel->defname))); |
| if (!parse_compress_algorithm(optval, &opt->compression)) |
| ereport(ERROR, |
| (errcode(ERRCODE_SYNTAX_ERROR), |
| errmsg("unrecognized compression algorithm: \"%s\"", |
| optval))); |
| o_compression = true; |
| } |
| else if (strcmp(defel->defname, "compression_detail") == 0) |
| { |
| if (o_compression_detail) |
| ereport(ERROR, |
| (errcode(ERRCODE_SYNTAX_ERROR), |
| errmsg("duplicate option \"%s\"", defel->defname))); |
| compression_detail_str = defGetString(defel); |
| o_compression_detail = true; |
| } |
| else |
| ereport(ERROR, |
| (errcode(ERRCODE_SYNTAX_ERROR), |
| errmsg("unrecognized base backup option: \"%s\"", |
| defel->defname))); |
| } |
| |
| if (opt->label == NULL) |
| opt->label = "base backup"; |
| |
| if (opt->exclude) |
| hash_freeze(opt->exclude); |
| |
| elogif(debug_basebackup, LOG, |
| "basebackup options -- " |
| "label = %s, " |
| "progress = %s, " |
| "fastcheckpoint = %s, " |
| "nowait = %s, " |
| "wal = %s", |
| opt->label, |
| opt->progress ? "true" : "false", |
| opt->fastcheckpoint ? "true" : "false", |
| opt->nowait ? "true" : "false", |
| opt->includewal ? "true" : "false"); |
| if (opt->manifest == MANIFEST_OPTION_NO) |
| { |
| if (o_manifest_checksums) |
| ereport(ERROR, |
| (errcode(ERRCODE_SYNTAX_ERROR), |
| errmsg("manifest checksums require a backup manifest"))); |
| opt->manifest_checksum_type = CHECKSUM_TYPE_NONE; |
| } |
| |
| if (target_str == NULL) |
| { |
| if (target_detail_str != NULL) |
| ereport(ERROR, |
| (errcode(ERRCODE_SYNTAX_ERROR), |
| errmsg("target detail cannot be used without target"))); |
| opt->use_copytblspc = true; |
| opt->send_to_client = true; |
| } |
| else if (strcmp(target_str, "client") == 0) |
| { |
| if (target_detail_str != NULL) |
| ereport(ERROR, |
| (errcode(ERRCODE_SYNTAX_ERROR), |
| errmsg("target \"%s\" does not accept a target detail", |
| target_str))); |
| opt->send_to_client = true; |
| } |
| else |
| opt->target_handle = |
| BaseBackupGetTargetHandle(target_str, target_detail_str); |
| |
| if (o_compression_detail && !o_compression) |
| ereport(ERROR, |
| (errcode(ERRCODE_SYNTAX_ERROR), |
| errmsg("compression detail cannot be specified unless compression is enabled"))); |
| |
| if (o_compression) |
| { |
| char *error_detail; |
| |
| parse_compress_specification(opt->compression, compression_detail_str, |
| &opt->compression_specification); |
| error_detail = |
| validate_compress_specification(&opt->compression_specification); |
| if (error_detail != NULL) |
| ereport(ERROR, |
| errcode(ERRCODE_SYNTAX_ERROR), |
| errmsg("invalid compression specification: %s", |
| error_detail)); |
| } |
| } |
| |
| |
| /* |
| * SendBaseBackup() - send a complete base backup. |
| * |
| * The function will put the system into backup mode like pg_backup_start() |
| * does, so that the backup is consistent even though we read directly from |
| * the filesystem, bypassing the buffer cache. |
| */ |
| void |
| SendBaseBackup(BaseBackupCmd *cmd) |
| { |
| basebackup_options opt; |
| bbsink *sink; |
| SessionBackupState status = get_backup_status(); |
| |
| if (status == SESSION_BACKUP_RUNNING) |
| ereport(ERROR, |
| (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), |
| errmsg("a backup is already in progress in this session"))); |
| |
| parse_basebackup_options(cmd->options, &opt); |
| |
| WalSndSetState(WALSNDSTATE_BACKUP); |
| |
| if (update_process_title) |
| { |
| char activitymsg[50]; |
| |
| snprintf(activitymsg, sizeof(activitymsg), "sending backup \"%s\"", |
| opt.label); |
| set_ps_display(activitymsg); |
| } |
| |
| /* |
| * If the target is specifically 'client' then set up to stream the backup |
| * to the client; otherwise, it's being sent someplace else and should not |
| * be sent to the client. BaseBackupGetSink has the job of setting up a |
| * sink to send the backup data wherever it needs to go. |
| */ |
| sink = bbsink_copystream_new(opt.send_to_client); |
| if (opt.target_handle != NULL) |
| sink = BaseBackupGetSink(opt.target_handle, sink); |
| |
| /* Set up network throttling, if client requested it */ |
| if (opt.maxrate > 0) |
| sink = bbsink_throttle_new(sink, opt.maxrate); |
| |
| /* Set up server-side compression, if client requested it */ |
| if (opt.compression == PG_COMPRESSION_GZIP) |
| sink = bbsink_gzip_new(sink, &opt.compression_specification); |
| else if (opt.compression == PG_COMPRESSION_LZ4) |
| sink = bbsink_lz4_new(sink, &opt.compression_specification); |
| else if (opt.compression == PG_COMPRESSION_ZSTD) |
| sink = bbsink_zstd_new(sink, &opt.compression_specification); |
| |
| /* Set up progress reporting. */ |
| sink = bbsink_progress_new(sink, opt.progress); |
| |
| /* |
| * Perform the base backup, but make sure we clean up the bbsink even if |
| * an error occurs. |
| */ |
| PG_TRY(); |
| { |
| perform_base_backup(&opt, sink); |
| } |
| PG_FINALLY(); |
| { |
| bbsink_cleanup(sink); |
| } |
| PG_END_TRY(); |
| } |
| |
| /* |
| * Inject a file with given name and content in the output tar stream. |
| */ |
| static void |
| sendFileWithContent(bbsink *sink, const char *filename, const char *content, |
| backup_manifest_info *manifest) |
| { |
| struct stat statbuf; |
| int bytes_done = 0, |
| len; |
| pg_checksum_context checksum_ctx; |
| |
| if (pg_checksum_init(&checksum_ctx, manifest->checksum_type) < 0) |
| elog(ERROR, "could not initialize checksum of file \"%s\"", |
| filename); |
| |
| len = strlen(content); |
| |
| /* |
| * Construct a stat struct for the backup_label file we're injecting in |
| * the tar. |
| */ |
| /* Windows doesn't have the concept of uid and gid */ |
| #ifdef WIN32 |
| statbuf.st_uid = 0; |
| statbuf.st_gid = 0; |
| #else |
| statbuf.st_uid = geteuid(); |
| statbuf.st_gid = getegid(); |
| #endif |
| statbuf.st_mtime = time(NULL); |
| statbuf.st_mode = pg_file_create_mode; |
| statbuf.st_size = len; |
| |
| _tarWriteHeader(sink, filename, NULL, &statbuf, false); |
| |
| elogif(debug_basebackup, LOG, |
| "basebackup send file -- Sent file '%s' with content \n%s.", |
| filename, content); |
| |
| if (pg_checksum_update(&checksum_ctx, (uint8 *) content, len) < 0) |
| elog(ERROR, "could not update checksum of file \"%s\"", |
| filename); |
| |
| while (bytes_done < len) |
| { |
| size_t remaining = len - bytes_done; |
| size_t nbytes = Min(sink->bbs_buffer_length, remaining); |
| |
| memcpy(sink->bbs_buffer, content, nbytes); |
| bbsink_archive_contents(sink, nbytes); |
| bytes_done += nbytes; |
| content += nbytes; |
| } |
| |
| _tarWritePadding(sink, len); |
| |
| AddFileToBackupManifest(manifest, NULL, filename, len, |
| (pg_time_t) statbuf.st_mtime, &checksum_ctx); |
| } |
| |
| /* |
| * Include the tablespace directory pointed to by 'path' in the output tar |
| * stream. If 'sizeonly' is true, we just calculate a total length and return |
| * it, without actually sending anything. |
| * |
| * Only used to send auxiliary tablespaces, not PGDATA. |
| */ |
| static int64 |
| sendTablespace(bbsink *sink, char *path, char *spcoid, bool sizeonly, |
| backup_manifest_info *manifest) |
| { |
| int64 size; |
| char pathbuf[MAXPGPATH]; |
| struct stat statbuf; |
| |
| /* |
| * 'path' points to the tablespace location, but we only want to include |
| * the version directory in it that belongs to us. |
| */ |
| snprintf(pathbuf, sizeof(pathbuf), "%s/%s", path, |
| GP_TABLESPACE_VERSION_DIRECTORY); |
| |
| elogif(debug_basebackup, LOG, |
| "sendTablespace -- Sending tablespace version directory = %s", pathbuf); |
| /* |
| * Store a directory entry in the tar file so we get the permissions |
| * right. |
| */ |
| if (lstat(pathbuf, &statbuf) != 0) |
| { |
| if (errno != ENOENT) |
| ereport(ERROR, |
| (errcode_for_file_access(), |
| errmsg("could not stat file or directory \"%s\": %m", |
| pathbuf))); |
| |
| /* If the tablespace went away while scanning, it's no error. */ |
| return 0; |
| } |
| |
| size = _tarWriteHeader(sink, GP_TABLESPACE_VERSION_DIRECTORY, NULL, &statbuf, |
| sizeonly); |
| |
| /* Send all the files in the tablespace version directory */ |
| size += sendDir(sink, pathbuf, strlen(path), sizeonly, NIL, true, manifest, |
| spcoid, NULL); |
| |
| return size; |
| } |
| |
| /* |
| * Check if client EXCLUDE option matches this path. Current implementation |
| * is only the exact match for the relative path from the datadir root (e.g. |
| * "./log" etc). |
| */ |
| static bool |
| match_exclude_list(char *path, HTAB *exclude) |
| { |
| bool found = false; |
| |
| if (unlikely(exclude)) |
| hash_search(exclude, path, HASH_FIND, &found); |
| |
| return found; |
| } |
| |
| /* |
| * Include all files from the given directory in the output tar stream. If |
| * 'sizeonly' is true, we just calculate a total length and return it, without |
| * actually sending anything. |
| * |
| * Omit any directory in the tablespaces list, to avoid backing up |
| * tablespaces twice when they were created inside PGDATA. |
| * |
| * If sendtblspclinks is true, we need to include symlink |
| * information in the tar file. If not, we can skip that |
| * as it will be sent separately in the tablespace_map file. |
| * |
| * GPDB: Also omit any files in the 'exclude' list. |
| */ |
| static int64 |
| sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeonly, |
| List *tablespaces, bool sendtblspclinks, backup_manifest_info *manifest, |
| const char *spcoid, HTAB *exclude) |
| { |
| DIR *dir; |
| struct dirent *de; |
| char pathbuf[MAXPGPATH * 2]; |
| struct stat statbuf; |
| int64 size = 0; |
| const char *lastDir; /* Split last dir from parent path. */ |
| bool isDbDir = false; /* Does this directory contain relations? */ |
| |
| /* |
| * Determine if the current path is a database directory that can contain |
| * relations. |
| * |
| * Start by finding the location of the delimiter between the parent path |
| * and the current path. |
| */ |
| lastDir = last_dir_separator(path); |
| |
| /* Does this path look like a database path (i.e. all digits)? */ |
| if (lastDir != NULL && |
| strspn(lastDir + 1, "0123456789") == strlen(lastDir + 1)) |
| { |
| /* Part of path that contains the parent directory. */ |
| int parentPathLen = lastDir - path; |
| |
| /* |
| * Mark path as a database directory if the parent path is either |
| * $PGDATA/base or a tablespace version path. |
| */ |
| if (strncmp(path, "./base", parentPathLen) == 0 || |
| (parentPathLen >= (sizeof(GP_TABLESPACE_VERSION_DIRECTORY) - 1) && |
| strncmp(lastDir - (sizeof(GP_TABLESPACE_VERSION_DIRECTORY) - 1), |
| GP_TABLESPACE_VERSION_DIRECTORY, |
| sizeof(GP_TABLESPACE_VERSION_DIRECTORY) - 1) == 0)) |
| isDbDir = true; |
| } |
| |
| dir = AllocateDir(path); |
| while ((de = ReadDir(dir, path)) != NULL) |
| { |
| int excludeIdx; |
| bool excludeFound; |
| ForkNumber relForkNum; /* Type of fork if file is a relation */ |
| int relnumchars; /* Chars in filename that are the |
| * relnumber */ |
| |
| /* Skip special stuff */ |
| if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) |
| continue; |
| |
| /* Skip temporary files */ |
| if (strncmp(de->d_name, |
| PG_TEMP_FILE_PREFIX, |
| strlen(PG_TEMP_FILE_PREFIX)) == 0) |
| continue; |
| |
| /* Skip macOS system files */ |
| if (strcmp(de->d_name, ".DS_Store") == 0) |
| continue; |
| |
| /* |
| * Check if the postmaster has signaled us to exit, and abort with an |
| * error in that case. The error handler further up will call |
| * do_pg_abort_backup() for us. Also check that if the backup was |
| * started while still in recovery, the server wasn't promoted. |
| * do_pg_backup_stop() will check that too, but it's better to stop |
| * the backup early than continue to the end and fail there. |
| */ |
| CHECK_FOR_INTERRUPTS(); |
| if (RecoveryInProgress() != backup_started_in_recovery) |
| ereport(ERROR, |
| (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), |
| errmsg("the standby was promoted during online backup"), |
| errhint("This means that the backup being taken is corrupt " |
| "and should not be used. " |
| "Try taking another online backup."))); |
| |
| /* Scan for files that should be excluded */ |
| excludeFound = false; |
| for (excludeIdx = 0; excludeFiles[excludeIdx].name != NULL; excludeIdx++) |
| { |
| int cmplen = strlen(excludeFiles[excludeIdx].name); |
| |
| if (!excludeFiles[excludeIdx].match_prefix) |
| cmplen++; |
| if (strncmp(de->d_name, excludeFiles[excludeIdx].name, cmplen) == 0) |
| { |
| elog(DEBUG1, "file \"%s\" excluded from backup", de->d_name); |
| excludeFound = true; |
| break; |
| } |
| } |
| |
| if (excludeFound) |
| continue; |
| |
| /* Exclude all forks for unlogged tables except the init fork */ |
| if (isDbDir && |
| parse_filename_for_nontemp_relation(de->d_name, &relnumchars, |
| &relForkNum)) |
| { |
| /* Never exclude init forks */ |
| if (relForkNum != INIT_FORKNUM) |
| { |
| char initForkFile[MAXPGPATH]; |
| char relNumber[OIDCHARS + 1]; |
| |
| /* |
| * If any other type of fork, check if there is an init fork |
| * with the same RelFileNumber. If so, the file can be |
| * excluded. |
| */ |
| memcpy(relNumber, de->d_name, relnumchars); |
| relNumber[relnumchars] = '\0'; |
| snprintf(initForkFile, sizeof(initForkFile), "%s/%s_init", |
| path, relNumber); |
| |
| if (lstat(initForkFile, &statbuf) == 0) |
| { |
| elog(DEBUG2, |
| "unlogged relation file \"%s\" excluded from backup", |
| de->d_name); |
| |
| continue; |
| } |
| } |
| } |
| |
| /* Exclude temporary relations */ |
| if (isDbDir && looks_like_temp_rel_name(de->d_name)) |
| { |
| elog(DEBUG2, |
| "temporary relation file \"%s\" excluded from backup", |
| de->d_name); |
| |
| continue; |
| } |
| |
| snprintf(pathbuf, sizeof(pathbuf), "%s/%s", path, de->d_name); |
| |
| /* Skip pg_control here to back up it last */ |
| if (strcmp(pathbuf, "./global/pg_control") == 0) |
| continue; |
| |
| if (lstat(pathbuf, &statbuf) != 0) |
| { |
| if (errno != ENOENT) |
| ereport(ERROR, |
| (errcode_for_file_access(), |
| errmsg("could not stat file or directory \"%s\": %m", |
| pathbuf))); |
| |
| /* If the file went away while scanning, it's not an error. */ |
| continue; |
| } |
| |
| /* Scan for directories whose contents should be excluded */ |
| excludeFound = false; |
| for (excludeIdx = 0; excludeDirContents[excludeIdx] != NULL; excludeIdx++) |
| { |
| if (strcmp(de->d_name, excludeDirContents[excludeIdx]) == 0) |
| { |
| elog(DEBUG1, "contents of directory \"%s\" excluded from backup", de->d_name); |
| convert_link_to_directory(pathbuf, &statbuf); |
| size += _tarWriteHeader(sink, pathbuf + basepathlen + 1, NULL, |
| &statbuf, sizeonly); |
| excludeFound = true; |
| break; |
| } |
| } |
| |
| if (excludeFound) |
| continue; |
| |
| /* |
| * We can skip pg_wal, the WAL segments need to be fetched from the |
| * WAL archive anyway. But include it as an empty directory anyway, so |
| * we get permissions right. |
| */ |
| if (strcmp(pathbuf, "./pg_wal") == 0) |
| { |
| /* If pg_wal is a symlink, write it as a directory anyway */ |
| convert_link_to_directory(pathbuf, &statbuf); |
| size += _tarWriteHeader(sink, pathbuf + basepathlen + 1, NULL, |
| &statbuf, sizeonly); |
| |
| /* |
| * Also send archive_status directory (by hackishly reusing |
| * statbuf from above ...). |
| */ |
| size += _tarWriteHeader(sink, "./pg_wal/archive_status", NULL, |
| &statbuf, sizeonly); |
| |
| continue; /* don't recurse into pg_wal */ |
| } |
| |
| /* Skip if client does not want */ |
| if (match_exclude_list(pathbuf, exclude)) |
| continue; |
| |
| /* Allow symbolic links in pg_tblspc only */ |
| if (strcmp(path, "./pg_tblspc") == 0 && S_ISLNK(statbuf.st_mode)) |
| { |
| char linkpath[MAXPGPATH]; |
| int rllen; |
| |
| rllen = readlink(pathbuf, linkpath, sizeof(linkpath)); |
| if (rllen < 0) |
| ereport(ERROR, |
| (errcode_for_file_access(), |
| errmsg("could not read symbolic link \"%s\": %m", |
| pathbuf))); |
| if (rllen >= MAX_TARABLE_SYMLINK_PATH_LENGTH) |
| ereport(ERROR, |
| (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), |
| errmsg("symbolic link \"%s\" target is too long and will not be added to the backup", |
| pathbuf), |
| errdetail("The symbolic link with target \"%s\" is too long. Symlink targets with length greater than %d characters would be truncated.", pathbuf, MAX_TARABLE_SYMLINK_PATH_LENGTH))); |
| linkpath[rllen] = '\0'; |
| |
| /* Lop off the dbid before sending the link target. */ |
| char *file_sep_before_dbid_in_link_path = strrchr(linkpath, '/'); |
| *file_sep_before_dbid_in_link_path = '\0'; |
| |
| size += _tarWriteHeader(sink, pathbuf + basepathlen + 1, linkpath, |
| &statbuf, sizeonly); |
| } |
| else if (S_ISDIR(statbuf.st_mode)) |
| { |
| bool skip_this_dir = false; |
| ListCell *lc; |
| |
| /* |
| * Store a directory entry in the tar file so we can get the |
| * permissions right. |
| */ |
| size += _tarWriteHeader(sink, pathbuf + basepathlen + 1, NULL, &statbuf, |
| sizeonly); |
| |
| /* |
| * Call ourselves recursively for a directory, unless it happens |
| * to be a separate tablespace located within PGDATA. |
| */ |
| foreach(lc, tablespaces) |
| { |
| tablespaceinfo *ti = (tablespaceinfo *) lfirst(lc); |
| |
| /* |
| * ti->rpath is the tablespace relative path within PGDATA, or |
| * NULL if the tablespace has been properly located somewhere |
| * else. |
| * |
| * Skip past the leading "./" in pathbuf when comparing. |
| */ |
| if (ti->rpath && strcmp(ti->rpath, pathbuf + 2) == 0) |
| { |
| skip_this_dir = true; |
| break; |
| } |
| } |
| |
| /* |
| * skip sending directories inside pg_tblspc, if not required. |
| */ |
| if (strcmp(pathbuf, "./pg_tblspc") == 0 && !sendtblspclinks) |
| skip_this_dir = true; |
| |
| if (!skip_this_dir) |
| size += sendDir(sink, pathbuf, basepathlen, sizeonly, tablespaces, |
| sendtblspclinks, manifest, spcoid, exclude); |
| } |
| else if (S_ISREG(statbuf.st_mode)) |
| { |
| bool sent = false; |
| |
| if (!sizeonly) |
| sent = sendFile(sink, pathbuf, pathbuf + basepathlen + 1, &statbuf, |
| true, isDbDir ? atooid(lastDir + 1) : InvalidOid, |
| manifest, spcoid); |
| |
| if (sent || sizeonly) |
| { |
| /* Add size. */ |
| size += statbuf.st_size; |
| |
| /* Pad to a multiple of the tar block size. */ |
| size += tarPaddingBytesRequired(statbuf.st_size); |
| |
| /* Size of the header for the file. */ |
| size += TAR_BLOCK_SIZE; |
| } |
| } |
| else |
| ereport(WARNING, |
| (errmsg("skipping special file \"%s\"", pathbuf))); |
| } |
| FreeDir(dir); |
| |
| elogif(debug_basebackup && !sizeonly, LOG, |
| "baseabckup send dir -- Sent directory %s", path); |
| |
| return size; |
| } |
| |
| /* |
| * Check if a file should have its checksum validated. |
| * We validate checksums on files in regular tablespaces |
| * (including global and default) only, and in those there |
| * are some files that are explicitly excluded. |
| */ |
| static bool |
| is_checksummed_file(const char *fullpath, const char *filename) |
| { |
| /* Check that the file is in a tablespace */ |
| if (strncmp(fullpath, "./global/", 9) == 0 || |
| strncmp(fullpath, "./base/", 7) == 0 || |
| strncmp(fullpath, "/", 1) == 0) |
| { |
| int excludeIdx; |
| |
| /* Compare file against noChecksumFiles skip list */ |
| for (excludeIdx = 0; noChecksumFiles[excludeIdx].name != NULL; excludeIdx++) |
| { |
| int cmplen = strlen(noChecksumFiles[excludeIdx].name); |
| |
| if (!noChecksumFiles[excludeIdx].match_prefix) |
| cmplen++; |
| if (strncmp(filename, noChecksumFiles[excludeIdx].name, |
| cmplen) == 0) |
| return false; |
| } |
| |
| return true; |
| } |
| else |
| return false; |
| } |
| |
| /* |
| * Given the member, write the TAR header & send the file. |
| * |
| * If 'missing_ok' is true, will not throw an error if the file is not found. |
| * |
| * If dboid is anything other than InvalidOid then any checksum failures |
| * detected will get reported to the cumulative stats system. |
| * |
| * Returns true if the file was successfully sent, false if 'missing_ok', |
| * and the file did not exist. |
| */ |
| static bool |
| sendFile(bbsink *sink, const char *readfilename, const char *tarfilename, |
| struct stat *statbuf, bool missing_ok, Oid dboid, |
| backup_manifest_info *manifest, const char *spcoid) |
| { |
| int fd; |
| BlockNumber blkno = 0; |
| bool block_retry = false; |
| uint16 checksum; |
| int checksum_failures = 0; |
| off_t cnt; |
| int i; |
| pgoff_t len = 0; |
| char *page; |
| PageHeader phdr; |
| int segmentno = 0; |
| char *segmentpath; |
| bool verify_checksum = false; |
| pg_checksum_context checksum_ctx; |
| |
| if (pg_checksum_init(&checksum_ctx, manifest->checksum_type) < 0) |
| elog(ERROR, "could not initialize checksum of file \"%s\"", |
| readfilename); |
| |
| fd = OpenTransientFile(readfilename, O_RDONLY | PG_BINARY); |
| if (fd < 0) |
| { |
| if (errno == ENOENT && missing_ok) |
| return false; |
| ereport(ERROR, |
| (errcode_for_file_access(), |
| errmsg("could not open file \"%s\": %m", readfilename))); |
| } |
| |
| _tarWriteHeader(sink, tarfilename, NULL, statbuf, false); |
| |
| if (!noverify_checksums && DataChecksumsEnabled()) |
| { |
| char *filename; |
| |
| /* |
| * Get the filename (excluding path). As last_dir_separator() |
| * includes the last directory separator, we chop that off by |
| * incrementing the pointer. |
| */ |
| filename = last_dir_separator(readfilename) + 1; |
| |
| if (is_checksummed_file(readfilename, filename)) |
| { |
| verify_checksum = true; |
| |
| /* |
| * Cut off at the segment boundary (".") to get the segment number |
| * in order to mix it into the checksum. |
| */ |
| segmentpath = strstr(filename, "."); |
| if (segmentpath != NULL) |
| { |
| segmentno = atoi(segmentpath + 1); |
| if (segmentno == 0) |
| ereport(ERROR, |
| (errmsg("invalid segment number %d in file \"%s\"", |
| segmentno, filename))); |
| } |
| } |
| } |
| |
| /* |
| * Loop until we read the amount of data the caller told us to expect. The |
| * file could be longer, if it was extended while we were sending it, but |
| * for a base backup we can ignore such extended data. It will be restored |
| * from WAL. |
| */ |
| while (len < statbuf->st_size) |
| { |
| size_t remaining = statbuf->st_size - len; |
| |
| /* Try to read some more data. */ |
| cnt = basebackup_read_file(fd, sink->bbs_buffer, |
| Min(sink->bbs_buffer_length, remaining), |
| len, readfilename, true); |
| |
| /* |
| * The checksums are verified at block level, so we iterate over the |
| * buffer in chunks of BLCKSZ, after making sure that |
| * TAR_SEND_SIZE/buf is divisible by BLCKSZ and we read a multiple of |
| * BLCKSZ bytes. |
| */ |
| Assert((sink->bbs_buffer_length % BLCKSZ) == 0); |
| |
| if (verify_checksum && (cnt % BLCKSZ != 0)) |
| { |
| ereport(WARNING, |
| (errmsg("could not verify checksum in file \"%s\", block " |
| "%u: read buffer size %d and page size %d " |
| "differ", |
| readfilename, blkno, (int) cnt, BLCKSZ))); |
| verify_checksum = false; |
| } |
| |
| if (verify_checksum) |
| { |
| for (i = 0; i < cnt / BLCKSZ; i++) |
| { |
| page = sink->bbs_buffer + BLCKSZ * i; |
| |
| /* |
| * Only check pages which have not been modified since the |
| * start of the base backup. Otherwise, they might have been |
| * written only halfway and the checksum would not be valid. |
| * However, replaying WAL would reinstate the correct page in |
| * this case. We also skip completely new pages, since they |
| * don't have a checksum yet. |
| */ |
| if (!PageIsNew(page) && PageGetLSN(page) < sink->bbs_state->startptr) |
| { |
| checksum = pg_checksum_page((char *) page, blkno + segmentno * RELSEG_SIZE); |
| phdr = (PageHeader) page; |
| if (phdr->pd_checksum != checksum) |
| { |
| /* |
| * Retry the block on the first failure. It's |
| * possible that we read the first 4K page of the |
| * block just before postgres updated the entire block |
| * so it ends up looking torn to us. If, before we |
| * retry the read, the concurrent write of the block |
| * finishes, the page LSN will be updated and we'll |
| * realize that we should ignore this block. |
| * |
| * There's no guarantee that this will actually |
| * happen, though: the torn write could take an |
| * arbitrarily long time to complete. Retrying |
| * multiple times wouldn't fix this problem, either, |
| * though it would reduce the chances of it happening |
| * in practice. The only real fix here seems to be to |
| * have some kind of interlock that allows us to wait |
| * until we can be certain that no write to the block |
| * is in progress. Since we don't have any such thing |
| * right now, we just do this and hope for the best. |
| */ |
| if (block_retry == false) |
| { |
| int reread_cnt; |
| |
| /* Reread the failed block */ |
| reread_cnt = |
| basebackup_read_file(fd, |
| sink->bbs_buffer + BLCKSZ * i, |
| BLCKSZ, len + BLCKSZ * i, |
| readfilename, |
| false); |
| if (reread_cnt == 0) |
| { |
| /* |
| * If we hit end-of-file, a concurrent |
| * truncation must have occurred, so break out |
| * of this loop just as if the initial fread() |
| * returned 0. We'll drop through to the same |
| * code that handles that case. (We must fix |
| * up cnt first, though.) |
| */ |
| cnt = BLCKSZ * i; |
| break; |
| } |
| |
| /* Set flag so we know a retry was attempted */ |
| block_retry = true; |
| |
| /* Reset loop to validate the block again */ |
| i--; |
| continue; |
| } |
| |
| checksum_failures++; |
| |
| if (checksum_failures <= 5) |
| ereport(WARNING, |
| (errmsg("checksum verification failed in " |
| "file \"%s\", block %u: calculated " |
| "%X but expected %X", |
| readfilename, blkno, checksum, |
| phdr->pd_checksum))); |
| if (checksum_failures == 5) |
| ereport(WARNING, |
| (errmsg("further checksum verification " |
| "failures in file \"%s\" will not " |
| "be reported", readfilename))); |
| } |
| } |
| block_retry = false; |
| blkno++; |
| } |
| } |
| |
| /* |
| * If we hit end-of-file, a concurrent truncation must have occurred. |
| * That's not an error condition, because WAL replay will fix things |
| * up. |
| */ |
| if (cnt == 0) |
| break; |
| |
| /* Archive the data we just read. */ |
| bbsink_archive_contents(sink, cnt); |
| |
| /* Also feed it to the checksum machinery. */ |
| if (pg_checksum_update(&checksum_ctx, |
| (uint8 *) sink->bbs_buffer, cnt) < 0) |
| elog(ERROR, "could not update checksum of base backup"); |
| |
| len += cnt; |
| } |
| |
| /* If the file was truncated while we were sending it, pad it with zeros */ |
| while (len < statbuf->st_size) |
| { |
| size_t remaining = statbuf->st_size - len; |
| size_t nbytes = Min(sink->bbs_buffer_length, remaining); |
| |
| MemSet(sink->bbs_buffer, 0, nbytes); |
| if (pg_checksum_update(&checksum_ctx, |
| (uint8 *) sink->bbs_buffer, |
| nbytes) < 0) |
| elog(ERROR, "could not update checksum of base backup"); |
| bbsink_archive_contents(sink, nbytes); |
| len += nbytes; |
| } |
| |
| /* |
| * Pad to a block boundary, per tar format requirements. (This small piece |
| * of data is probably not worth throttling, and is not checksummed |
| * because it's not actually part of the file.) |
| */ |
| _tarWritePadding(sink, len); |
| |
| CloseTransientFile(fd); |
| |
| if (checksum_failures > 1) |
| { |
| ereport(WARNING, |
| (errmsg_plural("file \"%s\" has a total of %d checksum verification failure", |
| "file \"%s\" has a total of %d checksum verification failures", |
| checksum_failures, |
| readfilename, checksum_failures))); |
| |
| pgstat_report_checksum_failures_in_db(dboid, checksum_failures); |
| } |
| |
| total_checksum_failures += checksum_failures; |
| |
| AddFileToBackupManifest(manifest, spcoid, tarfilename, statbuf->st_size, |
| (pg_time_t) statbuf->st_mtime, &checksum_ctx); |
| |
| return true; |
| } |
| |
| static int64 |
| _tarWriteHeader(bbsink *sink, const char *filename, const char *linktarget, |
| struct stat *statbuf, bool sizeonly) |
| { |
| enum tarError rc; |
| |
| if (!sizeonly) |
| { |
| /* |
| * As of this writing, the smallest supported block size is 1kB, which |
| * is twice TAR_BLOCK_SIZE. Since the buffer size is required to be a |
| * multiple of BLCKSZ, it should be safe to assume that the buffer is |
| * large enough to fit an entire tar block. We double-check by means |
| * of these assertions. |
| */ |
| StaticAssertDecl(TAR_BLOCK_SIZE <= BLCKSZ, |
| "BLCKSZ too small for tar block"); |
| Assert(sink->bbs_buffer_length >= TAR_BLOCK_SIZE); |
| |
| rc = tarCreateHeader(sink->bbs_buffer, filename, linktarget, |
| statbuf->st_size, statbuf->st_mode, |
| statbuf->st_uid, statbuf->st_gid, |
| statbuf->st_mtime); |
| |
| switch (rc) |
| { |
| case TAR_OK: |
| break; |
| case TAR_NAME_TOO_LONG: |
| ereport(ERROR, |
| (errmsg("file name too long for tar format: \"%s\"", |
| filename))); |
| break; |
| case TAR_SYMLINK_TOO_LONG: |
| ereport(ERROR, |
| (errmsg("symbolic link target too long for tar format: " |
| "file name \"%s\", target \"%s\"", |
| filename, linktarget))); |
| break; |
| default: |
| elog(ERROR, "unrecognized tar error: %d", rc); |
| } |
| |
| bbsink_archive_contents(sink, TAR_BLOCK_SIZE); |
| } |
| |
| return TAR_BLOCK_SIZE; |
| } |
| |
| /* |
| * Pad with zero bytes out to a multiple of TAR_BLOCK_SIZE. |
| */ |
| static void |
| _tarWritePadding(bbsink *sink, int len) |
| { |
| int pad = tarPaddingBytesRequired(len); |
| |
| /* |
| * As in _tarWriteHeader, it should be safe to assume that the buffer is |
| * large enough that we don't need to do this in multiple chunks. |
| */ |
| Assert(sink->bbs_buffer_length >= TAR_BLOCK_SIZE); |
| Assert(pad <= TAR_BLOCK_SIZE); |
| |
| if (pad > 0) |
| { |
| MemSet(sink->bbs_buffer, 0, pad); |
| bbsink_archive_contents(sink, pad); |
| } |
| } |
| |
| /* |
| * If the entry in statbuf is a link, then adjust statbuf to make it look like a |
| * directory, so that it will be written that way. |
| */ |
| static void |
| convert_link_to_directory(const char *pathbuf, struct stat *statbuf) |
| { |
| /* If symlink, write it as a directory anyway */ |
| if (S_ISLNK(statbuf->st_mode)) |
| statbuf->st_mode = S_IFDIR | pg_dir_create_mode; |
| } |
| |
| /* |
| * Read some data from a file, setting a wait event and reporting any error |
| * encountered. |
| * |
| * If partial_read_ok is false, also report an error if the number of bytes |
| * read is not equal to the number of bytes requested. |
| * |
| * Returns the number of bytes read. |
| */ |
| static int |
| basebackup_read_file(int fd, char *buf, size_t nbytes, off_t offset, |
| const char *filename, bool partial_read_ok) |
| { |
| int rc; |
| |
| pgstat_report_wait_start(WAIT_EVENT_BASEBACKUP_READ); |
| rc = pg_pread(fd, buf, nbytes, offset); |
| pgstat_report_wait_end(); |
| |
| if (rc < 0) |
| ereport(ERROR, |
| (errcode_for_file_access(), |
| errmsg("could not read file \"%s\": %m", filename))); |
| if (!partial_read_ok && rc > 0 && rc != nbytes) |
| ereport(ERROR, |
| (errcode_for_file_access(), |
| errmsg("could not read file \"%s\": read %d of %zu", |
| filename, rc, nbytes))); |
| |
| return rc; |
| } |