| /*------------------------------------------------------------------------- |
| * |
| * pg_rewind.c |
| * Synchronizes a PostgreSQL data directory to a new timeline |
| * |
| * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group |
| * |
| *------------------------------------------------------------------------- |
| */ |
| #include "postgres_fe.h" |
| #include <sys/stat.h> |
| #include <fcntl.h> |
| #include <time.h> |
| #include <unistd.h> |
| |
| #include "access/timeline.h" |
| #include "access/xlog_internal.h" |
| #include "catalog/catversion.h" |
| #include "catalog/pg_control.h" |
| #include "common/controldata_utils.h" |
| #include "common/file_perm.h" |
| #include "common/restricted_token.h" |
| #include "common/string.h" |
| #include "fe_utils/recovery_gen.h" |
| #include "file_ops.h" |
| #include "filemap.h" |
| #include "getopt_long.h" |
| #include "pg_rewind.h" |
| #include "rewind_source.h" |
| #include "storage/bufpage.h" |
| |
/* Forward declarations of the file-local routines used by main(). */
static void usage(const char *progname);

static void perform_rewind(filemap_t *filemap, rewind_source *source,
						   XLogRecPtr chkptrec,
						   TimeLineID chkpttli,
						   XLogRecPtr chkptredo);

static void createBackupLabel(XLogRecPtr startpoint, TimeLineID starttli,
							  XLogRecPtr checkpointloc);

static void digestControlFile(ControlFileData *ControlFile,
							  const char *content, size_t size);
static void getRestoreCommand(const char *argv0);
static void sanityChecks(void);
static void findCommonAncestorTimeline(XLogRecPtr *recptr, int *tliIndex);
static void ensureCleanShutdown(const char *argv0);
static int32 get_target_dbid(const char *argv0);
static void disconnect_atexit(void);

/*
 * Parsed copies of global/pg_control.  ControlFile_target and
 * ControlFile_source are filled in by main() before any checks run;
 * ControlFile_source_after is re-read from the source by perform_rewind()
 * once all file data has been fetched.
 */
static ControlFileData ControlFile_target;
static ControlFileData ControlFile_source;
static ControlFileData ControlFile_source_after;

/* Result of get_target_dbid(), assigned in main() */
int32 dbid_target;
/* Program name derived from argv[0] via get_progname() */
const char *progname;
/* WAL segment size; set by digestControlFile() from the control file */
int			WalSegSz;

/* Configuration options */
char	   *datadir_target = NULL;	/* -D / --target-pgdata */
char	   *datadir_source = NULL;	/* --source-pgdata */
char	   *connstr_source = NULL;	/* --source-server */
/* NOTE(review): presumably filled in by getRestoreCommand() -- confirm */
char	   *restore_command = NULL;

static bool debug = false;		/* --debug */
bool		showprogress = false;	/* -P / --progress */
bool		dry_run = false;	/* -n / --dry-run */
bool		do_sync = true;		/* cleared by -N / --no-sync */
bool		restore_wal = false;	/* -c / --restore-target-wal */

/* Target history, filled in by findCommonAncestorTimeline() */
TimeLineHistoryEntry *targetHistory;
int			targetNentries;

/* Progress counters read by progress_report() */
uint64		fetch_size;
uint64		fetch_done;

/* libpq connection to the source server; only set with --source-server */
static PGconn *conn;
/* Abstraction over the source, either libpq-backed or a local directory */
static rewind_source *source;
| |
| static void |
| usage(const char *progname) |
| { |
| printf(_("%s resynchronizes a PostgreSQL cluster with another copy of the cluster.\n\n"), progname); |
| printf(_("Usage:\n %s [OPTION]...\n\n"), progname); |
| printf(_("Options:\n")); |
| printf(_(" -c, --restore-target-wal use restore_command in target configuration to\n" |
| " retrieve WAL files from archives\n")); |
| printf(_(" -D, --target-pgdata=DIRECTORY existing data directory to modify\n")); |
| printf(_(" --source-pgdata=DIRECTORY source data directory to synchronize with\n")); |
| printf(_(" --source-server=CONNSTR source server to synchronize with\n")); |
| printf(_(" -S, --slot=SLOTNAME replication slot to use\n")); |
| printf(_(" -n, --dry-run stop before modifying anything\n")); |
| printf(_(" -N, --no-sync do not wait for changes to be written\n" |
| " safely to disk\n")); |
| printf(_(" -P, --progress write progress messages\n")); |
| printf(_(" -R, --write-recovery-conf write configuration for replication\n" |
| " (requires --source-server)\n")); |
| printf(_(" --debug write a lot of debug messages\n")); |
| printf(_(" --no-ensure-shutdown do not automatically fix unclean shutdown\n")); |
| printf(_(" -V, --version output version information, then exit\n")); |
| printf(_(" -?, --help show this help, then exit\n")); |
| printf(_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT); |
| printf(_("%s home page: <%s>\n"), PACKAGE_NAME, PACKAGE_URL); |
| } |
| |
| |
| int |
| main(int argc, char **argv) |
| { |
| static struct option long_options[] = { |
| {"help", no_argument, NULL, '?'}, |
| {"target-pgdata", required_argument, NULL, 'D'}, |
| {"write-recovery-conf", no_argument, NULL, 'R'}, |
| {"slot", required_argument, NULL, 'S'}, |
| {"source-pgdata", required_argument, NULL, 1}, |
| {"source-server", required_argument, NULL, 2}, |
| {"no-ensure-shutdown", no_argument, NULL, 4}, |
| {"version", no_argument, NULL, 'V'}, |
| {"restore-target-wal", no_argument, NULL, 'c'}, |
| {"dry-run", no_argument, NULL, 'n'}, |
| {"no-sync", no_argument, NULL, 'N'}, |
| {"progress", no_argument, NULL, 'P'}, |
| {"debug", no_argument, NULL, 3}, |
| {NULL, 0, NULL, 0} |
| }; |
| int option_index; |
| int c; |
| XLogRecPtr divergerec; |
| int lastcommontliIndex; |
| XLogRecPtr chkptrec; |
| TimeLineID chkpttli; |
| XLogRecPtr chkptredo; |
| XLogRecPtr target_wal_endrec; |
| size_t size; |
| char *buffer; |
| bool no_ensure_shutdown = false; |
| bool rewind_needed; |
| bool writerecoveryconf = false; |
| char *replication_slot = NULL; |
| |
| filemap_t *filemap; |
| |
| |
| pg_logging_init(argv[0]); |
| set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_rewind")); |
| progname = get_progname(argv[0]); |
| |
| /* Process command-line arguments */ |
| if (argc > 1) |
| { |
| if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0) |
| { |
| usage(progname); |
| exit(0); |
| } |
| if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0) |
| { |
| puts("pg_rewind (Apache Cloudberry) " PG_VERSION); |
| exit(0); |
| } |
| } |
| |
| while ((c = getopt_long(argc, argv, "cD:nNPR", long_options, &option_index)) != -1) |
| { |
| switch (c) |
| { |
| case '?': |
| fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname); |
| exit(1); |
| |
| case 'c': |
| restore_wal = true; |
| break; |
| |
| case 'P': |
| showprogress = true; |
| break; |
| |
| case 'n': |
| dry_run = true; |
| break; |
| |
| case 'S': |
| replication_slot = pg_strdup(optarg); |
| break; |
| |
| case 'N': |
| do_sync = false; |
| break; |
| |
| case 'R': |
| writerecoveryconf = true; |
| break; |
| |
| case 3: |
| debug = true; |
| pg_logging_increase_verbosity(); |
| break; |
| |
| case 'D': /* -D or --target-pgdata */ |
| datadir_target = pg_strdup(optarg); |
| break; |
| |
| case 1: /* --source-pgdata */ |
| datadir_source = pg_strdup(optarg); |
| break; |
| |
| case 2: /* --source-server */ |
| connstr_source = pg_strdup(optarg); |
| break; |
| |
| case 4: |
| no_ensure_shutdown = true; |
| break; |
| } |
| } |
| |
| if (datadir_source == NULL && connstr_source == NULL) |
| { |
| pg_log_error("no source specified (--source-pgdata or --source-server)"); |
| fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname); |
| exit(1); |
| } |
| |
| if (datadir_source != NULL && connstr_source != NULL) |
| { |
| pg_log_error("only one of --source-pgdata or --source-server can be specified"); |
| fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname); |
| exit(1); |
| } |
| |
| if (datadir_source != NULL && connstr_source != NULL) |
| { |
| fprintf(stderr, _("%s: only one of --source-pgdata or --source-server can be specified\n"), progname); |
| fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname); |
| exit(1); |
| } |
| |
| if (datadir_target == NULL) |
| { |
| pg_log_error("no target data directory specified (--target-pgdata)"); |
| fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname); |
| exit(1); |
| } |
| |
| if (writerecoveryconf && connstr_source == NULL) |
| { |
| pg_log_error("no source server information (--source-server) specified for --write-recovery-conf"); |
| fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname); |
| exit(1); |
| } |
| |
| if (optind < argc) |
| { |
| pg_log_error("too many command-line arguments (first is \"%s\")", |
| argv[optind]); |
| fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname); |
| exit(1); |
| } |
| |
| if (!writerecoveryconf && replication_slot != NULL) |
| { |
| fprintf(stderr, _("%s: --slot can be specified only if --write-recovery-conf is specified\n"), progname); |
| fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname); |
| exit(1); |
| } |
| |
| /* |
| * Don't allow pg_rewind to be run as root, to avoid overwriting the |
| * ownership of files in the data directory. We need only check for root |
| * -- any other user won't have sufficient permissions to modify files in |
| * the data directory. |
| */ |
| #ifndef WIN32 |
| if (geteuid() == 0) |
| { |
| pg_log_error("cannot be executed by \"root\""); |
| fprintf(stderr, _("You must run %s as the PostgreSQL superuser.\n"), |
| progname); |
| exit(1); |
| } |
| #endif |
| |
| get_restricted_token(); |
| |
| /* Set mask based on PGDATA permissions */ |
| if (!GetDataDirectoryCreatePerm(datadir_target)) |
| { |
| pg_log_error("could not read permissions of directory \"%s\": %m", |
| datadir_target); |
| exit(1); |
| } |
| |
| umask(pg_mode_mask); |
| |
| getRestoreCommand(argv[0]); |
| |
| atexit(disconnect_atexit); |
| |
| /* |
| * Ok, we have all the options and we're ready to start. First, connect to |
| * remote server. |
| */ |
| if (connstr_source) |
| { |
| conn = PQconnectdb(connstr_source); |
| |
| if (PQstatus(conn) == CONNECTION_BAD) |
| pg_fatal("%s", PQerrorMessage(conn)); |
| |
| if (showprogress) |
| pg_log_info("connected to server"); |
| |
| source = init_libpq_source(conn); |
| } |
| else |
| source = init_local_source(datadir_source); |
| |
| /* |
| * Check the status of the target instance. |
| * |
| * If the target instance was not cleanly shut down, start and stop the |
| * target cluster once in single-user mode to enforce recovery to finish, |
| * ensuring that the cluster can be used by pg_rewind. Note that if |
| * no_ensure_shutdown is specified, pg_rewind ignores this step, and users |
| * need to make sure by themselves that the target cluster is in a clean |
| * state. |
| */ |
| buffer = slurpFile(datadir_target, "global/pg_control", &size); |
| digestControlFile(&ControlFile_target, buffer, size); |
| pg_free(buffer); |
| |
| if (!no_ensure_shutdown && |
| ControlFile_target.state != DB_SHUTDOWNED && |
| ControlFile_target.state != DB_SHUTDOWNED_IN_RECOVERY) |
| { |
| ensureCleanShutdown(argv[0]); |
| |
| buffer = slurpFile(datadir_target, "global/pg_control", &size); |
| digestControlFile(&ControlFile_target, buffer, size); |
| pg_free(buffer); |
| } |
| |
| buffer = source->fetch_file(source, "global/pg_control", &size); |
| digestControlFile(&ControlFile_source, buffer, size); |
| pg_free(buffer); |
| |
| dbid_target = get_target_dbid(argv[0]); |
| |
| sanityChecks(); |
| |
| #ifdef FAULT_INJECTOR |
| /* |
| * SUSPEND_PG_REWIND is used for testing purposes. If set to true, the pg_rewind process will be |
| * suspended for 120 seconds, so that it's entry can be checked in the pg_stat_activity table. |
| * The goal being that we can run another instance of gprecoverseg and assert that it ignores |
| * recovery for segments that already have an active pg_rewind process. |
| */ |
| char* suspend_pg_rewind = getenv("SUSPEND_PG_REWIND"); |
| if(suspend_pg_rewind != NULL && strcmp(suspend_pg_rewind, "true") == 0) |
| { |
| pg_log_info("pg_rewind suspended"); |
| sleep(120); |
| } |
| #endif |
| |
| /* |
| * Find the common ancestor timeline between the clusters. |
| * |
| * If both clusters are already on the same timeline, there's nothing to |
| * do. |
| */ |
| if (ControlFile_target.checkPointCopy.ThisTimeLineID == |
| ControlFile_source.checkPointCopy.ThisTimeLineID) |
| { |
| pg_log_info("source and target cluster are on the same timeline: %u", |
| ControlFile_source.checkPointCopy.ThisTimeLineID); |
| rewind_needed = false; |
| target_wal_endrec = 0; |
| } |
| else |
| { |
| XLogRecPtr chkptendrec; |
| |
| findCommonAncestorTimeline(&divergerec, &lastcommontliIndex); |
| pg_log_info("servers diverged at WAL location %X/%X on timeline %u", |
| LSN_FORMAT_ARGS(divergerec), |
| targetHistory[lastcommontliIndex].tli); |
| |
| /* |
| * Determine the end-of-WAL on the target. |
| * |
| * The WAL ends at the last shutdown checkpoint, or at |
| * minRecoveryPoint if it was a standby. (If we supported rewinding a |
| * server that was not shut down cleanly, we would need to replay |
| * until we reach the first invalid record, like crash recovery does.) |
| */ |
| |
| /* read the checkpoint record on the target to see where it ends. */ |
| chkptendrec = readOneRecord(datadir_target, |
| ControlFile_target.checkPoint, |
| targetNentries - 1, |
| restore_command); |
| |
| if (ControlFile_target.minRecoveryPoint > chkptendrec) |
| { |
| target_wal_endrec = ControlFile_target.minRecoveryPoint; |
| } |
| else |
| { |
| target_wal_endrec = chkptendrec; |
| } |
| |
| /* |
| * Check for the possibility that the target is in fact a direct |
| * ancestor of the source. In that case, there is no divergent history |
| * in the target that needs rewinding. |
| */ |
| if (target_wal_endrec > divergerec) |
| { |
| rewind_needed = true; |
| } |
| else |
| { |
| /* the last common checkpoint record must be part of target WAL */ |
| Assert(target_wal_endrec == divergerec); |
| |
| rewind_needed = false; |
| } |
| } |
| |
| if (!rewind_needed) |
| { |
| pg_log_info("no rewind required"); |
| if (writerecoveryconf && !dry_run) |
| WriteRecoveryConfig(conn, datadir_target, |
| GenerateRecoveryConfig(conn, replication_slot)); |
| exit(0); |
| } |
| |
| findLastCheckpoint(datadir_target, divergerec, lastcommontliIndex, |
| &chkptrec, &chkpttli, &chkptredo, restore_command); |
| pg_log_info("rewinding from last common checkpoint at %X/%X on timeline %u", |
| LSN_FORMAT_ARGS(chkptrec), chkpttli); |
| |
| /* Initialize the hash table to track the status of each file */ |
| filehash_init(); |
| |
| /* |
| * Collect information about all files in the both data directories. |
| */ |
| if (showprogress) |
| pg_log_info("reading source file list"); |
| source->traverse_files(source, &process_source_file); |
| |
| if (showprogress) |
| pg_log_info("reading target file list"); |
| traverse_datadir(datadir_target, &process_target_file); |
| |
| /* |
| * Read the target WAL from last checkpoint before the point of fork, to |
| * extract all the pages that were modified on the target cluster after |
| * the fork. |
| */ |
| if (showprogress) |
| pg_log_info("reading WAL in target"); |
| extractPageMap(datadir_target, chkptrec, lastcommontliIndex, |
| target_wal_endrec, restore_command); |
| |
| |
| /* |
| * We have collected all information we need from both systems. Decide |
| * what to do with each file. |
| */ |
| |
| filemap = decide_file_actions(); |
| |
| if (showprogress) |
| calculate_totals(filemap); |
| |
| /* this is too verbose even for verbose mode */ |
| if (debug) |
| print_filemap(filemap); |
| |
| /* |
| * Ok, we're ready to start copying things over. |
| */ |
| if (showprogress) |
| { |
| pg_log_info("need to copy %lu MB (total source directory size is %lu MB)", |
| (unsigned long) (filemap->fetch_size / (1024 * 1024)), |
| (unsigned long) (filemap->total_size / (1024 * 1024))); |
| |
| fetch_size = filemap->fetch_size; |
| fetch_done = 0; |
| } |
| |
| /* |
| * We have now collected all the information we need from both systems, |
| * and we are ready to start modifying the target directory. |
| * |
| * This is the point of no return. Once we start copying things, there is |
| * no turning back! |
| */ |
| perform_rewind(filemap, source, chkptrec, chkpttli, chkptredo); |
| |
| if (showprogress) |
| pg_log_info("syncing target data directory"); |
| sync_target_dir(); |
| |
| /* Also update the standby configuration, if requested. */ |
| if (writerecoveryconf && !dry_run) |
| WriteRecoveryConfig(conn, datadir_target, |
| GenerateRecoveryConfig(conn, NULL)); |
| |
| /* don't need the source connection anymore */ |
| source->destroy(source); |
| if (conn) |
| { |
| PQfinish(conn); |
| conn = NULL; |
| } |
| |
| pg_log_info("Done!"); |
| |
| return 0; |
| } |
| |
| /* |
| * Perform the rewind. |
| * |
| * We have already collected all the information we need from the |
| * target and the source. |
| */ |
| static void |
| perform_rewind(filemap_t *filemap, rewind_source *source, |
| XLogRecPtr chkptrec, |
| TimeLineID chkpttli, |
| XLogRecPtr chkptredo) |
| { |
| XLogRecPtr endrec; |
| TimeLineID endtli; |
| ControlFileData ControlFile_new; |
| size_t size; |
| char *buffer; |
| |
| /* |
| * Execute the actions in the file map, fetching data from the source |
| * system as needed. |
| */ |
| for (int i = 0; i < filemap->nentries; i++) |
| { |
| file_entry_t *entry = filemap->entries[i]; |
| |
| /* |
| * If this is a relation file, copy the modified blocks. |
| * |
| * This is in addition to any other changes. |
| */ |
| if (entry->target_pages_to_overwrite.bitmapsize > 0) |
| { |
| datapagemap_iterator_t *iter; |
| BlockNumber blkno; |
| off_t offset; |
| |
| iter = datapagemap_iterate(&entry->target_pages_to_overwrite); |
| while (datapagemap_next(iter, &blkno)) |
| { |
| offset = blkno * BLCKSZ; |
| source->queue_fetch_range(source, entry->path, offset, BLCKSZ); |
| } |
| pg_free(iter); |
| } |
| |
| switch (entry->action) |
| { |
| case FILE_ACTION_NONE: |
| /* nothing else to do */ |
| break; |
| |
| case FILE_ACTION_COPY: |
| /* Truncate the old file out of the way, if any */ |
| open_target_file(entry->path, true); |
| source->queue_fetch_range(source, entry->path, |
| 0, entry->source_size); |
| break; |
| |
| case FILE_ACTION_TRUNCATE: |
| truncate_target_file(entry->path, entry->source_size); |
| break; |
| |
| case FILE_ACTION_COPY_TAIL: |
| source->queue_fetch_range(source, entry->path, |
| entry->target_size, |
| entry->source_size - entry->target_size); |
| break; |
| |
| case FILE_ACTION_REMOVE: |
| remove_target(entry); |
| break; |
| |
| case FILE_ACTION_CREATE: |
| create_target(entry); |
| break; |
| |
| case FILE_ACTION_UNDECIDED: |
| pg_fatal("no action decided for file \"%s\"", entry->path); |
| break; |
| } |
| } |
| |
| /* Complete any remaining range-fetches that we queued up above. */ |
| source->finish_fetch(source); |
| |
| close_target_file(); |
| |
| progress_report(true); |
| |
| /* |
| * Fetch the control file from the source last. This ensures that the |
| * minRecoveryPoint is up-to-date. |
| */ |
| buffer = source->fetch_file(source, "global/pg_control", &size); |
| digestControlFile(&ControlFile_source_after, buffer, size); |
| pg_free(buffer); |
| |
| /* |
| * Sanity check: If the source is a local system, the control file should |
| * not have changed since we started. |
| * |
| * XXX: We assume it hasn't been modified, but actually, what could go |
| * wrong? The logic handles a libpq source that's modified concurrently, |
| * why not a local datadir? |
| */ |
| if (datadir_source && |
| memcmp(&ControlFile_source, &ControlFile_source_after, |
| sizeof(ControlFileData)) != 0) |
| { |
| pg_fatal("source system was modified while pg_rewind was running"); |
| } |
| |
| if (showprogress) |
| pg_log_info("creating backup label and updating control file"); |
| |
| /* |
| * Create a backup label file, to tell the target where to begin the WAL |
| * replay. Normally, from the last common checkpoint between the source |
| * and the target. But if the source is a standby server, it's possible |
| * that the last common checkpoint is *after* the standby's restartpoint. |
| * That implies that the source server has applied the checkpoint record, |
| * but hasn't performed a corresponding restartpoint yet. Make sure we |
| * start at the restartpoint's redo point in that case. |
| * |
| * Use the old version of the source's control file for this. The server |
| * might have finished the restartpoint after we started copying files, |
| * but we must begin from the redo point at the time that started copying. |
| */ |
| if (ControlFile_source.checkPointCopy.redo < chkptredo) |
| { |
| chkptredo = ControlFile_source.checkPointCopy.redo; |
| chkpttli = ControlFile_source.checkPointCopy.ThisTimeLineID; |
| chkptrec = ControlFile_source.checkPoint; |
| } |
| createBackupLabel(chkptredo, chkpttli, chkptrec); |
| |
| /* |
| * Update control file of target, to tell the target how far it must |
| * replay the WAL (minRecoveryPoint). |
| */ |
| if (connstr_source) |
| { |
| /* |
| * The source is a live server. Like in an online backup, it's |
| * important that we recover all the WAL that was generated while we |
| * were copying files. |
| */ |
| if (ControlFile_source_after.state == DB_IN_ARCHIVE_RECOVERY) |
| { |
| /* |
| * Source is a standby server. We must replay to its |
| * minRecoveryPoint. |
| */ |
| endrec = ControlFile_source_after.minRecoveryPoint; |
| endtli = ControlFile_source_after.minRecoveryPointTLI; |
| } |
| else |
| { |
| /* |
| * Source is a production, non-standby, server. We must replay to |
| * the last WAL insert location. |
| */ |
| if (ControlFile_source_after.state != DB_IN_PRODUCTION) |
| pg_fatal("source system was in unexpected state at end of rewind"); |
| |
| endrec = source->get_current_wal_insert_lsn(source); |
| endtli = ControlFile_source_after.checkPointCopy.ThisTimeLineID; |
| } |
| } |
| else |
| { |
| /* |
| * Source is a local data directory. It should've shut down cleanly, |
| * and we must replay to the latest shutdown checkpoint. |
| */ |
| endrec = ControlFile_source_after.checkPoint; |
| endtli = ControlFile_source_after.checkPointCopy.ThisTimeLineID; |
| } |
| |
| memcpy(&ControlFile_new, &ControlFile_source_after, sizeof(ControlFileData)); |
| ControlFile_new.minRecoveryPoint = endrec; |
| ControlFile_new.minRecoveryPointTLI = endtli; |
| ControlFile_new.state = DB_IN_ARCHIVE_RECOVERY; |
| if (!dry_run) |
| update_controlfile(datadir_target, &ControlFile_new, do_sync); |
| } |
| |
| static void |
| sanityChecks(void) |
| { |
| /* TODO Check that there's no backup_label in either cluster */ |
| |
| /* Check system_identifier match */ |
| if (ControlFile_target.system_identifier != ControlFile_source.system_identifier) |
| pg_fatal("source and target clusters are from different systems"); |
| |
| /* check version */ |
| if (ControlFile_target.pg_control_version != PG_CONTROL_VERSION || |
| ControlFile_source.pg_control_version != PG_CONTROL_VERSION || |
| ControlFile_target.catalog_version_no != CATALOG_VERSION_NO || |
| ControlFile_source.catalog_version_no != CATALOG_VERSION_NO) |
| { |
| pg_fatal("clusters are not compatible with this version of pg_rewind"); |
| } |
| |
| /* |
| * Target cluster need to use checksums or hint bit wal-logging, this to |
| * prevent from data corruption that could occur because of hint bits. |
| */ |
| if (ControlFile_target.data_checksum_version != PG_DATA_CHECKSUM_VERSION && |
| !ControlFile_target.wal_log_hints) |
| { |
| pg_fatal("target server needs to use either data checksums or \"wal_log_hints = on\""); |
| } |
| |
| /* |
| * Target cluster better not be running. This doesn't guard against |
| * someone starting the cluster concurrently. Also, this is probably more |
| * strict than necessary; it's OK if the target node was not shut down |
| * cleanly, as long as it isn't running at the moment. |
| */ |
| if (ControlFile_target.state != DB_SHUTDOWNED && |
| ControlFile_target.state != DB_SHUTDOWNED_IN_RECOVERY) |
| pg_fatal("target server must be shut down cleanly"); |
| |
| /* |
| * When the source is a data directory, also require that the source |
| * server is shut down. There isn't any very strong reason for this |
| * limitation, but better safe than sorry. |
| */ |
| if (datadir_source && |
| ControlFile_source.state != DB_SHUTDOWNED && |
| ControlFile_source.state != DB_SHUTDOWNED_IN_RECOVERY) |
| pg_fatal("source data directory must be shut down cleanly"); |
| } |
| |
| /* |
| * Print a progress report based on the fetch_size and fetch_done variables. |
| * |
| * Progress report is written at maximum once per second, except that the |
| * last progress report is always printed. |
| * |
| * If finished is set to true, this is the last progress report. The cursor |
| * is moved to the next line. |
| */ |
| void |
| progress_report(bool finished) |
| { |
| static pg_time_t last_progress_report = 0; |
| int percent; |
| char fetch_done_str[32]; |
| char fetch_size_str[32]; |
| pg_time_t now; |
| |
| if (!showprogress) |
| return; |
| |
| now = time(NULL); |
| if (now == last_progress_report && !finished) |
| return; /* Max once per second */ |
| |
| last_progress_report = now; |
| percent = fetch_size ? (int) ((fetch_done) * 100 / fetch_size) : 0; |
| |
| /* |
| * Avoid overflowing past 100% or the full size. This may make the total |
| * size number change as we approach the end of the backup (the estimate |
| * will always be wrong if WAL is included), but that's better than having |
| * the done column be bigger than the total. |
| */ |
| if (percent > 100) |
| percent = 100; |
| if (fetch_done > fetch_size) |
| fetch_size = fetch_done; |
| |
| /* |
| * Separate step to keep platform-dependent format code out of |
| * translatable strings. And we only test for INT64_FORMAT availability |
| * in snprintf, not fprintf. |
| */ |
| snprintf(fetch_done_str, sizeof(fetch_done_str), INT64_FORMAT, |
| fetch_done / 1024); |
| snprintf(fetch_size_str, sizeof(fetch_size_str), INT64_FORMAT, |
| fetch_size / 1024); |
| |
| fprintf(stderr, _("%*s/%s kB (%d%%) copied"), |
| (int) strlen(fetch_size_str), fetch_done_str, fetch_size_str, |
| percent); |
| |
| /* |
| * Stay on the same line if reporting to a terminal and we're not done |
| * yet. |
| */ |
| fputc((!finished && isatty(fileno(stderr))) ? '\r' : '\n', stderr); |
| } |
| |
| /* |
| * Find minimum from two WAL locations assuming InvalidXLogRecPtr means |
| * infinity as src/include/access/timeline.h states. This routine should |
| * be used only when comparing WAL locations related to history files. |
| */ |
| static XLogRecPtr |
| MinXLogRecPtr(XLogRecPtr a, XLogRecPtr b) |
| { |
| if (XLogRecPtrIsInvalid(a)) |
| return b; |
| else if (XLogRecPtrIsInvalid(b)) |
| return a; |
| else |
| return Min(a, b); |
| } |
| |
| /* |
| * Retrieve timeline history for given control file which should behold |
| * either source or target. |
| */ |
| static TimeLineHistoryEntry * |
| getTimelineHistory(ControlFileData *controlFile, int *nentries) |
| { |
| TimeLineHistoryEntry *history; |
| TimeLineID tli; |
| |
| tli = controlFile->checkPointCopy.ThisTimeLineID; |
| |
| /* |
| * Timeline 1 does not have a history file, so there is no need to check |
| * and fake an entry with infinite start and end positions. |
| */ |
| if (tli == 1) |
| { |
| history = (TimeLineHistoryEntry *) pg_malloc(sizeof(TimeLineHistoryEntry)); |
| history->tli = tli; |
| history->begin = history->end = InvalidXLogRecPtr; |
| *nentries = 1; |
| } |
| else |
| { |
| char path[MAXPGPATH]; |
| char *histfile; |
| |
| TLHistoryFilePath(path, tli); |
| |
| /* Get history file from appropriate source */ |
| if (controlFile == &ControlFile_source) |
| histfile = source->fetch_file(source, path, NULL); |
| else if (controlFile == &ControlFile_target) |
| histfile = slurpFile(datadir_target, path, NULL); |
| else |
| pg_fatal("invalid control file"); |
| |
| history = rewind_parseTimeLineHistory(histfile, tli, nentries); |
| pg_free(histfile); |
| } |
| |
| if (debug) |
| { |
| int i; |
| |
| if (controlFile == &ControlFile_source) |
| pg_log_debug("Source timeline history:"); |
| else if (controlFile == &ControlFile_target) |
| pg_log_debug("Target timeline history:"); |
| else |
| Assert(false); |
| |
| /* |
| * Print the target timeline history. |
| */ |
| for (i = 0; i < targetNentries; i++) |
| { |
| TimeLineHistoryEntry *entry; |
| |
| entry = &history[i]; |
| pg_log_debug("%u: %X/%X - %X/%X", entry->tli, |
| LSN_FORMAT_ARGS(entry->begin), |
| LSN_FORMAT_ARGS(entry->end)); |
| } |
| } |
| |
| return history; |
| } |
| |
| /* |
| * Determine the TLI of the last common timeline in the timeline history of the |
| * two clusters. targetHistory is filled with target timeline history and |
| * targetNentries is number of items in targetHistory. *tliIndex is set to the |
| * index of last common timeline in targetHistory array, and *recptr is set to |
| * the position where the timeline history diverged (ie. the first WAL record |
| * that's not the same in both clusters). |
| * |
| * Control files of both clusters must be read into ControlFile_target/source |
| * before calling this routine. |
| */ |
| static void |
| findCommonAncestorTimeline(XLogRecPtr *recptr, int *tliIndex) |
| { |
| TimeLineHistoryEntry *sourceHistory; |
| int sourceNentries; |
| int i, |
| n; |
| |
| /* Retrieve timelines for both source and target */ |
| sourceHistory = getTimelineHistory(&ControlFile_source, &sourceNentries); |
| targetHistory = getTimelineHistory(&ControlFile_target, &targetNentries); |
| |
| /* |
| * Trace the history forward, until we hit the timeline diverge. It may |
| * still be possible that the source and target nodes used the same |
| * timeline number in their history but with different start position |
| * depending on the history files that each node has fetched in previous |
| * recovery processes. Hence check the start position of the new timeline |
| * as well and move down by one extra timeline entry if they do not match. |
| */ |
| n = Min(sourceNentries, targetNentries); |
| for (i = 0; i < n; i++) |
| { |
| if (sourceHistory[i].tli != targetHistory[i].tli || |
| sourceHistory[i].begin != targetHistory[i].begin) |
| break; |
| } |
| |
| if (i > 0) |
| { |
| i--; |
| *recptr = MinXLogRecPtr(sourceHistory[i].end, targetHistory[i].end); |
| *tliIndex = i; |
| |
| pg_free(sourceHistory); |
| return; |
| } |
| else |
| { |
| pg_fatal("could not find common ancestor of the source and target cluster's timelines"); |
| } |
| } |
| |
| |
| /* |
| * Create a backup_label file that forces recovery to begin at the last common |
| * checkpoint. |
| */ |
| static void |
| createBackupLabel(XLogRecPtr startpoint, TimeLineID starttli, XLogRecPtr checkpointloc) |
| { |
| XLogSegNo startsegno; |
| time_t stamp_time; |
| char strfbuf[128]; |
| char xlogfilename[MAXFNAMELEN]; |
| struct tm *tmp; |
| char buf[1000]; |
| int len; |
| |
| XLByteToSeg(startpoint, startsegno, WalSegSz); |
| XLogFileName(xlogfilename, starttli, startsegno, WalSegSz); |
| |
| /* |
| * Construct backup label file |
| */ |
| stamp_time = time(NULL); |
| tmp = localtime(&stamp_time); |
| strftime(strfbuf, sizeof(strfbuf), "%Y-%m-%d %H:%M:%S %Z", tmp); |
| |
| len = snprintf(buf, sizeof(buf), |
| "START WAL LOCATION: %X/%X (file %s)\n" |
| "CHECKPOINT LOCATION: %X/%X\n" |
| "BACKUP METHOD: pg_rewind\n" |
| "BACKUP FROM: standby\n" |
| "START TIME: %s\n", |
| /* omit LABEL: line */ |
| LSN_FORMAT_ARGS(startpoint), xlogfilename, |
| LSN_FORMAT_ARGS(checkpointloc), |
| strfbuf); |
| if (len >= sizeof(buf)) |
| pg_fatal("backup label buffer too small"); /* shouldn't happen */ |
| |
| /* TODO: move old file out of the way, if any. */ |
| open_target_file("backup_label", true); /* BACKUP_LABEL_FILE */ |
| write_target_range(buf, 0, len); |
| close_target_file(); |
| } |
| |
| /* |
| * Check CRC of control file |
| */ |
| static void |
| checkControlFile(ControlFileData *ControlFile) |
| { |
| pg_crc32c crc; |
| |
| /* Calculate CRC */ |
| INIT_CRC32C(crc); |
| COMP_CRC32C(crc, (char *) ControlFile, offsetof(ControlFileData, crc)); |
| FIN_CRC32C(crc); |
| |
| /* And simply compare it */ |
| if (!EQ_CRC32C(crc, ControlFile->crc)) |
| pg_fatal("unexpected control file CRC"); |
| } |
| |
| /* |
| * Verify control file contents in the buffer 'content', and copy it to |
| * *ControlFile. |
| */ |
| static void |
| digestControlFile(ControlFileData *ControlFile, const char *content, |
| size_t size) |
| { |
| if (size != PG_CONTROL_FILE_SIZE) |
| pg_fatal("unexpected control file size %d, expected %d", |
| (int) size, PG_CONTROL_FILE_SIZE); |
| |
| memcpy(ControlFile, content, sizeof(ControlFileData)); |
| |
| /* set and validate WalSegSz */ |
| WalSegSz = ControlFile->xlog_seg_size; |
| |
| if (!IsValidWalSegSize(WalSegSz)) |
| pg_fatal(ngettext("WAL segment size must be a power of two between 1 MB and 1 GB, but the control file specifies %d byte", |
| "WAL segment size must be a power of two between 1 MB and 1 GB, but the control file specifies %d bytes", |
| WalSegSz), |
| WalSegSz); |
| |
| /* Additional checks on control file */ |
| checkControlFile(ControlFile); |
| } |
| |
| |
| |
| static int32 |
| get_target_dbid(const char *argv0) |
| { |
| char cmd_output[1024]; |
| FILE *output; |
| int32 dbid; |
| |
| int ret; |
| #define MAXCMDLEN (2 * MAXPGPATH) |
| char exec_path[MAXPGPATH]; |
| char cmd[MAXCMDLEN]; |
| long parsed_dbid; |
| |
| /* locate postgres binary */ |
| if ((ret = find_other_exec(argv0, "postgres", |
| "postgres (Apache Cloudberry) " PG_VERSION "\n", |
| exec_path)) < 0) |
| { |
| char full_path[MAXPGPATH]; |
| |
| if (find_my_exec(argv0, full_path) < 0) |
| strlcpy(full_path, progname, sizeof(full_path)); |
| |
| if (ret == -1) |
| pg_fatal("The program \"postgres\" is needed by %s but was \n" |
| "not found in the same directory as \"%s\".\n" |
| "Check your installation.", progname, full_path); |
| else |
| pg_fatal("The program \"postgres\" was found by \"%s\"\n" |
| "but was not the same version as %s.\n" |
| "Check your installation.`", full_path, progname); |
| } |
| |
| snprintf(cmd, MAXCMDLEN, "\"%s\" -D \"%s\" -C gp_dbid", |
| exec_path, datadir_target); |
| |
| if ((output = popen(cmd, "r")) == NULL || |
| fgets(cmd_output, sizeof(cmd_output), output) == NULL) |
| pg_fatal("Could not get dbid using %s: %m\n", |
| cmd); |
| |
| pclose(output); |
| |
| /* Remove trailing newline */ |
| if (strchr(cmd_output, '\n') != NULL) |
| *strchr(cmd_output, '\n') = '\0'; |
| |
| errno = 0; |
| parsed_dbid = strtol(cmd_output, NULL, 10); |
| if (errno) |
| pg_fatal("could not parse valid dbid from %s\n with cmd_output %s\n", cmd, cmd_output); |
| if(parsed_dbid > INT16_MAX || parsed_dbid <= -1) |
| pg_fatal("parsed dbid (%ld) is out of valid range: [1, INT16_MAX]", parsed_dbid); |
| dbid = (int32) parsed_dbid; |
| |
| return dbid; |
| } |
| |
| /* |
| * Get value of GUC parameter restore_command from the target cluster. |
| * |
| * This uses a logic based on "postgres -C" to get the value from the |
| * cluster. |
| */ |
| static void |
| getRestoreCommand(const char *argv0) |
| { |
| int rc; |
| char postgres_exec_path[MAXPGPATH], |
| postgres_cmd[MAXPGPATH], |
| cmd_output[MAXPGPATH]; |
| |
| if (!restore_wal) |
| return; |
| |
| /* find postgres executable */ |
| rc = find_other_exec(argv0, "postgres", |
| PG_BACKEND_VERSIONSTR, |
| postgres_exec_path); |
| |
| if (rc < 0) |
| { |
| char full_path[MAXPGPATH]; |
| |
| if (find_my_exec(argv0, full_path) < 0) |
| strlcpy(full_path, progname, sizeof(full_path)); |
| |
| if (rc == -1) |
| pg_log_error("The program \"%s\" is needed by %s but was not found in the\n" |
| "same directory as \"%s\".\n" |
| "Check your installation.", |
| "postgres", progname, full_path); |
| else |
| pg_log_error("The program \"%s\" was found by \"%s\"\n" |
| "but was not the same version as %s.\n" |
| "Check your installation.", |
| "postgres", full_path, progname); |
| exit(1); |
| } |
| |
| /* |
| * Build a command able to retrieve the value of GUC parameter |
| * restore_command, if set. |
| */ |
| snprintf(postgres_cmd, sizeof(postgres_cmd), |
| "\"%s\" -D \"%s\" -C restore_command", |
| postgres_exec_path, datadir_target); |
| |
| if (!pipe_read_line(postgres_cmd, cmd_output, sizeof(cmd_output))) |
| exit(1); |
| |
| (void) pg_strip_crlf(cmd_output); |
| |
| if (strcmp(cmd_output, "") == 0) |
| pg_fatal("restore_command is not set in the target cluster"); |
| |
| restore_command = pg_strdup(cmd_output); |
| |
| pg_log_debug("using for rewind restore_command = \'%s\'", |
| restore_command); |
| } |
| |
| |
| /* |
| * Ensure clean shutdown of target instance by launching single-user mode |
| * postgres to do crash recovery. |
| */ |
| static void |
| ensureCleanShutdown(const char *argv0) |
| { |
| int ret; |
| #define MAXCMDLEN (2 * MAXPGPATH) |
| char exec_path[MAXPGPATH]; |
| char cmd[MAXCMDLEN]; |
| |
| /* locate postgres binary */ |
| if ((ret = find_other_exec(argv0, "postgres", |
| PG_BACKEND_VERSIONSTR, |
| exec_path)) < 0) |
| { |
| char full_path[MAXPGPATH]; |
| |
| if (find_my_exec(argv0, full_path) < 0) |
| strlcpy(full_path, progname, sizeof(full_path)); |
| |
| if (ret == -1) |
| pg_fatal("The program \"%s\" is needed by %s but was not found in the\n" |
| "same directory as \"%s\".\n" |
| "Check your installation.", |
| "postgres", progname, full_path); |
| else |
| pg_fatal("The program \"%s\" was found by \"%s\"\n" |
| "but was not the same version as %s.\n" |
| "Check your installation.", |
| "postgres", full_path, progname); |
| } |
| |
| pg_log_info("executing \"%s\" for target server to complete crash recovery", |
| exec_path); |
| |
| /* |
| * Skip processing if requested, but only after ensuring presence of |
| * postgres. |
| */ |
| if (dry_run) |
| return; |
| |
| /* finally run postgres single-user mode */ |
| /* |
| * gpdb: use postgres instead of template1, else the below postgres |
| * instance might hang in the below scenario: |
| * |
| * 1. There was a prepared but not finished "create database " dtx |
| * transaction which was recovered during crash recovery in the startup |
| * process and thus it holds the lock of database template1 since |
| * by default template1 is the template for database creation. |
| * |
| * 2. Single mode postgres process will execute the below code in |
| * InitPostgres() after finishing crash recovery (i.e. calling |
| * startupXLOG()) and then hang due to lock conflict. |
| * |
| * LockSharedObject(DatabaseRelationId, ...); |
| * |
| * DB_FOR_COMMON_ACCESS is used in fts probe, dtx recovery, gdd so it's |
| * hard to have the above kind of dtx transaction on DB_FOR_COMMON_ACCESS |
| * since the commands (e.g. create database with template |
| * DB_FOR_COMMON_ACCESS) would fail. |
| */ |
| |
| snprintf(cmd, MAXCMDLEN, "\"%s\" --single -D \"%s\" %s < %s", |
| exec_path, datadir_target, DB_FOR_COMMON_ACCESS, DEVNULL); |
| |
| if (system(cmd) != 0) |
| { |
| pg_log_error("postgres single-user mode in target cluster failed"); |
| pg_fatal("Command was: %s", cmd); |
| } |
| } |
| |
| static void |
| disconnect_atexit(void) |
| { |
| if (conn != NULL) |
| PQfinish(conn); |
| } |