| /*------------------------------------------------------------------------- |
| * |
| * standby.c |
| * Misc functions used in Hot Standby mode. |
| * |
| * All functions for handling RM_STANDBY_ID, which relate to |
| * AccessExclusiveLocks and starting snapshots for Hot Standby mode. |
| * Plus conflict recovery processing. |
| * |
| * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group |
| * Portions Copyright (c) 1994, Regents of the University of California |
| * |
| * IDENTIFICATION |
| * src/backend/storage/ipc/standby.c |
| * |
| *------------------------------------------------------------------------- |
| */ |
| #include "postgres.h" |
| #include "access/transam.h" |
| #include "access/twophase.h" |
| #include "access/xact.h" |
| #include "access/xloginsert.h" |
| #include "access/xlogrecovery.h" |
| #include "access/xlogutils.h" |
| #include "miscadmin.h" |
| #include "pgstat.h" |
| #include "replication/slot.h" |
| #include "storage/bufmgr.h" |
| #include "storage/lmgr.h" |
| #include "storage/proc.h" |
| #include "storage/procarray.h" |
| #include "storage/sinvaladt.h" |
| #include "storage/standby.h" |
| #include "utils/hsearch.h" |
| #include "utils/memutils.h" |
| #include "utils/ps_status.h" |
| #include "utils/timeout.h" |
| #include "utils/timestamp.h" |
| |
/*
 * User-settable GUC parameters.
 *
 * Both delay settings are in milliseconds and default to 30 seconds.  A
 * negative value makes GetStandbyLimitTime() report "wait forever".  The
 * streaming variant is used when GetXLogReceiptTime() reports that the
 * last WAL came from a stream, the archive variant otherwise.
 */
int max_standby_archive_delay = 30 * 1000;
int max_standby_streaming_delay = 30 * 1000;

/* When true, waits on recovery conflicts longer than deadlock_timeout are logged. */
bool log_recovery_conflict_waits = false;
| |
| /* |
| * Keep track of all the exclusive locks owned by original transactions. |
| * For each known exclusive lock, there is a RecoveryLockEntry in the |
| * RecoveryLockHash hash table. All RecoveryLockEntrys belonging to a |
| * given XID are chained together so that we can find them easily. |
| * For each original transaction that is known to have any such locks, |
| * there is a RecoveryLockXidEntry in the RecoveryLockXidHash hash table, |
| * which stores the head of the chain of its locks. |
| */ |
| typedef struct RecoveryLockEntry |
| { |
| xl_standby_lock key; /* hash key: xid, dbOid, relOid */ |
| struct RecoveryLockEntry *next; /* chain link */ |
| } RecoveryLockEntry; |
| |
| typedef struct RecoveryLockXidEntry |
| { |
| TransactionId xid; /* hash key -- must be first */ |
| struct RecoveryLockEntry *head; /* chain head */ |
| } RecoveryLockXidEntry; |
| |
| static HTAB *RecoveryLockHash = NULL; |
| static HTAB *RecoveryLockXidHash = NULL; |
| |
| /* Flags set by timeout handlers */ |
| static volatile sig_atomic_t got_standby_deadlock_timeout = false; |
| static volatile sig_atomic_t got_standby_delay_timeout = false; |
| static volatile sig_atomic_t got_standby_lock_timeout = false; |
| |
| static void ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist, |
| ProcSignalReason reason, |
| uint32 wait_event_info, |
| bool report_waiting); |
| static void SendRecoveryConflictWithBufferPin(ProcSignalReason reason); |
| static XLogRecPtr LogCurrentRunningXacts(RunningTransactions CurrRunningXacts); |
| static void LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks); |
| static const char *get_recovery_conflict_desc(ProcSignalReason reason); |
| |
| /* |
| * InitRecoveryTransactionEnvironment |
| * Initialize tracking of our primary's in-progress transactions. |
| * |
| * We need to issue shared invalidations and hold locks. Holding locks |
| * means others may want to wait on us, so we need to make a lock table |
| * vxact entry like a real transaction. We could create and delete |
| * lock table entries for each transaction but its simpler just to create |
| * one permanent entry and leave it there all the time. Locks are then |
| * acquired and released as needed. Yes, this means you can see the |
| * Startup process in pg_locks once we have run this. |
| */ |
| void |
| InitRecoveryTransactionEnvironment(void) |
| { |
| VirtualTransactionId vxid; |
| HASHCTL hash_ctl; |
| |
| Assert(RecoveryLockHash == NULL); /* don't run this twice */ |
| |
| /* |
| * Initialize the hash tables for tracking the locks held by each |
| * transaction. |
| */ |
| hash_ctl.keysize = sizeof(xl_standby_lock); |
| hash_ctl.entrysize = sizeof(RecoveryLockEntry); |
| RecoveryLockHash = hash_create("RecoveryLockHash", |
| 64, |
| &hash_ctl, |
| HASH_ELEM | HASH_BLOBS); |
| hash_ctl.keysize = sizeof(TransactionId); |
| hash_ctl.entrysize = sizeof(RecoveryLockXidEntry); |
| RecoveryLockXidHash = hash_create("RecoveryLockXidHash", |
| 64, |
| &hash_ctl, |
| HASH_ELEM | HASH_BLOBS); |
| |
| /* |
| * Initialize shared invalidation management for Startup process, being |
| * careful to register ourselves as a sendOnly process so we don't need to |
| * read messages, nor will we get signaled when the queue starts filling |
| * up. |
| */ |
| SharedInvalBackendInit(true); |
| |
| /* |
| * Lock a virtual transaction id for Startup process. |
| * |
| * We need to do GetNextLocalTransactionId() because |
| * SharedInvalBackendInit() leaves localTransactionId invalid and the lock |
| * manager doesn't like that at all. |
| * |
| * Note that we don't need to run XactLockTableInsert() because nobody |
| * needs to wait on xids. That sounds a little strange, but table locks |
| * are held by vxids and row level locks are held by xids. All queries |
| * hold AccessShareLocks so never block while we write or lock new rows. |
| */ |
| vxid.backendId = MyBackendId; |
| vxid.localTransactionId = GetNextLocalTransactionId(); |
| VirtualXactLockTableInsert(vxid); |
| |
| standbyState = STANDBY_INITIALIZED; |
| } |
| |
| /* |
| * ShutdownRecoveryTransactionEnvironment |
| * Shut down transaction tracking |
| * |
| * Prepare to switch from hot standby mode to normal operation. Shut down |
| * recovery-time transaction tracking. |
| * |
| * This must be called even in shutdown of startup process if transaction |
| * tracking has been initialized. Otherwise some locks the tracked |
| * transactions were holding will not be released and may interfere with |
| * the processes still running (but will exit soon later) at the exit of |
| * startup process. |
| */ |
| void |
| ShutdownRecoveryTransactionEnvironment(void) |
| { |
| /* |
| * Do nothing if RecoveryLockHash is NULL because that means that |
| * transaction tracking has not yet been initialized or has already been |
| * shut down. This makes it safe to have possibly-redundant calls of this |
| * function during process exit. |
| */ |
| if (RecoveryLockHash == NULL) |
| return; |
| |
| /* Mark all tracked in-progress transactions as finished. */ |
| ExpireAllKnownAssignedTransactionIds(); |
| |
| /* Release all locks the tracked transactions were holding */ |
| StandbyReleaseAllLocks(); |
| |
| /* Destroy the lock hash tables. */ |
| hash_destroy(RecoveryLockHash); |
| hash_destroy(RecoveryLockXidHash); |
| RecoveryLockHash = NULL; |
| RecoveryLockXidHash = NULL; |
| |
| /* Cleanup our VirtualTransaction */ |
| VirtualXactLockTableCleanup(); |
| } |
| |
| |
| /* |
| * ----------------------------------------------------- |
| * Standby wait timers and backend cancel logic |
| * ----------------------------------------------------- |
| */ |
| |
| /* |
| * Determine the cutoff time at which we want to start canceling conflicting |
| * transactions. Returns zero (a time safely in the past) if we are willing |
| * to wait forever. |
| */ |
| static TimestampTz |
| GetStandbyLimitTime(void) |
| { |
| TimestampTz rtime; |
| bool fromStream; |
| |
| /* |
| * The cutoff time is the last WAL data receipt time plus the appropriate |
| * delay variable. Delay of -1 means wait forever. |
| */ |
| GetXLogReceiptTime(&rtime, &fromStream); |
| if (fromStream) |
| { |
| if (max_standby_streaming_delay < 0) |
| return 0; /* wait forever */ |
| return TimestampTzPlusMilliseconds(rtime, max_standby_streaming_delay); |
| } |
| else |
| { |
| if (max_standby_archive_delay < 0) |
| return 0; /* wait forever */ |
| return TimestampTzPlusMilliseconds(rtime, max_standby_archive_delay); |
| } |
| } |
| |
| #define STANDBY_INITIAL_WAIT_US 1000 |
| static int standbyWait_us = STANDBY_INITIAL_WAIT_US; |
| |
| /* |
| * Standby wait logic for ResolveRecoveryConflictWithVirtualXIDs. |
| * We wait here for a while then return. If we decide we can't wait any |
| * more then we return true, if we can wait some more return false. |
| */ |
| static bool |
| WaitExceedsMaxStandbyDelay(uint32 wait_event_info) |
| { |
| TimestampTz ltime; |
| |
| CHECK_FOR_INTERRUPTS(); |
| |
| /* Are we past the limit time? */ |
| ltime = GetStandbyLimitTime(); |
| if (ltime && GetCurrentTimestamp() >= ltime) |
| return true; |
| |
| /* |
| * Sleep a bit (this is essential to avoid busy-waiting). |
| */ |
| pgstat_report_wait_start(wait_event_info); |
| pg_usleep(standbyWait_us); |
| pgstat_report_wait_end(); |
| |
| /* |
| * Progressively increase the sleep times, but not to more than 1s, since |
| * pg_usleep isn't interruptible on some platforms. |
| */ |
| standbyWait_us *= 2; |
| if (standbyWait_us > 1000000) |
| standbyWait_us = 1000000; |
| |
| return false; |
| } |
| |
| /* |
| * Log the recovery conflict. |
| * |
| * wait_start is the timestamp when the caller started to wait. |
| * now is the timestamp when this function has been called. |
| * wait_list is the list of virtual transaction ids assigned to |
| * conflicting processes. still_waiting indicates whether |
| * the startup process is still waiting for the recovery conflict |
| * to be resolved or not. |
| */ |
| void |
| LogRecoveryConflict(ProcSignalReason reason, TimestampTz wait_start, |
| TimestampTz now, VirtualTransactionId *wait_list, |
| bool still_waiting) |
| { |
| long secs; |
| int usecs; |
| long msecs; |
| StringInfoData buf; |
| int nprocs = 0; |
| |
| /* |
| * There must be no conflicting processes when the recovery conflict has |
| * already been resolved. |
| */ |
| Assert(still_waiting || wait_list == NULL); |
| |
| TimestampDifference(wait_start, now, &secs, &usecs); |
| msecs = secs * 1000 + usecs / 1000; |
| usecs = usecs % 1000; |
| |
| if (wait_list) |
| { |
| VirtualTransactionId *vxids; |
| |
| /* Construct a string of list of the conflicting processes */ |
| vxids = wait_list; |
| while (VirtualTransactionIdIsValid(*vxids)) |
| { |
| PGPROC *proc = BackendIdGetProc(vxids->backendId); |
| |
| /* proc can be NULL if the target backend is not active */ |
| if (proc) |
| { |
| if (nprocs == 0) |
| { |
| initStringInfo(&buf); |
| appendStringInfo(&buf, "%d", proc->pid); |
| } |
| else |
| appendStringInfo(&buf, ", %d", proc->pid); |
| |
| nprocs++; |
| } |
| |
| vxids++; |
| } |
| } |
| |
| /* |
| * If wait_list is specified, report the list of PIDs of active |
| * conflicting backends in a detail message. Note that if all the backends |
| * in the list are not active, no detail message is logged. |
| */ |
| if (still_waiting) |
| { |
| ereport(LOG, |
| errmsg("recovery still waiting after %ld.%03d ms: %s", |
| msecs, usecs, get_recovery_conflict_desc(reason)), |
| nprocs > 0 ? errdetail_log_plural("Conflicting process: %s.", |
| "Conflicting processes: %s.", |
| nprocs, buf.data) : 0); |
| } |
| else |
| { |
| ereport(LOG, |
| errmsg("recovery finished waiting after %ld.%03d ms: %s", |
| msecs, usecs, get_recovery_conflict_desc(reason))); |
| } |
| |
| if (nprocs > 0) |
| pfree(buf.data); |
| } |
| |
| /* |
| * This is the main executioner for any query backend that conflicts with |
| * recovery processing. Judgement has already been passed on it within |
| * a specific rmgr. Here we just issue the orders to the procs. The procs |
| * then throw the required error as instructed. |
| * |
| * If report_waiting is true, "waiting" is reported in PS display and the |
| * wait for recovery conflict is reported in the log, if necessary. If |
| * the caller is responsible for reporting them, report_waiting should be |
| * false. Otherwise, both the caller and this function report the same |
| * thing unexpectedly. |
| */ |
| static void |
| ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist, |
| ProcSignalReason reason, uint32 wait_event_info, |
| bool report_waiting) |
| { |
| TimestampTz waitStart = 0; |
| bool waiting = false; |
| bool logged_recovery_conflict = false; |
| |
| /* Fast exit, to avoid a kernel call if there's no work to be done. */ |
| if (!VirtualTransactionIdIsValid(*waitlist)) |
| return; |
| |
| /* Set the wait start timestamp for reporting */ |
| if (report_waiting && (log_recovery_conflict_waits || update_process_title)) |
| waitStart = GetCurrentTimestamp(); |
| |
| while (VirtualTransactionIdIsValid(*waitlist)) |
| { |
| /* reset standbyWait_us for each xact we wait for */ |
| standbyWait_us = STANDBY_INITIAL_WAIT_US; |
| |
| /* wait until the virtual xid is gone */ |
| while (!VirtualXactLock(*waitlist, false)) |
| { |
| /* Is it time to kill it? */ |
| if (WaitExceedsMaxStandbyDelay(wait_event_info)) |
| { |
| pid_t pid; |
| |
| /* |
| * Now find out who to throw out of the balloon. |
| */ |
| Assert(VirtualTransactionIdIsValid(*waitlist)); |
| pid = CancelVirtualTransaction(*waitlist, reason); |
| |
| /* |
| * Wait a little bit for it to die so that we avoid flooding |
| * an unresponsive backend when system is heavily loaded. |
| */ |
| if (pid != 0) |
| pg_usleep(5000L); |
| } |
| |
| if (waitStart != 0 && (!logged_recovery_conflict || !waiting)) |
| { |
| TimestampTz now = 0; |
| bool maybe_log_conflict; |
| bool maybe_update_title; |
| |
| maybe_log_conflict = (log_recovery_conflict_waits && !logged_recovery_conflict); |
| maybe_update_title = (update_process_title && !waiting); |
| |
| /* Get the current timestamp if not report yet */ |
| if (maybe_log_conflict || maybe_update_title) |
| now = GetCurrentTimestamp(); |
| |
| /* |
| * Report via ps if we have been waiting for more than 500 |
| * msec (should that be configurable?) |
| */ |
| if (maybe_update_title && |
| TimestampDifferenceExceeds(waitStart, now, 500)) |
| { |
| set_ps_display_suffix("waiting"); |
| waiting = true; |
| } |
| |
| /* |
| * Emit the log message if the startup process is waiting |
| * longer than deadlock_timeout for recovery conflict. |
| */ |
| if (maybe_log_conflict && |
| TimestampDifferenceExceeds(waitStart, now, DeadlockTimeout)) |
| { |
| LogRecoveryConflict(reason, waitStart, now, waitlist, true); |
| logged_recovery_conflict = true; |
| } |
| } |
| } |
| |
| /* The virtual transaction is gone now, wait for the next one */ |
| waitlist++; |
| } |
| |
| /* |
| * Emit the log message if recovery conflict was resolved but the startup |
| * process waited longer than deadlock_timeout for it. |
| */ |
| if (logged_recovery_conflict) |
| LogRecoveryConflict(reason, waitStart, GetCurrentTimestamp(), |
| NULL, false); |
| |
| /* reset ps display to remove the suffix if we added one */ |
| if (waiting) |
| set_ps_display_remove_suffix(); |
| |
| } |
| |
| /* |
| * Generate whatever recovery conflicts are needed to eliminate snapshots that |
| * might see XIDs <= snapshotConflictHorizon as still running. |
| * |
| * snapshotConflictHorizon cutoffs are our standard approach to generating |
| * granular recovery conflicts. Note that InvalidTransactionId values are |
| * interpreted as "definitely don't need any conflicts" here, which is a |
| * general convention that WAL records can (and often do) depend on. |
| */ |
| void |
| ResolveRecoveryConflictWithSnapshot(TransactionId snapshotConflictHorizon, |
| bool isCatalogRel, |
| RelFileLocator locator) |
| { |
| VirtualTransactionId *backends; |
| |
| /* |
| * If we get passed InvalidTransactionId then we do nothing (no conflict). |
| * |
| * This can happen when replaying already-applied WAL records after a |
| * standby crash or restart, or when replaying an XLOG_HEAP2_VISIBLE |
| * record that marks as frozen a page which was already all-visible. It's |
| * also quite common with records generated during index deletion |
| * (original execution of the deletion can reason that a recovery conflict |
| * which is sufficient for the deletion operation must take place before |
| * replay of the deletion record itself). |
| */ |
| if (!TransactionIdIsValid(snapshotConflictHorizon)) |
| return; |
| |
| Assert(TransactionIdIsNormal(snapshotConflictHorizon)); |
| backends = GetConflictingVirtualXIDs(snapshotConflictHorizon, |
| locator.dbOid); |
| ResolveRecoveryConflictWithVirtualXIDs(backends, |
| PROCSIG_RECOVERY_CONFLICT_SNAPSHOT, |
| WAIT_EVENT_RECOVERY_CONFLICT_SNAPSHOT, |
| true); |
| |
| /* |
| * Note that WaitExceedsMaxStandbyDelay() is not taken into account here |
| * (as opposed to ResolveRecoveryConflictWithVirtualXIDs() above). That |
| * seems OK, given that this kind of conflict should not normally be |
| * reached, e.g. due to using a physical replication slot. |
| */ |
| if (wal_level >= WAL_LEVEL_LOGICAL && isCatalogRel) |
| InvalidateObsoleteReplicationSlots(RS_INVAL_HORIZON, 0, locator.dbOid, |
| snapshotConflictHorizon); |
| } |
| |
| /* |
| * Variant of ResolveRecoveryConflictWithSnapshot that works with |
| * FullTransactionId values |
| */ |
| void |
| ResolveRecoveryConflictWithSnapshotFullXid(FullTransactionId snapshotConflictHorizon, |
| bool isCatalogRel, |
| RelFileLocator locator) |
| { |
| /* |
| * ResolveRecoveryConflictWithSnapshot operates on 32-bit TransactionIds, |
| * so truncate the logged FullTransactionId. If the logged value is very |
| * old, so that XID wrap-around already happened on it, there can't be any |
| * snapshots that still see it. |
| */ |
| FullTransactionId nextXid = ReadNextFullTransactionId(); |
| uint64 diff; |
| |
| diff = U64FromFullTransactionId(nextXid) - |
| U64FromFullTransactionId(snapshotConflictHorizon); |
| if (diff < MaxTransactionId / 2) |
| { |
| TransactionId truncated; |
| |
| truncated = XidFromFullTransactionId(snapshotConflictHorizon); |
| ResolveRecoveryConflictWithSnapshot(truncated, |
| isCatalogRel, |
| locator); |
| } |
| } |
| |
| void |
| ResolveRecoveryConflictWithTablespace(Oid tsid) |
| { |
| VirtualTransactionId *temp_file_users; |
| |
| /* |
| * Standby users may be currently using this tablespace for their |
| * temporary files. We only care about current users because |
| * temp_tablespace parameter will just ignore tablespaces that no longer |
| * exist. |
| * |
| * Ask everybody to cancel their queries immediately so we can ensure no |
| * temp files remain and we can remove the tablespace. Nuke the entire |
| * site from orbit, it's the only way to be sure. |
| * |
| * XXX: We could work out the pids of active backends using this |
| * tablespace by examining the temp filenames in the directory. We would |
| * then convert the pids into VirtualXIDs before attempting to cancel |
| * them. |
| * |
| * We don't wait for commit because drop tablespace is non-transactional. |
| */ |
| temp_file_users = GetConflictingVirtualXIDs(InvalidTransactionId, |
| InvalidOid); |
| ResolveRecoveryConflictWithVirtualXIDs(temp_file_users, |
| PROCSIG_RECOVERY_CONFLICT_TABLESPACE, |
| WAIT_EVENT_RECOVERY_CONFLICT_TABLESPACE, |
| true); |
| } |
| |
| void |
| ResolveRecoveryConflictWithDatabase(Oid dbid) |
| { |
| /* |
| * We don't do ResolveRecoveryConflictWithVirtualXIDs() here since that |
| * only waits for transactions and completely idle sessions would block |
| * us. This is rare enough that we do this as simply as possible: no wait, |
| * just force them off immediately. |
| * |
| * No locking is required here because we already acquired |
| * AccessExclusiveLock. Anybody trying to connect while we do this will |
| * block during InitPostgres() and then disconnect when they see the |
| * database has been removed. |
| */ |
| while (CountDBBackends(dbid) > 0) |
| { |
| CancelDBBackends(dbid, PROCSIG_RECOVERY_CONFLICT_DATABASE, true); |
| |
| /* |
| * Wait awhile for them to die so that we avoid flooding an |
| * unresponsive backend when system is heavily loaded. |
| */ |
| pg_usleep(10000); |
| } |
| } |
| |
| /* |
| * ResolveRecoveryConflictWithLock is called from ProcSleep() |
| * to resolve conflicts with other backends holding relation locks. |
| * |
| * The WaitLatch sleep normally done in ProcSleep() |
| * (when not InHotStandby) is performed here, for code clarity. |
| * |
| * We either resolve conflicts immediately or set a timeout to wake us at |
| * the limit of our patience. |
| * |
| * Resolve conflicts by canceling to all backends holding a conflicting |
| * lock. As we are already queued to be granted the lock, no new lock |
| * requests conflicting with ours will be granted in the meantime. |
| * |
| * We also must check for deadlocks involving the Startup process and |
| * hot-standby backend processes. If deadlock_timeout is reached in |
| * this function, all the backends holding the conflicting locks are |
| * requested to check themselves for deadlocks. |
| * |
| * logging_conflict should be true if the recovery conflict has not been |
| * logged yet even though logging is enabled. After deadlock_timeout is |
| * reached and the request for deadlock check is sent, we wait again to |
| * be signaled by the release of the lock if logging_conflict is false. |
| * Otherwise we return without waiting again so that the caller can report |
| * the recovery conflict. In this case, then, this function is called again |
| * with logging_conflict=false (because the recovery conflict has already |
| * been logged) and we will wait again for the lock to be released. |
| */ |
| void |
| ResolveRecoveryConflictWithLock(LOCKTAG locktag, bool logging_conflict) |
| { |
| TimestampTz ltime; |
| TimestampTz now; |
| |
| Assert(InHotStandby); |
| |
| ltime = GetStandbyLimitTime(); |
| now = GetCurrentTimestamp(); |
| |
| /* |
| * Update waitStart if first time through after the startup process |
| * started waiting for the lock. It should not be updated every time |
| * ResolveRecoveryConflictWithLock() is called during the wait. |
| * |
| * Use the current time obtained for comparison with ltime as waitStart |
| * (i.e., the time when this process started waiting for the lock). Since |
| * getting the current time newly can cause overhead, we reuse the |
| * already-obtained time to avoid that overhead. |
| * |
| * Note that waitStart is updated without holding the lock table's |
| * partition lock, to avoid the overhead by additional lock acquisition. |
| * This can cause "waitstart" in pg_locks to become NULL for a very short |
| * period of time after the wait started even though "granted" is false. |
| * This is OK in practice because we can assume that users are likely to |
| * look at "waitstart" when waiting for the lock for a long time. |
| */ |
| if (pg_atomic_read_u64(&MyProc->waitStart) == 0) |
| pg_atomic_write_u64(&MyProc->waitStart, now); |
| |
| if (now >= ltime && ltime != 0) |
| { |
| /* |
| * We're already behind, so clear a path as quickly as possible. |
| */ |
| VirtualTransactionId *backends; |
| |
| backends = GetLockConflicts(&locktag, AccessExclusiveLock, NULL); |
| |
| /* |
| * Prevent ResolveRecoveryConflictWithVirtualXIDs() from reporting |
| * "waiting" in PS display by disabling its argument report_waiting |
| * because the caller, WaitOnLock(), has already reported that. |
| */ |
| ResolveRecoveryConflictWithVirtualXIDs(backends, |
| PROCSIG_RECOVERY_CONFLICT_LOCK, |
| PG_WAIT_LOCK | locktag.locktag_type, |
| false); |
| } |
| else |
| { |
| /* |
| * Wait (or wait again) until ltime, and check for deadlocks as well |
| * if we will be waiting longer than deadlock_timeout |
| */ |
| EnableTimeoutParams timeouts[2]; |
| int cnt = 0; |
| |
| if (ltime != 0) |
| { |
| got_standby_lock_timeout = false; |
| timeouts[cnt].id = STANDBY_LOCK_TIMEOUT; |
| timeouts[cnt].type = TMPARAM_AT; |
| timeouts[cnt].fin_time = ltime; |
| cnt++; |
| } |
| |
| got_standby_deadlock_timeout = false; |
| timeouts[cnt].id = STANDBY_DEADLOCK_TIMEOUT; |
| timeouts[cnt].type = TMPARAM_AFTER; |
| timeouts[cnt].delay_ms = DeadlockTimeout; |
| cnt++; |
| |
| enable_timeouts(timeouts, cnt); |
| } |
| |
| /* Wait to be signaled by the release of the Relation Lock */ |
| ProcWaitForSignal(PG_WAIT_LOCK | locktag.locktag_type); |
| |
| /* |
| * Exit if ltime is reached. Then all the backends holding conflicting |
| * locks will be canceled in the next ResolveRecoveryConflictWithLock() |
| * call. |
| */ |
| if (got_standby_lock_timeout) |
| goto cleanup; |
| |
| if (got_standby_deadlock_timeout) |
| { |
| VirtualTransactionId *backends; |
| |
| backends = GetLockConflicts(&locktag, AccessExclusiveLock, NULL); |
| |
| /* Quick exit if there's no work to be done */ |
| if (!VirtualTransactionIdIsValid(*backends)) |
| goto cleanup; |
| |
| /* |
| * Send signals to all the backends holding the conflicting locks, to |
| * ask them to check themselves for deadlocks. |
| */ |
| while (VirtualTransactionIdIsValid(*backends)) |
| { |
| SignalVirtualTransaction(*backends, |
| PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK, |
| false); |
| backends++; |
| } |
| |
| /* |
| * Exit if the recovery conflict has not been logged yet even though |
| * logging is enabled, so that the caller can log that. Then |
| * RecoveryConflictWithLock() is called again and we will wait again |
| * for the lock to be released. |
| */ |
| if (logging_conflict) |
| goto cleanup; |
| |
| /* |
| * Wait again here to be signaled by the release of the Relation Lock, |
| * to prevent the subsequent RecoveryConflictWithLock() from causing |
| * deadlock_timeout and sending a request for deadlocks check again. |
| * Otherwise the request continues to be sent every deadlock_timeout |
| * until the relation locks are released or ltime is reached. |
| */ |
| got_standby_deadlock_timeout = false; |
| ProcWaitForSignal(PG_WAIT_LOCK | locktag.locktag_type); |
| } |
| |
| cleanup: |
| |
| /* |
| * Clear any timeout requests established above. We assume here that the |
| * Startup process doesn't have any other outstanding timeouts than those |
| * used by this function. If that stops being true, we could cancel the |
| * timeouts individually, but that'd be slower. |
| */ |
| disable_all_timeouts(false); |
| got_standby_lock_timeout = false; |
| got_standby_deadlock_timeout = false; |
| } |
| |
| /* |
| * ResolveRecoveryConflictWithBufferPin is called from LockBufferForCleanup() |
| * to resolve conflicts with other backends holding buffer pins. |
| * |
| * The ProcWaitForSignal() sleep normally done in LockBufferForCleanup() |
| * (when not InHotStandby) is performed here, for code clarity. |
| * |
| * We either resolve conflicts immediately or set a timeout to wake us at |
| * the limit of our patience. |
| * |
| * Resolve conflicts by sending a PROCSIG signal to all backends to check if |
| * they hold one of the buffer pins that is blocking Startup process. If so, |
| * those backends will take an appropriate error action, ERROR or FATAL. |
| * |
| * We also must check for deadlocks. Deadlocks occur because if queries |
| * wait on a lock, that must be behind an AccessExclusiveLock, which can only |
| * be cleared if the Startup process replays a transaction completion record. |
| * If Startup process is also waiting then that is a deadlock. The deadlock |
| * can occur if the query is waiting and then the Startup sleeps, or if |
| * Startup is sleeping and the query waits on a lock. We protect against |
| * only the former sequence here, the latter sequence is checked prior to |
| * the query sleeping, in CheckRecoveryConflictDeadlock(). |
| * |
| * Deadlocks are extremely rare, and relatively expensive to check for, |
| * so we don't do a deadlock check right away ... only if we have had to wait |
| * at least deadlock_timeout. |
| */ |
| void |
| ResolveRecoveryConflictWithBufferPin(void) |
| { |
| TimestampTz ltime; |
| |
| Assert(InHotStandby); |
| |
| ltime = GetStandbyLimitTime(); |
| |
| if (GetCurrentTimestamp() >= ltime && ltime != 0) |
| { |
| /* |
| * We're already behind, so clear a path as quickly as possible. |
| */ |
| SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN); |
| } |
| else |
| { |
| /* |
| * Wake up at ltime, and check for deadlocks as well if we will be |
| * waiting longer than deadlock_timeout |
| */ |
| EnableTimeoutParams timeouts[2]; |
| int cnt = 0; |
| |
| if (ltime != 0) |
| { |
| timeouts[cnt].id = STANDBY_TIMEOUT; |
| timeouts[cnt].type = TMPARAM_AT; |
| timeouts[cnt].fin_time = ltime; |
| cnt++; |
| } |
| |
| got_standby_deadlock_timeout = false; |
| timeouts[cnt].id = STANDBY_DEADLOCK_TIMEOUT; |
| timeouts[cnt].type = TMPARAM_AFTER; |
| timeouts[cnt].delay_ms = DeadlockTimeout; |
| cnt++; |
| |
| enable_timeouts(timeouts, cnt); |
| } |
| |
| /* |
| * Wait to be signaled by UnpinBuffer() or for the wait to be interrupted |
| * by one of the timeouts established above. |
| * |
| * We assume that only UnpinBuffer() and the timeout requests established |
| * above can wake us up here. WakeupRecovery() called by walreceiver or |
| * SIGHUP signal handler, etc cannot do that because it uses the different |
| * latch from that ProcWaitForSignal() waits on. |
| */ |
| ProcWaitForSignal(PG_WAIT_BUFFER_PIN); |
| |
| if (got_standby_delay_timeout) |
| SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN); |
| else if (got_standby_deadlock_timeout) |
| { |
| /* |
| * Send out a request for hot-standby backends to check themselves for |
| * deadlocks. |
| * |
| * XXX The subsequent ResolveRecoveryConflictWithBufferPin() will wait |
| * to be signaled by UnpinBuffer() again and send a request for |
| * deadlocks check if deadlock_timeout happens. This causes the |
| * request to continue to be sent every deadlock_timeout until the |
| * buffer is unpinned or ltime is reached. This would increase the |
| * workload in the startup process and backends. In practice it may |
| * not be so harmful because the period that the buffer is kept pinned |
| * is basically no so long. But we should fix this? |
| */ |
| SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK); |
| } |
| |
| /* |
| * Clear any timeout requests established above. We assume here that the |
| * Startup process doesn't have any other timeouts than what this function |
| * uses. If that stops being true, we could cancel the timeouts |
| * individually, but that'd be slower. |
| */ |
| disable_all_timeouts(false); |
| got_standby_delay_timeout = false; |
| got_standby_deadlock_timeout = false; |
| } |
| |
| static void |
| SendRecoveryConflictWithBufferPin(ProcSignalReason reason) |
| { |
| Assert(reason == PROCSIG_RECOVERY_CONFLICT_BUFFERPIN || |
| reason == PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK); |
| |
| /* |
| * We send signal to all backends to ask them if they are holding the |
| * buffer pin which is delaying the Startup process. We must not set the |
| * conflict flag yet, since most backends will be innocent. Let the |
| * SIGUSR1 handling in each backend decide their own fate. |
| */ |
| CancelDBBackends(InvalidOid, reason, false); |
| } |
| |
| /* |
| * In Hot Standby perform early deadlock detection. We abort the lock |
| * wait if we are about to sleep while holding the buffer pin that Startup |
| * process is waiting for. |
| * |
| * Note: this code is pessimistic, because there is no way for it to |
| * determine whether an actual deadlock condition is present: the lock we |
| * need to wait for might be unrelated to any held by the Startup process. |
| * Sooner or later, this mechanism should get ripped out in favor of somehow |
| * accounting for buffer locks in DeadLockCheck(). However, errors here |
| * seem to be very low-probability in practice, so for now it's not worth |
| * the trouble. |
| */ |
| void |
| CheckRecoveryConflictDeadlock(void) |
| { |
| Assert(!InRecovery); /* do not call in Startup process */ |
| |
| if (!HoldingBufferPinThatDelaysRecovery()) |
| return; |
| |
| /* |
| * Error message should match ProcessInterrupts() but we avoid calling |
| * that because we aren't handling an interrupt at this point. Note that |
| * we only cancel the current transaction here, so if we are in a |
| * subtransaction and the pin is held by a parent, then the Startup |
| * process will continue to wait even though we have avoided deadlock. |
| */ |
| ereport(ERROR, |
| (errcode(ERRCODE_T_R_DEADLOCK_DETECTED), |
| errmsg("canceling statement due to conflict with recovery"), |
| errdetail("User transaction caused buffer deadlock with recovery."))); |
| } |
| |
| |
| /* -------------------------------- |
| * timeout handler routines |
| * -------------------------------- |
| */ |
| |
| /* |
| * StandbyDeadLockHandler() will be called if STANDBY_DEADLOCK_TIMEOUT is |
| * exceeded. |
| */ |
| void |
| StandbyDeadLockHandler(void) |
| { |
| got_standby_deadlock_timeout = true; |
| } |
| |
| /* |
| * StandbyTimeoutHandler() will be called if STANDBY_TIMEOUT is exceeded. |
| */ |
| void |
| StandbyTimeoutHandler(void) |
| { |
| got_standby_delay_timeout = true; |
| } |
| |
| /* |
| * StandbyLockTimeoutHandler() will be called if STANDBY_LOCK_TIMEOUT is exceeded. |
| */ |
| void |
| StandbyLockTimeoutHandler(void) |
| { |
| got_standby_lock_timeout = true; |
| } |
| |
| /* |
| * ----------------------------------------------------- |
| * Locking in Recovery Mode |
| * ----------------------------------------------------- |
| * |
| * All locks are held by the Startup process using a single virtual |
| * transaction. This implementation is both simpler and in some senses, |
| * more correct. The locks held mean "some original transaction held |
| * this lock, so query access is not allowed at this time". So the Startup |
| * process is the proxy by which the original locks are implemented. |
| * |
| * We only keep track of AccessExclusiveLocks, which are only ever held by |
| * one transaction on one relation. |
| * |
| * We keep a table of known locks in the RecoveryLockHash hash table. |
| * The point of that table is to let us efficiently de-duplicate locks, |
| * which is important because checkpoints will re-report the same locks |
| * already held. There is also a RecoveryLockXidHash table with one entry |
| * per xid, which allows us to efficiently find all the locks held by a |
| * given original transaction. |
| * |
| * We use session locks rather than normal locks so we don't need |
| * ResourceOwners. |
| */ |
| |
| |
| void |
| StandbyAcquireAccessExclusiveLock(TransactionId xid, Oid dbOid, Oid relOid) |
| { |
| RecoveryLockXidEntry *xidentry; |
| RecoveryLockEntry *lockentry; |
| xl_standby_lock key; |
| LOCKTAG locktag; |
| bool found; |
| |
| /* Already processed? */ |
| if (!TransactionIdIsValid(xid) || |
| TransactionIdDidCommit(xid) || |
| TransactionIdDidAbort(xid)) |
| return; |
| |
| elog(trace_recovery(DEBUG4), |
| "adding recovery lock: db %u rel %u", dbOid, relOid); |
| |
| /* dbOid is InvalidOid when we are locking a shared relation. */ |
| Assert(OidIsValid(relOid)); |
| |
| /* Create a hash entry for this xid, if we don't have one already. */ |
| xidentry = hash_search(RecoveryLockXidHash, &xid, HASH_ENTER, &found); |
| if (!found) |
| { |
| Assert(xidentry->xid == xid); /* dynahash should have set this */ |
| xidentry->head = NULL; |
| } |
| |
| /* Create a hash entry for this lock, unless we have one already. */ |
| key.xid = xid; |
| key.dbOid = dbOid; |
| key.relOid = relOid; |
| lockentry = hash_search(RecoveryLockHash, &key, HASH_ENTER, &found); |
| if (!found) |
| { |
| /* It's new, so link it into the XID's list ... */ |
| lockentry->next = xidentry->head; |
| xidentry->head = lockentry; |
| |
| /* ... and acquire the lock locally. */ |
| SET_LOCKTAG_RELATION(locktag, dbOid, relOid); |
| |
| (void) LockAcquire(&locktag, AccessExclusiveLock, true, false); |
| } |
| } |
| |
| /* |
| * Release all the locks associated with this RecoveryLockXidEntry. |
| */ |
| static void |
| StandbyReleaseXidEntryLocks(RecoveryLockXidEntry *xidentry) |
| { |
| RecoveryLockEntry *entry; |
| RecoveryLockEntry *next; |
| |
| for (entry = xidentry->head; entry != NULL; entry = next) |
| { |
| LOCKTAG locktag; |
| |
| elog(trace_recovery(DEBUG4), |
| "releasing recovery lock: xid %u db %u rel %u", |
| entry->key.xid, entry->key.dbOid, entry->key.relOid); |
| /* Release the lock ... */ |
| SET_LOCKTAG_RELATION(locktag, entry->key.dbOid, entry->key.relOid); |
| if (!LockRelease(&locktag, AccessExclusiveLock, true)) |
| { |
| elog(LOG, |
| "RecoveryLockHash contains entry for lock no longer recorded by lock manager: xid %u database %u relation %u", |
| entry->key.xid, entry->key.dbOid, entry->key.relOid); |
| Assert(false); |
| } |
| /* ... and remove the per-lock hash entry */ |
| next = entry->next; |
| hash_search(RecoveryLockHash, entry, HASH_REMOVE, NULL); |
| } |
| |
| xidentry->head = NULL; /* just for paranoia */ |
| } |
| |
| /* |
| * Release locks for specific XID, or all locks if it's InvalidXid. |
| */ |
| static void |
| StandbyReleaseLocks(TransactionId xid) |
| { |
| RecoveryLockXidEntry *entry; |
| |
| if (TransactionIdIsValid(xid)) |
| { |
| if ((entry = hash_search(RecoveryLockXidHash, &xid, HASH_FIND, NULL))) |
| { |
| StandbyReleaseXidEntryLocks(entry); |
| hash_search(RecoveryLockXidHash, entry, HASH_REMOVE, NULL); |
| } |
| } |
| else |
| StandbyReleaseAllLocks(); |
| } |
| |
| /* |
| * Release locks for a transaction tree, starting at xid down, from |
| * RecoveryLockXidHash. |
| * |
| * Called during WAL replay of COMMIT/ROLLBACK when in hot standby mode, |
| * to remove any AccessExclusiveLocks requested by a transaction. |
| */ |
| void |
| StandbyReleaseLockTree(TransactionId xid, int nsubxids, TransactionId *subxids) |
| { |
| int i; |
| |
| StandbyReleaseLocks(xid); |
| |
| for (i = 0; i < nsubxids; i++) |
| StandbyReleaseLocks(subxids[i]); |
| } |
| |
| /* |
| * Called at end of recovery and when we see a shutdown checkpoint. |
| */ |
| void |
| StandbyReleaseAllLocks(void) |
| { |
| HASH_SEQ_STATUS status; |
| RecoveryLockXidEntry *entry; |
| |
| elog(trace_recovery(DEBUG2), "release all standby locks"); |
| |
| hash_seq_init(&status, RecoveryLockXidHash); |
| while ((entry = hash_seq_search(&status))) |
| { |
| StandbyReleaseXidEntryLocks(entry); |
| hash_search(RecoveryLockXidHash, entry, HASH_REMOVE, NULL); |
| } |
| } |
| |
| /* |
| * StandbyReleaseOldLocks |
| * Release standby locks held by top-level XIDs that aren't running, |
| * as long as they're not prepared transactions. |
| */ |
| void |
| StandbyReleaseOldLocks(TransactionId oldxid) |
| { |
| HASH_SEQ_STATUS status; |
| RecoveryLockXidEntry *entry; |
| |
| hash_seq_init(&status, RecoveryLockXidHash); |
| while ((entry = hash_seq_search(&status))) |
| { |
| Assert(TransactionIdIsValid(entry->xid)); |
| |
| /* Skip if prepared transaction. */ |
| if (StandbyTransactionIdIsPrepared(entry->xid)) |
| continue; |
| |
| /* Skip if >= oldxid. */ |
| if (!TransactionIdPrecedes(entry->xid, oldxid)) |
| continue; |
| |
| /* Remove all locks and hash table entry. */ |
| StandbyReleaseXidEntryLocks(entry); |
| hash_search(RecoveryLockXidHash, entry, HASH_REMOVE, NULL); |
| } |
| } |
| |
| /* |
| * -------------------------------------------------------------------- |
| * Recovery handling for Rmgr RM_STANDBY_ID |
| * |
| * These record types will only be created if XLogStandbyInfoActive() |
| * -------------------------------------------------------------------- |
| */ |
| |
| void |
| standby_redo(XLogReaderState *record) |
| { |
| uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; |
| |
| /* Backup blocks are not used in standby records */ |
| Assert(!XLogRecHasAnyBlockRefs(record)); |
| |
| /* Do nothing if we're not in hot standby mode */ |
| if (standbyState == STANDBY_DISABLED) |
| return; |
| |
| if (info == XLOG_STANDBY_LOCK) |
| { |
| xl_standby_locks *xlrec = (xl_standby_locks *) XLogRecGetData(record); |
| int i; |
| |
| for (i = 0; i < xlrec->nlocks; i++) |
| StandbyAcquireAccessExclusiveLock(xlrec->locks[i].xid, |
| xlrec->locks[i].dbOid, |
| xlrec->locks[i].relOid); |
| } |
| else if (info == XLOG_RUNNING_XACTS) |
| { |
| xl_running_xacts *xlrec = (xl_running_xacts *) XLogRecGetData(record); |
| RunningTransactionsData running; |
| |
| running.xcnt = xlrec->xcnt; |
| running.subxcnt = xlrec->subxcnt; |
| running.subxid_status = xlrec->subxid_overflow ? SUBXIDS_MISSING : SUBXIDS_IN_ARRAY; |
| running.nextXid = xlrec->nextXid; |
| running.latestCompletedXid = xlrec->latestCompletedXid; |
| running.oldestRunningXid = xlrec->oldestRunningXid; |
| running.xids = xlrec->xids; |
| |
| ProcArrayApplyRecoveryInfo(&running); |
| |
| /* |
| * The startup process currently has no convenient way to schedule |
| * stats to be reported. XLOG_RUNNING_XACTS records issued at a |
| * regular cadence, making this a convenient location to report stats. |
| * While these records aren't generated with wal_level=minimal, stats |
| * also cannot be accessed during WAL replay. |
| */ |
| pgstat_report_stat(true); |
| } |
| else if (info == XLOG_INVALIDATIONS) |
| { |
| xl_invalidations *xlrec = (xl_invalidations *) XLogRecGetData(record); |
| |
| ProcessCommittedInvalidationMessages(xlrec->msgs, |
| xlrec->nmsgs, |
| xlrec->relcacheInitFileInval, |
| xlrec->dbId, |
| xlrec->tsId); |
| } |
| else |
| elog(PANIC, "standby_redo: unknown op code %u", info); |
| } |
| |
| /* |
| * Log details of the current snapshot to WAL. This allows the snapshot state |
| * to be reconstructed on the standby and for logical decoding. |
| * |
| * This is used for Hot Standby as follows: |
| * |
| * We can move directly to STANDBY_SNAPSHOT_READY at startup if we |
| * start from a shutdown checkpoint because we know nothing was running |
| * at that time and our recovery snapshot is known empty. In the more |
| * typical case of an online checkpoint we need to jump through a few |
| * hoops to get a correct recovery snapshot and this requires a two or |
| * sometimes a three stage process. |
| * |
| * The initial snapshot must contain all running xids and all current |
| * AccessExclusiveLocks at a point in time on the standby. Assembling |
| * that information while the server is running requires many and |
| * various LWLocks, so we choose to derive that information piece by |
| * piece and then re-assemble that info on the standby. When that |
| * information is fully assembled we move to STANDBY_SNAPSHOT_READY. |
| * |
| * Since locking on the primary when we derive the information is not |
| * strict, we note that there is a time window between the derivation and |
| * writing to WAL of the derived information. That allows race conditions |
| * that we must resolve, since xids and locks may enter or leave the |
| * snapshot during that window. This creates the issue that an xid or |
| * lock may start *after* the snapshot has been derived yet *before* the |
| * snapshot is logged in the running xacts WAL record. We resolve this by |
| * starting to accumulate changes at a point just prior to when we derive |
| * the snapshot on the primary, then ignore duplicates when we later apply |
| * the snapshot from the running xacts record. This is implemented during |
| * CreateCheckPoint() where we use the logical checkpoint location as |
| * our starting point and then write the running xacts record immediately |
| * before writing the main checkpoint WAL record. Since we always start |
| * up from a checkpoint and are immediately at our starting point, we |
| * unconditionally move to STANDBY_INITIALIZED. After this point we |
| * must do 4 things: |
| * * move shared nextXid forwards as we see new xids |
| * * extend the clog and subtrans with each new xid |
| * * keep track of uncommitted known assigned xids |
| * * keep track of uncommitted AccessExclusiveLocks |
| * |
| * When we see a commit/abort we must remove known assigned xids and locks |
| * from the completing transaction. Attempted removals that cannot locate |
| * an entry are expected and must not cause an error when we are in state |
| * STANDBY_INITIALIZED. This is implemented in StandbyReleaseLocks() and |
| * KnownAssignedXidsRemove(). |
| * |
| * Later, when we apply the running xact data we must be careful to ignore |
| * transactions already committed, since those commits raced ahead when |
| * making WAL entries. |
| * |
| * The loose timing also means that locks may be recorded that have a |
| * zero xid, since xids are removed from procs before locks are removed. |
| * So we must prune the lock list down to ensure we hold locks only for |
| * currently running xids, performed by StandbyReleaseOldLocks(). |
| * Zero xids should no longer be possible, but we may be replaying WAL |
| * from a time when they were possible. |
| * |
| * For logical decoding only the running xacts information is needed; |
| * there's no need to look at the locking information, but it's logged anyway, |
| * as there's no independent knob to just enable logical decoding. For |
| * details of how this is used, check snapbuild.c's introductory comment. |
| * |
| * |
| * Returns the RecPtr of the last inserted record. |
| */ |
| XLogRecPtr |
| LogStandbySnapshot(void) |
| { |
| XLogRecPtr recptr; |
| RunningTransactions running; |
| xl_standby_lock *locks; |
| int nlocks; |
| |
| Assert(XLogStandbyInfoActive()); |
| |
| /* |
| * Get details of any AccessExclusiveLocks being held at the moment. |
| */ |
| locks = GetRunningTransactionLocks(&nlocks); |
| if (nlocks > 0) |
| LogAccessExclusiveLocks(nlocks, locks); |
| pfree(locks); |
| |
| /* |
| * Log details of all in-progress transactions. This should be the last |
| * record we write, because standby will open up when it sees this. |
| */ |
| running = GetRunningTransactionData(); |
| |
| /* |
| * GetRunningTransactionData() acquired ProcArrayLock, we must release it. |
| * For Hot Standby this can be done before inserting the WAL record |
| * because ProcArrayApplyRecoveryInfo() rechecks the commit status using |
| * the clog. For logical decoding, though, the lock can't be released |
| * early because the clog might be "in the future" from the POV of the |
| * historic snapshot. This would allow for situations where we're waiting |
| * for the end of a transaction listed in the xl_running_xacts record |
| * which, according to the WAL, has committed before the xl_running_xacts |
| * record. Fortunately this routine isn't executed frequently, and it's |
| * only a shared lock. |
| */ |
| if (wal_level < WAL_LEVEL_LOGICAL) |
| LWLockRelease(ProcArrayLock); |
| |
| recptr = LogCurrentRunningXacts(running); |
| |
| /* Release lock if we kept it longer ... */ |
| if (wal_level >= WAL_LEVEL_LOGICAL) |
| LWLockRelease(ProcArrayLock); |
| |
| /* GetRunningTransactionData() acquired XidGenLock, we must release it */ |
| LWLockRelease(XidGenLock); |
| |
| return recptr; |
| } |
| |
| /* |
| * Record an enhanced snapshot of running transactions into WAL. |
| * |
| * The definitions of RunningTransactionsData and xl_running_xacts are |
| * similar. We keep them separate because xl_running_xacts is a contiguous |
| * chunk of memory and never exists fully until it is assembled in WAL. |
| * The inserted records are marked as not being important for durability, |
| * to avoid triggering superfluous checkpoint / archiving activity. |
| */ |
| static XLogRecPtr |
| LogCurrentRunningXacts(RunningTransactions CurrRunningXacts) |
| { |
| xl_running_xacts xlrec; |
| XLogRecPtr recptr; |
| |
| xlrec.xcnt = CurrRunningXacts->xcnt; |
| xlrec.subxcnt = CurrRunningXacts->subxcnt; |
| xlrec.subxid_overflow = (CurrRunningXacts->subxid_status != SUBXIDS_IN_ARRAY); |
| xlrec.nextXid = CurrRunningXacts->nextXid; |
| xlrec.oldestRunningXid = CurrRunningXacts->oldestRunningXid; |
| xlrec.latestCompletedXid = CurrRunningXacts->latestCompletedXid; |
| |
| /* Header */ |
| XLogBeginInsert(); |
| XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT); |
| XLogRegisterData((char *) (&xlrec), MinSizeOfXactRunningXacts); |
| |
| /* array of TransactionIds */ |
| if (xlrec.xcnt > 0) |
| XLogRegisterData((char *) CurrRunningXacts->xids, |
| (xlrec.xcnt + xlrec.subxcnt) * sizeof(TransactionId)); |
| |
| recptr = XLogInsert(RM_STANDBY_ID, XLOG_RUNNING_XACTS); |
| |
| if (xlrec.subxid_overflow) |
| elog(trace_recovery(DEBUG2), |
| "snapshot of %d running transactions overflowed (lsn %X/%X oldest xid %u latest complete %u next xid %u)", |
| CurrRunningXacts->xcnt, |
| LSN_FORMAT_ARGS(recptr), |
| CurrRunningXacts->oldestRunningXid, |
| CurrRunningXacts->latestCompletedXid, |
| CurrRunningXacts->nextXid); |
| else |
| elog(trace_recovery(DEBUG2), |
| "snapshot of %d+%d running transaction ids (lsn %X/%X oldest xid %u latest complete %u next xid %u)", |
| CurrRunningXacts->xcnt, CurrRunningXacts->subxcnt, |
| LSN_FORMAT_ARGS(recptr), |
| CurrRunningXacts->oldestRunningXid, |
| CurrRunningXacts->latestCompletedXid, |
| CurrRunningXacts->nextXid); |
| |
| /* |
| * Ensure running_xacts information is synced to disk not too far in the |
| * future. We don't want to stall anything though (i.e. use XLogFlush()), |
| * so we let the wal writer do it during normal operation. |
| * XLogSetAsyncXactLSN() conveniently will mark the LSN as to-be-synced |
| * and nudge the WALWriter into action if sleeping. Check |
| * XLogBackgroundFlush() for details why a record might not be flushed |
| * without it. |
| */ |
| XLogSetAsyncXactLSN(recptr); |
| |
| return recptr; |
| } |
| |
| /* |
| * Wholesale logging of AccessExclusiveLocks. Other lock types need not be |
| * logged, as described in backend/storage/lmgr/README. |
| */ |
| static void |
| LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks) |
| { |
| xl_standby_locks xlrec; |
| |
| xlrec.nlocks = nlocks; |
| |
| XLogBeginInsert(); |
| XLogRegisterData((char *) &xlrec, offsetof(xl_standby_locks, locks)); |
| XLogRegisterData((char *) locks, nlocks * sizeof(xl_standby_lock)); |
| XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT); |
| |
| (void) XLogInsert(RM_STANDBY_ID, XLOG_STANDBY_LOCK); |
| } |
| |
| /* |
| * Individual logging of AccessExclusiveLocks for use during LockAcquire() |
| */ |
| void |
| LogAccessExclusiveLock(Oid dbOid, Oid relOid) |
| { |
| xl_standby_lock xlrec; |
| |
| xlrec.xid = GetCurrentTransactionId(); |
| |
| xlrec.dbOid = dbOid; |
| xlrec.relOid = relOid; |
| |
| LogAccessExclusiveLocks(1, &xlrec); |
| MyXactFlags |= XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK; |
| } |
| |
/*
 * LogAccessExclusiveLockPrepare
 *		Get ready to log an AccessExclusiveLock, for use during LockAcquire().
 */
void
LogAccessExclusiveLockPrepare(void)
{
	/*
	 * Force a TransactionId to be assigned to this transaction.  There are
	 * two reasons, both about lock release on the standby:
	 *
	 * 1. Without an xid, RecordTransactionCommit() and
	 * RecordTransactionAbort() would optimise away the transaction
	 * completion record, which recovery relies upon to release locks.  It's
	 * a hack, but this is a corner case not worth adding code for in the
	 * main commit path.
	 *
	 * 2. The xid has to exist before the lock is recorded in shared memory;
	 * otherwise a concurrently executing GetRunningTransactionLocks() might
	 * see a lock associated with an InvalidTransactionId, which we later
	 * assert cannot happen.
	 */
	(void) GetCurrentTransactionId();
}
| |
| /* |
| * Emit WAL for invalidations. This currently is only used for commits without |
| * an xid but which contain invalidations. |
| */ |
| void |
| LogStandbyInvalidations(int nmsgs, SharedInvalidationMessage *msgs, |
| bool relcacheInitFileInval) |
| { |
| xl_invalidations xlrec; |
| |
| /* prepare record */ |
| memset(&xlrec, 0, sizeof(xlrec)); |
| xlrec.dbId = MyDatabaseId; |
| xlrec.tsId = MyDatabaseTableSpace; |
| xlrec.relcacheInitFileInval = relcacheInitFileInval; |
| xlrec.nmsgs = nmsgs; |
| |
| /* perform insertion */ |
| XLogBeginInsert(); |
| XLogRegisterData((char *) (&xlrec), MinSizeOfInvalidations); |
| XLogRegisterData((char *) msgs, |
| nmsgs * sizeof(SharedInvalidationMessage)); |
| XLogInsert(RM_STANDBY_ID, XLOG_INVALIDATIONS); |
| } |
| |
| /* Return the description of recovery conflict */ |
| static const char * |
| get_recovery_conflict_desc(ProcSignalReason reason) |
| { |
| const char *reasonDesc = _("unknown reason"); |
| |
| switch (reason) |
| { |
| case PROCSIG_RECOVERY_CONFLICT_BUFFERPIN: |
| reasonDesc = _("recovery conflict on buffer pin"); |
| break; |
| case PROCSIG_RECOVERY_CONFLICT_LOCK: |
| reasonDesc = _("recovery conflict on lock"); |
| break; |
| case PROCSIG_RECOVERY_CONFLICT_TABLESPACE: |
| reasonDesc = _("recovery conflict on tablespace"); |
| break; |
| case PROCSIG_RECOVERY_CONFLICT_SNAPSHOT: |
| reasonDesc = _("recovery conflict on snapshot"); |
| break; |
| case PROCSIG_RECOVERY_CONFLICT_LOGICALSLOT: |
| reasonDesc = _("recovery conflict on replication slot"); |
| break; |
| case PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK: |
| reasonDesc = _("recovery conflict on buffer deadlock"); |
| break; |
| case PROCSIG_RECOVERY_CONFLICT_DATABASE: |
| reasonDesc = _("recovery conflict on database"); |
| break; |
| default: |
| break; |
| } |
| |
| return reasonDesc; |
| } |