| /*------------------------------------------------------------------------- |
| * |
| * multixact.c |
| * PostgreSQL multi-transaction-log manager |
| * |
| * The pg_multixact manager is a pg_clog-like manager that stores an array |
| * of TransactionIds for each MultiXactId. It is a fundamental part of the |
| * shared-row-lock implementation. A share-locked tuple stores a |
| * MultiXactId in its Xmax, and a transaction that needs to wait for the |
| * tuple to be unlocked can sleep on the potentially-several TransactionIds |
| * that compose the MultiXactId. |
| * |
| * We use two SLRU areas, one for storing the offsets at which the data |
| * starts for each MultiXactId in the other one. This trick allows us to |
| * store variable length arrays of TransactionIds. (We could alternatively |
| * use one area containing counts and TransactionIds, with valid MultiXactId |
| * values pointing at slots containing counts; but that way seems less robust |
| * since it would get completely confused if someone inquired about a bogus |
| * MultiXactId that pointed to an intermediate slot containing an XID.) |
| * |
| * XLOG interactions: this module generates an XLOG record whenever a new |
| * OFFSETs or MEMBERs page is initialized to zeroes, as well as an XLOG record |
| * whenever a new MultiXactId is defined. This allows us to completely |
| * rebuild the data entered since the last checkpoint during XLOG replay. |
| * Because this is possible, we need not follow the normal rule of |
| * "write WAL before data"; the only correctness guarantee needed is that |
| * we flush and sync all dirty OFFSETs and MEMBERs pages to disk before a |
| * checkpoint is considered complete. If a page does make it to disk ahead |
| * of corresponding WAL records, it will be forcibly zeroed before use anyway. |
| * Therefore, we don't need to mark our pages with LSN information; we have |
| * enough synchronization already. |
| * |
| * Like clog.c, and unlike subtrans.c, we have to preserve state across |
| * crashes and ensure that MXID and offset numbering increases monotonically |
| * across a crash. We do this in the same way as it's done for transaction |
| * IDs: the WAL record is guaranteed to contain evidence of every MXID we |
| * could need to worry about, and we just make sure that at the end of |
| * replay, the next-MXID and next-offset counters are at least as large as |
| * anything we saw during replay. |
| * |
| * |
| * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group |
| * Portions Copyright (c) 1994, Regents of the University of California |
| * |
| * $PostgreSQL: pgsql/src/backend/access/transam/multixact.c,v 1.22 2006/11/17 18:00:15 tgl Exp $ |
| * |
| *------------------------------------------------------------------------- |
| */ |
| #include "postgres.h" |
| |
| #include "access/multixact.h" |
| #include "access/slru.h" |
| #include "access/transam.h" |
| #include "access/xact.h" |
| #include "miscadmin.h" |
| #include "storage/backendid.h" |
| #include "storage/lmgr.h" |
| #include "utils/memutils.h" |
| #include "storage/procarray.h" |
| #include "cdb/cdbpersistentstore.h" |
| |
| |
/*
 * Defines for MultiXactOffset page sizes.  A page is the same BLCKSZ as is
 * used everywhere else in Postgres.
 *
 * Note: because both MultiXactOffsets and TransactionIds are 32 bits and
 * wrap around at 0xFFFFFFFF, MultiXact page numbering also wraps around at
 * 0xFFFFFFFF/MULTIXACT_*_PER_PAGE, and segment numbering at
 * 0xFFFFFFFF/MULTIXACT_*_PER_PAGE/SLRU_PAGES_PER_SEGMENT.  We need take no
 * explicit notice of that fact in this module, except when comparing segment
 * and page numbers in TruncateMultiXact
 * (see MultiXact{Offset,Member}PagePrecedes).
 */
| |
/*
 * Entries per SLRU page.  The offsets area stores one MultiXactOffset per
 * MultiXactId and the members area stores one TransactionId per member;
 * each kind of entry takes four bytes.
 */
#define MULTIXACT_OFFSETS_PER_PAGE	(BLCKSZ / sizeof(MultiXactOffset))
#define MULTIXACT_MEMBERS_PER_PAGE	(BLCKSZ / sizeof(TransactionId))

/* Page number / entry-within-page holding the start offset of a MultiXactId */
#define MultiXactIdToOffsetPage(multi) \
	((multi) / (MultiXactOffset) MULTIXACT_OFFSETS_PER_PAGE)
#define MultiXactIdToOffsetEntry(multi) \
	((multi) % (MultiXactOffset) MULTIXACT_OFFSETS_PER_PAGE)

/* Page number / entry-within-page of the member stored at a given offset */
#define MXOffsetToMemberPage(moff) \
	((moff) / (TransactionId) MULTIXACT_MEMBERS_PER_PAGE)
#define MXOffsetToMemberEntry(moff) \
	((moff) % (TransactionId) MULTIXACT_MEMBERS_PER_PAGE)
| |
| |
| /* |
| * Links to shared-memory data structures for MultiXact control |
| */ |
| static SlruCtlData MultiXactOffsetCtlData; |
| static SlruCtlData MultiXactMemberCtlData; |
| |
| #define MultiXactOffsetCtl (&MultiXactOffsetCtlData) |
| #define MultiXactMemberCtl (&MultiXactMemberCtlData) |
| |
| /* |
| * MultiXact state shared across all backends. All this state is protected |
| * by MultiXactGenLock. (We also use MultiXactOffsetControlLock and |
| * MultiXactMemberControlLock to guard accesses to the two sets of SLRU |
| * buffers. For concurrency's sake, we avoid holding more than one of these |
| * locks at a time.) |
| */ |
| typedef struct MultiXactStateData |
| { |
| /* next-to-be-assigned MultiXactId */ |
| MultiXactId nextMXact; |
| |
| /* next-to-be-assigned offset */ |
| MultiXactOffset nextOffset; |
| |
| /* the Offset SLRU area was last truncated at this MultiXactId */ |
| MultiXactId lastTruncationPoint; |
| |
| /* |
| * Per-backend data starts here. We have two arrays stored in the area |
| * immediately following the MultiXactStateData struct. Each is indexed by |
| * BackendId. (Note: valid BackendIds run from 1 to MaxBackends; element |
| * zero of each array is never used.) |
| * |
| * OldestMemberMXactId[k] is the oldest MultiXactId each backend's current |
| * transaction(s) could possibly be a member of, or InvalidMultiXactId |
| * when the backend has no live transaction that could possibly be a |
| * member of a MultiXact. Each backend sets its entry to the current |
| * nextMXact counter just before first acquiring a shared lock in a given |
| * transaction, and clears it at transaction end. (This works because only |
| * during or after acquiring a shared lock could an XID possibly become a |
| * member of a MultiXact, and that MultiXact would have to be created |
| * during or after the lock acquisition.) |
| * |
| * OldestVisibleMXactId[k] is the oldest MultiXactId each backend's |
| * current transaction(s) think is potentially live, or InvalidMultiXactId |
| * when not in a transaction or not in a transaction that's paid any |
| * attention to MultiXacts yet. This is computed when first needed in a |
| * given transaction, and cleared at transaction end. We can compute it |
| * as the minimum of the valid OldestMemberMXactId[] entries at the time |
| * we compute it (using nextMXact if none are valid). Each backend is |
| * required not to attempt to access any SLRU data for MultiXactIds older |
| * than its own OldestVisibleMXactId[] setting; this is necessary because |
| * the checkpointer could truncate away such data at any instant. |
| * |
| * The checkpointer can compute the safe truncation point as the oldest |
| * valid value among all the OldestMemberMXactId[] and |
| * OldestVisibleMXactId[] entries, or nextMXact if none are valid. |
| * Clearly, it is not possible for any later-computed OldestVisibleMXactId |
| * value to be older than this, and so there is no risk of truncating data |
| * that is still needed. |
| */ |
| MultiXactId perBackendXactIds[1]; /* VARIABLE LENGTH ARRAY */ |
| } MultiXactStateData; |
| |
| /* Pointers to the state data in shared memory */ |
| static MultiXactStateData *MultiXactState; |
| static MultiXactId *OldestMemberMXactId; |
| static MultiXactId *OldestVisibleMXactId; |
| |
| |
| /* |
| * Definitions for the backend-local MultiXactId cache. |
| * |
| * We use this cache to store known MultiXacts, so we don't need to go to |
| * SLRU areas everytime. |
| * |
| * The cache lasts for the duration of a single transaction, the rationale |
| * for this being that most entries will contain our own TransactionId and |
| * so they will be uninteresting by the time our next transaction starts. |
| * (XXX not clear that this is correct --- other members of the MultiXact |
| * could hang around longer than we did. However, it's not clear what a |
| * better policy for flushing old cache entries would be.) |
| * |
| * We allocate the cache entries in a memory context that is deleted at |
| * transaction end, so we don't need to do retail freeing of entries. |
| */ |
| typedef struct mXactCacheEnt |
| { |
| struct mXactCacheEnt *next; |
| MultiXactId multi; |
| int nxids; |
| TransactionId xids[1]; /* VARIABLE LENGTH ARRAY */ |
| } mXactCacheEnt; |
| |
| static mXactCacheEnt *MXactCache = NULL; |
| static MemoryContext MXactContext = NULL; |
| |
| |
/*
 * Debug tracing.  The digit in each macro name is the total number of
 * arguments handed to elog().  Unless MULTIXACT_DEBUG is defined the macros
 * expand to nothing, so their arguments are never evaluated.
 */
#ifndef MULTIXACT_DEBUG
#define debug_elog2(a,b)
#define debug_elog3(a,b,c)
#define debug_elog4(a,b,c,d)
#define debug_elog5(a,b,c,d,e)
#else
#define debug_elog2(a,b) elog(a,b)
#define debug_elog3(a,b,c) elog(a,b,c)
#define debug_elog4(a,b,c,d) elog(a,b,c,d)
#define debug_elog5(a,b,c,d,e) elog(a,b,c,d,e)
#endif
| |
| /* internal MultiXactId management */ |
| static void MultiXactIdSetOldestVisible(void); |
| static MultiXactId CreateMultiXactId(int nxids, TransactionId *xids); |
| static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, |
| int nxids, TransactionId *xids); |
| static MultiXactId GetNewMultiXactId(int nxids, MultiXactOffset *offset); |
| |
| /* MultiXact cache management */ |
| static MultiXactId mXactCacheGetBySet(int nxids, TransactionId *xids); |
| static int mXactCacheGetById(MultiXactId multi, TransactionId **xids); |
| static void mXactCachePut(MultiXactId multi, int nxids, TransactionId *xids); |
| static int xidComparator(const void *arg1, const void *arg2); |
| |
| #ifdef MULTIXACT_DEBUG |
| static char *mxid_to_string(MultiXactId multi, int nxids, TransactionId *xids); |
| #endif |
| |
| /* management of SLRU infrastructure */ |
| static int ZeroMultiXactOffsetPage(int pageno, bool writeXlog); |
| static int ZeroMultiXactMemberPage(int pageno, bool writeXlog); |
| static bool MultiXactOffsetPagePrecedes(int page1, int page2); |
| static bool MultiXactMemberPagePrecedes(int page1, int page2); |
| static bool MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2); |
| static bool MultiXactOffsetPrecedes(MultiXactOffset offset1, |
| MultiXactOffset offset2); |
| static void ExtendMultiXactOffset(MultiXactId multi); |
| static void ExtendMultiXactMember(MultiXactOffset offset, int nmembers); |
| static void TruncateMultiXact(void); |
| static void WriteMZeroPageXlogRec(int pageno, uint8 info); |
| |
| |
| /* |
| * MultiXactIdCreate |
| * Construct a MultiXactId representing two TransactionIds. |
| * |
| * The two XIDs must be different. |
| * |
| * NB - we don't worry about our local MultiXactId cache here, because that |
| * is handled by the lower-level routines. |
| */ |
| MultiXactId |
| MultiXactIdCreate(TransactionId xid1, TransactionId xid2) |
| { |
| MultiXactId newMulti; |
| TransactionId xids[2]; |
| |
| AssertArg(TransactionIdIsValid(xid1)); |
| AssertArg(TransactionIdIsValid(xid2)); |
| |
| Assert(!TransactionIdEquals(xid1, xid2)); |
| |
| /* |
| * Note: unlike MultiXactIdExpand, we don't bother to check that both XIDs |
| * are still running. In typical usage, xid2 will be our own XID and the |
| * caller just did a check on xid1, so it'd be wasted effort. |
| */ |
| |
| xids[0] = xid1; |
| xids[1] = xid2; |
| |
| newMulti = CreateMultiXactId(2, xids); |
| |
| debug_elog5(DEBUG2, "Create: returning %u for %u, %u", |
| newMulti, xid1, xid2); |
| |
| return newMulti; |
| } |
| |
| /* |
| * MultiXactIdExpand |
| * Add a TransactionId to a pre-existing MultiXactId. |
| * |
| * If the TransactionId is already a member of the passed MultiXactId, |
| * just return it as-is. |
| * |
| * Note that we do NOT actually modify the membership of a pre-existing |
| * MultiXactId; instead we create a new one. This is necessary to avoid |
| * a race condition against MultiXactIdWait (see notes there). |
| * |
| * NB - we don't worry about our local MultiXactId cache here, because that |
| * is handled by the lower-level routines. |
| */ |
| MultiXactId |
| MultiXactIdExpand(MultiXactId multi, TransactionId xid) |
| { |
| MultiXactId newMulti; |
| TransactionId *members; |
| TransactionId *newMembers; |
| int nmembers; |
| int i; |
| int j; |
| |
| AssertArg(MultiXactIdIsValid(multi)); |
| AssertArg(TransactionIdIsValid(xid)); |
| |
| debug_elog4(DEBUG2, "Expand: received multi %u, xid %u", |
| multi, xid); |
| |
| nmembers = GetMultiXactIdMembers(multi, &members); |
| |
| if (nmembers < 0) |
| { |
| /* |
| * The MultiXactId is obsolete. This can only happen if all the |
| * MultiXactId members stop running between the caller checking and |
| * passing it to us. It would be better to return that fact to the |
| * caller, but it would complicate the API and it's unlikely to happen |
| * too often, so just deal with it by creating a singleton MultiXact. |
| */ |
| newMulti = CreateMultiXactId(1, &xid); |
| |
| debug_elog4(DEBUG2, "Expand: %u has no members, create singleton %u", |
| multi, newMulti); |
| return newMulti; |
| } |
| |
| /* |
| * If the TransactionId is already a member of the MultiXactId, just |
| * return the existing MultiXactId. |
| */ |
| for (i = 0; i < nmembers; i++) |
| { |
| if (TransactionIdEquals(members[i], xid)) |
| { |
| debug_elog4(DEBUG2, "Expand: %u is already a member of %u", |
| xid, multi); |
| pfree(members); |
| return multi; |
| } |
| } |
| |
| /* |
| * Determine which of the members of the MultiXactId are still running, |
| * and use them to create a new one. (Removing dead members is just an |
| * optimization, but a useful one. Note we have the same race condition |
| * here as above: j could be 0 at the end of the loop.) |
| */ |
| newMembers = (TransactionId *) |
| palloc(sizeof(TransactionId) * (nmembers + 1)); |
| |
| for (i = 0, j = 0; i < nmembers; i++) |
| { |
| if (TransactionIdIsInProgress(members[i])) |
| newMembers[j++] = members[i]; |
| } |
| |
| newMembers[j++] = xid; |
| newMulti = CreateMultiXactId(j, newMembers); |
| |
| pfree(members); |
| pfree(newMembers); |
| |
| debug_elog3(DEBUG2, "Expand: returning new multi %u", newMulti); |
| |
| return newMulti; |
| } |
| |
| /* |
| * MultiXactIdIsRunning |
| * Returns whether a MultiXactId is "running". |
| * |
| * We return true if at least one member of the given MultiXactId is still |
| * running. Note that a "false" result is certain not to change, |
| * because it is not legal to add members to an existing MultiXactId. |
| */ |
| bool |
| MultiXactIdIsRunning(MultiXactId multi) |
| { |
| TransactionId *members; |
| int nmembers; |
| int i; |
| |
| debug_elog3(DEBUG2, "IsRunning %u?", multi); |
| |
| nmembers = GetMultiXactIdMembers(multi, &members); |
| |
| if (nmembers < 0) |
| { |
| debug_elog2(DEBUG2, "IsRunning: no members"); |
| return false; |
| } |
| |
| /* |
| * Checking for myself is cheap compared to looking in shared memory, |
| * so first do the equivalent of MultiXactIdIsCurrent(). This is not |
| * needed for correctness, it's just a fast path. |
| */ |
| for (i = 0; i < nmembers; i++) |
| { |
| if (TransactionIdIsCurrentTransactionId(members[i])) |
| { |
| debug_elog3(DEBUG2, "IsRunning: I (%d) am running!", i); |
| pfree(members); |
| return true; |
| } |
| } |
| |
| /* |
| * This could be made faster by having another entry point in procarray.c, |
| * walking the PGPROC array only once for all the members. But in most |
| * cases nmembers should be small enough that it doesn't much matter. |
| */ |
| for (i = 0; i < nmembers; i++) |
| { |
| if (TransactionIdIsInProgress(members[i])) |
| { |
| debug_elog4(DEBUG2, "IsRunning: member %d (%u) is running", |
| i, members[i]); |
| pfree(members); |
| return true; |
| } |
| } |
| |
| pfree(members); |
| |
| debug_elog3(DEBUG2, "IsRunning: %u is not running", multi); |
| |
| return false; |
| } |
| |
| /* |
| * MultiXactIdIsCurrent |
| * Returns true if the current transaction is a member of the MultiXactId. |
| * |
| * We return true if any live subtransaction of the current top-level |
| * transaction is a member. This is appropriate for the same reason that a |
| * lock held by any such subtransaction is globally equivalent to a lock |
| * held by the current subtransaction: no such lock could be released without |
| * aborting this subtransaction, and hence releasing its locks. So it's not |
| * necessary to add the current subxact to the MultiXact separately. |
| */ |
| bool |
| MultiXactIdIsCurrent(MultiXactId multi) |
| { |
| bool result = false; |
| TransactionId *members; |
| int nmembers; |
| int i; |
| |
| nmembers = GetMultiXactIdMembers(multi, &members); |
| |
| if (nmembers < 0) |
| return false; |
| |
| for (i = 0; i < nmembers; i++) |
| { |
| if (TransactionIdIsCurrentTransactionId(members[i])) |
| { |
| result = true; |
| break; |
| } |
| } |
| |
| pfree(members); |
| |
| return result; |
| } |
| |
| /* |
| * MultiXactIdSetOldestMember |
| * Save the oldest MultiXactId this transaction could be a member of. |
| * |
| * We set the OldestMemberMXactId for a given transaction the first time |
| * it's going to acquire a shared lock. We need to do this even if we end |
| * up using a TransactionId instead of a MultiXactId, because there is a |
| * chance that another transaction would add our XID to a MultiXactId. |
| * |
| * The value to set is the next-to-be-assigned MultiXactId, so this is meant |
| * to be called just before acquiring a shared lock. |
| */ |
| void |
| MultiXactIdSetOldestMember(void) |
| { |
| if (!MultiXactIdIsValid(OldestMemberMXactId[MyBackendId])) |
| { |
| MultiXactId nextMXact; |
| |
| /* |
| * You might think we don't need to acquire a lock here, since |
| * fetching and storing of TransactionIds is probably atomic, but in |
| * fact we do: suppose we pick up nextMXact and then lose the CPU for |
| * a long time. Someone else could advance nextMXact, and then |
| * another someone else could compute an OldestVisibleMXactId that |
| * would be after the value we are going to store when we get control |
| * back. Which would be wrong. |
| */ |
| LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); |
| |
| /* |
| * We have to beware of the possibility that nextMXact is in the |
| * wrapped-around state. We don't fix the counter itself here, but we |
| * must be sure to store a valid value in our array entry. |
| */ |
| nextMXact = MultiXactState->nextMXact; |
| if (nextMXact < FirstMultiXactId) |
| nextMXact = FirstMultiXactId; |
| |
| OldestMemberMXactId[MyBackendId] = nextMXact; |
| |
| LWLockRelease(MultiXactGenLock); |
| |
| debug_elog4(DEBUG2, "MultiXact: setting OldestMember[%d] = %u", |
| MyBackendId, nextMXact); |
| } |
| } |
| |
| /* |
| * MultiXactIdSetOldestVisible |
| * Save the oldest MultiXactId this transaction considers possibly live. |
| * |
| * We set the OldestVisibleMXactId for a given transaction the first time |
| * it's going to inspect any MultiXactId. Once we have set this, we are |
| * guaranteed that the checkpointer won't truncate off SLRU data for |
| * MultiXactIds at or after our OldestVisibleMXactId. |
| * |
| * The value to set is the oldest of nextMXact and all the valid per-backend |
| * OldestMemberMXactId[] entries. Because of the locking we do, we can be |
| * certain that no subsequent call to MultiXactIdSetOldestMember can set |
| * an OldestMemberMXactId[] entry older than what we compute here. Therefore |
| * there is no live transaction, now or later, that can be a member of any |
| * MultiXactId older than the OldestVisibleMXactId we compute here. |
| */ |
| static void |
| MultiXactIdSetOldestVisible(void) |
| { |
| if (!MultiXactIdIsValid(OldestVisibleMXactId[MyBackendId])) |
| { |
| MultiXactId oldestMXact; |
| int i; |
| |
| LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); |
| |
| /* |
| * We have to beware of the possibility that nextMXact is in the |
| * wrapped-around state. We don't fix the counter itself here, but we |
| * must be sure to store a valid value in our array entry. |
| */ |
| oldestMXact = MultiXactState->nextMXact; |
| if (oldestMXact < FirstMultiXactId) |
| oldestMXact = FirstMultiXactId; |
| |
| for (i = 1; i <= MaxBackends; i++) |
| { |
| MultiXactId thisoldest = OldestMemberMXactId[i]; |
| |
| if (MultiXactIdIsValid(thisoldest) && |
| MultiXactIdPrecedes(thisoldest, oldestMXact)) |
| oldestMXact = thisoldest; |
| } |
| |
| OldestVisibleMXactId[MyBackendId] = oldestMXact; |
| |
| LWLockRelease(MultiXactGenLock); |
| |
| debug_elog4(DEBUG2, "MultiXact: setting OldestVisible[%d] = %u", |
| MyBackendId, oldestMXact); |
| } |
| } |
| |
| /* |
| * MultiXactIdWait |
| * Sleep on a MultiXactId. |
| * |
| * We do this by sleeping on each member using XactLockTableWait. Any |
| * members that belong to the current backend are *not* waited for, however; |
| * this would not merely be useless but would lead to Assert failure inside |
| * XactLockTableWait. By the time this returns, it is certain that all |
| * transactions *of other backends* that were members of the MultiXactId |
| * are dead (and no new ones can have been added, since it is not legal |
| * to add members to an existing MultiXactId). |
| * |
| * But by the time we finish sleeping, someone else may have changed the Xmax |
| * of the containing tuple, so the caller needs to iterate on us somehow. |
| */ |
| void |
| MultiXactIdWait(MultiXactId multi) |
| { |
| TransactionId *members; |
| int nmembers; |
| |
| nmembers = GetMultiXactIdMembers(multi, &members); |
| |
| if (nmembers >= 0) |
| { |
| int i; |
| |
| for (i = 0; i < nmembers; i++) |
| { |
| TransactionId member = members[i]; |
| |
| debug_elog4(DEBUG2, "MultiXactIdWait: waiting for %d (%u)", |
| i, member); |
| if (!TransactionIdIsCurrentTransactionId(member)) |
| XactLockTableWait(member); |
| } |
| |
| pfree(members); |
| } |
| } |
| |
| /* |
| * ConditionalMultiXactIdWait |
| * As above, but only lock if we can get the lock without blocking. |
| */ |
| bool |
| ConditionalMultiXactIdWait(MultiXactId multi) |
| { |
| bool result = true; |
| TransactionId *members; |
| int nmembers; |
| |
| nmembers = GetMultiXactIdMembers(multi, &members); |
| |
| if (nmembers >= 0) |
| { |
| int i; |
| |
| for (i = 0; i < nmembers; i++) |
| { |
| TransactionId member = members[i]; |
| |
| debug_elog4(DEBUG2, "ConditionalMultiXactIdWait: trying %d (%u)", |
| i, member); |
| if (!TransactionIdIsCurrentTransactionId(member)) |
| { |
| result = ConditionalXactLockTableWait(member); |
| if (!result) |
| break; |
| } |
| } |
| |
| pfree(members); |
| } |
| |
| return result; |
| } |
| |
| /* |
| * CreateMultiXactId |
| * Make a new MultiXactId |
| * |
| * Make XLOG, SLRU and cache entries for a new MultiXactId, recording the |
| * given TransactionIds as members. Returns the newly created MultiXactId. |
| * |
| * NB: the passed xids[] array will be sorted in-place. |
| */ |
| static MultiXactId |
| CreateMultiXactId(int nxids, TransactionId *xids) |
| { |
| MultiXactId multi; |
| MultiXactOffset offset; |
| XLogRecData rdata[2]; |
| xl_multixact_create xlrec; |
| |
| debug_elog3(DEBUG2, "Create: %s", |
| mxid_to_string(InvalidMultiXactId, nxids, xids)); |
| |
| /* |
| * See if the same set of XIDs already exists in our cache; if so, just |
| * re-use that MultiXactId. (Note: it might seem that looking in our |
| * cache is insufficient, and we ought to search disk to see if a |
| * duplicate definition already exists. But since we only ever create |
| * MultiXacts containing our own XID, in most cases any such MultiXacts |
| * were in fact created by us, and so will be in our cache. There are |
| * corner cases where someone else added us to a MultiXact without our |
| * knowledge, but it's not worth checking for.) |
| */ |
| multi = mXactCacheGetBySet(nxids, xids); |
| if (MultiXactIdIsValid(multi)) |
| { |
| debug_elog2(DEBUG2, "Create: in cache!"); |
| return multi; |
| } |
| |
| /* |
| * Assign the MXID and offsets range to use, and make sure there is space |
| * in the OFFSETs and MEMBERs files. NB: this routine does |
| * START_CRIT_SECTION(). |
| */ |
| multi = GetNewMultiXactId(nxids, &offset); |
| |
| /* |
| * Make an XLOG entry describing the new MXID. |
| * |
| * Note: we need not flush this XLOG entry to disk before proceeding. The |
| * only way for the MXID to be referenced from any data page is for |
| * heap_lock_tuple() to have put it there, and heap_lock_tuple() generates |
| * an XLOG record that must follow ours. The normal LSN interlock between |
| * the data page and that XLOG record will ensure that our XLOG record |
| * reaches disk first. If the SLRU members/offsets data reaches disk |
| * sooner than the XLOG record, we do not care because we'll overwrite it |
| * with zeroes unless the XLOG record is there too; see notes at top of |
| * this file. |
| */ |
| xlrec.mid = multi; |
| xlrec.moff = offset; |
| xlrec.nxids = nxids; |
| |
| rdata[0].data = (char *) (&xlrec); |
| rdata[0].len = MinSizeOfMultiXactCreate; |
| rdata[0].buffer = InvalidBuffer; |
| rdata[0].next = &(rdata[1]); |
| rdata[1].data = (char *) xids; |
| rdata[1].len = nxids * sizeof(TransactionId); |
| rdata[1].buffer = InvalidBuffer; |
| rdata[1].next = NULL; |
| |
| (void) XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_CREATE_ID, rdata); |
| |
| /* Now enter the information into the OFFSETs and MEMBERs logs */ |
| RecordNewMultiXact(multi, offset, nxids, xids); |
| |
| /* Done with critical section */ |
| END_CRIT_SECTION(); |
| |
| /* Store the new MultiXactId in the local cache, too */ |
| mXactCachePut(multi, nxids, xids); |
| |
| debug_elog2(DEBUG2, "Create: all done"); |
| |
| return multi; |
| } |
| |
| /* |
| * RecordNewMultiXact |
| * Write info about a new multixact into the offsets and members files |
| * |
| * This is broken out of CreateMultiXactId so that xlog replay can use it. |
| */ |
| static void |
| RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, |
| int nxids, TransactionId *xids) |
| { |
| int pageno; |
| int prev_pageno; |
| int entryno; |
| int slotno; |
| MultiXactOffset *offptr; |
| int i; |
| |
| LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE); |
| |
| pageno = MultiXactIdToOffsetPage(multi); |
| entryno = MultiXactIdToOffsetEntry(multi); |
| |
| /* |
| * Note: we pass the MultiXactId to SimpleLruReadPage as the "transaction" |
| * to complain about if there's any I/O error. This is kinda bogus, but |
| * since the errors will always give the full pathname, it should be clear |
| * enough that a MultiXactId is really involved. Perhaps someday we'll |
| * take the trouble to generalize the slru.c error reporting code. |
| */ |
| slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, multi); |
| offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; |
| offptr += entryno; |
| |
| *offptr = offset; |
| |
| MultiXactOffsetCtl->shared->page_dirty[slotno] = true; |
| |
| /* Exchange our lock */ |
| LWLockRelease(MultiXactOffsetControlLock); |
| |
| LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE); |
| |
| prev_pageno = -1; |
| |
| for (i = 0; i < nxids; i++, offset++) |
| { |
| TransactionId *memberptr; |
| |
| pageno = MXOffsetToMemberPage(offset); |
| entryno = MXOffsetToMemberEntry(offset); |
| |
| if (pageno != prev_pageno) |
| { |
| slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, multi); |
| prev_pageno = pageno; |
| } |
| |
| memberptr = (TransactionId *) |
| MultiXactMemberCtl->shared->page_buffer[slotno]; |
| memberptr += entryno; |
| |
| *memberptr = xids[i]; |
| |
| MultiXactMemberCtl->shared->page_dirty[slotno] = true; |
| } |
| |
| LWLockRelease(MultiXactMemberControlLock); |
| } |
| |
| /* |
| * GetNewMultiXactId |
| * Get the next MultiXactId. |
| * |
| * Also, reserve the needed amount of space in the "members" area. The |
| * starting offset of the reserved space is returned in *offset. |
| * |
| * This may generate XLOG records for expansion of the offsets and/or members |
| * files. Unfortunately, we have to do that while holding MultiXactGenLock |
| * to avoid race conditions --- the XLOG record for zeroing a page must appear |
| * before any backend can possibly try to store data in that page! |
| * |
| * We start a critical section before advancing the shared counters. The |
| * caller must end the critical section after writing SLRU data. |
| */ |
| static MultiXactId |
| GetNewMultiXactId(int nxids, MultiXactOffset *offset) |
| { |
| MultiXactId result; |
| MultiXactOffset nextOffset; |
| |
| debug_elog3(DEBUG2, "GetNew: for %d xids", nxids); |
| |
| /* MultiXactIdSetOldestMember() must have been called already */ |
| Assert(MultiXactIdIsValid(OldestMemberMXactId[MyBackendId])); |
| |
| LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); |
| |
| /* Handle wraparound of the nextMXact counter */ |
| if (MultiXactState->nextMXact < FirstMultiXactId) |
| MultiXactState->nextMXact = FirstMultiXactId; |
| |
| /* |
| * Assign the MXID, and make sure there is room for it in the file. |
| */ |
| result = MultiXactState->nextMXact; |
| |
| ExtendMultiXactOffset(result); |
| |
| /* |
| * Reserve the members space, similarly to above. Also, be careful not to |
| * return zero as the starting offset for any multixact. See |
| * GetMultiXactIdMembers() for motivation. |
| */ |
| nextOffset = MultiXactState->nextOffset; |
| if (nextOffset == 0) |
| { |
| *offset = 1; |
| nxids++; /* allocate member slot 0 too */ |
| } |
| else |
| *offset = nextOffset; |
| |
| ExtendMultiXactMember(nextOffset, nxids); |
| |
| /* |
| * Critical section from here until caller has written the data into the |
| * just-reserved SLRU space; we don't want to error out with a partly |
| * written MultiXact structure. (In particular, failing to write our |
| * start offset after advancing nextMXact would effectively corrupt the |
| * previous MultiXact.) |
| */ |
| START_CRIT_SECTION(); |
| |
| /* |
| * Advance counters. As in GetNewTransactionId(), this must not happen |
| * until after file extension has succeeded! |
| * |
| * We don't care about MultiXactId wraparound here; it will be handled by |
| * the next iteration. But note that nextMXact may be InvalidMultiXactId |
| * after this routine exits, so anyone else looking at the variable must |
| * be prepared to deal with that. Similarly, nextOffset may be zero, but |
| * we won't use that as the actual start offset of the next multixact. |
| */ |
| (MultiXactState->nextMXact)++; |
| |
| MultiXactState->nextOffset += nxids; |
| |
| LWLockRelease(MultiXactGenLock); |
| |
| debug_elog4(DEBUG2, "GetNew: returning %u offset %u", result, *offset); |
| return result; |
| } |
| |
| /* |
| * GetMultiXactIdMembers |
| * Returns the set of TransactionIds that make up a MultiXactId |
| * |
| * We return -1 if the MultiXactId is too old to possibly have any members |
| * still running; in that case we have not actually looked them up, and |
| * *xids is not set. |
| */ |
| int |
| GetMultiXactIdMembers(MultiXactId multi, TransactionId **xids) |
| { |
| int pageno; |
| int prev_pageno; |
| int entryno; |
| int slotno; |
| MultiXactOffset *offptr; |
| MultiXactOffset offset; |
| int length; |
| int truelength; |
| int i; |
| MultiXactId nextMXact; |
| MultiXactId tmpMXact; |
| MultiXactOffset nextOffset; |
| TransactionId *ptr; |
| |
| debug_elog3(DEBUG2, "GetMembers: asked for %u", multi); |
| |
| Assert(MultiXactIdIsValid(multi)); |
| |
| /* See if the MultiXactId is in the local cache */ |
| length = mXactCacheGetById(multi, xids); |
| if (length >= 0) |
| { |
| debug_elog3(DEBUG2, "GetMembers: found %s in the cache", |
| mxid_to_string(multi, length, *xids)); |
| return length; |
| } |
| |
| /* Set our OldestVisibleMXactId[] entry if we didn't already */ |
| MultiXactIdSetOldestVisible(); |
| |
| /* |
| * We check known limits on MultiXact before resorting to the SLRU area. |
| * |
| * An ID older than our OldestVisibleMXactId[] entry can't possibly still |
| * be running, and we'd run the risk of trying to read already-truncated |
| * SLRU data if we did try to examine it. |
| * |
| * Conversely, an ID >= nextMXact shouldn't ever be seen here; if it is |
| * seen, it implies undetected ID wraparound has occurred. We just |
| * silently assume that such an ID is no longer running. |
| * |
| * Shared lock is enough here since we aren't modifying any global state. |
| * Also, we can examine our own OldestVisibleMXactId without the lock, |
| * since no one else is allowed to change it. |
| */ |
| if (MultiXactIdPrecedes(multi, OldestVisibleMXactId[MyBackendId])) |
| { |
| debug_elog2(DEBUG2, "GetMembers: it's too old"); |
| *xids = NULL; |
| return -1; |
| } |
| |
| /* |
| * Acquire the shared lock just long enough to grab the current counter |
| * values. We may need both nextMXact and nextOffset; see below. |
| */ |
| LWLockAcquire(MultiXactGenLock, LW_SHARED); |
| |
| nextMXact = MultiXactState->nextMXact; |
| nextOffset = MultiXactState->nextOffset; |
| |
| LWLockRelease(MultiXactGenLock); |
| |
| if (!MultiXactIdPrecedes(multi, nextMXact)) |
| { |
| debug_elog2(DEBUG2, "GetMembers: it's too new!"); |
| *xids = NULL; |
| return -1; |
| } |
| |
| /* |
| * Find out the offset at which we need to start reading MultiXactMembers |
| * and the number of members in the multixact. We determine the latter as |
| * the difference between this multixact's starting offset and the next |
| * one's. However, there are some corner cases to worry about: |
| * |
| * 1. This multixact may be the latest one created, in which case there is |
| * no next one to look at. In this case the nextOffset value we just |
| * saved is the correct endpoint. |
| * |
| * 2. The next multixact may still be in process of being filled in: that |
| * is, another process may have done GetNewMultiXactId but not yet written |
| * the offset entry for that ID. In that scenario, it is guaranteed that |
| * the offset entry for that multixact exists (because GetNewMultiXactId |
| * won't release MultiXactGenLock until it does) but contains zero |
| * (because we are careful to pre-zero offset pages). Because |
| * GetNewMultiXactId will never return zero as the starting offset for a |
| * multixact, when we read zero as the next multixact's offset, we know we |
| * have this case. We sleep for a bit and try again. |
| * |
| * 3. Because GetNewMultiXactId increments offset zero to offset one to |
| * handle case #2, there is an ambiguity near the point of offset |
| * wraparound. If we see next multixact's offset is one, is that our |
| * multixact's actual endpoint, or did it end at zero with a subsequent |
| * increment? We handle this using the knowledge that if the zero'th |
| * member slot wasn't filled, it'll contain zero, and zero isn't a valid |
| * transaction ID so it can't be a multixact member. Therefore, if we |
| * read a zero from the members array, just ignore it. |
| * |
| * This is all pretty messy, but the mess occurs only in infrequent corner |
| * cases, so it seems better than holding the MultiXactGenLock for a long |
| * time on every multixact creation. |
| */ |
| retry: |
| LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE); |
| |
| pageno = MultiXactIdToOffsetPage(multi); |
| entryno = MultiXactIdToOffsetEntry(multi); |
| |
| slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, multi); |
| offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; |
| offptr += entryno; |
| offset = *offptr; |
| |
| Assert(offset != 0); |
| |
| /* |
| * Use the same increment rule as GetNewMultiXactId(), that is, don't |
| * handle wraparound explicitly until needed. |
| */ |
| tmpMXact = multi + 1; |
| |
| if (nextMXact == tmpMXact) |
| { |
| /* Corner case 1: there is no next multixact */ |
| length = nextOffset - offset; |
| } |
| else |
| { |
| MultiXactOffset nextMXOffset; |
| |
| /* handle wraparound if needed */ |
| if (tmpMXact < FirstMultiXactId) |
| tmpMXact = FirstMultiXactId; |
| |
| prev_pageno = pageno; |
| |
| pageno = MultiXactIdToOffsetPage(tmpMXact); |
| entryno = MultiXactIdToOffsetEntry(tmpMXact); |
| |
| if (pageno != prev_pageno) |
| slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, tmpMXact); |
| |
| offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; |
| offptr += entryno; |
| nextMXOffset = *offptr; |
| |
| if (nextMXOffset == 0) |
| { |
| /* Corner case 2: next multixact is still being filled in */ |
| LWLockRelease(MultiXactOffsetControlLock); |
| pg_usleep(1000L); |
| goto retry; |
| } |
| |
| length = nextMXOffset - offset; |
| } |
| |
| LWLockRelease(MultiXactOffsetControlLock); |
| |
| ptr = (TransactionId *) palloc(length * sizeof(TransactionId)); |
| *xids = ptr; |
| |
| /* Now get the members themselves. */ |
| LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE); |
| |
| truelength = 0; |
| prev_pageno = -1; |
| for (i = 0; i < length; i++, offset++) |
| { |
| TransactionId *xactptr; |
| |
| pageno = MXOffsetToMemberPage(offset); |
| entryno = MXOffsetToMemberEntry(offset); |
| |
| if (pageno != prev_pageno) |
| { |
| slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, multi); |
| prev_pageno = pageno; |
| } |
| |
| xactptr = (TransactionId *) |
| MultiXactMemberCtl->shared->page_buffer[slotno]; |
| xactptr += entryno; |
| |
| if (!TransactionIdIsValid(*xactptr)) |
| { |
| /* Corner case 3: we must be looking at unused slot zero */ |
| Assert(offset == 0); |
| continue; |
| } |
| |
| ptr[truelength++] = *xactptr; |
| } |
| |
| LWLockRelease(MultiXactMemberControlLock); |
| |
| /* |
| * Copy the result into the local cache. |
| */ |
| mXactCachePut(multi, truelength, ptr); |
| |
| debug_elog3(DEBUG2, "GetMembers: no cache for %s", |
| mxid_to_string(multi, truelength, ptr)); |
| return truelength; |
| } |
| |
| /* |
| * mXactCacheGetBySet |
| * returns a MultiXactId from the cache based on the set of |
| * TransactionIds that compose it, or InvalidMultiXactId if |
| * none matches. |
| * |
| * This is helpful, for example, if two transactions want to lock a huge |
| * table. By using the cache, the second will use the same MultiXactId |
| * for the majority of tuples, thus keeping MultiXactId usage low (saving |
| * both I/O and wraparound issues). |
| * |
| * NB: the passed xids[] array will be sorted in-place. |
| */ |
| static MultiXactId |
| mXactCacheGetBySet(int nxids, TransactionId *xids) |
| { |
| mXactCacheEnt *entry; |
| |
| debug_elog3(DEBUG2, "CacheGet: looking for %s", |
| mxid_to_string(InvalidMultiXactId, nxids, xids)); |
| |
| /* sort the array so comparison is easy */ |
| qsort(xids, nxids, sizeof(TransactionId), xidComparator); |
| |
| for (entry = MXactCache; entry != NULL; entry = entry->next) |
| { |
| if (entry->nxids != nxids) |
| continue; |
| |
| /* We assume the cache entries are sorted */ |
| if (memcmp(xids, entry->xids, nxids * sizeof(TransactionId)) == 0) |
| { |
| debug_elog3(DEBUG2, "CacheGet: found %u", entry->multi); |
| return entry->multi; |
| } |
| } |
| |
| debug_elog2(DEBUG2, "CacheGet: not found :-("); |
| return InvalidMultiXactId; |
| } |
| |
| /* |
| * mXactCacheGetById |
| * returns the composing TransactionId set from the cache for a |
| * given MultiXactId, if present. |
| * |
| * If successful, *xids is set to the address of a palloc'd copy of the |
| * TransactionId set. Return value is number of members, or -1 on failure. |
| */ |
| static int |
| mXactCacheGetById(MultiXactId multi, TransactionId **xids) |
| { |
| mXactCacheEnt *entry; |
| |
| debug_elog3(DEBUG2, "CacheGet: looking for %u", multi); |
| |
| for (entry = MXactCache; entry != NULL; entry = entry->next) |
| { |
| if (entry->multi == multi) |
| { |
| TransactionId *ptr; |
| Size size; |
| |
| size = sizeof(TransactionId) * entry->nxids; |
| ptr = (TransactionId *) palloc(size); |
| *xids = ptr; |
| |
| memcpy(ptr, entry->xids, size); |
| |
| debug_elog3(DEBUG2, "CacheGet: found %s", |
| mxid_to_string(multi, entry->nxids, entry->xids)); |
| return entry->nxids; |
| } |
| } |
| |
| debug_elog2(DEBUG2, "CacheGet: not found"); |
| return -1; |
| } |
| |
| /* |
| * mXactCachePut |
| * Add a new MultiXactId and its composing set into the local cache. |
| */ |
| static void |
| mXactCachePut(MultiXactId multi, int nxids, TransactionId *xids) |
| { |
| mXactCacheEnt *entry; |
| |
| debug_elog3(DEBUG2, "CachePut: storing %s", |
| mxid_to_string(multi, nxids, xids)); |
| |
| if (MXactContext == NULL) |
| { |
| /* The cache only lives as long as the current transaction */ |
| debug_elog2(DEBUG2, "CachePut: initializing memory context"); |
| MXactContext = AllocSetContextCreate(TopTransactionContext, |
| "MultiXact Cache Context", |
| ALLOCSET_SMALL_MINSIZE, |
| ALLOCSET_SMALL_INITSIZE, |
| ALLOCSET_SMALL_MAXSIZE); |
| } |
| |
| entry = (mXactCacheEnt *) |
| MemoryContextAlloc(MXactContext, |
| offsetof(mXactCacheEnt, xids) + |
| nxids * sizeof(TransactionId)); |
| |
| entry->multi = multi; |
| entry->nxids = nxids; |
| memcpy(entry->xids, xids, nxids * sizeof(TransactionId)); |
| |
| /* mXactCacheGetBySet assumes the entries are sorted, so sort them */ |
| qsort(entry->xids, nxids, sizeof(TransactionId), xidComparator); |
| |
| entry->next = MXactCache; |
| MXactCache = entry; |
| } |
| |
| /* |
| * xidComparator |
| * qsort comparison function for XIDs |
| * |
| * We don't need to use wraparound comparison for XIDs, and indeed must |
| * not do so since that does not respect the triangle inequality! Any |
| * old sort order will do. |
| */ |
| static int |
| xidComparator(const void *arg1, const void *arg2) |
| { |
| TransactionId xid1 = *(const TransactionId *) arg1; |
| TransactionId xid2 = *(const TransactionId *) arg2; |
| |
| if (xid1 > xid2) |
| return 1; |
| if (xid1 < xid2) |
| return -1; |
| return 0; |
| } |
| |
#ifdef MULTIXACT_DEBUG
/*
 * mxid_to_string
 *		Build a palloc'd debugging string of the form
 *		"multi nxids[xid1, xid2, ...]".
 *
 * The buffer is sized for the widest possible rendering of every field, and
 * every write is bounded by the space actually remaining in it.  An empty
 * member set (nxids <= 0) is rendered with an empty bracket pair and xids[]
 * is not dereferenced at all.
 */
static char *
mxid_to_string(MultiXactId multi, int nxids, TransactionId *xids)
{
	/*
	 * Worst case for the fixed part: 10 digits for multi, a space, up to 11
	 * characters for nxids and the '[' make 23, plus ']' and the NUL; round
	 * that up to 32.  Each member needs at most 10 digits plus ", ".
	 */
	Size		bufsize = 32 + 12 * (Size) ((nxids > 0) ? nxids : 0);
	char	   *str = palloc(bufsize);
	Size		used;
	int			i;

	snprintf(str, bufsize, "%u %d[", multi, nxids);
	used = strlen(str);

	for (i = 0; i < nxids; i++)
	{
		/* Separator goes in front of every member except the first */
		snprintf(str + used, bufsize - used, "%s%u",
				 (i > 0) ? ", " : "", xids[i]);
		used += strlen(str + used);
	}

	snprintf(str + used, bufsize - used, "]");
	return str;
}
#endif
| |
| /* |
| * AtEOXact_MultiXact |
| * Handle transaction end for MultiXact |
| * |
| * This is called at top transaction commit or abort (we don't care which). |
| */ |
| void |
| AtEOXact_MultiXact(void) |
| { |
| /* |
| * Reset our OldestMemberMXactId and OldestVisibleMXactId values, both of |
| * which should only be valid while within a transaction. |
| * |
| * We assume that storing a MultiXactId is atomic and so we need not take |
| * MultiXactGenLock to do this. |
| */ |
| OldestMemberMXactId[MyBackendId] = InvalidMultiXactId; |
| OldestVisibleMXactId[MyBackendId] = InvalidMultiXactId; |
| |
| /* |
| * Discard the local MultiXactId cache. Since MXactContext was created as |
| * a child of TopTransactionContext, we needn't delete it explicitly. |
| */ |
| MXactContext = NULL; |
| MXactCache = NULL; |
| } |
| |
| /* |
| * Initialization of shared memory for MultiXact. We use two SLRU areas, |
| * thus double memory. Also, reserve space for the shared MultiXactState |
| * struct and the per-backend MultiXactId arrays (two of those, too). |
| */ |
| Size |
| MultiXactShmemSize(void) |
| { |
| Size size; |
| |
| #define SHARED_MULTIXACT_STATE_SIZE \ |
| add_size(sizeof(MultiXactStateData), \ |
| mul_size(sizeof(MultiXactId) * 2, MaxBackends)) |
| |
| size = SHARED_MULTIXACT_STATE_SIZE; |
| size = add_size(size, SimpleLruShmemSize(NUM_MXACTOFFSET_BUFFERS)); |
| size = add_size(size, SimpleLruShmemSize(NUM_MXACTMEMBER_BUFFERS)); |
| |
| return size; |
| } |
| |
| void |
| MultiXactShmemInit(void) |
| { |
| bool found; |
| |
| debug_elog2(DEBUG2, "Shared Memory Init for MultiXact"); |
| |
| MultiXactOffsetCtl->PagePrecedes = MultiXactOffsetPagePrecedes; |
| MultiXactMemberCtl->PagePrecedes = MultiXactMemberPagePrecedes; |
| |
| SimpleLruInit(MultiXactOffsetCtl, |
| "MultiXactOffset Ctl", NUM_MXACTOFFSET_BUFFERS, |
| MultiXactOffsetControlLock, MULTIXACT_OFFSETS_DIR); |
| SimpleLruInit(MultiXactMemberCtl, |
| "MultiXactMember Ctl", NUM_MXACTMEMBER_BUFFERS, |
| MultiXactMemberControlLock, MULTIXACT_MEMBERS_DIR); |
| |
| /* Initialize our shared state struct */ |
| MultiXactState = ShmemInitStruct("Shared MultiXact State", |
| SHARED_MULTIXACT_STATE_SIZE, |
| &found); |
| if (!IsUnderPostmaster) |
| { |
| Assert(!found); |
| |
| /* Make sure we zero out the per-backend state */ |
| MemSet(MultiXactState, 0, SHARED_MULTIXACT_STATE_SIZE); |
| } |
| else |
| Assert(found); |
| |
| /* |
| * Set up array pointers. Note that perBackendXactIds[0] is wasted space |
| * since we only use indexes 1..MaxBackends in each array. |
| */ |
| OldestMemberMXactId = MultiXactState->perBackendXactIds; |
| OldestVisibleMXactId = OldestMemberMXactId + MaxBackends; |
| } |
| |
| /* |
| * This func must be called ONCE on system install. It creates the initial |
| * MultiXact segments. (The MultiXacts directories are assumed to have been |
| * created by initdb, and MultiXactShmemInit must have been called already.) |
| */ |
| void |
| BootStrapMultiXact(void) |
| { |
| int slotno; |
| |
| LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE); |
| |
| /* Create and zero the first page of the offsets log */ |
| slotno = ZeroMultiXactOffsetPage(0, false); |
| |
| /* Make sure it's written out */ |
| SimpleLruWritePage(MultiXactOffsetCtl, slotno, NULL); |
| Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]); |
| |
| LWLockRelease(MultiXactOffsetControlLock); |
| |
| LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE); |
| |
| /* Create and zero the first page of the members log */ |
| slotno = ZeroMultiXactMemberPage(0, false); |
| |
| /* Make sure it's written out */ |
| SimpleLruWritePage(MultiXactMemberCtl, slotno, NULL); |
| Assert(!MultiXactMemberCtl->shared->page_dirty[slotno]); |
| |
| LWLockRelease(MultiXactMemberControlLock); |
| } |
| |
| /* |
| * Initialize (or reinitialize) a page of MultiXactOffset to zeroes. |
| * If writeXlog is TRUE, also emit an XLOG record saying we did this. |
| * |
| * The page is not actually written, just set up in shared memory. |
| * The slot number of the new page is returned. |
| * |
| * Control lock must be held at entry, and will be held at exit. |
| */ |
| static int |
| ZeroMultiXactOffsetPage(int pageno, bool writeXlog) |
| { |
| int slotno; |
| |
| slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno); |
| |
| if (writeXlog) |
| WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_OFF_PAGE); |
| |
| return slotno; |
| } |
| |
| /* |
| * Ditto, for MultiXactMember |
| */ |
| static int |
| ZeroMultiXactMemberPage(int pageno, bool writeXlog) |
| { |
| int slotno; |
| |
| slotno = SimpleLruZeroPage(MultiXactMemberCtl, pageno); |
| |
| if (writeXlog) |
| WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_MEM_PAGE); |
| |
| return slotno; |
| } |
| |
| /* |
| * This must be called ONCE during postmaster or standalone-backend startup. |
| * |
| * StartupXLOG has already established nextMXact/nextOffset by calling |
| * MultiXactSetNextMXact and/or MultiXactAdvanceNextMXact. Note that we |
| * may already have replayed WAL data into the SLRU files. |
| * |
| * We don't need any locks here, really; the SLRU locks are taken |
| * only because slru.c expects to be called with locks held. |
| */ |
| void |
| StartupMultiXact(void) |
| { |
| MultiXactId multi = MultiXactState->nextMXact; |
| MultiXactOffset offset = MultiXactState->nextOffset; |
| int pageno; |
| int entryno; |
| |
| /* Clean up offsets state */ |
| LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE); |
| |
| /* |
| * Initialize our idea of the latest page number. |
| */ |
| pageno = MultiXactIdToOffsetPage(multi); |
| MultiXactOffsetCtl->shared->latest_page_number = pageno; |
| |
| /* |
| * Zero out the remainder of the current offsets page. See notes in |
| * StartupCLOG() for motivation. |
| */ |
| entryno = MultiXactIdToOffsetEntry(multi); |
| if (entryno != 0) |
| { |
| int slotno; |
| MultiXactOffset *offptr; |
| |
| slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, multi); |
| offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; |
| offptr += entryno; |
| |
| MemSet(offptr, 0, BLCKSZ - (entryno * sizeof(MultiXactOffset))); |
| |
| MultiXactOffsetCtl->shared->page_dirty[slotno] = true; |
| } |
| |
| LWLockRelease(MultiXactOffsetControlLock); |
| |
| /* And the same for members */ |
| LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE); |
| |
| /* |
| * Initialize our idea of the latest page number. |
| */ |
| pageno = MXOffsetToMemberPage(offset); |
| MultiXactMemberCtl->shared->latest_page_number = pageno; |
| |
| /* |
| * Zero out the remainder of the current members page. See notes in |
| * StartupCLOG() for motivation. |
| */ |
| entryno = MXOffsetToMemberEntry(offset); |
| if (entryno != 0) |
| { |
| int slotno; |
| TransactionId *xidptr; |
| |
| slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, offset); |
| xidptr = (TransactionId *) MultiXactMemberCtl->shared->page_buffer[slotno]; |
| xidptr += entryno; |
| |
| MemSet(xidptr, 0, BLCKSZ - (entryno * sizeof(TransactionId))); |
| |
| MultiXactMemberCtl->shared->page_dirty[slotno] = true; |
| } |
| |
| LWLockRelease(MultiXactMemberControlLock); |
| |
| /* |
| * Initialize lastTruncationPoint to invalid, ensuring that the first |
| * checkpoint will try to do truncation. |
| */ |
| MultiXactState->lastTruncationPoint = InvalidMultiXactId; |
| } |
| |
| /* |
| * This must be called ONCE during postmaster or standalone-backend shutdown |
| */ |
| void |
| ShutdownMultiXact(void) |
| { |
| /* Flush dirty MultiXact pages to disk */ |
| SimpleLruFlush(MultiXactOffsetCtl, false); |
| SimpleLruFlush(MultiXactMemberCtl, false); |
| } |
| |
| /* |
| * Get the next MultiXactId and offset to save in a checkpoint record |
| */ |
| void |
| MultiXactGetCheckptMulti(bool is_shutdown __attribute__((unused)) , |
| MultiXactId *nextMulti, |
| MultiXactOffset *nextMultiOffset) |
| { |
| LWLockAcquire(MultiXactGenLock, LW_SHARED); |
| |
| *nextMulti = MultiXactState->nextMXact; |
| *nextMultiOffset = MultiXactState->nextOffset; |
| |
| LWLockRelease(MultiXactGenLock); |
| |
| debug_elog4(DEBUG2, "MultiXact: checkpoint is nextMulti %u, nextOffset %u", |
| *nextMulti, *nextMultiOffset); |
| } |
| |
| /* |
| * Perform a checkpoint --- either during shutdown, or on-the-fly |
| */ |
| void |
| CheckPointMultiXact(void) |
| { |
| /* Flush dirty MultiXact pages to disk */ |
| SimpleLruFlush(MultiXactOffsetCtl, true); |
| SimpleLruFlush(MultiXactMemberCtl, true); |
| |
| /* |
| * Truncate the SLRU files. This could be done at any time, but |
| * checkpoint seems a reasonable place for it. There is one exception: if |
| * we are called during xlog recovery, then shared->latest_page_number |
| * isn't valid (because StartupMultiXact hasn't been called yet) and so |
| * SimpleLruTruncate would get confused. It seems best not to risk |
| * removing any data during recovery anyway, so don't truncate. |
| */ |
| if (!InRecovery) |
| TruncateMultiXact(); |
| } |
| |
| /* |
| * Set the next-to-be-assigned MultiXactId and offset |
| * |
| * This is used when we can determine the correct next ID/offset exactly |
| * from a checkpoint record. We need no locking since it is only called |
| * during bootstrap and XLog replay. |
| */ |
| void |
| MultiXactSetNextMXact(MultiXactId nextMulti, |
| MultiXactOffset nextMultiOffset) |
| { |
| debug_elog4(DEBUG2, "MultiXact: setting next multi to %u offset %u", |
| nextMulti, nextMultiOffset); |
| MultiXactState->nextMXact = nextMulti; |
| MultiXactState->nextOffset = nextMultiOffset; |
| } |
| |
| /* |
| * Ensure the next-to-be-assigned MultiXactId is at least minMulti, |
| * and similarly nextOffset is at least minMultiOffset |
| * |
| * This is used when we can determine minimum safe values from an XLog |
| * record (either an on-line checkpoint or an mxact creation log entry). |
| * We need no locking since it is only called during XLog replay. |
| */ |
| void |
| MultiXactAdvanceNextMXact(MultiXactId minMulti, |
| MultiXactOffset minMultiOffset) |
| { |
| if (MultiXactIdPrecedes(MultiXactState->nextMXact, minMulti)) |
| { |
| debug_elog3(DEBUG2, "MultiXact: setting next multi to %u", minMulti); |
| MultiXactState->nextMXact = minMulti; |
| } |
| if (MultiXactOffsetPrecedes(MultiXactState->nextOffset, minMultiOffset)) |
| { |
| debug_elog3(DEBUG2, "MultiXact: setting next offset to %u", |
| minMultiOffset); |
| MultiXactState->nextOffset = minMultiOffset; |
| } |
| } |
| |
| /* |
| * Make sure that MultiXactOffset has room for a newly-allocated MultiXactId. |
| * |
| * NB: this is called while holding MultiXactGenLock. We want it to be very |
| * fast most of the time; even when it's not so fast, no actual I/O need |
| * happen unless we're forced to write out a dirty log or xlog page to make |
| * room in shared memory. |
| */ |
| static void |
| ExtendMultiXactOffset(MultiXactId multi) |
| { |
| int pageno; |
| |
| /* |
| * No work except at first MultiXactId of a page. But beware: just after |
| * wraparound, the first MultiXactId of page zero is FirstMultiXactId. |
| */ |
| if (MultiXactIdToOffsetEntry(multi) != 0 && |
| multi != FirstMultiXactId) |
| return; |
| |
| pageno = MultiXactIdToOffsetPage(multi); |
| |
| LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE); |
| |
| /* Zero the page and make an XLOG entry about it */ |
| ZeroMultiXactOffsetPage(pageno, true); |
| |
| LWLockRelease(MultiXactOffsetControlLock); |
| } |
| |
| /* |
| * Make sure that MultiXactMember has room for the members of a newly- |
| * allocated MultiXactId. |
| * |
| * Like the above routine, this is called while holding MultiXactGenLock; |
| * same comments apply. |
| */ |
| static void |
| ExtendMultiXactMember(MultiXactOffset offset, int nmembers) |
| { |
| /* |
| * It's possible that the members span more than one page of the members |
| * file, so we loop to ensure we consider each page. The coding is not |
| * optimal if the members span several pages, but that seems unusual |
| * enough to not worry much about. |
| */ |
| while (nmembers > 0) |
| { |
| int entryno; |
| |
| /* |
| * Only zero when at first entry of a page. |
| */ |
| entryno = MXOffsetToMemberEntry(offset); |
| if (entryno == 0) |
| { |
| int pageno; |
| |
| pageno = MXOffsetToMemberPage(offset); |
| |
| LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE); |
| |
| /* Zero the page and make an XLOG entry about it */ |
| ZeroMultiXactMemberPage(pageno, true); |
| |
| LWLockRelease(MultiXactMemberControlLock); |
| } |
| |
| /* Advance to next page (OK if nmembers goes negative) */ |
| offset += (MULTIXACT_MEMBERS_PER_PAGE - entryno); |
| nmembers -= (MULTIXACT_MEMBERS_PER_PAGE - entryno); |
| } |
| } |
| |
| /* |
| * Remove all MultiXactOffset and MultiXactMember segments before the oldest |
| * ones still of interest. |
| * |
| * This is called only during checkpoints. We assume no more than one |
| * backend does this at a time. |
| * |
| * XXX do we have any issues with needing to checkpoint here? |
| */ |
| static void |
| TruncateMultiXact(void) |
| { |
| MultiXactId nextMXact; |
| MultiXactOffset nextOffset; |
| MultiXactId oldestMXact; |
| MultiXactOffset oldestOffset; |
| int cutoffPage; |
| int i; |
| |
| /* |
| * First, compute where we can safely truncate. Per notes above, this is |
| * the oldest valid value among all the OldestMemberMXactId[] and |
| * OldestVisibleMXactId[] entries, or nextMXact if none are valid. |
| */ |
| |
| LWLockAcquire(MultiXactGenLock, LW_SHARED); |
| |
| /* |
| * We have to beware of the possibility that nextMXact is in the |
| * wrapped-around state. We don't fix the counter itself here, but we |
| * must be sure to use a valid value in our calculation. |
| */ |
| nextMXact = MultiXactState->nextMXact; |
| if (nextMXact < FirstMultiXactId) |
| nextMXact = FirstMultiXactId; |
| |
| oldestMXact = nextMXact; |
| for (i = 1; i <= MaxBackends; i++) |
| { |
| MultiXactId thisoldest; |
| |
| thisoldest = OldestMemberMXactId[i]; |
| if (MultiXactIdIsValid(thisoldest) && |
| MultiXactIdPrecedes(thisoldest, oldestMXact)) |
| oldestMXact = thisoldest; |
| thisoldest = OldestVisibleMXactId[i]; |
| if (MultiXactIdIsValid(thisoldest) && |
| MultiXactIdPrecedes(thisoldest, oldestMXact)) |
| oldestMXact = thisoldest; |
| } |
| |
| /* Save the current nextOffset too */ |
| nextOffset = MultiXactState->nextOffset; |
| |
| LWLockRelease(MultiXactGenLock); |
| |
| debug_elog3(DEBUG2, "MultiXact: truncation point = %u", oldestMXact); |
| |
| /* |
| * If we already truncated at this point, do nothing. This saves time |
| * when no MultiXacts are getting used, which is probably not uncommon. |
| */ |
| if (MultiXactState->lastTruncationPoint == oldestMXact) |
| { |
| return; |
| } |
| |
| /* |
| * We need to determine where to truncate MultiXactMember. If we found a |
| * valid oldest MultiXactId, read its starting offset; otherwise we use |
| * the nextOffset value we saved above. |
| */ |
| if (oldestMXact == nextMXact) |
| oldestOffset = nextOffset; |
| else |
| { |
| int pageno; |
| int slotno; |
| int entryno; |
| MultiXactOffset *offptr; |
| |
| /* lock is acquired by SimpleLruReadPage_ReadOnly */ |
| |
| pageno = MultiXactIdToOffsetPage(oldestMXact); |
| entryno = MultiXactIdToOffsetEntry(oldestMXact); |
| |
| slotno = SimpleLruReadPage_ReadOnly(MultiXactOffsetCtl, pageno, oldestMXact, NULL); |
| offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; |
| offptr += entryno; |
| oldestOffset = *offptr; |
| |
| LWLockRelease(MultiXactOffsetControlLock); |
| } |
| |
| /* |
| * The cutoff point is the start of the segment containing oldestMXact. We |
| * pass the *page* containing oldestMXact to SimpleLruTruncate. |
| */ |
| cutoffPage = MultiXactIdToOffsetPage(oldestMXact); |
| |
| SimpleLruTruncate(MultiXactOffsetCtl, cutoffPage, false); |
| |
| /* |
| * Also truncate MultiXactMember at the previously determined offset. |
| */ |
| cutoffPage = MXOffsetToMemberPage(oldestOffset); |
| |
| SimpleLruTruncate(MultiXactMemberCtl, cutoffPage, false); |
| |
| /* |
| * Set the last known truncation point. We don't need a lock for this |
| * since only one backend does checkpoints at a time. |
| */ |
| MultiXactState->lastTruncationPoint = oldestMXact; |
| } |
| |
| /* |
| * Decide which of two MultiXactOffset page numbers is "older" for truncation |
| * purposes. |
| * |
| * We need to use comparison of MultiXactId here in order to do the right |
| * thing with wraparound. However, if we are asked about page number zero, we |
| * don't want to hand InvalidMultiXactId to MultiXactIdPrecedes: it'll get |
| * weird. So, offset both multis by FirstMultiXactId to avoid that. |
| * (Actually, the current implementation doesn't do anything weird with |
| * InvalidMultiXactId, but there's no harm in leaving this code like this.) |
| */ |
| static bool |
| MultiXactOffsetPagePrecedes(int page1, int page2) |
| { |
| MultiXactId multi1; |
| MultiXactId multi2; |
| |
| multi1 = ((MultiXactId) page1) * MULTIXACT_OFFSETS_PER_PAGE; |
| multi1 += FirstMultiXactId; |
| multi2 = ((MultiXactId) page2) * MULTIXACT_OFFSETS_PER_PAGE; |
| multi2 += FirstMultiXactId; |
| |
| return MultiXactIdPrecedes(multi1, multi2); |
| } |
| |
| /* |
| * Decide which of two MultiXactMember page numbers is "older" for truncation |
| * purposes. There is no "invalid offset number" so use the numbers verbatim. |
| */ |
| static bool |
| MultiXactMemberPagePrecedes(int page1, int page2) |
| { |
| MultiXactOffset offset1; |
| MultiXactOffset offset2; |
| |
| offset1 = ((MultiXactOffset) page1) * MULTIXACT_MEMBERS_PER_PAGE; |
| offset2 = ((MultiXactOffset) page2) * MULTIXACT_MEMBERS_PER_PAGE; |
| |
| return MultiXactOffsetPrecedes(offset1, offset2); |
| } |
| |
| /* |
| * Decide which of two MultiXactIds is earlier. |
| * |
| * XXX do we need to do something special for InvalidMultiXactId? |
| * (Doesn't look like it.) |
| */ |
| static bool |
| MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2) |
| { |
| int32 diff = (int32) (multi1 - multi2); |
| |
| return (diff < 0); |
| } |
| |
| /* |
| * Decide which of two offsets is earlier. |
| */ |
| static bool |
| MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2) |
| { |
| int32 diff = (int32) (offset1 - offset2); |
| |
| return (diff < 0); |
| } |
| |
| |
| /* |
| * Write an xlog record reflecting the zeroing of either a MEMBERs or |
| * OFFSETs page (info shows which) |
| * |
| * Note: xlog record is marked as outside transaction control, since we |
| * want it to be redone whether the invoking transaction commits or not. |
| */ |
| static void |
| WriteMZeroPageXlogRec(int pageno, uint8 info) |
| { |
| XLogRecData rdata; |
| |
| rdata.data = (char *) (&pageno); |
| rdata.len = sizeof(int); |
| rdata.buffer = InvalidBuffer; |
| rdata.next = NULL; |
| (void) XLogInsert(RM_MULTIXACT_ID, info | XLOG_NO_TRAN, &rdata); |
| } |
| |
| /* |
| * MULTIXACT resource manager's routines |
| */ |
| void |
| multixact_redo(XLogRecPtr beginLoc __attribute__((unused)), XLogRecPtr lsn __attribute__((unused)), XLogRecord *record) |
| { |
| uint8 info = record->xl_info & ~XLR_INFO_MASK; |
| |
| if (info == XLOG_MULTIXACT_ZERO_OFF_PAGE) |
| { |
| int pageno; |
| int slotno; |
| |
| memcpy(&pageno, XLogRecGetData(record), sizeof(int)); |
| |
| LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE); |
| |
| slotno = ZeroMultiXactOffsetPage(pageno, false); |
| SimpleLruWritePage(MultiXactOffsetCtl, slotno, NULL); |
| Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]); |
| |
| LWLockRelease(MultiXactOffsetControlLock); |
| } |
| else if (info == XLOG_MULTIXACT_ZERO_MEM_PAGE) |
| { |
| int pageno; |
| int slotno; |
| |
| memcpy(&pageno, XLogRecGetData(record), sizeof(int)); |
| |
| LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE); |
| |
| slotno = ZeroMultiXactMemberPage(pageno, false); |
| SimpleLruWritePage(MultiXactMemberCtl, slotno, NULL); |
| Assert(!MultiXactMemberCtl->shared->page_dirty[slotno]); |
| |
| LWLockRelease(MultiXactMemberControlLock); |
| } |
| else if (info == XLOG_MULTIXACT_CREATE_ID) |
| { |
| xl_multixact_create *xlrec = (xl_multixact_create *) XLogRecGetData(record); |
| TransactionId *xids = xlrec->xids; |
| TransactionId max_xid; |
| int i; |
| |
| /* Store the data back into the SLRU files */ |
| RecordNewMultiXact(xlrec->mid, xlrec->moff, xlrec->nxids, xids); |
| |
| /* Make sure nextMXact/nextOffset are beyond what this record has */ |
| MultiXactAdvanceNextMXact(xlrec->mid + 1, xlrec->moff + xlrec->nxids); |
| |
| /* |
| * Make sure nextXid is beyond any XID mentioned in the record. This |
| * should be unnecessary, since any XID found here ought to have other |
| * evidence in the XLOG, but let's be safe. |
| */ |
| max_xid = record->xl_xid; |
| for (i = 0; i < xlrec->nxids; i++) |
| { |
| if (TransactionIdPrecedes(max_xid, xids[i])) |
| max_xid = xids[i]; |
| } |
| if (TransactionIdFollowsOrEquals(max_xid, |
| ShmemVariableCache->nextXid)) |
| { |
| ShmemVariableCache->nextXid = max_xid; |
| TransactionIdAdvance(ShmemVariableCache->nextXid); |
| } |
| } |
| else |
| elog(PANIC, "multixact_redo: unknown op code %u", info); |
| } |
| |
| void |
| multixact_desc(StringInfo buf, XLogRecPtr beginLoc, XLogRecord *record) |
| { |
| uint8 info = record->xl_info & ~XLR_INFO_MASK; |
| char *rec = XLogRecGetData(record); |
| |
| if (info == XLOG_MULTIXACT_ZERO_OFF_PAGE) |
| { |
| int pageno; |
| |
| memcpy(&pageno, rec, sizeof(int)); |
| appendStringInfo(buf, "zero offsets page: %d", pageno); |
| } |
| else if (info == XLOG_MULTIXACT_ZERO_MEM_PAGE) |
| { |
| int pageno; |
| |
| memcpy(&pageno, rec, sizeof(int)); |
| appendStringInfo(buf, "zero members page: %d", pageno); |
| } |
| else if (info == XLOG_MULTIXACT_CREATE_ID) |
| { |
| xl_multixact_create *xlrec = (xl_multixact_create *) rec; |
| int i; |
| |
| appendStringInfo(buf, "create multixact %u offset %u:", |
| xlrec->mid, xlrec->moff); |
| for (i = 0; i < xlrec->nxids; i++) |
| appendStringInfo(buf, " %u", xlrec->xids[i]); |
| } |
| else |
| appendStringInfo(buf, "UNKNOWN"); |
| } |