blob: ae674752791db8d8f4b90056914a0b109aa60a1b [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*-------------------------------------------------------------------------
*
* cdbpersistentrecovery.c
*
*-------------------------------------------------------------------------
*/
#include <unistd.h>
#include <fcntl.h>
#include <sys/file.h>
#include <sys/stat.h>
#include "postgres.h"
#include "cdb/cdbdoublylinked.h"
#include "access/persistentfilesysobjname.h"
#include "catalog/pg_class.h"
#include "catalog/pg_namespace.h"
#include "catalog/pg_authid.h"
#include "catalog/pg_tablespace.h"
#include "catalog/pg_database.h"
#include "catalog/gp_persistent.h"
#include "cdb/cdbpersistentstore.h"
#include "cdb/cdbpersistentfilesysobj.h"
#include "cdb/cdbpersistentrelfile.h"
#include "cdb/cdbpersistentrecovery.h"
#include "access/heapam.h"
#include "catalog/pg_tablespace.h"
#include "access/xlog_internal.h"
#include "catalog/catalog.h"
#include "cdb/cdbdoublylinked.h"
#include "utils/hsearch.h"
#include "access/genam.h"
#include "access/heapam.h"
#include "access/transam.h"
#include "cdb/cdbdirectopen.h"
#include "utils/guc.h"
#include "storage/smgr.h"
#include "cdb/cdbmirroredfilesysobj.h"
#include "cdb/cdbresynchronizechangetracking.h"
#include "access/twophase.h"
/*
 * Forward declaration: used by PersistentRecovery_ShouldHandlePass3XLogRec
 * before the definition appears later in this file.
 */
static bool
PersistentRecovery_RedoRelationExists(
ItemPointer persistentTid,
int64 persistentSerialNum,
RelFileNode *relFileNode);
/*
 * One entry per transaction seen during recovery, stored in xactHashTable.
 */
typedef struct XactEntryData
{
/* Hash key (the table is created with keysize sizeof(TransactionId)). */
TransactionId xid;
/* Last known outcome of the transaction, or XACT_INFOKIND_NONE when only
 * file-system objects (and no XLOG end-transaction record) refer to it. */
XactInfoKind infoKind;
/* File-system object entries attached to this transaction; they are chained
 * through FsObjEntryData.xactLinks. */
DoublyLinkedHead fsObjEntryList;
} XactEntryData;
typedef XactEntryData *XactEntry;
/* Backend-local transaction table; created lazily on first lookup. */
static HTAB *xactHashTable = NULL;
/* Shared-memory control structure for the pass 2 AbortingCreate hash table;
 * assigned by Pass2Recovery_ShmemInit(). */
Pass2RecoveryHashShmem_s *pass2RecoveryHashShmem = NULL;
Size
Pass2Recovery_ShmemSize(void)
{
Size size;
size = hash_estimate_size(
(Size)GP_MAX_PASS2RECOVERY_ABORTINGCREATE,
sizeof(Pass2RecoveryHashEntry_s));
size = add_size(size, sizeof(Pass2RecoveryHashShmem_s));
return size;
}
/*
 * Create (or attach to) the shared-memory control structure and the hash
 * table of AbortingCreate entries, keyed by Oid.
 *
 * Raises ERROR (out of memory) when either shared-memory request fails.
 */
void
Pass2Recovery_ShmemInit(void)
{
	HASHCTL		hashCtl;
	bool		alreadyThere;

	pass2RecoveryHashShmem = (Pass2RecoveryHashShmem_s *)
		ShmemInitStruct("pass2 recovery abortingcreate hash",
						sizeof(Pass2RecoveryHashShmem_s),
						&alreadyThere);

	if (pass2RecoveryHashShmem == NULL)
	{
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 (errmsg("not enough shared memory for pass2 recovery"))));
	}

	/* Only a freshly created control struct gets zeroed. */
	if (!alreadyThere)
	{
		MemSet(pass2RecoveryHashShmem, 0, sizeof(Pass2RecoveryHashShmem_s));
	}

	/* Describe the table: Oid keys, tag_hash hashing. */
	MemSet(&hashCtl, 0, sizeof(hashCtl));
	hashCtl.keysize = sizeof(Oid);
	hashCtl.entrysize = sizeof(Pass2RecoveryHashEntry_s);
	hashCtl.hash = tag_hash;

	pass2RecoveryHashShmem->hash =
		ShmemInitHash("pass2 recovery hash",
					  GP_MAX_PASS2RECOVERY_ABORTINGCREATE,
					  GP_MAX_PASS2RECOVERY_ABORTINGCREATE,
					  &hashCtl,
					  (HASH_ELEM | HASH_FUNCTION));

	if (pass2RecoveryHashShmem->hash == NULL)
	{
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 (errmsg("not enough shared memory for pass2 recovery"))));
	}
}
/*
 * Enter objid into the shared pass 2 recovery hash table.
 *
 * Returns the entry, or NULL when the table could not take a new entry
 * (HASH_ENTER_NULL).  *exists is TRUE only when an entry for objid was
 * already present; it is FALSE for a new entry and for the NULL result.
 */
static Pass2RecoveryHashEntry_s *
Pass2Recovery_InsertHashEntry(
	Oid objid,
	bool *exists)
{
	Pass2RecoveryHashEntry_s *slot;
	bool		wasPresent = false;

	Assert(pass2RecoveryHashShmem->hash != NULL);

	slot = (Pass2RecoveryHashEntry_s *)
		hash_search(pass2RecoveryHashShmem->hash,
					(void *) &objid,
					HASH_ENTER_NULL,
					&wasPresent);

	*exists = (slot != NULL && wasPresent) ? TRUE : FALSE;

	return slot;
}
/*
 * Create the backend-local transaction hash table (xactHashTable), keyed by
 * TransactionId and holding XactEntryData entries.
 */
static void
PersistentRecovery_XactHashTableInit(void)
{
	HASHCTL		hashCtl;

	MemSet(&hashCtl, 0, sizeof(hashCtl));
	hashCtl.keysize = sizeof(TransactionId);
	hashCtl.entrysize = sizeof(XactEntryData);
	hashCtl.hash = tag_hash;

	xactHashTable = hash_create("XactEntry",
								10,
								&hashCtl,
								(HASH_ELEM | HASH_FUNCTION));
}
/*
 * Look up the entry for xid, creating the hash table and/or the entry when
 * they do not exist yet.
 *
 * *found reports whether the entry already existed.  A new entry only has
 * its file-system object list initialized; infoKind is left for the caller
 * to fill in.
 */
static XactEntry PersistentRecovery_FindOrCreateXactEntry(
	TransactionId xid,
	bool *found)
{
	XactEntry	entry;

	/* The table is built lazily on first use. */
	if (xactHashTable == NULL)
		PersistentRecovery_XactHashTableInit();

	entry = (XactEntry) hash_search(xactHashTable,
									(void *) &xid,
									HASH_ENTER,
									found);

	if (*found)
		return entry;

	DoublyLinkedHead_Init(&entry->fsObjEntryList);
	return entry;
}
#if suppress
/*
 * Look up an existing transaction entry without creating one.
 * Returns NULL when xid has no entry.
 *
 * Compiled only when 'suppress' evaluates to non-zero.
 */
static XactEntry PersistentRecovery_FindXactEntry(
	TransactionId xid)
{
	bool		found;
	XactEntry	entry;

	Assert (xactHashTable != NULL);

	entry = (XactEntry) hash_search(xactHashTable,
									(void *) &xid,
									HASH_FIND,
									&found);

	return found ? entry : NULL;
}
#endif
/*
 * Hash key for the per-type file-system object tables: the TID of the
 * persistent table tuple.
 */
typedef struct FsObjEntryKey
{
ItemPointerData persistentTid;
} FsObjEntryKey;
/*
 * What recovery currently knows about one persistent file-system object.
 * Entries live in fsObjHashTable[type] and are created with the defaults set
 * in PersistentRecovery_FindOrCreateFsObjEntry().
 */
typedef struct FsObjEntryData
{
/* Hash key (tables are created with keysize sizeof(FsObjEntryKey)). */
FsObjEntryKey key;
/* Name of the object; zeroed for a freshly created entry. */
PersistentFileSysObjName fsObjName;
/* PersistentFileSysRelStorageMgr_None unless the object is a relation file. */
PersistentFileSysRelStorageMgr relStorageMgr;
/* Serial number of the persistent tuple this entry reflects. */
int64 persistentSerialNum;
/* Current state; set to -1 on a freshly created entry. */
PersistentFileSysState state;
/* Transaction associated with the object, or InvalidTransactionId. */
TransactionId xid;
/* Set when the persistent table scan shows an earlier state than the one
 * already recorded here for the same serial number. */
bool updateNeeded;
/* Initialized to PersistentFileSysObjStateChangeResult_StateChangeOk. */
PersistentFileSysObjStateChangeResult stateChangeResult;
/* Links this entry into XactEntryData.fsObjEntryList. */
DoubleLinks xactLinks;
} FsObjEntryData;
typedef FsObjEntryData *FsObjEntry;
/*
 * Format a human-readable description of fsObjEntry into buffer (at most
 * maxBufferLen bytes, including the terminator) and return buffer.
 *
 * 'Free' entries get a short form; all others include name, state,
 * transaction, storage manager, serial number and TID.
 */
static char *
FsObjEntryToBuffer(char *buffer, int maxBufferLen, FsObjEntry fsObjEntry)
{
	int			written;

	if (fsObjEntry->state == PersistentFileSysState_Free)
	{
		written = snprintf(buffer,
						   maxBufferLen,
						   "Free entry as free number " INT64_FORMAT ", TID %s",
						   fsObjEntry->persistentSerialNum,
						   ItemPointerToString(&fsObjEntry->key.persistentTid));
	}
	else
	{
		written = snprintf(buffer,
						   maxBufferLen,
						   "%s: state '%s' , transaction %u, relation storage manager '%s', persistent serial number " INT64_FORMAT ", TID %s",
						   PersistentFileSysObjName_TypeAndObjectName(&fsObjEntry->fsObjName),
						   PersistentFileSysObjState_Name(fsObjEntry->state),
						   fsObjEntry->xid,
						   PersistentFileSysRelStorageMgr_Name(fsObjEntry->relStorageMgr),
						   fsObjEntry->persistentSerialNum,
						   ItemPointerToString(&fsObjEntry->key.persistentTid));
	}

	/* Truncation or an output error is only caught in assert-enabled builds. */
	Assert(written >= 0);
	Assert(written < maxBufferLen);

	return buffer;
}
#define MAX_FS_OBJ_ENTRY_BUFFER 400

/*
 * Two independent static buffers so that two entries can be formatted for
 * the same log message without one overwriting the other.
 */
static char fsObjEntryBuffer[MAX_FS_OBJ_ENTRY_BUFFER];
static char fsObjEntryBuffer2[MAX_FS_OBJ_ENTRY_BUFFER];

/*
 * Describe fsObjEntry using the first static buffer.  The result is
 * overwritten by the next call to this function.
 */
static char *
FsObjEntryToString(FsObjEntry fsObjEntry)
{
	return FsObjEntryToBuffer(fsObjEntryBuffer,
							  (int) sizeof(fsObjEntryBuffer),
							  fsObjEntry);
}

/*
 * Describe fsObjEntry using the second static buffer.  The result is
 * overwritten by the next call to this function.
 */
static char *
FsObjEntryToString2(FsObjEntry fsObjEntry)
{
	return FsObjEntryToBuffer(fsObjEntryBuffer2,
							  (int) sizeof(fsObjEntryBuffer2),
							  fsObjEntry);
}
/*
 * One hash table per persistent file-system object type, indexed by
 * PersistentFsObjType.  NULL until the tables are first needed.
 */
static HTAB **fsObjHashTable = NULL;

/*
 * Allocate the fsObjHashTable array and create one hash table for every
 * object type from PersistentFsObjType_First to PersistentFsObjType_Last.
 * Each table is keyed by FsObjEntryKey and stores FsObjEntryData entries.
 */
static void
PersistentRecovery_FsObjHashTableInit(void)
{
	HASHCTL		info;
	int			hash_flags;
	PersistentFsObjType fsObjType;

	/*
	 * Large enough for "FsObjEntry_" plus any int value and the terminator;
	 * the name is formatted with a bounded snprintf so it can never overrun.
	 */
	char		name[32];

	fsObjHashTable = (HTAB **) palloc(CountPersistentFsObjType * sizeof(HTAB *));

	/* Set key and entry sizes. */
	MemSet(&info, 0, sizeof(info));
	info.keysize = sizeof(FsObjEntryKey);
	info.entrysize = sizeof(FsObjEntryData);
	info.hash = tag_hash;
	hash_flags = (HASH_ELEM | HASH_FUNCTION);

	for (fsObjType = PersistentFsObjType_First;
		 fsObjType <= PersistentFsObjType_Last;
		 fsObjType++)
	{
		snprintf(name, sizeof(name), "FsObjEntry_%d", (int) fsObjType);
		fsObjHashTable[fsObjType] = hash_create(name, 10, &info, hash_flags);
	}
}
/*
 * Return the entry for persistentTid in the table for fsObjType, creating
 * the tables and/or the entry on demand.
 *
 * *found tells whether the entry already existed.  A new entry is given
 * neutral defaults: zeroed name, no storage manager, no transaction, state
 * -1, serial number 0 and no pending update.
 */
static FsObjEntry PersistentRecovery_FindOrCreateFsObjEntry(
	PersistentFsObjType fsObjType,
	ItemPointer persistentTid,
	bool *found)
{
	FsObjEntryKey lookupKey;
	FsObjEntry	entry;

	if (fsObjHashTable == NULL)
		PersistentRecovery_FsObjHashTableInit();

	/* Zero the whole key first so any padding hashes consistently. */
	MemSet(&lookupKey, 0, sizeof(FsObjEntryKey));
	lookupKey.persistentTid = *persistentTid;

	entry = (FsObjEntry) hash_search(fsObjHashTable[fsObjType],
									 (void *) &lookupKey,
									 HASH_ENTER,
									 found);

	Assert(ItemPointerCompare(&entry->key.persistentTid, persistentTid) == 0);

	if (*found)
		return entry;

	/* Brand new entry: fill in defaults. */
	DoubleLinks_Init(&entry->xactLinks);
	MemSet(&entry->fsObjName, 0, sizeof(PersistentFileSysObjName));
	entry->relStorageMgr = PersistentFileSysRelStorageMgr_None;
	entry->persistentSerialNum = 0;
	entry->state = -1;
	entry->xid = InvalidTransactionId;
	entry->updateNeeded = false;
	entry->stateChangeResult = PersistentFileSysObjStateChangeResult_StateChangeOk;

	return entry;
}
/*
 * Record the outcome of one end-of-transaction file-system action.
 *
 * The entry keyed by the action's persistent TID is created or overwritten
 * with the action's object name, storage manager, serial number, the given
 * transaction and the given new state.  infoKind is only used for the debug
 * message.
 *
 * Raises ERROR when a relation-file action carries an invalid relation
 * storage manager.
 */
static void PersistentRecovery_AddEndXactFsObj(
	TransactionId xid,
	XactInfoKind infoKind,
	PersistentEndXactFileSysActionInfo *fileSysActionInfo,
	PersistentFileSysState state)
{
	FsObjEntry	entry;
	FsObjEntryData before;
	bool		existed;

	entry = PersistentRecovery_FindOrCreateFsObjEntry(
											fileSysActionInfo->fsObjName.type,
											&fileSysActionInfo->persistentTid,
											&existed);

	/* Keep a snapshot of the old contents for the debug message below. */
	if (existed)
		memcpy(&before, entry, sizeof(FsObjEntryData));

	entry->fsObjName = fileSysActionInfo->fsObjName;

	if (fileSysActionInfo->fsObjName.type != PersistentFsObjType_RelationFile)
	{
		/* Only relation files have a storage manager. */
		entry->relStorageMgr = PersistentFileSysRelStorageMgr_None;
	}
	else
	{
		if (!PersistentFileSysRelStorageMgr_IsValid(fileSysActionInfo->relStorageMgr))
			elog(ERROR, "Relation storage manager for persistent '%s' for Crash Recovery is invalid (%d)",
				 PersistentFileSysObjName_TypeAndObjectName(&fileSysActionInfo->fsObjName),
				 fileSysActionInfo->relStorageMgr);

		entry->relStorageMgr = fileSysActionInfo->relStorageMgr;
	}

	entry->persistentSerialNum = fileSysActionInfo->persistentSerialNum;
	entry->xid = xid;
	entry->state = state;

	if (!Debug_persistent_recovery_print)
		return;

	if (existed)
	{
		elog(PersistentRecovery_DebugPrintLevel(),
			 "%s Record (transaction %u): Overwritting %s with %s",
			 XactInfoKind_Name(infoKind),
			 xid,
			 FsObjEntryToString(&before),
			 FsObjEntryToString2(entry));
	}
	else
	{
		elog(PersistentRecovery_DebugPrintLevel(),
			 "%s Record (transaction %u): Adding %s",
			 XactInfoKind_Name(infoKind),
			 xid,
			 FsObjEntryToString(entry));
	}
}
/*
 * Decide whether pass 1 of recovery should replay this XLOG record.
 *
 * Returns true only for data changes to persistent meta-data relations
 * other than gp_global_sequence; returns false for records that carry no
 * relation change and for changes to any other relation.  beginLoc is not
 * used here.
 *
 * Raises ERROR when the record names an empty RelFileNode.
 */
bool
PersistentRecovery_ShouldHandlePass1XLogRec(
	XLogRecPtr *beginLoc,
	XLogRecPtr *lsn,
	XLogRecord *record)
{
	int			changeCount;
	int			arrlen = ChangeTracking_GetInfoArrayDesiredMaxLength(record->xl_rmid,
																	 record->xl_info);
	RelationChangeInfo changes[arrlen];
	RelFileNode *node;
	bool		isPersistentMetaData;

	if (Debug_persistent_recovery_print)
		ChangeTracking_PrintRelationChangeInfo(
										record->xl_rmid,
										record->xl_info,
										(void *) XLogRecGetData(record),
										lsn,
										/* weAreGeneratingXLogNow */ false,
										/* printSkipIssuesOnly */ false);

	/* Extract the relation-level change information, if there is any. */
	ChangeTracking_GetRelationChangeInfoFromXlog(
										record->xl_rmid,
										record->xl_info,
										(void *) XLogRecGetData(record),
										changes,
										&changeCount,
										arrlen);

	/* No relation change at all: not a data change record. */
	if (changeCount == 0)
		return false;

	node = &changes[0].relFileNode;

	if (RelFileNode_IsEmpty(node))
		elog(ERROR, "Invalid RelFileNode (0,0,0)");

	/*
	 * MPP-17181: newly allocated gp_global_sequence numbers are always
	 * synced to disk before the persistent tables use them, so XLOG records
	 * for gp_global_sequence are skipped; replaying stale records could
	 * corrupt a valid on-disk gp_global_sequence relation file.
	 */
	isPersistentMetaData =
		GpPersistent_IsPersistentRelation(node->relNode) &&
		(node->relNode != GpGlobalSequenceRelationId);

	if (!isPersistentMetaData)
	{
		if (Debug_persistent_recovery_print)
			elog(PersistentRecovery_DebugPrintLevel(),
				 "PersistentRecovery_ShouldHandlePass1XLogRec: regular table (returning false) %u/%u/%u",
				 node->spcNode,
				 node->dbNode,
				 node->relNode);
		return false;
	}

	if (Debug_persistent_recovery_print)
		elog(PersistentRecovery_DebugPrintLevel(),
			 "PersistentRecovery_ShouldHandlePass1XLogRec: persistent meta-data (returning true) %u/%u/%u",
			 node->spcNode,
			 node->dbNode,
			 node->relNode);
	return true;
}
/*
 * Pass 2 of recovery: collect end-of-transaction information from one XLOG
 * record.
 *
 * Heap records are only validated (ERROR when no relfilenode can be
 * extracted) and filtered; nothing further is done with them here.
 *
 * Transaction records create or update the transaction's entry with its
 * info kind.  For commit and abort records every listed file-system action
 * is translated into a new persistent state and recorded through
 * PersistentRecovery_AddEndXactFsObj().  Prepare records only leave the
 * transaction entry behind.
 *
 * Records of any other resource manager are ignored.
 */
void
PersistentRecovery_HandlePass2XLogRec(
	XLogRecPtr *beginLoc,
	XLogRecPtr *lsn,
	XLogRecord *record)
{
	RmgrId		rmid = record->xl_rmid;
	TransactionId xid;
	XactEntry	xactEntry;
	bool		found;
	PersistentEndXactRecObjects persistentObjects;
	TransactionId *subXids;
	int			subXidCount;
	XactInfoKind infoKind;
	bool		isCommit;
	int			i;

	if (rmid == RM_HEAP_ID)
	{
		RelFileNode heapNode;

		if (!heap_getrelfilenode(record, &heapNode))
			elog(ERROR, "No relfilenode");

		/* Only the persistent tables in the global tablespace qualify. */
		if (heapNode.spcNode != GLOBALTABLESPACE_OID)
			return;

		if (!(heapNode.relNode == GpPersistentRelfileNodeRelationId ||
			  heapNode.relNode == GpPersistentRelationNodeRelationId ||
			  heapNode.relNode == GpPersistentDatabaseNodeRelationId ||
			  heapNode.relNode == GpPersistentTablespaceNodeRelationId ||
			  heapNode.relNode == GpPersistentFilespaceNodeRelationId))
			return;

		/* Nothing else is done for heap records in this function. */
		return;
	}

	if (rmid != RM_XACT_ID)
		return;

	/*
	 * The XLOG record belongs to the transaction module.
	 */
	if (!xact_redo_get_info(record,
							&infoKind,
							&xid,
							&persistentObjects,
							&subXids,
							&subXidCount))
		return;

	if (Debug_persistent_recovery_print)
		elog(PersistentRecovery_DebugPrintLevel(),
			 "%s Record (transaction %u)",
			 XactInfoKind_Name(infoKind),
			 xid);

	/*
	 * Committed, aborted and prepared transactions are remembered so that
	 * 'Create Pending' objects can be attached to them instead of being
	 * aborted automatically by crash recovery.
	 */
	xactEntry = PersistentRecovery_FindOrCreateXactEntry(xid, &found);
	xactEntry->infoKind = infoKind;

	if (infoKind != XACT_INFOKIND_COMMIT && infoKind != XACT_INFOKIND_ABORT)
	{
		Assert(infoKind == XACT_INFOKIND_PREPARE);

		/*
		 * It is not known yet whether this transaction will commit or abort,
		 * so only the transaction entry is kept.
		 */
		return;
	}

	isCommit = (infoKind == XACT_INFOKIND_COMMIT);

	for (i = 0; i < persistentObjects.typed.fileSysActionInfosCount; i++)
	{
		PersistentEndXactFileSysActionInfo *actionInfo =
			&persistentObjects.typed.fileSysActionInfos[i];
		PersistentEndXactFileSysAction action = actionInfo->action;
		PersistentFileSysState newState;

		if (isCommit)
		{
			switch (action)
			{
				case PersistentEndXactFileSysAction_Create:
					newState = PersistentFileSysState_Created;
					break;

				case PersistentEndXactFileSysAction_Drop:
					newState = PersistentFileSysState_DropPending;
					break;

				case PersistentEndXactFileSysAction_AbortingCreateNeeded:
					newState = PersistentFileSysState_AbortingCreate;
					break;

				default:
					elog(ERROR, "Unexpected persistent end transaction file-system action: %d",
						 action);
					newState = PersistentFileSysState_Free;	/* Not reached. */
					break;
			}
		}
		else
		{
			/* A drop requested by an aborted transaction never happens. */
			if (action == PersistentEndXactFileSysAction_Drop)
				continue;

			switch (action)
			{
				case PersistentEndXactFileSysAction_Create:
				case PersistentEndXactFileSysAction_AbortingCreateNeeded:
					newState = PersistentFileSysState_AbortingCreate;
					break;

				default:
					elog(ERROR, "Unexpected persistent end transaction file-system action: %d",
						 action);
					newState = PersistentFileSysState_Free;	/* Not reached. */
					break;
			}

			/*
			 * MPP-16881: remember the AbortingCreate objid in the shared
			 * memory hash table; pass 3 recovery uses it to clean up
			 * gp_fastsequence.
			 */
			if (actionInfo->fsObjName.type == PersistentFsObjType_RelationFile)
			{
				bool		exists;
				Pass2RecoveryHashEntry_s *hashEntry;
				RelFileNode *relFileNode =
					PersistentFileSysObjName_GetRelFileNodePtr(&actionInfo->fsObjName);

				hashEntry = Pass2Recovery_InsertHashEntry(relFileNode->relNode,
														  &exists);
				if (hashEntry == NULL)
					elog(WARNING,
						 "Pass2Recovery_InsertHashEntry"
						 " failed to insert AbortingCreate entry into"
						 " shared memory hash table, there might be"
						 " entries in gp_fastsequence left uncleaned,"
						 " it could cause inconsistency between"
						 " pg_class and gp_fastsequence.");
			}
		}

		PersistentRecovery_AddEndXactFsObj(xid,
										   infoKind,
										   actionInfo,
										   newState);
	}
}
/*
 * Decide whether pass 3 of recovery should replay this XLOG record.
 *
 * Returns true for:
 *   - records without relation change information, unless they are storage
 *     manager records for which smgrgetpersistentinfo() supplies persistent
 *     information (those are qualified further, see below);
 *   - relations for which GpPersistent_SkipXLogInfo() is true;
 *   - relations that PersistentRecovery_RedoRelationExists() reports as
 *     existing.
 * Returns false for persistent meta-data relations and for relations that
 * no longer exist (dropped, or their create was aborted).
 *
 * beginLoc is not used here.
 */
bool
PersistentRecovery_ShouldHandlePass3XLogRec(
	XLogRecPtr *beginLoc,
	XLogRecPtr *lsn,
	XLogRecord *record)
{
	int			relationChangeInfoArrayCount;
	int			arrlen = ChangeTracking_GetInfoArrayDesiredMaxLength(record->xl_rmid,
																	 record->xl_info);
	RelationChangeInfo relationChangeInfoArray[arrlen];
	RelFileNode xlogRelFileNode;
	ItemPointerData xlogPersistentTid;
	int64		xlogPersistentSerialNum = 0;
	bool		exists;

	/*
	 * Start from well-defined values.  When smgrgetpersistentinfo() reports
	 * failure below, the debug message still prints these three variables;
	 * should the helper leave its outputs untouched in that case, the
	 * message would otherwise read indeterminate stack contents.
	 */
	MemSet(&xlogRelFileNode, 0, sizeof(xlogRelFileNode));
	MemSet(&xlogPersistentTid, 0, sizeof(xlogPersistentTid));

	if (Debug_persistent_recovery_print)
		ChangeTracking_PrintRelationChangeInfo(
										record->xl_rmid,
										record->xl_info,
										(void *) XLogRecGetData(record),
										lsn,
										/* weAreGeneratingXLogNow */ false,
										/* printSkipIssuesOnly */ false);

	/*
	 * Gather vital persistence information from the XLOG record about
	 * relations ONLY.
	 */
	ChangeTracking_GetRelationChangeInfoFromXlog(
										record->xl_rmid,
										record->xl_info,
										(void *) XLogRecGetData(record),
										relationChangeInfoArray,
										&relationChangeInfoArrayCount,
										arrlen);

	if (relationChangeInfoArrayCount == 0)
	{
		/*
		 * Special case truncate because it is not considered a FileRep
		 * "page change"....
		 */
		if (record->xl_rmid == RM_SMGR_ID)
		{
			if (!smgrgetpersistentinfo(
									   record,
									   &xlogRelFileNode,
									   &xlogPersistentTid,
									   &xlogPersistentSerialNum))
			{
				if (Debug_persistent_recovery_print)
					elog(PersistentRecovery_DebugPrintLevel(),
						 "PersistentRecovery_ShouldHandlePass3XLogRec: truncate %u/%u/%u, serial number " INT64_FORMAT ", TID %s",
						 xlogRelFileNode.spcNode,
						 xlogRelFileNode.dbNode,
						 xlogRelFileNode.relNode,
						 xlogPersistentSerialNum,
						 ItemPointerToString(&xlogPersistentTid));
				return true;
			}
			/* Otherwise, fall through with persistent information. */
		}
		else
		{
			/*
			 * A non-data change XLOG record.
			 */
			if (Debug_persistent_recovery_print)
				elog(PersistentRecovery_DebugPrintLevel(),
					 "PersistentRecovery_ShouldHandlePass3XLogRec: non-data change XLOG record");
			return true;
		}
	}
	else
	{
		/* Qualify the record by the first relation it changes. */
		xlogRelFileNode = relationChangeInfoArray[0].relFileNode;
		xlogPersistentTid = relationChangeInfoArray[0].persistentTid;
		xlogPersistentSerialNum = relationChangeInfoArray[0].persistentSerialNum;
	}

	if (GpPersistent_IsPersistentRelation(xlogRelFileNode.relNode))
	{
		if (Debug_persistent_recovery_print)
			elog(PersistentRecovery_DebugPrintLevel(),
				 "PersistentRecovery_ShouldHandlePass3XLogRec: skip persistent meta-data %u/%u/%u",
				 xlogRelFileNode.spcNode,
				 xlogRelFileNode.dbNode,
				 xlogRelFileNode.relNode);
		return false;
	}

	if (GpPersistent_SkipXLogInfo(xlogRelFileNode.relNode))
	{
		if (Debug_persistent_recovery_print)
			elog(PersistentRecovery_DebugPrintLevel(),
				 "PersistentRecovery_ShouldHandlePass3XLogRec: other special relation %u/%u/%u",
				 xlogRelFileNode.spcNode,
				 xlogRelFileNode.dbNode,
				 xlogRelFileNode.relNode);
		return true;
	}

	/*
	 * Further qualify using the RelFileNode.
	 */
	exists = PersistentRecovery_RedoRelationExists(
												   &xlogPersistentTid,
												   xlogPersistentSerialNum,
												   &xlogRelFileNode);

	if (exists)
	{
		if (Debug_persistent_recovery_print)
			elog(PersistentRecovery_DebugPrintLevel(),
				 "PersistentRecovery_ShouldHandlePass3XLogRec: match %u/%u/%u, serial number " INT64_FORMAT ", TID %s",
				 xlogRelFileNode.spcNode,
				 xlogRelFileNode.dbNode,
				 xlogRelFileNode.relNode,
				 xlogPersistentSerialNum,
				 ItemPointerToString(&xlogPersistentTid));
	}
	else
	{
		if (Debug_persistent_recovery_print)
			elog(PersistentRecovery_DebugPrintLevel(),
				 "PersistentRecovery_ShouldHandlePass3XLogRec: relation has been dropped or the create aborted %u/%u/%u, serial number " INT64_FORMAT ", TID %s",
				 xlogRelFileNode.spcNode,
				 xlogRelFileNode.dbNode,
				 xlogRelFileNode.relNode,
				 xlogPersistentSerialNum,
				 ItemPointerToString(&xlogPersistentTid));
	}

	return exists;
}
static void
PersistentRecovery_PrintXactAndFsObjs(void)
{
HASH_SEQ_STATUS iterateStatus;
XactEntry xactEntry;
FsObjEntry fsObjEntry;
elog(PersistentRecovery_DebugPrintLevel(),
"Entering PersistentRecovery_PrintXactAndFsObjs");
if (xactHashTable == NULL)
{
elog(PersistentRecovery_DebugPrintLevel(),
"Entering PersistentRecovery_PrintXactAndFsObjs -- no entries");
return;
}
hash_seq_init(&iterateStatus, xactHashTable);
while (true)
{
xactEntry =
(XactEntry)
hash_seq_search(&iterateStatus);
if (xactEntry == NULL)
break;
elog(PersistentRecovery_DebugPrintLevel(),
"Summary REDO: Transaction %u (%s)",
xactEntry->xid,
XactInfoKind_Name(xactEntry->infoKind));
fsObjEntry =
(FsObjEntry) DoublyLinkedHead_First(
offsetof(FsObjEntryData, xactLinks),
&xactEntry->fsObjEntryList);
while (fsObjEntry != NULL)
{
elog(PersistentRecovery_DebugPrintLevel(),
"Summary REDO: %s",
FsObjEntryToString(fsObjEntry));
fsObjEntry =
(FsObjEntry) DoublyLinkedHead_Next(
offsetof(FsObjEntryData, xactLinks),
&xactEntry->fsObjEntryList,
fsObjEntry);
}
}
elog(PersistentRecovery_DebugPrintLevel(),
"Exiting PersistentRecovery_PrintXactAndFsObjs");
}
// -----------------------------------------------------------------------------
/*
 * Merge one tuple found while scanning a persistent table into the
 * in-memory file-system object entries.
 *
 *  - A 'Free' tuple marks the entry free and clears any pending update.
 *  - A tuple with no existing entry simply creates one from its values.
 *  - With an existing entry of the same serial number, a later tuple state
 *    replaces the entry's state and transaction, while an earlier tuple
 *    state marks the entry as needing an update.
 *  - A tuple with a higher serial number replaces the (obsolete) entry.
 *  - A tuple with a lower serial number is only reported when debug printing
 *    is enabled.
 */
static void
PersistentRecovery_AddScanEntry(
	PersistentFsObjType fsObjType,
	ItemPointer persistentTid,
	PersistentFileSysObjName *fsObjName,
	PersistentFileSysRelStorageMgr relStorageMgr,
	PersistentFileSysState state,
	TransactionId parentXid,
	int64 persistentSerialNum)
{
	FsObjEntry	entry;
	bool		existed;

	entry = PersistentRecovery_FindOrCreateFsObjEntry(fsObjType,
													  persistentTid,
													  &existed);

	if (state == PersistentFileSysState_Free)
	{
		if (Debug_persistent_recovery_print)
		{
			if (existed)
				elog(PersistentRecovery_DebugPrintLevel(),
					 "Scan REDO: Overwriting %s as free",
					 FsObjEntryToString(entry));
			else
				elog(PersistentRecovery_DebugPrintLevel(),
					 "Scan REDO: Free entry with no end transaction work: %s",
					 FsObjEntryToString(entry));
		}

		entry->state = PersistentFileSysState_Free;
		entry->updateNeeded = false;	/* Already in terminal condition. */
		return;
	}

	if (!existed)
	{
		/* First sighting of this object: take the tuple's values as-is. */
		entry->fsObjName = *fsObjName;
		entry->relStorageMgr = relStorageMgr;
		entry->persistentSerialNum = persistentSerialNum;
		entry->xid = parentXid;
		entry->state = state;

		if (Debug_persistent_recovery_print)
			elog(PersistentRecovery_DebugPrintLevel(),
				 "Scan REDO: Add %s", FsObjEntryToString(entry));
		return;
	}

	if (entry->persistentSerialNum > persistentSerialNum)
	{
		/* The entry is newer than the tuple; just report it. */
		if (Debug_persistent_recovery_print)
			elog(PersistentRecovery_DebugPrintLevel(),
				 "Scan REDO: Wrong state %s", FsObjEntryToString(entry));
		return;
	}

	if (entry->persistentSerialNum < persistentSerialNum)
	{
		/* The entry describes an older incarnation: replace it wholesale. */
		if (Debug_persistent_recovery_print)
			elog(PersistentRecovery_DebugPrintLevel(),
				 "Scan REDO: Overwriting obsolete %s with '%s', relation storage manager '%s', state '%s', transaction %u, serial number " INT64_FORMAT,
				 FsObjEntryToString(entry),
				 PersistentFileSysObjName_TypeAndObjectName(fsObjName),
				 PersistentFileSysRelStorageMgr_Name(relStorageMgr),
				 PersistentFileSysObjState_Name(state),
				 parentXid,
				 persistentSerialNum);

		entry->fsObjName = *fsObjName;
		entry->relStorageMgr = relStorageMgr;
		entry->state = state;
		entry->xid = parentXid;
		entry->persistentSerialNum = persistentSerialNum;
		return;
	}

	/* Same serial number: compare the states. */
	if (entry->state < state)
	{
		if (Debug_persistent_recovery_print)
			elog(PersistentRecovery_DebugPrintLevel(),
				 "Scan REDO: Newer %s than %s state, transaction %u",
				 FsObjEntryToString(entry),
				 PersistentFileSysObjState_Name(state),
				 parentXid);

		entry->state = state;
		entry->xid = parentXid;
	}
	else if (entry->state > state)
	{
		if (Debug_persistent_recovery_print)
			elog(PersistentRecovery_DebugPrintLevel(),
				 "Scan REDO: Update %s to state '%s', transaction %u, serial number " INT64_FORMAT,
				 FsObjEntryToString(entry),
				 PersistentFileSysObjState_Name(state),
				 parentXid,
				 persistentSerialNum);

		entry->updateNeeded = true;
	}

	/*
	 * Equal states need no action.  (Note carried over from the original
	 * code: during crash recovery, when dropping objects, those whose mirror
	 * existence state is MirrorDropRemains should be skipped; they are
	 * dropped during resync.)
	 */
}
/* Number of tuples processed by the persistent table scan in progress. */
static int persistentRecoveryCount = 0;

/*
 * Process one tuple of a persistent table: decode the common columns,
 * determine the relation storage manager (only meaningful for non-free
 * relation-file tuples), hand the result to PersistentRecovery_AddScanEntry()
 * and bump the tuple counter.
 *
 * Raises ERROR when a relation-file tuple carries an invalid relation
 * storage manager.
 */
static void PersistentRecovery_ScanTuple(
	PersistentFsObjType fsObjType,
	ItemPointer persistentTid,
	int64 persistentSerialNum,
	Datum *values)
{
	PersistentFileSysObjName objName;
	PersistentFileSysState tupleState;
	PersistentFileSysRelStorageMgr storageMgr;
	TransactionId tupleXid;
	int64		serialNum;
	bool		isLiveRelationFile;

	GpPersistent_GetCommonValues(fsObjType,
								 values,
								 &objName,
								 &tupleState,
								 &tupleXid,
								 &serialNum);

	Assert(serialNum == persistentSerialNum);

	isLiveRelationFile = (tupleState != PersistentFileSysState_Free &&
						  fsObjType == PersistentFsObjType_RelationFile);

	if (!isLiveRelationFile)
	{
		/* 'Free' entry or not a 'Relation File'. */
		storageMgr = PersistentFileSysRelStorageMgr_None;
	}
	else
	{
		storageMgr = (PersistentFileSysRelStorageMgr)
			DatumGetInt16(
				values[Anum_gp_persistent_relfile_node_relation_storage_manager - 1]);

		if (!PersistentFileSysRelStorageMgr_IsValid(storageMgr))
			elog(ERROR, "Relation storage manager for persistent '%s' for Crash Recovery is invalid (%d)",
				 PersistentFileSysObjName_TypeAndObjectName(&objName),
				 storageMgr);
	}

	PersistentRecovery_AddScanEntry(fsObjType,
									persistentTid,
									&objName,
									storageMgr,
									tupleState,
									tupleXid,
									persistentSerialNum);

	persistentRecoveryCount++;
}
/*
 * Scan the whole persistent table for one file-system object type and feed
 * every tuple to PersistentRecovery_ScanTuple().  The tuple counter is reset
 * at the start of the scan.
 */
static void
PersistentRecovery_ScanType(
	PersistentFsObjType fsObjType)
{
	PersistentFileSysObjData *objData;
	PersistentFileSysObjSharedData *objSharedData;
	PersistentStoreScan scan;
	Datum	   *columns;
	ItemPointerData tid;
	int64		serialNum;

	if (Debug_persistent_recovery_print)
		elog(PersistentRecovery_DebugPrintLevel(),
			 "Entering PersistentRecovery_ScanType %s",
			 PersistentFileSysObjName_TypeName(fsObjType));

	persistentRecoveryCount = 0;

	PersistentFileSysObj_GetDataPtrs(fsObjType,
									 &objData,
									 &objSharedData);

	/* One Datum slot per attribute of this persistent table. */
	columns = (Datum *) palloc(objData->storeData.numAttributes * sizeof(Datum));

	PersistentStore_BeginScan(&objData->storeData,
							  &objSharedData->storeSharedData,
							  &scan);

	while (PersistentStore_GetNext(&scan, columns, &tid, &serialNum))
	{
		PersistentRecovery_ScanTuple(fsObjType,
									 &tid,
									 serialNum,
									 columns);
	}

	pfree(columns);

	if (Debug_persistent_recovery_print)
		elog(PersistentRecovery_DebugPrintLevel(),
			 "Exiting PersistentRecovery_ScanType %s, count %d",
			 PersistentFileSysObjName_TypeName(fsObjType),
			 persistentRecoveryCount);
}
/*
 * Scan the persistent tables of all file-system object types, from
 * PersistentFsObjType_First through PersistentFsObjType_Last.
 */
void
PersistentRecovery_Scan(void)
{
	PersistentFsObjType objType;

	if (Debug_persistent_recovery_print)
		elog(PersistentRecovery_DebugPrintLevel(),
			 "Entering PersistentRecovery_Scan");

	objType = PersistentFsObjType_First;
	while (objType <= PersistentFsObjType_Last)
	{
		PersistentRecovery_ScanType(objType);
		objType++;
	}

	if (Debug_persistent_recovery_print)
		elog(PersistentRecovery_DebugPrintLevel(),
			 "Exiting PersistentRecovery_Scan");
}
/*
 * Attach every file-system object entry of the given type that carries a
 * valid transaction id to that transaction's entry.  Transactions first
 * seen here (i.e. not known from XLOG) get info kind XACT_INFOKIND_NONE.
 */
static void
PersistentRecovery_AttachFsObjTypeToXact(
	PersistentFsObjType fsObjType)
{
	HASH_SEQ_STATUS scan;
	FsObjEntry	obj;

	Assert (fsObjHashTable != NULL);

	hash_seq_init(&scan, fsObjHashTable[fsObjType]);
	while ((obj = (FsObjEntry) hash_seq_search(&scan)) != NULL)
	{
		XactEntry	xact;
		bool		known;

		/* Objects without a transaction have nothing to attach to. */
		if (obj->xid == InvalidTransactionId)
			continue;

		xact = PersistentRecovery_FindOrCreateXactEntry(obj->xid, &known);
		if (!known)
			xact->infoKind = XACT_INFOKIND_NONE;

		DoublyLinkedHead_AddFirst(offsetof(FsObjEntryData, xactLinks),
								  &xact->fsObjEntryList,
								  obj);
	}
}
/*
 * Argument block for the error context callback below.
 */
typedef struct RecordCrashTransactionAbortRecordErrContext
{
	TransactionId xid;			/* the crashed transaction */
	PersistentEndXactFileSysActionInfo *fileSysActionInfos; /* its action infos */
	int			createPendingCount; /* number of 'Create Pending' objects */
} RecordCrashTransactionAbortRecordErrContext;

/*
 * Error context callback: adds a line naming the crashed transaction, the
 * number of its 'Create Pending' objects and the first of those objects.
 * arg must point to a RecordCrashTransactionAbortRecordErrContext with at
 * least one action info.
 */
static void PersistentRecovery_RecordCrashTransactionAbortRecordErrContext(void *arg)
{
	RecordCrashTransactionAbortRecordErrContext *ctx =
		(RecordCrashTransactionAbortRecordErrContext *) arg;
	PersistentEndXactFileSysActionInfo *first;

	Assert(ctx->createPendingCount > 0);

	first = ctx->fileSysActionInfos;

	errcontext(
			   "Record abort transaction record for crashed transaction %u with %d 'Create Pending' file-system objects "
			   "(first file-system object %s, persistent serial number " INT64_FORMAT ", TID %s)",
			   ctx->xid,
			   ctx->createPendingCount,
			   PersistentFileSysObjName_TypeAndObjectName(&first->fsObjName),
			   first->persistentSerialNum,
			   ItemPointerToString(&first->persistentTid));
}
static void
PersistentRecovery_HandlePrepareBeforeCheckpoint(void)
{
HASH_SEQ_STATUS iterateStatus;
XactEntry xactEntry;
if (Debug_persistent_recovery_print)
{
elog(PersistentRecovery_DebugPrintLevel(),
"Entering PersistentRecovery_HandlePrepareBeforeCheckpoint");
}
if (xactHashTable == NULL)
{
if (Debug_persistent_recovery_print)
{
elog(PersistentRecovery_DebugPrintLevel(),
"Entering PersistentRecovery_HandlePrepareBeforeCheckpoint -- no entries");
}
return;
}
hash_seq_init(&iterateStatus, xactHashTable);
while (true)
{
xactEntry =
(XactEntry)
hash_seq_search(&iterateStatus);
if (xactEntry == NULL)
break;
if (xactEntry->infoKind == XACT_INFOKIND_NONE)
{
XLogRecPtr preparedLoc;
if (TwoPhaseFindRecoverPostCheckpointPreparedTransactionsMapEntry(
xactEntry->xid,
&preparedLoc,
"PersistentRecovery_HandlePrepareBeforeCheckpoint"))
{
xactEntry->infoKind = XACT_INFOKIND_PREPARE;
if (Debug_persistent_recovery_print)
{
elog(PersistentRecovery_DebugPrintLevel(),
"Prepare Before Checkpoint: Transaction %u (None) found and marked prepared (location %s)",
xactEntry->xid,
XLogLocationToString(&preparedLoc));
}
}
else
{
if (Debug_persistent_recovery_print)
{
elog(PersistentRecovery_DebugPrintLevel(),
"Prepare Before Checkpoint: Transaction %u (None) not found in checkpoint prepared list",
xactEntry->xid);
}
}
}
else
{
if (Debug_persistent_recovery_print)
{
elog(PersistentRecovery_DebugPrintLevel(),
"Prepare Before Checkpoint: Transaction %u (%s)",
xactEntry->xid,
XactInfoKind_Name(xactEntry->infoKind));
}
}
}
if (Debug_persistent_recovery_print)
{
elog(PersistentRecovery_DebugPrintLevel(),
"Exiting PersistentRecovery_HandlePrepareBeforeCheckpoint");
}
}
/*
 * PersistentRecovery_CrashAbort
 *
 * Walk every transaction in xactHashTable and, for those whose infoKind is
 * still XACT_INFOKIND_NONE, inspect the persistent file-system objects
 * attached to them.  When a transaction owns objects in 'Create Pending' or
 * 'Bulk Load Create Pending' state, a crash transaction abort record is
 * written through RecordCrashTransactionAbortRecord() and, if that call
 * reports success, those objects are switched in memory to 'Aborting Create'
 * with updateNeeded set (PersistentRecovery_UpdateType acts on that flag).
 *
 * Returns immediately when fsObjHashTable has not been set up.
 */
void
PersistentRecovery_CrashAbort(void)
{
PersistentFsObjType fsObjType;
HASH_SEQ_STATUS iterateStatus;
FsObjEntry fsObjEntry;
XactEntry xactEntry;
int xactCount;
ErrorContextCallback errcontext;
RecordCrashTransactionAbortRecordErrContext recordCrashTransactionAbortRecordErrContext;
if (Debug_persistent_recovery_print)
elog(PersistentRecovery_DebugPrintLevel(),
"Entering PersistentRecovery_CrashAbort");
/* Nothing was collected, so there is nothing to abort. */
if (fsObjHashTable == NULL)
{
if (Debug_persistent_recovery_print)
elog(PersistentRecovery_DebugPrintLevel(),
"Exiting PersistentRecovery_CrashAbort (no relation hash table)");
return;
}
/* Attach the objects of every type to their transactions. */
for (fsObjType = PersistentFsObjType_First;
fsObjType <= PersistentFsObjType_Last;
fsObjType++)
PersistentRecovery_AttachFsObjTypeToXact(fsObjType);
PersistentRecovery_HandlePrepareBeforeCheckpoint();
if (Debug_persistent_recovery_print)
{
PersistentRecovery_PrintXactAndFsObjs();
}
hash_seq_init(&iterateStatus, xactHashTable);
xactCount = 0;
while (true)
{
bool needsAbort;
int createPendingCount;
int e;
PersistentEndXactFileSysActionInfo *fileSysActionInfos;
PersistentEndXactRecObjects persistentObjects;
bool abortSuccessfullyRecorded;
xactEntry =
(XactEntry)
hash_seq_search(&iterateStatus);
if (xactEntry == NULL)
break;
xactCount++;
if (xactEntry->infoKind != XACT_INFOKIND_NONE)
{
Assert(xactEntry->infoKind == XACT_INFOKIND_PREPARE ||
xactEntry->infoKind == XACT_INFOKIND_COMMIT ||
xactEntry->infoKind == XACT_INFOKIND_ABORT);
/*
 * If it is Prepared, we don't know yet what to do with the transaction.
 *
 * If it is known Committed or Aborted because we found an XLOG Commit
 * or Abort, we may have updates that will be handled later.
 */
if (Debug_persistent_recovery_print)
elog(PersistentRecovery_DebugPrintLevel(),
"PersistentRecovery_CrashAbort: Skipping transaction %u (state '%s')",
xactEntry->xid,
XactInfoKind_Name(xactEntry->infoKind));
continue;
}
/*
 * We don't know the status of the transaction until we examine the
 * persistent node states.
 */
if (Debug_persistent_recovery_print)
elog(PersistentRecovery_DebugPrintLevel(),
"PersistentRecovery_CrashAbort: Checking transaction %u (state '%s')",
xactEntry->xid,
XactInfoKind_Name(xactEntry->infoKind));
/*
 * Pass 1: count the create-pending objects of this transaction.  A
 * non-Free object in any other state ends the scan, and is an error if a
 * create-pending object was already seen before it.
 */
needsAbort = false;
fsObjEntry =
(FsObjEntry) DoublyLinkedHead_First(
offsetof(FsObjEntryData, xactLinks),
&xactEntry->fsObjEntryList);
createPendingCount = 0;
while (fsObjEntry != NULL)
{
if (fsObjEntry->state == PersistentFileSysState_BulkLoadCreatePending ||
fsObjEntry->state == PersistentFileSysState_CreatePending)
{
needsAbort = true;
if (Debug_persistent_recovery_print)
elog(PersistentRecovery_DebugPrintLevel(),
"Needs-Abort REDO: (New state 'Aborting Create') %s",
FsObjEntryToString(fsObjEntry));
createPendingCount++;
}
else if (fsObjEntry->state != PersistentFileSysState_Free)
{
if (needsAbort)
elog(ERROR, "Found persistent '%s' entry among 'Create Pending' entries",
FsObjEntryToString(fsObjEntry));
break;
}
fsObjEntry =
(FsObjEntry) DoublyLinkedHead_Next(
offsetof(FsObjEntryData, xactLinks),
&xactEntry->fsObjEntryList,
fsObjEntry);
}
if (!needsAbort)
{
if (Debug_persistent_recovery_print)
elog(PersistentRecovery_DebugPrintLevel(),
"PersistentRecovery_CrashAbort: Skipping transaction %u due to no 'Create Pending' objects",
xactEntry->xid);
Assert(createPendingCount == 0);
continue;
}
if (Debug_persistent_recovery_print)
elog(PersistentRecovery_DebugPrintLevel(),
"PersistentRecovery_CrashAbort: Found %d 'Create Pending' persistent file-system objects for transaction %u",
createPendingCount,
xactEntry->xid);
/*
 * Pass 2: build one 'Create' action info per create-pending object; the
 * array is sized from the count gathered in pass 1.
 */
fileSysActionInfos =
(PersistentEndXactFileSysActionInfo*)
palloc0(createPendingCount * sizeof(PersistentEndXactFileSysActionInfo));
fsObjEntry =
(FsObjEntry) DoublyLinkedHead_First(
offsetof(FsObjEntryData, xactLinks),
&xactEntry->fsObjEntryList);
e = 0;
while (fsObjEntry != NULL)
{
if (fsObjEntry->state == PersistentFileSysState_BulkLoadCreatePending ||
fsObjEntry->state == PersistentFileSysState_CreatePending)
{
Assert(e < createPendingCount);
fileSysActionInfos[e].action = PersistentEndXactFileSysAction_Create;
fileSysActionInfos[e].fsObjName = fsObjEntry->fsObjName;
fileSysActionInfos[e].relStorageMgr = fsObjEntry->relStorageMgr;
fileSysActionInfos[e].persistentTid = fsObjEntry->key.persistentTid;
fileSysActionInfos[e].persistentSerialNum = fsObjEntry->persistentSerialNum;
e++;
}
fsObjEntry =
(FsObjEntry) DoublyLinkedHead_Next(
offsetof(FsObjEntryData, xactLinks),
&xactEntry->fsObjEntryList,
fsObjEntry);
}
PersistentEndXactRec_Init(&persistentObjects);
PersistentEndXactRec_AddFileSysActionInfos(
&persistentObjects,
EndXactRecKind_Abort,
fileSysActionInfos,
createPendingCount);
/* Setup error traceback support for ereport() */
recordCrashTransactionAbortRecordErrContext.xid = xactEntry->xid;
recordCrashTransactionAbortRecordErrContext.fileSysActionInfos = fileSysActionInfos;
recordCrashTransactionAbortRecordErrContext.createPendingCount = createPendingCount;
errcontext.callback = PersistentRecovery_RecordCrashTransactionAbortRecordErrContext;
errcontext.arg = (void *) &recordCrashTransactionAbortRecordErrContext;
errcontext.previous = error_context_stack;
error_context_stack = &errcontext;
abortSuccessfullyRecorded =
RecordCrashTransactionAbortRecord(
xactEntry->xid,
&persistentObjects);
/* Pop the error context stack */
error_context_stack = errcontext.previous;
/* The error context callback no longer references the array; release it. */
pfree(fileSysActionInfos);
/*
 * Pass 3: move the create-pending objects to 'Aborting Create', but only
 * when the abort record was recorded.  Any state other than create-pending
 * or Free is an error at this point.
 */
fsObjEntry =
(FsObjEntry) DoublyLinkedHead_First(
offsetof(FsObjEntryData, xactLinks),
&xactEntry->fsObjEntryList);
while (fsObjEntry != NULL)
{
if (fsObjEntry->state == PersistentFileSysState_BulkLoadCreatePending ||
fsObjEntry->state == PersistentFileSysState_CreatePending)
{
/* NOTE(review): this message is emitted even when the state is not changed below. */
if (Debug_persistent_recovery_print)
elog(PersistentRecovery_DebugPrintLevel(),
"PersistentRecovery_CrashAbort: Set state of %s to 'Aborting Create'",
FsObjEntryToString(fsObjEntry));
if (abortSuccessfullyRecorded)
{
fsObjEntry->updateNeeded = true;
fsObjEntry->state = PersistentFileSysState_AbortingCreate;
if (fsObjEntry->fsObjName.type == PersistentFsObjType_RelationFile)
{
bool exists;
Pass2RecoveryHashEntry_s *entry;
RelFileNode *relFileNode =
PersistentFileSysObjName_GetRelFileNodePtr(
&fsObjEntry->fsObjName);
/* Register the relNode with the pass-2 recovery hash table; a failure is only warned about. */
entry = Pass2Recovery_InsertHashEntry(
relFileNode->relNode,
&exists);
if (entry == NULL)
elog(WARNING,
"Pass2Recovery_InsertHashEntry"
" failed to insert AbortingCreate entry into"
" shared memory hash table, there might be"
" entries in gp_fastsequence left uncleaned,"
" it could cause inconsistency between"
" pg_class and gp_fastsequence.");
}
}
else
{
elog(WARNING, "Not setting the state of '%s' to 'Aborting Create'",
FsObjEntryToString(fsObjEntry));
}
}
else if (fsObjEntry->state != PersistentFileSysState_Free)
elog(ERROR, "Unexpected persistent file-system state: %d",
fsObjEntry->state);
fsObjEntry =
(FsObjEntry) DoublyLinkedHead_Next(
offsetof(FsObjEntryData, xactLinks),
&xactEntry->fsObjEntryList,
fsObjEntry);
}
}
if (Debug_persistent_recovery_print)
elog(PersistentRecovery_DebugPrintLevel(),
"Exiting PersistentRecovery_CrashAbort (transaction count %d)", xactCount);
}
/*
 * PersistentRecovery_UpdateType
 *
 * Scan the hash table for one file-system object type and, for each entry
 * flagged updateNeeded, invoke the persistent-store routine that matches the
 * entry's in-memory state (Created, AbortingCreate or DropPending).  For the
 * latter two the routine's result is kept in stateChangeResult.  Any other
 * state raises an ERROR.  persistentRecoveryCount is reset and then counts
 * the entries processed.
 */
static void
PersistentRecovery_UpdateType(
	PersistentFsObjType fsObjType)
{
	HASH_SEQ_STATUS scan;
	FsObjEntry	entry;

	if (Debug_persistent_recovery_print)
		elog(PersistentRecovery_DebugPrintLevel(),
			 "Entering PersistentRecovery_UpdateType %s",
			 PersistentFileSysObjName_TypeName(fsObjType));

	hash_seq_init(&scan, fsObjHashTable[fsObjType]);
	persistentRecoveryCount = 0;

	while ((entry = (FsObjEntry) hash_seq_search(&scan)) != NULL)
	{
		PersistentFileSysState pendingState;

		/* Entries without a pending change are left alone. */
		if (!entry->updateNeeded)
			continue;

		persistentRecoveryCount++;

		if (Debug_persistent_recovery_print)
			elog(PersistentRecovery_DebugPrintLevel(),
				 "End-Xact-Update-Needed REDO: %s", FsObjEntryToString(entry));

		pendingState = entry->state;
		switch (pendingState)
		{
			case PersistentFileSysState_Created:
				PersistentFileSysObj_Created(&entry->fsObjName,
											 &entry->key.persistentTid,
											 entry->persistentSerialNum,
											 /* retryPossible */ false);
				break;

			case PersistentFileSysState_AbortingCreate:
				entry->stateChangeResult =
					PersistentFileSysObj_MarkAbortingCreate(&entry->fsObjName,
															&entry->key.persistentTid,
															entry->persistentSerialNum,
															/* retryPossible */ false);
				break;

			case PersistentFileSysState_DropPending:
				entry->stateChangeResult =
					PersistentFileSysObj_MarkDropPending(&entry->fsObjName,
														 &entry->key.persistentTid,
														 entry->persistentSerialNum,
														 /* retryPossible */ false);
				break;

			default:
				elog(ERROR, "Unexpected persistent file-system state: %d",
					 pendingState);
		}
	}

	if (Debug_persistent_recovery_print)
		elog(PersistentRecovery_DebugPrintLevel(),
			 "Exiting PersistentRecovery_UpdateType %s, count %d",
			 PersistentFileSysObjName_TypeName(fsObjType),
			 persistentRecoveryCount);
}
/*
 * PersistentRecovery_Update
 *
 * Run PersistentRecovery_UpdateType() once for every persistent file-system
 * object type, from PersistentFsObjType_First through
 * PersistentFsObjType_Last.  Does nothing (beyond optional debug output) when
 * fsObjHashTable is NULL.
 */
void
PersistentRecovery_Update(void)
{
	PersistentFsObjType currentType;

	if (Debug_persistent_recovery_print)
		elog(PersistentRecovery_DebugPrintLevel(),
			 "Entering PersistentRecovery_Update");

	if (fsObjHashTable != NULL)
	{
		currentType = PersistentFsObjType_First;
		while (currentType <= PersistentFsObjType_Last)
		{
			PersistentRecovery_UpdateType(currentType);
			currentType++;
		}

		if (Debug_persistent_recovery_print)
			elog(PersistentRecovery_DebugPrintLevel(),
				 "Exiting PersistentRecovery_Update");
	}
	else
	{
		if (Debug_persistent_recovery_print)
			elog(PersistentRecovery_DebugPrintLevel(),
				 "Exiting PersistentRecovery_Update (no relation hash table)");
	}
}
/*
 * PersistentRecovery_DropType
 *
 * Scan the hash table for one file-system object type and call
 * PersistentFileSysObj_DropObject() for each entry whose state is
 * 'Aborting Create' or 'Drop Pending' and whose stateChangeResult is
 * StateChangeOk.  Entries with any other result are skipped: with a WARNING
 * when the result is ErrorSuppressed, otherwise with a debug message only.
 * A relation-file entry with an invalid storage manager raises an ERROR.
 * persistentRecoveryCount is reset and then counts the drops attempted.
 */
static void
PersistentRecovery_DropType(
	PersistentFsObjType fsObjType)
{
	HASH_SEQ_STATUS scan;
	FsObjEntry	entry;

	if (Debug_persistent_recovery_print)
		elog(PersistentRecovery_DebugPrintLevel(),
			 "Entering PersistentRecovery_DropType %s",
			 PersistentFileSysObjName_TypeName(fsObjType));

	hash_seq_init(&scan, fsObjHashTable[fsObjType]);
	persistentRecoveryCount = 0;

	while ((entry = (FsObjEntry) hash_seq_search(&scan)) != NULL)
	{
		PersistentFileSysState entryState = entry->state;

		/* Only objects that are on their way out are of interest here. */
		if (entryState != PersistentFileSysState_AbortingCreate &&
			entryState != PersistentFileSysState_DropPending)
			continue;

		if (entry->stateChangeResult != PersistentFileSysObjStateChangeResult_StateChangeOk)
		{
			if (entry->stateChangeResult == PersistentFileSysObjStateChangeResult_ErrorSuppressed)
			{
				elog(WARNING,
					 "Crash recovery skipping drop for %s with State-Change result '%s'",
					 FsObjEntryToString(entry),
					 PersistentFileSysObjStateChangeResult_Name(entry->stateChangeResult));
			}
			else if (Debug_persistent_recovery_print)
			{
				elog(PersistentRecovery_DebugPrintLevel(),
					 "Drop REDO: Skipping drop for %s with State-Change result '%s'",
					 FsObjEntryToString(entry),
					 PersistentFileSysObjStateChangeResult_Name(entry->stateChangeResult));
			}
			continue;
		}

		persistentRecoveryCount++;

		if (Debug_persistent_recovery_print)
			elog(PersistentRecovery_DebugPrintLevel(),
				 "Drop REDO: %s", FsObjEntryToString(entry));

		if (fsObjType == PersistentFsObjType_RelationFile)
		{
			if (!PersistentFileSysRelStorageMgr_IsValid(entry->relStorageMgr))
				elog(ERROR, "Relation storage manager for persistent '%s' for Crash Recovery is invalid (%d)",
					 PersistentFileSysObjName_TypeAndObjectName(&entry->fsObjName),
					 entry->relStorageMgr);
		}

		PersistentFileSysObj_DropObject(
			&entry->fsObjName,
			entry->relStorageMgr,
			/* relationName */ NULL,	/* Ok to be NULL -- we don't know the name here. */
			&entry->key.persistentTid,
			entry->persistentSerialNum,
			/* ignoreNonExistence */ true,
			Debug_persistent_recovery_print,
			PersistentRecovery_DebugPrintLevel());
	}

	if (Debug_persistent_recovery_print)
		elog(PersistentRecovery_DebugPrintLevel(),
			 "Exiting PersistentRecovery_DropType %s, count %d",
			 PersistentFileSysObjName_TypeName(fsObjType),
			 persistentRecoveryCount);
}
void
PersistentRecovery_Drop(void)
{
PersistentFsObjType fsObjType;
if (Debug_persistent_recovery_print)
elog(PersistentRecovery_DebugPrintLevel(),
"Entering PersistentRecovery_Scan");
for (fsObjType = PersistentFsObjType_First;
fsObjType <= PersistentFsObjType_Last;
fsObjType++)
PersistentRecovery_DropType(fsObjType);
if (Debug_persistent_recovery_print)
elog(PersistentRecovery_DebugPrintLevel(),
"Exiting PersistentRecovery_Scan");
}
/*
 * Lookup key of a redo relation entry: the persistent TID of the object.
 * Used as the hash key of redoRelationHashTable (see
 * PersistentRecovery_RedoRelationHashTableInit), so callers zero the whole
 * key before filling it in to keep padding bytes out of the hash.
 */
typedef struct RedoRelationEntryDataKey
{
ItemPointerData persistentTid;
} RedoRelationEntryDataKey;
/*
 * One redo relation entry.  The same struct serves as the fixed-size record
 * written by PersistentRecovery_SerializeRedoRelationFile, read back by
 * PersistentRecovery_DeserializeRedoRelationFile, and as the element type of
 * redoRelationHashTable.
 */
typedef struct RedoRelationEntryData
{
/* Hash key; must be the first member because it is what hash_search() is given. */
RedoRelationEntryDataKey key;
/* Relation file the persistent TID refers to. */
RelFileNode relFileNode;
/* Serial number compared by PersistentRecovery_RedoRelationExists to spot obsolete objects. */
int64 persistentSerialNum;
// UNDONE: mirrorExistence
} RedoRelationEntryData;
/* Pointer alias used for entries handed out by the redo relation hash table. */
typedef RedoRelationEntryData *RedoRelationEntry;
void
PersistentRecovery_SerializeRedoRelationFile(
int redoRelationFile)
{
HASH_SEQ_STATUS iterateStatus;
FsObjEntry fsObjEntry;
RedoRelationEntryData redoRelationEntryData;
if (Debug_persistent_recovery_print)
elog(PersistentRecovery_DebugPrintLevel(),
"Entering PersistentRecovery_SerializeRedoRelationFile");
hash_seq_init(
&iterateStatus,
fsObjHashTable[PersistentFsObjType_RelationFile]);
persistentRecoveryCount = 0;
while (true)
{
int writeLen;
fsObjEntry =
(FsObjEntry)
hash_seq_search(&iterateStatus);
if (fsObjEntry == NULL)
break;
if (fsObjEntry->state == PersistentFileSysState_Free ||
fsObjEntry->state == PersistentFileSysState_AbortingCreate ||
fsObjEntry->state == PersistentFileSysState_DropPending)
continue;
Assert(fsObjEntry->state == PersistentFileSysState_BulkLoadCreatePending ||
fsObjEntry->state == PersistentFileSysState_CreatePending ||
fsObjEntry->state == PersistentFileSysState_Created);
MemSet(&redoRelationEntryData.key, 0, sizeof(RedoRelationEntryDataKey)); // Zero out any padding.
redoRelationEntryData.key.persistentTid = fsObjEntry->key.persistentTid;
redoRelationEntryData.relFileNode = PersistentFileSysObjName_GetRelFileNode(&fsObjEntry->fsObjName);
redoRelationEntryData.persistentSerialNum = fsObjEntry->persistentSerialNum;
writeLen = write(redoRelationFile, &redoRelationEntryData, sizeof(RedoRelationEntryData));
if (writeLen != sizeof(RedoRelationEntryData))
{
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not write redo relation file : %m")));
}
persistentRecoveryCount++;
if (Debug_persistent_recovery_print)
elog(PersistentRecovery_DebugPrintLevel(),
"PersistentRecovery_SerializeRedoRelationFile: %u/%u/%u, serial number " INT64_FORMAT ", TID %s",
redoRelationEntryData.relFileNode.spcNode,
redoRelationEntryData.relFileNode.dbNode,
redoRelationEntryData.relFileNode.relNode,
redoRelationEntryData.persistentSerialNum,
ItemPointerToString(&redoRelationEntryData.key.persistentTid));
}
if (Debug_persistent_recovery_print)
elog(PersistentRecovery_DebugPrintLevel(),
"Exiting PersistentRecovery_SerializeRedoRelationFile count %d",
persistentRecoveryCount);
}
/*
 * Hash table of RedoRelationEntryData keyed by RedoRelationEntryDataKey.
 * Stays NULL until PersistentRecovery_RedoRelationHashTableInit() creates it,
 * which the create/find routines do on first use.
 */
static HTAB* redoRelationHashTable = NULL;
/*
 * PersistentRecovery_RedoRelationHashTableInit
 *
 * Create redoRelationHashTable: entries are RedoRelationEntryData, keyed by
 * the leading RedoRelationEntryDataKey and hashed with tag_hash.
 */
static void
PersistentRecovery_RedoRelationHashTableInit(void)
{
	HASHCTL		ctl;

	MemSet(&ctl, 0, sizeof(ctl));
	ctl.keysize = sizeof(RedoRelationEntryDataKey);
	ctl.entrysize = sizeof(RedoRelationEntryData);
	ctl.hash = tag_hash;

	redoRelationHashTable = hash_create("Redo Relation",
										10,
										&ctl,
										HASH_ELEM | HASH_FUNCTION);
}
/*
 * PersistentRecovery_CreateRedoRelationEntry
 *
 * Insert a copy of *redoRelationEntry into redoRelationHashTable (creating
 * the table on first use) and return the stored entry.
 *
 * Raises an ERROR when the RelFileNode is empty or when an entry with the
 * same persistent TID is already present.
 */
static RedoRelationEntry
PersistentRecovery_CreateRedoRelationEntry(
	RedoRelationEntry redoRelationEntry)
{
	RedoRelationEntry stored;
	bool		alreadyPresent;

	if (RelFileNode_IsEmpty(&redoRelationEntry->relFileNode))
		elog(ERROR, "Invalid RelFileNode (0,0,0)");

	if (redoRelationHashTable == NULL)
		PersistentRecovery_RedoRelationHashTableInit();

	stored = (RedoRelationEntry) hash_search(redoRelationHashTable,
											 (void *) &redoRelationEntry->key,
											 HASH_ENTER,
											 &alreadyPresent);

	if (alreadyPresent)
	{
		Assert(stored != NULL);

		/* Two distinct string helpers are used so both TIDs survive in one message. */
		elog(ERROR,
			 "Duplicate redo relation entry: existing (%u/%u/%u, serial number " INT64_FORMAT ", TID %s), "
			 "new (%u/%u/%u, serial number " INT64_FORMAT ", TID %s)",
			 stored->relFileNode.spcNode,
			 stored->relFileNode.dbNode,
			 stored->relFileNode.relNode,
			 stored->persistentSerialNum,
			 ItemPointerToString(&stored->key.persistentTid),
			 redoRelationEntry->relFileNode.spcNode,
			 redoRelationEntry->relFileNode.dbNode,
			 redoRelationEntry->relFileNode.relNode,
			 redoRelationEntry->persistentSerialNum,
			 ItemPointerToString2(&redoRelationEntry->key.persistentTid));
	}

	/* hash_search() filled in the key; copy the payload. */
	stored->relFileNode = redoRelationEntry->relFileNode;
	stored->persistentSerialNum = redoRelationEntry->persistentSerialNum;

	if (Debug_persistent_recovery_print)
		elog(PersistentRecovery_DebugPrintLevel(),
			 "Created redo relation entry: %u/%u/%u, serial number " INT64_FORMAT ", TID %s",
			 stored->relFileNode.spcNode,
			 stored->relFileNode.dbNode,
			 stored->relFileNode.relNode,
			 stored->persistentSerialNum,
			 ItemPointerToString(&stored->key.persistentTid));

	return stored;
}
/*
 * PersistentRecovery_FindRedoRelationEntry
 *
 * Look up the redo relation entry for 'persistentTid' in
 * redoRelationHashTable (creating the table on first use).  Returns the entry,
 * or NULL when there is none.
 */
static RedoRelationEntry
PersistentRecovery_FindRedoRelationEntry(
	ItemPointer persistentTid)
{
	RedoRelationEntryDataKey lookupKey;
	RedoRelationEntry match;
	bool		present;

	if (redoRelationHashTable == NULL)
		PersistentRecovery_RedoRelationHashTableInit();

	/* Clear the key first so padding bytes do not affect hashing. */
	MemSet(&lookupKey, 0, sizeof(RedoRelationEntryDataKey));
	lookupKey.persistentTid = *persistentTid;

	match = (RedoRelationEntry) hash_search(redoRelationHashTable,
											(void *) &lookupKey,
											HASH_FIND,
											&present);

	return present ? match : NULL;
}
/*
 * PersistentRecovery_DeserializeRedoRelationFile
 *
 * Read fixed-size RedoRelationEntryData records from the file descriptor
 * 'redoRelationFile' until end of file and add each one to the redo relation
 * hash table via PersistentRecovery_CreateRedoRelationEntry().
 *
 * Raises an ERROR on a read failure, on a truncated record, or on a record
 * whose RelFileNode is empty.
 */
void
PersistentRecovery_DeserializeRedoRelationFile(
	int redoRelationFile)
{
	RedoRelationEntryData redoRelationEntryData;
	int			entryCount = 0;	/* records loaded by this call */

	if (Debug_persistent_recovery_print)
		elog(PersistentRecovery_DebugPrintLevel(),
			 "Entering PersistentRecovery_DeserializeRedoRelationFile");

	while (true)
	{
		int			readLen;

		readLen = read(redoRelationFile, &redoRelationEntryData, sizeof(RedoRelationEntryData));
		if (readLen == 0)
			break;				/* end of file */

		/* Trust read()'s return value, not errno, to tell failure from a short read. */
		if (readLen < 0)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("error reading redo relation file: %m")));

		if (readLen != (int) sizeof(RedoRelationEntryData))
			elog(ERROR, "Bad redo relation entry length (expected %d and found %d)",
				 (int) sizeof(RedoRelationEntryData), readLen);

		if (Debug_persistent_recovery_print)
			elog(PersistentRecovery_DebugPrintLevel(),
				 "PersistentRecovery_DeserializeRedoRelationFile: %u/%u/%u, serial number " INT64_FORMAT ", TID %s",
				 redoRelationEntryData.relFileNode.spcNode,
				 redoRelationEntryData.relFileNode.dbNode,
				 redoRelationEntryData.relFileNode.relNode,
				 redoRelationEntryData.persistentSerialNum,
				 ItemPointerToString(&redoRelationEntryData.key.persistentTid));

		if (RelFileNode_IsEmpty(&redoRelationEntryData.relFileNode))
			elog(ERROR, "Invalid RelFileNode (0,0,0)");

		PersistentRecovery_CreateRedoRelationEntry(&redoRelationEntryData);
		entryCount++;
	}

	/*
	 * Report the number of records read here.  persistentRecoveryCount is not
	 * maintained by this function, so printing it would show a stale value
	 * left behind by whichever routine ran before.
	 */
	if (Debug_persistent_recovery_print)
		elog(PersistentRecovery_DebugPrintLevel(),
			 "Exiting PersistentRecovery_DeserializeRedoRelationFile count %d",
			 entryCount);
}
/*
 * PersistentRecovery_RedoRelationExists
 *
 * Return true when the redo relation hash table holds an entry for
 * 'persistentTid' whose serial number equals 'persistentSerialNum'.
 * Return false when there is no entry, or when the entry carries a different
 * serial number.  'relFileNode' is only used in the debug messages.
 */
static bool
PersistentRecovery_RedoRelationExists(
	ItemPointer persistentTid,
	int64 persistentSerialNum,
	RelFileNode *relFileNode)
{
	RedoRelationEntry known = PersistentRecovery_FindRedoRelationEntry(persistentTid);

	if (known != NULL && known->persistentSerialNum == persistentSerialNum)
		return true;

	/* From here on the answer is "no"; only the explanation differs. */
	if (Debug_persistent_recovery_print)
	{
		if (known == NULL)
			elog(PersistentRecovery_DebugPrintLevel(),
				 "PersistentRecovery_RedoRelationExists: TID %s is not in the currently 'Created' set (serial number " INT64_FORMAT ", %u/%u/%u)",
				 ItemPointerToString(persistentTid),
				 persistentSerialNum,
				 relFileNode->spcNode,
				 relFileNode->dbNode,
				 relFileNode->relNode);
		else
			elog(PersistentRecovery_DebugPrintLevel(),
				 "PersistentRecovery_RedoRelationExists: TID %s entry from the 'Created' set has different serial number (serial number " INT64_FORMAT ", %u/%u/%u) indicating XLOG for persistent obsolete file-system object (obsolete serial number " INT64_FORMAT ", %u/%u/%u)",
				 ItemPointerToString(persistentTid),
				 known->persistentSerialNum,
				 known->relFileNode.spcNode,
				 known->relFileNode.dbNode,
				 known->relFileNode.relNode,
				 persistentSerialNum,
				 relFileNode->spcNode,
				 relFileNode->dbNode,
				 relFileNode->relNode);
	}

	return false;
}