blob: 4d0d3c2afa672286364af68d20e5071767be7b1d [file] [log] [blame]
///////////////////////////////////////////////////////////////////////////////
//
// @@@ START COPYRIGHT @@@
//
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//
// @@@ END COPYRIGHT @@@
//
///////////////////////////////////////////////////////////////////////////////
#include <iostream>
using namespace std;
#include <stdio.h>
#include <stdlib.h>
#include <setjmp.h>
#include <signal.h>
#include <fcntl.h>
#include <netdb.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/epoll.h>
#include <sys/file.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <errno.h>
#include <limits.h>
#include <unistd.h>
#include "localio.h"
#include "mlio.h"
#include "monlogging.h"
#include "monsonar.h"
#include "montrace.h"
#include "redirector.h"
#include "healthcheck.h"
#include "config.h"
#include "device.h"
#include "cluster.h"
#include "monitor.h"
#include "replicate.h"
#include "clusterconf.h"
#include "lnode.h"
#include "pnode.h"
#include "reqqueue.h"
#include "zclient.h"
#include "commaccept.h"
#include "meas.h"
#ifdef NAMESERVER_PROCESS
#include "nscommacceptmon.h"
#else
#include "nameserver.h"
#endif
// ---------------------------------------------------------------------------
// Global state shared with other translation units. Everything below is only
// declared here (extern); the definitions live elsewhere.
// ---------------------------------------------------------------------------

// Process-wide mode/role flags.
extern bool IAmIntegrating;
extern bool IAmIntegrated;
extern bool IsRealCluster;
extern bool IsAgentMode;
extern bool IsMaster;
extern bool IsMPIChild;
// Fixed-size name and port buffers; writers must respect the declared sizes.
extern char MasterMonitorName[MAX_PROCESS_PATH];
extern char Node_name[MPI_MAX_PROCESSOR_NAME];
extern bool ZClientEnabled;
extern char IntegratingMonitorPort[MPI_MAX_PORT_NAME];
extern char MyCommPort[MPI_MAX_PORT_NAME];
extern char MyMPICommPort[MPI_MAX_PORT_NAME];
extern char MySyncPort[MPI_MAX_PORT_NAME];
// Symbols that exist only in one of the two build flavors
// (name server process vs. regular monitor process).
#ifdef NAMESERVER_PROCESS
extern CCommAcceptMon CommAcceptMon;
extern char MyMon2NsPort[MPI_MAX_PORT_NAME];
#else
extern CNameServer *NameServer;
extern bool NameServerEnabled;
extern char MyPtPPort[MPI_MAX_PORT_NAME];
#endif
extern bool SMSIntegrating;
extern int CreatorShellPid;
extern Verifier_t CreatorShellVerifier;
extern CommType_t CommType;
// Physical node id of the node this process runs on.
extern int MyPNID;
// Global service objects used by the CCluster methods in this file.
extern CReqQueue ReqQueue;
extern CMonitor *Monitor;
extern CNodeContainer *Nodes;
extern CConfigContainer *Config;
#ifndef NAMESERVER_PROCESS
extern CDeviceContainer *Devices;
#endif
extern CNode *MyNode;
extern CMonStats *MonStats;
#ifndef NAMESERVER_PROCESS
extern CRedirector Redirector;
#endif
extern CMonLog *MonLog;
extern CHealthCheck HealthCheck;
extern CCommAccept CommAccept;
extern CZClient *ZClient;
extern CMeas Meas;
// Used by the CCluster constructor to derive syncMinPerSec_
// (documented there as being in microseconds).
extern long next_test_delay;
extern CReplicate Replicator;
// Helpers that translate codes and enum values into printable strings.
extern char *ErrorMsg (int error_code);
extern const char *ProcessTypeString( PROCESSTYPE type );
const char *JoiningPhaseString( JOINING_PHASE phase);
const char *StateString( STATE state);
#ifndef NAMESERVER_PROCESS
const char *SyncStateString( SyncState state);
#endif
const char *EpollEventString( __uint32_t events );
const char *EpollOpString( int op );
const char *NodePhaseString( NodePhase phase );
// In name server builds MPI_Abort(a,b) expands to a plain abort();
// both macro arguments are discarded.
#ifdef NAMESERVER_PROCESS
#define MPI_Abort(a,b) abort()
#endif
// Map a NodePhase value to a printable name.
//
// Returns a pointer to a string literal (never NULL). Values that are not
// one of the four known phases yield "NodePhase - Undefined".
const char *NodePhaseString( NodePhase phase )
{
    switch( phase )
    {
        case Phase_Ready:
            return( "Phase_Ready" );
        case Phase_Activating:
            return( "Phase_Activating" );
        case Phase_SoftDown:
            return( "Phase_SoftDown" );
        case Phase_SoftUp:
            return( "Phase_SoftUp" );
        default:
            break;
    }

    return( "NodePhase - Undefined" );
}
// Activate spareNode as the replacement for downNode.
//
// Parameters:
//   spareNode   - physical node taking over
//   downNode    - physical node being replaced; may be the same physical node
//                 as spareNode (the code compares the two PNids and skips the
//                 logical-node move in that case)
//   checkHealth - when true the (currently stubbed) health check path runs;
//                 when false the spare is assumed healthy
//
// When the spare is healthy the function, in order:
//   1. moves downNode's logical nodes to spareNode (distinct nodes only),
//   2. counts whether any DTM process exists in the cluster,
//   3. starts primitive processes if this monitor is the spare, otherwise
//      finishes the joining state and prepares local DTMs,
//   4. aborts any in-flight TmSync (non name server builds, distinct nodes only),
//   5. if this monitor is the spare, replicates the activation to the other
//      monitors and, when no DTM exists, enqueues TmReady requests itself.
void CCluster::ActivateSpare( CNode *spareNode, CNode *downNode, bool checkHealth )
{
const char method_name[] = "CCluster::ActivateSpare";
TRACE_ENTRY;
// if not checking health, assume the spare is healthy
bool spareHealthy = checkHealth ? false : true;
int tmCount = 0;
CNode *node;
CLNode *lnode;
if (trace_settings & TRACE_INIT)
{
trace_printf( "%s@%d - pnid=%d, name=%s (%s) is taking over pnid=%d, name=%s (%s), check health=%d, isIntegrating=%d , integrating pnid=%d\n"
, method_name, __LINE__
, spareNode->GetPNid(), spareNode->GetName(), StateString(spareNode->GetState())
, downNode->GetPNid(), downNode->GetName(), StateString(downNode->GetState())
, checkHealth, IsIntegrating(), integratingPNid_ );
}
if ( checkHealth )
{
// TODO: Execute physical node health check script here
// NOTE(review): spareHealthy is forced to true on the next line, so the
// !spareHealthy branch below can never run until the health check exists.
spareHealthy = true;
if ( !spareHealthy )
{
// and tell the cluster the node is down, since the spare can't takeover
CReplNodeDown *repl = new CReplNodeDown(downNode->GetPNid());
Replicator.addItem(repl);
}
}
if ( spareHealthy )
{
// A real takeover only happens when spare and down node are different
// physical nodes.
if ( downNode->GetPNid() != spareNode->GetPNid() )
{
// Move down node's logical nodes to spare node
downNode->MoveLNodes( spareNode );
spareNode->SetPhase( Phase_Activating );
Nodes->AddToSpareNodesList( downNode->GetPNid() );
if ( !IsIntegrating() )
{
downNode->SetState( State_Down );
// Send process death notices
#ifndef NAMESERVER_PROCESS
spareNode->KillAllDown();
#endif
// Send node down notice
// (the logical nodes now hang off spareNode because of MoveLNodes above)
lnode = spareNode->GetFirstLNode();
for ( ; lnode; lnode = lnode->GetNextP() )
{
// Watchdog process clone was removed in KillAllDown
lnode->Down();
}
}
}
// Any DTMs running?
// The outer loop stops at the first physical node on which at least one
// DTM was found; tmCount is only used as a zero / non-zero flag below
// (and printed in a trace message).
for ( int i=0; !tmCount && i < Nodes->GetPNodesCount(); i++ )
{
// NOTE(review): the result of GetNodeByMap() is dereferenced without a
// NULL check - confirm it cannot return NULL for i < GetPNodesCount().
node = Nodes->GetNodeByMap( i );
lnode = node->GetFirstLNode();
for ( ; lnode; lnode = lnode->GetNextP() )
{
CProcess *process = lnode->GetProcessLByType( ProcessType_DTM );
if ( process ) tmCount++;
}
}
// Create Watchdog and PSD processes if this node is the activating spare
if ( spareNode->GetPNid() == MyPNID )
{
#ifndef NAMESERVER_PROCESS
Monitor->StartPrimitiveProcesses();
#endif
}
else
{
// Check for end of joining phase on node re-integration
if ( spareNode->GetState() == State_Joining )
{
spareNode->SetState( State_Up );
}
#ifndef NAMESERVER_PROCESS
if ( tmCount )
{
// Send node prepare notice to local DTM processes
// The boolean argument is true only for a real takeover
// (spare and down node are different physical nodes).
lnode = spareNode->GetFirstLNode();
for ( ; lnode; lnode = lnode->GetNextP() )
{
lnode->PrepareForTransactions( downNode->GetPNid() != spareNode->GetPNid() );
}
}
#else
// Name server build: there is no DTM handling, just clear the
// integrating pnid.
ResetIntegratingPNid();
#endif
}
#ifndef NAMESERVER_PROCESS
if ( downNode->GetPNid() != spareNode->GetPNid() )
{
// we need to abort any active TmSync
if (( MyNode->GetTmSyncState() == SyncState_Start ) ||
( MyNode->GetTmSyncState() == SyncState_Continue ) ||
( MyNode->GetTmSyncState() == SyncState_Commit ) )
{
MyNode->SetTmSyncState( SyncState_Abort );
Monitor->SetAbortPendingTmSync();
if (trace_settings & (TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
trace_printf("%s@%d" " - Node " "%d" " TmSyncState updated (" "%d" ")" "\n", method_name, __LINE__, MyPNID, MyNode->GetTmSyncState());
}
}
#endif
if (trace_settings & TRACE_INIT)
{
trace_printf( "%s@%d - Spare node activating! pnid=%d, name=(%s)\n"
, method_name, __LINE__
, spareNode->GetPNid(), spareNode->GetName());
}
}
// Only the monitor running on the spare node announces the activation to
// the rest of the cluster.
if ( spareNode->GetPNid() == MyPNID && spareHealthy )
{
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
trace_printf( "%s@%d" " - Replicating activate spare node pnid=%d, name=%s (%s), spare=%d, down pnid=%d, name=%s (%s), DTM count=%d\n"
, method_name, __LINE__
, spareNode->GetPNid(), spareNode->GetName(), StateString(spareNode->GetState())
, spareNode->IsSpareNode()
, downNode->GetPNid(), downNode->GetName(), StateString(downNode->GetState())
, tmCount );
// Let other monitors know is ok to activate this spare node
CReplActivateSpare *repl = new CReplActivateSpare( MyPNID, downNode->GetPNid() );
Replicator.addItem(repl);
#ifndef NAMESERVER_PROCESS
if ( !tmCount )
{
// No DTMs in environment so implicitly make ready for transactions
lnode = MyNode->GetFirstLNode();
for ( ; lnode; lnode = lnode->GetNextP() )
{
ReqQueue.enqueueTmReadyReq( lnode->GetNid() );
}
}
#endif
}
TRACE_EXIT;
}
#ifndef NAMESERVER_PROCESS
// Record that the TM on logical node 'nid' has reported ready.
//
// Each call bumps tmReadyCount_ and asks MyNode to start the persistent DTM
// related processing for 'nid'. Once the count equals the number of logical
// nodes on this physical node:
//   - if the node was soft down, the soft-down flag is cleared and the phase
//     is set back to Phase_Ready;
//   - otherwise the activation is replicated to the other monitors
//     (CReplActivateSpare with a down pnid of -1).
//
// Log messages are formatted with snprintf so they can never overrun la_buf.
void CCluster::NodeTmReady( int nid )
{
    const char method_name[] = "CCluster::NodeTmReady";
    TRACE_ENTRY;

    if (trace_settings & TRACE_INIT)
    {
        trace_printf( "%s@%d - nid=%d\n", method_name, __LINE__, nid );
    }

    tmReadyCount_++;

    if (trace_settings & (TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
    {
        trace_printf( "%s@%d - TmReady, nid=%d, tm count=%d, soft node down=%d, LNodesCount=%d\n"
                    , method_name, __LINE__
                    , nid
                    , tmReadyCount_
                    , MyNode->IsSoftNodeDown()
                    , MyNode->GetLNodesCount() );
    }

    MyNode->StartPStartDPersistentDTM( nid );

    // All logical nodes of this physical node have reported ready
    if ( MyNode->GetLNodesCount() == tmReadyCount_ )
    {
        char la_buf[MON_STRING_BUF_SIZE];

        if ( MyNode->IsSoftNodeDown() )
        {
            MyNode->ResetSoftNodeDown();
            MyNode->SetPhase( Phase_Ready );
            snprintf( la_buf, sizeof(la_buf)
                    , "[%s], Soft Node up! pnid=%d, name=(%s)\n"
                    , method_name, MyNode->GetPNid(), MyNode->GetName());
            mon_log_write(MON_CLUSTER_NODE_TM_READY_1, SQ_LOG_INFO, la_buf);
        }
        else
        {
            snprintf( la_buf, sizeof(la_buf)
                    , "[%s], Node activated! pnid=%d, name=(%s) \n"
                    , method_name, MyNode->GetPNid(), MyNode->GetName());
            mon_log_write(MON_CLUSTER_NODE_TM_READY_2, SQ_LOG_INFO, la_buf);
            // Let other monitors know the node is up
            CReplActivateSpare *repl = new CReplActivateSpare( MyPNID, -1 );
            Replicator.addItem(repl);
        }
    }

    TRACE_EXIT;
}
#endif
// Finish activation of a spare node that is already in State_Up.
//
// Calls Up() on every logical node of spareNode, clears the node's
// activating-spare flag and resets the cluster's integrating pnid.
// The State_Up precondition is checked with assert only.
void CCluster::NodeReady( CNode *spareNode )
{
    const char method_name[] = "CCluster::NodeReady";
    TRACE_ENTRY;

    if (trace_settings & TRACE_INIT)
    {
        trace_printf( "%s@%d - spare node %s pnid=%d\n"
                    , method_name, __LINE__, spareNode->GetName(), spareNode->GetPNid() );
    }

    assert( spareNode->GetState() == State_Up );

    // Send node up notice
    for ( CLNode *current = spareNode->GetFirstLNode()
        ; current != NULL
        ; current = current->GetNextP() )
    {
        current->Up();
    }

    spareNode->SetActivatingSpare( false );
    ResetIntegratingPNid();

    TRACE_EXIT;
}
// Rebuild the global IntegratingMonitorPort as "<newMaster>:<port>", where
// <port> is the value of the MONITOR_COMM_PORT environment variable.
//
// Nothing is changed when newMaster is NULL or MONITOR_COMM_PORT is not set.
// The string is built with a single bounded snprintf so that an over-long
// host name or port value is truncated instead of writing past the end of
// the fixed-size IntegratingMonitorPort buffer.
void CCluster::UpdateMonitorPort (const char* newMaster)
{
    const char method_name[] = "CCluster::UpdateMonitorPort";
    TRACE_ENTRY;

    const char *monitorPort = getenv ("MONITOR_COMM_PORT");
    if ((monitorPort) && (newMaster))
    {
        snprintf( IntegratingMonitorPort, sizeof(IntegratingMonitorPort)
                , "%s:%s", newMaster, monitorPort );

        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
        {
            trace_printf("%s@%d" " (MasterMonitor) UpdateMonitorPort Updating IntegratingMonitorPort to %s\n",
                         method_name, __LINE__,IntegratingMonitorPort );
        }
    }

    TRACE_EXIT;
}
// Assign leaders as required
// Current leaders are TM Leader and Monitor Leader
//
// pnid and checkProcess are forwarded to AssignTmLeader(); failedMaster is
// forwarded to AssignMonitorLeader(). Name server builds have no TM leader,
// so only the monitor leader is handled there.
void CCluster::AssignLeaders( int pnid, const char* failedMaster, bool checkProcess )
{
    const char method_name[] = "CCluster::AssignLeaders";
    TRACE_ENTRY;

#ifdef NAMESERVER_PROCESS
    // Not used in this build flavor; reference them to avoid
    // unused-parameter warnings.
    (void) pnid;
    (void) checkProcess;
#else
    AssignTmLeader ( pnid, checkProcess );
#endif

    AssignMonitorLeader ( failedMaster );

    TRACE_EXIT;
}
// Assign monitor lead in the case of failure
void CCluster::AssignMonitorLeader( const char* failedMaster )
{
const char method_name[] = "CCluster::AssignMonitorLeader";
TRACE_ENTRY;
int i = 0;
int rc = 0;
int monitorLeaderPNid = -1;
CNode *node = NULL;
if (failedMaster == NULL)
{
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
{
trace_printf( "%s@%d" " - (MasterMonitor) failedMaster is NULL, returning\n" , method_name, __LINE__);
}
TRACE_EXIT;
return;
}
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
{
trace_printf( "%s@%d" " - (MasterMonitor) " " MonitorLeader (%s) failed!\n"
, method_name, __LINE__, failedMaster );
}
if (!IsAgentMode || !ZClientEnabled)
{
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
{
trace_printf( "%s@%d" " - (MasterMonitor) not AgentMode or zookeeper not enabled, returning\n"
, method_name, __LINE__);
}
TRACE_EXIT;
return;
}
// delete old master if needed
const char *masterMonitor = ZClient->WaitForAndReturnMaster (false);
if (masterMonitor)
{
// IFF it is the failed master, delete, do not delete anything else because we could delete a new master
if (strcmp (masterMonitor, failedMaster) == 0)
{
ZClient->WatchNodeMasterDelete (failedMaster);
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
{
trace_printf( "%s@%d" " - (MasterMonitor) deleting master %s\n"
, method_name, __LINE__, masterMonitor );
}
}
// no worries
else
{
rc = ZClient->WatchMasterNode( masterMonitor );
UpdateMonitorPort ( masterMonitor );
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
{
trace_printf( "%s@%d" " - (MasterMonitor) master did not match, set watch (rc = %d) and returning %s\n"
, method_name, __LINE__, rc, masterMonitor );
}
TRACE_EXIT;
return;
}
}
// choose a new master
if (((MyNode) && ((MyNode->GetState() != State_Up) ||(!IAmIntegrated))) || (MyNode == NULL /* not set up yet*/))
{
// Do not let this monitor participate in choosing the master. It can wait until an integrated
// monitor makes a decision.
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
{
trace_printf( "%s@%d" " - (MasterMonitor) This Node is not set up yet and will not participate in master choice!\n"
, method_name, __LINE__);
}
// wait until another monitor choose a master
const char *masterMonitor = ZClient->WaitForAndReturnMaster (true);
if (masterMonitor)
{
rc = ZClient->WatchMasterNode( masterMonitor );
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
{
trace_printf("%s@%d" " (MasterMonitor) AssignMonitorLeader WatchMasterNode with rc = %d\n", method_name, __LINE__, rc);
}
UpdateMonitorPort ( masterMonitor );
}
TRACE_EXIT;
return;
}
// For all monitors who are up - choose the master using the same logic
for (i=0; i<GetConfigPNodesMax(); i++)
{
monitorLeaderPNid++; // set to -1, so this will bump it to 0 on the first time through
if (monitorLeaderPNid == GetConfigPNodesMax())
{
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
{
trace_printf("%s@%d" " (MasterMonitor) AssignMonitorLeader Unable to create or set watch\n", method_name, __LINE__);
}
char buf[MON_STRING_BUF_SIZE];
snprintf( buf, sizeof(buf)
, "[%s], Unable to create or set watch on master, hit max\n"
, method_name );
mon_log_write(MON_CLUSTER_ASSIGNMONITORLEADER_1, SQ_LOG_ERR, buf);
break;
}
if (Node[monitorLeaderPNid] == NULL)
{
continue;
}
node = Node[monitorLeaderPNid];
// skip this node
if ( node == NULL )
{
continue;
}
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
{
trace_printf( "%s@%d - Node pnid=%d (%s), phase=%s, isSoftNodeDown=%d\n"
, method_name, __LINE__
, node->GetPNid()
, node->GetName()
, NodePhaseString(node->GetPhase())
, node->IsSoftNodeDown());
}
if ( node->IsSpareNode() ||
node->IsSoftNodeDown() ||
node->GetState() != State_Up ||
node->GetPhase() != Phase_Ready )
{
continue; // skip this node for any of the above reasons
}
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
{
trace_printf("%s@%d" " - Node " "%d" " is the new monitorLeaderPNid." "\n", method_name, __LINE__, node->GetPNid());
}
const char *masterMonitor = ZClient->WaitForAndReturnMaster (false);
//nobody has written it yet, we don't want to overwrite anything
if (!masterMonitor)
{
rc = ZClient->CreateMasterZNode ( node->GetName() );
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
{
trace_printf("%s@%d" " (MasterMonitor) AssignMonitorLeader CreateMasterZNode with rc = %d\n", method_name, __LINE__, rc);
}
char buf[MON_STRING_BUF_SIZE];
snprintf( buf, sizeof(buf)
, "[%s], Master Monitor is %s on node %d\n"
, method_name, node->GetName(), node->GetPNid() );
mon_log_write(MON_CLUSTER_ASSIGNMONITORLEADER_2, SQ_LOG_INFO, buf);
if ( (rc == ZOK) || (rc == ZNODEEXISTS) )
{
rc = ZClient->WatchMasterNode( node->GetName() );
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
{
trace_printf("%s@%d" " (MasterMonitor) AssignMonitorLeader WatchMasterNode with rc = %d\n", method_name, __LINE__, rc);
}
}
else
{
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
{
trace_printf("%s@%d" " (MasterMonitor) AssignMonitorLeader Unable to create or set watch\n", method_name, __LINE__);
}
char buf[MON_STRING_BUF_SIZE];
snprintf( buf, sizeof(buf)
, "[%s], Unable to create or set watch on master node %s\n"
, method_name, node->GetName() );
mon_log_write(MON_CLUSTER_ASSIGNMONITORLEADER_3, SQ_LOG_ERR, buf);
}
}
else
{
rc = ZClient->WatchMasterNode( masterMonitor );
char buf[MON_STRING_BUF_SIZE];
snprintf( buf, sizeof(buf)
, "[%s], Master Monitor is %s\n"
, method_name, masterMonitor);
mon_log_write(MON_CLUSTER_ASSIGNMONITORLEADER_4, SQ_LOG_INFO, buf);
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
{
trace_printf("%s@%d" " (MasterMonitor) AssignMonitorLeader WatchMasterNode with rc = %d\n", method_name, __LINE__, rc);
}
}
break;
}
TRACE_EXIT;
}
#ifndef NAMESERVER_PROCESS
// Assigns a new TMLeader if given pnid is same as tmLeaderNid_
// TmLeader is a logical node num.
// pnid has gone down, so if that node was previously the TM leader, a new one needs to be chosen.
void CCluster::AssignTmLeader( int pnid, bool checkProcess )
{
const char method_name[] = "CCluster::AssignTmLeader";
TRACE_ENTRY;
int i = 0;
CNode *node = NULL;
CProcess *process = NULL;
int TmLeaderPNid = LNode[tmLeaderNid_]->GetNode()->GetPNid();
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
{
trace_printf( "%s@%d - pnid=%d, checkProcess=%d, tmLeaderNid_=%d, TmLeaderPNid=%d\n"
, method_name, __LINE__
, pnid, checkProcess, tmLeaderNid_, TmLeaderPNid );
}
if (TmLeaderPNid != pnid)
{
node = LNode[tmLeaderNid_]->GetNode();
if (checkProcess)
{
process = LNode[tmLeaderNid_]->GetProcessLByType( ProcessType_DTM );
if (process)
{
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
{
if (node)
trace_printf( "%s@%d - Node pnid=%d (%s), phase=%s, "
"isSoftNodeDown=%d, checkProcess=%d\n"
, method_name, __LINE__
, node->GetPNid()
, node->GetName()
, NodePhaseString(node->GetPhase())
, node->IsSoftNodeDown()
, checkProcess );
}
return;
}
else
{
if (NameServerEnabled)
{
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
{
trace_printf( "%s@%d - Getting process from Name Server, nid=%d, type=%s\n"
, method_name, __LINE__
, tmLeaderNid_, ProcessTypeString(ProcessType_DTM) );
}
process = Nodes->GetProcessLByTypeNs( tmLeaderNid_, ProcessType_DTM );
if (process)
{
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
{
if (node)
trace_printf( "%s@%d - Node pnid=%d (%s), phase=%s, "
"isSoftNodeDown=%d, checkProcess=%d\n"
, method_name, __LINE__
, node->GetPNid()
, node->GetName()
, NodePhaseString(node->GetPhase())
, node->IsSoftNodeDown()
, checkProcess );
}
return;
}
}
}
}
else
{
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
{
if (node)
trace_printf( "%s@%d - Node pnid=%d (%s), phase=%s, "
"isSoftNodeDown=%d, checkProcess=%d\n"
, method_name, __LINE__
, node->GetPNid()
, node->GetName()
, NodePhaseString(node->GetPhase())
, node->IsSoftNodeDown()
, checkProcess );
}
return;
}
}
node = Node[TmLeaderPNid];
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
{
trace_printf( "%s@%d" " - Node " "%d" " TmLeader failed! (checkProcess=%d)\n"
, method_name, __LINE__, tmLeaderNid_, checkProcess );
}
for (i=0; i<GetConfigPNodesMax(); i++)
{
TmLeaderPNid++;
if (TmLeaderPNid == GetConfigPNodesMax())
{
TmLeaderPNid = 0; // restart with nid 0
}
if (TmLeaderPNid == pnid)
{
continue; // this is the node that is going down, skip it
}
if (Node[TmLeaderPNid] == NULL)
{
continue;
}
node = Node[TmLeaderPNid];
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
{
trace_printf( "%s@%d - Node pnid=%d (%s), phase=%s, isSoftNodeDown=%d\n"
, method_name, __LINE__
, node->GetPNid()
, node->GetName()
, NodePhaseString(node->GetPhase())
, node->IsSoftNodeDown());
}
if ( node->IsSpareNode() ||
node->IsSoftNodeDown() ||
node->GetState() != State_Up ||
node->GetPhase() != Phase_Ready )
{
continue; // skip this node for any of the above reasons
}
tmLeaderNid_ = node->GetFirstLNode()->GetNid();
if (checkProcess)
{
process = LNode[tmLeaderNid_]->GetProcessLByType( ProcessType_DTM );
if (!process)
{
continue; // skip this node no DTM process exists
}
}
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
{
trace_printf("%s@%d" " - Node " "%d" " is the new TmLeader." "\n", method_name, __LINE__, tmLeaderNid_);
}
break;
}
TRACE_EXIT;
}
#endif
// Construct the cluster object.
//
// Besides initializing the members, the constructor:
//   - sets MPI_ERRORS_RETURN as the error handler on MPI_COMM_WORLD,
//   - reads these environment variables:
//       SQ_MON_CHECK_SEQNUM             (> 0 enables sync sequence number checks)
//       SQ_MON_NODE_DOWN_VALIDATION     (> 0 enables node down validation)
//       MON_MIN_RECV_COUNT              (overrides minRecvCount_, default 4096)
//       SQ_MON_NODE_DOWN_DEATH_MESSAGES (0 disables node down death notices)
//   - allocates the per-node communication arrays (MAX_NODES entries each),
//   - calls InitializeConfigCluster(),
//   - allocates the two receive buffers with GetConfigPNodesMax() entries.
//
// NOTE(review): otherMonRank_ is allocated here but its elements are not
// initialized in this constructor.
CCluster::CCluster (void)
      :NumRanks (-1)
      ,socks_(NULL)
      ,sockPorts_(NULL)
      ,commSock_(-1)
      ,syncPort_(0)
      ,syncSock_(-1)
#ifdef NAMESERVER_PROCESS
      ,mon2nsSock_(-1)
#endif
      ,epollFD_(-1)
      ,Node (NULL)
      ,LNode (NULL)
      ,tmSyncPNid_ (-1)
      ,currentNodes_ (0)
      ,configPNodesCount_ (-1)
      ,configPNodesMax_ (-1)
      ,nodeMap_ (NULL)
#ifndef NAMESERVER_PROCESS
      ,tmLeaderNid_ (-1)
      ,tmReadyCount_(0)
#endif
      ,minRecvCount_(4096)
      ,recvBuffer_(NULL)
      ,recvBuffer2_(NULL)
      ,swpRecCount_(0)
      ,barrierCount_(0)
      ,allGatherCount_(0)
      ,commDupCount_(0)
      ,barrierCountSaved_(0)
      ,allGatherCountSaved_(0)
      ,commDupCountSaved_(0)
      ,inBarrier_(false)
      ,inAllGather_(false)
      ,inCommDup_(false)
      ,monInitComplete_(false)
      ,monSyncResponsive_(true)
      ,integratingPNid_(-1)
      ,joinComm_(MPI_COMM_NULL)
      ,joinSock_(-1)
      ,lastSeqNum_(0)
      ,lowSeqNum_(0)
      ,highSeqNum_(0)
      ,reconnectSeqNum_(0)
      ,seqNum_(1)
      ,waitForWatchdogExit_(false)
      ,waitForNameServerExit_(false)
      ,checkSeqNum_(false)
      ,validateNodeDown_(false)
      ,enqueuedDown_(false)
      ,nodeDownDeathNotices_(true)
      ,verifierNum_(0)
#ifdef NAMESERVER_PROCESS
      ,myMonConnCount_(0)
      ,minMonConnCount_(0)
      ,minMonConnPnid_(-1)
#else
      ,clusterProcCount_(0)
#endif
{
    const char method_name[] = "CCluster::CCluster";
    TRACE_ENTRY;

    configMaster_ = -1;

    MPI_Comm_set_errhandler(MPI_COMM_WORLD,MPI_ERRORS_RETURN);

    char *env = getenv("SQ_MON_CHECK_SEQNUM");
    if ( env && atoi(env) > 0 )
    {
        checkSeqNum_ = true;
    }

    if (trace_settings & TRACE_INIT)
        trace_printf("%s@%d Checking sync sequence numbers is %s\n",
                     method_name, __LINE__,
                     (checkSeqNum_ ? "enabled" : "disabled"));

    CClusterConfig *clusterConfig = Nodes->GetClusterConfig();
    configPNodesMax_ = clusterConfig->GetPNodesConfigMax();

    // get master from CClusterConfig
    configMaster_ = clusterConfig->GetConfigMaster();

    // Compute minimum "sync cycles" per second. The minimum is 1/10
    // the expected number, assuming "next_test_delay" cycles per second (where
    // next_test_delay is in microseconds).
    // NOTE(review): divides by next_test_delay; a value of 0 would fault.
    syncMinPerSec_ = 1000000 / next_test_delay / 10;

    agMaxElapsed_.tv_sec = 0;
    agMaxElapsed_.tv_nsec = 0;
    agMinElapsed_.tv_sec = 10000;
    agMinElapsed_.tv_nsec = 0;

    // Allocate structures for monitor point-to-point communications
    //
    // The current approach is to allocate to a maximum number (MAX_NODES).
    //
    // The actual number could be based on the number of nodes configured
    // which is better from a memory allocation perspective. However,
    // this requires changing to an index-to-pnid map structure to access
    // physical node objects (CNode) in the array structures and managing
    // the map as nodes are added and deleted. (an optimization task)
    //
    comms_ = new MPI_Comm[MAX_NODES];
    otherMonRank_ = new int[MAX_NODES];
    socks_ = new int[MAX_NODES];
    sockPorts_ = new int[MAX_NODES];

    for ( int maskIdx = 0; maskIdx < MAX_NODE_MASKS; maskIdx++ )
    {
        upNodes_.upNodes[maskIdx] = 0;
    }

    for ( int nodeIdx = 0; nodeIdx < MAX_NODES; ++nodeIdx )
    {
        comms_[nodeIdx] = MPI_COMM_NULL;
        socks_[nodeIdx] = -1;
        sockPorts_[nodeIdx] = -1;
    }

    env = getenv("SQ_MON_NODE_DOWN_VALIDATION");
    if ( env && atoi(env) > 0 )
    {
        validateNodeDown_ = true;
    }

    char buf[MON_STRING_BUF_SIZE];
    snprintf(buf, sizeof(buf), "[%s] Validation of node down is %s\n",
             method_name, (validateNodeDown_ ? "enabled" : "disabled"));
    mon_log_write(MON_CLUSTER_CLUSTER_1, SQ_LOG_INFO, buf);

    InitializeConfigCluster();

    // Derive the element count from the array itself rather than assuming
    // the element type is int.
    for (size_t j=0; j<(sizeof(agElapsed_)/sizeof(agElapsed_[0])); ++j)
    {
        agElapsed_[j] = 0;
    }

    char *p = getenv("MON_MIN_RECV_COUNT");
    if ( p )
    {
        // strtoul only sets errno on failure, so it must be cleared first;
        // otherwise a stale ERANGE from an earlier call would make a valid
        // value be ignored.
        errno = 0;
        long int val = strtoul(p, NULL, 10);
        if (errno != ERANGE)
        {
            minRecvCount_ = val;
        }
    }

    p = getenv("SQ_MON_NODE_DOWN_DEATH_MESSAGES");
    if ( p && atoi(p) == 0)
    {
        nodeDownDeathNotices_ = false;
    }

    // build the node objects & Sync collision assignment arrays
    // these buffers will be used in ShareWithPeers in AllGather
    // operation to get TMSync data as well as Replication data.
    // Allocate the maximum allowed so that we pay the price only once.
    // This wastes a bit of memory but reduces complexity when
    // adding and deleting nodes. Usage is based on GetConfigPNodesMax()
    // the maximum number that can be configured.
    recvBuffer_ = new struct sync_buffer_def[GetConfigPNodesMax()];
    recvBuffer2_ = new struct sync_buffer_def[GetConfigPNodesMax()];

    TRACE_EXIT;
}
// Release what the cluster object holds: the epoll, comm and sync
// descriptors (each only when it is not -1) and the arrays allocated with
// new[] for communication, node mapping and receive buffers.
CCluster::~CCluster (void)
{
    const char method_name[] = "CCluster::~CCluster";
    TRACE_ENTRY;

    // Descriptors: -1 means "never opened"
    if (epollFD_ != -1) { close( epollFD_ ); }
    if (commSock_ != -1) { close( commSock_ ); }
    if (syncSock_ != -1) { close( syncSock_ ); }

    // Per-node communication arrays
    delete [] comms_;
    delete [] otherMonRank_;
    delete [] socks_;
    delete [] sockPorts_;

    // delete [] on a NULL pointer is a no-op, so no guard is required
    delete [] nodeMap_;
    nodeMap_ = NULL;

    // Receive buffers
    delete [] recvBuffer2_;
    delete [] recvBuffer_;

    TRACE_EXIT;
}
// Advance the verifier counter and return the new value.
//
// The counter never goes negative: once it has reached INT_MAX (or if it
// somehow holds a negative value) the next value is 0. The wrap is done
// explicitly instead of incrementing past INT_MAX and testing for a negative
// result afterwards, because signed integer overflow is undefined behaviour.
// NOTE(review): assumes verifierNum_ is an int, matching the return type.
int CCluster::incrGetVerifierNum()
{
    if ( verifierNum_ < 0 || verifierNum_ >= INT_MAX )
    {
        verifierNum_ = 0;
    }
    else
    {
        ++verifierNum_;
    }

    return verifierNum_;
}
// For a reintegrated monitor node, following the first sync cycle, obtain the
// current sync cycle sequence number. And verify that all nodes agree
// on the sequence number.
//
// nodestate is indexed by pnid; the configured nodes are visited through
// indexToPnid_. Entries whose seq_num is not greater than 1 are ignored.
// The first remaining entry supplies the result and every later one is
// compared against it with assert. Returns 0 when no entry qualifies.
unsigned long long CCluster::EnsureAndGetSeqNum(cluster_state_def_t nodestate[])
{
    const char method_name[] = "CCluster::EnsureAndGetSeqNum";
    TRACE_ENTRY;

    unsigned long long seqNum = 0;

    for (int idx = 0; idx < GetConfigPNodesCount(); idx++)
    {
        const cluster_state_def_t &entry = nodestate[indexToPnid_[idx]];

        // State before this entry is evaluated
        if (trace_settings & TRACE_RECOVERY)
        {
            trace_printf("%s@%d nodestate[%d].seq_num=%lld, seqNum=%lld\n", method_name, __LINE__, idx, entry.seq_num, seqNum );
        }

        if (entry.seq_num > 1)
        {
            if (seqNum != 0)
            {
                assert(entry.seq_num == seqNum);
            }
            else
            {
                seqNum = entry.seq_num;
            }
        }

        // State after this entry is evaluated
        if (trace_settings & TRACE_RECOVERY)
        {
            trace_printf("%s@%d nodestate[%d].seq_num=%lld, seqNum=%lld\n", method_name, __LINE__, idx, entry.seq_num, seqNum );
        }
    }

    TRACE_EXIT;
    return seqNum;
}
void CCluster::HardNodeDown (int pnid, bool communicate_state)
{
#ifndef NAMESERVER_PROCESS
char port_fname[MAX_PROCESS_PATH];
char temp_fname[MAX_PROCESS_PATH];
#endif
CNode *node;
CLNode *lnode;
char buf[MON_STRING_BUF_SIZE];
const char method_name[] = "CCluster::HardNodeDown";
TRACE_ENTRY;
node = Nodes->GetNode(pnid);
if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY))
trace_printf( "%s@%d - pnid=%d, comm_state=%d, state=%s, isInQuiesceState=%d,"
" (local pnid=%d, state=%s, isInQuiesceState=%d, "
"shutdown level=%d)\n", method_name, __LINE__,
pnid, communicate_state, StateString(node->GetState()),
node->isInQuiesceState(),
MyPNID, StateString(MyNode->GetState()),
MyNode->isInQuiesceState(), MyNode->GetShutdownLevel() );
if (( MyPNID == pnid ) &&
( MyNode->GetState() == State_Down ||
MyNode->IsKillingNode() ) )
{
// we are coming down ... don't process it
if ( !IsRealCluster && MyNode->isInQuiesceState())
{
// in virtual env, this would be called after node quiescing,
// so continue with mark down processing.
}
else
{
return;
}
}
if ( (MyNode->GetShutdownLevel() != ShutdownLevel_Undefined) &&
(pnid != MyPNID) ) // some other node went down while shutdown was in progress
{
snprintf(buf, sizeof(buf), "[%s], Node failure during shutdown, down nid = %d\n", method_name, pnid);
mon_log_write(MON_CLUSTER_MARKDOWN_1, SQ_LOG_ERR, buf);
if (!waitForWatchdogExit_) // if WDT is not exiting
{
// bring down this node because TSE backup processes may not exit
// if the primary was on the node that went down.
ReqQueue.enqueueDownReq(MyPNID);
}
}
if ( communicate_state && pnid != MyPNID )
{
// just communicate the change and let the real node handle it.
node->SetChangeState( true );
return;
}
#ifndef NAMESERVER_PROCESS
if ( !Emulate_Down )
{
if( !IsRealCluster )
{
snprintf(port_fname, sizeof(port_fname), "%s/monitor.%d.port.%s",getenv("MPI_TMPDIR"),pnid,node->GetName());
}
else
{
// Remove the domain portion of the name if any
char short_node_name[MPI_MAX_PROCESSOR_NAME];
char str1[MPI_MAX_PROCESSOR_NAME];
memset( short_node_name, 0, MPI_MAX_PROCESSOR_NAME );
memset( str1, 0, MPI_MAX_PROCESSOR_NAME );
strcpy (str1, node->GetName() );
char *str1_dot = strchr( (char *) str1, '.' );
if ( str1_dot )
{
memcpy( short_node_name, str1, str1_dot - str1 );
}
else
{
strcpy (short_node_name, str1 );
}
snprintf(port_fname, sizeof(port_fname), "%s/monitor.port.%s",getenv("MPI_TMPDIR"),short_node_name);
}
sprintf(temp_fname, "%s.bak", port_fname);
remove(temp_fname);
rename(port_fname, temp_fname);
}
#endif
if (node->GetState() != State_Down || !node->isInQuiesceState())
{
snprintf(buf, sizeof(buf),
"[CCluster::HardNodeDown], Node %s (%d) is going down.\n",
node->GetName(), node->GetPNid());
mon_log_write(MON_CLUSTER_MARKDOWN_2, SQ_LOG_CRIT, buf);
node->SetKillingNode( true );
if ( MyPNID == pnid &&
(MyNode->GetState() == State_Up || MyNode->GetState() == State_Shutdown) &&
!MyNode->isInQuiesceState() )
{
STATE state = MyNode->GetState();
switch ( state )
{
case State_Up:
case State_Shutdown:
// do node quiescing and let HealthCheck thread know that quiescing has started
// setting internal state to 'quiesce' will prevent replicating process exits
// and reject normal shutdown requests in all nodes while we are quiescing.
if (!waitForWatchdogExit_) // if WDT is not exiting
{
MyNode->setQuiesceState();
HealthCheck.setState(MON_NODE_QUIESCE);
}
break;
default: // in all other states
if ( ! Emulate_Down )
{
// make sure no processes are alive if in the middle of re-integration
#ifndef NAMESERVER_PROCESS
node->KillAllDown();
#endif
snprintf(buf, sizeof(buf),
"[CCluster::HardNodeDown], Node %s (%d)is down.\n",
node->GetName(), node->GetPNid());
mon_log_write(MON_CLUSTER_MARKDOWN_3, SQ_LOG_ERR, buf);
// Don't generate a core file, abort is intentional
struct rlimit limit;
limit.rlim_cur = 0;
limit.rlim_max = 0;
setrlimit(RLIMIT_CORE, &limit);
MPI_Abort(MPI_COMM_SELF,99);
}
}
}
else
{
if ( node->GetPNid() == integratingPNid_ )
{
ResetIntegratingPNid();
}
#ifndef NAMESERVER_PROCESS
node->KillAllDown();
#endif
node->SetState( State_Down );
// Send node down message to local node's processes
lnode = node->GetFirstLNode();
for ( ; lnode; lnode = lnode->GetNextP() )
{
lnode->Down();
}
if ( ZClientEnabled )
{
ZClient->WatchNodeDelete( node->GetName() );
ZClient->WatchNodeMasterDelete( node->GetName() );
}
}
}
#ifndef NAMESERVER_PROCESS
// we need to abort any active TmSync
if (( MyNode->GetTmSyncState() == SyncState_Start ) ||
( MyNode->GetTmSyncState() == SyncState_Continue ) ||
( MyNode->GetTmSyncState() == SyncState_Commit ) )
{
MyNode->SetTmSyncState( SyncState_Abort );
Monitor->SetAbortPendingTmSync();
if (trace_settings & (TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
trace_printf("%s@%d - Node %s (pnid=%d) TmSyncState updated (%d)(%s)\n", method_name, __LINE__, MyNode->GetName(), MyPNID, MyNode->GetTmSyncState(), SyncStateString( MyNode->GetTmSyncState() ));
}
#endif
#ifndef NAMESERVER_PROCESS
if ( Emulate_Down )
{
AssignTmLeader(pnid, false);
}
else
#endif
{
AssignLeaders(pnid, node->GetName(), false);
}
TRACE_EXIT;
}
void CCluster::SoftNodeDown( int pnid )
{
CNode *node;
char buf[MON_STRING_BUF_SIZE];
const char method_name[] = "CCluster::SoftNodeDown";
TRACE_ENTRY;
node = Nodes->GetNode(pnid);
if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY))
{
trace_printf( "%s@%d - pnid=%d, state=%s, phase=%s, isInQuiesceState=%d, isSoftNodeDown=%d"
" (local pnid=%d, state=%s, phase=%s, isInQuiesceState=%d, isSoftNodeDown=%d "
"shutdown level=%d)\n"
, method_name, __LINE__
, pnid, StateString(node->GetState())
, NodePhaseString(node->GetPhase())
, node->isInQuiesceState()
, node->IsSoftNodeDown()
, MyPNID, StateString(MyNode->GetState())
, NodePhaseString(MyNode->GetPhase())
, MyNode->isInQuiesceState()
, MyNode->IsSoftNodeDown()
, MyNode->GetShutdownLevel() );
}
if (( MyPNID == pnid ) &&
( MyNode->GetState() == State_Down ||
MyNode->IsKillingNode() ) )
{
// we are coming down ... don't process it
return;
}
snprintf( buf, sizeof(buf)
, "[%s], Node %s (%d) is going soft down.\n"
, method_name, node->GetName(), node->GetPNid());
mon_log_write(MON_CLUSTER_SOFTNODEDOWN_1, SQ_LOG_ERR, buf);
node->SetKillingNode( true );
if ( node->GetState() == State_Up )
{
node->SetSoftNodeDown(); // Set soft down flag
node->SetPhase( Phase_SoftDown ); // Suspend TMSync on node
if ( node->GetPNid() == MyPNID )
{
// and tell remote monitor processes the node is soft down
CReplSoftNodeDown *repl = new CReplSoftNodeDown( MyPNID );
Replicator.addItem(repl);
}
#ifndef NAMESERVER_PROCESS
node->KillAllDownSoft(); // Kill all processes
#endif
snprintf( buf, sizeof(buf)
, "[%s], Node %s (%d) executed soft down.\n"
, method_name, node->GetName(), node->GetPNid() );
mon_log_write(MON_CLUSTER_SOFTNODEDOWN_2, SQ_LOG_ERR, buf);
}
else
{
snprintf( buf, sizeof(buf),
"[%s], Node %s (%d) soft node down not executed, state=%s\n"
, method_name, node->GetName()
, node->GetPNid()
, StateString(MyNode->GetState()) );
mon_log_write(MON_CLUSTER_SOFTNODEDOWN_3, SQ_LOG_ERR, buf);
// Probably a programmer bonehead!
abort();
}
#ifndef NAMESERVER_PROCESS
// we need to abort any active TmSync
if (( MyNode->GetTmSyncState() == SyncState_Start ) ||
( MyNode->GetTmSyncState() == SyncState_Continue ) ||
( MyNode->GetTmSyncState() == SyncState_Commit ) )
{
MyNode->SetTmSyncState( SyncState_Abort );
Monitor->SetAbortPendingTmSync();
if (trace_settings & (TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
trace_printf("%s@%d - Node %s (pnid=%d) TmSyncState updated (%d)(%s)\n", method_name, __LINE__, MyNode->GetName(), MyPNID, MyNode->GetTmSyncState(), SyncStateString( MyNode->GetTmSyncState() ));
}
#endif
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
{
trace_printf( "%s@%d - Node pnid=%d (%s), phase=%s, isSoftNodeDown=%d\n"
, method_name, __LINE__
, node->GetPNid()
, node->GetName()
, NodePhaseString(node->GetPhase())
, node->IsSoftNodeDown());
}
AssignLeaders(pnid, node->GetName(), false);
TRACE_EXIT;
}
bool CCluster::CheckSpareSet( int pnid )
{
bool activatedSpare = false;
bool done = false;
unsigned int ii;
unsigned int jj;
CNode *newNode = Nodes->GetNode( pnid );
const char method_name[] = "CCluster::CheckSpareSet";
TRACE_ENTRY;
// Build spare node set
CNode *spareNode;
NodesList spareSetList;
NodesList *spareNodesConfigList = Nodes->GetSpareNodesConfigList();
NodesList::iterator itSn;
for ( itSn = spareNodesConfigList->begin();
itSn != spareNodesConfigList->end() && !done ; itSn++ )
{
spareNode = *itSn;
PNidVector sparePNids = spareNode->GetSparePNids();
// if the new node is a spare node in the configuration
if ( newNode->GetPNid() == spareNode->GetPNid() )
{
// Add the spare node and each node it is configured to spare to the set
spareSetList.push_back( spareNode );
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
trace_printf("%s@%d - pnid=%d, name=(%s) is a configured Spare\n", method_name, __LINE__, spareNode->GetPNid(), spareNode->GetName());
for ( ii = 0; ii < sparePNids.size(); ii++ )
{
spareSetList.push_back( Nodes->GetNode(sparePNids[ii]) );
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
trace_printf("%s@%d - pnid=%d, name=(%s) is in Spare set\n", method_name, __LINE__, Nodes->GetNode(sparePNids[ii])->GetPNid(), Nodes->GetNode(sparePNids[ii])->GetName());
}
done = true;
}
else
{
// Check each pnid it is configured to spare
for ( jj = 0; jj < sparePNids.size(); jj++ )
{
// if the new node is in the spare set of a spare node
if ( newNode->GetPNid() == sparePNids[jj] )
{
// Add the spare node and each node it is configured to spare to the set
spareSetList.push_back( spareNode );
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
trace_printf("%s@%d - pnid=%d, name=(%s) is a configured Spare\n", method_name, __LINE__, spareNode->GetPNid(), spareNode->GetName());
for ( ii = 0; ii < sparePNids.size(); ii++ )
{
spareSetList.push_back( Nodes->GetNode(sparePNids[ii]) );
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
trace_printf("%s@%d - pnid=%d, name=(%s) is in Spare set\n", method_name, __LINE__, Nodes->GetNode(sparePNids[ii])->GetPNid(), Nodes->GetNode(sparePNids[ii])->GetName());
}
done = true;
}
}
}
}
if (newNode && trace_settings & (TRACE_INIT | TRACE_RECOVERY))
{
trace_printf( "%s@%d - new node pnid=%d, name=(%s), zid=%d\n"
, method_name, __LINE__
, newNode->GetPNid(), newNode->GetName(), newNode->GetZone());
}
// if the newNode still owns the zone
if ( newNode && newNode->GetZone() != -1 )
{
// assume implicit spare node activation
// (no need to move logical nodes to physical node)
// since HardNodeUp() already set State_Up,
// just reset spare node flag and remove from available spare nodes
newNode->ResetSpareNode();
Nodes->RemoveFromSpareNodesList( newNode );
ActivateSpare( newNode, newNode );
activatedSpare = true;
TRACE_EXIT;
return( activatedSpare );
}
CLNode *lnode;
CNode *node;
CNode *downNode = NULL;
// Now check the state of each configured logical node in the set for down state
spareNode = newNode; // new node (pnid) is the spare to activate
NodesList::iterator itSs;
for ( itSs = spareSetList.begin(); itSs != spareSetList.end(); itSs++ )
{
node = *itSs;
if ( node->GetPNid() != pnid )
{
// Find the first down node
if ( !downNode )
{
lnode = node->GetFirstLNode();
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
trace_printf( "%s@%d - node nid=%d, pnid=%d(%s), state=%s\n"
, method_name, __LINE__, lnode?lnode->GetNid():-1
, node->GetPNid(), node->GetName()
, StateString( node->GetState() ) );
if ( lnode && lnode->GetState() == State_Down )
{
downNode = node;
}
}
}
if ( spareNode && downNode )
{
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
trace_printf( "%s@%d - spare node pnid=%d (%s), down node pnid=%d (%s) \n"
, method_name, __LINE__
, spareNode->GetPNid(), spareNode->GetName()
, downNode->GetPNid(), downNode->GetName());
break;
}
}
if ( spareNode && downNode )
{
Nodes->RemoveFromSpareNodesList( spareNode );
spareNode->ResetSpareNode();
if ( downNode->GetPNid() != pnid )
{ // the spare node does not own the down logical nodes so activate it
ActivateSpare( spareNode, downNode );
}
activatedSpare = true;
}
TRACE_EXIT;
return( activatedSpare );
}
// Maps a JOINING_PHASE value to its printable name, for trace and log
// output. Values without a dedicated case yield "JoiningPhase - Undefined".
const char *JoiningPhaseString( JOINING_PHASE phase )
{
    switch( phase )
    {
        case JoiningPhase_Unknown: return( "JoiningPhase_Unknown" );
        case JoiningPhase_1:       return( "JoiningPhase_1" );
        case JoiningPhase_2:       return( "JoiningPhase_2" );
        case JoiningPhase_3:       return( "JoiningPhase_3" );
        default:                   break;
    }

    return( "JoiningPhase - Undefined" );
}
///////////////////////////////////////////////////////////////////////////////
// CCluster::JoinMessage
//
// Builds a "node joining" notice message.
//
// Parameters:
//   node_name - name of the joining node
//   pnid      - physical node id of the joining node
//   phase     - joining phase to report
//
// Returns:
//   A message_def allocated with new; the caller takes ownership.
///////////////////////////////////////////////////////////////////////////////
struct message_def *CCluster::JoinMessage( const char *node_name, int pnid, JOINING_PHASE phase )
{
    struct message_def *msg;

    const char method_name[] = "CCluster::JoinMessage";
    TRACE_ENTRY;

    // Record statistics (sonar counters)
    if (sonar_verify_state(SONAR_ENABLED | SONAR_MONITOR_ENABLED))
       MonStats->notice_death_Incr();

    msg = new struct message_def;
    msg->type = MsgType_NodeJoining;
    msg->noreply = true;
    msg->u.request.type = ReqType_Notice;
    // Bounded copy: the destination is a fixed-size field inside the
    // message, so never write past it and always leave it NUL terminated.
    strncpy( msg->u.request.u.joining.node_name, node_name,
             sizeof(msg->u.request.u.joining.node_name) - 1 );
    msg->u.request.u.joining.node_name[sizeof(msg->u.request.u.joining.node_name) - 1] = '\0';
    msg->u.request.u.joining.pnid = pnid;
    msg->u.request.u.joining.phase = phase;

    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST_DETAIL))
        trace_printf("%s@%d - Joining notice for node %s (pnid=%d, phase=%d)\n",
                     method_name, __LINE__, node_name, pnid, phase );

    TRACE_EXIT;

    return msg;
}
///////////////////////////////////////////////////////////////////////////////
// CCluster::SpareUpMessage
//
// Builds a "spare node up" notice message.
//
// Parameters:
//   node_name - name of the spare node
//   pnid      - physical node id of the spare node
//
// Returns:
//   A message_def allocated with new; the caller takes ownership.
///////////////////////////////////////////////////////////////////////////////
struct message_def *CCluster::SpareUpMessage( const char *node_name, int pnid )
{
    struct message_def *msg;

    const char method_name[] = "CCluster::SpareUpMessage";
    TRACE_ENTRY;

    // Record statistics (sonar counters)
    if (sonar_verify_state(SONAR_ENABLED | SONAR_MONITOR_ENABLED))
       MonStats->notice_death_Incr();

    msg = new struct message_def;
    msg->type = MsgType_SpareUp;
    msg->noreply = true;
    msg->u.request.type = ReqType_Notice;
    // Bounded copy: the destination is a fixed-size field inside the
    // message, so never write past it and always leave it NUL terminated.
    strncpy( msg->u.request.u.spare_up.node_name, node_name,
             sizeof(msg->u.request.u.spare_up.node_name) - 1 );
    msg->u.request.u.spare_up.node_name[sizeof(msg->u.request.u.spare_up.node_name) - 1] = '\0';
    msg->u.request.u.spare_up.pnid = pnid;

    // The value traced is the physical node id, so label it pnid
    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST_DETAIL))
        trace_printf("%s@%d - Spare node up notice for node %s pnid=%d\n",
                     method_name, __LINE__, node_name, pnid );

    TRACE_EXIT;

    return msg;
}
///////////////////////////////////////////////////////////////////////////////
// CCluster::ReIntegErrorMessage
//
// Builds a "reintegration error" notice message carrying msgText.
//
// Parameters:
//   msgText - error text; truncated if it does not fit the message field
//
// Returns:
//   A message_def allocated with new; the caller takes ownership.
///////////////////////////////////////////////////////////////////////////////
struct message_def *CCluster::ReIntegErrorMessage( const char *msgText )
{
    struct message_def *msg;

    const char method_name[] = "CCluster::ReIntegErrorMessage";
    TRACE_ENTRY;

    msg = new struct message_def;
    msg->type = MsgType_ReintegrationError;
    msg->noreply = true;
    msg->u.request.type = ReqType_Notice;
    // strncpy does not terminate the destination when the source fills it,
    // so copy at most size-1 bytes and terminate explicitly.
    strncpy( msg->u.request.u.reintegrate.msg, msgText,
             sizeof(msg->u.request.u.reintegrate.msg) - 1 );
    msg->u.request.u.reintegrate.msg[sizeof(msg->u.request.u.reintegrate.msg) - 1] = '\0';

    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST_DETAIL))
        trace_printf("%s@%d - Reintegrate notice %s\n",
                     method_name, __LINE__, msgText );

    TRACE_EXIT;

    return msg;
}
///////////////////////////////////////////////////////////////////////////////
// CCluster::HardNodeUp
//
// Advances a physical node through the "hard" node-up sequence, based on the
// node's current state.
//
// Parameters:
//   pnid      - physical node id, or -1 to look the node up by node_name
//   node_name - node name; used for the lookup when pnid is -1 and for tracing
//
// Returns:
//   MPI_ERR_NAME - no node matches pnid/node_name
//   otherwise    - MPI_SUCCESS, unless the State_Merged path called
//                  ZClient->WatchNode(), in which case that call's result
//
// State handling (nothing is done when the node is already State_Up):
//   State_Down    - only acted on when Emulate_Down is set: the node is set
//                   to State_Up and either restarted (local node) or its
//                   logical nodes are prepared/brought up (remote node).
//   State_Merged  - node moves to State_Joining; the creator monitor queues a
//                   snapshot request, the local node queues a revive request,
//                   other monitors set a ZClient watch when enabled.
//   State_Joining - node moves to State_Up and is either activated directly
//                   (no spares configured, local node) or added to the
//                   available spares and checked against its spare set.
///////////////////////////////////////////////////////////////////////////////
int CCluster::HardNodeUp( int pnid, char *node_name )
{
bool spareNodeActivated = false;
int rc = MPI_SUCCESS;
int tmCount = 0;
CNode *node;
CLNode *lnode;
STATE nodeState;
const char method_name[] = "CCluster::HardNodeUp";
TRACE_ENTRY;
if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY))
trace_printf( "%s@%d - pnid=%d, name=%s (MyPNID = %d), currentNodes_=%d\n"
, method_name, __LINE__, pnid, node_name, MyPNID, currentNodes_ );
// A pnid of -1 means the caller identifies the node by name instead
if ( pnid == -1 )
{
node = Nodes->GetNode( node_name );
}
else
{
node = Nodes->GetNode( pnid );
}
if ( node == NULL )
{
if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY))
trace_printf( "%s@%d" " - Invalid node, pnid=%d, name=%s" "\n"
, method_name, __LINE__, pnid, node_name );
// NOTE(review): this early return does not issue TRACE_EXIT
return( MPI_ERR_NAME );
}
nodeState = node->GetState();
if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY))
trace_printf( "%s@%d" " - Node state=%s" "\n"
, method_name, __LINE__, StateString( nodeState ) );
// A node that is already up needs no processing
if ( nodeState != State_Up )
{
if ( nodeState == State_Down )
{
node->SetKillingNode( false );
if ( Emulate_Down )
{
// Any DTMs running?
// (the outer loop stops at the first physical node on which a DTM is found)
for ( int i=0; !tmCount && i < Nodes->GetPNodesCount(); i++ )
{
CNode *tempNode = Nodes->GetNodeByMap( i );
lnode = tempNode->GetFirstLNode();
for ( ; lnode; lnode = lnode->GetNextP() )
{
CProcess *process = lnode->GetProcessLByType( ProcessType_DTM );
if ( process ) tmCount++;
}
}
if ( tmCount )
{
IAmIntegrated = true;
}
// We need to remove any old process objects before we restart the node.
node->CleanUpProcesses();
node->SetState( State_Up );
// NOTE(review): this compares against the pnid argument, which is -1 when
// the node was looked up by name -- confirm callers on this path pass a pnid
if ( MyPNID == pnid )
{
MyNode->clearQuiesceState();
HealthCheck.initializeVars();
SMSIntegrating = true;
#ifndef NAMESERVER_PROCESS
Monitor->StartPrimitiveProcesses();
#endif
// Let other monitors know this node is up
CReplNodeUp *repl = new CReplNodeUp(MyPNID);
Replicator.addItem(repl);
}
else
{
if ( tmCount )
{
#ifndef NAMESERVER_PROCESS
// Send node prepare notice to local DTM processes
lnode = node->GetFirstLNode();
for ( ; lnode; lnode = lnode->GetNextP() )
{
lnode->PrepareForTransactions( true );
}
#endif
}
else
{
// Process logical node up
lnode = node->GetFirstLNode();
for ( ; lnode; lnode = lnode->GetNextP() )
{
lnode->Up();
}
}
}
}
else
{
// Without Emulate_Down a down node is only traced here; no state is changed
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
trace_printf( "%s@%d - Unexpectedly executing HardNodeUp. Expecting to do accept in commAccept thread\n",
method_name, __LINE__ );
}
}
else if ( nodeState == State_Merged )
{
node->SetKillingNode( false );
node->SetState( State_Joining );
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
{
trace_printf( "%s@%d" " - New monitor %s, pnid=%d, state=%s" "\n"
, method_name, __LINE__, node->GetName(), node->GetPNid(), StateString( node->GetState() ) );
// Dump the socket and port recorded for every mapped physical node
for ( int i =0; i < Nodes->GetPNodesCount(); i++ )
{
trace_printf( "%s@%d socks_[indexToPnid_[%d]=%d]=%d, sockPorts_[indexToPnid_[%d]=%d]=%d\n"
, method_name, __LINE__
, i, indexToPnid_[i], socks_[indexToPnid_[i]]
, i, indexToPnid_[i], sockPorts_[indexToPnid_[i]] );
}
}
if ( MyNode->IsCreator() )
{
#ifndef NAMESERVER_PROCESS
// Post a JoiningPhase_1 notice addressed to the creator process
SQ_theLocalIOToClient->putOnNoticeQueue( MyNode->GetCreatorPid()
, MyNode->GetCreatorVerifier()
, JoinMessage( node->GetName()
, node->GetPNid()
, JoiningPhase_1 )
, NULL);
#endif
// save the current seq num in the snapshot request.
// this sequence number will match the state of the cluster
// when this request is processed.
ReqQueue.enqueueSnapshotReq(seqNum_);
}
if ( MyPNID == pnid )
{
// request and process revive packet from the creator.
// when complete, this will call HardNodeUp again.
ReqQueue.enqueueReviveReq( );
}
else
{
if ( ZClientEnabled )
{
// rc is overwritten here, so the watch result is what this call returns
rc = ZClient->WatchNode( node->GetName() );
if ( rc != ZOK )
{
char buf[MON_STRING_BUF_SIZE];
snprintf( buf, sizeof(buf)
, "[%s], Unable to set node watch on %s, pnid%d\n"
, method_name, node->GetName(), node->GetPNid() );
mon_log_write(MON_CLUSTER_HARDNODEUP_1, SQ_LOG_ERR, buf);
}
}
}
}
else if ( nodeState == State_Joining )
{
// The new monitor comes in here first and schedules a node up request on all nodes.
// All other monitors come here next, including the creator.
// The new monitor will not come here again because
// CReplNodeUp is a noop for the one who schedules it.
node->SetState( State_Up );
if ( Nodes->GetSNodesCount() == 0 )
{ // Spare nodes not configured so bring up my logical nodes
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
trace_printf( "%s@%d" " - No spare nodes configured node=%s, pnid=%d, state=%s\n"
, method_name, __LINE__, node->GetName(), node->GetPNid()
, StateString(node->GetState()) );
if ( MyPNID == pnid )
{
ActivateSpare( node, node );
}
}
else
{
// Spares are configured: register the node as an available spare first
node->SetSpareNode();
Nodes->AddToSpareNodesList( node->GetPNid() );
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
trace_printf( "%s@%d" " - Adding to available spares node=%s, pnid=%d\n"
, method_name, __LINE__, node->GetName(), node->GetPNid() );
// Check for a node down in spare set and activate down node if found
spareNodeActivated = CheckSpareSet( node->GetPNid() );
if ( spareNodeActivated )
{
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
trace_printf( "%s@%d" " - Activated spare node=%s, pnid=%d\n"
, method_name, __LINE__, node->GetName(), node->GetPNid() );
}
else
{
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
trace_printf( "%s@%d" " - Available spare node=%s, pnid=%d\n"
, method_name, __LINE__, node->GetName(), node->GetPNid() );
// Spare node not activated
if ( MyNode->IsCreator() )
{
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
trace_printf( "%s@%d" " - Sending spare up notice to creator shell(%d) spare node=%s, pnid=%d\n"
, method_name, __LINE__, MyNode->GetCreatorPid(), node->GetName(), node->GetPNid() );
#ifndef NAMESERVER_PROCESS
// Tell creator spare node is up
SQ_theLocalIOToClient->putOnNoticeQueue( MyNode->GetCreatorPid()
, MyNode->GetCreatorVerifier()
, SpareUpMessage( node->GetName()
, node->GetPNid() )
, NULL);
#endif
}
}
}
if ( MyPNID == pnid )
{
// Any DTMs running?
// (the outer loop stops at the first physical node on which a DTM is found)
for ( int i=0; !tmCount && i < Nodes->GetPNodesCount(); i++ )
{
CNode *tempNode = Nodes->GetNodeByMap( i );
lnode = tempNode->GetFirstLNode();
for ( ; lnode; lnode = lnode->GetNextP() )
{
CProcess *process = lnode->GetProcessLByType( ProcessType_DTM );
if ( process ) tmCount++;
}
}
// The node up is replicated from here only when no DTM was found
// and no spare was activated above
if ( !tmCount && !spareNodeActivated )
{
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
trace_printf( "%s@%d" " - Replicating node up %s, pnid=%d, state=%s, spare=%d, DTM count=%d\n"
, method_name, __LINE__, node->GetName(), node->GetPNid()
, StateString(node->GetState()), node->IsSpareNode(), tmCount );
// Let other monitors know this node is up
CReplNodeUp *repl = new CReplNodeUp(MyPNID);
Replicator.addItem(repl);
}
}
ResetIntegratingPNid();
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
trace_printf( "%s@%d" " - New monitor %s, pnid=%d, state=%s, spare=%d\n"
, method_name, __LINE__, node->GetName(), node->GetPNid()
, StateString(node->GetState()), node->IsSpareNode() );
}
}
TRACE_EXIT;
return( rc );
}
int CCluster::SoftNodeUpPrepare( int pnid )
{
char buf[MON_STRING_BUF_SIZE];
int rc = MPI_SUCCESS;
int tmCount = 0;
CNode *node;
CLNode *lnode;
STATE nodeState;
const char method_name[] = "CCluster::SoftNodeUpPrepare";
TRACE_ENTRY;
node = Nodes->GetNode( pnid );
if ( node == NULL )
{
if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY))
trace_printf( "%s@%d - Invalid node, pnid=%d\n"
, method_name, __LINE__, pnid );
return( MPI_ERR_NAME );
}
nodeState = node->GetState();
if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY))
trace_printf( "%s@%d - Node name=%s, pnid=%d, state=%s, soft down=%d\n"
, method_name, __LINE__
, node->GetName()
, node->GetPNid()
, StateString( nodeState )
, node->IsSoftNodeDown() );
if ( nodeState != State_Up )
{
if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY))
trace_printf( "%s@%d - Unexpectedly executing SoftNodeUp\n",
method_name, __LINE__ );
// Programmer bonehead!
abort();
}
node->SetKillingNode( false );
node->ResetSoftNodeDown( );
node->SetPhase( Phase_Ready );
if ( MyPNID == pnid )
{
SMSIntegrating = true;
#ifndef NAMESERVER_PROCESS
node->SetSoftNodeUp( );
Monitor->StartPrimitiveProcesses();
#endif
// Let other monitors know this node is preparing to soft up
CReplSoftNodeUp *repl = new CReplSoftNodeUp(MyPNID);
Replicator.addItem(repl);
}
else
{
// Any DTMs running?
for ( int i=0; !tmCount && i < Nodes->GetPNodesCount(); i++ )
{
CNode *tempNode = Nodes->GetNodeByMap( i );
lnode = tempNode->GetFirstLNode();
for ( ; lnode; lnode = lnode->GetNextP() )
{
CProcess *process = lnode->GetProcessLByType( ProcessType_DTM );
if ( process ) tmCount++;
}
}
if ( tmCount )
{
#ifndef NAMESERVER_PROCESS
// Send DTM restarted notice to local DTM processes
lnode = node->GetFirstLNode();
for ( ; lnode; lnode = lnode->GetNextP() )
{
lnode->SendDTMRestarted();
}
#endif
}
else
{
snprintf( buf, sizeof(buf),
"[%s], Node %s (%d) soft node up prepare not executed, state=%s, tmCount=%d\n"
, method_name, node->GetName()
, node->GetPNid()
, StateString(MyNode->GetState())
, tmCount );
mon_log_write(MON_CLUSTER_SOFTNODEUP_1, SQ_LOG_WARNING, buf);
}
}
TRACE_EXIT;
return( rc );
}
// Maps a node STATE value to its printable name, for trace and log output.
// Values without a dedicated case yield "State - Undefined".
const char *StateString( STATE state)
{
    switch( state )
    {
        case State_Unknown:      return( "State_Unknown" );
        case State_Up:           return( "State_Up" );
        case State_Down:         return( "State_Down" );
        case State_Stopped:      return( "State_Stopped" );
        case State_Shutdown:     return( "State_Shutdown" );
        case State_Unlinked:     return( "State_Unlinked" );
        case State_Merging:      return( "State_Merging" );
        case State_Merged:       return( "State_Merged" );
        case State_Joining:      return( "State_Joining" );
        case State_Initializing: return( "State_Initializing" );
        default:                 break;
    }

    return( "State - Undefined" );
}
// Maps a SyncState value to its printable name, for trace and log output.
// Values without a dedicated case yield "SyncState - Undefined".
const char *SyncStateString( SyncState state)
{
    switch( state )
    {
        case SyncState_Null:      return( "SyncState_Null" );
        case SyncState_Start:     return( "SyncState_Start" );
        case SyncState_Continue:  return( "SyncState_Continue" );
        case SyncState_Abort:     return( "SyncState_Abort" );
        case SyncState_Commit:    return( "SyncState_Commit" );
        case SyncState_Suspended: return( "SyncState_Suspended" );
        default:                  break;
    }

    return( "SyncState - Undefined" );
}
#ifndef NAMESERVER_PROCESS
// Fills msg with an InternalType_Sync message built from sync, records it in
// tmSyncBuffer's message info, and writes an InternalType_Null end marker at
// the buffer's new msg_offset.
//
//   tmSyncBuffer - sync buffer whose msgInfo is updated
//   sync         - source of the sync fields (and data for SyncType_TmData)
//   msg          - message slot to fill
void CCluster::AddTmsyncMsg( struct sync_buffer_def *tmSyncBuffer
                           , struct sync_def *sync
                           , struct internal_msg_def *msg)
{
    const char method_name[] = "CCluster::AddTmsyncMsg";
    TRACE_ENTRY;

    if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
        trace_printf("%s@%d - Requesting SyncType=%d\n", method_name,
                     __LINE__, sync->type);

    // Header and fixed sync fields
    msg->type            = InternalType_Sync;
    msg->u.sync.type     = sync->type;
    msg->u.sync.pnid     = sync->pnid;
    msg->u.sync.syncnid  = sync->syncnid;
    msg->u.sync.tmleader = sync->tmleader;
    msg->u.sync.state    = sync->state;
    msg->u.sync.count    = sync->count;

    // Only TmData syncs carry a payload
    if ( sync->type == SyncType_TmData )
    {
        memmove (msg->u.sync.data, sync->data, sync->length);
    }
    msg->u.sync.length = sync->length;

    // Only a single "InternalType_Sync" msg may be present in the
    // SyncBuffer, otherwise we cause a collision.
    int replicatedSize = (MSG_HDR_SIZE + sizeof(sync_def) - MAX_SYNC_DATA
                          + sync->length );

    // Insert the message size into the message header
    msg->replSize = replicatedSize;

    tmSyncBuffer->msgInfo.msg_count = 1;
    tmSyncBuffer->msgInfo.msg_offset += replicatedSize;

    // Set end-of-buffer marker
    struct internal_msg_def *endMarker = (struct internal_msg_def *)
        &tmSyncBuffer->msg[tmSyncBuffer->msgInfo.msg_offset];
    endMarker->type = InternalType_Null;

    TRACE_EXIT;
}
#endif
#ifndef NAMESERVER_PROCESS
// Handles an internal device request for the logical device ldevName.
//
// Looks up the process associated with ldevName, finds the logical device
// (cloning it from the process when it is not known here) and, if the device
// is currently mounted and can be unmounted, mounts it for that process.
// A device that cannot be found or cloned is logged as an error.
void CCluster::DoDeviceReq(char * ldevName)
{
    const char method_name[] = "CCluster::DoDeviceReq";
    TRACE_ENTRY;

    CProcess *process;

    if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
        trace_printf("%s@%d - Internal device request for ldev %s\n",
                     method_name, __LINE__, ldevName);

    Nodes->GetLNode(ldevName, &process);

    if (process)
    {
        CLogicalDevice *ldev = Devices->GetLogicalDevice( ldevName );
        if ( !ldev )
        {   // The device name is not known on this node
            // we need to clone the device
            ldev = Devices->CloneDevice( process );
        }

        if ( !ldev )
        {
            char buf[MON_STRING_BUF_SIZE];
            snprintf(buf, sizeof(buf), "[%s], Can't find ldev %s.\n", method_name,
                    ldevName);
            mon_log_write(MON_CLUSTER_DODEVICEREQ_1, SQ_LOG_ERR, buf);
        }
        else
        {
            // The mount below is attempted only after a successful unmount
            bool unmounted = false;
            if ( ldev->Mounted() )
            {
                unmounted = ldev->UnMount( false );
                if (!unmounted && (trace_settings & TRACE_REQUEST))
                    trace_printf("%s@%d - Can't unmount device %s for "
                                 "process %s (%d, %d)\n", method_name,
                                 __LINE__, ldev->name(), process->GetName(),
                                 process->GetNid(), process->GetPid());
            }

            if ( unmounted )
            {
                bool mounted = ldev->Mount( process, false );
                if (trace_settings & TRACE_REQUEST)
                {
                    if (mounted)
                        trace_printf("%s@%d - Mounted device %s for process "
                                     "%s (%d, %d)\n", method_name, __LINE__,
                                     ldev->name(), process->GetName(),
                                     process->GetNid(), process->GetPid());
                    else
                        trace_printf("%s@%d - Can't mount device %s for "
                                     "process %s (%d, %d)\n", method_name,
                                     __LINE__, ldev->name(), process->GetName(),
                                     process->GetNid(), process->GetPid());
                }
            }
        }
    }
    else
    {
        if (trace_settings & TRACE_SYNC)
            trace_printf("%s@%d - Device processing but can't find device %s\n",
                         method_name, __LINE__, ldevName);
    }

    TRACE_EXIT;
}
#endif
#ifdef EXCHANGE_CPU_SCHEDULING_DATA
// Stores the scheduling data carried by recv_msg: node-wide values are saved
// on Node[PNid], then the proc_stats entries are saved on that node's
// logical nodes in list order (entry i goes to the i-th logical node).
void CCluster::SaveSchedData( struct internal_msg_def *recv_msg )
{
    const char method_name[] = "CCluster::SaveSchedData";
    TRACE_ENTRY;

    int nid = recv_msg->u.scheddata.PNid;

    // Node-wide values
    Node[nid]->SetNumCores( recv_msg->u.scheddata.processors );
    Node[nid]->SetFreeMemory( recv_msg->u.scheddata.memory_free );
    Node[nid]->SetFreeSwap( recv_msg->u.scheddata.swap_free );
    Node[nid]->SetFreeCache( recv_msg->u.scheddata.cache_free );
    Node[nid]->SetMemTotal( recv_msg->u.scheddata.memory_total );
    Node[nid]->SetMemActive( recv_msg->u.scheddata.memory_active );
    Node[nid]->SetMemInactive( recv_msg->u.scheddata.memory_inactive );
    Node[nid]->SetMemDirty( recv_msg->u.scheddata.memory_dirty );
    Node[nid]->SetMemWriteback( recv_msg->u.scheddata.memory_writeback );
    Node[nid]->SetMemVMallocUsed( recv_msg->u.scheddata.memory_VMallocUsed );
    Node[nid]->SetBTime( recv_msg->u.scheddata.btime );

    // Per-logical-node CPU statistics
    int statIdx = 0;
    CLNode *lnode = Node[nid]->GetFirstLNode();
    while ( lnode )
    {
        lnode->SetCpuUser(recv_msg->u.scheddata.proc_stats[statIdx].cpu_user);
        lnode->SetCpuNice(recv_msg->u.scheddata.proc_stats[statIdx].cpu_nice);
        lnode->SetCpuSystem(recv_msg->u.scheddata.proc_stats[statIdx].cpu_system);
        lnode->SetCpuIdle(recv_msg->u.scheddata.proc_stats[statIdx].cpu_idle);
        lnode->SetCpuIowait(recv_msg->u.scheddata.proc_stats[statIdx].cpu_iowait);
        lnode->SetCpuIrq(recv_msg->u.scheddata.proc_stats[statIdx].cpu_irq);
        lnode->SetCpuSoftIrq(recv_msg->u.scheddata.proc_stats[statIdx].cpu_soft_irq);

        ++statIdx;
        lnode = lnode->GetNextP();
    }

    TRACE_EXIT;
}
#endif
void CCluster::HandleOtherNodeMsg (struct internal_msg_def *recv_msg,
int pnid)
{
const char method_name[] = "CCluster::HandleOtherNodeMsg";
TRACE_ENTRY;
CNode *downNode;
CNode *spareNode;
#ifndef NAMESERVER_PROCESS
CProcess *process;
CLNode *lnode;
#endif
switch (recv_msg->type)
{
case InternalType_Null:
if (trace_settings & TRACE_SYNC_DETAIL)
trace_printf("%s@%d - Node n%d has nothing to "
"update. \n", method_name, __LINE__, pnid);
break;
case InternalType_ActivateSpare:
if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
trace_printf("%s@%d - Internal activate spare request, spare pnid=%d, down pnid=%d\n"
, method_name, __LINE__
, recv_msg->u.activate_spare.spare_pnid
, recv_msg->u.activate_spare.down_pnid);
downNode = NULL;
if ( recv_msg->u.activate_spare.down_pnid != -1 )
{
downNode = Nodes->GetNode( recv_msg->u.activate_spare.down_pnid );
}
spareNode = Nodes->GetNode( recv_msg->u.activate_spare.spare_pnid );
ReqQueue.enqueueActivateSpareReq( spareNode, downNode );
break;
case InternalType_NameServerAdd:
if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
trace_printf( "%s@%d - Internal NameServer add request for node_name=%s\n"
, method_name, __LINE__
, recv_msg->u.nameserver_add.node_name );
// Queue the nameserver add request for processing by a worker thread.
ReqQueue.enqueueNameServerAddReq( recv_msg->u.nameserver_add.req_nid
, recv_msg->u.nameserver_add.req_pid
, recv_msg->u.nameserver_add.req_verifier
, recv_msg->u.nameserver_add.node_name );
break;
case InternalType_NameServerDelete:
if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
trace_printf( "%s@%d - Internal NameServer delete request for node=%s\n"
, method_name, __LINE__, recv_msg->u.nameserver_delete.node_name);
// Queue the nameserver delete request for processing by a worker thread.
ReqQueue.enqueueNameServerDeleteReq( recv_msg->u.nameserver_delete.req_nid
, recv_msg->u.nameserver_delete.req_pid
, recv_msg->u.nameserver_delete.req_verifier
, recv_msg->u.nameserver_delete.node_name );
break;
case InternalType_NodeAdd:
if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
trace_printf( "%s@%d - Internal node add request for node_name=%s, "
"first_core=%d, last_core=%d, "
"processors=%d, roles=%d\n"
, method_name, __LINE__
, recv_msg->u.node_add.node_name
, recv_msg->u.node_add.first_core
, recv_msg->u.node_add.last_core
, recv_msg->u.node_add.processors
, recv_msg->u.node_add.roles );
// Queue the node add request for processing by a worker thread.
ReqQueue.enqueueNodeAddReq( recv_msg->u.node_add.req_nid
, recv_msg->u.node_add.req_pid
, recv_msg->u.node_add.req_verifier
, recv_msg->u.node_add.node_name
, recv_msg->u.node_add.first_core
, recv_msg->u.node_add.last_core
, recv_msg->u.node_add.processors
, recv_msg->u.node_add.roles );
break;
case InternalType_Clone:
if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
trace_printf("%s@%d - Internal clone request, process (%d, %d)"
" %s\n", method_name, __LINE__,
recv_msg->u.clone.nid, recv_msg->u.clone.os_pid,
(recv_msg->u.clone.backup?" Backup":""));
ReqQueue.enqueueCloneReq( &recv_msg->u.clone );
break;
#ifndef NAMESERVER_PROCESS
case InternalType_Device:
ReqQueue.enqueueDeviceReq(recv_msg->u.device.ldev_name);
break;
#endif
case InternalType_Shutdown:
if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
trace_printf("%s@%d - Internal shutdown request for level=%d\n", method_name, __LINE__, recv_msg->u.shutdown.level);
// Queue the shutdown request for processing by a worker thread.
ReqQueue.enqueueShutdownReq( recv_msg->u.shutdown.level );
break;
case InternalType_NodeDelete:
if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
trace_printf( "%s@%d - Internal node delete request for pnid=%d\n"
, method_name, __LINE__, recv_msg->u.node_delete.pnid);
// Queue the node delete request for processing by a worker thread.
ReqQueue.enqueueNodeDeleteReq( recv_msg->u.node_delete.req_nid
, recv_msg->u.node_delete.req_pid
, recv_msg->u.node_delete.req_verifier
, recv_msg->u.node_delete.pnid );
break;
case InternalType_Down:
if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
trace_printf("%s@%d - Internal down node request for pnid=%d\n", method_name, __LINE__, recv_msg->u.down.pnid);
// Queue the node down request for processing by a worker thread.
ReqQueue.enqueueDownReq( recv_msg->u.down.pnid );
break;
case InternalType_NodeName:
if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
trace_printf("%s@%d - Internal node name request (%s to %s)\n", method_name, __LINE__, recv_msg->u.node_name.current_name, recv_msg->u.node_name.new_name);
// Queue the node name request for processing by a worker thread.
ReqQueue.enqueueNodeNameReq( recv_msg->u.node_name.req_nid
, recv_msg->u.node_name.req_pid
, recv_msg->u.node_name.req_verifier
, recv_msg->u.node_name.current_name
, recv_msg->u.node_name.new_name );
break;
case InternalType_SoftNodeDown:
if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
trace_printf("%s@%d - Internal soft node down request for pnid=%d\n", method_name, __LINE__, recv_msg->u.down.pnid);
// Queue the node down request for processing by a worker thread.
ReqQueue.enqueueSoftNodeDownReq( recv_msg->u.down.pnid );
break;
case InternalType_SoftNodeUp:
if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
trace_printf("%s@%d - Internal soft node up request for pnid=%d\n", method_name, __LINE__, recv_msg->u.up.pnid);
// Queue the node up request for processing by a worker thread.
ReqQueue.enqueueSoftNodeUpReq( recv_msg->u.up.pnid );
break;
case InternalType_Up:
if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
trace_printf("%s@%d - Internal up node request for pnid=%d\n", method_name, __LINE__, recv_msg->u.up.pnid);
// Queue the node up request for processing by a worker thread.
ReqQueue.enqueueUpReq( recv_msg->u.up.pnid, NULL, -1 );
break;
#ifndef NAMESERVER_PROCESS
case InternalType_Dump:
if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
trace_printf("%s@%d - Internal dump request for nid=%d, pid=%d\n",
method_name, __LINE__,
recv_msg->u.dump.nid, recv_msg->u.dump.pid);
lnode = Nodes->GetLNode( recv_msg->u.dump.nid );
if ( lnode )
{
process = lnode->GetProcessL(recv_msg->u.dump.pid);
if (process)
{
int verifier = recv_msg->u.dump.verifier;
if ( (verifier == -1) || (verifier == process->GetVerifier()) )
{
process->DumpBegin(recv_msg->u.dump.dumper_nid,
recv_msg->u.dump.dumper_pid,
recv_msg->u.dump.dumper_verifier,
recv_msg->u.dump.core_file);
}
else
{
char buf[MON_STRING_BUF_SIZE];
snprintf(buf, sizeof(buf), "[%s], Can't find process nid=%d, "
"pid=%d, verifier=%d for dump target.\n", method_name,
recv_msg->u.dump.nid, recv_msg->u.dump.pid,
recv_msg->u.dump.verifier);
mon_log_write(MON_CLUSTER_HANDLEOTHERNODE_1, SQ_LOG_ERR, buf);
}
}
else
{
char buf[MON_STRING_BUF_SIZE];
snprintf(buf, sizeof(buf), "[%s], Can't find process nid=%d, "
"pid=%d for dump target.\n", method_name,
recv_msg->u.dump.nid, recv_msg->u.dump.pid);
mon_log_write(MON_CLUSTER_HANDLEOTHERNODE_2, SQ_LOG_ERR, buf);
}
}
break;
case InternalType_DumpComplete:
if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
trace_printf("%s@%d - Internal dump-complete request for nid=%d, pid=%d\n",
method_name, __LINE__,
recv_msg->u.dump.nid, recv_msg->u.dump.pid);
lnode = Nodes->GetLNode( recv_msg->u.dump.nid );
if ( lnode )
{
process = lnode->GetProcessL(recv_msg->u.dump.pid);
if (process)
{
int verifier = recv_msg->u.dump.verifier;
if ( (verifier == -1) || (verifier == process->GetVerifier()) )
{
process->DumpEnd(recv_msg->u.dump.status, recv_msg->u.dump.core_file);
}
else
{
char buf[MON_STRING_BUF_SIZE];
snprintf(buf, sizeof(buf), "[%s], Can't find process nid=%d, "
"pid=%d, verifier=%d for dump target.\n", method_name,
recv_msg->u.dump.nid, recv_msg->u.dump.pid,
recv_msg->u.dump.verifier);
mon_log_write(MON_CLUSTER_HANDLEOTHERNODE_3, SQ_LOG_ERR, buf);
}
}
else
{
// Dump completion handled in CProcess::Exit()
char buf[MON_STRING_BUF_SIZE];
snprintf(buf, sizeof(buf), "[%s], Can't find process nid=%d, "
"pid=%d for dump complete target.\n", method_name,
recv_msg->u.dump.nid, recv_msg->u.dump.pid);
mon_log_write(MON_CLUSTER_HANDLEOTHERNODE_4, SQ_LOG_ERR, buf);
}
}
break;
#endif
case InternalType_Exit:
if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
trace_printf("%s@%d - Internal exit request for %s (%d, %d)\n", method_name, __LINE__, recv_msg->u.exit.name, recv_msg->u.exit.nid, recv_msg->u.exit.pid);
#ifndef NAMESERVER_PROCESS
ReqQueue.enqueueExitReq( &recv_msg->u.exit );
#else
ReqQueue.enqueueExitNsReq( &recv_msg->u.exit_ns );
#endif
break;
#ifndef NAMESERVER_PROCESS
case InternalType_Event:
if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
trace_printf("%s@%d - Internal event request\n", method_name, __LINE__);
if ( MyNode->IsMyNode(recv_msg->u.event.nid) )
{
if (trace_settings & TRACE_SYNC)
trace_printf("%s@%d - processing event for (%d, %d)\n", method_name, __LINE__, recv_msg->u.event.nid, recv_msg->u.event.pid);
lnode = Nodes->GetLNode( recv_msg->u.event.nid );
if ( lnode )
{
process = lnode->GetProcessL(recv_msg->u.event.pid);
if (process)
{
int verifier = recv_msg->u.dump.verifier;
if ( (verifier == -1) || (verifier == process->GetVerifier()) )
{
process->GenerateEvent (recv_msg->u.event.event_id,
recv_msg->u.event.length,
&recv_msg->u.event.data);
}
else
{
char buf[MON_STRING_BUF_SIZE];
snprintf(buf, sizeof(buf), "[%s], Can't find process nid=%d, "
"pid=%d, verifier=%d for event=%d\n", method_name,
recv_msg->u.event.nid, recv_msg->u.event.pid,
recv_msg->u.event.verifier, recv_msg->u.event.event_id);
mon_log_write(MON_CLUSTER_HANDLEOTHERNODE_5, SQ_LOG_ERR, buf);
}
}
else
{
char buf[MON_STRING_BUF_SIZE];
snprintf(buf, sizeof(buf), "[%s], Can't find process nid"
"=%d, pid=%d for processing event.\n",
method_name,
recv_msg->u.event.nid, recv_msg->u.event.pid);
mon_log_write(MON_CLUSTER_HANDLEOTHERNODE_6, SQ_LOG_ERR,
buf);
}
}
}
break;
#endif
#ifndef NAMESERVER_PROCESS
case InternalType_IoData:
if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_REDIRECTION))
trace_printf("%s@%d - Internal IO data request\n", method_name, __LINE__);
if ( MyNode->IsMyNode(recv_msg->u.iodata.nid) )
{
if (trace_settings & (TRACE_SYNC | TRACE_REDIRECTION))
trace_printf("%s@%d - processing IO Data for (%d, %d)\n", method_name, __LINE__, recv_msg->u.iodata.nid, recv_msg->u.iodata.pid);
lnode = Nodes->GetLNode( recv_msg->u.iodata.nid );
if ( lnode )
{
process = lnode->GetProcessL(recv_msg->u.iodata.pid);
if (process)
{
int fd;
if (recv_msg->u.iodata.ioType == STDIN_DATA)
{
fd = process->FdStdin();
}
else
{
fd = process->FdStdout();
}
Redirector.disposeIoData(fd,
recv_msg->u.iodata.length,
recv_msg->u.iodata.data);
}
else
{
char buf[MON_STRING_BUF_SIZE];
snprintf(buf, sizeof(buf), "[%s], Can't find process nid"
"=%d, pid=%d for processing IO Data.\n",
method_name,
recv_msg->u.iodata.nid, recv_msg->u.iodata.pid);
mon_log_write(MON_CLUSTER_HANDLEOTHERNODE_7, SQ_LOG_ERR,
buf);
}
}
}
break;
#endif
#ifndef NAMESERVER_PROCESS
case InternalType_StdinReq:
if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
trace_printf("%s@%d - Internal STDIN request\n", method_name, __LINE__);
if ( !MyNode->IsMyNode(recv_msg->u.stdin_req.supplier_nid) )
{
break;
}
if (trace_settings & (TRACE_SYNC | TRACE_REDIRECTION))
trace_printf("%s@%d - stdin request from (%d,%d)"
", type=%d, for supplier (%d, %d)\n",
method_name, __LINE__,
recv_msg->u.stdin_req.nid,
recv_msg->u.stdin_req.pid,
recv_msg->u.stdin_req.reqType,
recv_msg->u.stdin_req.supplier_nid,
recv_msg->u.stdin_req.supplier_pid);
lnode = Nodes->GetLNode( recv_msg->u.stdin_req.nid );
if ( lnode == NULL )
{
break;
}
process = lnode->GetProcessL(recv_msg->u.stdin_req.pid);
if (process)
{
if (recv_msg->u.stdin_req.reqType == STDIN_REQ_DATA)
{
// Set up to forward stdin data to requester.
// Save file descriptor associated with stdin
// so can find the redirector object later.
CProcess *supProcess;
lnode = Nodes->GetLNode( recv_msg->u.stdin_req.supplier_nid );
if ( lnode )
{
supProcess = lnode->GetProcessL ( recv_msg->u.stdin_req.supplier_pid );
if (supProcess)
{
int fd;
fd = Redirector.stdinRemote(supProcess->infile(),
recv_msg->u.stdin_req.nid,
recv_msg->u.stdin_req.pid);
process->FdStdin(fd);
}
else
{
char buf[MON_STRING_BUF_SIZE];
snprintf(buf, sizeof(buf), "[%s], Can't find process "
"nid=%d, pid=%d for stdin data request.\n",
method_name,
recv_msg->u.stdin_req.supplier_nid,
recv_msg->u.stdin_req.supplier_pid);
mon_log_write(MON_CLUSTER_HANDLEOTHERNODE_8,
SQ_LOG_ERR, buf);
}
}
}
else if (recv_msg->u.stdin_req.reqType == STDIN_FLOW_OFF)
{
Redirector.stdinOff(process->FdStdin());
}
else if (recv_msg->u.stdin_req.reqType == STDIN_FLOW_ON)
{
Redirector.stdinOn(process->FdStdin());
}
}
else
{
char buf[MON_STRING_BUF_SIZE];
snprintf(buf, sizeof(buf), "[%s], Can't find process nid=%d, "
"pid=%d for stdin data request.\n", method_name,
recv_msg->u.stdin_req.nid,
recv_msg->u.stdin_req.pid);
mon_log_write(MON_CLUSTER_HANDLEOTHERNODE_9, SQ_LOG_INFO, buf);
}
break;
#endif
#ifndef NAMESERVER_PROCESS
case InternalType_Kill:
// Queue the kill request for processing by a worker thread.
if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
trace_printf("%s@%d - Internal kill request for (%d, %d), abort =%d\n", method_name, __LINE__, recv_msg->u.kill.nid, recv_msg->u.kill.pid, recv_msg->u.kill.persistent_abort);
ReqQueue.enqueueKillReq( &recv_msg->u.kill );
break;
#endif
case InternalType_Process:
if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
trace_printf("%s@%d - Internal process request\n", method_name, __LINE__);
if ( MyNode->IsMyNode(recv_msg->u.process.nid) )
{ // Need to create process on this node.
// Queue process creation request for handling by worker thread
#ifdef NAMESERVER_PROCESS
ReqQueue.enqueueNewProcNsReq( &recv_msg->u.process );
#endif
#ifndef NAMESERVER_PROCESS
ReqQueue.enqueueNewProcReq( &recv_msg->u.process );
#endif
}
break;
case InternalType_ProcessInit:
if ( MyNode->IsMyNode(recv_msg->u.processInit.origNid) )
{ // New process request originated on this node
ReqQueue.enqueueProcInitReq( &recv_msg->u.processInit );
}
break;
#ifndef NAMESERVER_PROCESS
case InternalType_Open:
if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
trace_printf("%s@%d - Internal open request for (%d, %d), opened (%d, %d)\n", method_name, __LINE__, recv_msg->u.open.nid, recv_msg->u.open.pid, recv_msg->u.open.opened_nid, recv_msg->u.open.opened_pid);
ReqQueue.enqueueOpenReq( &recv_msg->u.open );
break;
#endif
case InternalType_SchedData:
#ifdef EXCHANGE_CPU_SCHEDULING_DATA
SaveSchedData( recv_msg );
#endif
break;
case InternalType_Set:
if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
trace_printf("%s@%d - Internal set request\n", method_name, __LINE__);
ReqQueue.enqueueSetReq( &recv_msg->u.set );
break;
case InternalType_UniqStr:
if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
trace_printf("%s@%d - Internal unique string request\n", method_name, __LINE__);
ReqQueue.enqueueUniqStrReq( &recv_msg->u.uniqstr );
break;
#ifndef NAMESERVER_PROCESS
case InternalType_Sync:
if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_TMSYNC))
trace_printf("%s@%d - Internal sync request for"
" Node %s, pnid=%d, SyncType=%d\n",
method_name, __LINE__, Node[pnid]->GetName(), pnid,
recv_msg->u.sync.type);
switch (recv_msg->u.sync.type )
{
case SyncType_TmData:
if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
trace_printf("%s@%d - TMSYNC(TmData) on Node %s (pnid=%d), (phase=%d)\n", method_name, __LINE__, Node[pnid]->GetName(), pnid, MyNode->GetPhase());
if ( ! MyNode->IsSpareNode() && MyNode->GetPhase() != Phase_Ready )
{
MyNode->CheckActivationPhase();
}
if ( ! MyNode->IsSpareNode() && MyNode->GetPhase() == Phase_Ready )
{
if ( MyNode->GetTmSyncState() == SyncState_Null )
{
// Begin a Slave Sync Start
if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
trace_printf("%s@%d - Slave Sync Start on Node %s (pnid=%d)\n", method_name, __LINE__, Node[pnid]->GetName(), pnid);
tmSyncPNid_ = pnid;
Node[pnid]->SetTmSyncState( recv_msg->u.sync.state );
if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
{
trace_printf("%s@%d - Node %s (pnid=%d) TmSyncState updated (%d)(%s)\n", method_name, __LINE__, Node[pnid]->GetName(), pnid, Node[pnid]->GetTmSyncState(), SyncStateString( Node[pnid]->GetTmSyncState() ));
}
Monitor->CoordinateTmDataBlock( &recv_msg->u.sync );
}
else
{
if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
trace_printf("%s@%d - Sync State Collision! Node %s (pnid=%d) TmSyncState=(%d)(%s)\n", method_name, __LINE__, MyNode->GetName(), MyPNID, MyNode->GetTmSyncState(), SyncStateString( MyNode->GetTmSyncState()) );
if ( MyNode->GetTmSyncState() == SyncState_Continue )
{
if ( pnid > tmSyncPNid_ )
// highest node id will continue
{
// They take priority ... we abort
if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
trace_printf("%s@%d - Aborting Slave Sync Start on node %s (pnid=%d)\n", method_name, __LINE__, Node[Monitor->tmSyncPNid_]->GetName(), Monitor->tmSyncPNid_);
MyNode->SetTmSyncState( SyncState_Null );
if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
trace_printf("%s@%d - Node %s (pnid=%d) TmSyncState updated (%d)(%s)\n", method_name, __LINE__, MyNode->GetName(), MyPNID, MyNode->GetTmSyncState(), SyncStateString( MyNode->GetTmSyncState() ) );
Monitor->ReQueue_TmSync (false);
// Continue with other node's Slave TmSync Start request
if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
trace_printf("%s@%d - Slave Sync Start on node %s (pnid=%d)\n", method_name, __LINE__, Node[pnid]->GetName(), pnid);
tmSyncPNid_ = pnid;
Node[pnid]->SetTmSyncState( recv_msg->u.sync.state );
if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
{
trace_printf("%s@%d - Node %s (pnid=%d) TmSyncState updated (%d)(%s)\n", method_name, __LINE__, Node[pnid]->GetName(), pnid, Node[pnid]->GetTmSyncState(), SyncStateString( Node[pnid]->GetTmSyncState() ));
}
Monitor->CoordinateTmDataBlock (&recv_msg->u.sync);
}
}
else if ( MyNode->GetTmSyncState() == SyncState_Start )
{
// Check if they continue with Master Sync Start
if ( pnid > MyPNID )
// highest node id will continue
{
// They take priority ... we abort
if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
trace_printf("%s@%d - Aborted Master Sync Start\n", method_name, __LINE__);
MyNode->SetTmSyncState( SyncState_Null );
if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
trace_printf("%s@%d - Node %s (pnid=%d) TmSyncState updated (%d)(%s)\n", method_name, __LINE__, MyNode->GetName(), MyPNID, MyNode->GetTmSyncState(), SyncStateString( MyNode->GetTmSyncState() ) );
// Continue with other node's Slave TmSync Start request
if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
trace_printf("%s@%d - Slave Sync Start on node %s (pnid=%d)\n", method_name, __LINE__, Node[pnid]->GetName(), pnid);
tmSyncPNid_ = pnid;
Node[pnid]->SetTmSyncState( recv_msg->u.sync.state );
if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
{
trace_printf("%s@%d - Node %s (pnid=%d) TmSyncState updated (%d)(%s)\n", method_name, __LINE__, Node[pnid]->GetName(), pnid, Node[pnid]->GetTmSyncState(), SyncStateString( Node[pnid]->GetTmSyncState() ));
}
Monitor->CoordinateTmDataBlock (&recv_msg->u.sync);
}
else
{
// We continue and assume they abort
if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
trace_printf("%s@%d - Continuing with Master Sync Start\n", method_name, __LINE__);
}
}
else
{
if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
trace_printf("%s@%d - Invalid TmSync_State\n", method_name, __LINE__);
}
}
}
break;
case SyncType_TmSyncState:
if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
trace_printf("%s@%d - TMSYNC(TmSyncState) on Node %s (pnid=%d)\n", method_name, __LINE__, Node[pnid]->GetName(), pnid);
break;
default:
{
char buf[MON_STRING_BUF_SIZE];
snprintf(buf, sizeof(buf), "[%s], Unknown SyncType from pnid=%d.\n", method_name, pnid);
mon_log_write(MON_CLUSTER_HANDLEOTHERNODE_10, SQ_LOG_ERR, buf);
}
}
break;
#endif
default:
{
char buf[MON_STRING_BUF_SIZE];
snprintf(buf, sizeof(buf), "[%s], Unknown Internal message received, Physical Node=%d.\n", method_name, pnid);
mon_log_write(MON_CLUSTER_HANDLEOTHERNODE_11, SQ_LOG_ERR, buf);
}
}
TRACE_EXIT;
}
// CCluster::HandleMyNodeMsg
//
// Dispatches a single internal message on its type (recv_msg->type).
// NOTE(review): judging by the method name and the "completed replicating"
// trace texts, these are messages this node itself replicated -- confirm
// against the caller.
//
// Most message types are only traced here.  The exceptions are:
//   - NameServerAdd, NameServerDelete, NodeAdd, NodeDelete, NodeName,
//     Shutdown and Kill requests, which are queued on ReqQueue;
//   - Clone and Exit requests, which are queued on ReqQueue only in
//     NAMESERVER_PROCESS builds;
//   - Dump / DumpComplete, which call DumpBegin() / DumpEnd() on the target
//     process once it has been located and its verifier accepted;
//   - Sync(TmData), which records MyPNID in tmSyncPNid_ and may call
//     Monitor->CoordinateTmDataBlock().
//
// Parameters:
//   recv_msg - the internal message to process
//   pnid     - physical node id; used in trace/log text and, for Sync
//              messages, as an index into Node[]
void CCluster::HandleMyNodeMsg (struct internal_msg_def *recv_msg,
                                int pnid)
{
    const char method_name[] = "CCluster::HandleMyNodeMsg";
    TRACE_ENTRY;

#ifndef NAMESERVER_PROCESS
    CProcess *process;
    CLNode *lnode;
#endif

    if (trace_settings & TRACE_SYNC_DETAIL)
        trace_printf("%s@%d - Marking object as replicated, msg type=%d\n",
                     method_name, __LINE__, recv_msg->type);

    switch (recv_msg->type)
    {
    case InternalType_Null:
        if (trace_settings & TRACE_SYNC_DETAIL)
            trace_printf("%s@%d - Physical Node pnid=n%d has nothing to "
                         "update. \n", method_name, __LINE__, pnid);
        break;

    case InternalType_ActivateSpare:
        // Trace only
        if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
            trace_printf("%s@%d - Internal activate spare request, spare pnid=%d, down pnid=%d\n"
                        , method_name, __LINE__
                        , recv_msg->u.activate_spare.spare_pnid
                        , recv_msg->u.activate_spare.down_pnid);
        break;

    case InternalType_NameServerAdd:
        if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
            trace_printf( "%s@%d - Internal NameServer add request for node_name=%s\n"
                        , method_name, __LINE__
                        , recv_msg->u.nameserver_add.node_name );
        // Queue the nameserver add request for processing by a worker thread.
        ReqQueue.enqueueNameServerAddReq( recv_msg->u.nameserver_add.req_nid
                                        , recv_msg->u.nameserver_add.req_pid
                                        , recv_msg->u.nameserver_add.req_verifier
                                        , recv_msg->u.nameserver_add.node_name );
        break;

    case InternalType_NameServerDelete:
        if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
            trace_printf( "%s@%d - Internal NameServer delete request for node=%s\n"
                        , method_name, __LINE__, recv_msg->u.nameserver_delete.node_name);
        // Queue the nameserver delete request for processing by a worker thread.
        ReqQueue.enqueueNameServerDeleteReq( recv_msg->u.nameserver_delete.req_nid
                                           , recv_msg->u.nameserver_delete.req_pid
                                           , recv_msg->u.nameserver_delete.req_verifier
                                           , recv_msg->u.nameserver_delete.node_name );
        break;

    case InternalType_NodeAdd:
        if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
            trace_printf( "%s@%d - Internal node add request for node_name=%s, "
                          "first_core=%d, last_core=%d, "
                          "processors=%d, roles=%d\n"
                        , method_name, __LINE__
                        , recv_msg->u.node_add.node_name
                        , recv_msg->u.node_add.first_core
                        , recv_msg->u.node_add.last_core
                        , recv_msg->u.node_add.processors
                        , recv_msg->u.node_add.roles );
        // Queue the node add request for processing by a worker thread.
        ReqQueue.enqueueNodeAddReq( recv_msg->u.node_add.req_nid
                                  , recv_msg->u.node_add.req_pid
                                  , recv_msg->u.node_add.req_verifier
                                  , recv_msg->u.node_add.node_name
                                  , recv_msg->u.node_add.first_core
                                  , recv_msg->u.node_add.last_core
                                  , recv_msg->u.node_add.processors
                                  , recv_msg->u.node_add.roles );
        break;

    case InternalType_Clone:
#ifndef NAMESERVER_PROCESS
        // Monitor build: trace only
        if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
            trace_printf("%s@%d - Internal clone request, completed replicating process (%d, %d) %s\n", method_name, __LINE__, recv_msg->u.clone.nid, recv_msg->u.clone.os_pid, (recv_msg->u.clone.backup?" Backup":""));
#else
        // Name Server build: the clone request is always queued; only the
        // trace output depends on trace_settings.
        if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
            trace_printf("%s@%d - Internal clone request, process (%d, %d)"
                         " %s\n", method_name, __LINE__,
                         recv_msg->u.clone.nid, recv_msg->u.clone.os_pid,
                         (recv_msg->u.clone.backup?" Backup":""));
        ReqQueue.enqueueCloneReq( &recv_msg->u.clone );
#endif
        break;

#ifndef NAMESERVER_PROCESS
    case InternalType_Device:
        // Trace only
        if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
            trace_printf("%s@%d - Internal device request, completed device processing for ldev %s\n", method_name, __LINE__, recv_msg->u.device.ldev_name);
        break;
#endif

    case InternalType_Shutdown:
        if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
            trace_printf("%s@%d - Internal shutdown request for level=%d\n", method_name, __LINE__, recv_msg->u.shutdown.level);
        // Queue the shutdown request for processing by a worker thread.
        ReqQueue.enqueueShutdownReq( recv_msg->u.shutdown.level );
        break;

    case InternalType_NodeDelete:
        if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
            trace_printf( "%s@%d - Internal node delete request for pnid=%d\n"
                        , method_name, __LINE__, recv_msg->u.node_delete.pnid);
        // Queue the node delete request for processing by a worker thread.
        ReqQueue.enqueueNodeDeleteReq( recv_msg->u.node_delete.req_nid
                                     , recv_msg->u.node_delete.req_pid
                                     , recv_msg->u.node_delete.req_verifier
                                     , recv_msg->u.node_delete.pnid );
        break;

    case InternalType_Down:
        // Trace only
        if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
            trace_printf("%s@%d - Internal down node request for pnid=%d\n", method_name, __LINE__, recv_msg->u.down.pnid);
        break;

    case InternalType_NodeName:
        if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
            trace_printf("%s@%d - Internal node name request (%s to %s)\n", method_name, __LINE__, recv_msg->u.node_name.current_name, recv_msg->u.node_name.new_name);
        // Queue the node name request for processing by a worker thread.
        ReqQueue.enqueueNodeNameReq( recv_msg->u.node_name.req_nid
                                   , recv_msg->u.node_name.req_pid
                                   , recv_msg->u.node_name.req_verifier
                                   , recv_msg->u.node_name.current_name
                                   , recv_msg->u.node_name.new_name );
        break;

    case InternalType_SoftNodeDown:
        // Trace only
        if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
            trace_printf("%s@%d - Internal soft down node request for pnid=%d\n", method_name, __LINE__, recv_msg->u.down.pnid);
        break;

    case InternalType_SoftNodeUp:
        // Trace only
        if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
            trace_printf("%s@%d - Internal soft up node request for pnid=%d\n", method_name, __LINE__, recv_msg->u.up.pnid);
        break;

    case InternalType_Up:
        // Trace only
        if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
            trace_printf("%s@%d - Internal up node request for pnid=%d\n", method_name, __LINE__, recv_msg->u.up.pnid);
        break;

#ifndef NAMESERVER_PROCESS
    case InternalType_Dump:
        if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
            trace_printf("%s@%d - Internal dump request for nid=%d, pid=%d\n",
                         method_name, __LINE__,
                         recv_msg->u.dump.nid, recv_msg->u.dump.pid);
        // Locate the dump target; an unknown logical node is silently ignored.
        lnode = Nodes->GetLNode( recv_msg->u.dump.nid );
        if ( lnode )
        {
            process = lnode->GetProcessL(recv_msg->u.dump.pid);
            if (process)
            {
                // A verifier of -1 matches any process; otherwise it must
                // equal the verifier of the process that was found.
                int verifier = recv_msg->u.dump.verifier;
                if ( (verifier == -1) || (verifier == process->GetVerifier()) )
                {
                    process->DumpBegin(recv_msg->u.dump.dumper_nid,
                                       recv_msg->u.dump.dumper_pid,
                                       recv_msg->u.dump.dumper_verifier,
                                       recv_msg->u.dump.core_file);
                }
                else
                {
                    char buf[MON_STRING_BUF_SIZE];
                    snprintf(buf, sizeof(buf), "[%s], Can't find process nid=%d, "
                             "pid=%d, verifier=%d for dump target.\n", method_name,
                             recv_msg->u.dump.nid, recv_msg->u.dump.pid,
                             recv_msg->u.dump.verifier);
                    mon_log_write(MON_CLUSTER_HANDLEMYNODE_1, SQ_LOG_ERR, buf);
                }
            }
            else
            {
                char buf[MON_STRING_BUF_SIZE];
                snprintf(buf, sizeof(buf), "[%s], Can't find process nid=%d, "
                         "pid=%d for dump target.\n", method_name,
                         recv_msg->u.dump.nid, recv_msg->u.dump.pid);
                mon_log_write(MON_CLUSTER_HANDLEMYNODE_2, SQ_LOG_ERR, buf);
            }
        }
        break;

    case InternalType_DumpComplete:
        if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
            trace_printf("%s@%d - Internal dump-complete request for nid=%d, pid=%d\n",
                         method_name, __LINE__,
                         recv_msg->u.dump.nid, recv_msg->u.dump.pid);
        // Same target lookup and verifier rule as InternalType_Dump.
        lnode = Nodes->GetLNode( recv_msg->u.dump.nid );
        if ( lnode )
        {
            process = lnode->GetProcessL(recv_msg->u.dump.pid);
            if (process)
            {
                int verifier = recv_msg->u.dump.verifier;
                if ( (verifier == -1) || (verifier == process->GetVerifier()) )
                {
                    process->DumpEnd(recv_msg->u.dump.status, recv_msg->u.dump.core_file);
                }
                else
                {
                    char buf[MON_STRING_BUF_SIZE];
                    snprintf(buf, sizeof(buf), "[%s], Can't find process nid=%d, "
                             "pid=%d, verifier=%d for dump target.\n", method_name,
                             recv_msg->u.dump.nid, recv_msg->u.dump.pid,
                             recv_msg->u.dump.verifier);
                    mon_log_write(MON_CLUSTER_HANDLEMYNODE_3, SQ_LOG_ERR, buf);
                }
            }
            else
            {
                // Dump completion handled in CProcess::Exit()
                char buf[MON_STRING_BUF_SIZE];
                snprintf(buf, sizeof(buf), "[%s], Can't find process nid=%d, "
                         "pid=%d for dump complete target.\n", method_name,
                         recv_msg->u.dump.nid, recv_msg->u.dump.pid);
                mon_log_write(MON_CLUSTER_HANDLEMYNODE_4, SQ_LOG_ERR, buf);
            }
        }
        break;
#endif

    case InternalType_Exit:
        // Final process exit logic is done in Process_Exit, not here
        // as in the past.
        if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
            trace_printf("%s@%d - Internal exit request for %s (%d, %d)\n", method_name, __LINE__, recv_msg->u.exit_ns.name, recv_msg->u.exit_ns.nid, recv_msg->u.exit_ns.pid);
#ifdef NAMESERVER_PROCESS
        ReqQueue.enqueueExitNsReq( &recv_msg->u.exit_ns );
#endif
        break;

#ifndef NAMESERVER_PROCESS
    case InternalType_Event:
        // Trace only
        if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
            trace_printf("%s@%d - Internal event request\n", method_name, __LINE__);
        break;
#endif

#ifndef NAMESERVER_PROCESS
    case InternalType_IoData:
        // Trace only
        if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
            trace_printf("%s@%d - Internal IO data request\n", method_name, __LINE__);
        break;
#endif

#ifndef NAMESERVER_PROCESS
    case InternalType_StdinReq:
        // Trace only
        if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
            trace_printf("%s@%d - Internal STDIN request\n", method_name, __LINE__);
        break;
#endif

#ifndef NAMESERVER_PROCESS
    case InternalType_Kill:
        // Queue the kill request for processing by a worker thread.
        if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
            trace_printf("%s@%d - Internal kill request for (%d, %d), abort =%d\n", method_name, __LINE__, recv_msg->u.kill.nid, recv_msg->u.kill.pid, recv_msg->u.kill.persistent_abort);
        ReqQueue.enqueueKillReq( &recv_msg->u.kill );
        break;
#endif

    case InternalType_Process:
        // Trace only.  The process is identified as (nid, pid), the same
        // order used by the other traces in this method.
        if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
            trace_printf("%s@%d - Internal process request, completed process replication for (%d, %d) %s\n", method_name, __LINE__, recv_msg->u.process.nid, recv_msg->u.process.pid, (recv_msg->u.process.backup?" Backup":""));
        break;

    case InternalType_ProcessInit:
        // No action needed
        break;

#ifndef NAMESERVER_PROCESS
    case InternalType_Open:
        // Trace only
        if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
            trace_printf("%s@%d - Internal open request, completed open replication, "
                         "(%d, %d:%d) opened (%d, %d:%d)\n",
                         method_name, __LINE__,
                         recv_msg->u.open.nid,
                         recv_msg->u.open.pid,
                         recv_msg->u.open.verifier,
                         recv_msg->u.open.opened_nid,
                         recv_msg->u.open.opened_pid,
                         recv_msg->u.open.opened_verifier);
        break;
#endif

    case InternalType_SchedData:
        // No action needed
        break;

    case InternalType_Set:
        // Trace only
        if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
            trace_printf("%s@%d - Internal set request, completed replicating key %s::%s\n", method_name, __LINE__, recv_msg->u.set.group, recv_msg->u.set.key);
        break;

    case InternalType_UniqStr:
        // Trace only
        if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
            trace_printf("%s@%d - Internal unique string request, completed replicating (%d, %d)\n", method_name, __LINE__, recv_msg->u.uniqstr.nid, recv_msg->u.uniqstr.id);
        break;

#ifndef NAMESERVER_PROCESS
    case InternalType_Sync:
        if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_TMSYNC))
            trace_printf("%s@%d - Internal sync request for node %s, pnid=%d, SyncType=%d\n"
                        , method_name, __LINE__, Node[pnid]->GetName(), pnid, recv_msg->u.sync.type);
        switch (recv_msg->u.sync.type )
        {
        case SyncType_TmData:
            if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
                trace_printf("%s@%d - TMSYNC(TmData) on Node %s (pnid=%d)\n", method_name, __LINE__, Node[MyPNID]->GetName(), MyPNID);
            // Record this node as the one driving the TmSync
            tmSyncPNid_ = MyPNID;
            if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
                trace_printf("%s@%d - Sync communicated, tmSyncPNid_=%d\n", method_name, __LINE__, tmSyncPNid_);
            if ( ! MyNode->IsSpareNode() && MyNode->GetPhase() != Phase_Ready )
            {
                MyNode->CheckActivationPhase();
            }
            if ( MyNode->GetTmSyncState() == SyncState_Start &&
                 MyNode->GetPhase() == Phase_Ready &&
                 MyNode->GetLNodesCount() > 1 )
            {
                // Begin a Slave Sync Start to other
                // logical nodes in my physical node
                if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
                    trace_printf("%s@%d - Slave Sync Start on local node %s, pnid=%d\n", method_name, __LINE__, Node[pnid]->GetName(), pnid);
                Monitor->CoordinateTmDataBlock( &recv_msg->u.sync );
            }
            break;

        case SyncType_TmSyncState:
            // Trace only
            if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
                trace_printf("%s@%d - TMSYNC(TmSyncState) on Node %s (pnid=%d)\n", method_name, __LINE__, Node[MyPNID]->GetName(), MyPNID);
            break;

        default:
            {
                char buf[MON_STRING_BUF_SIZE];
                snprintf(buf, sizeof(buf), "[%s], Unknown SyncType from node %s, pnid=%d during processing local SyncType.\n", method_name, Node[pnid]->GetName(), pnid);
                mon_log_write(MON_CLUSTER_HANDLEMYNODE_5, SQ_LOG_ERR, buf);
            }
        }
        break;
#endif

    default:
        {
            char buf[MON_STRING_BUF_SIZE];
            snprintf(buf, sizeof(buf), "[%s], Unknown Internal message received during processing local SyncType for pnid=%d.\n", method_name, pnid);
            mon_log_write(MON_CLUSTER_HANDLEMYNODE_6, SQ_LOG_ERR, buf);
        }
    }

    TRACE_EXIT;
}
// CCluster::responsive
//
// Compares the current barrier count with the value saved by the previous
// call to decide whether the sync thread has made progress, updates
// monSyncResponsive_ / cumulativeDelaySec_ accordingly, and saves the
// current counters for the next call.
//
// Returns the updated value of monSyncResponsive_.
bool CCluster::responsive()
{
    const char method_name[] = "CCluster::responsive";
    TRACE_ENTRY;

    int barriersSinceLastCheck = barrierCount_ - barrierCountSaved_;

    if ( barriersSinceLastCheck == 0 && isMonInitComplete() )
    {
        // No barrier completed since the previous check: the sync thread is
        // treated as unresponsive.  This method is called every
        // SYNC_MAX_RESPONSIVE+1 seconds, so that is the delay added.
        cumulativeDelaySec_ += CCluster::SYNC_MAX_RESPONSIVE + 1;
        monSyncResponsive_ = false;

        if ( CommType != CommType_InfiniBand )
        {
            if ( !inBarrier_ )
            {
                // Not in the barrier call: non-mpi work took quite long
                mem_log_write(MON_CLUSTER_RESPONSIVE_2, cumulativeDelaySec_);
            }
            else
            {
                // Sync thread is stuck in the barrier call
                mem_log_write(MON_CLUSTER_RESPONSIVE_1, cumulativeDelaySec_,
                              inBarrier_);
            }
        }
        else if ( !inBarrier_ && !inAllGather_ && !inCommDup_ )
        {
            // Not in any mpi call: non-mpi work took quite long
            mem_log_write(MON_CLUSTER_RESPONSIVE_2, cumulativeDelaySec_);
        }
        else
        {
            // Stuck in an mpi call; log a 3-bit mask of the in-progress
            // flags: barrier (bit 2), allgather (bit 1), commdup (bit 0)
            mem_log_write(MON_CLUSTER_RESPONSIVE_1, cumulativeDelaySec_,
                          ( ( (inBarrier_ << 1) | inAllGather_ ) << 1 ) | inCommDup_);
        }
    }
    else
    {
        if ( barriersSinceLastCheck < syncMinPerSec_ )
        {
            // Progress was made but below the configured minimum:
            // slow but still responsive
            mem_log_write(MON_CLUSTER_RESPONSIVE_3, barriersSinceLastCheck, syncMinPerSec_);
        }
        cumulativeDelaySec_ = 0;
        monSyncResponsive_ = true;
    }

    // Save the current counters as the baseline for the next call
    barrierCountSaved_ = barrierCount_;
    if ( CommType == CommType_InfiniBand )
    {
        allGatherCountSaved_ = allGatherCount_;
        commDupCountSaved_ = commDupCount_;
    }

    TRACE_EXIT;
    return monSyncResponsive_;
}
// CCluster::MPIAllgather
//
// Forwards its arguments unchanged to MPI_Allgather().  inAllGather_ is set
// for the duration of the call (CCluster::responsive() reads it to report
// where the sync thread is stuck) and allGatherCount_ is incremented once
// the call has returned.
//
// Returns the MPI_Allgather() return code, unmodified.
int CCluster::MPIAllgather(void *sendbuf, int sendcount, MPI_Datatype sendtype,
                           void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm Comm)
{
    const char method_name[] = "CCluster::MPIAllGather";
    TRACE_ENTRY;

    // Mark the collective as in progress while inside the MPI call
    inAllGather_ = true;
    int mpiStatus = MPI_Allgather( sendbuf, sendcount, sendtype
                                 , recvbuf, recvcount, recvtype
                                 , Comm );
    inAllGather_ = false;

    // Counted regardless of the MPI return code
    allGatherCount_++;

    TRACE_EXIT;
    return mpiStatus;
}
// CCluster::ReinitializeConfigCluster
//
// Adds a physical node to, or deletes it from, the Nodes container and, if
// that succeeded, refreshes configPNodesCount_ from the cluster
// configuration and calls Nodes->UpdateCluster().
//
// Parameters:
//   nodeAdded - true to add the node, false to delete it
//   pnid      - physical node id of the node being added or deleted
//
// Returns true if the add/delete succeeded, false otherwise (in which case
// configPNodesCount_ is left unchanged and UpdateCluster() is not called).
bool CCluster::ReinitializeConfigCluster( bool nodeAdded, int pnid )
{
    const char method_name[] = "CCluster::ReinitializeConfigCluster";
    TRACE_ENTRY;

    // Configured node count before the membership change
    if (trace_settings & (TRACE_INIT | TRACE_REQUEST))
        trace_printf( "%s@%d - Configured physical nodes count=%d\n"
                    , method_name, __LINE__
                    , GetConfigPNodesCount() );

    bool membershipChanged;
    if ( !nodeAdded )
    {
        // Delete node from monitor's view of the cluster
        membershipChanged = Nodes->DeleteNode( pnid ) ? true : false;
    }
    else
    {
        // Add node to monitor's view of the cluster
        membershipChanged = Nodes->AddNode( pnid ) ? true : false;
    }

    if ( membershipChanged )
    {
        // Pick up the new configured node count, then update the cluster
        configPNodesCount_ = Nodes->GetClusterConfig()->GetPNodesCount();
        Nodes->UpdateCluster();
    }

    // Configured node count after the membership change
    if (trace_settings & (TRACE_INIT | TRACE_REQUEST))
        trace_printf( "%s@%d - Configured physical nodes count=%d\n"
                    , method_name, __LINE__
                    , GetConfigPNodesCount() );

    TRACE_EXIT;
    return( membershipChanged );
}
void CCluster::InitializeConfigCluster( void )
{
#ifndef NAMESERVER_PROCESS // nameserver is running in agent mode
int rc;
#endif
const char method_name[] = "CCluster::InitializeConfigCluster";
TRACE_ENTRY;
int worldSize = 0;
MPI_Comm_size (MPI_COMM_WORLD, &worldSize);
#ifdef NAMESERVER_PROCESS
if ( !IsRealCluster )
{
char *nodes = getenv( "SQ_VIRTUAL_NODES" );
worldSize = atoi(nodes);
if ( worldSize <= 0 )
{
worldSize = 1;
}
}
#endif
CClusterConfig *clusterConfig = Nodes->GetClusterConfig();
configPNodesCount_ = clusterConfig->GetPNodesCount();
int rankToPnid[configPNodesCount_];
#ifdef NAMESERVER_PROCESS
currentNodes_ = 1; // non-master Name Servers join set through master Name Server
#else
if (IAmIntegrating || IsAgentMode)
{
currentNodes_ = 1; // non-master monitors join cluster through master monitor
}
else
{
currentNodes_ = worldSize;
}
#endif
if ( !IsRealCluster )
{
// Set virtual cluster size to collective size
MPI_Comm_size (MPI_COMM_WORLD, &configPNodesCount_);
#ifdef NAMESERVER_PROCESS
configPNodesCount_ = worldSize;
#endif
// For virtual cluster set physical node id equal to rank
for (int i=0; i<worldSize; ++i)
{
rankToPnid[i] = i;
// Set bit indicating node is up
upNodes_.upNodes[i/MAX_NODE_BITMASK] |= (1ull << (i%MAX_NODE_BITMASK));
}
}
else
{
for (int i=0; i<configPNodesCount_; ++i)
{
rankToPnid[i] = i;
rankToPnid[i] = rankToPnid[i]; // make compiler happy
}
}
// Build the monitor's configured view of the cluster
if ( IsRealCluster )
{ // Map node name to physical node id
// (for virtual nodes physical node equals "rank" (previously set))
if (MyPNID == -1)
{
MyPNID = clusterConfig->GetPNid( Node_name );
if (MyPNID == -1)
{
char buf[MON_STRING_BUF_SIZE];
snprintf(buf, sizeof(buf), "[%s@%d] Can't find node name=%s in cluster configuration\n",
method_name, __LINE__, Node_name );
mon_log_write(MON_CLUSTER_INITCONFIGCLUSTER_1, SQ_LOG_CRIT, buf);
MPI_Abort(MPI_COMM_SELF,99);
}
}
}
Nodes->AddNodes( );
MyNode = Nodes->GetNode(MyPNID);
Nodes->SetupCluster( &Node, &LNode, &indexToPnid_ );
if ( CommType == CommType_Sockets )
{
InitServerSock();
}
if (trace_settings & TRACE_INIT)
{
trace_printf( "%s@%d (MasterMonitor) IAmIntegrating=%d,"
" IsAgentMode=%d, IsMaster=%d,"
" MasterMonitorName=%s, Node_name=%s\n"
, method_name, __LINE__
, IAmIntegrating
, IsAgentMode, IsMaster, MasterMonitorName, Node_name );
}
if (IAmIntegrating || IsAgentMode)
{
#ifndef NAMESERVER_PROCESS
int TmLeaderPNid = -1;
if (IsMaster)
{
tmLeaderNid_ = Nodes->GetFirstNid();
TmLeaderPNid = LNode[tmLeaderNid_]->GetNode()->GetPNid();
}
#endif
// Monitors processes in AGENT mode in a real cluster initialize all
// remote nodes to a down state. The master monitor and the joining
// monitors will set the joining node state to up as part of the node
// re-integration processing as monitor processes join the cluster
// through the master.
for (int i=0; i < clusterConfig->GetPNodesCount(); i++)
{
if (Node[indexToPnid_[i]])
{
if (Node[indexToPnid_[i]]->GetPNid() == MyPNID)
{ // Set bit indicating node is up
upNodes_.upNodes[indexToPnid_[i]/MAX_NODE_BITMASK] |=
(1ull << (indexToPnid_[i]%MAX_NODE_BITMASK));
}
else
{ // Set node state to down
Node[indexToPnid_[i]]->SetState( State_Down );
#ifndef NAMESERVER_PROCESS
if (IsMaster)
{
if (TmLeaderPNid == indexToPnid_[i])
{
AssignTmLeader(indexToPnid_[i], false);
}
}
#endif
}
}
}
}
#ifndef NAMESERVER_PROCESS // nameserver is running in agent mode
else
{
char *nodeNames = 0;
if ( IsRealCluster )
{
if (trace_settings & TRACE_INIT)
trace_printf( "%s@%d Collecting port numbers and node names, "
"configPNodesCount_=%d, worldSize=%d, pnid=%d (%s:%s)\n"
"MyCommPort=%s\nMySyncPort=%s\n"
, method_name, __LINE__
, GetConfigPNodesCount(), worldSize
, MyPNID, MyNode->GetName(), MyNode->GetCommPort()
, MyCommPort, MySyncPort );
bool nodeStatus[GetConfigPNodesCount()];
for (int i=0; i<GetConfigPNodesCount(); ++i)
{
nodeStatus[i] = false;
if (trace_settings & (TRACE_INIT | TRACE_REQUEST))
trace_printf( "%s@%d - nodeStatus[%d]=%d\n"
, method_name, __LINE__, i, nodeStatus[i] ) ;
}
// Collect comm port info from other monitors
char *commPortNums = new char[worldSize * MPI_MAX_PORT_NAME];
rc = MPI_Allgather (MyCommPort, MPI_MAX_PORT_NAME, MPI_CHAR, commPortNums,
MPI_MAX_PORT_NAME, MPI_CHAR, MPI_COMM_WORLD);
if (rc != MPI_SUCCESS)
{
char buf[MON_STRING_BUF_SIZE];
snprintf(buf, sizeof(buf), "[%s@%d] MPI_Allgather error=%s\n",
method_name, __LINE__, ErrorMsg(rc));
mon_log_write(MON_CLUSTER_INITCONFIGCLUSTER_2, SQ_LOG_CRIT, buf);
MPI_Abort(MPI_COMM_SELF,99);
}
// Collect sync port info from other monitors
char *syncPortNums = new char[worldSize * MPI_MAX_PORT_NAME];
rc = MPI_Allgather (MySyncPort, MPI_MAX_PORT_NAME, MPI_CHAR, syncPortNums,
MPI_MAX_PORT_NAME, MPI_CHAR, MPI_COMM_WORLD);
if (rc != MPI_SUCCESS)
{
char buf[MON_STRING_BUF_SIZE];
snprintf(buf, sizeof(buf), "[%s@%d] MPI_Allgather error=%s\n",
method_name, __LINE__, ErrorMsg(rc));
mon_log_write(MON_CLUSTER_INITCONFIGCLUSTER_2, SQ_LOG_CRIT, buf);
MPI_Abort(MPI_COMM_SELF,99);
}
// Exchange Node Names with collective
nodeNames = new char[worldSize * MPI_MAX_PROCESSOR_NAME];
rc = MPI_Allgather (Node_name, MPI_MAX_PROCESSOR_NAME, MPI_CHAR,
nodeNames, MPI_MAX_PROCESSOR_NAME, MPI_CHAR,
MPI_COMM_WORLD);
if (rc != MPI_SUCCESS)
{
char buf[MON_STRING_BUF_SIZE];
snprintf(buf, sizeof(buf), "[%s@%d] MPI_Allgather error=%s\n",
method_name, __LINE__, ErrorMsg(rc));
mon_log_write(MON_CLUSTER_INITCONFIGCLUSTER_3, SQ_LOG_CRIT, buf);
MPI_Abort(MPI_COMM_SELF,99);
}
// For each node name received get corresponding CNode object and
// store port number in it.
char * nodeName;
CNode * node;
for (int i = 0; i < worldSize; i++)
{
nodeName = &nodeNames[ i * MPI_MAX_PROCESSOR_NAME ];
node = Nodes->GetNode( nodeName );
if ( node )
{
node->SetCommPort( &commPortNums[ i * MPI_MAX_PORT_NAME] );
node->SetSyncPort( &syncPortNums[ i * MPI_MAX_PORT_NAME] );
rankToPnid[i] = node->GetPNid();
nodeStatus[rankToPnid[i]] = true;
if (trace_settings & TRACE_INIT)
{
trace_printf( "%s@%d rankToPnid[%d]=%d (%s:%s:%s)"
"(node=%s,commPort=%s,syncPort=%s)\n"
, method_name, __LINE__, i, rankToPnid[i]
, node->GetName()
, node->GetCommPort()
, node->GetSyncPort()
, &nodeNames[ i * MPI_MAX_PROCESSOR_NAME]
, &commPortNums[ i * MPI_MAX_PORT_NAME]
, &syncPortNums[ i * MPI_MAX_PORT_NAME]);
}
}
else
{
rankToPnid[i] = -1;
// Unexpectedly could not map node name to CNode object
char buf[MON_STRING_BUF_SIZE];
snprintf(buf, sizeof(buf), "[%s@%d] Unable to find node "
"object for node %s\n", method_name, __LINE__,
nodeName );
mon_log_write(MON_CLUSTER_INITCONFIGCLUSTER_4, SQ_LOG_CRIT, buf);
}
}
delete [] commPortNums;
delete [] syncPortNums;
tmLeaderNid_ = Nodes->GetFirstNid();
int TmLeaderPNid = LNode[tmLeaderNid_]->GetNode()->GetPNid();
// Any nodes not in the initial MPI_COMM_WORLD are down.
for (int i=0; i<GetConfigPNodesCount(); ++i)
{
if ( nodeStatus[indexToPnid_[i]] == false )
{
if (trace_settings & (TRACE_INIT | TRACE_REQUEST))
trace_printf( "%s@%d - nodeStatus[%d]=%d"
", indexToPnid_[%d]=%d\n"
, method_name, __LINE__
, i, nodeStatus[i]
, i, indexToPnid_[i] ) ;
node = Nodes->GetNode(indexToPnid_[i]);
if ( node ) node->SetState( State_Down );
// assign new TmLeader if TMLeader node is dead.
if (TmLeaderPNid == indexToPnid_[i])
{
AssignTmLeader(indexToPnid_[i], false);
}
}
else
{ // Set bit indicating node is up
if (trace_settings & (TRACE_INIT | TRACE_REQUEST))
trace_printf( "%s@%d - nodeStatus[%d]=%d"
", indexToPnid_[%d]=%d\n"
, method_name, __LINE__
, i, nodeStatus[i]
, i, indexToPnid_[i] ) ;
upNodes_.upNodes[indexToPnid_[i]/MAX_NODE_BITMASK] |=
(1ull << (indexToPnid_[i]%MAX_NODE_BITMASK));
}
}
}
else
{
tmLeaderNid_ = 0;
}
// Initialize communicators for point-to-point communications
int myRank;
MPI_Comm_rank( MPI_COMM_WORLD, &myRank );
if ( !IsRealCluster )
myRank = MyPNID;
InitClusterComm(worldSize, myRank, rankToPnid);
if ( CommType == CommType_Sockets )
{
InitClusterSocks(worldSize, myRank, nodeNames, rankToPnid);
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
{
for ( int i =0; i < worldSize; i++ )
{
trace_printf( "%s@%d socks_[%d]=%d\n"
, method_name, __LINE__
, rankToPnid[i], socks_[rankToPnid[i]]);
}
}
}
if (nodeNames) delete [] nodeNames;
}
#endif
if ( CommType == CommType_Sockets )
{
// Allgather() cluster sockets are established as remote
// monitor processes join the cluster
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
{
for ( int i =0; i < clusterConfig->GetPNodesCount() ; i++ )
{
trace_printf( "%s@%d %s (%d), state=%s, socks_[%d]=%d\n"
, method_name, __LINE__
, Node[indexToPnid_[i]]->GetName()
, Node[indexToPnid_[i]]->GetPNid()
, StateString(Node[indexToPnid_[i]]->GetState())
, indexToPnid_[i], socks_[indexToPnid_[i]]);
}
}
}
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
{
for ( int i =0; i < MAX_NODE_MASKS ; i++ )
{
trace_printf( "%s@%d upNodes set[%d]: %llx\n"
, method_name, __LINE__
, i, upNodes_.upNodes[i]);
}
}
#ifndef NAMESERVER_PROCESS
// Kill the MPICH hydra_pmi_proxy to prevent it from killing all
// processes in cluster when mpirun or monitor processes are killed
if (!IsAgentMode || (IsAgentMode && IsMPIChild))
{
kill( getppid(), SIGKILL );
}
#endif
TRACE_EXIT;
}
void CCluster::InitClusterComm(int worldSize, int myRank, int * rankToPnid)
{
const char method_name[] = "CCluster::InitClusterComm";
TRACE_ENTRY;
// Compute an array of "colors" for use with MPI_Comm_split.
int *splitColors;
splitColors = new int[worldSize*worldSize*2];
int *splitOtherNode;
splitOtherNode = new int[worldSize*worldSize*2];
int splitRows = 0;
for ( int i=0; i<(worldSize*worldSize*2); ++i)
{
splitColors[i] = MPI_UNDEFINED;
splitOtherNode[i] = -1;
}
int color = 1;
bool placed;
for (int i = 0; i < worldSize; i++)
{
for (int j = i+1; j < worldSize; j++)
{
// Find a free slot for rank "i" to rank "j"
placed = false;
for (int k=0; k<splitRows; ++k)
{
if ( splitColors[k*worldSize+i] == MPI_UNDEFINED
&& splitColors[k*worldSize+j] == MPI_UNDEFINED )
{
splitColors[k*worldSize+i] = color;
splitColors[k*worldSize+j] = color;
placed = true;
if (myRank == i)
splitOtherNode[k] = j;
else if (myRank == j)
splitOtherNode[k] = i;
break;
}
}
if (!placed)
{ // Need to use a new row
splitColors[splitRows*worldSize+i] = color;
splitColors[splitRows*worldSize+j] = color;
if (myRank == i)
splitOtherNode[splitRows] = j;
else if (myRank == j)
splitOtherNode[splitRows] = i;
++splitRows;
}
++color;
}
}
if (trace_settings & TRACE_INIT)
{
trace_printf("%s@%d Created %d splitRows for worldSize=%d, myRank=%d\n",
method_name, __LINE__, splitRows, worldSize, myRank);
string line;
char fragment[50];
for (int i=0; i<splitRows; ++i)
{
sprintf(fragment, "%s@%d splitColors[%d]=", method_name, __LINE__,
i);
line = fragment;
for (int j=0; j<worldSize; ++j)
{
sprintf(fragment, " %d,", splitColors[i*worldSize+j]);
line += fragment;
}
line += "\n";
trace_printf(line.c_str());
trace_printf("%s@%d splitOtherNode[%d]=%d\n", method_name,
__LINE__, i, splitOtherNode[i]);
}
}
// Create one communicator for each other rank in MPI_COMM_WORLD
// This permits point-to-point communication with each rank.
int myRankInComm;
MPI_Comm ncomm;
int nid;
for (int nSplit=0; nSplit < splitRows; ++nSplit)
{
color = splitColors[nSplit*worldSize+myRank];
MPI_Comm_split(MPI_COMM_WORLD, color, myRank, &ncomm);
if (ncomm == MPI_COMM_NULL)
{
if (splitColors[nSplit*worldSize+myRank] != MPI_UNDEFINED)
{
if (trace_settings & TRACE_INIT)
{
trace_printf("%s@%d Rank %d: Unexpected MPI_COMM_NULL from "
"MPI_Comm_split, nSplit=%d\n",
method_name, __LINE__,myRank, nSplit);
}
}
}
else
{
// Set comms_ (communicators) array element for the
// physical node.
nid = rankToPnid[splitOtherNode[nSplit]];
comms_[nid] = ncomm;
MPI_Comm_rank(ncomm, &myRankInComm);
otherMonRank_[nid] = (myRankInComm == 0)? 1: 0;
if (trace_settings & TRACE_INIT)
{
trace_printf("%s@%d Rank %d: MPI_Comm_split %d, color=%d, "
"comms_[%d] is orig rank #%d, "
"otherMonRank_=%d\n",
method_name, __LINE__,
myRank, nSplit, color,
nid, splitOtherNode[nSplit],
otherMonRank_[nid]);
}
}
}
delete [] splitColors;
delete [] splitOtherNode;
TRACE_EXIT;
}
// Logs a node re-integration failure and optionally aborts the monitor.
//
//   rc       - error code of the failed operation, rendered via ErrorMsg()
//   err      - Reintegrate_ErrN code selecting the message text
//   pnid     - physical node id involved, used by the per-node messages
//   nodeInfo - name/port info of the node involved; may be NULL
//   abortIn  - when true, MPI_Abort is called after the message is logged
void CCluster::HandleReintegrateError( int rc, int err,
                                       int pnid, nodeId_t *nodeInfo,
                                       bool abortIn )
{
    const char method_name[] = "CCluster::HandleReintegrateError";
    TRACE_ENTRY;

    // Callers pass NULL for nodeInfo on most error codes and the error code
    // itself may be a runtime value, so never dereference it blindly.
    const char *peerName     = nodeInfo ? nodeInfo->nodeName : "(unknown)";
    const char *peerCommPort = nodeInfo ? nodeInfo->commPort : "(unknown)";
    const char *peerSyncPort = nodeInfo ? nodeInfo->syncPort : "(unknown)";

    char buf[MON_STRING_BUF_SIZE];
    switch ( err )
    {
    case Reintegrate_Err1:
        snprintf(buf, sizeof(buf), "[%s], can't to connect to creator monitor"
                 " port: %s - Error: %s.\n",
                 method_name, IntegratingMonitorPort, ErrorMsg(rc));
        break;
    case Reintegrate_Err2:
        snprintf(buf, sizeof(buf), "[%s], can't merge intercomm to existing "
                 "MPI collective - Error: %s.\n",
                 method_name, ErrorMsg(rc));
        break;
    case Reintegrate_Err3:
        snprintf(buf, sizeof(buf), "[%s], unable to obtain cluster info "
                 "from creator monitor: %s.\n", method_name, ErrorMsg(rc));
        break;
    case Reintegrate_Err4:
        snprintf(buf, sizeof(buf), "[%s], Failed to send name/port "
                 "to node %d (%s): %s.\n", method_name, pnid,
                 peerName, ErrorMsg(rc));
        break;
    case Reintegrate_Err5:
        snprintf(buf, sizeof(buf), "[%s], can't to connect to "
                 " node %d monitor, commPort=%s, syncPort=%s: %s.\n",
                 method_name, pnid, peerCommPort,
                 peerSyncPort, ErrorMsg(rc));
        break;
    case Reintegrate_Err6:
        snprintf(buf, sizeof(buf), "[%s], can't merge intercomm "
                 "for node %d: %s.\n", method_name, pnid,
                 ErrorMsg(rc));
        break;
    case Reintegrate_Err7:
        snprintf(buf, sizeof(buf), "[%s], can't disconnect "
                 "intercomm for node %d: %s.\n", method_name, pnid,
                 ErrorMsg(rc));
        break;
    case Reintegrate_Err8:
        snprintf(buf, sizeof(buf), "[%s], Failed to send status to creator "
                 "monitor: %s\n", method_name, ErrorMsg(rc));
        break;
    case Reintegrate_Err9:
        snprintf(buf, sizeof(buf), "[%s], Failed to send name/port "
                 "to creator monitor: %s.\n", method_name, ErrorMsg(rc));
        break;
    case Reintegrate_Err10:
        snprintf(buf, sizeof(buf), "[%s], Monitor initialization failed (could"
                 " not write to port file). Aborting.\n", method_name);
        break;
    case Reintegrate_Err11:
        snprintf(buf, sizeof(buf), "[%s], Monitor initialization failed (could"
                 " not open port file). Aborting.\n", method_name);
        break;
    case Reintegrate_Err12:
        snprintf(buf, sizeof(buf), "[%s], Monitor initialization failed (could"
                 " not initialize local io). Aborting.\n", method_name);
        break;
    case Reintegrate_Err13:
        snprintf(buf, sizeof(buf), "[%s], Monitor initialization failed (could"
                 " not initialize devices). Aborting.\n", method_name);
        break;
    case Reintegrate_Err14:
        snprintf(buf, sizeof(buf), "[%s] Aborting.\n", method_name);
        break;
    case Reintegrate_Err15:
        snprintf(buf, sizeof(buf), "[%s], no connect acknowledgement "
                 "for node %d: %s.\n", method_name, pnid,
                 ErrorMsg(rc));
        break;
    default:
        // Unrecognized code: still report the underlying error text.
        snprintf(buf, sizeof(buf), "[%s], Reintegration error: %s\n",
                 method_name, ErrorMsg(rc));
        break;
    }

    mon_log_write(MON_CLUSTER_REINTEGRATE_1, SQ_LOG_ERR, buf);

    if ( abortIn )
        MPI_Abort(MPI_COMM_SELF,99);

    TRACE_EXIT;
}
// Reports this node's re-integration outcome to the creator monitor over
// the join connection (joinComm_ for MPI, joinSock_ for sockets).
//
//   nodeState - state to report
//   initErr   - status value to report; when nodeState is not State_Up it
//               is also used as the Reintegrate_ErrN code that is logged
//
// A failed send, or any nodeState other than State_Up, is handed to
// HandleReintegrateError with the abort flag set.
void CCluster::SendReIntegrateStatus( STATE nodeState, int initErr )
{
    const char method_name[] = "CCluster::SendReIntegrateStatus";

    nodeStatus_t statusMsg;
    statusMsg.state = nodeState;
    statusMsg.status = initErr;

    // Ship the status over whichever transport the cluster is using.
    int rc;
    if ( CommType == CommType_InfiniBand )
    {
        rc = Monitor->SendMPI( (char *) &statusMsg
                             , sizeof(nodeStatus_t)
                             , 0
                             , MON_XCHNG_DATA
                             , joinComm_ );
    }
    else if ( CommType == CommType_Sockets )
    {
        rc = Monitor->SendSock( (char *) &statusMsg
                              , sizeof(nodeStatus_t)
                              , joinSock_
                              , method_name );
    }
    else
    {
        // Programmer bonehead!
        abort();
    }

    if ( rc )
    {   // Could not deliver the status to the creator monitor
        HandleReintegrateError( rc, Reintegrate_Err8, -1, NULL, true );
    }

    if ( nodeState == State_Up )
    {
        return;
    }

    // Initialization error, abort.
    mem_log_write(CMonLog::MON_REINTEGRATE_9, MyPNID, initErr);
    HandleReintegrateError( rc, initErr, -1, NULL, true );
}
// Checks whether the monitor on a remote node can be reached by connecting
// to its comm port and exchanging a nodeId_t "ping" message.
//
//   node - remote node to ping
//
// Returns true when the connect, the send and the receive all succeed;
// false when the connection could not be established or the exchange failed.
//
// The number of one-second connect attempts is computed once, on first
// call, as retry_count * wait_timeout.  Both values are read from
// SQ_MON_EPOLL_WAIT_TIMEOUT / SQ_MON_EPOLL_RETRY_COUNT when the former is
// set, otherwise they default to 16 and 4.
bool CCluster::PingSockPeer(CNode *node)
{
    const char method_name[] = "CCluster::PingSockPeer";
    TRACE_ENTRY;

    // -2 marks "not yet initialized"
    static int sv_connect_wait_timeout = -2;
    static int sv_connect_retry_count = 1;

    if ( sv_connect_wait_timeout == -2 )
    {
        // Use the EPOLL timeout and retry values
        char *lv_connect_wait_timeout_env = getenv( "SQ_MON_EPOLL_WAIT_TIMEOUT" );
        if ( lv_connect_wait_timeout_env )
        {
            // Timeout in seconds
            sv_connect_wait_timeout = atoi( lv_connect_wait_timeout_env );
            char *lv_connect_retry_count_env = getenv( "SQ_MON_EPOLL_RETRY_COUNT" );
            if ( lv_connect_retry_count_env )
            {
                sv_connect_retry_count = atoi( lv_connect_retry_count_env );
            }
            if ( sv_connect_retry_count > 180 )
            {
                sv_connect_retry_count = 180;
            }
        }
        else
        {
            // default to 64 seconds
            sv_connect_wait_timeout = 16;
            sv_connect_retry_count = 4;
        }

        char buf[MON_STRING_BUF_SIZE];
        snprintf( buf, sizeof(buf)
                , "[%s@%d] Ping connect timeout wait_timeout=1 second, retry_count=%d\n"
                , method_name
                , __LINE__
                , (sv_connect_retry_count * sv_connect_wait_timeout) );
        mon_log_write( MON_PINGSOCKPEER_3, SQ_LOG_INFO, buf );
    }

    bool rs = true;
    int rc;
    int pingSock = -1;

    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
    {
        trace_printf( "%s@%d - Pinging remote monitor %s, pnid=%d\n"
                    , method_name, __LINE__
                    , node->GetName(), node->GetPNid() );
    }

    // Attempt to connect with remote monitor in one seconds increments
    // to recover as quickly as possible or give up trying
    for (int i = 0; i < (sv_connect_retry_count*sv_connect_wait_timeout); i++ )
    {
        // Disable internal retries
        pingSock = Monitor->Connect( node->GetCommPort(), false );
        if ( pingSock >= 0 )
        {   // Connected
            break;
        }

        if (node->GetState() != State_Up)
        {   // Node is no longer considered up, stop retrying
            if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
            {
                trace_printf( "%s@%d - Node %s (%d) is not up, "
                              "socks_[%d]=%d\n"
                            , method_name, __LINE__
                            , node->GetName(), node->GetPNid()
                            , node->GetPNid(), socks_[node->GetPNid()] );
            }
            break;
        }

        char buf[MON_STRING_BUF_SIZE];
        snprintf( buf, sizeof(buf)
                , "[%s@%d] Retrying connect to remote monitor %s, pnid=%d, retry=%d\n"
                , method_name
                , __LINE__
                , node->GetName(), node->GetPNid(), i );
        mon_log_write( MON_PINGSOCKPEER_4, SQ_LOG_INFO, buf );
        sleep( 1 );
    }

    if ( pingSock < 0 )
    {
        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
        {
            trace_printf( "%s@%d - Can't connect to remote monitor %s, pnid=%d\n"
                        , method_name, __LINE__
                        , node->GetName(), node->GetPNid() );
        }
        TRACE_EXIT;
        return(false);
    }

    // The structure is transmitted as raw bytes, so clear it first: no
    // uninitialized padding or stale stack data goes on the wire, and the
    // bounded copies below are always NUL terminated.
    nodeId_t nodeInfo;
    memset( &nodeInfo, 0, sizeof(nodeInfo) );
    nodeInfo.pnid = MyPNID;
    strncpy( nodeInfo.nodeName, MyNode->GetName(), sizeof(nodeInfo.nodeName) - 1 );
    strncpy( nodeInfo.commPort, MyNode->GetCommPort(), sizeof(nodeInfo.commPort) - 1 );
    strncpy( nodeInfo.syncPort, MyNode->GetSyncPort(), sizeof(nodeInfo.syncPort) - 1 );
    nodeInfo.ping = true;
    nodeInfo.creatorPNid = -1;
    nodeInfo.creator = false;
    nodeInfo.creatorShellPid = -1;
    nodeInfo.creatorShellVerifier = -1;

    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
    {
        trace_printf( "Sending my nodeInfo.pnid=%d\n"
                      " nodeInfo.nodeName=%s\n"
                      " nodeInfo.commPort=%s\n"
                      " nodeInfo.syncPort=%s\n"
                      " nodeInfo.creatorPNid=%d\n"
                      " nodeInfo.creator=%d\n"
                      " nodeInfo.creatorShellPid=%d\n"
                      " nodeInfo.creatorShellVerifier=%d\n"
                      " nodeInfo.ping=%d\n"
                    , nodeInfo.pnid
                    , nodeInfo.nodeName
                    , nodeInfo.commPort
                    , nodeInfo.syncPort
                    , nodeInfo.creatorPNid
                    , nodeInfo.creator
                    , nodeInfo.creatorShellPid
                    , nodeInfo.creatorShellVerifier
                    , nodeInfo.ping );
    }

    rc = Monitor->SendSock( (char *) &nodeInfo
                          , sizeof(nodeId_t)
                          , pingSock
                          , method_name );
    if ( rc )
    {
        rs = false;
        char buf[MON_STRING_BUF_SIZE];
        snprintf( buf, sizeof(buf)
                , "[%s], Cannot send ping node info to node %s: (%s)\n"
                , method_name, node->GetName(), ErrorMsg(rc));
        mon_log_write(MON_PINGSOCKPEER_1, SQ_LOG_ERR, buf);
    }
    else
    {
        // Get info about connecting monitor
        rc = Monitor->ReceiveSock( (char *) &nodeInfo
                                 , sizeof(nodeId_t)
                                 , pingSock
                                 , method_name );
        if ( rc )
        {   // Handle error
            rs = false;
            char buf[MON_STRING_BUF_SIZE];
            snprintf( buf, sizeof(buf)
                    , "[%s], Cannot receive ping node info from node %s: (%s)\n"
                    , method_name, node->GetName(), ErrorMsg(rc));
            mon_log_write(MON_PINGSOCKPEER_2, SQ_LOG_ERR, buf);
        }
        else
        {
            if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
            {
                trace_printf( "Received from nodeInfo.pnid=%d\n"
                              " nodeInfo.nodeName=%s\n"
                              " nodeInfo.commPort=%s\n"
                              " nodeInfo.syncPort=%s\n"
                              " nodeInfo.ping=%d\n"
                            , nodeInfo.pnid
                            , nodeInfo.nodeName
                            , nodeInfo.commPort
                            , nodeInfo.syncPort
                            , nodeInfo.ping );
            }
        }
    }

    // The ping connection is only used for this one exchange
    close( pingSock );

    TRACE_EXIT;
    return( rs );
}
// Entry point for joining an existing cluster: forwards to the
// re-integration routine that matches the configured transport.
//
//   initProblem - passed through unchanged to the transport-specific routine
void CCluster::ReIntegrate( int initProblem )
{
    const char method_name[] = "CCluster::ReIntegrate";
    TRACE_ENTRY;

    if ( CommType == CommType_InfiniBand )
    {
        ReIntegrateMPI( initProblem );
    }
    else if ( CommType == CommType_Sockets )
    {
        ReIntegrateSock( initProblem );
    }
    else
    {
        // Programmer bonehead!
        abort();
    }

    TRACE_EXIT;
}
// Joins this (new) monitor to an existing cluster using MPI connect/accept.
//
//   initProblem - non-zero when monitor initialization failed; in that case
//                 a State_Down status carrying this value is sent to the
//                 creator monitor once the cluster info has been received
//
// Sequence:
//   1. Connect to the creator monitor (IntegratingMonitorPort), send this
//      node's identity flagged as "creator", and merge the inter-communicator.
//   2. Receive the name/port table of the existing monitors.
//   3. For every other monitor with port information: connect, send this
//      node's identity, merge, wait for its ready flag, and record the
//      communicator and node state.
//   4. Report State_Up to the creator monitor and mark this node as merged.
//
// Any failure in step 3 is logged and then reported to the creator monitor
// through SendReIntegrateStatus( State_Down, ... ).
void CCluster::ReIntegrateMPI( int initProblem )
{
    const char method_name[] = "CCluster::ReIntegrateMPI";
    TRACE_ENTRY;

    int rc;
    bool haveCreatorComm = false;
    MPI_Comm interComm;
    MPI_Comm intraComm = MPI_COMM_NULL;
    MPI_Comm intraCommCreatorMon = MPI_COMM_NULL;

    // myNodeInfo is transmitted as raw bytes, so every field must hold a
    // defined value.  Clear the structure and fill in the same identity
    // fields that ReIntegrateSock sends; the bounded copies stay NUL
    // terminated because the buffer was zeroed first.
    nodeId_t myNodeInfo;
    memset( &myNodeInfo, 0, sizeof(myNodeInfo) );
    strncpy( myNodeInfo.nodeName, MyNode->GetName(), sizeof(myNodeInfo.nodeName) - 1 );
    strncpy( myNodeInfo.commPort, MyNode->GetCommPort(), sizeof(myNodeInfo.commPort) - 1 );
    strncpy( myNodeInfo.syncPort, MyNode->GetSyncPort(), sizeof(myNodeInfo.syncPort) - 1 );
    myNodeInfo.pnid = MyNode->GetPNid();
    myNodeInfo.creatorPNid = -1;
    myNodeInfo.ping = false;

    // Set bit indicating my node is up
    upNodes_.upNodes[MyPNID/MAX_NODE_BITMASK] |= (1ull << (MyPNID%MAX_NODE_BITMASK));

    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
        trace_printf("%s@%d - Connect to creator monitor (port %s)\n",
                     method_name, __LINE__, IntegratingMonitorPort);

    mem_log_write(CMonLog::MON_REINTEGRATE_1, MyPNID);

    if (trace_settings & (TRACE_SYNC | TRACE_RECOVERY | TRACE_INIT))
    {
        for ( int i =0; i < MAX_NODE_MASKS ; i++ )
        {
            trace_printf( "%s@%d Integrating node %s (pnid=%d) "
                          "sees set[%d]: %llx\n"
                        , method_name, __LINE__
                        , MyNode->GetName(), MyPNID
                        , i, upNodes_.upNodes[i] );
        }
    }

    TEST_POINT( TP010_NODE_UP );

    // Connect with my creator monitor
    rc = MPI_Comm_connect( IntegratingMonitorPort,
                           MPI_INFO_NULL, 0, MPI_COMM_SELF, &joinComm_ );
    if ( rc )
    {
        HandleReintegrateError( rc, Reintegrate_Err1, -1, NULL, true );
    }

    MPI_Comm_set_errhandler( joinComm_, MPI_ERRORS_RETURN );

    mem_log_write(CMonLog::MON_REINTEGRATE_4, MyPNID);

    TEST_POINT( TP011_NODE_UP );

    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
    {
        trace_printf("%s@%d Connected to creator monitor, sending id\n",
                     method_name, __LINE__);
    }

    // Send this node's name and port number so creator monitor
    // knows who we are, and set flag to let creator monitor it is the CREATOR.
    myNodeInfo.creator = true;
    myNodeInfo.creatorShellPid = CreatorShellPid;
    myNodeInfo.creatorShellVerifier = CreatorShellVerifier;
    if ((rc = Monitor->SendMPI((char *) &myNodeInfo, sizeof(nodeId_t), 0,
                               MON_XCHNG_DATA, joinComm_)))
    {
        HandleReintegrateError( rc, Reintegrate_Err9, -1, NULL,
                                true );
    }

    TEST_POINT( TP012_NODE_UP );

    // Merge the inter-communicators obtained from the connect/accept
    // between this new monitor and the creator monitor.
    if ((rc = MPI_Intercomm_merge( joinComm_, 1, &intraCommCreatorMon )))
    {
        HandleReintegrateError( rc, Reintegrate_Err2, -1, NULL, true );
    }

    MPI_Comm_set_errhandler( intraCommCreatorMon, MPI_ERRORS_RETURN );

    nodeId_t *nodeInfo = new nodeId_t[GetConfigPNodesCount()];

    mem_log_write(CMonLog::MON_REINTEGRATE_3, MyPNID);

    // Obtain node names & port numbers of existing monitors from
    // the creator monitor.
    if ((rc = Monitor->ReceiveMPI((char *)nodeInfo, sizeof(nodeId_t)*GetConfigPNodesCount(),
                                  MPI_ANY_SOURCE, MON_XCHNG_DATA, joinComm_)))
    {
        HandleReintegrateError( rc, Reintegrate_Err3, -1, NULL, true );
    }

    if ( initProblem )
    {
        // The monitor encountered an initialization error. Inform
        // the creator monitor that the node is down. Then abort.
        SendReIntegrateStatus( State_Down, initProblem );
    }

    // Connect to each of the other existing monitors and let them know
    // we are the NEW monitor and reset the creator flag so they know they are
    // not the creator monitor.
    myNodeInfo.creator = false;
    myNodeInfo.creatorShellPid = -1;
    myNodeInfo.creatorShellVerifier = -1;

    for (int i = 0; i < GetConfigPNodesCount(); i++)
    {
        if (strcmp(nodeInfo[i].commPort, IntegratingMonitorPort) == 0)
        {   // Already connected to creator monitor
            comms_[i] = intraCommCreatorMon;
            otherMonRank_[i] = 0;
            ++currentNodes_;

            // Set bit indicating node is up
            upNodes_.upNodes[i/MAX_NODE_BITMASK] |= (1ull << (i%MAX_NODE_BITMASK));

            Node[i]->SetCommPort( IntegratingMonitorPort );
            Node[i]->SetState( State_Up );
            haveCreatorComm = true;
        }
        else if (nodeInfo[i].nodeName[0] != 0
                 && nodeInfo[i].commPort[0] != 0)
        {
            if ( haveCreatorComm && i >= GetConfigPNodesCount()/2)
            {
                // Reintegration failure after connecting to half
                // of existing monitors.
                // (Braced so the macro expansion cannot change which
                // statement this condition controls.)
                TEST_POINT( TP016_NODE_UP );
            }

            if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
            {
                trace_printf("%s@%d Attempting connection to node %d (%s), "
                             "port %s\n", method_name, __LINE__, i,
                             nodeInfo[i].nodeName, nodeInfo[i].commPort);
            }

            mem_log_write(CMonLog::MON_REINTEGRATE_5, MyPNID, i);

            TEST_POINT( TP013_NODE_UP );

            // Connect to existing monitor
            if ((rc = MPI_Comm_connect( nodeInfo[i].commPort,
                                        MPI_INFO_NULL, 0, MPI_COMM_SELF,
                                        &interComm )))
            {
                HandleReintegrateError( rc, Reintegrate_Err5, i, &nodeInfo[i],
                                        false );
                SendReIntegrateStatus( State_Down, Reintegrate_Err14 );
            }

            MPI_Comm_set_errhandler( interComm, MPI_ERRORS_RETURN );

            TEST_POINT( TP014_NODE_UP );

            if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
            {
                trace_printf("%s@%d Connected to node %d (%s), sending id\n",
                             method_name, __LINE__,i,nodeInfo[i].nodeName);
            }

            // Send this nodes name and port number so other monitor
            // knows who we are.
            if ((rc = Monitor->SendMPI((char *) &myNodeInfo, sizeof(nodeId_t), 0,
                                       MON_XCHNG_DATA, interComm)))
            {
                HandleReintegrateError( rc, Reintegrate_Err4, i, &nodeInfo[i],
                                        false );
                SendReIntegrateStatus( State_Down, Reintegrate_Err14 );
            }

            if ((rc = MPI_Intercomm_merge(interComm, 1, &intraComm)))
            {
                HandleReintegrateError( rc, Reintegrate_Err6, i, NULL, false );
                SendReIntegrateStatus( State_Down, Reintegrate_Err14 );
            }

            // Get acknowledgement that other monitor is ready to
            // integrate this node. This is an interlock to avoid a
            // race condition where the creator monitor could signal
            // the monitors in the cluster to integrate the new node
            // before one or more was ready to do the integration.
            int readyFlag;
            if ((rc = Monitor->ReceiveMPI((char *) &readyFlag, sizeof(readyFlag),
                                          MPI_ANY_SOURCE, MON_XCHNG_DATA,
                                          interComm)))
            {
                HandleReintegrateError( rc, Reintegrate_Err15, i, NULL,
                                        false );
                SendReIntegrateStatus( State_Down, Reintegrate_Err14 );
            }

            if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
            {
                trace_printf( "%s@%d - Received ready-flag from node %d (%s)\n",
                              method_name, __LINE__, i,
                              nodeInfo[i].nodeName);
            }

            // A failed disconnect is logged but does not stop integration
            if ((rc = MPI_Comm_disconnect(&interComm)))
            {
                HandleReintegrateError( rc, Reintegrate_Err7, i, NULL, false );
            }

            MPI_Comm_set_errhandler(intraComm, MPI_ERRORS_RETURN);

            comms_[i] = intraComm;
            otherMonRank_[i] = 0;
            ++currentNodes_;
            // NOTE(review): only the sync port is recorded here while the
            // creator branch above records only the comm port - confirm
            // this asymmetry is intended.
            Node[i]->SetSyncPort( nodeInfo[i].syncPort );
            Node[i]->SetState( State_Up );

            // Set bit indicating node is up
            upNodes_.upNodes[i/MAX_NODE_BITMASK] |= (1ull << (i%MAX_NODE_BITMASK));

            mem_log_write(CMonLog::MON_REINTEGRATE_6, MyPNID, i);
        }
        else if ( i != MyPNID)
        {
            if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
            {
                trace_printf("%s@%d Connection to node %d not attempted, "
                             "no port information. nodeInfo[%d].port=%s, "
                             "IntegratingMonitorPort=%s\n", method_name,
                             __LINE__, i, i, nodeInfo[i].commPort,
                             IntegratingMonitorPort);
            }
        }
    }

    if (trace_settings & (TRACE_SYNC | TRACE_RECOVERY | TRACE_INIT))
    {
        for ( int i =0; i < MAX_NODE_MASKS ; i++ )
        {
            trace_printf( "%s@%d Integrating node %s (pnid=%d) "
                          "sees set[%d]: %llx\n"
                        , method_name, __LINE__
                        , MyNode->GetName(), MyPNID
                        , i, upNodes_.upNodes[i] );
        }
    }

    mem_log_write(CMonLog::MON_REINTEGRATE_7, MyPNID);

    TEST_POINT( TP015_NODE_UP );

    // Inform creator monitor that connections are complete and
    // this monitor is ready to participate in "allgather"
    // communications with the other monitors.
    SendReIntegrateStatus( State_Up, 0 );

    mem_log_write(CMonLog::MON_REINTEGRATE_8, MyPNID);

    MyNode->SetState( State_Merged );

    delete[] nodeInfo;

    TRACE_EXIT;
}
void CCluster::ReIntegrateSock( int initProblem )
{
const char method_name[] = "CCluster::ReIntegrateSock";
TRACE_ENTRY;
bool haveCreatorSocket = false;
int rc;
int existingCommFd;
int existingSyncFd;
char commPort[MPI_MAX_PORT_NAME];
char syncPort[MPI_MAX_PORT_NAME];
char *pch1;
char *pch2;
// Set bit indicating my node is up
upNodes_.upNodes[MyPNID/MAX_NODE_BITMASK] |= (1ull << (MyPNID%MAX_NODE_BITMASK));
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
trace_printf("%s@%d - Connect to creator monitor (port %s)\n",
method_name, __LINE__, IntegratingMonitorPort);
mem_log_write(CMonLog::MON_REINTEGRATE_1, MyPNID);
if (trace_settings & (TRACE_SYNC | TRACE_RECOVERY | TRACE_INIT))
{
for ( int i =0; i < MAX_NODE_MASKS ; i++ )
{
trace_printf( "%s@%d Integrating node %s (pnid=%d) "
"sees set[%d]: %llx\n"
, method_name, __LINE__
, MyNode->GetName(), MyPNID
, i, upNodes_.upNodes[i] );
}
}
TEST_POINT( TP010_NODE_UP );
// Connect with my creator monitor
bool lv_done = false;
bool lv_did_not_connect_in_first_attempt = false;
while ( ! lv_done )
{
joinSock_ = Monitor->Connect( IntegratingMonitorPort );
if ( joinSock_ < 0 )
{
if ( IsAgentMode )
{
lv_did_not_connect_in_first_attempt = true;
sleep( 15 );
}
else
{
HandleReintegrateError( joinSock_, Reintegrate_Err1, -1, NULL, true );
}
}
else
{
if ( lv_did_not_connect_in_first_attempt )
{
sleep( 10 );
}
lv_done = true;
}
}
mem_log_write(CMonLog::MON_REINTEGRATE_4, MyPNID);
TEST_POINT( TP011_NODE_UP );
// Send this node's name and port number so creator monitor
// knows who we are, and set flag to let creator monitor it is the CREATOR.
nodeId_t myNodeInfo;
strcpy(myNodeInfo.nodeName, MyNode->GetName());
strcpy(myNodeInfo.commPort, MyNode->GetCommPort());
strcpy(myNodeInfo.syncPort, MyNode->GetSyncPort());
myNodeInfo.pnid = MyNode->GetPNid();
myNodeInfo.creatorPNid = -1;
myNodeInfo.creator = true;
myNodeInfo.creatorShellPid = CreatorShellPid;
myNodeInfo.creatorShellVerifier = CreatorShellVerifier;
myNodeInfo.ping = false;
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
{
trace_printf( "%s@%d - Connected to creator monitor, sending my info:\n"
" myNodeInfo.pnid=%d (%s)\n"
" myNodeInfo.commPort=%s\n"
" myNodeInfo.syncPort=%s\n"
" myNodeInfo.creator=%d\n"
" myNodeInfo.creatorShellPid=%d:%d\n"
" myNodeInfo.ping=%d\n"
, method_name, __LINE__
, myNodeInfo.pnid
, myNodeInfo.nodeName
, myNodeInfo.commPort
, myNodeInfo.syncPort
, myNodeInfo.creator
, myNodeInfo.creatorShellPid
, myNodeInfo.creatorShellVerifier
, myNodeInfo.ping );
}
rc = Monitor->SendSock( (char *) &myNodeInfo
, sizeof(nodeId_t)
, joinSock_
, method_name );
if ( rc )
{
HandleReintegrateError( rc, Reintegrate_Err9, -1, NULL, true );
}
TEST_POINT( TP012_NODE_UP );
mem_log_write(CMonLog::MON_REINTEGRATE_3, MyPNID);
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
{
trace_printf("%s@%d Getting all node info from creator monitor\n",
method_name, __LINE__);
}
// Obtain node names & port numbers of existing monitors from
// the creator monitor.
int pnodeCount = Nodes->GetPNodesCount();
nodeId_t *nodeInfo;
size_t nodeInfoSize = (sizeof(nodeId_t) * pnodeCount);
nodeInfo = (nodeId_t *) new char[nodeInfoSize];
rc = Monitor->ReceiveSock( (char *)nodeInfo
, nodeInfoSize
, joinSock_
, method_name );
if ( rc )
{
HandleReintegrateError( rc, Reintegrate_Err3, -1, NULL, true );
}
if ( initProblem )
{
// The monitor encountered an initialization error. Inform
// the creator monitor that the node is down. Then abort.
SendReIntegrateStatus( State_Down, initProblem );
}
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
{
trace_printf( "%s@%d - Received port info from creator monitor\n"
, method_name, __LINE__);
for (int i=0; i<pnodeCount; i++)
{
trace_printf( "Port info for pnid=%d\n"
" nodeInfo[%d].nodeName=%s\n"
" nodeInfo[%d].commPort=%s\n"
" nodeInfo[%d].syncPort=%s\n"
" nodeInfo[%d].creatorPNid=%d\n"
, nodeInfo[i].pnid
, i, nodeInfo[i].nodeName
, i, nodeInfo[i].commPort
, i, nodeInfo[i].syncPort
, i, nodeInfo[i].creatorPNid );
}
}
// Connect to each of the other existing monitors and let them know
// we are the NEW monitor and reset the creator flag so they know they are
// not the creator monitor.
myNodeInfo.creator = false;
myNodeInfo.creatorShellPid = -1;
myNodeInfo.creatorShellVerifier = -1;
myNodeInfo.ping = false;
for (int i=0; i<pnodeCount; i++)
{
if ( nodeInfo[i].creatorPNid != -1 &&
nodeInfo[i].creatorPNid == nodeInfo[i].pnid )
{
// Get acknowledgement that creator monitor is ready to
// integrate this node.
int creatorpnid = -1;
rc = Monitor->ReceiveSock( (char *) &creatorpnid
, sizeof(creatorpnid)
, joinSock_
, method_name );
if ( rc || creatorpnid != nodeInfo[i].creatorPNid )
{
HandleReintegrateError( rc, Reintegrate_Err15, i, NULL,
false );
SendReIntegrateStatus( State_Down, Reintegrate_Err14 );
}
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
{
trace_printf( "%s@%d - Received ready indication from creator "
"node %d nodeInfo[%d].nodeName=%s\n"
, method_name, __LINE__
, creatorpnid, i , nodeInfo[i].nodeName);
}
otherMonRank_[nodeInfo[i].pnid] = 0;
++currentNodes_;
// Store port numbers for the node
strncpy(commPort, nodeInfo[i].commPort, MPI_MAX_PORT_NAME);
strncpy(syncPort, nodeInfo[i].syncPort, MPI_MAX_PORT_NAME);
Node[nodeInfo[i].pnid]->SetCommPort( commPort );
pch1 = strtok (commPort,":");
pch1 = strtok (NULL,":");
Node[nodeInfo[i].pnid]->SetCommSocketPort( atoi(pch1) );
Node[nodeInfo[i].pnid]->SetSyncPort( syncPort );
pch2 = strtok (syncPort,":");
pch2 = strtok (NULL,":");
Node[nodeInfo[i].pnid]->SetSyncSocketPort( atoi(pch2) );
sockPorts_[nodeInfo[i].pnid] = Node[nodeInfo[i].pnid]->GetSyncSocketPort();
Node[nodeInfo[i].pnid]->SetState( State_Up );
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
{
trace_printf( "%s@%d - Setting node %d (%s), commPort=%s(%d), syncPort=%s(%d)\n"
, method_name, __LINE__
, Node[nodeInfo[i].pnid]->GetPNid()
, Node[nodeInfo[i].pnid]->GetName()
, pch1, atoi(pch1)
, pch2, atoi(pch2) );
}
// Tell creator we are ready to accept its connection
int mypnid = MyPNID;
rc = Monitor->SendSock( (char *) &mypnid
, sizeof(mypnid)
, joinSock_
, method_name );
if ( rc )
{
HandleReintegrateError( rc, Reintegrate_Err4, i, &nodeInfo[i],
false );
SendReIntegrateStatus( State_Down, Reintegrate_Err14 );
}
// Connect to creator monitor
existingSyncFd = AcceptSyncSock();
if ( existingSyncFd < 0 )
{
HandleReintegrateError( rc, Reintegrate_Err5, i, &nodeInfo[i],
false );
SendReIntegrateStatus( State_Down, Reintegrate_Err14 );
}
socks_[nodeInfo[i].pnid] = existingSyncFd; // ReIntegrateSock
// Set bit indicating node is up
upNodes_.upNodes[nodeInfo[i].pnid/MAX_NODE_BITMASK] |=
(1ull << (nodeInfo[i].pnid%MAX_NODE_BITMASK));
if (trace_settings & (TRACE_RECOVERY | TRACE_INIT))
{
trace_printf( "%s@%d Connected to creator node %d (%s)\n"
, method_name, __LINE__
, nodeInfo[i].creatorPNid
, nodeInfo[i].nodeName );
trace_printf( "%s@%d socks_[%d]=%d\n"
, method_name, __LINE__
, nodeInfo[i].pnid, socks_[nodeInfo[i].pnid]);
for ( int i =0; i < MAX_NODE_MASKS ; i++ )
{
trace_printf( "%s@%d Integrating node %s (pnid=%d) "
"sees set[%d]: %llx\n"
, method_name, __LINE__
, MyNode->GetName(), MyPNID
, i, upNodes_.upNodes[i] );
}
}
haveCreatorSocket = true;
}
else if ( nodeInfo[i].nodeName[0] != 0 && nodeInfo[i].commPort[0] != 0 )
{
if ( haveCreatorSocket && i >= pnodeCount/2)
// Reintegration failure after connecting to half
// of existing monitors.
TEST_POINT( TP016_NODE_UP );
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
{
trace_printf("%s@%d Attempting connection to node %d (%s), "
"port %s\n", method_name, __LINE__, nodeInfo[i].pnid,
nodeInfo[i].nodeName, nodeInfo[i].commPort);
}
mem_log_write(CMonLog::MON_REINTEGRATE_5, MyPNID, i);
TEST_POINT( TP013_NODE_UP );
// Connect to existing monitor
existingCommFd = Monitor->Connect( nodeInfo[i].commPort );
if ( existingCommFd < 0 )
{
HandleReintegrateError( rc, Reintegrate_Err5, i, &nodeInfo[i],
false );
SendReIntegrateStatus( State_Down, Reintegrate_Err14 );
}
TEST_POINT( TP014_NODE_UP );
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
{
trace_printf("%s@%d Connected to node %d (%s), sending my node name\n",
method_name, __LINE__,i,nodeInfo[i].nodeName);
}
// Send this nodes name and port number so other monitor
// knows who we are.
rc = Monitor->SendSock( (char *) &myNodeInfo
, sizeof(nodeId_t)
, existingCommFd
, method_name );
if ( rc )
{
HandleReintegrateError( rc, Reintegrate_Err4, i, &nodeInfo[i],
false );
SendReIntegrateStatus( State_Down, Reintegrate_Err14 );
}
// Get acknowledgement that other monitor is ready to
// integrate this node. This is an interlock to avoid a
// race condition where the creator monitor could signal
// the monitors in the cluster to integrate the new node
// before one or more was ready to do the integration.
int remotepnid = -1;
rc = Monitor->ReceiveSock( (char *) &remotepnid
, sizeof(remotepnid)
, existingCommFd
, method_name );
if ( rc || remotepnid != nodeInfo[i].pnid )
{
HandleReintegrateError( rc, Reintegrate_Err15, i, NULL,
false );
SendReIntegrateStatus( State_Down, Reintegrate_Err14 );
}
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
{
trace_printf( "%s@%d - Received ready indication from "
"node %d nodeInfo[%d].nodeName=%s\n"
, method_name, __LINE__
, remotepnid, i , nodeInfo[i].nodeName);
}
otherMonRank_[nodeInfo[i].pnid] = 0;
++currentNodes_;
// Store port numbers for the node
strncpy(commPort, nodeInfo[i].commPort, MPI_MAX_PORT_NAME);
strncpy(syncPort, nodeInfo[i].syncPort, MPI_MAX_PORT_NAME);
Node[nodeInfo[i].pnid]->SetCommPort( commPort );
pch1 = strtok (commPort,":");
pch1 = strtok (NULL,":");
Node[nodeInfo[i].pnid]->SetCommSocketPort( atoi(pch1) );
Node[nodeInfo[i].pnid]->SetSyncPort( syncPort );
pch2 = strtok (syncPort,":");
pch2 = strtok (NULL,":");
Node[nodeInfo[i].pnid]->SetSyncSocketPort( atoi(pch2) );
sockPorts_[nodeInfo[i].pnid] = Node[nodeInfo[i].pnid]->GetSyncSocketPort();
Node[nodeInfo[i].pnid]->SetState( State_Up );
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
{
trace_printf( "%s@%d - Setting node %d (%s), commPort=%s(%d), syncPort=%s(%d)\n"
, method_name, __LINE__
, Node[nodeInfo[i].pnid]->GetPNid()
, Node[nodeInfo[i].pnid]->GetName()
, pch1, atoi(pch1)
, pch2, atoi(pch2) );
}
// Connect to existing monitor
existingSyncFd = AcceptSyncSock();
if ( existingSyncFd < 0 )
{
HandleReintegrateError( rc, Reintegrate_Err5, i, &nodeInfo[i],
false );
SendReIntegrateStatus( State_Down, Reintegrate_Err14 );
}
socks_[nodeInfo[i].pnid] = existingSyncFd; // ReIntegrateSock
// Set bit indicating node is up
upNodes_.upNodes[nodeInfo[i].pnid/MAX_NODE_BITMASK] |=
(1ull << (nodeInfo[i].pnid%MAX_NODE_BITMASK));
if (trace_settings & (TRACE_RECOVERY | TRACE_INIT))
{
trace_printf( "%s@%d socks_[%d]=%d\n"
, method_name, __LINE__
, nodeInfo[i].pnid, socks_[nodeInfo[i].pnid]);
for ( int i =0; i < MAX_NODE_MASKS ; i++ )
{
trace_printf( "%s@%d Integrating node %s (pnid=%d) "
"sees set[%d]: %llx\n"
, method_name, __LINE__
, MyNode->GetName(), MyPNID
, i, upNodes_.upNodes[i] );
}
}
mem_log_write(CMonLog::MON_REINTEGRATE_6, MyPNID, i);
}
else if ( nodeInfo[i].pnid != MyPNID)
{
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
{
trace_printf( "%s@%d Connection to node %d not attempted, "
"since it's unavailable port information.\n"
"nodeInfo[%d].commPort=%s\n"
"nodeInfo[%d].syncPort=%s\n"
"IntegratingMonitorPort=%s\n"
, method_name, __LINE__
, nodeInfo[i].pnid
, i, nodeInfo[i].commPort
, i, nodeInfo[i].syncPort
, IntegratingMonitorPort);
}
}
}
if (trace_settings & (TRACE_RECOVERY | TRACE_INIT))
{
for (int i=0; i<pnodeCount; i++)
{
if (nodeInfo[i].pnid == -1) continue;
if (Node[nodeInfo[i].pnid] == NULL) continue;
trace_printf( "%s@%d - Node info for pnid=%d (%s)\n"
" Node[%d] commPort=%s\n"
" Node[%d] syncPort=%s\n"
" Node[%d] creatorPNid=%d\n"
, method_name, __LINE__
, Node[nodeInfo[i].pnid]->GetPNid()
, Node[nodeInfo[i].pnid]->GetName()
, nodeInfo[i].pnid, Node[nodeInfo[i].pnid]->GetCommPort()
, nodeInfo[i].pnid, Node[nodeInfo[i].pnid]->GetSyncPort()
, nodeInfo[i].pnid, nodeInfo[i].creatorPNid);
}
for ( int i =0; i < pnodeCount; i++ )
{
if (nodeInfo[i].pnid == -1) continue;
trace_printf( "%s@%d socks_[%d]=%d, sockPorts_[%d]=%d\n"
, method_name, __LINE__
, nodeInfo[i].pnid, socks_[nodeInfo[i].pnid]
, nodeInfo[i].pnid, sockPorts_[nodeInfo[i].pnid]);
}
for ( int i =0; i < MAX_NODE_MASKS ; i++ )
{
trace_printf( "%s@%d Integrating node %s (pnid=%d) "
"sees set[%d]: %llx\n"
, method_name, __LINE__
, MyNode->GetName(), MyPNID
, i, upNodes_.upNodes[i] );
}
}
mem_log_write(CMonLog::MON_REINTEGRATE_7, MyPNID);
TEST_POINT( TP015_NODE_UP );
// Inform creator monitor that connections are complete and
// this monitor is ready to participate in "allgather"
// communications with the other monitors.
SendReIntegrateStatus( State_Up, 0 );
mem_log_write(CMonLog::MON_REINTEGRATE_8, MyPNID);
MyNode->SetState( State_Merged );
delete[] nodeInfo;
TRACE_EXIT;
}
// Ends a node integration cycle on this monitor.
//
// Releases the join channel for the active transport (MPI communicator or
// socket), clears this node's creator role if it holds one, resets
// integratingPNid_ to -1 and, if they are not already accepting, tells the
// comm acceptor object(s) to start accepting connections.
// Aborts the process if CommType is neither InfiniBand nor Sockets.
void CCluster::ResetIntegratingPNid( void )
{
    const char method_name[] = "CCluster::ResetIntegratingPNid";
    TRACE_ENTRY;

    // Tear down the channel that was used to talk to the joining node.
    if ( CommType == CommType_InfiniBand )
    {
        if ( joinComm_ != MPI_COMM_NULL )
        {
            MPI_Comm_free( &joinComm_ );
            joinComm_ = MPI_COMM_NULL;
        }
    }
    else if ( CommType == CommType_Sockets )
    {
        if ( joinSock_ != -1 )
        {
            close(joinSock_);
            joinSock_ = -1;
        }
    }
    else
    {
        // Unknown transport type: programming error.
        abort();
    }

    // Drop the creator role, if this node currently holds it.
    if ( MyNode->IsCreator() )
    {
        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
        {
            trace_printf( "%s@%d - Resetting creator pnid=%d\n",
                          method_name, __LINE__, MyPNID );
        }
        MyNode->SetCreator( false, -1, -1 );
    }

    // Trace the old value before it is overwritten.
    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
    {
        trace_printf( "%s@%d - Resetting integratingPNid_=%d\n",
                      method_name, __LINE__, integratingPNid_ );
    }
    integratingPNid_ = -1;

#ifdef NAMESERVER_PROCESS
    if (!CommAcceptMon.isAccepting())
    {
        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
        {
            trace_printf( "%s@%d - Triggering commAcceptorMon thread to begin accepting connections\n",
                          method_name, __LINE__ );
        }
        // Let the commAcceptorMon thread accept connections again.
        CommAcceptMon.startAccepting();
    }
#endif

    if (!CommAccept.isAccepting())
    {
        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
        {
            trace_printf( "%s@%d - Triggering commAcceptor thread to begin accepting connections\n",
                          method_name, __LINE__ );
        }
        // Let the commAcceptor thread accept connections again.
        CommAccept.startAccepting();
    }

    TRACE_EXIT;
}
void CCluster::SetIntegratingPNid( int pnid )
{
const char method_name[] = "CCluster::SetIntegratingPNid";
TRACE_ENTRY;
integratingPNid_ = pnid;
TRACE_EXIT;
}
// Save information about a new communicator for a node that is reintegrating
void CCluster::addNewComm(int pnid, int otherRank, MPI_Comm comm)
{
const char method_name[] = "CCluster::addNewComm";
TRACE_ENTRY;
if (trace_settings & TRACE_RECOVERY)
{
trace_printf("%s@%d - saving communicator for pnid %d\n",
method_name, __LINE__, pnid);
}
// Insert info for new comm into list
commInfo_t commInfo = {pnid, otherRank, comm, -1, {0, 0}};
clock_gettime(CLOCK_REALTIME, &commInfo.ts);
newCommsLock_.lock();
newComms_.push_back( commInfo );
newCommsLock_.unlock();
TRACE_EXIT;
}
// A node is reintegrating. Add the communicator for the node to the set of
// communicators used by "Allgather".
void CCluster::setNewComm( int pnid )
{
const char method_name[] = "CCluster::setNewComm";
TRACE_ENTRY;
newComms_t::iterator it;
bool foundComm = false;
if ( comms_[pnid] != MPI_COMM_NULL )
{ // Unexpectedly already have a communicator for this node
char buf[MON_STRING_BUF_SIZE];
snprintf(buf, sizeof(buf), "[%s] Unexpectedly already have a "
"communicator for node %d\n", method_name, pnid);
mon_log_write(MON_CLUSTER_SETNEWCOMM_1, SQ_LOG_ERR, buf);
MPI_Comm_free( &comms_[pnid] );
if ( CommType == CommType_Sockets )
{
shutdown( socks_[pnid], SHUT_RDWR);
close( socks_[pnid] );
socks_[pnid] = -1;
}
}
newCommsLock_.lock();
for ( it = newComms_.begin(); it != newComms_.end(); )
{
if ( it->pnid == pnid )
{
if ( comms_[pnid] != MPI_COMM_NULL )
{ // Found another communicator for the specified node.
// Disconnect from the previous one. It must be a
// stale leftover from a previous reintegration
// attempt for the node.
if (trace_settings & TRACE_RECOVERY)
{
trace_printf("%s@%d - discarding stale communicator for "
"pnid %d\n", method_name, __LINE__, pnid);
}
MPI_Comm_free( &comms_[pnid] );
if ( CommType == CommType_Sockets )
{
shutdown( socks_[pnid], SHUT_RDWR);
close( socks_[pnid] );
socks_[pnid] = -1;
}
--currentNodes_;
}
if (trace_settings & TRACE_RECOVERY)
{
trace_printf("%s@%d - setting new communicator for pnid %d, "
"otherRank=%d\n",
method_name, __LINE__, it->pnid, it->otherRank);
}
comms_[it->pnid] = it->comm;
otherMonRank_[it->pnid] = it->otherRank;
++currentNodes_;
// Set bit indicating node is up
upNodes_.upNodes[it->pnid/MAX_NODE_BITMASK] |= (1ull << (it->pnid%MAX_NODE_BITMASK));
// Delete current list element and advance to next one
it = newComms_.erase ( it );
foundComm = true;
}
else
{ // Advance to next list element
++it;
}
}
newCommsLock_.unlock();
if ( !foundComm )
{ // We have no communicator for the specified node.
char buf[MON_STRING_BUF_SIZE];
snprintf(buf, sizeof(buf), "[%s] Could not find a communicator for "
"node %d\n", method_name, pnid);
mon_log_write(MON_CLUSTER_SETNEWCOMM_2, SQ_LOG_ERR, buf);
}
TRACE_EXIT;
}
// Save information about a new socket for a node that is reintegrating
void CCluster::addNewSock(int pnid, int otherRank, int sockFd)
{
const char method_name[] = "CCluster::addNewSock";
TRACE_ENTRY;
if (trace_settings & TRACE_RECOVERY)
{
trace_printf("%s@%d - saving socket for pnid %d\n",
method_name, __LINE__, pnid);
}
// Insert info for new comm into list
commInfo_t commInfo = {pnid, otherRank, MPI_COMM_NULL, sockFd, {0, 0}};
clock_gettime(CLOCK_REALTIME, &commInfo.ts);
newCommsLock_.lock();
newComms_.push_back( commInfo );
newCommsLock_.unlock();
TRACE_EXIT;
}
// A node is reintegrating. Add the socket for the node to the set of
// communicators used by "Allgather".
void CCluster::setNewSock( int pnid )
{
const char method_name[] = "CCluster::setNewSock";
TRACE_ENTRY;
newComms_t::iterator it;
bool foundSocket = false;
if ( socks_[pnid] != -1 )
{ // Unexpectedly already have a communicator for this node
char buf[MON_STRING_BUF_SIZE];
snprintf(buf, sizeof(buf), "[%s] Unexpectedly already have a "
"socket for node %d\n", method_name, pnid);
mon_log_write(MON_CLUSTER_SETNEWSOCK_1, SQ_LOG_ERR, buf);
shutdown( socks_[pnid], SHUT_RDWR);
close( socks_[pnid] );
socks_[pnid] = -1;
}
newCommsLock_.lock();
for ( it = newComms_.begin(); it != newComms_.end(); )
{
if ( it->pnid == pnid )
{
if ( socks_[pnid] != -1 )
{ // Found another socket for the specified node.
// Disconnect from the previous one. It must be a
// stale leftover from a previous reintegration
// attempt for the node.
if (trace_settings & TRACE_RECOVERY)
{
trace_printf("%s@%d - discarding stale communicator for "
"pnid %d\n", method_name, __LINE__, pnid);
}
shutdown( socks_[pnid], SHUT_RDWR);
close( socks_[pnid] );
socks_[pnid] = -1;
--currentNodes_;
}
CNode *node = Nodes->GetNode( it->pnid );
socks_[it->pnid] = it->socket; // setNewSock
sockPorts_[it->pnid] = node->GetSyncSocketPort();
otherMonRank_[it->pnid] = it->otherRank;
++currentNodes_;
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
{
trace_printf( "%s@%d - Setting new communicator for %d (%s), "
"socks_[%d]=%d, sockPorts_[%d]=%d, otherMonRank_[%d]=%d\n"
, method_name, __LINE__
, node->GetPNid()
, node->GetName()
, it->pnid, socks_[it->pnid]
, it->pnid, sockPorts_[it->pnid]
, it->pnid, otherMonRank_[it->pnid] );
}
// Set bit indicating node is up
upNodes_.upNodes[it->pnid/MAX_NODE_BITMASK] |= (1ull << (it->pnid%MAX_NODE_BITMASK));
// Delete current list element and advance to next one
it = newComms_.erase ( it );
foundSocket = true;
}
else
{ // Advance to next list element
++it;
}
}
newCommsLock_.unlock();
if ( !foundSocket )
{ // We have no communicator for the specified node.
char buf[MON_STRING_BUF_SIZE];
snprintf(buf, sizeof(buf), "[%s] Could not find a socket for "
"node %d\n", method_name, pnid);
mon_log_write(MON_CLUSTER_SETNEWSOCK_2, SQ_LOG_ERR, buf);
}
TRACE_EXIT;
}
// Transport dispatcher for the monitor "allgather" exchange.
//
// Forwards all arguments to AllgatherIB() or AllgatherSock() according to
// CommType and returns that routine's result. Any other CommType value
// calls MPI_Abort().
int CCluster::Allgather( int nbytes, void *sbuf, char *rbuf, int tag, MPI_Status *stats )
{
    const char method_name[] = "CCluster::Allgather";
    TRACE_ENTRY;

    int status = 0;

    if ( CommType == CommType_InfiniBand )
    {
        status = AllgatherIB( nbytes, sbuf, rbuf, tag, stats );
    }
    else if ( CommType == CommType_Sockets )
    {
        status = AllgatherSock( nbytes, sbuf, rbuf, tag, stats );
    }
    else
    {
        // Unknown transport type: programming error.
        MPI_Abort(MPI_COMM_SELF,99);
    }

    TRACE_EXIT;
    return status;
}
int CCluster::AllgatherIB( int nbytes, void *sbuf, char *rbuf, int tag, MPI_Status *stats )
{
const char method_name[] = "CCluster::AllgatherIB";
TRACE_ENTRY;
int e;
int err = 0;
MPI_Request r[2*GetConfigPNodesCount()];
MPI_Status s[2*GetConfigPNodesCount()];
for ( int i = 0; i < 2*GetConfigPNodesCount(); i++ )
{
s[i].MPI_ERROR = MPI_SUCCESS;
r[i] = MPI_REQUEST_NULL;
}
char *cp = rbuf;
for ( int i = 0; i < GetConfigPNodesCount(); i++ )
{
if ( comms_[i] != MPI_COMM_NULL && otherMonRank_[i] != -1 )
{
e = MPI_Send_init( sbuf, nbytes, MPI_CHAR, otherMonRank_[i], tag,
comms_[i], &r[i] );
if ( e != MPI_SUCCESS )
{
MPI_Error_class( e, &err );
char buf[MON_STRING_BUF_SIZE];
snprintf( buf, sizeof(buf)
, "[%s], Comunication error with pnid=%d (%s), "
"MPI_Send_init() error=%s (%d)\n"
, method_name, i, Node[i]->GetName()
, ErrorMsg(e), e );
mon_log_write(MON_CLUSTER_ALLGATHERIB_1, SQ_LOG_ERR, buf);
goto early_exit;
}
e = MPI_Recv_init( cp, CommBufSize, MPI_CHAR, otherMonRank_[i], tag,
comms_[i], &r[i+GetConfigPNodesCount()] );
if ( e != MPI_SUCCESS )
{
MPI_Error_class( e, &err );
char buf[MON_STRING_BUF_SIZE];
snprintf( buf, sizeof(buf)
, "[%s], Comunication error with pnid=%d (%s), "
"MPI_Recv_init() error=%s (%d)\n"
, method_name, i, Node[i]->GetName()
, ErrorMsg(e), e );
mon_log_write(MON_CLUSTER_ALLGATHERIB_2, SQ_LOG_ERR, buf);
goto early_exit;
}
}
cp += CommBufSize;
}
for ( int i = 0; i < 2*GetConfigPNodesCount(); i++ )
{
if ( r[i] == MPI_REQUEST_NULL ) continue;
e = MPI_Start( &r[i] );
if ( e != MPI_SUCCESS )
{
MPI_Error_class( e, &err );
char buf[MON_STRING_BUF_SIZE];
int pnid = (i < GetConfigPNodesCount()) ? i : (i - GetConfigPNodesCount());
snprintf( buf, sizeof(buf)
, "[%s], Comunication error with pnid=%d (%s), "
"MPI_Start() error=%s (%d)\n"
, method_name, pnid, Node[pnid]->GetName()
, ErrorMsg(e), e );
mon_log_write(MON_CLUSTER_ALLGATHERIB_3, SQ_LOG_ERR, buf);
goto early_exit;
}
}
inBarrier_ = true;
if (sonar_verify_state(SONAR_ENABLED | SONAR_MONITOR_ENABLED))
MonStats->BarrierWaitIncr();
e = MPI_Waitall( GetConfigPNodesCount()*2, r, s );
if ( e != MPI_SUCCESS )
{
MPI_Error_class( e, &err );
if ( err != MPI_ERR_IN_STATUS )
{
char buf[MON_STRING_BUF_SIZE];
snprintf( buf, sizeof(buf), "[%s], MPI_Waitall() error=%s (%d)\n"
, method_name, ErrorMsg(e), e );
mon_log_write(MON_CLUSTER_ALLGATHERIB_4, SQ_LOG_ERR, buf);
inBarrier_ = false;
goto early_exit;
}
}
if (sonar_verify_state(SONAR_ENABLED | SONAR_MONITOR_ENABLED))
MonStats->BarrierWaitDecr();
inBarrier_ = false;
for ( int i = 0; i < GetConfigPNodesCount(); i++ )
{
stats[i] = s[i+GetConfigPNodesCount()];
}
if ( e == MPI_SUCCESS )
{
err = MPI_SUCCESS;
goto early_exit;
}
for ( int i = 0; i < GetConfigPNodesCount(); i++ )
{
if ( s[i].MPI_ERROR != MPI_SUCCESS && // send
s[i+GetConfigPNodesCount()].MPI_ERROR == MPI_SUCCESS ) // receive
{
stats[i].MPI_ERROR = s[i].MPI_ERROR;
}
}
early_exit:
for ( int i = 0; i < 2*GetConfigPNodesCount(); i++ )
{
if ( r[i] != MPI_REQUEST_NULL )
{
MPI_Request_free( &r[i] );
}
}
barrierCount_++;
TRACE_EXIT;
return err;
}
// Socket (epoll) implementation of the monitor "allgather" exchange.
//
// Sends `nbytes` from `sbuf` to every peer monitor that has a socket in
// socks_[] and receives each peer's message into
// `rbuf + pnid * CommBufSize`. A received message starts with a header of
// hdrSize bytes (Nodes->GetSyncHdrSize()); the number of bytes that follow
// the header is read from the header's msgInfo.msg_offset field.
//
// When epoll_wait() times out, peers with outstanding IO are counted. On a
// real cluster, connectivity is re-checked through AllgatherSockReconnect()
// and, once a peer reaches the retry limit, connections are re-established
// and the IO is redriven from the start.
//
// Parameters:
//   nbytes - number of bytes of sbuf sent to each peer
//   sbuf   - buffer sent to every peer
//   rbuf   - receive area, one CommBufSize slot per pnid
//   tag    - unused by this implementation
//   stats  - out: per-pnid MPI_ERROR (MPI_SUCCESS or MPI_ERR_EXITED) and
//            count (bytes received)
//
// Returns MPI_SUCCESS, or MPI_ERR_IN_STATUS when at least one peer failed
// (inspect stats[]). Unrecoverable conditions call MPI_Abort().
//
// Environment (read once): SQ_MON_EPOLL_WAIT_TIMEOUT (seconds) and
// SQ_MON_EPOLL_RETRY_COUNT (capped at 180).
int CCluster::AllgatherSock( int nbytes, void *sbuf, char *rbuf, int tag, MPI_Status *stats )
{
    const char method_name[] = "CCluster::AllgatherSock";
    TRACE_ENTRY;

    bool reconnecting = false;
    static int hdrSize = Nodes->GetSyncHdrSize( );
    int err = MPI_SUCCESS;
    peer_t p[GetConfigPNodesMax()];
    memset( p, 0, sizeof(p) );
    tag = tag; // make compiler happy

    // Set to twice the ZClient session timeout
    static int sessionTimeout = ZClientEnabled
                              ? (ZClient->GetSessionTimeout() * 2) : 120;

    // nsent/nrecv count peers whose send/receive side is finished (or was
    // never needed); the exchange is complete when both reach the peer count.
    int nsent = 0, nrecv = 0;
    for ( int iPeer = 0; iPeer < GetConfigPNodesCount(); iPeer++ )
    {
        peer_t *peer = &p[indexToPnid_[iPeer]];
        stats[indexToPnid_[iPeer]].MPI_ERROR = MPI_SUCCESS;
        stats[indexToPnid_[iPeer]].count = 0;
        if ( indexToPnid_[iPeer] == MyPNID || socks_[indexToPnid_[iPeer]] == -1 )
        {   // Peer is me or has no socket: nothing to exchange
            peer->p_sending = peer->p_receiving = false;
            nsent++;
            nrecv++;
        }
        else
        {
            peer->p_sending = peer->p_receiving = true;
            peer->p_sent = peer->p_received = 0;
            peer->p_timeout_count = 0;
            peer->p_initial_check = true;
            peer->p_n2recv = -1;
            peer->p_buff = ((char *) rbuf) + (indexToPnid_[iPeer] * CommBufSize);
            struct epoll_event event;
            event.data.fd = socks_[indexToPnid_[iPeer]];
            event.events = EPOLLIN | EPOLLOUT | EPOLLET | EPOLLRDHUP | EPOLLERR | EPOLLHUP;
            EpollCtl( epollFD_, EPOLL_CTL_ADD, socks_[indexToPnid_[iPeer]], &event );
        }
    }

    if (trace_settings & (TRACE_SYNC | TRACE_SYNC_DETAIL))
    {
        for ( int i = 0; i < GetConfigPNodesCount(); i++ )
        {
            if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
            {
                peer_t *peer = &p[indexToPnid_[i]];
                trace_printf( "%s@%d" " - socks_[%d]=%d, "
                              "peer->p_sending=%d, "
                              "peer->p_receiving=%d\n"
                            , method_name, __LINE__
                            , indexToPnid_[i]
                            , socks_[indexToPnid_[i]]
                            , peer->p_sending
                            , peer->p_receiving );
            }
        }
    }

    inBarrier_ = true;
    MonStats->BarrierWaitIncr( );

    // epoll timeout settings are resolved once; -2 means "not yet read".
    static int sv_epoll_wait_timeout = -2;
    static int sv_epoll_retry_count = 1;
    if ( sv_epoll_wait_timeout == -2 )
    {
        char *lv_epoll_wait_timeout_env = getenv( "SQ_MON_EPOLL_WAIT_TIMEOUT" );
        if ( lv_epoll_wait_timeout_env )
        {
            // convert to milliseconds
            sv_epoll_wait_timeout = atoi( lv_epoll_wait_timeout_env ) * 1000;
            char *lv_epoll_retry_count_env = getenv( "SQ_MON_EPOLL_RETRY_COUNT" );
            if ( lv_epoll_retry_count_env )
            {
                sv_epoll_retry_count = atoi( lv_epoll_retry_count_env );
            }
            if ( sv_epoll_retry_count > 180 )
            {
                sv_epoll_retry_count = 180;
            }
        }
        else
        {
            // default to 64 seconds
            sv_epoll_wait_timeout = 16000;
            sv_epoll_retry_count = 4;
        }

        char buf[MON_STRING_BUF_SIZE];
        snprintf( buf, sizeof(buf)
                , "[%s@%d] EPOLL timeout wait_timeout=%d msecs, retry_count=%d\n"
                , method_name
                , __LINE__
                , sv_epoll_wait_timeout
                , sv_epoll_retry_count );
        mon_log_write( MON_CLUSTER_ALLGATHERSOCK_1, SQ_LOG_INFO, buf );
    }

    // do the work
    struct epoll_event events[2*GetConfigPNodesMax() + 1];
    while ( 1 )
    {
reconnected:
        bool checkConnections = false;
        bool doReconnect = false;
        bool resetConnections = false;
        int peerTimedoutCount = 0;

        // Outstanding IO halves; zero means the exchange is complete.
        int maxEvents = 2*GetConfigPNodesCount() - nsent - nrecv;
        if ( maxEvents == 0 ) break;

        int nw;
        peer_t *peer;

        while ( 1 )
        {   // Retry epoll_wait() when interrupted by a signal
            nw = epoll_wait( epollFD_, events, maxEvents, sv_epoll_wait_timeout );
            if ( nw >= 0 || errno != EINTR ) break;
        }

        if ( nw == 0 )
        { // Timeout, no fd's ready
            for ( int iPeer = 0; iPeer < GetConfigPNodesCount(); iPeer++ )
            { // Check no IO completion on peers
                peer = &p[indexToPnid_[iPeer]];
                if ( (peer->p_receiving) || (peer->p_sending) )
                {
                    peerTimedoutCount++;
                    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
                    {
                        trace_printf( "%s@%d - EPOLL timeout (%d) on: %s(%d), "
                                      "socks_[%d]=%d, "
                                      "peer->p_sending=%d, "
                                      "peer->p_receiving=%d\n"
                                    , method_name, __LINE__
                                    , peerTimedoutCount
                                    , Node[indexToPnid_[iPeer]]->GetName(), indexToPnid_[iPeer]
                                    , indexToPnid_[iPeer]
                                    , socks_[indexToPnid_[iPeer]]
                                    , peer->p_sending
                                    , peer->p_receiving );
                    }

                    if (peer->p_initial_check && !reconnecting)
                    { // Set the session timeout relative to now
                        peer->p_initial_check = false;
                        clock_gettime(CLOCK_REALTIME, &peer->znodeFailedTime);
                        peer->znodeFailedTime.tv_sec += sessionTimeout;
                        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
                        {
                            trace_printf( "%s@%d" " - Znode Fail Time %ld(secs)\n"
                                        , method_name, __LINE__
                                        , peer->znodeFailedTime.tv_sec);
                        }
                    }

                    if ( IsRealCluster && peer->p_timeout_count < sv_epoll_retry_count )
                    {
                        peer->p_timeout_count++;
                        checkConnections = true;
                        if (peer->p_timeout_count == sv_epoll_retry_count)
                        {   // Retry budget used up: force new connections
                            resetConnections = true;
                        }
                    }
                    else
                    {
                        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
                        {
                            trace_printf( "%s@%d" " - Peer timed out: %s(%d), "
                                          "socks_[%d]=%d, "
                                          "peer->p_timeout_count=%d\n"
                                        , method_name, __LINE__
                                        , Node[indexToPnid_[iPeer]]->GetName(), indexToPnid_[iPeer]
                                        , indexToPnid_[iPeer]
                                        , socks_[indexToPnid_[iPeer]]
                                        , peer->p_timeout_count );
                        }
                    }
                }
            } // Check no IO completion on peers

            if (checkConnections)
            {
                checkConnections = false;
                if (trace_settings & TRACE_RECOVERY)
                {
                    trace_printf( "%s@%d - Initializing AllgatherSockReconnect(),"
                                  " peerTimedoutCount=%d\n"
                                , method_name, __LINE__
                                , peerTimedoutCount );
                }
                // First, check ability to connect to all peers
                // An err returned will mean that connect failed with
                // at least one peer. No err implies that possible network
                // reset occurred and there is probably one dead connection
                // to a peer where no IOs will complete ever, so connections
                // to all peers must be reestablished.
                err = AllgatherSockReconnect( stats, false );
                if (err == MPI_SUCCESS)
                { // Connections to all peers are good
                    if (resetConnections)
                    { // Establish new connections on all peers
                        resetConnections = false;
                        err = AllgatherSockReconnect( stats, true );
                        // Redrive IOs on new peer connections
                        nsent = 0; nrecv = 0;
                        for ( int i = 0; i < GetConfigPNodesCount(); i++ )
                        {
                            peer = &p[indexToPnid_[i]];
                            if ( indexToPnid_[i] == MyPNID || socks_[indexToPnid_[i]] == -1 )
                            { // peer is me or not available
                                peer->p_sending = peer->p_receiving = false;
                                nsent++;
                                nrecv++;
                            }
                            else
                            {
                                peer->p_sending = peer->p_receiving = true;
                                peer->p_sent = peer->p_received = 0;
                                peer->p_n2recv = -1;
                                peer->p_buff = ((char *) rbuf) + (indexToPnid_[i] * CommBufSize);
                                struct epoll_event event;
                                event.data.fd = socks_[indexToPnid_[i]];
                                event.events = EPOLLIN | EPOLLOUT | EPOLLET | EPOLLRDHUP | EPOLLERR | EPOLLHUP;
                                EpollCtl( epollFD_, EPOLL_CTL_ADD, socks_[indexToPnid_[i]], &event );
                            }
                        }
                    } // (resetConnections)
                } // (err == MPI_SUCCESS)
                else
                {
                    for ( int i = 0; i < GetConfigPNodesCount(); i++ )
                    {
                        peer = &p[indexToPnid_[i]];
                        if ( indexToPnid_[i] != MyPNID && socks_[indexToPnid_[i]] == -1 )
                        { // peer is not me and no longer available
                            if (trace_settings & (TRACE_INIT | TRACE_RECOVERY) &&
                                (peer->p_sending || peer->p_receiving) )
                            {
                                trace_printf( "%s@%d No IO completion on %s(%d):socks_[%d]=%d, "
                                              "peer->p_sending=%d, "
                                              "peer->p_receiving=%d\n"
                                            , method_name, __LINE__
                                            , Node[indexToPnid_[i]]->GetName(), indexToPnid_[i]
                                            , indexToPnid_[i]
                                            , socks_[indexToPnid_[i]]
                                            , peer->p_sending
                                            , peer->p_receiving );
                            }
                            // Retire whatever IO was still pending on it
                            if (peer->p_sending)
                            {
                                nsent++;
                                peer->p_sending = false;
                            }
                            if (peer->p_receiving)
                            {
                                peer->p_receiving = false;
                                nrecv++;
                            }
                        }
                    }
                }
                doReconnect = true;
            } // (checkConnections)

            if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
            {
                for ( int i = 0; i < GetConfigPNodesCount(); i++ )
                {
                    peer = &p[indexToPnid_[i]];
                    trace_printf( "%s@%d doReconnect=%d, %s(%d):socks_[%d]=%d, "
                                  "peer->p_sending=%d, "
                                  "peer->p_receiving=%d\n"
                                , method_name, __LINE__
                                , doReconnect
                                , Node[indexToPnid_[i]]->GetName(), indexToPnid_[i]
                                , indexToPnid_[i]
                                , socks_[indexToPnid_[i]]
                                , peer->p_sending
                                , peer->p_receiving );
                }
            }

            if (doReconnect)
            {
                reconnectSeqNum_ = seqNum_;
                reconnecting = true;
                if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
                {
                    trace_printf( "%s@%d" " - Reconnecting! (reconnectSeqNum_=%lld)\n"
                                , method_name, __LINE__, reconnectSeqNum_ );
                }
                // Restart the wait with the recomputed nsent/nrecv
                goto reconnected;
            }
        } // ( nw == 0 )

        if ( nw < 0 )
        { // Got an error
            char ebuff[256];
            char buf[MON_STRING_BUF_SIZE];
            snprintf( buf, sizeof(buf), "[%s@%d] epoll_wait(%d, %d) error: %s\n",
                      method_name, __LINE__, epollFD_, maxEvents,
                      strerror_r( errno, ebuff, 256 ) );
            mon_log_write( MON_CLUSTER_ALLGATHERSOCK_3, SQ_LOG_CRIT, buf );
            MPI_Abort( MPI_COMM_SELF,99 );
        }

        // Process fd's which are ready to initiate an IO or completed IO
        for ( int iEvent = 0; iEvent < nw; iEvent++ )
        {
            bool stateChange = false;
            int fd = events[iEvent].data.fd;
            int iPeer;
            for ( iPeer = 0; iPeer < GetConfigPNodesCount(); iPeer++ )
            { // Find corresponding peer by matching socket fd
                if ( events[iEvent].data.fd == socks_[indexToPnid_[iPeer]] ) break;
            }

            // When no peer owns the fd, iPeer equals the peer count; do not
            // index indexToPnid_[] or p[] with it, treat it as pnid -1.
            const int peerPnid = ( iPeer < GetConfigPNodesCount() )
                               ? indexToPnid_[iPeer] : -1;
            const bool peerPnidValid = ( peerPnid >= 0
                                      && peerPnid < GetConfigPNodesMax() );

            if ( !peerPnidValid || peerPnid == MyPNID
              || socks_[peerPnid] == -1
              || (!p[peerPnid].p_sending && !p[peerPnid].p_receiving) )
            {
                char buf[MON_STRING_BUF_SIZE];
                snprintf( buf, sizeof(buf)
                        , "[%s@%d] Invalid peer %d, "
                          "peer.p_sending=%d, "
                          "peer.p_receiving=%d\n"
                        , method_name, __LINE__
                        , peerPnid
                        , !peerPnidValid?-1:p[peerPnid].p_sending
                        , !peerPnidValid?-1:p[peerPnid].p_receiving );
                mon_log_write( MON_CLUSTER_ALLGATHERSOCK_4, SQ_LOG_CRIT, buf );
                MPI_Abort( MPI_COMM_SELF,99 );
            }

            peer_t *peer = &p[indexToPnid_[iPeer]];
            if ( (events[iEvent].events & EPOLLERR) ||
                 (events[iEvent].events & EPOLLHUP) ||
                 ( !(events[iEvent].events & (EPOLLIN|EPOLLOUT))) )
            {
                // An error has occurred on this fd, or the socket is not
                // ready for reading nor writing
                char buf[MON_STRING_BUF_SIZE];
                snprintf( buf, sizeof(buf)
                        , "[%s@%d] Error: peer=%d, events[%d].data.fd=%d, event[%d]=%s\n"
                        , method_name, __LINE__
                        , indexToPnid_[iPeer]
                        , iEvent
                        , events[iEvent].data.fd
                        , iEvent
                        , EpollEventString(events[iEvent].events) );
                mon_log_write( MON_CLUSTER_ALLGATHERSOCK_5, SQ_LOG_CRIT, buf );
                stats[indexToPnid_[iPeer]].MPI_ERROR = MPI_ERR_EXITED;
                err = MPI_ERR_IN_STATUS;
                if ( peer->p_sending )
                {
                    peer->p_sending = false;
                    nsent++;
                }
                if ( peer->p_receiving )
                {
                    peer->p_receiving = false;
                    nrecv++;
                }
                stateChange = true;
                goto early_exit;
            }

            if ( peer->p_receiving && events[iEvent].events & EPOLLIN )
            { // Got receive (read) completion
                // eagain_ok is set once the header is complete and the
                // payload read is attempted right away; that read may
                // legitimately find no data yet.
                int eagain_ok = 0;
read_again:
                char *r = &peer->p_buff[peer->p_received];
                int n2get;
                if ( peer->p_received >= hdrSize )
                {   // Header done: read what remains of the payload
                    n2get = peer->p_n2recv;
                }
                else
                {   // Still collecting the header
                    n2get = hdrSize - peer->p_received;
                }

                int nr;
                while ( 1 )
                {
                    if (trace_settings & TRACE_SYNC_DETAIL)
                    {
                        trace_printf( "%s@%d - EPOLLIN from %s(%d),"
                                      " sending=%d,"
                                      " receiving=%d (%d)"
                                      " sent=%d,"
                                      " received=%d"
                                      " timeout_count=%d,"
                                      " initial_check=%d,"
                                      " n2recv=%d\n"
                                    , method_name, __LINE__
                                    , Node[indexToPnid_[iPeer]]->GetName(), indexToPnid_[iPeer]
                                    , peer->p_sending
                                    , peer->p_receiving, n2get
                                    , peer->p_sent
                                    , peer->p_received
                                    , peer->p_timeout_count
                                    , peer->p_initial_check
                                    , peer->p_n2recv );
                    }
                    nr = recv( fd, r, n2get, 0 );
                    if ( nr > 0 ) Meas.addSockAllGatherRcvdBytes( nr );
                    // Retry only when interrupted by a signal. The previous
                    // test (errno == EINTR) was inverted: it treated EINTR
                    // as fatal and spun on every other error, EAGAIN
                    // included, so the EAGAIN handling below never ran.
                    if ( nr >= 0 || errno != EINTR ) break;
                }

                if ( nr < 0 )
                {
                    if ( nr < 0 && eagain_ok && errno == EAGAIN )
                    {
                        // do nothing
                    }
                    else
                    {
                        // error, down socket
                        // Use a distinct name for the errno copy so that
                        // the assignment below reaches the function-level
                        // err that is returned to the caller.
                        int sockErrno = errno;
                        char buf[MON_STRING_BUF_SIZE];
                        snprintf( buf, sizeof(buf)
                                , "[%s@%d] recv[%d](%d) error %d (%s)\n"
                                , method_name, __LINE__
                                , indexToPnid_[iPeer], nr , sockErrno, strerror(sockErrno) );
                        mon_log_write( MON_CLUSTER_ALLGATHERSOCK_6, SQ_LOG_CRIT, buf );
                        peer->p_receiving = false;
                        nrecv++;
                        if ( peer->p_sending )
                        {
                            peer->p_sending = false;
                            nsent++;
                        }
                        stats[indexToPnid_[iPeer]].MPI_ERROR = MPI_ERR_EXITED;
                        err = MPI_ERR_IN_STATUS;
                        stateChange = true;
                    }
                }
                else
                {
                    peer->p_received += nr;
                    if ( peer->p_received < hdrSize )
                    {
                        // do nothing
                    }
                    else
                    {
                        if ( peer->p_received == hdrSize )
                        {
                            // got the complete header, get buffer size
                            struct sync_buffer_def *sb;
                            sb = (struct sync_buffer_def *)peer->p_buff;
                            peer->p_n2recv = sb->msgInfo.msg_offset;
                            if ( peer->p_n2recv )
                            {
                                eagain_ok = 1;
                                goto read_again;
                            }
                        }
                        else
                        {
                            // reading buffer, update counters
                            peer->p_n2recv -= nr;
                        }

                        if ( peer->p_n2recv < 0 )
                        {
                            char buf[MON_STRING_BUF_SIZE];
                            snprintf( buf, sizeof(buf),
                                      "[%s@%d] error n2recv %d\n",
                                      method_name, __LINE__, peer->p_n2recv );
                            mon_log_write( MON_CLUSTER_ALLGATHERSOCK_7, SQ_LOG_CRIT, buf );
                            MPI_Abort( MPI_COMM_SELF,99 );
                        }

                        if ( peer->p_n2recv == 0 )
                        {
                            // this buffer is done
                            peer->p_receiving = false;
                            nrecv++;
                            stats[indexToPnid_[iPeer]].count = peer->p_received;
                            if (trace_settings & TRACE_SYNC_DETAIL)
                            {
                                trace_printf( "%s@%d - EPOLLOUT to %s(%d),"
                                              " sending=%d,"
                                              " receiving=%d (%d)"
                                              " sent=%d,"
                                              " received=%d"
                                              " timeout_count=%d,"
                                              " initial_check=%d,"
                                              " n2recv=%d\n"
                                            , method_name, __LINE__
                                            , Node[indexToPnid_[iPeer]]->GetName(), indexToPnid_[iPeer]
                                            , peer->p_sending
                                            , peer->p_receiving, n2get
                                            , peer->p_sent
                                            , peer->p_received
                                            , peer->p_timeout_count
                                            , peer->p_initial_check
                                            , peer->p_n2recv );
                            }
                            stateChange = true;
                        }
                    }
                }
            }

            if ( peer->p_sending && events[iEvent].events & EPOLLOUT )
            { // Got send (write) completion
                char *s = &((char *)sbuf)[peer->p_sent];
                int n2send = nbytes - peer->p_sent;
                int ns;
                while ( 1 )
                {
                    if (trace_settings & TRACE_SYNC_DETAIL)
                    {
                        trace_printf( "%s@%d - EPOLLOUT to %s(%d),"
                                      " sending=%d (%d),"
                                      " receiving=%d"
                                      " sent=%d,"
                                      " received=%d"
                                      " timeout_count=%d,"
                                      " initial_check=%d,"
                                      " n2recv=%d\n"
                                    , method_name, __LINE__
                                    , Node[indexToPnid_[iPeer]]->GetName(), indexToPnid_[iPeer]
                                    , peer->p_sending, n2send
                                    , peer->p_receiving
                                    , peer->p_sent
                                    , peer->p_received
                                    , peer->p_timeout_count
                                    , peer->p_initial_check
                                    , peer->p_n2recv );
                    }
                    ns = send( fd, s, n2send, 0 );
                    if ( ns > 0 ) Meas.addSockAllGatherSentBytes( ns );
                    // Retry only when interrupted by a signal
                    if ( ns >= 0 || errno != EINTR ) break;
                }

                if ( ns < 0 )
                {
                    // error, down socket
                    // Distinct name for the errno copy; see the receive
                    // error path above.
                    int sockErrno = errno;
                    char buf[MON_STRING_BUF_SIZE];
                    snprintf( buf, sizeof(buf)
                            , "[%s@%d] send[%d](%d) error=%d (%s)\n"
                            , method_name, __LINE__
                            , indexToPnid_[iPeer], ns, sockErrno, strerror(sockErrno) );
                    mon_log_write( MON_CLUSTER_ALLGATHERSOCK_8, SQ_LOG_CRIT, buf );
                    peer->p_sending = false;
                    nsent++;
                    if ( peer->p_receiving )
                    {
                        peer->p_receiving = false;
                        nrecv++;
                    }
                    stats[indexToPnid_[iPeer]].MPI_ERROR = MPI_ERR_EXITED;
                    err = MPI_ERR_IN_STATUS;
                    stateChange = true;
                }
                else
                {
                    peer->p_sent += ns;
                    if ( peer->p_sent == nbytes )
                    {
                        // finished sending to this destination
                        peer->p_sending = false;
                        nsent++;
                        if (trace_settings & TRACE_SYNC_DETAIL)
                        {
                            trace_printf( "%s@%d - EPOLLOUT to %s(%d),"
                                          " sending=%d (%d),"
                                          " receiving=%d"
                                          " sent=%d,"
                                          " received=%d"
                                          " timeout_count=%d,"
                                          " initial_check=%d,"
                                          " n2recv=%d\n"
                                        , method_name, __LINE__
                                        , Node[indexToPnid_[iPeer]]->GetName(), indexToPnid_[iPeer]
                                        , peer->p_sending, n2send
                                        , peer->p_receiving
                                        , peer->p_sent
                                        , peer->p_received
                                        , peer->p_timeout_count
                                        , peer->p_initial_check
                                        , peer->p_n2recv );
                        }
                        stateChange = true;
                    }
                }
            }

early_exit:
            if ( stateChange )
            {
                // Re-arm the fd for whatever IO is still pending, or drop
                // it from the epoll set when both directions are finished.
                struct epoll_event event;
                event.data.fd = socks_[indexToPnid_[iPeer]];
                int op = 0;
                if ( !peer->p_sending && !peer->p_receiving )
                {
                    op = EPOLL_CTL_DEL;
                    event.events = 0;
                }
                else if ( peer->p_sending )
                {
                    op = EPOLL_CTL_MOD;
                    event.events = EPOLLOUT | EPOLLET | EPOLLRDHUP;
                }
                else if ( peer->p_receiving )
                {
                    op = EPOLL_CTL_MOD;
                    event.events = EPOLLIN | EPOLLET | EPOLLRDHUP;
                }
                if ( op == EPOLL_CTL_DEL || op == EPOLL_CTL_MOD )
                {
                    EpollCtl( epollFD_, op, fd, &event );
                }
            }
        }
    }

    MonStats->BarrierWaitDecr( );
    inBarrier_ = false;

    barrierCount_++;

    TRACE_EXIT;
    return err;
}
// Walks every other configured physical node and either re-establishes the
// sync socket to it or tears the existing socket down.
//
// Peers are visited in ascending pnid order.  A peer with a pnid lower than
// MyPNID is handled with AcceptSockPeer(), a peer with a higher pnid with
// ConnectSockPeer().  A peer that is not in State_Up, or that does not answer
// PingSockPeer(), has its socket removed from the epoll set, its slot in
// socks_ reset to -1, its bit cleared in upNodes_ and currentNodes_
// decremented.
//
// Parameters:
//   stats                  - status array indexed by pnid; the entry of a
//                            peer considered gone gets
//                            MPI_ERROR=MPI_ERR_EXITED and count=0
//   reestablishConnections - forwarded unchanged to ConnectSockPeer() and
//                            AcceptSockPeer()
//
// Returns MPI_SUCCESS when nothing went wrong, otherwise the error recorded
// for the last peer that had one: MPI_ERR_IN_STATUS when the peer was
// flagged in stats, MPI_ERR_AMODE when its socket could not be switched to
// non-blocking mode.
int CCluster::AllgatherSockReconnect( MPI_Status *stats, bool reestablishConnections )
{
    const char method_name[] = "CCluster::AllgatherSockReconnect";
    TRACE_ENTRY;

    int err = MPI_SUCCESS;
    const int pnodesMax = GetConfigPNodesMax();

    // A local pnid outside of the configured range has no peer to pair with
    const bool havePeers = ( MyPNID >= 0 && MyPNID < pnodesMax );

    for ( int idst = 0; havePeers && idst < pnodesMax; idst++ )
    {
        if ( idst == MyPNID ) continue;

        CNode *node = Nodes->GetNode( idst );
        if (!node) continue;

        const bool nodeUp = ( node->GetState() == State_Up );
        bool reachable = false;
        bool markExited = false;

        if (nodeUp)
        {
            if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
            {
                trace_printf( "%s@%d - Pinging Node %s (%d) to see if it's up\n"
                            , method_name, __LINE__
                            , node->GetName(), node->GetPNid() );
            }
            reachable = PingSockPeer(node);
        }

        if (!reachable)
        {
            const bool hadSocket = ( socks_[idst] != -1 );
            if (hadSocket)
            {   // Peer socket is still active
                if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
                {
                    trace_printf( "%s@%d - Node %s (%d) is not up, "
                                  "removing old socket from epoll set, "
                                  "socks_[%d]=%d\n"
                                , method_name, __LINE__
                                , node->GetName(), node->GetPNid()
                                , idst, socks_[idst] );
                }

                --currentNodes_;
                // Clear bit in set of "up nodes"
                upNodes_.upNodes[idst/MAX_NODE_BITMASK] &= ~(1ull << (idst%MAX_NODE_BITMASK));

                // Remove old socket from epoll set, it may not be there
                struct epoll_event event;
                event.data.fd = socks_[idst];
                event.events = 0;
                EpollCtlDelete( epollFD_, socks_[idst], &event );
                socks_[idst] = -1;
            }

            // A node that is not up and has no socket was already accounted
            // for, so its status entry is left untouched.  A node that is
            // up but failed the ping is always reported.
            markExited = ( nodeUp || hadSocket );
        }
        else
        {
            // The lower pnid of a pair accepts, the higher pnid connects
            const int rc = ( idst > MyPNID )
                         ? ConnectSockPeer( node, idst, reestablishConnections )
                         : AcceptSockPeer( node, idst, reestablishConnections );
            if ( rc == -1 )
            {
                markExited = true;
            }
            else if ( socks_[idst] != -1
                   && fcntl( socks_[idst], F_SETFL, O_NONBLOCK ) )
            {
                err = MPI_ERR_AMODE;
                char ebuff[256];
                char buf[MON_STRING_BUF_SIZE];
                snprintf( buf, sizeof(buf), "[%s@%d] fcntl(socks_[%d]=%d,F_SETFL,NONBLOCK) error: %s\n",
                          method_name, __LINE__,idst, socks_[idst], strerror_r( errno, ebuff, 256 ) );
                mon_log_write( MON_CLUSTER_ALLGATHERSOCKRECONN_1, SQ_LOG_CRIT, buf );
            }
        }

        if (markExited)
        {
            stats[idst].MPI_ERROR = MPI_ERR_EXITED;
            stats[idst].count = 0;
            err = MPI_ERR_IN_STATUS;
            if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
            {
                trace_printf( "%s@%d - Setting Node %s (%d) status to "
                              "stats[%d].MPI_ERROR=%s\n"
                            , method_name, __LINE__
                            , node->GetName(), node->GetPNid()
                            , idst
                            , ErrorMsg(stats[idst].MPI_ERROR) );
            }
        }
    }

    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
    {
        // Dump the resulting socket and status of every configured node
        for ( int i = 0; i < GetConfigPNodesCount(); i++ )
        {
            trace_printf( "%s@%d" " - socks_[%d]=%d, "
                          "stats[%d].MPI_ERROR=%s\n"
                        , method_name, __LINE__
                        , indexToPnid_[i]
                        , socks_[indexToPnid_[i]]
                        , indexToPnid_[i]
                        , ErrorMsg(stats[indexToPnid_[i]].MPI_ERROR) );
        }
        trace_printf( "%s@%d - Returning err=%d\n"
                    , method_name, __LINE__, err );
    }

    TRACE_EXIT;
    return( err );
}
// Accepts a connection from a peer monitor on the local sync socket.
//
// Parameters:
//   node                   - peer node, used for tracing and state checks
//   peer                   - pnid of the peer, index into socks_
//   reestablishConnections - when true the accepted socket replaces
//                            socks_[peer]; when false the accepted socket
//                            is closed again right away
//
// Returns MPI_SUCCESS when AcceptSock() delivered a socket, -1 otherwise.
// Note that the return value is a status, not the socket descriptor.
// The process is aborted when the local node name cannot be resolved.
int CCluster::AcceptSockPeer( CNode *node, int peer, bool reestablishConnections )
{
    const char method_name[] = "CCluster::AcceptSockPeer";
    TRACE_ENTRY;

    int rc = MPI_SUCCESS;
    int reconnectSock = -1;

    // Get my host structure via my node name
    struct hostent *he = gethostbyname( MyNode->GetName() );
    if ( !he )
    {
        // gethostbyname() reports failures through h_errno, whose values
        // are decoded by hstrerror(); strerror_r() would translate them as
        // if they were errno values and log an unrelated message.
        char buf[MON_STRING_BUF_SIZE];
        snprintf( buf, sizeof(buf)
                , "[%s@%d] gethostbyname(%s) error: %s\n"
                , method_name, __LINE__
                , MyNode->GetName()
                , hstrerror( h_errno ) );
        mon_log_write( MON_CLUSTER_ACCEPTSOCKPEER_1, SQ_LOG_CRIT, buf );
        abort();
    }

    if (trace_settings & TRACE_RECOVERY)
    {
        trace_printf( "%s@%d Accepting server socket: from %s(%d), port=%d\n"
                    , method_name, __LINE__
                    , node->GetName(), node->GetPNid()
                    , MyNode->GetSyncSocketPort() );
    }

    // Accept connection from peer
    reconnectSock = AcceptSock( syncSock_ );
    if (reconnectSock != -1)
    {
        if (trace_settings & TRACE_RECOVERY)
        {
            trace_printf( "%s@%d Server %s(%d) accepted from client %s(%d), old socks_[%d]=%d, new socks_[%d]=%d\n"
                        , method_name, __LINE__
                        , MyNode->GetName(), MyPNID
                        , node->GetName(), node->GetPNid()
                        , peer, socks_[peer]
                        , peer, reconnectSock);
        }
    }
    else
    {
        char buf[MON_STRING_BUF_SIZE];
        snprintf( buf, sizeof(buf), "[%s@%d] AcceptSock(%d) failed!\n",
                  method_name, __LINE__, syncSock_ );
        mon_log_write( MON_CLUSTER_ACCEPTSOCKPEER_2, SQ_LOG_ERR, buf );
        rc = -1;
    }

    if (reestablishConnections)
    {
        if (socks_[peer] != -1)
        {
            // Remove old socket from epoll set, it may not be there
            // NOTE(review): the old descriptor is not closed here;
            // presumably it is closed elsewhere - confirm.
            struct epoll_event event;
            event.data.fd = socks_[peer];
            event.events = 0;
            EpollCtlDelete( epollFD_, socks_[peer], &event );
            if (node->GetState() != State_Up)
            {
                if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
                {
                    trace_printf( "%s@%d - Node %s (%d) is not up, "
                                  "removing old socket from epoll set, "
                                  "socks_[%d]=%d\n"
                                , method_name, __LINE__
                                , node->GetName(), node->GetPNid()
                                , peer, socks_[peer] );
                }
                socks_[peer] = -1;
            }
        }
        if (reconnectSock != -1)
        {
            socks_[peer] = reconnectSock; // AcceptSockPeer
        }
    }
    else if (reconnectSock != -1)
    {
        // Caller did not ask for the connection to be kept
        close( reconnectSock );
    }

    TRACE_EXIT;
    return rc;
}
// Opens a client connection from the local node to a peer monitor.
//
// Parameters:
//   node                   - peer node; its name is resolved to obtain the
//                            destination address
//   peer                   - pnid of the peer, index into socks_ and
//                            sockPorts_
//   reestablishConnections - when true the new socket replaces
//                            socks_[peer]; when false the new socket is
//                            closed again right away
//
// Returns MPI_SUCCESS when MkCltSock() delivered a socket, -1 otherwise.
// Note that the return value is a status, not the socket descriptor.
// The process is aborted when either node name cannot be resolved.
int CCluster::ConnectSockPeer( CNode *node, int peer, bool reestablishConnections )
{
    const char method_name[] = "CCluster::ConnectSockPeer";
    TRACE_ENTRY;

    int rc = MPI_SUCCESS;
    int reconnectSock = -1;
    unsigned char srcaddr[4], dstaddr[4];

    // Get my host structure via my node name
    struct hostent *he = gethostbyname( MyNode->GetName() );
    if ( !he )
    {
        // gethostbyname() reports failures through h_errno, whose values
        // are decoded by hstrerror(); strerror_r() would translate them as
        // if they were errno values and log an unrelated message.
        char buf[MON_STRING_BUF_SIZE];
        snprintf( buf, sizeof(buf)
                , "[%s@%d] gethostbyname(%s) error: %s\n"
                , method_name, __LINE__
                , MyNode->GetName()
                , hstrerror( h_errno ) );
        mon_log_write( MON_CLUSTER_CONNECTSOCKPEER_1, SQ_LOG_CRIT, buf );
        abort();
    }

    // Initialize my source address structure.  This must be copied out
    // before the next gethostbyname() call, which may reuse the result.
    memcpy( srcaddr, he->h_addr, 4 );

    // Get peer's host structure via its node name
    he = gethostbyname( node->GetName() );
    if ( !he )
    {
        char buf[MON_STRING_BUF_SIZE];
        snprintf( buf, sizeof(buf),
                  "[%s@%d] gethostbyname(%s) error: %s\n",
                  method_name, __LINE__, node->GetName(),
                  hstrerror( h_errno ) );
        mon_log_write( MON_CLUSTER_CONNECTSOCKPEER_2, SQ_LOG_CRIT, buf );
        abort();
    }

    // Initialize peer's destination address structure
    memcpy( dstaddr, he->h_addr, 4 );

    if (trace_settings & TRACE_RECOVERY)
    {
        trace_printf( "%s@%d Creating client socket: src=%d.%d.%d.%d, "
                      "dst(%s)=%d.%d.%d.%d, dst port=%d\n"
                    , method_name, __LINE__
                    , (int)srcaddr[0]
                    , (int)srcaddr[1]
                    , (int)srcaddr[2]
                    , (int)srcaddr[3]
                    , node->GetName()
                    , (int)dstaddr[0]
                    , (int)dstaddr[1]
                    , (int)dstaddr[2]
                    , (int)dstaddr[3]
                    , sockPorts_[peer] );
    }

    // Connect to peer
    reconnectSock = MkCltSock( srcaddr, dstaddr, sockPorts_[peer] );
    if (reconnectSock != -1)
    {
        if (trace_settings & TRACE_RECOVERY)
        {
            trace_printf( "%s@%d Client %s(%d) connected to server %s(%d), old socks_[%d]=%d, new socks_[%d]=%d\n"
                        , method_name, __LINE__
                        , MyNode->GetName(), MyPNID
                        , node->GetName(), node->GetPNid()
                        , peer, socks_[peer]
                        , peer, reconnectSock);
        }
    }
    else
    {
        char buf[MON_STRING_BUF_SIZE];
        snprintf( buf, sizeof(buf)
                , "[%s@%d] MkCltSock() src=%d.%d.%d.%d, "
                  "dst(%s)=%d.%d.%d.%d failed!\n"
                , method_name, __LINE__
                , (int)srcaddr[0]
                , (int)srcaddr[1]
                , (int)srcaddr[2]
                , (int)srcaddr[3]
                , node->GetName()
                , (int)dstaddr[0]
                , (int)dstaddr[1]
                , (int)dstaddr[2]
                , (int)dstaddr[3] );
        mon_log_write( MON_CLUSTER_CONNECTSOCKPEER_3, SQ_LOG_ERR, buf );
        rc = -1;
    }

    if (reestablishConnections)
    {
        if (socks_[peer] != -1)
        {
            // Remove old socket from epoll set, it may not be there
            // NOTE(review): the old descriptor is not closed here;
            // presumably it is closed elsewhere - confirm.
            struct epoll_event event;
            event.data.fd = socks_[peer];
            event.events = 0;
            EpollCtlDelete( epollFD_, socks_[peer], &event );
            if (node->GetState() != State_Up)
            {
                if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
                {
                    trace_printf( "%s@%d - Node %s (%d) is not up, "
                                  "removing old socket from epoll set, "
                                  "socks_[%d]=%d\n"
                                , method_name, __LINE__
                                , node->GetName(), node->GetPNid()
                                , peer, socks_[peer] );
                }
                socks_[peer] = -1;
            }
        }
        if (reconnectSock != -1)
        {
            socks_[peer] = reconnectSock; // ConnectSockPeer
        }
    }
    else if (reconnectSock != -1)
    {
        // Caller did not ask for the connection to be kept
        close( reconnectSock );
    }

    TRACE_EXIT;
    return( rc );
}
// When we get a communication error for a point-to-point monitor communicator
// verify that the other nodes in the cluster also lost communications
// with that monitor.  If all nodes concur we consider that monitor
// down.
//
// The work is done in two steps:
//   1. Every entry of exitedMons_ that is at least two sync cycles old
//      (seqNum_ >= entry seqNum + 2) is evaluated against the "up nodes"
//      masks reported in nodestate and then removed from the list.
//        - all currentNodes_ concur: HandleDownNode() for the exited pnid
//          (unless it is already in State_Down)
//        - only some concur: this monitor enqueues a down request for
//          itself (at most once, guarded by enqueuedDown_)
//   2. When haveDivergence is set, the local "up nodes" set is logged and
//      the reported masks are scanned for disagreements, which are queued
//      on exitedMons_ for evaluation in a later cycle.
//
// Parameters:
//   nodestate      - per-pnid state gathered in the current sync cycle;
//                    an entry with seq_num == 0 carries no valid info
//   haveDivergence - true when some node reported an "up nodes" set that
//                    differs from the local one
void CCluster::ValidateClusterState( cluster_state_def_t nodestate[],
                                     bool haveDivergence)
{
    const char method_name[] = "CCluster::ValidateClusterState";

    exitedMons_t::iterator it;
    upNodes_t nodeMask;

    for ( int i =0; i < MAX_NODE_MASKS ; i++ )
    {
        nodeMask.upNodes[i] = 0;
    }

    for ( it = exitedMons_.begin(); it != exitedMons_.end(); )
    {
        if (trace_settings & (TRACE_SYNC | TRACE_RECOVERY | TRACE_INIT))
        {
            trace_printf("%s@%d checking exited pnid=%d, detecting pnid=%d, seqNum=%lld"
                         " (current seqNum_=%lld)\n", method_name, __LINE__,
                         it->exitedPnid, it->detectingPnid, it->seqNum, seqNum_);
        }

        if ( seqNum_ >= (it->seqNum + 2) )
        {
            char buf[MON_STRING_BUF_SIZE];
            snprintf( buf, sizeof(buf), "[%s] Validating exited node %d, "
                      "detected by node %d at seq #%lld "
                      "(current seq # is %lld).\n",
                      method_name, it->exitedPnid, it->detectingPnid,
                      it->seqNum, seqNum_);
            mon_log_write(MON_CLUSTER_VALIDATE_STATE_1, SQ_LOG_ERR, buf);

            int concurringNodes = 0;

            // Check if all active nodes see the node as down.
            // Only the mask word holding the exited pnid's bit is used below.
            const int exitedWord = it->exitedPnid/MAX_NODE_BITMASK;
            nodeMask.upNodes[exitedWord] = 1ull << (it->exitedPnid%MAX_NODE_BITMASK);

            string setSeesUp;
            string setSeesDown;
            char nodeX[10];

            // Evaluate each active (up) node in the cluster; nodestate is
            // indexed directly by pnid.
            int pnodesCount = 0;
            for (int index = 0;
                 index < GetConfigPNodesMax() && pnodesCount < currentNodes_;
                 ++index)
            {
                if ( nodestate[index].seq_num != 0 )
                {   // There is valid nodestate info from node "index"
                    pnodesCount++;

                    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
                    {
                        // Both values printed come from the mask word of
                        // the exited pnid, so both indices show that word.
                        trace_printf( "%s@%d down pnid= %d: nodestate[%d].nodeMask.upNodes[%d]=%llx, downNodeMask.upNodes[%d]=%llx\n"
                                    , method_name, __LINE__
                                    , it->exitedPnid
                                    , index, exitedWord
                                    , nodestate[index].nodeMask.upNodes[exitedWord]
                                    , exitedWord
                                    , nodeMask.upNodes[exitedWord] );
                    }

                    if ((nodestate[index].nodeMask.upNodes[exitedWord] &
                         nodeMask.upNodes[exitedWord]) == 0)
                    {   // Node "index" sees the node as down
                        // temp trace
                        if (trace_settings & (TRACE_SYNC | TRACE_RECOVERY | TRACE_INIT))
                        {
                            trace_printf("%s@%d node %d concurs that node %d "
                                         "is down\n", method_name, __LINE__,
                                         index, it->exitedPnid);
                        }

                        snprintf(nodeX, sizeof(nodeX), "%d, ", index);
                        setSeesDown.append(nodeX);

                        ++concurringNodes;
                    }
                    else
                    {
                        // temp trace
                        if (trace_settings & (TRACE_SYNC | TRACE_RECOVERY | TRACE_INIT))
                        {
                            trace_printf("%s@%d node %d says node %d is up\n",
                                         method_name, __LINE__, index,
                                         it->exitedPnid);
                        }

                        snprintf(nodeX, sizeof(nodeX), "%d, ", index);
                        setSeesUp.append(nodeX);
                    }
                }
                else
                {
                    // temp trace
                    if (trace_settings & (TRACE_SYNC | TRACE_RECOVERY | TRACE_INIT))
                    {
                        trace_printf("%s@%d ignoring state from node %d\n",
                                     method_name, __LINE__, index);
                    }
                }
            }

            if (trace_settings & (TRACE_SYNC | TRACE_RECOVERY | TRACE_INIT))
            {
                trace_printf("%s@%d concurringNodes=%d, currentNodes_=%d\n",
                             method_name, __LINE__, concurringNodes, currentNodes_);
            }

            if (concurringNodes == currentNodes_)
            {   // General agreement that node is down, proceed to mark it down
                CNode *downNode = Nodes->GetNode( it->exitedPnid );
                if (downNode && downNode->GetState() != State_Down)
                {
                    // temp trace
                    if (trace_settings & (TRACE_SYNC | TRACE_RECOVERY | TRACE_INIT))
                    {
                        trace_printf("%s@%d proceeding to mark node %d down\n",
                                     method_name, __LINE__, it->exitedPnid);
                    }

                    mem_log_write(CMonLog::MON_UPDATE_CLUSTER_3, it->exitedPnid);

                    HandleDownNode(it->exitedPnid);
                }
                else
                {
                    if (trace_settings & (TRACE_SYNC | TRACE_RECOVERY | TRACE_INIT))
                    {
                        trace_printf("%s@%d Node pnid=%d is already down\n"
                                    , method_name, __LINE__
                                    , it->exitedPnid);
                    }
                }
            }
            else if ( concurringNodes != 0 && !enqueuedDown_ )
            {   // Some monitors say the node is down, others don't.
                // This is not supposed to happen.  Enqueue request to
                // bring this node down.  All monitors will do the same
                // so the cluster will be brought down.

                // Strip the trailing ", " separator from both lists
                if (setSeesUp.length() > 2)
                    setSeesUp.erase(setSeesUp.length()-2, 2);
                if (setSeesDown.length() > 2)
                    setSeesDown.erase(setSeesDown.length()-2, 2);

                char buf[MON_STRING_BUF_SIZE*2];
                snprintf( buf, sizeof(buf), "[%s] Lost connection to node "
                          "%d but only %d of %d nodes also lost the "
                          "connection.  See up: %s.  See down: %s.  So node "
                          "%d is going down (at seq #%lld).\n", method_name,
                          it->exitedPnid, concurringNodes, currentNodes_,
                          setSeesUp.c_str(), setSeesDown.c_str(),
                          MyPNID, seqNum_ );
                mon_log_write(MON_CLUSTER_VALIDATE_STATE_2, SQ_LOG_ERR, buf);

                mem_log_write(CMonLog::MON_UPDATE_CLUSTER_4, MyPNID,
                              it->exitedPnid);

                enqueuedDown_ = true;
                ReqQueue.enqueueDownReq(MyPNID);
            }

            if (trace_settings & (TRACE_SYNC | TRACE_RECOVERY | TRACE_INIT))
            {
                trace_printf("%s@%d removing exited pnid=%d, detecting pnid=%d, seqNum=%lld"
                             " (current seqNum_=%lld)\n", method_name, __LINE__,
                             it->exitedPnid, it->detectingPnid, it->seqNum, seqNum_);
            }

            // Delete current list element and advance to next one
            it = exitedMons_.erase( it );
        }
        else
        {   // Advance to next list element
            ++it;
        }
    }

    if ( haveDivergence )
    {
        for ( int i =0; i < MAX_NODE_MASKS ; i++ )
        {
            char buf[MON_STRING_BUF_SIZE];
            snprintf( buf, sizeof(buf)
                    , "[%s] Cluster view divergence (at seq #%lld), "
                      "node %d sees set[%d]: %llx\n"
                    , method_name, seqNum_, MyPNID, i
                    , upNodes_.upNodes[i] );
            mon_log_write(MON_CLUSTER_VALIDATE_STATE_3, SQ_LOG_ERR, buf);
        }

        // For each "up node" (from local perspective)
        // go through nodestate for each other node.  If any node
        // says the node is down, add an item to the exitedMons_ list
        // for examination during the next sync cycle (by which time
        // all nodes will have had a chance to detect the down monitor.)
        int pnodesCount2 = 0;
        for (int remIndex = 0;
             remIndex < GetConfigPNodesMax() && pnodesCount2 < currentNodes_;
             ++remIndex)
        {
            bool someExited = false;

            // No need to check local monitor's view of the cluster since
            // any down connections are handled directly when detected.
            if (remIndex == MyPNID)
            {
                pnodesCount2++;
                continue;
            }

            // No need to check a remote monitor's view when node is down
            CNode *remoteNode = Nodes->GetNode( remIndex );
            if ( ! remoteNode )
            {   // node is not member of cluster
                if (trace_settings & (TRACE_SYNC | TRACE_RECOVERY | TRACE_INIT))
                {
                    trace_printf("%s@%d Skipping non-existing node "
                                 "pnid=%d\n",
                                 method_name, __LINE__,
                                 remIndex);
                }
                continue;
            }
            else if (remoteNode->GetState() == State_Down)
            {   // node is down
                if (trace_settings & (TRACE_SYNC | TRACE_RECOVERY | TRACE_INIT))
                {
                    trace_printf("%s@%d Skipping down node "
                                 "pnid=%d (%s)\n",
                                 method_name, __LINE__,
                                 remIndex, remoteNode->GetName());
                }
                continue;
            }
            else
            {
                pnodesCount2++;
            }

            // Check if all active nodes see the node as up.
            const int remWord = remIndex/MAX_NODE_BITMASK;
            nodeMask.upNodes[remWord] = 1ull << (remIndex%MAX_NODE_BITMASK);

            if ( upNodes_.upNodes[remWord] & nodeMask.upNodes[remWord] )
            {   // The local monitor sees node remIndex as up
                // NOTE(review): the test below checks seq_num of
                // nodestate[remIndex] but reads the mask reported by
                // nodestate[exitedPNid], and the original comment read
                // "Node remIndex sees exitedPNid as down".  The intended
                // roles of the two indices should be confirmed; the logic
                // is kept exactly as it was.
                int pnodesCount3 = 0;
                for (int exitedPNid = 0;
                     exitedPNid < GetConfigPNodesMax() && pnodesCount3 < currentNodes_;
                     ++exitedPNid)
                {
                    CNode *exitedNode = Nodes->GetNode( exitedPNid );
                    if ( exitedNode &&
                        (remIndex != exitedPNid) &&
                        (nodestate[remIndex].seq_num != 0) &&
                        (nodestate[exitedPNid].nodeMask.upNodes[remWord] &
                         nodeMask.upNodes[remWord]) == 0 )
                    {   // The mask reported in nodestate[exitedPNid] does
                        // not have the bit of node remIndex set
                        pnodesCount3++;
                        if (trace_settings & (TRACE_SYNC | TRACE_RECOVERY | TRACE_INIT))
                        {
                            trace_printf("%s@%d Divergence, queueing "
                                         "monExited{%d, %d, %lld}\n",
                                         method_name, __LINE__, exitedPNid, remIndex,
                                         seqNum_);
                        }

                        someExited = true;
                        monExited_t monExited = {exitedPNid, remIndex, seqNum_};
                        exitedMons_.push_back( monExited );
                    }
                }
            }

            if (someExited)
            {
                // No need to look further for any other
                // monitor's view of node pnid.  When the
                // exitedMons_ element is processed all nodes
                // will be checked for concurrence.
                break;
            }
        }
    }
}
// Checks that every node in State_Up reported the same sync cycle sequence
// number as the local monitor.
//
// The entry of the local node in nodestate is overwritten with seqNum_.
// Entries with a sequence number of zero are ignored.  The lowest and
// highest sequence numbers seen (seeded with seqNum_) are stored in
// lowSeqNum_ and highSeqNum_.  The per-value tally is only used for the
// trace message.
//
// Returns true when lowSeqNum_ equals highSeqNum_, or right away (without
// touching lowSeqNum_/highSeqNum_) in a single node configuration.
bool CCluster::ValidateSeqNum( cluster_state_def_t nodestate[] )
{
    const char method_name[] = "CCluster::ValidateSeqNum";

    if ( GetConfigPNodesCount() == 1 )
    {
        return true;
    }

    unsigned long long smallest = seqNum_;
    unsigned long long largest  = seqNum_;

    // Tally of distinct sequence numbers and how often each was reported
    unsigned long long distinctValue[MAX_NODES];
    int                distinctHits[MAX_NODES];
    int                distinctCount = 0;

    for (int pnid = 0; pnid < GetConfigPNodesMax(); pnid++)
    {
        CNode *candidate = Nodes->GetNode( pnid );
        if (!candidate || candidate->GetState() != State_Up)
        {
            continue;
        }

        if ( pnid == MyPNID )
        {   // The local entry always carries the local sequence number
            nodestate[pnid].seq_num = seqNum_;
        }
        const unsigned long long reported = nodestate[pnid].seq_num;

        if (trace_settings & TRACE_SYNC)
        {
            trace_printf( "%s@%d seqNum_=%lld, nodestate[%d].seq_num=%lld\n"
                        , method_name, __LINE__
                        , seqNum_
                        , pnid
                        , nodestate[pnid].seq_num );
        }

        if (reported == 0)
        {
            continue;
        }

        if (reported < smallest)
        {
            smallest = reported;
        }
        if (reported > largest)
        {
            largest = reported;
        }

        // Locate the tally slot of this value, if there is one
        int slot = 0;
        while ( slot < distinctCount && distinctValue[slot] != reported )
        {
            ++slot;
        }

        if ( slot < distinctCount )
        {
            ++distinctHits[slot];
        }
        else
        {
            distinctValue[distinctCount] = reported;
            distinctHits[distinctCount] = 1;
            ++distinctCount;
        }
    }

    // Look for majority sequence number; stays at slot 0 when nothing
    // was tallied
    int majoritySlot = 0;
    int majorityHits = 0;
    for (int slot = 0; slot < distinctCount; ++slot)
    {
        if ( distinctHits[slot] > majorityHits )
        {
            majorityHits = distinctHits[slot];
            majoritySlot = slot;
        }
    }

    lowSeqNum_ = smallest;
    highSeqNum_ = largest;

    if ( (trace_settings & TRACE_SYNC) && lowSeqNum_ != highSeqNum_ )
    {
        trace_printf( "%s@%d Most common seq num=%lld (%d nodes), "
                      "%d buckets, low=%lld, high=%lld, local seq num (%lld) did not match.\n"
                    , method_name, __LINE__
                    , distinctValue[majoritySlot]
                    , distinctHits[majoritySlot]
                    , distinctCount
                    , lowSeqNum_
                    , highSeqNum_
                    , seqNum_ );
    }

    // Fail if any sequence number does not match
    return( lowSeqNum_ == highSeqNum_ );
}
void CCluster::HandleDownNode( int pnid )
{
const char method_name[] = "CCluster::HandleDownNode";
TRACE_ENTRY;
// Add to dead node name list
CNode *downNode = Nodes->GetNode( pnid );
assert(downNode);
deadNodeList_.push_back( downNode );
if (trace_settings & TRACE_INIT)
trace_printf("%s@%d - Added down node to list, pnid=%d, name=(%s)\n", method_name, __LINE__, downNode->GetPNid(), downNode->GetName());
// assign new leaders if needed
AssignLeaders( pnid, downNode->GetName(), false );
// Build available list of spare nodes
CNode *spareNode;
NodesList *spareNodesList = Nodes->GetSpareNodesList();
NodesList::iterator itSn;
for ( itSn = spareNodesList->begin(); itSn != spareNodesList->end() ; itSn++ )
{
spareNode = *itSn;
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
trace_printf( "%s@%d - %s (pnid=%d) is in available spare node list, state=%s, spare=%d, rank failure=%d\n"
, method_name, __LINE__, spareNode->GetName(), spareNode->GetPNid()
, StateString(spareNode->GetState()), spareNode->IsSpareNode(), spareNode->IsRankFailure());
// if spare node is available
if ( spareNode->IsSpareNode() &&
!spareNode->IsRankFailure() &&
spareNode->GetState() == State_Up )
{
spareNodeVector_.push_back( spareNode );
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
trace_printf("%s@%d - pnid=%d, name=(%s) is available Spare\n", method_name, __LINE__, spareNode->GetPNid(), spareNode->GetName());
}
}
// Activate spare or down node
NodesList::iterator itDn;
for ( itDn = deadNodeList_.begin(); itDn != deadNodeList_.end() ; itDn++ )
{
downNode = *itDn;
if ( Emulate_Down )
{
ReqQueue.enqueueDownReq( downNode->GetPNid() );
}
else
{
bool done = false;
spareNode = NULL;
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
trace_printf( "%s@%d - spare node vector size=%ld\n"
, method_name, __LINE__, spareNodeVector_.size());
// Find available spare node for current down node
for ( unsigned int ii = 0; ii < spareNodeVector_.size() && !done ; ii++ )
{
PNidVector sparePNids = spareNodeVector_[ii]->GetSparePNids();
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
trace_printf( "%s@%d - spare pnids vector size=%ld\n"
, method_name, __LINE__, sparePNids.size());
// Check each pnid it is configured to spare
for ( unsigned int jj = 0; jj < sparePNids.size(); jj++ )
{
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
trace_printf( "%s@%d - %s (pnid=%d) is in spare node vector[%d], size=%ld\n"
, method_name, __LINE__
, spareNodeVector_[ii]->GetName()
, spareNodeVector_[ii]->GetPNid()
, jj, sparePNids.size());
// if this is a spare for the down node
if ( spareNodeVector_[ii]->IsSpareNode() &&
downNode->GetPNid() == sparePNids[jj] )
{
// assign it and remove it from the vector
spareNode = spareNodeVector_[ii];
spareNodeVector_.erase( spareNodeVector_.begin() + ii );
done = true;
break;
}
}
}
if ( spareNode )
{
Nodes->RemoveFromSpareNodesList( spareNode );
downNode->SetState( State_Takeover ); // change state so that pending requests could fail.
spareNode->SetActivatingSpare( true );
if ( spareNode->GetPNid() == MyPNID )
{
ReqQueue.enqueueActivateSpareReq( spareNode, downNode, true );
}
}
else
{
if ( downNode->IsSpareNode() )
{
Nodes->RemoveFromSpareNodesList( downNode );
}
ReqQueue.enqueueDownReq( downNode->GetPNid() );
}
}
}
spareNodeVector_.clear();
deadNodeList_.clear();
TRACE_EXIT;
}
void CCluster::UpdateClusterState( bool &doShutdown,
struct sync_buffer_def * syncBuf,
MPI_Status *status,
int sentChangeNid)
{
const char method_name[] = "CCluster::UpdateClusterState";
TRACE_ENTRY;
struct sync_buffer_def *recvBuf;
#ifndef NAMESERVER_PROCESS
struct sync_buffer_def *sendBuf = Nodes->GetSyncBuffer();
#endif
STATE node_state;
int change_nid;
cluster_state_def_t nodestate[GetConfigPNodesMax()];
bool clusterViewDivergence = false;
// Populate nodestate array using node state info from "allgather"
// along with local node state.
for (int index = 0; index < GetConfigPNodesMax(); index++)
{
// Only process active nodes
bool noComm;
switch( CommType )
{
case CommType_InfiniBand:
noComm = (comms_[index] == MPI_COMM_NULL) ? true : false;
break;
case CommType_Sockets:
noComm = (socks_[index] == -1) ? true : false;
break;
default:
// Programmer bonehead!
abort();
}
if (noComm
|| status[index].MPI_ERROR != MPI_SUCCESS)
{
if (trace_settings & (TRACE_RECOVERY | TRACE_INIT))
{
if (!noComm)
{
trace_printf( "%s@%d - Communication error from node %d, "
" seq_num=#%lld\n"
, method_name, __LINE__, index
, seqNum_ );
}
}
// Not an active node, set default values
nodestate[index].node_state = State_Unknown;
nodestate[index].change_nid = -1;
nodestate[index].seq_num = 0;
for ( int i =0; i < MAX_NODE_MASKS ; i++ )
{
nodestate[index].nodeMask.upNodes[i] = 0;
}
#ifdef NAMESERVER_PROCESS
nodestate[index].monConnCount = -1;
#else
nodestate[index].monProcCount = 0;
#endif
continue;
}
recvBuf = (struct sync_buffer_def *)
(((char *) syncBuf) + index * CommBufSize);
if (trace_settings & TRACE_SYNC)
{
int nr;
MPI_Get_count(&status[index], MPI_CHAR, &nr);
trace_printf("%s@%d - Received %d bytes from node %d, "
", seq_num=%lld, message count=%d\n",
method_name, __LINE__, nr, index,
recvBuf->nodeInfo.seq_num,
recvBuf->msgInfo.msg_count);
}
nodestate[index].node_state = recvBuf->nodeInfo.node_state;
nodestate[index].change_nid = recvBuf->nodeInfo.change_nid;
nodestate[index].seq_num = recvBuf->nodeInfo.seq_num;
nodestate[index].nodeMask = recvBuf->nodeInfo.nodeMask;
#ifdef NAMESERVER_PROCESS
nodestate[index].monConnCount = recvBuf->nodeInfo.monConnCount;
#else
nodestate[index].monProcCount = recvBuf->nodeInfo.monProcCount;
#endif
for ( int i =0; i < MAX_NODE_MASKS ; i++ )
{
if ( nodestate[index].nodeMask.upNodes[i] != upNodes_.upNodes[i] )
{
if (trace_settings & (TRACE_SYNC | TRACE_RECOVERY | TRACE_INIT))
{
for ( int j =0; j < MAX_NODE_MASKS ; j++ )
{
trace_printf( "%s@%d - Divergence (at seq #%lld), node %s "
"(pnid=%d) sees cluster state[%d] %llx, local "
"monitor sees %llx\n"
, method_name, __LINE__
, seqNum_
, Node[index]->GetName()
, index
, j
, nodestate[index].nodeMask.upNodes[j]
, upNodes_.upNodes[j] );
}
}
clusterViewDivergence = true;
}
}
#ifndef NAMESERVER_PROCESS
if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_TMSYNC))
{
trace_printf( "%s@%d - Node %s (pnid=%d) TmSyncState=(%d)(%s)\n"
, method_name, __LINE__
, Node[index]->GetName()
, index
, recvBuf->nodeInfo.tmSyncState
, SyncStateString( recvBuf->nodeInfo.tmSyncState ));
}
#endif
#ifndef NAMESERVER_PROCESS
if ( Node[index]->GetTmSyncState() != recvBuf->nodeInfo.tmSyncState )
{
Node[index]->SetTmSyncState(recvBuf->nodeInfo.tmSyncState);
if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
{
trace_printf("%s@%d - Node %s (pnid=%d) TmSyncState updated"
" (%d)(%s)\n", method_name, __LINE__,
Node[index]->GetName(), index,
recvBuf->nodeInfo.tmSyncState,
SyncStateString( recvBuf->nodeInfo.tmSyncState ));
}
}
#endif
// Check if we need to increase my node's shutdown level ...
// all nodes should be at the highest level selected from any source
if ( MyNode->GetShutdownLevel() < recvBuf->nodeInfo.sdLevel )
{
MyNode->SetShutdownLevel( recvBuf->nodeInfo.sdLevel );
if (MyNode->GetState() == State_Up)
{
MyNode->SetState( State_Shutdown );
}
if (trace_settings & (TRACE_REQUEST | TRACE_SYNC))
trace_printf("%s@%d - Node %s Shutdown Level updated (%d)\n",
method_name, __LINE__,
Node[index]->GetName(), recvBuf->nodeInfo.sdLevel);
}
Node[index]->SetInternalState( recvBuf->nodeInfo.internalState );
if ( recvBuf->nodeInfo.internalState == State_Ready_To_Exit )
{ // The node is exiting. Don't communicate with it any more.
if (trace_settings & (TRACE_REQUEST | TRACE_SYNC))
trace_printf("%s@%d - Node %s (%d) ready to exit, setting comm "
"to null\n", method_name, __LINE__,
Node[index]->GetName(), index);
switch( CommType )
{
case CommType_InfiniBand:
MPI_Comm_free( &comms_[index] );
break;
case CommType_Sockets:
shutdown( socks_[index], SHUT_RDWR );
close( socks_[index] );
socks_[index] = -1;
break;
default:
// Programmer bonehead!
abort();
}
Node[index]->SetState( State_Down );
--currentNodes_;
// Clear bit in set of "up nodes"
upNodes_.upNodes[index/MAX_NODE_BITMASK] &= ~(1ull << (index%MAX_NODE_BITMASK));
}
}
if ( (checkSeqNum_ || reconnectSeqNum_ != 0)
&& !ValidateSeqNum( nodestate )
&& !enqueuedDown_ )
{
if ( reconnectSeqNum_ == 0 && MyNode->GetState() == State_Up )
{
char buf[MON_STRING_BUF_SIZE];
snprintf(buf, sizeof(buf), "[%s] Sync cycle sequence number (%lld) "
"incorrect. Aborting!\n", method_name, seqNum_);
mon_log_write(MON_CLUSTER_UPDTCLUSTERSTATE_1, SQ_LOG_CRIT, buf);
mem_log_write(CMonLog::MON_UPDATE_CLUSTER_2, MyPNID);
abort();
}
}
nodestate[MyPNID].node_state = Node[MyPNID]->GetState();
nodestate[MyPNID].change_nid = sentChangeNid;
nodestate[MyPNID].seq_num = seqNum_;
nodestate[MyPNID].nodeMask = upNodes_;
#ifdef NAMESERVER_PROCESS
nodestate[MyPNID].monConnCount = Node[MyPNID]->GetMonConnCount();
#else
nodestate[MyPNID].monProcCount = Node[MyPNID]->GetNumProcs();
#endif
// Examine status returned from MPI receive requests
for (int index = 0; index < GetConfigPNodesMax(); index++)
{
bool noComm;
switch( CommType )
{
case CommType_InfiniBand:
noComm = (comms_[index] == MPI_COMM_NULL) ? true : false;
break;
case CommType_Sockets:
noComm = (socks_[index] == -1) ? true : false;
break;
default:
// Programmer bonehead!
abort();
}
if (noComm) continue;
if (status[index].MPI_ERROR != MPI_SUCCESS)
{
char buf[MON_STRING_BUF_SIZE];
snprintf(buf, sizeof(buf), "[%s] MPI communications error=%d "
"(%s) for node %d (at seq #%lld).\n", method_name,
status[index].MPI_ERROR, ErrorMsg(status[index].MPI_ERROR),
index, seqNum_);
mon_log_write(MON_CLUSTER_UPDTCLUSTERSTATE_2, SQ_LOG_ERR, buf);
if ( status[index].MPI_ERROR == MPI_ERR_EXITED )
{ // A monitor has gone away
mem_log_write(CMonLog::MON_UPDATE_CLUSTER_1, index);
switch( CommType )
{
case CommType_InfiniBand:
MPI_Comm_free( &comms_[index] );
break;
case CommType_Sockets:
shutdown( socks_[index], SHUT_RDWR );
close( socks_[index] );
socks_[index] = -1;
break;
default:
// Programmer bonehead!
abort();
}
--currentNodes_;
// Clear bit in set of "up nodes"
upNodes_.upNodes[index/MAX_NODE_BITMASK] &= ~(1ull << (index%MAX_NODE_BITMASK));
// Pretend node is still up until down node processing
// completes.
nodestate[index].node_state = State_Unknown;
nodestate[index].change_nid = -1;
nodestate[index].seq_num = 0;
for ( int i =0; i < MAX_NODE_MASKS ; i++ )
{
nodestate[index].nodeMask.upNodes[i] = 0;
}
#ifdef NAMESERVER_PROCESS
nodestate[index].monConnCount = -1;
#else
nodestate[index].monProcCount = 0;
#endif
if ( validateNodeDown_ )
{
if (trace_settings & (TRACE_SYNC | TRACE_RECOVERY | TRACE_INIT))
{
trace_printf( "%s@%d Divergence, queueing "
"monExited{%d, %d, %lld}\n"
, method_name, __LINE__
, index, MyPNID, seqNum_ );
}
// Save info for the exited monitor so can confirm
// that all monitors have the same view.
monExited_t monExited = {index, MyPNID, seqNum_};
exitedMons_.push_back( monExited );
}
else
{
HandleDownNode(index);
}
}
}
}
if ( validateNodeDown_ )
ValidateClusterState( nodestate, clusterViewDivergence );
#ifndef NAMESERVER_PROCESS
if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_TMSYNC))
{
trace_printf( "%s@%d - Node %s (pnid=%d) TmSyncState=(%d)(%s)\n"
, method_name, __LINE__
, MyNode->GetName()
, MyPNID
, sendBuf->nodeInfo.tmSyncState
, SyncStateString( sendBuf->nodeInfo.tmSyncState ));
}
#endif
// Update our node states
for (int index = 0; index < GetConfigPNodesMax(); index++)
{
node_state = (STATE)nodestate[index].node_state;
change_nid = nodestate[index].change_nid;
if ( index == MyPNID &&
MyNode->GetState() == State_Merged && seqNum_ == 1)
{ // Initial "allgather" for this re-integrated monitor.
seqNum_ = EnsureAndGetSeqNum(nodestate);
if (trace_settings & (TRACE_SYNC | TRACE_RECOVERY | TRACE_INIT))
{
trace_printf("%s@%d Completed initial allgather for pnid=%d, "
"state=%d(%s), seqNum_=%lld\n", method_name, __LINE__,
index, MyNode->GetState(),
StateString(MyNode->GetState()), seqNum_ );
}
// Queue the node up request for processing by a
// worker thread.
ReqQueue.enqueueUpReq( MyPNID, NULL, -1 );
}
if ( change_nid == MyPNID )
{
if( MyNode->GetState() == State_Down ||
MyNode->GetState() == State_Merged ||
MyNode->GetState() == State_Joining )
{
if (trace_settings & TRACE_RECOVERY)
trace_printf( "%s@%d enqueueing node up, state=%s\n",
method_name, __LINE__,
StateString(MyNode->GetState()) );
// Queue the node up request for processing by a
// worker thread.
ReqQueue.enqueueUpReq( MyPNID, NULL, -1 );
}
else
{ // This node is being "downed"
if (trace_settings & TRACE_RECOVERY)
trace_printf( "%s@%d enqueueing node down, state=%s\n",
method_name, __LINE__,
StateString(MyNode->GetState()) );
// Queue the node down request for processing by a
// worker thread.
ReqQueue.enqueueDownReq( MyPNID );
}
}
else
{
// In a real cluster, existing monitors need to merge new
// monitor.
CNode *pnode = change_nid != -1 ? Nodes->GetNode( change_nid ) : NULL;
#ifdef NAMESERVER_PROCESS
if ( change_nid != -1 && pnode )
#else
if ( ! Emulate_Down && change_nid != -1 && pnode )
#endif
{
switch ( pnode->GetState() )
{
case State_Down:
if (trace_settings & TRACE_RECOVERY)
trace_printf( "%s@%d - change_nid=%d, state=%s, "
"queueing up request\n",
method_name, __LINE__ , change_nid,
StateString(pnode->GetState()));
mem_log_write(CMonLog::MON_UPDATE_CLUSTER_5, change_nid);
// Queue the node up request for processing by a
// worker thread.
ReqQueue.enqueueUpReq( change_nid,
(char *)pnode->GetName(),
-1 );
break;
case State_Merging:
if (trace_settings & TRACE_RECOVERY)
trace_printf( "%s@%d - change_nid=%d, state=%s, "
"queueing up request\n",
method_name, __LINE__ , change_nid,
StateString(pnode->GetState()));
mem_log_write(CMonLog::MON_UPDATE_CLUSTER_6, change_nid);
switch( CommType )
{
case CommType_InfiniBand:
setNewComm(change_nid);
break;
case CommType_Sockets:
setNewSock(change_nid);
break;
default:
// Programmer bonehead!
MPI_Abort(MPI_COMM_SELF,99);
}
pnode->SetState( State_Merged );
ReqQueue.enqueueUpReq( change_nid,
(char *)pnode->GetName(),
-1 );
break;
case State_Merged:
case State_Joining:
default:
if (trace_settings & TRACE_RECOVERY)
trace_printf( "%s@%d - change_nid=%d, state=%s, "
"no action required.\n",
method_name, __LINE__ , change_nid,
StateString( pnode->GetState() ));
break;
}
}
}
switch ( node_state )
{
case State_Up:
case State_Joining:
case State_Merged:
case State_Merging:
case State_Initializing:
case State_Unlinked:
case State_Unknown:
break;
case State_Down:
doShutdown = true;
break;
case State_Stopped:
case State_Shutdown:
if (trace_settings & TRACE_SYNC_DETAIL)
trace_printf("%s@%d - Node %d is stopping.\n", method_name, __LINE__, index);
Node[index]->SetState( (STATE) node_state );
doShutdown = true;
break;
default:
if (trace_settings & TRACE_SYNC)
trace_printf("%s@%d - Node %d in unknown state (%d).\n",
method_name, __LINE__, index, node_state);
}
}
#ifdef NAMESERVER_PROCESS
// Update min monConnCount
int minConnCount = INT_MAX;
int minConnPnid = -1;
for (int index = 0; index < GetConfigPNodesMax(); index++)
{
int connCount = nodestate[index].monConnCount;
if ( ( connCount >= 0 ) && ( connCount < minConnCount ) )
{
minConnPnid = index;
minConnCount = connCount;
}
}
myMonConnCount_ = nodestate[MyPNID].monConnCount;
minMonConnCount_ = minConnCount;
minMonConnPnid_ = minConnPnid;
#else
if (NameServerEnabled)
{
clusterProcCount_ = 0;
for (int index = 0; index < GetConfigPNodesMax(); index++)
{
clusterProcCount_ += nodestate[index].monProcCount;
}
}
#endif
TRACE_EXIT;
}
// Dispatch the replicated messages gathered by the last "Allgather".
//
// syncBuf        - receive area; the buffer for node "n" starts at byte
//                  offset n * CommBufSize.
// sendBuf        - the buffer this node sent; used in place of the receive
//                  slot for MyPNID.
// deferredTmSync - false: handle ordinary buffers and only note buffers that
//                  hold a single InternalType_Sync message;
//                  true:  handle exactly those single-TmSync buffers.
//
// Returns true when the non-deferred pass found at least one TmSync buffer
// that still has to be handled by a second, deferred pass.
bool CCluster::ProcessClusterData( struct sync_buffer_def * syncBuf,
                                   struct sync_buffer_def * sendBuf,
                                   bool deferredTmSync )
{
    const char method_name[] = "CCluster::ProcessClusterData";
    TRACE_ENTRY;

    bool tmSyncPending = false;
    struct internal_msg_def *nodeMsg;

    for (int pnid = 0; pnid < GetConfigPNodesMax(); ++pnid)
    {
        // Determine whether a communication path to this node exists
        bool linkDown;
        switch( CommType )
        {
            case CommType_InfiniBand:
                linkDown = (comms_[pnid] == MPI_COMM_NULL);
                break;
            case CommType_Sockets:
                linkDown = (socks_[pnid] == -1);
                break;
            default:
                // Unsupported communication type is a programming error
                abort();
        }

        // Nodes without a connection are skipped, except for this node
        // itself whose data comes from the send buffer.
        if ( pnid != MyPNID && linkDown )
        {
            continue;
        }

        // Locate the buffer holding this node's messages
        struct sync_buffer_def *nodeBuf =
            ( pnid == MyPNID )
                ? sendBuf
                : (struct sync_buffer_def *)
                      (((char *) syncBuf) + pnid * CommBufSize);

        if (trace_settings & TRACE_SYNC)
        {
            trace_printf("%s@%d - Buffer for node %d, swpRecCount_=%d, seq_num=%lld, "
                         "lastSeqNum_=%lld, msg_count=%d, msg_offset=%d\n",
                         method_name, __LINE__, pnid, swpRecCount_,
                         nodeBuf->nodeInfo.seq_num,
                         lastSeqNum_,
                         nodeBuf->msgInfo.msg_count,
                         nodeBuf->msgInfo.msg_offset);
        }

        // A buffer whose sequence number is not newer than the last one
        // processed has already been handled.
        if ( nodeBuf->nodeInfo.seq_num <= lastSeqNum_ )
        {
            continue;
        }

        if (trace_settings & TRACE_SYNC)
        {
            trace_printf("%s@%d - Processing buffer for node %d, swpRecCount_=%d, seq_num=%lld, "
                         "lastSeqNum_=%lld, msg_count=%d, msg_offset=%d\n",
                         method_name, __LINE__, pnid, swpRecCount_,
                         nodeBuf->nodeInfo.seq_num,
                         lastSeqNum_,
                         nodeBuf->msgInfo.msg_count,
                         nodeBuf->msgInfo.msg_offset);
        }

        // PopMsg() walks the buffer starting from msg_offset; rewind it
        nodeBuf->msgInfo.msg_offset = 0;

#ifndef NAMESERVER_PROCESS
        if ( nodeBuf->msgInfo.msg_count == 1
          && (( internal_msg_def *)nodeBuf->msg)->type == InternalType_Sync )
        {
            if ( !deferredTmSync )
            {
                // First pass: remember that a TmSync message is waiting and
                // leave it until all other replicated data has been handled.
                tmSyncPending = true;
                if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
                    trace_printf("%s@%d - Deferring TmSync processing for node"
                                 " %d until replicated data is handled\n",
                                 method_name, __LINE__, pnid);
            }
            else
            {
                // Deferred pass: the single TmSync message is handled now
                if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
                    trace_printf("%s@%d - Handling deferred TmSync messages for "
                                 "node %d\n", method_name, __LINE__, pnid);
                nodeMsg = Nodes->PopMsg( nodeBuf );
                if ( pnid == MyPNID )
                    HandleMyNodeMsg (nodeMsg, MyPNID);
                else
                    HandleOtherNodeMsg (nodeMsg, pnid);
            }
        }
        else if ( !deferredTmSync )
#else
        if ( !deferredTmSync )
#endif
        {
            if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
                trace_printf("%s@%d - Handling messages for "
                             "node %d\n", method_name, __LINE__, pnid);

            // Drain the buffer; an InternalType_Null message ends the list
            for ( nodeMsg = Nodes->PopMsg( nodeBuf );
                  nodeMsg->type != InternalType_Null;
                  nodeMsg = Nodes->PopMsg( nodeBuf ) )
            {
                if ( pnid == MyPNID )
                    HandleMyNodeMsg (nodeMsg, MyPNID);
                else
                    HandleOtherNodeMsg (nodeMsg, pnid);
            }
        }
    }

    TRACE_EXIT;
    return tmSyncPending;
}
// Evaluate shutdown progress for this node.
//
// Returns true when MyNode's internal state is State_Ready_To_Exit after
// MyNode->CheckShutdownProcessing() has run. Returns false immediately
// (without calling CheckShutdownProcessing() or TRACE_EXIT) on the paths
// that have just marked this node State_Stopped / State_Ready_To_Exit, so
// that one more sync cycle can publish the new state to the other nodes.
//
// Side effects visible here: may change MyNode's state and internal state,
// set waitForWatchdogExit_ / waitForNameServerExit_, drive HealthCheck state
// changes, ask the Name Server to shut down, and enqueue a node-down request.
bool CCluster::checkIfDone ( )
{
const char method_name[] = "CCluster::checkIfDone";
TRACE_ENTRY;
// Number of configured Name Servers; stays 0 when no configuration is found
int nameServerCount = 0;
CClusterConfig *clusterConfig = Nodes->GetClusterConfig();
CNameServerConfigContainer *nameServerConfigContainer = NULL;
#ifdef NAMESERVER_PROCESS
if (clusterConfig)
{
nameServerConfigContainer = Nodes->GetNameServerConfig();
if (nameServerConfigContainer)
{
nameServerCount = nameServerConfigContainer->GetCount();
}
}
#else
// 1 when a Name Server is expected on this node, otherwise 0
int myNameServerCount = 0;
CNameServerConfig *nameServerConfig = NULL;
if (NameServerEnabled && clusterConfig)
{
nameServerConfigContainer = Nodes->GetNameServerConfig();
if (nameServerConfigContainer)
{
nameServerCount = nameServerConfigContainer->GetCount();
if (IsRealCluster)
{
// Real cluster: look up a Name Server configured for this node's name
nameServerConfig = nameServerConfigContainer->GetConfig( Node_name );
if (nameServerConfig)
{
myNameServerCount = 1;
}
}
else
{
// Not a real cluster: pnids below nameServerCount are treated as
// having a Name Server
if (nameServerCount && MyPNID < nameServerCount)
{
myNameServerCount = 1;
}
}
}
}
#endif
// Trace the values the shutdown decision below is based on
#ifdef NAMESERVER_PROCESS
if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL | TRACE_SYNC))
trace_printf("%s@%d - Node %d shutdown level=%d, state=%s. Process "
"count=%d, internal state=%d, currentNodes_=%d, "
"local process count=%d, shutdownNameServer=%d, "
"nameServerCount=%d\n",
method_name, __LINE__,
MyNode->GetPNid(),
MyNode->GetShutdownLevel(),
StateString(MyNode->GetState()),
Nodes->ProcessCount(),
MyNode->getInternalState(),
currentNodes_,
MyNode->GetNumProcs(),
MyNode->IsShutdownNameServer(),
nameServerCount );
#else
if (NameServerEnabled)
{
if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL | TRACE_SYNC))
trace_printf("%s@%d - Node %d shutdown level=%d, state=%s. Cluster process "
"count=%d, internal state=%d, currentNodes_=%d, "
"local process count=%d\n",
method_name, __LINE__, MyNode->GetPNid(),
MyNode->GetShutdownLevel(),
StateString(MyNode->GetState()),
clusterProcCount_,
MyNode->getInternalState(),
currentNodes_, MyNode->GetNumProcs());
}
else
{
if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL | TRACE_SYNC))
trace_printf("%s@%d - Node %d shutdown level=%d, state=%s. Process "
"count=%d, internal state=%d, currentNodes_=%d, "
"local process count=%d\n",
method_name, __LINE__, MyNode->GetPNid(),
MyNode->GetShutdownLevel(),
StateString(MyNode->GetState()),
Nodes->ProcessCount(),
MyNode->getInternalState(),
currentNodes_, MyNode->GetNumProcs());
}
#endif
// Check if we are also done
if (( MyNode->GetState() != State_Down ) &&
( MyNode->GetState() != State_Stopped ) )
{
// Only act when a shutdown level has been set on this node
if ( MyNode->GetShutdownLevel() != ShutdownLevel_Undefined )
{
#ifdef NAMESERVER_PROCESS
if ( (Nodes->ProcessCount() <= nameServerCount ) // only Name Servers alive
&& (MyNode->GetNumProcs() <= MAX_PRIMITIVES ) // only My Name Server alive
&& MyNode->IsShutdownNameServer() // monitor shutdown Name Server received
&& !MyNode->isInQuiesceState() ) // post-quiescing will
// expire WDG (cluster)
{
if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL | TRACE_SYNC))
trace_printf("%s@%d - Name Server signaled to exit.\n", method_name, __LINE__);
MyNode->SetState( State_Stopped );
MyNode->SetInternalState(State_Ready_To_Exit);
// we need to sync one more time so other nodes see our state
return false;
}
#else
if ( NameServerEnabled )
{
// Name Server configuration: decisions use the cluster-wide process
// count (clusterProcCount_). The three cases below are mutually
// exclusive and checked in this order.
if ( clusterProcCount_ == 0 ) // all Name Servers exited
{
if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL | TRACE_SYNC))
trace_printf("%s@%d - Monitor signaled to exit.\n", method_name, __LINE__);
MyNode->SetState( State_Stopped );
MyNode->SetInternalState(State_Ready_To_Exit);
// we need to sync one more time so other nodes see our state
return false;
}
else if ( (clusterProcCount_ <=
(currentNodes_ * (MAX_PRIMITIVES+1)) ) // only WDGs and Name Servers alive
&& (MyNode->GetNumProcs() <=
(MAX_PRIMITIVES+1) ) // only WDGs and Name Servers alive
&& !MyNode->isInQuiesceState() // post-quiescing will
// expire WDG (cluster)
&& !waitForWatchdogExit_ ) // WDG not yet exiting
{
// NOTE(review): the trace prints Nodes->ProcessCount() and
// MyNode->ProcessCount(), not the clusterProcCount_ and
// MyNode->GetNumProcs() values tested in the condition above.
if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL | TRACE_SYNC))
trace_printf("%s@%d - Stopping watchdog process. "
"(process count: cluster=%d, MyNode=%d)\n",
method_name, __LINE__,
Nodes->ProcessCount(), MyNode->ProcessCount());
// Latch so the watchdog stop is requested only once
waitForWatchdogExit_ = true;
// stop the watchdog timer first
HealthCheck.setState(MON_STOP_WATCHDOG);
// let the watchdog process exit
HealthCheck.setState(MON_EXIT_PRIMITIVES);
}
else if ( (MyNode->GetNumProcs() <= // only My Name Server alive
myNameServerCount )
&& !MyNode->isInQuiesceState() // post-quiescing will
// expire WDG (cluster)
&& !waitForNameServerExit_ ) // Name Server not yet exiting
{
if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL | TRACE_SYNC))
trace_printf("%s@%d - Stopping Name Server process. "
"(process count: cluster=%d, MyNode=%d)\n",
method_name, __LINE__,
Nodes->ProcessCount(), MyNode->ProcessCount());
// Latch so the Name Server shutdown is requested only once
waitForNameServerExit_ = true;
int rc = NameServer->ProcessShutdown();
if (rc)
{
// Non-zero return: log the failure and bring this node down
char la_buf[MON_STRING_BUF_SIZE];
snprintf( la_buf, sizeof(la_buf)
, "[%s] - Shutdown request to Name Server failed, node going down\n"
, method_name );
mon_log_write( MON_CLUSTER_CHECKIFDONE_1, SQ_LOG_ERR, la_buf );
ReqQueue.enqueueDownReq( MyPNID );
}
}
}
else
{
// No Name Server: decisions use Nodes->ProcessCount()
if ( Nodes->ProcessCount() == 0 ) // all WDTs exited
{
if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL | TRACE_SYNC))
trace_printf("%s@%d - Monitor signaled to exit.\n", method_name, __LINE__);
MyNode->SetState( State_Stopped );
MyNode->SetInternalState(State_Ready_To_Exit);
// we need to sync one more time so other nodes see our state
return false;
}
else if ( (Nodes->ProcessCount() <=
(currentNodes_*MAX_PRIMITIVES)) // only WDGs alive
&& !MyNode->isInQuiesceState() // post-quiescing will
// expire WDG (cluster)
&& !waitForWatchdogExit_ ) // WDG not yet exiting
{
if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL | TRACE_SYNC))
trace_printf("%s@%d - Stopping watchdog process.\n",
method_name, __LINE__);
// Latch so the watchdog stop is requested only once
waitForWatchdogExit_ = true;
// stop the watchdog timer first
HealthCheck.setState(MON_STOP_WATCHDOG);
// let the watchdog process exit
HealthCheck.setState(MON_EXIT_PRIMITIVES);
}
}
#endif
}
}
// Node is Down or Stopped. The State_Down test below narrows this to a
// Down node with a shutdown level set and no local processes left.
else if ( MyNode->GetShutdownLevel() != ShutdownLevel_Undefined
&& MyNode->GetState() == State_Down
&& MyNode->GetNumProcs() == 0)
{
if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL | TRACE_SYNC))
trace_printf("%s@%d - No processes remaining, monitor exiting.\n",
method_name, __LINE__);
MyNode->SetState( State_Stopped );
MyNode->SetInternalState(State_Ready_To_Exit);
// we need to sync one more time so other nodes see our state
return false;
}
// Reached only when none of the early-return paths above was taken
MyNode->CheckShutdownProcessing();
TRACE_EXIT;
return ( MyNode->getInternalState() == State_Ready_To_Exit );
}
// "Allgather" performance statistics
//
// agBuckets_ holds the lower bounds of the elapsed-time histogram kept in
// agElapsed_. agTimeStats() searches the table from the last entry down and
// counts an elapsed time in the first bucket whose bound it strictly
// exceeds, so bucket i covers (agBuckets_[i], agBuckets_[i+1]] and the last
// bucket is open-ended. All bounds are below one second (tv_sec == 0).
const struct timespec CCluster::agBuckets_[] = {
{0, 0}, // lowest
{0, 20000}, // 20 us
{0, 50000}, // 50 us
{0, 500000}, // 500 us
{0, 1000000}, // 1 ms
{0, 10000000}, // 10 ms
{0, 25000000}, // 25 ms
{0, 50000000}, // 50 ms
{0, 100000000}, // 100 ms
{0, 500000000}}; // 500 ms
// Number of entries in agBuckets_
const int CCluster::agBucketsSize_ = sizeof(agBuckets_)/sizeof(timespec);
bool CCluster::agTimeStats(struct timespec & ts_begin,
struct timespec & ts_end)
{
const char method_name[] = "CCluster::agTimeStats";
bool slowAg = false;
struct timespec timediff;
if ( (ts_end.tv_nsec - ts_begin.tv_nsec ) < 0 )
{
timediff.tv_sec = ts_end.tv_sec - ts_begin.tv_sec - 1;
timediff.tv_nsec = 1000000000 + ts_end.tv_nsec - ts_begin.tv_nsec;
}
else
{
timediff.tv_sec = ts_end.tv_sec - ts_begin.tv_sec;
timediff.tv_nsec = ts_end.tv_nsec - ts_begin.tv_nsec;
}
if ( timediff.tv_sec > agMaxElapsed_.tv_sec
|| (timediff.tv_sec == agMaxElapsed_.tv_sec
&& timediff.tv_nsec > agMaxElapsed_.tv_nsec ))
// Have a new maximum elapsed time
agMaxElapsed_ = timediff;
if ( timediff.tv_sec < agMinElapsed_.tv_sec
|| (timediff.tv_sec == agMinElapsed_.tv_sec
&& timediff.tv_nsec < agMinElapsed_.tv_nsec ))
// Have a new minimum time
agMinElapsed_ = timediff;
for (int i=agBucketsSize_-1; i>=0; --i)
{
if (timediff.tv_sec > agBuckets_[i].tv_sec
|| (timediff.tv_sec == agBuckets_[i].tv_sec
&& timediff.tv_nsec > agBuckets_[i].tv_nsec ))
{
++agElapsed_[i];
if (i >= 7)
{
slowAg = true;
if (trace_settings & TRACE_SYNC)
{
trace_printf("%s@%d slow Allgather=(%ld, %ld) seqNum_=%lld, i=%d\n",
method_name, __LINE__,
timediff.tv_sec, timediff.tv_nsec, seqNum_, i);
}
}
break;
}
}
return slowAg;
}
// Display "Allgather" statistics
//
// Writes the minimum and maximum Allgather elapsed times and one line per
// histogram bucket to the trace output. Each bucket line shows the counter
// from agElapsed_ together with the bucket's bound: the upper bound
// ("<=", taken from the next agBuckets_ entry) for all but the last bucket,
// and the lower bound (">") for the last, open-ended bucket.
void CCluster::stats()
{
    const char method_name[] = "CCluster::stats";

    // tv_nsec is zero-padded to nine digits so that "sec.nsec" reads as a
    // proper decimal fraction (50 ms prints as 0.050000000, not 0.50000000).
    trace_printf("%s@%d Allgather min elapsed=%ld.%09ld\n", method_name, __LINE__,
                 agMinElapsed_.tv_sec, agMinElapsed_.tv_nsec);

    trace_printf("%s@%d Allgather max elapsed=%ld.%09ld\n", method_name, __LINE__,
                 agMaxElapsed_.tv_sec, agMaxElapsed_.tv_nsec);

    unsigned long int bucket;
    const char * unit;
    const char * range;
    for (int i=0; i<agBucketsSize_; ++i)
    {
        if ( i == (agBucketsSize_-1))
        {
            // Last bucket has no upper bound; report its lower bound
            bucket = agBuckets_[i].tv_nsec;
            range = ">";
        }
        else
        {
            // Report the upper bound, i.e. the next bucket's lower bound
            bucket = agBuckets_[i+1].tv_nsec;
            range = "<=";
        }

        // Scale nanoseconds to the largest unit that keeps the value < 1000
        bucket = bucket/1000;
        if (bucket < 1000)
            unit = "usec";
        else
        {
            bucket = bucket / 1000;
            if ( bucket < 1000 )
                unit = "msec";
            else
                unit = "???";
        }

        // "bucket" is unsigned long, so it is printed with %lu
        trace_printf("%s@%d bucket[%d]=%d (%s %lu %s)\n",
                     method_name, __LINE__, i, agElapsed_[i],
                     range, bucket, unit);
    }
}
// Run one sync cycle: exchange this node's sync buffer with all other
// monitors via Allgather(), update the cluster state from the replies and
// process the replicated messages that were received.
//
// The function can be re-entered once while TmSync data is being processed;
// swpRecCount_ tracks the nesting depth and selects the receive buffer
// (recvBuffer_ for the outer call, recvBuffer2_ for the nested one).
//
// When Allgather() reports a reconnect (reconnectSeqNum_ != 0), the exchange
// is repeated from the "reconnected" label, either with the previous sync
// buffer followed by the current one, or with the current one only,
// depending on how seqNum_ compares with lowSeqNum_ / highSeqNum_.
//
// Returns the result of checkIfDone() when UpdateClusterState() set
// doShutdown, otherwise false.
bool CCluster::exchangeNodeData ( )
{
    const char method_name[] = "CCluster::exchangeNodeData";
    TRACE_ENTRY;

    bool result = false;

    // Record statistics (sonar counters)
    if (sonar_verify_state(SONAR_ENABLED | SONAR_MONITOR_ENABLED))
        MonStats->req_sync_Incr();

    ++swpRecCount_; // recursive count for this function

    bool doShutdown = false;
    bool lastAllgatherWithLastSyncBuffer = false;
    struct internal_msg_def *msg;
    MPI_Status status[GetConfigPNodesMax()];
    int err;
    struct sync_buffer_def *recv_buffer;
    struct sync_buffer_def *send_buffer = Nodes->GetSyncBuffer();
    unsigned long long savedSeqNum = 0;

    // if we are here in a second recursive call that occurred while
    // processing TMSync data, use the second receive buffer
    // else, use the first one.
    if (swpRecCount_ == 1)
    {
        recv_buffer = recvBuffer_;
    }
    else
    {
        // should not be here in more than one recursive call.
        assert(swpRecCount_ == 2);
        recv_buffer = recvBuffer2_;
    }

    // Initialize sync buffer header including node state
    msg = Nodes->InitSyncBuffer( send_buffer, seqNum_, upNodes_ );

    // Fill sync buffer based on queue of replication requests
    Replicator.FillSyncBuffer ( msg );

reconnected:

    if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
        trace_printf( "%s@%d - doing Allgather size=%d, swpRecCount_=%d, "
                      "message count=%d, message seq_num=%lld, "
                      "seqNum_=%lld, lastSeqNum_=%lld, lowSeqNum_=%lld, "
                      "highSeqNum_=%lld, reconnectSeqNum_=%lld\n"
                    , method_name, __LINE__
                    , Nodes->GetSyncSize()
                    , swpRecCount_
                    , send_buffer->msgInfo.msg_count
                    , send_buffer->nodeInfo.seq_num
                    , seqNum_
                    , lastSeqNum_
                    , lowSeqNum_
                    , highSeqNum_
                    , reconnectSeqNum_);

    // The timestamps are only used to compute the elapsed time of the
    // Allgather, so the monotonic clock is used: unlike CLOCK_REALTIME it
    // is not affected by wall-clock adjustments, which would otherwise
    // produce bogus (even negative) durations in the statistics.
    struct timespec ts_ag_begin;
    clock_gettime(CLOCK_MONOTONIC, &ts_ag_begin);

    // Exchange info with other nodes
    err = Allgather(Nodes->GetSyncSize(), send_buffer, (char *)recv_buffer,
                    0 /*seqNum_*/, status );

    struct timespec ts_ag_end;
    clock_gettime(CLOCK_MONOTONIC, &ts_ag_end);

    if (err != MPI_SUCCESS && err != MPI_ERR_IN_STATUS)
    {
        if (trace_settings & TRACE_SYNC)
        {
            trace_printf("%s@%d - unexpected Allgather error=%s (%d)\n",
                         method_name, __LINE__, ErrorMsg(err), err);
        }

        char buf[MON_STRING_BUF_SIZE];
        snprintf(buf, sizeof(buf), "[%s], Unexpected MPI communications "
                 "error=%s (%d).\n", method_name, ErrorMsg(err), err);
        mon_log_write(MON_CLUSTER_EXCHANGENODEDATA_1, SQ_LOG_ERR, buf);

        // Allgather() failed in a fundamental way, bring this node down
        // (enqueuedDown_ ensures the request is queued only once)
        if ( !enqueuedDown_ )
        {
            enqueuedDown_ = true;
            ReqQueue.enqueueDownReq(MyPNID);
        }
    }
    else
    {
        if (agTimeStats( ts_ag_begin, ts_ag_end))
        {  // Slow cycle, print info
            if ( trace_settings & TRACE_SYNC )
            {
                trace_printf("%s@%d - slow Allgather info: sync size=%d, message count=%d, MyPNID=%d\n",
                             method_name, __LINE__, Nodes->GetSyncSize(),
                             send_buffer->msgInfo.msg_count, MyPNID);

                struct sync_buffer_def *msgBuf;
                int nr;

                for (int i = 0; i < GetConfigPNodesMax(); i++)
                {
                    bool noComm;
                    switch( CommType )
                    {
                        case CommType_InfiniBand:
                            noComm = (comms_[i] == MPI_COMM_NULL) ? true : false;
                            break;
                        case CommType_Sockets:
                            noComm = (socks_[i] == -1) ? true : false;
                            break;
                        default:
                            // Programmer bonehead!
                            abort();
                    }
                    // Only process active nodes
                    if (noComm) continue;

                    msgBuf = (struct sync_buffer_def *)
                        (((char *) recv_buffer) + i * CommBufSize);

                    MPI_Get_count(&status[i], MPI_CHAR, &nr);

                    trace_printf("%s@%d - slow Allgather info, pnid=%d: received bytes=%d, message count=%d, msg_offset=%d\n",
                                 method_name, __LINE__, i, nr,
                                 msgBuf->msgInfo.msg_count,
                                 msgBuf->msgInfo.msg_offset);
                }
            }
        }

        UpdateClusterState( doShutdown
                          , recv_buffer
                          , status
                          , send_buffer->nodeInfo.change_nid);

        if ( lastAllgatherWithLastSyncBuffer )
        {
            // The previous sync buffer has just been re-sent; restore the
            // current sequence number and repeat with the current buffer.
            seqNum_ = savedSeqNum;
            lastAllgatherWithLastSyncBuffer = false;
            send_buffer = Nodes->GetSyncBuffer();
            if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
                trace_printf( "%s@%d - Resetting lastAllgatherWithLastSyncBuffer=%d\n"
                            , method_name, __LINE__
                            , lastAllgatherWithLastSyncBuffer);
            goto reconnected;
        }

        if ( reconnectSeqNum_ != 0 )
        {
            if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
                trace_printf( "%s@%d - Allgather IO retry, swpRecCount_=%d, "
                              "seqNum_=%lld, lastSeqNum_=%lld, lowSeqNum_=%lld, "
                              "highSeqNum_=%lld, reconnectSeqNum_=%lld\n"
                            , method_name, __LINE__
                            , swpRecCount_
                            , seqNum_
                            , lastSeqNum_
                            , lowSeqNum_
                            , highSeqNum_
                            , reconnectSeqNum_);

            // The Allgather() has executed a reconnect at reconnectSeqNum_.
            // The UpdateClusterState has set the lowSeqNum_ and highSeqNum_
            // in the current IO exchange which will indicate whether there is
            // a mismatch in IOs between monitor processes. If there is a mismatch,
            // the lowSeqNum_ and highSeqNum_ relative to our current seqNum_
            // will determine how to redrive the exchange of node data.
            if (seqNum_ > lowSeqNum_)
            { // A remote monitor did not receive our last SyncBuffer
                // Redo exchange with the previous SyncBuffer
                send_buffer = Nodes->GetLastSyncBuffer();
                savedSeqNum = seqNum_;
                seqNum_ = lastSeqNum_;
                // Indicate to follow up the next exchange with current SyncBuffer
                lastAllgatherWithLastSyncBuffer = true;
                lowSeqNum_ = highSeqNum_ = reconnectSeqNum_ = 0;
                if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
                    trace_printf( "%s@%d - Setting lastAllgatherWithLastSyncBuffer=%d\n"
                                , method_name, __LINE__
                                , lastAllgatherWithLastSyncBuffer);
                goto reconnected;
            }
            else if (seqNum_ < highSeqNum_)
            { // The local monitor did not receive the last remote SyncBuffer
                // Redo exchange with the current SyncBuffer
                send_buffer = Nodes->GetSyncBuffer();
                lowSeqNum_ = highSeqNum_ = reconnectSeqNum_ = 0;
                if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
                    trace_printf( "%s@%d - lastAllgatherWithLastSyncBuffer=%d\n"
                                , method_name, __LINE__
                                , lastAllgatherWithLastSyncBuffer);
                goto reconnected;
            }
            // No mismatch: clear the reconnect bookkeeping and carry on
            lowSeqNum_ = highSeqNum_ = reconnectSeqNum_ = 0;
        }
    }

    if ( ProcessClusterData( recv_buffer, send_buffer, false ) )
    {   // There is a TmSync message remaining to be handled
        ProcessClusterData( recv_buffer, send_buffer, true );
    }

    if (swpRecCount_ == 1)
    {
        // Save the sync buffer and corresponding sequence number we just processed
        // On reconnect we must resend the last buffer and the current buffer
        // to ensure dropped buffers are processed by all monitor processes in the
        // correct order
        Nodes->SaveMyLastSyncBuffer();
        lastSeqNum_ = seqNum_;

        // Increment count of "Allgather" calls.  If wrap-around, start again at 1.
        if ( ++seqNum_ == 0) seqNum_ = 1;
    }

    // Wake up any threads waiting on the completion of a sync cycle
    syncCycle_.wakeAll();

    if (doShutdown) result = checkIfDone( );

    if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
        trace_printf( "%s@%d - node data exchange completed, swpRecCount_=%d, "
                      "seqNum_=%lld, lastSeqNum_=%lld, reconnectSeqNum_=%lld\n"
                    , method_name, __LINE__
                    , swpRecCount_
                    , seqNum_
                    , lastSeqNum_
                    , reconnectSeqNum_);

    --swpRecCount_;

    TRACE_EXIT;
    return result;
}
#ifndef NAMESERVER_PROCESS
// Run one sync cycle that carries TmSync data.
//
// sync     - TmSync data added to the send buffer through AddTmsyncMsg().
// bumpSync - when true, the current sync buffer is saved as the "last"
//            buffer and seqNum_ is advanced before the exchange.
//
// The flow mirrors exchangeNodeData(): Allgather(), UpdateClusterState(),
// the reconnect retry via the "reconnected" label, and ProcessClusterData().
// In this function doShutdown is filled in by UpdateClusterState() but is
// not acted on, and neither syncCycle_.wakeAll() nor checkIfDone() is
// called.
void CCluster::exchangeTmSyncData ( struct sync_def *sync, bool bumpSync )
{
    const char method_name[] = "CCluster::exchangeTmSyncData";
    TRACE_ENTRY;

    ++swpRecCount_; // recursive count for this function

    bool doShutdown = false;
    bool lastAllgatherWithLastSyncBuffer = false;
    struct internal_msg_def *msg;
    MPI_Status status[GetConfigPNodesMax()];
    int err;
    struct sync_buffer_def *recv_buffer;
    struct sync_buffer_def *send_buffer = Nodes->GetSyncBuffer();
    unsigned long long savedSeqNum = 0;

    // if we are here in a second recursive call that occurred while
    // processing TMSync data, use the second receive buffer
    // else, use the first one.
    if (swpRecCount_ == 1)
    {
        recv_buffer = recvBuffer_;
    }
    else
    {
        // should not be here in more than one recursive call.
        assert(swpRecCount_ == 2);
        recv_buffer = recvBuffer2_;
    }

    if (bumpSync)
    {
        // Save the sync buffer and corresponding sequence number we just processed
        // On reconnect we must resend the last buffer and the current buffer
        // to ensure dropped buffers are processed by all monitor processes in the
        // correct order
        Nodes->SaveMyLastSyncBuffer();
        lastSeqNum_ = seqNum_;

        // Increment count of "Allgather" calls.  If wrap-around, start again at 1.
        if ( ++seqNum_ == 0) seqNum_ = 1;

        if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
            trace_printf( "%s@%d - Bumping sequence number, "
                          "swpRecCount_=%d, seqNum_=%lld, lastSeqNum_=%lld\n"
                        , method_name, __LINE__
                        , swpRecCount_
                        , seqNum_
                        , lastSeqNum_);
    }

    // Initialize sync buffer header including node state
    msg = Nodes->InitSyncBuffer( send_buffer, seqNum_, upNodes_ );

    // Add tmsync data
    AddTmsyncMsg( send_buffer, sync, msg );

reconnected:

    if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
        trace_printf( "%s@%d - doing Allgather size=%d, swpRecCount_=%d, "
                      "message count=%d, message seq_num=%lld, "
                      "seqNum_=%lld, lastSeqNum_=%lld, lowSeqNum_=%lld, "
                      "highSeqNum_=%lld, reconnectSeqNum_=%lld\n"
                    , method_name, __LINE__
                    , Nodes->GetSyncSize()
                    , swpRecCount_
                    , send_buffer->msgInfo.msg_count
                    , send_buffer->nodeInfo.seq_num
                    , seqNum_
                    , lastSeqNum_
                    , lowSeqNum_
                    , highSeqNum_
                    , reconnectSeqNum_);

    // The timestamps are only used to compute the elapsed time of the
    // Allgather, so the monotonic clock is used: unlike CLOCK_REALTIME it
    // is not affected by wall-clock adjustments, which would otherwise
    // produce bogus (even negative) durations in the statistics.
    struct timespec ts_ag_begin;
    clock_gettime(CLOCK_MONOTONIC, &ts_ag_begin);

    // Exchange info with other nodes
    err = Allgather(Nodes->GetSyncSize(), send_buffer, (char *)recv_buffer,
                    0 /*seqNum_*/, status );

    struct timespec ts_ag_end;
    clock_gettime(CLOCK_MONOTONIC, &ts_ag_end);

    if (err != MPI_SUCCESS && err != MPI_ERR_IN_STATUS)
    {
        if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
        {
            trace_printf("%s@%d - unexpected Allgather error=%s (%d)\n",
                         method_name, __LINE__, ErrorMsg(err), err);
        }

        char buf[MON_STRING_BUF_SIZE];
        snprintf(buf, sizeof(buf), "[%s], Unexpected MPI communications "
                 "error=%s (%d).\n", method_name, ErrorMsg(err), err);
        mon_log_write(MON_CLUSTER_EXCHANGETMSYNC_1, SQ_LOG_ERR, buf);

        // Allgather() failed in a fundamental way, bring this node down
        // (enqueuedDown_ ensures the request is queued only once)
        if ( !enqueuedDown_ )
        {
            enqueuedDown_ = true;
            ReqQueue.enqueueDownReq(MyPNID);
        }
    }
    else
    {
        if (agTimeStats( ts_ag_begin, ts_ag_end))
        {  // Slow cycle, print info
            if ( trace_settings & TRACE_SYNC )
            {
                trace_printf("%s@%d - slow Allgather info: sync size=%d, message count=%d, MyPNID=%d\n",
                             method_name, __LINE__, Nodes->GetSyncSize(),
                             send_buffer->msgInfo.msg_count, MyPNID);

                struct sync_buffer_def *msgBuf;
                int nr;

                for (int i = 0; i < GetConfigPNodesMax(); i++)
                {
                    bool noComm;
                    switch( CommType )
                    {
                        case CommType_InfiniBand:
                            noComm = (comms_[i] == MPI_COMM_NULL) ? true : false;
                            break;
                        case CommType_Sockets:
                            noComm = (socks_[i] == -1) ? true : false;
                            break;
                        default:
                            // Programmer bonehead!
                            abort();
                    }
                    // Only process active nodes
                    if (noComm) continue;

                    msgBuf = (struct sync_buffer_def *)
                        (((char *) recv_buffer) + i * CommBufSize);

                    MPI_Get_count(&status[i], MPI_CHAR, &nr);

                    trace_printf("%s@%d - slow Allgather info, pnid=%d: received bytes=%d, message count=%d, msg_offset=%d\n",
                                 method_name, __LINE__, i, nr,
                                 msgBuf->msgInfo.msg_count,
                                 msgBuf->msgInfo.msg_offset);
                }
            }
        }

        UpdateClusterState( doShutdown
                          , recv_buffer
                          , status
                          , send_buffer->nodeInfo.change_nid);

        if ( lastAllgatherWithLastSyncBuffer )
        {
            // The previous sync buffer has just been re-sent; restore the
            // current sequence number and repeat with the current buffer.
            seqNum_ = savedSeqNum;
            lastAllgatherWithLastSyncBuffer = false;
            send_buffer = Nodes->GetSyncBuffer();
            if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
                trace_printf( "%s@%d - Resetting lastAllgatherWithLastSyncBuffer=%d\n"
                            , method_name, __LINE__
                            , lastAllgatherWithLastSyncBuffer);
            goto reconnected;
        }

        if ( reconnectSeqNum_ != 0 )
        {
            if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
                trace_printf( "%s@%d - Allgather IO retry, swpRecCount_=%d, "
                              "seqNum_=%lld, lastSeqNum_=%lld, lowSeqNum_=%lld, "
                              "highSeqNum_=%lld, reconnectSeqNum_=%lld\n"
                            , method_name, __LINE__
                            , swpRecCount_
                            , seqNum_
                            , lastSeqNum_
                            , lowSeqNum_
                            , highSeqNum_
                            , reconnectSeqNum_);

            // The Allgather() has executed a reconnect at reconnectSeqNum_.
            // The UpdateClusterState has set the lowSeqNum_ and highSeqNum_
            // in the current IO exchange which will indicate whether there is
            // a mismatch in IOs between monitor processes. If there is a mismatch,
            // the lowSeqNum_ and highSeqNum_ relative to our current seqNum_
            // will determine how to redrive the exchange of node data.
            if (seqNum_ > lowSeqNum_)
            { // A remote monitor did not receive our last SyncBuffer
                // Redo exchange with the previous SyncBuffer
                send_buffer = Nodes->GetLastSyncBuffer();
                savedSeqNum = seqNum_;
                seqNum_ = lastSeqNum_;
                // Indicate to follow up the next exchange with current SyncBuffer
                lastAllgatherWithLastSyncBuffer = true;
                lowSeqNum_ = highSeqNum_ = reconnectSeqNum_ = 0;
                if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
                    trace_printf( "%s@%d - Setting lastAllgatherWithLastSyncBuffer=%d\n"
                                , method_name, __LINE__
                                , lastAllgatherWithLastSyncBuffer);
                goto reconnected;
            }
            else if (seqNum_ < highSeqNum_)
            { // The local monitor did not receive the last remote SyncBuffer
                // Redo exchange with the current SyncBuffer
                send_buffer = Nodes->GetSyncBuffer();
                lowSeqNum_ = highSeqNum_ = reconnectSeqNum_ = 0;
                if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
                    trace_printf( "%s@%d - lastAllgatherWithLastSyncBuffer=%d\n"
                                , method_name, __LINE__
                                , lastAllgatherWithLastSyncBuffer);
                goto reconnected;
            }
            // No mismatch: clear the reconnect bookkeeping and carry on
            lowSeqNum_ = highSeqNum_ = reconnectSeqNum_ = 0;
        }
    }

    if ( ProcessClusterData( recv_buffer, send_buffer, false ) )
    {   // There is a TmSync message remaining to be handled
        ProcessClusterData( recv_buffer, send_buffer, true );
    }

    if (swpRecCount_ == 1)
    {
        // Save the sync buffer and corresponding sequence number we just processed
        // On reconnect we must resend the last buffer and the current buffer
        // to ensure dropped buffers are processed by all monitor processes in the
        // correct order
        Nodes->SaveMyLastSyncBuffer();
        lastSeqNum_ = seqNum_;

        // Increment count of "Allgather" calls.  If wrap-around, start again at 1.
        if ( ++seqNum_ == 0) seqNum_ = 1;
    }

    if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
        trace_printf( "%s@%d - node data exchange completed, swpRecCount_=%d, "
                      "seqNum_=%lld, lastSeqNum_=%lld, reconnectSeqNum_=%lld\n"
                    , method_name, __LINE__
                    , swpRecCount_
                    , seqNum_
                    , lastSeqNum_
                    , reconnectSeqNum_);

    --swpRecCount_;

    TRACE_EXIT;
}
#endif
// Wrapper around epoll_ctl() that treats any failure as fatal.
//
// efd   - epoll instance file descriptor
// op    - EPOLL_CTL_* operation
// fd    - target file descriptor
// event - event description passed through to epoll_ctl(); may be NULL
//
// On failure a critical event naming the peer node that owns "fd" is logged
// and the process is terminated through MPI_Abort().
void CCluster::EpollCtl( int efd, int op, int fd, struct epoll_event *event )
{
    const char method_name[] = "CCluster::EpollCtl";
    TRACE_ENTRY;

#if 0
    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
    {
        int iPeer;
        for ( iPeer = 0; iPeer < GetConfigPNodesMax(); iPeer++ )
        { // Find corresponding peer by matching socket fd
            if ( fd == socks_[iPeer] ) break;
        }
        trace_printf( "%s@%d epoll_ctl( efd=%d,%s, fd=%d(%s), %s )\n"
                    , method_name, __LINE__
                    , efd
                    , EpollOpString(op)
                    , fd
                    , (iPeer < GetConfigPNodesMax() && Node[iPeer] != NULL)
                        ? Node[iPeer]->GetName() : "unknown"
                    , EpollEventString(event ? event->events : 0) );
    }
#endif

    int rc = epoll_ctl( efd, op, fd, event );
    if ( rc == -1 )
    {
        // Capture errno immediately; the calls made while building the
        // message below could overwrite it.
        int err = errno;
        char ebuff[256];
        char buf[MON_STRING_BUF_SIZE];

        int iPeer;
        for ( iPeer = 0; iPeer < GetConfigPNodesMax(); iPeer++ )
        { // Find corresponding peer by matching socket fd
            if ( fd == socks_[iPeer] ) break;
        }

        // The fd may not belong to any peer socket; in that case iPeer is
        // one past the last valid index and must not be used with Node[].
        const char *peerName = "unknown";
        if ( iPeer < GetConfigPNodesMax() && Node[iPeer] != NULL )
        {
            peerName = Node[iPeer]->GetName();
        }

        // "event" may legitimately be NULL (e.g. for EPOLL_CTL_DEL)
        uint32_t eventMask = event ? event->events : 0;

        snprintf( buf, sizeof(buf), "[%s@%d] epoll_ctl(efd=%d,%s, fd=%d(%s), %s) error: %s\n"
                , method_name, __LINE__
                , efd
                , EpollOpString(op)
                , fd, peerName
                , EpollEventString(eventMask)
                , strerror_r( err, ebuff, sizeof(ebuff) ) );
        mon_log_write( MON_CLUSTER_EPOLLCTL_1, SQ_LOG_CRIT, buf );

        MPI_Abort( MPI_COMM_SELF,99 );
    }

    TRACE_EXIT;
    return;
}
// Removes socket 'fd' from the epoll instance 'efd'.
//
//   efd   - epoll instance file descriptor
//   fd    - socket file descriptor to remove
//   event - event settings handed to epoll_ctl(); also read for trace/error text
//
// ENOENT (fd was not registered) is tolerated silently. Any other epoll_ctl()
// failure is logged as critical and the process is aborted via MPI_Abort().
void CCluster::EpollCtlDelete( int efd, int fd, struct epoll_event *event )
{
    const char method_name[] = "CCluster::EpollCtlDelete";
    TRACE_ENTRY;

    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
    {
        // Find corresponding peer by matching socket fd
        const int pnodesMax = GetConfigPNodesMax();
        int iPeer;
        for ( iPeer = 0; iPeer < pnodesMax; iPeer++ )
        {
            if ( fd == socks_[iPeer] ) break;
        }

        // The socket being removed may no longer be recorded in socks_;
        // never index Node[] past the last entry or dereference an empty slot.
        const char *peerName = "unknown";
        if ( iPeer < pnodesMax && Node[iPeer] != NULL )
        {
            peerName = Node[iPeer]->GetName();
        }

        trace_printf( "%s@%d epoll_ctl( efd=%d,%s, fd=%d(%s), %s )\n"
                    , method_name, __LINE__
                    , efd
                    , EpollOpString(EPOLL_CTL_DEL)
                    , fd, peerName
                    , EpollEventString(event->events) );
    }

    // Remove old socket from epoll set, it may not be there
    int rc = epoll_ctl( efd, EPOLL_CTL_DEL, fd, event );
    if ( rc == -1 )
    {
        int err = errno;
        if (err != ENOENT)
        {
            char ebuff[256];
            char buf[MON_STRING_BUF_SIZE];
            snprintf( buf, sizeof(buf), "[%s@%d] epoll_ctl(efd=%d, %s, fd=%d, %s) error: %s\n"
                    , method_name, __LINE__
                    , efd
                    , EpollOpString(EPOLL_CTL_DEL)
                    , fd
                    , EpollEventString(event->events)
                    , strerror_r( err, ebuff, 256 ) );
            mon_log_write( MON_CLUSTER_EPOLLCTLDELETE_1, SQ_LOG_CRIT, buf );

            MPI_Abort( MPI_COMM_SELF,99 );
        }
    }

    TRACE_EXIT;
    return;
}
// Establishes the sync socket mesh between all ranks of MPI_COMM_WORLD.
//
//   worldSize  - number of ranks taking part
//   myRank     - rank of the calling process
//   nodeNames  - array of worldSize node names, MPI_MAX_PROCESSOR_NAME bytes
//                apart; when NULL every peer is reached through "localhost"
//   rankToPnid - maps a rank to the index used in socks_
//
// Sync ports are exchanged with MPI_Allgather(). For every rank pair (i, j),
// i < j, rank i connects (MkCltSock) and rank j accepts (AcceptSock); the
// resulting socket is stored in socks_[rankToPnid[peer]] and switched to
// non-blocking mode. All ranks meet at an MPI_Barrier() after each pair.
// Every failure is logged as critical and followed by MPI_Abort().
void CCluster::InitClusterSocks( int worldSize, int myRank, char *nodeNames, int *rankToPnid )
{
    const char method_name[] = "CCluster::InitClusterSocks";
    TRACE_ENTRY;

    int serverSyncPort;
    CNode *node;

    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
    {
        trace_printf( "%s@%d worldSize=%d, myRank=%d\n"
                    , method_name, __LINE__
                    , worldSize, myRank);
    }

    // Exchange ports with collective
    serverSyncPort = MyNode->GetSyncSocketPort();
    int rc = MPI_Allgather( &serverSyncPort, 1, MPI_INT,
                            sockPorts_, 1, MPI_INT, MPI_COMM_WORLD );
    if ( rc != MPI_SUCCESS )
    {
        char buf[MON_STRING_BUF_SIZE];
        snprintf( buf, sizeof(buf), "[%s@%d] MPI_Allgather error=%s\n",
                  method_name, __LINE__, ErrorMsg( rc ) );
        mon_log_write( MON_CLUSTER_INITCLUSTERSOCKS_3, SQ_LOG_CRIT, buf );

        MPI_Abort( MPI_COMM_SELF,99 );
    }

#ifdef NAMESERVER_PROCESS
    if ( !IsRealCluster )
    {
        for ( int i = 0; i < worldSize; i++ )
            sockPorts_[i] = syncPort_ + i;
    }
#endif

    char *n, nodeName[MPI_MAX_PROCESSOR_NAME];
    unsigned char srcaddr[4];
    unsigned char dstaddr[4] = { 0, 0, 0, 0 };
    struct hostent *he;

    if ( nodeNames )
    {
        n = &nodeNames[myRank*MPI_MAX_PROCESSOR_NAME];
    }
    else
    {
        strcpy( nodeName, "localhost" );
        n = nodeName;
    }

    // Get my host structure via my node name or localhost
    he = gethostbyname( n );
    if ( !he )
    {
        char buf[MON_STRING_BUF_SIZE];
        // gethostbyname() reports failures through h_errno, which must be
        // decoded with hstrerror(), not with the errno message table.
        snprintf( buf, sizeof(buf), "[%s@%d] gethostbyname(%s) error: %s\n",
                  method_name, __LINE__, n, hstrerror( h_errno ) );
        mon_log_write( MON_CLUSTER_INITCLUSTERSOCKS_4, SQ_LOG_CRIT, buf );

        MPI_Abort( MPI_COMM_SELF,99 );
    }

    // Initialize my source address structure
    memcpy( srcaddr, he->h_addr, 4 );

    int idst;

    // Loop on each node in the cluster
    for ( int i = 0; i < worldSize; i++ )
    {
        // Loop on each adjacent node in the cluster
        for ( int j = i+1; j < worldSize; j++ )
        {
            if ( i == myRank )
            { // Current [i] node is my node, so connect to [j] node
                idst = j;
                if ( nodeNames )
                { // Real cluster
                    n = &nodeNames[j*MPI_MAX_PROCESSOR_NAME];

                    // Get peer's host structure via its node name
                    he = gethostbyname( n );
                    if ( !he )
                    {
                        char buf[MON_STRING_BUF_SIZE];
                        snprintf( buf, sizeof(buf),
                                  "[%s@%d] gethostbyname(%s) error: %s\n",
                                  method_name, __LINE__, n,
                                  hstrerror( h_errno ) );
                        mon_log_write( MON_CLUSTER_INITCLUSTERSOCKS_5, SQ_LOG_CRIT, buf );

                        MPI_Abort( MPI_COMM_SELF,99 );
                    }

                    // Initialize peer's destination address structure
                    memcpy( dstaddr, he->h_addr, 4 );

                    node = Nodes->GetNode( n );
                    if ( node )
                    { // Save peer's port in its node object
                        node->SetSyncSocketPort(sockPorts_[j]);
                    }
                }
                else
                { // Virtual cluster. Same source and destination addresses
                    node = NULL;
                    memcpy( dstaddr, srcaddr, 4 );
                }

                if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
                {
                    trace_printf( "%s@%d Creating client socket: src=%d.%d.%d.%d, dst(%s)=%d.%d.%d.%d, dst port=%d\n"
                                , method_name, __LINE__
                                , (int)((unsigned char *)srcaddr)[0]
                                , (int)((unsigned char *)srcaddr)[1]
                                , (int)((unsigned char *)srcaddr)[2]
                                , (int)((unsigned char *)srcaddr)[3]
                                , n
                                , (int)((unsigned char *)dstaddr)[0]
                                , (int)((unsigned char *)dstaddr)[1]
                                , (int)((unsigned char *)dstaddr)[2]
                                , (int)((unsigned char *)dstaddr)[3]
                                , sockPorts_[j] );
                }

                // Connect to peer
                socks_[rankToPnid[j]] = MkCltSock( srcaddr, dstaddr, sockPorts_[j] ); // InitClusterSocks
            }
            else if ( j == myRank )
            { // Current [j] peer my node, accept connection from peer [i] node
                if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
                {
                    trace_printf( "%s@%d Accepting server socket: src=%d.%d.%d.%d, port=%d\n"
                                , method_name, __LINE__
                                , (int)((unsigned char *)srcaddr)[0]
                                , (int)((unsigned char *)srcaddr)[1]
                                , (int)((unsigned char *)srcaddr)[2]
                                , (int)((unsigned char *)srcaddr)[3]
                                , serverSyncPort );
                }

                idst = i;
                // Accept connection from peer [i]
                socks_[rankToPnid[i]] = AcceptSock( syncSock_ ); // InitClusterSocks
            }
            else
            { // This pair does not involve my node
                idst = -1;
            }

            if ( idst >= 0 && socks_[rankToPnid[idst]] < 0 )
            {
                char buf[MON_STRING_BUF_SIZE];
                if ( idst == j )
                { // idst == j is the client (connect) side; dstaddr is valid here
                    snprintf( buf, sizeof(buf), "[%s@%d] mkcltsock src=%d.%d.%d.%d dst=%d.%d.%d.%d failed\n",
                              method_name, __LINE__,
                              (int)((unsigned char *)srcaddr)[0],
                              (int)((unsigned char *)srcaddr)[1],
                              (int)((unsigned char *)srcaddr)[2],
                              (int)((unsigned char *)srcaddr)[3],
                              (int)((unsigned char *)dstaddr)[0],
                              (int)((unsigned char *)dstaddr)[1],
                              (int)((unsigned char *)dstaddr)[2],
                              (int)((unsigned char *)dstaddr)[3] );
                }
                else
                { // idst == i is the server (accept) side
                    snprintf( buf, sizeof(buf), "[%s@%d] acceptsock(%d) failed\n",
                              method_name, __LINE__, syncSock_ );
                }
                mon_log_write( MON_CLUSTER_INITCLUSTERSOCKS_6, SQ_LOG_CRIT, buf );

                MPI_Abort( MPI_COMM_SELF,99 );
            }

            if ( idst >= 0 && fcntl( socks_[rankToPnid[idst]], F_SETFL, O_NONBLOCK ) )
            {
                char ebuff[256];
                char buf[MON_STRING_BUF_SIZE];
                snprintf( buf, sizeof(buf), "[%s@%d] fcntl(NONBLOCK) error: %s\n",
                          method_name, __LINE__, strerror_r( errno, ebuff, 256 ) );
                mon_log_write( MON_CLUSTER_INITCLUSTERSOCKS_7, SQ_LOG_CRIT, buf );

                MPI_Abort( MPI_COMM_SELF,99 );
            }

            // Every rank takes part in the barrier, whether or not it was
            // one of the two endpoints of this pair.
            MPI_Barrier( MPI_COMM_WORLD );
        }
    }

    TRACE_EXIT;
}
void CCluster::InitServerSock( void )
{
const char method_name[] = "CCluster::InitServerSock";
TRACE_ENTRY;
int serverCommPort = 0;
int serverSyncPort = 0;
#ifdef NAMESERVER_PROCESS
int mon2nsPort = 0;
#else
int ptpPort = 0;
#endif
int val = 0;
unsigned char addr[4];
struct hostent *he;
he = gethostbyname( Node_name );
if ( !he )
{
char ebuff[256];
char buf[MON_STRING_BUF_SIZE];
snprintf( buf, sizeof(buf)
, "[%s@%d] gethostbyname(%s) error: %s\n"
, method_name, __LINE__
, Node_name, strerror_r( h_errno, ebuff, 256 ) );
mon_log_write( MON_CLUSTER_INITSERVERSOCK_1, SQ_LOG_CRIT, buf );
abort();
}
memcpy( addr, he->h_addr, 4 );
#ifdef NAMESERVER_PROCESS
char *env = getenv ("NS_COMM_PORT");
#else
char *env = getenv("MONITOR_COMM_PORT");
#endif
if ( env )
{
val = atoi(env);
if ( val > 0)
{
if ( !IsRealCluster )
{
val += MyPNID;
}
serverCommPort = val;
}
}
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
{
trace_printf( "%s@%d COMM_PORT Node_name=%s, env=%s, serverCommPort=%d, val=%d\n"
, method_name, __LINE__
, Node_name, env, serverCommPort, val );
}
commSock_ = MkSrvSock( &serverCommPort );
if ( commSock_ < 0 )
{
char ebuff[256];
char buf[MON_STRING_BUF_SIZE];
snprintf( buf, sizeof(buf)
#ifdef NAMESERVER_PROCESS
, "[%s@%d] MkSrvSock(NS_COMM_PORT=%d) error: %s\n"
#else
, "[%s@%d] MkSrvSock(MONITOR_COMM_PORT=%d) error: %s\n"
#endif
, method_name, __LINE__, serverCommPort
, strerror_r( errno, ebuff, 256 ) );
mon_log_write( MON_CLUSTER_INITSERVERSOCK_2, SQ_LOG_CRIT, buf );
abort();
}
else
{
snprintf( MyCommPort, sizeof(MyCommPort)
, "%d.%d.%d.%d:%d"
, (int)((unsigned char *)addr)[0]
, (int)((unsigned char *)addr)[1]
, (int)((unsigned char *)addr)[2]
, (int)((unsigned char *)addr)[3]
, serverCommPort );
MyNode->SetCommSocketPort( serverCommPort );
MyNode->SetCommPort( MyCommPort );
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
trace_printf( "%s@%d Initialized my comm socket port, "
"pnid=%d (%s:%s) (commPort=%s)\n"
, method_name, __LINE__
, MyPNID, MyNode->GetName(), MyCommPort
, MyNode->GetCommPort() );
}
#ifdef NAMESERVER_PROCESS
env = getenv("NS_SYNC_PORT");
#else
env = getenv("MONITOR_SYNC_PORT");
#endif
if ( env )
{
val = atoi(env);
if ( val > 0)
{
if ( !IsRealCluster )
{
val += MyPNID;
}
syncPort_ = serverSyncPort = val;
}
}
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
{
trace_printf( "%s@%d SYNC_PORT Node_name=%s, env=%s, serverSyncPort=%d, val=%d\n"
, method_name, __LINE__
, Node_name, env, syncPort_, val );
}
syncSock_ = MkSrvSock( &serverSyncPort );
if ( syncSock_ < 0 )
{
char ebuff[256];
char buf[MON_STRING_BUF_SIZE];
snprintf( buf, sizeof(buf)
#ifdef NAMESERVER_PROCESS
, "[%s@%d] MkSrvSock(NS_SYNC_PORT=%d) error: %s\n"
#else
, "[%s@%d] MkSrvSock(MONITOR_SYNC_PORT=%d) error: %s\n"
#endif
, method_name, __LINE__, serverSyncPort
, strerror_r( errno, ebuff, 256 ) );
mon_log_write( MON_CLUSTER_INITSERVERSOCK_3, SQ_LOG_CRIT, buf );
abort();
}
else
{
snprintf( MySyncPort, sizeof(MySyncPort)
, "%d.%d.%d.%d:%d"
, (int)((unsigned char *)addr)[0]
, (int)((unsigned char *)addr)[1]
, (int)((unsigned char *)addr)[2]
, (int)((unsigned char *)addr)[3]
, serverSyncPort );
MyNode->SetSyncSocketPort( serverSyncPort );
MyNode->SetSyncPort( MySyncPort );
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
trace_printf( "%s@%d Initialized my sync socket port, "
"pnid=%d (%s:%s) (syncPort=%s)\n"
, method_name, __LINE__
, MyPNID, MyNode->GetName(), MySyncPort
, MyNode->GetSyncPort() );
}
#ifdef NAMESERVER_PROCESS
env = getenv("NS_M2N_COMM_PORT");
if ( env )
{
val = atoi(env);
if ( val > 0)
{
if ( !IsRealCluster )
{
val += MyPNID;
}
mon2nsPort = val;
}
}
mon2nsSock_ = MkSrvSock( &mon2nsPort );
if ( mon2nsSock_ < 0 )
{
char ebuff[256];
char buf[MON_STRING_BUF_SIZE];
snprintf( buf, sizeof(buf)
, "[%s@%d] MkSrvSock(NS_M2N_COMM_PORT=%d) error: %s\n"
, method_name, __LINE__, mon2nsPort
, strerror_r( errno, ebuff, 256 ) );
mon_log_write( MON_CLUSTER_INITSERVERSOCK_4, SQ_LOG_CRIT, buf );
abort();
}
else
{
snprintf( MyMon2NsPort, sizeof(MyMon2NsPort)
, "%d.%d.%d.%d:%d"
, (int)((unsigned char *)addr)[0]
, (int)((unsigned char *)addr)[1]
, (int)((unsigned char *)addr)[2]
, (int)((unsigned char *)addr)[3]
, mon2nsPort );
MyNode->SetMon2NsPort( MyMon2NsPort );
MyNode->SetMon2NsSocketPort( mon2nsPort );
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
trace_printf( "%s@%d Initialized my mon2ns comm socket port, "
"pnid=%d (%s:%s) (Mon2NsPort=%s, Mon2NsSocketPort=%d)\n"
, method_name, __LINE__
, MyPNID, MyNode->GetName(), MyMon2NsPort
, MyNode->GetMon2NsPort()
, MyNode->GetMon2NsSocketPort() );
}
#else
if (NameServerEnabled)
{
env = getenv("MON2MON_COMM_PORT");
if ( env )
{
val = atoi(env);
if ( val > 0)
{
ptpPort = val;
}
}
else
{
char buf[MON_STRING_BUF_SIZE];
snprintf( buf, sizeof(buf)
, "[%s@%d] MON2MON_COMM_PORT environment variable is not set!\n"
, method_name, __LINE__ );
mon_log_write( MON_CLUSTER_INITSERVERSOCK_5, SQ_LOG_CRIT, buf );
abort();
}
// For virtual env, add PNid to the port so we can still test without collisions of port numbers
if (!IsRealCluster)
{
ptpPort += MyNode->GetPNid();
}
ptpSock_ = MkSrvSock( &ptpPort );
if ( ptpSock_ < 0 )
{
char ebuff[MON_STRING_BUF_SIZE];
char buf[MON_STRING_BUF_SIZE];
snprintf( buf, sizeof(buf)
, "[%s@%d] MkSrvSock(MON2MON_COMM_PORT=%d) error: %s\n"
, method_name, __LINE__, ptpPort
, strerror_r( errno, ebuff, MON_STRING_BUF_SIZE ) );
mon_log_write( MON_CLUSTER_INITSERVERSOCK_6, SQ_LOG_CRIT, buf );
abort();
}
else
{
snprintf( MyPtPPort, sizeof(MyPtPPort)
, "%d.%d.%d.%d:%d"
, (int)((unsigned char *)addr)[0]
, (int)((unsigned char *)addr)[1]
, (int)((unsigned char *)addr)[2]
, (int)((unsigned char *)addr)[3]
, ptpPort );
MyNode->SetPtPPort( MyPtPPort );
MyNode->SetPtPSocketPort( ptpPort );
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
trace_printf( "%s@%d Initialized my ptp socket port, "
"pnid=%d (%s:%s) (ptpPort=%s)\n"
, method_name, __LINE__
, MyPNID, MyNode->GetName(), MyPtPPort
, MyNode->GetPtPPort() );
}
}
#endif
epollFD_ = epoll_create1( EPOLL_CLOEXEC );
if ( epollFD_ < 0 )
{
char ebuff[256];
char buf[MON_STRING_BUF_SIZE];
snprintf( buf, sizeof(buf), "[%s@%d] epoll_create1() error: %s\n",
method_name, __LINE__, strerror_r( errno, ebuff, 256 ) );
mon_log_write( MON_CLUSTER_INITSERVERSOCK_7, SQ_LOG_CRIT, buf );
MPI_Abort( MPI_COMM_SELF,99 );
}
TRACE_EXIT;
}
// Accepts one pending connection on the comm listening socket (commSock_).
// Returns whatever AcceptSock() returns for that socket.
int CCluster::AcceptCommSock( void )
{
    const char method_name[] = "CCluster::AcceptCommSock";
    TRACE_ENTRY;

    const int acceptedFd = AcceptSock( commSock_ );

    TRACE_EXIT;
    return acceptedFd;
}
// Accepts one pending connection on the sync listening socket (syncSock_).
// Returns whatever AcceptSock() returns for that socket.
int CCluster::AcceptSyncSock( void )
{
    const char method_name[] = "CCluster::AcceptSyncSock";
    TRACE_ENTRY;

    const int acceptedFd = AcceptSock( syncSock_ );

    TRACE_EXIT;
    return acceptedFd;
}
#ifndef NAMESERVER_PROCESS
// Accepts one pending connection on the point-to-point listening socket
// (ptpSock_). Returns whatever AcceptSock() returns for that socket.
int CCluster::AcceptPtPSock( void )
{
    const char method_name[] = "CCluster::AcceptPtPSock";
    TRACE_ENTRY;

    const int acceptedFd = AcceptSock( ptpSock_ );

    TRACE_EXIT;
    return acceptedFd;
}
#endif
// Accepts one connection on listening socket 'sock' and configures the
// accepted socket with TCP_NODELAY and SO_REUSEADDR.
//
//   sock - listening socket descriptor
//
// Returns the accepted socket descriptor on success,
//         -1 when getsockname() fails (logged) or accept() fails (not logged
//            here; the accept() result is returned as is),
//         -2 when a setsockopt() call fails (logged; the accepted socket is
//            closed before returning).
// accept() is retried for as long as it is interrupted by a signal (EINTR).
int CCluster::AcceptSock( int sock )
{
    const char method_name[] = "CCluster::AcceptSock";
    TRACE_ENTRY;

    int csock; // connected socket
    struct sockaddr_in sockinfo; // socket address info

    // The length must describe the address buffer, not a pointer to it
    socklen_t size = sizeof(sockinfo);

    if ( getsockname( sock, (struct sockaddr *) &sockinfo, &size ) )
    {
        char buf[MON_STRING_BUF_SIZE];
        int err = errno;
        snprintf(buf, sizeof(buf), "[%s], getsockname() failed, errno=%d (%s).\n",
                 method_name, err, strerror(err));
        mon_log_write(MON_CLUSTER_ACCEPTSOCK_1, SQ_LOG_ERR, buf);
        return ( -1 );
    }

    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
    {
        unsigned char *addrp = (unsigned char *) &sockinfo.sin_addr.s_addr;
        trace_printf( "%s@%d - Accepting socket on addr=%d.%d.%d.%d, port=%d\n"
                    , method_name, __LINE__
                    , addrp[0]
                    , addrp[1]
                    , addrp[2]
                    , addrp[3]
                    , (int) ntohs( sockinfo.sin_port ) );
    }

    // Retry only when interrupted by a signal
    do
    {
        csock = accept( sock, (struct sockaddr *) 0, (socklen_t *) 0 );
    }
    while ( csock < 0 && errno == EINTR );

    // Descriptor 0 is a valid socket, so test for >= 0
    if ( csock >= 0 )
    {
        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
        {
            unsigned char *addrp = (unsigned char *) &sockinfo.sin_addr.s_addr;
            trace_printf( "%s@%d - Accepted socket on addr=%d.%d.%d.%d, port=%d, sock=%d\n"
                        , method_name, __LINE__
                        , addrp[0]
                        , addrp[1]
                        , addrp[2]
                        , addrp[3]
                        , (int) ntohs( sockinfo.sin_port )
                        , csock );
        }

        int nodelay = 1;
        if ( setsockopt( csock
                       , IPPROTO_TCP
                       , TCP_NODELAY
                       , (char *) &nodelay
                       , sizeof(int) ) )
        {
            char buf[MON_STRING_BUF_SIZE];
            int err = errno;
            snprintf(buf, sizeof(buf), "[%s], setsockopt() failed, errno=%d (%s).\n",
                     method_name, err, strerror(err));
            mon_log_write(MON_CLUSTER_ACCEPTSOCK_2, SQ_LOG_ERR, buf);
            close( csock ); // the caller only sees -2, so release it here
            return ( -2 );
        }

        int reuse = 1;
        if ( setsockopt( csock
                       , SOL_SOCKET
                       , SO_REUSEADDR
                       , (char *) &reuse
                       , sizeof(int) ) )
        {
            char buf[MON_STRING_BUF_SIZE];
            int err = errno;
            snprintf(buf, sizeof(buf), "[%s], setsockopt() failed, errno=%d (%s).\n",
                     method_name, err, strerror(err));
            mon_log_write(MON_CLUSTER_ACCEPTSOCK_3, SQ_LOG_ERR, buf);
            close( csock ); // the caller only sees -2, so release it here
            return ( -2 );
        }
    }

    TRACE_EXIT;
    return ( csock );
}
int CCluster::Connect( const char *portName, bool doRetries )
{
const char method_name[] = "CCluster::Connect";
TRACE_ENTRY;
int sock; // socket
int ret; // returned value
int nodelay = 1; // sockopt reuse option
int reuse = 1; // sockopt reuse option
#if defined(_XOPEN_SOURCE_EXTENDED)
#ifdef __LP64__
socklen_t size; // size of socket address
#else
size_t size; // size of socket address
#endif
#else
int size; // size of socket address
#endif
static int retries = 0; // # times to retry connect
int outer_failures = 0; // # failed connect loops
int connect_failures = 0; // # failed connects
char *p; // getenv results
struct sockaddr_in sockinfo; // socket address info
struct hostent *he;
char host[1000];
const char *colon;
unsigned int port;
colon = strstr(portName, ":");
strcpy(host, portName);
int len = colon - portName;
host[len] = '\0';
port = atoi(&colon[1]);
size = sizeof(sockinfo);
if ( !retries )
{
p = getenv( "HPMP_CONNECT_RETRIES" );
if ( p ) retries = atoi( p );
else retries = 5;
}
for ( ;; )
{
sock = socket( AF_INET, SOCK_STREAM, 0 );
if ( sock < 0 )
{
char la_buf[MON_STRING_BUF_SIZE];
int err = errno;
sprintf( la_buf, "[%s], socket() failed! errno=%d (%s)\n"
, method_name, err, strerror( err ));
mon_log_write(MON_CLUSTER_CONNECT_1, SQ_LOG_CRIT, la_buf);
abort();
}
he = gethostbyname( host );
if ( !he )
{
char ebuff[256];
char buf[MON_STRING_BUF_SIZE];
snprintf( buf, sizeof(buf), "[%s@%d] gethostbyname(%s) error: %s\n",
method_name, __LINE__, host, strerror_r( h_errno, ebuff, 256 ) );
mon_log_write( MON_CLUSTER_CONNECT_2, SQ_LOG_CRIT, buf );
abort();
}
// Connect socket.
memset( (char *) &sockinfo, 0, size );
memcpy( (char *) &sockinfo.sin_addr, (char *) he->h_addr, 4 );
sockinfo.sin_family = AF_INET;
sockinfo.sin_port = htons( (unsigned short) port );
// Note the outer loop uses "retries" from HPMP_CONNECT_RETRIES,
// and has a yield between each retry, since it's more oriented
// toward failures from network overload and putting a pause
// between retries. This inner loop should only iterate when
// a signal interrupts the local process, so it doesn't pause
// or use the same HPMP_CONNECT_RETRIES count.
connect_failures = 0;
ret = 1;
while ( ret != 0 && connect_failures <= 10 )
{
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
{
if (doRetries)
{
trace_printf( "%s@%d - Connecting to %s, addr=%d.%d.%d.%d, port=%d, connect_failures=%d\n"
, method_name, __LINE__
, portName
, (int)((unsigned char *)he->h_addr)[0]
, (int)((unsigned char *)he->h_addr)[1]
, (int)((unsigned char *)he->h_addr)[2]
, (int)((unsigned char *)he->h_addr)[3]
, port
, connect_failures );
}
else
{
trace_printf( "%s@%d - Connecting to %s, addr=%d.%d.%d.%d, port=%d\n"
, method_name, __LINE__
, portName
, (int)((unsigned char *)he->h_addr)[0]
, (int)((unsigned char *)he->h_addr)[1]
, (int)((unsigned char *)he->h_addr)[2]
, (int)((unsigned char *)he->h_addr)[3]
, port );
}
}
ret = connect( sock, (struct sockaddr *) &sockinfo, size );
if ( ret == 0 ) break;
if ( errno == EINTR )
{
++connect_failures;
}
#ifdef NAMESERVER_PROCESS
else if ( errno == ECONNREFUSED )
{
++connect_failures;
sleep( 1 );
}
#endif
else
{
char la_buf[MON_STRING_BUF_SIZE];
int err = errno;
sprintf( la_buf, "[%s], connect(%s) failed! errno=%d (%s)\n"
, method_name, portName, err, strerror( err ));
mon_log_write(MON_CLUSTER_CONNECT_3, SQ_LOG_ERR, la_buf);
close(sock);
return ( -1 );
}
}
if ( ret == 0 ) break;
if (doRetries == false)
{
close( sock );
return( -1 );
}
// For large clusters, the connect/accept calls seem to fail occasionally,
// no doubt do to the large number (1000's) of simultaneous connect packets
// flooding the network at once. So, we retry up to HPMP_CONNECT_RETRIES
// number of times.
if ( errno != EINTR )
{
if ( ++outer_failures > retries )
{
char la_buf[MON_STRING_BUF_SIZE];
sprintf( la_buf, "[%s], connect(%s) exceeded retries! count=%d\n"
, method_name, portName, retries);
mon_log_write(MON_CLUSTER_CONNECT_4, SQ_LOG_ERR, la_buf);
close( sock );
return ( -1 );
}
struct timespec req, rem;
req.tv_sec = 0;
req.tv_nsec = 500000;
nanosleep( &req, &rem );
}
close( sock );
}
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
{
trace_printf( "%s@%d - Connected to %s addr=%d.%d.%d.%d, port=%d, sock=%d\n"
, method_name, __LINE__
, host
, (int)((unsigned char *)he->h_addr)[0]
, (int)((unsigned char *)he->h_addr)[1]
, (int)((unsigned char *)he->h_addr)[2]
, (int)((unsigned char *)he->h_addr)[3]
, port
, sock );
}
if ( setsockopt( sock, IPPROTO_TCP, TCP_NODELAY, (char *) &nodelay, sizeof(int) ) )
{
char la_buf[MON_STRING_BUF_SIZE];
int err = errno;
sprintf( la_buf, "[%s], setsockopt() failed! errno=%d (%s)\n"
, method_name, err, strerror( err ));
mon_log_write(MON_CLUSTER_CONNECT_5, SQ_LOG_ERR, la_buf);
close( sock );
return ( -2 );
}
if ( setsockopt( sock, SOL_SOCKET, SO_REUSEADDR, (char *) &reuse, sizeof(int) ) )
{
char la_buf[MON_STRING_BUF_SIZE];
int err = errno;
sprintf( la_buf, "[%s], setsockopt() failed! errno=%d (%s)\n"
, method_name, err, strerror( err ));
mon_log_write(MON_CLUSTER_CONNECT_6, SQ_LOG_ERR, la_buf);
close( sock );
return ( -2 );
}
TRACE_EXIT;
return ( sock );
}
#ifdef NAMESERVER_PROCESS
// Makes a loopback connection to this node's own mon2ns socket port by
// handing MyNode's Mon2Ns socket port to Connect(int).
void CCluster::ConnectToMon2NsCommSelf( void )
{
    const char method_name[] = "CCluster::ConnectToMon2NsCommSelf";
    TRACE_ENTRY;

    const int ownMon2NsPort = MyNode->GetMon2NsSocketPort();
    Connect( ownMon2NsPort );

    TRACE_EXIT;
}
#else
// Makes a loopback connection to this node's own point-to-point socket port
// by handing MyNode's PtP socket port to Connect(int).
void CCluster::ConnectToPtPCommSelf( void )
{
    const char method_name[] = "CCluster::ConnectToPtPCommSelf";
    TRACE_ENTRY;

    const int ownPtPPort = MyNode->GetPtPSocketPort();
    Connect( ownPtPPort );

    TRACE_EXIT;
}
#endif
// Makes a loopback connection to this node's own comm socket port by handing
// MyNode's comm socket port to Connect(int).
void CCluster::ConnectToSelf( void )
{
    const char method_name[] = "CCluster::ConnectToSelf";
    TRACE_ENTRY;

    const int ownCommPort = MyNode->GetCommSocketPort();
    Connect( ownCommPort );

    TRACE_EXIT;
}
void CCluster::Connect( int socketPort )
{
const char method_name[] = "CCluster::Connect";
TRACE_ENTRY;
int sock; // socket
int ret; // returned value
#if defined(_XOPEN_SOURCE_EXTENDED)
#ifdef __LP64__
socklen_t size; // size of socket address
#else
size_t size; // size of socket address
#endif
#else
int size; // size of socket address
#endif
static int retries = 0; // # times to retry connect
int connect_failures = 0; // # failed connects
char *p; // getenv results
struct sockaddr_in sockinfo; // socket address info
struct hostent *he;
size = sizeof(sockinfo);
if ( !retries )
{
p = getenv( "HPMP_CONNECT_RETRIES" );
if ( p ) retries = atoi( p );
else retries = 5;
}
sock = socket( AF_INET, SOCK_STREAM, 0 );
if ( sock < 0 )
{
char la_buf[MON_STRING_BUF_SIZE];
int err = errno;
sprintf( la_buf, "[%s], socket() failed! errno=%d (%s)\n"
, method_name, err, strerror( err ));
mon_log_write(MON_CLUSTER_CONNECTTOSELF_1, SQ_LOG_CRIT, la_buf);
MPI_Abort( MPI_COMM_SELF,99 );
}
he = gethostbyname( "localhost" );
if ( !he )
{
char ebuff[256];
char buf[MON_STRING_BUF_SIZE];
snprintf( buf, sizeof(buf), "[%s@%d] gethostbyname(%s) error: %s\n",
method_name, __LINE__, "localhost", strerror_r( h_errno, ebuff, 256 ) );
mon_log_write( MON_CLUSTER_CONNECTTOSELF_2, SQ_LOG_CRIT, buf );
MPI_Abort( MPI_COMM_SELF,99 );
}
// Connect socket.
memset( (char *) &sockinfo, 0, size );
memcpy( (char *) &sockinfo.sin_addr, (char *) he->h_addr, 4 );
sockinfo.sin_family = AF_INET;
sockinfo.sin_port = htons( (unsigned short) socketPort );
connect_failures = 0;
ret = 1;
while ( ret != 0 && connect_failures <= 10 )
{
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
{
trace_printf( "%s@%d - Connecting to localhost addr=%d.%d.%d.%d, port=%d, connect_failures=%d\n"
, method_name, __LINE__
, (int)((unsigned char *)he->h_addr)[0]
, (int)((unsigned char *)he->h_addr)[1]
, (int)((unsigned char *)he->h_addr)[2]
, (int)((unsigned char *)he->h_addr)[3]
, socketPort
, connect_failures );
}
ret = connect( sock, (struct sockaddr *) &sockinfo, size );
if ( ret == 0 ) break;
if ( errno == EINTR )
{
++connect_failures;
}
else
{
char la_buf[MON_STRING_BUF_SIZE];
int err = errno;
sprintf( la_buf, "[%s], connect() failed! errno=%d (%s)\n"
, method_name, err, strerror( err ));
mon_log_write(MON_CLUSTER_CONNECTTOSELF_3, SQ_LOG_CRIT, la_buf);
MPI_Abort( MPI_COMM_SELF,99 );
}
}
close( sock );
TRACE_EXIT;
}
// Creates a listening TCP server socket bound to INADDR_ANY.
//
//   pport - in:  port to bind to; 0 (or a NULL pointer) passes port 0 to
//                bind()
//           out: when not NULL, receives the port actually bound, as
//                reported by getsockname()
//
// Returns the listening socket descriptor on success,
//         -1 when socket(), setsockopt(SO_REUSEADDR), bind(), getsockname()
//            or listen() fails,
//         -2 when setsockopt(TCP_NODELAY) fails, or the value returned by
//            SetKeepAliveSockOpt() when that fails.
// Every failure is logged and the socket is closed before returning.
// bind() is retried up to 4 times, 5 seconds apart, while it fails with
// EADDRINUSE.
int CCluster::MkSrvSock( int *pport )
{
    const char method_name[] = "CCluster::MkSrvSock";
    TRACE_ENTRY;

    int sock; // socket
    int err; // return code
    socklen_t size; // size of socket address
    struct sockaddr_in sockinfo; // socket address info

    // pport is optional (see the NULL checks further down), so never
    // dereference it unconditionally.
    const int requestedPort = pport ? *pport : 0;

    sock = socket( AF_INET, SOCK_STREAM, 0 );
    if ( sock < 0 )
    {
        char la_buf[MON_STRING_BUF_SIZE];
        int lv_errno = errno;
        snprintf( la_buf, sizeof(la_buf), "[%s], socket() failed! errno=%d (%s)\n"
                , method_name, lv_errno, strerror( lv_errno ));
        mon_log_write(MON_CLUSTER_MKSRVSOCK_1, SQ_LOG_CRIT, la_buf);
        return ( -1 );
    }

    int nodelay = 1; // sockopt nodelay option
    if ( setsockopt( sock, IPPROTO_TCP, TCP_NODELAY, (char *) &nodelay, sizeof(int) ) )
    {
        char la_buf[MON_STRING_BUF_SIZE];
        int lv_errno = errno;
        snprintf( la_buf, sizeof(la_buf), "[%s], setsockopt() failed! errno=%d (%s)\n"
                , method_name, lv_errno, strerror( lv_errno ));
        mon_log_write(MON_CLUSTER_MKSRVSOCK_2, SQ_LOG_ERR, la_buf);
        close( sock );
        return ( -2 );
    }

    int reuse = 1; // sockopt reuse option
    if ( setsockopt( sock, SOL_SOCKET, SO_REUSEADDR, (char *) &reuse, sizeof(int) ) )
    {
        char la_buf[MON_STRING_BUF_SIZE];
        int lv_errno = errno;
        snprintf( la_buf, sizeof(la_buf), "[%s], setsockopt(SO_REUSEADDR) failed! errno=%d (%s)\n"
                , method_name, lv_errno, strerror( lv_errno ));
        mon_log_write(MON_CLUSTER_MKSRVSOCK_3, SQ_LOG_ERR, la_buf);
        close( sock );
        return ( -1 );
    }

    // Bind socket.
    size = sizeof(sockinfo);
    memset( (char *) &sockinfo, 0, size );
    sockinfo.sin_family = AF_INET;
    sockinfo.sin_addr.s_addr = htonl( INADDR_ANY );
    sockinfo.sin_port = htons( requestedPort );

    int lv_bind_tries = 0;
    int lv_bind_errno = 0;
    do
    {
        if (lv_bind_tries > 0)
        {
            sleep(5);
        }
        err = bind( sock, (struct sockaddr *) &sockinfo, size );
        // Save errno right away; sched_yield() below must not be able to
        // change the retry decision or the reported error.
        lv_bind_errno = err ? errno : 0;
        sched_yield( );
    } while ( err &&
              (lv_bind_errno == EADDRINUSE) &&
              (++lv_bind_tries < 4) );

    if ( err )
    {
        char la_buf[MON_STRING_BUF_SIZE];
        snprintf( la_buf, sizeof(la_buf), "[%s], bind() failed! port=%d, errno=%d (%s)\n"
                , method_name, requestedPort, lv_bind_errno, strerror( lv_bind_errno ));
        mon_log_write(MON_CLUSTER_MKSRVSOCK_4, SQ_LOG_CRIT, la_buf);
        close( sock );
        return ( -1 );
    }

    if ( pport )
    {
        // Report the port that was actually bound back to the caller
        if ( getsockname( sock, (struct sockaddr *) &sockinfo, &size ) )
        {
            char la_buf[MON_STRING_BUF_SIZE];
            int lv_errno = errno;
            snprintf( la_buf, sizeof(la_buf), "[%s], getsockname() failed! errno=%d (%s)\n"
                    , method_name, lv_errno, strerror( lv_errno ));
            mon_log_write(MON_CLUSTER_MKSRVSOCK_5, SQ_LOG_CRIT, la_buf);
            close( sock );
            return ( -1 );
        }

        *pport = (int) ntohs( sockinfo.sin_port );
    }

    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
    {
        unsigned char *addrp = (unsigned char *) &sockinfo.sin_addr.s_addr;
        trace_printf( "%s@%d listening on addr=%d.%d.%d.%d, port=%d\n"
                    , method_name, __LINE__
                    , addrp[0]
                    , addrp[1]
                    , addrp[2]
                    , addrp[3]
                    , pport?*pport:0);
    }

    // SetKeepAliveSockOpt() closes the socket itself when it fails
    int lv_retcode = SetKeepAliveSockOpt( sock );
    if ( lv_retcode != 0 )
    {
        return lv_retcode;
    }

    // Listen
    if ( listen( sock, SOMAXCONN ) )
    {
        char la_buf[MON_STRING_BUF_SIZE];
        int lv_errno = errno;
        snprintf( la_buf, sizeof(la_buf), "[%s], listen() failed! errno=%d (%s)\n"
                , method_name, lv_errno, strerror( lv_errno ));
        mon_log_write(MON_CLUSTER_MKSRVSOCK_6, SQ_LOG_CRIT, la_buf);
        close( sock );
        return ( -1 );
    }

    TRACE_EXIT;
    return ( sock );
}
// Creates a client TCP socket connected to the endpoint named by 'portName'.
//
//   portName - endpoint in "host:port" form
//
// Returns the connected socket descriptor (TCP_NODELAY and SO_REUSEADDR set),
//         -1 when portName is malformed, or socket(), gethostbyname() or
//            connect() fails, or the retry budget is exhausted,
//         -2 when a setsockopt() call fails.
// All failures are logged; no failure aborts the process.
//
// Retry scheme: the inner loop repeats connect() only while it is interrupted
// by a signal (EINTR), up to 11 attempts; the outer loop then starts over
// with a fresh socket. Rounds that ended with an error other than EINTR are
// limited by HPMP_CONNECT_RETRIES (default 5).
int CCluster::MkCltSock( const char *portName )
{
    const char method_name[] = "CCluster::MkCltSock1";
    TRACE_ENTRY;

    int sock; // socket
    int ret; // returned value
    int reuse = 1; // sockopt reuse option
    int nodelay = 1; // sockopt nodelay option
    static int retries = 0; // # times to retry connect
    int outer_failures = 0; // # failed connect loops
    int connect_failures = 0; // # failed connects
    int connErr = 0; // errno of the most recent failed connect()
    char *p; // getenv results
    struct sockaddr_in sockinfo; // socket address info
    struct hostent *he;
    char host[1000];
    unsigned int port;
    const socklen_t size = sizeof(sockinfo); // size of socket address

    // Split "host:port". Reject input without a ':' or with a host part that
    // does not fit the local buffer instead of reading/writing out of bounds.
    const char *colon = portName ? strchr( portName, ':' ) : NULL;
    if ( colon == NULL || (size_t)(colon - portName) >= sizeof(host) )
    {
        char la_buf[MON_STRING_BUF_SIZE];
        snprintf( la_buf, sizeof(la_buf)
                , "[%s], invalid port name (%s), expected host:port\n"
                , method_name, portName ? portName : "(null)" );
        mon_log_write(MON_CLUSTER_MKCLTSOCK_2, SQ_LOG_ERR, la_buf);
        return ( -1 );
    }
    const size_t len = (size_t)(colon - portName);
    memcpy( host, portName, len );
    host[len] = '\0';
    port = atoi(&colon[1]);

    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
    {
        trace_printf( "%s@%d - Connecting to %s:%d\n"
                    , method_name, __LINE__
                    , host
                    , port );
    }

    if ( !retries )
    {
        p = getenv( "HPMP_CONNECT_RETRIES" );
        if ( p ) retries = atoi( p );
        else retries = 5;
    }

    for ( ;; )
    {
        sock = socket( AF_INET, SOCK_STREAM, 0 );
        if ( sock < 0 )
        {
            char la_buf[MON_STRING_BUF_SIZE];
            int err = errno;
            snprintf( la_buf, sizeof(la_buf)
                    , "[%s], socket() failed! errno=%d (%s)\n"
                    , method_name, err, strerror( err ));
            mon_log_write(MON_CLUSTER_MKCLTSOCK_1, SQ_LOG_ERR, la_buf);
            return ( -1 );
        }

        he = gethostbyname( host );
        if ( !he )
        {
            char la_buf[MON_STRING_BUF_SIZE];
            int err = h_errno;
            // h_errno values are resolver codes; decode them with hstrerror(),
            // not with strerror().
            snprintf( la_buf, sizeof(la_buf),
                      "[%s] gethostbyname(%s) failed! errno=%d (%s)\n"
                    , method_name, host, err, hstrerror( err ));
            mon_log_write(MON_CLUSTER_MKCLTSOCK_2, SQ_LOG_ERR, la_buf);
            close( sock );
            return ( -1 );
        }

        // Connect socket.
        memset( (char *) &sockinfo, 0, size );
        memcpy( (char *) &sockinfo.sin_addr, (char *) he->h_addr, 4 );
        sockinfo.sin_family = AF_INET;
        sockinfo.sin_port = htons( (unsigned short) port );

        // Note the outer loop uses "retries" from HPMP_CONNECT_RETRIES,
        // and has a yield between each retry, since it's more oriented
        // toward failures from network overload and putting a pause
        // between retries. This inner loop should only iterate when
        // a signal interrupts the local process, so it doesn't pause
        // or use the same HPMP_CONNECT_RETRIES count.
        connect_failures = 0;
        ret = 1;
        while ( ret != 0 && connect_failures <= 10 )
        {
            if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
            {
                trace_printf( "%s@%d - Connecting to %s addr=%d.%d.%d.%d, port=%d, connect_failures=%d\n"
                            , method_name, __LINE__
                            , host
                            , (int)((unsigned char *)he->h_addr)[0]
                            , (int)((unsigned char *)he->h_addr)[1]
                            , (int)((unsigned char *)he->h_addr)[2]
                            , (int)((unsigned char *)he->h_addr)[3]
                            , port
                            , connect_failures );
            }

            ret = connect( sock, (struct sockaddr *) &sockinfo, size );
            if ( ret == 0 ) break;

            // Remember why connect() failed; later calls (tracing) must not
            // influence the retry decision below.
            connErr = errno;
            if ( connErr == EINTR )
            {
                ++connect_failures;
            }
            else
            {
                char la_buf[MON_STRING_BUF_SIZE];
                snprintf( la_buf, sizeof(la_buf), "[%s], connect() failed! errno=%d (%s)\n"
                        , method_name, connErr, strerror( connErr ));
                mon_log_write(MON_CLUSTER_MKCLTSOCK_3, SQ_LOG_ERR, la_buf);
                close(sock);
                return ( -1 );
            }
        }

        if ( ret == 0 ) break;

        // For large clusters, the connect/accept calls seem to fail occasionally,
        // no doubt due to the large number (1000's) of simultaneous connect packets
        // flooding the network at once. So, we retry up to HPMP_CONNECT_RETRIES
        // number of times.
        if ( connErr != EINTR )
        {
            if ( ++outer_failures > retries )
            {
                char la_buf[MON_STRING_BUF_SIZE];
                snprintf( la_buf, sizeof(la_buf), "[%s], connect() exceeded retries! count=%d\n"
                        , method_name, retries);
                mon_log_write(MON_CLUSTER_MKCLTSOCK_4, SQ_LOG_ERR, la_buf);
                close( sock );
                return ( -1 );
            }
            struct timespec req, rem;
            req.tv_sec = 0;
            req.tv_nsec = 500000;
            nanosleep( &req, &rem );
        }
        // Start the next round with a fresh socket
        close( sock );
    }

    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
    {
        trace_printf( "%s@%d - Connected to %s addr=%d.%d.%d.%d, port=%d, sock=%d\n"
                    , method_name, __LINE__
                    , host
                    , (int)((unsigned char *)he->h_addr)[0]
                    , (int)((unsigned char *)he->h_addr)[1]
                    , (int)((unsigned char *)he->h_addr)[2]
                    , (int)((unsigned char *)he->h_addr)[3]
                    , port
                    , sock );
    }

    if ( setsockopt( sock, IPPROTO_TCP, TCP_NODELAY, (char *) &nodelay, sizeof(int) ) )
    {
        char la_buf[MON_STRING_BUF_SIZE];
        int err = errno;
        snprintf( la_buf, sizeof(la_buf), "[%s], setsockopt() failed! errno=%d (%s)\n"
                , method_name, err, strerror( err ));
        mon_log_write(MON_CLUSTER_MKCLTSOCK_5, SQ_LOG_ERR, la_buf);
        close( sock );
        return ( -2 );
    }

    if ( setsockopt( sock, SOL_SOCKET, SO_REUSEADDR, (char *) &reuse, sizeof(int) ) )
    {
        char la_buf[MON_STRING_BUF_SIZE];
        int err = errno;
        snprintf( la_buf, sizeof(la_buf), "[%s], setsockopt() failed! errno=%d (%s)\n"
                , method_name, err, strerror( err ));
        mon_log_write(MON_CLUSTER_MKCLTSOCK_6, SQ_LOG_ERR, la_buf);
        close( sock );
        return ( -2 );
    }

    TRACE_EXIT;
    return ( sock );
}
// Applies TCP keepalive settings to 'sock' when keepalive has been enabled
// through the environment.
//
// Environment variables (read with getenv/atoi):
//   SQ_MON_KEEPALIVE  - keepalive is applied only when this evaluates to 1
//   SQ_MON_KEEPIDLE   - TCP_KEEPIDLE value  (default 120)
//   SQ_MON_KEEPINTVL  - TCP_KEEPINTVL value (default 12)
//   SQ_MON_KEEPCNT    - TCP_KEEPCNT value   (default 5)
// The values are kept in function-local statics; the environment is consulted
// again on a later call only while the enable flag is still -1.
//
// Returns 0 when nothing had to be done or every option was set.
// Returns -2 when a setsockopt() call fails; in that case the failure is
// logged and 'sock' is closed before returning.
int CCluster::SetKeepAliveSockOpt( int sock )
{
    const char method_name[] = "CCluster::SetKeepAliveSockOpt";
    TRACE_ENTRY;

    static int keepAliveOn    = -1;   // -1: not yet taken from the environment
    static int keepIdleValue  = 120;
    static int keepIntvlValue = 12;
    static int keepCntValue   = 5;

    if ( keepAliveOn == -1 )
    {
        const char *envText = getenv( "SQ_MON_KEEPALIVE" );
        if ( envText != NULL )
        {
            keepAliveOn = atoi( envText );
        }

        // The tuning values are only looked up once keepalive is enabled.
        if ( keepAliveOn == 1 )
        {
            envText = getenv( "SQ_MON_KEEPIDLE" );
            if ( envText != NULL )
            {
                keepIdleValue = atoi( envText );
            }

            envText = getenv( "SQ_MON_KEEPINTVL" );
            if ( envText != NULL )
            {
                keepIntvlValue = atoi( envText );
            }

            envText = getenv( "SQ_MON_KEEPCNT" );
            if ( envText != NULL )
            {
                keepCntValue = atoi( envText );
            }
        }
    }

    if ( keepAliveOn == 1 )
    {
        char la_buf[MON_STRING_BUF_SIZE];
        bool optionFailed = true;

        // Options are applied in order; the first failure is logged and
        // stops the chain.
        if ( setsockopt( sock, SOL_SOCKET, SO_KEEPALIVE, &keepAliveOn, sizeof(int) ) != 0 )
        {
            int err = errno;
            sprintf( la_buf, "[%s], setsockopt so_keepalive() failed! errno=%d (%s)\n"
                   , method_name, err, strerror( err ) );
            mon_log_write( MON_CLUSTER_SETKEEPALIVESOCKOPT_1, SQ_LOG_ERR, la_buf );
        }
        else if ( setsockopt( sock, SOL_TCP, TCP_KEEPIDLE, &keepIdleValue, sizeof(int) ) != 0 )
        {
            int err = errno;
            sprintf( la_buf, "[%s], setsockopt tcp_keepidle() failed! errno=%d (%s)\n"
                   , method_name, err, strerror( err ) );
            mon_log_write( MON_CLUSTER_SETKEEPALIVESOCKOPT_2, SQ_LOG_ERR, la_buf );
        }
        else if ( setsockopt( sock, SOL_TCP, TCP_KEEPINTVL, &keepIntvlValue, sizeof(int) ) != 0 )
        {
            int err = errno;
            sprintf( la_buf, "[%s], setsockopt tcp_keepintvl() failed! errno=%d (%s)\n"
                   , method_name, err, strerror( err ) );
            mon_log_write( MON_CLUSTER_SETKEEPALIVESOCKOPT_3, SQ_LOG_ERR, la_buf );
        }
        else if ( setsockopt( sock, SOL_TCP, TCP_KEEPCNT, &keepCntValue, sizeof(int) ) != 0 )
        {
            int err = errno;
            sprintf( la_buf, "[%s], setsockopt tcp_keepcnt() failed! errno=%d (%s)\n"
                   , method_name, err, strerror( err ) );
            mon_log_write( MON_CLUSTER_SETKEEPALIVESOCKOPT_4, SQ_LOG_ERR, la_buf );
        }
        else
        {
            optionFailed = false;
        }

        if ( optionFailed )
        {
            close( sock );
            return ( -2 );
        }
    }

    TRACE_EXIT;
    return ( 0 );
}
// Creates a TCP client socket and connects it to dstip:port.
//
// Parameters:
//   srcip - optional local IPv4 address (4 bytes) to bind to before
//           connecting; pass NULL to skip the bind.
//   dstip - IPv4 address (4 bytes) of the peer.
//   port  - peer TCP port in host byte order.
//
// Returns the connected socket descriptor on success. On failure the socket
// is closed and a negative value is returned:
//   -1  socket(), bind() or connect() failed, or the retry limit was exceeded
//   -2  a setsockopt() call failed (including those in SetKeepAliveSockOpt)
//
// The outer retry limit comes from the HPMP_CONNECT_RETRIES environment
// variable (5 when it is not set).
int CCluster::MkCltSock( unsigned char srcip[4], unsigned char dstip[4], int port )
{
    const char method_name[] = "CCluster::MkCltSock2";
    TRACE_ENTRY;

    int    sock;        // socket
    int    ret;         // returned value
    int    reuse = 1;   // sockopt reuse option
    int    nodelay = 1; // sockopt nodelay option
#if defined(_XOPEN_SOURCE_EXTENDED)
#ifdef __LP64__
    socklen_t  size;    // size of socket address
#else
    size_t   size;      // size of socket address
#endif
#else
    int    size;        // size of socket address
#endif
    static int retries = 0;      // # times to retry connect
    int    outer_failures = 0;   // # failed connect loops
    int    connect_failures = 0; // # failed connects
    int    connect_errno = 0;    // errno of the most recent failed connect()
    char   *p;                   // getenv results
    struct sockaddr_in  sockinfo;    // socket address info

    size = sizeof(sockinfo);

    if ( !retries )
    {
        p = getenv( "HPMP_CONNECT_RETRIES" );
        if ( p ) retries = atoi( p );
        else retries = 5;
    }

    for ( ;; )
    {
        sock = socket( AF_INET, SOCK_STREAM, 0 );
        if ( sock < 0 ) return ( -1 );

        // Bind local address if specified.
        if ( srcip )
        {
            memset( (char *) &sockinfo, 0, size );
            // srcip is an array parameter and therefore a pointer here, so
            // sizeof(srcip) would be the pointer size, not 4. Copy exactly
            // the size of the destination address field instead.
            memcpy( (char *) &sockinfo.sin_addr,
                    (char *) srcip, sizeof(sockinfo.sin_addr) );
            sockinfo.sin_family = AF_INET;
            sockinfo.sin_port = 0;
            if ( bind( sock, (struct sockaddr *) &sockinfo, size ) )
            {
                char la_buf[MON_STRING_BUF_SIZE];
                int err = errno;
                sprintf( la_buf, "[%s], bind() failed! errno=%d (%s)\n"
                       , method_name, err, strerror( err ));
                mon_log_write(MON_CLUSTER_MKCLTSOCK_7, SQ_LOG_ERR, la_buf);
                close( sock );
                return ( -1 );
            }
        }

        // Connect socket.
        memset( (char *) &sockinfo, 0, size );
        memcpy( (char *) &sockinfo.sin_addr, (char *) dstip, sizeof(sockinfo.sin_addr) );
        sockinfo.sin_family = AF_INET;
        sockinfo.sin_port = htons( (unsigned short) port );

        // Note the outer loop uses "retries" from HPMP_CONNECT_RETRIES,
        // and has a yield between each retry, since it's more oriented
        // toward failures from network overload and putting a pause
        // between retries. This inner loop should only iterate when
        // a signal interrupts the local process, so it doesn't pause
        // or use the same HPMP_CONNECT_RETRIES count.
        connect_failures = 0;
        connect_errno = 0;
        ret = 1;
        while ( ret != 0 && connect_failures <= 10 )
        {
            if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
            {
                trace_printf( "%s@%d - Connecting to addr=%d.%d.%d.%d, port=%d, connect_failures=%d\n"
                            , method_name, __LINE__
                            , (int)dstip[0]
                            , (int)dstip[1]
                            , (int)dstip[2]
                            , (int)dstip[3]
                            , port
                            , connect_failures );
            }

            ret = connect( sock, (struct sockaddr *) &sockinfo,
                           size );
            if ( ret == 0 ) break;

            // Save errno right away: later calls (e.g. sleep) may change it
            // before the outer retry decision is made.
            connect_errno = errno;

            if ( connect_errno == EINTR )
            {
                ++connect_failures;
            }
#ifdef NAMESERVER_PROCESS
            else if ( connect_errno == ECONNREFUSED )
            {
                ++connect_failures;
                sleep( 1 );
            }
#endif
            else
            {
                char la_buf[MON_STRING_BUF_SIZE];
                sprintf( la_buf, "[%s], connect(%d.%d.%d.%d:%d) failed! errno=%d (%s)\n"
                       , method_name
                       , (int)((unsigned char *)dstip)[0]
                       , (int)((unsigned char *)dstip)[1]
                       , (int)((unsigned char *)dstip)[2]
                       , (int)((unsigned char *)dstip)[3]
                       , port
                       , connect_errno, strerror( connect_errno ));
                mon_log_write(MON_CLUSTER_MKCLTSOCK_8, SQ_LOG_ERR, la_buf);
                close(sock);
                return ( -1 );
            }
        }

        if ( ret == 0 ) break;

        // For large clusters, the connect/accept calls seem to fail occasionally,
        // no doubt due to the large number (1000's) of simultaneous connect packets
        // flooding the network at once. So, we retry up to HPMP_CONNECT_RETRIES
        // number of times.
        if ( connect_errno != EINTR )
        {
            if ( ++outer_failures > retries )
            {
                char la_buf[MON_STRING_BUF_SIZE];
                sprintf( la_buf, "[%s], connect() exceeded retries! count=%d\n"
                       , method_name, retries);
                mon_log_write(MON_CLUSTER_MKCLTSOCK_9, SQ_LOG_ERR, la_buf);
                close( sock );
                return ( -1 );
            }
            // Brief pause (500000 ns) before the next attempt.
            struct timespec req, rem;
            req.tv_sec = 0;
            req.tv_nsec = 500000;
            nanosleep( &req, &rem );
        }
        // Discard this socket; the next loop iteration creates a fresh one.
        close( sock );
    }

    if ( setsockopt( sock, IPPROTO_TCP, TCP_NODELAY, (char *) &nodelay, sizeof(int) ) )
    {
        char la_buf[MON_STRING_BUF_SIZE];
        int err = errno;
        sprintf( la_buf, "[%s], setsockopt() failed! errno=%d (%s)\n"
               , method_name, err, strerror( err ));
        mon_log_write(MON_CLUSTER_MKCLTSOCK_10, SQ_LOG_ERR, la_buf);
        close( sock );
        return ( -2 );
    }

    if ( setsockopt( sock, SOL_SOCKET, SO_REUSEADDR, (char *) &reuse, sizeof(int) ) )
    {
        char la_buf[MON_STRING_BUF_SIZE];
        int err = errno;
        sprintf( la_buf, "[%s], setsockopt() failed! errno=%d (%s)\n"
               , method_name, err, strerror( err ));
        mon_log_write(MON_CLUSTER_MKCLTSOCK_11, SQ_LOG_ERR, la_buf);
        close( sock );
        return ( -2 );
    }

    // SetKeepAliveSockOpt closes the socket itself when it fails, so the
    // error code is simply passed on.
    int lv_retcode = SetKeepAliveSockOpt( sock );
    if ( lv_retcode != 0 )
    {
        return lv_retcode;
    }

    TRACE_EXIT;
    return ( sock );
}
int CCluster::ReceiveMPI(char *buf, int size, int source, MonXChngTags tag, MPI_Comm comm)
{
const char method_name[] = "CCluster::ReceiveMPI";
TRACE_ENTRY;
MPI_Request request;
MPI_Status status;
int received = 0;
int error = MPI_Irecv(buf, size, MPI_CHAR, source, tag, comm, &request);
if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY))
trace_printf("%s@%d - Msg Received. Error = %d\n", method_name, __LINE__, error);
if (!error)
{
while (!received)
{
error = MPI_Test(&request, &received, &status);
if (!error)
{
if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY))
trace_printf("%s@%d - Msg Received Test. Flag = %d\n", method_name, __LINE__, received);
}
else
{
usleep(10000); // sleep 10ms and try again
}
}
}
TRACE_EXIT;
return error;
}
int CCluster::SendMPI(char *buf, int size, int source, MonXChngTags tag, MPI_Comm comm)
{
const char method_name[] = "CCluster::SendMPI";
TRACE_ENTRY;
MPI_Request request;
MPI_Status status;
int sent = 0;
int error = MPI_Isend(buf, size, MPI_CHAR, source, tag, comm, &request);
if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY))
trace_printf("%s@%d - Msg Sent. Error = %d\n", method_name, __LINE__, error);
if (!error)
{
while (!sent)
{
error = MPI_Test(&request, &sent, &status);
if (!error)
{
if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY))
trace_printf("%s@%d - Msg Sent Test. Flag = %d\n", method_name, __LINE__, sent);
}
else
{
usleep(10000); // sleep 10ms and try again
}
}
}
TRACE_EXIT;
return error;
}
// Reads exactly 'size' bytes from socket 'sockFd' into 'buf'.
//
// recv() is called repeatedly, advancing through the buffer, until all
// requested bytes have arrived, the peer closes the connection, or an error
// other than EINTR occurs. Received byte counts are reported to Meas.
//
// Parameters:
//   buf    - destination buffer, at least 'size' bytes
//   size   - number of bytes to read
//   sockFd - socket to read from
//   desc   - text included in trace output
//
// Returns 0 when 'size' bytes were read, ENODATA when recv() returned 0
// (end of stream) first, or the errno value of the failing recv().
int CCluster::ReceiveSock(char *buf, int size, int sockFd, const char *desc)
{
    const char method_name[] = "CCluster::ReceiveSock";
    TRACE_ENTRY;

    bool    readAgain = false;
    int     error = 0;
    int     readCount = 0;
    int     recvErrno = 0;
    int     received = 0;
    int     sizeCount = size;   // bytes still to be read

    do
    {
        readCount = (int) recv( sockFd
                              , buf
                              , sizeCount
                              , 0 );
        // Capture errno immediately; the tracing below may overwrite it
        // before the error branch gets to look at it.
        recvErrno = errno;

        if ( readCount > 0 ) Meas.addSockRcvdBytes( readCount );

        if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY))
        {
            trace_printf( "%s@%d - recv(%d), sock=%d, readCount=%d, desc=%s\n"
                        , method_name, __LINE__
                        , sizeCount
                        , sockFd
                        , readCount
                        , desc );
        }

        if ( readCount > 0 )
        { // Got data
            received += readCount;
            buf += readCount;
            if ( received == size )
            {
                readAgain = false;
            }
            else
            {
                sizeCount -= readCount;
                readAgain = true;
            }
        }
        else if ( readCount == 0 )
        { // EOF
            error = ENODATA;
            readAgain = false;
        }
        else
        { // Got an error
            if ( recvErrno != EINTR)
            {
                error = recvErrno;
                readAgain = false;
            }
            else
            {
                // Interrupted by a signal: retry the same read.
                readAgain = true;
            }
        }
    }
    while( readAgain );

    if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY))
    {
        trace_printf( "%s@%d - recv(), received=%d, sock=%d, error=%d(%s), desc=%s\n"
                    , method_name, __LINE__
                    , received
                    , sockFd
                    , error, strerror(error)
                    , desc );
    }

    TRACE_EXIT;
    return error;
}
// Writes exactly 'size' bytes from 'buf' to socket 'sockFd'.
//
// send() is called repeatedly until every byte has been written or an error
// other than EINTR occurs. After a partial send the buffer pointer and the
// remaining count are advanced so that only the unsent tail is retried.
// Sent byte counts are reported to Meas.
//
// Parameters:
//   buf    - data to send, at least 'size' bytes
//   size   - number of bytes to send
//   sockFd - socket to write to
//   desc   - text included in trace output
//
// Returns 0 when all bytes were sent, the errno value of the failing send(),
// or EIO if send() returned 0 while bytes were still outstanding.
int CCluster::SendSock(char *buf, int size, int sockFd, const char *desc)
{
    const char method_name[] = "CCluster::SendSock";
    TRACE_ENTRY;

    bool    sendAgain = false;
    int     error = 0;
    int     sendCount = 0;
    int     sendErrno = 0;
    int     sent = 0;
    int     remaining = size;   // bytes still to be written

    do
    {
        sendCount = (int) send( sockFd
                              , buf
                              , remaining
                              , 0 );
        // Capture errno immediately; the tracing below may overwrite it
        // before the error branch gets to look at it.
        sendErrno = errno;

        if ( sendCount > 0 ) Meas.addSockSentBytes( sendCount );

        if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY))
        {
            trace_printf( "%s@%d - send(), sock=%d, sendCount=%d, desc=%s\n"
                        , method_name, __LINE__
                        , sockFd
                        , sendCount
                        , desc );
        }

        if ( sendCount > 0 )
        { // Sent data
            sent += sendCount;
            // Step past what was written so a partial send resumes with the
            // unsent tail instead of retransmitting the start of the buffer.
            buf += sendCount;
            remaining -= sendCount;
            sendAgain = ( remaining > 0 );
        }
        else if ( sendCount == 0 )
        { // Nothing written and no failure reported, so errno is not
          // meaningful. This is only a problem if bytes are still pending.
            if ( remaining > 0 )
            {
                error = EIO;
            }
            sendAgain = false;
        }
        else
        { // Got an error
            if ( sendErrno != EINTR)
            {
                error = sendErrno;
                sendAgain = false;
            }
            else
            {
                // Interrupted by a signal: retry the same write.
                sendAgain = true;
            }
        }
    }
    while( sendAgain );

    if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY))
    {
        trace_printf( "%s@%d - send(), sent=%d, sock=%d, error=%d(%s), desc=%s\n"
                    , method_name, __LINE__
                    , sent
                    , sockFd
                    , error, strerror(error)
                    , desc );
    }

    TRACE_EXIT;
    return error;
}